Skip to content

Commit

Permalink
Merge branch 'main' into fix-saeb-2021
Browse files Browse the repository at this point in the history
  • Loading branch information
vilelaluiza authored Aug 14, 2024
2 parents c279437 + 9a90673 commit 500ed90
Show file tree
Hide file tree
Showing 9 changed files with 245 additions and 14 deletions.
14 changes: 7 additions & 7 deletions models/br_ibge_pib/br_ibge_pib__municipio.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
-- Casts the staging municipality GDP columns to their final types.
-- This diff view shows two versions of every monetary column: the plain
-- `safe_cast(... as int64)` lines and the `1000 * safe_cast(...)` lines that
-- scale the same value by 1000 (the hunk reports 7 additions and 7 deletions).
-- NOTE(review): presumably the staging values are expressed in thousands of
-- reais and the scaling converts them to reais — confirm against the source.
select
safe_cast(id_municipio as string) id_municipio,
safe_cast(ano as int64) ano,
safe_cast(pib as int64) pib,
safe_cast(impostos_liquidos as int64) impostos_liquidos,
safe_cast(va as int64) va,
safe_cast(va_agropecuaria as int64) va_agropecuaria,
safe_cast(va_industria as int64) va_industria,
safe_cast(va_servicos as int64) va_servicos,
safe_cast(va_adespss as int64) va_adespss
1000 * safe_cast(pib as int64) pib,
1000 * safe_cast(impostos_liquidos as int64) impostos_liquidos,
1000 * safe_cast(va as int64) va,
1000 * safe_cast(va_agropecuaria as int64) va_agropecuaria,
1000 * safe_cast(va_industria as int64) va_industria,
1000 * safe_cast(va_servicos as int64) va_servicos,
1000 * safe_cast(va_adespss as int64) va_adespss
from `basedosdados-staging.br_ibge_pib_staging.municipio` as t
16 changes: 16 additions & 0 deletions models/br_inep_ideb/br_inep_ideb__uf.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{{ config(alias="uf", schema="br_inep_ideb", materialized="table") }}

-- IDEB at UF level: reads the staging table and casts every column to its
-- final type. `safe_cast` yields NULL instead of failing when a staging
-- value cannot be converted.
select
    -- identification columns
    safe_cast(ano as int64) as ano,
    safe_cast(sigla_uf as string) as sigla_uf,
    safe_cast(rede as string) as rede,
    safe_cast(ensino as string) as ensino,
    safe_cast(anos_escolares as string) as anos_escolares,
    -- indicators
    safe_cast(taxa_aprovacao as float64) as taxa_aprovacao,
    safe_cast(indicador_rendimento as float64) as indicador_rendimento,
    safe_cast(nota_saeb_matematica as float64) as nota_saeb_matematica,
    safe_cast(nota_saeb_lingua_portuguesa as float64) as nota_saeb_lingua_portuguesa,
    safe_cast(nota_saeb_media_padronizada as float64) as nota_saeb_media_padronizada,
    safe_cast(ideb as float64) as ideb,
    safe_cast(projecao as float64) as projecao
from `basedosdados-staging.br_inep_ideb_staging.uf` as t
136 changes: 136 additions & 0 deletions models/br_inep_ideb/code/fix_ideb_uf_2021.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
# Rebuilds the `br_inep_ideb.uf` table with the 2021 IDEB results per UF.
#
# Steps, in order:
#   1. download and unzip INEP's 2021 "regioes/UFs" spreadsheet;
#   2. read every sheet, keep the 2021 columns and rename them to the table schema;
#   3. normalise UF names to their abbreviations, plus `rede`, `anos_escolares`
#      and `ensino`;
#   4. append every non-2021 row already published in `basedosdados.br_inep_ideb.uf`;
#   5. write `output/uf.csv` and upload it, replacing table and storage data.
import pandas as pd
import numpy as np
import os
import subprocess
import zipfile
import basedosdados as bd

# Paths are relative to the current working directory.
ROOT = os.path.join("models", "br_inep_ideb")
INPUT = os.path.join(ROOT, "input")
TMP = os.path.join(ROOT, "tmp")
OUTPUT = os.path.join(ROOT, "output")

os.makedirs(INPUT, exist_ok=True)
os.makedirs(TMP, exist_ok=True)
os.makedirs(OUTPUT, exist_ok=True)

URL = "https://download.inep.gov.br/educacao_basica/portal_ideb/planilhas_para_download/2021/divulgacao_regioes_ufs_ideb_2021.zip"
ZIP_PATH = os.path.join(INPUT, "divulgacao_regioes_ufs_ideb_2021.zip")

# `check=True` raises CalledProcessError on a non-zero exit code; unlike an
# `assert`, it is not stripped when Python runs with -O. `-O` makes curl save
# the file under its remote name inside `cwd`.
# NOTE(review): `-k` disables TLS certificate verification; it is kept from the
# original command — confirm whether it is still required for this host.
subprocess.run(["curl", "-O", "-k", URL], cwd=INPUT, check=True)

with zipfile.ZipFile(ZIP_PATH, "r") as zip_ref:
    zip_ref.extractall(TMP)

XLSX_PATH = os.path.join(TMP, "divulgacao_regioes_ufs_ideb_2021.xlsx")

sheet_names: list[str] = pd.ExcelFile(XLSX_PATH).sheet_names

# Spreadsheet column -> table column. Only these columns are kept.
RENAMES = {
    "Unnamed: 0": "sigla_uf",
    "Unnamed: 1": "rede",
    "VL_APROVACAO_2021_SI_4": "taxa_aprovacao",
    "VL_INDICADOR_REND_2021": "indicador_rendimento",
    "VL_NOTA_MATEMATICA_2021": "nota_saeb_matematica",
    "VL_NOTA_PORTUGUES_2021": "nota_saeb_lingua_portuguesa",
    "VL_NOTA_MEDIA_2021": "nota_saeb_media_padronizada",
    "VL_OBSERVADO_2021": "ideb",
}

# One frame per sheet, stacked; the sheet name is carried in `anos_escolares`
# and translated further below.
# NOTE(review): skiprows=9 presumably skips the title rows above the header
# row that holds the VL_* codes — confirm against the spreadsheet.
df = pd.concat(
    [
        pd.read_excel(XLSX_PATH, sheet_name=sheet_name, skiprows=9)[
            list(RENAMES.keys())
        ]
        .rename(columns=RENAMES, errors="raise")  # type: ignore
        .assign(anos_escolares=sheet_name)
        for sheet_name in sheet_names
    ]
)

# Abbreviated state names used by the spreadsheet -> full names, so they can
# be matched against the `nome` column of the UF directory table.
SIGLA_UFS_REPLACES = {
    "R. G. do Norte": "Rio Grande do Norte",
    "R. G. do Sul": "Rio Grande do Sul",
    "M. G. do Sul": "Mato Grosso do Sul",
}

df["sigla_uf"] = df["sigla_uf"].replace(SIGLA_UFS_REPLACES)

br_dirs = bd.read_sql(
    "SELECT * from `basedosdados.br_bd_diretorios_brasil.uf`",
    billing_project_id="basedosdados-dev",
)

assert isinstance(br_dirs, pd.DataFrame)  # type narrowing for the checker

# Keep only rows whose name is a UF in the directory table (presumably this
# drops the region / country aggregate rows of the sheets).
df = df.loc[df["sigla_uf"].isin(br_dirs["nome"].tolist())]

n_ufs = len(df["sigla_uf"].unique())
if n_ufs != 27:
    raise ValueError(f"expected 27 UFs after filtering, found {n_ufs}")

# UF full name -> UF abbreviation.
replaces_name_sigla_uf = {i["nome"]: i["sigla"] for i in br_dirs.to_dict("records")}

df["sigla_uf"] = df["sigla_uf"].replace(replaces_name_sigla_uf)

# Sheet name -> `anos_escolares` value used by the table.
ANOS_ESCOLARES = {
    "UF e Regiões (AI)": "iniciais (1-5)",
    "UF e Regiões (AF)": "finais (6-9)",
    "UF e Regiões (EM)": "todos (1-4)",
}

df["anos_escolares"] = df["anos_escolares"].replace(ANOS_ESCOLARES)

# `ensino` is derived from `anos_escolares`: the "todos (1-4)" rows are
# "medio", everything else is "fundamental".
df["ensino"] = df["anos_escolares"].apply(
    lambda v: "medio" if v == "todos (1-4)" else "fundamental"
)

df["rede"] = df["rede"].str.lower().replace({"pública": "publica"})

# Print the distinct values of every column for a manual sanity check.
for col in df.columns:
    print(col, " -- ", df[col].unique())

# The spreadsheet marks missing values with `-`.
df = df.replace({"-": np.nan})

tb = bd.Table(dataset_id="br_inep_ideb", table_id="uf")

# Column order of the existing BigQuery table, reused for the CSV.
cols_from_bq = tb._get_columns_from_bq()

order_cols = [i["name"] for i in cols_from_bq["columns"]]

# Columns that do not come from the spreadsheet.
df["ano"] = 2021
df["projecao"] = None

# Rows for every other year come from the published table; 2021 is excluded
# there so the rows built above replace it.
df_upstream = bd.read_sql(
    "select * from `basedosdados.br_inep_ideb.uf` where ano <> 2021",
    billing_project_id="basedosdados-dev",
)

OUTPUT_PATH = os.path.join(OUTPUT, "uf.csv")

pd.concat([df[order_cols], df_upstream]).to_csv(  # type: ignore
    OUTPUT_PATH, index=False
)

tb.create(
    OUTPUT_PATH,
    if_table_exists="replace",
    if_storage_data_exists="replace",
)
47 changes: 47 additions & 0 deletions models/br_inep_ideb/schema.yml
Original file line number Diff line number Diff line change
Expand Up @@ -173,3 +173,50 @@ models:
description: IDEB (Nota SAEB - Média Padronizada x Indicador de Rendimento)
- name: projecao
description: Projeção
- name: br_inep_ideb__uf
  description: Ideb a nível de UF, por ano, rede e anos escolares.
  tests:
    - not_null_proportion_multiple_columns:
        at_least: 0.7
  columns:
    - name: ano
      description: Ano
      tests:
        - relationships:
            to: ref('br_bd_diretorios_data_tempo__ano')
            field: ano.ano
    - name: sigla_uf
      description: Sigla da Unidade da Federação
      tests:
        - relationships:
            to: ref('br_bd_diretorios_brasil__uf')
            field: sigla
    - name: rede
      description: Rede Escolar
      tests:
        - accepted_values:
            values: [privada, publica, total, estadual]
    - name: ensino
      description: Tipo de Ensino
      # The key must be `tests` (plural), as on the columns above; a `test`
      # key is not read as a test list, so the check would never run.
      tests:
        - accepted_values:
            values: [medio, fundamental]
    - name: anos_escolares
      description: Anos Escolares
      tests:
        - accepted_values:
            values: [iniciais (1-5), finais (6-9), todos (1-4)]
    - name: taxa_aprovacao
      description: Taxa de Aprovação
    - name: indicador_rendimento
      description: Indicador de Rendimento (P)
    - name: nota_saeb_matematica
      description: Nota SAEB - Matemática
    - name: nota_saeb_lingua_portuguesa
      description: Nota SAEB - Língua Portuguesa
    - name: nota_saeb_media_padronizada
      description: Nota SAEB - Média Padronizada (N)
    - name: ideb
      description: IDEB (N x P)
    - name: projecao
      description: Projeção
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
"""
Script para baixar os dados da taxa de alfabetização divulgados pelo SAEB.
Fonte: https://www.gov.br/inep/pt-br/areas-de-atuacao/avaliacao-e-exames-educacionais/saeb/resultados
São duas tabelas:
- brasil_taxa_alfabetizacao
- uf_taxa_alfabetizacao
"""

import pandas as pd
import basedosdados as bd
import os
Expand All @@ -6,6 +15,7 @@

INPUT = os.path.join(os.getcwd(), "input")
OUTPUT = os.path.join(os.getcwd(), "output")

URL = "https://download.inep.gov.br/saeb/resultados/saeb_2021_brasil_estados_municipios_c_tx_alfabetizado.xlsx"

os.makedirs(INPUT, exist_ok=True)
Expand Down Expand Up @@ -50,7 +60,14 @@
errors="raise",
)

df_br.to_csv(os.path.join(OUTPUT, "brasil_taxa_alfabetizacao.csv"), index=False)
df_br_from_bigquery = bd.read_sql(
"select * from `basedosdados.br_inep_saeb.brasil_taxa_alfabetizacao`",
billing_project_id="basedosdados-dev",
)

pd.concat([df_br_from_bigquery, df_br]).to_csv( # type: ignore
os.path.join(OUTPUT, "brasil_taxa_alfabetizacao.csv"), index=False
)

# Estados

Expand All @@ -67,7 +84,7 @@
billing_project_id="basedosdados-dev",
)

uf_map = dict([(i["nome"], i["sigla"]) for i in bd_dirs_ufs.to_dict("records")]) # type: ignore
uf_map = dict([(i["nome"], i["sigla"]) for i in bd_dirs_ufs.to_dict("records")]) # type: ignore

df_ufs["NO_UF"].unique()

Expand All @@ -89,7 +106,14 @@
errors="raise",
)

df_ufs.to_csv(os.path.join(OUTPUT, "uf_taxa_alfabetizacao.csv"), index=False)
df_ufs_from_bigquery = bd.read_sql(
"select * from `basedosdados.br_inep_saeb.uf_taxa_alfabetizacao`",
billing_project_id="basedosdados-dev",
)

pd.concat([df_ufs_from_bigquery, df_ufs]).to_csv( # type: ignore
os.path.join(OUTPUT, "uf_taxa_alfabetizacao.csv"), index=False
)

# Upload

Expand Down
4 changes: 3 additions & 1 deletion models/br_rf_cno/br_rf_cno__areas.sql
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,6 @@ select
safe_cast(tipo_area_complementar as string) tipo_area_complementar,
safe_cast(metragem as float64) metragem,
from `basedosdados-staging.br_rf_cno_staging.areas` as t
{% if is_incremental() %} where data > (select max(data) from {{ this }}) {% endif %}
{% if is_incremental() %}
where safe_cast(data as date) > (select max(data_extracao) from {{ this }})
{% endif %}
4 changes: 3 additions & 1 deletion models/br_rf_cno/br_rf_cno__cnaes.sql
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,6 @@ select
safe_cast(id_cno as string) id_cno,
safe_cast(cnae_2_subclasse as string) cnae_2_subclasse,
from `basedosdados-staging.br_rf_cno_staging.cnaes` as t
{% if is_incremental() %} where data > (select max(data) from {{ this }}) {% endif %}
{% if is_incremental() %}
where safe_cast(data as date) > (select max(data_extracao) from {{ this }})
{% endif %}
4 changes: 3 additions & 1 deletion models/br_rf_cno/br_rf_cno__microdados.sql
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,6 @@ left join
) b
on ltrim(microdados.id_municipio_rf, '0') = b.id_municipio_rf

{% if is_incremental() %} where data > (select max(data) from {{ this }}) {% endif %}
{% if is_incremental() %}
where safe_cast(data as date) > (select max(data_extracao) from {{ this }})
{% endif %}
4 changes: 3 additions & 1 deletion models/br_rf_cno/br_rf_cno__vinculos.sql
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,6 @@ select
) qualificacao_contribuinte,
from `basedosdados-staging.br_rf_cno_staging.vinculos` as t

{% if is_incremental() %} where data > (select max(data) from {{ this }}) {% endif %}
{% if is_incremental() %}
where safe_cast(data as date) > (select max(data_extracao) from {{ this }})
{% endif %}

0 comments on commit 500ed90

Please sign in to comment.