Skip to content

Commit

Permalink
Merge pull request #732 from basedosdados/fix-saeb-2021
Browse files Browse the repository at this point in the history
Correção `br_inep_saeb.{uf,brasil,municipio}`
  • Loading branch information
vilelaluiza authored Aug 14, 2024
2 parents 9a90673 + 500ed90 commit 8afd5c0
Show file tree
Hide file tree
Showing 10 changed files with 152,756 additions and 145 deletions.
10 changes: 3 additions & 7 deletions models/br_inep_saeb/br_inep_saeb__brasil.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,10 @@

select
safe_cast(ano as int64) ano,
safe_cast(lower(rede) as string) rede,
safe_cast(lower(localizacao) as string) localizacao,
safe_cast(rede as string) rede,
safe_cast(localizacao as string) localizacao,
safe_cast(disciplina as string) disciplina,
safe_cast(
case
when serie = "12" then "3" when serie = "13" then "4" else serie
end as int64
) serie,
safe_cast(serie as int64) serie,
safe_cast(media as float64) media,
safe_cast(nivel_0 as float64) nivel_0,
safe_cast(nivel_1 as float64) nivel_1,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
materialized="table",
)
}}

select
safe_cast(ano as int64) ano,
safe_cast(rede as string) rede,
Expand Down
18 changes: 10 additions & 8 deletions models/br_inep_saeb/br_inep_saeb__municipio.sql
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
{{ config(alias="municipio", schema="br_inep_saeb", materialized="table") }}
{{
config(
alias="municipio",
schema="br_inep_saeb",
materialized="table",
)
}}

select
safe_cast(ano as int64) ano,
safe_cast(lower(rede) as string) rede,
safe_cast(lower(localizacao) as string) localizacao,
safe_cast(rede as string) rede,
safe_cast(localizacao as string) localizacao,
safe_cast(sigla_uf as string) sigla_uf,
safe_cast(id_municipio as string) id_municipio,
safe_cast(disciplina as string) disciplina,
safe_cast(
case
when serie = "12" then "3" when serie = "13" then "4" else serie
end as int64
) serie,
safe_cast(serie as int64) serie,
round(safe_cast(media as float64), 2) media,
round(safe_cast(nivel_0 as float64), 2) nivel_0,
round(safe_cast(nivel_1 as float64), 2) nivel_1,
Expand Down
81 changes: 19 additions & 62 deletions models/br_inep_saeb/br_inep_saeb__uf.sql
Original file line number Diff line number Diff line change
@@ -1,66 +1,23 @@
{{ config(alias="uf", schema="br_inep_saeb", materialized="table") }}

with
tb as (
select
safe_cast(ano as int64) ano,
safe_cast(rede as string) rede,
safe_cast(localizacao as string) localizacao,
safe_cast(sigla_uf as string) sigla_uf,
safe_cast(disciplina as string) disciplina,
safe_cast(
case
when serie = "12" then "3" when serie = "13" then "4" else serie
end as int64
) serie,
safe_cast(media as float64) media,
safe_cast(nivel_0 as float64) nivel_0,
safe_cast(nivel_1 as float64) nivel_1,
safe_cast(nivel_2 as float64) nivel_2,
safe_cast(nivel_3 as float64) nivel_3,
safe_cast(nivel_4 as float64) nivel_4,
safe_cast(nivel_5 as float64) nivel_5,
safe_cast(nivel_6 as float64) nivel_6,
safe_cast(nivel_7 as float64) nivel_7,
safe_cast(nivel_8 as float64) nivel_8,
safe_cast(nivel_9 as float64) nivel_9,
safe_cast(nivel_10 as float64) nivel_10,
from `basedosdados-staging.br_inep_saeb_staging.uf` as t
),
fixed_2021 as (
select *
from tb
-- Em 2021 as linhas estao duplicadas porque tem `Total` e `total`
-- Entao vamos excluir total
where ano = 2021 and localizacao in ("Total", "Urbana", "Rural")
),
rest_without_2021 as (select * from tb where ano <> 2021)

select
ano,
lower(rede) as rede,
lower(localizacao) as localizacao,
sigla_uf,
disciplina,
serie,
media,
nivel_0,
nivel_1,
nivel_2,
nivel_3,
nivel_4,
nivel_5,
nivel_6,
nivel_7,
nivel_8,
nivel_9,
nivel_10
from
(
select *
from rest_without_2021
union all
select *
from fixed_2021
)
order by ano desc
safe_cast(ano as int64) ano,
safe_cast(rede as string) rede,
safe_cast(localizacao as string) localizacao,
safe_cast(sigla_uf as string) sigla_uf,
safe_cast(disciplina as string) disciplina,
safe_cast(serie as int64) serie,
safe_cast(media as float64) media,
safe_cast(nivel_0 as float64) nivel_0,
safe_cast(nivel_1 as float64) nivel_1,
safe_cast(nivel_2 as float64) nivel_2,
safe_cast(nivel_3 as float64) nivel_3,
safe_cast(nivel_4 as float64) nivel_4,
safe_cast(nivel_5 as float64) nivel_5,
safe_cast(nivel_6 as float64) nivel_6,
safe_cast(nivel_7 as float64) nivel_7,
safe_cast(nivel_8 as float64) nivel_8,
safe_cast(nivel_9 as float64) nivel_9,
safe_cast(nivel_10 as float64) nivel_10,
from `basedosdados-staging.br_inep_saeb_staging.uf` as t
39 changes: 33 additions & 6 deletions models/br_inep_saeb/code/br_inep_saeb_brasil.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
get_nivel_serie_disciplina,
get_disciplina_serie,
convert_to_pd_dtype,
drop_empty_lines
)

CWD = os.path.dirname(os.getcwd())
Expand Down Expand Up @@ -111,15 +112,20 @@
br_saeb_latest_output = (
# apenas MT e LP
br_saeb_latest_output.loc[br_saeb_latest_output["disciplina"].isin(["mt", "lp"])]
.pipe(
# vamos remover em_regular (Ensino Médio Integrado)
lambda df: df.loc[df["serie"] != "em_regular"]
)
.assign(
disciplina=lambda df: df["disciplina"].str.upper(),
rede=lambda df: df["rede"].str.lower(),
localizacao=lambda df: df["localizacao"].str.lower(),
serie=lambda df: df["serie"].replace({"em": "3", "em_integral": "4"}),
serie=lambda df: df["serie"].replace(
{
# em é 12
"em": "12",
# em_integral (Ensino Médio Integrado) é 13
"em_integral": "13",
# em_regular (Ensino Médio Tradicional + Integrado) é 14
"em_regular": "14",
}
),
)
)

Expand All @@ -129,6 +135,8 @@

br_saeb_latest_output.info()

br_saeb_latest_output.shape

tb = bd.Table(dataset_id="br_inep_saeb", table_id="brasil")

bq_cols = tb._get_columns_from_bq(mode="prod")
Expand All @@ -143,10 +151,29 @@
br_saeb_latest_output = br_saeb_latest_output.astype(col_dtypes)[col_dtypes.keys()]

upstream_df = bd.read_sql(
"select * from `basedosdados.br_inep_saeb.brasil`",
"select * from `basedosdados-dev.br_inep_saeb.brasil` where ano <> 2021",
billing_project_id="basedosdados-dev",
)

assert isinstance(upstream_df, pd.DataFrame)

upstream_df.shape

upstream_df = drop_empty_lines(upstream_df)

upstream_df.shape

br_saeb_latest_output.shape

drop_empty_lines(br_saeb_latest_output).shape

pd.concat([br_saeb_latest_output, upstream_df]).to_csv( # type: ignore
os.path.join(OUTPUT, "brasil.csv"), index=False
)

# Update table
tb.create(
os.path.join(OUTPUT, "brasil.csv"),
if_table_exists="replace",
if_storage_data_exists="replace",
)
42 changes: 35 additions & 7 deletions models/br_inep_saeb/code/br_inep_saeb_municipio.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
get_nivel_serie_disciplina,
get_disciplina_serie,
convert_to_pd_dtype,
drop_empty_lines,
)

CWD = os.path.dirname(os.getcwd())
Expand Down Expand Up @@ -119,6 +120,7 @@
"disciplina",
"serie",
],
how="left",
)
)
.drop(columns=["variable"])
Expand All @@ -142,24 +144,35 @@


mun_saeb_latest_output = (
# apenas MT e LP
# Apenas MT e LP
mun_saeb_latest_output.loc[mun_saeb_latest_output["disciplina"].isin(["mt", "lp"])]
.pipe(
# vamos remover em_regular (Ensino Médio Integrado)
lambda df: df.loc[df["serie"] != "em_regular"]
)
.assign(
disciplina=lambda df: df["disciplina"].str.upper(),
rede=lambda df: df["rede"].str.lower(),
localizacao=lambda df: df["localizacao"].str.lower(),
serie=lambda df: df["serie"].replace({"em": "3", "em_integral": "4"}),
sigla_uf=lambda df: df["nome_uf"].replace(
dict([(i["nome"], i["sigla"]) for i in bd_dirs_ufs.to_dict("records")]) # type: ignore
),
serie=lambda df: df["serie"].replace(
{
# em é 12
"em": "12",
# em_integral (Ensino Médio Integrado) é 13
"em_integral": "13",
# em_regular (Ensino Médio Tradicional + Integrado) é 14
"em_regular": "14",
}
),
)
.drop(columns=["nome_uf"])
)

mun_saeb_latest_output.shape

mun_saeb_latest_output = drop_empty_lines(mun_saeb_latest_output)

mun_saeb_latest_output.shape

mun_saeb_latest_output["ano"] = 2021

mun_saeb_latest_output.head()
Expand All @@ -180,10 +193,25 @@
mun_saeb_latest_output = mun_saeb_latest_output.astype(col_dtypes)[col_dtypes.keys()]

upstream_df = bd.read_sql(
"select * from `basedosdados.br_inep_saeb.municipio`",
"select * from `basedosdados.br_inep_saeb.municipio` where ano <> 2021",
billing_project_id="basedosdados-dev",
)

assert isinstance(upstream_df, pd.DataFrame)

# upstream_df["serie"].unique()
#
# upstream_df["serie"] = upstream_df["serie"].replace({3: 12})

upstream_df = drop_empty_lines(upstream_df)

pd.concat([mun_saeb_latest_output, upstream_df]).to_csv( # type: ignore
os.path.join(OUTPUT, "municipio.csv"), index=False
)

# Update table
tb.create(
os.path.join(OUTPUT, "municipio.csv"),
if_table_exists="replace",
if_storage_data_exists="replace",
)
47 changes: 37 additions & 10 deletions models/br_inep_saeb/code/br_inep_saeb_uf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
get_nivel_serie_disciplina,
get_disciplina_serie,
convert_to_pd_dtype,
drop_empty_lines,
)

CWD = os.path.dirname(os.getcwd())
Expand Down Expand Up @@ -114,32 +115,43 @@
billing_project_id="basedosdados-dev",
)


ufs_saeb_latest_output = (
# apenas MT e LP
# Apenas MT e LP. Não sei por que não subiram outras disciplinas
ufs_saeb_latest_output.loc[ufs_saeb_latest_output["disciplina"].isin(["mt", "lp"])]
.pipe(
# vamos remover em_regular (Ensino Médio Integrado)
lambda df: df.loc[df["serie"] != "em_regular"]
)
.assign(
disciplina=lambda df: df["disciplina"].str.upper(),
rede=lambda df: df["rede"].str.lower(),
localizacao=lambda df: df["localizacao"].str.lower(),
serie=lambda df: df["serie"].replace({"em": "3", "em_integral": "4"}),
serie=lambda df: df["serie"].replace(
{
# em é 12
"em": "12",
# em_integral (Ensino Médio Integrado) é 13
"em_integral": "13",
# em_regular (Ensino Médio Tradicional + Integrado) é 14
"em_regular": "14",
}
),
sigla_uf=lambda df: df["nome_uf"].replace(
dict([(i["nome"], i["sigla"]) for i in bd_dirs_ufs.to_dict("records")])
), # type: ignore
dict([(i["nome"], i["sigla"]) for i in bd_dirs_ufs.to_dict("records")]) # type: ignore
),
)
.drop(columns=["nome_uf"])
)

# Add column ano = 2021
ufs_saeb_latest_output["ano"] = 2021

ufs_saeb_latest_output.head()

ufs_saeb_latest_output.info()

ufs_saeb_latest_output.shape

drop_empty_lines(ufs_saeb_latest_output).shape

ufs_saeb_latest_output = drop_empty_lines(ufs_saeb_latest_output)

tb = bd.Table(dataset_id="br_inep_saeb", table_id="uf")

bq_cols = tb._get_columns_from_bq(mode="prod")
Expand All @@ -154,10 +166,25 @@
ufs_saeb_latest_output = ufs_saeb_latest_output.astype(col_dtypes)[col_dtypes.keys()]

upstream_df = bd.read_sql(
"select * from `basedosdados.br_inep_saeb.uf`",
"select * from `basedosdados-dev.br_inep_saeb.uf` where ano <> 2021",
billing_project_id="basedosdados-dev",
)

assert isinstance(upstream_df, pd.DataFrame)

upstream_df["serie"].unique()

upstream_df.shape

upstream_df = drop_empty_lines(upstream_df)

pd.concat([ufs_saeb_latest_output, upstream_df]).to_csv( # type: ignore
os.path.join(OUTPUT, "uf.csv"), index=False
)

# Update table
tb.create(
os.path.join(OUTPUT, "uf.csv"),
if_table_exists="replace",
if_storage_data_exists="replace",
)
Loading

0 comments on commit 8afd5c0

Please sign in to comment.