Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Correção br_inep_saeb.{uf,brasil,municipio} #732

Merged
merged 4 commits into from
Aug 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 3 additions & 7 deletions models/br_inep_saeb/br_inep_saeb__brasil.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,10 @@

select
safe_cast(ano as int64) ano,
safe_cast(lower(rede) as string) rede,
safe_cast(lower(localizacao) as string) localizacao,
safe_cast(rede as string) rede,
safe_cast(localizacao as string) localizacao,
safe_cast(disciplina as string) disciplina,
safe_cast(
case
when serie = "12" then "3" when serie = "13" then "4" else serie
end as int64
) serie,
safe_cast(serie as int64) serie,
safe_cast(media as float64) media,
safe_cast(nivel_0 as float64) nivel_0,
safe_cast(nivel_1 as float64) nivel_1,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
materialized="table",
)
}}

select
safe_cast(ano as int64) ano,
safe_cast(rede as string) rede,
Expand Down
18 changes: 10 additions & 8 deletions models/br_inep_saeb/br_inep_saeb__municipio.sql
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
{{ config(alias="municipio", schema="br_inep_saeb", materialized="table") }}
{{
config(
alias="municipio",
schema="br_inep_saeb",
materialized="table",
)
}}

select
safe_cast(ano as int64) ano,
safe_cast(lower(rede) as string) rede,
safe_cast(lower(localizacao) as string) localizacao,
safe_cast(rede as string) rede,
safe_cast(localizacao as string) localizacao,
safe_cast(sigla_uf as string) sigla_uf,
safe_cast(id_municipio as string) id_municipio,
safe_cast(disciplina as string) disciplina,
safe_cast(
case
when serie = "12" then "3" when serie = "13" then "4" else serie
end as int64
) serie,
safe_cast(serie as int64) serie,
round(safe_cast(media as float64), 2) media,
round(safe_cast(nivel_0 as float64), 2) nivel_0,
round(safe_cast(nivel_1 as float64), 2) nivel_1,
Expand Down
81 changes: 19 additions & 62 deletions models/br_inep_saeb/br_inep_saeb__uf.sql
Original file line number Diff line number Diff line change
@@ -1,66 +1,23 @@
{{ config(alias="uf", schema="br_inep_saeb", materialized="table") }}

with
tb as (
select
safe_cast(ano as int64) ano,
safe_cast(rede as string) rede,
safe_cast(localizacao as string) localizacao,
safe_cast(sigla_uf as string) sigla_uf,
safe_cast(disciplina as string) disciplina,
safe_cast(
case
when serie = "12" then "3" when serie = "13" then "4" else serie
end as int64
) serie,
safe_cast(media as float64) media,
safe_cast(nivel_0 as float64) nivel_0,
safe_cast(nivel_1 as float64) nivel_1,
safe_cast(nivel_2 as float64) nivel_2,
safe_cast(nivel_3 as float64) nivel_3,
safe_cast(nivel_4 as float64) nivel_4,
safe_cast(nivel_5 as float64) nivel_5,
safe_cast(nivel_6 as float64) nivel_6,
safe_cast(nivel_7 as float64) nivel_7,
safe_cast(nivel_8 as float64) nivel_8,
safe_cast(nivel_9 as float64) nivel_9,
safe_cast(nivel_10 as float64) nivel_10,
from `basedosdados-staging.br_inep_saeb_staging.uf` as t
),
fixed_2021 as (
select *
from tb
-- Em 2021 as linhas estao duplicadas porque tem `Total` e `total`
-- Entao vamos excluir total
where ano = 2021 and localizacao in ("Total", "Urbana", "Rural")
),
rest_without_2021 as (select * from tb where ano <> 2021)

select
ano,
lower(rede) as rede,
lower(localizacao) as localizacao,
sigla_uf,
disciplina,
serie,
media,
nivel_0,
nivel_1,
nivel_2,
nivel_3,
nivel_4,
nivel_5,
nivel_6,
nivel_7,
nivel_8,
nivel_9,
nivel_10
from
(
select *
from rest_without_2021
union all
select *
from fixed_2021
)
order by ano desc
safe_cast(ano as int64) ano,
safe_cast(rede as string) rede,
safe_cast(localizacao as string) localizacao,
safe_cast(sigla_uf as string) sigla_uf,
safe_cast(disciplina as string) disciplina,
safe_cast(serie as int64) serie,
safe_cast(media as float64) media,
safe_cast(nivel_0 as float64) nivel_0,
safe_cast(nivel_1 as float64) nivel_1,
safe_cast(nivel_2 as float64) nivel_2,
safe_cast(nivel_3 as float64) nivel_3,
safe_cast(nivel_4 as float64) nivel_4,
safe_cast(nivel_5 as float64) nivel_5,
safe_cast(nivel_6 as float64) nivel_6,
safe_cast(nivel_7 as float64) nivel_7,
safe_cast(nivel_8 as float64) nivel_8,
safe_cast(nivel_9 as float64) nivel_9,
safe_cast(nivel_10 as float64) nivel_10,
from `basedosdados-staging.br_inep_saeb_staging.uf` as t
39 changes: 33 additions & 6 deletions models/br_inep_saeb/code/br_inep_saeb_brasil.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
get_nivel_serie_disciplina,
get_disciplina_serie,
convert_to_pd_dtype,
drop_empty_lines
)

CWD = os.path.dirname(os.getcwd())
Expand Down Expand Up @@ -111,15 +112,20 @@
br_saeb_latest_output = (
# apenas MT e LP
br_saeb_latest_output.loc[br_saeb_latest_output["disciplina"].isin(["mt", "lp"])]
.pipe(
# vamos remover em_regular (Ensino Médio Integrado)
lambda df: df.loc[df["serie"] != "em_regular"]
)
.assign(
disciplina=lambda df: df["disciplina"].str.upper(),
rede=lambda df: df["rede"].str.lower(),
localizacao=lambda df: df["localizacao"].str.lower(),
serie=lambda df: df["serie"].replace({"em": "3", "em_integral": "4"}),
serie=lambda df: df["serie"].replace(
{
# em é 12
"em": "12",
# em_integral (Ensino Medio Integrado) é 13
"em_integral": "13",
# em_regular (Ensino Médio Tradicional + Integrado) é 14
"em_regular": "14",
}
),
)
)

Expand All @@ -129,6 +135,8 @@

br_saeb_latest_output.info()

br_saeb_latest_output.shape

tb = bd.Table(dataset_id="br_inep_saeb", table_id="brasil")

bq_cols = tb._get_columns_from_bq(mode="prod")
Expand All @@ -143,10 +151,29 @@
br_saeb_latest_output = br_saeb_latest_output.astype(col_dtypes)[col_dtypes.keys()]

upstream_df = bd.read_sql(
"select * from `basedosdados.br_inep_saeb.brasil`",
"select * from `basedosdados-dev.br_inep_saeb.brasil` where ano <> 2021",
billing_project_id="basedosdados-dev",
)

assert isinstance(upstream_df, pd.DataFrame)

upstream_df.shape

upstream_df = drop_empty_lines(upstream_df)

upstream_df.shape

br_saeb_latest_output.shape

drop_empty_lines(br_saeb_latest_output).shape

pd.concat([br_saeb_latest_output, upstream_df]).to_csv( # type: ignore
os.path.join(OUTPUT, "brasil.csv"), index=False
)

# Update table
tb.create(
os.path.join(OUTPUT, "brasil.csv"),
if_table_exists="replace",
if_storage_data_exists="replace",
)
42 changes: 35 additions & 7 deletions models/br_inep_saeb/code/br_inep_saeb_municipio.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
get_nivel_serie_disciplina,
get_disciplina_serie,
convert_to_pd_dtype,
drop_empty_lines,
)

CWD = os.path.dirname(os.getcwd())
Expand Down Expand Up @@ -119,6 +120,7 @@
"disciplina",
"serie",
],
how="left",
)
)
.drop(columns=["variable"])
Expand All @@ -142,24 +144,35 @@


mun_saeb_latest_output = (
# apenas MT e LP
# Apenas MT e LP
mun_saeb_latest_output.loc[mun_saeb_latest_output["disciplina"].isin(["mt", "lp"])]
.pipe(
# vamos remover em_regular (Ensino Médio Integrado)
lambda df: df.loc[df["serie"] != "em_regular"]
)
.assign(
disciplina=lambda df: df["disciplina"].str.upper(),
rede=lambda df: df["rede"].str.lower(),
localizacao=lambda df: df["localizacao"].str.lower(),
serie=lambda df: df["serie"].replace({"em": "3", "em_integral": "4"}),
sigla_uf=lambda df: df["nome_uf"].replace(
dict([(i["nome"], i["sigla"]) for i in bd_dirs_ufs.to_dict("records")]) # type: ignore
),
serie=lambda df: df["serie"].replace(
{
# em é 12
"em": "12",
# em_integral (Ensino Medio Integrado) é 13
"em_integral": "13",
# em_regular (Ensino Médio Tradicional + Integrado) é 14
"em_regular": "14",
}
),
)
.drop(columns=["nome_uf"])
)

mun_saeb_latest_output.shape

mun_saeb_latest_output = drop_empty_lines(mun_saeb_latest_output)

mun_saeb_latest_output.shape

mun_saeb_latest_output["ano"] = 2021

mun_saeb_latest_output.head()
Expand All @@ -180,10 +193,25 @@
mun_saeb_latest_output = mun_saeb_latest_output.astype(col_dtypes)[col_dtypes.keys()]

upstream_df = bd.read_sql(
"select * from `basedosdados.br_inep_saeb.municipio`",
"select * from `basedosdados.br_inep_saeb.municipio` where ano <> 2021",
billing_project_id="basedosdados-dev",
)

assert isinstance(upstream_df, pd.DataFrame)

# upstream_df["serie"].unique()
#
# upstream_df["serie"] = upstream_df["serie"].replace({3: 12})

upstream_df = drop_empty_lines(upstream_df)

pd.concat([mun_saeb_latest_output, upstream_df]).to_csv( # type: ignore
os.path.join(OUTPUT, "municipio.csv"), index=False
)

# Update table
tb.create(
os.path.join(OUTPUT, "municipio.csv"),
if_table_exists="replace",
if_storage_data_exists="replace",
)
47 changes: 37 additions & 10 deletions models/br_inep_saeb/code/br_inep_saeb_uf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
get_nivel_serie_disciplina,
get_disciplina_serie,
convert_to_pd_dtype,
drop_empty_lines,
)

CWD = os.path.dirname(os.getcwd())
Expand Down Expand Up @@ -114,32 +115,43 @@
billing_project_id="basedosdados-dev",
)


ufs_saeb_latest_output = (
# apenas MT e LP
# Apenas MT e LP. Não sei porque não subiram outras disciplinas
ufs_saeb_latest_output.loc[ufs_saeb_latest_output["disciplina"].isin(["mt", "lp"])]
.pipe(
# vamos remover em_regular (Ensino Médio Integrado)
lambda df: df.loc[df["serie"] != "em_regular"]
)
.assign(
disciplina=lambda df: df["disciplina"].str.upper(),
rede=lambda df: df["rede"].str.lower(),
localizacao=lambda df: df["localizacao"].str.lower(),
serie=lambda df: df["serie"].replace({"em": "3", "em_integral": "4"}),
serie=lambda df: df["serie"].replace(
{
# em é 12
"em": "12",
# em_integral (Ensino Medio Integrado) é 13
"em_integral": "13",
# em_regular (Ensino Médio Tradicional + Integrado) é 14
"em_regular": "14",
}
),
sigla_uf=lambda df: df["nome_uf"].replace(
dict([(i["nome"], i["sigla"]) for i in bd_dirs_ufs.to_dict("records")])
), # type: ignore
dict([(i["nome"], i["sigla"]) for i in bd_dirs_ufs.to_dict("records")]) # type: ignore
),
)
.drop(columns=["nome_uf"])
)

# Add column ano = 2021
ufs_saeb_latest_output["ano"] = 2021

ufs_saeb_latest_output.head()

ufs_saeb_latest_output.info()

ufs_saeb_latest_output.shape

drop_empty_lines(ufs_saeb_latest_output).shape

ufs_saeb_latest_output = drop_empty_lines(ufs_saeb_latest_output)

tb = bd.Table(dataset_id="br_inep_saeb", table_id="uf")

bq_cols = tb._get_columns_from_bq(mode="prod")
Expand All @@ -154,10 +166,25 @@
ufs_saeb_latest_output = ufs_saeb_latest_output.astype(col_dtypes)[col_dtypes.keys()]

upstream_df = bd.read_sql(
"select * from `basedosdados.br_inep_saeb.uf`",
"select * from `basedosdados-dev.br_inep_saeb.uf` where ano <> 2021",
billing_project_id="basedosdados-dev",
)

assert isinstance(upstream_df, pd.DataFrame)

upstream_df["serie"].unique()

upstream_df.shape

upstream_df = drop_empty_lines(upstream_df)

pd.concat([ufs_saeb_latest_output, upstream_df]).to_csv( # type: ignore
os.path.join(OUTPUT, "uf.csv"), index=False
)

# Update table
tb.create(
os.path.join(OUTPUT, "uf.csv"),
if_table_exists="replace",
if_storage_data_exists="replace",
)
Loading
Loading