Merge pull request #732 from basedosdados/fix-saeb-2021

Correção `br_inep_saeb.{uf,brasil,municipio}`
basedosdados · Aug 14, 2024 · 8afd5c0
2 parents 9a90673 + 500ed90
commit 8afd5c0
Show file tree

Hide file tree

Showing 10 changed files with 152,756 additions and 145 deletions.
diff --git a/models/br_inep_saeb/br_inep_saeb__brasil.sql b/models/br_inep_saeb/br_inep_saeb__brasil.sql
@@ -2,14 +2,10 @@
 
 select
     safe_cast(ano as int64) ano,
-    safe_cast(lower(rede) as string) rede,
-    safe_cast(lower(localizacao) as string) localizacao,
+    safe_cast(rede as string) rede,
+    safe_cast(localizacao as string) localizacao,
     safe_cast(disciplina as string) disciplina,
-    safe_cast(
-        case
-            when serie = "12" then "3" when serie = "13" then "4" else serie
-        end as int64
-    ) serie,
+    safe_cast(serie as int64) serie,
     safe_cast(media as float64) media,
     safe_cast(nivel_0 as float64) nivel_0,
     safe_cast(nivel_1 as float64) nivel_1,

diff --git a/models/br_inep_saeb/br_inep_saeb__brasil_taxa_alfabetizacao.sql b/models/br_inep_saeb/br_inep_saeb__brasil_taxa_alfabetizacao.sql
@@ -5,7 +5,6 @@
         materialized="table",
     )
 }}
-
 select
     safe_cast(ano as int64) ano,
     safe_cast(rede as string) rede,

diff --git a/models/br_inep_saeb/br_inep_saeb__municipio.sql b/models/br_inep_saeb/br_inep_saeb__municipio.sql
@@ -1,17 +1,19 @@
-{{ config(alias="municipio", schema="br_inep_saeb", materialized="table") }}
+{{
+    config(
+        alias="municipio",
+        schema="br_inep_saeb",
+        materialized="table",
+    )
+}}
 
 select
     safe_cast(ano as int64) ano,
-    safe_cast(lower(rede) as string) rede,
-    safe_cast(lower(localizacao) as string) localizacao,
+    safe_cast(rede as string) rede,
+    safe_cast(localizacao as string) localizacao,
     safe_cast(sigla_uf as string) sigla_uf,
     safe_cast(id_municipio as string) id_municipio,
     safe_cast(disciplina as string) disciplina,
-    safe_cast(
-        case
-            when serie = "12" then "3" when serie = "13" then "4" else serie
-        end as int64
-    ) serie,
+    safe_cast(serie as int64) serie,
     round(safe_cast(media as float64), 2) media,
     round(safe_cast(nivel_0 as float64), 2) nivel_0,
     round(safe_cast(nivel_1 as float64), 2) nivel_1,

diff --git a/models/br_inep_saeb/br_inep_saeb__uf.sql b/models/br_inep_saeb/br_inep_saeb__uf.sql
@@ -1,66 +1,23 @@
 {{ config(alias="uf", schema="br_inep_saeb", materialized="table") }}
 
-with
-    tb as (
-        select
-            safe_cast(ano as int64) ano,
-            safe_cast(rede as string) rede,
-            safe_cast(localizacao as string) localizacao,
-            safe_cast(sigla_uf as string) sigla_uf,
-            safe_cast(disciplina as string) disciplina,
-            safe_cast(
-                case
-                    when serie = "12" then "3" when serie = "13" then "4" else serie
-                end as int64
-            ) serie,
-            safe_cast(media as float64) media,
-            safe_cast(nivel_0 as float64) nivel_0,
-            safe_cast(nivel_1 as float64) nivel_1,
-            safe_cast(nivel_2 as float64) nivel_2,
-            safe_cast(nivel_3 as float64) nivel_3,
-            safe_cast(nivel_4 as float64) nivel_4,
-            safe_cast(nivel_5 as float64) nivel_5,
-            safe_cast(nivel_6 as float64) nivel_6,
-            safe_cast(nivel_7 as float64) nivel_7,
-            safe_cast(nivel_8 as float64) nivel_8,
-            safe_cast(nivel_9 as float64) nivel_9,
-            safe_cast(nivel_10 as float64) nivel_10,
-        from `basedosdados-staging.br_inep_saeb_staging.uf` as t
-    ),
-    fixed_2021 as (
-        select *
-        from tb
-        -- Em 2021 as linhas estao duplicadas porque tem `Total` e `total`
-        -- Entao vamos excluir total
-        where ano = 2021 and localizacao in ("Total", "Urbana", "Rural")
-    ),
-    rest_without_2021 as (select * from tb where ano <> 2021)
 
 select
-    ano,
-    lower(rede) as rede,
-    lower(localizacao) as localizacao,
-    sigla_uf,
-    disciplina,
-    serie,
-    media,
-    nivel_0,
-    nivel_1,
-    nivel_2,
-    nivel_3,
-    nivel_4,
-    nivel_5,
-    nivel_6,
-    nivel_7,
-    nivel_8,
-    nivel_9,
-    nivel_10
-from
-    (
-        select *
-        from rest_without_2021
-        union all
-        select *
-        from fixed_2021
-    )
-order by ano desc
+    safe_cast(ano as int64) ano,
+    safe_cast(rede as string) rede,
+    safe_cast(localizacao as string) localizacao,
+    safe_cast(sigla_uf as string) sigla_uf,
+    safe_cast(disciplina as string) disciplina,
+    safe_cast(serie as int64) serie,
+    safe_cast(media as float64) media,
+    safe_cast(nivel_0 as float64) nivel_0,
+    safe_cast(nivel_1 as float64) nivel_1,
+    safe_cast(nivel_2 as float64) nivel_2,
+    safe_cast(nivel_3 as float64) nivel_3,
+    safe_cast(nivel_4 as float64) nivel_4,
+    safe_cast(nivel_5 as float64) nivel_5,
+    safe_cast(nivel_6 as float64) nivel_6,
+    safe_cast(nivel_7 as float64) nivel_7,
+    safe_cast(nivel_8 as float64) nivel_8,
+    safe_cast(nivel_9 as float64) nivel_9,
+    safe_cast(nivel_10 as float64) nivel_10,
+from `basedosdados-staging.br_inep_saeb_staging.uf` as t
diff --git a/models/br_inep_saeb/code/br_inep_saeb_brasil.py b/models/br_inep_saeb/code/br_inep_saeb_brasil.py
@@ -6,6 +6,7 @@
     get_nivel_serie_disciplina,
     get_disciplina_serie,
     convert_to_pd_dtype,
+    drop_empty_lines
 )
 
 CWD = os.path.dirname(os.getcwd())
@@ -111,15 +112,20 @@
 br_saeb_latest_output = (
     # apenas MT e LP
     br_saeb_latest_output.loc[br_saeb_latest_output["disciplina"].isin(["mt", "lp"])]
-    .pipe(
-        # vamos remover em_regular (Ensino Médio Integrado)
-        lambda df: df.loc[df["serie"] != "em_regular"]
-    )
     .assign(
         disciplina=lambda df: df["disciplina"].str.upper(),
         rede=lambda df: df["rede"].str.lower(),
         localizacao=lambda df: df["localizacao"].str.lower(),
-        serie=lambda df: df["serie"].replace({"em": "3", "em_integral": "4"}),
+        serie=lambda df: df["serie"].replace(
+            {
+                # em é 12
+                "em": "12",
+                # em_integral (Ensino Médio Integrado) é 13
+                "em_integral": "13",
+                # em_regular (Ensino Médio Tradicional + Integrado) é 14
+                "em_regular": "14",
+            }
+        ),
     )
 )
 
@@ -129,6 +135,8 @@
 
 br_saeb_latest_output.info()
 
+br_saeb_latest_output.shape
+
 tb = bd.Table(dataset_id="br_inep_saeb", table_id="brasil")
 
 bq_cols = tb._get_columns_from_bq(mode="prod")
@@ -143,10 +151,29 @@
 br_saeb_latest_output = br_saeb_latest_output.astype(col_dtypes)[col_dtypes.keys()]
 
 upstream_df = bd.read_sql(
-    "select * from `basedosdados.br_inep_saeb.brasil`",
+    "select * from `basedosdados-dev.br_inep_saeb.brasil` where ano <> 2021",
     billing_project_id="basedosdados-dev",
 )
 
+assert isinstance(upstream_df, pd.DataFrame)
+
+upstream_df.shape
+
+upstream_df = drop_empty_lines(upstream_df)
+
+upstream_df.shape
+
+br_saeb_latest_output.shape
+
+drop_empty_lines(br_saeb_latest_output).shape
+
 pd.concat([br_saeb_latest_output, upstream_df]).to_csv(  # type: ignore
     os.path.join(OUTPUT, "brasil.csv"), index=False
 )
+
+# Update table
+tb.create(
+    os.path.join(OUTPUT, "brasil.csv"),
+    if_table_exists="replace",
+    if_storage_data_exists="replace",
+)
diff --git a/models/br_inep_saeb/code/br_inep_saeb_municipio.py b/models/br_inep_saeb/code/br_inep_saeb_municipio.py
@@ -6,6 +6,7 @@
     get_nivel_serie_disciplina,
     get_disciplina_serie,
     convert_to_pd_dtype,
+    drop_empty_lines,
 )
 
 CWD = os.path.dirname(os.getcwd())
@@ -119,6 +120,7 @@
                 "disciplina",
                 "serie",
             ],
+            how="left",
         )
     )
     .drop(columns=["variable"])
@@ -142,24 +144,35 @@
 
 
 mun_saeb_latest_output = (
-    # apenas MT e LP
+    # Apenas MT e LP
     mun_saeb_latest_output.loc[mun_saeb_latest_output["disciplina"].isin(["mt", "lp"])]
-    .pipe(
-        # vamos remover em_regular (Ensino Médio Integrado)
-        lambda df: df.loc[df["serie"] != "em_regular"]
-    )
     .assign(
         disciplina=lambda df: df["disciplina"].str.upper(),
         rede=lambda df: df["rede"].str.lower(),
         localizacao=lambda df: df["localizacao"].str.lower(),
-        serie=lambda df: df["serie"].replace({"em": "3", "em_integral": "4"}),
         sigla_uf=lambda df: df["nome_uf"].replace(
             dict([(i["nome"], i["sigla"]) for i in bd_dirs_ufs.to_dict("records")])  # type: ignore
         ),
+        serie=lambda df: df["serie"].replace(
+            {
+                # em é 12
+                "em": "12",
+                # em_integral (Ensino Médio Integrado) é 13
+                "em_integral": "13",
+                # em_regular (Ensino Médio Tradicional + Integrado) é 14
+                "em_regular": "14",
+            }
+        ),
     )
     .drop(columns=["nome_uf"])
 )
 
+mun_saeb_latest_output.shape
+
+mun_saeb_latest_output = drop_empty_lines(mun_saeb_latest_output)
+
+mun_saeb_latest_output.shape
+
 mun_saeb_latest_output["ano"] = 2021
 
 mun_saeb_latest_output.head()
@@ -180,10 +193,25 @@
 mun_saeb_latest_output = mun_saeb_latest_output.astype(col_dtypes)[col_dtypes.keys()]
 
 upstream_df = bd.read_sql(
-    "select * from `basedosdados.br_inep_saeb.municipio`",
+    "select * from `basedosdados.br_inep_saeb.municipio` where ano <> 2021",
     billing_project_id="basedosdados-dev",
 )
 
+assert isinstance(upstream_df, pd.DataFrame)
+
+# upstream_df["serie"].unique()
+#
+# upstream_df["serie"] = upstream_df["serie"].replace({3: 12})
+
+upstream_df = drop_empty_lines(upstream_df)
+
 pd.concat([mun_saeb_latest_output, upstream_df]).to_csv(  # type: ignore
     os.path.join(OUTPUT, "municipio.csv"), index=False
 )
+
+# Update table
+tb.create(
+    os.path.join(OUTPUT, "municipio.csv"),
+    if_table_exists="replace",
+    if_storage_data_exists="replace",
+)
diff --git a/models/br_inep_saeb/code/br_inep_saeb_uf.py b/models/br_inep_saeb/code/br_inep_saeb_uf.py
@@ -6,6 +6,7 @@
     get_nivel_serie_disciplina,
     get_disciplina_serie,
     convert_to_pd_dtype,
+    drop_empty_lines,
 )
 
 CWD = os.path.dirname(os.getcwd())
@@ -114,32 +115,43 @@
     billing_project_id="basedosdados-dev",
 )
 
-
 ufs_saeb_latest_output = (
-    # apenas MT e LP
+    # Apenas MT e LP. Não sei por que não subiram outras disciplinas
     ufs_saeb_latest_output.loc[ufs_saeb_latest_output["disciplina"].isin(["mt", "lp"])]
-    .pipe(
-        # vamos remover em_regular (Ensino Médio Integrado)
-        lambda df: df.loc[df["serie"] != "em_regular"]
-    )
     .assign(
         disciplina=lambda df: df["disciplina"].str.upper(),
         rede=lambda df: df["rede"].str.lower(),
         localizacao=lambda df: df["localizacao"].str.lower(),
-        serie=lambda df: df["serie"].replace({"em": "3", "em_integral": "4"}),
+        serie=lambda df: df["serie"].replace(
+            {
+                # em é 12
+                "em": "12",
+                # em_integral (Ensino Médio Integrado) é 13
+                "em_integral": "13",
+                # em_regular (Ensino Médio Tradicional + Integrado) é 14
+                "em_regular": "14",
+            }
+        ),
         sigla_uf=lambda df: df["nome_uf"].replace(
-            dict([(i["nome"], i["sigla"]) for i in bd_dirs_ufs.to_dict("records")])
-        ),  # type: ignore
+            dict([(i["nome"], i["sigla"]) for i in bd_dirs_ufs.to_dict("records")])  # type: ignore
+        ),
     )
     .drop(columns=["nome_uf"])
 )
 
+# Add column ano = 2021
 ufs_saeb_latest_output["ano"] = 2021
 
 ufs_saeb_latest_output.head()
 
 ufs_saeb_latest_output.info()
 
+ufs_saeb_latest_output.shape
+
+drop_empty_lines(ufs_saeb_latest_output).shape
+
+ufs_saeb_latest_output = drop_empty_lines(ufs_saeb_latest_output)
+
 tb = bd.Table(dataset_id="br_inep_saeb", table_id="uf")
 
 bq_cols = tb._get_columns_from_bq(mode="prod")
@@ -154,10 +166,25 @@
 ufs_saeb_latest_output = ufs_saeb_latest_output.astype(col_dtypes)[col_dtypes.keys()]
 
 upstream_df = bd.read_sql(
-    "select * from `basedosdados.br_inep_saeb.uf`",
+    "select * from `basedosdados-dev.br_inep_saeb.uf` where ano <> 2021",
     billing_project_id="basedosdados-dev",
 )
 
+assert isinstance(upstream_df, pd.DataFrame)
+
+upstream_df["serie"].unique()
+
+upstream_df.shape
+
+upstream_df = drop_empty_lines(upstream_df)
+
 pd.concat([ufs_saeb_latest_output, upstream_df]).to_csv(  # type: ignore
     os.path.join(OUTPUT, "uf.csv"), index=False
 )
+
+# Update table
+tb.create(
+    os.path.join(OUTPUT, "uf.csv"),
+    if_table_exists="replace",
+    if_storage_data_exists="replace",
+)