Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

dbt: br_inep_censo_escolar.turma #520

Merged
merged 5 commits into from
Mar 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 92 additions & 0 deletions models/br_inep_censo_escolar/br_inep_censo_escolar__turma.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
{#
    Model configuration: published as `br_inep_censo_escolar.turma`,
    integer-range partitioned on `ano` and clustered on `sigla_uf`.

    BigQuery treats the `end` of an integer range as exclusive, so it must be
    one past the last year that should get its own partition. With `end` set
    to 2023, rows for ano = 2023 would fall into the __UNPARTITIONED__
    partition.
#}
{{
    config(
        alias="turma",
        schema="br_inep_censo_escolar",
        partition_by={
            "field": "ano",
            "data_type": "int64",
            "range": {"start": 2009, "end": 2024, "interval": 1},
        },
        cluster_by="sigla_uf",
    )
}}
-- Typed projection of the staging `turma` table: every column is passed
-- through safe_cast (NULL on conversion failure) and keeps its original name.
select
    -- partition / cluster keys and identifiers
    safe_cast(ano as int64) as ano,
    safe_cast(sigla_uf as string) as sigla_uf,
    safe_cast(id_municipio as string) as id_municipio,
    safe_cast(rede as string) as rede,
    safe_cast(id_escola as string) as id_escola,
    safe_cast(id_turma as string) as id_turma,
    safe_cast(etapa_ensino as string) as etapa_ensino,
    safe_cast(tipo_turma as string) as tipo_turma,
    -- schedule columns
    safe_cast(hora_inicial as int64) as hora_inicial,
    safe_cast(minuto_inicial as int64) as minuto_inicial,
    safe_cast(dia_semana_domingo as int64) as dia_semana_domingo,
    safe_cast(dia_semana_segunda as int64) as dia_semana_segunda,
    safe_cast(dia_semana_terca as int64) as dia_semana_terca,
    safe_cast(dia_semana_quarta as int64) as dia_semana_quarta,
    safe_cast(dia_semana_quinta as int64) as dia_semana_quinta,
    safe_cast(dia_semana_sexta as int64) as dia_semana_sexta,
    safe_cast(dia_semana_sabado as int64) as dia_semana_sabado,
    safe_cast(numero_dias_atividade as int64) as numero_dias_atividade,
    safe_cast(numero_duracao_turma as int64) as numero_duracao_turma,
    -- tipo_atividade_* columns
    safe_cast(tipo_atividade_1 as int64) as tipo_atividade_1,
    safe_cast(tipo_atividade_2 as int64) as tipo_atividade_2,
    safe_cast(tipo_atividade_3 as int64) as tipo_atividade_3,
    safe_cast(tipo_atividade_4 as int64) as tipo_atividade_4,
    safe_cast(tipo_atividade_5 as int64) as tipo_atividade_5,
    safe_cast(tipo_atividade_6 as int64) as tipo_atividade_6,
    safe_cast(
        id_curso_educacao_profissional as string
    ) as id_curso_educacao_profissional,
    safe_cast(quantidade_matriculas as int64) as quantidade_matriculas,
    -- disciplina_* columns
    safe_cast(disciplina_lingua_portuguesa as int64) as disciplina_lingua_portuguesa,
    safe_cast(disciplina_educacao_fisica as int64) as disciplina_educacao_fisica,
    safe_cast(disciplina_artes as int64) as disciplina_artes,
    safe_cast(disciplina_lingua_ingles as int64) as disciplina_lingua_ingles,
    safe_cast(disciplina_lingua_espanhol as int64) as disciplina_lingua_espanhol,
    safe_cast(disciplina_lingua_frances as int64) as disciplina_lingua_frances,
    safe_cast(disciplina_lingua_outra as int64) as disciplina_lingua_outra,
    safe_cast(disciplina_libras as int64) as disciplina_libras,
    safe_cast(disciplina_lingua_indigena as int64) as disciplina_lingua_indigena,
    safe_cast(disciplina_matematica as int64) as disciplina_matematica,
    safe_cast(disciplina_ciencias as int64) as disciplina_ciencias,
    safe_cast(disciplina_fisica as int64) as disciplina_fisica,
    safe_cast(disciplina_quimica as int64) as disciplina_quimica,
    safe_cast(disciplina_biologia as int64) as disciplina_biologia,
    safe_cast(disciplina_historia as int64) as disciplina_historia,
    safe_cast(disciplina_geografia as int64) as disciplina_geografia,
    safe_cast(disciplina_sociologia as int64) as disciplina_sociologia,
    safe_cast(disciplina_filosofia as int64) as disciplina_filosofia,
    safe_cast(disciplina_estudos_sociais as int64) as disciplina_estudos_sociais,
    safe_cast(disciplina_informatica_comp as int64) as disciplina_informatica_comp,
    safe_cast(disciplina_ensino_religioso as int64) as disciplina_ensino_religioso,
    safe_cast(
        disciplina_profissionalizante as int64
    ) as disciplina_profissionalizante,
    safe_cast(disciplina_pedagogicas as int64) as disciplina_pedagogicas,
    safe_cast(disciplina_outras as int64) as disciplina_outras,
    -- school-level attribute columns
    safe_cast(tipo_localizacao as string) as tipo_localizacao,
    safe_cast(
        tipo_categoria_escola_privada as string
    ) as tipo_categoria_escola_privada,
    safe_cast(conveniada_poder_publico as int64) as conveniada_poder_publico,
    safe_cast(tipo_convenio_poder_publico as string) as tipo_convenio_poder_publico,
    safe_cast(mantenedora_privada_emp as int64) as mantenedora_privada_emp,
    safe_cast(mantenedora_privada_ong as int64) as mantenedora_privada_ong,
    safe_cast(mantenedora_privada_sind as int64) as mantenedora_privada_sind,
    safe_cast(mantenedora_privada_sist_s as int64) as mantenedora_privada_sist_s,
    safe_cast(mantenedora_privada_s_fins as int64) as mantenedora_privada_s_fins,
    safe_cast(tipo_regulamentacao as string) as tipo_regulamentacao,
    safe_cast(
        tipo_localizacao_diferenciada as string
    ) as tipo_localizacao_diferenciada,
    safe_cast(educacao_indigena as int64) as educacao_indigena,
    -- remaining integer columns
    safe_cast(braille as int64) as braille,
    safe_cast(recursos_baixa_visao as int64) as recursos_baixa_visao,
    safe_cast(processos_mentais as int64) as processos_mentais,
    safe_cast(orientacao_mobilidade as int64) as orientacao_mobilidade,
    safe_cast(sinais as int64) as sinais,
    safe_cast(comunicacao_alt_aument as int64) as comunicacao_alt_aument,
    safe_cast(enriquecimento_curricular as int64) as enriquecimento_curricular,
    safe_cast(soroban as int64) as soroban,
    safe_cast(informatica_acessivel as int64) as informatica_acessivel,
    safe_cast(port_escrita as int64) as port_escrita,
    safe_cast(autonomia_escolar as int64) as autonomia_escolar,
    safe_cast(
        disciplina_atendimento_especiais as int64
    ) as disciplina_atendimento_especiais,
    safe_cast(
        disciplina_diver_socio_cultural as int64
    ) as disciplina_diver_socio_cultural
from `basedosdados-staging.br_inep_censo_escolar_staging.turma` as t
114 changes: 114 additions & 0 deletions models/br_inep_censo_escolar/code/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
import os
import io
import requests
import basedosdados as bd
import pandas as pd
import numpy as np

# Local working directories, resolved against the current working directory.
# Raw files are downloaded into INPUT; the partitioned CSVs go under OUTPUT.
INPUT = os.path.join(os.getcwd(), "input")
OUTPUT = os.path.join(os.getcwd(), "output")

for directory in (INPUT, OUTPUT):
    os.makedirs(directory, exist_ok=True)

st = bd.Storage(dataset_id="br_inep_censo_escolar", table_id="turma")

# Full listing of the objects stored under the raw prefix of this table.
blobs = list(st.bucket.list_blobs(prefix="raw/br_inep_censo_escolar/turma/"))

for blob in blobs:
    # Keep only the last path segment so the file lands directly in INPUT.
    filename = blob.name.split("/")[-1]
    # Only objects ending in upper-case ".CSV" are fetched.
    if not filename.endswith(".CSV"):
        continue
    blob.download_to_filename(filename=os.path.join(INPUT, filename))


# Load the yearly raw files TURMAS_<year>.CSV (semicolon-separated) for 2021
# through 2023 into a dict keyed by the year as a string.
dfs = {}
for year in range(2021, 2023 + 1):
    raw_path = os.path.join(INPUT, f"TURMAS_{year}.CSV")
    dfs[str(year)] = pd.read_csv(raw_path, sep=";")


# Download the architecture spreadsheet as CSV. Its "name",
# "original_name_2020" and "bigquery_type" columns are read further down in
# this script.
arch_response = requests.get(
    "https://docs.google.com/spreadsheets/d/1qRf25hLSPYX-bSSyffk0DJP_C_mpCHngDY2x_kIohVo/export?format=csv",
    timeout=10,
)
# Fail fast on an HTTP error instead of parsing an error page as if it were
# the architecture CSV.
arch_response.raise_for_status()

# Every cell is read as a string; only empty cells are added as NA markers.
arch = pd.read_csv(
    io.StringIO(arch_response.content.decode("utf-8")),
    dtype=str,
    na_values="",
)

# Architecture rows that are not flagged "(deletado)" and that have a value
# in "original_name_2020".
arch_kept = arch.loc[
    (arch["name"] != "(deletado)") & (arch["original_name_2020"].notna())
]

# Mapping from the raw column name to the architecture column name.
renames = dict(zip(arch_kept["original_name_2020"], arch_kept["name"]))

# Architecture column names for those same rows, in spreadsheet order.
arch_cols = arch_kept["name"].to_list()


def _harmonize(frame):
    """Rename raw columns via `renames`, then keep the `arch_cols` present."""
    present = {old: new for old, new in renames.items() if old in frame.columns}
    renamed = frame.rename(columns=present, errors="raise")
    return renamed[[col for col in arch_cols if col in renamed.columns]]


# Stack the harmonized yearly frames in the order they were loaded.
df = pd.concat([_harmonize(frame) for frame in dfs.values()])

del dfs  # need memory

# Every architecture column that is not flagged "(deletado)".
all_cols = arch.loc[arch["name"] != "(deletado)", "name"].to_list()

# Architecture columns that the concatenated frame does not have.
cols_missing = list(set(all_cols).difference(df.columns))

string_cols = arch.loc[arch["bigquery_type"] == "STRING", "name"]
int_cols = arch.loc[arch["bigquery_type"] == "INT64", "name"]

for col in string_cols:
    if col not in df.columns:
        continue
    # NOTE: fillna("") is needed because the astype("Int64").astype("string")
    # coercion produces <NA>, and when the CSV is saved <NA> is not written
    # as an empty value (i.e. ""); it is written as <NA>, which BigQuery
    # then interprets as a string.
    df[col] = df[col].astype("Int64").astype("string").fillna("")  # type: ignore

for col in int_cols:
    if col in df.columns:
        df[col] = df[col].astype("Int64")

# Add the absent architecture columns, filled with NaN.
for col in cols_missing:
    df[col] = np.nan

tb = bd.Table(dataset_id="br_inep_censo_escolar", table_id="turma")

# Column metadata for the table; the "partition_columns" and "columns"
# entries are the ones read below.
bq_cols = tb._get_columns_from_bq()

partitions = [i["name"] for i in bq_cols["partition_columns"]]

# Lookup table with the id_uf and sigla of each UF.
bd_dir = bd.read_sql(
    "SELECT id_uf, sigla FROM `basedosdados.br_bd_diretorios_brasil.uf`",
    billing_project_id="basedosdados-dev",
)

# Replace id_uf values found in "sigla_uf" by the matching sigla. Values
# without a match in the lookup are left unchanged by `replace`.
df["sigla_uf"] = df["sigla_uf"].replace(  # type: ignore
    {i["id_uf"]: i["sigla"] for i in bd_dir.to_dict("records")}  # type: ignore
)

# Column order used when writing the CSV files.
bq_storage_cols_order = [i["name"] for i in bq_cols["columns"]]

# Write one CSV per partition group under OUTPUT/ano=<ano>/sigla_uf=<sigla_uf>/.
# The unpacking requires `partitions` to yield exactly two keys per group,
# bound to ano and sigla_uf in that order.
for (ano, sigla_uf), partition_df in df.groupby(partitions):  # type: ignore
    partition_dir = os.path.join(OUTPUT, f"ano={ano}", f"sigla_uf={sigla_uf}")
    os.makedirs(partition_dir, exist_ok=True)
    csv_path = os.path.join(partition_dir, f"{ano}_{sigla_uf}.csv")
    # The partition keys are already encoded in the path, so they are dropped
    # before the remaining columns are put in the storage column order.
    payload = partition_df.drop(columns=["ano", "sigla_uf"])[bq_storage_cols_order]  # type: ignore
    payload.to_csv(csv_path, index=False)


# Create the table from the OUTPUT tree, replacing any existing table and
# storage data.
tb.create(OUTPUT, if_table_exists="replace", if_storage_data_exists="replace")
Loading
Loading