
Merge branch 'main' into br_ibge_criacao_municipio
laura-l-amaral authored Sep 17, 2024
2 parents cbf82d6 + d13de34 commit ea6d1aa
Showing 456 changed files with 104,122 additions and 7,264 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/cd.yaml
@@ -98,4 +98,4 @@ jobs:
       run: poetry install --only=dev
     - name: Run script for changing metadata status
       run: |-
-        python .github/workflows/scripts/change_metadata_status.py --modified-files ${{ steps.changed-files.outputs.all_modified_files }} --graphql-url ${{ secrets.BACKEND_GRAPHQL_URL }} --status published --email ${{ secrets.BACKEND_EMAIL }} --password ${{ secrets.BACKEND_PASSWORD }}
+        poetry run python .github/workflows/scripts/change_metadata_status.py --modified-files ${{ steps.changed-files.outputs.all_modified_files }} --graphql-url ${{ secrets.BACKEND_GRAPHQL_URL }} --status published --email ${{ secrets.BACKEND_EMAIL }} --password ${{ secrets.BACKEND_PASSWORD }}
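The one-line fix above matters because the preceding step installs dependencies with `poetry install --only=dev`, which puts them in Poetry's virtualenv; a bare `python` on the runner would not see them. For orientation only, a hedged sketch of the CLI surface the step invokes — the flags are taken from the run line above, but the real `.github/workflows/scripts/change_metadata_status.py` may be implemented differently:

```python
# Illustrative sketch only: argparse flags inferred from the workflow step;
# not the repo's actual change_metadata_status.py.
import argparse

parser = argparse.ArgumentParser(description="Set metadata status on the backend via GraphQL")
parser.add_argument("--modified-files", nargs="+", help="files touched by the merge")
parser.add_argument("--graphql-url", required=True, help="backend GraphQL endpoint")
parser.add_argument("--status", default="published", help="status to apply")
parser.add_argument("--email", required=True, help="backend login email")
parser.add_argument("--password", required=True, help="backend login password")
args = parser.parse_args()
```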
34 changes: 34 additions & 0 deletions .github/workflows/check-bq-project-name.yml
@@ -0,0 +1,34 @@
---
name: Check BQ project name
on:
  workflow_dispatch:
  pull_request:
    paths: ['**/*.sql']
jobs:
  check_bucket_name:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v2
      - name: Get changed files
        id: get_files
        uses: dorny/paths-filter@v2
        with:
          list-files: shell
          filters: |
            pr:
              - added|deleted|modified: '**'
      - name: Install Python
        uses: actions/setup-python@v2
        with:
          python-version: 3.x
      - name: Run Python script
        run: |-
          for file in ${{ steps.get_files.outputs.pr_files }}; do
            if [[ $file == *.sql ]]; then
              echo "SQL file detected: $file"
              python .github/workflows/scripts/check_sql_files.py $file
            else
              echo "Not a SQL file: $file"
            fi
          done
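Because GitHub Actions runs this `run` block under bash with `-e` by default, the job fails as soon as the checker exits non-zero for the first offending file. A hypothetical alternative (not part of this commit) that checks every changed file and fails once at the end, so a PR author sees all offenders in one run:

```python
# Hypothetical aggregate runner: reports every .sql file that fails the
# check before exiting, instead of stopping at the first offender.
import subprocess
import sys

def main(files: list[str]) -> int:
    failed = False
    for file in files:
        if file.endswith(".sql"):
            print(f"SQL file detected: {file}")
            result = subprocess.run(
                [sys.executable, ".github/workflows/scripts/check_sql_files.py", file]
            )
            failed = failed or result.returncode != 0
        else:
            print(f"Not a SQL file: {file}")
    return 1 if failed else 0

if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
```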
4 changes: 2 additions & 2 deletions .github/workflows/ci-dbt.yaml
@@ -2,7 +2,7 @@
 name: CI dbt
 on:
   pull_request:
-    paths: ['**.sql', '**.yaml', '**.yml']
+    branches: [main]
 jobs:
   lint:
     name: Lint dbt
@@ -22,4 +22,4 @@ jobs:
       - name: Lint sql
         run: poetry run sqlfmt --diff .
       - name: Lint yaml
-        run: poetry run yamlfix --exclude ".kubernetes/**/*" .
+        run: poetry run yamlfix --exclude ".kubernetes/**/*" --check .
36 changes: 36 additions & 0 deletions .github/workflows/elementary.yaml
@@ -0,0 +1,36 @@
---
name: Deploy Elementary Report
on:
  push:
    branches: [main, master]
  schedule:
    - cron: 00 22 * * 1-5
  workflow_dispatch:
jobs:
  elementary:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout dbt project
        uses: actions/checkout@v3
      - name: Run Elementary
        uses: elementary-data/run-elementary-action@v1.12
        with:
          warehouse-type: bigquery
          adapter-version: 1.5.9
          profiles-yml: ${{ secrets.ELEMENTARY_PROFILES_YML }}
          edr-command: edr report --file-path "report.html" --days-back 90 && edr
            send-report --google-service-account-path "/tmp/gcs_keyfile.json" --gcs-bucket-name
            "basedosdados" --update-bucket-website "true" --days-back 90
          bigquery-keyfile: ${{ secrets.BIGQUERY_KEYFILE }}
          gcs-keyfile: ${{ secrets.GCS_KEYFILE }}
      - name: Upload report
        uses: actions/upload-artifact@v3
        with:
          name: report.html
          path: report.html
      - name: Upload log
        if: always()
        uses: actions/upload-artifact@v3
        with:
          name: edr.log
          path: edr.log
24 changes: 24 additions & 0 deletions .github/workflows/scripts/check_sql_files.py
@@ -0,0 +1,24 @@
import argparse
import os

def check_sql_files(file):
    found_staging = False
    if os.path.exists(file) and file.endswith(".sql"):
        with open(file, "r") as f:
            lines = f.readlines()
        for line in lines:
            if "basedosdados-dev" in line:
                found_staging = True
                print(f"Found 'basedosdados-dev' in {file}")
                break
    return found_staging

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Check for 'basedosdados-dev' occurrences in SQL files.")
    parser.add_argument("file", help="Path to the SQL file to check")
    args = parser.parse_args()

    if check_sql_files(args.file):
        exit(1)
    else:
        print("No occurrences of 'basedosdados-dev' found in SQL files.")
20 changes: 10 additions & 10 deletions .github/workflows/scripts/table_approve.py
@@ -111,7 +111,7 @@ def push_table_to_bq(
     Dataset(dataset_id).update(mode="prod")
     delete_storage_path = file_path.replace("./downloaded_data/", "")
     print(
-        f"DELETE HEADER FILE FROM basedosdados/staing/{dataset_id}_staging/{table_id}/{delete_storage_path}"
+        f"DELETE HEADER FILE FROM basedosdados/staging/{dataset_id}_staging/{table_id}/{delete_storage_path}"
     )
     st = Storage(dataset_id=dataset_id, table_id=table_id)
     st.delete_file(filename=delete_storage_path, mode="staging")
@@ -146,27 +146,27 @@ def save_header_files(dataset_id, table_id):
             print("Found blob: ", str(blob.name))
             print("Renamed blob: ", blob_path)
             break
-    ### save table header in storage
-
-    print(f"DOWNLOAD HEADER FILE FROM basedosdados-dev.{dataset_id}_staging.{table_id}")
-    query = f"""
-    SELECT * FROM `basedosdados-dev.{dataset_id}_staging.{table_id}` LIMIT 1
-    """
-    df = bd.read_sql(query, billing_project_id="basedosdados", from_file=True)
-    df = df.drop(columns=partitions)
-
     file_name = blob_path.split("/")[-1]
     file_type = file_name.split(".")[-1]

     path = Path(blob_path.replace(f"/{file_name}", ""))
     path.mkdir(parents=True, exist_ok=True)

+    ### save table header in storage
     if file_type == "csv":
+        print(f"DOWNLOAD HEADER FILE FROM basedosdados-dev.{dataset_id}_staging.{table_id}")
+        query = f"""
+        SELECT * FROM `basedosdados-dev.{dataset_id}_staging.{table_id}` LIMIT 1
+        """
+        df = bd.read_sql(query, billing_project_id="basedosdados", from_file=True)
+        df = df.drop(columns=partitions)
+
         file_path = f"./{path}/table_approve_temp_file_271828.csv"
         df.to_csv(file_path, index=False)
     elif file_type == "parquet":
         file_path = f"./{path}/table_approve_temp_file_271828.parquet"
         df.to_parquet(file_path)
+        blob.download_to_filename(file_path)
     print("SAVE HEADER FILE: ", file_path)
     return file_path
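The hunk above moves the one-row header query inside the CSV branch, so parquet tables skip the BigQuery round-trip and copy the staging blob directly. Note that after the move, `df` is no longer defined when the parquet branch runs, so the surviving `df.to_parquet(file_path)` call looks like a leftover. A minimal sketch of the intended flow with that call dropped (names simplified; `bd`, `blob`, `path`, and `partitions` come from the surrounding function — this is a restatement, not the repo's code verbatim):

```python
# Simplified sketch of the new save-header flow in save_header_files.
def save_header_file(file_type, path, blob, dataset_id, table_id, partitions):
    if file_type == "csv":
        # CSV: pull a single row from staging to materialize just the header.
        query = f"SELECT * FROM `basedosdados-dev.{dataset_id}_staging.{table_id}` LIMIT 1"
        df = bd.read_sql(query, billing_project_id="basedosdados", from_file=True)
        df = df.drop(columns=partitions)
        file_path = f"./{path}/table_approve_temp_file_271828.csv"
        df.to_csv(file_path, index=False)
    elif file_type == "parquet":
        # Parquet: copy the staging blob as-is; no header query needed.
        file_path = f"./{path}/table_approve_temp_file_271828.parquet"
        blob.download_to_filename(file_path)
    print("SAVE HEADER FILE: ", file_path)
    return file_path
```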
