feat: add tags at table level (#12)
* feat: add tags at column level

* chore: remove set

* feat: add enum, refactor models file

* fix: use retrocompatible typing_extensions

* feat: add settings.yml
pquadri authored Sep 5, 2024
1 parent a841431 commit 1f1be19
Showing 20 changed files with 9,228 additions and 4,544 deletions.
12 changes: 12 additions & 0 deletions .github/settings.yml
@@ -0,0 +1,12 @@
branches:
  - name: main
    protection:
      required_status_checks:
        strict: true
        contexts: []
      required_pull_request_reviews:
        dismiss_stale_reviews: true
        require_code_owner_reviews: true
        required_approving_review_count: 1
      restrictions: null
      enforce_admins: true
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -5,7 +5,7 @@ on:
tags-ignore:
  - "*.*.*"

name: tests
name: ci

concurrency:
  group: tests
1 change: 1 addition & 0 deletions .gitignore
@@ -158,3 +158,4 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.envrc
245 changes: 215 additions & 30 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -7,7 +7,7 @@ readme = "README.md"

[tool.poetry.dependencies]
python = ">=3.10,<4.0"
snowflake-connector-python = "^3.7.1"
snowflake-connector-python = {extras = ["secure-local-storage"], version = "^3.12.1"}
pydantic-settings = "^2.2.1"
typer = "^0.12.0"

11 changes: 6 additions & 5 deletions snowflake_utils/__main__.py
@@ -1,10 +1,11 @@
import logging
import os

import typer
from typing_extensions import Annotated

from .models import FileFormat, InlineFileFormat, Table, Schema, Column
from .queries import connect
import logging
import os
from ..snowflake_utils.settings import SnowflakeSettings
from .models import Column, FileFormat, InlineFileFormat, Schema, Table

app = typer.Typer()

@@ -42,7 +43,7 @@ def mass_single_column_update(
    new_column = Column(name=new_column, data_type=data_type)
    log_level = os.getenv("LOG_LEVEL", "INFO")
    logging.getLogger("snowflake-utils").setLevel(log_level)
    with connect() as conn, conn.cursor() as cursor:
    with SnowflakeSettings.connect() as conn, conn.cursor() as cursor:
        tables = db_schema.get_tables(cursor=cursor)
        for table in tables:
            columns = table.get_columns(cursor=cursor)
17 changes: 17 additions & 0 deletions snowflake_utils/models/__init__.py
@@ -0,0 +1,17 @@
from .column import Column
from .enums import MatchByColumnName, TagLevel
from .file_format import FileFormat, InlineFileFormat
from .schema import Schema
from .table import Table
from .table_structure import TableStructure

__all__ = [
    "Column",
    "MatchByColumnName",
    "TagLevel",
    "Schema",
    "Table",
    "TableStructure",
    "FileFormat",
    "InlineFileFormat",
]
43 changes: 43 additions & 0 deletions snowflake_utils/models/column.py
@@ -0,0 +1,43 @@
from datetime import date, datetime

from pydantic import BaseModel, Field


class Column(BaseModel):
    name: str
    data_type: str
    tags: dict[str, str] = Field(default_factory=dict)


def _possibly_cast(s: str, old_column_type: str, new_column_type: str) -> str:
    if old_column_type == "VARIANT" and new_column_type != "VARIANT":
        return f"PARSE_JSON({s})"
    return s


def _matched(columns: list[Column], old_columns: dict[str, str]):
    def tmp(x: str) -> str:
        return f'tmp."{x}"'

    return ",".join(
        f'dest."{c.name}" = {_possibly_cast(tmp(c.name), old_columns.get(c.name), c.data_type)}'
        for c in columns
    )


def _inserts(columns: list[Column], old_columns: dict[str, str]) -> str:
    return ",".join(
        _possibly_cast(f'tmp."{c.name}"', old_columns.get(c.name), c.data_type)
        for c in columns
    )


def _type_cast(s: any) -> any:
    if isinstance(s, (int, float)):
        return str(s)
    elif isinstance(s, str):
        return f"'{s}'"
    elif isinstance(s, (datetime, date)):
        return f"'{s.isoformat()}'"
    else:
        return f"'{s}'"
12 changes: 12 additions & 0 deletions snowflake_utils/models/enums.py
@@ -0,0 +1,12 @@
from enum import Enum


class MatchByColumnName(Enum):
    CASE_SENSITIVE = "CASE_SENSITIVE"
    CASE_INSENSITIVE = "CASE_INSENSITIVE"
    NONE = "NONE"


class TagLevel(Enum):
    COLUMN = "column"
    TABLE = "table"
30 changes: 30 additions & 0 deletions snowflake_utils/models/file_format.py
@@ -0,0 +1,30 @@
from pydantic import BaseModel
from typing_extensions import Self


class InlineFileFormat(BaseModel):
    definition: str


class FileFormat(BaseModel):
    database: str | None = None
    schema_: str | None = None
    name: str

    def __str__(self) -> str:
        return ".".join(
            s for s in [self.database, self.schema_, self.name] if s is not None
        )

    @classmethod
    def from_string(cls, s: str) -> Self:
        s = s.split(".")
        match s:
            case [database, schema, name]:
                return cls(database=database, schema_=schema, name=name)
            case [schema, name]:
                return cls(schema_=schema, name=name)
            case [name]:
                return cls(name=name)
            case _:
                raise ValueError("Cannot parse file format")
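
A quick illustration of the parsing behavior added here; the format names are hypothetical.

# Illustrative only: the format names are made up.
from snowflake_utils.models.file_format import FileFormat

fmt = FileFormat.from_string("RAW.PUBLIC.MY_CSV_FORMAT")
print(fmt.database, fmt.schema_, fmt.name)  # RAW PUBLIC MY_CSV_FORMAT
print(str(fmt))  # RAW.PUBLIC.MY_CSV_FORMAT
print(str(FileFormat.from_string("MY_CSV_FORMAT")))  # MY_CSV_FORMAT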
26 changes: 26 additions & 0 deletions snowflake_utils/models/schema.py
@@ -0,0 +1,26 @@
from pydantic import BaseModel
from snowflake.connector.cursor import SnowflakeCursor

from .table import Table


class Schema(BaseModel):
    name: str
    database: str | None = None

    @property
    def fully_qualified_name(self):
        if self.database:
            return f"{self.database}.{self.name}"
        else:
            return self.name

    def get_tables(self, cursor: SnowflakeCursor):
        cursor.execute(f"show tables in schema {self.fully_qualified_name};")
        data = cursor.execute(
            'select "name", "database_name", "schema_name" FROM TABLE(RESULT_SCAN(LAST_QUERY_ID()));'
        ).fetchall()
        return [
            Table(name=name, schema_=schema, database=database)
            for (name, database, schema, *_) in data
        ]
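
A minimal usage sketch for Schema.get_tables, assuming an already configured Snowflake connection; the connection parameters below are placeholders.

# Illustrative only: connection parameters are placeholders.
import snowflake.connector

from snowflake_utils.models import Schema

conn = snowflake.connector.connect(
    account="my_account", user="my_user", password="***", database="RAW"
)
with conn.cursor() as cursor:
    for table in Schema(name="PUBLIC", database="RAW").get_tables(cursor=cursor):
        print(table.name)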