Skip to content

Commit

Permalink
support retaining platform instance casing
Browse files Browse the repository at this point in the history
  • Loading branch information
hsheth2 committed Oct 31, 2024
1 parent 439e908 commit b73c43c
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 2 deletions.
23 changes: 21 additions & 2 deletions metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,9 @@ def schema_count(self) -> int:
)[0][0]
)

def get_urn_for_table(self, table: _TableName, lower: bool = False) -> str:
def get_urn_for_table(
self, table: _TableName, lower: bool = False, mixed: bool = False
) -> str:
# TODO: Validate that this is the correct 2/3 layer hierarchy for the platform.

table_name = ".".join(
Expand All @@ -101,7 +103,10 @@ def get_urn_for_table(self, table: _TableName, lower: bool = False) -> str:

if lower:
table_name = table_name.lower()
platform_instance = platform_instance.lower() if platform_instance else None
if not mixed:
platform_instance = (
platform_instance.lower() if platform_instance else None
)

if self.platform == "bigquery":
# Normalize shard numbers and other BigQuery weirdness.
Expand Down Expand Up @@ -131,6 +136,20 @@ def resolve_table(self, table: _TableName) -> Tuple[str, Optional[SchemaInfo]]:
if schema_info:
return urn_lower, schema_info

# Our treatment of platform instances when lowercasing urns
# is inconsistent. In some places (e.g. Snowflake), we lowercase
# the table names but not the platform instance. In other places
# (e.g. Databricks), we lowercase everything because it happens
# via the automatic lowercasing helper.
# See https://github.com/datahub-project/datahub/pull/8928.
# While we have this sort of inconsistency, we should also
# check the mixed-case urn (lowercased table name, platform
# instance in its original casing) as a last resort.
urn_mixed = self.get_urn_for_table(table, lower=True, mixed=True)
if urn_mixed not in {urn, urn_lower}:
schema_info = self._resolve_schema_info(urn_mixed)
if schema_info:
return urn_mixed, schema_info

if self._prefers_urn_lower():
return urn_lower, None
else:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,11 @@ def test_get_urn_for_table_lowercase():
== "urn:li:dataset:(urn:li:dataPlatform:mssql,uppercased-instance.database.dataset.table,PROD)"
)

assert (
schema_resolver.get_urn_for_table(table=table, lower=True, mixed=True)
== "urn:li:dataset:(urn:li:dataPlatform:mssql,Uppercased-Instance.database.dataset.table,PROD)"
)


def test_get_urn_for_table_not_lower_should_keep_capital_letters():
schema_resolver = SchemaResolver(
Expand Down

0 comments on commit b73c43c

Please sign in to comment.