Skip to content

Commit

Permalink
fix(ingest/bigquery): escape special characters for table descriptions (
Browse files Browse the repository at this point in the history
  • Loading branch information
AvaniSiddhapuraAPT authored Mar 5, 2024
1 parent 5bee25f commit 782d33d
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@
BigQueryTableRef,
)
from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config
from datahub.ingestion.source.bigquery_v2.bigquery_helper import (
unquote_and_decode_unicode_escape_seq,
)
from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report
from datahub.ingestion.source.bigquery_v2.bigquery_schema import (
BigqueryColumn,
Expand Down Expand Up @@ -1073,7 +1076,9 @@ def gen_dataset_workunits(

dataset_properties = DatasetProperties(
name=datahub_dataset_name.get_table_display_name(),
description=table.comment,
description=unquote_and_decode_unicode_escape_seq(table.comment)
if table.comment
else "",
qualifiedName=str(datahub_dataset_name),
created=(
TimeStamp(time=int(table.created.timestamp() * 1000))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import re
from typing import Optional


# Well-formed Python-style backslash escapes. Anything that does not match
# (a lone trailing backslash, a truncated "\u12", an unknown "\d", ...) is
# deliberately left untouched instead of raising.
_ESCAPE_SEQUENCE_PATTERN = re.compile(
    r"\\(?:"
    r"U[0-9a-fA-F]{8}"
    r"|u[0-9a-fA-F]{4}"
    r"|x[0-9a-fA-F]{2}"
    r"|[0-7]{1,3}"
    r"|N\{[A-Za-z0-9 \-]+\}"
    r"|[\\'\"abfnrtv\n]"
    r")"
)


def _decode_escape_sequence(match: "re.Match") -> str:
    """Decode one matched escape sequence, or return it unchanged if invalid."""
    token = match.group(0)
    try:
        # The token is pure ASCII by construction of the pattern, so the
        # Latin-1 based "unicode-escape" codec cannot corrupt it.
        return token.encode("ascii").decode("unicode-escape")
    except UnicodeDecodeError:
        # e.g. "\U00110000" (outside the Unicode range) or an unknown \N{...} name.
        return token


def unquote_and_decode_unicode_escape_seq(
    string: str,
    leading_quote: str = '"',
    trailing_quote: Optional[str] = None,
) -> str:
    """
    If string starts and ends with a quote, unquote it, then decode backslash
    escape sequences (e.g. ``\\u003c`` -> ``<``).

    Only the escape sequences themselves are decoded. The rest of the text is
    never round-tripped through the ``unicode-escape`` codec, because that codec
    decodes bytes as Latin-1 and would turn non-ASCII characters into mojibake.

    Args:
        string: The raw value, optionally wrapped in quotes.
        leading_quote: Quote expected at the start of ``string``.
        trailing_quote: Quote expected at the end of ``string``; falls back to
            ``leading_quote`` when not provided (or empty).

    Returns:
        The unquoted string with well-formed escape sequences decoded.
        Malformed escape sequences are kept verbatim rather than raising.
    """
    trailing_quote = trailing_quote or leading_quote

    # The length guard stops a lone quote character from being counted as both
    # the leading and the trailing quote.
    if (
        len(string) >= len(leading_quote) + len(trailing_quote)
        and string.startswith(leading_quote)
        and string.endswith(trailing_quote)
    ):
        # Slice by the actual quote lengths so multi-character quotes work.
        string = string[len(leading_quote) : len(string) - len(trailing_quote)]

    return _ESCAPE_SEQUENCE_PATTERN.sub(_decode_escape_sequence, string)
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
Expand All @@ -31,7 +32,8 @@
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
Expand All @@ -46,7 +48,8 @@
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
Expand All @@ -63,7 +66,8 @@
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
Expand All @@ -78,7 +82,8 @@
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
Expand All @@ -100,7 +105,8 @@
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
Expand All @@ -115,7 +121,8 @@
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
Expand All @@ -130,7 +137,8 @@
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
Expand All @@ -147,7 +155,8 @@
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
Expand All @@ -162,7 +171,8 @@
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
Expand All @@ -182,7 +192,8 @@
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
Expand All @@ -197,7 +208,8 @@
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
Expand Down Expand Up @@ -229,7 +241,8 @@
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
Expand All @@ -243,12 +256,14 @@
"externalUrl": "https://console.cloud.google.com/bigquery?project=project-id-1&ws=!1m5!1m4!4m3!1sproject-id-1!2sbigquery-dataset-1!3stable-1",
"name": "table-1",
"qualifiedName": "project-id-1.bigquery-dataset-1.table-1",
"description": "",
"tags": []
}
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
Expand All @@ -263,7 +278,8 @@
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
Expand All @@ -279,7 +295,8 @@
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
Expand All @@ -296,7 +313,8 @@
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
},
{
Expand All @@ -320,7 +338,8 @@
},
"systemMetadata": {
"lastObserved": 1643871600000,
"runId": "bigquery-2022_02_03-07_00_00"
"runId": "bigquery-2022_02_03-07_00_00",
"lastRunId": "no-run-id-provided"
}
}
]
36 changes: 36 additions & 0 deletions metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
BigQueryTableRef,
)
from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config
from datahub.ingestion.source.bigquery_v2.bigquery_helper import (
unquote_and_decode_unicode_escape_seq,
)
from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report
from datahub.ingestion.source.bigquery_v2.usage import BigQueryUsageExtractor
from datahub.sql_parsing.schema_resolver import SchemaResolver
Expand Down Expand Up @@ -176,3 +179,36 @@ def test_bigquery_table_sanitasitation():
assert table_identifier.dataset == "dataset-4567"
assert table_identifier.table == "foo_2016*"
assert table_identifier.get_table_display_name() == "foo"


def test_unquote_and_decode_unicode_escape_seq():
    """Check quote stripping and escape decoding over representative inputs."""
    # Each entry pairs a raw input with the value the helper must return.
    scenarios = [
        # Starts and ends with quotes and has Unicode escape sequences
        ('"Hello \\u003cWorld\\u003e"', "Hello <World>"),
        # Does not start and end with quotes
        ("Hello \\u003cWorld\\u003e", "Hello <World>"),
        # Empty string
        ("", ""),
        # No Unicode escape sequences
        ("No escape sequences here", "No escape sequences here"),
        # Starts and ends with quotes but has no escape sequences
        ('"No escape sequences here"', "No escape sequences here"),
    ]

    for raw_value, decoded_value in scenarios:
        assert unquote_and_decode_unicode_escape_seq(raw_value) == decoded_value

0 comments on commit 782d33d

Please sign in to comment.