Skip to content

Commit

Permalink
Merge branch 'master' into sqlglot-bump
Browse files Browse the repository at this point in the history
  • Loading branch information
hsheth2 authored Oct 17, 2024
2 parents a6e7ef9 + 6f19322 commit e63c233
Show file tree
Hide file tree
Showing 98 changed files with 7,338 additions and 8,660 deletions.
9 changes: 4 additions & 5 deletions .github/workflows/build-and-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,17 +67,16 @@ jobs:
timezoneLinux: ${{ matrix.timezone }}
- name: Check out the repo
uses: acryldata/sane-checkout-action@v3
- uses: actions/setup-python@v5
with:
python-version: "3.10"
cache: pip
- name: Set up JDK 17
uses: actions/setup-java@v4
with:
distribution: "zulu"
java-version: 17
- uses: gradle/actions/setup-gradle@v3
- uses: actions/setup-python@v5
if: ${{ needs.setup.outputs.ingestion_change == 'true' }}
with:
python-version: "3.10"
cache: pip
- name: Gradle build (and test) for NOT metadata ingestion
if: ${{ matrix.command == 'except_metadata_ingestion' && needs.setup.outputs.backend_change == 'true' }}
run: |
Expand Down
3 changes: 3 additions & 0 deletions docker/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,9 @@ dockerCompose {
isRequiredBy(tasks.named('quickstartDebug'))
composeAdditionalArgs = ['--profile', 'debug']

if (System.getenv().containsKey("DATAHUB_VERSION")) {
environment.put 'DATAHUB_VERSION', System.getenv("DATAHUB_VERSION")
}
environment.put 'DATAHUB_TELEMETRY_ENABLED', 'false' // disabled when built locally

useComposeFiles = ['profiles/docker-compose.yml']
Expand Down
2 changes: 1 addition & 1 deletion docker/profiles/docker-compose.frontend.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ x-datahub-frontend-service: &datahub-frontend-service

x-datahub-frontend-service-dev: &datahub-frontend-service-dev
<<: *datahub-frontend-service
image: ${DATAHUB_FRONTEND_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-frontend-react}:debug
image: ${DATAHUB_FRONTEND_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-frontend-react}:${DATAHUB_VERSION:-debug}
ports:
- ${DATAHUB_MAPPED_FRONTEND_DEBUG_PORT:-5002}:5002
- ${DATAHUB_MAPPED_FRONTEND_PORT:-9002}:9002
Expand Down
8 changes: 4 additions & 4 deletions docker/profiles/docker-compose.gms.yml
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ x-datahub-system-update-service: &datahub-system-update-service

x-datahub-system-update-service-dev: &datahub-system-update-service-dev
<<: *datahub-system-update-service
image: ${DATAHUB_UPGRADE_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-upgrade}:debug
image: ${DATAHUB_UPGRADE_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-upgrade}:${DATAHUB_VERSION:-debug}
ports:
- ${DATAHUB_MAPPED_UPGRADE_DEBUG_PORT:-5003}:5003
environment: &datahub-system-update-dev-env
Expand Down Expand Up @@ -115,7 +115,7 @@ x-datahub-gms-service: &datahub-gms-service

x-datahub-gms-service-dev: &datahub-gms-service-dev
<<: *datahub-gms-service
image: ${DATAHUB_GMS_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-gms}:debug
image: ${DATAHUB_GMS_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-gms}:${DATAHUB_VERSION:-debug}
ports:
- ${DATAHUB_MAPPED_GMS_DEBUG_PORT:-5001}:5001
- ${DATAHUB_MAPPED_GMS_PORT:-8080}:8080
Expand Down Expand Up @@ -159,7 +159,7 @@ x-datahub-mae-consumer-service: &datahub-mae-consumer-service

x-datahub-mae-consumer-service-dev: &datahub-mae-consumer-service-dev
<<: *datahub-mae-consumer-service
image: ${DATAHUB_MAE_CONSUMER_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-mae-consumer}:debug
image: ${DATAHUB_MAE_CONSUMER_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-mae-consumer}:${DATAHUB_VERSION:-debug}
environment:
<<: [*datahub-dev-telemetry-env, *datahub-mae-consumer-env]
volumes:
Expand All @@ -185,7 +185,7 @@ x-datahub-mce-consumer-service: &datahub-mce-consumer-service

x-datahub-mce-consumer-service-dev: &datahub-mce-consumer-service-dev
<<: *datahub-mce-consumer-service
image: ${DATAHUB_MCE_CONSUMER_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-mce-consumer}:debug
image: ${DATAHUB_MCE_CONSUMER_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-mce-consumer}:${DATAHUB_VERSION:-debug}
environment:
<<: [*datahub-dev-telemetry-env, *datahub-mce-consumer-env]
volumes:
Expand Down
10 changes: 5 additions & 5 deletions docker/profiles/docker-compose.prerequisites.yml
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ services:
mysql-setup-dev:
<<: *mysql-setup
profiles: *mysql-profiles-dev
image: ${DATAHUB_MYSQL_SETUP_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-mysql-setup}:debug
image: ${DATAHUB_MYSQL_SETUP_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-mysql-setup}:${DATAHUB_VERSION:-debug}
postgres:
profiles: *postgres-profiles
hostname: postgres
Expand Down Expand Up @@ -166,7 +166,7 @@ services:
postgres-setup-dev:
<<: *postgres-setup
profiles: *postgres-profiles-dev
image: ${DATAHUB_POSTGRES_SETUP_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-postgres-setup}:debug
image: ${DATAHUB_POSTGRES_SETUP_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-postgres-setup}:${DATAHUB_VERSION:-debug}
cassandra:
profiles: *cassandra-profiles
hostname: cassandra
Expand Down Expand Up @@ -272,7 +272,7 @@ services:
environment:
<<: *kafka-setup-env
DATAHUB_PRECREATE_TOPICS: ${DATAHUB_PRECREATE_TOPICS:-true}
image: ${DATAHUB_KAFKA_SETUP_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-kafka-setup}:debug
image: ${DATAHUB_KAFKA_SETUP_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-kafka-setup}:${DATAHUB_VERSION:-debug}
elasticsearch:
profiles: *elasticsearch-profiles
hostname: search
Expand All @@ -296,7 +296,7 @@ services:
volumes:
- esdata:/usr/share/elasticsearch/data
elasticsearch-setup-dev: &elasticsearch-setup-dev
image: ${DATAHUB_ELASTIC_SETUP_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-elasticsearch-setup}:debug
image: ${DATAHUB_ELASTIC_SETUP_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-elasticsearch-setup}:${DATAHUB_VERSION:-debug}
profiles: *elasticsearch-profiles
hostname: elasticsearch-setup
env_file: elasticsearch-setup/env/docker.env
Expand Down Expand Up @@ -347,7 +347,7 @@ services:
<<: *opensearch-setup
profiles: *opensearch-profiles-dev
hostname: opensearch-setup-dev
image: ${DATAHUB_ELASTIC_SETUP_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-elasticsearch-setup}:debug
image: ${DATAHUB_ELASTIC_SETUP_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-elasticsearch-setup}:${DATAHUB_VERSION:-debug}
environment:
<<: *search-datastore-environment
USE_AWS_ELASTICSEARCH: ${USE_AWS_ELASTICSEARCH:-true}
Expand Down
19 changes: 18 additions & 1 deletion docs/how/updating-datahub.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,31 @@ This file documents any backwards-incompatible changes in DataHub and assists pe

## Next

- #11560 - The PowerBI ingestion source configuration option `include_workspace_name_in_dataset_urn` determines whether the workspace name is included in the PowerBI dataset's URN.<br/> PowerBI allows semantic models and their tables to have identical names across workspaces, which will cause the semantic model to be overwritten in the case of multi-workspace ingestion.<br/>
Entity urn with `include_workspace_name_in_dataset_urn: false`
```
urn:li:dataset:(urn:li:dataPlatform:powerbi,[<PlatformInstance>.]<SemanticModelName>.<TableName>,<ENV>)
```

Entity urn with `include_workspace_name_in_dataset_urn: true`
```
urn:li:dataset:(urn:li:dataPlatform:powerbi,[<PlatformInstance>.].<WorkspaceName>.<SemanticModelName>.<TableName>,<ENV>)
```

The config `include_workspace_name_in_dataset_urn` defaults to `false` for backward compatibility. However, we recommend enabling this flag after performing the necessary cleanup.
If stateful ingestion is enabled, running ingestion with the latest CLI version will handle the cleanup automatically. Otherwise, we recommend soft deleting all powerbi data via the DataHub CLI:
`datahub delete --platform powerbi --soft` and then re-ingest with the latest CLI version, ensuring the `include_workspace_name_in_dataset_urn` configuration is set to true.

### Breaking Changes

- #11486 - Deprecated Criterion filters using `value`. Use `values` instead. This also deprecates the ability to use comma delimited string to represent multiple values using `value`.
- #11484 - Metadata service authentication enabled by default
- #11484 - Rest API authorization enabled by default
- #10472 - `SANDBOX` added as a FabricType. No rollbacks allowed once metadata with this fabric type is added without manual cleanups in databases.
- #11619 - schema field/column paths can no longer be empty strings
- #11619 - schema field/column paths can no longer be duplicated within the schema
- #11570 - The `DatahubClientConfig`'s server field no longer defaults to `http://localhost:8080`. Be sure to explicitly set this.
- #11570 - If a `datahub_api` is explicitly passed to a stateful ingestion config provider, it will be used. We previously ignored it if the pipeline context also had a graph object.

### Potential Downtime

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,12 +78,6 @@ class Constant:
# Default config constants
DEFAULT_DATAHUB_REST_URL = "http://localhost:8080"

# Environment variable contants
DATAHUB_REST_URL = "DATAHUB_REST_URL"
DATAHUB_ENV = "DATAHUB_ENV"
DATAHUB_PLATFORM_INSTANCE = "DATAHUB_PLATFORM_INSTANCE"
DAGSTER_UI_URL = "DAGSTER_UI_URL"

# Datahub inputs/outputs constant
DATAHUB_INPUTS = "datahub.inputs"
DATAHUB_OUTPUTS = "datahub.outputs"
Expand Down Expand Up @@ -154,7 +148,6 @@ class DatasetLineage(NamedTuple):

class DatahubDagsterSourceConfig(DatasetSourceConfigMixin):
datahub_client_config: DatahubClientConfig = pydantic.Field(
default=DatahubClientConfig(),
description="Datahub client config",
)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import traceback
import warnings
from collections import defaultdict
from types import ModuleType
from typing import Dict, List, NamedTuple, Optional, Sequence, Set, Tuple, Union
Expand Down Expand Up @@ -38,7 +39,7 @@
from dagster._core.events import DagsterEventType, HandledOutputData, LoadedInputData
from dagster._core.execution.stats import RunStepKeyStatsSnapshot
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.graph.client import DataHubGraph
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
from datahub.metadata.schema_classes import SubTypesClass
from datahub.sql_parsing.sqlglot_lineage import (
SqlParsingResult,
Expand All @@ -47,6 +48,7 @@
from datahub.utilities.urns.dataset_urn import DatasetUrn

from datahub_dagster_plugin.client.dagster_generator import (
Constant,
DagsterEnvironment,
DagsterGenerator,
DatahubDagsterSourceConfig,
Expand Down Expand Up @@ -182,7 +184,17 @@ def __init__(
if config:
self.config = config
else:
self.config = DatahubDagsterSourceConfig()
# This is a temporary warning for backwards compatibility. Eventually, we'll remove this
# branch and make the config required.
warnings.warn(
"Using the default DataHub client config is deprecated. Pass in a config object explicitly.",
stacklevel=2,
)
self.config = DatahubDagsterSourceConfig(
datahub_client_config=DatahubClientConfig(
server=Constant.DEFAULT_DATAHUB_REST_URL
)
)
self.graph = DataHubGraph(
self.config.datahub_client_config,
)
Expand Down
27 changes: 27 additions & 0 deletions metadata-ingestion/examples/mce_files/bootstrap_mce.json
Original file line number Diff line number Diff line change
Expand Up @@ -3613,6 +3613,33 @@
},
"systemMetadata": null
},
{
"entityType": "post",
"entityUrn": "urn:li:post:f3a68539-f7e4-4c41-a4fd-9e57c085d8de",
"changeType": "UPSERT",
"aspectName": "postInfo",
"aspect": {
"json": {
"type": "HOME_PAGE_ANNOUNCEMENT",
"content": {
"title": "Join Metadata & AI Summit 2024",
"type": "LINK",
"link": "http://www.acryldata.io/conference?utm_source=datahub_quickstart&utm_medium=metadata_ai_2024&utm_campaign=pinned_announcement",
"media": {
"type": "IMAGE",
"location": "https://formulatedby.com/wp-content/uploads/2024/07/0193320a6d93e7508d1598f7b24662f75a87e92f-352x456-1.svg"
}
},
"created": 1712547125049,
"lastModified": 1712547125049
}
},
"systemMetadata": {
"lastObserved": 1712548844816,
"runId": "datahub-2024_04_08-13_00_44",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "post",
"entityUrn": "urn:li:post:f3a68539-f7e4-4c41-a4fd-9e57c085d8dd",
Expand Down
15 changes: 9 additions & 6 deletions metadata-ingestion/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,13 @@
"Authlib",
}

superset_common = {
"requests",
"sqlalchemy",
"great_expectations",
"greenlet",
}

# Note: for all of these, framework_common will be added.
plugins: Dict[str, Set[str]] = {
# Sink plugins.
Expand Down Expand Up @@ -462,12 +469,8 @@
"sqlalchemy": sql_common,
"sql-queries": usage_common | sqlglot_lib,
"slack": slack,
"superset": {
"requests",
"sqlalchemy",
"great_expectations",
"greenlet",
},
"superset": superset_common,
"preset": superset_common,
# FIXME: I don't think tableau uses sqllineage anymore so we should be able
# to remove that dependency.
"tableau": {"tableauserverclient>=0.24.0"} | sqllineage_lib | sqlglot_lib,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ class DataProduct(ConfigModel):
@pydantic.validator("assets", each_item=True)
def assets_must_be_urns(cls, v: str) -> str:
try:
Urn.create_from_string(v)
Urn.from_string(v)
except Exception as e:
raise ValueError(f"asset {v} is not an urn: {e}") from e

Expand Down
2 changes: 1 addition & 1 deletion metadata-ingestion/src/datahub/ingestion/graph/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ class DatahubClientConfig(ConfigModel):

# TODO: Having a default for the server doesn't make a ton of sense. This should be handled
# by callers / the CLI, but the actual client should not have any magic.
server: str = "http://localhost:8080"
server: str
token: Optional[str] = None
timeout_sec: Optional[int] = None
retry_status_codes: Optional[List[int]] = None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,6 @@
# We can't use close as it is not called if the ingestion is not successful
def cleanup(config: BigQueryV2Config) -> None:
if config._credentials_path is not None:
logger.debug(
f"Deleting temporary credential file at {config._credentials_path}"
)
os.unlink(config._credentials_path)


Expand Down Expand Up @@ -188,6 +185,7 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config):
self.sql_parser_schema_resolver,
self.profiler,
self.identifiers,
self.ctx.graph,
)

self.add_config_to_report()
Expand Down
Loading

0 comments on commit e63c233

Please sign in to comment.