diff --git a/.github/workflows/airflow-plugin.yml b/.github/workflows/airflow-plugin.yml index e75bafdac9628..66a08dc63aa0d 100644 --- a/.github/workflows/airflow-plugin.yml +++ b/.github/workflows/airflow-plugin.yml @@ -54,6 +54,9 @@ jobs: - python-version: "3.11" extra_pip_requirements: "apache-airflow~=2.9.3 -c https://raw.githubusercontent.com/apache/airflow/constraints-2.9.3/constraints-3.11.txt" extra_pip_extras: plugin-v2 + - python-version: "3.11" + extra_pip_requirements: "apache-airflow~=2.10.2 -c https://raw.githubusercontent.com/apache/airflow/constraints-2.10.2/constraints-3.11.txt" + extra_pip_extras: plugin-v2 fail-fast: false steps: - name: Set up JDK 17 diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml index ceee59215e431..d1c16b567158a 100644 --- a/.github/workflows/docker-unified.yml +++ b/.github/workflows/docker-unified.yml @@ -480,6 +480,39 @@ jobs: context: . file: ./docker/kafka-setup/Dockerfile platforms: linux/amd64,linux/arm64/v8 + kafka_setup_scan: + permissions: + contents: read # for actions/checkout to fetch code + security-events: write # for github/codeql-action/upload-sarif to upload SARIF results + actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status + name: "[Monitoring] Scan Kafka Setup images for vulnerabilities" + runs-on: ubuntu-latest + needs: [ setup, kafka_setup_build ] + if: ${{ needs.setup.outputs.kafka_setup_change == 'true' || (needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true') }} + steps: + - name: Checkout # adding checkout step just to make trivy upload happy + uses: acryldata/sane-checkout-action@v3 + - name: Download image + uses: ishworkh/docker-image-artifact-download@v1 + if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' }} + with: + image: ${{ env.DATAHUB_KAFKA_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} + - name: Run Trivy vulnerability scanner + uses: aquasecurity/trivy-action@0.25.0 + env: + TRIVY_OFFLINE_SCAN: true + with: + image-ref: ${{ env.DATAHUB_KAFKA_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} + format: "template" + template: "@/contrib/sarif.tpl" + output: "trivy-results.sarif" + severity: "CRITICAL,HIGH" + ignore-unfixed: true + vuln-type: "os,library" + - name: Upload Trivy scan results to GitHub Security tab + uses: github/codeql-action/upload-sarif@v2 + with: + sarif_file: "trivy-results.sarif" mysql_setup_build: name: Build and Push DataHub MySQL Setup Docker Image @@ -501,6 +534,39 @@ jobs: context: . file: ./docker/mysql-setup/Dockerfile platforms: linux/amd64,linux/arm64/v8 + mysql_setup_scan: + permissions: + contents: read # for actions/checkout to fetch code + security-events: write # for github/codeql-action/upload-sarif to upload SARIF results + actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status + name: "[Monitoring] Scan MySQL Setup images for vulnerabilities" + runs-on: ubuntu-latest + needs: [ setup, mysql_setup_build ] + if: ${{ needs.setup.outputs.mysql_setup_change == 'true' || (needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true') }} + steps: + - name: Checkout # adding checkout step just to make trivy upload happy + uses: acryldata/sane-checkout-action@v3 + - name: Download image + uses: ishworkh/docker-image-artifact-download@v1 + if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' }} + with: + image: ${{ env.DATAHUB_MYSQL_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} + - name: Run Trivy vulnerability scanner + uses: aquasecurity/trivy-action@0.25.0 + env: + TRIVY_OFFLINE_SCAN: true + with: + image-ref: ${{ env.DATAHUB_MYSQL_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} + format: "template" + template: "@/contrib/sarif.tpl" + output: "trivy-results.sarif" + severity: "CRITICAL,HIGH" + ignore-unfixed: true + vuln-type: "os,library" + - name: Upload Trivy scan results to GitHub Security tab + uses: github/codeql-action/upload-sarif@v2 + with: + sarif_file: "trivy-results.sarif" elasticsearch_setup_build: name: Build and Push DataHub Elasticsearch Setup Docker Image @@ -522,6 +588,39 @@ jobs: context: . file: ./docker/elasticsearch-setup/Dockerfile platforms: linux/amd64,linux/arm64/v8 + elasticsearch_setup_scan: + permissions: + contents: read # for actions/checkout to fetch code + security-events: write # for github/codeql-action/upload-sarif to upload SARIF results + actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status + name: "[Monitoring] Scan ElasticSearch setup images for vulnerabilities" + runs-on: ubuntu-latest + needs: [ setup, elasticsearch_setup_build ] + if: ${{ needs.setup.outputs.elasticsearch_setup_change == 'true' || (needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' ) }} + steps: + - name: Checkout # adding checkout step just to make trivy upload happy + uses: acryldata/sane-checkout-action@v3 + - name: Download image + uses: ishworkh/docker-image-artifact-download@v1 + if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' }} + with: + image: ${{ env.DATAHUB_ELASTIC_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} + - name: Run Trivy vulnerability scanner + uses: aquasecurity/trivy-action@0.25.0 + env: + TRIVY_OFFLINE_SCAN: true + with: + image-ref: ${{ env.DATAHUB_ELASTIC_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} + format: "template" + template: "@/contrib/sarif.tpl" + output: "trivy-results.sarif" + severity: "CRITICAL,HIGH" + ignore-unfixed: true + vuln-type: "os,library" + - name: Upload Trivy scan results to GitHub Security tab + uses: github/codeql-action/upload-sarif@v2 + with: + sarif_file: "trivy-results.sarif" datahub_ingestion_base_build: name: Build and Push DataHub Ingestion (Base) Docker Image diff --git a/build.gradle b/build.gradle index d7fbbb6682e04..79a4ca9384d28 100644 --- a/build.gradle +++ b/build.gradle @@ -107,8 +107,8 @@ project.ext.externalDependency = [ 'antlr4Runtime': 'org.antlr:antlr4-runtime:4.9.3', 'antlr4': 'org.antlr:antlr4:4.9.3', 'assertJ': 'org.assertj:assertj-core:3.11.1', - 'avro': 'org.apache.avro:avro:1.11.3', - 'avroCompiler': 'org.apache.avro:avro-compiler:1.11.3', + 'avro': 'org.apache.avro:avro:1.11.4', + 'avroCompiler': 'org.apache.avro:avro-compiler:1.11.4', 'awsGlueSchemaRegistrySerde': 'software.amazon.glue:schema-registry-serde:1.1.17', 'awsMskIamAuth': 'software.amazon.msk:aws-msk-iam-auth:2.0.3', 'awsS3': 'software.amazon.awssdk:s3:2.26.21', diff --git a/datahub-web-react/src/app/entity/shared/containers/profile/EntityProfile.tsx b/datahub-web-react/src/app/entity/shared/containers/profile/EntityProfile.tsx index a9737c9698f7b..1deeed076d8d6 100644 --- a/datahub-web-react/src/app/entity/shared/containers/profile/EntityProfile.tsx +++ b/datahub-web-react/src/app/entity/shared/containers/profile/EntityProfile.tsx @@ -323,7 +323,13 @@ export const EntityProfile = ({ {showBrowseBar && } {entityData?.status?.removed === true && ( + This entity is marked as soft-deleted, likely due to stateful ingestion or a manual + deletion command, and will not appear in search or lineage graphs. Contact your DataHub + admin for more information. + + } banner /> )} diff --git a/docker/airflow/docker-compose.yaml b/docker/airflow/docker-compose.yaml index df85f2acb9efd..ca19e3c5aa9d1 100644 --- a/docker/airflow/docker-compose.yaml +++ b/docker/airflow/docker-compose.yaml @@ -41,7 +41,6 @@ # # Feel free to modify this file to suit your needs. --- -version: '3' x-airflow-common: &airflow-common image: ${AIRFLOW_IMAGE_NAME:-acryldata/airflow-datahub:latest} diff --git a/docker/cassandra/docker-compose.cassandra.yml b/docker/cassandra/docker-compose.cassandra.yml index ae7d649ab3d23..3fde920161685 100644 --- a/docker/cassandra/docker-compose.cassandra.yml +++ b/docker/cassandra/docker-compose.cassandra.yml @@ -1,6 +1,5 @@ # Override to use Cassandra as a backing store for datahub-gms. --- -version: '3.8' services: cassandra: hostname: cassandra diff --git a/docker/docker-compose-with-cassandra.yml b/docker/docker-compose-with-cassandra.yml index 6709aee98d697..55bca606c5ba3 100644 --- a/docker/docker-compose-with-cassandra.yml +++ b/docker/docker-compose-with-cassandra.yml @@ -4,7 +4,6 @@ # NOTE: This file does not build! No dockerfiles are set. See the README.md in this directory. --- -version: '3.9' services: datahub-frontend-react: hostname: datahub-frontend-react diff --git a/docker/docker-compose-without-neo4j.override.yml b/docker/docker-compose-without-neo4j.override.yml index 11d7cd0c0c87b..adf610ec3a9ed 100644 --- a/docker/docker-compose-without-neo4j.override.yml +++ b/docker/docker-compose-without-neo4j.override.yml @@ -1,5 +1,4 @@ --- -version: '3.9' services: datahub-gms: env_file: datahub-gms/env/docker-without-neo4j.env diff --git a/docker/docker-compose-without-neo4j.postgres.override.yml b/docker/docker-compose-without-neo4j.postgres.override.yml index b81fb6435c297..2d1e0958dfb8b 100644 --- a/docker/docker-compose-without-neo4j.postgres.override.yml +++ b/docker/docker-compose-without-neo4j.postgres.override.yml @@ -1,6 +1,5 @@ # Override to use PostgreSQL as a backing store for datahub-gms. --- -version: '3.9' services: datahub-gms: env_file: diff --git a/docker/docker-compose-without-neo4j.yml b/docker/docker-compose-without-neo4j.yml index 53fcc77c6e8f3..4350322a17379 100644 --- a/docker/docker-compose-without-neo4j.yml +++ b/docker/docker-compose-without-neo4j.yml @@ -4,7 +4,6 @@ # NOTE: This file will cannot build! No dockerfiles are set. See the README.md in this directory. --- -version: '3.9' services: datahub-frontend-react: hostname: datahub-frontend-react diff --git a/docker/docker-compose.consumers-without-neo4j.yml b/docker/docker-compose.consumers-without-neo4j.yml index f1aa6b30cede0..01dff8c0d7b3d 100644 --- a/docker/docker-compose.consumers-without-neo4j.yml +++ b/docker/docker-compose.consumers-without-neo4j.yml @@ -1,5 +1,4 @@ # Service definitions for standalone Kafka consumer containers. -version: '3.9' services: datahub-gms: environment: diff --git a/docker/docker-compose.consumers.dev.yml b/docker/docker-compose.consumers.dev.yml index 00f7b52df151f..df180e22f55d0 100644 --- a/docker/docker-compose.consumers.dev.yml +++ b/docker/docker-compose.consumers.dev.yml @@ -1,4 +1,3 @@ -version: '3.9' services: datahub-mae-consumer: image: acryldata/datahub-mae-consumer:debug diff --git a/docker/docker-compose.consumers.yml b/docker/docker-compose.consumers.yml index 74b9adaeb9948..36fdb5451ebd6 100644 --- a/docker/docker-compose.consumers.yml +++ b/docker/docker-compose.consumers.yml @@ -1,5 +1,4 @@ # Service definitions for standalone Kafka consumer containers. -version: '3.9' services: datahub-gms: environment: diff --git a/docker/docker-compose.dev.yml b/docker/docker-compose.dev.yml index 2202f362abd99..c68a4c1f5a8fc 100644 --- a/docker/docker-compose.dev.yml +++ b/docker/docker-compose.dev.yml @@ -8,7 +8,6 @@ # To make a JVM app debuggable via IntelliJ, go to its env file and add JVM debug flags, and then add the JVM debug # port to this file. --- -version: '3.9' services: datahub-frontend-react: image: acryldata/datahub-frontend-react:head diff --git a/docker/docker-compose.kafka-setup.yml b/docker/docker-compose.kafka-setup.yml index 59b3459bf4555..67b7641f509b3 100644 --- a/docker/docker-compose.kafka-setup.yml +++ b/docker/docker-compose.kafka-setup.yml @@ -1,3 +1,2 @@ # Empty docker compose for kafka-setup as we have moved kafka-setup back into the main compose -version: '3.9' services: \ No newline at end of file diff --git a/docker/docker-compose.override.yml b/docker/docker-compose.override.yml index 51fbe0060aa5f..25abf247a5d04 100644 --- a/docker/docker-compose.override.yml +++ b/docker/docker-compose.override.yml @@ -1,6 +1,5 @@ # Default override to use MySQL as a backing store for datahub-gms (same as docker-compose.mysql.yml). --- -version: '3.9' services: datahub-gms: env_file: datahub-gms/env/docker.env diff --git a/docker/docker-compose.tools.yml b/docker/docker-compose.tools.yml index 8d2c30c64e6c8..9f0e2639521a4 100644 --- a/docker/docker-compose.tools.yml +++ b/docker/docker-compose.tools.yml @@ -1,6 +1,5 @@ # Tools useful for operating & debugging DataHub. --- -version: '3.8' services: kafka-rest-proxy: image: confluentinc/cp-kafka-rest:7.4.0 diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 5430a8a6fcd5b..e77b6d11ef241 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -4,7 +4,6 @@ # NOTE: This file does not build! No dockerfiles are set. See the README.md in this directory. --- -version: '3.9' services: datahub-frontend-react: hostname: datahub-frontend-react diff --git a/docker/ingestion/docker-compose.yml b/docker/ingestion/docker-compose.yml index 06d4e47aa4a40..a474d9505285d 100644 --- a/docker/ingestion/docker-compose.yml +++ b/docker/ingestion/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: '3.5' services: ingestion: build: diff --git a/docker/mariadb/docker-compose.mariadb.yml b/docker/mariadb/docker-compose.mariadb.yml index 0ee172fb1f1a5..63f5ed929711b 100644 --- a/docker/mariadb/docker-compose.mariadb.yml +++ b/docker/mariadb/docker-compose.mariadb.yml @@ -1,6 +1,5 @@ # Override to use MariaDB as a backing store for datahub-gms. --- -version: '3.8' services: mariadb: hostname: mariadb diff --git a/docker/monitoring/docker-compose.consumers.monitoring.yml b/docker/monitoring/docker-compose.consumers.monitoring.yml index 254b0a58d0223..147ddde37798f 100644 --- a/docker/monitoring/docker-compose.consumers.monitoring.yml +++ b/docker/monitoring/docker-compose.consumers.monitoring.yml @@ -1,5 +1,4 @@ --- -version: '3.8' services: datahub-mae-consumer: environment: diff --git a/docker/monitoring/docker-compose.monitoring.yml b/docker/monitoring/docker-compose.monitoring.yml index c6fa019cf99fc..039aa55bd3867 100644 --- a/docker/monitoring/docker-compose.monitoring.yml +++ b/docker/monitoring/docker-compose.monitoring.yml @@ -1,5 +1,4 @@ --- -version: '3.9' services: datahub-frontend-react: environment: diff --git a/docker/mysql/docker-compose.mysql.yml b/docker/mysql/docker-compose.mysql.yml index 7cc27b9f8b154..ee00e51c09b1f 100644 --- a/docker/mysql/docker-compose.mysql.yml +++ b/docker/mysql/docker-compose.mysql.yml @@ -1,6 +1,5 @@ # Override to use MySQL as a backing store for datahub-gms. --- -version: '3.8' services: mysql: hostname: mysql diff --git a/docker/quickstart/docker-compose-m1.quickstart.yml b/docker/quickstart/docker-compose-m1.quickstart.yml index 046ab96cf3002..5f415d4000178 100644 --- a/docker/quickstart/docker-compose-m1.quickstart.yml +++ b/docker/quickstart/docker-compose-m1.quickstart.yml @@ -291,7 +291,6 @@ services: volumes: - zkdata:/var/lib/zookeeper/data - zklogs:/var/lib/zookeeper/log -version: '3.9' volumes: broker: null esdata: null diff --git a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml index 6295572aac98f..c2a0ddb0391b9 100644 --- a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml @@ -266,7 +266,6 @@ services: volumes: - zkdata:/var/lib/zookeeper/data - zklogs:/var/lib/zookeeper/log -version: '3.9' volumes: broker: null esdata: null diff --git a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml index ed5f203ff4d05..46ea765f45e1f 100644 --- a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml @@ -266,7 +266,6 @@ services: volumes: - zkdata:/var/lib/zookeeper/data - zklogs:/var/lib/zookeeper/log -version: '3.9' volumes: broker: null esdata: null diff --git a/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml index a4211acedcf10..8801d26eddbf4 100644 --- a/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml +++ b/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml @@ -55,4 +55,3 @@ services: image: ${DATAHUB_MCE_CONSUMER_IMAGE:-acryldata/datahub-mce-consumer}:${DATAHUB_VERSION:-head} ports: - 9090:9090 -version: '3.9' diff --git a/docker/quickstart/docker-compose.consumers.quickstart.yml b/docker/quickstart/docker-compose.consumers.quickstart.yml index e7571e4baf8b4..1daa969af82d8 100644 --- a/docker/quickstart/docker-compose.consumers.quickstart.yml +++ b/docker/quickstart/docker-compose.consumers.quickstart.yml @@ -69,4 +69,3 @@ services: image: ${DATAHUB_MCE_CONSUMER_IMAGE:-acryldata/datahub-mce-consumer}:${DATAHUB_VERSION:-head} ports: - 9090:9090 -version: '3.9' diff --git a/docker/quickstart/docker-compose.kafka-setup.quickstart.yml b/docker/quickstart/docker-compose.kafka-setup.quickstart.yml index d126caac6b5a8..ad189ddbec905 100644 --- a/docker/quickstart/docker-compose.kafka-setup.quickstart.yml +++ b/docker/quickstart/docker-compose.kafka-setup.quickstart.yml @@ -1,2 +1 @@ services: {} -version: '3.9' diff --git a/docker/quickstart/docker-compose.monitoring.quickstart.yml b/docker/quickstart/docker-compose.monitoring.quickstart.yml index bddd82e393338..f4afd397e785d 100644 --- a/docker/quickstart/docker-compose.monitoring.quickstart.yml +++ b/docker/quickstart/docker-compose.monitoring.quickstart.yml @@ -41,6 +41,5 @@ services: - 9089:9090 volumes: - ../monitoring/prometheus.yaml:/etc/prometheus/prometheus.yml -version: '3.9' volumes: grafana-storage: null diff --git a/docker/quickstart/docker-compose.quickstart.yml b/docker/quickstart/docker-compose.quickstart.yml index 66616be98bec1..4d03b22598606 100644 --- a/docker/quickstart/docker-compose.quickstart.yml +++ b/docker/quickstart/docker-compose.quickstart.yml @@ -291,7 +291,6 @@ services: volumes: - zkdata:/var/lib/zookeeper/data - zklogs:/var/lib/zookeeper/log -version: '3.9' volumes: broker: null esdata: null diff --git a/docker/quickstart/generate_docker_quickstart.py b/docker/quickstart/generate_docker_quickstart.py index 9846233b369d7..6a3b64155adea 100644 --- a/docker/quickstart/generate_docker_quickstart.py +++ b/docker/quickstart/generate_docker_quickstart.py @@ -120,11 +120,6 @@ def modify_docker_config(base_path, docker_yaml_config): elif volumes[i].startswith("./"): volumes[i] = "." + volumes[i] - # 10. Set docker compose version to 3. - # We need at least this version, since we use features like start_period for - # healthchecks (with services dependencies based on them) and shell-like variable interpolation. - docker_yaml_config["version"] = "3.9" - def dedup_env_vars(merged_docker_config): for service in merged_docker_config["services"]: diff --git a/docs/how/search.md b/docs/how/search.md index 4df5e7c1984d5..2274fe7c09240 100644 --- a/docs/how/search.md +++ b/docs/how/search.md @@ -105,6 +105,20 @@ If you want to: - ```/q customProperties: encoding*``` [Sample results](https://demo.datahubproject.io/search?page=1&query=%2Fq%20customProperties%3A%20encoding%2A) - Dataset Properties are indexed in ElasticSearch the manner of key=value. Hence if you know the precise key-value pair, you can search using ```"key=value"```. However, if you only know the key, you can use wildcards to replace the value and that is what is being done here. +- Find an entity with an **unversioned** structured property + - ```/q structuredProperties.io_acryl_privacy_retentionTime01:60``` + - This will return results for an **unversioned** structured property's qualified name `io.acryl.private.retentionTime01` and value `60`. + - ```/q _exists_:structuredProperties.io_acryl_privacy_retentionTime01``` + - In this example, the query will return any entity which has any value for the **unversioned** structured property with qualified name `io.acryl.private.retentionTime01`. + +- Find an entity with a **versioned** structured property + - ```/q structuredProperties._versioned.io_acryl_privacy_retentionTime.20240614080000.number:365``` + - This query will return results for a **versioned** structured property with qualified name `io.acryl.privacy.retentionTime`, version `20240614080000`, type `number` and value `365`. + - ```/q _exists_:structuredProperties._versioned.io_acryl_privacy_retentionTime.20240614080000.number``` + - Returns results for a **versioned** structured property with qualified name `io.acryl.privacy.retentionTime`, version `20240614080000` and type `number`. + - ```/q structuredProperties._versioned.io_acryl_privacy_retentionTime.\*.\*:365``` + - Returns results for a **versioned** structured property with any version and type with a values of `365` + - Find a dataset with a column name, **latitude** - ```/q fieldPaths: latitude``` [Sample results](https://demo.datahubproject.io/search?page=1&query=%2Fq%20fieldPaths%3A%20latitude) - fieldPaths is the name of the attribute that holds the column name in Datasets. diff --git a/docs/lineage/airflow.md b/docs/lineage/airflow.md index 65da1fd5251dc..aca6d30619ea8 100644 --- a/docs/lineage/airflow.md +++ b/docs/lineage/airflow.md @@ -141,6 +141,7 @@ conn_id = datahub_rest_default # or datahub_kafka_default | capture_tags_info | true | If true, the tags field of the DAG will be captured as DataHub tags. | | capture_executions | true | If true, we'll capture task runs in DataHub in addition to DAG definitions. | | materialize_iolets | true | Create or un-soft-delete all entities referenced in lineage. | +| render_templates | true | If true, jinja-templated fields will be automatically rendered to improve the accuracy of SQL statement extraction. | | datajob_url_link | taskinstance | If taskinstance, the datajob url will be taskinstance link on airflow. It can also be grid. | | | | graceful_exceptions | true | If set to true, most runtime errors in the lineage backend will be suppressed and will not cause the overall task to fail. Note that configuration issues will still throw exceptions. | diff --git a/metadata-ingestion-modules/airflow-plugin/setup.py b/metadata-ingestion-modules/airflow-plugin/setup.py index 1587b4793fbf1..0d5ceefd989dc 100644 --- a/metadata-ingestion-modules/airflow-plugin/setup.py +++ b/metadata-ingestion-modules/airflow-plugin/setup.py @@ -44,7 +44,7 @@ def get_long_description(): # We remain restrictive on the versions allowed here to prevent # us from being broken by backwards-incompatible changes in the # underlying package. - "openlineage-airflow>=1.2.0,<=1.18.0", + "openlineage-airflow>=1.2.0,<=1.22.0", }, } diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_config.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_config.py index c37a1b334ed37..8deba22a107ce 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_config.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_config.py @@ -43,19 +43,24 @@ class DatahubLineageConfig(ConfigModel): capture_executions: bool = False + datajob_url_link: DatajobUrl = DatajobUrl.TASKINSTANCE + + # Note that this field is only respected by the lineage backend. + # The Airflow plugin v2 behaves as if it were set to True. + graceful_exceptions: bool = True + + # The remaining config fields are only relevant for the v2 plugin. enable_extractors: bool = True + # If true, ti.render_templates() will be called in the listener. + # Makes extraction of jinja-templated fields more accurate. + render_templates: bool = True + log_level: Optional[str] = None debug_emitter: bool = False disable_openlineage_plugin: bool = True - # Note that this field is only respected by the lineage backend. - # The Airflow plugin behaves as if it were set to True. - graceful_exceptions: bool = True - - datajob_url_link: DatajobUrl = DatajobUrl.TASKINSTANCE - def make_emitter_hook(self) -> "DatahubGenericHook": # This is necessary to avoid issues with circular imports. from datahub_airflow_plugin.hooks.datahub import DatahubGenericHook @@ -84,6 +89,7 @@ def get_lineage_config() -> DatahubLineageConfig: disable_openlineage_plugin = conf.get( "datahub", "disable_openlineage_plugin", fallback=True ) + render_templates = conf.get("datahub", "render_templates", fallback=True) datajob_url_link = conf.get( "datahub", "datajob_url_link", fallback=DatajobUrl.TASKINSTANCE.value ) @@ -102,4 +108,5 @@ def get_lineage_config() -> DatahubLineageConfig: debug_emitter=debug_emitter, disable_openlineage_plugin=disable_openlineage_plugin, datajob_url_link=datajob_url_link, + render_templates=render_templates, ) diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py index 123b74fee74b5..b818b76de9f7f 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py @@ -386,7 +386,8 @@ def on_task_instance_running( f"DataHub listener got notification about task instance start for {task_instance.task_id}" ) - task_instance = _render_templates(task_instance) + if self.config.render_templates: + task_instance = _render_templates(task_instance) # The type ignore is to placate mypy on Airflow 2.1.x. dagrun: "DagRun" = task_instance.dag_run # type: ignore[attr-defined] @@ -478,7 +479,8 @@ def on_task_instance_finish( ) -> None: dagrun: "DagRun" = task_instance.dag_run # type: ignore[attr-defined] - task_instance = _render_templates(task_instance) + if self.config.render_templates: + task_instance = _render_templates(task_instance) # We must prefer the task attribute, in case modifications to the task's inlets/outlets # were made by the execute() method. diff --git a/metadata-ingestion-modules/airflow-plugin/tox.ini b/metadata-ingestion-modules/airflow-plugin/tox.ini index 9e0a30df6fcbd..2e4596a24c2a6 100644 --- a/metadata-ingestion-modules/airflow-plugin/tox.ini +++ b/metadata-ingestion-modules/airflow-plugin/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. [tox] -envlist = py38-airflow21, py38-airflow22, py310-airflow24, py310-airflow26, py310-airflow27, py310-airflow28, py311-airflow29 +envlist = py38-airflow21, py38-airflow22, py310-airflow24, py310-airflow26, py310-airflow27, py310-airflow28, py311-airflow29, py311-airflow210 [testenv] use_develop = true @@ -20,6 +20,7 @@ deps = airflow27: apache-airflow~=2.7.0 airflow28: apache-airflow~=2.8.0 airflow29: apache-airflow~=2.9.0 + airflow210: apache-airflow~=2.10.0 # Respect the Airflow constraints files. # We can't make ourselves work with the constraints of Airflow < 2.3. @@ -30,6 +31,7 @@ deps = py310-airflow27: -c https://raw.githubusercontent.com/apache/airflow/constraints-2.7.3/constraints-3.10.txt py310-airflow28: -c https://raw.githubusercontent.com/apache/airflow/constraints-2.8.1/constraints-3.10.txt py311-airflow29: -c https://raw.githubusercontent.com/apache/airflow/constraints-2.9.3/constraints-3.11.txt + py311-airflow210: -c https://raw.githubusercontent.com/apache/airflow/constraints-2.10.2/constraints-3.11.txt # Before pinning to the constraint files, we previously left the dependencies # more open. There were a number of packages for which this caused issues. @@ -57,6 +59,6 @@ commands = [testenv:py310-airflow24] extras = dev,integration-tests,plugin-v2,test-airflow24 -[testenv:py310-airflow{26,27,28},py311-airflow{29}] +[testenv:py310-airflow{26,27,28},py311-airflow{29,210}] extras = dev,integration-tests,plugin-v2 diff --git a/metadata-ingestion/docs/sources/datahub/datahub_pre.md b/metadata-ingestion/docs/sources/datahub/datahub_pre.md index cb1cc2c4d5903..b35eb5811e4c9 100644 --- a/metadata-ingestion/docs/sources/datahub/datahub_pre.md +++ b/metadata-ingestion/docs/sources/datahub/datahub_pre.md @@ -71,3 +71,27 @@ and [mce-consumer](../../../../metadata-jobs/mce-consumer-job/README.md)) - Increase the number of gms pods to add redundancy and increase resilience to node evictions * If you are migrating large amounts of data, consider increasing elasticsearch's thread count via the `ELASTICSEARCH_THREAD_COUNT` environment variable. + +#### Exclusions +You will likely want to exclude some urn types from your ingestion, as they contain instance-specific +metadata, such as settings, roles, policies, ingestion sources, and ingestion runs. For example, you +will likely want to start with this: + +```yaml +source: + config: + urn_pattern: # URN pattern to ignore/include in the ingestion + deny: + # Ignores all datahub metadata where the urn matches the regex + - ^urn:li:role.* # Only exclude if you do not want to ingest roles + - ^urn:li:dataHubRole.* # Only exclude if you do not want to ingest roles + - ^urn:li:dataHubPolicy.* # Only exclude if you do not want to ingest policies + - ^urn:li:dataHubIngestionSource.* # Only exclude if you do not want to ingest ingestion sources + - ^urn:li:dataHubSecret.* + - ^urn:li:dataHubExecutionRequest.* + - ^urn:li:dataHubAccessToken.* + - ^urn:li:dataHubUpgrade.* + - ^urn:li:inviteToken.* + - ^urn:li:globalSettings.* + - ^urn:li:dataHubStepState.* +``` diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index cd3983edeeec9..ac674342ee670 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -726,6 +726,7 @@ "snowflake-summary = datahub.ingestion.source.snowflake.snowflake_summary:SnowflakeSummarySource", "snowflake-queries = datahub.ingestion.source.snowflake.snowflake_queries:SnowflakeQueriesSource", "superset = datahub.ingestion.source.superset:SupersetSource", + "preset = datahub.ingestion.source.preset:PresetSource", "tableau = datahub.ingestion.source.tableau.tableau:TableauSource", "openapi = datahub.ingestion.source.openapi:OpenApiSource", "metabase = datahub.ingestion.source.metabase:MetabaseSource", diff --git a/metadata-ingestion/src/datahub/cli/config_utils.py b/metadata-ingestion/src/datahub/cli/config_utils.py index bb85809174ea9..5d9604de7836f 100644 --- a/metadata-ingestion/src/datahub/cli/config_utils.py +++ b/metadata-ingestion/src/datahub/cli/config_utils.py @@ -84,6 +84,13 @@ def _get_config_from_env() -> Tuple[Optional[str], Optional[str]]: return url or host, token +def require_config_from_env() -> Tuple[str, Optional[str]]: + host, token = _get_config_from_env() + if host is None: + raise MissingConfigError("No GMS host was provided in env variables.") + return host, token + + def load_client_config() -> DatahubClientConfig: gms_host_env, gms_token_env = _get_config_from_env() if gms_host_env: diff --git a/metadata-ingestion/src/datahub/cli/delete_cli.py b/metadata-ingestion/src/datahub/cli/delete_cli.py index b5cc67532a9dd..584bfc1f7cda5 100644 --- a/metadata-ingestion/src/datahub/cli/delete_cli.py +++ b/metadata-ingestion/src/datahub/cli/delete_cli.py @@ -338,10 +338,18 @@ def by_filter( # TODO: add some validation on entity_type if not force and not soft and not dry_run: - click.confirm( - "This will permanently delete data from DataHub. Do you want to continue?", - abort=True, - ) + if only_soft_deleted: + click.confirm( + "This will permanently delete data from DataHub. Do you want to continue?", + abort=True, + ) + else: + click.confirm( + "Hard deletion will permanently delete data from DataHub and can be slow. " + "We generally recommend using soft deletes instead. " + "Do you want to continue?", + abort=True, + ) graph = get_default_graph() logger.info(f"Using {graph}") diff --git a/metadata-ingestion/src/datahub/emitter/rest_emitter.py b/metadata-ingestion/src/datahub/emitter/rest_emitter.py index e370ad3562a06..948060c3c4f44 100644 --- a/metadata-ingestion/src/datahub/emitter/rest_emitter.py +++ b/metadata-ingestion/src/datahub/emitter/rest_emitter.py @@ -76,6 +76,12 @@ def __init__( ): if not gms_server: raise ConfigurationError("gms server is required") + if gms_server == "__from_env__" and token is None: + # HACK: similar to what we do with system auth, we transparently + # inject the config in here. Ideally this should be done in the + # config loader or by the caller, but it gets the job done for now. + gms_server, token = config_utils.require_config_from_env() + self._gms_server = fixup_gms_url(gms_server) self._token = token self.server_config: Dict[str, Any] = {} diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index 0fdb7bb537457..b9b0ed556e66c 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -214,27 +214,28 @@ def _get_generic(self, url: str, params: Optional[Dict] = None) -> Dict: def _post_generic(self, url: str, payload_dict: Dict) -> Dict: return self._send_restli_request("POST", url, json=payload_dict) - def _make_rest_sink_config(self) -> "DatahubRestSinkConfig": - from datahub.ingestion.sink.datahub_rest import ( - DatahubRestSinkConfig, - RestSinkMode, - ) + def _make_rest_sink_config( + self, extra_config: Optional[Dict] = None + ) -> "DatahubRestSinkConfig": + from datahub.ingestion.sink.datahub_rest import DatahubRestSinkConfig # This is a bit convoluted - this DataHubGraph class is a subclass of DatahubRestEmitter, # but initializing the rest sink creates another rest emitter. # TODO: We should refactor out the multithreading functionality of the sink # into a separate class that can be used by both the sink and the graph client # e.g. a DatahubBulkRestEmitter that both the sink and the graph client use. - return DatahubRestSinkConfig(**self.config.dict(), mode=RestSinkMode.ASYNC) + return DatahubRestSinkConfig(**self.config.dict(), **(extra_config or {})) @contextlib.contextmanager def make_rest_sink( - self, run_id: str = _GRAPH_DUMMY_RUN_ID + self, + run_id: str = _GRAPH_DUMMY_RUN_ID, + extra_sink_config: Optional[Dict] = None, ) -> Iterator["DatahubRestSink"]: from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.sink.datahub_rest import DatahubRestSink - sink_config = self._make_rest_sink_config() + sink_config = self._make_rest_sink_config(extra_config=extra_sink_config) with DatahubRestSink(PipelineContext(run_id=run_id), sink_config) as sink: yield sink if sink.report.failures: diff --git a/metadata-ingestion/src/datahub/ingestion/sink/datahub_rest.py b/metadata-ingestion/src/datahub/ingestion/sink/datahub_rest.py index 9059dcca3e2b8..5b4d3fe38ecd9 100644 --- a/metadata-ingestion/src/datahub/ingestion/sink/datahub_rest.py +++ b/metadata-ingestion/src/datahub/ingestion/sink/datahub_rest.py @@ -79,6 +79,7 @@ class DataHubRestSinkReport(SinkReport): gms_version: Optional[str] = None pending_requests: int = 0 + async_batches_prepared: int = 0 async_batches_split: int = 0 main_thread_blocking_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer) @@ -260,6 +261,7 @@ def _emit_batch_wrapper( events.append(event) chunks = self.emitter.emit_mcps(events) + self.report.async_batches_prepared += 1 if chunks > 1: self.report.async_batches_split += chunks logger.info( diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py index 6ea8f21e8b291..11d06771d4e4f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py @@ -1,5 +1,6 @@ import logging import re +from base64 import b32decode from collections import defaultdict from typing import Dict, Iterable, List, Optional, Set, Type, Union, cast @@ -89,12 +90,13 @@ HiveColumnToAvroConverter, get_schema_fields_for_hive_column, ) -from datahub.utilities.mapping import Constants from datahub.utilities.perf_timer import PerfTimer from datahub.utilities.ratelimiter import RateLimiter from datahub.utilities.registries.domain_registry import DomainRegistry from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor +ENCODED_TAG_PREFIX = "urn_li_encoded_tag_" + logger: logging.Logger = logging.getLogger(__name__) # Handle table snapshots # See https://cloud.google.com/bigquery/docs/table-snapshots-intro. @@ -194,6 +196,18 @@ def store_table_refs(self): or self.config.use_queries_v2 ) + def modified_base32decode(self, text_to_decode: str) -> str: + # When we sync from DataHub to BigQuery, we encode the tags as modified base32 strings. + # BiqQuery labels only support lowercase letters, international characters, numbers, or underscores. + # So we need to modify the base32 encoding to replace the padding character `=` with `_` and convert to lowercase. + if not text_to_decode.startswith("%s" % ENCODED_TAG_PREFIX): + return text_to_decode + text_to_decode = ( + text_to_decode.replace(ENCODED_TAG_PREFIX, "").upper().replace("_", "=") + ) + text = b32decode(text_to_decode.encode("utf-8")).decode("utf-8") + return text + def get_project_workunits( self, project: BigqueryProject ) -> Iterable[MetadataWorkUnit]: @@ -253,7 +267,7 @@ def gen_dataset_containers( tags_joined: Optional[List[str]] = None if tags and self.config.capture_dataset_label_as_tag: tags_joined = [ - f"{k}:{v}" + self.make_tag_from_label(k, v) for k, v in tags.items() if is_tag_allowed(self.config.capture_dataset_label_as_tag, k) ] @@ -662,6 +676,11 @@ def _process_snapshot( dataset_name=dataset_name, ) + def make_tag_from_label(self, key: str, value: str) -> str: + if not value.startswith(ENCODED_TAG_PREFIX): + return make_tag_urn(f"""{key}:{value}""") + return self.modified_base32decode(value) + def gen_table_dataset_workunits( self, table: BigqueryTable, @@ -707,7 +726,7 @@ def gen_table_dataset_workunits( tags_to_add = [] tags_to_add.extend( [ - make_tag_urn(f"""{k}:{v}""") + self.make_tag_from_label(k, v) for k, v in table.labels.items() if is_tag_allowed(self.config.capture_table_label_as_tag, k) ] @@ -733,7 +752,7 @@ def gen_view_dataset_workunits( tags_to_add = None if table.labels and self.config.capture_view_label_as_tag: tags_to_add = [ - make_tag_urn(f"{k}:{v}") + self.make_tag_from_label(k, v) for k, v in table.labels.items() if is_tag_allowed(self.config.capture_view_label_as_tag, k) ] @@ -922,11 +941,6 @@ def gen_schema_fields(self, columns: List[BigqueryColumn]) -> List[SchemaField]: break else: tags = [] - if col.is_partition_column: - tags.append( - TagAssociationClass(make_tag_urn(Constants.TAG_PARTITION_KEY)) - ) - if col.cluster_column_position is not None: tags.append( TagAssociationClass( @@ -944,6 +958,7 @@ def gen_schema_fields(self, columns: List[BigqueryColumn]) -> List[SchemaField]: type=SchemaFieldDataType( self.BIGQUERY_FIELD_TYPE_MAPPINGS.get(col.data_type, NullType)() ), + isPartitioningKey=col.is_partition_column, nativeDataType=col.data_type, description=col.comment, nullable=col.is_nullable, diff --git a/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py b/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py index e3f9a150ad000..e4829f8713cf7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py +++ b/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py @@ -129,6 +129,10 @@ class CSVEnricherSource(Source): If ownership_type_urn is set then ownership_type must be set to CUSTOM. + Note that you have the option in your recipe config to write as a PATCH or as an OVERRIDE. This choice will apply to + all metadata for the entity, not just a single aspect. So OVERRIDE will override all metadata, including performing + deletes if a metadata field is empty. The default is PATCH. + :::note This source will not work on very large csv files that do not fit in memory. ::: diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py index 0b5c164a6b2c7..ab55d4e15e5de 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py @@ -196,11 +196,25 @@ def folder_ancestors( fields: Union[str, List[str]] = ["id", "name", "parent_id"], ) -> Sequence[Folder]: self.client_stats.folder_calls += 1 - return self.client.folder_ancestors( - folder_id, - self.__fields_mapper(fields), - transport_options=self.transport_options, - ) + try: + return self.client.folder_ancestors( + folder_id, + self.__fields_mapper(fields), + transport_options=self.transport_options, + ) + except SDKError as e: + if "Looker Not Found (404)" in str(e): + # Folder ancestors not found + logger.info( + f"Could not find ancestors for folder with id {folder_id}: 404 error" + ) + else: + logger.warning( + f"Could not find ancestors for folder with id {folder_id}" + ) + logger.warning(f"Failure was {e}") + # Folder ancestors not found + return [] def all_connections(self): self.client_stats.all_connections_calls += 1 diff --git a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py index e4dadaf602852..e3fd8849ed844 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py @@ -290,8 +290,8 @@ def __init__(self, ctx: PipelineContext, config: MongoDBConfig): # See https://pymongo.readthedocs.io/en/stable/examples/datetimes.html#handling-out-of-range-datetimes self.mongo_client = MongoClient( - self.config.connect_uri, datetime_conversion="DATETIME_AUTO", **options - ) # type: ignore + self.config.connect_uri, datetime_conversion="DATETIME_AUTO", **options # type: ignore + ) # This cheaply tests the connection. For details, see # https://pymongo.readthedocs.io/en/stable/api/pymongo/mongo_client.html#pymongo.mongo_client.MongoClient diff --git a/metadata-ingestion/src/datahub/ingestion/source/preset.py b/metadata-ingestion/src/datahub/ingestion/source/preset.py new file mode 100644 index 0000000000000..e51520898103d --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/preset.py @@ -0,0 +1,114 @@ +import logging +from typing import Dict, Optional + +import requests +from pydantic.class_validators import root_validator, validator +from pydantic.fields import Field + +from datahub.emitter.mce_builder import DEFAULT_ENV +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) +from datahub.ingestion.source.state.stale_entity_removal_handler import ( + StaleEntityRemovalSourceReport, + StatefulStaleMetadataRemovalConfig, +) +from datahub.ingestion.source.superset import SupersetConfig, SupersetSource +from datahub.utilities import config_clean + +logger = logging.getLogger(__name__) + + +class PresetConfig(SupersetConfig): + manager_uri: str = Field( + default="https://api.app.preset.io", description="Preset.io API URL" + ) + connect_uri: str = Field(default="", description="Preset workspace URL.") + display_uri: Optional[str] = Field( + default=None, + description="optional URL to use in links (if `connect_uri` is only for ingestion)", + ) + api_key: Optional[str] = Field(default=None, description="Preset.io API key.") + api_secret: Optional[str] = Field(default=None, description="Preset.io API secret.") + + # Configuration for stateful ingestion + stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = Field( + default=None, description="Preset Stateful Ingestion Config." + ) + + options: Dict = Field(default={}, description="") + env: str = Field( + default=DEFAULT_ENV, + description="Environment to use in namespace when constructing URNs", + ) + database_alias: Dict[str, str] = Field( + default={}, + description="Can be used to change mapping for database names in superset to what you have in datahub", + ) + + @validator("connect_uri", "display_uri") + def remove_trailing_slash(cls, v): + return config_clean.remove_trailing_slashes(v) + + @root_validator + def default_display_uri_to_connect_uri(cls, values): + base = values.get("display_uri") + if base is None: + values["display_uri"] = values.get("connect_uri") + return values + + +@platform_name("Preset") +@config_class(PresetConfig) +@support_status(SupportStatus.TESTING) +@capability( + SourceCapability.DELETION_DETECTION, "Optionally enabled via stateful_ingestion" +) +class PresetSource(SupersetSource): + """ + Variation of the Superset plugin that works with Preset.io (Apache Superset SaaS). + """ + + config: PresetConfig + report: StaleEntityRemovalSourceReport + platform = "preset" + + def __init__(self, ctx: PipelineContext, config: PresetConfig): + logger.info(f"ctx is {ctx}") + + super().__init__(ctx, config) + self.config = config + self.report = StaleEntityRemovalSourceReport() + + def login(self): + try: + login_response = requests.post( + f"{self.config.manager_uri}/v1/auth/", + json={"name": self.config.api_key, "secret": self.config.api_secret}, + ) + except requests.exceptions.RequestException as e: + logger.error(f"Failed to authenticate with Preset: {e}") + raise e + + self.access_token = login_response.json()["payload"]["access_token"] + logger.debug("Got access token from Preset") + + requests_session = requests.Session() + requests_session.headers.update( + { + "Authorization": f"Bearer {self.access_token}", + "Content-Type": "application/json", + "Accept": "*/*", + } + ) + # Test the connection + test_response = requests_session.get(f"{self.config.connect_uri}/version") + if not test_response.ok: + logger.error("Unable to connect to workspace") + return requests_session diff --git a/metadata-ingestion/src/datahub/ingestion/source/state/stale_entity_removal_handler.py b/metadata-ingestion/src/datahub/ingestion/source/state/stale_entity_removal_handler.py index c73472f1b8041..9d77e13a0f3c2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/state/stale_entity_removal_handler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/state/stale_entity_removal_handler.py @@ -29,6 +29,7 @@ STATEFUL_INGESTION_IGNORED_ENTITY_TYPES = { "dataProcessInstance", + "query", } @@ -75,7 +76,10 @@ def auto_stale_entity_removal( if wu.is_primary_source: entity_type = guess_entity_type(urn) - if entity_type is not None: + if ( + entity_type is not None + and entity_type not in STATEFUL_INGESTION_IGNORED_ENTITY_TYPES + ): stale_entity_removal_handler.add_entity_to_state(entity_type, urn) else: stale_entity_removal_handler.add_urn_to_skip(urn) diff --git a/metadata-ingestion/src/datahub/ingestion/source/superset.py b/metadata-ingestion/src/datahub/ingestion/source/superset.py index e563a806446c4..858281f880359 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/superset.py +++ b/metadata-ingestion/src/datahub/ingestion/source/superset.py @@ -101,7 +101,11 @@ class SupersetConfig( ) username: Optional[str] = Field(default=None, description="Superset username.") password: Optional[str] = Field(default=None, description="Superset password.") - + api_key: Optional[str] = Field(default=None, description="Preset.io API key.") + api_secret: Optional[str] = Field(default=None, description="Preset.io API secret.") + manager_uri: str = Field( + default="https://api.app.preset.io", description="Preset.io API URL" + ) # Configuration for stateful ingestion stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = Field( default=None, description="Superset Stateful Ingestion Config." @@ -179,7 +183,14 @@ def __init__(self, ctx: PipelineContext, config: SupersetConfig): super().__init__(config, ctx) self.config = config self.report = StaleEntityRemovalSourceReport() + if self.config.domain: + self.domain_registry = DomainRegistry( + cached_domains=[domain_id for domain_id in self.config.domain], + graph=self.ctx.graph, + ) + self.session = self.login() + def login(self) -> requests.Session: login_response = requests.post( f"{self.config.connect_uri}/api/v1/security/login", json={ @@ -193,8 +204,8 @@ def __init__(self, ctx: PipelineContext, config: SupersetConfig): self.access_token = login_response.json()["access_token"] logger.debug("Got access token from superset") - self.session = requests.Session() - self.session.headers.update( + requests_session = requests.Session() + requests_session.headers.update( { "Authorization": f"Bearer {self.access_token}", "Content-Type": "application/json", @@ -202,17 +213,14 @@ def __init__(self, ctx: PipelineContext, config: SupersetConfig): } ) - if self.config.domain: - self.domain_registry = DomainRegistry( - cached_domains=[domain_id for domain_id in self.config.domain], - graph=self.ctx.graph, - ) - # Test the connection - test_response = self.session.get(f"{self.config.connect_uri}/api/v1/dashboard/") + test_response = requests_session.get( + f"{self.config.connect_uri}/api/v1/dashboard/" + ) if test_response.status_code == 200: pass # TODO(Gabe): how should we message about this error? + return requests_session @classmethod def create(cls, config_dict: dict, ctx: PipelineContext) -> Source: diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/csv_enricher.py b/metadata-ingestion/src/datahub/ingestion/source_config/csv_enricher.py index e62c46888ef0e..f0f0ab95ca811 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_config/csv_enricher.py +++ b/metadata-ingestion/src/datahub/ingestion/source_config/csv_enricher.py @@ -11,7 +11,7 @@ class CSVEnricherConfig(ConfigModel): ) write_semantics: str = pydantic.Field( default="PATCH", - description='Whether the new tags, terms and owners to be added will override the existing ones added only by this source or not. Value for this config can be "PATCH" or "OVERRIDE"', + description='Whether the new tags, terms and owners to be added will override the existing ones added only by this source or not. Value for this config can be "PATCH" or "OVERRIDE". NOTE: this will apply to all metadata for the entity, not just a single aspect.', ) delimiter: str = pydantic.Field( default=",", description="Delimiter to use when parsing CSV" diff --git a/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json index fcf65130df975..02660f0fae08e 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json +++ b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json @@ -269,7 +269,8 @@ "actor": "urn:li:corpuser:datahub" } }, - "isPartOfKey": false + "isPartOfKey": false, + "isPartitioningKey": false }, { "fieldPath": "email", @@ -296,7 +297,8 @@ "actor": "urn:li:corpuser:datahub" } }, - "isPartOfKey": false + "isPartOfKey": false, + "isPartitioningKey": false } ] } @@ -328,6 +330,29 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [ + { + "tag": "urn:li:tag:priority:high" + }, + { + "tag": "urn:li:tag:purchase" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", @@ -463,7 +488,8 @@ } ] }, - "isPartOfKey": false + "isPartOfKey": false, + "isPartitioningKey": false }, { "fieldPath": "email", @@ -479,7 +505,8 @@ "globalTags": { "tags": [] }, - "isPartOfKey": false + "isPartOfKey": false, + "isPartitioningKey": false } ] } @@ -620,7 +647,8 @@ } ] }, - "isPartOfKey": false + "isPartOfKey": false, + "isPartitioningKey": false }, { "fieldPath": "email", @@ -636,7 +664,8 @@ "globalTags": { "tags": [] }, - "isPartOfKey": false + "isPartOfKey": false, + "isPartitioningKey": false } ] } @@ -1021,5 +1050,37 @@ "runId": "bigquery-2022_02_03-07_00_00", "lastRunId": "no-run-id-provided" } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:priority:high", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "priority:high" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:purchase", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "purchase" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_queries_golden.json b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_queries_golden.json new file mode 100644 index 0000000000000..2a7336144f23d --- /dev/null +++ b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_queries_golden.json @@ -0,0 +1,1031 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "bigquery", + "env": "PROD", + "project_id": "project-id-1" + }, + "name": "project-id-1", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:bigquery" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Project" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "bigquery", + "env": "PROD", + "project_id": "project-id-1", + "dataset_id": "bigquery-dataset-1", + "location": "US" + }, + "externalUrl": "https://console.cloud.google.com/bigquery?project=project-id-1&ws=!1m4!1m3!3m2!1sproject-id-1!2sbigquery-dataset-1", + "name": "bigquery-dataset-1", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:bigquery" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Dataset" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:068bd9323110994a40019fcf6cfc60d3" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "urn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "json": { + "schemaName": "project-id-1.bigquery-dataset-1.table-1", + "platform": "urn:li:dataPlatform:bigquery", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "age", + "nullable": false, + "description": "comment", + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "INT", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Test Policy Tag" + } + ] + }, + "glossaryTerms": { + "terms": [ + { + "urn": "urn:li:glossaryTerm:Age" + } + ], + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:datahub" + } + }, + "isPartOfKey": false, + "isPartitioningKey": false + }, + { + "fieldPath": "email", + "nullable": false, + "description": "comment", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "globalTags": { + "tags": [] + }, + "glossaryTerms": { + "terms": [ + { + "urn": "urn:li:glossaryTerm:Email_Address" + } + ], + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:datahub" + } + }, + "isPartOfKey": false, + "isPartitioningKey": false + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "https://console.cloud.google.com/bigquery?project=project-id-1&ws=!1m5!1m4!4m3!1sproject-id-1!2sbigquery-dataset-1!3stable-1", + "name": "table-1", + "qualifiedName": "project-id-1.bigquery-dataset-1.table-1", + "description": "", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:bigquery", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,project-id-1)" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "urn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3" + }, + { + "id": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "urn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "json": { + "schemaName": "project-id-1.bigquery-dataset-1.view-1", + "platform": "urn:li:dataPlatform:bigquery", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "age", + "nullable": false, + "description": "comment", + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "INT", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Test Policy Tag" + } + ] + }, + "isPartOfKey": false, + "isPartitioningKey": false + }, + { + "fieldPath": "email", + "nullable": false, + "description": "comment", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "globalTags": { + "tags": [] + }, + "isPartOfKey": false, + "isPartitioningKey": false + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "https://console.cloud.google.com/bigquery?project=project-id-1&ws=!1m5!1m4!4m3!1sproject-id-1!2sbigquery-dataset-1!3sview-1", + "name": "view-1", + "qualifiedName": "project-id-1.bigquery-dataset-1.view-1", + "description": "", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:bigquery", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,project-id-1)" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "create view `bigquery-dataset-1.view-1` as select email from `bigquery-dataset-1.table-1`", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "urn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3" + }, + { + "id": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "urn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "json": { + "schemaName": "project-id-1.bigquery-dataset-1.snapshot-table-1", + "platform": "urn:li:dataPlatform:bigquery", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "age", + "nullable": false, + "description": "comment", + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "INT", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Test Policy Tag" + } + ] + }, + "isPartOfKey": false, + "isPartitioningKey": false + }, + { + "fieldPath": "email", + "nullable": false, + "description": "comment", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "globalTags": { + "tags": [] + }, + "isPartOfKey": false, + "isPartitioningKey": false + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "https://console.cloud.google.com/bigquery?project=project-id-1&ws=!1m5!1m4!4m3!1sproject-id-1!2sbigquery-dataset-1!3ssnapshot-table-1", + "name": "snapshot-table-1", + "qualifiedName": "project-id-1.bigquery-dataset-1.snapshot-table-1", + "description": "", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:bigquery", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,project-id-1)" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Bigquery Table Snapshot" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "type": "COPY" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD),age)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD),age)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD),email)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD),email)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "urn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3" + }, + { + "id": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "urn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "type": "VIEW", + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD),email)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD),email)" + ], + "confidenceScore": 0.9, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "queryProperties", + "aspect": { + "json": { + "statement": { + "value": "CREATE VIEW `bigquery-dataset-1.view-1` AS\nSELECT\n email\nFROM `bigquery-dataset-1.table-1`", + "language": "SQL" + }, + "source": "SYSTEM", + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "lastModified": { + "time": 1643871600000, + "actor": "urn:li:corpuser:_ingestion" + } + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "querySubjects", + "aspect": { + "json": { + "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD),email)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD),email)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:bigquery" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "glossaryTerm", + "entityUrn": "urn:li:glossaryTerm:Age", + "changeType": "UPSERT", + "aspectName": "glossaryTermKey", + "aspect": { + "json": { + "name": "Age" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "glossaryTerm", + "entityUrn": "urn:li:glossaryTerm:Email_Address", + "changeType": "UPSERT", + "aspectName": "glossaryTermKey", + "aspect": { + "json": { + "name": "Email_Address" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:Test Policy Tag", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "Test Policy Tag" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/bigquery_v2/bigquery_project_label_mcp_golden.json b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_project_label_mcp_golden.json index ab59b95a9f388..a1e5c3fd18f23 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/bigquery_project_label_mcp_golden.json +++ b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_project_label_mcp_golden.json @@ -268,7 +268,8 @@ "actor": "urn:li:corpuser:datahub" } }, - "isPartOfKey": false + "isPartOfKey": false, + "isPartitioningKey": false }, { "fieldPath": "email", @@ -295,7 +296,8 @@ "actor": "urn:li:corpuser:datahub" } }, - "isPartOfKey": false + "isPartOfKey": false, + "isPartitioningKey": false } ] } diff --git a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py index f9481d1d83d8b..0ac4e94a5a24f 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py +++ b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py @@ -54,6 +54,7 @@ def recipe(mcp_output_path: str, source_config_override: dict = {}) -> dict: "include_usage_statistics": False, "include_table_lineage": True, "include_data_platform_instance": True, + "capture_table_label_as_tag": True, "classification": ClassificationConfig( enabled=True, classifiers=[ @@ -155,6 +156,10 @@ def test_bigquery_v2_ingest( last_altered=None, size_in_bytes=None, rows_count=None, + labels={ + "priority": "high", + "purchase": "urn_li_encoded_tag_ovzg4otmne5hiylhhjyhk4tdnbqxgzi_", + }, ) get_tables_for_dataset.return_value = iter([bigquery_table]) snapshot_table = BigqueryTableSnapshot( @@ -319,8 +324,8 @@ def test_bigquery_queries_v2_ingest( tmp_path, ): test_resources_dir = pytestconfig.rootpath / "tests/integration/bigquery_v2" - mcp_golden_path = f"{test_resources_dir}/bigquery_mcp_golden.json" - mcp_output_path = "{}/{}".format(tmp_path, "bigquery_mcp_output.json") + mcp_golden_path = f"{test_resources_dir}/bigquery_mcp_queries_golden.json" + mcp_output_path = "{}/{}".format(tmp_path, "bigquery_mcp_queries_output.json") dataset_name = "bigquery-dataset-1" get_datasets_for_project_id.return_value = [ diff --git a/metadata-ingestion/tests/integration/preset/golden_test_ingest.json b/metadata-ingestion/tests/integration/preset/golden_test_ingest.json new file mode 100644 index 0000000000000..5aca7f3e5bd14 --- /dev/null +++ b/metadata-ingestion/tests/integration/preset/golden_test_ingest.json @@ -0,0 +1,286 @@ +[ +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DashboardSnapshot": { + "urn": "urn:li:dashboard:(preset,1)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dashboard.DashboardInfo": { + "customProperties": { + "Status": "published", + "IsPublished": "true", + "Owners": "test_username_1, test_username_2", + "IsCertified": "true", + "CertifiedBy": "Certification team", + "CertificationDetails": "Approved" + }, + "title": "test_dashboard_title_1", + "description": "", + "charts": [ + "urn:li:chart:(preset,10)", + "urn:li:chart:(preset,11)" + ], + "datasets": [], + "lastModified": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 1720594800000, + "actor": "urn:li:corpuser:test_username_1" + } + }, + "dashboardUrl": "mock://mock-domain.preset.io/dashboard/test_dashboard_url_1" + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1720594800000, + "runId": "preset-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DashboardSnapshot": { + "urn": "urn:li:dashboard:(preset,2)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dashboard.DashboardInfo": { + "customProperties": { + "Status": "draft", + "IsPublished": "false", + "Owners": "unknown", + "IsCertified": "false" + }, + "title": "test_dashboard_title_2", + "description": "", + "charts": [ + "urn:li:chart:(preset,12)", + "urn:li:chart:(preset,13)" + ], + "datasets": [], + "lastModified": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 1720594800000, + "actor": "urn:li:corpuser:test_username_2" + } + }, + "dashboardUrl": "mock://mock-domain.preset.io/dashboard/test_dashboard_url_2" + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1720594800000, + "runId": "preset-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.ChartSnapshot": { + "urn": "urn:li:chart:(preset,10)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.chart.ChartInfo": { + "customProperties": { + "Metrics": "", + "Filters": "", + "Dimensions": "" + }, + "title": "test_chart_title_1", + "description": "", + "lastModified": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 1720594800000, + "actor": "urn:li:corpuser:test_username_1" + } + }, + "chartUrl": "mock://mock-domain.preset.io/explore/test_chart_url_10", + "inputs": [ + { + "string": "urn:li:dataset:(urn:li:dataPlatform:external,test_database_name.test_schema_name.test_table_name,PROD)" + } + ], + "type": "BAR" + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1720594800000, + "runId": "preset-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.ChartSnapshot": { + "urn": "urn:li:chart:(preset,11)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.chart.ChartInfo": { + "customProperties": { + "Metrics": "", + "Filters": "", + "Dimensions": "" + }, + "title": "test_chart_title_2", + "description": "", + "lastModified": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 1720594800000, + "actor": "urn:li:corpuser:test_username_1" + } + }, + "chartUrl": "mock://mock-domain.preset.io/explore/test_chart_url_11", + "inputs": [ + { + "string": "urn:li:dataset:(urn:li:dataPlatform:external,test_database_name.test_schema_name.test_table_name,PROD)" + } + ], + "type": "PIE" + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1720594800000, + "runId": "preset-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.ChartSnapshot": { + "urn": "urn:li:chart:(preset,12)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.chart.ChartInfo": { + "customProperties": { + "Metrics": "", + "Filters": "", + "Dimensions": "" + }, + "title": "test_chart_title_3", + "description": "", + "lastModified": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 1720594800000, + "actor": "urn:li:corpuser:test_username_2" + } + }, + "chartUrl": "mock://mock-domain.preset.io/explore/test_chart_url_12", + "inputs": [ + { + "string": "urn:li:dataset:(urn:li:dataPlatform:external,test_database_name.test_schema_name.test_table_name,PROD)" + } + ], + "type": "AREA" + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1720594800000, + "runId": "preset-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.ChartSnapshot": { + "urn": "urn:li:chart:(preset,13)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.chart.ChartInfo": { + "customProperties": { + "Metrics": "", + "Filters": "", + "Dimensions": "" + }, + "title": "test_chart_title_4", + "description": "", + "lastModified": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 1720594800000, + "actor": "urn:li:corpuser:test_username_2" + } + }, + "chartUrl": "mock://mock-domain.preset.io/explore/test_chart_url_13", + "inputs": [ + { + "string": "urn:li:dataset:(urn:li:dataPlatform:external,test_database_name.test_schema_name.test_table_name,PROD)" + } + ], + "type": "HISTOGRAM" + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1720594800000, + "runId": "preset-test", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/preset/golden_test_stateful_ingest.json b/metadata-ingestion/tests/integration/preset/golden_test_stateful_ingest.json new file mode 100644 index 0000000000000..719f0a78fb7d7 --- /dev/null +++ b/metadata-ingestion/tests/integration/preset/golden_test_stateful_ingest.json @@ -0,0 +1,261 @@ +[ +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DashboardSnapshot": { + "urn": "urn:li:dashboard:(preset,1)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dashboard.DashboardInfo": { + "customProperties": { + "Status": "published", + "IsPublished": "true", + "Owners": "test_username_1, test_username_2", + "IsCertified": "true", + "CertifiedBy": "Certification team", + "CertificationDetails": "Approved" + }, + "title": "test_dashboard_title_1", + "description": "", + "charts": [ + "urn:li:chart:(preset,10)", + "urn:li:chart:(preset,11)" + ], + "datasets": [], + "lastModified": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 1720594800000, + "actor": "urn:li:corpuser:test_username_1" + } + }, + "dashboardUrl": "mock://mock-domain.preset.io/dashboard/test_dashboard_url_1" + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1720594800000, + "runId": "preset-2024_07_10-07_00_00", + "lastRunId": "no-run-id-provided", + "pipelineName": "test_pipeline" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.ChartSnapshot": { + "urn": "urn:li:chart:(preset,10)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.chart.ChartInfo": { + "customProperties": { + "Metrics": "", + "Filters": "", + "Dimensions": "" + }, + "title": "test_chart_title_1", + "description": "", + "lastModified": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 1720594800000, + "actor": "urn:li:corpuser:test_username_1" + } + }, + "chartUrl": "mock://mock-domain.preset.io/explore/test_chart_url_10", + "inputs": [ + { + "string": "urn:li:dataset:(urn:li:dataPlatform:external,test_database_name.test_schema_name.test_table_name,PROD)" + } + ], + "type": "BAR" + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1720594800000, + "runId": "preset-2024_07_10-07_00_00", + "lastRunId": "no-run-id-provided", + "pipelineName": "test_pipeline" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.ChartSnapshot": { + "urn": "urn:li:chart:(preset,11)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.chart.ChartInfo": { + "customProperties": { + "Metrics": "", + "Filters": "", + "Dimensions": "" + }, + "title": "test_chart_title_2", + "description": "", + "lastModified": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 1720594800000, + "actor": "urn:li:corpuser:test_username_1" + } + }, + "chartUrl": "mock://mock-domain.preset.io/explore/test_chart_url_11", + "inputs": [ + { + "string": "urn:li:dataset:(urn:li:dataPlatform:external,test_database_name.test_schema_name.test_table_name,PROD)" + } + ], + "type": "PIE" + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1720594800000, + "runId": "preset-2024_07_10-07_00_00", + "lastRunId": "no-run-id-provided", + "pipelineName": "test_pipeline" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.ChartSnapshot": { + "urn": "urn:li:chart:(preset,12)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.chart.ChartInfo": { + "customProperties": { + "Metrics": "", + "Filters": "", + "Dimensions": "" + }, + "title": "test_chart_title_3", + "description": "", + "lastModified": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 1720594800000, + "actor": "urn:li:corpuser:test_username_2" + } + }, + "chartUrl": "mock://mock-domain.preset.io/explore/test_chart_url_12", + "inputs": [ + { + "string": "urn:li:dataset:(urn:li:dataPlatform:external,test_database_name.test_schema_name.test_table_name,PROD)" + } + ], + "type": "AREA" + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1720594800000, + "runId": "preset-2024_07_10-07_00_00", + "lastRunId": "no-run-id-provided", + "pipelineName": "test_pipeline" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.ChartSnapshot": { + "urn": "urn:li:chart:(preset,13)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.chart.ChartInfo": { + "customProperties": { + "Metrics": "", + "Filters": "", + "Dimensions": "" + }, + "title": "test_chart_title_4", + "description": "", + "lastModified": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 1720594800000, + "actor": "urn:li:corpuser:test_username_2" + } + }, + "chartUrl": "mock://mock-domain.preset.io/explore/test_chart_url_13", + "inputs": [ + { + "string": "urn:li:dataset:(urn:li:dataPlatform:external,test_database_name.test_schema_name.test_table_name,PROD)" + } + ], + "type": "HISTOGRAM" + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1720594800000, + "runId": "preset-2024_07_10-07_00_00", + "lastRunId": "no-run-id-provided", + "pipelineName": "test_pipeline" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(preset,2)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": true + } + }, + "systemMetadata": { + "lastObserved": 1720594800000, + "runId": "preset-2024_07_10-07_00_00", + "lastRunId": "no-run-id-provided", + "pipelineName": "test_pipeline" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/preset/test_preset.py b/metadata-ingestion/tests/integration/preset/test_preset.py new file mode 100644 index 0000000000000..f926a762e6a07 --- /dev/null +++ b/metadata-ingestion/tests/integration/preset/test_preset.py @@ -0,0 +1,366 @@ +from typing import Any, Dict, Optional +from unittest.mock import patch + +import pytest +from freezegun import freeze_time + +from datahub.ingestion.run.pipeline import Pipeline +from tests.test_helpers import mce_helpers +from tests.test_helpers.state_helpers import ( + get_current_checkpoint_from_pipeline, + run_and_get_pipeline, + validate_all_providers_have_committed_successfully, +) + +FROZEN_TIME = "2024-07-10 07:00:00" +GMS_PORT = 8080 +GMS_SERVER = f"http://localhost:{GMS_PORT}" + + +def register_mock_api(request_mock: Any, override_data: Optional[dict] = None) -> None: + if override_data is None: + override_data = {} + + api_vs_response = { + "mock://mock-domain.preset.io/v1/auth/": { + "method": "POST", + "status_code": 200, + "json": { + "payload": { + "access_token": "test_token", + } + }, + }, + "mock://mock-domain.preset.io/version": { + "method": "GET", + "status_code": 200, + "json": { + "ci": { + "built_at": "Tue Jul 10 00:00:00 UTC 2024", + "build_num": "1", + "triggered_by": "Not triggered by a user", + }, + "git": { + "branch": "4.0.1.6", + "sha": "test_sha", + "sha_superset": "test_sha_superset", + "release_name": "test_release_name", + }, + "chart_version": "1.16.1", + "start_time": "2024-07-10 00:00:00", + "mt_deployment": True, + }, + }, + "mock://mock-domain.preset.io/api/v1/dashboard/": { + "method": "GET", + "status_code": 200, + "json": { + "count": 2, + "result": [ + { + "id": "1", + "changed_by": { + "username": "test_username_1", + }, + "changed_on_utc": "2024-07-10T07:00:00.000000+0000", + "dashboard_title": "test_dashboard_title_1", + "url": "/dashboard/test_dashboard_url_1", + "position_json": '{"CHART-test-1": {"meta": { "chartId": "10" }}, "CHART-test-2": {"meta": { "chartId": "11" }}}', + "status": "published", + "published": True, + "owners": [ + { + "username": "test_username_1", + }, + { + "username": "test_username_2", + }, + ], + "certified_by": "Certification team", + "certification_details": "Approved", + }, + { + "id": "2", + "changed_by": { + "username": "test_username_2", + }, + "changed_on_utc": "2024-07-10T07:00:00.000000+0000", + "dashboard_title": "test_dashboard_title_2", + "url": "/dashboard/test_dashboard_url_2", + "position_json": '{"CHART-test-3": {"meta": { "chartId": "12" }}, "CHART-test-4": {"meta": { "chartId": "13" }}}', + "status": "draft", + "published": False, + "owners": [ + { + "first_name": "name", + }, + ], + "certified_by": "", + "certification_details": "", + }, + ], + }, + }, + "mock://mock-domain.preset.io/api/v1/chart/": { + "method": "GET", + "status_code": 200, + "json": { + "count": 4, + "result": [ + { + "id": "10", + "changed_by": { + "username": "test_username_1", + }, + "changed_on_utc": "2024-07-10T07:00:00.000000+0000", + "slice_name": "test_chart_title_1", + "viz_type": "box_plot", + "url": "/explore/test_chart_url_10", + "datasource_id": "20", + "params": '{"metrics": [], "adhoc_filters": []}', + }, + { + "id": "11", + "changed_by": { + "username": "test_username_1", + }, + "changed_on_utc": "2024-07-10T07:00:00.000000+0000", + "slice_name": "test_chart_title_2", + "viz_type": "pie", + "url": "/explore/test_chart_url_11", + "datasource_id": "20", + "params": '{"metrics": [], "adhoc_filters": []}', + }, + { + "id": "12", + "changed_by": { + "username": "test_username_2", + }, + "changed_on_utc": "2024-07-10T07:00:00.000000+0000", + "slice_name": "test_chart_title_3", + "viz_type": "treemap", + "url": "/explore/test_chart_url_12", + "datasource_id": "20", + "params": '{"metrics": [], "adhoc_filters": []}', + }, + { + "id": "13", + "changed_by": { + "username": "test_username_2", + }, + "changed_on_utc": "2024-07-10T07:00:00.000000+0000", + "slice_name": "test_chart_title_4", + "viz_type": "histogram", + "url": "/explore/test_chart_url_13", + "datasource_id": "20", + "params": '{"metrics": [], "adhoc_filters": []}', + }, + ], + }, + }, + "mock://mock-domain.preset.io/api/v1/dataset/20": { + "method": "GET", + "status_code": 200, + "json": { + "result": { + "schema": "test_schema_name", + "table_name": "test_table_name", + "database": { + "id": "30", + "database_name": "test_database_name", + }, + }, + }, + }, + "mock://mock-domain.preset.io/api/v1/database/30": { + "method": "GET", + "status_code": 200, + "json": { + "result": { + "sqlalchemy_uri": "test_sqlalchemy_uri", + }, + }, + }, + } + + api_vs_response.update(override_data) + + for url in api_vs_response: + request_mock.register_uri( + api_vs_response[url]["method"], + url, + json=api_vs_response[url]["json"], + status_code=api_vs_response[url]["status_code"], + ) + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_preset_ingest(pytestconfig, tmp_path, mock_time, requests_mock): + test_resources_dir = pytestconfig.rootpath / "tests/integration/preset" + + register_mock_api(request_mock=requests_mock) + + pipeline = Pipeline.create( + { + "run_id": "preset-test", + "source": { + "type": "preset", + "config": { + "connect_uri": "mock://mock-domain.preset.io/", + "manager_uri": "mock://mock-domain.preset.io", + "api_key": "test_key", + "api_secret": "test_secret", + "provider": "db", + }, + }, + "sink": { + "type": "file", + "config": { + "filename": f"{tmp_path}/preset_mces.json", + }, + }, + } + ) + + pipeline.run() + pipeline.raise_from_status() + golden_file = "golden_test_ingest.json" + + mce_helpers.check_golden_file( + pytestconfig, + output_path=tmp_path / "preset_mces.json", + golden_path=f"{test_resources_dir}/{golden_file}", + ) + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_preset_stateful_ingest( + pytestconfig, tmp_path, mock_time, requests_mock, mock_datahub_graph +): + test_resources_dir = pytestconfig.rootpath / "tests/integration/preset" + + register_mock_api(request_mock=requests_mock) + + pipeline_config_dict: Dict[str, Any] = { + "source": { + "type": "preset", + "config": { + "connect_uri": "mock://mock-domain.preset.io/", + "manager_uri": "mock://mock-domain.preset.io", + "api_key": "test_key", + "api_secret": "test_secret", + "provider": "db", + # enable stateful ingestion + "stateful_ingestion": { + "enabled": True, + "remove_stale_metadata": True, + "fail_safe_threshold": 100.0, + "state_provider": { + "type": "datahub", + "config": {"datahub_api": {"server": GMS_SERVER}}, + }, + }, + }, + }, + "sink": { + # we are not really interested in the resulting events for this test + "type": "console" + }, + "pipeline_name": "test_pipeline", + } + + dashboard_endpoint_override = { + "mock://mock-domain.preset.io/api/v1/dashboard/": { + "method": "GET", + "status_code": 200, + "json": { + "count": 1, + "result": [ + { + "id": "1", + "changed_by": { + "username": "test_username_1", + }, + "changed_on_utc": "2024-07-10T07:00:00.000000+0000", + "dashboard_title": "test_dashboard_title_1", + "url": "/dashboard/test_dashboard_url_1", + "position_json": '{"CHART-test-1": {"meta": { "chartId": "10" }}, "CHART-test-2": {"meta": { "chartId": "11" }}}', + "status": "published", + "published": True, + "owners": [ + { + "username": "test_username_1", + }, + { + "username": "test_username_2", + }, + ], + "certified_by": "Certification team", + "certification_details": "Approved", + }, + ], + }, + }, + } + + with patch( + "datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider.DataHubGraph", + mock_datahub_graph, + ) as mock_checkpoint: + # Both checkpoint and reporting will use the same mocked graph instance. + mock_checkpoint.return_value = mock_datahub_graph + + # Do the first run of the pipeline and get the default job's checkpoint. + pipeline_run1 = run_and_get_pipeline(pipeline_config_dict) + checkpoint1 = get_current_checkpoint_from_pipeline(pipeline_run1) + + assert checkpoint1 + assert checkpoint1.state + + # Remove one dashboard from the preset config. + register_mock_api( + request_mock=requests_mock, override_data=dashboard_endpoint_override + ) + + # Capture MCEs of second run to validate Status(removed=true) + deleted_mces_path = f"{tmp_path}/preset_deleted_mces.json" + pipeline_config_dict["sink"]["type"] = "file" + pipeline_config_dict["sink"]["config"] = {"filename": deleted_mces_path} + + # Do the second run of the pipeline. + pipeline_run2 = run_and_get_pipeline(pipeline_config_dict) + checkpoint2 = get_current_checkpoint_from_pipeline(pipeline_run2) + + assert checkpoint2 + assert checkpoint2.state + + # Perform all assertions on the states. The deleted dashboard should not be + # part of the second state + state1 = checkpoint1.state + state2 = checkpoint2.state + difference_urns = list( + state1.get_urns_not_in(type="dashboard", other_checkpoint_state=state2) + ) + + assert len(difference_urns) == 1 + + urn1 = "urn:li:dashboard:(preset,2)" + + assert urn1 in difference_urns + + # Validate that all providers have committed successfully. + validate_all_providers_have_committed_successfully( + pipeline=pipeline_run1, expected_providers=1 + ) + validate_all_providers_have_committed_successfully( + pipeline=pipeline_run2, expected_providers=1 + ) + + # Verify the output. + mce_helpers.check_golden_file( + pytestconfig, + output_path=deleted_mces_path, + golden_path=test_resources_dir / "golden_test_stateful_ingest.json", + ) diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/state/golden_test_checkpoint_state.json b/metadata-ingestion/tests/unit/stateful_ingestion/state/golden_test_checkpoint_state.json index ce03804279097..22f63da8ecb95 100644 --- a/metadata-ingestion/tests/unit/stateful_ingestion/state/golden_test_checkpoint_state.json +++ b/metadata-ingestion/tests/unit/stateful_ingestion/state/golden_test_checkpoint_state.json @@ -17,7 +17,7 @@ "state": { "formatVersion": "1.0", "serde": "utf-8", - "payload": "{\"urns\": [\"urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset1,PROD)\", \"urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset2,PROD)\", \"urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset3,PROD)\", \"urn:li:dataProcessInstance:478810e859f870a54f72c681f41af619\"]}" + "payload": "{\"urns\": [\"urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset1,PROD)\", \"urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset2,PROD)\", \"urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset3,PROD)\", \"urn:li:dataProcessInstance:478810e859f870a54f72c681f41af619\", \"urn:li:query:query1\"]}" }, "runId": "dummy-test-stateful-ingestion" } diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/state/golden_test_checkpoint_state_after_deleted.json b/metadata-ingestion/tests/unit/stateful_ingestion/state/golden_test_checkpoint_state_after_deleted.json index 6a00e67a2ca21..a155c4cf1dbbb 100644 --- a/metadata-ingestion/tests/unit/stateful_ingestion/state/golden_test_checkpoint_state_after_deleted.json +++ b/metadata-ingestion/tests/unit/stateful_ingestion/state/golden_test_checkpoint_state_after_deleted.json @@ -8,8 +8,8 @@ "json": { "timestampMillis": 1586847600000, "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" }, "pipelineName": "dummy_stateful", "platformInstanceId": "", @@ -17,7 +17,7 @@ "state": { "formatVersion": "1.0", "serde": "utf-8", - "payload": "{\"urns\": [\"urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset1,PROD)\", \"urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset2,PROD)\", \"urn:li:dataProcessInstance:7f26c3b4d2d82ace47f4b9dd0c9dea26\"]}" + "payload": "{\"urns\": [\"urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset1,PROD)\", \"urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset2,PROD)\"]}" }, "runId": "dummy-test-stateful-ingestion" } diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/state/golden_test_stateful_ingestion.json b/metadata-ingestion/tests/unit/stateful_ingestion/state/golden_test_stateful_ingestion.json index adf11a2833914..5cb8576594db3 100644 --- a/metadata-ingestion/tests/unit/stateful_ingestion/state/golden_test_stateful_ingestion.json +++ b/metadata-ingestion/tests/unit/stateful_ingestion/state/golden_test_stateful_ingestion.json @@ -69,6 +69,23 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "query", + "entityUrn": "urn:li:query:query1", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:bigquery" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "dummy-test-stateful-ingestion", + "lastRunId": "no-run-id-provided", + "pipelineName": "dummy_stateful" + } +}, { "entityType": "dataProcessInstance", "entityUrn": "urn:li:dataProcessInstance:478810e859f870a54f72c681f41af619", @@ -84,5 +101,22 @@ "runId": "dummy-test-stateful-ingestion", "lastRunId": "no-run-id-provided" } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:query1", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "dummy-test-stateful-ingestion", + "lastRunId": "no-run-id-provided", + "pipelineName": "dummy_stateful" + } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/state/golden_test_stateful_ingestion_after_deleted.json b/metadata-ingestion/tests/unit/stateful_ingestion/state/golden_test_stateful_ingestion_after_deleted.json index e4893642d61ae..5300743f23ca8 100644 --- a/metadata-ingestion/tests/unit/stateful_ingestion/state/golden_test_stateful_ingestion_after_deleted.json +++ b/metadata-ingestion/tests/unit/stateful_ingestion/state/golden_test_stateful_ingestion_after_deleted.json @@ -47,12 +47,46 @@ } } }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "dummy-test-stateful-ingestion", + "lastRunId": "no-run-id-provided", + "pipelineName": "dummy_stateful" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset3,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": true + } + }, "systemMetadata": { "lastObserved": 1586847600000, "runId": "dummy-test-stateful-ingestion", "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "query", + "entityUrn": "urn:li:query:query2", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:bigquery" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "dummy-test-stateful-ingestion", + "lastRunId": "no-run-id-provided", + "pipelineName": "dummy_stateful" + } +}, { "entityType": "dataProcessInstance", "entityUrn": "urn:li:dataProcessInstance:7f26c3b4d2d82ace47f4b9dd0c9dea26", @@ -66,23 +100,25 @@ "systemMetadata": { "lastObserved": 1586847600000, "runId": "dummy-test-stateful-ingestion", - "lastRunId": "no-run-id-provided" + "lastRunId": "no-run-id-provided", + "pipelineName": "dummy_stateful" } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset3,PROD)", + "entityType": "query", + "entityUrn": "urn:li:query:query2", "changeType": "UPSERT", "aspectName": "status", "aspect": { "json": { - "removed": true + "removed": false } }, "systemMetadata": { "lastObserved": 1586847600000, "runId": "dummy-test-stateful-ingestion", - "lastRunId": "no-run-id-provided" + "lastRunId": "no-run-id-provided", + "pipelineName": "dummy_stateful" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stateful_ingestion.py b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stateful_ingestion.py index e3a2a6cccea79..66564dc856aba 100644 --- a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stateful_ingestion.py +++ b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stateful_ingestion.py @@ -29,7 +29,12 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataprocess import ( DataProcessInstanceProperties, ) -from datahub.metadata.schema_classes import AuditStampClass, StatusClass +from datahub.metadata.schema_classes import ( + AuditStampClass, + DataPlatformInstanceClass, + StatusClass, +) +from datahub.metadata.urns import DataPlatformUrn, QueryUrn from datahub.utilities.urns.dataset_urn import DatasetUrn from tests.test_helpers import mce_helpers from tests.test_helpers.state_helpers import ( @@ -71,6 +76,9 @@ class DummySourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin): default=None, description="Data process instance id to ingest.", ) + query_id_to_ingest: Optional[str] = Field( + default=None, description="Query id to ingest" + ) class DummySource(StatefulIngestionSourceBase): @@ -136,6 +144,14 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: ), ).as_workunit() + if self.source_config.query_id_to_ingest: + yield MetadataChangeProposalWrapper( + entityUrn=QueryUrn(self.source_config.query_id_to_ingest).urn(), + aspect=DataPlatformInstanceClass( + platform=DataPlatformUrn("bigquery").urn() + ), + ).as_workunit() + if self.source_config.report_failure: self.reporter.report_failure("Dummy error", "Error") @@ -188,6 +204,7 @@ def test_stateful_ingestion(pytestconfig, tmp_path, mock_time): }, }, "dpi_id_to_ingest": "job1", + "query_id_to_ingest": "query1", }, }, "sink": { @@ -198,7 +215,11 @@ def test_stateful_ingestion(pytestconfig, tmp_path, mock_time): with mock.patch( "datahub.ingestion.source.state.stale_entity_removal_handler.StaleEntityRemovalHandler._get_state_obj" - ) as mock_state: + ) as mock_state, mock.patch( + "datahub.ingestion.source.state.stale_entity_removal_handler.STATEFUL_INGESTION_IGNORED_ENTITY_TYPES", + {}, + # Second mock is to imitate earlier behavior where entity type check was not present when adding entity to state + ): mock_state.return_value = GenericCheckpointState(serde="utf-8") pipeline_run1 = None pipeline_run1_config: Dict[str, Dict[str, Dict[str, Any]]] = dict( # type: ignore @@ -237,6 +258,8 @@ def test_stateful_ingestion(pytestconfig, tmp_path, mock_time): "allow": ["dummy_dataset1", "dummy_dataset2"], } pipeline_run2_config["source"]["config"]["dpi_id_to_ingest"] = "job2" + pipeline_run2_config["source"]["config"]["query_id_to_ingest"] = "query2" + pipeline_run2_config["sink"]["config"][ "filename" ] = f"{tmp_path}/{output_file_name_after_deleted}" @@ -288,6 +311,7 @@ def test_stateful_ingestion(pytestconfig, tmp_path, mock_time): # assert report last ingestion state non_deletable entity urns non_deletable_urns: List[str] = [ "urn:li:dataProcessInstance:478810e859f870a54f72c681f41af619", + "urn:li:query:query1", ] assert sorted(non_deletable_urns) == sorted( report.last_state_non_deletable_entities diff --git a/metadata-ingestion/tests/unit/test_preset_source.py b/metadata-ingestion/tests/unit/test_preset_source.py new file mode 100644 index 0000000000000..d97db651f4c79 --- /dev/null +++ b/metadata-ingestion/tests/unit/test_preset_source.py @@ -0,0 +1,22 @@ +from datahub.ingestion.source.preset import PresetConfig + + +def test_default_values(): + config = PresetConfig.parse_obj({}) + + assert config.connect_uri == "" + assert config.manager_uri == "https://api.app.preset.io" + assert config.display_uri == "" + assert config.env == "PROD" + assert config.api_key is None + assert config.api_secret is None + + +def test_set_display_uri(): + display_uri = "some_host:1234" + + config = PresetConfig.parse_obj({"display_uri": display_uri}) + + assert config.connect_uri == "" + assert config.manager_uri == "https://api.app.preset.io" + assert config.display_uri == display_uri diff --git a/metadata-integration/java/spark-lineage-legacy/spark-smoke-test/docker/spark-docker-compose.yml b/metadata-integration/java/spark-lineage-legacy/spark-smoke-test/docker/spark-docker-compose.yml index 865c5dee0f86e..e2e0eba62f3d9 100644 --- a/metadata-integration/java/spark-lineage-legacy/spark-smoke-test/docker/spark-docker-compose.yml +++ b/metadata-integration/java/spark-lineage-legacy/spark-smoke-test/docker/spark-docker-compose.yml @@ -1,4 +1,3 @@ -version: "3.6" services: spark-master: image: spark-master diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java index cb02fb1c8b2f7..01a1e9cb15984 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java @@ -13,6 +13,8 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.linkedin.common.urn.Urn; +import com.linkedin.data.schema.DataSchema; +import com.linkedin.data.schema.MapDataSchema; import com.linkedin.data.template.DoubleMap; import com.linkedin.metadata.config.search.SearchConfiguration; import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration; @@ -662,8 +664,48 @@ private static Map> buildSearchableF Collections.emptySet()))) .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + List objectFieldSpec = + entitySpec.getSearchableFieldSpecs().stream() + .filter( + searchableFieldSpec -> + searchableFieldSpec.getSearchableAnnotation().getFieldType() + == SearchableAnnotation.FieldType.OBJECT) + .collect(Collectors.toList()); + + Map> objectFieldTypes = new HashMap<>(); + + objectFieldSpec.forEach( + fieldSpec -> { + String fieldName = fieldSpec.getSearchableAnnotation().getFieldName(); + DataSchema.Type dataType = + ((MapDataSchema) fieldSpec.getPegasusSchema()).getValues().getType(); + + Set fieldType; + + switch (dataType) { + case BOOLEAN: + fieldType = Set.of(SearchableAnnotation.FieldType.BOOLEAN); + break; + case INT: + fieldType = Set.of(SearchableAnnotation.FieldType.COUNT); + break; + case DOUBLE: + case LONG: + case FLOAT: + fieldType = Set.of(SearchableAnnotation.FieldType.DOUBLE); + break; + default: + fieldType = Set.of(SearchableAnnotation.FieldType.TEXT); + break; + } + objectFieldTypes.put(fieldName, fieldType); + annotationFieldTypes.remove(fieldName); + }); + return Stream.concat( - annotationFieldTypes.entrySet().stream(), + Stream.concat( + objectFieldTypes.entrySet().stream(), + annotationFieldTypes.entrySet().stream()), Stream.concat( mappingFieldTypes.entrySet().stream(), aliasFieldTypes.entrySet().stream())); }) diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java index e135f1941bfec..f72b5fc1f6d22 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java @@ -759,7 +759,7 @@ private static Set getFieldTypes( StructuredPropertyUtils.toElasticsearchFieldType(fieldName, aspectRetriever); } else { Set fieldTypes = - searchableFields.getOrDefault(fieldName, Collections.emptySet()); + searchableFields.getOrDefault(fieldName.split("\\.")[0], Collections.emptySet()); finalFieldTypes = fieldTypes.stream().map(ESUtils::getElasticTypeForFieldType).collect(Collectors.toSet()); } @@ -785,6 +785,7 @@ private static RangeQueryBuilder buildRangeQueryFromCriterion( // Determine criterion value, range query only accepts single value so take first value in // values if multiple String criterionValueString = criterion.getValues().get(0).trim(); + Object criterionValue; String documentFieldName; if (fieldTypes.contains(BOOLEAN_FIELD_TYPE)) { diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/EbeanAspectDaoTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/EbeanAspectDaoTest.java index 109c9b5c44efb..775770d28b4a2 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/EbeanAspectDaoTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/EbeanAspectDaoTest.java @@ -9,7 +9,6 @@ import com.linkedin.metadata.EbeanTestUtils; import com.linkedin.metadata.aspect.batch.AspectsBatch; import com.linkedin.metadata.config.EbeanConfiguration; -import com.linkedin.metadata.entity.EbeanEntityServiceTest; import io.ebean.Database; import io.ebean.test.LoggedSql; import java.util.List; @@ -24,7 +23,7 @@ public class EbeanAspectDaoTest { @BeforeMethod public void setupTest() { - Database server = EbeanTestUtils.createTestServer(EbeanEntityServiceTest.class.getSimpleName()); + Database server = EbeanTestUtils.createTestServer(EbeanAspectDaoTest.class.getSimpleName()); testDao = new EbeanAspectDao(server, EbeanConfiguration.testDefault); } @@ -34,7 +33,8 @@ public void testGetNextVersionForUpdate() { testDao.runInTransactionWithRetryUnlocked( (txContext) -> { - testDao.getNextVersions(Map.of("urn:li:corpuser:test", Set.of("status"))); + testDao.getNextVersions( + Map.of("urn:li:corpuser:testGetNextVersionForUpdate", Set.of("status"))); return ""; }, mock(AspectsBatch.class), @@ -43,9 +43,9 @@ public void testGetNextVersionForUpdate() { // Get the captured SQL statements List sql = LoggedSql.stop().stream() - .filter(str -> str.contains("(t0.urn,t0.aspect,t0.version)")) + .filter(str -> str.contains("testGetNextVersionForUpdate")) .toList(); - assertEquals(sql.size(), 1, String.format("Found: %s", sql)); + assertEquals(sql.size(), 2, String.format("Found: %s", sql)); assertTrue( sql.get(0).contains("for update;"), String.format("Did not find `for update` in %s ", sql)); } @@ -56,7 +56,8 @@ public void testGetLatestAspectsForUpdate() throws JsonProcessingException { testDao.runInTransactionWithRetryUnlocked( (txContext) -> { - testDao.getLatestAspects(Map.of("urn:li:corpuser:test", Set.of("status")), true); + testDao.getLatestAspects( + Map.of("urn:li:corpuser:testGetLatestAspectsForUpdate", Set.of("status")), true); return ""; }, mock(AspectsBatch.class), @@ -65,7 +66,7 @@ public void testGetLatestAspectsForUpdate() throws JsonProcessingException { // Get the captured SQL statements List sql = LoggedSql.stop().stream() - .filter(str -> str.contains("(t0.urn,t0.aspect,t0.version)")) + .filter(str -> str.contains("testGetLatestAspectsForUpdate")) .toList(); assertEquals( sql.size(), 1, String.format("Found: %s", new ObjectMapper().writeValueAsString(sql))); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/utils/ESUtilsTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/utils/ESUtilsTest.java index 8d06594e415e0..6665faacae337 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/utils/ESUtilsTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/utils/ESUtilsTest.java @@ -10,10 +10,13 @@ import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; +import com.google.common.collect.ImmutableList; import com.linkedin.common.urn.Urn; import com.linkedin.data.template.SetMode; +import com.linkedin.data.template.StringArray; import com.linkedin.entity.Aspect; import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.models.annotation.SearchableAnnotation; import com.linkedin.metadata.query.filter.Condition; import com.linkedin.metadata.query.filter.Criterion; import com.linkedin.metadata.search.elasticsearch.query.filter.QueryFilterRewriteChain; @@ -38,7 +41,7 @@ public class ESUtilsTest { private static AspectRetriever aspectRetrieverV1; @BeforeClass - public void setup() throws RemoteInvocationException, URISyntaxException { + public static void setup() throws RemoteInvocationException, URISyntaxException { Urn abFghTenUrn = Urn.createFromString("urn:li:structuredProperty:ab.fgh.ten"); // legacy @@ -835,4 +838,61 @@ public void testGetQueryBuilderFromStructPropExistsV1() { + "}"; Assert.assertEquals(result.toString(), expected); } + + @Test + public void testGetQueryBuilderForObjectFields() { + final Criterion singleValueCriterion = + new Criterion() + .setField("testObjectField.numericField") + .setCondition(Condition.EQUAL) + .setValues(new StringArray(ImmutableList.of("10"))); + + Map> searchableFieldTypes = new HashMap<>(); + searchableFieldTypes.put("testObjectField", Set.of(SearchableAnnotation.FieldType.DOUBLE)); + + QueryBuilder result = + ESUtils.getQueryBuilderFromCriterion( + singleValueCriterion, + false, + searchableFieldTypes, + mock(OperationContext.class), + QueryFilterRewriteChain.EMPTY); + String expected = + "{\n" + + " \"terms\" : {\n" + + " \"testObjectField.numericField\" : [\n" + + " 10.0\n" + + " ],\n" + + " \"boost\" : 1.0,\n" + + " \"_name\" : \"testObjectField.numericField\"\n" + + " }\n" + + "}"; + Assert.assertEquals(result.toString(), expected); + + final Criterion multiValueCriterion = + new Criterion() + .setField("testObjectField.numericField") + .setCondition(Condition.EQUAL) + .setValues(new StringArray(ImmutableList.of("10", "20"))); + + result = + ESUtils.getQueryBuilderFromCriterion( + multiValueCriterion, + false, + searchableFieldTypes, + mock(OperationContext.class), + QueryFilterRewriteChain.EMPTY); + expected = + "{\n" + + " \"terms\" : {\n" + + " \"testObjectField.numericField\" : [\n" + + " 10.0,\n" + + " 20.0\n" + + " ],\n" + + " \"boost\" : 1.0,\n" + + " \"_name\" : \"testObjectField.numericField\"\n" + + " }\n" + + "}"; + Assert.assertEquals(result.toString(), expected); + } } diff --git a/metadata-models/src/main/pegasus/com/linkedin/dashboard/DashboardInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/dashboard/DashboardInfo.pdl index c436011eb58db..0ce19b32c8930 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dashboard/DashboardInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dashboard/DashboardInfo.pdl @@ -100,6 +100,26 @@ record DashboardInfo includes CustomProperties, ExternalReference { } datasetEdges: optional array[Edge] + /** + * Dashboards included by this dashboard. + * Some dashboard entities (e.g. PowerBI Apps) can contain other dashboards. + * + * The Edge's sourceUrn should never be set, as it will always be the base dashboard. + */ + @Relationship = { + "/*/destinationUrn": { + "name": "DashboardContainsDashboard", + "entityTypes": [ "dashboard" ], + "isLineage": true, + "createdOn": "datasetEdges/*/created/time" + "createdActor": "datasetEdges/*/created/actor" + "updatedOn": "datasetEdges/*/lastModified/time" + "updatedActor": "datasetEdges/*/lastModified/actor" + "properties": "datasetEdges/*/properties" + } + } + dashboards: array[Edge] = [ ] + /** * Captures information about who created/last modified/deleted this dashboard and when */ diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json index 30d0e9a09cdf4..e9e2778a479d3 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json @@ -1473,6 +1473,26 @@ "updatedOn" : "datasetEdges/*/lastModified/time" } } + }, { + "name" : "dashboards", + "type" : { + "type" : "array", + "items" : "com.linkedin.common.Edge" + }, + "doc" : "Dashboards included by this dashboard.\nSome dashboard entities (e.g. PowerBI Apps) can contain other dashboards.\n\nThe Edge's sourceUrn should never be set, as it will always be the base dashboard.", + "default" : [ ], + "Relationship" : { + "/*/destinationUrn" : { + "createdActor" : "datasetEdges/*/created/actor", + "createdOn" : "datasetEdges/*/created/time", + "entityTypes" : [ "dashboard" ], + "isLineage" : true, + "name" : "DashboardContainsDashboard", + "properties" : "datasetEdges/*/properties", + "updatedActor" : "datasetEdges/*/lastModified/actor", + "updatedOn" : "datasetEdges/*/lastModified/time" + } + } }, { "name" : "lastModified", "type" : "com.linkedin.common.ChangeAuditStamps", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json index 8cf02a768ecae..959cb5381fd9b 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json @@ -1500,6 +1500,26 @@ "updatedOn" : "datasetEdges/*/lastModified/time" } } + }, { + "name" : "dashboards", + "type" : { + "type" : "array", + "items" : "com.linkedin.common.Edge" + }, + "doc" : "Dashboards included by this dashboard.\nSome dashboard entities (e.g. PowerBI Apps) can contain other dashboards.\n\nThe Edge's sourceUrn should never be set, as it will always be the base dashboard.", + "default" : [ ], + "Relationship" : { + "/*/destinationUrn" : { + "createdActor" : "datasetEdges/*/created/actor", + "createdOn" : "datasetEdges/*/created/time", + "entityTypes" : [ "dashboard" ], + "isLineage" : true, + "name" : "DashboardContainsDashboard", + "properties" : "datasetEdges/*/properties", + "updatedActor" : "datasetEdges/*/lastModified/actor", + "updatedOn" : "datasetEdges/*/lastModified/time" + } + } }, { "name" : "lastModified", "type" : "com.linkedin.common.ChangeAuditStamps", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json index d06f3b737a3e1..3e0cd46aba0c0 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json @@ -1206,6 +1206,26 @@ "updatedOn" : "datasetEdges/*/lastModified/time" } } + }, { + "name" : "dashboards", + "type" : { + "type" : "array", + "items" : "com.linkedin.common.Edge" + }, + "doc" : "Dashboards included by this dashboard.\nSome dashboard entities (e.g. PowerBI Apps) can contain other dashboards.\n\nThe Edge's sourceUrn should never be set, as it will always be the base dashboard.", + "default" : [ ], + "Relationship" : { + "/*/destinationUrn" : { + "createdActor" : "datasetEdges/*/created/actor", + "createdOn" : "datasetEdges/*/created/time", + "entityTypes" : [ "dashboard" ], + "isLineage" : true, + "name" : "DashboardContainsDashboard", + "properties" : "datasetEdges/*/properties", + "updatedActor" : "datasetEdges/*/lastModified/actor", + "updatedOn" : "datasetEdges/*/lastModified/time" + } + } }, { "name" : "lastModified", "type" : "com.linkedin.common.ChangeAuditStamps", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json index 56562ff49ff8d..7f651a10139e2 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json @@ -1206,6 +1206,26 @@ "updatedOn" : "datasetEdges/*/lastModified/time" } } + }, { + "name" : "dashboards", + "type" : { + "type" : "array", + "items" : "com.linkedin.common.Edge" + }, + "doc" : "Dashboards included by this dashboard.\nSome dashboard entities (e.g. PowerBI Apps) can contain other dashboards.\n\nThe Edge's sourceUrn should never be set, as it will always be the base dashboard.", + "default" : [ ], + "Relationship" : { + "/*/destinationUrn" : { + "createdActor" : "datasetEdges/*/created/actor", + "createdOn" : "datasetEdges/*/created/time", + "entityTypes" : [ "dashboard" ], + "isLineage" : true, + "name" : "DashboardContainsDashboard", + "properties" : "datasetEdges/*/properties", + "updatedActor" : "datasetEdges/*/lastModified/actor", + "updatedOn" : "datasetEdges/*/lastModified/time" + } + } }, { "name" : "lastModified", "type" : "com.linkedin.common.ChangeAuditStamps", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json index b90543745c65f..c3e04add825c9 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json @@ -1500,6 +1500,26 @@ "updatedOn" : "datasetEdges/*/lastModified/time" } } + }, { + "name" : "dashboards", + "type" : { + "type" : "array", + "items" : "com.linkedin.common.Edge" + }, + "doc" : "Dashboards included by this dashboard.\nSome dashboard entities (e.g. PowerBI Apps) can contain other dashboards.\n\nThe Edge's sourceUrn should never be set, as it will always be the base dashboard.", + "default" : [ ], + "Relationship" : { + "/*/destinationUrn" : { + "createdActor" : "datasetEdges/*/created/actor", + "createdOn" : "datasetEdges/*/created/time", + "entityTypes" : [ "dashboard" ], + "isLineage" : true, + "name" : "DashboardContainsDashboard", + "properties" : "datasetEdges/*/properties", + "updatedActor" : "datasetEdges/*/lastModified/actor", + "updatedOn" : "datasetEdges/*/lastModified/time" + } + } }, { "name" : "lastModified", "type" : "com.linkedin.common.ChangeAuditStamps", diff --git a/smoke-test/cypress-dev.sh b/smoke-test/cypress-dev.sh index bce2d794b1869..3db81b11c67fa 100755 --- a/smoke-test/cypress-dev.sh +++ b/smoke-test/cypress-dev.sh @@ -8,7 +8,10 @@ if [ "${RUN_QUICKSTART:-true}" == "true" ]; then source ./run-quickstart.sh fi +set +x +echo "Activating virtual environment" source venv/bin/activate +set -x # set environment variables for the test source ./set-test-env-vars.sh diff --git a/smoke-test/run-quickstart.sh b/smoke-test/run-quickstart.sh index e83a116c670a4..902dc1030660a 100755 --- a/smoke-test/run-quickstart.sh +++ b/smoke-test/run-quickstart.sh @@ -5,7 +5,10 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" cd "$DIR" ../gradlew :smoke-test:installDev +set +x +echo "Activating virtual environment" source venv/bin/activate +set -x mkdir -p ~/.datahub/plugins/frontend/auth/ echo "test_user:test_pass" >> ~/.datahub/plugins/frontend/auth/user.props @@ -23,4 +26,4 @@ DATAHUB_SEARCH_IMAGE="$DATAHUB_SEARCH_IMAGE" DATAHUB_SEARCH_TAG="$DATAHUB_SEARCH XPACK_SECURITY_ENABLED="$XPACK_SECURITY_ENABLED" ELASTICSEARCH_USE_SSL="$ELASTICSEARCH_USE_SSL" \ USE_AWS_ELASTICSEARCH="$USE_AWS_ELASTICSEARCH" \ DATAHUB_VERSION=${DATAHUB_VERSION} \ -docker compose --project-directory ../docker/profiles --profile quickstart-consumers up -d --quiet-pull --wait --wait-timeout 900 +docker compose --project-directory ../docker/profiles --profile quickstart-consumers up -d --quiet-pull --wait --wait-timeout 900