Merge branch 'datahub-project:master' into master
anshbansal authored Mar 19, 2024
2 parents face04c + 64cb5d1 commit a01c793
Showing 30 changed files with 333 additions and 203 deletions.
@@ -238,6 +238,7 @@ private void mapViewProperties(@Nonnull Dataset dataset, @Nonnull DataMap dataMap
    graphqlProperties.setMaterialized(properties.isMaterialized());
    graphqlProperties.setLanguage(properties.getViewLanguage());
    graphqlProperties.setLogic(properties.getViewLogic());
    graphqlProperties.setFormattedLogic(properties.getFormattedViewLogic());
    dataset.setViewProperties(graphqlProperties);
  }

6 changes: 6 additions & 0 deletions datahub-graphql-core/src/main/resources/entity.graphql
@@ -3186,6 +3186,12 @@ type ViewProperties {
"""
logic: String!

"""
A formatted version of the logic associated with the view.
For dbt, this contains the compiled SQL.
"""
formattedLogic: String

"""
The language in which the view logic is written, for example SQL
"""
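The sketch below is an editorial illustration of how the new `formattedLogic` field could be read back once this schema change is deployed; the GMS address and dataset URN are placeholders, and the query simply mirrors the `viewProperties` fields touched in this commit.

```python
from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig

# Placeholder GMS endpoint; adjust for your deployment.
graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))

# The query mirrors the viewProperties fragment; formattedLogic is the new, nullable field.
query = """
query getViewProperties($urn: String!) {
  dataset(urn: $urn) {
    viewProperties {
      materialized
      logic
      formattedLogic
      language
    }
  }
}
"""

result = graph.execute_graphql(
    query,
    variables={"urn": "urn:li:dataset:(urn:li:dataPlatform:dbt,example_model,PROD)"},
)
# For dbt views, formattedLogic carries the compiled SQL; it may be null elsewhere.
print(result["dataset"]["viewProperties"].get("formattedLogic"))
```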
@@ -1,11 +1,12 @@
import { Typography } from 'antd';
import React from 'react';
import { Radio, Typography } from 'antd';
import React, { useState } from 'react';
import styled from 'styled-components';
import { Prism as SyntaxHighlighter } from 'react-syntax-highlighter';
import { GetDatasetQuery } from '../../../../../../graphql/dataset.generated';
import { ANTD_GRAY } from '../../../constants';
import { useBaseEntity } from '../../../EntityContext';
import { InfoItem } from '../../../components/styled/InfoItem';
import { DBT_URN } from '../../../../../ingest/source/builder/constants';

const InfoSection = styled.div`
    border-bottom: 1px solid ${ANTD_GRAY[4.5]};
@@ -23,9 +24,14 @@ const InfoItemContent = styled.div`
    padding-top: 8px;
`;

const FormattingSelector = styled.div`
    margin-top: 10px;
`;

const QueryText = styled(Typography.Paragraph)`
    margin-top: 20px;
    margin-top: 15px;
    background-color: ${ANTD_GRAY[2]};
    border-radius: 5px;
`;

// NOTE: Yes, using `!important` is a shame. However, the SyntaxHighlighter is applying styles directly
@@ -38,9 +44,16 @@ const NestedSyntax = styled(SyntaxHighlighter)`
export default function ViewDefinitionTab() {
    const baseEntity = useBaseEntity<GetDatasetQuery>();
    const logic = baseEntity?.dataset?.viewProperties?.logic || 'UNKNOWN';
    const formattedLogic = baseEntity?.dataset?.viewProperties?.formattedLogic;
    const materialized = (baseEntity?.dataset?.viewProperties?.materialized && true) || false;
    const language = baseEntity?.dataset?.viewProperties?.language || 'UNKNOWN';

    const isDbt = baseEntity?.dataset?.platform?.urn === DBT_URN;
    const formatOptions = isDbt ? ['Source', 'Compiled'] : ['Raw', 'Formatted'];

    const canShowFormatted = !!formattedLogic;
    const [showFormatted, setShowFormatted] = useState(false);

    return (
        <>
            <InfoSection>
@@ -56,8 +69,21 @@ export default function ViewDefinitionTab() {
            </InfoSection>
            <InfoSection>
                <Typography.Title level={5}>Logic</Typography.Title>
                {canShowFormatted && (
                    <FormattingSelector>
                        <Radio.Group
                            options={[
                                { label: formatOptions[0], value: false },
                                { label: formatOptions[1], value: true },
                            ]}
                            onChange={(e) => setShowFormatted(e.target.value)}
                            value={showFormatted}
                            optionType="button"
                        />
                    </FormattingSelector>
                )}
                <QueryText>
                    <NestedSyntax language="sql">{logic}</NestedSyntax>
                    <NestedSyntax language="sql">{showFormatted ? formattedLogic : logic}</NestedSyntax>
                </QueryText>
            </InfoSection>
        </>
1 change: 1 addition & 0 deletions datahub-web-react/src/graphql/dataset.graphql
@@ -230,6 +230,7 @@ fragment viewProperties on Dataset {
  viewProperties {
    materialized
    logic
    formattedLogic
    language
  }
}
2 changes: 2 additions & 0 deletions docs/how/updating-datahub.md
@@ -22,6 +22,8 @@ This file documents any backwards-incompatible changes in DataHub and assists people

- #9934 and #10075 - Stateful ingestion is now enabled by default if a `pipeline_name` is set and either a datahub-rest sink or `datahub_api` is specified. It will still be disabled by default when any other sink type is used or if there is no pipeline name set.
- #10002 - The `DataHubGraph` client no longer makes a request to the backend during initialization. If you want to preserve the old behavior, call `graph.test_connection()` after constructing the client (see the sketch after this list).
- #10026 - The dbt `use_compiled_code` option has been removed, because we now support capturing both source and compiled dbt SQL. This can be configured using `include_compiled_code`, which will be enabled by default in 0.13.1.
- #10055 - Assertion entities generated by dbt are now associated with the dbt dataset entity, and not the entity in the data warehouse.
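A minimal sketch of the #10002 change above, assuming a local GMS at `http://localhost:8080`; both the server address and the placement of `test_connection()` are illustrative, not prescriptive:

```python
from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig

# Constructing the client no longer issues a request to the backend (#10002).
graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))

# Call this explicitly to restore the old eager connectivity check.
graph.test_connection()
```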

### Potential Downtime

@@ -1,8 +1,16 @@
import pytest
import setuptools
from datahub.testing.check_imports import ensure_no_indirect_model_imports


def test_package_list_match_inits():
    where = "./src"
    package_list = set(setuptools.find_packages(where))
    namespace_packages = set(setuptools.find_namespace_packages(where))
    assert package_list == namespace_packages, "are you missing a package init file?"


def test_check_import_paths(pytestconfig: pytest.Config) -> None:
    root = pytestconfig.rootpath

    ensure_no_indirect_model_imports([root / "src", root / "tests"])
3 changes: 3 additions & 0 deletions metadata-ingestion/as-a-library.md
@@ -33,6 +33,9 @@ from datahub.emitter.rest_emitter import DatahubRestEmitter
# Create an emitter to DataHub over REST
emitter = DatahubRestEmitter(gms_server="http://localhost:8080", extra_headers={})

# For Acryl, you will want to point to your Acryl server's GMS endpoint
# emitter = DatahubRestEmitter(gms_server="https://<your-domain>.acryl.io/gms", token="<your token>", extra_headers={})

# Test the connection
emitter.test_connection()
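# The lines below are an editorial sketch, not part of the original snippet:
# once test_connection() succeeds, the same emitter can push metadata. The
# URN and description are placeholder values.
#
# from datahub.emitter.mcp import MetadataChangeProposalWrapper
# from datahub.metadata.schema_classes import DatasetPropertiesClass
#
# emitter.emit(
#     MetadataChangeProposalWrapper(
#         entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,example.table,PROD)",
#         aspect=DatasetPropertiesClass(description="Example description"),
#     )
# )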

8 changes: 4 additions & 4 deletions metadata-ingestion/docs/sources/dbt/README.md
@@ -5,10 +5,10 @@ Ingesting metadata from dbt requires either using the **dbt** module or the **dbt-cloud** module.
| Source Concept | DataHub Concept | Notes |
| --------------- | ------------------------------------------------------------- | ------------------ |
| `"dbt"` | [Data Platform](../../metamodel/entities/dataPlatform.md) | |
| dbt Source | [Dataset](../../metamodel/entities/dataset.md) | Subtype `source` |
| dbt Seed | [Dataset](../../metamodel/entities/dataset.md) | Subtype `seed` |
| dbt Model | [Dataset](../../metamodel/entities/dataset.md) | Subtype `model` |
| dbt Snapshot | [Dataset](../../metamodel/entities/dataset.md) | Subtype `snapshot` |
| dbt Source | [Dataset](../../metamodel/entities/dataset.md) | Subtype `Source` |
| dbt Seed | [Dataset](../../metamodel/entities/dataset.md) | Subtype `Seed` |
| dbt Model | [Dataset](../../metamodel/entities/dataset.md) | Subtype `Model` |
| dbt Snapshot | [Dataset](../../metamodel/entities/dataset.md) | Subtype `Snapshot` |
| dbt Test | [Assertion](../../metamodel/entities/assertion.md) | |
| dbt Test Result | [Assertion Run Result](../../metamodel/entities/assertion.md) | |

59 changes: 54 additions & 5 deletions metadata-ingestion/docs/sources/dbt/dbt.md
@@ -166,18 +166,33 @@ The example below sets the query tag `tag` key's value as a global tag.

### Integrating with dbt test

To integrate with dbt tests, the `dbt` source needs access to the `run_results.json` file generated after a `dbt test` execution. Typically, this is written to the `target` directory. A common pattern you can follow is:
To integrate with dbt tests, the `dbt` source needs access to the `run_results.json` file generated after a `dbt test` or `dbt build` execution. Typically, this is written to the `target` directory. A common pattern you can follow is:

1. Run `dbt docs generate` and upload `manifest.json` and `catalog.json` to a location accessible to the `dbt` source (e.g. s3 or local file system)
2. Run `dbt test` and upload `run_results.json` to a location accessible to the `dbt` source (e.g. s3 or local file system)
3. Run `datahub ingest -c dbt_recipe.dhub.yaml` with the following config parameters specified
- test_results_path: pointing to the run_results.json file that you just created
1. Run `dbt build`
2. Copy the `target/run_results.json` file to a separate location. This is important, because otherwise subsequent `dbt` commands will overwrite the run results.
3. Run `dbt docs generate` to generate the `manifest.json` and `catalog.json` files
4. The dbt source uses the manifest, catalog, and run results files, so they need to be placed in a location accessible to the `dbt` source (e.g. S3 or the local file system). In the ingestion recipe, set the `test_results_path` config to the location of the `run_results.json` file from the `dbt build` or `dbt test` run (see the recipe sketch just below this list).
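As a sketch of step 4, the snippet below runs a hedged dbt recipe programmatically via `Pipeline.create`; the paths, target platform, and sink server are placeholders, and the same keys can equally be written into a YAML recipe passed to `datahub ingest -c`:

```python
from datahub.ingestion.run.pipeline import Pipeline

# Placeholder locations; test_results_path points at the copied run_results.json.
pipeline = Pipeline.create(
    {
        "source": {
            "type": "dbt",
            "config": {
                "manifest_path": "s3://my-bucket/dbt/manifest.json",
                "catalog_path": "s3://my-bucket/dbt/catalog.json",
                "test_results_path": "s3://my-bucket/dbt/run_results_backup.json",
                "target_platform": "postgres",
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},
        },
    }
)
pipeline.run()
pipeline.raise_from_status()
```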

The connector will produce the following things:

- Assertion definitions that are attached to the dataset (or datasets)
- Results from running the tests attached to the timeline of the dataset

:::note Missing test results?

The most common reason for missing test results is that the `run_results.json` with the test result information is getting overwritten by a subsequent `dbt` command. We recommend copying the `run_results.json` file before running other `dbt` commands.

```sh
dbt source snapshot-freshness
dbt build
cp target/run_results.json target/run_results_backup.json
dbt docs generate
# Reference target/run_results_backup.json in the dbt source config.
```

:::

#### View of dbt tests for a dataset

![test view](https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/dbt-tests-view.png)
@@ -220,3 +235,37 @@ source:
    entities_enabled:
      test_results: No
```

### Multiple dbt projects

In more complex dbt setups, you may have multiple dbt projects, where models from one project are used as sources in another project.
DataHub supports this setup natively.

Each dbt project should have its own dbt ingestion recipe, and the `platform_instance` field in the recipe should be set to the dbt project name.

For example, if you have two dbt projects `analytics` and `data_mart`, you would have two ingestion recipes.
If you have models in the `data_mart` project that are used as sources in the `analytics` project, the lineage will be automatically captured.

```yaml
# Analytics dbt project
source:
  type: dbt
  config:
    platform_instance: analytics
    target_platform: postgres
    manifest_path: analytics/target/manifest.json
    catalog_path: analytics/target/catalog.json
    # ... other configs
```

```yaml
# Data Mart dbt project
source:
  type: dbt
  config:
    platform_instance: data_mart
    target_platform: postgres
    manifest_path: data_mart/target/manifest.json
    catalog_path: data_mart/target/catalog.json
    # ... other configs
```
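To make the effect of `platform_instance` concrete, here is a hedged sketch of how the two recipes above keep identically named models apart; the database, schema, and model names are placeholders:

```python
from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance

# The same model name maps to two distinct URNs, one per dbt project.
analytics_urn = make_dataset_urn_with_platform_instance(
    platform="dbt",
    name="my_db.my_schema.customers",
    platform_instance="analytics",
    env="PROD",
)
data_mart_urn = make_dataset_urn_with_platform_instance(
    platform="dbt",
    name="my_db.my_schema.customers",
    platform_instance="data_mart",
    env="PROD",
)
print(analytics_urn)
print(data_mart_urn)
```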
2 changes: 1 addition & 1 deletion metadata-ingestion/scripts/docgen.sh
@@ -7,4 +7,4 @@ DOCS_OUT_DIR=$DATAHUB_ROOT/docs/generated/ingestion
EXTRA_DOCS_DIR=$DATAHUB_ROOT/metadata-ingestion/docs/sources

rm -r $DOCS_OUT_DIR || true
SPARK_VERSION=3.3 python scripts/docgen.py --out-dir ${DOCS_OUT_DIR} --extra-docs ${EXTRA_DOCS_DIR} $@
python scripts/docgen.py --out-dir ${DOCS_OUT_DIR} --extra-docs ${EXTRA_DOCS_DIR} $@
21 changes: 21 additions & 0 deletions metadata-ingestion/src/datahub/cli/check_cli.py
@@ -163,6 +163,27 @@ def plugins(source: Optional[str], verbose: bool) -> None:
)


@check.command()
@click.option(
    "--sql",
    type=str,
    required=True,
    help="The SQL query to parse",
)
@click.option(
    "--platform",
    type=str,
    required=True,
    help="The SQL dialect e.g. bigquery or snowflake",
)
def sql_format(sql: str, platform: str) -> None:
    """Parse a SQL query and print it back in a standardized format."""

    from datahub.sql_parsing.sqlglot_utils import try_format_query

    click.echo(try_format_query(sql, platform, raises=True))


@check.command()
@click.option(
    "--sql",
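The new command is a thin wrapper around `try_format_query`, which can also be used directly from Python; a small hedged sketch (the SQL string is a placeholder):

```python
from datahub.sql_parsing.sqlglot_utils import try_format_query

raw = "select col_a,col_b from my_db.my_schema.my_table where col_a>1"
# Pretty-print using the snowflake dialect; raises=True surfaces parse errors
# instead of falling back silently, mirroring the CLI command above.
print(try_format_query(raw, "snowflake", raises=True))
```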
7 changes: 0 additions & 7 deletions metadata-ingestion/src/datahub/configuration/common.py
@@ -302,10 +302,3 @@ def value(self, string: str) -> List[str]:

class VersionedConfig(ConfigModel):
    version: str = "1"


class LineageConfig(ConfigModel):
    incremental_lineage: bool = Field(
        default=False,
        description="When enabled, emits lineage as incremental to existing lineage already in DataHub. When disabled, re-states lineage on each run.",
    )
3 changes: 3 additions & 0 deletions metadata-ingestion/src/datahub/configuration/source_common.py
@@ -54,6 +54,9 @@ class DatasetSourceConfigMixin(PlatformInstanceConfigMixin, EnvConfigMixin):
    Any source that is a primary producer of Dataset metadata should inherit this class
    """

    # TODO: Deprecate this in favor of the more granular config mixins in order
    # to flatten our config inheritance hierarchies.


class LowerCaseDatasetUrnConfigMixin(ConfigModel):
    convert_urns_to_lowercase: bool = Field(
21 changes: 5 additions & 16 deletions metadata-ingestion/src/datahub/emitter/rest_emitter.py
@@ -168,22 +168,11 @@ def test_connection(self) -> None:
                return

            else:
                # Looks like we either connected to an old GMS or to some other service. Let's see if we can determine which before raising an error
                # A common misconfiguration is connecting to datahub-frontend so we special-case this check
                if (
                    config.get("config", {}).get("application") == "datahub-frontend"
                    or config.get("config", {}).get("shouldShowDatasetLineage")
                    is not None
                ):
                    raise ConfigurationError(
                        "You seem to have connected to the frontend instead of the GMS endpoint. "
                        "The rest emitter should connect to DataHub GMS (usually <datahub-gms-host>:8080) or Frontend GMS API (usually <frontend>:9002/api/gms)"
                    )
                else:
                    raise ConfigurationError(
                        "You have either connected to a pre-v0.8.0 DataHub GMS instance, or to a different server altogether! "
                        "Please check your configuration and make sure you are talking to the DataHub GMS endpoint."
                    )
                raise ConfigurationError(
                    "You seem to have connected to the frontend service instead of the GMS endpoint. "
                    "The rest emitter should connect to DataHub GMS (usually <datahub-gms-host>:8080) or Frontend GMS API (usually <frontend>:9002/api/gms). "
                    "For Acryl users, the endpoint should be https://<name>.acryl.io/gms"
                )
        else:
            logger.debug(
                f"Unable to connect to {url} with status_code: {response.status_code}. Response: {response.text}"
@@ -1,6 +1,9 @@
import copy
from typing import Dict, Iterable, Optional

from pydantic.fields import Field

from datahub.configuration.common import ConfigModel
from datahub.emitter.mce_builder import datahub_guid, set_aspect
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.api.workunit import MetadataWorkUnit
@@ -143,3 +146,10 @@ def auto_incremental_lineage(
            )
        else:
            yield wu


class IncrementalLineageConfigMixin(ConfigModel):
    incremental_lineage: bool = Field(
        default=False,
        description="When enabled, emits lineage as incremental to existing lineage already in DataHub. When disabled, re-states lineage on each run.",
    )
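A hedged sketch of how the relocated mixin is meant to be consumed; the config class, its extra field, and the helper module path are illustrative assumptions rather than part of this commit:

```python
from pydantic.fields import Field

# Assumed module path for the helper shown in this diff hunk.
from datahub.ingestion.api.incremental_lineage_helper import IncrementalLineageConfigMixin


class MySourceConfig(IncrementalLineageConfigMixin):
    """Hypothetical source config that opts into the incremental_lineage flag."""

    include_tables: bool = Field(default=True, description="Whether to ingest tables.")


config = MySourceConfig.parse_obj({"incremental_lineage": True})
print(config.incremental_lineage)  # True
```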
1 change: 1 addition & 0 deletions metadata-ingestion/src/datahub/ingestion/run/pipeline.py
@@ -567,6 +567,7 @@ def log_ingestion_stats(self) -> None:
"warnings": stats.discretize(
source_warnings + sink_warnings + global_warnings
),
"has_pipeline_name": bool(self.config.pipeline_name),
},
self.ctx.graph,
)