From 4645ab89ee6117865ad6c4486a31617fb1445346 Mon Sep 17 00:00:00 2001 From: Kevin Chun Date: Mon, 20 May 2024 13:06:23 -0700 Subject: [PATCH 1/7] OpenAPI v3 Spec bug fixes: (#10548) --- .../openapi/v3/OpenAPIV3Generator.java | 25 +++++++------------ .../openapi/v3/OpenAPIV3GeneratorTest.java | 15 +++++++++++ 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/OpenAPIV3Generator.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/OpenAPIV3Generator.java index 49bced1a7cd95..df3f6445a855a 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/OpenAPIV3Generator.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/OpenAPIV3Generator.java @@ -75,7 +75,8 @@ public static OpenAPI generateOpenApiSpec(EntityRegistry entityRegistry) { final Components components = new Components(); // --> Aspect components // TODO: Correct handling of SystemMetadata and SortOrder - components.addSchemas("SystemMetadata", new Schema().type(TYPE_STRING)); + components.addSchemas( + "SystemMetadata", new Schema().type(TYPE_OBJECT).additionalProperties(true)); components.addSchemas("SortOrder", new Schema()._enum(List.of("ASCENDING", "DESCENDING"))); components.addSchemas("AspectPatch", buildAspectPatchSchema()); entityRegistry @@ -185,7 +186,6 @@ private static PathItem buildSingleEntityPath(final EntitySpec entity) { final Operation getOperation = new Operation() .summary(String.format("Get %s.", upperFirst)) - .operationId(String.format("get%s", upperFirst)) .parameters(parameters) .tags(List.of(entity.getName() + " Entity")) .responses(new ApiResponses().addApiResponse("200", successApiResponse)); @@ -202,7 +202,6 @@ private static PathItem buildSingleEntityPath(final EntitySpec entity) { final Operation headOperation = new Operation() .summary(String.format("%s existence.", upperFirst)) - .operationId(String.format("head%s", upperFirst)) .parameters( List.of( new Parameter() @@ -223,7 +222,6 @@ private static PathItem buildSingleEntityPath(final EntitySpec entity) { final Operation deleteOperation = new Operation() .summary(String.format("Delete entity %s", upperFirst)) - .operationId(String.format("delete%s", upperFirst)) .parameters( List.of( new Parameter() @@ -274,7 +272,6 @@ private static PathItem buildListEntityPath(final EntitySpec entity) { result.setGet( new Operation() .summary(String.format("Scroll/List %s.", upperFirst)) - .operationId("scroll") .parameters(parameters) .tags(List.of(entity.getName() + " Entity")) .responses(new ApiResponses().addApiResponse("200", successApiResponse))); @@ -331,7 +328,6 @@ private static PathItem buildListEntityPath(final EntitySpec entity) { .description("Include systemMetadata with response.") .schema(new Schema().type(TYPE_BOOLEAN)._default(false)))) .summary("Create " + upperFirst + " entities.") - .operationId("createEntities") .tags(List.of(entity.getName() + " Entity")) .requestBody( new RequestBody() @@ -365,7 +361,7 @@ private static void addExtraParameters(final Components components) { .schema( new Schema() .type(TYPE_ARRAY) - ._default(PROPERTY_URN) + ._default(List.of(PROPERTY_URN)) .items( new Schema<>() .type(TYPE_STRING) @@ -386,7 +382,7 @@ private static void addExtraParameters(final Components components) { .in(NAME_QUERY) .name("count") .description("Number of items per page.") - .example("10") + .example(10) .schema(new 
Schema().type(TYPE_INTEGER)._default(10).minimum(new BigDecimal(1)))); components.addParameters( "ScrollQuery" + MODEL_VERSION, @@ -455,7 +451,7 @@ private static void addAspectSchemas(final Components components, final AspectSp // A non-required $ref property must be wrapped in a { allOf: [ $ref ] } // object to allow the // property to be marked as nullable - schema.setType("object"); + schema.setType(TYPE_OBJECT); schema.set$ref(null); schema.setAllOf(List.of(new Schema().$ref($ref))); } @@ -482,8 +478,10 @@ private static Schema buildAspectRefResponseSchema(final String aspectName) { result.addProperty( "systemMetadata", new Schema<>() - .$ref(PATH_DEFINITIONS + "SystemMetadata") - .description("System metadata for the aspect.")); + .type(TYPE_OBJECT) + .allOf(List.of(new Schema().$ref(PATH_DEFINITIONS + "SystemMetadata"))) + .description("System metadata for the aspect.") + .nullable(true)); return result; } @@ -604,7 +602,6 @@ private static PathItem buildSingleEntityAspectPath( final Operation getOperation = new Operation() .summary(String.format("Get %s for %s.", aspect, entity.getName())) - .operationId(String.format("get%s", upperFirstAspect)) .tags(tags) .parameters(List.of(getParameter)) .responses(new ApiResponses().addApiResponse("200", successApiResponse)); @@ -620,7 +617,6 @@ private static PathItem buildSingleEntityAspectPath( final Operation headOperation = new Operation() .summary(String.format("%s on %s existence.", aspect, upperFirstEntity)) - .operationId(String.format("head%s", upperFirstAspect)) .tags(tags) .responses( new ApiResponses() @@ -634,7 +630,6 @@ private static PathItem buildSingleEntityAspectPath( final Operation deleteOperation = new Operation() .summary(String.format("Delete %s on entity %s", aspect, upperFirstEntity)) - .operationId(String.format("delete%s", upperFirstAspect)) .tags(tags) .responses(new ApiResponses().addApiResponse("200", successDeleteResponse)); // Post Operation @@ -670,7 +665,6 @@ private static PathItem buildSingleEntityAspectPath( final Operation postOperation = new Operation() .summary(String.format("Create aspect %s on %s ", aspect, upperFirstEntity)) - .operationId(String.format("create%s", upperFirstAspect)) .tags(tags) .requestBody(requestBody) .responses(new ApiResponses().addApiResponse("201", successPostResponse)); @@ -709,7 +703,6 @@ private static PathItem buildSingleEntityAspectPath( .description("Include systemMetadata with response.") .schema(new Schema().type(TYPE_BOOLEAN)._default(false)))) .summary(String.format("Patch aspect %s on %s ", aspect, upperFirstEntity)) - .operationId(String.format("patch%s", upperFirstAspect)) .tags(tags) .requestBody(patchRequestBody) .responses(new ApiResponses().addApiResponse("200", successPatchResponse)); diff --git a/metadata-service/openapi-servlet/src/test/java/io/datahubproject/openapi/v3/OpenAPIV3GeneratorTest.java b/metadata-service/openapi-servlet/src/test/java/io/datahubproject/openapi/v3/OpenAPIV3GeneratorTest.java index 918a0762ade18..0ce62f5cb10f6 100644 --- a/metadata-service/openapi-servlet/src/test/java/io/datahubproject/openapi/v3/OpenAPIV3GeneratorTest.java +++ b/metadata-service/openapi-servlet/src/test/java/io/datahubproject/openapi/v3/OpenAPIV3GeneratorTest.java @@ -68,5 +68,20 @@ public void testOpenApiSpecBuilder() throws Exception { assertNull(created.get$ref()); assertEquals(List.of(new Schema().$ref("#/components/schemas/TimeStamp")), created.getAllOf()); assertTrue(created.getNullable()); + + // Assert systemMetadata property on response schema is 
optional.
+    Map<String, Schema> datasetPropertiesResponseSchemaProps =
+        openAPI
+            .getComponents()
+            .getSchemas()
+            .get("DatasetPropertiesAspectResponse_v3")
+            .getProperties();
+    Schema systemMetadata = datasetPropertiesResponseSchemaProps.get("systemMetadata");
+    assertEquals("object", systemMetadata.getType());
+    assertNull(systemMetadata.get$ref());
+    assertEquals(
+        List.of(new Schema().$ref("#/components/schemas/SystemMetadata")),
+        systemMetadata.getAllOf());
+    assertTrue(systemMetadata.getNullable());
   }
 }

From a2e2fd2acaf33ea44466f0247358e45ecb01ceb0 Mon Sep 17 00:00:00 2001
From: Jay <159848059+jayacryl@users.noreply.github.com>
Date: Mon, 20 May 2024 16:13:42 -0400
Subject: [PATCH 2/7] fix(assertions) aligned graphql AssertionType definition
 with the AssertionType defined in metadata-models (#10534)

---
 .../src/main/resources/entity.graphql        | 25 ++++++++++++++++++-
 .../com/linkedin/assertion/AssertionInfo.pdl |  5 ++++
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/datahub-graphql-core/src/main/resources/entity.graphql b/datahub-graphql-core/src/main/resources/entity.graphql
index 1f2642567b49e..2315d6f8767d9 100644
--- a/datahub-graphql-core/src/main/resources/entity.graphql
+++ b/datahub-graphql-core/src/main/resources/entity.graphql
@@ -7766,10 +7766,33 @@ enum DatasetAssertionScope {
 }
 
 """
-The top-level assertion type. Currently single Dataset assertions are the only type supported.
+The top-level assertion type.
 """
 enum AssertionType {
+  """
+  A single-dataset assertion.
+  """
   DATASET
+  """
+  An assertion which indicates when a particular operation should occur to an asset.
+  """
+  FRESHNESS
+  """
+  An assertion which indicates how much data should be available for a particular asset.
+  """
+  VOLUME
+  """
+  A raw SQL-statement based assertion.
+  """
+  SQL
+  """
+  A structured assertion targeting a specific column or field of the Dataset.
+  """
+  FIELD
+  """
+  A schema or structural assertion.
+  """
+  DATA_SCHEMA
 }
 
 """
diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionInfo.pdl
index e161270145a88..5b60aa18e87da 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionInfo.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionInfo.pdl
@@ -37,6 +37,11 @@ record AssertionInfo includes CustomProperties, ExternalReference {
      */
     SQL
 
+    /**
+     * A structured assertion targeting a specific column or field of the Dataset.
+     */
+    FIELD
+
     /**
      * A schema or structural assertion.
* From 1ad6746a344e5605edea041924e7ea32a421da59 Mon Sep 17 00:00:00 2001 From: Davi Arnaut Date: Mon, 20 May 2024 13:32:19 -0700 Subject: [PATCH 3/7] fix(smoke-test): pin requests to 2.31.0 (#10549) --- smoke-test/requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/smoke-test/requirements.txt b/smoke-test/requirements.txt index c5d43163dff5d..861c69f354fe5 100644 --- a/smoke-test/requirements.txt +++ b/smoke-test/requirements.txt @@ -16,3 +16,5 @@ ruff==0.0.287 # stub version are copied from metadata-ingestion/setup.py and that should be the source of truth types-requests>=2.28.11.6,<=2.31.0.3 types-PyYAML +# https://github.com/docker/docker-py/issues/3256 +requests<=2.31.0 From 187ef12182db5e6e4d69925e69378ce14bfb1064 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Mon, 20 May 2024 13:33:25 -0700 Subject: [PATCH 4/7] fix(ingest/dbt): improve handling for CLL via ephemeral nodes (#10535) --- .../ingestion/source/dbt/dbt_common.py | 25 ++++++++++--------- .../datahub/ingestion/source/dbt/dbt_core.py | 2 +- .../src/datahub/sql_parsing/sqlglot_utils.py | 21 +++++++++++++++- .../tests/unit/sql_parsing/test_sql_detach.py | 6 ++--- 4 files changed, 37 insertions(+), 17 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py index ebba664a811c7..3b686ef60de29 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py @@ -1126,24 +1126,25 @@ def _infer_schemas_and_update_cll( # noqa: C901 elif node.compiled_code: try: # Add CTE stops based on the upstreams list. + cte_mapping = { + cte_name: upstream_node.get_fake_ephemeral_table_name() + for upstream_node in [ + all_nodes_map[upstream_node_name] + for upstream_node_name in node.upstream_nodes + if upstream_node_name in all_nodes_map + ] + if upstream_node.is_ephemeral_model() + for cte_name in _get_dbt_cte_names( + upstream_node.name, schema_resolver.platform + ) + } preprocessed_sql = detach_ctes( parse_statements_and_pick( node.compiled_code, platform=schema_resolver.platform, ), platform=schema_resolver.platform, - cte_mapping={ - cte_name: upstream_node.get_fake_ephemeral_table_name() - for upstream_node in [ - all_nodes_map[upstream_node_name] - for upstream_node_name in node.upstream_nodes - if upstream_node_name in all_nodes_map - ] - if upstream_node.is_ephemeral_model() - for cte_name in _get_dbt_cte_names( - upstream_node.name, schema_resolver.platform - ) - }, + cte_mapping=cte_mapping, ) except Exception as e: self.report.sql_parser_detach_ctes_failures.append(node.dbt_name) diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py index 750fee227b97a..c78cfdf0b4f0f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py @@ -118,7 +118,7 @@ def get_columns( # information from the manifest file. 
logger.debug(f"Inferring schema info for {dbt_name} from manifest") catalog_columns = { - k: {"name": col["name"], "type": col["data_type"], "index": i} + k: {"name": col["name"], "type": col["data_type"] or "", "index": i} for i, (k, col) in enumerate(manifest_columns.items()) } else: diff --git a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_utils.py b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_utils.py index 778f99c631809..dfb3b8925dcca 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_utils.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_utils.py @@ -5,6 +5,7 @@ import sqlglot import sqlglot.errors +import sqlglot.optimizer.eliminate_ctes logger = logging.getLogger(__name__) DialectOrStr = Union[sqlglot.Dialect, str] @@ -294,4 +295,22 @@ def replace_cte_refs(node: sqlglot.exp.Expression) -> sqlglot.exp.Expression: return node statement = statement.copy() - return statement.transform(replace_cte_refs, copy=False) + statement = statement.transform(replace_cte_refs, copy=False) + + # There's a bug in eliminate_ctes that causes it to not remove all unused CTEs + # when there's a complex chain of dependent CTEs. As a workaround, we call the + # method multiple times until it no longer eliminates any CTEs. + max_eliminate_calls = 5 + for iteration in range(max_eliminate_calls): + new_statement = sqlglot.optimizer.eliminate_ctes.eliminate_ctes( + statement.copy() + ) + if new_statement == statement: + if iteration > 1: + logger.debug( + f"Required {iteration+1} iterations to detach and eliminate all CTEs" + ) + break + statement = new_statement + + return statement diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sql_detach.py b/metadata-ingestion/tests/unit/sql_parsing/test_sql_detach.py index b45e2d0eff693..759f5a1d19242 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sql_detach.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sql_detach.py @@ -12,7 +12,7 @@ def test_detach_ctes_simple(): assert ( detached - == "WITH __cte_0 AS (SELECT * FROM table1) SELECT * FROM table2 JOIN _my_cte_table ON table2.id = _my_cte_table.id" + == "SELECT * FROM table2 JOIN _my_cte_table ON table2.id = _my_cte_table.id" ) @@ -27,7 +27,7 @@ def test_detach_ctes_with_alias(): assert ( detached - == "WITH __cte_0 AS (SELECT * FROM table1) SELECT * FROM table2 JOIN _my_cte_table AS tablealias ON table2.id = tablealias.id" + == "SELECT * FROM table2 JOIN _my_cte_table AS tablealias ON table2.id = tablealias.id" ) @@ -42,5 +42,5 @@ def test_detach_ctes_with_multipart_replacement(): assert ( detached - == "WITH __cte_0 AS (SELECT * FROM table1) SELECT * FROM table2 JOIN my_db.my_schema.my_table ON table2.id = my_db.my_schema.my_table.id" + == "SELECT * FROM table2 JOIN my_db.my_schema.my_table ON table2.id = my_db.my_schema.my_table.id" ) From 1240e03c98aaab56f35ca5a263714bf2990d4582 Mon Sep 17 00:00:00 2001 From: Chris Collins Date: Mon, 20 May 2024 18:56:39 -0400 Subject: [PATCH 5/7] feat(connections): add connection entity type and graphql endpoints (#10550) --- .../linkedin/datahub/graphql/Constants.java | 1 + .../datahub/graphql/GmsGraphQLEngine.java | 36 +++++ .../datahub/graphql/GmsGraphQLEngineArgs.java | 2 + .../connection/ConnectionMapper.java | 104 +++++++++++++ .../resolvers/connection/ConnectionUtils.java | 23 +++ .../connection/UpsertConnectionResolver.java | 78 ++++++++++ .../connection/DataHubConnectionType.java | 87 +++++++++++ .../src/main/resources/connection.graphql | 130 ++++++++++++++++ .../src/main/resources/entity.graphql | 5 
+ .../UpsertConnectionResolverTest.java | 128 +++++++++++++++ .../src/graphql/connection.graphql | 29 ++++ docs-website/graphql/generateGraphQLSchema.sh | 3 +- .../java/com/linkedin/metadata/Constants.java | 4 + .../connection/ConnectionService.java | 129 +++++++++++++++ .../connection/ConnectionServiceTest.java | 147 ++++++++++++++++++ .../connection/DataHubConnectionDetails.pdl | 38 +++++ .../connection/DataHubJsonConnection.pdl | 11 ++ .../metadata/key/DataHubConnectionKey.pdl | 15 ++ .../src/main/resources/entity-registry.yml | 6 + .../connection/ConnectionServiceFactory.java | 19 +++ .../factory/graphql/GraphQLEngineFactory.java | 6 + .../gms/servlet/GraphQLServletConfig.java | 3 +- .../authorization/PoliciesConfig.java | 7 +- 23 files changed, 1008 insertions(+), 3 deletions(-) create mode 100644 datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/connection/ConnectionMapper.java create mode 100644 datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/connection/ConnectionUtils.java create mode 100644 datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/connection/UpsertConnectionResolver.java create mode 100644 datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/connection/DataHubConnectionType.java create mode 100644 datahub-graphql-core/src/main/resources/connection.graphql create mode 100644 datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/connection/UpsertConnectionResolverTest.java create mode 100644 datahub-web-react/src/graphql/connection.graphql create mode 100644 metadata-io/src/main/java/com/linkedin/metadata/connection/ConnectionService.java create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/connection/ConnectionServiceTest.java create mode 100644 metadata-models/src/main/pegasus/com/linkedin/connection/DataHubConnectionDetails.pdl create mode 100644 metadata-models/src/main/pegasus/com/linkedin/connection/DataHubJsonConnection.pdl create mode 100644 metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataHubConnectionKey.pdl create mode 100644 metadata-service/factories/src/main/java/com/linkedin/gms/factory/connection/ConnectionServiceFactory.java diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/Constants.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/Constants.java index 5f555b45d3b09..0924dbc0c0a6d 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/Constants.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/Constants.java @@ -21,6 +21,7 @@ private Constants() {} public static final String PROPERTIES_SCHEMA_FILE = "properties.graphql"; public static final String FORMS_SCHEMA_FILE = "forms.graphql"; public static final String INCIDENTS_SCHEMA_FILE = "incident.graphql"; + public static final String CONNECTIONS_SCHEMA_FILE = "connection.graphql"; public static final String BROWSE_PATH_DELIMITER = "/"; public static final String BROWSE_PATH_V2_DELIMITER = "␟"; public static final String VERSION_STAMP_FIELD_NAME = "versionStamp"; diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java index 38c40dbfd83e9..1fb01e9ed0d52 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java @@ -48,6 
+48,7 @@ import com.linkedin.datahub.graphql.generated.DashboardStatsSummary; import com.linkedin.datahub.graphql.generated.DashboardUserUsageCounts; import com.linkedin.datahub.graphql.generated.DataFlow; +import com.linkedin.datahub.graphql.generated.DataHubConnection; import com.linkedin.datahub.graphql.generated.DataHubView; import com.linkedin.datahub.graphql.generated.DataJob; import com.linkedin.datahub.graphql.generated.DataJobInputOutput; @@ -129,6 +130,7 @@ import com.linkedin.datahub.graphql.resolvers.chart.BrowseV2Resolver; import com.linkedin.datahub.graphql.resolvers.chart.ChartStatsSummaryResolver; import com.linkedin.datahub.graphql.resolvers.config.AppConfigResolver; +import com.linkedin.datahub.graphql.resolvers.connection.UpsertConnectionResolver; import com.linkedin.datahub.graphql.resolvers.container.ContainerEntitiesResolver; import com.linkedin.datahub.graphql.resolvers.container.ParentContainersResolver; import com.linkedin.datahub.graphql.resolvers.dashboard.DashboardStatsSummaryResolver; @@ -306,6 +308,7 @@ import com.linkedin.datahub.graphql.types.chart.ChartType; import com.linkedin.datahub.graphql.types.common.mappers.OperationMapper; import com.linkedin.datahub.graphql.types.common.mappers.UrnToEntityMapper; +import com.linkedin.datahub.graphql.types.connection.DataHubConnectionType; import com.linkedin.datahub.graphql.types.container.ContainerType; import com.linkedin.datahub.graphql.types.corpgroup.CorpGroupType; import com.linkedin.datahub.graphql.types.corpuser.CorpUserType; @@ -355,6 +358,7 @@ import com.linkedin.metadata.config.ViewsConfiguration; import com.linkedin.metadata.config.VisualConfiguration; import com.linkedin.metadata.config.telemetry.TelemetryConfiguration; +import com.linkedin.metadata.connection.ConnectionService; import com.linkedin.metadata.entity.EntityService; import com.linkedin.metadata.graph.GraphClient; import com.linkedin.metadata.graph.SiblingGraphService; @@ -439,6 +443,7 @@ public class GmsGraphQLEngine { private final ERModelRelationshipService erModelRelationshipService; private final FormService formService; private final RestrictedService restrictedService; + private ConnectionService connectionService; private final BusinessAttributeService businessAttributeService; private final FeatureFlags featureFlags; @@ -472,6 +477,7 @@ public class GmsGraphQLEngine { private final GlossaryTermType glossaryTermType; private final GlossaryNodeType glossaryNodeType; private final AspectType aspectType; + private final DataHubConnectionType connectionType; private final ContainerType containerType; private final DomainType domainType; private final NotebookType notebookType; @@ -558,6 +564,7 @@ public GmsGraphQLEngine(final GmsGraphQLEngineArgs args) { this.dataProductService = args.dataProductService; this.formService = args.formService; this.restrictedService = args.restrictedService; + this.connectionService = args.connectionService; this.businessAttributeService = args.businessAttributeService; this.ingestionConfiguration = Objects.requireNonNull(args.ingestionConfiguration); @@ -588,6 +595,7 @@ public GmsGraphQLEngine(final GmsGraphQLEngineArgs args) { this.glossaryTermType = new GlossaryTermType(entityClient); this.glossaryNodeType = new GlossaryNodeType(entityClient); this.aspectType = new AspectType(entityClient); + this.connectionType = new DataHubConnectionType(entityClient, secretService); this.containerType = new ContainerType(entityClient); this.domainType = new DomainType(entityClient); this.notebookType = new 
NotebookType(entityClient); @@ -636,6 +644,7 @@ public GmsGraphQLEngine(final GmsGraphQLEngineArgs args) { dataJobType, glossaryTermType, glossaryNodeType, + connectionType, containerType, notebookType, domainType, @@ -753,6 +762,7 @@ public void configureRuntimeWiring(final RuntimeWiring.Builder builder) { configureRoleResolvers(builder); configureBusinessAttributeResolver(builder); configureBusinessAttributeAssociationResolver(builder); + configureConnectionResolvers(builder); } private void configureOrganisationRoleResolvers(RuntimeWiring.Builder builder) { @@ -803,6 +813,7 @@ public GraphQLEngine.Builder builder() { .addSchema(fileBasedSchema(LINEAGE_SCHEMA_FILE)) .addSchema(fileBasedSchema(PROPERTIES_SCHEMA_FILE)) .addSchema(fileBasedSchema(FORMS_SCHEMA_FILE)) + .addSchema(fileBasedSchema(CONNECTIONS_SCHEMA_FILE)) .addSchema(fileBasedSchema(INCIDENTS_SCHEMA_FILE)); for (GmsGraphQLPlugin plugin : this.graphQLPlugins) { @@ -3015,4 +3026,29 @@ private void configureBusinessAttributeAssociationResolver(final RuntimeWiring.B .getBusinessAttribute() .getUrn()))); } + + private void configureConnectionResolvers(final RuntimeWiring.Builder builder) { + builder.type( + "Mutation", + typeWiring -> + typeWiring.dataFetcher( + "upsertConnection", + new UpsertConnectionResolver(connectionService, secretService))); + builder.type( + "Query", + typeWiring -> typeWiring.dataFetcher("connection", getResolver(this.connectionType))); + builder.type( + "DataHubConnection", + typeWiring -> + typeWiring.dataFetcher( + "platform", + new LoadableTypeResolver<>( + this.dataPlatformType, + (env) -> { + final DataHubConnection connection = env.getSource(); + return connection.getPlatform() != null + ? connection.getPlatform().getUrn() + : null; + }))); + } } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngineArgs.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngineArgs.java index 2077a674abd68..d4d4d592d6bca 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngineArgs.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngineArgs.java @@ -19,6 +19,7 @@ import com.linkedin.metadata.config.ViewsConfiguration; import com.linkedin.metadata.config.VisualConfiguration; import com.linkedin.metadata.config.telemetry.TelemetryConfiguration; +import com.linkedin.metadata.connection.ConnectionService; import com.linkedin.metadata.entity.EntityService; import com.linkedin.metadata.graph.GraphClient; import com.linkedin.metadata.graph.SiblingGraphService; @@ -84,6 +85,7 @@ public class GmsGraphQLEngineArgs { int graphQLQueryDepthLimit; boolean graphQLQueryIntrospectionEnabled; BusinessAttributeService businessAttributeService; + ConnectionService connectionService; // any fork specific args should go below this line } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/connection/ConnectionMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/connection/ConnectionMapper.java new file mode 100644 index 0000000000000..a4ad332d5946d --- /dev/null +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/connection/ConnectionMapper.java @@ -0,0 +1,104 @@ +package com.linkedin.datahub.graphql.resolvers.connection; + +import com.linkedin.common.DataPlatformInstance; +import com.linkedin.common.urn.Urn; +import com.linkedin.data.DataMap; +import com.linkedin.datahub.graphql.QueryContext; +import 
com.linkedin.datahub.graphql.generated.DataHubConnection; +import com.linkedin.datahub.graphql.generated.DataHubConnectionDetails; +import com.linkedin.datahub.graphql.generated.DataHubJsonConnection; +import com.linkedin.datahub.graphql.generated.DataPlatform; +import com.linkedin.datahub.graphql.generated.EntityType; +import com.linkedin.entity.EntityResponse; +import com.linkedin.entity.EnvelopedAspect; +import com.linkedin.entity.EnvelopedAspectMap; +import com.linkedin.metadata.Constants; +import io.datahubproject.metadata.services.SecretService; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; + +public class ConnectionMapper { + /** + * Maps a GMS encrypted connection details object into the decrypted form returned by the GraphQL + * API. + * + *
<p>
Returns null if the Entity does not have the required aspects: dataHubConnectionDetails or + * dataPlatformInstance. + */ + @Nullable + public static DataHubConnection map( + @Nonnull final QueryContext context, + @Nonnull final EntityResponse entityResponse, + @Nonnull final SecretService secretService) { + // If the connection does not exist, simply return null + if (!hasAspects(entityResponse)) { + return null; + } + + final DataHubConnection result = new DataHubConnection(); + final Urn entityUrn = entityResponse.getUrn(); + final EnvelopedAspectMap aspects = entityResponse.getAspects(); + + result.setUrn(entityUrn.toString()); + result.setType(EntityType.DATAHUB_CONNECTION); + + final EnvelopedAspect envelopedAssertionInfo = + aspects.get(Constants.DATAHUB_CONNECTION_DETAILS_ASPECT_NAME); + if (envelopedAssertionInfo != null) { + result.setDetails( + mapConnectionDetails( + context, + new com.linkedin.connection.DataHubConnectionDetails( + envelopedAssertionInfo.getValue().data()), + secretService)); + } + final EnvelopedAspect envelopedPlatformInstance = + aspects.get(Constants.DATA_PLATFORM_INSTANCE_ASPECT_NAME); + if (envelopedPlatformInstance != null) { + final DataMap data = envelopedPlatformInstance.getValue().data(); + result.setPlatform(mapPlatform(new DataPlatformInstance(data))); + } + return result; + } + + private static DataHubConnectionDetails mapConnectionDetails( + @Nonnull final QueryContext context, + @Nonnull final com.linkedin.connection.DataHubConnectionDetails gmsDetails, + @Nonnull final SecretService secretService) { + final DataHubConnectionDetails result = new DataHubConnectionDetails(); + result.setType( + com.linkedin.datahub.graphql.generated.DataHubConnectionDetailsType.valueOf( + gmsDetails.getType().toString())); + if (gmsDetails.hasJson() && ConnectionUtils.canManageConnections(context)) { + result.setJson(mapJsonConnectionDetails(gmsDetails.getJson(), secretService)); + } + if (gmsDetails.hasName()) { + result.setName(gmsDetails.getName()); + } + return result; + } + + private static DataHubJsonConnection mapJsonConnectionDetails( + @Nonnull final com.linkedin.connection.DataHubJsonConnection gmsJsonConnection, + @Nonnull final SecretService secretService) { + final DataHubJsonConnection result = new DataHubJsonConnection(); + // Decrypt the BLOB! + result.setBlob(secretService.decrypt(gmsJsonConnection.getEncryptedBlob())); + return result; + } + + private static DataPlatform mapPlatform(final DataPlatformInstance platformInstance) { + // Set dummy platform to be resolved. 
+    final DataPlatform partialPlatform = new DataPlatform();
+    partialPlatform.setUrn(platformInstance.getPlatform().toString());
+    return partialPlatform;
+  }
+
+  private static boolean hasAspects(@Nonnull final EntityResponse response) {
+    return response.hasAspects()
+        && response.getAspects().containsKey(Constants.DATAHUB_CONNECTION_DETAILS_ASPECT_NAME)
+        && response.getAspects().containsKey(Constants.DATA_PLATFORM_INSTANCE_ASPECT_NAME);
+  }
+
+  private ConnectionMapper() {}
+}
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/connection/ConnectionUtils.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/connection/ConnectionUtils.java
new file mode 100644
index 0000000000000..bcdd6460ae75e
--- /dev/null
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/connection/ConnectionUtils.java
@@ -0,0 +1,23 @@
+package com.linkedin.datahub.graphql.resolvers.connection;
+
+import com.datahub.authorization.AuthUtil;
+import com.linkedin.datahub.graphql.QueryContext;
+import com.linkedin.metadata.authorization.PoliciesConfig;
+import javax.annotation.Nonnull;
+
+/** Utilities for working with DataHub Connections. */
+public class ConnectionUtils {
+
+  /**
+   * Returns true if the user is able to read and or write connection between DataHub and external
+   * platforms.
+   */
+  public static boolean canManageConnections(@Nonnull QueryContext context) {
+    return AuthUtil.isAuthorized(
+        context.getAuthorizer(),
+        context.getActorUrn(),
+        PoliciesConfig.MANAGE_CONNECTIONS_PRIVILEGE);
+  }
+
+  private ConnectionUtils() {}
+}
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/connection/UpsertConnectionResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/connection/UpsertConnectionResolver.java
new file mode 100644
index 0000000000000..3aae612b8cb78
--- /dev/null
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/connection/UpsertConnectionResolver.java
@@ -0,0 +1,78 @@
+package com.linkedin.datahub.graphql.resolvers.connection;
+
+import static com.linkedin.datahub.graphql.resolvers.ResolverUtils.*;
+
+import com.datahub.authentication.Authentication;
+import com.linkedin.common.urn.Urn;
+import com.linkedin.common.urn.UrnUtils;
+import com.linkedin.connection.DataHubConnectionDetailsType;
+import com.linkedin.connection.DataHubJsonConnection;
+import com.linkedin.datahub.graphql.QueryContext;
+import com.linkedin.datahub.graphql.exception.AuthorizationException;
+import com.linkedin.datahub.graphql.generated.DataHubConnection;
+import com.linkedin.datahub.graphql.generated.UpsertDataHubConnectionInput;
+import com.linkedin.entity.EntityResponse;
+import com.linkedin.metadata.connection.ConnectionService;
+import graphql.schema.DataFetcher;
+import graphql.schema.DataFetchingEnvironment;
+import io.datahubproject.metadata.services.SecretService;
+import java.util.Objects;
+import java.util.concurrent.CompletableFuture;
+import javax.annotation.Nonnull;
+import lombok.extern.slf4j.Slf4j;
+
+@Slf4j
+public class UpsertConnectionResolver implements DataFetcher<CompletableFuture<DataHubConnection>> {
+
+  private final ConnectionService _connectionService;
+  private final SecretService _secretService;
+
+  public UpsertConnectionResolver(
+      @Nonnull final ConnectionService connectionService,
+      @Nonnull final SecretService secretService) {
+    _connectionService =
+        Objects.requireNonNull(connectionService, "connectionService cannot be null");
+    _secretService = Objects.requireNonNull(secretService, "secretService cannot be null");
+  }
+
+  @Override
+  public CompletableFuture<DataHubConnection> get(final DataFetchingEnvironment environment)
+      throws Exception {
+
+    final QueryContext context = environment.getContext();
+    final UpsertDataHubConnectionInput input =
+        bindArgument(environment.getArgument("input"), UpsertDataHubConnectionInput.class);
+    final Authentication authentication = context.getAuthentication();
+
+    return CompletableFuture.supplyAsync(
+        () -> {
+          if (!ConnectionUtils.canManageConnections(context)) {
+            throw new AuthorizationException(
+                "Unauthorized to upsert Connection. Please contact your DataHub administrator for more information.");
+          }
+
+          try {
+            final Urn connectionUrn =
+                _connectionService.upsertConnection(
+                    context.getOperationContext(),
+                    input.getId(),
+                    UrnUtils.getUrn(input.getPlatformUrn()),
+                    DataHubConnectionDetailsType.valueOf(input.getType().toString()),
+                    input.getJson() != null
+                        // Encrypt payload
+                        ? new DataHubJsonConnection()
+                            .setEncryptedBlob(_secretService.encrypt(input.getJson().getBlob()))
+                        : null,
+                    input.getName());
+
+            final EntityResponse connectionResponse =
+                _connectionService.getConnectionEntityResponse(
+                    context.getOperationContext(), connectionUrn);
+            return ConnectionMapper.map(context, connectionResponse, _secretService);
+          } catch (Exception e) {
+            throw new RuntimeException(
+                String.format("Failed to upsert a Connection from input %s", input), e);
+          }
+        });
+  }
+}
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/connection/DataHubConnectionType.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/connection/DataHubConnectionType.java
new file mode 100644
index 0000000000000..0a62d224c6513
--- /dev/null
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/connection/DataHubConnectionType.java
@@ -0,0 +1,87 @@
+package com.linkedin.datahub.graphql.types.connection;
+
+import com.google.common.collect.ImmutableSet;
+import com.linkedin.common.urn.Urn;
+import com.linkedin.common.urn.UrnUtils;
+import com.linkedin.datahub.graphql.QueryContext;
+import com.linkedin.datahub.graphql.generated.DataHubConnection;
+import com.linkedin.datahub.graphql.generated.Entity;
+import com.linkedin.datahub.graphql.generated.EntityType;
+import com.linkedin.datahub.graphql.resolvers.connection.ConnectionMapper;
+import com.linkedin.entity.EntityResponse;
+import com.linkedin.entity.client.EntityClient;
+import com.linkedin.metadata.Constants;
+import graphql.execution.DataFetcherResult;
+import io.datahubproject.metadata.services.SecretService;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Set;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+import javax.annotation.Nonnull;
+
+public class DataHubConnectionType
+    implements com.linkedin.datahub.graphql.types.EntityType<DataHubConnection, String> {
+
+  static final Set<String> ASPECTS_TO_FETCH =
+      ImmutableSet.of(
+          Constants.DATAHUB_CONNECTION_DETAILS_ASPECT_NAME,
+          Constants.DATA_PLATFORM_INSTANCE_ASPECT_NAME);
+  private final EntityClient _entityClient;
+  private final SecretService _secretService;
+
+  public DataHubConnectionType(
+      @Nonnull final EntityClient entityClient, @Nonnull final SecretService secretService) {
+    _entityClient = Objects.requireNonNull(entityClient, "entityClient must not be null");
+    _secretService = Objects.requireNonNull(secretService, "secretService must not be null");
+  }
+
+  @Override
+  public EntityType type() {
+    return EntityType.DATAHUB_CONNECTION;
+  }
+
+  @Override
+  public Function<Entity, String> getKeyProvider() {
+    return Entity::getUrn;
+  }
+
+  @Override
+  public Class<DataHubConnection> objectClass() {
+    return DataHubConnection.class;
+  }
+
+  @Override
+  public List<DataFetcherResult<DataHubConnection>> batchLoad(
+      @Nonnull List<String> urns, @Nonnull QueryContext context) throws Exception {
+    final List<Urn> connectionUrns =
+        urns.stream().map(UrnUtils::getUrn).collect(Collectors.toList());
+    try {
+      final Map<Urn, EntityResponse> entities =
+          _entityClient.batchGetV2(
+              context.getOperationContext(),
+              Constants.DATAHUB_CONNECTION_ENTITY_NAME,
+              new HashSet<>(connectionUrns),
+              ASPECTS_TO_FETCH);
+
+      final List<EntityResponse> gmsResults = new ArrayList<>();
+      for (Urn urn : connectionUrns) {
+        gmsResults.add(entities.getOrDefault(urn, null));
+      }
+      return gmsResults.stream()
+          .map(
+              gmsResult ->
+                  gmsResult == null
+                      ? null
+                      : DataFetcherResult.<DataHubConnection>newResult()
+                          .data(ConnectionMapper.map(context, gmsResult, _secretService))
+                          .build())
+          .collect(Collectors.toList());
+    } catch (Exception e) {
+      throw new RuntimeException("Failed to batch load Connections", e);
+    }
+  }
+}
diff --git a/datahub-graphql-core/src/main/resources/connection.graphql b/datahub-graphql-core/src/main/resources/connection.graphql
new file mode 100644
index 0000000000000..1a7249485e69d
--- /dev/null
+++ b/datahub-graphql-core/src/main/resources/connection.graphql
@@ -0,0 +1,130 @@
+# DataHub Connections-specific GraphQL types
+
+extend type Query {
+  """
+  Get a set of connection details by URN.
+  This requires the 'Manage Connections' platform privilege.
+  Returns null if a connection with the provided urn does not exist.
+  """
+  connection(urn: String!): DataHubConnection
+}
+
+extend type Mutation {
+  """
+  Upsert a particular connection.
+  This requires the 'Manage Connections' platform privilege.
+  """
+  upsertConnection(input: UpsertDataHubConnectionInput!): DataHubConnection!
+}
+
+"""
+A connection between DataHub and an external Platform.
+"""
+type DataHubConnection implements Entity {
+  """
+  The urn of the connection
+  """
+  urn: String!
+
+  """
+  The standard Entity Type field
+  """
+  type: EntityType!
+
+  """
+  The connection details
+  """
+  details: DataHubConnectionDetails!
+
+  """
+  The external Data Platform associated with the connection
+  """
+  platform: DataPlatform!
+
+  """
+  Not implemented!
+  """
+  relationships(input: RelationshipsInput!): EntityRelationshipsResult
+}
+
+
+"""
+The details of the Connection
+"""
+type DataHubConnectionDetails {
+  """
+  The type or format of connection
+  """
+  type: DataHubConnectionDetailsType!
+
+  """
+  A JSON-encoded connection. Present when type is JSON.
+  """
+  json: DataHubJsonConnection
+
+  """
+  The name for this DataHub connection
+  """
+  name: String
+}
+
+"""
+The type of a DataHub connection
+"""
+enum DataHubConnectionDetailsType {
+  """
+  A json-encoded set of connection details.
+  """
+  JSON
+}
+
+"""
+The details of a JSON Connection
+"""
+type DataHubJsonConnection {
+  """
+  The JSON blob containing the specific connection details.
+  """
+  blob: String!
+}
+
+"""
+Input required to upsert a new DataHub connection.
+"""
+input UpsertDataHubConnectionInput {
+  """
+  An optional ID to use when creating the URN of the connection. If none is provided,
+  a random UUID will be generated automatically.
+  """
+  id: String
+
+  """
+  The type or format of connection
+  """
+  type: DataHubConnectionDetailsType!
+
+  """
+  Urn of the associated platform
+  """
+  platformUrn: String!
+ + """ + A JSON-encoded connection. This must be present when type is JSON. + """ + json: DataHubJsonConnectionInput + + """ + An optional name for this connection entity + """ + name: String +} + +""" +The details of a JSON Connection +""" +input DataHubJsonConnectionInput { + """ + The JSON blob containing the specific connection details. + """ + blob: String! +} \ No newline at end of file diff --git a/datahub-graphql-core/src/main/resources/entity.graphql b/datahub-graphql-core/src/main/resources/entity.graphql index 2315d6f8767d9..2afb42c649fec 100644 --- a/datahub-graphql-core/src/main/resources/entity.graphql +++ b/datahub-graphql-core/src/main/resources/entity.graphql @@ -1143,6 +1143,11 @@ enum EntityType { """ CUSTOM_OWNERSHIP_TYPE + """ + A connection to an external source. + """ + DATAHUB_CONNECTION + """ A DataHub incident - SaaS only """ diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/connection/UpsertConnectionResolverTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/connection/UpsertConnectionResolverTest.java new file mode 100644 index 0000000000000..5bc5332e711fd --- /dev/null +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/connection/UpsertConnectionResolverTest.java @@ -0,0 +1,128 @@ +package com.linkedin.datahub.graphql.resolvers.connection; + +import static com.linkedin.datahub.graphql.TestUtils.*; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.when; +import static org.testng.Assert.assertThrows; + +import com.google.common.collect.ImmutableMap; +import com.linkedin.common.DataPlatformInstance; +import com.linkedin.common.urn.Urn; +import com.linkedin.common.urn.UrnUtils; +import com.linkedin.connection.DataHubConnectionDetails; +import com.linkedin.connection.DataHubJsonConnection; +import com.linkedin.datahub.graphql.QueryContext; +import com.linkedin.datahub.graphql.generated.DataHubConnection; +import com.linkedin.datahub.graphql.generated.DataHubConnectionDetailsType; +import com.linkedin.datahub.graphql.generated.DataHubJsonConnectionInput; +import com.linkedin.datahub.graphql.generated.EntityType; +import com.linkedin.datahub.graphql.generated.UpsertDataHubConnectionInput; +import com.linkedin.entity.Aspect; +import com.linkedin.entity.EntityResponse; +import com.linkedin.entity.EnvelopedAspect; +import com.linkedin.entity.EnvelopedAspectMap; +import com.linkedin.metadata.Constants; +import com.linkedin.metadata.connection.ConnectionService; +import graphql.schema.DataFetchingEnvironment; +import io.datahubproject.metadata.context.OperationContext; +import io.datahubproject.metadata.services.SecretService; +import java.util.concurrent.CompletionException; +import org.mockito.Mockito; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +public class UpsertConnectionResolverTest { + + private ConnectionService connectionService; + private SecretService secretService; + private UpsertConnectionResolver resolver; + + @BeforeMethod + public void setUp() { + connectionService = Mockito.mock(ConnectionService.class); + secretService = Mockito.mock(SecretService.class); + Mockito.when(secretService.encrypt("{}")).thenReturn("encrypted"); + Mockito.when(secretService.decrypt("encrypted")).thenReturn("{}"); + resolver = new UpsertConnectionResolver(connectionService, secretService); + } + + @Test + public void testGetAuthorized() throws Exception { + // Mock inputs + Urn 
connectionUrn = UrnUtils.getUrn("urn:li:dataHubConnection:test-id"); + Urn platformUrn = UrnUtils.getUrn("urn:li:dataPlatform:slack"); + + final UpsertDataHubConnectionInput input = new UpsertDataHubConnectionInput(); + input.setId(connectionUrn.getId()); + input.setPlatformUrn(platformUrn.toString()); + input.setType(DataHubConnectionDetailsType.JSON); + input.setName("test-name"); + input.setJson(new DataHubJsonConnectionInput("{}")); + + QueryContext mockContext = getMockAllowContext(); + DataFetchingEnvironment mockEnv = Mockito.mock(DataFetchingEnvironment.class); + Mockito.when(mockEnv.getArgument(Mockito.eq("input"))).thenReturn(input); + Mockito.when(mockEnv.getContext()).thenReturn(mockContext); + + final DataHubConnectionDetails details = + new DataHubConnectionDetails() + .setType(com.linkedin.connection.DataHubConnectionDetailsType.JSON) + .setJson(new DataHubJsonConnection().setEncryptedBlob("encrypted")); + + final DataPlatformInstance platformInstance = + new DataPlatformInstance().setPlatform(platformUrn); + + when(connectionService.upsertConnection( + any(OperationContext.class), + Mockito.eq(input.getId()), + Mockito.eq(platformUrn), + Mockito.eq(details.getType()), + Mockito.eq(details.getJson()), + Mockito.any(String.class))) + .thenReturn(connectionUrn); + when(connectionService.getConnectionEntityResponse( + any(OperationContext.class), Mockito.eq(connectionUrn))) + .thenReturn( + new EntityResponse() + .setUrn(connectionUrn) + .setEntityName(Constants.DATAHUB_CONNECTION_ENTITY_NAME) + .setAspects( + new EnvelopedAspectMap( + ImmutableMap.of( + Constants.DATAHUB_CONNECTION_DETAILS_ASPECT_NAME, + new EnvelopedAspect() + .setName(Constants.DATAHUB_CONNECTION_DETAILS_ASPECT_NAME) + .setValue(new Aspect(details.data())), + Constants.DATA_PLATFORM_INSTANCE_ASPECT_NAME, + new EnvelopedAspect() + .setName(Constants.DATA_PLATFORM_INSTANCE_ASPECT_NAME) + .setValue(new Aspect(platformInstance.data())))))); + + DataHubConnection actual = resolver.get(mockEnv).get(); + + Assert.assertEquals(actual.getType(), EntityType.DATAHUB_CONNECTION); + Assert.assertEquals(actual.getUrn(), connectionUrn.toString()); + Assert.assertEquals(actual.getPlatform().getUrn(), platformUrn.toString()); + Assert.assertEquals(actual.getDetails().getType(), input.getType()); + Assert.assertEquals(actual.getDetails().getJson().getBlob(), input.getJson().getBlob()); + } + + @Test + public void testGetUnAuthorized() { + // Mock inputs + Urn connectionUrn = UrnUtils.getUrn("urn:li:dataHubConnection:test-id"); + + final UpsertDataHubConnectionInput input = new UpsertDataHubConnectionInput(); + input.setId(connectionUrn.getId()); + input.setPlatformUrn(connectionUrn.toString()); + input.setType(DataHubConnectionDetailsType.JSON); + + QueryContext mockContext = getMockAllowContext(); + DataFetchingEnvironment mockEnv = Mockito.mock(DataFetchingEnvironment.class); + Mockito.when(mockEnv.getArgument(Mockito.eq("input"))).thenReturn(input); + Mockito.when(mockEnv.getContext()).thenReturn(mockContext); + + assertThrows(CompletionException.class, () -> resolver.get(mockEnv).join()); + } +} diff --git a/datahub-web-react/src/graphql/connection.graphql b/datahub-web-react/src/graphql/connection.graphql new file mode 100644 index 0000000000000..02f87f08c519f --- /dev/null +++ b/datahub-web-react/src/graphql/connection.graphql @@ -0,0 +1,29 @@ +mutation upsertConnection($input: UpsertDataHubConnectionInput!) 
{ + upsertConnection(input: $input) { + urn + details { + type + json { + blob + } + } + platform { + ...platformFields + } + } +} + +query connection($urn: String!) { + connection(urn: $urn) { + urn + details { + type + json { + blob + } + } + platform { + ...platformFields + } + } +} diff --git a/docs-website/graphql/generateGraphQLSchema.sh b/docs-website/graphql/generateGraphQLSchema.sh index c6d7ec528b613..da14fbc337f90 100755 --- a/docs-website/graphql/generateGraphQLSchema.sh +++ b/docs-website/graphql/generateGraphQLSchema.sh @@ -17,4 +17,5 @@ cat ../../datahub-graphql-core/src/main/resources/timeline.graphql >> combined.g cat ../../datahub-graphql-core/src/main/resources/step.graphql >> combined.graphql cat ../../datahub-graphql-core/src/main/resources/lineage.graphql >> combined.graphql cat ../../datahub-graphql-core/src/main/resources/properties.graphql >> combined.graphql -cat ../../datahub-graphql-core/src/main/resources/forms.graphql >> combined.graphql \ No newline at end of file +cat ../../datahub-graphql-core/src/main/resources/forms.graphql >> combined.graphql +cat ../../datahub-graphql-core/src/main/resources/connection.graphql >> combined.graphql \ No newline at end of file diff --git a/li-utils/src/main/java/com/linkedin/metadata/Constants.java b/li-utils/src/main/java/com/linkedin/metadata/Constants.java index c200a4bc30d19..66ed48a428a21 100644 --- a/li-utils/src/main/java/com/linkedin/metadata/Constants.java +++ b/li-utils/src/main/java/com/linkedin/metadata/Constants.java @@ -358,6 +358,10 @@ public class Constants { public static final String GLOBAL_SETTINGS_INFO_ASPECT_NAME = "globalSettingsInfo"; public static final Urn GLOBAL_SETTINGS_URN = Urn.createFromTuple(GLOBAL_SETTINGS_ENTITY_NAME, 0); + // Connection + public static final String DATAHUB_CONNECTION_ENTITY_NAME = "dataHubConnection"; + public static final String DATAHUB_CONNECTION_DETAILS_ASPECT_NAME = "dataHubConnectionDetails"; + // Relationships public static final String IS_MEMBER_OF_GROUP_RELATIONSHIP_NAME = "IsMemberOfGroup"; public static final String IS_MEMBER_OF_NATIVE_GROUP_RELATIONSHIP_NAME = "IsMemberOfNativeGroup"; diff --git a/metadata-io/src/main/java/com/linkedin/metadata/connection/ConnectionService.java b/metadata-io/src/main/java/com/linkedin/metadata/connection/ConnectionService.java new file mode 100644 index 0000000000000..f044ea52a251a --- /dev/null +++ b/metadata-io/src/main/java/com/linkedin/metadata/connection/ConnectionService.java @@ -0,0 +1,129 @@ +package com.linkedin.metadata.connection; + +import com.google.common.collect.ImmutableSet; +import com.linkedin.common.DataPlatformInstance; +import com.linkedin.common.urn.Urn; +import com.linkedin.connection.DataHubConnectionDetails; +import com.linkedin.connection.DataHubConnectionDetailsType; +import com.linkedin.connection.DataHubJsonConnection; +import com.linkedin.data.template.SetMode; +import com.linkedin.entity.EntityResponse; +import com.linkedin.entity.client.EntityClient; +import com.linkedin.metadata.Constants; +import com.linkedin.metadata.entity.AspectUtils; +import com.linkedin.metadata.key.DataHubConnectionKey; +import com.linkedin.metadata.utils.EntityKeyUtils; +import com.linkedin.mxe.MetadataChangeProposal; +import io.datahubproject.metadata.context.OperationContext; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; +import java.util.UUID; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; 
+ +@Slf4j +@RequiredArgsConstructor +public class ConnectionService { + + private final EntityClient _entityClient; + + /** + * Upserts a DataHub connection. If the connection with the provided ID already exists, then it + * will be overwritten. + * + *
<p>
This method assumes that authorization has already been verified at the calling layer.
+   *
+   * @return the URN of the new connection.
+   */
+  public Urn upsertConnection(
+      @Nonnull OperationContext opContext,
+      @Nullable final String id,
+      @Nonnull final Urn platformUrn,
+      @Nonnull final DataHubConnectionDetailsType type,
+      @Nullable final DataHubJsonConnection json,
+      @Nullable final String name) {
+    Objects.requireNonNull(platformUrn, "platformUrn must not be null");
+    Objects.requireNonNull(type, "type must not be null");
+    Objects.requireNonNull(opContext, "opContext must not be null");
+
+    // 1. Optionally generate new connection id
+    final String connectionId = id != null ? id : UUID.randomUUID().toString();
+    final DataHubConnectionKey key = new DataHubConnectionKey().setId(connectionId);
+    final Urn connectionUrn =
+        EntityKeyUtils.convertEntityKeyToUrn(key, Constants.DATAHUB_CONNECTION_ENTITY_NAME);
+
+    // 2. Build Connection Details
+    final DataHubConnectionDetails details = new DataHubConnectionDetails();
+    details.setType(type);
+    // default set name as ID if it exists, otherwise use name if it exists
+    details.setName(id, SetMode.IGNORE_NULL);
+    details.setName(name, SetMode.IGNORE_NULL);
+
+    if (DataHubConnectionDetailsType.JSON.equals(details.getType())) {
+      if (json != null) {
+        details.setJson(json);
+      } else {
+        throw new IllegalArgumentException(
+            "Connections with type JSON must provide the field 'json'.");
+      }
+    }
+
+    // 3. Build platform instance
+    final DataPlatformInstance platformInstance = new DataPlatformInstance();
+    platformInstance.setPlatform(platformUrn);
+
+    // 4. Write changes to GMS
+    try {
+      final List<MetadataChangeProposal> aspectsToIngest = new ArrayList<>();
+      aspectsToIngest.add(
+          AspectUtils.buildMetadataChangeProposal(
+              connectionUrn, Constants.DATAHUB_CONNECTION_DETAILS_ASPECT_NAME, details));
+      aspectsToIngest.add(
+          AspectUtils.buildMetadataChangeProposal(
+              connectionUrn, Constants.DATA_PLATFORM_INSTANCE_ASPECT_NAME, platformInstance));
+      _entityClient.batchIngestProposals(opContext, aspectsToIngest, false);
+    } catch (Exception e) {
+      throw new RuntimeException(
+          String.format("Failed to upsert Connection with urn %s", connectionUrn), e);
+    }
+    return connectionUrn;
+  }
+
+  @Nullable
+  public DataHubConnectionDetails getConnectionDetails(
+      @Nonnull OperationContext opContext, @Nonnull final Urn connectionUrn) {
+    Objects.requireNonNull(connectionUrn, "connectionUrn must not be null");
+    final EntityResponse response = getConnectionEntityResponse(opContext, connectionUrn);
+    if (response != null
+        && response.getAspects().containsKey(Constants.DATAHUB_CONNECTION_DETAILS_ASPECT_NAME)) {
+      return new DataHubConnectionDetails(
+          response
+              .getAspects()
+              .get(Constants.DATAHUB_CONNECTION_DETAILS_ASPECT_NAME)
+              .getValue()
+              .data());
+    }
+    // No aspect found
+    return null;
+  }
+
+  @Nullable
+  public EntityResponse getConnectionEntityResponse(
+      @Nonnull OperationContext opContext, @Nonnull final Urn connectionUrn) {
+    try {
+      return _entityClient.getV2(
+          opContext,
+          Constants.DATAHUB_CONNECTION_ENTITY_NAME,
+          connectionUrn,
+          ImmutableSet.of(
+              Constants.DATAHUB_CONNECTION_DETAILS_ASPECT_NAME,
+              Constants.DATA_PLATFORM_INSTANCE_ASPECT_NAME));
+    } catch (Exception e) {
+      throw new RuntimeException(
+          String.format("Failed to retrieve Connection with urn %s", connectionUrn), e);
+    }
+  }
+}
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/connection/ConnectionServiceTest.java
b/metadata-io/src/test/java/com/linkedin/metadata/connection/ConnectionServiceTest.java new file mode 100644 index 0000000000000..658c66807ccf1 --- /dev/null +++ b/metadata-io/src/test/java/com/linkedin/metadata/connection/ConnectionServiceTest.java @@ -0,0 +1,147 @@ +package com.linkedin.metadata.connection; + +import static org.mockito.Mockito.*; +import static org.testng.Assert.*; + +import com.datahub.authentication.Authentication; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.linkedin.common.DataPlatformInstance; +import com.linkedin.common.urn.Urn; +import com.linkedin.common.urn.UrnUtils; +import com.linkedin.connection.DataHubConnectionDetails; +import com.linkedin.connection.DataHubConnectionDetailsType; +import com.linkedin.connection.DataHubJsonConnection; +import com.linkedin.entity.Aspect; +import com.linkedin.entity.EntityResponse; +import com.linkedin.entity.EnvelopedAspect; +import com.linkedin.entity.EnvelopedAspectMap; +import com.linkedin.entity.client.EntityClient; +import com.linkedin.metadata.Constants; +import com.linkedin.metadata.entity.AspectUtils; +import com.linkedin.metadata.key.DataHubConnectionKey; +import com.linkedin.metadata.utils.EntityKeyUtils; +import io.datahubproject.metadata.context.OperationContext; +import org.mockito.Mockito; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +public class ConnectionServiceTest { + + private EntityClient entityClient; + private Authentication systemAuthentication; + private ConnectionService connectionService; + + @BeforeMethod + public void setUp() { + entityClient = Mockito.mock(EntityClient.class); + systemAuthentication = Mockito.mock(Authentication.class); + connectionService = new ConnectionService(entityClient); + } + + @Test + public void testUpsertConnection() throws Exception { + final String id = "testId"; + final Urn platformUrn = UrnUtils.getUrn("urn:li:dataPlatform:slack"); + final DataHubConnectionDetailsType type = DataHubConnectionDetailsType.JSON; + final DataHubJsonConnection json = new DataHubJsonConnection().setEncryptedBlob("blob"); + final Authentication authentication = Mockito.mock(Authentication.class); + final DataHubConnectionKey key = new DataHubConnectionKey().setId(id); + final Urn connectionUrn = + EntityKeyUtils.convertEntityKeyToUrn(key, Constants.DATAHUB_CONNECTION_ENTITY_NAME); + + // Execute and assert + Urn result = + connectionService.upsertConnection( + mock(OperationContext.class), id, platformUrn, type, json, null); + + DataHubConnectionDetails expectedDetails = mockConnectionDetails(id); + DataPlatformInstance expectedDataPlatformInstance = mockPlatformInstance(platformUrn); + + verify(entityClient) + .batchIngestProposals( + any(OperationContext.class), + Mockito.eq( + ImmutableList.of( + AspectUtils.buildMetadataChangeProposal( + connectionUrn, + Constants.DATAHUB_CONNECTION_DETAILS_ASPECT_NAME, + expectedDetails), + AspectUtils.buildMetadataChangeProposal( + connectionUrn, + Constants.DATA_PLATFORM_INSTANCE_ASPECT_NAME, + expectedDataPlatformInstance))), + Mockito.eq(false)); + assertEquals(result, connectionUrn); + } + + @Test + public void testGetConnectionDetails() throws Exception { + final Urn connectionUrn = Mockito.mock(Urn.class); + + final DataHubConnectionDetails connectionDetails = mockConnectionDetails("testId"); + final DataPlatformInstance platformInstance = + 
mockPlatformInstance(UrnUtils.getUrn("urn:li:dataPlatform:slack"));
+
+    EntityResponse response =
+        new EntityResponse()
+            .setEntityName(Constants.DATAHUB_CONNECTION_ENTITY_NAME)
+            .setUrn(connectionUrn)
+            .setAspects(
+                new EnvelopedAspectMap(
+                    ImmutableMap.of(
+                        Constants.DATAHUB_CONNECTION_DETAILS_ASPECT_NAME,
+                        new EnvelopedAspect()
+                            .setName(Constants.DATAHUB_CONNECTION_DETAILS_ASPECT_NAME)
+                            .setValue(new Aspect(connectionDetails.data())),
+                        Constants.DATA_PLATFORM_INSTANCE_ASPECT_NAME,
+                        new EnvelopedAspect()
+                            .setName(Constants.DATA_PLATFORM_INSTANCE_ASPECT_NAME)
+                            .setValue(new Aspect(platformInstance.data())))));
+    when(entityClient.getV2(
+            any(OperationContext.class),
+            Mockito.eq(Constants.DATAHUB_CONNECTION_ENTITY_NAME),
+            Mockito.eq(connectionUrn),
+            Mockito.eq(
+                ImmutableSet.of(
+                    Constants.DATAHUB_CONNECTION_DETAILS_ASPECT_NAME,
+                    Constants.DATA_PLATFORM_INSTANCE_ASPECT_NAME))))
+        .thenReturn(response);
+
+    // Execute and assert
+    DataHubConnectionDetails details =
+        connectionService.getConnectionDetails(mock(OperationContext.class), connectionUrn);
+    assertEquals(details, connectionDetails);
+  }
+
+  @Test
+  public void testGetConnectionEntityResponse() throws Exception {
+    final Urn connectionUrn = Mockito.mock(Urn.class);
+    EntityResponse response = Mockito.mock(EntityResponse.class);
+    when(entityClient.getV2(
+            any(OperationContext.class),
+            Mockito.eq(Constants.DATAHUB_CONNECTION_ENTITY_NAME),
+            Mockito.eq(connectionUrn),
+            Mockito.eq(
+                ImmutableSet.of(
+                    Constants.DATAHUB_CONNECTION_DETAILS_ASPECT_NAME,
+                    Constants.DATA_PLATFORM_INSTANCE_ASPECT_NAME))))
+        .thenReturn(response);
+    // Execute and assert
+    assertEquals(
+        connectionService.getConnectionEntityResponse(mock(OperationContext.class), connectionUrn),
+        response);
+  }
+
+  private DataHubConnectionDetails mockConnectionDetails(String id) {
+    return new DataHubConnectionDetails()
+        .setType(DataHubConnectionDetailsType.JSON)
+        .setName(id)
+        .setJson(new DataHubJsonConnection().setEncryptedBlob("blob"));
+  }
+
+  private DataPlatformInstance mockPlatformInstance(Urn platformUrn) {
+    return new DataPlatformInstance().setPlatform(platformUrn);
+  }
+}
diff --git a/metadata-models/src/main/pegasus/com/linkedin/connection/DataHubConnectionDetails.pdl b/metadata-models/src/main/pegasus/com/linkedin/connection/DataHubConnectionDetails.pdl
new file mode 100644
index 0000000000000..81f57abf2dac4
--- /dev/null
+++ b/metadata-models/src/main/pegasus/com/linkedin/connection/DataHubConnectionDetails.pdl
@@ -0,0 +1,38 @@
+namespace com.linkedin.connection
+
+import com.linkedin.common.Urn
+
+/**
+ * Information about a connection to an external platform.
+ */
+@Aspect = {
+  "name": "dataHubConnectionDetails"
+}
+record DataHubConnectionDetails {
+  /**
+   * The type of the connection. This defines the schema / encoding of the connection details.
+   */
+  @Searchable = {}
+  type: enum DataHubConnectionDetailsType {
+    /**
+     * A json-encoded set of connection details
+     */
+    JSON
+  }
+
+  /**
+   * Display name of the connection
+   */
+  @Searchable = {
+    "fieldType": "TEXT_PARTIAL",
+    "enableAutocomplete": true,
+    "boostScore": 10.0
+  }
+  name: optional string
+
+  /**
+   * A JSON payload containing raw connection details.
+   * This will be present if the type is JSON.
+ */ + json: optional DataHubJsonConnection +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/connection/DataHubJsonConnection.pdl b/metadata-models/src/main/pegasus/com/linkedin/connection/DataHubJsonConnection.pdl new file mode 100644 index 0000000000000..996e2a3238bd5 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/connection/DataHubJsonConnection.pdl @@ -0,0 +1,11 @@ +namespace com.linkedin.connection + +/** + * A set of connection details consisting of an encrypted JSON blob. + */ +record DataHubJsonConnection { + /** + * The encrypted JSON connection details. + */ + encryptedBlob: string +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataHubConnectionKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataHubConnectionKey.pdl new file mode 100644 index 0000000000000..cd851d8382759 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataHubConnectionKey.pdl @@ -0,0 +1,15 @@ +namespace com.linkedin.metadata.key + +/** + * Key for a Connection + */ +@Aspect = { + "name": "dataHubConnectionKey" +} +record DataHubConnectionKey { + /** + * A unique identifier for the connection. + */ + @Searchable = {} + id: string +} \ No newline at end of file diff --git a/metadata-models/src/main/resources/entity-registry.yml b/metadata-models/src/main/resources/entity-registry.yml index a9301076d4e82..60ef05ea55b2c 100644 --- a/metadata-models/src/main/resources/entity-registry.yml +++ b/metadata-models/src/main/resources/entity-registry.yml @@ -570,6 +570,12 @@ entities: - formInfo - dynamicFormAssignment - ownership + - name: dataHubConnection + category: internal + keyAspect: dataHubConnectionKey + aspects: + - dataHubConnectionDetails + - dataPlatformInstance events: plugins: aspectPayloadValidators: diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/connection/ConnectionServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/connection/ConnectionServiceFactory.java new file mode 100644 index 0000000000000..07cc59722e91f --- /dev/null +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/connection/ConnectionServiceFactory.java @@ -0,0 +1,19 @@ +package com.linkedin.gms.factory.connection; + +import com.linkedin.entity.client.SystemEntityClient; +import com.linkedin.metadata.connection.ConnectionService; +import javax.annotation.Nonnull; +import org.springframework.beans.factory.annotation.Qualifier; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; + +@Configuration +public class ConnectionServiceFactory { + @Bean(name = "connectionService") + @Nonnull + protected ConnectionService getInstance( + @Qualifier("systemEntityClient") final SystemEntityClient systemEntityClient) + throws Exception { + return new ConnectionService(systemEntityClient); + } +} diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/graphql/GraphQLEngineFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/graphql/GraphQLEngineFactory.java index 678d442396d0f..1ac6010be92e5 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/graphql/GraphQLEngineFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/graphql/GraphQLEngineFactory.java @@ -21,6 +21,7 @@ import com.linkedin.gms.factory.entityregistry.EntityRegistryFactory; 
import com.linkedin.gms.factory.recommendation.RecommendationServiceFactory; import com.linkedin.metadata.client.UsageStatsJavaClient; +import com.linkedin.metadata.connection.ConnectionService; import com.linkedin.metadata.entity.EntityService; import com.linkedin.metadata.graph.GraphClient; import com.linkedin.metadata.graph.GraphService; @@ -181,6 +182,10 @@ public class GraphQLEngineFactory { @Qualifier("businessAttributeService") private BusinessAttributeService businessAttributeService; + @Autowired + @Qualifier("connectionService") + private ConnectionService _connectionService; + @Bean(name = "graphQLEngine") @Nonnull protected GraphQLEngine graphQLEngine( @@ -233,6 +238,7 @@ protected GraphQLEngine graphQLEngine( configProvider.getGraphQL().getQuery().getComplexityLimit()); args.setGraphQLQueryDepthLimit(configProvider.getGraphQL().getQuery().getDepthLimit()); args.setBusinessAttributeService(businessAttributeService); + args.setConnectionService(_connectionService); return new GmsGraphQLEngine(args).builder().build(); } } diff --git a/metadata-service/war/src/main/java/com/linkedin/gms/servlet/GraphQLServletConfig.java b/metadata-service/war/src/main/java/com/linkedin/gms/servlet/GraphQLServletConfig.java index 64ec11f58c60d..42413df0757e6 100644 --- a/metadata-service/war/src/main/java/com/linkedin/gms/servlet/GraphQLServletConfig.java +++ b/metadata-service/war/src/main/java/com/linkedin/gms/servlet/GraphQLServletConfig.java @@ -18,7 +18,8 @@ "com.linkedin.gms.factory.query", "com.linkedin.gms.factory.ermodelrelation", "com.linkedin.gms.factory.dataproduct", - "com.linkedin.gms.factory.businessattribute" + "com.linkedin.gms.factory.businessattribute", + "com.linkedin.gms.factory.connection" }) @Configuration public class GraphQLServletConfig {} diff --git a/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java b/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java index 342c492b01b2e..ea8f52925b5b3 100644 --- a/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java +++ b/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java @@ -142,6 +142,10 @@ public class PoliciesConfig { "Manage Business Attribute", "Create, update, delete Business Attribute"); + public static final Privilege MANAGE_CONNECTIONS_PRIVILEGE = + Privilege.of( + "MANAGE_CONNECTIONS", "Manage Connections", "Manage connections to external platforms."); + public static final List PLATFORM_PRIVILEGES = ImmutableList.of( MANAGE_POLICIES_PRIVILEGE, @@ -164,7 +168,8 @@ public class PoliciesConfig { MANAGE_GLOBAL_VIEWS, MANAGE_GLOBAL_OWNERSHIP_TYPES, CREATE_BUSINESS_ATTRIBUTE_PRIVILEGE, - MANAGE_BUSINESS_ATTRIBUTE_PRIVILEGE); + MANAGE_BUSINESS_ATTRIBUTE_PRIVILEGE, + MANAGE_CONNECTIONS_PRIVILEGE); // Resource Privileges // From 634a486d81f16b6797402a7b9162868a78e37625 Mon Sep 17 00:00:00 2001 From: sid-acryl <155424659+sid-acryl@users.noreply.github.com> Date: Tue, 21 May 2024 09:13:04 +0530 Subject: [PATCH 6/7] doc(gms/scim-api): fix title and add overview (#10388) --- ...ing-identity-provisioning-with-ms-entra.md | 41 ++++++++++++++++--- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/docs/managed-datahub/configuring-identity-provisioning-with-ms-entra.md b/docs/managed-datahub/configuring-identity-provisioning-with-ms-entra.md index ed840ff28e2f6..342b44d478afb 100644 --- a/docs/managed-datahub/configuring-identity-provisioning-with-ms-entra.md +++ 
b/docs/managed-datahub/configuring-identity-provisioning-with-ms-entra.md
@@ -1,13 +1,34 @@
 ---
-title: "Configuring MS Entra with DataHub"
+title: "SCIM Integration: MS Entra and DataHub"
 hide_title: true
 ---
 import FeatureAvailability from '@site/src/components/FeatureAvailability';

-# Entity Events API
+## SCIM Integration: MS Entra and DataHub

-# Configuring User/Group/Roles provisioning from MS Entra to DataHub
+## Overview
+Once this setup is complete, MS Entra will automatically provision and manage users, groups, and roles from MS Entra to DataHub.
+
+Consider the following configuration in MS Entra:
+- A group named `governance-team`
+- The group has two members, `john` and `sid`
+- The group has the role `Reader`
+
+If you configure `governance-team` for auto provisioning, MS Entra will automatically create the `governance-team` group and its members on DataHub and assign the `Reader` role to those users.
+
+If you remove `john` from the `governance-team` group, MS Entra will automatically remove `john` from DataHub's `governance-team` group.
+
+If you permanently delete a user or group from MS Entra, MS Entra will automatically delete that user or group from DataHub.
+
+> MS Entra doesn't send the user's password on user creation, so a DataHub Admin needs to reset the user's password before they can log in to DataHub.
+
+
+> Only the Admin, Editor, and Reader roles are supported in DataHub. These roles are preconfigured/created on DataHub.
+
+
+
+## Configuring User/Group/Roles provisioning from MS Entra to DataHub

 1. **Generate Personal Access Token**: Generate a personal access token from [DataHub](../../docs/authentication/personal-access-tokens.md#creating-personal-access-tokens).

@@ -34,22 +55,32 @@ import FeatureAvailability from '@site/src/components/FeatureAvailability';

    c. Fill detail as shown in below image

+      Fill in the listed fields:
+
+      - Set `Mapping type` to `Expression`
+      - Set `Expression` to `SingleAppRoleAssignment([appRoleAssignments])`
+      - Set `Target attribute` to `roles[primary eq "True"].value`
+      - Set `Match objects using this attribute` to `No`
+      - Set `Apply this mapping` to `Always`
+
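+      For intuition, this expression mapping is what carries the user's app role into the SCIM payload that MS Entra sends to DataHub. For a user holding the `Reader` app role, the relevant fragment of the provisioned SCIM user would look roughly like the sketch below (an illustration only, not captured from a real request; the exact envelope varies by tenant):
+
+      ```json
+      {
+        "userName": "john@example.com",
+        "roles": [{ "value": "Reader", "primary": true }]
+      }
+      ```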

- d. **Create Role**: Go to `Provisioning` section and click on `application registration.` to create the role
+ d. **Create Role**: Go back to the app created in Step #1, open the `Provisioning` section, and click on `application registration` to create the role

- Create three roles having `Display Name` and `Value` as mentioned below + Create three roles having `Display Name` and `Value` as mentioned below - Admin - Editor - Reader + Only these three roles are supported in DataHub. + e. While creating the App Role set `Allowed member types` to `Users/Groups` 4. **Add Users/Groups/Roles in the App**: Go to application created in step #1 and click on `Add user/group` as shown in below image From 7f37c6f17a766cf582992bf90f0e2aa7f1031623 Mon Sep 17 00:00:00 2001 From: Hyejin Yoon <0327jane@gmail.com> Date: Tue, 21 May 2024 18:12:10 +0900 Subject: [PATCH 7/7] docs: add guides on forms & structured properties (#10340) --- docs-website/sidebars.js | 9 +- .../openapi/openapi-structured-properties.md | 328 ---------- docs/api/tutorials/forms.md | 148 +++++ docs/api/tutorials/structured-properties.md | 567 ++++++++++++++++++ .../feature-guides/documentation-forms.md | 113 ++++ docs/features/feature-guides/properties.md | 158 +++++ 6 files changed, 990 insertions(+), 333 deletions(-) delete mode 100644 docs/api/openapi/openapi-structured-properties.md create mode 100644 docs/api/tutorials/forms.md create mode 100644 docs/api/tutorials/structured-properties.md create mode 100644 docs/features/feature-guides/documentation-forms.md create mode 100644 docs/features/feature-guides/properties.md diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index 5c71e79a10172..786abb62bc97b 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -106,6 +106,7 @@ module.exports = { type: "doc", id: "docs/features/dataset-usage-and-query-history", }, + "docs/features/feature-guides/documentation-forms", { label: "Domains", type: "doc", @@ -162,6 +163,7 @@ module.exports = { type: "doc", id: "docs/posts", }, + "docs/features/feature-guides/properties", { label: "Schema history", type: "doc", @@ -676,11 +678,6 @@ module.exports = { label: "OpenAPI", id: "docs/api/openapi/openapi-usage-guide", }, - { - type: "doc", - label: "Structured Properties", - id: "docs/api/openapi/openapi-structured-properties", - }, ], }, "docs/dev-guides/timeline", @@ -810,6 +807,8 @@ module.exports = { "docs/api/tutorials/descriptions", "docs/api/tutorials/custom-properties", "docs/api/tutorials/ml", + "docs/api/tutorials/structured-properties", + "docs/api/tutorials/forms", ], }, { diff --git a/docs/api/openapi/openapi-structured-properties.md b/docs/api/openapi/openapi-structured-properties.md deleted file mode 100644 index 8dd660698a0e8..0000000000000 --- a/docs/api/openapi/openapi-structured-properties.md +++ /dev/null @@ -1,328 +0,0 @@ -# Structured Properties - DataHub OpenAPI v2 Guide - -This guides walks through the process of creating and using a Structured Property using the `v2` version -of the DataHub OpenAPI implementation. Note that this refers to DataHub's OpenAPI version and not the version of OpenAPI itself. - -Requirements: -* curl -* jq - -## Structured Property Definition - -Before a structured property can be added to an entity it must first be defined. Here is an example -structured property being created against a local quickstart instance. 
- -### Create Property Definition - -Example Request: - -```shell -curl -X 'POST' -v \ - 'http://localhost:8080/openapi/v2/entity/structuredProperty/urn%3Ali%3AstructuredProperty%3Amy.test.MyProperty01/propertyDefinition' \ - -H 'accept: application/json' \ - -H 'Content-Type: application/json' \ - -d '{ - "qualifiedName": "my.test.MyProperty01", - "displayName": "MyProperty01", - "valueType": "urn:li:dataType:datahub.string", - "allowedValues": [ - { - "value": {"string": "foo"}, - "description": "test foo value" - }, - { - "value": {"string": "bar"}, - "description": "test bar value" - } - ], - "cardinality": "SINGLE", - "entityTypes": [ - "urn:li:entityType:datahub.dataset" - ], - "description": "test description" -}' | jq -``` - -### Read Property Definition - -Example Request: - -```shell -curl -X 'GET' -v \ - 'http://localhost:8080/openapi/v2/entity/structuredProperty/urn%3Ali%3AstructuredProperty%3Amy.test.MyProperty01/propertyDefinition' \ - -H 'accept: application/json' | jq -``` - -Example Response: - -```json -{ - "value": { - "allowedValues": [ - { - "value": { - "string": "foo" - }, - "description": "test foo value" - }, - { - "value": { - "string": "bar" - }, - "description": "test bar value" - } - ], - "qualifiedName": "my.test.MyProperty01", - "displayName": "MyProperty01", - "valueType": "urn:li:dataType:datahub.string", - "description": "test description", - "entityTypes": [ - "urn:li:entityType:datahub.dataset" - ], - "cardinality": "SINGLE" - } -} -``` - -### Delete Property Definition - -There are two types of deletion present in DataHub: `hard` and `soft` delete. As of the current release only the `soft` delete -is supported for Structured Properties. See the subsections below for more details. - -#### Soft Delete - -A `soft` deleted Structured Property does not remove any underlying data on the Structured Property entity -or the Structured Property's values written to other entities. The `soft` delete is 100% reversible with zero data loss. -When a Structured Property is `soft` deleted, a few operations are not available. - -Structured Property Soft Delete Effects: - -* Entities with a `soft` deleted Structured Property value will not return the `soft` deleted properties -* Updates to a `soft` deleted Structured Property's definition are denied -* Adding a `soft` deleted Structured Property's value to an entity is denied -* Search filters using a `soft` deleted Structured Property will be denied - -The following command will `soft` delete the test property `MyProperty01` created in this guide by writing -to the `status` aspect. - -```shell -curl -X 'POST' \ - 'http://localhost:8080/openapi/v2/entity/structuredProperty/urn%3Ali%3AstructuredProperty%3Amy.test.MyProperty01/status?systemMetadata=false' \ - -H 'accept: application/json' \ - -H 'Content-Type: application/json' \ - -d '{ -"removed": true -}' | jq -``` - -Removing the `soft` delete from the Structured Property can be done by either `hard` deleting the `status` aspect or -changing the `removed` boolean to `false. - -```shell -curl -X 'POST' \ - 'http://localhost:8080/openapi/v2/entity/structuredProperty/urn%3Ali%3AstructuredProperty%3Amy.test.MyProperty01/status?systemMetadata=false' \ - -H 'accept: application/json' \ - -H 'Content-Type: application/json' \ - -d '{ -"removed": false -}' | jq -``` - -#### Hard Delete - -⚠ **Not Implemented** ⚠ - -## Applying Structured Properties - -Structured Properties can now be added to entities which have the `structuredProperties` as aspect. 
In the following -example we'll attach and remove properties to an example dataset entity with urn `urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)`. - -### Set Structured Property Values - -This will set/replace all structured properties on the entity. See `PATCH` operations to add/remove a single property. - -```shell -curl -X 'POST' -v \ - 'http://localhost:8080/openapi/v2/entity/dataset/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2CSampleHiveDataset%2CPROD%29/structuredProperties' \ - -H 'accept: application/json' \ - -H 'Content-Type: application/json' \ - -d '{ - "properties": [ - { - "propertyUrn": "urn:li:structuredProperty:my.test.MyProperty01", - "values": [ - {"string": "foo"} - ] - } - ] -}' | jq -``` - -### Patch Structured Property Value - -For this example, we'll extend create a second structured property and apply both properties to the same -dataset used previously. After this your system should include both `my.test.MyProperty01` and `my.test.MyProperty02`. - -```shell -curl -X 'POST' -v \ - 'http://localhost:8080/openapi/v2/entity/structuredProperty/urn%3Ali%3AstructuredProperty%3Amy.test.MyProperty02/propertyDefinition' \ - -H 'accept: application/json' \ - -H 'Content-Type: application/json' \ - -d '{ - "qualifiedName": "my.test.MyProperty02", - "displayName": "MyProperty02", - "valueType": "urn:li:dataType:datahub.string", - "allowedValues": [ - { - "value": {"string": "foo2"}, - "description": "test foo2 value" - }, - { - "value": {"string": "bar2"}, - "description": "test bar2 value" - } - ], - "cardinality": "SINGLE", - "entityTypes": [ - "urn:li:entityType:datahub.dataset" - ] -}' | jq -``` - -This command will attach one of each of the two properties to our test dataset `urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)`. - -```shell -curl -X 'POST' -v \ - 'http://localhost:8080/openapi/v2/entity/dataset/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2CSampleHiveDataset%2CPROD%29/structuredProperties' \ - -H 'accept: application/json' \ - -H 'Content-Type: application/json' \ - -d '{ - "properties": [ - { - "propertyUrn": "urn:li:structuredProperty:my.test.MyProperty01", - "values": [ - {"string": "foo"} - ] - }, - { - "propertyUrn": "urn:li:structuredProperty:my.test.MyProperty02", - "values": [ - {"string": "bar2"} - ] - } - ] -}' | jq -``` - -#### Remove Structured Property Value - -The expected state of our test dataset include 2 structured properties. We'd like to remove the first one and preserve -the second property. - -```shell -curl -X 'PATCH' -v \ - 'http://localhost:8080/openapi/v2/entity/dataset/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2CSampleHiveDataset%2CPROD%29/structuredProperties' \ - -H 'accept: application/json' \ - -H 'Content-Type: application/json-patch+json' \ - -d '{ - "patch": [ - { - "op": "remove", - "path": "/properties/urn:li:structuredProperty:my.test.MyProperty01" - } - ], - "arrayPrimaryKeys": { - "properties": [ - "propertyUrn" - ] - } - }' | jq -``` - -The response will show that the expected property has been removed. - -```json -{ - "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)", - "aspects": { - "structuredProperties": { - "value": { - "properties": [ - { - "values": [ - { - "string": "bar2" - } - ], - "propertyUrn": "urn:li:structuredProperty:my.test.MyProperty02" - } - ] - } - } - } -} -``` - -#### Add Structured Property Value - -In this example, we'll add the property back with a different value, preserving the existing property. 
-
-```shell
-curl -X 'PATCH' -v \
-  'http://localhost:8080/openapi/v2/entity/dataset/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2CSampleHiveDataset%2CPROD%29/structuredProperties' \
-  -H 'accept: application/json' \
-  -H 'Content-Type: application/json-patch+json' \
-  -d '{
-    "patch": [
-      {
-        "op": "add",
-        "path": "/properties/urn:li:structuredProperty:my.test.MyProperty01",
-        "value": {
-          "propertyUrn": "urn:li:structuredProperty:my.test.MyProperty01",
-          "values": [
-            {
-              "string": "bar"
-            }
-          ]
-        }
-      }
-    ],
-    "arrayPrimaryKeys": {
-      "properties": [
-        "propertyUrn"
-      ]
-    }
-  }' | jq
-```
-
-The response shows that the property was re-added with the new value `bar` instead of the previous value `foo`.
-
-```json
-{
-  "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)",
-  "aspects": {
-    "structuredProperties": {
-      "value": {
-        "properties": [
-          {
-            "values": [
-              {
-                "string": "bar2"
-              }
-            ],
-            "propertyUrn": "urn:li:structuredProperty:my.test.MyProperty02"
-          },
-          {
-            "values": [
-              {
-                "string": "bar"
-              }
-            ],
-            "propertyUrn": "urn:li:structuredProperty:my.test.MyProperty01"
-          }
-        ]
-      }
-    }
-  }
-}
-```
diff --git a/docs/api/tutorials/forms.md b/docs/api/tutorials/forms.md
new file mode 100644
index 0000000000000..f60699ffebab5
--- /dev/null
+++ b/docs/api/tutorials/forms.md
@@ -0,0 +1,148 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# Documentation Forms
+
+## Why Would You Use Documentation Forms?
+
+Documentation Forms are a way for end-users to fill out all mandatory attributes associated with a data asset. The form is dynamically generated based on the definitions and matching rules provided by administrators and stewards.
+
+Learn more about forms in the [Documentation Forms Feature Guide](../../../docs/features/feature-guides/documentation-forms.md).
+
+
+### Goal Of This Guide
+This guide will show you how to create and read forms.
+
+## Prerequisites
+
+For this tutorial, you need to deploy DataHub Quickstart and ingest sample data.
+For detailed information, please refer to [Datahub Quickstart Guide](/docs/quickstart.md).
+
+
+
+Install the relevant CLI version. Forms are available as of CLI version `0.13.1`. The corresponding SaaS release version is `v0.2.16.5`.
+Connect to your instance via [init](https://datahubproject.io/docs/cli/#init):
+
+1. Run `datahub init` to update the instance you want to load into
+2. Set the server to your sandbox instance, `https://{your-instance-address}/gms`
+3. Set the token to your access token
+
+
+
+
+## Create a Form
+
+
+
+Create a yaml file representing the forms you’d like to load.
+For example, the file below represents a form `123456`. You can see the full example [here](https://github.com/datahub-project/datahub/blob/example-yaml-sp/metadata-ingestion/examples/forms/forms.yaml).
+
+
+```yaml
+- id: 123456
+  # urn: "urn:li:form:123456" # optional if id is provided
+  type: VERIFICATION # Supported Types: DOCUMENTATION, VERIFICATION
+  name: "Metadata Initiative 2023"
+  description: "How we want to ensure the most important data assets in our organization have all of the most important and expected pieces of metadata filled out"
+  prompts:
+    - id: "123"
+      title: "Retention Time"
+      description: "Apply Retention Time structured property to form"
+      type: STRUCTURED_PROPERTY
+      structured_property_id: io.acryl.privacy.retentionTime
+      required: True # optional, will default to True
+  entities: # Either pass a list of urns or a group of filters. This example shows a list of urns
+    urns:
+      - urn:li:dataset:(urn:li:dataPlatform:hdfs,SampleHdfsDataset,PROD)
+  # optionally assign the form to a specific set of users and/or groups
+  # when omitted, form will be assigned to Asset owners
+  actors:
+    users:
+      - urn:li:corpuser:jane@email.com # note: these should be urns
+      - urn:li:corpuser:john@email.com
+    groups:
+      - urn:li:corpGroup:team@email.com # note: these should be urns
+```
+
+:::note
+Note that the structured properties and related entities should be created before you create the form.
+Please refer to the [Structured Properties Tutorial](/docs/api/tutorials/structured-properties.md) for more information.
+:::
+
+
+You can apply forms to either a list of entity urns, or a list of filters. For a list of entity urns, use this structure:
+
+```
+entities:
+  urns:
+    - urn:li:dataset:...
+```
+
+For a list of filters, use this structure:
+
+```
+entities:
+  filters:
+    types:
+      - dataset # you can use entity type name or urn
+    platforms:
+      - snowflake # you can use platform name or urn
+    domains:
+      - urn:li:domain:finance # you must use domain urn
+    containers:
+      - urn:li:container:my_container # you must use container urn
+```
+
+Note that you can filter to entity types, platforms, domains, and/or containers.
+
+Use the CLI to create your forms:
+
+```commandline
+datahub forms upsert -f {forms_yaml}
+```
+
+If successful, you should see `Created form urn:li:form:...`
+
+
+
+## Read a Form
+
+
+
+You can see the forms you created by running the following command:
+
+```commandline
+datahub forms get --urn {urn}
+```
+For example, you can run `datahub forms get --urn urn:li:form:123456`.
+
+If successful, you should see metadata about your form returned like below.
+
+```json
+{
+  "urn": "urn:li:form:123456",
+  "name": "Metadata Initiative 2023",
+  "description": "How we want to ensure the most important data assets in our organization have all of the most important and expected pieces of metadata filled out",
+  "prompts": [
+    {
+      "id": "123",
+      "title": "Retention Time",
+      "description": "Apply Retention Time structured property to form",
+      "type": "STRUCTURED_PROPERTY",
+      "structured_property_urn": "urn:li:structuredProperty:io.acryl.privacy.retentionTime"
+    }
+  ],
+  "type": "VERIFICATION"
+}
```
+
+
diff --git a/docs/api/tutorials/structured-properties.md b/docs/api/tutorials/structured-properties.md
new file mode 100644
index 0000000000000..c32e92e58e8c7
--- /dev/null
+++ b/docs/api/tutorials/structured-properties.md
@@ -0,0 +1,567 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# Structured Properties
+
+## Why Would You Use Structured Properties?
+
+Structured properties are a structured, named set of properties that can be attached to logical entities like Datasets, DataJobs, etc.
+Structured properties have typed values. Conceptually, they are like “field definitions”.
+
+Learn more about structured properties in the [Structured Properties Feature Guide](../../../docs/features/feature-guides/properties.md).
+
+
+### Goal Of This Guide
+
+This guide will show you how to execute the following actions with structured properties.
+- Create structured properties
+- Read structured properties
+- Delete structured properties (soft delete)
+- Add structured properties to a dataset
+- Patch structured properties (add / remove / update a single property)
+
+## Prerequisites
+
+For this tutorial, you need to deploy DataHub Quickstart and ingest sample data.
+For detailed information, please refer to [Datahub Quickstart Guide](/docs/quickstart.md).
+
+Additionally, you need to have the following tools installed according to the method you choose to interact with DataHub:
+
+
+
+Install the relevant CLI version. Structured Properties are available as of CLI version `0.13.1`. The corresponding SaaS release version is `v0.2.16.5`.
+Connect to your instance via [init](https://datahubproject.io/docs/cli/#init):
+
+- Run `datahub init` to update the instance you want to load into.
+- Set the server to your sandbox instance, `https://{your-instance-address}/gms`.
+- Set the token to your access token.
+
+
+
+
+Requirements for OpenAPI are:
+* curl
+* jq
+
+
+
+
+## Create Structured Properties
+
+The following code will create a structured property `io.acryl.privacy.retentionTime`.
+
+
+
+Create a yaml file representing the properties you’d like to load.
+For example, the file below represents a property `io.acryl.privacy.retentionTime`. You can see the full example [here](https://github.com/datahub-project/datahub/blob/example-yaml-sp/metadata-ingestion/examples/structured_properties/struct_props.yaml).
+
+```yaml
+- id: io.acryl.privacy.retentionTime
+  # - urn: urn:li:structuredProperty:io.acryl.privacy.retentionTime # optional if id is provided
+  qualified_name: io.acryl.privacy.retentionTime # required if urn is provided
+  type: number
+  cardinality: MULTIPLE
+  display_name: Retention Time
+  entity_types:
+    - dataset # or urn:li:entityType:datahub.dataset
+    - dataFlow
+  description: "Retention Time is used to figure out how long to retain records in a dataset"
+  allowed_values:
+    - value: 30
+      description: 30 days, usually reserved for datasets that are ephemeral and contain pii
+    - value: 90
+      description: Use this for datasets that drive monthly reporting but contain pii
+    - value: 365
+      description: Use this for non-sensitive data that can be retained for longer
+```
+
+Use the CLI to create your properties:
+```commandline
+datahub properties upsert -f {properties_yaml}
+```
+
+If successful, you should see `Created structured property urn:li:structuredProperty:...`
+
+
+
+```commandline
+curl -X 'POST' -v \
+  'http://localhost:8080/openapi/v2/entity/structuredProperty/urn%3Ali%3AstructuredProperty%3Aio.acryl.privacy.retentionTime/propertyDefinition' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "qualifiedName": "io.acryl.privacy.retentionTime",
+    "valueType": "urn:li:dataType:datahub.number",
+    "description": "Retention Time is used to figure out how long to retain records in a dataset",
+    "displayName": "Retention Time",
+    "cardinality": "MULTIPLE",
+    "entityTypes": [
+        "urn:li:entityType:datahub.dataset",
+        "urn:li:entityType:datahub.dataFlow"
+    ],
+    "allowedValues": [
+        {
+            "value": {"double": 30},
+            "description": "30 days, usually reserved for datasets that are ephemeral and contain pii"
+        },
+        {
+            "value": {"double": 90},
+            "description": "Use this for datasets that drive monthly reporting but contain pii"
+        },
+        {
+            "value": {"double": 365},
+            "description": "Use this for non-sensitive data that can be retained for longer"
+        }
+    ]
+}' | jq
+```
+
+
+## Read Structured Properties
+
+You can see the properties you created by running the following command:
+
+
+
+
+```commandline
+datahub properties get --urn {urn}
+```
+For example, you can run `datahub properties get --urn urn:li:structuredProperty:io.acryl.privacy.retentionTime`.
+If successful, you should see metadata about your properties returned.
+
+```commandline
+{
+  "urn": "urn:li:structuredProperty:io.acryl.privacy.retentionTime",
+  "qualified_name": "io.acryl.privacy.retentionTime",
+  "type": "urn:li:dataType:datahub.number",
+  "description": "Retention Time is used to figure out how long to retain records in a dataset",
+  "display_name": "Retention Time",
+  "entity_types": [
+    "urn:li:entityType:datahub.dataset",
+    "urn:li:entityType:datahub.dataFlow"
+  ],
+  "cardinality": "MULTIPLE",
+  "allowed_values": [
+    {
+      "value": "30",
+      "description": "30 days, usually reserved for datasets that are ephemeral and contain pii"
+    },
+    {
+      "value": "90",
+      "description": "Use this for datasets that drive monthly reporting but contain pii"
+    },
+    {
+      "value": "365",
+      "description": "Use this for non-sensitive data that can be retained for longer"
+    }
+  ]
+}
+```
+
+
+
+Example Request:
+```
+curl -X 'GET' -v \
+  'http://localhost:8080/openapi/v2/entity/structuredProperty/urn%3Ali%3AstructuredProperty%3Aio.acryl.privacy.retentionTime/propertyDefinition' \
+  -H 'accept: application/json' | jq
+```
+
+Example Response:
+
+```commandline
+{
+  "value": {
+    "allowedValues": [
+      {
+        "value": {
+          "double": 30.0
+        },
+        "description": "30 days, usually reserved for datasets that are ephemeral and contain pii"
+      },
+      {
+        "value": {
+          "double": 90.0
+        },
+        "description": "Use this for datasets that drive monthly reporting but contain pii"
+      },
+      {
+        "value": {
+          "double": 365.0
+        },
+        "description": "Use this for non-sensitive data that can be retained for longer"
+      }
+    ],
+    "qualifiedName": "io.acryl.privacy.retentionTime",
+    "displayName": "Retention Time",
+    "valueType": "urn:li:dataType:datahub.number",
+    "description": "Retention Time is used to figure out how long to retain records in a dataset",
+    "entityTypes": [
+      "urn:li:entityType:datahub.dataset",
+      "urn:li:entityType:datahub.dataFlow"
+    ],
+    "cardinality": "MULTIPLE"
+  }
+}
+```
+
+
+
+
+## Set Structured Property To a Dataset
+
+This action will set/replace all structured properties on the entity. See PATCH operations to add/remove a single property.
+
+
+
+You can set structured properties on a dataset by creating a dataset yaml file with structured properties. For example, below is a dataset yaml file with structured properties at both the field and dataset level.
+
+Please refer to the [full example here.](https://github.com/datahub-project/datahub/blob/example-yaml-sp/metadata-ingestion/examples/structured_properties/datasets.yaml)
+
+```yaml
+- id: user_clicks_snowflake
+  platform: snowflake
+  schema:
+    fields:
+      - id: user_id
+        structured_properties:
+          io.acryl.dataManagement.deprecationDate: "2023-01-01"
+  structured_properties:
+    io.acryl.dataManagement.replicationSLA: 90
+```
+
+Use the CLI to upsert your dataset yaml file:
+```commandline
+datahub dataset upsert -f {dataset_yaml}
+```
+If successful, you should see `Update succeeded for urn:li:dataset:...`
+
+
+
+
+
+The following command will set the structured property `retentionTime` to `90` on the dataset `urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)`.
+Please note that the structured property and the dataset must exist before executing this command.
(You can create sample datasets using `datahub docker ingest-sample-data`.)
+
+```commandline
+curl -X 'POST' -v \
+  'http://localhost:8080/openapi/v2/entity/dataset/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2CSampleHiveDataset%2CPROD%29/structuredProperties' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+  "properties": [
+    {
+      "propertyUrn": "urn:li:structuredProperty:io.acryl.privacy.retentionTime",
+      "values": [
+        {"string": "90"}
+      ]
+    }
+  ]
+}' | jq
+```
+
+
+
+#### Expected Outcomes
+
+Once your datasets are uploaded, you can view them in the UI and view the properties associated with them under the Properties tab.
+

+ +

+
+Or you can run the following command to view the properties associated with the dataset:
+
+```commandline
+datahub dataset get --urn {urn}
+```
+
+## Patch Structured Property Value
+
+This section will show you how to patch a structured property value - either by removing, adding, or upserting a single property.
+
+### Add Structured Property Value
+
+For this example, we'll create a second structured property and apply both properties to the same dataset used previously.
+After this your system should include both `io.acryl.privacy.retentionTime` and `io.acryl.privacy.retentionTime02`.
+
+
+
+Let's start by creating the second structured property.
+
+```
+curl -X 'POST' -v \
+  'http://localhost:8080/openapi/v2/entity/structuredProperty/urn%3Ali%3AstructuredProperty%3Aio.acryl.privacy.retentionTime02/propertyDefinition' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "qualifiedName": "io.acryl.privacy.retentionTime02",
+    "displayName": "Retention Time 02",
+    "valueType": "urn:li:dataType:datahub.string",
+    "allowedValues": [
+        {
+            "value": {"string": "foo2"},
+            "description": "test foo2 value"
+        },
+        {
+            "value": {"string": "bar2"},
+            "description": "test bar2 value"
+        }
+    ],
+    "cardinality": "SINGLE",
+    "entityTypes": [
+        "urn:li:entityType:datahub.dataset"
+    ]
+}' | jq
+```
+
+The next command will attach one of each of the two properties to our test dataset `urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)`.
+Specifically, it will set `io.acryl.privacy.retentionTime` as `90` and `io.acryl.privacy.retentionTime02` as `bar2`.
+
+
+```
+curl -X 'POST' -v \
+  'http://localhost:8080/openapi/v2/entity/dataset/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2CSampleHiveDataset%2CPROD%29/structuredProperties' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+  "properties": [
+    {
+      "propertyUrn": "urn:li:structuredProperty:io.acryl.privacy.retentionTime",
+      "values": [
+        {"string": "90"}
+      ]
+    },
+    {
+      "propertyUrn": "urn:li:structuredProperty:io.acryl.privacy.retentionTime02",
+      "values": [
+        {"string": "bar2"}
+      ]
+    }
+  ]
+}' | jq
+```
+
+
+
+#### Expected Outcomes
+You can see that the dataset now has two structured properties attached to it.
+
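+To double-check over the API, a plain GET against the same aspect endpoint should echo back both values just written (a sketch assuming the generic aspect read endpoint; the exact response envelope may vary between versions):
+
+```
+curl -X 'GET' -v \
+  'http://localhost:8080/openapi/v2/entity/dataset/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2CSampleHiveDataset%2CPROD%29/structuredProperties' \
+  -H 'accept: application/json' | jq
+```
+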

+ +

+
+### Remove Structured Property Value
+
+The expected state of our test dataset includes two structured properties.
+We'd like to remove the first one (`io.acryl.privacy.retentionTime`) and preserve the second one (`io.acryl.privacy.retentionTime02`).
+
+
+
+```
+curl -X 'PATCH' -v \
+  'http://localhost:8080/openapi/v2/entity/dataset/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2CSampleHiveDataset%2CPROD%29/structuredProperties' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json-patch+json' \
+  -d '{
+    "patch": [
+      {
+        "op": "remove",
+        "path": "/properties/urn:li:structuredProperty:io.acryl.privacy.retentionTime"
+      }
+    ],
+    "arrayPrimaryKeys": {
+      "properties": [
+        "propertyUrn"
+      ]
+    }
+  }' | jq
+```
+The response will show that the expected property has been removed.
+
+```
+{
+  "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)",
+  "aspects": {
+    "structuredProperties": {
+      "value": {
+        "properties": [
+          {
+            "values": [
+              {
+                "string": "bar2"
+              }
+            ],
+            "propertyUrn": "urn:li:structuredProperty:io.acryl.privacy.retentionTime02"
+          }
+        ]
+      }
+    }
+  }
+}
+```
+
+
+#### Expected Outcomes
+You can see that the first property has been removed and the second property is still present.
+

+ +

+
+### Upsert Structured Property Value
+
+In this example, we'll add the property back with a different value, preserving the existing property.
+
+
+
+```
+curl -X 'PATCH' -v \
+  'http://localhost:8080/openapi/v2/entity/dataset/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2CSampleHiveDataset%2CPROD%29/structuredProperties' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json-patch+json' \
+  -d '{
+    "patch": [
+      {
+        "op": "add",
+        "path": "/properties/urn:li:structuredProperty:io.acryl.privacy.retentionTime",
+        "value": {
+          "propertyUrn": "urn:li:structuredProperty:io.acryl.privacy.retentionTime",
+          "values": [
+            {
+              "string": "365"
+            }
+          ]
+        }
+      }
+    ],
+    "arrayPrimaryKeys": {
+      "properties": [
+        "propertyUrn"
+      ]
+    }
+  }' | jq
+```
+
+Below is the expected response:
+```
+{
+  "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)",
+  "aspects": {
+    "structuredProperties": {
+      "value": {
+        "properties": [
+          {
+            "values": [
+              {
+                "string": "bar2"
+              }
+            ],
+            "propertyUrn": "urn:li:structuredProperty:io.acryl.privacy.retentionTime02"
+          },
+          {
+            "values": [
+              {
+                "string": "365"
+              }
+            ],
+            "propertyUrn": "urn:li:structuredProperty:io.acryl.privacy.retentionTime"
+          }
+        ]
+      }
+    }
+  }
+}
+```
+
+The response shows that the property was re-added with the new value `365` instead of the previous value `90`.
+
+
+
+#### Expected Outcomes
+You can see that the first property has been added back with a new value and the second property is still present.
+

+ +

+ + + +## Delete Structured Properties + +There are two types of deletion present in DataHub: hard and soft delete. As of the current release only the soft delete is supported for Structured Properties. + +:::note SOFT DELETE +A soft deleted Structured Property does not remove any underlying data on the Structured Property entity or the Structured Property's values written to other entities. The soft delete is 100% reversible with zero data loss. When a Structured Property is soft deleted, a few operations are not available. + +Structured Property Soft Delete Effects: + +- Entities with a soft deleted Structured Property value will not return the soft deleted properties +- Updates to a soft deleted Structured Property's definition are denied +- Adding a soft deleted Structured Property's value to an entity is denied +- Search filters using a soft deleted Structured Property will be denied +::: + + + + + +The following command will soft delete the test property. + +```commandline +datahub delete --urn {urn} +``` + + + + +The following command will soft delete the test property by writing to the status aspect. + +``` +curl -X 'POST' \ + 'http://localhost:8080/openapi/v2/entity/structuredProperty/urn%3Ali%3AstructuredProperty%3Aio.acryl.privacy.retentionTime/status?systemMetadata=false' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ +"removed": true +}' | jq +``` + +If you want to **remove the soft delete**, you can do so by either hard deleting the status aspect or changing the removed boolean to `false` like below. + +``` +curl -X 'POST' \ + 'http://localhost:8080/openapi/v2/entity/structuredProperty/urn%3Ali%3AstructuredProperty%3Aio.acryl.privacy.retentionTime/status?systemMetadata=false' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ +"removed": false +}' | jq +``` + + + diff --git a/docs/features/feature-guides/documentation-forms.md b/docs/features/feature-guides/documentation-forms.md new file mode 100644 index 0000000000000..8b2966810de7c --- /dev/null +++ b/docs/features/feature-guides/documentation-forms.md @@ -0,0 +1,113 @@ +import FeatureAvailability from '@site/src/components/FeatureAvailability'; + +# About DataHub Documentation Forms + + +DataHub Documentation Forms streamline the process of setting documentation requirements and delegating annotation responsibilities to the relevant data asset owners, stewards, and subject matter experts. + +Forms are highly configurable, making it easy to ask the right questions of the right people, for a specific set of assets. + +## What are Documentation Forms? + +You can think of Documentation Forms as a survey for your data assets: a set of questions that must be answered in order for an asset to be considered properly documented. + +Verification Forms are an extension of Documentation Forms, requiring a final verification, or sign-off, on all responses before the asset can be considered Verified. This is useful for compliance and/or governance annotation initiatives where you want assignees to provide a final acknowledgement that the information provided is correct. + +## Creating and Assigning Documentation Forms + +Documentation Forms are defined via YAML with the following details: + +- Name and Description to help end-users understand the scope and use case +- Form Type, either Documentation or Verification + - Verification Forms require a final signoff, i.e. 
Verification, of all required questions before the Form can be considered complete +- Form Questions (aka "prompts") for end-users to complete + - Questions can be assigned at the asset-level and/or the field-level + - Asset-level questions can be configured to be required; by default, all questions are optional +- Assigned Assets, defined by: + - A set of specific asset URNs, OR + - Assets related to a set of filters, such as Type (Datasets, Dashboards, etc.), Platform (Snowflake, Looker, etc.), Domain (Product, Marketing, etc.), or Container (Schema, Folder, etc.) +- Optional: Form Assignees + - Optionally assign specific DataHub users/groups to complete the Form for all relevant assets + - If omitted, any Owner of an Asset can complete Forms assigned to that Asset + +Here's an example of defining a Documentation Form via YAML: +```yaml +- id: 123456 + # urn: "urn:li:form:123456" # optional if id is provided + type: VERIFICATION # Supported Types: DOCUMENTATION, VERIFICATION + name: "Metadata Initiative 2024" + description: "How we want to ensure the most important data assets in our organization have all of the most important and expected pieces of metadata filled out" + prompts: # Questions for Form assignees to complete + - id: "123" + title: "Data Retention Time" + description: "Apply Retention Time structured property to form" + type: STRUCTURED_PROPERTY + structured_property_id: io.acryl.privacy.retentionTime + required: True # optional; default value is False + entities: # Either pass a list of urns or a group of filters. This example shows a list of urns + urns: + - urn:li:dataset:(urn:li:dataPlatform:hdfs,SampleHdfsDataset,PROD) + # optionally assign the form to a specific set of users and/or groups + # when omitted, form will be assigned to Asset owners + actors: + users: + - urn:li:corpuser:jane@email.com # note: these should be URNs + - urn:li:corpuser:john@email.com + groups: + - urn:li:corpGroup:team@email.com # note: these should be URNs + +``` + +:::note +Documentation Forms currently only support defining Structured Properties as Form Questions +::: + + + + + +## Additional Resources + +### Videos + +**Asset Verification in Acryl Cloud** + +

+ +

+
+## FAQ and Troubleshooting
+
+**What is the difference between Documentation and Verification Forms?**
+
+Both form types are a way to configure a set of optional and/or required questions for DataHub users to complete. When using Verification Forms, users will be presented with a final verification step once all required questions have been completed; you can think of this as a final acknowledgement of the accuracy of information submitted.
+
+**Who is able to complete Forms in DataHub?**
+
+By default, any owner of an Asset will be able to respond to questions assigned via a Form.
+
+When assigning a Form to an Asset, you can optionally assign specific DataHub users/groups to fill them out.
+
+**Can I assign multiple Forms to a single asset?**
+
+You sure can! Please keep in mind that an Asset will only be considered Documented or Verified if all required questions are completed on all assigned Forms.
+
+### API Tutorials
+
+- [Create a Documentation Form](../../../docs/api/tutorials/forms.md)
+
+:::note
+You must create a Structured Property before including it in a Documentation Form.
+To learn more about creating Structured Properties via CLI, please see the [Create Structured Properties](/docs/api/tutorials/structured-properties.md) tutorial.
+:::
+
+### Related Features
+
+- [DataHub Properties](/docs/features/feature-guides/properties.md)
\ No newline at end of file
diff --git a/docs/features/feature-guides/properties.md b/docs/features/feature-guides/properties.md
new file mode 100644
index 0000000000000..0d961b9ceac4f
--- /dev/null
+++ b/docs/features/feature-guides/properties.md
@@ -0,0 +1,158 @@
+import FeatureAvailability from '@site/src/components/FeatureAvailability';
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# About DataHub Properties
+
+
+DataHub Custom Properties and Structured Properties are powerful tools to collect meaningful metadata for Assets that might not perfectly fit into other Aspects within DataHub, such as Glossary Terms, Tags, etc. Both types can be found in an Asset's Properties tab:
+

+ +

+ +This guide will explain the differences and use cases of each property type. + +## What are Custom Properties and Structured Properties? +Here are the differences between the two property types at a glance: + +| Custom Properties | Structured Properties | +| --- | --- | +| Map of key-value pairs stored as strings | Validated namespaces and data types | +| Added to assets during ingestion and via API | Defined via YAML; created and added to assets via CLI | +| No support for UI-based Edits | Support for UI-based edits | + +**Custom Properties** are key-value pairs of strings that capture additional information about assets that is not readily available in standard metadata fields. Custom Properties can be added to assets automatically during ingestion or programmatically via API and *cannot* be edited via the UI. +
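+For illustration, one way to write Custom Properties programmatically is through the same OpenAPI aspect endpoints used elsewhere in these docs, targeting the `datasetProperties` aspect, whose `customProperties` field is a simple string-to-string map (a sketch with made-up keys; note that POSTing the aspect replaces its previous contents, and the ingestion framework and SDKs are the more common routes):
+
+```
+curl -X 'POST' \
+  'http://localhost:8080/openapi/v2/entity/dataset/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2CSampleHiveDataset%2CPROD%29/datasetProperties' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+  "customProperties": {
+    "encoding": "utf-8",
+    "cluster_region": "us-east-1"
+  }
+}' | jq
+```
+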

+ +

+

Example of Custom Properties assigned to a Dataset

+ +**Structured Properties** are an extension of Custom Properties, providing a structured and validated way to attach metadata to DataHub Assets. Available as of v0.13.1, Structured Properties have a pre-defined type (Date, Integer, URN, String, etc.). They can be configured to only accept a specific set of allowed values, making it easier to ensure high levels of data quality and consistency. Structured Properties are defined via YAML, added to assets via CLI, and can be edited via the UI. +
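+Putting the pieces together, the lifecycle described above boils down to three CLI calls (the file names and URN below are placeholders; the commands themselves are covered in the Structured Properties tutorial):
+
+```commandline
+# Define the property in YAML, then create/update it
+datahub properties upsert -f structured_props.yaml
+
+# Inspect the resulting definition
+datahub properties get --urn urn:li:structuredProperty:io.acryl.privacy.retentionTime
+
+# Attach values by upserting a dataset YAML that references the property
+datahub dataset upsert -f datasets.yaml
+```
+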

+ +

+

Example of Structured Properties assigned to a Dataset

+ +## Use Cases for Custom Properties and Structured Properties +**Custom Properties** are useful for capturing raw metadata from source systems during ingestion or programmatically via API. Some examples include: + +- GitHub file location of code which generated a dataset +- Data encoding type +- Account ID, cluster size, and region where a dataset is stored + +**Structured Properties** are useful for setting and enforcing standards of metadata collection, particularly in support of compliance and governance initiatives. Values can be added programmatically via API, then manually via the DataHub UI as necessary. Some examples include: + +- Deprecation Date + - Type: Date, Single Select + - Validation: Must be formatted as 'YYYY-MM-DD' +- Data Retention Period + - Type: String, Single Select + - Validation: Adheres to allowed values "30 Days", "90 Days", "365 Days", or "Indefinite" +- Consulted Compliance Officer, chosen from a list of DataHub users + - Type: DataHub User, Multi-Select + - Validation: Must be valid DataHub User URN + +By using Structured Properties, compliance and governance officers can ensure consistency in data collection across assets. + +## Creating, Assigning, and Editing Structured Properties + +Structured Properties are defined via YAML, then created and assigned to DataHub Assets via the DataHub CLI. + +Here's how we would define the above examples in YAML: + + + + +```yaml +- id: deprecation_date + qualified_name: deprecation_date + type: date # Supported types: date, string, number, urn, rich_text + cardinality: SINGLE # Supported options: SINGLE, MULTIPLE + display_name: Deprecation Date + description: "Scheduled date when resource will be deprecated in the source system" + entity_types: # Define which types of DataHub Assets the Property can be assigned to + - dataset +``` + + + + +```yaml +- id: retention_period + qualified_name: retention_period + type: string # Supported types: date, string, number, urn, rich_text + cardinality: SINGLE # Supported options: SINGLE, MULTIPLE + display_name: Data Retention Period + description: "Predetermined storage duration before being deleted or archived + based on legal, regulatory, or organizational requirements" + entity_types: # Define which types of DataHub Assets the Property can be assigned to + - dataset + allowed_values: + - value: "30 Days" + description: "Use this for datasets that are ephemeral and contain PII" + - value: "90 Days" + description: "Use this for datasets that drive monthly reporting but contain PII" + - value: "365 Days" + description: "Use this for non-sensitive data that can be retained for longer" + - value: "Indefinite" + description: "Use this for non-sensitive data that can be retained indefinitely" +``` + + + + +```yaml +- id: compliance_officer + qualified_name: compliance_officer + type: urn # Supported types: date, string, number, urn, rich_text + cardinality: MULTIPLE # Supported options: SINGLE, MULTIPLE + display_name: Consulted Compliance Officer(s) + description: "Member(s) of the Compliance Team consulted/informed during audit" + type_qualifier: # Define the type of Asset URNs to allow + - corpuser + - corpGroup + entity_types: # Define which types of DataHub Assets the Property can be assigned to + - dataset +``` + + + + +:::note +To learn more about creating and assigning Structured Properties via CLI, please see the [Create Structured Properties](/docs/api/tutorials/structured-properties.md) tutorial. 
+::: + +Once a Structured Property is assigned to an Asset, Users with the `Edit Properties` Metadata Privilege will be able to change Structured Property values via the DataHub UI. +

+ +

+

Example of editing the value of a Structured Property via the UI

+ +### Videos + +**Deep Dive: UI-Editable Properties** + +

+ +

+
+
+### API
+
+Please see the following API guides related to Custom and Structured Properties:
+
+- [Custom Properties API Guide](/docs/api/tutorials/custom-properties.md)
+- [Structured Properties API Guide](/docs/api/tutorials/structured-properties.md)
+
+
+## FAQ and Troubleshooting
+
+**Why can't I edit the value of a Structured Property from the DataHub UI?**
+1. Your version of DataHub does not support UI-based edits of Structured Properties. Confirm you are running DataHub v0.13.1 or later.
+2. You are attempting to edit a Custom Property, not a Structured Property. Confirm you are trying to edit a Structured Property, which will have an "Edit" button visible. Please note that Custom Properties are not eligible for UI-based edits to minimize overwrites during recurring ingestion.
+3. You do not have the necessary privileges. Confirm with your Admin that you have the `Edit Properties` Metadata Privilege.
+
+### Related Features
+
+- [Documentation Forms](/docs/features/feature-guides/documentation-forms.md)
\ No newline at end of file