diff --git a/build.gradle b/build.gradle index 437ac4fbf6f32..c6e14081c6147 100644 --- a/build.gradle +++ b/build.gradle @@ -250,6 +250,7 @@ project.ext.externalDependency = [ 'springBootStarterValidation': "org.springframework.boot:spring-boot-starter-validation:$springBootVersion", 'springKafka': "org.springframework.kafka:spring-kafka:$springKafkaVersion", 'springActuator': "org.springframework.boot:spring-boot-starter-actuator:$springBootVersion", + 'springRetry': "org.springframework.retry:spring-retry:2.0.6", 'swaggerAnnotations': 'io.swagger.core.v3:swagger-annotations:2.2.15', 'swaggerCli': 'io.swagger.codegen.v3:swagger-codegen-cli:3.0.46', 'swaggerCore': 'io.swagger.core.v3:swagger-core:2.2.7', diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/Constants.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/Constants.java index 0924dbc0c0a6d..e55f1fd5ecf5b 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/Constants.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/Constants.java @@ -20,6 +20,7 @@ private Constants() {} public static final String LINEAGE_SCHEMA_FILE = "lineage.graphql"; public static final String PROPERTIES_SCHEMA_FILE = "properties.graphql"; public static final String FORMS_SCHEMA_FILE = "forms.graphql"; + public static final String ASSERTIONS_SCHEMA_FILE = "assertions.graphql"; public static final String INCIDENTS_SCHEMA_FILE = "incident.graphql"; public static final String CONNECTIONS_SCHEMA_FILE = "connection.graphql"; public static final String BROWSE_PATH_DELIMITER = "/"; diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java index 5315a444d07b7..50a73817678ee 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java @@ -118,7 +118,12 @@ import com.linkedin.datahub.graphql.resolvers.assertion.AssertionRunEventResolver; import com.linkedin.datahub.graphql.resolvers.assertion.DeleteAssertionResolver; import com.linkedin.datahub.graphql.resolvers.assertion.EntityAssertionsResolver; -import com.linkedin.datahub.graphql.resolvers.auth.*; +import com.linkedin.datahub.graphql.resolvers.auth.CreateAccessTokenResolver; +import com.linkedin.datahub.graphql.resolvers.auth.DebugAccessResolver; +import com.linkedin.datahub.graphql.resolvers.auth.GetAccessTokenMetadataResolver; +import com.linkedin.datahub.graphql.resolvers.auth.GetAccessTokenResolver; +import com.linkedin.datahub.graphql.resolvers.auth.ListAccessTokensResolver; +import com.linkedin.datahub.graphql.resolvers.auth.RevokeAccessTokenResolver; import com.linkedin.datahub.graphql.resolvers.browse.BrowsePathsResolver; import com.linkedin.datahub.graphql.resolvers.browse.BrowseResolver; import com.linkedin.datahub.graphql.resolvers.browse.EntityBrowsePathsResolver; @@ -814,6 +819,7 @@ public GraphQLEngine.Builder builder() { .addSchema(fileBasedSchema(PROPERTIES_SCHEMA_FILE)) .addSchema(fileBasedSchema(FORMS_SCHEMA_FILE)) .addSchema(fileBasedSchema(CONNECTIONS_SCHEMA_FILE)) + .addSchema(fileBasedSchema(ASSERTIONS_SCHEMA_FILE)) .addSchema(fileBasedSchema(INCIDENTS_SCHEMA_FILE)); for (GmsGraphQLPlugin plugin : this.graphQLPlugins) { diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/ResolverUtils.java 
b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/ResolverUtils.java index 74d9e7f8a8c57..f2682ad050c86 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/ResolverUtils.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/ResolverUtils.java @@ -239,6 +239,9 @@ public static Filter viewFilter( return null; } DataHubViewInfo viewInfo = resolveView(opContext, viewService, UrnUtils.getUrn(viewUrn)); + if (viewInfo == null) { + return null; + } Filter result = SearchUtils.combineFilters(null, viewInfo.getDefinition().getFilter()); return result; } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/assertion/AssertionRunEventResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/assertion/AssertionRunEventResolver.java index 3ca78d643679b..18f8ad85668d8 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/assertion/AssertionRunEventResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/assertion/AssertionRunEventResolver.java @@ -98,6 +98,16 @@ public CompletableFuture get(DataFetchingEnvironment e && AssertionResultType.SUCCESS.equals( runEvent.getResult().getType())) .count())); + result.setErrored( + Math.toIntExact( + runEvents.stream() + .filter( + runEvent -> + AssertionRunStatus.COMPLETE.equals(runEvent.getStatus()) + && runEvent.getResult() != null + && AssertionResultType.ERROR.equals( + runEvent.getResult().getType())) + .count())); result.setRunEvents(runEvents); return result; } catch (RemoteInvocationException e) { diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/AssertionMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/AssertionMapper.java index ca13792b1e92b..1e7fac2edbc9a 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/AssertionMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/AssertionMapper.java @@ -2,6 +2,8 @@ import static com.linkedin.metadata.Constants.GLOBAL_TAGS_ASPECT_NAME; +import com.linkedin.assertion.AssertionAction; +import com.linkedin.assertion.AssertionActions; import com.linkedin.assertion.AssertionInfo; import com.linkedin.common.DataPlatformInstance; import com.linkedin.common.GlobalTags; @@ -10,24 +12,40 @@ import com.linkedin.data.DataMap; import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.generated.Assertion; +import com.linkedin.datahub.graphql.generated.AssertionActionType; +import com.linkedin.datahub.graphql.generated.AssertionSource; +import com.linkedin.datahub.graphql.generated.AssertionSourceType; import com.linkedin.datahub.graphql.generated.AssertionStdAggregation; import com.linkedin.datahub.graphql.generated.AssertionStdOperator; import com.linkedin.datahub.graphql.generated.AssertionStdParameter; import com.linkedin.datahub.graphql.generated.AssertionStdParameterType; import com.linkedin.datahub.graphql.generated.AssertionStdParameters; import com.linkedin.datahub.graphql.generated.AssertionType; +import com.linkedin.datahub.graphql.generated.AuditStamp; import com.linkedin.datahub.graphql.generated.DataPlatform; import com.linkedin.datahub.graphql.generated.DatasetAssertionInfo; import com.linkedin.datahub.graphql.generated.DatasetAssertionScope; +import 
com.linkedin.datahub.graphql.generated.DateInterval; import com.linkedin.datahub.graphql.generated.EntityType; +import com.linkedin.datahub.graphql.generated.FieldAssertionInfo; +import com.linkedin.datahub.graphql.generated.FixedIntervalSchedule; +import com.linkedin.datahub.graphql.generated.FreshnessAssertionInfo; +import com.linkedin.datahub.graphql.generated.SchemaAssertionCompatibility; +import com.linkedin.datahub.graphql.generated.SchemaAssertionField; +import com.linkedin.datahub.graphql.generated.SchemaAssertionInfo; import com.linkedin.datahub.graphql.generated.SchemaFieldRef; +import com.linkedin.datahub.graphql.generated.SqlAssertionInfo; +import com.linkedin.datahub.graphql.generated.VolumeAssertionInfo; import com.linkedin.datahub.graphql.types.common.mappers.DataPlatformInstanceAspectMapper; import com.linkedin.datahub.graphql.types.common.mappers.StringMapMapper; +import com.linkedin.datahub.graphql.types.dataset.mappers.SchemaFieldMapper; +import com.linkedin.datahub.graphql.types.dataset.mappers.SchemaMetadataMapper; import com.linkedin.datahub.graphql.types.tag.mappers.GlobalTagsMapper; import com.linkedin.entity.EntityResponse; import com.linkedin.entity.EnvelopedAspect; import com.linkedin.entity.EnvelopedAspectMap; import com.linkedin.metadata.Constants; +import com.linkedin.schema.SchemaField; import java.util.Collections; import java.util.stream.Collectors; import javax.annotation.Nullable; @@ -48,6 +66,14 @@ public static Assertion map(@Nullable QueryContext context, final EntityResponse result.setInfo( mapAssertionInfo(context, new AssertionInfo(envelopedAssertionInfo.getValue().data()))); } + + final EnvelopedAspect envelopedAssertionActions = + aspects.get(Constants.ASSERTION_ACTIONS_ASPECT_NAME); + if (envelopedAssertionActions != null) { + result.setActions( + mapAssertionActions(new AssertionActions(envelopedAssertionActions.getValue().data()))); + } + final EnvelopedAspect envelopedPlatformInstance = aspects.get(Constants.DATA_PLATFORM_INSTANCE_ASPECT_NAME); if (envelopedPlatformInstance != null) { @@ -83,20 +109,93 @@ private static com.linkedin.datahub.graphql.generated.Status mapStatus(Status st return result; } - private static com.linkedin.datahub.graphql.generated.AssertionInfo mapAssertionInfo( + public static com.linkedin.datahub.graphql.generated.AssertionInfo mapAssertionInfo( @Nullable QueryContext context, final AssertionInfo gmsAssertionInfo) { final com.linkedin.datahub.graphql.generated.AssertionInfo assertionInfo = new com.linkedin.datahub.graphql.generated.AssertionInfo(); assertionInfo.setType(AssertionType.valueOf(gmsAssertionInfo.getType().name())); + + if (gmsAssertionInfo.hasLastUpdated()) { + assertionInfo.setLastUpdated( + new AuditStamp( + gmsAssertionInfo.getLastUpdated().getTime(), + gmsAssertionInfo.getLastUpdated().getActor().toString())); + } if (gmsAssertionInfo.hasDatasetAssertion()) { DatasetAssertionInfo datasetAssertion = mapDatasetAssertionInfo(context, gmsAssertionInfo.getDatasetAssertion()); assertionInfo.setDatasetAssertion(datasetAssertion); } - assertionInfo.setDescription(gmsAssertionInfo.getDescription()); + // Description + if (gmsAssertionInfo.hasDescription()) { + assertionInfo.setDescription(gmsAssertionInfo.getDescription()); + } + // FRESHNESS Assertions + if (gmsAssertionInfo.hasFreshnessAssertion()) { + FreshnessAssertionInfo freshnessAssertionInfo = + FreshnessAssertionMapper.mapFreshnessAssertionInfo( + context, gmsAssertionInfo.getFreshnessAssertion()); + 
assertionInfo.setFreshnessAssertion(freshnessAssertionInfo); + } + // VOLUME Assertions + if (gmsAssertionInfo.hasVolumeAssertion()) { + VolumeAssertionInfo volumeAssertionInfo = + VolumeAssertionMapper.mapVolumeAssertionInfo( + context, gmsAssertionInfo.getVolumeAssertion()); + assertionInfo.setVolumeAssertion(volumeAssertionInfo); + } + // SQL Assertions + if (gmsAssertionInfo.hasSqlAssertion()) { + SqlAssertionInfo sqlAssertionInfo = + SqlAssertionMapper.mapSqlAssertionInfo(gmsAssertionInfo.getSqlAssertion()); + assertionInfo.setSqlAssertion(sqlAssertionInfo); + } + // FIELD Assertions + if (gmsAssertionInfo.hasFieldAssertion()) { + FieldAssertionInfo fieldAssertionInfo = + FieldAssertionMapper.mapFieldAssertionInfo(context, gmsAssertionInfo.getFieldAssertion()); + assertionInfo.setFieldAssertion(fieldAssertionInfo); + } + // SCHEMA Assertions + if (gmsAssertionInfo.hasSchemaAssertion()) { + SchemaAssertionInfo schemaAssertionInfo = + mapSchemaAssertionInfo(context, gmsAssertionInfo.getSchemaAssertion()); + assertionInfo.setSchemaAssertion(schemaAssertionInfo); + } + // Source Type + if (gmsAssertionInfo.hasSource()) { + assertionInfo.setSource(mapSource(gmsAssertionInfo.getSource())); + } return assertionInfo; } + private static com.linkedin.datahub.graphql.generated.AssertionActions mapAssertionActions( + final AssertionActions gmsAssertionActions) { + final com.linkedin.datahub.graphql.generated.AssertionActions result = + new com.linkedin.datahub.graphql.generated.AssertionActions(); + if (gmsAssertionActions.hasOnFailure()) { + result.setOnFailure( + gmsAssertionActions.getOnFailure().stream() + .map(AssertionMapper::mapAssertionAction) + .collect(Collectors.toList())); + } + if (gmsAssertionActions.hasOnSuccess()) { + result.setOnSuccess( + gmsAssertionActions.getOnSuccess().stream() + .map(AssertionMapper::mapAssertionAction) + .collect(Collectors.toList())); + } + return result; + } + + private static com.linkedin.datahub.graphql.generated.AssertionAction mapAssertionAction( + final AssertionAction gmsAssertionAction) { + final com.linkedin.datahub.graphql.generated.AssertionAction result = + new com.linkedin.datahub.graphql.generated.AssertionAction(); + result.setType(AssertionActionType.valueOf(gmsAssertionAction.getType().toString())); + return result; + } + private static DatasetAssertionInfo mapDatasetAssertionInfo( @Nullable QueryContext context, final com.linkedin.assertion.DatasetAssertionInfo gmsDatasetAssertion) { @@ -152,7 +251,7 @@ private static SchemaFieldRef mapDatasetSchemaField(final Urn schemaFieldUrn) { return new SchemaFieldRef(schemaFieldUrn.toString(), schemaFieldUrn.getEntityKey().get(1)); } - private static AssertionStdParameters mapParameters( + protected static AssertionStdParameters mapParameters( final com.linkedin.assertion.AssertionStdParameters params) { final AssertionStdParameters result = new AssertionStdParameters(); if (params.hasValue()) { @@ -175,5 +274,61 @@ private static AssertionStdParameter mapParameter( return result; } - private AssertionMapper() {} + protected static FixedIntervalSchedule mapFixedIntervalSchedule( + com.linkedin.assertion.FixedIntervalSchedule gmsFixedIntervalSchedule) { + FixedIntervalSchedule fixedIntervalSchedule = new FixedIntervalSchedule(); + fixedIntervalSchedule.setUnit(DateInterval.valueOf(gmsFixedIntervalSchedule.getUnit().name())); + fixedIntervalSchedule.setMultiple(gmsFixedIntervalSchedule.getMultiple()); + return fixedIntervalSchedule; + } + + private static AssertionSource mapSource(final 
com.linkedin.assertion.AssertionSource gmsSource) { + AssertionSource result = new AssertionSource(); + result.setType(AssertionSourceType.valueOf(gmsSource.getType().toString())); + if (gmsSource.hasCreated()) { + result.setCreated( + new AuditStamp( + gmsSource.getCreated().getTime(), gmsSource.getCreated().getActor().toString())); + } + return result; + } + + protected static com.linkedin.datahub.graphql.generated.SchemaFieldSpec mapSchemaFieldSpec( + final com.linkedin.schema.SchemaFieldSpec gmsField) { + final com.linkedin.datahub.graphql.generated.SchemaFieldSpec result = + new com.linkedin.datahub.graphql.generated.SchemaFieldSpec(); + result.setPath(gmsField.getPath()); + result.setType(gmsField.getType()); + result.setNativeType(gmsField.getNativeType()); + return result; + } + + private static SchemaAssertionInfo mapSchemaAssertionInfo( + @Nullable final QueryContext context, + final com.linkedin.assertion.SchemaAssertionInfo gmsSchemaAssertionInfo) { + SchemaAssertionInfo result = new SchemaAssertionInfo(); + result.setCompatibility( + SchemaAssertionCompatibility.valueOf(gmsSchemaAssertionInfo.getCompatibility().name())); + result.setEntityUrn(gmsSchemaAssertionInfo.getEntity().toString()); + result.setSchema( + SchemaMetadataMapper.INSTANCE.apply( + context, gmsSchemaAssertionInfo.getSchema(), gmsSchemaAssertionInfo.getEntity(), 0L)); + result.setFields( + gmsSchemaAssertionInfo.getSchema().getFields().stream() + .map(AssertionMapper::mapSchemaField) + .collect(Collectors.toList())); + return result; + } + + private static SchemaAssertionField mapSchemaField(final SchemaField gmsField) { + SchemaAssertionField result = new SchemaAssertionField(); + result.setPath(gmsField.getFieldPath()); + result.setType(new SchemaFieldMapper().mapSchemaFieldDataType(gmsField.getType())); + if (gmsField.hasNativeDataType()) { + result.setNativeType(gmsField.getNativeDataType()); + } + return result; + } + + protected AssertionMapper() {} } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/AssertionType.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/AssertionType.java index 0cf74439132fe..9c90478f03dc5 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/AssertionType.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/AssertionType.java @@ -28,8 +28,8 @@ public class AssertionType Constants.ASSERTION_KEY_ASPECT_NAME, Constants.ASSERTION_INFO_ASPECT_NAME, Constants.DATA_PLATFORM_INSTANCE_ASPECT_NAME, - Constants.GLOBAL_TAGS_ASPECT_NAME); - + Constants.GLOBAL_TAGS_ASPECT_NAME, + Constants.ASSERTION_ACTIONS_ASPECT_NAME); private final EntityClient _entityClient; public AssertionType(final EntityClient entityClient) { diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/FieldAssertionMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/FieldAssertionMapper.java new file mode 100644 index 0000000000000..82d041a464c3f --- /dev/null +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/FieldAssertionMapper.java @@ -0,0 +1,92 @@ +package com.linkedin.datahub.graphql.types.assertion; + +import com.linkedin.assertion.FieldAssertionInfo; +import com.linkedin.datahub.graphql.QueryContext; +import com.linkedin.datahub.graphql.generated.AssertionStdOperator; +import com.linkedin.datahub.graphql.generated.FieldAssertionType; +import 
com.linkedin.datahub.graphql.generated.FieldMetricType; +import com.linkedin.datahub.graphql.generated.FieldTransformType; +import com.linkedin.datahub.graphql.generated.FieldValuesFailThresholdType; +import com.linkedin.datahub.graphql.types.dataset.mappers.DatasetFilterMapper; +import javax.annotation.Nullable; + +public class FieldAssertionMapper extends AssertionMapper { + + public static com.linkedin.datahub.graphql.generated.FieldAssertionInfo mapFieldAssertionInfo( + @Nullable final QueryContext context, final FieldAssertionInfo gmsFieldAssertionInfo) { + final com.linkedin.datahub.graphql.generated.FieldAssertionInfo result = + new com.linkedin.datahub.graphql.generated.FieldAssertionInfo(); + result.setEntityUrn(gmsFieldAssertionInfo.getEntity().toString()); + result.setType(FieldAssertionType.valueOf(gmsFieldAssertionInfo.getType().name())); + if (gmsFieldAssertionInfo.hasFilter()) { + result.setFilter(DatasetFilterMapper.map(context, gmsFieldAssertionInfo.getFilter())); + } + if (gmsFieldAssertionInfo.hasFieldValuesAssertion()) { + result.setFieldValuesAssertion( + mapFieldValuesAssertion(gmsFieldAssertionInfo.getFieldValuesAssertion())); + } + if (gmsFieldAssertionInfo.hasFieldMetricAssertion()) { + result.setFieldMetricAssertion( + mapFieldMetricAssertion(gmsFieldAssertionInfo.getFieldMetricAssertion())); + } + return result; + } + + private static com.linkedin.datahub.graphql.generated.FieldValuesAssertion + mapFieldValuesAssertion( + final com.linkedin.assertion.FieldValuesAssertion gmsFieldValuesAssertion) { + final com.linkedin.datahub.graphql.generated.FieldValuesAssertion result = + new com.linkedin.datahub.graphql.generated.FieldValuesAssertion(); + result.setField(mapSchemaFieldSpec(gmsFieldValuesAssertion.getField())); + result.setOperator(AssertionStdOperator.valueOf(gmsFieldValuesAssertion.getOperator().name())); + result.setFailThreshold( + mapFieldValuesFailThreshold(gmsFieldValuesAssertion.getFailThreshold())); + result.setExcludeNulls(gmsFieldValuesAssertion.isExcludeNulls()); + + if (gmsFieldValuesAssertion.hasTransform()) { + result.setTransform(mapFieldTransform(gmsFieldValuesAssertion.getTransform())); + } + + if (gmsFieldValuesAssertion.hasParameters()) { + result.setParameters(mapParameters(gmsFieldValuesAssertion.getParameters())); + } + return result; + } + + private static com.linkedin.datahub.graphql.generated.FieldMetricAssertion + mapFieldMetricAssertion( + final com.linkedin.assertion.FieldMetricAssertion gmsFieldMetricAssertion) { + final com.linkedin.datahub.graphql.generated.FieldMetricAssertion result = + new com.linkedin.datahub.graphql.generated.FieldMetricAssertion(); + result.setField(mapSchemaFieldSpec(gmsFieldMetricAssertion.getField())); + result.setMetric(FieldMetricType.valueOf(gmsFieldMetricAssertion.getMetric().name())); + result.setOperator(AssertionStdOperator.valueOf(gmsFieldMetricAssertion.getOperator().name())); + + if (gmsFieldMetricAssertion.hasParameters()) { + result.setParameters(mapParameters(gmsFieldMetricAssertion.getParameters())); + } + + return result; + } + + private static com.linkedin.datahub.graphql.generated.FieldTransform mapFieldTransform( + final com.linkedin.assertion.FieldTransform gmsFieldTransform) { + final com.linkedin.datahub.graphql.generated.FieldTransform result = + new com.linkedin.datahub.graphql.generated.FieldTransform(); + result.setType(FieldTransformType.valueOf(gmsFieldTransform.getType().name())); + return result; + } + + private static 
com.linkedin.datahub.graphql.generated.FieldValuesFailThreshold + mapFieldValuesFailThreshold( + final com.linkedin.assertion.FieldValuesFailThreshold gmsFieldValuesFailThreshold) { + final com.linkedin.datahub.graphql.generated.FieldValuesFailThreshold result = + new com.linkedin.datahub.graphql.generated.FieldValuesFailThreshold(); + result.setType( + FieldValuesFailThresholdType.valueOf(gmsFieldValuesFailThreshold.getType().name())); + result.setValue(gmsFieldValuesFailThreshold.getValue()); + return result; + } + + private FieldAssertionMapper() {} +} diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/FreshnessAssertionMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/FreshnessAssertionMapper.java new file mode 100644 index 0000000000000..22e1c1d8bae9e --- /dev/null +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/FreshnessAssertionMapper.java @@ -0,0 +1,59 @@ +package com.linkedin.datahub.graphql.types.assertion; + +import com.linkedin.data.template.GetMode; +import com.linkedin.datahub.graphql.QueryContext; +import com.linkedin.datahub.graphql.generated.FreshnessAssertionInfo; +import com.linkedin.datahub.graphql.generated.FreshnessAssertionSchedule; +import com.linkedin.datahub.graphql.generated.FreshnessAssertionScheduleType; +import com.linkedin.datahub.graphql.generated.FreshnessAssertionType; +import com.linkedin.datahub.graphql.generated.FreshnessCronSchedule; +import com.linkedin.datahub.graphql.types.dataset.mappers.DatasetFilterMapper; +import javax.annotation.Nullable; + +public class FreshnessAssertionMapper extends AssertionMapper { + + public static FreshnessAssertionInfo mapFreshnessAssertionInfo( + @Nullable final QueryContext context, + final com.linkedin.assertion.FreshnessAssertionInfo gmsFreshnessAssertionInfo) { + FreshnessAssertionInfo freshnessAssertionInfo = new FreshnessAssertionInfo(); + freshnessAssertionInfo.setEntityUrn(gmsFreshnessAssertionInfo.getEntity().toString()); + freshnessAssertionInfo.setType( + FreshnessAssertionType.valueOf(gmsFreshnessAssertionInfo.getType().name())); + if (gmsFreshnessAssertionInfo.hasSchedule()) { + freshnessAssertionInfo.setSchedule( + mapFreshnessAssertionSchedule(gmsFreshnessAssertionInfo.getSchedule())); + } + if (gmsFreshnessAssertionInfo.hasFilter()) { + freshnessAssertionInfo.setFilter( + DatasetFilterMapper.map(context, gmsFreshnessAssertionInfo.getFilter())); + } + return freshnessAssertionInfo; + } + + private static FreshnessCronSchedule mapFreshnessCronSchedule( + final com.linkedin.assertion.FreshnessCronSchedule gmsCronSchedule) { + FreshnessCronSchedule cronSchedule = new FreshnessCronSchedule(); + cronSchedule.setCron(gmsCronSchedule.getCron()); + cronSchedule.setTimezone(gmsCronSchedule.getTimezone()); + cronSchedule.setWindowStartOffsetMs(gmsCronSchedule.getWindowStartOffsetMs(GetMode.NULL)); + return cronSchedule; + } + + private static FreshnessAssertionSchedule mapFreshnessAssertionSchedule( + final com.linkedin.assertion.FreshnessAssertionSchedule gmsFreshnessAssertionSchedule) { + FreshnessAssertionSchedule freshnessAssertionSchedule = new FreshnessAssertionSchedule(); + freshnessAssertionSchedule.setType( + FreshnessAssertionScheduleType.valueOf(gmsFreshnessAssertionSchedule.getType().name())); + if (gmsFreshnessAssertionSchedule.hasCron()) { + freshnessAssertionSchedule.setCron( + mapFreshnessCronSchedule(gmsFreshnessAssertionSchedule.getCron())); + } + if 
(gmsFreshnessAssertionSchedule.hasFixedInterval()) { + freshnessAssertionSchedule.setFixedInterval( + mapFixedIntervalSchedule(gmsFreshnessAssertionSchedule.getFixedInterval())); + } + return freshnessAssertionSchedule; + } + + private FreshnessAssertionMapper() {} +} diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/SqlAssertionMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/SqlAssertionMapper.java new file mode 100644 index 0000000000000..e75d2221164d4 --- /dev/null +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/SqlAssertionMapper.java @@ -0,0 +1,27 @@ +package com.linkedin.datahub.graphql.types.assertion; + +import com.linkedin.assertion.SqlAssertionInfo; +import com.linkedin.datahub.graphql.generated.AssertionStdOperator; +import com.linkedin.datahub.graphql.generated.AssertionValueChangeType; +import com.linkedin.datahub.graphql.generated.SqlAssertionType; + +public class SqlAssertionMapper extends AssertionMapper { + + public static com.linkedin.datahub.graphql.generated.SqlAssertionInfo mapSqlAssertionInfo( + final SqlAssertionInfo gmsSqlAssertionInfo) { + final com.linkedin.datahub.graphql.generated.SqlAssertionInfo result = + new com.linkedin.datahub.graphql.generated.SqlAssertionInfo(); + result.setEntityUrn(gmsSqlAssertionInfo.getEntity().toString()); + result.setType(SqlAssertionType.valueOf(gmsSqlAssertionInfo.getType().name())); + result.setStatement(gmsSqlAssertionInfo.getStatement()); + result.setOperator(AssertionStdOperator.valueOf(gmsSqlAssertionInfo.getOperator().name())); + result.setParameters(mapParameters(gmsSqlAssertionInfo.getParameters())); + if (gmsSqlAssertionInfo.hasChangeType()) { + result.setChangeType( + AssertionValueChangeType.valueOf(gmsSqlAssertionInfo.getChangeType().name())); + } + return result; + } + + private SqlAssertionMapper() {} +} diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/VolumeAssertionMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/VolumeAssertionMapper.java new file mode 100644 index 0000000000000..3d0294c45e520 --- /dev/null +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/VolumeAssertionMapper.java @@ -0,0 +1,115 @@ +package com.linkedin.datahub.graphql.types.assertion; + +import com.linkedin.assertion.VolumeAssertionInfo; +import com.linkedin.datahub.graphql.QueryContext; +import com.linkedin.datahub.graphql.generated.AssertionStdOperator; +import com.linkedin.datahub.graphql.generated.AssertionValueChangeType; +import com.linkedin.datahub.graphql.generated.IncrementingSegmentFieldTransformerType; +import com.linkedin.datahub.graphql.generated.VolumeAssertionType; +import com.linkedin.datahub.graphql.types.dataset.mappers.DatasetFilterMapper; +import javax.annotation.Nullable; + +public class VolumeAssertionMapper extends AssertionMapper { + + public static com.linkedin.datahub.graphql.generated.VolumeAssertionInfo mapVolumeAssertionInfo( + @Nullable final QueryContext context, final VolumeAssertionInfo gmsVolumeAssertionInfo) { + final com.linkedin.datahub.graphql.generated.VolumeAssertionInfo result = + new com.linkedin.datahub.graphql.generated.VolumeAssertionInfo(); + result.setEntityUrn(gmsVolumeAssertionInfo.getEntity().toString()); + result.setType(VolumeAssertionType.valueOf(gmsVolumeAssertionInfo.getType().name())); + if (gmsVolumeAssertionInfo.hasFilter()) { + 
result.setFilter(DatasetFilterMapper.map(context, gmsVolumeAssertionInfo.getFilter())); + } + if (gmsVolumeAssertionInfo.hasRowCountTotal()) { + result.setRowCountTotal(mapRowCountTotal(gmsVolumeAssertionInfo.getRowCountTotal())); + } + if (gmsVolumeAssertionInfo.hasRowCountChange()) { + result.setRowCountChange(mapRowCountChange(gmsVolumeAssertionInfo.getRowCountChange())); + } + if (gmsVolumeAssertionInfo.hasIncrementingSegmentRowCountTotal()) { + result.setIncrementingSegmentRowCountTotal( + mapIncrementingSegmentRowCountTotal( + gmsVolumeAssertionInfo.getIncrementingSegmentRowCountTotal())); + } + if (gmsVolumeAssertionInfo.hasIncrementingSegmentRowCountChange()) { + result.setIncrementingSegmentRowCountChange( + mapIncrementingSegmentRowCountChange( + gmsVolumeAssertionInfo.getIncrementingSegmentRowCountChange())); + } + return result; + } + + private static com.linkedin.datahub.graphql.generated.RowCountTotal mapRowCountTotal( + final com.linkedin.assertion.RowCountTotal gmsRowCountTotal) { + final com.linkedin.datahub.graphql.generated.RowCountTotal result = + new com.linkedin.datahub.graphql.generated.RowCountTotal(); + result.setOperator(AssertionStdOperator.valueOf(gmsRowCountTotal.getOperator().name())); + result.setParameters(mapParameters(gmsRowCountTotal.getParameters())); + return result; + } + + private static com.linkedin.datahub.graphql.generated.RowCountChange mapRowCountChange( + final com.linkedin.assertion.RowCountChange gmsRowCountChange) { + final com.linkedin.datahub.graphql.generated.RowCountChange result = + new com.linkedin.datahub.graphql.generated.RowCountChange(); + result.setOperator(AssertionStdOperator.valueOf(gmsRowCountChange.getOperator().name())); + result.setParameters(mapParameters(gmsRowCountChange.getParameters())); + result.setType(AssertionValueChangeType.valueOf(gmsRowCountChange.getType().name())); + return result; + } + + private static com.linkedin.datahub.graphql.generated.IncrementingSegmentRowCountTotal + mapIncrementingSegmentRowCountTotal( + final com.linkedin.assertion.IncrementingSegmentRowCountTotal + gmsIncrementingSegmentRowCountTotal) { + final com.linkedin.datahub.graphql.generated.IncrementingSegmentRowCountTotal result = + new com.linkedin.datahub.graphql.generated.IncrementingSegmentRowCountTotal(); + result.setOperator( + AssertionStdOperator.valueOf(gmsIncrementingSegmentRowCountTotal.getOperator().name())); + result.setParameters(mapParameters(gmsIncrementingSegmentRowCountTotal.getParameters())); + result.setSegment(mapIncrementingSegmentSpec(gmsIncrementingSegmentRowCountTotal.getSegment())); + return result; + } + + private static com.linkedin.datahub.graphql.generated.IncrementingSegmentRowCountChange + mapIncrementingSegmentRowCountChange( + final com.linkedin.assertion.IncrementingSegmentRowCountChange + gmsIncrementingSegmentRowCountChange) { + final com.linkedin.datahub.graphql.generated.IncrementingSegmentRowCountChange result = + new com.linkedin.datahub.graphql.generated.IncrementingSegmentRowCountChange(); + result.setOperator( + AssertionStdOperator.valueOf(gmsIncrementingSegmentRowCountChange.getOperator().name())); + result.setParameters(mapParameters(gmsIncrementingSegmentRowCountChange.getParameters())); + result.setSegment( + mapIncrementingSegmentSpec(gmsIncrementingSegmentRowCountChange.getSegment())); + result.setType( + AssertionValueChangeType.valueOf(gmsIncrementingSegmentRowCountChange.getType().name())); + return result; + } + + private static 
com.linkedin.datahub.graphql.generated.IncrementingSegmentSpec + mapIncrementingSegmentSpec(final com.linkedin.assertion.IncrementingSegmentSpec gmsSegment) { + final com.linkedin.datahub.graphql.generated.IncrementingSegmentSpec result = + new com.linkedin.datahub.graphql.generated.IncrementingSegmentSpec(); + result.setField(mapSchemaFieldSpec(gmsSegment.getField())); + if (gmsSegment.hasTransformer()) { + result.setTransformer(mapIncrementingSegmentFieldTransformer(gmsSegment.getTransformer())); + } + return result; + } + + private static com.linkedin.datahub.graphql.generated.IncrementingSegmentFieldTransformer + mapIncrementingSegmentFieldTransformer( + final com.linkedin.assertion.IncrementingSegmentFieldTransformer gmsTransformer) { + final com.linkedin.datahub.graphql.generated.IncrementingSegmentFieldTransformer result = + new com.linkedin.datahub.graphql.generated.IncrementingSegmentFieldTransformer(); + result.setType( + IncrementingSegmentFieldTransformerType.valueOf(gmsTransformer.getType().name())); + if (gmsTransformer.hasNativeType()) { + result.setNativeType(gmsTransformer.getNativeType()); + } + return result; + } + + private VolumeAssertionMapper() {} +} diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataset/mappers/SchemaFieldMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataset/mappers/SchemaFieldMapper.java index a2cc9d5a66edd..3674186ac23fe 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataset/mappers/SchemaFieldMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataset/mappers/SchemaFieldMapper.java @@ -51,7 +51,7 @@ public SchemaField apply( return result; } - private SchemaFieldDataType mapSchemaFieldDataType( + public SchemaFieldDataType mapSchemaFieldDataType( @Nonnull final com.linkedin.schema.SchemaFieldDataType dataTypeUnion) { final com.linkedin.schema.SchemaFieldDataType.Type type = dataTypeUnion.getType(); if (type.isBytesType()) { diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/UrnSearchAcrossLineageResultsMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/UrnSearchAcrossLineageResultsMapper.java index ca363deb90c4d..d0c5605976d63 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/UrnSearchAcrossLineageResultsMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/UrnSearchAcrossLineageResultsMapper.java @@ -71,6 +71,7 @@ private SearchAcrossLineageResult mapResult( .setDegrees(new ArrayList<>(searchEntity.getDegrees())) .setExplored(Boolean.TRUE.equals(searchEntity.isExplored())) .setIgnoredAsHop(Boolean.TRUE.equals(searchEntity.isIgnoredAsHop())) + .setTruncatedChildren(Boolean.TRUE.equals(searchEntity.isTruncatedChildren())) .build(); } } diff --git a/datahub-graphql-core/src/main/resources/assertions.graphql b/datahub-graphql-core/src/main/resources/assertions.graphql new file mode 100644 index 0000000000000..0ed264b20fe27 --- /dev/null +++ b/datahub-graphql-core/src/main/resources/assertions.graphql @@ -0,0 +1,896 @@ +""" +Defines a schema field, each with a specified path and type. +""" +type SchemaAssertionField { + """ + The standard V1 path of the field within the schema. + """ + path: String! + + """ + The std type of the field + """ + type: SchemaFieldDataType! + + """ + Optional: The specific native or standard type of the field. 
+ """ + nativeType: String +} + +""" +Defines the required compatibility level for the schema assertion to pass. +""" +enum SchemaAssertionCompatibility { + """ + The schema must be exactly the same as the expected schema. + """ + EXACT_MATCH + + """ + The schema must be a superset of the expected schema. + """ + SUPERSET + + """ + The schema must be a subset of the expected schema. + """ + SUBSET +} + +""" +The source of an assertion +""" +enum AssertionSourceType { + """ + The assertion was defined natively on DataHub by a user. + """ + NATIVE + """ + The assertion was defined and managed externally of DataHub. + """ + EXTERNAL + """ + The assertion was inferred, e.g. from offline AI / ML models. + """ + INFERRED +} + +""" +The type of an Freshness assertion +""" +enum FreshnessAssertionType { + """ + An assertion defined against a Dataset Change Operation - insert, update, delete, etc + """ + DATASET_CHANGE + """ + An assertion defined against a Data Job run + """ + DATA_JOB_RUN +} + +extend type AssertionInfo { + """ + Information about an Freshness Assertion + """ + freshnessAssertion: FreshnessAssertionInfo + + """ + Information about an Volume Assertion + """ + volumeAssertion: VolumeAssertionInfo + + """ + Information about a SQL Assertion + """ + sqlAssertion: SqlAssertionInfo + + """ + Information about a Field Assertion + """ + fieldAssertion: FieldAssertionInfo + + """ + Schema assertion, e.g. defining the expected structure for an asset. + """ + schemaAssertion: SchemaAssertionInfo + + """ + The source or origin of the Assertion definition. + """ + source: AssertionSource + + """ + The time that the status last changed and the actor who changed it + """ + lastUpdated: AuditStamp +} + +extend type Assertion { + """ + The actions associated with the Assertion + """ + actions: AssertionActions +} + +""" +Some actions associated with an assertion +""" +type AssertionActions { + """ + Actions to be executed on successful assertion run. + """ + onSuccess: [AssertionAction!]! + + """ + Actions to be executed on failed assertion run. + """ + onFailure: [AssertionAction!]! +} + +""" +An action associated with an assertion +""" +type AssertionAction { + """ + The type of the actions + """ + type: AssertionActionType! +} + + +""" +The type of the Action +""" +enum AssertionActionType { + """ + Raise an incident. + """ + RAISE_INCIDENT + """ + Resolve open incidents related to the assertion. + """ + RESOLVE_INCIDENT +} + + +""" +Information about an Freshness assertion. +""" +type FreshnessAssertionInfo { + """ + The urn of the entity that the Freshness assertion is related to + """ + entityUrn: String! + + """ + The type of the Freshness Assertion + """ + type: FreshnessAssertionType! + + """ + Produce FAIL Assertion Result if the asset is not updated on the cadence and within the time range described by the schedule. + """ + schedule: FreshnessAssertionSchedule! + + """ + A filter applied when querying an external Dataset or Table + """ + filter: DatasetFilter +} + +""" +Attributes defining a single Freshness schedule. +""" +type FreshnessAssertionSchedule { + """ + The type of schedule + """ + type: FreshnessAssertionScheduleType! + + """ + A cron schedule. This is populated if the type is CRON. + """ + cron: FreshnessCronSchedule + + """ + A fixed interval schedule. This is populated if the type is FIXED_INTERVAL. 
+ """ + fixedInterval: FixedIntervalSchedule +} + +""" +The type of an Freshness assertion +""" +enum FreshnessAssertionScheduleType { + """ + An schedule based on a CRON schedule representing the expected event times. + """ + CRON + + """ + A scheduled based on a recurring fixed schedule which is used to compute the expected operation window. E.g. "every 24 hours". + """ + FIXED_INTERVAL +} + +""" +A cron-formatted schedule +""" +type FreshnessCronSchedule { + """ + A cron-formatted execution interval, as a cron string, e.g. 1 * * * * + """ + cron: String! + + """ + Timezone in which the cron interval applies, e.g. America/Los Angeles + """ + timezone: String! + + """ + An optional offset in milliseconds to SUBTRACT from the timestamp generated by the cron schedule + to generate the lower bounds of the "Freshness window", or the window of time in which an event must have occurred in order for the Freshness + to be considering passing. + If left empty, the start of the Freshness window will be the _end_ of the previously evaluated Freshness window. + """ + windowStartOffsetMs: Long +} + +""" +A fixed interval schedule. +""" +type FixedIntervalSchedule { + """ + Interval unit such as minute/hour/day etc. + """ + unit: DateInterval! + + """ + How many units. Defaults to 1. + """ + multiple: Int! +} + +""" +The source of an Assertion +""" +type AssertionSource { + """ + The source type + """ + type: AssertionSourceType! + """ + The time at which the assertion was initially created and the actor who created it + """ + created: AuditStamp +} + +""" +Information about the field to use in an assertion +""" +type SchemaFieldSpec { + """ + The field path + """ + path: String! + + """ + The DataHub standard schema field type. + """ + type: String! + + """ + The native field type + """ + nativeType: String! +} + +""" +An enum to represent a type of change in an assertion value, metric, or measurement. +""" +enum AssertionValueChangeType { + """ + A change that is defined in absolute terms. + """ + ABSOLUTE + + """ + A change that is defined in relative terms using percentage change + from the original value. + """ + PERCENTAGE +} + +""" +A type of volume (row count) assertion +""" +enum VolumeAssertionType { + """ + A volume assertion that is evaluated against the total row count of a dataset. + """ + ROW_COUNT_TOTAL + + """ + A volume assertion that is evaluated against an incremental row count of a dataset, + or a row count change. + """ + ROW_COUNT_CHANGE + + """ + A volume assertion that checks the latest "segment" in a table based on an incrementing + column to check whether it's row count falls into a particular range. + This can be used to monitor the row count of an incrementing date-partition column segment. + """ + INCREMENTING_SEGMENT_ROW_COUNT_TOTAL + + """ + A volume assertion that compares the row counts in neighboring "segments" or "partitions" + of an incrementing column. This can be used to track changes between subsequent date partition + in a table, for example. + """ + INCREMENTING_SEGMENT_ROW_COUNT_CHANGE +} + +""" +Attributes defining an ROW_COUNT_TOTAL volume assertion. +""" +type RowCountTotal { + """ + The operator you'd like to apply. + Note that only numeric operators are valid inputs: + GREATER_THAN, GREATER_THAN_OR_EQUAL_TO, EQUAL_TO, LESS_THAN, LESS_THAN_OR_EQUAL_TO, + BETWEEN. + """ + operator: AssertionStdOperator! + + """ + The parameters you'd like to provide as input to the operator. + Note that only numeric parameter types are valid inputs: NUMBER. 
+ """ + parameters: AssertionStdParameters! +} + +""" +Attributes defining an ROW_COUNT_CHANGE volume assertion. +""" +type RowCountChange { + """ + The type of the value used to evaluate the assertion: a fixed absolute value or a relative percentage. + """ + type: AssertionValueChangeType! + + """ + The operator you'd like to apply. + Note that only numeric operators are valid inputs: + GREATER_THAN, GREATER_THAN_OR_EQUAL_TO, EQUAL_TO, LESS_THAN, LESS_THAN_OR_EQUAL_TO, + BETWEEN. + """ + operator: AssertionStdOperator! + + """ + The parameters you'd like to provide as input to the operator. + Note that only numeric parameter types are valid inputs: NUMBER. + """ + parameters: AssertionStdParameters! +} + +""" +Attributes defining an INCREMENTING_SEGMENT_ROW_COUNT_TOTAL volume assertion. +""" +type IncrementingSegmentRowCountTotal { + """ + A specification of how the 'segment' can be derived using a column and an optional transformer function. + """ + segment: IncrementingSegmentSpec! + + """ + The operator you'd like to apply. + Note that only numeric operators are valid inputs: + GREATER_THAN, GREATER_THAN_OR_EQUAL_TO, EQUAL_TO, LESS_THAN, LESS_THAN_OR_EQUAL_TO, + BETWEEN. + """ + operator: AssertionStdOperator! + + """ + The parameters you'd like to provide as input to the operator. + Note that only numeric parameter types are valid inputs: NUMBER. + """ + parameters: AssertionStdParameters! +} + +""" +Attributes defining an INCREMENTING_SEGMENT_ROW_COUNT_CHANGE volume assertion. +""" +type IncrementingSegmentRowCountChange { + """ + A specification of how the 'segment' can be derived using a column and an optional transformer function. + """ + segment: IncrementingSegmentSpec! + + """ + The type of the value used to evaluate the assertion: a fixed absolute value or a relative percentage. + """ + type: AssertionValueChangeType! + + """ + The operator you'd like to apply to the row count value + Note that only numeric operators are valid inputs: + GREATER_THAN, GREATER_THAN_OR_EQUAL_TO, EQUAL_TO, LESS_THAN, LESS_THAN_OR_EQUAL_TO, + BETWEEN. + """ + operator: AssertionStdOperator! + + """ + The parameters you'd like to provide as input to the operator. + Note that only numeric parameter types are valid inputs: NUMBER. + """ + parameters: AssertionStdParameters! +} + +""" +Core attributes required to identify an incrementing segment in a table. This type is mainly useful +for tables that constantly increase with new rows being added on a particular cadence (e.g. fact or event tables). + +An incrementing segment represents a logical chunk of data which is INSERTED +into a dataset on a regular interval, along with the presence of a constantly-incrementing column +value such as an event time, date partition, or last modified column. + +An incrementing segment is principally identified by 2 key attributes combined: + +1. A field or column that represents the incrementing value. New rows that are inserted will be identified using this column. + Note that the value of this column may not by itself represent the "bucket" or the "segment" in which the row falls. + +2. [Optional] An transformer function that may be applied to the selected column value in order + to obtain the final "segment identifier" or "bucket identifier". Rows that have the same value after applying the transformation + will be grouped into the same segment, using which the final value (e.g. row count) will be determined. +""" +type IncrementingSegmentSpec { + """ + The field to use to generate segments. 
It must be constantly incrementing as new rows are inserted. + """ + field: SchemaFieldSpec! + + """ + Optional transformer function to apply to the field in order to obtain the final segment or bucket identifier. + If not provided, then no transform will be applied to the field. (identity function) + """ + transformer: IncrementingSegmentFieldTransformer +} + +""" +The definition of the transformer function that should be applied to a given field / column value in a dataset +in order to determine the segment or bucket that it belongs to, which in turn is used to evaluate +volume assertions. +""" +type IncrementingSegmentFieldTransformer { + """ + The 'standard' transformer type. Note that not all source systems will support all transformers. + """ + type: IncrementingSegmentFieldTransformerType! + + """ + The 'native' transformer type, useful as a back door if a custom transformer is required. + This field is required if the type is NATIVE. + """ + nativeType: String +} + +""" +The 'standard' transformer type. Note that not all source systems will support all transformers. +""" +enum IncrementingSegmentFieldTransformerType { + """ + Rounds a timestamp (in milliseconds) down to the nearest minute. + """ + TIMESTAMP_MS_TO_MINUTE + + """ + Rounds a timestamp (in milliseconds) down to the nearest hour. + """ + TIMESTAMP_MS_TO_HOUR + + """ + Rounds a timestamp (in milliseconds) down to the start of the day. + """ + TIMESTAMP_MS_TO_DATE + + """ + Rounds a timestamp (in milliseconds) down to the start of the month + """ + TIMESTAMP_MS_TO_MONTH + + """ + Rounds a timestamp (in milliseconds) down to the start of the year + """ + TIMESTAMP_MS_TO_YEAR + + """ + Rounds a numeric value down to the nearest integer. + """ + FLOOR + + """ + Rounds a numeric value up to the nearest integer. + """ + CEILING + + """ + A backdoor to provide a native transformer type specific to a given source system like + Snowflake, Redshift, BQ, etc. + """ + NATIVE +} + +""" +A definition of a Volume (row count) assertion. +""" +type VolumeAssertionInfo { + """ + The entity targeted by this Volume check. + """ + entityUrn: String! + + """ + The type of the volume assertion being monitored. + """ + type: VolumeAssertionType! + + """ + Produce FAILURE Assertion Result if the row count of the asset does not meet specific requirements. + Required if type is 'ROW_COUNT_TOTAL'. + """ + rowCountTotal: RowCountTotal + + """ + Produce FAILURE Assertion Result if the row count delta of the asset does not meet specific requirements. + Required if type is 'ROW_COUNT_CHANGE'. + """ + rowCountChange: RowCountChange + + """ + Produce FAILURE Assertion Result if the latest incrementing segment row count total of the asset + does not meet specific requirements. Required if type is 'INCREMENTING_SEGMENT_ROW_COUNT_TOTAL'. + """ + incrementingSegmentRowCountTotal: IncrementingSegmentRowCountTotal + + """ + Produce FAILURE Assertion Result if the incrementing segment row count delta of the asset + does not meet specific requirements. Required if type is 'INCREMENTING_SEGMENT_ROW_COUNT_CHANGE'. + """ + incrementingSegmentRowCountChange: IncrementingSegmentRowCountChange + + """ + A definition of the specific filters that should be applied, when performing monitoring. + If not provided, there is no filter, and the full table is under consideration. + """ + filter: DatasetFilter +} + +""" +The type of the SQL assertion being monitored. +""" +enum SqlAssertionType { + """ + A SQL Metric Assertion, e.g.
one based on a numeric value returned by an arbitrary SQL query. + """ + METRIC + + """ + A SQL assertion that is evaluated against the CHANGE in a metric assertion over time. + """ + METRIC_CHANGE +} + +""" +Attributes defining a SQL Assertion +""" +type SqlAssertionInfo { + """ + The type of the SQL assertion being monitored. + """ + type: SqlAssertionType! + + """ + The entity targeted by this SQL check. + """ + entityUrn: String! + + """ + The SQL statement to be executed when evaluating the assertion. + """ + statement: String! + + """ + The type of the value used to evaluate the assertion: a fixed absolute value or a relative percentage. + Required if the type is METRIC_CHANGE. + """ + changeType: AssertionValueChangeType + + """ + The operator you'd like to apply to the result of the SQL query. + """ + operator: AssertionStdOperator! + + """ + The parameters you'd like to provide as input to the operator. + """ + parameters: AssertionStdParameters! +} + +""" +The type of a Field assertion +""" +enum FieldAssertionType { + """ + An assertion used to validate the values contained within a field / column given a set of rows. + """ + FIELD_VALUES + + """ + An assertion used to validate the value of a common field / column metric (e.g. aggregation) + such as null count + percentage, min, max, median, and more. + """ + FIELD_METRIC +} + +""" +The type of the Field Transform +""" +enum FieldTransformType { + """ + Obtain the length of a string field / column (applicable to string types) + """ + LENGTH +} + +""" +The type of failure threshold. +""" +enum FieldValuesFailThresholdType { + """ + The maximum number of column values (i.e. rows) that are allowed + to fail the defined expectations before the assertion officially fails. + """ + COUNT + + """ + The maximum percentage of rows that are allowed + to fail the defined column expectations before the assertion officially fails. + """ + PERCENTAGE +} + +""" +A standard metric that can be derived from the set of values +for a specific field / column of a dataset / table.
+""" +enum FieldMetricType { + """ + The number of unique values found in the column value set + """ + UNIQUE_COUNT + + """ + The percentage of unique values to total rows for the dataset + """ + UNIQUE_PERCENTAGE + + """ + The number of null values found in the column value set + """ + NULL_COUNT + + """ + The percentage of null values to total rows for the dataset + """ + NULL_PERCENTAGE + + """ + The minimum value in the column set (applies to numeric columns) + """ + MIN + + """ + The maximum value in the column set (applies to numeric columns) + """ + MAX + + """ + The mean length found in the column set (applies to numeric columns) + """ + MEAN + + """ + The median length found in the column set (applies to numeric columns) + """ + MEDIAN + + """ + The stddev length found in the column set (applies to numeric columns) + """ + STDDEV + + """ + The number of negative values found in the value set (applies to numeric columns) + """ + NEGATIVE_COUNT + + """ + The percentage of negative values to total rows for the dataset (applies to numeric columns) + """ + NEGATIVE_PERCENTAGE + + """ + The number of zero values found in the value set (applies to numeric columns) + """ + ZERO_COUNT + + """ + The percentage of zero values to total rows for the dataset (applies to numeric columns) + """ + ZERO_PERCENTAGE + + """ + The minimum length found in the column set (applies to string columns) + """ + MIN_LENGTH + + """ + The maximum length found in the column set (applies to string columns) + """ + MAX_LENGTH + + """ + The number of empty string values found in the value set (applies to string columns). + Note: This is a completely different metric different from NULL_COUNT! + """ + EMPTY_COUNT + + """ + The percentage of empty string values to total rows for the dataset (applies to string columns). + Note: This is a completely different metric different from NULL_PERCENTAGE! + """ + EMPTY_PERCENTAGE +} + +""" +A definition of a Field (Column) assertion. +""" +type FieldAssertionInfo { + """ + The type of the field assertion being monitored. + """ + type: FieldAssertionType! + + """ + The entity targeted by this Field check. + """ + entityUrn: String! + + """ + The definition of an assertion that validates individual values of a field / column for a set of rows. + """ + fieldValuesAssertion: FieldValuesAssertion + + """ + The definition of an assertion that validates a common metric obtained about a field / column for a set of rows. + """ + fieldMetricAssertion: FieldMetricAssertion + + """ + A definition of the specific filters that should be applied, when performing monitoring. + If not provided, there is no filter, and the full table is under consideration. + """ + filter: DatasetFilter +} + +""" +A definition of a Field Values assertion. +""" +type FieldValuesAssertion { + """ + The field under evaluation. + """ + field: SchemaFieldSpec! + + """ + An optional transform to apply to field values before evaluating the operator. + """ + transform: FieldTransform + + """ + The predicate to evaluate against a single value of the field. + Depending on the operator, parameters may be required + """ + operator: AssertionStdOperator! + + """ + Standard parameters required for the assertion. + """ + parameters: AssertionStdParameters + + """ + Additional customization about when the assertion should be officially considered failing. + """ + failThreshold: FieldValuesFailThreshold! + + """ + Whether to ignore or allow nulls when running the values assertion. + """ + excludeNulls: Boolean! 
+} + +""" +Definition of a transform applied to the values of a column / field. +""" +type FieldTransform { + """ + The type of the field transform. + """ + type: FieldTransformType! +} + +type FieldValuesFailThreshold { + """ + The type of failure threshold. + """ + type: FieldValuesFailThresholdType! + + """ + The value of the threshold, either representing a count or percentage. + """ + value: Long! +} + +""" +A definition of a Field Metric assertion. +""" +type FieldMetricAssertion { + """ + The field under evaluation + """ + field: SchemaFieldSpec! + + """ + The specific metric to assert against. + """ + metric: FieldMetricType! + + """ + The predicate to evaluate against the metric for the field / column. + """ + operator: AssertionStdOperator! + + """ + Standard parameters required for the assertion. + """ + parameters: AssertionStdParameters +} + +""" +Information about a Schema assertion +""" +type SchemaAssertionInfo { + """ + The entity targeted by this schema assertion. + """ + entityUrn: String! + + """ + The fields of the schema assertion. + """ + fields: [SchemaAssertionField!]! + + """ + A definition of the expected structure for the asset. + Deprecated! Use the simpler 'fields' instead. + """ + schema: SchemaMetadata + + """ + The compatibility level required for the assertion to pass. + """ + compatibility: SchemaAssertionCompatibility! +} diff --git a/datahub-graphql-core/src/main/resources/entity.graphql b/datahub-graphql-core/src/main/resources/entity.graphql index de030f77b0b01..92d4a1723c0b6 100644 --- a/datahub-graphql-core/src/main/resources/entity.graphql +++ b/datahub-graphql-core/src/main/resources/entity.graphql @@ -7508,6 +7508,11 @@ type BatchSpec { The result type of an assertion, success or failure. """ enum AssertionResultType { + """ + The assertion has not yet been fully evaluated. + """ + INIT + """ The assertion succeeded. """ @@ -7517,6 +7522,11 @@ enum AssertionResultType { The assertion failed. """ FAILURE + + """ + The assertion errored. + """ + ERROR } """ @@ -7678,6 +7688,16 @@ enum AssertionStdOperator { """ NOT_IN + """ + Value being asserted is true. + """ + IS_TRUE + + """ + Value being asserted is false. + """ + IS_FALSE + """ Other """ @@ -7824,6 +7844,11 @@ type AssertionRunEventsResult { """ succeeded: Int! + """ + The number of errored run events + """ + errored: Int! + """ The run events themselves """ diff --git a/datahub-graphql-core/src/main/resources/incident.graphql b/datahub-graphql-core/src/main/resources/incident.graphql index f7060b3ae8f67..c2938543ed949 100644 --- a/datahub-graphql-core/src/main/resources/incident.graphql +++ b/datahub-graphql-core/src/main/resources/incident.graphql @@ -136,6 +136,36 @@ enum IncidentState { A specific type of incident """ enum IncidentType { + """ + A Freshness Assertion has failed, triggering the incident. + Raised on assets where assertions are configured to generate incidents. + """ + FRESHNESS + + """ + A Volume Assertion has failed, triggering the incident. + Raised on assets where assertions are configured to generate incidents. + """ + VOLUME + + """ + A Field Assertion has failed, triggering the incident. + Raised on assets where assertions are configured to generate incidents. + """ + FIELD + + """ + A SQL Assertion has failed, triggering the incident. + Raised on assets where assertions are configured to generate incidents. + """ + SQL + + """ + A Schema Assertion has failed, triggering the incident. + Raised on assets where assertions are configured to generate incidents.
+ """ + DATA_SCHEMA + """ An operational incident, e.g. failure to materialize a dataset, or failure to execute a task / pipeline. """ @@ -174,6 +204,11 @@ enum IncidentSourceType { The incident was created manually, from either the API or the UI. """ MANUAL + + """ + An assertion has failed, triggering the incident. + """ + ASSERTION_FAILURE } """ diff --git a/datahub-graphql-core/src/main/resources/search.graphql b/datahub-graphql-core/src/main/resources/search.graphql index 499ac3a0860d4..c7b5e61e9831c 100644 --- a/datahub-graphql-core/src/main/resources/search.graphql +++ b/datahub-graphql-core/src/main/resources/search.graphql @@ -747,6 +747,11 @@ type SearchAcrossLineageResult { """ explored: Boolean! + """ + Indicates this destination node has additional unexplored child relationships + """ + truncatedChildren: Boolean! + """ Whether this relationship was ignored as a hop """ diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/assertion/AssertionRunEventResolverTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/assertion/AssertionRunEventResolverTest.java index 7323a62d94bfe..c047a0d0a3f05 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/assertion/AssertionRunEventResolverTest.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/assertion/AssertionRunEventResolverTest.java @@ -97,6 +97,7 @@ public void testGetSuccess() throws Exception { assertEquals(result.getTotal(), 1); assertEquals(result.getFailed(), 0); assertEquals(result.getSucceeded(), 1); + assertEquals(result.getErrored(), 0); com.linkedin.datahub.graphql.generated.AssertionRunEvent graphqlRunEvent = resolver.get(mockEnv).get().getRunEvents().get(0); diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/AssertionMapperTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/AssertionMapperTest.java new file mode 100644 index 0000000000000..376af14af08f6 --- /dev/null +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/AssertionMapperTest.java @@ -0,0 +1,346 @@ +package com.linkedin.datahub.graphql.types.assertion; + +import static org.testng.Assert.assertEquals; + +import com.google.common.collect.ImmutableList; +import com.linkedin.assertion.AssertionInfo; +import com.linkedin.assertion.AssertionSource; +import com.linkedin.assertion.AssertionStdAggregation; +import com.linkedin.assertion.AssertionStdOperator; +import com.linkedin.assertion.AssertionStdParameter; +import com.linkedin.assertion.AssertionStdParameterType; +import com.linkedin.assertion.AssertionStdParameters; +import com.linkedin.assertion.AssertionType; +import com.linkedin.assertion.DatasetAssertionInfo; +import com.linkedin.assertion.DatasetAssertionScope; +import com.linkedin.assertion.FreshnessAssertionInfo; +import com.linkedin.assertion.FreshnessAssertionSchedule; +import com.linkedin.assertion.FreshnessAssertionScheduleType; +import com.linkedin.assertion.FreshnessAssertionType; +import com.linkedin.assertion.FreshnessCronSchedule; +import com.linkedin.assertion.SchemaAssertionCompatibility; +import com.linkedin.assertion.SchemaAssertionInfo; +import com.linkedin.common.GlobalTags; +import com.linkedin.common.TagAssociationArray; +import com.linkedin.common.UrnArray; +import com.linkedin.common.urn.TagUrn; +import com.linkedin.common.urn.UrnUtils; +import com.linkedin.data.DataMap; +import 
com.linkedin.data.template.StringMap; +import com.linkedin.datahub.graphql.generated.Assertion; +import com.linkedin.datahub.graphql.generated.FixedIntervalSchedule; +import com.linkedin.entity.Aspect; +import com.linkedin.entity.EntityResponse; +import com.linkedin.entity.EnvelopedAspect; +import com.linkedin.entity.EnvelopedAspectMap; +import com.linkedin.metadata.Constants; +import com.linkedin.schema.MySqlDDL; +import com.linkedin.schema.SchemaField; +import com.linkedin.schema.SchemaFieldArray; +import com.linkedin.schema.SchemaFieldDataType; +import com.linkedin.schema.SchemaMetadata; +import com.linkedin.schema.StringType; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class AssertionMapperTest { + + @Test + public void testMapDatasetAssertion() { + // Case 1: Without nullable fields + AssertionInfo input = createFreshnessAssertionInfoWithoutNullableFields(); + EntityResponse datasetAssertionEntityResponse = createAssertionInfoEntityResponse(input); + Assertion output = AssertionMapper.map(null, datasetAssertionEntityResponse); + verifyAssertionInfo(input, output); + + // Case 2: With nullable fields + input = createFreshnessAssertionInfoWithNullableFields(); + EntityResponse datasetAssertionEntityResponseWithNullables = + createAssertionInfoEntityResponse(input); + output = AssertionMapper.map(null, datasetAssertionEntityResponseWithNullables); + verifyAssertionInfo(input, output); + } + + @Test + public void testMapTags() throws Exception { + HashMap aspects = new HashMap<>(); + AssertionInfo info = createFreshnessAssertionInfoWithoutNullableFields(); + + EnvelopedAspect envelopedTagsAspect = new EnvelopedAspect(); + GlobalTags tags = new GlobalTags(); + tags.setTags( + new TagAssociationArray( + new TagAssociationArray( + Collections.singletonList( + new com.linkedin.common.TagAssociation() + .setTag(TagUrn.createFromString("urn:li:tag:test")))))); + envelopedTagsAspect.setValue(new Aspect(tags.data())); + + aspects.put(Constants.ASSERTION_INFO_ASPECT_NAME, createEnvelopedAspect(info.data())); + aspects.put(Constants.GLOBAL_TAGS_ASPECT_NAME, createEnvelopedAspect(tags.data())); + EntityResponse response = createEntityResponse(aspects); + + Assertion assertion = AssertionMapper.map(null, response); + assertEquals(assertion.getTags().getTags().size(), 1); + assertEquals( + assertion.getTags().getTags().get(0).getTag().getUrn().toString(), "urn:li:tag:test"); + } + + @Test + public void testMapFreshnessAssertion() { + // Case 1: Without nullable fields + AssertionInfo inputInfo = createFreshnessAssertionInfoWithoutNullableFields(); + + EntityResponse freshnessAssertionEntityResponse = createAssertionInfoEntityResponse(inputInfo); + Assertion output = AssertionMapper.map(null, freshnessAssertionEntityResponse); + verifyAssertionInfo(inputInfo, output); + + // Case 2: With nullable fields + inputInfo = createDatasetAssertionInfoWithNullableFields(); + EntityResponse freshnessAssertionEntityResponseWithNullables = + createAssertionInfoEntityResponse(inputInfo); + output = AssertionMapper.map(null, freshnessAssertionEntityResponseWithNullables); + verifyAssertionInfo(inputInfo, output); + } + + @Test + public void testMapDataSchemaAssertion() { + AssertionInfo input = createSchemaAssertion(); + EntityResponse schemaAssertionEntityResponse = createAssertionInfoEntityResponse(input); + Assertion output = AssertionMapper.map(null, 
schemaAssertionEntityResponse);
+    verifyAssertionInfo(input, output);
+  }
+
+  private void verifyAssertionInfo(AssertionInfo input, Assertion output) {
+    Assert.assertNotNull(output);
+    Assert.assertNotNull(output.getInfo());
+    Assert.assertEquals(
+        output.getInfo().getType().toString(), input.getType().toString());
+
+    if (input.hasDatasetAssertion()) {
+      verifyDatasetAssertion(input.getDatasetAssertion(), output.getInfo().getDatasetAssertion());
+    }
+
+    if (input.hasFreshnessAssertion()) {
+      verifyFreshnessAssertion(
+          input.getFreshnessAssertion(), output.getInfo().getFreshnessAssertion());
+    }
+
+    if (input.hasSchemaAssertion()) {
+      verifySchemaAssertion(input.getSchemaAssertion(), output.getInfo().getSchemaAssertion());
+    }
+
+    if (input.hasSource()) {
+      verifySource(input.getSource(), output.getInfo().getSource());
+    }
+  }
+
+  private void verifyDatasetAssertion(
+      DatasetAssertionInfo input,
+      com.linkedin.datahub.graphql.generated.DatasetAssertionInfo output) {
+    Assert.assertEquals(output.getOperator().toString(), input.getOperator().toString());
+    Assert.assertEquals(output.getScope().toString(), input.getScope().toString());
+    Assert.assertEquals(output.getDatasetUrn(), input.getDataset().toString());
+    if (input.hasAggregation()) {
+      Assert.assertEquals(output.getAggregation().toString(), input.getAggregation().toString());
+    }
+    if (input.hasNativeType()) {
+      Assert.assertEquals(output.getNativeType(), input.getNativeType().toString());
+    }
+    if (input.hasLogic()) {
+      Assert.assertEquals(output.getLogic(), input.getLogic());
+    }
+    if (input.hasFields()) {
+      Assert.assertTrue(
+          input.getFields().stream()
+              .allMatch(
+                  field ->
+                      output.getFields().stream()
+                          .anyMatch(outField -> field.toString().equals(outField.getUrn()))));
+    }
+  }
+
+  private void verifyFreshnessAssertion(
+      FreshnessAssertionInfo input,
+      com.linkedin.datahub.graphql.generated.FreshnessAssertionInfo output) {
+    Assert.assertEquals(output.getType().toString(), input.getType().toString());
+    Assert.assertEquals(output.getEntityUrn(), input.getEntity().toString());
+    if (input.hasSchedule()) {
+      verifyFreshnessSchedule(input.getSchedule(), output.getSchedule());
+    }
+  }
+
+  private void verifySchemaAssertion(
+      SchemaAssertionInfo input,
+      com.linkedin.datahub.graphql.generated.SchemaAssertionInfo output) {
+    Assert.assertEquals(output.getEntityUrn(), input.getEntity().toString());
+    Assert.assertEquals(output.getCompatibility().toString(), input.getCompatibility().toString());
+    Assert.assertEquals(
+        output.getSchema().getFields().size(), input.getSchema().getFields().size());
+  }
+
+  private void verifyCronSchedule(
+      FreshnessCronSchedule input,
+      com.linkedin.datahub.graphql.generated.FreshnessCronSchedule output) {
+    Assert.assertEquals(output.getCron(), input.getCron());
+    Assert.assertEquals(output.getTimezone(), input.getTimezone());
+    if (input.hasWindowStartOffsetMs()) {
+      Assert.assertEquals(output.getWindowStartOffsetMs(), input.getWindowStartOffsetMs());
+    }
+  }
+
+  private void verifyFreshnessSchedule(
+      FreshnessAssertionSchedule input,
+      com.linkedin.datahub.graphql.generated.FreshnessAssertionSchedule output) {
+    Assert.assertEquals(output.getType().toString(), input.getType().toString());
+    if (input.hasCron()) {
+      verifyCronSchedule(input.getCron(), output.getCron());
+    }
+    if (input.hasFixedInterval()) {
+      verifyFixedIntervalSchedule(input.getFixedInterval(), output.getFixedInterval());
+
} + } + + private void verifyFixedIntervalSchedule( + com.linkedin.assertion.FixedIntervalSchedule input, FixedIntervalSchedule output) { + Assert.assertEquals(output.getMultiple(), (int) input.getMultiple()); + Assert.assertEquals(output.getUnit().toString(), input.getUnit().toString()); + } + + private void verifySource( + AssertionSource input, com.linkedin.datahub.graphql.generated.AssertionSource output) { + Assert.assertEquals(output.getType().toString(), input.getType().toString()); + } + + private EntityResponse createAssertionInfoEntityResponse(final AssertionInfo info) { + HashMap aspects = new HashMap<>(); + aspects.put(Constants.ASSERTION_INFO_ASPECT_NAME, createEnvelopedAspect(info.data())); + + return createEntityResponse(aspects); + } + + private EntityResponse createEntityResponse(Map aspects) { + EntityResponse entityResponse = new EntityResponse(); + entityResponse.setUrn(UrnUtils.getUrn("urn:li:assertion:1")); + entityResponse.setAspects(new EnvelopedAspectMap(new HashMap<>())); + aspects.forEach( + (aspectName, envelopedAspect) -> { + entityResponse.getAspects().put(aspectName, envelopedAspect); + }); + + return entityResponse; + } + + private EnvelopedAspect createEnvelopedAspect(DataMap dataMap) { + EnvelopedAspect envelopedAspect = new EnvelopedAspect(); + envelopedAspect.setValue(new Aspect(dataMap)); + return envelopedAspect; + } + + private AssertionInfo createDatasetAssertionInfoWithoutNullableFields() { + AssertionInfo info = new AssertionInfo(); + info.setType(com.linkedin.assertion.AssertionType.DATASET); + DatasetAssertionInfo datasetAssertionInfo = new DatasetAssertionInfo(); + datasetAssertionInfo.setDataset(UrnUtils.getUrn("urn:li:dataset:1")); + datasetAssertionInfo.setScope(DatasetAssertionScope.DATASET_COLUMN); + datasetAssertionInfo.setOperator(AssertionStdOperator.GREATER_THAN); + info.setDatasetAssertion(datasetAssertionInfo); + return info; + } + + private AssertionInfo createDatasetAssertionInfoWithNullableFields() { + AssertionInfo infoWithoutNullables = createDatasetAssertionInfoWithoutNullableFields(); + DatasetAssertionInfo baseInfo = infoWithoutNullables.getDatasetAssertion(); + baseInfo.setFields( + new UrnArray( + Arrays.asList( + UrnUtils.getUrn( + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,name,PROD),field)")))); + baseInfo.setAggregation(AssertionStdAggregation.SUM); + baseInfo.setParameters(createAssertionStdParameters()); + baseInfo.setNativeType("native_type"); + baseInfo.setNativeParameters(new StringMap(Collections.singletonMap("key", "value"))); + baseInfo.setLogic("sample_logic"); + infoWithoutNullables.setSource( + new AssertionSource().setType(com.linkedin.assertion.AssertionSourceType.INFERRED)); + return infoWithoutNullables; + } + + private AssertionInfo createFreshnessAssertionInfoWithoutNullableFields() { + AssertionInfo info = new AssertionInfo(); + info.setType(AssertionType.FRESHNESS); + FreshnessAssertionInfo freshnessAssertionInfo = new FreshnessAssertionInfo(); + freshnessAssertionInfo.setEntity( + UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:hive,name,PROD)")); + freshnessAssertionInfo.setType(FreshnessAssertionType.DATASET_CHANGE); + info.setFreshnessAssertion(freshnessAssertionInfo); + return info; + } + + private AssertionInfo createFreshnessAssertionInfoWithNullableFields() { + AssertionInfo infoWithoutNullables = createFreshnessAssertionInfoWithoutNullableFields(); + FreshnessAssertionInfo baseInfo = infoWithoutNullables.getFreshnessAssertion(); + 
baseInfo.setSchedule(createFreshnessAssertionSchedule()); + infoWithoutNullables.setSource( + new AssertionSource().setType(com.linkedin.assertion.AssertionSourceType.INFERRED)); + return infoWithoutNullables; + } + + private AssertionInfo createSchemaAssertion() { + AssertionInfo info = new AssertionInfo(); + info.setType(AssertionType.DATA_SCHEMA); + SchemaAssertionInfo schemaAssertionInfo = new SchemaAssertionInfo(); + schemaAssertionInfo.setEntity(UrnUtils.getUrn("urn:li:dataset:1")); + schemaAssertionInfo.setCompatibility(SchemaAssertionCompatibility.SUPERSET); + schemaAssertionInfo.setSchema( + new SchemaMetadata() + .setCluster("Test") + .setHash("Test") + .setPlatformSchema(SchemaMetadata.PlatformSchema.create(new MySqlDDL())) + .setFields( + new SchemaFieldArray( + ImmutableList.of( + new SchemaField() + .setType( + new SchemaFieldDataType() + .setType(SchemaFieldDataType.Type.create(new StringType()))) + .setNullable(false) + .setNativeDataType("string") + .setFieldPath("test"))))); + return info; + } + + private AssertionStdParameters createAssertionStdParameters() { + AssertionStdParameters parameters = new AssertionStdParameters(); + parameters.setValue(createAssertionStdParameter()); + parameters.setMinValue(createAssertionStdParameter()); + parameters.setMaxValue(createAssertionStdParameter()); + return parameters; + } + + private AssertionStdParameter createAssertionStdParameter() { + AssertionStdParameter parameter = new AssertionStdParameter(); + parameter.setType(AssertionStdParameterType.NUMBER); + parameter.setValue("100"); + return parameter; + } + + private FreshnessAssertionSchedule createFreshnessAssertionSchedule() { + FreshnessAssertionSchedule schedule = new FreshnessAssertionSchedule(); + schedule.setType(FreshnessAssertionScheduleType.CRON); + schedule.setCron(createCronSchedule()); + return schedule; + } + + private FreshnessCronSchedule createCronSchedule() { + FreshnessCronSchedule cronSchedule = new FreshnessCronSchedule(); + cronSchedule.setCron("0 0 * * *"); + cronSchedule.setTimezone("UTC"); + return cronSchedule; + } +} diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/AssertionTypeTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/AssertionTypeTest.java index dd2b676a94130..33774690b7c7a 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/AssertionTypeTest.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/AssertionTypeTest.java @@ -7,6 +7,10 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; +import com.linkedin.assertion.AssertionAction; +import com.linkedin.assertion.AssertionActionArray; +import com.linkedin.assertion.AssertionActionType; +import com.linkedin.assertion.AssertionActions; import com.linkedin.assertion.AssertionInfo; import com.linkedin.assertion.AssertionType; import com.linkedin.common.DataPlatformInstance; @@ -48,6 +52,17 @@ public class AssertionTypeTest { new DataPlatformInstance() .setPlatform(new DataPlatformUrn("snowflake")) .setInstance(null, SetMode.IGNORE_NULL); + // Acryl SaaS Only + private static final AssertionActions TEST_ASSERTION_ACTIONS = + new AssertionActions() + .setOnSuccess( + new AssertionActionArray( + ImmutableList.of( + new AssertionAction().setType(AssertionActionType.RAISE_INCIDENT)))) + .setOnFailure( + new AssertionActionArray( + ImmutableList.of( 
+ new AssertionAction().setType(AssertionActionType.RESOLVE_INCIDENT)))); private static final String TEST_ASSERTION_URN_2 = "urn:li:assertion:guid-2"; @@ -69,6 +84,9 @@ public void testBatchLoad() throws Exception { assertion1Aspects.put( Constants.ASSERTION_INFO_ASPECT_NAME, new EnvelopedAspect().setValue(new Aspect(TEST_ASSERTION_INFO.data()))); + assertion1Aspects.put( + Constants.ASSERTION_ACTIONS_ASPECT_NAME, + new EnvelopedAspect().setValue(new Aspect(TEST_ASSERTION_ACTIONS.data()))); Mockito.when( client.batchGetV2( any(), @@ -112,6 +130,12 @@ public void testBatchLoad() throws Exception { assertEquals(assertion.getInfo().getType().toString(), AssertionType.DATASET.toString()); assertEquals(assertion.getInfo().getDatasetAssertion(), null); assertEquals(assertion.getPlatform().getUrn(), "urn:li:dataPlatform:snowflake"); + assertEquals( + assertion.getActions().getOnSuccess().get(0).getType(), + com.linkedin.datahub.graphql.generated.AssertionActionType.RAISE_INCIDENT); + assertEquals( + assertion.getActions().getOnFailure().get(0).getType(), + com.linkedin.datahub.graphql.generated.AssertionActionType.RESOLVE_INCIDENT); // Assert second element is null. assertNull(result.get(1)); diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/FieldAssertionMapperTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/FieldAssertionMapperTest.java new file mode 100644 index 0000000000000..7758aaa986fed --- /dev/null +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/FieldAssertionMapperTest.java @@ -0,0 +1,100 @@ +package com.linkedin.datahub.graphql.types.assertion; + +import com.linkedin.assertion.AssertionStdOperator; +import com.linkedin.assertion.FieldAssertionInfo; +import com.linkedin.assertion.FieldAssertionType; +import com.linkedin.assertion.FieldMetricAssertion; +import com.linkedin.assertion.FieldMetricType; +import com.linkedin.assertion.FieldTransform; +import com.linkedin.assertion.FieldTransformType; +import com.linkedin.assertion.FieldValuesAssertion; +import com.linkedin.assertion.FieldValuesFailThreshold; +import com.linkedin.assertion.FieldValuesFailThresholdType; +import com.linkedin.common.urn.Urn; +import com.linkedin.dataset.DatasetFilter; +import com.linkedin.dataset.DatasetFilterType; +import com.linkedin.schema.SchemaFieldSpec; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class FieldAssertionMapperTest { + @Test + public void testMapFieldValuesAssertionInfo() throws Exception { + FieldAssertionInfo fieldAssertionInfo = + new FieldAssertionInfo() + .setEntity(new Urn("urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)")) + .setFilter( + new DatasetFilter().setType(DatasetFilterType.SQL).setSql("WHERE value > 5;")) + .setType(FieldAssertionType.FIELD_VALUES) + .setFieldValuesAssertion( + new FieldValuesAssertion() + .setExcludeNulls(true) + .setFailThreshold( + new FieldValuesFailThreshold() + .setType(FieldValuesFailThresholdType.PERCENTAGE) + .setValue(5L)) + .setField( + new SchemaFieldSpec() + .setPath("path") + .setType("STRING") + .setNativeType("VARCHAR")) + .setOperator(AssertionStdOperator.IS_TRUE) + .setTransform(new FieldTransform().setType(FieldTransformType.LENGTH))); + + com.linkedin.datahub.graphql.generated.FieldAssertionInfo result = + FieldAssertionMapper.mapFieldAssertionInfo(null, fieldAssertionInfo); + Assert.assertEquals(result.getEntityUrn(), "urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)"); + 
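The `assertionActions` aspect exercised in `AssertionTypeTest` above (RAISE_INCIDENT / RESOLVE_INCIDENT actions) pairs with the new `IncidentSourceType.ASSERTION_FAILURE` and the assertion-specific `IncidentType` values added to incident.graphql. A rough, self-contained sketch of that intended pairing follows; the enums and the dispatch method are local stand-ins for illustration only, not APIs introduced by this PR.

```java
import java.util.List;

/** Illustrative stand-in types; the real aspect and entity models live in the GMS codebase. */
public class AssertionActionDispatchSketch {

  enum AssertionType { FRESHNESS, VOLUME, FIELD, SQL, DATA_SCHEMA }
  enum IncidentType { FRESHNESS, VOLUME, FIELD, SQL, DATA_SCHEMA }
  enum ActionType { RAISE_INCIDENT, RESOLVE_INCIDENT }

  /** Maps a failed assertion's type to the matching incident type from incident.graphql. */
  static IncidentType incidentTypeFor(AssertionType assertionType) {
    return IncidentType.valueOf(assertionType.name());
  }

  /** Sketch of acting on a completed run; a typical setup raises on failure, resolves on success. */
  static void onAssertionRunCompleted(
      AssertionType type, boolean failed, List<ActionType> configuredActions) {
    for (ActionType action : configuredActions) {
      if (failed && action == ActionType.RAISE_INCIDENT) {
        System.out.printf(
            "raise incident type=%s source=ASSERTION_FAILURE%n", incidentTypeFor(type));
      } else if (!failed && action == ActionType.RESOLVE_INCIDENT) {
        System.out.println("resolve any open assertion-raised incident");
      }
    }
  }

  public static void main(String[] args) {
    onAssertionRunCompleted(
        AssertionType.FRESHNESS, true, List.of(ActionType.RAISE_INCIDENT));
  }
}
```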
Assert.assertEquals( + result.getType(), com.linkedin.datahub.graphql.generated.FieldAssertionType.FIELD_VALUES); + Assert.assertEquals( + result.getFilter().getType(), com.linkedin.datahub.graphql.generated.DatasetFilterType.SQL); + Assert.assertEquals(result.getFilter().getSql(), "WHERE value > 5;"); + Assert.assertEquals(result.getFieldValuesAssertion().getField().getPath(), "path"); + Assert.assertEquals(result.getFieldValuesAssertion().getField().getType(), "STRING"); + Assert.assertEquals(result.getFieldValuesAssertion().getField().getNativeType(), "VARCHAR"); + Assert.assertEquals( + result.getFieldValuesAssertion().getOperator(), + com.linkedin.datahub.graphql.generated.AssertionStdOperator.IS_TRUE); + Assert.assertEquals( + result.getFieldValuesAssertion().getTransform().getType(), + com.linkedin.datahub.graphql.generated.FieldTransformType.LENGTH); + Assert.assertEquals(result.getFieldValuesAssertion().getExcludeNulls(), true); + Assert.assertEquals( + result.getFieldValuesAssertion().getFailThreshold().getType(), + com.linkedin.datahub.graphql.generated.FieldValuesFailThresholdType.PERCENTAGE); + Assert.assertEquals( + result.getFieldValuesAssertion().getFailThreshold().getValue(), Long.valueOf(5L)); + } + + @Test + public void testMapFieldMetricAssertionInfo() throws Exception { + FieldAssertionInfo fieldAssertionInfo = + new FieldAssertionInfo() + .setEntity(new Urn("urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)")) + .setType(FieldAssertionType.FIELD_METRIC) + .setFieldMetricAssertion( + new FieldMetricAssertion() + .setField( + new SchemaFieldSpec() + .setPath("path") + .setType("STRING") + .setNativeType("VARCHAR")) + .setOperator(AssertionStdOperator.IS_TRUE) + .setMetric(FieldMetricType.MEDIAN)); + + com.linkedin.datahub.graphql.generated.FieldAssertionInfo result = + FieldAssertionMapper.mapFieldAssertionInfo(null, fieldAssertionInfo); + Assert.assertEquals(result.getEntityUrn(), "urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)"); + Assert.assertEquals( + result.getType(), com.linkedin.datahub.graphql.generated.FieldAssertionType.FIELD_METRIC); + Assert.assertEquals(result.getFieldMetricAssertion().getField().getPath(), "path"); + Assert.assertEquals(result.getFieldMetricAssertion().getField().getType(), "STRING"); + Assert.assertEquals(result.getFieldMetricAssertion().getField().getNativeType(), "VARCHAR"); + Assert.assertEquals( + result.getFieldMetricAssertion().getOperator(), + com.linkedin.datahub.graphql.generated.AssertionStdOperator.IS_TRUE); + Assert.assertEquals( + result.getFieldMetricAssertion().getMetric(), + com.linkedin.datahub.graphql.generated.FieldMetricType.MEDIAN); + } +} diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/FreshnessAssertionMapperTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/FreshnessAssertionMapperTest.java new file mode 100644 index 0000000000000..b69ed02bdfd62 --- /dev/null +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/FreshnessAssertionMapperTest.java @@ -0,0 +1,82 @@ +package com.linkedin.datahub.graphql.types.assertion; + +import com.linkedin.assertion.FixedIntervalSchedule; +import com.linkedin.assertion.FreshnessAssertionInfo; +import com.linkedin.assertion.FreshnessAssertionSchedule; +import com.linkedin.assertion.FreshnessAssertionScheduleType; +import com.linkedin.assertion.FreshnessAssertionType; +import com.linkedin.assertion.FreshnessCronSchedule; +import com.linkedin.common.urn.Urn; +import 
com.linkedin.dataset.DatasetFilter; +import com.linkedin.dataset.DatasetFilterType; +import com.linkedin.timeseries.CalendarInterval; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class FreshnessAssertionMapperTest { + @Test + public void testMapCronFreshnessAssertionInfo() throws Exception { + FreshnessAssertionInfo freshnessAssertionInfo = + new FreshnessAssertionInfo() + .setEntity(new Urn("urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)")) + .setType(FreshnessAssertionType.DATASET_CHANGE) + .setFilter( + new DatasetFilter().setType(DatasetFilterType.SQL).setSql("WHERE value > 5;")) + .setSchedule( + new FreshnessAssertionSchedule() + .setType(FreshnessAssertionScheduleType.CRON) + .setCron( + new FreshnessCronSchedule() + .setCron("0 0 0 * * ? *") + .setTimezone("America/Los_Angeles") + .setWindowStartOffsetMs(10L))); + + com.linkedin.datahub.graphql.generated.FreshnessAssertionInfo result = + FreshnessAssertionMapper.mapFreshnessAssertionInfo(null, freshnessAssertionInfo); + Assert.assertEquals(result.getEntityUrn(), "urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)"); + Assert.assertEquals( + result.getType(), + com.linkedin.datahub.graphql.generated.FreshnessAssertionType.DATASET_CHANGE); + Assert.assertEquals( + result.getFilter().getType(), com.linkedin.datahub.graphql.generated.DatasetFilterType.SQL); + Assert.assertEquals(result.getFilter().getSql(), "WHERE value > 5;"); + Assert.assertEquals( + result.getSchedule().getType(), + com.linkedin.datahub.graphql.generated.FreshnessAssertionScheduleType.CRON); + Assert.assertEquals(result.getSchedule().getCron().getCron(), "0 0 0 * * ? *"); + Assert.assertEquals(result.getSchedule().getCron().getTimezone(), "America/Los_Angeles"); + Assert.assertEquals(result.getSchedule().getCron().getWindowStartOffsetMs(), Long.valueOf(10L)); + } + + @Test + public void testMapFixedIntervalFreshnessAssertionInfo() throws Exception { + FreshnessAssertionInfo freshnessAssertionInfo = + new FreshnessAssertionInfo() + .setEntity(new Urn("urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)")) + .setType(FreshnessAssertionType.DATASET_CHANGE) + .setFilter( + new DatasetFilter().setType(DatasetFilterType.SQL).setSql("WHERE value > 5;")) + .setSchedule( + new FreshnessAssertionSchedule() + .setType(FreshnessAssertionScheduleType.FIXED_INTERVAL) + .setFixedInterval( + new FixedIntervalSchedule().setUnit(CalendarInterval.DAY).setMultiple(10))); + + com.linkedin.datahub.graphql.generated.FreshnessAssertionInfo result = + FreshnessAssertionMapper.mapFreshnessAssertionInfo(null, freshnessAssertionInfo); + Assert.assertEquals(result.getEntityUrn(), "urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)"); + Assert.assertEquals( + result.getType(), + com.linkedin.datahub.graphql.generated.FreshnessAssertionType.DATASET_CHANGE); + Assert.assertEquals( + result.getFilter().getType(), com.linkedin.datahub.graphql.generated.DatasetFilterType.SQL); + Assert.assertEquals(result.getFilter().getSql(), "WHERE value > 5;"); + Assert.assertEquals( + result.getSchedule().getType(), + com.linkedin.datahub.graphql.generated.FreshnessAssertionScheduleType.FIXED_INTERVAL); + Assert.assertEquals( + result.getSchedule().getFixedInterval().getUnit(), + com.linkedin.datahub.graphql.generated.DateInterval.DAY); + Assert.assertEquals(result.getSchedule().getFixedInterval().getMultiple(), 10); + } +} diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/SqlAssertionMapperTest.java 
b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/SqlAssertionMapperTest.java new file mode 100644 index 0000000000000..271362c9fd846 --- /dev/null +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/SqlAssertionMapperTest.java @@ -0,0 +1,78 @@ +package com.linkedin.datahub.graphql.types.assertion; + +import com.linkedin.assertion.AssertionStdOperator; +import com.linkedin.assertion.AssertionStdParameter; +import com.linkedin.assertion.AssertionStdParameterType; +import com.linkedin.assertion.AssertionStdParameters; +import com.linkedin.assertion.AssertionValueChangeType; +import com.linkedin.assertion.SqlAssertionInfo; +import com.linkedin.assertion.SqlAssertionType; +import com.linkedin.common.urn.Urn; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class SqlAssertionMapperTest { + @Test + public void testMapMetricSqlAssertionInfo() throws Exception { + SqlAssertionInfo sqlAssertionInfo = + new SqlAssertionInfo() + .setEntity(new Urn("urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)")) + .setType(SqlAssertionType.METRIC) + .setStatement("SELECT COUNT(*) FROM foo.bar.baz") + .setOperator(AssertionStdOperator.GREATER_THAN) + .setParameters( + new AssertionStdParameters() + .setValue( + new AssertionStdParameter() + .setType(AssertionStdParameterType.NUMBER) + .setValue(("5")))); + + com.linkedin.datahub.graphql.generated.SqlAssertionInfo result = + SqlAssertionMapper.mapSqlAssertionInfo(sqlAssertionInfo); + Assert.assertEquals(result.getEntityUrn(), "urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)"); + Assert.assertEquals( + result.getType(), com.linkedin.datahub.graphql.generated.SqlAssertionType.METRIC); + Assert.assertEquals(result.getStatement(), "SELECT COUNT(*) FROM foo.bar.baz"); + Assert.assertEquals( + result.getOperator(), + com.linkedin.datahub.graphql.generated.AssertionStdOperator.GREATER_THAN); + Assert.assertEquals( + result.getParameters().getValue().getType(), + com.linkedin.datahub.graphql.generated.AssertionStdParameterType.NUMBER); + Assert.assertEquals(result.getParameters().getValue().getValue(), "5"); + } + + @Test + public void testMapMetricChangeSqlAssertionInfo() throws Exception { + SqlAssertionInfo sqlAssertionInfo = + new SqlAssertionInfo() + .setEntity(new Urn("urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)")) + .setType(SqlAssertionType.METRIC_CHANGE) + .setStatement("SELECT COUNT(*) FROM foo.bar.baz") + .setChangeType(AssertionValueChangeType.ABSOLUTE) + .setOperator(AssertionStdOperator.GREATER_THAN) + .setParameters( + new AssertionStdParameters() + .setValue( + new AssertionStdParameter() + .setType(AssertionStdParameterType.NUMBER) + .setValue(("5")))); + + com.linkedin.datahub.graphql.generated.SqlAssertionInfo result = + SqlAssertionMapper.mapSqlAssertionInfo(sqlAssertionInfo); + Assert.assertEquals(result.getEntityUrn(), "urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)"); + Assert.assertEquals( + result.getType(), com.linkedin.datahub.graphql.generated.SqlAssertionType.METRIC_CHANGE); + Assert.assertEquals(result.getStatement(), "SELECT COUNT(*) FROM foo.bar.baz"); + Assert.assertEquals( + result.getOperator(), + com.linkedin.datahub.graphql.generated.AssertionStdOperator.GREATER_THAN); + Assert.assertEquals( + result.getParameters().getValue().getType(), + com.linkedin.datahub.graphql.generated.AssertionStdParameterType.NUMBER); + Assert.assertEquals(result.getParameters().getValue().getValue(), "5"); + Assert.assertEquals( + result.getChangeType(), + 
com.linkedin.datahub.graphql.generated.AssertionValueChangeType.ABSOLUTE); + } +} diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/VolumeAssertionMapperTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/VolumeAssertionMapperTest.java new file mode 100644 index 0000000000000..f23fadb699207 --- /dev/null +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/VolumeAssertionMapperTest.java @@ -0,0 +1,207 @@ +package com.linkedin.datahub.graphql.types.assertion; + +import com.linkedin.assertion.AssertionStdOperator; +import com.linkedin.assertion.AssertionStdParameter; +import com.linkedin.assertion.AssertionStdParameterType; +import com.linkedin.assertion.AssertionStdParameters; +import com.linkedin.assertion.AssertionValueChangeType; +import com.linkedin.assertion.IncrementingSegmentFieldTransformer; +import com.linkedin.assertion.IncrementingSegmentFieldTransformerType; +import com.linkedin.assertion.IncrementingSegmentRowCountChange; +import com.linkedin.assertion.IncrementingSegmentRowCountTotal; +import com.linkedin.assertion.RowCountChange; +import com.linkedin.assertion.RowCountTotal; +import com.linkedin.assertion.VolumeAssertionInfo; +import com.linkedin.assertion.VolumeAssertionType; +import com.linkedin.common.urn.Urn; +import com.linkedin.dataset.DatasetFilter; +import com.linkedin.dataset.DatasetFilterType; +import com.linkedin.schema.SchemaFieldSpec; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class VolumeAssertionMapperTest { + @Test + public void testMapRowCountTotalVolumeAssertionInfo() throws Exception { + VolumeAssertionInfo volumeAssertionInfo = + new VolumeAssertionInfo() + .setEntity(new Urn("urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)")) + .setType(VolumeAssertionType.ROW_COUNT_TOTAL) + .setFilter( + new DatasetFilter().setType(DatasetFilterType.SQL).setSql("WHERE value > 5;")) + .setRowCountTotal( + new RowCountTotal() + .setOperator(AssertionStdOperator.GREATER_THAN_OR_EQUAL_TO) + .setParameters( + new AssertionStdParameters() + .setValue( + new AssertionStdParameter() + .setType(AssertionStdParameterType.NUMBER) + .setValue("10")))); + + com.linkedin.datahub.graphql.generated.VolumeAssertionInfo result = + VolumeAssertionMapper.mapVolumeAssertionInfo(null, volumeAssertionInfo); + Assert.assertEquals(result.getEntityUrn(), "urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)"); + Assert.assertEquals( + result.getType(), + com.linkedin.datahub.graphql.generated.VolumeAssertionType.ROW_COUNT_TOTAL); + Assert.assertEquals( + result.getFilter().getType(), com.linkedin.datahub.graphql.generated.DatasetFilterType.SQL); + Assert.assertEquals(result.getFilter().getSql(), "WHERE value > 5;"); + Assert.assertEquals( + result.getRowCountTotal().getOperator(), + com.linkedin.datahub.graphql.generated.AssertionStdOperator.GREATER_THAN_OR_EQUAL_TO); + Assert.assertEquals( + result.getRowCountTotal().getParameters().getValue().getType(), + com.linkedin.datahub.graphql.generated.AssertionStdParameterType.NUMBER); + Assert.assertEquals(result.getRowCountTotal().getParameters().getValue().getValue(), "10"); + } + + @Test + public void testMapRowCountChangeVolumeAssertionInfo() throws Exception { + VolumeAssertionInfo volumeAssertionInfo = + new VolumeAssertionInfo() + .setEntity(new Urn("urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)")) + .setType(VolumeAssertionType.ROW_COUNT_CHANGE) + .setFilter( + new 
DatasetFilter().setType(DatasetFilterType.SQL).setSql("WHERE value > 5;")) + .setRowCountChange( + new RowCountChange() + .setOperator(AssertionStdOperator.GREATER_THAN_OR_EQUAL_TO) + .setParameters( + new AssertionStdParameters() + .setValue( + new AssertionStdParameter() + .setType(AssertionStdParameterType.NUMBER) + .setValue("10"))) + .setType(AssertionValueChangeType.ABSOLUTE)); + + com.linkedin.datahub.graphql.generated.VolumeAssertionInfo result = + VolumeAssertionMapper.mapVolumeAssertionInfo(null, volumeAssertionInfo); + Assert.assertEquals(result.getEntityUrn(), "urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)"); + Assert.assertEquals( + result.getType(), + com.linkedin.datahub.graphql.generated.VolumeAssertionType.ROW_COUNT_CHANGE); + Assert.assertEquals( + result.getFilter().getType(), com.linkedin.datahub.graphql.generated.DatasetFilterType.SQL); + Assert.assertEquals(result.getFilter().getSql(), "WHERE value > 5;"); + Assert.assertEquals( + result.getRowCountChange().getOperator(), + com.linkedin.datahub.graphql.generated.AssertionStdOperator.GREATER_THAN_OR_EQUAL_TO); + Assert.assertEquals( + result.getRowCountChange().getParameters().getValue().getType(), + com.linkedin.datahub.graphql.generated.AssertionStdParameterType.NUMBER); + Assert.assertEquals(result.getRowCountChange().getParameters().getValue().getValue(), "10"); + Assert.assertEquals( + result.getRowCountChange().getType(), + com.linkedin.datahub.graphql.generated.AssertionValueChangeType.ABSOLUTE); + } + + @Test + public void testMapIncrementingSegmentRowCountTotalVolumeAssertionInfo() throws Exception { + VolumeAssertionInfo volumeAssertionInfo = + new VolumeAssertionInfo() + .setEntity(new Urn("urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)")) + .setType(VolumeAssertionType.INCREMENTING_SEGMENT_ROW_COUNT_TOTAL) + .setIncrementingSegmentRowCountTotal( + new IncrementingSegmentRowCountTotal() + .setOperator(AssertionStdOperator.GREATER_THAN_OR_EQUAL_TO) + .setParameters( + new AssertionStdParameters() + .setValue( + new AssertionStdParameter() + .setType(AssertionStdParameterType.NUMBER) + .setValue("10"))) + .setSegment( + new com.linkedin.assertion.IncrementingSegmentSpec() + .setField( + new SchemaFieldSpec() + .setPath("path") + .setNativeType("VARCHAR") + .setType("STRING")) + .setTransformer( + new IncrementingSegmentFieldTransformer() + .setType(IncrementingSegmentFieldTransformerType.CEILING) + .setNativeType("CEILING")))); + + com.linkedin.datahub.graphql.generated.VolumeAssertionInfo result = + VolumeAssertionMapper.mapVolumeAssertionInfo(null, volumeAssertionInfo); + Assert.assertEquals(result.getEntityUrn(), "urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)"); + Assert.assertEquals( + result.getType(), + com.linkedin.datahub.graphql.generated.VolumeAssertionType + .INCREMENTING_SEGMENT_ROW_COUNT_TOTAL); + Assert.assertEquals( + result.getIncrementingSegmentRowCountTotal().getOperator(), + com.linkedin.datahub.graphql.generated.AssertionStdOperator.GREATER_THAN_OR_EQUAL_TO); + Assert.assertEquals( + result.getIncrementingSegmentRowCountTotal().getParameters().getValue().getType(), + com.linkedin.datahub.graphql.generated.AssertionStdParameterType.NUMBER); + Assert.assertEquals( + result.getIncrementingSegmentRowCountTotal().getParameters().getValue().getValue(), "10"); + Assert.assertEquals( + result.getIncrementingSegmentRowCountTotal().getSegment().getField().getPath(), "path"); + Assert.assertEquals( + result.getIncrementingSegmentRowCountTotal().getSegment().getField().getNativeType(), + 
"VARCHAR"); + Assert.assertEquals( + result.getIncrementingSegmentRowCountTotal().getSegment().getField().getType(), "STRING"); + Assert.assertEquals( + result.getIncrementingSegmentRowCountTotal().getSegment().getTransformer().getType(), + com.linkedin.datahub.graphql.generated.IncrementingSegmentFieldTransformerType.CEILING); + Assert.assertEquals( + result.getIncrementingSegmentRowCountTotal().getSegment().getTransformer().getNativeType(), + "CEILING"); + } + + @Test + public void testMapIncrementingSegmentRowCountChangeVolumeAssertionInfo() throws Exception { + VolumeAssertionInfo volumeAssertionInfo = + new VolumeAssertionInfo() + .setEntity(new Urn("urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)")) + .setType(VolumeAssertionType.INCREMENTING_SEGMENT_ROW_COUNT_CHANGE) + .setIncrementingSegmentRowCountChange( + new IncrementingSegmentRowCountChange() + .setType(AssertionValueChangeType.ABSOLUTE) + .setOperator(AssertionStdOperator.GREATER_THAN_OR_EQUAL_TO) + .setParameters( + new AssertionStdParameters() + .setValue( + new AssertionStdParameter() + .setType(AssertionStdParameterType.NUMBER) + .setValue("10"))) + .setSegment( + new com.linkedin.assertion.IncrementingSegmentSpec() + .setField( + new SchemaFieldSpec() + .setPath("path") + .setNativeType("VARCHAR") + .setType("STRING")))); + + com.linkedin.datahub.graphql.generated.VolumeAssertionInfo result = + VolumeAssertionMapper.mapVolumeAssertionInfo(null, volumeAssertionInfo); + Assert.assertEquals(result.getEntityUrn(), "urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)"); + Assert.assertEquals( + result.getType(), + com.linkedin.datahub.graphql.generated.VolumeAssertionType + .INCREMENTING_SEGMENT_ROW_COUNT_CHANGE); + Assert.assertEquals( + result.getIncrementingSegmentRowCountChange().getType(), + com.linkedin.datahub.graphql.generated.AssertionValueChangeType.ABSOLUTE); + Assert.assertEquals( + result.getIncrementingSegmentRowCountChange().getOperator(), + com.linkedin.datahub.graphql.generated.AssertionStdOperator.GREATER_THAN_OR_EQUAL_TO); + Assert.assertEquals( + result.getIncrementingSegmentRowCountChange().getParameters().getValue().getType(), + com.linkedin.datahub.graphql.generated.AssertionStdParameterType.NUMBER); + Assert.assertEquals( + result.getIncrementingSegmentRowCountChange().getParameters().getValue().getValue(), "10"); + Assert.assertEquals( + result.getIncrementingSegmentRowCountChange().getSegment().getField().getPath(), "path"); + Assert.assertEquals( + result.getIncrementingSegmentRowCountChange().getSegment().getField().getNativeType(), + "VARCHAR"); + Assert.assertEquals( + result.getIncrementingSegmentRowCountChange().getSegment().getField().getType(), "STRING"); + } +} diff --git a/datahub-web-react/src/app/entity/shared/EntityDropdown/useDeleteEntity.tsx b/datahub-web-react/src/app/entity/shared/EntityDropdown/useDeleteEntity.tsx index 171a36b1cfbcc..aa00e8ebc879d 100644 --- a/datahub-web-react/src/app/entity/shared/EntityDropdown/useDeleteEntity.tsx +++ b/datahub-web-react/src/app/entity/shared/EntityDropdown/useDeleteEntity.tsx @@ -29,16 +29,10 @@ function useDeleteEntity( const { isInGlossaryContext, urnsToUpdate, setUrnsToUpdate } = useGlossaryEntityData(); const { handleDeleteDomain } = useHandleDeleteDomain({ entityData, urn }); - const maybeDeleteEntity = getDeleteEntityMutation(type)(); - const deleteEntity = (maybeDeleteEntity && maybeDeleteEntity[0]) || undefined; - const client = maybeDeleteEntity?.[1].client; + const [deleteEntity, { client }] = getDeleteEntityMutation(type)() ?? 
[undefined, { client: undefined }]; function handleDeleteEntity() { - deleteEntity?.({ - variables: { - urn, - }, - }) + deleteEntity?.({ variables: { urn } }) .then(() => { analytics.event({ type: EventType.DeleteEntityEvent, @@ -56,10 +50,6 @@ function useDeleteEntity( handleDeleteDomain(); } - if (client && entityData.type === EntityType.GlossaryTerm && entityData?.parentNodes?.nodes) { - removeTermFromGlossaryNode(client, entityData.parentNodes.nodes[0].urn, urn); - } - setTimeout( () => { setHasBeenDeleted(true); @@ -67,6 +57,9 @@ function useDeleteEntity( if (isInGlossaryContext) { const parentNodeToUpdate = getParentNodeToUpdate(entityData, type); updateGlossarySidebar([parentNodeToUpdate], urnsToUpdate, setUrnsToUpdate); + if (client) { + removeTermFromGlossaryNode(client, parentNodeToUpdate, urn); + } } if (!hideMessage) { message.success({ diff --git a/docs-website/graphql/generateGraphQLSchema.sh b/docs-website/graphql/generateGraphQLSchema.sh index da14fbc337f90..a904a2e36d7c1 100755 --- a/docs-website/graphql/generateGraphQLSchema.sh +++ b/docs-website/graphql/generateGraphQLSchema.sh @@ -9,6 +9,7 @@ cat ../../datahub-graphql-core/src/main/resources/app.graphql >> combined.graphq cat ../../datahub-graphql-core/src/main/resources/auth.graphql >> combined.graphql cat ../../datahub-graphql-core/src/main/resources/constraints.graphql >> combined.graphql cat ../../datahub-graphql-core/src/main/resources/entity.graphql >> combined.graphql +cat ../../datahub-graphql-core/src/main/resources/assertions.graphql >> combined.graphql cat ../../datahub-graphql-core/src/main/resources/ingestion.graphql >> combined.graphql cat ../../datahub-graphql-core/src/main/resources/recommendation.graphql >> combined.graphql cat ../../datahub-graphql-core/src/main/resources/search.graphql >> combined.graphql diff --git a/li-utils/src/main/java/com/linkedin/metadata/Constants.java b/li-utils/src/main/java/com/linkedin/metadata/Constants.java index 66ed48a428a21..79ae0fbeacd94 100644 --- a/li-utils/src/main/java/com/linkedin/metadata/Constants.java +++ b/li-utils/src/main/java/com/linkedin/metadata/Constants.java @@ -285,6 +285,7 @@ public class Constants { public static final String ASSERTION_INFO_ASPECT_NAME = "assertionInfo"; public static final String ASSERTION_RUN_EVENT_ASPECT_NAME = "assertionRunEvent"; public static final String ASSERTION_RUN_EVENT_STATUS_COMPLETE = "COMPLETE"; + public static final String ASSERTION_ACTIONS_ASPECT_NAME = "assertionActions"; // Tests public static final String TEST_ENTITY_NAME = "test"; diff --git a/metadata-dao-impl/kafka-producer/build.gradle b/metadata-dao-impl/kafka-producer/build.gradle index bc3415b2ccc8c..2df15309810db 100644 --- a/metadata-dao-impl/kafka-producer/build.gradle +++ b/metadata-dao-impl/kafka-producer/build.gradle @@ -18,6 +18,7 @@ dependencies { annotationProcessor externalDependency.lombok testImplementation externalDependency.mockito + testImplementation externalDependency.testng constraints { implementation(externalDependency.log4jCore) { diff --git a/metadata-dao-impl/kafka-producer/src/main/java/com/datahub/metadata/dao/producer/KafkaProducerThrottle.java b/metadata-dao-impl/kafka-producer/src/main/java/com/datahub/metadata/dao/producer/KafkaProducerThrottle.java new file mode 100644 index 0000000000000..8fbb34b1eacd6 --- /dev/null +++ b/metadata-dao-impl/kafka-producer/src/main/java/com/datahub/metadata/dao/producer/KafkaProducerThrottle.java @@ -0,0 +1,246 @@ +package com.datahub.metadata.dao.producer; + +import 
com.codahale.metrics.Gauge; +import com.google.common.annotations.VisibleForTesting; +import com.linkedin.metadata.config.MetadataChangeProposalConfig; +import com.linkedin.metadata.models.registry.EntityRegistry; +import com.linkedin.metadata.utils.metrics.MetricUtils; +import com.linkedin.util.Pair; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.function.Consumer; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import javax.annotation.Nonnull; +import lombok.Builder; +import lombok.extern.slf4j.Slf4j; +import org.apache.kafka.clients.admin.Admin; +import org.apache.kafka.clients.admin.OffsetSpec; +import org.apache.kafka.clients.consumer.OffsetAndMetadata; +import org.apache.kafka.common.TopicPartition; +import org.springframework.util.backoff.BackOffExecution; +import org.springframework.util.backoff.ExponentialBackOff; + +@Slf4j +@Builder(toBuilder = true) +public class KafkaProducerThrottle { + @Nonnull private final EntityRegistry entityRegistry; + @Nonnull private final Admin kafkaAdmin; + @Nonnull private final MetadataChangeProposalConfig.ThrottlesConfig config; + @Nonnull private final String mclConsumerGroupId; + @Nonnull private final String versionedTopicName; + @Nonnull private final String timeseriesTopicName; + @Nonnull private final Consumer pauseConsumer; + + private final ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(1); + private final Map medianLag = new ConcurrentHashMap<>(); + private final Map backoffMap = new ConcurrentHashMap<>(); + + /** Update lag information at a given rate */ + public KafkaProducerThrottle start() { + if ((config.getVersioned().isEnabled() || config.getTimeseries().isEnabled()) + && config.getUpdateIntervalMs() > 0) { + scheduler.scheduleAtFixedRate( + () -> { + refresh(); + try { + throttle(); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + }, + config.getUpdateIntervalMs(), + config.getUpdateIntervalMs(), + TimeUnit.MILLISECONDS); + } + return this; + } + + @VisibleForTesting + public void refresh() { + medianLag.putAll(getMedianLag()); + log.info("MCL medianLag: {}", medianLag); + } + + @VisibleForTesting + public void stop() { + scheduler.shutdown(); + } + + /** + * Get copy of the lag info + * + * @return median lag per mcl topic + */ + @VisibleForTesting + public Map getLag() { + return medianLag.entrySet().stream() + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + } + + @VisibleForTesting + public boolean isThrottled(MclType mclType) { + if (getThrottleConfig(mclType).isEnabled() && medianLag.containsKey(mclType)) { + return medianLag.get(mclType) > getThrottleConfig(mclType).getThreshold(); + } + return false; + } + + @VisibleForTesting + public long computeNextBackOff(MclType mclType) { + if (isThrottled(mclType)) { + BackOffExecution backOffExecution = + backoffMap.computeIfAbsent( + mclType, + k -> { + MetadataChangeProposalConfig.ThrottleConfig throttleConfig = + getThrottleConfig(mclType); + ExponentialBackOff backoff = + new ExponentialBackOff( + throttleConfig.getInitialIntervalMs(), throttleConfig.getMultiplier()); + backoff.setMaxAttempts(throttleConfig.getMaxAttempts()); + backoff.setMaxInterval(throttleConfig.getMaxIntervalMs()); + return backoff.start(); 
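For orientation, a hedged sketch of how `KafkaProducerThrottle` is meant to be wired, mirroring the builder usage in `KafkaProducerThrottleTest` further below. Where the `Admin` client, config, registry, and pause callback come from is an assumption, not code in this PR.

```java
import java.util.function.Consumer;

import org.apache.kafka.clients.admin.Admin;

import com.datahub.metadata.dao.producer.KafkaProducerThrottle;
import com.linkedin.metadata.config.MetadataChangeProposalConfig;
import com.linkedin.metadata.models.registry.EntityRegistry;
import com.linkedin.mxe.Topics;

/** Sketch only: mirrors the builder usage in KafkaProducerThrottleTest. The pause callback
 *  is assumed to pause/resume the MCE consumer (e.g. a listener container) on the caller's side. */
public class ThrottleWiringSketch {

  public static KafkaProducerThrottle wire(
      MetadataChangeProposalConfig config,
      Admin kafkaAdmin,
      EntityRegistry entityRegistry,
      Consumer<Boolean> pauseMceConsumer) {
    return KafkaProducerThrottle.builder()
        .config(config.getThrottle())
        .kafkaAdmin(kafkaAdmin)
        .versionedTopicName(Topics.METADATA_CHANGE_LOG_VERSIONED)
        .timeseriesTopicName(Topics.METADATA_CHANGE_LOG_TIMESERIES)
        .entityRegistry(entityRegistry)
        .mclConsumerGroupId("generic-mae-consumer-job-client")
        .pauseConsumer(pauseMceConsumer)
        .build()
        .start(); // scheduling is a no-op when updateIntervalMs is 0 or both throttles are disabled
  }
}
```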
+ }); + return backOffExecution.nextBackOff(); + } + return 0; + } + + @VisibleForTesting + public void throttle() throws InterruptedException { + for (MclType mclType : MclType.values()) { + if (isThrottled(mclType)) { + long backoffWaitMs = computeNextBackOff(mclType); + + if (backoffWaitMs > 0) { + log.warn( + "Throttled producer Topic: {} Duration: {} ms MedianLag: {}", + getTopicName(mclType), + backoffWaitMs, + medianLag.get(mclType)); + MetricUtils.gauge( + this.getClass(), + String.format("%s_throttled", getTopicName(mclType)), + () -> (Gauge) () -> 1); + MetricUtils.counter( + this.getClass(), String.format("%s_throttledCount", getTopicName(mclType))) + .inc(); + + log.info("Pausing MCE consumer for {} ms.", backoffWaitMs); + pauseConsumer.accept(true); + Thread.sleep(backoffWaitMs); + log.info("Resuming MCE consumer."); + pauseConsumer.accept(false); + + // if throttled for one topic, skip remaining + return; + } else { + // no throttle or exceeded configuration limits + log.info("MCE consumer throttle exponential backoff reset."); + backoffMap.remove(mclType); + MetricUtils.gauge( + this.getClass(), + String.format("%s_throttled", getTopicName(mclType)), + () -> (Gauge) () -> 0); + } + } else { + // not throttled, remove backoff tracking + log.info("MCE consumer throttle exponential backoff reset."); + backoffMap.remove(mclType); + MetricUtils.gauge( + this.getClass(), + String.format("%s_throttled", getTopicName(mclType)), + () -> (Gauge) () -> 0); + } + } + } + + private Map getMedianLag() { + try { + Map mclConsumerOffsets = + kafkaAdmin + .listConsumerGroupOffsets(mclConsumerGroupId) + .partitionsToOffsetAndMetadata() + .get() + .entrySet() + .stream() + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + + Map latestOffsetRequest = + mclConsumerOffsets.keySet().stream() + .map(offsetAndMetadata -> Map.entry(offsetAndMetadata, OffsetSpec.latest())) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + Map endOffsetValues = + kafkaAdmin.listOffsets(latestOffsetRequest).all().get().entrySet().stream() + .map(entry -> Map.entry(entry.getKey(), entry.getValue().offset())) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + + return Stream.of( + Pair.of(MclType.VERSIONED, versionedTopicName), + Pair.of(MclType.TIMESERIES, timeseriesTopicName)) + .map( + topic -> { + MclType mclType = topic.getFirst(); + String topicName = topic.getSecond(); + + Map topicOffsets = + mclConsumerOffsets.entrySet().stream() + .filter(entry -> entry.getKey().topic().equals(topicName)) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + List offsetValues = + topicOffsets.values().stream() + .map(OffsetAndMetadata::offset) + .map(Long::doubleValue) + .collect(Collectors.toList()); + long offsetMedian = getMedian(offsetValues).longValue(); + + List topicEndOffsetValues = + topicOffsets.keySet().stream() + .map(topicPart -> endOffsetValues.getOrDefault(topicPart, 0L)) + .map(Long::doubleValue) + .collect(Collectors.toList()); + long endOffsetMedian = getMedian(topicEndOffsetValues).longValue(); + return Map.entry(mclType, Math.max(0, endOffsetMedian - offsetMedian)); + }) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + } catch (ExecutionException | InterruptedException e) { + log.error("Error fetching consumer group offsets.", e); + return Map.of(MclType.VERSIONED, 0L, MclType.TIMESERIES, 0L); + } + } + + private MetadataChangeProposalConfig.ThrottleConfig getThrottleConfig(MclType mclType) { + 
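The backoff arithmetic in `computeNextBackOff` comes directly from Spring's `ExponentialBackOff`. A tiny standalone demo of the sequence that `KafkaProducerThrottleTest#testBackOff` expects (initial interval 1 ms, multiplier 2, max interval 8 ms, max 5 attempts):

```java
import org.springframework.util.backoff.BackOffExecution;
import org.springframework.util.backoff.ExponentialBackOff;

// Standalone demo of the backoff sequence exercised in KafkaProducerThrottleTest#testBackOff.
public class BackoffSequenceDemo {
  public static void main(String[] args) {
    ExponentialBackOff backoff = new ExponentialBackOff(1, 2.0); // initialIntervalMs, multiplier
    backoff.setMaxAttempts(5); // requires Spring Framework 6.1+, the same API the throttle uses
    backoff.setMaxInterval(8);

    BackOffExecution execution = backoff.start();
    long wait;
    while ((wait = execution.nextBackOff()) != BackOffExecution.STOP) {
      System.out.println(wait); // prints 1, 2, 4, 8, 8
    }
    // BackOffExecution.STOP is -1; throttle() treats any value <= 0 as "reset the backoff state"
    System.out.println(BackOffExecution.STOP);
  }
}
```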
MetadataChangeProposalConfig.ThrottleConfig throttleConfig; + switch (mclType) { + case VERSIONED -> throttleConfig = config.getVersioned(); + case TIMESERIES -> throttleConfig = config.getTimeseries(); + default -> throw new IllegalStateException(); + } + return throttleConfig; + } + + private String getTopicName(MclType mclType) { + return MclType.TIMESERIES.equals(mclType) ? timeseriesTopicName : versionedTopicName; + } + + private static Double getMedian(Collection listValues) { + double[] values = listValues.stream().mapToDouble(d -> d).sorted().toArray(); + double median; + if (values.length % 2 == 0) + median = (values[values.length / 2] + values[values.length / 2 - 1]) / 2; + else median = values[values.length / 2]; + return median; + } + + public enum MclType { + TIMESERIES, + VERSIONED + } +} diff --git a/metadata-dao-impl/kafka-producer/src/test/java/com/datahub/metadata/dao/producer/KafkaProducerThrottleTest.java b/metadata-dao-impl/kafka-producer/src/test/java/com/datahub/metadata/dao/producer/KafkaProducerThrottleTest.java new file mode 100644 index 0000000000000..ce6104ee2ca7d --- /dev/null +++ b/metadata-dao-impl/kafka-producer/src/test/java/com/datahub/metadata/dao/producer/KafkaProducerThrottleTest.java @@ -0,0 +1,363 @@ +package com.datahub.metadata.dao.producer; + +import static org.mockito.ArgumentMatchers.anyMap; +import static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.reset; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.verifyNoInteractions; +import static org.mockito.Mockito.verifyNoMoreInteractions; +import static org.mockito.Mockito.when; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; + +import com.linkedin.metadata.config.MetadataChangeProposalConfig; +import com.linkedin.metadata.models.registry.EntityRegistry; +import com.linkedin.mxe.Topics; +import com.linkedin.util.Pair; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ExecutionException; +import java.util.function.Consumer; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import org.apache.kafka.clients.admin.AdminClient; +import org.apache.kafka.clients.admin.ListConsumerGroupOffsetsResult; +import org.apache.kafka.clients.admin.ListOffsetsResult; +import org.apache.kafka.clients.consumer.OffsetAndMetadata; +import org.apache.kafka.common.KafkaFuture; +import org.apache.kafka.common.TopicPartition; +import org.testng.annotations.Test; + +public class KafkaProducerThrottleTest { + private static final List STANDARD_TOPICS = + List.of(Topics.METADATA_CHANGE_LOG_VERSIONED, Topics.METADATA_CHANGE_LOG_TIMESERIES); + private static final String STANDARD_MCL_CONSUMER_GROUP_ID = "generic-mae-consumer-job-client"; + + @Test + public void testLagCalculation() throws ExecutionException, InterruptedException { + // 3 partitions + // Consumer offsets: 1, 2, 3 + // End offsets: 2, 4, 6 + // Lag: 1, 2, 3 + // MedianLag: 2 + AdminClient mockAdmin = + mockKafka( + generateLag( + STANDARD_TOPICS, + topicPart -> (long) topicPart.partition() + 1, + topicPart -> ((long) topicPart.partition() + 1) * 2, + 3)); + + KafkaProducerThrottle test = + KafkaProducerThrottle.builder() + .config(noSchedulerConfig().getThrottle()) + .kafkaAdmin(mockAdmin) + 
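To make the lag computation in `getMedianLag` concrete: per topic it takes the median committed consumer offset and the median end offset across partitions and reports their non-negative difference, rather than the median of per-partition lags. A small worked example using the same numbers as the test fixture below (consumer offsets 1/2/3, end offsets 2/4/6):

```java
import java.util.List;

// Worked example of the median-of-offsets lag computation in KafkaProducerThrottle#getMedianLag,
// using the same numbers as the test fixture (3 partitions per topic).
public class MedianLagExample {

  static double median(List<Double> values) {
    double[] sorted = values.stream().mapToDouble(Double::doubleValue).sorted().toArray();
    int n = sorted.length;
    return n % 2 == 0 ? (sorted[n / 2] + sorted[n / 2 - 1]) / 2 : sorted[n / 2];
  }

  public static void main(String[] args) {
    List<Double> consumerOffsets = List.of(1.0, 2.0, 3.0); // committed offsets per partition
    List<Double> endOffsets = List.of(2.0, 4.0, 6.0);      // latest offsets per partition

    long lag = Math.max(0, (long) median(endOffsets) - (long) median(consumerOffsets));
    System.out.println(lag); // 4 - 2 = 2, matching the "MedianLag: 2" expectation in the tests
  }
}
```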
.versionedTopicName(STANDARD_TOPICS.get(0)) + .timeseriesTopicName(STANDARD_TOPICS.get(1)) + .entityRegistry(mock(EntityRegistry.class)) + .mclConsumerGroupId(STANDARD_MCL_CONSUMER_GROUP_ID) + .pauseConsumer(mock(Consumer.class)) + .build(); + + // Refresh calculations + test.refresh(); + + assertEquals( + test.getLag(), + Map.of( + KafkaProducerThrottle.MclType.VERSIONED, 2L, + KafkaProducerThrottle.MclType.TIMESERIES, 2L)); + } + + @Test + public void testThrottle() throws ExecutionException, InterruptedException { + MetadataChangeProposalConfig.ThrottlesConfig noThrottleConfig = + noSchedulerConfig().getThrottle(); + noThrottleConfig + .getVersioned() + .setThreshold(10) + .setInitialIntervalMs(1) + .setMultiplier(1) + .setMaxAttempts(1) + .setMaxIntervalMs(1); + + MetadataChangeProposalConfig.ThrottlesConfig throttleConfig = noSchedulerConfig().getThrottle(); + throttleConfig + .getVersioned() + .setThreshold(1) + .setInitialIntervalMs(1) + .setMultiplier(1) + .setMaxAttempts(1) + .setMaxIntervalMs(1); + + // 3 partitions + // Consumer offsets: 1, 2, 3 + // End offsets: 2, 4, 6 + // Lag: 1, 2, 3 + // MedianLag: 2 + AdminClient mockAdmin = + mockKafka( + generateLag( + STANDARD_TOPICS, + topicPart -> (long) topicPart.partition() + 1, + topicPart -> ((long) topicPart.partition() + 1) * 2, + 3)); + + Consumer pauseFunction = mock(Consumer.class); + + KafkaProducerThrottle test = + KafkaProducerThrottle.builder() + .config(noThrottleConfig) + .kafkaAdmin(mockAdmin) + .versionedTopicName(STANDARD_TOPICS.get(0)) + .timeseriesTopicName(STANDARD_TOPICS.get(1)) + .entityRegistry(mock(EntityRegistry.class)) + .mclConsumerGroupId(STANDARD_MCL_CONSUMER_GROUP_ID) + .pauseConsumer(pauseFunction) + .build(); + + // Refresh calculations + test.refresh(); + assertEquals( + test.getLag(), + Map.of( + KafkaProducerThrottle.MclType.VERSIONED, 2L, + KafkaProducerThrottle.MclType.TIMESERIES, 2L)); + assertFalse( + test.isThrottled(KafkaProducerThrottle.MclType.VERSIONED), + "Expected not throttling, lag is below threshold"); + assertFalse(test.isThrottled(KafkaProducerThrottle.MclType.TIMESERIES)); + test.throttle(); + verifyNoInteractions(pauseFunction); + reset(pauseFunction); + + KafkaProducerThrottle test2 = test.toBuilder().config(throttleConfig).build(); + // Refresh calculations + test2.refresh(); + assertEquals( + test2.getLag(), + Map.of( + KafkaProducerThrottle.MclType.VERSIONED, 2L, + KafkaProducerThrottle.MclType.TIMESERIES, 2L)); + assertTrue( + test2.isThrottled(KafkaProducerThrottle.MclType.VERSIONED), + "Expected throttling, lag is above threshold."); + assertFalse( + test2.isThrottled(KafkaProducerThrottle.MclType.TIMESERIES), + "Expected not throttling. 
Timeseries is disabled"); + test2.throttle(); + + // verify 1ms pause and resume + verify(pauseFunction).accept(eq(true)); + verify(pauseFunction).accept(eq(false)); + verifyNoMoreInteractions(pauseFunction); + } + + @Test + public void testBackOff() throws ExecutionException, InterruptedException { + MetadataChangeProposalConfig.ThrottlesConfig throttleConfig = noSchedulerConfig().getThrottle(); + throttleConfig + .getVersioned() + .setThreshold(1) + .setInitialIntervalMs(1) + .setMultiplier(2) + .setMaxAttempts(5) + .setMaxIntervalMs(8); + + // 3 partitions + // Consumer offsets: 1, 2, 3 + // End offsets: 2, 4, 6 + // Lag: 1, 2, 3 + // MedianLag: 2 + AdminClient mockAdmin = + mockKafka( + generateLag( + STANDARD_TOPICS, + topicPart -> (long) topicPart.partition() + 1, + topicPart -> ((long) topicPart.partition() + 1) * 2, + 3)); + + KafkaProducerThrottle test = + KafkaProducerThrottle.builder() + .config(throttleConfig) + .kafkaAdmin(mockAdmin) + .versionedTopicName(STANDARD_TOPICS.get(0)) + .timeseriesTopicName(STANDARD_TOPICS.get(1)) + .entityRegistry(mock(EntityRegistry.class)) + .mclConsumerGroupId(STANDARD_MCL_CONSUMER_GROUP_ID) + .pauseConsumer(mock(Consumer.class)) + .build(); + + // Refresh calculations + test.refresh(); + assertEquals( + test.getLag(), + Map.of( + KafkaProducerThrottle.MclType.VERSIONED, 2L, + KafkaProducerThrottle.MclType.TIMESERIES, 2L)); + assertTrue( + test.isThrottled(KafkaProducerThrottle.MclType.VERSIONED), + "Expected throttling, lag is above threshold."); + assertFalse( + test.isThrottled(KafkaProducerThrottle.MclType.TIMESERIES), + "Expected no throttling. Timeseries is disabled"); + + assertEquals( + test.computeNextBackOff(KafkaProducerThrottle.MclType.TIMESERIES), + 0L, + "Expected no backoff. Timeseries is disabled."); + + assertEquals( + test.computeNextBackOff(KafkaProducerThrottle.MclType.VERSIONED), 1L, "Expected initial 1"); + assertEquals( + test.computeNextBackOff(KafkaProducerThrottle.MclType.VERSIONED), + 2L, + "Expected second 2^1"); + assertEquals( + test.computeNextBackOff(KafkaProducerThrottle.MclType.VERSIONED), 4L, "Expected third 2^2"); + assertEquals( + test.computeNextBackOff(KafkaProducerThrottle.MclType.VERSIONED), + 8L, + "Expected fourth 2^3"); + assertEquals( + test.computeNextBackOff(KafkaProducerThrottle.MclType.VERSIONED), + 8L, + "Expected fifth max interval at 8"); + assertEquals( + test.computeNextBackOff(KafkaProducerThrottle.MclType.VERSIONED), + -1L, + "Expected max attempts"); + } + + @Test + public void testScheduler() throws ExecutionException, InterruptedException { + MetadataChangeProposalConfig config = new MetadataChangeProposalConfig(); + MetadataChangeProposalConfig.ThrottlesConfig throttlesConfig = + new MetadataChangeProposalConfig.ThrottlesConfig() + .setUpdateIntervalMs(10); // configure fast update for test + throttlesConfig.setVersioned( + new MetadataChangeProposalConfig.ThrottleConfig() + .setEnabled(true) // enable 1 throttle config to activate + ); + throttlesConfig.setTimeseries( + new MetadataChangeProposalConfig.ThrottleConfig().setEnabled(false)); + config.setThrottle(throttlesConfig); + + // 1 lag, 1 partition + AdminClient mockAdmin = + mockKafka(generateLag(STANDARD_TOPICS, topicPart -> 1L, topicPart -> 2L, 1)); + + KafkaProducerThrottle test = + KafkaProducerThrottle.builder() + .config(throttlesConfig) + .kafkaAdmin(mockAdmin) + .versionedTopicName(STANDARD_TOPICS.get(0)) + .timeseriesTopicName(STANDARD_TOPICS.get(1)) + .entityRegistry(mock(EntityRegistry.class)) + 
.mclConsumerGroupId(STANDARD_MCL_CONSUMER_GROUP_ID) + .pauseConsumer(mock(Consumer.class)) + .build(); + + try { + test.start(); + Thread.sleep(50); + assertEquals( + test.getLag(), + Map.of( + KafkaProducerThrottle.MclType.VERSIONED, 1L, + KafkaProducerThrottle.MclType.TIMESERIES, 1L), + "Expected lag updated"); + } finally { + test.stop(); + } + } + + private static MetadataChangeProposalConfig noSchedulerConfig() { + MetadataChangeProposalConfig config = new MetadataChangeProposalConfig(); + MetadataChangeProposalConfig.ThrottlesConfig throttlesConfig = + new MetadataChangeProposalConfig.ThrottlesConfig() + .setUpdateIntervalMs(0); // no scheduler, manual update + throttlesConfig.setVersioned( + new MetadataChangeProposalConfig.ThrottleConfig() + .setEnabled(true) // enable 1 throttle config to activate + ); + throttlesConfig.setTimeseries( + new MetadataChangeProposalConfig.ThrottleConfig().setEnabled(false)); + config.setThrottle(throttlesConfig); + return config; + } + + private static Pair, Map> + generateLag( + Collection topicNames, + Function consumerOffset, + Function endOffset, + int partitions) { + + Set topicPartitions = + topicNames.stream() + .flatMap( + topicName -> + IntStream.range(0, partitions) + .mapToObj(partitionNum -> new TopicPartition(topicName, partitionNum))) + .collect(Collectors.toSet()); + + Map consumerOffsetMap = + topicPartitions.stream() + .map( + topicPartition -> + Map.entry( + topicPartition, + new OffsetAndMetadata(consumerOffset.apply(topicPartition)))) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + Map endOffsetMap = + topicPartitions.stream() + .map(topicPartition -> Map.entry(topicPartition, endOffset.apply(topicPartition))) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + + return Pair.of(consumerOffsetMap, endOffsetMap); + } + + private static AdminClient mockKafka( + Pair, Map> offsetPair) + throws ExecutionException, InterruptedException { + + AdminClient mockKafkaAdmin = mock(AdminClient.class); + + // consumer offsets + ListConsumerGroupOffsetsResult mockConsumerOffsetsResult = + mock(ListConsumerGroupOffsetsResult.class); + KafkaFuture> mockConsumerFuture = + mock(KafkaFuture.class); + when(mockConsumerOffsetsResult.partitionsToOffsetAndMetadata()).thenReturn(mockConsumerFuture); + when(mockConsumerFuture.get()).thenReturn(offsetPair.getFirst()); + when(mockKafkaAdmin.listConsumerGroupOffsets(anyString())) + .thenReturn(mockConsumerOffsetsResult); + + // end offsets + ListOffsetsResult mockOffsetsResult = mock(ListOffsetsResult.class); + KafkaFuture> mockOffsetFuture = + mock(KafkaFuture.class); + Map resultMap = + offsetPair.getSecond().entrySet().stream() + .map( + entry -> { + ListOffsetsResult.ListOffsetsResultInfo mockInfo = + mock(ListOffsetsResult.ListOffsetsResultInfo.class); + when(mockInfo.offset()).thenReturn(entry.getValue()); + return Map.entry(entry.getKey(), mockInfo); + }) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + when(mockOffsetFuture.get()).thenReturn(resultMap); + when(mockOffsetsResult.all()).thenReturn(mockOffsetFuture); + when(mockKafkaAdmin.listOffsets(anyMap())).thenReturn(mockOffsetsResult); + + return mockKafkaAdmin; + } +} diff --git a/metadata-ingestion/docs/sources/athena/athena_pre.md b/metadata-ingestion/docs/sources/athena/athena_pre.md index a56457d3f84fc..fdb2b136a5255 100644 --- a/metadata-ingestion/docs/sources/athena/athena_pre.md +++ b/metadata-ingestion/docs/sources/athena/athena_pre.md @@ -35,7 +35,7 @@ In order to execute 
this source, you will need to create a policy with below per "glue:GetPartitions", "s3:GetObject", "s3:ListBucket", - "s3:GetBucketLocation", + "s3:GetBucketLocation" ], "Resource": [ "arn:aws:athena:${region-id}:${account-id}:datacatalog/*", @@ -64,9 +64,9 @@ In order to execute this source, you will need to create a policy with below per "arn:aws:s3:::${athena-query-result-bucket}/*", "arn:aws:s3:::${athena-query-result-bucket}" ] - }, + } ] } ``` -Replace `${var}` with appropriate values as per your athena setup. \ No newline at end of file +Replace `${var}` with appropriate values as per your athena setup. diff --git a/metadata-ingestion/docs/sources/iceberg/iceberg.md b/metadata-ingestion/docs/sources/iceberg/iceberg.md index 07ab56f113030..7e40315a2e319 100644 --- a/metadata-ingestion/docs/sources/iceberg/iceberg.md +++ b/metadata-ingestion/docs/sources/iceberg/iceberg.md @@ -10,7 +10,7 @@ This ingestion source maps the following Source System Concepts to DataHub Conce | Source Concept | DataHub Concept | Notes | | -- | -- | -- | | `iceberg` | [Data Platform](docs/generated/metamodel/entities/dataPlatform.md) | | -| Table | [Dataset](docs/generated/metamodel/entities/dataset.md) | Each Iceberg table maps to a Dataset named using the parent folders. If a table is stored under `my/namespace/table`, the dataset name will be `my.namespace.table`. If a [Platform Instance](https://datahubproject.io/docs/platform-instances/) is configured, it will be used as a prefix: `.my.namespace.table`. | +| Table | [Dataset](docs/generated/metamodel/entities/dataset.md) | An Iceberg table is registered inside a catalog using a name, where the catalog is responsible for creating, dropping and renaming tables. Catalogs manage a collection of tables that are usually grouped into namespaces. The name of a table is mapped to a Dataset name. If a [Platform Instance](https://datahubproject.io/docs/platform-instances/) is configured, it will be used as a prefix: `.my.namespace.table`. | | [Table property](https://iceberg.apache.org/docs/latest/configuration/#table-properties) | [User (a.k.a CorpUser)](docs/generated/metamodel/entities/corpuser.md) | The value of a table property can be used as the name of a CorpUser owner. This table property name can be configured with the source option `user_ownership_property`. | | [Table property](https://iceberg.apache.org/docs/latest/configuration/#table-properties) | CorpGroup | The value of a table property can be used as the name of a CorpGroup owner. This table property name can be configured with the source option `group_ownership_property`. 
| | Table parent folders (excluding [warehouse catalog location](https://iceberg.apache.org/docs/latest/configuration/#catalog-properties)) | Container | Available in a future release | diff --git a/metadata-ingestion/docs/sources/iceberg/iceberg_recipe.yml b/metadata-ingestion/docs/sources/iceberg/iceberg_recipe.yml index 8caedafbea50e..c8deb8fcc9340 100644 --- a/metadata-ingestion/docs/sources/iceberg/iceberg_recipe.yml +++ b/metadata-ingestion/docs/sources/iceberg/iceberg_recipe.yml @@ -3,17 +3,25 @@ source: config: env: PROD catalog: - name: my_iceberg_catalog - type: rest - # Catalog configuration follows pyiceberg's documentation (https://py.iceberg.apache.org/configuration) - config: + # REST catalog configuration example using S3 storage + my_rest_catalog: + type: rest + # Catalog configuration follows pyiceberg's documentation (https://py.iceberg.apache.org/configuration) uri: http://localhost:8181 s3.access-key-id: admin s3.secret-access-key: password s3.region: us-east-1 warehouse: s3a://warehouse/wh/ s3.endpoint: http://localhost:9000 - platform_instance: my_iceberg_catalog + # SQL catalog configuration example using Azure datalake storage and a PostgreSQL database + # my_sql_catalog: + # type: sql + # uri: postgresql+psycopg2://user:password@sqldatabase.postgres.database.azure.com:5432/icebergcatalog + # adlfs.tenant-id: + # adlfs.account-name: + # adlfs.client-id: + # adlfs.client-secret: + platform_instance: my_rest_catalog table_pattern: allow: - marketing.* @@ -21,5 +29,4 @@ source: enabled: true sink: - # sink configs - + # sink configs \ No newline at end of file diff --git a/metadata-ingestion/examples/library/assertions_configuration.yml b/metadata-ingestion/examples/library/assertions_configuration.yml new file mode 100644 index 0000000000000..a44945a30f9a3 --- /dev/null +++ b/metadata-ingestion/examples/library/assertions_configuration.yml @@ -0,0 +1,76 @@ +version: 1 +namespace: test-config-id-1 +assertions: + # Freshness Assertion + - entity: urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.public.test_assertions_all_times,PROD) + type: freshness + lookback_interval: "1 hour" + last_modified_field: col_timestamp + schedule: + type: cron + cron: 0 * * * * + meta: + entity_qualified_name: TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES + entity_schema: + - col: col_date + native_type: DATE + # Volume Assertion + - type: volume + entity: urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.public.test_assertions_all_times,PROD) + metric: row_count + condition: + type: less_than_or_equal_to + value: 1000 + schedule: + type: cron + cron: 0 * * * * + meta: + entity_qualified_name: TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES + entity_schema: + - col: col_date + native_type: DATE + # Field Metric Assertion + - type: field + entity: urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.public.test_assertions_all_times,PROD) + field: col_date + metric: null_count + condition: + type: equal_to + value: 0 + schedule: + type: cron + cron: 0 * * * * + meta: + entity_qualified_name: TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES + entity_schema: + - col: col_date + native_type: DATE + # Field Value Assertion + - type: field + entity: urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.public.purchase_event,PROD) + field: quantity + condition: + type: between + min: 0 + max: 10 + schedule: + type: on_table_change + meta: + entity_qualified_name: TEST_DB.PUBLIC.PURCHASE_EVENT + entity_schema: + - col: quantity + native_type: FLOAT + # Custom SQL Metric Assertion + - type: sql + entity: 
urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.public.purchase_event,PROD) + statement: select mode(quantity) from test_db.public.purchase_event + condition: + type: equal_to + value: 5 + schedule: + type: on_table_change + meta: + entity_qualified_name: TEST_DB.PUBLIC.PURCHASE_EVENT + entity_schema: + - col: quantity + native_type: FLOAT diff --git a/metadata-ingestion/examples/perf/lineage_perf_example.py b/metadata-ingestion/examples/perf/lineage_perf_example.py new file mode 100644 index 0000000000000..3ee78bacb268a --- /dev/null +++ b/metadata-ingestion/examples/perf/lineage_perf_example.py @@ -0,0 +1,402 @@ +from typing import Iterable + +from datahub.emitter.mce_builder import make_data_job_urn, make_dataset_urn +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.graph.client import get_default_graph +from datahub.metadata.schema_classes import ( + DataJobInputOutputClass, + DatasetLineageTypeClass, + DatasetPropertiesClass, + List, + StatusClass, + UpstreamClass, + UpstreamLineageClass, +) +from datahub.utilities.urns.dataset_urn import DatasetUrn + + +def lineage_mcp_generator( + urn: str, upstreams: List[str] +) -> Iterable[MetadataChangeProposalWrapper]: + yield MetadataChangeProposalWrapper( + entityUrn=urn, + aspect=UpstreamLineageClass( + upstreams=[ + UpstreamClass( + dataset=upstream, + type=DatasetLineageTypeClass.TRANSFORMED, + ) + for upstream in upstreams + ] + ), + ) + for upstream in upstreams: + yield MetadataChangeProposalWrapper( + entityUrn=upstream, aspect=StatusClass(removed=False) + ) + for urn_itr in [urn, *upstreams]: + yield MetadataChangeProposalWrapper( + entityUrn=urn_itr, + aspect=DatasetPropertiesClass(name=DatasetUrn.from_string(urn_itr).name), + ) + + +def datajob_lineage_mcp_generator( + urn: str, upstreams: List[str], downstreams: List[str] +) -> Iterable[MetadataChangeProposalWrapper]: + yield MetadataChangeProposalWrapper( + entityUrn=urn, + aspect=DataJobInputOutputClass( + inputDatasets=upstreams, + outputDatasets=downstreams, + ), + ) + for upstream in upstreams: + yield MetadataChangeProposalWrapper( + entityUrn=upstream, aspect=StatusClass(removed=False) + ) + for downstream in downstreams: + yield MetadataChangeProposalWrapper( + entityUrn=downstream, aspect=StatusClass(removed=False) + ) + + +def scenario_truncate_basic(): + """searchAcrossLineage(root, depth=n, breadth=3, skip=None) + All 21 urns. + """ + + path = "truncate.basic" + root_urn = make_dataset_urn("snowflake", f"{path}.root") + + yield from lineage_mcp_generator( + root_urn, + [make_dataset_urn("snowflake", f"{path}.u_{i}") for i in range(10)], + ) + + for i in range(10): + yield from lineage_mcp_generator( + make_dataset_urn("snowflake", f"{path}.d_{i}"), [root_urn] + ) + + +def scenario_truncate_intermediate(): + """searchAcrossLineage(root, depth=3, skip=None) + 1 root urn, all 3 direct upstreams and downstreams, and then 4 urns for each 'expanded' urn. + Total 1 + 3 + 4*3 = 16 urns. 
+ """ + + path = "truncate.intermediate" + root_urn = make_dataset_urn("snowflake", f"{path}.root") + + yield from lineage_mcp_generator( + root_urn, [make_dataset_urn("snowflake", f"{path}.u_{i}") for i in range(10)] + ) + + for i in range(3): + yield from lineage_mcp_generator( + make_dataset_urn("snowflake", f"{path}.u_{i}"), + [make_dataset_urn("snowflake", f"{path}.u_{i}_u_{j}") for j in range(3)], + ) + + for i in range(3): + yield from lineage_mcp_generator( + make_dataset_urn("snowflake", f"{path}.d_{i}"), [root_urn] + ) + for j in range(3): + yield from lineage_mcp_generator( + make_dataset_urn("snowflake", f"{path}.d_{i}d_{j}"), + [make_dataset_urn("snowflake", f"{path}.d_{i}")], + ) + + +def scenario_truncate_complex(): + """searchAcrossLineage(root, depth=n, breadth=3, skip=None) + 1 root urn, + direct (lvl a) upstream, + its two (lvl b) upstreams, + each of their 3 (lvl c) upstreams, + each of their 4 (lvl d) upstreams, + then, for three of the lvl d nodes, 5 (lvl e) upstreams each, + then, for two of the lvl e nodes, 6 (lvl f) upstreams, and for the other lvl e node, 1 (lvl f) upstream. + Total 1 + 1 + 2 + (2 * 3) + (2 * 3 * 4) + (2 * 3 * 3 * 5) + (2 * 3 * 3 * 2 * 6) + (2 * 3 * 3 * 1 * 1) = 358 urns. + """ + + path = "truncate.complex" + root_urn = make_dataset_urn("snowflake", f"{path}.root") + lvl_a = make_dataset_urn("snowflake", f"{path}.u_0") + lvl_b = {i: make_dataset_urn("snowflake", f"{path}.u_0_u_{i}") for i in range(2)} + lvl_c = { + (a, b): make_dataset_urn("snowflake", f"{path}.u_0_u_{a}_u_{b}") + for a in range(2) + for b in range(3) + } + lvl_d = { + (a, b, c): make_dataset_urn("snowflake", f"{path}.u_0_u_{a}_u_{b}_u_{c}") + for a in range(2) + for b in range(3) + for c in range(4) + } + lvl_e = { + (a, b, c, d): make_dataset_urn( + "snowflake", f"{path}.u_0_u_{a}_u_{b}_u_{c}_u_{d}" + ) + for a in range(2) + for b in range(3) + for c in range(4) + for d in range(5) + } + lvl_f = { + (a, b, c, d, e): make_dataset_urn( + "snowflake", f"{path}.u_0_u_{a}_u_{b}_u_{c}_u_{d}_u_{e}" + ) + for a in range(2) + for b in range(3) + for c in range(4) + for d in range(5) + for e in range(6 if d % 2 == 0 else 1) + } + + yield from lineage_mcp_generator(root_urn, [lvl_a]) + yield from lineage_mcp_generator(lvl_a, list(lvl_b.values())) + for a, urn in lvl_b.items(): + yield from lineage_mcp_generator(urn, [lvl_c[(a, b)] for b in range(3)]) + for (a, b), urn in lvl_c.items(): + yield from lineage_mcp_generator(urn, [lvl_d[(a, b, c)] for c in range(4)]) + for (a, b, c), urn in lvl_d.items(): + yield from lineage_mcp_generator(urn, [lvl_e[(a, b, c, d)] for d in range(5)]) + for (a, b, c, d), urn in lvl_e.items(): + yield from lineage_mcp_generator( + urn, [lvl_f[(a, b, c, d, e)] for e in range(6 if d % 2 == 0 else 1)] + ) + + +def scenario_skip_basic(): + """searchAcrossLineage(root, depth=1, breadth=10, skip=[{type: "dataJob"}, {type: "dataset", platform: "urn:li:dataPlatform:dbt"}]) + 1 root urn, both airflow nodes, both dbt nodes, and all 6 snowflake neighbors. + Total 1 + 2 + 2 + 6 = 11 urns. 
+ """ + path = "skip.basic" + root_urn = make_dataset_urn("snowflake", f"{path}.root") + upstream_dbt_urn = make_dataset_urn("dbt", f"{path}.u_0") + upstream_airflow_urn = make_data_job_urn("airflow", f"{path}.flow", f"{path}.u_0") + + yield from lineage_mcp_generator( + root_urn, + [ + make_dataset_urn("snowflake", f"{path}.u_direct"), + upstream_dbt_urn, + ], + ) + yield from lineage_mcp_generator( + upstream_dbt_urn, + [make_dataset_urn("snowflake", f"{path}.u_through_dbt")], + ) + yield from datajob_lineage_mcp_generator( + upstream_airflow_urn, + [make_dataset_urn("snowflake", f"{path}.u_through_airflow")], + [root_urn], + ) + + downstream_dbt_urn = make_dataset_urn("dbt", f"{path}.d_0") + downstream_airflow_urn = make_data_job_urn("airflow", f"{path}.flow", f"{path}.d_0") + yield from lineage_mcp_generator( + make_dataset_urn("snowflake", f"{path}.d_direct"), + [root_urn], + ) + yield from lineage_mcp_generator( + downstream_dbt_urn, + [root_urn], + ) + yield from lineage_mcp_generator( + make_dataset_urn("snowflake", f"{path}.d_through_dbt"), + [downstream_dbt_urn], + ) + yield from datajob_lineage_mcp_generator( + downstream_airflow_urn, + [root_urn], + [make_dataset_urn("snowflake", f"{path}.d_through_airflow")], + ) + + +def scenario_skip_intermediate(): + """searchAcrossLineage(root, depth=1, breadth=10, skip=[{type: "dataJob"}, {type: "dataset", platform: "urn:li:dataPlatform:dbt"}]) + 1 root urn and all nodes aside from those upstream of `skip.intermediate.u_indirect_1`. + Total 11 urns. + searchAcrossLineage(root, depth=2, breadth=10, skip=[{type: "dataJob"}, {type: "dataset", platform: "urn:li:dataPlatform:dbt"}]) + All 14 urns. + """ + path = "skip.intermediate" + root_urn = make_dataset_urn("snowflake", f"{path}.root") + upstream_dbt_urns = [make_dataset_urn("dbt", f"{path}.u_{i}") for i in range(6)] + upstream_airflow_urn = make_data_job_urn("airflow", f"{path}.flow", f"{path}.u_0") + + yield from lineage_mcp_generator( + root_urn, + [ + make_dataset_urn("snowflake", f"{path}.u_direct"), + upstream_dbt_urns[0], + ], + ) + yield from datajob_lineage_mcp_generator( + upstream_airflow_urn, [upstream_dbt_urns[1]], [upstream_dbt_urns[0]] + ) + yield from lineage_mcp_generator( + upstream_dbt_urns[1], + [ + upstream_dbt_urns[2], + ], + ) + yield from lineage_mcp_generator( + upstream_dbt_urns[2], + [ + upstream_dbt_urns[3], + upstream_dbt_urns[4], + ], + ) + yield from lineage_mcp_generator( + upstream_dbt_urns[3], + [make_dataset_urn("snowflake", f"{path}.u_indirect_0")], + ) + yield from lineage_mcp_generator( + upstream_dbt_urns[4], + [ + make_dataset_urn("snowflake", f"{path}.u_indirect_1"), + make_dataset_urn("snowflake", f"{path}.u_indirect_2"), + ], + ) + yield from lineage_mcp_generator( + make_dataset_urn("snowflake", f"{path}.u_indirect_1"), + [make_dataset_urn("snowflake", f"{path}.u_depth_2"), upstream_dbt_urns[5]], + ) + yield from lineage_mcp_generator( + upstream_dbt_urns[5], + [ + make_dataset_urn("snowflake", f"{path}.u_depth_2_indirect"), + ], + ) + + +def scenario_skip_complex(): + """searchAcrossLineage(root, depth=1, breadth=1, skip=[{type: "dataJob"}, {type: "dataset", platform: "urn:li:dataPlatform:dbt"}]) + The 11 urns from scenario_skip_intermediate, plus 2 snowflake urns and 1 dbt node from the single expanded upstream. + Total 14 urns. 
+ """ + path = "skip.complex" + root_urn = make_dataset_urn("snowflake", f"{path}.root") + upstream_dbt_urns = [make_dataset_urn("dbt", f"{path}.u_{i}") for i in range(5)] + upstream_airflow_urn = make_data_job_urn("airflow", f"{path}.flow", f"{path}.u_0") + depth_one_snowflake_urns = { + "direct": make_dataset_urn("snowflake", f"{path}.u_direct"), + "indirect_0": make_dataset_urn("snowflake", f"{path}.u_indirect_0"), + "indirect_1": make_dataset_urn("snowflake", f"{path}.u_indirect_1"), + "indirect_2": make_dataset_urn("snowflake", f"{path}.u_indirect_2"), + } + + yield from lineage_mcp_generator( + root_urn, + [ + depth_one_snowflake_urns["direct"], + upstream_dbt_urns[0], + ], + ) + yield from datajob_lineage_mcp_generator( + upstream_airflow_urn, [upstream_dbt_urns[1]], [upstream_dbt_urns[0]] + ) + yield from lineage_mcp_generator( + upstream_dbt_urns[1], + [ + upstream_dbt_urns[2], + ], + ) + yield from lineage_mcp_generator( + upstream_dbt_urns[2], + [ + upstream_dbt_urns[3], + upstream_dbt_urns[4], + ], + ) + yield from lineage_mcp_generator( + upstream_dbt_urns[3], + [depth_one_snowflake_urns["indirect_0"]], + ) + yield from lineage_mcp_generator( + upstream_dbt_urns[4], + [ + depth_one_snowflake_urns["indirect_1"], + depth_one_snowflake_urns["indirect_2"], + ], + ) + + for name, urn in depth_one_snowflake_urns.items(): + dbt_urn = make_dataset_urn("dbt", f"{path}.u_{name}") + yield from lineage_mcp_generator( + urn, + [make_dataset_urn("snowflake", f"{path}.direct_u_{name}"), dbt_urn], + ) + yield from lineage_mcp_generator( + dbt_urn, + [make_dataset_urn("snowflake", f"{path}.indirect_u_{name}")], + ) + + +def scenario_perf(): + """searchAcrossLineage(root, depth=n, breadth=3, skip=None) + 1 root urn, + direct (lvl a) upstream, + its 100 (lvl b) upstreams, + each of their 30 (lvl c) upstreams, + each of their 40 (lvl d) upstreams, + then, 50 (lvl e) upstreams each, + then, half of lvl e nodes, 6 (lvl f) upstreams, and for the other lvl e node, 1 (lvl f) upstream. + Total 1 + 1 + 100 + (100 * 30) + (100 * 30 * 40) + (100 * 30 * 40 * 5) = 723,102 urns. 
+ Disabled by default to avoid overloading + """ + + path = "lineage.perf" + root_urn = make_dataset_urn("snowflake", f"{path}.root") + lvl_a = make_dataset_urn("snowflake", f"{path}.u_0") + lvl_b = {i: make_dataset_urn("snowflake", f"{path}.u_0_u_{i}") for i in range(100)} + lvl_c = { + (a, b): make_dataset_urn("snowflake", f"{path}.u_0_u_{a}_u_{b}") + for a in range(100) + for b in range(30) + } + lvl_d = { + (a, b, c): make_dataset_urn("snowflake", f"{path}.u_0_u_{a}_u_{b}_u_{c}") + for a in range(100) + for b in range(30) + for c in range(40) + } + lvl_e = { + (a, b, c, d): make_dataset_urn( + "snowflake", f"{path}.u_0_u_{a}_u_{b}_u_{c}_u_{d}" + ) + for a in range(100) + for b in range(30) + for c in range(40) + for d in range(5) + } + + yield from lineage_mcp_generator(root_urn, [lvl_a]) + yield from lineage_mcp_generator(lvl_a, list(lvl_b.values())) + for a, urn in lvl_b.items(): + yield from lineage_mcp_generator(urn, [lvl_c[(a, b)] for b in range(30)]) + for (a, b), urn in lvl_c.items(): + yield from lineage_mcp_generator(urn, [lvl_d[(a, b, c)] for c in range(40)]) + for (a, b, c), urn in lvl_d.items(): + yield from lineage_mcp_generator(urn, [lvl_e[(a, b, c, d)] for d in range(5)]) + + +if __name__ == "__main__": + graph = get_default_graph() + for mcp in [ + *scenario_truncate_basic(), + *scenario_truncate_intermediate(), + *scenario_truncate_complex(), + *scenario_skip_basic(), + *scenario_skip_intermediate(), + *scenario_skip_complex(), + # *scenario_perf(), + ]: + graph.emit_mcp(mcp) diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 2bfc94d13aa14..4702c9d540ec0 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -225,11 +225,7 @@ iceberg_common = { # Iceberg Python SDK - "pyiceberg~=0.4", - # We currently pin to pydantic v1, since we only test against pydantic v1 in CI. - # However, we should remove this once we fix compatibility with newer versions - # of pyiceberg, which depend on pydantic v2. - *pydantic_no_v2, + "pyiceberg>=0.4,<0.7", } mssql_common = { @@ -797,6 +793,7 @@ "datahub-kafka", "sync-file-emitter", "sql-parser", + "iceberg", } else set() ) diff --git a/metadata-ingestion/src/datahub/api/entities/assertion/__init__.py b/metadata-ingestion/src/datahub/api/entities/assertion/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/metadata-ingestion/src/datahub/api/entities/assertion/assertion.py b/metadata-ingestion/src/datahub/api/entities/assertion/assertion.py new file mode 100644 index 0000000000000..e0975a1c0351c --- /dev/null +++ b/metadata-ingestion/src/datahub/api/entities/assertion/assertion.py @@ -0,0 +1,57 @@ +from abc import abstractmethod +from typing import Optional + +from datahub.api.entities.assertion.assertion_trigger import AssertionTrigger +from datahub.configuration.pydantic_migration_helpers import v1_ConfigModel, v1_Field +from datahub.metadata.com.linkedin.pegasus2avro.assertion import AssertionInfo + + +class BaseAssertionProtocol(v1_ConfigModel): + @abstractmethod + def get_id(self) -> str: + pass + + @abstractmethod + def get_assertion_info_aspect( + self, + ) -> AssertionInfo: + pass + + @abstractmethod + def get_assertion_trigger( + self, + ) -> Optional[AssertionTrigger]: + pass + + +class BaseAssertion(v1_ConfigModel): + id_raw: Optional[str] = v1_Field( + default=None, + description="The raw id of the assertion." 
+ "If provided, this is used when creating identifier for this assertion" + "along with assertion type and entity.", + ) + + id: Optional[str] = v1_Field( + default=None, + description="The id of the assertion." + "If provided, this is used as identifier for this assertion." + "If provided, no other assertion fields are considered to create identifier.", + ) + + description: Optional[str] = None + + # Can contain metadata extracted from datahub. e.g. + # - entity qualified name + # - entity schema + meta: Optional[dict] = None + + +class BaseEntityAssertion(BaseAssertion): + entity: str = v1_Field( + description="The entity urn that the assertion is associated with" + ) + + trigger: Optional[AssertionTrigger] = v1_Field( + description="The trigger schedule for assertion", alias="schedule" + ) diff --git a/metadata-ingestion/src/datahub/api/entities/assertion/assertion_config_spec.py b/metadata-ingestion/src/datahub/api/entities/assertion/assertion_config_spec.py new file mode 100644 index 0000000000000..08205cc621253 --- /dev/null +++ b/metadata-ingestion/src/datahub/api/entities/assertion/assertion_config_spec.py @@ -0,0 +1,41 @@ +from typing import List, Optional + +from ruamel.yaml import YAML +from typing_extensions import Literal + +from datahub.api.entities.assertion.datahub_assertion import DataHubAssertion +from datahub.configuration.pydantic_migration_helpers import v1_ConfigModel, v1_Field + + +class AssertionsConfigSpec(v1_ConfigModel): + """ + Declarative configuration specification for datahub assertions. + + This model is used as a simpler, Python-native representation to define assertions. + It can be easily parsed from a equivalent YAML file. + + Currently, this is converted into series of assertion MCPs that can be emitted to DataHub. + In future, this would invoke datahub GraphQL API to upsert assertions. + """ + + version: Literal[1] + + id: Optional[str] = v1_Field( + default=None, + alias="namespace", + description="Unique identifier of assertions configuration file", + ) + + assertions: List[DataHubAssertion] + + @classmethod + def from_yaml( + cls, + file: str, + ) -> "AssertionsConfigSpec": + with open(file) as fp: + yaml = YAML(typ="rt") # default, if not specfied, is 'rt' (round-trip) + orig_dictionary = yaml.load(fp) + parsed_spec = AssertionsConfigSpec.parse_obj(orig_dictionary) + # parsed_spec._original_yaml_dict = orig_dictionary + return parsed_spec diff --git a/metadata-ingestion/src/datahub/api/entities/assertion/assertion_operator.py b/metadata-ingestion/src/datahub/api/entities/assertion/assertion_operator.py new file mode 100644 index 0000000000000..8704ed13cb6c3 --- /dev/null +++ b/metadata-ingestion/src/datahub/api/entities/assertion/assertion_operator.py @@ -0,0 +1,304 @@ +import json +from typing import List, Optional, Union + +from typing_extensions import Literal, Protocol + +from datahub.configuration.pydantic_migration_helpers import v1_ConfigModel +from datahub.metadata.schema_classes import ( + AssertionStdOperatorClass, + AssertionStdParameterClass, + AssertionStdParametersClass, + AssertionStdParameterTypeClass, +) + + +class Operator(Protocol): + """Specification for an assertion operator. + + This class exists only for documentation (not used in typing checking). + """ + + operator: str + + def id(self) -> str: + ... + + def generate_parameters(self) -> AssertionStdParametersClass: + ... 
+ + +def _generate_assertion_std_parameter( + value: Union[str, int, float, list] +) -> AssertionStdParameterClass: + if isinstance(value, str): + return AssertionStdParameterClass( + value=value, type=AssertionStdParameterTypeClass.STRING + ) + elif isinstance(value, (int, float)): + return AssertionStdParameterClass( + value=str(value), type=AssertionStdParameterTypeClass.NUMBER + ) + elif isinstance(value, list): + return AssertionStdParameterClass( + value=json.dumps(value), type=AssertionStdParameterTypeClass.LIST + ) + else: + raise ValueError( + f"Unsupported assertion parameter {value} of type {type(value)}" + ) + + +Param = Union[str, int, float, List[Union[str, float, int]]] + + +def _generate_assertion_std_parameters( + value: Optional[Param] = None, + min_value: Optional[Param] = None, + max_value: Optional[Param] = None, +) -> AssertionStdParametersClass: + return AssertionStdParametersClass( + value=_generate_assertion_std_parameter(value) if value else None, + minValue=_generate_assertion_std_parameter(min_value) if min_value else None, + maxValue=_generate_assertion_std_parameter(max_value) if max_value else None, + ) + + +class EqualToOperator(v1_ConfigModel): + type: Literal["equal_to"] + value: Union[str, int, float] + + operator: str = AssertionStdOperatorClass.EQUAL_TO + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class NotEqualToOperator(v1_ConfigModel): + type: Literal["not_equal_to"] + value: Union[str, int, float] + + operator: str = AssertionStdOperatorClass.NOT_EQUAL_TO + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class BetweenOperator(v1_ConfigModel): + type: Literal["between"] + min: Union[int, float] + max: Union[int, float] + + operator: str = AssertionStdOperatorClass.BETWEEN + + def id(self) -> str: + return f"{self.type}-{self.min}-{self.max}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters( + min_value=self.min, max_value=self.max + ) + + +class LessThanOperator(v1_ConfigModel): + type: Literal["less_than"] + value: Union[int, float] + + operator: str = AssertionStdOperatorClass.LESS_THAN + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class GreaterThanOperator(v1_ConfigModel): + type: Literal["greater_than"] + value: Union[int, float] + + operator: str = AssertionStdOperatorClass.GREATER_THAN + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class LessThanOrEqualToOperator(v1_ConfigModel): + type: Literal["less_than_or_equal_to"] + value: Union[int, float] + + operator: str = AssertionStdOperatorClass.LESS_THAN_OR_EQUAL_TO + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class GreaterThanOrEqualToOperator(v1_ConfigModel): + type: Literal["greater_than_or_equal_to"] + value: Union[int, float] + + operator: str = AssertionStdOperatorClass.GREATER_THAN_OR_EQUAL_TO + + def 
id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class InOperator(v1_ConfigModel): + type: Literal["in"] + value: List[Union[str, float, int]] + + operator: str = AssertionStdOperatorClass.IN + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class NotInOperator(v1_ConfigModel): + type: Literal["not_in"] + value: List[Union[str, float, int]] + + operator: str = AssertionStdOperatorClass.NOT_IN + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class IsNullOperator(v1_ConfigModel): + type: Literal["is_null"] + + operator: str = AssertionStdOperatorClass.NULL + + def id(self) -> str: + return f"{self.type}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters() + + +class NotNullOperator(v1_ConfigModel): + type: Literal["is_not_null"] + + operator: str = AssertionStdOperatorClass.NOT_NULL + + def id(self) -> str: + return f"{self.type}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters() + + +class IsTrueOperator(v1_ConfigModel): + type: Literal["is_true"] + + operator: str = AssertionStdOperatorClass.IS_TRUE + + def id(self) -> str: + return f"{self.type}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters() + + +class IsFalseOperator(v1_ConfigModel): + type: Literal["is_false"] + + operator: str = AssertionStdOperatorClass.IS_FALSE + + def id(self) -> str: + return f"{self.type}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters() + + +class ContainsOperator(v1_ConfigModel): + type: Literal["contains"] + value: str + + operator: str = AssertionStdOperatorClass.CONTAIN + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class EndsWithOperator(v1_ConfigModel): + type: Literal["ends_with"] + value: str + + operator: str = AssertionStdOperatorClass.END_WITH + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class StartsWithOperator(v1_ConfigModel): + type: Literal["starts_with"] + value: str + + operator: str = AssertionStdOperatorClass.START_WITH + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class MatchesRegexOperator(v1_ConfigModel): + type: Literal["matches_regex"] + value: str + + operator: str = AssertionStdOperatorClass.REGEX_MATCH + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +Operators = Union[ + InOperator, + NotInOperator, + EqualToOperator, + NotEqualToOperator, + BetweenOperator, + LessThanOperator, + LessThanOrEqualToOperator, + GreaterThanOperator, + 
GreaterThanOrEqualToOperator, + IsNullOperator, + NotNullOperator, + IsTrueOperator, + IsFalseOperator, + ContainsOperator, + EndsWithOperator, + StartsWithOperator, + MatchesRegexOperator, +] diff --git a/metadata-ingestion/src/datahub/api/entities/assertion/assertion_trigger.py b/metadata-ingestion/src/datahub/api/entities/assertion/assertion_trigger.py new file mode 100644 index 0000000000000..d780916484744 --- /dev/null +++ b/metadata-ingestion/src/datahub/api/entities/assertion/assertion_trigger.py @@ -0,0 +1,52 @@ +from datetime import timedelta +from typing import Union + +import humanfriendly +from typing_extensions import Literal + +from datahub.configuration.pydantic_migration_helpers import ( + v1_ConfigModel, + v1_Field, + v1_validator, +) + + +class CronTrigger(v1_ConfigModel): + type: Literal["cron"] + cron: str = v1_Field( + description="The cron expression to use. See https://crontab.guru/ for help." + ) + timezone: str = v1_Field( + "UTC", + description="The timezone to use for the cron schedule. Defaults to UTC.", + ) + + +class IntervalTrigger(v1_ConfigModel): + type: Literal["interval"] + interval: timedelta + + @v1_validator("interval", pre=True) + def lookback_interval_to_timedelta(cls, v): + if isinstance(v, str): + seconds = humanfriendly.parse_timespan(v) + return timedelta(seconds=seconds) + raise ValueError("Invalid value.") + + +class EntityChangeTrigger(v1_ConfigModel): + type: Literal["on_table_change"] + + +class ManualTrigger(v1_ConfigModel): + type: Literal["manual"] + + +class AssertionTrigger(v1_ConfigModel): + __root__: Union[ + CronTrigger, IntervalTrigger, EntityChangeTrigger, ManualTrigger + ] = v1_Field(discriminator="type") + + @property + def trigger(self): + return self.__root__ diff --git a/metadata-ingestion/src/datahub/api/entities/assertion/compiler_interface.py b/metadata-ingestion/src/datahub/api/entities/assertion/compiler_interface.py new file mode 100644 index 0000000000000..27b43a58530b1 --- /dev/null +++ b/metadata-ingestion/src/datahub/api/entities/assertion/compiler_interface.py @@ -0,0 +1,81 @@ +from abc import abstractmethod +from dataclasses import dataclass, field +from enum import Enum +from pathlib import Path +from typing import Dict, List, Literal + +from datahub.api.entities.assertion.assertion_config_spec import AssertionsConfigSpec +from datahub.ingestion.api.report import Report +from datahub.utilities.lossy_collections import LossyDict, LossyList + + +class StrEnum(str, Enum): + pass + + +class CompileResultArtifactType(StrEnum): + SQL_QUERIES = "SQL_QUERIES" + COMPILE_REPORT = "COMPILE_REPORT" + + +@dataclass +class CompileResultArtifact(Report): + name: str + type: CompileResultArtifactType + path: Path + description: str + + +@dataclass +class AssertionCompilationReport(Report): + """Additional details to debug compilation""" + + num_processed: int = 0 + num_compile_succeeded: int = 0 + num_compile_failed: int = 0 # Likely due to assertion not supported in platform + + warnings: LossyDict[str, LossyList[str]] = field(default_factory=LossyDict) + failures: LossyDict[str, LossyList[str]] = field(default_factory=LossyDict) + + artifacts: List[Path] = field(default_factory=list) + + def report_warning(self, key: str, reason: str) -> None: + warnings = self.warnings.get(key, LossyList()) + warnings.append(reason) + self.warnings[key] = warnings + + def report_failure(self, key: str, reason: str) -> None: + failures = self.failures.get(key, LossyList()) + failures.append(reason) + self.failures[key] = failures + + 
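A platform compiler is expected to tally per-assertion outcomes on this report, which the CLI later in this change prints via `report.as_string()`. A small, hypothetical usage sketch; the assertion id and failure reason are invented for illustration:

```python
from datahub.api.entities.assertion.compiler_interface import AssertionCompilationReport

report = AssertionCompilationReport()
report.num_processed += 2
report.num_compile_succeeded += 1
report.num_compile_failed += 1  # e.g. an operator the target platform cannot express
report.report_failure(
    key="volume-assertion-1",  # hypothetical assertion id
    reason="row_count change assertions are not supported on this platform",
)
print(report.as_string())  # rendering provided by the Report base class
```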
+@dataclass +class AssertionCompilationResult: + """Results of compilation step , along with detailed report object""" + + platform: str + status: Literal["success", "failure"] + + report: AssertionCompilationReport = field( + default_factory=AssertionCompilationReport + ) + + artifacts: List[CompileResultArtifact] = field(default_factory=list) + + def add_artifact(self, artifact: CompileResultArtifact) -> None: + self.artifacts.append(artifact) + self.report.artifacts.append(artifact.path) + + +class AssertionCompiler: + @classmethod + @abstractmethod + def create(cls, output_dir: str, extras: Dict[str, str]) -> "AssertionCompiler": + pass + + @abstractmethod + def compile( + self, assertion_config_spec: AssertionsConfigSpec + ) -> AssertionCompilationResult: + pass diff --git a/metadata-ingestion/src/datahub/api/entities/assertion/datahub_assertion.py b/metadata-ingestion/src/datahub/api/entities/assertion/datahub_assertion.py new file mode 100644 index 0000000000000..ed18b78418d76 --- /dev/null +++ b/metadata-ingestion/src/datahub/api/entities/assertion/datahub_assertion.py @@ -0,0 +1,35 @@ +from typing import Optional, Union + +from datahub.api.entities.assertion.assertion import BaseAssertionProtocol +from datahub.api.entities.assertion.assertion_trigger import AssertionTrigger +from datahub.api.entities.assertion.field_assertion import FieldAssertion +from datahub.api.entities.assertion.freshness_assertion import FreshnessAssertion +from datahub.api.entities.assertion.sql_assertion import SQLAssertion +from datahub.api.entities.assertion.volume_assertion import VolumeAssertion +from datahub.configuration.pydantic_migration_helpers import v1_Field +from datahub.metadata.com.linkedin.pegasus2avro.assertion import AssertionInfo + + +class DataHubAssertion(BaseAssertionProtocol): + __root__: Union[ + FreshnessAssertion, + VolumeAssertion, + SQLAssertion, + FieldAssertion, + # TODO: Add SchemaAssertion + ] = v1_Field(discriminator="type") + + @property + def assertion(self): + return self.__root__.assertion + + def get_assertion_info_aspect( + self, + ) -> AssertionInfo: + return self.__root__.get_assertion_info_aspect() + + def get_id(self) -> str: + return self.__root__.get_id() + + def get_assertion_trigger(self) -> Optional[AssertionTrigger]: + return self.__root__.get_assertion_trigger() diff --git a/metadata-ingestion/src/datahub/api/entities/assertion/field_assertion.py b/metadata-ingestion/src/datahub/api/entities/assertion/field_assertion.py new file mode 100644 index 0000000000000..ae062c3a8e5cb --- /dev/null +++ b/metadata-ingestion/src/datahub/api/entities/assertion/field_assertion.py @@ -0,0 +1,158 @@ +from enum import Enum +from typing import Optional, Union + +from typing_extensions import Literal + +from datahub.api.entities.assertion.assertion import ( + BaseAssertionProtocol, + BaseEntityAssertion, +) +from datahub.api.entities.assertion.assertion_operator import Operators +from datahub.api.entities.assertion.assertion_trigger import AssertionTrigger +from datahub.api.entities.assertion.field_metric import FieldMetric +from datahub.api.entities.assertion.filter import DatasetFilter +from datahub.configuration.pydantic_migration_helpers import v1_ConfigModel, v1_Field +from datahub.emitter.mce_builder import datahub_guid +from datahub.metadata.com.linkedin.pegasus2avro.assertion import ( + AssertionInfo, + AssertionType, + FieldAssertionInfo, + FieldAssertionType, +) +from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaFieldSpec +from 
datahub.metadata.schema_classes import ( + FieldMetricAssertionClass, + FieldTransformClass, + FieldTransformTypeClass, + FieldValuesAssertionClass, + FieldValuesFailThresholdClass, + FieldValuesFailThresholdTypeClass, +) + + +class FieldValuesFailThreshold(v1_ConfigModel): + type: Literal["count", "percentage"] = v1_Field(default="count") + value: int = v1_Field(default=0) + + def to_field_values_failure_threshold(self) -> FieldValuesFailThresholdClass: + return FieldValuesFailThresholdClass( + type=( + FieldValuesFailThresholdTypeClass.COUNT + if self.type == "count" + else FieldValuesFailThresholdTypeClass.PERCENTAGE + ), + value=self.value, + ) + + +class FieldTransform(Enum): + LENGTH = "length" + + +class FieldValuesAssertion(BaseEntityAssertion): + type: Literal["field"] + field: str + field_transform: Optional[FieldTransform] = v1_Field(default=None) + operator: Operators = v1_Field(discriminator="type", alias="condition") + filters: Optional[DatasetFilter] = v1_Field(default=None) + failure_threshold: FieldValuesFailThreshold = v1_Field( + default=FieldValuesFailThreshold() + ) + exclude_nulls: bool = v1_Field(default=True) + + def get_assertion_info( + self, + ) -> AssertionInfo: + return AssertionInfo( + description=self.description, + type=AssertionType.FIELD, + fieldAssertion=FieldAssertionInfo( + type=FieldAssertionType.FIELD_VALUES, + entity=self.entity, + fieldValuesAssertion=FieldValuesAssertionClass( + field=SchemaFieldSpec( + path=self.field, + type="", # Not required + nativeType="", # Not required + ), + operator=self.operator.operator, + parameters=self.operator.generate_parameters(), + failThreshold=self.failure_threshold.to_field_values_failure_threshold(), + excludeNulls=self.exclude_nulls, + transform=( + FieldTransformClass(type=FieldTransformTypeClass.LENGTH) + if self.field_transform == FieldTransform.LENGTH + else None + ), + ), + ), + ) + + def get_id(self) -> str: + guid_dict = { + "entity": self.entity, + "type": self.type, + "field": self.field, + "operator": str(self.operator.operator), + "id_raw": self.id_raw, + } + return self.id or datahub_guid(guid_dict) + + +class FieldMetricAssertion(BaseEntityAssertion): + type: Literal["field"] + field: str + operator: Operators = v1_Field(discriminator="type", alias="condition") + metric: FieldMetric + filters: Optional[DatasetFilter] = v1_Field(default=None) + + def get_assertion_info( + self, + ) -> AssertionInfo: + return AssertionInfo( + description=self.description, + type=AssertionType.FIELD, + fieldAssertion=FieldAssertionInfo( + type=FieldAssertionType.FIELD_METRIC, + entity=self.entity, + fieldMetricAssertion=FieldMetricAssertionClass( + field=SchemaFieldSpec( + path=self.field, + type="", # Not required + nativeType="", # Not required + ), + metric=self.metric.name, + operator=self.operator.operator, + parameters=self.operator.generate_parameters(), + ), + ), + ) + + def get_id(self) -> str: + guid_dict = { + "entity": self.entity, + "type": self.type, + "field": self.field, + "metric": self.metric.value, + "id_raw": self.id_raw, + } + return self.id or datahub_guid(guid_dict) + + +class FieldAssertion(BaseAssertionProtocol): + __root__: Union[FieldMetricAssertion, FieldValuesAssertion] + + @property + def assertion(self): + return self.__root__ + + def get_id(self) -> str: + return self.__root__.get_id() + + def get_assertion_info_aspect( + self, + ) -> AssertionInfo: + return self.__root__.get_assertion_info() + + def get_assertion_trigger(self) -> Optional[AssertionTrigger]: + return
self.__root__.trigger diff --git a/metadata-ingestion/src/datahub/api/entities/assertion/field_metric.py b/metadata-ingestion/src/datahub/api/entities/assertion/field_metric.py new file mode 100644 index 0000000000000..7a236da2d562d --- /dev/null +++ b/metadata-ingestion/src/datahub/api/entities/assertion/field_metric.py @@ -0,0 +1,21 @@ +from enum import Enum + + +class FieldMetric(Enum): + UNIQUE_COUNT = "unique_count" + UNIQUE_PERCENTAGE = "unique_percentage" + NULL_COUNT = "null_count" + NULL_PERCENTAGE = "null_percentage" + MIN = "min" + MAX = "max" + MEAN = "mean" + MEDIAN = "median" + STDDEV = "stddev" + NEGATIVE_COUNT = "negative_count" + NEGATIVE_PERCENTAGE = "negative_percentage" + ZERO_COUNT = "zero_count" + ZERO_PERCENTAGE = "zero_percentage" + MIN_LENGTH = "min_length" + MAX_LENGTH = "max_length" + EMPTY_COUNT = "empty_count" + EMPTY_PERCENTAGE = "empty_percentage" diff --git a/metadata-ingestion/src/datahub/api/entities/assertion/filter.py b/metadata-ingestion/src/datahub/api/entities/assertion/filter.py new file mode 100644 index 0000000000000..05d75b674d6af --- /dev/null +++ b/metadata-ingestion/src/datahub/api/entities/assertion/filter.py @@ -0,0 +1,13 @@ +from typing_extensions import Literal + +from datahub.configuration.pydantic_migration_helpers import v1_ConfigModel + + +class SqlFilter(v1_ConfigModel): + type: Literal["sql"] + sql: str + + +DatasetFilter = SqlFilter +# class DatasetFilter(v1_ConfigModel): +# __root__: Union[SqlFilter] = v1_Field(discriminator="type") diff --git a/metadata-ingestion/src/datahub/api/entities/assertion/freshness_assertion.py b/metadata-ingestion/src/datahub/api/entities/assertion/freshness_assertion.py new file mode 100644 index 0000000000000..f9e1df7d68f27 --- /dev/null +++ b/metadata-ingestion/src/datahub/api/entities/assertion/freshness_assertion.py @@ -0,0 +1,124 @@ +from datetime import timedelta +from enum import Enum +from typing import Optional, Union + +import humanfriendly +from typing_extensions import Literal + +from datahub.api.entities.assertion.assertion import ( + BaseAssertionProtocol, + BaseEntityAssertion, +) +from datahub.api.entities.assertion.assertion_trigger import AssertionTrigger +from datahub.api.entities.assertion.filter import DatasetFilter +from datahub.configuration.pydantic_migration_helpers import v1_Field, v1_validator +from datahub.emitter.mce_builder import datahub_guid +from datahub.metadata.com.linkedin.pegasus2avro.assertion import ( + AssertionInfo, + AssertionType, + FixedIntervalSchedule, + FreshnessAssertionInfo, + FreshnessAssertionSchedule, + FreshnessAssertionScheduleType, + FreshnessAssertionType, + FreshnessCronSchedule, +) +from datahub.metadata.com.linkedin.pegasus2avro.timeseries import CalendarInterval + + +class FreshnessSourceType(Enum): + LAST_MODIFIED_COLUMN = "last_modified_column" + + +class CronFreshnessAssertion(BaseEntityAssertion): + type: Literal["freshness"] + freshness_type: Literal["cron"] + cron: str = v1_Field( + description="The cron expression to use. See https://crontab.guru/ for help." + ) + timezone: str = v1_Field( + "UTC", + description="The timezone to use for the cron schedule. 
Defaults to UTC.", + ) + source_type: FreshnessSourceType = v1_Field( + default=FreshnessSourceType.LAST_MODIFIED_COLUMN + ) + last_modified_field: str + filters: Optional[DatasetFilter] = v1_Field(default=None) + + def get_assertion_info( + self, + ) -> AssertionInfo: + return AssertionInfo( + description=self.description, + type=AssertionType.FRESHNESS, + freshnessAssertion=FreshnessAssertionInfo( + type=FreshnessAssertionType.DATASET_CHANGE, + entity=self.entity, + schedule=FreshnessAssertionSchedule( + type=FreshnessAssertionScheduleType.CRON, + cron=FreshnessCronSchedule(cron=self.cron, timezone=self.timezone), + ), + ), + ) + + +class FixedIntervalFreshnessAssertion(BaseEntityAssertion): + type: Literal["freshness"] + freshness_type: Literal["interval"] = v1_Field(default="interval") + lookback_interval: timedelta + filters: Optional[DatasetFilter] = v1_Field(default=None) + source_type: FreshnessSourceType = v1_Field( + default=FreshnessSourceType.LAST_MODIFIED_COLUMN + ) + last_modified_field: str + + @v1_validator("lookback_interval", pre=True) + def lookback_interval_to_timedelta(cls, v): + if isinstance(v, str): + seconds = humanfriendly.parse_timespan(v) + return timedelta(seconds=seconds) + raise ValueError("Invalid value.") + + def get_assertion_info( + self, + ) -> AssertionInfo: + return AssertionInfo( + description=self.description, + type=AssertionType.FRESHNESS, + freshnessAssertion=FreshnessAssertionInfo( + type=FreshnessAssertionType.DATASET_CHANGE, + entity=self.entity, + schedule=FreshnessAssertionSchedule( + type=FreshnessAssertionScheduleType.FIXED_INTERVAL, + fixedInterval=FixedIntervalSchedule( + unit=CalendarInterval.SECOND, + multiple=self.lookback_interval.seconds, + ), + ), + ), + ) + + +class FreshnessAssertion(BaseAssertionProtocol): + __root__: Union[FixedIntervalFreshnessAssertion, CronFreshnessAssertion] + + @property + def assertion(self): + return self.__root__ + + def get_id(self) -> str: + guid_dict = { + "entity": self.__root__.entity, + "type": self.__root__.type, + "id_raw": self.__root__.id_raw, + } + return self.__root__.id or datahub_guid(guid_dict) + + def get_assertion_info_aspect( + self, + ) -> AssertionInfo: + return self.__root__.get_assertion_info() + + def get_assertion_trigger(self) -> Optional[AssertionTrigger]: + return self.__root__.trigger diff --git a/metadata-ingestion/src/datahub/api/entities/assertion/sql_assertion.py b/metadata-ingestion/src/datahub/api/entities/assertion/sql_assertion.py new file mode 100644 index 0000000000000..3d12cfde428f4 --- /dev/null +++ b/metadata-ingestion/src/datahub/api/entities/assertion/sql_assertion.py @@ -0,0 +1,91 @@ +from typing import Optional, Union + +from typing_extensions import Literal + +from datahub.api.entities.assertion.assertion import ( + BaseAssertionProtocol, + BaseEntityAssertion, +) +from datahub.api.entities.assertion.assertion_operator import Operators +from datahub.api.entities.assertion.assertion_trigger import AssertionTrigger +from datahub.configuration.pydantic_migration_helpers import v1_Field +from datahub.emitter.mce_builder import datahub_guid +from datahub.metadata.com.linkedin.pegasus2avro.assertion import ( + AssertionInfo, + AssertionType, + AssertionValueChangeType, + SqlAssertionInfo, + SqlAssertionType, +) + + +class SqlMetricAssertion(BaseEntityAssertion): + type: Literal["sql"] + statement: str + operator: Operators = v1_Field(discriminator="type", alias="condition") + + def get_assertion_info( + self, + ) -> AssertionInfo: + return AssertionInfo( + 
description=self.description, + type=AssertionType.SQL, + sqlAssertion=SqlAssertionInfo( + type=SqlAssertionType.METRIC, + entity=self.entity, + statement=self.statement, + operator=self.operator.operator, + parameters=self.operator.generate_parameters(), + ), + ) + + +class SqlMetricChangeAssertion(BaseEntityAssertion): + type: Literal["sql"] + statement: str + change_type: Literal["absolute", "percentage"] + operator: Operators = v1_Field(discriminator="type", alias="condition") + + def get_assertion_info( + self, + ) -> AssertionInfo: + return AssertionInfo( + description=self.description, + type=AssertionType.SQL, + sqlAssertion=SqlAssertionInfo( + type=SqlAssertionType.METRIC_CHANGE, + entity=self.entity, + statement=self.statement, + changeType=( + AssertionValueChangeType.ABSOLUTE + if self.change_type == Literal["absolute"] + else AssertionValueChangeType.PERCENTAGE + ), + operator=self.operator.operator, + parameters=self.operator.generate_parameters(), + ), + ) + + +class SQLAssertion(BaseAssertionProtocol): + __root__: Union[SqlMetricAssertion, SqlMetricChangeAssertion] = v1_Field() + + @property + def assertion(self): + return self.__root__ + + def get_id(self) -> str: + guid_dict = { + "entity": self.__root__.entity, + "type": self.__root__.type, + "id_raw": self.__root__.id_raw, + } + return self.__root__.id or datahub_guid(guid_dict) + + def get_assertion_info_aspect( + self, + ) -> AssertionInfo: + return self.__root__.get_assertion_info() + + def get_assertion_trigger(self) -> Optional[AssertionTrigger]: + return self.__root__.trigger diff --git a/metadata-ingestion/src/datahub/api/entities/assertion/volume_assertion.py b/metadata-ingestion/src/datahub/api/entities/assertion/volume_assertion.py new file mode 100644 index 0000000000000..da6a125874aa7 --- /dev/null +++ b/metadata-ingestion/src/datahub/api/entities/assertion/volume_assertion.py @@ -0,0 +1,98 @@ +from typing import Optional, Union + +from typing_extensions import Literal + +from datahub.api.entities.assertion.assertion import ( + BaseAssertionProtocol, + BaseEntityAssertion, +) +from datahub.api.entities.assertion.assertion_operator import Operators +from datahub.api.entities.assertion.assertion_trigger import AssertionTrigger +from datahub.api.entities.assertion.filter import DatasetFilter +from datahub.configuration.pydantic_migration_helpers import v1_Field +from datahub.emitter.mce_builder import datahub_guid +from datahub.metadata.com.linkedin.pegasus2avro.assertion import ( + AssertionInfo, + AssertionType, + AssertionValueChangeType, + RowCountChange, + RowCountTotal, + VolumeAssertionInfo, + VolumeAssertionType, +) + + +class RowCountTotalVolumeAssertion(BaseEntityAssertion): + type: Literal["volume"] + metric: Literal["row_count"] = v1_Field(default="row_count") + operator: Operators = v1_Field(discriminator="type", alias="condition") + filters: Optional[DatasetFilter] = v1_Field(default=None) + + def get_assertion_info( + self, + ) -> AssertionInfo: + return AssertionInfo( + description=self.description, + type=AssertionType.VOLUME, + volumeAssertion=VolumeAssertionInfo( + type=VolumeAssertionType.ROW_COUNT_TOTAL, + entity=self.entity, + rowCountTotal=RowCountTotal( + operator=self.operator.operator, + parameters=self.operator.generate_parameters(), + ), + ), + ) + + +class RowCountChangeVolumeAssertion(BaseEntityAssertion): + type: Literal["volume"] + metric: Literal["row_count"] = v1_Field(default="row_count") + change_type: Literal["absolute", "percentage"] + operator: Operators = 
v1_Field(discriminator="type", alias="condition") + filters: Optional[DatasetFilter] = v1_Field(default=None) + + def get_assertion_info( + self, + ) -> AssertionInfo: + return AssertionInfo( + description=self.description, + type=AssertionType.VOLUME, + volumeAssertion=VolumeAssertionInfo( + type=VolumeAssertionType.ROW_COUNT_CHANGE, + entity=self.entity, + rowCountChange=RowCountChange( + type=( + AssertionValueChangeType.ABSOLUTE + if self.change_type == "absolute" + else AssertionValueChangeType.PERCENTAGE + ), + operator=self.operator.operator, + parameters=self.operator.generate_parameters(), + ), + ), + ) + + +class VolumeAssertion(BaseAssertionProtocol): + __root__: Union[RowCountTotalVolumeAssertion, RowCountChangeVolumeAssertion] + + @property + def assertion(self): + return self.__root__ + + def get_id(self) -> str: + guid_dict = { + "entity": self.__root__.entity, + "type": self.__root__.type, + "id_raw": self.__root__.id_raw, + } + return self.__root__.id or datahub_guid(guid_dict) + + def get_assertion_info_aspect( + self, + ) -> AssertionInfo: + return self.__root__.get_assertion_info() + + def get_assertion_trigger(self) -> Optional[AssertionTrigger]: + return self.__root__.trigger diff --git a/metadata-ingestion/src/datahub/cli/delete_cli.py b/metadata-ingestion/src/datahub/cli/delete_cli.py index f9e0eb45692d4..9332b701fed39 100644 --- a/metadata-ingestion/src/datahub/cli/delete_cli.py +++ b/metadata-ingestion/src/datahub/cli/delete_cli.py @@ -32,6 +32,7 @@ "domain", "glossaryTerm", "glossaryNode", + "form", } _RECURSIVE_DELETE_TYPES = { diff --git a/metadata-ingestion/src/datahub/cli/specific/assertions_cli.py b/metadata-ingestion/src/datahub/cli/specific/assertions_cli.py new file mode 100644 index 0000000000000..dad724bfe1115 --- /dev/null +++ b/metadata-ingestion/src/datahub/cli/specific/assertions_cli.py @@ -0,0 +1,151 @@ +import logging +import os +from pathlib import Path +from typing import Dict, List, Optional + +import click +from click_default_group import DefaultGroup + +from datahub.api.entities.assertion.assertion_config_spec import AssertionsConfigSpec +from datahub.api.entities.assertion.compiler_interface import ( + AssertionCompilationResult, + CompileResultArtifact, + CompileResultArtifactType, +) +from datahub.emitter.mce_builder import make_assertion_urn +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.graph.client import get_default_graph +from datahub.integrations.assertion.registry import ASSERTION_PLATFORMS +from datahub.telemetry import telemetry +from datahub.upgrade import upgrade + +logger = logging.getLogger(__name__) + +REPORT_FILE_NAME = "compile_report.json" + + +@click.group(cls=DefaultGroup, default="upsert") +def assertions() -> None: + """A group of commands to interact with the Assertion entity in DataHub.""" + pass + + +@assertions.command() +@click.option("-f", "--file", required=True, type=click.Path(exists=True)) +@upgrade.check_upgrade +@telemetry.with_telemetry() +def upsert(file: str) -> None: + """Upsert (create or update) a set of assertions in DataHub.""" + + assertions_spec: AssertionsConfigSpec = AssertionsConfigSpec.from_yaml(file) + + with get_default_graph() as graph: + for assertion_spec in assertions_spec.assertions: + try: + mcp = MetadataChangeProposalWrapper( + entityUrn=make_assertion_urn(assertion_spec.get_id()), + aspect=assertion_spec.get_assertion_info_aspect(), + ) + graph.emit_mcp(mcp) + # TODO: Validate uniqueness of assertion ids. Report if duplicates found. 
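The get_id() implementations above fall back to a deterministic guid when the YAML spec gives no explicit id, and upsert then keys the emitted aspect by an assertion urn. A minimal sketch of that derivation using the existing datahub_guid and make_assertion_urn helpers; the dataset urn and id_raw value are invented:

```python
from datahub.emitter.mce_builder import datahub_guid, make_assertion_urn

# Same shape as the guid_dict built in get_id() above; values are made up.
guid_dict = {
    "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.tbl,PROD)",
    "type": "volume",
    "id_raw": "row_count_check",
}
assertion_urn = make_assertion_urn(datahub_guid(guid_dict))
print(assertion_urn)  # urn:li:assertion:<stable hash of guid_dict>
```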
+ # TODO: Use upsert graphql endpoints here instead of graph.emit_mcp. + click.secho(f"Update succeeded for urn {mcp.entityUrn}.", fg="green") + except Exception as e: + logger.exception(e) + click.secho( + f"Update failed for {mcp.entityUrn}: {e}", + fg="red", + ) + + +@assertions.command() +@click.option("-f", "--file", required=True, type=click.Path(exists=True)) +@click.option("-p", "--platform", required=True, type=str) +@click.option("-o", "--output-to", required=False, type=click.Path(exists=True)) +@click.option( + "-x", + "--extras", + required=False, + multiple=True, + default=[], + help="Platform-specific extra key-value inputs in form key=value", +) +@upgrade.check_upgrade +@telemetry.with_telemetry() +def compile( + file: str, platform: str, output_to: Optional[str], extras: List[str] +) -> None: + """Compile a set of assertions for input assertion platform. + Note that this does not run any code or execute any queries on assertion platform + and only creates artifacts specific to assertion platform that can be executed manually. + In future, we may introduce separate command to automatically apply these compiled changes + in assertion platform. Currently, generated result artifacts are stored in target folder + unless another folder is specified using option `--output-to `. + """ + + if platform not in ASSERTION_PLATFORMS: + click.secho( + f"Platform {platform} is not supported.", + fg="red", + ) + + if output_to is None: + output_to = f"{os.getcwd()}/target" + + if not os.path.isdir(output_to): + os.mkdir(output_to) + + assertions_spec: AssertionsConfigSpec = AssertionsConfigSpec.from_yaml(file) + + try: + compiler = ASSERTION_PLATFORMS[platform].create( + output_dir=output_to, extras=extras_list_to_dict(extras) + ) + result = compiler.compile(assertions_spec) + + write_report_file(output_to, result) + click.secho("Compile report:", bold=True) + click.echo(result.report.as_string()) + if result.status == "failure": + click.secho("Failure", fg="yellow", bold=True) + else: + click.secho("Success", fg="green", bold=True) + except Exception as e: + logger.exception(e) + click.secho( + f"Compile failed: {e}", + fg="red", + ) + + +def write_report_file(output_to: str, result: AssertionCompilationResult) -> None: + report_path = Path(output_to) / REPORT_FILE_NAME + with (report_path).open("w") as f: + result.add_artifact( + CompileResultArtifact( + name=REPORT_FILE_NAME, + path=report_path, + type=CompileResultArtifactType.COMPILE_REPORT, + description="Detailed report about compile status", + ) + ) + f.write(result.report.as_json()) + + +def extras_list_to_dict(extras: List[str]) -> Dict[str, str]: + extra_properties: Dict[str, str] = dict() + for x in extras: + parts = x.split("=") + assert ( + len(parts) == 2 + ), f"Invalid value for extras {x}, should be in format key=value" + extra_properties[parts[0]] = parts[1] + return extra_properties + + +# TODO: support for +# Immediate: +# 1. delete assertions (from datahub) +# Later: +# 3. execute compiled assertions on assertion platform (Later, requires connection details to platform), +# 4. cleanup assertions from assertion platform (generate artifacts. 
optionally execute) diff --git a/metadata-ingestion/src/datahub/entrypoints.py b/metadata-ingestion/src/datahub/entrypoints.py index 7c5d84b93726d..49042db7b9299 100644 --- a/metadata-ingestion/src/datahub/entrypoints.py +++ b/metadata-ingestion/src/datahub/entrypoints.py @@ -25,6 +25,7 @@ from datahub.cli.ingest_cli import ingest from datahub.cli.migrate import migrate from datahub.cli.put_cli import put +from datahub.cli.specific.assertions_cli import assertions from datahub.cli.specific.datacontract_cli import datacontract from datahub.cli.specific.dataproduct_cli import dataproduct from datahub.cli.specific.dataset_cli import dataset @@ -164,6 +165,7 @@ def init(use_password: bool = False) -> None: datahub.add_command(properties) datahub.add_command(forms) datahub.add_command(datacontract) +datahub.add_command(assertions) try: from datahub.cli.lite_cli import lite diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index 887d7ec703c0a..7621c6d363e3d 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -1281,6 +1281,170 @@ def create_tag(self, tag_name: str) -> str: # return urn return res["createTag"] + def _assertion_result_shared(self) -> str: + fragment: str = """ + fragment assertionResult on AssertionResult { + type + rowCount + missingCount + unexpectedCount + actualAggValue + externalUrl + nativeResults { + value + } + error { + type + properties { + value + } + } + } + """ + return fragment + + def _run_assertion_result_shared(self) -> str: + fragment: str = """ + fragment runAssertionResult on RunAssertionResult { + assertion { + urn + } + result { + ... assertionResult + } + } + """ + return fragment + + def _run_assertion_build_params( + self, params: Optional[Dict[str, str]] = {} + ) -> List[Any]: + if params is None: + return [] + + results = [] + for key, value in params.items(): + result = { + "key": key, + "value": value, + } + results.append(result) + + return results + + def run_assertion( + self, + urn: str, + save_result: bool = True, + parameters: Optional[Dict[str, str]] = {}, + async_flag: bool = False, + ) -> Dict: + params = self._run_assertion_build_params(parameters) + graph_query: str = """ + %s + mutation runAssertion($assertionUrn: String!, $saveResult: Boolean, $parameters: [StringMapEntryInput!], $async: Boolean!) { + runAssertion(urn: $assertionUrn, saveResult: $saveResult, parameters: $parameters, async: $async) { + ... assertionResult + } + } + """ % ( + self._assertion_result_shared() + ) + + variables = { + "assertionUrn": urn, + "saveResult": save_result, + "parameters": params, + "async": async_flag, + } + + res = self.execute_graphql( + query=graph_query, + variables=variables, + ) + + return res["runAssertion"] + + def run_assertions( + self, + urns: List[str], + save_result: bool = True, + parameters: Optional[Dict[str, str]] = {}, + async_flag: bool = False, + ) -> Dict: + params = self._run_assertion_build_params(parameters) + graph_query: str = """ + %s + %s + mutation runAssertions($assertionUrns: [String!]!, $saveResult: Boolean, $parameters: [StringMapEntryInput!], $async: Boolean!) { + runAssertions(urns: $assertionUrns, saveResults: $saveResult, parameters: $parameters, async: $async) { + passingCount + failingCount + errorCount + results { + ... 
runAssertionResult + } + } + } + """ % ( + self._assertion_result_shared(), + self._run_assertion_result_shared(), + ) + + variables = { + "assertionUrns": urns, + "saveResult": save_result, + "parameters": params, + "async": async_flag, + } + + res = self.execute_graphql( + query=graph_query, + variables=variables, + ) + + return res["runAssertions"] + + def run_assertions_for_asset( + self, + urn: str, + tag_urns: Optional[List[str]] = [], + parameters: Optional[Dict[str, str]] = {}, + async_flag: bool = False, + ) -> Dict: + params = self._run_assertion_build_params(parameters) + graph_query: str = """ + %s + %s + mutation runAssertionsForAsset($assetUrn: String!, $tagUrns: [String!], $parameters: [StringMapEntryInput!], $async: Boolean!) { + runAssertionsForAsset(urn: $assetUrn, tagUrns: $tagUrns, parameters: $parameters, async: $async) { + passingCount + failingCount + errorCount + results { + ... runAssertionResult + } + } + } + """ % ( + self._assertion_result_shared(), + self._run_assertion_result_shared(), + ) + + variables = { + "assetUrn": urn, + "tagUrns": tag_urns, + "parameters": params, + "async": async_flag, + } + + res = self.execute_graphql( + query=graph_query, + variables=variables, + ) + + return res["runAssertionsForAsset"] + def close(self) -> None: self._make_schema_resolver.cache_clear() super().close() diff --git a/metadata-ingestion/src/datahub/ingestion/sink/datahub_rest.py b/metadata-ingestion/src/datahub/ingestion/sink/datahub_rest.py index 007b7487cb6a4..8572b2378a3bb 100644 --- a/metadata-ingestion/src/datahub/ingestion/sink/datahub_rest.py +++ b/metadata-ingestion/src/datahub/ingestion/sink/datahub_rest.py @@ -1,9 +1,9 @@ import concurrent.futures import contextlib +import dataclasses import functools import logging import uuid -from dataclasses import dataclass from enum import auto from typing import Optional, Union @@ -29,6 +29,7 @@ MetadataChangeProposal, ) from datahub.utilities.advanced_thread_executor import PartitionExecutor +from datahub.utilities.perf_timer import PerfTimer from datahub.utilities.server_config_util import set_gms_config logger = logging.getLogger(__name__) @@ -44,15 +45,17 @@ class DatahubRestSinkConfig(DatahubClientConfig): # These only apply in async mode. 
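A hedged usage sketch for the run_assertion helper added to DataHubGraph above; the assertion urn and parameters are hypothetical, and the returned dict follows the assertionResult fragment selected by the mutation:

```python
from datahub.ingestion.graph.client import get_default_graph

with get_default_graph() as graph:
    result = graph.run_assertion(
        urn="urn:li:assertion:my-freshness-check",  # hypothetical assertion urn
        parameters={"custom_param": "value"},  # optional key/value inputs
        async_flag=False,
    )
    print(result.get("type"))  # e.g. SUCCESS / FAILURE / ERROR
```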
max_threads: int = 15 - max_pending_requests: int = 500 + max_pending_requests: int = 2000 -@dataclass +@dataclasses.dataclass class DataHubRestSinkReport(SinkReport): - max_threads: int = -1 - gms_version: str = "" + max_threads: Optional[int] = None + gms_version: Optional[str] = None pending_requests: int = 0 + main_thread_blocking_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer) + def compute_stats(self) -> None: super().compute_stats() @@ -105,7 +108,7 @@ def __post_init__(self) -> None: self.report.gms_version = ( gms_config.get("versions", {}) .get("acryldata/datahub", {}) - .get("version", "") + .get("version", None) ) self.report.max_threads = self.config.max_threads logger.debug("Setting env variables to override config") @@ -189,25 +192,28 @@ def write_record_async( ], write_callback: WriteCallback, ) -> None: - record = record_envelope.record - if self.config.mode == SyncOrAsync.ASYNC: - partition_key = _get_partition_key(record_envelope) - self.executor.submit( - partition_key, - self._emit_wrapper, - record, - done_callback=functools.partial( - self._write_done_callback, record_envelope, write_callback - ), - ) - self.report.pending_requests += 1 - else: - # execute synchronously - try: - self._emit_wrapper(record) - write_callback.on_success(record_envelope, success_metadata={}) - except Exception as e: - write_callback.on_failure(record_envelope, e, failure_metadata={}) + # Because the default is async mode and most sources are slower than the sink, this + # should only have a high value if the sink is actually a bottleneck. + with self.report.main_thread_blocking_timer: + record = record_envelope.record + if self.config.mode == SyncOrAsync.ASYNC: + partition_key = _get_partition_key(record_envelope) + self.executor.submit( + partition_key, + self._emit_wrapper, + record, + done_callback=functools.partial( + self._write_done_callback, record_envelope, write_callback + ), + ) + self.report.pending_requests += 1 + else: + # execute synchronously + try: + self._emit_wrapper(record) + write_callback.on_success(record_envelope, success_metadata={}) + except Exception as e: + write_callback.on_failure(record_envelope, e, failure_metadata={}) def emit_async( self, diff --git a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py index 2585260434a38..b5caa83b2ff37 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py +++ b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py @@ -4,6 +4,7 @@ from typing import Any, Dict, Iterable, List, Optional from pyiceberg.catalog import Catalog +from pyiceberg.exceptions import NoSuchIcebergTableError from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType, visit from pyiceberg.table import Table from pyiceberg.typedef import Identifier @@ -76,6 +77,9 @@ ) LOGGER = logging.getLogger(__name__) +logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel( + logging.WARNING +) @platform_name("Iceberg") @@ -134,9 +138,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: catalog = self.config.get_catalog() except Exception as e: LOGGER.error("Failed to get catalog", exc_info=True) - self.report.report_failure( - "get-catalog", f"Failed to get catalog {self.config.catalog.name}: {e}" - ) + self.report.report_failure("get-catalog", f"Failed to get catalog: {e}") return for dataset_path in self._get_datasets(catalog): @@ -150,7 +152,7 @@ def get_workunits_internal(self) 
-> Iterable[MetadataWorkUnit]: # Try to load an Iceberg table. Might not contain one, this will be caught by NoSuchIcebergTableError. table = catalog.load_table(dataset_path) yield from self._create_iceberg_workunit(dataset_name, table) - except Exception as e: + except NoSuchIcebergTableError as e: self.report.report_failure("general", f"Failed to create workunit: {e}") LOGGER.exception( f"Exception while processing table {dataset_path}, skipping it.", @@ -175,6 +177,7 @@ def _create_iceberg_workunit( custom_properties = table.metadata.properties.copy() custom_properties["location"] = table.metadata.location custom_properties["format-version"] = str(table.metadata.format_version) + custom_properties["partition-spec"] = str(self._get_partition_aspect(table)) if table.current_snapshot(): custom_properties["snapshot-id"] = str(table.current_snapshot().snapshot_id) custom_properties["manifest-list"] = table.current_snapshot().manifest_list @@ -204,6 +207,49 @@ def _create_iceberg_workunit( profiler = IcebergProfiler(self.report, self.config.profiling) yield from profiler.profile_table(dataset_name, dataset_urn, table) + def _get_partition_aspect(self, table: Table) -> Optional[str]: + """Extracts partition information from the provided table and returns a JSON array representing the [partition spec](https://iceberg.apache.org/spec/?#partition-specs) of the table. + Each element of the returned array represents a field in the [partition spec](https://iceberg.apache.org/spec/?#partition-specs) that follows [Appendix-C](https://iceberg.apache.org/spec/?#appendix-c-json-serialization) of the Iceberg specification. + Extra information has been added to this spec to make the information more user-friendly. + + Since Datahub does not have a place in its model to store this information, it is saved as a JSON string and displayed as a table property. + + Here is an example: + ```json + "partition-spec": "[{\"name\": \"timeperiod_loaded\", \"transform\": \"identity\", \"source\": \"timeperiod_loaded\", \"source-id\": 19, \"source-type\": \"date\", \"field-id\": 1000}]", + ``` + + Args: + table (Table): The Iceberg table to extract partition spec from. + + Returns: + str: JSON representation of the partition spec of the provided table (empty array if table is not partitioned) or `None` if an error occured. + """ + try: + return json.dumps( + [ + { + "name": partition.name, + "transform": str(partition.transform), + "source": str( + table.schema().find_column_name(partition.source_id) + ), + "source-id": partition.source_id, + "source-type": str( + table.schema().find_type(partition.source_id) + ), + "field-id": partition.field_id, + } + for partition in table.spec().fields + ] + ) + except Exception as e: + self.report.report_warning( + "extract-partition", + f"Failed to extract partition spec from Iceberg table {table.name()} due to error: {str(e)}", + ) + return None + def _get_ownership_aspect(self, table: Table) -> Optional[OwnershipClass]: owners = [] if self.config.user_ownership_property: @@ -432,6 +478,25 @@ def visit_timestamp(self, timestamp_type: TimestampType) -> Dict[str, Any]: "native_data_type": str(timestamp_type), } + # visit_timestamptz() is required when using pyiceberg >= 0.5.0, which is essentially a duplicate + # of visit_timestampz(). The function has been renamed from visit_timestampz(). + # Once Datahub can upgrade its pyiceberg dependency to >=0.5.0, the visit_timestampz() function can be safely removed. 
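For the partition-spec custom property assembled by _get_partition_aspect above, a standalone sketch of the JSON string that ends up as a table property; the field values are invented, mirroring the docstring example:

```python
import json

partition_spec = json.dumps(
    [
        {
            "name": "timeperiod_loaded",
            "transform": "identity",
            "source": "timeperiod_loaded",
            "source-id": 19,
            "source-type": "date",
            "field-id": 1000,
        }
    ]
)
print(partition_spec)  # stored under the "partition-spec" table property
```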
+ def visit_timestamptz(self, timestamptz_type: TimestamptzType) -> Dict[str, Any]: + # Avro supports 2 types of timestamp: + # - Timestamp: independent of a particular timezone or calendar (TZ information is lost) + # - Local Timestamp: represents a timestamp in a local timezone, regardless of what specific time zone is considered local + # utcAdjustment: bool = True + return { + "type": "long", + "logicalType": "timestamp-micros", + # Commented out since Avro's Python implementation (1.11.0) does not support local-timestamp-micros, even though it exists in the spec. + # See bug report: https://issues.apache.org/jira/browse/AVRO-3476 and PR https://github.com/apache/avro/pull/1634 + # "logicalType": "timestamp-micros" + # if timestamp_type.adjust_to_utc + # else "local-timestamp-micros", + "native_data_type": str(timestamptz_type), + } + def visit_timestampz(self, timestamptz_type: TimestamptzType) -> Dict[str, Any]: # Avro supports 2 types of timestamp: # - Timestamp: independent of a particular timezone or calendar (TZ information is lost) diff --git a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_common.py b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_common.py index f4d93f67b27af..98db275e754c0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_common.py @@ -1,7 +1,8 @@ +import logging from dataclasses import dataclass, field from typing import Dict, List, Optional -from pydantic import Field +from pydantic import Field, validator from pyiceberg.catalog import Catalog, load_catalog from datahub.configuration.common import AllowDenyPattern, ConfigModel @@ -18,6 +19,8 @@ is_profiling_enabled, ) +logger = logging.getLogger(__name__) + class IcebergProfilingConfig(ConfigModel): enabled: bool = Field( @@ -50,32 +53,14 @@ class IcebergProfilingConfig(ConfigModel): # include_field_sample_values: bool = True -class IcebergCatalogConfig(ConfigModel): - """ - Iceberg catalog config. - - https://py.iceberg.apache.org/configuration/ - """ - - name: str = Field( - default="default", - description="Name of catalog", - ) - type: str = Field( - description="Type of catalog. See [PyIceberg](https://py.iceberg.apache.org/configuration/) for list of possible values.", - ) - config: Dict[str, str] = Field( - description="Catalog specific configuration. See [PyIceberg documentation](https://py.iceberg.apache.org/configuration/) for details.", - ) - - class IcebergSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin): # Override the stateful_ingestion config param with the Iceberg custom stateful ingestion config in the IcebergSourceConfig stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = Field( default=None, description="Iceberg Stateful Ingestion Config." ) - catalog: IcebergCatalogConfig = Field( - description="Catalog configuration where to find Iceberg tables. See [pyiceberg's catalog configuration details](https://py.iceberg.apache.org/configuration/).", + # The catalog configuration is using a dictionary to be open and flexible. All the keys and values are handled by pyiceberg. This will future-proof any configuration change done by pyiceberg. + catalog: Dict[str, Dict[str, str]] = Field( + description="Catalog configuration where to find Iceberg tables. Only one catalog specification is supported. 
The format is the same as [pyiceberg's catalog configuration](https://py.iceberg.apache.org/configuration/), where the catalog name is specified as the object name and attributes are set as key-value pairs.", ) table_pattern: AllowDenyPattern = Field( default=AllowDenyPattern.allow_all(), @@ -91,6 +76,45 @@ class IcebergSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin) ) profiling: IcebergProfilingConfig = IcebergProfilingConfig() + @validator("catalog", pre=True, always=True) + def handle_deprecated_catalog_format(cls, value): + # Once support for deprecated format is dropped, we can remove this validator. + if ( + isinstance(value, dict) + and "name" in value + and "type" in value + and "config" in value + ): + # This looks like the deprecated format + logger.warning( + "The catalog configuration format you are using is deprecated and will be removed in a future version. Please update to the new format.", + ) + catalog_name = value["name"] + catalog_type = value["type"] + catalog_config = value["config"] + new_catalog_config = { + catalog_name: {"type": catalog_type, **catalog_config} + } + return new_catalog_config + # In case the input is already the new format or is invalid + return value + + @validator("catalog") + def validate_catalog_size(cls, value): + if len(value) != 1: + raise ValueError("The catalog must contain exactly one entry.") + + # Retrieve the dict associated with the one catalog entry + catalog_name, catalog_config = next(iter(value.items())) + + # Check if that dict is not empty + if not catalog_config or not isinstance(catalog_config, dict): + raise ValueError( + f"The catalog configuration for '{catalog_name}' must not be empty and should be a dictionary with at least one key-value pair." + ) + + return value + def is_profiling_enabled(self) -> bool: return self.profiling.enabled and is_profiling_enabled( self.profiling.operation_config @@ -102,9 +126,12 @@ def get_catalog(self) -> Catalog: Returns: Catalog: Iceberg catalog instance. """ - return load_catalog( - name=self.catalog.name, **{"type": self.catalog.type, **self.catalog.config} - ) + if not self.catalog: + raise ValueError("No catalog configuration found") + + # Retrieve the dict associated with the one catalog entry + catalog_name, catalog_config = next(iter(self.catalog.items())) + return load_catalog(name=catalog_name, **catalog_config) @dataclass diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index 09105b2c6bfb0..ec78b15348701 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -1384,6 +1384,7 @@ class LookerDashboardElement: input_fields: Optional[List[InputFieldElement]] = None folder_path: Optional[str] = None # for independent looks. 
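Returning to the Iceberg catalog handling above, a standalone sketch of how the deprecated {name, type, config} shape is folded into the single-entry mapping that get_catalog hands to pyiceberg's load_catalog; the catalog values are illustrative only:

```python
deprecated = {
    "name": "my_catalog",  # invented example values
    "type": "rest",
    "config": {"uri": "http://localhost:8181"},
}
# Same transformation as handle_deprecated_catalog_format above.
migrated = {deprecated["name"]: {"type": deprecated["type"], **deprecated["config"]}}
print(migrated)  # {'my_catalog': {'type': 'rest', 'uri': 'http://localhost:8181'}}
```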
folder: Optional[LookerFolder] = None + owner: Optional[LookerUser] = None def url(self, base_url: str) -> str: # A dashboard element can use a look or just a raw query against an explore diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py index c4ba3146031af..c87ee1d77f5cd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py @@ -437,6 +437,7 @@ def _get_looker_dashboard_element( # noqa: C901 for exp in explores ], input_fields=input_fields, + owner=None, ) # Dashboard elements can *alternatively* link to an existing look @@ -488,6 +489,7 @@ def _get_looker_dashboard_element( # noqa: C901 if element.look.folder else None ), + owner=self._get_looker_user(element.look.user_id), ) # Failing the above two approaches, pick out details from result_maker @@ -558,6 +560,7 @@ def _get_looker_dashboard_element( # noqa: C901 LookerExplore(model_name=model, name=exp) for exp in explores ], input_fields=input_fields, + owner=None, ) logger.debug(f"Element {element.title}: Unable to parse LookerDashboardElement") @@ -690,6 +693,10 @@ def _make_chart_metadata_events( ownership = self.get_ownership(dashboard) if ownership is not None: chart_snapshot.aspects.append(ownership) + elif dashboard is None and dashboard_element is not None: + ownership = self.get_ownership(dashboard_element) + if ownership is not None: + chart_snapshot.aspects.append(ownership) chart_mce = MetadataChangeEvent(proposedSnapshot=chart_snapshot) @@ -970,10 +977,10 @@ def _make_dashboard_and_chart_mces( yield from dashboard_events def get_ownership( - self, looker_dashboard: LookerDashboard + self, looker_dashboard_look: Union[LookerDashboard, LookerDashboardElement] ) -> Optional[OwnershipClass]: - if looker_dashboard.owner is not None: - owner_urn = looker_dashboard.owner.get_urn( + if looker_dashboard_look.owner is not None: + owner_urn = looker_dashboard_look.owner.get_urn( self.source_config.strip_user_ids_from_email ) if owner_urn is not None: @@ -1381,7 +1388,14 @@ def extract_independent_looks(self) -> Iterable[MetadataWorkUnit]: self.reporter.report_stage_start("extract_independent_looks") logger.debug("Extracting looks not part of Dashboard") - look_fields: List[str] = ["id", "title", "description", "query_id", "folder"] + look_fields: List[str] = [ + "id", + "title", + "description", + "query_id", + "folder", + "user_id", + ] query_fields: List[str] = [ "id", "view", @@ -1426,7 +1440,9 @@ def extract_independent_looks(self) -> Iterable[MetadataWorkUnit]: subtitle_text=look.description, look_id=look.id, dashboard_id=None, # As this is independent look - look=LookWithQuery(query=query, folder=look.folder), + look=LookWithQuery( + query=query, folder=look.folder, user_id=look.user_id + ), ), ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py index 74507d850014a..2ff73323a14e3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py @@ -49,6 +49,11 @@ class S3LineageProviderConfig(ConfigModel): description="Strip filename from s3 url. It only applies if path_specs are not specified.", ) + ignore_non_path_spec_path: bool = Field( + default=False, + description="Ignore paths that are not match in path_specs. 
It only applies if path_specs are specified.", + ) + class S3DatasetLineageProviderConfigBase(ConfigModel): """ diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py index 6c6267e80ee62..87deab72284c0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py @@ -264,13 +264,23 @@ def warn(self, log: logging.Logger, key: str, reason: str) -> None: # TODO: Remove this method. self.report.warning(key, reason) - def _get_s3_path(self, path: str) -> str: + def _get_s3_path(self, path: str) -> Optional[str]: if self.config.s3_lineage_config: for path_spec in self.config.s3_lineage_config.path_specs: if path_spec.allowed(path): _, table_path = path_spec.extract_table_name_and_path(path) return table_path + if ( + self.config.s3_lineage_config.ignore_non_path_spec_path + and len(self.config.s3_lineage_config.path_specs) > 0 + ): + self.report.num_lineage_dropped_s3_path += 1 + logger.debug( + f"Skipping s3 path {path} as it does not match any path spec." + ) + return None + if self.config.s3_lineage_config.strip_urls: if "/" in urlparse(path).path: return str(path.rsplit("/", 1)[0]) @@ -323,13 +333,14 @@ def _get_sources_from_query( ), ) - def _build_s3_path_from_row(self, filename: str) -> str: + def _build_s3_path_from_row(self, filename: str) -> Optional[str]: path = filename.strip() if urlparse(path).scheme != "s3": raise ValueError( f"Only s3 source supported with copy/unload. The source was: {path}" ) - return strip_s3_prefix(self._get_s3_path(path)) + s3_path = self._get_s3_path(path) + return strip_s3_prefix(s3_path) if s3_path else None def _get_sources( self, @@ -369,7 +380,11 @@ def _get_sources( ) self.report.num_lineage_dropped_not_support_copy_path += 1 return [], None - path = strip_s3_prefix(self._get_s3_path(path)) + s3_path = self._get_s3_path(path) + if s3_path is None: + return [], None + + path = strip_s3_prefix(s3_path) urn = make_dataset_urn_with_platform_instance( platform=platform.value, name=path, @@ -539,6 +554,8 @@ def _get_target_lineage( target_platform = LineageDatasetPlatform.S3 # Following call requires 'filename' key in lineage_row target_path = self._build_s3_path_from_row(lineage_row.filename) + if target_path is None: + return None urn = make_dataset_urn_with_platform_instance( platform=target_platform.value, name=target_path, diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/report.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/report.py index 2e6cb8051c91e..3012f4949baeb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/report.py @@ -44,6 +44,7 @@ class RedshiftReport( num_lineage_dropped_query_parser: int = 0 num_lineage_dropped_not_support_copy_path: int = 0 num_lineage_processed_temp_tables = 0 + num_lineage_dropped_s3_path: int = 0 lineage_start_time: Optional[datetime] = None lineage_end_time: Optional[datetime] = None diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_assertion.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_assertion.py new file mode 100644 index 0000000000000..8abb656e30e73 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_assertion.py @@ -0,0 +1,129 @@ +import logging +from datetime import datetime +from typing import 
Callable, Iterable, List, Optional + +from pydantic import BaseModel + +from datahub.emitter.mce_builder import ( + make_assertion_urn, + make_data_platform_urn, + make_dataplatform_instance_urn, +) +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config +from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery +from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report +from datahub.ingestion.source.snowflake.snowflake_utils import ( + SnowflakeCommonMixin, + SnowflakeConnectionMixin, + SnowflakeQueryMixin, +) +from datahub.metadata.com.linkedin.pegasus2avro.assertion import ( + AssertionResult, + AssertionResultType, + AssertionRunEvent, + AssertionRunStatus, +) +from datahub.metadata.com.linkedin.pegasus2avro.common import DataPlatformInstance +from datahub.utilities.time import datetime_to_ts_millis + +logger: logging.Logger = logging.getLogger(__name__) + + +class DataQualityMonitoringResult(BaseModel): + MEASUREMENT_TIME: datetime + METRIC_NAME: str + TABLE_NAME: str + TABLE_SCHEMA: str + TABLE_DATABASE: str + VALUE: int + + +class SnowflakeAssertionsHandler( + SnowflakeCommonMixin, SnowflakeQueryMixin, SnowflakeConnectionMixin +): + def __init__( + self, + config: SnowflakeV2Config, + report: SnowflakeV2Report, + dataset_urn_builder: Callable[[str], str], + ) -> None: + self.config = config + self.report = report + self.logger = logger + self.dataset_urn_builder = dataset_urn_builder + self.connection = None + self._urns_processed: List[str] = [] + + def get_assertion_workunits( + self, discovered_datasets: List[str] + ) -> Iterable[MetadataWorkUnit]: + + self.connection = self.create_connection() + if self.connection is None: + return + + cur = self.query( + SnowflakeQuery.dmf_assertion_results( + datetime_to_ts_millis(self.config.start_time), + datetime_to_ts_millis(self.config.end_time), + ) + ) + for db_row in cur: + mcp = self._process_result_row(db_row, discovered_datasets) + if mcp: + yield mcp.as_workunit(is_primary_source=False) + + if mcp.entityUrn and mcp.entityUrn not in self._urns_processed: + self._urns_processed.append(mcp.entityUrn) + yield self._gen_platform_instance_wu(mcp.entityUrn) + + def _gen_platform_instance_wu(self, urn: str) -> MetadataWorkUnit: + + # Construct a MetadataChangeProposalWrapper object for assertion platform + return MetadataChangeProposalWrapper( + entityUrn=urn, + aspect=DataPlatformInstance( + platform=make_data_platform_urn(self.platform), + instance=( + make_dataplatform_instance_urn( + self.platform, self.config.platform_instance + ) + if self.config.platform_instance + else None + ), + ), + ).as_workunit(is_primary_source=False) + + def _process_result_row( + self, result_row: dict, discovered_datasets: List[str] + ) -> Optional[MetadataChangeProposalWrapper]: + try: + result = DataQualityMonitoringResult.parse_obj(result_row) + assertion_guid = result.METRIC_NAME.split("__")[-1].lower() + status = bool(result.VALUE) # 1 if PASS, 0 if FAIL + assertee = self.get_dataset_identifier( + result.TABLE_NAME, result.TABLE_SCHEMA, result.TABLE_DATABASE + ) + if assertee in discovered_datasets: + return MetadataChangeProposalWrapper( + entityUrn=make_assertion_urn(assertion_guid), + aspect=AssertionRunEvent( + timestampMillis=datetime_to_ts_millis(result.MEASUREMENT_TIME), + runId=result.MEASUREMENT_TIME.strftime("%Y-%m-%dT%H:%M:%SZ"), + 
asserteeUrn=self.dataset_urn_builder(assertee), + status=AssertionRunStatus.COMPLETE, + assertionUrn=make_assertion_urn(assertion_guid), + result=AssertionResult( + type=( + AssertionResultType.SUCCESS + if status + else AssertionResultType.FAILURE + ) + ), + ), + ) + except Exception as e: + self.report.report_warning("assertion-result-parse-failure", str(e)) + return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index c1fbb2cdc1f3f..4beb268448569 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -164,6 +164,12 @@ class SnowflakeV2Config( "username.", ) + include_assertion_results: bool = Field( + default=False, + description="Whether to ingest assertion run results for assertions created using Datahub" + " assertions CLI in snowflake", + ) + @validator("convert_urns_to_lowercase") def validate_convert_urns_to_lowercase(cls, v): if not v: diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py index dac43499a1c71..205490a6d29c6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py @@ -1016,3 +1016,26 @@ def table_upstreams_only( ORDER BY h.downstream_table_name """ + + @staticmethod + def dmf_assertion_results(start_time_millis: int, end_time_millis: int) -> str: + + pattern = r"datahub\\_\\_%" + escape_pattern = r"\\" + return f""" + SELECT + MEASUREMENT_TIME AS "MEASUREMENT_TIME", + METRIC_NAME AS "METRIC_NAME", + TABLE_NAME AS "TABLE_NAME", + TABLE_SCHEMA AS "TABLE_SCHEMA", + TABLE_DATABASE AS "TABLE_DATABASE", + VALUE::INT AS "VALUE" + FROM + SNOWFLAKE.LOCAL.DATA_QUALITY_MONITORING_RESULTS + WHERE + MEASUREMENT_TIME >= to_timestamp_ltz({start_time_millis}, 3) + AND MEASUREMENT_TIME < to_timestamp_ltz({end_time_millis}, 3) + AND METRIC_NAME ilike '{pattern}' escape '{escape_pattern}' + ORDER BY MEASUREMENT_TIME ASC; + +""" diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 140b702a8b74b..fc2733c211580 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -50,6 +50,9 @@ SnowflakeEdition, SnowflakeObjectDomain, ) +from datahub.ingestion.source.snowflake.snowflake_assertion import ( + SnowflakeAssertionsHandler, +) from datahub.ingestion.source.snowflake.snowflake_config import ( SnowflakeV2Config, TagOption, @@ -604,6 +607,11 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: ) and self.usage_extractor: yield from self.usage_extractor.get_usage_workunits(discovered_datasets) + if self.config.include_assertion_results: + yield from SnowflakeAssertionsHandler( + self.config, self.report, self.gen_dataset_urn + ).get_assertion_workunits(discovered_datasets) + def report_cache_info(self) -> None: lru_cache_functions: List[Callable] = [ self.data_dictionary.get_tables_for_database, diff --git a/metadata-ingestion/src/datahub/integrations/assertion/__init__.py b/metadata-ingestion/src/datahub/integrations/assertion/__init__.py new file mode 100644 index 
0000000000000..e69de29bb2d1d diff --git a/metadata-ingestion/src/datahub/integrations/assertion/common.py b/metadata-ingestion/src/datahub/integrations/assertion/common.py new file mode 100644 index 0000000000000..9ffad5cf66640 --- /dev/null +++ b/metadata-ingestion/src/datahub/integrations/assertion/common.py @@ -0,0 +1,61 @@ +from functools import lru_cache +from typing import List, Optional, Tuple, TypedDict + +from datahub.api.entities.assertion.assertion import BaseEntityAssertion +from datahub.ingestion.graph.client import get_default_graph +from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetProperties +from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaMetadata +from datahub.utilities.urns.urn import Urn + + +class ColumnDict(TypedDict): + col: str + native_type: str + + +@lru_cache +def get_qualified_name_from_datahub(urn: str) -> Optional[str]: + with get_default_graph() as graph: + props: Optional[DatasetProperties] = graph.get_aspect(urn, DatasetProperties) + if props is not None: + return props.qualifiedName + return None + + +@lru_cache +def get_schema_from_datahub(urn: str) -> Optional[List[ColumnDict]]: + with get_default_graph() as graph: + schema: Optional[SchemaMetadata] = graph.get_aspect(urn, SchemaMetadata) + if schema is not None: + return [ + {"col": field.fieldPath, "native_type": field.nativeDataType} + for field in schema.fields + ] + return None + + +def get_entity_name(assertion: BaseEntityAssertion) -> Tuple[str, str, str]: + if assertion.meta and assertion.meta.get("entity_qualified_name"): + parts = assertion.meta["entity_qualified_name"].split(".") + else: + qualified_name = get_qualified_name_from_datahub(assertion.entity) + if qualified_name is not None: + parts = qualified_name.split(".") + else: + urn_id = Urn.create_from_string(assertion.entity).entity_ids[1] + parts = urn_id.split(".") + if len(parts) > 3: + parts = parts[-3:] + assert len(parts) == 3 + database = parts[-3] + schema = parts[-2] + table = parts[-1] + return database, schema, table + + +def get_entity_schema(assertion: BaseEntityAssertion) -> Optional[List[ColumnDict]]: + if assertion.meta and assertion.meta.get("entity_schema"): + return assertion.meta.get("entity_schema") + elif get_schema_from_datahub(assertion.entity): + return get_schema_from_datahub(assertion.entity) + return None diff --git a/metadata-ingestion/src/datahub/integrations/assertion/registry.py b/metadata-ingestion/src/datahub/integrations/assertion/registry.py new file mode 100644 index 0000000000000..26015ddbf9a31 --- /dev/null +++ b/metadata-ingestion/src/datahub/integrations/assertion/registry.py @@ -0,0 +1,8 @@ +from typing import Dict, Type + +from datahub.api.entities.assertion.compiler_interface import AssertionCompiler +from datahub.integrations.assertion.snowflake.compiler import SnowflakeAssertionCompiler + +ASSERTION_PLATFORMS: Dict[str, Type[AssertionCompiler]] = { + "snowflake": SnowflakeAssertionCompiler +} diff --git a/metadata-ingestion/src/datahub/integrations/assertion/snowflake/__init__.py b/metadata-ingestion/src/datahub/integrations/assertion/snowflake/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/metadata-ingestion/src/datahub/integrations/assertion/snowflake/compiler.py b/metadata-ingestion/src/datahub/integrations/assertion/snowflake/compiler.py new file mode 100644 index 0000000000000..8d2ae2960ebd0 --- /dev/null +++ b/metadata-ingestion/src/datahub/integrations/assertion/snowflake/compiler.py @@ -0,0 +1,237 @@ +import 
logging +import os +from pathlib import Path +from typing import Dict, Tuple + +from datahub.api.entities.assertion.assertion_config_spec import AssertionsConfigSpec +from datahub.api.entities.assertion.assertion_operator import LessThanOrEqualToOperator +from datahub.api.entities.assertion.assertion_trigger import ( + AssertionTrigger, + CronTrigger, + EntityChangeTrigger, + IntervalTrigger, +) +from datahub.api.entities.assertion.compiler_interface import ( + AssertionCompilationResult, + AssertionCompiler, + CompileResultArtifact, + CompileResultArtifactType, +) +from datahub.api.entities.assertion.datahub_assertion import DataHubAssertion +from datahub.api.entities.assertion.field_assertion import FieldValuesAssertion +from datahub.api.entities.assertion.freshness_assertion import ( + FixedIntervalFreshnessAssertion, +) +from datahub.emitter.mce_builder import make_assertion_urn +from datahub.integrations.assertion.common import get_entity_name, get_entity_schema +from datahub.integrations.assertion.snowflake.dmf_generator import SnowflakeDMFHandler +from datahub.integrations.assertion.snowflake.field_metric_sql_generator import ( + SnowflakeFieldMetricSQLGenerator, +) +from datahub.integrations.assertion.snowflake.field_values_metric_sql_generator import ( + SnowflakeFieldValuesMetricSQLGenerator, +) +from datahub.integrations.assertion.snowflake.metric_operator_sql_generator import ( + SnowflakeMetricEvalOperatorSQLGenerator, +) +from datahub.integrations.assertion.snowflake.metric_sql_generator import ( + SnowflakeMetricSQLGenerator, +) + +logger = logging.Logger(__name__) + +DMF_DEFINITIONS_FILE_NAME = "dmf_definitions.sql" +DMF_ASSOCIATIONS_FILE_NAME = "dmf_associations.sql" +DMF_SCHEMA_PROPERTY_KEY = "DMF_SCHEMA" + + +class SnowflakeAssertionCompiler(AssertionCompiler): + def __init__(self, output_dir: str, extras: Dict[str, str]) -> None: + self.output_dir = Path(output_dir) + self.extras = extras + self.metric_generator = SnowflakeMetricSQLGenerator( + SnowflakeFieldMetricSQLGenerator(), SnowflakeFieldValuesMetricSQLGenerator() + ) + self.metric_evaluator = SnowflakeMetricEvalOperatorSQLGenerator() + self.dmf_handler = SnowflakeDMFHandler() + + self._entity_schedule_history: Dict[str, AssertionTrigger] = dict() + + @classmethod + def create( + cls, output_dir: str, extras: Dict[str, str] + ) -> "SnowflakeAssertionCompiler": + assert os.path.exists( + output_dir + ), f"Specified location {output_dir} does not exist." + + assert os.path.isdir( + output_dir + ), f"Specified location {output_dir} is not a folder." 
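SnowflakeAssertionCompiler.create expects the CLI's -x KEY=VALUE extras to carry the target DMF schema. A small sketch of that convention; the schema name is invented:

```python
# Roughly mirrors extras_list_to_dict in assertions_cli, followed by the
# DMF_SCHEMA presence check performed in create() below.
extras = dict(item.split("=", 1) for item in ["DMF_SCHEMA=governance.dmfs"])
assert any(key.upper() == "DMF_SCHEMA" for key in extras)
print(extras["DMF_SCHEMA"])  # governance.dmfs
```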
+ + assert any( + x.upper() == DMF_SCHEMA_PROPERTY_KEY for x in extras + ), "Must specify value for DMF schema using -x DMF_SCHEMA=" + + return SnowflakeAssertionCompiler(output_dir, extras) + + def compile( + self, assertion_config_spec: AssertionsConfigSpec + ) -> AssertionCompilationResult: + result = AssertionCompilationResult("snowflake", "success") + + # TODO: Create/Report permissions sql + + dmf_definitions_path = self.output_dir / DMF_DEFINITIONS_FILE_NAME + dmf_associations_path = self.output_dir / DMF_ASSOCIATIONS_FILE_NAME + with (dmf_definitions_path).open("w") as definitions, ( + dmf_associations_path + ).open("w") as associations: + for assertion_spec in assertion_config_spec.assertions: + result.report.num_processed += 1 + try: + start_line = f"\n-- Start of Assertion {assertion_spec.get_id()}\n" + (dmf_definition, dmf_association) = self.process_assertion( + assertion_spec + ) + end_line = f"\n-- End of Assertion {assertion_spec.get_id()}\n" + + definitions.write(start_line) + definitions.write(dmf_definition) + definitions.write(end_line) + + associations.write(start_line) + associations.write(dmf_association) + associations.write(end_line) + + result.report.num_compile_succeeded += 1 + except Exception as e: + result.status = "failure" + result.report.report_failure( + assertion_spec.get_id(), + f"Failed to compile assertion of type {assertion_spec.assertion.type} due to error: {e}", + ) + result.report.num_compile_failed += 1 + if result.report.num_compile_succeeded > 0: + result.add_artifact( + CompileResultArtifact( + name=DMF_DEFINITIONS_FILE_NAME, + path=dmf_definitions_path, + type=CompileResultArtifactType.SQL_QUERIES, + description="SQL file containing DMF create definitions equivalent to Datahub Assertions", + ) + ) + result.add_artifact( + CompileResultArtifact( + name=DMF_ASSOCIATIONS_FILE_NAME, + path=dmf_associations_path, + type=CompileResultArtifactType.SQL_QUERIES, + description="ALTER TABLE queries to associate DMFs to table to run on configured schedule.", + ) + ) + + return result + + def process_assertion(self, assertion: DataHubAssertion) -> Tuple[str, str]: + # TODO: support schema assertion ? + + # For freshness assertion, metric is difference in seconds between assertion execution time + # and last time table was updated. + # For field values assertion, metric is number or percentage of rows that do not satify + # operator condition. + # For remaining assertions, numeric metric is discernible in assertion definition itself. 
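The comments above describe what each generated metric measures; the next step wraps that metric query in an operator check so the DMF returns a single number. A hedged sketch of the composition, with made-up SQL rather than the exact generator output:

```python
# Illustrative metric query and threshold; the convention that 0 means "passes"
# and 1 means "fails" is an assumption for this sketch.
metric_sql = "select count(*) from db.schema.tbl where last_modified is null"
threshold = 0
assertion_sql = f"select case when ({metric_sql}) <= {threshold} then 0 else 1 end"
print(assertion_sql)
```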
+ metric_definition = self.metric_generator.metric_sql(assertion.assertion) + + if isinstance(assertion.assertion, FixedIntervalFreshnessAssertion): + assertion_sql = self.metric_evaluator.operator_sql( + LessThanOrEqualToOperator( + type="less_than_or_equal_to", + value=assertion.assertion.lookback_interval.total_seconds(), + ), + metric_definition, + ) + elif isinstance(assertion.assertion, FieldValuesAssertion): + assertion_sql = self.metric_evaluator.operator_sql( + LessThanOrEqualToOperator( + type="less_than_or_equal_to", + value=assertion.assertion.failure_threshold.value, + ), + metric_definition, + ) + else: + assertion_sql = self.metric_evaluator.operator_sql( + assertion.assertion.operator, metric_definition + ) + + dmf_name = get_dmf_name(assertion) + dmf_schema_name = self.extras[DMF_SCHEMA_PROPERTY_KEY] + + args_create_dmf, args_add_dmf = get_dmf_args(assertion) + + entity_name = get_entity_name(assertion.assertion) + + self._entity_schedule_history.setdefault( + assertion.assertion.entity, assertion.assertion.trigger + ) + if ( + assertion.assertion.entity in self._entity_schedule_history + and self._entity_schedule_history[assertion.assertion.entity] + != assertion.assertion.trigger + ): + raise ValueError( + "Assertions on same entity must have same schedules as of now." + f" Found different schedules on entity {assertion.assertion.entity} ->" + f" ({self._entity_schedule_history[assertion.assertion.entity].trigger})," + f" ({assertion.assertion.trigger.trigger})" + ) + + dmf_schedule = get_dmf_schedule(assertion.assertion.trigger) + dmf_definition = self.dmf_handler.create_dmf( + f"{dmf_schema_name}.{dmf_name}", + args_create_dmf, + assertion.assertion.description + or f"Created via DataHub for assertion {make_assertion_urn(assertion.get_id())} of type {assertion.assertion.type}", + assertion_sql, + ) + dmf_association = self.dmf_handler.add_dmf_to_table( + f"{dmf_schema_name}.{dmf_name}", + args_add_dmf, + dmf_schedule, + ".".join(entity_name), + ) + + return dmf_definition, dmf_association + + +def get_dmf_name(assertion: DataHubAssertion) -> str: + return f"datahub__{assertion.get_id()}" + + +def get_dmf_args(assertion: DataHubAssertion) -> Tuple[str, str]: + """Returns Tuple with + - Args used to create DMF + - Args used to add DMF to table""" + # Snowflake does not allow creating custom data metric + # function without column name argument. 
+ # So we fetch any one column from table's schema + args_create_dmf = "ARGT TABLE({col_name} {col_type})" + args_add_dmf = "{col_name}" + entity_schema = get_entity_schema(assertion.assertion) + if entity_schema: + for col_dict in entity_schema: + return args_create_dmf.format( + col_name=col_dict["col"], col_type=col_dict["native_type"] + ), args_add_dmf.format(col_name=col_dict["col"]) + + raise ValueError("entity schema not available") + + +def get_dmf_schedule(trigger: AssertionTrigger) -> str: + if isinstance(trigger.trigger, EntityChangeTrigger): + return "TRIGGER_ON_CHANGES" + elif isinstance(trigger.trigger, CronTrigger): + return f"USING CRON {trigger.trigger.cron} {trigger.trigger.timezone}" + elif isinstance(trigger.trigger, IntervalTrigger): + return f"{trigger.trigger.interval.seconds/60} MIN" + else: + raise ValueError(f"Unsupported trigger type {type(trigger.trigger)}") diff --git a/metadata-ingestion/src/datahub/integrations/assertion/snowflake/dmf_generator.py b/metadata-ingestion/src/datahub/integrations/assertion/snowflake/dmf_generator.py new file mode 100644 index 0000000000000..4f50b7c2b81a5 --- /dev/null +++ b/metadata-ingestion/src/datahub/integrations/assertion/snowflake/dmf_generator.py @@ -0,0 +1,22 @@ +class SnowflakeDMFHandler: + def create_dmf( + self, dmf_name: str, dmf_args: str, dmf_comment: str, dmf_sql: str + ) -> str: + return f""" + CREATE or REPLACE DATA METRIC FUNCTION + {dmf_name} ({dmf_args}) + RETURNS NUMBER + COMMENT = '{dmf_comment}' + AS + $$ + {dmf_sql} + $$; + """ + + def add_dmf_to_table( + self, dmf_name: str, dmf_col_args: str, dmf_schedule: str, table_identifier: str + ) -> str: + return f""" + ALTER TABLE {table_identifier} SET DATA_METRIC_SCHEDULE = '{dmf_schedule}'; + ALTER TABLE {table_identifier} ADD DATA METRIC FUNCTION {dmf_name} ON ({dmf_col_args}); + """ diff --git a/metadata-ingestion/src/datahub/integrations/assertion/snowflake/field_metric_sql_generator.py b/metadata-ingestion/src/datahub/integrations/assertion/snowflake/field_metric_sql_generator.py new file mode 100644 index 0000000000000..3ff218a9f280b --- /dev/null +++ b/metadata-ingestion/src/datahub/integrations/assertion/snowflake/field_metric_sql_generator.py @@ -0,0 +1,154 @@ +from typing import List, Optional + +from datahub.api.entities.assertion.field_assertion import FieldMetricAssertion +from datahub.api.entities.assertion.field_metric import FieldMetric +from datahub.integrations.assertion.common import get_entity_name + + +class SnowflakeFieldMetricSQLGenerator: + def unique_count_sql( + self, field_name: str, entity_name: str, dataset_filter: Optional[str] + ) -> str: + return f"""select count(distinct {field_name}) + from {entity_name} {self._setup_where_clause([dataset_filter])}""" + + def unique_percentage_sql( + self, field_name: str, entity_name: str, dataset_filter: Optional[str] + ) -> str: + return f"""select count(distinct {field_name})/count(*) + from {entity_name} {self._setup_where_clause([dataset_filter])}""" + + def null_count_sql( + self, field_name: str, entity_name: str, dataset_filter: Optional[str] + ) -> str: + where_clause = self._setup_where_clause( + [dataset_filter, f"{field_name} is null"] + ) + return f"""select count(*) + from {entity_name} {where_clause}""" + + def null_percentage_sql( + self, field_name: str, entity_name: str, dataset_filter: Optional[str] + ) -> str: + return f"""select ({self.null_count_sql(field_name, entity_name, dataset_filter)})/count(*) + from {entity_name} {self._setup_where_clause([dataset_filter])}""" + + 
def min_sql( + self, field_name: str, entity_name: str, dataset_filter: Optional[str] + ) -> str: + return f"""select min({field_name}) + from {entity_name} {self._setup_where_clause([dataset_filter])}""" + + def max_sql( + self, field_name: str, entity_name: str, dataset_filter: Optional[str] + ) -> str: + return f"""select max({field_name}) + from {entity_name} {self._setup_where_clause([dataset_filter])}""" + + def mean_sql( + self, field_name: str, entity_name: str, dataset_filter: Optional[str] + ) -> str: + return f"""select avg({field_name}) + from {entity_name} {self._setup_where_clause([dataset_filter])}""" + + def median_sql( + self, field_name: str, entity_name: str, dataset_filter: Optional[str] + ) -> str: + return f"""select median({field_name}) + from {entity_name} {self._setup_where_clause([dataset_filter])}""" + + def stddev_sql( + self, field_name: str, entity_name: str, dataset_filter: Optional[str] + ) -> str: + return f"""select stddev({field_name}) + from {entity_name} {self._setup_where_clause([dataset_filter])}""" + + def negative_count_sql( + self, field_name: str, entity_name: str, dataset_filter: Optional[str] + ) -> str: + where_clause = self._setup_where_clause([dataset_filter, f"{field_name} < 0"]) + return f"""select count(*) + from {entity_name} {where_clause}""" + + def negative_percentage_sql( + self, field_name: str, entity_name: str, dataset_filter: Optional[str] + ) -> str: + return f"""select ({self.negative_count_sql(field_name, entity_name, dataset_filter)})/count(*) + from {entity_name} {self._setup_where_clause([dataset_filter])}""" + + def zero_count_sql( + self, field_name: str, entity_name: str, dataset_filter: Optional[str] + ) -> str: + where_clause = self._setup_where_clause([dataset_filter, f"{field_name} = 0"]) + return f"""select count(*) + from {entity_name} {where_clause}""" + + def zero_percentage_sql( + self, field_name: str, entity_name: str, dataset_filter: Optional[str] + ) -> str: + return f"""select ({self.zero_count_sql(field_name, entity_name, dataset_filter)})/count(*) + from {entity_name} {self._setup_where_clause([dataset_filter])}""" + + def min_length_sql( + self, field_name: str, entity_name: str, dataset_filter: Optional[str] + ) -> str: + return f"""select min(length({field_name})) + from {entity_name} {self._setup_where_clause([dataset_filter])}""" + + def max_length_sql( + self, field_name: str, entity_name: str, dataset_filter: Optional[str] + ) -> str: + return f"""select max(length({field_name})) + from {entity_name} {self._setup_where_clause([dataset_filter])}""" + + def empty_count_sql( + self, field_name: str, entity_name: str, dataset_filter: Optional[str] + ) -> str: + where_clause = self._setup_where_clause( + [dataset_filter, f"({field_name} is null or trim({field_name})='')"] + ) + return f"""select count(*) + from {entity_name} {where_clause}""" + + def empty_percentage_sql( + self, field_name: str, entity_name: str, dataset_filter: Optional[str] + ) -> str: + return f"""select ({self.empty_count_sql(field_name, entity_name, dataset_filter)})/count(*) + from {entity_name} {self._setup_where_clause([dataset_filter])}""" + + def _setup_where_clause(self, filters: List[Optional[str]]) -> str: + where_clause = " and ".join(f for f in filters if f) + return f"where {where_clause}" if where_clause else "" + + def metric_sql(self, assertion: FieldMetricAssertion) -> str: + metric_sql_mapping = { + FieldMetric.UNIQUE_COUNT: self.unique_count_sql, + FieldMetric.UNIQUE_PERCENTAGE: self.unique_percentage_sql, + 
FieldMetric.NULL_COUNT: self.null_count_sql, + FieldMetric.NULL_PERCENTAGE: self.null_percentage_sql, + FieldMetric.MIN: self.min_sql, + FieldMetric.MAX: self.max_sql, + FieldMetric.MEAN: self.mean_sql, + FieldMetric.MEDIAN: self.median_sql, + FieldMetric.STDDEV: self.stddev_sql, + FieldMetric.NEGATIVE_COUNT: self.negative_count_sql, + FieldMetric.NEGATIVE_PERCENTAGE: self.negative_percentage_sql, + FieldMetric.ZERO_COUNT: self.zero_count_sql, + FieldMetric.ZERO_PERCENTAGE: self.zero_percentage_sql, + FieldMetric.MIN_LENGTH: self.min_length_sql, + FieldMetric.MAX_LENGTH: self.max_length_sql, + FieldMetric.EMPTY_COUNT: self.empty_count_sql, + FieldMetric.EMPTY_PERCENTAGE: self.empty_percentage_sql, + } + + entity_name = ".".join(get_entity_name(assertion)) + + return metric_sql_mapping[assertion.metric]( + assertion.field, + entity_name, + ( + assertion.filters.sql + if assertion.filters and assertion.filters.sql + else None + ), + ) diff --git a/metadata-ingestion/src/datahub/integrations/assertion/snowflake/field_values_metric_sql_generator.py b/metadata-ingestion/src/datahub/integrations/assertion/snowflake/field_values_metric_sql_generator.py new file mode 100644 index 0000000000000..b77cc971d3a45 --- /dev/null +++ b/metadata-ingestion/src/datahub/integrations/assertion/snowflake/field_values_metric_sql_generator.py @@ -0,0 +1,283 @@ +from functools import singledispatchmethod +from typing import List, Optional + +from datahub.api.entities.assertion.assertion_operator import ( + BetweenOperator, + ContainsOperator, + EndsWithOperator, + EqualToOperator, + GreaterThanOperator, + GreaterThanOrEqualToOperator, + InOperator, + IsFalseOperator, + IsNullOperator, + IsTrueOperator, + LessThanOperator, + LessThanOrEqualToOperator, + MatchesRegexOperator, + NotEqualToOperator, + NotInOperator, + NotNullOperator, + Operators, + StartsWithOperator, +) +from datahub.api.entities.assertion.field_assertion import ( + FieldTransform, + FieldValuesAssertion, +) +from datahub.integrations.assertion.common import get_entity_name + + +class SnowflakeFieldValuesMetricSQLGenerator: + @singledispatchmethod + def values_metric_sql( + self, + operators: Operators, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + """ + Generates SQL that would return boolean value for each table row. + 1 if FAIL and 0 if PASS. Note the unusual reversal of 1 and 0. + This is deliberate, as metric represents number of failing rows. 
+ """ + raise ValueError(f"Unsupported values metric operator type {type(operators)} ") + + @values_metric_sql.register + def _( + self, + operators: InOperator, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + return f"""select case when {transformed_field} in {tuple(operators.value)} then 0 else 1 end + from {entity_name} {where_clause}""" + + @values_metric_sql.register + def _( + self, + operators: NotInOperator, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + return f"""select case when {transformed_field} not in {tuple(operators.value)} then 0 else 1 end + from {entity_name} {where_clause}""" + + @values_metric_sql.register + def _( + self, + operators: EqualToOperator, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + return f"""select case when {transformed_field} = {operators.value} then 0 else 1 end + from {entity_name} {where_clause}""" + + @values_metric_sql.register + def _( + self, + operators: NotEqualToOperator, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + return f"""select case when {transformed_field} != {operators.value} then 0 else 1 end + from {entity_name} {where_clause}""" + + @values_metric_sql.register + def _( + self, + operators: BetweenOperator, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + return f"""select case when {transformed_field} between {operators.min} and {operators.max} then 0 else 1 end + from {entity_name} {where_clause}""" + + @values_metric_sql.register + def _( + self, + operators: LessThanOperator, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + return f"""select case when {transformed_field} < {operators.value} then 0 else 1 end + from {entity_name} {where_clause}""" + + @values_metric_sql.register + def _( + self, + operators: LessThanOrEqualToOperator, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + return f"""select case when {transformed_field} <= {operators.value} then 0 else 1 end + from {entity_name} {where_clause}""" + + @values_metric_sql.register + def _( + self, + operators: GreaterThanOperator, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + return f"""select case when {transformed_field} > {operators.value} then 0 else 1 end + from {entity_name} {where_clause}""" + + @values_metric_sql.register + def _( + self, + operators: GreaterThanOrEqualToOperator, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + return f"""select case when {transformed_field} >= {operators.value} then 0 else 1 end + from {entity_name} {where_clause}""" + + @values_metric_sql.register + def _( + self, + operators: IsNullOperator, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + return f"""select case when {transformed_field} is null then 0 else 1 end + from {entity_name} {where_clause}""" + + @values_metric_sql.register + def _( + self, + operators: NotNullOperator, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + return f"""select case when {transformed_field} is not null then 0 else 1 end + from {entity_name} {where_clause}""" + + @values_metric_sql.register + def _( + self, + operators: IsTrueOperator, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + return f"""select case when {transformed_field} then 0 else 1 end + from {entity_name} {where_clause}""" + + 
@values_metric_sql.register + def _( + self, + operators: IsFalseOperator, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + return f"""select case when not {transformed_field} then 0 else 1 end + from {entity_name} {where_clause}""" + + @values_metric_sql.register + def _( + self, + operators: ContainsOperator, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + return f"""select case when contains({transformed_field},'{operators.value}') then 0 else 1 end + from {entity_name} {where_clause}""" + + @values_metric_sql.register + def _( + self, + operators: StartsWithOperator, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + return f"""select case when startswith({transformed_field},'{operators.value}') then 0 else 1 end + from {entity_name} {where_clause}""" + + @values_metric_sql.register + def _( + self, + operators: EndsWithOperator, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + return f"""select case when endswith({transformed_field},'{operators.value}') then 0 else 1 end + from {entity_name} {where_clause}""" + + @values_metric_sql.register + def _( + self, + operators: MatchesRegexOperator, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + return f"""select case when REGEXP_LIKE({transformed_field},'{operators.value}') then 0 else 1 end + from {entity_name} {where_clause}""" + + def _setup_where_clause(self, filters: List[Optional[str]]) -> str: + where_clause = " and ".join(f for f in filters if f) + return f"where {where_clause}" if where_clause else "" + + def _setup_field_transform( + self, field: str, transform: Optional[FieldTransform] + ) -> str: + if transform is None: + return field + elif transform is FieldTransform.LENGTH: + return f"length({field})" + raise ValueError(f"Unsupported transform type {transform}") + + def metric_sql(self, assertion: FieldValuesAssertion) -> str: + """ + Note that this applies negative operator in order to check whether or not + number of invalid value rows are less than configured failThreshold. + + Args: + assertion (FieldValuesAssertion): _description_ + + Returns: + str: _description_ + """ + entity_name = ".".join(get_entity_name(assertion)) + + dataset_filter = ( + assertion.filters.sql + if assertion.filters and assertion.filters.sql + else None + ) + where_clause = self._setup_where_clause( + [ + dataset_filter, + f"{assertion.field} is not null" if assertion.exclude_nulls else None, + ] + ) + transformed_field = self._setup_field_transform( + assertion.field, assertion.field_transform + ) + # this sql would return boolean value for each table row. 1 if fail and 0 if pass. + sql = self.values_metric_sql( + assertion.operator, entity_name, transformed_field, where_clause + ) + + # metric would be number of failing rows OR percentage of failing rows. 
+ if assertion.failure_threshold.type == "count": + return f"select sum($1) as metric from ({sql})" + else: # percentage + return f"select sum($1)/count(*) as metric from ({sql})" diff --git a/metadata-ingestion/src/datahub/integrations/assertion/snowflake/metric_operator_sql_generator.py b/metadata-ingestion/src/datahub/integrations/assertion/snowflake/metric_operator_sql_generator.py new file mode 100644 index 0000000000000..e7549d105b3f6 --- /dev/null +++ b/metadata-ingestion/src/datahub/integrations/assertion/snowflake/metric_operator_sql_generator.py @@ -0,0 +1,68 @@ +from functools import singledispatchmethod + +from datahub.api.entities.assertion.assertion_operator import ( + BetweenOperator, + EqualToOperator, + GreaterThanOperator, + GreaterThanOrEqualToOperator, + IsFalseOperator, + IsNullOperator, + IsTrueOperator, + LessThanOperator, + LessThanOrEqualToOperator, + NotNullOperator, + Operators, +) + + +class SnowflakeMetricEvalOperatorSQLGenerator: + @singledispatchmethod + def operator_sql(self, operators: Operators, metric_sql: str) -> str: + """ + Generates Operator SQL that applies operator on `metric` + and returns a numeric boolean value 1 if PASS, 0 if FAIL + + """ + raise ValueError(f"Unsupported metric operator type {type(operators)} ") + + @operator_sql.register + def _(self, operators: EqualToOperator, metric_sql: str) -> str: + return f"select case when metric={operators.value} then 1 else 0 end from ({metric_sql})" + + @operator_sql.register + def _(self, operators: BetweenOperator, metric_sql: str) -> str: + return f"select case when metric between {operators.min} and {operators.max} then 1 else 0 end from ({metric_sql})" + + @operator_sql.register + def _(self, operators: LessThanOperator, metric_sql: str) -> str: + return f"select case when metric < {operators.value} then 1 else 0 end from ({metric_sql})" + + @operator_sql.register + def _(self, operators: LessThanOrEqualToOperator, metric_sql: str) -> str: + return f"select case when metric <= {operators.value} then 1 else 0 end from ({metric_sql})" + + @operator_sql.register + def _(self, operators: GreaterThanOperator, metric_sql: str) -> str: + return f"select case when metric > {operators.value} then 1 else 0 end from ({metric_sql})" + + @operator_sql.register + def _(self, operators: GreaterThanOrEqualToOperator, metric_sql: str) -> str: + return f"select case when metric >= {operators.value} then 1 else 0 end from ({metric_sql})" + + @operator_sql.register + def _(self, operators: NotNullOperator, metric_sql: str) -> str: + return ( + f"select case when metric is not null then 1 else 0 end from ({metric_sql})" + ) + + @operator_sql.register + def _(self, operators: IsNullOperator, metric_sql: str) -> str: + return f"select case when metric is null then 1 else 0 end from ({metric_sql})" + + @operator_sql.register + def _(self, operators: IsTrueOperator, metric_sql: str) -> str: + return f"select case when metric then 1 else 0 end from ({metric_sql})" + + @operator_sql.register + def _(self, operators: IsFalseOperator, metric_sql: str) -> str: + return f"select case when not metric then 1 else 0 end from ({metric_sql})" diff --git a/metadata-ingestion/src/datahub/integrations/assertion/snowflake/metric_sql_generator.py b/metadata-ingestion/src/datahub/integrations/assertion/snowflake/metric_sql_generator.py new file mode 100644 index 0000000000000..5b079129e0a9c --- /dev/null +++ b/metadata-ingestion/src/datahub/integrations/assertion/snowflake/metric_sql_generator.py @@ -0,0 +1,97 @@ +from dataclasses 
import dataclass +from functools import singledispatchmethod + +from datahub.api.entities.assertion.assertion import BaseEntityAssertion +from datahub.api.entities.assertion.field_assertion import ( + FieldMetricAssertion, + FieldValuesAssertion, +) +from datahub.api.entities.assertion.freshness_assertion import ( + FixedIntervalFreshnessAssertion, + FreshnessSourceType, +) +from datahub.api.entities.assertion.sql_assertion import ( + SqlMetricAssertion, + SqlMetricChangeAssertion, +) +from datahub.api.entities.assertion.volume_assertion import ( + RowCountChangeVolumeAssertion, + RowCountTotalVolumeAssertion, +) +from datahub.integrations.assertion.common import get_entity_name +from datahub.integrations.assertion.snowflake.field_metric_sql_generator import ( + SnowflakeFieldMetricSQLGenerator, +) +from datahub.integrations.assertion.snowflake.field_values_metric_sql_generator import ( + SnowflakeFieldValuesMetricSQLGenerator, +) + + +@dataclass +class SnowflakeMetricSQLGenerator: + field_metric_sql_generator: SnowflakeFieldMetricSQLGenerator + field_values_metric_sql_generator: SnowflakeFieldValuesMetricSQLGenerator + + @singledispatchmethod + def metric_sql( + self, + assertion: BaseEntityAssertion, + ) -> str: + """Generates Metric SQL that typically returns a numeric metric""" + raise ValueError(f"Unsupported assertion type {type(assertion)} ") + + @metric_sql.register + def _(self, assertion: RowCountChangeVolumeAssertion) -> str: + raise ValueError(f"Unsupported assertion type {type(assertion)} ") + + @metric_sql.register + def _(self, assertion: SqlMetricChangeAssertion) -> str: + raise ValueError(f"Unsupported assertion type {type(assertion)} ") + + @metric_sql.register + def _(self, assertion: FixedIntervalFreshnessAssertion) -> str: + entity_name = ".".join(get_entity_name(assertion)) + if assertion.filters and assertion.filters.sql: + where_clause = f"where {assertion.filters.sql}" + else: + where_clause = "" + + if ( + assertion.source_type == FreshnessSourceType.LAST_MODIFIED_COLUMN + and assertion.last_modified_field + ): + return f"""select timediff( + second, + max({assertion.last_modified_field}::TIMESTAMP_LTZ), + SNOWFLAKE.CORE.DATA_METRIC_SCHEDULED_TIME() + ) as metric from {entity_name} {where_clause}""" + else: + raise ValueError( + f"Unsupported freshness source type {assertion.source_type} " + ) + + @metric_sql.register + def _(self, assertion: RowCountTotalVolumeAssertion) -> str: + + # Can not use information schema here due to error - + # Data metric function body cannot refer to the non-deterministic function 'CURRENT_DATABASE_MAIN_METASTORE_ID'. 
+ + entity_name = ".".join(get_entity_name(assertion)) + if assertion.filters and assertion.filters.sql: + where_clause = f"where {assertion.filters.sql}" + else: + where_clause = "" + return f"select count(*) as metric from {entity_name} {where_clause}" + + @metric_sql.register + def _(self, assertion: SqlMetricAssertion) -> str: + return f"select $1 as metric from ({assertion.statement})" + + @metric_sql.register + def _(self, assertion: FieldMetricAssertion) -> str: + sql = self.field_metric_sql_generator.metric_sql(assertion) + return f"select $1 as metric from ({sql})" + + @metric_sql.register + def _(self, assertion: FieldValuesAssertion) -> str: + return self.field_values_metric_sql_generator.metric_sql(assertion) diff --git a/metadata-ingestion/tests/integration/iceberg/iceberg_deleted_table_mces_golden.json b/metadata-ingestion/tests/integration/iceberg/iceberg_deleted_table_mces_golden.json index cc94625560a43..21f1f634c4563 100644 --- a/metadata-ingestion/tests/integration/iceberg/iceberg_deleted_table_mces_golden.json +++ b/metadata-ingestion/tests/integration/iceberg/iceberg_deleted_table_mces_golden.json @@ -1,184 +1,189 @@ [ - { - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,test_platform_instance.nyc.another_taxis,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "owner": "root", - "created-at": "2023-07-04T14:23:10.457317300Z", - "write.format.default": "parquet", - "location": "s3a://warehouse/wh/nyc/another_taxis", - "format-version": "1", - "snapshot-id": "6904764113937987369", - "manifest-list": "s3a://warehouse/wh/nyc/another_taxis/metadata/snap-6904764113937987369-1-f18ce54a-d59c-461a-a066-9d3085ccf2f2.avro" +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,test_platform_instance.nyc.another_taxis,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "owner": "root", + "created-at": "2024-05-22T14:09:15.234903700Z", + "write.format.default": "parquet", + "location": "s3a://warehouse/wh/nyc/another_taxis", + "format-version": "1", + "partition-spec": "[{\"name\": \"trip_date\", \"transform\": \"identity\", \"source\": \"trip_date\", \"source-id\": 2, \"source-type\": \"timestamptz\", \"field-id\": 1000}]", + "snapshot-id": "1706020810864905360", + "manifest-list": "s3a://warehouse/wh/nyc/another_taxis/metadata/snap-1706020810864905360-1-90ad8346-ac1b-4e73-bb30-dfd9b0b0e0dc.avro" + }, + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:root", + "type": "TECHNICAL_OWNER" }, - "tags": [] - } - }, - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:root", - "type": "TECHNICAL_OWNER" - }, - { - "owner": "urn:li:corpGroup:root", - "type": "TECHNICAL_OWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + { + "owner": "urn:li:corpGroup:root", + "type": "TECHNICAL_OWNER" } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": 
"nyc.another_taxis", - "platform": "urn:li:dataPlatform:iceberg", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.OtherSchema": { - "rawSchema": "table {\n 1: vendor_id: optional long\n 2: trip_date: optional timestamptz\n 3: trip_id: optional long\n 4: trip_distance: optional float\n 5: fare_amount: optional double\n 6: store_and_fwd_flag: optional string\n}" - } - }, - "fields": [ - { - "fieldPath": "[version=2.0].[type=struct].[type=long].vendor_id", - "nullable": true, + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "nyc.another_taxis", + "platform": "urn:li:dataPlatform:iceberg", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "table {\n 1: vendor_id: optional long\n 2: trip_date: optional timestamptz\n 3: trip_id: optional long\n 4: trip_distance: optional float\n 5: fare_amount: optional double\n 6: store_and_fwd_flag: optional string\n}" + } + }, + "fields": [ + { + "fieldPath": "[version=2.0].[type=struct].[type=long].vendor_id", + "nullable": true, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "long", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + "com.linkedin.pegasus2avro.schema.NumberType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=long].trip_date", - "nullable": true, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=long].trip_date", + "nullable": true, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.TimeType": {} - } - }, - "nativeDataType": "timestamptz", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"logicalType\": \"timestamp-micros\", \"native_data_type\": \"timestamptz\", \"_nullable\": true}" + "com.linkedin.pegasus2avro.schema.TimeType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=long].trip_id", - "nullable": true, + "nativeDataType": "timestamptz", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"logicalType\": \"timestamp-micros\", \"native_data_type\": \"timestamptz\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=long].trip_id", + "nullable": true, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "long", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + "com.linkedin.pegasus2avro.schema.NumberType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=float].trip_distance", - "nullable": true, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=float].trip_distance", + "nullable": true, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "float", - 
"recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"float\", \"_nullable\": true}" + "com.linkedin.pegasus2avro.schema.NumberType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=double].fare_amount", - "nullable": true, + "nativeDataType": "float", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"float\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=double].fare_amount", + "nullable": true, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "double", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}" + "com.linkedin.pegasus2avro.schema.NumberType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=string].store_and_fwd_flag", - "nullable": true, + "nativeDataType": "double", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=string].store_and_fwd_flag", + "nullable": true, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" - } - ] - } + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + } + ] } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "iceberg-2020_04_14-07_00_00" + } + ] } }, - { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,test_platform_instance.nyc.another_taxis,PROD)", - "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", - "aspect": { - "json": { - "platform": "urn:li:dataPlatform:iceberg", - "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:iceberg,test_platform_instance)" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "iceberg-2020_04_14-07_00_00" + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "iceberg-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,test_platform_instance.nyc.another_taxis,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:iceberg", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:iceberg,test_platform_instance)" } }, - { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,test_platform_instance.nyc.taxis,PROD)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": true - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "iceberg-2020_04_14-07_00_00" + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "iceberg-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,test_platform_instance.nyc.taxis,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": true } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": 
"iceberg-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } - ] \ No newline at end of file +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/iceberg_ingest_mces_golden.json b/metadata-ingestion/tests/integration/iceberg/iceberg_ingest_mces_golden.json index 163911623470e..b017b6cd31520 100644 --- a/metadata-ingestion/tests/integration/iceberg/iceberg_ingest_mces_golden.json +++ b/metadata-ingestion/tests/integration/iceberg/iceberg_ingest_mces_golden.json @@ -1,153 +1,156 @@ [ - { - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,nyc.taxis,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "owner": "root", - "created-at": "2023-06-12T17:32:17.227545005Z", - "write.format.default": "parquet", - "location": "s3a://warehouse/wh/nyc/taxis", - "format-version": "1", - "snapshot-id": "2505818429184337337", - "manifest-list": "s3a://warehouse/wh/nyc/taxis/metadata/snap-2505818429184337337-1-a64915c4-afc8-40e3-97a7-98b072b42e10.avro" +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,nyc.taxis,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "owner": "root", + "created-at": "2024-05-22T14:08:04.001538500Z", + "write.format.default": "parquet", + "location": "s3a://warehouse/wh/nyc/taxis", + "format-version": "1", + "partition-spec": "[{\"name\": \"trip_date\", \"transform\": \"identity\", \"source\": \"trip_date\", \"source-id\": 2, \"source-type\": \"timestamptz\", \"field-id\": 1000}]", + "snapshot-id": "5259199139271057622", + "manifest-list": "s3a://warehouse/wh/nyc/taxis/metadata/snap-5259199139271057622-1-24dca7b8-d437-458e-ae91-df1d3e30bdc8.avro" + }, + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:root", + "type": "TECHNICAL_OWNER" }, - "tags": [] - } - }, - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:root", - "type": "TECHNICAL_OWNER" - }, - { - "owner": "urn:li:corpGroup:root", - "type": "TECHNICAL_OWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + { + "owner": "urn:li:corpGroup:root", + "type": "TECHNICAL_OWNER" } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "nyc.taxis", - "platform": "urn:li:dataPlatform:iceberg", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.OtherSchema": { - "rawSchema": "table {\n 1: vendor_id: optional long\n 2: trip_date: optional timestamptz\n 3: trip_id: optional long\n 4: trip_distance: optional float\n 5: fare_amount: optional double\n 6: store_and_fwd_flag: optional string\n}" - } - }, - "fields": [ - { - "fieldPath": "[version=2.0].[type=struct].[type=long].vendor_id", - "nullable": true, + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + 
"schemaName": "nyc.taxis", + "platform": "urn:li:dataPlatform:iceberg", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "table {\n 1: vendor_id: optional long\n 2: trip_date: optional timestamptz\n 3: trip_id: optional long\n 4: trip_distance: optional float\n 5: fare_amount: optional double\n 6: store_and_fwd_flag: optional string\n}" + } + }, + "fields": [ + { + "fieldPath": "[version=2.0].[type=struct].[type=long].vendor_id", + "nullable": true, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "long", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + "com.linkedin.pegasus2avro.schema.NumberType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=long].trip_date", - "nullable": true, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=long].trip_date", + "nullable": true, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.TimeType": {} - } - }, - "nativeDataType": "timestamptz", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"logicalType\": \"timestamp-micros\", \"native_data_type\": \"timestamptz\", \"_nullable\": true}" + "com.linkedin.pegasus2avro.schema.TimeType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=long].trip_id", - "nullable": true, + "nativeDataType": "timestamptz", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"logicalType\": \"timestamp-micros\", \"native_data_type\": \"timestamptz\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=long].trip_id", + "nullable": true, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "long", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + "com.linkedin.pegasus2avro.schema.NumberType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=float].trip_distance", - "nullable": true, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=float].trip_distance", + "nullable": true, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "float", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"float\", \"_nullable\": true}" + "com.linkedin.pegasus2avro.schema.NumberType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=double].fare_amount", - "nullable": true, + "nativeDataType": "float", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"float\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=double].fare_amount", + "nullable": true, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "double", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"double\", 
\"_nullable\": true}" + "com.linkedin.pegasus2avro.schema.NumberType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=string].store_and_fwd_flag", - "nullable": true, + "nativeDataType": "double", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=string].store_and_fwd_flag", + "nullable": true, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" - } - ] - } + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + } + ] } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "iceberg-test" + } + ] } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "iceberg-test", + "lastRunId": "no-run-id-provided" } - ] \ No newline at end of file +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/iceberg_profile_mces_golden.json b/metadata-ingestion/tests/integration/iceberg/iceberg_profile_mces_golden.json index bdb7091014626..453a79494fa25 100644 --- a/metadata-ingestion/tests/integration/iceberg/iceberg_profile_mces_golden.json +++ b/metadata-ingestion/tests/integration/iceberg/iceberg_profile_mces_golden.json @@ -1,216 +1,220 @@ [ - { - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,nyc.taxis,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "owner": "root", - "created-at": "2023-06-12T17:33:25.422993540Z", - "write.format.default": "parquet", - "location": "s3a://warehouse/wh/nyc/taxis", - "format-version": "1", - "snapshot-id": "2585047006374307840", - "manifest-list": "s3a://warehouse/wh/nyc/taxis/metadata/snap-2585047006374307840-1-2e2bef19-40d1-4ad1-8fad-e57783477710.avro" +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,nyc.taxis,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "owner": "root", + "created-at": "2024-05-22T14:10:22.926080700Z", + "write.format.default": "parquet", + "location": "s3a://warehouse/wh/nyc/taxis", + "format-version": "1", + "partition-spec": "[{\"name\": \"trip_date\", \"transform\": \"identity\", \"source\": \"trip_date\", \"source-id\": 2, \"source-type\": \"timestamptz\", \"field-id\": 1000}]", + "snapshot-id": "564034874306625146", + "manifest-list": "s3a://warehouse/wh/nyc/taxis/metadata/snap-564034874306625146-1-562a1705-d774-4e0a-baf0-1988bcc7be72.avro" + }, + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:root", + "type": "TECHNICAL_OWNER" }, - "tags": [] - } - }, - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:root", - "type": "TECHNICAL_OWNER" - }, - { - "owner": "urn:li:corpGroup:root", - "type": 
"TECHNICAL_OWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + { + "owner": "urn:li:corpGroup:root", + "type": "TECHNICAL_OWNER" } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "nyc.taxis", - "platform": "urn:li:dataPlatform:iceberg", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.OtherSchema": { - "rawSchema": "table {\n 1: vendor_id: optional long\n 2: trip_date: optional timestamptz\n 3: trip_id: optional long\n 4: trip_distance: optional float\n 5: fare_amount: optional double\n 6: store_and_fwd_flag: optional string\n}" - } - }, - "fields": [ - { - "fieldPath": "[version=2.0].[type=struct].[type=long].vendor_id", - "nullable": true, + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "nyc.taxis", + "platform": "urn:li:dataPlatform:iceberg", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "table {\n 1: vendor_id: optional long\n 2: trip_date: optional timestamptz\n 3: trip_id: optional long\n 4: trip_distance: optional float\n 5: fare_amount: optional double\n 6: store_and_fwd_flag: optional string\n}" + } + }, + "fields": [ + { + "fieldPath": "[version=2.0].[type=struct].[type=long].vendor_id", + "nullable": true, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "long", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + "com.linkedin.pegasus2avro.schema.NumberType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=long].trip_date", - "nullable": true, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=long].trip_date", + "nullable": true, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.TimeType": {} - } - }, - "nativeDataType": "timestamptz", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"logicalType\": \"timestamp-micros\", \"native_data_type\": \"timestamptz\", \"_nullable\": true}" + "com.linkedin.pegasus2avro.schema.TimeType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=long].trip_id", - "nullable": true, + "nativeDataType": "timestamptz", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"logicalType\": \"timestamp-micros\", \"native_data_type\": \"timestamptz\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=long].trip_id", + "nullable": true, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "long", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + "com.linkedin.pegasus2avro.schema.NumberType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=float].trip_distance", - "nullable": true, + "nativeDataType": "long", + "recursive": false, + 
"isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=float].trip_distance", + "nullable": true, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "float", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"float\", \"_nullable\": true}" + "com.linkedin.pegasus2avro.schema.NumberType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=double].fare_amount", - "nullable": true, + "nativeDataType": "float", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"float\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=double].fare_amount", + "nullable": true, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "double", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}" + "com.linkedin.pegasus2avro.schema.NumberType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=string].store_and_fwd_flag", - "nullable": true, + "nativeDataType": "double", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=string].store_and_fwd_flag", + "nullable": true, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" - } - ] - } + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + } + ] } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "iceberg-test" + } + ] } }, - { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,nyc.taxis,PROD)", - "changeType": "UPSERT", - "aspectName": "datasetProfile", - "aspect": { - "json": { - "timestampMillis": 1586847600000, - "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "iceberg-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,nyc.taxis,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProfile", + "aspect": { + "json": { + "timestampMillis": 1586847600000, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "rowCount": 5, + "columnCount": 6, + "fieldProfiles": [ + { + "fieldPath": "vendor_id", + "nullCount": 0, + "nullProportion": 0.0, + "min": "1", + "max": "3" }, - "rowCount": 5, - "columnCount": 6, - "fieldProfiles": [ - { - "fieldPath": "vendor_id", - "nullCount": 0, - "nullProportion": 0.0, - "min": "1", - "max": "3" - }, - { - "fieldPath": "trip_date", - "nullCount": 0, - "nullProportion": 0.0, - "min": "2000-01-01T12:00:00+00:00", - "max": "2000-01-04T12:00:00+00:00" - }, - { - "fieldPath": "trip_id", - "nullCount": 0, - "nullProportion": 0.0, - "min": "1000371", - "max": "1000375" - }, - { - "fieldPath": "trip_distance", - "nullCount": 0, - "nullProportion": 0.0, - "min": "0.0", - "max": 
"8.399999618530273" - }, - { - "fieldPath": "fare_amount", - "nullCount": 0, - "nullProportion": 0.0, - "min": "0.0", - "max": "42.13" - }, - { - "fieldPath": "store_and_fwd_flag", - "nullCount": 0, - "nullProportion": 0.0 - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "iceberg-test" + { + "fieldPath": "trip_date", + "nullCount": 0, + "nullProportion": 0.0, + "min": "2000-01-01T12:00:00+00:00", + "max": "2000-01-04T12:00:00+00:00" + }, + { + "fieldPath": "trip_id", + "nullCount": 0, + "nullProportion": 0.0, + "min": "1000371", + "max": "1000375" + }, + { + "fieldPath": "trip_distance", + "nullCount": 0, + "nullProportion": 0.0, + "min": "0.0", + "max": "8.399999618530273" + }, + { + "fieldPath": "fare_amount", + "nullCount": 0, + "nullProportion": 0.0, + "min": "0.0", + "max": "42.13" + }, + { + "fieldPath": "store_and_fwd_flag", + "nullCount": 0, + "nullProportion": 0.0 + } + ] } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "iceberg-test", + "lastRunId": "no-run-id-provided" } - ] \ No newline at end of file +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/iceberg_profile_to_file.yml b/metadata-ingestion/tests/integration/iceberg/iceberg_profile_to_file.yml index 197c03bf2ee8d..38c216e1bf49f 100644 --- a/metadata-ingestion/tests/integration/iceberg/iceberg_profile_to_file.yml +++ b/metadata-ingestion/tests/integration/iceberg/iceberg_profile_to_file.yml @@ -4,9 +4,8 @@ source: type: iceberg config: catalog: - name: default - type: rest - config: + default: + type: rest uri: http://localhost:8181 s3.access-key-id: admin s3.secret-access-key: password diff --git a/metadata-ingestion/tests/integration/iceberg/iceberg_to_file.yml b/metadata-ingestion/tests/integration/iceberg/iceberg_to_file.yml index 8b5d035aed259..2624ee6d0df81 100644 --- a/metadata-ingestion/tests/integration/iceberg/iceberg_to_file.yml +++ b/metadata-ingestion/tests/integration/iceberg/iceberg_to_file.yml @@ -4,9 +4,8 @@ source: type: iceberg config: catalog: - name: default - type: rest - config: + default: + type: rest uri: http://localhost:8181 s3.access-key-id: admin s3.secret-access-key: password diff --git a/metadata-ingestion/tests/integration/iceberg/test_iceberg.py b/metadata-ingestion/tests/integration/iceberg/test_iceberg.py index 24a636077bfdd..5a12afa457f01 100644 --- a/metadata-ingestion/tests/integration/iceberg/test_iceberg.py +++ b/metadata-ingestion/tests/integration/iceberg/test_iceberg.py @@ -79,9 +79,8 @@ def test_iceberg_stateful_ingest( "type": "iceberg", "config": { "catalog": { - "name": "default", - "type": "rest", - "config": { + "default": { + "type": "rest", "uri": "http://localhost:8181", "s3.access-key-id": "admin", "s3.secret-access-key": "password", diff --git a/metadata-ingestion/tests/unit/api/entities/assertion/__init__.py b/metadata-ingestion/tests/unit/api/entities/assertion/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/metadata-ingestion/tests/unit/api/entities/assertion/test_assertion_config.yml b/metadata-ingestion/tests/unit/api/entities/assertion/test_assertion_config.yml new file mode 100644 index 0000000000000..a44945a30f9a3 --- /dev/null +++ b/metadata-ingestion/tests/unit/api/entities/assertion/test_assertion_config.yml @@ -0,0 +1,76 @@ +version: 1 +namespace: test-config-id-1 +assertions: + # Freshness Assertion + - entity: urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.public.test_assertions_all_times,PROD) + type: freshness + 
lookback_interval: "1 hour" + last_modified_field: col_timestamp + schedule: + type: cron + cron: 0 * * * * + meta: + entity_qualified_name: TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES + entity_schema: + - col: col_date + native_type: DATE + # Volume Assertion + - type: volume + entity: urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.public.test_assertions_all_times,PROD) + metric: row_count + condition: + type: less_than_or_equal_to + value: 1000 + schedule: + type: cron + cron: 0 * * * * + meta: + entity_qualified_name: TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES + entity_schema: + - col: col_date + native_type: DATE + # Field Metric Assertion + - type: field + entity: urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.public.test_assertions_all_times,PROD) + field: col_date + metric: null_count + condition: + type: equal_to + value: 0 + schedule: + type: cron + cron: 0 * * * * + meta: + entity_qualified_name: TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES + entity_schema: + - col: col_date + native_type: DATE + # Field Value Assertion + - type: field + entity: urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.public.purchase_event,PROD) + field: quantity + condition: + type: between + min: 0 + max: 10 + schedule: + type: on_table_change + meta: + entity_qualified_name: TEST_DB.PUBLIC.PURCHASE_EVENT + entity_schema: + - col: quantity + native_type: FLOAT + # Custom SQL Metric Assertion + - type: sql + entity: urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.public.purchase_event,PROD) + statement: select mode(quantity) from test_db.public.purchase_event + condition: + type: equal_to + value: 5 + schedule: + type: on_table_change + meta: + entity_qualified_name: TEST_DB.PUBLIC.PURCHASE_EVENT + entity_schema: + - col: quantity + native_type: FLOAT diff --git a/metadata-ingestion/tests/unit/api/entities/assertion/test_assertion_config_spec.py b/metadata-ingestion/tests/unit/api/entities/assertion/test_assertion_config_spec.py new file mode 100644 index 0000000000000..74f13ac7b2a19 --- /dev/null +++ b/metadata-ingestion/tests/unit/api/entities/assertion/test_assertion_config_spec.py @@ -0,0 +1,13 @@ +from datahub.api.entities.assertion.assertion_config_spec import AssertionsConfigSpec + + +def test_assertion_config_spec_parses_correct_type(pytestconfig): + config_file = ( + pytestconfig.rootpath + / "tests/unit/api/entities/assertion/test_assertion_config.yml" + ) + + config_spec = AssertionsConfigSpec.from_yaml(config_file) + assert config_spec.version == 1 + assert config_spec.id == "test-config-id-1" + assert len(config_spec.assertions) == 5 diff --git a/metadata-ingestion/tests/unit/cli/assertion/__init__.py b/metadata-ingestion/tests/unit/cli/assertion/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/metadata-ingestion/tests/unit/cli/assertion/dmf_associations.sql b/metadata-ingestion/tests/unit/cli/assertion/dmf_associations.sql new file mode 100644 index 0000000000000..7e6b1982515e0 --- /dev/null +++ b/metadata-ingestion/tests/unit/cli/assertion/dmf_associations.sql @@ -0,0 +1,35 @@ + +-- Start of Assertion 025cce4dd4123c0f007908011a9c64d7 + + ALTER TABLE TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES SET DATA_METRIC_SCHEDULE = 'USING CRON 0 * * * * UTC'; + ALTER TABLE TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES ADD DATA METRIC FUNCTION test_db.datahub_dmfs.datahub__025cce4dd4123c0f007908011a9c64d7 ON (col_date); + +-- End of Assertion 025cce4dd4123c0f007908011a9c64d7 + +-- Start of Assertion 5c32eef47bd763fece7d21c7cbf6c659 + + ALTER TABLE 
TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES SET DATA_METRIC_SCHEDULE = 'USING CRON 0 * * * * UTC'; + ALTER TABLE TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES ADD DATA METRIC FUNCTION test_db.datahub_dmfs.datahub__5c32eef47bd763fece7d21c7cbf6c659 ON (col_date); + +-- End of Assertion 5c32eef47bd763fece7d21c7cbf6c659 + +-- Start of Assertion 04be4145bd8de10bed3dfcb0cee57842 + + ALTER TABLE TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES SET DATA_METRIC_SCHEDULE = 'USING CRON 0 * * * * UTC'; + ALTER TABLE TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES ADD DATA METRIC FUNCTION test_db.datahub_dmfs.datahub__04be4145bd8de10bed3dfcb0cee57842 ON (col_date); + +-- End of Assertion 04be4145bd8de10bed3dfcb0cee57842 + +-- Start of Assertion b065942d2bca8a4dbe90cc3ec2d9ca9f + + ALTER TABLE TEST_DB.PUBLIC.PURCHASE_EVENT SET DATA_METRIC_SCHEDULE = 'TRIGGER_ON_CHANGES'; + ALTER TABLE TEST_DB.PUBLIC.PURCHASE_EVENT ADD DATA METRIC FUNCTION test_db.datahub_dmfs.datahub__b065942d2bca8a4dbe90cc3ec2d9ca9f ON (quantity); + +-- End of Assertion b065942d2bca8a4dbe90cc3ec2d9ca9f + +-- Start of Assertion 170dbd53f28eedbbaba52ebbf189f6b1 + + ALTER TABLE TEST_DB.PUBLIC.PURCHASE_EVENT SET DATA_METRIC_SCHEDULE = 'TRIGGER_ON_CHANGES'; + ALTER TABLE TEST_DB.PUBLIC.PURCHASE_EVENT ADD DATA METRIC FUNCTION test_db.datahub_dmfs.datahub__170dbd53f28eedbbaba52ebbf189f6b1 ON (quantity); + +-- End of Assertion 170dbd53f28eedbbaba52ebbf189f6b1 diff --git a/metadata-ingestion/tests/unit/cli/assertion/dmf_definitions.sql b/metadata-ingestion/tests/unit/cli/assertion/dmf_definitions.sql new file mode 100644 index 0000000000000..85056e150b9b3 --- /dev/null +++ b/metadata-ingestion/tests/unit/cli/assertion/dmf_definitions.sql @@ -0,0 +1,71 @@ + +-- Start of Assertion 025cce4dd4123c0f007908011a9c64d7 + + CREATE or REPLACE DATA METRIC FUNCTION + test_db.datahub_dmfs.datahub__025cce4dd4123c0f007908011a9c64d7 (ARGT TABLE(col_date DATE)) + RETURNS NUMBER + COMMENT = 'Created via DataHub for assertion urn:li:assertion:025cce4dd4123c0f007908011a9c64d7 of type freshness' + AS + $$ + select case when metric <= 3600 then 1 else 0 end from (select timediff( + second, + max(col_timestamp::TIMESTAMP_LTZ), + SNOWFLAKE.CORE.DATA_METRIC_SCHEDULED_TIME() + ) as metric from TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES ) + $$; + +-- End of Assertion 025cce4dd4123c0f007908011a9c64d7 + +-- Start of Assertion 5c32eef47bd763fece7d21c7cbf6c659 + + CREATE or REPLACE DATA METRIC FUNCTION + test_db.datahub_dmfs.datahub__5c32eef47bd763fece7d21c7cbf6c659 (ARGT TABLE(col_date DATE)) + RETURNS NUMBER + COMMENT = 'Created via DataHub for assertion urn:li:assertion:5c32eef47bd763fece7d21c7cbf6c659 of type volume' + AS + $$ + select case when metric <= 1000 then 1 else 0 end from (select count(*) as metric from TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES ) + $$; + +-- End of Assertion 5c32eef47bd763fece7d21c7cbf6c659 + +-- Start of Assertion 04be4145bd8de10bed3dfcb0cee57842 + + CREATE or REPLACE DATA METRIC FUNCTION + test_db.datahub_dmfs.datahub__04be4145bd8de10bed3dfcb0cee57842 (ARGT TABLE(col_date DATE)) + RETURNS NUMBER + COMMENT = 'Created via DataHub for assertion urn:li:assertion:04be4145bd8de10bed3dfcb0cee57842 of type field' + AS + $$ + select case when metric=0 then 1 else 0 end from (select $1 as metric from (select count(*) + from TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES where col_date is null)) + $$; + +-- End of Assertion 04be4145bd8de10bed3dfcb0cee57842 + +-- Start of Assertion b065942d2bca8a4dbe90cc3ec2d9ca9f + + CREATE or REPLACE DATA METRIC FUNCTION + 
test_db.datahub_dmfs.datahub__b065942d2bca8a4dbe90cc3ec2d9ca9f (ARGT TABLE(quantity FLOAT)) + RETURNS NUMBER + COMMENT = 'Created via DataHub for assertion urn:li:assertion:b065942d2bca8a4dbe90cc3ec2d9ca9f of type field' + AS + $$ + select case when metric <= 0 then 1 else 0 end from (select sum($1) as metric from (select case when quantity between 0 and 10 then 0 else 1 end + from TEST_DB.PUBLIC.PURCHASE_EVENT where quantity is not null)) + $$; + +-- End of Assertion b065942d2bca8a4dbe90cc3ec2d9ca9f + +-- Start of Assertion 170dbd53f28eedbbaba52ebbf189f6b1 + + CREATE or REPLACE DATA METRIC FUNCTION + test_db.datahub_dmfs.datahub__170dbd53f28eedbbaba52ebbf189f6b1 (ARGT TABLE(quantity FLOAT)) + RETURNS NUMBER + COMMENT = 'Created via DataHub for assertion urn:li:assertion:170dbd53f28eedbbaba52ebbf189f6b1 of type sql' + AS + $$ + select case when metric=5 then 1 else 0 end from (select $1 as metric from (select mode(quantity) from test_db.public.purchase_event)) + $$; + +-- End of Assertion 170dbd53f28eedbbaba52ebbf189f6b1 diff --git a/metadata-ingestion/tests/unit/cli/assertion/test_compile.py b/metadata-ingestion/tests/unit/cli/assertion/test_compile.py new file mode 100644 index 0000000000000..47253b5b0d71e --- /dev/null +++ b/metadata-ingestion/tests/unit/cli/assertion/test_compile.py @@ -0,0 +1,42 @@ +import filecmp +import os + +from datahub.integrations.assertion.snowflake.compiler import ( + DMF_ASSOCIATIONS_FILE_NAME, + DMF_DEFINITIONS_FILE_NAME, +) +from tests.test_helpers.click_helpers import run_datahub_cmd + + +def test_compile_assertion_config_spec_for_snowflake(pytestconfig, tmp_path): + config_file = ( + pytestconfig.rootpath + / "tests/unit/api/entities/assertion/test_assertion_config.yml" + ).resolve() + + golden_file_path = pytestconfig.rootpath / "tests/unit/cli/assertion/" + run_datahub_cmd( + [ + "assertions", + "compile", + "-f", + f"{config_file}", + "-p", + "snowflake", + "-x", + "DMF_SCHEMA=test_db.datahub_dmfs", + "-o", + tmp_path, + ], + ) + + output_file_names = [ + DMF_DEFINITIONS_FILE_NAME, + DMF_ASSOCIATIONS_FILE_NAME, + ] + + for file_name in output_file_names: + assert os.path.exists(tmp_path / file_name) + assert filecmp.cmp( + golden_file_path / file_name, tmp_path / file_name + ), f"{file_name} is not as expected" diff --git a/metadata-ingestion/tests/unit/test_iceberg.py b/metadata-ingestion/tests/unit/test_iceberg.py index e2b463004f5a1..5df7a2f3aa944 100644 --- a/metadata-ingestion/tests/unit/test_iceberg.py +++ b/metadata-ingestion/tests/unit/test_iceberg.py @@ -34,7 +34,6 @@ IcebergSource, IcebergSourceConfig, ) -from datahub.ingestion.source.iceberg.iceberg_common import IcebergCatalogConfig from datahub.metadata.com.linkedin.pegasus2avro.schema import ArrayType, SchemaField from datahub.metadata.schema_classes import ( ArrayTypeClass, @@ -50,9 +49,7 @@ def with_iceberg_source() -> IcebergSource: - catalog: IcebergCatalogConfig = IcebergCatalogConfig( - name="test", type="rest", config={} - ) + catalog = {"test": {"type": "rest"}} return IcebergSource( ctx=PipelineContext(run_id="iceberg-source-test"), config=IcebergSourceConfig(catalog=catalog), @@ -95,14 +92,29 @@ def test_config_catalog_not_configured(): """ Test when an Iceberg catalog is provided, but not properly configured. 
""" + # When no catalog configurationis provided, the config should be invalid + with pytest.raises(ValidationError, match="type"): + IcebergSourceConfig(catalog={}) # type: ignore + + # When a catalog name is provided without configuration, the config should be invalid with pytest.raises(ValidationError): - IcebergCatalogConfig() # type: ignore + IcebergSourceConfig(catalog={"test": {}}) - with pytest.raises(ValidationError, match="conf"): - IcebergCatalogConfig(type="a type") # type: ignore - with pytest.raises(ValidationError, match="type"): - IcebergCatalogConfig(conf={}) # type: ignore +def test_config_deprecated_catalog_configuration(): + """ + Test when a deprecated Iceberg catalog configuration is provided, it should be converted to the current scheme. + """ + deprecated_config = { + "name": "test", + "type": "rest", + "config": {"uri": "http://a.uri.test", "another_prop": "another_value"}, + } + migrated_config = IcebergSourceConfig(catalog=deprecated_config) + assert migrated_config.catalog["test"] is not None + assert migrated_config.catalog["test"]["type"] == "rest" + assert migrated_config.catalog["test"]["uri"] == "http://a.uri.test" + assert migrated_config.catalog["test"]["another_prop"] == "another_value" def test_config_for_tests(): diff --git a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphQueryDAO.java b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphQueryDAO.java index 00c19fd3835cf..30b688761d584 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphQueryDAO.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphQueryDAO.java @@ -37,7 +37,6 @@ import io.opentelemetry.extension.annotations.WithSpan; import java.util.ArrayList; import java.util.Collections; -import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashSet; @@ -65,9 +64,18 @@ import org.opensearch.index.query.BoolQueryBuilder; import org.opensearch.index.query.QueryBuilder; import org.opensearch.index.query.QueryBuilders; +import org.opensearch.index.query.TermQueryBuilder; import org.opensearch.search.SearchHit; +import org.opensearch.search.aggregations.AggregationBuilders; +import org.opensearch.search.aggregations.bucket.filter.FilterAggregationBuilder; +import org.opensearch.search.aggregations.bucket.filter.ParsedFilter; +import org.opensearch.search.aggregations.bucket.terms.ParsedStringTerms; +import org.opensearch.search.aggregations.bucket.terms.TermsAggregationBuilder; +import org.opensearch.search.aggregations.metrics.ParsedTopHits; +import org.opensearch.search.aggregations.metrics.TopHitsAggregationBuilder; import org.opensearch.search.builder.SearchSourceBuilder; import org.opensearch.search.rescore.QueryRescorerBuilder; +import org.opensearch.search.sort.SortOrder; /** A search DAO for Elasticsearch backend. 
*/ @Slf4j @@ -89,7 +97,13 @@ public class ESGraphQueryDAO { static final String UPDATED_ON = "updatedOn"; static final String UPDATED_ACTOR = "updatedActor"; static final String PROPERTIES = "properties"; + static final String SCORE_FIELD = "_score"; static final String UI = "UI"; + static final String FILTER_BY_SOURCE_RELATIONSHIP = "filter_by_source_relationship"; + static final String FILTER_BY_DESTINATION_RELATIONSHIP = "filter_by_destination_relationship"; + static final String GROUP_BY_SOURCE_AGG = "group_by_source"; + static final String GROUP_BY_DESTINATION_AGG = "group_by_destination"; + static final String TOP_DOCUMENTS_AGG = "top_documents"; @Nonnull public static void addFilterToQueryBuilder( @@ -118,15 +132,7 @@ private SearchResponse executeLineageSearchQuery( @Nonnull final QueryBuilder query, final int offset, final int count) { SearchRequest searchRequest = new SearchRequest(); - SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder(); - - searchSourceBuilder.from(offset); - searchSourceBuilder.size(count); - - searchSourceBuilder.query(query); - if (graphQueryConfiguration.isBoostViaNodes()) { - addViaNodeBoostQuery(searchSourceBuilder); - } + SearchSourceBuilder searchSourceBuilder = sharedSourceBuilder(query, offset, count); searchRequest.source(searchSourceBuilder); @@ -141,24 +147,77 @@ private SearchResponse executeLineageSearchQuery( } } - private SearchResponse executeLineageSearchQuery( - @Nonnull final QueryBuilder query, - @Nullable Object[] sort, - @Nullable String pitId, - @Nonnull String keepAlive, - final int count) { - SearchRequest searchRequest = new SearchRequest(); - + private SearchSourceBuilder sharedSourceBuilder( + @Nonnull final QueryBuilder query, final int offset, final int count) { SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder(); - ESUtils.setSearchAfter(searchSourceBuilder, sort, pitId, keepAlive); + searchSourceBuilder.from(offset); searchSourceBuilder.size(count); + searchSourceBuilder.query(query); + if (graphQueryConfiguration.isBoostViaNodes()) { + addViaNodeBoostQuery(searchSourceBuilder); + } + return searchSourceBuilder; + } + private SearchResponse executeGroupByLineageSearchQuery( + @Nonnull final QueryBuilder query, + final int offset, + final int count, + final Set> validEdges) { + SearchRequest searchRequest = new SearchRequest(); + + SearchSourceBuilder searchSourceBuilder = sharedSourceBuilder(query, offset, 0); + + // We have to group by both Source AND Destination because edge types may go in different + // directions for lineage + // set up filters for each relationship type in the correct direction to limit buckets + BoolQueryBuilder sourceFilterQuery = QueryBuilders.boolQuery(); + sourceFilterQuery.minimumShouldMatch(1); + validEdges.stream() + .filter(pair -> RelationshipDirection.OUTGOING.equals(pair.getValue().getDirection())) + .forEach(pair -> sourceFilterQuery.should(getAggregationFilter(pair))); + + BoolQueryBuilder destFilterQuery = QueryBuilders.boolQuery(); + destFilterQuery.minimumShouldMatch(1); + validEdges.stream() + .filter(pair -> RelationshipDirection.INCOMING.equals(pair.getValue().getDirection())) + .forEach(pair -> destFilterQuery.should(getAggregationFilter(pair))); + + FilterAggregationBuilder sourceRelationshipTypeFilters = + AggregationBuilders.filter(FILTER_BY_SOURCE_RELATIONSHIP, sourceFilterQuery); + FilterAggregationBuilder destRelationshipTypeFilters = + AggregationBuilders.filter(FILTER_BY_DESTINATION_RELATIONSHIP, destFilterQuery); + TermsAggregationBuilder 
sourceAgg = + AggregationBuilders.terms(GROUP_BY_SOURCE_AGG) + .field(SOURCE + ".urn") + .size( + graphQueryConfiguration + .getBatchSize()); // Number of buckets can be up to batch size per query for + // each + + TermsAggregationBuilder destAgg = + AggregationBuilders.terms(GROUP_BY_DESTINATION_AGG) + .field(DESTINATION + ".urn") + .size(graphQueryConfiguration.getBatchSize()); + + TopHitsAggregationBuilder topHitsAgg = + AggregationBuilders.topHits(TOP_DOCUMENTS_AGG) + .size(count) + .sort(SCORE_FIELD, SortOrder.DESC); + sourceAgg.subAggregation(topHitsAgg); + destAgg.subAggregation(topHitsAgg); + + sourceRelationshipTypeFilters.subAggregation(sourceAgg); + destRelationshipTypeFilters.subAggregation(destAgg); + searchSourceBuilder.aggregation(sourceRelationshipTypeFilters); + searchSourceBuilder.aggregation(destRelationshipTypeFilters); searchRequest.source(searchSourceBuilder); searchRequest.indices(indexConvention.getIndexName(INDEX_NAME)); - try (Timer.Context ignored = MetricUtils.timer(this.getClass(), "esQuery").time()) { + try (Timer.Context ignored = + MetricUtils.timer(this.getClass(), "esLineageGroupByQuery").time()) { MetricUtils.counter(this.getClass(), SEARCH_EXECUTIONS_METRIC).inc(); return client.search(searchRequest, RequestOptions.DEFAULT); } catch (Exception e) { @@ -167,6 +226,21 @@ private SearchResponse executeLineageSearchQuery( } } + private BoolQueryBuilder getAggregationFilter(Pair pair) { + BoolQueryBuilder subFilter = QueryBuilders.boolQuery(); + TermQueryBuilder relationshipTypeTerm = + QueryBuilders.termQuery(RELATIONSHIP_TYPE, pair.getValue().getType()); + subFilter.must(relationshipTypeTerm); + TermQueryBuilder sourceTypeTerm = + QueryBuilders.termQuery(SOURCE + ".entityType", pair.getKey()); + subFilter.must(sourceTypeTerm); + TermQueryBuilder destinationTypeTerm = + QueryBuilders.termQuery( + DESTINATION + ".entityType", pair.getValue().getOpposingEntityType()); + subFilter.must(destinationTypeTerm); + return subFilter; + } + public SearchResponse getSearchResponse( @Nullable final List sourceTypes, @Nonnull final Filter sourceEntityFilter, @@ -412,16 +486,7 @@ private Stream processOneHopLineage( intermediateStream = Stream.concat(intermediateStream, ignoreAsHopUrns); } } - // We limit after adding all the relationships at the previous level so each hop is fully - // returned, - // but we only explore a limited number of entities per hop, sort to make the truncation - // consistent - if (lineageFlags.getEntitiesExploredPerHopLimit() != null) { - intermediateStream = - intermediateStream - .sorted(Comparator.comparing(Urn::toString)) - .limit(lineageFlags.getEntitiesExploredPerHopLimit()); - } + if (remainingHops > 0) { // If there are hops remaining, we expect to explore everything getting passed back to the // loop, barring a timeout @@ -537,11 +602,6 @@ private List getLineageRelationships( Collectors.toMap( Function.identity(), entityType -> lineageRegistry.getLineageRelationships(entityType, direction))); - - QueryBuilder finalQuery = - getLineageQuery(urnsPerEntityType, edgesPerEntityType, graphFilters, lineageFlags); - SearchResponse response = - executeLineageSearchQuery(finalQuery, 0, graphQueryConfiguration.getMaxResult()); Set entityUrnSet = new HashSet<>(entityUrns); // Get all valid edges given the set of urns to hop from Set> validEdges = @@ -550,16 +610,37 @@ private List getLineageRelationships( entry -> entry.getValue().stream().map(edgeInfo -> Pair.of(entry.getKey(), edgeInfo))) .collect(Collectors.toSet()); - return 
extractRelationships( - entityUrnSet, - response, - validEdges, - visitedEntities, - viaEntities, - numHops, - remainingHops, - existingPaths, - exploreMultiplePaths); + + QueryBuilder finalQuery = + getLineageQuery(urnsPerEntityType, edgesPerEntityType, graphFilters, lineageFlags); + SearchResponse response; + if (lineageFlags != null && lineageFlags.getEntitiesExploredPerHopLimit() != null) { + response = + executeGroupByLineageSearchQuery( + finalQuery, 0, lineageFlags.getEntitiesExploredPerHopLimit(), validEdges); + return extractRelationshipsGroupByQuery( + entityUrnSet, + response, + validEdges, + visitedEntities, + viaEntities, + numHops, + remainingHops, + existingPaths, + exploreMultiplePaths); + } else { + response = executeLineageSearchQuery(finalQuery, 0, graphQueryConfiguration.getMaxResult()); + return extractRelationships( + entityUrnSet, + response, + validEdges, + visitedEntities, + viaEntities, + numHops, + remainingHops, + existingPaths, + exploreMultiplePaths); + } } @VisibleForTesting @@ -756,158 +837,20 @@ private static List extractRelationships( log.debug("numHits: {}, numHops {}, remainingHops {}", hits.length, numHops, remainingHops); int index = -1; for (SearchHit hit : hits) { - index++; - final Map document = hit.getSourceAsMap(); - final Urn sourceUrn = - UrnUtils.getUrn(((Map) document.get(SOURCE)).get("urn").toString()); - final Urn destinationUrn = - UrnUtils.getUrn( - ((Map) document.get(DESTINATION)).get("urn").toString()); - final String type = document.get(RELATIONSHIP_TYPE).toString(); - if (sourceUrn.equals(destinationUrn)) { - log.debug("Skipping a self-edge of type {} on {}", type, sourceUrn); - continue; - } - final Number createdOnNumber = (Number) document.getOrDefault(CREATED_ON, null); - final Long createdOn = createdOnNumber != null ? createdOnNumber.longValue() : null; - final Number updatedOnNumber = (Number) document.getOrDefault(UPDATED_ON, null); - final Long updatedOn = updatedOnNumber != null ? updatedOnNumber.longValue() : null; - final String createdActorString = (String) document.getOrDefault(CREATED_ACTOR, null); - final Urn createdActor = - createdActorString == null ? null : UrnUtils.getUrn(createdActorString); - final String updatedActorString = (String) document.getOrDefault(UPDATED_ACTOR, null); - final Urn updatedActor = - updatedActorString == null ? 
null : UrnUtils.getUrn(updatedActorString); - final Map properties; - if (document.containsKey(PROPERTIES) && document.get(PROPERTIES) instanceof Map) { - properties = (Map) document.get(PROPERTIES); - } else { - properties = Collections.emptyMap(); - } - boolean isManual = properties.containsKey(SOURCE) && properties.get(SOURCE).equals("UI"); - Urn viaEntity = null; - String viaContent = (String) document.getOrDefault(EDGE_FIELD_VIA, null); - if (viaContent != null) { - try { - viaEntity = Urn.createFromString(viaContent); - } catch (Exception e) { - log.warn( - "Failed to parse urn from via entity {}, will swallow exception and continue...", - viaContent); - } - } - log.debug("{}: viaEntity {}", index, viaEntity); - - // Potential outgoing edge - if (entityUrns.contains(sourceUrn)) { - log.debug("{}: entity urns contains source urn {}", index, sourceUrn); - // Skip if already visited or if we're exploring multiple paths - // Skip if edge is not a valid outgoing edge - if ((exploreMultiplePaths || !visitedEntities.contains(destinationUrn)) - && validEdges.contains( - Pair.of( - sourceUrn.getEntityType(), - new EdgeInfo( - type, - RelationshipDirection.OUTGOING, - destinationUrn.getEntityType().toLowerCase())))) { - - if (visitedEntities.contains(destinationUrn)) { - log.debug("Found a second path to the same urn {}", destinationUrn); - } - // Append the edge to a set of unique graph paths. - if (addEdgeToPaths(existingPaths, sourceUrn, viaEntity, destinationUrn)) { - final LineageRelationship relationship = - createLineageRelationship( - type, - destinationUrn, - numHops, - existingPaths.getOrDefault(destinationUrn, new UrnArrayArray()), - // Fetch the paths to the next level entity. - createdOn, - createdActor, - updatedOn, - updatedActor, - isManual); - log.debug("Adding relationship {} to urn {}", relationship, destinationUrn); - lineageRelationshipMap.put(relationship.getEntity(), relationship); - if ((viaEntity != null) && (!viaEntities.contains(viaEntity))) { - UrnArrayArray viaPaths = getViaPaths(existingPaths, destinationUrn, viaEntity); - LineageRelationship viaRelationship = - createLineageRelationship( - type, - viaEntity, - numHops, - viaPaths, - createdOn, - createdActor, - updatedOn, - updatedActor, - isManual); - viaEntities.add(viaEntity); - lineageRelationshipMap.put(viaRelationship.getEntity(), viaRelationship); - log.debug("Adding via entity {} with paths {}", viaEntity, viaPaths); - } - } - visitedEntities.add(destinationUrn); - } - } - - // Potential incoming edge - if (entityUrns.contains(destinationUrn)) { - // Skip if already visited or if we're exploring multiple paths - // Skip if edge is not a valid outgoing edge - log.debug("entity urns contains destination urn {}", destinationUrn); - if ((exploreMultiplePaths || !visitedEntities.contains(sourceUrn)) - && validEdges.contains( - Pair.of( - destinationUrn.getEntityType(), - new EdgeInfo( - type, - RelationshipDirection.INCOMING, - sourceUrn.getEntityType().toLowerCase())))) { - if (visitedEntities.contains(sourceUrn)) { - log.debug("Found a second path to the same urn {}", sourceUrn); - } - visitedEntities.add(sourceUrn); - // Append the edge to a set of unique graph paths. 
- if (addEdgeToPaths(existingPaths, destinationUrn, viaEntity, sourceUrn)) { - log.debug("Adding incoming edge: {}, {}, {}", destinationUrn, viaEntity, sourceUrn); - final LineageRelationship relationship = - createLineageRelationship( - type, - sourceUrn, - numHops, - existingPaths.getOrDefault(sourceUrn, new UrnArrayArray()), - // Fetch the paths to the next level entity. - createdOn, - createdActor, - updatedOn, - updatedActor, - isManual); - log.debug("Adding relationship {} to urn {}", relationship, sourceUrn); - lineageRelationshipMap.put(relationship.getEntity(), relationship); - if ((viaEntity != null) && (!viaEntities.contains(viaEntity))) { - UrnArrayArray viaPaths = getViaPaths(existingPaths, sourceUrn, viaEntity); - viaEntities.add(viaEntity); - LineageRelationship viaRelationship = - createLineageRelationship( - type, - viaEntity, - numHops, - viaPaths, - createdOn, - createdActor, - updatedOn, - updatedActor, - isManual); - lineageRelationshipMap.put(viaRelationship.getEntity(), viaRelationship); - log.debug("Adding via relationship {} to urn {}", viaRelationship, viaEntity); - } - } - } - } + processSearchHit( + hit, + entityUrns, + index, + exploreMultiplePaths, + visitedEntities, + validEdges, + existingPaths, + numHops, + false, + lineageRelationshipMap, + viaEntities); } + List result = new ArrayList<>(lineageRelationshipMap.values()); log.debug("Number of lineage relationships in list: {}", result.size()); return result; @@ -919,6 +862,261 @@ private static List extractRelationships( } } + private static void processSearchHit( + SearchHit hit, + Set entityUrns, + int index, + boolean exploreMultiplePaths, + Set visitedEntities, + Set> validEdges, + Map existingPaths, + int numHops, + boolean truncatedChildren, + Map lineageRelationshipMap, + Set viaEntities) { + index++; + // Extract fields + final Map document = hit.getSourceAsMap(); + final Urn sourceUrn = + UrnUtils.getUrn(((Map) document.get(SOURCE)).get("urn").toString()); + final Urn destinationUrn = + UrnUtils.getUrn(((Map) document.get(DESTINATION)).get("urn").toString()); + final String type = document.get(RELATIONSHIP_TYPE).toString(); + if (sourceUrn.equals(destinationUrn)) { + log.debug("Skipping a self-edge of type {} on {}", type, sourceUrn); + return; + } + final Number createdOnNumber = (Number) document.getOrDefault(CREATED_ON, null); + final Long createdOn = createdOnNumber != null ? createdOnNumber.longValue() : null; + final Number updatedOnNumber = (Number) document.getOrDefault(UPDATED_ON, null); + final Long updatedOn = updatedOnNumber != null ? updatedOnNumber.longValue() : null; + final String createdActorString = (String) document.getOrDefault(CREATED_ACTOR, null); + final Urn createdActor = + createdActorString == null ? null : UrnUtils.getUrn(createdActorString); + final String updatedActorString = (String) document.getOrDefault(UPDATED_ACTOR, null); + final Urn updatedActor = + updatedActorString == null ? 
null : UrnUtils.getUrn(updatedActorString); + final Map properties; + if (document.containsKey(PROPERTIES) && document.get(PROPERTIES) instanceof Map) { + properties = (Map) document.get(PROPERTIES); + } else { + properties = Collections.emptyMap(); + } + boolean isManual = properties.containsKey(SOURCE) && properties.get(SOURCE).equals("UI"); + Urn viaEntity = null; + String viaContent = (String) document.getOrDefault(EDGE_FIELD_VIA, null); + if (viaContent != null) { + try { + viaEntity = Urn.createFromString(viaContent); + } catch (Exception e) { + log.warn( + "Failed to parse urn from via entity {}, will swallow exception and continue...", + viaContent); + } + } + log.debug("{}: viaEntity {}", index, viaEntity); + + // Potential outgoing edge + if (entityUrns.contains(sourceUrn)) { + processOutgoingEdge( + entityUrns, + sourceUrn, + index, + exploreMultiplePaths, + visitedEntities, + destinationUrn, + validEdges, + type, + existingPaths, + viaEntity, + numHops, + createdOn, + createdActor, + updatedOn, + updatedActor, + isManual, + truncatedChildren, + lineageRelationshipMap, + viaEntities); + } + + // Potential incoming edge + if (entityUrns.contains(destinationUrn)) { + processIncomingEdge( + entityUrns, + sourceUrn, + exploreMultiplePaths, + visitedEntities, + destinationUrn, + validEdges, + type, + existingPaths, + viaEntity, + numHops, + createdOn, + createdActor, + updatedOn, + updatedActor, + isManual, + truncatedChildren, + lineageRelationshipMap, + viaEntities); + } + } + + private static void processOutgoingEdge( + Set entityUrns, + Urn sourceUrn, + int index, + boolean exploreMultiplePaths, + Set visitedEntities, + Urn destinationUrn, + Set> validEdges, + String type, + Map existingPaths, + Urn viaEntity, + int numHops, + Long createdOn, + Urn createdActor, + Long updatedOn, + Urn updatedActor, + boolean isManual, + boolean truncatedChildren, + Map lineageRelationshipMap, + Set viaEntities) { + if (entityUrns.contains(sourceUrn)) { + log.debug("{}: entity urns contains source urn {}", index, sourceUrn); + // Skip if already visited or if we're exploring multiple paths + // Skip if edge is not a valid outgoing edge + if ((exploreMultiplePaths || !visitedEntities.contains(destinationUrn)) + && validEdges.contains( + Pair.of( + sourceUrn.getEntityType(), + new EdgeInfo( + type, + RelationshipDirection.OUTGOING, + destinationUrn.getEntityType().toLowerCase())))) { + + if (visitedEntities.contains(destinationUrn)) { + log.debug("Found a second path to the same urn {}", destinationUrn); + } + // Append the edge to a set of unique graph paths. + if (addEdgeToPaths(existingPaths, sourceUrn, viaEntity, destinationUrn)) { + final LineageRelationship relationship = + createLineageRelationship( + type, + destinationUrn, + numHops, + existingPaths.getOrDefault(destinationUrn, new UrnArrayArray()), + // Fetch the paths to the next level entity. 
+ createdOn, + createdActor, + updatedOn, + updatedActor, + isManual, + truncatedChildren); + log.debug("Adding relationship {} to urn {}", relationship, destinationUrn); + lineageRelationshipMap.put(relationship.getEntity(), relationship); + if ((viaEntity != null) && (!viaEntities.contains(viaEntity))) { + UrnArrayArray viaPaths = getViaPaths(existingPaths, destinationUrn, viaEntity); + LineageRelationship viaRelationship = + createLineageRelationship( + type, + viaEntity, + numHops, + viaPaths, + createdOn, + createdActor, + updatedOn, + updatedActor, + isManual, + truncatedChildren); + viaEntities.add(viaEntity); + lineageRelationshipMap.put(viaRelationship.getEntity(), viaRelationship); + log.debug("Adding via entity {} with paths {}", viaEntity, viaPaths); + } + } + visitedEntities.add(destinationUrn); + } + } + } + + private static void processIncomingEdge( + Set entityUrns, + Urn sourceUrn, + boolean exploreMultiplePaths, + Set visitedEntities, + Urn destinationUrn, + Set> validEdges, + String type, + Map existingPaths, + Urn viaEntity, + int numHops, + Long createdOn, + Urn createdActor, + Long updatedOn, + Urn updatedActor, + boolean isManual, + boolean truncatedChildren, + Map lineageRelationshipMap, + Set viaEntities) { + if (entityUrns.contains(destinationUrn)) { + // Skip if already visited or if we're exploring multiple paths + // Skip if edge is not a valid outgoing edge + log.debug("entity urns contains destination urn {}", destinationUrn); + if ((exploreMultiplePaths || !visitedEntities.contains(sourceUrn)) + && validEdges.contains( + Pair.of( + destinationUrn.getEntityType(), + new EdgeInfo( + type, + RelationshipDirection.INCOMING, + sourceUrn.getEntityType().toLowerCase())))) { + if (visitedEntities.contains(sourceUrn)) { + log.debug("Found a second path to the same urn {}", sourceUrn); + } + visitedEntities.add(sourceUrn); + // Append the edge to a set of unique graph paths. + if (addEdgeToPaths(existingPaths, destinationUrn, viaEntity, sourceUrn)) { + log.debug("Adding incoming edge: {}, {}, {}", destinationUrn, viaEntity, sourceUrn); + final LineageRelationship relationship = + createLineageRelationship( + type, + sourceUrn, + numHops, + existingPaths.getOrDefault(sourceUrn, new UrnArrayArray()), + // Fetch the paths to the next level entity. 
+ createdOn, + createdActor, + updatedOn, + updatedActor, + isManual, + truncatedChildren); + log.debug("Adding relationship {} to urn {}", relationship, sourceUrn); + lineageRelationshipMap.put(relationship.getEntity(), relationship); + if ((viaEntity != null) && (!viaEntities.contains(viaEntity))) { + UrnArrayArray viaPaths = getViaPaths(existingPaths, sourceUrn, viaEntity); + viaEntities.add(viaEntity); + LineageRelationship viaRelationship = + createLineageRelationship( + type, + viaEntity, + numHops, + viaPaths, + createdOn, + createdActor, + updatedOn, + updatedActor, + isManual, + truncatedChildren); + lineageRelationshipMap.put(viaRelationship.getEntity(), viaRelationship); + log.debug("Adding via relationship {} to urn {}", viaRelationship, viaEntity); + } + } + } + } + } + private static UrnArrayArray getViaPaths( Map existingPaths, Urn destinationUrn, Urn viaEntity) { UrnArrayArray destinationPaths = @@ -946,7 +1144,8 @@ private static LineageRelationship createLineageRelationship( @Nullable final Urn createdActor, @Nullable final Long updatedOn, @Nullable final Urn updatedActor, - final boolean isManual) { + final boolean isManual, + final boolean truncatedChildren) { final LineageRelationship relationship = new LineageRelationship() .setType(type) @@ -967,9 +1166,88 @@ private static LineageRelationship createLineageRelationship( relationship.setUpdatedActor(updatedActor); } relationship.setIsManual(isManual); + relationship.setTruncatedChildren(truncatedChildren); return relationship; } + @WithSpan + private static List extractRelationshipsGroupByQuery( + @Nonnull Set entityUrns, + @Nonnull SearchResponse searchResponse, + Set> validEdges, + Set visitedEntities, + Set viaEntities, + int numHops, + int remainingHops, + Map existingPaths, + boolean exploreMultiplePaths) { + try { + Map lineageRelationshipMap = new HashMap<>(); + ParsedFilter sourceFilterAgg = + searchResponse.getAggregations().get(FILTER_BY_SOURCE_RELATIONSHIP); + ParsedStringTerms sourceTermsAgg = sourceFilterAgg.getAggregations().get(GROUP_BY_SOURCE_AGG); + SearchHit[] hits = new SearchHit[0]; + List sourceBuckets = + (List) sourceTermsAgg.getBuckets(); + int index = -1; + for (ParsedStringTerms.ParsedBucket bucket : sourceBuckets) { + ParsedTopHits topHits = bucket.getAggregations().get(TOP_DOCUMENTS_AGG); + SearchHit[] topHitsArray = topHits.getHits().getHits(); + boolean truncatedChildren = topHits.getHits().getTotalHits().value > topHitsArray.length; + for (SearchHit hit : topHitsArray) { + processSearchHit( + hit, + entityUrns, + index, + exploreMultiplePaths, + visitedEntities, + validEdges, + existingPaths, + numHops, + truncatedChildren, + lineageRelationshipMap, + viaEntities); + } + } + + ParsedFilter destFilterAgg = + searchResponse.getAggregations().get(FILTER_BY_DESTINATION_RELATIONSHIP); + ParsedStringTerms destTermsAgg = + destFilterAgg.getAggregations().get(GROUP_BY_DESTINATION_AGG); + List destBuckets = + (List) destTermsAgg.getBuckets(); + for (ParsedStringTerms.ParsedBucket bucket : destBuckets) { + ParsedTopHits topHits = bucket.getAggregations().get(TOP_DOCUMENTS_AGG); + SearchHit[] topHitsArray = topHits.getHits().getHits(); + boolean truncatedChildren = topHits.getHits().getTotalHits().value > topHitsArray.length; + for (SearchHit hit : topHitsArray) { + processSearchHit( + hit, + entityUrns, + index, + exploreMultiplePaths, + visitedEntities, + validEdges, + existingPaths, + numHops, + truncatedChildren, + lineageRelationshipMap, + viaEntities); + } + } + log.debug("numHits: {}, 
numHops {}, remainingHops {}", hits.length, numHops, remainingHops); + + List result = new ArrayList<>(lineageRelationshipMap.values()); + log.debug("Number of lineage relationships in list: {}", result.size()); + return result; + } catch (Exception e) { + // This exception handler merely exists to log the exception at an appropriate point and + // rethrow + log.error("Caught exception", e); + throw e; + } + } + private static BoolQueryBuilder getOutGoingEdgeQuery( @Nonnull List urns, @Nonnull List outgoingEdges, diff --git a/metadata-io/src/main/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphService.java b/metadata-io/src/main/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphService.java index 70b30f27553c7..513672b071c17 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphService.java @@ -942,29 +942,27 @@ public RelatedEntitiesScrollResult scrollRelatedEntities( final String edgeCriteria = relationshipFilterToCriteria(relationshipFilter); final RelationshipDirection relationshipDirection = relationshipFilter.getDirection(); - String srcNodeLabel = ""; + + String matchTemplate = "MATCH (src %s)-[r%s %s]-(dest %s)%s"; + if (relationshipDirection == RelationshipDirection.INCOMING) { + matchTemplate = "MATCH (src %s)<-[r%s %s]-(dest %s)%s"; + } else if (relationshipDirection == RelationshipDirection.OUTGOING) { + matchTemplate = "MATCH (src %s)-[r%s %s]->(dest %s)%s"; + } + + String srcNodeLabel = StringUtils.EMPTY; // Create a URN from the String. Only proceed if srcCriteria is not null or empty - if (srcCriteria != null && !srcCriteria.isEmpty()) { + if (StringUtils.isNotEmpty(srcCriteria)) { final String urnValue = sourceEntityFilter.getOr().get(0).getAnd().get(0).getValue().toString(); try { final Urn urn = Urn.createFromString(urnValue); srcNodeLabel = urn.getEntityType(); + matchTemplate = matchTemplate.replace("(src ", "(src:%s "); } catch (URISyntaxException e) { log.error("Failed to parse URN: {} ", urnValue, e); } } - String matchTemplate = "MATCH (src:%s %s)-[r%s %s]-(dest %s)%s"; - if (relationshipDirection == RelationshipDirection.INCOMING) { - matchTemplate = "MATCH (src:%s %s)<-[r%s %s]-(dest %s)%s"; - } else if (relationshipDirection == RelationshipDirection.OUTGOING) { - matchTemplate = "MATCH (src:%s %s)-[r%s %s]->(dest %s)%s"; - } - - final String returnNodes = - String.format( - "RETURN dest, src, type(r)"); // Return both related entity and the relationship type. - final String returnCount = "RETURN count(*)"; // For getting the total results. 
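An aside on the Neo4j change above: the MATCH template now starts without a hard-coded source label, and the label placeholder is only spliced in when the source filter yields a parseable URN. Below is a minimal, self-contained Java sketch of how the two template variants are expected to expand; the criteria strings are hypothetical stand-ins, not the values the service actually builds.

// Illustrative only: expansion of the revised matchTemplate with hypothetical criteria strings.
public class MatchTemplateSketch {
  public static void main(String[] args) {
    String srcCriteria = "{urn: 'urn:li:dataset:abc'}"; // hypothetical rendered source criteria
    String relationshipTypeFilter = ":DownstreamOf";    // hypothetical relationship type filter
    String edgeCriteria = "";
    String destCriteria = "";
    String whereClause = "";

    // OUTGOING variant of the new template, with no source label baked in.
    String matchTemplate = "MATCH (src %s)-[r%s %s]->(dest %s)%s";

    // No resolvable source URN: format with five arguments.
    System.out.println(
        String.format(
            matchTemplate, srcCriteria, relationshipTypeFilter, edgeCriteria, destCriteria, whereClause));
    // Prints: MATCH (src {urn: 'urn:li:dataset:abc'})-[r:DownstreamOf ]->(dest )

    // Resolvable source URN: splice in the label placeholder and pass the entity type first.
    String labeled = matchTemplate.replace("(src ", "(src:%s ");
    System.out.println(
        String.format(
            labeled, "dataset", srcCriteria, relationshipTypeFilter, edgeCriteria, destCriteria, whereClause));
    // Prints: MATCH (src:dataset {urn: 'urn:li:dataset:abc'})-[r:DownstreamOf ]->(dest )
  }
}

The branch that follows in this hunk chooses between those two String.format calls based on whether srcNodeLabel was resolved, avoiding the empty (src: ...) label that the old single template would have produced when no source URN was available.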
String relationshipTypeFilter = ""; if (!relationshipTypes.isEmpty()) { @@ -974,18 +972,34 @@ public RelatedEntitiesScrollResult scrollRelatedEntities( String whereClause = computeEntityTypeWhereClause(sourceTypes, destinationTypes); // Build Statement strings - String baseStatementString = - String.format( - matchTemplate, - srcNodeLabel, - srcCriteria, - relationshipTypeFilter, - edgeCriteria, - destCriteria, - whereClause); + String baseStatementString; + if (StringUtils.isNotEmpty(srcNodeLabel)) { + baseStatementString = + String.format( + matchTemplate, + srcNodeLabel, + srcCriteria, + relationshipTypeFilter, + edgeCriteria, + destCriteria, + whereClause); + } else { + baseStatementString = + String.format( + matchTemplate, + srcCriteria, + relationshipTypeFilter, + edgeCriteria, + destCriteria, + whereClause); + } log.info(baseStatementString); + final String returnNodes = + "RETURN dest, src, type(r)"; // Return both related entity and the relationship type. + final String returnCount = "RETURN count(*)"; // For getting the total results. + final String resultStatementString = String.format("%s %s SKIP $offset LIMIT $count", baseStatementString, returnNodes); final String countStatementString = String.format("%s %s", baseStatementString, returnCount); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/LineageSearchService.java b/metadata-io/src/main/java/com/linkedin/metadata/search/LineageSearchService.java index c06457768d725..95c8eb13beb93 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/LineageSearchService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/LineageSearchService.java @@ -745,6 +745,7 @@ private LineageSearchEntity buildLineageSearchEntity( entity.setDegrees(lineageRelationship.getDegrees()); } entity.setExplored(Boolean.TRUE.equals(lineageRelationship.isExplored())); + entity.setTruncatedChildren(Boolean.TRUE.equals(lineageRelationship.isTruncatedChildren())); entity.setIgnoredAsHop(Boolean.TRUE.equals(lineageRelationship.isIgnoredAsHop())); } return entity; diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/AutocompleteRequestHandler.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/AutocompleteRequestHandler.java index 3768fbdb01884..37a7e5adde2dc 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/AutocompleteRequestHandler.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/AutocompleteRequestHandler.java @@ -37,10 +37,7 @@ import lombok.extern.slf4j.Slf4j; import org.opensearch.action.search.SearchRequest; import org.opensearch.action.search.SearchResponse; -import org.opensearch.index.query.BoolQueryBuilder; -import org.opensearch.index.query.MultiMatchQueryBuilder; -import org.opensearch.index.query.QueryBuilder; -import org.opensearch.index.query.QueryBuilders; +import org.opensearch.index.query.*; import org.opensearch.index.query.functionscore.FunctionScoreQueryBuilder; import org.opensearch.search.SearchHit; import org.opensearch.search.builder.SearchSourceBuilder; @@ -219,10 +216,9 @@ private static BoolQueryBuilder defaultQuery( autocompleteQueryBuilder.field(fieldName + ".ngram._3gram"); autocompleteQueryBuilder.field(fieldName + ".ngram._4gram"); } - + autocompleteQueryBuilder.field(fieldName + ".delimited"); finalQuery.should(QueryBuilders.matchPhrasePrefixQuery(fieldName + ".delimited", query)); }); - 
finalQuery.should(autocompleteQueryBuilder); return finalQuery; } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/service/BusinessAttributeUpdateHookService.java b/metadata-io/src/main/java/com/linkedin/metadata/service/BusinessAttributeUpdateHookService.java index b1a74b9c09d35..29389f2e66558 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/service/BusinessAttributeUpdateHookService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/service/BusinessAttributeUpdateHookService.java @@ -22,12 +22,13 @@ import com.linkedin.mxe.PlatformEvent; import com.linkedin.platform.event.v1.EntityChangeEvent; import io.datahubproject.metadata.context.OperationContext; -import java.util.Arrays; -import java.util.Map; -import java.util.Set; -import java.util.function.BiConsumer; +import java.util.*; +import java.util.concurrent.*; import java.util.stream.Collectors; +import lombok.Data; import lombok.extern.slf4j.Slf4j; +import org.apache.commons.lang.time.StopWatch; +import org.apache.commons.lang3.StringUtils; import org.springframework.beans.factory.annotation.Value; import org.springframework.lang.NonNull; import org.springframework.lang.Nullable; @@ -41,18 +42,25 @@ public class BusinessAttributeUpdateHookService { private final UpdateIndicesService updateIndicesService; private final int relatedEntitiesCount; private final int getRelatedEntitiesBatchSize; - + private ExecutorService executor; public static final String TAG = "TAG"; public static final String GLOSSARY_TERM = "GLOSSARY_TERM"; public static final String DOCUMENTATION = "DOCUMENTATION"; + private final int threadCount; + private final int AWAIT_TERMINATION_TIME = 10; + private final int keepAlive; public BusinessAttributeUpdateHookService( @NonNull UpdateIndicesService updateIndicesService, @NonNull @Value("${businessAttribute.fetchRelatedEntitiesCount}") int relatedEntitiesCount, - @NonNull @Value("${businessAttribute.fetchRelatedEntitiesBatchSize}") int relatedBatchSize) { + @NonNull @Value("${businessAttribute.fetchRelatedEntitiesBatchSize}") int relatedBatchSize, + @NonNull @Value("${businessAttribute.threadCount}") int threadCount, + @NonNull @Value("${businessAttribute.keepAliveTime}") int keepAlive) { this.updateIndicesService = updateIndicesService; this.relatedEntitiesCount = relatedEntitiesCount; this.getRelatedEntitiesBatchSize = relatedBatchSize; + this.threadCount = threadCount; + this.keepAlive = keepAlive; } public void handleChangeEvent( @@ -61,38 +69,51 @@ public void handleChangeEvent( GenericRecordUtils.deserializePayload( event.getPayload().getValue(), EntityChangeEvent.class); + executor = businessAttributePropagationWorkerPool(threadCount, keepAlive); + if (!entityChangeEvent.getEntityType().equals(Constants.BUSINESS_ATTRIBUTE_ENTITY_NAME)) { - log.info("Skipping MCL event for entity:" + entityChangeEvent.getEntityType()); return; } final Set businessAttributeCategories = ImmutableSet.of(TAG, GLOSSARY_TERM, DOCUMENTATION); if (!businessAttributeCategories.contains(entityChangeEvent.getCategory())) { - log.info("Skipping MCL event for category: " + entityChangeEvent.getCategory()); return; } Urn urn = entityChangeEvent.getEntityUrn(); - log.info("Business Attribute update hook invoked for urn :" + urn); + log.info("Business Attribute update hook invoked for urn : {}", urn); fetchRelatedEntities( opContext, urn, - (batch, batchNumber) -> processBatch(opContext, batch, batchNumber), + (batch, batchNumber, entityKey) -> processBatch(opContext, batch, batchNumber, entityKey), null, 0, 1); 
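For readability, here is a condensed, hypothetical sketch of the submit/collect/shutdown lifecycle the hook adopts in the hunks around this point; the pool sizing, batch contents, and logging are simplified placeholders rather than the actual implementation.

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

public class PropagationLifecycleSketch {
  public static void main(String[] args) throws InterruptedException {
    // Fixed-size worker pool, analogous to businessAttributePropagationWorkerPool.
    ExecutorService executor =
        new ThreadPoolExecutor(4, 4, 60, TimeUnit.SECONDS, new LinkedBlockingQueue<>());

    // Each scrolled batch becomes a Callable whose Future is collected for later inspection.
    List<Future<String>> futures = new ArrayList<>();
    for (int batch = 1; batch <= 3; batch++) {
      final int batchNumber = batch;
      futures.add(executor.submit(() -> "Batch " + batchNumber + " completed"));
    }

    // Join every batch so failures surface before the pool is drained.
    for (Future<String> future : futures) {
      try {
        System.out.println(future.get());
      } catch (ExecutionException e) {
        System.err.println("Batch failed: " + e.getCause());
      }
    }

    // Bounded shutdown, mirroring the shutdown/awaitTermination sequence in the hook.
    executor.shutdown();
    if (!executor.awaitTermination(10, TimeUnit.MINUTES)) {
      executor.shutdownNow();
    }
  }
}

Note that in the surrounding hunk the pool is created and shut down within handleChangeEvent itself, so the executor's lifetime is scoped to a single platform event.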
+ + executor.shutdown(); + try { + if (!executor.awaitTermination(AWAIT_TERMINATION_TIME, TimeUnit.MINUTES)) { + executor.shutdownNow(); // Cancel currently executing tasks + if (!executor.awaitTermination(AWAIT_TERMINATION_TIME, TimeUnit.MINUTES)) + log.error("Business Attribute Propagation Executor is not terminating"); + } + } catch (InterruptedException ie) { + executor.shutdownNow(); + } } private void fetchRelatedEntities( @NonNull final OperationContext opContext, @NonNull final Urn urn, - @NonNull final BiConsumer resultConsumer, + @NonNull + final TriFunction> + resultFunction, @Nullable String scrollId, int consumedEntityCount, int batchNumber) { GraphRetriever graph = opContext.getRetrieverContext().get().getGraphRetriever(); - + final ArrayList> futureList = new ArrayList<>(); RelatedEntitiesScrollResult result = graph.scrollRelatedEntities( null, @@ -106,52 +127,143 @@ private void fetchRelatedEntities( getRelatedEntitiesBatchSize, null, null); - resultConsumer.accept(result, batchNumber); + + futureList.add( + executor.submit(resultFunction.apply(result, batchNumber, urn.getEntityKey().toString()))); + consumedEntityCount = consumedEntityCount + result.getEntities().size(); if (result.getScrollId() != null && consumedEntityCount < relatedEntitiesCount) { batchNumber = batchNumber + 1; fetchRelatedEntities( - opContext, urn, resultConsumer, result.getScrollId(), consumedEntityCount, batchNumber); + opContext, urn, resultFunction, result.getScrollId(), consumedEntityCount, batchNumber); + } + + for (Future future : futureList) { + try { + ExecutionResult futureResult = future.get(); + if (futureResult.getException() != null) { + log.error( + "Batch {} for BA:{} is failed with exception", + futureResult.getBatchNumber(), + futureResult.getEntityKey(), + futureResult.getException()); + } else { + log.info(futureResult.getResult()); + } + } catch (InterruptedException | ExecutionException e) { + log.error("Business Attribute Propagation Parallel Processing Exception", e); + } } + futureList.clear(); } - private void processBatch( + private Callable processBatch( @NonNull OperationContext opContext, @NonNull RelatedEntitiesScrollResult batch, - int batchNumber) { - AspectRetriever aspectRetriever = opContext.getRetrieverContext().get().getAspectRetriever(); - log.info("BA Update Batch {} started", batchNumber); - Set entityUrns = - batch.getEntities().stream() - .map(RelatedEntity::getUrn) - .map(UrnUtils::getUrn) - .collect(Collectors.toSet()); - - Map> entityAspectMap = - aspectRetriever.getLatestAspectObjects( - entityUrns, Set.of(Constants.BUSINESS_ATTRIBUTE_ASPECT)); - - entityAspectMap.entrySet().stream() - .filter(entry -> entry.getValue().containsKey(Constants.BUSINESS_ATTRIBUTE_ASPECT)) - .forEach( - entry -> { - final Urn entityUrn = entry.getKey(); - final Aspect aspect = entry.getValue().get(Constants.BUSINESS_ATTRIBUTE_ASPECT); - - updateIndicesService.handleChangeEvent( - opContext, - PegasusUtils.constructMCL( - null, - Constants.SCHEMA_FIELD_ENTITY_NAME, - entityUrn, - ChangeType.UPSERT, - Constants.BUSINESS_ATTRIBUTE_ASPECT, - opContext.getAuditStamp(), - new BusinessAttributes(aspect.data()), - null, - null, - null)); - }); - log.info("BA Update Batch {} completed", batchNumber); + int batchNumber, + String entityKey) { + return () -> { + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + AspectRetriever aspectRetriever = opContext.getRetrieverContext().get().getAspectRetriever(); + log.info("Batch {} for BA:{} started", batchNumber, entityKey); + 
ExecutionResult executionResult = new ExecutionResult(); + executionResult.setBatchNumber(batchNumber); + executionResult.setEntityKey(entityKey); + try { + Set entityUrns = + batch.getEntities().stream() + .map(RelatedEntity::getUrn) + .map(UrnUtils::getUrn) + .collect(Collectors.toSet()); + + Map> entityAspectMap = + aspectRetriever.getLatestAspectObjects( + entityUrns, Set.of(Constants.BUSINESS_ATTRIBUTE_ASPECT)); + + entityAspectMap.entrySet().stream() + .filter(entry -> entry.getValue().containsKey(Constants.BUSINESS_ATTRIBUTE_ASPECT)) + .forEach( + entry -> { + final Urn entityUrn = entry.getKey(); + final Aspect aspect = entry.getValue().get(Constants.BUSINESS_ATTRIBUTE_ASPECT); + updateIndicesService.handleChangeEvent( + opContext, + PegasusUtils.constructMCL( + null, + Constants.SCHEMA_FIELD_ENTITY_NAME, + entityUrn, + ChangeType.UPSERT, + Constants.BUSINESS_ATTRIBUTE_ASPECT, + opContext.getAuditStamp(), + new BusinessAttributes(aspect.data()), + null, + null, + null)); + }); + stopWatch.stop(); + String result = + String.format( + "Batch %s for BA:%s is completed in %s", + batchNumber, entityKey, TimeAgo.toDuration(stopWatch.getTime())) + .toString(); + executionResult.setResult(result); + } catch (Exception e) { + executionResult.setException(e); + } + return executionResult; + }; + } + + private ExecutorService businessAttributePropagationWorkerPool(int numThreads, int keepAlive) { + numThreads = numThreads < 0 ? Runtime.getRuntime().availableProcessors() * 2 : numThreads; + return new ThreadPoolExecutor( + numThreads, numThreads, keepAlive, TimeUnit.SECONDS, new LinkedBlockingQueue()); + } + + @FunctionalInterface + private interface TriFunction { + R apply(T t, U u, V v); + } + + @Data + private class ExecutionResult { + String result; + Throwable exception; + int batchNumber; + String entityKey; + } + + private static final class TimeAgo { + private static final List times = + Arrays.asList( + TimeUnit.DAYS.toMillis(365), + TimeUnit.DAYS.toMillis(30), + TimeUnit.DAYS.toMillis(1), + TimeUnit.HOURS.toMillis(1), + TimeUnit.MINUTES.toMillis(1), + TimeUnit.SECONDS.toMillis(1), + TimeUnit.MILLISECONDS.toMillis(1)); + private static final List timesString = + Arrays.asList("year", "month", "day", "hour", "minute", "second", "milliseconds"); + + private static String toDuration(long duration) { + + StringBuffer res = new StringBuffer(); + for (int i = 0; i < times.size(); i++) { + Long current = times.get(i); + long temp = duration / current; + if (temp > 0) { + res.append(temp) + .append(" ") + .append(timesString.get(i)) + .append(temp != 1 ? 
"s" : StringUtils.EMPTY) + .append(" "); + } + duration = duration % current; + } + if (StringUtils.EMPTY.equals(res.toString())) return "0 seconds ago"; + else return res.toString(); + } } } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/graph/GraphServiceTestBase.java b/metadata-io/src/test/java/com/linkedin/metadata/graph/GraphServiceTestBase.java index c151c1f381ce9..1aebc48153bbe 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/graph/GraphServiceTestBase.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/graph/GraphServiceTestBase.java @@ -2163,7 +2163,7 @@ public void testHighlyConnectedGraphWalk() throws Exception { } assertEquals(new HashSet<>(relatedEntities.getEntities()), expectedRelatedEntities); - Urn root = UrnUtils.getUrn(relatedEntities.getEntities().get(0).getUrn()); + Urn root = dataset1Urn; EntityLineageResult lineageResult = getGraphService(false) .getLineage( @@ -2180,13 +2180,18 @@ public void testHighlyConnectedGraphWalk() throws Exception { 1000, 100, new LineageFlags().setEntitiesExploredPerHopLimit(5)); - assertEquals(lineageResult.getRelationships().size(), 19); + // Unable to explore all paths because multi is disabled, but will be at least 5 since it will + // explore 5 edges + assertTrue( + lineageResult.getRelationships().size() >= 5 + && lineageResult.getRelationships().size() < 20, + "Size was: " + lineageResult.getRelationships().size()); LineageRelationshipArray relationships = lineageResult.getRelationships(); int maxDegree = relationships.stream() .flatMap(relationship -> relationship.getDegrees().stream()) .reduce(0, Math::max); - assertEquals(maxDegree, 1); + assertTrue(maxDegree > 1); EntityLineageResult lineageResultMulti = getGraphService(true) @@ -2205,13 +2210,16 @@ public void testHighlyConnectedGraphWalk() throws Exception { 100, new LineageFlags().setEntitiesExploredPerHopLimit(5)); - assertEquals(lineageResultMulti.getRelationships().size(), 20); + assertTrue( + lineageResultMulti.getRelationships().size() >= 5 + && lineageResultMulti.getRelationships().size() <= 20, + "Size was: " + lineageResultMulti.getRelationships().size()); relationships = lineageResultMulti.getRelationships(); maxDegree = relationships.stream() .flatMap(relationship -> relationship.getDegrees().stream()) .reduce(0, Math::max); - assertTrue(maxDegree > 4); + assertTrue(maxDegree >= 2); // Reset graph service getGraphService(); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/graph/search/SearchGraphServiceTestBase.java b/metadata-io/src/test/java/com/linkedin/metadata/graph/search/SearchGraphServiceTestBase.java index 23b4c82ca0566..d1ee1996e5b8a 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/graph/search/SearchGraphServiceTestBase.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/graph/search/SearchGraphServiceTestBase.java @@ -430,7 +430,10 @@ public void testExplored() throws Exception { Assert.assertTrue(Boolean.TRUE.equals(result.getRelationships().get(0).isExplored())); EntityLineageResult result2 = getUpstreamLineage(dataset2Urn, null, null, 10, 0); - Assert.assertTrue(result2.getRelationships().get(0).isExplored() == null); + Assert.assertTrue(result2.getRelationships().isEmpty()); + + EntityLineageResult result3 = getUpstreamLineage(dataset2Urn, null, null, 10, 1); + Assert.assertTrue(result3.getRelationships().get(0).isExplored()); } /** diff --git a/metadata-jobs/mce-consumer/build.gradle b/metadata-jobs/mce-consumer/build.gradle index b062547724138..5ea24059a3ee3 100644 --- 
a/metadata-jobs/mce-consumer/build.gradle +++ b/metadata-jobs/mce-consumer/build.gradle @@ -22,6 +22,7 @@ dependencies { implementation project(':metadata-events:mxe-utils-avro') implementation project(':metadata-io') implementation project(':metadata-service:restli-client-api') + implementation project(':metadata-dao-impl:kafka-producer') implementation spec.product.pegasus.restliClient implementation spec.product.pegasus.restliCommon implementation externalDependency.elasticSearchRest diff --git a/metadata-jobs/pe-consumer/src/test/java/com/datahub/event/hook/BusinessAttributeUpdateHookTest.java b/metadata-jobs/pe-consumer/src/test/java/com/datahub/event/hook/BusinessAttributeUpdateHookTest.java index 9db3a77e710a3..b3f81551c830a 100644 --- a/metadata-jobs/pe-consumer/src/test/java/com/datahub/event/hook/BusinessAttributeUpdateHookTest.java +++ b/metadata-jobs/pe-consumer/src/test/java/com/datahub/event/hook/BusinessAttributeUpdateHookTest.java @@ -76,7 +76,7 @@ public void setupTest() throws URISyntaxException { mockUpdateIndicesService = mock(UpdateIndicesService.class); actorUrn = Urn.createFromString(TEST_ACTOR_URN); businessAttributeServiceHook = - new BusinessAttributeUpdateHookService(mockUpdateIndicesService, 100, 1); + new BusinessAttributeUpdateHookService(mockUpdateIndicesService, 100, 1, 10, 60); businessAttributeUpdateHook = new BusinessAttributeUpdateHook(businessAttributeServiceHook, true); } diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionInfo.pdl index 5b60aa18e87da..65196a69ce366 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionInfo.pdl @@ -2,6 +2,7 @@ namespace com.linkedin.assertion import com.linkedin.common.CustomProperties import com.linkedin.common.ExternalReference +import com.linkedin.common.AuditStamp /** * Information about an assertion @@ -66,10 +67,15 @@ record AssertionInfo includes CustomProperties, ExternalReference { volumeAssertion: optional VolumeAssertionInfo /** - * A SQL Assertion definition. This field is populated when the type is SQL. + * A SQL Assertion definition. This field is populated when the type is SQL. */ sqlAssertion: optional SqlAssertionInfo + /** + * A Field Assertion definition. This field is populated when the type is FIELD. + */ + fieldAssertion: optional FieldAssertionInfo + /** * An schema Assertion definition. This field is populated when the type is DATA_SCHEMA */ @@ -83,6 +89,12 @@ record AssertionInfo includes CustomProperties, ExternalReference { */ source: optional AssertionSource + /** + * The time at which the assertion was last updated and the actor who updated it. + * This field is only present for Native assertions updated after this field was introduced. 
+ */ + lastUpdated: optional AuditStamp + /** * An optional human-readable description of the assertion */ diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResultError.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResultError.pdl index e768fe8521942..4bbfa20f8663e 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResultError.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResultError.pdl @@ -33,6 +33,14 @@ record AssertionResultError { */ UNSUPPORTED_PLATFORM /** + * Error while executing a custom SQL assertion + */ + CUSTOM_SQL_ERROR + /** + * Error while executing a field assertion + */ + FIELD_ASSERTION_ERROR + /** * Unknown error */ UNKNOWN_ERROR @@ -42,4 +50,4 @@ record AssertionResultError { * Additional metadata depending on the type of error */ properties: optional map[string, string] -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionSource.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionSource.pdl index d8892c0c71c6f..734a48f771886 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionSource.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionSource.pdl @@ -1,5 +1,7 @@ namespace com.linkedin.assertion +import com.linkedin.common.AuditStamp + /** * The source of an assertion */ @@ -24,4 +26,10 @@ record AssertionSource { */ INFERRED } + + /** + * The time at which the assertion was initially created and the author who created it. + * This field is only present for Native assertions created after this field was introduced. + */ + created: optional AuditStamp } \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionStdOperator.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionStdOperator.pdl index 2e0dcbe24986b..ee4f961249025 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionStdOperator.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionStdOperator.pdl @@ -34,6 +34,16 @@ enum AssertionStdOperator { */ EQUAL_TO + /** + * Value being asserted is not equal to value. Requires 'value' parameter. + */ + NOT_EQUAL_TO + + /** + * Value being asserted is null. Requires no parameters. + */ + NULL + /** * Value being asserted is not null. Requires no parameters. */ @@ -69,6 +79,16 @@ enum AssertionStdOperator { */ NOT_IN + /** + * Value being asserted is true. Requires no parameters. + */ + IS_TRUE + + /** + * Value being asserted is false. Requires no parameters. + */ + IS_FALSE + /** * Other */ diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionStdParameter.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionStdParameter.pdl index a212fe84aff13..9c3e3ea7c1c95 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionStdParameter.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionStdParameter.pdl @@ -13,10 +13,29 @@ record AssertionStdParameter { * The type of the parameter */ type: enum AssertionStdParameterType { + /** + * A string value + */ STRING + + /** + * A numeric value + */ NUMBER + + /** + * A list of values. When used, value should be formatted as a serialized JSON array. + */ LIST + + /** + * A set of values. When used, value should be formatted as a serialized JSON array. 
+ */ SET + + /** + * A value of unknown type + */ UNKNOWN } } \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/FieldAssertionInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/FieldAssertionInfo.pdl new file mode 100644 index 0000000000000..0b8d9ab8cceb8 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/FieldAssertionInfo.pdl @@ -0,0 +1,57 @@ +namespace com.linkedin.assertion + +import com.linkedin.common.Urn +import com.linkedin.dataset.DatasetFilter + +/** +* Attributes defining a Field Assertion. +**/ +record FieldAssertionInfo { + /** + * The type of the field assertion being monitored. + */ + @Searchable = {} + type: enum FieldAssertionType { + /** + * An assertion used to validate the values contained with a field / column given a set of rows. + */ + FIELD_VALUES + /** + * An assertion used to validate the value of a common field / column metric (e.g. aggregation) such as null count + percentage, + * min, max, median, and more. + */ + FIELD_METRIC + } + + /** + * The entity targeted by this Field check. + */ + @Searchable = { + "fieldType": "URN" + } + @Relationship = { + "name": "Asserts", + "entityTypes": [ "dataset" ] + } + entity: Urn + + /** + * The definition of an assertion that validates individual values of a field / column for a set of rows. + * This type of assertion verifies that each column value meets a particular requirement. + */ + fieldValuesAssertion: optional FieldValuesAssertion + + /** + * The definition of an assertion that validates a common metric obtained about a field / column for a set of rows. + * This type of assertion verifies that the value of a high-level metric obtained by aggregating over a column meets + * expectations + */ + fieldMetricAssertion: optional FieldMetricAssertion + + /** + * A definition of the specific filters that should be applied, when performing monitoring. + * If not provided, there is no filter, and the full table is under consideration. + * If using DataHub Dataset Profiles as the assertion source type, the value of this field will be ignored. + */ + filter: optional DatasetFilter +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/FieldMetricAssertion.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/FieldMetricAssertion.pdl new file mode 100644 index 0000000000000..ca9ce9cbd6a8c --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/FieldMetricAssertion.pdl @@ -0,0 +1,39 @@ +namespace com.linkedin.assertion + +import com.linkedin.schema.SchemaFieldSpec + +/** +* Attributes defining a field metric assertion, which asserts an expectation against +* a common metric derived from the set of field / column values, for example: +* max, min, median, null count, null percentage, unique count, unique percentage, and more. +*/ +record FieldMetricAssertion { + /** + * The field under evaluation + */ + @Searchable = { + "/path": { + "fieldName": "fieldPath" + } + } + field: SchemaFieldSpec + + /** + * The specific metric to assert against. This is the value that + * will be obtained by applying a standard operation, such as an aggregation, + * to the selected field. + */ + metric: FieldMetricType + + /** + * The predicate to evaluate against the metric for the field / column. + * Depending on the operator, parameters may be required in order to successfully + * evaluate the assertion against the metric value. 
+ */ + operator: AssertionStdOperator + + /** + * Standard parameters required for the assertion. e.g. min_value, max_value, value, columns + */ + parameters: optional AssertionStdParameters +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/FieldMetricType.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/FieldMetricType.pdl new file mode 100644 index 0000000000000..9df06e9dc1fe2 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/FieldMetricType.pdl @@ -0,0 +1,94 @@ +namespace com.linkedin.assertion + +/** + * A standard metric that can be derived from the set of values + * for a specific field / column of a dataset / table. + */ +enum FieldMetricType { + /** + * The number of unique values found in the column value set + */ + UNIQUE_COUNT + + /** + * The percentage of unique values to total rows for the dataset + */ + UNIQUE_PERCENTAGE + + /** + * The number of null values found in the column value set + */ + NULL_COUNT + + /** + * The percentage of null values to total rows for the dataset + */ + NULL_PERCENTAGE + + /** + * The minimum value in the column set (applies to numeric columns) + */ + MIN + + /** + * The maximum value in the column set (applies to numeric columns) + */ + MAX + + /** + * The mean value found in the column set (applies to numeric columns) + */ + MEAN + + /** + * The median value found in the column set (applies to numeric columns) + */ + MEDIAN + + /** + * The stddev (standard deviation) of values found in the column set (applies to numeric columns) + */ + STDDEV + + /** + * The number of negative values found in the value set (applies to numeric columns) + */ + NEGATIVE_COUNT + + /** + * The percentage of negative values to total rows for the dataset (applies to numeric columns) + */ + NEGATIVE_PERCENTAGE + + /** + * The number of zero values found in the value set (applies to numeric columns) + */ + ZERO_COUNT + + /** + * The percentage of zero values to total rows for the dataset (applies to numeric columns) + */ + ZERO_PERCENTAGE + + /** + * The minimum length found in the column set (applies to string columns) + */ + MIN_LENGTH + + /** + * The maximum length found in the column set (applies to string columns) + */ + MAX_LENGTH + + /** + * The number of empty string values found in the value set (applies to string columns). + * Note: This is a completely different metric from NULL_COUNT! + */ + EMPTY_COUNT + + /** + * The percentage of empty string values to total rows for the dataset (applies to string columns) + * Note: This is a completely different metric from NULL_PERCENTAGE! + */ + EMPTY_PERCENTAGE +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/FieldTransform.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/FieldTransform.pdl new file mode 100644 index 0000000000000..3b3d3339a9b86 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/FieldTransform.pdl @@ -0,0 +1,21 @@ +namespace com.linkedin.assertion + +/** +* Definition of a transform applied to the values of a column / field. +* Note that the applicability of a field transform ultimately depends on the native type +* of the field / column. +* +* Model has single field to permit extension. +*/ +record FieldTransform { + /** + * The type of the field transform, e.g. the transformation + * function / operator to apply.
+ */ + type: enum FieldTransformType { + /** + * Obtain the length of a string field / column (applicable to string types) + */ + LENGTH + } +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/FieldValuesAssertion.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/FieldValuesAssertion.pdl new file mode 100644 index 0000000000000..0400124234462 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/FieldValuesAssertion.pdl @@ -0,0 +1,83 @@ +namespace com.linkedin.assertion + +import com.linkedin.schema.SchemaFieldSpec + +/** +* Attributes defining a field values assertion, which asserts that the values for a field / column +* of a dataset / table matches a set of expectations. +* +* In other words, this type of assertion acts as a semantic constraint applied to fields for a specific column. +* +* TODO: We should display the "failed row count" to the user if the column fails the verification rules. +* TODO: Determine whether we need an "operator" that can be applied to the field. +*/ +record FieldValuesAssertion { + /** + * The field under evaluation + */ + @Searchable = { + "/path": { + "fieldName": "fieldPath" + } + } + field: SchemaFieldSpec + + /** + * An optional transform to apply to field values + * before evaluating the operator. + * + * If none is applied, the field value will be compared as is. + */ + transform: optional FieldTransform + + /** + * The predicate to evaluate against a single value of the field. + * Depending on the operator, parameters may be required in order to successfully + * evaluate the assertion against the field value. + */ + operator: AssertionStdOperator + + /** + * Standard parameters required for the assertion. e.g. min_value, max_value, value, columns + */ + parameters: optional AssertionStdParameters + + /** + * Additional customization about when the assertion + * should be officially considered failing. + */ + failThreshold: record FieldValuesFailThreshold { + + /** + * The type of failure threshold. Either based on the number + * of column values (rows) that fail the expectations, or the percentage + * of the total rows under consideration. + */ + type: enum FieldValuesFailThresholdType { + /* + * The maximum number of column values (i.e. rows) that are allowed + * to fail the defined expectations before the assertion officially fails. + */ + COUNT + /* + * The maximum percentage of rows that are allowed + * to fail the defined column expectations before the assertion officially fails. + */ + PERCENTAGE + } = "COUNT" + + /** + * By default this is 0, meaning that ALL column values (i.e. rows) must + * meet the defined expectations. + */ + value: long = 0 + } + + /** + * Whether to ignore or allow nulls when running the values assertion. (i.e. + * consider only non-null values) using operators OTHER than the IS_NULL operator. + * + * Defaults to true, allowing null values. + */ + excludeNulls: boolean = true +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessFieldSpec.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessFieldSpec.pdl index 04acd1c71352d..179d4a1b13591 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessFieldSpec.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessFieldSpec.pdl @@ -4,11 +4,13 @@ import com.linkedin.schema.SchemaFieldSpec /** -* Lightweight spec used for referencing a particular schema field. 
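One detail of the FieldValuesAssertion model above that is easy to misread: failThreshold only decides when per-row failures accumulate into an assertion failure, either as an absolute count of failing rows or as a percentage of the rows that were evaluated (the default COUNT / 0 means a single bad row fails the check). Below is a small sketch of that decision, assuming the evaluator already knows the evaluated and failing row counts; the class and method names are hypothetical, not the production evaluator.

// Mirrors the FieldValuesFailThreshold semantics: COUNT compares absolute failing rows,
// PERCENTAGE compares failing rows relative to the rows actually evaluated.
enum FailThresholdType { COUNT, PERCENTAGE }

final class FieldValuesVerdict {
  static boolean assertionFails(
      FailThresholdType type, long thresholdValue, long rowsEvaluated, long rowsFailed) {
    if (type == FailThresholdType.COUNT) {
      return rowsFailed > thresholdValue; // default threshold 0: any failing row fails the assertion
    }
    if (rowsEvaluated == 0) {
      return false; // nothing evaluated, nothing violated
    }
    double failedPercentage = (100.0 * rowsFailed) / rowsEvaluated;
    return failedPercentage > thresholdValue;
  }
}

// With excludeNulls = true, rowsEvaluated would exclude rows where the field is null,
// unless the operator itself is IS_NULL.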
-**/ +* Lightweight spec used for referencing a particular schema field that is used to compute +* a freshness signal or operation. +* TODO: Since this is now leveraged across assertions & metrics / operations, we should consider moving this to a common package. +*/ record FreshnessFieldSpec includes SchemaFieldSpec { /** - * The type of the field being used to verify the Freshness Assertion. + * The type of the field being used to verify the Freshness of the asset. */ kind: optional FreshnessFieldKind } \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/SchemaAssertionInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/SchemaAssertionInfo.pdl index fd246e0c7cfc4..2e691d5152ae3 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/assertion/SchemaAssertionInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/SchemaAssertionInfo.pdl @@ -25,5 +25,36 @@ record SchemaAssertionInfo { * Note that many of the fields of this model, especially those related to metadata (tags, terms) * will go unused in this context. */ - schema: SchemaMetadata +// @Relationship = { +// "/foreignKeys/*/foreignFields/*": null, +// "/foreignKeys/*/foreignDataset": null, +// "/fields/*/globalTags/tags/*/tag": null, +// "/fields/*/glossaryTerms/terms/*/urn": null +// } +// @Searchable = { +// "/fields/*/fieldPath": null, +// "/fields/*/description": null, +// "/fields/*/label": null, +// "/fields/*/globalTags/tags/*/tag": null, +// "/fields/*/glossaryTerms/terms/*/urn": null +// } + schema: SchemaMetadata + + /** + * The required compatibility level for the schema assertion to pass. + */ + compatibility: optional enum SchemaAssertionCompatibility { + /** + * The actual schema must be exactly the same as the expected schema + */ + EXACT_MATCH, + /** + * The actual schema must be a superset of the expected schema + */ + SUPERSET, + /** + * The actual schema must be a subset of the expected schema + */ + SUBSET + } = "EXACT_MATCH" } \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/VolumeAssertionInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/VolumeAssertionInfo.pdl index 327b76f95762e..bdc78d3bd0a6f 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/assertion/VolumeAssertionInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/VolumeAssertionInfo.pdl @@ -8,7 +8,7 @@ import com.linkedin.dataset.DatasetFilter */ record VolumeAssertionInfo { /** - * The type of the freshness assertion being monitored. + * The type of the volume assertion being monitored. */ @Searchable = {} type: enum VolumeAssertionType { diff --git a/metadata-models/src/main/pegasus/com/linkedin/datacontract/DataQualityContract.pdl b/metadata-models/src/main/pegasus/com/linkedin/datacontract/DataQualityContract.pdl index 273d2c2a56f95..3ff8b58284f18 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/datacontract/DataQualityContract.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/datacontract/DataQualityContract.pdl @@ -12,5 +12,9 @@ record DataQualityContract { * The assertion representing the Data Quality contract. * E.g. a table or column-level assertion. 
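For the SchemaAssertionInfo change above, the new compatibility field sets how strictly the observed schema must line up with the expected SchemaMetadata. The sketch below reduces each schema to its set of field paths to show the intent of the three modes; the real comparison presumably also accounts for field types, and the helper class here is hypothetical.

import java.util.Set;

// EXACT_MATCH: same fields; SUPERSET: actual may add fields; SUBSET: actual may omit fields.
enum SchemaCompatibility { EXACT_MATCH, SUPERSET, SUBSET }

final class SchemaCompatibilityCheck {
  static boolean matches(
      SchemaCompatibility mode, Set<String> expectedFieldPaths, Set<String> actualFieldPaths) {
    switch (mode) {
      case EXACT_MATCH: return actualFieldPaths.equals(expectedFieldPaths);
      case SUPERSET:    return actualFieldPaths.containsAll(expectedFieldPaths);
      case SUBSET:      return expectedFieldPaths.containsAll(actualFieldPaths);
      default:          throw new IllegalStateException("Unknown mode: " + mode);
    }
  }
}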
*/ + @Relationship = { + "name": "IncludesDataQualityAssertion", + "entityTypes": [ "assertion" ] + } assertion: Urn } \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/datacontract/SchemaContract.pdl b/metadata-models/src/main/pegasus/com/linkedin/datacontract/SchemaContract.pdl index 6c11e0da5b128..af61a660cdf76 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/datacontract/SchemaContract.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/datacontract/SchemaContract.pdl @@ -9,5 +9,9 @@ record SchemaContract { /** * The assertion representing the schema contract. */ + @Relationship = { + "name": "IncludesSchemaAssertion", + "entityTypes": [ "assertion" ] + } assertion: Urn } diff --git a/metadata-models/src/main/pegasus/com/linkedin/incident/IncidentSource.pdl b/metadata-models/src/main/pegasus/com/linkedin/incident/IncidentSource.pdl index 2f8912da5458c..2e65d37dc0939 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/incident/IncidentSource.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/incident/IncidentSource.pdl @@ -22,6 +22,11 @@ record IncidentSource { * Manually created incident, via UI or API. */ MANUAL + + /** + * An assertion has failed, triggering the incident. + */ + ASSERTION_FAILURE } /** diff --git a/metadata-models/src/main/pegasus/com/linkedin/incident/IncidentType.pdl b/metadata-models/src/main/pegasus/com/linkedin/incident/IncidentType.pdl index 27c4790e3b6ef..1c3473018d4e0 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/incident/IncidentType.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/incident/IncidentType.pdl @@ -4,6 +4,36 @@ namespace com.linkedin.incident * A type of asset incident */ enum IncidentType { + /** + * A Freshness Assertion has failed, triggering the incident. + * Raised on entities where assertions are configured to generate incidents. + */ + FRESHNESS + + /** + * A Volume Assertion has failed, triggering the incident. + * Raised on entities where assertions are configured to generate incidents. + */ + VOLUME + + /** + * A Field Assertion has failed, triggering the incident. + * Raised on entities where assertions are configured to generate incidents. + */ + FIELD + + /** + * A raw SQL-statement based assertion has failed, triggering the incident. + * Raised on entities where assertions are configured to generate incidents. + */ + SQL + + /** + * A Data Schema assertion has failed, triggering the incident. + * Raised on entities where assertions are configured to generate incidents. + */ + DATA_SCHEMA + /** * A misc. operational incident, e.g. failure to materialize a dataset.
*/ diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/graph/LineageRelationship.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/graph/LineageRelationship.pdl index 552dd7323b551..7535c7e9292ec 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/graph/LineageRelationship.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/graph/LineageRelationship.pdl @@ -73,6 +73,11 @@ record LineageRelationship { */ explored: optional boolean + /** + * Indicates this destination node has additional unexplored child relationships + */ + truncatedChildren: optional boolean + /** * Whether this relationship was ignored as a hop while performing the graph walk */ diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/LineageSearchEntity.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/search/LineageSearchEntity.pdl index 3fd8a48c6bf5e..3f246b5014df0 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/LineageSearchEntity.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/search/LineageSearchEntity.pdl @@ -35,6 +35,11 @@ record LineageSearchEntity includes SearchEntity { */ explored: optional boolean + /** + * Indicates this destination node has additional unexplored child relationships + */ + truncatedChildren: optional boolean + /** * Whether this relationship was ignored as a hop while performing the graph walk */ diff --git a/metadata-models/src/main/pegasus/com/linkedin/settings/global/GlobalViewsSettings.pdl b/metadata-models/src/main/pegasus/com/linkedin/settings/global/GlobalViewsSettings.pdl index ff34dea556855..10b1176b637ef 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/settings/global/GlobalViewsSettings.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/settings/global/GlobalViewsSettings.pdl @@ -9,5 +9,9 @@ record GlobalViewsSettings { /** * The default View for the instance, or organization. 
*/ + @Relationship = { + "name": "viewedWith", + "entityTypes": [ "dataHubView" ] + } defaultView: optional Urn -} \ No newline at end of file +} diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/MetadataChangeProposalConfig.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/MetadataChangeProposalConfig.java new file mode 100644 index 0000000000000..3d3808bc5feb4 --- /dev/null +++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/MetadataChangeProposalConfig.java @@ -0,0 +1,30 @@ +package com.linkedin.metadata.config; + +import lombok.Data; +import lombok.experimental.Accessors; + +@Data +@Accessors(chain = true) +public class MetadataChangeProposalConfig { + + ThrottlesConfig throttle; + + @Data + @Accessors(chain = true) + public static class ThrottlesConfig { + Integer updateIntervalMs; + ThrottleConfig versioned; + ThrottleConfig timeseries; + } + + @Data + @Accessors(chain = true) + public static class ThrottleConfig { + boolean enabled; + Integer threshold; + Integer maxAttempts; + Integer initialIntervalMs; + Integer multiplier; + Integer maxIntervalMs; + } +} diff --git a/metadata-service/configuration/src/main/resources/application.yaml b/metadata-service/configuration/src/main/resources/application.yaml index 770be86e254b1..19621dce767c6 100644 --- a/metadata-service/configuration/src/main/resources/application.yaml +++ b/metadata-service/configuration/src/main/resources/application.yaml @@ -460,3 +460,28 @@ forms: businessAttribute: fetchRelatedEntitiesCount: ${BUSINESS_ATTRIBUTE_RELATED_ENTITIES_COUNT:20000} fetchRelatedEntitiesBatchSize: ${BUSINESS_ATTRIBUTE_RELATED_ENTITIES_BATCH_SIZE:1000} + threadCount: ${BUSINESS_ATTRIBUTE_PROPAGATION_CONCURRENCY_THREAD_COUNT:-1} # Thread Pool size, default 2 * # of cores + keepAliveTime: ${BUSINESS_ATTRIBUTE_PROPAGATION_CONCURRENCY_KEEP_ALIVE:60} # Number of seconds to keep inactive threads alive + +metadataChangeProposal: + throttle: + updateIntervalMs: ${MCP_THROTTLE_UPDATE_INTERVAL_MS:60000} + + # Versioned MCL topic + versioned: + # Whether to throttle MCP processing based on MCL backlog + enabled: ${MCP_VERSIONED_THROTTLE_ENABLED:false} + threshold: ${MCP_VERSIONED_THRESHOLD:4000} # throttle threshold + maxAttempts: ${MCP_VERSIONED_MAX_ATTEMPTS:1000} + initialIntervalMs: ${MCP_VERSIONED_INITIAL_INTERVAL_MS:100} + multiplier: ${MCP_VERSIONED_MULTIPLIER:10} + maxIntervalMs: ${MCP_VERSIONED_MAX_INTERVAL_MS:30000} + # Timeseries MCL topic + timeseries: + # Whether to throttle MCP processing based on MCL backlog + enabled: ${MCP_TIMESERIES_THROTTLE_ENABLED:false} + threshold: ${MCP_TIMESERIES_THRESHOLD:4000} # throttle threshold + maxAttempts: ${MCP_TIMESERIES_MAX_ATTEMPTS:1000} + initialIntervalMs: ${MCP_TIMESERIES_INITIAL_INTERVAL_MS:100} + multiplier: ${MCP_TIMESERIES_MULTIPLIER:10} + maxIntervalMs: ${MCP_TIMESERIES_MAX_INTERVAL_MS:30000} \ No newline at end of file diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/config/ConfigurationProvider.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/config/ConfigurationProvider.java index 9381e24fabab6..08adbd54730a7 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/config/ConfigurationProvider.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/config/ConfigurationProvider.java @@ -7,6 +7,7 @@ import com.linkedin.metadata.config.EbeanConfiguration; import 
com.linkedin.metadata.config.GraphQLConfiguration; import com.linkedin.metadata.config.IngestionConfiguration; +import com.linkedin.metadata.config.MetadataChangeProposalConfig; import com.linkedin.metadata.config.SystemUpdateConfiguration; import com.linkedin.metadata.config.TestsConfiguration; import com.linkedin.metadata.config.ViewsConfiguration; @@ -80,4 +81,7 @@ public class ConfigurationProvider { /** GraphQL Configurations */ private GraphQLConfiguration graphQL; + + /** MCP throttling configuration */ + private MetadataChangeProposalConfig metadataChangeProposal; } diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/graphql/GraphQLEngineFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/graphql/GraphQLEngineFactory.java index 8c0a079f1e61d..aa80fc62db09c 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/graphql/GraphQLEngineFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/graphql/GraphQLEngineFactory.java @@ -68,7 +68,7 @@ EntityRegistryFactory.class, DataHubTokenServiceFactory.class, GitVersionFactory.class, - SiblingGraphServiceFactory.class + SiblingGraphServiceFactory.class, }) public class GraphQLEngineFactory { @Autowired diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/DataHubKafkaProducerFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/DataHubKafkaProducerFactory.java index 5844dc4a8f72a..6a2b9f511b79f 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/DataHubKafkaProducerFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/DataHubKafkaProducerFactory.java @@ -44,7 +44,7 @@ public static Map buildProducerProperties( Arrays.asList(kafkaConfiguration.getBootstrapServers().split(","))); } // else we rely on KafkaProperties which defaults to localhost:9092 - Map props = properties.buildProducerProperties(); + Map props = properties.buildProducerProperties(null); props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, schemaRegistryConfig.getSerializer()); diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/KafkaEventConsumerFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/KafkaEventConsumerFactory.java index d5210213185be..9501b03482d04 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/KafkaEventConsumerFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/KafkaEventConsumerFactory.java @@ -73,7 +73,7 @@ private static Map buildCustomizedProperties( Arrays.asList(kafkaConfiguration.getBootstrapServers().split(","))); } // else we rely on KafkaProperties which defaults to localhost:9092 - Map customizedProperties = baseKafkaProperties.buildConsumerProperties(); + Map customizedProperties = baseKafkaProperties.buildConsumerProperties(null); customizedProperties.put( ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, ErrorHandlingDeserializer.class); customizedProperties.put( diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/SimpleKafkaConsumerFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/SimpleKafkaConsumerFactory.java index 3a6c9770fd362..0193ded97f81b 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/SimpleKafkaConsumerFactory.java +++ 
b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/SimpleKafkaConsumerFactory.java @@ -44,7 +44,7 @@ protected KafkaListenerContainerFactory createInstance( Arrays.asList(kafkaConfiguration.getBootstrapServers().split(","))); } // else we rely on KafkaProperties which defaults to localhost:9092 - Map customizedProperties = properties.buildConsumerProperties(); + Map customizedProperties = properties.buildConsumerProperties(null); customizedProperties.put( ConsumerConfig.MAX_PARTITION_FETCH_BYTES_CONFIG, kafkaConfiguration.getConsumer().getMaxPartitionFetchBytes()); diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/throttle/KafkaProducerThrottleFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/throttle/KafkaProducerThrottleFactory.java new file mode 100644 index 0000000000000..1eaff82fd517f --- /dev/null +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/throttle/KafkaProducerThrottleFactory.java @@ -0,0 +1,93 @@ +package com.linkedin.gms.factory.kafka.throttle; + +import com.datahub.metadata.dao.producer.KafkaProducerThrottle; +import com.linkedin.gms.factory.config.ConfigurationProvider; +import com.linkedin.metadata.config.MetadataChangeProposalConfig; +import com.linkedin.metadata.config.kafka.KafkaConfiguration; +import com.linkedin.metadata.models.registry.EntityRegistry; +import com.linkedin.mxe.Topics; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import java.util.Optional; +import lombok.extern.slf4j.Slf4j; +import org.apache.kafka.clients.admin.AdminClient; +import org.apache.kafka.clients.admin.AdminClientConfig; +import org.apache.kafka.clients.admin.KafkaAdminClient; +import org.springframework.beans.factory.annotation.Qualifier; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.boot.autoconfigure.kafka.KafkaProperties; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; +import org.springframework.kafka.config.KafkaListenerEndpointRegistry; +import org.springframework.kafka.listener.MessageListenerContainer; + +@Slf4j +@Configuration +public class KafkaProducerThrottleFactory { + + @Value("${METADATA_CHANGE_LOG_KAFKA_CONSUMER_GROUP_ID:generic-mae-consumer-job-client}") + private String maeConsumerGroupId; + + @Value("${METADATA_CHANGE_PROPOSAL_KAFKA_CONSUMER_GROUP_ID:generic-mce-consumer-job-client}") + private String mceConsumerGroupId; + + @Value("${METADATA_CHANGE_LOG_VERSIONED_TOPIC_NAME:" + Topics.METADATA_CHANGE_LOG_VERSIONED + "}") + private String versionedTopicName; + + @Value( + "${METADATA_CHANGE_LOG_TIMESERIES_TOPIC_NAME:" + Topics.METADATA_CHANGE_LOG_TIMESERIES + "}") + private String timeseriesTopicName; + + @Bean + public KafkaProducerThrottle kafkaProducerThrottle( + @Qualifier("configurationProvider") ConfigurationProvider provider, + final KafkaProperties kafkaProperties, + final EntityRegistry entityRegistry, + final KafkaListenerEndpointRegistry registry) { + + KafkaConfiguration kafkaConfiguration = provider.getKafka(); + MetadataChangeProposalConfig mcpConfig = provider.getMetadataChangeProposal(); + + return KafkaProducerThrottle.builder() + .entityRegistry(entityRegistry) + .kafkaAdmin(kafkaAdmin(kafkaConfiguration, kafkaProperties)) + .config(mcpConfig.getThrottle()) + .mclConsumerGroupId(maeConsumerGroupId) + .timeseriesTopicName(timeseriesTopicName) + .versionedTopicName(versionedTopicName) + 
.pauseConsumer( + (pause) -> { + Optional container = + Optional.ofNullable(registry.getListenerContainer(mceConsumerGroupId)); + if (container.isEmpty()) { + log.warn( + "Expected container was missing: {} throttling is not possible.", + mceConsumerGroupId); + } else { + if (pause) { + container.ifPresent(MessageListenerContainer::pause); + } else { + container.ifPresent(MessageListenerContainer::resume); + } + } + }) + .build() + .start(); + } + + private static AdminClient kafkaAdmin( + KafkaConfiguration kafkaConfiguration, final KafkaProperties kafkaProperties) { + Map adminProperties = new HashMap<>(kafkaProperties.buildAdminProperties(null)); + + // KAFKA_BOOTSTRAP_SERVER has precedence over SPRING_KAFKA_BOOTSTRAP_SERVERS + if (kafkaConfiguration.getBootstrapServers() != null + && !kafkaConfiguration.getBootstrapServers().isEmpty()) { + adminProperties.put( + AdminClientConfig.BOOTSTRAP_SERVERS_CONFIG, + Arrays.asList(kafkaConfiguration.getBootstrapServers().split(","))); + } // else we rely on KafkaProperties which defaults to localhost:9092 or environment variables + + return KafkaAdminClient.create(adminProperties); + } +} diff --git a/metadata-service/plugin/src/main/java/com/datahub/plugins/metadata/aspect/SpringPluginFactory.java b/metadata-service/plugin/src/main/java/com/datahub/plugins/metadata/aspect/SpringPluginFactory.java index 8a080c8d9076e..3953ab8a45636 100644 --- a/metadata-service/plugin/src/main/java/com/datahub/plugins/metadata/aspect/SpringPluginFactory.java +++ b/metadata-service/plugin/src/main/java/com/datahub/plugins/metadata/aspect/SpringPluginFactory.java @@ -28,34 +28,39 @@ public SpringPluginFactory( @Nonnull List classLoaders) { super(pluginConfiguration, classLoaders); - String[] packageScan = - extractPackageScan( - Optional.ofNullable(pluginConfiguration) - .map(PluginConfiguration::streamAll) - .orElse(Stream.of())) - .toArray(String[]::new); - - if (springApplicationContext != null || packageScan.length == 0) { - this.springApplicationContext = springApplicationContext; - } else { - AnnotationConfigApplicationContext rootContext = null; - - for (ClassLoader classLoader : classLoaders) { - AnnotationConfigApplicationContext applicationContext = - new AnnotationConfigApplicationContext(); - applicationContext.setId("custom-plugin"); - if (rootContext != null) { - applicationContext.setParent(rootContext); + try { + String[] packageScan = + extractPackageScan( + Optional.ofNullable(pluginConfiguration) + .map(PluginConfiguration::streamAll) + .orElse(Stream.of())) + .toArray(String[]::new); + + if (springApplicationContext != null || packageScan.length == 0) { + this.springApplicationContext = springApplicationContext; + } else { + AnnotationConfigApplicationContext rootContext = null; + + for (ClassLoader classLoader : classLoaders) { + AnnotationConfigApplicationContext applicationContext = + new AnnotationConfigApplicationContext(); + applicationContext.setId("custom-plugin"); + if (rootContext != null) { + applicationContext.setParent(rootContext); + } + applicationContext.setClassLoader(classLoader); + applicationContext.scan(packageScan); + rootContext = applicationContext; } - applicationContext.setClassLoader(classLoader); - applicationContext.scan(packageScan); - rootContext = applicationContext; + rootContext.refresh(); + this.springApplicationContext = rootContext; } - rootContext.refresh(); - this.springApplicationContext = rootContext; - } - loadPlugins(); + loadPlugins(); + } catch (Exception e) { + log.error("Error loading Spring 
Plugins!", e); + throw e; + } } private static Stream extractPackageScan(Stream configStream) { diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json index 00b434d30356f..eb81fe3ff8db3 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json @@ -843,10 +843,10 @@ "PRE" : "Designates pre-production fabrics", "PROD" : "Designates production fabrics", "QA" : "Designates quality assurance fabrics", + "RVW" : "Designates review fabrics", "STG" : "Designates staging fabrics", "TEST" : "Designates testing fabrics", - "UAT" : "Designates user acceptance testing fabrics", - "RVW" : "Designates review fabrics" + "UAT" : "Designates user acceptance testing fabrics" } }, { "type" : "record", @@ -2489,7 +2489,13 @@ }, { "name" : "lastModified", "type" : "com.linkedin.common.AuditStamp", - "doc" : "Audit stamp containing who last modified the status and when." + "doc" : "Audit stamp containing who last modified the status and when.", + "Searchable" : { + "/time" : { + "fieldName" : "statusLastModifiedAt", + "fieldType" : "COUNT" + } + } } ], "Aspect" : { "name" : "corpUserStatus" @@ -2861,8 +2867,9 @@ }, { "name" : "label", "type" : "string", - "doc" : "Label of the field. Provides a more human-readable name for the field than field path. Some sources will\nprovide this metadata but not all sources have the concept of a label. If just one string is associated with\na field in a source, that is most likely a description.", + "doc" : "Label of the field. Provides a more human-readable name for the field than field path. Some sources will\nprovide this metadata but not all sources have the concept of a label. If just one string is associated with\na field in a source, that is most likely a description.\n\nNote that this field is deprecated and is not surfaced in the UI.", "optional" : true, + "Deprecated" : true, "Searchable" : { "boostScore" : 0.2, "fieldName" : "fieldLabels", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json index ffbcdd1b2adb3..38d91856f1536 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json @@ -843,10 +843,10 @@ "PRE" : "Designates pre-production fabrics", "PROD" : "Designates production fabrics", "QA" : "Designates quality assurance fabrics", + "RVW" : "Designates review fabrics", "STG" : "Designates staging fabrics", "TEST" : "Designates testing fabrics", - "UAT" : "Designates user acceptance testing fabrics", - "RVW" : "Designates review fabrics" + "UAT" : "Designates user acceptance testing fabrics" } }, { "type" : "record", @@ -2801,7 +2801,13 @@ }, { "name" : "lastModified", "type" : "com.linkedin.common.AuditStamp", - "doc" : "Audit stamp containing who last modified the status and when." + "doc" : "Audit stamp containing who last modified the status and when.", + "Searchable" : { + "/time" : { + "fieldName" : "statusLastModifiedAt", + "fieldType" : "COUNT" + } + } } ], "Aspect" : { "name" : "corpUserStatus" @@ -3249,8 +3255,9 @@ }, { "name" : "label", "type" : "string", - "doc" : "Label of the field. 
Provides a more human-readable name for the field than field path. Some sources will\nprovide this metadata but not all sources have the concept of a label. If just one string is associated with\na field in a source, that is most likely a description.", + "doc" : "Label of the field. Provides a more human-readable name for the field than field path. Some sources will\nprovide this metadata but not all sources have the concept of a label. If just one string is associated with\na field in a source, that is most likely a description.\n\nNote that this field is deprecated and is not surfaced in the UI.", "optional" : true, + "Deprecated" : true, "Searchable" : { "boostScore" : 0.2, "fieldName" : "fieldLabels", @@ -5344,7 +5351,12 @@ "items" : "com.linkedin.common.Urn" }, "doc" : "A specific set of users to apply the policy to (disjunctive)", - "optional" : true + "optional" : true, + "Searchable" : { + "/*" : { + "fieldType" : "URN" + } + } }, { "name" : "groups", "type" : { @@ -5352,7 +5364,12 @@ "items" : "com.linkedin.common.Urn" }, "doc" : "A specific set of groups to apply the policy to (disjunctive)", - "optional" : true + "optional" : true, + "Searchable" : { + "/*" : { + "fieldType" : "URN" + } + } }, { "name" : "resourceOwners", "type" : "boolean", @@ -5370,12 +5387,18 @@ "name" : "allUsers", "type" : "boolean", "doc" : "Whether the filter should apply to all users.", - "default" : false + "default" : false, + "Searchable" : { + "fieldType" : "BOOLEAN" + } }, { "name" : "allGroups", "type" : "boolean", "doc" : "Whether the filter should apply to all groups.", - "default" : false + "default" : false, + "Searchable" : { + "fieldType" : "BOOLEAN" + } }, { "name" : "roles", "type" : { @@ -5389,6 +5412,11 @@ "entityTypes" : [ "dataHubRole" ], "name" : "IsAssociatedWithRole" } + }, + "Searchable" : { + "/*" : { + "fieldType" : "URN" + } } } ] }, @@ -6211,6 +6239,11 @@ "type" : "boolean", "doc" : "Marks an entity as having been explored for as a part of the graph walk", "optional" : true + }, { + "name" : "truncatedChildren", + "type" : "boolean", + "doc" : "Indicates this destination node has additional unexplored child relationships", + "optional" : true }, { "name" : "ignoredAsHop", "type" : "boolean", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json index 0139072b2ae15..e1c8d3007d59d 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json @@ -585,10 +585,10 @@ "PRE" : "Designates pre-production fabrics", "PROD" : "Designates production fabrics", "QA" : "Designates quality assurance fabrics", + "RVW" : "Designates review fabrics", "STG" : "Designates staging fabrics", "TEST" : "Designates testing fabrics", - "UAT" : "Designates user acceptance testing fabrics", - "RVW" : "Designates review fabrics" + "UAT" : "Designates user acceptance testing fabrics" } }, { "type" : "record", @@ -2222,7 +2222,13 @@ }, { "name" : "lastModified", "type" : "com.linkedin.common.AuditStamp", - "doc" : "Audit stamp containing who last modified the status and when." 
+ "doc" : "Audit stamp containing who last modified the status and when.", + "Searchable" : { + "/time" : { + "fieldName" : "statusLastModifiedAt", + "fieldType" : "COUNT" + } + } } ], "Aspect" : { "name" : "corpUserStatus" @@ -2594,8 +2600,9 @@ }, { "name" : "label", "type" : "string", - "doc" : "Label of the field. Provides a more human-readable name for the field than field path. Some sources will\nprovide this metadata but not all sources have the concept of a label. If just one string is associated with\na field in a source, that is most likely a description.", + "doc" : "Label of the field. Provides a more human-readable name for the field than field path. Some sources will\nprovide this metadata but not all sources have the concept of a label. If just one string is associated with\na field in a source, that is most likely a description.\n\nNote that this field is deprecated and is not surfaced in the UI.", "optional" : true, + "Deprecated" : true, "Searchable" : { "boostScore" : 0.2, "fieldName" : "fieldLabels", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.lineage.relationships.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.lineage.relationships.snapshot.json index 3886faffadedb..ba29f43dae0a6 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.lineage.relationships.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.lineage.relationships.snapshot.json @@ -182,6 +182,11 @@ "type" : "boolean", "doc" : "Marks this relationship as explored during the graph walk", "optional" : true + }, { + "name" : "truncatedChildren", + "type" : "boolean", + "doc" : "Indicates this destination node has additional unexplored child relationships", + "optional" : true }, { "name" : "ignoredAsHop", "type" : "boolean", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json index 1caeed2570317..8572ae2f07943 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json @@ -585,10 +585,10 @@ "PRE" : "Designates pre-production fabrics", "PROD" : "Designates production fabrics", "QA" : "Designates quality assurance fabrics", + "RVW" : "Designates review fabrics", "STG" : "Designates staging fabrics", "TEST" : "Designates testing fabrics", - "UAT" : "Designates user acceptance testing fabrics", - "RVW" : "Designates review fabrics" + "UAT" : "Designates user acceptance testing fabrics" } }, { "type" : "record", @@ -2216,7 +2216,13 @@ }, { "name" : "lastModified", "type" : "com.linkedin.common.AuditStamp", - "doc" : "Audit stamp containing who last modified the status and when." + "doc" : "Audit stamp containing who last modified the status and when.", + "Searchable" : { + "/time" : { + "fieldName" : "statusLastModifiedAt", + "fieldType" : "COUNT" + } + } } ], "Aspect" : { "name" : "corpUserStatus" @@ -2588,8 +2594,9 @@ }, { "name" : "label", "type" : "string", - "doc" : "Label of the field. Provides a more human-readable name for the field than field path. Some sources will\nprovide this metadata but not all sources have the concept of a label. If just one string is associated with\na field in a source, that is most likely a description.", + "doc" : "Label of the field. 
Provides a more human-readable name for the field than field path. Some sources will\nprovide this metadata but not all sources have the concept of a label. If just one string is associated with\na field in a source, that is most likely a description.\n\nNote that this field is deprecated and is not surfaced in the UI.", "optional" : true, + "Deprecated" : true, "Searchable" : { "boostScore" : 0.2, "fieldName" : "fieldLabels", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json index 1592333988b4c..bb32d6a870d48 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json @@ -843,10 +843,10 @@ "PRE" : "Designates pre-production fabrics", "PROD" : "Designates production fabrics", "QA" : "Designates quality assurance fabrics", + "RVW" : "Designates review fabrics", "STG" : "Designates staging fabrics", "TEST" : "Designates testing fabrics", - "UAT" : "Designates user acceptance testing fabrics", - "RVW" : "Designates review fabrics" + "UAT" : "Designates user acceptance testing fabrics" } }, { "type" : "record", @@ -2795,7 +2795,13 @@ }, { "name" : "lastModified", "type" : "com.linkedin.common.AuditStamp", - "doc" : "Audit stamp containing who last modified the status and when." + "doc" : "Audit stamp containing who last modified the status and when.", + "Searchable" : { + "/time" : { + "fieldName" : "statusLastModifiedAt", + "fieldType" : "COUNT" + } + } } ], "Aspect" : { "name" : "corpUserStatus" @@ -3243,8 +3249,9 @@ }, { "name" : "label", "type" : "string", - "doc" : "Label of the field. Provides a more human-readable name for the field than field path. Some sources will\nprovide this metadata but not all sources have the concept of a label. If just one string is associated with\na field in a source, that is most likely a description.", + "doc" : "Label of the field. Provides a more human-readable name for the field than field path. Some sources will\nprovide this metadata but not all sources have the concept of a label. 
If just one string is associated with\na field in a source, that is most likely a description.\n\nNote that this field is deprecated and is not surfaced in the UI.", "optional" : true, + "Deprecated" : true, "Searchable" : { "boostScore" : 0.2, "fieldName" : "fieldLabels", @@ -5338,7 +5345,12 @@ "items" : "com.linkedin.common.Urn" }, "doc" : "A specific set of users to apply the policy to (disjunctive)", - "optional" : true + "optional" : true, + "Searchable" : { + "/*" : { + "fieldType" : "URN" + } + } }, { "name" : "groups", "type" : { @@ -5346,7 +5358,12 @@ "items" : "com.linkedin.common.Urn" }, "doc" : "A specific set of groups to apply the policy to (disjunctive)", - "optional" : true + "optional" : true, + "Searchable" : { + "/*" : { + "fieldType" : "URN" + } + } }, { "name" : "resourceOwners", "type" : "boolean", @@ -5364,12 +5381,18 @@ "name" : "allUsers", "type" : "boolean", "doc" : "Whether the filter should apply to all users.", - "default" : false + "default" : false, + "Searchable" : { + "fieldType" : "BOOLEAN" + } }, { "name" : "allGroups", "type" : "boolean", "doc" : "Whether the filter should apply to all groups.", - "default" : false + "default" : false, + "Searchable" : { + "fieldType" : "BOOLEAN" + } }, { "name" : "roles", "type" : { @@ -5383,6 +5406,11 @@ "entityTypes" : [ "dataHubRole" ], "name" : "IsAssociatedWithRole" } + }, + "Searchable" : { + "/*" : { + "fieldType" : "URN" + } } } ] }, diff --git a/metadata-service/schema-registry-servlet/src/test/java/io/datahubproject/openapi/test/SchemaRegistryControllerTestConfiguration.java b/metadata-service/schema-registry-servlet/src/test/java/io/datahubproject/openapi/test/SchemaRegistryControllerTestConfiguration.java index 7ab673b0a46fe..6901cd665f166 100644 --- a/metadata-service/schema-registry-servlet/src/test/java/io/datahubproject/openapi/test/SchemaRegistryControllerTestConfiguration.java +++ b/metadata-service/schema-registry-servlet/src/test/java/io/datahubproject/openapi/test/SchemaRegistryControllerTestConfiguration.java @@ -1,6 +1,7 @@ package io.datahubproject.openapi.test; import com.linkedin.metadata.dao.producer.KafkaHealthChecker; +import com.linkedin.metadata.models.registry.EntityRegistry; import org.springframework.boot.test.context.TestConfiguration; import org.springframework.boot.test.mock.mockito.MockBean; import org.springframework.context.annotation.ComponentScan; @@ -11,4 +12,6 @@ @ComponentScan(basePackages = {"com.linkedin.gms.factory.kafka", "com.linkedin.gms.factory.config"}) public class SchemaRegistryControllerTestConfiguration { @MockBean KafkaHealthChecker kafkaHealthChecker; + + @MockBean EntityRegistry entityRegistry; } diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/service/ViewService.java b/metadata-service/services/src/main/java/com/linkedin/metadata/service/ViewService.java index 13bb4a5b9f73b..e01c2d224691a 100644 --- a/metadata-service/services/src/main/java/com/linkedin/metadata/service/ViewService.java +++ b/metadata-service/services/src/main/java/com/linkedin/metadata/service/ViewService.java @@ -11,12 +11,14 @@ import com.linkedin.metadata.entity.AspectUtils; import com.linkedin.metadata.key.DataHubViewKey; import com.linkedin.metadata.utils.EntityKeyUtils; +import com.linkedin.r2.RemoteInvocationException; import com.linkedin.view.DataHubViewDefinition; import com.linkedin.view.DataHubViewInfo; import com.linkedin.view.DataHubViewType; import io.datahubproject.metadata.context.OperationContext; import java.util.Objects; import java.util.UUID; 
+import java.util.concurrent.CompletableFuture; import javax.annotation.Nonnull; import javax.annotation.Nullable; import lombok.extern.slf4j.Slf4j; @@ -177,6 +179,20 @@ public void deleteView(@Nonnull OperationContext opContext, @Nonnull Urn viewUrn try { this.entityClient.deleteEntity( opContext, Objects.requireNonNull(viewUrn, "viewUrn must not be null")); + + // Asynchronously delete all references to the entity (to return quickly) + CompletableFuture.runAsync( + () -> { + try { + this.entityClient.deleteEntityReferences(opContext, viewUrn); + } catch (RemoteInvocationException e) { + log.error( + String.format( + "Caught exception while attempting to clear all entity references for view with urn %s", + viewUrn), + e); + } + }); } catch (Exception e) { throw new RuntimeException(String.format("Failed to delete View with urn %s", viewUrn), e); }
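Stepping back to the MCP throttling pieces of this diff (MetadataChangeProposalConfig, the application.yaml block, and KafkaProducerThrottleFactory): the intent appears to be that when MCL consumer lag on the versioned or timeseries topic exceeds threshold, the MCE listener container is paused and re-checked with an exponential backoff that starts at initialIntervalMs, grows by multiplier, is capped at maxIntervalMs, and gives up after maxAttempts. The helper below only illustrates that backoff arithmetic under those assumptions; it is not the KafkaProducerThrottle implementation.

final class ThrottleBackoffSketch {
  // Wait in milliseconds before re-checking consumer lag on the given attempt (0-based),
  // or -1 once the configured attempts are exhausted.
  static long backoffMs(
      int attempt, int maxAttempts, long initialIntervalMs, long multiplier, long maxIntervalMs) {
    if (attempt >= maxAttempts) {
      return -1L;
    }
    double interval = initialIntervalMs * Math.pow(multiplier, attempt);
    return (long) Math.min(interval, maxIntervalMs);
  }

  public static void main(String[] args) {
    // Using the yaml defaults above: 100ms initial, x10 multiplier, 30s cap, 1000 attempts.
    for (int attempt = 0; attempt < 5; attempt++) {
      System.out.println(backoffMs(attempt, 1000, 100, 10, 30_000)); // 100, 1000, 10000, 30000, 30000
    }
  }
}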