diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/Constants.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/Constants.java
index 0924dbc0c0a6d9..e55f1fd5ecf5bf 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/Constants.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/Constants.java
@@ -20,6 +20,7 @@ private Constants() {}
   public static final String LINEAGE_SCHEMA_FILE = "lineage.graphql";
   public static final String PROPERTIES_SCHEMA_FILE = "properties.graphql";
   public static final String FORMS_SCHEMA_FILE = "forms.graphql";
+  public static final String ASSERTIONS_SCHEMA_FILE = "assertions.graphql";
   public static final String INCIDENTS_SCHEMA_FILE = "incident.graphql";
   public static final String CONNECTIONS_SCHEMA_FILE = "connection.graphql";
   public static final String BROWSE_PATH_DELIMITER = "/";
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java
index 5315a444d07b7b..50a73817678ee0 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java
@@ -118,7 +118,12 @@
 import com.linkedin.datahub.graphql.resolvers.assertion.AssertionRunEventResolver;
 import com.linkedin.datahub.graphql.resolvers.assertion.DeleteAssertionResolver;
 import com.linkedin.datahub.graphql.resolvers.assertion.EntityAssertionsResolver;
-import com.linkedin.datahub.graphql.resolvers.auth.*;
+import com.linkedin.datahub.graphql.resolvers.auth.CreateAccessTokenResolver;
+import com.linkedin.datahub.graphql.resolvers.auth.DebugAccessResolver;
+import com.linkedin.datahub.graphql.resolvers.auth.GetAccessTokenMetadataResolver;
+import com.linkedin.datahub.graphql.resolvers.auth.GetAccessTokenResolver;
+import com.linkedin.datahub.graphql.resolvers.auth.ListAccessTokensResolver;
+import com.linkedin.datahub.graphql.resolvers.auth.RevokeAccessTokenResolver;
 import com.linkedin.datahub.graphql.resolvers.browse.BrowsePathsResolver;
 import com.linkedin.datahub.graphql.resolvers.browse.BrowseResolver;
 import com.linkedin.datahub.graphql.resolvers.browse.EntityBrowsePathsResolver;
@@ -814,6 +819,7 @@ public GraphQLEngine.Builder builder() {
         .addSchema(fileBasedSchema(PROPERTIES_SCHEMA_FILE))
         .addSchema(fileBasedSchema(FORMS_SCHEMA_FILE))
         .addSchema(fileBasedSchema(CONNECTIONS_SCHEMA_FILE))
+        .addSchema(fileBasedSchema(ASSERTIONS_SCHEMA_FILE))
         .addSchema(fileBasedSchema(INCIDENTS_SCHEMA_FILE));
 
     for (GmsGraphQLPlugin plugin : this.graphQLPlugins) {
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/assertion/AssertionRunEventResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/assertion/AssertionRunEventResolver.java
index 3ca78d643679b9..18f8ad85668d8d 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/assertion/AssertionRunEventResolver.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/assertion/AssertionRunEventResolver.java
@@ -98,6 +98,16 @@ public CompletableFuture get(DataFetchingEnvironment e
                         && AssertionResultType.SUCCESS.equals(
                             runEvent.getResult().getType()))
                 .count()));
+        result.setErrored(
+            Math.toIntExact(
+                runEvents.stream()
+                    .filter(
+                        runEvent ->
+                            AssertionRunStatus.COMPLETE.equals(runEvent.getStatus())
+                                && runEvent.getResult() != null
+                                && AssertionResultType.ERROR.equals(
+                                    runEvent.getResult().getType()))
+                    .count()));
         result.setRunEvents(runEvents);
         return result;
       } catch (RemoteInvocationException e) {
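
Reviewer sketch: the resolver above now makes three passes over runEvents, one per counter (failed, succeeded, errored). If these lists grow large, a single-pass tally is a cheap refactor. The sketch below is illustrative only and is not part of the patch; ResultType stands in for the GMS AssertionResultType, and callers would pre-filter to COMPLETE runs as the resolver does.

    import java.util.EnumMap;
    import java.util.List;
    import java.util.Map;

    /** One-pass tally of completed run results, keyed by result type. */
    final class RunResultTally {
      enum ResultType { SUCCESS, FAILURE, ERROR } // stand-in for AssertionResultType

      static Map<ResultType, Integer> tally(List<ResultType> completedResults) {
        Map<ResultType, Integer> counts = new EnumMap<>(ResultType.class);
        for (ResultType type : completedResults) {
          counts.merge(type, 1, Integer::sum); // start at 1 on first hit, then increment
        }
        return counts;
      }
    }
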
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/AssertionMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/AssertionMapper.java
index ca13792b1e92b2..1e7fac2edbc9a9 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/AssertionMapper.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/AssertionMapper.java
@@ -2,6 +2,8 @@
 
 import static com.linkedin.metadata.Constants.GLOBAL_TAGS_ASPECT_NAME;
 
+import com.linkedin.assertion.AssertionAction;
+import com.linkedin.assertion.AssertionActions;
 import com.linkedin.assertion.AssertionInfo;
 import com.linkedin.common.DataPlatformInstance;
 import com.linkedin.common.GlobalTags;
@@ -10,24 +12,40 @@
 import com.linkedin.data.DataMap;
 import com.linkedin.datahub.graphql.QueryContext;
 import com.linkedin.datahub.graphql.generated.Assertion;
+import com.linkedin.datahub.graphql.generated.AssertionActionType;
+import com.linkedin.datahub.graphql.generated.AssertionSource;
+import com.linkedin.datahub.graphql.generated.AssertionSourceType;
 import com.linkedin.datahub.graphql.generated.AssertionStdAggregation;
 import com.linkedin.datahub.graphql.generated.AssertionStdOperator;
 import com.linkedin.datahub.graphql.generated.AssertionStdParameter;
 import com.linkedin.datahub.graphql.generated.AssertionStdParameterType;
 import com.linkedin.datahub.graphql.generated.AssertionStdParameters;
 import com.linkedin.datahub.graphql.generated.AssertionType;
+import com.linkedin.datahub.graphql.generated.AuditStamp;
 import com.linkedin.datahub.graphql.generated.DataPlatform;
 import com.linkedin.datahub.graphql.generated.DatasetAssertionInfo;
 import com.linkedin.datahub.graphql.generated.DatasetAssertionScope;
+import com.linkedin.datahub.graphql.generated.DateInterval;
 import com.linkedin.datahub.graphql.generated.EntityType;
+import com.linkedin.datahub.graphql.generated.FieldAssertionInfo;
+import com.linkedin.datahub.graphql.generated.FixedIntervalSchedule;
+import com.linkedin.datahub.graphql.generated.FreshnessAssertionInfo;
+import com.linkedin.datahub.graphql.generated.SchemaAssertionCompatibility;
+import com.linkedin.datahub.graphql.generated.SchemaAssertionField;
+import com.linkedin.datahub.graphql.generated.SchemaAssertionInfo;
 import com.linkedin.datahub.graphql.generated.SchemaFieldRef;
+import com.linkedin.datahub.graphql.generated.SqlAssertionInfo;
+import com.linkedin.datahub.graphql.generated.VolumeAssertionInfo;
 import com.linkedin.datahub.graphql.types.common.mappers.DataPlatformInstanceAspectMapper;
 import com.linkedin.datahub.graphql.types.common.mappers.StringMapMapper;
+import com.linkedin.datahub.graphql.types.dataset.mappers.SchemaFieldMapper;
+import com.linkedin.datahub.graphql.types.dataset.mappers.SchemaMetadataMapper;
 import com.linkedin.datahub.graphql.types.tag.mappers.GlobalTagsMapper;
 import com.linkedin.entity.EntityResponse;
 import com.linkedin.entity.EnvelopedAspect;
 import com.linkedin.entity.EnvelopedAspectMap;
 import com.linkedin.metadata.Constants;
+import com.linkedin.schema.SchemaField;
 import java.util.Collections;
 import java.util.stream.Collectors;
 import javax.annotation.Nullable;
@@ -48,6 +66,14 @@ public static Assertion map(@Nullable QueryContext context, final EntityResponse
       result.setInfo(
           mapAssertionInfo(context, new AssertionInfo(envelopedAssertionInfo.getValue().data())));
     }
+
+    final EnvelopedAspect envelopedAssertionActions =
+        aspects.get(Constants.ASSERTION_ACTIONS_ASPECT_NAME);
+    if (envelopedAssertionActions != null) {
+      result.setActions(
+          mapAssertionActions(new AssertionActions(envelopedAssertionActions.getValue().data())));
+    }
+
     final EnvelopedAspect envelopedPlatformInstance =
         aspects.get(Constants.DATA_PLATFORM_INSTANCE_ASPECT_NAME);
     if (envelopedPlatformInstance != null) {
@@ -83,20 +109,93 @@ private static com.linkedin.datahub.graphql.generated.Status mapStatus(Status st
     return result;
   }
 
-  private static com.linkedin.datahub.graphql.generated.AssertionInfo mapAssertionInfo(
+  public static com.linkedin.datahub.graphql.generated.AssertionInfo mapAssertionInfo(
       @Nullable QueryContext context, final AssertionInfo gmsAssertionInfo) {
     final com.linkedin.datahub.graphql.generated.AssertionInfo assertionInfo =
         new com.linkedin.datahub.graphql.generated.AssertionInfo();
     assertionInfo.setType(AssertionType.valueOf(gmsAssertionInfo.getType().name()));
+
+    if (gmsAssertionInfo.hasLastUpdated()) {
+      assertionInfo.setLastUpdated(
+          new AuditStamp(
+              gmsAssertionInfo.getLastUpdated().getTime(),
+              gmsAssertionInfo.getLastUpdated().getActor().toString()));
+    }
     if (gmsAssertionInfo.hasDatasetAssertion()) {
       DatasetAssertionInfo datasetAssertion =
           mapDatasetAssertionInfo(context, gmsAssertionInfo.getDatasetAssertion());
       assertionInfo.setDatasetAssertion(datasetAssertion);
     }
-    assertionInfo.setDescription(gmsAssertionInfo.getDescription());
+    // Description
+    if (gmsAssertionInfo.hasDescription()) {
+      assertionInfo.setDescription(gmsAssertionInfo.getDescription());
+    }
+    // FRESHNESS Assertions
+    if (gmsAssertionInfo.hasFreshnessAssertion()) {
+      FreshnessAssertionInfo freshnessAssertionInfo =
+          FreshnessAssertionMapper.mapFreshnessAssertionInfo(
+              context, gmsAssertionInfo.getFreshnessAssertion());
+      assertionInfo.setFreshnessAssertion(freshnessAssertionInfo);
+    }
+    // VOLUME Assertions
+    if (gmsAssertionInfo.hasVolumeAssertion()) {
+      VolumeAssertionInfo volumeAssertionInfo =
+          VolumeAssertionMapper.mapVolumeAssertionInfo(
+              context, gmsAssertionInfo.getVolumeAssertion());
+      assertionInfo.setVolumeAssertion(volumeAssertionInfo);
+    }
+    // SQL Assertions
+    if (gmsAssertionInfo.hasSqlAssertion()) {
+      SqlAssertionInfo sqlAssertionInfo =
+          SqlAssertionMapper.mapSqlAssertionInfo(gmsAssertionInfo.getSqlAssertion());
+      assertionInfo.setSqlAssertion(sqlAssertionInfo);
+    }
+    // FIELD Assertions
+    if (gmsAssertionInfo.hasFieldAssertion()) {
+      FieldAssertionInfo fieldAssertionInfo =
+          FieldAssertionMapper.mapFieldAssertionInfo(context, gmsAssertionInfo.getFieldAssertion());
+      assertionInfo.setFieldAssertion(fieldAssertionInfo);
+    }
+    // SCHEMA Assertions
+    if (gmsAssertionInfo.hasSchemaAssertion()) {
+      SchemaAssertionInfo schemaAssertionInfo =
+          mapSchemaAssertionInfo(context, gmsAssertionInfo.getSchemaAssertion());
+      assertionInfo.setSchemaAssertion(schemaAssertionInfo);
+    }
+    // Source Type
+    if (gmsAssertionInfo.hasSource()) {
+      assertionInfo.setSource(mapSource(gmsAssertionInfo.getSource()));
+    }
     return assertionInfo;
   }
 
+  private static com.linkedin.datahub.graphql.generated.AssertionActions mapAssertionActions(
+      final AssertionActions gmsAssertionActions) {
+    final com.linkedin.datahub.graphql.generated.AssertionActions result =
+        new com.linkedin.datahub.graphql.generated.AssertionActions();
+    if (gmsAssertionActions.hasOnFailure()) {
+      result.setOnFailure(
+          gmsAssertionActions.getOnFailure().stream()
+              .map(AssertionMapper::mapAssertionAction)
+              .collect(Collectors.toList()));
+    }
+    if (gmsAssertionActions.hasOnSuccess()) {
+      result.setOnSuccess(
+          gmsAssertionActions.getOnSuccess().stream()
+              .map(AssertionMapper::mapAssertionAction)
+              .collect(Collectors.toList()));
+    }
+    return result;
+  }
+
+  private static com.linkedin.datahub.graphql.generated.AssertionAction mapAssertionAction(
+      final AssertionAction gmsAssertionAction) {
+    final com.linkedin.datahub.graphql.generated.AssertionAction result =
+        new com.linkedin.datahub.graphql.generated.AssertionAction();
+    result.setType(AssertionActionType.valueOf(gmsAssertionAction.getType().toString()));
+    return result;
+  }
+
   private static DatasetAssertionInfo mapDatasetAssertionInfo(
       @Nullable QueryContext context,
       final com.linkedin.assertion.DatasetAssertionInfo gmsDatasetAssertion) {
@@ -152,7 +251,7 @@ private static SchemaFieldRef mapDatasetSchemaField(final Urn schemaFieldUrn) {
     return new SchemaFieldRef(schemaFieldUrn.toString(), schemaFieldUrn.getEntityKey().get(1));
   }
 
-  private static AssertionStdParameters mapParameters(
+  protected static AssertionStdParameters mapParameters(
       final com.linkedin.assertion.AssertionStdParameters params) {
     final AssertionStdParameters result = new AssertionStdParameters();
     if (params.hasValue()) {
@@ -175,5 +274,61 @@ private static AssertionStdParameter mapParameter(
     return result;
   }
 
-  private AssertionMapper() {}
+  protected static FixedIntervalSchedule mapFixedIntervalSchedule(
+      com.linkedin.assertion.FixedIntervalSchedule gmsFixedIntervalSchedule) {
+    FixedIntervalSchedule fixedIntervalSchedule = new FixedIntervalSchedule();
+    fixedIntervalSchedule.setUnit(DateInterval.valueOf(gmsFixedIntervalSchedule.getUnit().name()));
+    fixedIntervalSchedule.setMultiple(gmsFixedIntervalSchedule.getMultiple());
+    return fixedIntervalSchedule;
+  }
+
+  private static AssertionSource mapSource(final com.linkedin.assertion.AssertionSource gmsSource) {
+    AssertionSource result = new AssertionSource();
+    result.setType(AssertionSourceType.valueOf(gmsSource.getType().toString()));
+    if (gmsSource.hasCreated()) {
+      result.setCreated(
+          new AuditStamp(
+              gmsSource.getCreated().getTime(), gmsSource.getCreated().getActor().toString()));
+    }
+    return result;
+  }
+
+  protected static com.linkedin.datahub.graphql.generated.SchemaFieldSpec mapSchemaFieldSpec(
+      final com.linkedin.schema.SchemaFieldSpec gmsField) {
+    final com.linkedin.datahub.graphql.generated.SchemaFieldSpec result =
+        new com.linkedin.datahub.graphql.generated.SchemaFieldSpec();
+    result.setPath(gmsField.getPath());
+    result.setType(gmsField.getType());
+    result.setNativeType(gmsField.getNativeType());
+    return result;
+  }
+
+  private static SchemaAssertionInfo mapSchemaAssertionInfo(
+      @Nullable final QueryContext context,
+      final com.linkedin.assertion.SchemaAssertionInfo gmsSchemaAssertionInfo) {
+    SchemaAssertionInfo result = new SchemaAssertionInfo();
+    result.setCompatibility(
+        SchemaAssertionCompatibility.valueOf(gmsSchemaAssertionInfo.getCompatibility().name()));
+    result.setEntityUrn(gmsSchemaAssertionInfo.getEntity().toString());
+    result.setSchema(
+        SchemaMetadataMapper.INSTANCE.apply(
+            context, gmsSchemaAssertionInfo.getSchema(), gmsSchemaAssertionInfo.getEntity(), 0L));
+    result.setFields(
+        gmsSchemaAssertionInfo.getSchema().getFields().stream()
+            .map(AssertionMapper::mapSchemaField)
+            .collect(Collectors.toList()));
+    return result;
+  }
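
Reviewer sketch: AssertionMapper translates GMS enums to generated GraphQL enums with valueOf(name()), which throws IllegalArgumentException the moment the two enums drift apart (e.g. a new PDL value lands before the matching GraphQL value). A defensive variant under that assumption, purely illustrative and not part of the patch:

    import java.util.Optional;

    /** Name-based translation between two parallel enums; empty on a missing constant. */
    final class EnumTranslator {
      static <T extends Enum<T>> Optional<T> translate(Class<T> target, Enum<?> source) {
        try {
          return Optional.of(Enum.valueOf(target, source.name()));
        } catch (IllegalArgumentException e) {
          return Optional.empty(); // target enum lacks this constant
        }
      }
    }

Whether to fail fast (the current behavior) or degrade gracefully is a real design choice; failing fast at least surfaces schema drift immediately.
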
+
+  private static SchemaAssertionField mapSchemaField(final SchemaField gmsField) {
+    SchemaAssertionField result = new SchemaAssertionField();
+    result.setPath(gmsField.getFieldPath());
+    result.setType(new SchemaFieldMapper().mapSchemaFieldDataType(gmsField.getType()));
+    if (gmsField.hasNativeDataType()) {
+      result.setNativeType(gmsField.getNativeDataType());
+    }
+    return result;
+  }
+
+  protected AssertionMapper() {}
 }
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/AssertionType.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/AssertionType.java
index 0cf74439132fe8..9c90478f03dc5f 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/AssertionType.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/AssertionType.java
@@ -28,8 +28,8 @@ public class AssertionType
           Constants.ASSERTION_KEY_ASPECT_NAME,
           Constants.ASSERTION_INFO_ASPECT_NAME,
           Constants.DATA_PLATFORM_INSTANCE_ASPECT_NAME,
-          Constants.GLOBAL_TAGS_ASPECT_NAME);
-
+          Constants.GLOBAL_TAGS_ASPECT_NAME,
+          Constants.ASSERTION_ACTIONS_ASPECT_NAME);
   private final EntityClient _entityClient;
 
   public AssertionType(final EntityClient entityClient) {
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/FieldAssertionMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/FieldAssertionMapper.java
new file mode 100644
index 00000000000000..82d041a464c3fb
--- /dev/null
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/FieldAssertionMapper.java
@@ -0,0 +1,92 @@
+package com.linkedin.datahub.graphql.types.assertion;
+
+import com.linkedin.assertion.FieldAssertionInfo;
+import com.linkedin.datahub.graphql.QueryContext;
+import com.linkedin.datahub.graphql.generated.AssertionStdOperator;
+import com.linkedin.datahub.graphql.generated.FieldAssertionType;
+import com.linkedin.datahub.graphql.generated.FieldMetricType;
+import com.linkedin.datahub.graphql.generated.FieldTransformType;
+import com.linkedin.datahub.graphql.generated.FieldValuesFailThresholdType;
+import com.linkedin.datahub.graphql.types.dataset.mappers.DatasetFilterMapper;
+import javax.annotation.Nullable;
+
+public class FieldAssertionMapper extends AssertionMapper {
+
+  public static com.linkedin.datahub.graphql.generated.FieldAssertionInfo mapFieldAssertionInfo(
+      @Nullable final QueryContext context, final FieldAssertionInfo gmsFieldAssertionInfo) {
+    final com.linkedin.datahub.graphql.generated.FieldAssertionInfo result =
+        new com.linkedin.datahub.graphql.generated.FieldAssertionInfo();
+    result.setEntityUrn(gmsFieldAssertionInfo.getEntity().toString());
+    result.setType(FieldAssertionType.valueOf(gmsFieldAssertionInfo.getType().name()));
+    if (gmsFieldAssertionInfo.hasFilter()) {
+      result.setFilter(DatasetFilterMapper.map(context, gmsFieldAssertionInfo.getFilter()));
+    }
+    if (gmsFieldAssertionInfo.hasFieldValuesAssertion()) {
+      result.setFieldValuesAssertion(
+          mapFieldValuesAssertion(gmsFieldAssertionInfo.getFieldValuesAssertion()));
+    }
+    if (gmsFieldAssertionInfo.hasFieldMetricAssertion()) {
+      result.setFieldMetricAssertion(
+          mapFieldMetricAssertion(gmsFieldAssertionInfo.getFieldMetricAssertion()));
+    }
+    return result;
+  }
+
+  private static com.linkedin.datahub.graphql.generated.FieldValuesAssertion
+      mapFieldValuesAssertion(
+          final com.linkedin.assertion.FieldValuesAssertion gmsFieldValuesAssertion) {
+    final com.linkedin.datahub.graphql.generated.FieldValuesAssertion result =
+        new com.linkedin.datahub.graphql.generated.FieldValuesAssertion();
+    result.setField(mapSchemaFieldSpec(gmsFieldValuesAssertion.getField()));
+    result.setOperator(AssertionStdOperator.valueOf(gmsFieldValuesAssertion.getOperator().name()));
+    result.setFailThreshold(
+        mapFieldValuesFailThreshold(gmsFieldValuesAssertion.getFailThreshold()));
+    result.setExcludeNulls(gmsFieldValuesAssertion.isExcludeNulls());
+
+    if (gmsFieldValuesAssertion.hasTransform()) {
+      result.setTransform(mapFieldTransform(gmsFieldValuesAssertion.getTransform()));
+    }
+
+    if (gmsFieldValuesAssertion.hasParameters()) {
+      result.setParameters(mapParameters(gmsFieldValuesAssertion.getParameters()));
+    }
+    return result;
+  }
+
+  private static com.linkedin.datahub.graphql.generated.FieldMetricAssertion
+      mapFieldMetricAssertion(
+          final com.linkedin.assertion.FieldMetricAssertion gmsFieldMetricAssertion) {
+    final com.linkedin.datahub.graphql.generated.FieldMetricAssertion result =
+        new com.linkedin.datahub.graphql.generated.FieldMetricAssertion();
+    result.setField(mapSchemaFieldSpec(gmsFieldMetricAssertion.getField()));
+    result.setMetric(FieldMetricType.valueOf(gmsFieldMetricAssertion.getMetric().name()));
+    result.setOperator(AssertionStdOperator.valueOf(gmsFieldMetricAssertion.getOperator().name()));
+
+    if (gmsFieldMetricAssertion.hasParameters()) {
+      result.setParameters(mapParameters(gmsFieldMetricAssertion.getParameters()));
+    }
+
+    return result;
+  }
+
+  private static com.linkedin.datahub.graphql.generated.FieldTransform mapFieldTransform(
+      final com.linkedin.assertion.FieldTransform gmsFieldTransform) {
+    final com.linkedin.datahub.graphql.generated.FieldTransform result =
+        new com.linkedin.datahub.graphql.generated.FieldTransform();
+    result.setType(FieldTransformType.valueOf(gmsFieldTransform.getType().name()));
+    return result;
+  }
+
+  private static com.linkedin.datahub.graphql.generated.FieldValuesFailThreshold
+      mapFieldValuesFailThreshold(
+          final com.linkedin.assertion.FieldValuesFailThreshold gmsFieldValuesFailThreshold) {
+    final com.linkedin.datahub.graphql.generated.FieldValuesFailThreshold result =
+        new com.linkedin.datahub.graphql.generated.FieldValuesFailThreshold();
+    result.setType(
+        FieldValuesFailThresholdType.valueOf(gmsFieldValuesFailThreshold.getType().name()));
+    result.setValue(gmsFieldValuesFailThreshold.getValue());
+    return result;
+  }
+
+  private FieldAssertionMapper() {}
+}
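
For reviewers unfamiliar with the new field assertions: the fail threshold (COUNT vs PERCENTAGE) decides how many failing rows a FIELD_VALUES assertion tolerates. A minimal sketch of that semantics as read from the schema docs, with a toy per-row predicate result as input; this is not the evaluation engine itself:

    import java.util.List;

    /** Sketch: COUNT caps the number of failing rows, PERCENTAGE caps their share. */
    final class FailThresholdCheck {
      enum ThresholdType { COUNT, PERCENTAGE }

      static boolean passes(List<Boolean> rowPassed, ThresholdType type, long threshold) {
        long failing = rowPassed.stream().filter(p -> !p).count();
        if (type == ThresholdType.COUNT) {
          return failing <= threshold;
        }
        // PERCENTAGE: failing share of evaluated rows must not exceed the threshold
        return rowPassed.isEmpty() || failing * 100 <= threshold * rowPassed.size();
      }
    }
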
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/FreshnessAssertionMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/FreshnessAssertionMapper.java
new file mode 100644
index 00000000000000..22e1c1d8bae9ea
--- /dev/null
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/FreshnessAssertionMapper.java
@@ -0,0 +1,59 @@
+package com.linkedin.datahub.graphql.types.assertion;
+
+import com.linkedin.data.template.GetMode;
+import com.linkedin.datahub.graphql.QueryContext;
+import com.linkedin.datahub.graphql.generated.FreshnessAssertionInfo;
+import com.linkedin.datahub.graphql.generated.FreshnessAssertionSchedule;
+import com.linkedin.datahub.graphql.generated.FreshnessAssertionScheduleType;
+import com.linkedin.datahub.graphql.generated.FreshnessAssertionType;
+import com.linkedin.datahub.graphql.generated.FreshnessCronSchedule;
+import com.linkedin.datahub.graphql.types.dataset.mappers.DatasetFilterMapper;
+import javax.annotation.Nullable;
+
+public class FreshnessAssertionMapper extends AssertionMapper {
+
+  public static FreshnessAssertionInfo mapFreshnessAssertionInfo(
+      @Nullable final QueryContext context,
+      final com.linkedin.assertion.FreshnessAssertionInfo gmsFreshnessAssertionInfo) {
+    FreshnessAssertionInfo freshnessAssertionInfo = new FreshnessAssertionInfo();
+    freshnessAssertionInfo.setEntityUrn(gmsFreshnessAssertionInfo.getEntity().toString());
+    freshnessAssertionInfo.setType(
+        FreshnessAssertionType.valueOf(gmsFreshnessAssertionInfo.getType().name()));
+    if (gmsFreshnessAssertionInfo.hasSchedule()) {
+      freshnessAssertionInfo.setSchedule(
+          mapFreshnessAssertionSchedule(gmsFreshnessAssertionInfo.getSchedule()));
+    }
+    if (gmsFreshnessAssertionInfo.hasFilter()) {
+      freshnessAssertionInfo.setFilter(
+          DatasetFilterMapper.map(context, gmsFreshnessAssertionInfo.getFilter()));
+    }
+    return freshnessAssertionInfo;
+  }
+
+  private static FreshnessCronSchedule mapFreshnessCronSchedule(
+      final com.linkedin.assertion.FreshnessCronSchedule gmsCronSchedule) {
+    FreshnessCronSchedule cronSchedule = new FreshnessCronSchedule();
+    cronSchedule.setCron(gmsCronSchedule.getCron());
+    cronSchedule.setTimezone(gmsCronSchedule.getTimezone());
+    cronSchedule.setWindowStartOffsetMs(gmsCronSchedule.getWindowStartOffsetMs(GetMode.NULL));
+    return cronSchedule;
+  }
+
+  private static FreshnessAssertionSchedule mapFreshnessAssertionSchedule(
+      final com.linkedin.assertion.FreshnessAssertionSchedule gmsFreshnessAssertionSchedule) {
+    FreshnessAssertionSchedule freshnessAssertionSchedule = new FreshnessAssertionSchedule();
+    freshnessAssertionSchedule.setType(
+        FreshnessAssertionScheduleType.valueOf(gmsFreshnessAssertionSchedule.getType().name()));
+    if (gmsFreshnessAssertionSchedule.hasCron()) {
+      freshnessAssertionSchedule.setCron(
+          mapFreshnessCronSchedule(gmsFreshnessAssertionSchedule.getCron()));
+    }
+    if (gmsFreshnessAssertionSchedule.hasFixedInterval()) {
+      freshnessAssertionSchedule.setFixedInterval(
+          mapFixedIntervalSchedule(gmsFreshnessAssertionSchedule.getFixedInterval()));
+    }
+    return freshnessAssertionSchedule;
+  }
+
+  private FreshnessAssertionMapper() {}
+}
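
For context on FIXED_INTERVAL freshness schedules ("every 24 hours"): the assertion passes when the last change falls inside the window ending at evaluation time. A minimal sketch of that window arithmetic, assuming ChronoUnit as a stand-in for the schema's DateInterval unit; the real evaluation happens server-side, not in this mapper:

    import java.time.Duration;
    import java.time.Instant;
    import java.time.temporal.ChronoUnit;

    /** Freshness window arithmetic for a fixed-interval schedule. */
    final class FreshnessWindow {
      static Instant windowStart(Instant evaluatedAt, ChronoUnit unit, int multiple) {
        return evaluatedAt.minus(Duration.of(multiple, unit)); // e.g. HOURS x 24
      }

      static boolean isFresh(Instant lastChange, Instant evaluatedAt, ChronoUnit unit, int multiple) {
        return !lastChange.isBefore(windowStart(evaluatedAt, unit, multiple));
      }
    }
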
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/SqlAssertionMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/SqlAssertionMapper.java
new file mode 100644
index 00000000000000..e75d2221164d4d
--- /dev/null
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/SqlAssertionMapper.java
@@ -0,0 +1,27 @@
+package com.linkedin.datahub.graphql.types.assertion;
+
+import com.linkedin.assertion.SqlAssertionInfo;
+import com.linkedin.datahub.graphql.generated.AssertionStdOperator;
+import com.linkedin.datahub.graphql.generated.AssertionValueChangeType;
+import com.linkedin.datahub.graphql.generated.SqlAssertionType;
+
+public class SqlAssertionMapper extends AssertionMapper {
+
+  public static com.linkedin.datahub.graphql.generated.SqlAssertionInfo mapSqlAssertionInfo(
+      final SqlAssertionInfo gmsSqlAssertionInfo) {
+    final com.linkedin.datahub.graphql.generated.SqlAssertionInfo result =
+        new com.linkedin.datahub.graphql.generated.SqlAssertionInfo();
+    result.setEntityUrn(gmsSqlAssertionInfo.getEntity().toString());
+    result.setType(SqlAssertionType.valueOf(gmsSqlAssertionInfo.getType().name()));
+    result.setStatement(gmsSqlAssertionInfo.getStatement());
+    result.setOperator(AssertionStdOperator.valueOf(gmsSqlAssertionInfo.getOperator().name()));
+    result.setParameters(mapParameters(gmsSqlAssertionInfo.getParameters()));
+    if (gmsSqlAssertionInfo.hasChangeType()) {
+      result.setChangeType(
+          AssertionValueChangeType.valueOf(gmsSqlAssertionInfo.getChangeType().name()));
+    }
+    return result;
+  }
+
+  private SqlAssertionMapper() {}
+}
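
The SQL assertion above funnels an arbitrary query result through a standard operator. For readers, a sketch of how such numeric operators typically evaluate; the enum here is a small illustrative subset, not the real AssertionStdOperator:

    /** Applies a numeric comparison operator to a SQL metric result. */
    final class MetricPredicate {
      enum Op { GREATER_THAN, LESS_THAN, EQUAL_TO, BETWEEN }

      static boolean test(double metric, Op op, double value, double maxForBetween) {
        switch (op) {
          case GREATER_THAN: return metric > value;
          case LESS_THAN: return metric < value;
          case EQUAL_TO: return metric == value;
          case BETWEEN: return metric >= value && metric <= maxForBetween;
          default: throw new IllegalArgumentException("Unsupported operator: " + op);
        }
      }
    }
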
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/VolumeAssertionMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/VolumeAssertionMapper.java
new file mode 100644
index 00000000000000..3d0294c45e5205
--- /dev/null
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/assertion/VolumeAssertionMapper.java
@@ -0,0 +1,115 @@
+package com.linkedin.datahub.graphql.types.assertion;
+
+import com.linkedin.assertion.VolumeAssertionInfo;
+import com.linkedin.datahub.graphql.QueryContext;
+import com.linkedin.datahub.graphql.generated.AssertionStdOperator;
+import com.linkedin.datahub.graphql.generated.AssertionValueChangeType;
+import com.linkedin.datahub.graphql.generated.IncrementingSegmentFieldTransformerType;
+import com.linkedin.datahub.graphql.generated.VolumeAssertionType;
+import com.linkedin.datahub.graphql.types.dataset.mappers.DatasetFilterMapper;
+import javax.annotation.Nullable;
+
+public class VolumeAssertionMapper extends AssertionMapper {
+
+  public static com.linkedin.datahub.graphql.generated.VolumeAssertionInfo mapVolumeAssertionInfo(
+      @Nullable final QueryContext context, final VolumeAssertionInfo gmsVolumeAssertionInfo) {
+    final com.linkedin.datahub.graphql.generated.VolumeAssertionInfo result =
+        new com.linkedin.datahub.graphql.generated.VolumeAssertionInfo();
+    result.setEntityUrn(gmsVolumeAssertionInfo.getEntity().toString());
+    result.setType(VolumeAssertionType.valueOf(gmsVolumeAssertionInfo.getType().name()));
+    if (gmsVolumeAssertionInfo.hasFilter()) {
+      result.setFilter(DatasetFilterMapper.map(context, gmsVolumeAssertionInfo.getFilter()));
+    }
+    if (gmsVolumeAssertionInfo.hasRowCountTotal()) {
+      result.setRowCountTotal(mapRowCountTotal(gmsVolumeAssertionInfo.getRowCountTotal()));
+    }
+    if (gmsVolumeAssertionInfo.hasRowCountChange()) {
+      result.setRowCountChange(mapRowCountChange(gmsVolumeAssertionInfo.getRowCountChange()));
+    }
+    if (gmsVolumeAssertionInfo.hasIncrementingSegmentRowCountTotal()) {
+      result.setIncrementingSegmentRowCountTotal(
+          mapIncrementingSegmentRowCountTotal(
+              gmsVolumeAssertionInfo.getIncrementingSegmentRowCountTotal()));
+    }
+    if (gmsVolumeAssertionInfo.hasIncrementingSegmentRowCountChange()) {
+      result.setIncrementingSegmentRowCountChange(
+          mapIncrementingSegmentRowCountChange(
+              gmsVolumeAssertionInfo.getIncrementingSegmentRowCountChange()));
+    }
+    return result;
+  }
+
+  private static com.linkedin.datahub.graphql.generated.RowCountTotal mapRowCountTotal(
+      final com.linkedin.assertion.RowCountTotal gmsRowCountTotal) {
+    final com.linkedin.datahub.graphql.generated.RowCountTotal result =
+        new com.linkedin.datahub.graphql.generated.RowCountTotal();
+    result.setOperator(AssertionStdOperator.valueOf(gmsRowCountTotal.getOperator().name()));
+    result.setParameters(mapParameters(gmsRowCountTotal.getParameters()));
+    return result;
+  }
+
+  private static com.linkedin.datahub.graphql.generated.RowCountChange mapRowCountChange(
+      final com.linkedin.assertion.RowCountChange gmsRowCountChange) {
+    final com.linkedin.datahub.graphql.generated.RowCountChange result =
+        new com.linkedin.datahub.graphql.generated.RowCountChange();
+    result.setOperator(AssertionStdOperator.valueOf(gmsRowCountChange.getOperator().name()));
+    result.setParameters(mapParameters(gmsRowCountChange.getParameters()));
+    result.setType(AssertionValueChangeType.valueOf(gmsRowCountChange.getType().name()));
+    return result;
+  }
+
+  private static com.linkedin.datahub.graphql.generated.IncrementingSegmentRowCountTotal
+      mapIncrementingSegmentRowCountTotal(
+          final com.linkedin.assertion.IncrementingSegmentRowCountTotal
+              gmsIncrementingSegmentRowCountTotal) {
+    final com.linkedin.datahub.graphql.generated.IncrementingSegmentRowCountTotal result =
+        new com.linkedin.datahub.graphql.generated.IncrementingSegmentRowCountTotal();
+    result.setOperator(
+        AssertionStdOperator.valueOf(gmsIncrementingSegmentRowCountTotal.getOperator().name()));
+    result.setParameters(mapParameters(gmsIncrementingSegmentRowCountTotal.getParameters()));
+    result.setSegment(mapIncrementingSegmentSpec(gmsIncrementingSegmentRowCountTotal.getSegment()));
+    return result;
+  }
+
+  private static com.linkedin.datahub.graphql.generated.IncrementingSegmentRowCountChange
+      mapIncrementingSegmentRowCountChange(
+          final com.linkedin.assertion.IncrementingSegmentRowCountChange
+              gmsIncrementingSegmentRowCountChange) {
+    final com.linkedin.datahub.graphql.generated.IncrementingSegmentRowCountChange result =
+        new com.linkedin.datahub.graphql.generated.IncrementingSegmentRowCountChange();
+    result.setOperator(
+        AssertionStdOperator.valueOf(gmsIncrementingSegmentRowCountChange.getOperator().name()));
+    result.setParameters(mapParameters(gmsIncrementingSegmentRowCountChange.getParameters()));
+    result.setSegment(
+        mapIncrementingSegmentSpec(gmsIncrementingSegmentRowCountChange.getSegment()));
+    result.setType(
+        AssertionValueChangeType.valueOf(gmsIncrementingSegmentRowCountChange.getType().name()));
+    return result;
+  }
+
+  private static com.linkedin.datahub.graphql.generated.IncrementingSegmentSpec
+      mapIncrementingSegmentSpec(final com.linkedin.assertion.IncrementingSegmentSpec gmsSegment) {
+    final com.linkedin.datahub.graphql.generated.IncrementingSegmentSpec result =
+        new com.linkedin.datahub.graphql.generated.IncrementingSegmentSpec();
+    result.setField(mapSchemaFieldSpec(gmsSegment.getField()));
+    if (gmsSegment.hasTransformer()) {
+      result.setTransformer(mapIncrementingSegmentFieldTransformer(gmsSegment.getTransformer()));
+    }
+    return result;
+  }
+
+  private static com.linkedin.datahub.graphql.generated.IncrementingSegmentFieldTransformer
+      mapIncrementingSegmentFieldTransformer(
+          final com.linkedin.assertion.IncrementingSegmentFieldTransformer gmsTransformer) {
+    final com.linkedin.datahub.graphql.generated.IncrementingSegmentFieldTransformer result =
+        new com.linkedin.datahub.graphql.generated.IncrementingSegmentFieldTransformer();
+    result.setType(
+        IncrementingSegmentFieldTransformerType.valueOf(gmsTransformer.getType().name()));
+    if (gmsTransformer.hasNativeType()) {
+      result.setNativeType(gmsTransformer.getNativeType());
+    }
+    return result;
+  }
+
+  private VolumeAssertionMapper() {}
+}
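
RowCountChange supports ABSOLUTE and PERCENTAGE change types (see AssertionValueChangeType in the schema below). A sketch of the observed-change computation those types imply; the actual evaluation lives server-side and is not part of this mapper:

    /** Observed change between two row counts, absolute or percentage. */
    final class RowCountDelta {
      enum ChangeType { ABSOLUTE, PERCENTAGE }

      static double observedChange(long previousCount, long currentCount, ChangeType type) {
        long delta = currentCount - previousCount;
        if (type == ChangeType.ABSOLUTE) {
          return delta;
        }
        if (previousCount == 0) {
          throw new ArithmeticException("percentage change undefined for an empty previous segment");
        }
        return 100.0 * delta / previousCount; // PERCENTAGE, relative to the previous count
      }
    }
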
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataset/mappers/SchemaFieldMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataset/mappers/SchemaFieldMapper.java
index a2cc9d5a66edd9..3674186ac23fe6 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataset/mappers/SchemaFieldMapper.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataset/mappers/SchemaFieldMapper.java
@@ -51,7 +51,7 @@ public SchemaField apply(
     return result;
   }
 
-  private SchemaFieldDataType mapSchemaFieldDataType(
+  public SchemaFieldDataType mapSchemaFieldDataType(
       @Nonnull final com.linkedin.schema.SchemaFieldDataType dataTypeUnion) {
     final com.linkedin.schema.SchemaFieldDataType.Type type = dataTypeUnion.getType();
     if (type.isBytesType()) {
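
Once the schema below ships, clients can read the new run-result counters over plain HTTP. A hedged example: the endpoint path and token handling are deployment-specific assumptions, and the runEvents(status:, limit:) argument names are taken from the resolver wiring above rather than verified against a live server.

    import java.net.URI;
    import java.net.http.HttpClient;
    import java.net.http.HttpRequest;
    import java.net.http.HttpResponse;

    /** Queries total/failed/succeeded/errored counters for one assertion. */
    final class RunEventsQuery {
      static String fetch(String endpoint, String assertionUrn, String token) throws Exception {
        String body =
            "{\"query\":\"{ assertion(urn: \\\"" + assertionUrn + "\\\") {"
                + " runEvents(status: COMPLETE, limit: 10) {"
                + " total failed succeeded errored } } }\"}";
        HttpRequest request =
            HttpRequest.newBuilder(URI.create(endpoint)) // e.g. http://localhost:8080/api/graphql
                .header("Content-Type", "application/json")
                .header("Authorization", "Bearer " + token)
                .POST(HttpRequest.BodyPublishers.ofString(body))
                .build();
        return HttpClient.newHttpClient()
            .send(request, HttpResponse.BodyHandlers.ofString())
            .body();
      }
    }
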
+ """ + RESOLVE_INCIDENT +} + + +""" +Information about an Freshness assertion. +""" +type FreshnessAssertionInfo { + """ + The urn of the entity that the Freshness assertion is related to + """ + entityUrn: String! + + """ + The type of the Freshness Assertion + """ + type: FreshnessAssertionType! + + """ + Produce FAIL Assertion Result if the asset is not updated on the cadence and within the time range described by the schedule. + """ + schedule: FreshnessAssertionSchedule! + + """ + A filter applied when querying an external Dataset or Table + """ + filter: DatasetFilter +} + +""" +Attributes defining a single Freshness schedule. +""" +type FreshnessAssertionSchedule { + """ + The type of schedule + """ + type: FreshnessAssertionScheduleType! + + """ + A cron schedule. This is populated if the type is CRON. + """ + cron: FreshnessCronSchedule + + """ + A fixed interval schedule. This is populated if the type is FIXED_INTERVAL. + """ + fixedInterval: FixedIntervalSchedule +} + +""" +The type of an Freshness assertion +""" +enum FreshnessAssertionScheduleType { + """ + An schedule based on a CRON schedule representing the expected event times. + """ + CRON + + """ + A scheduled based on a recurring fixed schedule which is used to compute the expected operation window. E.g. "every 24 hours". + """ + FIXED_INTERVAL +} + +""" +A cron-formatted schedule +""" +type FreshnessCronSchedule { + """ + A cron-formatted execution interval, as a cron string, e.g. 1 * * * * + """ + cron: String! + + """ + Timezone in which the cron interval applies, e.g. America/Los Angeles + """ + timezone: String! + + """ + An optional offset in milliseconds to SUBTRACT from the timestamp generated by the cron schedule + to generate the lower bounds of the "Freshness window", or the window of time in which an event must have occurred in order for the Freshness + to be considering passing. + If left empty, the start of the Freshness window will be the _end_ of the previously evaluated Freshness window. + """ + windowStartOffsetMs: Long +} + +""" +A fixed interval schedule. +""" +type FixedIntervalSchedule { + """ + Interval unit such as minute/hour/day etc. + """ + unit: DateInterval! + + """ + How many units. Defaults to 1. + """ + multiple: Int! +} + +""" +The source of an Assertion +""" +type AssertionSource { + """ + The source type + """ + type: AssertionSourceType! + """ + The time at which the assertion was initially created and the actor who created it + """ + created: AuditStamp +} + +""" +Information about the field to use in an assertion +""" +type SchemaFieldSpec { + """ + The field path + """ + path: String! + + """ + The DataHub standard schema field type. + """ + type: String! + + """ + The native field type + """ + nativeType: String! +} + +""" +An enum to represent a type of change in an assertion value, metric, or measurement. +""" +enum AssertionValueChangeType { + """ + A change that is defined in absolute terms. + """ + ABSOLUTE + + """ + A change that is defined in relative terms using percentage change + from the original value. + """ + PERCENTAGE +} + +""" +A type of volume (row count) assertion +""" +enum VolumeAssertionType { + """ + A volume assertion that is evaluated against the total row count of a dataset. + """ + ROW_COUNT_TOTAL + + """ + A volume assertion that is evaluated against an incremental row count of a dataset, + or a row count change. 
+ """ + ROW_COUNT_CHANGE + + """ + A volume assertion that checks the latest "segment" in a table based on an incrementing + column to check whether it's row count falls into a particular range. + This can be used to monitor the row count of an incrementing date-partition column segment. + """ + INCREMENTING_SEGMENT_ROW_COUNT_TOTAL + + """ + A volume assertion that compares the row counts in neighboring "segments" or "partitions" + of an incrementing column. This can be used to track changes between subsequent date partition + in a table, for example. + """ + INCREMENTING_SEGMENT_ROW_COUNT_CHANGE +} + +""" +Attributes defining an ROW_COUNT_TOTAL volume assertion. +""" +type RowCountTotal { + """ + The operator you'd like to apply. + Note that only numeric operators are valid inputs: + GREATER_THAN, GREATER_THAN_OR_EQUAL_TO, EQUAL_TO, LESS_THAN, LESS_THAN_OR_EQUAL_TO, + BETWEEN. + """ + operator: AssertionStdOperator! + + """ + The parameters you'd like to provide as input to the operator. + Note that only numeric parameter types are valid inputs: NUMBER. + """ + parameters: AssertionStdParameters! +} + +""" +Attributes defining an ROW_COUNT_CHANGE volume assertion. +""" +type RowCountChange { + """ + The type of the value used to evaluate the assertion: a fixed absolute value or a relative percentage. + """ + type: AssertionValueChangeType! + + """ + The operator you'd like to apply. + Note that only numeric operators are valid inputs: + GREATER_THAN, GREATER_THAN_OR_EQUAL_TO, EQUAL_TO, LESS_THAN, LESS_THAN_OR_EQUAL_TO, + BETWEEN. + """ + operator: AssertionStdOperator! + + """ + The parameters you'd like to provide as input to the operator. + Note that only numeric parameter types are valid inputs: NUMBER. + """ + parameters: AssertionStdParameters! +} + +""" +Attributes defining an INCREMENTING_SEGMENT_ROW_COUNT_TOTAL volume assertion. +""" +type IncrementingSegmentRowCountTotal { + """ + A specification of how the 'segment' can be derived using a column and an optional transformer function. + """ + segment: IncrementingSegmentSpec! + + """ + The operator you'd like to apply. + Note that only numeric operators are valid inputs: + GREATER_THAN, GREATER_THAN_OR_EQUAL_TO, EQUAL_TO, LESS_THAN, LESS_THAN_OR_EQUAL_TO, + BETWEEN. + """ + operator: AssertionStdOperator! + + """ + The parameters you'd like to provide as input to the operator. + Note that only numeric parameter types are valid inputs: NUMBER. + """ + parameters: AssertionStdParameters! +} + +""" +Attributes defining an INCREMENTING_SEGMENT_ROW_COUNT_CHANGE volume assertion. +""" +type IncrementingSegmentRowCountChange { + """ + A specification of how the 'segment' can be derived using a column and an optional transformer function. + """ + segment: IncrementingSegmentSpec! + + """ + The type of the value used to evaluate the assertion: a fixed absolute value or a relative percentage. + """ + type: AssertionValueChangeType! + + """ + The operator you'd like to apply to the row count value + Note that only numeric operators are valid inputs: + GREATER_THAN, GREATER_THAN_OR_EQUAL_TO, EQUAL_TO, LESS_THAN, LESS_THAN_OR_EQUAL_TO, + BETWEEN. + """ + operator: AssertionStdOperator! + + """ + The parameters you'd like to provide as input to the operator. + Note that only numeric parameter types are valid inputs: NUMBER. + """ + parameters: AssertionStdParameters! +} + +""" +Core attributes required to identify an incrementing segment in a table. 
This type is mainly useful +for tables that constantly increase with new rows being added on a particular cadence (e.g. fact or event tables). + +An incrementing segment represents a logical chunk of data which is INSERTED +into a dataset on a regular interval, along with the presence of a constantly-incrementing column +value such as an event time, date partition, or last modified column. + +An incrementing segment is principally identified by 2 key attributes combined: + +1. A field or column that represents the incrementing value. New rows that are inserted will be identified using this column. + Note that the value of this column may not by itself represent the "bucket" or the "segment" in which the row falls. + +2. [Optional] An transformer function that may be applied to the selected column value in order + to obtain the final "segment identifier" or "bucket identifier". Rows that have the same value after applying the transformation + will be grouped into the same segment, using which the final value (e.g. row count) will be determined. +""" +type IncrementingSegmentSpec { + """ + The field to use to generate segments. It must be constantly incrementing as new rows are inserted. + """ + field: SchemaFieldSpec! + + """ + Optional transformer function to apply to the field in order to obtain the final segment or bucket identifier. + If not provided, then no operator will be applied to the field. (identity function) + """ + transformer: IncrementingSegmentFieldTransformer +} + +""" +The definition of the transformer function that should be applied to a given field / column value in a dataset +in order to determine the segment or bucket that it belongs to, which in turn is used to evaluate +volume assertions. +""" +type IncrementingSegmentFieldTransformer { + """ + The 'standard' operator type. Note that not all source systems will support all operators. + """ + type: IncrementingSegmentFieldTransformerType! + + """ + The 'native' transformer type, useful as a back door if a custom transformer is required. + This field is required if the type is NATIVE. + """ + nativeType: String +} + +""" +The 'standard' transformer type. Note that not all source systems will support all operators. +""" +enum IncrementingSegmentFieldTransformerType { + """ + Rounds a timestamp (in seconds) down to the start of the month. + """ + TIMESTAMP_MS_TO_MINUTE + + """ + Rounds a timestamp (in milliseconds) down to the nearest hour. + """ + TIMESTAMP_MS_TO_HOUR + + """ + Rounds a timestamp (in milliseconds) down to the start of the day. + """ + TIMESTAMP_MS_TO_DATE + + """ + Rounds a timestamp (in milliseconds) down to the start of the month + """ + TIMESTAMP_MS_TO_MONTH + + """ + Rounds a timestamp (in milliseconds) down to the start of the year + """ + TIMESTAMP_MS_TO_YEAR + + """ + Rounds a numeric value down to the nearest integer. + """ + FLOOR + + """ + Rounds a numeric value up to the nearest integer. + """ + CEILING + + """ + A backdoor to provide a native operator type specific to a given source system like + Snowflake, Redshift, BQ, etc. + """ + NATIVE +} + +""" +A definition of a Volume (row count) assertion. +""" +type VolumeAssertionInfo { + """ + The entity targeted by this Volume check. + """ + entityUrn: String! + + """ + The type of the freshness assertion being monitored. + """ + type: VolumeAssertionType! + + """ + Produce FAILURE Assertion Result if the row count of the asset does not meet specific requirements. + Required if type is 'ROW_COUNT_TOTAL'. 
+ """ + rowCountTotal: RowCountTotal + + """ + Produce FAILURE Assertion Result if the row count delta of the asset does not meet specific requirements. + Required if type is 'ROW_COUNT_CHANGE'. + """ + rowCountChange: RowCountChange + + """ + Produce FAILURE Assertion Result if the latest incrementing segment row count total of the asset + does not meet specific requirements. Required if type is 'INCREMENTING_SEGMENT_ROW_COUNT_TOTAL'. + """ + incrementingSegmentRowCountTotal: IncrementingSegmentRowCountTotal + + """ + Produce FAILURE Assertion Result if the incrementing segment row count delta of the asset + does not meet specific requirements. Required if type is 'INCREMENTING_SEGMENT_ROW_COUNT_CHANGE'. + """ + incrementingSegmentRowCountChange: IncrementingSegmentRowCountChange + + """ + A definition of the specific filters that should be applied, when performing monitoring. + If not provided, there is no filter, and the full table is under consideration. + """ + filter: DatasetFilter +} + +""" +The type of the SQL assertion being monitored. +""" +enum SqlAssertionType { + """ + A SQL Metric Assertion, e.g. one based on a numeric value returned by an arbitrary SQL query. + """ + METRIC + + """ + A SQL assertion that is evaluated against the CHANGE in a metric assertion over time. + """ + METRIC_CHANGE +} + +""" +Attributes defining a SQL Assertion +""" +type SqlAssertionInfo { + """ + The type of the SQL assertion being monitored. + """ + type: SqlAssertionType! + + """ + The entity targeted by this SQL check. + """ + entityUrn: String! + + """ + The SQL statement to be executed when evaluating the assertion. + """ + statement: String! + + """ + The type of the value used to evaluate the assertion: a fixed absolute value or a relative percentage. + Required if the type is METRIC_CHANGE. + """ + changeType: AssertionValueChangeType + + """ + The operator you'd like to apply to the result of the SQL query. + """ + operator: AssertionStdOperator! + + """ + The parameters you'd like to provide as input to the operator. + """ + parameters: AssertionStdParameters! +} + +""" +The type of a Field assertion +""" +enum FieldAssertionType { + """ + An assertion used to validate the values contained with a field / column given a set of rows. + """ + FIELD_VALUES + + """ + An assertion used to validate the value of a common field / column metric (e.g. aggregation) + such as null count + percentage, min, max, median, and more. + """ + FIELD_METRIC +} + +""" +The type of the Field Transform +""" +enum FieldTransformType { + """ + Obtain the length of a string field / column (applicable to string types) + """ + LENGTH +} + +""" +The type of failure threshold. +""" +enum FieldValuesFailThresholdType { + """ + The maximum number of column values (i.e. rows) that are allowed + to fail the defined expectations before the assertion officially fails. + """ + COUNT + + """ + The maximum percentage of rows that are allowed + to fail the defined column expectations before the assertion officially fails. + """ + PERCENTAGE +} + +""" +A standard metric that can be derived from the set of values +for a specific field / column of a dataset / table. 
+""" +enum FieldMetricType { + """ + The number of unique values found in the column value set + """ + UNIQUE_COUNT + + """ + The percentage of unique values to total rows for the dataset + """ + UNIQUE_PERCENTAGE + + """ + The number of null values found in the column value set + """ + NULL_COUNT + + """ + The percentage of null values to total rows for the dataset + """ + NULL_PERCENTAGE + + """ + The minimum value in the column set (applies to numeric columns) + """ + MIN + + """ + The maximum value in the column set (applies to numeric columns) + """ + MAX + + """ + The mean length found in the column set (applies to numeric columns) + """ + MEAN + + """ + The median length found in the column set (applies to numeric columns) + """ + MEDIAN + + """ + The stddev length found in the column set (applies to numeric columns) + """ + STDDEV + + """ + The number of negative values found in the value set (applies to numeric columns) + """ + NEGATIVE_COUNT + + """ + The percentage of negative values to total rows for the dataset (applies to numeric columns) + """ + NEGATIVE_PERCENTAGE + + """ + The number of zero values found in the value set (applies to numeric columns) + """ + ZERO_COUNT + + """ + The percentage of zero values to total rows for the dataset (applies to numeric columns) + """ + ZERO_PERCENTAGE + + """ + The minimum length found in the column set (applies to string columns) + """ + MIN_LENGTH + + """ + The maximum length found in the column set (applies to string columns) + """ + MAX_LENGTH + + """ + The number of empty string values found in the value set (applies to string columns). + Note: This is a completely different metric different from NULL_COUNT! + """ + EMPTY_COUNT + + """ + The percentage of empty string values to total rows for the dataset (applies to string columns). + Note: This is a completely different metric different from NULL_PERCENTAGE! + """ + EMPTY_PERCENTAGE +} + +""" +A definition of a Field (Column) assertion. +""" +type FieldAssertionInfo { + """ + The type of the field assertion being monitored. + """ + type: FieldAssertionType! + + """ + The entity targeted by this Field check. + """ + entityUrn: String! + + """ + The definition of an assertion that validates individual values of a field / column for a set of rows. + """ + fieldValuesAssertion: FieldValuesAssertion + + """ + The definition of an assertion that validates a common metric obtained about a field / column for a set of rows. + """ + fieldMetricAssertion: FieldMetricAssertion + + """ + A definition of the specific filters that should be applied, when performing monitoring. + If not provided, there is no filter, and the full table is under consideration. + """ + filter: DatasetFilter +} + +""" +A definition of a Field Values assertion. +""" +type FieldValuesAssertion { + """ + The field under evaluation. + """ + field: SchemaFieldSpec! + + """ + An optional transform to apply to field values before evaluating the operator. + """ + transform: FieldTransform + + """ + The predicate to evaluate against a single value of the field. + Depending on the operator, parameters may be required + """ + operator: AssertionStdOperator! + + """ + Standard parameters required for the assertion. + """ + parameters: AssertionStdParameters + + """ + Additional customization about when the assertion should be officially considered failing. + """ + failThreshold: FieldValuesFailThreshold! + + """ + Whether to ignore or allow nulls when running the values assertion. + """ + excludeNulls: Boolean! 
+} + +""" +Definition of a transform applied to the values of a column / field. +""" +type FieldTransform { + """ + The type of the field transform. + """ + type: FieldTransformType! +} + +type FieldValuesFailThreshold { + """ + The type of failure threshold. + """ + type: FieldValuesFailThresholdType! + + """ + The value of the threshold, either representing a count or percentage. + """ + value: Long! +} + +""" +A definition of a Field Metric assertion. +""" +type FieldMetricAssertion { + """ + The field under evaluation + """ + field: SchemaFieldSpec! + + """ + The specific metric to assert against. + """ + metric: FieldMetricType! + + """ + The predicate to evaluate against the metric for the field / column. + """ + operator: AssertionStdOperator! + + """ + Standard parameters required for the assertion. + """ + parameters: AssertionStdParameters +} + +""" +Information about an Schema assertion +""" +type SchemaAssertionInfo { + """ + The entity targeted by this schema assertion. + """ + entityUrn: String! + + """ + A single field in the schema assertion. + """ + fields: [SchemaAssertionField!]! + + """ + A definition of the expected structure for the asset + Deprecated! Use the simpler 'fields' instead. + """ + schema: SchemaMetadata + + """ + The compatibility level required for the assertion to pass. + """ + compatibility: SchemaAssertionCompatibility! +} diff --git a/datahub-graphql-core/src/main/resources/entity.graphql b/datahub-graphql-core/src/main/resources/entity.graphql index de030f77b0b017..92d4a1723c0b61 100644 --- a/datahub-graphql-core/src/main/resources/entity.graphql +++ b/datahub-graphql-core/src/main/resources/entity.graphql @@ -7508,6 +7508,11 @@ type BatchSpec { The result type of an assertion, success or failure. """ enum AssertionResultType { + """ + The assertion has not yet been fully evaluated. + """ + INIT + """ The assertion succeeded. """ @@ -7517,6 +7522,11 @@ enum AssertionResultType { The assertion failed. """ FAILURE + + """ + The assertion errored. + """ + ERROR } """ @@ -7678,6 +7688,16 @@ enum AssertionStdOperator { """ NOT_IN + """ + Value being asserted is true. + """ + IS_TRUE + + """ + Value being asserted is false. + """ + IS_FALSE + """ Other """ @@ -7824,6 +7844,11 @@ type AssertionRunEventsResult { """ succeeded: Int! + """ + The number of errored run events + """ + errored: Int! + """ The run events themselves """ diff --git a/datahub-graphql-core/src/main/resources/incident.graphql b/datahub-graphql-core/src/main/resources/incident.graphql index f7060b3ae8f67b..c2938543ed9494 100644 --- a/datahub-graphql-core/src/main/resources/incident.graphql +++ b/datahub-graphql-core/src/main/resources/incident.graphql @@ -136,6 +136,36 @@ enum IncidentState { A specific type of incident """ enum IncidentType { + """ + A Freshness Assertion has failed, triggering the incident. + Raised on assets where assertions are configured to generate incidents. + """ + FRESHNESS + + """ + A Volume Assertion has failed, triggering the incident. + Raised on assets where assertions are configured to generate incidents. + """ + VOLUME + + """ + A Field Assertion has failed, triggering the incident. + Raised on assets where assertions are configured to generate incidents. + """ + FIELD + + """ + A SQL Assertion has failed, triggering the incident. + Raised on assets where assertions are configured to generate incidents. + """ + SQL + + """ + A Schema has failed, triggering the incident. + Raised on assets where assertions are configured to generate incidents. 
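
To make the transformer enum above concrete: the TIMESTAMP_MS_TO_* values describe flooring an epoch-millis column into coarser buckets before segment row counts are compared. A small illustrative sketch (day and hour only; month and year flooring need a calendar, which is why they are separate enum values):

    import java.time.Instant;
    import java.time.temporal.ChronoUnit;

    /** Floors epoch-millis values into segment buckets. */
    final class SegmentTransformers {
      static long timestampMsToDate(long epochMillis) {
        return Instant.ofEpochMilli(epochMillis).truncatedTo(ChronoUnit.DAYS).toEpochMilli();
      }

      static long timestampMsToHour(long epochMillis) {
        return Instant.ofEpochMilli(epochMillis).truncatedTo(ChronoUnit.HOURS).toEpochMilli();
      }
    }
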
+ """ + DATA_SCHEMA + """ An operational incident, e.g. failure to materialize a dataset, or failure to execute a task / pipeline. """ @@ -174,6 +204,11 @@ enum IncidentSourceType { The incident was created manually, from either the API or the UI. """ MANUAL + + """ + An assertion has failed, triggering the incident. + """ + ASSERTION_FAILURE } """ diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/assertion/AssertionRunEventResolverTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/assertion/AssertionRunEventResolverTest.java index 7323a62d94bfe2..c047a0d0a3f051 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/assertion/AssertionRunEventResolverTest.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/assertion/AssertionRunEventResolverTest.java @@ -97,6 +97,7 @@ public void testGetSuccess() throws Exception { assertEquals(result.getTotal(), 1); assertEquals(result.getFailed(), 0); assertEquals(result.getSucceeded(), 1); + assertEquals(result.getErrored(), 0); com.linkedin.datahub.graphql.generated.AssertionRunEvent graphqlRunEvent = resolver.get(mockEnv).get().getRunEvents().get(0); diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/AssertionMapperTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/AssertionMapperTest.java new file mode 100644 index 00000000000000..376af14af08f65 --- /dev/null +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/AssertionMapperTest.java @@ -0,0 +1,346 @@ +package com.linkedin.datahub.graphql.types.assertion; + +import static org.testng.Assert.assertEquals; + +import com.google.common.collect.ImmutableList; +import com.linkedin.assertion.AssertionInfo; +import com.linkedin.assertion.AssertionSource; +import com.linkedin.assertion.AssertionStdAggregation; +import com.linkedin.assertion.AssertionStdOperator; +import com.linkedin.assertion.AssertionStdParameter; +import com.linkedin.assertion.AssertionStdParameterType; +import com.linkedin.assertion.AssertionStdParameters; +import com.linkedin.assertion.AssertionType; +import com.linkedin.assertion.DatasetAssertionInfo; +import com.linkedin.assertion.DatasetAssertionScope; +import com.linkedin.assertion.FreshnessAssertionInfo; +import com.linkedin.assertion.FreshnessAssertionSchedule; +import com.linkedin.assertion.FreshnessAssertionScheduleType; +import com.linkedin.assertion.FreshnessAssertionType; +import com.linkedin.assertion.FreshnessCronSchedule; +import com.linkedin.assertion.SchemaAssertionCompatibility; +import com.linkedin.assertion.SchemaAssertionInfo; +import com.linkedin.common.GlobalTags; +import com.linkedin.common.TagAssociationArray; +import com.linkedin.common.UrnArray; +import com.linkedin.common.urn.TagUrn; +import com.linkedin.common.urn.UrnUtils; +import com.linkedin.data.DataMap; +import com.linkedin.data.template.StringMap; +import com.linkedin.datahub.graphql.generated.Assertion; +import com.linkedin.datahub.graphql.generated.FixedIntervalSchedule; +import com.linkedin.entity.Aspect; +import com.linkedin.entity.EntityResponse; +import com.linkedin.entity.EnvelopedAspect; +import com.linkedin.entity.EnvelopedAspectMap; +import com.linkedin.metadata.Constants; +import com.linkedin.schema.MySqlDDL; +import com.linkedin.schema.SchemaField; +import com.linkedin.schema.SchemaFieldArray; +import com.linkedin.schema.SchemaFieldDataType; 
+import com.linkedin.schema.SchemaMetadata;
+import com.linkedin.schema.StringType;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+public class AssertionMapperTest {
+
+  @Test
+  public void testMapDatasetAssertion() {
+    // Case 1: Without nullable fields
+    AssertionInfo input = createDatasetAssertionInfoWithoutNullableFields();
+    EntityResponse datasetAssertionEntityResponse = createAssertionInfoEntityResponse(input);
+    Assertion output = AssertionMapper.map(null, datasetAssertionEntityResponse);
+    verifyAssertionInfo(input, output);
+
+    // Case 2: With nullable fields
+    input = createDatasetAssertionInfoWithNullableFields();
+    EntityResponse datasetAssertionEntityResponseWithNullables =
+        createAssertionInfoEntityResponse(input);
+    output = AssertionMapper.map(null, datasetAssertionEntityResponseWithNullables);
+    verifyAssertionInfo(input, output);
+  }
+
+  @Test
+  public void testMapTags() throws Exception {
+    HashMap<String, EnvelopedAspect> aspects = new HashMap<>();
+    AssertionInfo info = createFreshnessAssertionInfoWithoutNullableFields();
+
+    EnvelopedAspect envelopedTagsAspect = new EnvelopedAspect();
+    GlobalTags tags = new GlobalTags();
+    tags.setTags(
+        new TagAssociationArray(
+            Collections.singletonList(
+                new com.linkedin.common.TagAssociation()
+                    .setTag(TagUrn.createFromString("urn:li:tag:test")))));
+    envelopedTagsAspect.setValue(new Aspect(tags.data()));
+
+    aspects.put(Constants.ASSERTION_INFO_ASPECT_NAME, createEnvelopedAspect(info.data()));
+    aspects.put(Constants.GLOBAL_TAGS_ASPECT_NAME, createEnvelopedAspect(tags.data()));
+    EntityResponse response = createEntityResponse(aspects);
+
+    Assertion assertion = AssertionMapper.map(null, response);
+    assertEquals(assertion.getTags().getTags().size(), 1);
+    assertEquals(
+        assertion.getTags().getTags().get(0).getTag().getUrn().toString(), "urn:li:tag:test");
+  }
+
+  @Test
+  public void testMapFreshnessAssertion() {
+    // Case 1: Without nullable fields
+    AssertionInfo inputInfo = createFreshnessAssertionInfoWithoutNullableFields();
+
+    EntityResponse freshnessAssertionEntityResponse = createAssertionInfoEntityResponse(inputInfo);
+    Assertion output = AssertionMapper.map(null, freshnessAssertionEntityResponse);
+    verifyAssertionInfo(inputInfo, output);
+
+    // Case 2: With nullable fields
+    inputInfo = createFreshnessAssertionInfoWithNullableFields();
+    EntityResponse freshnessAssertionEntityResponseWithNullables =
+        createAssertionInfoEntityResponse(inputInfo);
+    output = AssertionMapper.map(null, freshnessAssertionEntityResponseWithNullables);
+    verifyAssertionInfo(inputInfo, output);
+  }
+
+  @Test
+  public void testMapDataSchemaAssertion() {
+    AssertionInfo input = createSchemaAssertion();
+    EntityResponse schemaAssertionEntityResponse = createAssertionInfoEntityResponse(input);
+    Assertion output = AssertionMapper.map(null, schemaAssertionEntityResponse);
+    verifyAssertionInfo(input, output);
+  }
+
+  private void verifyAssertionInfo(AssertionInfo input, Assertion output) {
+    Assert.assertNotNull(output);
+    Assert.assertNotNull(output.getInfo());
+    Assert.assertEquals(
+        output.getInfo().getType().toString(), input.getType().toString());
+
+    if (input.hasDatasetAssertion()) {
+      verifyDatasetAssertion(input.getDatasetAssertion(), output.getInfo().getDatasetAssertion());
+    }
+
+    if (input.hasFreshnessAssertion()) {
+      verifyFreshnessAssertion(
+          input.getFreshnessAssertion(), output.getInfo().getFreshnessAssertion());
+    }
+
+    if (input.hasSchemaAssertion()) {
+      verifySchemaAssertion(input.getSchemaAssertion(), output.getInfo().getSchemaAssertion());
+    }
+
+    if (input.hasSource()) {
+      verifySource(input.getSource(), output.getInfo().getSource());
+    }
+  }
+
+  private void verifyDatasetAssertion(
+      DatasetAssertionInfo input,
+      com.linkedin.datahub.graphql.generated.DatasetAssertionInfo output) {
+    Assert.assertEquals(output.getOperator().toString(), input.getOperator().toString());
+    Assert.assertEquals(output.getScope().toString(), input.getScope().toString());
+    Assert.assertEquals(output.getDatasetUrn(), input.getDataset().toString());
+    if (input.hasAggregation()) {
+      Assert.assertEquals(output.getAggregation().toString(), input.getAggregation().toString());
+    }
+    if (input.hasNativeType()) {
+      Assert.assertEquals(output.getNativeType(), input.getNativeType().toString());
+    }
+    if (input.hasLogic()) {
+      Assert.assertEquals(output.getLogic(), input.getLogic());
+    }
+    if (input.hasFields()) {
+      Assert.assertTrue(
+          input.getFields().stream()
+              .allMatch(
+                  field ->
+                      output.getFields().stream()
+                          .anyMatch(outField -> field.toString().equals(outField.getUrn()))));
+    }
+  }
+
+  private void verifyFreshnessAssertion(
+      FreshnessAssertionInfo input,
+      com.linkedin.datahub.graphql.generated.FreshnessAssertionInfo output) {
+    Assert.assertEquals(output.getType().toString(), input.getType().toString());
+    Assert.assertEquals(output.getEntityUrn(), input.getEntity().toString());
+    if (input.hasSchedule()) {
+      verifyFreshnessSchedule(input.getSchedule(), output.getSchedule());
+    }
+  }
+
+  private void verifySchemaAssertion(
+      SchemaAssertionInfo input,
+      com.linkedin.datahub.graphql.generated.SchemaAssertionInfo output) {
+    Assert.assertEquals(output.getEntityUrn(), input.getEntity().toString());
+    Assert.assertEquals(output.getCompatibility().toString(), input.getCompatibility().toString());
+    Assert.assertEquals(
+        output.getSchema().getFields().size(), input.getSchema().getFields().size());
+  }
+
+  private void verifyCronSchedule(
+      FreshnessCronSchedule input,
+      com.linkedin.datahub.graphql.generated.FreshnessCronSchedule output) {
+    Assert.assertEquals(output.getCron(), input.getCron());
+    Assert.assertEquals(output.getTimezone(), input.getTimezone());
+    if (input.hasWindowStartOffsetMs()) {
+      Assert.assertEquals(output.getWindowStartOffsetMs(), input.getWindowStartOffsetMs());
+    }
+  }
+
+  private void verifyFreshnessSchedule(
+      FreshnessAssertionSchedule input,
+      com.linkedin.datahub.graphql.generated.FreshnessAssertionSchedule output) {
+    Assert.assertEquals(output.getType().toString(), input.getType().toString());
+    if (input.hasCron()) {
+      verifyCronSchedule(input.getCron(), output.getCron());
+    }
+    if (input.hasFixedInterval()) {
+      verifyFixedIntervalSchedule(input.getFixedInterval(), output.getFixedInterval());
+    }
+  }
+
+  private void verifyFixedIntervalSchedule(
+      com.linkedin.assertion.FixedIntervalSchedule input, FixedIntervalSchedule output) {
+    Assert.assertEquals(output.getMultiple(), (int) input.getMultiple());
+    Assert.assertEquals(output.getUnit().toString(), input.getUnit().toString());
+  }
+
+  private void verifySource(
+      AssertionSource input, com.linkedin.datahub.graphql.generated.AssertionSource output) {
+    Assert.assertEquals(output.getType().toString(), input.getType().toString());
+  }
+
+  private EntityResponse createAssertionInfoEntityResponse(final AssertionInfo info) {
+    HashMap<String, EnvelopedAspect> aspects = new HashMap<>();
+    aspects.put(Constants.ASSERTION_INFO_ASPECT_NAME, createEnvelopedAspect(info.data()));
+
+    return createEntityResponse(aspects);
+  }
+
+  private EntityResponse createEntityResponse(Map<String, EnvelopedAspect> aspects) {
+    EntityResponse entityResponse = new EntityResponse();
+    entityResponse.setUrn(UrnUtils.getUrn("urn:li:assertion:1"));
+    entityResponse.setAspects(new EnvelopedAspectMap(new HashMap<>()));
+    aspects.forEach(
+        (aspectName, envelopedAspect) -> {
+          entityResponse.getAspects().put(aspectName, envelopedAspect);
+        });
+
+    return entityResponse;
+  }
+
+  private EnvelopedAspect createEnvelopedAspect(DataMap dataMap) {
+    EnvelopedAspect envelopedAspect = new EnvelopedAspect();
+    envelopedAspect.setValue(new Aspect(dataMap));
+    return envelopedAspect;
+  }
+
+  private AssertionInfo createDatasetAssertionInfoWithoutNullableFields() {
+    AssertionInfo info = new AssertionInfo();
+    info.setType(com.linkedin.assertion.AssertionType.DATASET);
+    DatasetAssertionInfo datasetAssertionInfo = new DatasetAssertionInfo();
+    datasetAssertionInfo.setDataset(UrnUtils.getUrn("urn:li:dataset:1"));
+    datasetAssertionInfo.setScope(DatasetAssertionScope.DATASET_COLUMN);
+    datasetAssertionInfo.setOperator(AssertionStdOperator.GREATER_THAN);
+    info.setDatasetAssertion(datasetAssertionInfo);
+    return info;
+  }
+
+  private AssertionInfo createDatasetAssertionInfoWithNullableFields() {
+    AssertionInfo infoWithoutNullables = createDatasetAssertionInfoWithoutNullableFields();
+    DatasetAssertionInfo baseInfo = infoWithoutNullables.getDatasetAssertion();
+    baseInfo.setFields(
+        new UrnArray(
+            Arrays.asList(
+                UrnUtils.getUrn(
+                    "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,name,PROD),field)"))));
+    baseInfo.setAggregation(AssertionStdAggregation.SUM);
+    baseInfo.setParameters(createAssertionStdParameters());
+    baseInfo.setNativeType("native_type");
+    baseInfo.setNativeParameters(new StringMap(Collections.singletonMap("key", "value")));
+    baseInfo.setLogic("sample_logic");
+    infoWithoutNullables.setSource(
+        new AssertionSource().setType(com.linkedin.assertion.AssertionSourceType.INFERRED));
+    return infoWithoutNullables;
+  }
+
+  private AssertionInfo createFreshnessAssertionInfoWithoutNullableFields() {
+    AssertionInfo info = new AssertionInfo();
+    info.setType(AssertionType.FRESHNESS);
+    FreshnessAssertionInfo freshnessAssertionInfo = new FreshnessAssertionInfo();
+    freshnessAssertionInfo.setEntity(
+        UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:hive,name,PROD)"));
+    freshnessAssertionInfo.setType(FreshnessAssertionType.DATASET_CHANGE);
+    info.setFreshnessAssertion(freshnessAssertionInfo);
+    return info;
+  }
+
+  private AssertionInfo createFreshnessAssertionInfoWithNullableFields() {
+    AssertionInfo infoWithoutNullables = createFreshnessAssertionInfoWithoutNullableFields();
+    FreshnessAssertionInfo baseInfo = infoWithoutNullables.getFreshnessAssertion();
+    baseInfo.setSchedule(createFreshnessAssertionSchedule());
+    infoWithoutNullables.setSource(
+        new AssertionSource().setType(com.linkedin.assertion.AssertionSourceType.INFERRED));
+    return infoWithoutNullables;
+  }
+
+  private AssertionInfo createSchemaAssertion() {
+    AssertionInfo info = new AssertionInfo();
+    info.setType(AssertionType.DATA_SCHEMA);
+    SchemaAssertionInfo schemaAssertionInfo = new SchemaAssertionInfo();
+    schemaAssertionInfo.setEntity(UrnUtils.getUrn("urn:li:dataset:1"));
+    schemaAssertionInfo.setCompatibility(SchemaAssertionCompatibility.SUPERSET);
+    schemaAssertionInfo.setSchema(
+ new SchemaMetadata() + .setCluster("Test") + .setHash("Test") + .setPlatformSchema(SchemaMetadata.PlatformSchema.create(new MySqlDDL())) + .setFields( + new SchemaFieldArray( + ImmutableList.of( + new SchemaField() + .setType( + new SchemaFieldDataType() + .setType(SchemaFieldDataType.Type.create(new StringType()))) + .setNullable(false) + .setNativeDataType("string") + .setFieldPath("test"))))); + return info; + } + + private AssertionStdParameters createAssertionStdParameters() { + AssertionStdParameters parameters = new AssertionStdParameters(); + parameters.setValue(createAssertionStdParameter()); + parameters.setMinValue(createAssertionStdParameter()); + parameters.setMaxValue(createAssertionStdParameter()); + return parameters; + } + + private AssertionStdParameter createAssertionStdParameter() { + AssertionStdParameter parameter = new AssertionStdParameter(); + parameter.setType(AssertionStdParameterType.NUMBER); + parameter.setValue("100"); + return parameter; + } + + private FreshnessAssertionSchedule createFreshnessAssertionSchedule() { + FreshnessAssertionSchedule schedule = new FreshnessAssertionSchedule(); + schedule.setType(FreshnessAssertionScheduleType.CRON); + schedule.setCron(createCronSchedule()); + return schedule; + } + + private FreshnessCronSchedule createCronSchedule() { + FreshnessCronSchedule cronSchedule = new FreshnessCronSchedule(); + cronSchedule.setCron("0 0 * * *"); + cronSchedule.setTimezone("UTC"); + return cronSchedule; + } +} diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/AssertionTypeTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/AssertionTypeTest.java index dd2b676a941302..33774690b7c7a9 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/AssertionTypeTest.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/AssertionTypeTest.java @@ -7,6 +7,10 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; +import com.linkedin.assertion.AssertionAction; +import com.linkedin.assertion.AssertionActionArray; +import com.linkedin.assertion.AssertionActionType; +import com.linkedin.assertion.AssertionActions; import com.linkedin.assertion.AssertionInfo; import com.linkedin.assertion.AssertionType; import com.linkedin.common.DataPlatformInstance; @@ -48,6 +52,17 @@ public class AssertionTypeTest { new DataPlatformInstance() .setPlatform(new DataPlatformUrn("snowflake")) .setInstance(null, SetMode.IGNORE_NULL); + // Acryl SaaS Only + private static final AssertionActions TEST_ASSERTION_ACTIONS = + new AssertionActions() + .setOnSuccess( + new AssertionActionArray( + ImmutableList.of( + new AssertionAction().setType(AssertionActionType.RAISE_INCIDENT)))) + .setOnFailure( + new AssertionActionArray( + ImmutableList.of( + new AssertionAction().setType(AssertionActionType.RESOLVE_INCIDENT)))); private static final String TEST_ASSERTION_URN_2 = "urn:li:assertion:guid-2"; @@ -69,6 +84,9 @@ public void testBatchLoad() throws Exception { assertion1Aspects.put( Constants.ASSERTION_INFO_ASPECT_NAME, new EnvelopedAspect().setValue(new Aspect(TEST_ASSERTION_INFO.data()))); + assertion1Aspects.put( + Constants.ASSERTION_ACTIONS_ASPECT_NAME, + new EnvelopedAspect().setValue(new Aspect(TEST_ASSERTION_ACTIONS.data()))); Mockito.when( client.batchGetV2( any(), @@ -112,6 +130,12 @@ public void testBatchLoad() throws Exception { 
assertEquals(assertion.getInfo().getType().toString(), AssertionType.DATASET.toString()); assertEquals(assertion.getInfo().getDatasetAssertion(), null); assertEquals(assertion.getPlatform().getUrn(), "urn:li:dataPlatform:snowflake"); + assertEquals( + assertion.getActions().getOnSuccess().get(0).getType(), + com.linkedin.datahub.graphql.generated.AssertionActionType.RAISE_INCIDENT); + assertEquals( + assertion.getActions().getOnFailure().get(0).getType(), + com.linkedin.datahub.graphql.generated.AssertionActionType.RESOLVE_INCIDENT); // Assert second element is null. assertNull(result.get(1)); diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/FieldAssertionMapperTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/FieldAssertionMapperTest.java new file mode 100644 index 00000000000000..7758aaa986fed3 --- /dev/null +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/FieldAssertionMapperTest.java @@ -0,0 +1,100 @@ +package com.linkedin.datahub.graphql.types.assertion; + +import com.linkedin.assertion.AssertionStdOperator; +import com.linkedin.assertion.FieldAssertionInfo; +import com.linkedin.assertion.FieldAssertionType; +import com.linkedin.assertion.FieldMetricAssertion; +import com.linkedin.assertion.FieldMetricType; +import com.linkedin.assertion.FieldTransform; +import com.linkedin.assertion.FieldTransformType; +import com.linkedin.assertion.FieldValuesAssertion; +import com.linkedin.assertion.FieldValuesFailThreshold; +import com.linkedin.assertion.FieldValuesFailThresholdType; +import com.linkedin.common.urn.Urn; +import com.linkedin.dataset.DatasetFilter; +import com.linkedin.dataset.DatasetFilterType; +import com.linkedin.schema.SchemaFieldSpec; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class FieldAssertionMapperTest { + @Test + public void testMapFieldValuesAssertionInfo() throws Exception { + FieldAssertionInfo fieldAssertionInfo = + new FieldAssertionInfo() + .setEntity(new Urn("urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)")) + .setFilter( + new DatasetFilter().setType(DatasetFilterType.SQL).setSql("WHERE value > 5;")) + .setType(FieldAssertionType.FIELD_VALUES) + .setFieldValuesAssertion( + new FieldValuesAssertion() + .setExcludeNulls(true) + .setFailThreshold( + new FieldValuesFailThreshold() + .setType(FieldValuesFailThresholdType.PERCENTAGE) + .setValue(5L)) + .setField( + new SchemaFieldSpec() + .setPath("path") + .setType("STRING") + .setNativeType("VARCHAR")) + .setOperator(AssertionStdOperator.IS_TRUE) + .setTransform(new FieldTransform().setType(FieldTransformType.LENGTH))); + + com.linkedin.datahub.graphql.generated.FieldAssertionInfo result = + FieldAssertionMapper.mapFieldAssertionInfo(null, fieldAssertionInfo); + Assert.assertEquals(result.getEntityUrn(), "urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)"); + Assert.assertEquals( + result.getType(), com.linkedin.datahub.graphql.generated.FieldAssertionType.FIELD_VALUES); + Assert.assertEquals( + result.getFilter().getType(), com.linkedin.datahub.graphql.generated.DatasetFilterType.SQL); + Assert.assertEquals(result.getFilter().getSql(), "WHERE value > 5;"); + Assert.assertEquals(result.getFieldValuesAssertion().getField().getPath(), "path"); + Assert.assertEquals(result.getFieldValuesAssertion().getField().getType(), "STRING"); + Assert.assertEquals(result.getFieldValuesAssertion().getField().getNativeType(), "VARCHAR"); + Assert.assertEquals( + 
result.getFieldValuesAssertion().getOperator(), + com.linkedin.datahub.graphql.generated.AssertionStdOperator.IS_TRUE); + Assert.assertEquals( + result.getFieldValuesAssertion().getTransform().getType(), + com.linkedin.datahub.graphql.generated.FieldTransformType.LENGTH); + Assert.assertEquals(result.getFieldValuesAssertion().getExcludeNulls(), true); + Assert.assertEquals( + result.getFieldValuesAssertion().getFailThreshold().getType(), + com.linkedin.datahub.graphql.generated.FieldValuesFailThresholdType.PERCENTAGE); + Assert.assertEquals( + result.getFieldValuesAssertion().getFailThreshold().getValue(), Long.valueOf(5L)); + } + + @Test + public void testMapFieldMetricAssertionInfo() throws Exception { + FieldAssertionInfo fieldAssertionInfo = + new FieldAssertionInfo() + .setEntity(new Urn("urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)")) + .setType(FieldAssertionType.FIELD_METRIC) + .setFieldMetricAssertion( + new FieldMetricAssertion() + .setField( + new SchemaFieldSpec() + .setPath("path") + .setType("STRING") + .setNativeType("VARCHAR")) + .setOperator(AssertionStdOperator.IS_TRUE) + .setMetric(FieldMetricType.MEDIAN)); + + com.linkedin.datahub.graphql.generated.FieldAssertionInfo result = + FieldAssertionMapper.mapFieldAssertionInfo(null, fieldAssertionInfo); + Assert.assertEquals(result.getEntityUrn(), "urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)"); + Assert.assertEquals( + result.getType(), com.linkedin.datahub.graphql.generated.FieldAssertionType.FIELD_METRIC); + Assert.assertEquals(result.getFieldMetricAssertion().getField().getPath(), "path"); + Assert.assertEquals(result.getFieldMetricAssertion().getField().getType(), "STRING"); + Assert.assertEquals(result.getFieldMetricAssertion().getField().getNativeType(), "VARCHAR"); + Assert.assertEquals( + result.getFieldMetricAssertion().getOperator(), + com.linkedin.datahub.graphql.generated.AssertionStdOperator.IS_TRUE); + Assert.assertEquals( + result.getFieldMetricAssertion().getMetric(), + com.linkedin.datahub.graphql.generated.FieldMetricType.MEDIAN); + } +} diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/FreshnessAssertionMapperTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/FreshnessAssertionMapperTest.java new file mode 100644 index 00000000000000..b69ed02bdfd626 --- /dev/null +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/FreshnessAssertionMapperTest.java @@ -0,0 +1,82 @@ +package com.linkedin.datahub.graphql.types.assertion; + +import com.linkedin.assertion.FixedIntervalSchedule; +import com.linkedin.assertion.FreshnessAssertionInfo; +import com.linkedin.assertion.FreshnessAssertionSchedule; +import com.linkedin.assertion.FreshnessAssertionScheduleType; +import com.linkedin.assertion.FreshnessAssertionType; +import com.linkedin.assertion.FreshnessCronSchedule; +import com.linkedin.common.urn.Urn; +import com.linkedin.dataset.DatasetFilter; +import com.linkedin.dataset.DatasetFilterType; +import com.linkedin.timeseries.CalendarInterval; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class FreshnessAssertionMapperTest { + @Test + public void testMapCronFreshnessAssertionInfo() throws Exception { + FreshnessAssertionInfo freshnessAssertionInfo = + new FreshnessAssertionInfo() + .setEntity(new Urn("urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)")) + .setType(FreshnessAssertionType.DATASET_CHANGE) + .setFilter( + new 
DatasetFilter().setType(DatasetFilterType.SQL).setSql("WHERE value > 5;")) + .setSchedule( + new FreshnessAssertionSchedule() + .setType(FreshnessAssertionScheduleType.CRON) + .setCron( + new FreshnessCronSchedule() + .setCron("0 0 0 * * ? *") + .setTimezone("America/Los_Angeles") + .setWindowStartOffsetMs(10L))); + + com.linkedin.datahub.graphql.generated.FreshnessAssertionInfo result = + FreshnessAssertionMapper.mapFreshnessAssertionInfo(null, freshnessAssertionInfo); + Assert.assertEquals(result.getEntityUrn(), "urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)"); + Assert.assertEquals( + result.getType(), + com.linkedin.datahub.graphql.generated.FreshnessAssertionType.DATASET_CHANGE); + Assert.assertEquals( + result.getFilter().getType(), com.linkedin.datahub.graphql.generated.DatasetFilterType.SQL); + Assert.assertEquals(result.getFilter().getSql(), "WHERE value > 5;"); + Assert.assertEquals( + result.getSchedule().getType(), + com.linkedin.datahub.graphql.generated.FreshnessAssertionScheduleType.CRON); + Assert.assertEquals(result.getSchedule().getCron().getCron(), "0 0 0 * * ? *"); + Assert.assertEquals(result.getSchedule().getCron().getTimezone(), "America/Los_Angeles"); + Assert.assertEquals(result.getSchedule().getCron().getWindowStartOffsetMs(), Long.valueOf(10L)); + } + + @Test + public void testMapFixedIntervalFreshnessAssertionInfo() throws Exception { + FreshnessAssertionInfo freshnessAssertionInfo = + new FreshnessAssertionInfo() + .setEntity(new Urn("urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)")) + .setType(FreshnessAssertionType.DATASET_CHANGE) + .setFilter( + new DatasetFilter().setType(DatasetFilterType.SQL).setSql("WHERE value > 5;")) + .setSchedule( + new FreshnessAssertionSchedule() + .setType(FreshnessAssertionScheduleType.FIXED_INTERVAL) + .setFixedInterval( + new FixedIntervalSchedule().setUnit(CalendarInterval.DAY).setMultiple(10))); + + com.linkedin.datahub.graphql.generated.FreshnessAssertionInfo result = + FreshnessAssertionMapper.mapFreshnessAssertionInfo(null, freshnessAssertionInfo); + Assert.assertEquals(result.getEntityUrn(), "urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)"); + Assert.assertEquals( + result.getType(), + com.linkedin.datahub.graphql.generated.FreshnessAssertionType.DATASET_CHANGE); + Assert.assertEquals( + result.getFilter().getType(), com.linkedin.datahub.graphql.generated.DatasetFilterType.SQL); + Assert.assertEquals(result.getFilter().getSql(), "WHERE value > 5;"); + Assert.assertEquals( + result.getSchedule().getType(), + com.linkedin.datahub.graphql.generated.FreshnessAssertionScheduleType.FIXED_INTERVAL); + Assert.assertEquals( + result.getSchedule().getFixedInterval().getUnit(), + com.linkedin.datahub.graphql.generated.DateInterval.DAY); + Assert.assertEquals(result.getSchedule().getFixedInterval().getMultiple(), 10); + } +} diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/SqlAssertionMapperTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/SqlAssertionMapperTest.java new file mode 100644 index 00000000000000..271362c9fd8468 --- /dev/null +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/SqlAssertionMapperTest.java @@ -0,0 +1,78 @@ +package com.linkedin.datahub.graphql.types.assertion; + +import com.linkedin.assertion.AssertionStdOperator; +import com.linkedin.assertion.AssertionStdParameter; +import com.linkedin.assertion.AssertionStdParameterType; +import com.linkedin.assertion.AssertionStdParameters; 
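The freshness mapper tests above build `FreshnessAssertionInfo` records in Java; the same records can be produced from Python using the classes generated off the identical PDL models. A minimal sketch mirroring the cron-schedule case, assuming these generated classes are available in `datahub.metadata.schema_classes` (values are copied from the test purely for illustration):

    from datahub.metadata.schema_classes import (
        FreshnessAssertionInfoClass,
        FreshnessAssertionScheduleClass,
        FreshnessAssertionScheduleTypeClass,
        FreshnessAssertionTypeClass,
        FreshnessCronScheduleClass,
    )

    # A DATASET_CHANGE freshness assertion evaluated daily at midnight,
    # with a 10 ms start offset on the evaluation window.
    info = FreshnessAssertionInfoClass(
        type=FreshnessAssertionTypeClass.DATASET_CHANGE,
        entity="urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)",
        schedule=FreshnessAssertionScheduleClass(
            type=FreshnessAssertionScheduleTypeClass.CRON,
            cron=FreshnessCronScheduleClass(
                cron="0 0 0 * * ? *",
                timezone="America/Los_Angeles",
                windowStartOffsetMs=10,
            ),
        ),
    )
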
+import com.linkedin.assertion.AssertionValueChangeType; +import com.linkedin.assertion.SqlAssertionInfo; +import com.linkedin.assertion.SqlAssertionType; +import com.linkedin.common.urn.Urn; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class SqlAssertionMapperTest { + @Test + public void testMapMetricSqlAssertionInfo() throws Exception { + SqlAssertionInfo sqlAssertionInfo = + new SqlAssertionInfo() + .setEntity(new Urn("urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)")) + .setType(SqlAssertionType.METRIC) + .setStatement("SELECT COUNT(*) FROM foo.bar.baz") + .setOperator(AssertionStdOperator.GREATER_THAN) + .setParameters( + new AssertionStdParameters() + .setValue( + new AssertionStdParameter() + .setType(AssertionStdParameterType.NUMBER) + .setValue(("5")))); + + com.linkedin.datahub.graphql.generated.SqlAssertionInfo result = + SqlAssertionMapper.mapSqlAssertionInfo(sqlAssertionInfo); + Assert.assertEquals(result.getEntityUrn(), "urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)"); + Assert.assertEquals( + result.getType(), com.linkedin.datahub.graphql.generated.SqlAssertionType.METRIC); + Assert.assertEquals(result.getStatement(), "SELECT COUNT(*) FROM foo.bar.baz"); + Assert.assertEquals( + result.getOperator(), + com.linkedin.datahub.graphql.generated.AssertionStdOperator.GREATER_THAN); + Assert.assertEquals( + result.getParameters().getValue().getType(), + com.linkedin.datahub.graphql.generated.AssertionStdParameterType.NUMBER); + Assert.assertEquals(result.getParameters().getValue().getValue(), "5"); + } + + @Test + public void testMapMetricChangeSqlAssertionInfo() throws Exception { + SqlAssertionInfo sqlAssertionInfo = + new SqlAssertionInfo() + .setEntity(new Urn("urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)")) + .setType(SqlAssertionType.METRIC_CHANGE) + .setStatement("SELECT COUNT(*) FROM foo.bar.baz") + .setChangeType(AssertionValueChangeType.ABSOLUTE) + .setOperator(AssertionStdOperator.GREATER_THAN) + .setParameters( + new AssertionStdParameters() + .setValue( + new AssertionStdParameter() + .setType(AssertionStdParameterType.NUMBER) + .setValue(("5")))); + + com.linkedin.datahub.graphql.generated.SqlAssertionInfo result = + SqlAssertionMapper.mapSqlAssertionInfo(sqlAssertionInfo); + Assert.assertEquals(result.getEntityUrn(), "urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)"); + Assert.assertEquals( + result.getType(), com.linkedin.datahub.graphql.generated.SqlAssertionType.METRIC_CHANGE); + Assert.assertEquals(result.getStatement(), "SELECT COUNT(*) FROM foo.bar.baz"); + Assert.assertEquals( + result.getOperator(), + com.linkedin.datahub.graphql.generated.AssertionStdOperator.GREATER_THAN); + Assert.assertEquals( + result.getParameters().getValue().getType(), + com.linkedin.datahub.graphql.generated.AssertionStdParameterType.NUMBER); + Assert.assertEquals(result.getParameters().getValue().getValue(), "5"); + Assert.assertEquals( + result.getChangeType(), + com.linkedin.datahub.graphql.generated.AssertionValueChangeType.ABSOLUTE); + } +} diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/VolumeAssertionMapperTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/VolumeAssertionMapperTest.java new file mode 100644 index 00000000000000..f23fadb6992078 --- /dev/null +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/types/assertion/VolumeAssertionMapperTest.java @@ -0,0 +1,207 @@ +package com.linkedin.datahub.graphql.types.assertion; + +import 
com.linkedin.assertion.AssertionStdOperator; +import com.linkedin.assertion.AssertionStdParameter; +import com.linkedin.assertion.AssertionStdParameterType; +import com.linkedin.assertion.AssertionStdParameters; +import com.linkedin.assertion.AssertionValueChangeType; +import com.linkedin.assertion.IncrementingSegmentFieldTransformer; +import com.linkedin.assertion.IncrementingSegmentFieldTransformerType; +import com.linkedin.assertion.IncrementingSegmentRowCountChange; +import com.linkedin.assertion.IncrementingSegmentRowCountTotal; +import com.linkedin.assertion.RowCountChange; +import com.linkedin.assertion.RowCountTotal; +import com.linkedin.assertion.VolumeAssertionInfo; +import com.linkedin.assertion.VolumeAssertionType; +import com.linkedin.common.urn.Urn; +import com.linkedin.dataset.DatasetFilter; +import com.linkedin.dataset.DatasetFilterType; +import com.linkedin.schema.SchemaFieldSpec; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class VolumeAssertionMapperTest { + @Test + public void testMapRowCountTotalVolumeAssertionInfo() throws Exception { + VolumeAssertionInfo volumeAssertionInfo = + new VolumeAssertionInfo() + .setEntity(new Urn("urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)")) + .setType(VolumeAssertionType.ROW_COUNT_TOTAL) + .setFilter( + new DatasetFilter().setType(DatasetFilterType.SQL).setSql("WHERE value > 5;")) + .setRowCountTotal( + new RowCountTotal() + .setOperator(AssertionStdOperator.GREATER_THAN_OR_EQUAL_TO) + .setParameters( + new AssertionStdParameters() + .setValue( + new AssertionStdParameter() + .setType(AssertionStdParameterType.NUMBER) + .setValue("10")))); + + com.linkedin.datahub.graphql.generated.VolumeAssertionInfo result = + VolumeAssertionMapper.mapVolumeAssertionInfo(null, volumeAssertionInfo); + Assert.assertEquals(result.getEntityUrn(), "urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)"); + Assert.assertEquals( + result.getType(), + com.linkedin.datahub.graphql.generated.VolumeAssertionType.ROW_COUNT_TOTAL); + Assert.assertEquals( + result.getFilter().getType(), com.linkedin.datahub.graphql.generated.DatasetFilterType.SQL); + Assert.assertEquals(result.getFilter().getSql(), "WHERE value > 5;"); + Assert.assertEquals( + result.getRowCountTotal().getOperator(), + com.linkedin.datahub.graphql.generated.AssertionStdOperator.GREATER_THAN_OR_EQUAL_TO); + Assert.assertEquals( + result.getRowCountTotal().getParameters().getValue().getType(), + com.linkedin.datahub.graphql.generated.AssertionStdParameterType.NUMBER); + Assert.assertEquals(result.getRowCountTotal().getParameters().getValue().getValue(), "10"); + } + + @Test + public void testMapRowCountChangeVolumeAssertionInfo() throws Exception { + VolumeAssertionInfo volumeAssertionInfo = + new VolumeAssertionInfo() + .setEntity(new Urn("urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)")) + .setType(VolumeAssertionType.ROW_COUNT_CHANGE) + .setFilter( + new DatasetFilter().setType(DatasetFilterType.SQL).setSql("WHERE value > 5;")) + .setRowCountChange( + new RowCountChange() + .setOperator(AssertionStdOperator.GREATER_THAN_OR_EQUAL_TO) + .setParameters( + new AssertionStdParameters() + .setValue( + new AssertionStdParameter() + .setType(AssertionStdParameterType.NUMBER) + .setValue("10"))) + .setType(AssertionValueChangeType.ABSOLUTE)); + + com.linkedin.datahub.graphql.generated.VolumeAssertionInfo result = + VolumeAssertionMapper.mapVolumeAssertionInfo(null, volumeAssertionInfo); + Assert.assertEquals(result.getEntityUrn(), 
"urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)"); + Assert.assertEquals( + result.getType(), + com.linkedin.datahub.graphql.generated.VolumeAssertionType.ROW_COUNT_CHANGE); + Assert.assertEquals( + result.getFilter().getType(), com.linkedin.datahub.graphql.generated.DatasetFilterType.SQL); + Assert.assertEquals(result.getFilter().getSql(), "WHERE value > 5;"); + Assert.assertEquals( + result.getRowCountChange().getOperator(), + com.linkedin.datahub.graphql.generated.AssertionStdOperator.GREATER_THAN_OR_EQUAL_TO); + Assert.assertEquals( + result.getRowCountChange().getParameters().getValue().getType(), + com.linkedin.datahub.graphql.generated.AssertionStdParameterType.NUMBER); + Assert.assertEquals(result.getRowCountChange().getParameters().getValue().getValue(), "10"); + Assert.assertEquals( + result.getRowCountChange().getType(), + com.linkedin.datahub.graphql.generated.AssertionValueChangeType.ABSOLUTE); + } + + @Test + public void testMapIncrementingSegmentRowCountTotalVolumeAssertionInfo() throws Exception { + VolumeAssertionInfo volumeAssertionInfo = + new VolumeAssertionInfo() + .setEntity(new Urn("urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)")) + .setType(VolumeAssertionType.INCREMENTING_SEGMENT_ROW_COUNT_TOTAL) + .setIncrementingSegmentRowCountTotal( + new IncrementingSegmentRowCountTotal() + .setOperator(AssertionStdOperator.GREATER_THAN_OR_EQUAL_TO) + .setParameters( + new AssertionStdParameters() + .setValue( + new AssertionStdParameter() + .setType(AssertionStdParameterType.NUMBER) + .setValue("10"))) + .setSegment( + new com.linkedin.assertion.IncrementingSegmentSpec() + .setField( + new SchemaFieldSpec() + .setPath("path") + .setNativeType("VARCHAR") + .setType("STRING")) + .setTransformer( + new IncrementingSegmentFieldTransformer() + .setType(IncrementingSegmentFieldTransformerType.CEILING) + .setNativeType("CEILING")))); + + com.linkedin.datahub.graphql.generated.VolumeAssertionInfo result = + VolumeAssertionMapper.mapVolumeAssertionInfo(null, volumeAssertionInfo); + Assert.assertEquals(result.getEntityUrn(), "urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)"); + Assert.assertEquals( + result.getType(), + com.linkedin.datahub.graphql.generated.VolumeAssertionType + .INCREMENTING_SEGMENT_ROW_COUNT_TOTAL); + Assert.assertEquals( + result.getIncrementingSegmentRowCountTotal().getOperator(), + com.linkedin.datahub.graphql.generated.AssertionStdOperator.GREATER_THAN_OR_EQUAL_TO); + Assert.assertEquals( + result.getIncrementingSegmentRowCountTotal().getParameters().getValue().getType(), + com.linkedin.datahub.graphql.generated.AssertionStdParameterType.NUMBER); + Assert.assertEquals( + result.getIncrementingSegmentRowCountTotal().getParameters().getValue().getValue(), "10"); + Assert.assertEquals( + result.getIncrementingSegmentRowCountTotal().getSegment().getField().getPath(), "path"); + Assert.assertEquals( + result.getIncrementingSegmentRowCountTotal().getSegment().getField().getNativeType(), + "VARCHAR"); + Assert.assertEquals( + result.getIncrementingSegmentRowCountTotal().getSegment().getField().getType(), "STRING"); + Assert.assertEquals( + result.getIncrementingSegmentRowCountTotal().getSegment().getTransformer().getType(), + com.linkedin.datahub.graphql.generated.IncrementingSegmentFieldTransformerType.CEILING); + Assert.assertEquals( + result.getIncrementingSegmentRowCountTotal().getSegment().getTransformer().getNativeType(), + "CEILING"); + } + + @Test + public void testMapIncrementingSegmentRowCountChangeVolumeAssertionInfo() throws Exception { + 
VolumeAssertionInfo volumeAssertionInfo = + new VolumeAssertionInfo() + .setEntity(new Urn("urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)")) + .setType(VolumeAssertionType.INCREMENTING_SEGMENT_ROW_COUNT_CHANGE) + .setIncrementingSegmentRowCountChange( + new IncrementingSegmentRowCountChange() + .setType(AssertionValueChangeType.ABSOLUTE) + .setOperator(AssertionStdOperator.GREATER_THAN_OR_EQUAL_TO) + .setParameters( + new AssertionStdParameters() + .setValue( + new AssertionStdParameter() + .setType(AssertionStdParameterType.NUMBER) + .setValue("10"))) + .setSegment( + new com.linkedin.assertion.IncrementingSegmentSpec() + .setField( + new SchemaFieldSpec() + .setPath("path") + .setNativeType("VARCHAR") + .setType("STRING")))); + + com.linkedin.datahub.graphql.generated.VolumeAssertionInfo result = + VolumeAssertionMapper.mapVolumeAssertionInfo(null, volumeAssertionInfo); + Assert.assertEquals(result.getEntityUrn(), "urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)"); + Assert.assertEquals( + result.getType(), + com.linkedin.datahub.graphql.generated.VolumeAssertionType + .INCREMENTING_SEGMENT_ROW_COUNT_CHANGE); + Assert.assertEquals( + result.getIncrementingSegmentRowCountChange().getType(), + com.linkedin.datahub.graphql.generated.AssertionValueChangeType.ABSOLUTE); + Assert.assertEquals( + result.getIncrementingSegmentRowCountChange().getOperator(), + com.linkedin.datahub.graphql.generated.AssertionStdOperator.GREATER_THAN_OR_EQUAL_TO); + Assert.assertEquals( + result.getIncrementingSegmentRowCountChange().getParameters().getValue().getType(), + com.linkedin.datahub.graphql.generated.AssertionStdParameterType.NUMBER); + Assert.assertEquals( + result.getIncrementingSegmentRowCountChange().getParameters().getValue().getValue(), "10"); + Assert.assertEquals( + result.getIncrementingSegmentRowCountChange().getSegment().getField().getPath(), "path"); + Assert.assertEquals( + result.getIncrementingSegmentRowCountChange().getSegment().getField().getNativeType(), + "VARCHAR"); + Assert.assertEquals( + result.getIncrementingSegmentRowCountChange().getSegment().getField().getType(), "STRING"); + } +} diff --git a/docs-website/graphql/generateGraphQLSchema.sh b/docs-website/graphql/generateGraphQLSchema.sh index da14fbc337f903..a904a2e36d7c19 100755 --- a/docs-website/graphql/generateGraphQLSchema.sh +++ b/docs-website/graphql/generateGraphQLSchema.sh @@ -9,6 +9,7 @@ cat ../../datahub-graphql-core/src/main/resources/app.graphql >> combined.graphq cat ../../datahub-graphql-core/src/main/resources/auth.graphql >> combined.graphql cat ../../datahub-graphql-core/src/main/resources/constraints.graphql >> combined.graphql cat ../../datahub-graphql-core/src/main/resources/entity.graphql >> combined.graphql +cat ../../datahub-graphql-core/src/main/resources/assertions.graphql >> combined.graphql cat ../../datahub-graphql-core/src/main/resources/ingestion.graphql >> combined.graphql cat ../../datahub-graphql-core/src/main/resources/recommendation.graphql >> combined.graphql cat ../../datahub-graphql-core/src/main/resources/search.graphql >> combined.graphql diff --git a/li-utils/src/main/java/com/linkedin/metadata/Constants.java b/li-utils/src/main/java/com/linkedin/metadata/Constants.java index 66ed48a428a216..79ae0fbeacd940 100644 --- a/li-utils/src/main/java/com/linkedin/metadata/Constants.java +++ b/li-utils/src/main/java/com/linkedin/metadata/Constants.java @@ -285,6 +285,7 @@ public class Constants { public static final String ASSERTION_INFO_ASPECT_NAME = "assertionInfo"; public static final 
String ASSERTION_RUN_EVENT_ASPECT_NAME = "assertionRunEvent";
   public static final String ASSERTION_RUN_EVENT_STATUS_COMPLETE = "COMPLETE";
+  public static final String ASSERTION_ACTIONS_ASPECT_NAME = "assertionActions";

   // Tests
   public static final String TEST_ENTITY_NAME = "test";
diff --git a/metadata-ingestion/examples/library/assertions_configuration.yml b/metadata-ingestion/examples/library/assertions_configuration.yml
new file mode 100644
index 00000000000000..a44945a30f9a37
--- /dev/null
+++ b/metadata-ingestion/examples/library/assertions_configuration.yml
@@ -0,0 +1,76 @@
+version: 1
+namespace: test-config-id-1
+assertions:
+  # Freshness Assertion
+  - entity: urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.public.test_assertions_all_times,PROD)
+    type: freshness
+    lookback_interval: "1 hour"
+    last_modified_field: col_timestamp
+    schedule:
+      type: cron
+      cron: 0 * * * *
+    meta:
+      entity_qualified_name: TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES
+      entity_schema:
+        - col: col_date
+          native_type: DATE
+  # Volume Assertion
+  - type: volume
+    entity: urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.public.test_assertions_all_times,PROD)
+    metric: row_count
+    condition:
+      type: less_than_or_equal_to
+      value: 1000
+    schedule:
+      type: cron
+      cron: 0 * * * *
+    meta:
+      entity_qualified_name: TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES
+      entity_schema:
+        - col: col_date
+          native_type: DATE
+  # Field Metric Assertion
+  - type: field
+    entity: urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.public.test_assertions_all_times,PROD)
+    field: col_date
+    metric: null_count
+    condition:
+      type: equal_to
+      value: 0
+    schedule:
+      type: cron
+      cron: 0 * * * *
+    meta:
+      entity_qualified_name: TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES
+      entity_schema:
+        - col: col_date
+          native_type: DATE
+  # Field Value Assertion
+  - type: field
+    entity: urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.public.purchase_event,PROD)
+    field: quantity
+    condition:
+      type: between
+      min: 0
+      max: 10
+    schedule:
+      type: on_table_change
+    meta:
+      entity_qualified_name: TEST_DB.PUBLIC.PURCHASE_EVENT
+      entity_schema:
+        - col: quantity
+          native_type: FLOAT
+  # Custom SQL Metric Assertion
+  - type: sql
+    entity: urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.public.purchase_event,PROD)
+    statement: select mode(quantity) from test_db.public.purchase_event
+    condition:
+      type: equal_to
+      value: 5
+    schedule:
+      type: on_table_change
+    meta:
+      entity_qualified_name: TEST_DB.PUBLIC.PURCHASE_EVENT
+      entity_schema:
+        - col: quantity
+          native_type: FLOAT
diff --git a/metadata-ingestion/src/datahub/api/entities/assertion/__init__.py b/metadata-ingestion/src/datahub/api/entities/assertion/__init__.py
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/metadata-ingestion/src/datahub/api/entities/assertion/assertion.py b/metadata-ingestion/src/datahub/api/entities/assertion/assertion.py
new file mode 100644
index 00000000000000..e0975a1c0351c7
--- /dev/null
+++ b/metadata-ingestion/src/datahub/api/entities/assertion/assertion.py
@@ -0,0 +1,57 @@
+from abc import abstractmethod
+from typing import Optional
+
+from datahub.api.entities.assertion.assertion_trigger import AssertionTrigger
+from datahub.configuration.pydantic_migration_helpers import v1_ConfigModel, v1_Field
+from datahub.metadata.com.linkedin.pegasus2avro.assertion import AssertionInfo
+
+
+class BaseAssertionProtocol(v1_ConfigModel):
+    @abstractmethod
+    def get_id(self) -> str:
+        pass
+
+    @abstractmethod
+    def get_assertion_info_aspect(
+        self,
+    ) -> AssertionInfo:
+        pass
+
+    @abstractmethod
+    def get_assertion_trigger(
+        self,
+    ) -> Optional[AssertionTrigger]:
+        pass
+
+
+class BaseAssertion(v1_ConfigModel):
+    id_raw: Optional[str] = v1_Field(
+        default=None,
+        description="The raw id of the assertion. "
+        "If provided, this is used when creating the identifier for this assertion, "
+        "along with the assertion type and entity.",
+    )
+
+    id: Optional[str] = v1_Field(
+        default=None,
+        description="The id of the assertion. "
+        "If provided, this is used as the identifier for this assertion. "
+        "If provided, no other assertion fields are considered to create the identifier.",
+    )
+
+    description: Optional[str] = None
+
+    # Can contain metadata extracted from datahub, e.g.
+    # - entity qualified name
+    # - entity schema
+    meta: Optional[dict] = None
+
+
+class BaseEntityAssertion(BaseAssertion):
+    entity: str = v1_Field(
+        description="The entity urn that the assertion is associated with"
+    )
+
+    trigger: Optional[AssertionTrigger] = v1_Field(
+        description="The trigger schedule for the assertion", alias="schedule"
+    )
diff --git a/metadata-ingestion/src/datahub/api/entities/assertion/assertion_config_spec.py b/metadata-ingestion/src/datahub/api/entities/assertion/assertion_config_spec.py
new file mode 100644
index 00000000000000..08205cc621253f
--- /dev/null
+++ b/metadata-ingestion/src/datahub/api/entities/assertion/assertion_config_spec.py
@@ -0,0 +1,41 @@
+from typing import List, Optional
+
+from ruamel.yaml import YAML
+from typing_extensions import Literal
+
+from datahub.api.entities.assertion.datahub_assertion import DataHubAssertion
+from datahub.configuration.pydantic_migration_helpers import v1_ConfigModel, v1_Field
+
+
+class AssertionsConfigSpec(v1_ConfigModel):
+    """
+    Declarative configuration specification for datahub assertions.
+
+    This model is used as a simpler, Python-native representation to define assertions.
+    It can be easily parsed from an equivalent YAML file.
+
+    Currently, this is converted into a series of assertion MCPs that can be emitted to DataHub.
+    In the future, this will invoke the DataHub GraphQL API to upsert assertions.
+    """
+
+    version: Literal[1]
+
+    id: Optional[str] = v1_Field(
+        default=None,
+        alias="namespace",
+        description="Unique identifier of the assertions configuration file",
+    )
+
+    assertions: List[DataHubAssertion]
+
+    @classmethod
+    def from_yaml(
+        cls,
+        file: str,
+    ) -> "AssertionsConfigSpec":
+        with open(file) as fp:
+            yaml = YAML(typ="rt")  # default, if not specified, is 'rt' (round-trip)
+            orig_dictionary = yaml.load(fp)
+            parsed_spec = AssertionsConfigSpec.parse_obj(orig_dictionary)
+            # parsed_spec._original_yaml_dict = orig_dictionary
+            return parsed_spec
diff --git a/metadata-ingestion/src/datahub/api/entities/assertion/assertion_operator.py b/metadata-ingestion/src/datahub/api/entities/assertion/assertion_operator.py
new file mode 100644
index 00000000000000..8704ed13cb6c30
--- /dev/null
+++ b/metadata-ingestion/src/datahub/api/entities/assertion/assertion_operator.py
@@ -0,0 +1,304 @@
+import json
+from typing import List, Optional, Union
+
+from typing_extensions import Literal, Protocol
+
+from datahub.configuration.pydantic_migration_helpers import v1_ConfigModel
+from datahub.metadata.schema_classes import (
+    AssertionStdOperatorClass,
+    AssertionStdParameterClass,
+    AssertionStdParametersClass,
+    AssertionStdParameterTypeClass,
+)
+
+
+class Operator(Protocol):
+    """Specification for an assertion operator.
+
+    This class exists only for documentation (it is not used for type checking).
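+
+    Illustrative usage (added for documentation only; the operator and values
+    below are arbitrary examples, not defaults):
+
+        op = EqualToOperator(type="equal_to", value=5)
+        op.id()                   # -> "equal_to-5"
+        op.generate_parameters()  # -> AssertionStdParametersClass with a NUMBER parameter "5"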
+ """ + + operator: str + + def id(self) -> str: + ... + + def generate_parameters(self) -> AssertionStdParametersClass: + ... + + +def _generate_assertion_std_parameter( + value: Union[str, int, float, list] +) -> AssertionStdParameterClass: + if isinstance(value, str): + return AssertionStdParameterClass( + value=value, type=AssertionStdParameterTypeClass.STRING + ) + elif isinstance(value, (int, float)): + return AssertionStdParameterClass( + value=str(value), type=AssertionStdParameterTypeClass.NUMBER + ) + elif isinstance(value, list): + return AssertionStdParameterClass( + value=json.dumps(value), type=AssertionStdParameterTypeClass.LIST + ) + else: + raise ValueError( + f"Unsupported assertion parameter {value} of type {type(value)}" + ) + + +Param = Union[str, int, float, List[Union[str, float, int]]] + + +def _generate_assertion_std_parameters( + value: Optional[Param] = None, + min_value: Optional[Param] = None, + max_value: Optional[Param] = None, +) -> AssertionStdParametersClass: + return AssertionStdParametersClass( + value=_generate_assertion_std_parameter(value) if value else None, + minValue=_generate_assertion_std_parameter(min_value) if min_value else None, + maxValue=_generate_assertion_std_parameter(max_value) if max_value else None, + ) + + +class EqualToOperator(v1_ConfigModel): + type: Literal["equal_to"] + value: Union[str, int, float] + + operator: str = AssertionStdOperatorClass.EQUAL_TO + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class NotEqualToOperator(v1_ConfigModel): + type: Literal["not_equal_to"] + value: Union[str, int, float] + + operator: str = AssertionStdOperatorClass.NOT_EQUAL_TO + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class BetweenOperator(v1_ConfigModel): + type: Literal["between"] + min: Union[int, float] + max: Union[int, float] + + operator: str = AssertionStdOperatorClass.BETWEEN + + def id(self) -> str: + return f"{self.type}-{self.min}-{self.max}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters( + min_value=self.min, max_value=self.max + ) + + +class LessThanOperator(v1_ConfigModel): + type: Literal["less_than"] + value: Union[int, float] + + operator: str = AssertionStdOperatorClass.LESS_THAN + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class GreaterThanOperator(v1_ConfigModel): + type: Literal["greater_than"] + value: Union[int, float] + + operator: str = AssertionStdOperatorClass.GREATER_THAN + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class LessThanOrEqualToOperator(v1_ConfigModel): + type: Literal["less_than_or_equal_to"] + value: Union[int, float] + + operator: str = AssertionStdOperatorClass.LESS_THAN_OR_EQUAL_TO + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class GreaterThanOrEqualToOperator(v1_ConfigModel): + type: 
Literal["greater_than_or_equal_to"] + value: Union[int, float] + + operator: str = AssertionStdOperatorClass.GREATER_THAN_OR_EQUAL_TO + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class InOperator(v1_ConfigModel): + type: Literal["in"] + value: List[Union[str, float, int]] + + operator: str = AssertionStdOperatorClass.IN + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class NotInOperator(v1_ConfigModel): + type: Literal["not_in"] + value: List[Union[str, float, int]] + + operator: str = AssertionStdOperatorClass.NOT_IN + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class IsNullOperator(v1_ConfigModel): + type: Literal["is_null"] + + operator: str = AssertionStdOperatorClass.NULL + + def id(self) -> str: + return f"{self.type}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters() + + +class NotNullOperator(v1_ConfigModel): + type: Literal["is_not_null"] + + operator: str = AssertionStdOperatorClass.NOT_NULL + + def id(self) -> str: + return f"{self.type}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters() + + +class IsTrueOperator(v1_ConfigModel): + type: Literal["is_true"] + + operator: str = AssertionStdOperatorClass.IS_TRUE + + def id(self) -> str: + return f"{self.type}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters() + + +class IsFalseOperator(v1_ConfigModel): + type: Literal["is_false"] + + operator: str = AssertionStdOperatorClass.IS_FALSE + + def id(self) -> str: + return f"{self.type}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters() + + +class ContainsOperator(v1_ConfigModel): + type: Literal["contains"] + value: str + + operator: str = AssertionStdOperatorClass.CONTAIN + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class EndsWithOperator(v1_ConfigModel): + type: Literal["ends_with"] + value: str + + operator: str = AssertionStdOperatorClass.END_WITH + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class StartsWithOperator(v1_ConfigModel): + type: Literal["starts_with"] + value: str + + operator: str = AssertionStdOperatorClass.START_WITH + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class MatchesRegexOperator(v1_ConfigModel): + type: Literal["matches_regex"] + value: str + + operator: str = AssertionStdOperatorClass.REGEX_MATCH + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +Operators = Union[ + InOperator, + NotInOperator, 
+ EqualToOperator, + NotEqualToOperator, + BetweenOperator, + LessThanOperator, + LessThanOrEqualToOperator, + GreaterThanOperator, + GreaterThanOrEqualToOperator, + IsNullOperator, + NotNullOperator, + IsTrueOperator, + IsFalseOperator, + ContainsOperator, + EndsWithOperator, + StartsWithOperator, + MatchesRegexOperator, +] diff --git a/metadata-ingestion/src/datahub/api/entities/assertion/assertion_trigger.py b/metadata-ingestion/src/datahub/api/entities/assertion/assertion_trigger.py new file mode 100644 index 00000000000000..d7809164847447 --- /dev/null +++ b/metadata-ingestion/src/datahub/api/entities/assertion/assertion_trigger.py @@ -0,0 +1,52 @@ +from datetime import timedelta +from typing import Union + +import humanfriendly +from typing_extensions import Literal + +from datahub.configuration.pydantic_migration_helpers import ( + v1_ConfigModel, + v1_Field, + v1_validator, +) + + +class CronTrigger(v1_ConfigModel): + type: Literal["cron"] + cron: str = v1_Field( + description="The cron expression to use. See https://crontab.guru/ for help." + ) + timezone: str = v1_Field( + "UTC", + description="The timezone to use for the cron schedule. Defaults to UTC.", + ) + + +class IntervalTrigger(v1_ConfigModel): + type: Literal["interval"] + interval: timedelta + + @v1_validator("interval", pre=True) + def lookback_interval_to_timedelta(cls, v): + if isinstance(v, str): + seconds = humanfriendly.parse_timespan(v) + return timedelta(seconds=seconds) + raise ValueError("Invalid value.") + + +class EntityChangeTrigger(v1_ConfigModel): + type: Literal["on_table_change"] + + +class ManualTrigger(v1_ConfigModel): + type: Literal["manual"] + + +class AssertionTrigger(v1_ConfigModel): + __root__: Union[ + CronTrigger, IntervalTrigger, EntityChangeTrigger, ManualTrigger + ] = v1_Field(discriminator="type") + + @property + def trigger(self): + return self.__root__ diff --git a/metadata-ingestion/src/datahub/api/entities/assertion/compiler_interface.py b/metadata-ingestion/src/datahub/api/entities/assertion/compiler_interface.py new file mode 100644 index 00000000000000..27b43a58530b1e --- /dev/null +++ b/metadata-ingestion/src/datahub/api/entities/assertion/compiler_interface.py @@ -0,0 +1,81 @@ +from abc import abstractmethod +from dataclasses import dataclass, field +from enum import Enum +from pathlib import Path +from typing import Dict, List, Literal + +from datahub.api.entities.assertion.assertion_config_spec import AssertionsConfigSpec +from datahub.ingestion.api.report import Report +from datahub.utilities.lossy_collections import LossyDict, LossyList + + +class StrEnum(str, Enum): + pass + + +class CompileResultArtifactType(StrEnum): + SQL_QUERIES = "SQL_QUERIES" + COMPILE_REPORT = "COMPILE_REPORT" + + +@dataclass +class CompileResultArtifact(Report): + name: str + type: CompileResultArtifactType + path: Path + description: str + + +@dataclass +class AssertionCompilationReport(Report): + """Additional details to debug compilation""" + + num_processed: int = 0 + num_compile_succeeded: int = 0 + num_compile_failed: int = 0 # Likely due to assertion not supported in platform + + warnings: LossyDict[str, LossyList[str]] = field(default_factory=LossyDict) + failures: LossyDict[str, LossyList[str]] = field(default_factory=LossyDict) + + artifacts: List[Path] = field(default_factory=list) + + def report_warning(self, key: str, reason: str) -> None: + warnings = self.warnings.get(key, LossyList()) + warnings.append(reason) + self.warnings[key] = warnings + + def report_failure(self, key: str, 
reason: str) -> None: + failures = self.failures.get(key, LossyList()) + failures.append(reason) + self.failures[key] = failures + + +@dataclass +class AssertionCompilationResult: + """Results of compilation step , along with detailed report object""" + + platform: str + status: Literal["success", "failure"] + + report: AssertionCompilationReport = field( + default_factory=AssertionCompilationReport + ) + + artifacts: List[CompileResultArtifact] = field(default_factory=list) + + def add_artifact(self, artifact: CompileResultArtifact) -> None: + self.artifacts.append(artifact) + self.report.artifacts.append(artifact.path) + + +class AssertionCompiler: + @classmethod + @abstractmethod + def create(cls, output_dir: str, extras: Dict[str, str]) -> "AssertionCompiler": + pass + + @abstractmethod + def compile( + self, assertion_config_spec: AssertionsConfigSpec + ) -> AssertionCompilationResult: + pass diff --git a/metadata-ingestion/src/datahub/api/entities/assertion/datahub_assertion.py b/metadata-ingestion/src/datahub/api/entities/assertion/datahub_assertion.py new file mode 100644 index 00000000000000..ed18b78418d768 --- /dev/null +++ b/metadata-ingestion/src/datahub/api/entities/assertion/datahub_assertion.py @@ -0,0 +1,35 @@ +from typing import Optional, Union + +from datahub.api.entities.assertion.assertion import BaseAssertionProtocol +from datahub.api.entities.assertion.assertion_trigger import AssertionTrigger +from datahub.api.entities.assertion.field_assertion import FieldAssertion +from datahub.api.entities.assertion.freshness_assertion import FreshnessAssertion +from datahub.api.entities.assertion.sql_assertion import SQLAssertion +from datahub.api.entities.assertion.volume_assertion import VolumeAssertion +from datahub.configuration.pydantic_migration_helpers import v1_Field +from datahub.metadata.com.linkedin.pegasus2avro.assertion import AssertionInfo + + +class DataHubAssertion(BaseAssertionProtocol): + __root__: Union[ + FreshnessAssertion, + VolumeAssertion, + SQLAssertion, + FieldAssertion, + # TODO: Add SchemaAssertion + ] = v1_Field(discriminator="type") + + @property + def assertion(self): + return self.__root__.assertion + + def get_assertion_info_aspect( + self, + ) -> AssertionInfo: + return self.__root__.get_assertion_info_aspect() + + def get_id(self) -> str: + return self.__root__.get_id() + + def get_assertion_trigger(self) -> Optional[AssertionTrigger]: + return self.__root__.get_assertion_trigger() diff --git a/metadata-ingestion/src/datahub/api/entities/assertion/field_assertion.py b/metadata-ingestion/src/datahub/api/entities/assertion/field_assertion.py new file mode 100644 index 00000000000000..ae062c3a8e5cbd --- /dev/null +++ b/metadata-ingestion/src/datahub/api/entities/assertion/field_assertion.py @@ -0,0 +1,158 @@ +from enum import Enum +from typing import Optional, Union + +from typing_extensions import Literal + +from datahub.api.entities.assertion.assertion import ( + BaseAssertionProtocol, + BaseEntityAssertion, +) +from datahub.api.entities.assertion.assertion_operator import Operators +from datahub.api.entities.assertion.assertion_trigger import AssertionTrigger +from datahub.api.entities.assertion.field_metric import FieldMetric +from datahub.api.entities.assertion.filter import DatasetFilter +from datahub.configuration.pydantic_migration_helpers import v1_ConfigModel, v1_Field +from datahub.emitter.mce_builder import datahub_guid +from datahub.metadata.com.linkedin.pegasus2avro.assertion import ( + AssertionInfo, + AssertionType, + 
FieldAssertionInfo, + FieldAssertionType, +) +from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaFieldSpec +from datahub.metadata.schema_classes import ( + FieldMetricAssertionClass, + FieldTransformClass, + FieldTransformTypeClass, + FieldValuesAssertionClass, + FieldValuesFailThresholdClass, + FieldValuesFailThresholdTypeClass, +) + + +class FieldValuesFailThreshold(v1_ConfigModel): + type: Literal["count", "percentage"] = v1_Field(default="count") + value: int = v1_Field(default=0) + + def to_field_values_failure_threshold(self) -> FieldValuesFailThresholdClass: + return FieldValuesFailThresholdClass( + type=( + FieldValuesFailThresholdTypeClass.COUNT + if self.type == "count" # compare against the literal string, not Literal["count"] + else FieldValuesFailThresholdTypeClass.PERCENTAGE + ), + value=self.value, + ) + + +class FieldTransform(Enum): + LENGTH = "length" + + +class FieldValuesAssertion(BaseEntityAssertion): + type: Literal["field"] + field: str + field_transform: Optional[FieldTransform] = v1_Field(default=None) + operator: Operators = v1_Field(discriminator="type", alias="condition") + filters: Optional[DatasetFilter] = v1_Field(default=None) + failure_threshold: FieldValuesFailThreshold = v1_Field( + default=FieldValuesFailThreshold() + ) + exclude_nulls: bool = v1_Field(default=True) + + def get_assertion_info( + self, + ) -> AssertionInfo: + return AssertionInfo( + description=self.description, + type=AssertionType.FIELD, + fieldAssertion=FieldAssertionInfo( + type=FieldAssertionType.FIELD_VALUES, + entity=self.entity, + fieldValuesAssertion=FieldValuesAssertionClass( + field=SchemaFieldSpec( + path=self.field, + type="", # Not required + nativeType="", # Not required + ), + operator=self.operator.operator, + parameters=self.operator.generate_parameters(), + failThreshold=self.failure_threshold.to_field_values_failure_threshold(), + excludeNulls=self.exclude_nulls, + transform=( + FieldTransformClass(type=FieldTransformTypeClass.LENGTH) + if self.field_transform == FieldTransform.LENGTH # compare against the enum member + else None + ), + ), + ), + ) + + def get_id(self) -> str: + guid_dict = { + "entity": self.entity, + "type": self.type, + "field": self.field, + "operator": str(self.operator.operator), + "id_raw": self.id_raw, + } + return self.id or datahub_guid(guid_dict) + + +class FieldMetricAssertion(BaseEntityAssertion): + type: Literal["field"] + field: str + operator: Operators = v1_Field(discriminator="type", alias="condition") + metric: FieldMetric + filters: Optional[DatasetFilter] = v1_Field(default=None) + + def get_assertion_info( + self, + ) -> AssertionInfo: + return AssertionInfo( + description=self.description, + type=AssertionType.FIELD, + fieldAssertion=FieldAssertionInfo( + type=FieldAssertionType.FIELD_METRIC, + entity=self.entity, + fieldMetricAssertion=FieldMetricAssertionClass( + field=SchemaFieldSpec( + path=self.field, + type="", # Not required + nativeType="", # Not required + ), + metric=self.metric.name, + operator=self.operator.operator, + parameters=self.operator.generate_parameters(), + ), + ), + ) + + def get_id(self) -> str: + guid_dict = { + "entity": self.entity, + "type": self.type, + "field": self.field, + "metric": self.metric.value, + "id_raw": self.id_raw, + } + return self.id or datahub_guid(guid_dict) + + +class FieldAssertion(BaseAssertionProtocol): + __root__: Union[FieldMetricAssertion, FieldValuesAssertion] + + @property + def assertion(self): + return self.__root__ + + def get_id(self) -> str: + return self.__root__.get_id() + + def get_assertion_info_aspect( + self, + ) -> AssertionInfo: +
return self.__root__.get_assertion_info() + + def get_assertion_trigger(self) -> Optional[AssertionTrigger]: + return self.__root__.trigger diff --git a/metadata-ingestion/src/datahub/api/entities/assertion/field_metric.py b/metadata-ingestion/src/datahub/api/entities/assertion/field_metric.py new file mode 100644 index 00000000000000..7a236da2d562d3 --- /dev/null +++ b/metadata-ingestion/src/datahub/api/entities/assertion/field_metric.py @@ -0,0 +1,21 @@ +from enum import Enum + + +class FieldMetric(Enum): + UNIQUE_COUNT = "unique_count" + UNIQUE_PERCENTAGE = "unique_percentage" + NULL_COUNT = "null_count" + NULL_PERCENTAGE = "null_percentage" + MIN = "min" + MAX = "max" + MEAN = "mean" + MEDIAN = "median" + STDDEV = "stddev" + NEGATIVE_COUNT = "negative_count" + NEGATIVE_PERCENTAGE = "negative_percentage" + ZERO_COUNT = "zero_count" + ZERO_PERCENTAGE = "zero_percentage" + MIN_LENGTH = "min_length" + MAX_LENGTH = "max_length" + EMPTY_COUNT = "empty_count" + EMPTY_PERCENTAGE = "empty_percentage" diff --git a/metadata-ingestion/src/datahub/api/entities/assertion/filter.py b/metadata-ingestion/src/datahub/api/entities/assertion/filter.py new file mode 100644 index 00000000000000..05d75b674d6af9 --- /dev/null +++ b/metadata-ingestion/src/datahub/api/entities/assertion/filter.py @@ -0,0 +1,13 @@ +from typing_extensions import Literal + +from datahub.configuration.pydantic_migration_helpers import v1_ConfigModel + + +class SqlFilter(v1_ConfigModel): + type: Literal["sql"] + sql: str + + +DatasetFilter = SqlFilter +# class DatasetFilter(v1_ConfigModel): +# __root__: Union[SqlFilter] = v1_Field(discriminator="type") diff --git a/metadata-ingestion/src/datahub/api/entities/assertion/freshness_assertion.py b/metadata-ingestion/src/datahub/api/entities/assertion/freshness_assertion.py new file mode 100644 index 00000000000000..f9e1df7d68f271 --- /dev/null +++ b/metadata-ingestion/src/datahub/api/entities/assertion/freshness_assertion.py @@ -0,0 +1,124 @@ +from datetime import timedelta +from enum import Enum +from typing import Optional, Union + +import humanfriendly +from typing_extensions import Literal + +from datahub.api.entities.assertion.assertion import ( + BaseAssertionProtocol, + BaseEntityAssertion, +) +from datahub.api.entities.assertion.assertion_trigger import AssertionTrigger +from datahub.api.entities.assertion.filter import DatasetFilter +from datahub.configuration.pydantic_migration_helpers import v1_Field, v1_validator +from datahub.emitter.mce_builder import datahub_guid +from datahub.metadata.com.linkedin.pegasus2avro.assertion import ( + AssertionInfo, + AssertionType, + FixedIntervalSchedule, + FreshnessAssertionInfo, + FreshnessAssertionSchedule, + FreshnessAssertionScheduleType, + FreshnessAssertionType, + FreshnessCronSchedule, +) +from datahub.metadata.com.linkedin.pegasus2avro.timeseries import CalendarInterval + + +class FreshnessSourceType(Enum): + LAST_MODIFIED_COLUMN = "last_modified_column" + + +class CronFreshnessAssertion(BaseEntityAssertion): + type: Literal["freshness"] + freshness_type: Literal["cron"] + cron: str = v1_Field( + description="The cron expression to use. See https://crontab.guru/ for help." + ) + timezone: str = v1_Field( + "UTC", + description="The timezone to use for the cron schedule. 
Defaults to UTC.", + ) + source_type: FreshnessSourceType = v1_Field( + default=FreshnessSourceType.LAST_MODIFIED_COLUMN + ) + last_modified_field: str + filters: Optional[DatasetFilter] = v1_Field(default=None) + + def get_assertion_info( + self, + ) -> AssertionInfo: + return AssertionInfo( + description=self.description, + type=AssertionType.FRESHNESS, + freshnessAssertion=FreshnessAssertionInfo( + type=FreshnessAssertionType.DATASET_CHANGE, + entity=self.entity, + schedule=FreshnessAssertionSchedule( + type=FreshnessAssertionScheduleType.CRON, + cron=FreshnessCronSchedule(cron=self.cron, timezone=self.timezone), + ), + ), + ) + + +class FixedIntervalFreshnessAssertion(BaseEntityAssertion): + type: Literal["freshness"] + freshness_type: Literal["interval"] = v1_Field(default="interval") + lookback_interval: timedelta + filters: Optional[DatasetFilter] = v1_Field(default=None) + source_type: FreshnessSourceType = v1_Field( + default=FreshnessSourceType.LAST_MODIFIED_COLUMN + ) + last_modified_field: str + + @v1_validator("lookback_interval", pre=True) + def lookback_interval_to_timedelta(cls, v): + if isinstance(v, str): + seconds = humanfriendly.parse_timespan(v) + return timedelta(seconds=seconds) + raise ValueError("Invalid value.") + + def get_assertion_info( + self, + ) -> AssertionInfo: + return AssertionInfo( + description=self.description, + type=AssertionType.FRESHNESS, + freshnessAssertion=FreshnessAssertionInfo( + type=FreshnessAssertionType.DATASET_CHANGE, + entity=self.entity, + schedule=FreshnessAssertionSchedule( + type=FreshnessAssertionScheduleType.FIXED_INTERVAL, + fixedInterval=FixedIntervalSchedule( + unit=CalendarInterval.SECOND, + multiple=int(self.lookback_interval.total_seconds()), # total_seconds() so lookbacks of a day or more are not truncated + ), + ), + ), + ) + + +class FreshnessAssertion(BaseAssertionProtocol): + __root__: Union[FixedIntervalFreshnessAssertion, CronFreshnessAssertion] + + @property + def assertion(self): + return self.__root__ + + def get_id(self) -> str: + guid_dict = { + "entity": self.__root__.entity, + "type": self.__root__.type, + "id_raw": self.__root__.id_raw, + } + return self.__root__.id or datahub_guid(guid_dict) + + def get_assertion_info_aspect( + self, + ) -> AssertionInfo: + return self.__root__.get_assertion_info() + + def get_assertion_trigger(self) -> Optional[AssertionTrigger]: + return self.__root__.trigger diff --git a/metadata-ingestion/src/datahub/api/entities/assertion/sql_assertion.py b/metadata-ingestion/src/datahub/api/entities/assertion/sql_assertion.py new file mode 100644 index 00000000000000..3d12cfde428f4e --- /dev/null +++ b/metadata-ingestion/src/datahub/api/entities/assertion/sql_assertion.py @@ -0,0 +1,91 @@ +from typing import Optional, Union + +from typing_extensions import Literal + +from datahub.api.entities.assertion.assertion import ( + BaseAssertionProtocol, + BaseEntityAssertion, +) +from datahub.api.entities.assertion.assertion_operator import Operators +from datahub.api.entities.assertion.assertion_trigger import AssertionTrigger +from datahub.configuration.pydantic_migration_helpers import v1_Field +from datahub.emitter.mce_builder import datahub_guid +from datahub.metadata.com.linkedin.pegasus2avro.assertion import ( + AssertionInfo, + AssertionType, + AssertionValueChangeType, + SqlAssertionInfo, + SqlAssertionType, +) + + +class SqlMetricAssertion(BaseEntityAssertion): + type: Literal["sql"] + statement: str + operator: Operators = v1_Field(discriminator="type", alias="condition") + + def get_assertion_info( + self, + ) -> AssertionInfo: + return AssertionInfo( +
description=self.description, + type=AssertionType.SQL, + sqlAssertion=SqlAssertionInfo( + type=SqlAssertionType.METRIC, + entity=self.entity, + statement=self.statement, + operator=self.operator.operator, + parameters=self.operator.generate_parameters(), + ), + ) + + +class SqlMetricChangeAssertion(BaseEntityAssertion): + type: Literal["sql"] + statement: str + change_type: Literal["absolute", "percentage"] + operator: Operators = v1_Field(discriminator="type", alias="condition") + + def get_assertion_info( + self, + ) -> AssertionInfo: + return AssertionInfo( + description=self.description, + type=AssertionType.SQL, + sqlAssertion=SqlAssertionInfo( + type=SqlAssertionType.METRIC_CHANGE, + entity=self.entity, + statement=self.statement, + changeType=( + AssertionValueChangeType.ABSOLUTE + if self.change_type == "absolute" # compare against the literal string, not Literal["absolute"] + else AssertionValueChangeType.PERCENTAGE + ), + operator=self.operator.operator, + parameters=self.operator.generate_parameters(), + ), + ) + + +class SQLAssertion(BaseAssertionProtocol): + __root__: Union[SqlMetricAssertion, SqlMetricChangeAssertion] = v1_Field() + + @property + def assertion(self): + return self.__root__ + + def get_id(self) -> str: + guid_dict = { + "entity": self.__root__.entity, + "type": self.__root__.type, + "id_raw": self.__root__.id_raw, + } + return self.__root__.id or datahub_guid(guid_dict) + + def get_assertion_info_aspect( + self, + ) -> AssertionInfo: + return self.__root__.get_assertion_info() + + def get_assertion_trigger(self) -> Optional[AssertionTrigger]: + return self.__root__.trigger diff --git a/metadata-ingestion/src/datahub/api/entities/assertion/volume_assertion.py b/metadata-ingestion/src/datahub/api/entities/assertion/volume_assertion.py new file mode 100644 index 00000000000000..da6a125874aa72 --- /dev/null +++ b/metadata-ingestion/src/datahub/api/entities/assertion/volume_assertion.py @@ -0,0 +1,98 @@ +from typing import Optional, Union + +from typing_extensions import Literal + +from datahub.api.entities.assertion.assertion import ( + BaseAssertionProtocol, + BaseEntityAssertion, +) +from datahub.api.entities.assertion.assertion_operator import Operators +from datahub.api.entities.assertion.assertion_trigger import AssertionTrigger +from datahub.api.entities.assertion.filter import DatasetFilter +from datahub.configuration.pydantic_migration_helpers import v1_Field +from datahub.emitter.mce_builder import datahub_guid +from datahub.metadata.com.linkedin.pegasus2avro.assertion import ( + AssertionInfo, + AssertionType, + AssertionValueChangeType, + RowCountChange, + RowCountTotal, + VolumeAssertionInfo, + VolumeAssertionType, +) + + +class RowCountTotalVolumeAssertion(BaseEntityAssertion): + type: Literal["volume"] + metric: Literal["row_count"] = v1_Field(default="row_count") + operator: Operators = v1_Field(discriminator="type", alias="condition") + filters: Optional[DatasetFilter] = v1_Field(default=None) + + def get_assertion_info( + self, + ) -> AssertionInfo: + return AssertionInfo( + description=self.description, + type=AssertionType.VOLUME, + volumeAssertion=VolumeAssertionInfo( + type=VolumeAssertionType.ROW_COUNT_TOTAL, + entity=self.entity, + rowCountTotal=RowCountTotal( + operator=self.operator.operator, + parameters=self.operator.generate_parameters(), + ), + ), + ) + + +class RowCountChangeVolumeAssertion(BaseEntityAssertion): + type: Literal["volume"] + metric: Literal["row_count"] = v1_Field(default="row_count") + change_type: Literal["absolute", "percentage"] + operator: Operators =
v1_Field(discriminator="type", alias="condition") + filters: Optional[DatasetFilter] = v1_Field(default=None) + + def get_assertion_info( + self, + ) -> AssertionInfo: + return AssertionInfo( + description=self.description, + type=AssertionType.VOLUME, + volumeAssertion=VolumeAssertionInfo( + type=VolumeAssertionType.ROW_COUNT_CHANGE, + entity=self.entity, + rowCountChange=RowCountChange( + type=( + AssertionValueChangeType.ABSOLUTE + if self.change_type == "absolute" # compare against the literal string, not Literal["absolute"] + else AssertionValueChangeType.PERCENTAGE + ), + operator=self.operator.operator, + parameters=self.operator.generate_parameters(), + ), + ), + ) + + +class VolumeAssertion(BaseAssertionProtocol): + __root__: Union[RowCountTotalVolumeAssertion, RowCountChangeVolumeAssertion] + + @property + def assertion(self): + return self.__root__ + + def get_id(self) -> str: + guid_dict = { + "entity": self.__root__.entity, + "type": self.__root__.type, + "id_raw": self.__root__.id_raw, + } + return self.__root__.id or datahub_guid(guid_dict) + + def get_assertion_info_aspect( + self, + ) -> AssertionInfo: + return self.__root__.get_assertion_info() + + def get_assertion_trigger(self) -> Optional[AssertionTrigger]: + return self.__root__.trigger diff --git a/metadata-ingestion/src/datahub/cli/specific/assertions_cli.py b/metadata-ingestion/src/datahub/cli/specific/assertions_cli.py new file mode 100644 index 00000000000000..dad724bfe11157 --- /dev/null +++ b/metadata-ingestion/src/datahub/cli/specific/assertions_cli.py @@ -0,0 +1,151 @@ +import logging +import os +from pathlib import Path +from typing import Dict, List, Optional + +import click +from click_default_group import DefaultGroup + +from datahub.api.entities.assertion.assertion_config_spec import AssertionsConfigSpec +from datahub.api.entities.assertion.compiler_interface import ( + AssertionCompilationResult, + CompileResultArtifact, + CompileResultArtifactType, +) +from datahub.emitter.mce_builder import make_assertion_urn +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.graph.client import get_default_graph +from datahub.integrations.assertion.registry import ASSERTION_PLATFORMS +from datahub.telemetry import telemetry +from datahub.upgrade import upgrade + +logger = logging.getLogger(__name__) + +REPORT_FILE_NAME = "compile_report.json" + + +@click.group(cls=DefaultGroup, default="upsert") +def assertions() -> None: + """A group of commands to interact with the Assertion entity in DataHub.""" + pass + + +@assertions.command() +@click.option("-f", "--file", required=True, type=click.Path(exists=True)) +@upgrade.check_upgrade +@telemetry.with_telemetry() +def upsert(file: str) -> None: + """Upsert (create or update) a set of assertions in DataHub.""" + + assertions_spec: AssertionsConfigSpec = AssertionsConfigSpec.from_yaml(file) + + with get_default_graph() as graph: + for assertion_spec in assertions_spec.assertions: + try: + mcp = MetadataChangeProposalWrapper( + entityUrn=make_assertion_urn(assertion_spec.get_id()), + aspect=assertion_spec.get_assertion_info_aspect(), + ) + graph.emit_mcp(mcp) + # TODO: Validate uniqueness of assertion ids. Report if duplicates found. + # TODO: Use upsert graphql endpoints here instead of graph.emit_mcp.
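The first TODO above calls for validating assertion id uniqueness before emitting. A minimal sketch of what such a pre-check could look like (hypothetical helper, not part of this change; it only assumes `spec` is an `AssertionsConfigSpec` whose entries expose the same `get_id()` used to mint the urn above):

```python
# Hypothetical duplicate-id pre-check for the TODO above.
from collections import Counter
from typing import List


def find_duplicate_assertion_ids(spec) -> List[str]:
    # Count each computed assertion id; any id seen more than once is a duplicate.
    counts = Counter(a.get_id() for a in spec.assertions)
    return [assertion_id for assertion_id, n in counts.items() if n > 1]
```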
+ click.secho(f"Update succeeded for urn {mcp.entityUrn}.", fg="green") + except Exception as e: + logger.exception(e) + click.secho( + f"Update failed for {mcp.entityUrn}: {e}", + fg="red", + ) + + +@assertions.command() +@click.option("-f", "--file", required=True, type=click.Path(exists=True)) +@click.option("-p", "--platform", required=True, type=str) +@click.option("-o", "--output-to", required=False, type=click.Path(exists=True)) +@click.option( + "-x", + "--extras", + required=False, + multiple=True, + default=[], + help="Platform-specific extra key-value inputs in form key=value", +) +@upgrade.check_upgrade +@telemetry.with_telemetry() +def compile( + file: str, platform: str, output_to: Optional[str], extras: List[str] +) -> None: + """Compile a set of assertions for input assertion platform. + Note that this does not run any code or execute any queries on assertion platform + and only creates artifacts specific to assertion platform that can be executed manually. + In future, we may introduce separate command to automatically apply these compiled changes + in assertion platform. Currently, generated result artifacts are stored in target folder + unless another folder is specified using option `--output-to `. + """ + + if platform not in ASSERTION_PLATFORMS: + click.secho( + f"Platform {platform} is not supported.", + fg="red", + ) + + if output_to is None: + output_to = f"{os.getcwd()}/target" + + if not os.path.isdir(output_to): + os.mkdir(output_to) + + assertions_spec: AssertionsConfigSpec = AssertionsConfigSpec.from_yaml(file) + + try: + compiler = ASSERTION_PLATFORMS[platform].create( + output_dir=output_to, extras=extras_list_to_dict(extras) + ) + result = compiler.compile(assertions_spec) + + write_report_file(output_to, result) + click.secho("Compile report:", bold=True) + click.echo(result.report.as_string()) + if result.status == "failure": + click.secho("Failure", fg="yellow", bold=True) + else: + click.secho("Success", fg="green", bold=True) + except Exception as e: + logger.exception(e) + click.secho( + f"Compile failed: {e}", + fg="red", + ) + + +def write_report_file(output_to: str, result: AssertionCompilationResult) -> None: + report_path = Path(output_to) / REPORT_FILE_NAME + with (report_path).open("w") as f: + result.add_artifact( + CompileResultArtifact( + name=REPORT_FILE_NAME, + path=report_path, + type=CompileResultArtifactType.COMPILE_REPORT, + description="Detailed report about compile status", + ) + ) + f.write(result.report.as_json()) + + +def extras_list_to_dict(extras: List[str]) -> Dict[str, str]: + extra_properties: Dict[str, str] = dict() + for x in extras: + parts = x.split("=") + assert ( + len(parts) == 2 + ), f"Invalid value for extras {x}, should be in format key=value" + extra_properties[parts[0]] = parts[1] + return extra_properties + + +# TODO: support for +# Immediate: +# 1. delete assertions (from datahub) +# Later: +# 3. execute compiled assertions on assertion platform (Later, requires connection details to platform), +# 4. cleanup assertions from assertion platform (generate artifacts. 
optionally execute) diff --git a/metadata-ingestion/src/datahub/entrypoints.py b/metadata-ingestion/src/datahub/entrypoints.py index 7c5d84b93726d8..49042db7b9299f 100644 --- a/metadata-ingestion/src/datahub/entrypoints.py +++ b/metadata-ingestion/src/datahub/entrypoints.py @@ -25,6 +25,7 @@ from datahub.cli.ingest_cli import ingest from datahub.cli.migrate import migrate from datahub.cli.put_cli import put +from datahub.cli.specific.assertions_cli import assertions from datahub.cli.specific.datacontract_cli import datacontract from datahub.cli.specific.dataproduct_cli import dataproduct from datahub.cli.specific.dataset_cli import dataset @@ -164,6 +165,7 @@ def init(use_password: bool = False) -> None: datahub.add_command(properties) datahub.add_command(forms) datahub.add_command(datacontract) +datahub.add_command(assertions) try: from datahub.cli.lite_cli import lite diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_assertion.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_assertion.py new file mode 100644 index 00000000000000..8abb656e30e73e --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_assertion.py @@ -0,0 +1,129 @@ +import logging +from datetime import datetime +from typing import Callable, Iterable, List, Optional + +from pydantic import BaseModel + +from datahub.emitter.mce_builder import ( + make_assertion_urn, + make_data_platform_urn, + make_dataplatform_instance_urn, +) +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config +from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery +from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report +from datahub.ingestion.source.snowflake.snowflake_utils import ( + SnowflakeCommonMixin, + SnowflakeConnectionMixin, + SnowflakeQueryMixin, +) +from datahub.metadata.com.linkedin.pegasus2avro.assertion import ( + AssertionResult, + AssertionResultType, + AssertionRunEvent, + AssertionRunStatus, +) +from datahub.metadata.com.linkedin.pegasus2avro.common import DataPlatformInstance +from datahub.utilities.time import datetime_to_ts_millis + +logger: logging.Logger = logging.getLogger(__name__) + + +class DataQualityMonitoringResult(BaseModel): + MEASUREMENT_TIME: datetime + METRIC_NAME: str + TABLE_NAME: str + TABLE_SCHEMA: str + TABLE_DATABASE: str + VALUE: int + + +class SnowflakeAssertionsHandler( + SnowflakeCommonMixin, SnowflakeQueryMixin, SnowflakeConnectionMixin +): + def __init__( + self, + config: SnowflakeV2Config, + report: SnowflakeV2Report, + dataset_urn_builder: Callable[[str], str], + ) -> None: + self.config = config + self.report = report + self.logger = logger + self.dataset_urn_builder = dataset_urn_builder + self.connection = None + self._urns_processed: List[str] = [] + + def get_assertion_workunits( + self, discovered_datasets: List[str] + ) -> Iterable[MetadataWorkUnit]: + + self.connection = self.create_connection() + if self.connection is None: + return + + cur = self.query( + SnowflakeQuery.dmf_assertion_results( + datetime_to_ts_millis(self.config.start_time), + datetime_to_ts_millis(self.config.end_time), + ) + ) + for db_row in cur: + mcp = self._process_result_row(db_row, discovered_datasets) + if mcp: + yield mcp.as_workunit(is_primary_source=False) + + if mcp.entityUrn and mcp.entityUrn not in self._urns_processed: + 
self._urns_processed.append(mcp.entityUrn) + yield self._gen_platform_instance_wu(mcp.entityUrn) + + def _gen_platform_instance_wu(self, urn: str) -> MetadataWorkUnit: + + # Construct a MetadataChangeProposalWrapper object for assertion platform + return MetadataChangeProposalWrapper( + entityUrn=urn, + aspect=DataPlatformInstance( + platform=make_data_platform_urn(self.platform), + instance=( + make_dataplatform_instance_urn( + self.platform, self.config.platform_instance + ) + if self.config.platform_instance + else None + ), + ), + ).as_workunit(is_primary_source=False) + + def _process_result_row( + self, result_row: dict, discovered_datasets: List[str] + ) -> Optional[MetadataChangeProposalWrapper]: + try: + result = DataQualityMonitoringResult.parse_obj(result_row) + assertion_guid = result.METRIC_NAME.split("__")[-1].lower() + status = bool(result.VALUE) # 1 if PASS, 0 if FAIL + assertee = self.get_dataset_identifier( + result.TABLE_NAME, result.TABLE_SCHEMA, result.TABLE_DATABASE + ) + if assertee in discovered_datasets: + return MetadataChangeProposalWrapper( + entityUrn=make_assertion_urn(assertion_guid), + aspect=AssertionRunEvent( + timestampMillis=datetime_to_ts_millis(result.MEASUREMENT_TIME), + runId=result.MEASUREMENT_TIME.strftime("%Y-%m-%dT%H:%M:%SZ"), + asserteeUrn=self.dataset_urn_builder(assertee), + status=AssertionRunStatus.COMPLETE, + assertionUrn=make_assertion_urn(assertion_guid), + result=AssertionResult( + type=( + AssertionResultType.SUCCESS + if status + else AssertionResultType.FAILURE + ) + ), + ), + ) + except Exception as e: + self.report.report_warning("assertion-result-parse-failure", str(e)) + return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index c1fbb2cdc1f3fb..4beb2684485694 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -164,6 +164,12 @@ class SnowflakeV2Config( "username.", ) + include_assertion_results: bool = Field( + default=False, + description="Whether to ingest assertion run results for assertions created using Datahub" + " assertions CLI in snowflake", + ) + @validator("convert_urns_to_lowercase") def validate_convert_urns_to_lowercase(cls, v): if not v: diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py index dac43499a1c715..205490a6d29c6f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py @@ -1016,3 +1016,26 @@ def table_upstreams_only( ORDER BY h.downstream_table_name """ + + @staticmethod + def dmf_assertion_results(start_time_millis: int, end_time_millis: int) -> str: + + pattern = r"datahub\\_\\_%" + escape_pattern = r"\\" + return f""" + SELECT + MEASUREMENT_TIME AS "MEASUREMENT_TIME", + METRIC_NAME AS "METRIC_NAME", + TABLE_NAME AS "TABLE_NAME", + TABLE_SCHEMA AS "TABLE_SCHEMA", + TABLE_DATABASE AS "TABLE_DATABASE", + VALUE::INT AS "VALUE" + FROM + SNOWFLAKE.LOCAL.DATA_QUALITY_MONITORING_RESULTS + WHERE + MEASUREMENT_TIME >= to_timestamp_ltz({start_time_millis}, 3) + AND MEASUREMENT_TIME < to_timestamp_ltz({end_time_millis}, 3) + AND METRIC_NAME ilike '{pattern}' escape '{escape_pattern}' + ORDER BY MEASUREMENT_TIME ASC; + +""" diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 140b702a8b74bb..fc2733c211580c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -50,6 +50,9 @@ SnowflakeEdition, SnowflakeObjectDomain, ) +from datahub.ingestion.source.snowflake.snowflake_assertion import ( + SnowflakeAssertionsHandler, +) from datahub.ingestion.source.snowflake.snowflake_config import ( SnowflakeV2Config, TagOption, @@ -604,6 +607,11 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: ) and self.usage_extractor: yield from self.usage_extractor.get_usage_workunits(discovered_datasets) + if self.config.include_assertion_results: + yield from SnowflakeAssertionsHandler( + self.config, self.report, self.gen_dataset_urn + ).get_assertion_workunits(discovered_datasets) + def report_cache_info(self) -> None: lru_cache_functions: List[Callable] = [ self.data_dictionary.get_tables_for_database, diff --git a/metadata-ingestion/src/datahub/integrations/assertion/__init__.py b/metadata-ingestion/src/datahub/integrations/assertion/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/metadata-ingestion/src/datahub/integrations/assertion/common.py b/metadata-ingestion/src/datahub/integrations/assertion/common.py new file mode 100644 index 00000000000000..9ffad5cf66640a --- /dev/null +++ b/metadata-ingestion/src/datahub/integrations/assertion/common.py @@ -0,0 +1,61 @@ +from functools import lru_cache +from typing import List, Optional, Tuple, TypedDict + +from datahub.api.entities.assertion.assertion import BaseEntityAssertion +from datahub.ingestion.graph.client import get_default_graph +from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetProperties +from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaMetadata +from datahub.utilities.urns.urn import Urn + + +class ColumnDict(TypedDict): + col: str + native_type: str + + +@lru_cache +def get_qualified_name_from_datahub(urn: str) -> Optional[str]: + with get_default_graph() as graph: + props: Optional[DatasetProperties] = graph.get_aspect(urn, DatasetProperties) + if props is not None: + return props.qualifiedName + return None + + +@lru_cache +def get_schema_from_datahub(urn: str) -> Optional[List[ColumnDict]]: + with get_default_graph() as graph: + schema: Optional[SchemaMetadata] = graph.get_aspect(urn, SchemaMetadata) + if schema is not None: + return [ + {"col": field.fieldPath, "native_type": field.nativeDataType} + for field in schema.fields + ] + return None + + +def get_entity_name(assertion: BaseEntityAssertion) -> Tuple[str, str, str]: + if assertion.meta and assertion.meta.get("entity_qualified_name"): + parts = assertion.meta["entity_qualified_name"].split(".") + else: + qualified_name = get_qualified_name_from_datahub(assertion.entity) + if qualified_name is not None: + parts = qualified_name.split(".") + else: + urn_id = Urn.create_from_string(assertion.entity).entity_ids[1] + parts = urn_id.split(".") + if len(parts) > 3: + parts = parts[-3:] + assert len(parts) == 3 + database = parts[-3] + schema = parts[-2] + table = parts[-1] + return database, schema, table + + +def get_entity_schema(assertion: BaseEntityAssertion) -> Optional[List[ColumnDict]]: + if assertion.meta and assertion.meta.get("entity_schema"): + return assertion.meta.get("entity_schema") + elif 
get_schema_from_datahub(assertion.entity): + return get_schema_from_datahub(assertion.entity) + return None diff --git a/metadata-ingestion/src/datahub/integrations/assertion/registry.py b/metadata-ingestion/src/datahub/integrations/assertion/registry.py new file mode 100644 index 00000000000000..26015ddbf9a315 --- /dev/null +++ b/metadata-ingestion/src/datahub/integrations/assertion/registry.py @@ -0,0 +1,8 @@ +from typing import Dict, Type + +from datahub.api.entities.assertion.compiler_interface import AssertionCompiler +from datahub.integrations.assertion.snowflake.compiler import SnowflakeAssertionCompiler + +ASSERTION_PLATFORMS: Dict[str, Type[AssertionCompiler]] = { + "snowflake": SnowflakeAssertionCompiler +} diff --git a/metadata-ingestion/src/datahub/integrations/assertion/snowflake/__init__.py b/metadata-ingestion/src/datahub/integrations/assertion/snowflake/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/metadata-ingestion/src/datahub/integrations/assertion/snowflake/compiler.py b/metadata-ingestion/src/datahub/integrations/assertion/snowflake/compiler.py new file mode 100644 index 00000000000000..8d2ae2960ebd05 --- /dev/null +++ b/metadata-ingestion/src/datahub/integrations/assertion/snowflake/compiler.py @@ -0,0 +1,237 @@ +import logging +import os +from pathlib import Path +from typing import Dict, Tuple + +from datahub.api.entities.assertion.assertion_config_spec import AssertionsConfigSpec +from datahub.api.entities.assertion.assertion_operator import LessThanOrEqualToOperator +from datahub.api.entities.assertion.assertion_trigger import ( + AssertionTrigger, + CronTrigger, + EntityChangeTrigger, + IntervalTrigger, +) +from datahub.api.entities.assertion.compiler_interface import ( + AssertionCompilationResult, + AssertionCompiler, + CompileResultArtifact, + CompileResultArtifactType, +) +from datahub.api.entities.assertion.datahub_assertion import DataHubAssertion +from datahub.api.entities.assertion.field_assertion import FieldValuesAssertion +from datahub.api.entities.assertion.freshness_assertion import ( + FixedIntervalFreshnessAssertion, +) +from datahub.emitter.mce_builder import make_assertion_urn +from datahub.integrations.assertion.common import get_entity_name, get_entity_schema +from datahub.integrations.assertion.snowflake.dmf_generator import SnowflakeDMFHandler +from datahub.integrations.assertion.snowflake.field_metric_sql_generator import ( + SnowflakeFieldMetricSQLGenerator, +) +from datahub.integrations.assertion.snowflake.field_values_metric_sql_generator import ( + SnowflakeFieldValuesMetricSQLGenerator, +) +from datahub.integrations.assertion.snowflake.metric_operator_sql_generator import ( + SnowflakeMetricEvalOperatorSQLGenerator, +) +from datahub.integrations.assertion.snowflake.metric_sql_generator import ( + SnowflakeMetricSQLGenerator, +) + +logger = logging.Logger(__name__) + +DMF_DEFINITIONS_FILE_NAME = "dmf_definitions.sql" +DMF_ASSOCIATIONS_FILE_NAME = "dmf_associations.sql" +DMF_SCHEMA_PROPERTY_KEY = "DMF_SCHEMA" + + +class SnowflakeAssertionCompiler(AssertionCompiler): + def __init__(self, output_dir: str, extras: Dict[str, str]) -> None: + self.output_dir = Path(output_dir) + self.extras = extras + self.metric_generator = SnowflakeMetricSQLGenerator( + SnowflakeFieldMetricSQLGenerator(), SnowflakeFieldValuesMetricSQLGenerator() + ) + self.metric_evaluator = SnowflakeMetricEvalOperatorSQLGenerator() + self.dmf_handler = SnowflakeDMFHandler() + + self._entity_schedule_history: Dict[str, AssertionTrigger] = 
dict() + + @classmethod + def create( + cls, output_dir: str, extras: Dict[str, str] + ) -> "SnowflakeAssertionCompiler": + assert os.path.exists( + output_dir + ), f"Specified location {output_dir} does not exist." + + assert os.path.isdir( + output_dir + ), f"Specified location {output_dir} is not a folder." + + assert any( + x.upper() == DMF_SCHEMA_PROPERTY_KEY for x in extras + ), "Must specify value for DMF schema using -x DMF_SCHEMA=" + + return SnowflakeAssertionCompiler(output_dir, extras) + + def compile( + self, assertion_config_spec: AssertionsConfigSpec + ) -> AssertionCompilationResult: + result = AssertionCompilationResult("snowflake", "success") + + # TODO: Create/Report permissions sql + + dmf_definitions_path = self.output_dir / DMF_DEFINITIONS_FILE_NAME + dmf_associations_path = self.output_dir / DMF_ASSOCIATIONS_FILE_NAME + with (dmf_definitions_path).open("w") as definitions, ( + dmf_associations_path + ).open("w") as associations: + for assertion_spec in assertion_config_spec.assertions: + result.report.num_processed += 1 + try: + start_line = f"\n-- Start of Assertion {assertion_spec.get_id()}\n" + (dmf_definition, dmf_association) = self.process_assertion( + assertion_spec + ) + end_line = f"\n-- End of Assertion {assertion_spec.get_id()}\n" + + definitions.write(start_line) + definitions.write(dmf_definition) + definitions.write(end_line) + + associations.write(start_line) + associations.write(dmf_association) + associations.write(end_line) + + result.report.num_compile_succeeded += 1 + except Exception as e: + result.status = "failure" + result.report.report_failure( + assertion_spec.get_id(), + f"Failed to compile assertion of type {assertion_spec.assertion.type} due to error: {e}", + ) + result.report.num_compile_failed += 1 + if result.report.num_compile_succeeded > 0: + result.add_artifact( + CompileResultArtifact( + name=DMF_DEFINITIONS_FILE_NAME, + path=dmf_definitions_path, + type=CompileResultArtifactType.SQL_QUERIES, + description="SQL file containing DMF create definitions equivalent to DataHub Assertions", + ) + ) + result.add_artifact( + CompileResultArtifact( + name=DMF_ASSOCIATIONS_FILE_NAME, + path=dmf_associations_path, + type=CompileResultArtifactType.SQL_QUERIES, + description="ALTER TABLE queries to associate DMFs to table to run on configured schedule.", + ) + ) + + return result + + def process_assertion(self, assertion: DataHubAssertion) -> Tuple[str, str]: + # TODO: support schema assertion? + + # For freshness assertion, the metric is the difference in seconds between the assertion + # execution time and the last time the table was updated. + # For field values assertion, the metric is the number or percentage of rows that do not + # satisfy the operator condition. + # For remaining assertions, the numeric metric is discernible in the assertion definition itself.
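To make the comment above concrete: for the fixed-interval freshness assertion in the sample test config further below (last_modified_field `col_timestamp`), the metric SQL produced by the generator later in this change takes roughly this shape (illustrative only; exact whitespace differs):

```python
# Seconds elapsed since the table last changed, per the freshness branch of
# SnowflakeMetricSQLGenerator shown later in this diff.
freshness_metric_sql = (
    "select timediff(second, max(col_timestamp::TIMESTAMP_LTZ), "
    "SNOWFLAKE.CORE.DATA_METRIC_SCHEDULED_TIME()) as metric "
    "from TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES"
)
```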
+ metric_definition = self.metric_generator.metric_sql(assertion.assertion) + + if isinstance(assertion.assertion, FixedIntervalFreshnessAssertion): + assertion_sql = self.metric_evaluator.operator_sql( + LessThanOrEqualToOperator( + type="less_than_or_equal_to", + value=assertion.assertion.lookback_interval.total_seconds(), + ), + metric_definition, + ) + elif isinstance(assertion.assertion, FieldValuesAssertion): + assertion_sql = self.metric_evaluator.operator_sql( + LessThanOrEqualToOperator( + type="less_than_or_equal_to", + value=assertion.assertion.failure_threshold.value, + ), + metric_definition, + ) + else: + assertion_sql = self.metric_evaluator.operator_sql( + assertion.assertion.operator, metric_definition + ) + + dmf_name = get_dmf_name(assertion) + dmf_schema_name = self.extras[DMF_SCHEMA_PROPERTY_KEY] + + args_create_dmf, args_add_dmf = get_dmf_args(assertion) + + entity_name = get_entity_name(assertion.assertion) + + # setdefault returns the already-registered trigger for this entity if present, + # otherwise it registers and returns the current one. + prev_trigger = self._entity_schedule_history.setdefault( + assertion.assertion.entity, assertion.assertion.trigger + ) + if prev_trigger != assertion.assertion.trigger: + raise ValueError( + "Currently, all assertions on the same entity must have the same schedule." + f" Found different schedules on entity {assertion.assertion.entity} ->" + f" ({prev_trigger.trigger})," + f" ({assertion.assertion.trigger.trigger})" + ) + + dmf_schedule = get_dmf_schedule(assertion.assertion.trigger) + dmf_definition = self.dmf_handler.create_dmf( + f"{dmf_schema_name}.{dmf_name}", + args_create_dmf, + assertion.assertion.description + or f"Created via DataHub for assertion {make_assertion_urn(assertion.get_id())} of type {assertion.assertion.type}", + assertion_sql, + ) + dmf_association = self.dmf_handler.add_dmf_to_table( + f"{dmf_schema_name}.{dmf_name}", + args_add_dmf, + dmf_schedule, + ".".join(entity_name), + ) + + return dmf_definition, dmf_association + + +def get_dmf_name(assertion: DataHubAssertion) -> str: + return f"datahub__{assertion.get_id()}" + + +def get_dmf_args(assertion: DataHubAssertion) -> Tuple[str, str]: + """Returns Tuple with + - Args used to create DMF + - Args used to add DMF to table""" + # Snowflake does not allow creating custom data metric + # function without column name argument.
+ # So we fetch any one column from the table's schema + args_create_dmf = "ARGT TABLE({col_name} {col_type})" + args_add_dmf = "{col_name}" + entity_schema = get_entity_schema(assertion.assertion) + if entity_schema: + for col_dict in entity_schema: + return args_create_dmf.format( + col_name=col_dict["col"], col_type=col_dict["native_type"] + ), args_add_dmf.format(col_name=col_dict["col"]) + + raise ValueError("entity schema not available") + + +def get_dmf_schedule(trigger: AssertionTrigger) -> str: + if isinstance(trigger.trigger, EntityChangeTrigger): + return "TRIGGER_ON_CHANGES" + elif isinstance(trigger.trigger, CronTrigger): + return f"USING CRON {trigger.trigger.cron} {trigger.trigger.timezone}" + elif isinstance(trigger.trigger, IntervalTrigger): + # use total_seconds() and integer minutes so intervals of a day or more are not truncated + return f"{int(trigger.trigger.interval.total_seconds() // 60)} MIN" + else: + raise ValueError(f"Unsupported trigger type {type(trigger.trigger)}") diff --git a/metadata-ingestion/src/datahub/integrations/assertion/snowflake/dmf_generator.py b/metadata-ingestion/src/datahub/integrations/assertion/snowflake/dmf_generator.py new file mode 100644 index 00000000000000..4f50b7c2b81a57 --- /dev/null +++ b/metadata-ingestion/src/datahub/integrations/assertion/snowflake/dmf_generator.py @@ -0,0 +1,22 @@ +class SnowflakeDMFHandler: + def create_dmf( + self, dmf_name: str, dmf_args: str, dmf_comment: str, dmf_sql: str + ) -> str: + return f""" + CREATE or REPLACE DATA METRIC FUNCTION + {dmf_name} ({dmf_args}) + RETURNS NUMBER + COMMENT = '{dmf_comment}' + AS + $$ + {dmf_sql} + $$; + """ + + def add_dmf_to_table( + self, dmf_name: str, dmf_col_args: str, dmf_schedule: str, table_identifier: str + ) -> str: + return f""" + ALTER TABLE {table_identifier} SET DATA_METRIC_SCHEDULE = '{dmf_schedule}'; + ALTER TABLE {table_identifier} ADD DATA METRIC FUNCTION {dmf_name} ON ({dmf_col_args}); + """ diff --git a/metadata-ingestion/src/datahub/integrations/assertion/snowflake/field_metric_sql_generator.py b/metadata-ingestion/src/datahub/integrations/assertion/snowflake/field_metric_sql_generator.py new file mode 100644 index 00000000000000..3ff218a9f280b3 --- /dev/null +++ b/metadata-ingestion/src/datahub/integrations/assertion/snowflake/field_metric_sql_generator.py @@ -0,0 +1,154 @@ +from typing import List, Optional + +from datahub.api.entities.assertion.field_assertion import FieldMetricAssertion +from datahub.api.entities.assertion.field_metric import FieldMetric +from datahub.integrations.assertion.common import get_entity_name + + +class SnowflakeFieldMetricSQLGenerator: + def unique_count_sql( + self, field_name: str, entity_name: str, dataset_filter: Optional[str] + ) -> str: + return f"""select count(distinct {field_name}) + from {entity_name} {self._setup_where_clause([dataset_filter])}""" + + def unique_percentage_sql( + self, field_name: str, entity_name: str, dataset_filter: Optional[str] + ) -> str: + return f"""select count(distinct {field_name})/count(*) + from {entity_name} {self._setup_where_clause([dataset_filter])}""" + + def null_count_sql( + self, field_name: str, entity_name: str, dataset_filter: Optional[str] + ) -> str: + where_clause = self._setup_where_clause( + [dataset_filter, f"{field_name} is null"] + ) + return f"""select count(*) + from {entity_name} {where_clause}""" + + def null_percentage_sql( + self, field_name: str, entity_name: str, dataset_filter: Optional[str] + ) -> str: + return f"""select ({self.null_count_sql(field_name, entity_name, dataset_filter)})/count(*) + from {entity_name} {self._setup_where_clause([dataset_filter])}"""
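A usage sketch of the field metric templates above, rendering `unique_count` with a SQL filter (illustrative table and column names; whitespace in the real output differs):

```python
from datahub.integrations.assertion.snowflake.field_metric_sql_generator import (
    SnowflakeFieldMetricSQLGenerator,
)

gen = SnowflakeFieldMetricSQLGenerator()
sql = gen.unique_count_sql(
    field_name="quantity",
    entity_name="TEST_DB.PUBLIC.PURCHASE_EVENT",
    dataset_filter="quantity is not null",
)
# -> select count(distinct quantity)
#    from TEST_DB.PUBLIC.PURCHASE_EVENT where quantity is not null
```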
+ + def min_sql( + self, field_name: str, entity_name: str, dataset_filter: Optional[str] + ) -> str: + return f"""select min({field_name}) + from {entity_name} {self._setup_where_clause([dataset_filter])}""" + + def max_sql( + self, field_name: str, entity_name: str, dataset_filter: Optional[str] + ) -> str: + return f"""select max({field_name}) + from {entity_name} {self._setup_where_clause([dataset_filter])}""" + + def mean_sql( + self, field_name: str, entity_name: str, dataset_filter: Optional[str] + ) -> str: + return f"""select avg({field_name}) + from {entity_name} {self._setup_where_clause([dataset_filter])}""" + + def median_sql( + self, field_name: str, entity_name: str, dataset_filter: Optional[str] + ) -> str: + return f"""select median({field_name}) + from {entity_name} {self._setup_where_clause([dataset_filter])}""" + + def stddev_sql( + self, field_name: str, entity_name: str, dataset_filter: Optional[str] + ) -> str: + return f"""select stddev({field_name}) + from {entity_name} {self._setup_where_clause([dataset_filter])}""" + + def negative_count_sql( + self, field_name: str, entity_name: str, dataset_filter: Optional[str] + ) -> str: + where_clause = self._setup_where_clause([dataset_filter, f"{field_name} < 0"]) + return f"""select count(*) + from {entity_name} {where_clause}""" + + def negative_percentage_sql( + self, field_name: str, entity_name: str, dataset_filter: Optional[str] + ) -> str: + return f"""select ({self.negative_count_sql(field_name, entity_name, dataset_filter)})/count(*) + from {entity_name} {self._setup_where_clause([dataset_filter])}""" + + def zero_count_sql( + self, field_name: str, entity_name: str, dataset_filter: Optional[str] + ) -> str: + where_clause = self._setup_where_clause([dataset_filter, f"{field_name} = 0"]) + return f"""select count(*) + from {entity_name} {where_clause}""" + + def zero_percentage_sql( + self, field_name: str, entity_name: str, dataset_filter: Optional[str] + ) -> str: + return f"""select ({self.zero_count_sql(field_name, entity_name, dataset_filter)})/count(*) + from {entity_name} {self._setup_where_clause([dataset_filter])}""" + + def min_length_sql( + self, field_name: str, entity_name: str, dataset_filter: Optional[str] + ) -> str: + return f"""select min(length({field_name})) + from {entity_name} {self._setup_where_clause([dataset_filter])}""" + + def max_length_sql( + self, field_name: str, entity_name: str, dataset_filter: Optional[str] + ) -> str: + return f"""select max(length({field_name})) + from {entity_name} {self._setup_where_clause([dataset_filter])}""" + + def empty_count_sql( + self, field_name: str, entity_name: str, dataset_filter: Optional[str] + ) -> str: + where_clause = self._setup_where_clause( + [dataset_filter, f"({field_name} is null or trim({field_name})='')"] + ) + return f"""select count(*) + from {entity_name} {where_clause}""" + + def empty_percentage_sql( + self, field_name: str, entity_name: str, dataset_filter: Optional[str] + ) -> str: + return f"""select ({self.empty_count_sql(field_name, entity_name, dataset_filter)})/count(*) + from {entity_name} {self._setup_where_clause([dataset_filter])}""" + + def _setup_where_clause(self, filters: List[Optional[str]]) -> str: + where_clause = " and ".join(f for f in filters if f) + return f"where {where_clause}" if where_clause else "" + + def metric_sql(self, assertion: FieldMetricAssertion) -> str: + metric_sql_mapping = { + FieldMetric.UNIQUE_COUNT: self.unique_count_sql, + FieldMetric.UNIQUE_PERCENTAGE: self.unique_percentage_sql, + 
FieldMetric.NULL_COUNT: self.null_count_sql, + FieldMetric.NULL_PERCENTAGE: self.null_percentage_sql, + FieldMetric.MIN: self.min_sql, + FieldMetric.MAX: self.max_sql, + FieldMetric.MEAN: self.mean_sql, + FieldMetric.MEDIAN: self.median_sql, + FieldMetric.STDDEV: self.stddev_sql, + FieldMetric.NEGATIVE_COUNT: self.negative_count_sql, + FieldMetric.NEGATIVE_PERCENTAGE: self.negative_percentage_sql, + FieldMetric.ZERO_COUNT: self.zero_count_sql, + FieldMetric.ZERO_PERCENTAGE: self.zero_percentage_sql, + FieldMetric.MIN_LENGTH: self.min_length_sql, + FieldMetric.MAX_LENGTH: self.max_length_sql, + FieldMetric.EMPTY_COUNT: self.empty_count_sql, + FieldMetric.EMPTY_PERCENTAGE: self.empty_percentage_sql, + } + + entity_name = ".".join(get_entity_name(assertion)) + + return metric_sql_mapping[assertion.metric]( + assertion.field, + entity_name, + ( + assertion.filters.sql + if assertion.filters and assertion.filters.sql + else None + ), + ) diff --git a/metadata-ingestion/src/datahub/integrations/assertion/snowflake/field_values_metric_sql_generator.py b/metadata-ingestion/src/datahub/integrations/assertion/snowflake/field_values_metric_sql_generator.py new file mode 100644 index 00000000000000..b77cc971d3a450 --- /dev/null +++ b/metadata-ingestion/src/datahub/integrations/assertion/snowflake/field_values_metric_sql_generator.py @@ -0,0 +1,283 @@ +from functools import singledispatchmethod +from typing import List, Optional + +from datahub.api.entities.assertion.assertion_operator import ( + BetweenOperator, + ContainsOperator, + EndsWithOperator, + EqualToOperator, + GreaterThanOperator, + GreaterThanOrEqualToOperator, + InOperator, + IsFalseOperator, + IsNullOperator, + IsTrueOperator, + LessThanOperator, + LessThanOrEqualToOperator, + MatchesRegexOperator, + NotEqualToOperator, + NotInOperator, + NotNullOperator, + Operators, + StartsWithOperator, +) +from datahub.api.entities.assertion.field_assertion import ( + FieldTransform, + FieldValuesAssertion, +) +from datahub.integrations.assertion.common import get_entity_name + + +class SnowflakeFieldValuesMetricSQLGenerator: + @singledispatchmethod + def values_metric_sql( + self, + operators: Operators, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + """ + Generates SQL that would return boolean value for each table row. + 1 if FAIL and 0 if PASS. Note the unusual reversal of 1 and 0. + This is deliberate, as metric represents number of failing rows. 
+ """ + raise ValueError(f"Unsupported values metric operator type {type(operators)} ") + + @values_metric_sql.register + def _( + self, + operators: InOperator, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + return f"""select case when {transformed_field} in {tuple(operators.value)} then 0 else 1 end + from {entity_name} {where_clause}""" + + @values_metric_sql.register + def _( + self, + operators: NotInOperator, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + return f"""select case when {transformed_field} not in {tuple(operators.value)} then 0 else 1 end + from {entity_name} {where_clause}""" + + @values_metric_sql.register + def _( + self, + operators: EqualToOperator, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + return f"""select case when {transformed_field} = {operators.value} then 0 else 1 end + from {entity_name} {where_clause}""" + + @values_metric_sql.register + def _( + self, + operators: NotEqualToOperator, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + return f"""select case when {transformed_field} != {operators.value} then 0 else 1 end + from {entity_name} {where_clause}""" + + @values_metric_sql.register + def _( + self, + operators: BetweenOperator, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + return f"""select case when {transformed_field} between {operators.min} and {operators.max} then 0 else 1 end + from {entity_name} {where_clause}""" + + @values_metric_sql.register + def _( + self, + operators: LessThanOperator, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + return f"""select case when {transformed_field} < {operators.value} then 0 else 1 end + from {entity_name} {where_clause}""" + + @values_metric_sql.register + def _( + self, + operators: LessThanOrEqualToOperator, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + return f"""select case when {transformed_field} <= {operators.value} then 0 else 1 end + from {entity_name} {where_clause}""" + + @values_metric_sql.register + def _( + self, + operators: GreaterThanOperator, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + return f"""select case when {transformed_field} > {operators.value} then 0 else 1 end + from {entity_name} {where_clause}""" + + @values_metric_sql.register + def _( + self, + operators: GreaterThanOrEqualToOperator, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + return f"""select case when {transformed_field} >= {operators.value} then 0 else 1 end + from {entity_name} {where_clause}""" + + @values_metric_sql.register + def _( + self, + operators: IsNullOperator, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + return f"""select case when {transformed_field} is null then 0 else 1 end + from {entity_name} {where_clause}""" + + @values_metric_sql.register + def _( + self, + operators: NotNullOperator, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + return f"""select case when {transformed_field} is not null then 0 else 1 end + from {entity_name} {where_clause}""" + + @values_metric_sql.register + def _( + self, + operators: IsTrueOperator, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + return f"""select case when {transformed_field} then 0 else 1 end + from {entity_name} {where_clause}""" + + 
@values_metric_sql.register + def _( + self, + operators: IsFalseOperator, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + return f"""select case when not {transformed_field} then 0 else 1 end + from {entity_name} {where_clause}""" + + @values_metric_sql.register + def _( + self, + operators: ContainsOperator, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + return f"""select case when contains({transformed_field},'{operators.value}') then 0 else 1 end + from {entity_name} {where_clause}""" + + @values_metric_sql.register + def _( + self, + operators: StartsWithOperator, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + return f"""select case when startswith({transformed_field},'{operators.value}') then 0 else 1 end + from {entity_name} {where_clause}""" + + @values_metric_sql.register + def _( + self, + operators: EndsWithOperator, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + return f"""select case when endswith({transformed_field},'{operators.value}') then 0 else 1 end + from {entity_name} {where_clause}""" + + @values_metric_sql.register + def _( + self, + operators: MatchesRegexOperator, + entity_name: str, + transformed_field: str, + where_clause: str, + ) -> str: + return f"""select case when REGEXP_LIKE({transformed_field},'{operators.value}') then 0 else 1 end + from {entity_name} {where_clause}""" + + def _setup_where_clause(self, filters: List[Optional[str]]) -> str: + where_clause = " and ".join(f for f in filters if f) + return f"where {where_clause}" if where_clause else "" + + def _setup_field_transform( + self, field: str, transform: Optional[FieldTransform] + ) -> str: + if transform is None: + return field + elif transform is FieldTransform.LENGTH: + return f"length({field})" + raise ValueError(f"Unsupported transform type {transform}") + + def metric_sql(self, assertion: FieldValuesAssertion) -> str: + """ + Note that this applies the negated operator: the per-row SQL below marks + invalid rows with 1, so the resulting metric (a count or percentage of + failing rows) can be checked against the configured failThreshold. + """ + entity_name = ".".join(get_entity_name(assertion)) + + dataset_filter = ( + assertion.filters.sql + if assertion.filters and assertion.filters.sql + else None + ) + where_clause = self._setup_where_clause( + [ + dataset_filter, + f"{assertion.field} is not null" if assertion.exclude_nulls else None, + ] + ) + transformed_field = self._setup_field_transform( + assertion.field, assertion.field_transform + ) + # this sql returns a boolean value for each table row: 1 if fail and 0 if pass. + sql = self.values_metric_sql( + assertion.operator, entity_name, transformed_field, where_clause + ) + + # metric would be number of failing rows OR percentage of failing rows.
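Putting the pieces together for that same sample assertion with a count threshold, the final metric query looks roughly like this (`$1` is Snowflake's positional reference to the first column of the subquery; illustrative only):

```python
example_metric_sql = (
    "select sum($1) as metric from ("
    "select case when quantity between 0 and 10 then 0 else 1 end "
    "from TEST_DB.PUBLIC.PURCHASE_EVENT where quantity is not null)"
)
```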
+ if assertion.failure_threshold.type == "count": + return f"select sum($1) as metric from ({sql})" + else: # percentage + return f"select sum($1)/count(*) as metric from ({sql})" diff --git a/metadata-ingestion/src/datahub/integrations/assertion/snowflake/metric_operator_sql_generator.py b/metadata-ingestion/src/datahub/integrations/assertion/snowflake/metric_operator_sql_generator.py new file mode 100644 index 00000000000000..e7549d105b3f62 --- /dev/null +++ b/metadata-ingestion/src/datahub/integrations/assertion/snowflake/metric_operator_sql_generator.py @@ -0,0 +1,68 @@ +from functools import singledispatchmethod + +from datahub.api.entities.assertion.assertion_operator import ( + BetweenOperator, + EqualToOperator, + GreaterThanOperator, + GreaterThanOrEqualToOperator, + IsFalseOperator, + IsNullOperator, + IsTrueOperator, + LessThanOperator, + LessThanOrEqualToOperator, + NotNullOperator, + Operators, +) + + +class SnowflakeMetricEvalOperatorSQLGenerator: + @singledispatchmethod + def operator_sql(self, operators: Operators, metric_sql: str) -> str: + """ + Generates Operator SQL that applies operator on `metric` + and returns a numeric boolean value 1 if PASS, 0 if FAIL + + """ + raise ValueError(f"Unsupported metric operator type {type(operators)} ") + + @operator_sql.register + def _(self, operators: EqualToOperator, metric_sql: str) -> str: + return f"select case when metric={operators.value} then 1 else 0 end from ({metric_sql})" + + @operator_sql.register + def _(self, operators: BetweenOperator, metric_sql: str) -> str: + return f"select case when metric between {operators.min} and {operators.max} then 1 else 0 end from ({metric_sql})" + + @operator_sql.register + def _(self, operators: LessThanOperator, metric_sql: str) -> str: + return f"select case when metric < {operators.value} then 1 else 0 end from ({metric_sql})" + + @operator_sql.register + def _(self, operators: LessThanOrEqualToOperator, metric_sql: str) -> str: + return f"select case when metric <= {operators.value} then 1 else 0 end from ({metric_sql})" + + @operator_sql.register + def _(self, operators: GreaterThanOperator, metric_sql: str) -> str: + return f"select case when metric > {operators.value} then 1 else 0 end from ({metric_sql})" + + @operator_sql.register + def _(self, operators: GreaterThanOrEqualToOperator, metric_sql: str) -> str: + return f"select case when metric >= {operators.value} then 1 else 0 end from ({metric_sql})" + + @operator_sql.register + def _(self, operators: NotNullOperator, metric_sql: str) -> str: + return ( + f"select case when metric is not null then 1 else 0 end from ({metric_sql})" + ) + + @operator_sql.register + def _(self, operators: IsNullOperator, metric_sql: str) -> str: + return f"select case when metric is null then 1 else 0 end from ({metric_sql})" + + @operator_sql.register + def _(self, operators: IsTrueOperator, metric_sql: str) -> str: + return f"select case when metric then 1 else 0 end from ({metric_sql})" + + @operator_sql.register + def _(self, operators: IsFalseOperator, metric_sql: str) -> str: + return f"select case when not metric then 1 else 0 end from ({metric_sql})" diff --git a/metadata-ingestion/src/datahub/integrations/assertion/snowflake/metric_sql_generator.py b/metadata-ingestion/src/datahub/integrations/assertion/snowflake/metric_sql_generator.py new file mode 100644 index 00000000000000..5b079129e0a9c5 --- /dev/null +++ b/metadata-ingestion/src/datahub/integrations/assertion/snowflake/metric_sql_generator.py @@ -0,0 +1,97 @@ +from dataclasses 
+from functools import singledispatchmethod
+
+from datahub.api.entities.assertion.assertion import BaseEntityAssertion
+from datahub.api.entities.assertion.field_assertion import (
+    FieldMetricAssertion,
+    FieldValuesAssertion,
+)
+from datahub.api.entities.assertion.freshness_assertion import (
+    FixedIntervalFreshnessAssertion,
+    FreshnessSourceType,
+)
+from datahub.api.entities.assertion.sql_assertion import (
+    SqlMetricAssertion,
+    SqlMetricChangeAssertion,
+)
+from datahub.api.entities.assertion.volume_assertion import (
+    RowCountChangeVolumeAssertion,
+    RowCountTotalVolumeAssertion,
+)
+from datahub.integrations.assertion.common import get_entity_name
+from datahub.integrations.assertion.snowflake.field_metric_sql_generator import (
+    SnowflakeFieldMetricSQLGenerator,
+)
+from datahub.integrations.assertion.snowflake.field_values_metric_sql_generator import (
+    SnowflakeFieldValuesMetricSQLGenerator,
+)
+
+
+@dataclass
+class SnowflakeMetricSQLGenerator:
+    field_metric_sql_generator: SnowflakeFieldMetricSQLGenerator
+    field_values_metric_sql_generator: SnowflakeFieldValuesMetricSQLGenerator
+
+    @singledispatchmethod
+    def metric_sql(
+        self,
+        assertion: BaseEntityAssertion,
+    ) -> str:
+        """Generates metric SQL that typically returns a numeric metric."""
+        raise ValueError(f"Unsupported assertion type {type(assertion)}")
+
+    @metric_sql.register
+    def _(self, assertion: RowCountChangeVolumeAssertion) -> str:
+        raise ValueError(f"Unsupported assertion type {type(assertion)}")
+
+    @metric_sql.register
+    def _(self, assertion: SqlMetricChangeAssertion) -> str:
+        raise ValueError(f"Unsupported assertion type {type(assertion)}")
+
+    @metric_sql.register
+    def _(self, assertion: FixedIntervalFreshnessAssertion) -> str:
+        entity_name = ".".join(get_entity_name(assertion))
+        if assertion.filters and assertion.filters.sql:
+            where_clause = f"where {assertion.filters.sql}"
+        else:
+            where_clause = ""
+
+        if (
+            assertion.source_type == FreshnessSourceType.LAST_MODIFIED_COLUMN
+            and assertion.last_modified_field
+        ):
+            # Metric: seconds elapsed between the most recent update and the
+            # scheduled evaluation time of the data metric function.
+            return f"""select timediff(
+                second,
+                max({assertion.last_modified_field}::TIMESTAMP_LTZ),
+                SNOWFLAKE.CORE.DATA_METRIC_SCHEDULED_TIME()
+            ) as metric from {entity_name} {where_clause}"""
+        else:
+            raise ValueError(
+                f"Unsupported freshness source type {assertion.source_type}"
+            )
+
+    @metric_sql.register
+    def _(self, assertion: RowCountTotalVolumeAssertion) -> str:
+
+        # Cannot use the information schema here due to the error:
+        # Data metric function body cannot refer to the non-deterministic function 'CURRENT_DATABASE_MAIN_METASTORE_ID'.
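+        # Instead, compute the row count via a direct count(*) over the table itself,
+        # applying the optional dataset filter below.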
+ + entity_name = ".".join(get_entity_name(assertion)) + if assertion.filters and assertion.filters.sql: + where_clause = f"where {assertion.filters.sql}" + else: + where_clause = "" + return f"select count(*) as metric from {entity_name} {where_clause}" + + @metric_sql.register + def _(self, assertion: SqlMetricAssertion) -> str: + return f"select $1 as metric from ({assertion.statement})" + + @metric_sql.register + def _(self, assertion: FieldMetricAssertion) -> str: + sql = self.field_metric_sql_generator.metric_sql(assertion) + return f"select $1 as metric from ({sql})" + + @metric_sql.register + def _(self, assertion: FieldValuesAssertion) -> str: + return self.field_values_metric_sql_generator.metric_sql(assertion) diff --git a/metadata-ingestion/tests/unit/api/entities/assertion/__init__.py b/metadata-ingestion/tests/unit/api/entities/assertion/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/metadata-ingestion/tests/unit/api/entities/assertion/test_assertion_config.yml b/metadata-ingestion/tests/unit/api/entities/assertion/test_assertion_config.yml new file mode 100644 index 00000000000000..a44945a30f9a37 --- /dev/null +++ b/metadata-ingestion/tests/unit/api/entities/assertion/test_assertion_config.yml @@ -0,0 +1,76 @@ +version: 1 +namespace: test-config-id-1 +assertions: + # Freshness Assertion + - entity: urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.public.test_assertions_all_times,PROD) + type: freshness + lookback_interval: "1 hour" + last_modified_field: col_timestamp + schedule: + type: cron + cron: 0 * * * * + meta: + entity_qualified_name: TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES + entity_schema: + - col: col_date + native_type: DATE + # Volume Assertion + - type: volume + entity: urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.public.test_assertions_all_times,PROD) + metric: row_count + condition: + type: less_than_or_equal_to + value: 1000 + schedule: + type: cron + cron: 0 * * * * + meta: + entity_qualified_name: TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES + entity_schema: + - col: col_date + native_type: DATE + # Field Metric Assertion + - type: field + entity: urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.public.test_assertions_all_times,PROD) + field: col_date + metric: null_count + condition: + type: equal_to + value: 0 + schedule: + type: cron + cron: 0 * * * * + meta: + entity_qualified_name: TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES + entity_schema: + - col: col_date + native_type: DATE + # Field Value Assertion + - type: field + entity: urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.public.purchase_event,PROD) + field: quantity + condition: + type: between + min: 0 + max: 10 + schedule: + type: on_table_change + meta: + entity_qualified_name: TEST_DB.PUBLIC.PURCHASE_EVENT + entity_schema: + - col: quantity + native_type: FLOAT + # Custom SQL Metric Assertion + - type: sql + entity: urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.public.purchase_event,PROD) + statement: select mode(quantity) from test_db.public.purchase_event + condition: + type: equal_to + value: 5 + schedule: + type: on_table_change + meta: + entity_qualified_name: TEST_DB.PUBLIC.PURCHASE_EVENT + entity_schema: + - col: quantity + native_type: FLOAT diff --git a/metadata-ingestion/tests/unit/api/entities/assertion/test_assertion_config_spec.py b/metadata-ingestion/tests/unit/api/entities/assertion/test_assertion_config_spec.py new file mode 100644 index 00000000000000..74f13ac7b2a19d --- /dev/null +++ 
b/metadata-ingestion/tests/unit/api/entities/assertion/test_assertion_config_spec.py @@ -0,0 +1,13 @@ +from datahub.api.entities.assertion.assertion_config_spec import AssertionsConfigSpec + + +def test_assertion_config_spec_parses_correct_type(pytestconfig): + config_file = ( + pytestconfig.rootpath + / "tests/unit/api/entities/assertion/test_assertion_config.yml" + ) + + config_spec = AssertionsConfigSpec.from_yaml(config_file) + assert config_spec.version == 1 + assert config_spec.id == "test-config-id-1" + assert len(config_spec.assertions) == 5 diff --git a/metadata-ingestion/tests/unit/cli/assertion/__init__.py b/metadata-ingestion/tests/unit/cli/assertion/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/metadata-ingestion/tests/unit/cli/assertion/dmf_associations.sql b/metadata-ingestion/tests/unit/cli/assertion/dmf_associations.sql new file mode 100644 index 00000000000000..7e6b1982515e0f --- /dev/null +++ b/metadata-ingestion/tests/unit/cli/assertion/dmf_associations.sql @@ -0,0 +1,35 @@ + +-- Start of Assertion 025cce4dd4123c0f007908011a9c64d7 + + ALTER TABLE TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES SET DATA_METRIC_SCHEDULE = 'USING CRON 0 * * * * UTC'; + ALTER TABLE TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES ADD DATA METRIC FUNCTION test_db.datahub_dmfs.datahub__025cce4dd4123c0f007908011a9c64d7 ON (col_date); + +-- End of Assertion 025cce4dd4123c0f007908011a9c64d7 + +-- Start of Assertion 5c32eef47bd763fece7d21c7cbf6c659 + + ALTER TABLE TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES SET DATA_METRIC_SCHEDULE = 'USING CRON 0 * * * * UTC'; + ALTER TABLE TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES ADD DATA METRIC FUNCTION test_db.datahub_dmfs.datahub__5c32eef47bd763fece7d21c7cbf6c659 ON (col_date); + +-- End of Assertion 5c32eef47bd763fece7d21c7cbf6c659 + +-- Start of Assertion 04be4145bd8de10bed3dfcb0cee57842 + + ALTER TABLE TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES SET DATA_METRIC_SCHEDULE = 'USING CRON 0 * * * * UTC'; + ALTER TABLE TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES ADD DATA METRIC FUNCTION test_db.datahub_dmfs.datahub__04be4145bd8de10bed3dfcb0cee57842 ON (col_date); + +-- End of Assertion 04be4145bd8de10bed3dfcb0cee57842 + +-- Start of Assertion b065942d2bca8a4dbe90cc3ec2d9ca9f + + ALTER TABLE TEST_DB.PUBLIC.PURCHASE_EVENT SET DATA_METRIC_SCHEDULE = 'TRIGGER_ON_CHANGES'; + ALTER TABLE TEST_DB.PUBLIC.PURCHASE_EVENT ADD DATA METRIC FUNCTION test_db.datahub_dmfs.datahub__b065942d2bca8a4dbe90cc3ec2d9ca9f ON (quantity); + +-- End of Assertion b065942d2bca8a4dbe90cc3ec2d9ca9f + +-- Start of Assertion 170dbd53f28eedbbaba52ebbf189f6b1 + + ALTER TABLE TEST_DB.PUBLIC.PURCHASE_EVENT SET DATA_METRIC_SCHEDULE = 'TRIGGER_ON_CHANGES'; + ALTER TABLE TEST_DB.PUBLIC.PURCHASE_EVENT ADD DATA METRIC FUNCTION test_db.datahub_dmfs.datahub__170dbd53f28eedbbaba52ebbf189f6b1 ON (quantity); + +-- End of Assertion 170dbd53f28eedbbaba52ebbf189f6b1 diff --git a/metadata-ingestion/tests/unit/cli/assertion/dmf_definitions.sql b/metadata-ingestion/tests/unit/cli/assertion/dmf_definitions.sql new file mode 100644 index 00000000000000..85056e150b9b33 --- /dev/null +++ b/metadata-ingestion/tests/unit/cli/assertion/dmf_definitions.sql @@ -0,0 +1,71 @@ + +-- Start of Assertion 025cce4dd4123c0f007908011a9c64d7 + + CREATE or REPLACE DATA METRIC FUNCTION + test_db.datahub_dmfs.datahub__025cce4dd4123c0f007908011a9c64d7 (ARGT TABLE(col_date DATE)) + RETURNS NUMBER + COMMENT = 'Created via DataHub for assertion urn:li:assertion:025cce4dd4123c0f007908011a9c64d7 of type freshness' + AS + $$ + 
select case when metric <= 3600 then 1 else 0 end from (select timediff( + second, + max(col_timestamp::TIMESTAMP_LTZ), + SNOWFLAKE.CORE.DATA_METRIC_SCHEDULED_TIME() + ) as metric from TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES ) + $$; + +-- End of Assertion 025cce4dd4123c0f007908011a9c64d7 + +-- Start of Assertion 5c32eef47bd763fece7d21c7cbf6c659 + + CREATE or REPLACE DATA METRIC FUNCTION + test_db.datahub_dmfs.datahub__5c32eef47bd763fece7d21c7cbf6c659 (ARGT TABLE(col_date DATE)) + RETURNS NUMBER + COMMENT = 'Created via DataHub for assertion urn:li:assertion:5c32eef47bd763fece7d21c7cbf6c659 of type volume' + AS + $$ + select case when metric <= 1000 then 1 else 0 end from (select count(*) as metric from TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES ) + $$; + +-- End of Assertion 5c32eef47bd763fece7d21c7cbf6c659 + +-- Start of Assertion 04be4145bd8de10bed3dfcb0cee57842 + + CREATE or REPLACE DATA METRIC FUNCTION + test_db.datahub_dmfs.datahub__04be4145bd8de10bed3dfcb0cee57842 (ARGT TABLE(col_date DATE)) + RETURNS NUMBER + COMMENT = 'Created via DataHub for assertion urn:li:assertion:04be4145bd8de10bed3dfcb0cee57842 of type field' + AS + $$ + select case when metric=0 then 1 else 0 end from (select $1 as metric from (select count(*) + from TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES where col_date is null)) + $$; + +-- End of Assertion 04be4145bd8de10bed3dfcb0cee57842 + +-- Start of Assertion b065942d2bca8a4dbe90cc3ec2d9ca9f + + CREATE or REPLACE DATA METRIC FUNCTION + test_db.datahub_dmfs.datahub__b065942d2bca8a4dbe90cc3ec2d9ca9f (ARGT TABLE(quantity FLOAT)) + RETURNS NUMBER + COMMENT = 'Created via DataHub for assertion urn:li:assertion:b065942d2bca8a4dbe90cc3ec2d9ca9f of type field' + AS + $$ + select case when metric <= 0 then 1 else 0 end from (select sum($1) as metric from (select case when quantity between 0 and 10 then 0 else 1 end + from TEST_DB.PUBLIC.PURCHASE_EVENT where quantity is not null)) + $$; + +-- End of Assertion b065942d2bca8a4dbe90cc3ec2d9ca9f + +-- Start of Assertion 170dbd53f28eedbbaba52ebbf189f6b1 + + CREATE or REPLACE DATA METRIC FUNCTION + test_db.datahub_dmfs.datahub__170dbd53f28eedbbaba52ebbf189f6b1 (ARGT TABLE(quantity FLOAT)) + RETURNS NUMBER + COMMENT = 'Created via DataHub for assertion urn:li:assertion:170dbd53f28eedbbaba52ebbf189f6b1 of type sql' + AS + $$ + select case when metric=5 then 1 else 0 end from (select $1 as metric from (select mode(quantity) from test_db.public.purchase_event)) + $$; + +-- End of Assertion 170dbd53f28eedbbaba52ebbf189f6b1 diff --git a/metadata-ingestion/tests/unit/cli/assertion/test_compile.py b/metadata-ingestion/tests/unit/cli/assertion/test_compile.py new file mode 100644 index 00000000000000..47253b5b0d71ea --- /dev/null +++ b/metadata-ingestion/tests/unit/cli/assertion/test_compile.py @@ -0,0 +1,42 @@ +import filecmp +import os + +from datahub.integrations.assertion.snowflake.compiler import ( + DMF_ASSOCIATIONS_FILE_NAME, + DMF_DEFINITIONS_FILE_NAME, +) +from tests.test_helpers.click_helpers import run_datahub_cmd + + +def test_compile_assertion_config_spec_for_snowflake(pytestconfig, tmp_path): + config_file = ( + pytestconfig.rootpath + / "tests/unit/api/entities/assertion/test_assertion_config.yml" + ).resolve() + + golden_file_path = pytestconfig.rootpath / "tests/unit/cli/assertion/" + run_datahub_cmd( + [ + "assertions", + "compile", + "-f", + f"{config_file}", + "-p", + "snowflake", + "-x", + "DMF_SCHEMA=test_db.datahub_dmfs", + "-o", + tmp_path, + ], + ) + + output_file_names = [ + DMF_DEFINITIONS_FILE_NAME, + 
DMF_ASSOCIATIONS_FILE_NAME, + ] + + for file_name in output_file_names: + assert os.path.exists(tmp_path / file_name) + assert filecmp.cmp( + golden_file_path / file_name, tmp_path / file_name + ), f"{file_name} is not as expected" diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionInfo.pdl index 5b60aa18e87da9..65196a69ce3660 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionInfo.pdl @@ -2,6 +2,7 @@ namespace com.linkedin.assertion import com.linkedin.common.CustomProperties import com.linkedin.common.ExternalReference +import com.linkedin.common.AuditStamp /** * Information about an assertion @@ -66,10 +67,15 @@ record AssertionInfo includes CustomProperties, ExternalReference { volumeAssertion: optional VolumeAssertionInfo /** - * A SQL Assertion definition. This field is populated when the type is SQL. + * A SQL Assertion definition. This field is populated when the type is SQL. */ sqlAssertion: optional SqlAssertionInfo + /** + * A Field Assertion definition. This field is populated when the type is FIELD. + */ + fieldAssertion: optional FieldAssertionInfo + /** * An schema Assertion definition. This field is populated when the type is DATA_SCHEMA */ @@ -83,6 +89,12 @@ record AssertionInfo includes CustomProperties, ExternalReference { */ source: optional AssertionSource + /** + * The time at which the assertion was last updated and the actor who updated it. + * This field is only present for Native assertions updated after this field was introduced. + */ + lastUpdated: optional AuditStamp + /** * An optional human-readable description of the assertion */ diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResultError.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResultError.pdl index e768fe8521942f..4bbfa20f8663ec 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResultError.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResultError.pdl @@ -33,6 +33,14 @@ record AssertionResultError { */ UNSUPPORTED_PLATFORM /** + * Error while executing a custom SQL assertion + */ + CUSTOM_SQL_ERROR + /** + * Error while executing a field assertion + */ + FIELD_ASSERTION_ERROR + /** * Unknown error */ UNKNOWN_ERROR @@ -42,4 +50,4 @@ record AssertionResultError { * Additional metadata depending on the type of error */ properties: optional map[string, string] -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionSource.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionSource.pdl index d8892c0c71c6f6..734a48f7718863 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionSource.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionSource.pdl @@ -1,5 +1,7 @@ namespace com.linkedin.assertion +import com.linkedin.common.AuditStamp + /** * The source of an assertion */ @@ -24,4 +26,10 @@ record AssertionSource { */ INFERRED } + + /** + * The time at which the assertion was initially created and the author who created it. + * This field is only present for Native assertions created after this field was introduced. 
+   */
+  created: optional AuditStamp
}
\ No newline at end of file
diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionStdOperator.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionStdOperator.pdl
index 2e0dcbe24986b8..ee4f9612490258 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionStdOperator.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionStdOperator.pdl
@@ -34,6 +34,16 @@ enum AssertionStdOperator {
   */
  EQUAL_TO

+  /**
+   * Value being asserted is not equal to value. Requires 'value' parameter.
+   */
+  NOT_EQUAL_TO
+
+  /**
+   * Value being asserted is null. Requires no parameters.
+   */
+  NULL
+
  /**
   * Value being asserted is not null. Requires no parameters.
   */
@@ -69,6 +79,16 @@ enum AssertionStdOperator {
   */
  NOT_IN

+  /**
+   * Value being asserted is true. Requires no parameters.
+   */
+  IS_TRUE
+
+  /**
+   * Value being asserted is false. Requires no parameters.
+   */
+  IS_FALSE
+
  /**
   * Other
   */
diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionStdParameter.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionStdParameter.pdl
index a212fe84aff13f..9c3e3ea7c1c958 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionStdParameter.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionStdParameter.pdl
@@ -13,10 +13,29 @@ record AssertionStdParameter {
    * The type of the parameter
    */
   type: enum AssertionStdParameterType {
+    /**
+     * A string value
+     */
    STRING
+
+    /**
+     * A numeric value
+     */
    NUMBER
+
+    /**
+     * A list of values. When used, value should be formatted as a serialized JSON array.
+     */
    LIST
+
+    /**
+     * A set of values. When used, value should be formatted as a serialized JSON array.
+     */
    SET
+
+    /**
+     * A value of unknown type
+     */
    UNKNOWN
  }
}
\ No newline at end of file
diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/FieldAssertionInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/FieldAssertionInfo.pdl
new file mode 100644
index 00000000000000..0b8d9ab8cceb8f
--- /dev/null
+++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/FieldAssertionInfo.pdl
@@ -0,0 +1,57 @@
+namespace com.linkedin.assertion
+
+import com.linkedin.common.Urn
+import com.linkedin.dataset.DatasetFilter
+
+/**
+* Attributes defining a Field Assertion.
+**/
+record FieldAssertionInfo {
+    /**
+    * The type of the field assertion being monitored.
+    */
+    @Searchable = {}
+    type: enum FieldAssertionType {
+      /**
+       * An assertion used to validate the values contained within a field / column given a set of rows.
+       */
+      FIELD_VALUES
+      /**
+       * An assertion used to validate the value of a common field / column metric (e.g. aggregation) such as null count + percentage,
+       * min, max, median, and more.
+       */
+      FIELD_METRIC
+    }
+
+    /**
+    * The entity targeted by this Field check.
+    */
+    @Searchable = {
+      "fieldType": "URN"
+    }
+    @Relationship = {
+      "name": "Asserts",
+      "entityTypes": [ "dataset" ]
+    }
+    entity: Urn
+
+    /**
+     * The definition of an assertion that validates individual values of a field / column for a set of rows.
+     * This type of assertion verifies that each column value meets a particular requirement.
+     */
+    fieldValuesAssertion: optional FieldValuesAssertion
+
+    /**
+     * The definition of an assertion that validates a common metric obtained about a field / column for a set of rows.
+     * This type of assertion verifies that the value of a high-level metric obtained by aggregating over a column meets
+     * expectations.
+     */
+    fieldMetricAssertion: optional FieldMetricAssertion
+
+    /**
+    * A definition of the specific filters that should be applied when performing monitoring.
+    * If not provided, there is no filter, and the full table is under consideration.
+    * If using DataHub Dataset Profiles as the assertion source type, the value of this field will be ignored.
+    */
+    filter: optional DatasetFilter
+}
\ No newline at end of file
diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/FieldMetricAssertion.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/FieldMetricAssertion.pdl
new file mode 100644
index 00000000000000..ca9ce9cbd6a8c3
--- /dev/null
+++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/FieldMetricAssertion.pdl
@@ -0,0 +1,39 @@
+namespace com.linkedin.assertion
+
+import com.linkedin.schema.SchemaFieldSpec
+
+/**
+* Attributes defining a field metric assertion, which asserts an expectation against
+* a common metric derived from the set of field / column values, for example:
+* max, min, median, null count, null percentage, unique count, unique percentage, and more.
+*/
+record FieldMetricAssertion {
+    /**
+    * The field under evaluation
+    */
+    @Searchable = {
+      "/path": {
+        "fieldName": "fieldPath"
+      }
+    }
+    field: SchemaFieldSpec
+
+    /**
+     * The specific metric to assert against. This is the value that
+     * will be obtained by applying a standard operation, such as an aggregation,
+     * to the selected field.
+     */
+    metric: FieldMetricType
+
+    /**
+     * The predicate to evaluate against the metric for the field / column.
+     * Depending on the operator, parameters may be required in order to successfully
+     * evaluate the assertion against the metric value.
+     */
+    operator: AssertionStdOperator
+
+    /**
+    * Standard parameters required for the assertion. e.g. min_value, max_value, value, columns
+    */
+    parameters: optional AssertionStdParameters
+}
\ No newline at end of file
diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/FieldMetricType.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/FieldMetricType.pdl
new file mode 100644
index 00000000000000..9df06e9dc1fe2c
--- /dev/null
+++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/FieldMetricType.pdl
@@ -0,0 +1,94 @@
+namespace com.linkedin.assertion
+
+/**
+ * A standard metric that can be derived from the set of values
+ * for a specific field / column of a dataset / table.
+ */
+enum FieldMetricType {
+  /**
+   * The number of unique values found in the column value set
+   */
+  UNIQUE_COUNT
+
+  /**
+   * The percentage of unique values to total rows for the dataset
+   */
+  UNIQUE_PERCENTAGE
+
+  /**
+   * The number of null values found in the column value set
+   */
+  NULL_COUNT
+
+  /**
+   * The percentage of null values to total rows for the dataset
+   */
+  NULL_PERCENTAGE
+
+  /**
+   * The minimum value in the column set (applies to numeric columns)
+   */
+  MIN
+
+  /**
+   * The maximum value in the column set (applies to numeric columns)
+   */
+  MAX
+
+  /**
+   * The mean value found in the column set (applies to numeric columns)
+   */
+  MEAN
+
+  /**
+   * The median value found in the column set (applies to numeric columns)
+   */
+  MEDIAN
+
+  /**
+   * The stddev of values found in the column set (applies to numeric columns)
+   */
+  STDDEV
+
+  /**
+   * The number of negative values found in the value set (applies to numeric columns)
+   */
+  NEGATIVE_COUNT
+
+  /**
+   * The percentage of negative values to total rows for the dataset (applies to numeric columns)
+   */
+  NEGATIVE_PERCENTAGE
+
+  /**
+   * The number of zero values found in the value set (applies to numeric columns)
+   */
+  ZERO_COUNT
+
+  /**
+   * The percentage of zero values to total rows for the dataset (applies to numeric columns)
+   */
+  ZERO_PERCENTAGE
+
+  /**
+   * The minimum length found in the column set (applies to string columns)
+   */
+  MIN_LENGTH
+
+  /**
+   * The maximum length found in the column set (applies to string columns)
+   */
+  MAX_LENGTH
+
+  /**
+   * The number of empty string values found in the value set (applies to string columns).
+   * Note: This is a completely different metric from NULL_COUNT!
+   */
+  EMPTY_COUNT
+
+  /**
+   * The percentage of empty string values to total rows for the dataset (applies to string columns)
+   * Note: This is a completely different metric from NULL_PERCENTAGE!
+   */
+  EMPTY_PERCENTAGE
+}
\ No newline at end of file
diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/FieldTransform.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/FieldTransform.pdl
new file mode 100644
index 00000000000000..3b3d3339a9b864
--- /dev/null
+++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/FieldTransform.pdl
@@ -0,0 +1,21 @@
+namespace com.linkedin.assertion
+
+/**
+* Definition of a transform applied to the values of a column / field.
+* Note that the applicability of a field transform ultimately depends on the native type
+* of the field / column.
+*
+* The model has a single field to permit extension.
+*/
+record FieldTransform {
+    /**
+    * The type of the field transform, e.g. the transformation
+    * function / operator to apply.
+    */
+    type: enum FieldTransformType {
+      /**
+       * Obtain the length of a string field / column (applicable to string types)
+       */
+      LENGTH
+    }
+}
\ No newline at end of file
diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/FieldValuesAssertion.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/FieldValuesAssertion.pdl
new file mode 100644
index 00000000000000..0400124234462d
--- /dev/null
+++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/FieldValuesAssertion.pdl
@@ -0,0 +1,83 @@
+namespace com.linkedin.assertion
+
+import com.linkedin.schema.SchemaFieldSpec
+
+/**
+* Attributes defining a field values assertion, which asserts that the values for a field / column
+* of a dataset / table match a set of expectations.
+*
+* In other words, this type of assertion acts as a semantic constraint applied to the values of a specific field / column.
+*
+* TODO: We should display the "failed row count" to the user if the column fails the verification rules.
+* TODO: Determine whether we need an "operator" that can be applied to the field.
+*/
+record FieldValuesAssertion {
+    /**
+    * The field under evaluation
+    */
+    @Searchable = {
+      "/path": {
+        "fieldName": "fieldPath"
+      }
+    }
+    field: SchemaFieldSpec
+
+    /**
+     * An optional transform to apply to field values
+     * before evaluating the operator.
+     *
+     * If none is applied, the field value will be compared as is.
+     */
+    transform: optional FieldTransform
+
+    /**
+     * The predicate to evaluate against a single value of the field.
+     * Depending on the operator, parameters may be required in order to successfully
+     * evaluate the assertion against the field value.
+     */
+    operator: AssertionStdOperator
+
+    /**
+    * Standard parameters required for the assertion. e.g. min_value, max_value, value, columns
+    */
+    parameters: optional AssertionStdParameters
+
+    /**
+    * Additional customization about when the assertion
+    * should be officially considered failing.
+    */
+    failThreshold: record FieldValuesFailThreshold {
+
+       /**
+        * The type of failure threshold. Either based on the number
+        * of column values (rows) that fail the expectations, or the percentage
+        * of the total rows under consideration.
+        */
+       type: enum FieldValuesFailThresholdType {
+          /*
+           * The maximum number of column values (i.e. rows) that are allowed
+           * to fail the defined expectations before the assertion officially fails.
+           */
+          COUNT
+          /*
+           * The maximum percentage of rows that are allowed
+           * to fail the defined column expectations before the assertion officially fails.
+           */
+          PERCENTAGE
+       } = "COUNT"
+
+       /**
+        * By default this is 0, meaning that ALL column values (i.e. rows) must
+        * meet the defined expectations.
+        */
+       value: long = 0
+    }
+
+    /**
+     * Whether to exclude nulls when running the values assertion, i.e. evaluate
+     * only non-null values. Applies to operators OTHER than the IS_NULL operator.
+     *
+     * Defaults to true, meaning null values are skipped rather than counted as failures.
+     */
+    excludeNulls: boolean = true
+}
\ No newline at end of file
diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessFieldSpec.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessFieldSpec.pdl
index 04acd1c71352de..179d4a1b135913 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessFieldSpec.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessFieldSpec.pdl
@@ -4,11 +4,13 @@
import com.linkedin.schema.SchemaFieldSpec

 /**
-* Lightweight spec used for referencing a particular schema field.
-**/
+* Lightweight spec used for referencing a particular schema field that is used to compute
+* a freshness signal or operation.
+* TODO: Since this is now leveraged across assertions & metrics / operations, we should consider moving this to a common package.
+*/
record FreshnessFieldSpec includes SchemaFieldSpec {
  /**
-   * The type of the field being used to verify the Freshness Assertion.
+   * The type of the field being used to verify the Freshness of the asset.
*/ kind: optional FreshnessFieldKind } \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/SchemaAssertionInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/SchemaAssertionInfo.pdl index fd246e0c7cfc46..2e691d5152ae34 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/assertion/SchemaAssertionInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/SchemaAssertionInfo.pdl @@ -25,5 +25,36 @@ record SchemaAssertionInfo { * Note that many of the fields of this model, especially those related to metadata (tags, terms) * will go unused in this context. */ - schema: SchemaMetadata +// @Relationship = { +// "/foreignKeys/*/foreignFields/*": null, +// "/foreignKeys/*/foreignDataset": null, +// "/fields/*/globalTags/tags/*/tag": null, +// "/fields/*/glossaryTerms/terms/*/urn": null +// } +// @Searchable = { +// "/fields/*/fieldPath": null, +// "/fields/*/description": null, +// "/fields/*/label": null, +// "/fields/*/globalTags/tags/*/tag": null, +// "/fields/*/glossaryTerms/terms/*/urn": null +// } + schema: SchemaMetadata + + /** + * The required compatibility level for the schema assertion to pass. + */ + compatibility: optional enum SchemaAssertionCompatibility { + /** + * The actual schema must be exactly the same as the expected schema + */ + EXACT_MATCH, + /** + * The actual schema must be a superset of the expected schema + */ + SUPERSET, + /** + * The actual schema must be a subset of the expected schema + */ + SUBSET + } = "EXACT_MATCH" } \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/VolumeAssertionInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/VolumeAssertionInfo.pdl index 327b76f95762e3..bdc78d3bd0a6f2 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/assertion/VolumeAssertionInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/VolumeAssertionInfo.pdl @@ -8,7 +8,7 @@ import com.linkedin.dataset.DatasetFilter */ record VolumeAssertionInfo { /** - * The type of the freshness assertion being monitored. + * The type of the volume assertion being monitored. */ @Searchable = {} type: enum VolumeAssertionType { diff --git a/metadata-models/src/main/pegasus/com/linkedin/datacontract/DataQualityContract.pdl b/metadata-models/src/main/pegasus/com/linkedin/datacontract/DataQualityContract.pdl index 273d2c2a56f95b..3ff8b58284f189 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/datacontract/DataQualityContract.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/datacontract/DataQualityContract.pdl @@ -12,5 +12,9 @@ record DataQualityContract { * The assertion representing the Data Quality contract. * E.g. a table or column-level assertion. */ + @Relationship = { + "name": "IncludesDataQualityAssertion", + "entityTypes": [ "assertion" ] + } assertion: Urn } \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/datacontract/SchemaContract.pdl b/metadata-models/src/main/pegasus/com/linkedin/datacontract/SchemaContract.pdl index 6c11e0da5b1286..af61a660cdf768 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/datacontract/SchemaContract.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/datacontract/SchemaContract.pdl @@ -9,5 +9,9 @@ record SchemaContract { /** * The assertion representing the schema contract. 
*/
+  @Relationship = {
+    "name": "IncludesSchemaAssertion",
+    "entityTypes": [ "assertion" ]
+  }
  assertion: Urn
}
diff --git a/metadata-models/src/main/pegasus/com/linkedin/incident/IncidentSource.pdl b/metadata-models/src/main/pegasus/com/linkedin/incident/IncidentSource.pdl
index 2f8912da5458c9..2e65d37dc09398 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/incident/IncidentSource.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/incident/IncidentSource.pdl
@@ -22,6 +22,11 @@ record IncidentSource {
    * Manually created incident, via UI or API.
    */
    MANUAL
+
+    /**
+     * An assertion has failed, triggering the incident.
+     */
+    ASSERTION_FAILURE
  }

  /**
diff --git a/metadata-models/src/main/pegasus/com/linkedin/incident/IncidentType.pdl b/metadata-models/src/main/pegasus/com/linkedin/incident/IncidentType.pdl
index 27c4790e3b6ef6..1c3473018d4e0a 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/incident/IncidentType.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/incident/IncidentType.pdl
@@ -4,6 +4,36 @@ namespace com.linkedin.incident
 * A type of asset incident
 */
enum IncidentType {
+  /**
+   * A Freshness Assertion has failed, triggering the incident.
+   * Raised on entities where assertions are configured to generate incidents.
+   */
+  FRESHNESS
+
+  /**
+   * A Volume Assertion has failed, triggering the incident.
+   * Raised on entities where assertions are configured to generate incidents.
+   */
+  VOLUME
+
+  /**
+   * A Field Assertion has failed, triggering the incident.
+   * Raised on entities where assertions are configured to generate incidents.
+   */
+  FIELD
+
+  /**
+   * A raw SQL-statement based assertion has failed, triggering the incident.
+   * Raised on entities where assertions are configured to generate incidents.
+   */
+  SQL
+
+  /**
+   * A Data Schema assertion has failed, triggering the incident.
+   * Raised on entities where assertions are configured to generate incidents.
+   */
+  DATA_SCHEMA
+
  /**
   * A misc. operational incident, e.g. failure to materialize a dataset.
   */
diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/graphql/GraphQLEngineFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/graphql/GraphQLEngineFactory.java
index 8c0a079f1e61d1..aa80fc62db09c4 100644
--- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/graphql/GraphQLEngineFactory.java
+++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/graphql/GraphQLEngineFactory.java
@@ -68,7 +68,7 @@
  EntityRegistryFactory.class,
  DataHubTokenServiceFactory.class,
  GitVersionFactory.class,
-  SiblingGraphServiceFactory.class
+  SiblingGraphServiceFactory.class,
})
public class GraphQLEngineFactory {
  @Autowired