From 9346fe2f178f9e7c5c8e318ebb7f5fac0e0964ce Mon Sep 17 00:00:00 2001 From: Moritz Meister <8422705+moritzmeister@users.noreply.github.com> Date: Wed, 13 Jul 2022 12:15:39 +0200 Subject: [PATCH] [HOPSWORKS-3178] Remove Deequ based data validation in favour of GE (#655) --- auto_doc.py | 70 --- .../hsfs/ExternalFeatureGroup.java | 14 +- .../com/logicalclocks/hsfs/FeatureGroup.java | 13 +- .../com/logicalclocks/hsfs/FeatureStore.java | 72 +-- .../hsfs/HopsworksConnection.java | 12 - .../hsfs/StreamFeatureGroup.java | 13 +- .../hsfs/constructor/FsQuery.java | 1 - .../hsfs/engine/Constraint.scala | 23 - .../hsfs/engine/ConstraintGroup.scala | 19 - .../hsfs/engine/DataValidationEngine.java | 327 ------------- .../hsfs/engine/DeequEngine.scala | 127 ----- .../hsfs/engine/ExpectationsEngine.java | 32 -- .../engine/ExternalFeatureGroupEngine.java | 5 - .../hsfs/engine/FeatureGroupBaseEngine.java | 6 - .../hsfs/engine/FeatureGroupEngine.java | 14 - .../hsfs/engine/StreamFeatureGroupEngine.java | 6 - .../hsfs/metadata/Expectation.java | 84 ---- .../hsfs/metadata/ExpectationResult.java | 86 ---- .../hsfs/metadata/ExpectationsApi.java | 245 ---------- .../hsfs/metadata/FeatureGroupApi.java | 8 +- .../hsfs/metadata/FeatureGroupBase.java | 109 ----- .../hsfs/metadata/FeatureGroupValidation.java | 60 --- .../metadata/FeatureGroupValidations.java | 49 -- .../metadata/FeatureGroupValidationsApi.java | 141 ------ .../hsfs/metadata/RuleDefinition.java | 52 -- .../logicalclocks/hsfs/metadata/RulesApi.java | 58 --- .../hsfs/metadata/ValidationResult.java | 47 -- .../metadata/validation/AcceptedType.java | 21 - .../hsfs/metadata/validation/FeatureType.java | 5 - .../hsfs/metadata/validation/Level.java | 22 - .../hsfs/metadata/validation/Predicate.java | 24 - .../hsfs/metadata/validation/Rule.java | 74 --- .../hsfs/metadata/validation/RuleName.java | 47 -- .../metadata/validation/ValidationType.java | 46 -- mkdocs.yml | 4 - python/hsfs/connection.py | 13 +- python/hsfs/core/data_validation_engine.py | 154 ------ python/hsfs/core/expectations_api.py | 144 ------ python/hsfs/core/expectations_engine.py | 31 -- python/hsfs/core/feature_group_api.py | 3 +- python/hsfs/core/feature_group_base_engine.py | 9 - python/hsfs/core/feature_group_engine.py | 14 - python/hsfs/core/rules_api.py | 42 -- python/hsfs/core/validations_api.py | 79 ---- python/hsfs/engine/spark.py | 47 -- python/hsfs/expectation.py | 106 ----- python/hsfs/expectation_result.py | 86 ---- python/hsfs/feature_group.py | 446 ++++++------------ python/hsfs/feature_group_validation.py | 137 ------ python/hsfs/feature_store.py | 135 +----- python/hsfs/rule.py | 147 ------ python/hsfs/ruledefinition.py | 94 ---- python/hsfs/validation_result.py | 109 ----- 53 files changed, 180 insertions(+), 3552 deletions(-) delete mode 100644 java/src/main/java/com/logicalclocks/hsfs/engine/Constraint.scala delete mode 100644 java/src/main/java/com/logicalclocks/hsfs/engine/ConstraintGroup.scala delete mode 100644 java/src/main/java/com/logicalclocks/hsfs/engine/DataValidationEngine.java delete mode 100644 java/src/main/java/com/logicalclocks/hsfs/engine/DeequEngine.scala delete mode 100644 java/src/main/java/com/logicalclocks/hsfs/engine/ExpectationsEngine.java delete mode 100644 java/src/main/java/com/logicalclocks/hsfs/metadata/Expectation.java delete mode 100644 java/src/main/java/com/logicalclocks/hsfs/metadata/ExpectationResult.java delete mode 100644 java/src/main/java/com/logicalclocks/hsfs/metadata/ExpectationsApi.java delete mode 100644 java/src/main/java/com/logicalclocks/hsfs/metadata/FeatureGroupValidation.java delete mode 100644 java/src/main/java/com/logicalclocks/hsfs/metadata/FeatureGroupValidations.java delete mode 100644 java/src/main/java/com/logicalclocks/hsfs/metadata/FeatureGroupValidationsApi.java delete mode 100644 java/src/main/java/com/logicalclocks/hsfs/metadata/RuleDefinition.java delete mode 100644 java/src/main/java/com/logicalclocks/hsfs/metadata/RulesApi.java delete mode 100644 java/src/main/java/com/logicalclocks/hsfs/metadata/ValidationResult.java delete mode 100644 java/src/main/java/com/logicalclocks/hsfs/metadata/validation/AcceptedType.java delete mode 100644 java/src/main/java/com/logicalclocks/hsfs/metadata/validation/FeatureType.java delete mode 100644 java/src/main/java/com/logicalclocks/hsfs/metadata/validation/Level.java delete mode 100644 java/src/main/java/com/logicalclocks/hsfs/metadata/validation/Predicate.java delete mode 100644 java/src/main/java/com/logicalclocks/hsfs/metadata/validation/Rule.java delete mode 100644 java/src/main/java/com/logicalclocks/hsfs/metadata/validation/RuleName.java delete mode 100644 java/src/main/java/com/logicalclocks/hsfs/metadata/validation/ValidationType.java delete mode 100644 python/hsfs/core/data_validation_engine.py delete mode 100644 python/hsfs/core/expectations_api.py delete mode 100644 python/hsfs/core/expectations_engine.py delete mode 100644 python/hsfs/core/rules_api.py delete mode 100644 python/hsfs/core/validations_api.py delete mode 100644 python/hsfs/expectation.py delete mode 100644 python/hsfs/expectation_result.py delete mode 100644 python/hsfs/feature_group_validation.py delete mode 100644 python/hsfs/rule.py delete mode 100644 python/hsfs/ruledefinition.py delete mode 100644 python/hsfs/validation_result.py diff --git a/auto_doc.py b/auto_doc.py index d38095524d..bc61408fb1 100644 --- a/auto_doc.py +++ b/auto_doc.py @@ -140,38 +140,6 @@ "hsfs.statistics_config.StatisticsConfig" ), }, - "feature_validation.md": { - "rule": ["hsfs.rule.Rule"], - "rule_properties": keras_autodoc.get_properties("hsfs.rule.Rule"), - "ruledefinition": ["hsfs.ruledefinition.RuleDefinition"], - "ruledefinition_getall": ["hsfs.connection.Connection.get_rules"], - "ruledefinition_get": ["hsfs.connection.Connection.get_rule"], - "ruledefinition_properties": keras_autodoc.get_properties( - "hsfs.ruledefinition.RuleDefinition" - ), - "expectation": ["hsfs.expectation.Expectation"], - "expectation_properties": keras_autodoc.get_properties( - "hsfs.expectation.Expectation" - ), - "expectation_methods": keras_autodoc.get_methods( - "hsfs.expectation.Expectation", - exclude=[ - "from_response_json", - "update_from_response_json", - "json", - "to_dict", - ], - ), - "expectation_create": ["hsfs.feature_store.FeatureStore.create_expectation"], - "expectation_get": ["hsfs.feature_store.FeatureStore.get_expectation"], - "expectation_getall": ["hsfs.feature_store.FeatureStore.get_expectations"], - "validation_result": ["hsfs.validation_result.ValidationResult"], - "validation_result_properties": keras_autodoc.get_properties( - "hsfs.validation_result.ValidationResult" - ), - "validate": ["hsfs.feature_group.FeatureGroup.validate"], - "validation_result_get": ["hsfs.feature_group.FeatureGroup.get_validations"], - }, "tags.md": { "fg_tag_add": ["hsfs.feature_group.FeatureGroupBase.add_tag"], "fg_tag_get": ["hsfs.feature_group.FeatureGroupBase.get_tag"], @@ -308,44 +276,6 @@ "hsfs.statistics_config.StatisticsConfig" ), }, - "api/rule_api.md": { - "rule": ["hsfs.rule.Rule"], - "rule_properties": keras_autodoc.get_properties("hsfs.rule.Rule"), - }, - "api/rule_definition_api.md": { - "ruledefinition": ["hsfs.ruledefinition.RuleDefinition"], - "ruledefinition_getall": ["hsfs.connection.Connection.get_rules"], - "ruledefinition_get": ["hsfs.connection.Connection.get_rule"], - "ruledefinition_properties": keras_autodoc.get_properties( - "hsfs.ruledefinition.RuleDefinition" - ), - }, - "api/expectation_api.md": { - "expectation": ["hsfs.expectation.Expectation"], - "expectation_properties": keras_autodoc.get_properties( - "hsfs.expectation.Expectation" - ), - "expectation_methods": keras_autodoc.get_methods( - "hsfs.expectation.Expectation", - exclude=[ - "from_response_json", - "update_from_response_json", - "json", - "to_dict", - ], - ), - "expectation_create": ["hsfs.feature_store.FeatureStore.create_expectation"], - "expectation_get": ["hsfs.feature_store.FeatureStore.get_expectation"], - "expectation_getall": ["hsfs.feature_store.FeatureStore.get_expectations"], - }, - "api/validation_api.md": { - "validation_result": ["hsfs.validation_result.ValidationResult"], - "validation_result_properties": keras_autodoc.get_properties( - "hsfs.validation_result.ValidationResult" - ), - "validate": ["hsfs.feature_group.FeatureGroup.validate"], - "validation_result_get": ["hsfs.feature_group.FeatureGroup.get_validations"], - }, "api/transformation_functions_api.md": { "transformation_function": [ "hsfs.transformation_function.TransformationFunction" diff --git a/java/src/main/java/com/logicalclocks/hsfs/ExternalFeatureGroup.java b/java/src/main/java/com/logicalclocks/hsfs/ExternalFeatureGroup.java index 814ef7936c..5b5014f52e 100644 --- a/java/src/main/java/com/logicalclocks/hsfs/ExternalFeatureGroup.java +++ b/java/src/main/java/com/logicalclocks/hsfs/ExternalFeatureGroup.java @@ -19,10 +19,8 @@ import com.fasterxml.jackson.annotation.JsonIgnoreProperties; import com.logicalclocks.hsfs.engine.ExternalFeatureGroupEngine; import com.logicalclocks.hsfs.engine.CodeEngine; -import com.logicalclocks.hsfs.metadata.Expectation; import com.logicalclocks.hsfs.metadata.FeatureGroupBase; import com.logicalclocks.hsfs.metadata.OnDemandOptions; -import com.logicalclocks.hsfs.metadata.validation.ValidationType; import lombok.AllArgsConstructor; import lombok.Builder; import lombok.Getter; @@ -30,10 +28,8 @@ import lombok.Setter; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; -import scala.collection.JavaConverters; import java.io.IOException; -import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.stream.Collectors; @@ -73,9 +69,7 @@ public class ExternalFeatureGroup extends FeatureGroupBase { public ExternalFeatureGroup(FeatureStore featureStore, @NonNull String name, Integer version, String query, ExternalDataFormat dataFormat, String path, Map options, @NonNull StorageConnector storageConnector, String description, List primaryKeys, - List features, StatisticsConfig statisticsConfig, - scala.collection.Seq expectations, - ValidationType validationType, String eventTime) { + List features, StatisticsConfig statisticsConfig, String eventTime) { this.featureStore = featureStore; this.name = name; this.version = version; @@ -93,12 +87,6 @@ public ExternalFeatureGroup(FeatureStore featureStore, @NonNull String name, Int this.features = features; this.statisticsConfig = statisticsConfig != null ? statisticsConfig : new StatisticsConfig(); this.eventTime = eventTime; - this.validationType = validationType != null ? validationType : ValidationType.NONE; - if (expectations != null && !expectations.isEmpty()) { - this.expectationsNames = new ArrayList<>(); - this.expectations = JavaConverters.seqAsJavaListConverter(expectations).asJava(); - this.expectations.forEach(expectation -> this.expectationsNames.add(expectation.getName())); - } } public ExternalFeatureGroup() { diff --git a/java/src/main/java/com/logicalclocks/hsfs/FeatureGroup.java b/java/src/main/java/com/logicalclocks/hsfs/FeatureGroup.java index 0b0395f7e2..1978ef7b45 100644 --- a/java/src/main/java/com/logicalclocks/hsfs/FeatureGroup.java +++ b/java/src/main/java/com/logicalclocks/hsfs/FeatureGroup.java @@ -24,8 +24,6 @@ import com.logicalclocks.hsfs.engine.FeatureGroupUtils; import com.logicalclocks.hsfs.metadata.FeatureGroupBase; import com.logicalclocks.hsfs.engine.StatisticsEngine; -import com.logicalclocks.hsfs.metadata.Expectation; -import com.logicalclocks.hsfs.metadata.validation.ValidationType; import com.logicalclocks.hsfs.metadata.Statistics; import lombok.AllArgsConstructor; import lombok.Builder; @@ -40,11 +38,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.avro.Schema; -import scala.collection.JavaConverters; import java.io.IOException; import java.text.ParseException; -import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.concurrent.TimeoutException; @@ -96,8 +92,7 @@ public class FeatureGroup extends FeatureGroupBase { public FeatureGroup(FeatureStore featureStore, @NonNull String name, Integer version, String description, List primaryKeys, List partitionKeys, String hudiPrecombineKey, boolean onlineEnabled, TimeTravelFormat timeTravelFormat, List features, - StatisticsConfig statisticsConfig, ValidationType validationType, - scala.collection.Seq expectations, String onlineTopicName, String eventTime) { + StatisticsConfig statisticsConfig, String onlineTopicName, String eventTime) { this.featureStore = featureStore; this.name = name; this.version = version; @@ -112,12 +107,6 @@ public FeatureGroup(FeatureStore featureStore, @NonNull String name, Integer ver this.timeTravelFormat = timeTravelFormat != null ? timeTravelFormat : TimeTravelFormat.HUDI; this.features = features; this.statisticsConfig = statisticsConfig != null ? statisticsConfig : new StatisticsConfig(); - this.validationType = validationType != null ? validationType : ValidationType.NONE; - if (expectations != null && !expectations.isEmpty()) { - this.expectationsNames = new ArrayList<>(); - this.expectations = JavaConverters.seqAsJavaListConverter(expectations).asJava(); - this.expectations.forEach(expectation -> this.expectationsNames.add(expectation.getName())); - } this.onlineTopicName = onlineTopicName; this.eventTime = eventTime; } diff --git a/java/src/main/java/com/logicalclocks/hsfs/FeatureStore.java b/java/src/main/java/com/logicalclocks/hsfs/FeatureStore.java index 82fa529dae..3390ee1743 100644 --- a/java/src/main/java/com/logicalclocks/hsfs/FeatureStore.java +++ b/java/src/main/java/com/logicalclocks/hsfs/FeatureStore.java @@ -19,12 +19,9 @@ import com.fasterxml.jackson.annotation.JsonProperty; import com.logicalclocks.hsfs.engine.FeatureViewEngine; import com.logicalclocks.hsfs.engine.SparkEngine; -import com.logicalclocks.hsfs.metadata.Expectation; -import com.logicalclocks.hsfs.metadata.ExpectationsApi; import com.logicalclocks.hsfs.metadata.FeatureGroupApi; import com.logicalclocks.hsfs.metadata.StorageConnectorApi; import com.logicalclocks.hsfs.metadata.TrainingDatasetApi; -import com.logicalclocks.hsfs.metadata.validation.ValidationType; import lombok.Getter; import lombok.NonNull; import lombok.Setter; @@ -33,7 +30,6 @@ import scala.collection.JavaConverters; import java.io.IOException; -import java.util.ArrayList; import java.util.List; public class FeatureStore { @@ -55,7 +51,6 @@ public class FeatureStore { private FeatureGroupApi featureGroupApi; private TrainingDatasetApi trainingDatasetApi; private StorageConnectorApi storageConnectorApi; - private ExpectationsApi expectationsApi; private FeatureViewEngine featureViewEngine; private static final Logger LOGGER = LoggerFactory.getLogger(FeatureStore.class); @@ -66,7 +61,6 @@ public FeatureStore() { featureGroupApi = new FeatureGroupApi(); trainingDatasetApi = new TrainingDatasetApi(); storageConnectorApi = new StorageConnectorApi(); - expectationsApi = new ExpectationsApi(); featureViewEngine = new FeatureViewEngine(); } @@ -260,16 +254,14 @@ public FeatureGroup.FeatureGroupBuilder createFeatureGroup() { public FeatureGroup getOrCreateFeatureGroup(String name, Integer version) throws IOException, FeatureStoreException { return featureGroupApi.getOrCreateFeatureGroup(this, name, version, null, null, - null, null, false, null, null, null, - null, null); + null, null, false, null, null, null); } public FeatureGroup getOrCreateFeatureGroup(String name, Integer version, List primaryKeys, boolean onlineEnabled, String eventTime) throws IOException, FeatureStoreException { return featureGroupApi.getOrCreateFeatureGroup(this, name, version, null, primaryKeys, - null, null, onlineEnabled, null, null, null, - null, eventTime); + null, null, onlineEnabled, null, null, eventTime); } public FeatureGroup getOrCreateFeatureGroup(String name, Integer version, @@ -279,21 +271,18 @@ public FeatureGroup getOrCreateFeatureGroup(String name, Integer version, String eventTime) throws IOException, FeatureStoreException { return featureGroupApi.getOrCreateFeatureGroup(this, name, version, null, primaryKeys, - partitionKeys, null, onlineEnabled, null, null, null, - null, eventTime); + partitionKeys, null, onlineEnabled, null, null, eventTime); } public FeatureGroup getOrCreateFeatureGroup(String name, Integer version, String description, List primaryKeys, List partitionKeys, String hudiPrecombineKey, boolean onlineEnabled, TimeTravelFormat timeTravelFormat, - StatisticsConfig statisticsConfig, ValidationType validationType, - scala.collection.Seq expectations, String eventTime) + StatisticsConfig statisticsConfig, String eventTime) throws IOException, FeatureStoreException { return featureGroupApi.getOrCreateFeatureGroup(this, name, version, description, primaryKeys, - partitionKeys, hudiPrecombineKey, onlineEnabled, timeTravelFormat, statisticsConfig, validationType, - expectations, eventTime); + partitionKeys, hudiPrecombineKey, onlineEnabled, timeTravelFormat, statisticsConfig, eventTime); } public StreamFeatureGroup.StreamFeatureGroupBuilder createStreamFeatureGroup() { @@ -304,14 +293,14 @@ public StreamFeatureGroup.StreamFeatureGroupBuilder createStreamFeatureGroup() { public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer version) throws IOException, FeatureStoreException { return featureGroupApi.getOrCreateStreamFeatureGroup(this, name, version, null, - null, null, null, false, null, null, null); + null, null, null, false, null, null); } public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer version, List primaryKeys, boolean onlineEnabled, String eventTime) throws IOException, FeatureStoreException { return featureGroupApi.getOrCreateStreamFeatureGroup(this, name, version, null, - primaryKeys, null, null, onlineEnabled, null, null, eventTime); + primaryKeys, null, null, onlineEnabled, null, eventTime); } public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer version, List primaryKeys, @@ -320,19 +309,18 @@ public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer ver return featureGroupApi.getOrCreateStreamFeatureGroup(this, name, version, null, - primaryKeys, partitionKeys, null, onlineEnabled, null, null, eventTime); + primaryKeys, partitionKeys, null, onlineEnabled, null, eventTime); } public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer version, String description, List primaryKeys, List partitionKeys, String hudiPrecombineKey, boolean onlineEnabled, StatisticsConfig statisticsConfig, - scala.collection.Seq expectations, String eventTime) throws IOException, FeatureStoreException { return featureGroupApi.getOrCreateStreamFeatureGroup(this, name, version, description, - primaryKeys, partitionKeys, hudiPrecombineKey, onlineEnabled, statisticsConfig, expectations, eventTime); + primaryKeys, partitionKeys, hudiPrecombineKey, onlineEnabled, statisticsConfig, eventTime); } public ExternalFeatureGroup.ExternalFeatureGroupBuilder createExternalFeatureGroup() { @@ -383,12 +371,6 @@ public TrainingDataset.TrainingDatasetBuilder createTrainingDataset() { .featureStore(this); } - - public Expectation.ExpectationBuilder createExpectation() { - return Expectation.builder() - .featureStore(this); - } - /** * Get a training dataset object from the selected feature store. * @@ -430,42 +412,6 @@ public scala.collection.Seq getTrainingDatasets(@NonNull String return JavaConverters.asScalaBufferConverter(trainingDatasetApi.get(this, name, null)).asScala().toSeq(); } - public scala.collection.Seq createExpectations(scala.collection.Seq expectations) - throws FeatureStoreException, IOException { - List newExpectations = new ArrayList<>(); - List expectationsList = - (List) JavaConverters.seqAsJavaListConverter(expectations).asJava(); - for (Expectation expectation : expectationsList) { - expectation = expectationsApi.put(this, expectation); - newExpectations.add(expectation); - } - return JavaConverters.asScalaBufferConverter(newExpectations).asScala().toSeq(); - } - - public Expectation getExpectation(String name) - throws FeatureStoreException, IOException { - return expectationsApi.get(this, name); - } - - public scala.collection.Seq getExpectations() throws FeatureStoreException, IOException { - return JavaConverters.asScalaBufferConverter(expectationsApi.get(this)).asScala().toSeq(); - } - - public void deleteExpectation(Expectation expectation) throws FeatureStoreException, IOException { - deleteExpectation(expectation.getName()); - } - - public void deleteExpectation(String name) throws FeatureStoreException, IOException { - expectationsApi.delete(this, name); - } - - public void deleteExpectations(scala.collection.Seq expectations) - throws FeatureStoreException, IOException { - for (Expectation expectation : (List) JavaConverters.seqAsJavaListConverter(expectations).asJava()) { - deleteExpectation(expectation); - } - } - @Override public String toString() { return "FeatureStore{" diff --git a/java/src/main/java/com/logicalclocks/hsfs/HopsworksConnection.java b/java/src/main/java/com/logicalclocks/hsfs/HopsworksConnection.java index 30cd0034c3..398b589d98 100644 --- a/java/src/main/java/com/logicalclocks/hsfs/HopsworksConnection.java +++ b/java/src/main/java/com/logicalclocks/hsfs/HopsworksConnection.java @@ -20,9 +20,6 @@ import com.logicalclocks.hsfs.metadata.FeatureStoreApi; import com.logicalclocks.hsfs.metadata.HopsworksClient; import com.logicalclocks.hsfs.metadata.ProjectApi; -import com.logicalclocks.hsfs.metadata.RuleDefinition; -import com.logicalclocks.hsfs.metadata.RulesApi; -import com.logicalclocks.hsfs.metadata.validation.RuleName; import com.logicalclocks.hsfs.util.Constants; import lombok.Builder; import lombok.Getter; @@ -69,7 +66,6 @@ public class HopsworksConnection implements Closeable { private FeatureStoreApi featureStoreApi = new FeatureStoreApi(); private ProjectApi projectApi = new ProjectApi(); - private RulesApi rulesApi = new RulesApi(); private Project projectObj; @@ -148,12 +144,4 @@ private String getProjectName(String project) { } return project; } - - public scala.collection.Seq getRules() throws FeatureStoreException, IOException { - return rulesApi.get(); - } - - public RuleDefinition getRule(RuleName name) throws FeatureStoreException, IOException { - return rulesApi.get(name); - } } diff --git a/java/src/main/java/com/logicalclocks/hsfs/StreamFeatureGroup.java b/java/src/main/java/com/logicalclocks/hsfs/StreamFeatureGroup.java index 49f3780a71..671a85e2b4 100644 --- a/java/src/main/java/com/logicalclocks/hsfs/StreamFeatureGroup.java +++ b/java/src/main/java/com/logicalclocks/hsfs/StreamFeatureGroup.java @@ -24,9 +24,7 @@ import com.logicalclocks.hsfs.engine.FeatureGroupUtils; import com.logicalclocks.hsfs.engine.StatisticsEngine; import com.logicalclocks.hsfs.engine.StreamFeatureGroupEngine; -import com.logicalclocks.hsfs.metadata.Expectation; import com.logicalclocks.hsfs.metadata.FeatureGroupBase; -import com.logicalclocks.hsfs.metadata.validation.ValidationType; import lombok.AllArgsConstructor; import lombok.Builder; @@ -38,11 +36,9 @@ import org.apache.avro.Schema; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import scala.collection.JavaConverters; import java.io.IOException; import java.text.ParseException; -import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.stream.Collectors; @@ -105,8 +101,7 @@ public class StreamFeatureGroup extends FeatureGroupBase { public StreamFeatureGroup(FeatureStore featureStore, @NonNull String name, Integer version, String description, List primaryKeys, List partitionKeys, String hudiPrecombineKey, boolean onlineEnabled, List features, - StatisticsConfig statisticsConfig, ValidationType validationType, - scala.collection.Seq expectations, String onlineTopicName, String eventTime) { + StatisticsConfig statisticsConfig, String onlineTopicName, String eventTime) { this.featureStore = featureStore; this.name = name; this.version = version; @@ -119,12 +114,6 @@ public StreamFeatureGroup(FeatureStore featureStore, @NonNull String name, Integ this.onlineEnabled = onlineEnabled; this.features = features; this.statisticsConfig = statisticsConfig != null ? statisticsConfig : new StatisticsConfig(); - this.validationType = validationType != null ? validationType : ValidationType.NONE; - if (expectations != null && !expectations.isEmpty()) { - this.expectationsNames = new ArrayList<>(); - this.expectations = JavaConverters.seqAsJavaListConverter(expectations).asJava(); - this.expectations.forEach(expectation -> this.expectationsNames.add(expectation.getName())); - } this.onlineTopicName = onlineTopicName; this.eventTime = eventTime; } diff --git a/java/src/main/java/com/logicalclocks/hsfs/constructor/FsQuery.java b/java/src/main/java/com/logicalclocks/hsfs/constructor/FsQuery.java index 5caf7d71dc..0140a7d7ad 100644 --- a/java/src/main/java/com/logicalclocks/hsfs/constructor/FsQuery.java +++ b/java/src/main/java/com/logicalclocks/hsfs/constructor/FsQuery.java @@ -22,7 +22,6 @@ import com.logicalclocks.hsfs.ExternalFeatureGroup; import com.logicalclocks.hsfs.Storage; import com.logicalclocks.hsfs.engine.SparkEngine; -import com.logicalclocks.hsfs.metadata.FeatureGroupBase; import lombok.AllArgsConstructor; import lombok.Getter; import lombok.NoArgsConstructor; diff --git a/java/src/main/java/com/logicalclocks/hsfs/engine/Constraint.scala b/java/src/main/java/com/logicalclocks/hsfs/engine/Constraint.scala deleted file mode 100644 index a6007c832c..0000000000 --- a/java/src/main/java/com/logicalclocks/hsfs/engine/Constraint.scala +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (c) 2020 Logical Clocks AB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * - * See the License for the specific language governing permissions and limitations under the License. - */ - -package com.logicalclocks.hsfs.engine - -import com.logicalclocks.hsfs.metadata.validation.AcceptedType - -case class Constraint(name: String, hint: Option[String], columns: Option[Seq[String]], min: Option[Double], - max: Option[Double], value: Option[String], pattern: Option[String], - acceptedType: Option[AcceptedType], legalValues: Option[Array[String]]) diff --git a/java/src/main/java/com/logicalclocks/hsfs/engine/ConstraintGroup.scala b/java/src/main/java/com/logicalclocks/hsfs/engine/ConstraintGroup.scala deleted file mode 100644 index afa171d80a..0000000000 --- a/java/src/main/java/com/logicalclocks/hsfs/engine/ConstraintGroup.scala +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Copyright (c) 2020 Logical Clocks AB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * - * See the License for the specific language governing permissions and limitations under the License. - */ - -package com.logicalclocks.hsfs.engine - -case class ConstraintGroup(level: String, description: String, constraints: Seq[Constraint]) diff --git a/java/src/main/java/com/logicalclocks/hsfs/engine/DataValidationEngine.java b/java/src/main/java/com/logicalclocks/hsfs/engine/DataValidationEngine.java deleted file mode 100644 index 832077af02..0000000000 --- a/java/src/main/java/com/logicalclocks/hsfs/engine/DataValidationEngine.java +++ /dev/null @@ -1,327 +0,0 @@ -/* - * Copyright (c) 2020 Logical Clocks AB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * - * See the License for the specific language governing permissions and limitations under the License. - */ - -package com.logicalclocks.hsfs.engine; - -import com.amazon.deequ.checks.Check; -import com.amazon.deequ.checks.CheckResult; -import com.amazon.deequ.constraints.ConstraintResult; -import com.logicalclocks.hsfs.EntityEndpointType; -import com.logicalclocks.hsfs.FeatureStoreException; -import com.logicalclocks.hsfs.metadata.Expectation; -import com.logicalclocks.hsfs.metadata.ExpectationsApi; -import com.logicalclocks.hsfs.metadata.ExpectationResult; -import com.logicalclocks.hsfs.metadata.FeatureGroupBase; -import com.logicalclocks.hsfs.metadata.FeatureGroupValidation; -import com.logicalclocks.hsfs.metadata.FeatureGroupValidationsApi; -import com.logicalclocks.hsfs.metadata.ValidationResult; -import com.logicalclocks.hsfs.metadata.validation.Level; -import com.logicalclocks.hsfs.metadata.validation.Rule; -import com.logicalclocks.hsfs.metadata.validation.RuleName; -import org.apache.commons.lang3.tuple.ImmutablePair; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import scala.Option; -import scala.collection.JavaConverters; - -import java.io.IOException; -import java.time.Instant; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import com.google.common.base.Strings; - -public class DataValidationEngine { - - private static DataValidationEngine INSTANCE = null; - - public static synchronized DataValidationEngine getInstance() { - if (INSTANCE == null) { - INSTANCE = new DataValidationEngine(); - } - return INSTANCE; - } - - private final FeatureGroupValidationsApi featureGroupValidationsApi = - new FeatureGroupValidationsApi(EntityEndpointType.FEATURE_GROUP); - - private final ExpectationsApi expectationsApi = new ExpectationsApi(EntityEndpointType.FEATURE_GROUP); - - private static final Logger LOGGER = LoggerFactory.getLogger(DataValidationEngine.class); - - public FeatureGroupValidation validate(FeatureGroupBase featureGroupBase, S data, - List expectations, - Boolean logActivity) - throws FeatureStoreException, IOException { - List expectationResults = validate((Dataset) data, expectations); - return featureGroupValidationsApi.put(featureGroupBase, - FeatureGroupValidation.builder() - .validationTime(Instant.now().toEpochMilli()) - .expectationResults(expectationResults).build(), - logActivity); - } - - public List validate(Dataset data, List expectations) { - // Loop through all feature group expectations, then loop all features and rules of the expectation and - // create constraints for Deequ. - List expectationResults = new ArrayList<>(); - for (Expectation expectation : expectations) { - List constraintGroups = new ArrayList<>(); - Map> constraintGroupLevels = new HashMap<>(); - List validationResults = new ArrayList<>(); - // An expectation contains all the features its rules are applied to but not every rule is applied to all features - // Certain Compliance rules are applied on pairs of features which means the rule will be applied on all possible - // combinations between the expectations features and the rule's "feature" (rule.getFeature(). - for (Rule rule : expectation.getRules()) { - boolean comparativeRule = isRuleAppliedToFeaturePairs(rule); - for (String feature : expectation.getFeatures()) { - String[] legalValues = null; - if (rule.getLegalValues() != null && !rule.getLegalValues().isEmpty()) { - legalValues = rule.getLegalValues().toArray(new String[0]); - } - - List featuresToEval = comparativeRule && !feature.equals(rule.getFeature()) - ? Arrays.asList(feature, rule.getFeature()) - : Collections.singletonList(feature); - - Constraint constraint = - new Constraint(rule.getName().name(), - Option.apply(rule.getName().name()), - Option - .apply(JavaConverters.asScalaBufferConverter(featuresToEval).asScala().toSeq()), - Option.apply(rule.getMin()), - Option.apply(rule.getMax()), - Option.apply(rule.getValue()), - Option.apply(rule.getPattern()), - Option.apply(rule.getAcceptedType()), - Option.apply(legalValues)); - if (!constraintGroupLevels.containsKey(rule.getLevel())) { - constraintGroupLevels.put(rule.getLevel(), new ArrayList<>()); - } - constraintGroupLevels.get(rule.getLevel()).add(constraint); - } - } - if (!constraintGroupLevels.isEmpty()) { - for (Level level : constraintGroupLevels.keySet()) { - ConstraintGroup constraintGroup = new ConstraintGroup(level.name(), level.name(), - JavaConverters.asScalaIteratorConverter(constraintGroupLevels.get(level).iterator()).asScala().toSeq()); - constraintGroups.add(constraintGroup); - } - } - - // Run Deequ verification suite and return results - Map deequResults = DeequEngine.runVerification(data, - JavaConverters.asScalaIteratorConverter(constraintGroups.iterator()).asScala().toSeq()); - // Parse Deequ results and convert to Feature Group validation results. Unfortunately we don't have a way of - // getting the features and the constraint type directly from the ConstraintResult object so we need to parse - // the String representation of the object and for every constraint type the representation follows a different - // format. For every constraint type there is an example in the comments to assist. - for (Check check : deequResults.keySet()) { - List constraintResultsList = - DeequEngine.getConstraintResults(deequResults.get(check).constraintResults()); - for (ConstraintResult constraintResult : constraintResultsList) { - String[] constraintInfo = constraintResult.constraint().toString().split("\\W+"); - String constraintType = constraintInfo[1]; - List deequFeatures = new ArrayList<>(); - String deequRule = null; - boolean constraintTypeComplex = false; - - if (constraintType.equals("Compliance")) { //IS_LESS_THAN etc. - // ComplianceConstraint(Compliance(year is less than salary,year < salary,None)) - constraintTypeComplex = true; - if (constraintResult.constraint().toString().contains("contained in")) { - // ComplianceConstraint(Compliance(car contained in car15,car20,`car` IS NULL OR `car` ... - deequRule = "iscontainedin"; - deequFeatures.add(constraintInfo[2]); - } else if (constraintResult.constraint().toString().contains("is positive")) { - // ComplianceConstraint(Compliance(age is positive,COALESCE(car, 1.0) > 0,None)) - deequRule = "ispositive"; - deequFeatures.add(constraintInfo[2]); - } else if (constraintResult.constraint().toString().contains("non-negative")) { - // ComplianceConstraint(Compliance(age is non-negative,COALESCE(amount, 0.0) >= 0,None)) - deequRule = "isnonnegative"; - deequFeatures.add(constraintInfo[2]); - } else { - deequFeatures.addAll(Arrays.asList( - Arrays.stream(constraintInfo, constraintInfo.length - 3, constraintInfo.length - 2 + 1) - .toArray(String[]::new))); - Pattern pattern = Pattern.compile(deequFeatures.get(0) + "(.*?)" + deequFeatures.get(1), Pattern.DOTALL); - Matcher matcher = pattern.matcher(constraintResult.constraint().toString()); - if (matcher.find()) { - deequRule = matcher.group(1).replaceAll(" ", ""); - } - } - } else { - deequRule = constraintInfo[1]; - if (deequRule.equalsIgnoreCase("MutualInformation") || constraintType.equals("Correlation")) { - constraintTypeComplex = true; - if (constraintType.equals("MutualInformation")) { - // MutualInformationConstraint(MutualInformation(List(year, salary),None)) - deequFeatures.add(constraintInfo[3]); - deequFeatures.add(constraintInfo[4]); - } else { - // "CorrelationConstraint(Correlation(year,salary,None)) - deequFeatures.add(constraintInfo[2]); - deequFeatures.add(constraintInfo[3]); - } - } else { - // MinimumConstraint(Minimum(commission,None))... - deequFeatures.add(constraintInfo[2]); - } - } - - RuleName ruleName = getRuleNameFromDeequ(deequRule); - // Find rule from list of rules that Deequ used for validation - if (constraintTypeComplex) { - for (Rule rule : expectation.getRules()) { - if (rule.getName() == ruleName) { - validationResults.add(ValidationResult.builder() - .status(ExpectationResult.Status.fromDeequStatus(constraintResult.status(), rule.getLevel())) - .features(deequFeatures) - .rule(rule) - .message(!constraintResult.message().isEmpty() ? constraintResult.message().get() : "Success") - .value(String.valueOf(constraintResult.metric().get().value().get())) - .build()); - } - } - } else { - for (String feature : expectation.getFeatures()) { - for (Rule rule : expectation.getRules()) { - if (rule.getName() == ruleName && feature.equals(constraintInfo[2])) { - validationResults.add(ValidationResult.builder() - .status(ExpectationResult.Status.fromDeequStatus(constraintResult.status(), rule.getLevel())) - .features(Collections.singletonList(feature)) - .rule(rule) - .message(!constraintResult.message().isEmpty() ? constraintResult.message().get() : "Success") - .value(String.valueOf(constraintResult.metric().get().value().get())) - .build()); - break; - } - } - } - } - } - } - expectationResults.add(ExpectationResult.builder().expectation(expectation).results(validationResults).build()); - } - return expectationResults; - } - - public List getValidations(FeatureGroupBase featureGroupBase) - throws FeatureStoreException, IOException { - return featureGroupValidationsApi.get(featureGroupBase); - } - - public FeatureGroupValidation getValidation(FeatureGroupBase featureGroupBase, ImmutablePair pair) - throws FeatureStoreException, IOException { - return featureGroupValidationsApi.get(featureGroupBase, pair); - } - - public RuleName getRuleNameFromDeequ(String rule) { - if (Strings.isNullOrEmpty(rule)) { - throw new IllegalArgumentException("Rule name cannot be null or empty"); - } - switch (rule.toLowerCase()) { - case "maximum": - return RuleName.HAS_MAX; - case "minimum": - return RuleName.HAS_MIN; - case "mean": - return RuleName.HAS_MEAN; - case "size": - return RuleName.HAS_SIZE; - case "sum": - return RuleName.HAS_SUM; - case "completeness": - return RuleName.HAS_COMPLETENESS; - case "uniqueness": - return RuleName.HAS_UNIQUENESS; - case "distinctness": - return RuleName.HAS_DISTINCTNESS; - case "uniquevalueratio": - return RuleName.HAS_UNIQUE_VALUE_RATIO; - case "histogram": - return RuleName.HAS_NUMBER_OF_DISTINCT_VALUES; - case "entropy": - return RuleName.HAS_ENTROPY; - case "mutualinformation": - return RuleName.HAS_MUTUAL_INFORMATION; - case "approxquantile": - return RuleName.HAS_APPROX_QUANTILE; - case "standarddeviation": - return RuleName.HAS_STANDARD_DEVIATION; - case "approxcountdistinct": - return RuleName.HAS_APPROX_COUNT_DISTINCT; - case "correlation": - return RuleName.HAS_CORRELATION; - case "patternmatch": - return RuleName.HAS_PATTERN; - case "minlength": - return RuleName.HAS_MIN_LENGTH; - case "maxlength": - return RuleName.HAS_MAX_LENGTH; - case "datatype": - return RuleName.HAS_DATATYPE; - case "isnonnegative": - return RuleName.IS_NON_NEGATIVE; - case "ispositive": - return RuleName.IS_POSITIVE; - case "islessthan": - return RuleName.IS_LESS_THAN; - case "islessthanorequalto": - return RuleName.IS_LESS_THAN_OR_EQUAL_TO; - case "isgreaterthan": - return RuleName.IS_GREATER_THAN; - case "isgreaterthanorequalto": - return RuleName.IS_GREATER_THAN_OR_EQUAL_TO; - case "iscontainedin": - return RuleName.IS_CONTAINED_IN; - default: - throw new UnsupportedOperationException("Deequ rule not supported: " + rule); - } - } - - public static boolean isRuleAppliedToFeaturePairs(Rule rule) { - return isRuleAppliedToFeaturePairs(rule.getName()); - } - - public static boolean isRuleAppliedToFeaturePairs(RuleName ruleName) { - return isRuleAppliedToFeaturePairs(ruleName.name()); - } - - public static boolean isRuleAppliedToFeaturePairs(String ruleName) { - return ruleName.equals(RuleName.IS_GREATER_THAN_OR_EQUAL_TO.name()) - || ruleName.equals(RuleName.IS_GREATER_THAN.name()) - || ruleName.equals(RuleName.IS_LESS_THAN.name()) - || ruleName.equals(RuleName.IS_LESS_THAN_OR_EQUAL_TO.name()) - || ruleName.equals(RuleName.HAS_MUTUAL_INFORMATION.name()) - || ruleName.equals(RuleName.HAS_CORRELATION.name()); - } - - public enum ValidationTimeType { - VALIDATION_TIME, - COMMIT_TIME - } -} diff --git a/java/src/main/java/com/logicalclocks/hsfs/engine/DeequEngine.scala b/java/src/main/java/com/logicalclocks/hsfs/engine/DeequEngine.scala deleted file mode 100644 index 7a4de4e70b..0000000000 --- a/java/src/main/java/com/logicalclocks/hsfs/engine/DeequEngine.scala +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2020 Logical Clocks AB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * - * See the License for the specific language governing permissions and limitations under the License. - */ - -package com.logicalclocks.hsfs.engine - -import com.amazon.deequ.VerificationSuite -import com.amazon.deequ.checks.{Check, CheckLevel, CheckResult} -import com.amazon.deequ.constraints.{ConstrainableDataTypes, ConstraintResult} -import org.apache.spark.sql.DataFrame - -import java.util -import scala.collection.JavaConverters.{mapAsJavaMapConverter, _} - -object DeequEngine { - - def longBoundary(min: Option[Double], max: Option[Double]): Long => Boolean = { - (min, max) match { - case (Some(x), Some(y)) => v => v >= x && v <= y - case (Some(x), None) => _ >= x - case (None, Some(y)) => _ <= y - case _ => _ => true - } - } - - def doubleBoundary(min: Option[Double], max: Option[Double]): Double => Boolean = { - (min, max) match { - case (Some(x), Some(y)) => v => v >= x && v <= y - case (Some(x), None) => _ >= x - case (None, Some(y)) => _ <= y - case _ => _ => true - } - } - - def addConstraint(check: Check, constraint: Constraint): Check = { - constraint.name match { - // Reason for using string instead of Enum is - // https://stackoverflow.com/questions/7083502/why-cant-a-variable-be-a-stable-identifier - case "HAS_MEAN" => check.hasMean(constraint.columns.get.head, - doubleBoundary(constraint.min, constraint.max), constraint.hint) - case "HAS_MIN" => check.hasMin(constraint.columns.get.head, - doubleBoundary(constraint.min, constraint.max), constraint.hint) - case "HAS_MAX" => check.hasMax(constraint.columns.get.head, - doubleBoundary(constraint.min, constraint.max), constraint.hint) - case "HAS_SUM" => check.hasSum(constraint.columns.get.head, - doubleBoundary(constraint.min, constraint.max), constraint.hint) - case "HAS_SIZE" => check.hasSize( - longBoundary(constraint.min, constraint.max), constraint.hint) - case "HAS_COMPLETENESS" => check.hasCompleteness(constraint.columns.get.head, - doubleBoundary(constraint.min, constraint.max), constraint.hint) - case "HAS_UNIQUENESS" => check.hasUniqueness(constraint.columns.get, - doubleBoundary(constraint.min, constraint.max), constraint.hint) - case "HAS_DISTINCTNESS" => check.hasDistinctness(constraint.columns.get, - doubleBoundary(constraint.min, constraint.max), constraint.hint) - case "HAS_UNIQUE_VALUE_RATIO" => check.hasUniqueValueRatio(constraint.columns.get, - doubleBoundary(constraint.min, constraint.max), constraint.hint) - case "HAS_NUMBER_OF_DISTINCT_VALUES" => check.hasNumberOfDistinctValues(constraint.columns.get.head, - longBoundary(constraint.min, constraint.max), hint = constraint.hint) - case "HAS_ENTROPY" => check.hasEntropy(constraint.columns.get.head, - doubleBoundary(constraint.min, constraint.max), constraint.hint) - case "HAS_MUTUAL_INFORMATION" => check.hasMutualInformation(constraint.columns.get.head, constraint.columns.get(1), - doubleBoundary(constraint.min, constraint.max), constraint.hint) - case "HAS_APPROX_QUANTILE" => check.hasApproxQuantile(constraint.columns.get.head, constraint.value.get.toDouble, - doubleBoundary(constraint.min, constraint.max), constraint.hint) - case "HAS_STANDARD_DEVIATION" => check.hasStandardDeviation(constraint.columns.get.head, - doubleBoundary(constraint.min, constraint.max), constraint.hint) - case "HAS_APPROX_COUNT_DISTINCT" => check.hasApproxCountDistinct(constraint.columns.get.head, - doubleBoundary(constraint.min, constraint.max), constraint.hint) - case "HAS_CORRELATION" => check.hasCorrelation(constraint.columns.get.head, constraint.columns.get(1), - doubleBoundary(constraint.min, constraint.max), constraint.hint) - case "HAS_PATTERN" => check.hasPattern(constraint.columns.get.head, constraint.pattern.get.r, - doubleBoundary(constraint.min, constraint.max), constraint.hint) - case "HAS_MIN_LENGTH" => check.hasMinLength(constraint.columns.get.head, - doubleBoundary(constraint.min, constraint.max), constraint.hint) - case "HAS_MAX_LENGTH" => check.hasMaxLength(constraint.columns.get.head, - doubleBoundary(constraint.min, constraint.max), constraint.hint) - case "HAS_DATATYPE" => check.hasDataType(constraint.columns.get.head, - ConstrainableDataTypes.withName(constraint.acceptedType.get.name()), - doubleBoundary(constraint.min, constraint.max), constraint.hint) - case "IS_NON_NEGATIVE" => check.isNonNegative(constraint.columns.get.head, - doubleBoundary(constraint.min, constraint.max), constraint.hint) - case "IS_POSITIVE" => check.isPositive(constraint.columns.get.head, - doubleBoundary(constraint.min, constraint.max), constraint.hint) - case "IS_LESS_THAN" => check.isLessThan(constraint.columns.get.head, constraint.columns.get(1), - doubleBoundary(constraint.min, constraint.max), constraint.hint) - case "IS_LESS_THAN_OR_EQUAL_TO" => check.isLessThanOrEqualTo(constraint.columns.get.head, constraint.columns.get(1), - doubleBoundary(constraint.min, constraint.max), constraint.hint) - case "IS_GREATER_THAN" => check.isGreaterThan(constraint.columns.get.head, constraint.columns.get(1), - doubleBoundary(constraint.min, constraint.max), constraint.hint) - case "IS_GREATER_THAN_OR_EQUAL_TO" => check.isGreaterThanOrEqualTo(constraint.columns.get.head, - constraint.columns.get(1), doubleBoundary(constraint.min, constraint.max), constraint.hint) - case "IS_CONTAINED_IN" => check.isContainedIn(constraint.columns.get.head, constraint.legalValues.get, - doubleBoundary(constraint.min, constraint.max), constraint.hint) - } - } - - def checksFromRules(constraintGroups: Seq[ConstraintGroup]): Seq[Check] = { - constraintGroups - .map(group => { - var check = Check(CheckLevel.withName(group.level.toLowerCase().capitalize), group.description); - group.constraints.foreach(constraint => check = addConstraint(check, constraint)) - check - }) - } - - def runVerification(data: DataFrame, constraintGroups: Seq[ConstraintGroup]): util.Map[Check, CheckResult] = { - val checks = checksFromRules(constraintGroups) - VerificationSuite().onData(data).addChecks(checks).run().checkResults.asJava - } - - def getConstraintResults(constraintResults : Seq[ConstraintResult]): util.List[ConstraintResult] ={ - constraintResults.asJava - } - -} diff --git a/java/src/main/java/com/logicalclocks/hsfs/engine/ExpectationsEngine.java b/java/src/main/java/com/logicalclocks/hsfs/engine/ExpectationsEngine.java deleted file mode 100644 index 24468fcf16..0000000000 --- a/java/src/main/java/com/logicalclocks/hsfs/engine/ExpectationsEngine.java +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2020 Logical Clocks AB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * - * See the License for the specific language governing permissions and limitations under the License. - */ - -package com.logicalclocks.hsfs.engine; - -import com.logicalclocks.hsfs.FeatureStoreException; -import com.logicalclocks.hsfs.metadata.Expectation; -import com.logicalclocks.hsfs.metadata.ExpectationsApi; - -import java.io.IOException; - -public class ExpectationsEngine { - - private final ExpectationsApi expectationsApi = new ExpectationsApi(); - - public Expectation save(Expectation expectation) throws FeatureStoreException, IOException { - return expectationsApi.put(expectation.getFeatureStore(), expectation); - } -} diff --git a/java/src/main/java/com/logicalclocks/hsfs/engine/ExternalFeatureGroupEngine.java b/java/src/main/java/com/logicalclocks/hsfs/engine/ExternalFeatureGroupEngine.java index 65f5c90c04..35c2ccc40b 100644 --- a/java/src/main/java/com/logicalclocks/hsfs/engine/ExternalFeatureGroupEngine.java +++ b/java/src/main/java/com/logicalclocks/hsfs/engine/ExternalFeatureGroupEngine.java @@ -19,7 +19,6 @@ import com.logicalclocks.hsfs.FeatureStoreException; import com.logicalclocks.hsfs.ExternalFeatureGroup; import com.logicalclocks.hsfs.metadata.FeatureGroupApi; -import com.logicalclocks.hsfs.metadata.validation.ValidationType; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; @@ -53,10 +52,6 @@ public ExternalFeatureGroup saveFeatureGroup(ExternalFeatureGroup externalFeatur ExternalFeatureGroup apiFg = featureGroupApi.save(externalFeatureGroup); externalFeatureGroup.setId(apiFg.getId()); - if (externalFeatureGroup.getValidationType() != ValidationType.NONE && onDemandDataset != null) { - externalFeatureGroup.validate(onDemandDataset, true); - } - return externalFeatureGroup; } } diff --git a/java/src/main/java/com/logicalclocks/hsfs/engine/FeatureGroupBaseEngine.java b/java/src/main/java/com/logicalclocks/hsfs/engine/FeatureGroupBaseEngine.java index 1d80ef7761..0c421ab857 100644 --- a/java/src/main/java/com/logicalclocks/hsfs/engine/FeatureGroupBaseEngine.java +++ b/java/src/main/java/com/logicalclocks/hsfs/engine/FeatureGroupBaseEngine.java @@ -118,10 +118,4 @@ private FeatureGroupBase initFeatureGroupBase(FeatureGroupBase featureGroup) { } return new FeatureGroupBase(); } - - public void updateValidationType(FeatureGroupBase featureGroupBase, Class fgClass) - throws FeatureStoreException, IOException { - featureGroupApi.updateMetadata( - featureGroupBase, "validationType", featureGroupBase.getValidationType(), fgClass); - } } diff --git a/java/src/main/java/com/logicalclocks/hsfs/engine/FeatureGroupEngine.java b/java/src/main/java/com/logicalclocks/hsfs/engine/FeatureGroupEngine.java index 4ce831f1fd..350575eecd 100644 --- a/java/src/main/java/com/logicalclocks/hsfs/engine/FeatureGroupEngine.java +++ b/java/src/main/java/com/logicalclocks/hsfs/engine/FeatureGroupEngine.java @@ -25,8 +25,6 @@ import com.logicalclocks.hsfs.engine.hudi.HudiEngine; import com.logicalclocks.hsfs.metadata.KafkaApi; import com.logicalclocks.hsfs.metadata.FeatureGroupApi; -import com.logicalclocks.hsfs.metadata.FeatureGroupValidation; -import com.logicalclocks.hsfs.metadata.validation.ValidationType; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.SaveMode; @@ -88,13 +86,6 @@ public void insert(FeatureGroup featureGroup, Dataset featureData, Storage hudiPrecombineKey, featureData, false); } - if (featureGroup.getValidationType() != ValidationType.NONE) { - FeatureGroupValidation validation = featureGroup.validate(featureData, true); - if (validation != null) { - validationId = validation.getValidationId(); - } - } - if (saveMode == SaveMode.Overwrite) { // If we set overwrite, then the directory will be removed and with it all the metadata // related to the feature group will be lost. We need to keep them. @@ -124,11 +115,6 @@ public StreamingQuery insertStream(FeatureGroup featureGroup, Dataset featu hudiPrecombineKey, featureData, true); } - if (featureGroup.getValidationType() != ValidationType.NONE) { - LOGGER.info("ValidationWarning: Stream ingestion for feature group `" + featureGroup.getName() - + "`, with version `" + featureGroup.getVersion() + "` will not perform validation."); - } - StreamingQuery streamingQuery = SparkEngine.getInstance().writeStreamDataframe(featureGroup, utils.sanitizeFeatureNames(featureData), queryName, outputMode, awaitTermination, timeout, checkpointLocation, utils.getKafkaConfig(featureGroup, writeOptions)); diff --git a/java/src/main/java/com/logicalclocks/hsfs/engine/StreamFeatureGroupEngine.java b/java/src/main/java/com/logicalclocks/hsfs/engine/StreamFeatureGroupEngine.java index e7f663b071..336d3dfcd9 100644 --- a/java/src/main/java/com/logicalclocks/hsfs/engine/StreamFeatureGroupEngine.java +++ b/java/src/main/java/com/logicalclocks/hsfs/engine/StreamFeatureGroupEngine.java @@ -25,7 +25,6 @@ import com.logicalclocks.hsfs.metadata.KafkaApi; import com.logicalclocks.hsfs.metadata.FeatureGroupApi; import com.logicalclocks.hsfs.metadata.Option; -import com.logicalclocks.hsfs.metadata.validation.ValidationType; import lombok.SneakyThrows; @@ -87,11 +86,6 @@ public Object insertStream(StreamFeatureGroup streamFeatureGroup, S featureD jobConfiguration, featureData); } - if (streamFeatureGroup.getValidationType() != ValidationType.NONE) { - LOGGER.info("ValidationWarning: Stream ingestion for feature group `" + streamFeatureGroup.getName() - + "`, with version `" + streamFeatureGroup.getVersion() + "` will not perform validation."); - } - return SparkEngine.getInstance().writeStreamDataframe(streamFeatureGroup, utils.sanitizeFeatureNames(featureData), queryName, outputMode, awaitTermination, timeout, checkpointLocation, utils.getKafkaConfig(streamFeatureGroup, writeOptions)); diff --git a/java/src/main/java/com/logicalclocks/hsfs/metadata/Expectation.java b/java/src/main/java/com/logicalclocks/hsfs/metadata/Expectation.java deleted file mode 100644 index 510efb26d2..0000000000 --- a/java/src/main/java/com/logicalclocks/hsfs/metadata/Expectation.java +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) 2020 Logical Clocks AB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * - * See the License for the specific language governing permissions and limitations under the License. - */ - -package com.logicalclocks.hsfs.metadata; - -import com.fasterxml.jackson.annotation.JsonIgnore; -import com.fasterxml.jackson.annotation.JsonIgnoreProperties; -import com.logicalclocks.hsfs.FeatureStore; -import com.logicalclocks.hsfs.FeatureStoreException; -import com.logicalclocks.hsfs.engine.ExpectationsEngine; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Getter; -import lombok.NoArgsConstructor; -import lombok.Setter; -import lombok.ToString; -import scala.collection.JavaConverters; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import com.logicalclocks.hsfs.metadata.validation.Rule; - -@JsonIgnoreProperties(ignoreUnknown = true) -@NoArgsConstructor -@AllArgsConstructor -@ToString -@Builder -public class Expectation extends RestDto { - - @Getter @Setter - private String name; - @Getter @Setter - private String description; - @Getter @Setter - private List features = new ArrayList<>(); - @Getter @Setter - private List rules; - @Getter @Setter - @JsonIgnore - private FeatureStore featureStore; - - private final ExpectationsEngine expectationsEngine = new ExpectationsEngine(); - - public static class ExpectationBuilder { - - public ExpectationBuilder features(List features) { - this.features = features; - return this; - } - - public ExpectationBuilder features(scala.collection.Seq features) { - this.features = JavaConverters.seqAsJavaListConverter(features).asJava(); - return this; - } - - public ExpectationBuilder rules(scala.collection.Seq rules) { - this.rules = JavaConverters.seqAsJavaListConverter(rules).asJava(); - return this; - } - - public ExpectationBuilder rules(List rules) { - this.rules = rules; - return this; - } - } - - public void save() throws FeatureStoreException, IOException { - expectationsEngine.save(this); - } -} diff --git a/java/src/main/java/com/logicalclocks/hsfs/metadata/ExpectationResult.java b/java/src/main/java/com/logicalclocks/hsfs/metadata/ExpectationResult.java deleted file mode 100644 index a6a8e266c4..0000000000 --- a/java/src/main/java/com/logicalclocks/hsfs/metadata/ExpectationResult.java +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2020 Logical Clocks AB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * - * See the License for the specific language governing permissions and limitations under the License. - */ - -package com.logicalclocks.hsfs.metadata; - -import com.fasterxml.jackson.annotation.JsonIgnoreProperties; -import com.logicalclocks.hsfs.metadata.validation.Level; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Getter; -import lombok.NoArgsConstructor; -import lombok.Setter; -import lombok.ToString; -import scala.Enumeration; - -import java.util.List; - -@JsonIgnoreProperties(ignoreUnknown = true) -@NoArgsConstructor -@AllArgsConstructor -@ToString -@Builder -public class ExpectationResult { - - @Getter @Setter - private Status status; //Set by backend - @Getter @Setter - private Expectation expectation; - @Getter @Setter - private List results; - - public enum Status { - NONE("None",0), - SUCCESS("Success",1), - WARNING("Warning",2), - FAILURE("Failure",3); - - private final String name; - private final int severity; - - Status(String name, int severity) { - this.name = name; - this.severity = severity; - } - - public int getSeverity() { - return severity; - } - - public static Status fromString(String name) { - return valueOf(name.toUpperCase()); - } - - public static Status fromDeequStatus(Enumeration.Value status, Level level) { - if (status == com.amazon.deequ.constraints.ConstraintStatus.Failure()) { - return level == Level.WARNING ? WARNING : FAILURE; - } else if (status == com.amazon.deequ.constraints.ConstraintStatus.Success()) { - return SUCCESS; - } else { - return NONE; - } - } - - public String getName() { - return name; - } - - @Override - public String toString() { - return name; - } - } -} diff --git a/java/src/main/java/com/logicalclocks/hsfs/metadata/ExpectationsApi.java b/java/src/main/java/com/logicalclocks/hsfs/metadata/ExpectationsApi.java deleted file mode 100644 index cbfebd097c..0000000000 --- a/java/src/main/java/com/logicalclocks/hsfs/metadata/ExpectationsApi.java +++ /dev/null @@ -1,245 +0,0 @@ -/* - * Copyright (c) 2020 Logical Clocks AB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * - * See the License for the specific language governing permissions and limitations under the License. - */ - -package com.logicalclocks.hsfs.metadata; - -import com.damnhandy.uri.template.UriTemplate; -import com.logicalclocks.hsfs.EntityEndpointType; -import com.logicalclocks.hsfs.FeatureStore; -import com.logicalclocks.hsfs.FeatureStoreException; -import org.apache.http.HttpHeaders; -import org.apache.http.client.methods.HttpDelete; -import org.apache.http.client.methods.HttpGet; -import org.apache.http.client.methods.HttpPut; -import org.apache.http.entity.StringEntity; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; - -import static com.logicalclocks.hsfs.metadata.HopsworksClient.PROJECT_PATH; -import static com.logicalclocks.hsfs.metadata.HopsworksClient.getInstance; - -public class ExpectationsApi { - - public static final String ENTITY_ROOT_PATH = "{/entityType}"; - public static final String ENTITY_ID_PATH = ENTITY_ROOT_PATH + "{/entityId}"; - public static final String EXPECTATIONS_PATH = - ENTITY_ID_PATH + "/expectations{/name}{?engine,filter_by,sort_by,offset,limit,expand}"; - - private static final Logger LOGGER = LoggerFactory.getLogger(ExpectationsApi.class); - - private EntityEndpointType entityType; - - public ExpectationsApi() { - } - - public ExpectationsApi(EntityEndpointType entityType) { - this.entityType = entityType; - LOGGER.info("ExpectationsApi.EXPECTATIONS_PATH:" + EXPECTATIONS_PATH); - } - - public Expectation put(FeatureStore featureStore, Expectation expectation) throws FeatureStoreException, IOException { - return put(featureStore.getProjectId(), featureStore.getId(), expectation); - } - - public Expectation put(FeatureGroupBase featureGroupBase, String name) throws FeatureStoreException, IOException { - return put(featureGroupBase.getFeatureStore().getProjectId(), featureGroupBase.getId(), - featureGroupBase.getFeatureStore().getId(), name); - } - - private Expectation put(Integer projectId, Integer featurestoreId, Expectation expectation) - throws FeatureStoreException, IOException { - HopsworksClient hopsworksClient = getInstance(); - String pathTemplate = PROJECT_PATH + FeatureStoreApi.FEATURE_STORE_PATH + EXPECTATIONS_PATH; - String uri = UriTemplate.fromTemplate(pathTemplate) - .set("projectId", projectId) - .set("fsId", featurestoreId) - .expand(); - - String expectationStr = hopsworksClient.getObjectMapper().writeValueAsString(expectation); - HttpPut putRequest = new HttpPut(uri); - putRequest.setHeader(HttpHeaders.CONTENT_TYPE, "application/json"); - putRequest.setEntity(new StringEntity(expectationStr)); - - LOGGER.info("Sending metadata request: " + uri); - LOGGER.info(expectationStr); - - return hopsworksClient.handleRequest(putRequest, Expectation.class); - } - - private Expectation put(Integer projectId, Integer entityId, Integer featurestoreId, String name) - throws FeatureStoreException, IOException { - HopsworksClient hopsworksClient = getInstance(); - String pathTemplate = PROJECT_PATH + FeatureStoreApi.FEATURE_STORE_PATH + EXPECTATIONS_PATH; - LOGGER.info("pathTemplate: " + pathTemplate); - String uri = UriTemplate.fromTemplate(pathTemplate) - .set("projectId", projectId) - .set("fsId", featurestoreId) - .set("entityType", entityType.getValue()) - .set("entityId", entityId) - .set("name", name) - .expand(); - - HttpPut putRequest = new HttpPut(uri); - - LOGGER.info("Sending metadata request: " + uri); - return hopsworksClient.handleRequest(putRequest, Expectation.class); - } - - public void detach(FeatureGroupBase featureGroupBase) throws FeatureStoreException, IOException { - for (Expectation expectation : get(featureGroupBase)) { - detach(featureGroupBase, expectation); - } - } - - public void detach(FeatureGroupBase featureGroupBase, Expectation expectation) throws FeatureStoreException, - IOException { - delete(featureGroupBase.getFeatureStore().getProjectId(), - featureGroupBase.getId(), - featureGroupBase.getFeatureStore().getId(), - expectation.getName()); - } - - /** - * Detach an expectation from a feature group. - * @param featureGroupBase feature group - * @param name name of the expectation - * @throws FeatureStoreException FeatureStoreException - * @throws IOException IOException - */ - public void detach(FeatureGroupBase featureGroupBase, String name) throws FeatureStoreException, IOException { - delete(featureGroupBase.getFeatureStore().getProjectId(), featureGroupBase.getId(), - featureGroupBase.getFeatureStore().getId(), - name); - } - - public void delete(FeatureStore featureStore) throws FeatureStoreException, IOException { - for (Expectation expectation : get(featureStore)) { - delete(featureStore, expectation); - } - } - - public void delete(FeatureStore featureStore, Expectation expectation) throws FeatureStoreException, IOException { - delete(featureStore.getProjectId(), null, featureStore.getId(), expectation.getName()); - } - - /** - * Delete an expectation from the feature store. - * @param featureStore featureStore - * @param name name of the expectation - * @throws FeatureStoreException FeatureStoreException - * @throws IOException IOException - */ - public void delete(FeatureStore featureStore, String name) throws FeatureStoreException, IOException { - delete(featureStore.getProjectId(), null, featureStore.getId(), name); - } - - private void delete(Integer projectId, Integer entityId, Integer featurestoreId, String name) - throws FeatureStoreException, IOException { - String pathTemplate = PROJECT_PATH + FeatureStoreApi.FEATURE_STORE_PATH + EXPECTATIONS_PATH; - LOGGER.info("pathTemplate: " + pathTemplate); - UriTemplate uriTemplate = UriTemplate.fromTemplate(pathTemplate) - .set("projectId", projectId) - .set("fsId", featurestoreId) - .set("name", name); - - if (entityId != null) { - uriTemplate.set("entityType", entityType.getValue()) - .set("entityId", entityId); - } - - String uri = uriTemplate.expand(); - HttpDelete deleteRequest = new HttpDelete(uri); - - LOGGER.info("Sending metadata request: " + uri); - HopsworksClient hopsworksClient = getInstance(); - hopsworksClient.handleRequest(deleteRequest); - } - - public List get(FeatureStore featureStore) throws FeatureStoreException, IOException { - return get(featureStore.getProjectId(), null, featureStore.getId(), null); - } - - public Expectation get(FeatureStore featureStore, String name) throws FeatureStoreException, IOException { - List expectations = get(featureStore.getProjectId(), null, featureStore.getId(), name); - return !expectations.isEmpty() ? expectations.get(0) : null; - } - - public List get(FeatureGroupBase featureGroupBase) throws FeatureStoreException, IOException { - return get(featureGroupBase.getFeatureStore().getProjectId(), featureGroupBase.getId(), - featureGroupBase.getFeatureStore().getId(), null); - } - - /** - * Used by PY5J. - * @param projectId projectId - * @param featuregroupId featuregroupId - * @param featurestoreId featurestoreId - * @return list of expectations - * @throws FeatureStoreException FeatureStoreException - * @throws IOException IOException - */ - public List get(Integer projectId, Integer featuregroupId, Integer featurestoreId) - throws FeatureStoreException, IOException { - return get(projectId, featuregroupId, featurestoreId, null); - } - - public Expectation get(FeatureGroupBase featureGroupBase, String name) throws FeatureStoreException, IOException { - List expectations = - get(featureGroupBase.getFeatureStore().getProjectId(), null, featureGroupBase.getFeatureStore().getId(), name); - return !expectations.isEmpty() ? expectations.get(0) : null; - } - - private List get(Integer projectId, Integer entityId, Integer featurestoreId, String name) - throws FeatureStoreException, IOException { - String pathTemplate = PROJECT_PATH + FeatureStoreApi.FEATURE_STORE_PATH + EXPECTATIONS_PATH; - - UriTemplate uriTemplate = UriTemplate.fromTemplate(pathTemplate) - .set("projectId", projectId) - .set("fsId", featurestoreId) - .set("expand","rules"); - - if (entityId != null) { - uriTemplate - .set("entityType", entityType.getValue()) - .set("entityId", entityId); - } - - if (name != null) { - uriTemplate.set("name", name); - } - - String uri = uriTemplate.expand(); - LOGGER.info("Sending metadata request: " + uri); - HttpGet getRequest = new HttpGet(uri); - HopsworksClient hopsworksClient = getInstance(); - Expectation dto = hopsworksClient.handleRequest(getRequest, Expectation.class); - LOGGER.info("Received expectations dto: " + dto); - List expectations; - if (dto.getCount() == null) { - expectations = new ArrayList<>(); - expectations.add(dto); - } else { - expectations = dto.getItems(); - } - LOGGER.info("Received expectations: " + expectations); - return expectations; - } - -} diff --git a/java/src/main/java/com/logicalclocks/hsfs/metadata/FeatureGroupApi.java b/java/src/main/java/com/logicalclocks/hsfs/metadata/FeatureGroupApi.java index 23e98a2cc6..1005a6c5b6 100644 --- a/java/src/main/java/com/logicalclocks/hsfs/metadata/FeatureGroupApi.java +++ b/java/src/main/java/com/logicalclocks/hsfs/metadata/FeatureGroupApi.java @@ -25,7 +25,6 @@ import com.logicalclocks.hsfs.StatisticsConfig; import com.logicalclocks.hsfs.StreamFeatureGroup; import com.logicalclocks.hsfs.TimeTravelFormat; -import com.logicalclocks.hsfs.metadata.validation.ValidationType; import org.apache.http.client.methods.HttpDelete; import org.apache.http.client.methods.HttpGet; import org.apache.http.HttpHeaders; @@ -295,8 +294,7 @@ public FeatureGroup getOrCreateFeatureGroup(FeatureStore featureStore, String na String description, List primaryKeys, List partitionKeys, String hudiPrecombineKey, boolean onlineEnabled, TimeTravelFormat timeTravelFormat, - StatisticsConfig statisticsConfig, ValidationType validationType, - scala.collection.Seq expectations, String eventTime) + StatisticsConfig statisticsConfig, String eventTime) throws IOException, FeatureStoreException { @@ -316,8 +314,6 @@ public FeatureGroup getOrCreateFeatureGroup(FeatureStore featureStore, String na .onlineEnabled(onlineEnabled) .timeTravelFormat(timeTravelFormat) .statisticsConfig(statisticsConfig) - .validationType(validationType) - .expectations(expectations) .eventTime(eventTime) .build(); @@ -335,7 +331,6 @@ public StreamFeatureGroup getOrCreateStreamFeatureGroup(FeatureStore featureStor List partitionKeys, String hudiPrecombineKey, boolean onlineEnabled, StatisticsConfig statisticsConfig, - scala.collection.Seq expectations, String eventTime) throws IOException, FeatureStoreException { @@ -354,7 +349,6 @@ public StreamFeatureGroup getOrCreateStreamFeatureGroup(FeatureStore featureStor .hudiPrecombineKey(hudiPrecombineKey) .onlineEnabled(onlineEnabled) .statisticsConfig(statisticsConfig) - .expectations(expectations) .eventTime(eventTime) .build(); diff --git a/java/src/main/java/com/logicalclocks/hsfs/metadata/FeatureGroupBase.java b/java/src/main/java/com/logicalclocks/hsfs/metadata/FeatureGroupBase.java index 793f07a8d8..428f52a96c 100644 --- a/java/src/main/java/com/logicalclocks/hsfs/metadata/FeatureGroupBase.java +++ b/java/src/main/java/com/logicalclocks/hsfs/metadata/FeatureGroupBase.java @@ -26,18 +26,14 @@ import com.logicalclocks.hsfs.constructor.Filter; import com.logicalclocks.hsfs.constructor.FilterLogic; import com.logicalclocks.hsfs.constructor.Query; -import com.logicalclocks.hsfs.engine.DataValidationEngine; import com.logicalclocks.hsfs.engine.FeatureGroupBaseEngine; import com.logicalclocks.hsfs.engine.StatisticsEngine; -import com.logicalclocks.hsfs.metadata.validation.ValidationType; import lombok.Getter; import lombok.NoArgsConstructor; import lombok.Setter; import org.apache.avro.Schema; -import org.apache.commons.lang3.tuple.ImmutablePair; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import scala.collection.JavaConverters; import java.io.IOException; import java.text.ParseException; @@ -93,10 +89,6 @@ public class FeatureGroupBase { @Setter protected StatisticsConfig statisticsConfig = new StatisticsConfig(); - @Getter - @Setter - protected ValidationType validationType = ValidationType.NONE; - @Getter @Setter protected List expectationsNames; @@ -105,12 +97,8 @@ public class FeatureGroupBase { @Setter protected String location; - @JsonIgnore - protected List expectations; - private FeatureGroupBaseEngine featureGroupBaseEngine = new FeatureGroupBaseEngine(); protected StatisticsEngine statisticsEngine = new StatisticsEngine(EntityEndpointType.FEATURE_GROUP); - protected final ExpectationsApi expectationsApi = new ExpectationsApi(EntityEndpointType.FEATURE_GROUP); private static final Logger LOGGER = LoggerFactory.getLogger(FeatureGroupBase.class); @@ -378,103 +366,6 @@ public List getPrimaryKeys() { return primaryKeys; } - public Expectation getExpectation(String name) throws FeatureStoreException, IOException { - return expectationsApi.get(this, name); - } - - @JsonIgnore - public scala.collection.Seq getExpectations() throws FeatureStoreException, IOException { - return JavaConverters.asScalaBufferConverter(expectationsApi.get(this)).asScala().toSeq(); - } - - public scala.collection.Seq attachExpectations(scala.collection.Seq expectations) - throws FeatureStoreException, IOException { - List expectationsList = new ArrayList<>(); - for (Expectation expectation : (List) JavaConverters.seqAsJavaListConverter(expectations).asJava()) { - expectationsList.add(attachExpectation(expectation)); - } - return JavaConverters.asScalaBufferConverter(expectationsList).asScala().toSeq(); - } - - public Expectation attachExpectation(Expectation expectation) throws FeatureStoreException, IOException { - return attachExpectation(expectation.getName()); - } - - public Expectation attachExpectation(String name) throws FeatureStoreException, IOException { - // Turn on validation for this FG and set stricter setting - if (validationType == ValidationType.NONE) { - updateValidationType(ValidationType.STRICT); - } - return expectationsApi.put(this, name); - } - - public void detachExpectation(Expectation expectation) throws FeatureStoreException, IOException { - detachExpectation(expectation.getName()); - } - - public void detachExpectation(String name) throws FeatureStoreException, IOException { - expectationsApi.detach(this, name); - } - - public void detachExpectations(scala.collection.Seq expectations) - throws FeatureStoreException, IOException { - for (Expectation expectation : (List) JavaConverters.seqAsJavaListConverter(expectations).asJava()) { - expectationsApi.detach(this, expectation); - } - } - - /** - * Update the FG validation type. - * @param validationType validationType - * @throws FeatureStoreException FeatureStoreException - * @throws IOException IOException - */ - public void updateValidationType(ValidationType validationType) throws FeatureStoreException, IOException { - this.validationType = validationType; - featureGroupBaseEngine.updateValidationType(this, this.getClass()); - } - - @JsonIgnore - public FeatureGroupValidation getValidation(Long time, DataValidationEngine.ValidationTimeType type) - throws FeatureStoreException, IOException { - return DataValidationEngine.getInstance().getValidation(this, - new ImmutablePair<>(type, time)); - } - - public FeatureGroupValidation validate() throws FeatureStoreException, IOException { - // Run data validation for entire feature group - return DataValidationEngine.getInstance().validate(this, this.read(), expectations, true); - } - - public FeatureGroupValidation validate(S data) throws FeatureStoreException, IOException { - return validate(data, false); - } - - public FeatureGroupValidation validate(S data, Boolean logActivity) throws FeatureStoreException, - IOException { - // Check if an expectation contains features. If it does not, try to use all the current FG features - List expectations = expectationsApi.get(this); - final List features = new ArrayList<>(); - LOGGER.debug("validate :: expectations = " + expectations); - for (Expectation expectation : expectations) { - if (expectation.getFeatures() == null || expectation.getFeatures().isEmpty()) { - // Get all feature names from FG - LOGGER.debug("validate :: getFeatures = " + getFeatures()); - if (features.isEmpty()) { - getFeatures().stream().forEach(x -> features.add(x.getName())); - } - expectation.setFeatures(features); - LOGGER.debug("validate :: expectation = " + expectation); - } - } - return DataValidationEngine.getInstance().validate(this, data, expectations, logActivity); - } - - @JsonIgnore - public List getValidations() throws FeatureStoreException, IOException { - return DataValidationEngine.getInstance().getValidations(this); - } - public String getOnlineTopicName() throws FeatureStoreException, IOException { // This method should be overridden by the FeatureGroup/StreamFeatureGroup classes return null; diff --git a/java/src/main/java/com/logicalclocks/hsfs/metadata/FeatureGroupValidation.java b/java/src/main/java/com/logicalclocks/hsfs/metadata/FeatureGroupValidation.java deleted file mode 100644 index fa6e7d523d..0000000000 --- a/java/src/main/java/com/logicalclocks/hsfs/metadata/FeatureGroupValidation.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2020 Logical Clocks AB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * - * See the License for the specific language governing permissions and limitations under the License. - */ - -package com.logicalclocks.hsfs.metadata; - -import com.fasterxml.jackson.annotation.JsonIgnore; -import com.fasterxml.jackson.annotation.JsonIgnoreProperties; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Getter; -import lombok.NoArgsConstructor; -import lombok.Setter; -import lombok.ToString; - -import java.time.Instant; -import java.time.LocalDateTime; -import java.time.ZoneOffset; -import java.time.format.DateTimeFormatter; -import java.util.List; - -@JsonIgnoreProperties(ignoreUnknown = true) -@NoArgsConstructor -@AllArgsConstructor -@ToString -@Builder -public class FeatureGroupValidation extends RestDto { - - @Getter @Setter - private Integer validationId; - @Getter @Setter - private Long validationTime; - @Getter @Setter - private Long commitTime; - @Getter @Setter - private List expectationResults; - @Getter @Setter - private String validationPath; - @Getter @Setter - private ExpectationResult.Status status; - - @JsonIgnore - public String getCommitTimeAsDateTimeFormat() { - Instant instant = Instant.ofEpochSecond(commitTime); - return LocalDateTime.ofInstant(instant, ZoneOffset.UTC).format(DateTimeFormatter.ofPattern("yyyyMMddHHmmss")); - } - -} diff --git a/java/src/main/java/com/logicalclocks/hsfs/metadata/FeatureGroupValidations.java b/java/src/main/java/com/logicalclocks/hsfs/metadata/FeatureGroupValidations.java deleted file mode 100644 index 8f953d7326..0000000000 --- a/java/src/main/java/com/logicalclocks/hsfs/metadata/FeatureGroupValidations.java +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) 2020 Logical Clocks AB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * - * See the License for the specific language governing permissions and limitations under the License. - */ - -package com.logicalclocks.hsfs.metadata; - -import com.fasterxml.jackson.annotation.JsonIgnoreProperties; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Getter; -import lombok.NoArgsConstructor; -import lombok.Setter; - -import java.util.List; - -@JsonIgnoreProperties(ignoreUnknown = true) -@NoArgsConstructor -@AllArgsConstructor -@Builder -public class FeatureGroupValidations { - - @Getter @Setter - private Long validationTime; - @Getter @Setter - private List expectationResults; - @Getter @Setter - private Boolean logActivity = true; - - @Override - public String toString() { - return "DataValidationResults{" - + "validationTime=" + validationTime - + ", expectationResults='" + expectationResults + '\'' - + ", logActivity='" + logActivity + '\'' - + '}'; - } -} diff --git a/java/src/main/java/com/logicalclocks/hsfs/metadata/FeatureGroupValidationsApi.java b/java/src/main/java/com/logicalclocks/hsfs/metadata/FeatureGroupValidationsApi.java deleted file mode 100644 index 8850410da0..0000000000 --- a/java/src/main/java/com/logicalclocks/hsfs/metadata/FeatureGroupValidationsApi.java +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Copyright (c) 2020 Logical Clocks AB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * - * See the License for the specific language governing permissions and limitations under the License. - */ - -package com.logicalclocks.hsfs.metadata; - -import com.damnhandy.uri.template.UriTemplate; -import com.logicalclocks.hsfs.EntityEndpointType; -import com.logicalclocks.hsfs.FeatureStoreException; -import com.logicalclocks.hsfs.engine.DataValidationEngine; -import lombok.NonNull; -import org.apache.commons.lang3.tuple.ImmutablePair; -import org.apache.http.HttpHeaders; -import org.apache.http.client.methods.HttpGet; -import org.apache.http.client.methods.HttpPut; -import org.apache.http.entity.StringEntity; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; - -import static com.logicalclocks.hsfs.metadata.HopsworksClient.PROJECT_PATH; -import static com.logicalclocks.hsfs.metadata.HopsworksClient.getInstance; - -public class FeatureGroupValidationsApi { - - public static final String ENTITY_ROOT_PATH = "{/entityType}"; - public static final String ENTITY_ID_PATH = ENTITY_ROOT_PATH + "{/entityId}"; - public static final String RESULTS_PATH = - ENTITY_ID_PATH + "/validations{/id}{?filter_by,sort_by,offset,limit}"; - - private static final Logger LOGGER = LoggerFactory.getLogger(FeatureGroupValidationsApi.class); - - private final EntityEndpointType entityType; - - public FeatureGroupValidationsApi(@NonNull EntityEndpointType entityType) { - this.entityType = entityType; - } - - public List get(FeatureGroupBase featureGroupBase) - throws FeatureStoreException, IOException { - return get(featureGroupBase.getFeatureStore().getProjectId(), featureGroupBase.getFeatureStore().getId(), - featureGroupBase.getId(), null); - } - - public FeatureGroupValidation get(FeatureGroupBase featureGroupBase, - ImmutablePair pair) throws FeatureStoreException, IOException { - return get(featureGroupBase.getFeatureStore().getProjectId(), featureGroupBase.getFeatureStore().getId(), - featureGroupBase.getId(), pair).get(0); - } - - private List get(Integer projectId, Integer featurestoreId, Integer entityId, - ImmutablePair pair) - throws FeatureStoreException, IOException { - - String pathTemplate = PROJECT_PATH + FeatureStoreApi.FEATURE_STORE_PATH + RESULTS_PATH; - - UriTemplate uriTemplate = UriTemplate.fromTemplate(pathTemplate) - .set("projectId", projectId) - .set("fsId", featurestoreId) - .set("entityType", entityType.getValue()) - .set("entityId", entityId); - if (pair != null) { - if (pair.getLeft() == DataValidationEngine.ValidationTimeType.VALIDATION_TIME) { - uriTemplate.set("filter_by", "validation_time_eq:" + pair.getRight()); - } else if (pair.getLeft() == DataValidationEngine.ValidationTimeType.COMMIT_TIME) { - uriTemplate.set("filter_by", "commit_time_eq:" + pair.getRight()); - } - } - - String uri = uriTemplate.expand(); - LOGGER.info("Sending metadata request: " + uri); - HttpGet getRequest = new HttpGet(uri); - HopsworksClient hopsworksClient = getInstance(); - FeatureGroupValidation dto = hopsworksClient.handleRequest(getRequest, FeatureGroupValidation.class); - List validations; - if (dto.getCount() == null) { - validations = new ArrayList<>(); - validations.add(dto); - } else { - validations = dto.getItems(); - } - LOGGER.info("Received validations: " + validations); - return validations; - } - - - public FeatureGroupValidation put(FeatureGroupBase featureGroupBase, - FeatureGroupValidation featureGroupValidation, Boolean logActivity) - throws FeatureStoreException, IOException { - return put(featureGroupBase.getFeatureStore().getProjectId(), featureGroupBase.getFeatureStore().getId(), - featureGroupBase.getId(), featureGroupValidation, logActivity); - } - - private FeatureGroupValidation put(Integer projectId, Integer featurestoreId, Integer entityId, - FeatureGroupValidation featureGroupValidation, Boolean logActivity) - throws FeatureStoreException, IOException { - - HopsworksClient hopsworksClient = getInstance(); - String pathTemplate = PROJECT_PATH + FeatureStoreApi.FEATURE_STORE_PATH + RESULTS_PATH; - - String uri = UriTemplate.fromTemplate(pathTemplate) - .set("projectId", projectId) - .set("fsId", featurestoreId) - .set("entityType", entityType.getValue()) - .set("entityId", entityId) - .expand(); - - FeatureGroupValidations validations = - FeatureGroupValidations.builder().expectationResults(featureGroupValidation.getExpectationResults()) - .validationTime(featureGroupValidation.getValidationTime()).build(); - if (logActivity != null) { - validations.setLogActivity(logActivity); - } - - String results = hopsworksClient.getObjectMapper().writeValueAsString(validations); - HttpPut putRequest = new HttpPut(uri); - putRequest.setHeader(HttpHeaders.CONTENT_TYPE, "application/json"); - putRequest.setEntity(new StringEntity(results)); - - LOGGER.info("Sending metadata request: " + uri); - LOGGER.info(results); - - return hopsworksClient.handleRequest(putRequest, FeatureGroupValidation.class); - } - -} diff --git a/java/src/main/java/com/logicalclocks/hsfs/metadata/RuleDefinition.java b/java/src/main/java/com/logicalclocks/hsfs/metadata/RuleDefinition.java deleted file mode 100644 index c8f35c8496..0000000000 --- a/java/src/main/java/com/logicalclocks/hsfs/metadata/RuleDefinition.java +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2020 Logical Clocks AB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * - * See the License for the specific language governing permissions and limitations under the License. - */ - -package com.logicalclocks.hsfs.metadata; - -import com.fasterxml.jackson.annotation.JsonIgnoreProperties; -import com.logicalclocks.hsfs.metadata.validation.FeatureType; -import com.logicalclocks.hsfs.metadata.validation.RuleName; -import com.logicalclocks.hsfs.metadata.validation.Predicate; -import com.logicalclocks.hsfs.metadata.validation.AcceptedType; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Getter; -import lombok.NoArgsConstructor; -import lombok.Setter; -import lombok.ToString; - -@JsonIgnoreProperties(ignoreUnknown = true) -@NoArgsConstructor -@AllArgsConstructor -@ToString -@Builder -/* - Used when fetching validation rules from the /rules resource. - */ -public class RuleDefinition extends RestDto { - - @Getter @Setter - private RuleName name; - @Getter @Setter - private Predicate predicate; - @Getter @Setter - private AcceptedType acceptedType; - @Getter @Setter - private FeatureType featureType; - @Getter @Setter - private String description; - -} diff --git a/java/src/main/java/com/logicalclocks/hsfs/metadata/RulesApi.java b/java/src/main/java/com/logicalclocks/hsfs/metadata/RulesApi.java deleted file mode 100644 index c72b179e58..0000000000 --- a/java/src/main/java/com/logicalclocks/hsfs/metadata/RulesApi.java +++ /dev/null @@ -1,58 +0,0 @@ -package com.logicalclocks.hsfs.metadata; - -import com.damnhandy.uri.template.UriTemplate; -import com.logicalclocks.hsfs.FeatureStoreException; -import com.logicalclocks.hsfs.metadata.validation.RuleName; -import org.apache.http.client.methods.HttpGet; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import scala.collection.JavaConverters; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; - -import static com.logicalclocks.hsfs.metadata.HopsworksClient.API_PATH; -import static com.logicalclocks.hsfs.metadata.HopsworksClient.getInstance; - -public class RulesApi { - - public static final String RULE_DEFINITIONS_PATH = API_PATH - + "/rules{/name}{/predicate}{?filter_by,sort_by,offset,limit}"; - - private static final Logger LOGGER = LoggerFactory.getLogger(RulesApi.class); - - - public scala.collection.Seq get() throws FeatureStoreException, IOException { - return JavaConverters.asScalaBufferConverter(getRules(null)).asScala().toSeq(); - } - - public RuleDefinition get(RuleName name) throws FeatureStoreException, IOException { - return getRules(name).get(0); - } - - private List getRules(RuleName name) - throws FeatureStoreException, IOException { - - UriTemplate uriTemplate = UriTemplate.fromTemplate(RULE_DEFINITIONS_PATH); - - if (name != null) { - uriTemplate.set("name", name); - } - String uri = uriTemplate.expand(); - - LOGGER.info("Sending metadata request: " + uri); - HttpGet getRequest = new HttpGet(uri); - HopsworksClient hopsworksClient = getInstance(); - RuleDefinition rulesDto = hopsworksClient.handleRequest(getRequest, RuleDefinition.class); - List rules; - if (rulesDto.getCount() == null) { - rules = new ArrayList<>(); - rules.add(rulesDto); - } else { - rules = rulesDto.getItems(); - } - LOGGER.info("Received ruleDefinitions: " + rules); - return rules; - } -} diff --git a/java/src/main/java/com/logicalclocks/hsfs/metadata/ValidationResult.java b/java/src/main/java/com/logicalclocks/hsfs/metadata/ValidationResult.java deleted file mode 100644 index e400c5b03e..0000000000 --- a/java/src/main/java/com/logicalclocks/hsfs/metadata/ValidationResult.java +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2020 Logical Clocks AB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * - * See the License for the specific language governing permissions and limitations under the License. - */ - -package com.logicalclocks.hsfs.metadata; - -import com.fasterxml.jackson.annotation.JsonIgnoreProperties; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Getter; -import lombok.NoArgsConstructor; -import lombok.Setter; -import com.logicalclocks.hsfs.metadata.validation.Rule; -import lombok.ToString; - -import java.util.List; - -@JsonIgnoreProperties(ignoreUnknown = true) -@NoArgsConstructor -@AllArgsConstructor -@ToString -@Builder -public class ValidationResult { - - @Getter @Setter - private ExpectationResult.Status status; - @Getter @Setter - private String message; - @Getter @Setter - private String value; - @Getter @Setter - private List features; - @Getter @Setter - private Rule rule; -} diff --git a/java/src/main/java/com/logicalclocks/hsfs/metadata/validation/AcceptedType.java b/java/src/main/java/com/logicalclocks/hsfs/metadata/validation/AcceptedType.java deleted file mode 100644 index f116830ab5..0000000000 --- a/java/src/main/java/com/logicalclocks/hsfs/metadata/validation/AcceptedType.java +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (c) 2020 Logical Clocks AB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * - * See the License for the specific language governing permissions and limitations under the License. - */ - -package com.logicalclocks.hsfs.metadata.validation; - -public enum AcceptedType { - Null, Fractional, Integral, Boolean, String -} diff --git a/java/src/main/java/com/logicalclocks/hsfs/metadata/validation/FeatureType.java b/java/src/main/java/com/logicalclocks/hsfs/metadata/validation/FeatureType.java deleted file mode 100644 index f353a10458..0000000000 --- a/java/src/main/java/com/logicalclocks/hsfs/metadata/validation/FeatureType.java +++ /dev/null @@ -1,5 +0,0 @@ -package com.logicalclocks.hsfs.metadata.validation; - -public enum FeatureType { - Numerical, Categorical -} diff --git a/java/src/main/java/com/logicalclocks/hsfs/metadata/validation/Level.java b/java/src/main/java/com/logicalclocks/hsfs/metadata/validation/Level.java deleted file mode 100644 index d9886bfdec..0000000000 --- a/java/src/main/java/com/logicalclocks/hsfs/metadata/validation/Level.java +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright (c) 2020 Logical Clocks AB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * - * See the License for the specific language governing permissions and limitations under the License. - */ - -package com.logicalclocks.hsfs.metadata.validation; - -public enum Level { - WARNING, - ERROR; -} diff --git a/java/src/main/java/com/logicalclocks/hsfs/metadata/validation/Predicate.java b/java/src/main/java/com/logicalclocks/hsfs/metadata/validation/Predicate.java deleted file mode 100644 index 5ad5323eff..0000000000 --- a/java/src/main/java/com/logicalclocks/hsfs/metadata/validation/Predicate.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright (c) 2020 Logical Clocks AB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * - * See the License for the specific language governing permissions and limitations under the License. - */ - -package com.logicalclocks.hsfs.metadata.validation; - -public enum Predicate { - LEGAL_VALUES, - ACCEPTED_TYPE, - PATTERN, - FEATURE -} diff --git a/java/src/main/java/com/logicalclocks/hsfs/metadata/validation/Rule.java b/java/src/main/java/com/logicalclocks/hsfs/metadata/validation/Rule.java deleted file mode 100644 index e8c61f7053..0000000000 --- a/java/src/main/java/com/logicalclocks/hsfs/metadata/validation/Rule.java +++ /dev/null @@ -1,74 +0,0 @@ -package com.logicalclocks.hsfs.metadata.validation; - -import com.fasterxml.jackson.annotation.JsonIgnoreProperties; -import com.logicalclocks.hsfs.metadata.RuleDefinition; -import com.logicalclocks.hsfs.metadata.RulesApi; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Getter; -import lombok.NoArgsConstructor; -import lombok.Setter; -import lombok.ToString; -import scala.collection.JavaConverters; - -import java.util.List; - -@JsonIgnoreProperties(ignoreUnknown = true) -@NoArgsConstructor -@AllArgsConstructor -@ToString -/* - Used when creating rules for FeatureStore expectations. - */ -public class Rule { - - private final RulesApi featureStoreRulesApi = new RulesApi(); - - @Getter @Setter - private RuleName name; - @Getter @Setter - private Level level; - @Getter @Setter - private Double min; - @Getter @Setter - private Double max; - @Getter @Setter - private String value; - @Getter @Setter - private String pattern; - @Getter @Setter - private AcceptedType acceptedType; - @Getter @Setter - private String feature; - @Getter @Setter - private List legalValues; - - public static Rule.RuleBuilder createRule(RuleDefinition rule) { - return com.logicalclocks.hsfs.metadata.validation.Rule.builder().rule(rule); - } - - public static Rule.RuleBuilder createRule(RuleName name) { - return com.logicalclocks.hsfs.metadata.validation.Rule.builder().name(name); - } - - @Builder - public Rule(RuleName name, RuleDefinition rule, Level level, Double min, Double max, String value, String pattern, - AcceptedType acceptedType, String feature, scala.collection.Seq legalValues) { - if (rule != null) { - this.name = rule.getName(); - } else { - this.name = name; - } - this.level = level; - this.min = min; - this.max = max; - this.value = value; - this.pattern = pattern; - this.acceptedType = acceptedType; - this.feature = feature; - if (legalValues != null) { - this.legalValues = JavaConverters.seqAsJavaListConverter(legalValues).asJava(); - } - } - -} diff --git a/java/src/main/java/com/logicalclocks/hsfs/metadata/validation/RuleName.java b/java/src/main/java/com/logicalclocks/hsfs/metadata/validation/RuleName.java deleted file mode 100644 index 38cd1f3707..0000000000 --- a/java/src/main/java/com/logicalclocks/hsfs/metadata/validation/RuleName.java +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2020 Logical Clocks AB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * - * See the License for the specific language governing permissions and limitations under the License. - */ - -package com.logicalclocks.hsfs.metadata.validation; - -public enum RuleName { - HAS_MEAN, - HAS_MIN, - HAS_MAX, - HAS_SUM, - HAS_SIZE, - HAS_COMPLETENESS, - HAS_UNIQUENESS, - HAS_DISTINCTNESS, - HAS_UNIQUE_VALUE_RATIO, - HAS_NUMBER_OF_DISTINCT_VALUES, - HAS_ENTROPY, - HAS_MUTUAL_INFORMATION, - HAS_APPROX_QUANTILE, - HAS_STANDARD_DEVIATION, - HAS_APPROX_COUNT_DISTINCT, - HAS_CORRELATION, - HAS_PATTERN, - HAS_MIN_LENGTH, - HAS_MAX_LENGTH, - HAS_DATATYPE, - IS_NON_NEGATIVE, - IS_POSITIVE, - IS_LESS_THAN, - IS_LESS_THAN_OR_EQUAL_TO, - IS_GREATER_THAN, - IS_GREATER_THAN_OR_EQUAL_TO, - IS_CONTAINED_IN -} diff --git a/java/src/main/java/com/logicalclocks/hsfs/metadata/validation/ValidationType.java b/java/src/main/java/com/logicalclocks/hsfs/metadata/validation/ValidationType.java deleted file mode 100644 index 291801de67..0000000000 --- a/java/src/main/java/com/logicalclocks/hsfs/metadata/validation/ValidationType.java +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2020 Logical Clocks AB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * - * See the License for the specific language governing permissions and limitations under the License. - */ - -package com.logicalclocks.hsfs.metadata.validation; - -public enum ValidationType { - // Data validation is performed and feature group is updated only if validation status is "Success" - STRICT(1), - // Data validation is performed and feature group is updated only if validation status is "Warning" or lower - WARNING(2), - // Data validation is performed and feature group is updated only if validation status is "Failure" or lower - ALL(3), - // Data validation not performed on feature group - NONE(4); - private final int severity; - - private ValidationType(int severity) { - this.severity = severity; - } - - public int getSeverity() { - return severity; - } - - public static ValidationType fromSeverity(int v) { - for (ValidationType c : ValidationType.values()) { - if (c.severity == v) { - return c; - } - } - throw new IllegalArgumentException("" + v); - } -} diff --git a/mkdocs.yml b/mkdocs.yml index c8e6a98880..014eaa26cc 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -25,10 +25,6 @@ nav: - Storage Connector: generated/api/storage_connector_api.md - Feature: generated/api/feature_api.md - Query: generated/api/query_api.md - - Rule Definiton: generated/api/rule_definition_api.md - - Rule: generated/api/rule_api.md - - Expectation: generated/api/expectation_api.md - - Validation: generated/api/validation_api.md - Transformation Functions: generated/api/transformation_functions_api.md - Job configuration: generated/api/job_configuration.md - Contributing: CONTRIBUTING.md diff --git a/python/hsfs/connection.py b/python/hsfs/connection.py index e00aa158a7..2465903077 100644 --- a/python/hsfs/connection.py +++ b/python/hsfs/connection.py @@ -21,7 +21,7 @@ from hsfs.decorators import connected, not_connected from hsfs import engine, client, util -from hsfs.core import feature_store_api, project_api, hosts_api, services_api, rules_api +from hsfs.core import feature_store_api, project_api, hosts_api, services_api AWS_DEFAULT_REGION = "default" HOPSWORKS_PORT_DEFAULT = 443 @@ -135,7 +135,6 @@ def __init__( self._api_key_file = api_key_file self._api_key_value = api_key_value self._connected = False - self._rules_api = rules_api.RulesApi() self.connect() @@ -486,16 +485,6 @@ def api_key_file(self, api_key_file): def api_key_value(self, api_key_value): self._api_key_value = api_key_value - def get_rules(self): - """Get a rule with a certain name or all rules available for data validation.""" - - return self._rules_api.get() - - def get_rule(self, name: str): - """Get a rule with a certain name or all rules available for data validation.""" - - return self._rules_api.get(name) - def __enter__(self): self.connect() return self diff --git a/python/hsfs/core/data_validation_engine.py b/python/hsfs/core/data_validation_engine.py deleted file mode 100644 index f05c7c9720..0000000000 --- a/python/hsfs/core/data_validation_engine.py +++ /dev/null @@ -1,154 +0,0 @@ -# -# Copyright 2020 Logical Clocks AB -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import time - -from hsfs import ( - engine, - expectation, - rule, - expectation_result, - feature_group_validation, - validation_result, -) -from hsfs.client import exceptions -from hsfs.core import validations_api, expectations_api - - -class DataValidationEngine: - def __init__(self, feature_store_id, entity_type): - self._feature_group_validation_api = validations_api.FeatureGroupValidationsApi( - feature_store_id, entity_type - ) - self._expectations_api = expectations_api.ExpectationsApi( - feature_store_id, entity_type - ) - - def ingest_validate(self, feature_group, feature_dataframe): - if feature_group.validation_type != "NONE" and engine.get_type() == "spark": - # If the engine is Python, the validation will be executed by - # the Hopsworks job ingesting the data - return self.validate(feature_group, feature_dataframe, True) - return None - - def validate(self, feature_group, feature_dataframe, log_activity): - """Perform data validation for a dataframe and send the result json to Hopsworks.""" - - validation_time = int(round(time.time() * 1000)) - if len(feature_dataframe.head(1)) == 0: - raise exceptions.FeatureStoreException( - "There is no data in the entity that you are trying to validate data " - "for. A possible cause might be that you inserted only data " - "to the online storage of a feature group." - ) - - expectations = self._expectations_api.get(feature_group=feature_group) - # Check if an expectation contains features. If it does not, try to use all the current FG features - feature_names = [feature.name for feature in feature_group.features] - for exp in expectations: - if not exp.features: - exp.features = feature_names - expectation_results_java = engine.get_instance().validate( - feature_dataframe, expectations - ) - # Loop through Java object and convert to Python - expectation_results = [] - for exp_res in expectation_results_java: - # Create the Expectation - exp = exp_res.getExpectation() - rules_python = [] - for exp_rule in exp.getRules(): - legal_values = [] - if exp_rule.getLegalValues() is not None: - for legal_value in exp_rule.getLegalValues(): - legal_values.append(legal_value) - rules_python.append( - rule.Rule( - name=exp_rule.getName().name(), - level=exp_rule.getLevel().name(), - min=exp_rule.getMin(), - max=exp_rule.getMax(), - pattern=exp_rule.getPattern(), - accepted_type=exp_rule.getAcceptedType().name() - if exp_rule.getAcceptedType() is not None - else None, - feature=exp_rule.getFeature(), - legal_values=legal_values, - ) - ) - - features_python = [] - for feature in exp.getFeatures(): - features_python.append(feature) - expectation_python = expectation.Expectation( - name=exp.getName(), - description=exp.getDescription(), - features=features_python, - rules=rules_python, - ) - # Create the ValidationResult - validation_results_python = [] - for validation_result_java in exp_res.getResults(): - # Create rule python - legal_values = [] - if validation_result_java.getRule().getLegalValues() is not None: - for ( - legal_value - ) in validation_result_java.getRule().getLegalValues(): - legal_values.append(legal_value) - validation_rule_python = rule.Rule( - name=validation_result_java.getRule().getName().name(), - level=validation_result_java.getRule().getLevel().name(), - min=validation_result_java.getRule().getMin(), - max=validation_result_java.getRule().getMax(), - pattern=validation_result_java.getRule().getPattern(), - accepted_type=validation_result_java.getRule() - .getAcceptedType() - .name() - if validation_result_java.getRule().getAcceptedType() is not None - else None, - feature=validation_result_java.getRule().getFeature(), - legal_values=legal_values, - ) - - features = [feature for feature in validation_result_java.getFeatures()] - - validation_results_python.append( - validation_result.ValidationResult( - status=validation_result_java.getStatus().name(), - message=validation_result_java.getMessage(), - value=validation_result_java.getValue(), - features=features, - rule=validation_rule_python, - ) - ) - - expectation_result_python = expectation_result.ExpectationResult( - expectation=expectation_python, results=validation_results_python - ) - expectation_results.append(expectation_result_python) - validation_python = feature_group_validation.FeatureGroupValidation( - validation_time=validation_time, - expectation_results=expectation_results, - log_activity=log_activity, - ) - return self._feature_group_validation_api.put(feature_group, validation_python) - - def get_validations(self, feature_group, validation_time=None, commit_time=None): - """Get feature group data validation results for the specified validation or commit time.""" - return self._feature_group_validation_api.get( - feature_group, validation_time, commit_time - ) diff --git a/python/hsfs/core/expectations_api.py b/python/hsfs/core/expectations_api.py deleted file mode 100644 index e723084254..0000000000 --- a/python/hsfs/core/expectations_api.py +++ /dev/null @@ -1,144 +0,0 @@ -# -# Copyright 2020 Logical Clocks AB -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from hsfs import client, expectation - - -class ExpectationsApi: - def __init__(self, feature_store_id, entity_type=None): - """Expectations endpoint for `featurestores` and `featuregroups` resource. - - :param feature_store_id: id of the respective featurestore - :type feature_store_id: int - :param entity_type: "featuregroups" - :type entity_type: str - """ - self._feature_store_id = feature_store_id - self._entity_type = entity_type - - def create(self, expectation): - """Create and Feature Store expectation or Attach it by name to a Feature Group. - - :param expectation: expectation object to be created for a feature store - :type expectation: `Expectation` - """ - _client = client.get_instance() - path_params = [ - "project", - _client._project_id, - "featurestores", - self._feature_store_id, - "expectations", - ] - - headers = {"content-type": "application/json"} - payload = expectation.json() if expectation else None - _client._send_request("PUT", path_params, headers=headers, data=payload) - - def attach(self, feature_group, name): - """Attach a Feature Store expectation to a Feature Group. - - :param feature_group: metadata object of the instance to attach the expectation to - :type feature_group: FeatureGroup - :param name: name of the expectation to be attached - :type name: str - """ - _client = client.get_instance() - path_params = [ - "project", - _client._project_id, - "featurestores", - self._feature_store_id, - self._entity_type, - feature_group.id, - "expectations", - name, - ] - - _client._send_request("PUT", path_params) - - def delete(self, name): - """Delete a Feature Store expectation. - - :param name: name of the expectation to be deleted - :type name: str - """ - _client = client.get_instance() - path_params = [ - "project", - _client._project_id, - "featurestores", - self._feature_store_id, - "expectations", - name, - ] - - _client._send_request("DELETE", path_params) - - def detach(self, feature_group, name): - """Detach a Feature Store expectation from a Feature Group. - - :param feature_group: metadata object of the instance to attach the expectation to - :type feature_group: FeatureGroup - :param name: name of the expectation to be attached - :type name: str - """ - _client = client.get_instance() - path_params = [ - "project", - _client._project_id, - "featurestores", - self._feature_store_id, - self._entity_type, - feature_group.id, - "expectations", - name, - ] - - _client._send_request("DELETE", path_params) - - def get(self, name=None, feature_group=None): - """Get the expectations of a feature store or feature group. - - Gets all feature store expectations if no feature group is specified. - Gets all feature store or feature group expectations if no name is specified. - - :param name: expectation name - :type name: str - :param feature_group: feature group to get the expectations of - :type feature_group: FeatureGroup - :return: list of expectations - :rtype: list of dict - """ - _client = client.get_instance() - path_params = [ - "project", - _client._project_id, - "featurestores", - self._feature_store_id, - ] - - if feature_group is not None: - path_params.extend([self._entity_type, feature_group.id, "expectations"]) - else: - path_params.append("expectations") - - if name: - path_params.append(name) - - return expectation.Expectation.from_response_json( - _client._send_request("GET", path_params) - ) diff --git a/python/hsfs/core/expectations_engine.py b/python/hsfs/core/expectations_engine.py deleted file mode 100644 index 89ff59465c..0000000000 --- a/python/hsfs/core/expectations_engine.py +++ /dev/null @@ -1,31 +0,0 @@ -# -# Copyright 2020 Logical Clocks AB -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from hsfs.core import expectations_api - - -class ExpectationsEngine: - def __init__(self, feature_store_id): - """Expectations engine. - - :param feature_store_id: id of the respective featurestore - :type feature_store_id: int - """ - self._feature_store_id = feature_store_id - self._expectations_api = expectations_api.ExpectationsApi(feature_store_id) - - def save(self, expectation): - self._expectations_api.create(expectation) diff --git a/python/hsfs/core/feature_group_api.py b/python/hsfs/core/feature_group_api.py index b178672c11..6c53bba8b2 100644 --- a/python/hsfs/core/feature_group_api.py +++ b/python/hsfs/core/feature_group_api.py @@ -149,8 +149,7 @@ def update_metadata( feature group. feature_group_copy: FeatureGroup. Metadata object of the feature group with the information to be updated. - query_parameter: str. Query parameter that controls which information is updated. E.g. "updateMetadata", - or "validationType". + query_parameter: str. Query parameter that controls which information is updated. E.g. "updateMetadata". query_parameter_value: Str. Value of the query_parameter. # Returns diff --git a/python/hsfs/core/feature_group_base_engine.py b/python/hsfs/core/feature_group_base_engine.py index 3dbe5adaa9..b6dbf570bb 100644 --- a/python/hsfs/core/feature_group_base_engine.py +++ b/python/hsfs/core/feature_group_base_engine.py @@ -62,12 +62,3 @@ def new_feature_list(self, feature_group, updated_features): ): new_features.append(feature) return new_features + updated_features - - def update_validation_type(self, feature_group): - """Update the metadata attribute specified of the feature group .""" - self._feature_group_api.update_metadata( - feature_group, - feature_group, - "validationType", - feature_group.validation_type, - ) diff --git a/python/hsfs/core/feature_group_engine.py b/python/hsfs/core/feature_group_engine.py index be847a7ddc..7601b14b09 100644 --- a/python/hsfs/core/feature_group_engine.py +++ b/python/hsfs/core/feature_group_engine.py @@ -33,12 +33,6 @@ def save(self, feature_group, feature_dataframe, write_options, validation_optio feature_group, feature_dataframe, write_options ) - # deequ validation only on spark - validation = feature_group._data_validation_engine.ingest_validate( - feature_group, feature_dataframe - ) - validation_id = validation.validation_id if validation is not None else None - # ge validation on python and non stream feature groups on spark ge_report = feature_group._great_expectation_engine.validate( feature_group, feature_dataframe, True, validation_options @@ -61,7 +55,6 @@ def save(self, feature_group, feature_dataframe, write_options, validation_optio None, offline_write_options, online_write_options, - validation_id, ), ge_report, ) @@ -82,12 +75,6 @@ def insert( feature_group, feature_dataframe, write_options ) - # deequ validation only on spark - validation = feature_group._data_validation_engine.ingest_validate( - feature_group, feature_dataframe - ) - validation_id = validation.validation_id if validation is not None else None - # ge validation on python and non stream feature groups on spark ge_report = feature_group._great_expectation_engine.validate( feature_group, feature_dataframe, True, validation_options @@ -116,7 +103,6 @@ def insert( storage, offline_write_options, online_write_options, - validation_id, ), ge_report, ) diff --git a/python/hsfs/core/rules_api.py b/python/hsfs/core/rules_api.py deleted file mode 100644 index 3daff30355..0000000000 --- a/python/hsfs/core/rules_api.py +++ /dev/null @@ -1,42 +0,0 @@ -# -# Copyright 2020 Logical Clocks AB -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from hsfs import client, ruledefinition - - -class RulesApi: - def __init__(self): - """Rules endpoint for Feature Store data validation rules.""" - - def get(self, name=None): - """Get the rules available in Hopsworks to be used for data validation. - - Gets all rules if no rule name is specified. - - :param name: rule name - :type name: str - :return: list of rules - :rtype: list - """ - _client = client.get_instance() - path_params = ["rules"] - - if name is not None: - path_params.append(name) - - return ruledefinition.RuleDefinition.from_response_json( - _client._send_request("GET", path_params) - ) diff --git a/python/hsfs/core/validations_api.py b/python/hsfs/core/validations_api.py deleted file mode 100644 index bb4a9216f4..0000000000 --- a/python/hsfs/core/validations_api.py +++ /dev/null @@ -1,79 +0,0 @@ -# -# Copyright 2020 Logical Clocks AB -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from hsfs import client, feature_group_validation as fgv - - -class FeatureGroupValidationsApi: - def __init__(self, feature_store_id, entity_type): - """Data validations endpoint for `trainingdatasets` `featuregroups`. - - :param feature_store_id: id of the respective featurestore - :type feature_store_id: int - :param entity_type: "featuregroups" - :type entity_type: str - """ - self._feature_store_id = feature_store_id - self._entity_type = entity_type - - def put(self, metadata_instance, feature_group_validation): - _client = client.get_instance() - path_params = [ - "project", - _client._project_id, - "featurestores", - self._feature_store_id, - self._entity_type, - metadata_instance.id, - "validations", - ] - headers = {"content-type": "application/json"} - return fgv.FeatureGroupValidation.from_response_json( - _client._send_request( - "PUT", - path_params, - headers=headers, - data=feature_group_validation.json(), - ) - ) - - def get(self, metadata_instance, validation_time=None, commit_time=None): - """Gets the statistics for a specific commit time for an instance.""" - _client = client.get_instance() - path_params = [ - "project", - _client._project_id, - "featurestores", - self._feature_store_id, - self._entity_type, - metadata_instance.id, - "validations", - ] - headers = {"content-type": "application/json"} - if validation_time is not None: - query_params = { - "filter_by": "validation_time_eq:" + str(validation_time), - } - elif commit_time: - query_params = { - "filter_by": "commit_time_eq:" + str(commit_time), - } - else: - query_params = None - - return fgv.FeatureGroupValidation.from_response_json( - _client._send_request("GET", path_params, query_params, headers=headers) - ) diff --git a/python/hsfs/engine/spark.py b/python/hsfs/engine/spark.py index 67f52cdc75..20c2eab054 100644 --- a/python/hsfs/engine/spark.py +++ b/python/hsfs/engine/spark.py @@ -553,53 +553,6 @@ def profile( ) ) - def validate(self, dataframe, expectations, log_activity=True): - """Run data validation on the dataframe with Deequ.""" - - expectations_java = [] - for expectation in expectations: - rules = [] - for rule in expectation.rules: - rules.append( - self._jvm.com.logicalclocks.hsfs.metadata.validation.Rule.builder() - .name( - self._jvm.com.logicalclocks.hsfs.metadata.validation.RuleName.valueOf( - rule.get("name") - ) - ) - .level( - self._jvm.com.logicalclocks.hsfs.metadata.validation.Level.valueOf( - rule.get("level") - ) - ) - .min(rule.get("min", None)) - .max(rule.get("max", None)) - .pattern(rule.get("pattern", None)) - .acceptedType( - self._jvm.com.logicalclocks.hsfs.metadata.validation.AcceptedType.valueOf( - rule.get("accepted_type") - ) - if rule.get("accepted_type") is not None - else None - ) - .feature((rule.get("feature", None))) - .legalValues(rule.get("legal_values", None)) - .build() - ) - expectation = ( - self._jvm.com.logicalclocks.hsfs.metadata.Expectation.builder() - .name(expectation.name) - .description(expectation.description) - .features(expectation.features) - .rules(rules) - .build() - ) - expectations_java.append(expectation) - - return self._jvm.com.logicalclocks.hsfs.engine.DataValidationEngine.getInstance().validate( - dataframe._jdf, expectations_java - ) - def validate_with_great_expectations( self, dataframe: TypeVar("pyspark.sql.DataFrame"), # noqa: F821 diff --git a/python/hsfs/expectation.py b/python/hsfs/expectation.py deleted file mode 100644 index b9d48b1f56..0000000000 --- a/python/hsfs/expectation.py +++ /dev/null @@ -1,106 +0,0 @@ -# -# Copyright 2020 Logical Clocks AB -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import json - -import humps -from hsfs import util -from hsfs.core import expectations_engine - - -class Expectation: - """Metadata object representing an feature validation expectation in the Feature Store.""" - - def __init__( - self, - name, - features, - rules, - description=None, - featurestore_id=None, - href=None, - expand=None, - items=None, - count=None, - type=None, - ): - self._name = name - self._features = features - self._rules = rules - self._description = description - self._featurestore_id = featurestore_id - - def save(self): - """Persist the expectation metadata object to the feature store.""" - expectations_engine.ExpectationsEngine(self._featurestore_id).save(self) - - @classmethod - def from_response_json(cls, json_dict): - json_decamelized = humps.decamelize(json_dict) - if "count" in json_decamelized: - if json_decamelized["count"] == 0: - return [] - return [cls(**expectation) for expectation in json_decamelized["items"]] - else: - return cls(**json_decamelized) - - def json(self): - return json.dumps(self, cls=util.FeatureStoreEncoder) - - def to_dict(self): - return { - "name": self._name, - "description": self._description, - "features": self._features, - "rules": self._rules, - } - - @property - def name(self): - """Name of the expectation, unique per feature store (project).""" - return self._name - - @name.setter - def name(self, name): - self._name = name - - @property - def description(self): - """Description of the expectation.""" - return self._description - - @description.setter - def description(self, description): - self._description = description - - @property - def features(self): - """Optional list of features this expectation is applied to. If no features are provided, the expectation - will be applied to all the feature group features.""" - return self._features - - @features.setter - def features(self, features): - self._features = features - - @property - def rules(self): - """List of rules applied to the features of the expectation.""" - return self._rules - - @rules.setter - def rules(self, rules): - self._rules = rules diff --git a/python/hsfs/expectation_result.py b/python/hsfs/expectation_result.py deleted file mode 100644 index e42ba27259..0000000000 --- a/python/hsfs/expectation_result.py +++ /dev/null @@ -1,86 +0,0 @@ -# -# Copyright 2020 Logical Clocks AB -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import humps -import json - -from hsfs import util - - -class ExpectationResult: - """Metadata object representing the expectation results of the data into a Feature Group.""" - - def __init__( - self, - expectation, - results, - status=None, - href=None, - expand=None, - items=None, - count=None, - type=None, - ): - self._status = status - self._expectation = expectation - self._results = results - - @classmethod - def from_response_json(cls, json_dict): - json_decamelized = humps.decamelize(json_dict) - if json_decamelized["count"] == 0: - return [] - return [ - cls(**expectation_result) - for expectation_result in json_decamelized["items"] - ] - - def json(self): - return json.dumps(self, cls=util.FeatureStoreEncoder) - - def to_dict(self): - return { - "status": self._status, - "expectation": self._expectation, - "results": self._results, - } - - @property - def status(self): - """Status of the expectation after feature ingestion, one of "NONE", "SUCCESS", "WARNING", "FAILURE".""" - return self._status - - @status.setter - def status(self, status): - self._status = status - - @property - def expectation(self): - """The expectation this result refers to.""" - return self._expectation - - @expectation.setter - def expectation(self, expectation): - self._expectation = expectation - - @property - def results(self): - return self._results - - @results.setter - def results(self, results): - """List of validation results, that is results for all feature-rule pairs.""" - self._results = results diff --git a/python/hsfs/feature_group.py b/python/hsfs/feature_group.py index b1758f25aa..ac765a131d 100644 --- a/python/hsfs/feature_group.py +++ b/python/hsfs/feature_group.py @@ -32,9 +32,7 @@ expectation_suite_engine, validation_report_engine, code_engine, - data_validation_engine, external_feature_group_engine, - expectations_api, ) from hsfs.core.deltastreamer_jobconf import DeltaStreamerJobConf @@ -46,19 +44,12 @@ class FeatureGroupBase: - def __init__(self, featurestore_id, validation_type, location): - self._validation_type = validation_type.upper() + def __init__(self, featurestore_id, location): self._location = location self._statistics_engine = statistics_engine.StatisticsEngine( featurestore_id, self.ENTITY_TYPE ) self._code_engine = code_engine.CodeEngine(featurestore_id, self.ENTITY_TYPE) - self._expectations_api = expectations_api.ExpectationsApi( - featurestore_id, "featuregroups" - ) - self._data_validation_engine = data_validation_engine.DataValidationEngine( - featurestore_id, self.ENTITY_TYPE - ) self._great_expectation_engine = ( great_expectation_engine.GreatExpectationEngine(featurestore_id) ) @@ -384,73 +375,149 @@ def append_features(self, features: Union[feature.Feature, List[feature.Feature] self._feature_group_engine.append_features(self, new_features) return self - def attach_expectation(self, expectation): - """Attach a feature group expectation. If feature group validation is not already enabled, it will be enabled - and set to the stricter setting. + def get_expectation_suite(self, ge_type: bool = True): + """Return the expectation suite attached to the feature group if it exists. # Arguments - name: The expectation name. + ge_type: If `True` returns a native Great Expectation type, Hopsworks + custom type otherwise. Conversion can be performed via the `to_ge_type()` + method on hopsworks type. Defaults to `True`. # Returns - `Expectation`. The expectation metadata object. + `ExpectationSuite`. The expectation suite attached to the feature group. + # Raises + `RestAPIException`. """ - # Turn on validation for this FG and set stricter setting - if self._validation_type == "NONE": - self._validation_type = "STRICT" - - return self._expectations_api.attach(self, expectation.name) + self._expectation_suite = self._expectation_suite_engine.get(self) + if self._expectation_suite is not None and ge_type is True: + return self._expectation_suite.to_ge_type() + else: + return self._expectation_suite - def detach_expectation(self, expectation): - """Remove an expectation from a feature group. + def save_expectation_suite( + self, + expectation_suite: Union[ExpectationSuite, ge.core.ExpectationSuite], + run_validation=True, + validation_ingestion_policy="ALWAYS", + ): + """Attach an expectation suite to a feature group and saves it for future use. If an expectation + suite is already attached, it is replaced. Note that the provided expectation suite is modified + inplace to include expectationId fields. # Arguments - name: The expectation name. + expectation_suite: The expectation suite to attach to the featuregroup. + run_validation: Set whether the expectation_suite will run on ingestion + validation_ingestion_policy: Set the policy for ingestion to the featuregroup. + - "STRICT" only allows DataFrame passing validation to be inserted into featuregroup. + - "ALWAYS" always insert the DataFrame to the featuregroup, irrespective of overall validation result. - # Returns - `Expectation`. The expectation metadata object. + # Raises + `RestAPIException`. + """ + if isinstance(expectation_suite, ge.core.ExpectationSuite): + tmp_expectation_suite = ExpectationSuite.from_ge_type( + ge_expectation_suite=expectation_suite, + run_validation=run_validation, + validation_ingestion_policy=validation_ingestion_policy, + ) + elif isinstance(expectation_suite, ExpectationSuite): + tmp_expectation_suite = expectation_suite + else: + raise TypeError( + "The provided expectation suite type `{}` is not supported. Use Great Expectation `ExpectationSuite` or HSFS' own `ExpectationSuite` object.".format( + type(expectation_suite) + ) + ) + + self._expectation_suite = self._expectation_suite_engine.save( + self, tmp_expectation_suite + ) + + expectation_suite = self._expectation_suite.to_ge_type() + + def delete_expectation_suite(self): + """Delete the expectation suite attached to the featuregroup. + # Raises + `RestAPIException`. """ - return self._expectations_api.detach(self, expectation.name) + self._expectation_suite_engine.delete(self) + self._expectation_suite = None - def get_expectations(self): - """Get all feature group expectations. + def get_latest_validation_report(self, ge_type: bool = True): + """Return the latest validation report attached to the feature group if it exists. # Arguments - name: The expectation name. + ge_type: If `True` returns a native Great Expectation type, Hopsworks + custom type otherwise. Conversion can be performed via the `to_ge_type()` + method on hopsworks type. Defaults to `True`. # Returns - `Expectation`. A list of expectation metadata objects. + `ValidationReport`. The latest validation report attached to the feature group. + # Raises + `RestAPIException`. """ - return self._expectations_api.get(feature_group=self) + if ge_type is True: + return self._validation_report_engine.get_last(self).to_ge_type() + else: + return self._validation_report_engine.get_last(self) - def get_expectation(self, name: str): - """Get attached expectation by name for this feature group. Name is unique across a feature store. + def get_all_validation_reports(self, ge_type: bool = True): + """Return the latest validation report attached to the feature group if it exists. # Arguments - name: The expectation name. + ge_type: If `True` returns a native Great Expectation type, Hopsworks + custom type otherwise. Conversion can be performed via the `to_ge_type()` + method on hopsworks type. Defaults to `True`. # Returns - `Expectation`. The expectation metadata object. + `ValidationReport`. The latest validation report attached to the feature group. + # Raises + `RestAPIException`. """ - return self._expectations_api.get(name, self) + if ge_type is True: + return [ + report.to_ge_type() + for report in self._validation_report_engine.get_all(self) + ] + return self._validation_report_engine.get_all(self) - def get_validations(self, validation_time=None, commit_time=None): - """Get feature group data validation results based on the attached expectations. + def save_validation_report( + self, + validation_report: Union[ + dict, + ValidationReport, + ge.core.expectation_validation_result.ExpectationSuiteValidationResult, + ], + ge_type: bool = True, + ): + """Save validation report to hopsworks platform along previous reports of the same featuregroup. # Arguments - validation_time: The data validation time, when the data validation started. - commit_time: The commit time of a time travel enabled feature group. - - # Returns - `FeatureGroupValidation`. The feature group validation metadata object. + validation_report: The validation report to attach to the featuregroup. + ge_type: If `True` returns a native Great Expectation type, Hopsworks + custom type otherwise. Conversion can be performed via the `to_ge_type()` + method on hopsworks type. Defaults to `True`. + # Raises + `RestAPIException`. """ - return self._data_validation_engine.get_validations( - self, validation_time, commit_time - ) + if isinstance( + validation_report, + ge.core.expectation_validation_result.ExpectationSuiteValidationResult, + ): + report = ValidationReport(**validation_report.to_json_dict()) + elif isinstance(validation_report, dict): + report = ValidationReport(**validation_report) + elif isinstance(validation_report, ValidationReport): + report = validation_report + + if ge_type: + return self._validation_report_engine.save(self, report).to_ge_type() + return self._validation_report_engine.save(self, report) def __getattr__(self, name): try: @@ -567,6 +634,31 @@ def event_time(self, feature_name): def location(self): return self._location + @property + def expectation_suite(self): + """Expectation Suite configuration object defining the settings for + data validation of the feature group.""" + return self._expectation_suite + + @expectation_suite.setter + def expectation_suite(self, expectation_suite): + if isinstance(expectation_suite, ExpectationSuite): + self._expectation_suite = expectation_suite + elif isinstance(expectation_suite, ge.core.expectation_suite.ExpectationSuite): + self._expectation_suite = ExpectationSuite( + **expectation_suite.to_json_dict() + ) + elif isinstance(expectation_suite, dict): + self._expectation_suite = ExpectationSuite(**expectation_suite) + elif expectation_suite is None: + self._expectation_suite = expectation_suite + else: + raise TypeError( + "The argument `expectation_suite` has to be `None` of type `ExpectationSuite` or `dict`, but is of type: `{}`".format( + type(expectation_suite) + ) + ) + class FeatureGroup(FeatureGroupBase): CACHED_FEATURE_GROUP = "CACHED_FEATURE_GROUP" @@ -591,14 +683,12 @@ def __init__( online_enabled=False, time_travel_format=None, statistics_config=None, - validation_type="NONE", - expectations=None, online_topic_name=None, event_time=None, stream=False, expectation_suite=None, ): - super().__init__(featurestore_id, validation_type, location) + super().__init__(featurestore_id, location) self._feature_store_id = featurestore_id self._feature_store_name = featurestore_name @@ -666,13 +756,6 @@ def __init__( self.statistics_config = statistics_config self.expectation_suite = expectation_suite - if expectations is not None: - self._expectations_names = [ - expectation.name for expectation in expectations - ] - else: - self._expectations_names = [] - self._feature_group_engine = feature_group_engine.FeatureGroupEngine( featurestore_id ) @@ -1143,7 +1226,6 @@ def validate( dataframe: Optional[ Union[pd.DataFrame, TypeVar("pyspark.sql.DataFrame")] # noqa: F821 ] = None, - log_activity: Optional[bool] = False, save_report: Optional[bool] = False, validation_options: Optional[Dict[Any, Any]] = {}, ): @@ -1154,8 +1236,6 @@ def validate( # Arguments dataframe: The PySpark dataframe to run the data validation expectations against. - log_activity: Boolean to indicate whether to persist validation results along - with the feature group. Defaults to `False`. expectation_suite: Optionally provide an Expectation Suite to override the one that is possibly attached to the feature group. This is useful for testing new Expectation suites. When an extra suite is provided, the results @@ -1174,159 +1254,11 @@ def validate( # Activity is logged only if a the validation concerts the feature group and not a specific dataframe if dataframe is None: dataframe = self.read() - log_activity = True - return ( - self._data_validation_engine.validate(self, dataframe, log_activity), - self._great_expectation_engine.validate( - self, dataframe, save_report, validation_options - ), + return self._great_expectation_engine.validate( + self, dataframe, save_report, validation_options ) - def get_expectation_suite(self, ge_type: bool = True): - """Return the expectation suite attached to the feature group if it exists. - - # Arguments - ge_type: If `True` returns a native Great Expectation type, Hopsworks - custom type otherwise. Conversion can be performed via the `to_ge_type()` - method on hopsworks type. Defaults to `True`. - - # Returns - `ExpectationSuite`. The expectation suite attached to the feature group. - - # Raises - `RestAPIException`. - """ - self._expectation_suite = self._expectation_suite_engine.get(self) - if self._expectation_suite is not None and ge_type is True: - return self._expectation_suite.to_ge_type() - else: - return self._expectation_suite - - def save_expectation_suite( - self, - expectation_suite: Union[ExpectationSuite, ge.core.ExpectationSuite], - run_validation=True, - validation_ingestion_policy="ALWAYS", - ): - """Attach an expectation suite to a feature group and saves it for future use. If an expectation - suite is already attached, it is replaced. Note that the provided expectation suite is modified - inplace to include expectationId fields. - - # Arguments - expectation_suite: The expectation suite to attach to the featuregroup. - run_validation: Set whether the expectation_suite will run on ingestion - validation_ingestion_policy: Set the policy for ingestion to the featuregroup. - - "STRICT" only allows DataFrame passing validation to be inserted into featuregroup. - - "ALWAYS" always insert the DataFrame to the featuregroup, irrespective of overall validation result. - - # Raises - `RestAPIException`. - """ - if isinstance(expectation_suite, ge.core.ExpectationSuite): - tmp_expectation_suite = ExpectationSuite.from_ge_type( - ge_expectation_suite=expectation_suite, - run_validation=run_validation, - validation_ingestion_policy=validation_ingestion_policy, - ) - elif isinstance(expectation_suite, ExpectationSuite): - tmp_expectation_suite = expectation_suite - else: - raise TypeError( - "The provided expectation suite type `{}` is not supported. Use Great Expectation `ExpectationSuite` or HSFS' own `ExpectationSuite` object.".format( - type(expectation_suite) - ) - ) - - self._expectation_suite = self._expectation_suite_engine.save( - self, tmp_expectation_suite - ) - - expectation_suite = self._expectation_suite.to_ge_type() - - def delete_expectation_suite(self): - """Delete the expectation suite attached to the featuregroup. - - # Raises - `RestAPIException`. - """ - self._expectation_suite_engine.delete(self) - self._expectation_suite = None - - def get_latest_validation_report(self, ge_type: bool = True): - """Return the latest validation report attached to the feature group if it exists. - - # Arguments - ge_type: If `True` returns a native Great Expectation type, Hopsworks - custom type otherwise. Conversion can be performed via the `to_ge_type()` - method on hopsworks type. Defaults to `True`. - - # Returns - `ValidationReport`. The latest validation report attached to the feature group. - - # Raises - `RestAPIException`. - """ - if ge_type is True: - return self._validation_report_engine.get_last(self).to_ge_type() - else: - return self._validation_report_engine.get_last(self) - - def get_all_validation_reports(self, ge_type: bool = True): - """Return the latest validation report attached to the feature group if it exists. - - # Arguments - ge_type: If `True` returns a native Great Expectation type, Hopsworks - custom type otherwise. Conversion can be performed via the `to_ge_type()` - method on hopsworks type. Defaults to `True`. - - # Returns - `ValidationReport`. The latest validation report attached to the feature group. - - # Raises - `RestAPIException`. - """ - if ge_type is True: - return [ - report.to_ge_type() - for report in self._validation_report_engine.get_all(self) - ] - return self._validation_report_engine.get_all(self) - - def save_validation_report( - self, - validation_report: Union[ - dict, - ValidationReport, - ge.core.expectation_validation_result.ExpectationSuiteValidationResult, - ], - ge_type: bool = True, - ): - """Save validation report to hopsworks platform along previous reports of the same featuregroup. - - # Arguments - validation_report: The validation report to attach to the featuregroup. - ge_type: If `True` returns a native Great Expectation type, Hopsworks - custom type otherwise. Conversion can be performed via the `to_ge_type()` - method on hopsworks type. Defaults to `True`. - - # Raises - `RestAPIException`. - """ - if isinstance( - validation_report, - ge.core.expectation_validation_result.ExpectationSuiteValidationResult, - ): - report = ValidationReport(**validation_report.to_json_dict()) - elif isinstance(validation_report, dict): - report = ValidationReport(**validation_report) - elif isinstance(validation_report, ValidationReport): - report = validation_report - - if ge_type: - return self._validation_report_engine.save(self, report).to_ge_type() - return self._validation_report_engine.save(self, report) - def compute_statistics(self, wallclock_time: Optional[str] = None): """Recompute the statistics for the feature group and save them to the feature store. @@ -1417,8 +1349,6 @@ def to_dict(self): if not self._stream else "streamFeatureGroupDTO", "statisticsConfig": self._statistics_config, - "validationType": self._validation_type, - "expectationsNames": self._expectations_names, "eventTime": self._event_time, "expectationSuite": self._expectation_suite, } @@ -1530,27 +1460,11 @@ def avro_schema(self): self._avro_schema = self._feature_group_engine.get_avro_schema(self) return self._avro_schema - @property - def validation_type(self): - """Validation type, one of "STRICT", "WARNING", "ALL", "NONE".""" - return self._validation_type - - @property - def expectations_names(self): - """The names of expectations attached to this feature group.""" - return self._expectations_names - @property def stream(self): """Whether to enable real time stream writing capabilities.""" return self._stream - @property - def expectation_suite(self): - """Expectation Suite configuration object defining the settings for - data validation of the feature group.""" - return self._expectation_suite - @version.setter def version(self, version): self._version = version @@ -1579,41 +1493,10 @@ def hudi_precombine_key(self, hudi_precombine_key): def online_enabled(self, new_online_enabled): self._online_enabled = new_online_enabled - @validation_type.setter - def validation_type(self, new_validation_type): - if new_validation_type is None: - self._validation_type = "NONE" - else: - self._validation_type = new_validation_type.upper() - self._feature_group_engine.update_validation_type(self) - - @expectations_names.setter - def expectations_names(self, new_expectations_names): - self._expectations_names = new_expectations_names - @stream.setter def stream(self, stream): self._stream = stream - @expectation_suite.setter - def expectation_suite(self, expectation_suite): - if isinstance(expectation_suite, ExpectationSuite): - self._expectation_suite = expectation_suite - elif isinstance(expectation_suite, ge.core.expectation_suite.ExpectationSuite): - self._expectation_suite = ExpectationSuite( - **expectation_suite.to_json_dict() - ) - elif isinstance(expectation_suite, dict): - self._expectation_suite = ExpectationSuite(**expectation_suite) - elif expectation_suite is None: - self._expectation_suite = expectation_suite - else: - raise TypeError( - "The argument `expectation_suite` has to be `None` of type `ExpectationSuite` or `dict`, but is of type: `{}`".format( - type(expectation_suite) - ) - ) - class ExternalFeatureGroup(FeatureGroupBase): EXTERNAL_FEATURE_GROUP = "ON_DEMAND_FEATURE_GROUP" @@ -1639,11 +1522,9 @@ def __init__( location=None, statistics_config=None, event_time=None, - validation_type="NONE", - expectations=None, expectation_suite=None, ): - super().__init__(featurestore_id, validation_type, location) + super().__init__(featurestore_id, location) self._feature_store_id = featurestore_id self._feature_store_name = featurestore_name @@ -1702,18 +1583,11 @@ def __init__( else: self._storage_connector = storage_connector - if expectations is not None: - self._expectations_names = [ - expectation.name for expectation in expectations - ] - else: - self._expectations_names = [] + self.expectation_suite = expectation_suite def save(self): self._feature_group_engine.save(self) self._code_engine.save_code(self) - if self._validation_type != "NONE": - self.validate() if self.statistics_config.enabled: self._statistics_engine.compute_statistics(self, self.read()) @@ -1738,14 +1612,20 @@ def show(self, n): ) return self.select_all().show(n) - def validate(self): # noqa: F821 + def validate( + self, + save_report: Optional[bool] = False, + validation_options: Optional[Dict[Any, Any]] = {}, + ): """Run validation based on the attached expectations # Returns `FeatureGroupValidation`. The feature group validation metadata object. """ - return self._data_validation_engine.validate(self, self.read(), True) + return self._great_expectation_engine.validate( + self, self.read(), save_report, validation_options + ) @classmethod def from_response_json(cls, json_dict): @@ -1787,8 +1667,6 @@ def to_dict(self): "type": "onDemandFeaturegroupDTO", "statisticsConfig": self._statistics_config, "eventTime": self._event_time, - "validationType": self._validation_type, - "expectationsNames": self._expectations_names, "expectationSuite": self._expectation_suite, } @@ -1840,16 +1718,6 @@ def creator(self): def created(self): return self._created - @property - def validation_type(self): - """Validation type, one of "STRICT", "WARNING", "ALL", "NONE".""" - return self._validation_type - - @property - def expectations_names(self): - """The names of expectations attached to this feature group.""" - return self._expectations_names - @version.setter def version(self, version): self._version = version @@ -1861,15 +1729,3 @@ def description(self, new_description): @features.setter def features(self, new_features): self._features = new_features - - @validation_type.setter - def validation_type(self, new_validation_type): - if new_validation_type is None: - self._validation_type = "NONE" - else: - self._validation_type = new_validation_type.upper() - self._feature_group_engine.update_validation_type(self) - - @expectations_names.setter - def expectations_names(self, new_expectations_names): - self._expectations_names = new_expectations_names diff --git a/python/hsfs/feature_group_validation.py b/python/hsfs/feature_group_validation.py deleted file mode 100644 index 188ee163ca..0000000000 --- a/python/hsfs/feature_group_validation.py +++ /dev/null @@ -1,137 +0,0 @@ -# -# Copyright 2020 Logical Clocks AB -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import humps -import json - -from hsfs import util - - -class FeatureGroupValidation: - """Metadata object representing the validation result of a feature group. - - Refer to expectation_result for individual feature group expectation results. - """ - - def __init__( - self, - validation_time, - expectation_results, - validation_id=None, - status=None, - validation_path=None, - commit_time=None, - log_activity=True, - href=None, - expand=None, - items=None, - count=None, - type=None, - ): - self._validation_id = validation_id - self._validation_time = validation_time - self._status = status - self._expectation_results = expectation_results - self._validation_path = validation_path - self._commit_time = commit_time - self._log_activity = log_activity - - @classmethod - def from_response_json(cls, json_dict): - json_decamelized = humps.decamelize(json_dict) - if "count" in json_decamelized: - if json_decamelized["count"] == 0: - return [] - return [ - cls(**feature_group_validation) - for feature_group_validation in json_decamelized["items"] - ] - else: - return cls(**json_decamelized) - - def json(self): - return json.dumps(self, cls=util.FeatureStoreEncoder) - - def to_dict(self): - return { - "validationId": self._validation_id, - "validationTime": self._validation_time, - "expectationResults": self._expectation_results, - "logActivity": self._log_activity, - } - - @property - def validation_id(self): - """Unique id of the feature group validation.""" - return self._validation_id - - @validation_id.setter - def validation_id(self, validation_id): - self._validation_id = validation_id - - @property - def validation_time(self): - """Timestamp in seconds of when feature validation started.""" - return self._validation_time - - @validation_time.setter - def validation_time(self, validation_time): - self._validation_time = validation_time - - @property - def status(self): - """Status of the expectation after feature ingestion, one of "NONE", "SUCCESS", "WARNING", "FAILURE".""" - return self._status - - @status.setter - def status(self, status): - self._status = status - - @property - def expectation_results(self): - """List of expectation results.""" - return self._expectation_results - - @expectation_results.setter - def expectation_results(self, expectation_results): - self._expectation_results = expectation_results - - @property - def validation_path(self): - """Path in the Hopsworks datasets where the feature group validation results are persisted.""" - return self._validation_path - - @validation_path.setter - def validation_path(self, validation_path): - self._validation_path = validation_path - - @property - def commit_time(self): - """Timestamp in seconds of when the feature dataframe was committed (time-travel FGs only).""" - return self._commit_time - - @commit_time.setter - def commit_time(self, commit_time): - self._commit_time = commit_time - - @property - def log_activity(self): - """Whether to log the validation as a feature group activity. Default to True. Used internally in hsfs""" - return self._log_activity - - @log_activity.setter - def log_activity(self, log_activity): - self._log_activity = log_activity diff --git a/python/hsfs/feature_store.py b/python/hsfs/feature_store.py index fb8cdde526..6358fb0953 100644 --- a/python/hsfs/feature_store.py +++ b/python/hsfs/feature_store.py @@ -33,8 +33,6 @@ feature, util, storage_connector, - expectation, - rule, expectation_suite, feature_view, ) @@ -42,7 +40,6 @@ feature_group_api, storage_connector_api, training_dataset_api, - expectations_api, feature_group_engine, feature_view_engine, ) @@ -98,7 +95,6 @@ def __init__( self._id ) self._training_dataset_api = training_dataset_api.TrainingDatasetApi(self._id) - self._expectations_api = expectations_api.ExpectationsApi(self._id) self._feature_group_engine = feature_group_engine.FeatureGroupEngine(self._id) @@ -372,39 +368,6 @@ def get_online_storage_connector(self): """ return self._storage_connector_api.get_online_connector() - def get_expectation(self, name: str): - """Get an expectation entity from the feature store. - - Getting an expectation from the Feature Store means getting its metadata handle - so you can subsequently add features and/or rules and save it which will overwrite the previous instance. - - # Arguments - name: Name of the training dataset to get. - - # Returns - `Expectation`: The expectation metadata object. - - # Raises - `RestAPIError`: If unable to retrieve the expectation from the feature store. - """ - - return self._expectations_api.get(name) - - def get_expectations(self): - """Get all expectation entities from the feature store. - - Getting expectations from the Feature Store means getting their metadata handles - so you can subsequently add features and/or rules and save it which will overwrite the previous instance. - - # Returns - `Expectation`: The expectation metadata object. - - # Raises - `RestAPIError`: If unable to retrieve the expectations from the feature store. - """ - - return self._expectations_api.get() - def create_feature_group( self, name: str, @@ -417,8 +380,6 @@ def create_feature_group( hudi_precombine_key: Optional[str] = None, features: Optional[List[feature.Feature]] = [], statistics_config: Optional[Union[StatisticsConfig, bool, dict]] = None, - validation_type: Optional[str] = "NONE", - expectations: Optional[List[expectation.Expectation]] = [], event_time: Optional[str] = None, stream: Optional[bool] = False, expectation_suite: Optional[ @@ -467,12 +428,6 @@ def create_feature_group( The values should be booleans indicating the setting. To fully turn off statistics computation pass `statistics_config=False`. Defaults to `None` and will compute only descriptive statistics. - validation_type: Optionally, set the validation type to one of "NONE", "STRICT", - "WARNING", "ALL". Determines the mode in which data validation is applied on - ingested or already existing feature group data. - expectations: Optionally, a list of expectations to be attached to the feature group. - The expectations list contains Expectation metadata objects which can be retrieved with - the `get_expectation()` and `get_expectations()` functions. event_time: Optionally, provide the name of the feature containing the event time for the features in this feature group. If event_time is set the feature group can be used for point-in-time joins. Defaults to `None`. @@ -499,8 +454,6 @@ def create_feature_group( featurestore_name=self._name, features=features, statistics_config=statistics_config, - validation_type=validation_type, - expectations=expectations, event_time=event_time, stream=stream, expectation_suite=expectation_suite, @@ -518,8 +471,6 @@ def get_or_create_feature_group( hudi_precombine_key: Optional[str] = None, features: Optional[List[feature.Feature]] = [], statistics_config: Optional[Union[StatisticsConfig, bool, dict]] = None, - validation_type: Optional[str] = "NONE", - expectations: Optional[List[expectation.Expectation]] = [], expectation_suite: Optional[ Union[expectation_suite.ExpectationSuite, ge.core.ExpectationSuite] ] = None, @@ -566,12 +517,6 @@ def get_or_create_feature_group( The values should be booleans indicating the setting. To fully turn off statistics computation pass `statistics_config=False`. Defaults to `None` and will compute only descriptive statistics. - validation_type: Optionally, set the validation type to one of "NONE", "STRICT", - "WARNING", "ALL". Determines the mode in which data validation is applied on - ingested or already existing feature group data. - expectations: Optionally, a list of expectations to be attached to the feature group. - The expectations list contains Expectation metadata objects which can be retrieved with - the `get_expectation()` and `get_expectations()` functions. expectation_suite: Optionally, attach an expectation suite to the feature group which dataframes should be validated against upon insertion. Defaults to `None`. @@ -582,7 +527,6 @@ def get_or_create_feature_group( Stream enabled Feature Groups have unified single API for writing streaming features transparently to both online and offline store. - # Returns `FeatureGroup`. The feature group metadata object. """ @@ -609,8 +553,6 @@ def get_or_create_feature_group( featurestore_name=self._name, features=features, statistics_config=statistics_config, - validation_type=validation_type, - expectations=expectations, event_time=event_time, stream=stream, expectation_suite=expectation_suite, @@ -632,8 +574,9 @@ def create_on_demand_feature_group( features: Optional[List[feature.Feature]] = [], statistics_config: Optional[Union[StatisticsConfig, bool, dict]] = None, event_time: Optional[str] = None, - validation_type: Optional[str] = "NONE", - expectations: Optional[List[expectation.Expectation]] = [], + expectation_suite: Optional[ + Union[expectation_suite.ExpectationSuite, ge.core.ExpectationSuite] + ] = None, ): """Create a external feature group metadata object. @@ -681,12 +624,9 @@ def create_on_demand_feature_group( event_time: Optionally, provide the name of the feature containing the event time for the features in this feature group. If event_time is set the feature group can be used for point-in-time joins. Defaults to `None`. - validation_type: Optionally, set the validation type to one of "NONE", "STRICT", - "WARNING", "ALL". Determines the mode in which data validation is applied on - ingested or already existing feature group data. - expectations: Optionally, a list of expectations to be attached to the feature group. - The expectations list contains Expectation metadata objects which can be retrieved with - the `get_expectation()` and `get_expectations()` functions. + expectation_suite: Optionally, attach an expectation suite to the feature + group which dataframes should be validated against upon insertion. + Defaults to `None`. # Returns `ExternalFeatureGroup`. The external feature group metadata object. @@ -706,8 +646,7 @@ def create_on_demand_feature_group( features=features, statistics_config=statistics_config, event_time=event_time, - validation_type=validation_type, - expectations=expectations, + expectation_suite=expectation_suite, ) def create_external_feature_group( @@ -724,8 +663,9 @@ def create_external_feature_group( features: Optional[List[feature.Feature]] = [], statistics_config: Optional[Union[StatisticsConfig, bool, dict]] = None, event_time: Optional[str] = None, - validation_type: Optional[str] = "NONE", - expectations: Optional[List[expectation.Expectation]] = [], + expectation_suite: Optional[ + Union[expectation_suite.ExpectationSuite, ge.core.ExpectationSuite] + ] = None, ): """Create a external feature group metadata object. @@ -770,12 +710,9 @@ def create_external_feature_group( event_time: Optionally, provide the name of the feature containing the event time for the features in this feature group. If event_time is set the feature group can be used for point-in-time joins. Defaults to `None`. - validation_type: Optionally, set the validation type to one of "NONE", "STRICT", - "WARNING", "ALL". Determines the mode in which data validation is applied on - ingested or already existing feature group data. - expectations: Optionally, a list of expectations to be attached to the feature group. - The expectations list contains Expectation metadata objects which can be retrieved with - the `get_expectation()` and `get_expectations()` functions. + expectation_suite: Optionally, attach an expectation suite to the feature + group which dataframes should be validated against upon insertion. + Defaults to `None`. # Returns `ExternalFeatureGroup`. The external feature group metadata object. @@ -795,8 +732,7 @@ def create_external_feature_group( features=features, statistics_config=statistics_config, event_time=event_time, - validation_type=validation_type, - expectations=expectations, + expectation_suite=expectation_suite, ) def create_training_dataset( @@ -904,49 +840,6 @@ def create_training_dataset( train_split=train_split, ) - def create_expectation( - self, - name: str, - description: Optional[str] = "", - features: Optional[List[str]] = [], - rules: Optional[List[rule.Rule]] = [], - ): - """Create an expectation metadata object. - - !!! note "Lazy" - This method is lazy and does not persist the expectation in the - feature store on its own. To materialize the expectation and save - call the `save()` method of the expectation metadata object. - - # Arguments - name: Name of the expectation to create. - description: A string describing the expectation that can describe its business logic and applications - within the feature store. - features: The features this expectation is applied on. - rules: The validation rules this expectation will apply to the features. - - # Returns: - `Expectation`: The expectation metadata object. - """ - return expectation.Expectation( - name=name, - description=description, - features=features, - rules=rules, - featurestore_id=self._id, - ) - - def delete_expectation( - self, - name: str, - ): - """Delete an expectation from the feature store. - - # Arguments - name: Name of the training dataset to create. - """ - return self._expectations_api.delete(name) - def create_transformation_function( self, transformation_function: callable, diff --git a/python/hsfs/rule.py b/python/hsfs/rule.py deleted file mode 100644 index 8754aae280..0000000000 --- a/python/hsfs/rule.py +++ /dev/null @@ -1,147 +0,0 @@ -# -# Copyright 2020 Logical Clocks AB -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import humps -import json - -from hsfs import util - - -class Rule: - """Metadata object representing the validation rule that is used by feature group expectations. - - This class is made for hsfs internal use only. - """ - - def __init__( - self, - name: str, - level, - min=None, - max=None, - pattern=None, - accepted_type=None, - feature=None, - legal_values=None, - href=None, - expand=None, - items=None, - count=None, - type=None, - ): - self.name = name - self._level = level - self._min = min - self._max = max - self._pattern = pattern - self._accepted_type = accepted_type - self._feature = feature - self._legal_values = legal_values - - @classmethod - def from_response_json(cls, json_dict): - json_decamelized = humps.decamelize(json_dict) - if json_decamelized["count"] == 0: - return [] - return [cls(**rule) for rule in json_decamelized["items"]] - - def json(self): - return json.dumps(self, cls=util.FeatureStoreEncoder) - - def to_dict(self): - return { - "name": self._name, - "level": self._level, - "min": self._min, - "max": self._max, - "pattern": self._pattern, - "acceptedType": self._accepted_type, - "feature": self._feature, - "legalValues": self._legal_values, - } - - @property - def name(self): - """Name of the rule as found in rule definitions.""" - return self._name - - @name.setter - def name(self, name): - self._name = name.upper() - - @property - def level(self): - """Severity level of a rule, one of "WARNING" or "ERROR".""" - return self._level - - @level.setter - def level(self, level): - self._level = level - - @property - def min(self): - """The lower bound of the value range this feature should fall into.""" - return self._min - - @min.setter - def min(self, min): - self._min = min - - @property - def max(self): - """The upper bound of the value range this feature should fall into.""" - return self._max - - @max.setter - def max(self, max): - self._max = max - - @property - def pattern(self): - """Pattern to check for a feature's pattern compliance. Applicable only to the HAS_PATTERN rule.""" - return self._pattern - - @pattern.setter - def pattern(self, pattern): - self._pattern = pattern - - @property - def feature(self): - """Feature to compare the expectation's features to, applied only to Compliance rules.""" - return self._feature - - @feature.setter - def feature(self, feature): - self._feature = feature - - @property - def accepted_type(self): - """Data type accepted for a feature. Applicable only to the HAS_DATATYPE rule. - Accepted types are: Null, Fractional, Integral, Boolean, String""" - return self._accepted_type - - @accepted_type.setter - def accepted_type(self, accepted_type): - self._accepted_type = accepted_type - - @property - def legal_values(self): - """List of legal values a feature should be found int. feature.Applicable only to IS_CONTAINED_IN rule.""" - return self._legal_values - - @legal_values.setter - def legal_values(self, legal_values): - self._legal_values = legal_values diff --git a/python/hsfs/ruledefinition.py b/python/hsfs/ruledefinition.py deleted file mode 100644 index cb4f5c66e5..0000000000 --- a/python/hsfs/ruledefinition.py +++ /dev/null @@ -1,94 +0,0 @@ -# -# Copyright 2020 Logical Clocks AB -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import humps -import json - -from hsfs import util - - -class RuleDefinition: - """Metadata object representing the validation rule that is used by feature group expectations. - - The set of rule definitions, for example "has max", "has avg" is provided by hsfs and cannot be modified. - """ - - def __init__( - self, - name, - accepted_type, - predicate=None, - feature_type=None, - description=None, - href=None, - expand=None, - items=None, - count=None, - type=None, - ): - self._name = name - self._predicate = predicate - self._accepted_type = accepted_type - self._feature_type = feature_type - self._description = description - - @classmethod - def from_response_json(cls, json_dict): - json_decamelized = humps.decamelize(json_dict) - if "count" in json_decamelized: - if json_decamelized["count"] == 0: - return [] - return [ - cls(**ruledefinition) for ruledefinition in json_decamelized["items"] - ] - else: - return cls(**json_decamelized) - - def json(self): - return json.dumps(self, cls=util.FeatureStoreEncoder) - - def to_dict(self): - return { - "name": self._name, - "predicate": self._predicate, - "acceptedType": self._accepted_type, - "featureType": self._feature_type, - "description": self._description, - } - - @property - def name(self): - """Name of the rule definition. Unique across all features stores.""" - return self._name - - @property - def predicate(self): - """Predicate of the rule definition, one of "LEGAL_VALUES", "ACCEPTED_TYPE", "PATTERN".""" - return self._predicate - - @property - def accepted_type(self): - """The type of the feature, one of "Null", "Fractional", "Integral", "Boolean", "String".""" - return self._accepted_type - - @property - def feature_type(self): - """The type of the feature, one of "Numerical", "Categorical".""" - return self._feature_type - - @property - def description(self): - return self._description diff --git a/python/hsfs/validation_result.py b/python/hsfs/validation_result.py deleted file mode 100644 index de1fb3a719..0000000000 --- a/python/hsfs/validation_result.py +++ /dev/null @@ -1,109 +0,0 @@ -# -# Copyright 2020 Logical Clocks AB -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import humps -import json - -from hsfs import util - - -class ValidationResult: - """Metadata object representing the validation result of a single rule of an expectation result of a Feature Group.""" - - def __init__( - self, - status, - message, - value, - features, - rule, - href=None, - expand=None, - items=None, - count=None, - type=None, - ): - self._status = status - self._message = message - self._value = value - self._features = features - self._rule = rule - - @classmethod - def from_response_json(cls, json_dict): - json_decamelized = humps.decamelize(json_dict) - if json_decamelized["count"] == 0: - return [] - return [ - cls(**validation_result) for validation_result in json_decamelized["items"] - ] - - def json(self): - return json.dumps(self, cls=util.FeatureStoreEncoder) - - def to_dict(self): - return { - "status": self._status, - "message": self._message, - "value": self._value, - "features": self._features, - "rule": self._rule, - } - - @property - def status(self): - return self._status - - @status.setter - def status(self, status): - """Status of the expectation after feature ingestion, one of "NONE", "SUCCESS", "WARNING", "FAILURE".""" - self._status = status - - @property - def message(self): - """Message describing the outcome of applying the rule against the feature.""" - return self._message - - @message.setter - def message(self, message): - self._message = message - - @property - def value(self): - """The computed value of the feature according to the rule.""" - return self._value - - @value.setter - def value(self, value): - self._value = value - - @property - def features(self): - """Feature of the validation result on which the rule was applied.""" - return self._features - - @features.setter - def features(self, features): - self._features = features - - @property - def rule(self): - """Feature of the validation result on which the rule was applied.""" - return self._rule - - @rule.setter - def rule(self, rule): - self._rule = rule