Python UDF in Ingestion being used for feature validation #1234
Changes from 22 commits
@@ -0,0 +1,17 @@
#!/usr/bin/env bash

PLATFORM=$1
DESTINATION=$2
PACKAGES=${PACKAGES:-"great-expectations==0.13.2 pyarrow==2.0.0"}

tmp_dir=$(mktemp -d)

# Install the validation dependencies into a temporary directory and pack them
# into an archive the Spark ingestion job can pull in at runtime.
pip3 install -t ${tmp_dir}/libs $PACKAGES

cd $tmp_dir
tar -czf pylibs-ge-$PLATFORM.tar.gz libs/
if [[ $DESTINATION == gs* ]]; then
    gsutil cp pylibs-ge-$PLATFORM.tar.gz $DESTINATION
else
    mv pylibs-ge-$PLATFORM.tar.gz $DESTINATION
fi
@@ -0,0 +1,13 @@
import io

try:
    from pyspark import cloudpickle
except ImportError:
    raise ImportError("pyspark must be installed to enable validation functionality")


def serialize_udf(fun, return_type) -> bytes:
    """Pickle a (function, return type) pair so it can be shipped to the Spark job."""
    buffer = io.BytesIO()
    command = (fun, return_type)
    cloudpickle.dump(command, buffer)
    return buffer.getvalue()
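For context, a minimal sketch (not part of this PR) of reversing serialize_udf locally, e.g. in a unit test; it assumes nothing beyond the (function, return type) tuple layout used above and that pyspark and pandas are installed:

import pickle

import pandas as pd
from pyspark.sql.types import BooleanType

# Serialize a trivial row-validating function the same way serialize_udf does...
payload = serialize_udf(lambda df: df.notnull().all(axis=1), BooleanType())

# ...and restore the (function, return_type) pair from the raw bytes. Standard
# pickle can read the stream as long as pyspark's cloudpickle is importable.
fun, return_type = pickle.loads(payload)
assert fun(pd.DataFrame({"x": [1, None]})).tolist() == [True, False]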
@@ -0,0 +1,126 @@
import io
import json
from typing import TYPE_CHECKING
from urllib.parse import urlparse

import pandas as pd

from feast.constants import ConfigOptions
from feast.contrib.validation.base import serialize_udf
from feast.staging.storage_client import get_staging_client

try:
    from great_expectations.core import ExpectationSuite
    from great_expectations.dataset import PandasDataset
except ImportError:
    raise ImportError(
        "great_expectations must be installed to enable validation functionality. "
        "Please install feast[validation]"
    )

try:
    from pyspark.sql.types import BooleanType
except ImportError:
    raise ImportError(
        "pyspark must be installed to enable validation functionality. "
        "Please install feast[validation]"
    )


if TYPE_CHECKING:
    from feast import Client, FeatureTable


GE_PACKED_ARCHIVE = "https://storage.googleapis.com/feast-jobs/spark/validation/pylibs-ge-%(platform)s.tar.gz"
Review discussion on GE_PACKED_ARCHIVE:
Review comment: How does this work from a distribution perspective? I worry that if this is not in any way tied to the Feast version, we can't upgrade GE without breaking older versions of the ingestion job. Maybe we don't have to address this now, especially while this is contrib, but sometime down the road we probably need to pin the version of this tarball to a Feast version somehow.
Reply: I know that smells, but since the feature is experimental I think it's OK for now.
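Purely as an illustration of the pinning idea raised in that thread (not part of this PR; the naming scheme and use of pkg_resources are assumptions), the archive URL could embed the installed Feast version:

import pkg_resources

# Hypothetical: tie the packed archive to the running Feast version so that
# upgrading great-expectations cannot break ingestion jobs started by older SDKs.
FEAST_VERSION = pkg_resources.get_distribution("feast").version
GE_PACKED_ARCHIVE = (
    "https://storage.googleapis.com/feast-jobs/spark/validation/"
    "pylibs-ge-%(platform)s-" + FEAST_VERSION + ".tar.gz"
)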
_UNSET = object()


class ValidationUDF:
    def __init__(self, name: str, pickled_code: bytes):
        self.name = name
        self.pickled_code = pickled_code


def create_validation_udf(name: str, expectations: ExpectationSuite) -> ValidationUDF:
    """
    Wraps your expectations into a Spark UDF.

    Expectations should be generated & validated on a training dataset:
    >>> from great_expectations.dataset import PandasDataset
    >>> ds = PandasDataset.from_dataset(your_training_df)
    >>> ds.expect_column_values_to_be_between('column', 0, 100)

    >>> expectations = ds.get_expectation_suite()

    Important: your expectations should pass on the training dataset; only successful checks
    will be converted and stored in the ExpectationSuite.

    Now you can create a UDF that will validate data during ingestion:
    >>> create_validation_udf("myValidation", expectations)

    :param name: name for the resulting ValidationUDF
    :param expectations: collection of expectations gathered on the training dataset
    :return: ValidationUDF with serialized code
    """

    def udf(df: pd.DataFrame) -> pd.Series:
        ds = PandasDataset.from_dataset(df)
        result = ds.validate(expectations, result_format="COMPLETE")
        valid_rows = pd.Series([True] * df.shape[0])

        for check in result.results:
            if check.success:
                continue

            if check.exception_info["raised_exception"]:
                # ToDo: probably we should mark all rows as invalid
                continue

            valid_rows.iloc[check.result["unexpected_index_list"]] = False

        return valid_rows

    pickled_code = serialize_udf(udf, BooleanType())
    return ValidationUDF(name, pickled_code)
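A small end-to-end sketch (not part of this PR) showing what the wrapped function does with rows that violate an expectation; the column name and data are made up:

import pickle

import pandas as pd
from great_expectations.dataset import PandasDataset

# Hypothetical training data; the expectation derived from it passes by construction.
training_df = pd.DataFrame({"trip_distance": [1.2, 3.4, 5.6]})
ds = PandasDataset.from_dataset(training_df)
ds.expect_column_values_to_be_between("trip_distance", 0, 100)
expectations = ds.get_expectation_suite()

validation = create_validation_udf("myValidation", expectations)

# Unpickle the (function, return_type) pair and run the function on new data:
# rows that break the expectation come back as False.
fun, _ = pickle.loads(validation.pickled_code)
print(fun(pd.DataFrame({"trip_distance": [10.0, 500.0]})).tolist())  # [True, False]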
def apply_validation(
    client: "Client",
    feature_table: "FeatureTable",
    udf: ValidationUDF,
    validation_window_secs: int,
    include_py_libs=_UNSET,
):
    """
    Uploads the validation UDF code to the staging location and
    stores the path to the UDF code and the required Python libraries as FeatureTable labels.
    """
    include_py_libs = (
        include_py_libs if include_py_libs is not _UNSET else GE_PACKED_ARCHIVE
    )

    staging_location = client._config.get(ConfigOptions.SPARK_STAGING_LOCATION).rstrip(
        "/"
    )
    staging_scheme = urlparse(staging_location).scheme
    staging_client = get_staging_client(staging_scheme)

    pickled_code_fp = io.BytesIO(udf.pickled_code)
    remote_path = f"{staging_location}/udfs/{udf.name}.pickle"
    staging_client.upload_fileobj(
        pickled_code_fp, f"{udf.name}.pickle", remote_uri=urlparse(remote_path)
    )

    feature_table.labels.update(
        {
            "_validation": json.dumps(
                dict(
                    name=udf.name,
                    pickled_code_path=remote_path,
                    include_archive_path=include_py_libs,
                )
            ),
            "_streaming_trigger_secs": str(validation_window_secs),
        }
    )
    client.apply_feature_table(feature_table)
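And a hedged sketch of wiring it all together from a client script (not part of this PR; the core URL, table name, and window are placeholders, and this assumes a Feast SDK of the same era exposing Client.get_feature_table):

from feast import Client

# Hypothetical connection details and feature table name.
client = Client(core_url="localhost:6565")
feature_table = client.get_feature_table("driver_trips")

validation = create_validation_udf("driver_trips_validation", expectations)

# Store the pickled UDF in the staging location and record its path (plus the
# packed python libs archive) as labels; the streaming ingestion job reads them.
apply_validation(client, feature_table, validation, validation_window_secs=30)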
@@ -332,6 +332,7 @@ def _stream_ingestion_step(
        ],
        "Args": ["spark-submit", "--class", "feast.ingestion.IngestionJob"]
        + jars_args
        + ["--conf", "spark.yarn.isPython=true"]
        + ["--packages", BQ_SPARK_PACKAGE, jar_path]
        + args,
        "Jar": "command-runner.jar",

Review discussion on the spark.yarn.isPython flag:
Review comment: Is there any downside of doing this? Do you know why it isn't always set to true in Spark?
Reply: It's YARN-specific, and from what I found it's mostly used to enable distribution of Python-related artifacts (like pyspark.zip) to the YARN workers. It's being set by
Review comment: Makes sense. Maybe add a comment there for the future generations?
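As a sketch of the comment suggested above (not the actual PR code; the placeholder values stand in for those defined elsewhere in the EMR launcher):

# Placeholders standing in for values defined elsewhere in the launcher.
jars_args = ["--jars", "s3://example-bucket/extra.jar"]
BQ_SPARK_PACKAGE = "com.google.cloud.spark:spark-bigquery-example:0.0.0"
jar_path = "s3://example-bucket/feast-ingestion-spark.jar"

args = (
    ["spark-submit", "--class", "feast.ingestion.IngestionJob"]
    + jars_args
    # spark.yarn.isPython is YARN-specific: it makes YARN ship the Python runtime
    # pieces (e.g. pyspark.zip) to the workers so the pickled validation UDF can run.
    + ["--conf", "spark.yarn.isPython=true"]
    + ["--packages", BQ_SPARK_PACKAGE, jar_path]
)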
Review discussion on the Maven test configuration:
Review comment: Why? (I'm not super familiar with our Java test machinery; does it now skip some tests it didn't skip before?)
Reply: TL;DR: mvn test runs only unit tests; everything else should be skipped. We have two toggles, skipITs and skipUTs, meaning "skip integration tests" and "skip unit tests"; we need them to run those test suites separately. Before, there was no need for skipITs, since in the Maven pipeline the test phase comes before verify (used for integration tests). But in the Spark part we have some additional steps (the generate-test-source phase) that are required only by integration tests and aren't needed by unit tests (see spark/ingestion/pom.xml). To skip those steps I added this flag here.