diff --git a/examples/statistics/Historical Feature Statistics with Feast, TFDV and Facets.ipynb b/examples/statistics/Historical Feature Statistics with Feast, TFDV and Facets.ipynb new file mode 100644 index 0000000000..2ee48e1b1d --- /dev/null +++ b/examples/statistics/Historical Feature Statistics with Feast, TFDV and Facets.ipynb @@ -0,0 +1,706 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Historical Feature Statistics with Feast, TFDV and Facets\n", + "\n", + "This tutorial covers how Feast can be used in conjunction with TFDV and Facets to retrieve statistics about feature datasets. \n", + "\n", + "The notebook showcases how Feast's integration with TFDV allows users to:\n", + "\n", + "1. Define TFX feature schemas and persist these properties in the Feature Store\n", + "2. Validate new data against the defined schema\n", + "3. Validate data already in Feast against the defined schema\n", + "\n", + "**Prerequisites**:\n", + "\n", + "- Feast running with at least 1 BigQuery warehouse store" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "setting project to statistics...\n", + "project already exists, skipping.\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import pytest\n", + "import pytz\n", + "import uuid\n", + "import time\n", + "from datetime import datetime, timedelta\n", + "\n", + "from feast.client import Client\n", + "from feast.entity import Entity\n", + "from feast.feature import Feature\n", + "from feast.feature_set import FeatureSet\n", + "from feast.type_map import ValueType\n", + "from google.protobuf import json_format\n", + "from google.protobuf.duration_pb2 import Duration\n", + "from tensorflow_metadata.proto.v0 import statistics_pb2\n", + "import tensorflow_data_validation as tfdv\n", + "\n", + "PROJECT_NAME = \"statistics\"\n", + "IRIS_DATASET = 
\"http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data\"\n", + "BIGQUERY_STORE_NAME = \"serving\"\n", + "client = Client(core_url=\"localhost:6565\")\n", + "print(f\"setting project to {PROJECT_NAME}...\")\n", + "try:\n", + " client.create_project(PROJECT_NAME)\n", + "except:\n", + " print(\"project already exists, skipping.\")\n", + "client.set_project(PROJECT_NAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this example, we are using the iris dataset. More information about this dataset can be found [here](http://archive.ics.uci.edu/ml/datasets/iris)." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sepal_lengthsepal_widthpetal_lengthpetal_widthclassdatetime
05.13.51.40.2Iris-setosa2020-04-12 07:22:07.065951+00:00
14.93.01.40.2Iris-setosa2020-04-12 07:22:07.065951+00:00
24.73.21.30.2Iris-setosa2020-04-12 07:22:07.065951+00:00
34.63.11.50.2Iris-setosa2020-04-12 07:22:07.065951+00:00
45.03.61.40.2Iris-setosa2020-04-12 07:22:07.065951+00:00
\n", + "
" + ], + "text/plain": [ + " sepal_length sepal_width petal_length petal_width class \\\n", + "0 5.1 3.5 1.4 0.2 Iris-setosa \n", + "1 4.9 3.0 1.4 0.2 Iris-setosa \n", + "2 4.7 3.2 1.3 0.2 Iris-setosa \n", + "3 4.6 3.1 1.5 0.2 Iris-setosa \n", + "4 5.0 3.6 1.4 0.2 Iris-setosa \n", + "\n", + " datetime \n", + "0 2020-04-12 07:22:07.065951+00:00 \n", + "1 2020-04-12 07:22:07.065951+00:00 \n", + "2 2020-04-12 07:22:07.065951+00:00 \n", + "3 2020-04-12 07:22:07.065951+00:00 \n", + "4 2020-04-12 07:22:07.065951+00:00 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iris_feature_names = [\"sepal_length\",\"sepal_width\",\"petal_length\",\"petal_width\"]\n", + "df = pd.read_csv(IRIS_DATASET, names=iris_feature_names + [\"class\"])\n", + "\n", + "# Add datetime to satisfy Feast\n", + "current_datetime = datetime.utcnow().replace(tzinfo=pytz.utc)\n", + "df['datetime'] = current_datetime - timedelta(days=1)\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## TFDV schema as part of the feature set definition\n", + "\n", + "An integral part of TFDV is the feature [schemas](https://github.com/tensorflow/metadata/blob/master/tensorflow_metadata/proto/v0/schema.proto) that describe the expected properties of the data in a dataset, such as:\n", + "- expected feature presence\n", + "- type\n", + "- expected domains of features\n", + "\n", + "These schemas, which can be [manually defined or generated by TFDV](https://www.tensorflow.org/tfx/data_validation/get_started#inferring_a_schema_over_the_data), can be then used to extend the definition of features within the feature set. 
As part of the spec, the schema is persisted within Feast, and is used for both in-flight data validation, as well as offline integration with TFDV.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:root:Ignoring feature datetime of type datetime64[ns, UTC]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Entity class(ValueType.STRING) manually updated (replacing an existing field).\n", + "Feature sepal_length (ValueType.DOUBLE) added from dataframe.\n", + "Feature sepal_width (ValueType.DOUBLE) added from dataframe.\n", + "Feature petal_length (ValueType.DOUBLE) added from dataframe.\n", + "Feature petal_width (ValueType.DOUBLE) added from dataframe.\n", + "\n", + "{\n", + " \"spec\": {\n", + " \"name\": \"iris\",\n", + " \"entities\": [\n", + " {\n", + " \"name\": \"class\",\n", + " \"valueType\": \"STRING\",\n", + " \"presence\": {\n", + " \"minFraction\": 1.0,\n", + " \"minCount\": \"1\"\n", + " },\n", + " \"shape\": {\n", + " \"dim\": [\n", + " {\n", + " \"size\": \"1\"\n", + " }\n", + " ]\n", + " },\n", + " \"stringDomain\": {\n", + " \"name\": \"class\",\n", + " \"value\": [\n", + " \"Iris-setosa\",\n", + " \"Iris-versicolor\",\n", + " \"Iris-virginica\"\n", + " ]\n", + " }\n", + " }\n", + " ],\n", + " \"features\": [\n", + " {\n", + " \"name\": \"sepal_length\",\n", + " \"valueType\": \"DOUBLE\",\n", + " \"presence\": {\n", + " \"minFraction\": 1.0,\n", + " \"minCount\": \"1\"\n", + " },\n", + " \"shape\": {\n", + " \"dim\": [\n", + " {\n", + " \"size\": \"1\"\n", + " }\n", + " ]\n", + " }\n", + " },\n", + " {\n", + " \"name\": \"sepal_width\",\n", + " \"valueType\": \"DOUBLE\",\n", + " \"presence\": {\n", + " \"minFraction\": 1.0,\n", + " \"minCount\": \"1\"\n", + " },\n", + " \"shape\": {\n", + " \"dim\": [\n", + " {\n", + " \"size\": \"1\"\n", + " }\n", + " ]\n", + " }\n", + " },\n", + 
" {\n", + " \"name\": \"petal_length\",\n", + " \"valueType\": \"DOUBLE\",\n", + " \"presence\": {\n", + " \"minFraction\": 1.0,\n", + " \"minCount\": \"1\"\n", + " },\n", + " \"shape\": {\n", + " \"dim\": [\n", + " {\n", + " \"size\": \"1\"\n", + " }\n", + " ]\n", + " }\n", + " },\n", + " {\n", + " \"name\": \"petal_width\",\n", + " \"valueType\": \"DOUBLE\",\n", + " \"presence\": {\n", + " \"minFraction\": 1.0,\n", + " \"minCount\": \"1\"\n", + " },\n", + " \"shape\": {\n", + " \"dim\": [\n", + " {\n", + " \"size\": \"1\"\n", + " }\n", + " ]\n", + " }\n", + " }\n", + " ]\n", + " },\n", + " \"meta\": {}\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/zhiling/.pyenv/versions/3.7.2/envs/feast-0.2-dev/lib/python3.7/site-packages/tensorflow_data_validation/arrow/arrow_util.py:236: FutureWarning: Calling .data on ChunkedArray is provided for compatibility after Column was removed, simply drop this attribute\n", + " types.FeaturePath([column_name]), column.data.chunk(0), weights):\n" + ] + } + ], + "source": [ + "# Infer a schema over the iris dataset. These values can be tweaked as necessary.\n", + "stats = tfdv.generate_statistics_from_dataframe(df)\n", + "schema = tfdv.infer_schema(statistics=stats)\n", + "\n", + "# Create a new FeatureSet or retrieve an existing FeatureSet in Feast\n", + "feature_set = FeatureSet(name=\"iris\")\n", + "feature_set.infer_fields_from_df(df[['datetime'] + iris_feature_names], \n", + " entities=[Entity(name=\"class\", dtype=ValueType.STRING)])\n", + "\n", + "# Update the entities and features with constraints defined in the schema\n", + "feature_set.import_tfx_schema(schema)\n", + "print(feature_set)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Computing statistics over an ingested dataset\n", + "\n", + "Feast is able to compute statistics for any data that has been ingested into the system. 
Statistics can be computed over either discrete datasets using *dataset_ids* or periods of time using a specified time range.\n", + "\n", + "These statistics are computed at a historical store (caveat: only BQ is supported at the moment). The feature statistics are returned in the form of TFX's `DatasetFeatureStatisticsList`, which can then be directly fed back into TFDV methods to either visualise the data statistics, or validate the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Feature set updated/created: \"iris:1\"\n", + "Waiting for feature set to be ready for ingestion...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 150/150 [00:01<00:00, 142.05rows/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ingestion complete!\n", + "\n", + "Ingestion statistics:\n", + "Success: 150/150\n", + "Removing temporary file(s)...\n", + "dataset id: a45b5760-76c9-3cfe-8479-2eb6020a73e3\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "# Apply the featureset\n", + "client.apply(feature_set)\n", + "\n", + "# When a dataset is ingested into Feast, a unique dataset id referencing the ingested dataset is returned. 
\n", + "dataset_id = client.ingest(feature_set, df)\n", + "print(\"dataset id: \" + dataset_id)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Get statistics from Feast for the ingested dataset.\n", + "# The statistics are calculated over the data in the store specified.\n", + "stats = client.get_statistics(\n", + " feature_set_id=f'{PROJECT_NAME}/iris:1', \n", + " store=BIGQUERY_STORE_NAME, \n", + " features=iris_feature_names, \n", + " dataset_ids=[dataset_id])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Visualising statistics with facets\n", + "\n", + "Since Feast outputs statistics in a format compatible with the TFDV API, the stats object can be directly passed to `tfdv.visualize_statistics()` to visualise, in-line, the output statistics on [Facets](https://pair-code.github.io/facets/), allowing for easy and interactive exploration of the shape and distribution of the data inside Feast." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tfdv.visualize_statistics(stats)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Validating correctness of subsequent datasets \n", + "\n", + "While it is useful to explore dataset statistics using facets, since we have already defined a schema that specifies a dataset's bounds of correctness, we can leverage TFDV's `validate_statistics` to validate if subsequent datasets are problematic or not. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It is possible to validate correctness of a new dataset prior to ingestion by retrieving the schema from the feature set, and comparing computed statistics against that schema. 
\n", + "\n", + "This can be useful if we want to avoid ingesting problematic data into Feast." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:root:Ignoring feature datetime of type datetime64[ns, UTC]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Anomaly short descriptionAnomaly long description
Feature name
'class'Unexpected string valuesExamples contain values missing from the schema: Iris-nonsensica (~33%).
\n", + "
" + ], + "text/plain": [ + " Anomaly short description \\\n", + "Feature name \n", + "'class' Unexpected string values \n", + "\n", + " Anomaly long description \n", + "Feature name \n", + "'class' Examples contain values missing from the schema: Iris-nonsensica (~33%). " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Ingest a new dataset with obviously incorrect data\n", + "df_2 = pd.DataFrame(\n", + " {\n", + " \"datetime\": current_datetime,\n", + " \"class\": [\"Iris-setosa\", \"Iris-virginica\", \"Iris-nonsensica\"],\n", + " \"sepal_length\": [4.3, 6.9, 12],\n", + " \"sepal_width\": [3.0, 2.8, 1.1],\n", + " \"petal_length\": [1.2, 4.9, 2.2],\n", + " \"petal_width\": [0.1, 1.8, 0]\n", + " }\n", + ")\n", + "\n", + "# Validate correctness\n", + "stats_2 = tfdv.generate_statistics_from_dataframe(df_2)\n", + "anomalies = tfdv.validate_statistics(statistics=stats_2, schema=feature_set.export_tfx_schema())\n", + "tfdv.display_anomalies(anomalies)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Alternatively, the data can be ingested into Feast, and the statistics computed at the store. This has the benefit of offloading statistics computation for large datasets to Feast." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + " 0%| | 0/3 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Anomaly short descriptionAnomaly long description
Feature name
'class'Unexpected string valuesExamples contain values missing from the schema: Iris-nonsensica (~33%).
\n", + "" + ], + "text/plain": [ + " Anomaly short description \\\n", + "Feature name \n", + "'class' Unexpected string values \n", + "\n", + " Anomaly long description \n", + "Feature name \n", + "'class' Examples contain values missing from the schema: Iris-nonsensica (~33%). " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Ingest the data into Feast\n", + "dataset_id_2 = client.ingest(feature_set, df_2)\n", + "time.sleep(10) # Sleep is not necessary if not using DirectRunner\n", + "\n", + "# Compute statistics over the new dataset\n", + "stats_2 = client.get_statistics(\n", + " feature_set_id=f'{PROJECT_NAME}/iris:1', \n", + " store=BIGQUERY_STORE_NAME, \n", + " features=iris_feature_names, \n", + " dataset_ids=[dataset_id_2])\n", + "\n", + "# Detect anomalies in the dataset\n", + "anomalies = tfdv.validate_statistics(statistics=stats_2, schema=feature_set.export_tfx_schema())\n", + "tfdv.display_anomalies(anomalies)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}