Merge branch 'main' into feature-924
Signed-off-by: Boli Guan <ifendoe@gmail.com>
Fendoe committed Jan 5, 2023
2 parents 1dabc83 + 9c338bc commit 05390a2
Showing 134 changed files with 31,188 additions and 18,754 deletions.
18 changes: 18 additions & 0 deletions .github/workflows/publish-to-maven.yml
@@ -35,3 +35,21 @@ jobs:
PGP_SECRET: ${{ secrets.MAVEN_GPG_PRIVATE_KEY }}
SONATYPE_PASSWORD: ${{ secrets.SONATYPE_PASSWORD }}
SONATYPE_USERNAME: ${{ secrets.SONATYPE_USERNAME }}

# Publish Released Fat Jar to Blob Storage
- name: Gradle build
run: |
./gradlew build
# remote folder for CI upload
echo "CI_SPARK_REMOTE_JAR_FOLDER=feathr_jar_release" >> $GITHUB_ENV
# full path of the locally built jar
echo "FEATHR_LOCAL_JAR_FULL_NAME_PATH=$(ls build/libs/*.jar)" >> $GITHUB_ENV
- name: Azure Blob Storage Upload (Overwrite)
uses: fixpoint/azblob-upload-artifact@v4
with:
connection-string: ${{ secrets.SPARK_JAR_BLOB_CONNECTION_STRING }}
name: ${{ env.CI_SPARK_REMOTE_JAR_FOLDER }}
path: ${{ env.FEATHR_LOCAL_JAR_FULL_NAME_PATH }}
container: ${{ secrets.SPARK_JAR_BLOB_CONTAINER }}
cleanup: "true"
23 changes: 12 additions & 11 deletions docs/README.md
@@ -1,6 +1,6 @@
<html>
<h1 align="center">
Feathr
<img src="./images/feathr_logo.png" width="256"/>
</h1>
<h3 align="center">
An enterprise-grade, high-performance feature store
@@ -14,6 +14,7 @@
</html>



[![License](https://img.shields.io/badge/License-Apache%202.0-blue)](https://github.com/feathr-ai/feathr/blob/main/LICENSE)
[![GitHub Release](https://img.shields.io/github/v/release/feathr-ai/feathr.svg?style=flat&sort=semver&color=blue)](https://github.com/feathr-ai/feathr/releases)
[![Docs Latest](https://img.shields.io/badge/docs-latest-blue.svg)](https://feathr-ai.github.io/feathr/)
@@ -61,17 +62,17 @@ If you want to set up everything manually, you can checkout the [Feathr CLI depl

- For more details on Feathr, read our [documentation](https://feathr-ai.github.io/feathr/).
- For Python API references, read the [Python API Reference](https://feathr.readthedocs.io/).
- For technical talks on Feathr, see the [slides here](./talks/Feathr%20Feature%20Store%20Talk.pdf) and [here](./talks/Feathr%20Community%20Talk%20%E2%80%93%20An%20Enterprise-Grade%20High%20Performance%20Feature%20Store.pdf). The recording is [here](https://www.youtube.com/watch?v=gZg01UKQMTY).

## 🧪 Samples

| Name | Description | Platform |
| ----------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------- |
| [NYC Taxi Demo](./samples/nyc_taxi_demo.ipynb) | Quickstart notebook that showcases how to define, materialize, and register features with NYC taxi-fare prediction sample data. | Azure Synapse, Databricks, Local Spark |
| [Databricks Quickstart NYC Taxi Demo](./samples/nyc_taxi_demo.ipynb) | Quickstart Databricks notebook with NYC taxi-fare prediction sample data. | Databricks |
| [Feature Embedding](./samples/feature_embedding.ipynb) | Feathr UDF example showing how to define and use feature embedding with a pre-trained Transformer model and hotel review sample data. | Databricks |
| [Fraud Detection Demo](./samples/fraud_detection_demo.ipynb) | An example to demonstrate Feature Store using multiple data sources such as user account and transaction data. | Azure Synapse, Databricks, Local Spark |
| [Product Recommendation Demo](./samples/product_recommendation_demo_advanced.ipynb) | Feathr Feature Store example notebook with a product recommendation scenario. | Azure Synapse, Databricks, Local Spark |

## 🛠️ Install Feathr Client Locally

@@ -174,9 +175,9 @@ Follow the [quick start Jupyter Notebook](https://github.com/feathr-ai/feathr/bl
## 🗣️ Tech Talks on Feathr

- [Introduction to Feathr - Beginner's guide](https://www.youtube.com/watch?v=gZg01UKQMTY)
- [Document Intelligence using Azure Feature Store (Feathr) and SynapseML](https://mybuild.microsoft.com/en-US/sessions/5bdff7d5-23e6-4f0d-9175-da8325d05c2a?source=sessions)
- [Notebook tutorial: Build a Product Recommendation Machine Learning Model with Feathr Feature Store](https://www.youtube.com/watch?v=2KSM-NLfvY0)
- [Feathr talk at the Feature Store Summit](https://www.youtube.com/watch?v=u8nLY9Savxk)

## ⚙️ Cloud Integrations and Architecture

18 changes: 9 additions & 9 deletions docs/concepts/registry-access-control.md
@@ -38,20 +38,20 @@ Feature level access control is **NOT** supported yet. Users are encouraged to g
### Role

A _role_ is a collection of permissions. We have 3 built-in roles with different permissions:
| Role     | Description                  | Permissions         |
| -------- | ---------------------------- | ------------------- |
| Admin    | The owner of the project     | Read, Write, Manage |
| Producer | A contributor to the project | Read, Write         |
| Consumer | A reader of the project      | Read                |

### Permission

A _permission_ refers to a certain kind of access to registry metadata or role assignment records.
| Permission | Description                                                     |
| ---------- | --------------------------------------------------------------- |
| Read       | Read registry metadata; `GET` Registry APIs                      |
| Write      | Write registry metadata; `POST` Registry APIs                    |
| Manage     | Create and manage role assignment records with management APIs   |
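
As a rough sketch of how these permissions surface in practice, a principal holding only the Consumer (Read) role can call `GET` registry APIs but not `POST` ones. The `/projects` path and the token handling below are assumptions for illustration; consult your registry's API reference:

```scala
import java.net.URI
import java.net.http.{HttpClient, HttpRequest, HttpResponse}

object RegistryReadSketch {
  def main(args: Array[String]): Unit = {
    // Hypothetical values: substitute your own registry endpoint and AAD bearer token.
    val registryEndpoint = sys.env("REGISTRY_ENDPOINT") // e.g. https://<prefix>webapp.azurewebsites.net/api/v1
    val token = sys.env("REGISTRY_TOKEN")

    // Read permission gates GET registry APIs; Write would be required for POST.
    val request = HttpRequest.newBuilder(URI.create(s"$registryEndpoint/projects"))
      .header("Authorization", s"Bearer $token")
      .GET()
      .build()

    val response = HttpClient.newHttpClient().send(request, HttpResponse.BodyHandlers.ofString())
    println(s"${response.statusCode()}: ${response.body()}")
  }
}
```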

### User

6 changes: 5 additions & 1 deletion docs/how-to-guides/azure_resource_provision.json
@@ -408,7 +408,11 @@
{
"name": "AZURE_CLIENT_ID",
"value": "[reference(resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', variables('identityName')), '2018-11-30','Full').properties.clientId]"
}
},
{
"name": "DOCKER_ENABLE_CI",
"value": "true"
}
]
}
}
8 changes: 8 additions & 0 deletions docs/how-to-guides/feathr-job-configuration.md
@@ -33,3 +33,11 @@ Examples when using the above job configurations when materializing features:
```python
client.materialize_features(settings, execution_configurations=SparkExecutionConfiguration({"spark.feathr.inputFormat": "parquet", "spark.feathr.outputFormat": "parquet"}))
```

## Config not applied issue

Note that the `execution_configurations` argument only takes effect when you use a new job cluster in Databricks: [Cluster spark config not applied](https://learn.microsoft.com/en-us/azure/databricks/kb/clusters/cluster-spark-config-not-applied)

If you are using an existing cluster, add the configurations to the cluster's Spark configuration manually. This can be done in the Databricks cluster UI: [Edit a cluster](https://learn.microsoft.com/en-us/azure/databricks/clusters/clusters-manage#--edit-a-cluster)
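
As a sketch (reusing the configuration keys from the example above), the corresponding entries to paste into the cluster's Spark config field would be:

```
spark.feathr.inputFormat parquet
spark.feathr.outputFormat parquet
```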



Binary file added docs/images/feathr_logo.png
11 changes: 6 additions & 5 deletions docs/samples/feature_embedding.ipynb
@@ -45,6 +45,7 @@
"outputs": [],
"source": [
"import json\n",
"import os\n",
"\n",
"import pandas as pd\n",
"from pyspark.sql import DataFrame\n",
@@ -102,7 +103,7 @@
},
"outputs": [],
"source": [
"RESOURCE_PREFIX = None # TODO fill the value\n",
"RESOURCE_PREFIX = \"\" # TODO fill the value\n",
"PROJECT_NAME = \"hotel_reviews_embedding\"\n",
"\n",
"REGISTRY_ENDPOINT = f\"https://{RESOURCE_PREFIX}webapp.azurewebsites.net/api/v1\"\n",
@@ -114,8 +115,8 @@
" SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL = f\"https://{ctx.tags().get('browserHostName').get()}\"\n",
"else:\n",
" # TODO fill the values.\n",
" DATABRICKS_WORKSPACE_TOKEN_VALUE = None\n",
" SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL = None\n",
" DATABRICKS_WORKSPACE_TOKEN_VALUE = os.environ.get(\"DATABRICKS_WORKSPACE_TOKEN_VALUE\")\n",
" SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL = os.environ.get(\"SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL\")\n",
"\n",
"# We'll need an authentication credential to access Azure resources and register features \n",
"USE_CLI_AUTH = False # Set True to use interactive authentication\n",
@@ -146,7 +147,6 @@
" credential = AzureCliCredential(additionally_allowed_tenants=['*'],)\n",
"elif AZURE_TENANT_ID and AZURE_CLIENT_ID and AZURE_CLIENT_SECRET:\n",
" # Use Environment variable secret\n",
" import os\n",
" from azure.identity import EnvironmentCredential\n",
" os.environ[\"AZURE_TENANT_ID\"] = AZURE_TENANT_ID\n",
" os.environ[\"AZURE_CLIENT_ID\"] = AZURE_CLIENT_ID\n",
@@ -315,6 +315,7 @@
"client = FeathrClient(\n",
" config_path=config_path,\n",
" credential=credential,\n",
" use_env_vars=False,\n",
")"
]
},
@@ -791,7 +792,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
"version": "3.10.8 (main, Nov 24 2022, 14:13:03) [GCC 11.2.0]"
},
"vscode": {
"interpreter": {
Binary file not shown.
@@ -12,5 +12,6 @@ public enum FeatureAggregationType {
ELEMENTWISE_MAX,
ELEMENTWISE_MIN,
ELEMENTWISE_AVG,
ELEMENTWISE_SUM,
FIRST
}
@@ -290,6 +290,8 @@ private[offline] class SequentialJoinAsDerivation(ss: SparkSession,
throw new FeathrConfigException(
ErrorLabel.FEATHR_USER_ERROR,
s"Empty aggregation is not supported for feature ${derivedFeature.producedFeatureNames.head}, in sequential join.")
} else if (aggregationFunction == FIRST.toString) {
applyFirstAggregation(seqJoinProducedFeatureName, joined, groupByCol)
} else if (aggregationFunction == UNION.toString) {
applyUnionAggregation(seqJoinProducedFeatureName, joined, groupByCol)
} else if (Seq(SUM, MAX, MIN, AVG).map(_.toString).contains(aggregationFunction)) {
@@ -315,6 +317,20 @@
}
}

/**
* Apply FIRST Aggregate function for SeqJoin/LookUp Feature.
* Note: This function is non-deterministic because its result depends on the order of the rows, which may change after a shuffle.
* It is designed to be used only when there is a single looked-up feature value per key in a LookUp feature.
* @param seqJoinProducedFeatureName name of the column which will have the seqJoin feature
* @param joinedDF Dataframe produced after the SeqJoin and before aggregation
* @param groupByCol groupby column
* @return dataframe with only the groupBy columns and the aggregated feature value result
*/
private[feathr] def applyFirstAggregation(seqJoinProducedFeatureName: String, joinedDF: DataFrame, groupByCol: String): DataFrame = {
val (groupedDF, preservedColumns) = getGroupedDF(joinedDF, groupByCol, seqJoinProducedFeatureName)
groupedDF.agg(first(seqJoinProducedFeatureName).alias(seqJoinProducedFeatureName), preservedColumns: _*)
}

/**
* Apply Union aggregation for SeqJoin. For more information on the union aggregation rules see the wiki
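
In standalone Spark terms, the FIRST aggregation added above behaves roughly like the sketch below (a simplification that ignores Feathr's internal column bookkeeping):

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.first

object FirstAggSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("first-agg-sketch").getOrCreate()
    import spark.implicits._

    // After a sequential join, one base row may fan out to several expansion rows.
    val joined = Seq(("m1", 1.0f), ("m1", 2.0f), ("m2", 3.0f)).toDF("mId", "item_price")

    // FIRST keeps one value per group; which one depends on row order after shuffles,
    // so it is only safe when each key carries a single looked-up value anyway.
    val aggregated = joined.groupBy("mId").agg(first("item_price").alias("item_price"))
    aggregated.show()
    spark.stop()
  }
}
```
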
@@ -371,7 +371,7 @@ private[offline] object FDSConversionUtils {
}
// we need to sort arrays according to dimension array of the 1d sparse tensor, i.e. the first array
val valType = targetType.asInstanceOf[StructType].fields(1).dataType.asInstanceOf[ArrayType].elementType
val indexArray = arrays(0).toArray
val sortedArrays = if (indexArray.nonEmpty) {
val firstElement = indexArray.head
val sortedArrays = firstElement match {
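
The one-line change above matters because Spark rows surface array columns as a `Seq` (typically `mutable.WrappedArray` on Scala 2.12), not as a raw JVM array, so the old `asInstanceOf[Array[Any]]` cast could throw `ClassCastException` at runtime. A minimal sketch of the difference:

```scala
import scala.collection.mutable

object ToArraySketch {
  def main(args: Array[String]): Unit = {
    // Spark returns array columns as Seq-backed wrappers, not Array.
    val fromRow: Seq[Any] = mutable.WrappedArray.make(Array(2, 0, 1))

    // This line would throw ClassCastException: a WrappedArray is not an Array.
    // val bad = fromRow.asInstanceOf[Array[Any]]

    // .toArray copies the elements into a real Array and always succeeds.
    val good: Array[Any] = fromRow.toArray
    println(good.mkString(","))
  }
}
```
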
4 changes: 4 additions & 0 deletions feathr-impl/src/test/resources/anchor1-obs.csv
@@ -0,0 +1,4 @@
mId,alpha,beta,gamma,omega
1,apple,10,10,0.1
2,orange,10,3,0.1
3,banana,10,2,0.9
4 changes: 4 additions & 0 deletions feathr-impl/src/test/resources/anchor1-source-lookup.csv
@@ -0,0 +1,4 @@
alpha,price
apple,1
orange,2
banana,3
@@ -0,0 +1,87 @@
package com.linkedin.feathr.offline

import com.linkedin.feathr.offline.util.FeathrTestUtils.assertDataFrameApproximatelyEquals
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.testng.annotations.Test

class LookupFeatureIntegTest extends FeathrIntegTest {

/**
* Test look up feature with first aggregation
*/
@Test
def testLookupFeatureWithFirstAgg: Unit = {
val df = runLocalFeatureJoinForTest(
joinConfigAsString =
"""
| features: [ {
| key: [mId]
| featureList: [ "item_price"]
| }
| ]
| """.stripMargin,
featureDefAsString =
"""
| anchors: {
| anchor1: {
| source: "anchor1-source.csv"
| key: "mId"
| features: {
| f_alpha: "alpha"
| }
| }
| anchor2: {
| source: "anchor1-source-lookup.csv"
| key: "alpha"
| features: {
| f_price: "(float)price"
| }
| }
| }
| derivations: {
| item_price: {
| key: [mId]
| join: {
| base: {key: [mId], feature: f_alpha}
| expansion: {key: [item_id], feature: f_price}
| }
| aggregation: FIRST
| type: {
| type: TENSOR
| tensorCategory: DENSE
| dimensionType: []
| valType: FLOAT
| }
| }
| }
|""".stripMargin,
observationDataPath = "anchor1-obs.csv")
val selectedColumns = Seq("mId", "item_price")
val filteredDf = df.data.select(selectedColumns.head, selectedColumns.tail: _*)

val expectedDf = ss.createDataFrame(
ss.sparkContext.parallelize(
Seq(
Row(
"1",
1.0f
), Row(
"2",
2.0f
), Row(
"3",
3.0f
),
)
),
StructType(
List(
StructField("mId", StringType, true),
StructField("item_price", FloatType, true))))

def cmpFunc(row: Row): String = if (row.get(0) != null) row.get(0).toString else "null"

assertDataFrameApproximatelyEquals(filteredDf, expectedDf, cmpFunc)
}
}
@@ -193,7 +193,7 @@ class SlidingWindowAggIntegTest extends FeathrIntegTest {
|features: [
| {
| key: [mId],
| featureList: ["aEmbedding"]
| featureList: ["aEmbedding", "memberEmbeddingAutoTZ"]
| }
|]
""".stripMargin,
@@ -219,6 +219,17 @@
| aggregation: LATEST
| window: 3d
| }
| memberEmbeddingAutoTZ: {
| def: "embedding"
| aggregation: LATEST
| window: 3d
| type: {
| type: TENSOR
| tensorCategory: SPARSE
| dimensionType: [INT]
| valType: FLOAT
| }
| }
| }
| }
|}
@@ -229,6 +240,8 @@

assertEquals(featureList.size, 2)
assertEquals(featureList(0).getAs[Row]("aEmbedding"), mutable.WrappedArray.make(Array(5.5f, 5.8f)))
assertEquals(featureList(0).getAs[Row]("memberEmbeddingAutoTZ"),
TestUtils.build1dSparseTensorFDSRow(Array(0, 1), Array(5.5f, 5.8f)))
}

/**