Merge branch 'main' into feature-924
Signed-off-by: Boli Guan <ifendoe@gmail.com>
Fendoe committed Jan 5, 2023
2 parents 1dabc83 + 9c338bc commit 05390a2
Showing 134 changed files with 31,188 additions and 18,754 deletions.
18 changes: 18 additions & 0 deletions .github/workflows/publish-to-maven.yml
@@ -35,3 +35,21 @@ jobs:
PGP_SECRET: ${{ secrets.MAVEN_GPG_PRIVATE_KEY }}
SONATYPE_PASSWORD: ${{ secrets.SONATYPE_PASSWORD }}
SONATYPE_USERNAME: ${{ secrets.SONATYPE_USERNAME }}

# Publish Released Fat Jar to Blob Storage
- name: Gradle build
run: |
./gradlew build
# remote folder for CI upload
echo "CI_SPARK_REMOTE_JAR_FOLDER=feathr_jar_release" >> $GITHUB_ENV
# full path of the locally built jar
echo "FEATHR_LOCAL_JAR_FULL_NAME_PATH=$(ls build/libs/*.jar)" >> $GITHUB_ENV
- name: Azure Blob Storage Upload (Overwrite)
uses: fixpoint/azblob-upload-artifact@v4
with:
connection-string: ${{ secrets.SPARK_JAR_BLOB_CONNECTION_STRING }}
name: ${{ env.CI_SPARK_REMOTE_JAR_FOLDER }}
path: ${{ env.FEATHR_LOCAL_JAR_FULL_NAME_PATH }}
container: ${{ secrets.SPARK_JAR_BLOB_CONTAINER }}
cleanup: "true"
23 changes: 12 additions & 11 deletions docs/README.md
@@ -1,6 +1,6 @@
<html>
<h1 align="center">
Feathr
<img src="./images/feathr_logo.png" width="256"/>
</h1>
<h3 align="center">
An enterprise-grade, high-performance feature store
@@ -14,6 +14,7 @@
</html>



[![License](https://img.shields.io/badge/License-Apache%202.0-blue)](https://github.com/feathr-ai/feathr/blob/main/LICENSE)
[![GitHub Release](https://img.shields.io/github/v/release/feathr-ai/feathr.svg?style=flat&sort=semver&color=blue)](https://github.com/feathr-ai/feathr/releases)
[![Docs Latest](https://img.shields.io/badge/docs-latest-blue.svg)](https://feathr-ai.github.io/feathr/)
@@ -61,17 +62,17 @@ If you want to set up everything manually, you can checkout the [Feathr CLI depl

- For more details on Feathr, read our [documentation](https://feathr-ai.github.io/feathr/).
- For Python API references, read the [Python API Reference](https://feathr.readthedocs.io/).
- For technical talks on Feathr, see the [slides here](./talks/Feathr%20Feature%20Store%20Talk.pdf) and [here](./talks/Feathr%20Community%20Talk%20%E2%80%93%20An%20Enterprise-Grade%20High%20Performance%20Feature%20Store.pdf). The recording is [here](https://www.youtube.com/watch?v=gZg01UKQMTY).

## 🧪 Samples

| Name | Description | Platform |
| ----------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------- |
| [NYC Taxi Demo](./samples/nyc_taxi_demo.ipynb) | Quickstart notebook that showcases how to define, materialize, and register features with NYC taxi-fare prediction sample data. | Azure Synapse, Databricks, Local Spark |
| [Databricks Quickstart NYC Taxi Demo](./samples/nyc_taxi_demo.ipynb) | Quickstart Databricks notebook with NYC taxi-fare prediction sample data. | Databricks |
| [Feature Embedding](./samples/feature_embedding.ipynb) | Feathr UDF example showing how to define and use feature embedding with a pre-trained Transformer model and hotel review sample data. | Databricks |
| [Fraud Detection Demo](./samples/fraud_detection_demo.ipynb) | An example to demonstrate Feature Store using multiple data sources such as user account and transaction data. | Azure Synapse, Databricks, Local Spark |
| [Product Recommendation Demo](./samples/product_recommendation_demo_advanced.ipynb) | Feathr Feature Store example notebook with a product recommendation scenario. | Azure Synapse, Databricks, Local Spark |

## 🛠️ Install Feathr Client Locally

@@ -174,9 +175,9 @@ Follow the [quick start Jupyter Notebook](https://github.com/feathr-ai/feathr/bl
## 🗣️ Tech Talks on Feathr

- [Introduction to Feathr - Beginner's guide](https://www.youtube.com/watch?v=gZg01UKQMTY)
- [Document Intelligence using Azure Feature Store (Feathr) and SynapseML](https://mybuild.microsoft.com/en-US/sessions/5bdff7d5-23e6-4f0d-9175-da8325d05c2a?source=sessions)
- [Notebook tutorial: Build a Product Recommendation Machine Learning Model with Feathr Feature Store](https://www.youtube.com/watch?v=2KSM-NLfvY0)
- [Feathr talk at the Feature Store Summit](https://www.youtube.com/watch?v=u8nLY9Savxk)

## ⚙️ Cloud Integrations and Architecture

18 changes: 9 additions & 9 deletions docs/concepts/registry-access-control.md
@@ -38,20 +38,20 @@ Feature level access control is **NOT** supported yet. Users are encouraged to g
### Role

A _role_ is a collection of permissions. We have 3 built-in roles with different permissions:
| Role     | Description                  | Permissions         |
| -------- | ---------------------------- | ------------------- |
| Admin    | The owner of the project     | Read, Write, Manage |
| Producer | A contributor to the project | Read, Write         |
| Consumer | A reader of the project      | Read                |

### Permission

A _permission_ refers to a certain kind of access to registry metadata or role assignment records.
| Permission | Description                                                     |
| ---------- | --------------------------------------------------------------- |
| Read       | Read registry metadata; `GET` Registry APIs                      |
| Write      | Write registry metadata; `POST` Registry APIs                    |
| Manage     | Create and manage role assignment records with management APIs   |
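
As a rough sketch of how these permissions surface in practice, a principal holding only the Consumer (Read) role can call `GET` registry APIs but not `POST` ones. The `/projects` path and the token handling below are assumptions for illustration; consult your registry's API reference:

```scala
import java.net.URI
import java.net.http.{HttpClient, HttpRequest, HttpResponse}

object RegistryReadSketch {
  def main(args: Array[String]): Unit = {
    // Hypothetical values: substitute your own registry endpoint and AAD bearer token.
    val registryEndpoint = sys.env("REGISTRY_ENDPOINT") // e.g. https://<prefix>webapp.azurewebsites.net/api/v1
    val token = sys.env("REGISTRY_TOKEN")

    // Read permission gates GET registry APIs; Write would be required for POST.
    val request = HttpRequest.newBuilder(URI.create(s"$registryEndpoint/projects"))
      .header("Authorization", s"Bearer $token")
      .GET()
      .build()

    val response = HttpClient.newHttpClient().send(request, HttpResponse.BodyHandlers.ofString())
    println(s"${response.statusCode()}: ${response.body()}")
  }
}
```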

### User

6 changes: 5 additions & 1 deletion docs/how-to-guides/azure_resource_provision.json
@@ -408,7 +408,11 @@
{
"name": "AZURE_CLIENT_ID",
"value": "[reference(resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', variables('identityName')), '2018-11-30','Full').properties.clientId]"
}
},
{
"name": "DOCKER_ENABLE_CI",
"value": "true"
}
]
}
}
8 changes: 8 additions & 0 deletions docs/how-to-guides/feathr-job-configuration.md
@@ -33,3 +33,11 @@ Examples when using the above job configurations when materializing features:
```python
client.materialize_features(settings, execution_configurations=SparkExecutionConfiguration({"spark.feathr.inputFormat": "parquet", "spark.feathr.outputFormat": "parquet"}))
```

## Config not applied issue

Note that the `execution_configurations` argument only takes effect when you use a new job cluster in Databricks: [Cluster spark config not applied](https://learn.microsoft.com/en-us/azure/databricks/kb/clusters/cluster-spark-config-not-applied)

If you are using an existing cluster, add the configurations to the cluster's Spark configuration manually. This can be done in the Databricks cluster UI: [Edit a cluster](https://learn.microsoft.com/en-us/azure/databricks/clusters/clusters-manage#--edit-a-cluster)
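
As a sketch (reusing the configuration keys from the example above), the corresponding entries to paste into the cluster's Spark config field would be:

```
spark.feathr.inputFormat parquet
spark.feathr.outputFormat parquet
```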



Binary file added docs/images/feathr_logo.png
11 changes: 6 additions & 5 deletions docs/samples/feature_embedding.ipynb
@@ -45,6 +45,7 @@
"outputs": [],
"source": [
"import json\n",
"import os\n",
"\n",
"import pandas as pd\n",
"from pyspark.sql import DataFrame\n",
@@ -102,7 +103,7 @@
},
"outputs": [],
"source": [
"RESOURCE_PREFIX = None # TODO fill the value\n",
"RESOURCE_PREFIX = \"\" # TODO fill the value\n",
"PROJECT_NAME = \"hotel_reviews_embedding\"\n",
"\n",
"REGISTRY_ENDPOINT = f\"https://{RESOURCE_PREFIX}webapp.azurewebsites.net/api/v1\"\n",
@@ -114,8 +115,8 @@
" SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL = f\"https://{ctx.tags().get('browserHostName').get()}\"\n",
"else:\n",
" # TODO fill the values.\n",
" DATABRICKS_WORKSPACE_TOKEN_VALUE = None\n",
" SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL = None\n",
" DATABRICKS_WORKSPACE_TOKEN_VALUE = os.environ.get(\"DATABRICKS_WORKSPACE_TOKEN_VALUE\")\n",
" SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL = os.environ.get(\"SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL\")\n",
"\n",
"# We'll need an authentication credential to access Azure resources and register features \n",
"USE_CLI_AUTH = False # Set True to use interactive authentication\n",
@@ -146,7 +147,6 @@
" credential = AzureCliCredential(additionally_allowed_tenants=['*'],)\n",
"elif AZURE_TENANT_ID and AZURE_CLIENT_ID and AZURE_CLIENT_SECRET:\n",
" # Use Environment variable secret\n",
" import os\n",
" from azure.identity import EnvironmentCredential\n",
" os.environ[\"AZURE_TENANT_ID\"] = AZURE_TENANT_ID\n",
" os.environ[\"AZURE_CLIENT_ID\"] = AZURE_CLIENT_ID\n",
@@ -315,6 +315,7 @@
"client = FeathrClient(\n",
" config_path=config_path,\n",
" credential=credential,\n",
" use_env_vars=False,\n",
")"
]
},
@@ -791,7 +792,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
"version": "3.10.8 (main, Nov 24 2022, 14:13:03) [GCC 11.2.0]"
},
"vscode": {
"interpreter": {
Binary file not shown.
@@ -12,5 +12,6 @@ public enum FeatureAggregationType {
ELEMENTWISE_MAX,
ELEMENTWISE_MIN,
ELEMENTWISE_AVG,
ELEMENTWISE_SUM,
FIRST
}
@@ -290,6 +290,8 @@ private[offline] class SequentialJoinAsDerivation(ss: SparkSession,
throw new FeathrConfigException(
ErrorLabel.FEATHR_USER_ERROR,
s"Empty aggregation is not supported for feature ${derivedFeature.producedFeatureNames.head}, in sequential join.")
} else if (aggregationFunction == FIRST.toString) {
applyFirstAggregation(seqJoinProducedFeatureName, joined, groupByCol)
} else if (aggregationFunction == UNION.toString) {
applyUnionAggregation(seqJoinProducedFeatureName, joined, groupByCol)
} else if (Seq(SUM, MAX, MIN, AVG).map(_.toString).contains(aggregationFunction)) {
@@ -315,6 +317,20 @@
}
}

/**
* Apply FIRST Aggregate function for SeqJoin/LookUp Feature.
* Note: This function is non-deterministic because its result depends on the order of the rows, which may change after a shuffle.
* It is designed to be used only when there is a single looked-up feature value per key in a LookUp feature.
* @param seqJoinProducedFeatureName name of the column which will have the seqJoin feature
* @param joinedDF Dataframe produced after the SeqJoin and before aggregation
* @param groupByCol groupby column
* @return dataframe with only the groupBy columns and the aggregated feature value result
*/
private[feathr] def applyFirstAggregation(seqJoinProducedFeatureName: String, joinedDF: DataFrame, groupByCol: String): DataFrame = {
val (groupedDF, preservedColumns) = getGroupedDF(joinedDF, groupByCol, seqJoinProducedFeatureName)
groupedDF.agg(first(seqJoinProducedFeatureName).alias(seqJoinProducedFeatureName), preservedColumns: _*)
}

/**
* Apply Union aggregation for SeqJoin. For more information on the union aggregation rules see the wiki
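
In standalone Spark terms, the FIRST aggregation added above behaves roughly like the sketch below (a simplification that ignores Feathr's internal column bookkeeping):

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.first

object FirstAggSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("first-agg-sketch").getOrCreate()
    import spark.implicits._

    // After a sequential join, one base row may fan out to several expansion rows.
    val joined = Seq(("m1", 1.0f), ("m1", 2.0f), ("m2", 3.0f)).toDF("mId", "item_price")

    // FIRST keeps one value per group; which one depends on row order after shuffles,
    // so it is only safe when each key carries a single looked-up value anyway.
    val aggregated = joined.groupBy("mId").agg(first("item_price").alias("item_price"))
    aggregated.show()
    spark.stop()
  }
}
```
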
@@ -371,7 +371,7 @@ private[offline] object FDSConversionUtils {
}
// we need to sort arrays according to dimension array of the 1d sparse tensor, i.e. the first array
val valType = targetType.asInstanceOf[StructType].fields(1).dataType.asInstanceOf[ArrayType].elementType
val indexArray = arrays(0).toArray
val sortedArrays = if (indexArray.nonEmpty) {
val firstElement = indexArray.head
val sortedArrays = firstElement match {
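
The one-line change above matters because Spark rows surface array columns as a `Seq` (typically `mutable.WrappedArray` on Scala 2.12), not as a raw JVM array, so the old `asInstanceOf[Array[Any]]` cast could throw `ClassCastException` at runtime. A minimal sketch of the difference:

```scala
import scala.collection.mutable

object ToArraySketch {
  def main(args: Array[String]): Unit = {
    // Spark returns array columns as Seq-backed wrappers, not Array.
    val fromRow: Seq[Any] = mutable.WrappedArray.make(Array(2, 0, 1))

    // This line would throw ClassCastException: a WrappedArray is not an Array.
    // val bad = fromRow.asInstanceOf[Array[Any]]

    // .toArray copies the elements into a real Array and always succeeds.
    val good: Array[Any] = fromRow.toArray
    println(good.mkString(","))
  }
}
```
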
4 changes: 4 additions & 0 deletions feathr-impl/src/test/resources/anchor1-obs.csv
@@ -0,0 +1,4 @@
mId,alpha,beta,gamma,omega
1,apple,10,10,0.1
2,orange,10,3,0.1
3,banana,10,2,0.9
4 changes: 4 additions & 0 deletions feathr-impl/src/test/resources/anchor1-source-lookup.csv
@@ -0,0 +1,4 @@
alpha,price
apple,1
orange,2
banana,3
@@ -0,0 +1,87 @@
package com.linkedin.feathr.offline

import com.linkedin.feathr.offline.util.FeathrTestUtils.assertDataFrameApproximatelyEquals
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.testng.annotations.Test

class LookupFeatureIntegTest extends FeathrIntegTest {

/**
* Test look up feature with first aggregation
*/
@Test
def testLookupFeatureWithFirstAgg: Unit = {
val df = runLocalFeatureJoinForTest(
joinConfigAsString =
"""
| features: [ {
| key: [mId]
| featureList: [ "item_price"]
| }
| ]
| """.stripMargin,
featureDefAsString =
"""
| anchors: {
| anchor1: {
| source: "anchor1-source.csv"
| key: "mId"
| features: {
| f_alpha: "alpha"
| }
| }
| anchor2: {
| source: "anchor1-source-lookup.csv"
| key: "alpha"
| features: {
| f_price: "(float)price"
| }
| }
| }
| derivations: {
| item_price: {
| key: [mId]
| join: {
| base: {key: [mId], feature: f_alpha}
| expansion: {key: [item_id], feature: f_price}
| }
| aggregation: FIRST
| type: {
| type: TENSOR
| tensorCategory: DENSE
| dimensionType: []
| valType: FLOAT
| }
| }
| }
|""".stripMargin,
observationDataPath = "anchor1-obs.csv")
val selectedColumns = Seq("mId", "item_price")
val filteredDf = df.data.select(selectedColumns.head, selectedColumns.tail: _*)

val expectedDf = ss.createDataFrame(
ss.sparkContext.parallelize(
Seq(
Row(
"1",
1.0f
), Row(
"2",
2.0f
), Row(
"3",
3.0f
),
)
),
StructType(
List(
StructField("mId", StringType, true),
StructField("item_price", FloatType, true))))

def cmpFunc(row: Row): String = if (row.get(0) != null) row.get(0).toString else "null"

assertDataFrameApproximatelyEquals(filteredDf, expectedDf, cmpFunc)
}
}
@@ -193,7 +193,7 @@ class SlidingWindowAggIntegTest extends FeathrIntegTest {
|features: [
| {
| key: [mId],
| featureList: ["aEmbedding"]
| featureList: ["aEmbedding", "memberEmbeddingAutoTZ"]
| }
|]
""".stripMargin,
@@ -219,6 +219,17 @@
| aggregation: LATEST
| window: 3d
| }
| memberEmbeddingAutoTZ: {
| def: "embedding"
| aggregation: LATEST
| window: 3d
| type: {
| type: TENSOR
| tensorCategory: SPARSE
| dimensionType: [INT]
| valType: FLOAT
| }
| }
| }
| }
|}
@@ -229,6 +240,8 @@

assertEquals(featureList.size, 2)
assertEquals(featureList(0).getAs[Row]("aEmbedding"), mutable.WrappedArray.make(Array(5.5f, 5.8f)))
assertEquals(featureList(0).getAs[Row]("memberEmbeddingAutoTZ"),
TestUtils.build1dSparseTensorFDSRow(Array(0, 1), Array(5.5f, 5.8f)))
}

/**