diff --git a/examples/Building-and-deploying-multi-stage-RecSys/01-Building-Recommender-Systems-with-Merlin.ipynb b/examples/Building-and-deploying-multi-stage-RecSys/01-Building-Recommender-Systems-with-Merlin.ipynb
index dd22a1378..9a0038917 100644
--- a/examples/Building-and-deploying-multi-stage-RecSys/01-Building-Recommender-Systems-with-Merlin.ipynb
+++ b/examples/Building-and-deploying-multi-stage-RecSys/01-Building-Recommender-Systems-with-Merlin.ipynb
@@ -7,7 +7,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# Copyright 2021 NVIDIA Corporation. All Rights Reserved.\n",
+ "# Copyright 2023 NVIDIA Corporation. All Rights Reserved.\n",
"#\n",
"# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"# you may not use this file except in compliance with the License.\n",
@@ -133,7 +133,7 @@
"\n",
"# for running this example on CPU, uncomment the following lines\n",
"# %pip install tensorflow-cpu \"feast==0.31\" faiss-cpu\n",
- "# %pip uninstall cudf"
+ "# %pip uninstall cudf\n"
]
},
{
@@ -146,7 +146,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "2023-06-20 23:45:23.539085: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n",
+ "2023-06-29 19:49:32.836544: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n",
"To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"/usr/local/lib/python3.8/dist-packages/merlin/dtypes/mappings/torch.py:43: UserWarning: PyTorch dtype mappings did not load successfully due to an error: No module named 'torch'\n",
" warn(f\"PyTorch dtype mappings did not load successfully due to an error: {exc.msg}\")\n"
@@ -167,11 +167,12 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "2023-06-20 23:45:31.002019: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n",
+ "2023-06-29 19:49:37.094972: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n",
"To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
- "2023-06-20 23:45:31.232986: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:42] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.\n",
- "2023-06-20 23:45:31.233033: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:222] Using CUDA malloc Async allocator for GPU: 0\n",
- "2023-06-20 23:45:31.233242: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 8192 MB memory: -> device: 0, name: Tesla V100-SXM2-16GB-N, pci bus id: 0000:06:00.0, compute capability: 7.0\n",
+ "2023-06-29 19:49:38.134481: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:42] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.\n",
+ "2023-06-29 19:49:38.134526: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 24576 MB memory: -> device: 0, name: Quadro RTX 8000, pci bus id: 0000:15:00.0, compute capability: 7.5\n",
+ "2023-06-29 19:49:38.135533: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:42] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.\n",
+ "2023-06-29 19:49:38.135562: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 24576 MB memory: -> device: 1, name: Quadro RTX 8000, pci bus id: 0000:2d:00.0, compute capability: 7.5\n",
"/usr/local/lib/python3.8/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
@@ -186,21 +187,19 @@
],
"source": [
"import os\n",
- "# for running this example on CPU, comment out the line below\n",
- "os.environ[\"TF_GPU_ALLOCATOR\"] = \"cuda_malloc_async\"\n",
- "\n",
"import nvtabular as nvt\n",
"from nvtabular.ops import Rename, Filter, Dropna, LambdaOp, Categorify, \\\n",
" TagAsUserFeatures, TagAsUserID, TagAsItemFeatures, TagAsItemID, AddMetadata\n",
"\n",
"from merlin.schema.tags import Tags\n",
- "\n",
+ "from merlin.dag.ops.subgraph import Subgraph\n",
"import merlin.models.tf as mm\n",
"from merlin.io.dataset import Dataset\n",
"from merlin.datasets.ecommerce import transform_aliccp\n",
"import tensorflow as tf\n",
"\n",
- "import logging"
+ "# for running this example on CPU, comment out the line below\n",
+ "os.environ[\"TF_GPU_ALLOCATOR\"] = \"cuda_malloc_async\""
]
},
{
@@ -211,6 +210,8 @@
"outputs": [],
"source": [
"# disable INFO and DEBUG logging everywhere\n",
+ "import logging\n",
+ "\n",
"logging.disable(logging.WARNING)"
]
},
@@ -269,681 +270,633 @@
},
{
"cell_type": "markdown",
- "id": "2e428d01-f2f0-42d4-85d0-0986bb83a847",
- "metadata": {},
- "source": [
- "### Feature Engineering with NVTabular"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "id": "d4bf870c-30cf-4074-88d3-b75981b3a873",
+ "id": "7bd843be-dfba-4f8b-bac1-608e6571352d",
"metadata": {},
- "outputs": [],
"source": [
- "output_path = os.path.join(DATA_FOLDER, \"processed_nvt\")"
+ "### Set up a feature store with Feast"
]
},
{
"cell_type": "markdown",
- "id": "1e7bfb5c-88ed-4cf9-8a17-98c0284adb36",
+ "id": "c543b71c-6ba2-4e43-8779-8bffb62d2cee",
"metadata": {},
"source": [
- "In the following NVTabular workflow, notice that we apply the `Dropna()` Operator at the end. We add the Operator to remove rows with missing values in the final DataFrame after the preceding transformations. Although, the synthetic dataset that we generate and use in this notebook does not have null entries, you might have null entries in your `user_id` and `item_id` columns in your own custom dataset. Therefore, while applying `Dropna()` we will not be registering null `user_id_raw` and `item_id_raw` values in the feature store, and will be avoiding potential issues that can occur because of any null entries."
+ "Before we move on to the next step, we need to create a Feast feature repository. [Feast](https://feast.dev/) is an end-to-end open source feature store for machine learning: a customizable operational data system that re-uses existing infrastructure to manage and serve machine learning features to real-time models.\n",
+ "\n",
+ "We will create the feature repo in the current working directory, which is `BASE_DIR` for us."
]
},
{
"cell_type": "code",
- "execution_count": 8,
- "id": "f91ada78-4e4d-4415-ab94-e351aa454e9e",
+ "execution_count": 7,
+ "id": "2e7e96d2-9cd2-40d1-b356-8cd76b57bb4a",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Creating a new Feast repository in \u001b[1m\u001b[32m/raid/workshared/merlin/examples/Building-and-deploying-multi-stage-RecSys/feast_repo\u001b[0m.\n",
+ "\n"
+ ]
+ }
+ ],
"source": [
- "user_id_raw = [\"user_id\"] >> Rename(postfix='_raw') >> LambdaOp(lambda col: col.astype(\"int32\")) >> TagAsUserFeatures()\n",
- "item_id_raw = [\"item_id\"] >> Rename(postfix='_raw') >> LambdaOp(lambda col: col.astype(\"int32\")) >> TagAsItemFeatures()\n",
- "\n",
- "user_id = [\"user_id\"] >> Categorify(dtype=\"int32\") >> TagAsUserID()\n",
- "item_id = [\"item_id\"] >> Categorify(dtype=\"int32\") >> TagAsItemID()\n",
- "\n",
- "item_features = (\n",
- " [\"item_category\", \"item_shop\", \"item_brand\"] >> Categorify(dtype=\"int32\") >> TagAsItemFeatures()\n",
- ")\n",
- "\n",
- "user_features = (\n",
- " [\n",
- " \"user_shops\",\n",
- " \"user_profile\",\n",
- " \"user_group\",\n",
- " \"user_gender\",\n",
- " \"user_age\",\n",
- " \"user_consumption_2\",\n",
- " \"user_is_occupied\",\n",
- " \"user_geography\",\n",
- " \"user_intentions\",\n",
- " \"user_brands\",\n",
- " \"user_categories\",\n",
- " ] >> Categorify(dtype=\"int32\") >> TagAsUserFeatures()\n",
- ")\n",
- "\n",
- "targets = [\"click\"] >> AddMetadata(tags=[Tags.BINARY_CLASSIFICATION, \"target\"])\n",
- "\n",
- "outputs = user_id + item_id + item_features + user_features + user_id_raw + item_id_raw + targets\n",
- "\n",
- "# add dropna op to filter rows with nulls\n",
- "outputs = outputs >> Dropna()"
+ "!rm -rf $BASE_DIR/feast_repo\n",
+ "!cd $BASE_DIR && feast init feast_repo"
]
},
{
"cell_type": "markdown",
- "id": "71aae006-a161-4127-889a-8f433a9f7362",
+ "id": "f6d4d773-144e-4e34-82cd-f2b50fce601c",
"metadata": {},
"source": [
- "Let's call `transform_aliccp` utility function to be able to perform `fit` and `transform` steps on the raw dataset applying the operators defined in the NVTabular workflow pipeline below, and also save our workflow model. After fit and transform, the processed parquet files are saved to output_path."
+ "You should see a message like `Creating a new Feast repository in ...` printed above. Now, navigate to the `feature_repo` folder and remove the demo parquet file and the `example_repo.py` file that were created by default."
]
},
{
"cell_type": "code",
- "execution_count": 9,
- "id": "814e8438-642a-4f03-baaf-44dab8d1b5e5",
+ "execution_count": 8,
+ "id": "26ba2521-ed1b-4c2b-afdd-26b4a5a9c008",
"metadata": {},
"outputs": [],
"source": [
- "transform_aliccp(\n",
- " (train_raw, valid_raw), output_path, nvt_workflow=outputs, workflow_name=\"workflow\"\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "09c87748-af61-42b8-8574-1afe3d71118f",
- "metadata": {},
- "source": [
- "### Training a Retrieval Model with Two-Tower Model"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "e644fcba-7b0b-44c0-97fd-80f4fcb01191",
- "metadata": {},
- "source": [
- "We start with the offline candidate retrieval stage. We are going to train a Two-Tower model for item retrieval. To learn more about the Two-tower model you can visit [05-Retrieval-Model.ipynb](https://github.com/NVIDIA-Merlin/models/blob/stable/examples/05-Retrieval-Model.ipynb)."
+ "feature_repo_path = os.path.join(BASE_DIR, \"feast_repo/feature_repo\")\n",
+ "if os.path.exists(f\"{feature_repo_path}/example_repo.py\"):\n",
+ " os.remove(f\"{feature_repo_path}/example_repo.py\")\n",
+ "if os.path.exists(f\"{feature_repo_path}/data/driver_stats.parquet\"):\n",
+ " os.remove(f\"{feature_repo_path}/data/driver_stats.parquet\")"
]
},
{
"cell_type": "markdown",
- "id": "cf9bca46-a6b6-4a73-afd8-fe2869c60748",
+ "id": "24ae0e29-c156-4df9-8977-238786160a8c",
"metadata": {},
"source": [
- "#### Feature Engineering with NVTabular"
+ "### Exporting user and item features"
]
},
{
- "cell_type": "markdown",
- "id": "da2b09cc-09fb-4814-a1cb-7e6168d9eb4b",
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "ea0b369c-2f01-42e3-9f3c-74c3ff4a6d64",
"metadata": {},
+ "outputs": [],
"source": [
- "We are going to process our raw categorical features by encoding them using `Categorify()` operator and tag the features with `user` or `item` tags in the schema file. To learn more about [NVTabular](https://github.com/NVIDIA-Merlin/NVTabular) and the schema object visit this example [notebook](https://github.com/NVIDIA-Merlin/models/blob/stable/examples/02-Merlin-Models-and-NVTabular-integration.ipynb) in the Merlin Models repo."
+ "from merlin.models.utils.dataset import unique_rows_by_features\n",
+ "\n",
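+ "# Keep one row per user: unique_rows_by_features deduplicates the training data on the\n",
+ "# user ID column and keeps only the columns tagged as user features.\n",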
+ "user_features = (\n",
+ " unique_rows_by_features(train_raw, Tags.USER, Tags.USER_ID)\n",
+ " .compute()\n",
+ " .reset_index(drop=True)\n",
+ ")"
]
},
{
"cell_type": "markdown",
- "id": "f3bc7abd-8d97-452b-a4af-5227821a99c9",
+ "id": "4f2d12f5-c753-4392-b113-965d97d2fe35",
"metadata": {},
"source": [
- "Define a new output path to store the filtered datasets and schema files."
+ "We will artificially add `datetime` and `created` timestamp columns to our user_features dataframe. This is required by Feast to track the user-item features and their creation time, and to determine which version to use when we query Feast."
]
},
{
"cell_type": "code",
"execution_count": 10,
- "id": "df72a793-194b-44f4-80c3-aaa368a9a01e",
+ "id": "d30bd2f8-8a78-4df7-9bc4-42bd741c5b99",
"metadata": {},
"outputs": [],
"source": [
- "output_path2 = os.path.join(DATA_FOLDER, \"processed/retrieval\")"
+ "from datetime import datetime\n",
+ "\n",
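+ "# Feast expects an event timestamp column (here \"datetime\") and a created timestamp\n",
+ "# column (\"created\") on each feature table, so we add them and cast to datetime64[ns].\n",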
+ "user_features[\"datetime\"] = datetime.now()\n",
+ "user_features[\"datetime\"] = user_features[\"datetime\"].astype(\"datetime64[ns]\")\n",
+ "user_features[\"created\"] = datetime.now()\n",
+ "user_features[\"created\"] = user_features[\"created\"].astype(\"datetime64[ns]\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
- "id": "251d4697-8f9c-4c93-8de4-c3480a8378de",
- "metadata": {},
- "outputs": [],
- "source": [
- "train_tt = Dataset(os.path.join(output_path, \"train\", \"*.parquet\"))\n",
- "valid_tt = Dataset(os.path.join(output_path, \"valid\", \"*.parquet\"))"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "ffd7e2ac-a251-49d0-943b-e9272c852ba6",
+ "id": "d4998cd1-9dcd-4911-8f23-372e197b41e9",
"metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " user_id | \n",
+ " user_shops | \n",
+ " user_profile | \n",
+ " user_group | \n",
+ " user_gender | \n",
+ " user_age | \n",
+ " user_consumption_1 | \n",
+ " user_consumption_2 | \n",
+ " user_is_occupied | \n",
+ " user_geography | \n",
+ " user_intentions | \n",
+ " user_brands | \n",
+ " user_categories | \n",
+ " datetime | \n",
+ " created | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 6 | \n",
+ " 7 | \n",
+ " 590 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 171 | \n",
+ " 293 | \n",
+ " 31 | \n",
+ " 2023-06-29 19:49:50.300270 | \n",
+ " 2023-06-29 19:49:50.303330 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " user_id user_shops user_profile user_group user_gender user_age \\\n",
+ "6 7 590 1 1 1 1 \n",
+ "\n",
+ " user_consumption_1 user_consumption_2 user_is_occupied user_geography \\\n",
+ "6 1 1 1 1 \n",
+ "\n",
+ " user_intentions user_brands user_categories datetime \\\n",
+ "6 171 293 31 2023-06-29 19:49:50.300270 \n",
+ "\n",
+ " created \n",
+ "6 2023-06-29 19:49:50.303330 "
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "We select only positive interaction rows where `click==1` in the dataset with `Filter()` operator."
+ "user_features[user_features[\"user_id\"] == 7]"
]
},
{
"cell_type": "code",
"execution_count": 12,
- "id": "7e085a6d-74ad-4c24-8e7c-4e449c15f471",
+ "id": "2981b3ed-6156-49f0-aa14-326a3853a58a",
"metadata": {},
"outputs": [],
"source": [
- "inputs = train_tt.schema.column_names\n",
- "outputs = inputs >> Filter(f=lambda df: df[\"click\"] == 1)\n",
- "\n",
- "workflow2 = nvt.Workflow(outputs)\n",
- "\n",
- "workflow2.fit(train_tt)\n",
- "\n",
- "workflow2.transform(train_tt).to_parquet(\n",
- " output_path=os.path.join(output_path2, \"train\")\n",
- ")\n",
- "\n",
- "workflow2.transform(valid_tt).to_parquet(\n",
- " output_path=os.path.join(output_path2, \"valid\")\n",
+ "user_features.to_parquet(\n",
+ " os.path.join(feature_repo_path, \"data\", \"user_features.parquet\")\n",
")"
]
},
- {
- "cell_type": "markdown",
- "id": "cc4721ae-7228-4d3f-9586-dcdfefecc19f",
- "metadata": {},
- "source": [
- "NVTabular exported the schema file, `schema.pbtxt` a protobuf text file, of our processed dataset. To learn more about the schema object and schema file you can explore [02-Merlin-Models-and-NVTabular-integration.ipynb](https://github.com/NVIDIA-Merlin/models/blob/stable/examples/02-Merlin-Models-and-NVTabular-integration.ipynb) notebook."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "aa025b80-0f18-437c-a85f-4edcb89f4222",
- "metadata": {},
- "source": [
- "**Read filtered parquet files as Dataset objects.**"
- ]
- },
{
"cell_type": "code",
"execution_count": 13,
- "id": "252a8e60-b447-46b5-ade6-3557cbafa797",
+ "id": "0a33a668-8e2a-4546-8f54-0060d405ba91",
"metadata": {},
"outputs": [],
"source": [
- "train_tt = Dataset(os.path.join(output_path2, \"train\", \"*.parquet\"), part_size=\"500MB\")\n",
- "valid_tt = Dataset(os.path.join(output_path2, \"valid\", \"*.parquet\"), part_size=\"500MB\")"
+ "item_features = (\n",
+ " unique_rows_by_features(train_raw, Tags.ITEM, Tags.ITEM_ID)\n",
+ " .compute()\n",
+ " .reset_index(drop=True)\n",
+ ")"
]
},
{
"cell_type": "code",
"execution_count": 14,
- "id": "71063653-2f39-4b54-8399-145d6f281d4d",
+ "id": "68a694d6-926f-4b0f-8edc-8cc7ac85ade7",
"metadata": {},
"outputs": [],
"source": [
- "schema = train_tt.schema.select_by_tag([Tags.ITEM_ID, Tags.USER_ID, Tags.ITEM, Tags.USER]).without(['user_id_raw', 'item_id_raw', 'click'])\n",
- "train_tt.schema = schema\n",
- "valid_tt.schema = schema"
+ "item_features[\"datetime\"] = datetime.now()\n",
+ "item_features[\"datetime\"] = item_features[\"datetime\"].astype(\"datetime64[ns]\")\n",
+ "item_features[\"created\"] = datetime.now()\n",
+ "item_features[\"created\"] = item_features[\"created\"].astype(\"datetime64[ns]\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
- "id": "9312511a-f368-42f2-93d2-eb95aebbf46c",
- "metadata": {},
- "outputs": [],
- "source": [
- "model_tt = mm.TwoTowerModel(\n",
- " schema,\n",
- " query_tower=mm.MLPBlock([128, 64], no_activation_last_layer=True),\n",
- " samplers=[mm.InBatchSampler()],\n",
- " embedding_options=mm.EmbeddingOptions(infer_embedding_sizes=True),\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "id": "4d47cb8b-e06a-4932-9a19-fb244ef43152",
+ "id": "6c03fa22-b112-4243-bbe1-1cd7260cb85b",
"metadata": {},
"outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/usr/local/lib/python3.8/dist-packages/keras/initializers/initializers_v2.py:120: UserWarning: The initializer TruncatedNormal is unseeded and being called multiple times, which will return identical values each time (even if the initializer is unseeded). Please update your code to provide a seed to the initializer, or avoid using the same initalizer instance more than once.\n",
- " warnings.warn(\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "5/5 [==============================] - 18s 712ms/step - loss: 8.9090 - recall_at_10: 0.0069 - ndcg_at_10: 0.0045 - regularization_loss: 0.0000e+00 - loss_batch: 8.5771 - val_loss: 8.9027 - val_recall_at_10: 0.0113 - val_ndcg_at_10: 0.0072 - val_regularization_loss: 0.0000e+00 - val_loss_batch: 8.7921\n"
- ]
- },
{
"data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " item_id | \n",
+ " item_category | \n",
+ " item_shop | \n",
+ " item_brand | \n",
+ " item_intention | \n",
+ " datetime | \n",
+ " created | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2023-06-29 19:49:50.410715 | \n",
+ " 2023-06-29 19:49:50.412307 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 6 | \n",
+ " 412 | \n",
+ " 142 | \n",
+ " 66 | \n",
+ " 2023-06-29 19:49:50.410715 | \n",
+ " 2023-06-29 19:49:50.412307 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 12 | \n",
+ " 824 | \n",
+ " 284 | \n",
+ " 132 | \n",
+ " 2023-06-29 19:49:50.410715 | \n",
+ " 2023-06-29 19:49:50.412307 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 18 | \n",
+ " 1236 | \n",
+ " 426 | \n",
+ " 197 | \n",
+ " 2023-06-29 19:49:50.410715 | \n",
+ " 2023-06-29 19:49:50.412307 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 24 | \n",
+ " 1648 | \n",
+ " 568 | \n",
+ " 263 | \n",
+ " 2023-06-29 19:49:50.410715 | \n",
+ " 2023-06-29 19:49:50.412307 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
"text/plain": [
- ""
+ " item_id item_category item_shop item_brand item_intention \\\n",
+ "0 1 1 1 1 1 \n",
+ "1 2 6 412 142 66 \n",
+ "2 3 12 824 284 132 \n",
+ "3 4 18 1236 426 197 \n",
+ "4 5 24 1648 568 263 \n",
+ "\n",
+ " datetime created \n",
+ "0 2023-06-29 19:49:50.410715 2023-06-29 19:49:50.412307 \n",
+ "1 2023-06-29 19:49:50.410715 2023-06-29 19:49:50.412307 \n",
+ "2 2023-06-29 19:49:50.410715 2023-06-29 19:49:50.412307 \n",
+ "3 2023-06-29 19:49:50.410715 2023-06-29 19:49:50.412307 \n",
+ "4 2023-06-29 19:49:50.410715 2023-06-29 19:49:50.412307 "
]
},
- "execution_count": 16,
+ "execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "model_tt.compile(\n",
- " optimizer=\"adam\",\n",
- " run_eagerly=False,\n",
- " loss=\"categorical_crossentropy\",\n",
- " metrics=[mm.RecallAt(10), mm.NDCGAt(10)],\n",
- ")\n",
- "model_tt.fit(train_tt, validation_data=valid_tt, batch_size=1024 * 8, epochs=1)"
+ "item_features.head()"
]
},
{
- "cell_type": "markdown",
- "id": "80d83007-f9e8-408f-9f65-a0e9e19cb586",
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "c312884b-a1f8-4e08-8068-696e06a9bf46",
"metadata": {},
+ "outputs": [],
"source": [
- "### Exporting query (user) model"
+ "# save to disk\n",
+ "item_features.to_parquet(\n",
+ " os.path.join(feature_repo_path, \"data\", \"item_features.parquet\")\n",
+ ")"
]
},
{
"cell_type": "markdown",
- "id": "22af58a9-5525-454a-bf25-a9df0462aa53",
+ "id": "2e428d01-f2f0-42d4-85d0-0986bb83a847",
"metadata": {},
"source": [
- "We export the query tower to use it later during the model deployment stage with Merlin Systems."
+ "### Feature Engineering with NVTabular"
]
},
{
"cell_type": "code",
"execution_count": 17,
- "id": "d2370f13-ff9a-4ee0-ba1e-451c7bec0f8a",
+ "id": "d4bf870c-30cf-4074-88d3-b75981b3a873",
"metadata": {},
"outputs": [],
"source": [
- "query_tower = model_tt.retrieval_block.query_block()\n",
- "query_tower.save(os.path.join(BASE_DIR, \"query_tower\"))"
+ "output_path = os.path.join(DATA_FOLDER, \"processed_nvt\")"
]
},
{
"cell_type": "markdown",
- "id": "e16401d4",
- "metadata": {
- "tags": []
- },
+ "id": "1e7bfb5c-88ed-4cf9-8a17-98c0284adb36",
+ "metadata": {},
"source": [
- "### Training a Ranking Model with DLRM"
+ "In the following NVTabular workflow, notice that we apply the `Dropna()` operator at the end to remove rows with missing values in the final DataFrame after the preceding transformations. Although the synthetic dataset that we generate and use in this notebook does not have null entries, your own custom dataset might have nulls in its `user_id` and `item_id` columns. By applying `Dropna()` we avoid registering null `user_id_raw` and `item_id_raw` values in the feature store and the potential issues that such null entries can cause."
]
},
{
- "cell_type": "markdown",
- "id": "b72e8a2a-fc4a-43ab-934c-6d941c56aad2",
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "f91ada78-4e4d-4415-ab94-e351aa454e9e",
"metadata": {},
+ "outputs": [],
"source": [
- "Now we will move onto training an offline ranking model. This ranking model will be used for scoring our retrieved items."
+ "user_id_raw = [\"user_id\"] >> Rename(postfix='_raw') >> LambdaOp(lambda col: col.astype(\"int32\")) >> TagAsUserFeatures()\n",
+ "item_id_raw = [\"item_id\"] >> Rename(postfix='_raw') >> LambdaOp(lambda col: col.astype(\"int32\")) >> TagAsItemFeatures()\n",
+ "\n",
+ "\n",
+ "item_cat = Categorify(dtype=\"int32\")\n",
+ "items = ([\"item_id\",\"item_category\", \"item_shop\", \"item_brand\"] >> item_cat)\n",
+ "\n",
+ "subgraph_item = Subgraph(\n",
+ " \"item\", \n",
+ " Subgraph(\"items_cat\", items) + \n",
+ " (items[\"item_id\"] >> TagAsItemID()) + \n",
+ " (items[\"item_category\", \"item_shop\", \"item_brand\"] >> TagAsItemFeatures())\n",
+ ")\n",
+ "subgraph_user = Subgraph(\n",
+ " \"user\",\n",
+ " ([\"user_id\"] >> Categorify(dtype=\"int32\") >> TagAsUserID()) +\n",
+ " (\n",
+ " [\n",
+ " \"user_shops\",\n",
+ " \"user_profile\",\n",
+ " \"user_group\",\n",
+ " \"user_gender\",\n",
+ " \"user_age\",\n",
+ " \"user_consumption_2\",\n",
+ " \"user_is_occupied\",\n",
+ " \"user_geography\",\n",
+ " \"user_intentions\",\n",
+ " \"user_brands\",\n",
+ " \"user_categories\",\n",
+ " ] >> Categorify(dtype=\"int32\") >> TagAsUserFeatures()\n",
+ " )\n",
+ ")\n",
+ "\n",
+ "targets = [\"click\"] >> AddMetadata(tags=[Tags.BINARY_CLASSIFICATION, \"target\"])\n",
+ "outputs = subgraph_user + subgraph_item + targets\n",
+ "\n",
+ "# add dropna op to filter rows with nulls\n",
+ "outputs = outputs >> Dropna()\n",
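+ "\n",
+ "# Naming the Subgraphs \"user\" and \"item\" lets us pull each branch back out of the\n",
+ "# fitted workflow later with nvt_wkflow.get_subworkflow(\"user\") / get_subworkflow(\"item\"),\n",
+ "# e.g. to transform item features on their own before computing item embeddings.\n",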
+ "nvt_wkflow = nvt.Workflow(outputs)"
]
},
{
"cell_type": "markdown",
- "id": "c4f2b234",
+ "id": "71aae006-a161-4127-889a-8f433a9f7362",
"metadata": {},
"source": [
- "Read processed parquet files. We use the `schema` object to define our model."
+ "Let's call the `transform_aliccp` utility function to perform the `fit` and `transform` steps on the raw dataset, applying the operators defined in the NVTabular workflow pipeline above, and to save our workflow model. After fit and transform, the processed parquet files are saved to `output_path`."
]
},
{
"cell_type": "code",
- "execution_count": 18,
- "id": "cb870461-6ac2-49b2-ba6a-2da6ecb57f1d",
+ "execution_count": 19,
+ "id": "814e8438-642a-4f03-baaf-44dab8d1b5e5",
"metadata": {},
"outputs": [],
"source": [
- "# define train and valid dataset objects\n",
- "train = Dataset(os.path.join(output_path, \"train\", \"*.parquet\"), part_size=\"500MB\")\n",
- "valid = Dataset(os.path.join(output_path, \"valid\", \"*.parquet\"), part_size=\"500MB\")\n",
- "\n",
- "# define schema object\n",
- "schema = train.schema.without(['user_id_raw', 'item_id_raw'])"
+ "transform_aliccp(\n",
+ " (train_raw, valid_raw), output_path, nvt_workflow=nvt_wkflow, workflow_name=\"workflow\"\n",
+ ")"
]
},
{
- "cell_type": "code",
- "execution_count": 19,
- "id": "30e4ebc2",
+ "cell_type": "markdown",
+ "id": "09c87748-af61-42b8-8574-1afe3d71118f",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'click'"
- ]
- },
- "execution_count": 19,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
- "target_column = schema.select_by_tag(Tags.TARGET).column_names[0]\n",
- "target_column"
+ "### Training a Retrieval Model with Two-Tower Model"
]
},
{
"cell_type": "markdown",
- "id": "8f68e26b",
+ "id": "e644fcba-7b0b-44c0-97fd-80f4fcb01191",
"metadata": {},
"source": [
- "Deep Learning Recommendation Model [(DLRM)](https://arxiv.org/abs/1906.00091) architecture is a popular neural network model originally proposed by Facebook in 2019. The model was introduced as a personalization deep learning model that uses embeddings to process sparse features that represent categorical data and a multilayer perceptron (MLP) to process dense features, then interacts these features explicitly using the statistical techniques proposed in [here](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=5694074). To learn more about DLRM architetcture please visit `Exploring-different-models` [notebook](https://github.com/NVIDIA-Merlin/models/blob/stable/examples/04-Exporting-ranking-models.ipynb) in the Merlin Models GH repo."
+ "We start with the offline candidate retrieval stage. We are going to train a Two-Tower model for item retrieval. To learn more about the Two-tower model you can visit [05-Retrieval-Model.ipynb](https://github.com/NVIDIA-Merlin/models/blob/stable/examples/05-Retrieval-Model.ipynb)."
]
},
{
- "cell_type": "code",
- "execution_count": 20,
- "id": "e4325080",
+ "cell_type": "markdown",
+ "id": "cf9bca46-a6b6-4a73-afd8-fe2869c60748",
"metadata": {},
- "outputs": [],
"source": [
- "model = mm.DLRMModel(\n",
- " schema,\n",
- " embedding_dim=64,\n",
- " bottom_block=mm.MLPBlock([128, 64]),\n",
- " top_block=mm.MLPBlock([128, 64, 32]),\n",
- " prediction_tasks=mm.BinaryClassificationTask(target_column),\n",
- ")"
+ "#### Feature Engineering with NVTabular"
]
},
{
- "cell_type": "code",
- "execution_count": 21,
- "id": "bfe2aa9e",
+ "cell_type": "markdown",
+ "id": "da2b09cc-09fb-4814-a1cb-7e6168d9eb4b",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "5/5 [==============================] - 9s 519ms/step - loss: 0.6932 - auc: 0.5008 - regularization_loss: 0.0000e+00 - loss_batch: 0.6931 - val_loss: 0.6932 - val_auc: 0.5034 - val_regularization_loss: 0.0000e+00 - val_loss_batch: 0.6932\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- ""
- ]
- },
- "execution_count": 21,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
- "model.compile(optimizer=\"adam\", run_eagerly=False, metrics=[tf.keras.metrics.AUC()])\n",
- "model.fit(train, validation_data=valid, batch_size=16 * 1024)"
+ "We are going to process our raw categorical features by encoding them with the `Categorify()` operator and tag the features with `user` or `item` tags in the schema file. To learn more about [NVTabular](https://github.com/NVIDIA-Merlin/NVTabular) and the schema object, visit this example [notebook](https://github.com/NVIDIA-Merlin/models/blob/stable/examples/02-Merlin-Models-and-NVTabular-integration.ipynb) in the Merlin Models repo."
]
},
{
"cell_type": "markdown",
- "id": "498c4d49-7a59-4260-87b9-b86b66f2c67f",
+ "id": "f3bc7abd-8d97-452b-a4af-5227821a99c9",
"metadata": {},
"source": [
- "Let's save our DLRM model to be able to load back at the deployment stage. "
+ "Define a new output path to store the filtered datasets and schema files."
]
},
{
"cell_type": "code",
- "execution_count": 22,
- "id": "00447c12-ea80-4d98-ab47-cc1a982a6958",
+ "execution_count": 20,
+ "id": "df72a793-194b-44f4-80c3-aaa368a9a01e",
"metadata": {},
"outputs": [],
"source": [
- "model.save(os.path.join(BASE_DIR, \"dlrm\"))"
+ "output_path2 = os.path.join(DATA_FOLDER, \"processed/retrieval\")"
]
},
{
- "cell_type": "markdown",
- "id": "d64a3f3f-81d8-489c-835f-c62f76df22d5",
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "251d4697-8f9c-4c93-8de4-c3480a8378de",
"metadata": {},
+ "outputs": [],
"source": [
- "In the following cells we are going to export the required user and item features files, and save the query (user) tower model and item embeddings to disk. If you want to read more about exporting retrieval models, please visit [05-Retrieval-Model.ipynb](https://github.com/NVIDIA-Merlin/models/blob/stable/examples/05-Retrieval-Model.ipynb) notebook in Merlin Models library repo."
+ "train_tt = Dataset(os.path.join(output_path, \"train\", \"*.parquet\"))\n",
+ "valid_tt = Dataset(os.path.join(output_path, \"valid\", \"*.parquet\"))"
]
},
{
"cell_type": "markdown",
- "id": "5da1f434-f5a1-4478-b588-7e7ec17e6a88",
+ "id": "ffd7e2ac-a251-49d0-943b-e9272c852ba6",
"metadata": {},
"source": [
- "### Set up a feature store with Feast"
+ "We select only the positive interaction rows, where `click==1`, in the dataset using the `Filter()` operator."
]
},
{
- "cell_type": "markdown",
- "id": "99a4e939-d3cf-44f0-9012-d2af3264ee25",
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "7e085a6d-74ad-4c24-8e7c-4e449c15f471",
"metadata": {},
+ "outputs": [],
"source": [
- "Before we move onto the next step, we need to create a Feast feature repository. [Feast](https://feast.dev/) is an end-to-end open source feature store for machine learning. Feast (Feature Store) is a customizable operational data system that re-uses existing infrastructure to manage and serve machine learning features to real-time models.\n",
+ "inputs = train_tt.schema.column_names\n",
+ "outputs = inputs >> Filter(f=lambda df: df[\"click\"] == 1)\n",
"\n",
- "We will create the feature repo in the current working directory, which is `BASE_DIR` for us."
+ "nvt_wkflow.fit(train_tt)\n",
+ "\n",
+ "nvt_wkflow.transform(train_tt).to_parquet(\n",
+ " output_path=os.path.join(output_path2, \"train\")\n",
+ ")\n",
+ "\n",
+ "nvt_wkflow.transform(valid_tt).to_parquet(\n",
+ " output_path=os.path.join(output_path2, \"valid\")\n",
+ ")"
]
},
{
- "cell_type": "code",
- "execution_count": 23,
- "id": "2e7e96d2-9cd2-40d1-b356-8cd76b57bb4a",
+ "cell_type": "markdown",
+ "id": "cc4721ae-7228-4d3f-9586-dcdfefecc19f",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Creating a new Feast repository in \u001b[1m\u001b[32m/Merlin/examples/Building-and-deploying-multi-stage-RecSys/feast_repo\u001b[0m.\n",
- "\n"
- ]
- }
- ],
"source": [
- "!rm -rf $BASE_DIR/feast_repo\n",
- "!cd $BASE_DIR && feast init feast_repo"
+ "NVTabular exported the schema of our processed dataset as `schema.pbtxt`, a protobuf text file. To learn more about the schema object and schema file, you can explore the [02-Merlin-Models-and-NVTabular-integration.ipynb](https://github.com/NVIDIA-Merlin/models/blob/stable/examples/02-Merlin-Models-and-NVTabular-integration.ipynb) notebook."
]
},
{
"cell_type": "markdown",
- "id": "5e630e53-8336-487a-9ceb-133b1538acfb",
+ "id": "aa025b80-0f18-437c-a85f-4edcb89f4222",
"metadata": {},
"source": [
- "You should be seeing a message like Creating a new Feast repository in ... printed out above. Now, navigate to the `feature_repo` folder and remove the demo parquet file created by default, and `examples.py` file."
+ "**Read filtered parquet files as Dataset objects.**"
]
},
{
"cell_type": "code",
- "execution_count": 24,
- "id": "26ba2521-ed1b-4c2b-afdd-26b4a5a9c008",
+ "execution_count": 23,
+ "id": "252a8e60-b447-46b5-ade6-3557cbafa797",
"metadata": {},
"outputs": [],
"source": [
- "feature_repo_path = os.path.join(BASE_DIR, \"feast_repo/feature_repo\")\n",
- "if os.path.exists(f\"{feature_repo_path}/example_repo.py\"):\n",
- " os.remove(f\"{feature_repo_path}/example_repo.py\")\n",
- "if os.path.exists(f\"{feature_repo_path}/data/driver_stats.parquet\"):\n",
- " os.remove(f\"{feature_repo_path}/data/driver_stats.parquet\")"
+ "train_tt = Dataset(os.path.join(output_path2, \"train\", \"*.parquet\"), part_size=\"500MB\")\n",
+ "valid_tt = Dataset(os.path.join(output_path2, \"valid\", \"*.parquet\"), part_size=\"500MB\")"
]
},
{
- "cell_type": "markdown",
- "id": "78315676-eb6c-405a-b1fd-3174ea328406",
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "71063653-2f39-4b54-8399-145d6f281d4d",
"metadata": {},
+ "outputs": [],
"source": [
- "### Exporting user and item features"
+ "schema = train_tt.schema.select_by_tag([Tags.ITEM_ID, Tags.USER_ID, Tags.ITEM, Tags.USER]).without(['click'])\n",
+ "train_tt.schema = schema\n",
+ "valid_tt.schema = schema"
]
},
{
"cell_type": "code",
"execution_count": 25,
- "id": "ea0b369c-2f01-42e3-9f3c-74c3ff4a6d64",
+ "id": "9312511a-f368-42f2-93d2-eb95aebbf46c",
"metadata": {},
"outputs": [],
"source": [
- "from merlin.models.utils.dataset import unique_rows_by_features\n",
- "\n",
- "user_features = (\n",
- " unique_rows_by_features(train, Tags.USER, Tags.USER_ID)\n",
- " .compute()\n",
- " .reset_index(drop=True)\n",
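+ "# Two-Tower retrieval model: an MLP query (user) tower and an item tower trained with\n",
+ "# in-batch sampled negatives; embedding sizes are inferred from the schema cardinalities.\n",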
+ "model_tt = mm.TwoTowerModel(\n",
+ " schema,\n",
+ " query_tower=mm.MLPBlock([128, 64], no_activation_last_layer=True),\n",
+ " samplers=[mm.InBatchSampler()],\n",
+ " embedding_options=mm.EmbeddingOptions(infer_embedding_sizes=True),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 26,
- "id": "6b0949f9-e67a-414f-9d74-65f138e820a8",
+ "id": "4d47cb8b-e06a-4932-9a19-fb244ef43152",
"metadata": {},
"outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/usr/local/lib/python3.8/dist-packages/keras/initializers/initializers_v2.py:120: UserWarning: The initializer TruncatedNormal is unseeded and being called multiple times, which will return identical values each time (even if the initializer is unseeded). Please update your code to provide a seed to the initializer, or avoid using the same initalizer instance more than once.\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "9/9 [==============================] - 11s 275ms/step - loss: 8.9538 - recall_at_10: 0.0101 - ndcg_at_10: 0.0067 - regularization_loss: 0.0000e+00 - loss_batch: 8.8711 - val_loss: 8.9179 - val_recall_at_10: 0.0212 - val_ndcg_at_10: 0.0155 - val_regularization_loss: 0.0000e+00 - val_loss_batch: 8.5806\n"
+ ]
+ },
{
"data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " user_id | \n",
- " user_shops | \n",
- " user_profile | \n",
- " user_group | \n",
- " user_gender | \n",
- " user_age | \n",
- " user_consumption_2 | \n",
- " user_is_occupied | \n",
- " user_geography | \n",
- " user_intentions | \n",
- " user_brands | \n",
- " user_categories | \n",
- " user_id_raw | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 6 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 4 | \n",
- " 4 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 4 | \n",
- " 4 | \n",
- " 4 | \n",
- " 8 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 5 | \n",
- " 5 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 5 | \n",
- " 5 | \n",
- " 5 | \n",
- " 7 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 6 | \n",
- " 6 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 6 | \n",
- " 6 | \n",
- " 6 | \n",
- " 5 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 7 | \n",
- " 7 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 7 | \n",
- " 7 | \n",
- " 7 | \n",
- " 9 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
"text/plain": [
- " user_id user_shops user_profile user_group user_gender user_age \\\n",
- "0 3 3 3 3 3 3 \n",
- "1 4 4 3 3 3 3 \n",
- "2 5 5 3 3 3 3 \n",
- "3 6 6 3 3 3 3 \n",
- "4 7 7 3 3 3 3 \n",
- "\n",
- " user_consumption_2 user_is_occupied user_geography user_intentions \\\n",
- "0 3 3 3 3 \n",
- "1 3 3 3 4 \n",
- "2 3 3 3 5 \n",
- "3 3 3 3 6 \n",
- "4 3 3 3 7 \n",
- "\n",
- " user_brands user_categories user_id_raw \n",
- "0 3 3 6 \n",
- "1 4 4 8 \n",
- "2 5 5 7 \n",
- "3 6 6 5 \n",
- "4 7 7 9 "
+ ""
]
},
"execution_count": 26,
@@ -952,413 +905,209 @@
}
],
"source": [
- "user_features.head()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "4a46bd8c-1337-4c74-a85b-25348a897d90",
- "metadata": {},
- "source": [
- "We will artificially add `datetime` and `created` timestamp columns to our user_features dataframe. This required by Feast to track the user-item features and their creation time and to determine which version to use when we query Feast."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "id": "d30bd2f8-8a78-4df7-9bc4-42bd741c5b99",
- "metadata": {},
- "outputs": [],
- "source": [
- "from datetime import datetime\n",
- "\n",
- "user_features[\"datetime\"] = datetime.now()\n",
- "user_features[\"datetime\"] = user_features[\"datetime\"].astype(\"datetime64[ns]\")\n",
- "user_features[\"created\"] = datetime.now()\n",
- "user_features[\"created\"] = user_features[\"created\"].astype(\"datetime64[ns]\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "id": "d4998cd1-9dcd-4911-8f23-372e197b41e9",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " user_id | \n",
- " user_shops | \n",
- " user_profile | \n",
- " user_group | \n",
- " user_gender | \n",
- " user_age | \n",
- " user_consumption_2 | \n",
- " user_is_occupied | \n",
- " user_geography | \n",
- " user_intentions | \n",
- " user_brands | \n",
- " user_categories | \n",
- " user_id_raw | \n",
- " datetime | \n",
- " created | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 6 | \n",
- " 2023-06-20 23:47:09.436667 | \n",
- " 2023-06-20 23:47:09.438518 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 4 | \n",
- " 4 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 4 | \n",
- " 4 | \n",
- " 4 | \n",
- " 8 | \n",
- " 2023-06-20 23:47:09.436667 | \n",
- " 2023-06-20 23:47:09.438518 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 5 | \n",
- " 5 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 5 | \n",
- " 5 | \n",
- " 5 | \n",
- " 7 | \n",
- " 2023-06-20 23:47:09.436667 | \n",
- " 2023-06-20 23:47:09.438518 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 6 | \n",
- " 6 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 6 | \n",
- " 6 | \n",
- " 6 | \n",
- " 5 | \n",
- " 2023-06-20 23:47:09.436667 | \n",
- " 2023-06-20 23:47:09.438518 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 7 | \n",
- " 7 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 7 | \n",
- " 7 | \n",
- " 7 | \n",
- " 9 | \n",
- " 2023-06-20 23:47:09.436667 | \n",
- " 2023-06-20 23:47:09.438518 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " user_id user_shops user_profile user_group user_gender user_age \\\n",
- "0 3 3 3 3 3 3 \n",
- "1 4 4 3 3 3 3 \n",
- "2 5 5 3 3 3 3 \n",
- "3 6 6 3 3 3 3 \n",
- "4 7 7 3 3 3 3 \n",
- "\n",
- " user_consumption_2 user_is_occupied user_geography user_intentions \\\n",
- "0 3 3 3 3 \n",
- "1 3 3 3 4 \n",
- "2 3 3 3 5 \n",
- "3 3 3 3 6 \n",
- "4 3 3 3 7 \n",
- "\n",
- " user_brands user_categories user_id_raw datetime \\\n",
- "0 3 3 6 2023-06-20 23:47:09.436667 \n",
- "1 4 4 8 2023-06-20 23:47:09.436667 \n",
- "2 5 5 7 2023-06-20 23:47:09.436667 \n",
- "3 6 6 5 2023-06-20 23:47:09.436667 \n",
- "4 7 7 9 2023-06-20 23:47:09.436667 \n",
- "\n",
- " created \n",
- "0 2023-06-20 23:47:09.438518 \n",
- "1 2023-06-20 23:47:09.438518 \n",
- "2 2023-06-20 23:47:09.438518 \n",
- "3 2023-06-20 23:47:09.438518 \n",
- "4 2023-06-20 23:47:09.438518 "
- ]
- },
- "execution_count": 28,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "model_tt.compile(\n",
+ " optimizer=\"adam\",\n",
+ " run_eagerly=False,\n",
+ " loss=\"categorical_crossentropy\",\n",
+ " metrics=[mm.RecallAt(10), mm.NDCGAt(10)],\n",
+ ")\n",
+ "model_tt.fit(train_tt, validation_data=valid_tt, batch_size=1024 * 8, epochs=1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "80d83007-f9e8-408f-9f65-a0e9e19cb586",
+ "metadata": {},
"source": [
- "user_features.head()"
+ "### Exporting query (user) model"
]
},
{
- "cell_type": "code",
- "execution_count": 29,
- "id": "2981b3ed-6156-49f0-aa14-326a3853a58a",
+ "cell_type": "markdown",
+ "id": "22af58a9-5525-454a-bf25-a9df0462aa53",
"metadata": {},
- "outputs": [],
"source": [
- "user_features.to_parquet(os.path.join(feature_repo_path, \"data\", \"user_features.parquet\"))"
+ "We export the query tower to use it later during the model deployment stage with Merlin Systems."
]
},
{
"cell_type": "code",
- "execution_count": 30,
- "id": "0a33a668-8e2a-4546-8f54-0060d405ba91",
+ "execution_count": 27,
+ "id": "d2370f13-ff9a-4ee0-ba1e-451c7bec0f8a",
"metadata": {},
"outputs": [],
"source": [
- "item_features = (\n",
- " unique_rows_by_features(train, Tags.ITEM, Tags.ITEM_ID)\n",
- " .compute()\n",
- " .reset_index(drop=True)\n",
- ")"
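+ "# Save the query (user) tower on its own; it is loaded back in the deployment notebook\n",
+ "# to compute user embeddings for candidate retrieval.\n",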
+ "query_tower = model_tt.retrieval_block.query_block()\n",
+ "query_tower.save(os.path.join(BASE_DIR, \"query_tower\"))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e16401d4",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "### Training a Ranking Model with DLRM"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b72e8a2a-fc4a-43ab-934c-6d941c56aad2",
+ "metadata": {},
+ "source": [
+ "Now we will move onto training an offline ranking model. This ranking model will be used for scoring our retrieved items."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c4f2b234",
+ "metadata": {},
+ "source": [
+ "Read processed parquet files. We use the `schema` object to define our model."
]
},
{
"cell_type": "code",
- "execution_count": 31,
- "id": "68a694d6-926f-4b0f-8edc-8cc7ac85ade7",
+ "execution_count": 28,
+ "id": "cb870461-6ac2-49b2-ba6a-2da6ecb57f1d",
"metadata": {},
"outputs": [],
"source": [
- "item_features[\"datetime\"] = datetime.now()\n",
- "item_features[\"datetime\"] = item_features[\"datetime\"].astype(\"datetime64[ns]\")\n",
- "item_features[\"created\"] = datetime.now()\n",
- "item_features[\"created\"] = item_features[\"created\"].astype(\"datetime64[ns]\")"
+ "# define train and valid dataset objects\n",
+ "train = Dataset(os.path.join(output_path, \"train\", \"*.parquet\"), part_size=\"500MB\")\n",
+ "valid = Dataset(os.path.join(output_path, \"valid\", \"*.parquet\"), part_size=\"500MB\")\n",
+ "\n",
+ "# define schema object\n",
+ "schema = train.schema"
]
},
{
"cell_type": "code",
- "execution_count": 32,
- "id": "6c03fa22-b112-4243-bbe1-1cd7260cb85b",
+ "execution_count": 29,
+ "id": "30e4ebc2",
"metadata": {},
"outputs": [
{
"data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " item_id | \n",
- " item_category | \n",
- " item_shop | \n",
- " item_brand | \n",
- " item_id_raw | \n",
- " datetime | \n",
- " created | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 3 | \n",
- " 6 | \n",
- " 2023-06-20 23:47:09.557793 | \n",
- " 2023-06-20 23:47:09.559325 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 4 | \n",
- " 4 | \n",
- " 4 | \n",
- " 4 | \n",
- " 7 | \n",
- " 2023-06-20 23:47:09.557793 | \n",
- " 2023-06-20 23:47:09.559325 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 5 | \n",
- " 5 | \n",
- " 5 | \n",
- " 5 | \n",
- " 10 | \n",
- " 2023-06-20 23:47:09.557793 | \n",
- " 2023-06-20 23:47:09.559325 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 6 | \n",
- " 6 | \n",
- " 6 | \n",
- " 6 | \n",
- " 8 | \n",
- " 2023-06-20 23:47:09.557793 | \n",
- " 2023-06-20 23:47:09.559325 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 7 | \n",
- " 7 | \n",
- " 7 | \n",
- " 7 | \n",
- " 5 | \n",
- " 2023-06-20 23:47:09.557793 | \n",
- " 2023-06-20 23:47:09.559325 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
"text/plain": [
- " item_id item_category item_shop item_brand item_id_raw \\\n",
- "0 3 3 3 3 6 \n",
- "1 4 4 4 4 7 \n",
- "2 5 5 5 5 10 \n",
- "3 6 6 6 6 8 \n",
- "4 7 7 7 7 5 \n",
- "\n",
- " datetime created \n",
- "0 2023-06-20 23:47:09.557793 2023-06-20 23:47:09.559325 \n",
- "1 2023-06-20 23:47:09.557793 2023-06-20 23:47:09.559325 \n",
- "2 2023-06-20 23:47:09.557793 2023-06-20 23:47:09.559325 \n",
- "3 2023-06-20 23:47:09.557793 2023-06-20 23:47:09.559325 \n",
- "4 2023-06-20 23:47:09.557793 2023-06-20 23:47:09.559325 "
+ "'click'"
]
},
- "execution_count": 32,
+ "execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "item_features.head()"
+ "target_column = schema.select_by_tag(Tags.TARGET).column_names[0]\n",
+ "target_column"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8f68e26b",
+ "metadata": {},
+ "source": [
+ "Deep Learning Recommendation Model [(DLRM)](https://arxiv.org/abs/1906.00091) is a popular neural network architecture originally proposed by Facebook in 2019. It was introduced as a personalization deep learning model that uses embeddings to process sparse features that represent categorical data and a multilayer perceptron (MLP) to process dense features, then interacts these features explicitly using the statistical techniques proposed [here](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=5694074). To learn more about the DLRM architecture, please visit the `Exploring-different-models` [notebook](https://github.com/NVIDIA-Merlin/models/blob/stable/examples/04-Exporting-ranking-models.ipynb) in the Merlin Models GH repo."
]
},
{
"cell_type": "code",
- "execution_count": 33,
- "id": "c312884b-a1f8-4e08-8068-696e06a9bf46",
+ "execution_count": 30,
+ "id": "e4325080",
"metadata": {},
"outputs": [],
"source": [
- "# save to disk\n",
- "item_features.to_parquet(\n",
- " os.path.join(feature_repo_path, \"data\", \"item_features.parquet\")\n",
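+ "# DLRM ranking model: categorical features get 64-dim embeddings, the bottom MLP handles\n",
+ "# continuous features, and pairwise feature interactions feed the top MLP that predicts\n",
+ "# the binary \"click\" target.\n",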
+ "model = mm.DLRMModel(\n",
+ " schema,\n",
+ " embedding_dim=64,\n",
+ " bottom_block=mm.MLPBlock([128, 64]),\n",
+ " top_block=mm.MLPBlock([128, 64, 32]),\n",
+ " prediction_tasks=mm.BinaryClassificationTask(target_column),\n",
")"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "id": "bfe2aa9e",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "5/5 [==============================] - 5s 305ms/step - loss: 0.6932 - auc: 0.5005 - regularization_loss: 0.0000e+00 - loss_batch: 0.6932 - val_loss: 0.6931 - val_auc: 0.5029 - val_regularization_loss: 0.0000e+00 - val_loss_batch: 0.6931\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "model.compile(optimizer=\"adam\", run_eagerly=False, metrics=[tf.keras.metrics.AUC()])\n",
+ "model.fit(train, validation_data=valid, batch_size=16 * 1024)"
+ ]
+ },
{
"cell_type": "markdown",
- "id": "ff30ceab-b264-4509-9c5b-5a10425e143b",
+ "id": "498c4d49-7a59-4260-87b9-b86b66f2c67f",
"metadata": {},
"source": [
- "### Extract and save Item embeddings"
+ "Let's save our DLRM model so that we can load it back at the deployment stage."
]
},
{
"cell_type": "code",
- "execution_count": 34,
- "id": "00f1fe65-882e-4962-bb16-19a130fda215",
+ "execution_count": 32,
+ "id": "00447c12-ea80-4d98-ab47-cc1a982a6958",
"metadata": {},
"outputs": [],
"source": [
- "item_embs = model_tt.item_embeddings(\n",
- " Dataset(item_features, schema=schema), batch_size=1024\n",
- ")\n",
- "item_embs_df = item_embs.compute(scheduler=\"synchronous\")"
+ "model.save(os.path.join(BASE_DIR, \"dlrm\"))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d64a3f3f-81d8-489c-835f-c62f76df22d5",
+ "metadata": {},
+ "source": [
+ "In the following cells we are going to export the required user and item feature files, and save the query (user) tower model and item embeddings to disk. If you want to read more about exporting retrieval models, please visit the [05-Retrieval-Model.ipynb](https://github.com/NVIDIA-Merlin/models/blob/stable/examples/05-Retrieval-Model.ipynb) notebook in the Merlin Models library repo."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ff30ceab-b264-4509-9c5b-5a10425e143b",
+ "metadata": {},
+ "source": [
+ "### Extract and save Item embeddings"
]
},
{
"cell_type": "code",
- "execution_count": 35,
- "id": "cf8b82ea-6cce-4dab-ad17-114b5e7eabd4",
+ "execution_count": 33,
+ "id": "e62f65f8-e8f1-447e-9500-5960807c36f2",
"metadata": {},
"outputs": [],
"source": [
- "# select only item_id together with embedding columns\n",
- "item_embeddings = item_embs_df.drop(\n",
- " columns=[\"item_category\", \"item_shop\", \"item_brand\"]\n",
- ")"
+ "from merlin.systems.dag.ops.tensorflow import PredictTensorflow\n",
+ "from merlin.systems.dag.ops.workflow import TransformWorkflow\n",
+ "\n",
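+ "# Re-apply the fitted \"item\" subworkflow to the raw item features, then run the item\n",
+ "# tower of the Two-Tower model to produce one embedding vector per item_id.\n",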
+ "workflow = nvt.Workflow(\n",
+ "    [\"item_id\"]\n",
+ "    + (\n",
+ "        [\"item_id\", \"item_brand\", \"item_category\", \"item_shop\"]\n",
+ "        >> TransformWorkflow(nvt_wkflow.get_subworkflow(\"item\"))\n",
+ "        >> PredictTensorflow(model_tt.first.item_block())\n",
+ "    )\n",
+ ")\n",
+ "item_embeddings = workflow.fit_transform(Dataset(item_features)).to_ddf().compute()"
]
},
{
"cell_type": "code",
- "execution_count": 36,
+ "execution_count": 34,
"id": "e02f0957-6665-400a-80c0-60b307466caf",
"metadata": {},
"outputs": [
@@ -1384,191 +1133,60 @@
" \n",
" | \n",
" item_id | \n",
- " 0 | \n",
- " 1 | \n",
- " 2 | \n",
- " 3 | \n",
- " 4 | \n",
- " 5 | \n",
- " 6 | \n",
- " 7 | \n",
- " 8 | \n",
- " ... | \n",
- " 54 | \n",
- " 55 | \n",
- " 56 | \n",
- " 57 | \n",
- " 58 | \n",
- " 59 | \n",
- " 60 | \n",
- " 61 | \n",
- " 62 | \n",
- " 63 | \n",
+ " output_1 | \n",
"
\n",
" \n",
" \n",
" \n",
- " 0 | \n",
- " 3 | \n",
- " -0.055164 | \n",
- " -0.018832 | \n",
- " -0.009478 | \n",
- " -0.016874 | \n",
- " 0.015988 | \n",
- " -0.022928 | \n",
- " 0.022611 | \n",
- " -0.030984 | \n",
- " -0.045701 | \n",
- " ... | \n",
- " 0.007060 | \n",
- " 0.032204 | \n",
- " 0.011515 | \n",
- " 0.012811 | \n",
- " 0.002650 | \n",
- " 0.023448 | \n",
- " 0.021759 | \n",
- " -0.011316 | \n",
- " -0.035275 | \n",
- " -0.004572 | \n",
+ " 453 | \n",
+ " 945 | \n",
+ " [0.012117806822061539, -0.02241620607674122, 0... | \n",
"
\n",
" \n",
- " 1 | \n",
- " 4 | \n",
- " -0.027412 | \n",
- " -0.007417 | \n",
- " -0.023730 | \n",
- " -0.008385 | \n",
- " 0.028241 | \n",
- " -0.004143 | \n",
- " 0.001301 | \n",
- " -0.040613 | \n",
- " -0.020645 | \n",
- " ... | \n",
- " 0.001835 | \n",
- " 0.010697 | \n",
- " 0.006311 | \n",
- " 0.007290 | \n",
- " -0.014959 | \n",
- " 0.025217 | \n",
- " 0.041697 | \n",
- " -0.012126 | \n",
- " -0.022523 | \n",
- " -0.001903 | \n",
+ " 454 | \n",
+ " 948 | \n",
+ " [0.012117806822061539, -0.02241620607674122, 0... | \n",
"
\n",
" \n",
- " 2 | \n",
- " 5 | \n",
- " -0.009581 | \n",
- " 0.016263 | \n",
- " -0.027931 | \n",
- " -0.023079 | \n",
- " 0.006483 | \n",
- " 0.006133 | \n",
- " -0.027449 | \n",
- " 0.027797 | \n",
- " 0.045743 | \n",
- " ... | \n",
- " -0.003662 | \n",
- " 0.054940 | \n",
- " 0.013501 | \n",
- " -0.004127 | \n",
- " -0.001858 | \n",
- " -0.000462 | \n",
- " -0.018047 | \n",
- " 0.036427 | \n",
- " 0.009524 | \n",
- " 0.006689 | \n",
+ " 455 | \n",
+ " 956 | \n",
+ " [0.012117806822061539, -0.02241620607674122, 0... | \n",
"
\n",
" \n",
- " 3 | \n",
- " 6 | \n",
- " -0.007599 | \n",
- " -0.012074 | \n",
- " 0.024879 | \n",
- " -0.008080 | \n",
- " -0.025010 | \n",
- " -0.000266 | \n",
- " 0.005489 | \n",
- " -0.014263 | \n",
- " -0.019343 | \n",
- " ... | \n",
- " -0.030220 | \n",
- " 0.011863 | \n",
- " -0.008515 | \n",
- " 0.011286 | \n",
- " -0.000907 | \n",
- " 0.014882 | \n",
- " 0.035699 | \n",
- " -0.007068 | \n",
- " 0.012995 | \n",
- " 0.001644 | \n",
+ " 456 | \n",
+ " 1437 | \n",
+ " [0.012117806822061539, -0.02241620607674122, 0... | \n",
"
\n",
" \n",
- " 4 | \n",
- " 7 | \n",
- " -0.070002 | \n",
- " 0.001031 | \n",
- " -0.001309 | \n",
- " -0.014118 | \n",
- " -0.036672 | \n",
- " -0.012943 | \n",
- " 0.009711 | \n",
- " -0.008856 | \n",
- " -0.032054 | \n",
- " ... | \n",
- " -0.023113 | \n",
- " 0.000600 | \n",
- " -0.005711 | \n",
- " 0.044277 | \n",
- " -0.004765 | \n",
- " 0.016184 | \n",
- " 0.028223 | \n",
- " 0.002914 | \n",
- " 0.032516 | \n",
- " 0.026521 | \n",
+ " 457 | \n",
+ " 1469 | \n",
+ " [0.012117806822061539, -0.02241620607674122, 0... | \n",
"
\n",
" \n",
"\n",
- "5 rows × 65 columns
\n",
""
],
"text/plain": [
- " item_id 0 1 2 3 4 5 \\\n",
- "0 3 -0.055164 -0.018832 -0.009478 -0.016874 0.015988 -0.022928 \n",
- "1 4 -0.027412 -0.007417 -0.023730 -0.008385 0.028241 -0.004143 \n",
- "2 5 -0.009581 0.016263 -0.027931 -0.023079 0.006483 0.006133 \n",
- "3 6 -0.007599 -0.012074 0.024879 -0.008080 -0.025010 -0.000266 \n",
- "4 7 -0.070002 0.001031 -0.001309 -0.014118 -0.036672 -0.012943 \n",
- "\n",
- " 6 7 8 ... 54 55 56 57 \\\n",
- "0 0.022611 -0.030984 -0.045701 ... 0.007060 0.032204 0.011515 0.012811 \n",
- "1 0.001301 -0.040613 -0.020645 ... 0.001835 0.010697 0.006311 0.007290 \n",
- "2 -0.027449 0.027797 0.045743 ... -0.003662 0.054940 0.013501 -0.004127 \n",
- "3 0.005489 -0.014263 -0.019343 ... -0.030220 0.011863 -0.008515 0.011286 \n",
- "4 0.009711 -0.008856 -0.032054 ... -0.023113 0.000600 -0.005711 0.044277 \n",
- "\n",
- " 58 59 60 61 62 63 \n",
- "0 0.002650 0.023448 0.021759 -0.011316 -0.035275 -0.004572 \n",
- "1 -0.014959 0.025217 0.041697 -0.012126 -0.022523 -0.001903 \n",
- "2 -0.001858 -0.000462 -0.018047 0.036427 0.009524 0.006689 \n",
- "3 -0.000907 0.014882 0.035699 -0.007068 0.012995 0.001644 \n",
- "4 -0.004765 0.016184 0.028223 0.002914 0.032516 0.026521 \n",
- "\n",
- "[5 rows x 65 columns]"
+ " item_id output_1\n",
+ "453 945 [0.012117806822061539, -0.02241620607674122, 0...\n",
+ "454 948 [0.012117806822061539, -0.02241620607674122, 0...\n",
+ "455 956 [0.012117806822061539, -0.02241620607674122, 0...\n",
+ "456 1437 [0.012117806822061539, -0.02241620607674122, 0...\n",
+ "457 1469 [0.012117806822061539, -0.02241620607674122, 0..."
]
},
- "execution_count": 36,
+ "execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "item_embeddings.head()"
+ "item_embeddings.tail()"
]
},
{
"cell_type": "code",
- "execution_count": 37,
+ "execution_count": 35,
"id": "66d7271e-0ea6-4568-ac5a-04089735f542",
"metadata": {},
"outputs": [],
@@ -1595,7 +1213,7 @@
},
{
"cell_type": "code",
- "execution_count": 38,
+ "execution_count": 36,
"id": "4ee27d67-e35a-42c5-8025-ed73f35c8e13",
"metadata": {},
"outputs": [],
@@ -1614,11 +1232,11 @@
" created_timestamp_column=\"created\",\n",
")\n",
"\n",
- "user_raw = Entity(name=\"user_id_raw\", value_type=ValueType.INT32, join_keys=[\"user_id_raw\"],)\n",
+ "user = Entity(name=\"user_id\", value_type=ValueType.INT32, join_keys=[\"user_id\"],)\n",
"\n",
"user_features_view = FeatureView(\n",
" name=\"user_features\",\n",
- " entities=[user_raw],\n",
+ " entities=[user],\n",
" ttl=timedelta(0),\n",
" schema=[\n",
" Field(name=\"user_shops\", dtype=Int32),\n",
@@ -1632,7 +1250,6 @@
" Field(name=\"user_intentions\", dtype=Int32),\n",
" Field(name=\"user_brands\", dtype=Int32),\n",
" Field(name=\"user_categories\", dtype=Int32),\n",
- " Field(name=\"user_id\", dtype=Int32),\n",
" ],\n",
" online=True,\n",
" source=user_features,\n",
@@ -1647,7 +1264,7 @@
},
{
"cell_type": "code",
- "execution_count": 39,
+ "execution_count": 37,
"id": "48a5927c-840d-410c-8f5b-bebce4f79640",
"metadata": {},
"outputs": [],
@@ -1676,7 +1293,6 @@
" Field(name=\"item_category\", dtype=Int32),\n",
" Field(name=\"item_shop\", dtype=Int32),\n",
" Field(name=\"item_brand\", dtype=Int32),\n",
- " Field(name=\"item_id_raw\", dtype=Int32),\n",
" ],\n",
" online=True,\n",
" source=item_features,\n",
@@ -1699,7 +1315,7 @@
},
{
"cell_type": "code",
- "execution_count": 40,
+ "execution_count": 38,
"id": "57133c1e-18d9-4ccb-9704-cdebd271985e",
"metadata": {},
"outputs": [
@@ -1719,7 +1335,7 @@
},
{
"cell_type": "code",
- "execution_count": 41,
+ "execution_count": 39,
"id": "986d53ea-c946-4046-a390-6d3b8801d280",
"metadata": {},
"outputs": [
@@ -1749,8 +1365,9 @@
"source": [
"import seedir as sd\n",
"\n",
+ "feature_repo_path = os.path.join(BASE_DIR, \"feast_repo\")\n",
"sd.seedir(\n",
- " os.path.join(BASE_DIR, \"feast_repo\"),\n",
+ " feature_repo_path,\n",
" style=\"lines\",\n",
" itemlimit=10,\n",
" depthlimit=3,\n",
@@ -1769,6 +1386,14 @@
"\n",
"For the next step, move on to the `02-Deploying-multi-stage-Recsys-with-Merlin-Systems.ipynb` notebook to deploy our saved models as an ensemble to TIS and obtain prediction results for a given request."
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3c5bd646-8121-4f32-bff8-137d50e3b8a2",
+ "metadata": {},
+ "outputs": [],
+ "source": []
}
],
"metadata": {
diff --git a/examples/Building-and-deploying-multi-stage-RecSys/02-Deploying-multi-stage-RecSys-with-Merlin-Systems.ipynb b/examples/Building-and-deploying-multi-stage-RecSys/02-Deploying-multi-stage-RecSys-with-Merlin-Systems.ipynb
index ff84e063e..e97257251 100644
--- a/examples/Building-and-deploying-multi-stage-RecSys/02-Deploying-multi-stage-RecSys-with-Merlin-Systems.ipynb
+++ b/examples/Building-and-deploying-multi-stage-RecSys/02-Deploying-multi-stage-RecSys-with-Merlin-Systems.ipynb
@@ -7,7 +7,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# Copyright 2021 NVIDIA Corporation. All Rights Reserved.\n",
+ "# Copyright 2023 NVIDIA Corporation. All Rights Reserved.\n",
"#\n",
"# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"# you may not use this file except in compliance with the License.\n",
@@ -27,6 +27,7 @@
]
},
{
+ "attachments": {},
"cell_type": "markdown",
"id": "03166488-1651-4025-84ed-4e9e5db34933",
"metadata": {},
@@ -43,6 +44,7 @@
]
},
{
+ "attachments": {},
"cell_type": "markdown",
"id": "38d75184-cd24-4fe3-90f4-d76028626576",
"metadata": {},
@@ -51,6 +53,7 @@
]
},
{
+ "attachments": {},
"cell_type": "markdown",
"id": "da9dadb5-6eec-4a1b-99f9-929523f5cc07",
"metadata": {},
@@ -59,6 +62,7 @@
]
},
{
+ "attachments": {},
"cell_type": "markdown",
"id": "538677a3-acc6-48f6-acb6-d5bb5fe2e2d2",
"metadata": {},
@@ -67,6 +71,7 @@
]
},
{
+ "attachments": {},
"cell_type": "markdown",
"id": "a27e18d7-b3e4-481c-b69e-23193b212c56",
"metadata": {},
@@ -75,11 +80,11 @@
"\n",
"In case you need to install them for running this example on GPU, execute the following script in a cell.\n",
"```\n",
- "%pip install \"feast==0.31\" faiss-gpu\n",
+ "%pip install \"feast<0.31\" faiss-gpu\n",
"```\n",
"or the following script in a cell for CPU.\n",
"```\n",
- "%pip install tensorflow-cpu \"feast==0.31\" faiss-cpu\n",
+ "%pip install tensorflow-cpu \"feast<0.31\" faiss-cpu\n",
"```"
]
},
@@ -99,7 +104,7 @@
" _descriptor.FieldDescriptor(\n",
"/usr/local/lib/python3.8/dist-packages/cudf/utils/metadata/orc_column_statistics_pb2.py:30: DeprecationWarning: Call to deprecated create function Descriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool.\n",
" _INTEGERSTATISTICS = _descriptor.Descriptor(\n",
- "2023-06-20 23:49:49.177129: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n",
+ "2023-06-29 19:50:56.885234: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n",
"To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"/usr/local/lib/python3.8/dist-packages/tensorflow/core/framework/tensor_shape_pb2.py:18: DeprecationWarning: Call to deprecated create function FileDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool.\n",
" DESCRIPTOR = _descriptor.FileDescriptor(\n",
@@ -140,10 +145,12 @@
"from merlin.systems.dag.ops.softmax_sampling import SoftmaxSampling\n",
"from merlin.systems.dag.ops.tensorflow import PredictTensorflow\n",
"from merlin.systems.dag.ops.unroll_features import UnrollFeatures\n",
- "from merlin.systems.triton.utils import send_triton_request"
+ "from merlin.systems.triton.utils import send_triton_request\n",
+ "from merlin.systems.dag.ops.workflow import TransformWorkflow"
]
},
{
+ "attachments": {},
"cell_type": "markdown",
"id": "55ead20e-c573-462e-9aa2-c3494bf0129f",
"metadata": {},
@@ -152,6 +159,7 @@
]
},
{
+ "attachments": {},
"cell_type": "markdown",
"id": "e2ac115e-4794-4a69-a962-8481f6e86df3",
"metadata": {},
@@ -169,6 +177,8 @@
"outputs": [],
"source": [
"BASE_DIR = os.environ.get(\"BASE_DIR\", \"/Merlin/examples/Building-and-deploying-multi-stage-RecSys/\")\n",
+ "DATA_FOLDER = os.environ.get(\"DATA_FOLDER\", \"/workspace/data/\")\n",
+ "\n",
"\n",
"# define feature repo path\n",
"feast_repo_path = os.path.join(BASE_DIR, \"feast_repo/feature_repo/\")"
@@ -184,8 +194,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "/Merlin/examples/Building-and-deploying-multi-stage-RecSys/feast_repo/feature_repo\n",
- "Created entity \u001b[1m\u001b[32muser_id_raw\u001b[0m\n",
+ "/raid/workshared/merlin/examples/Building-and-deploying-multi-stage-RecSys/feast_repo/feature_repo\n",
+ "Created entity \u001b[1m\u001b[32muser_id\u001b[0m\n",
"Created entity \u001b[1m\u001b[32mitem_id\u001b[0m\n",
"Created feature view \u001b[1m\u001b[32muser_features\u001b[0m\n",
"Created feature view \u001b[1m\u001b[32mitem_features\u001b[0m\n",
@@ -202,6 +212,7 @@
]
},
{
+ "attachments": {},
"cell_type": "markdown",
"id": "c641fcd2-bd11-4569-80d4-2ae5e01a5cad",
"metadata": {},
@@ -228,9 +239,9 @@
"Materializing \u001b[1m\u001b[32m2\u001b[0m feature views from \u001b[1m\u001b[32m1995-01-01 01:01:01+00:00\u001b[0m to \u001b[1m\u001b[32m2025-01-01 01:01:01+00:00\u001b[0m into the \u001b[1m\u001b[32msqlite\u001b[0m online store.\n",
"\n",
"\u001b[1m\u001b[32muser_features\u001b[0m:\n",
- "100%|███████████████████████████████████████████████████████████| 456/456 [00:00<00:00, 1136.51it/s]\n",
+ "100%|███████████████████████████████████████████████████████████| 460/460 [00:00<00:00, 2521.27it/s]\n",
"\u001b[1m\u001b[32mitem_features\u001b[0m:\n",
- "100%|███████████████████████████████████████████████████████████| 436/436 [00:00<00:00, 2878.99it/s]\n"
+ "100%|███████████████████████████████████████████████████████████| 458/458 [00:00<00:00, 3335.12it/s]\n"
]
}
],
@@ -239,6 +250,7 @@
]
},
{
+ "attachments": {},
"cell_type": "markdown",
"id": "8fcc26e6-f6f3-4e44-bf3c-3b8e66dc9fd6",
"metadata": {},
@@ -279,6 +291,7 @@
]
},
{
+ "attachments": {},
"cell_type": "markdown",
"id": "e768637c-0a4d-404b-8b58-7182fef0ab0e",
"metadata": {},
@@ -287,6 +300,7 @@
]
},
{
+ "attachments": {},
"cell_type": "markdown",
"id": "efada1e1-2556-4a26-b0ba-9cb96b3b151f",
"metadata": {},
@@ -306,6 +320,7 @@
]
},
{
+ "attachments": {},
"cell_type": "markdown",
"id": "2aa037c0-7dad-427c-98bb-3da413e8fd14",
"metadata": {},
@@ -326,6 +341,7 @@
]
},
{
+ "attachments": {},
"cell_type": "markdown",
"id": "8b996019-bd2a-44e0-b004-4f412b300d63",
"metadata": {},
@@ -345,21 +361,19 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "WARNING clustering 436 points to 32 centroids: please provide at least 1248 training points\n"
+ "WARNING clustering 458 points to 32 centroids: please provide at least 1248 training points\n"
]
}
],
"source": [
"from merlin.systems.dag.ops.faiss import QueryFaiss, setup_faiss \n",
"\n",
- "item_embeddings = np.ascontiguousarray(\n",
- " pd.read_parquet(os.path.join(BASE_DIR, \"item_embeddings.parquet\")).to_numpy()\n",
- ")\n",
- "item_embeddings_df = pd.DataFrame({\"item_id\": item_embeddings[:,0].astype(int), \"embedding\": item_embeddings[:,1:].tolist()})\n",
- "setup_faiss(item_embeddings_df, faiss_index_path)"
+ "item_embeddings = pd.read_parquet(os.path.join(BASE_DIR, \"item_embeddings.parquet\"))\n",
+ "setup_faiss(item_embeddings, faiss_index_path, embedding_column=\"output_1\")"
]
},
{
+ "attachments": {},
"cell_type": "markdown",
"id": "46697177-512a-473e-8cca-9fe51d3daa03",
"metadata": {},
@@ -378,6 +392,7 @@
]
},
{
+ "attachments": {},
"cell_type": "markdown",
"id": "5c45df06-0cbe-4b52-ac1f-786e763895d7",
"metadata": {},
@@ -395,9 +410,9 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Materializing \u001b[1m\u001b[32m1\u001b[0m feature views to \u001b[1m\u001b[32m2023-06-20 23:50:04+00:00\u001b[0m into the \u001b[1m\u001b[32msqlite\u001b[0m online store.\n",
+ "Materializing \u001b[1m\u001b[32m1\u001b[0m feature views to \u001b[1m\u001b[32m2023-06-29 19:51:06+00:00\u001b[0m into the \u001b[1m\u001b[32msqlite\u001b[0m online store.\n",
"\n",
- "\u001b[1m\u001b[32muser_features\u001b[0m from \u001b[1m\u001b[32m2025-01-01 01:01:01+00:00\u001b[0m to \u001b[1m\u001b[32m2023-06-20 23:50:04+00:00\u001b[0m:\n"
+ "\u001b[1m\u001b[32muser_features\u001b[0m from \u001b[1m\u001b[32m2025-01-01 01:01:01+00:00\u001b[0m to \u001b[1m\u001b[32m2023-06-29 19:51:06+00:00\u001b[0m:\n"
]
},
{
@@ -411,15 +426,30 @@
"source": [
"from merlin.systems.dag.ops.feast import QueryFeast \n",
"\n",
- "user_features = [\"user_id_raw\"] >> QueryFeast.from_feature_view(\n",
+ "user_attributes = [\"user_id\"] >> QueryFeast.from_feature_view(\n",
" store=feature_store,\n",
" view=\"user_features\",\n",
- " column=\"user_id_raw\",\n",
- " include_id=False,\n",
+ " column=\"user_id\",\n",
+ " include_id=True,\n",
")"
]
},
{
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "f11299b6-20d4-4687-bb0e-b855a9bcb9eb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from nvtabular import Workflow\n",
+ "\n",
+ "nvt_workflow = Workflow.load(os.path.join(DATA_FOLDER, 'processed_nvt/workflow'))\n",
+ "user_subgraph = nvt_workflow.get_subworkflow(\"user\")\n",
+ "user_features = user_attributes >> TransformWorkflow(user_subgraph)"
+ ]
+ },
+ {
+ "attachments": {},
"cell_type": "markdown",
"id": "27e25be7-3ff0-49c2-a3fc-03ec4d615e77",
"metadata": {},
@@ -429,7 +459,7 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 13,
"id": "21139caa-3a51-42e6-b006-21a92c95f1bc",
"metadata": {},
"outputs": [
@@ -439,7 +469,7 @@
""
]
},
- "execution_count": 12,
+ "execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
@@ -448,12 +478,13 @@
"# prevent TF to claim all GPU memory\n",
"from merlin.dataloader.tf_utils import configure_tensorflow\n",
"\n",
+ "\n",
"configure_tensorflow()"
]
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 14,
"id": "47c2d9b1-51dc-4549-977d-d7941ee6486c",
"metadata": {},
"outputs": [
@@ -461,9 +492,10 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "2023-06-20 23:50:06.005776: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n",
+ "2023-06-29 19:51:07.269579: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n",
"To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
- "2023-06-20 23:50:09.981326: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 8192 MB memory: -> device: 0, name: Tesla V100-SXM2-16GB-N, pci bus id: 0000:06:00.0, compute capability: 7.0\n",
+ "2023-06-29 19:51:10.430459: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 24576 MB memory: -> device: 0, name: Quadro RTX 8000, pci bus id: 0000:15:00.0, compute capability: 7.5\n",
+ "2023-06-29 19:51:10.431356: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 24576 MB memory: -> device: 1, name: Quadro RTX 8000, pci bus id: 0000:2d:00.0, compute capability: 7.5\n",
"WARNING:absl:Found untraced functions such as restored_function_body, restored_function_body, restored_function_body, restored_function_body, restored_function_body while saving (showing 5 of 52). These functions will not be directly callable after loading.\n"
]
},
@@ -471,14 +503,14 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "INFO:tensorflow:Assets written to: /tmp/tmp7n9o9yv2/assets\n"
+ "INFO:tensorflow:Assets written to: /tmp/tmpdalflmaz/assets\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
- "INFO:tensorflow:Assets written to: /tmp/tmp7n9o9yv2/assets\n"
+ "INFO:tensorflow:Assets written to: /tmp/tmpdalflmaz/assets\n"
]
}
],
@@ -494,6 +526,7 @@
]
},
{
+ "attachments": {},
"cell_type": "markdown",
"id": "8ce4429c-1fe1-4304-bcdf-badebe3b5485",
"metadata": {},
@@ -503,7 +536,7 @@
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 15,
"id": "b270f663-0ae1-4356-acd4-5f8c986abf4d",
"metadata": {},
"outputs": [
@@ -511,9 +544,9 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Materializing \u001b[1m\u001b[32m1\u001b[0m feature views to \u001b[1m\u001b[32m2023-06-20 23:50:17+00:00\u001b[0m into the \u001b[1m\u001b[32msqlite\u001b[0m online store.\n",
+ "Materializing \u001b[1m\u001b[32m1\u001b[0m feature views to \u001b[1m\u001b[32m2023-06-29 19:51:14+00:00\u001b[0m into the \u001b[1m\u001b[32msqlite\u001b[0m online store.\n",
"\n",
- "\u001b[1m\u001b[32mitem_features\u001b[0m from \u001b[1m\u001b[32m2025-01-01 01:01:01+00:00\u001b[0m to \u001b[1m\u001b[32m2023-06-20 23:50:17+00:00\u001b[0m:\n"
+ "\u001b[1m\u001b[32mitem_features\u001b[0m from \u001b[1m\u001b[32m2025-01-01 01:01:01+00:00\u001b[0m to \u001b[1m\u001b[32m2023-06-29 19:51:14+00:00\u001b[0m:\n"
]
},
{
@@ -525,7 +558,7 @@
}
],
"source": [
- "item_features = retrieval[\"candidate_ids\"] >> QueryFeast.from_feature_view(\n",
+ "item_attributes = retrieval[\"candidate_ids\"] >> QueryFeast.from_feature_view(\n",
" store=feature_store,\n",
" view=\"item_features\",\n",
" column=\"candidate_ids\",\n",
@@ -535,6 +568,18 @@
]
},
{
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "0d0a4531-665c-48a1-98a9-216c955449b7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "item_subgraph = nvt_workflow.get_subworkflow(\"item\")\n",
+ "item_features = item_attributes >> TransformWorkflow(item_subgraph)"
+ ]
+ },
+ {
+ "attachments": {},
"cell_type": "markdown",
"id": "304a4d09-db05-4666-b520-75dbbbc7ab17",
"metadata": {},
@@ -544,7 +589,7 @@
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 17,
"id": "eb0ef434-03a5-4a36-afb9-e19a43243c64",
"metadata": {},
"outputs": [],
@@ -570,6 +615,7 @@
]
},
{
+ "attachments": {},
"cell_type": "markdown",
"id": "7fb0ce66-6b6c-43be-885e-a5435c3bbd9e",
"metadata": {},
@@ -579,7 +625,7 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 18,
"id": "ce31723e-af4d-4827-bb60-3a9fafcd9da6",
"metadata": {},
"outputs": [
@@ -594,14 +640,14 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "INFO:tensorflow:Assets written to: /tmp/tmpbt6mf1gw/assets\n"
+ "INFO:tensorflow:Assets written to: /tmp/tmpqdd_jn5e/assets\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
- "INFO:tensorflow:Assets written to: /tmp/tmpbt6mf1gw/assets\n"
+ "INFO:tensorflow:Assets written to: /tmp/tmpqdd_jn5e/assets\n"
]
}
],
@@ -610,6 +656,7 @@
]
},
{
+ "attachments": {},
"cell_type": "markdown",
"id": "7f86fa47-de61-4007-ab55-9076e12ce963",
"metadata": {},
@@ -619,18 +666,19 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 19,
"id": "7f65598b-e3e7-4238-a73e-19d00c3deb26",
"metadata": {},
"outputs": [],
"source": [
"top_k=10\n",
- "ordering = combined_features[\"item_id_raw\"] >> SoftmaxSampling(\n",
- " relevance_col=ranking[\"click/binary_classification_task\"], topk=top_k, temperature=20.0\n",
+ "ordering = combined_features[\"item_id\"] >> SoftmaxSampling(\n",
+ " relevance_col=ranking[\"click/binary_classification_task\"], topk=top_k, temperature=0.00000001\n",
")"
]
},
{
+ "attachments": {},
"cell_type": "markdown",
"id": "f4e2e389-d884-44a1-8e32-4916a0eb43cf",
"metadata": {},
@@ -642,6 +690,7 @@
]
},
{
+ "attachments": {},
"cell_type": "markdown",
"id": "50bc2e4f-5e58-4ad4-8ae5-d79ad286978f",
"metadata": {},
@@ -651,7 +700,7 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 20,
"id": "b28c452f-543c-45a4-9995-130ca6919669",
"metadata": {},
"outputs": [],
@@ -661,6 +710,7 @@
]
},
{
+ "attachments": {},
"cell_type": "markdown",
"id": "a061bd82-e553-4823-8d14-3ae88a458c14",
"metadata": {},
@@ -670,21 +720,21 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 21,
"id": "9c8b7b94-5559-4587-a272-4d9de2d53dd1",
"metadata": {},
"outputs": [],
"source": [
"request_schema = Schema(\n",
" [\n",
- " ColumnSchema(\"user_id_raw\", dtype=np.int32),\n",
+ " ColumnSchema(\"user_id\", dtype=np.int32),\n",
" ]\n",
")"
]
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 22,
"id": "6c64d686-aed5-42f8-b517-482b4237c69f",
"metadata": {},
"outputs": [
@@ -709,6 +759,7 @@
]
},
{
+ "attachments": {},
"cell_type": "markdown",
"id": "276eedd8-5dc0-4ad0-8725-c8da60fea693",
"metadata": {},
@@ -718,7 +769,7 @@
},
{
"cell_type": "code",
- "execution_count": 21,
+ "execution_count": 23,
"id": "89182219-40a6-458c-af0e-7a8e83f364aa",
"metadata": {},
"outputs": [
@@ -727,7 +778,25 @@
"output_type": "stream",
"text": [
"poc_ensemble/\n",
- "├─0_predicttensorflowtriton/\n",
+ "├─0_transformworkflowtriton/\n",
+ "│ ├─1/\n",
+ "│ │ ├─model.py\n",
+ "│ │ └─workflow/\n",
+ "│ │ ├─categories/\n",
+ "│ │ │ ├─unique.user_age.parquet\n",
+ "│ │ │ ├─unique.user_brands.parquet\n",
+ "│ │ │ ├─unique.user_categories.parquet\n",
+ "│ │ │ ├─unique.user_consumption_2.parquet\n",
+ "│ │ │ ├─unique.user_gender.parquet\n",
+ "│ │ │ ├─unique.user_geography.parquet\n",
+ "│ │ │ ├─unique.user_group.parquet\n",
+ "│ │ │ ├─unique.user_id.parquet\n",
+ "│ │ │ ├─unique.user_intentions.parquet\n",
+ "│ │ │ └─unique.user_is_occupied.parquet\n",
+ "│ │ ├─metadata.json\n",
+ "│ │ └─workflow.pkl\n",
+ "│ └─config.pbtxt\n",
+ "├─1_predicttensorflowtriton/\n",
"│ ├─1/\n",
"│ │ └─model.savedmodel/\n",
"│ │ ├─assets/\n",
@@ -738,7 +807,19 @@
"│ │ ├─variables.data-00000-of-00001\n",
"│ │ └─variables.index\n",
"│ └─config.pbtxt\n",
- "├─1_predicttensorflowtriton/\n",
+ "├─2_transformworkflowtriton/\n",
+ "│ ├─1/\n",
+ "│ │ ├─model.py\n",
+ "│ │ └─workflow/\n",
+ "│ │ ├─categories/\n",
+ "│ │ │ ├─unique.item_brand.parquet\n",
+ "│ │ │ ├─unique.item_category.parquet\n",
+ "│ │ │ ├─unique.item_id.parquet\n",
+ "│ │ │ └─unique.item_shop.parquet\n",
+ "│ │ ├─metadata.json\n",
+ "│ │ └─workflow.pkl\n",
+ "│ └─config.pbtxt\n",
+ "├─3_predicttensorflowtriton/\n",
"│ ├─1/\n",
"│ │ └─model.savedmodel/\n",
"│ │ ├─.merlin/\n",
@@ -768,6 +849,7 @@
]
},
{
+ "attachments": {},
"cell_type": "markdown",
"id": "fe7962cc-f26d-4a4a-b5a3-d214e0f37456",
"metadata": {
@@ -778,6 +860,7 @@
]
},
{
+ "attachments": {},
"cell_type": "markdown",
"id": "8c07c620-7d6c-4275-87fe-e5b94335bdb9",
"metadata": {},
@@ -792,6 +875,7 @@
]
},
{
+ "attachments": {},
"cell_type": "markdown",
"id": "6c0a798f-6abf-4cbb-87f8-f60a6e757092",
"metadata": {},
@@ -800,6 +884,7 @@
]
},
{
+ "attachments": {},
"cell_type": "markdown",
"id": "3b0794b1-b9e0-4508-bf6e-cc823ac5c693",
"metadata": {},
@@ -808,6 +893,7 @@
]
},
{
+ "attachments": {},
"cell_type": "markdown",
"id": "af9efbde-4dac-42f1-9ace-096f75bac2b5",
"metadata": {},
@@ -817,7 +903,7 @@
},
{
"cell_type": "code",
- "execution_count": 22,
+ "execution_count": 24,
"id": "d08a8975-9c32-467b-99ec-df66319f854b",
"metadata": {},
"outputs": [
@@ -825,8 +911,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
- " user_id_raw\n",
- "0 7\n"
+ " user_id\n",
+ "0 7\n"
]
}
],
@@ -835,12 +921,13 @@
"from merlin.core.dispatch import make_df\n",
"\n",
"# create a request to be sent to TIS\n",
- "request = make_df({\"user_id_raw\": [7]})\n",
- "request[\"user_id_raw\"] = request[\"user_id_raw\"].astype(np.int32)\n",
+ "request = make_df({\"user_id\": [7]})\n",
+ "request[\"user_id\"] = request[\"user_id\"].astype(np.int32)\n",
"print(request)"
]
},
{
+ "attachments": {},
"cell_type": "markdown",
"id": "28e9e27f-6658-4302-b142-08b05215e48f",
"metadata": {},
@@ -850,20 +937,20 @@
},
{
"cell_type": "code",
- "execution_count": 23,
+ "execution_count": 25,
"id": "74ec62f2-5935-45c6-8058-e1cdade6f80f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "{'ordered_ids': array([[266, 381, 145, 6, 232, 651, 83, 244, 107, 69]], dtype=int32),\n",
- " 'ordered_scores': array([[0.50194645, 0.50282484, 0.50340647, 0.5027974 , 0.50236404,\n",
- " 0.50230837, 0.50244445, 0.5022982 , 0.50169003, 0.50216776]],\n",
+ "{'ordered_ids': array([[100, 168, 324, 79, 361, 294, 267, 289, 397, 189]], dtype=int32),\n",
+ " 'ordered_scores': array([[0.5016385 , 0.50176895, 0.5017176 , 0.5024097 , 0.5018236 ,\n",
+ " 0.5018286 , 0.50162375, 0.5015677 , 0.50175667, 0.5014358 ]],\n",
" dtype=float32)}"
]
},
- "execution_count": 23,
+ "execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
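Reviewer note: the updated response shows the ensemble returning `ordered_ids` and `ordered_scores` for a single `user_id` request via `send_triton_request`. For reference, an equivalent raw gRPC call with `tritonclient` might look roughly like the sketch below; the endpoint, model name, and input shape are assumptions rather than values taken from this diff:

```python
import numpy as np
import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient("localhost:8001")  # assumed endpoint

# Single-row request with the raw user id, mirroring the make_df request above.
user_id = grpcclient.InferInput("user_id", [1, 1], "INT32")
user_id.set_data_from_numpy(np.array([[7]], dtype=np.int32))

outputs = [
    grpcclient.InferRequestedOutput("ordered_ids"),
    grpcclient.InferRequestedOutput("ordered_scores"),
]
response = client.infer("executor_model", [user_id], outputs=outputs)  # model name assumed
print(response.as_numpy("ordered_ids"), response.as_numpy("ordered_scores"))
```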
@@ -874,6 +961,7 @@
]
},
{
+ "attachments": {},
"cell_type": "markdown",
"id": "b4605dbe-5f97-4b31-8ee4-ce7c1cb69d97",
"metadata": {},
diff --git a/tests/unit/examples/test_building_deploying_multi_stage_RecSys.py b/tests/unit/examples/test_building_deploying_multi_stage_RecSys.py
index 435742499..138ea554f 100644
--- a/tests/unit/examples/test_building_deploying_multi_stage_RecSys.py
+++ b/tests/unit/examples/test_building_deploying_multi_stage_RecSys.py
@@ -74,7 +74,7 @@ def test_func(tmpdir):
df_lib = get_lib()
train = df_lib.read_parquet(
os.path.join("{tmpdir / "data"}/processed_nvt/", "train", "part_0.parquet"),
- columns=["user_id_raw"],
+ columns=["user_id"],
)
batch = train[:1]
from merlin.systems.triton.utils import run_ensemble_on_tritonserver