From 65c63738e8c24f9aba0287528a371952871daefe Mon Sep 17 00:00:00 2001 From: Julio Date: Wed, 21 Jun 2023 18:13:17 -0400 Subject: [PATCH 1/5] updates notebooks for multistage with subgraphs --- ...ding-Recommender-Systems-with-Merlin.ipynb | 1693 ++++++++--------- ...lti-stage-RecSys-with-Merlin-Systems.ipynb | 218 ++- 2 files changed, 922 insertions(+), 989 deletions(-) diff --git a/examples/Building-and-deploying-multi-stage-RecSys/01-Building-Recommender-Systems-with-Merlin.ipynb b/examples/Building-and-deploying-multi-stage-RecSys/01-Building-Recommender-Systems-with-Merlin.ipynb index dd22a1378..990c568ed 100644 --- a/examples/Building-and-deploying-multi-stage-RecSys/01-Building-Recommender-Systems-with-Merlin.ipynb +++ b/examples/Building-and-deploying-multi-stage-RecSys/01-Building-Recommender-Systems-with-Merlin.ipynb @@ -7,7 +7,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Copyright 2021 NVIDIA Corporation. All Rights Reserved.\n", + "# Copyright 2023 NVIDIA Corporation. All Rights Reserved.\n", "#\n", "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", "# you may not use this file except in compliance with the License.\n", @@ -129,11 +129,11 @@ "outputs": [], "source": [ "# for running this example on GPU, install the following libraries\n", - "# %pip install \"feast==0.31\" faiss-gpu\n", + "# %pip install \"feast<0.20\" faiss-gpu\n", "\n", "# for running this example on CPU, uncomment the following lines\n", - "# %pip install tensorflow-cpu \"feast==0.31\" faiss-cpu\n", - "# %pip uninstall cudf" + "# %pip install tensorflow-cpu \"feast<0.20\" faiss-cpu\n", + "# %pip uninstall cudf\n" ] }, { @@ -146,61 +146,50 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-06-20 23:45:23.539085: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", - "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "/usr/local/lib/python3.8/dist-packages/merlin/dtypes/mappings/torch.py:43: UserWarning: PyTorch dtype mappings did not load successfully due to an error: No module named 'torch'\n", - " warn(f\"PyTorch dtype mappings did not load successfully due to an error: {exc.msg}\")\n" + "2023-06-21 21:24:41.476144: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.data_structures has been moved to tensorflow.python.trackable.data_structures. The old module will be deleted in version 2.11.\n", - "[INFO]: sparse_operation_kit is imported\n", - "WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.base has been moved to tensorflow.python.trackable.base. 
The old module will be deleted in version 2.11.\n", - "[SOK INFO] Import /usr/local/lib/python3.8/dist-packages/merlin_sok-1.1.4-py3.8-linux-x86_64.egg/sparse_operation_kit/lib/libsok_experiment.so\n", - "[SOK INFO] Import /usr/local/lib/python3.8/dist-packages/merlin_sok-1.1.4-py3.8-linux-x86_64.egg/sparse_operation_kit/lib/libsok_experiment.so\n" + "WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.data_structures has been moved to tensorflow.python.trackable.data_structures. The old module will be deleted in version 2.11.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2023-06-20 23:45:31.002019: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", + "/usr/local/lib/python3.8/dist-packages/merlin/dtypes/mappings/torch.py:43: UserWarning: PyTorch dtype mappings did not load successfully due to an error: No module named 'torch'\n", + " warn(f\"PyTorch dtype mappings did not load successfully due to an error: {exc.msg}\")\n", + "2023-06-21 21:24:43.274327: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected\n", + "2023-06-21 21:24:43.274369: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: 2dca7910ae98\n", + "2023-06-21 21:24:43.274380: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: 2dca7910ae98\n", + "2023-06-21 21:24:43.274481: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: NOT_FOUND: was unable to find libcuda.so DSO loaded into this program\n", + "2023-06-21 21:24:43.274508: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 515.65.1\n", + "2023-06-21 21:24:43.621683: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2023-06-20 23:45:31.232986: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:42] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.\n", - "2023-06-20 23:45:31.233033: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:222] Using CUDA malloc Async allocator for GPU: 0\n", - "2023-06-20 23:45:31.233242: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 8192 MB memory: -> device: 0, name: Tesla V100-SXM2-16GB-N, pci bus id: 0000:06:00.0, compute capability: 7.0\n", "/usr/local/lib/python3.8/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[SOK INFO] Initialize finished, communication tool: horovod\n" - ] } ], "source": [ "import os\n", - "# for running this example on CPU, comment out the line below\n", - "os.environ[\"TF_GPU_ALLOCATOR\"] = \"cuda_malloc_async\"\n", - "\n", "import nvtabular as nvt\n", "from nvtabular.ops import Rename, Filter, Dropna, LambdaOp, Categorify, \\\n", " TagAsUserFeatures, TagAsUserID, TagAsItemFeatures, TagAsItemID, AddMetadata\n", "\n", "from merlin.schema.tags import Tags\n", - "\n", + "from merlin.dag.ops.subgraph import Subgraph\n", "import merlin.models.tf as mm\n", "from merlin.io.dataset import Dataset\n", "from merlin.datasets.ecommerce import transform_aliccp\n", "import tensorflow as tf\n", "\n", - "import logging" + "# for running this example on CPU, comment out the line below\n", + "os.environ[\"TF_GPU_ALLOCATOR\"] = \"cuda_malloc_async\"" ] }, { @@ -211,6 +200,8 @@ "outputs": [], "source": [ "# disable INFO and DEBUG logging everywhere\n", + "import logging\n", + "\n", "logging.disable(logging.WARNING)" ] }, @@ -251,7 +242,16 @@ "execution_count": 6, "id": "b44b3378-7297-4946-a271-742a9239bc3e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", + " warnings.warn(\n" + ] + } + ], "source": [ "from merlin.datasets.synthetic import generate_data\n", "\n", @@ -269,1078 +269,972 @@ }, { "cell_type": "markdown", - "id": "2e428d01-f2f0-42d4-85d0-0986bb83a847", - "metadata": {}, - "source": [ - "### Feature Engineering with NVTabular" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "d4bf870c-30cf-4074-88d3-b75981b3a873", + "id": "7bd843be-dfba-4f8b-bac1-608e6571352d", "metadata": {}, - "outputs": [], "source": [ - "output_path = os.path.join(DATA_FOLDER, \"processed_nvt\")" + "### Set up a feature store with Feast" ] }, { "cell_type": "markdown", - "id": "1e7bfb5c-88ed-4cf9-8a17-98c0284adb36", + "id": "c543b71c-6ba2-4e43-8779-8bffb62d2cee", "metadata": {}, "source": [ - "In the following NVTabular workflow, notice that we apply the `Dropna()` Operator at the end. We add the Operator to remove rows with missing values in the final DataFrame after the preceding transformations. Although, the synthetic dataset that we generate and use in this notebook does not have null entries, you might have null entries in your `user_id` and `item_id` columns in your own custom dataset. Therefore, while applying `Dropna()` we will not be registering null `user_id_raw` and `item_id_raw` values in the feature store, and will be avoiding potential issues that can occur because of any null entries." + "Before we move onto the next step, we need to create a Feast feature repository. [Feast](https://feast.dev/) is an end-to-end open source feature store for machine learning. Feast (Feature Store) is a customizable operational data system that re-uses existing infrastructure to manage and serve machine learning features to real-time models.\n", + "\n", + "We will create the feature repo in the current working directory, which is `BASE_DIR` for us." 
] }, { "cell_type": "code", - "execution_count": 8, - "id": "f91ada78-4e4d-4415-ab94-e351aa454e9e", + "execution_count": 7, + "id": "2e7e96d2-9cd2-40d1-b356-8cd76b57bb4a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Creating a new Feast repository in \u001b[1m\u001b[32m/raid/workshared/merlin/examples/Building-and-deploying-multi-stage-RecSys/feast_repo\u001b[0m.\n", + "\n" + ] + } + ], "source": [ - "user_id_raw = [\"user_id\"] >> Rename(postfix='_raw') >> LambdaOp(lambda col: col.astype(\"int32\")) >> TagAsUserFeatures()\n", - "item_id_raw = [\"item_id\"] >> Rename(postfix='_raw') >> LambdaOp(lambda col: col.astype(\"int32\")) >> TagAsItemFeatures()\n", - "\n", - "user_id = [\"user_id\"] >> Categorify(dtype=\"int32\") >> TagAsUserID()\n", - "item_id = [\"item_id\"] >> Categorify(dtype=\"int32\") >> TagAsItemID()\n", - "\n", - "item_features = (\n", - " [\"item_category\", \"item_shop\", \"item_brand\"] >> Categorify(dtype=\"int32\") >> TagAsItemFeatures()\n", - ")\n", - "\n", - "user_features = (\n", - " [\n", - " \"user_shops\",\n", - " \"user_profile\",\n", - " \"user_group\",\n", - " \"user_gender\",\n", - " \"user_age\",\n", - " \"user_consumption_2\",\n", - " \"user_is_occupied\",\n", - " \"user_geography\",\n", - " \"user_intentions\",\n", - " \"user_brands\",\n", - " \"user_categories\",\n", - " ] >> Categorify(dtype=\"int32\") >> TagAsUserFeatures()\n", - ")\n", - "\n", - "targets = [\"click\"] >> AddMetadata(tags=[Tags.BINARY_CLASSIFICATION, \"target\"])\n", - "\n", - "outputs = user_id + item_id + item_features + user_features + user_id_raw + item_id_raw + targets\n", - "\n", - "# add dropna op to filter rows with nulls\n", - "outputs = outputs >> Dropna()" + "!rm -rf $BASE_DIR/feast_repo\n", + "!cd $BASE_DIR && feast init feast_repo" ] }, { "cell_type": "markdown", - "id": "71aae006-a161-4127-889a-8f433a9f7362", + "id": "f6d4d773-144e-4e34-82cd-f2b50fce601c", "metadata": {}, "source": [ - "Let's call `transform_aliccp` utility function to be able to perform `fit` and `transform` steps on the raw dataset applying the operators defined in the NVTabular workflow pipeline below, and also save our workflow model. After fit and transform, the processed parquet files are saved to output_path." + "You should be seeing a message like Creating a new Feast repository in ... printed out above. Now, navigate to the `feature_repo` folder and remove the demo parquet file created by default, and `examples.py` file." ] }, { "cell_type": "code", - "execution_count": 9, - "id": "814e8438-642a-4f03-baaf-44dab8d1b5e5", + "execution_count": 8, + "id": "26ba2521-ed1b-4c2b-afdd-26b4a5a9c008", "metadata": {}, "outputs": [], "source": [ - "transform_aliccp(\n", - " (train_raw, valid_raw), output_path, nvt_workflow=outputs, workflow_name=\"workflow\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "09c87748-af61-42b8-8574-1afe3d71118f", - "metadata": {}, - "source": [ - "### Training a Retrieval Model with Two-Tower Model" - ] - }, - { - "cell_type": "markdown", - "id": "e644fcba-7b0b-44c0-97fd-80f4fcb01191", - "metadata": {}, - "source": [ - "We start with the offline candidate retrieval stage. We are going to train a Two-Tower model for item retrieval. To learn more about the Two-tower model you can visit [05-Retrieval-Model.ipynb](https://github.com/NVIDIA-Merlin/models/blob/stable/examples/05-Retrieval-Model.ipynb)." 
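    The Feast feature definitions themselves live in a Python module inside the repository created below and are not part of this diff. As a rough illustration of what such a definition could look like for the `user_features.parquet` file exported later in this notebook, here is a minimal sketch assuming the Feast 0.31-style API (`Entity`, `FeatureView`, `Field`, `FileSource`); the file name `user_features.py` and the exact field list are assumptions, not taken from this patch.

    ```python
    # user_features.py -- hypothetical feature definition inside feast_repo/feature_repo
    from datetime import timedelta

    from feast import Entity, FeatureView, Field, FileSource
    from feast.types import Int32

    user = Entity(name="user_id", join_keys=["user_id"])

    user_features_source = FileSource(
        path="data/user_features.parquet",      # written later in this notebook
        timestamp_field="datetime",             # the artificial timestamp column added below
        created_timestamp_column="created",
    )

    user_features_view = FeatureView(
        name="user_features",
        entities=[user],
        ttl=timedelta(days=1),
        schema=[
            Field(name="user_shops", dtype=Int32),
            Field(name="user_age", dtype=Int32),
            # ... remaining user_* columns would be listed here ...
        ],
        online=True,
        source=user_features_source,
    )
    ```

    Such definitions would then be registered from inside the repository with `feast apply` before the online store can be materialized.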
+ "feature_repo_path = os.path.join(BASE_DIR, \"feast_repo/feature_repo\")\n", + "if os.path.exists(f\"{feature_repo_path}/example_repo.py\"):\n", + " os.remove(f\"{feature_repo_path}/example_repo.py\")\n", + "if os.path.exists(f\"{feature_repo_path}/data/driver_stats.parquet\"):\n", + " os.remove(f\"{feature_repo_path}/data/driver_stats.parquet\")" ] }, { "cell_type": "markdown", - "id": "cf9bca46-a6b6-4a73-afd8-fe2869c60748", + "id": "24ae0e29-c156-4df9-8977-238786160a8c", "metadata": {}, "source": [ - "#### Feature Engineering with NVTabular" + "### Exporting user and item features" ] }, { - "cell_type": "markdown", - "id": "da2b09cc-09fb-4814-a1cb-7e6168d9eb4b", + "cell_type": "code", + "execution_count": 9, + "id": "ea0b369c-2f01-42e3-9f3c-74c3ff4a6d64", "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", + " warnings.warn(\n" + ] + } + ], "source": [ - "We are going to process our raw categorical features by encoding them using `Categorify()` operator and tag the features with `user` or `item` tags in the schema file. To learn more about [NVTabular](https://github.com/NVIDIA-Merlin/NVTabular) and the schema object visit this example [notebook](https://github.com/NVIDIA-Merlin/models/blob/stable/examples/02-Merlin-Models-and-NVTabular-integration.ipynb) in the Merlin Models repo." + "from merlin.models.utils.dataset import unique_rows_by_features\n", + "\n", + "user_features = (\n", + " unique_rows_by_features(train_raw, Tags.USER, Tags.USER_ID)\n", + " .compute()\n", + " .reset_index(drop=True)\n", + ")" ] }, { "cell_type": "markdown", - "id": "f3bc7abd-8d97-452b-a4af-5227821a99c9", + "id": "4f2d12f5-c753-4392-b113-965d97d2fe35", "metadata": {}, "source": [ - "Define a new output path to store the filtered datasets and schema files." + "We will artificially add `datetime` and `created` timestamp columns to our user_features dataframe. This required by Feast to track the user-item features and their creation time and to determine which version to use when we query Feast." ] }, { "cell_type": "code", "execution_count": 10, - "id": "df72a793-194b-44f4-80c3-aaa368a9a01e", + "id": "d30bd2f8-8a78-4df7-9bc4-42bd741c5b99", "metadata": {}, "outputs": [], "source": [ - "output_path2 = os.path.join(DATA_FOLDER, \"processed/retrieval\")" + "from datetime import datetime\n", + "\n", + "user_features[\"datetime\"] = datetime.now()\n", + "user_features[\"datetime\"] = user_features[\"datetime\"].astype(\"datetime64[ns]\")\n", + "user_features[\"created\"] = datetime.now()\n", + "user_features[\"created\"] = user_features[\"created\"].astype(\"datetime64[ns]\")" ] }, { "cell_type": "code", "execution_count": 11, - "id": "251d4697-8f9c-4c93-8de4-c3480a8378de", - "metadata": {}, - "outputs": [], - "source": [ - "train_tt = Dataset(os.path.join(output_path, \"train\", \"*.parquet\"))\n", - "valid_tt = Dataset(os.path.join(output_path, \"valid\", \"*.parquet\"))" - ] - }, - { - "cell_type": "markdown", - "id": "ffd7e2ac-a251-49d0-943b-e9272c852ba6", + "id": "d4998cd1-9dcd-4911-8f23-372e197b41e9", "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_iduser_shopsuser_profileuser_groupuser_genderuser_ageuser_consumption_1user_consumption_2user_is_occupieduser_geographyuser_intentionsuser_brandsuser_categoriesdatetimecreated
38765811111111191327352023-06-21 21:24:49.0828042023-06-21 21:24:49.085539
\n", + "
" + ], + "text/plain": [ + " user_id user_shops user_profile user_group user_gender user_age \\\n", + "38 7 658 1 1 1 1 \n", + "\n", + " user_consumption_1 user_consumption_2 user_is_occupied user_geography \\\n", + "38 1 1 1 1 \n", + "\n", + " user_intentions user_brands user_categories datetime \\\n", + "38 191 327 35 2023-06-21 21:24:49.082804 \n", + "\n", + " created \n", + "38 2023-06-21 21:24:49.085539 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "We select only positive interaction rows where `click==1` in the dataset with `Filter()` operator." + "user_features[user_features[\"user_id\"] == 7]" ] }, { "cell_type": "code", "execution_count": 12, - "id": "7e085a6d-74ad-4c24-8e7c-4e449c15f471", + "id": "2981b3ed-6156-49f0-aa14-326a3853a58a", "metadata": {}, "outputs": [], "source": [ - "inputs = train_tt.schema.column_names\n", - "outputs = inputs >> Filter(f=lambda df: df[\"click\"] == 1)\n", - "\n", - "workflow2 = nvt.Workflow(outputs)\n", - "\n", - "workflow2.fit(train_tt)\n", - "\n", - "workflow2.transform(train_tt).to_parquet(\n", - " output_path=os.path.join(output_path2, \"train\")\n", - ")\n", - "\n", - "workflow2.transform(valid_tt).to_parquet(\n", - " output_path=os.path.join(output_path2, \"valid\")\n", + "user_features.to_parquet(\n", + " os.path.join(feature_repo_path, \"data\", \"user_features.parquet\")\n", ")" ] }, { - "cell_type": "markdown", - "id": "cc4721ae-7228-4d3f-9586-dcdfefecc19f", + "cell_type": "code", + "execution_count": 13, + "id": "0a33a668-8e2a-4546-8f54-0060d405ba91", "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", + " warnings.warn(\n" + ] + } + ], "source": [ - "NVTabular exported the schema file, `schema.pbtxt` a protobuf text file, of our processed dataset. To learn more about the schema object and schema file you can explore [02-Merlin-Models-and-NVTabular-integration.ipynb](https://github.com/NVIDIA-Merlin/models/blob/stable/examples/02-Merlin-Models-and-NVTabular-integration.ipynb) notebook." + "item_features = (\n", + " unique_rows_by_features(train_raw, Tags.ITEM, Tags.ITEM_ID)\n", + " .compute()\n", + " .reset_index(drop=True)\n", + ")" ] }, { - "cell_type": "markdown", - "id": "aa025b80-0f18-437c-a85f-4edcb89f4222", + "cell_type": "code", + "execution_count": 14, + "id": "68a694d6-926f-4b0f-8edc-8cc7ac85ade7", "metadata": {}, + "outputs": [], "source": [ - "**Read filtered parquet files as Dataset objects.**" + "item_features[\"datetime\"] = datetime.now()\n", + "item_features[\"datetime\"] = item_features[\"datetime\"].astype(\"datetime64[ns]\")\n", + "item_features[\"created\"] = datetime.now()\n", + "item_features[\"created\"] = item_features[\"created\"].astype(\"datetime64[ns]\")" ] }, { "cell_type": "code", - "execution_count": 13, - "id": "252a8e60-b447-46b5-ade6-3557cbafa797", + "execution_count": 15, + "id": "6c03fa22-b112-4243-bbe1-1cd7260cb85b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
item_iditem_categoryitem_shopitem_branditem_intentiondatetimecreated
0291389672333115412023-06-21 21:24:49.1459832023-06-21 21:24:49.147882
141510373571662023-06-21 21:24:49.1459832023-06-21 21:24:49.147882
21779552719048812023-06-21 21:24:49.1459832023-06-21 21:24:49.147882
3155756531961832084712023-06-21 21:24:49.1459832023-06-21 21:24:49.147882
41989621821429912023-06-21 21:24:49.1459832023-06-21 21:24:49.147882
\n", + "
" + ], + "text/plain": [ + " item_id item_category item_shop item_brand item_intention \\\n", + "0 29 138 9672 3331 1541 \n", + "1 4 15 1037 357 166 \n", + "2 17 79 5527 1904 881 \n", + "3 155 756 53196 18320 8471 \n", + "4 19 89 6218 2142 991 \n", + "\n", + " datetime created \n", + "0 2023-06-21 21:24:49.145983 2023-06-21 21:24:49.147882 \n", + "1 2023-06-21 21:24:49.145983 2023-06-21 21:24:49.147882 \n", + "2 2023-06-21 21:24:49.145983 2023-06-21 21:24:49.147882 \n", + "3 2023-06-21 21:24:49.145983 2023-06-21 21:24:49.147882 \n", + "4 2023-06-21 21:24:49.145983 2023-06-21 21:24:49.147882 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "item_features.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "c312884b-a1f8-4e08-8068-696e06a9bf46", "metadata": {}, "outputs": [], "source": [ - "train_tt = Dataset(os.path.join(output_path2, \"train\", \"*.parquet\"), part_size=\"500MB\")\n", - "valid_tt = Dataset(os.path.join(output_path2, \"valid\", \"*.parquet\"), part_size=\"500MB\")" + "# save to disk\n", + "item_features.to_parquet(\n", + " os.path.join(feature_repo_path, \"data\", \"item_features.parquet\")\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "2e428d01-f2f0-42d4-85d0-0986bb83a847", + "metadata": {}, + "source": [ + "### Feature Engineering with NVTabular" ] }, { "cell_type": "code", - "execution_count": 14, - "id": "71063653-2f39-4b54-8399-145d6f281d4d", + "execution_count": 17, + "id": "d4bf870c-30cf-4074-88d3-b75981b3a873", "metadata": {}, "outputs": [], "source": [ - "schema = train_tt.schema.select_by_tag([Tags.ITEM_ID, Tags.USER_ID, Tags.ITEM, Tags.USER]).without(['user_id_raw', 'item_id_raw', 'click'])\n", - "train_tt.schema = schema\n", - "valid_tt.schema = schema" + "output_path = os.path.join(DATA_FOLDER, \"processed_nvt\")" + ] + }, + { + "cell_type": "markdown", + "id": "1e7bfb5c-88ed-4cf9-8a17-98c0284adb36", + "metadata": {}, + "source": [ + "In the following NVTabular workflow, notice that we apply the `Dropna()` Operator at the end. We add the Operator to remove rows with missing values in the final DataFrame after the preceding transformations. Although, the synthetic dataset that we generate and use in this notebook does not have null entries, you might have null entries in your `user_id` and `item_id` columns in your own custom dataset. Therefore, while applying `Dropna()` we will not be registering null `user_id_raw` and `item_id_raw` values in the feature store, and will be avoiding potential issues that can occur because of any null entries." 
] }, { "cell_type": "code", - "execution_count": 15, - "id": "9312511a-f368-42f2-93d2-eb95aebbf46c", + "execution_count": 18, + "id": "f91ada78-4e4d-4415-ab94-e351aa454e9e", "metadata": {}, "outputs": [], "source": [ - "model_tt = mm.TwoTowerModel(\n", - " schema,\n", - " query_tower=mm.MLPBlock([128, 64], no_activation_last_layer=True),\n", - " samplers=[mm.InBatchSampler()],\n", - " embedding_options=mm.EmbeddingOptions(infer_embedding_sizes=True),\n", - ")" + "user_id_raw = [\"user_id\"] >> Rename(postfix='_raw') >> LambdaOp(lambda col: col.astype(\"int32\")) >> TagAsUserFeatures()\n", + "item_id_raw = [\"item_id\"] >> Rename(postfix='_raw') >> LambdaOp(lambda col: col.astype(\"int32\")) >> TagAsItemFeatures()\n", + "\n", + "\n", + "item_cat = Categorify(dtype=\"int32\")\n", + "items = ([\"item_id\",\"item_category\", \"item_shop\", \"item_brand\"] >> item_cat)\n", + "\n", + "subgraph_item = Subgraph(\n", + " \"item\", \n", + " Subgraph(\"items_cat\", items) + \n", + " (items[\"item_id\"] >> TagAsItemID()) + \n", + " (items[\"item_category\", \"item_shop\", \"item_brand\"] >> TagAsItemFeatures())\n", + ")\n", + "subgraph_user = Subgraph(\n", + " \"user\",\n", + " ([\"user_id\"] >> Categorify(dtype=\"int32\") >> TagAsUserID()) +\n", + " (\n", + " [\n", + " \"user_shops\",\n", + " \"user_profile\",\n", + " \"user_group\",\n", + " \"user_gender\",\n", + " \"user_age\",\n", + " \"user_consumption_2\",\n", + " \"user_is_occupied\",\n", + " \"user_geography\",\n", + " \"user_intentions\",\n", + " \"user_brands\",\n", + " \"user_categories\",\n", + " ] >> Categorify(dtype=\"int32\") >> TagAsUserFeatures()\n", + " )\n", + ")\n", + "\n", + "targets = [\"click\"] >> AddMetadata(tags=[Tags.BINARY_CLASSIFICATION, \"target\"])\n", + "outputs = subgraph_user + subgraph_item + targets\n", + "\n", + "# add dropna op to filter rows with nulls\n", + "outputs = outputs >> Dropna()" + ] + }, + { + "cell_type": "markdown", + "id": "71aae006-a161-4127-889a-8f433a9f7362", + "metadata": {}, + "source": [ + "Let's call `transform_aliccp` utility function to be able to perform `fit` and `transform` steps on the raw dataset applying the operators defined in the NVTabular workflow pipeline below, and also save our workflow model. After fit and transform, the processed parquet files are saved to output_path." ] }, { "cell_type": "code", - "execution_count": 16, - "id": "4d47cb8b-e06a-4932-9a19-fb244ef43152", + "execution_count": 19, + "id": "814e8438-642a-4f03-baaf-44dab8d1b5e5", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/lib/python3.8/dist-packages/keras/initializers/initializers_v2.py:120: UserWarning: The initializer TruncatedNormal is unseeded and being called multiple times, which will return identical values each time (even if the initializer is unseeded). 
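    The workflow below also wraps the user and item branches in named `Subgraph` blocks (note the new `from merlin.dag.ops.subgraph import Subgraph` import). The benefit, used later in this notebook, is that a named piece of the fitted workflow can be pulled back out with `get_subworkflow` and applied on its own. A minimal sketch of the pattern, using a toy two-column DataFrame that is purely illustrative:

    ```python
    import pandas as pd
    import nvtabular as nvt
    from merlin.dag.ops.subgraph import Subgraph
    from merlin.io.dataset import Dataset
    from nvtabular.ops import Categorify

    # toy stand-in for the raw interactions table
    df = pd.DataFrame({"item_id": [10, 11, 10], "item_category": [1, 2, 1]})

    # group the item-encoding ops under a name so they can be reused later
    items = ["item_id", "item_category"] >> Categorify(dtype="int32")
    workflow = nvt.Workflow(Subgraph("items_cat", items))
    workflow.fit(Dataset(df))

    # retrieve just that piece of the fitted workflow and run it on its own,
    # as done further down when encoding item features for the embedding export
    items_cat = workflow.get_subworkflow("items_cat")
    encoded = items_cat.transform(Dataset(df)).to_ddf().compute()
    ```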
Please update your code to provide a seed to the initializer, or avoid using the same initalizer instance more than once.\n", + "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", + " warnings.warn(\n", + "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", + " warnings.warn(\n", + "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", + " warnings.warn(\n", + "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", + " warnings.warn(\n", + "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", + " warnings.warn(\n", + "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", + " warnings.warn(\n", + "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", + " warnings.warn(\n", + "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", + " warnings.warn(\n", + "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", " warnings.warn(\n" ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "5/5 [==============================] - 18s 712ms/step - loss: 8.9090 - recall_at_10: 0.0069 - ndcg_at_10: 0.0045 - regularization_loss: 0.0000e+00 - loss_batch: 8.5771 - val_loss: 8.9027 - val_recall_at_10: 0.0113 - val_ndcg_at_10: 0.0072 - val_regularization_loss: 0.0000e+00 - val_loss_batch: 8.7921\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "model_tt.compile(\n", - " optimizer=\"adam\",\n", - " run_eagerly=False,\n", - " loss=\"categorical_crossentropy\",\n", - " metrics=[mm.RecallAt(10), mm.NDCGAt(10)],\n", - ")\n", - "model_tt.fit(train_tt, validation_data=valid_tt, batch_size=1024 * 8, epochs=1)" + "transform_aliccp(\n", + " (train_raw, valid_raw), output_path, nvt_workflow=outputs, workflow_name=\"workflow\"\n", + ")" ] }, { "cell_type": "markdown", - "id": "80d83007-f9e8-408f-9f65-a0e9e19cb586", + "id": "09c87748-af61-42b8-8574-1afe3d71118f", "metadata": {}, "source": [ - "### Exporting query (user) model" + "### Training a Retrieval Model with Two-Tower Model" ] }, { "cell_type": "markdown", - "id": "22af58a9-5525-454a-bf25-a9df0462aa53", - "metadata": {}, - "source": [ - "We export the query tower to use it later during the model deployment stage with Merlin Systems." 
- ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "d2370f13-ff9a-4ee0-ba1e-451c7bec0f8a", + "id": "e644fcba-7b0b-44c0-97fd-80f4fcb01191", "metadata": {}, - "outputs": [], "source": [ - "query_tower = model_tt.retrieval_block.query_block()\n", - "query_tower.save(os.path.join(BASE_DIR, \"query_tower\"))" + "We start with the offline candidate retrieval stage. We are going to train a Two-Tower model for item retrieval. To learn more about the Two-tower model you can visit [05-Retrieval-Model.ipynb](https://github.com/NVIDIA-Merlin/models/blob/stable/examples/05-Retrieval-Model.ipynb)." ] }, { "cell_type": "markdown", - "id": "e16401d4", - "metadata": { - "tags": [] - }, + "id": "cf9bca46-a6b6-4a73-afd8-fe2869c60748", + "metadata": {}, "source": [ - "### Training a Ranking Model with DLRM" + "#### Feature Engineering with NVTabular" ] }, { "cell_type": "markdown", - "id": "b72e8a2a-fc4a-43ab-934c-6d941c56aad2", + "id": "da2b09cc-09fb-4814-a1cb-7e6168d9eb4b", "metadata": {}, "source": [ - "Now we will move onto training an offline ranking model. This ranking model will be used for scoring our retrieved items." + "We are going to process our raw categorical features by encoding them using `Categorify()` operator and tag the features with `user` or `item` tags in the schema file. To learn more about [NVTabular](https://github.com/NVIDIA-Merlin/NVTabular) and the schema object visit this example [notebook](https://github.com/NVIDIA-Merlin/models/blob/stable/examples/02-Merlin-Models-and-NVTabular-integration.ipynb) in the Merlin Models repo." ] }, { "cell_type": "markdown", - "id": "c4f2b234", + "id": "f3bc7abd-8d97-452b-a4af-5227821a99c9", "metadata": {}, "source": [ - "Read processed parquet files. We use the `schema` object to define our model." + "Define a new output path to store the filtered datasets and schema files." 
] }, { "cell_type": "code", - "execution_count": 18, - "id": "cb870461-6ac2-49b2-ba6a-2da6ecb57f1d", + "execution_count": 20, + "id": "df72a793-194b-44f4-80c3-aaa368a9a01e", "metadata": {}, "outputs": [], "source": [ - "# define train and valid dataset objects\n", - "train = Dataset(os.path.join(output_path, \"train\", \"*.parquet\"), part_size=\"500MB\")\n", - "valid = Dataset(os.path.join(output_path, \"valid\", \"*.parquet\"), part_size=\"500MB\")\n", - "\n", - "# define schema object\n", - "schema = train.schema.without(['user_id_raw', 'item_id_raw'])" + "output_path2 = os.path.join(DATA_FOLDER, \"processed/retrieval\")" ] }, { "cell_type": "code", - "execution_count": 19, - "id": "30e4ebc2", + "execution_count": 21, + "id": "251d4697-8f9c-4c93-8de4-c3480a8378de", "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "'click'" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", + " warnings.warn(\n", + "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", + " warnings.warn(\n" + ] } ], "source": [ - "target_column = schema.select_by_tag(Tags.TARGET).column_names[0]\n", - "target_column" + "train_tt = Dataset(os.path.join(output_path, \"train\", \"*.parquet\"))\n", + "valid_tt = Dataset(os.path.join(output_path, \"valid\", \"*.parquet\"))" ] }, { "cell_type": "markdown", - "id": "8f68e26b", - "metadata": {}, - "source": [ - "Deep Learning Recommendation Model [(DLRM)](https://arxiv.org/abs/1906.00091) architecture is a popular neural network model originally proposed by Facebook in 2019. The model was introduced as a personalization deep learning model that uses embeddings to process sparse features that represent categorical data and a multilayer perceptron (MLP) to process dense features, then interacts these features explicitly using the statistical techniques proposed in [here](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=5694074). To learn more about DLRM architetcture please visit `Exploring-different-models` [notebook](https://github.com/NVIDIA-Merlin/models/blob/stable/examples/04-Exporting-ranking-models.ipynb) in the Merlin Models GH repo." - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "e4325080", + "id": "ffd7e2ac-a251-49d0-943b-e9272c852ba6", "metadata": {}, - "outputs": [], "source": [ - "model = mm.DLRMModel(\n", - " schema,\n", - " embedding_dim=64,\n", - " bottom_block=mm.MLPBlock([128, 64]),\n", - " top_block=mm.MLPBlock([128, 64, 32]),\n", - " prediction_tasks=mm.BinaryClassificationTask(target_column),\n", - ")" + "We select only positive interaction rows where `click==1` in the dataset with `Filter()` operator." 
] }, { "cell_type": "code", - "execution_count": 21, - "id": "bfe2aa9e", + "execution_count": 22, + "id": "7e085a6d-74ad-4c24-8e7c-4e449c15f471", "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "5/5 [==============================] - 9s 519ms/step - loss: 0.6932 - auc: 0.5008 - regularization_loss: 0.0000e+00 - loss_batch: 0.6931 - val_loss: 0.6932 - val_auc: 0.5034 - val_regularization_loss: 0.0000e+00 - val_loss_batch: 0.6932\n" + "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", + " warnings.warn(\n", + "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", + " warnings.warn(\n", + "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", + " warnings.warn(\n" ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "model.compile(optimizer=\"adam\", run_eagerly=False, metrics=[tf.keras.metrics.AUC()])\n", - "model.fit(train, validation_data=valid, batch_size=16 * 1024)" + "inputs = train_tt.schema.column_names\n", + "outputs = inputs >> Filter(f=lambda df: df[\"click\"] == 1)\n", + "\n", + "workflow2 = nvt.Workflow(outputs)\n", + "\n", + "workflow2.fit(train_tt)\n", + "\n", + "workflow2.transform(train_tt).to_parquet(\n", + " output_path=os.path.join(output_path2, \"train\")\n", + ")\n", + "\n", + "workflow2.transform(valid_tt).to_parquet(\n", + " output_path=os.path.join(output_path2, \"valid\")\n", + ")" ] }, { "cell_type": "markdown", - "id": "498c4d49-7a59-4260-87b9-b86b66f2c67f", + "id": "cc4721ae-7228-4d3f-9586-dcdfefecc19f", "metadata": {}, "source": [ - "Let's save our DLRM model to be able to load back at the deployment stage. " + "NVTabular exported the schema file, `schema.pbtxt` a protobuf text file, of our processed dataset. To learn more about the schema object and schema file you can explore [02-Merlin-Models-and-NVTabular-integration.ipynb](https://github.com/NVIDIA-Merlin/models/blob/stable/examples/02-Merlin-Models-and-NVTabular-integration.ipynb) notebook." 
] }, { - "cell_type": "code", - "execution_count": 22, - "id": "00447c12-ea80-4d98-ab47-cc1a982a6958", + "cell_type": "markdown", + "id": "aa025b80-0f18-437c-a85f-4edcb89f4222", "metadata": {}, - "outputs": [], "source": [ - "model.save(os.path.join(BASE_DIR, \"dlrm\"))" + "**Read filtered parquet files as Dataset objects.**" ] }, { - "cell_type": "markdown", - "id": "d64a3f3f-81d8-489c-835f-c62f76df22d5", + "cell_type": "code", + "execution_count": 23, + "id": "252a8e60-b447-46b5-ade6-3557cbafa797", "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", + " warnings.warn(\n", + "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", + " warnings.warn(\n" + ] + } + ], "source": [ - "In the following cells we are going to export the required user and item features files, and save the query (user) tower model and item embeddings to disk. If you want to read more about exporting retrieval models, please visit [05-Retrieval-Model.ipynb](https://github.com/NVIDIA-Merlin/models/blob/stable/examples/05-Retrieval-Model.ipynb) notebook in Merlin Models library repo." + "train_tt = Dataset(os.path.join(output_path2, \"train\", \"*.parquet\"), part_size=\"500MB\")\n", + "valid_tt = Dataset(os.path.join(output_path2, \"valid\", \"*.parquet\"), part_size=\"500MB\")" ] }, { - "cell_type": "markdown", - "id": "5da1f434-f5a1-4478-b588-7e7ec17e6a88", + "cell_type": "code", + "execution_count": 24, + "id": "71063653-2f39-4b54-8399-145d6f281d4d", "metadata": {}, + "outputs": [], "source": [ - "### Set up a feature store with Feast" + "schema = train_tt.schema.select_by_tag([Tags.ITEM_ID, Tags.USER_ID, Tags.ITEM, Tags.USER]).without(['user_id_raw', 'item_id_raw', 'click'])\n", + "train_tt.schema = schema\n", + "valid_tt.schema = schema" ] }, { - "cell_type": "markdown", - "id": "99a4e939-d3cf-44f0-9012-d2af3264ee25", + "cell_type": "code", + "execution_count": 25, + "id": "9312511a-f368-42f2-93d2-eb95aebbf46c", "metadata": {}, + "outputs": [], "source": [ - "Before we move onto the next step, we need to create a Feast feature repository. [Feast](https://feast.dev/) is an end-to-end open source feature store for machine learning. Feast (Feature Store) is a customizable operational data system that re-uses existing infrastructure to manage and serve machine learning features to real-time models.\n", - "\n", - "We will create the feature repo in the current working directory, which is `BASE_DIR` for us." 
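    Once the processed data is read back, the schema travels with the `Dataset` object, so the tagged columns can be inspected directly. A small illustrative check (the printed column names depend on the synthetic data):

    ```python
    import os

    from merlin.io.dataset import Dataset
    from merlin.schema.tags import Tags

    # illustrative only: inspect the schema NVTabular attached to the processed files
    ds = Dataset(os.path.join(output_path2, "train", "*.parquet"))
    print(ds.schema.select_by_tag(Tags.USER_ID).column_names)  # e.g. ['user_id']
    print(ds.schema.select_by_tag(Tags.ITEM).column_names)     # item-tagged features
    ```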
+ "model_tt = mm.TwoTowerModel(\n", + " schema,\n", + " query_tower=mm.MLPBlock([128, 64], no_activation_last_layer=True),\n", + " samplers=[mm.InBatchSampler()],\n", + " embedding_options=mm.EmbeddingOptions(infer_embedding_sizes=True),\n", + ")" ] }, { "cell_type": "code", - "execution_count": 23, - "id": "2e7e96d2-9cd2-40d1-b356-8cd76b57bb4a", + "execution_count": 26, + "id": "4d47cb8b-e06a-4932-9a19-fb244ef43152", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.8/dist-packages/keras/initializers/initializers_v2.py:120: UserWarning: The initializer TruncatedNormal is unseeded and being called multiple times, which will return identical values each time (even if the initializer is unseeded). Please update your code to provide a seed to the initializer, or avoid using the same initalizer instance more than once.\n", + " warnings.warn(\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "\n", - "Creating a new Feast repository in \u001b[1m\u001b[32m/Merlin/examples/Building-and-deploying-multi-stage-RecSys/feast_repo\u001b[0m.\n", - "\n" + "5/5 [==============================] - 13s 1s/step - loss: 8.9092 - recall_at_10: 0.0076 - ndcg_at_10: 0.0058 - regularization_loss: 0.0000e+00 - loss_batch: 8.5704 - val_loss: 8.9050 - val_recall_at_10: 0.0121 - val_ndcg_at_10: 0.0097 - val_regularization_loss: 0.0000e+00 - val_loss_batch: 8.7986\n" ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "!rm -rf $BASE_DIR/feast_repo\n", - "!cd $BASE_DIR && feast init feast_repo" + "model_tt.compile(\n", + " optimizer=\"adam\",\n", + " run_eagerly=False,\n", + " loss=\"categorical_crossentropy\",\n", + " metrics=[mm.RecallAt(10), mm.NDCGAt(10)],\n", + ")\n", + "model_tt.fit(train_tt, validation_data=valid_tt, batch_size=1024 * 8, epochs=1)" ] }, { "cell_type": "markdown", - "id": "5e630e53-8336-487a-9ceb-133b1538acfb", + "id": "80d83007-f9e8-408f-9f65-a0e9e19cb586", "metadata": {}, "source": [ - "You should be seeing a message like Creating a new Feast repository in ... printed out above. Now, navigate to the `feature_repo` folder and remove the demo parquet file created by default, and `examples.py` file." + "### Exporting query (user) model" + ] + }, + { + "cell_type": "markdown", + "id": "22af58a9-5525-454a-bf25-a9df0462aa53", + "metadata": {}, + "source": [ + "We export the query tower to use it later during the model deployment stage with Merlin Systems." 
] }, { "cell_type": "code", - "execution_count": 24, - "id": "26ba2521-ed1b-4c2b-afdd-26b4a5a9c008", + "execution_count": 27, + "id": "d2370f13-ff9a-4ee0-ba1e-451c7bec0f8a", "metadata": {}, "outputs": [], "source": [ - "feature_repo_path = os.path.join(BASE_DIR, \"feast_repo/feature_repo\")\n", - "if os.path.exists(f\"{feature_repo_path}/example_repo.py\"):\n", - " os.remove(f\"{feature_repo_path}/example_repo.py\")\n", - "if os.path.exists(f\"{feature_repo_path}/data/driver_stats.parquet\"):\n", - " os.remove(f\"{feature_repo_path}/data/driver_stats.parquet\")" + "query_tower = model_tt.retrieval_block.query_block()\n", + "query_tower.save(os.path.join(BASE_DIR, \"query_tower\"))" ] }, { "cell_type": "markdown", - "id": "78315676-eb6c-405a-b1fd-3174ea328406", + "id": "e16401d4", + "metadata": { + "tags": [] + }, + "source": [ + "### Training a Ranking Model with DLRM" + ] + }, + { + "cell_type": "markdown", + "id": "b72e8a2a-fc4a-43ab-934c-6d941c56aad2", "metadata": {}, "source": [ - "### Exporting user and item features" + "Now we will move onto training an offline ranking model. This ranking model will be used for scoring our retrieved items." ] }, { - "cell_type": "code", - "execution_count": 25, - "id": "ea0b369c-2f01-42e3-9f3c-74c3ff4a6d64", + "cell_type": "markdown", + "id": "c4f2b234", "metadata": {}, - "outputs": [], "source": [ - "from merlin.models.utils.dataset import unique_rows_by_features\n", - "\n", - "user_features = (\n", - " unique_rows_by_features(train, Tags.USER, Tags.USER_ID)\n", - " .compute()\n", - " .reset_index(drop=True)\n", - ")" + "Read processed parquet files. We use the `schema` object to define our model." ] }, { "cell_type": "code", - "execution_count": 26, - "id": "6b0949f9-e67a-414f-9d74-65f138e820a8", + "execution_count": 28, + "id": "cb870461-6ac2-49b2-ba6a-2da6ecb57f1d", "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
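    At deployment time the exported tower is consumed as a TensorFlow SavedModel directory by the serving ensemble built in the next notebook. As a purely illustrative sanity check, once the export cell below has run, the artifact can also be located and reloaded directly (assuming the default SavedModel layout):

    ```python
    import os
    import tensorflow as tf

    query_tower_path = os.path.join(BASE_DIR, "query_tower")

    # the export below produces a standard SavedModel directory
    assert os.path.exists(os.path.join(query_tower_path, "saved_model.pb"))

    # it can be reloaded as a regular Keras model if needed for offline checks
    reloaded_query_tower = tf.keras.models.load_model(query_tower_path)
    ```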
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_iduser_shopsuser_profileuser_groupuser_genderuser_ageuser_consumption_2user_is_occupieduser_geographyuser_intentionsuser_brandsuser_categoriesuser_id_raw
03333333333336
14433333334448
25533333335557
36633333336665
47733333337779
\n", - "
" - ], + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", + " warnings.warn(\n", + "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "# define train and valid dataset objects\n", + "train = Dataset(os.path.join(output_path, \"train\", \"*.parquet\"), part_size=\"500MB\")\n", + "valid = Dataset(os.path.join(output_path, \"valid\", \"*.parquet\"), part_size=\"500MB\")\n", + "\n", + "# define schema object\n", + "schema = train.schema.without(['user_id_raw', 'item_id_raw'])" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "30e4ebc2", + "metadata": {}, + "outputs": [ + { + "data": { "text/plain": [ - " user_id user_shops user_profile user_group user_gender user_age \\\n", - "0 3 3 3 3 3 3 \n", - "1 4 4 3 3 3 3 \n", - "2 5 5 3 3 3 3 \n", - "3 6 6 3 3 3 3 \n", - "4 7 7 3 3 3 3 \n", - "\n", - " user_consumption_2 user_is_occupied user_geography user_intentions \\\n", - "0 3 3 3 3 \n", - "1 3 3 3 4 \n", - "2 3 3 3 5 \n", - "3 3 3 3 6 \n", - "4 3 3 3 7 \n", - "\n", - " user_brands user_categories user_id_raw \n", - "0 3 3 6 \n", - "1 4 4 8 \n", - "2 5 5 7 \n", - "3 6 6 5 \n", - "4 7 7 9 " + "'click'" ] }, - "execution_count": 26, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "user_features.head()" + "target_column = schema.select_by_tag(Tags.TARGET).column_names[0]\n", + "target_column" ] }, { "cell_type": "markdown", - "id": "4a46bd8c-1337-4c74-a85b-25348a897d90", + "id": "8f68e26b", "metadata": {}, "source": [ - "We will artificially add `datetime` and `created` timestamp columns to our user_features dataframe. This required by Feast to track the user-item features and their creation time and to determine which version to use when we query Feast." + "Deep Learning Recommendation Model [(DLRM)](https://arxiv.org/abs/1906.00091) architecture is a popular neural network model originally proposed by Facebook in 2019. The model was introduced as a personalization deep learning model that uses embeddings to process sparse features that represent categorical data and a multilayer perceptron (MLP) to process dense features, then interacts these features explicitly using the statistical techniques proposed in [here](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=5694074). To learn more about DLRM architetcture please visit `Exploring-different-models` [notebook](https://github.com/NVIDIA-Merlin/models/blob/stable/examples/04-Exporting-ranking-models.ipynb) in the Merlin Models GH repo." 
] }, { "cell_type": "code", - "execution_count": 27, - "id": "d30bd2f8-8a78-4df7-9bc4-42bd741c5b99", + "execution_count": 30, + "id": "e4325080", "metadata": {}, "outputs": [], "source": [ - "from datetime import datetime\n", - "\n", - "user_features[\"datetime\"] = datetime.now()\n", - "user_features[\"datetime\"] = user_features[\"datetime\"].astype(\"datetime64[ns]\")\n", - "user_features[\"created\"] = datetime.now()\n", - "user_features[\"created\"] = user_features[\"created\"].astype(\"datetime64[ns]\")" + "model = mm.DLRMModel(\n", + " schema,\n", + " embedding_dim=64,\n", + " bottom_block=mm.MLPBlock([128, 64]),\n", + " top_block=mm.MLPBlock([128, 64, 32]),\n", + " prediction_tasks=mm.BinaryClassificationTask(target_column),\n", + ")" ] }, { "cell_type": "code", - "execution_count": 28, - "id": "d4998cd1-9dcd-4911-8f23-372e197b41e9", + "execution_count": 31, + "id": "bfe2aa9e", "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "5/5 [==============================] - 5s 271ms/step - loss: 0.6932 - auc: 0.4989 - regularization_loss: 0.0000e+00 - loss_batch: 0.6932 - val_loss: 0.6931 - val_auc: 0.4994 - val_regularization_loss: 0.0000e+00 - val_loss_batch: 0.6932\n" + ] + }, { "data": { - "text/html": [ - "
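    The distinguishing piece of DLRM is the explicit pairwise interaction between the per-feature embedding vectors. A toy, self-contained illustration of that interaction step (not part of the Merlin API, just the underlying idea the paragraph describes):

    ```python
    import tensorflow as tf

    # one embedding vector per feature, for a small toy batch
    batch_size, num_features, emb_dim = 2, 4, 8
    embeddings = tf.random.normal((batch_size, num_features, emb_dim))

    # all pairwise dot products between feature embeddings -> (batch, features, features)
    interactions = tf.matmul(embeddings, embeddings, transpose_b=True)

    # DLRM keeps the unique pairs (upper triangle), flattens them, and concatenates
    # them with the dense features before the top MLP
    ```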
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_iduser_shopsuser_profileuser_groupuser_genderuser_ageuser_consumption_2user_is_occupieduser_geographyuser_intentionsuser_brandsuser_categoriesuser_id_rawdatetimecreated
033333333333362023-06-20 23:47:09.4366672023-06-20 23:47:09.438518
144333333344482023-06-20 23:47:09.4366672023-06-20 23:47:09.438518
255333333355572023-06-20 23:47:09.4366672023-06-20 23:47:09.438518
366333333366652023-06-20 23:47:09.4366672023-06-20 23:47:09.438518
477333333377792023-06-20 23:47:09.4366672023-06-20 23:47:09.438518
\n", - "
" - ], "text/plain": [ - " user_id user_shops user_profile user_group user_gender user_age \\\n", - "0 3 3 3 3 3 3 \n", - "1 4 4 3 3 3 3 \n", - "2 5 5 3 3 3 3 \n", - "3 6 6 3 3 3 3 \n", - "4 7 7 3 3 3 3 \n", - "\n", - " user_consumption_2 user_is_occupied user_geography user_intentions \\\n", - "0 3 3 3 3 \n", - "1 3 3 3 4 \n", - "2 3 3 3 5 \n", - "3 3 3 3 6 \n", - "4 3 3 3 7 \n", - "\n", - " user_brands user_categories user_id_raw datetime \\\n", - "0 3 3 6 2023-06-20 23:47:09.436667 \n", - "1 4 4 8 2023-06-20 23:47:09.436667 \n", - "2 5 5 7 2023-06-20 23:47:09.436667 \n", - "3 6 6 5 2023-06-20 23:47:09.436667 \n", - "4 7 7 9 2023-06-20 23:47:09.436667 \n", - "\n", - " created \n", - "0 2023-06-20 23:47:09.438518 \n", - "1 2023-06-20 23:47:09.438518 \n", - "2 2023-06-20 23:47:09.438518 \n", - "3 2023-06-20 23:47:09.438518 \n", - "4 2023-06-20 23:47:09.438518 " + "" ] }, - "execution_count": 28, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "user_features.head()" + "model.compile(optimizer=\"adam\", run_eagerly=False, metrics=[tf.keras.metrics.AUC()])\n", + "model.fit(train, validation_data=valid, batch_size=16 * 1024)" ] }, { - "cell_type": "code", - "execution_count": 29, - "id": "2981b3ed-6156-49f0-aa14-326a3853a58a", + "cell_type": "markdown", + "id": "498c4d49-7a59-4260-87b9-b86b66f2c67f", "metadata": {}, - "outputs": [], "source": [ - "user_features.to_parquet(os.path.join(feature_repo_path, \"data\", \"user_features.parquet\"))" + "Let's save our DLRM model to be able to load back at the deployment stage. " ] }, { "cell_type": "code", - "execution_count": 30, - "id": "0a33a668-8e2a-4546-8f54-0060d405ba91", + "execution_count": 32, + "id": "00447c12-ea80-4d98-ab47-cc1a982a6958", "metadata": {}, "outputs": [], "source": [ - "item_features = (\n", - " unique_rows_by_features(train, Tags.ITEM, Tags.ITEM_ID)\n", - " .compute()\n", - " .reset_index(drop=True)\n", - ")" + "model.save(os.path.join(BASE_DIR, \"dlrm\"))" ] }, { - "cell_type": "code", - "execution_count": 31, - "id": "68a694d6-926f-4b0f-8edc-8cc7ac85ade7", + "cell_type": "markdown", + "id": "d64a3f3f-81d8-489c-835f-c62f76df22d5", "metadata": {}, - "outputs": [], "source": [ - "item_features[\"datetime\"] = datetime.now()\n", - "item_features[\"datetime\"] = item_features[\"datetime\"].astype(\"datetime64[ns]\")\n", - "item_features[\"created\"] = datetime.now()\n", - "item_features[\"created\"] = item_features[\"created\"].astype(\"datetime64[ns]\")" + "In the following cells we are going to export the required user and item features files, and save the query (user) tower model and item embeddings to disk. If you want to read more about exporting retrieval models, please visit [05-Retrieval-Model.ipynb](https://github.com/NVIDIA-Merlin/models/blob/stable/examples/05-Retrieval-Model.ipynb) notebook in Merlin Models library repo." ] }, { - "cell_type": "code", - "execution_count": 32, - "id": "6c03fa22-b112-4243-bbe1-1cd7260cb85b", + "cell_type": "markdown", + "id": "ff30ceab-b264-4509-9c5b-5a10425e143b", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
item_iditem_categoryitem_shopitem_branditem_id_rawdatetimecreated
0333362023-06-20 23:47:09.5577932023-06-20 23:47:09.559325
1444472023-06-20 23:47:09.5577932023-06-20 23:47:09.559325
25555102023-06-20 23:47:09.5577932023-06-20 23:47:09.559325
3666682023-06-20 23:47:09.5577932023-06-20 23:47:09.559325
4777752023-06-20 23:47:09.5577932023-06-20 23:47:09.559325
\n", - "
" - ], - "text/plain": [ - " item_id item_category item_shop item_brand item_id_raw \\\n", - "0 3 3 3 3 6 \n", - "1 4 4 4 4 7 \n", - "2 5 5 5 5 10 \n", - "3 6 6 6 6 8 \n", - "4 7 7 7 7 5 \n", - "\n", - " datetime created \n", - "0 2023-06-20 23:47:09.557793 2023-06-20 23:47:09.559325 \n", - "1 2023-06-20 23:47:09.557793 2023-06-20 23:47:09.559325 \n", - "2 2023-06-20 23:47:09.557793 2023-06-20 23:47:09.559325 \n", - "3 2023-06-20 23:47:09.557793 2023-06-20 23:47:09.559325 \n", - "4 2023-06-20 23:47:09.557793 2023-06-20 23:47:09.559325 " - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "item_features.head()" + "### Extract and save Item embeddings" ] }, { "cell_type": "code", "execution_count": 33, - "id": "c312884b-a1f8-4e08-8068-696e06a9bf46", - "metadata": {}, - "outputs": [], - "source": [ - "# save to disk\n", - "item_features.to_parquet(\n", - " os.path.join(feature_repo_path, \"data\", \"item_features.parquet\")\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "ff30ceab-b264-4509-9c5b-5a10425e143b", + "id": "e62f65f8-e8f1-447e-9500-5960807c36f2", "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", + " warnings.warn(\n", + "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", + " warnings.warn(\n" + ] + } + ], "source": [ - "### Extract and save Item embeddings" + "nvt_wkflow = nvt.Workflow.load(output_path + \"/workflow\")\n", + "cat_wkflow = nvt_wkflow.get_subworkflow(\"items_cat\")\n", + "item_features_ds = Dataset(item_features, schema=schema)\n", + "item_feature_cat_ds = cat_wkflow.transform(item_features_ds).to_ddf().compute()" ] }, { "cell_type": "code", "execution_count": 34, - "id": "00f1fe65-882e-4962-bb16-19a130fda215", + "id": "6a4848a7-aa4f-4f8a-8b40-6c8458ac4fcd", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", + " warnings.warn(\n", + "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", + " warnings.warn(\n", + "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", + " warnings.warn(\n", + "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", + " warnings.warn(\n", + "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", + " warnings.warn(\n", + "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", + 
" warnings.warn(\n" + ] + } + ], "source": [ "item_embs = model_tt.item_embeddings(\n", - " Dataset(item_features, schema=schema), batch_size=1024\n", + " Dataset(item_feature_cat_ds, schema=schema), batch_size=1024\n", ")\n", - "item_embs_df = item_embs.compute(scheduler=\"synchronous\")" + "item_embs_df = item_embs.compute(scheduler=\"synchronous\")\n", + "item_embs_df[\"item_id\"] = item_features[\"item_id\"]" ] }, { @@ -1409,123 +1303,123 @@ " \n", " \n", " 0\n", - " 3\n", - " -0.055164\n", - " -0.018832\n", - " -0.009478\n", - " -0.016874\n", - " 0.015988\n", - " -0.022928\n", - " 0.022611\n", - " -0.030984\n", - " -0.045701\n", + " 29\n", + " -0.004295\n", + " -0.032384\n", + " -0.044917\n", + " 0.039122\n", + " -0.016758\n", + " -0.059473\n", + " -0.007811\n", + " 0.004419\n", + " -0.044857\n", " ...\n", - " 0.007060\n", - " 0.032204\n", - " 0.011515\n", - " 0.012811\n", - " 0.002650\n", - " 0.023448\n", - " 0.021759\n", - " -0.011316\n", - " -0.035275\n", - " -0.004572\n", + " 0.019839\n", + " 0.031614\n", + " 0.066616\n", + " -0.023459\n", + " 0.039532\n", + " -0.025300\n", + " 0.002040\n", + " 0.010800\n", + " -0.031893\n", + " 0.009897\n", " \n", " \n", " 1\n", " 4\n", - " -0.027412\n", - " -0.007417\n", - " -0.023730\n", - " -0.008385\n", - " 0.028241\n", - " -0.004143\n", - " 0.001301\n", - " -0.040613\n", - " -0.020645\n", + " 0.007210\n", + " -0.004949\n", + " -0.021168\n", + " 0.039533\n", + " -0.004339\n", + " -0.026979\n", + " 0.018726\n", + " -0.034300\n", + " -0.010744\n", " ...\n", - " 0.001835\n", - " 0.010697\n", - " 0.006311\n", - " 0.007290\n", - " -0.014959\n", - " 0.025217\n", - " 0.041697\n", - " -0.012126\n", - " -0.022523\n", - " -0.001903\n", + " 0.021441\n", + " -0.008866\n", + " 0.018915\n", + " 0.001428\n", + " 0.007287\n", + " 0.003946\n", + " -0.029646\n", + " -0.023998\n", + " -0.021912\n", + " 0.005516\n", " \n", " \n", " 2\n", - " 5\n", - " -0.009581\n", - " 0.016263\n", - " -0.027931\n", - " -0.023079\n", - " 0.006483\n", - " 0.006133\n", - " -0.027449\n", - " 0.027797\n", - " 0.045743\n", + " 17\n", + " 0.034115\n", + " -0.007572\n", + " -0.045769\n", + " 0.038766\n", + " -0.018994\n", + " -0.003735\n", + " -0.013748\n", + " 0.003397\n", + " 0.018028\n", " ...\n", - " -0.003662\n", - " 0.054940\n", - " 0.013501\n", - " -0.004127\n", - " -0.001858\n", - " -0.000462\n", - " -0.018047\n", - " 0.036427\n", - " 0.009524\n", - " 0.006689\n", + " 0.002467\n", + " 0.029187\n", + " -0.023114\n", + " 0.007315\n", + " 0.001796\n", + " 0.013247\n", + " 0.011309\n", + " -0.004574\n", + " -0.011722\n", + " 0.004382\n", " \n", " \n", " 3\n", - " 6\n", - " -0.007599\n", - " -0.012074\n", - " 0.024879\n", - " -0.008080\n", - " -0.025010\n", - " -0.000266\n", - " 0.005489\n", - " -0.014263\n", - " -0.019343\n", + " 155\n", + " -0.014619\n", + " -0.001738\n", + " -0.006829\n", + " 0.019568\n", + " -0.025870\n", + " -0.043351\n", + " 0.007577\n", + " -0.038977\n", + " -0.015209\n", " ...\n", - " -0.030220\n", - " 0.011863\n", - " -0.008515\n", - " 0.011286\n", - " -0.000907\n", - " 0.014882\n", - " 0.035699\n", - " -0.007068\n", - " 0.012995\n", - " 0.001644\n", + " -0.011659\n", + " 0.011859\n", + " -0.004721\n", + " 0.002480\n", + " 0.040565\n", + " -0.023915\n", + " -0.039050\n", + " -0.013832\n", + " -0.028899\n", + " 0.034076\n", " \n", " \n", " 4\n", - " 7\n", - " -0.070002\n", - " 0.001031\n", - " -0.001309\n", - " -0.014118\n", - " -0.036672\n", - " -0.012943\n", - " 0.009711\n", - " -0.008856\n", - " -0.032054\n", + " 19\n", + " 0.027284\n", + " -0.039710\n", 
+ " -0.013016\n", + " -0.021763\n", + " -0.019920\n", + " -0.019573\n", + " 0.004436\n", + " 0.005504\n", + " -0.018312\n", " ...\n", - " -0.023113\n", - " 0.000600\n", - " -0.005711\n", - " 0.044277\n", - " -0.004765\n", - " 0.016184\n", - " 0.028223\n", - " 0.002914\n", - " 0.032516\n", - " 0.026521\n", + " 0.007124\n", + " 0.005688\n", + " 0.018035\n", + " 0.018919\n", + " 0.020091\n", + " -0.017181\n", + " 0.027977\n", + " -0.032007\n", + " -0.005940\n", + " 0.013642\n", " \n", " \n", "\n", @@ -1534,25 +1428,25 @@ ], "text/plain": [ " item_id 0 1 2 3 4 5 \\\n", - "0 3 -0.055164 -0.018832 -0.009478 -0.016874 0.015988 -0.022928 \n", - "1 4 -0.027412 -0.007417 -0.023730 -0.008385 0.028241 -0.004143 \n", - "2 5 -0.009581 0.016263 -0.027931 -0.023079 0.006483 0.006133 \n", - "3 6 -0.007599 -0.012074 0.024879 -0.008080 -0.025010 -0.000266 \n", - "4 7 -0.070002 0.001031 -0.001309 -0.014118 -0.036672 -0.012943 \n", + "0 29 -0.004295 -0.032384 -0.044917 0.039122 -0.016758 -0.059473 \n", + "1 4 0.007210 -0.004949 -0.021168 0.039533 -0.004339 -0.026979 \n", + "2 17 0.034115 -0.007572 -0.045769 0.038766 -0.018994 -0.003735 \n", + "3 155 -0.014619 -0.001738 -0.006829 0.019568 -0.025870 -0.043351 \n", + "4 19 0.027284 -0.039710 -0.013016 -0.021763 -0.019920 -0.019573 \n", "\n", " 6 7 8 ... 54 55 56 57 \\\n", - "0 0.022611 -0.030984 -0.045701 ... 0.007060 0.032204 0.011515 0.012811 \n", - "1 0.001301 -0.040613 -0.020645 ... 0.001835 0.010697 0.006311 0.007290 \n", - "2 -0.027449 0.027797 0.045743 ... -0.003662 0.054940 0.013501 -0.004127 \n", - "3 0.005489 -0.014263 -0.019343 ... -0.030220 0.011863 -0.008515 0.011286 \n", - "4 0.009711 -0.008856 -0.032054 ... -0.023113 0.000600 -0.005711 0.044277 \n", + "0 -0.007811 0.004419 -0.044857 ... 0.019839 0.031614 0.066616 -0.023459 \n", + "1 0.018726 -0.034300 -0.010744 ... 0.021441 -0.008866 0.018915 0.001428 \n", + "2 -0.013748 0.003397 0.018028 ... 0.002467 0.029187 -0.023114 0.007315 \n", + "3 0.007577 -0.038977 -0.015209 ... -0.011659 0.011859 -0.004721 0.002480 \n", + "4 0.004436 0.005504 -0.018312 ... 
0.007124 0.005688 0.018035 0.018919 \n", "\n", " 58 59 60 61 62 63 \n", - "0 0.002650 0.023448 0.021759 -0.011316 -0.035275 -0.004572 \n", - "1 -0.014959 0.025217 0.041697 -0.012126 -0.022523 -0.001903 \n", - "2 -0.001858 -0.000462 -0.018047 0.036427 0.009524 0.006689 \n", - "3 -0.000907 0.014882 0.035699 -0.007068 0.012995 0.001644 \n", - "4 -0.004765 0.016184 0.028223 0.002914 0.032516 0.026521 \n", + "0 0.039532 -0.025300 0.002040 0.010800 -0.031893 0.009897 \n", + "1 0.007287 0.003946 -0.029646 -0.023998 -0.021912 0.005516 \n", + "2 0.001796 0.013247 0.011309 -0.004574 -0.011722 0.004382 \n", + "3 0.040565 -0.023915 -0.039050 -0.013832 -0.028899 0.034076 \n", + "4 0.020091 -0.017181 0.027977 -0.032007 -0.005940 0.013642 \n", "\n", "[5 rows x 65 columns]" ] @@ -1614,11 +1508,11 @@ " created_timestamp_column=\"created\",\n", ")\n", "\n", - "user_raw = Entity(name=\"user_id_raw\", value_type=ValueType.INT32, join_keys=[\"user_id_raw\"],)\n", + "user = Entity(name=\"user_id\", value_type=ValueType.INT32, join_keys=[\"user_id\"],)\n", "\n", "user_features_view = FeatureView(\n", " name=\"user_features\",\n", - " entities=[user_raw],\n", + " entities=[user],\n", " ttl=timedelta(0),\n", " schema=[\n", " Field(name=\"user_shops\", dtype=Int32),\n", @@ -1632,7 +1526,6 @@ " Field(name=\"user_intentions\", dtype=Int32),\n", " Field(name=\"user_brands\", dtype=Int32),\n", " Field(name=\"user_categories\", dtype=Int32),\n", - " Field(name=\"user_id\", dtype=Int32),\n", " ],\n", " online=True,\n", " source=user_features,\n", @@ -1676,7 +1569,6 @@ " Field(name=\"item_category\", dtype=Int32),\n", " Field(name=\"item_shop\", dtype=Int32),\n", " Field(name=\"item_brand\", dtype=Int32),\n", - " Field(name=\"item_id_raw\", dtype=Int32),\n", " ],\n", " online=True,\n", " source=item_features,\n", @@ -1749,8 +1641,9 @@ "source": [ "import seedir as sd\n", "\n", + "feature_repo_path = os.path.join(BASE_DIR, \"feast_repo\")\n", "sd.seedir(\n", - " os.path.join(BASE_DIR, \"feast_repo\"),\n", + " feature_repo_path,\n", " style=\"lines\",\n", " itemlimit=10,\n", " depthlimit=3,\n", @@ -1769,6 +1662,14 @@ "\n", "For the next step, move on to the `02-Deploying-multi-stage-Recsys-with-Merlin-Systems.ipynb` notebook to deploy our saved models as an ensemble to TIS and obtain prediction results for a given request." ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c5bd646-8121-4f32-bff8-137d50e3b8a2", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/examples/Building-and-deploying-multi-stage-RecSys/02-Deploying-multi-stage-RecSys-with-Merlin-Systems.ipynb b/examples/Building-and-deploying-multi-stage-RecSys/02-Deploying-multi-stage-RecSys-with-Merlin-Systems.ipynb index e2b0e5470..965509a13 100644 --- a/examples/Building-and-deploying-multi-stage-RecSys/02-Deploying-multi-stage-RecSys-with-Merlin-Systems.ipynb +++ b/examples/Building-and-deploying-multi-stage-RecSys/02-Deploying-multi-stage-RecSys-with-Merlin-Systems.ipynb @@ -7,7 +7,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Copyright 2021 NVIDIA Corporation. All Rights Reserved.\n", + "# Copyright 2023 NVIDIA Corporation. 
All Rights Reserved.\n", "#\n", "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", "# you may not use this file except in compliance with the License.\n", @@ -75,11 +75,11 @@ "\n", "In case you need to install them for running this example on GPU, execute the following script in a cell.\n", "```\n", - "%pip install \"feast==0.31\" faiss-gpu\n", + "%pip install \"feast<0.31\" faiss-gpu\n", "```\n", "or the following script in a cell for CPU.\n", "```\n", - "%pip install tensorflow-cpu \"feast==0.31\" faiss-cpu\n", + "%pip install tensorflow-cpu \"feast<0.31\" faiss-cpu\n", "```" ] }, @@ -93,13 +93,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/lib/python3.8/dist-packages/cudf/utils/metadata/orc_column_statistics_pb2.py:19: DeprecationWarning: Call to deprecated create function FileDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool.\n", - " DESCRIPTOR = _descriptor.FileDescriptor(\n", - "/usr/local/lib/python3.8/dist-packages/cudf/utils/metadata/orc_column_statistics_pb2.py:37: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool.\n", - " _descriptor.FieldDescriptor(\n", - "/usr/local/lib/python3.8/dist-packages/cudf/utils/metadata/orc_column_statistics_pb2.py:30: DeprecationWarning: Call to deprecated create function Descriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool.\n", - " _INTEGERSTATISTICS = _descriptor.Descriptor(\n", - "2023-06-20 23:49:49.177129: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", + "2023-06-21 21:37:09.169418: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", "/usr/local/lib/python3.8/dist-packages/tensorflow/core/framework/tensor_shape_pb2.py:18: DeprecationWarning: Call to deprecated create function FileDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool.\n", " DESCRIPTOR = _descriptor.FileDescriptor(\n", @@ -121,10 +115,28 @@ " DESCRIPTOR = _descriptor.FileDescriptor(\n", "/usr/local/lib/python3.8/dist-packages/tensorflow/core/framework/resource_handle_pb2.py:39: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool.\n", " _descriptor.FieldDescriptor(\n", + "/usr/local/lib/python3.8/dist-packages/tensorflow/core/framework/resource_handle_pb2.py:32: DeprecationWarning: Call to deprecated create function Descriptor(). Note: Create unlinked descriptors is going to go away. 
Please use get/find descriptors from generated code or query the descriptor_pool.\n", + " _RESOURCEHANDLEPROTO_DTYPEANDSHAPE = _descriptor.Descriptor(\n", + "/usr/local/lib/python3.8/dist-packages/tensorflow/core/framework/tensor_pb2.py:21: DeprecationWarning: Call to deprecated create function FileDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool.\n", + " DESCRIPTOR = _descriptor.FileDescriptor(\n", + "/usr/local/lib/python3.8/dist-packages/tensorflow/core/framework/tensor_pb2.py:40: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool.\n", + " _descriptor.FieldDescriptor(\n", + "/usr/local/lib/python3.8/dist-packages/tensorflow/core/framework/tensor_pb2.py:33: DeprecationWarning: Call to deprecated create function Descriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool.\n", + " _TENSORPROTO = _descriptor.Descriptor(\n", + "/usr/local/lib/python3.8/dist-packages/tensorflow/core/framework/attr_value_pb2.py:21: DeprecationWarning: Call to deprecated create function FileDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool.\n", + " DESCRIPTOR = _descriptor.FileDescriptor(\n", + "/usr/local/lib/python3.8/dist-packages/tensorflow/core/framework/attr_value_pb2.py:40: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool.\n", + " _descriptor.FieldDescriptor(\n", "/usr/local/lib/python3.8/dist-packages/merlin/dtypes/mappings/torch.py:43: UserWarning: PyTorch dtype mappings did not load successfully due to an error: No module named 'torch'\n", " warn(f\"PyTorch dtype mappings did not load successfully due to an error: {exc.msg}\")\n", "/usr/local/lib/python3.8/dist-packages/nvtabular/loader/__init__.py:19: DeprecationWarning: The `nvtabular.loader` module has moved to a new repository, at https://github.com/NVIDIA-Merlin/dataloader . Support for importing from `nvtabular.loader` is deprecated, and will be removed in a future version. 
Please update your imports to refer to `merlinloader`.\n", - " warnings.warn(\n" + " warnings.warn(\n", + "2023-06-21 21:37:11.305888: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected\n", + "2023-06-21 21:37:11.305925: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: 2dca7910ae98\n", + "2023-06-21 21:37:11.305933: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: 2dca7910ae98\n", + "2023-06-21 21:37:11.306011: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 525.85.12\n", + "2023-06-21 21:37:11.306030: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 515.65.1\n", + "2023-06-21 21:37:11.306037: E tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:313] kernel version 515.65.1 does not match DSO version 525.85.12 -- cannot find working devices in this configuration\n" ] } ], @@ -140,7 +152,8 @@ "from merlin.systems.dag.ops.softmax_sampling import SoftmaxSampling\n", "from merlin.systems.dag.ops.tensorflow import PredictTensorflow\n", "from merlin.systems.dag.ops.unroll_features import UnrollFeatures\n", - "from merlin.systems.triton.utils import send_triton_request" + "from merlin.systems.triton.utils import send_triton_request\n", + "from merlin.systems.dag.ops.workflow import TransformWorkflow" ] }, { @@ -184,15 +197,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "/Merlin/examples/Building-and-deploying-multi-stage-RecSys/feast_repo/feature_repo\n", - "Created entity \u001b[1m\u001b[32muser_id_raw\u001b[0m\n", - "Created entity \u001b[1m\u001b[32mitem_id\u001b[0m\n", - "Created feature view \u001b[1m\u001b[32muser_features\u001b[0m\n", - "Created feature view \u001b[1m\u001b[32mitem_features\u001b[0m\n", - "\n", - "Created sqlite table \u001b[1m\u001b[32mfeast_repo_item_features\u001b[0m\n", - "Created sqlite table \u001b[1m\u001b[32mfeast_repo_user_features\u001b[0m\n", - "\n" + "/raid/workshared/merlin/examples/Building-and-deploying-multi-stage-RecSys/feast_repo/feature_repo\n", + "\u001b[1m\u001b[94mNo changes to registry\n", + "\u001b[1m\u001b[94mNo changes to infrastructure\n" ] } ], @@ -228,9 +235,9 @@ "Materializing \u001b[1m\u001b[32m2\u001b[0m feature views from \u001b[1m\u001b[32m1995-01-01 01:01:01+00:00\u001b[0m to \u001b[1m\u001b[32m2025-01-01 01:01:01+00:00\u001b[0m into the \u001b[1m\u001b[32msqlite\u001b[0m online store.\n", "\n", "\u001b[1m\u001b[32muser_features\u001b[0m:\n", - "100%|███████████████████████████████████████████████████████████| 456/456 [00:00<00:00, 1136.51it/s]\n", + "100%|███████████████████████████████████████████████████████████| 457/457 [00:00<00:00, 2914.62it/s]\n", "\u001b[1m\u001b[32mitem_features\u001b[0m:\n", - "100%|███████████████████████████████████████████████████████████| 436/436 [00:00<00:00, 2878.99it/s]\n" + "100%|███████████████████████████████████████████████████████████| 451/451 [00:00<00:00, 8542.45it/s]\n" ] } ], @@ -345,7 +352,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "WARNING clustering 436 points to 32 centroids: please provide at least 1248 training points\n" + "WARNING clustering 451 points to 32 centroids: please provide at least 1248 training points\n" ] } ], @@ -394,9 +401,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "Materializing \u001b[1m\u001b[32m1\u001b[0m 
feature views to \u001b[1m\u001b[32m2023-06-20 23:50:04+00:00\u001b[0m into the \u001b[1m\u001b[32msqlite\u001b[0m online store.\n", + "Materializing \u001b[1m\u001b[32m1\u001b[0m feature views to \u001b[1m\u001b[32m2023-06-21 21:37:18+00:00\u001b[0m into the \u001b[1m\u001b[32msqlite\u001b[0m online store.\n", "\n", - "\u001b[1m\u001b[32muser_features\u001b[0m from \u001b[1m\u001b[32m2025-01-01 01:01:01+00:00\u001b[0m to \u001b[1m\u001b[32m2023-06-20 23:50:04+00:00\u001b[0m:\n" + "\u001b[1m\u001b[32muser_features\u001b[0m from \u001b[1m\u001b[32m2025-01-01 01:01:01+00:00\u001b[0m to \u001b[1m\u001b[32m2023-06-21 21:37:18+00:00\u001b[0m:\n" ] }, { @@ -410,14 +417,28 @@ "source": [ "from merlin.systems.dag.ops.feast import QueryFeast \n", "\n", - "user_features = [\"user_id_raw\"] >> QueryFeast.from_feature_view(\n", + "user_attributes = [\"user_id\"] >> QueryFeast.from_feature_view(\n", " store=feature_store,\n", " view=\"user_features\",\n", - " column=\"user_id_raw\",\n", - " include_id=False,\n", + " column=\"user_id\",\n", + " include_id=True,\n", ")" ] }, + { + "cell_type": "code", + "execution_count": 12, + "id": "f11299b6-20d4-4687-bb0e-b855a9bcb9eb", + "metadata": {}, + "outputs": [], + "source": [ + "from nvtabular import Workflow\n", + "\n", + "nvt_workflow = Workflow.load('/workspace/data/processed_nvt/workflow')\n", + "user_subgraph = nvt_workflow.get_subworkflow(\"user\")\n", + "user_features = user_attributes >> TransformWorkflow(user_subgraph)" + ] + }, { "cell_type": "markdown", "id": "27e25be7-3ff0-49c2-a3fc-03ec4d615e77", @@ -428,7 +449,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "id": "21139caa-3a51-42e6-b006-21a92c95f1bc", "metadata": {}, "outputs": [ @@ -438,7 +459,7 @@ "" ] }, - "execution_count": 12, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -447,12 +468,13 @@ "# prevent TF to claim all GPU memory\n", "from merlin.dataloader.tf_utils import configure_tensorflow\n", "\n", + "\n", "configure_tensorflow()" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "id": "47c2d9b1-51dc-4549-977d-d7941ee6486c", "metadata": {}, "outputs": [ @@ -460,24 +482,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-06-20 23:50:06.005776: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", - "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2023-06-20 23:50:09.981326: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 8192 MB memory: -> device: 0, name: Tesla V100-SXM2-16GB-N, pci bus id: 0000:06:00.0, compute capability: 7.0\n", - "WARNING:absl:Found untraced functions such as restored_function_body, restored_function_body, restored_function_body, restored_function_body, restored_function_body while saving (showing 5 of 52). 
These functions will not be directly callable after loading.\n" + "2023-06-21 21:37:19.332291: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "INFO:tensorflow:Assets written to: /tmp/tmp7n9o9yv2/assets\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:tensorflow:Assets written to: /tmp/tmp7n9o9yv2/assets\n" + "WARNING:tensorflow:No training configuration found in save file, so the model was *not* compiled. Compile it manually.\n" ] } ], @@ -502,7 +515,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, "id": "b270f663-0ae1-4356-acd4-5f8c986abf4d", "metadata": {}, "outputs": [ @@ -510,9 +523,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "Materializing \u001b[1m\u001b[32m1\u001b[0m feature views to \u001b[1m\u001b[32m2023-06-20 23:50:17+00:00\u001b[0m into the \u001b[1m\u001b[32msqlite\u001b[0m online store.\n", + "Materializing \u001b[1m\u001b[32m1\u001b[0m feature views to \u001b[1m\u001b[32m2023-06-21 21:37:21+00:00\u001b[0m into the \u001b[1m\u001b[32msqlite\u001b[0m online store.\n", "\n", - "\u001b[1m\u001b[32mitem_features\u001b[0m from \u001b[1m\u001b[32m2025-01-01 01:01:01+00:00\u001b[0m to \u001b[1m\u001b[32m2023-06-20 23:50:17+00:00\u001b[0m:\n" + "\u001b[1m\u001b[32mitem_features\u001b[0m from \u001b[1m\u001b[32m2025-01-01 01:01:01+00:00\u001b[0m to \u001b[1m\u001b[32m2023-06-21 21:37:21+00:00\u001b[0m:\n" ] }, { @@ -524,7 +537,7 @@ } ], "source": [ - "item_features = retrieval[\"candidate_ids\"] >> QueryFeast.from_feature_view(\n", + "item_attributes = retrieval[\"candidate_ids\"] >> QueryFeast.from_feature_view(\n", " store=feature_store,\n", " view=\"item_features\",\n", " column=\"candidate_ids\",\n", @@ -533,6 +546,17 @@ ")" ] }, + { + "cell_type": "code", + "execution_count": 17, + "id": "0d0a4531-665c-48a1-98a9-216c955449b7", + "metadata": {}, + "outputs": [], + "source": [ + "item_subgraph = nvt_workflow.get_subworkflow(\"item\")\n", + "item_features = item_attributes >> TransformWorkflow(item_subgraph)" + ] + }, { "cell_type": "markdown", "id": "304a4d09-db05-4666-b520-75dbbbc7ab17", @@ -543,7 +567,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 18, "id": "eb0ef434-03a5-4a36-afb9-e19a43243c64", "metadata": {}, "outputs": [], @@ -578,32 +602,10 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 19, "id": "ce31723e-af4d-4827-bb60-3a9fafcd9da6", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:absl:Found untraced functions such as restored_function_body, restored_function_body, restored_function_body, restored_function_body, restored_function_body while saving (showing 5 of 98). 
These functions will not be directly callable after loading.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO:tensorflow:Assets written to: /tmp/tmpbt6mf1gw/assets\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:tensorflow:Assets written to: /tmp/tmpbt6mf1gw/assets\n" - ] - } - ], + "outputs": [], "source": [ "ranking = combined_features >> PredictTensorflow(ranking_model_path)" ] @@ -618,14 +620,14 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 20, "id": "7f65598b-e3e7-4238-a73e-19d00c3deb26", "metadata": {}, "outputs": [], "source": [ "top_k=10\n", - "ordering = combined_features[\"item_id_raw\"] >> SoftmaxSampling(\n", - " relevance_col=ranking[\"click/binary_classification_task\"], topk=top_k, temperature=20.0\n", + "ordering = combined_features[\"item_id\"] >> SoftmaxSampling(\n", + " relevance_col=ranking[\"click/binary_classification_task\"], topk=top_k, temperature=0.00000001\n", ")" ] }, @@ -650,7 +652,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 21, "id": "b28c452f-543c-45a4-9995-130ca6919669", "metadata": {}, "outputs": [], @@ -669,21 +671,21 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 22, "id": "9c8b7b94-5559-4587-a272-4d9de2d53dd1", "metadata": {}, "outputs": [], "source": [ "request_schema = Schema(\n", " [\n", - " ColumnSchema(\"user_id_raw\", dtype=np.int32),\n", + " ColumnSchema(\"user_id\", dtype=np.int32),\n", " ]\n", ")" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 23, "id": "6c64d686-aed5-42f8-b517-482b4237c69f", "metadata": {}, "outputs": [ @@ -717,7 +719,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 24, "id": "89182219-40a6-458c-af0e-7a8e83f364aa", "metadata": {}, "outputs": [ @@ -726,7 +728,25 @@ "output_type": "stream", "text": [ "poc_ensemble/\n", - "├─0_predicttensorflowtriton/\n", + "├─0_transformworkflowtriton/\n", + "│ ├─1/\n", + "│ │ ├─model.py\n", + "│ │ └─workflow/\n", + "│ │ ├─categories/\n", + "│ │ │ ├─unique.user_age.parquet\n", + "│ │ │ ├─unique.user_brands.parquet\n", + "│ │ │ ├─unique.user_categories.parquet\n", + "│ │ │ ├─unique.user_consumption_2.parquet\n", + "│ │ │ ├─unique.user_gender.parquet\n", + "│ │ │ ├─unique.user_geography.parquet\n", + "│ │ │ ├─unique.user_group.parquet\n", + "│ │ │ ├─unique.user_id.parquet\n", + "│ │ │ ├─unique.user_intentions.parquet\n", + "│ │ │ └─unique.user_is_occupied.parquet\n", + "│ │ ├─metadata.json\n", + "│ │ └─workflow.pkl\n", + "│ └─config.pbtxt\n", + "├─1_predicttensorflowtriton/\n", "│ ├─1/\n", "│ │ └─model.savedmodel/\n", "│ │ ├─assets/\n", @@ -737,7 +757,19 @@ "│ │ ├─variables.data-00000-of-00001\n", "│ │ └─variables.index\n", "│ └─config.pbtxt\n", - "├─1_predicttensorflowtriton/\n", + "├─2_transformworkflowtriton/\n", + "│ ├─1/\n", + "│ │ ├─model.py\n", + "│ │ └─workflow/\n", + "│ │ ├─categories/\n", + "│ │ │ ├─unique.item_brand.parquet\n", + "│ │ │ ├─unique.item_category.parquet\n", + "│ │ │ ├─unique.item_id.parquet\n", + "│ │ │ └─unique.item_shop.parquet\n", + "│ │ ├─metadata.json\n", + "│ │ └─workflow.pkl\n", + "│ └─config.pbtxt\n", + "├─3_predicttensorflowtriton/\n", "│ ├─1/\n", "│ │ └─model.savedmodel/\n", "│ │ ├─.merlin/\n", @@ -816,7 +848,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 25, "id": "d08a8975-9c32-467b-99ec-df66319f854b", "metadata": {}, "outputs": [ @@ -824,8 +856,8 @@ "name": "stdout", "output_type": "stream", "text": [ - " user_id_raw\n", 
- "0 7\n" + " user_id\n", + "0 7\n" ] } ], @@ -834,8 +866,8 @@ "from merlin.core.dispatch import make_df\n", "\n", "# create a request to be sent to TIS\n", - "request = make_df({\"user_id_raw\": [7]})\n", - "request[\"user_id_raw\"] = request[\"user_id_raw\"].astype(np.int32)\n", + "request = make_df({\"user_id\": [7]})\n", + "request[\"user_id\"] = request[\"user_id\"].astype(np.int32)\n", "print(request)" ] }, @@ -849,20 +881,20 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 26, "id": "74ec62f2-5935-45c6-8058-e1cdade6f80f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'ordered_ids': array([[266, 381, 145, 6, 232, 651, 83, 244, 107, 69]], dtype=int32),\n", - " 'ordered_scores': array([[0.50194645, 0.50282484, 0.50340647, 0.5027974 , 0.50236404,\n", - " 0.50230837, 0.50244445, 0.5022982 , 0.50169003, 0.50216776]],\n", + "{'ordered_ids': array([[343, 72, 248, 74, 91, 394, 194, 306, 333, 266]], dtype=int32),\n", + " 'ordered_scores': array([[0.49981913, 0.49877545, 0.49930254, 0.5005477 , 0.5007775 ,\n", + " 0.4999408 , 0.49992177, 0.50006884, 0.50042826, 0.4995823 ]],\n", " dtype=float32)}" ] }, - "execution_count": 23, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } From a996c7bafa0235fab9521f13fb00188ee84846b9 Mon Sep 17 00:00:00 2001 From: Julio Date: Tue, 27 Jun 2023 17:38:52 -0400 Subject: [PATCH 2/5] remove unnecessary raw_id calls --- .../01-Building-Recommender-Systems-with-Merlin.ipynb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/Building-and-deploying-multi-stage-RecSys/01-Building-Recommender-Systems-with-Merlin.ipynb b/examples/Building-and-deploying-multi-stage-RecSys/01-Building-Recommender-Systems-with-Merlin.ipynb index 990c568ed..3dcfd9d08 100644 --- a/examples/Building-and-deploying-multi-stage-RecSys/01-Building-Recommender-Systems-with-Merlin.ipynb +++ b/examples/Building-and-deploying-multi-stage-RecSys/01-Building-Recommender-Systems-with-Merlin.ipynb @@ -129,10 +129,10 @@ "outputs": [], "source": [ "# for running this example on GPU, install the following libraries\n", - "# %pip install \"feast<0.20\" faiss-gpu\n", + "# %pip install \"feast==0.31\" faiss-gpu\n", "\n", "# for running this example on CPU, uncomment the following lines\n", - "# %pip install tensorflow-cpu \"feast<0.20\" faiss-cpu\n", + "# %pip install tensorflow-cpu \"feast==0.31\" faiss-cpu\n", "# %pip uninstall cudf\n" ] }, @@ -930,7 +930,7 @@ "metadata": {}, "outputs": [], "source": [ - "schema = train_tt.schema.select_by_tag([Tags.ITEM_ID, Tags.USER_ID, Tags.ITEM, Tags.USER]).without(['user_id_raw', 'item_id_raw', 'click'])\n", + "schema = train_tt.schema.select_by_tag([Tags.ITEM_ID, Tags.USER_ID, Tags.ITEM, Tags.USER]).without(['click'])\n", "train_tt.schema = schema\n", "valid_tt.schema = schema" ] @@ -1068,7 +1068,7 @@ "valid = Dataset(os.path.join(output_path, \"valid\", \"*.parquet\"), part_size=\"500MB\")\n", "\n", "# define schema object\n", - "schema = train.schema.without(['user_id_raw', 'item_id_raw'])" + "schema = train.schema" ] }, { From f877254fc3d3bc40dc7dc0146295acd3a641d7de Mon Sep 17 00:00:00 2001 From: Julio Date: Thu, 29 Jun 2023 15:22:35 -0400 Subject: [PATCH 3/5] changes to multistage example to clean up embeddings and update faiss setup --- ...ding-Recommender-Systems-with-Merlin.ipynb | 611 ++++-------------- ...lti-stage-RecSys-with-Merlin-Systems.ipynb | 112 ++-- 2 files changed, 185 insertions(+), 538 deletions(-) diff --git 
a/examples/Building-and-deploying-multi-stage-RecSys/01-Building-Recommender-Systems-with-Merlin.ipynb b/examples/Building-and-deploying-multi-stage-RecSys/01-Building-Recommender-Systems-with-Merlin.ipynb index 3dcfd9d08..3dbe42dc5 100644 --- a/examples/Building-and-deploying-multi-stage-RecSys/01-Building-Recommender-Systems-with-Merlin.ipynb +++ b/examples/Building-and-deploying-multi-stage-RecSys/01-Building-Recommender-Systems-with-Merlin.ipynb @@ -146,33 +146,43 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-06-21 21:24:41.476144: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", - "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" + "2023-06-29 19:20:02.816099: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "/usr/local/lib/python3.8/dist-packages/merlin/dtypes/mappings/torch.py:43: UserWarning: PyTorch dtype mappings did not load successfully due to an error: No module named 'torch'\n", + " warn(f\"PyTorch dtype mappings did not load successfully due to an error: {exc.msg}\")\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.data_structures has been moved to tensorflow.python.trackable.data_structures. The old module will be deleted in version 2.11.\n" + "WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.data_structures has been moved to tensorflow.python.trackable.data_structures. The old module will be deleted in version 2.11.\n", + "[INFO]: sparse_operation_kit is imported\n", + "WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.base has been moved to tensorflow.python.trackable.base. 
The old module will be deleted in version 2.11.\n", + "[SOK INFO] Import /usr/local/lib/python3.8/dist-packages/merlin_sok-1.1.4-py3.8-linux-x86_64.egg/sparse_operation_kit/lib/libsok_experiment.so\n", + "[SOK INFO] Import /usr/local/lib/python3.8/dist-packages/merlin_sok-1.1.4-py3.8-linux-x86_64.egg/sparse_operation_kit/lib/libsok_experiment.so\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/lib/python3.8/dist-packages/merlin/dtypes/mappings/torch.py:43: UserWarning: PyTorch dtype mappings did not load successfully due to an error: No module named 'torch'\n", - " warn(f\"PyTorch dtype mappings did not load successfully due to an error: {exc.msg}\")\n", - "2023-06-21 21:24:43.274327: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected\n", - "2023-06-21 21:24:43.274369: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: 2dca7910ae98\n", - "2023-06-21 21:24:43.274380: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: 2dca7910ae98\n", - "2023-06-21 21:24:43.274481: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: NOT_FOUND: was unable to find libcuda.so DSO loaded into this program\n", - "2023-06-21 21:24:43.274508: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 515.65.1\n", - "2023-06-21 21:24:43.621683: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", + "2023-06-29 19:20:07.245419: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2023-06-29 19:20:08.267091: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:42] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.\n", + "2023-06-29 19:20:08.267138: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 24576 MB memory: -> device: 0, name: Quadro RTX 8000, pci bus id: 0000:15:00.0, compute capability: 7.5\n", + "2023-06-29 19:20:08.268109: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:42] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.\n", + "2023-06-29 19:20:08.268137: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 24576 MB memory: -> device: 1, name: Quadro RTX 8000, pci bus id: 0000:2d:00.0, compute capability: 7.5\n", "/usr/local/lib/python3.8/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[SOK INFO] Initialize finished, communication tool: horovod\n" + ] } ], "source": [ @@ -225,7 +235,7 @@ "DATA_FOLDER = os.environ.get(\"DATA_FOLDER\", \"/workspace/data/\")\n", "# set up the base dir for feature store\n", "BASE_DIR = os.environ.get(\n", - " \"BASE_DIR\", \"/Merlin/examples/Building-and-deploying-multi-stage-RecSys/\"\n", + " \"BASE_DIR\", \"/raid/workshared/merlin/examples/Building-and-deploying-multi-stage-RecSys/\"\n", ")" ] }, @@ -242,16 +252,7 @@ "execution_count": 6, "id": "b44b3378-7297-4946-a271-742a9239bc3e", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", - " warnings.warn(\n" - ] - } - ], + "outputs": [], "source": [ "from merlin.datasets.synthetic import generate_data\n", "\n", @@ -341,16 +342,7 @@ "execution_count": 9, "id": "ea0b369c-2f01-42e3-9f3c-74c3ff4a6d64", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", - " warnings.warn(\n" - ] - } - ], + "outputs": [], "source": [ "from merlin.models.utils.dataset import unique_rows_by_features\n", "\n", @@ -430,9 +422,9 @@ " \n", " \n", " \n", - " 38\n", + " 6\n", " 7\n", - " 658\n", + " 530\n", " 1\n", " 1\n", " 1\n", @@ -441,28 +433,28 @@ " 1\n", " 1\n", " 1\n", - " 191\n", - " 327\n", - " 35\n", - " 2023-06-21 21:24:49.082804\n", - " 2023-06-21 21:24:49.085539\n", + " 154\n", + " 264\n", + " 28\n", + " 2023-06-29 19:20:20.311986\n", + " 2023-06-29 19:20:20.314307\n", " \n", " \n", "\n", "" ], "text/plain": [ - " user_id user_shops user_profile user_group user_gender user_age \\\n", - "38 7 658 1 1 1 1 \n", + " user_id user_shops user_profile user_group user_gender user_age \\\n", + "6 7 530 1 1 1 1 \n", "\n", - " user_consumption_1 user_consumption_2 user_is_occupied user_geography \\\n", - "38 1 1 1 1 \n", + " user_consumption_1 user_consumption_2 user_is_occupied user_geography \\\n", + "6 1 1 1 1 \n", "\n", - " user_intentions user_brands user_categories datetime \\\n", - "38 191 327 35 2023-06-21 21:24:49.082804 \n", + " user_intentions user_brands user_categories datetime \\\n", + "6 154 264 28 2023-06-29 19:20:20.311986 \n", "\n", - " created \n", - "38 2023-06-21 21:24:49.085539 " + " created \n", + "6 2023-06-29 19:20:20.314307 " ] }, "execution_count": 11, @@ -491,16 +483,7 @@ "execution_count": 13, "id": "0a33a668-8e2a-4546-8f54-0060d405ba91", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", - " warnings.warn(\n" - ] - } - ], + "outputs": [], "source": [ "item_features = (\n", " unique_rows_by_features(train_raw, Tags.ITEM, Tags.ITEM_ID)\n", @@ -561,53 +544,53 @@ " \n", " \n", " 0\n", - " 29\n", - " 138\n", - " 9672\n", - " 3331\n", - " 1541\n", - " 2023-06-21 21:24:49.145983\n", - " 2023-06-21 
21:24:49.147882\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 2023-06-29 19:20:20.413296\n", + " 2023-06-29 19:20:20.414521\n", " \n", " \n", " 1\n", - " 4\n", - " 15\n", - " 1037\n", - " 357\n", - " 166\n", - " 2023-06-21 21:24:49.145983\n", - " 2023-06-21 21:24:49.147882\n", + " 2\n", + " 7\n", + " 457\n", + " 158\n", + " 73\n", + " 2023-06-29 19:20:20.413296\n", + " 2023-06-29 19:20:20.414521\n", " \n", " \n", " 2\n", - " 17\n", - " 79\n", - " 5527\n", - " 1904\n", - " 881\n", - " 2023-06-21 21:24:49.145983\n", - " 2023-06-21 21:24:49.147882\n", + " 3\n", + " 13\n", + " 914\n", + " 315\n", + " 146\n", + " 2023-06-29 19:20:20.413296\n", + " 2023-06-29 19:20:20.414521\n", " \n", " \n", " 3\n", - " 155\n", - " 756\n", - " 53196\n", - " 18320\n", - " 8471\n", - " 2023-06-21 21:24:49.145983\n", - " 2023-06-21 21:24:49.147882\n", + " 4\n", + " 20\n", + " 1371\n", + " 473\n", + " 219\n", + " 2023-06-29 19:20:20.413296\n", + " 2023-06-29 19:20:20.414521\n", " \n", " \n", " 4\n", - " 19\n", - " 89\n", - " 6218\n", - " 2142\n", - " 991\n", - " 2023-06-21 21:24:49.145983\n", - " 2023-06-21 21:24:49.147882\n", + " 5\n", + " 26\n", + " 1828\n", + " 630\n", + " 292\n", + " 2023-06-29 19:20:20.413296\n", + " 2023-06-29 19:20:20.414521\n", " \n", " \n", "\n", @@ -615,18 +598,18 @@ ], "text/plain": [ " item_id item_category item_shop item_brand item_intention \\\n", - "0 29 138 9672 3331 1541 \n", - "1 4 15 1037 357 166 \n", - "2 17 79 5527 1904 881 \n", - "3 155 756 53196 18320 8471 \n", - "4 19 89 6218 2142 991 \n", + "0 1 1 1 1 1 \n", + "1 2 7 457 158 73 \n", + "2 3 13 914 315 146 \n", + "3 4 20 1371 473 219 \n", + "4 5 26 1828 630 292 \n", "\n", " datetime created \n", - "0 2023-06-21 21:24:49.145983 2023-06-21 21:24:49.147882 \n", - "1 2023-06-21 21:24:49.145983 2023-06-21 21:24:49.147882 \n", - "2 2023-06-21 21:24:49.145983 2023-06-21 21:24:49.147882 \n", - "3 2023-06-21 21:24:49.145983 2023-06-21 21:24:49.147882 \n", - "4 2023-06-21 21:24:49.145983 2023-06-21 21:24:49.147882 " + "0 2023-06-29 19:20:20.413296 2023-06-29 19:20:20.414521 \n", + "1 2023-06-29 19:20:20.413296 2023-06-29 19:20:20.414521 \n", + "2 2023-06-29 19:20:20.413296 2023-06-29 19:20:20.414521 \n", + "3 2023-06-29 19:20:20.413296 2023-06-29 19:20:20.414521 \n", + "4 2023-06-29 19:20:20.413296 2023-06-29 19:20:20.414521 " ] }, "execution_count": 15, @@ -721,7 +704,8 @@ "outputs = subgraph_user + subgraph_item + targets\n", "\n", "# add dropna op to filter rows with nulls\n", - "outputs = outputs >> Dropna()" + "outputs = outputs >> Dropna()\n", + "nvt_wkflow = nvt.Workflow(outputs)" ] }, { @@ -737,35 +721,10 @@ "execution_count": 19, "id": "814e8438-642a-4f03-baaf-44dab8d1b5e5", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", - " warnings.warn(\n", - "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", - " warnings.warn(\n", - "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", - " warnings.warn(\n", - "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular 
Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", - " warnings.warn(\n", - "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", - " warnings.warn(\n", - "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", - " warnings.warn(\n", - "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", - " warnings.warn(\n", - "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", - " warnings.warn(\n", - "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", - " warnings.warn(\n" - ] - } - ], + "outputs": [], "source": [ "transform_aliccp(\n", - " (train_raw, valid_raw), output_path, nvt_workflow=outputs, workflow_name=\"workflow\"\n", + " (train_raw, valid_raw), output_path, nvt_workflow=nvt_wkflow, workflow_name=\"workflow\"\n", ")" ] }, @@ -824,18 +783,7 @@ "execution_count": 21, "id": "251d4697-8f9c-4c93-8de4-c3480a8378de", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", - " warnings.warn(\n", - "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", - " warnings.warn(\n" - ] - } - ], + "outputs": [], "source": [ "train_tt = Dataset(os.path.join(output_path, \"train\", \"*.parquet\"))\n", "valid_tt = Dataset(os.path.join(output_path, \"valid\", \"*.parquet\"))" @@ -854,33 +802,18 @@ "execution_count": 22, "id": "7e085a6d-74ad-4c24-8e7c-4e449c15f471", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", - " warnings.warn(\n", - "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", - " warnings.warn(\n", - "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", - " warnings.warn(\n" - ] - } - ], + "outputs": [], "source": [ "inputs = train_tt.schema.column_names\n", "outputs = inputs >> Filter(f=lambda df: df[\"click\"] == 1)\n", "\n", - "workflow2 = nvt.Workflow(outputs)\n", - "\n", - "workflow2.fit(train_tt)\n", + "nvt_wkflow.fit(train_tt)\n", "\n", - "workflow2.transform(train_tt).to_parquet(\n", + "nvt_wkflow.transform(train_tt).to_parquet(\n", " output_path=os.path.join(output_path2, \"train\")\n", ")\n", 
"\n", - "workflow2.transform(valid_tt).to_parquet(\n", + "nvt_wkflow.transform(valid_tt).to_parquet(\n", " output_path=os.path.join(output_path2, \"valid\")\n", ")" ] @@ -906,18 +839,7 @@ "execution_count": 23, "id": "252a8e60-b447-46b5-ade6-3557cbafa797", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", - " warnings.warn(\n", - "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", - " warnings.warn(\n" - ] - } - ], + "outputs": [], "source": [ "train_tt = Dataset(os.path.join(output_path2, \"train\", \"*.parquet\"), part_size=\"500MB\")\n", "valid_tt = Dataset(os.path.join(output_path2, \"valid\", \"*.parquet\"), part_size=\"500MB\")" @@ -968,13 +890,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "5/5 [==============================] - 13s 1s/step - loss: 8.9092 - recall_at_10: 0.0076 - ndcg_at_10: 0.0058 - regularization_loss: 0.0000e+00 - loss_batch: 8.5704 - val_loss: 8.9050 - val_recall_at_10: 0.0121 - val_ndcg_at_10: 0.0097 - val_regularization_loss: 0.0000e+00 - val_loss_batch: 8.7986\n" + "9/9 [==============================] - 10s 275ms/step - loss: 8.9538 - recall_at_10: 0.0055 - ndcg_at_10: 0.0038 - regularization_loss: 0.0000e+00 - loss_batch: 8.8710 - val_loss: 8.9181 - val_recall_at_10: 0.0165 - val_ndcg_at_10: 0.0109 - val_regularization_loss: 0.0000e+00 - val_loss_batch: 8.5802\n" ] }, { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 26, @@ -1050,18 +972,7 @@ "execution_count": 28, "id": "cb870461-6ac2-49b2-ba6a-2da6ecb57f1d", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", - " warnings.warn(\n", - "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", - " warnings.warn(\n" - ] - } - ], + "outputs": [], "source": [ "# define train and valid dataset objects\n", "train = Dataset(os.path.join(output_path, \"train\", \"*.parquet\"), part_size=\"500MB\")\n", @@ -1127,13 +1038,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "5/5 [==============================] - 5s 271ms/step - loss: 0.6932 - auc: 0.4989 - regularization_loss: 0.0000e+00 - loss_batch: 0.6932 - val_loss: 0.6931 - val_auc: 0.4994 - val_regularization_loss: 0.0000e+00 - val_loss_batch: 0.6932\n" + "5/5 [==============================] - 5s 312ms/step - loss: 0.6931 - auc: 0.4991 - regularization_loss: 0.0000e+00 - loss_batch: 0.6932 - val_loss: 0.6931 - val_auc: 0.4983 - val_regularization_loss: 0.0000e+00 - val_loss_batch: 0.6931\n" ] }, { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 31, @@ -1156,7 +1067,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "id": "00447c12-ea80-4d98-ab47-cc1a982a6958", "metadata": {}, "outputs": [], @@ -1182,287 +1093,31 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": null, "id": "e62f65f8-e8f1-447e-9500-5960807c36f2", 
"metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", - " warnings.warn(\n", - "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", - " warnings.warn(\n" - ] - } - ], - "source": [ - "nvt_wkflow = nvt.Workflow.load(output_path + \"/workflow\")\n", - "cat_wkflow = nvt_wkflow.get_subworkflow(\"items_cat\")\n", - "item_features_ds = Dataset(item_features, schema=schema)\n", - "item_feature_cat_ds = cat_wkflow.transform(item_features_ds).to_ddf().compute()" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "6a4848a7-aa4f-4f8a-8b40-6c8458ac4fcd", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", - " warnings.warn(\n", - "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", - " warnings.warn(\n", - "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", - " warnings.warn(\n", - "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", - " warnings.warn(\n", - "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", - " warnings.warn(\n", - "/usr/local/lib/python3.8/dist-packages/merlin/io/dataset.py:267: UserWarning: Initializing an NVTabular Dataset in CPU mode.This is an experimental feature with extremely limited support!\n", - " warnings.warn(\n" - ] - } - ], - "source": [ - "item_embs = model_tt.item_embeddings(\n", - " Dataset(item_feature_cat_ds, schema=schema), batch_size=1024\n", - ")\n", - "item_embs_df = item_embs.compute(scheduler=\"synchronous\")\n", - "item_embs_df[\"item_id\"] = item_features[\"item_id\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "cf8b82ea-6cce-4dab-ad17-114b5e7eabd4", - "metadata": {}, "outputs": [], "source": [ - "# select only item_id together with embedding columns\n", - "item_embeddings = item_embs_df.drop(\n", - " columns=[\"item_category\", \"item_shop\", \"item_brand\"]\n", - ")" + "from merlin.systems.dag.ops.tensorflow import PredictTensorflow\n", + "from merlin.systems.dag.ops.workflow import TransformWorkflow\n", + "\n", + "workflow = nvt.Workflow([\"item_id\"] + (['item_id', 'item_brand', 'item_category', 'item_shop'] >> TransformWorkflow(nvt_wkflow.get_subworkflow(\"item\")) >> PredictTensorflow(model_tt.first.item_block())))\n", + "item_embeddings = workflow.fit_transform(Dataset(item_features)).to_ddf().compute()" ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": null, "id": "e02f0957-6665-400a-80c0-60b307466caf", "metadata": {}, - 
"outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
item_id012345678...54555657585960616263
029-0.004295-0.032384-0.0449170.039122-0.016758-0.059473-0.0078110.004419-0.044857...0.0198390.0316140.066616-0.0234590.039532-0.0253000.0020400.010800-0.0318930.009897
140.007210-0.004949-0.0211680.039533-0.004339-0.0269790.018726-0.034300-0.010744...0.021441-0.0088660.0189150.0014280.0072870.003946-0.029646-0.023998-0.0219120.005516
2170.034115-0.007572-0.0457690.038766-0.018994-0.003735-0.0137480.0033970.018028...0.0024670.029187-0.0231140.0073150.0017960.0132470.011309-0.004574-0.0117220.004382
3155-0.014619-0.001738-0.0068290.019568-0.025870-0.0433510.007577-0.038977-0.015209...-0.0116590.011859-0.0047210.0024800.040565-0.023915-0.039050-0.013832-0.0288990.034076
4190.027284-0.039710-0.013016-0.021763-0.019920-0.0195730.0044360.005504-0.018312...0.0071240.0056880.0180350.0189190.020091-0.0171810.027977-0.032007-0.0059400.013642
\n", - "

5 rows × 65 columns

\n", - "
" - ], - "text/plain": [ - " item_id 0 1 2 3 4 5 \\\n", - "0 29 -0.004295 -0.032384 -0.044917 0.039122 -0.016758 -0.059473 \n", - "1 4 0.007210 -0.004949 -0.021168 0.039533 -0.004339 -0.026979 \n", - "2 17 0.034115 -0.007572 -0.045769 0.038766 -0.018994 -0.003735 \n", - "3 155 -0.014619 -0.001738 -0.006829 0.019568 -0.025870 -0.043351 \n", - "4 19 0.027284 -0.039710 -0.013016 -0.021763 -0.019920 -0.019573 \n", - "\n", - " 6 7 8 ... 54 55 56 57 \\\n", - "0 -0.007811 0.004419 -0.044857 ... 0.019839 0.031614 0.066616 -0.023459 \n", - "1 0.018726 -0.034300 -0.010744 ... 0.021441 -0.008866 0.018915 0.001428 \n", - "2 -0.013748 0.003397 0.018028 ... 0.002467 0.029187 -0.023114 0.007315 \n", - "3 0.007577 -0.038977 -0.015209 ... -0.011659 0.011859 -0.004721 0.002480 \n", - "4 0.004436 0.005504 -0.018312 ... 0.007124 0.005688 0.018035 0.018919 \n", - "\n", - " 58 59 60 61 62 63 \n", - "0 0.039532 -0.025300 0.002040 0.010800 -0.031893 0.009897 \n", - "1 0.007287 0.003946 -0.029646 -0.023998 -0.021912 0.005516 \n", - "2 0.001796 0.013247 0.011309 -0.004574 -0.011722 0.004382 \n", - "3 0.040565 -0.023915 -0.039050 -0.013832 -0.028899 0.034076 \n", - "4 0.020091 -0.017181 0.027977 -0.032007 -0.005940 0.013642 \n", - "\n", - "[5 rows x 65 columns]" - ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "item_embeddings.head()" + "item_embeddings.tail()" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": null, "id": "66d7271e-0ea6-4568-ac5a-04089735f542", "metadata": {}, "outputs": [], @@ -1489,7 +1144,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": null, "id": "4ee27d67-e35a-42c5-8025-ed73f35c8e13", "metadata": {}, "outputs": [], @@ -1540,7 +1195,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": null, "id": "48a5927c-840d-410c-8f5b-bebce4f79640", "metadata": {}, "outputs": [], @@ -1591,19 +1246,10 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": null, "id": "57133c1e-18d9-4ccb-9704-cdebd271985e", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: seedir in /usr/local/lib/python3.8/dist-packages (0.4.2)\n", - "Requirement already satisfied: natsort in /usr/local/lib/python3.8/dist-packages (from seedir) (8.4.0)\n" - ] - } - ], + "outputs": [], "source": [ "# install seedir\n", "!pip install seedir" @@ -1611,33 +1257,10 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": null, "id": "986d53ea-c946-4046-a390-6d3b8801d280", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "feast_repo/\n", - "├─README.md\n", - "├─__init__.py\n", - "└─feature_repo/\n", - " ├─__init__.py\n", - " ├─__pycache__/\n", - " │ ├─__init__.cpython-38.pyc\n", - " │ ├─example_repo.cpython-38.pyc\n", - " │ └─test_workflow.cpython-38.pyc\n", - " ├─data/\n", - " │ ├─item_features.parquet\n", - " │ └─user_features.parquet\n", - " ├─feature_store.yaml\n", - " ├─item_features.py\n", - " ├─test_workflow.py\n", - " └─user_features.py\n" - ] - } - ], + "outputs": [], "source": [ "import seedir as sd\n", "\n", diff --git a/examples/Building-and-deploying-multi-stage-RecSys/02-Deploying-multi-stage-RecSys-with-Merlin-Systems.ipynb b/examples/Building-and-deploying-multi-stage-RecSys/02-Deploying-multi-stage-RecSys-with-Merlin-Systems.ipynb index 965509a13..3d1d417b0 100644 --- 
a/examples/Building-and-deploying-multi-stage-RecSys/02-Deploying-multi-stage-RecSys-with-Merlin-Systems.ipynb +++ b/examples/Building-and-deploying-multi-stage-RecSys/02-Deploying-multi-stage-RecSys-with-Merlin-Systems.ipynb @@ -93,7 +93,13 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-06-21 21:37:09.169418: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", + "/usr/local/lib/python3.8/dist-packages/cudf/utils/metadata/orc_column_statistics_pb2.py:19: DeprecationWarning: Call to deprecated create function FileDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool.\n", + " DESCRIPTOR = _descriptor.FileDescriptor(\n", + "/usr/local/lib/python3.8/dist-packages/cudf/utils/metadata/orc_column_statistics_pb2.py:37: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool.\n", + " _descriptor.FieldDescriptor(\n", + "/usr/local/lib/python3.8/dist-packages/cudf/utils/metadata/orc_column_statistics_pb2.py:30: DeprecationWarning: Call to deprecated create function Descriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool.\n", + " _INTEGERSTATISTICS = _descriptor.Descriptor(\n", + "2023-06-29 19:13:17.254704: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", "/usr/local/lib/python3.8/dist-packages/tensorflow/core/framework/tensor_shape_pb2.py:18: DeprecationWarning: Call to deprecated create function FileDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool.\n", " DESCRIPTOR = _descriptor.FileDescriptor(\n", @@ -115,28 +121,10 @@ " DESCRIPTOR = _descriptor.FileDescriptor(\n", "/usr/local/lib/python3.8/dist-packages/tensorflow/core/framework/resource_handle_pb2.py:39: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool.\n", " _descriptor.FieldDescriptor(\n", - "/usr/local/lib/python3.8/dist-packages/tensorflow/core/framework/resource_handle_pb2.py:32: DeprecationWarning: Call to deprecated create function Descriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool.\n", - " _RESOURCEHANDLEPROTO_DTYPEANDSHAPE = _descriptor.Descriptor(\n", - "/usr/local/lib/python3.8/dist-packages/tensorflow/core/framework/tensor_pb2.py:21: DeprecationWarning: Call to deprecated create function FileDescriptor(). Note: Create unlinked descriptors is going to go away. 
Please use get/find descriptors from generated code or query the descriptor_pool.\n", - " DESCRIPTOR = _descriptor.FileDescriptor(\n", - "/usr/local/lib/python3.8/dist-packages/tensorflow/core/framework/tensor_pb2.py:40: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool.\n", - " _descriptor.FieldDescriptor(\n", - "/usr/local/lib/python3.8/dist-packages/tensorflow/core/framework/tensor_pb2.py:33: DeprecationWarning: Call to deprecated create function Descriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool.\n", - " _TENSORPROTO = _descriptor.Descriptor(\n", - "/usr/local/lib/python3.8/dist-packages/tensorflow/core/framework/attr_value_pb2.py:21: DeprecationWarning: Call to deprecated create function FileDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool.\n", - " DESCRIPTOR = _descriptor.FileDescriptor(\n", - "/usr/local/lib/python3.8/dist-packages/tensorflow/core/framework/attr_value_pb2.py:40: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool.\n", - " _descriptor.FieldDescriptor(\n", "/usr/local/lib/python3.8/dist-packages/merlin/dtypes/mappings/torch.py:43: UserWarning: PyTorch dtype mappings did not load successfully due to an error: No module named 'torch'\n", " warn(f\"PyTorch dtype mappings did not load successfully due to an error: {exc.msg}\")\n", "/usr/local/lib/python3.8/dist-packages/nvtabular/loader/__init__.py:19: DeprecationWarning: The `nvtabular.loader` module has moved to a new repository, at https://github.com/NVIDIA-Merlin/dataloader . Support for importing from `nvtabular.loader` is deprecated, and will be removed in a future version. 
Please update your imports to refer to `merlinloader`.\n", - " warnings.warn(\n", - "2023-06-21 21:37:11.305888: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected\n", - "2023-06-21 21:37:11.305925: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: 2dca7910ae98\n", - "2023-06-21 21:37:11.305933: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: 2dca7910ae98\n", - "2023-06-21 21:37:11.306011: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 525.85.12\n", - "2023-06-21 21:37:11.306030: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 515.65.1\n", - "2023-06-21 21:37:11.306037: E tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:313] kernel version 515.65.1 does not match DSO version 525.85.12 -- cannot find working devices in this configuration\n" + " warnings.warn(\n" ] } ], @@ -198,8 +186,14 @@ "output_type": "stream", "text": [ "/raid/workshared/merlin/examples/Building-and-deploying-multi-stage-RecSys/feast_repo/feature_repo\n", - "\u001b[1m\u001b[94mNo changes to registry\n", - "\u001b[1m\u001b[94mNo changes to infrastructure\n" + "Created entity \u001b[1m\u001b[32mitem_id\u001b[0m\n", + "Created entity \u001b[1m\u001b[32muser_id\u001b[0m\n", + "Created feature view \u001b[1m\u001b[32mitem_features\u001b[0m\n", + "Created feature view \u001b[1m\u001b[32muser_features\u001b[0m\n", + "\n", + "Created sqlite table \u001b[1m\u001b[32mfeast_repo_item_features\u001b[0m\n", + "Created sqlite table \u001b[1m\u001b[32mfeast_repo_user_features\u001b[0m\n", + "\n" ] } ], @@ -234,10 +228,10 @@ "text": [ "Materializing \u001b[1m\u001b[32m2\u001b[0m feature views from \u001b[1m\u001b[32m1995-01-01 01:01:01+00:00\u001b[0m to \u001b[1m\u001b[32m2025-01-01 01:01:01+00:00\u001b[0m into the \u001b[1m\u001b[32msqlite\u001b[0m online store.\n", "\n", - "\u001b[1m\u001b[32muser_features\u001b[0m:\n", - "100%|███████████████████████████████████████████████████████████| 457/457 [00:00<00:00, 2914.62it/s]\n", "\u001b[1m\u001b[32mitem_features\u001b[0m:\n", - "100%|███████████████████████████████████████████████████████████| 451/451 [00:00<00:00, 8542.45it/s]\n" + "100%|███████████████████████████████████████████████████████████| 450/450 [00:00<00:00, 5815.84it/s]\n", + "\u001b[1m\u001b[32muser_features\u001b[0m:\n", + "100%|███████████████████████████████████████████████████████████| 448/448 [00:00<00:00, 1758.64it/s]\n" ] } ], @@ -344,7 +338,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "0b6cc5bf-d07c-4963-a748-6e2b4827ee36", "metadata": {}, "outputs": [ @@ -352,16 +346,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "WARNING clustering 451 points to 32 centroids: please provide at least 1248 training points\n" + "WARNING clustering 450 points to 32 centroids: please provide at least 1248 training points\n" ] } ], "source": [ "from merlin.systems.dag.ops.faiss import QueryFaiss, setup_faiss \n", "\n", - "item_embeddings = np.ascontiguousarray(\n", - " pd.read_parquet(os.path.join(BASE_DIR, \"item_embeddings.parquet\")).to_numpy()\n", - ")\n", + "item_embeddings = pd.read_parquet(os.path.join(BASE_DIR, \"item_embeddings.parquet\"))\n", "setup_faiss(item_embeddings, faiss_index_path)" ] }, @@ -375,7 +367,7 @@ }, { "cell_type": "code", - 
"execution_count": 10, + "execution_count": 11, "id": "3bc00e04-c70c-4882-9952-66f4dbb97bdc", "metadata": {}, "outputs": [], @@ -393,7 +385,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "id": "3decbe7b-03e3-4978-baac-03f6a0b078c9", "metadata": {}, "outputs": [ @@ -401,9 +393,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "Materializing \u001b[1m\u001b[32m1\u001b[0m feature views to \u001b[1m\u001b[32m2023-06-21 21:37:18+00:00\u001b[0m into the \u001b[1m\u001b[32msqlite\u001b[0m online store.\n", + "Materializing \u001b[1m\u001b[32m1\u001b[0m feature views to \u001b[1m\u001b[32m2023-06-29 19:14:10+00:00\u001b[0m into the \u001b[1m\u001b[32msqlite\u001b[0m online store.\n", "\n", - "\u001b[1m\u001b[32muser_features\u001b[0m from \u001b[1m\u001b[32m2025-01-01 01:01:01+00:00\u001b[0m to \u001b[1m\u001b[32m2023-06-21 21:37:18+00:00\u001b[0m:\n" + "\u001b[1m\u001b[32muser_features\u001b[0m from \u001b[1m\u001b[32m2025-01-01 01:01:01+00:00\u001b[0m to \u001b[1m\u001b[32m2023-06-29 19:14:10+00:00\u001b[0m:\n" ] }, { @@ -427,7 +419,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "id": "f11299b6-20d4-4687-bb0e-b855a9bcb9eb", "metadata": {}, "outputs": [], @@ -482,15 +474,25 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-06-21 21:37:19.332291: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", - "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" + "2023-06-29 19:14:11.423802: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2023-06-29 19:14:14.615977: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 24576 MB memory: -> device: 0, name: Quadro RTX 8000, pci bus id: 0000:15:00.0, compute capability: 7.5\n", + "2023-06-29 19:14:14.616886: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 24576 MB memory: -> device: 1, name: Quadro RTX 8000, pci bus id: 0000:2d:00.0, compute capability: 7.5\n", + "WARNING:absl:Found untraced functions such as restored_function_body, restored_function_body, restored_function_body, restored_function_body, restored_function_body while saving (showing 5 of 52). These functions will not be directly callable after loading.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "WARNING:tensorflow:No training configuration found in save file, so the model was *not* compiled. 
Compile it manually.\n" + "INFO:tensorflow:Assets written to: /tmp/tmpqzazhnjq/assets\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Assets written to: /tmp/tmpqzazhnjq/assets\n" ] } ], @@ -523,9 +525,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "Materializing \u001b[1m\u001b[32m1\u001b[0m feature views to \u001b[1m\u001b[32m2023-06-21 21:37:21+00:00\u001b[0m into the \u001b[1m\u001b[32msqlite\u001b[0m online store.\n", + "Materializing \u001b[1m\u001b[32m1\u001b[0m feature views to \u001b[1m\u001b[32m2023-06-29 19:14:18+00:00\u001b[0m into the \u001b[1m\u001b[32msqlite\u001b[0m online store.\n", "\n", - "\u001b[1m\u001b[32mitem_features\u001b[0m from \u001b[1m\u001b[32m2025-01-01 01:01:01+00:00\u001b[0m to \u001b[1m\u001b[32m2023-06-21 21:37:21+00:00\u001b[0m:\n" + "\u001b[1m\u001b[32mitem_features\u001b[0m from \u001b[1m\u001b[32m2025-01-01 01:01:01+00:00\u001b[0m to \u001b[1m\u001b[32m2023-06-29 19:14:18+00:00\u001b[0m:\n" ] }, { @@ -605,7 +607,29 @@ "execution_count": 19, "id": "ce31723e-af4d-4827-bb60-3a9fafcd9da6", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:absl:Found untraced functions such as restored_function_body, restored_function_body, restored_function_body, restored_function_body, restored_function_body while saving (showing 5 of 98). These functions will not be directly callable after loading.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Assets written to: /tmp/tmp6epm9p86/assets\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Assets written to: /tmp/tmp6epm9p86/assets\n" + ] + } + ], "source": [ "ranking = combined_features >> PredictTensorflow(ranking_model_path)" ] @@ -888,9 +912,9 @@ { "data": { "text/plain": [ - "{'ordered_ids': array([[343, 72, 248, 74, 91, 394, 194, 306, 333, 266]], dtype=int32),\n", - " 'ordered_scores': array([[0.49981913, 0.49877545, 0.49930254, 0.5005477 , 0.5007775 ,\n", - " 0.4999408 , 0.49992177, 0.50006884, 0.50042826, 0.4995823 ]],\n", + "{'ordered_ids': array([[ 52, 102, 42, 204, 312, 117, 414, 258, 14, 450]], dtype=int32),\n", + " 'ordered_scores': array([[0.5010059 , 0.5018582 , 0.5001918 , 0.50212526, 0.5004832 ,\n", + " 0.5006511 , 0.50049436, 0.5014268 , 0.5005215 , 0.5017036 ]],\n", " dtype=float32)}" ] }, From 68b66af9358d73ce2e6c90346a08318e805deea9 Mon Sep 17 00:00:00 2001 From: Julio Date: Thu, 29 Jun 2023 15:59:50 -0400 Subject: [PATCH 4/5] update notebooks with output --- ...ding-Recommender-Systems-with-Merlin.ipynb | 233 +++++++++++++----- ...lti-stage-RecSys-with-Merlin-Systems.ipynb | 82 +++--- 2 files changed, 208 insertions(+), 107 deletions(-) diff --git a/examples/Building-and-deploying-multi-stage-RecSys/01-Building-Recommender-Systems-with-Merlin.ipynb b/examples/Building-and-deploying-multi-stage-RecSys/01-Building-Recommender-Systems-with-Merlin.ipynb index 3dbe42dc5..9a0038917 100644 --- a/examples/Building-and-deploying-multi-stage-RecSys/01-Building-Recommender-Systems-with-Merlin.ipynb +++ b/examples/Building-and-deploying-multi-stage-RecSys/01-Building-Recommender-Systems-with-Merlin.ipynb @@ -146,7 +146,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-06-29 19:20:02.816099: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in 
performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", + "2023-06-29 19:49:32.836544: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", "/usr/local/lib/python3.8/dist-packages/merlin/dtypes/mappings/torch.py:43: UserWarning: PyTorch dtype mappings did not load successfully due to an error: No module named 'torch'\n", " warn(f\"PyTorch dtype mappings did not load successfully due to an error: {exc.msg}\")\n" @@ -167,12 +167,12 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-06-29 19:20:07.245419: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", + "2023-06-29 19:49:37.094972: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2023-06-29 19:20:08.267091: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:42] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.\n", - "2023-06-29 19:20:08.267138: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 24576 MB memory: -> device: 0, name: Quadro RTX 8000, pci bus id: 0000:15:00.0, compute capability: 7.5\n", - "2023-06-29 19:20:08.268109: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:42] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.\n", - "2023-06-29 19:20:08.268137: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 24576 MB memory: -> device: 1, name: Quadro RTX 8000, pci bus id: 0000:2d:00.0, compute capability: 7.5\n", + "2023-06-29 19:49:38.134481: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:42] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.\n", + "2023-06-29 19:49:38.134526: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 24576 MB memory: -> device: 0, name: Quadro RTX 8000, pci bus id: 0000:15:00.0, compute capability: 7.5\n", + "2023-06-29 19:49:38.135533: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:42] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.\n", + "2023-06-29 19:49:38.135562: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 24576 MB memory: -> device: 1, name: Quadro RTX 8000, pci bus id: 0000:2d:00.0, compute capability: 7.5\n", "/usr/local/lib/python3.8/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] @@ -235,7 +235,7 @@ "DATA_FOLDER = os.environ.get(\"DATA_FOLDER\", \"/workspace/data/\")\n", "# set up the base dir for feature store\n", "BASE_DIR = os.environ.get(\n", - " \"BASE_DIR\", \"/raid/workshared/merlin/examples/Building-and-deploying-multi-stage-RecSys/\"\n", + " \"BASE_DIR\", \"/Merlin/examples/Building-and-deploying-multi-stage-RecSys/\"\n", ")" ] }, @@ -424,7 +424,7 @@ " \n", " 6\n", " 7\n", - " 530\n", + " 590\n", " 1\n", " 1\n", " 1\n", @@ -433,11 +433,11 @@ " 1\n", " 1\n", " 1\n", - " 154\n", - " 264\n", - " 28\n", - " 2023-06-29 19:20:20.311986\n", - " 2023-06-29 19:20:20.314307\n", + " 171\n", + " 293\n", + " 31\n", + " 2023-06-29 19:49:50.300270\n", + " 2023-06-29 19:49:50.303330\n", " \n", " \n", "\n", @@ -445,16 +445,16 @@ ], "text/plain": [ " user_id user_shops user_profile user_group user_gender user_age \\\n", - "6 7 530 1 1 1 1 \n", + "6 7 590 1 1 1 1 \n", "\n", " user_consumption_1 user_consumption_2 user_is_occupied user_geography \\\n", "6 1 1 1 1 \n", "\n", " user_intentions user_brands user_categories datetime \\\n", - "6 154 264 28 2023-06-29 19:20:20.311986 \n", + "6 171 293 31 2023-06-29 19:49:50.300270 \n", "\n", " created \n", - "6 2023-06-29 19:20:20.314307 " + "6 2023-06-29 19:49:50.303330 " ] }, "execution_count": 11, @@ -549,48 +549,48 @@ " 1\n", " 1\n", " 1\n", - " 2023-06-29 19:20:20.413296\n", - " 2023-06-29 19:20:20.414521\n", + " 2023-06-29 19:49:50.410715\n", + " 2023-06-29 19:49:50.412307\n", " \n", " \n", " 1\n", " 2\n", - " 7\n", - " 457\n", - " 158\n", - " 73\n", - " 2023-06-29 19:20:20.413296\n", - " 2023-06-29 19:20:20.414521\n", + " 6\n", + " 412\n", + " 142\n", + " 66\n", + " 2023-06-29 19:49:50.410715\n", + " 2023-06-29 19:49:50.412307\n", " \n", " \n", " 2\n", " 3\n", - " 13\n", - " 914\n", - " 315\n", - " 146\n", - " 2023-06-29 19:20:20.413296\n", - " 2023-06-29 19:20:20.414521\n", + " 12\n", + " 824\n", + " 284\n", + " 132\n", + " 2023-06-29 19:49:50.410715\n", + " 2023-06-29 19:49:50.412307\n", " \n", " \n", " 3\n", " 4\n", - " 20\n", - " 1371\n", - " 473\n", - " 219\n", - " 2023-06-29 19:20:20.413296\n", - " 2023-06-29 19:20:20.414521\n", + " 18\n", + " 1236\n", + " 426\n", + " 197\n", + " 2023-06-29 19:49:50.410715\n", + " 2023-06-29 19:49:50.412307\n", " \n", " \n", " 4\n", " 5\n", - " 26\n", - " 1828\n", - " 630\n", - " 292\n", - " 2023-06-29 19:20:20.413296\n", - " 2023-06-29 19:20:20.414521\n", + " 24\n", + " 1648\n", + " 568\n", + " 263\n", + " 2023-06-29 19:49:50.410715\n", + " 2023-06-29 19:49:50.412307\n", " \n", " \n", "\n", @@ -599,17 +599,17 @@ "text/plain": [ " item_id item_category item_shop item_brand item_intention \\\n", "0 1 1 1 1 1 \n", - "1 2 7 457 158 73 \n", - "2 3 13 914 315 146 \n", - "3 4 20 1371 473 219 \n", - "4 5 26 1828 630 292 \n", + "1 2 6 412 142 66 \n", + "2 3 12 824 284 132 \n", + "3 4 18 1236 426 197 \n", + "4 5 24 1648 568 263 \n", "\n", " datetime created \n", - "0 2023-06-29 19:20:20.413296 2023-06-29 19:20:20.414521 \n", - "1 2023-06-29 19:20:20.413296 2023-06-29 19:20:20.414521 \n", - "2 2023-06-29 19:20:20.413296 2023-06-29 19:20:20.414521 \n", - "3 2023-06-29 19:20:20.413296 2023-06-29 19:20:20.414521 \n", - "4 2023-06-29 19:20:20.413296 2023-06-29 19:20:20.414521 " + "0 2023-06-29 19:49:50.410715 2023-06-29 19:49:50.412307 \n", + "1 2023-06-29 19:49:50.410715 2023-06-29 19:49:50.412307 \n", + "2 2023-06-29 19:49:50.410715 2023-06-29 19:49:50.412307 \n", + "3 
2023-06-29 19:49:50.410715 2023-06-29 19:49:50.412307 \n", + "4 2023-06-29 19:49:50.410715 2023-06-29 19:49:50.412307 " ] }, "execution_count": 15, @@ -890,13 +890,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "9/9 [==============================] - 10s 275ms/step - loss: 8.9538 - recall_at_10: 0.0055 - ndcg_at_10: 0.0038 - regularization_loss: 0.0000e+00 - loss_batch: 8.8710 - val_loss: 8.9181 - val_recall_at_10: 0.0165 - val_ndcg_at_10: 0.0109 - val_regularization_loss: 0.0000e+00 - val_loss_batch: 8.5802\n" + "9/9 [==============================] - 11s 275ms/step - loss: 8.9538 - recall_at_10: 0.0101 - ndcg_at_10: 0.0067 - regularization_loss: 0.0000e+00 - loss_batch: 8.8711 - val_loss: 8.9179 - val_recall_at_10: 0.0212 - val_ndcg_at_10: 0.0155 - val_regularization_loss: 0.0000e+00 - val_loss_batch: 8.5806\n" ] }, { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 26, @@ -1038,13 +1038,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "5/5 [==============================] - 5s 312ms/step - loss: 0.6931 - auc: 0.4991 - regularization_loss: 0.0000e+00 - loss_batch: 0.6932 - val_loss: 0.6931 - val_auc: 0.4983 - val_regularization_loss: 0.0000e+00 - val_loss_batch: 0.6931\n" + "5/5 [==============================] - 5s 305ms/step - loss: 0.6932 - auc: 0.5005 - regularization_loss: 0.0000e+00 - loss_batch: 0.6932 - val_loss: 0.6931 - val_auc: 0.5029 - val_regularization_loss: 0.0000e+00 - val_loss_batch: 0.6931\n" ] }, { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 31, @@ -1067,7 +1067,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "id": "00447c12-ea80-4d98-ab47-cc1a982a6958", "metadata": {}, "outputs": [], @@ -1093,7 +1093,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "id": "e62f65f8-e8f1-447e-9500-5960807c36f2", "metadata": {}, "outputs": [], @@ -1107,17 +1107,86 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "id": "e02f0957-6665-400a-80c0-60b307466caf", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
item_idoutput_1
453945[0.012117806822061539, -0.02241620607674122, 0...
454948[0.012117806822061539, -0.02241620607674122, 0...
455956[0.012117806822061539, -0.02241620607674122, 0...
4561437[0.012117806822061539, -0.02241620607674122, 0...
4571469[0.012117806822061539, -0.02241620607674122, 0...
\n", + "
" + ], + "text/plain": [ + " item_id output_1\n", + "453 945 [0.012117806822061539, -0.02241620607674122, 0...\n", + "454 948 [0.012117806822061539, -0.02241620607674122, 0...\n", + "455 956 [0.012117806822061539, -0.02241620607674122, 0...\n", + "456 1437 [0.012117806822061539, -0.02241620607674122, 0...\n", + "457 1469 [0.012117806822061539, -0.02241620607674122, 0..." + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "item_embeddings.tail()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "id": "66d7271e-0ea6-4568-ac5a-04089735f542", "metadata": {}, "outputs": [], @@ -1144,7 +1213,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, "id": "4ee27d67-e35a-42c5-8025-ed73f35c8e13", "metadata": {}, "outputs": [], @@ -1195,7 +1264,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "id": "48a5927c-840d-410c-8f5b-bebce4f79640", "metadata": {}, "outputs": [], @@ -1246,10 +1315,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "id": "57133c1e-18d9-4ccb-9704-cdebd271985e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: seedir in /usr/local/lib/python3.8/dist-packages (0.4.2)\n", + "Requirement already satisfied: natsort in /usr/local/lib/python3.8/dist-packages (from seedir) (8.4.0)\n" + ] + } + ], "source": [ "# install seedir\n", "!pip install seedir" @@ -1257,10 +1335,33 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 39, "id": "986d53ea-c946-4046-a390-6d3b8801d280", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "feast_repo/\n", + "├─README.md\n", + "├─__init__.py\n", + "└─feature_repo/\n", + " ├─__init__.py\n", + " ├─__pycache__/\n", + " │ ├─__init__.cpython-38.pyc\n", + " │ ├─example_repo.cpython-38.pyc\n", + " │ └─test_workflow.cpython-38.pyc\n", + " ├─data/\n", + " │ ├─item_features.parquet\n", + " │ └─user_features.parquet\n", + " ├─feature_store.yaml\n", + " ├─item_features.py\n", + " ├─test_workflow.py\n", + " └─user_features.py\n" + ] + } + ], "source": [ "import seedir as sd\n", "\n", diff --git a/examples/Building-and-deploying-multi-stage-RecSys/02-Deploying-multi-stage-RecSys-with-Merlin-Systems.ipynb b/examples/Building-and-deploying-multi-stage-RecSys/02-Deploying-multi-stage-RecSys-with-Merlin-Systems.ipynb index 3d1d417b0..15f0060d3 100644 --- a/examples/Building-and-deploying-multi-stage-RecSys/02-Deploying-multi-stage-RecSys-with-Merlin-Systems.ipynb +++ b/examples/Building-and-deploying-multi-stage-RecSys/02-Deploying-multi-stage-RecSys-with-Merlin-Systems.ipynb @@ -99,7 +99,7 @@ " _descriptor.FieldDescriptor(\n", "/usr/local/lib/python3.8/dist-packages/cudf/utils/metadata/orc_column_statistics_pb2.py:30: DeprecationWarning: Call to deprecated create function Descriptor(). Note: Create unlinked descriptors is going to go away. 
Please use get/find descriptors from generated code or query the descriptor_pool.\n", " _INTEGERSTATISTICS = _descriptor.Descriptor(\n", - "2023-06-29 19:13:17.254704: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", + "2023-06-29 19:50:56.885234: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", "/usr/local/lib/python3.8/dist-packages/tensorflow/core/framework/tensor_shape_pb2.py:18: DeprecationWarning: Call to deprecated create function FileDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool.\n", " DESCRIPTOR = _descriptor.FileDescriptor(\n", @@ -186,10 +186,10 @@ "output_type": "stream", "text": [ "/raid/workshared/merlin/examples/Building-and-deploying-multi-stage-RecSys/feast_repo/feature_repo\n", - "Created entity \u001b[1m\u001b[32mitem_id\u001b[0m\n", "Created entity \u001b[1m\u001b[32muser_id\u001b[0m\n", - "Created feature view \u001b[1m\u001b[32mitem_features\u001b[0m\n", + "Created entity \u001b[1m\u001b[32mitem_id\u001b[0m\n", "Created feature view \u001b[1m\u001b[32muser_features\u001b[0m\n", + "Created feature view \u001b[1m\u001b[32mitem_features\u001b[0m\n", "\n", "Created sqlite table \u001b[1m\u001b[32mfeast_repo_item_features\u001b[0m\n", "Created sqlite table \u001b[1m\u001b[32mfeast_repo_user_features\u001b[0m\n", @@ -228,10 +228,10 @@ "text": [ "Materializing \u001b[1m\u001b[32m2\u001b[0m feature views from \u001b[1m\u001b[32m1995-01-01 01:01:01+00:00\u001b[0m to \u001b[1m\u001b[32m2025-01-01 01:01:01+00:00\u001b[0m into the \u001b[1m\u001b[32msqlite\u001b[0m online store.\n", "\n", - "\u001b[1m\u001b[32mitem_features\u001b[0m:\n", - "100%|███████████████████████████████████████████████████████████| 450/450 [00:00<00:00, 5815.84it/s]\n", "\u001b[1m\u001b[32muser_features\u001b[0m:\n", - "100%|███████████████████████████████████████████████████████████| 448/448 [00:00<00:00, 1758.64it/s]\n" + "100%|███████████████████████████████████████████████████████████| 460/460 [00:00<00:00, 2521.27it/s]\n", + "\u001b[1m\u001b[32mitem_features\u001b[0m:\n", + "100%|███████████████████████████████████████████████████████████| 458/458 [00:00<00:00, 3335.12it/s]\n" ] } ], @@ -338,7 +338,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "id": "0b6cc5bf-d07c-4963-a748-6e2b4827ee36", "metadata": {}, "outputs": [ @@ -346,7 +346,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "WARNING clustering 450 points to 32 centroids: please provide at least 1248 training points\n" + "WARNING clustering 458 points to 32 centroids: please provide at least 1248 training points\n" ] } ], @@ -354,7 +354,7 @@ "from merlin.systems.dag.ops.faiss import QueryFaiss, setup_faiss \n", "\n", "item_embeddings = pd.read_parquet(os.path.join(BASE_DIR, \"item_embeddings.parquet\"))\n", - "setup_faiss(item_embeddings, faiss_index_path)" + "setup_faiss(item_embeddings, faiss_index_path, embedding_column=\"output_1\")" ] }, { @@ -367,7 +367,7 @@ }, { "cell_type": "code", - "execution_count": 11, + 
"execution_count": 10, "id": "3bc00e04-c70c-4882-9952-66f4dbb97bdc", "metadata": {}, "outputs": [], @@ -385,7 +385,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "id": "3decbe7b-03e3-4978-baac-03f6a0b078c9", "metadata": {}, "outputs": [ @@ -393,9 +393,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "Materializing \u001b[1m\u001b[32m1\u001b[0m feature views to \u001b[1m\u001b[32m2023-06-29 19:14:10+00:00\u001b[0m into the \u001b[1m\u001b[32msqlite\u001b[0m online store.\n", + "Materializing \u001b[1m\u001b[32m1\u001b[0m feature views to \u001b[1m\u001b[32m2023-06-29 19:51:06+00:00\u001b[0m into the \u001b[1m\u001b[32msqlite\u001b[0m online store.\n", "\n", - "\u001b[1m\u001b[32muser_features\u001b[0m from \u001b[1m\u001b[32m2025-01-01 01:01:01+00:00\u001b[0m to \u001b[1m\u001b[32m2023-06-29 19:14:10+00:00\u001b[0m:\n" + "\u001b[1m\u001b[32muser_features\u001b[0m from \u001b[1m\u001b[32m2025-01-01 01:01:01+00:00\u001b[0m to \u001b[1m\u001b[32m2023-06-29 19:51:06+00:00\u001b[0m:\n" ] }, { @@ -419,7 +419,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "id": "f11299b6-20d4-4687-bb0e-b855a9bcb9eb", "metadata": {}, "outputs": [], @@ -441,7 +441,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "id": "21139caa-3a51-42e6-b006-21a92c95f1bc", "metadata": {}, "outputs": [ @@ -451,7 +451,7 @@ "" ] }, - "execution_count": 14, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -466,7 +466,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "id": "47c2d9b1-51dc-4549-977d-d7941ee6486c", "metadata": {}, "outputs": [ @@ -474,10 +474,10 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-06-29 19:14:11.423802: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", + "2023-06-29 19:51:07.269579: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2023-06-29 19:14:14.615977: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 24576 MB memory: -> device: 0, name: Quadro RTX 8000, pci bus id: 0000:15:00.0, compute capability: 7.5\n", - "2023-06-29 19:14:14.616886: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 24576 MB memory: -> device: 1, name: Quadro RTX 8000, pci bus id: 0000:2d:00.0, compute capability: 7.5\n", + "2023-06-29 19:51:10.430459: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 24576 MB memory: -> device: 0, name: Quadro RTX 8000, pci bus id: 0000:15:00.0, compute capability: 7.5\n", + "2023-06-29 19:51:10.431356: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 24576 MB memory: -> device: 1, name: Quadro RTX 8000, pci bus id: 0000:2d:00.0, compute capability: 7.5\n", "WARNING:absl:Found untraced functions such as restored_function_body, restored_function_body, 
restored_function_body, restored_function_body, restored_function_body while saving (showing 5 of 52). These functions will not be directly callable after loading.\n" ] }, @@ -485,14 +485,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "INFO:tensorflow:Assets written to: /tmp/tmpqzazhnjq/assets\n" + "INFO:tensorflow:Assets written to: /tmp/tmpdalflmaz/assets\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "INFO:tensorflow:Assets written to: /tmp/tmpqzazhnjq/assets\n" + "INFO:tensorflow:Assets written to: /tmp/tmpdalflmaz/assets\n" ] } ], @@ -517,7 +517,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "id": "b270f663-0ae1-4356-acd4-5f8c986abf4d", "metadata": {}, "outputs": [ @@ -525,9 +525,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "Materializing \u001b[1m\u001b[32m1\u001b[0m feature views to \u001b[1m\u001b[32m2023-06-29 19:14:18+00:00\u001b[0m into the \u001b[1m\u001b[32msqlite\u001b[0m online store.\n", + "Materializing \u001b[1m\u001b[32m1\u001b[0m feature views to \u001b[1m\u001b[32m2023-06-29 19:51:14+00:00\u001b[0m into the \u001b[1m\u001b[32msqlite\u001b[0m online store.\n", "\n", - "\u001b[1m\u001b[32mitem_features\u001b[0m from \u001b[1m\u001b[32m2025-01-01 01:01:01+00:00\u001b[0m to \u001b[1m\u001b[32m2023-06-29 19:14:18+00:00\u001b[0m:\n" + "\u001b[1m\u001b[32mitem_features\u001b[0m from \u001b[1m\u001b[32m2025-01-01 01:01:01+00:00\u001b[0m to \u001b[1m\u001b[32m2023-06-29 19:51:14+00:00\u001b[0m:\n" ] }, { @@ -550,7 +550,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "id": "0d0a4531-665c-48a1-98a9-216c955449b7", "metadata": {}, "outputs": [], @@ -569,7 +569,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "id": "eb0ef434-03a5-4a36-afb9-e19a43243c64", "metadata": {}, "outputs": [], @@ -604,7 +604,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "id": "ce31723e-af4d-4827-bb60-3a9fafcd9da6", "metadata": {}, "outputs": [ @@ -619,14 +619,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "INFO:tensorflow:Assets written to: /tmp/tmp6epm9p86/assets\n" + "INFO:tensorflow:Assets written to: /tmp/tmpqdd_jn5e/assets\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "INFO:tensorflow:Assets written to: /tmp/tmp6epm9p86/assets\n" + "INFO:tensorflow:Assets written to: /tmp/tmpqdd_jn5e/assets\n" ] } ], @@ -644,7 +644,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "id": "7f65598b-e3e7-4238-a73e-19d00c3deb26", "metadata": {}, "outputs": [], @@ -676,7 +676,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 20, "id": "b28c452f-543c-45a4-9995-130ca6919669", "metadata": {}, "outputs": [], @@ -695,7 +695,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 21, "id": "9c8b7b94-5559-4587-a272-4d9de2d53dd1", "metadata": {}, "outputs": [], @@ -709,7 +709,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 22, "id": "6c64d686-aed5-42f8-b517-482b4237c69f", "metadata": {}, "outputs": [ @@ -743,7 +743,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 23, "id": "89182219-40a6-458c-af0e-7a8e83f364aa", "metadata": {}, "outputs": [ @@ -872,7 +872,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 24, "id": "d08a8975-9c32-467b-99ec-df66319f854b", "metadata": {}, "outputs": [ @@ -905,20 +905,20 @@ }, { "cell_type": "code", - "execution_count": 26, + 
"execution_count": 25, "id": "74ec62f2-5935-45c6-8058-e1cdade6f80f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'ordered_ids': array([[ 52, 102, 42, 204, 312, 117, 414, 258, 14, 450]], dtype=int32),\n", - " 'ordered_scores': array([[0.5010059 , 0.5018582 , 0.5001918 , 0.50212526, 0.5004832 ,\n", - " 0.5006511 , 0.50049436, 0.5014268 , 0.5005215 , 0.5017036 ]],\n", + "{'ordered_ids': array([[100, 168, 324, 79, 361, 294, 267, 289, 397, 189]], dtype=int32),\n", + " 'ordered_scores': array([[0.5016385 , 0.50176895, 0.5017176 , 0.5024097 , 0.5018236 ,\n", + " 0.5018286 , 0.50162375, 0.5015677 , 0.50175667, 0.5014358 ]],\n", " dtype=float32)}" ] }, - "execution_count": 26, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } From 3a70d016432c529170fbae437a8b52594a47a6d6 Mon Sep 17 00:00:00 2001 From: Julio Date: Tue, 4 Jul 2023 16:09:34 -0400 Subject: [PATCH 5/5] add data folder env for notebook 2 and fix unit test --- ...lti-stage-RecSys-with-Merlin-Systems.ipynb | 35 ++++++++++++++++++- ...t_building_deploying_multi_stage_RecSys.py | 2 +- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/examples/Building-and-deploying-multi-stage-RecSys/02-Deploying-multi-stage-RecSys-with-Merlin-Systems.ipynb b/examples/Building-and-deploying-multi-stage-RecSys/02-Deploying-multi-stage-RecSys-with-Merlin-Systems.ipynb index 15f0060d3..e97257251 100644 --- a/examples/Building-and-deploying-multi-stage-RecSys/02-Deploying-multi-stage-RecSys-with-Merlin-Systems.ipynb +++ b/examples/Building-and-deploying-multi-stage-RecSys/02-Deploying-multi-stage-RecSys-with-Merlin-Systems.ipynb @@ -27,6 +27,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "03166488-1651-4025-84ed-4e9e5db34933", "metadata": {}, @@ -43,6 +44,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "38d75184-cd24-4fe3-90f4-d76028626576", "metadata": {}, @@ -51,6 +53,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "da9dadb5-6eec-4a1b-99f9-929523f5cc07", "metadata": {}, @@ -59,6 +62,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "538677a3-acc6-48f6-acb6-d5bb5fe2e2d2", "metadata": {}, @@ -67,6 +71,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "a27e18d7-b3e4-481c-b69e-23193b212c56", "metadata": {}, @@ -145,6 +150,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "55ead20e-c573-462e-9aa2-c3494bf0129f", "metadata": {}, @@ -153,6 +159,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "e2ac115e-4794-4a69-a962-8481f6e86df3", "metadata": {}, @@ -170,6 +177,8 @@ "outputs": [], "source": [ "BASE_DIR = os.environ.get(\"BASE_DIR\", \"/Merlin/examples/Building-and-deploying-multi-stage-RecSys/\")\n", + "DATA_FOLDER = os.environ.get(\"DATA_FOLDER\", \"/workspace/data/\")\n", + "\n", "\n", "# define feature repo path\n", "feast_repo_path = os.path.join(BASE_DIR, \"feast_repo/feature_repo/\")" @@ -203,6 +212,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "c641fcd2-bd11-4569-80d4-2ae5e01a5cad", "metadata": {}, @@ -240,6 +250,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "8fcc26e6-f6f3-4e44-bf3c-3b8e66dc9fd6", "metadata": {}, @@ -280,6 +291,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "e768637c-0a4d-404b-8b58-7182fef0ab0e", "metadata": {}, @@ -288,6 +300,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "efada1e1-2556-4a26-b0ba-9cb96b3b151f", "metadata": {}, @@ -307,6 +320,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": 
"2aa037c0-7dad-427c-98bb-3da413e8fd14", "metadata": {}, @@ -327,6 +341,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "8b996019-bd2a-44e0-b004-4f412b300d63", "metadata": {}, @@ -358,6 +373,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "46697177-512a-473e-8cca-9fe51d3daa03", "metadata": {}, @@ -376,6 +392,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "5c45df06-0cbe-4b52-ac1f-786e763895d7", "metadata": {}, @@ -426,12 +443,13 @@ "source": [ "from nvtabular import Workflow\n", "\n", - "nvt_workflow = Workflow.load('/workspace/data/processed_nvt/workflow')\n", + "nvt_workflow = Workflow.load(os.path.join(DATA_FOLDER, 'processed_nvt/workflow'))\n", "user_subgraph = nvt_workflow.get_subworkflow(\"user\")\n", "user_features = user_attributes >> TransformWorkflow(user_subgraph)" ] }, { + "attachments": {}, "cell_type": "markdown", "id": "27e25be7-3ff0-49c2-a3fc-03ec4d615e77", "metadata": {}, @@ -508,6 +526,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "8ce4429c-1fe1-4304-bcdf-badebe3b5485", "metadata": {}, @@ -560,6 +579,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "304a4d09-db05-4666-b520-75dbbbc7ab17", "metadata": {}, @@ -595,6 +615,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "7fb0ce66-6b6c-43be-885e-a5435c3bbd9e", "metadata": {}, @@ -635,6 +656,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "7f86fa47-de61-4007-ab55-9076e12ce963", "metadata": {}, @@ -656,6 +678,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "f4e2e389-d884-44a1-8e32-4916a0eb43cf", "metadata": {}, @@ -667,6 +690,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "50bc2e4f-5e58-4ad4-8ae5-d79ad286978f", "metadata": {}, @@ -686,6 +710,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "a061bd82-e553-4823-8d14-3ae88a458c14", "metadata": {}, @@ -734,6 +759,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "276eedd8-5dc0-4ad0-8725-c8da60fea693", "metadata": {}, @@ -823,6 +849,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "fe7962cc-f26d-4a4a-b5a3-d214e0f37456", "metadata": { @@ -833,6 +860,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "8c07c620-7d6c-4275-87fe-e5b94335bdb9", "metadata": {}, @@ -847,6 +875,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "6c0a798f-6abf-4cbb-87f8-f60a6e757092", "metadata": {}, @@ -855,6 +884,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "3b0794b1-b9e0-4508-bf6e-cc823ac5c693", "metadata": {}, @@ -863,6 +893,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "af9efbde-4dac-42f1-9ace-096f75bac2b5", "metadata": {}, @@ -896,6 +927,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "28e9e27f-6658-4302-b142-08b05215e48f", "metadata": {}, @@ -929,6 +961,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "b4605dbe-5f97-4b31-8ee4-ce7c1cb69d97", "metadata": {}, diff --git a/tests/unit/examples/test_building_deploying_multi_stage_RecSys.py b/tests/unit/examples/test_building_deploying_multi_stage_RecSys.py index 435742499..138ea554f 100644 --- a/tests/unit/examples/test_building_deploying_multi_stage_RecSys.py +++ b/tests/unit/examples/test_building_deploying_multi_stage_RecSys.py @@ -74,7 +74,7 @@ def test_func(tmpdir): df_lib = get_lib() train = df_lib.read_parquet( os.path.join("{tmpdir / "data"}/processed_nvt/", "train", "part_0.parquet"), - columns=["user_id_raw"], + columns=["user_id"], ) batch = train[:1] from 
merlin.systems.triton.utils import run_ensemble_on_tritonserver
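Taken together, these hunks replace the manual item-embedding export with an NVTabular workflow that chains the fitted "item" subworkflow into the retrieval model's item tower, and they index the resulting output_1 column with FAISS. The following is a minimal sketch of that flow, not part of the patch itself; it assumes the objects defined earlier in the notebooks (nvt_wkflow, model_tt, item_features, BASE_DIR, faiss_index_path) and uses only the APIs that appear in the hunks above.

# Sketch only: consolidates the embedding-export and FAISS-indexing pattern from the
# patches above. Assumes nvt_wkflow (fitted workflow with an "item" subworkflow),
# model_tt (trained two-tower model), item_features, BASE_DIR and faiss_index_path
# are already defined as in the notebooks.
import os

import pandas as pd
import nvtabular as nvt
from merlin.io.dataset import Dataset
from merlin.systems.dag.ops.faiss import setup_faiss
from merlin.systems.dag.ops.tensorflow import PredictTensorflow
from merlin.systems.dag.ops.workflow import TransformWorkflow

# Categorify the raw item features with the "item" subworkflow, run them through the
# item tower, and keep the raw item_id next to the embedding column ("output_1").
export_graph = ["item_id"] + (
    ["item_id", "item_brand", "item_category", "item_shop"]
    >> TransformWorkflow(nvt_wkflow.get_subworkflow("item"))
    >> PredictTensorflow(model_tt.first.item_block())
)
item_embeddings = (
    nvt.Workflow(export_graph).fit_transform(Dataset(item_features)).to_ddf().compute()
)

# Persist the embeddings, then build the FAISS index from the named embedding column,
# as done in the deployment notebook.
item_embeddings.to_parquet(os.path.join(BASE_DIR, "item_embeddings.parquet"))
emb_df = pd.read_parquet(os.path.join(BASE_DIR, "item_embeddings.parquet"))
setup_faiss(emb_df, faiss_index_path, embedding_column="output_1")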