From 5f849cc154c8d0c8d0e3b5e53c4c7e867749964f Mon Sep 17 00:00:00 2001 From: root Date: Thu, 28 Jul 2022 15:02:26 +0000 Subject: [PATCH 1/3] fixes in the first notebook --- ...ding-Recommender-Systems-with-Merlin.ipynb | 745 +++++++++--------- 1 file changed, 362 insertions(+), 383 deletions(-) diff --git a/examples/Building-and-deploying-multi-stage-RecSys/01-Building-Recommender-Systems-with-Merlin.ipynb b/examples/Building-and-deploying-multi-stage-RecSys/01-Building-Recommender-Systems-with-Merlin.ipynb index e4a0f3c2c..484d4d7fd 100644 --- a/examples/Building-and-deploying-multi-stage-RecSys/01-Building-Recommender-Systems-with-Merlin.ipynb +++ b/examples/Building-and-deploying-multi-stage-RecSys/01-Building-Recommender-Systems-with-Merlin.ipynb @@ -120,38 +120,10 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "2cd8cc8d-5cc7-4a9f-91e5-3deec6f1fe74", "metadata": {}, - "outputs": [ - { - "data": { - "application/javascript": [ - "\n", - " setTimeout(function() {\n", - " var nbb_cell_id = 1;\n", - " var nbb_unformatted_code = \"%load_ext nb_black\\n# for running this example on GPU, install the following libraries\\n# %pip install tensorflow \\\"feast<0.20\\\" faiss-gpu\\n\\n# for running this example on CPU, uncomment the following lines\\n# %pip install tensorflow-cpu \\\"feast<0.20\\\" faiss-cpu\\n# %pip uninstall cudf\";\n", - " var nbb_formatted_code = \"%load_ext nb_black\\n# for running this example on GPU, install the following libraries\\n# %pip install tensorflow \\\"feast<0.20\\\" faiss-gpu\\n\\n# for running this example on CPU, uncomment the following lines\\n# %pip install tensorflow-cpu \\\"feast<0.20\\\" faiss-cpu\\n# %pip uninstall cudf\";\n", - " var nbb_cells = Jupyter.notebook.get_cells();\n", - " for (var i = 0; i < nbb_cells.length; ++i) {\n", - " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", - " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", - " nbb_cells[i].set_text(nbb_formatted_code);\n", - " }\n", - " break;\n", - " }\n", - " }\n", - " }, 500);\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# for running this example on GPU, install the following libraries\n", "# %pip install tensorflow \"feast<0.20\" faiss-gpu\n", @@ -171,9 +143,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-06-13 15:45:04.158619: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", + "2022-07-28 14:45:46.631510: I tensorflow/core/platform/cpu_feature_guard.cc:152] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2022-06-13 15:45:06.652810: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 16255 MB memory: -> device: 0, name: Tesla V100-SXM2-32GB-LS, pci bus id: 0000:06:00.0, compute capability: 7.0\n" + "2022-07-28 14:45:47.678608: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 16249 MB memory: -> device: 0, name: Quadro GV100, pci bus id: 0000:2d:00.0, compute capability: 7.0\n" ] 
} ],
@@ -188,12 +160,14 @@
 "    TagAsUserFeatures,\n",
 "    AddMetadata,\n",
 "    Filter,\n",
+ "    Rename\n",
 ")\n",
 "\n",
 "from merlin.schema.tags import Tags\n",
 "\n",
 "import merlin.models.tf as mm\n",
 "from merlin.io.dataset import Dataset\n",
+ "from merlin.datasets.ecommerce import transform_aliccp\n",
 "import tensorflow as tf\n",
 "\n",
 "# for running this example on CPU, comment out the line below\n",
@@ -220,7 +194,7 @@
 "source": [
 "In this example notebook, we will generate the synthetic train and test datasets mimicking the real [Ali-CCP: Alibaba Click and Conversion Prediction](https://tianchi.aliyun.com/dataset/dataDetail?dataId=408#1) dataset to build our recommender system models.\n",
 "\n",
- "First, we define our input and output paths."
+ "First, we define our input path and feature repo path."
 ]
 },
 {
 "cell_type": "code",
 "execution_count": 5,
 "id": "81ddb370",
 "metadata": {},
 "outputs": [],
 "source": [
 "DATA_FOLDER = os.environ.get(\"DATA_FOLDER\", \"/workspace/data/\")\n",
- "output_path = os.path.join(DATA_FOLDER, \"processed/ranking\")"
+ "# set up the base dir to for feature store\n",
+ "BASE_DIR = os.environ.get(\n",
+ "    \"BASE_DIR\", \"/Merlin/examples/Building-and-deploying-multi-stage-RecSys/\"\n",
+ ")"
 ]
 },
 {
@@ -252,7 +229,7 @@
 "from merlin.datasets.synthetic import generate_data\n",
 "\n",
 "NUM_ROWS = os.environ.get(\"NUM_ROWS\", 100_000)\n",
- "train, valid = generate_data(\"aliccp-raw\", int(NUM_ROWS), set_sizes=(0.7, 0.3))"
+ "train_raw, valid_raw = generate_data(\"aliccp-raw\", int(NUM_ROWS), set_sizes=(0.7, 0.3))"
 ]
 },
 {
@@ -263,12 +240,28 @@
 "If you would like to use the real ALI-CCP dataset, you can use the [get_aliccp()](https://github.com/NVIDIA-Merlin/models/blob/main/merlin/datasets/ecommerce/aliccp/dataset.py) function instead. This function takes the raw CSV files and generates parquet files that can be directly fed to the NVTabular workflow above."
 ]
 },
+ {
+ "cell_type": "markdown",
+ "id": "09c87748-af61-42b8-8574-1afe3d71118f",
+ "metadata": {},
+ "source": [
+ "### Training a Retrieval Model with Two-Tower Model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e644fcba-7b0b-44c0-97fd-80f4fcb01191",
+ "metadata": {},
+ "source": [
+ "We start with the offline candidate retrieval stage. We are going to train a Two-Tower model for item retrieval. To learn more about the Two-Tower model, you can visit [05-Retrieval-Model.ipynb](https://github.com/NVIDIA-Merlin/models/blob/main/examples/05-Retrieval-Model.ipynb)."
+ ]
+ },
 {
 "cell_type": "markdown",
 "id": "cf9bca46-a6b6-4a73-afd8-fe2869c60748",
 "metadata": {},
 "source": [
- "### Feature Engineering with NVTabular"
+ "#### Feature Engineering with NVTabular"
 ]
 },
@@ -282,26 +275,42 @@
 {
 "cell_type": "code",
 "execution_count": 7,
- "id": "550d45c9",
+ "id": "df72a793-194b-44f4-80c3-aaa368a9a01e",
 "metadata": {},
 "outputs": [],
 "source": [
+ "output_path = os.path.join(DATA_FOLDER, \"processed/retrieval\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ffd7e2ac-a251-49d0-943b-e9272c852ba6",
+ "metadata": {},
+ "source": [
+ "We select only positive interaction rows where `click==1` in the dataset with the `Filter()` operator."
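To make the Two-Tower idea introduced above concrete before diving into the Merlin pipeline, here is a minimal, self-contained sketch of the architecture in plain TensorFlow. This is illustrative only — not Merlin's `mm.TwoTowerModel` implementation — and the random tensors stand in for (embedded) user and item features:

```python
import tensorflow as tf

# Two MLP "towers" map user and item features into a shared 64-d space.
user_tower = tf.keras.Sequential(
    [tf.keras.layers.Dense(128, activation="relu"), tf.keras.layers.Dense(64)]
)
item_tower = tf.keras.Sequential(
    [tf.keras.layers.Dense(128, activation="relu"), tf.keras.layers.Dense(64)]
)

# Stand-ins for a batch of (embedded) user features and their positive items.
user_feats = tf.random.normal([4, 32])
item_feats = tf.random.normal([4, 32])

user_emb = tf.math.l2_normalize(user_tower(user_feats), axis=1)
item_emb = tf.math.l2_normalize(item_tower(item_feats), axis=1)

# A dot product scores user-item affinity. With in-batch sampling (which is
# what mm.InBatchSampler() does conceptually), the diagonal entries are the
# positive pairs and every off-diagonal item acts as a negative for that user.
logits = tf.matmul(user_emb, item_emb, transpose_b=True)  # shape [4, 4]
labels = tf.range(4)
loss = tf.reduce_mean(
    tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)
)
```

Because the two towers never interact until the final dot product, item embeddings can be precomputed offline and served from a nearest-neighbor index — which is what the export and deployment steps later in this notebook rely on.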
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "7e085a6d-74ad-4c24-8e7c-4e449c15f471", "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "CPU times: user 242 µs, sys: 102 µs, total: 344 µs\n", - "Wall time: 365 µs\n" + "/usr/local/lib/python3.8/dist-packages/cudf/core/frame.py:384: UserWarning: The deep parameter is ignored and is only included for pandas compatibility.\n", + " warnings.warn(\n" ] } ], "source": [ - "%%time\n", - "\n", - "user_id = [\"user_id\"] >> Categorify(dtype=\"int32\") >> TagAsUserID()\n", - "item_id = [\"item_id\"] >> Categorify(dtype=\"int32\") >> TagAsItemID()\n", + "user_id = [\"user_id\"] >> Categorify(dtype=\"int32\", out_path='./categories_tt') >> TagAsUserID()\n", + "item_id = [\"item_id\"] >> Categorify(dtype=\"int32\", out_path='./categories_tt') >> TagAsItemID()\n", "\n", "item_features = (\n", - " [\"item_category\", \"item_shop\", \"item_brand\"] >> Categorify(dtype=\"int32\") >> TagAsItemFeatures()\n", + " [\"item_category\", \"item_shop\", \"item_brand\"] >> Categorify(dtype=\"int32\", out_path='./categories_tt') >> TagAsItemFeatures()\n", ")\n", "\n", "user_features = (\n", @@ -317,244 +326,146 @@ " \"user_intentions\",\n", " \"user_brands\",\n", " \"user_categories\",\n", - " ] >> Categorify(dtype=\"int32\") >> TagAsUserFeatures()\n", + " ] >> Categorify(dtype=\"int32\", out_path='./categories_tt') >> TagAsUserFeatures()\n", ")\n", "\n", - "targets = [\"click\"] >> AddMetadata(tags=[Tags.BINARY_CLASSIFICATION, \"target\"])\n", + "inputs = user_id + item_id + item_features + user_features + [\"click\"]\n", "\n", - "outputs = user_id + item_id + item_features + user_features + targets" - ] - }, - { - "cell_type": "markdown", - "id": "ad19550f-49db-48a3-83c6-aad7d348673c", - "metadata": {}, - "source": [ - "Let's call `transform_aliccp` utility function to be able to perform `fit` and `transform` steps on the raw dataset applying the operators defined in the NVTabular workflow pipeline below, and also save our workflow model. After fit and transform, the processed parquet files are saved to output_path." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "e117e7b5-5007-424b-8d3f-9e1db245fd4c", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/lib/python3.8/site-packages/cudf/core/dataframe.py:1292: UserWarning: The deep parameter is ignored and is only included for pandas compatibility.\n", - " warnings.warn(\n" - ] - } - ], - "source": [ - "from merlin.datasets.ecommerce import transform_aliccp\n", + "outputs = inputs >> Filter(f=lambda df: df[\"click\"] == 1)\n", "\n", "transform_aliccp(\n", - " (train, valid), output_path, nvt_workflow=outputs, workflow_name=\"workflow_ranking\"\n", + " (train_raw, valid_raw),\n", + " output_path,\n", + " nvt_workflow=outputs,\n", + " workflow_name=\"workflow_retrieval\",\n", ")" ] }, { "cell_type": "markdown", - "id": "e16401d4", - "metadata": { - "tags": [] - }, - "source": [ - "### Training a Ranking Model with DLRM" - ] - }, - { - "cell_type": "markdown", - "id": "c4f2b234", + "id": "cc4721ae-7228-4d3f-9586-dcdfefecc19f", "metadata": {}, "source": [ - "NVTabular exported the schema file, `schema.pbtxt` a protobuf text file, of our processed dataset. 
To learn more about the schema object and schema file you can explore [02-Merlin-Models-and-NVTabular-integration.ipynb](https://github.com/NVIDIA-Merlin/models/blob/main/examples/02-Merlin-Models-and-NVTabular-integration.ipynb) notebook.\n",
- "\n",
- "We use the `schema` object to define our model."
+ "NVTabular exported the schema file, `schema.pbtxt`, a protobuf text file, for our processed dataset. To learn more about the schema object and the schema file, you can explore the [02-Merlin-Models-and-NVTabular-integration.ipynb](https://github.com/NVIDIA-Merlin/models/blob/main/examples/02-Merlin-Models-and-NVTabular-integration.ipynb) notebook."
 ]
 },
 {
 "cell_type": "code",
 "execution_count": 9,
- "id": "cb870461-6ac2-49b2-ba6a-2da6ecb57f1d",
+ "id": "71063653-2f39-4b54-8399-145d6f281d4d",
 "metadata": {},
 "outputs": [],
 "source": [
- "# define train and valid dataset objects\n",
- "train = Dataset(os.path.join(output_path, \"train\", \"*.parquet\"), part_size=\"500MB\")\n",
- "valid = Dataset(os.path.join(output_path, \"valid\", \"*.parquet\"), part_size=\"500MB\")\n",
+ "train_tt = Dataset(os.path.join(output_path, \"train\", \"*.parquet\"))\n",
+ "valid_tt = Dataset(os.path.join(output_path, \"valid\", \"*.parquet\"))\n",
 "\n",
- "# define schema object\n",
- "schema = train.schema"
+ "schema = train_tt.schema\n",
+ "schema = schema.select_by_tag([Tags.ITEM_ID, Tags.USER_ID, Tags.ITEM, Tags.USER])"
 ]
 },
 {
 "cell_type": "code",
 "execution_count": 10,
- "id": "30e4ebc2",
+ "id": "9312511a-f368-42f2-93d2-eb95aebbf46c",
 "metadata": {},
 "outputs": [
 {
- "data": {
- "text/plain": [
- "'click'"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2022-07-28 14:45:50.388784: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n"
+ ]
 }
 ],
 "source": [
- "target_column = schema.select_by_tag(Tags.TARGET).column_names[0]\n",
- "target_column"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "8f68e26b",
- "metadata": {},
- "source": [
- "Deep Learning Recommendation Model [(DLRM)](https://arxiv.org/abs/1906.00091) architecture is a popular neural network model originally proposed by Facebook in 2019. The model was introduced as a personalization deep learning model that uses embeddings to process sparse features that represent categorical data and a multilayer perceptron (MLP) to process dense features, then interacts these features explicitly using the statistical techniques proposed in [here](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=5694074). To learn more about DLRM architetcture please visit `Exploring-different-models` [notebook](https://github.com/NVIDIA-Merlin/models/blob/main/examples/04-Exporting-ranking-models.ipynb) in the Merlin Models GH repo."
- ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "e4325080", - "metadata": {}, - "outputs": [], - "source": [ - "model = mm.DLRMModel(\n", + "model_tt = mm.TwoTowerModel(\n", " schema,\n", - " embedding_dim=64,\n", - " bottom_block=mm.MLPBlock([128, 64]),\n", - " top_block=mm.MLPBlock([128, 64, 32]),\n", - " prediction_tasks=mm.BinaryClassificationTask(target_column),\n", + " query_tower=mm.MLPBlock([128, 64], no_activation_last_layer=True),\n", + " samplers=[mm.InBatchSampler()],\n", + " embedding_options=mm.EmbeddingOptions(infer_embedding_sizes=True),\n", ")" ] }, { "cell_type": "code", - "execution_count": 12, - "id": "bfe2aa9e", + "execution_count": 11, + "id": "4d47cb8b-e06a-4932-9a19-fb244ef43152", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "5/5 [==============================] - 9s 313ms/step - loss: 0.6931 - auc: 0.4984 - val_loss: 0.6932 - val_auc: 0.5018\n" + "5/5 [==============================] - 10s 441ms/step - loss: 8.9115 - recall_at_10: 0.0082 - ndcg_at_10: 0.0061 - regularization_loss: 0.0000e+00 - val_loss: 8.9090 - val_recall_at_10: 0.0096 - val_ndcg_at_10: 0.0063 - val_regularization_loss: 0.0000e+00\n" ] }, { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "model.compile(optimizer=\"adam\", run_eagerly=False, metrics=[tf.keras.metrics.AUC()])\n", - "model.fit(train, validation_data=valid, batch_size=16 * 1024)" - ] - }, - { - "cell_type": "markdown", - "id": "18d91780-88d3-4dd8-9ed2-db33424d3e98", - "metadata": {}, - "source": [ - "We will create the feature repo in the current working directory, which is `BASE_DIR` for us." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "dd78a82e", - "metadata": {}, - "outputs": [], - "source": [ - "# set up the base dir to for feature store\n", - "BASE_DIR = os.environ.get(\n", - " \"BASE_DIR\", \"/Merlin/examples/Building-and-deploying-multi-stage-RecSys/\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "498c4d49-7a59-4260-87b9-b86b66f2c67f", - "metadata": {}, - "source": [ - "Let's save our DLRM model to be able to load back at the deployment stage. " - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "00447c12-ea80-4d98-ab47-cc1a982a6958", - "metadata": {}, - "outputs": [], - "source": [ - "model.save(os.path.join(BASE_DIR, \"dlrm\"))" + "model_tt.compile(\n", + " optimizer=\"adam\",\n", + " run_eagerly=False,\n", + " loss=\"categorical_crossentropy\",\n", + " metrics=[mm.RecallAt(10), mm.NDCGAt(10)],\n", + ")\n", + "model_tt.fit(train_tt, validation_data=valid_tt, batch_size=1024 * 8, epochs=1)" ] }, { "cell_type": "markdown", - "id": "91205a3c-f46e-45a0-b668-1a9bdef0c51d", - "metadata": {}, + "id": "e16401d4", + "metadata": { + "tags": [] + }, "source": [ - "### Training a Retrieval Model with Two-Tower Model" + "### Training a Ranking Model with DLRM" ] }, { "cell_type": "markdown", - "id": "255e5dbf-f648-4667-8dc3-47feef88d3f1", + "id": "ccc14bbf-b813-4306-a9a5-ccb1ccc56b5e", "metadata": {}, "source": [ - "Now we move to the offline retrieval stage. We are going to train a Two-Tower model for item retrieval. To learn more about the Two-tower model you can visit [05-Retrieval-Model.ipynb](https://github.com/NVIDIA-Merlin/models/blob/main/examples/05-Retrieval-Model.ipynb)." 
+ "Let's call the `transform_aliccp` utility function to perform the `fit` and `transform` steps on the raw dataset, applying the operators defined in the NVTabular workflow pipeline below, and to save our workflow model. After fit and transform, the processed parquet files are saved to `output_path`."
 ]
 },
 {
 "cell_type": "code",
- "execution_count": 15,
- "id": "00de24e9-331a-486e-9843-6c554ad2ec77",
+ "execution_count": 12,
+ "id": "6a4b2ad0-c873-4a4a-8466-d21b5d181c74",
 "metadata": {},
 "outputs": [],
 "source": [
- "output_path = os.path.join(DATA_FOLDER, \"processed/retrieval\")"
+ "output_path = os.path.join(DATA_FOLDER, \"processed/ranking\")"
 ]
 },
- {
- "cell_type": "markdown",
- "id": "9cdaec18-84f2-42f6-bee5-d66cf28c03f4",
- "metadata": {},
- "source": [
- "We select only positive interaction rows where `click==1` in the dataset with `Filter()` operator."
- ]
- },
 {
 "cell_type": "code",
- "execution_count": 16,
- "id": "22a7d605-478f-40e6-a5dc-3e7a61e9b035",
+ "execution_count": 13,
+ "id": "7a6bc984-f1b7-4e2f-97ca-612be0d8e390",
 "metadata": {},
 "outputs": [
 {
- "name": "stderr",
+ "name": "stdout",
 "output_type": "stream",
 "text": [
- "/usr/lib/python3.8/site-packages/cudf/core/dataframe.py:1292: UserWarning: The deep parameter is ignored and is only included for pandas compatibility.\n",
- " warnings.warn(\n"
+ "CPU times: user 305 µs, sys: 38 µs, total: 343 µs\n",
+ "Wall time: 351 µs\n"
 ]
 }
 ],
 "source": [
+ "%%time\n",
+ "\n",
 "user_id = [\"user_id\"] >> Categorify(dtype=\"int32\") >> TagAsUserID()\n",
 "item_id = [\"item_id\"] >> Categorify(dtype=\"int32\") >> TagAsItemID()\n",
 "\n",
 "item_features = (\n",
 "    [\"item_category\", \"item_shop\", \"item_brand\"] >> Categorify(dtype=\"int32\") >> TagAsItemFeatures()\n",
 ")\n",
 "\n",
 "user_features = (\n",
@@ -578,79 +489,137 @@
 "    ] >> Categorify(dtype=\"int32\") >> TagAsUserFeatures()\n",
 ")\n",
 "\n",
- "inputs = user_id + item_id + item_features + user_features + [\"click\"]\n",
- "\n",
- "outputs = inputs >> Filter(f=lambda df: df[\"click\"] == 1)\n",
+ "targets = [\"click\"] >> AddMetadata(tags=[Tags.BINARY_CLASSIFICATION, \"target\"])\n",
 "\n",
+ "outputs = user_id + item_id + item_features + user_features + targets"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "6db6387d-174c-4cbe-a995-496faeb0c512",
+ "metadata": {},
+ "outputs": [],
+ "source": [
 "transform_aliccp(\n",
- "    (train, valid),\n",
- "    output_path,\n",
- "    nvt_workflow=outputs,\n",
- "    workflow_name=\"workflow_retrieval\",\n",
+ "    (train_raw, valid_raw), output_path, nvt_workflow=outputs, workflow_name=\"workflow_ranking\"\n",
 ")"
 ]
 },
+ {
+ "cell_type": "markdown",
+ "id": "c4f2b234",
+ "metadata": {},
+ "source": [
+ "We use the `schema` object to define our model."
+ ] },
 {
 "cell_type": "code",
- "execution_count": 17,
- "id": "dc150549-6fa0-441f-939d-a358e56d5e43",
+ "execution_count": 15,
+ "id": "cb870461-6ac2-49b2-ba6a-2da6ecb57f1d",
 "metadata": {},
 "outputs": [],
 "source": [
- "train_tt = Dataset(os.path.join(output_path, \"train\", \"*.parquet\"))\n",
- "valid_tt = Dataset(os.path.join(output_path, \"valid\", \"*.parquet\"))\n",
+ "# define train and valid dataset objects\n",
+ "train = Dataset(os.path.join(output_path, \"train\", \"*.parquet\"), part_size=\"500MB\")\n",
+ "valid = Dataset(os.path.join(output_path, \"valid\", \"*.parquet\"), part_size=\"500MB\")\n",
 "\n",
- "schema = train_tt.schema\n",
- "schema = schema.select_by_tag([Tags.ITEM_ID, Tags.USER_ID, Tags.ITEM, Tags.USER])"
+ "# define schema object\n",
+ "schema = train.schema"
 ]
 },
 {
 "cell_type": "code",
- "execution_count": 18,
- "id": "02471088-0ed8-42e7-968e-b7e68865d55c",
+ "execution_count": 16,
+ "id": "30e4ebc2",
 "metadata": {},
 "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'click'"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
 ],
 "source": [
- "model = mm.TwoTowerModel(\n",
- "    schema,\n",
- "    query_tower=mm.MLPBlock([128, 64], no_activation_last_layer=True),\n",
- "    samplers=[mm.InBatchSampler()],\n",
- "    embedding_options=mm.EmbeddingOptions(infer_embedding_sizes=True),\n",
- ")"
+ "target_column = schema.select_by_tag(Tags.TARGET).column_names[0]\n",
+ "target_column"
 ]
 },
+ {
+ "cell_type": "markdown",
+ "id": "8f68e26b",
+ "metadata": {},
+ "source": [
+ "Deep Learning Recommendation Model [(DLRM)](https://arxiv.org/abs/1906.00091) architecture is a popular neural network model originally proposed by Facebook in 2019. The model was introduced as a personalization deep learning model that uses embeddings to process sparse features that represent categorical data and a multilayer perceptron (MLP) to process dense features, then interacts these features explicitly using the statistical techniques proposed [here](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=5694074). To learn more about the DLRM architecture, please visit the `Exploring-different-models` [notebook](https://github.com/NVIDIA-Merlin/models/blob/main/examples/04-Exporting-ranking-models.ipynb) in the Merlin Models GH repo."
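The pairwise-interaction step described above is easy to see in isolation. Below is an illustrative sketch of DLRM-style second-order feature interactions in plain TensorFlow — not the `mm.DLRMModel` internals; the shapes and layer sizes are arbitrary stand-ins:

```python
import tensorflow as tf

batch, num_features, dim = 8, 5, 64

# Stand-ins for 4 embedded categorical features plus one bottom-MLP output,
# all projected into the same embedding dimension.
feature_vectors = tf.random.normal([batch, num_features, dim])

# Explicit second-order interactions: pairwise dot products between all
# feature vectors -> [batch, num_features, num_features].
interactions = tf.matmul(feature_vectors, feature_vectors, transpose_b=True)

# Keep each unique pair once (strictly lower triangle) and flatten.
ones = tf.ones([num_features, num_features])
pair_mask = tf.cast(
    tf.linalg.band_part(ones, -1, 0) - tf.linalg.band_part(ones, 0, 0), tf.bool
)
pairs = tf.boolean_mask(interactions, pair_mask, axis=1)  # [batch, 10]

# The top MLP turns the interaction terms into a single click logit.
top_mlp = tf.keras.Sequential(
    [tf.keras.layers.Dense(128, activation="relu"), tf.keras.layers.Dense(1)]
)
click_logit = top_mlp(pairs)  # shape (8, 1)
```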
+ ] },
 {
 "cell_type": "code",
 "execution_count": 17,
 "id": "e4325080",
 "metadata": {},
 "outputs": [],
 "source": [
 "model = mm.DLRMModel(\n",
 "    schema,\n",
 "    embedding_dim=64,\n",
 "    bottom_block=mm.MLPBlock([128, 64]),\n",
 "    top_block=mm.MLPBlock([128, 64, 32]),\n",
 "    prediction_tasks=mm.BinaryClassificationTask(target_column),\n",
 ")"
 ]
 },
 {
 "cell_type": "code",
- "execution_count": 19,
- "id": "d6703d7c-d38f-4d6d-a20a-9ee95ff1e256",
+ "execution_count": 18,
+ "id": "bfe2aa9e",
 "metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
- "5/5 [==============================] - 11s 566ms/step - loss: 8.8953 - recall_at_10: 0.0175 - ndcg_10: 0.0135 - val_loss: 8.8948 - val_recall_at_10: 0.0358 - val_ndcg_10: 0.0339\n"
+ "5/5 [==============================] - 4s 227ms/step - loss: 0.6931 - auc: 0.4982 - regularization_loss: 0.0000e+00 - val_loss: 0.6932 - val_auc: 0.5019 - val_regularization_loss: 0.0000e+00\n"
 ]
 },
 {
 "data": {
 "text/plain": [
- "<keras.callbacks.History at 0x7f3bb5d13490>"
+ "<keras.callbacks.History at 0x7f3bb2b75970>"
 ]
 },
- "execution_count": 19,
+ "execution_count": 18,
 "metadata": {},
 "output_type": "execute_result"
 }
 ],
 "source": [
- "model.compile(\n",
- "    optimizer=\"adam\",\n",
- "    run_eagerly=False,\n",
- "    loss=\"categorical_crossentropy\",\n",
- "    metrics=[mm.RecallAt(10), mm.NDCGAt(10)],\n",
- ")\n",
- "model.fit(train_tt, validation_data=valid_tt, batch_size=1024 * 8, epochs=1)"
+ "model.compile(optimizer=\"adam\", run_eagerly=False, metrics=[tf.keras.metrics.AUC()])\n",
+ "model.fit(train, validation_data=valid, batch_size=16 * 1024)"
 ]
 },
 {
 "cell_type": "markdown",
 "id": "498c4d49-7a59-4260-87b9-b86b66f2c67f",
 "metadata": {},
 "source": [
 "Let's save our DLRM model to be able to load it back at the deployment stage. "
 ]
 },
 {
 "cell_type": "code",
 "execution_count": 19,
 "id": "00447c12-ea80-4d98-ab47-cc1a982a6958",
 "metadata": {},
 "outputs": [],
 "source": [
 "model.save(os.path.join(BASE_DIR, \"dlrm\"))"
 ]
 },
@@ -674,7 +643,7 @@
 "id": "99a4e939-d3cf-44f0-9012-d2af3264ee25",
 "metadata": {},
 "source": [
- "Before we move onto the next step, we need to create a Feast feature repository."
+ "Before we move on to the next step, we need to create a Feast feature repository. We will create the feature repo in the current working directory, which is `BASE_DIR` for us."
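For orientation, a feature definition file inside that repository might look roughly like the sketch below. This follows the Feast 0.19-style API (matching the `feast<0.20` pin in the install cell); the file path, entity, feature names, and TTL are illustrative and not the exact definitions generated by this notebook:

```python
from datetime import timedelta

from feast import Entity, Feature, FeatureView, FileSource, ValueType

# Offline source: the user features parquet exported earlier, keyed by the
# timestamp columns Feast uses for point-in-time joins.
user_features_source = FileSource(
    path="feature_repo/data/user_features.parquet",  # illustrative path
    event_timestamp_column="datetime",
    created_timestamp_column="created",
)

user = Entity(name="user_id", value_type=ValueType.INT32, join_key="user_id")

user_features_view = FeatureView(
    name="user_features",
    entities=["user_id"],
    ttl=timedelta(days=1),  # illustrative TTL
    features=[
        Feature(name="user_shops", dtype=ValueType.INT32),
        Feature(name="user_profile", dtype=ValueType.INT32),
    ],
    online=True,
    batch_source=user_features_source,
)
```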
] }, { @@ -735,7 +704,7 @@ "metadata": {}, "outputs": [], "source": [ - "query_tower = model.retrieval_block.query_block()\n", + "query_tower = model_tt.retrieval_block.query_block()\n", "query_tower.save(os.path.join(BASE_DIR, \"query_tower\"))" ] }, @@ -997,8 +966,8 @@ " 1\n", " 1\n", " 1\n", - " 2022-06-13 15:46:06.144215\n", - " 2022-06-13 15:46:06.765273\n", + " 2022-07-28 14:46:29.651986\n", + " 2022-07-28 14:46:29.653266\n", " \n", " \n", " 1\n", @@ -1014,8 +983,8 @@ " 2\n", " 2\n", " 2\n", - " 2022-06-13 15:46:06.144215\n", - " 2022-06-13 15:46:06.765273\n", + " 2022-07-28 14:46:29.651986\n", + " 2022-07-28 14:46:29.653266\n", " \n", " \n", " 2\n", @@ -1031,8 +1000,8 @@ " 3\n", " 3\n", " 3\n", - " 2022-06-13 15:46:06.144215\n", - " 2022-06-13 15:46:06.765273\n", + " 2022-07-28 14:46:29.651986\n", + " 2022-07-28 14:46:29.653266\n", " \n", " \n", " 3\n", @@ -1048,8 +1017,8 @@ " 4\n", " 4\n", " 4\n", - " 2022-06-13 15:46:06.144215\n", - " 2022-06-13 15:46:06.765273\n", + " 2022-07-28 14:46:29.651986\n", + " 2022-07-28 14:46:29.653266\n", " \n", " \n", " 4\n", @@ -1065,8 +1034,8 @@ " 5\n", " 5\n", " 5\n", - " 2022-06-13 15:46:06.144215\n", - " 2022-06-13 15:46:06.765273\n", + " 2022-07-28 14:46:29.651986\n", + " 2022-07-28 14:46:29.653266\n", " \n", " \n", "\n", @@ -1088,18 +1057,18 @@ "4 1 1 1 5 \n", "\n", " user_brands user_categories datetime \\\n", - "0 1 1 2022-06-13 15:46:06.144215 \n", - "1 2 2 2022-06-13 15:46:06.144215 \n", - "2 3 3 2022-06-13 15:46:06.144215 \n", - "3 4 4 2022-06-13 15:46:06.144215 \n", - "4 5 5 2022-06-13 15:46:06.144215 \n", + "0 1 1 2022-07-28 14:46:29.651986 \n", + "1 2 2 2022-07-28 14:46:29.651986 \n", + "2 3 3 2022-07-28 14:46:29.651986 \n", + "3 4 4 2022-07-28 14:46:29.651986 \n", + "4 5 5 2022-07-28 14:46:29.651986 \n", "\n", " created \n", - "0 2022-06-13 15:46:06.765273 \n", - "1 2022-06-13 15:46:06.765273 \n", - "2 2022-06-13 15:46:06.765273 \n", - "3 2022-06-13 15:46:06.765273 \n", - "4 2022-06-13 15:46:06.765273 " + "0 2022-07-28 14:46:29.653266 \n", + "1 2022-07-28 14:46:29.653266 \n", + "2 2022-07-28 14:46:29.653266 \n", + "3 2022-07-28 14:46:29.653266 \n", + "4 2022-07-28 14:46:29.653266 " ] }, "execution_count": 26, @@ -1146,7 +1115,7 @@ { "data": { "text/plain": [ - "(441, 4)" + "(453, 4)" ] }, "execution_count": 29, @@ -1213,8 +1182,8 @@ " 1\n", " 1\n", " 1\n", - " 2022-06-13 15:46:06.930595\n", - " 2022-06-13 15:46:06.933351\n", + " 2022-07-28 14:46:29.747641\n", + " 2022-07-28 14:46:29.748486\n", " \n", " \n", " 1\n", @@ -1222,8 +1191,8 @@ " 2\n", " 2\n", " 2\n", - " 2022-06-13 15:46:06.930595\n", - " 2022-06-13 15:46:06.933351\n", + " 2022-07-28 14:46:29.747641\n", + " 2022-07-28 14:46:29.748486\n", " \n", " \n", " 2\n", @@ -1231,8 +1200,8 @@ " 3\n", " 3\n", " 3\n", - " 2022-06-13 15:46:06.930595\n", - " 2022-06-13 15:46:06.933351\n", + " 2022-07-28 14:46:29.747641\n", + " 2022-07-28 14:46:29.748486\n", " \n", " \n", " 3\n", @@ -1240,8 +1209,8 @@ " 4\n", " 4\n", " 4\n", - " 2022-06-13 15:46:06.930595\n", - " 2022-06-13 15:46:06.933351\n", + " 2022-07-28 14:46:29.747641\n", + " 2022-07-28 14:46:29.748486\n", " \n", " \n", " 4\n", @@ -1249,8 +1218,8 @@ " 5\n", " 5\n", " 5\n", - " 2022-06-13 15:46:06.930595\n", - " 2022-06-13 15:46:06.933351\n", + " 2022-07-28 14:46:29.747641\n", + " 2022-07-28 14:46:29.748486\n", " \n", " \n", "\n", @@ -1258,18 +1227,18 @@ ], "text/plain": [ " item_id item_category item_shop item_brand datetime \\\n", - "0 1 1 1 1 2022-06-13 15:46:06.930595 \n", - "1 2 2 2 2 2022-06-13 15:46:06.930595 \n", - "2 3 3 3 3 
2022-06-13 15:46:06.930595 \n", - "3 4 4 4 4 2022-06-13 15:46:06.930595 \n", - "4 5 5 5 5 2022-06-13 15:46:06.930595 \n", + "0 1 1 1 1 2022-07-28 14:46:29.747641 \n", + "1 2 2 2 2 2022-07-28 14:46:29.747641 \n", + "2 3 3 3 3 2022-07-28 14:46:29.747641 \n", + "3 4 4 4 4 2022-07-28 14:46:29.747641 \n", + "4 5 5 5 5 2022-07-28 14:46:29.747641 \n", "\n", " created \n", - "0 2022-06-13 15:46:06.933351 \n", - "1 2022-06-13 15:46:06.933351 \n", - "2 2022-06-13 15:46:06.933351 \n", - "3 2022-06-13 15:46:06.933351 \n", - "4 2022-06-13 15:46:06.933351 " + "0 2022-07-28 14:46:29.748486 \n", + "1 2022-07-28 14:46:29.748486 \n", + "2 2022-07-28 14:46:29.748486 \n", + "3 2022-07-28 14:46:29.748486 \n", + "4 2022-07-28 14:46:29.748486 " ] }, "execution_count": 31, @@ -1309,7 +1278,7 @@ "metadata": {}, "outputs": [], "source": [ - "item_embs = model.item_embeddings(\n", + "item_embs = model_tt.item_embeddings(\n", " Dataset(item_features, schema=schema), batch_size=1024\n", ")\n", "item_embs_df = item_embs.compute(scheduler=\"synchronous\")" @@ -1382,122 +1351,122 @@ " \n", " 0\n", " 1\n", - " -0.014644\n", - " -0.030120\n", - " -0.043844\n", - " -0.032847\n", - " 0.011542\n", - " -0.023905\n", - " -0.029672\n", - " -0.019096\n", - " 0.003238\n", + " -0.011209\n", + " -0.003091\n", + " -0.019507\n", + " -0.002687\n", + " 0.011972\n", + " -0.021300\n", + " -0.032583\n", + " 0.004853\n", + " 0.014860\n", " ...\n", - " -0.037217\n", - " 0.038835\n", - " 0.007689\n", - " 0.000896\n", - " -0.009022\n", - " -0.010495\n", - " 0.023698\n", - " 0.033870\n", - " -0.029741\n", - " -0.030549\n", + " 0.004789\n", + " 0.010169\n", + " 0.001538\n", + " 0.007499\n", + " -0.031698\n", + " 0.005041\n", + " -0.010608\n", + " -0.017708\n", + " 0.028459\n", + " -0.005681\n", " \n", " \n", " 1\n", " 2\n", - " -0.026403\n", - " -0.005529\n", - " -0.025439\n", - " -0.003412\n", - " 0.005109\n", - " -0.013113\n", - " 0.029868\n", - " -0.038708\n", - " 0.003663\n", + " 0.003402\n", + " -0.017468\n", + " -0.044606\n", + " -0.018614\n", + " 0.013404\n", + " 0.035754\n", + " -0.045333\n", + " -0.008586\n", + " -0.001069\n", " ...\n", - " -0.030878\n", - " -0.014543\n", - " -0.017892\n", - " 0.025364\n", - " -0.032985\n", - " -0.006969\n", - " 0.031746\n", - " -0.003650\n", - " -0.019432\n", - " -0.006048\n", + " 0.013446\n", + " 0.000704\n", + " 0.004869\n", + " 0.026757\n", + " 0.013619\n", + " -0.026476\n", + " -0.012150\n", + " -0.022430\n", + " -0.000142\n", + " -0.001121\n", " \n", " \n", " 2\n", " 3\n", - " -0.031090\n", - " -0.035648\n", - " -0.042533\n", - " -0.020776\n", - " 0.000392\n", - " -0.001049\n", - " -0.014174\n", - " -0.000956\n", - " -0.053086\n", + " 0.019445\n", + " -0.007307\n", + " -0.033769\n", + " 0.028448\n", + " 0.015149\n", + " 0.044320\n", + " -0.060556\n", + " -0.025492\n", + " -0.044191\n", " ...\n", - " 0.023282\n", - " 0.025333\n", - " 0.016674\n", - " 0.017381\n", - " -0.007932\n", - " -0.029791\n", - " 0.007936\n", - " 0.024365\n", - " -0.033152\n", - " -0.006021\n", + " -0.030190\n", + " -0.007296\n", + " 0.005592\n", + " 0.051516\n", + " 0.012991\n", + " -0.023061\n", + " 0.006839\n", + " -0.001478\n", + " 0.067653\n", + " 0.025774\n", " \n", " \n", " 3\n", " 4\n", - " 0.021717\n", - " 0.009807\n", - " -0.005943\n", - " -0.022264\n", - " 0.023473\n", - " -0.033276\n", - " -0.025934\n", - " -0.023402\n", - " 0.017395\n", + " -0.014708\n", + " 0.015244\n", + " -0.062631\n", + " -0.004068\n", + " 0.032136\n", + " 0.017407\n", + " -0.067003\n", + " -0.006915\n", + " -0.034325\n", " ...\n", - " 
-0.051937\n", - " 0.004534\n", - " 0.017306\n", - " 0.012567\n", - " 0.021283\n", - " 0.006707\n", - " -0.012907\n", - " 0.037199\n", - " 0.002680\n", - " -0.035851\n", + " -0.046192\n", + " 0.000838\n", + " -0.010202\n", + " -0.011279\n", + " 0.036282\n", + " 0.037406\n", + " -0.040168\n", + " 0.002909\n", + " -0.027596\n", + " -0.019525\n", " \n", " \n", " 4\n", " 5\n", - " 0.000333\n", - " -0.007780\n", - " -0.028711\n", - " 0.000049\n", - " 0.019983\n", - " 0.008787\n", - " -0.021814\n", - " -0.000160\n", - " -0.031236\n", + " 0.008686\n", + " -0.011467\n", + " -0.040448\n", + " 0.007353\n", + " -0.022232\n", + " -0.038102\n", + " -0.072839\n", + " 0.008757\n", + " 0.017982\n", " ...\n", - " 0.021217\n", - " 0.053417\n", - " 0.006594\n", - " 0.011942\n", - " 0.005315\n", - " 0.001932\n", - " -0.020481\n", - " 0.018115\n", - " -0.002176\n", - " 0.002586\n", + " 0.015238\n", + " 0.028096\n", + " 0.020939\n", + " 0.054904\n", + " 0.011474\n", + " -0.010195\n", + " 0.033874\n", + " -0.028908\n", + " 0.005968\n", + " 0.036249\n", " \n", " \n", "\n", @@ -1506,25 +1475,25 @@ ], "text/plain": [ " item_id 0 1 2 3 4 5 \\\n", - "0 1 -0.014644 -0.030120 -0.043844 -0.032847 0.011542 -0.023905 \n", - "1 2 -0.026403 -0.005529 -0.025439 -0.003412 0.005109 -0.013113 \n", - "2 3 -0.031090 -0.035648 -0.042533 -0.020776 0.000392 -0.001049 \n", - "3 4 0.021717 0.009807 -0.005943 -0.022264 0.023473 -0.033276 \n", - "4 5 0.000333 -0.007780 -0.028711 0.000049 0.019983 0.008787 \n", + "0 1 -0.011209 -0.003091 -0.019507 -0.002687 0.011972 -0.021300 \n", + "1 2 0.003402 -0.017468 -0.044606 -0.018614 0.013404 0.035754 \n", + "2 3 0.019445 -0.007307 -0.033769 0.028448 0.015149 0.044320 \n", + "3 4 -0.014708 0.015244 -0.062631 -0.004068 0.032136 0.017407 \n", + "4 5 0.008686 -0.011467 -0.040448 0.007353 -0.022232 -0.038102 \n", "\n", " 6 7 8 ... 54 55 56 57 \\\n", - "0 -0.029672 -0.019096 0.003238 ... -0.037217 0.038835 0.007689 0.000896 \n", - "1 0.029868 -0.038708 0.003663 ... -0.030878 -0.014543 -0.017892 0.025364 \n", - "2 -0.014174 -0.000956 -0.053086 ... 0.023282 0.025333 0.016674 0.017381 \n", - "3 -0.025934 -0.023402 0.017395 ... -0.051937 0.004534 0.017306 0.012567 \n", - "4 -0.021814 -0.000160 -0.031236 ... 0.021217 0.053417 0.006594 0.011942 \n", + "0 -0.032583 0.004853 0.014860 ... 0.004789 0.010169 0.001538 0.007499 \n", + "1 -0.045333 -0.008586 -0.001069 ... 0.013446 0.000704 0.004869 0.026757 \n", + "2 -0.060556 -0.025492 -0.044191 ... -0.030190 -0.007296 0.005592 0.051516 \n", + "3 -0.067003 -0.006915 -0.034325 ... -0.046192 0.000838 -0.010202 -0.011279 \n", + "4 -0.072839 0.008757 0.017982 ... 
0.015238 0.028096 0.020939 0.054904 \n",
 "\n",
 " 58 59 60 61 62 63 \n",
- "0 -0.009022 -0.010495 0.023698 0.033870 -0.029741 -0.030549 \n",
- "1 -0.032985 -0.006969 0.031746 -0.003650 -0.019432 -0.006048 \n",
- "2 -0.007932 -0.029791 0.007936 0.024365 -0.033152 -0.006021 \n",
- "3 0.021283 0.006707 -0.012907 0.037199 0.002680 -0.035851 \n",
- "4 0.005315 0.001932 -0.020481 0.018115 -0.002176 0.002586 \n",
+ "0 -0.031698 0.005041 -0.010608 -0.017708 0.028459 -0.005681 \n",
+ "1 0.013619 -0.026476 -0.012150 -0.022430 -0.000142 -0.001121 \n",
+ "2 0.012991 -0.023061 0.006839 -0.001478 0.067653 0.025774 \n",
+ "3 0.036282 0.037406 -0.040168 0.002909 -0.027596 -0.019525 \n",
+ "4 0.011474 -0.010195 0.033874 -0.028908 0.005968 0.036249 \n",
 "\n",
 "[5 rows x 65 columns]"
 ]
 },
@@ -1669,10 +1638,20 @@
 },
 {
 "cell_type": "code",
- "execution_count": null,
+ "execution_count": 39,
 "id": "57133c1e-18d9-4ccb-9704-cdebd271985e",
 "metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Requirement already satisfied: seedir in /usr/local/lib/python3.8/dist-packages (0.3.1)\n",
+ "Requirement already satisfied: natsort in /usr/local/lib/python3.8/dist-packages (from seedir) (8.1.0)\n",
+ "Requirement already satisfied: emoji in /usr/local/lib/python3.8/dist-packages (from seedir) (2.0.0)\n"
+ ]
+ }
+ ],
 "source": [
 "# install seedir\n",
 "!pip install seedir"

From 140e10192670ff4c14f6348b6760ed59ed259e08 Mon Sep 17 00:00:00 2001
From: root
Date: Thu, 28 Jul 2022 15:10:13 +0000
Subject: [PATCH 2/3] update text

---
 ...ding-Recommender-Systems-with-Merlin.ipynb | 32 ++++++++++++++++---
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/examples/Building-and-deploying-multi-stage-RecSys/01-Building-Recommender-Systems-with-Merlin.ipynb b/examples/Building-and-deploying-multi-stage-RecSys/01-Building-Recommender-Systems-with-Merlin.ipynb
index 484d4d7fd..1ab2fa71b 100644
--- a/examples/Building-and-deploying-multi-stage-RecSys/01-Building-Recommender-Systems-with-Merlin.ipynb
+++ b/examples/Building-and-deploying-multi-stage-RecSys/01-Building-Recommender-Systems-with-Merlin.ipynb
@@ -115,7 +115,7 @@
 "source": [
 "**Compatibility:**\n",
 "\n",
- "These notebooks are developed and tested using our latest inference container on [NVIDIA's docker registry](https://catalog.ngc.nvidia.com/containers?filters=&orderBy=dateModifiedDESC&query=merlin)."
+ "These notebooks are developed and tested using our latest `merlin-tensorflow:22.XX` container on [NVIDIA's docker registry](https://catalog.ngc.nvidia.com/containers?filters=&orderBy=dateModifiedDESC&query=merlin)."
 ]
 },
@@ -199,13 +199,13 @@
 {
 "cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
 "id": "81ddb370",
 "metadata": {},
 "outputs": [],
 "source": [
 "DATA_FOLDER = os.environ.get(\"DATA_FOLDER\", \"/workspace/data/\")\n",
- "# set up the base dir to for feature store\n",
+ "# set up the base dir for feature store\n",
 "BASE_DIR = os.environ.get(\n",
 "    \"BASE_DIR\", \"/Merlin/examples/Building-and-deploying-multi-stage-RecSys/\"\n",
 ")"
 ]
 },
@@ -430,12 +430,28 @@
 "### Training a Ranking Model with DLRM"
 ]
 },
+ {
+ "cell_type": "markdown",
+ "id": "b72e8a2a-fc4a-43ab-934c-6d941c56aad2",
+ "metadata": {},
+ "source": [
+ "Now we will move on to training an offline ranking model. This ranking model will be used for scoring our retrieved items."
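To see how the two stages fit together at serving time, here is a schematic sketch of the re-ranking step — NumPy only; the candidate ids and probabilities are made up, and the probability array stands in for what the trained ranking model's predictions would return:

```python
import numpy as np

# The retrieval stage returns a small candidate set per user ...
candidate_ids = np.array([42, 7, 19, 3])

# ... and the ranking model assigns each candidate a click probability
# (stand-in values here; in practice these come from the trained model).
click_probs = np.array([0.12, 0.48, 0.05, 0.33])

# Keep the top-k candidates by predicted probability.
top_k = 2
best = candidate_ids[np.argsort(-click_probs)[:top_k]]
print(best)  # [7 3]
```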
+ ] + }, + { + "cell_type": "markdown", + "id": "5243f652-141f-4151-b05a-6d36396e719f", + "metadata": {}, + "source": [ + "#### Feature Engineering with NVTabular" + ] + }, { "cell_type": "markdown", "id": "ccc14bbf-b813-4306-a9a5-ccb1ccc56b5e", "metadata": {}, "source": [ - "Let's call `transform_aliccp` utility function to be able to perform `fit` and `transform` steps on the raw dataset applying the operators defined in the NVTabular workflow pipeline below, and also save our workflow model. After fit and transform, the processed parquet files are saved to output_path." + "Define output path." ] }, { @@ -494,6 +510,14 @@ "outputs = user_id + item_id + item_features + user_features + targets" ] }, + { + "cell_type": "markdown", + "id": "59751552-82e2-4fce-ba13-88e79bda8222", + "metadata": {}, + "source": [ + "Let's call `transform_aliccp` utility function to be able to perform `fit` and `transform` steps on the raw dataset applying the operators defined in the NVTabular workflow pipeline below, and also save our workflow model. After fit and transform, the processed parquet files are saved to output_path." + ] + }, { "cell_type": "code", "execution_count": 14, From bb8740bd2e9fd10eebb02419c28ec235cc460176 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 29 Jul 2022 16:31:40 +0000 Subject: [PATCH 3/3] move export query tower cell up --- ...ding-Recommender-Systems-with-Merlin.ipynb | 413 +++++++++--------- 1 file changed, 210 insertions(+), 203 deletions(-) diff --git a/examples/Building-and-deploying-multi-stage-RecSys/01-Building-Recommender-Systems-with-Merlin.ipynb b/examples/Building-and-deploying-multi-stage-RecSys/01-Building-Recommender-Systems-with-Merlin.ipynb index 1ab2fa71b..e1deec30a 100644 --- a/examples/Building-and-deploying-multi-stage-RecSys/01-Building-Recommender-Systems-with-Merlin.ipynb +++ b/examples/Building-and-deploying-multi-stage-RecSys/01-Building-Recommender-Systems-with-Merlin.ipynb @@ -135,20 +135,10 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "08cdbfcc", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2022-07-28 14:45:46.631510: I tensorflow/core/platform/cpu_feature_guard.cc:152] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", - "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2022-07-28 14:45:47.678608: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 16249 MB memory: -> device: 0, name: Quadro GV100, pci bus id: 0000:2d:00.0, compute capability: 7.0\n" - ] - } - ], + "outputs": [], "source": [ "import os\n", "\n", @@ -199,7 +189,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "81ddb370", "metadata": {}, "outputs": [], @@ -373,7 +363,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-07-28 14:45:50.388784: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n" + "2022-07-29 16:05:31.324620: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.\n" ] } ], @@ -396,13 +386,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "5/5 
[==============================] - 10s 441ms/step - loss: 8.9115 - recall_at_10: 0.0082 - ndcg_at_10: 0.0061 - regularization_loss: 0.0000e+00 - val_loss: 8.9090 - val_recall_at_10: 0.0096 - val_ndcg_at_10: 0.0063 - val_regularization_loss: 0.0000e+00\n" + "5/5 [==============================] - 10s 444ms/step - loss: 8.9095 - recall_at_10: 0.0059 - ndcg_at_10: 0.0032 - regularization_loss: 0.0000e+00 - val_loss: 8.9096 - val_recall_at_10: 0.0079 - val_ndcg_at_10: 0.0036 - val_regularization_loss: 0.0000e+00\n" ] }, { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 11, @@ -420,6 +410,33 @@ "model_tt.fit(train_tt, validation_data=valid_tt, batch_size=1024 * 8, epochs=1)" ] }, + { + "cell_type": "markdown", + "id": "80d83007-f9e8-408f-9f65-a0e9e19cb586", + "metadata": {}, + "source": [ + "### Exporting query (user) model" + ] + }, + { + "cell_type": "markdown", + "id": "22af58a9-5525-454a-bf25-a9df0462aa53", + "metadata": {}, + "source": [ + "We export the query tower to use it later during the model deployment stage with Merlin Systems." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "d2370f13-ff9a-4ee0-ba1e-451c7bec0f8a", + "metadata": {}, + "outputs": [], + "source": [ + "query_tower = model_tt.retrieval_block.query_block()\n", + "query_tower.save(os.path.join(BASE_DIR, \"query_tower\"))" + ] + }, { "cell_type": "markdown", "id": "e16401d4", @@ -456,7 +473,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "id": "6a4b2ad0-c873-4a4a-8466-d21b5d181c74", "metadata": {}, "outputs": [], @@ -466,7 +483,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "id": "7a6bc984-f1b7-4e2f-97ca-612be0d8e390", "metadata": {}, "outputs": [ @@ -474,8 +491,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 305 µs, sys: 38 µs, total: 343 µs\n", - "Wall time: 351 µs\n" + "CPU times: user 162 µs, sys: 17 µs, total: 179 µs\n", + "Wall time: 183 µs\n" ] } ], @@ -520,10 +537,19 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "id": "6db6387d-174c-4cbe-a995-496faeb0c512", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.8/dist-packages/cudf/core/frame.py:384: UserWarning: The deep parameter is ignored and is only included for pandas compatibility.\n", + " warnings.warn(\n" + ] + } + ], "source": [ "transform_aliccp(\n", " (train_raw, valid_raw), output_path, nvt_workflow=outputs, workflow_name=\"workflow_ranking\"\n", @@ -540,7 +566,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "id": "cb870461-6ac2-49b2-ba6a-2da6ecb57f1d", "metadata": {}, "outputs": [], @@ -555,7 +581,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "id": "30e4ebc2", "metadata": {}, "outputs": [ @@ -565,7 +591,7 @@ "'click'" ] }, - "execution_count": 16, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -585,7 +611,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "id": "e4325080", "metadata": {}, "outputs": [], @@ -601,7 +627,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "id": "bfe2aa9e", "metadata": {}, "outputs": [ @@ -609,16 +635,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "5/5 [==============================] - 4s 227ms/step - loss: 0.6931 - auc: 0.4982 - regularization_loss: 0.0000e+00 - val_loss: 0.6932 - val_auc: 0.5019 - 
val_regularization_loss: 0.0000e+00\n" + "5/5 [==============================] - 4s 231ms/step - loss: 0.6932 - auc: 0.4999 - regularization_loss: 0.0000e+00 - val_loss: 0.6931 - val_auc: 0.5002 - val_regularization_loss: 0.0000e+00\n" ] }, { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 18, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -638,7 +664,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "id": "00447c12-ea80-4d98-ab47-cc1a982a6958", "metadata": {}, "outputs": [], @@ -672,7 +698,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "id": "2e7e96d2-9cd2-40d1-b356-8cd76b57bb4a", "metadata": {}, "outputs": [ @@ -701,7 +727,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "id": "26ba2521-ed1b-4c2b-afdd-26b4a5a9c008", "metadata": {}, "outputs": [], @@ -713,25 +739,6 @@ " os.remove(f\"{feature_repo_path}/data/driver_stats.parquet\")" ] }, - { - "cell_type": "markdown", - "id": "e44fc89d-170b-41a1-a29b-5f958ed31399", - "metadata": {}, - "source": [ - "### Exporting query (user) model" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "2af24597-e89c-43a4-9a13-458d8bed7c8a", - "metadata": {}, - "outputs": [], - "source": [ - "query_tower = model_tt.retrieval_block.query_block()\n", - "query_tower.save(os.path.join(BASE_DIR, \"query_tower\"))" - ] - }, { "cell_type": "markdown", "id": "78315676-eb6c-405a-b1fd-3174ea328406", @@ -990,8 +997,8 @@ " 1\n", " 1\n", " 1\n", - " 2022-07-28 14:46:29.651986\n", - " 2022-07-28 14:46:29.653266\n", + " 2022-07-29 16:06:09.642304\n", + " 2022-07-29 16:06:09.643715\n", " \n", " \n", " 1\n", @@ -1007,8 +1014,8 @@ " 2\n", " 2\n", " 2\n", - " 2022-07-28 14:46:29.651986\n", - " 2022-07-28 14:46:29.653266\n", + " 2022-07-29 16:06:09.642304\n", + " 2022-07-29 16:06:09.643715\n", " \n", " \n", " 2\n", @@ -1024,8 +1031,8 @@ " 3\n", " 3\n", " 3\n", - " 2022-07-28 14:46:29.651986\n", - " 2022-07-28 14:46:29.653266\n", + " 2022-07-29 16:06:09.642304\n", + " 2022-07-29 16:06:09.643715\n", " \n", " \n", " 3\n", @@ -1041,8 +1048,8 @@ " 4\n", " 4\n", " 4\n", - " 2022-07-28 14:46:29.651986\n", - " 2022-07-28 14:46:29.653266\n", + " 2022-07-29 16:06:09.642304\n", + " 2022-07-29 16:06:09.643715\n", " \n", " \n", " 4\n", @@ -1058,8 +1065,8 @@ " 5\n", " 5\n", " 5\n", - " 2022-07-28 14:46:29.651986\n", - " 2022-07-28 14:46:29.653266\n", + " 2022-07-29 16:06:09.642304\n", + " 2022-07-29 16:06:09.643715\n", " \n", " \n", "\n", @@ -1081,18 +1088,18 @@ "4 1 1 1 5 \n", "\n", " user_brands user_categories datetime \\\n", - "0 1 1 2022-07-28 14:46:29.651986 \n", - "1 2 2 2022-07-28 14:46:29.651986 \n", - "2 3 3 2022-07-28 14:46:29.651986 \n", - "3 4 4 2022-07-28 14:46:29.651986 \n", - "4 5 5 2022-07-28 14:46:29.651986 \n", + "0 1 1 2022-07-29 16:06:09.642304 \n", + "1 2 2 2022-07-29 16:06:09.642304 \n", + "2 3 3 2022-07-29 16:06:09.642304 \n", + "3 4 4 2022-07-29 16:06:09.642304 \n", + "4 5 5 2022-07-29 16:06:09.642304 \n", "\n", " created \n", - "0 2022-07-28 14:46:29.653266 \n", - "1 2022-07-28 14:46:29.653266 \n", - "2 2022-07-28 14:46:29.653266 \n", - "3 2022-07-28 14:46:29.653266 \n", - "4 2022-07-28 14:46:29.653266 " + "0 2022-07-29 16:06:09.643715 \n", + "1 2022-07-29 16:06:09.643715 \n", + "2 2022-07-29 16:06:09.643715 \n", + "3 2022-07-29 16:06:09.643715 \n", + "4 2022-07-29 16:06:09.643715 " ] }, "execution_count": 26, @@ -1139,7 +1146,7 @@ { "data": { "text/plain": [ - "(453, 4)" + "(451, 4)" ] }, "execution_count": 
29, @@ -1206,8 +1213,8 @@ " 1\n", " 1\n", " 1\n", - " 2022-07-28 14:46:29.747641\n", - " 2022-07-28 14:46:29.748486\n", + " 2022-07-29 16:06:09.742965\n", + " 2022-07-29 16:06:09.745334\n", " \n", " \n", " 1\n", @@ -1215,8 +1222,8 @@ " 2\n", " 2\n", " 2\n", - " 2022-07-28 14:46:29.747641\n", - " 2022-07-28 14:46:29.748486\n", + " 2022-07-29 16:06:09.742965\n", + " 2022-07-29 16:06:09.745334\n", " \n", " \n", " 2\n", @@ -1224,8 +1231,8 @@ " 3\n", " 3\n", " 3\n", - " 2022-07-28 14:46:29.747641\n", - " 2022-07-28 14:46:29.748486\n", + " 2022-07-29 16:06:09.742965\n", + " 2022-07-29 16:06:09.745334\n", " \n", " \n", " 3\n", @@ -1233,8 +1240,8 @@ " 4\n", " 4\n", " 4\n", - " 2022-07-28 14:46:29.747641\n", - " 2022-07-28 14:46:29.748486\n", + " 2022-07-29 16:06:09.742965\n", + " 2022-07-29 16:06:09.745334\n", " \n", " \n", " 4\n", @@ -1242,8 +1249,8 @@ " 5\n", " 5\n", " 5\n", - " 2022-07-28 14:46:29.747641\n", - " 2022-07-28 14:46:29.748486\n", + " 2022-07-29 16:06:09.742965\n", + " 2022-07-29 16:06:09.745334\n", " \n", " \n", "\n", @@ -1251,18 +1258,18 @@ ], "text/plain": [ " item_id item_category item_shop item_brand datetime \\\n", - "0 1 1 1 1 2022-07-28 14:46:29.747641 \n", - "1 2 2 2 2 2022-07-28 14:46:29.747641 \n", - "2 3 3 3 3 2022-07-28 14:46:29.747641 \n", - "3 4 4 4 4 2022-07-28 14:46:29.747641 \n", - "4 5 5 5 5 2022-07-28 14:46:29.747641 \n", + "0 1 1 1 1 2022-07-29 16:06:09.742965 \n", + "1 2 2 2 2 2022-07-29 16:06:09.742965 \n", + "2 3 3 3 3 2022-07-29 16:06:09.742965 \n", + "3 4 4 4 4 2022-07-29 16:06:09.742965 \n", + "4 5 5 5 5 2022-07-29 16:06:09.742965 \n", "\n", " created \n", - "0 2022-07-28 14:46:29.748486 \n", - "1 2022-07-28 14:46:29.748486 \n", - "2 2022-07-28 14:46:29.748486 \n", - "3 2022-07-28 14:46:29.748486 \n", - "4 2022-07-28 14:46:29.748486 " + "0 2022-07-29 16:06:09.745334 \n", + "1 2022-07-29 16:06:09.745334 \n", + "2 2022-07-29 16:06:09.745334 \n", + "3 2022-07-29 16:06:09.745334 \n", + "4 2022-07-29 16:06:09.745334 " ] }, "execution_count": 31, @@ -1375,122 +1382,122 @@ " \n", " 0\n", " 1\n", - " -0.011209\n", - " -0.003091\n", - " -0.019507\n", - " -0.002687\n", - " 0.011972\n", - " -0.021300\n", - " -0.032583\n", - " 0.004853\n", - " 0.014860\n", + " 0.021452\n", + " -0.039451\n", + " -0.027525\n", + " 0.004748\n", + " -0.038861\n", + " 0.004086\n", + " -0.015699\n", + " -0.043150\n", + " 0.009985\n", " ...\n", - " 0.004789\n", - " 0.010169\n", - " 0.001538\n", - " 0.007499\n", - " -0.031698\n", - " 0.005041\n", - " -0.010608\n", - " -0.017708\n", - " 0.028459\n", - " -0.005681\n", + " 0.004878\n", + " -0.017287\n", + " -0.052191\n", + " -0.039028\n", + " 0.009816\n", + " 0.030303\n", + " -0.023015\n", + " 0.025190\n", + " -0.048255\n", + " -0.004642\n", " \n", " \n", " 1\n", " 2\n", - " 0.003402\n", - " -0.017468\n", - " -0.044606\n", - " -0.018614\n", - " 0.013404\n", - " 0.035754\n", - " -0.045333\n", - " -0.008586\n", - " -0.001069\n", + " -0.006522\n", + " -0.024186\n", + " -0.014698\n", + " 0.026402\n", + " -0.038433\n", + " -0.005294\n", + " -0.022629\n", + " 0.008153\n", + " 0.027704\n", " ...\n", - " 0.013446\n", - " 0.000704\n", - " 0.004869\n", - " 0.026757\n", - " 0.013619\n", - " -0.026476\n", - " -0.012150\n", - " -0.022430\n", - " -0.000142\n", - " -0.001121\n", + " -0.003684\n", + " -0.002784\n", + " -0.006920\n", + " -0.008715\n", + " -0.046778\n", + " 0.010102\n", + " -0.056791\n", + " 0.017473\n", + " -0.028276\n", + " -0.014962\n", " \n", " \n", " 2\n", " 3\n", - " 0.019445\n", - " -0.007307\n", - " -0.033769\n", - " 0.028448\n", - " 
0.015149\n", - " 0.044320\n", - " -0.060556\n", - " -0.025492\n", - " -0.044191\n", + " 0.015643\n", + " -0.039508\n", + " -0.008537\n", + " -0.006320\n", + " -0.047506\n", + " -0.041426\n", + " 0.011790\n", + " -0.028635\n", + " 0.022870\n", " ...\n", - " -0.030190\n", - " -0.007296\n", - " 0.005592\n", - " 0.051516\n", - " 0.012991\n", - " -0.023061\n", - " 0.006839\n", - " -0.001478\n", - " 0.067653\n", - " 0.025774\n", + " 0.004367\n", + " -0.024234\n", + " -0.035562\n", + " -0.031195\n", + " 0.067566\n", + " 0.044429\n", + " 0.007656\n", + " 0.018322\n", + " -0.010219\n", + " -0.003382\n", " \n", " \n", " 3\n", " 4\n", - " -0.014708\n", - " 0.015244\n", - " -0.062631\n", - " -0.004068\n", - " 0.032136\n", - " 0.017407\n", - " -0.067003\n", - " -0.006915\n", - " -0.034325\n", + " 0.035367\n", + " -0.004361\n", + " -0.004915\n", + " -0.007458\n", + " -0.024948\n", + " -0.022634\n", + " -0.008486\n", + " 0.002527\n", + " 0.020076\n", " ...\n", - " -0.046192\n", - " 0.000838\n", - " -0.010202\n", - " -0.011279\n", - " 0.036282\n", - " 0.037406\n", - " -0.040168\n", - " 0.002909\n", - " -0.027596\n", - " -0.019525\n", + " 0.040158\n", + " -0.015545\n", + " -0.023736\n", + " 0.010247\n", + " 0.027722\n", + " 0.016833\n", + " 0.007536\n", + " 0.004979\n", + " -0.018067\n", + " -0.008719\n", " \n", " \n", " 4\n", " 5\n", - " 0.008686\n", - " -0.011467\n", - " -0.040448\n", - " 0.007353\n", - " -0.022232\n", - " -0.038102\n", - " -0.072839\n", - " 0.008757\n", - " 0.017982\n", + " 0.037872\n", + " -0.031143\n", + " 0.030403\n", + " 0.025478\n", + " -0.051041\n", + " 0.031561\n", + " 0.002363\n", + " -0.020428\n", + " 0.011938\n", " ...\n", - " 0.015238\n", - " 0.028096\n", - " 0.020939\n", - " 0.054904\n", - " 0.011474\n", - " -0.010195\n", - " 0.033874\n", - " -0.028908\n", - " 0.005968\n", - " 0.036249\n", + " -0.003469\n", + " -0.013381\n", + " 0.009062\n", + " -0.024175\n", + " -0.010814\n", + " 0.011117\n", + " -0.011316\n", + " -0.014056\n", + " -0.036326\n", + " -0.001744\n", " \n", " \n", "\n", @@ -1499,25 +1506,25 @@ ], "text/plain": [ " item_id 0 1 2 3 4 5 \\\n", - "0 1 -0.011209 -0.003091 -0.019507 -0.002687 0.011972 -0.021300 \n", - "1 2 0.003402 -0.017468 -0.044606 -0.018614 0.013404 0.035754 \n", - "2 3 0.019445 -0.007307 -0.033769 0.028448 0.015149 0.044320 \n", - "3 4 -0.014708 0.015244 -0.062631 -0.004068 0.032136 0.017407 \n", - "4 5 0.008686 -0.011467 -0.040448 0.007353 -0.022232 -0.038102 \n", + "0 1 0.021452 -0.039451 -0.027525 0.004748 -0.038861 0.004086 \n", + "1 2 -0.006522 -0.024186 -0.014698 0.026402 -0.038433 -0.005294 \n", + "2 3 0.015643 -0.039508 -0.008537 -0.006320 -0.047506 -0.041426 \n", + "3 4 0.035367 -0.004361 -0.004915 -0.007458 -0.024948 -0.022634 \n", + "4 5 0.037872 -0.031143 0.030403 0.025478 -0.051041 0.031561 \n", "\n", " 6 7 8 ... 54 55 56 57 \\\n", - "0 -0.032583 0.004853 0.014860 ... 0.004789 0.010169 0.001538 0.007499 \n", - "1 -0.045333 -0.008586 -0.001069 ... 0.013446 0.000704 0.004869 0.026757 \n", - "2 -0.060556 -0.025492 -0.044191 ... -0.030190 -0.007296 0.005592 0.051516 \n", - "3 -0.067003 -0.006915 -0.034325 ... -0.046192 0.000838 -0.010202 -0.011279 \n", - "4 -0.072839 0.008757 0.017982 ... 0.015238 0.028096 0.020939 0.054904 \n", + "0 -0.015699 -0.043150 0.009985 ... 0.004878 -0.017287 -0.052191 -0.039028 \n", + "1 -0.022629 0.008153 0.027704 ... -0.003684 -0.002784 -0.006920 -0.008715 \n", + "2 0.011790 -0.028635 0.022870 ... 0.004367 -0.024234 -0.035562 -0.031195 \n", + "3 -0.008486 0.002527 0.020076 ... 
0.040158 -0.015545 -0.023736 0.010247 \n", + "4 0.002363 -0.020428 0.011938 ... -0.003469 -0.013381 0.009062 -0.024175 \n", "\n", " 58 59 60 61 62 63 \n", - "0 -0.031698 0.005041 -0.010608 -0.017708 0.028459 -0.005681 \n", - "1 0.013619 -0.026476 -0.012150 -0.022430 -0.000142 -0.001121 \n", - "2 0.012991 -0.023061 0.006839 -0.001478 0.067653 0.025774 \n", - "3 0.036282 0.037406 -0.040168 0.002909 -0.027596 -0.019525 \n", - "4 0.011474 -0.010195 0.033874 -0.028908 0.005968 0.036249 \n", + "0 0.009816 0.030303 -0.023015 0.025190 -0.048255 -0.004642 \n", + "1 -0.046778 0.010102 -0.056791 0.017473 -0.028276 -0.014962 \n", + "2 0.067566 0.044429 0.007656 0.018322 -0.010219 -0.003382 \n", + "3 0.027722 0.016833 0.007536 0.004979 -0.018067 -0.008719 \n", + "4 -0.010814 0.011117 -0.011316 -0.014056 -0.036326 -0.001744 \n", "\n", "[5 rows x 65 columns]" ]
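A natural next use of the exported item embeddings is the nearest-neighbor index that serves retrieval; the faiss packages installed at the top of the notebook cover this. Below is a minimal sketch, assuming `item_embs_df` from above can be converted with `to_numpy()` (the exact index type used in deployment may differ):

```python
import faiss
import numpy as np

# 64-d item vectors (drop the id column), as contiguous float32 for faiss.
item_ids = item_embs_df["item_id"].to_numpy()
item_vectors = np.ascontiguousarray(
    item_embs_df.drop(columns=["item_id"]).to_numpy(), dtype=np.float32
)

# Brute-force inner-product index; IndexFlatIP is exact but simple.
index = faiss.IndexFlatIP(item_vectors.shape[1])
index.add(item_vectors)

# A query-tower output would be the real query; use a random vector here.
query = np.random.rand(1, item_vectors.shape[1]).astype(np.float32)
scores, positions = index.search(query, 10)
print(item_ids[positions[0]])  # ids of the 10 closest items
```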