From cfd1434b00d4242ad025a1be08dc8aa81e95e532 Mon Sep 17 00:00:00 2001 From: qqiao Date: Tue, 13 Jun 2023 02:56:04 -0700 Subject: [PATCH 01/13] Update the dockerfile for new upstream --- docker/dockerfile.merlin | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/docker/dockerfile.merlin b/docker/dockerfile.merlin index 153f84f39..3606957b9 100644 --- a/docker/dockerfile.merlin +++ b/docker/dockerfile.merlin @@ -1,6 +1,6 @@ # syntax=docker/dockerfile:1.2 -ARG TRITON_VERSION=23.03 -ARG DLFW_VERSION=23.03 +ARG TRITON_VERSION=23.05 +ARG DLFW_VERSION=23.05 ARG FULL_IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-py3 ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-py3-min @@ -40,10 +40,10 @@ ENV PATH=${CUDA_HOME}/lib64/:${PATH}:${CUDA_HOME}/bin # Set up NVIDIA package repository RUN apt clean && apt update -y --fix-missing && \ apt install -y --no-install-recommends software-properties-common && \ - wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin && \ - mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \ - add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" && \ + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin && \ + mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \ + add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /" && \ apt install -y --no-install-recommends \ autoconf \ automake \ @@ -93,7 +93,7 @@ RUN ln -s /usr/bin/python3 /usr/bin/python RUN pip install --no-cache-dir --upgrade pip; pip install --no-cache-dir "cmake<3.25.0" ninja scikit-build pandas==1.5.2 \ fastrlock nvidia-pyindex pybind11 pytest \ - transformers==4.12 tensorflow-metadata betterproto \ + transformers tensorflow-metadata betterproto \ cachetools graphviz nvtx scipy "scikit-learn<1.2" \ tritonclient[all]==2.29.0 grpcio-channelz fiddle wandb npy-append-array \ git+https://github.com/rapidsai/asvdb.git@main \ @@ -101,7 +101,7 @@ RUN pip install --no-cache-dir --upgrade pip; pip install --no-cache-dir "cmake< lightfm implicit \ numba "cuda-python>=11.5,<12.0" fsspec==2022.5.0 llvmlite \ pynvml==11.4.1 -RUN pip install --no-cache-dir numpy==1.22.4 protobuf==3.20.3 onnx onnxruntime==1.11.1 pycuda +RUN pip install --no-cache-dir numpy==1.22.4 protobuf==3.20.3 onnx onnxruntime pycuda RUN pip install --no-cache-dir dask==${DASK_VER} distributed==${DASK_VER} dask[dataframe]==${DASK_VER} RUN pip install --no-cache-dir onnx_graphsurgeon --index-url https://pypi.ngc.nvidia.com @@ -134,9 +134,9 @@ RUN git clone --branch v1.7.2 https://github.com/facebookresearch/faiss.git buil rm -rf build-env # Install spdlog -RUN git clone --branch v1.9.2 https://github.com/gabime/spdlog.git build-env && \ +RUN git clone --branch v1.11.0 https://github.com/gabime/spdlog.git build-env && \ pushd build-env && \ - mkdir build && cd build && cmake .. && make -j && make install && \ + mkdir build && cd build && cmake -DSPDLOG_BUILD_SHARED=ON .. 
&& make -j && make install && \ popd && \ rm -rf build-env @@ -159,10 +159,10 @@ ENV PATH=${CUDA_HOME}/lib64/:${PATH}:${CUDA_HOME}/bin # Set up NVIDIA package repository RUN apt update -y --fix-missing && \ apt install -y --no-install-recommends software-properties-common && \ - wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin && \ - mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \ - add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" && \ + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin && \ + mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \ + add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /" && \ apt install -y --no-install-recommends \ ca-certificates \ clang-format \ @@ -219,6 +219,8 @@ ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${JAVA_HOME}/lib:${JAVA_HOME}/lib/server # Includes COPY --chown=1000:1000 --from=build /usr/local/include/spdlog/ /usr/local/include/spdlog/ +COPY --chown=1000:1000 --from=build /usr/local/lib/libspdlog* /usr/local/lib/ +COPY --chown=1000:1000 --from=build /usr/local/lib/cmake/spdlog /usr/local/lib/cmake/spdlog/ # Binaries COPY --chown=1000:1000 --from=build /usr/local/bin/cmake /usr/local/bin/ @@ -245,9 +247,11 @@ COPY --chown=1000:1000 --from=triton /usr/local/cuda-12.1/targets/x86_64-linux/l ENV PATH=/opt/tritonserver/bin:${PATH}: ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/tritonserver/lib +ARG PYTHON_VERSION=3.10 + # Python Packages -COPY --chown=1000:1000 --from=build /usr/local/lib/python3.8/dist-packages /usr/local/lib/python3.8/dist-packages/ -ENV PYTHONPATH=$PYTHONPATH:/usr/local/lib/python3.8/dist-packages/ +COPY --chown=1000:1000 --from=build /usr/local/lib/python${PYTHON_VERSION}/dist-packages /usr/local/lib/python${PYTHON_VERSION}/dist-packages/ +ENV PYTHONPATH=$PYTHONPATH:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/ # rapids components from the DLFW image @@ -261,12 +265,13 @@ COPY --chown=1000:1000 --from=dlfw /usr/include/parquet /usr/include/parquet/ COPY --chown=1000:1000 --from=dlfw /usr/include/arrow /usr/include/arrow/ COPY --chown=1000:1000 --from=dlfw /usr/include/cudf /usr/include/cudf/ COPY --chown=1000:1000 --from=dlfw /usr/include/rmm /usr/include/rmm/ +COPY --chown=1000:1000 --from=dlfw /usr/include/fmt /usr/include/fmt/ + # ptx compiler required by cubinlinker COPY --chown=1000:1000 --from=dlfw /usr/local/cuda-12.1/targets/x86_64-linux/lib/libnvptxcompiler_static.a /usr/local/cuda-12.1/targets/x86_64-linux/lib/libnvptxcompiler_static.a COPY --chown=1000:1000 --from=dlfw /usr/local/cuda-12.1/targets/x86_64-linux/include/nvPTXCompiler.h /usr/local/cuda-12.1/targets/x86_64-linux/include/nvPTXCompiler.h RUN git clone https://github.com/rapidsai/ptxcompiler.git /ptx && cd /ptx/ && python setup.py develop; -ARG PYTHON_VERSION=3.8 COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/rmm /usr/local/lib/python${PYTHON_VERSION}/dist-packages/rmm COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cuda /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cuda COPY 
--chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/pyarrow /usr/local/lib/python${PYTHON_VERSION}/dist-packages/pyarrow @@ -383,6 +388,7 @@ RUN if [ "${HUGECTR_DEV_MODE}" == "false" ]; then \ git clone --branch ${HUGECTR_VER} --depth 1 https://${_CI_JOB_TOKEN}${_HUGECTR_REPO} /hugectr && \ cd /hugectr && \ git submodule update --init --recursive && \ + cd third_party/librdkafka && ./configure && make -j$(nproc) && make install && cd ../.. && \ mkdir build && \ cd build && \ if [[ "${INSTALL_HDFS}" == "false" ]]; then \ From 8b0f75302f7f2ab9e4c6d3b3e0327a1528b19542 Mon Sep 17 00:00:00 2001 From: Matthias Langer Date: Wed, 14 Jun 2023 11:15:45 -0700 Subject: [PATCH 02/13] Updates to make build compatible with 23.05 base image. --- docker/dockerfile.merlin | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/docker/dockerfile.merlin b/docker/dockerfile.merlin index 3606957b9..ccf1075d6 100644 --- a/docker/dockerfile.merlin +++ b/docker/dockerfile.merlin @@ -125,7 +125,7 @@ ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/tritonserver/lib # don't include it https://github.com/kyamagu/faiss-wheels/issues/54) RUN git clone --branch v1.7.2 https://github.com/facebookresearch/faiss.git build-env && \ pushd build-env && \ - cmake -B build . -DFAISS_ENABLE_GPU=ON -DFAISS_ENABLE_PYTHON=ON -DBUILD_TESTING=OFF -DCMAKE_BUILD_TYPE=Release -DCMAKE_CUDA_ARCHITECTURES="60;70;80" && \ + cmake -B build . -DFAISS_ENABLE_GPU=ON -DFAISS_ENABLE_PYTHON=ON -DBUILD_TESTING=OFF -DCMAKE_BUILD_TYPE=Release -DCMAKE_CUDA_ARCHITECTURES="60;70;80;90" && \ make -C build -j $(nproc) faiss swigfaiss && \ pushd build/faiss/python && \ python setup.py install && \ @@ -133,13 +133,6 @@ RUN git clone --branch v1.7.2 https://github.com/facebookresearch/faiss.git buil popd && \ rm -rf build-env -# Install spdlog -RUN git clone --branch v1.11.0 https://github.com/gabime/spdlog.git build-env && \ - pushd build-env && \ - mkdir build && cd build && cmake -DSPDLOG_BUILD_SHARED=ON .. 
&& make -j && make install && \ - popd && \ - rm -rf build-env - # Clean up RUN rm -rf /repos @@ -217,11 +210,6 @@ RUN ln -s /usr/bin/python3 /usr/bin/python ENV JAVA_HOME=/usr/lib/jvm/default-java ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${JAVA_HOME}/lib:${JAVA_HOME}/lib/server -# Includes -COPY --chown=1000:1000 --from=build /usr/local/include/spdlog/ /usr/local/include/spdlog/ -COPY --chown=1000:1000 --from=build /usr/local/lib/libspdlog* /usr/local/lib/ -COPY --chown=1000:1000 --from=build /usr/local/lib/cmake/spdlog /usr/local/lib/cmake/spdlog/ - # Binaries COPY --chown=1000:1000 --from=build /usr/local/bin/cmake /usr/local/bin/ COPY --chown=1000:1000 --from=build /usr/local/bin/pytest /usr/local/bin/ @@ -247,7 +235,7 @@ COPY --chown=1000:1000 --from=triton /usr/local/cuda-12.1/targets/x86_64-linux/l ENV PATH=/opt/tritonserver/bin:${PATH}: ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/tritonserver/lib -ARG PYTHON_VERSION=3.10 +ENV PYTHON_VERSION=3.10 # Python Packages COPY --chown=1000:1000 --from=build /usr/local/lib/python${PYTHON_VERSION}/dist-packages /usr/local/lib/python${PYTHON_VERSION}/dist-packages/ @@ -261,11 +249,13 @@ COPY --chown=1000:1000 --from=dlfw /usr/lib/libparquet* /usr/lib/ COPY --chown=1000:1000 --from=dlfw /usr/lib/cmake/Arrow /usr/lib/cmake/Arrow/ COPY --chown=1000:1000 --from=dlfw /usr/lib/cmake/Parquet /usr/lib/cmake/Parquet/ COPY --chown=1000:1000 --from=dlfw /usr/lib/libnvcomp* /usr/lib/ + +COPY --chown=1000:1000 --from=dlfw /usr/include/fmt /usr/include/fmt/ +COPY --chown=1000:1000 --from=dlfw /usr/include/spdlog /usr/include/spdlog/ +COPY --chown=1000:1000 --from=dlfw /usr/include/rmm /usr/include/rmm/ COPY --chown=1000:1000 --from=dlfw /usr/include/parquet /usr/include/parquet/ COPY --chown=1000:1000 --from=dlfw /usr/include/arrow /usr/include/arrow/ COPY --chown=1000:1000 --from=dlfw /usr/include/cudf /usr/include/cudf/ -COPY --chown=1000:1000 --from=dlfw /usr/include/rmm /usr/include/rmm/ -COPY --chown=1000:1000 --from=dlfw /usr/include/fmt /usr/include/fmt/ # ptx compiler required by cubinlinker COPY --chown=1000:1000 --from=dlfw /usr/local/cuda-12.1/targets/x86_64-linux/lib/libnvptxcompiler_static.a /usr/local/cuda-12.1/targets/x86_64-linux/lib/libnvptxcompiler_static.a @@ -388,7 +378,6 @@ RUN if [ "${HUGECTR_DEV_MODE}" == "false" ]; then \ git clone --branch ${HUGECTR_VER} --depth 1 https://${_CI_JOB_TOKEN}${_HUGECTR_REPO} /hugectr && \ cd /hugectr && \ git submodule update --init --recursive && \ - cd third_party/librdkafka && ./configure && make -j$(nproc) && make install && cd ../.. && \ mkdir build && \ cd build && \ if [[ "${INSTALL_HDFS}" == "false" ]]; then \ From ba943fac0094426e2dd3b16476f83822a9415f6b Mon Sep 17 00:00:00 2001 From: Matthias Langer Date: Tue, 20 Jun 2023 06:02:04 -0700 Subject: [PATCH 03/13] Simplified build process for HugeCTR training image. 
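
The hard-coded `CUDA_SHORT_VERSION` is dropped below in favor of deriving the
compat path from the `CUDA_VERSION` variable that the base image already
exports. A worked example of that derivation, with a hypothetical version
string for illustration:

```shell
# Hypothetical CUDA_VERSION value; the awk expression keeps only major.minor.
CUDA_VERSION=12.1.105
echo "/usr/local/cuda-$(echo $CUDA_VERSION | awk -F'.' '{print $1"."$2}')/compat"
# prints: /usr/local/cuda-12.1/compat
```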
--- docker/dockerfile.ctr | 29 ++++++----------------------- 1 file changed, 6 insertions(+), 23 deletions(-) diff --git a/docker/dockerfile.ctr b/docker/dockerfile.ctr index e054f0ad5..7f3c8c918 100644 --- a/docker/dockerfile.ctr +++ b/docker/dockerfile.ctr @@ -1,6 +1,6 @@ # syntax=docker/dockerfile:1.2 -ARG MERLIN_VERSION=22.12 -ARG TRITON_VERSION=22.11 +ARG MERLIN_VERSION=23.05 +ARG TRITON_VERSION=23.05 ARG BASE_IMAGE=nvcr.io/nvstaging/merlin/merlin-base:${MERLIN_VERSION} @@ -9,16 +9,6 @@ FROM ${BASE_IMAGE} as base ARG HUGECTR_VER=main ARG HUGECTR_BACKEND_VER=main -# Envs -ENV CUDA_SHORT_VERSION=11.6 -ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib:/repos/dist/lib -ENV CUDA_HOME=/usr/local/cuda -ENV CUDA_PATH=$CUDA_HOME -ENV CUDA_CUDA_LIBRARY=${CUDA_HOME}/lib64/stubs -ENV PATH=${CUDA_HOME}/lib64/:${PATH}:${CUDA_HOME}/bin -ENV PATH=$PATH:/usr/lib/x86_64-linux-gnu/ -RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 - RUN pip install --no-cache-dir --upgrade notebook ipython RUN pip install --no-cache-dir mpi4py @@ -29,12 +19,11 @@ RUN cd /opt/hpcx/ompi/include/openmpi/opal/mca/hwloc/hwloc201 && rm -rfv hwloc20 RUN mkdir -p /var/tmp && wget -q -nc --no-check-certificate -P /var/tmp https://download.open-mpi.org/release/hwloc/v2.4/hwloc-${HWLOC_VER}.tar.gz && \ mkdir -p /var/tmp && tar -x -f /var/tmp/hwloc-${HWLOC_VER}.tar.gz -C /var/tmp && \ cd /var/tmp/hwloc-${HWLOC_VER} && \ - ./configure CPPFLAGS="-I/usr/local/cuda/include/ -L/usr/local/cuda/lib64/" LDFLAGS="-L/usr/local/cuda/lib64" --enable-cuda && \ + ./configure CPPFLAGS="-I${CUDA_HOME}/include/ -L${CUDA_HOME}/lib64/" LDFLAGS="-L${CUDA_HOME}/lib64" --enable-cuda && \ make -j$(nproc) && make install && \ rm -rf /var/tmp/hwloc-${HWLOC_VER} /var/tmp/hwloc-${HWLOC_VER}.tar.gz - # ----------------------------------------------------------------------------- # HugeCTR + Dependencies @@ -62,21 +51,18 @@ RUN ln -s /usr/lib/libcudf.so /usr/lib/libcudf_base.so RUN ln -s /usr/lib/libcudf.so /usr/lib/libcudf_io.so RUN ln -s /usr/lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so -RUN rm -rf /usr/lib/x86_64-linux-gnu/libibverbs.so && \ - ln -s /usr/lib/x86_64-linux-gnu/libibverbs.so.1.14.36.0 /usr/lib/x86_64-linux-gnu/libibverbs.so - # Install HugeCTR ARG HUGECTR_HOME=/usr/local/hugectr RUN if [[ "${HUGECTR_DEV_MODE}" == "false" ]]; then \ - rm -rf /usr/local/hugectr/lib/libgmock* /usr/local/hugectr/lib/pkgconfig/gmock* /usr/local/hugectr/include/gmock && \ - rm -rf /usr/local/hugectr/lib/libgtest* /usr/local/hugectr/lib/pkgconfig/gtest* /usr/local/hugectr/include/gtest && \ + rm -rf ${HUGECTR_HOME}/lib/libgmock* ${HUGECTR_HOME}/lib/pkgconfig/gmock* ${HUGECTR_HOME}/include/gmock && \ + rm -rf ${HUGECTR_HOME}/lib/libgtest* ${HUGECTR_HOME}/lib/pkgconfig/gtest* ${HUGECTR_HOME}/include/gtest && \ git clone --branch ${HUGECTR_VER} --depth 1 https://${_CI_JOB_TOKEN}${_HUGECTR_REPO} /hugectr && \ cd /hugectr && \ git submodule update --init --recursive && \ mkdir build && \ cd build && \ LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs/:$LD_LIBRARY_PATH && \ - export PATH=$PATH:/usr/local/cuda-${CUDA_SHORT_VERSION}/compat && \ + export PATH=$PATH:/usr/local/cuda-$(echo $CUDA_VERSION | awk -F'.' '{print $1"."$2}')/compat && \ if [[ "${INSTALL_HDFS}" == "false" ]]; then \ cmake -DCMAKE_BUILD_TYPE=Release -DSM="60;61;70;75;80;90" -DENABLE_MULTINODES=ON .. 
\ ; else \ @@ -119,9 +105,6 @@ RUN if [ "${HUGECTR_DEV_MODE}" == "false" ]; then \ ; fi RUN ln -s ${HUGECTR_HOME}/backends/hugectr /opt/tritonserver/backends/hugectr -# Remove fake lib -RUN rm /usr/local/cuda/lib64/stubs/libcuda.so.1 - # Clean up RUN rm -rf /usr/local/share/jupyter/lab/staging/node_modules/marked RUN rm -rf /usr/local/share/jupyter/lab/staging/node_modules/node-fetch From 1a9bbb4a5736a312310e797066028ca5abd9b878 Mon Sep 17 00:00:00 2001 From: qqiao Date: Mon, 3 Jul 2023 05:17:14 -0700 Subject: [PATCH 04/13] Update tf and torch dockerfile for new upstream image --- docker/dockerfile.tf | 23 ++++++++++++----------- docker/dockerfile.torch | 23 ++++++++++++----------- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/docker/dockerfile.tf b/docker/dockerfile.tf index 988eb66bb..26f6a8ea8 100644 --- a/docker/dockerfile.tf +++ b/docker/dockerfile.tf @@ -1,7 +1,7 @@ # syntax=docker/dockerfile:1.2 -ARG MERLIN_VERSION=22.12 -ARG TRITON_VERSION=22.11 -ARG TENSORFLOW_VERSION=22.11 +ARG MERLIN_VERSION=23.05 +ARG TRITON_VERSION=23.05 +ARG TENSORFLOW_VERSION=23.05 ARG DLFW_IMAGE=nvcr.io/nvidia/tensorflow:${TENSORFLOW_VERSION}-tf2-py3 ARG FULL_IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-py3 @@ -20,15 +20,16 @@ COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/tensorflow backe RUN pip install --no-cache-dir tensorflow protobuf==3.20.3 wrapt==1.14.0 \ && pip uninstall tensorflow keras -y +ENV PYTHON_VERSION=3.10 # DLFW Tensorflow packages -COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python3.8/dist-packages/tensorflow /usr/local/lib/python3.8/dist-packages/tensorflow/ -COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python3.8/dist-packages/tensorflow-*.dist-info /usr/local/lib/python3.8/dist-packages/tensorflow.dist-info/ -COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python3.8/dist-packages/keras /usr/local/lib/python3.8/dist-packages/keras/ -COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python3.8/dist-packages/keras-*.dist-info /usr/local/lib/python3.8/dist-packages/keras.dist-info/ +COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/tensorflow /usr/local/lib/python${PYTHON_VERSION}/dist-packages/tensorflow/ +COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/tensorflow-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/tensorflow.dist-info/ +COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/keras /usr/local/lib/python${PYTHON_VERSION}/dist-packages/keras/ +COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/keras-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/keras.dist-info/ COPY --chown=1000:1000 --from=dlfw /usr/local/bin/saved_model_cli /usr/local/bin/saved_model_cli COPY --chown=1000:1000 --from=dlfw /usr/local/lib/tensorflow/ /usr/local/lib/tensorflow/ -COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python3.8/dist-packages/horovod /usr/local/lib/python3.8/dist-packages/horovod/ -COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python3.8/dist-packages/horovod-*.dist-info /usr/local/lib/python3.8/dist-packages/horovod.dist-info/ +COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/horovod /usr/local/lib/python${PYTHON_VERSION}/dist-packages/horovod/ +COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/horovod-*.dist-info 
/usr/local/lib/python${PYTHON_VERSION}/dist-packages/horovod.dist-info/ COPY --chown=1000:1000 --from=dlfw /usr/local/bin/horovodrun /usr/local/bin/horovodrun # Need to install transformers after tensorflow has been pulled in, so it builds artifacts correctly. @@ -42,7 +43,7 @@ ARG _CI_JOB_TOKEN="" ARG HUGECTR_VER=main ENV CPATH=$CPATH:${HUGECTR_HOME}/include \ - LD_LIBRARY_PATH=${HUGECTR_HOME}/lib:/usr/local/lib/python3.8/dist-packages/tensorflow:$LD_LIBRARY_PATH \ + LD_LIBRARY_PATH=${HUGECTR_HOME}/lib:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/tensorflow:$LD_LIBRARY_PATH \ LIBRARY_PATH=${HUGECTR_HOME}/lib:$LIBRARY_PATH \ SOK_COMPILE_UNIT_TEST=ON @@ -69,7 +70,7 @@ RUN if [ "$HUGECTR_DEV_MODE" == "false" ]; then \ mv /hugectr/ci ~/hugectr-ci && mv /hugectr/sparse_operation_kit ~/hugectr-sparse_operation_kit && \ rm -rf /hugectr && mkdir -p /hugectr && \ mv ~/hugectr-ci /hugectr/ci && mv ~/hugectr-sparse_operation_kit /hugectr/sparse_operation_kit; \ - fi; \ + fi && \ if [ "$INSTALL_DISTRIBUTED_EMBEDDINGS" == "true" ]; then \ git clone --branch ${TFDE_VER} --depth 1 https://github.com/NVIDIA-Merlin/distributed-embeddings.git /distributed_embeddings/ && \ cd /distributed_embeddings && git submodule update --init --recursive && \ diff --git a/docker/dockerfile.torch b/docker/dockerfile.torch index e2f192972..676747233 100644 --- a/docker/dockerfile.torch +++ b/docker/dockerfile.torch @@ -1,7 +1,7 @@ # syntax=docker/dockerfile:1.2 -ARG MERLIN_VERSION=22.12 -ARG TRITON_VERSION=22.11 -ARG TORCH_VERSION=22.11 +ARG MERLIN_VERSION=23.05 +ARG TRITON_VERSION=23.05 +ARG TORCH_VERSION=23.05 ARG DLFW_IMAGE=nvcr.io/nvidia/pytorch:${TORCH_VERSION}-py3 ARG FULL_IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-py3 @@ -19,24 +19,25 @@ RUN apt update -y --fix-missing && \ apt clean && \ rm -rf /var/lib/apt/lists/* +ENV PYTHON_VERSION=3.10 # Torch Metrics and Lightning (without torch) RUN pip install --no-cache-dir --no-deps torch torchmetrics pytorch-lightning lightning-utilities \ && pip install --no-cache-dir --upgrade pip \ && pip install sympy \ - && rm -rf /usr/local/lib/python3.8/dist-packages/torch \ - && rm -rf /usr/local/lib/python3.8/dist-packages/caffe2 + && rm -rf /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch \ + && rm -rf /usr/local/lib/python${PYTHON_VERSION}/dist-packages/caffe2 # Triton Torch backend COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/pytorch backends/pytorch # DLFW Python packages -COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python3.8/dist-packages/numba /usr/local/lib/python3.8/dist-packages/numba -COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python3.8/dist-packages/numpy /usr/local/lib/python3.8/dist-packages/numpy -COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python3.8/dist-packages/torch /usr/local/lib/python3.8/dist-packages/torch +COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/numba /usr/local/lib/python${PYTHON_VERSION}/dist-packages/numba +COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/numpy /usr/local/lib/python${PYTHON_VERSION}/dist-packages/numpy +COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch -COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python3.8/dist-packages/numba-*.dist-info /usr/local/lib/python3.8/dist-packages/numba.dist-info/ -COPY --chown=1000:1000 --from=dlfw 
/usr/local/lib/python3.8/dist-packages/numpy-*.dist-info /usr/local/lib/python3.8/dist-packages/numpy.dist-info/ -COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python3.8/dist-packages/torch-*.egg-info /usr/local/lib/python3.8/dist-packages/torch.egg-info/ +COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/numba-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/numba.dist-info/ +COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/numpy-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/numpy.dist-info/ +COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch-*.egg-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch.egg-info/ # Add all torch libraries to /usr/local From 0f9f78fa73ed4e044ca37244cf48af2af04e9741 Mon Sep 17 00:00:00 2001 From: qqiao Date: Mon, 3 Jul 2023 07:42:32 -0700 Subject: [PATCH 05/13] Remove env PYTHON_VERSION since it's already in base. --- docker/dockerfile.tf | 1 - docker/dockerfile.torch | 1 - 2 files changed, 2 deletions(-) diff --git a/docker/dockerfile.tf b/docker/dockerfile.tf index 26f6a8ea8..4c7a2dc8a 100644 --- a/docker/dockerfile.tf +++ b/docker/dockerfile.tf @@ -20,7 +20,6 @@ COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/tensorflow backe RUN pip install --no-cache-dir tensorflow protobuf==3.20.3 wrapt==1.14.0 \ && pip uninstall tensorflow keras -y -ENV PYTHON_VERSION=3.10 # DLFW Tensorflow packages COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/tensorflow /usr/local/lib/python${PYTHON_VERSION}/dist-packages/tensorflow/ COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/tensorflow-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/tensorflow.dist-info/ diff --git a/docker/dockerfile.torch b/docker/dockerfile.torch index 676747233..d9fa27e3e 100644 --- a/docker/dockerfile.torch +++ b/docker/dockerfile.torch @@ -19,7 +19,6 @@ RUN apt update -y --fix-missing && \ apt clean && \ rm -rf /var/lib/apt/lists/* -ENV PYTHON_VERSION=3.10 # Torch Metrics and Lightning (without torch) RUN pip install --no-cache-dir --no-deps torch torchmetrics pytorch-lightning lightning-utilities \ && pip install --no-cache-dir --upgrade pip \ From d94148346d34126af9b63e15b705c2f8ab4649e5 Mon Sep 17 00:00:00 2001 From: Matthias Langer Date: Tue, 4 Jul 2023 03:01:48 -0700 Subject: [PATCH 06/13] Tick base image up to 23.06, fix `tritonclient` dependency. 
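
The `tritonclient[all]==2.29.0` pin is dropped below; the old pin only existed
to keep the `perf_*` binaries around. A purely illustrative sanity check (not
part of the build) for the unpinned client in the resulting image:

```shell
# Illustrative check only: the unpinned client should install and import
# cleanly on the 23.06 base.
pip install --no-cache-dir "tritonclient[all]" && \
    python -c "import tritonclient.grpc, tritonclient.http; print('tritonclient OK')"
```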
--- docker/dockerfile.merlin | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/docker/dockerfile.merlin b/docker/dockerfile.merlin index ccf1075d6..07a50b7e5 100644 --- a/docker/dockerfile.merlin +++ b/docker/dockerfile.merlin @@ -1,6 +1,6 @@ # syntax=docker/dockerfile:1.2 -ARG TRITON_VERSION=23.05 -ARG DLFW_VERSION=23.05 +ARG TRITON_VERSION=23.06 +ARG DLFW_VERSION=23.06 ARG FULL_IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-py3 ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-py3-min @@ -88,14 +88,12 @@ RUN ln -s /usr/bin/python3 /usr/bin/python # A fix has already been merged but not yet released: # https://gitlab.kitware.com/cmake/cmake/-/merge_requests/7859 # 2023-02-22: pynvml==11.5.0 is currently incompatible with our version of dask/distributed -# tritonclient[all]==2.29.0: latest tritonclient removes the perf_* binaries, so specified to version 2.29.0 -#cupy-cuda12x RUN pip install --no-cache-dir --upgrade pip; pip install --no-cache-dir "cmake<3.25.0" ninja scikit-build pandas==1.5.2 \ fastrlock nvidia-pyindex pybind11 pytest \ transformers tensorflow-metadata betterproto \ cachetools graphviz nvtx scipy "scikit-learn<1.2" \ - tritonclient[all]==2.29.0 grpcio-channelz fiddle wandb npy-append-array \ + tritonclient[all] grpcio-channelz fiddle wandb npy-append-array \ git+https://github.com/rapidsai/asvdb.git@main \ xgboost==1.6.2 lightgbm treelite==2.4.0 treelite_runtime==2.4.0 \ lightfm implicit \ From 3f178f5eba4e13e6e463e59feb7328afaee3ef09 Mon Sep 17 00:00:00 2001 From: qqiao Date: Tue, 4 Jul 2023 05:30:57 -0700 Subject: [PATCH 07/13] Update base to 23.06 --- docker/dockerfile.tf | 6 +++--- docker/dockerfile.torch | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docker/dockerfile.tf b/docker/dockerfile.tf index 4c7a2dc8a..0dafdff29 100644 --- a/docker/dockerfile.tf +++ b/docker/dockerfile.tf @@ -1,7 +1,7 @@ # syntax=docker/dockerfile:1.2 -ARG MERLIN_VERSION=23.05 -ARG TRITON_VERSION=23.05 -ARG TENSORFLOW_VERSION=23.05 +ARG MERLIN_VERSION=23.06 +ARG TRITON_VERSION=23.06 +ARG TENSORFLOW_VERSION=23.06 ARG DLFW_IMAGE=nvcr.io/nvidia/tensorflow:${TENSORFLOW_VERSION}-tf2-py3 ARG FULL_IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-py3 diff --git a/docker/dockerfile.torch b/docker/dockerfile.torch index d9fa27e3e..bbc87040f 100644 --- a/docker/dockerfile.torch +++ b/docker/dockerfile.torch @@ -1,7 +1,7 @@ # syntax=docker/dockerfile:1.2 -ARG MERLIN_VERSION=23.05 -ARG TRITON_VERSION=23.05 -ARG TORCH_VERSION=23.05 +ARG MERLIN_VERSION=23.06 +ARG TRITON_VERSION=23.06 +ARG TORCH_VERSION=23.06 ARG DLFW_IMAGE=nvcr.io/nvidia/pytorch:${TORCH_VERSION}-py3 ARG FULL_IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-py3 From 3ca7659df7cceebf185075e84875af45885cee30 Mon Sep 17 00:00:00 2001 From: Matthias Langer Date: Tue, 4 Jul 2023 05:41:19 -0700 Subject: [PATCH 08/13] Tick up base image version. 
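
With this bump the training image builds against the matching 23.06 base,
e.g. (the image tag is illustrative; the build args are the ones declared in
the Dockerfile):

```shell
docker build -f docker/dockerfile.ctr \
    --build-arg MERLIN_VERSION=23.06 \
    --build-arg TRITON_VERSION=23.06 \
    -t merlin-hugectr:23.06 .
```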
--- docker/dockerfile.ctr | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docker/dockerfile.ctr b/docker/dockerfile.ctr
index 7f3c8c918..f9e767438 100644
--- a/docker/dockerfile.ctr
+++ b/docker/dockerfile.ctr
@@ -1,6 +1,6 @@
 # syntax=docker/dockerfile:1.2
-ARG MERLIN_VERSION=23.05
-ARG TRITON_VERSION=23.05
+ARG MERLIN_VERSION=23.06
+ARG TRITON_VERSION=23.06

 ARG BASE_IMAGE=nvcr.io/nvstaging/merlin/merlin-base:${MERLIN_VERSION}

From d07f67422b5acf9adb6fd00ba01dc2b828385bd4 Mon Sep 17 00:00:00 2001
From: Matthias Langer
Date: Wed, 5 Jul 2023 08:39:19 +0000
Subject: [PATCH 09/13] Merge branch 'main' into fix-update_base_23.05

---
 CHANGELOG.md | 134 ++
 README.md | 2 +-
 ci/dockerfile.ci | 20 +-
 docs/data.json | 120 ++
 .../scripts/preproc/preprocessing.py | 45 +-
 examples/ranking/README.md | 47 +
 ...g-DLRM-model-with-Models-and-Systems.ipynb | 1775 +++++++++++++++++
 ...n-Implicit-Model-With-Merlin-Systems.ipynb | 488 +++++
 ...An-XGboost-Model-With-Merlin-Systems.ipynb | 545 +++++
 .../examples/quick_start/test_preproc.py | 292 +++
 .../examples/quick_start/test_ranking.py | 502 +++++
 ...i_building_deploying_multi_stage_RecSys.py | 1 -
 ...g_an_implicit_model_with_merlin_systems.py | 59 +
 ...ng_an_xgboost_model_with_merlin_systems.py | 50 +
 ...ving_ranking_models_with_merlin_systems.py | 47 +
 15 files changed, 4101 insertions(+), 26 deletions(-)
 create mode 100644 examples/ranking/README.md
 create mode 100644 examples/ranking/tf/Training-and-Deploying-DLRM-model-with-Models-and-Systems.ipynb
 create mode 100644 examples/traditional-ml/Serving-An-Implicit-Model-With-Merlin-Systems.ipynb
 create mode 100644 examples/traditional-ml/Serving-An-XGboost-Model-With-Merlin-Systems.ipynb
 create mode 100644 tests/integration/examples/quick_start/test_preproc.py
 create mode 100644 tests/integration/examples/quick_start/test_ranking.py
 create mode 100644 tests/integration/examples/test_serving_an_implicit_model_with_merlin_systems.py
 create mode 100644 tests/integration/examples/test_serving_an_xgboost_model_with_merlin_systems.py
 create mode 100644 tests/integration/examples/test_serving_ranking_models_with_merlin_systems.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7ab073bdb..ec7b35b5e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,139 @@
 # Merlin Changelog
+
+## [23.06]
+
+### NVTabular
+
+#### Major Changes
+
+* Moved some functionality from NVTabular to `merlin-core`, but left aliases in place for import backwards compatibility. Some examples are `LambdaOp`, `AddMetadataOp`, `StatOperator`, `WorkflowNode`, and others. [#1823](https://github.com/NVIDIA-Merlin/NVTabular/pull/1823), [#1825](https://github.com/NVIDIA-Merlin/NVTabular/pull/1825)
+* Updated `Categorify` to correctly handle nulls [#1836](https://github.com/NVIDIA-Merlin/NVTabular/pull/1836).
+
+#### Added
+
+* Added support for retrieving subworkflows using the get_subworkflow API, which returns a subgraph wrapped in a new workflow object. [#1842](https://github.com/NVIDIA-Merlin/NVTabular/pull/1842)
+
+#### Deprecated/Removed
+
+* Removed the `nvtabular.inference` module. This functionality now exists in `merlin-systems` [#1822](https://github.com/NVIDIA-Merlin/NVTabular/pull/1822)
+
+#### Fixed Bugs
+
+### Models
+
+#### Added
+
+* Add support for transformer-based retrieval models [#1128](https://github.com/NVIDIA-Merlin/models/pull/1128)
+
+### Merlin
+
+#### Added
+
+* Improvements in the Quick-start for ranking example [#1014](https://github.com/NVIDIA-Merlin/Merlin/pull/1014)
+  * In `preprocessing.py`, added support for target encoding features, configurable through these new CLI arguments: `--target_encoding_features`, `--target_encoding_targets`, `--target_encoding_kfold`, `--target_encoding_smoothing`.
+  * In `ranking.py`: added support to select columns to keep (`--keep_columns`) or remove (`--ignore_columns`) at dataloading / training / evaluation.
+
+#### Fixed Bugs
+
+* Fixes in the Quick-start for ranking example [#1017](https://github.com/NVIDIA-Merlin/Merlin/pull/1017):
+  * Fixed `preprocessing.py`, which was not standardizing and tagging continuous columns properly
+  * Fixed the Wide&Deep and DeepFM models to use the updated API
+
+### Transformers4Rec
+
+#### Added
+
+* Improved docstring coverage [#706](https://github.com/NVIDIA-Merlin/Transformers4Rec/pull/706)
+
+#### Fixed Bugs
+
+* Add support for providing a scalar cut-off in metrics, and fix recall@1, which sometimes scored higher than the upper cut-offs. [#720](https://github.com/NVIDIA-Merlin/Transformers4Rec/pull/720)
+* Fix the CLM performance mismatch between model evaluation and manual inference [#723](https://github.com/NVIDIA-Merlin/Transformers4Rec/pull/723)
+* Fixed OOM issues when evaluating/predicting [#721](https://github.com/NVIDIA-Merlin/Transformers4Rec/pull/721)
+  * API breaking notice: This fix changes the default output of the `trainer.predict()` API, which returns a `PredictionOutput` object with a predictions property. Before this change, when the `predict_top_k` option was not set (the default), the predictions property was a 2D tensor (batch size, item cardinality) with the scores for all the items. Now that we set `T4RecTrainingArguments.predict_top_k` by default, the predictions property returns a tuple with `(top-100 predicted item ids, top-100 prediction scores)`.
+
+### Core
+
+#### Major Changes
+
+* Merged the NVTabular Operator base class with the Base Operator in core.
+
+#### Added
+
+* Migrated some operators from NVTabular to core, allowing their use in `merlin-systems` (e.g., `LambdaOp`, changed to a user-defined function (UDF), and the add-metadata operator).
+* Created a subgraph operator to allow for the recall and use of parts of a graph
+
+### Systems
+
+#### Added
+
+* Added test cases to demonstrate functional support for core operators in Systems ensembles
+* Added an API to retrieve sub-ensembles.
+
 ## [23.05]

 ### NVTabular

diff --git a/README.md b/README.md
index 539d89926..93957a42f 100644
--- a/README.md
+++ b/README.md
@@ -143,7 +143,7 @@ real-world use cases.

 ## Merlin Is Built On

-**[cuDF](https://github.com/rapidsai/cudf)**
Merlin relies on cuDF for +**[RAPIDS cuDF](https://github.com/rapidsai/cudf)**
Merlin relies on cuDF for GPU-accelerated DataFrame operations used in feature engineering. **[Dask](https://www.dask.org/)**
Merlin relies on Dask to distribute and scale diff --git a/ci/dockerfile.ci b/ci/dockerfile.ci index fcc51b32b..7d9865d40 100644 --- a/ci/dockerfile.ci +++ b/ci/dockerfile.ci @@ -162,11 +162,11 @@ ENV PATH=${CUDA_HOME}/lib64/:${PATH}:${CUDA_HOME}/bin # Set up NVIDIA package repository RUN apt update -y --fix-missing && \ - apt install -y --no-install-recommends software-properties-common && \ - wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin && \ - mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \ - add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" && \ + apt install -y --no-install-recommends software-properties-common +RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin && \ + mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \ + add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /" && \ apt install -y --no-install-recommends \ ca-certificates \ clang-format \ @@ -266,9 +266,9 @@ COPY --chown=1000:1000 --from=dlfw /usr/include/arrow /usr/include/arrow/ COPY --chown=1000:1000 --from=dlfw /usr/include/cudf /usr/include/cudf/ COPY --chown=1000:1000 --from=dlfw /usr/include/rmm /usr/include/rmm/ # ptx compiler required by cubinlinker -# COPY --chown=1000:1000 --from=dlfw /usr/local/cuda-12.1/targets/x86_64-linux/lib/libnvptxcompiler_static.a /usr/local/cuda-12.1/targets/x86_64-linux/lib/libnvptxcompiler_static.a -# COPY --chown=1000:1000 --from=dlfw /usr/local/cuda-12.1/targets/x86_64-linux/include/nvPTXCompiler.h /usr/local/cuda-12.1/targets/x86_64-linux/include/nvPTXCompiler.h -# RUN git clone https://github.com/rapidsai/ptxcompiler.git /ptx && cd /ptx/ && python setup.py develop; +COPY --chown=1000:1000 --from=dlfw /usr/local/cuda-12.1/targets/x86_64-linux/lib/libnvptxcompiler_static.a /usr/local/cuda-12.1/targets/x86_64-linux/lib/libnvptxcompiler_static.a +COPY --chown=1000:1000 --from=dlfw /usr/local/cuda-12.1/targets/x86_64-linux/include/nvPTXCompiler.h /usr/local/cuda-12.1/targets/x86_64-linux/include/nvPTXCompiler.h +RUN git clone https://github.com/rapidsai/ptxcompiler.git /ptx && cd /ptx/ && pip install .; ARG PYTHON_VERSION=3.10 # Python Packages @@ -285,7 +285,7 @@ COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-p COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cupyx /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cupyx COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cupy_backends /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cupy_backends COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/numba /usr/local/lib/python${PYTHON_VERSION}/dist-packages/numba -# COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cubinlinker /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cubinlinker +COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cubinlinker /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cubinlinker COPY --chown=1000:1000 --from=dlfw 
/usr/local/lib/python${PYTHON_VERSION}/dist-packages/cudf-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cudf.dist-info/ @@ -295,7 +295,7 @@ COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-p COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/rmm-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/rmm.dist-info/ COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cupy_*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cupy.dist-info/ COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/numba-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/numba.dist-info/ -# COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cubinlinker-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cubinlinker.dist-info/ +COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cubinlinker-*.dist-info /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cubinlinker.dist-info/ RUN pip install --no-cache-dir tensorflow && pip uninstall tensorflow keras -y diff --git a/docs/data.json b/docs/data.json index a5033b174..9e355dc04 100644 --- a/docs/data.json +++ b/docs/data.json @@ -359,6 +359,46 @@ "timestamp_utc": "2023-06-07T20:14:32.616950", "transformers4rec": "23.5.0", "triton": "2.31.0" + }, + "23.06": { + "base_container": "Triton version 23.04", + "compressedSize": "6.95 GB", + "cublas": "12.1.3.1", + "cuda": "12.1.0.023", + "cudf": "23.02.00", + "cudnn": "8.9.0.131", + "cufft": "11.0.2.4", + "curand": "10.3.2.56", + "cusolver": "11.4.4.55", + "cusparse": "12.0.2.55", + "cutensor": "1.7.0.1", + "dgx_system": "* DGX-1\n* DGX-2\n* DGX A100\n* DGX Station", + "distributed_embeddings": "Not applicable", + "gpu_model": "* `NVIDIA Ampere GPU Architecture `_\n* `Turing `_\n* `Volta `_\n* `Pascal `_", + "hugectr": "23.6.0", + "hugectr2onnx": "Not applicable", + "merlin.core": "23.6.0", + "merlin.dataloader": "23.6.0", + "merlin.models": "23.6.0", + "merlin.systems": "23.6.0", + "nvidia_driver": "NVIDIA Driver version 465.19.01\nor later is required. 
However,\nif you're running on Data Center\nGPUs (formerly Tesla) such as T4,\nyou can use any of the following\nNVIDIA Driver versions:\n\n* 418.40 (or later R418)\n* 440.33 (or later R440)\n* 450.51 (or later R450)\n* 460.27 (or later R460)\n\n**Note**: The CUDA Driver\nCompatibility Package does not\nsupport all drivers.",
+ "nvidia_pytorch": "Not applicable",
+ "nvidia_tensorflow": "Not applicable",
+ "nvtabular": "23.6.0",
+ "openmpi": "4.1.4",
+ "os": "Ubuntu 20.04.5 LTS",
+ "python_major": "3",
+ "pytorch": "Not applicable",
+ "release": "23.06",
+ "rmm": "23.02.00",
+ "size": "781.9 GB",
+ "sm": "60, 61, 70, 75, 80, 90",
+ "sparse_operation_kit": "Not applicable",
+ "tensorrt": "8.6.1.2+cuda12.0.1.011",
+ "tf": "Not applicable",
+ "timestamp_utc": "2023-06-29T07:26:25.583573",
+ "transformers4rec": "23.6.0",
+ "triton": "2.33.0"
 }
 },
"nvcr.io/nvidia/merlin/merlin-inference": { @@ -1035,6 +1075,46 @@ "timestamp_utc": "2023-06-07T20:13:59.481254", "transformers4rec": "23.5.0", "triton": "2.31.0" + }, + "23.06": { + "base_container": "Triton version 23.04", + "compressedSize": "8.39 GB", + "cublas": "12.1.3.1", + "cuda": "12.1.0.023", + "cudf": "23.02.00", + "cudnn": "8.9.0.131", + "cufft": "11.0.2.4", + "curand": "10.3.2.56", + "cusolver": "11.4.4.55", + "cusparse": "12.0.2.55", + "cutensor": "1.7.0.1", + "dgx_system": "* DGX-1\n* DGX-2\n* DGX A100\n* DGX Station", + "distributed_embeddings": "Not applicable", + "gpu_model": "* `NVIDIA Ampere GPU Architecture `_\n* `Turing `_\n* `Volta `_\n* `Pascal `_", + "hugectr": "Not applicable", + "hugectr2onnx": "Not applicable", + "merlin.core": "23.6.0", + "merlin.dataloader": "23.6.0", + "merlin.models": "23.6.0", + "merlin.systems": "23.6.0", + "nvidia_driver": "NVIDIA Driver version 465.19.01\nor later is required. However,\nif you're running on Data Center\nGPUs (formerly Tesla) such as T4,\nyou can use any of the following\nNVIDIA Driver versions:\n\n* 418.40 (or later R418)\n* 440.33 (or later R440)\n* 450.51 (or later R450)\n* 460.27 (or later R460)\n\n**Note**: The CUDA Driver\nCompatibility Package does not\nsupport all drivers.", + "nvidia_pytorch": "Not applicable", + "nvidia_tensorflow": "Not applicable", + "nvtabular": "23.6.0", + "openmpi": "4.1.4", + "os": "Ubuntu 20.04.5 LTS", + "python_major": "3", + "pytorch": "2.0.1", + "release": "23.06", + "rmm": "23.02.00", + "size": "784.75 GB", + "sm": "Not applicable", + "sparse_operation_kit": "Not applicable", + "tensorrt": "8.6.1.2+cuda12.0.1.011", + "tf": "Not applicable", + "timestamp_utc": "2023-06-29T07:25:52.907749", + "transformers4rec": "23.6.0", + "triton": "2.33.0" } }, "nvcr.io/nvidia/merlin/merlin-pytorch-inference": { @@ -1830,6 +1910,46 @@ "timestamp_utc": "2023-06-07T20:13:21.204966", "transformers4rec": "23.5.0", "triton": "2.31.0" + }, + "23.06": { + "base_container": "Triton version 23.04", + "compressedSize": "8.2 GB", + "cublas": "12.1.3.1", + "cuda": "12.1.0.023", + "cudf": "23.02.00", + "cudnn": "8.9.0.131", + "cufft": "11.0.2.4", + "curand": "10.3.2.56", + "cusolver": "11.4.4.55", + "cusparse": "12.0.2.55", + "cutensor": "1.7.0.1", + "dgx_system": "* DGX-1\n* DGX-2\n* DGX A100\n* DGX Station", + "distributed_embeddings": "Not applicable", + "gpu_model": "* `NVIDIA Ampere GPU Architecture `_\n* `Turing `_\n* `Volta `_\n* `Pascal `_", + "hugectr": "Not applicable", + "hugectr2onnx": "Not applicable", + "merlin.core": "23.6.0", + "merlin.dataloader": "23.6.0", + "merlin.models": "23.6.0", + "merlin.systems": "23.6.0", + "nvidia_driver": "NVIDIA Driver version 465.19.01\nor later is required. 
However,\nif you're running on Data Center\nGPUs (formerly Tesla) such as T4,\nyou can use any of the following\nNVIDIA Driver versions:\n\n* 418.40 (or later R418)\n* 440.33 (or later R440)\n* 450.51 (or later R450)\n* 460.27 (or later R460)\n\n**Note**: The CUDA Driver\nCompatibility Package does not\nsupport all drivers.",
+ "nvidia_pytorch": "Not applicable",
+ "nvidia_tensorflow": "Not applicable",
+ "nvtabular": "23.6.0",
+ "openmpi": "4.1.4",
+ "os": "Ubuntu 20.04.5 LTS",
+ "python_major": "3",
+ "pytorch": "Not applicable",
+ "release": "23.06",
+ "rmm": "23.02.00",
+ "size": "785.21 GB",
+ "sm": "Not applicable",
+ "sparse_operation_kit": "1.2.0",
+ "tensorrt": "8.6.1.2+cuda12.0.1.011",
+ "tf": "2.12.0",
+ "timestamp_utc": "2023-06-29T07:25:15.869683",
+ "transformers4rec": "23.6.0",
+ "triton": "2.33.0"
 }
 },
 "nvcr.io/nvidia/merlin/merlin-tensorflow-inference": {

diff --git a/examples/quick_start/scripts/preproc/preprocessing.py b/examples/quick_start/scripts/preproc/preprocessing.py
index b3c62ef3e..43843600f 100644
--- a/examples/quick_start/scripts/preproc/preprocessing.py
+++ b/examples/quick_start/scripts/preproc/preprocessing.py
@@ -1,6 +1,7 @@
 import gc
 import logging
 import os
+import shutil
 from functools import reduce
 from typing import Optional
@@ -12,6 +13,7 @@
 from .args_parsing import parse_arguments

 INDEX_TMP_COL = "__index"
+NVT_OUTPUT_FOLDER = "nvt_outputs"


 def filter_by_freq(df_to_filter, df_for_stats, column, min_freq=None, max_freq=None):
@@ -221,7 +223,8 @@ def generate_nvt_features(self):
         for col in args.categorical_features:
             feats[col] = [col] >> nvt_ops.Categorify(
-                freq_threshold=args.categ_min_freq_capping
+                freq_threshold=args.categ_min_freq_capping,
+                out_path=NVT_OUTPUT_FOLDER,
             )
         for col in args.continuous_features:
             feats[col] = [col]
@@ -244,14 +247,13 @@ def generate_nvt_features(self):
         if args.target_encoding_targets and args.target_encoding_features:
             for target_col in args.target_encoding_targets:
-                feats[f"{target_col}_te_features"] = (
-                    args.target_encoding_features
-                    >> nvt.ops.TargetEncoding(
-                        [target_col],
-                        kfold=args.target_encoding_kfold,
-                        p_smooth=args.target_encoding_smoothing,
-                        out_dtype="float32",
-                    )
+                feats[
+                    f"{target_col}_te_features"
+                ] = args.target_encoding_features >> nvt.ops.TargetEncoding(
+                    [target_col],
+                    kfold=args.target_encoding_kfold,
+                    p_smooth=args.target_encoding_smoothing,
+                    out_dtype="float32",
                 )

         for col in args.user_features:
@@ -322,7 +324,9 @@ def merge_dataset_features_values(
         ).excluding_by_name([INDEX_TMP_COL])

         dataset_joint = nvt.Dataset(
-            dataset_joint, schema=schema_joint, cpu=not self.gpu,
+            dataset_joint,
+            schema=schema_joint,
+            cpu=not self.gpu,
         )
         return dataset_joint
@@ -430,6 +434,16 @@ def run(self):
         output_dataset_path = args.output_path

+        nvt_outputs_folder = os.path.join(output_dataset_path, NVT_OUTPUT_FOLDER)
+
+        if os.path.exists(nvt_outputs_folder):
+            logging.info(
+                "The NVTabular output folder already exists and is "
+                f"being deleted: {nvt_outputs_folder}"
+            )
+            # Delete the stale folder so previous outputs are not reused
+            shutil.rmtree(nvt_outputs_folder)
+
         train_dataset = nvt.Dataset(ddf, cpu=not self.gpu)
         # Processing features and targets in separate workflows, because
         # targets might not be available for test/predict_dataset
@@ -442,7 +456,8 @@
             train_dataset_features, train_dataset_targets, "train", args
         )
         train_dataset_preproc.to_parquet(
-            output_train_dataset_path, output_files=args.output_num_partitions,
+            output_train_dataset_path,
+            output_files=args.output_num_partitions,
         )

         if args.eval_data_path or args.dataset_split_strategy:
@@ -459,7 +474,8 @@
             eval_dataset_features, eval_dataset_targets, "eval", args
         )
         eval_dataset_preproc.to_parquet(
-            output_eval_dataset_path, output_files=args.output_num_partitions,
+            output_eval_dataset_path,
+            output_files=args.output_num_partitions,
         )

         if args.predict_data_path:
@@ -484,9 +500,10 @@
         logging.info(f"Saving predict/test set: {output_predict_dataset_path}")
         new_predict_dataset.to_parquet(
-            output_predict_dataset_path, output_files=args.output_num_partitions,
+            output_predict_dataset_path,
+            output_files=args.output_num_partitions,
         )

-        nvt_save_path = os.path.join(output_dataset_path, "workflow")
+        nvt_save_path = os.path.join(nvt_outputs_folder, "workflow")
         logging.info(f"Saving nvtabular workflow to: {nvt_save_path}")
         nvt_workflow_features.save(nvt_save_path)

diff --git a/examples/ranking/README.md b/examples/ranking/README.md
new file mode 100644
index 000000000..ac1e58f26
--- /dev/null
+++ b/examples/ranking/README.md
@@ -0,0 +1,47 @@
# Training and Deploying Ranking Models with Merlin

Ranking models are probably the most common use case in recommender systems. The examples under this folder demonstrate how to build, train and evaluate a ranking model (e.g. DLRM) using Merlin Models and deploy it on [Triton Inference Server](https://github.com/triton-inference-server/server) with Merlin Systems. Currently we support models built with the TensorFlow framework, traditional ML models like XGBoost, and Python-based models built with the `implicit` library. Examples built with the PyTorch framework are being developed and will be added here soon.

To learn more about ranking models, please visit this documentation [page](https://nvidia-merlin.github.io/Merlin/stable/guide/recommender_models.html#).

## Running the Example Notebooks

Docker containers are available from the NVIDIA GPU Cloud.
We use the latest stable version of the [merlin-tensorflow](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/merlin/containers/merlin-tensorflow/tags) container to run the example notebooks. To run the example notebooks using Docker containers, perform the following steps:

1. Pull and start the container by running the following command:

   ```shell
   docker run --gpus all --rm -it \
     -p 8888:8888 -p 8797:8787 -p 8796:8786 --ipc=host \
     nvcr.io/nvidia/merlin/merlin-tensorflow:23.XX /bin/bash
   ```

   > You can find the release tags and more information on the [merlin-tensorflow](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/merlin/containers/merlin-tensorflow) container page.

   The container opens a shell when the run command execution is completed.
   Your shell prompt should look similar to the following example:

   ```shell
   root@2efa5b50b909:
   ```

2. Start the JupyterLab server by running the following command:

   ```shell
   jupyter-lab --allow-root --ip='0.0.0.0'
   ```

   View the messages in your terminal to identify the URL for JupyterLab.
   The messages in your terminal show similar lines to the following example:

   ```shell
   Or copy and paste one of these URLs:
   http://2efa5b50b909:8888/lab?token=9b537d1fda9e4e9cadc673ba2a472e247deee69a6229ff8d
   or http://127.0.0.1:8888/lab?token=9b537d1fda9e4e9cadc673ba2a472e247deee69a6229ff8d
   ```

3. Open a browser and use the `127.0.0.1` URL provided in the messages by JupyterLab.

4. After you log in to JupyterLab, navigate to the `/Merlin/examples/ranking` directory to try out the example notebooks.
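
As an optional sanity check after step 1, you can confirm from inside the container that the GPU and the Merlin stack are visible (the import list assumes the standard contents of the `merlin-tensorflow` image):

```shell
# Both commands should succeed inside a correctly started container.
nvidia-smi
python -c "import nvtabular, merlin.models; print('Merlin stack OK')"
```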
diff --git a/examples/ranking/tf/Training-and-Deploying-DLRM-model-with-Models-and-Systems.ipynb b/examples/ranking/tf/Training-and-Deploying-DLRM-model-with-Models-and-Systems.ipynb new file mode 100644 index 000000000..0f213f3dc --- /dev/null +++ b/examples/ranking/tf/Training-and-Deploying-DLRM-model-with-Models-and-Systems.ipynb @@ -0,0 +1,1775 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "bc80cfdd", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# Copyright 2021 NVIDIA Corporation. All Rights Reserved.\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# http://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License.\n", + "# ================================\n", + "\n", + "# Each user is responsible for checking the content of datasets and the\n", + "# applicable licenses and determining if suitable for the intended use." + ] + }, + { + "cell_type": "markdown", + "id": "51acf955", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "\n", + "\n", + "# Exporting Ranking Models\n", + "\n", + "This notebook is created using the latest stable [merlin-tensorflow](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/merlin/containers/merlin-tensorflow/tags) container. \n", + "\n", + "In this example notebook we demonstrate how to export (save) NVTabular `workflow` and a `ranking model` for model deployment with [Merlin Systems](https://github.com/NVIDIA-Merlin/systems) library. \n", + "\n", + "Learning Objectives:\n", + "\n", + "- Export NVTabular workflow for model deployment\n", + "- Export TensorFlow DLRM model for model deployment\n", + "- Load saved NVTabular Workflow\n", + "- Load saved trained Merlin Models model\n", + "- Create Ensemble Graph\n", + "- Export Ensemble Graph\n", + "- Deploy model on Triton Inference Server\n", + "\n", + "We will follow the steps below:\n", + "- Prepare the data with NVTabular and export NVTabular workflow\n", + "- Train a DLRM model with Merlin Models and export the trained model\n", + "- Launch Triton server and deploy trained models on Triton\n", + "- Send request to Triton and receive back the response" + ] + }, + { + "cell_type": "markdown", + "id": "93e4fec3", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Importing Libraries" + ] + }, + { + "cell_type": "markdown", + "id": "eab14a7d", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Let's start with importing the libraries that we'll use in this notebook." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "37d5020c", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-06-28 21:03:00.600621: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "/usr/local/lib/python3.8/dist-packages/merlin/dtypes/mappings/torch.py:43: UserWarning: PyTorch dtype mappings did not load successfully due to an error: No module named 'torch'\n", + " warn(f\"PyTorch dtype mappings did not load successfully due to an error: {exc.msg}\")\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.data_structures has been moved to tensorflow.python.trackable.data_structures. The old module will be deleted in version 2.11.\n", + "[INFO]: sparse_operation_kit is imported\n", + "WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.base has been moved to tensorflow.python.trackable.base. The old module will be deleted in version 2.11.\n", + "[SOK INFO] Import /usr/local/lib/python3.8/dist-packages/merlin_sok-1.2.0-py3.8-linux-x86_64.egg/sparse_operation_kit/lib/libsok_experiment.so\n", + "[SOK INFO] Import /usr/local/lib/python3.8/dist-packages/merlin_sok-1.2.0-py3.8-linux-x86_64.egg/sparse_operation_kit/lib/libsok_experiment.so\n", + "[SOK INFO] Initialize finished, communication tool: horovod\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-06-28 21:03:07.070258: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:47] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.\n", + "2023-06-28 21:03:07.070303: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:226] Using CUDA malloc Async allocator for GPU: 0\n", + "2023-06-28 21:03:07.070448: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1638] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 16249 MB memory: -> device: 0, name: Quadro GV100, pci bus id: 0000:2d:00.0, compute capability: 7.0\n", + "/usr/local/lib/python3.8/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "import os\n", + "os.environ[\"TF_GPU_ALLOCATOR\"]=\"cuda_malloc_async\"\n", + "\n", + "import nvtabular as nvt\n", + "from nvtabular.ops import *\n", + "import numpy as np\n", + "\n", + "from merlin.models.utils.example_utils import workflow_fit_transform\n", + "from merlin.schema.tags import Tags\n", + "\n", + "import merlin.models.tf as mm\n", + "from merlin.io.dataset import Dataset\n", + "import tensorflow as tf" + ] + }, + { + "cell_type": "markdown", + "id": "cbb650a7", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Feature Engineering with NVTabular" + ] + }, + { + "cell_type": "markdown", + "id": "0c715cd5", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "We use the synthetic train and test datasets generated by mimicking the real [Ali-CCP: Alibaba Click and Conversion Prediction](https://tianchi.aliyun.com/dataset/dataDetail?dataId=408#1) dataset to build our recommender system ranking models. \n", + "\n", + "If you would like to use real Ali-CCP dataset instead, you can download the training and test datasets on [tianchi.aliyun.com](https://tianchi.aliyun.com/dataset/dataDetail?dataId=408#1). You can then use [get_aliccp()](https://github.com/NVIDIA-Merlin/models/blob/stable/merlin/datasets/ecommerce/aliccp/dataset.py#L43) function to curate the raw csv files and save them as parquet files." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "44c7457b-08c4-4453-bacc-5c8eef7042d8", + "metadata": {}, + "outputs": [], + "source": [ + "from merlin.datasets.synthetic import generate_data\n", + "\n", + "DATA_FOLDER = os.environ.get(\"DATA_FOLDER\", \"/workspace/data/\")\n", + "NUM_ROWS = os.environ.get(\"NUM_ROWS\", 1000000)\n", + "SYNTHETIC_DATA = eval(os.environ.get(\"SYNTHETIC_DATA\", \"True\"))\n", + "BATCH_SIZE = int(os.environ.get(\"BATCH_SIZE\", 512))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b6651cc8", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "if SYNTHETIC_DATA:\n", + " train, valid = generate_data(\"aliccp-raw\", int(NUM_ROWS), set_sizes=(0.8, 0.2))\n", + " # save the datasets as parquet files\n", + " train.to_ddf().to_parquet(os.path.join(DATA_FOLDER, \"train\"))\n", + " valid.to_ddf().to_parquet(os.path.join(DATA_FOLDER, \"valid\"))" + ] + }, + { + "cell_type": "markdown", + "id": "ecf0e794", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Let's define our input and output paths." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "1124f2c1", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "train_path = os.path.join(DATA_FOLDER, \"train\", \"*.parquet\")\n", + "valid_path = os.path.join(DATA_FOLDER, \"valid\", \"*.parquet\")\n", + "output_path = os.path.join(DATA_FOLDER, \"processed\")" + ] + }, + { + "cell_type": "markdown", + "id": "2e1162c0", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "After we execute `fit()` and `transform()` functions on the raw dataset applying the operators defined in the NVTabular workflow pipeline below, the processed parquet files are saved to `output_path`." 
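+ "\n",
+ "(An aside on the real dataset: if you downloaded the raw Ali-CCP files mentioned above, the curation step might look like the sketch below. This is hypothetical usage of the linked `get_aliccp()` helper; check its source for the exact signature in your release.)\n",
+ "\n",
+ "```python\n",
+ "from merlin.datasets.ecommerce import get_aliccp\n",
+ "\n",
+ "# hypothetical: curate the raw Ali-CCP csv files under DATA_FOLDER and\n",
+ "# get back train/valid Merlin Datasets backed by parquet files\n",
+ "train, valid = get_aliccp(DATA_FOLDER)\n",
+ "```"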
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "89b3ddc6", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 2.61 s, sys: 1.09 s, total: 3.7 s\n", + "Wall time: 3.68 s\n" + ] + } + ], + "source": [ + "%%time\n", + "category_temp_directory = os.path.join(DATA_FOLDER, \"categories\")\n", + "user_id = [\"user_id\"] >> Categorify(out_path=category_temp_directory) >> TagAsUserID()\n", + "item_id = [\"item_id\"] >> Categorify(out_path=category_temp_directory) >> TagAsItemID()\n", + "targets = [\"click\"] >> AddMetadata(tags=[Tags.BINARY_CLASSIFICATION, \"target\"])\n", + "\n", + "item_features = [\"item_category\", \"item_shop\", \"item_brand\"] >> Categorify(out_path=category_temp_directory) >> TagAsItemFeatures()\n", + "\n", + "user_features = (\n", + " [\n", + " \"user_shops\",\n", + " \"user_profile\",\n", + " \"user_group\",\n", + " \"user_gender\",\n", + " \"user_age\",\n", + " \"user_consumption_2\",\n", + " \"user_is_occupied\",\n", + " \"user_geography\",\n", + " \"user_intentions\",\n", + " \"user_brands\",\n", + " \"user_categories\",\n", + " ]\n", + " >> Categorify(out_path=category_temp_directory)\n", + " >> TagAsUserFeatures()\n", + ")\n", + "\n", + "outputs = user_id + item_id + item_features + user_features + targets\n", + "\n", + "workflow = nvt.Workflow(outputs)\n", + "\n", + "train_dataset = nvt.Dataset(train_path)\n", + "valid_dataset = nvt.Dataset(valid_path)\n", + "\n", + "workflow.fit(train_dataset)\n", + "workflow.transform(train_dataset).to_parquet(output_path=output_path + \"/train/\")\n", + "workflow.transform(valid_dataset).to_parquet(output_path=output_path + \"/valid/\")" + ] + }, + { + "cell_type": "markdown", + "id": "8afd8b10", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "We save NVTabular `workflow` model in the current working directory." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "3e367206", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "workflow.save(os.path.join(DATA_FOLDER, \"workflow\"))" + ] + }, + { + "cell_type": "markdown", + "id": "be619646", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Let's check out our saved workflow model folder." 
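+ "\n",
+ "As a quick, optional sanity check (a sketch only; the deployment section below reloads the workflow for real), we can confirm that the saved workflow round-trips:\n",
+ "\n",
+ "```python\n",
+ "from nvtabular.workflow import Workflow\n",
+ "\n",
+ "# reload the workflow we just saved and compare its input columns\n",
+ "reloaded = Workflow.load(os.path.join(DATA_FOLDER, \"workflow\"))\n",
+ "assert reloaded.input_schema.column_names == workflow.input_schema.column_names\n",
+ "```"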
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "5e03167a", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: seedir in /usr/local/lib/python3.8/dist-packages (0.4.2)\n", + "Requirement already satisfied: natsort in /usr/local/lib/python3.8/dist-packages (from seedir) (8.4.0)\n" + ] + } + ], + "source": [ + "!pip install seedir" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "aeafadbe", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "data/\n", + "├─categories/\n", + "│ └─categories/\n", + "│ ├─meta.item_brand.parquet\n", + "│ ├─meta.item_category.parquet\n", + "│ ├─meta.item_id.parquet\n", + "│ ├─meta.item_shop.parquet\n", + "│ ├─meta.user_age.parquet\n", + "│ ├─meta.user_brands.parquet\n", + "│ ├─meta.user_categories.parquet\n", + "│ ├─meta.user_consumption_2.parquet\n", + "│ ├─meta.user_gender.parquet\n", + "│ └─meta.user_geography.parquet\n", + "├─dlrm/\n", + "│ ├─.merlin/\n", + "│ │ ├─input_schema.json\n", + "│ │ └─output_schema.json\n", + "│ ├─assets/\n", + "│ ├─fingerprint.pb\n", + "│ ├─keras_metadata.pb\n", + "│ ├─saved_model.pb\n", + "│ └─variables/\n", + "│ ├─variables.data-00000-of-00001\n", + "│ └─variables.index\n", + "├─processed/\n", + "│ ├─train/\n", + "│ │ ├─.merlin/\n", + "│ │ ├─_file_list.txt\n", + "│ │ ├─_metadata\n", + "│ │ ├─_metadata.json\n", + "│ │ ├─part_0.parquet\n", + "│ │ └─schema.pbtxt\n", + "│ └─valid/\n", + "│ ├─.merlin/\n", + "│ ├─_file_list.txt\n", + "│ ├─_metadata\n", + "│ ├─_metadata.json\n", + "│ ├─part_0.parquet\n", + "│ └─schema.pbtxt\n", + "├─train/\n", + "│ └─part.0.parquet\n", + "├─valid/\n", + "│ └─part.0.parquet\n", + "└─workflow/\n", + " ├─categories/\n", + " │ ├─unique.item_brand.parquet\n", + " │ ├─unique.item_category.parquet\n", + " │ ├─unique.item_id.parquet\n", + " │ ├─unique.item_shop.parquet\n", + " │ ├─unique.user_age.parquet\n", + " │ ├─unique.user_brands.parquet\n", + " │ ├─unique.user_categories.parquet\n", + " │ ├─unique.user_consumption_2.parquet\n", + " │ ├─unique.user_gender.parquet\n", + " │ └─unique.user_geography.parquet\n", + " ├─metadata.json\n", + " └─workflow.pkl\n" + ] + } + ], + "source": [ + "import seedir as sd\n", + "\n", + "sd.seedir(\n", + " DATA_FOLDER,\n", + " style=\"lines\",\n", + " itemlimit=10,\n", + " depthlimit=3,\n", + " exclude_folders=\".ipynb_checkpoints\",\n", + " sort=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "93f8e0ee", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Build and Train a DLRM model" + ] + }, + { + "cell_type": "markdown", + "id": "56f24b6b", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "In this example, we build, train, and export a Deep Learning Recommendation Model [(DLRM)](https://arxiv.org/abs/1906.00091) architecture. To learn more about how to train different deep learning models, how easily transition from one model to another and the seamless integration between data preparation and model training visit [03-Exploring-different-models.ipynb](https://github.com/NVIDIA-Merlin/models/blob/stable/examples/03-Exploring-different-models.ipynb) notebook." 
+ ] + }, + { + "cell_type": "markdown", + "id": "5ceb8dcc", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "NVTabular workflow above exports a schema file, schema.pbtxt, of our processed dataset. To learn more about the schema object, schema file and `tags`, you can explore [02-Merlin-Models-and-NVTabular-integration.ipynb](02-Merlin-Models-and-NVTabular-integration.ipynb)." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "be3a3421", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# define train and valid dataset objects\n", + "train = Dataset(os.path.join(output_path, \"train\", \"*.parquet\"))\n", + "valid = Dataset(os.path.join(output_path, \"valid\", \"*.parquet\"))\n", + "\n", + "# define schema object\n", + "schema = train.schema" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "b164b7ff", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'click'" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_column = schema.select_by_tag(Tags.TARGET).column_names[0]\n", + "target_column" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "71847bb9", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "model = mm.DLRMModel(\n", + " schema,\n", + " embedding_dim=64,\n", + " bottom_block=mm.MLPBlock([128, 64]),\n", + " top_block=mm.MLPBlock([128, 64, 32]),\n", + " prediction_tasks=mm.BinaryOutput(target_column),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "d009deb7", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-06-28 21:03:36.828993: I tensorflow/core/common_runtime/executor.cc:1209] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32\n", + "\t [[{{node Placeholder/_0}}]]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1563/1563 [==============================] - ETA: 0s - loss: 0.6932 - auc: 0.4998 - regularization_loss: 0.0000e+00 - loss_batch: 0.6932" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-06-28 21:04:40.190967: I tensorflow/core/common_runtime/executor.cc:1209] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32\n", + "\t [[{{node Placeholder/_0}}]]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1563/1563 [==============================] - 69s 38ms/step - loss: 0.6932 - auc: 0.4998 - regularization_loss: 0.0000e+00 - loss_batch: 0.6932 - val_loss: 0.6931 - val_auc: 0.5000 - val_regularization_loss: 0.0000e+00 - val_loss_batch: 0.6932\n", + "CPU times: user 1min 51s, sys: 14.1 s, total: 2min 5s\n", + "Wall time: 1min 11s\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + 
}, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "\n", + "model.compile(\"adam\", run_eagerly=False, metrics=[tf.keras.metrics.AUC()])\n", + "model.fit(train, validation_data=valid, batch_size=BATCH_SIZE)" + ] + }, + { + "cell_type": "markdown", + "id": "adc7051d", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### Save model" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "f999a063", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:absl:Found untraced functions such as model_context_layer_call_fn, model_context_layer_call_and_return_conditional_losses, prepare_list_features_layer_call_fn, prepare_list_features_layer_call_and_return_conditional_losses, dense_9_layer_call_fn while saving (showing 5 of 96). These functions will not be directly callable after loading.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Assets written to: /workspace/data/dlrm/assets\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Assets written to: /workspace/data/dlrm/assets\n" + ] + } + ], + "source": [ + "model.save(os.path.join(DATA_FOLDER, \"dlrm\"))" + ] + }, + { + "cell_type": "markdown", + "id": "2a9235b9", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "We have NVTabular wokflow and DLRM model exported, now it is time to move on to the next step: model deployment with [Merlin Systems](https://github.com/NVIDIA-Merlin/systems). " + ] + }, + { + "cell_type": "markdown", + "id": "c4f2667e", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Deploying the model with Merlin Systems" + ] + }, + { + "cell_type": "markdown", + "id": "ee302de0", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "The last step of machine learning (ML)/deep learning (DL) pipeline is to deploy the ETL workflow and saved model into production. In the production setting, we want to transform the input data as done during training (ETL). We need to apply the same mean/std for continuous features and use the same categorical mapping to convert the categories to continuous integer before we use the DL model for a prediction. Therefore, we deploy the NVTabular workflow with the Tensorflow model as an ensemble model to Triton Inference using [Merlin Systems](https://github.com/NVIDIA-Merlin/systems) library very easily. The ensemble model guarantees that the same transformation is applied to the raw inputs.\n", + "\n", + "In the next steps, we will learn how to deploy NVTabular workflow and the trained DLRM model into [Triton Inference Server](https://github.com/triton-inference-server/server) with [Merlin Systems](https://github.com/NVIDIA-Merlin/systems) library. NVIDIA Triton Inference Server (TIS) simplifies the deployment of AI models at scale in production. TIS provides a cloud and edge inferencing solution optimized for both CPUs and GPUs. It supports a number of different machine learning frameworks such as TensorFlow and PyTorch." + ] + }, + { + "cell_type": "markdown", + "id": "84002b14-5be5-4896-96ac-ea058bf8b7e3", + "metadata": {}, + "source": [ + "First, we load the `nvtabular.Workflow` that we created in with this [example](https://github.com/NVIDIA-Merlin/models/blob/stable/examples/04-Exporting-ranking-models.ipynb). 
" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "3e6b6cf0-2867-4cce-ade6-d0d86e2f7de7", + "metadata": {}, + "outputs": [], + "source": [ + "from nvtabular.workflow import Workflow\n", + "\n", + "workflow = Workflow.load(os.path.join(DATA_FOLDER, \"workflow\"))" + ] + }, + { + "cell_type": "markdown", + "id": "f206c105-0cd2-4710-af63-a8862307b67e", + "metadata": {}, + "source": [ + "After we load the workflow, we remove the label columns from it's inputs. This removes all columns with the TARGET tag from the workflow. We do this because we need to set the workflow to only require the features needed to predict, not train, when creating an inference pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "73729623-89af-442f-82ad-0ffad6e73fd3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from merlin.schema.tags import Tags\n", + "\n", + "label_columns = workflow.output_schema.select_by_tag(Tags.TARGET).column_names\n", + "workflow.remove_inputs(label_columns)" + ] + }, + { + "cell_type": "markdown", + "id": "09eddb76-971c-4a69-bfda-e1fe127b2582", + "metadata": {}, + "source": [ + "After loading the workflow, we load the model. This model was trained with the output of the workflow from the Exporting Ranking Models example from Merlin Models.\n", + "\n", + "First, we need to import the Merlin Models library. Loading a TensorFlow model, which is based on custom subclasses, requires to the subclass definition. Otherwise, TensorFlow cannot load correctly load the model." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "ec339d34-3667-4fe2-a9f5-ecb770e5c9a3", + "metadata": {}, + "outputs": [], + "source": [ + "tf_model_path = os.path.join(DATA_FOLDER, \"dlrm\")\n", + "\n", + "model = tf.keras.models.load_model(tf_model_path)" + ] + }, + { + "cell_type": "markdown", + "id": "b93b911f-4c6b-427b-a1f9-888f0f8c7613", + "metadata": {}, + "source": [ + "### Create the Ensemble Graph" + ] + }, + { + "cell_type": "markdown", + "id": "98f74ec8-c4e7-49e7-ba86-eae2f8f0b744", + "metadata": {}, + "source": [ + "After we have both the model and the workflow loaded, we can create the ensemble graph. You create the graph. The goal is to illustrate the path of data through your full system. In this example we only serve a workflow with a model, but you can add other components that help you meet your business logic requirements.\n", + "\n", + "Because this example has two components—a model and a workflow—we require two operators. These operators, also known as inference operators, are meant to abstract away all the \"hard parts\" of loading a specific component, such as a workflow or model, into Triton Inference Server.\n", + "\n", + "The following code block shows how to use two inference operators:\n", + "\n", + "- **TransformWorkflow:**
\n", + " This operator ensures that the workflow is correctly saved and packaged with the required config so the server will know how to load it.\n", + "\n", + "- **PredictTensorflow:**
\n", + " This operator will do something similar with the model, loaded before.\n", + "\n", + "Let's give it a try." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "86a951b0-1d44-456e-9dc1-d77e98223248", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:absl:Found untraced functions such as model_context_2_layer_call_fn, model_context_2_layer_call_and_return_conditional_losses, prepare_list_features_2_layer_call_fn, prepare_list_features_2_layer_call_and_return_conditional_losses, dense_9_layer_call_fn while saving (showing 5 of 96). These functions will not be directly callable after loading.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Assets written to: /tmp/tmpomjyo5xq/assets\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Assets written to: /tmp/tmpomjyo5xq/assets\n" + ] + } + ], + "source": [ + "from merlin.systems.dag.ops.workflow import TransformWorkflow\n", + "from merlin.systems.dag.ops.tensorflow import PredictTensorflow\n", + "\n", + "serving_operators = workflow.input_schema.column_names >> TransformWorkflow(workflow) >> PredictTensorflow(model)" + ] + }, + { + "cell_type": "markdown", + "id": "ef8324aa-756f-4df8-9c51-595e473b0ce5", + "metadata": {}, + "source": [ + "### Export Graph as Ensemble" + ] + }, + { + "cell_type": "markdown", + "id": "59a33309-4a82-41ac-8439-2a6aa95241af", + "metadata": {}, + "source": [ + "The last step is to create the ensemble artifacts that Triton Inference Server can consume. To make these artifacts, we import the Ensemble class. The class is responsible for interpreting the graph and exporting the correct files for the server.\n", + "\n", + "After you run the following cell, you'll see that we create a ColumnSchema for the expected inputs to the workflow. The workflow is a Schema.\n", + "\n", + "When you are creating an Ensemble object you supply the graph and a schema representing the starting input of the graph. the inputs to the ensemble graph are the inputs to the first operator of your graph.\n", + "\n", + "After you have created the Ensemble you export the graph, supplying an export path for the Ensemble.export function.\n", + "\n", + "This returns an ensemble config which represents the entire inference pipeline and a list of node-specific configs.\n", + "\n", + "Let's take a look below." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "4e305ed1-4c19-470c-82c0-0635f2d1d851", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nametagsdtypeis_listis_raggedproperties.num_bucketsproperties.freq_thresholdproperties.max_sizeproperties.cat_pathproperties.domain.minproperties.domain.maxproperties.domain.nameproperties.embedding_sizes.cardinalityproperties.embedding_sizes.dimension
0user_id(Tags.CATEGORICAL, Tags.ID, Tags.USER)DType(name='int64', element_type=<ElementType....FalseFalseNone00/workspace/data/categories/categories/unique.u...0772user_id77366
1item_id(Tags.CATEGORICAL, Tags.ITEM, Tags.ID)DType(name='int64', element_type=<ElementType....FalseFalseNone00/workspace/data/categories/categories/unique.i...0789item_id79067
2item_category(Tags.CATEGORICAL, Tags.ITEM)DType(name='int64', element_type=<ElementType....FalseFalseNone00/workspace/data/categories/categories/unique.i...0789item_category79067
3item_shop(Tags.CATEGORICAL, Tags.ITEM)DType(name='int64', element_type=<ElementType....FalseFalseNone00/workspace/data/categories/categories/unique.i...0789item_shop79067
4item_brand(Tags.CATEGORICAL, Tags.ITEM)DType(name='int64', element_type=<ElementType....FalseFalseNone00/workspace/data/categories/categories/unique.i...0789item_brand79067
5user_shops(Tags.CATEGORICAL, Tags.USER)DType(name='int64', element_type=<ElementType....FalseFalseNone00/workspace/data/categories/categories/unique.u...0772user_shops77366
6user_profile(Tags.CATEGORICAL, Tags.USER)DType(name='int64', element_type=<ElementType....FalseFalseNone00/workspace/data/categories/categories/unique.u...073user_profile7418
7user_group(Tags.CATEGORICAL, Tags.USER)DType(name='int64', element_type=<ElementType....FalseFalseNone00/workspace/data/categories/categories/unique.u...013user_group1416
8user_gender(Tags.CATEGORICAL, Tags.USER)DType(name='int64', element_type=<ElementType....FalseFalseNone00/workspace/data/categories/categories/unique.u...04user_gender516
9user_age(Tags.CATEGORICAL, Tags.USER)DType(name='int64', element_type=<ElementType....FalseFalseNone00/workspace/data/categories/categories/unique.u...08user_age916
10user_consumption_2(Tags.CATEGORICAL, Tags.USER)DType(name='int64', element_type=<ElementType....FalseFalseNone00/workspace/data/categories/categories/unique.u...05user_consumption_2616
11user_is_occupied(Tags.CATEGORICAL, Tags.USER)DType(name='int64', element_type=<ElementType....FalseFalseNone00/workspace/data/categories/categories/unique.u...04user_is_occupied516
12user_geography(Tags.CATEGORICAL, Tags.USER)DType(name='int64', element_type=<ElementType....FalseFalseNone00/workspace/data/categories/categories/unique.u...06user_geography716
13user_intentions(Tags.CATEGORICAL, Tags.USER)DType(name='int64', element_type=<ElementType....FalseFalseNone00/workspace/data/categories/categories/unique.u...0772user_intentions77366
14user_brands(Tags.CATEGORICAL, Tags.USER)DType(name='int64', element_type=<ElementType....FalseFalseNone00/workspace/data/categories/categories/unique.u...0772user_brands77366
15user_categories(Tags.CATEGORICAL, Tags.USER)DType(name='int64', element_type=<ElementType....FalseFalseNone00/workspace/data/categories/categories/unique.u...0772user_categories77366
\n", + "
" + ], + "text/plain": [ + "[{'name': 'user_id', 'tags': {, , }, 'properties': {'num_buckets': None, 'freq_threshold': 0, 'max_size': 0, 'cat_path': '/workspace/data/categories/categories/unique.user_id.parquet', 'domain': {'min': 0, 'max': 772, 'name': 'user_id'}, 'embedding_sizes': {'cardinality': 773, 'dimension': 66}}, 'dtype': DType(name='int64', element_type=, element_size=64, element_unit=None, signed=True, shape=Shape(dims=(Dimension(min=0, max=None),))), 'is_list': False, 'is_ragged': False}, {'name': 'item_id', 'tags': {, , }, 'properties': {'num_buckets': None, 'freq_threshold': 0, 'max_size': 0, 'cat_path': '/workspace/data/categories/categories/unique.item_id.parquet', 'domain': {'min': 0, 'max': 789, 'name': 'item_id'}, 'embedding_sizes': {'cardinality': 790, 'dimension': 67}}, 'dtype': DType(name='int64', element_type=, element_size=64, element_unit=None, signed=True, shape=Shape(dims=(Dimension(min=0, max=None),))), 'is_list': False, 'is_ragged': False}, {'name': 'item_category', 'tags': {, }, 'properties': {'num_buckets': None, 'freq_threshold': 0, 'max_size': 0, 'cat_path': '/workspace/data/categories/categories/unique.item_category.parquet', 'domain': {'min': 0, 'max': 789, 'name': 'item_category'}, 'embedding_sizes': {'cardinality': 790, 'dimension': 67}}, 'dtype': DType(name='int64', element_type=, element_size=64, element_unit=None, signed=True, shape=Shape(dims=(Dimension(min=0, max=None),))), 'is_list': False, 'is_ragged': False}, {'name': 'item_shop', 'tags': {, }, 'properties': {'num_buckets': None, 'freq_threshold': 0, 'max_size': 0, 'cat_path': '/workspace/data/categories/categories/unique.item_shop.parquet', 'domain': {'min': 0, 'max': 789, 'name': 'item_shop'}, 'embedding_sizes': {'cardinality': 790, 'dimension': 67}}, 'dtype': DType(name='int64', element_type=, element_size=64, element_unit=None, signed=True, shape=Shape(dims=(Dimension(min=0, max=None),))), 'is_list': False, 'is_ragged': False}, {'name': 'item_brand', 'tags': {, }, 'properties': {'num_buckets': None, 'freq_threshold': 0, 'max_size': 0, 'cat_path': '/workspace/data/categories/categories/unique.item_brand.parquet', 'domain': {'min': 0, 'max': 789, 'name': 'item_brand'}, 'embedding_sizes': {'cardinality': 790, 'dimension': 67}}, 'dtype': DType(name='int64', element_type=, element_size=64, element_unit=None, signed=True, shape=Shape(dims=(Dimension(min=0, max=None),))), 'is_list': False, 'is_ragged': False}, {'name': 'user_shops', 'tags': {, }, 'properties': {'num_buckets': None, 'freq_threshold': 0, 'max_size': 0, 'cat_path': '/workspace/data/categories/categories/unique.user_shops.parquet', 'domain': {'min': 0, 'max': 772, 'name': 'user_shops'}, 'embedding_sizes': {'cardinality': 773, 'dimension': 66}}, 'dtype': DType(name='int64', element_type=, element_size=64, element_unit=None, signed=True, shape=Shape(dims=(Dimension(min=0, max=None),))), 'is_list': False, 'is_ragged': False}, {'name': 'user_profile', 'tags': {, }, 'properties': {'num_buckets': None, 'freq_threshold': 0, 'max_size': 0, 'cat_path': '/workspace/data/categories/categories/unique.user_profile.parquet', 'domain': {'min': 0, 'max': 73, 'name': 'user_profile'}, 'embedding_sizes': {'cardinality': 74, 'dimension': 18}}, 'dtype': DType(name='int64', element_type=, element_size=64, element_unit=None, signed=True, shape=Shape(dims=(Dimension(min=0, max=None),))), 'is_list': False, 'is_ragged': False}, {'name': 'user_group', 'tags': {, }, 'properties': {'num_buckets': None, 'freq_threshold': 0, 'max_size': 0, 'cat_path': 
'/workspace/data/categories/categories/unique.user_group.parquet', 'domain': {'min': 0, 'max': 13, 'name': 'user_group'}, 'embedding_sizes': {'cardinality': 14, 'dimension': 16}}, 'dtype': DType(name='int64', element_type=, element_size=64, element_unit=None, signed=True, shape=Shape(dims=(Dimension(min=0, max=None),))), 'is_list': False, 'is_ragged': False}, {'name': 'user_gender', 'tags': {, }, 'properties': {'num_buckets': None, 'freq_threshold': 0, 'max_size': 0, 'cat_path': '/workspace/data/categories/categories/unique.user_gender.parquet', 'domain': {'min': 0, 'max': 4, 'name': 'user_gender'}, 'embedding_sizes': {'cardinality': 5, 'dimension': 16}}, 'dtype': DType(name='int64', element_type=, element_size=64, element_unit=None, signed=True, shape=Shape(dims=(Dimension(min=0, max=None),))), 'is_list': False, 'is_ragged': False}, {'name': 'user_age', 'tags': {, }, 'properties': {'num_buckets': None, 'freq_threshold': 0, 'max_size': 0, 'cat_path': '/workspace/data/categories/categories/unique.user_age.parquet', 'domain': {'min': 0, 'max': 8, 'name': 'user_age'}, 'embedding_sizes': {'cardinality': 9, 'dimension': 16}}, 'dtype': DType(name='int64', element_type=, element_size=64, element_unit=None, signed=True, shape=Shape(dims=(Dimension(min=0, max=None),))), 'is_list': False, 'is_ragged': False}, {'name': 'user_consumption_2', 'tags': {, }, 'properties': {'num_buckets': None, 'freq_threshold': 0, 'max_size': 0, 'cat_path': '/workspace/data/categories/categories/unique.user_consumption_2.parquet', 'domain': {'min': 0, 'max': 5, 'name': 'user_consumption_2'}, 'embedding_sizes': {'cardinality': 6, 'dimension': 16}}, 'dtype': DType(name='int64', element_type=, element_size=64, element_unit=None, signed=True, shape=Shape(dims=(Dimension(min=0, max=None),))), 'is_list': False, 'is_ragged': False}, {'name': 'user_is_occupied', 'tags': {, }, 'properties': {'num_buckets': None, 'freq_threshold': 0, 'max_size': 0, 'cat_path': '/workspace/data/categories/categories/unique.user_is_occupied.parquet', 'domain': {'min': 0, 'max': 4, 'name': 'user_is_occupied'}, 'embedding_sizes': {'cardinality': 5, 'dimension': 16}}, 'dtype': DType(name='int64', element_type=, element_size=64, element_unit=None, signed=True, shape=Shape(dims=(Dimension(min=0, max=None),))), 'is_list': False, 'is_ragged': False}, {'name': 'user_geography', 'tags': {, }, 'properties': {'num_buckets': None, 'freq_threshold': 0, 'max_size': 0, 'cat_path': '/workspace/data/categories/categories/unique.user_geography.parquet', 'domain': {'min': 0, 'max': 6, 'name': 'user_geography'}, 'embedding_sizes': {'cardinality': 7, 'dimension': 16}}, 'dtype': DType(name='int64', element_type=, element_size=64, element_unit=None, signed=True, shape=Shape(dims=(Dimension(min=0, max=None),))), 'is_list': False, 'is_ragged': False}, {'name': 'user_intentions', 'tags': {, }, 'properties': {'num_buckets': None, 'freq_threshold': 0, 'max_size': 0, 'cat_path': '/workspace/data/categories/categories/unique.user_intentions.parquet', 'domain': {'min': 0, 'max': 772, 'name': 'user_intentions'}, 'embedding_sizes': {'cardinality': 773, 'dimension': 66}}, 'dtype': DType(name='int64', element_type=, element_size=64, element_unit=None, signed=True, shape=Shape(dims=(Dimension(min=0, max=None),))), 'is_list': False, 'is_ragged': False}, {'name': 'user_brands', 'tags': {, }, 'properties': {'num_buckets': None, 'freq_threshold': 0, 'max_size': 0, 'cat_path': '/workspace/data/categories/categories/unique.user_brands.parquet', 'domain': {'min': 0, 'max': 772, 'name': 
'user_brands'}, 'embedding_sizes': {'cardinality': 773, 'dimension': 66}}, 'dtype': DType(name='int64', element_type=, element_size=64, element_unit=None, signed=True, shape=Shape(dims=(Dimension(min=0, max=None),))), 'is_list': False, 'is_ragged': False}, {'name': 'user_categories', 'tags': {, }, 'properties': {'num_buckets': None, 'freq_threshold': 0, 'max_size': 0, 'cat_path': '/workspace/data/categories/categories/unique.user_categories.parquet', 'domain': {'min': 0, 'max': 772, 'name': 'user_categories'}, 'embedding_sizes': {'cardinality': 773, 'dimension': 66}}, 'dtype': DType(name='int64', element_type=, element_size=64, element_unit=None, signed=True, shape=Shape(dims=(Dimension(min=0, max=None),))), 'is_list': False, 'is_ragged': False}]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "workflow.output_schema" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "440ce7a2-9a6d-47aa-9506-b7027ab0ffa9", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:absl:Found untraced functions such as model_context_2_layer_call_fn, model_context_2_layer_call_and_return_conditional_losses, prepare_list_features_2_layer_call_fn, prepare_list_features_2_layer_call_and_return_conditional_losses, dense_9_layer_call_fn while saving (showing 5 of 96). These functions will not be directly callable after loading.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Assets written to: /workspace/data/ensemble/1_predicttensorflowtriton/1/model.savedmodel/assets\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Assets written to: /workspace/data/ensemble/1_predicttensorflowtriton/1/model.savedmodel/assets\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:No training configuration found in save file, so the model was *not* compiled. Compile it manually.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:No training configuration found in save file, so the model was *not* compiled. Compile it manually.\n" + ] + } + ], + "source": [ + "from merlin.systems.dag.ensemble import Ensemble\n", + "\n", + "ensemble = Ensemble(serving_operators, workflow.input_schema)\n", + "\n", + "export_path = os.path.join(DATA_FOLDER, \"ensemble\")\n", + "\n", + "ens_conf, node_confs = ensemble.export(export_path)" + ] + }, + { + "cell_type": "markdown", + "id": "987010e0-fae5-48dc-9781-f68e1e0035f3", + "metadata": {}, + "source": [ + "Display the path to the directory with the ensemble." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "cfa8da6c-1f81-4ee2-b8d2-3dca75aeaedc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/workspace/data/ensemble\n" + ] + } + ], + "source": [ + "print(export_path)" + ] + }, + { + "cell_type": "markdown", + "id": "8d47dd90-39df-4c0b-849c-c06f11977512", + "metadata": {}, + "source": [ + "### Verification of Ensemble Artifacts" + ] + }, + { + "cell_type": "markdown", + "id": "6d09fb14-92d8-44d4-b727-91ecf0e2dd71", + "metadata": {}, + "source": [ + "After we export the ensemble, we can check the export path for the graph's artifacts. 
The directory structure represents an ordering number followed by an operator identifier, such as `0_transformworkflowtriton`, `1_predicttensorflowtriton`, and so on.\n",
+ "\n",
+ "Inside each of those directories, the export method writes a config.pbtxt file and a directory with a number. The number indicates the version and begins at 1. The artifacts for each operator are found inside the version folder. These artifacts vary depending on the operator in use.\n",
+ "\n",
+ "We use the `seedir` python package, installed earlier, to view some of the directory contents."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "be28d294-f9f2-4086-9d79-5bd9d93c603a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "ensemble/\n",
+ "├─0_transformworkflowtriton/\n",
+ "│ ├─1/\n",
+ "│ │ ├─model.py\n",
+ "│ │ └─workflow/\n",
+ "│ └─config.pbtxt\n",
+ "├─1_predicttensorflowtriton/\n",
+ "│ ├─1/\n",
+ "│ │ └─model.savedmodel/\n",
+ "│ └─config.pbtxt\n",
+ "└─executor_model/\n",
+ "  ├─1/\n",
+ "  │ ├─ensemble/\n",
+ "  │ └─model.py\n",
+ "  └─config.pbtxt\n"
+ ]
+ }
+ ],
+ "source": [
+ "sd.seedir(export_path, style='lines', itemlimit=10, depthlimit=3, exclude_folders='.ipynb_checkpoints', sort=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9d0bc00b-3d39-4093-90d6-6bc4c84507f9",
+ "metadata": {},
+ "source": [
+ "### Starting Triton Server"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d42fb70f-a9b3-4240-b190-b6e68b0c9b88",
+ "metadata": {},
+ "source": [
+ "After we export the ensemble, we are ready to start the Triton Inference Server. The server is installed in all the Merlin inference containers. If you are not using one of our containers, then ensure it is installed in your environment. For more information, see the Triton Inference Server documentation.\n",
+ "\n",
+ "You can start the server by running the following command:\n",
+ "```\n",
+ "tritonserver --model-repository=/workspace/data/ensemble\n",
+ "```\n",
+ "\n",
+ "For the `--model-repository` argument, specify the same value as the `export_path` that you specified previously in the `ensemble.export` method.\n",
+ "\n",
+ "After you run the `tritonserver` command, wait until your terminal shows messages like the following example:\n",
+ "```\n",
+ "I0414 18:29:50.741833 4067 grpc_server.cc:4421] Started GRPCInferenceService at 0.0.0.0:8001\n",
+ "I0414 18:29:50.742197 4067 http_server.cc:3113] Started HTTPService at 0.0.0.0:8000\n",
+ "I0414 18:29:50.783470 4067 http_server.cc:178] Started Metrics Service at 0.0.0.0:8002\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0c838873-ee04-45bb-9725-01105c66956b",
+ "metadata": {},
+ "source": [
+ "### Retrieving Recommendations from Triton Inference Server\n",
+ "Now that our server is running, we can send requests to it. This request is composed of values that correspond to the request schema that was created when we exported the ensemble graph.\n",
+ "\n",
+ "In the code below we create a request to send to Triton and send it. We will then analyze the response to show the full experience.\n",
+ "\n",
+ "First, we need to ensure that we have a client connected to the server that we started. To do this, we use the Triton HTTP client library."
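+ "\n",
+ "Because the server can take a little while to load the ensemble, it can help to poll its readiness before sending requests. A minimal sketch, assuming the default HTTP port 8000 (the same check also exists on the GRPC client):\n",
+ "\n",
+ "```python\n",
+ "import time\n",
+ "import tritonclient.http as client\n",
+ "\n",
+ "# poll the server until it reports ready, or give up after ~60s\n",
+ "probe = client.InferenceServerClient(url=\"localhost:8000\")\n",
+ "for _ in range(60):\n",
+ "    try:\n",
+ "        if probe.is_server_ready():\n",
+ "            break\n",
+ "    except Exception:\n",
+ "        pass  # server not accepting connections yet\n",
+ "    time.sleep(1)\n",
+ "else:\n",
+ "    raise RuntimeError(\"Triton did not become ready in time\")\n",
+ "```"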
+ ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "bc26afee-8853-4bb0-a027-68613248088c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "client created.\n" + ] + } + ], + "source": [ + "import tritonclient.http as client\n", + "\n", + "# Create a triton client\n", + "try:\n", + " triton_client = client.InferenceServerClient(url=\"localhost:8000\", verbose=True)\n", + " print(\"client created.\")\n", + "except Exception as e:\n", + " print(\"channel creation failed: \" + str(e))" + ] + }, + { + "cell_type": "markdown", + "id": "23fb2d2e-60f2-4406-82aa-f040403729a3", + "metadata": {}, + "source": [ + "After we create the client and verified it is connected to the server instance, we can communicate with the server and ensure all the models are loaded correctly." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "b12afdf8-1e73-4eb3-9ea1-85dd4773a649", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GET /v2/health/live, headers None\n", + "\n", + "POST /v2/repository/index, headers None\n", + "\n", + "\n", + "bytearray(b'[{\"name\":\"0_transformworkflowtriton\",\"version\":\"1\",\"state\":\"READY\"},{\"name\":\"1_predicttensorflowtriton\",\"version\":\"1\",\"state\":\"READY\"},{\"name\":\"executor_model\",\"version\":\"1\",\"state\":\"READY\"}]')\n" + ] + }, + { + "data": { + "text/plain": [ + "[{'name': '0_transformworkflowtriton', 'version': '1', 'state': 'READY'},\n", + " {'name': '1_predicttensorflowtriton', 'version': '1', 'state': 'READY'},\n", + " {'name': 'executor_model', 'version': '1', 'state': 'READY'}]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# ensure triton is in a good state\n", + "triton_client.is_server_live()\n", + "triton_client.get_model_repository_index()" + ] + }, + { + "cell_type": "markdown", + "id": "420d5603-568f-41f4-ac0d-c5852b9f33dd", + "metadata": {}, + "source": [ + "After verifying the models are correctly loaded by the server, we use some original, raw validation data and send it as an inference request to the server.\n", + "\n", + "The df_lib object is cudf if a GPU is available and pandas otherwise." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "5bb6aec6-0a7e-41a7-b42d-0a378c530d28", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_iditem_iditem_categoryitem_shopitem_branduser_shopsuser_profileuser_groupuser_genderuser_ageuser_consumption_2user_is_occupieduser_geographyuser_intentionsuser_brandsuser_categories
__null_dask_index__
800000252685593620451670211111148483088
80000128134128509821879211111154493498
80000292423882557111111116227730
\n", + "
" + ], + "text/plain": [ + " user_id item_id item_category item_shop item_brand \\\n", + "__null_dask_index__ \n", + "800000 25 26 85 5936 2045 \n", + "800001 28 13 41 2850 982 \n", + "800002 9 2 4 238 82 \n", + "\n", + " user_shops user_profile user_group user_gender \\\n", + "__null_dask_index__ \n", + "800000 1670 2 1 1 \n", + "800001 1879 2 1 1 \n", + "800002 557 1 1 1 \n", + "\n", + " user_age user_consumption_2 user_is_occupied \\\n", + "__null_dask_index__ \n", + "800000 1 1 1 \n", + "800001 1 1 1 \n", + "800002 1 1 1 \n", + "\n", + " user_geography user_intentions user_brands \\\n", + "__null_dask_index__ \n", + "800000 1 484 830 \n", + "800001 1 544 934 \n", + "800002 1 162 277 \n", + "\n", + " user_categories \n", + "__null_dask_index__ \n", + "800000 88 \n", + "800001 98 \n", + "800002 30 " + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from merlin.core.dispatch import get_lib\n", + "\n", + "df_lib = get_lib()\n", + "\n", + "# read in data for request\n", + "batch = df_lib.read_parquet(\n", + " os.path.join(DATA_FOLDER,\"valid\", \"part.0.parquet\"), columns=workflow.input_schema.column_names\n", + ").head(3)\n", + "batch" + ] + }, + { + "cell_type": "markdown", + "id": "06b23899-dbb4-4701-bfa3-0d21da333159", + "metadata": {}, + "source": [ + "After we isolate our batch, we convert the dataframe representation into inputs for Triton. We also declare the outputs that we expect to receive from the model." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "080d84dc-9c09-4d94-8ced-8d160ca88f01", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['click/binary_output']\n" + ] + } + ], + "source": [ + "from merlin.systems.triton import convert_df_to_triton_input\n", + "import tritonclient.grpc as grpcclient\n", + "# create inputs and outputs\n", + "\n", + "inputs = convert_df_to_triton_input(workflow.input_schema, batch, grpcclient.InferInput)\n", + "\n", + "output_cols = ensemble.graph.output_schema.column_names\n", + "print(output_cols)\n", + "\n", + "outputs = [\n", + " grpcclient.InferRequestedOutput(col)\n", + " for col in output_cols\n", + "]" + ] + }, + { + "cell_type": "markdown", + "id": "01208b4d-5478-48a3-9114-b904b2ca2167", + "metadata": {}, + "source": [ + "Now that our inputs and outputs are created, we can use the triton_client that we created earlier to send the inference request." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "95dea3b8-92aa-41f9-a1b4-2cb516c6b793", + "metadata": {}, + "outputs": [], + "source": [ + "# send request to tritonserver\n", + "with grpcclient.InferenceServerClient(\"localhost:8001\") as client:\n", + " response = client.infer(\"executor_model\", inputs, request_id=\"1\", outputs=outputs)" + ] + }, + { + "cell_type": "markdown", + "id": "b921c299-df76-45ef-9acc-5b17bc52bd3a", + "metadata": {}, + "source": [ + "When the server completes the inference request, it returns a response, i.e. likelihood per request. This response is parsed to get the desired predictions." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "b766ef55-5661-4268-aed9-6f4096cce58d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[0.5002032]\n", + " [0.5001995]\n", + " [0.5001995]]\n" + ] + } + ], + "source": [ + "predictions = response.as_numpy('click/binary_output')\n", + "print(predictions)" + ] + }, + { + "cell_type": "markdown", + "id": "24ee5636-600a-4422-8165-f70e8e847031", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "This sample notebook started with data preprocessing and model training. We learned how to create an ensemble graph, verify the ensemble artifacts in the file system, and then put the ensemble into production with Triton Inference Server. Finally, we sent a simple inference request to the server and printed the response." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "merlin": { + "containers": [ + "nvcr.io/nvidia/merlin/merlin-tensorflow:latest" + ] + }, + "vscode": { + "interpreter": { + "hash": "a398807c5c2ed8e5ff9d9890488d007fa99cbabcec733962e21659a28c5da99b" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/traditional-ml/Serving-An-Implicit-Model-With-Merlin-Systems.ipynb b/examples/traditional-ml/Serving-An-Implicit-Model-With-Merlin-Systems.ipynb new file mode 100644 index 000000000..d645a172f --- /dev/null +++ b/examples/traditional-ml/Serving-An-Implicit-Model-With-Merlin-Systems.ipynb @@ -0,0 +1,488 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "5cdba80f", + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2022 NVIDIA Corporation. All Rights Reserved.\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# http://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License.\n", + "# ==============================================================================\n", + "\n", + "# Each user is responsible for checking the content of datasets and the\n", + "# applicable licenses and determining if suitable for the intended use." + ] + }, + { + "cell_type": "markdown", + "id": "77acbcad", + "metadata": {}, + "source": [ + "\n", + "\n", + "# Serving an Implicit Model with Merlin Systems\n", + "\n", + "This notebook is created using the latest stable [merlin-tensorflow](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/merlin/containers/merlin-tensorflow) container. 
This Jupyter notebook example demonstrates how to deploy an `Implicit` model to Triton Inference Server (TIS) and generate prediction results for a given query.\n",
+ "\n",
+ "## Overview\n",
+ "\n",
+ "NVIDIA Merlin is an open source framework that accelerates and scales end-to-end recommender system pipelines. The Merlin framework is broken up into several subcomponents, including Merlin-Core, Merlin-Models, NVTabular, and Merlin-Systems. Merlin Systems will be the focus of this example.\n",
+ "\n",
+ "The purpose of the Merlin Systems library is to make it easy for Merlin users to quickly deploy their recommender systems from development to [Triton Inference Server](https://github.com/triton-inference-server/server). It extends the same user-friendly API that users are accustomed to from NVTabular and leverages it to deploy recommender system components to TIS.\n",
+ "\n",
+ "### Learning objectives\n",
+ "\n",
+ "In this notebook, we learn how to deploy an NVTabular Workflow and a trained `Implicit` model from Merlin Models to Triton.\n",
+ "- Create Ensemble Graph\n",
+ "- Export Ensemble Graph\n",
+ "- Run Triton server\n",
+ "- Send request to Triton and verify results\n",
+ "\n",
+ "### Dataset\n",
+ "\n",
+ "We use the [MovieLens 100k Dataset](https://grouplens.org/datasets/movielens/100k/). It consists of ratings a user has given a movie along with some metadata for the user and the movie. We train an Implicit model to predict the rating based on user and item features and proceed to deploy it to the Triton Inference Server.\n",
+ "\n",
+ "It is important to note that the steps taken in this notebook are generalized and can be applied to any set of workflows and models.\n",
+ "\n",
+ "### Tools\n",
+ "\n",
+ "- NVTabular\n",
+ "- Merlin Models\n",
+ "- Merlin Systems\n",
+ "- Triton Inference Server"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6efad6b8",
+ "metadata": {},
+ "source": [
+ "## Prerequisite: Preparing the data and Training Implicit"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "356ef8c9",
+ "metadata": {},
+ "source": [
+ "In this tutorial, our objective is to demonstrate how to serve an `Implicit` model. In order to be able to do so, we begin by downloading data and training a model. We breeze through these activities below.\n",
+ "\n",
+ "If you would like to learn more about training an `Implicit` model using the Merlin Models library, please consult this [tutorial](https://github.com/NVIDIA-Merlin/models/blob/stable/examples/07-Train-traditional-ML-models-using-the-Merlin-Models-API.ipynb)."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "edea28d0", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import nvtabular as nvt\n", + "import numpy as np\n", + "from merlin.schema.tags import Tags\n", + "from merlin.models.implicit import BayesianPersonalizedRanking\n", + "\n", + "from merlin.datasets.entertainment import get_movielens\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b756a12f", + "metadata": {}, + "outputs": [], + "source": [ + "ensemble_export_path = os.environ.get(\"OUTPUT_DATA_DIR\", \"ensemble\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5c10a993", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "train, _ = get_movielens(variant='ml-100k')\n", + "\n", + "# the implicit model expects a `user_id` column hence the need to rename it\n", + "train = nvt.Dataset(train.compute().rename(columns = {'userId': 'user_id'}))\n", + "\n", + "user_id = ['user_id'] >> nvt.ops.Categorify() >> nvt.ops.TagAsUserID()\n", + "movieId = ['movieId'] >> nvt.ops.Categorify() >> nvt.ops.TagAsItemID()\n", + "\n", + "train_workflow = nvt.Workflow(user_id + movieId)\n", + "train_transformed = train_workflow.fit_transform(train)" + ] + }, + { + "cell_type": "markdown", + "id": "ff168b4a", + "metadata": {}, + "source": [ + "Having preprocessed our data, let's train our model." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d0b55be5", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.8/dist-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "2022-09-05 09:32:07.681291: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:991] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", + "2022-09-05 09:32:07.681740: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:991] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", + "2022-09-05 09:32:07.681877: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:991] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", + "/usr/local/lib/python3.8/dist-packages/cudf/core/frame.py:384: UserWarning: The deep parameter is ignored and is only included for pandas compatibility.\n", + " warnings.warn(\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 777.52it/s, train_auc=85.42%, skipped=29.68%]\n" + ] + } + ], + "source": [ + "model = BayesianPersonalizedRanking()\n", + "model.fit(train_transformed)" + ] + }, + { + "cell_type": "markdown", + "id": "f4a3cf39", + "metadata": {}, + "source": [ + "## Create the Ensemble Graph" + ] + }, + { + "cell_type": "markdown", + "id": "dc40083e", + "metadata": {}, + "source": [ + "Let us now define an `Ensemble` that will be used for serving predictions on the Triton Inference Server.\n", + "\n", + "An `Ensemble` defines 
operations to be performed on incoming requests. It begins with specifying what fields the inference request will contain.\n", + "\n", + "Our model was trained on data that included the `movieId` column. However, in production, this information will not be available to us; it is what we will be trying to predict.\n", + "\n", + "In general, you want to define a preprocessing workflow once and apply it throughout the lifecycle of your model, from training all the way to serving in production. Redefining the workflows on the go, or using custom-written code for these operations, can be a source of subtle bugs.\n", + "\n", + "In order to ensure we process our data in the same way in production as we do in training, let us now modify the training preprocessing pipeline and use it to construct our inference workflow." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "fa8dc34a", + "metadata": {}, + "outputs": [], + "source": [ + "inf_workflow = train_workflow.remove_inputs(['movieId'])" + ] + }, + { + "cell_type": "markdown", + "id": "d71c5636", + "metadata": {}, + "source": [ + "Equipped with the modified data preprocessing workflow, let us define the full set of inference operations we will want to run on the Triton Inference Server.\n", + "\n", + "We begin by stating what data the server can expect (`inf_workflow.input_schema.column_names`). We proceed to wrap our `inf_workflow` in `TransformWorkflow` -- an operator we can leverage for executing our NVTabular workflow during serving.\n", + "\n", + "Last but not least, having received and preprocessed the data, we instruct the Triton Inference Server to perform inference using the model that we trained. " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "de9e2237", + "metadata": {}, + "outputs": [], + "source": [ + "from merlin.systems.dag.ops.implicit import PredictImplicit\n", + "from merlin.systems.dag.ensemble import Ensemble\n", + "from merlin.systems.dag.ops.workflow import TransformWorkflow\n", + "\n", + "inf_ops = inf_workflow.input_schema.column_names >> TransformWorkflow(inf_workflow) \\\n", + " >> PredictImplicit(model.implicit_model)" + ] + }, + { + "cell_type": "markdown", + "id": "76dad9c3", + "metadata": {}, + "source": [ + "With inference operations defined, all that remains now is outputting the ensemble to disk so that it can be loaded up when Triton starts." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "e23a7fc3", + "metadata": {}, + "outputs": [], + "source": [ + "ensemble = Ensemble(inf_ops, inf_workflow.input_schema)\n", + "ensemble.export(ensemble_export_path);" + ] + }, + { + "cell_type": "markdown", + "id": "c9165dfd", + "metadata": {}, + "source": [ + "## Starting the Triton Inference Server" + ] + }, + { + "cell_type": "markdown", + "id": "353e8602", + "metadata": {}, + "source": [ + "After we export the ensemble, we are ready to start the Triton Inference Server. The server is installed in the Merlin TensorFlow and Merlin PyTorch containers. If you are not using one of our containers, then ensure it is installed in your environment. 
For more information, see the Triton Inference Server [documentation](https://github.com/triton-inference-server/server/blob/r22.03/README.md#documentation).\n", + "\n", + "You can start the server by running the following command:\n", + "\n", + "```shell\n", + "tritonserver --model-repository=ensemble\n", + "```\n", + "\n", + "For the `--model-repository` argument, specify the same value as the `export_path` that you specified previously in the `ensemble.export` method.\n", + "\n", + "After you run the `tritonserver` command, wait until your terminal shows messages like the following example:\n", + "\n", + "```shell\n", + "I0414 18:29:50.741833 4067 grpc_server.cc:4421] Started GRPCInferenceService at 0.0.0.0:8001\n", + "I0414 18:29:50.742197 4067 http_server.cc:3113] Started HTTPService at 0.0.0.0:8000\n", + "I0414 18:29:50.783470 4067 http_server.cc:178] Started Metrics Service at 0.0.0.0:8002\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "65b7e4e8", + "metadata": {}, + "source": [ + "## Retrieving Recommendations from Triton Inference Server\n", + "\n", + "Now that our server is running, we can send requests to it. Each request is composed of values that correspond to the request schema that was created when we exported the ensemble graph.\n", + "\n", + "In the code below, we create a request and send it to Triton. We then analyze the response to show the full round trip.\n", + "\n", + "We begin by obtaining 10 examples from our train data to include in the request." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "2d61751b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_id
06
115
270
386
496
5109
6143
7183
8609
9858
\n", + "
" + ], + "text/plain": [ + " user_id\n", + "0 6\n", + "1 15\n", + "2 70\n", + "3 86\n", + "4 96\n", + "5 109\n", + "6 143\n", + "7 183\n", + "8 609\n", + "9 858" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ten_examples = train.compute()['user_id'].unique().sample(10).sort_values().to_frame().reset_index(drop=True)\n", + "ten_examples" + ] + }, + { + "cell_type": "markdown", + "id": "7808bc12", + "metadata": {}, + "source": [ + "Let's now package the information up as inputs and send it to Triton for inference." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "2fefd5b8", + "metadata": {}, + "outputs": [], + "source": [ + "from merlin.systems.triton import convert_df_to_triton_input\n", + "import tritonclient.grpc as grpcclient\n", + "\n", + "inputs = convert_df_to_triton_input(inf_workflow.input_schema, ten_examples, grpcclient.InferInput)\n", + "\n", + "outputs = [\n", + " grpcclient.InferRequestedOutput(col)\n", + " for col in inf_ops.output_schema.column_names\n", + "]\n", + "# send request to tritonserver\n", + "with grpcclient.InferenceServerClient(\"localhost:8001\") as client:\n", + " response = client.infer(\"executor_model\", inputs, outputs=outputs)" + ] + }, + { + "cell_type": "markdown", + "id": "3dc7909f", + "metadata": {}, + "source": [ + "We can now compare the predictions from the server to those from our local model." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "6ddd35cc", + "metadata": {}, + "outputs": [], + "source": [ + "predictions_from_triton = response.as_numpy(outputs[0].name())" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "6f28fdfe", + "metadata": {}, + "outputs": [], + "source": [ + "local_predictions = model.predict(inf_workflow.transform(nvt.Dataset(ten_examples)))[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "e946de27", + "metadata": {}, + "outputs": [], + "source": [ + "np.testing.assert_allclose(predictions_from_triton, local_predictions)" + ] + }, + { + "cell_type": "markdown", + "id": "d8aa4456", + "metadata": {}, + "source": [ + "We managed to preprocess the data in the same way in serving as we did during training and obtain the same predictions!" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/traditional-ml/Serving-An-XGboost-Model-With-Merlin-Systems.ipynb b/examples/traditional-ml/Serving-An-XGboost-Model-With-Merlin-Systems.ipynb new file mode 100644 index 000000000..88cc2be8c --- /dev/null +++ b/examples/traditional-ml/Serving-An-XGboost-Model-With-Merlin-Systems.ipynb @@ -0,0 +1,545 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "5cdba80f", + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2022 NVIDIA Corporation. 
All Rights Reserved.\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# http://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License.\n", + "# ==============================================================================\n", + "\n", + "# Each user is responsible for checking the content of datasets and the\n", + "# applicable licenses and determining if suitable for the intended use." + ] + }, + { + "cell_type": "markdown", + "id": "77acbcad", + "metadata": {}, + "source": [ + "\n", + "\n", + "# Serving an XGBoost Model with Merlin Systems\n", + "\n", + "This notebook is created using the latest stable [merlin-tensorflow](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/merlin/containers/merlin-tensorflow) container. This Jupyter notebook example demonstrates how to deploy an `XGBoost` model to Triton Inference Server (TIS) and generate prediction results for a given query.\n", + "\n", + "## Overview\n", + "\n", + "NVIDIA Merlin is an open source framework that accelerates and scales end-to-end recommender system pipelines. The Merlin framework is broken up into several subcomponents: Merlin-Core, Merlin-Models, NVTabular, and Merlin-Systems. Merlin Systems will be the focus of this example.\n", + "\n", + "The purpose of the Merlin Systems library is to make it easy for Merlin users to quickly deploy their recommender systems from development to [Triton Inference Server](https://github.com/triton-inference-server/server). We extended the same user-friendly API users are accustomed to in NVTabular and leveraged it to accommodate deploying recommender system components to TIS. \n", + "\n", + "### Learning objectives\n", + "\n", + "In this notebook, we learn how to deploy an NVTabular Workflow and a trained XGBoost model from Merlin Models to Triton.\n", + "- Create Ensemble Graph\n", + "- Export Ensemble Graph\n", + "- Run Triton server\n", + "- Send request to Triton and verify results\n", + "\n", + "### Dataset\n", + "\n", + "We use the [MovieLens 100k Dataset](https://grouplens.org/datasets/movielens/100k/). It consists of ratings users have given movies, along with some metadata for the user and the movie. We train an XGBoost model to predict the rating based on user and item features and proceed to deploy it to the Triton Inference Server.\n", + "\n", + "It is important to note that the steps taken in this notebook are generalized and can be applied to any set of workflows and models. \n", + "\n", + "### Tools\n", + "\n", + "- NVTabular\n", + "- Merlin Models\n", + "- Merlin Systems\n", + "- Triton Inference Server" + ] + }, + { + "cell_type": "markdown", + "id": "6efad6b8", + "metadata": {}, + "source": [ + "## Prerequisite: Preparing the data and Training XGBoost" + ] + }, + { + "cell_type": "markdown", + "id": "356ef8c9", + "metadata": {}, + "source": [ + "In this tutorial, our objective is to demonstrate how to serve an `XGBoost` model. In order to do so, we begin by downloading data and training a model. 
We breeze through these activities below.\n", + "\n", + "If you would like to learn more about training an `XGBoost` model using the Merlin Models library, please consult this [tutorial](https://github.com/NVIDIA-Merlin/models/blob/stable/examples/07-Train-an-xgboost-model-using-the-Merlin-Models-API.ipynb)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d0385d38", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "from merlin.core.utils import Distributed\n", + "from merlin.models.xgb import XGBoost\n", + "import nvtabular as nvt\n", + "import numpy as np\n", + "from merlin.schema.tags import Tags\n", + "\n", + "from merlin.datasets.entertainment import get_movielens" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f79d5736", + "metadata": {}, + "outputs": [], + "source": [ + "ensemble_export_path = os.environ.get(\"OUTPUT_DATA_DIR\", \"ensemble\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "0a2d3208", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-08-05 22:27:29.446602: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:991] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", + "2022-08-05 22:27:29.447091: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:991] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", + "2022-08-05 22:27:29.447227: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:991] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", + "downloading ml-100k.zip: 4.94MB [00:03, 1.45MB/s] \n", + "unzipping files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [00:00<00:00, 262.32files/s]\n", + "INFO:merlin.datasets.entertainment.movielens.dataset:starting ETL..\n", + "/usr/local/lib/python3.8/dist-packages/cudf/core/frame.py:384: UserWarning: The deep parameter is ignored and is only included for pandas compatibility.\n", + " warnings.warn(\n", + "2022-08-05 22:27:39,947 - distributed.diskutils - INFO - Found stale lock file and directory '/workspace/dask-worker-space/worker-oqemvhkv', purging\n", + "2022-08-05 22:27:39,947 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize\n", + "[22:27:41] task [xgboost.dask]:tcp://127.0.0.1:41809 got new rank 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0]\ttrain-rmse:2.36952\n", + "[20]\ttrain-rmse:0.95316\n", + "[40]\ttrain-rmse:0.92447\n", + "[60]\ttrain-rmse:0.90741\n", + "[80]\ttrain-rmse:0.89437\n", + "[84]\ttrain-rmse:0.89138\n" + ] + } + ], + "source": [ + "\n", + "train, _ = get_movielens(variant='ml-100k')\n", + "\n", + "preprocess_categories = ['movieId', 'userId', 'genres'] >> nvt.ops.Categorify(freq_threshold=2, dtype=np.int32)\n", + "preprocess_rating = ['rating'] >> nvt.ops.AddTags(tags=[Tags.TARGET, Tags.REGRESSION])\n", + "\n", + "train_workflow = nvt.Workflow(preprocess_categories + preprocess_rating + train.schema.without(['rating_binary', 'title']).column_names)\n", + "train_transformed 
= train_workflow.fit_transform(train)\n", + "\n", + "with Distributed():\n", + " model = XGBoost(schema=train_transformed.schema)\n", + " model.fit(\n", + " train_transformed,\n", + " num_boost_round=85,\n", + " verbose_eval=20\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "f4a3cf39", + "metadata": {}, + "source": [ + "## Create the Ensemble Graph" + ] + }, + { + "cell_type": "markdown", + "id": "dc40083e", + "metadata": {}, + "source": [ + "Let us now define an `Ensemble` that will be used for serving predictions on the Triton Inference Server.\n", + "\n", + "An `Ensemble` defines operations to be performed on incoming requests. It begins with specifying what fields the inference request will contain.\n", + "\n", + "Our model was trained on data that included the target column, `rating`. However, in production, this information will not be available to us.\n", + "\n", + "In general, you want to define a preprocessing workflow once and apply it throughout the lifecycle of your model, from training all the way to serving in production. Redefining the workflows on the go, or using custom-written code for these operations, can be a source of subtle bugs.\n", + "\n", + "In order to ensure we process our data in the same way in production as we do in training, let us now modify the training preprocessing pipeline and use it to construct our inference workflow." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "fa8dc34a", + "metadata": {}, + "outputs": [], + "source": [ + "inf_workflow = train_workflow.remove_inputs(['rating'])" + ] + }, + { + "cell_type": "markdown", + "id": "d71c5636", + "metadata": {}, + "source": [ + "Equipped with the modified data preprocessing workflow, let us define the full set of inference operations we will want to run on the Triton Inference Server.\n", + "\n", + "We begin by stating what data the server can expect (`inf_workflow.input_schema.column_names`). We proceed to wrap our `inf_workflow` in `TransformWorkflow` -- an operator we can leverage for executing our NVTabular workflow during serving.\n", + "\n", + "Last but not least, having received and preprocessed the data, we instruct the Triton Inference Server to perform inference using the model that we trained. " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "de9e2237", + "metadata": {}, + "outputs": [], + "source": [ + "from merlin.systems.dag.ops.fil import PredictForest\n", + "from merlin.systems.dag.ensemble import Ensemble\n", + "from merlin.systems.dag.ops.workflow import TransformWorkflow\n", + "\n", + "inf_ops = inf_workflow.input_schema.column_names >> TransformWorkflow(inf_workflow) \\\n", + " >> PredictForest(model, inf_workflow.output_schema)" + ] + }, + { + "cell_type": "markdown", + "id": "76dad9c3", + "metadata": {}, + "source": [ + "With inference operations defined, all that remains now is outputting the ensemble to disk so that it can be loaded up when Triton starts." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e23a7fc3", + "metadata": {}, + "outputs": [], + "source": [ + "ensemble = Ensemble(inf_ops, inf_workflow.input_schema)\n", + "ensemble.export(ensemble_export_path);" + ] + }, + { + "cell_type": "markdown", + "id": "c9165dfd", + "metadata": {}, + "source": [ + "## Starting the Triton Inference Server" + ] + }, + { + "cell_type": "markdown", + "id": "353e8602", + "metadata": {}, + "source": [ + "After we export the ensemble, we are ready to start the Triton Inference Server. 
The server is installed in the Merlin TensorFlow and Merlin PyTorch containers. If you are not using one of our containers, then ensure it is installed in your environment. For more information, see the Triton Inference Server [documentation](https://github.com/triton-inference-server/server/blob/r22.03/README.md#documentation).\n", + "\n", + "You can start the server by running the following command:\n", + "\n", + "```shell\n", + "tritonserver --model-repository=ensemble\n", + "```\n", + "\n", + "For the `--model-repository` argument, specify the same value as the `export_path` that you specified previously in the `ensemble.export` method.\n", + "\n", + "After you run the `tritonserver` command, wait until your terminal shows messages like the following example:\n", + "\n", + "```shell\n", + "I0414 18:29:50.741833 4067 grpc_server.cc:4421] Started GRPCInferenceService at 0.0.0.0:8001\n", + "I0414 18:29:50.742197 4067 http_server.cc:3113] Started HTTPService at 0.0.0.0:8000\n", + "I0414 18:29:50.783470 4067 http_server.cc:178] Started Metrics Service at 0.0.0.0:8002\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "65b7e4e8", + "metadata": {}, + "source": [ + "## Retrieving Recommendations from Triton Inference Server\n", + "\n", + "Now that our server is running, we can send requests to it. Each request is composed of values that correspond to the request schema that was created when we exported the ensemble graph.\n", + "\n", + "In the code below, we create a request and send it to Triton. We then analyze the response to show the full round trip.\n", + "\n", + "We begin by obtaining 10 examples from our train data to include in the request." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "2d61751b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
movieIduserIdgenresTE_movieId_ratinguserId_countgenderzip_coderatingrating_binaryagetitle
0777430.7798765.572154177511Toy Story (1995)
12317713-0.8966195.572154177301GoldenEye (1995)
23667717-0.9546325.572154177411Four Rooms (1995)
3967789-0.0938095.572154177301Get Shorty (1995)
43837725-0.5393765.572154177301Copycat (1995)
\n", + "
" + ], + "text/plain": [ + " movieId userId genres TE_movieId_rating userId_count gender zip_code \\\n", + "0 7 77 43 0.779876 5.572154 1 77 \n", + "1 231 77 13 -0.896619 5.572154 1 77 \n", + "2 366 77 17 -0.954632 5.572154 1 77 \n", + "3 96 77 89 -0.093809 5.572154 1 77 \n", + "4 383 77 25 -0.539376 5.572154 1 77 \n", + "\n", + " rating rating_binary age title \n", + "0 5 1 1 Toy Story (1995) \n", + "1 3 0 1 GoldenEye (1995) \n", + "2 4 1 1 Four Rooms (1995) \n", + "3 3 0 1 Get Shorty (1995) \n", + "4 3 0 1 Copycat (1995) " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ten_examples = train.compute()\n", + "ten_examples.head()" + ] + }, + { + "cell_type": "markdown", + "id": "7808bc12", + "metadata": {}, + "source": [ + "Let's now package the information up as inputs and send it to Triton for inference." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "2fefd5b8", + "metadata": {}, + "outputs": [], + "source": [ + "from merlin.systems.triton import convert_df_to_triton_input\n", + "import tritonclient.grpc as grpcclient\n", + "\n", + "ten_examples = train.compute().drop(columns=['rating', 'title', 'rating_binary'])[:10]\n", + "inputs = convert_df_to_triton_input(inf_workflow.input_schema, ten_examples, grpcclient.InferInput)\n", + "\n", + "outputs = [\n", + " grpcclient.InferRequestedOutput(col)\n", + " for col in inf_ops.output_schema.column_names\n", + "]\n", + "# send request to tritonserver\n", + "with grpcclient.InferenceServerClient(\"localhost:8001\") as client:\n", + " response = client.infer(\"executor_model\", inputs, outputs=outputs)" + ] + }, + { + "cell_type": "markdown", + "id": "3dc7909f", + "metadata": {}, + "source": [ + "We can now compare the predictions from the server to those from our local model." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "6ddd35cc", + "metadata": {}, + "outputs": [], + "source": [ + "predictions_from_triton = response.as_numpy(outputs[0].name())" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "6f28fdfe", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.8/dist-packages/distributed/node.py:180: UserWarning: Port 8787 is already in use.\n", + "Perhaps you already have a cluster running?\n", + "Hosting the HTTP server on port 35647 instead\n", + " warnings.warn(\n", + "2022-08-05 22:28:22,197 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize\n" + ] + } + ], + "source": [ + "with Distributed():\n", + " local_predictions = model.predict(train_transformed)[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "e946de27", + "metadata": {}, + "outputs": [], + "source": [ + "assert np.allclose(predictions_from_triton, local_predictions)" + ] + }, + { + "cell_type": "markdown", + "id": "d8aa4456", + "metadata": {}, + "source": [ + "We managed to preprocess the data in the same way in serving as we did during training and obtain the same predictions!" 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tests/integration/examples/quick_start/test_preproc.py b/tests/integration/examples/quick_start/test_preproc.py new file mode 100644 index 000000000..73204271d --- /dev/null +++ b/tests/integration/examples/quick_start/test_preproc.py @@ -0,0 +1,292 @@ +import os +import tempfile + +import cudf +import numpy as np +import pytest +from examples.quick_start.scripts.preproc.preprocessing import PreprocessingRunner +from merlin.schema import Tags +from merlin.schema.io.tensorflow_metadata import TensorflowMetadata + +STANDARD_CI_TENREC_DATA_PATH = "/raid/data/tenrec_ci/" + + +def kwargs_to_cli_ags(**kwargs): + cli_args = [] + for k, v in kwargs.items(): + cli_args.append(f"--{k}") + if v is not None: + cli_args.append(str(v)) + args = PreprocessingRunner.parse_cli_args(cli_args) + return args + + +@pytest.fixture +def tenrec_data_path(): + data_path = os.getenv("CI_TENREC_DATA_PATH", STANDARD_CI_TENREC_DATA_PATH) + return data_path + + +def get_schema_from_path(path): + tf_metadata = TensorflowMetadata.from_proto_text_file(str(path)) + schema = tf_metadata.to_merlin_schema() + return schema + + +def check_schema(path, categ_cols_max_values=None): + schema = get_schema_from_path(path) + assert set(schema.column_names) == set( + [ + "user_id", + "item_id", + "video_category", + "gender", + "age", + "click", + "follow", + "like", + "share", + "watching_times", + "TE_user_id_follow", + "TE_item_id_follow", + "TE_user_id_click", + "TE_item_id_click", + ] + ) + + assert set(schema.select_by_tag(Tags.USER_ID).column_names) == set(["user_id"]) + assert set(schema.select_by_tag(Tags.ITEM_ID).column_names) == set(["item_id"]) + assert set(schema.select_by_tag(Tags.CATEGORICAL).column_names) == set( + ["user_id", "item_id", "video_category", "gender"] + ) + target_encoding_feats = [ + "TE_user_id_click", + "TE_user_id_follow", + "TE_item_id_click", + "TE_item_id_follow", + ] + assert set(schema.select_by_tag(Tags.CONTINUOUS).column_names) == set( + ["age"] + target_encoding_feats + ) + assert set(schema.select_by_tag(Tags.BINARY_CLASSIFICATION).column_names) == set( + ["click", "follow", "like", "share"] + ) + assert set(schema.select_by_tag(Tags.REGRESSION).column_names) == set( + ["watching_times"] + ) + assert set(schema.select_by_tag(Tags.TARGET).column_names) == set( + ["click", "follow", "like", "share", "watching_times"] + ) + + if categ_cols_max_values: + categ_features = schema.select_by_tag(Tags.CATEGORICAL).column_names + for col in categ_features: + assert schema[col].int_domain.max == categ_cols_max_values[col] + + return schema + + +@pytest.mark.parametrize("use_dask_cluster", [True, False]) +def test_ranking_preprocessing(tenrec_data_path, use_dask_cluster): + with tempfile.TemporaryDirectory() as tmp_output_folder: + additional_kwargs = {} + if use_dask_cluster: + additional_kwargs["enable_dask_cuda_cluster"] = None + additional_kwargs["persist_intermediate_files"] = None + + args = kwargs_to_cli_ags( + data_path=os.path.join(tenrec_data_path, "raw/QK-video-10M.csv"), + input_data_format="csv", + csv_na_values="\\N", + 
output_path=tmp_output_folder, + categorical_features="user_id,item_id,video_category,gender", + continuous_features="age", + target_encoding_features="user_id,item_id", + target_encoding_targets="click,follow", + binary_classif_targets="click,follow,like,share", + regression_targets="watching_times", + to_int32="user_id,item_id", + to_int16="watching_times", + to_int8="gender,age,video_category,click,follow,like,share", + user_id_feature="user_id", + item_id_feature="item_id", + **additional_kwargs, + ) + runner = PreprocessingRunner(args) + runner.run() + + expected_max_values = { + "user_id": 296088, + "item_id": 617033, + "video_category": 2, + "gender": 3, + "click": 1, + "follow": 1, + "like": 1, + "share": 1, + "watching_times": 528, + } + + schema = check_schema( + os.path.join(tmp_output_folder, "train/"), expected_max_values + ) + + expected_dtypes = { + "user_id": np.dtype("int64"), + "item_id": np.dtype("int64"), + "video_category": np.dtype("int64"), + "gender": np.dtype("int64"), + "age": np.dtype("float64"), + "TE_user_id_click": np.dtype("float32"), + "TE_item_id_click": np.dtype("float32"), + "TE_user_id_follow": np.dtype("float32"), + "TE_item_id_follow": np.dtype("float32"), + "click": np.dtype("int8"), + "follow": np.dtype("int8"), + "like": np.dtype("int8"), + "share": np.dtype("int8"), + "watching_times": np.dtype("int16"), + } + + train_df = cudf.read_parquet(os.path.join(tmp_output_folder, "train/*.parquet")) + assert not train_df.isna().max().max() # Check if there are null values + assert len(train_df) == 10000000 # row count + + assert train_df.dtypes.to_dict() == expected_dtypes + + categ_features = schema.select_by_tag(Tags.CATEGORICAL).column_names + target_features = schema.select_by_tag(Tags.TARGET).column_names + assert ( + train_df[categ_features + target_features].max().to_dict() + == expected_max_values + ) + + # Checking age standardization + assert 0.0 == pytest.approx(train_df["age"].mean(), abs=1e-3) + assert 1.0 == pytest.approx(train_df["age"].std(), abs=1e-3) + + # Check target encoding features + te_features = [ + "TE_user_id_follow", + "TE_item_id_follow", + "TE_user_id_click", + "TE_item_id_click", + ] + assert ( + train_df[te_features].min().min() >= 0 + and train_df[te_features].max().max() <= 1 + ) + + +@pytest.mark.parametrize("split_strategy", ["random", "random_by_user", "temporal"]) +def test_ranking_preprocessing_split_strategies(tenrec_data_path, split_strategy): + with tempfile.TemporaryDirectory() as tmp_output_folder: + additional_kwargs = {} + if split_strategy in ["random", "random_by_user"]: + additional_kwargs["random_split_eval_perc"] = 0.2 + elif split_strategy == "temporal": + additional_kwargs["timestamp_feature"] = "item_id" + additional_kwargs["dataset_split_temporal_timestamp"] = 15000 + + args = kwargs_to_cli_ags( + data_path=os.path.join(tenrec_data_path, "raw/QK-video-10M.csv"), + input_data_format="csv", + csv_na_values="\\N", + output_path=tmp_output_folder, + categorical_features="user_id,item_id,video_category,gender,age", + binary_classif_targets="click,follow,like,share", + regression_targets="watching_times", + to_int32="user_id,item_id", + to_int16="watching_times", + to_int8="gender,age,video_category,click,follow,like,share", + user_id_feature="user_id", + item_id_feature="item_id", + dataset_split_strategy=split_strategy, + **additional_kwargs, + ) + runner = PreprocessingRunner(args) + runner.run() + + total_rows = 10000000 + + train_df = cudf.read_parquet(os.path.join(tmp_output_folder, "train/*.parquet")) 
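+        # Tally the rows in each split; the assertions below validate these counts against the requested strategy.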
+ rows_train = len(train_df) + + eval_df = cudf.read_parquet(os.path.join(tmp_output_folder, "eval/*.parquet")) + rows_eval = len(eval_df) + + assert rows_train + rows_eval == total_rows + + if split_strategy in ["random", "random_by_user"]: + assert 0.20 == pytest.approx(rows_eval / float(total_rows), abs=0.02) + + if split_strategy == "random_by_user": + assert train_df["user_id"].nunique() == pytest.approx( + eval_df["user_id"].nunique(), rel=0.05 + ) + + elif split_strategy == "temporal": + assert rows_train == 4636381 + assert rows_eval == 5363619 + + +def test_ranking_preprocessing_filter_strategies(tenrec_data_path): + with tempfile.TemporaryDirectory() as tmp_output_folder: + args = kwargs_to_cli_ags( + data_path=os.path.join(tenrec_data_path, "raw/QK-video-10M.csv"), + input_data_format="csv", + csv_na_values="\\N", + output_path=tmp_output_folder, + categorical_features="user_id,item_id,video_category,gender,age", + binary_classif_targets="click,follow,like,share", + regression_targets="watching_times", + to_int32="user_id,item_id", + to_int16="watching_times", + to_int8="gender,age,video_category,click,follow,like,share", + user_id_feature="user_id", + item_id_feature="item_id", + filter_query="click==1 or (click==0 and follow==0 and like==0 and share==0)", + min_item_freq=5, + min_user_freq=5, + max_user_freq=200, + num_max_rounds_filtering=5, + ) + runner = PreprocessingRunner(args) + runner.run() + + total_rows = 9102904 + + train_df = cudf.read_parquet(os.path.join(tmp_output_folder, "train/*.parquet")) + assert len(train_df) == total_rows + + assert train_df.groupby("item_id").size().min() >= 5 + assert train_df.groupby("user_id").size().min() >= 5 + + +def test_ranking_preprocessing_freq_capping(tenrec_data_path): + with tempfile.TemporaryDirectory() as tmp_output_folder: + args = kwargs_to_cli_ags( + data_path=os.path.join(tenrec_data_path, "raw/QK-video-10M.csv"), + input_data_format="csv", + csv_na_values="\\N", + output_path=tmp_output_folder, + categorical_features="user_id,item_id,video_category,gender,age", + binary_classif_targets="click,follow,like,share", + regression_targets="watching_times", + to_int32="user_id,item_id", + to_int16="watching_times", + to_int8="gender,age,video_category,click,follow,like,share", + user_id_feature="user_id", + item_id_feature="item_id", + categ_min_freq_capping=30, + ) + runner = PreprocessingRunner(args) + runner.run() + + total_rows = 10000000 + + train_df = cudf.read_parquet(os.path.join(tmp_output_folder, "train/*.parquet")) + assert len(train_df) == total_rows + + assert train_df.groupby("item_id").size().min() >= 30 + assert train_df.groupby("user_id").size().min() >= 30 diff --git a/tests/integration/examples/quick_start/test_ranking.py b/tests/integration/examples/quick_start/test_ranking.py new file mode 100644 index 000000000..7953e4254 --- /dev/null +++ b/tests/integration/examples/quick_start/test_ranking.py @@ -0,0 +1,502 @@ +import os +import tempfile +import time + +import pytest +from examples.quick_start.scripts.ranking.ranking import RankingTrainEvalRunner +from merlin.io.dataset import Dataset + +STANDARD_CI_TENREC_DATA_PATH = "/raid/data/tenrec_ci/" + + +@pytest.fixture +def tenrec_data_path(): + data_path = os.getenv("CI_TENREC_DATA_PATH", STANDARD_CI_TENREC_DATA_PATH) + return data_path + + +def kwargs_to_cli_ags(**kwargs): + cli_args = [] + for k, v in kwargs.items(): + cli_args.append(f"--{k}") + if v is not None: + cli_args.append(str(v)) + args = RankingTrainEvalRunner.parse_cli_args(cli_args) + return 
args + + +def get_datasets(path): + train_ds = Dataset(os.path.join(path, "preproc/train/*.parquet"), part_size="500MB") + eval_ds = Dataset(os.path.join(path, "preproc/eval/*.parquet"), part_size="500MB") + return train_ds, eval_ds + + +def test_ranking_single_task_mlp(tenrec_data_path): + args = kwargs_to_cli_ags() + train_ds, eval_ds = get_datasets(tenrec_data_path) + + with tempfile.TemporaryDirectory() as tmp_output_folder: + args = kwargs_to_cli_ags( + output_path=tmp_output_folder, + tasks="click", + stl_positive_class_weight=3, + model="mlp", + mlp_layers="64,32", + embedding_sizes_multiplier=6, + l2_reg=1e-6, + embeddings_l2_reg=1e-8, + dropout=0.05, + lr=1e-4, + lr_decay_rate=0.99, + lr_decay_steps=100, + train_batch_size=8192, + eval_batch_size=8192, + epoch=3, + ) + + runner = RankingTrainEvalRunner(args, train_ds, train_ds, None, logger=None) + + current_time = time.time() + + metrics = runner.run() + + elapsed_time = time.time() - current_time + + assert set(metrics.keys()) == set( + ["loss", "auc", "prauc", "logloss", "regularization_loss", "loss_batch"] + ) + + assert metrics["loss"] < 0.7 + assert metrics["logloss"] < 0.7 + assert metrics["auc"] > 0.75 + assert metrics["prauc"] > 0.60 + assert metrics["regularization_loss"] > 0.0 + assert metrics["loss_batch"] < 0.8 + # assert elapsed_time < 60 # 23s in a V100 + + +def test_ranking_single_task_dlrm(tenrec_data_path): + args = kwargs_to_cli_ags() + train_ds, eval_ds = get_datasets(tenrec_data_path) + + with tempfile.TemporaryDirectory() as tmp_output_folder: + args = kwargs_to_cli_ags( + output_path=tmp_output_folder, + tasks="click", + stl_positive_class_weight=3, + model="dlrm", + embeddings_dim=64, + l2_reg=1e-6, + embeddings_l2_reg=1e-8, + dropout=0.05, + mlp_layers="64,32", + lr=1e-4, + lr_decay_rate=0.99, + lr_decay_steps=100, + train_batch_size=8192, + eval_batch_size=8192, + epoch=3, + ) + + runner = RankingTrainEvalRunner(args, train_ds, train_ds, None, logger=None) + + current_time = time.time() + + metrics = runner.run() + + elapsed_time = time.time() - current_time + + assert set(metrics.keys()) == set( + ["loss", "auc", "prauc", "logloss", "regularization_loss", "loss_batch"] + ) + + assert metrics["loss"] < 0.7 + assert metrics["logloss"] < 0.7 + assert metrics["auc"] > 0.75 + assert metrics["prauc"] > 0.60 + assert metrics["regularization_loss"] > 0.0 + assert metrics["loss_batch"] < 0.8 + # assert elapsed_time < 60 + + +def test_ranking_single_task_dcn(tenrec_data_path): + args = kwargs_to_cli_ags() + train_ds, eval_ds = get_datasets(tenrec_data_path) + + with tempfile.TemporaryDirectory() as tmp_output_folder: + args = kwargs_to_cli_ags( + output_path=tmp_output_folder, + tasks="click", + stl_positive_class_weight=3, + model="dcn", + dcn_interacted_layer_num=5, + mlp_layers="64,32", + embedding_sizes_multiplier=6, + l2_reg=1e-6, + embeddings_l2_reg=1e-8, + dropout=0.05, + lr=1e-4, + lr_decay_rate=0.99, + lr_decay_steps=100, + train_batch_size=8192, + eval_batch_size=8192, + epoch=3, + ) + + runner = RankingTrainEvalRunner(args, train_ds, train_ds, None, logger=None) + + current_time = time.time() + + metrics = runner.run() + + elapsed_time = time.time() - current_time + + assert set(metrics.keys()) == set( + ["loss", "auc", "prauc", "logloss", "regularization_loss", "loss_batch"] + ) + + assert metrics["loss"] < 0.7 + assert metrics["logloss"] < 0.7 + assert metrics["auc"] > 0.75 + assert metrics["prauc"] > 0.60 + assert metrics["regularization_loss"] > 0.0 + assert metrics["loss_batch"] < 0.8 + # assert 
elapsed_time < 60 + + +def test_ranking_single_task_wide_n_deep(tenrec_data_path): + args = kwargs_to_cli_ags() + train_ds, eval_ds = get_datasets(tenrec_data_path) + + with tempfile.TemporaryDirectory() as tmp_output_folder: + args = kwargs_to_cli_ags( + output_path=tmp_output_folder, + tasks="click", + stl_positive_class_weight=3, + model="wide_n_deep", + wnd_hashed_cross_num_bins=5000, + wnd_ignore_combinations="item_id:video_category,user_id:gender,user_id:age", + wnd_wide_l2_reg=1e-5, + mlp_layers="64,32", + embedding_sizes_multiplier=6, + l2_reg=1e-6, + embeddings_l2_reg=1e-8, + dropout=0.05, + lr=1e-4, + lr_decay_rate=0.99, + lr_decay_steps=100, + train_batch_size=8192, + eval_batch_size=8192, + epoch=3, + ) + + runner = RankingTrainEvalRunner(args, train_ds, train_ds, None, logger=None) + + current_time = time.time() + + metrics = runner.run() + + elapsed_time = time.time() - current_time + + assert set(metrics.keys()) == set( + ["loss", "auc", "prauc", "logloss", "regularization_loss", "loss_batch"] + ) + + assert metrics["loss"] < 0.7 + assert metrics["logloss"] < 0.7 + assert metrics["auc"] > 0.75 + assert metrics["prauc"] > 0.60 + assert metrics["regularization_loss"] > 0.0 + assert metrics["loss_batch"] < 0.8 + # assert elapsed_time < 60 + + +def test_ranking_single_task_deepfm(tenrec_data_path): + args = kwargs_to_cli_ags() + train_ds, eval_ds = get_datasets(tenrec_data_path) + + with tempfile.TemporaryDirectory() as tmp_output_folder: + args = kwargs_to_cli_ags( + output_path=tmp_output_folder, + tasks="click", + stl_positive_class_weight=3, + model="deepfm", + mlp_layers="64,32", + embedding_sizes_multiplier=6, + l2_reg=1e-6, + embeddings_l2_reg=1e-8, + dropout=0.05, + lr=1e-4, + lr_decay_rate=0.99, + lr_decay_steps=100, + train_batch_size=8192, + eval_batch_size=8192, + epoch=3, + ) + + runner = RankingTrainEvalRunner(args, train_ds, train_ds, None, logger=None) + + current_time = time.time() + + metrics = runner.run() + + elapsed_time = time.time() - current_time + + assert set(metrics.keys()) == set( + ["loss", "auc", "prauc", "logloss", "regularization_loss", "loss_batch"] + ) + + assert metrics["loss"] < 0.7 + assert metrics["logloss"] < 0.7 + assert metrics["auc"] > 0.75 + assert metrics["prauc"] > 0.60 + assert metrics["regularization_loss"] > 0.0 + assert metrics["loss_batch"] < 0.8 + # assert elapsed_time < 120 + + +def test_ranking_multi_task_dlrm(tenrec_data_path): + args = kwargs_to_cli_ags() + train_ds, eval_ds = get_datasets(tenrec_data_path) + + with tempfile.TemporaryDirectory() as tmp_output_folder: + args = kwargs_to_cli_ags( + output_path=tmp_output_folder, + tasks="click,follow,watching_times", + mtl_pos_class_weight_click=1, + mtl_pos_class_weight_like=2, + mtl_loss_weight_click=1, + mtl_loss_weight_follow=2, + mtl_loss_weight_watching_times=5, + use_task_towers=True, + tower_layers=64, + model="dlrm", + embeddings_dim=64, + l2_reg=1e-6, + embeddings_l2_reg=1e-8, + dropout=0.05, + mlp_layers="64,32", + lr=1e-4, + lr_decay_rate=0.99, + lr_decay_steps=100, + train_batch_size=8192, + eval_batch_size=8192, + epoch=3, + ) + + runner = RankingTrainEvalRunner(args, train_ds, train_ds, None, logger=None) + + current_time = time.time() + + metrics = runner.run() + + elapsed_time = time.time() - current_time + + assert set(metrics.keys()) == set( + [ + "loss", + "click/binary_output_loss", + "follow/binary_output_loss", + "click/binary_output/auc", + "click/binary_output/prauc", + "click/binary_output/logloss", + "follow/binary_output/auc", + 
"follow/binary_output/prauc", + "follow/binary_output/logloss", + "watching_times/regression_output_loss", + "watching_times/regression_output/root_mean_squared_error", + "regularization_loss", + "loss_batch", + ] + ) + + assert metrics["loss"] < 4 + assert metrics["click/binary_output_loss"] < 0.7 + assert metrics["follow/binary_output_loss"] < 0.1 + assert metrics["watching_times/regression_output_loss"] < 0.6 + assert metrics["click/binary_output/auc"] > 0.65 + assert metrics["click/binary_output/prauc"] > 0.5 + assert metrics["click/binary_output/logloss"] < 0.65 + assert metrics["follow/binary_output/auc"] > 0.35 + assert metrics["follow/binary_output/prauc"] > 0 + assert metrics["follow/binary_output/logloss"] < 0.1 + assert metrics["watching_times/regression_output/root_mean_squared_error"] < 0.8 + assert metrics["regularization_loss"] > 0.0 + assert metrics["loss_batch"] > 0.0 + # assert elapsed_time < 60 + + +def test_ranking_multi_task_mmoe(tenrec_data_path): + args = kwargs_to_cli_ags() + train_ds, eval_ds = get_datasets(tenrec_data_path) + + with tempfile.TemporaryDirectory() as tmp_output_folder: + args = kwargs_to_cli_ags( + output_path=tmp_output_folder, + tasks="click,follow,watching_times", + mtl_pos_class_weight_click=1, + mtl_pos_class_weight_like=2, + mtl_loss_weight_click=1, + mtl_loss_weight_follow=2, + mtl_loss_weight_watching_times=5, + use_task_towers=True, + tower_layers=64, + model="mmoe", + mmoe_num_mlp_experts=4, + embedding_sizes_multiplier=5, + l2_reg=1e-6, + embeddings_l2_reg=1e-8, + dropout=0.05, + mlp_layers="64,32", + lr=1e-4, + lr_decay_rate=0.99, + lr_decay_steps=100, + train_batch_size=8192, + eval_batch_size=8192, + epoch=3, + ) + + runner = RankingTrainEvalRunner(args, train_ds, train_ds, None, logger=None) + + current_time = time.time() + + metrics = runner.run() + + elapsed_time = time.time() - current_time + + assert set(metrics.keys()) == set( + [ + "loss", + "click/binary_output_loss", + "follow/binary_output_loss", + "click/binary_output/auc", + "click/binary_output/prauc", + "click/binary_output/logloss", + "follow/binary_output/auc", + "follow/binary_output/prauc", + "follow/binary_output/logloss", + "watching_times/regression_output_loss", + "watching_times/regression_output/root_mean_squared_error", + "regularization_loss", + "loss_batch", + "gate_click/binary_output_weight_0", + "gate_click/binary_output_weight_1", + "gate_click/binary_output_weight_2", + "gate_click/binary_output_weight_3", + "gate_follow/binary_output_weight_0", + "gate_follow/binary_output_weight_1", + "gate_follow/binary_output_weight_2", + "gate_follow/binary_output_weight_3", + "gate_watching_times/regression_output_weight_0", + "gate_watching_times/regression_output_weight_1", + "gate_watching_times/regression_output_weight_2", + "gate_watching_times/regression_output_weight_3", + ] + ) + + assert metrics["loss"] < 4 + assert metrics["click/binary_output_loss"] < 0.7 + assert metrics["follow/binary_output_loss"] < 0.1 + assert metrics["watching_times/regression_output_loss"] < 0.6 + assert metrics["click/binary_output/auc"] > 0.65 + assert metrics["click/binary_output/prauc"] > 0.5 + assert metrics["click/binary_output/logloss"] < 0.65 + assert metrics["follow/binary_output/auc"] > 0.35 + assert metrics["follow/binary_output/prauc"] > 0 + assert metrics["follow/binary_output/logloss"] < 0.1 + assert metrics["watching_times/regression_output/root_mean_squared_error"] < 0.8 + assert metrics["regularization_loss"] > 0.0 + assert metrics["loss_batch"] > 0.0 + # assert 
elapsed_time < 60 + + +def test_ranking_multi_task_ple(tenrec_data_path): + args = kwargs_to_cli_ags() + train_ds, eval_ds = get_datasets(tenrec_data_path) + + with tempfile.TemporaryDirectory() as tmp_output_folder: + args = kwargs_to_cli_ags( + output_path=tmp_output_folder, + tasks="click,follow,watching_times", + mtl_pos_class_weight_click=1, + mtl_pos_class_weight_like=2, + mtl_loss_weight_click=1, + mtl_loss_weight_follow=2, + mtl_loss_weight_watching_times=5, + ple_num_layers=2, + use_task_towers=True, + tower_layers=64, + model="ple", + cgc_num_shared_experts=3, + cgc_num_task_experts=1, + embedding_sizes_multiplier=5, + l2_reg=1e-6, + embeddings_l2_reg=1e-8, + dropout=0.05, + mlp_layers="64,32", + lr=1e-4, + lr_decay_rate=0.99, + lr_decay_steps=100, + train_batch_size=8192, + eval_batch_size=8192, + epoch=3, + ) + + runner = RankingTrainEvalRunner(args, train_ds, train_ds, None, logger=None) + + current_time = time.time() + + metrics = runner.run() + + elapsed_time = time.time() - current_time + + assert set(metrics.keys()) == set( + [ + "loss", + "click/binary_output_loss", + "follow/binary_output_loss", + "click/binary_output/auc", + "click/binary_output/prauc", + "click/binary_output/logloss", + "follow/binary_output/auc", + "follow/binary_output/prauc", + "follow/binary_output/logloss", + "watching_times/regression_output_loss", + "watching_times/regression_output/root_mean_squared_error", + "regularization_loss", + "loss_batch", + "gate_click/binary_output_weight_0", + "gate_click/binary_output_weight_1", + "gate_click/binary_output_weight_2", + "gate_click/binary_output_weight_3", + "gate_follow/binary_output_weight_0", + "gate_follow/binary_output_weight_1", + "gate_follow/binary_output_weight_2", + "gate_follow/binary_output_weight_3", + "shared_gate_weight_0", + "shared_gate_weight_1", + "shared_gate_weight_2", + "shared_gate_weight_3", + "shared_gate_weight_4", + "shared_gate_weight_5", + "gate_watching_times/regression_output_weight_0", + "gate_watching_times/regression_output_weight_1", + "gate_watching_times/regression_output_weight_2", + "gate_watching_times/regression_output_weight_3", + ] + ) + + assert metrics["loss"] < 4 + assert metrics["click/binary_output_loss"] < 0.7 + assert metrics["follow/binary_output_loss"] < 0.1 + assert metrics["watching_times/regression_output_loss"] < 0.6 + assert metrics["click/binary_output/auc"] > 0.65 + assert metrics["click/binary_output/prauc"] > 0.5 + assert metrics["click/binary_output/logloss"] < 0.65 + assert metrics["follow/binary_output/auc"] > 0.35 + assert metrics["follow/binary_output/prauc"] > 0 + assert metrics["follow/binary_output/logloss"] < 0.1 + assert metrics["watching_times/regression_output/root_mean_squared_error"] < 0.8 + assert metrics["regularization_loss"] > 0.0 + assert metrics["loss_batch"] > 0.0 + # assert elapsed_time < 60 diff --git a/tests/integration/examples/test_ci_building_deploying_multi_stage_RecSys.py b/tests/integration/examples/test_ci_building_deploying_multi_stage_RecSys.py index be647c654..8e3733811 100644 --- a/tests/integration/examples/test_ci_building_deploying_multi_stage_RecSys.py +++ b/tests/integration/examples/test_ci_building_deploying_multi_stage_RecSys.py @@ -2,7 +2,6 @@ import pytest from testbook import testbook - from tests.conftest import REPO_ROOT pytest.importorskip("tensorflow") diff --git a/tests/integration/examples/test_serving_an_implicit_model_with_merlin_systems.py b/tests/integration/examples/test_serving_an_implicit_model_with_merlin_systems.py new file mode 
100644 index 000000000..540c79c93 --- /dev/null +++ b/tests/integration/examples/test_serving_an_implicit_model_with_merlin_systems.py @@ -0,0 +1,59 @@ +import shutil + +import pytest +from testbook import testbook + +from merlin.systems.triton.utils import run_triton_server +from merlin.core.compat import cudf +from tests.conftest import REPO_ROOT + +pytest.importorskip("implicit") +pytest.importorskip("merlin.models") + + +if cudf: + + _TRAIN_ON_GPU = [True, False] +else: + _TRAIN_ON_GPU = [False] + +TRITON_SERVER_PATH = shutil.which("tritonserver") + + +@pytest.mark.notebook +@pytest.mark.skipif(not TRITON_SERVER_PATH, reason="triton server not found") +@pytest.mark.parametrize("gpu", _TRAIN_ON_GPU) +def test_example_serving_implicit(gpu, tmpdir): + with testbook( + REPO_ROOT / "examples/traditional-ml/Serving-An-Implicit-Model-With-Merlin-Systems.ipynb", + execute=False, + timeout=180, + ) as tb: + tb.inject( + f""" + import os + os.environ["OUTPUT_DATA_DIR"] = "{tmpdir}/ensemble" + os.environ["USE_GPU"] = "{int(gpu)}" + from unittest.mock import patch + from merlin.datasets.synthetic import generate_data + mock_train, mock_valid = generate_data( + input="movielens-100k", + num_rows=1000, + set_sizes=(0.8, 0.2) + ) + p1 = patch( + "merlin.datasets.entertainment.get_movielens", + return_value=[mock_train, mock_valid] + ) + p1.start() + """, + pop=True, + ) + + tb.execute_cell(list(range(0, 18))) + + with run_triton_server(f"{tmpdir}/ensemble", grpc_port=8001): + tb.execute_cell(list(range(18, len(tb.cells) - 2))) + pft = tb.ref("predictions_from_triton") + lp = tb.ref("local_predictions") + assert pft.shape == lp.shape diff --git a/tests/integration/examples/test_serving_an_xgboost_model_with_merlin_systems.py b/tests/integration/examples/test_serving_an_xgboost_model_with_merlin_systems.py new file mode 100644 index 000000000..684a6f8a3 --- /dev/null +++ b/tests/integration/examples/test_serving_an_xgboost_model_with_merlin_systems.py @@ -0,0 +1,50 @@ +import shutil + +import pytest +from testbook import testbook + +from merlin.systems.triton.utils import run_triton_server +from tests.conftest import REPO_ROOT + +pytest.importorskip("tensorflow") +pytest.importorskip("merlin.models") +pytest.importorskip("xgboost") + +TRITON_SERVER_PATH = shutil.which("tritonserver") + + +@pytest.mark.skipif(not TRITON_SERVER_PATH, reason="triton server not found") +@pytest.mark.notebook +def test_example_serving_xgboost(tmpdir): + with testbook( + REPO_ROOT / "examples/traditional-ml/Serving-An-XGboost-Model-With-Merlin-Systems.ipynb", + execute=False, + timeout=180, + ) as tb: + tb.inject( + f""" + import os + os.environ["OUTPUT_DATA_DIR"] = "{tmpdir}/ensemble" + from unittest.mock import patch + from merlin.datasets.synthetic import generate_data + mock_train, mock_valid = generate_data( + input="movielens-100k", + num_rows=1000, + set_sizes=(0.8, 0.2) + ) + p1 = patch( + "merlin.datasets.entertainment.get_movielens", + return_value=[mock_train, mock_valid] + ) + p1.start() + """ + ) + NUM_OF_CELLS = len(tb.cells) + + tb.execute_cell(list(range(0, 14))) + + with run_triton_server(f"{tmpdir}/ensemble", grpc_port=8001): + tb.execute_cell(list(range(14, NUM_OF_CELLS - 1))) + pft = tb.ref("predictions_from_triton") + lp = tb.ref("local_predictions") + assert pft.shape == lp.shape diff --git a/tests/integration/examples/test_serving_ranking_models_with_merlin_systems.py b/tests/integration/examples/test_serving_ranking_models_with_merlin_systems.py new file mode 100644 index 000000000..086d6c80c --- 
/dev/null +++ b/tests/integration/examples/test_serving_ranking_models_with_merlin_systems.py @@ -0,0 +1,47 @@ +import os +import shutil + +import pytest +from testbook import testbook + +from merlin.systems.triton.utils import run_triton_server + +from tests.conftest import REPO_ROOT + +pytest.importorskip("cudf") +pytest.importorskip("tensorflow") +pytest.importorskip("merlin.models") + +TRITON_SERVER_PATH = shutil.which("tritonserver") + + +@pytest.mark.notebook +@pytest.mark.skipif(not TRITON_SERVER_PATH, reason="triton server not found") +def test_serving_ranking_models(tmp_path): + with testbook( + REPO_ROOT / "examples/ranking/tf/Training-and-Deploying-DLRM-model-with-Models-and-Systems.ipynb", + execute=False, + timeout=180, + ) as tb: + tb.inject( + f""" + import os + os.environ["DATA_FOLDER"] = "{tmp_path}" + os.environ["NUM_ROWS"] = "2000" + """ + ) + NUM_OF_CELLS = len(tb.cells) + print("num_cells:", NUM_OF_CELLS) + tb.execute_cell(list(range(0, NUM_OF_CELLS - 12))) + assert os.path.isdir(f"{tmp_path}/dlrm") + assert os.path.isdir(f"{tmp_path}/ensemble") + assert os.listdir(f"{tmp_path}/ensemble") + assert os.path.isdir(f"{tmp_path}/workflow") + + with run_triton_server(f"{tmp_path}/ensemble", grpc_port=8001): + tb.execute_cell(list(range(50, NUM_OF_CELLS - 1))) + + preds = tb.ref("predictions") + assert len(preds) == 3 + + From edbd126359fa914d100624280aa5e141af3ca587 Mon Sep 17 00:00:00 2001 From: Matthias Langer Date: Wed, 5 Jul 2023 15:21:00 +0000 Subject: [PATCH 10/13] Allow cross-compiling on x86 + NVIDIA Grace (ARM64). --- docker/dockerfile.ctr | 9 +------ docker/dockerfile.merlin | 56 ++++++++++++++++++++++++++-------------- 2 files changed, 37 insertions(+), 28 deletions(-) diff --git a/docker/dockerfile.ctr b/docker/dockerfile.ctr index f9e767438..f79b5ed85 100644 --- a/docker/dockerfile.ctr +++ b/docker/dockerfile.ctr @@ -49,7 +49,7 @@ ENV HCOLL_ENABLE_MCAST=0 # link sub modules expected by hugectr cmake RUN ln -s /usr/lib/libcudf.so /usr/lib/libcudf_base.so RUN ln -s /usr/lib/libcudf.so /usr/lib/libcudf_io.so -RUN ln -s /usr/lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so +RUN ln -s libibverbs.so.1 $(find /usr/lib/*-linux-gnu/libibverbs.so.1 | sed -e 's/\.1$//g') # Install HugeCTR ARG HUGECTR_HOME=/usr/local/hugectr @@ -77,13 +77,6 @@ RUN if [[ "${HUGECTR_DEV_MODE}" == "false" ]]; then \ mv /hugectr/ci ~/hugectr-ci && rm -rf /hugectr && mkdir -p /hugectr && mv ~/hugectr-ci /hugectr/ci \ ; fi - -ENV PATH=$PATH:${HUGECTR_HOME}/bin \ - CPATH=$CPATH:${HUGECTR_HOME}/include \ - LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HUGECTR_HOME}/lib \ - PYTHONPATH=${PYTHONPATH}:${HUGECTR_HOME}/lib - - ARG _HUGECTR_BACKEND_REPO="github.com/triton-inference-server/hugectr_backend.git" ARG TRITON_VERSION # Install Triton inference backend. 
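For readers puzzling over the `find | sed` one-liner in the dockerfile.ctr hunk above: it locates the versioned `libibverbs.so.1` under whichever multiarch lib directory the target platform uses (`x86_64-linux-gnu` or `aarch64-linux-gnu`) and creates the unversioned `.so` symlink beside it. A rough Python equivalent of that shell logic, for illustration only:

```python
# Sketch of the arch-agnostic symlink step: find libibverbs.so.1 in any
# multiarch lib dir and point an unversioned libibverbs.so at it.
import glob
import os

for versioned in glob.glob("/usr/lib/*-linux-gnu/libibverbs.so.1"):
    unversioned = versioned[: -len(".1")]            # .../libibverbs.so
    if not os.path.lexists(unversioned):
        os.symlink("libibverbs.so.1", unversioned)   # relative target, like `ln -s`
```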
diff --git a/docker/dockerfile.merlin b/docker/dockerfile.merlin index 07a50b7e5..150ee9c0f 100644 --- a/docker/dockerfile.merlin +++ b/docker/dockerfile.merlin @@ -11,6 +11,9 @@ FROM ${DLFW_IMAGE} as dlfw FROM ${BASE_IMAGE} as build # Args +ARG TARGETOS +ARG TARGETARCH + ARG DASK_VER=2023.1.1 ARG MERLIN_VER=main ARG CORE_VER=main @@ -38,12 +41,13 @@ ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extra ENV PATH=${CUDA_HOME}/lib64/:${PATH}:${CUDA_HOME}/bin # Set up NVIDIA package repository -RUN apt clean && apt update -y --fix-missing && \ +RUN ARCH=$([ "${TARGETARCH}" = "arm64" ] && echo "sbsa" || echo "x86_64") && \ + apt clean && apt update -y --fix-missing && \ apt install -y --no-install-recommends software-properties-common && \ - wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin && \ + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${ARCH}/cuda-ubuntu2204.pin && \ mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \ - add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /" && \ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${ARCH}/3bf863cc.pub && \ + add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${ARCH}/ /" && \ apt install -y --no-install-recommends \ autoconf \ automake \ @@ -95,10 +99,11 @@ RUN pip install --no-cache-dir --upgrade pip; pip install --no-cache-dir "cmake< cachetools graphviz nvtx scipy "scikit-learn<1.2" \ tritonclient[all] grpcio-channelz fiddle wandb npy-append-array \ git+https://github.com/rapidsai/asvdb.git@main \ - xgboost==1.6.2 lightgbm treelite==2.4.0 treelite_runtime==2.4.0 \ + xgboost==1.6.2 lightgbm \ lightfm implicit \ numba "cuda-python>=11.5,<12.0" fsspec==2022.5.0 llvmlite \ pynvml==11.4.1 +RUN pip install --no-cache-dir treelite==2.4.0 treelite_runtime==2.4.0 RUN pip install --no-cache-dir numpy==1.22.4 protobuf==3.20.3 onnx onnxruntime pycuda RUN pip install --no-cache-dir dask==${DASK_VER} distributed==${DASK_VER} dask[dataframe]==${DASK_VER} RUN pip install --no-cache-dir onnx_graphsurgeon --index-url https://pypi.ngc.nvidia.com @@ -113,7 +118,8 @@ COPY --chown=1000:1000 --from=triton /opt/tritonserver/lib lib/ COPY --chown=1000:1000 --from=triton /opt/tritonserver/include include/ COPY --chown=1000:1000 --from=triton /opt/tritonserver/repoagents/ repoagents/ COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/python backends/ -COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/fil backends/fil/ +# NOTE 2023-07: fil-backend is not available on ARM. +COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/fil* backends/ COPY --chown=1000:1000 --from=triton /usr/bin/serve /usr/bin/. 
ENV PATH=/opt/tritonserver/bin:${PATH}: @@ -139,6 +145,10 @@ CMD ["/bin/bash"] FROM ${BASE_IMAGE} as base +# Args +ARG TARGETOS +ARG TARGETARCH + # Envs ENV CUDA_HOME=/usr/local/cuda ENV CUDA_PATH=$CUDA_HOME @@ -148,12 +158,13 @@ ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extra ENV PATH=${CUDA_HOME}/lib64/:${PATH}:${CUDA_HOME}/bin # Set up NVIDIA package repository -RUN apt update -y --fix-missing && \ +RUN ARCH=$([ "${TARGETARCH}" = "arm64" ] && echo "sbsa" || echo "x86_64") && \ + apt update -y --fix-missing && \ apt install -y --no-install-recommends software-properties-common && \ - wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin && \ + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${ARCH}/cuda-ubuntu2204.pin && \ mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \ - add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /" && \ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${ARCH}/3bf863cc.pub && \ + add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${ARCH}/ /" && \ apt install -y --no-install-recommends \ ca-certificates \ clang-format \ @@ -196,9 +207,12 @@ RUN apt update -y --fix-missing && \ # Required to run Hadoop. openssh-server \ # [ HugeCTR ] - libaio-dev \ + libaio-dev && \ + # NOTE: libnvinfer is installed either way; only its Python bindings are missing on ARM. + if [[ "$TARGETARCH" != "arm64" ]]; then \ # TensorRT dependencies - python3-libnvinfer && \ + apt install -y --no-install-recommends python3-libnvinfer \ + ; fi && \ apt autoremove -y && \ apt clean && \ rm -rf /var/lib/apt/lists/* @@ -222,24 +236,28 @@ COPY --chown=1000:1000 --from=triton /opt/tritonserver/bin bin/ COPY --chown=1000:1000 --from=triton /opt/tritonserver/lib lib/ COPY --chown=1000:1000 --from=triton /opt/tritonserver/include include/ COPY --chown=1000:1000 --from=triton /opt/tritonserver/repoagents/ repoagents/ -COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/python backends/python/ -COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/fil backends/fil/ -COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/tensorrt backends/tensorrt/ +COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/python backends/ +# NOTE 2023-07: fil-backend is not available on ARM. +COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/fil* backends/ +COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/tensorrt backends/ COPY --chown=1000:1000 --from=triton /usr/bin/serve /usr/bin/.
-COPY --chown=1000:1000 --from=triton /usr/lib/x86_64-linux-gnu/libdcgm.so.2 /usr/lib/x86_64-linux-gnu/libdcgm.so.2 -COPY --chown=1000:1000 --from=triton /usr/local/cuda-12.1/targets/x86_64-linux/lib/libcupti.so.12 /usr/local/cuda-12.1/targets/x86_64-linux/lib/libcupti.so.12 +COPY --chown=1000:1000 --from=triton /usr/lib/*-linux-gnu/libdcgm.so.2 /tmp +RUN ARCH=$([ "${TARGETARCH}" = "arm64" ] && echo "aarch64" || echo "x86_64") && \ + mv /tmp/libdcgm.so.2 /usr/lib/${ARCH}-linux-gnu/libdcgm.so.2 && \ + chmod 644 /usr/lib/${ARCH}-linux-gnu/libdcgm.so.2 && \ + ln -s libdcgm.so.2 /usr/lib/${ARCH}-linux-gnu/libdcgm.so ENV PATH=/opt/tritonserver/bin:${PATH}: ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/tritonserver/lib +# python --version | sed -e 's/[A-Za-z ]*//g' | awk -F'.' '{print $1"."$2}' ENV PYTHON_VERSION=3.10 # Python Packages COPY --chown=1000:1000 --from=build /usr/local/lib/python${PYTHON_VERSION}/dist-packages /usr/local/lib/python${PYTHON_VERSION}/dist-packages/ ENV PYTHONPATH=$PYTHONPATH:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/ - # rapids components from the DLFW image COPY --chown=1000:1000 --from=dlfw /usr/lib/libcudf* /usr/lib/ COPY --chown=1000:1000 --from=dlfw /usr/lib/libarrow* /usr/lib/ @@ -256,8 +274,6 @@ COPY --chown=1000:1000 --from=dlfw /usr/include/arrow /usr/include/arrow/ COPY --chown=1000:1000 --from=dlfw /usr/include/cudf /usr/include/cudf/ # ptx compiler required by cubinlinker -COPY --chown=1000:1000 --from=dlfw /usr/local/cuda-12.1/targets/x86_64-linux/lib/libnvptxcompiler_static.a /usr/local/cuda-12.1/targets/x86_64-linux/lib/libnvptxcompiler_static.a -COPY --chown=1000:1000 --from=dlfw /usr/local/cuda-12.1/targets/x86_64-linux/include/nvPTXCompiler.h /usr/local/cuda-12.1/targets/x86_64-linux/include/nvPTXCompiler.h RUN git clone https://github.com/rapidsai/ptxcompiler.git /ptx && cd /ptx/ && python setup.py develop; COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/rmm /usr/local/lib/python${PYTHON_VERSION}/dist-packages/rmm From 3b0b3757bc027690374beb3bcd85f5af10058867 Mon Sep 17 00:00:00 2001 From: Matthias Langer Date: Fri, 7 Jul 2023 13:46:35 +0000 Subject: [PATCH 11/13] Revert two incorrect changes. --- docker/dockerfile.merlin | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/dockerfile.merlin b/docker/dockerfile.merlin index 150ee9c0f..0d1669735 100644 --- a/docker/dockerfile.merlin +++ b/docker/dockerfile.merlin @@ -236,10 +236,10 @@ COPY --chown=1000:1000 --from=triton /opt/tritonserver/bin bin/ COPY --chown=1000:1000 --from=triton /opt/tritonserver/lib lib/ COPY --chown=1000:1000 --from=triton /opt/tritonserver/include include/ COPY --chown=1000:1000 --from=triton /opt/tritonserver/repoagents/ repoagents/ -COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/python backends/ +COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/python backends/python/ # NOTE 2023-07: fil-backend is not available on ARM. COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/fil* backends/ -COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/tensorrt backends/ +COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/tensorrt backends/tensorrt/ COPY --chown=1000:1000 --from=triton /usr/bin/serve /usr/bin/.
COPY --chown=1000:1000 --from=triton /usr/lib/*-linux-gnu/libdcgm.so.2 /tmp RUN ARCH=$([ "${TARGETARCH}" = "arm64" ] && echo "aarch64" || echo "x86_64") && \ From 838248f9e66ebc3e956795ce806041df4cf8f0ca Mon Sep 17 00:00:00 2001 From: Matthias Langer Date: Fri, 7 Jul 2023 13:47:07 +0000 Subject: [PATCH 12/13] Update TF dockerfile for x86 + Grace/ARM64 cross compile. --- docker/dockerfile.tf | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/docker/dockerfile.tf b/docker/dockerfile.tf index 0dafdff29..d69576691 100644 --- a/docker/dockerfile.tf +++ b/docker/dockerfile.tf @@ -41,19 +41,19 @@ ARG _HUGECTR_REPO="github.com/NVIDIA-Merlin/HugeCTR.git" ARG _CI_JOB_TOKEN="" ARG HUGECTR_VER=main -ENV CPATH=$CPATH:${HUGECTR_HOME}/include \ - LD_LIBRARY_PATH=${HUGECTR_HOME}/lib:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/tensorflow:$LD_LIBRARY_PATH \ +ENV LD_LIBRARY_PATH=/usr/local/lib/python${PYTHON_VERSION}/dist-packages/tensorflow:$LD_LIBRARY_PATH \ LIBRARY_PATH=${HUGECTR_HOME}/lib:$LIBRARY_PATH \ SOK_COMPILE_UNIT_TEST=ON RUN mkdir -p /usr/local/nvidia/lib64 && \ - ln -s /usr/local/cuda/lib64/libcusolver.so /usr/local/nvidia/lib64/libcusolver.so.10 + ln -s /usr/local/cuda/lib64/libcusolver.so /usr/local/nvidia/lib64/libcusolver.so -RUN ln -s /usr/lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so +RUN ln -s libibverbs.so.1 $(find /usr/lib/*-linux-gnu/libibverbs.so.1 | sed -e 's/\.1$//g') # Install distributed-embeddings and sok ARG INSTALL_DISTRIBUTED_EMBEDDINGS=false -ARG TFDE_VER=v0.3 +ARG TFDE_VER=v23.03.00 + RUN if [ "$HUGECTR_DEV_MODE" == "false" ]; then \ git clone --branch ${HUGECTR_VER} --depth 1 --recurse-submodules --shallow-submodules https://${_CI_JOB_TOKEN}${_HUGECTR_REPO} /hugectr && \ pushd /hugectr && \ @@ -65,14 +65,14 @@ RUN if [ "$HUGECTR_DEV_MODE" == "false" ]; then \ # Install HPS TF plugin cd ../hps_tf && \ python setup.py install && \ - popd &&\ - mv /hugectr/ci ~/hugectr-ci && mv /hugectr/sparse_operation_kit ~/hugectr-sparse_operation_kit && \ + popd && \ + mv /hugectr/ci ~/hugectr-ci && mv /hugectr/sparse_operation_kit ~/hugectr-sparse_operation_kit && \ rm -rf /hugectr && mkdir -p /hugectr && \ - mv ~/hugectr-ci /hugectr/ci && mv ~/hugectr-sparse_operation_kit /hugectr/sparse_operation_kit; \ - fi && \ + mv ~/hugectr-ci /hugectr/ci && mv ~/hugectr-sparse_operation_kit /hugectr/sparse_operation_kit \ + ; fi && \ if [ "$INSTALL_DISTRIBUTED_EMBEDDINGS" == "true" ]; then \ git clone --branch ${TFDE_VER} --depth 1 https://github.com/NVIDIA-Merlin/distributed-embeddings.git /distributed_embeddings/ && \ cd /distributed_embeddings && git submodule update --init --recursive && \ make pip_pkg && pip install --no-cache-dir artifacts/*.whl && make clean \ ; fi From 6d4c453cc00a7254de20cc95728343b7243d8265 Mon Sep 17 00:00:00 2001 From: Matthias Langer Date: Thu, 13 Jul 2023 00:48:58 -0700 Subject: [PATCH 13/13] Just add an empty line for symmetry reasons.
--- docker/dockerfile.merlin | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/dockerfile.merlin b/docker/dockerfile.merlin index 0d1669735..f76c8528d 100644 --- a/docker/dockerfile.merlin +++ b/docker/dockerfile.merlin @@ -258,6 +258,7 @@ ENV PYTHON_VERSION=3.10 COPY --chown=1000:1000 --from=build /usr/local/lib/python${PYTHON_VERSION}/dist-packages /usr/local/lib/python${PYTHON_VERSION}/dist-packages/ ENV PYTHONPATH=$PYTHONPATH:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/ + # rapids components from the DLFW image COPY --chown=1000:1000 --from=dlfw /usr/lib/libcudf* /usr/lib/ COPY --chown=1000:1000 --from=dlfw /usr/lib/libarrow* /usr/lib/
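A few usage notes on the cross-compilation changes in this series. The TARGETOS/TARGETARCH args declared in the patches above are populated automatically by BuildKit, so building for Grace from an x86 host only needs a platform switch; the image tags below are hypothetical:

    docker buildx build --platform linux/arm64 -f docker/dockerfile.merlin -t merlin-base:arm64 .
    docker buildx build --platform linux/amd64 -f docker/dockerfile.merlin -t merlin-base:amd64 .

Inside the Dockerfiles, TARGETARCH is mapped to two different spellings of the ARM architecture: the CUDA apt repository uses "sbsa" for ARM server parts, while the Ubuntu multiarch library directory uses "aarch64":

    ARCH=$([ "${TARGETARCH}" = "arm64" ] && echo "sbsa" || echo "x86_64")     # CUDA repo path
    ARCH=$([ "${TARGETARCH}" = "arm64" ] && echo "aarch64" || echo "x86_64")  # /usr/lib/${ARCH}-linux-gnu

The pipeline left as a comment above ENV PYTHON_VERSION=3.10 derives the major.minor version from the interpreter, in case the pinned value ever drifts from the base image:

    python --version | sed -e 's/[A-Za-z ]*//g' | awk -F'.' '{print $1"."$2}'
    # "Python 3.10.12" -> "3.10.12" -> "3.10"

The notebook integration test added at the start of this series can also be run on its own; a hypothetical invocation, assuming a GPU container that provides tritonserver, cudf, tensorflow, and merlin.models:

    pytest -m notebook tests/integration/examples/test_serving_ranking_models_with_merlin_systems.py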