From 4ee800e1c62d1510ecde6da5cc2d958b488de945 Mon Sep 17 00:00:00 2001 From: Alberto Alvarez Date: Tue, 8 Mar 2022 20:47:06 -0800 Subject: [PATCH 01/23] Initial pull --- docker/inference/dockerfile.ctr | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/docker/inference/dockerfile.ctr b/docker/inference/dockerfile.ctr index f83017411..a80ac2f92 100644 --- a/docker/inference/dockerfile.ctr +++ b/docker/inference/dockerfile.ctr @@ -1,7 +1,9 @@ # syntax=docker/dockerfile:1.2 ARG TRITON_VERSION=22.02 -ARG IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-py3-min -FROM ${IMAGE} +ARG FULL_IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-py3 +ARG BASE_IMAGE=${FULL_IMAGE}-min +FROM ${BASE_IMAGE} + # Args ARG CORE_VER=main @@ -13,7 +15,7 @@ ARG MODELS_VER=main ARG HUGECTR_VER=master ARG HUGECTR_BACKEND_VER=main ARG TF4REC_VER=main -# + # Envs ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib:/repos/dist/lib ENV DEBIAN_FRONTEND=noninteractive @@ -51,6 +53,19 @@ RUN pip install tritonclient[all] grpcio-channelz RUN pip install dask==2021.11.2 distributed==2021.11.2 dask[dataframe]==2021.11.2 dask-cuda RUN pip install git+https://github.com/rapidsai/asvdb.git@main + +# Triton Server +FROM ${FULL_IMAGE} as full +WORKDIR /opt/tritonserver +COPY --chown=1000:1000 --from=full /opt/tritonserver/LICENSE . +COPY --chown=1000:1000 --from=full /opt/tritonserver/TRITON_VERSION . +COPY --chown=1000:1000 --from=full /opt/tritonserver/NVIDIA_Deep_Learning_Container_License.pdf . +COPY --chown=1000:1000 --from=full /opt/tritonserver/bin bin/ +COPY --chown=1000:1000 --from=full /opt/tritonserver/lib lib/ +COPY --chown=1000:1000 --from=full /opt/tritonserver/include include/ +COPY --chown=1000:1000 --from=full /opt/tritonserver/repoagents/ repoagents/ +COPY --chown=1000:1000 --from=full /usr/bin/serve /usr/bin/. + # Install cmake RUN apt remove --purge cmake -y && wget http://www.cmake.org/files/v3.21/cmake-3.21.1.tar.gz && \ tar xf cmake-3.21.1.tar.gz && cd cmake-3.21.1 && ./configure && make && make install From db83a5c3b2bc60da2f9d41f177575bbfcc3f1ea8 Mon Sep 17 00:00:00 2001 From: Alberto Alvarez Date: Wed, 9 Mar 2022 13:43:34 -0800 Subject: [PATCH 02/23] Removes numba --- docker/inference/dockerfile.ctr | 2 +- docker/training/dockerfile.ctr | 3 --- docker/training/dockerfile.torch | 3 --- 3 files changed, 1 insertion(+), 7 deletions(-) diff --git a/docker/inference/dockerfile.ctr b/docker/inference/dockerfile.ctr index a80ac2f92..869dcd4bd 100644 --- a/docker/inference/dockerfile.ctr +++ b/docker/inference/dockerfile.ctr @@ -48,7 +48,7 @@ RUN ln -s /usr/bin/python3 /usr/bin/python # Install multiple packages RUN pip install cupy-cuda115 nvidia-pyindex pybind11 pytest protobuf transformers==4.12 tensorflow-metadata RUN pip install betterproto cachetools graphviz nvtx scipy sklearn -RUN pip install pandas numba==0.55.1 numpy==1.21.5 +RUN pip install numba --no-deps RUN pip install tritonclient[all] grpcio-channelz RUN pip install dask==2021.11.2 distributed==2021.11.2 dask[dataframe]==2021.11.2 dask-cuda RUN pip install git+https://github.com/rapidsai/asvdb.git@main diff --git a/docker/training/dockerfile.ctr b/docker/training/dockerfile.ctr index 64dabf613..3e57f801e 100644 --- a/docker/training/dockerfile.ctr +++ b/docker/training/dockerfile.ctr @@ -41,8 +41,6 @@ RUN apt remove --purge cmake -y && wget http://www.cmake.org/files/v3.21/cmake-3 RUN pip install nvidia-pyindex mpi4py onnx onnxruntime RUN pip install betterproto graphviz pybind11 pytest RUN pip install --upgrade ipython -RUN pip install numba==0.55.1 numpy==1.21.5 --no-deps -RUN pip install --ignore-installed llvmlite==0.38.0 --no-deps RUN pip install tritonclient[all] grpcio-channelz RUN pip install git+https://github.com/rapidsai/asvdb.git@main @@ -173,7 +171,6 @@ ENV PYTHONPATH=/hugectr/onnx_converter:$PYTHONPATH RUN rm /usr/local/cuda/lib64/stubs/libcuda.so.1 # Clean up -RUN pip install numba==0.53.1 numpy==1.22.2 --no-deps RUN rm -rf /repos RUN rm -rf /usr/local/share/jupyter/lab/staging/node_modules/marked RUN rm -rf /usr/local/share/jupyter/lab/staging/node_modules/node-fetch diff --git a/docker/training/dockerfile.torch b/docker/training/dockerfile.torch index a6185d585..cac05de12 100644 --- a/docker/training/dockerfile.torch +++ b/docker/training/dockerfile.torch @@ -36,8 +36,6 @@ RUN pip install --upgrade Pillow RUN pip install nvidia-pyindex RUN pip install tritonclient[all] grpcio-channelz RUN pip install --no-deps fastai fastcore fastprogress fastdownload -RUN pip install numba==0.55.1 numpy==1.21.5 --no-deps -RUN pip install --ignore-installed llvmlite==0.38.0 --no-deps RUN pip install git+https://github.com/rapidsai/asvdb.git@main RUN CC=/usr/bin/gcc CXX=/usr/bin/g++ HOROVOD_CUDA_HOME=/usr/local/cuda/ HOROVOD_BUILD_CUDA_CC_LIST=60,70,75,80 HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_WITH_PYTORCH=1 HOROVOD_NCCL_LINK=SHARED pip install --no-cache-dir git+https://github.com/horovod/horovod.git@master @@ -63,7 +61,6 @@ RUN git clone https://github.com/NVIDIA-Merlin/Models.git /models/ && \ ENV PYTHONPATH=/models:$PYTHONPATH # Clean up -RUN pip install numba==0.53.1 numpy==1.22.2 --no-deps RUN rm -rf /repos RUN rm -rf /opt/conda/share/jupyter/lab/staging/node_modules/marked RUN rm -rf /opt/conda/share/jupyter/lab/staging/node_modules/node-fetch From 5ae734606d699b1b767be6631887bd188c387879 Mon Sep 17 00:00:00 2001 From: Alberto Alvarez Date: Wed, 9 Mar 2022 13:48:32 -0800 Subject: [PATCH 03/23] Fixes full reference --- docker/inference/dockerfile.ctr | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/inference/dockerfile.ctr b/docker/inference/dockerfile.ctr index 869dcd4bd..4d0be44aa 100644 --- a/docker/inference/dockerfile.ctr +++ b/docker/inference/dockerfile.ctr @@ -2,7 +2,8 @@ ARG TRITON_VERSION=22.02 ARG FULL_IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-py3 ARG BASE_IMAGE=${FULL_IMAGE}-min -FROM ${BASE_IMAGE} +FROM ${FULL_IMAGE} as full +FROM ${BASE_IMAGE} as base # Args @@ -55,7 +56,6 @@ RUN pip install git+https://github.com/rapidsai/asvdb.git@main # Triton Server -FROM ${FULL_IMAGE} as full WORKDIR /opt/tritonserver COPY --chown=1000:1000 --from=full /opt/tritonserver/LICENSE . COPY --chown=1000:1000 --from=full /opt/tritonserver/TRITON_VERSION . From 8e0fdd6e6ffabf91948024a4e184a025a10d7f01 Mon Sep 17 00:00:00 2001 From: Alberto Alvarez Date: Thu, 10 Mar 2022 12:57:32 -0800 Subject: [PATCH 04/23] Adds pahs --- docker/inference/dockerfile.ctr | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker/inference/dockerfile.ctr b/docker/inference/dockerfile.ctr index 4d0be44aa..83d63fdda 100644 --- a/docker/inference/dockerfile.ctr +++ b/docker/inference/dockerfile.ctr @@ -65,6 +65,8 @@ COPY --chown=1000:1000 --from=full /opt/tritonserver/lib lib/ COPY --chown=1000:1000 --from=full /opt/tritonserver/include include/ COPY --chown=1000:1000 --from=full /opt/tritonserver/repoagents/ repoagents/ COPY --chown=1000:1000 --from=full /usr/bin/serve /usr/bin/. +ENV PATH=/opt/tritonserver/bin:${PATH}: +ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/tritonserver/lib # Install cmake RUN apt remove --purge cmake -y && wget http://www.cmake.org/files/v3.21/cmake-3.21.1.tar.gz && \ From 49eb534667a3f81715a5378c1385d947080535a1 Mon Sep 17 00:00:00 2001 From: Alberto Alvarez Date: Thu, 10 Mar 2022 17:29:36 -0800 Subject: [PATCH 05/23] Updates CI --- ci/test_container.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/test_container.sh b/ci/test_container.sh index 2891b5bcf..74731546f 100755 --- a/ci/test_container.sh +++ b/ci/test_container.sh @@ -55,6 +55,6 @@ fi # Waiting to sync integration tests with them # Test Transformers4Rec -if [ "$container" != "merlin-training" ]; then +if [[ "$container" == "merlin-tensorflow-training" || "$container" == "merlin-pytorch-training" ]]; then /transformers4rec/ci/test_integration.sh $container $devices fi From 17486a0e2a15f43823926185682963b628a1f1c2 Mon Sep 17 00:00:00 2001 From: Alberto Alvarez Date: Thu, 10 Mar 2022 19:05:16 -0800 Subject: [PATCH 06/23] Update packes --- docker/inference/dockerfile.ctr | 20 ++++++++++++++------ docker/inference/dockerfile.tf | 8 ++++---- docker/inference/dockerfile.torch | 8 ++++---- docker/training/dockerfile.ctr | 10 +++++----- docker/training/dockerfile.tf | 10 +++++----- docker/training/dockerfile.torch | 4 ++-- 6 files changed, 34 insertions(+), 26 deletions(-) diff --git a/docker/inference/dockerfile.ctr b/docker/inference/dockerfile.ctr index 83d63fdda..052f5d29d 100644 --- a/docker/inference/dockerfile.ctr +++ b/docker/inference/dockerfile.ctr @@ -26,22 +26,30 @@ ENV CUDA_CUDA_LIBRARY=${CUDA_HOME}/lib64/stubs ENV PATH=${CUDA_HOME}/lib64/:${PATH}:${CUDA_HOME}/bin ENV PYTHONPATH=/usr/lib/python3.8/site-packages:$PYTHONPATH -# Install packages +# Install system packages ENV DEBIAN_FRONTEND=noninteractive + +RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin && \ + mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub && \ + add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" + RUN apt update -y --fix-missing && \ - apt install -y --no-install-recommends software-properties-common && \ - apt-get install -y --no-install-recommends \ + apt install -y --no-install-recommends \ clang-format \ + libb64-dev \ libboost-serialization-dev \ libcurl4-openssl-dev \ + libre2-dev \ libssl-dev \ libtbb-dev \ protobuf-compiler \ python3-dev \ python3-pip \ - rapidjson-dev &&\ - apt-get autoremove -y && \ - apt-get clean && \ + rapidjson-dev \ + software-properties-common && \ + apt autoremove -y && \ + apt clean && \ rm -rf /var/lib/apt/lists/* RUN ln -s /usr/bin/python3 /usr/bin/python diff --git a/docker/inference/dockerfile.tf b/docker/inference/dockerfile.tf index 3f4a37ef2..d5647163e 100644 --- a/docker/inference/dockerfile.tf +++ b/docker/inference/dockerfile.tf @@ -26,8 +26,7 @@ ENV PYTHONPATH=/usr/lib/python3.8/site-packages:$PYTHONPATH # Install packages ENV DEBIAN_FRONTEND=noninteractive RUN apt update -y --fix-missing && \ - apt install -y --no-install-recommends software-properties-common && \ - apt-get install -y --no-install-recommends \ + apt install -y --no-install-recommends \ clang-format \ libboost-serialization-dev \ libexpat1-dev \ @@ -37,9 +36,10 @@ RUN apt update -y --fix-missing && \ policykit-1 \ protobuf-compiler \ rapidjson-dev \ + software-properties-common \ zlib1g-dev && \ - apt-get autoremove -y && \ - apt-get clean && \ + apt autoremove -y && \ + apt clean && \ rm -rf /var/lib/apt/lists/* RUN ln -s /usr/bin/python3 /usr/bin/python diff --git a/docker/inference/dockerfile.torch b/docker/inference/dockerfile.torch index 15461915d..56801b719 100644 --- a/docker/inference/dockerfile.torch +++ b/docker/inference/dockerfile.torch @@ -26,8 +26,7 @@ ENV PYTHONPATH=/usr/lib/python3.8/site-packages:$PYTHONPATH # Install packages ENV DEBIAN_FRONTEND=noninteractive RUN apt update -y --fix-missing && \ - apt install -y --no-install-recommends software-properties-common && \ - apt-get install -y --no-install-recommends \ + apt install -y --no-install-recommends \ clang-format \ libboost-serialization-dev \ libexpat1-dev \ @@ -37,9 +36,10 @@ RUN apt update -y --fix-missing && \ policykit-1 \ protobuf-compiler \ rapidjson-dev \ + software-properties-common \ zlib1g-dev && \ - apt-get autoremove -y && \ - apt-get clean && \ + apt autoremove -y && \ + apt clean && \ rm -rf /var/lib/apt/lists/* RUN ln -s /usr/bin/python3 /usr/bin/python diff --git a/docker/training/dockerfile.ctr b/docker/training/dockerfile.ctr index 3e57f801e..c061abc60 100644 --- a/docker/training/dockerfile.ctr +++ b/docker/training/dockerfile.ctr @@ -21,16 +21,16 @@ RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/lib # Install system packages ENV DEBIAN_FRONTEND=noninteractive RUN apt update -y --fix-missing && \ - apt install -y --no-install-recommends software-properties-common && \ - apt-get install -y --no-install-recommends \ + apt install -y --no-install-recommends \ clang-format \ graphviz \ libaio-dev \ libexpat1-dev \ libtbb-dev \ - protobuf-compiler && \ - apt-get autoremove -y && \ - apt-get clean && \ + protobuf-compiler \ + software-properties-common && \ + apt autoremove -y && \ + apt clean && \ rm -rf /var/lib/apt/lists/* # Install cmake newer version diff --git a/docker/training/dockerfile.tf b/docker/training/dockerfile.tf index 0a6cecdf5..4555858c4 100644 --- a/docker/training/dockerfile.tf +++ b/docker/training/dockerfile.tf @@ -19,14 +19,14 @@ ENV PATH=${CUDA_HOME}/lib64/:${PATH}:${CUDA_HOME}/bin # Install system packages ENV DEBIAN_FRONTEND=noninteractive RUN apt update -y --fix-missing && \ - apt install -y --no-install-recommends software-properties-common && \ - apt-get install -y --no-install-recommends \ + apt install -y --no-install-recommends \ libexpat1-dev \ libsasl2-2 \ graphviz \ - protobuf-compiler && \ - apt-get autoremove -y && \ - apt-get clean && \ + protobuf-compiler \ + software-properties-common && \ + apt autoremove -y && \ + apt clean && \ rm -rf /var/lib/apt/lists/* # Install multiple packages diff --git a/docker/training/dockerfile.torch b/docker/training/dockerfile.torch index cac05de12..c38e6a2b5 100644 --- a/docker/training/dockerfile.torch +++ b/docker/training/dockerfile.torch @@ -18,11 +18,11 @@ ENV PATH=${CUDA_HOME}/lib64/:${PATH}:${CUDA_HOME}/bin # Install system packages ENV DEBIAN_FRONTEND=noninteractive RUN apt update -y --fix-missing && \ - apt install -y --no-install-recommends software-properties-common && \ apt install -y --no-install-recommends \ libexpat1-dev \ libsasl2-2 \ - graphviz && \ + graphviz \ + software-properties-common && \ apt autoremove -y && \ apt clean && \ rm -rf /var/lib/apt/lists/* From 6afa4f27f23370678b72c73853c249727ea4fb3a Mon Sep 17 00:00:00 2001 From: Alberto Alvarez Date: Thu, 10 Mar 2022 19:33:34 -0800 Subject: [PATCH 07/23] updates tests --- ci/test_container.sh | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/ci/test_container.sh b/ci/test_container.sh index 74731546f..0e5936a8b 100755 --- a/ci/test_container.sh +++ b/ci/test_container.sh @@ -7,8 +7,20 @@ devices=$2 # Unit tests # ############## +## Test Core +/core/ci/test_unit.sh $container $devices + ## Test NVTabular -pytest /nvtabular/tests/unit +/nvtabular/ci/test_unit.sh $container $devices + +if [ "$container" != "merlin-training" ]; then + ## Test Transformers4Rec + /transformers4rec/ci/test_unit.sh $container $devices + + ## Test Models + pip install coverage + /models/ci/test_unit.sh $container $devices +fi ## Test HugeCTR ### Training container @@ -28,19 +40,6 @@ if [ "$container" == "merlin-training" ]; then # inference_test fi -## Test Transformers4Rec -if [ "$container" != "merlin-training" ]; then - /transformers4rec/ci/test_unit.sh $container $devices -fi - -## Test Models -if [ "$container" != "merlin-training" ]; then - pip install coverage - chmod +x /models/ci/test_unit.sh - /models/ci/test_unit.sh $container $devices -fi - - ##################### # Integration tests # ##################### @@ -48,13 +47,11 @@ fi # Test NVTabular ## Not shared storage in blossom yet regex="merlin(.)*-inference" -if [[ "$container" =~ $regex ]]; then +if [[ ! "$container" =~ $regex ]]; then /nvtabular/ci/test_integration.sh $container $devices --report 1 fi -# Test HugeCTR -# Waiting to sync integration tests with them # Test Transformers4Rec -if [[ "$container" == "merlin-tensorflow-training" || "$container" == "merlin-pytorch-training" ]]; then +if [ "$container" != "merlin-training" ]; then /transformers4rec/ci/test_integration.sh $container $devices fi From db16008bba516ba9905382bfd97740f07e5c1f84 Mon Sep 17 00:00:00 2001 From: Alberto Alvarez Date: Thu, 10 Mar 2022 19:34:18 -0800 Subject: [PATCH 08/23] Tmp --- ci/test_container.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ci/test_container.sh b/ci/test_container.sh index 0e5936a8b..234994e34 100755 --- a/ci/test_container.sh +++ b/ci/test_container.sh @@ -46,10 +46,10 @@ fi # Test NVTabular ## Not shared storage in blossom yet -regex="merlin(.)*-inference" -if [[ ! "$container" =~ $regex ]]; then - /nvtabular/ci/test_integration.sh $container $devices --report 1 -fi +#regex="merlin(.)*-inference" +#if [[ ! "$container" =~ $regex ]]; then +# /nvtabular/ci/test_integration.sh $container $devices --report 1 +#fi # Test Transformers4Rec if [ "$container" != "merlin-training" ]; then From d239f978c3cbf6887d24a59fcd3d54d184167003 Mon Sep 17 00:00:00 2001 From: Alberto Alvarez Date: Fri, 11 Mar 2022 19:04:21 -0800 Subject: [PATCH 09/23] update testing script --- ci/test_container.sh | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/ci/test_container.sh b/ci/test_container.sh index 234994e34..87a3c987b 100755 --- a/ci/test_container.sh +++ b/ci/test_container.sh @@ -23,21 +23,8 @@ if [ "$container" != "merlin-training" ]; then fi ## Test HugeCTR -### Training container if [ "$container" == "merlin-training" ]; then - # layers_test && \ Running oom in blossom - checker_test && \ - # data_reader_test && \ Need Multi-GPU - device_map_test && \ - loss_test && \ - optimizer_test && \ - regularizers_test # && \ - # parser_test && \ Needs Multi-GPU - # auc_test Needs Multi-GPU -### Inference container -# elif [ "$container" == "merlin-inference" ]; then - # HugeCTR - Deactivated until it is self-contained and it runs - # inference_test + /hugectr/ci/test_unit.sh $container $devices fi ##################### From 27b69c2d14d95b75823c1c5a92c556f2bfaf9397 Mon Sep 17 00:00:00 2001 From: Alberto Alvarez Date: Fri, 11 Mar 2022 19:13:47 -0800 Subject: [PATCH 10/23] fixes install --- docker/inference/dockerfile.ctr | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docker/inference/dockerfile.ctr b/docker/inference/dockerfile.ctr index 052f5d29d..2ec808298 100644 --- a/docker/inference/dockerfile.ctr +++ b/docker/inference/dockerfile.ctr @@ -29,7 +29,9 @@ ENV PYTHONPATH=/usr/lib/python3.8/site-packages:$PYTHONPATH # Install system packages ENV DEBIAN_FRONTEND=noninteractive -RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin && \ +RUN apt update -y --fix-missing && \ + apt install -y --no-install-recommends software-properties-common && \ + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin && \ mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \ apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub && \ add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" @@ -47,7 +49,6 @@ RUN apt update -y --fix-missing && \ python3-dev \ python3-pip \ rapidjson-dev \ - software-properties-common && \ apt autoremove -y && \ apt clean && \ rm -rf /var/lib/apt/lists/* From 68a76b9b28ea57ad4d2bf66c6d194544f4d7407b Mon Sep 17 00:00:00 2001 From: Alberto Alvarez Date: Tue, 15 Mar 2022 14:47:04 -0700 Subject: [PATCH 11/23] Add core --- docker/inference/dockerfile.ctr | 8 +++++++- docker/inference/dockerfile.tf | 8 +++++++- docker/inference/dockerfile.torch | 8 +++++++- docker/training/dockerfile.ctr | 6 ++++++ docker/training/dockerfile.tf | 6 ++++++ docker/training/dockerfile.torch | 6 ++++++ 6 files changed, 39 insertions(+), 3 deletions(-) diff --git a/docker/inference/dockerfile.ctr b/docker/inference/dockerfile.ctr index 2ec808298..b1020680e 100644 --- a/docker/inference/dockerfile.ctr +++ b/docker/inference/dockerfile.ctr @@ -16,6 +16,7 @@ ARG MODELS_VER=main ARG HUGECTR_VER=master ARG HUGECTR_BACKEND_VER=main ARG TF4REC_VER=main +ARG SYSTEMS_VER=main # Envs ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib:/repos/dist/lib @@ -48,7 +49,7 @@ RUN apt update -y --fix-missing && \ protobuf-compiler \ python3-dev \ python3-pip \ - rapidjson-dev \ + rapidjson-dev && \ apt autoremove -y && \ apt clean && \ rm -rf /var/lib/apt/lists/* @@ -164,6 +165,11 @@ RUN git clone https://github.com/NVIDIA-Merlin/core.git /core/ && \ cd /core/ && git checkout ${CORE_VER} && pip install -e . --no-deps ENV PYTHONPATH=/core:$PYTHONPATH +# Install Merlin Systems +RUN git clone https://github.com/NVIDIA-Merlin/systems.git /systems/ && \ + cd /systems/ && git checkout ${SYSTEMS_VER} && pip install --no-deps -e . + ENV PYTHONPATH=/systems:$PYTHONPATH + # Install NVTabular ENV PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION='python' RUN git clone https://github.com/NVIDIA-Merlin/NVTabular.git /nvtabular/ && \ diff --git a/docker/inference/dockerfile.tf b/docker/inference/dockerfile.tf index d5647163e..07c51aefd 100644 --- a/docker/inference/dockerfile.tf +++ b/docker/inference/dockerfile.tf @@ -13,7 +13,8 @@ ARG MODELS_VER=main ARG HUGECTR_VER=master ARG HUGECTR_BACKEND_VER=main ARG TF4REC_VER=main -# +ARG SYSTEMS_VER=main + # Envs ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib:/repos/dist/lib ENV DEBIAN_FRONTEND=noninteractive @@ -139,6 +140,11 @@ RUN git clone https://github.com/NVIDIA-Merlin/core.git /core/ && \ cd /core/ && git checkout ${CORE_VER} && pip install -e . --no-deps ENV PYTHONPATH=/core:$PYTHONPATH +# Install Merlin Systems +RUN git clone https://github.com/NVIDIA-Merlin/systems.git /systems/ && \ + cd /systems/ && git checkout ${SYSTEMS_VER} && pip install --no-deps -e . + ENV PYTHONPATH=/systems:$PYTHONPATH + # Install NVTabular ENV PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION='python' RUN git clone https://github.com/NVIDIA-Merlin/NVTabular.git /nvtabular/ && \ diff --git a/docker/inference/dockerfile.torch b/docker/inference/dockerfile.torch index 56801b719..c12c1cf0f 100644 --- a/docker/inference/dockerfile.torch +++ b/docker/inference/dockerfile.torch @@ -13,7 +13,8 @@ ARG MODELS_VER=main ARG HUGECTR_VER=master ARG HUGECTR_BACKEND_VER=main ARG TF4REC_VER=main -# +ARG SYSTEMS_VER=main + # Envs ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib:/repos/dist/lib ENV DEBIAN_FRONTEND=noninteractive @@ -139,6 +140,11 @@ RUN git clone https://github.com/NVIDIA-Merlin/core.git /core/ && \ cd /core/ && git checkout ${CORE_VER} && pip install -e . --no-deps ENV PYTHONPATH=/core:$PYTHONPATH +# Install Merlin Systems +RUN git clone https://github.com/NVIDIA-Merlin/systems.git /systems/ && \ + cd /systems/ && git checkout ${SYSTEMS_VER} && pip install --no-deps -e . + ENV PYTHONPATH=/systems:$PYTHONPATH + # Install NVTabular ENV PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION='python' RUN git clone https://github.com/NVIDIA-Merlin/NVTabular.git /nvtabular/ && \ diff --git a/docker/training/dockerfile.ctr b/docker/training/dockerfile.ctr index c061abc60..666e5318e 100644 --- a/docker/training/dockerfile.ctr +++ b/docker/training/dockerfile.ctr @@ -7,6 +7,7 @@ ARG NVTAB_VER=main ARG MODELS_VER=main ARG HUGECTR_VER=master ARG HWLOC_VER=2.4.1 +ARG SYSTEMS_VER=main # Envs ENV CUDA_SHORT_VERSION=11.6 @@ -49,6 +50,11 @@ RUN git clone https://github.com/NVIDIA-Merlin/core.git /core/ && \ cd /core/ && git checkout ${CORE_VER} && pip install --no-deps -e . ENV PYTHONPATH=/core:$PYTHONPATH +# Install Merlin Systems +RUN git clone https://github.com/NVIDIA-Merlin/systems.git /systems/ && \ + cd /systems/ && git checkout ${SYSTEMS_VER} && pip install --no-deps -e . + ENV PYTHONPATH=/systems:$PYTHONPATH + # Install NVTabular ENV PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION='python' RUN git clone https://github.com/NVIDIA-Merlin/NVTabular.git /nvtabular/ && \ diff --git a/docker/training/dockerfile.tf b/docker/training/dockerfile.tf index 4555858c4..dd16c3807 100644 --- a/docker/training/dockerfile.tf +++ b/docker/training/dockerfile.tf @@ -8,6 +8,7 @@ ARG HUGECTR_VER=master ARG NVTAB_VER=main ARG MODELS_VER=main ARG TF4REC_VER=main +ARG SYSTEMS_VER=main # Envs ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib:/repos/dist/lib @@ -42,6 +43,11 @@ RUN git clone https://github.com/NVIDIA-Merlin/core.git /core/ && \ cd /core/ && git checkout ${CORE_VER} && pip install --no-deps -e . ENV PYTHONPATH=/core:$PYTHONPATH +# Install Merlin Systems +RUN git clone https://github.com/NVIDIA-Merlin/systems.git /systems/ && \ + cd /systems/ && git checkout ${SYSTEMS_VER} && pip install --no-deps -e . + ENV PYTHONPATH=/systems:$PYTHONPATH + ARG INSTALL_NVT=true # Install NVTabular ENV PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION='python' diff --git a/docker/training/dockerfile.torch b/docker/training/dockerfile.torch index c38e6a2b5..aefef98bf 100644 --- a/docker/training/dockerfile.torch +++ b/docker/training/dockerfile.torch @@ -7,6 +7,7 @@ ARG DASK_VER=2021.11.2 ARG NVTAB_VER=main ARG MODELS_VER=main ARG TF4REC_VER=main +ARG SYSTEMS_VER=main # Envs ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib:/repos/dist/lib @@ -44,6 +45,11 @@ RUN git clone https://github.com/NVIDIA-Merlin/core.git /core/ && \ cd /core/ && git checkout ${CORE_VER} && pip install -e . --no-deps ENV PYTHONPATH=/core:$PYTHONPATH +# Install Merlin Systems +RUN git clone https://github.com/NVIDIA-Merlin/systems.git /systems/ && \ + cd /systems/ && git checkout ${SYSTEMS_VER} && pip install --no-deps -e . + ENV PYTHONPATH=/systems:$PYTHONPATH + # Install NVTabular ENV PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION='python' RUN git clone https://github.com/NVIDIA-Merlin/NVTabular.git /nvtabular/ && \ From 7e4acde774f92d14425430bb1f1a1bc2df112bdf Mon Sep 17 00:00:00 2001 From: Alberto Alvarez Date: Tue, 15 Mar 2022 14:53:47 -0700 Subject: [PATCH 12/23] Add all components --- docker/inference/dockerfile.ctr | 10 +++++----- docker/inference/dockerfile.tf | 12 ++++++------ docker/inference/dockerfile.torch | 12 ++++++------ docker/training/dockerfile.ctr | 18 +++++++++++++++--- docker/training/dockerfile.tf | 4 ++-- docker/training/dockerfile.torch | 5 ++--- 6 files changed, 36 insertions(+), 25 deletions(-) diff --git a/docker/inference/dockerfile.ctr b/docker/inference/dockerfile.ctr index b1020680e..bc806bbbb 100644 --- a/docker/inference/dockerfile.ctr +++ b/docker/inference/dockerfile.ctr @@ -7,16 +7,16 @@ FROM ${BASE_IMAGE} as base # Args -ARG CORE_VER=main -ARG RMM_VER=v21.12.00 ARG CUDF_VER=v21.12.02 +ARG RMM_VER=v21.12.00 +ARG CORE_VER=main +ARG HUGECTR_VER=master +ARG HUGECTR_BACKEND_VER=main ARG NVTAB_VER=main ARG NVTAB_BACKEND_VER=main ARG MODELS_VER=main -ARG HUGECTR_VER=master -ARG HUGECTR_BACKEND_VER=main -ARG TF4REC_VER=main ARG SYSTEMS_VER=main +ARG TF4REC_VER=main # Envs ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib:/repos/dist/lib diff --git a/docker/inference/dockerfile.tf b/docker/inference/dockerfile.tf index 07c51aefd..26e6264c1 100644 --- a/docker/inference/dockerfile.tf +++ b/docker/inference/dockerfile.tf @@ -4,16 +4,16 @@ ARG IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-tf2-python-py3 FROM ${IMAGE} # Args -ARG CORE_VER=main -ARG RMM_VER=v21.12.00 ARG CUDF_VER=v21.12.02 -ARG NVTAB_VER=main -ARG NVTAB_BACKEND_VER=main -ARG MODELS_VER=main +ARG RMM_VER=v21.12.00 +ARG CORE_VER=main ARG HUGECTR_VER=master ARG HUGECTR_BACKEND_VER=main -ARG TF4REC_VER=main +ARG MODELS_VER=main +ARG NVTAB_VER=main +ARG NVTAB_BACKEND_VER=main ARG SYSTEMS_VER=main +ARG TF4REC_VER=main # Envs ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib:/repos/dist/lib diff --git a/docker/inference/dockerfile.torch b/docker/inference/dockerfile.torch index c12c1cf0f..d42212267 100644 --- a/docker/inference/dockerfile.torch +++ b/docker/inference/dockerfile.torch @@ -4,16 +4,16 @@ ARG IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-pyt-python-py3 FROM ${IMAGE} # Args -ARG CORE_VER=main -ARG RMM_VER=v21.12.00 ARG CUDF_VER=v21.12.02 -ARG NVTAB_VER=main -ARG NVTAB_BACKEND_VER=main -ARG MODELS_VER=main +ARG RMM_VER=v21.12.00 +ARG CORE_VER=main ARG HUGECTR_VER=master ARG HUGECTR_BACKEND_VER=main -ARG TF4REC_VER=main +ARG MODELS_VER=main +ARG NVTAB_VER=main +ARG NVTAB_BACKEND_VER=main ARG SYSTEMS_VER=main +ARG TF4REC_VER=main # Envs ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib:/repos/dist/lib diff --git a/docker/training/dockerfile.ctr b/docker/training/dockerfile.ctr index 666e5318e..c34ef7823 100644 --- a/docker/training/dockerfile.ctr +++ b/docker/training/dockerfile.ctr @@ -3,11 +3,13 @@ ARG IMAGE=nvcr.io/nvidia/tensorflow:22.02-tf2-py3 FROM ${IMAGE} # Args -ARG NVTAB_VER=main -ARG MODELS_VER=main -ARG HUGECTR_VER=master ARG HWLOC_VER=2.4.1 +ARG CORE_VER=main +ARG HUGECTR_VER=master +ARG MODELS_VER=main +ARG NVTAB_VER=main ARG SYSTEMS_VER=main +ARG TF4REC_VER=main # Envs ENV CUDA_SHORT_VERSION=11.6 @@ -61,6 +63,16 @@ RUN git clone https://github.com/NVIDIA-Merlin/NVTabular.git /nvtabular/ && \ cd /nvtabular/ && git checkout ${NVTAB_VER} && pip install -e . --no-deps ENV PYTHONPATH=/nvtabular:$PYTHONPATH +# Install Transformers4Rec +RUN git clone https://github.com/NVIDIA-Merlin/Transformers4Rec.git /transformers4rec && \ + cd /transformers4rec/ && git checkout ${TF4REC_VER} && pip install -e . --no-deps +ENV PYTHONPATH=/transformers4rec:$PYTHONPATH + +# Install Models +RUN git clone https://github.com/NVIDIA-Merlin/Models.git /models/ && \ + cd /models/ && git checkout ${MODELS_VER} && pip install -e . --no-deps +ENV PYTHONPATH=/models:$PYTHONPATH + # Install CUDA-Aware hwloc RUN cd /opt/hpcx/ompi/include/openmpi/opal/mca/hwloc/hwloc201 && rm -rfv hwloc201.h hwloc/include/hwloc.h RUN mkdir -p /var/tmp && wget -q -nc --no-check-certificate -P /var/tmp https://download.open-mpi.org/release/hwloc/v2.4/hwloc-${HWLOC_VER}.tar.gz && \ diff --git a/docker/training/dockerfile.tf b/docker/training/dockerfile.tf index dd16c3807..a53f8cd74 100644 --- a/docker/training/dockerfile.tf +++ b/docker/training/dockerfile.tf @@ -5,10 +5,10 @@ FROM ${IMAGE} # Args ARG CORE_VER=main ARG HUGECTR_VER=master -ARG NVTAB_VER=main ARG MODELS_VER=main -ARG TF4REC_VER=main +ARG NVTAB_VER=main ARG SYSTEMS_VER=main +ARG TF4REC_VER=main # Envs ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib:/repos/dist/lib diff --git a/docker/training/dockerfile.torch b/docker/training/dockerfile.torch index aefef98bf..2f082d331 100644 --- a/docker/training/dockerfile.torch +++ b/docker/training/dockerfile.torch @@ -3,11 +3,10 @@ FROM ${IMAGE} # Args ARG CORE_VER=main -ARG DASK_VER=2021.11.2 -ARG NVTAB_VER=main ARG MODELS_VER=main -ARG TF4REC_VER=main +ARG NVTAB_VER=main ARG SYSTEMS_VER=main +ARG TF4REC_VER=main # Envs ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib:/repos/dist/lib From f713ee1cde53daf9b7a556ac3674811c0aa55fec Mon Sep 17 00:00:00 2001 From: Alberto Alvarez Date: Tue, 15 Mar 2022 14:55:49 -0700 Subject: [PATCH 13/23] Updates ci container --- ci/dockerfile.ci | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/ci/dockerfile.ci b/ci/dockerfile.ci index c0e7c7e86..cffedcc20 100644 --- a/ci/dockerfile.ci +++ b/ci/dockerfile.ci @@ -4,17 +4,17 @@ ARG IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-py3 FROM ${IMAGE} # Args -ARG CORE_VER=main -ARG RMM_VER=v21.12.00 ARG CUDF_VER=v21.12.02 -ARG NVTAB_VER=main -ARG NVTAB_BACKEND_VER=main -ARG MODELS_VER=main +ARG HWLOC_VER=2.4.1 +ARG RMM_VER=v21.12.00 +ARG CORE_VER=main ARG HUGECTR_VER=master ARG HUGECTR_BACKEND_VER=main +ARG MODELS_VER=main +ARG NVTAB_VER=main +ARG NVTAB_BACKEND_VER=main +ARG SYSTEMS_VER=main ARG TF4REC_VER=main -ARG HWLOC_VER=2.4.1 - # Envs ENV CUDA_SHORT_VERSION=11.6 @@ -153,6 +153,11 @@ RUN git clone https://github.com/NVIDIA-Merlin/core.git /core/ && \ cd /core/ && git checkout ${CORE_VER} && pip install -e . --no-deps ENV PYTHONPATH=/core:$PYTHONPATH +# Install Merlin Systems +RUN git clone https://github.com/NVIDIA-Merlin/systems.git /systems/ && \ + cd /systems/ && git checkout ${SYSTEMS_VER} && pip install --no-deps -e . + ENV PYTHONPATH=/systems:$PYTHONPATH + # Install NVTabular ENV PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION='python' RUN git clone https://github.com/NVIDIA-Merlin/NVTabular.git /nvtabular/ && \ From 0b930139584f567462e1e58cfc503cfebf7d6ee4 Mon Sep 17 00:00:00 2001 From: Alberto Alvarez Date: Tue, 15 Mar 2022 16:38:59 -0700 Subject: [PATCH 14/23] transformers --- docker/inference/dockerfile.ctr | 1 - docker/inference/dockerfile.tf | 1 - docker/inference/dockerfile.torch | 1 - docker/training/dockerfile.ctr | 2 +- docker/training/dockerfile.tf | 2 +- 5 files changed, 2 insertions(+), 5 deletions(-) diff --git a/docker/inference/dockerfile.ctr b/docker/inference/dockerfile.ctr index bc806bbbb..e10bcb06b 100644 --- a/docker/inference/dockerfile.ctr +++ b/docker/inference/dockerfile.ctr @@ -177,7 +177,6 @@ RUN git clone https://github.com/NVIDIA-Merlin/NVTabular.git /nvtabular/ && \ ENV PYTHONPATH=/nvtabular:$PYTHONPATH # Install Transformers4Rec -RUN pip install transformers==4.12 RUN git clone https://github.com/NVIDIA-Merlin/Transformers4Rec.git /transformers4rec && \ cd /transformers4rec/ && git checkout ${TF4REC_VER} && pip install -e . --no-deps ENV PYTHONPATH=/transformers4rec:$PYTHONPATH diff --git a/docker/inference/dockerfile.tf b/docker/inference/dockerfile.tf index 26e6264c1..b3a03f4ba 100644 --- a/docker/inference/dockerfile.tf +++ b/docker/inference/dockerfile.tf @@ -152,7 +152,6 @@ RUN git clone https://github.com/NVIDIA-Merlin/NVTabular.git /nvtabular/ && \ ENV PYTHONPATH=/nvtabular:$PYTHONPATH # Install Transformers4Rec -RUN pip install transformers==4.12 RUN git clone https://github.com/NVIDIA-Merlin/Transformers4Rec.git /transformers4rec && \ cd /transformers4rec/ && git checkout ${TF4REC_VER} && pip install -e . --no-deps ENV PYTHONPATH=/transformers4rec:$PYTHONPATH diff --git a/docker/inference/dockerfile.torch b/docker/inference/dockerfile.torch index d42212267..947a878ff 100644 --- a/docker/inference/dockerfile.torch +++ b/docker/inference/dockerfile.torch @@ -152,7 +152,6 @@ RUN git clone https://github.com/NVIDIA-Merlin/NVTabular.git /nvtabular/ && \ ENV PYTHONPATH=/nvtabular:$PYTHONPATH # Install Transformers4Rec -RUN pip install transformers==4.12 RUN git clone https://github.com/NVIDIA-Merlin/Transformers4Rec.git /transformers4rec && \ cd /transformers4rec/ && git checkout ${TF4REC_VER} && pip install -e . --no-deps ENV PYTHONPATH=/transformers4rec:$PYTHONPATH diff --git a/docker/training/dockerfile.ctr b/docker/training/dockerfile.ctr index c34ef7823..051b5a609 100644 --- a/docker/training/dockerfile.ctr +++ b/docker/training/dockerfile.ctr @@ -42,7 +42,7 @@ RUN apt remove --purge cmake -y && wget http://www.cmake.org/files/v3.21/cmake-3 # Install multiple packages RUN pip install nvidia-pyindex mpi4py onnx onnxruntime -RUN pip install betterproto graphviz pybind11 pytest +RUN pip install betterproto graphviz pybind11 pytest transformers==4.12 RUN pip install --upgrade ipython RUN pip install tritonclient[all] grpcio-channelz RUN pip install git+https://github.com/rapidsai/asvdb.git@main diff --git a/docker/training/dockerfile.tf b/docker/training/dockerfile.tf index a53f8cd74..85314292f 100644 --- a/docker/training/dockerfile.tf +++ b/docker/training/dockerfile.tf @@ -31,7 +31,7 @@ RUN apt update -y --fix-missing && \ rm -rf /var/lib/apt/lists/* # Install multiple packages -RUN pip install betterproto graphviz pybind11 pydot pytest mpi4py +RUN pip install betterproto graphviz pybind11 pydot pytest mpi4py transformers==4.12 RUN pip install --upgrade ipython RUN pip install nvidia-pyindex RUN pip install tritonclient[all] grpcio-channelz From b726bb36ae5a32e38c40decc2ae1cff80ff7052f Mon Sep 17 00:00:00 2001 From: Alberto Alvarez Date: Tue, 15 Mar 2022 16:40:15 -0700 Subject: [PATCH 15/23] Update testting --- ci/test_container.sh | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/ci/test_container.sh b/ci/test_container.sh index 87a3c987b..0f33583aa 100755 --- a/ci/test_container.sh +++ b/ci/test_container.sh @@ -13,14 +13,12 @@ devices=$2 ## Test NVTabular /nvtabular/ci/test_unit.sh $container $devices -if [ "$container" != "merlin-training" ]; then - ## Test Transformers4Rec - /transformers4rec/ci/test_unit.sh $container $devices +## Test Transformers4Rec +/transformers4rec/ci/test_unit.sh $container $devices - ## Test Models - pip install coverage - /models/ci/test_unit.sh $container $devices -fi +## Test Models +pip install coverage +/models/ci/test_unit.sh $container $devices ## Test HugeCTR if [ "$container" == "merlin-training" ]; then @@ -33,12 +31,10 @@ fi # Test NVTabular ## Not shared storage in blossom yet -#regex="merlin(.)*-inference" -#if [[ ! "$container" =~ $regex ]]; then -# /nvtabular/ci/test_integration.sh $container $devices --report 1 -#fi +regex="merlin(.)*-inference" +if [[ ! "$container" =~ $regex ]]; then + /nvtabular/ci/test_integration.sh $container $devices --report 1 +fi # Test Transformers4Rec -if [ "$container" != "merlin-training" ]; then - /transformers4rec/ci/test_integration.sh $container $devices -fi +/transformers4rec/ci/test_integration.sh $container $devices From 88e12f22e03f3f1cd9d6aaca293754a754068074 Mon Sep 17 00:00:00 2001 From: Alberto Alvarez Date: Tue, 15 Mar 2022 16:50:13 -0700 Subject: [PATCH 16/23] Adds fil backend --- docker/inference/dockerfile.tf | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/docker/inference/dockerfile.tf b/docker/inference/dockerfile.tf index b3a03f4ba..b41e43a26 100644 --- a/docker/inference/dockerfile.tf +++ b/docker/inference/dockerfile.tf @@ -1,7 +1,9 @@ # syntax=docker/dockerfile:1.2 ARG TRITON_VERSION=22.02 -ARG IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-tf2-python-py3 -FROM ${IMAGE} +ARG FULL_IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-py3 +ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-tf2-python-py3 +FROM ${FULL_IMAGE} as full +FROM ${BASE_IMAGE} as bas # Args ARG CUDF_VER=v21.12.02 @@ -53,6 +55,12 @@ RUN pip install tritonclient[all] grpcio-channelz RUN pip install dask==2021.11.2 distributed==2021.11.2 dask[dataframe]==2021.11.2 dask-cuda RUN pip install git+https://github.com/rapidsai/asvdb.git@main + + +# Triton Server +WORKDIR /opt/tritonserver +COPY --chown=1000:1000 --from=full /opt/tritonserver/backends/fil fil/ + # Install cmake RUN apt remove --purge cmake -y && wget http://www.cmake.org/files/v3.21/cmake-3.21.1.tar.gz && \ tar xf cmake-3.21.1.tar.gz && cd cmake-3.21.1 && ./configure && make && make install From 240685d09372dfa97b02b62b4a0edf67bb900b61 Mon Sep 17 00:00:00 2001 From: Alberto Alvarez Date: Tue, 15 Mar 2022 16:50:36 -0700 Subject: [PATCH 17/23] Fixes spaces --- docker/inference/dockerfile.tf | 2 -- 1 file changed, 2 deletions(-) diff --git a/docker/inference/dockerfile.tf b/docker/inference/dockerfile.tf index b41e43a26..7826981ff 100644 --- a/docker/inference/dockerfile.tf +++ b/docker/inference/dockerfile.tf @@ -55,8 +55,6 @@ RUN pip install tritonclient[all] grpcio-channelz RUN pip install dask==2021.11.2 distributed==2021.11.2 dask[dataframe]==2021.11.2 dask-cuda RUN pip install git+https://github.com/rapidsai/asvdb.git@main - - # Triton Server WORKDIR /opt/tritonserver COPY --chown=1000:1000 --from=full /opt/tritonserver/backends/fil fil/ From 356e9f104783e5626d9053c5b53890237fa374ee Mon Sep 17 00:00:00 2001 From: Alberto Alvarez Date: Wed, 16 Mar 2022 12:30:41 -0700 Subject: [PATCH 18/23] fixes pckconfig --- docker/inference/dockerfile.ctr | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/inference/dockerfile.ctr b/docker/inference/dockerfile.ctr index e10bcb06b..ada5d8b15 100644 --- a/docker/inference/dockerfile.ctr +++ b/docker/inference/dockerfile.ctr @@ -46,6 +46,7 @@ RUN apt update -y --fix-missing && \ libre2-dev \ libssl-dev \ libtbb-dev \ + pkg-config \ protobuf-compiler \ python3-dev \ python3-pip \ From 9aca0af3303a3c3a82e6c961201211fa4ae5f0c0 Mon Sep 17 00:00:00 2001 From: Alberto Alvarez Date: Wed, 16 Mar 2022 12:41:35 -0700 Subject: [PATCH 19/23] Fix path --- docker/inference/dockerfile.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/inference/dockerfile.tf b/docker/inference/dockerfile.tf index 7826981ff..9168fd46f 100644 --- a/docker/inference/dockerfile.tf +++ b/docker/inference/dockerfile.tf @@ -57,7 +57,7 @@ RUN pip install git+https://github.com/rapidsai/asvdb.git@main # Triton Server WORKDIR /opt/tritonserver -COPY --chown=1000:1000 --from=full /opt/tritonserver/backends/fil fil/ +COPY --chown=1000:1000 --from=full /opt/tritonserver/backends/fil backends/fil/ # Install cmake RUN apt remove --purge cmake -y && wget http://www.cmake.org/files/v3.21/cmake-3.21.1.tar.gz && \ From 3e81413cb486e25614b29d00b41f53c057faed3c Mon Sep 17 00:00:00 2001 From: Alberto Alvarez Date: Wed, 16 Mar 2022 15:45:49 -0700 Subject: [PATCH 20/23] Add packages --- docker/inference/dockerfile.ctr | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docker/inference/dockerfile.ctr b/docker/inference/dockerfile.ctr index ada5d8b15..5e0da74c7 100644 --- a/docker/inference/dockerfile.ctr +++ b/docker/inference/dockerfile.ctr @@ -40,6 +40,7 @@ RUN apt update -y --fix-missing && \ RUN apt update -y --fix-missing && \ apt install -y --no-install-recommends \ clang-format \ + datacenter-gpu-manager \ libb64-dev \ libboost-serialization-dev \ libcurl4-openssl-dev \ @@ -58,12 +59,13 @@ RUN apt update -y --fix-missing && \ RUN ln -s /usr/bin/python3 /usr/bin/python # Install multiple packages +RUN pip install numba==0.55.1 numpu==1.21.5 --no-deps RUN pip install cupy-cuda115 nvidia-pyindex pybind11 pytest protobuf transformers==4.12 tensorflow-metadata -RUN pip install betterproto cachetools graphviz nvtx scipy sklearn -RUN pip install numba --no-deps +RUN pip install betterproto cachetools graphviz nvtx scipy sklearn pandas RUN pip install tritonclient[all] grpcio-channelz RUN pip install dask==2021.11.2 distributed==2021.11.2 dask[dataframe]==2021.11.2 dask-cuda RUN pip install git+https://github.com/rapidsai/asvdb.git@main +RUN pip install numba==0.55.1 numpu==1.21.5 --no-deps # Triton Server From 26f598955013d73d75e001750c5b65db06922c60 Mon Sep 17 00:00:00 2001 From: Alberto Alvarez Date: Wed, 16 Mar 2022 15:55:43 -0700 Subject: [PATCH 21/23] Fix typo --- docker/inference/dockerfile.ctr | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/inference/dockerfile.ctr b/docker/inference/dockerfile.ctr index 5e0da74c7..d53842ffe 100644 --- a/docker/inference/dockerfile.ctr +++ b/docker/inference/dockerfile.ctr @@ -59,13 +59,13 @@ RUN apt update -y --fix-missing && \ RUN ln -s /usr/bin/python3 /usr/bin/python # Install multiple packages -RUN pip install numba==0.55.1 numpu==1.21.5 --no-deps +RUN pip install numba==0.55.1 numpy==1.21.5 --no-deps RUN pip install cupy-cuda115 nvidia-pyindex pybind11 pytest protobuf transformers==4.12 tensorflow-metadata RUN pip install betterproto cachetools graphviz nvtx scipy sklearn pandas RUN pip install tritonclient[all] grpcio-channelz RUN pip install dask==2021.11.2 distributed==2021.11.2 dask[dataframe]==2021.11.2 dask-cuda RUN pip install git+https://github.com/rapidsai/asvdb.git@main -RUN pip install numba==0.55.1 numpu==1.21.5 --no-deps +RUN pip install numba==0.55.1 numpy==1.21.5 --no-deps # Triton Server From 4e18863073de7d9a8e022d37e0a0beaad28fd531 Mon Sep 17 00:00:00 2001 From: Alberto Alvarez Date: Wed, 16 Mar 2022 17:52:03 -0700 Subject: [PATCH 22/23] Fixes vulnerabilties --- docker/inference/dockerfile.ctr | 1 + docker/inference/dockerfile.tf | 1 + docker/inference/dockerfile.torch | 1 + docker/training/dockerfile.ctr | 2 ++ docker/training/dockerfile.tf | 2 ++ docker/training/dockerfile.torch | 2 ++ 6 files changed, 9 insertions(+) diff --git a/docker/inference/dockerfile.ctr b/docker/inference/dockerfile.ctr index d53842ffe..89344e136 100644 --- a/docker/inference/dockerfile.ctr +++ b/docker/inference/dockerfile.ctr @@ -47,6 +47,7 @@ RUN apt update -y --fix-missing && \ libre2-dev \ libssl-dev \ libtbb-dev \ + openssl \ pkg-config \ protobuf-compiler \ python3-dev \ diff --git a/docker/inference/dockerfile.tf b/docker/inference/dockerfile.tf index 9168fd46f..ef288e444 100644 --- a/docker/inference/dockerfile.tf +++ b/docker/inference/dockerfile.tf @@ -36,6 +36,7 @@ RUN apt update -y --fix-missing && \ libsasl2-2 \ libssl-dev \ libtbb-dev \ + openssl \ policykit-1 \ protobuf-compiler \ rapidjson-dev \ diff --git a/docker/inference/dockerfile.torch b/docker/inference/dockerfile.torch index 947a878ff..2686504aa 100644 --- a/docker/inference/dockerfile.torch +++ b/docker/inference/dockerfile.torch @@ -34,6 +34,7 @@ RUN apt update -y --fix-missing && \ libsasl2-2 \ libssl-dev \ libtbb-dev \ + openssl \ policykit-1 \ protobuf-compiler \ rapidjson-dev \ diff --git a/docker/training/dockerfile.ctr b/docker/training/dockerfile.ctr index 051b5a609..e6d898f11 100644 --- a/docker/training/dockerfile.ctr +++ b/docker/training/dockerfile.ctr @@ -29,7 +29,9 @@ RUN apt update -y --fix-missing && \ graphviz \ libaio-dev \ libexpat1-dev \ + libssl-dev \ libtbb-dev \ + openssl \ protobuf-compiler \ software-properties-common && \ apt autoremove -y && \ diff --git a/docker/training/dockerfile.tf b/docker/training/dockerfile.tf index 85314292f..9f9093528 100644 --- a/docker/training/dockerfile.tf +++ b/docker/training/dockerfile.tf @@ -23,7 +23,9 @@ RUN apt update -y --fix-missing && \ apt install -y --no-install-recommends \ libexpat1-dev \ libsasl2-2 \ + libssl-dev \ graphviz \ + openssl \ protobuf-compiler \ software-properties-common && \ apt autoremove -y && \ diff --git a/docker/training/dockerfile.torch b/docker/training/dockerfile.torch index 2f082d331..b7ae4cd4e 100644 --- a/docker/training/dockerfile.torch +++ b/docker/training/dockerfile.torch @@ -21,7 +21,9 @@ RUN apt update -y --fix-missing && \ apt install -y --no-install-recommends \ libexpat1-dev \ libsasl2-2 \ + libssl-dev \ graphviz \ + openssl \ software-properties-common && \ apt autoremove -y && \ apt clean && \ From ba7a1b0a169e6ceef515caeb5170298ecf630fd5 Mon Sep 17 00:00:00 2001 From: Alberto Alvarez Date: Wed, 16 Mar 2022 18:42:15 -0700 Subject: [PATCH 23/23] Add python backend --- docker/inference/dockerfile.ctr | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/inference/dockerfile.ctr b/docker/inference/dockerfile.ctr index 89344e136..0497143ee 100644 --- a/docker/inference/dockerfile.ctr +++ b/docker/inference/dockerfile.ctr @@ -77,7 +77,8 @@ COPY --chown=1000:1000 --from=full /opt/tritonserver/NVIDIA_Deep_Learning_Contai COPY --chown=1000:1000 --from=full /opt/tritonserver/bin bin/ COPY --chown=1000:1000 --from=full /opt/tritonserver/lib lib/ COPY --chown=1000:1000 --from=full /opt/tritonserver/include include/ -COPY --chown=1000:1000 --from=full /opt/tritonserver/repoagents/ repoagents/ +COPY --chown=1000:1000 --from=full /opt/tritonserver/repoagents/ repoagents/ +COPY --chown=1000:1000 --from=full /opt/tritonserver/backends/python backends/python COPY --chown=1000:1000 --from=full /usr/bin/serve /usr/bin/. ENV PATH=/opt/tritonserver/bin:${PATH}: ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/tritonserver/lib