From cb2341a750f187e31952c5d0cedda15029f585ae Mon Sep 17 00:00:00 2001 From: Julio Perez Date: Mon, 2 May 2022 16:24:09 -0400 Subject: [PATCH 1/4] add entrypoint to all containers --- docker/inference/dockerfile.ctr | 1 + docker/inference/dockerfile.tf | 1 + docker/inference/dockerfile.torch | 1 + docker/training/dockerfile.ctr | 1 + docker/training/dockerfile.tf | 1 + docker/training/dockerfile.torch | 1 + 6 files changed, 6 insertions(+) diff --git a/docker/inference/dockerfile.ctr b/docker/inference/dockerfile.ctr index 17a109d0e..b049c7bf1 100644 --- a/docker/inference/dockerfile.ctr +++ b/docker/inference/dockerfile.ctr @@ -381,3 +381,4 @@ RUN rm -rf /repos HEALTHCHECK NONE CMD ["/bin/bash"] +ENTRYPOINT ["/bin/bash", "-c", "/opt/nvidia/nvidia_entrypoint.sh"] diff --git a/docker/inference/dockerfile.tf b/docker/inference/dockerfile.tf index 1e55df0d1..d4c96b7c9 100644 --- a/docker/inference/dockerfile.tf +++ b/docker/inference/dockerfile.tf @@ -205,3 +205,4 @@ RUN rm -rf /repos HEALTHCHECK NONE CMD ["/bin/bash"] +ENTRYPOINT ["/bin/bash", "-c", "/opt/nvidia/nvidia_entrypoint.sh"] diff --git a/docker/inference/dockerfile.torch b/docker/inference/dockerfile.torch index ca0bbb54b..544e39a5f 100644 --- a/docker/inference/dockerfile.torch +++ b/docker/inference/dockerfile.torch @@ -199,3 +199,4 @@ RUN rm -rf /repos HEALTHCHECK NONE CMD ["/bin/bash"] +ENTRYPOINT ["/bin/bash", "-c", "/opt/nvidia/nvidia_entrypoint.sh"] diff --git a/docker/training/dockerfile.ctr b/docker/training/dockerfile.ctr index 9f9ec804b..e9234f918 100644 --- a/docker/training/dockerfile.ctr +++ b/docker/training/dockerfile.ctr @@ -273,3 +273,4 @@ RUN rm -rf /usr/local/share/jupyter/lab/staging/node_modules/node-fetch HEALTHCHECK NONE CMD ["/bin/bash"] +ENTRYPOINT ["/bin/bash", "-c", "/opt/nvidia/nvidia_entrypoint.sh"] diff --git a/docker/training/dockerfile.tf b/docker/training/dockerfile.tf index 724920708..cc8dc1580 100644 --- a/docker/training/dockerfile.tf +++ b/docker/training/dockerfile.tf @@ -123,3 +123,4 @@ RUN rm -rf /usr/local/share/jupyter/lab/staging/node_modules/node-fetch HEALTHCHECK NONE CMD ["/bin/bash"] +ENTRYPOINT ["/bin/bash", "-c", "/opt/nvidia/nvidia_entrypoint.sh"] diff --git a/docker/training/dockerfile.torch b/docker/training/dockerfile.torch index c7c80c7b8..e8f2ab2db 100644 --- a/docker/training/dockerfile.torch +++ b/docker/training/dockerfile.torch @@ -86,3 +86,4 @@ RUN rm -rf /opt/conda/share/jupyter/lab/staging/node_modules/node-fetch HEALTHCHECK NONE CMD ["/bin/bash"] +ENTRYPOINT ["/bin/bash", "-c", "/opt/nvidia/nvidia_entrypoint.sh"] From fa45e642ef7d9ce1fc47cb5e2cee67df8d6c85f1 Mon Sep 17 00:00:00 2001 From: Julio Perez Date: Wed, 4 May 2022 20:22:40 -0400 Subject: [PATCH 2/4] remove -e for pip installs --- docker/inference/dockerfile.ctr | 8 ++++---- docker/inference/dockerfile.tf | 10 +++++----- docker/inference/dockerfile.torch | 8 ++++---- docker/training/dockerfile.ctr | 6 +++--- docker/training/dockerfile.tf | 2 +- docker/training/dockerfile.torch | 8 ++++---- 6 files changed, 21 insertions(+), 21 deletions(-) diff --git a/docker/inference/dockerfile.ctr b/docker/inference/dockerfile.ctr index 19bc00a87..59a731d30 100644 --- a/docker/inference/dockerfile.ctr +++ b/docker/inference/dockerfile.ctr @@ -192,7 +192,7 @@ RUN git clone https://github.com/rapidsai/cudf.git build-env && cd build-env/ && # Install Merlin Core RUN git clone https://github.com/NVIDIA-Merlin/core.git /core/ && \ - cd /core/ && git checkout ${CORE_VER} && pip install -e . --no-deps + cd /core/ && git checkout ${CORE_VER} && pip install . --no-deps ENV PYTHONPATH=$PYTHONPATH:/core # Install Merlin Systems @@ -203,17 +203,17 @@ ENV PYTHONPATH=$PYTHONPATH:/systems # Install NVTabular ENV PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION='python' RUN git clone https://github.com/NVIDIA-Merlin/NVTabular.git /nvtabular/ && \ - cd /nvtabular/ && git checkout ${NVTAB_VER} && pip install -e . --no-deps + cd /nvtabular/ && git checkout ${NVTAB_VER} && pip install . --no-deps ENV PYTHONPATH=$PYTHONPATH:/nvtabular # Install Transformers4Rec RUN git clone https://github.com/NVIDIA-Merlin/Transformers4Rec.git /transformers4rec && \ - cd /transformers4rec/ && git checkout ${TF4REC_VER} && pip install -e . --no-deps + cd /transformers4rec/ && git checkout ${TF4REC_VER} && pip install . --no-deps ENV PYTHONPATH=$PYTHONPATH:/transformers4rec # Install Models RUN git clone https://github.com/NVIDIA-Merlin/Models.git /models/ && \ - cd /models/ && git checkout ${MODELS_VER} && pip install -e . --no-deps; + cd /models/ && git checkout ${MODELS_VER} && pip install . --no-deps; ENV PYTHONPATH=$PYTHONPATH:/models # Add Merlin Repo diff --git a/docker/inference/dockerfile.tf b/docker/inference/dockerfile.tf index d4c96b7c9..e642ada57 100644 --- a/docker/inference/dockerfile.tf +++ b/docker/inference/dockerfile.tf @@ -85,7 +85,7 @@ RUN git clone --branch v1.9.2 https://github.com/gabime/spdlog.git build-env && # Install arrow ENV ARROW_HOME=/usr/local -RUN git clone --branch apache-arrow-5.0.0 --recurse-submodules https://github.com/apache/arrow.git build-env && \ +RUN git clone --branch apache-arrow-6.0.1 --recurse-submodules https://github.com/apache/arrow.git build-env && \ pushd build-env && \ export PARQUET_TEST_DATA="${PWD}/cpp/submodules/parquet-testing/data" && \ export ARROW_TEST_DATA="${PWD}/testing/data" && \ @@ -156,7 +156,7 @@ RUN git clone https://github.com/rapidsai/cudf.git build-env && cd build-env/ && # Install Merlin Core RUN git clone https://github.com/NVIDIA-Merlin/core.git /core/ && \ - cd /core/ && git checkout ${CORE_VER} && pip install -e . --no-deps + cd /core/ && git checkout ${CORE_VER} && pip install . --no-deps ENV PYTHONPATH=$PYTHONPATH:/core # Install Merlin Systems @@ -167,17 +167,17 @@ ENV PYTHONPATH=$PYTHONPATH:/systems # Install NVTabular ENV PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION='python' RUN git clone https://github.com/NVIDIA-Merlin/NVTabular.git /nvtabular/ && \ - cd /nvtabular/ && git checkout ${NVTAB_VER} && pip install -e . --no-deps + cd /nvtabular/ && git checkout ${NVTAB_VER} && pip install . --no-deps ENV PYTHONPATH=$PYTHONPATH:/nvtabular # Install Transformers4Rec RUN git clone https://github.com/NVIDIA-Merlin/Transformers4Rec.git /transformers4rec && \ - cd /transformers4rec/ && git checkout ${TF4REC_VER} && pip install -e . --no-deps + cd /transformers4rec/ && git checkout ${TF4REC_VER} && pip install . --no-deps ENV PYTHONPATH=$PYTHONPATH:/transformers4rec # Install Models RUN git clone https://github.com/NVIDIA-Merlin/Models.git /models/ && \ - cd /models/ && git checkout ${MODELS_VER} && pip install -e . --no-deps; + cd /models/ && git checkout ${MODELS_VER} && pip install . --no-deps; ENV PYTHONPATH=$PYTHONPATH:/models # Add Merlin Repo diff --git a/docker/inference/dockerfile.torch b/docker/inference/dockerfile.torch index 544e39a5f..241b60008 100644 --- a/docker/inference/dockerfile.torch +++ b/docker/inference/dockerfile.torch @@ -150,7 +150,7 @@ RUN git clone https://github.com/rapidsai/cudf.git build-env && cd build-env/ && # Install Merlin Core RUN git clone https://github.com/NVIDIA-Merlin/core.git /core/ && \ - cd /core/ && git checkout ${CORE_VER} && pip install -e . --no-deps + cd /core/ && git checkout ${CORE_VER} && pip install . --no-deps ENV PYTHONPATH=$PYTHONPATH:/core # Install Merlin Systems @@ -161,17 +161,17 @@ ENV PYTHONPATH=$PYTHONPATH:/systems # Install NVTabular ENV PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION='python' RUN git clone https://github.com/NVIDIA-Merlin/NVTabular.git /nvtabular/ && \ - cd /nvtabular/ && git checkout ${NVTAB_VER} && pip install -e . --no-deps + cd /nvtabular/ && git checkout ${NVTAB_VER} && pip install . --no-deps ENV PYTHONPATH=$PYTHONPATH:/nvtabular # Install Transformers4Rec RUN git clone https://github.com/NVIDIA-Merlin/Transformers4Rec.git /transformers4rec && \ - cd /transformers4rec/ && git checkout ${TF4REC_VER} && pip install -e . --no-deps + cd /transformers4rec/ && git checkout ${TF4REC_VER} && pip install . --no-deps ENV PYTHONPATH=$PYTHONPATH:/transformers4rec # Install Models RUN git clone https://github.com/NVIDIA-Merlin/Models.git /models/ && \ - cd /models/ && git checkout ${MODELS_VER} && pip install -e . --no-deps; + cd /models/ && git checkout ${MODELS_VER} && pip install . --no-deps; ENV PYTHONPATH=$PYTHONPATH:/models # Add Merlin Repo diff --git a/docker/training/dockerfile.ctr b/docker/training/dockerfile.ctr index 4bda7a84e..d3372b53e 100644 --- a/docker/training/dockerfile.ctr +++ b/docker/training/dockerfile.ctr @@ -95,17 +95,17 @@ ENV PYTHONPATH=$PYTHONPATH:/systems # Install NVTabular ENV PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION='python' RUN git clone https://github.com/NVIDIA-Merlin/NVTabular.git /nvtabular/ && \ - cd /nvtabular/ && git checkout ${NVTAB_VER} && pip install -e . --no-deps + cd /nvtabular/ && git checkout ${NVTAB_VER} && pip install . --no-deps ENV PYTHONPATH=$PYTHONPATH:/nvtabular # Install Transformers4Rec RUN git clone https://github.com/NVIDIA-Merlin/Transformers4Rec.git /transformers4rec && \ - cd /transformers4rec/ && git checkout ${TF4REC_VER} && pip install -e . --no-deps + cd /transformers4rec/ && git checkout ${TF4REC_VER} && pip install . --no-deps ENV PYTHONPATH=$PYTHONPATH:/transformers4rec # Install Models RUN git clone https://github.com/NVIDIA-Merlin/Models.git /models/ && \ - cd /models/ && git checkout ${MODELS_VER} && pip install -e . --no-deps + cd /models/ && git checkout ${MODELS_VER} && pip install . --no-deps ENV PYTHONPATH=$PYTHONPATH:/models # Add Merlin Repo diff --git a/docker/training/dockerfile.tf b/docker/training/dockerfile.tf index cc8dc1580..61cfe4ee2 100644 --- a/docker/training/dockerfile.tf +++ b/docker/training/dockerfile.tf @@ -80,7 +80,7 @@ ENV PYTHONPATH=$PYTHONPATH:/transformers4rec # Install Models RUN git clone https://github.com/NVIDIA-Merlin/Models.git /models/ && \ - cd /models/ && git checkout ${MODELS_VER} && pip install -e . --no-deps + cd /models/ && git checkout ${MODELS_VER} && pip install . --no-deps ENV PYTHONPATH=$PYTHONPATH:/models # Add Merlin Repo diff --git a/docker/training/dockerfile.torch b/docker/training/dockerfile.torch index e8f2ab2db..ecfa1b913 100644 --- a/docker/training/dockerfile.torch +++ b/docker/training/dockerfile.torch @@ -52,7 +52,7 @@ RUN pip install git+https://github.com/rapidsai/asvdb.git@main # Install Merlin Core RUN git clone https://github.com/NVIDIA-Merlin/core.git /core/ && \ - cd /core/ && git checkout ${CORE_VER} && pip install -e . --no-deps + cd /core/ && git checkout ${CORE_VER} && pip install . --no-deps ENV PYTHONPATH=$PYTHONPATH:/core # Install Merlin Systems @@ -63,17 +63,17 @@ ENV PYTHONPATH=$PYTHONPATH:/systems # Install NVTabular ENV PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION='python' RUN git clone https://github.com/NVIDIA-Merlin/NVTabular.git /nvtabular/ && \ - cd /nvtabular/ && git checkout ${NVTAB_VER} && pip install -e . --no-deps + cd /nvtabular/ && git checkout ${NVTAB_VER} && pip install . --no-deps ENV PYTHONPATH=$PYTHONPATH:/nvtabular # Install Transformers4Rec RUN git clone https://github.com/NVIDIA-Merlin/Transformers4Rec.git /transformers4rec && \ - cd /transformers4rec/ && git checkout ${TF4REC_VER} && pip install -e . --no-deps + cd /transformers4rec/ && git checkout ${TF4REC_VER} && pip install . --no-deps ENV PYTHONPATH=$PYTHONPATH:/transformers4rec # Install Models RUN git clone https://github.com/NVIDIA-Merlin/Models.git /models/ && \ - cd /models/ && git checkout ${MODELS_VER} && pip install -e . --no-deps + cd /models/ && git checkout ${MODELS_VER} && pip install . --no-deps ENV PYTHONPATH=$PYTHONPATH:/models # Add Merlin Repo From e0fd0a7025be7661d36bad8f6b286cbd5714498c Mon Sep 17 00:00:00 2001 From: Julio Perez Date: Mon, 9 May 2022 00:46:50 -0400 Subject: [PATCH 3/4] add key update mechanism to ci dockerfile --- ci/dockerfile.ci | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ci/dockerfile.ci b/ci/dockerfile.ci index 3643c883b..568e34a76 100644 --- a/ci/dockerfile.ci +++ b/ci/dockerfile.ci @@ -30,6 +30,15 @@ RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/lib # Install packages ENV DEBIAN_FRONTEND=noninteractive + +RUN apt update -y --fix-missing && \ + apt install -y --no-install-recommends software-properties-common && \ + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin && \ + mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \ + add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" + + RUN apt update -y --fix-missing && \ apt install -y --no-install-recommends software-properties-common && \ apt-get install -y --no-install-recommends \ From b9424e5e6930da5686cfed3165ab24e5545f3656 Mon Sep 17 00:00:00 2001 From: Julio Perez Date: Mon, 9 May 2022 15:52:44 -0400 Subject: [PATCH 4/4] remove nvm for triple a rating --- docker/training/dockerfile.ctr | 2 +- docker/training/dockerfile.tf | 2 +- docker/training/dockerfile.torch | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/training/dockerfile.ctr b/docker/training/dockerfile.ctr index d3372b53e..f19269a09 100644 --- a/docker/training/dockerfile.ctr +++ b/docker/training/dockerfile.ctr @@ -270,7 +270,7 @@ ENV PATH=$PATH:${HUGECTR_HOME}/bin \ RUN rm /usr/local/cuda/lib64/stubs/libcuda.so.1 # Clean up -RUN rm -rf /repos +RUN rm -rf /repos /usr/local/nvm/ RUN rm -rf /usr/local/share/jupyter/lab/staging/node_modules/marked RUN rm -rf /usr/local/share/jupyter/lab/staging/node_modules/node-fetch diff --git a/docker/training/dockerfile.tf b/docker/training/dockerfile.tf index b4296ea4a..f7f04b309 100644 --- a/docker/training/dockerfile.tf +++ b/docker/training/dockerfile.tf @@ -120,7 +120,7 @@ RUN if [ "$INSTALL_DISTRIBUTED_EMBEDDINGS" == "true" ]; then \ fi # Clean up -RUN rm -rf /repos +RUN rm -rf /repos /usr/local/nvm/ RUN rm -rf /usr/local/share/jupyter/lab/staging/node_modules/marked RUN rm -rf /usr/local/share/jupyter/lab/staging/node_modules/node-fetch diff --git a/docker/training/dockerfile.torch b/docker/training/dockerfile.torch index ecfa1b913..a5d490745 100644 --- a/docker/training/dockerfile.torch +++ b/docker/training/dockerfile.torch @@ -80,7 +80,7 @@ ENV PYTHONPATH=$PYTHONPATH:/models RUN git clone https://github.com/NVIDIA-Merlin/Merlin/ /Merlin # Clean up -RUN rm -rf /repos +RUN rm -rf /repos /usr/local/nvm/ RUN rm -rf /opt/conda/share/jupyter/lab/staging/node_modules/marked RUN rm -rf /opt/conda/share/jupyter/lab/staging/node_modules/node-fetch