From cb2341a750f187e31952c5d0cedda15029f585ae Mon Sep 17 00:00:00 2001
From: Julio Perez
Date: Mon, 2 May 2022 16:24:09 -0400
Subject: [PATCH 1/4] add entrypoint to all containers

---
Review notes (free text between the "---" separator and the first diff
header is not part of the commit message and is skipped on apply):
  * ENTRYPOINT invokes nvidia_entrypoint.sh directly in exec form. With
    ["/bin/bash", "-c", "/opt/nvidia/nvidia_entrypoint.sh"], Docker appends
    CMD / `docker run` arguments after the -c string, where bash binds
    them to $0, $1, ... of the -c shell, so the script itself receives no
    arguments and a user-supplied command is silently dropped.
  * Assumes the script is executable and has a shebang in the base image
    -- TODO confirm.
  * "index" blob hashes in this series are carried over unchanged from the
    original patches and no longer match the edited additions.

 docker/inference/dockerfile.ctr   | 1 +
 docker/inference/dockerfile.tf    | 1 +
 docker/inference/dockerfile.torch | 1 +
 docker/training/dockerfile.ctr    | 1 +
 docker/training/dockerfile.tf     | 1 +
 docker/training/dockerfile.torch  | 1 +
 6 files changed, 6 insertions(+)

diff --git a/docker/inference/dockerfile.ctr b/docker/inference/dockerfile.ctr
index 17a109d0e..b049c7bf1 100644
--- a/docker/inference/dockerfile.ctr
+++ b/docker/inference/dockerfile.ctr
@@ -381,3 +381,4 @@ RUN rm -rf /repos
 
 HEALTHCHECK NONE
 CMD ["/bin/bash"]
+ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
diff --git a/docker/inference/dockerfile.tf b/docker/inference/dockerfile.tf
index 1e55df0d1..d4c96b7c9 100644
--- a/docker/inference/dockerfile.tf
+++ b/docker/inference/dockerfile.tf
@@ -205,3 +205,4 @@ RUN rm -rf /repos
 
 HEALTHCHECK NONE
 CMD ["/bin/bash"]
+ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
diff --git a/docker/inference/dockerfile.torch b/docker/inference/dockerfile.torch
index ca0bbb54b..544e39a5f 100644
--- a/docker/inference/dockerfile.torch
+++ b/docker/inference/dockerfile.torch
@@ -199,3 +199,4 @@ RUN rm -rf /repos
 
 HEALTHCHECK NONE
 CMD ["/bin/bash"]
+ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
diff --git a/docker/training/dockerfile.ctr b/docker/training/dockerfile.ctr
index 9f9ec804b..e9234f918 100644
--- a/docker/training/dockerfile.ctr
+++ b/docker/training/dockerfile.ctr
@@ -273,3 +273,4 @@ RUN rm -rf /usr/local/share/jupyter/lab/staging/node_modules/node-fetch
 
 HEALTHCHECK NONE
 CMD ["/bin/bash"]
+ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
diff --git a/docker/training/dockerfile.tf b/docker/training/dockerfile.tf
index 724920708..cc8dc1580 100644
--- a/docker/training/dockerfile.tf
+++ b/docker/training/dockerfile.tf
@@ -123,3 +123,4 @@ RUN rm -rf /usr/local/share/jupyter/lab/staging/node_modules/node-fetch
 
 HEALTHCHECK NONE
 CMD ["/bin/bash"]
+ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
diff --git a/docker/training/dockerfile.torch b/docker/training/dockerfile.torch
index c7c80c7b8..e8f2ab2db 100644
--- a/docker/training/dockerfile.torch
+++ b/docker/training/dockerfile.torch
@@ -86,3 +86,4 @@ RUN rm -rf /opt/conda/share/jupyter/lab/staging/node_modules/node-fetch
 
 HEALTHCHECK NONE
 CMD ["/bin/bash"]
+ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]

From fa45e642ef7d9ce1fc47cb5e2cee67df8d6c85f1 Mon Sep 17 00:00:00 2001
From: Julio Perez
Date: Wed, 4 May 2022 20:22:40 -0400
Subject: [PATCH 2/4] remove -e for pip installs

---
Review notes:
  * Besides dropping -e, the first hunk of docker/inference/dockerfile.tf
    also moves the Arrow checkout from apache-arrow-5.0.0 to
    apache-arrow-6.0.1, which the subject line does not mention.
  * NOTE(review): the ENV PYTHONPATH=$PYTHONPATH:/<repo> lines are kept, so
    the source checkouts stay on the import path next to the installed
    packages -- confirm that is intended for a non-editable install.

 docker/inference/dockerfile.ctr   |  8 ++++----
 docker/inference/dockerfile.tf    | 10 +++++-----
 docker/inference/dockerfile.torch |  8 ++++----
 docker/training/dockerfile.ctr    |  6 +++---
 docker/training/dockerfile.tf     |  2 +-
 docker/training/dockerfile.torch  |  8 ++++----
 6 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/docker/inference/dockerfile.ctr b/docker/inference/dockerfile.ctr
index 19bc00a87..59a731d30 100644
--- a/docker/inference/dockerfile.ctr
+++ b/docker/inference/dockerfile.ctr
@@ -192,7 +192,7 @@ RUN git clone https://github.com/rapidsai/cudf.git build-env && cd build-env/ &&
 
 # Install Merlin Core
 RUN git clone https://github.com/NVIDIA-Merlin/core.git /core/ && \
-    cd /core/ && git checkout ${CORE_VER} && pip install -e . --no-deps
+    cd /core/ && git checkout ${CORE_VER} && pip install . --no-deps
 ENV PYTHONPATH=$PYTHONPATH:/core
 
 # Install Merlin Systems
@@ -203,17 +203,17 @@ ENV PYTHONPATH=$PYTHONPATH:/systems
 # Install NVTabular
 ENV PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION='python'
 RUN git clone https://github.com/NVIDIA-Merlin/NVTabular.git /nvtabular/ && \
-    cd /nvtabular/ && git checkout ${NVTAB_VER} && pip install -e . --no-deps
+    cd /nvtabular/ && git checkout ${NVTAB_VER} && pip install . --no-deps
 ENV PYTHONPATH=$PYTHONPATH:/nvtabular
 
 # Install Transformers4Rec
 RUN git clone https://github.com/NVIDIA-Merlin/Transformers4Rec.git /transformers4rec && \
-    cd /transformers4rec/ && git checkout ${TF4REC_VER} && pip install -e . --no-deps
+    cd /transformers4rec/ && git checkout ${TF4REC_VER} && pip install . --no-deps
 ENV PYTHONPATH=$PYTHONPATH:/transformers4rec
 
 # Install Models
 RUN git clone https://github.com/NVIDIA-Merlin/Models.git /models/ && \
-    cd /models/ && git checkout ${MODELS_VER} && pip install -e . --no-deps;
+    cd /models/ && git checkout ${MODELS_VER} && pip install . --no-deps;
 ENV PYTHONPATH=$PYTHONPATH:/models
 
 # Add Merlin Repo
diff --git a/docker/inference/dockerfile.tf b/docker/inference/dockerfile.tf
index d4c96b7c9..e642ada57 100644
--- a/docker/inference/dockerfile.tf
+++ b/docker/inference/dockerfile.tf
@@ -85,7 +85,7 @@ RUN git clone --branch v1.9.2 https://github.com/gabime/spdlog.git build-env &&
 
 # Install arrow
 ENV ARROW_HOME=/usr/local
-RUN git clone --branch apache-arrow-5.0.0 --recurse-submodules https://github.com/apache/arrow.git build-env && \
+RUN git clone --branch apache-arrow-6.0.1 --recurse-submodules https://github.com/apache/arrow.git build-env && \
     pushd build-env && \
     export PARQUET_TEST_DATA="${PWD}/cpp/submodules/parquet-testing/data" && \
     export ARROW_TEST_DATA="${PWD}/testing/data" && \
@@ -156,7 +156,7 @@ RUN git clone https://github.com/rapidsai/cudf.git build-env && cd build-env/ &&
 
 # Install Merlin Core
 RUN git clone https://github.com/NVIDIA-Merlin/core.git /core/ && \
-    cd /core/ && git checkout ${CORE_VER} && pip install -e . --no-deps
+    cd /core/ && git checkout ${CORE_VER} && pip install . --no-deps
 ENV PYTHONPATH=$PYTHONPATH:/core
 
 # Install Merlin Systems
@@ -167,17 +167,17 @@ ENV PYTHONPATH=$PYTHONPATH:/systems
 # Install NVTabular
 ENV PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION='python'
 RUN git clone https://github.com/NVIDIA-Merlin/NVTabular.git /nvtabular/ && \
-    cd /nvtabular/ && git checkout ${NVTAB_VER} && pip install -e . --no-deps
+    cd /nvtabular/ && git checkout ${NVTAB_VER} && pip install . --no-deps
 ENV PYTHONPATH=$PYTHONPATH:/nvtabular
 
 # Install Transformers4Rec
 RUN git clone https://github.com/NVIDIA-Merlin/Transformers4Rec.git /transformers4rec && \
-    cd /transformers4rec/ && git checkout ${TF4REC_VER} && pip install -e . --no-deps
+    cd /transformers4rec/ && git checkout ${TF4REC_VER} && pip install . --no-deps
 ENV PYTHONPATH=$PYTHONPATH:/transformers4rec
 
 # Install Models
 RUN git clone https://github.com/NVIDIA-Merlin/Models.git /models/ && \
-    cd /models/ && git checkout ${MODELS_VER} && pip install -e . --no-deps;
+    cd /models/ && git checkout ${MODELS_VER} && pip install . --no-deps;
 ENV PYTHONPATH=$PYTHONPATH:/models
 
 # Add Merlin Repo
diff --git a/docker/inference/dockerfile.torch b/docker/inference/dockerfile.torch
index 544e39a5f..241b60008 100644
--- a/docker/inference/dockerfile.torch
+++ b/docker/inference/dockerfile.torch
@@ -150,7 +150,7 @@ RUN git clone https://github.com/rapidsai/cudf.git build-env && cd build-env/ &&
 
 # Install Merlin Core
 RUN git clone https://github.com/NVIDIA-Merlin/core.git /core/ && \
-    cd /core/ && git checkout ${CORE_VER} && pip install -e . --no-deps
+    cd /core/ && git checkout ${CORE_VER} && pip install . --no-deps
 ENV PYTHONPATH=$PYTHONPATH:/core
 
 # Install Merlin Systems
@@ -161,17 +161,17 @@ ENV PYTHONPATH=$PYTHONPATH:/systems
 # Install NVTabular
 ENV PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION='python'
 RUN git clone https://github.com/NVIDIA-Merlin/NVTabular.git /nvtabular/ && \
-    cd /nvtabular/ && git checkout ${NVTAB_VER} && pip install -e . --no-deps
+    cd /nvtabular/ && git checkout ${NVTAB_VER} && pip install . --no-deps
 ENV PYTHONPATH=$PYTHONPATH:/nvtabular
 
 # Install Transformers4Rec
 RUN git clone https://github.com/NVIDIA-Merlin/Transformers4Rec.git /transformers4rec && \
-    cd /transformers4rec/ && git checkout ${TF4REC_VER} && pip install -e . --no-deps
+    cd /transformers4rec/ && git checkout ${TF4REC_VER} && pip install . --no-deps
 ENV PYTHONPATH=$PYTHONPATH:/transformers4rec
 
 # Install Models
 RUN git clone https://github.com/NVIDIA-Merlin/Models.git /models/ && \
-    cd /models/ && git checkout ${MODELS_VER} && pip install -e . --no-deps;
+    cd /models/ && git checkout ${MODELS_VER} && pip install . --no-deps;
 ENV PYTHONPATH=$PYTHONPATH:/models
 
 # Add Merlin Repo
diff --git a/docker/training/dockerfile.ctr b/docker/training/dockerfile.ctr
index 4bda7a84e..d3372b53e 100644
--- a/docker/training/dockerfile.ctr
+++ b/docker/training/dockerfile.ctr
@@ -95,17 +95,17 @@ ENV PYTHONPATH=$PYTHONPATH:/systems
 # Install NVTabular
 ENV PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION='python'
 RUN git clone https://github.com/NVIDIA-Merlin/NVTabular.git /nvtabular/ && \
-    cd /nvtabular/ && git checkout ${NVTAB_VER} && pip install -e . --no-deps
+    cd /nvtabular/ && git checkout ${NVTAB_VER} && pip install . --no-deps
 ENV PYTHONPATH=$PYTHONPATH:/nvtabular
 
 # Install Transformers4Rec
 RUN git clone https://github.com/NVIDIA-Merlin/Transformers4Rec.git /transformers4rec && \
-    cd /transformers4rec/ && git checkout ${TF4REC_VER} && pip install -e . --no-deps
+    cd /transformers4rec/ && git checkout ${TF4REC_VER} && pip install . --no-deps
 ENV PYTHONPATH=$PYTHONPATH:/transformers4rec
 
 # Install Models
 RUN git clone https://github.com/NVIDIA-Merlin/Models.git /models/ && \
-    cd /models/ && git checkout ${MODELS_VER} && pip install -e . --no-deps
+    cd /models/ && git checkout ${MODELS_VER} && pip install . --no-deps
 ENV PYTHONPATH=$PYTHONPATH:/models
 
 # Add Merlin Repo
diff --git a/docker/training/dockerfile.tf b/docker/training/dockerfile.tf
index cc8dc1580..61cfe4ee2 100644
--- a/docker/training/dockerfile.tf
+++ b/docker/training/dockerfile.tf
@@ -80,7 +80,7 @@ ENV PYTHONPATH=$PYTHONPATH:/transformers4rec
 
 # Install Models
 RUN git clone https://github.com/NVIDIA-Merlin/Models.git /models/ && \
-    cd /models/ && git checkout ${MODELS_VER} && pip install -e . --no-deps
+    cd /models/ && git checkout ${MODELS_VER} && pip install . --no-deps
 ENV PYTHONPATH=$PYTHONPATH:/models
 
 # Add Merlin Repo
diff --git a/docker/training/dockerfile.torch b/docker/training/dockerfile.torch
index e8f2ab2db..ecfa1b913 100644
--- a/docker/training/dockerfile.torch
+++ b/docker/training/dockerfile.torch
@@ -52,7 +52,7 @@ RUN pip install git+https://github.com/rapidsai/asvdb.git@main
 
 # Install Merlin Core
 RUN git clone https://github.com/NVIDIA-Merlin/core.git /core/ && \
-    cd /core/ && git checkout ${CORE_VER} && pip install -e . --no-deps
+    cd /core/ && git checkout ${CORE_VER} && pip install . --no-deps
 ENV PYTHONPATH=$PYTHONPATH:/core
 
 # Install Merlin Systems
@@ -63,17 +63,17 @@ ENV PYTHONPATH=$PYTHONPATH:/systems
 # Install NVTabular
 ENV PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION='python'
 RUN git clone https://github.com/NVIDIA-Merlin/NVTabular.git /nvtabular/ && \
-    cd /nvtabular/ && git checkout ${NVTAB_VER} && pip install -e . --no-deps
+    cd /nvtabular/ && git checkout ${NVTAB_VER} && pip install . --no-deps
 ENV PYTHONPATH=$PYTHONPATH:/nvtabular
 
 # Install Transformers4Rec
 RUN git clone https://github.com/NVIDIA-Merlin/Transformers4Rec.git /transformers4rec && \
-    cd /transformers4rec/ && git checkout ${TF4REC_VER} && pip install -e . --no-deps
+    cd /transformers4rec/ && git checkout ${TF4REC_VER} && pip install . --no-deps
 ENV PYTHONPATH=$PYTHONPATH:/transformers4rec
 
 # Install Models
 RUN git clone https://github.com/NVIDIA-Merlin/Models.git /models/ && \
-    cd /models/ && git checkout ${MODELS_VER} && pip install -e . --no-deps
+    cd /models/ && git checkout ${MODELS_VER} && pip install . --no-deps
 ENV PYTHONPATH=$PYTHONPATH:/models
 
 # Add Merlin Repo

From 0b30e36dff54652a29ff907b2bd4db0d9cdf44a5 Mon Sep 17 00:00:00 2001
From: Julio Perez
Date: Fri, 13 May 2022 01:59:05 -0400
Subject: [PATCH 3/4] add keyring to fix bad key nvidia issue

---
Review notes:
  * The architecture choice is an explicit if/else instead of
    `test && curl A || curl B`, so a failed x86_64 download no longer
    falls through to the sbsa URL.
  * curl runs with -f so an HTTP error fails the RUN step instead of
    saving an error page as the .deb, and every step is joined with && so
    dpkg only runs after a successful download.
  * The added-line count is unchanged (9), so the hunk header and diffstat
    still match.

 docker/dockerfile.merlin | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/docker/dockerfile.merlin b/docker/dockerfile.merlin
index ef6659469..f197285ae 100644
--- a/docker/dockerfile.merlin
+++ b/docker/dockerfile.merlin
@@ -28,6 +28,15 @@ ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extra
 ENV PATH=${CUDA_HOME}/lib64/:${PATH}:${CUDA_HOME}/bin
 ENV PYTHONPATH=/usr/lib/python3.8/site-packages:$PYTHONPATH
 
+
+# Install the CUDA apt keyring; the repo path depends on the CPU architecture.
+# curl -f turns an HTTP error into a failed build step instead of a bad .deb.
+RUN if [ "$(uname -m)" = 'x86_64' ]; then cuda_arch=x86_64; else cuda_arch=sbsa; fi \
+    && curl -fsSL -o /tmp/cuda-keyring.deb \
+        "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/${cuda_arch}/cuda-keyring_1.0-1_all.deb" \
+    && dpkg -i /tmp/cuda-keyring.deb \
+    && rm /tmp/cuda-keyring.deb
+
 # Set up NVIDIA package repository
 RUN apt update -y --fix-missing && \
     apt install -y --no-install-recommends software-properties-common && \

From 57bbeeaaab6333aaf4fc3a62d4635b98a85444fe Mon Sep 17 00:00:00 2001
From: Julio Perez
Date: Wed, 18 May 2022 12:53:06 -0400
Subject: [PATCH 4/4] trying to move env call to see if it helps fix the
 install of distributed embeddings after sok install

---
Review notes:
  * Moves the merlin_sok egg PYTHONPATH entry from before the
    distributed-embeddings install step to after it; no other change.
  * NOTE(review): the egg path hard-codes merlin_sok 1.1.3, Python 3.8 and
    x86_64 -- confirm it still matches what the SOK build produces.

 docker/training/dockerfile.tf | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docker/training/dockerfile.tf b/docker/training/dockerfile.tf
index 5f6a4614f..053f83257 100644
--- a/docker/training/dockerfile.tf
+++ b/docker/training/dockerfile.tf
@@ -104,7 +104,6 @@ RUN if [ "$HUGECTR_DEV_MODE" == "false" ]; then \
         popd && \
         rm -rf build-env; \
     fi
-ENV PYTHONPATH=$PYTHONPATH:/usr/lib/python3.8/site-packages/merlin_sok-1.1.3-py3.8-linux-x86_64.egg
 
 # Install distributed-embeddings
 ARG INSTALL_DISTRIBUTED_EMBEDDINGS=true
@@ -114,6 +113,9 @@ RUN if [ "$INSTALL_DISTRIBUTED_EMBEDDINGS" == "true" ]; then \
         make pip_pkg && pip install artifacts/*.whl && make clean; \
     fi
 
+ENV PYTHONPATH=$PYTHONPATH:/usr/lib/python3.8/site-packages/merlin_sok-1.1.3-py3.8-linux-x86_64.egg
+
+
 # Clean up
 RUN rm -rf /repos
 RUN rm -rf /usr/local/share/jupyter/lab/staging/node_modules/marked