Skip to content

Commit

Permalink
CTR dockerfile for 23.06 (#1018)
Browse files Browse the repository at this point in the history
* Update the dockerfile for new upstream

* Updates to make build compatible with 23.05 base image.

* Simplified build process for HugeCTR training image.

* Tick base image up to 23.06, fix `tritonclient` dependency.

* Tick up base image version.

* Merge branch 'main' into fix-update_base_23.05

---------

Co-authored-by: qqiao <qqiao@nvidia.com>
Co-authored-by: Karl Higley <karlb@nvidia.com>
Co-authored-by: Julio Perez <37191411+jperez999@users.noreply.github.com>
  • Loading branch information
4 people authored Jul 6, 2023
1 parent b022d97 commit 031d560
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 53 deletions.
29 changes: 6 additions & 23 deletions docker/dockerfile.ctr
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# syntax=docker/dockerfile:1.2
ARG MERLIN_VERSION=22.12
ARG TRITON_VERSION=22.11
ARG MERLIN_VERSION=23.06
ARG TRITON_VERSION=23.06

ARG BASE_IMAGE=nvcr.io/nvstaging/merlin/merlin-base:${MERLIN_VERSION}

Expand All @@ -9,16 +9,6 @@ FROM ${BASE_IMAGE} as base
ARG HUGECTR_VER=main
ARG HUGECTR_BACKEND_VER=main

# Envs
ENV CUDA_SHORT_VERSION=11.6
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib:/repos/dist/lib
ENV CUDA_HOME=/usr/local/cuda
ENV CUDA_PATH=$CUDA_HOME
ENV CUDA_CUDA_LIBRARY=${CUDA_HOME}/lib64/stubs
ENV PATH=${CUDA_HOME}/lib64/:${PATH}:${CUDA_HOME}/bin
ENV PATH=$PATH:/usr/lib/x86_64-linux-gnu/
RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1

RUN pip install --no-cache-dir --upgrade notebook ipython
RUN pip install --no-cache-dir mpi4py

Expand All @@ -29,12 +19,11 @@ RUN cd /opt/hpcx/ompi/include/openmpi/opal/mca/hwloc/hwloc201 && rm -rfv hwloc20
RUN mkdir -p /var/tmp && wget -q -nc --no-check-certificate -P /var/tmp https://download.open-mpi.org/release/hwloc/v2.4/hwloc-${HWLOC_VER}.tar.gz && \
mkdir -p /var/tmp && tar -x -f /var/tmp/hwloc-${HWLOC_VER}.tar.gz -C /var/tmp && \
cd /var/tmp/hwloc-${HWLOC_VER} && \
./configure CPPFLAGS="-I/usr/local/cuda/include/ -L/usr/local/cuda/lib64/" LDFLAGS="-L/usr/local/cuda/lib64" --enable-cuda && \
./configure CPPFLAGS="-I${CUDA_HOME}/include/ -L${CUDA_HOME}/lib64/" LDFLAGS="-L${CUDA_HOME}/lib64" --enable-cuda && \
make -j$(nproc) && make install && \
rm -rf /var/tmp/hwloc-${HWLOC_VER} /var/tmp/hwloc-${HWLOC_VER}.tar.gz



# -----------------------------------------------------------------------------
# HugeCTR + Dependencies

Expand Down Expand Up @@ -62,21 +51,18 @@ RUN ln -s /usr/lib/libcudf.so /usr/lib/libcudf_base.so
RUN ln -s /usr/lib/libcudf.so /usr/lib/libcudf_io.so
RUN ln -s /usr/lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so

RUN rm -rf /usr/lib/x86_64-linux-gnu/libibverbs.so && \
ln -s /usr/lib/x86_64-linux-gnu/libibverbs.so.1.14.36.0 /usr/lib/x86_64-linux-gnu/libibverbs.so

# Install HugeCTR
ARG HUGECTR_HOME=/usr/local/hugectr
RUN if [[ "${HUGECTR_DEV_MODE}" == "false" ]]; then \
rm -rf /usr/local/hugectr/lib/libgmock* /usr/local/hugectr/lib/pkgconfig/gmock* /usr/local/hugectr/include/gmock && \
rm -rf /usr/local/hugectr/lib/libgtest* /usr/local/hugectr/lib/pkgconfig/gtest* /usr/local/hugectr/include/gtest && \
rm -rf ${HUGECTR_HOME}/lib/libgmock* ${HUGECTR_HOME}/lib/pkgconfig/gmock* ${HUGECTR_HOME}/include/gmock && \
rm -rf ${HUGECTR_HOME}/lib/libgtest* ${HUGECTR_HOME}/lib/pkgconfig/gtest* ${HUGECTR_HOME}/include/gtest && \
git clone --branch ${HUGECTR_VER} --depth 1 https://${_CI_JOB_TOKEN}${_HUGECTR_REPO} /hugectr && \
cd /hugectr && \
git submodule update --init --recursive && \
mkdir build && \
cd build && \
LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs/:$LD_LIBRARY_PATH && \
export PATH=$PATH:/usr/local/cuda-${CUDA_SHORT_VERSION}/compat && \
export PATH=$PATH:/usr/local/cuda-$(echo $CUDA_VERSION | awk -F'.' '{print $1"."$2}')/compat && \
if [[ "${INSTALL_HDFS}" == "false" ]]; then \
cmake -DCMAKE_BUILD_TYPE=Release -DSM="60;61;70;75;80;90" -DENABLE_MULTINODES=ON .. \
; else \
Expand Down Expand Up @@ -119,9 +105,6 @@ RUN if [ "${HUGECTR_DEV_MODE}" == "false" ]; then \
; fi
RUN ln -s ${HUGECTR_HOME}/backends/hugectr /opt/tritonserver/backends/hugectr

# Remove fake lib
RUN rm /usr/local/cuda/lib64/stubs/libcuda.so.1

# Clean up
RUN rm -rf /usr/local/share/jupyter/lab/staging/node_modules/marked
RUN rm -rf /usr/local/share/jupyter/lab/staging/node_modules/node-fetch
53 changes: 23 additions & 30 deletions docker/dockerfile.merlin
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# syntax=docker/dockerfile:1.2
ARG TRITON_VERSION=23.03
ARG DLFW_VERSION=23.03
ARG TRITON_VERSION=23.06
ARG DLFW_VERSION=23.06

ARG FULL_IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-py3
ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-py3-min
Expand Down Expand Up @@ -40,10 +40,10 @@ ENV PATH=${CUDA_HOME}/lib64/:${PATH}:${CUDA_HOME}/bin
# Set up NVIDIA package repository
RUN apt clean && apt update -y --fix-missing && \
apt install -y --no-install-recommends software-properties-common && \
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin && \
mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" && \
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin && \
mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \
add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /" && \
apt install -y --no-install-recommends \
autoconf \
automake \
Expand Down Expand Up @@ -88,20 +88,18 @@ RUN ln -s /usr/bin/python3 /usr/bin/python
# A fix has already been merged but not yet released:
# https://gitlab.kitware.com/cmake/cmake/-/merge_requests/7859
# 2023-02-22: pynvml==11.5.0 is currently incompatible with our version of dask/distributed
# tritonclient[all]==2.29.0: latest tritonclient removes the perf_* binaries, so specified to version 2.29.0
#cupy-cuda12x

RUN pip install --no-cache-dir --upgrade pip; pip install --no-cache-dir "cmake<3.25.0" ninja scikit-build pandas==1.5.2 \
fastrlock nvidia-pyindex pybind11 pytest \
transformers==4.12 tensorflow-metadata betterproto \
transformers tensorflow-metadata betterproto \
cachetools graphviz nvtx scipy "scikit-learn<1.2" \
tritonclient[all]==2.29.0 grpcio-channelz fiddle wandb npy-append-array \
tritonclient[all] grpcio-channelz fiddle wandb npy-append-array \
git+https://github.com/rapidsai/asvdb.git@main \
xgboost==1.6.2 lightgbm treelite==2.4.0 treelite_runtime==2.4.0 \
lightfm implicit \
numba "cuda-python>=11.5,<12.0" fsspec==2022.5.0 llvmlite \
pynvml==11.4.1
RUN pip install --no-cache-dir numpy==1.22.4 protobuf==3.20.3 onnx onnxruntime==1.11.1 pycuda
RUN pip install --no-cache-dir numpy==1.22.4 protobuf==3.20.3 onnx onnxruntime pycuda
RUN pip install --no-cache-dir dask==${DASK_VER} distributed==${DASK_VER} dask[dataframe]==${DASK_VER}
RUN pip install --no-cache-dir onnx_graphsurgeon --index-url https://pypi.ngc.nvidia.com

Expand All @@ -125,21 +123,14 @@ ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/tritonserver/lib
# don't include it https://github.com/kyamagu/faiss-wheels/issues/54)
RUN git clone --branch v1.7.2 https://github.com/facebookresearch/faiss.git build-env && \
pushd build-env && \
cmake -B build . -DFAISS_ENABLE_GPU=ON -DFAISS_ENABLE_PYTHON=ON -DBUILD_TESTING=OFF -DCMAKE_BUILD_TYPE=Release -DCMAKE_CUDA_ARCHITECTURES="60;70;80" && \
cmake -B build . -DFAISS_ENABLE_GPU=ON -DFAISS_ENABLE_PYTHON=ON -DBUILD_TESTING=OFF -DCMAKE_BUILD_TYPE=Release -DCMAKE_CUDA_ARCHITECTURES="60;70;80;90" && \
make -C build -j $(nproc) faiss swigfaiss && \
pushd build/faiss/python && \
python setup.py install && \
popd && \
popd && \
rm -rf build-env

# Install spdlog
RUN git clone --branch v1.9.2 https://github.com/gabime/spdlog.git build-env && \
pushd build-env && \
mkdir build && cd build && cmake .. && make -j && make install && \
popd && \
rm -rf build-env

# Clean up
RUN rm -rf /repos

Expand All @@ -159,10 +150,10 @@ ENV PATH=${CUDA_HOME}/lib64/:${PATH}:${CUDA_HOME}/bin
# Set up NVIDIA package repository
RUN apt update -y --fix-missing && \
apt install -y --no-install-recommends software-properties-common && \
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin && \
mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" && \
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin && \
mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \
add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /" && \
apt install -y --no-install-recommends \
ca-certificates \
clang-format \
Expand Down Expand Up @@ -217,9 +208,6 @@ RUN ln -s /usr/bin/python3 /usr/bin/python
ENV JAVA_HOME=/usr/lib/jvm/default-java
ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${JAVA_HOME}/lib:${JAVA_HOME}/lib/server

# Includes
COPY --chown=1000:1000 --from=build /usr/local/include/spdlog/ /usr/local/include/spdlog/

# Binaries
COPY --chown=1000:1000 --from=build /usr/local/bin/cmake /usr/local/bin/
COPY --chown=1000:1000 --from=build /usr/local/bin/pytest /usr/local/bin/
Expand All @@ -245,9 +233,11 @@ COPY --chown=1000:1000 --from=triton /usr/local/cuda-12.1/targets/x86_64-linux/l
ENV PATH=/opt/tritonserver/bin:${PATH}:
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/tritonserver/lib

ENV PYTHON_VERSION=3.10

# Python Packages
COPY --chown=1000:1000 --from=build /usr/local/lib/python3.8/dist-packages /usr/local/lib/python3.8/dist-packages/
ENV PYTHONPATH=$PYTHONPATH:/usr/local/lib/python3.8/dist-packages/
COPY --chown=1000:1000 --from=build /usr/local/lib/python${PYTHON_VERSION}/dist-packages /usr/local/lib/python${PYTHON_VERSION}/dist-packages/
ENV PYTHONPATH=$PYTHONPATH:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/


# rapids components from the DLFW image
Expand All @@ -257,16 +247,19 @@ COPY --chown=1000:1000 --from=dlfw /usr/lib/libparquet* /usr/lib/
COPY --chown=1000:1000 --from=dlfw /usr/lib/cmake/Arrow /usr/lib/cmake/Arrow/
COPY --chown=1000:1000 --from=dlfw /usr/lib/cmake/Parquet /usr/lib/cmake/Parquet/
COPY --chown=1000:1000 --from=dlfw /usr/lib/libnvcomp* /usr/lib/

COPY --chown=1000:1000 --from=dlfw /usr/include/fmt /usr/include/fmt/
COPY --chown=1000:1000 --from=dlfw /usr/include/spdlog /usr/include/spdlog/
COPY --chown=1000:1000 --from=dlfw /usr/include/rmm /usr/include/rmm/
COPY --chown=1000:1000 --from=dlfw /usr/include/parquet /usr/include/parquet/
COPY --chown=1000:1000 --from=dlfw /usr/include/arrow /usr/include/arrow/
COPY --chown=1000:1000 --from=dlfw /usr/include/cudf /usr/include/cudf/
COPY --chown=1000:1000 --from=dlfw /usr/include/rmm /usr/include/rmm/

# ptx compiler required by cubinlinker
COPY --chown=1000:1000 --from=dlfw /usr/local/cuda-12.1/targets/x86_64-linux/lib/libnvptxcompiler_static.a /usr/local/cuda-12.1/targets/x86_64-linux/lib/libnvptxcompiler_static.a
COPY --chown=1000:1000 --from=dlfw /usr/local/cuda-12.1/targets/x86_64-linux/include/nvPTXCompiler.h /usr/local/cuda-12.1/targets/x86_64-linux/include/nvPTXCompiler.h
RUN git clone https://github.com/rapidsai/ptxcompiler.git /ptx && cd /ptx/ && python setup.py develop;

ARG PYTHON_VERSION=3.8
COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/rmm /usr/local/lib/python${PYTHON_VERSION}/dist-packages/rmm
COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cuda /usr/local/lib/python${PYTHON_VERSION}/dist-packages/cuda
COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/pyarrow /usr/local/lib/python${PYTHON_VERSION}/dist-packages/pyarrow
Expand Down

0 comments on commit 031d560

Please sign in to comment.