-
Notifications
You must be signed in to change notification settings - Fork 40
/
Dockerfile-default-cuda
82 lines (65 loc) · 2.65 KB
/
Dockerfile-default-cuda
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# The base image is supplied at build time (--build-arg BASE_IMAGE=...); no
# default is declared here.
ARG BASE_IMAGE
FROM ${BASE_IMAGE}
# Helper scripts and requirement files referenced by the RUN steps below.
COPY dockerfile_scripts /tmp/det_dockerfile_scripts
RUN /tmp/det_dockerfile_scripts/install_google_cloud_sdk.sh
# Optional NCCL build, enabled when the WITH_NCCL build argument is non-empty.
ARG WITH_NCCL
# When enabled, point Horovod at the NCCL under /container/nccl (hopefully the
# newer one built below) and request shared linking. Both variables expand to
# an empty value when WITH_NCCL is unset.
ENV HOROVOD_NCCL_HOME=${WITH_NCCL:+"/container/nccl"} \
    HOROVOD_NCCL_LINK=${WITH_NCCL:+SHARED}
RUN if [ -n "${WITH_NCCL}" ]; then \
        /tmp/det_dockerfile_scripts/build_nccl.sh; \
    fi
# OFI support: the LD_LIBRARY_PATH change below applies when WITH_OFI is non-empty.
ARG WITH_OFI
# Install prefixes. No visible instruction references them; as build arguments
# they are present in the environment of later RUN steps, so presumably the
# build scripts read them (TODO confirm).
ARG UCX_INSTALL_DIR=/container/ucx
ARG OMPI_INSTALL_DIR=/container/ompi
ARG OFI_INSTALL_DIR=/container/ofi
# NOTE: when WITH_NCCL is unset, HOROVOD_NCCL_HOME is empty and this resolves to "/lib".
ENV NCCL_LIB_DIR=${HOROVOD_NCCL_HOME}/lib
# Prepend the NCCL library directory to the loader search path.
# This stays a separate ENV instruction because it reads NCCL_LIB_DIR, which
# must already be defined by the instruction above when this line is expanded.
ENV LD_LIBRARY_PATH=${WITH_OFI:+$NCCL_LIB_DIR:}$LD_LIBRARY_PATH
# Framework packages. Each *_PIP build argument holds the pip requirement(s)
# to install; an empty or unset value skips that framework.
ARG TENSORFLOW_PIP
ARG TORCH_PIP
ARG TORCHVISION_PIP
# HOROVOD_WITH_TENSORFLOW=1 is exported only for the pip commands inside this
# RUN. A variable exported in a RUN disappears when that shell exits, so it
# cannot configure the Horovod build later in the file; that step takes the
# value from the HOROVOD_WITH_TENSORFLOW build argument instead.
# The *_PIP values are expanded unquoted, so a value holding several
# space-separated arguments is split into separate pip arguments.
RUN if [ "$TENSORFLOW_PIP" ]; then \
        export HOROVOD_WITH_TENSORFLOW=1 \
        && python -m pip install $TENSORFLOW_PIP \
        && python -m pip install -r /tmp/det_dockerfile_scripts/additional-requirements-tf.txt; \
    fi
RUN if [ "$TORCH_PIP" ]; then \
        python -m pip install $TORCH_PIP \
        && python -m pip install -r /tmp/det_dockerfile_scripts/additional-requirements-torch.txt; \
    fi
RUN if [ "$TORCHVISION_PIP" ]; then python -m pip install $TORCHVISION_PIP; fi
# Optional compatibility symlink: when TF_CUDA_SYM is non-empty, expose the
# CUDA libcusolver.so.11 under the name libcusolver.so.10 inside the installed
# tensorflow package directory.
ARG TF_CUDA_SYM
# The site-packages directory is queried from the same `python` the pip steps
# use instead of hard-coding /opt/conda/lib/python3.8, so the link lands in the
# right place when the base image ships a different Python version.
RUN if [ "$TF_CUDA_SYM" ]; then \
        site_packages="$(python -c 'import sysconfig; print(sysconfig.get_paths()["platlib"])')" \
        && ln -s /usr/local/cuda/lib64/libcusolver.so.11 \
            "${site_packages}/tensorflow/python/libcusolver.so.10"; \
    fi
# Apex install step; runs unconditionally.
# TORCH_CUDA_ARCH_LIST and APEX_GIT are not referenced by any instruction here;
# as build arguments they are present in the environment of the script below,
# which presumably reads them (TODO confirm against install_apex.sh).
ARG TORCH_CUDA_ARCH_LIST
ARG APEX_GIT
RUN /tmp/det_dockerfile_scripts/install_apex.sh
# AWS plugin build settings. Only WITH_OFI and WITH_AWS_TRACE are passed to the
# script as arguments; the remaining build arguments are merely present in its
# environment.
ARG AWS_PLUGIN_INSTALL_DIR=/container/aws
ARG WITH_AWS_TRACE
ARG INTERNAL_AWS_DS
ARG INTERNAL_AWS_PATH
# The plugin build runs only when WITH_OFI is exactly "1".
RUN if [ "$WITH_OFI" = "1" ]; then /tmp/det_dockerfile_scripts/build_aws.sh "$WITH_OFI" "$WITH_AWS_TRACE"; fi
# Prepend the plugin library directory to the loader search path.
# NOTE(review): this expansion applies for ANY non-empty WITH_OFI, while the
# build above requires the value "1"; e.g. WITH_OFI=0 adds the directory
# without building the plugin. Confirm this is intended.
ARG AWS_LIB_DIR=${AWS_PLUGIN_INSTALL_DIR}/lib
ENV LD_LIBRARY_PATH=${WITH_OFI:+$AWS_LIB_DIR:}$LD_LIBRARY_PATH
# Horovod build configuration. A declared ARG is available as an environment
# variable to the RUN steps that follow it, which is how these settings reach
# the pip build below; no explicit export is needed (and one made in its own
# RUN would not survive that layer's shell anyway).
ARG HOROVOD_WITH_TENSORFLOW
ARG HOROVOD_PIP=""
ARG HOROVOD_WITH_PYTORCH=1
ARG HOROVOD_WITHOUT_MXNET=1
ARG HOROVOD_WITH_MPI
ARG HOROVOD_CPU_OPERATIONS
ARG HOROVOD_GPU_OPERATIONS
ARG HOROVOD_GPU_ALLREDUCE
ARG HOROVOD_WITHOUT_MPI
# Pinned cmake and protobuf; installed whether or not Horovod is requested.
RUN pip install cmake==3.22.4 protobuf==3.20.3
# Horovod is installed only when HOROVOD_PIP is non-empty. The CUDA stubs
# directory is added to the linker cache for the duration of the install
# (presumably so the build can link against the stub CUDA libraries, since no
# driver is present at image build time), and the trailing plain ldconfig
# rebuilds the cache without it.
RUN if [ -n "${HOROVOD_PIP}" ]; then \
        ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs \
        && pip install "$HOROVOD_PIP" \
        && ldconfig; \
    fi
# Requirements installed regardless of which frameworks were selected above.
RUN python -m pip install -r /tmp/det_dockerfile_scripts/additional-requirements.txt
# numpy 2.x breaks tensorflow. Using NGC recommended version.
# Placed after the requirements install so this pin replaces whatever numpy
# version the earlier steps left in place.
RUN pip install numpy==1.24.4
# DeepSpeed is optional: the install script runs only when the DEEPSPEED_PIP
# build argument is non-empty.
ARG DEEPSPEED_PIP
RUN if [ -n "$DEEPSPEED_PIP" ]; then \
        /tmp/det_dockerfile_scripts/install_deepspeed.sh; \
    fi
# Final cleanup: delete the copied build scripts and every other non-hidden
# entry under /tmp (the glob does not match dotfiles).
RUN rm -r /tmp/*