From 91429ef25733b1cfb68726ee5f830ffddb25bd42 Mon Sep 17 00:00:00 2001 From: Sirut Buasai <73297481+sirutBuasai@users.noreply.github.com> Date: Thu, 14 Nov 2024 14:23:19 -0800 Subject: [PATCH 1/6] [PyTorch][Training][EC2] PyTorch 2.5.1 Currency Release (#4413) * [PyTorch][Training][EC2] PyTorch 2.5.1 Currency Release * upgrade packages * add ec2 tests * add py allowlist * fix py allowlist * test only ec2 * cat training logs * build efa 1.36 * efa test * add shm volume * run all ec2 * remove cat result * revert toml --- pytorch/training/buildspec-2-5-ec2.yml | 66 +++ pytorch/training/buildspec.yml | 2 +- .../training/docker/2.5/py3/Dockerfile.cpu | 337 ++++++++++++ .../Dockerfile.ec2.cpu.py_scan_allowlist.json | 3 + .../Dockerfile.ec2.gpu.py_scan_allowlist.json | 3 + .../docker/2.5/py3/cu124/Dockerfile.gpu | 485 ++++++++++++++++++ .../test_performance_pytorch_training.py | 2 +- test/dlc_tests/conftest.py | 3 +- ..._pytorch_training_performance_gpu_imagenet | 2 + ..._pytorch_training_performance_gpu_inductor | 8 +- .../dlc_tests/container_tests/bin/efa/testEFA | 5 +- .../bin/gluonnlp_tests/testNLP | 2 + .../bin/pytorch_tests/testNVApex | 4 +- .../bin/pytorch_tests/testPyTorch | 2 + .../bin/pytorch_tests/testPyTorchNcclVersion | 2 + .../bin/pytorch_tests/testPyTorchRegression | 5 + .../bin/pytorch_tests/testPyTorchwithInductor | 2 + .../bin/pytorch_tests/testTorchdata | 2 + test/dlc_tests/container_tests/bin/testCurand | 2 + test/dlc_tests/container_tests/bin/testMXNet | 2 + test/dlc_tests/container_tests/bin/testOpenCV | 6 +- .../container_tests/bin/testPip3Install | 2 + .../container_tests/bin/testPipInstall | 2 + .../dlc_tests/container_tests/bin/testSmdebug | 2 + .../container_tests/bin/testSmprofiler | 2 + test/dlc_tests/container_tests/bin/testTF1HVD | 3 + test/dlc_tests/container_tests/bin/testTF2HVD | 3 + .../container_tests/bin/testTFAddons | 2 + .../container_tests/bin/testTFKerasHVDAMP | 2 + .../container_tests/bin/testTFKerasHVDFP32 | 2 + .../container_tests/bin/testTensorBoard | 4 +- .../container_tests/bin/testTensorFlow | 4 +- .../training/test_pytorch_training_2_5.py | 132 +++++ test/dlc_tests/ec2/test_efa.py | 2 +- 34 files changed, 1090 insertions(+), 17 deletions(-) create mode 100644 pytorch/training/buildspec-2-5-ec2.yml create mode 100644 pytorch/training/docker/2.5/py3/Dockerfile.cpu create mode 100644 pytorch/training/docker/2.5/py3/Dockerfile.ec2.cpu.py_scan_allowlist.json create mode 100644 pytorch/training/docker/2.5/py3/cu124/Dockerfile.ec2.gpu.py_scan_allowlist.json create mode 100644 pytorch/training/docker/2.5/py3/cu124/Dockerfile.gpu create mode 100644 test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_5.py diff --git a/pytorch/training/buildspec-2-5-ec2.yml b/pytorch/training/buildspec-2-5-ec2.yml new file mode 100644 index 000000000000..f79557bbdf12 --- /dev/null +++ b/pytorch/training/buildspec-2-5-ec2.yml @@ -0,0 +1,66 @@ +account_id: &ACCOUNT_ID +prod_account_id: &PROD_ACCOUNT_ID 763104351884 +region: ®ION +framework: &FRAMEWORK pytorch +version: &VERSION 2.5.1 +short_version: &SHORT_VERSION "2.5" +arch_type: x86 +# autopatch_build: "True" + +repository_info: + training_repository: &TRAINING_REPOSITORY + image_type: &TRAINING_IMAGE_TYPE training + root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ] + repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ] + repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ] + release_repository_name: &RELEASE_REPOSITORY_NAME 
!join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ] + release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ] + +context: + training_context: &TRAINING_CONTEXT + changehostname: + source: docker/build_artifacts/changehostname.c + target: changehostname.c + start_with_right_hostname: + source: docker/build_artifacts/start_with_right_hostname.sh + target: start_with_right_hostname.sh + example_mnist_file: + source: docker/build_artifacts/mnist.py + target: mnist.py + deep_learning_container: + source: ../../src/deep_learning_container.py + target: deep_learning_container.py + +images: + BuildEC2CPUPTTrainPy3DockerImage: + <<: *TRAINING_REPOSITORY + build: &PYTORCH_CPU_TRAINING_PY3 false + image_size_baseline: 6500 + device_type: &DEVICE_TYPE cpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py311 + os_version: &OS_VERSION ubuntu22.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] + latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] + # build_tag_override: "beta:2.5.0-cpu-py311-ubuntu22.04-ec2" + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] + target: ec2 + context: + <<: *TRAINING_CONTEXT + BuildEC2GPUPTTrainPy3cu121DockerImage: + <<: *TRAINING_REPOSITORY + build: &PYTORCH_GPU_TRAINING_PY3 false + image_size_baseline: 19700 + device_type: &DEVICE_TYPE gpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py311 + cuda_version: &CUDA_VERSION cu124 + os_version: &OS_VERSION ubuntu22.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] + latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] + # build_tag_override: "beta:2.5.0-gpu-py311-cu121-ubuntu22.04-ec2" + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., + *DEVICE_TYPE ] + target: ec2 + context: + <<: *TRAINING_CONTEXT diff --git a/pytorch/training/buildspec.yml b/pytorch/training/buildspec.yml index 2c35946a030d..767e9a8e9bab 100644 --- a/pytorch/training/buildspec.yml +++ b/pytorch/training/buildspec.yml @@ -1 +1 @@ -buildspec_pointer: buildspec-2-4-sm.yml +buildspec_pointer: buildspec-2-5-ec2.yml diff --git a/pytorch/training/docker/2.5/py3/Dockerfile.cpu b/pytorch/training/docker/2.5/py3/Dockerfile.cpu new file mode 100644 index 000000000000..bfa7c55a381f --- /dev/null +++ b/pytorch/training/docker/2.5/py3/Dockerfile.cpu @@ -0,0 +1,337 @@ +ARG PYTHON=python3 +ARG PYTHON_VERSION=3.11.9 +ARG PYTHON_SHORT_VERSION=3.11 +ARG PYTORCH_VERSION=2.5.1 + +ARG MINIFORGE3_VERSION=24.9.0-0 +ARG OPEN_MPI_VERSION=4.1.7 + +# PyTorch Binaries +ARG TORCH_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/pytorch/v2.5.1/x86/cpu/torch-2.5.1%2Bcpu-cp311-cp311-linux_x86_64.whl +ARG TORCHVISION_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/pytorch/v2.5.1/x86/cpu/torchvision-0.20.1%2Bcpu-cp311-cp311-linux_x86_64.whl +ARG TORCHAUDIO_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/pytorch/v2.5.1/x86/cpu/torchaudio-2.5.1%2Bcpu-cp311-cp311-linux_x86_64.whl +ARG TORCHTEXT_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/pytorch/v2.5.1/x86/cpu/torchtext-0.18.0%2Bcpu-cp311-cp311-linux_x86_64.whl + +FROM ubuntu:22.04 AS base_image 
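# Layout note: the stages below form a chain (base_image -> common -> ec2 / sagemaker),
# and the buildspec's target: ec2 field selects which final stage is built. A rough
# local equivalent (tag name is illustrative; assumes the context artifacts listed in
# the buildspec, e.g. deep_learning_container.py, are staged into the build context):
#   docker build --target ec2 -f Dockerfile.cpu -t pt-train:2.5.1-cpu-py311-ec2 .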
+ +# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 20 +ENV DEBIAN_FRONTEND=noninteractive +ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" + +RUN apt-get update \ + && apt-get upgrade -y \ + && apt-get autoremove -y \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +################################################################# +# ____ +# / ___| ___ _ __ ___ _ __ ___ ___ _ __ +# | | / _ \| '_ ` _ \| '_ ` _ \ / _ \| '_ \ +# | |___ (_) | | | | | | | | | | | (_) | | | | +# \____|\___/|_| |_| |_|_| |_| |_|\___/|_| |_| +# ___ ____ _ +# |_ _|_ __ ___ __ _ __ _ ___ | _ \ ___ ___(_)_ __ ___ +# | || '_ ` _ \ / _` |/ _` |/ _ \ | |_) / _ \/ __| | '_ \ / _ \ +# | || | | | | | (_| | (_| | __/ | _ < __/ (__| | |_) | __/ +# |___|_| |_| |_|\__,_|\__, |\___| |_| \_\___|\___|_| .__/ \___| +# |___/ |_| +################################################################# + +FROM base_image AS common + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +ARG PYTHON +ARG PYTHON_VERSION + +ARG MINIFORGE3_VERSION +ARG OPEN_MPI_VERSION + +ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" +ENV LD_LIBRARY_PATH="/opt/conda/lib:${LD_LIBRARY_PATH}" +ENV LD_LIBRARY_PATH="/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" +ENV PATH="/opt/conda/bin:${PATH}" + +# Python won’t try to write .pyc or .pyo files on the import of source modules +# Force stdin, stdout and stderr to be totally unbuffered. Good for logging +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV PYTHONIOENCODING=UTF-8 +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 + +ENV DLC_CONTAINER_TYPE=training +WORKDIR / + +RUN apt-get update \ + && apt-get -y upgrade --only-upgrade systemd \ + && apt-get install -y --no-install-recommends \ + automake \ + build-essential \ + ca-certificates \ + cmake \ + curl \ + emacs \ + git \ + jq \ + libcurl4-openssl-dev \ + libglib2.0-0 \ + libgl1-mesa-glx \ + libsm6 \ + libssl-dev \ + libxext6 \ + libxrender-dev \ + zlib1g-dev \ + unzip \ + vim \ + wget \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Install Open MPI +RUN wget https://www.open-mpi.org/software/ompi/v4.1/downloads/openmpi-${OPEN_MPI_VERSION}.tar.gz \ + && gunzip -c openmpi-${OPEN_MPI_VERSION}.tar.gz | tar xf - \ + && cd openmpi-${OPEN_MPI_VERSION} \ + && ./configure --prefix=/home/.openmpi \ + && make all install \ + && cd .. \ + && rm openmpi-${OPEN_MPI_VERSION}.tar.gz \ + && rm -rf openmpi-${OPEN_MPI_VERSION} + +# The ENV variables declared below are changed in the previous section +# Grouping these ENV variables in the first section causes +# ompi_info to fail. 
This is only observed in CPU containers +ENV PATH="/home/.openmpi/bin:${PATH}" +ENV LD_LIBRARY_PATH="/home/.openmpi/lib:${LD_LIBRARY_PATH}" +RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value + +# Install OpenSSH for MPI to communicate between containers, allow OpenSSH to talk to containers without asking for confirmation +RUN apt-get update \ + && apt-get install -y --no-install-recommends openssh-client openssh-server \ + && mkdir -p /var/run/sshd \ + && cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \ + && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \ + && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Configure OpenSSH so that nodes can communicate with each other +RUN mkdir -p /var/run/sshd \ + && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd + +RUN rm -rf /root/.ssh/ \ + && mkdir -p /root/.ssh/ \ + && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ + && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ + && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config + +# For conda ssl verification +ENV REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt +RUN curl -L -o ~/miniforge3.sh https://github.com/conda-forge/miniforge/releases/download/${MINIFORGE3_VERSION}/Miniforge3-${MINIFORGE3_VERSION}-Linux-x86_64.sh \ + && chmod +x ~/miniforge3.sh \ + && ~/miniforge3.sh -b -p /opt/conda \ + && rm ~/miniforge3.sh + +RUN pip install --no-cache-dir --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org \ + && ln -s /opt/conda/bin/pip /usr/local/bin/pip3 + +# Install common conda packages +RUN /opt/conda/bin/mamba install -y -c conda-forge \ + python=$PYTHON_VERSION \ + cython \ + cryptography \ + pyopenssl \ + pybind11 \ + cmake \ + curl \ + libcurl \ + mkl \ + mkl-include \ + parso \ + typing \ + charset-normalizer \ + packaging \ + boto3 \ + pyyaml \ + numpy \ + scipy \ + click \ + psutil \ + ipython \ + ipykernel \ + pillow \ + h5py \ + fsspec \ + "idna>=3.7" \ + "tqdm>=4.66.3" \ + "requests>=2.32.0" \ + "setuptools>=70.0.0" \ + "urllib3<2" \ + "awscli<2" \ + && /opt/conda/bin/mamba clean -afy \ + && rm -rf /etc/apt/sources.list.d/* + +# Install common pip packages (in case of conda package is not available) +RUN pip install --no-cache-dir opencv-python mpi4py + +RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.4/license.txt + +COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py + +RUN chmod +x /usr/local/bin/deep_learning_container.py + +# Removing the cache as it is needed for security verification +RUN rm -rf /root/.cache | true + +######################################################## +# _____ ____ ____ ___ +# | ____/ ___|___ \ |_ _|_ __ ___ __ _ __ _ ___ +# | _|| | __) | | || '_ ` _ \ / _` |/ _` |/ _ \ +# | |__| |___ / __/ | || | | | | | (_| | (_| | __/ +# |_____\____|_____| |___|_| |_| |_|\__,_|\__, |\___| +# |___/ +# ____ _ +# | _ \ ___ ___(_)_ __ ___ +# | |_) / _ \/ __| | '_ \ / _ \ +# | _ < __/ (__| | |_) | __/ +# |_| \_\___|\___|_| .__/ \___| +# |_| +######################################################## + +FROM common AS ec2 + +ARG PYTHON +ARG TORCH_URL +ARG TORCHVISION_URL +ARG TORCHAUDIO_URL +ARG TORCHTEXT_URL + +WORKDIR / + +# Install PyTorch +RUN pip install --no-cache-dir -U \ + ${TORCH_URL} \ + ${TORCHVISION_URL} \ + ${TORCHAUDIO_URL} \ + ${TORCHTEXT_URL} \ + torchtnt \ + s3torchconnector \ + fastai \ 
+    accelerate \
+    # pin numpy requirement for fastai dependency
+    # requires explicit declaration of spacy, thinc, blis
+    spacy \
+    thinc \
+    blis \
+    "numpy<2" \
+ && pip uninstall -y dataclasses
+
+RUN HOME_DIR=/root \
+ && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
+ && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
+ && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
+ && chmod +x /usr/local/bin/testOSSCompliance \
+ && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
+ && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
+ && rm -rf ${HOME_DIR}/oss_compliance* \
+ && rm -rf /tmp/tmp*
+
+# Removing the cache as it is needed for security verification
+RUN rm -rf /root/.cache | true
+
+# Starts framework
+CMD ["/bin/bash"]
+
+#################################################################
+#  ____                   __  __       _
+# / ___|  __ _  __ _  ___|  \/  | __ _| | _____ _ __
+# \___ \ / _` |/ _` |/ _ \ |\/| |/ _` | |/ / _ \ '__|
+#  ___) | (_| | (_| |  __/ |  | | (_| |   <  __/ |
+# |____/ \__,_|\__, |\___|_|  |_|\__,_|_|\_\___|_|
+#              |___/
+#  ___                              ____           _
+# |_ _|_ __ ___   __ _  __ _  ___  |  _ \ ___  ___(_)_ __   ___
+#  | || '_ ` _ \ / _` |/ _` |/ _ \ | |_) / _ \/ __| | '_ \ / _ \
+#  | || | | | | | (_| | (_| |  __/ |  _ <  __/ (__| | |_) |  __/
+# |___|_| |_| |_|\__,_|\__, |\___| |_| \_\___|\___|_| .__/ \___|
+#                      |___/                        |_|
+#################################################################
+
+FROM common AS sagemaker
+
+LABEL maintainer="Amazon AI"
+LABEL dlc_major_version="1"
+
+ARG PYTHON
+ARG TORCH_URL
+ARG TORCHVISION_URL
+ARG TORCHAUDIO_URL
+ARG TORCHTEXT_URL
+
+ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main
+
+WORKDIR /
+
+# Install PyTorch
+RUN pip install --no-cache-dir -U \
+    ${TORCH_URL} \
+    ${TORCHVISION_URL} \
+    ${TORCHAUDIO_URL} \
+    ${TORCHTEXT_URL} \
+    torchtnt \
+    s3torchconnector \
+    fastai \
+    accelerate \
+    # pin numpy requirement for fastai dependency
+    # requires explicit declaration of spacy, thinc, blis
+    spacy \
+    thinc \
+    blis \
+    "numpy<2" \
+ && pip uninstall -y dataclasses
+
+# Install SM packages
+RUN pip install --no-cache-dir -U \
+    smclarify \
+    "sagemaker>=2,<3" \
+    "sagemaker-experiments<1" \
+    sagemaker-pytorch-training \
+    sagemaker-training
+
+# Install extra packages
+RUN /opt/conda/bin/mamba install -y -c conda-forge \
+    bokeh \
+    imageio \
+    numba \
+    pandas \
+    plotly \
+    scikit-learn \
+    seaborn \
+    shap \
+    # pinned for sagemaker==2.232.2
+    "cloudpickle==2.2.1" \
+ && /opt/conda/bin/mamba clean -afy
+
+# Copy workaround script for incorrect hostname
+COPY changehostname.c /
+COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
+
+RUN chmod +x /usr/local/bin/start_with_right_hostname.sh
+
+RUN HOME_DIR=/root \
+ && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
+ && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
+ && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
+ && chmod +x /usr/local/bin/testOSSCompliance \
+ && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
+ && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
+ && rm -rf ${HOME_DIR}/oss_compliance* \
+ && rm -rf /tmp/tmp*
+
+# Removing the cache as it is needed for security verification
+RUN rm -rf /root/.cache | true
+
+ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"]
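# Note: Docker appends the CMD below to the ENTRYPOINT above, so
# start_with_right_hostname.sh receives "/bin/bash" as its argument, applies the
# hostname workaround, and then runs the requested command. A command passed at
# run time (for example, docker run <image> python mnist.py) still goes through
# the same wrapper.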
+CMD ["/bin/bash"] diff --git a/pytorch/training/docker/2.5/py3/Dockerfile.ec2.cpu.py_scan_allowlist.json b/pytorch/training/docker/2.5/py3/Dockerfile.ec2.cpu.py_scan_allowlist.json new file mode 100644 index 000000000000..6603ab58714e --- /dev/null +++ b/pytorch/training/docker/2.5/py3/Dockerfile.ec2.cpu.py_scan_allowlist.json @@ -0,0 +1,3 @@ +{ + "70612": "In Jinja2, the from_string function is prone to Server Side Template Injection (SSTI) where it takes the \"source\" parameter as a template object, renders it, and then returns it. The attacker can exploit it with {{INJECTION COMMANDS}} in a URI. \r\nNOTE: The maintainer and multiple third parties believe that this vulnerability isn't valid because users shouldn't use untrusted templates without sandboxing." +} diff --git a/pytorch/training/docker/2.5/py3/cu124/Dockerfile.ec2.gpu.py_scan_allowlist.json b/pytorch/training/docker/2.5/py3/cu124/Dockerfile.ec2.gpu.py_scan_allowlist.json new file mode 100644 index 000000000000..6603ab58714e --- /dev/null +++ b/pytorch/training/docker/2.5/py3/cu124/Dockerfile.ec2.gpu.py_scan_allowlist.json @@ -0,0 +1,3 @@ +{ + "70612": "In Jinja2, the from_string function is prone to Server Side Template Injection (SSTI) where it takes the \"source\" parameter as a template object, renders it, and then returns it. The attacker can exploit it with {{INJECTION COMMANDS}} in a URI. \r\nNOTE: The maintainer and multiple third parties believe that this vulnerability isn't valid because users shouldn't use untrusted templates without sandboxing." +} diff --git a/pytorch/training/docker/2.5/py3/cu124/Dockerfile.gpu b/pytorch/training/docker/2.5/py3/cu124/Dockerfile.gpu new file mode 100644 index 000000000000..15475e3b0e35 --- /dev/null +++ b/pytorch/training/docker/2.5/py3/cu124/Dockerfile.gpu @@ -0,0 +1,485 @@ +ARG PYTHON=python3 +ARG PYTHON_VERSION=3.11.10 +ARG PYTHON_SHORT_VERSION=3.11 +ARG PYTORCH_VERSION=2.5.1 + +ARG MINIFORGE3_VERSION=24.9.0-0 +ARG CUDA_VERSION=12.4.1 +ARG CUDNN_VERSION=9.1.0.70 +ARG NCCL_VERSION=2.23.4 +ARG EFA_VERSION=1.36.0 +ARG HWLOC_VERSION=2.11.2 +ARG AWS_OFI_NCCL_VERSION=1.12.1 +ARG GDRCOPY_VERSION=2.4.2 +ARG TE_VERSION=1.11 +ARG FLASH_ATTN_VERSION=2.6.3 + +# PyTorch Binaries +ARG TORCH_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/pytorch/v2.5.1/x86/cu124/torch-2.5.1%2Bcu124-cp311-cp311-linux_x86_64.whl +ARG TORCHVISION_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/pytorch/v2.5.1/x86/cu124/torchvision-0.20.1%2Bcu124-cp311-cp311-linux_x86_64.whl +ARG TORCHAUDIO_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/pytorch/v2.5.1/x86/cu124/torchaudio-2.5.1%2Bcu124-cp311-cp311-linux_x86_64.whl +ARG TORCHTEXT_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/pytorch/v2.4.0/x86/cu124/torchtext-0.18.0%2Bcu124-cp311-cp311-linux_x86_64.whl + +FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS base_image + +# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 20 +ENV DEBIAN_FRONTEND=noninteractive +ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" + +RUN apt-get update \ + && apt-get upgrade -y \ + && apt-get autoremove -y \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +################################################################# +# ____ +# / ___| ___ _ __ ___ _ __ ___ ___ _ __ +# | | / _ \| '_ ` _ \| '_ ` _ \ / _ \| '_ \ +# | |___ (_) | | | | | | | | | | | (_) | | | | +# \____|\___/|_| |_| |_|_| |_| |_|\___/|_| |_| +# ___ ____ _ +# |_ _|_ __ ___ __ _ __ _ ___ | _ \ ___ ___(_)_ __ ___ 
+# | || '_ ` _ \ / _` |/ _` |/ _ \ | |_) / _ \/ __| | '_ \ / _ \ +# | || | | | | | (_| | (_| | __/ | _ < __/ (__| | |_) | __/ +# |___|_| |_| |_|\__,_|\__, |\___| |_| \_\___|\___|_| .__/ \___| +# |___/ |_| +################################################################# + +FROM base_image AS common + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +ARG PYTHON +ARG PYTHON_VERSION +ARG PYTHON_SHORT_VERSION + +ARG MINIFORGE3_VERSION +ARG CUDA_VERSION +ARG CUDNN_VERSION +ARG NCCL_VERSION +ARG EFA_VERSION +ARG HWLOC_VERSION +ARG AWS_OFI_NCCL_VERSION + +ENV CUDA_HOME="/usr/local/cuda" +ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" +ENV LD_LIBRARY_PATH="/opt/conda/lib:${LD_LIBRARY_PATH}" +ENV LD_LIBRARY_PATH="/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" +ENV PATH="/opt/conda/bin:${PATH}" +ENV PATH="${CUDA_HOME}/bin:${PATH}" +ENV EFA_PATH="/opt/amazon/efa" +ENV OPEN_MPI_PATH="/opt/amazon/openmpi" + +# Python won’t try to write .pyc or .pyo files on the import of source modules +# Force stdin, stdout and stderr to be totally unbuffered. Good for logging +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV PYTHONIOENCODING=UTF-8 +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 +# older archs are not supported on CUDA12.1 like 3.7 for P2 instance +# 5.2 is G3 EC2 instance, 7.5 is G4*, 7.0 is p3*, 8.0 is P4*, 8.6 is G5* and 9.0 is P5* +ENV TORCH_CUDA_ARCH_LIST="5.2;7.0+PTX;7.5;8.0;8.6;9.0" +ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all" + +ENV DLC_CONTAINER_TYPE=training +WORKDIR / + +RUN apt-get update \ + && apt-get -y upgrade --only-upgrade systemd \ + && apt-get install -y --allow-change-held-packages --no-install-recommends \ + automake \ + build-essential \ + ca-certificates \ + cmake \ + curl \ + emacs \ + git \ + jq \ + libcurl4-openssl-dev \ + libglib2.0-0 \ + libgl1-mesa-glx \ + libsm6 \ + libssl-dev \ + libxext6 \ + libxrender-dev \ + zlib1g-dev \ + unzip \ + vim \ + wget \ + cuda-toolkit-12=${CUDA_VERSION}-1 \ + libcudnn9-cuda-12=${CUDNN_VERSION}-1 \ + libcudnn9-dev-cuda-12=${CUDNN_VERSION}-1 \ + libhwloc-dev \ + libgomp1 \ + libibverbs-dev \ + libnuma1 \ + libnuma-dev \ + libtool \ + openssl \ + python3-dev \ + autoconf \ + pkg-config \ + check \ + libsubunit0 \ + libsubunit-dev \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +# For EFA, below flags are needed to install EFA on docker image +# -n, --no-verify Skip EFA device verification and test +# -l, --skip-limit-conf Skip EFA limit configuration +# -k, --skip-kmod Skip EFA kmod installation +RUN mkdir /tmp/efa \ + && cd /tmp/efa \ + && curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \ + && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \ + && cd aws-efa-installer \ + && apt-get update \ + && ./efa_installer.sh -y --skip-kmod --skip-limit-conf --no-verify \ + && rm -rf /tmp/efa \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +ENV PATH="${OPEN_MPI_PATH}/bin:${EFA_PATH}/bin:${PATH}" +ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${EFA_PATH}/lib:${LD_LIBRARY_PATH}" + +# Configure Open MPI and configure NCCL parameters +RUN mv ${OPEN_MPI_PATH}/bin/mpirun ${OPEN_MPI_PATH}/bin/mpirun.real \ + && echo '#!/bin/bash' > ${OPEN_MPI_PATH}/bin/mpirun \ + && echo "${OPEN_MPI_PATH}/bin/mpirun.real --allow-run-as-root \"\$@\"" >> ${OPEN_MPI_PATH}/bin/mpirun \ + && chmod a+x ${OPEN_MPI_PATH}/bin/mpirun \ + && echo "hwloc_base_binding_policy = none" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf \ + && echo "rmaps_base_mapping_policy = slot" >> 
${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf \ + && echo NCCL_DEBUG=INFO >> /etc/nccl.conf \ + && echo NCCL_SOCKET_IFNAME=^docker0 >> /etc/nccl.conf + +# Install hwloc +RUN mkdir /tmp/hwloc \ + && cd /tmp/hwloc \ + && wget https://download.open-mpi.org/release/hwloc/v${HWLOC_VERSION%.*}/hwloc-${HWLOC_VERSION}.tar.gz \ + && tar -xf hwloc-${HWLOC_VERSION}.tar.gz \ + && cd hwloc-${HWLOC_VERSION} \ + && ./configure \ + && make \ + && make install \ + && rm -rf /tmp/hwloc \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +# Install aws-ofi-nccl plugin +RUN mkdir /tmp/aws-ofi-nccl \ + && cd /tmp/aws-ofi-nccl \ + && wget https://github.com/aws/aws-ofi-nccl/releases/download/v${AWS_OFI_NCCL_VERSION}-aws/aws-ofi-nccl-${AWS_OFI_NCCL_VERSION}-aws.tar.gz \ + && tar -xf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION}-aws.tar.gz \ + && cd aws-ofi-nccl-${AWS_OFI_NCCL_VERSION}-aws \ + && ./autogen.sh \ + && ./configure --with-mpi=${OPEN_MPI_PATH} \ + --with-libfabric=${EFA_PATH} \ + --with-cuda=${CUDA_HOME} \ + --disable-tests \ + && make \ + && make install \ + && rm -rf /tmp/aws-ofi-nccl \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +# Install OpenSSH for MPI to communicate between containers, allow OpenSSH to talk to containers without asking for confirmation +RUN apt-get update \ + && apt-get install -y --no-install-recommends openssh-client openssh-server \ + && mkdir -p /var/run/sshd \ + && cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \ + && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \ + && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +# Configure OpenSSH so that nodes can communicate with each other +RUN mkdir -p /var/run/sshd \ + && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd + +RUN rm -rf /root/.ssh/ \ + && mkdir -p /root/.ssh/ \ + && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ + && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ + && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config + +# For conda ssl verification +ENV REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt +RUN curl -L -o ~/miniforge3.sh https://github.com/conda-forge/miniforge/releases/download/${MINIFORGE3_VERSION}/Miniforge3-${MINIFORGE3_VERSION}-Linux-x86_64.sh \ + && chmod +x ~/miniforge3.sh \ + && ~/miniforge3.sh -b -p /opt/conda \ + && rm ~/miniforge3.sh + +RUN pip install --no-cache-dir --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org \ + && ln -s /opt/conda/bin/pip /usr/local/bin/pip3 + +# Install common conda packages +RUN /opt/conda/bin/mamba install -y -c conda-forge \ + python=$PYTHON_VERSION \ + cython \ + cryptography \ + pyopenssl \ + pybind11 \ + cmake \ + curl \ + libcurl \ + mkl \ + mkl-include \ + parso \ + typing \ + charset-normalizer \ + packaging \ + boto3 \ + pyyaml \ + numpy \ + scipy \ + click \ + psutil \ + ipython \ + ipykernel \ + pillow \ + h5py \ + fsspec \ + "idna>=3.7" \ + "tqdm>=4.66.3" \ + "requests>=2.32.0" \ + "setuptools>=70.0.0" \ + "urllib3<2" \ + "awscli<2" \ + libgcc \ + ninja \ + && /opt/conda/bin/mamba clean -afy \ + && rm -rf /etc/apt/sources.list.d/* + +# Install common pip packages (in case of conda package is not available) +RUN pip install --no-cache-dir opencv-python mpi4py + +RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.4/license.txt + +COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py + +RUN 
chmod +x /usr/local/bin/deep_learning_container.py
+
+# Removing the cache as it is needed for security verification
+RUN rm -rf /root/.cache | true
+
+########################################################
+#  _____ ____ ____    ___
+# | ____/ ___|___ \  |_ _|_ __ ___   __ _  __ _  ___
+# |  _|| |     __) |  | || '_ ` _ \ / _` |/ _` |/ _ \
+# | |__| |___ / __/   | || | | | | | (_| | (_| |  __/
+# |_____\____|_____| |___|_| |_| |_|\__,_|\__, |\___|
+#                                         |___/
+#  ____           _
+# |  _ \ ___  ___(_)_ __   ___
+# | |_) / _ \/ __| | '_ \ / _ \
+# |  _ <  __/ (__| | |_) |  __/
+# |_| \_\___|\___|_| .__/ \___|
+#                  |_|
+########################################################
+
+FROM common AS ec2
+
+ARG PYTHON
+ARG NCCL_VERSION
+ARG GDRCOPY_VERSION
+ARG TE_VERSION
+ARG FLASH_ATTN_VERSION
+ARG TORCH_URL
+ARG TORCHVISION_URL
+ARG TORCHAUDIO_URL
+ARG TORCHTEXT_URL
+
+WORKDIR /
+
+# Install PyTorch
+RUN pip install --no-cache-dir -U \
+    ${TORCH_URL} \
+    ${TORCHVISION_URL} \
+    ${TORCHAUDIO_URL} \
+    ${TORCHTEXT_URL} \
+    torchtnt \
+    triton \
+    s3torchconnector \
+    fastai \
+    accelerate \
+    # pin numpy requirement for fastai dependency
+    # requires explicit declaration of spacy, thinc, blis
+    spacy \
+    thinc \
+    blis \
+    "numpy<2" \
+ && pip uninstall -y dataclasses
+
+# Install GDRCopy which is a dependency of SM Distributed DataParallel binary
+# The test binaries require the cuda driver library, which can be found in conda
+# So update the linker path to point to it to avoid -Lcuda not found
+RUN cd /tmp \
+ && git clone https://github.com/NVIDIA/gdrcopy.git -b v${GDRCOPY_VERSION} \
+ && cd gdrcopy \
+ && sed -ie '12s@$@ -L $(CUDA)/lib64/stubs@' tests/Makefile \
+ && CUDA=${CUDA_HOME} make install \
+ && rm -rf /tmp/gdrcopy
+
+# Install NCCL
+RUN cd /tmp \
+ && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION}-1 \
+ && cd nccl \
+ && make -j64 src.build BUILDDIR=/usr/local \
+ && rm -rf /tmp/nccl
+
+# Install flash attn and NVIDIA transformer engine.
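# (Both builds below compile against the torch wheel installed above, which is why this
# section comes after it; --no-build-isolation lets the flash-attn build see that torch.)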
+# Optionally set NVTE_FRAMEWORK to avoid bringing in additional frameworks during TE install
+ENV NVTE_FRAMEWORK=pytorch
+# Install flash-attn using instructions from https://github.com/Dao-AILab/flash-attention#installation-and-features
+# Set MAX_JOBS=4 to avoid OOM issues in installation process
+RUN MAX_JOBS=4 pip install --no-cache-dir flash-attn==${FLASH_ATTN_VERSION} --no-build-isolation
+# Install TE using instructions from https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html
+RUN pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@release_v${TE_VERSION}
+
+RUN HOME_DIR=/root \
+ && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
+ && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
+ && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
+ && chmod +x /usr/local/bin/testOSSCompliance \
+ && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
+ && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
+ && rm -rf ${HOME_DIR}/oss_compliance* \
+ && rm -rf /tmp/tmp*
+
+# Removing the cache as it is needed for security verification
+RUN rm -rf /root/.cache | true
+
+# Starts framework
+CMD ["/bin/bash"]
+
+#################################################################
+#  ____                   __  __       _
+# / ___|  __ _  __ _  ___|  \/  | __ _| | _____ _ __
+# \___ \ / _` |/ _` |/ _ \ |\/| |/ _` | |/ / _ \ '__|
+#  ___) | (_| | (_| |  __/ |  | | (_| |   <  __/ |
+# |____/ \__,_|\__, |\___|_|  |_|\__,_|_|\_\___|_|
+#              |___/
+#  ___                              ____           _
+# |_ _|_ __ ___   __ _  __ _  ___  |  _ \ ___  ___(_)_ __   ___
+#  | || '_ ` _ \ / _` |/ _` |/ _ \ | |_) / _ \/ __| | '_ \ / _ \
+#  | || | | | | | (_| | (_| |  __/ |  _ <  __/ (__| | |_) |  __/
+# |___|_| |_| |_|\__,_|\__, |\___| |_| \_\___|\___|_| .__/ \___|
+#                      |___/                        |_|
+#################################################################
+
+FROM common AS sagemaker
+
+LABEL maintainer="Amazon AI"
+LABEL dlc_major_version="1"
+
+ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main
+
+ARG PYTHON
+ARG NCCL_VERSION
+ARG GDRCOPY_VERSION
+ARG TE_VERSION
+ARG FLASH_ATTN_VERSION
+ARG TORCH_URL
+ARG TORCHVISION_URL
+ARG TORCHAUDIO_URL
+ARG TORCHTEXT_URL
+
+# SageMaker Profiler Binary
+ARG SMP_URL=https://smppy.s3.amazonaws.com/pytorch/cu124/smprof-0.3.341-cp311-cp311-linux_x86_64.whl
+
+WORKDIR /
+
+# Install SageMaker Profiler Binary
+RUN pip install --no-cache-dir -U ${SMP_URL}
+
+# Install PyTorch
+RUN pip install --no-cache-dir -U \
+    ${TORCH_URL} \
+    ${TORCHVISION_URL} \
+    ${TORCHAUDIO_URL} \
+    ${TORCHTEXT_URL} \
+    torchtnt \
+    triton \
+    s3torchconnector \
+    fastai \
+    accelerate \
+    # pin numpy requirement for fastai dependency
+    # requires explicit declaration of spacy, thinc, blis
+    spacy \
+    thinc \
+    blis \
+    "numpy<2" \
+ && pip uninstall -y dataclasses
+
+# Install GDRCopy which is a dependency of SM Distributed DataParallel binary
+# The test binaries require the cuda driver library, which can be found in conda
+# So update the linker path to point to it to avoid -Lcuda not found
+RUN cd /tmp \
+ && git clone https://github.com/NVIDIA/gdrcopy.git -b v${GDRCOPY_VERSION} \
+ && cd gdrcopy \
+ && sed -ie '12s@$@ -L $(CUDA)/lib64/stubs@' tests/Makefile \
+ && CUDA=${CUDA_HOME} make install \
+ && rm -rf /tmp/gdrcopy
+
+# Install NCCL
+RUN cd /tmp \
+ && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION}-1 \
+ && cd nccl \
+ && make -j64 src.build BUILDDIR=/usr/local \
+ &&
rm -rf /tmp/nccl + +# Install flash attn and NVIDIA transformer engine. +# Optionally set NVTE_FRAMEWORK to avoid bringing in additional frameworks during TE install +ENV NVTE_FRAMEWORK=pytorch +# Install flash-attn using instructions from https://github.com/Dao-AILab/flash-attention#installation-and-features +# Set MAX_JOBS=4 to avoid OOM issues in installation process +RUN MAX_JOBS=4 pip install --no-cache-dir flash-attn==${FLASH_ATTN_VERSION} --no-build-isolation +# Install TE using instructions from https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html +RUN pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@release_v${TE_VERSION} + +# Install SM packages +RUN pip install --no-cache-dir -U \ + smclarify \ + "sagemaker>=2,<3" \ + "sagemaker-experiments<1" \ + sagemaker-pytorch-training \ + sagemaker-training + +# Install extra packages +RUN /opt/conda/bin/mamba install -y -c conda-forge \ + bokeh \ + imageio \ + numba \ + pandas \ + plotly \ + shap \ + scikit-learn \ + seaborn \ + cloudpickle \ + && /opt/conda/bin/mamba clean -afy + +# Copy workaround script for incorrect hostname +COPY changehostname.c / +COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh + +RUN chmod +x /usr/local/bin/start_with_right_hostname.sh + +RUN HOME_DIR=/root \ + && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ + && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ + && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ + && chmod +x /usr/local/bin/testOSSCompliance \ + && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ + && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \ + && rm -rf ${HOME_DIR}/oss_compliance* \ + && rm -rf /tmp/tmp* + +# Removing the cache as it is needed for security verification +RUN rm -rf /root/.cache | true + +ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"] +CMD ["/bin/bash"] diff --git a/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py b/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py index 0f1c5f1dd7ae..7dcbba3bd4fd 100644 --- a/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py +++ b/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py @@ -176,7 +176,7 @@ def execute_pytorch_gpu_py3_imagenet_ec2_training_performance_test( def post_process_pytorch_gpu_py3_synthetic_ec2_training_performance(connection, log_location): - line_to_read = 50 # increase this number if throughput is not in scope + line_to_read = 250 # increase this number if throughput is not in scope last_lines = connection.run(f"tail -n {line_to_read} {log_location}").stdout.split("\n") throughput = 0 for line in reversed(last_lines): diff --git a/test/dlc_tests/conftest.py b/test/dlc_tests/conftest.py index 2dcf716d622f..78542e257c4c 100644 --- a/test/dlc_tests/conftest.py +++ b/test/dlc_tests/conftest.py @@ -53,6 +53,7 @@ # ECR repo name fixtures # PyTorch "pytorch_training", + "pytorch_training___2__5", "pytorch_training___2__4", "pytorch_training___2__3", "pytorch_training___2__2", @@ -996,7 +997,7 @@ def skip_serialized_release_pt_test(request): skip_dict = { "==1.13.*": ["cpu", "cu117"], ">=2.1,<2.4": ["cpu", "cu121"], - ">=2.4,<2.5": ["cpu", "cu124"], + ">=2.4,<2.6": ["cpu", "cu124"], } if _validate_pytorch_framework_version( request, image_uri, 
"skip_serialized_release_pt_test", skip_dict diff --git a/test/dlc_tests/container_tests/bin/benchmark/run_pytorch_training_performance_gpu_imagenet b/test/dlc_tests/container_tests/bin/benchmark/run_pytorch_training_performance_gpu_imagenet index 7b6f7aa1b50a..dc5b2b4439fd 100644 --- a/test/dlc_tests/container_tests/bin/benchmark/run_pytorch_training_performance_gpu_imagenet +++ b/test/dlc_tests/container_tests/bin/benchmark/run_pytorch_training_performance_gpu_imagenet @@ -1,5 +1,7 @@ #!/bin/bash +set -e + PYTHON_VERSION=$(python -c 'import sys; print(sys.version_info[0])' | tr -d "'") if [ "$PYTHON_VERSION" -eq 2 ] then diff --git a/test/dlc_tests/container_tests/bin/benchmark/run_pytorch_training_performance_gpu_inductor b/test/dlc_tests/container_tests/bin/benchmark/run_pytorch_training_performance_gpu_inductor index 257024c556ec..567c7ad8888b 100755 --- a/test/dlc_tests/container_tests/bin/benchmark/run_pytorch_training_performance_gpu_inductor +++ b/test/dlc_tests/container_tests/bin/benchmark/run_pytorch_training_performance_gpu_inductor @@ -32,7 +32,7 @@ pip install tabulate==0.9.0 TRAINING_LOG=${LOG_DIR}/pytorch_inductor_huggingface_benchmark.log -python benchmarks/dynamo/runner.py --suites=huggingface --training --dtypes=amp --compilers=inductor --output-dir=huggingface_logs --extra-args='--output-directory=./' > $TRAINING_LOG 2>&1 +python benchmarks/dynamo/runner.py --suites=huggingface --training --dtypes=amp --compilers=inductor --output-dir=huggingface_logs --extra-args='--output-directory=./' > $TRAINING_LOG 2>&1 RETURN_VAL=`echo $?` set -e @@ -47,7 +47,7 @@ fi TRAINING_LOG=${LOG_DIR}/pytorch_inductor_timm_benchmark.log -python benchmarks/dynamo/runner.py --suites=timm_models --training --dtypes=amp --compilers=inductor --output-dir=timm_logs --extra-args='--output-directory=./' > $TRAINING_LOG 2>&1 +python benchmarks/dynamo/runner.py --suites=timm_models --training --dtypes=amp --compilers=inductor --output-dir=timm_logs --extra-args='--output-directory=./' > $TRAINING_LOG 2>&1 RETURN_VAL=`echo $?` set -e @@ -64,7 +64,7 @@ fi TRAINING_LOG=${LOG_DIR}/pytorch_inductor_torchbench_benchmark.log # install torchdata and torchtext before installing torchbench -git clone --branch v0.6.0 https://github.com/pytorch/data.git +git clone --branch v0.6.0 https://github.com/pytorch/data.git cd data pip install . @@ -80,7 +80,7 @@ cd benchmark python install.py cd ../pytorch -python benchmarks/dynamo/runner.py --suites=torchbench --training --dtypes=amp --compilers=inductor --output-dir=torchbench_logs --extra-args='--output-directory=./' > $TRAINING_LOG 2>&1 +python benchmarks/dynamo/runner.py --suites=torchbench --training --dtypes=amp --compilers=inductor --output-dir=torchbench_logs --extra-args='--output-directory=./' > $TRAINING_LOG 2>&1 RETURN_VAL=`echo $?` set -e diff --git a/test/dlc_tests/container_tests/bin/efa/testEFA b/test/dlc_tests/container_tests/bin/efa/testEFA index 96277780d017..72d2ce159f91 100755 --- a/test/dlc_tests/container_tests/bin/efa/testEFA +++ b/test/dlc_tests/container_tests/bin/efa/testEFA @@ -3,7 +3,7 @@ set -ex -NUM_HOSTS_file=$1 +NUM_HOSTS_FILE=$1 NUM_HOSTS=$2 if [[ -z "${CUDA_HOME}" ]]; then @@ -63,7 +63,8 @@ check_efa_nccl_all_reduce(){ # Need to pass -x PATH because rank non-zero nodes seem to "forget" the value of PATH that is pre-configured into # the container. Not using full-paths of mpirun and other executables because these paths can change across PyTorch # versions in DLC images. 
- mpirun -x FI_PROVIDER="efa" -n $NODES -N $GPU_COUNT --hostfile $NUM_HOSTS_file \ + mpirun -x FI_PROVIDER="efa" -n $NODES -N $GPU_COUNT --hostfile $NUM_HOSTS_FILE \ + -x NCCL_TUNER_PLUGIN=/usr/local/lib/libnccl-ofi-tuner.so \ -x NCCL_DEBUG=INFO ${USE_DEVICE_RDMA_ARG} -x NCCL_PROTO=simple -x NCCL_ALGO=ring -x RDMAV_FORK_SAFE=1 \ -x PATH -x LD_LIBRARY_PATH=${CUDA_HOME}/lib:${CUDA_HOME}/lib64:$LD_LIBRARY_PATH \ -x NCCL_SOCKET_IFNAME=^lo --mca pml ^cm --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 --bind-to none \ diff --git a/test/dlc_tests/container_tests/bin/gluonnlp_tests/testNLP b/test/dlc_tests/container_tests/bin/gluonnlp_tests/testNLP index 573ecd0517bc..234ec7a125d2 100644 --- a/test/dlc_tests/container_tests/bin/gluonnlp_tests/testNLP +++ b/test/dlc_tests/container_tests/bin/gluonnlp_tests/testNLP @@ -1,5 +1,7 @@ #!/bin/bash +set -e + HOME_DIR=/test BIN_DIR=${HOME_DIR}/bin LOG_DIR=${HOME_DIR}/logs diff --git a/test/dlc_tests/container_tests/bin/pytorch_tests/testNVApex b/test/dlc_tests/container_tests/bin/pytorch_tests/testNVApex index f1486a6272f5..2f0bf9d4d0fe 100644 --- a/test/dlc_tests/container_tests/bin/pytorch_tests/testNVApex +++ b/test/dlc_tests/container_tests/bin/pytorch_tests/testNVApex @@ -1,10 +1,12 @@ #!/bin/bash +set -e + HOME_DIR=/test BIN_DIR=${HOME_DIR}/bin echo "Testing Nvidia Apex imports" -python -c "import torch; import apex; from apex import amp; from apex.parallel import DistributedDataParallel; from apex import optimizers; from apex.fp16_utils import *" +python -c "import torch; import apex; from apex import amp; from apex.parallel import DistributedDataParallel; from apex import optimizers; from apex.fp16_utils import *" || exit 1 exit 0 \ No newline at end of file diff --git a/test/dlc_tests/container_tests/bin/pytorch_tests/testPyTorch b/test/dlc_tests/container_tests/bin/pytorch_tests/testPyTorch index e8634b8895b5..35cba29594d7 100644 --- a/test/dlc_tests/container_tests/bin/pytorch_tests/testPyTorch +++ b/test/dlc_tests/container_tests/bin/pytorch_tests/testPyTorch @@ -1,5 +1,7 @@ #!/bin/bash +set -e + HOME_DIR=/test BIN_DIR=${HOME_DIR}/bin LOG_DIR=${HOME_DIR}/logs diff --git a/test/dlc_tests/container_tests/bin/pytorch_tests/testPyTorchNcclVersion b/test/dlc_tests/container_tests/bin/pytorch_tests/testPyTorchNcclVersion index 95da2a63ed67..00680ba72c90 100644 --- a/test/dlc_tests/container_tests/bin/pytorch_tests/testPyTorchNcclVersion +++ b/test/dlc_tests/container_tests/bin/pytorch_tests/testPyTorchNcclVersion @@ -1,5 +1,7 @@ #!/bin/bash +set -e + HOME_DIR=/test BIN_DIR=${HOME_DIR}/bin LOG_DIR=${HOME_DIR}/logs diff --git a/test/dlc_tests/container_tests/bin/pytorch_tests/testPyTorchRegression b/test/dlc_tests/container_tests/bin/pytorch_tests/testPyTorchRegression index 1ad37226d47a..91c86b07163d 100644 --- a/test/dlc_tests/container_tests/bin/pytorch_tests/testPyTorchRegression +++ b/test/dlc_tests/container_tests/bin/pytorch_tests/testPyTorchRegression @@ -1,9 +1,14 @@ #!/bin/bash +set -e + HOME_DIR=/test BIN_DIR=${HOME_DIR}/bin LOG_DIR=${HOME_DIR}/logs +if [ -d "${HOME_DIR}/artifacts/examples" ]; then + rm -rf ${HOME_DIR}/artifacts/examples +fi git clone https://github.com/pytorch/examples.git ${HOME_DIR}/artifacts/examples ${BIN_DIR}/pytorch_tests/testPyTorchRegressionHelper || exit 1 exit 0 diff --git a/test/dlc_tests/container_tests/bin/pytorch_tests/testPyTorchwithInductor b/test/dlc_tests/container_tests/bin/pytorch_tests/testPyTorchwithInductor index 29eff894d37b..0b33104a5aa5 100755 --- 
a/test/dlc_tests/container_tests/bin/pytorch_tests/testPyTorchwithInductor +++ b/test/dlc_tests/container_tests/bin/pytorch_tests/testPyTorchwithInductor @@ -1,5 +1,7 @@ #!/bin/bash +set -e + HOME_DIR=/test BIN_DIR=${HOME_DIR}/bin LOG_DIR=${HOME_DIR}/logs diff --git a/test/dlc_tests/container_tests/bin/pytorch_tests/testTorchdata b/test/dlc_tests/container_tests/bin/pytorch_tests/testTorchdata index dd2b308abfe4..38498e6dd245 100644 --- a/test/dlc_tests/container_tests/bin/pytorch_tests/testTorchdata +++ b/test/dlc_tests/container_tests/bin/pytorch_tests/testTorchdata @@ -1,5 +1,7 @@ #!/bin/bash +set -e + HOME_DIR=/test BIN_DIR=${HOME_DIR}/bin LOG_DIR=${HOME_DIR}/logs diff --git a/test/dlc_tests/container_tests/bin/testCurand b/test/dlc_tests/container_tests/bin/testCurand index a7d62305783a..6e2de2cd07cb 100644 --- a/test/dlc_tests/container_tests/bin/testCurand +++ b/test/dlc_tests/container_tests/bin/testCurand @@ -1,5 +1,7 @@ #!/bin/bash +set -e + cuda_include_pth=/usr/local/cuda/include if python -c "import torch" &> /dev/null; then diff --git a/test/dlc_tests/container_tests/bin/testMXNet b/test/dlc_tests/container_tests/bin/testMXNet index 2d09e43dfb57..4fe34e697a01 100644 --- a/test/dlc_tests/container_tests/bin/testMXNet +++ b/test/dlc_tests/container_tests/bin/testMXNet @@ -1,5 +1,7 @@ #!/bin/bash +set -e + HOME_DIR=/test BIN_DIR=${HOME_DIR}/bin LOG_DIR=${HOME_DIR}/logs diff --git a/test/dlc_tests/container_tests/bin/testOpenCV b/test/dlc_tests/container_tests/bin/testOpenCV index 303f490a85ef..ced45649c9eb 100644 --- a/test/dlc_tests/container_tests/bin/testOpenCV +++ b/test/dlc_tests/container_tests/bin/testOpenCV @@ -6,8 +6,8 @@ LOG_DIR=/tmp/logs TRAINING_LOG=${LOG_DIR}/tensorflow_opencv_test.log set -ex -if [[ ! -d $LOG_DIR ]]; then - mkdir -p $LOG_DIR +if [[ ! -d $LOG_DIR ]]; then + mkdir -p $LOG_DIR fi echo "Simply verify if OpenCV works well. You can follow progress on the log file : $TRAINING_LOG" | tee -a $TRAINING_LOG @@ -15,7 +15,7 @@ echo "Simply verify if OpenCV works well. You can follow progress on the log fil echo "Downloading a test image" wget -O ${LOG_DIR}/test_img.jpg https://docs.opencv.org/2.4/_images/GCC_CMake_Example_Tutorial.jpg -python ${BIN_DIR}/testOpenCV.py ${LOG_DIR}/test_img.jpg > ${TRAINING_LOG} +python ${BIN_DIR}/testOpenCV.py ${LOG_DIR}/test_img.jpg > ${TRAINING_LOG} if grep "Successfully test OpenCV" $TRAINING_LOG; then echo "Successfully verified OpenCV." 
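A note on the set -e lines introduced across these test scripts: without it, bash
continues past a failing intermediate command and the script reports only the final
command's exit status, which can mask real failures. A minimal sketch of the failure
mode (hypothetical script, not part of this change):

    #!/bin/bash
    set -e                  # abort on the first failing command
    python prepare_data.py  # without set -e, a failure here would be swallowed
    python run_test.py      # ...and this could "pass" against stale state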
diff --git a/test/dlc_tests/container_tests/bin/testPip3Install b/test/dlc_tests/container_tests/bin/testPip3Install index 92bfd7f977e0..c66bca169e05 100644 --- a/test/dlc_tests/container_tests/bin/testPip3Install +++ b/test/dlc_tests/container_tests/bin/testPip3Install @@ -1,5 +1,7 @@ #!/bin/bash +set -e + yes | pip3 install pytest --user || exit 1 python3 -c "import pytest; pytest.__version__" || exit 1 yes | pip3 uninstall pytest || exit 1 diff --git a/test/dlc_tests/container_tests/bin/testPipInstall b/test/dlc_tests/container_tests/bin/testPipInstall index daebb3492dfc..68e3f4883b65 100755 --- a/test/dlc_tests/container_tests/bin/testPipInstall +++ b/test/dlc_tests/container_tests/bin/testPipInstall @@ -1,5 +1,7 @@ #!/bin/bash +set -e + yes | pip install pytest --user || exit 1 python -c "import pytest; pytest.__version__" || exit 1 yes | pip uninstall pytest || exit 1 diff --git a/test/dlc_tests/container_tests/bin/testSmdebug b/test/dlc_tests/container_tests/bin/testSmdebug index b6ae5d8657f1..96d3fa86a78d 100755 --- a/test/dlc_tests/container_tests/bin/testSmdebug +++ b/test/dlc_tests/container_tests/bin/testSmdebug @@ -1,5 +1,7 @@ #!/bin/bash +set -e + if [ $# -lt 1 ]; then echo $0 echo "Not enough args" diff --git a/test/dlc_tests/container_tests/bin/testSmprofiler b/test/dlc_tests/container_tests/bin/testSmprofiler index b09c2df6c552..7c7c8fe55d4d 100755 --- a/test/dlc_tests/container_tests/bin/testSmprofiler +++ b/test/dlc_tests/container_tests/bin/testSmprofiler @@ -1,5 +1,7 @@ #!/bin/bash +set -e + echo "Running the sanity check for the profiler enabled smdebug binary." if [ $# -lt 1 ]; then echo $0 diff --git a/test/dlc_tests/container_tests/bin/testTF1HVD b/test/dlc_tests/container_tests/bin/testTF1HVD index abc331de6533..20219f63ba84 100644 --- a/test/dlc_tests/container_tests/bin/testTF1HVD +++ b/test/dlc_tests/container_tests/bin/testTF1HVD @@ -1,4 +1,7 @@ #!/bin/bash + +set -e + INSTANCE_TYPE=$1 if [ -z "$INSTANCE_TYPE" ];then echo "Usage: "$0" " diff --git a/test/dlc_tests/container_tests/bin/testTF2HVD b/test/dlc_tests/container_tests/bin/testTF2HVD index 954295aa7a43..4b1c1853f980 100644 --- a/test/dlc_tests/container_tests/bin/testTF2HVD +++ b/test/dlc_tests/container_tests/bin/testTF2HVD @@ -1,4 +1,7 @@ #!/bin/bash + +set -e + INSTANCE_TYPE=$1 if [ -z "$INSTANCE_TYPE" ];then echo "Usage: "$0" " diff --git a/test/dlc_tests/container_tests/bin/testTFAddons b/test/dlc_tests/container_tests/bin/testTFAddons index 63e0f30e743a..13838b039a1f 100644 --- a/test/dlc_tests/container_tests/bin/testTFAddons +++ b/test/dlc_tests/container_tests/bin/testTFAddons @@ -1,5 +1,7 @@ #!/bin/bash +set -e + HOME_DIR=/test BIN_DIR=${HOME_DIR}/bin LOG_DIR=${HOME_DIR}/logs diff --git a/test/dlc_tests/container_tests/bin/testTFKerasHVDAMP b/test/dlc_tests/container_tests/bin/testTFKerasHVDAMP index 09973899d379..00145c6178b7 100644 --- a/test/dlc_tests/container_tests/bin/testTFKerasHVDAMP +++ b/test/dlc_tests/container_tests/bin/testTFKerasHVDAMP @@ -1,5 +1,7 @@ #!/bin/bash +set -e + HOME_DIR=/test BIN_DIR=${HOME_DIR}/bin LOG_DIR=${HOME_DIR}/logs diff --git a/test/dlc_tests/container_tests/bin/testTFKerasHVDFP32 b/test/dlc_tests/container_tests/bin/testTFKerasHVDFP32 index 117075c71c35..81b5cc704cc5 100644 --- a/test/dlc_tests/container_tests/bin/testTFKerasHVDFP32 +++ b/test/dlc_tests/container_tests/bin/testTFKerasHVDFP32 @@ -1,5 +1,7 @@ #!/bin/bash +set -e + HOME_DIR=/test BIN_DIR=${HOME_DIR}/bin LOG_DIR=${HOME_DIR}/logs diff --git a/test/dlc_tests/container_tests/bin/testTensorBoard 
b/test/dlc_tests/container_tests/bin/testTensorBoard index 1f031dd83241..92a78839ace1 100644 --- a/test/dlc_tests/container_tests/bin/testTensorBoard +++ b/test/dlc_tests/container_tests/bin/testTensorBoard @@ -1,5 +1,7 @@ #!/bin/bash - + +set -e + HOME_DIR=/test BIN_DIR=${HOME_DIR}/bin diff --git a/test/dlc_tests/container_tests/bin/testTensorFlow b/test/dlc_tests/container_tests/bin/testTensorFlow index e59a61e89e7b..5358bb29dcb5 100644 --- a/test/dlc_tests/container_tests/bin/testTensorFlow +++ b/test/dlc_tests/container_tests/bin/testTensorFlow @@ -4,8 +4,8 @@ HOME_DIR=/test BIN_DIR=${HOME_DIR}/bin LOG_DIR=${HOME_DIR}/logs -if [[ ! -d $LOG_DIR ]]; then - mkdir -p $LOG_DIR +if [[ ! -d $LOG_DIR ]]; then + mkdir -p $LOG_DIR fi # Use this to conditionally check TF version diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_5.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_5.py new file mode 100644 index 000000000000..b8dfa06cc337 --- /dev/null +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_5.py @@ -0,0 +1,132 @@ +import pytest + +import test.test_utils as test_utils + +from test.test_utils import ec2 + +from test.dlc_tests.ec2.pytorch.training import common_cases +from test.dlc_tests.ec2 import smclarify_cases + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("pytorch_gpu_tests") +@pytest.mark.model("N/A") +@pytest.mark.team("conda") +@pytest.mark.parametrize( + "ec2_instance_type, region", common_cases.PT_EC2_GPU_INSTANCE_TYPE_AND_REGION, indirect=True +) +def test_pytorch_2_5_gpu( + pytorch_training___2__5, ec2_connection, region, gpu_only, ec2_instance_type +): + pytorch_training = pytorch_training___2__5 + if test_utils.is_image_incompatible_with_instance_type(pytorch_training, ec2_instance_type): + pytest.skip( + f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}" + ) + + test_cases = [ + (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)), + (common_cases.pytorch_linear_regression_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)), + (common_cases.pytorch_nccl, (pytorch_training, ec2_connection)), + (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)), + (common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region)), + (common_cases.pytorch_curand_gpu, (pytorch_training, ec2_connection)), + ] + + if "sagemaker" in pytorch_training: + test_cases.append( + (smclarify_cases.smclarify_metrics_gpu, (pytorch_training, ec2_connection)), + ) + + # AMP must be run on multi_gpu + if ec2.is_instance_multi_gpu(ec2_instance_type): + test_cases.append((common_cases.pytorch_amp, (pytorch_training, ec2_connection))) + + test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.5 GPU") + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("pytorch_gpu_heavy_tests") +@pytest.mark.model("N/A") +@pytest.mark.team("conda") +@pytest.mark.parametrize( + "ec2_instance_type, region", + common_cases.PT_EC2_HEAVY_GPU_INSTANCE_TYPE_AND_REGION, + indirect=True, +) +@pytest.mark.skipif( + test_utils.is_pr_context() and not ec2.are_heavy_instance_ec2_tests_enabled(), + reason="Skip GPU Heavy tests in PR context unless explicitly enabled", +) +def test_pytorch_2_5_gpu_heavy( + pytorch_training___2__5, ec2_connection, 
region, gpu_only, ec2_instance_type +): + pytorch_training = pytorch_training___2__5 + if test_utils.is_image_incompatible_with_instance_type(pytorch_training, ec2_instance_type): + pytest.skip( + f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}" + ) + + test_cases = [ + (common_cases.pytorch_gdrcopy, (pytorch_training, ec2_connection)), + (common_cases.pytorch_transformer_engine, (pytorch_training, ec2_connection)), + ] + + test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.5 GPU Heavy") + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("inductor") +@pytest.mark.model("N/A") +@pytest.mark.team("training-compiler") +@pytest.mark.parametrize( + "ec2_instance_type, region", + common_cases.PT_EC2_GPU_INDUCTOR_INSTANCE_TYPE_AND_REGION, + indirect=True, +) +def test_pytorch_2_5_gpu_inductor( + pytorch_training___2__5, ec2_connection, region, gpu_only, ec2_instance_type +): + pytorch_training = pytorch_training___2__5 + if test_utils.is_image_incompatible_with_instance_type(pytorch_training, ec2_instance_type): + pytest.skip( + f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}" + ) + + test_cases = [ + (common_cases.pytorch_gloo_inductor_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_mpi_inductor_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_nccl_inductor, (pytorch_training, ec2_connection)), + (common_cases.pytorch_amp_inductor, (pytorch_training, ec2_connection)), + ] + + test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.5 GPU Inductor") + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("pytorch_cpu_tests") +@pytest.mark.model("N/A") +@pytest.mark.team("conda") +@pytest.mark.parametrize("ec2_instance_type", common_cases.PT_EC2_CPU_INSTANCE_TYPE, indirect=True) +def test_pytorch_2_5_cpu(pytorch_training___2__5, ec2_connection, cpu_only): + pytorch_training = pytorch_training___2__5 + + test_cases = [ + (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)), + (common_cases.pytorch_linear_regression_cpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)), + (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)), + (common_cases.pytorch_telemetry_cpu, (pytorch_training, ec2_connection)), + ] + + if "sagemaker" in pytorch_training: + test_cases += [ + (smclarify_cases.smclarify_metrics_cpu, (pytorch_training, ec2_connection)), + ] + + test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.5 CPU") diff --git a/test/dlc_tests/ec2/test_efa.py b/test/dlc_tests/ec2/test_efa.py index 8851b035978b..921e94aad337 100644 --- a/test/dlc_tests/ec2/test_efa.py +++ b/test/dlc_tests/ec2/test_efa.py @@ -286,7 +286,7 @@ def _setup_container(connection, docker_image, container_name): # Share all EFA devices with container using --device for all EFA devices. 
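    # /dev/shm is also bind-mounted below so that NCCL and multi-worker PyTorch get the
    # host's shared-memory segment instead of Docker's small default (64 MB), a common
    # cause of shared-memory errors inside containers.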
connection.run( f"docker run --runtime=nvidia --gpus all -id --name {container_name} --network host --ulimit memlock=-1:-1 " - f"{docker_all_devices_arg} -v $HOME/container_tests:/test {docker_image} bash" + f"{docker_all_devices_arg} -v $HOME/container_tests:/test -v /dev/shm:/dev/shm {docker_image} bash" ) From 1703143f01acff276d979d36426eafb3bd5060d8 Mon Sep 17 00:00:00 2001 From: Sirut Buasai <73297481+sirutBuasai@users.noreply.github.com> Date: Thu, 14 Nov 2024 14:35:18 -0800 Subject: [PATCH 2/6] update license file for PT 2.5 (#4422) --- pytorch/training/docker/2.5/py3/Dockerfile.cpu | 2 +- pytorch/training/docker/2.5/py3/cu124/Dockerfile.gpu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch/training/docker/2.5/py3/Dockerfile.cpu b/pytorch/training/docker/2.5/py3/Dockerfile.cpu index bfa7c55a381f..3b68f7380b06 100644 --- a/pytorch/training/docker/2.5/py3/Dockerfile.cpu +++ b/pytorch/training/docker/2.5/py3/Dockerfile.cpu @@ -176,7 +176,7 @@ RUN /opt/conda/bin/mamba install -y -c conda-forge \ # Install common pip packages (in case of conda package is not available) RUN pip install --no-cache-dir opencv-python mpi4py -RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.4/license.txt +RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.5/license.txt COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py diff --git a/pytorch/training/docker/2.5/py3/cu124/Dockerfile.gpu b/pytorch/training/docker/2.5/py3/cu124/Dockerfile.gpu index 15475e3b0e35..1271caed70f8 100644 --- a/pytorch/training/docker/2.5/py3/cu124/Dockerfile.gpu +++ b/pytorch/training/docker/2.5/py3/cu124/Dockerfile.gpu @@ -257,7 +257,7 @@ RUN /opt/conda/bin/mamba install -y -c conda-forge \ # Install common pip packages (in case of conda package is not available) RUN pip install --no-cache-dir opencv-python mpi4py -RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.4/license.txt +RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.5/license.txt COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py From c95dd296e65f5f75a803572141ac38c4eb364f94 Mon Sep 17 00:00:00 2001 From: Yadan-Wei <84998528+Yadan-Wei@users.noreply.github.com> Date: Thu, 14 Nov 2024 16:45:53 -0800 Subject: [PATCH 3/6] [PyTorch][Inference][EC2] Enable PT2.5.1 Autopatch (#4421) * add core packages * add captum in core package * enable sm autopatch * revert toml * add requests to core packages * change version to 2.32.3 --------- Co-authored-by: Yadan Wei --- pytorch/inference/buildspec-2-5-ec2.yml | 2 +- pytorch/inference/buildspec-2-5-sm.yml | 2 +- pytorch/inference/buildspec.yml | 2 +- .../py3/Dockerfile.ec2.cpu.core_packages.json | 27 +++++++++++++++++ ...ockerfile.sagemaker.cpu.core_packages.json | 30 +++++++++++++++++++ .../Dockerfile.ec2.gpu.core_packages.json | 27 +++++++++++++++++ ...ockerfile.sagemaker.gpu.core_packages.json | 30 +++++++++++++++++++ 7 files changed, 117 insertions(+), 3 deletions(-) create mode 100644 pytorch/inference/docker/2.5/py3/Dockerfile.ec2.cpu.core_packages.json create mode 100644 pytorch/inference/docker/2.5/py3/Dockerfile.sagemaker.cpu.core_packages.json create mode 100644 pytorch/inference/docker/2.5/py3/cu124/Dockerfile.ec2.gpu.core_packages.json create mode 100644 pytorch/inference/docker/2.5/py3/cu124/Dockerfile.sagemaker.gpu.core_packages.json diff --git a/pytorch/inference/buildspec-2-5-ec2.yml b/pytorch/inference/buildspec-2-5-ec2.yml index 
a20ca9443cdb..a34fb55735f8 100644 --- a/pytorch/inference/buildspec-2-5-ec2.yml +++ b/pytorch/inference/buildspec-2-5-ec2.yml @@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch version: &VERSION 2.5.1 short_version: &SHORT_VERSION "2.5" arch_type: x86 -# autopatch_build: "True" +autopatch_build: "True" repository_info: inference_repository: &INFERENCE_REPOSITORY diff --git a/pytorch/inference/buildspec-2-5-sm.yml b/pytorch/inference/buildspec-2-5-sm.yml index 354d31f305a4..0bba3d6138f4 100644 --- a/pytorch/inference/buildspec-2-5-sm.yml +++ b/pytorch/inference/buildspec-2-5-sm.yml @@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch version: &VERSION 2.5.1 short_version: &SHORT_VERSION "2.5" arch_type: x86 -# autopatch_build: "True" +autopatch_build: "True" repository_info: inference_repository: &INFERENCE_REPOSITORY diff --git a/pytorch/inference/buildspec.yml b/pytorch/inference/buildspec.yml index 767e9a8e9bab..16412edfa389 100644 --- a/pytorch/inference/buildspec.yml +++ b/pytorch/inference/buildspec.yml @@ -1 +1 @@ -buildspec_pointer: buildspec-2-5-ec2.yml +buildspec_pointer: buildspec-2-5-sm.yml diff --git a/pytorch/inference/docker/2.5/py3/Dockerfile.ec2.cpu.core_packages.json b/pytorch/inference/docker/2.5/py3/Dockerfile.ec2.cpu.core_packages.json new file mode 100644 index 000000000000..b0b255fda778 --- /dev/null +++ b/pytorch/inference/docker/2.5/py3/Dockerfile.ec2.cpu.core_packages.json @@ -0,0 +1,27 @@ +{ + "captum": { + "version_specifier": "==0.6.0", + "skip": "True" + }, + "torchaudio": { + "version_specifier": "==2.5.1+cpu", + "skip": "True" + }, + "torchtext": { + "version_specifier": "==0.18.0+cpu", + "skip": "True" + }, + "torchvision": { + "version_specifier": "==0.20.1+cpu", + "skip": "True" + }, + "torchserve": { + "version_specifier": "==0.12.0" + }, + "requests": { + "version_specifier": ">=2.32.3" + }, + "torch-model-archiver": { + "version_specifier": "==0.12.0" + } +} diff --git a/pytorch/inference/docker/2.5/py3/Dockerfile.sagemaker.cpu.core_packages.json b/pytorch/inference/docker/2.5/py3/Dockerfile.sagemaker.cpu.core_packages.json new file mode 100644 index 000000000000..3856bf997ccc --- /dev/null +++ b/pytorch/inference/docker/2.5/py3/Dockerfile.sagemaker.cpu.core_packages.json @@ -0,0 +1,30 @@ +{ + "captum": { + "version_specifier": "==0.6.0", + "skip": "True" + }, + "torchaudio": { + "version_specifier": "==2.5.1+cpu", + "skip": "True" + }, + "torchtext": { + "version_specifier": "==0.18.0+cpu", + "skip": "True" + }, + "torchvision": { + "version_specifier": "==0.20.1+cpu", + "skip": "True" + }, + "requests": { + "version_specifier": ">=2.32.3" + }, + "torchserve": { + "version_specifier": "==0.12.0" + }, + "torch-model-archiver": { + "version_specifier": "==0.12.0" + }, + "sagemaker-pytorch-inference": { + "version_specifier": "==2.0.25" + } +} diff --git a/pytorch/inference/docker/2.5/py3/cu124/Dockerfile.ec2.gpu.core_packages.json b/pytorch/inference/docker/2.5/py3/cu124/Dockerfile.ec2.gpu.core_packages.json new file mode 100644 index 000000000000..25976b9498cf --- /dev/null +++ b/pytorch/inference/docker/2.5/py3/cu124/Dockerfile.ec2.gpu.core_packages.json @@ -0,0 +1,27 @@ +{ + "captum": { + "version_specifier": "==0.6.0", + "skip": "True" + }, + "torchaudio": { + "version_specifier": "==2.5.1+cu124", + "skip": "True" + }, + "torchtext": { + "version_specifier": "==0.18.0+cu124", + "skip": "True" + }, + "torchvision": { + "version_specifier": "==0.20.1+cu124", + "skip": "True" + }, + "requests": { + "version_specifier": ">=2.32.3" + }, + "torchserve": { + 
"version_specifier": "==0.12.0" + }, + "torch-model-archiver": { + "version_specifier": "==0.12.0" + } +} diff --git a/pytorch/inference/docker/2.5/py3/cu124/Dockerfile.sagemaker.gpu.core_packages.json b/pytorch/inference/docker/2.5/py3/cu124/Dockerfile.sagemaker.gpu.core_packages.json new file mode 100644 index 000000000000..0b040c6962f6 --- /dev/null +++ b/pytorch/inference/docker/2.5/py3/cu124/Dockerfile.sagemaker.gpu.core_packages.json @@ -0,0 +1,30 @@ +{ + "captum": { + "version_specifier": "==0.6.0", + "skip": "True" + }, + "torchaudio": { + "version_specifier": "==2.5.1+cu124", + "skip": "True" + }, + "torchtext": { + "version_specifier": "==0.18.0+cu124", + "skip": "True" + }, + "torchvision": { + "version_specifier": "==0.20.1+cu124", + "skip": "True" + }, + "requests": { + "version_specifier": ">=2.32.3" + }, + "torchserve": { + "version_specifier": "==0.12.0" + }, + "torch-model-archiver": { + "version_specifier": "==0.12.0" + }, + "sagemaker-pytorch-inference": { + "version_specifier": "==2.0.25" + } +} From 5b1f2a614205ad110bf23e0109503ec0683a9c8d Mon Sep 17 00:00:00 2001 From: Daniel Gomez Antonio Date: Fri, 15 Nov 2024 09:26:01 -0800 Subject: [PATCH 4/6] [pytorch][sagemaker] Add SMDDP binary for PT 2.4 (#4403) * Add SMDDP binary for PT 2.4 * Trigger testing * Add missing ARG for PT version * Upgrade pinned dependecies * Upgrade pinned version of SM container 2.4 * Move mpi4py to be installed by pip * Revert "Upgrade pinned dependecies" This reverts commit 4a834dd3d06bb2345ef17ca8d74d305e9c6e5862. * Revert "Trigger testing" This reverts commit 8fed03cfc8386c2d8daf826bbcf060e5065b74fc. * Adapt to latest changes --------- Co-authored-by: Sirut Buasai <73297481+sirutBuasai@users.noreply.github.com> --- pytorch/training/buildspec-2-4-sm.yml | 2 +- pytorch/training/docker/2.4/py3/Dockerfile.cpu | 3 +-- .../py3/Dockerfile.sagemaker.cpu.core_packages.json | 4 ++-- .../training/docker/2.4/py3/cu124/Dockerfile.gpu | 13 +++++++++++-- .../Dockerfile.sagemaker.gpu.core_packages.json | 4 ++-- test/sagemaker_tests/pytorch/training/conftest.py | 4 ++-- 6 files changed, 19 insertions(+), 11 deletions(-) diff --git a/pytorch/training/buildspec-2-4-sm.yml b/pytorch/training/buildspec-2-4-sm.yml index 994115f4029c..5a97e6344657 100644 --- a/pytorch/training/buildspec-2-4-sm.yml +++ b/pytorch/training/buildspec-2-4-sm.yml @@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch version: &VERSION 2.4.0 short_version: &SHORT_VERSION "2.4" arch_type: x86 -autopatch_build: "True" +autopatch_build: "False" repository_info: training_repository: &TRAINING_REPOSITORY diff --git a/pytorch/training/docker/2.4/py3/Dockerfile.cpu b/pytorch/training/docker/2.4/py3/Dockerfile.cpu index ca219179dc51..702991cb1cb8 100644 --- a/pytorch/training/docker/2.4/py3/Dockerfile.cpu +++ b/pytorch/training/docker/2.4/py3/Dockerfile.cpu @@ -163,7 +163,6 @@ RUN /opt/conda/bin/mamba install -y -c conda-forge \ ipykernel \ pillow \ h5py \ - mpi4py \ fsspec \ "idna>=3.7" \ "tqdm>=4.66.3" \ @@ -175,7 +174,7 @@ RUN /opt/conda/bin/mamba install -y -c conda-forge \ && rm -rf /etc/apt/sources.list.d/* # Install common pip packages (in case of conda package is not available) -RUN pip install --no-cache-dir opencv-python +RUN pip install --no-cache-dir opencv-python mpi4py RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.4/license.txt diff --git a/pytorch/training/docker/2.4/py3/Dockerfile.sagemaker.cpu.core_packages.json b/pytorch/training/docker/2.4/py3/Dockerfile.sagemaker.cpu.core_packages.json index 
f5aa1693174c..550b7143779a 100644 --- a/pytorch/training/docker/2.4/py3/Dockerfile.sagemaker.cpu.core_packages.json +++ b/pytorch/training/docker/2.4/py3/Dockerfile.sagemaker.cpu.core_packages.json @@ -1,6 +1,6 @@ { "accelerate": { - "version_specifier": "==1.0.1", + "version_specifier": "==1.1.1", "skip": "True" }, "fastai": { @@ -8,7 +8,7 @@ "skip": "True" }, "s3torchconnector": { - "version_specifier": "==1.2.6", + "version_specifier": "==1.2.7", "skip": "True" }, "torchaudio": { diff --git a/pytorch/training/docker/2.4/py3/cu124/Dockerfile.gpu b/pytorch/training/docker/2.4/py3/cu124/Dockerfile.gpu index 52dadeb20bb6..e4ba7b0a3d69 100644 --- a/pytorch/training/docker/2.4/py3/cu124/Dockerfile.gpu +++ b/pytorch/training/docker/2.4/py3/cu124/Dockerfile.gpu @@ -243,7 +243,6 @@ RUN /opt/conda/bin/mamba install -y -c conda-forge \ ipykernel \ pillow \ h5py \ - mpi4py \ fsspec \ "idna>=3.7" \ "tqdm>=4.66.3" \ @@ -258,7 +257,7 @@ RUN /opt/conda/bin/mamba install -y -c conda-forge \ && rm -rf /etc/apt/sources.list.d/* # Install common pip packages (in case of conda package is not available) -RUN pip install --no-cache-dir opencv-python +RUN pip install --no-cache-dir opencv-python mpi4py RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.4/license.txt @@ -394,6 +393,7 @@ LABEL dlc_major_version="1" ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main ARG PYTHON +ARG PYTHON_SHORT_VERSION ARG NCCL_VERSION ARG GDRCOPY_VERSION ARG APEX_VERSION @@ -405,11 +405,17 @@ ARG TORCHTEXT_URL # SageMaker Profiler Binary ARG SMP_URL=https://smppy.s3.amazonaws.com/pytorch/cu124/smprof-0.3.341-cp311-cp311-linux_x86_64.whl +# SageMaker DataParallel Binary +ARG SMD_DATA_PARALLEL_URL=https://smdataparallel.s3.us-east-1.amazonaws.com/binary/pytorch/2.4.0/cu124/2024-11-04/smdistributed_dataparallel-2.5.0-cp311-cp311-linux_x86_64.whl + WORKDIR / # Install SageMaker Profiler Binary RUN pip install --no-cache-dir -U ${SMP_URL} +# Install SM Distributed DataParallel binary +RUN SMDATAPARALLEL_PT=1 pip install --no-cache-dir ${SMD_DATA_PARALLEL_URL} + # Install PyTorch RUN pip install --no-cache-dir -U \ ${TORCH_URL} \ @@ -490,6 +496,9 @@ RUN /opt/conda/bin/mamba install -y -c conda-forge \ "cloudpickle==2.2.1" \ && /opt/conda/bin/mamba clean -afy +# Add SageMaker DataParallel to LD_LIBRARY_PATH +ENV LD_LIBRARY_PATH="/opt/conda/lib/python${PYTHON_SHORT_VERSION}/site-packages/smdistributed/dataparallel/lib:$LD_LIBRARY_PATH" + # Copy workaround script for incorrect hostname COPY changehostname.c / COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh diff --git a/pytorch/training/docker/2.4/py3/cu124/Dockerfile.sagemaker.gpu.core_packages.json b/pytorch/training/docker/2.4/py3/cu124/Dockerfile.sagemaker.gpu.core_packages.json index 9fa3508f9775..85af35715996 100644 --- a/pytorch/training/docker/2.4/py3/cu124/Dockerfile.sagemaker.gpu.core_packages.json +++ b/pytorch/training/docker/2.4/py3/cu124/Dockerfile.sagemaker.gpu.core_packages.json @@ -1,6 +1,6 @@ { "accelerate": { - "version_specifier": "==1.0.1", + "version_specifier": "==1.1.1", "skip": "True" }, "fastai": { @@ -12,7 +12,7 @@ "skip": "True" }, "s3torchconnector": { - "version_specifier": "==1.2.6", + "version_specifier": "==1.2.7", "skip": "True" }, "torchaudio": { diff --git a/test/sagemaker_tests/pytorch/training/conftest.py b/test/sagemaker_tests/pytorch/training/conftest.py index ebe2cc235f39..fd5c46d49a63 100644 --- a/test/sagemaker_tests/pytorch/training/conftest.py +++ 
b/test/sagemaker_tests/pytorch/training/conftest.py
@@ -511,7 +511,7 @@ def skip_pytorchddp_test(
     For each currency release, Once SMDDP binary is added, we skip pytorchddp tests
     due to `pytorchddp` and `smdistributed` launcher consolidation.
     See https://github.com/aws/sagemaker-python-sdk/pull/4698.
     """
-    skip_dict = {">=2.1,<2.4": ["cu121"]}
+    skip_dict = {">=2.1,<=2.4": ["cu121"]}
     if _validate_pytorch_framework_version(
         request, processor, ecr_image, "skip_pytorchddp_test", skip_dict
     ):
@@ -543,7 +543,7 @@ def skip_smddataparallel_test(
     For each currency release, we can skip SMDDP tests if the binary does not exist.
     However, when the SMDDP binaries are added, be sure to fix the test logic such that
     the tests are not skipped.
     """
-    skip_dict = {"==2.0.*": ["cu121"], ">=2.4": ["cu124"]}
+    skip_dict = {"==2.0.*": ["cu121"], ">2.4": ["cu124"]}
     if _validate_pytorch_framework_version(
         request, processor, ecr_image, "skip_smddataparallel_test", skip_dict
     ):

From a8ea109d60d2b6cd8e0ff172b29ec8498500e0cc Mon Sep 17 00:00:00 2001
From: Jingya HUANG <44135271+JingyaHuang@users.noreply.github.com>
Date: Fri, 15 Nov 2024 19:09:14 +0100
Subject: [PATCH 5/6] [huggingface_pytorch] Inference - update for HuggingFace
 Transformers to 4.46.1 - Accelerate 1.1.0 - PyTorch 2.3 (#4392)

* update

* upgrades

* add dlc developer config

* fix ipex install

* try fix ipex

* skip ipex, gave up

* try fix

* skip successful build

* restore model

* fix and upgrade

* do build and remove trfrs which is not necessary

* fix sanity

* Revert "add dlc developer config"

This reverts commit 4d7b3f42e4434e67157908109d26320f514ff0f7.

---------

Co-authored-by: Haotian An <33510317+Captainia@users.noreply.github.com>
---
 .../pytorch/inference/buildspec-2-1-0.yml     |  58 ++++
 huggingface/pytorch/inference/buildspec.yml   |  18 +-
 .../inference/docker/2.3/py3/Dockerfile.cpu   | 249 ++++++++++++++++
 .../docker/2.3/py3/cu121/Dockerfile.gpu       | 265 ++++++++++++++++++
 .../sagemaker/test_diffusers_model.py         |   2 +-
 .../sagemaker/test_ipex_inference.py          |   1 +
 .../sagemaker/test_torch_compile.py           |   2 +-
 .../huggingface/inference/requirements.txt    |   2 -
 8 files changed, 584 insertions(+), 13 deletions(-)
 create mode 100644 huggingface/pytorch/inference/buildspec-2-1-0.yml
 create mode 100644 huggingface/pytorch/inference/docker/2.3/py3/Dockerfile.cpu
 create mode 100644 huggingface/pytorch/inference/docker/2.3/py3/cu121/Dockerfile.gpu

diff --git a/huggingface/pytorch/inference/buildspec-2-1-0.yml b/huggingface/pytorch/inference/buildspec-2-1-0.yml
new file mode 100644
index 000000000000..a677b85870ea
--- /dev/null
+++ b/huggingface/pytorch/inference/buildspec-2-1-0.yml
@@ -0,0 +1,58 @@
+account_id: &ACCOUNT_ID
+region: &REGION
+base_framework: &BASE_FRAMEWORK pytorch
+framework: &FRAMEWORK !join [ "huggingface_", *BASE_FRAMEWORK]
+version: &VERSION 2.1.0
+short_version: &SHORT_VERSION "2.1"
+contributor: huggingface
+arch_type: x86
+
+repository_info:
+  inference_repository: &INFERENCE_REPOSITORY
+    image_type: &INFERENCE_IMAGE_TYPE inference
+    root: !join [ "huggingface/", *BASE_FRAMEWORK, "/", *INFERENCE_IMAGE_TYPE ]
+    repository_name: &REPOSITORY_NAME !join ["pr", "-", "huggingface", "-", *BASE_FRAMEWORK, "-", *INFERENCE_IMAGE_TYPE]
+    repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
+
+context:
+  inference_context: &INFERENCE_CONTEXT
+    mms-entrypoint:
+      source: ../../build_artifacts/inference/mms-entrypoint.py
+      target: mms-entrypoint.py
+    config:
+      source: 
../../build_artifacts/inference/config.properties + target: config.properties + deep_learning_container: + source: ../../../src/deep_learning_container.py + target: deep_learning_container.py + +images: + BuildHuggingFacePytorchCpuPy310InferenceDockerImage: + <<: *INFERENCE_REPOSITORY + build: &HUGGINGFACE_PYTORCH_CPU_INFERENCE_PY3 false + image_size_baseline: 15000 + device_type: &DEVICE_TYPE cpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py310 + os_version: &OS_VERSION ubuntu22.04 + transformers_version: &TRANSFORMERS_VERSION 4.37.0 + tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *TAG_PYTHON_VERSION, '-', *OS_VERSION ] + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] + context: + <<: *INFERENCE_CONTEXT + BuildHuggingFacePytorchGpuPy310Cu118InferenceDockerImage: + <<: *INFERENCE_REPOSITORY + build: &HUGGINGFACE_PYTORCH_GPU_INFERENCE_PY3 false + image_size_baseline: &IMAGE_SIZE_BASELINE 15000 + device_type: &DEVICE_TYPE gpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py310 + cuda_version: &CUDA_VERSION cu118 + os_version: &OS_VERSION ubuntu20.04 + transformers_version: &TRANSFORMERS_VERSION 4.37.0 + tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *TAG_PYTHON_VERSION, '-', + *CUDA_VERSION, '-', *OS_VERSION ] + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, + *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] + context: + <<: *INFERENCE_CONTEXT diff --git a/huggingface/pytorch/inference/buildspec.yml b/huggingface/pytorch/inference/buildspec.yml index a677b85870ea..3ab4ff9f31cf 100644 --- a/huggingface/pytorch/inference/buildspec.yml +++ b/huggingface/pytorch/inference/buildspec.yml @@ -2,8 +2,8 @@ account_id: &ACCOUNT_ID region: ®ION base_framework: &BASE_FRAMEWORK pytorch framework: &FRAMEWORK !join [ "huggingface_", *BASE_FRAMEWORK] -version: &VERSION 2.1.0 -short_version: &SHORT_VERSION "2.1" +version: &VERSION 2.3.0 +short_version: &SHORT_VERSION "2.3" contributor: huggingface arch_type: x86 @@ -27,29 +27,29 @@ context: target: deep_learning_container.py images: - BuildHuggingFacePytorchCpuPy310InferenceDockerImage: + BuildHuggingFacePytorchCpuPy311InferenceDockerImage: <<: *INFERENCE_REPOSITORY build: &HUGGINGFACE_PYTORCH_CPU_INFERENCE_PY3 false image_size_baseline: 15000 device_type: &DEVICE_TYPE cpu python_version: &DOCKER_PYTHON_VERSION py3 - tag_python_version: &TAG_PYTHON_VERSION py310 + tag_python_version: &TAG_PYTHON_VERSION py311 os_version: &OS_VERSION ubuntu22.04 - transformers_version: &TRANSFORMERS_VERSION 4.37.0 + transformers_version: &TRANSFORMERS_VERSION 4.46.1 tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *TAG_PYTHON_VERSION, '-', *OS_VERSION ] docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] context: <<: *INFERENCE_CONTEXT - BuildHuggingFacePytorchGpuPy310Cu118InferenceDockerImage: + BuildHuggingFacePytorchGpuPy311Cu121InferenceDockerImage: <<: *INFERENCE_REPOSITORY build: &HUGGINGFACE_PYTORCH_GPU_INFERENCE_PY3 false image_size_baseline: &IMAGE_SIZE_BASELINE 15000 device_type: &DEVICE_TYPE gpu python_version: &DOCKER_PYTHON_VERSION py3 - tag_python_version: &TAG_PYTHON_VERSION py310 - cuda_version: &CUDA_VERSION cu118 + tag_python_version: &TAG_PYTHON_VERSION py311 + cuda_version: &CUDA_VERSION cu121 os_version: &OS_VERSION 
ubuntu20.04 - transformers_version: &TRANSFORMERS_VERSION 4.37.0 + transformers_version: &TRANSFORMERS_VERSION 4.46.1 tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *TAG_PYTHON_VERSION, '-', *CUDA_VERSION, '-', *OS_VERSION ] docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, diff --git a/huggingface/pytorch/inference/docker/2.3/py3/Dockerfile.cpu b/huggingface/pytorch/inference/docker/2.3/py3/Dockerfile.cpu new file mode 100644 index 000000000000..631921de5888 --- /dev/null +++ b/huggingface/pytorch/inference/docker/2.3/py3/Dockerfile.cpu @@ -0,0 +1,249 @@ +FROM ubuntu:22.04 AS base_image + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" +LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true +LABEL com.amazonaws.sagemaker.capabilities.multi-models=true + +ARG PYTHON=python3 +ARG PYTHON_VERSION=3.11.9 +ARG MINIFORGE3_VERSION=23.11.0-0 +ARG OPEN_MPI_VERSION=4.1.5 +ARG MMS_VERSION=1.1.11 + +# PyTorch Binaries and versions. +ARG TORCH_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/pytorch/v2.3.0/cpu/torch-2.3.0%2Bcpu-cp311-cp311-linux_x86_64.whl +ARG TORCHVISION_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/pytorch/v2.3.0/cpu/torchvision-0.18.0%2Bcpu-cp311-cp311-linux_x86_64.whl +ARG TORCHAUDIO_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/pytorch/v2.3.0/cpu/torchaudio-2.3.0%2Bcpu-cp311-cp311-linux_x86_64.whl + +# HF ARGS +ARG TRANSFORMERS_VERSION +ARG HUGGINGFACE_HUB_VERSION=0.25.1 +ARG DIFFUSERS_VERSION=0.31.0 +ARG PEFT_VERSION=0.13.2 +ARG ACCELERATE_VERSION=1.1.0 + +# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 20 +ENV DEBIAN_FRONTEND=noninteractive + +# Python won’t try to write .pyc or .pyo files on the import of source modules +# Force stdin, stdout and stderr to be totally unbuffered. Good for logging +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" +ENV LD_LIBRARY_PATH="/opt/conda/lib:${LD_LIBRARY_PATH}" +ENV PYTHONIOENCODING=UTF-8 +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 +ENV PATH=/opt/conda/bin:$PATH + +# Set Env Variables for the images +ENV TEMP=/tmp +ENV MKL_THREADING_LAYER=GNU + +ENV DLC_CONTAINER_TYPE=inference + +RUN apt-get update \ + && apt-get -y upgrade \ + && apt-get install -y --no-install-recommends \ + software-properties-common \ + build-essential \ + ca-certificates \ + ccache \ + numactl \ + gcc-12 \ + g++-12 \ + make \ + cmake \ + curl \ + emacs \ + git \ + jq \ + libcurl4-openssl-dev \ + libgl1-mesa-glx \ + libglib2.0-0 \ + libsm6 \ + libssl-dev \ + libxext6 \ + libxrender-dev \ + openjdk-17-jdk \ + openssl \ + unzip \ + vim \ + wget \ + libjpeg-dev \ + libpng-dev \ + zlib1g-dev \ + libsndfile1-dev \ + ffmpeg \ + && apt-get autoremove -y \ + && rm -rf /var/lib/apt/lists/* \ + && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 100 \ + && update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-12 100 \ + && update-alternatives --install /usr/bin/cc cc /usr/bin/gcc 100 \ + && update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 100 \ + && apt-get clean + +# Install OpenMPI +RUN wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OPEN_MPI_VERSION}.tar.gz \ + && gunzip -c openmpi-$OPEN_MPI_VERSION.tar.gz | tar xf - \ + && cd openmpi-$OPEN_MPI_VERSION \ + && ./configure --prefix=/home/.openmpi \ + && make all install \ + && cd .. 
\ + && rm openmpi-$OPEN_MPI_VERSION.tar.gz \ + && rm -rf openmpi-$OPEN_MPI_VERSION + +# The ENV variables declared below are changed in the previous section +# Grouping these ENV variables in the first section causes +# ompi_info to fail. This is only observed in CPU containers +ENV PATH="$PATH:/home/.openmpi/bin" +ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/" +RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value + +# Install CondaForge miniconda +RUN curl -L -o ~/miniforge3.sh https://github.com/conda-forge/miniforge/releases/download/${MINIFORGE3_VERSION}/Miniforge3-${MINIFORGE3_VERSION}-Linux-x86_64.sh \ + && chmod +x ~/miniforge3.sh \ + && ~/miniforge3.sh -b -p /opt/conda \ + && rm ~/miniforge3.sh \ + && /opt/conda/bin/conda install -c conda-forge \ + python=${PYTHON_VERSION} \ + cython \ + "mkl<2024.1.0" \ + mkl-include \ + parso \ + scipy \ + typing \ + h5py \ + requests \ + libgcc \ + cmake \ + packaging \ + "awscli<2" \ + boto3 \ + pyyaml \ + conda-content-trust \ + charset-normalizer \ + requests \ + "idna>=3.7" \ + "tqdm>=4.66.3" \ + "zstandard>=0.22.0" \ + && /opt/conda/bin/conda clean -afy \ + && rm -rf /etc/apt/sources.list.d/* + +# symlink pip for OS use +RUN pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org \ + && ln -s /opt/conda/bin/pip /usr/local/bin/pip3 + +# Install Common python packages +RUN pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -U \ + opencv-python \ + "pyopenssl>=24.0.0" \ + "cryptography>=42.0.5" \ + "ipython>=8.10.0,<9.0" \ + "awscli<2" \ + "urllib3>=1.26.18,<2" \ + "prompt-toolkit<3.0.39" \ + "setuptools>=70.0.0" + +# Ensure PyTorch did not get installed from Conda or pip, prior to now +# is CPU image, removing nvgpu +# Any Nvidia installs for the DLC will be below, removing nvidia and cuda packages from pip here +# Even the GPU image would not have nvidia or cuda packages in PIP. 
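+# Clear any torch wheels pulled in transitively above; the pinned AWS-built
+# wheels (${TORCH_URL} and friends) are installed in the next step, and
+# multi-model-server is reinstalled at the pinned $MMS_VERSION below.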
+RUN pip uninstall -y torch torchvision torchaudio multi-model-server
+
+# Install AWS-PyTorch, and other torch packages
+RUN pip install --no-cache-dir -U \
+    enum-compat==0.0.3 \
+    "Pillow>=9.0.0" \
+    ${TORCH_URL} \
+    ${TORCHVISION_URL} \
+    ${TORCHAUDIO_URL}
+
+WORKDIR /
+
+RUN pip install --no-cache-dir \
+    multi-model-server==$MMS_VERSION \
+    sagemaker-inference
+
+# Patches
+# py-vuln: 71064
+RUN pip install --no-cache-dir -U "requests>=2.32.3"
+
+# add necessary certificate for aws sdk cpp download
+RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt
+
+# create user and folders
+RUN useradd -m model-server \
+    && mkdir -p /home/model-server/tmp /opt/ml/model \
+    && chown -R model-server /home/model-server /opt/ml/model
+
+# add MMS entrypoint
+COPY mms-entrypoint.py /usr/local/bin/dockerd-entrypoint.py
+COPY config.properties /etc/sagemaker-mms.properties
+RUN chmod +x /usr/local/bin/dockerd-entrypoint.py
+
+# add telemetry
+COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
+RUN chmod +x /usr/local/bin/deep_learning_container.py
+
+#################################
+# Hugging Face specific section #
+#################################
+
+# install Hugging Face libraries and its dependencies
+RUN pip install --no-cache-dir \
+    # hf_transfer will be a built-in feature, remove the extra then
+    huggingface_hub[hf_transfer]==${HUGGINGFACE_HUB_VERSION} \
+    transformers[sentencepiece,audio,vision]==${TRANSFORMERS_VERSION} \
+    diffusers==${DIFFUSERS_VERSION} \
+    peft==${PEFT_VERSION} \
+    accelerate==${ACCELERATE_VERSION} \
+    "protobuf>=3.19.5,<=3.20.2" \
+    "sagemaker-huggingface-inference-toolkit==2.4.1"
+
+# hf_transfer will be a built-in feature, remove the env variable then
+ENV HF_HUB_ENABLE_HF_TRANSFER="1"
+
+#####################
+# IPEX installation #
+#####################
+
+# Skip ipex installation for now due to error: 0.18.0+cpu, the required version for compiling is 0.18.0+cpu...
+# Install IPEx and its dependencies
+# from source is mandatory for customized AWS PyTorch binaries: https://github.com/intel/intel-extension-for-pytorch/issues/317
+# RUN pip install --no-cache-dir intel-openmp tbb pyyaml
+# RUN cd /opt/ \
+#     && mkdir -p ipex \
+#     && cd /opt/ipex \
+#     && wget https://github.com/intel/intel-extension-for-pytorch/raw/v2.3.0%2Bcpu/scripts/compile_bundle.sh \
+#     && MODE=3 bash compile_bundle.sh \
+#     && rm -rf /opt/ipex && cd /opt/
+
+# IPEx installation installs the numpy==1.25.1. That causes a pip check failure due to incompatibility with numba.
+# Re-installing numpy after IPEx installation to get the appropriate numpy version and fix pip checks.
+# RUN pip install --no-cache-dir \ +# "numpy<1.25" \ +# "pyyaml>=5.4" + +RUN HOME_DIR=/root \ + && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ + && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ + && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ + && chmod +x /usr/local/bin/testOSSCompliance \ + && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ + && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \ + && rm -rf ${HOME_DIR}/oss_compliance* + +RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.3/license.txt + +## Cleanup ## +RUN pip cache purge \ + && rm -rf /tmp/tmp* \ + && rm -iRf /root/.cache \ + && rm -rf /opt/llvm-project \ + && rm -rf opt/intel-extension-for-pytorch + +EXPOSE 8080 8081 +ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"] +CMD ["serve"] \ No newline at end of file diff --git a/huggingface/pytorch/inference/docker/2.3/py3/cu121/Dockerfile.gpu b/huggingface/pytorch/inference/docker/2.3/py3/cu121/Dockerfile.gpu new file mode 100644 index 000000000000..26c6da73d603 --- /dev/null +++ b/huggingface/pytorch/inference/docker/2.3/py3/cu121/Dockerfile.gpu @@ -0,0 +1,265 @@ +FROM nvidia/cuda:12.1.1-base-ubuntu20.04 AS base_image + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" +LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true +LABEL com.amazonaws.sagemaker.capabilities.multi-models=true +LABEL com.amazonaws.sagemaker.inference.cuda.verified_versions=12.2 + +ARG MMS_VERSION=1.1.11 +ARG PYTHON=python3 +ARG PYTHON_VERSION=3.11.9 +ARG MINIFORGE3_VERSION=23.11.0-0 +ARG OPEN_MPI_VERSION=4.1.5 + +# Nvidia software versions +ARG CUBLAS_VERSION=12.1.3.1 +ARG CUDNN_VERSION=8.9.2.26 +ARG NCCL_VERSION=2.20.5 +ARG NVML_VERSION=12.1.55 + +# PyTorch Binaries and versions. +ARG TORCH_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/pytorch/v2.3.0/cuda12.1.1/torch-2.3.0%2Bcu121-cp311-cp311-linux_x86_64.whl +ARG TORCHVISION_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/pytorch/v2.3.0/cuda12.1.1/torchvision-0.18.0%2Bcu121-cp311-cp311-linux_x86_64.whl +ARG TORCHAUDIO_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/pytorch/v2.3.0/cuda12.1.1/torchaudio-2.3.0%2Bcu121-cp311-cp311-linux_x86_64.whl +ARG TRITON_VERSION=2.3.0 + +# HF ARGS +ARG TRANSFORMERS_VERSION +ARG HUGGINGFACE_HUB_VERSION=0.25.1 +ARG DIFFUSERS_VERSION=0.31.0 +ARG PEFT_VERSION=0.13.2 +ARG ACCELERATE_VERSION=1.0.1 + +# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 20 +ENV DEBIAN_FRONTEND=noninteractive + +# Python won’t try to write .pyc or .pyo files on the import of source modules +# Force stdin, stdout and stderr to be totally unbuffered. 
Good for logging +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" +ENV LD_LIBRARY_PATH="/opt/conda/lib:${LD_LIBRARY_PATH}" +ENV PYTHONIOENCODING=UTF-8 +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 +ENV PATH=/opt/conda/bin:$PATH + +# Set Env Variables for the images +ENV TEMP=/tmp +ENV MKL_THREADING_LAYER=GNU + +# Cuda Arch List setting Options +ENV TORCH_CUDA_ARCH_LIST="5.0 7.0+PTX 7.5+PTX 8.0 8.6 9.0" + +ENV DLC_CONTAINER_TYPE=inference + +WORKDIR / + +RUN apt-get update \ + && apt-get -y upgrade \ + && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \ + build-essential \ + ca-certificates \ + cmake \ + libgssapi-krb5-2 \ + libcurl4-openssl-dev \ + cuda-cudart-12-1 \ + cuda-cudart-dev-12-1 \ + cuda-libraries-12-1 \ + cuda-libraries-dev-12-1 \ + cuda-command-line-tools-12-1 \ + cuda-nvcc-12-1 \ + libcublas-12-1=${CUBLAS_VERSION}-1 \ + libcublas-dev-12-1=${CUBLAS_VERSION}-1 \ + cuda-nvml-dev-12-1=${NVML_VERSION}-1 \ + libcudnn8=${CUDNN_VERSION}-1+cuda12.1 \ + curl \ + emacs \ + git \ + jq \ + libgl1-mesa-glx \ + libglib2.0-0 \ + libgomp1 \ + libibverbs-dev \ + libnuma1 \ + libnuma-dev \ + libsm6 \ + libssl1.1 \ + libssl-dev \ + libxext6 \ + libxrender-dev \ + openjdk-17-jdk \ + openssl \ + vim \ + wget \ + unzip \ + libjpeg-dev \ + libpng-dev \ + zlib1g-dev \ + openssh-client \ + openssh-server \ + python3-dev \ + libsndfile1-dev \ + ffmpeg \ + && apt-get autoremove -y \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +# Install NCCL +RUN cd /tmp \ + && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION}-1 \ + && cd nccl \ + && make -j64 src.build BUILDDIR=/usr/local \ + && rm -rf /tmp/nccl +# preload system nccl for PyTorch to use if it is dynamically linking NCCL +ENV LD_PRELOAD="/usr/local/lib/libnccl.so" + +# Install OpenMPI +RUN wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OPEN_MPI_VERSION}.tar.gz \ + && gunzip -c openmpi-${OPEN_MPI_VERSION}.tar.gz | tar xf - \ + && cd openmpi-${OPEN_MPI_VERSION} \ + && ./configure --prefix=/home/.openmpi --with-cuda \ + && make all install \ + && cd .. \ + && rm openmpi-${OPEN_MPI_VERSION}.tar.gz \ + && rm -rf openmpi-${OPEN_MPI_VERSION} + +# The ENV variables declared below are changed in the previous section +# Grouping these ENV variables in the first section causes +# ompi_info to fail. 
This is only observed in CPU containers +ENV PATH="$PATH:/home/.openmpi/bin" +ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/" +RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value + +# Install CondaForge miniconda +RUN curl -L -o ~/miniforge3.sh https://github.com/conda-forge/miniforge/releases/download/${MINIFORGE3_VERSION}/Miniforge3-${MINIFORGE3_VERSION}-Linux-x86_64.sh \ + && chmod +x ~/miniforge3.sh \ + && ~/miniforge3.sh -b -p /opt/conda \ + && rm ~/miniforge3.sh + +# Install common conda packages +RUN /opt/conda/bin/conda install -y -c conda-forge \ + python=${PYTHON_VERSION} \ + cython \ + "mkl<2024.1.0" \ + mkl-include \ + parso \ + scipy \ + numpy \ + pandas \ + pyarrow \ + typing \ + h5py \ + libgcc \ + cmake \ + packaging \ + "awscli<2" \ + boto3 \ + pyyaml \ + packaging \ + conda-content-trust \ + charset-normalizer \ + requests \ + "idna>=3.7"\ + "tqdm>=4.66.3" \ + "zstandard>=0.22.0" \ + && /opt/conda/bin/conda clean -afy \ + && rm -rf /etc/apt/sources.list.d/* + +# symlink pip for OS use +RUN pip install --upgrade pip --no-cache-dir --trusted-host pypi.org --trusted-host files.pythonhosted.org \ + && ln -s /opt/conda/bin/pip /usr/local/bin/pip3 + +# Install Common python packages +RUN pip install --no-cache-dir -U \ + opencv-python \ + # "nvgpu" is a dependency of TS but is disabled in SM DLC build, + # via ENV Variable "TS_DISABLE_SYSTEM_METRICS=true" in the SM section of this file. + # due to incompatibility with SM hosts + nvgpu \ + "pyopenssl>=24.0.0" \ + enum-compat==0.0.3 \ + captum \ + "Pillow>=9.0.0" \ + "cryptography>=42.0.5" \ + "ipython>=8.10.0,<9.0" \ + "urllib3>=1.26.18,<2" \ + "prompt-toolkit<3.0.39" \ + "setuptools>=70.0.0" + +# Ensure PyTorch did not get installed from Conda or pip, prior to now +# Any Nvidia installs for the DLC will be below, removing nvidia and cuda packages from pip here +# Even the GPU image would not have nvidia or cuda packages in PIP. 
+RUN pip uninstall -y torch torchvision torchaudio torchdata model-archiver multi-model-server
+
+# Install AWS-PyTorch, and other torch packages
+RUN pip install --no-cache-dir -U \
+    # triton required for torch inductor
+    triton==${TRITON_VERSION} \
+    ${TORCH_URL} \
+    ${TORCHVISION_URL} \
+    ${TORCHAUDIO_URL}
+
+# Patches
+# py-vuln: 71064
+RUN pip install --no-cache-dir -U "requests>=2.32.3"
+
+# add necessary certificate for aws sdk cpp download
+RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt
+
+# Install MMS
+RUN pip install --no-cache-dir \
+    multi-model-server==$MMS_VERSION \
+    sagemaker-inference
+
+# create user and folders
+RUN useradd -m model-server \
+    && mkdir -p /home/model-server/tmp /opt/ml/model \
+    && chown -R model-server /home/model-server /opt/ml/model
+
+# add MMS entrypoint
+COPY mms-entrypoint.py /usr/local/bin/dockerd-entrypoint.py
+COPY config.properties /etc/sagemaker-mms.properties
+RUN chmod +x /usr/local/bin/dockerd-entrypoint.py
+
+# add telemetry
+COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
+RUN chmod +x /usr/local/bin/deep_learning_container.py
+
+#################################
+# Hugging Face specific section #
+#################################
+
+# install Hugging Face libraries and its dependencies
+RUN pip install --no-cache-dir \
+    # hf_transfer will be a built-in feature, remove the extra then
+    huggingface_hub[hf_transfer]==${HUGGINGFACE_HUB_VERSION} \
+    transformers[sentencepiece,audio,vision]==${TRANSFORMERS_VERSION} \
+    diffusers==${DIFFUSERS_VERSION} \
+    peft==${PEFT_VERSION} \
+    accelerate==${ACCELERATE_VERSION} \
+    "sagemaker-huggingface-inference-toolkit==2.4.1"
+
+# hf_transfer will be a built-in feature, remove the env variable then
+ENV HF_HUB_ENABLE_HF_TRANSFER="1"
+
+RUN HOME_DIR=/root \
+    && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
+    && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
+    && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
+    && chmod +x /usr/local/bin/testOSSCompliance \
+    && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
+    && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
+    && rm -rf ${HOME_DIR}/oss_compliance*
+
+RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.3/license.txt
+
+## Cleanup ##
+RUN pip cache purge \
+    && rm -rf /tmp/tmp* \
+    && rm -iRf /root/.cache
+
+EXPOSE 8080 8081
+ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"]
+CMD ["serve"]
\ No newline at end of file
diff --git a/test/sagemaker_tests/huggingface/inference/integration/sagemaker/test_diffusers_model.py b/test/sagemaker_tests/huggingface/inference/integration/sagemaker/test_diffusers_model.py
index 0358fc1ea026..b0b1a59a72c1 100644
--- a/test/sagemaker_tests/huggingface/inference/integration/sagemaker/test_diffusers_model.py
+++ b/test/sagemaker_tests/huggingface/inference/integration/sagemaker/test_diffusers_model.py
@@ -65,7 +65,7 @@ def test_diffusers_gpu_hosting(
     framework, _ = get_framework_and_version_from_tag(ecr_image)
     if "pytorch" not in framework:
         pytest.skip(f"Skipping test for non-pytorch image - {ecr_image}")
-    instance_type = instance_type or "ml.p3.2xlarge"
+    instance_type = instance_type or "ml.g5.4xlarge"
     invoke_sm_endpoint_helper_function(
         ecr_image=ecr_image,
         sagemaker_regions=sagemaker_regions,
diff --git a/test/sagemaker_tests/huggingface/inference/integration/sagemaker/test_ipex_inference.py b/test/sagemaker_tests/huggingface/inference/integration/sagemaker/test_ipex_inference.py
index 4d22191d3c20..61c6609324b6 100644
--- a/test/sagemaker_tests/huggingface/inference/integration/sagemaker/test_ipex_inference.py
+++ b/test/sagemaker_tests/huggingface/inference/integration/sagemaker/test_ipex_inference.py
@@ -32,6 +32,7 @@
 @pytest.mark.processor("cpu")
 @pytest.mark.cpu_test
 @pytest.mark.team("sagemaker-1p-algorithms")
+@pytest.mark.skip("Skip for pytorch 2.3, since ipex installation failed.")
 def test_ipex_hosting(framework_version, ecr_image, instance_type, sagemaker_regions, py_version):
     framework, _ = get_framework_and_version_from_tag(ecr_image)
     if "pytorch" not in framework:
diff --git a/test/sagemaker_tests/huggingface/inference/integration/sagemaker/test_torch_compile.py b/test/sagemaker_tests/huggingface/inference/integration/sagemaker/test_torch_compile.py
index 10063bccbb07..5fd3b999d5d0 100644
--- a/test/sagemaker_tests/huggingface/inference/integration/sagemaker/test_torch_compile.py
+++ b/test/sagemaker_tests/huggingface/inference/integration/sagemaker/test_torch_compile.py
@@ -67,7 +67,7 @@ def test_torch_compile_gpu_hosting(
         pytest.skip(f"Skipping test for non-pytorch image - {ecr_image}")
     if Version(framework_version) < Version("2.0"):
         pytest.skip("Skipping torch compile tests for PT 1.X")
-    instance_type = instance_type or "ml.p3.2xlarge"
+    instance_type = instance_type or "ml.g5.4xlarge"
     invoke_sm_endpoint_helper_function(
         ecr_image=ecr_image,
         sagemaker_regions=sagemaker_regions,
diff --git a/test/sagemaker_tests/huggingface/inference/requirements.txt b/test/sagemaker_tests/huggingface/inference/requirements.txt
index 0ed2dd6ba8f4..c2676a72a6bf 100644
--- a/test/sagemaker_tests/huggingface/inference/requirements.txt
+++ b/test/sagemaker_tests/huggingface/inference/requirements.txt
@@ -27,5 +27,3 @@ fabric
 invoke
 gitpython
 toml
-huggingface_hub==0.23.2
-transformers==4.28.1

From 0d6e261cb276f776efed8a1ca0838ec64850aa77 Mon Sep 17 00:00:00 2001
From: Sally Seok <49303563+sallyseok@users.noreply.github.com>
Date: Fri, 15 Nov 2024 11:18:49 -0800
Subject: [PATCH 6/6] Add dummy arm64 buildspec files (#4424)

---
 pytorch/inference/buildspec-arm64.yml    | 1 +
 tensorflow/inference/buildspec-arm64.yml | 1 +
 2 files changed, 2 insertions(+)
 create mode 100644 pytorch/inference/buildspec-arm64.yml
 create mode 100644 tensorflow/inference/buildspec-arm64.yml

diff --git a/pytorch/inference/buildspec-arm64.yml b/pytorch/inference/buildspec-arm64.yml
new file mode 100644
index 000000000000..baa195e4a814
--- /dev/null
+++ b/pytorch/inference/buildspec-arm64.yml
@@ -0,0 +1 @@
+buildspec_pointer: buildspec-graviton-2-4-ec2.yml
diff --git a/tensorflow/inference/buildspec-arm64.yml b/tensorflow/inference/buildspec-arm64.yml
new file mode 100644
index 000000000000..6d7eee9008f8
--- /dev/null
+++ b/tensorflow/inference/buildspec-arm64.yml
@@ -0,0 +1 @@
+buildspec_pointer: buildspec-2-16-graviton.yml