Lightning-AI · Borda · May 6, 2022 · Mar 18, 2022 · Mar 18, 2022 · Mar 25, 2022
@@ -28,13 +28,18 @@ jobs:
     cancelTimeoutInMinutes: "2"
     pool: azure-gpus-spot
     container:
-      # TODO: Unpin sha256
-      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8@sha256:b75de74d4c7c820f442f246be8500c93f8b5797b84aa8531847e5fb317ed3dda"
+      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
       options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g"
     workspace:
       clean: all
 
     steps:
+      # Manually installs the pinned PyTorch 1.8.2 LTS build (cu102 wheels) together with the pinned
+      # torchvision and torchtext versions from the PyTorch LTS wheel index, then prints the package list.
+      - bash: |
+          # TODO: Prepare a docker image with 1.8.2 (LTS) installed and remove manual installation.
+          pip install torch==1.8.2+cu102 torchvision==0.9.2+cu102 torchtext==0.9.2 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
+          pip list
+        displayName: 'Install PyTorch LTS'
+
       - bash: |
           python -m pytest tests/benchmarks -v --durations=0
         displayName: 'Testing: benchmarks'

@@ -29,8 +29,7 @@ jobs:
     container:
       # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04
       # run on torch 1.8 as it's the LTS version
-      # TODO: Unpin sha256
-      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8@sha256:b75de74d4c7c820f442f246be8500c93f8b5797b84aa8531847e5fb317ed3dda"
+      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
       # default shm size is 64m. Increase it to avoid:
       # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
       options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m"
@@ -55,6 +54,8 @@ jobs:
         CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
         pip install "bagua-cuda$CUDA_VERSION_MM>=0.9.0"
         pip install . --requirement requirements/devel.txt
+        # TODO: Prepare a docker image with 1.8.2 (LTS) installed and remove manual installation.
+        pip install torch==1.8.2+cu102 torchvision==0.9.2+cu102 torchtext==0.9.2 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
         pip list
       displayName: 'Install dependencies'
 

@@ -75,6 +75,7 @@ jobs:
       matrix:
         include:
           # the config used in '.azure-pipelines/gpu-tests.yml'
+          - {python_version: "3.7", pytorch_version: "1.8", cuda_version: "10.2"}
           - {python_version: "3.7", pytorch_version: "1.10", cuda_version: "11.1"}
           - {python_version: "3.7", pytorch_version: "1.11", cuda_version: "11.3.1"}
           # latest (used in Tutorials)

@@ -115,6 +115,7 @@ jobs:
       matrix:
         include:
           # the config used in '.azure-pipelines/gpu-tests.yml'
+          - {python_version: "3.7", pytorch_version: "1.8", cuda_version: "10.2"}
           - {python_version: "3.7", pytorch_version: "1.10", cuda_version: "11.1"}
           - {python_version: "3.7", pytorch_version: "1.11", cuda_version: "11.3.1"}
           # latest (used in Tutorials)

@@ -14,7 +14,8 @@
 
 ARG CUDA_VERSION=11.3.1
 
-FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
+# TODO: Update OS to ubuntu20.04 when dropping CUDA 10.2
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu18.04
 
 ARG PYTHON_VERSION=3.9
 ARG PYTORCH_VERSION=1.8
@@ -47,6 +48,8 @@ RUN \
         ca-certificates \
         software-properties-common \
         libopenmpi-dev \
+        openmpi-bin \
+        ssh \
     && \
 
 # Install python
@@ -110,10 +113,14 @@ ENV \
     HOROVOD_WITH_MPI=1
 
 RUN \
+    # CUDA toolkits older than 11.0 cannot target the Ampere architecture (8.0), so drop it from the arch list.
+    # The check compares the numeric major version; a string comparison would order "9.2" after "11.0".
+    if [[ "${CUDA_VERSION%%.*}" -lt 11 ]]; then export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST//";8.0"/}; echo $TORCH_CUDA_ARCH_LIST; fi && \
     HOROVOD_BUILD_CUDA_CC_LIST=${TORCH_CUDA_ARCH_LIST//";"/","} && \
     export HOROVOD_BUILD_CUDA_CC_LIST=${HOROVOD_BUILD_CUDA_CC_LIST//"."/""} && \
+    echo $HOROVOD_BUILD_CUDA_CC_LIST && \
     cmake --version && \
     pip install --no-cache-dir -r ./requirements/strategies.txt && \
+    horovodrun --check-build && \
     rm -rf requirements/
 
 RUN \
@@ -127,6 +134,8 @@ RUN \
     fi
 
 RUN \
+    # CUDA toolkits older than 11.0 cannot target the Ampere architecture (8.0), so drop it from the arch list.
+    # The check compares the numeric major version; a string comparison would order "9.2" after "11.0".
+    if [[ "${CUDA_VERSION%%.*}" -lt 11 ]]; then export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST//";8.0"/}; echo $TORCH_CUDA_ARCH_LIST; fi && \
     # install NVIDIA apex
     pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" https://github.com/NVIDIA/apex/archive/refs/heads/master.zip && \
     python -c "from apex import amp"

@@ -101,7 +101,7 @@ def test_amp_cpus(tmpdir, strategy, precision, devices):
 
 @RunIf(min_gpus=2, min_torch="1.10")
 @pytest.mark.parametrize("strategy", [None, "dp", "ddp_spawn"])
-@pytest.mark.parametrize("precision", [16, "bf16"])
+@pytest.mark.parametrize("precision", [16, pytest.param("bf16", marks=RunIf(bf16_cuda=True))])
 @pytest.mark.parametrize("devices", [1, 2])
 def test_amp_gpus(tmpdir, strategy, precision, devices):
     """Make sure combinations of AMP and strategies work if supported."""