Skip to content

Commit

Permalink
Move ppc64le CI to TF 2.6.0 and PyTorch 1.9.1 (horovod#3208)
Browse files Browse the repository at this point in the history
Signed-off-by: Nicolas Castet <26874160+nvcastet@users.noreply.github.com>
  • Loading branch information
nvcastet authored Dec 13, 2021
1 parent df18797 commit 3627db3
Showing 1 changed file with 7 additions and 8 deletions.
15 changes: 7 additions & 8 deletions Jenkinsfile.ppc64le
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ pipeline {
agent {
docker {
alwaysPull true
// WMLCE 1.7.0 has CUDA 10.2, NCCL 2.5.6, TensorFlow 2.1.0, and PyTorch 1.8.0
image 'tensorflowppc64le/tensorflow-ppc64le:osuosl-ubuntu-horovod-wlmce1.7.0-py3.7-ppc64le'
// Open-CE 1.4.1 has CUDA 10.2, NCCL 2.8.3, TensorFlow 2.6.0, and PyTorch 1.9.1
image 'tensorflowppc64le/tensorflow-ppc64le:osuosl-ubi7-horovod-opence1.4.1-py3.9-ppc64le'
args '--cap-add=SYS_PTRACE --shm-size=256g'
label 'power8-gpu'
registryCredentialsId 'TensorFlow'
Expand All @@ -25,10 +25,9 @@ pipeline {
git submodule update --init --recursive
. ${CONDA_INIT}
conda activate ${CONDA_ENV}
conda install -y cmake make
set -xe
HOROVOD_WITHOUT_MXNET=1 HOROVOD_WITHOUT_GLOO=1 HOROVOD_WITHOUT_PYTORCH=1 HOROVOD_WITH_TENSORFLOW=1 \
HOROVOD_CUDA_HOME=$CONDA_PREFIX HOROVOD_GPU_OPERATIONS=NCCL MAKEFLAGS="-j1" \
HOROVOD_WITHOUT_MXNET=1 HOROVOD_WITHOUT_GLOO=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_WITH_TENSORFLOW=1 \
HOROVOD_CUDA_HOME="/usr/local/cuda" HOROVOD_GPU_OPERATIONS=NCCL \
pip install -v . --no-cache-dir --no-deps
'''
}
Expand All @@ -42,12 +41,12 @@ pipeline {
set -xe

# TensorFlow unit tests
horovodrun -n 2 -H localhost:2 --mpi-args="-pami_noib" pytest -k 'not multi_gpu' -v -s test/parallel/test_tensorflow.py
horovodrun -n 2 -H localhost:2 pytest -k 'not multi_gpu' -v -s test/parallel/test_tensorflow.py
# Container has only 2 GPUs, so run the 'multi_gpu' test separately on one process
horovodrun -n 1 -H localhost:1 --mpi-args="-pami_noib" pytest -k 'multi_gpu' -v -s test/parallel/test_tensorflow.py
horovodrun -n 1 -H localhost:1 pytest -k 'multi_gpu' -v -s test/parallel/test_tensorflow.py

# PyTorch unit tests
# horovodrun -n 2 -H localhost:2 --mpi-args="-pami_noib" pytest -v -s test/parallel/test_torch.py
horovodrun -n 2 -H localhost:2 pytest -v -s test/parallel/test_torch.py
'''
}
}
Expand Down

0 comments on commit 3627db3

Please sign in to comment.