diff --git a/.github/gen-workflow-ci.py b/.github/gen-workflow-ci.py index 5df3aef273..bf4f7e06fb 100644 --- a/.github/gen-workflow-ci.py +++ b/.github/gen-workflow-ci.py @@ -185,9 +185,11 @@ def build_and_test_images(id: str, images: List[str], parallel_images: str, tests_per_image: Dict[str, Set[str]], - tests: Dict[str, Dict]) -> str: + tests: Dict[str, Dict], + attempts: int = 3) -> str: if 'init-workflow' not in needs: needs.insert(0, 'init-workflow') + failure = "'failure'" return (f' {id}:\n' f' name: "{name} (${{{{ matrix.image }}}})"\n' f' needs: [{", ".join(needs)}]\n' @@ -298,13 +300,16 @@ def build_and_test_images(id: str, f' COMPOSE_DOCKER_CLI_BUILD: 1\n' f' DOCKER_BUILDKIT: 1\n' f'\n' + - '\n'.join([f' - name: "{test["label"]}"\n' - f' if: always() && steps.build.outcome == \'success\' && matrix.{test_id}\n' + '\n'.join([f' - name: "{test["label"]} [attempt {attempt} of {attempts}]"\n' + f' id: {test_id}_{attempt}\n' + f' continue-on-error: {"true" if attempt < attempts else "false"}\n' + f' if: always() && steps.build.outcome == \'success\' && matrix.{test_id} && {"true" if attempt == 1 else f"steps.{test_id}_{attempt-1}.outcome == {failure}"}\n' f' run: |\n' - f' mkdir -p artifacts/${{{{ matrix.image }}}}/{test_id}\n' - f' docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{{{ matrix.image }}}}/{test_id}:/artifacts" ${{{{ matrix.image }}}} /bin/bash /horovod/.github/timeout-and-retry.sh {test["timeout"]}m 3 10 {test["command"]}\n' + f' mkdir -p artifacts/${{{{ matrix.image }}}}/{test_id}_{attempt}\n' + f' docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{{{ matrix.image }}}}/{test_id}_{attempt}:/artifacts" ${{{{ matrix.image }}}} /usr/bin/timeout {test["timeout"]}m {test["command"]}\n' f' shell: bash\n' - for test_id, test in sorted(tests.items(), key=lambda test: test[0])]) + + for test_id, test in sorted(tests.items(), key=lambda test: test[0]) + for attempt in range(1, attempts+1)]) + f'\n' f' - name: Upload Test Results\n' f' uses: actions/upload-artifact@v2\n' diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 9744cdead3..00505c4464 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -310,566 +310,1586 @@ jobs: COMPOSE_DOCKER_CLI_BUILD: 1 DOCKER_BUILDKIT: 1 - - name: "Elastic Spark TensorFlow Tests 1" - if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_1 + - name: "Elastic Spark TensorFlow Tests 1 [attempt 1 of 3]" + id: Elastic_Spark_TensorFlow_Tests_1_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_1 && true run: | - mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1 - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 20m 3 10 bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow2.py" + mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 20m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow2.py" shell: bash - - name: "Elastic Spark TensorFlow Tests 2" - if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_2 + - name: "Elastic Spark TensorFlow Tests 1 [attempt 2 of 3]" + id: Elastic_Spark_TensorFlow_Tests_1_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_1 && steps.Elastic_Spark_TensorFlow_Tests_1_1.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2 - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 20m 3 10 bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow.py" + mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 20m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow2.py" shell: bash - - name: "Elastic Spark Torch Tests" - if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_Torch_Tests + - name: "Elastic Spark TensorFlow Tests 1 [attempt 3 of 3]" + id: Elastic_Spark_TensorFlow_Tests_1_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_1 && steps.Elastic_Spark_TensorFlow_Tests_1_2.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 20m 3 10 bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.torch.xml test_elastic_spark_torch.py" + mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 20m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow2.py" shell: bash - - name: "Elastic Tests 1" - if: always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_1 + - name: "Elastic Spark TensorFlow Tests 2 [attempt 1 of 3]" + id: Elastic_Spark_TensorFlow_Tests_2_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_2 && true run: | - mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_1 - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_1:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow2.py" + mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 20m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow.py" shell: bash - - name: "Elastic Tests 2" - if: always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_2 + - name: "Elastic Spark TensorFlow Tests 2 [attempt 2 of 3]" + id: Elastic_Spark_TensorFlow_Tests_2_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_2 && steps.Elastic_Spark_TensorFlow_Tests_2_1.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_2 - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_2:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow.py test_elastic_tensorflow_keras.py" + mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 20m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow.py" shell: bash - - name: "Gloo Cluster PyTests" - if: always() && steps.build.outcome == 'success' && matrix.Gloo_Cluster_PyTests + - name: "Elastic Spark TensorFlow Tests 2 [attempt 3 of 3]" + id: Elastic_Spark_TensorFlow_Tests_2_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_2 && steps.Elastic_Spark_TensorFlow_Tests_2_2.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.static.xml test_static_run.py" + mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 20m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow.py" shell: bash - - name: "Gloo Keras MNIST" - if: always() && steps.build.outcome == 'success' && matrix.Gloo_Keras_MNIST + - name: "Elastic Spark Torch Tests [attempt 1 of 3]" + id: Elastic_Spark_Torch_Tests_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_Torch_Tests && true run: | - mkdir -p artifacts/${{ matrix.image }}/Gloo_Keras_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Keras_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/keras/keras_mnist_advanced.py + mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 20m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.torch.xml test_elastic_spark_torch.py" shell: bash - - name: "Gloo MXNet2 MNIST" - if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet2_MNIST + - name: "Elastic Spark Torch Tests [attempt 2 of 3]" + id: Elastic_Spark_Torch_Tests_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_Torch_Tests && steps.Elastic_Spark_Torch_Tests_1.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet2_mnist.py + mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 20m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.torch.xml test_elastic_spark_torch.py" shell: bash - - name: "Gloo MXNet MNIST" - if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet_MNIST + - name: "Elastic Spark Torch Tests [attempt 3 of 3]" + id: Elastic_Spark_Torch_Tests_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_Torch_Tests && steps.Elastic_Spark_Torch_Tests_2.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet_mnist.py + mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 20m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.torch.xml test_elastic_spark_torch.py" shell: bash - - name: "Gloo Parallel PyTests" - if: always() && steps.build.outcome == 'success' && matrix.Gloo_Parallel_PyTests + - name: "Elastic Tests 1 [attempt 1 of 3]" + id: Elastic_Tests_1_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_1 && true run: | - mkdir -p artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 5m 3 10 bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 horovodrun -np 2 -H localhost:2 --gloo /bin/bash /pytest.sh gloo)" + mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_1_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_1_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow2.py" shell: bash - - name: "Gloo PyTorch MNIST" - if: always() && steps.build.outcome == 'success' && matrix.Gloo_PyTorch_MNIST + - name: "Elastic Tests 1 [attempt 2 of 3]" + id: Elastic_Tests_1_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_1 && steps.Elastic_Tests_1_1.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets + mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_1_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_1_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow2.py" shell: bash - - name: "Gloo Single PyTests" - if: always() && steps.build.outcome == 'success' && matrix.Gloo_Single_PyTests + - name: "Elastic Tests 1 [attempt 3 of 3]" + id: Elastic_Tests_1_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_1 && steps.Elastic_Tests_1_2.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Gloo_Single_PyTests - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Single_PyTests:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 15m 3 10 bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh gloo)" + mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_1_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_1_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow2.py" shell: bash - - name: "Gloo TensorFlow 2.0 Keras MNIST" - if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_Keras_MNIST + - name: "Elastic Tests 2 [attempt 1 of 3]" + id: Elastic_Tests_2_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_2 && true run: | - mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py + mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_2_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_2_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow.py test_elastic_tensorflow_keras.py" shell: bash - - name: "Gloo TensorFlow 2.0 MNIST" - if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST + - name: "Elastic Tests 2 [attempt 2 of 3]" + id: Elastic_Tests_2_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_2 && steps.Elastic_Tests_2_1.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist.py + mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_2_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_2_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow.py test_elastic_tensorflow_keras.py" shell: bash - - name: "Gloo TensorFlow MNIST" - if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_MNIST + - name: "Elastic Tests 2 [attempt 3 of 3]" + id: Elastic_Tests_2_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_2 && steps.Elastic_Tests_2_2.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow/tensorflow_mnist.py + mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_2_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_2_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow.py test_elastic_tensorflow_keras.py" shell: bash - - name: "MPI Cluster PyTests" - if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests + - name: "Gloo Cluster PyTests [attempt 1 of 3]" + id: Gloo_Cluster_PyTests_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Gloo_Cluster_PyTests && true run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py" + mkdir -p artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.static.xml test_static_run.py" shell: bash - - name: "MPI Cluster PyTests [ONECCL MPI]" - if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_MPI + - name: "Gloo Cluster PyTests [attempt 2 of 3]" + id: Gloo_Cluster_PyTests_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Gloo_Cluster_PyTests && steps.Gloo_Cluster_PyTests_1.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py" + mkdir -p artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.static.xml test_static_run.py" shell: bash - - name: "MPI Cluster PyTests [ONECCL OFI]" - if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_OFI + - name: "Gloo Cluster PyTests [attempt 3 of 3]" + id: Gloo_Cluster_PyTests_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Gloo_Cluster_PyTests && steps.Gloo_Cluster_PyTests_2.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py" + mkdir -p artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.static.xml test_static_run.py" shell: bash - - name: "MPI MXNet MNIST" - if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST + - name: "Gloo Keras MNIST [attempt 1 of 3]" + id: Gloo_Keras_MNIST_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Gloo_Keras_MNIST && true run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py" + mkdir -p artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/keras/keras_mnist_advanced.py shell: bash - - name: "MPI MXNet MNIST [ONECCL MPI]" - if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_ONECCL_MPI + - name: "Gloo Keras MNIST [attempt 2 of 3]" + id: Gloo_Keras_MNIST_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Gloo_Keras_MNIST && steps.Gloo_Keras_MNIST_1.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_ONECCL_MPI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_ONECCL_MPI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py" + mkdir -p artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/keras/keras_mnist_advanced.py shell: bash - - name: "MPI MXNet MNIST [ONECCL OFI]" - if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_ONECCL_OFI + - name: "Gloo Keras MNIST [attempt 3 of 3]" + id: Gloo_Keras_MNIST_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Gloo_Keras_MNIST && steps.Gloo_Keras_MNIST_2.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_ONECCL_OFI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_ONECCL_OFI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py" + mkdir -p artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/keras/keras_mnist_advanced.py shell: bash - - name: "MPI Parallel PyTests" - if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests + - name: "Gloo MXNet2 MNIST [attempt 1 of 3]" + id: Gloo_MXNet2_MNIST_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet2_MNIST && true run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 5m 3 10 bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)" + mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet2_mnist.py shell: bash - - name: "MPI Parallel PyTests [ONECCL MPI]" - if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_MPI + - name: "Gloo MXNet2 MNIST [attempt 2 of 3]" + id: Gloo_MXNet2_MNIST_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet2_MNIST && steps.Gloo_MXNet2_MNIST_1.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 5m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)" + mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet2_mnist.py shell: bash - - name: "MPI Parallel PyTests [ONECCL OFI]" - if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_OFI + - name: "Gloo MXNet2 MNIST [attempt 3 of 3]" + id: Gloo_MXNet2_MNIST_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet2_MNIST && steps.Gloo_MXNet2_MNIST_2.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 5m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)" + mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet2_mnist.py shell: bash - - name: "MPI PyTorch MNIST" - if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST + - name: "Gloo MXNet MNIST [attempt 1 of 3]" + id: Gloo_MXNet_MNIST_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet_MNIST && true run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c " \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" + mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet_mnist.py shell: bash - - name: "MPI PyTorch MNIST [ONECCL MPI]" - if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_ONECCL_MPI + - name: "Gloo MXNet MNIST [attempt 2 of 3]" + id: Gloo_MXNet_MNIST_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet_MNIST && steps.Gloo_MXNet_MNIST_1.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_ONECCL_MPI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_ONECCL_MPI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" + mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet_mnist.py shell: bash - - name: "MPI PyTorch MNIST [ONECCL OFI]" - if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_ONECCL_OFI + - name: "Gloo MXNet MNIST [attempt 3 of 3]" + id: Gloo_MXNet_MNIST_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet_MNIST && steps.Gloo_MXNet_MNIST_2.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_ONECCL_OFI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_ONECCL_OFI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" + mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet_mnist.py shell: bash - - name: "MPI Single PyTests" - if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests + - name: "Gloo Parallel PyTests [attempt 1 of 3]" + id: Gloo_Parallel_PyTests_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Gloo_Parallel_PyTests && true run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" + mkdir -p artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 5m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 horovodrun -np 2 -H localhost:2 --gloo /bin/bash /pytest.sh gloo)" shell: bash - - name: "MPI Single PyTests [ONECCL MPI]" - if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_MPI + - name: "Gloo Parallel PyTests [attempt 2 of 3]" + id: Gloo_Parallel_PyTests_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Gloo_Parallel_PyTests && steps.Gloo_Parallel_PyTests_1.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" + mkdir -p artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 5m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 horovodrun -np 2 -H localhost:2 --gloo /bin/bash /pytest.sh gloo)" shell: bash - - name: "MPI Single PyTests [ONECCL OFI]" - if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_OFI + - name: "Gloo Parallel PyTests [attempt 3 of 3]" + id: Gloo_Parallel_PyTests_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Gloo_Parallel_PyTests && steps.Gloo_Parallel_PyTests_2.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" + mkdir -p artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 5m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 horovodrun -np 2 -H localhost:2 --gloo /bin/bash /pytest.sh gloo)" shell: bash - - name: "MPI TensorFlow 2.0 Keras MNIST" - if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST + - name: "Gloo PyTorch MNIST [attempt 1 of 3]" + id: Gloo_PyTorch_MNIST_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Gloo_PyTorch_MNIST && true run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" + mkdir -p artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets shell: bash - - name: "MPI TensorFlow 2.0 Keras MNIST [ONECCL MPI]" - if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_MPI + - name: "Gloo PyTorch MNIST [attempt 2 of 3]" + id: Gloo_PyTorch_MNIST_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Gloo_PyTorch_MNIST && steps.Gloo_PyTorch_MNIST_1.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_MPI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_MPI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" + mkdir -p artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets shell: bash - - name: "MPI TensorFlow 2.0 Keras MNIST [ONECCL OFI]" - if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_OFI + - name: "Gloo PyTorch MNIST [attempt 3 of 3]" + id: Gloo_PyTorch_MNIST_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Gloo_PyTorch_MNIST && steps.Gloo_PyTorch_MNIST_2.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_OFI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_OFI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" + mkdir -p artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets shell: bash - - name: "MPI TensorFlow 2.0 MNIST" - if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST + - name: "Gloo Single PyTests [attempt 1 of 3]" + id: Gloo_Single_PyTests_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Gloo_Single_PyTests && true run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" + mkdir -p artifacts/${{ matrix.image }}/Gloo_Single_PyTests_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Single_PyTests_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh gloo)" shell: bash - - name: "MPI TensorFlow 2.0 MNIST [ONECCL MPI]" - if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_ONECCL_MPI + - name: "Gloo Single PyTests [attempt 2 of 3]" + id: Gloo_Single_PyTests_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Gloo_Single_PyTests && steps.Gloo_Single_PyTests_1.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_ONECCL_MPI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_ONECCL_MPI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" + mkdir -p artifacts/${{ matrix.image }}/Gloo_Single_PyTests_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Single_PyTests_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh gloo)" shell: bash - - name: "MPI TensorFlow 2.0 MNIST [ONECCL OFI]" - if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_ONECCL_OFI + - name: "Gloo Single PyTests [attempt 3 of 3]" + id: Gloo_Single_PyTests_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Gloo_Single_PyTests && steps.Gloo_Single_PyTests_2.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_ONECCL_OFI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_ONECCL_OFI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" + mkdir -p artifacts/${{ matrix.image }}/Gloo_Single_PyTests_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Single_PyTests_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh gloo)" shell: bash - - name: "Run PyTests test_interactiverun" - if: always() && steps.build.outcome == 'success' && matrix.Run_PyTests_test_interactiverun + - name: "Gloo TensorFlow 2.0 Keras MNIST [attempt 1 of 3]" + id: Gloo_TensorFlow_2_0_Keras_MNIST_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_Keras_MNIST && true run: | - mkdir -p artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "cd /horovod/test && pytest -v --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.integration.xml integration/test_interactiverun.py" + mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py shell: bash - - name: "Single Keras MNIST" - if: always() && steps.build.outcome == 'success' && matrix.Single_Keras_MNIST + - name: "Gloo TensorFlow 2.0 Keras MNIST [attempt 2 of 3]" + id: Gloo_TensorFlow_2_0_Keras_MNIST_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_Keras_MNIST && steps.Gloo_TensorFlow_2_0_Keras_MNIST_1.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Single_Keras_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_Keras_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c " python /horovod/examples/keras/keras_mnist_advanced.py --epochs 3 --batch-size 64" + mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py shell: bash - - name: "Single MXNet2 MNIST" - if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet2_MNIST + - name: "Gloo TensorFlow 2.0 Keras MNIST [attempt 3 of 3]" + id: Gloo_TensorFlow_2_0_Keras_MNIST_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_Keras_MNIST && steps.Gloo_TensorFlow_2_0_Keras_MNIST_2.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Single_MXNet2_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet2_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --epochs 3" + mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py shell: bash - - name: "Single MXNet MNIST" - if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST + - name: "Gloo TensorFlow 2.0 MNIST [attempt 1 of 3]" + id: Gloo_TensorFlow_2_0_MNIST_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST && true run: | - mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c " python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" + mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist.py shell: bash - - name: "Single MXNet MNIST [ONECCL MPI]" - if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_MPI + - name: "Gloo TensorFlow 2.0 MNIST [attempt 2 of 3]" + id: Gloo_TensorFlow_2_0_MNIST_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST && steps.Gloo_TensorFlow_2_0_MNIST_1.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" + mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist.py shell: bash - - name: "Single MXNet MNIST [ONECCL OFI]" - if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_OFI + - name: "Gloo TensorFlow 2.0 MNIST [attempt 3 of 3]" + id: Gloo_TensorFlow_2_0_MNIST_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST && steps.Gloo_TensorFlow_2_0_MNIST_2.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" + mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist.py shell: bash - - name: "Single PyTorch MNIST" - if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST + - name: "Gloo TensorFlow MNIST [attempt 1 of 3]" + id: Gloo_TensorFlow_MNIST_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_MNIST && true run: | - mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" + mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow/tensorflow_mnist.py shell: bash - - name: "Single PyTorch MNIST [ONECCL MPI]" - if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_MPI + - name: "Gloo TensorFlow MNIST [attempt 2 of 3]" + id: Gloo_TensorFlow_MNIST_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_MNIST && steps.Gloo_TensorFlow_MNIST_1.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" + mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow/tensorflow_mnist.py shell: bash - - name: "Single PyTorch MNIST [ONECCL OFI]" - if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_OFI + - name: "Gloo TensorFlow MNIST [attempt 3 of 3]" + id: Gloo_TensorFlow_MNIST_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_MNIST && steps.Gloo_TensorFlow_MNIST_2.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" + mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow/tensorflow_mnist.py shell: bash - - name: "Spark Keras MNIST" - if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_MNIST + - name: "MPI Cluster PyTests [attempt 1 of 3]" + id: MPI_Cluster_PyTests_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests && true run: | - mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" + mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py" shell: bash - - name: "Spark Keras Rossmann Estimator" - if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Estimator + - name: "MPI Cluster PyTests [attempt 2 of 3]" + id: MPI_Cluster_PyTests_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests && steps.MPI_Cluster_PyTests_1.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_estimator.py --num-proc 2 --work-dir /work --data-dir file:///data --epochs 3 --sample-rate 0.01" + mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py" shell: bash - - name: "Spark Keras Rossmann Run" - if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Run + - name: "MPI Cluster PyTests [attempt 3 of 3]" + id: MPI_Cluster_PyTests_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests && steps.MPI_Cluster_PyTests_2.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_run.py --num-proc 2 --data-dir file:///data --epochs 3 --sample-rate 0.01" + mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py" shell: bash - - name: "Spark Lightning MNIST" - if: always() && steps.build.outcome == 'success' && matrix.Spark_Lightning_MNIST + - name: "MPI Cluster PyTests [ONECCL MPI] [attempt 1 of 3]" + id: MPI_Cluster_PyTests_ONECCL_MPI_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_MPI && true run: | - mkdir -p artifacts/${{ matrix.image }}/Spark_Lightning_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Lightning_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_lightning_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" + mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py" shell: bash - - name: "Spark PyTests" - if: always() && steps.build.outcome == 'success' && matrix.Spark_PyTests + - name: "MPI Cluster PyTests [ONECCL MPI] [attempt 2 of 3]" + id: MPI_Cluster_PyTests_ONECCL_MPI_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_MPI && steps.MPI_Cluster_PyTests_ONECCL_MPI_1.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Spark_PyTests - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_PyTests:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 20m 3 10 bash -c "cd /horovod/test/integration && (ls -1 test_spark*.py | xargs -n 1 /bin/bash /pytest_standalone.sh spark)" + mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py" shell: bash - - name: "Spark Torch MNIST" - if: always() && steps.build.outcome == 'success' && matrix.Spark_Torch_MNIST + - name: "MPI Cluster PyTests [ONECCL MPI] [attempt 3 of 3]" + id: MPI_Cluster_PyTests_ONECCL_MPI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_MPI && steps.MPI_Cluster_PyTests_ONECCL_MPI_2.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Spark_Torch_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Torch_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" + mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py" shell: bash - - name: Upload Test Results - uses: actions/upload-artifact@v2 - if: always() && contains(matrix.image, '-cpu-') - with: - name: Unit Test Results - ${{ matrix.image }} - path: artifacts/${{ matrix.image }}/**/*.xml + - name: "MPI Cluster PyTests [ONECCL OFI] [attempt 1 of 3]" + id: MPI_Cluster_PyTests_ONECCL_OFI_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_OFI && true + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py" + shell: bash - - name: Push test image - # We push test image to AWS ECR on push to Horovod master (not a fork) - if: > - github.event_name == 'push' && - github.ref == 'refs/heads/master' && - github.repository == 'horovod/horovod' && - steps.ecr.outcome == 'success' + - name: "MPI Cluster PyTests [ONECCL OFI] [attempt 2 of 3]" + id: MPI_Cluster_PyTests_ONECCL_OFI_2 continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_OFI && steps.MPI_Cluster_PyTests_ONECCL_OFI_1.outcome == 'failure' run: | - docker tag ${{ matrix.image }} ${{ steps.ecr.outputs.registry }}:horovod-${{ matrix.image }}-latest - docker push ${{ steps.ecr.outputs.registry }}:horovod-${{ matrix.image }}-latest + mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py" + shell: bash - build-gpu: - name: "Build GPU (${{ matrix.image }})" - needs: [init-workflow] - if: needs.init-workflow.outputs.run_builds_and_tests != 'false' - runs-on: ubuntu-latest + - name: "MPI Cluster PyTests [ONECCL OFI] [attempt 3 of 3]" + id: MPI_Cluster_PyTests_ONECCL_OFI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_OFI && steps.MPI_Cluster_PyTests_ONECCL_OFI_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py" + shell: bash - strategy: - max-parallel: 6 - fail-fast: false - matrix: - include: - - image: test-gpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_3_1-mxnet1_5_1_p0-pyspark3_1_2 - build_timeout: 40 + - name: "MPI MXNet MNIST [attempt 1 of 3]" + id: MPI_MXNet_MNIST_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST && true + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py" + shell: bash - - image: test-gpu-gloo-py3_8-tf2_3_2-keras2_3_1-torch1_7_1-mxnet1_6_0_p0-pyspark3_1_2 - build_timeout: 40 + - name: "MPI MXNet MNIST [attempt 2 of 3]" + id: MPI_MXNet_MNIST_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST && steps.MPI_MXNet_MNIST_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py" + shell: bash - - image: test-gpu-gloo-py3_8-tf2_3_2-keras2_3_1-torch1_7_1-mxnet1_7_0_p1-pyspark3_1_2 - build_timeout: 40 + - name: "MPI MXNet MNIST [attempt 3 of 3]" + id: MPI_MXNet_MNIST_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST && steps.MPI_MXNet_MNIST_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py" + shell: bash - - image: test-gpu-gloo-py3_8-tf2_4_1-keras2_4_3-torch1_8_1-mxnet1_8_0_p0-pyspark3_1_2 - build_timeout: 40 + - name: "MPI MXNet MNIST [ONECCL MPI] [attempt 1 of 3]" + id: MPI_MXNet_MNIST_ONECCL_MPI_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_ONECCL_MPI && true + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_ONECCL_MPI_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_ONECCL_MPI_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py" + shell: bash - - image: test-gpu-openmpi-gloo-py3_8-tf2_5_0-keras_none-torch1_9_0-mxnet1_8_0_p0-pyspark3_1_2 - build_timeout: 40 + - name: "MPI MXNet MNIST [ONECCL MPI] [attempt 2 of 3]" + id: MPI_MXNet_MNIST_ONECCL_MPI_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_ONECCL_MPI && steps.MPI_MXNet_MNIST_ONECCL_MPI_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_ONECCL_MPI_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_ONECCL_MPI_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py" + shell: bash - - image: test-mixed-openmpi-gloo-py3_8-tf2_5_0-keras_none-torch1_9_0-mxnet1_8_0_p0-pyspark3_1_2 - build_timeout: 40 + - name: "MPI MXNet MNIST [ONECCL MPI] [attempt 3 of 3]" + id: MPI_MXNet_MNIST_ONECCL_MPI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_ONECCL_MPI && steps.MPI_MXNet_MNIST_ONECCL_MPI_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_ONECCL_MPI_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_ONECCL_MPI_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py" + shell: bash - steps: - - name: Clean up disk space - # deleting these paths frees 38 GB disk space: - # sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc - # but this sometimes takes 3-4 minutes - # so we delete only some sub-paths which are known to be quick (10s) and 20 GB + - name: "MPI MXNet MNIST [ONECCL OFI] [attempt 1 of 3]" + id: MPI_MXNet_MNIST_ONECCL_OFI_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_ONECCL_OFI && true run: | - echo ::group::Disk space before clean up - df -h - echo ::endgroup:: + mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_ONECCL_OFI_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_ONECCL_OFI_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py" + shell: bash - for dir in /usr/share/dotnet/sdk/\*/nuGetPackagesArchive.lzma \ - /usr/share/dotnet/shared \ - /usr/local/lib/android/sdk/ndk \ - /usr/local/lib/android/sdk/build-tools \ - /opt/ghc - do - echo ::group::Deleting "$dir" - sudo du -hsc $dir | tail -n1 || true - sudo rm -rf $dir - echo ::endgroup:: - done + - name: "MPI MXNet MNIST [ONECCL OFI] [attempt 2 of 3]" + id: MPI_MXNet_MNIST_ONECCL_OFI_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_ONECCL_OFI && steps.MPI_MXNet_MNIST_ONECCL_OFI_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_ONECCL_OFI_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_ONECCL_OFI_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py" + shell: bash - echo ::group::Disk space after clean up - df -h - echo ::endgroup:: + - name: "MPI MXNet MNIST [ONECCL OFI] [attempt 3 of 3]" + id: MPI_MXNet_MNIST_ONECCL_OFI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_ONECCL_OFI && steps.MPI_MXNet_MNIST_ONECCL_OFI_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_ONECCL_OFI_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_ONECCL_OFI_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py" + shell: bash - - name: Checkout - uses: actions/checkout@v2 - with: - submodules: recursive + - name: "MPI Parallel PyTests [attempt 1 of 3]" + id: MPI_Parallel_PyTests_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests && true + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 5m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)" + shell: bash - - name: Setup Python - uses: actions/setup-python@v2 - with: - python-version: 3.8 + - name: "MPI Parallel PyTests [attempt 2 of 3]" + id: MPI_Parallel_PyTests_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests && steps.MPI_Parallel_PyTests_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 5m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)" + shell: bash - - name: Setup docker-compose - run: pip install docker-compose + - name: "MPI Parallel PyTests [attempt 3 of 3]" + id: MPI_Parallel_PyTests_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests && steps.MPI_Parallel_PyTests_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 5m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)" + shell: bash - - name: Configure AWS credentials - id: aws - uses: aws-actions/configure-aws-credentials@v1 - # AWS credentials are used to authenticate against AWS ECR to pull and push test images - # We can only authenticate when running on Horovod repo (not a fork) - if: github.repository == 'horovod/horovod' + - name: "MPI Parallel PyTests [ONECCL MPI] [attempt 1 of 3]" + id: MPI_Parallel_PyTests_ONECCL_MPI_1 continue-on-error: true - with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws-region: us-east-1 + if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_MPI && true + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 5m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)" + shell: bash - - name: Login to Amazon ECR - id: ecr - if: steps.aws.outcome == 'success' + - name: "MPI Parallel PyTests [ONECCL MPI] [attempt 2 of 3]" + id: MPI_Parallel_PyTests_ONECCL_MPI_2 continue-on-error: true - uses: aws-actions/amazon-ecr-login@v1 + if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_MPI && steps.MPI_Parallel_PyTests_ONECCL_MPI_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 5m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)" + shell: bash - - name: Add cache_from to docker-compose YAML - if: steps.ecr.outcome == 'success' + - name: "MPI Parallel PyTests [ONECCL MPI] [attempt 3 of 3]" + id: MPI_Parallel_PyTests_ONECCL_MPI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_MPI && steps.MPI_Parallel_PyTests_ONECCL_MPI_2.outcome == 'failure' run: | - cat > docker-compose.test.override.yml < /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)" shell: bash - - name: Pull latest test image - if: steps.ecr.outcome == 'success' + - name: "MPI Parallel PyTests [ONECCL OFI] [attempt 1 of 3]" + id: MPI_Parallel_PyTests_ONECCL_OFI_1 continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_OFI && true run: | - docker pull ${{ steps.ecr.outputs.registry }}/buildkite:horovod-${{ matrix.image }}-latest - env: - DOCKER_BUILDKIT: 1 + mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 5m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)" + shell: bash - - name: Build - id: build + - name: "MPI Parallel PyTests [ONECCL OFI] [attempt 2 of 3]" + id: MPI_Parallel_PyTests_ONECCL_OFI_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_OFI && steps.MPI_Parallel_PyTests_ONECCL_OFI_1.outcome == 'failure' run: | - override_yaml="" - if [ -e docker-compose.test.override.yml ]; then override_yaml="-f docker-compose.test.override.yml"; fi - .github/timeout-and-retry.sh ${{ matrix.build_timeout }}m 3 10 docker-compose -f docker-compose.test.yml $override_yaml build --pull ${{ matrix.image }} - env: - COMPOSE_DOCKER_CLI_BUILD: 1 - DOCKER_BUILDKIT: 1 + mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 5m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)" + shell: bash + - name: "MPI Parallel PyTests [ONECCL OFI] [attempt 3 of 3]" + id: MPI_Parallel_PyTests_ONECCL_OFI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_OFI && steps.MPI_Parallel_PyTests_ONECCL_OFI_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 5m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)" + shell: bash - - name: Upload Test Results - uses: actions/upload-artifact@v2 - if: always() && contains(matrix.image, '-cpu-') - with: - name: Unit Test Results - ${{ matrix.image }} - path: artifacts/${{ matrix.image }}/**/*.xml + - name: "MPI PyTorch MNIST [attempt 1 of 3]" + id: MPI_PyTorch_MNIST_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST && true + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" + shell: bash - - name: Push test image - # We push test image to AWS ECR on push to Horovod master (not a fork) - if: > - github.event_name == 'push' && - github.ref == 'refs/heads/master' && - github.repository == 'horovod/horovod' && - steps.ecr.outcome == 'success' + - name: "MPI PyTorch MNIST [attempt 2 of 3]" + id: MPI_PyTorch_MNIST_2 continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST && steps.MPI_PyTorch_MNIST_1.outcome == 'failure' run: | - docker tag ${{ matrix.image }} ${{ steps.ecr.outputs.registry }}:horovod-${{ matrix.image }}-latest - docker push ${{ steps.ecr.outputs.registry }}:horovod-${{ matrix.image }}-latest + mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" + shell: bash - build-and-test-heads: - name: "Build and Test heads (${{ matrix.image }})" - needs: [init-workflow, build-and-test-cpu, build-gpu] - if: needs.init-workflow.outputs.run_builds_and_tests != 'false' - runs-on: ubuntu-latest + - name: "MPI PyTorch MNIST [attempt 3 of 3]" + id: MPI_PyTorch_MNIST_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST && steps.MPI_PyTorch_MNIST_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" + shell: bash - strategy: - max-parallel: 2 - fail-fast: false - matrix: - include: - - image: test-cpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_1_2 - Elastic_Tests_1: true - Gloo_Cluster_PyTests: true - Gloo_MXNet2_MNIST: true - Gloo_Parallel_PyTests: true - Gloo_PyTorch_MNIST: true - Gloo_Single_PyTests: true - Gloo_TensorFlow_2_0_Keras_MNIST: true - Gloo_TensorFlow_2_0_MNIST: true - Single_MXNet2_MNIST: true - Single_PyTorch_MNIST: true - Spark_Lightning_MNIST: true - Spark_PyTests: true - Spark_Torch_MNIST: true - build_timeout: 30 + - name: "MPI PyTorch MNIST [ONECCL MPI] [attempt 1 of 3]" + id: MPI_PyTorch_MNIST_ONECCL_MPI_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_ONECCL_MPI && true + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_ONECCL_MPI_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_ONECCL_MPI_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" + shell: bash - - image: test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_1_2 - build_timeout: 40 + - name: "MPI PyTorch MNIST [ONECCL MPI] [attempt 2 of 3]" + id: MPI_PyTorch_MNIST_ONECCL_MPI_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_ONECCL_MPI && steps.MPI_PyTorch_MNIST_ONECCL_MPI_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_ONECCL_MPI_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_ONECCL_MPI_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" + shell: bash - steps: - - name: Clean up disk space - # deleting these paths frees 38 GB disk space: - # sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc - # but this sometimes takes 3-4 minutes - # so we delete only some sub-paths which are known to be quick (10s) and 20 GB + - name: "MPI PyTorch MNIST [ONECCL MPI] [attempt 3 of 3]" + id: MPI_PyTorch_MNIST_ONECCL_MPI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_ONECCL_MPI && steps.MPI_PyTorch_MNIST_ONECCL_MPI_2.outcome == 'failure' run: | - echo ::group::Disk space before clean up - df -h - echo ::endgroup:: + mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_ONECCL_MPI_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_ONECCL_MPI_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" + shell: bash - for dir in /usr/share/dotnet/sdk/\*/nuGetPackagesArchive.lzma \ - /usr/share/dotnet/shared \ - /usr/local/lib/android/sdk/ndk \ - /usr/local/lib/android/sdk/build-tools \ + - name: "MPI PyTorch MNIST [ONECCL OFI] [attempt 1 of 3]" + id: MPI_PyTorch_MNIST_ONECCL_OFI_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_ONECCL_OFI && true + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_ONECCL_OFI_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_ONECCL_OFI_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" + shell: bash + + - name: "MPI PyTorch MNIST [ONECCL OFI] [attempt 2 of 3]" + id: MPI_PyTorch_MNIST_ONECCL_OFI_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_ONECCL_OFI && steps.MPI_PyTorch_MNIST_ONECCL_OFI_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_ONECCL_OFI_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_ONECCL_OFI_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" + shell: bash + + - name: "MPI PyTorch MNIST [ONECCL OFI] [attempt 3 of 3]" + id: MPI_PyTorch_MNIST_ONECCL_OFI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_ONECCL_OFI && steps.MPI_PyTorch_MNIST_ONECCL_OFI_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_ONECCL_OFI_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_ONECCL_OFI_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" + shell: bash + + - name: "MPI Single PyTests [attempt 1 of 3]" + id: MPI_Single_PyTests_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests && true + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" + shell: bash + + - name: "MPI Single PyTests [attempt 2 of 3]" + id: MPI_Single_PyTests_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests && steps.MPI_Single_PyTests_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" + shell: bash + + - name: "MPI Single PyTests [attempt 3 of 3]" + id: MPI_Single_PyTests_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests && steps.MPI_Single_PyTests_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" + shell: bash + + - name: "MPI Single PyTests [ONECCL MPI] [attempt 1 of 3]" + id: MPI_Single_PyTests_ONECCL_MPI_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_MPI && true + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" + shell: bash + + - name: "MPI Single PyTests [ONECCL MPI] [attempt 2 of 3]" + id: MPI_Single_PyTests_ONECCL_MPI_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_MPI && steps.MPI_Single_PyTests_ONECCL_MPI_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" + shell: bash + + - name: "MPI Single PyTests [ONECCL MPI] [attempt 3 of 3]" + id: MPI_Single_PyTests_ONECCL_MPI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_MPI && steps.MPI_Single_PyTests_ONECCL_MPI_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" + shell: bash + + - name: "MPI Single PyTests [ONECCL OFI] [attempt 1 of 3]" + id: MPI_Single_PyTests_ONECCL_OFI_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_OFI && true + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" + shell: bash + + - name: "MPI Single PyTests [ONECCL OFI] [attempt 2 of 3]" + id: MPI_Single_PyTests_ONECCL_OFI_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_OFI && steps.MPI_Single_PyTests_ONECCL_OFI_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" + shell: bash + + - name: "MPI Single PyTests [ONECCL OFI] [attempt 3 of 3]" + id: MPI_Single_PyTests_ONECCL_OFI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_OFI && steps.MPI_Single_PyTests_ONECCL_OFI_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" + shell: bash + + - name: "MPI TensorFlow 2.0 Keras MNIST [attempt 1 of 3]" + id: MPI_TensorFlow_2_0_Keras_MNIST_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST && true + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" + shell: bash + + - name: "MPI TensorFlow 2.0 Keras MNIST [attempt 2 of 3]" + id: MPI_TensorFlow_2_0_Keras_MNIST_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST && steps.MPI_TensorFlow_2_0_Keras_MNIST_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" + shell: bash + + - name: "MPI TensorFlow 2.0 Keras MNIST [attempt 3 of 3]" + id: MPI_TensorFlow_2_0_Keras_MNIST_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST && steps.MPI_TensorFlow_2_0_Keras_MNIST_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" + shell: bash + + - name: "MPI TensorFlow 2.0 Keras MNIST [ONECCL MPI] [attempt 1 of 3]" + id: MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_MPI_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_MPI && true + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_MPI_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_MPI_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" + shell: bash + + - name: "MPI TensorFlow 2.0 Keras MNIST [ONECCL MPI] [attempt 2 of 3]" + id: MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_MPI_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_MPI && steps.MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_MPI_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_MPI_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_MPI_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" + shell: bash + + - name: "MPI TensorFlow 2.0 Keras MNIST [ONECCL MPI] [attempt 3 of 3]" + id: MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_MPI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_MPI && steps.MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_MPI_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_MPI_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_MPI_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" + shell: bash + + - name: "MPI TensorFlow 2.0 Keras MNIST [ONECCL OFI] [attempt 1 of 3]" + id: MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_OFI_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_OFI && true + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_OFI_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_OFI_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" + shell: bash + + - name: "MPI TensorFlow 2.0 Keras MNIST [ONECCL OFI] [attempt 2 of 3]" + id: MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_OFI_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_OFI && steps.MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_OFI_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_OFI_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_OFI_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" + shell: bash + + - name: "MPI TensorFlow 2.0 Keras MNIST [ONECCL OFI] [attempt 3 of 3]" + id: MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_OFI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_OFI && steps.MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_OFI_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_OFI_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_OFI_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" + shell: bash + + - name: "MPI TensorFlow 2.0 MNIST [attempt 1 of 3]" + id: MPI_TensorFlow_2_0_MNIST_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST && true + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" + shell: bash + + - name: "MPI TensorFlow 2.0 MNIST [attempt 2 of 3]" + id: MPI_TensorFlow_2_0_MNIST_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST && steps.MPI_TensorFlow_2_0_MNIST_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" + shell: bash + + - name: "MPI TensorFlow 2.0 MNIST [attempt 3 of 3]" + id: MPI_TensorFlow_2_0_MNIST_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST && steps.MPI_TensorFlow_2_0_MNIST_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" + shell: bash + + - name: "MPI TensorFlow 2.0 MNIST [ONECCL MPI] [attempt 1 of 3]" + id: MPI_TensorFlow_2_0_MNIST_ONECCL_MPI_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_ONECCL_MPI && true + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_ONECCL_MPI_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_ONECCL_MPI_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" + shell: bash + + - name: "MPI TensorFlow 2.0 MNIST [ONECCL MPI] [attempt 2 of 3]" + id: MPI_TensorFlow_2_0_MNIST_ONECCL_MPI_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_ONECCL_MPI && steps.MPI_TensorFlow_2_0_MNIST_ONECCL_MPI_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_ONECCL_MPI_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_ONECCL_MPI_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" + shell: bash + + - name: "MPI TensorFlow 2.0 MNIST [ONECCL MPI] [attempt 3 of 3]" + id: MPI_TensorFlow_2_0_MNIST_ONECCL_MPI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_ONECCL_MPI && steps.MPI_TensorFlow_2_0_MNIST_ONECCL_MPI_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_ONECCL_MPI_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_ONECCL_MPI_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" + shell: bash + + - name: "MPI TensorFlow 2.0 MNIST [ONECCL OFI] [attempt 1 of 3]" + id: MPI_TensorFlow_2_0_MNIST_ONECCL_OFI_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_ONECCL_OFI && true + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_ONECCL_OFI_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_ONECCL_OFI_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" + shell: bash + + - name: "MPI TensorFlow 2.0 MNIST [ONECCL OFI] [attempt 2 of 3]" + id: MPI_TensorFlow_2_0_MNIST_ONECCL_OFI_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_ONECCL_OFI && steps.MPI_TensorFlow_2_0_MNIST_ONECCL_OFI_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_ONECCL_OFI_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_ONECCL_OFI_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" + shell: bash + + - name: "MPI TensorFlow 2.0 MNIST [ONECCL OFI] [attempt 3 of 3]" + id: MPI_TensorFlow_2_0_MNIST_ONECCL_OFI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_ONECCL_OFI && steps.MPI_TensorFlow_2_0_MNIST_ONECCL_OFI_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_ONECCL_OFI_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_ONECCL_OFI_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" + shell: bash + + - name: "Run PyTests test_interactiverun [attempt 1 of 3]" + id: Run_PyTests_test_interactiverun_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Run_PyTests_test_interactiverun && true + run: | + mkdir -p artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/test && pytest -v --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.integration.xml integration/test_interactiverun.py" + shell: bash + + - name: "Run PyTests test_interactiverun [attempt 2 of 3]" + id: Run_PyTests_test_interactiverun_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Run_PyTests_test_interactiverun && steps.Run_PyTests_test_interactiverun_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/test && pytest -v --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.integration.xml integration/test_interactiverun.py" + shell: bash + + - name: "Run PyTests test_interactiverun [attempt 3 of 3]" + id: Run_PyTests_test_interactiverun_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Run_PyTests_test_interactiverun && steps.Run_PyTests_test_interactiverun_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/test && pytest -v --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.integration.xml integration/test_interactiverun.py" + shell: bash + + - name: "Single Keras MNIST [attempt 1 of 3]" + id: Single_Keras_MNIST_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Single_Keras_MNIST && true + run: | + mkdir -p artifacts/${{ matrix.image }}/Single_Keras_MNIST_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_Keras_MNIST_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/keras/keras_mnist_advanced.py --epochs 3 --batch-size 64" + shell: bash + + - name: "Single Keras MNIST [attempt 2 of 3]" + id: Single_Keras_MNIST_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Single_Keras_MNIST && steps.Single_Keras_MNIST_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/Single_Keras_MNIST_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_Keras_MNIST_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/keras/keras_mnist_advanced.py --epochs 3 --batch-size 64" + shell: bash + + - name: "Single Keras MNIST [attempt 3 of 3]" + id: Single_Keras_MNIST_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Single_Keras_MNIST && steps.Single_Keras_MNIST_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/Single_Keras_MNIST_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_Keras_MNIST_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/keras/keras_mnist_advanced.py --epochs 3 --batch-size 64" + shell: bash + + - name: "Single MXNet2 MNIST [attempt 1 of 3]" + id: Single_MXNet2_MNIST_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet2_MNIST && true + run: | + mkdir -p artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --epochs 3" + shell: bash + + - name: "Single MXNet2 MNIST [attempt 2 of 3]" + id: Single_MXNet2_MNIST_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet2_MNIST && steps.Single_MXNet2_MNIST_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --epochs 3" + shell: bash + + - name: "Single MXNet2 MNIST [attempt 3 of 3]" + id: Single_MXNet2_MNIST_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet2_MNIST && steps.Single_MXNet2_MNIST_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --epochs 3" + shell: bash + + - name: "Single MXNet MNIST [attempt 1 of 3]" + id: Single_MXNet_MNIST_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST && true + run: | + mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" + shell: bash + + - name: "Single MXNet MNIST [attempt 2 of 3]" + id: Single_MXNet_MNIST_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST && steps.Single_MXNet_MNIST_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" + shell: bash + + - name: "Single MXNet MNIST [attempt 3 of 3]" + id: Single_MXNet_MNIST_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST && steps.Single_MXNet_MNIST_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" + shell: bash + + - name: "Single MXNet MNIST [ONECCL MPI] [attempt 1 of 3]" + id: Single_MXNet_MNIST_ONECCL_MPI_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_MPI && true + run: | + mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" + shell: bash + + - name: "Single MXNet MNIST [ONECCL MPI] [attempt 2 of 3]" + id: Single_MXNet_MNIST_ONECCL_MPI_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_MPI && steps.Single_MXNet_MNIST_ONECCL_MPI_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" + shell: bash + + - name: "Single MXNet MNIST [ONECCL MPI] [attempt 3 of 3]" + id: Single_MXNet_MNIST_ONECCL_MPI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_MPI && steps.Single_MXNet_MNIST_ONECCL_MPI_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" + shell: bash + + - name: "Single MXNet MNIST [ONECCL OFI] [attempt 1 of 3]" + id: Single_MXNet_MNIST_ONECCL_OFI_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_OFI && true + run: | + mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" + shell: bash + + - name: "Single MXNet MNIST [ONECCL OFI] [attempt 2 of 3]" + id: Single_MXNet_MNIST_ONECCL_OFI_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_OFI && steps.Single_MXNet_MNIST_ONECCL_OFI_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" + shell: bash + + - name: "Single MXNet MNIST [ONECCL OFI] [attempt 3 of 3]" + id: Single_MXNet_MNIST_ONECCL_OFI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_OFI && steps.Single_MXNet_MNIST_ONECCL_OFI_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" + shell: bash + + - name: "Single PyTorch MNIST [attempt 1 of 3]" + id: Single_PyTorch_MNIST_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST && true + run: | + mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" + shell: bash + + - name: "Single PyTorch MNIST [attempt 2 of 3]" + id: Single_PyTorch_MNIST_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST && steps.Single_PyTorch_MNIST_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" + shell: bash + + - name: "Single PyTorch MNIST [attempt 3 of 3]" + id: Single_PyTorch_MNIST_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST && steps.Single_PyTorch_MNIST_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" + shell: bash + + - name: "Single PyTorch MNIST [ONECCL MPI] [attempt 1 of 3]" + id: Single_PyTorch_MNIST_ONECCL_MPI_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_MPI && true + run: | + mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" + shell: bash + + - name: "Single PyTorch MNIST [ONECCL MPI] [attempt 2 of 3]" + id: Single_PyTorch_MNIST_ONECCL_MPI_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_MPI && steps.Single_PyTorch_MNIST_ONECCL_MPI_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" + shell: bash + + - name: "Single PyTorch MNIST [ONECCL MPI] [attempt 3 of 3]" + id: Single_PyTorch_MNIST_ONECCL_MPI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_MPI && steps.Single_PyTorch_MNIST_ONECCL_MPI_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" + shell: bash + + - name: "Single PyTorch MNIST [ONECCL OFI] [attempt 1 of 3]" + id: Single_PyTorch_MNIST_ONECCL_OFI_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_OFI && true + run: | + mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" + shell: bash + + - name: "Single PyTorch MNIST [ONECCL OFI] [attempt 2 of 3]" + id: Single_PyTorch_MNIST_ONECCL_OFI_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_OFI && steps.Single_PyTorch_MNIST_ONECCL_OFI_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" + shell: bash + + - name: "Single PyTorch MNIST [ONECCL OFI] [attempt 3 of 3]" + id: Single_PyTorch_MNIST_ONECCL_OFI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_OFI && steps.Single_PyTorch_MNIST_ONECCL_OFI_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" + shell: bash + + - name: "Spark Keras MNIST [attempt 1 of 3]" + id: Spark_Keras_MNIST_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_MNIST && true + run: | + mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_MNIST_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_MNIST_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" + shell: bash + + - name: "Spark Keras MNIST [attempt 2 of 3]" + id: Spark_Keras_MNIST_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_MNIST && steps.Spark_Keras_MNIST_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_MNIST_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_MNIST_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" + shell: bash + + - name: "Spark Keras MNIST [attempt 3 of 3]" + id: Spark_Keras_MNIST_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_MNIST && steps.Spark_Keras_MNIST_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_MNIST_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_MNIST_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" + shell: bash + + - name: "Spark Keras Rossmann Estimator [attempt 1 of 3]" + id: Spark_Keras_Rossmann_Estimator_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Estimator && true + run: | + mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_estimator.py --num-proc 2 --work-dir /work --data-dir file:///data --epochs 3 --sample-rate 0.01" + shell: bash + + - name: "Spark Keras Rossmann Estimator [attempt 2 of 3]" + id: Spark_Keras_Rossmann_Estimator_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Estimator && steps.Spark_Keras_Rossmann_Estimator_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_estimator.py --num-proc 2 --work-dir /work --data-dir file:///data --epochs 3 --sample-rate 0.01" + shell: bash + + - name: "Spark Keras Rossmann Estimator [attempt 3 of 3]" + id: Spark_Keras_Rossmann_Estimator_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Estimator && steps.Spark_Keras_Rossmann_Estimator_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_estimator.py --num-proc 2 --work-dir /work --data-dir file:///data --epochs 3 --sample-rate 0.01" + shell: bash + + - name: "Spark Keras Rossmann Run [attempt 1 of 3]" + id: Spark_Keras_Rossmann_Run_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Run && true + run: | + mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_run.py --num-proc 2 --data-dir file:///data --epochs 3 --sample-rate 0.01" + shell: bash + + - name: "Spark Keras Rossmann Run [attempt 2 of 3]" + id: Spark_Keras_Rossmann_Run_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Run && steps.Spark_Keras_Rossmann_Run_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_run.py --num-proc 2 --data-dir file:///data --epochs 3 --sample-rate 0.01" + shell: bash + + - name: "Spark Keras Rossmann Run [attempt 3 of 3]" + id: Spark_Keras_Rossmann_Run_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Run && steps.Spark_Keras_Rossmann_Run_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_run.py --num-proc 2 --data-dir file:///data --epochs 3 --sample-rate 0.01" + shell: bash + + - name: "Spark Lightning MNIST [attempt 1 of 3]" + id: Spark_Lightning_MNIST_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Spark_Lightning_MNIST && true + run: | + mkdir -p artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_lightning_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" + shell: bash + + - name: "Spark Lightning MNIST [attempt 2 of 3]" + id: Spark_Lightning_MNIST_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Spark_Lightning_MNIST && steps.Spark_Lightning_MNIST_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_lightning_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" + shell: bash + + - name: "Spark Lightning MNIST [attempt 3 of 3]" + id: Spark_Lightning_MNIST_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Spark_Lightning_MNIST && steps.Spark_Lightning_MNIST_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_lightning_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" + shell: bash + + - name: "Spark PyTests [attempt 1 of 3]" + id: Spark_PyTests_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Spark_PyTests && true + run: | + mkdir -p artifacts/${{ matrix.image }}/Spark_PyTests_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_PyTests_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 20m bash -c "cd /horovod/test/integration && (ls -1 test_spark*.py | xargs -n 1 /bin/bash /pytest_standalone.sh spark)" + shell: bash + + - name: "Spark PyTests [attempt 2 of 3]" + id: Spark_PyTests_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Spark_PyTests && steps.Spark_PyTests_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/Spark_PyTests_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_PyTests_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 20m bash -c "cd /horovod/test/integration && (ls -1 test_spark*.py | xargs -n 1 /bin/bash /pytest_standalone.sh spark)" + shell: bash + + - name: "Spark PyTests [attempt 3 of 3]" + id: Spark_PyTests_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Spark_PyTests && steps.Spark_PyTests_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/Spark_PyTests_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_PyTests_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 20m bash -c "cd /horovod/test/integration && (ls -1 test_spark*.py | xargs -n 1 /bin/bash /pytest_standalone.sh spark)" + shell: bash + + - name: "Spark Torch MNIST [attempt 1 of 3]" + id: Spark_Torch_MNIST_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Spark_Torch_MNIST && true + run: | + mkdir -p artifacts/${{ matrix.image }}/Spark_Torch_MNIST_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Torch_MNIST_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" + shell: bash + + - name: "Spark Torch MNIST [attempt 2 of 3]" + id: Spark_Torch_MNIST_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Spark_Torch_MNIST && steps.Spark_Torch_MNIST_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/Spark_Torch_MNIST_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Torch_MNIST_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" + shell: bash + + - name: "Spark Torch MNIST [attempt 3 of 3]" + id: Spark_Torch_MNIST_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Spark_Torch_MNIST && steps.Spark_Torch_MNIST_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/Spark_Torch_MNIST_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Torch_MNIST_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" + shell: bash + + - name: Upload Test Results + uses: actions/upload-artifact@v2 + if: always() && contains(matrix.image, '-cpu-') + with: + name: Unit Test Results - ${{ matrix.image }} + path: artifacts/${{ matrix.image }}/**/*.xml + + - name: Push test image + # We push test image to AWS ECR on push to Horovod master (not a fork) + if: > + github.event_name == 'push' && + github.ref == 'refs/heads/master' && + github.repository == 'horovod/horovod' && + steps.ecr.outcome == 'success' + continue-on-error: true + run: | + docker tag ${{ matrix.image }} ${{ steps.ecr.outputs.registry }}:horovod-${{ matrix.image }}-latest + docker push ${{ steps.ecr.outputs.registry }}:horovod-${{ matrix.image }}-latest + + build-gpu: + name: "Build GPU (${{ matrix.image }})" + needs: [init-workflow] + if: needs.init-workflow.outputs.run_builds_and_tests != 'false' + runs-on: ubuntu-latest + + strategy: + max-parallel: 6 + fail-fast: false + matrix: + include: + - image: test-gpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_3_1-mxnet1_5_1_p0-pyspark3_1_2 + build_timeout: 40 + + - image: test-gpu-gloo-py3_8-tf2_3_2-keras2_3_1-torch1_7_1-mxnet1_6_0_p0-pyspark3_1_2 + build_timeout: 40 + + - image: test-gpu-gloo-py3_8-tf2_3_2-keras2_3_1-torch1_7_1-mxnet1_7_0_p1-pyspark3_1_2 + build_timeout: 40 + + - image: test-gpu-gloo-py3_8-tf2_4_1-keras2_4_3-torch1_8_1-mxnet1_8_0_p0-pyspark3_1_2 + build_timeout: 40 + + - image: test-gpu-openmpi-gloo-py3_8-tf2_5_0-keras_none-torch1_9_0-mxnet1_8_0_p0-pyspark3_1_2 + build_timeout: 40 + + - image: test-mixed-openmpi-gloo-py3_8-tf2_5_0-keras_none-torch1_9_0-mxnet1_8_0_p0-pyspark3_1_2 + build_timeout: 40 + + steps: + - name: Clean up disk space + # deleting these paths frees 38 GB disk space: + # sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc + # but this sometimes takes 3-4 minutes + # so we delete only some sub-paths which are known to be quick (10s) and 20 GB + run: | + echo ::group::Disk space before clean up + df -h + echo ::endgroup:: + + for dir in /usr/share/dotnet/sdk/\*/nuGetPackagesArchive.lzma \ + /usr/share/dotnet/shared \ + /usr/local/lib/android/sdk/ndk \ + /usr/local/lib/android/sdk/build-tools \ + /opt/ghc + do + echo ::group::Deleting "$dir" + sudo du -hsc $dir | tail -n1 || true + sudo rm -rf $dir + echo ::endgroup:: + done + + echo ::group::Disk space after clean up + df -h + echo ::endgroup:: + + - name: Checkout + uses: actions/checkout@v2 + with: + submodules: recursive + + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: 3.8 + + - name: Setup docker-compose + run: pip install docker-compose + + - name: Configure AWS credentials + id: aws + uses: aws-actions/configure-aws-credentials@v1 + # AWS credentials are used to authenticate against AWS ECR to pull and push test images + # We can only authenticate when running on Horovod repo (not a fork) + if: github.repository == 'horovod/horovod' + continue-on-error: true + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-east-1 + + - name: Login to Amazon ECR + id: ecr + if: steps.aws.outcome == 'success' + continue-on-error: true + uses: aws-actions/amazon-ecr-login@v1 + + - name: Add cache_from to docker-compose YAML + if: steps.ecr.outcome == 'success' + run: | + cat > docker-compose.test.override.yml < + github.event_name == 'push' && + github.ref == 'refs/heads/master' && + github.repository == 'horovod/horovod' && + steps.ecr.outcome == 'success' + continue-on-error: true + run: | + docker tag ${{ matrix.image }} ${{ steps.ecr.outputs.registry }}:horovod-${{ matrix.image }}-latest + docker push ${{ steps.ecr.outputs.registry }}:horovod-${{ matrix.image }}-latest + + build-and-test-heads: + name: "Build and Test heads (${{ matrix.image }})" + needs: [init-workflow, build-and-test-cpu, build-gpu] + if: needs.init-workflow.outputs.run_builds_and_tests != 'false' + runs-on: ubuntu-latest + + strategy: + max-parallel: 2 + fail-fast: false + matrix: + include: + - image: test-cpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_1_2 + Elastic_Tests_1: true + Gloo_Cluster_PyTests: true + Gloo_MXNet2_MNIST: true + Gloo_Parallel_PyTests: true + Gloo_PyTorch_MNIST: true + Gloo_Single_PyTests: true + Gloo_TensorFlow_2_0_Keras_MNIST: true + Gloo_TensorFlow_2_0_MNIST: true + Single_MXNet2_MNIST: true + Single_PyTorch_MNIST: true + Spark_Lightning_MNIST: true + Spark_PyTests: true + Spark_Torch_MNIST: true + build_timeout: 30 + + - image: test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_1_2 + build_timeout: 40 + + steps: + - name: Clean up disk space + # deleting these paths frees 38 GB disk space: + # sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc + # but this sometimes takes 3-4 minutes + # so we delete only some sub-paths which are known to be quick (10s) and 20 GB + run: | + echo ::group::Disk space before clean up + df -h + echo ::endgroup:: + + for dir in /usr/share/dotnet/sdk/\*/nuGetPackagesArchive.lzma \ + /usr/share/dotnet/shared \ + /usr/local/lib/android/sdk/ndk \ + /usr/local/lib/android/sdk/build-tools \ /opt/ghc do echo ::group::Deleting "$dir" @@ -878,428 +1898,1448 @@ jobs: echo ::endgroup:: done - echo ::group::Disk space after clean up - df -h - echo ::endgroup:: + echo ::group::Disk space after clean up + df -h + echo ::endgroup:: + + - name: Checkout + uses: actions/checkout@v2 + with: + submodules: recursive + + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: 3.8 + + - name: Setup docker-compose + run: pip install docker-compose + + - name: Configure AWS credentials + id: aws + uses: aws-actions/configure-aws-credentials@v1 + # AWS credentials are used to authenticate against AWS ECR to pull and push test images + # We can only authenticate when running on Horovod repo (not a fork) + if: github.repository == 'horovod/horovod' + continue-on-error: true + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-east-1 + + - name: Login to Amazon ECR + id: ecr + if: steps.aws.outcome == 'success' + continue-on-error: true + uses: aws-actions/amazon-ecr-login@v1 + + - name: Add cache_from to docker-compose YAML + if: steps.ecr.outcome == 'success' + run: | + cat > docker-compose.test.override.yml < /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py" + shell: bash + + - name: "MPI Cluster PyTests [ONECCL MPI] [attempt 2 of 3]" + id: MPI_Cluster_PyTests_ONECCL_MPI_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_MPI && steps.MPI_Cluster_PyTests_ONECCL_MPI_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py" + shell: bash + + - name: "MPI Cluster PyTests [ONECCL MPI] [attempt 3 of 3]" + id: MPI_Cluster_PyTests_ONECCL_MPI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_MPI && steps.MPI_Cluster_PyTests_ONECCL_MPI_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py" + shell: bash + + - name: "MPI Cluster PyTests [ONECCL OFI] [attempt 1 of 3]" + id: MPI_Cluster_PyTests_ONECCL_OFI_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_OFI && true + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py" + shell: bash - - name: Checkout - uses: actions/checkout@v2 - with: - submodules: recursive + - name: "MPI Cluster PyTests [ONECCL OFI] [attempt 2 of 3]" + id: MPI_Cluster_PyTests_ONECCL_OFI_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_OFI && steps.MPI_Cluster_PyTests_ONECCL_OFI_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py" + shell: bash - - name: Setup Python - uses: actions/setup-python@v2 - with: - python-version: 3.8 + - name: "MPI Cluster PyTests [ONECCL OFI] [attempt 3 of 3]" + id: MPI_Cluster_PyTests_ONECCL_OFI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_OFI && steps.MPI_Cluster_PyTests_ONECCL_OFI_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py" + shell: bash - - name: Setup docker-compose - run: pip install docker-compose + - name: "MPI MXNet MNIST [attempt 1 of 3]" + id: MPI_MXNet_MNIST_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST && true + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py" + shell: bash - - name: Configure AWS credentials - id: aws - uses: aws-actions/configure-aws-credentials@v1 - # AWS credentials are used to authenticate against AWS ECR to pull and push test images - # We can only authenticate when running on Horovod repo (not a fork) - if: github.repository == 'horovod/horovod' + - name: "MPI MXNet MNIST [attempt 2 of 3]" + id: MPI_MXNet_MNIST_2 continue-on-error: true - with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws-region: us-east-1 + if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST && steps.MPI_MXNet_MNIST_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py" + shell: bash - - name: Login to Amazon ECR - id: ecr - if: steps.aws.outcome == 'success' + - name: "MPI MXNet MNIST [attempt 3 of 3]" + id: MPI_MXNet_MNIST_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST && steps.MPI_MXNet_MNIST_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py" + shell: bash + + - name: "MPI MXNet MNIST [ONECCL MPI] [attempt 1 of 3]" + id: MPI_MXNet_MNIST_ONECCL_MPI_1 continue-on-error: true - uses: aws-actions/amazon-ecr-login@v1 + if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_ONECCL_MPI && true + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_ONECCL_MPI_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_ONECCL_MPI_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py" + shell: bash - - name: Add cache_from to docker-compose YAML - if: steps.ecr.outcome == 'success' + - name: "MPI MXNet MNIST [ONECCL MPI] [attempt 2 of 3]" + id: MPI_MXNet_MNIST_ONECCL_MPI_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_ONECCL_MPI && steps.MPI_MXNet_MNIST_ONECCL_MPI_1.outcome == 'failure' run: | - cat > docker-compose.test.override.yml < /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py" shell: bash - - name: Pull latest test image - if: steps.ecr.outcome == 'success' + - name: "MPI MXNet MNIST [ONECCL MPI] [attempt 3 of 3]" + id: MPI_MXNet_MNIST_ONECCL_MPI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_ONECCL_MPI && steps.MPI_MXNet_MNIST_ONECCL_MPI_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_ONECCL_MPI_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_ONECCL_MPI_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py" + shell: bash + + - name: "MPI MXNet MNIST [ONECCL OFI] [attempt 1 of 3]" + id: MPI_MXNet_MNIST_ONECCL_OFI_1 continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_ONECCL_OFI && true run: | - docker pull ${{ steps.ecr.outputs.registry }}/buildkite:horovod-${{ matrix.image }}-latest - env: - DOCKER_BUILDKIT: 1 + mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_ONECCL_OFI_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_ONECCL_OFI_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py" + shell: bash - - name: Build - id: build + - name: "MPI MXNet MNIST [ONECCL OFI] [attempt 2 of 3]" + id: MPI_MXNet_MNIST_ONECCL_OFI_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_ONECCL_OFI && steps.MPI_MXNet_MNIST_ONECCL_OFI_1.outcome == 'failure' run: | - override_yaml="" - if [ -e docker-compose.test.override.yml ]; then override_yaml="-f docker-compose.test.override.yml"; fi - .github/timeout-and-retry.sh ${{ matrix.build_timeout }}m 3 10 docker-compose -f docker-compose.test.yml $override_yaml build --pull ${{ matrix.image }} - env: - COMPOSE_DOCKER_CLI_BUILD: 1 - DOCKER_BUILDKIT: 1 + mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_ONECCL_OFI_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_ONECCL_OFI_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py" + shell: bash + + - name: "MPI MXNet MNIST [ONECCL OFI] [attempt 3 of 3]" + id: MPI_MXNet_MNIST_ONECCL_OFI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_ONECCL_OFI && steps.MPI_MXNet_MNIST_ONECCL_OFI_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_ONECCL_OFI_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_ONECCL_OFI_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py" + shell: bash + + - name: "MPI Parallel PyTests [attempt 1 of 3]" + id: MPI_Parallel_PyTests_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests && true + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 5m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)" + shell: bash + + - name: "MPI Parallel PyTests [attempt 2 of 3]" + id: MPI_Parallel_PyTests_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests && steps.MPI_Parallel_PyTests_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 5m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)" + shell: bash + + - name: "MPI Parallel PyTests [attempt 3 of 3]" + id: MPI_Parallel_PyTests_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests && steps.MPI_Parallel_PyTests_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 5m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)" + shell: bash + + - name: "MPI Parallel PyTests [ONECCL MPI] [attempt 1 of 3]" + id: MPI_Parallel_PyTests_ONECCL_MPI_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_MPI && true + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 5m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)" + shell: bash + + - name: "MPI Parallel PyTests [ONECCL MPI] [attempt 2 of 3]" + id: MPI_Parallel_PyTests_ONECCL_MPI_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_MPI && steps.MPI_Parallel_PyTests_ONECCL_MPI_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 5m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)" + shell: bash + + - name: "MPI Parallel PyTests [ONECCL MPI] [attempt 3 of 3]" + id: MPI_Parallel_PyTests_ONECCL_MPI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_MPI && steps.MPI_Parallel_PyTests_ONECCL_MPI_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 5m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)" + shell: bash + + - name: "MPI Parallel PyTests [ONECCL OFI] [attempt 1 of 3]" + id: MPI_Parallel_PyTests_ONECCL_OFI_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_OFI && true + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 5m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)" + shell: bash + + - name: "MPI Parallel PyTests [ONECCL OFI] [attempt 2 of 3]" + id: MPI_Parallel_PyTests_ONECCL_OFI_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_OFI && steps.MPI_Parallel_PyTests_ONECCL_OFI_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 5m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)" + shell: bash + + - name: "MPI Parallel PyTests [ONECCL OFI] [attempt 3 of 3]" + id: MPI_Parallel_PyTests_ONECCL_OFI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_OFI && steps.MPI_Parallel_PyTests_ONECCL_OFI_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 5m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)" + shell: bash + + - name: "MPI PyTorch MNIST [attempt 1 of 3]" + id: MPI_PyTorch_MNIST_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST && true + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" + shell: bash + + - name: "MPI PyTorch MNIST [attempt 2 of 3]" + id: MPI_PyTorch_MNIST_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST && steps.MPI_PyTorch_MNIST_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" + shell: bash + + - name: "MPI PyTorch MNIST [attempt 3 of 3]" + id: MPI_PyTorch_MNIST_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST && steps.MPI_PyTorch_MNIST_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" + shell: bash + + - name: "MPI PyTorch MNIST [ONECCL MPI] [attempt 1 of 3]" + id: MPI_PyTorch_MNIST_ONECCL_MPI_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_ONECCL_MPI && true + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_ONECCL_MPI_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_ONECCL_MPI_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" + shell: bash + + - name: "MPI PyTorch MNIST [ONECCL MPI] [attempt 2 of 3]" + id: MPI_PyTorch_MNIST_ONECCL_MPI_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_ONECCL_MPI && steps.MPI_PyTorch_MNIST_ONECCL_MPI_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_ONECCL_MPI_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_ONECCL_MPI_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" + shell: bash + + - name: "MPI PyTorch MNIST [ONECCL MPI] [attempt 3 of 3]" + id: MPI_PyTorch_MNIST_ONECCL_MPI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_ONECCL_MPI && steps.MPI_PyTorch_MNIST_ONECCL_MPI_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_ONECCL_MPI_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_ONECCL_MPI_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" + shell: bash + + - name: "MPI PyTorch MNIST [ONECCL OFI] [attempt 1 of 3]" + id: MPI_PyTorch_MNIST_ONECCL_OFI_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_ONECCL_OFI && true + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_ONECCL_OFI_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_ONECCL_OFI_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" + shell: bash + + - name: "MPI PyTorch MNIST [ONECCL OFI] [attempt 2 of 3]" + id: MPI_PyTorch_MNIST_ONECCL_OFI_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_ONECCL_OFI && steps.MPI_PyTorch_MNIST_ONECCL_OFI_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_ONECCL_OFI_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_ONECCL_OFI_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" + shell: bash + + - name: "MPI PyTorch MNIST [ONECCL OFI] [attempt 3 of 3]" + id: MPI_PyTorch_MNIST_ONECCL_OFI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_ONECCL_OFI && steps.MPI_PyTorch_MNIST_ONECCL_OFI_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_ONECCL_OFI_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_ONECCL_OFI_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" + shell: bash + + - name: "MPI Single PyTests [attempt 1 of 3]" + id: MPI_Single_PyTests_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests && true + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" + shell: bash + + - name: "MPI Single PyTests [attempt 2 of 3]" + id: MPI_Single_PyTests_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests && steps.MPI_Single_PyTests_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" + shell: bash + + - name: "MPI Single PyTests [attempt 3 of 3]" + id: MPI_Single_PyTests_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests && steps.MPI_Single_PyTests_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" + shell: bash + + - name: "MPI Single PyTests [ONECCL MPI] [attempt 1 of 3]" + id: MPI_Single_PyTests_ONECCL_MPI_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_MPI && true + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" + shell: bash + + - name: "MPI Single PyTests [ONECCL MPI] [attempt 2 of 3]" + id: MPI_Single_PyTests_ONECCL_MPI_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_MPI && steps.MPI_Single_PyTests_ONECCL_MPI_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" + shell: bash + + - name: "MPI Single PyTests [ONECCL MPI] [attempt 3 of 3]" + id: MPI_Single_PyTests_ONECCL_MPI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_MPI && steps.MPI_Single_PyTests_ONECCL_MPI_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" + shell: bash + + - name: "MPI Single PyTests [ONECCL OFI] [attempt 1 of 3]" + id: MPI_Single_PyTests_ONECCL_OFI_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_OFI && true + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" + shell: bash + + - name: "MPI Single PyTests [ONECCL OFI] [attempt 2 of 3]" + id: MPI_Single_PyTests_ONECCL_OFI_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_OFI && steps.MPI_Single_PyTests_ONECCL_OFI_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" + shell: bash + + - name: "MPI Single PyTests [ONECCL OFI] [attempt 3 of 3]" + id: MPI_Single_PyTests_ONECCL_OFI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_OFI && steps.MPI_Single_PyTests_ONECCL_OFI_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" + shell: bash + + - name: "MPI TensorFlow 2.0 Keras MNIST [attempt 1 of 3]" + id: MPI_TensorFlow_2_0_Keras_MNIST_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST && true + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" + shell: bash + + - name: "MPI TensorFlow 2.0 Keras MNIST [attempt 2 of 3]" + id: MPI_TensorFlow_2_0_Keras_MNIST_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST && steps.MPI_TensorFlow_2_0_Keras_MNIST_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" + shell: bash + + - name: "MPI TensorFlow 2.0 Keras MNIST [attempt 3 of 3]" + id: MPI_TensorFlow_2_0_Keras_MNIST_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST && steps.MPI_TensorFlow_2_0_Keras_MNIST_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" + shell: bash + + - name: "MPI TensorFlow 2.0 Keras MNIST [ONECCL MPI] [attempt 1 of 3]" + id: MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_MPI_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_MPI && true + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_MPI_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_MPI_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" + shell: bash + + - name: "MPI TensorFlow 2.0 Keras MNIST [ONECCL MPI] [attempt 2 of 3]" + id: MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_MPI_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_MPI && steps.MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_MPI_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_MPI_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_MPI_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" + shell: bash + + - name: "MPI TensorFlow 2.0 Keras MNIST [ONECCL MPI] [attempt 3 of 3]" + id: MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_MPI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_MPI && steps.MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_MPI_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_MPI_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_MPI_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" + shell: bash + + - name: "MPI TensorFlow 2.0 Keras MNIST [ONECCL OFI] [attempt 1 of 3]" + id: MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_OFI_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_OFI && true + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_OFI_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_OFI_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" + shell: bash + + - name: "MPI TensorFlow 2.0 Keras MNIST [ONECCL OFI] [attempt 2 of 3]" + id: MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_OFI_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_OFI && steps.MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_OFI_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_OFI_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_OFI_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" + shell: bash - - name: "Elastic Spark TensorFlow Tests 1" - if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_1 + - name: "MPI TensorFlow 2.0 Keras MNIST [ONECCL OFI] [attempt 3 of 3]" + id: MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_OFI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_OFI && steps.MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_OFI_2.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1 - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 20m 3 10 bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow2.py" + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_OFI_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_OFI_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" shell: bash - - name: "Elastic Spark TensorFlow Tests 2" - if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_2 + - name: "MPI TensorFlow 2.0 MNIST [attempt 1 of 3]" + id: MPI_TensorFlow_2_0_MNIST_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST && true + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" + shell: bash + + - name: "MPI TensorFlow 2.0 MNIST [attempt 2 of 3]" + id: MPI_TensorFlow_2_0_MNIST_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST && steps.MPI_TensorFlow_2_0_MNIST_1.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" + shell: bash + + - name: "MPI TensorFlow 2.0 MNIST [attempt 3 of 3]" + id: MPI_TensorFlow_2_0_MNIST_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST && steps.MPI_TensorFlow_2_0_MNIST_2.outcome == 'failure' + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" + shell: bash + + - name: "MPI TensorFlow 2.0 MNIST [ONECCL MPI] [attempt 1 of 3]" + id: MPI_TensorFlow_2_0_MNIST_ONECCL_MPI_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_ONECCL_MPI && true + run: | + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_ONECCL_MPI_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_ONECCL_MPI_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" + shell: bash + + - name: "MPI TensorFlow 2.0 MNIST [ONECCL MPI] [attempt 2 of 3]" + id: MPI_TensorFlow_2_0_MNIST_ONECCL_MPI_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_ONECCL_MPI && steps.MPI_TensorFlow_2_0_MNIST_ONECCL_MPI_1.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2 - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 20m 3 10 bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow.py" + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_ONECCL_MPI_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_ONECCL_MPI_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" shell: bash - - name: "Elastic Spark Torch Tests" - if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_Torch_Tests + - name: "MPI TensorFlow 2.0 MNIST [ONECCL MPI] [attempt 3 of 3]" + id: MPI_TensorFlow_2_0_MNIST_ONECCL_MPI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_ONECCL_MPI && steps.MPI_TensorFlow_2_0_MNIST_ONECCL_MPI_2.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 20m 3 10 bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.torch.xml test_elastic_spark_torch.py" + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_ONECCL_MPI_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_ONECCL_MPI_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" shell: bash - - name: "Elastic Tests 1" - if: always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_1 + - name: "MPI TensorFlow 2.0 MNIST [ONECCL OFI] [attempt 1 of 3]" + id: MPI_TensorFlow_2_0_MNIST_ONECCL_OFI_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_ONECCL_OFI && true run: | - mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_1 - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_1:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow2.py" + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_ONECCL_OFI_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_ONECCL_OFI_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" shell: bash - - name: "Elastic Tests 2" - if: always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_2 + - name: "MPI TensorFlow 2.0 MNIST [ONECCL OFI] [attempt 2 of 3]" + id: MPI_TensorFlow_2_0_MNIST_ONECCL_OFI_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_ONECCL_OFI && steps.MPI_TensorFlow_2_0_MNIST_ONECCL_OFI_1.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_2 - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_2:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow.py test_elastic_tensorflow_keras.py" + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_ONECCL_OFI_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_ONECCL_OFI_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" shell: bash - - name: "Gloo Cluster PyTests" - if: always() && steps.build.outcome == 'success' && matrix.Gloo_Cluster_PyTests + - name: "MPI TensorFlow 2.0 MNIST [ONECCL OFI] [attempt 3 of 3]" + id: MPI_TensorFlow_2_0_MNIST_ONECCL_OFI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_ONECCL_OFI && steps.MPI_TensorFlow_2_0_MNIST_ONECCL_OFI_2.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.static.xml test_static_run.py" + mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_ONECCL_OFI_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_ONECCL_OFI_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" shell: bash - - name: "Gloo Keras MNIST" - if: always() && steps.build.outcome == 'success' && matrix.Gloo_Keras_MNIST + - name: "Run PyTests test_interactiverun [attempt 1 of 3]" + id: Run_PyTests_test_interactiverun_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Run_PyTests_test_interactiverun && true run: | - mkdir -p artifacts/${{ matrix.image }}/Gloo_Keras_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Keras_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/keras/keras_mnist_advanced.py + mkdir -p artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/test && pytest -v --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.integration.xml integration/test_interactiverun.py" shell: bash - - name: "Gloo MXNet2 MNIST" - if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet2_MNIST + - name: "Run PyTests test_interactiverun [attempt 2 of 3]" + id: Run_PyTests_test_interactiverun_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Run_PyTests_test_interactiverun && steps.Run_PyTests_test_interactiverun_1.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet2_mnist.py + mkdir -p artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/test && pytest -v --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.integration.xml integration/test_interactiverun.py" shell: bash - - name: "Gloo MXNet MNIST" - if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet_MNIST + - name: "Run PyTests test_interactiverun [attempt 3 of 3]" + id: Run_PyTests_test_interactiverun_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Run_PyTests_test_interactiverun && steps.Run_PyTests_test_interactiverun_2.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet_mnist.py + mkdir -p artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/test && pytest -v --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.integration.xml integration/test_interactiverun.py" shell: bash - - name: "Gloo Parallel PyTests" - if: always() && steps.build.outcome == 'success' && matrix.Gloo_Parallel_PyTests + - name: "Single Keras MNIST [attempt 1 of 3]" + id: Single_Keras_MNIST_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Single_Keras_MNIST && true run: | - mkdir -p artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 5m 3 10 bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 horovodrun -np 2 -H localhost:2 --gloo /bin/bash /pytest.sh gloo)" + mkdir -p artifacts/${{ matrix.image }}/Single_Keras_MNIST_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_Keras_MNIST_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/keras/keras_mnist_advanced.py --epochs 3 --batch-size 64" shell: bash - - name: "Gloo PyTorch MNIST" - if: always() && steps.build.outcome == 'success' && matrix.Gloo_PyTorch_MNIST + - name: "Single Keras MNIST [attempt 2 of 3]" + id: Single_Keras_MNIST_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Single_Keras_MNIST && steps.Single_Keras_MNIST_1.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets + mkdir -p artifacts/${{ matrix.image }}/Single_Keras_MNIST_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_Keras_MNIST_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/keras/keras_mnist_advanced.py --epochs 3 --batch-size 64" shell: bash - - name: "Gloo Single PyTests" - if: always() && steps.build.outcome == 'success' && matrix.Gloo_Single_PyTests + - name: "Single Keras MNIST [attempt 3 of 3]" + id: Single_Keras_MNIST_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Single_Keras_MNIST && steps.Single_Keras_MNIST_2.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Gloo_Single_PyTests - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Single_PyTests:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 15m 3 10 bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh gloo)" + mkdir -p artifacts/${{ matrix.image }}/Single_Keras_MNIST_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_Keras_MNIST_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/keras/keras_mnist_advanced.py --epochs 3 --batch-size 64" shell: bash - - name: "Gloo TensorFlow 2.0 Keras MNIST" - if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_Keras_MNIST + - name: "Single MXNet2 MNIST [attempt 1 of 3]" + id: Single_MXNet2_MNIST_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet2_MNIST && true run: | - mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py + mkdir -p artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --epochs 3" shell: bash - - name: "Gloo TensorFlow 2.0 MNIST" - if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST + - name: "Single MXNet2 MNIST [attempt 2 of 3]" + id: Single_MXNet2_MNIST_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet2_MNIST && steps.Single_MXNet2_MNIST_1.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist.py + mkdir -p artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --epochs 3" shell: bash - - name: "Gloo TensorFlow MNIST" - if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_MNIST + - name: "Single MXNet2 MNIST [attempt 3 of 3]" + id: Single_MXNet2_MNIST_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet2_MNIST && steps.Single_MXNet2_MNIST_2.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow/tensorflow_mnist.py + mkdir -p artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --epochs 3" shell: bash - - name: "MPI Cluster PyTests" - if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests + - name: "Single MXNet MNIST [attempt 1 of 3]" + id: Single_MXNet_MNIST_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST && true run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py" + mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" shell: bash - - name: "MPI Cluster PyTests [ONECCL MPI]" - if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_MPI + - name: "Single MXNet MNIST [attempt 2 of 3]" + id: Single_MXNet_MNIST_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST && steps.Single_MXNet_MNIST_1.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py" + mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" shell: bash - - name: "MPI Cluster PyTests [ONECCL OFI]" - if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_OFI + - name: "Single MXNet MNIST [attempt 3 of 3]" + id: Single_MXNet_MNIST_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST && steps.Single_MXNet_MNIST_2.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py" + mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" shell: bash - - name: "MPI MXNet MNIST" - if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST + - name: "Single MXNet MNIST [ONECCL MPI] [attempt 1 of 3]" + id: Single_MXNet_MNIST_ONECCL_MPI_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_MPI && true run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py" + mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" shell: bash - - name: "MPI MXNet MNIST [ONECCL MPI]" - if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_ONECCL_MPI + - name: "Single MXNet MNIST [ONECCL MPI] [attempt 2 of 3]" + id: Single_MXNet_MNIST_ONECCL_MPI_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_MPI && steps.Single_MXNet_MNIST_ONECCL_MPI_1.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_ONECCL_MPI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_ONECCL_MPI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py" + mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" shell: bash - - name: "MPI MXNet MNIST [ONECCL OFI]" - if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_ONECCL_OFI + - name: "Single MXNet MNIST [ONECCL MPI] [attempt 3 of 3]" + id: Single_MXNet_MNIST_ONECCL_MPI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_MPI && steps.Single_MXNet_MNIST_ONECCL_MPI_2.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_ONECCL_OFI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_ONECCL_OFI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py" + mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" shell: bash - - name: "MPI Parallel PyTests" - if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests + - name: "Single MXNet MNIST [ONECCL OFI] [attempt 1 of 3]" + id: Single_MXNet_MNIST_ONECCL_OFI_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_OFI && true run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 5m 3 10 bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)" + mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" shell: bash - - name: "MPI Parallel PyTests [ONECCL MPI]" - if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_MPI + - name: "Single MXNet MNIST [ONECCL OFI] [attempt 2 of 3]" + id: Single_MXNet_MNIST_ONECCL_OFI_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_OFI && steps.Single_MXNet_MNIST_ONECCL_OFI_1.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 5m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)" + mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" shell: bash - - name: "MPI Parallel PyTests [ONECCL OFI]" - if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_OFI + - name: "Single MXNet MNIST [ONECCL OFI] [attempt 3 of 3]" + id: Single_MXNet_MNIST_ONECCL_OFI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_OFI && steps.Single_MXNet_MNIST_ONECCL_OFI_2.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 5m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)" + mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" shell: bash - - name: "MPI PyTorch MNIST" - if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST + - name: "Single PyTorch MNIST [attempt 1 of 3]" + id: Single_PyTorch_MNIST_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST && true run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c " \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" + mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" shell: bash - - name: "MPI PyTorch MNIST [ONECCL MPI]" - if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_ONECCL_MPI + - name: "Single PyTorch MNIST [attempt 2 of 3]" + id: Single_PyTorch_MNIST_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST && steps.Single_PyTorch_MNIST_1.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_ONECCL_MPI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_ONECCL_MPI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" + mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" shell: bash - - name: "MPI PyTorch MNIST [ONECCL OFI]" - if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_ONECCL_OFI + - name: "Single PyTorch MNIST [attempt 3 of 3]" + id: Single_PyTorch_MNIST_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST && steps.Single_PyTorch_MNIST_2.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_ONECCL_OFI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_ONECCL_OFI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" + mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" shell: bash - - name: "MPI Single PyTests" - if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests + - name: "Single PyTorch MNIST [ONECCL MPI] [attempt 1 of 3]" + id: Single_PyTorch_MNIST_ONECCL_MPI_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_MPI && true run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" + mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" shell: bash - - name: "MPI Single PyTests [ONECCL MPI]" - if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_MPI + - name: "Single PyTorch MNIST [ONECCL MPI] [attempt 2 of 3]" + id: Single_PyTorch_MNIST_ONECCL_MPI_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_MPI && steps.Single_PyTorch_MNIST_ONECCL_MPI_1.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" + mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" shell: bash - - name: "MPI Single PyTests [ONECCL OFI]" - if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_OFI + - name: "Single PyTorch MNIST [ONECCL MPI] [attempt 3 of 3]" + id: Single_PyTorch_MNIST_ONECCL_MPI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_MPI && steps.Single_PyTorch_MNIST_ONECCL_MPI_2.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" + mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" shell: bash - - name: "MPI TensorFlow 2.0 Keras MNIST" - if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST + - name: "Single PyTorch MNIST [ONECCL OFI] [attempt 1 of 3]" + id: Single_PyTorch_MNIST_ONECCL_OFI_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_OFI && true run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" + mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" shell: bash - - name: "MPI TensorFlow 2.0 Keras MNIST [ONECCL MPI]" - if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_MPI + - name: "Single PyTorch MNIST [ONECCL OFI] [attempt 2 of 3]" + id: Single_PyTorch_MNIST_ONECCL_OFI_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_OFI && steps.Single_PyTorch_MNIST_ONECCL_OFI_1.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_MPI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_MPI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" + mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" shell: bash - - name: "MPI TensorFlow 2.0 Keras MNIST [ONECCL OFI]" - if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_OFI + - name: "Single PyTorch MNIST [ONECCL OFI] [attempt 3 of 3]" + id: Single_PyTorch_MNIST_ONECCL_OFI_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_OFI && steps.Single_PyTorch_MNIST_ONECCL_OFI_2.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_OFI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_ONECCL_OFI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" + mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" shell: bash - - name: "MPI TensorFlow 2.0 MNIST" - if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST + - name: "Spark Keras MNIST [attempt 1 of 3]" + id: Spark_Keras_MNIST_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_MNIST && true run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" + mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_MNIST_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_MNIST_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" shell: bash - - name: "MPI TensorFlow 2.0 MNIST [ONECCL MPI]" - if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_ONECCL_MPI + - name: "Spark Keras MNIST [attempt 2 of 3]" + id: Spark_Keras_MNIST_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_MNIST && steps.Spark_Keras_MNIST_1.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_ONECCL_MPI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_ONECCL_MPI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" + mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_MNIST_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_MNIST_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" shell: bash - - name: "MPI TensorFlow 2.0 MNIST [ONECCL OFI]" - if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_ONECCL_OFI + - name: "Spark Keras MNIST [attempt 3 of 3]" + id: Spark_Keras_MNIST_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_MNIST && steps.Spark_Keras_MNIST_2.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_ONECCL_OFI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_ONECCL_OFI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" + mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_MNIST_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_MNIST_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" shell: bash - - name: "Run PyTests test_interactiverun" - if: always() && steps.build.outcome == 'success' && matrix.Run_PyTests_test_interactiverun + - name: "Spark Keras Rossmann Estimator [attempt 1 of 3]" + id: Spark_Keras_Rossmann_Estimator_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Estimator && true run: | - mkdir -p artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "cd /horovod/test && pytest -v --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.integration.xml integration/test_interactiverun.py" + mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_estimator.py --num-proc 2 --work-dir /work --data-dir file:///data --epochs 3 --sample-rate 0.01" shell: bash - - name: "Single Keras MNIST" - if: always() && steps.build.outcome == 'success' && matrix.Single_Keras_MNIST + - name: "Spark Keras Rossmann Estimator [attempt 2 of 3]" + id: Spark_Keras_Rossmann_Estimator_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Estimator && steps.Spark_Keras_Rossmann_Estimator_1.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Single_Keras_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_Keras_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c " python /horovod/examples/keras/keras_mnist_advanced.py --epochs 3 --batch-size 64" + mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_estimator.py --num-proc 2 --work-dir /work --data-dir file:///data --epochs 3 --sample-rate 0.01" shell: bash - - name: "Single MXNet2 MNIST" - if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet2_MNIST + - name: "Spark Keras Rossmann Estimator [attempt 3 of 3]" + id: Spark_Keras_Rossmann_Estimator_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Estimator && steps.Spark_Keras_Rossmann_Estimator_2.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Single_MXNet2_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet2_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --epochs 3" + mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_estimator.py --num-proc 2 --work-dir /work --data-dir file:///data --epochs 3 --sample-rate 0.01" shell: bash - - name: "Single MXNet MNIST" - if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST + - name: "Spark Keras Rossmann Run [attempt 1 of 3]" + id: Spark_Keras_Rossmann_Run_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Run && true run: | - mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c " python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" + mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_run.py --num-proc 2 --data-dir file:///data --epochs 3 --sample-rate 0.01" shell: bash - - name: "Single MXNet MNIST [ONECCL MPI]" - if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_MPI + - name: "Spark Keras Rossmann Run [attempt 2 of 3]" + id: Spark_Keras_Rossmann_Run_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Run && steps.Spark_Keras_Rossmann_Run_1.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" + mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_run.py --num-proc 2 --data-dir file:///data --epochs 3 --sample-rate 0.01" shell: bash - - name: "Single MXNet MNIST [ONECCL OFI]" - if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_OFI + - name: "Spark Keras Rossmann Run [attempt 3 of 3]" + id: Spark_Keras_Rossmann_Run_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Run && steps.Spark_Keras_Rossmann_Run_2.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" + mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_run.py --num-proc 2 --data-dir file:///data --epochs 3 --sample-rate 0.01" shell: bash - - name: "Single PyTorch MNIST" - if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST + - name: "Spark Lightning MNIST [attempt 1 of 3]" + id: Spark_Lightning_MNIST_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Spark_Lightning_MNIST && true run: | - mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" + mkdir -p artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_lightning_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" shell: bash - - name: "Single PyTorch MNIST [ONECCL MPI]" - if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_MPI + - name: "Spark Lightning MNIST [attempt 2 of 3]" + id: Spark_Lightning_MNIST_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Spark_Lightning_MNIST && steps.Spark_Lightning_MNIST_1.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" + mkdir -p artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_lightning_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" shell: bash - - name: "Single PyTorch MNIST [ONECCL OFI]" - if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_OFI + - name: "Spark Lightning MNIST [attempt 3 of 3]" + id: Spark_Lightning_MNIST_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Spark_Lightning_MNIST && steps.Spark_Lightning_MNIST_2.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" + mkdir -p artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_lightning_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" shell: bash - - name: "Spark Keras MNIST" - if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_MNIST + - name: "Spark PyTests [attempt 1 of 3]" + id: Spark_PyTests_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Spark_PyTests && true run: | - mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" + mkdir -p artifacts/${{ matrix.image }}/Spark_PyTests_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_PyTests_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 20m bash -c "cd /horovod/test/integration && (ls -1 test_spark*.py | xargs -n 1 /bin/bash /pytest_standalone.sh spark)" shell: bash - - name: "Spark Keras Rossmann Estimator" - if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Estimator + - name: "Spark PyTests [attempt 2 of 3]" + id: Spark_PyTests_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Spark_PyTests && steps.Spark_PyTests_1.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_estimator.py --num-proc 2 --work-dir /work --data-dir file:///data --epochs 3 --sample-rate 0.01" + mkdir -p artifacts/${{ matrix.image }}/Spark_PyTests_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_PyTests_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 20m bash -c "cd /horovod/test/integration && (ls -1 test_spark*.py | xargs -n 1 /bin/bash /pytest_standalone.sh spark)" shell: bash - - name: "Spark Keras Rossmann Run" - if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Run + - name: "Spark PyTests [attempt 3 of 3]" + id: Spark_PyTests_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Spark_PyTests && steps.Spark_PyTests_2.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_run.py --num-proc 2 --data-dir file:///data --epochs 3 --sample-rate 0.01" + mkdir -p artifacts/${{ matrix.image }}/Spark_PyTests_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_PyTests_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 20m bash -c "cd /horovod/test/integration && (ls -1 test_spark*.py | xargs -n 1 /bin/bash /pytest_standalone.sh spark)" shell: bash - - name: "Spark Lightning MNIST" - if: always() && steps.build.outcome == 'success' && matrix.Spark_Lightning_MNIST + - name: "Spark Torch MNIST [attempt 1 of 3]" + id: Spark_Torch_MNIST_1 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Spark_Torch_MNIST && true run: | - mkdir -p artifacts/${{ matrix.image }}/Spark_Lightning_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Lightning_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_lightning_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" + mkdir -p artifacts/${{ matrix.image }}/Spark_Torch_MNIST_1 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Torch_MNIST_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" shell: bash - - name: "Spark PyTests" - if: always() && steps.build.outcome == 'success' && matrix.Spark_PyTests + - name: "Spark Torch MNIST [attempt 2 of 3]" + id: Spark_Torch_MNIST_2 + continue-on-error: true + if: always() && steps.build.outcome == 'success' && matrix.Spark_Torch_MNIST && steps.Spark_Torch_MNIST_1.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Spark_PyTests - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_PyTests:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 20m 3 10 bash -c "cd /horovod/test/integration && (ls -1 test_spark*.py | xargs -n 1 /bin/bash /pytest_standalone.sh spark)" + mkdir -p artifacts/${{ matrix.image }}/Spark_Torch_MNIST_2 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Torch_MNIST_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" shell: bash - - name: "Spark Torch MNIST" - if: always() && steps.build.outcome == 'success' && matrix.Spark_Torch_MNIST + - name: "Spark Torch MNIST [attempt 3 of 3]" + id: Spark_Torch_MNIST_3 + continue-on-error: false + if: always() && steps.build.outcome == 'success' && matrix.Spark_Torch_MNIST && steps.Spark_Torch_MNIST_2.outcome == 'failure' run: | - mkdir -p artifacts/${{ matrix.image }}/Spark_Torch_MNIST - docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Torch_MNIST:/artifacts" ${{ matrix.image }} /bin/bash /horovod/.github/timeout-and-retry.sh 10m 3 10 bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" + mkdir -p artifacts/${{ matrix.image }}/Spark_Torch_MNIST_3 + docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Torch_MNIST_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" shell: bash - name: Upload Test Results