diff --git a/.ci/torchbench/check-ssh.sh b/.ci/torchbench/check-ssh.sh new file mode 100644 index 000000000..0eedf4dad --- /dev/null +++ b/.ci/torchbench/check-ssh.sh @@ -0,0 +1,12 @@ +#!/bin/bash +set -eou pipefail + +echo "Holding runner for 2 hours until all ssh sessions have logged out" +for _ in $(seq 1440); do + # Break if no ssh session exists anymore + if [ "$(who)" = "" ]; then + break + fi + echo "." + sleep 5 +done diff --git a/.github/workflows/linux-test-a10g.yml b/.github/workflows/linux-test-a10g.yml new file mode 100644 index 000000000..b2bf924d1 --- /dev/null +++ b/.github/workflows/linux-test-a10g.yml @@ -0,0 +1,43 @@ +name: TorchBench PR Test on A10G +on: + workflow_dispatch: + pull_request: + +jobs: + linux-test-a10g: + # Don't run on forked repos + # Only run on PR labeled 'with-ssh' + if: github.repository_owner == 'pytorch' && contains(github.event.pull_request.labels.*.name, 'with-ssh') + runs-on: linux.g5.4xlarge.nvidia.gpu + timeout-minutes: 240 + environment: docker-s3-upload + env: + CONDA_ENV: "pr-test-cuda" + TEST_CONFIG: "cuda" + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + steps: + - name: Checkout TorchBench + uses: actions/checkout@v3 + - name: Setup SSH (Click me for login details) + uses: pytorch/test-infra/.github/actions/setup-ssh@main + with: + github-secret: ${{ secrets.TORCHBENCH_ACCESS_TOKEN }} + - name: Install Conda + run: | + bash ./.ci/torchbench/install-conda.sh + - name: Install TorchBench + run: | + bash ./.ci/torchbench/install.sh + - name: Wait for SSH session to end + if: always() + run: | + bash ./.ci/torchbench/check-ssh.sh + - name: Clean up Conda env + if: always() + run: | + . ${HOME}/miniconda3/etc/profile.d/conda.sh + conda remove -n "${CONDA_ENV}" --all + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true