From 996928f38e227f8ba3ab1ee4038ff5caa5b8d4a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 10 Nov 2022 01:16:32 +0100 Subject: [PATCH 1/2] Clean legacy TPU ci files --- .github/checkgroup.yml | 1 + .github/workflows/tpu-tests.yml | 7 ++-- .../tpu_test_cases.jsonnet | 0 dockers/tpu-tests/Dockerfile | 37 ------------------- dockers/tpu-tests/docker-entrypoint.sh | 8 ---- 5 files changed, 5 insertions(+), 48 deletions(-) rename dockers/{tpu-tests => base-xla}/tpu_test_cases.jsonnet (100%) delete mode 100644 dockers/tpu-tests/Dockerfile delete mode 100644 dockers/tpu-tests/docker-entrypoint.sh diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index b3b0ac8e8a7e4..23783181cf5dc 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -114,6 +114,7 @@ subprojects: - id: "pytorch-lightning: TPU workflow" paths: - ".github/workflows/tpu-tests.yml" + - "dockers/base-xla/*" checks: - "test-on-tpus" diff --git a/.github/workflows/tpu-tests.yml b/.github/workflows/tpu-tests.yml index a7ffe3e10afe0..2b15786fd5203 100644 --- a/.github/workflows/tpu-tests.yml +++ b/.github/workflows/tpu-tests.yml @@ -8,6 +8,7 @@ on: types: [opened, reopened, ready_for_review, synchronize] # added `ready_for_review` since draft is skipped paths: - ".github/workflows/tpu-tests.yml" + - "dockers/base-xla/*" - "requirements/pytorch/*" - "!requirements/pytorch/docs.txt" - "src/pytorch_lightning/**" @@ -62,9 +63,9 @@ jobs: PR_NUMBER: ${{ github.event.pull_request.number }} SHA: ${{ github.event.pull_request.head.sha }} run: | - python -c "fname = 'dockers/tpu-tests/tpu_test_cases.jsonnet' ; data = open(fname).read().replace('{PYTORCH_VERSION}', '$XLA_VER') + python -c "fname = 'dockers/base-xla/tpu_test_cases.jsonnet' ; data = open(fname).read().replace('{PYTORCH_VERSION}', '$XLA_VER') data = data.replace('{PYTHON_VERSION}', '$PYTHON_VER').replace('{PR_NUMBER}', '$PR_NUMBER').replace('{SHA}', '$SHA') ; open(fname, 'w').write(data)" - cat dockers/tpu-tests/tpu_test_cases.jsonnet + cat dockers/base-xla/tpu_test_cases.jsonnet shell: bash - uses: google-github-actions/auth@v0 @@ -80,7 +81,7 @@ jobs: - name: Deploy cluster run: | export PATH=$PATH:$HOME/go/bin - job_name=$(jsonnet -J ml-testing-accelerators/ dockers/tpu-tests/tpu_test_cases.jsonnet | kubectl create -f -) + job_name=$(jsonnet -J ml-testing-accelerators/ dockers/base-xla/tpu_test_cases.jsonnet | kubectl create -f -) job_name=${job_name#job.batch/} job_name=${job_name% created} pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk 'match($0,!/NAME/) {print $1}') diff --git a/dockers/tpu-tests/tpu_test_cases.jsonnet b/dockers/base-xla/tpu_test_cases.jsonnet similarity index 100% rename from dockers/tpu-tests/tpu_test_cases.jsonnet rename to dockers/base-xla/tpu_test_cases.jsonnet diff --git a/dockers/tpu-tests/Dockerfile b/dockers/tpu-tests/Dockerfile deleted file mode 100644 index e23db55bb28e9..0000000000000 --- a/dockers/tpu-tests/Dockerfile +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -ARG PYTHON_VERSION=3.9 -ARG PYTORCH_VERSION=1.9 - -FROM pytorchlightning/pytorch_lightning:base-xla-py${PYTHON_VERSION}-torch${PYTORCH_VERSION} - -LABEL maintainer="Lightning-AI " - -COPY ./ ./lightning/ - -# Pull the legacy checkpoints -RUN cd lightning && \ - bash .actions/pull_legacy_checkpoints.sh - -RUN \ - pip install -q fire && \ - # drop unnecessary packages - pip install -r lightning/requirements/pytorch/devel.txt --no-cache-dir - -COPY ./dockers/tpu-tests/docker-entrypoint.sh /usr/local/bin/ -RUN chmod +x /usr/local/bin/docker-entrypoint.sh - -ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"] -CMD ["bash"] diff --git a/dockers/tpu-tests/docker-entrypoint.sh b/dockers/tpu-tests/docker-entrypoint.sh deleted file mode 100644 index 57abc703c8ace..0000000000000 --- a/dockers/tpu-tests/docker-entrypoint.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -# source ~/.bashrc -echo "running docker-entrypoint.sh" -# conda activate container -echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS -echo "printed TPU info" -export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}" -exec "$@" From 85bcf421bf256abc26a5c60851dc8c66f30312f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 10 Nov 2022 01:24:28 +0100 Subject: [PATCH 2/2] Rename --- .github/workflows/tpu-tests.yml | 8 ++++---- .../{tpu_test_cases.jsonnet => tpu_workflow.jsonnet} | 0 2 files changed, 4 insertions(+), 4 deletions(-) rename dockers/base-xla/{tpu_test_cases.jsonnet => tpu_workflow.jsonnet} (100%) diff --git a/.github/workflows/tpu-tests.yml b/.github/workflows/tpu-tests.yml index 2b15786fd5203..376412402ab81 100644 --- a/.github/workflows/tpu-tests.yml +++ b/.github/workflows/tpu-tests.yml @@ -32,7 +32,7 @@ jobs: if: github.event.pull_request.draft == false env: PYTHON_VER: 3.7 - timeout-minutes: 100 # should match the timeout in `tpu_test_cases.jsonnet` + timeout-minutes: 100 # should match the timeout in `tpu_workflow.jsonnet` steps: - uses: actions/checkout@v3 @@ -63,9 +63,9 @@ jobs: PR_NUMBER: ${{ github.event.pull_request.number }} SHA: ${{ github.event.pull_request.head.sha }} run: | - python -c "fname = 'dockers/base-xla/tpu_test_cases.jsonnet' ; data = open(fname).read().replace('{PYTORCH_VERSION}', '$XLA_VER') + python -c "fname = 'dockers/base-xla/tpu_workflow.jsonnet' ; data = open(fname).read().replace('{PYTORCH_VERSION}', '$XLA_VER') data = data.replace('{PYTHON_VERSION}', '$PYTHON_VER').replace('{PR_NUMBER}', '$PR_NUMBER').replace('{SHA}', '$SHA') ; open(fname, 'w').write(data)" - cat dockers/base-xla/tpu_test_cases.jsonnet + cat dockers/base-xla/tpu_workflow.jsonnet shell: bash - uses: google-github-actions/auth@v0 @@ -81,7 +81,7 @@ jobs: - name: Deploy cluster run: | export PATH=$PATH:$HOME/go/bin - job_name=$(jsonnet -J ml-testing-accelerators/ dockers/base-xla/tpu_test_cases.jsonnet | kubectl create -f -) + job_name=$(jsonnet -J ml-testing-accelerators/ dockers/base-xla/tpu_workflow.jsonnet | kubectl create -f -) job_name=${job_name#job.batch/} job_name=${job_name% created} pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk 'match($0,!/NAME/) {print $1}') diff --git a/dockers/base-xla/tpu_test_cases.jsonnet b/dockers/base-xla/tpu_workflow.jsonnet similarity index 100% rename from dockers/base-xla/tpu_test_cases.jsonnet rename to dockers/base-xla/tpu_workflow.jsonnet