From b875e320736341be828fbd82880ce53e7d007987 Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Wed, 5 Nov 2025 17:15:00 -0800 Subject: [PATCH 01/17] Add docker push action --- .github/workflows/docker-build-push.yml | 113 ++++++++++++++++++++++++ docker/Dockerfile.rocm7.0 | 40 +++++++++ docker/Dockerfile.rocm7.1 | 39 ++++++++ 3 files changed, 192 insertions(+) create mode 100644 .github/workflows/docker-build-push.yml create mode 100644 docker/Dockerfile.rocm7.0 create mode 100644 docker/Dockerfile.rocm7.1 diff --git a/.github/workflows/docker-build-push.yml b/.github/workflows/docker-build-push.yml new file mode 100644 index 00000000..e47ae46b --- /dev/null +++ b/.github/workflows/docker-build-push.yml @@ -0,0 +1,113 @@ +name: Build and Push Docker Image + +on: + workflow_dispatch: + inputs: + triton_commit: + description: 'Triton commit SHA to use' + required: false + default: 'aafec417bded34db6308f5b3d6023daefae43905' + type: string + +env: + DOCKERHUB_USERNAME: muhaawad + IMAGE_NAME: iris-dev + +jobs: + build-and-push: + runs-on: [self-hosted, mi3008x] + permissions: + contents: read + packages: write + + strategy: + fail-fast: false + matrix: + include: + - dockerfile: ./docker/Dockerfile.rocm7.0 + base-name: "rocm7.0.2_ubuntu24.04_py3.12_pytorch_release_2.8.0" + - dockerfile: ./docker/Dockerfile.rocm7.1 + base-name: "rocm7.1_ubuntu24.04_py3.12_pytorch_release_2.8.0" + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set Triton short SHA + id: triton + run: | + TRITON_COMMIT="${{ inputs.triton_commit }}" + TRITON_SHORT="${TRITON_COMMIT:0:7}" + echo "short_sha=${TRITON_SHORT}" >> $GITHUB_OUTPUT + echo "full_sha=${TRITON_COMMIT}" >> $GITHUB_OUTPUT + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ env.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Build and push Docker image + id: build + uses: docker/build-push-action@v5 + with: + context: ./docker + file: ${{ matrix.dockerfile }} + push: true + tags: ${{ env.DOCKERHUB_USERNAME }}/${{ env.IMAGE_NAME }}:${{ matrix.base-name }}-triton-${{ steps.triton.outputs.short_sha }} + build-args: | + TRITON_COMMIT=${{ steps.triton.outputs.full_sha }} + cache-from: type=gha,scope=${{ matrix.base-name }} + cache-to: type=gha,mode=max,scope=${{ matrix.base-name }} + + outputs: + triton_short_sha: ${{ steps.triton.outputs.short_sha }} + + validation-test: + name: Validation Test + needs: build-and-push + runs-on: [self-hosted, mi3008x] + timeout-minutes: 30 + + strategy: + fail-fast: false + matrix: + include: + - base-name: "rocm7.0.2_ubuntu24.04_py3.12_pytorch_release_2.8.0" + - base-name: "rocm7.1_ubuntu24.04_py3.12_pytorch_release_2.8.0" + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Pull Docker image + run: | + docker pull ${{ env.DOCKERHUB_USERNAME }}/${{ env.IMAGE_NAME }}:${{ matrix.base-name }}-triton-${{ needs.build-and-push.outputs.triton_short_sha }} + + - name: Run validation test + run: | + set -e + + IMAGE_TAG="${{ env.DOCKERHUB_USERNAME }}/${{ env.IMAGE_NAME }}:${{ matrix.base-name }}-triton-${{ needs.build-and-push.outputs.triton_short_sha }}" + + echo "::group::Running validation test in container" + docker run --rm --network host \ + --device=/dev/kfd --device=/dev/dri \ + --ipc=host --shm-size 16G \ + --group-add video --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + -v ${{ github.workspace }}:/workspace \ + -w /workspace \ + ${IMAGE_TAG} bash -c " + set -e + pip install -e . + wget -O test_iris_distributed.py https://gist.githubusercontent.com/mawad-amd/6375dc078e39e256828f379e03310ec7/raw/a527c3192bee4615292769e340b1c73676f6945a/test_iris_distributed.py + python test_iris_distributed.py + " + echo "::endgroup::" + + echo "✅ Validation test passed for ${{ matrix.base-name }}!" + diff --git a/docker/Dockerfile.rocm7.0 b/docker/Dockerfile.rocm7.0 new file mode 100644 index 00000000..891dabda --- /dev/null +++ b/docker/Dockerfile.rocm7.0 @@ -0,0 +1,40 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. + +FROM rocm/pytorch:rocm7.0.2_ubuntu24.04_py3.12_pytorch_release_2.8.0 + +# Use bash shell for RUN commands +SHELL ["/bin/bash", "-c"] + +# Set environment variables +ENV TRITON_PATH=/opt/triton \ + ROCM_PATH=/opt/rocm \ + OMPI_MCA_mtl="^ofi" \ + OMPI_MCA_pml="ob1" + +ENV LD_LIBRARY_PATH=$ROCM_PATH/lib:$LD_LIBRARY_PATH \ + PATH="$ROCM_PATH/bin:$PATH" + +ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 \ + OMPI_ALLOW_RUN_AS_ROOT=1 + +# Install system packages +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y \ + git wget ninja-build cmake python3-pip python3-dev build-essential && \ + rm -rf /var/lib/apt/lists/* + +# Install Python packages with pip +RUN pip3 install --upgrade pip && \ + pip3 install wheel jupyter + +# Clone and install Triton +ARG TRITON_COMMIT=aafec417bded34db6308f5b3d6023daefae43905 +WORKDIR $TRITON_PATH +RUN git clone https://github.com/triton-lang/triton.git $TRITON_PATH +RUN git checkout ${TRITON_COMMIT} +RUN pip3 install -e . +ENV PYTHONPATH=$TRITON_PATH + +WORKDIR /workspace + diff --git a/docker/Dockerfile.rocm7.1 b/docker/Dockerfile.rocm7.1 new file mode 100644 index 00000000..efb3da91 --- /dev/null +++ b/docker/Dockerfile.rocm7.1 @@ -0,0 +1,39 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. + +FROM rocm/pytorch:rocm7.1_ubuntu24.04_py3.12_pytorch_release_2.8.0 + +# Use bash shell for RUN commands +SHELL ["/bin/bash", "-c"] + +# Set environment variables +ENV TRITON_PATH=/opt/triton \ + ROCM_PATH=/opt/rocm \ + OMPI_MCA_mtl="^ofi" \ + OMPI_MCA_pml="ob1" + +ENV LD_LIBRARY_PATH=$ROCM_PATH/lib:$LD_LIBRARY_PATH \ + PATH="$ROCM_PATH/bin:$PATH" + +ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 \ + OMPI_ALLOW_RUN_AS_ROOT=1 + +# Install system packages +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y \ + git wget ninja-build cmake python3-pip python3-dev build-essential && \ + rm -rf /var/lib/apt/lists/* + +# Install Python packages with pip +RUN pip3 install --upgrade pip && \ + pip3 install wheel jupyter + +# Clone and install Triton +WORKDIR $TRITON_PATH +RUN git clone https://github.com/triton-lang/triton.git $TRITON_PATH +RUN git checkout aafec417bded34db6308f5b3d6023daefae43905 +RUN pip3 install -e . +ENV PYTHONPATH=$TRITON_PATH + +WORKDIR /workspace + From 95fd213be59b98424b3cd091a2703febb2de5faa Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Wed, 5 Nov 2025 17:17:03 -0800 Subject: [PATCH 02/17] Use `IRIS_DOCKERHUB_TOKEN` --- .github/workflows/docker-build-push.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker-build-push.yml b/.github/workflows/docker-build-push.yml index e47ae46b..2866a595 100644 --- a/.github/workflows/docker-build-push.yml +++ b/.github/workflows/docker-build-push.yml @@ -48,7 +48,7 @@ jobs: uses: docker/login-action@v3 with: username: ${{ env.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} + password: ${{ secrets.IRIS_DOCKERHUB_TOKEN }} - name: Build and push Docker image id: build From 8fc5a9830af2f843e61ecaea6127ca2d00766ec6 Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Wed, 5 Nov 2025 17:20:15 -0800 Subject: [PATCH 03/17] Trigger on push --- .github/workflows/docker-build-push.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/docker-build-push.yml b/.github/workflows/docker-build-push.yml index 2866a595..9c51e8e4 100644 --- a/.github/workflows/docker-build-push.yml +++ b/.github/workflows/docker-build-push.yml @@ -1,6 +1,9 @@ name: Build and Push Docker Image on: + push: + branches: + - muhaawad/docker-images workflow_dispatch: inputs: triton_commit: From 15a596d67254f2c75a8c1111ee6a2f0abece6142 Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Wed, 5 Nov 2025 17:41:19 -0800 Subject: [PATCH 04/17] Use one job --- .github/workflows/docker-build-push.yml | 85 ++++++++++--------------- docker/Dockerfile.rocm7.1 | 3 +- 2 files changed, 37 insertions(+), 51 deletions(-) diff --git a/.github/workflows/docker-build-push.yml b/.github/workflows/docker-build-push.yml index 9c51e8e4..8e0202b6 100644 --- a/.github/workflows/docker-build-push.yml +++ b/.github/workflows/docker-build-push.yml @@ -17,11 +17,12 @@ env: IMAGE_NAME: iris-dev jobs: - build-and-push: + build-test: runs-on: [self-hosted, mi3008x] permissions: contents: read packages: write + timeout-minutes: 60 strategy: fail-fast: false @@ -36,7 +37,7 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 - - name: Set Triton short SHA + - name: Set Triton SHA id: triton run: | TRITON_COMMIT="${{ inputs.triton_commit }}" @@ -44,59 +45,24 @@ jobs: echo "short_sha=${TRITON_SHORT}" >> $GITHUB_OUTPUT echo "full_sha=${TRITON_COMMIT}" >> $GITHUB_OUTPUT - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Log in to Docker Hub - uses: docker/login-action@v3 - with: - username: ${{ env.DOCKERHUB_USERNAME }} - password: ${{ secrets.IRIS_DOCKERHUB_TOKEN }} - - - name: Build and push Docker image - id: build - uses: docker/build-push-action@v5 - with: - context: ./docker - file: ${{ matrix.dockerfile }} - push: true - tags: ${{ env.DOCKERHUB_USERNAME }}/${{ env.IMAGE_NAME }}:${{ matrix.base-name }}-triton-${{ steps.triton.outputs.short_sha }} - build-args: | - TRITON_COMMIT=${{ steps.triton.outputs.full_sha }} - cache-from: type=gha,scope=${{ matrix.base-name }} - cache-to: type=gha,mode=max,scope=${{ matrix.base-name }} - - outputs: - triton_short_sha: ${{ steps.triton.outputs.short_sha }} - - validation-test: - name: Validation Test - needs: build-and-push - runs-on: [self-hosted, mi3008x] - timeout-minutes: 30 - - strategy: - fail-fast: false - matrix: - include: - - base-name: "rocm7.0.2_ubuntu24.04_py3.12_pytorch_release_2.8.0" - - base-name: "rocm7.1_ubuntu24.04_py3.12_pytorch_release_2.8.0" - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Pull Docker image + - name: Build Docker image run: | - docker pull ${{ env.DOCKERHUB_USERNAME }}/${{ env.IMAGE_NAME }}:${{ matrix.base-name }}-triton-${{ needs.build-and-push.outputs.triton_short_sha }} + IMAGE_TAG="${{ env.DOCKERHUB_USERNAME }}/${{ env.IMAGE_NAME }}:${{ matrix.base-name }}-triton-${{ steps.triton.outputs.short_sha }}" + echo "Building ${IMAGE_TAG}..." + docker build \ + -f ${{ matrix.dockerfile }} \ + -t ${IMAGE_TAG} \ + --build-arg TRITON_COMMIT=${{ steps.triton.outputs.full_sha }} \ + ./docker + echo "✅ Build complete!" - - name: Run validation test + - name: Run validation tests run: | set -e - IMAGE_TAG="${{ env.DOCKERHUB_USERNAME }}/${{ env.IMAGE_NAME }}:${{ matrix.base-name }}-triton-${{ needs.build-and-push.outputs.triton_short_sha }}" + IMAGE_TAG="${{ env.DOCKERHUB_USERNAME }}/${{ env.IMAGE_NAME }}:${{ matrix.base-name }}-triton-${{ steps.triton.outputs.short_sha }}" - echo "::group::Running validation test in container" + echo "::group::Running validation tests" docker run --rm --network host \ --device=/dev/kfd --device=/dev/dri \ --ipc=host --shm-size 16G \ @@ -107,10 +73,29 @@ jobs: ${IMAGE_TAG} bash -c " set -e pip install -e . + + echo '=== Running external validation test ===' wget -O test_iris_distributed.py https://gist.githubusercontent.com/mawad-amd/6375dc078e39e256828f379e03310ec7/raw/a527c3192bee4615292769e340b1c73676f6945a/test_iris_distributed.py python test_iris_distributed.py + + echo '=== Running external gluon validation test ===' + wget -O test_iris_gluon_distributed.py https://gist.githubusercontent.com/mawad-amd/2666dde8ebe2755eb0c4f2108709fcd5/raw/aa567ef3185c37a80d25bc9724ae9589548261b4/test_iris_gluon_distributed.py + python test_iris_gluon_distributed.py " echo "::endgroup::" - echo "✅ Validation test passed for ${{ matrix.base-name }}!" + echo "✅ All validation tests passed for ${{ matrix.base-name }}!" + + - name: Log in to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ env.DOCKERHUB_USERNAME }} + password: ${{ secrets.IRIS_DOCKERHUB_TOKEN }} + + - name: Push Docker image + run: | + IMAGE_TAG="${{ env.DOCKERHUB_USERNAME }}/${{ env.IMAGE_NAME }}:${{ matrix.base-name }}-triton-${{ steps.triton.outputs.short_sha }}" + echo "Pushing ${IMAGE_TAG} to Docker Hub..." + docker push ${IMAGE_TAG} + echo "✅ Successfully pushed ${{ matrix.base-name }}!" diff --git a/docker/Dockerfile.rocm7.1 b/docker/Dockerfile.rocm7.1 index efb3da91..5e6ac9fa 100644 --- a/docker/Dockerfile.rocm7.1 +++ b/docker/Dockerfile.rocm7.1 @@ -29,9 +29,10 @@ RUN pip3 install --upgrade pip && \ pip3 install wheel jupyter # Clone and install Triton +ARG TRITON_COMMIT=aafec417bded34db6308f5b3d6023daefae43905 WORKDIR $TRITON_PATH RUN git clone https://github.com/triton-lang/triton.git $TRITON_PATH -RUN git checkout aafec417bded34db6308f5b3d6023daefae43905 +RUN git checkout ${TRITON_COMMIT} RUN pip3 install -e . ENV PYTHONPATH=$TRITON_PATH From 4636e5cc762300de611fa507cbdcd5b0b02ce74b Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Wed, 5 Nov 2025 17:51:35 -0800 Subject: [PATCH 05/17] Set default hash [skip ci] --- .github/workflows/docker-build-push.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker-build-push.yml b/.github/workflows/docker-build-push.yml index 8e0202b6..c80c8c86 100644 --- a/.github/workflows/docker-build-push.yml +++ b/.github/workflows/docker-build-push.yml @@ -40,7 +40,7 @@ jobs: - name: Set Triton SHA id: triton run: | - TRITON_COMMIT="${{ inputs.triton_commit }}" + TRITON_COMMIT="${{ inputs.triton_commit || 'aafec417bded34db6308f5b3d6023daefae43905' }}" TRITON_SHORT="${TRITON_COMMIT:0:7}" echo "short_sha=${TRITON_SHORT}" >> $GITHUB_OUTPUT echo "full_sha=${TRITON_COMMIT}" >> $GITHUB_OUTPUT From 04fbc36b9d7fb7c7a294a2b8967cd01ba96004e1 Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Wed, 5 Nov 2025 17:53:05 -0800 Subject: [PATCH 06/17] Trigger CI for Docker image build [docker only] --- .github/workflows/docker-build-push.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/docker-build-push.yml b/.github/workflows/docker-build-push.yml index c80c8c86..f26c3d51 100644 --- a/.github/workflows/docker-build-push.yml +++ b/.github/workflows/docker-build-push.yml @@ -12,6 +12,8 @@ on: default: 'aafec417bded34db6308f5b3d6023daefae43905' type: string +# This workflow always runs, even with [skip ci] or [ci skip] in commit message + env: DOCKERHUB_USERNAME: muhaawad IMAGE_NAME: iris-dev From 5af029228f12f66c8b25901c3355d6032fe532cc Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Wed, 5 Nov 2025 17:56:11 -0800 Subject: [PATCH 07/17] Skip lint workflow for [docker only] commits --- .github/workflows/lint.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 2b6d1dbb..9d328153 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -8,6 +8,7 @@ on: [push, pull_request] jobs: lint: runs-on: ubuntu-latest + if: "!contains(github.event.head_commit.message, '[docker only]')" steps: - name: Checkout repository From 03e85b38252400ba2a3792a18ae9b34d0a02ea8c Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Wed, 5 Nov 2025 17:57:05 -0800 Subject: [PATCH 08/17] Skip all workflows except docker-build-push for [docker only] commits --- .github/workflows/docs.yml | 1 + .github/workflows/iris-external-validation-test.yml | 1 + .github/workflows/iris-performance-regression-test.yml | 1 + .github/workflows/iris-pip-install-test.yml | 1 + .github/workflows/iris-tests-apptainer.yml | 1 + 5 files changed, 5 insertions(+) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 4e7dd3da..ce0fbea9 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -28,6 +28,7 @@ concurrency: jobs: build: runs-on: ubuntu-latest + if: "!contains(github.event.head_commit.message, '[docker only]')" steps: - name: Checkout uses: actions/checkout@v4 diff --git a/.github/workflows/iris-external-validation-test.yml b/.github/workflows/iris-external-validation-test.yml index 2cac214c..f5342931 100644 --- a/.github/workflows/iris-external-validation-test.yml +++ b/.github/workflows/iris-external-validation-test.yml @@ -15,6 +15,7 @@ jobs: build-container-image: runs-on: [self-hosted, mi3008x] timeout-minutes: 90 + if: "!contains(github.event.head_commit.message, '[docker only]')" steps: - name: Checkout repository diff --git a/.github/workflows/iris-performance-regression-test.yml b/.github/workflows/iris-performance-regression-test.yml index fa017886..9a4d5bb8 100644 --- a/.github/workflows/iris-performance-regression-test.yml +++ b/.github/workflows/iris-performance-regression-test.yml @@ -15,6 +15,7 @@ jobs: build-container-image: runs-on: [self-hosted, mi3008x] timeout-minutes: 20 + if: "!contains(github.event.head_commit.message, '[docker only]')" steps: - name: Checkout repository diff --git a/.github/workflows/iris-pip-install-test.yml b/.github/workflows/iris-pip-install-test.yml index 739af005..eaa45034 100644 --- a/.github/workflows/iris-pip-install-test.yml +++ b/.github/workflows/iris-pip-install-test.yml @@ -15,6 +15,7 @@ jobs: build-container-image: runs-on: [self-hosted, mi3008x] timeout-minutes: 90 + if: "!contains(github.event.head_commit.message, '[docker only]')" steps: - name: Checkout repository diff --git a/.github/workflows/iris-tests-apptainer.yml b/.github/workflows/iris-tests-apptainer.yml index 63950f95..4f114a53 100644 --- a/.github/workflows/iris-tests-apptainer.yml +++ b/.github/workflows/iris-tests-apptainer.yml @@ -15,6 +15,7 @@ jobs: build-container-image: runs-on: [self-hosted, mi3008x] timeout-minutes: 90 + if: "!contains(github.event.head_commit.message, '[docker only]')" steps: - name: Checkout repository From a718c6471870bbde6a9a42f5392ef799494116e5 Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Wed, 5 Nov 2025 18:00:02 -0800 Subject: [PATCH 09/17] Apply [docker only] condition to ALL jobs in workflows --- .github/workflows/iris-external-validation-test.yml | 2 ++ .github/workflows/iris-performance-regression-test.yml | 1 + .github/workflows/iris-pip-install-test.yml | 2 ++ .github/workflows/iris-tests-apptainer.yml | 2 ++ 4 files changed, 7 insertions(+) diff --git a/.github/workflows/iris-external-validation-test.yml b/.github/workflows/iris-external-validation-test.yml index f5342931..831434b6 100644 --- a/.github/workflows/iris-external-validation-test.yml +++ b/.github/workflows/iris-external-validation-test.yml @@ -42,6 +42,7 @@ jobs: needs: build-container-image runs-on: [self-hosted, mi3008x] timeout-minutes: 30 + if: "!contains(github.event.head_commit.message, '[docker only]')" steps: - name: Checkout repository @@ -71,6 +72,7 @@ jobs: needs: build-container-image runs-on: [self-hosted, mi3008x] timeout-minutes: 30 + if: "!contains(github.event.head_commit.message, '[docker only]')" steps: - name: Checkout repository diff --git a/.github/workflows/iris-performance-regression-test.yml b/.github/workflows/iris-performance-regression-test.yml index 9a4d5bb8..185ef726 100644 --- a/.github/workflows/iris-performance-regression-test.yml +++ b/.github/workflows/iris-performance-regression-test.yml @@ -42,6 +42,7 @@ jobs: needs: build-container-image runs-on: [self-hosted, mi3008x] timeout-minutes: 30 + if: "!contains(github.event.head_commit.message, '[docker only]')" strategy: fail-fast: false matrix: diff --git a/.github/workflows/iris-pip-install-test.yml b/.github/workflows/iris-pip-install-test.yml index eaa45034..b43630d4 100644 --- a/.github/workflows/iris-pip-install-test.yml +++ b/.github/workflows/iris-pip-install-test.yml @@ -42,6 +42,7 @@ jobs: needs: build-container-image runs-on: [self-hosted, mi3008x] timeout-minutes: 30 + if: "!contains(github.event.head_commit.message, '[docker only]')" steps: - name: Checkout repository @@ -134,6 +135,7 @@ jobs: needs: build-container-image runs-on: [self-hosted, mi3008x] timeout-minutes: 30 + if: "!contains(github.event.head_commit.message, '[docker only]')" steps: - name: Checkout repository diff --git a/.github/workflows/iris-tests-apptainer.yml b/.github/workflows/iris-tests-apptainer.yml index 4f114a53..58a46d5c 100644 --- a/.github/workflows/iris-tests-apptainer.yml +++ b/.github/workflows/iris-tests-apptainer.yml @@ -41,6 +41,7 @@ jobs: needs: build-container-image runs-on: [self-hosted, mi3008x] timeout-minutes: 20 + if: "!contains(github.event.head_commit.message, '[docker only]')" steps: - name: Checkout repository @@ -95,6 +96,7 @@ jobs: needs: build-container-image runs-on: [self-hosted, mi3008x] timeout-minutes: 15 + if: "!contains(github.event.head_commit.message, '[docker only]')" steps: - name: Checkout repository From 01a59b560456e23aa0f4eca580138beee99b228d Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Wed, 5 Nov 2025 18:24:32 -0800 Subject: [PATCH 10/17] Use underscores in tag naming and container_exec.sh script [docker only] --- .github/workflows/docker-build-push.yml | 37 ++++++++++--------------- 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/.github/workflows/docker-build-push.yml b/.github/workflows/docker-build-push.yml index f26c3d51..b3344026 100644 --- a/.github/workflows/docker-build-push.yml +++ b/.github/workflows/docker-build-push.yml @@ -49,7 +49,7 @@ jobs: - name: Build Docker image run: | - IMAGE_TAG="${{ env.DOCKERHUB_USERNAME }}/${{ env.IMAGE_NAME }}:${{ matrix.base-name }}-triton-${{ steps.triton.outputs.short_sha }}" + IMAGE_TAG="${{ env.DOCKERHUB_USERNAME }}/${{ env.IMAGE_NAME }}:${{ matrix.base-name }}_triton_${{ steps.triton.outputs.short_sha }}" echo "Building ${IMAGE_TAG}..." docker build \ -f ${{ matrix.dockerfile }} \ @@ -62,28 +62,21 @@ jobs: run: | set -e - IMAGE_TAG="${{ env.DOCKERHUB_USERNAME }}/${{ env.IMAGE_NAME }}:${{ matrix.base-name }}-triton-${{ steps.triton.outputs.short_sha }}" + IMAGE_TAG="${{ env.DOCKERHUB_USERNAME }}/${{ env.IMAGE_NAME }}:${{ matrix.base-name }}_triton_${{ steps.triton.outputs.short_sha }}" echo "::group::Running validation tests" - docker run --rm --network host \ - --device=/dev/kfd --device=/dev/dri \ - --ipc=host --shm-size 16G \ - --group-add video --cap-add=SYS_PTRACE \ - --security-opt seccomp=unconfined \ - -v ${{ github.workspace }}:/workspace \ - -w /workspace \ - ${IMAGE_TAG} bash -c " - set -e - pip install -e . - - echo '=== Running external validation test ===' - wget -O test_iris_distributed.py https://gist.githubusercontent.com/mawad-amd/6375dc078e39e256828f379e03310ec7/raw/a527c3192bee4615292769e340b1c73676f6945a/test_iris_distributed.py - python test_iris_distributed.py - - echo '=== Running external gluon validation test ===' - wget -O test_iris_gluon_distributed.py https://gist.githubusercontent.com/mawad-amd/2666dde8ebe2755eb0c4f2108709fcd5/raw/aa567ef3185c37a80d25bc9724ae9589548261b4/test_iris_gluon_distributed.py - python test_iris_gluon_distributed.py - " + bash .github/scripts/container_exec.sh --image "${IMAGE_TAG}" " + set -e + pip install -e . + + echo '=== Running external validation test ===' + wget -O test_iris_distributed.py https://gist.githubusercontent.com/mawad-amd/6375dc078e39e256828f379e03310ec7/raw/a527c3192bee4615292769e340b1c73676f6945a/test_iris_distributed.py + python test_iris_distributed.py + + echo '=== Running external gluon validation test ===' + wget -O test_iris_gluon_distributed.py https://gist.githubusercontent.com/mawad-amd/2666dde8ebe2755eb0c4f2108709fcd5/raw/aa567ef3185c37a80d25bc9724ae9589548261b4/test_iris_gluon_distributed.py + python test_iris_gluon_distributed.py + " echo "::endgroup::" echo "✅ All validation tests passed for ${{ matrix.base-name }}!" @@ -96,7 +89,7 @@ jobs: - name: Push Docker image run: | - IMAGE_TAG="${{ env.DOCKERHUB_USERNAME }}/${{ env.IMAGE_NAME }}:${{ matrix.base-name }}-triton-${{ steps.triton.outputs.short_sha }}" + IMAGE_TAG="${{ env.DOCKERHUB_USERNAME }}/${{ env.IMAGE_NAME }}:${{ matrix.base-name }}_triton_${{ steps.triton.outputs.short_sha }}" echo "Pushing ${IMAGE_TAG} to Docker Hub..." docker push ${IMAGE_TAG} echo "✅ Successfully pushed ${{ matrix.base-name }}!" From 95d4c7aea0177b968e9a3057a5784661c8bba755 Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Wed, 5 Nov 2025 18:33:30 -0800 Subject: [PATCH 11/17] Move ruff to dev dependencies --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index cc8757f5..33e7f6b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,6 @@ requires-python = ">=3.8" dependencies = [ "numpy", "requests", - "ruff", ] [project.urls] @@ -33,6 +32,7 @@ dev = [ "pytest", "black", "mypy", + "ruff", ] [tool.setuptools] From 39b289255377adbcf0b48b55835bdd6e18b01178 Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Wed, 5 Nov 2025 18:43:13 -0800 Subject: [PATCH 12/17] Make Python site-packages writable for all users [docker only] --- docker/Dockerfile.rocm7.0 | 5 +++++ docker/Dockerfile.rocm7.1 | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/docker/Dockerfile.rocm7.0 b/docker/Dockerfile.rocm7.0 index 891dabda..e171cf2a 100644 --- a/docker/Dockerfile.rocm7.0 +++ b/docker/Dockerfile.rocm7.0 @@ -38,3 +38,8 @@ ENV PYTHONPATH=$TRITON_PATH WORKDIR /workspace +# Make Python site-packages writable by all users for development +RUN chmod -R a+w /usr/local/lib/python3.12/site-packages/ || \ + chmod -R a+w /opt/conda/lib/python3.12/site-packages/ || \ + true + diff --git a/docker/Dockerfile.rocm7.1 b/docker/Dockerfile.rocm7.1 index 5e6ac9fa..153e75ac 100644 --- a/docker/Dockerfile.rocm7.1 +++ b/docker/Dockerfile.rocm7.1 @@ -38,3 +38,8 @@ ENV PYTHONPATH=$TRITON_PATH WORKDIR /workspace +# Make Python site-packages writable by all users for development +RUN chmod -R a+w /usr/local/lib/python3.12/site-packages/ || \ + chmod -R a+w /opt/conda/lib/python3.12/site-packages/ || \ + true + From a7c695e04bb7bb053c9d92dc580635634b765c37 Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Wed, 5 Nov 2025 18:44:22 -0800 Subject: [PATCH 13/17] Fix site-packages path to /opt/venv [docker only] --- docker/Dockerfile.rocm7.0 | 4 +--- docker/Dockerfile.rocm7.1 | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/docker/Dockerfile.rocm7.0 b/docker/Dockerfile.rocm7.0 index e171cf2a..b679d14c 100644 --- a/docker/Dockerfile.rocm7.0 +++ b/docker/Dockerfile.rocm7.0 @@ -39,7 +39,5 @@ ENV PYTHONPATH=$TRITON_PATH WORKDIR /workspace # Make Python site-packages writable by all users for development -RUN chmod -R a+w /usr/local/lib/python3.12/site-packages/ || \ - chmod -R a+w /opt/conda/lib/python3.12/site-packages/ || \ - true +RUN chmod -R a+w /opt/venv/lib/python3.12/site-packages/ diff --git a/docker/Dockerfile.rocm7.1 b/docker/Dockerfile.rocm7.1 index 153e75ac..9c6f7666 100644 --- a/docker/Dockerfile.rocm7.1 +++ b/docker/Dockerfile.rocm7.1 @@ -39,7 +39,5 @@ ENV PYTHONPATH=$TRITON_PATH WORKDIR /workspace # Make Python site-packages writable by all users for development -RUN chmod -R a+w /usr/local/lib/python3.12/site-packages/ || \ - chmod -R a+w /opt/conda/lib/python3.12/site-packages/ || \ - true +RUN chmod -R a+w /opt/venv/lib/python3.12/site-packages/ From bd9f2e492a4796f2ea070a19d1a9148dd65a4676 Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Wed, 5 Nov 2025 18:44:59 -0800 Subject: [PATCH 14/17] Remove all workflows except docker-build-push [docker only] --- .github/workflows/auto-label.yml | 53 ------ .github/workflows/docs.yml | 81 --------- .../iris-external-validation-test.yml | 98 ---------- .../iris-performance-regression-test.yml | 90 ---------- .github/workflows/iris-pip-install-test.yml | 170 ------------------ .github/workflows/iris-tests-apptainer.yml | 117 ------------ .github/workflows/lint.yml | 38 ---- 7 files changed, 647 deletions(-) delete mode 100644 .github/workflows/auto-label.yml delete mode 100644 .github/workflows/docs.yml delete mode 100644 .github/workflows/iris-external-validation-test.yml delete mode 100644 .github/workflows/iris-performance-regression-test.yml delete mode 100644 .github/workflows/iris-pip-install-test.yml delete mode 100644 .github/workflows/iris-tests-apptainer.yml delete mode 100644 .github/workflows/lint.yml diff --git a/.github/workflows/auto-label.yml b/.github/workflows/auto-label.yml deleted file mode 100644 index 5feff289..00000000 --- a/.github/workflows/auto-label.yml +++ /dev/null @@ -1,53 +0,0 @@ -name: Auto labeling -on: - issues: - types: [opened] - pull_request: - types: [opened] - -permissions: - contents: read - issues: write - pull-requests: write - -jobs: - # Label ISSUES using Renato66/auto-label - label-issues: - if: github.event_name == 'issues' - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - sparse-checkout: | - .github/workflows/auto-label.json5 - sparse-checkout-cone-mode: false - - uses: Renato66/auto-label@v3 - with: - repo-token: ${{ secrets.GITHUB_TOKEN }} - - # Add ISSUES to ROCm Project #91 so they land in Todo - add-issues-to-project: - if: github.event_name == 'issues' - runs-on: ubuntu-latest - steps: - - uses: actions/add-to-project@v1.0.2 - with: - project-url: https://github.com/orgs/ROCm/projects/91 - github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} - - # PRs: label so the project rule moves them to In Progress - label-prs: - if: github.event_name == 'pull_request' && !github.event.pull_request.head.repo.fork - runs-on: ubuntu-latest - steps: - - name: Add iris + in-progress labels to PR - uses: actions/github-script@v7 - with: - script: | - await github.rest.issues.addLabels({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.payload.pull_request.number, - labels: ["iris", "in-progress"] - }) - diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml deleted file mode 100644 index ce0fbea9..00000000 --- a/.github/workflows/docs.yml +++ /dev/null @@ -1,81 +0,0 @@ -name: Build and Deploy Documentation - -on: - push: - branches: [ main ] - paths: - - 'docs/**' - - 'iris/**' - - 'examples/**' - - '.github/workflows/docs.yml' - pull_request: - branches: [ main ] - paths: - - 'docs/**' - - 'iris/**' - - 'examples/**' - - '.github/workflows/docs.yml' - -permissions: - contents: read - pages: write - id-token: write - -concurrency: - group: "pages-${{ github.ref }}" - cancel-in-progress: false - -jobs: - build: - runs-on: ubuntu-latest - if: "!contains(github.event.head_commit.message, '[docker only]')" - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - ref: ${{ github.ref }} - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.11' - - - name: Install system dependencies - run: | - sudo apt-get update - sudo apt-get install -y build-essential - - - name: Install Python dependencies - run: | - python -m pip install --upgrade pip - pip install -r docs/sphinx/requirements.txt - - - name: Build documentation - run: | - cd docs - chmod +x build_docs.sh - ./build_docs.sh - - - name: Upload documentation artifact - uses: actions/upload-artifact@v4 - with: - name: documentation - path: docs/_build/html - retention-days: 30 - - - name: Upload artifact for GitHub Pages - uses: actions/upload-pages-artifact@v3 - with: - path: docs/_build/html - - deploy: - environment: - name: github-pages - url: ${{ steps.deployment.outputs.page_url }} - runs-on: ubuntu-latest - needs: build - if: github.ref == 'refs/heads/main' && github.event_name == 'push' - steps: - - name: Deploy to GitHub Pages - id: deployment - uses: actions/deploy-pages@v4 diff --git a/.github/workflows/iris-external-validation-test.yml b/.github/workflows/iris-external-validation-test.yml deleted file mode 100644 index 831434b6..00000000 --- a/.github/workflows/iris-external-validation-test.yml +++ /dev/null @@ -1,98 +0,0 @@ -name: Iris External Validation Test - -on: - push: - branches: [ main ] - pull_request: - branches: [ main ] - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} - -jobs: - build-container-image: - runs-on: [self-hosted, mi3008x] - timeout-minutes: 90 - if: "!contains(github.event.head_commit.message, '[docker only]')" - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Setup Apptainer (if not available) - run: | - if ! command -v apptainer &> /dev/null && ! command -v docker &> /dev/null; then - echo "Neither Apptainer nor Docker found, installing Apptainer..." - apt-get update && apt-get install -y software-properties-common - add-apt-repository -y ppa:apptainer/ppa - apt-get update && apt-get install -y apptainer - else - echo "Container runtime already available" - fi - - - name: Build Iris container - run: | - # Use the universal container build script - bash .github/scripts/container_build.sh - - external-validation-test: - name: External Validation Test - needs: build-container-image - runs-on: [self-hosted, mi3008x] - timeout-minutes: 30 - if: "!contains(github.event.head_commit.message, '[docker only]')" - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Cleanup lingering ports before tests - run: | - bash .github/scripts/cleanup_ports.sh - - - name: Run External Validation Test - run: | - set -e - - echo "::group::Running external validation test" - bash .github/scripts/container_exec.sh " - set -e - pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }} - wget -O test_iris_distributed.py https://gist.githubusercontent.com/mawad-amd/6375dc078e39e256828f379e03310ec7/raw/a527c3192bee4615292769e340b1c73676f6945a/test_iris_distributed.py - python test_iris_distributed.py - " - echo "::endgroup::" - - echo "✅ External validation test passed!" - - external-gluon-validation-test: - name: External Gluon Validation Test - needs: build-container-image - runs-on: [self-hosted, mi3008x] - timeout-minutes: 30 - if: "!contains(github.event.head_commit.message, '[docker only]')" - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Cleanup lingering ports before tests - run: | - bash .github/scripts/cleanup_ports.sh - - - name: Run External Gluon Validation Test - run: | - set -e - - echo "::group::Running external gluon validation test" - bash .github/scripts/container_exec.sh --gpus "0,1" " - set -e - pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }} - wget -O test_iris_gluon_distributed.py https://gist.githubusercontent.com/mawad-amd/2666dde8ebe2755eb0c4f2108709fcd5/raw/aa567ef3185c37a80d25bc9724ae9589548261b4/test_iris_gluon_distributed.py - python test_iris_gluon_distributed.py - " - echo "::endgroup::" - - echo "✅ External gluon validation test passed!" \ No newline at end of file diff --git a/.github/workflows/iris-performance-regression-test.yml b/.github/workflows/iris-performance-regression-test.yml deleted file mode 100644 index 185ef726..00000000 --- a/.github/workflows/iris-performance-regression-test.yml +++ /dev/null @@ -1,90 +0,0 @@ -name: Iris Performance Regression Test - -on: - push: - branches: [ main ] - pull_request: - branches: [ main ] - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} - -jobs: - build-container-image: - runs-on: [self-hosted, mi3008x] - timeout-minutes: 20 - if: "!contains(github.event.head_commit.message, '[docker only]')" - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Setup Apptainer (if not available) - run: | - if ! command -v apptainer &> /dev/null && ! command -v docker &> /dev/null; then - echo "Neither Apptainer nor Docker found, installing Apptainer..." - apt-get update && apt-get install -y software-properties-common - add-apt-repository -y ppa:apptainer/ppa - apt-get update && apt-get install -y apptainer - else - echo "Container runtime already available" - fi - - - name: Build Iris container - run: | - # Use the universal container build script - bash .github/scripts/container_build.sh - - performance-test: - name: ${{ matrix.example_name }} - needs: build-container-image - runs-on: [self-hosted, mi3008x] - timeout-minutes: 30 - if: "!contains(github.event.head_commit.message, '[docker only]')" - strategy: - fail-fast: false - matrix: - # Performance baselines measured on AMD Instinct MI325X (8 GPUs) - include: - # Disabled https://github.com/ROCm/iris/issues/238 - #- example_name: "GEMM All-Scatter WG Specialization" - # example_path: "10_gemm_all_scatter_wg_specialization" - # tflops_threshold: 1600 # Actual: ~2182 TFLOPs - # benchmark_args: "-m 16384 -n 16384 -k 16384 --BLK_M 128 --BLK_N 128 --BLK_K 64 --gsize_m 6 --gemm_sms 256" - - - example_name: "GEMM All-Scatter" - example_path: "07_gemm_all_scatter" - tflops_threshold: 1000 # Actual: ~1407 TFLOPs - benchmark_args: "-m 16384 -n 16384 -k 16384 --BLK_M 256 --BLK_N 64 --BLK_K 64 --gsize_m 6 --gemm_sms 256" - - - example_name: "GEMM All-Scatter Producer-Consumer" - example_path: "11_gemm_all_scatter_producer_consumer" - tflops_threshold: 1600 # Actual: ~2190 TFLOPs - benchmark_args: "-m 16384 -n 16384 -k 16384 --BLK_M 128 --BLK_N 128 --BLK_K 64 --gsize_m 6 --gemm_sms 256 --comm_sms 48" - - - example_name: "GEMM All-Scatter Bulk Synchronous" - example_path: "12_gemm_all_scatter_bulk_synchronous" - tflops_threshold: 900 # Actual: ~1262 TFLOPs - benchmark_args: "-m 16384 -n 16384 -k 16384 --BLK_M 128 --BLK_N 128 --BLK_K 64 --gsize_m 6 --gemm_sms 256" - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Cleanup lingering ports before tests - run: | - bash .github/scripts/cleanup_ports.sh - - - name: Run ${{ matrix.example_name }} Benchmark (8 ranks) - run: | - set -e - - echo "::group::Running performance benchmark" - bash .github/scripts/run_perf_benchmark.sh \ - "${{ matrix.example_path }}" \ - "${{ matrix.tflops_threshold }}" \ - ${{ matrix.benchmark_args }} - echo "::endgroup::" - diff --git a/.github/workflows/iris-pip-install-test.yml b/.github/workflows/iris-pip-install-test.yml deleted file mode 100644 index b43630d4..00000000 --- a/.github/workflows/iris-pip-install-test.yml +++ /dev/null @@ -1,170 +0,0 @@ -name: Iris Pip Install Test - -on: - push: - branches: [ main ] - pull_request: - branches: [ main ] - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} - -jobs: - build-container-image: - runs-on: [self-hosted, mi3008x] - timeout-minutes: 90 - if: "!contains(github.event.head_commit.message, '[docker only]')" - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Setup Apptainer (if not available) - run: | - if ! command -v apptainer &> /dev/null && ! command -v docker &> /dev/null; then - echo "Neither Apptainer nor Docker found, installing Apptainer..." - apt-get update && apt-get install -y software-properties-common - add-apt-repository -y ppa:apptainer/ppa - apt-get update && apt-get install -y apptainer - else - echo "Container runtime already available" - fi - - - name: Build Iris container - run: | - # Use the universal container build script - bash .github/scripts/container_build.sh - - test-1-2-4-ranks: - name: Pip Install Test 1/2/4 Ranks (Parallel) - needs: build-container-image - runs-on: [self-hosted, mi3008x] - timeout-minutes: 30 - if: "!contains(github.event.head_commit.message, '[docker only]')" - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Cleanup lingering ports before tests - run: | - bash .github/scripts/cleanup_ports.sh - - - name: Run pip install tests for 1, 2, 4 ranks in parallel - run: | - set -e - - # Run tests in parallel with different GPU assignments - # Note: Each test gets 2+ GPUs even if it only uses some of them. - # This allows tests like test_empty_device_handling to verify that - # allocating on a different device correctly raises an error. - - echo "::group::Starting parallel tests" - echo "Starting 1-rank test on GPUs 0,1..." - bash .github/scripts/container_exec.sh --gpus "0,1" " - set -e - pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }} - pip install -e . - for test_file in tests/examples/test_*.py; do - echo \"Testing: \$test_file with 1 ranks\" - python tests/run_tests_distributed.py --num_ranks 1 \"\$test_file\" -v --tb=short --durations=10 - done - for test_file in tests/unittests/test_*.py; do - echo \"Testing: \$test_file with 1 ranks\" - python tests/run_tests_distributed.py --num_ranks 1 \"\$test_file\" -v --tb=short --durations=10 - done - " & - PID1=$! - - echo "Starting 2-rank test on GPUs 2,3..." - bash .github/scripts/container_exec.sh --gpus "2,3" " - set -e - pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }} - pip install -e . - for test_file in tests/examples/test_*.py; do - echo \"Testing: \$test_file with 2 ranks\" - python tests/run_tests_distributed.py --num_ranks 2 \"\$test_file\" -v --tb=short --durations=10 - done - for test_file in tests/unittests/test_*.py; do - echo \"Testing: \$test_file with 2 ranks\" - python tests/run_tests_distributed.py --num_ranks 2 \"\$test_file\" -v --tb=short --durations=10 - done - " & - PID2=$! - - echo "Starting 4-rank test on GPUs 4,5,6,7..." - bash .github/scripts/container_exec.sh --gpus "4,5,6,7" " - set -e - pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }} - pip install -e . - for test_file in tests/examples/test_*.py; do - echo \"Testing: \$test_file with 4 ranks\" - python tests/run_tests_distributed.py --num_ranks 4 \"\$test_file\" -v --tb=short --durations=10 - done - for test_file in tests/unittests/test_*.py; do - echo \"Testing: \$test_file with 4 ranks\" - python tests/run_tests_distributed.py --num_ranks 4 \"\$test_file\" -v --tb=short --durations=10 - done - " & - PID4=$! - echo "::endgroup::" - - # Wait for all parallel tests and track failures - echo "::group::Waiting for parallel tests to complete" - FAIL=0 - FAILED_TESTS="" - - wait $PID1 || { echo "::error::1-rank test FAILED"; FAILED_TESTS="$FAILED_TESTS 1-rank"; FAIL=1; } - wait $PID2 || { echo "::error::2-rank test FAILED"; FAILED_TESTS="$FAILED_TESTS 2-rank"; FAIL=1; } - wait $PID4 || { echo "::error::4-rank test FAILED"; FAILED_TESTS="$FAILED_TESTS 4-rank"; FAIL=1; } - echo "::endgroup::" - - if [ $FAIL -eq 1 ]; then - echo "::error::Parallel tests failed:$FAILED_TESTS" - exit 1 - fi - - echo "✅ All parallel tests (1, 2, 4 ranks) passed!" - - test-8-ranks: - name: Pip Install Test 8 Ranks - needs: build-container-image - runs-on: [self-hosted, mi3008x] - timeout-minutes: 30 - if: "!contains(github.event.head_commit.message, '[docker only]')" - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Cleanup lingering ports before tests - run: | - bash .github/scripts/cleanup_ports.sh - - - name: Run 8-rank pip install test - run: | - set -e - - echo "::group::Running 8-rank test on all GPUs" - bash .github/scripts/container_exec.sh --gpus "0,1,2,3,4,5,6,7" " - set -e - pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }} - pip install -e . - for test_file in tests/examples/test_*.py; do - echo \"Testing: \$test_file with 8 ranks\" - python tests/run_tests_distributed.py --num_ranks 8 \"\$test_file\" -v --tb=short --durations=10 - done - for test_file in tests/unittests/test_*.py; do - echo \"Testing: \$test_file with 8 ranks\" - python tests/run_tests_distributed.py --num_ranks 8 \"\$test_file\" -v --tb=short --durations=10 - done - " - echo "::endgroup::" - - echo "✅ 8-rank test passed!" diff --git a/.github/workflows/iris-tests-apptainer.yml b/.github/workflows/iris-tests-apptainer.yml deleted file mode 100644 index 58a46d5c..00000000 --- a/.github/workflows/iris-tests-apptainer.yml +++ /dev/null @@ -1,117 +0,0 @@ -name: Iris Tests - -on: - push: - branches: [ main ] - pull_request: - branches: [ main ] - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} - -jobs: - build-container-image: - runs-on: [self-hosted, mi3008x] - timeout-minutes: 90 - if: "!contains(github.event.head_commit.message, '[docker only]')" - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Setup Apptainer (if not available) - run: | - if ! command -v apptainer &> /dev/null && ! command -v docker &> /dev/null; then - echo "Neither Apptainer nor Docker found, installing Apptainer..." - apt-get update && apt-get install -y software-properties-common - add-apt-repository -y ppa:apptainer/ppa - apt-get update && apt-get install -y apptainer - else - echo "Container runtime already available" - fi - - - name: Build Iris container - run: | - # Use the universal container build script - bash .github/scripts/container_build.sh - test-1-2-4-ranks: - name: Test 1/2/4 Ranks (Parallel) - needs: build-container-image - runs-on: [self-hosted, mi3008x] - timeout-minutes: 20 - if: "!contains(github.event.head_commit.message, '[docker only]')" - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Cleanup lingering ports before tests - run: | - bash .github/scripts/cleanup_ports.sh - - - name: Run 1, 2, 4 rank tests in parallel - run: | - set -e - - # Run tests in parallel with different GPU assignments - # Note: Each test gets 2+ GPUs even if it only uses some of them. - # This allows tests like test_empty_device_handling to verify that - # allocating on a different device correctly raises an error. - - echo "::group::Starting parallel tests" - echo "Starting 1-rank test on GPUs 0,1..." - bash .github/scripts/run_tests.sh 1 "0,1" & - PID1=$! - - echo "Starting 2-rank test on GPUs 2,3..." - bash .github/scripts/run_tests.sh 2 "2,3" & - PID2=$! - - echo "Starting 4-rank test on GPUs 4,5,6,7..." - bash .github/scripts/run_tests.sh 4 "4,5,6,7" & - PID4=$! - echo "::endgroup::" - - # Wait for all parallel tests and track failures - echo "::group::Waiting for parallel tests to complete" - FAIL=0 - FAILED_TESTS="" - - wait $PID1 || { echo "::error::1-rank test FAILED"; FAILED_TESTS="$FAILED_TESTS 1-rank"; FAIL=1; } - wait $PID2 || { echo "::error::2-rank test FAILED"; FAILED_TESTS="$FAILED_TESTS 2-rank"; FAIL=1; } - wait $PID4 || { echo "::error::4-rank test FAILED"; FAILED_TESTS="$FAILED_TESTS 4-rank"; FAIL=1; } - echo "::endgroup::" - - if [ $FAIL -eq 1 ]; then - echo "::error::Parallel tests failed:$FAILED_TESTS" - exit 1 - fi - - echo "✅ All parallel tests (1, 2, 4 ranks) passed!" - - test-8-ranks: - name: Test 8 Ranks - needs: build-container-image - runs-on: [self-hosted, mi3008x] - timeout-minutes: 15 - if: "!contains(github.event.head_commit.message, '[docker only]')" - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Cleanup lingering ports before tests - run: | - bash .github/scripts/cleanup_ports.sh - - - name: Run 8-rank test - run: | - set -e - - echo "::group::Running 8-rank test on all GPUs" - bash .github/scripts/run_tests.sh 8 "0,1,2,3,4,5,6,7" - echo "::endgroup::" - - echo "✅ 8-rank test passed!" diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml deleted file mode 100644 index 9d328153..00000000 --- a/.github/workflows/lint.yml +++ /dev/null @@ -1,38 +0,0 @@ -# SPDX-License-Identifier: MIT -# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. - -name: Lint & Auto-Fix - -on: [push, pull_request] - -jobs: - lint: - runs-on: ubuntu-latest - if: "!contains(github.event.head_commit.message, '[docker only]')" - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: "3.10" - - - name: Install Ruff - run: pip install ruff - - - name: Run Ruff and auto-fix issues - run: ruff check . --fix - - - name: Run Ruff formatter - run: ruff format . - - - name: Commit and push fixes - run: | - git config --global user.name "github-actions[bot]" - git config --global user.email "github-actions[bot]@users.noreply.github.com" - git add . - git commit -m "Apply Ruff auto-fixes" || echo "No changes to commit" - git push - continue-on-error: true \ No newline at end of file From 1e2551462a11a3b519d62de67c03f1a1a7347627 Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Wed, 5 Nov 2025 18:49:14 -0800 Subject: [PATCH 15/17] Remove test --- .github/workflows/docker-build-push.yml | 44 ++++++++++++------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/.github/workflows/docker-build-push.yml b/.github/workflows/docker-build-push.yml index b3344026..a127c937 100644 --- a/.github/workflows/docker-build-push.yml +++ b/.github/workflows/docker-build-push.yml @@ -58,28 +58,28 @@ jobs: ./docker echo "✅ Build complete!" - - name: Run validation tests - run: | - set -e - - IMAGE_TAG="${{ env.DOCKERHUB_USERNAME }}/${{ env.IMAGE_NAME }}:${{ matrix.base-name }}_triton_${{ steps.triton.outputs.short_sha }}" - - echo "::group::Running validation tests" - bash .github/scripts/container_exec.sh --image "${IMAGE_TAG}" " - set -e - pip install -e . - - echo '=== Running external validation test ===' - wget -O test_iris_distributed.py https://gist.githubusercontent.com/mawad-amd/6375dc078e39e256828f379e03310ec7/raw/a527c3192bee4615292769e340b1c73676f6945a/test_iris_distributed.py - python test_iris_distributed.py - - echo '=== Running external gluon validation test ===' - wget -O test_iris_gluon_distributed.py https://gist.githubusercontent.com/mawad-amd/2666dde8ebe2755eb0c4f2108709fcd5/raw/aa567ef3185c37a80d25bc9724ae9589548261b4/test_iris_gluon_distributed.py - python test_iris_gluon_distributed.py - " - echo "::endgroup::" - - echo "✅ All validation tests passed for ${{ matrix.base-name }}!" +# - name: Run validation tests +# run: | +# set -e +# +# IMAGE_TAG="${{ env.DOCKERHUB_USERNAME }}/${{ env.IMAGE_NAME }}:${{ matrix.base-name }}_triton_${{ steps.triton.outputs.short_sha }}" +# +# echo "::group::Running validation tests" +# bash .github/scripts/container_exec.sh --image "${IMAGE_TAG}" " +# set -e +# pip install -e . +# +# echo '=== Running external validation test ===' +# wget -O test_iris_distributed.py https://gist.githubusercontent.com/mawad-amd/6375dc078e39e256828f379e03310ec7/raw/a527c3192bee4615292769e340b1c73676f6945a/test_iris_distributed.py +# python test_iris_distributed.py +# +# echo '=== Running external gluon validation test ===' +# wget -O test_iris_gluon_distributed.py https://gist.githubusercontent.com/mawad-amd/2666dde8ebe2755eb0c4f2108709fcd5/raw/aa567ef3185c37a80d25bc9724ae9589548261b4/test_iris_gluon_distributed.py +# python test_iris_gluon_distributed.py +# " +# echo "::endgroup::" +# +# echo "✅ All validation tests passed for ${{ matrix.base-name }}!" - name: Log in to Docker Hub uses: docker/login-action@v3 From 48d525de83b44e77bca4c1abb74a4248edee0721 Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Wed, 5 Nov 2025 18:58:49 -0800 Subject: [PATCH 16/17] Add ROCm 6.4.4 Docker image build [docker only] --- .github/workflows/docker-build-push.yml | 2 ++ docker/Dockerfile.rocm6.4 | 43 +++++++++++++++++++++++++ 2 files changed, 45 insertions(+) create mode 100644 docker/Dockerfile.rocm6.4 diff --git a/.github/workflows/docker-build-push.yml b/.github/workflows/docker-build-push.yml index a127c937..87e84d75 100644 --- a/.github/workflows/docker-build-push.yml +++ b/.github/workflows/docker-build-push.yml @@ -30,6 +30,8 @@ jobs: fail-fast: false matrix: include: + - dockerfile: ./docker/Dockerfile.rocm6.4 + base-name: "rocm6.4.4_ubuntu24.04_py3.12_pytorch_release_2.7.1" - dockerfile: ./docker/Dockerfile.rocm7.0 base-name: "rocm7.0.2_ubuntu24.04_py3.12_pytorch_release_2.8.0" - dockerfile: ./docker/Dockerfile.rocm7.1 diff --git a/docker/Dockerfile.rocm6.4 b/docker/Dockerfile.rocm6.4 new file mode 100644 index 00000000..9fd0995a --- /dev/null +++ b/docker/Dockerfile.rocm6.4 @@ -0,0 +1,43 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. + +FROM rocm/pytorch:rocm6.4.4_ubuntu24.04_py3.12_pytorch_release_2.7.1 + +# Use bash shell for RUN commands +SHELL ["/bin/bash", "-c"] + +# Set environment variables +ENV TRITON_PATH=/opt/triton \ + ROCM_PATH=/opt/rocm \ + OMPI_MCA_mtl="^ofi" \ + OMPI_MCA_pml="ob1" + +ENV LD_LIBRARY_PATH=$ROCM_PATH/lib:$LD_LIBRARY_PATH \ + PATH="$ROCM_PATH/bin:$PATH" + +ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 \ + OMPI_ALLOW_RUN_AS_ROOT=1 + +# Install system packages +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y \ + git wget ninja-build cmake python3-pip python3-dev build-essential && \ + rm -rf /var/lib/apt/lists/* + +# Install Python packages with pip +RUN pip3 install --upgrade pip && \ + pip3 install wheel jupyter + +# Clone and install Triton +ARG TRITON_COMMIT=aafec417bded34db6308f5b3d6023daefae43905 +WORKDIR $TRITON_PATH +RUN git clone https://github.com/triton-lang/triton.git $TRITON_PATH +RUN git checkout ${TRITON_COMMIT} +RUN pip3 install -e . +ENV PYTHONPATH=$TRITON_PATH + +WORKDIR /workspace + +# Make Python site-packages writable by all users for development +RUN chmod -R a+w /opt/venv/lib/python3.12/site-packages/ + From 75113f2cde9c378d381dd355a8374aa7776d5f3d Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Wed, 5 Nov 2025 19:32:47 -0800 Subject: [PATCH 17/17] Fix ROCm 6.4 Dockerfile - remove incorrect chmod path [docker only] --- docker/Dockerfile.rocm6.4 | 3 --- 1 file changed, 3 deletions(-) diff --git a/docker/Dockerfile.rocm6.4 b/docker/Dockerfile.rocm6.4 index 9fd0995a..bf201dcc 100644 --- a/docker/Dockerfile.rocm6.4 +++ b/docker/Dockerfile.rocm6.4 @@ -38,6 +38,3 @@ ENV PYTHONPATH=$TRITON_PATH WORKDIR /workspace -# Make Python site-packages writable by all users for development -RUN chmod -R a+w /opt/venv/lib/python3.12/site-packages/ -