From bc0f37d7e8c92ed30ba529bf292442ba902a7527 Mon Sep 17 00:00:00 2001
From: CSY
Date: Mon, 6 Jan 2025 13:56:14 +0800
Subject: [PATCH] [CI] add unit_tests_rocm.yml

---
 .github/workflows/unit_tests_rocm.yml | 454 ++++++++++++++++++++++++++
 1 file changed, 454 insertions(+)
 create mode 100644 .github/workflows/unit_tests_rocm.yml

diff --git a/.github/workflows/unit_tests_rocm.yml b/.github/workflows/unit_tests_rocm.yml
new file mode 100644
index 000000000..366eaae9a
--- /dev/null
+++ b/.github/workflows/unit_tests_rocm.yml
@@ -0,0 +1,454 @@
# ROCm unit-test workflow: builds a wheel on a self-hosted ROCm runner,
# then fans test files out across two matrix jobs (transformers_diff with
# pinned transformers==4.38.2, torch2_5 with the default image stack).
name: Unit Tests ROCM

run-name: "${{ github.event.inputs.note }}"

defaults:
  run:
    shell: bash -le {0}

on:
  repository_dispatch:
  workflow_dispatch:
    inputs:
      note:
        description: 'Note'
        required: false
        default: ''
      repo:
        description: 'GitHub repo {owner}/{repo}'
        required: false
        default: ''
      ref:
        description: 'GitHub ref: Branch, Tag or Commit SHA'
        required: false
        default: ''
      pr_number:
        description: 'PR Number'
        required: false
        type: number
      test_names:
        description: 'Input Test(s) to Run (default all)'
        required: false
        default: ''
      test_regex:
        description: 'Regex to filter test files'
        required: false
        default: ''
      artifact_id:
        description: 'Run id for artifact to be downloaded'
        required: false
        default: ''
      max-parallel:
        description: 'max parallel jobs'
        required: false
        default: '10'
      m4-only:
        description: 'only run m4(test only)'
        type: boolean
        required: false
        default: false

env:
  CUDA_DEVICE_ORDER: PCI_BUS_ID
  # Quoted: an unquoted 6.3 is parsed as a YAML float, which would
  # silently mangle versions like "6.30" -> "6.3".
  ROCM_VERSION: "6.3"
  PYTORCH_CUDA_ALLOC_CONF: 'expandable_segments:True'
  MAX_JOBS: 8
  RUNNER: 10.0.13.31
  TRANSFORMERS_DIFF_TESTS: "models/test_internlm.py,models/test_internlm2_5.py,models/test_xverse.py"
  TORCH_2_5_TESTS: "test_evalplus.py,test_perplexity.py,test_q4_ipex.py,test_save_loaded_quantized_model.py,test_quant_formats.py,models/test_hymba.py"
  IGNORED_TEST_FILES: "test_tgi.py,test_gptneox.py,models/test_mixtral.py,test_q4_torch_apple.py,test_ipex_xpu.py"
  GPTQMODEL_FORCE_BUILD: 1
  # Fall back to the triggering repo/ref so repository_dispatch runs
  # (which have no workflow inputs) still work.
  repo: ${{ github.event.inputs.repo || github.repository }}
  ref: ${{ github.event.inputs.ref || github.ref }}

concurrency:
  group: ${{ github.event.inputs.ref || github.ref }}-workflow-unit-tests-${{ github.event.inputs.test_names }}
  cancel-in-progress: true

jobs:
  # Resolves the artifact/whl server IP, the run id to fetch artifacts
  # from, and the max-parallel setting consumed by the matrix jobs.
  check-vm:
    runs-on: [ self-hosted, rocm ]
    container:
      image: modelcloud/gptqmodel:alpine-ci-v1
    outputs:
      ip: ${{ steps.get_ip.outputs.ip }}
      run_id: ${{ steps.get_ip.outputs.run_id }}
      max-parallel: ${{ steps.get_ip.outputs.max-parallel }}
    steps:
      - name: Print env
        run: |
          echo "repo: ${{ env.repo }}"
          echo "ref: ${{ env.ref }}"
          echo "artifact_id: ${{ github.event.inputs.artifact_id }}"
          echo "test_names: ${{ github.event.inputs.test_names }}"

      - name: Select server
        id: get_ip
        run: |
          echo "ip=$RUNNER" >> "$GITHUB_OUTPUT"

          # Writing to $GITHUB_OUTPUT does not set a shell variable, so
          # log $RUNNER directly (the original `$ip` was always empty).
          echo "ip: $RUNNER"

          # Reuse a previous run's wheel when artifact_id is supplied.
          if [ -n "${{ github.event.inputs.artifact_id }}" ]; then
            run_id="${{ github.event.inputs.artifact_id }}"
          else
            run_id="${{ github.run_id }}"
          fi
          echo "run_id=$run_id" >> "$GITHUB_OUTPUT"
          echo "artifact_id=$run_id"

          # Emit as JSON so downstream `fromJson(...).size` works.
          max_p=${{ github.event.inputs.max-parallel }}
          max_p="{\"size\": ${max_p:-10}}"
          echo "max-parallel=$max_p" >> "$GITHUB_OUTPUT"
          echo "max-parallel=$max_p"

  # Builds the two JSON lists of test files that feed the matrix jobs.
  list-test-files:
    runs-on: ubuntu-latest
    if: github.event.inputs.m4-only != 'true'
    outputs:
      torch-2-5-files: ${{ steps.files.outputs.torch-2-5-files }}
      transformers-files: ${{ steps.files.outputs.transformers-files }}

    steps:
      - name: Checkout Codes
        uses: actions/checkout@v4
        with:
          repository: ${{ env.repo }}
          ref: ${{ env.ref }}

      - name: Fetch PR by number
        if: ${{ github.event.inputs.pr_number != 0 }}
        run: |
          PR_NUMBER=${{ github.event.inputs.pr_number }}
          echo "pr number $PR_NUMBER"
          git config --global --add safe.directory $(pwd)
          git fetch origin pull/${PR_NUMBER}/head:pr-${PR_NUMBER}
          git checkout pr-${PR_NUMBER}

      - name: List files
        id: files
        run: |
          script="
          import json
          import os
          import re

          TRANSFORMERS_DIFF_TESTS = '${TRANSFORMERS_DIFF_TESTS}'
          IGNORED_TEST_FILES = '${IGNORED_TEST_FILES}'

          TEST_NAMES='${{ github.event.inputs.test_names }}'
          TEST_REGEX='${{ github.event.inputs.test_regex }}'

          input_test_files_list = [f.strip().removesuffix('.py') for f in TEST_NAMES.split(',') if f.strip()]

          transformers_test_files = [f.strip().removesuffix('.py') for f in f'{TRANSFORMERS_DIFF_TESTS}'.split(',') if f.strip()]
          transformers_test_files = [f for f in transformers_test_files if not input_test_files_list or f in input_test_files_list]

          # Exact-match ignore list. The previous substring check
          # (removesuffix('py') and 'in IGNORED_TEST_FILES') could
          # wrongly ignore tests/test_X.py when only models/test_X.py
          # was listed.
          ignored = [f.strip().removesuffix('.py') for f in IGNORED_TEST_FILES.split(',') if f.strip()]

          all_tests = [f.removesuffix('.py') for f in os.listdir('tests/') if f.startswith('test_') and f.endswith('.py') and f.removesuffix('.py') not in ignored]
          all_tests_models = ['models/'+f.removesuffix('.py') for f in os.listdir('tests/models') if f.startswith('test_') and f.endswith('.py') and 'models/'+f.removesuffix('.py') not in ignored]

          torch_2_5_test_files = [f for f in all_tests+all_tests_models if (not input_test_files_list or f in input_test_files_list) and f not in transformers_test_files]

          torch_2_5_test_files = [test for test in torch_2_5_test_files if re.match(rf'{TEST_REGEX}', test)]
          transformers_test_files = [test for test in transformers_test_files if re.match(rf'{TEST_REGEX}', test)]

          print(f'{json.dumps(torch_2_5_test_files)}|{json.dumps(transformers_test_files)}')
          "

          test_files=$(python3 -c "$script")
          IFS='|' read -r torch_2_5_test_files transformers_test_files <<< "$test_files"
          echo "torch-2-5-files=$torch_2_5_test_files" >> "$GITHUB_OUTPUT"
          echo "transformers-files=$transformers_test_files" >> "$GITHUB_OUTPUT"

          echo "Test files: $test_files"
          echo "Torch 2.5 Test files: $torch_2_5_test_files"
          echo "Transformers Test files: $transformers_test_files"
          echo "Ignored Test files: $IGNORED_TEST_FILES"

  # Builds the wheel once; skipped when reusing a previous artifact.
  build:
    runs-on: [ self-hosted, rocm ]
    needs: check-vm
    if: github.event.inputs.m4-only != 'true' && github.event.inputs.artifact_id == '' && !cancelled()
    container:
      image: ${{ needs.check-vm.outputs.ip }}:5000/modelcloud/gptqmodel:github-ci-v6-rocm
    steps:

      - name: Checkout Codes
        uses: actions/checkout@v4
        with:
          repository: ${{ env.repo }}
          ref: ${{ env.ref }}

      - name: Fetch PR by number
        if: ${{ github.event.inputs.pr_number != 0 }}
        run: |
          PR_NUMBER=${{ github.event.inputs.pr_number }}
          echo "pr number $PR_NUMBER"
          git config --global --add safe.directory $(pwd)
          git fetch origin pull/${PR_NUMBER}/head:pr-${PR_NUMBER}
          git checkout pr-${PR_NUMBER}

      - name: Print Env
        run: |
          echo "== pyenv =="
          pyenv versions
          echo "== python =="
          python --version
          echo "== nvcc =="
          nvcc --version
          echo "== torch =="
          pip show torch
          echo "##### pip list #####"
          pip list
          echo "----------"
          python -c 'from datasets import config; print("Datasets 缓存目录:", config.HF_DATASETS_CACHE)'

      - name: Compile
        timeout-minutes: 35
        run: |
          pyenv versions
          python setup.py bdist_wheel

      - name: Test install
        run: |
          ls -ahl dist
          whl=$(ls -t dist/*.whl | head -n 1 | xargs basename)
          sha256=$(sha256sum dist/$whl)
          echo "hash=$sha256"

          echo "WHL_HASH=$sha256" >> $GITHUB_ENV
          echo "WHL_NAME=$whl" >> $GITHUB_ENV

          twine check dist/$whl
          uv pip install dist/$whl

      # Best-effort mirror to the internal wheel server; the artifact
      # upload below is the authoritative copy.
      - name: Upload wheel
        continue-on-error: true
        run: |
          curl -s -F "runid=${{ github.run_id }}" -F "repo=${{ env.repo }}" -F "ref=${{ env.ref }}" -F "sha256=${{ env.WHL_HASH }}" -F "file=@dist/${{ env.WHL_NAME }}" http://${{ needs.check-vm.outputs.ip }}/gpu/whl/upload

      - name: Upload to artifact
        uses: actions/upload-artifact@v4
        with:
          name: dist
          path: dist

      - name: Clean cache
        if: always()
        run: pip cache purge && uv cache clean

  transformers_diff:
    needs:
      - build
      - list-test-files
      - check-vm
    runs-on: [ self-hosted, rocm ]
    if: always() && !cancelled() && (needs.build.result == 'success' || github.event.inputs.artifact_id != '') && github.event.inputs.m4-only != 'true' && needs.list-test-files.outputs.transformers-files != '[]'
    container:
      image: ${{ needs.check-vm.outputs.ip }}:5000/modelcloud/gptqmodel:github-ci-v6-rocm
      options: --device /dev/dri --device /dev/kfd --ipc=host
      volumes:
        - /home/ci/models:/monster/data/model
    strategy:
      fail-fast: false
      max-parallel: ${{ fromJson(needs.check-vm.outputs.max-parallel).size || 10 }}
      matrix:
        test_script: ${{ fromJSON(needs.list-test-files.outputs.transformers-files) }}
    steps:
      - name: Checkout Codes
        uses: actions/checkout@v4
        with:
          # Use env.repo/env.ref (which carry the github.repository /
          # github.ref fallbacks) like every other job; the raw inputs
          # are empty on repository_dispatch runs.
          repository: ${{ env.repo }}
          ref: ${{ env.ref }}

      - name: Fetch PR by number
        if: ${{ github.event.inputs.pr_number != 0 }}
        run: |
          PR_NUMBER=${{ github.event.inputs.pr_number }}
          echo "pr number $PR_NUMBER"
          git config --global --add safe.directory $(pwd)
          git fetch origin pull/${PR_NUMBER}/head:pr-${PR_NUMBER}
          git checkout pr-${PR_NUMBER}

      - name: Print Env
        run: |
          echo "== pyenv =="
          pyenv versions
          echo "== python =="
          python --version
          echo "== nvcc =="
          nvcc --version
          echo "== torch =="
          pip show torch
          echo "== pip list =="
          pip list

      # Prefer the internal wheel server; fall back to the artifact.
      - name: Download wheel
        continue-on-error: true
        run: |
          file_name=$(curl -s -F "runid=${{ needs.check-vm.outputs.run_id }}" -F "repo=${{ env.repo }}" -F "ref=${{ env.ref }}" -F "fuzz=1" "http://${{ needs.check-vm.outputs.ip }}/gpu/whl/download")

          if echo "$file_name" | grep -q "gptqmodel"; then
            mkdir dist || true
            cd dist
            curl -s -O http://${{ needs.check-vm.outputs.ip }}/whl/${{ env.repo }}/${{ needs.check-vm.outputs.run_id }}/$file_name
            ls -ahl .
            sha256=$(sha256sum $file_name)
            echo "sha256=$sha256"
            echo "DOWNLOADED=1" >> $GITHUB_ENV
          fi

      - name: Download artifact
        if: env.DOWNLOADED == '' && !cancelled()
        uses: actions/download-artifact@v4
        with:
          name: dist
          path: dist
          run-id: ${{ needs.check-vm.outputs.run_id }}

      - name: Install wheel
        run: |
          echo "===== install optimum bitblas parameterized uvicorn ====="
          uv pip install optimum bitblas==0.0.1.dev13 parameterized uvicorn -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }}
          echo "===== install dist/whl ====="
          uv pip install dist/*.whl
          echo "===== init test env ====="
          echo "===== install transformers==4.38.2 typing-extensions numpy==1.26.4 peft==0.13.2 ====="
          uv pip install transformers==4.38.2 typing-extensions numpy==1.26.4 peft==0.13.2 -U -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }}
          if [ "${{ matrix.test_script }}" == "models/test_xverse" ]; then
            echo "===== install tokenizers==0.15.2 ====="
            uv pip install tokenizers==0.15.2 -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }}
          fi
          if [ "${{ matrix.test_script }}" == "test_quant_formats" ] || [ "${{ matrix.test_script }}" == "test_perplexity" ]; then
            echo "===== install auto_round ====="
            uv pip install auto_round
          fi

      - name: Clean cache
        if: always()
        run: pip cache purge && uv cache clean

      - name: Run tests
        if: ${{ (!github.event.inputs.test_names || contains(github.event.inputs.test_names, matrix.test_script)) && !cancelled() }}
        run: |
          start_time=$(date +%s)
          pytest --durations=0 tests/${{ matrix.test_script }}.py || { echo "ERROR=1" >> $GITHUB_ENV; exit 1; }
          execution_time=$(( $(date +%s) - start_time ))
          echo "$((execution_time / 60))m $((execution_time % 60))s"
          curl "http://${{ needs.check-vm.outputs.ip }}/gpu/log_test_vram?id=${{ github.run_id }}&gpu=${{ env.CUDA_VISIBLE_DEVICES }}&range=$execution_time&unit=second&name=${{ matrix.test_script }}"

  torch2_5:
    needs:
      - build
      - list-test-files
      - check-vm
    runs-on: [ self-hosted, rocm ]
    if: always() && !cancelled() && (needs.build.result == 'success' || github.event.inputs.artifact_id != '') && github.event.inputs.m4-only != 'true' && needs.list-test-files.outputs.torch-2-5-files != '[]'
    container:
      image: ${{ needs.check-vm.outputs.ip }}:5000/modelcloud/gptqmodel:github-ci-v6-rocm
      options: --device /dev/dri --device /dev/kfd --ipc=host
      volumes:
        - /home/ci/models:/monster/data/model
    strategy:
      fail-fast: false
      max-parallel: ${{ fromJson(needs.check-vm.outputs.max-parallel).size || 10 }}
      matrix:
        test_script: ${{ fromJSON(needs.list-test-files.outputs.torch-2-5-files) }}
    steps:
      - name: Checkout Codes
        uses: actions/checkout@v4
        with:
          repository: ${{ env.repo }}
          ref: ${{ env.ref }}

      - name: Fetch PR by number
        if: ${{ github.event.inputs.pr_number != 0 }}
        run: |
          PR_NUMBER=${{ github.event.inputs.pr_number }}
          echo "pr number $PR_NUMBER"
          git config --global --add safe.directory $(pwd)
          git fetch origin pull/${PR_NUMBER}/head:pr-${PR_NUMBER}
          git checkout pr-${PR_NUMBER}

      - name: Print Env
        run: |
          echo "== pyenv =="
          pyenv versions
          echo "== python =="
          python --version
          echo "== nvcc =="
          nvcc --version
          echo "== torch =="
          pip show torch
          echo "== pip list =="
          pip list

      # Prefer the internal wheel server; fall back to the artifact.
      - name: Download wheel
        continue-on-error: true
        run: |
          file_name=$(curl -s -F "runid=${{ needs.check-vm.outputs.run_id }}" -F "repo=${{ env.repo }}" -F "ref=${{ env.ref }}" -F "fuzz=1" "http://${{ needs.check-vm.outputs.ip }}/gpu/whl/download")
          if echo "$file_name" | grep -q "gptqmodel"; then
            mkdir dist || true
            cd dist
            curl -s -O http://${{ needs.check-vm.outputs.ip }}/whl/${{ env.repo }}/${{ needs.check-vm.outputs.run_id }}/$file_name
            ls -ahl .
            sha256=$(sha256sum $file_name)
            echo "sha256=$sha256"
            echo "DOWNLOADED=1" >> $GITHUB_ENV
          fi

      - name: Download artifact
        if: env.DOWNLOADED == '' && !cancelled()
        uses: actions/download-artifact@v4
        with:
          name: dist
          path: dist
          run-id: ${{ needs.check-vm.outputs.run_id }}

      - name: Install wheel
        run: |
          if [ "${{ matrix.test_script }}" == "test_quant_formats" ] || [ "${{ matrix.test_script }}" == "test_perplexity" ]; then
            echo "===== install auto_round ====="
            uv pip install auto_round
          fi
          if [ "${{ matrix.test_script }}" == "models/test_cohere2" ]; then
            echo "===== install transformers from git ====="
            uv pip install -U git+https://github.com/huggingface/transformers.git@5615a393691c81e00251e420c73e4d04c6fe22e5
          fi
          # NOTE(review): test_ipex_xpu is listed in IGNORED_TEST_FILES,
          # so this branch (and the one in Run tests) appears dead on
          # this ROCm workflow — confirm before removing.
          if [ "${{ matrix.test_script }}" == "test_ipex_xpu" ]; then
            source /etc/profile.d/pyenv.sh && pyenv activate xpu
          fi
          echo "===== install dist/whl ====="
          uv pip install dist/*.whl

      - name: Clean cache
        if: always()
        run: pip cache purge && uv cache clean

      - name: Run tests
        if: ${{ (!github.event.inputs.test_names || contains(github.event.inputs.test_names, matrix.test_script)) && !cancelled() }}
        run: |
          if [ "${{ matrix.test_script }}" == "test_ipex_xpu" ]; then
            export CUDA_VISIBLE_DEVICES=""
            source /etc/profile.d/pyenv.sh && pyenv activate xpu
            pip uninstall vllm -y
            pip list
          fi

          start_time=$(date +%s)
          pytest --durations=0 tests/${{ matrix.test_script }}.py || { echo "ERROR=1" >> $GITHUB_ENV; exit 1; }
          execution_time=$(( $(date +%s) - start_time ))
          echo "$((execution_time / 60))m $((execution_time % 60))s"
          curl "http://${{ needs.check-vm.outputs.ip }}/gpu/log_test_vram?id=${{ github.run_id }}&gpu=${{ env.CUDA_VISIBLE_DEVICES }}&range=$execution_time&unit=second&name=${{ matrix.test_script }}"

  show-statistics:
    runs-on: [ self-hosted, rocm ]
    if: always()
    container:
      image: modelcloud/gptqmodel:alpine-ci-v1
    needs:
      - check-vm
      - transformers_diff
      - torch2_5
    steps:
      - name: Print statistics
        # Query the server selected by check-vm rather than a hard-coded
        # IP (10.0.14.248 matched neither RUNNER nor the check-vm output).
        run: curl "http://${{ needs.check-vm.outputs.ip }}/gpu/get_vram_logs?id=${{ github.run_id }}"