From bc0f37d7e8c92ed30ba529bf292442ba902a7527 Mon Sep 17 00:00:00 2001
From: CSY
Date: Mon, 6 Jan 2025 13:56:14 +0800
Subject: [PATCH] [CI] add unit_tests_rocm.yml

---
 .github/workflows/unit_tests_rocm.yml | 454 ++++++++++++++++++++++++++
 1 file changed, 454 insertions(+)
 create mode 100644 .github/workflows/unit_tests_rocm.yml

diff --git a/.github/workflows/unit_tests_rocm.yml b/.github/workflows/unit_tests_rocm.yml
new file mode 100644
index 000000000..366eaae9a
--- /dev/null
+++ b/.github/workflows/unit_tests_rocm.yml
@@ -0,0 +1,454 @@
# ROCm unit-test workflow: builds a wheel on a self-hosted ROCm runner,
# then fans test files out across two matrix jobs (transformers_diff with
# pinned transformers==4.38.2, torch2_5 with the default image stack).
name: Unit Tests ROCM

run-name: "${{ github.event.inputs.note }}"

defaults:
  run:
    shell: bash -le {0}

on:
  repository_dispatch:
  workflow_dispatch:
    inputs:
      note:
        description: 'Note'
        required: false
        default: ''
      repo:
        description: 'GitHub repo {owner}/{repo}'
        required: false
        default: ''
      ref:
        description: 'GitHub ref: Branch, Tag or Commit SHA'
        required: false
        default: ''
      pr_number:
        description: 'PR Number'
        required: false
        type: number
      test_names:
        description: 'Input Test(s) to Run (default all)'
        required: false
        default: ''
      test_regex:
        description: 'Regex to filter test files'
        required: false
        default: ''
      artifact_id:
        description: 'Run id for artifact to be downloaded'
        required: false
        default: ''
      max-parallel:
        description: 'max parallel jobs'
        required: false
        default: '10'
      m4-only:
        description: 'only run m4(test only)'
        type: boolean
        required: false
        default: false

env:
  CUDA_DEVICE_ORDER: PCI_BUS_ID
  # Quoted: an unquoted 6.3 is parsed as a YAML float, which would
  # silently mangle versions like "6.30" -> "6.3".
  ROCM_VERSION: "6.3"
  PYTORCH_CUDA_ALLOC_CONF: 'expandable_segments:True'
  MAX_JOBS: 8
  RUNNER: 10.0.13.31
  TRANSFORMERS_DIFF_TESTS: "models/test_internlm.py,models/test_internlm2_5.py,models/test_xverse.py"
  TORCH_2_5_TESTS: "test_evalplus.py,test_perplexity.py,test_q4_ipex.py,test_save_loaded_quantized_model.py,test_quant_formats.py,models/test_hymba.py"
  IGNORED_TEST_FILES: "test_tgi.py,test_gptneox.py,models/test_mixtral.py,test_q4_torch_apple.py,test_ipex_xpu.py"
  GPTQMODEL_FORCE_BUILD: 1
  # Fall back to the triggering repo/ref so repository_dispatch runs
  # (which have no workflow inputs) still work.
  repo: ${{ github.event.inputs.repo || github.repository }}
  ref: ${{ github.event.inputs.ref || github.ref }}

concurrency:
  group: ${{ github.event.inputs.ref || github.ref }}-workflow-unit-tests-${{ github.event.inputs.test_names }}
  cancel-in-progress: true

jobs:
  # Resolves the artifact/whl server IP, the run id to fetch artifacts
  # from, and the max-parallel setting consumed by the matrix jobs.
  check-vm:
    runs-on: [ self-hosted, rocm ]
    container:
      image: modelcloud/gptqmodel:alpine-ci-v1
    outputs:
      ip: ${{ steps.get_ip.outputs.ip }}
      run_id: ${{ steps.get_ip.outputs.run_id }}
      max-parallel: ${{ steps.get_ip.outputs.max-parallel }}
    steps:
      - name: Print env
        run: |
          echo "repo: ${{ env.repo }}"
          echo "ref: ${{ env.ref }}"
          echo "artifact_id: ${{ github.event.inputs.artifact_id }}"
          echo "test_names: ${{ github.event.inputs.test_names }}"

      - name: Select server
        id: get_ip
        run: |
          echo "ip=$RUNNER" >> "$GITHUB_OUTPUT"

          # Writing to $GITHUB_OUTPUT does not set a shell variable, so
          # log $RUNNER directly (the original `$ip` was always empty).
          echo "ip: $RUNNER"

          # Reuse a previous run's wheel when artifact_id is supplied.
          if [ -n "${{ github.event.inputs.artifact_id }}" ]; then
            run_id="${{ github.event.inputs.artifact_id }}"
          else
            run_id="${{ github.run_id }}"
          fi
          echo "run_id=$run_id" >> "$GITHUB_OUTPUT"
          echo "artifact_id=$run_id"

          # Emit as JSON so downstream `fromJson(...).size` works.
          max_p=${{ github.event.inputs.max-parallel }}
          max_p="{\"size\": ${max_p:-10}}"
          echo "max-parallel=$max_p" >> "$GITHUB_OUTPUT"
          echo "max-parallel=$max_p"

  # Builds the two JSON lists of test files that feed the matrix jobs.
  list-test-files:
    runs-on: ubuntu-latest
    if: github.event.inputs.m4-only != 'true'
    outputs:
      torch-2-5-files: ${{ steps.files.outputs.torch-2-5-files }}
      transformers-files: ${{ steps.files.outputs.transformers-files }}

    steps:
      - name: Checkout Codes
        uses: actions/checkout@v4
        with:
          repository: ${{ env.repo }}
          ref: ${{ env.ref }}

      - name: Fetch PR by number
        if: ${{ github.event.inputs.pr_number != 0 }}
        run: |
          PR_NUMBER=${{ github.event.inputs.pr_number }}
          echo "pr number $PR_NUMBER"
          git config --global --add safe.directory $(pwd)
          git fetch origin pull/${PR_NUMBER}/head:pr-${PR_NUMBER}
          git checkout pr-${PR_NUMBER}

      - name: List files
        id: files
        run: |
          script="
          import json
          import os
          import re

          TRANSFORMERS_DIFF_TESTS = '${TRANSFORMERS_DIFF_TESTS}'
          IGNORED_TEST_FILES = '${IGNORED_TEST_FILES}'

          TEST_NAMES='${{ github.event.inputs.test_names }}'
          TEST_REGEX='${{ github.event.inputs.test_regex }}'

          input_test_files_list = [f.strip().removesuffix('.py') for f in TEST_NAMES.split(',') if f.strip()]

          transformers_test_files = [f.strip().removesuffix('.py') for f in f'{TRANSFORMERS_DIFF_TESTS}'.split(',') if f.strip()]
          transformers_test_files = [f for f in transformers_test_files if not input_test_files_list or f in input_test_files_list]

          # Exact-match ignore list. The previous substring check
          # (removesuffix('py') and 'in IGNORED_TEST_FILES') could
          # wrongly ignore tests/test_X.py when only models/test_X.py
          # was listed.
          ignored = [f.strip().removesuffix('.py') for f in IGNORED_TEST_FILES.split(',') if f.strip()]

          all_tests = [f.removesuffix('.py') for f in os.listdir('tests/') if f.startswith('test_') and f.endswith('.py') and f.removesuffix('.py') not in ignored]
          all_tests_models = ['models/'+f.removesuffix('.py') for f in os.listdir('tests/models') if f.startswith('test_') and f.endswith('.py') and 'models/'+f.removesuffix('.py') not in ignored]

          torch_2_5_test_files = [f for f in all_tests+all_tests_models if (not input_test_files_list or f in input_test_files_list) and f not in transformers_test_files]

          torch_2_5_test_files = [test for test in torch_2_5_test_files if re.match(rf'{TEST_REGEX}', test)]
          transformers_test_files = [test for test in transformers_test_files if re.match(rf'{TEST_REGEX}', test)]

          print(f'{json.dumps(torch_2_5_test_files)}|{json.dumps(transformers_test_files)}')
          "

          test_files=$(python3 -c "$script")
          IFS='|' read -r torch_2_5_test_files transformers_test_files <<< "$test_files"
          echo "torch-2-5-files=$torch_2_5_test_files" >> "$GITHUB_OUTPUT"
          echo "transformers-files=$transformers_test_files" >> "$GITHUB_OUTPUT"

          echo "Test files: $test_files"
          echo "Torch 2.5 Test files: $torch_2_5_test_files"
          echo "Transformers Test files: $transformers_test_files"
          echo "Ignored Test files: $IGNORED_TEST_FILES"

  # Builds the wheel once; skipped when reusing a previous artifact.
  build:
    runs-on: [ self-hosted, rocm ]
    needs: check-vm
    if: github.event.inputs.m4-only != 'true' && github.event.inputs.artifact_id == '' && !cancelled()
    container:
      image: ${{ needs.check-vm.outputs.ip }}:5000/modelcloud/gptqmodel:github-ci-v6-rocm
    steps:

      - name: Checkout Codes
        uses: actions/checkout@v4
        with:
          repository: ${{ env.repo }}
          ref: ${{ env.ref }}

      - name: Fetch PR by number
        if: ${{ github.event.inputs.pr_number != 0 }}
        run: |
          PR_NUMBER=${{ github.event.inputs.pr_number }}
          echo "pr number $PR_NUMBER"
          git config --global --add safe.directory $(pwd)
          git fetch origin pull/${PR_NUMBER}/head:pr-${PR_NUMBER}
          git checkout pr-${PR_NUMBER}

      - name: Print Env
        run: |
          echo "== pyenv =="
          pyenv versions
          echo "== python =="
          python --version
          echo "== nvcc =="
          nvcc --version
          echo "== torch =="
          pip show torch
          echo "##### pip list #####"
          pip list
          echo "----------"
          python -c 'from datasets import config; print("Datasets 缓存目录:", config.HF_DATASETS_CACHE)'

      - name: Compile
        timeout-minutes: 35
        run: |
          pyenv versions
          python setup.py bdist_wheel

      - name: Test install
        run: |
          ls -ahl dist
          whl=$(ls -t dist/*.whl | head -n 1 | xargs basename)
          sha256=$(sha256sum dist/$whl)
          echo "hash=$sha256"

          echo "WHL_HASH=$sha256" >> $GITHUB_ENV
          echo "WHL_NAME=$whl" >> $GITHUB_ENV

          twine check dist/$whl
          uv pip install dist/$whl

      # Best-effort mirror to the internal wheel server; the artifact
      # upload below is the authoritative copy.
      - name: Upload wheel
        continue-on-error: true
        run: |
          curl -s -F "runid=${{ github.run_id }}" -F "repo=${{ env.repo }}" -F "ref=${{ env.ref }}" -F "sha256=${{ env.WHL_HASH }}" -F "file=@dist/${{ env.WHL_NAME }}" http://${{ needs.check-vm.outputs.ip }}/gpu/whl/upload

      - name: Upload to artifact
        uses: actions/upload-artifact@v4
        with:
          name: dist
          path: dist

      - name: Clean cache
        if: always()
        run: pip cache purge && uv cache clean

  transformers_diff:
    needs:
      - build
      - list-test-files
      - check-vm
    runs-on: [ self-hosted, rocm ]
    if: always() && !cancelled() && (needs.build.result == 'success' || github.event.inputs.artifact_id != '') && github.event.inputs.m4-only != 'true' && needs.list-test-files.outputs.transformers-files != '[]'
    container:
      image: ${{ needs.check-vm.outputs.ip }}:5000/modelcloud/gptqmodel:github-ci-v6-rocm
      options: --device /dev/dri --device /dev/kfd --ipc=host
      volumes:
        - /home/ci/models:/monster/data/model
    strategy:
      fail-fast: false
      max-parallel: ${{ fromJson(needs.check-vm.outputs.max-parallel).size || 10 }}
      matrix:
        test_script: ${{ fromJSON(needs.list-test-files.outputs.transformers-files) }}
    steps:
      - name: Checkout Codes
        uses: actions/checkout@v4
        with:
          # Use env.repo/env.ref (which carry the github.repository /
          # github.ref fallbacks) like every other job; the raw inputs
          # are empty on repository_dispatch runs.
          repository: ${{ env.repo }}
          ref: ${{ env.ref }}

      - name: Fetch PR by number
        if: ${{ github.event.inputs.pr_number != 0 }}
        run: |
          PR_NUMBER=${{ github.event.inputs.pr_number }}
          echo "pr number $PR_NUMBER"
          git config --global --add safe.directory $(pwd)
          git fetch origin pull/${PR_NUMBER}/head:pr-${PR_NUMBER}
          git checkout pr-${PR_NUMBER}

      - name: Print Env
        run: |
          echo "== pyenv =="
          pyenv versions
          echo "== python =="
          python --version
          echo "== nvcc =="
          nvcc --version
          echo "== torch =="
          pip show torch
          echo "== pip list =="
          pip list

      # Prefer the internal wheel server; fall back to the artifact.
      - name: Download wheel
        continue-on-error: true
        run: |
          file_name=$(curl -s -F "runid=${{ needs.check-vm.outputs.run_id }}" -F "repo=${{ env.repo }}" -F "ref=${{ env.ref }}" -F "fuzz=1" "http://${{ needs.check-vm.outputs.ip }}/gpu/whl/download")

          if echo "$file_name" | grep -q "gptqmodel"; then
            mkdir dist || true
            cd dist
            curl -s -O http://${{ needs.check-vm.outputs.ip }}/whl/${{ env.repo }}/${{ needs.check-vm.outputs.run_id }}/$file_name
            ls -ahl .
            sha256=$(sha256sum $file_name)
            echo "sha256=$sha256"
            echo "DOWNLOADED=1" >> $GITHUB_ENV
          fi

      - name: Download artifact
        if: env.DOWNLOADED == '' && !cancelled()
        uses: actions/download-artifact@v4
        with:
          name: dist
          path: dist
          run-id: ${{ needs.check-vm.outputs.run_id }}

      - name: Install wheel
        run: |
          echo "===== install optimum bitblas parameterized uvicorn ====="
          uv pip install optimum bitblas==0.0.1.dev13 parameterized uvicorn -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }}
          echo "===== install dist/whl ====="
          uv pip install dist/*.whl
          echo "===== init test env ====="
          echo "===== install transformers==4.38.2 typing-extensions numpy==1.26.4 peft==0.13.2 ====="
          uv pip install transformers==4.38.2 typing-extensions numpy==1.26.4 peft==0.13.2 -U -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }}
          if [ "${{ matrix.test_script }}" == "models/test_xverse" ]; then
            echo "===== install tokenizers==0.15.2 ====="
            uv pip install tokenizers==0.15.2 -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }}
          fi
          if [ "${{ matrix.test_script }}" == "test_quant_formats" ] || [ "${{ matrix.test_script }}" == "test_perplexity" ]; then
            echo "===== install auto_round ====="
            uv pip install auto_round
          fi

      - name: Clean cache
        if: always()
        run: pip cache purge && uv cache clean

      - name: Run tests
        if: ${{ (!github.event.inputs.test_names || contains(github.event.inputs.test_names, matrix.test_script)) && !cancelled() }}
        run: |
          start_time=$(date +%s)
          pytest --durations=0 tests/${{ matrix.test_script }}.py || { echo "ERROR=1" >> $GITHUB_ENV; exit 1; }
          execution_time=$(( $(date +%s) - start_time ))
          echo "$((execution_time / 60))m $((execution_time % 60))s"
          curl "http://${{ needs.check-vm.outputs.ip }}/gpu/log_test_vram?id=${{ github.run_id }}&gpu=${{ env.CUDA_VISIBLE_DEVICES }}&range=$execution_time&unit=second&name=${{ matrix.test_script }}"

  torch2_5:
    needs:
      - build
      - list-test-files
      - check-vm
    runs-on: [ self-hosted, rocm ]
    if: always() && !cancelled() && (needs.build.result == 'success' || github.event.inputs.artifact_id != '') && github.event.inputs.m4-only != 'true' && needs.list-test-files.outputs.torch-2-5-files != '[]'
    container:
      image: ${{ needs.check-vm.outputs.ip }}:5000/modelcloud/gptqmodel:github-ci-v6-rocm
      options: --device /dev/dri --device /dev/kfd --ipc=host
      volumes:
        - /home/ci/models:/monster/data/model
    strategy:
      fail-fast: false
      max-parallel: ${{ fromJson(needs.check-vm.outputs.max-parallel).size || 10 }}
      matrix:
        test_script: ${{ fromJSON(needs.list-test-files.outputs.torch-2-5-files) }}
    steps:
      - name: Checkout Codes
        uses: actions/checkout@v4
        with:
          repository: ${{ env.repo }}
          ref: ${{ env.ref }}

      - name: Fetch PR by number
        if: ${{ github.event.inputs.pr_number != 0 }}
        run: |
          PR_NUMBER=${{ github.event.inputs.pr_number }}
          echo "pr number $PR_NUMBER"
          git config --global --add safe.directory $(pwd)
          git fetch origin pull/${PR_NUMBER}/head:pr-${PR_NUMBER}
          git checkout pr-${PR_NUMBER}

      - name: Print Env
        run: |
          echo "== pyenv =="
          pyenv versions
          echo "== python =="
          python --version
          echo "== nvcc =="
          nvcc --version
          echo "== torch =="
          pip show torch
          echo "== pip list =="
          pip list

      # Prefer the internal wheel server; fall back to the artifact.
      - name: Download wheel
        continue-on-error: true
        run: |
          file_name=$(curl -s -F "runid=${{ needs.check-vm.outputs.run_id }}" -F "repo=${{ env.repo }}" -F "ref=${{ env.ref }}" -F "fuzz=1" "http://${{ needs.check-vm.outputs.ip }}/gpu/whl/download")
          if echo "$file_name" | grep -q "gptqmodel"; then
            mkdir dist || true
            cd dist
            curl -s -O http://${{ needs.check-vm.outputs.ip }}/whl/${{ env.repo }}/${{ needs.check-vm.outputs.run_id }}/$file_name
            ls -ahl .
            sha256=$(sha256sum $file_name)
            echo "sha256=$sha256"
            echo "DOWNLOADED=1" >> $GITHUB_ENV
          fi

      - name: Download artifact
        if: env.DOWNLOADED == '' && !cancelled()
        uses: actions/download-artifact@v4
        with:
          name: dist
          path: dist
          run-id: ${{ needs.check-vm.outputs.run_id }}

      - name: Install wheel
        run: |
          if [ "${{ matrix.test_script }}" == "test_quant_formats" ] || [ "${{ matrix.test_script }}" == "test_perplexity" ]; then
            echo "===== install auto_round ====="
            uv pip install auto_round
          fi
          if [ "${{ matrix.test_script }}" == "models/test_cohere2" ]; then
            echo "===== install transformers from git ====="
            uv pip install -U git+https://github.com/huggingface/transformers.git@5615a393691c81e00251e420c73e4d04c6fe22e5
          fi
          # NOTE(review): test_ipex_xpu is listed in IGNORED_TEST_FILES,
          # so this branch (and the one in Run tests) appears dead on
          # this ROCm workflow — confirm before removing.
          if [ "${{ matrix.test_script }}" == "test_ipex_xpu" ]; then
            source /etc/profile.d/pyenv.sh && pyenv activate xpu
          fi
          echo "===== install dist/whl ====="
          uv pip install dist/*.whl

      - name: Clean cache
        if: always()
        run: pip cache purge && uv cache clean

      - name: Run tests
        if: ${{ (!github.event.inputs.test_names || contains(github.event.inputs.test_names, matrix.test_script)) && !cancelled() }}
        run: |
          if [ "${{ matrix.test_script }}" == "test_ipex_xpu" ]; then
            export CUDA_VISIBLE_DEVICES=""
            source /etc/profile.d/pyenv.sh && pyenv activate xpu
            pip uninstall vllm -y
            pip list
          fi

          start_time=$(date +%s)
          pytest --durations=0 tests/${{ matrix.test_script }}.py || { echo "ERROR=1" >> $GITHUB_ENV; exit 1; }
          execution_time=$(( $(date +%s) - start_time ))
          echo "$((execution_time / 60))m $((execution_time % 60))s"
          curl "http://${{ needs.check-vm.outputs.ip }}/gpu/log_test_vram?id=${{ github.run_id }}&gpu=${{ env.CUDA_VISIBLE_DEVICES }}&range=$execution_time&unit=second&name=${{ matrix.test_script }}"

  show-statistics:
    runs-on: [ self-hosted, rocm ]
    if: always()
    container:
      image: modelcloud/gptqmodel:alpine-ci-v1
    needs:
      - check-vm
      - transformers_diff
      - torch2_5
    steps:
      - name: Print statistics
        # Query the server selected by check-vm rather than a hard-coded
        # IP (10.0.14.248 matched neither RUNNER nor the check-vm output).
        run: curl "http://${{ needs.check-vm.outputs.ip }}/gpu/get_vram_logs?id=${{ github.run_id }}"