ci: add cuda status for the known Transformers test failures #76
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Linux Transformers Test | |
on: | |
pull_request: | |
branches: | |
- main | |
paths: | |
- '.github/scripts/check-transformers.py' | |
- '.github/scripts/spec.py' | |
- '.github/workflows/_linux_transformers.yml' | |
workflow_dispatch: | |
inputs: | |
pytorch: | |
required: false | |
type: string | |
default: 'nightly' | |
description: Pytorch branch/commit | |
python: | |
required: false | |
type: string | |
default: '3.10' | |
description: Python version | |
runner: | |
required: true | |
type: string | |
default: 'linux.idc.xpu' | |
description: Runner label | |
driver: | |
required: false | |
type: string | |
default: 'lts' | |
description: Driver lts/rolling | |
nightly_whl: | |
required: false | |
type: string | |
default: '' | |
description: Pytorch nightly wheel version | |
transformers: | |
required: false | |
type: string | |
default: 'v4.48.0' | |
description: Transformers version | |
permissions: read-all | |
jobs: | |
Torch-XPU-Transformers-Tests: | |
runs-on: ${{ inputs.runner != '' && inputs.runner || 'linux.idc.xpu' }} | |
env: | |
HF_HOME: ${{ github.workspace }}/.hf_home | |
HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} | |
NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} | |
DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} | |
python: ${{ inputs.python != '' && inputs.python || '3.10' }} | |
pytorch: ${{ inputs.pytorch != '' && inputs.pytorch || 'nightly' }} | |
transformers: ${{ inputs.transformers != '' && inputs.transformers || 'v4.47.0' }} | |
PYTORCH_DEBUG_XPU_FALLBACK: '1' | |
TRANSFORMERS_TEST_DEVICE_SPEC: 'spec.py' | |
steps: | |
- name: Checkout torch-xpu-ops | |
uses: actions/checkout@v4 | |
with: | |
path: torch-xpu-ops | |
- name: Checkout Transformers | |
uses: actions/checkout@v4 | |
with: | |
repository: huggingface/transformers | |
ref: ${{ env.transformers }} | |
path: transformers | |
- name: Prepare OS environment | |
run: | | |
sudo apt-get update | |
sudo apt-get install -y \ | |
espeak-ng \ | |
git-lfs \ | |
pkg-config \ | |
libavcodec-dev \ | |
libavdevice-dev \ | |
libavfilter-dev \ | |
libavformat-dev \ | |
libavutil-dev \ | |
libswresample-dev \ | |
libswscale-dev | |
git lfs install | |
- name: Prepare Conda ENV | |
run: | | |
which conda && conda clean -ay | |
conda remove --all -y -n huggingface_transformers_test || rm -rf $(dirname ${CONDA_EXE})/../envs/huggingface_transformers_test | |
conda create -y -n huggingface_transformers_test python=${{ env.python }} | |
source activate huggingface_transformers_test | |
pip install junitparser | |
- name: Prepare Stock XPU Pytorch | |
run: | | |
pwd | |
source activate huggingface_transformers_test | |
if [ -z "${{ inputs.nightly_whl }}" ]; then | |
pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu | |
else | |
pip install torch==$(echo ${{ inputs.nightly_whl }}) torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu | |
fi | |
- name: Prepare Transformers | |
run: | | |
pwd | |
source activate huggingface_transformers_test | |
cd transformers | |
pip install -e . | |
pip install -e ".[dev-torch,testing,video]" | |
rm -rf tests_log && mkdir -p tests_log | |
rm -rf reports | |
cp ${{ github.workspace }}/torch-xpu-ops/.github/scripts/spec.py ./ | |
- name: Report installed versions | |
run: | | |
source activate huggingface_transformers_test | |
echo "pip installed packages:" | |
pip list | tee ${{ github.workspace }}/transformers/tests_log/pip_list.txt | |
echo "lspci gpu devices:" | |
lspci -d ::0380 | tee ${{ github.workspace }}/transformers/tests_log/lspci_0380.txt | |
echo "GPU render nodes:" | |
cat /sys/class/drm/render*/device/device | tee ${{ github.workspace }}/transformers/tests_log/device_IDs.txt | |
echo "xpu-smi output:" | |
xpu-smi discovery -y --json --dump -1 | |
- name: Sanity check installed packages | |
run: | | |
source activate huggingface_transformers_test | |
# These checks are to exit earlier if for any reason Transformers | |
# reinstalled torch packages back to CUDA versions (not expected). | |
pip show torch | grep Version | grep xpu | |
pip show torchaudio | grep Version | grep xpu | |
pip show torchvision | grep Version | grep xpu | |
python -c 'import torch; exit(not torch.xpu.is_available())' | |
- name: Clean HF home directory and cache | |
run: | | |
rm -rf ${{ env.HF_HOME }} | |
- name: Run -k backbone tests | |
env: | |
TEST_CASE: 'tests_backbone' | |
run: | | |
source activate huggingface_transformers_test | |
cd transformers | |
python3 -m pytest -rsf --make-reports=$TEST_CASE --junit-xml=reports/$TEST_CASE.xml -k backbone tests || \ | |
(echo "FAILED_CASES=$FAILED_CASES,$TEST_CASE" >> $GITHUB_ENV) | |
- name: Run tests/*.py | |
env: | |
TEST_CASE: 'tests_py' | |
run: | | |
source activate huggingface_transformers_test | |
cd transformers | |
python3 -m pytest -rsf --make-reports=$TEST_CASE --junit-xml=reports/$TEST_CASE.xml tests/*.py || true | |
- name: Run tests/benchmark | |
env: | |
TEST_CASE: 'tests_benchmark' | |
run: | | |
source activate huggingface_transformers_test | |
cd transformers | |
python3 -m pytest -rsf --make-reports=$TEST_CASE --junit-xml=reports/$TEST_CASE.xml tests/benchmark || true | |
- name: Run tests/generation | |
env: | |
TEST_CASE: 'tests_generation' | |
run: | | |
source activate huggingface_transformers_test | |
cd transformers | |
# Excluding tests due to: | |
# * torch.distributed.* not yet supported by XPU | |
pattern="not TestFSDPGeneration" | |
python3 -m pytest -rsf --make-reports=$TEST_CASE --junit-xml=reports/$TEST_CASE.xml tests/generation -k "$pattern" || true | |
- name: Run tests/models | |
env: | |
TEST_CASE: 'tests_models' | |
run: | | |
source activate huggingface_transformers_test | |
cd transformers | |
# Excluding tests due to: | |
# * https://github.com/pytorch/pytorch/issues/140965 (aten::_linalg_eigvals) | |
pattern=" \ | |
not test_resize_embeddings_untied and \ | |
not test_resize_tokens_embeddings" | |
python3 -m pytest -rsf --make-reports=$TEST_CASE --junit-xml=reports/$TEST_CASE.xml tests/models -k "$pattern" || true | |
- name: Run tests/pipelines | |
env: | |
TEST_CASE: 'tests_pipelines' | |
run: | | |
source activate huggingface_transformers_test | |
cd transformers | |
# Some tests are known to fail w/o clear pattern | |
# TODO: drop ||true after triage and fixes | |
python3 -m pytest -rsf --make-reports=$TEST_CASE --junit-xml=reports/$TEST_CASE.xml tests/pipelines || true | |
- name: Run tests/trainer | |
env: | |
TEST_CASE: 'tests_trainer' | |
run: | | |
source activate huggingface_transformers_test | |
cd transformers | |
# Excluding tests due to: | |
# * Some ray tests hang, reason unknown | |
# * torch.distributed.* not yet supported by XPU | |
pattern=" \ | |
not ray and \ | |
not TestTrainerDistributed and \ | |
not TestTrainerDistributedXPU and \ | |
not TestFSDPTrainer" | |
python3 -m pytest -rsf --make-reports=$TEST_CASE tests/trainer --junit-xml=reports/$TEST_CASE.xml -k "$pattern" || \ | |
(echo "FAILED_CASES=$FAILED_CASES,$TEST_CASE" >> $GITHUB_ENV) | |
- name: Run tests/utils | |
env: | |
TEST_CASE: 'tests_utils' | |
run: | | |
source activate huggingface_transformers_test | |
cd transformers | |
# Excluding tests due to: | |
# * Network proxy connection issue, reason unknown | |
pattern="not test_load_img_url_timeout" | |
python3 -m pytest -rsf --make-reports=$TEST_CASE tests/utils --junit-xml=reports/$TEST_CASE.xml -k "$pattern" || \ | |
(echo "FAILED_CASES=$FAILED_CASES,$TEST_CASE" >> $GITHUB_ENV) | |
- name: Check for errors in tests | |
run: | | |
FAILED_CASES=$(echo $FAILED_CASES | sed 's/^,//') | |
echo "Failed cases: [$(echo $FAILED_CASES | sed 's/,/, /g')]" | |
test -z "$FAILED_CASES" | |
source activate huggingface_transformers_test | |
python3 torch-xpu-ops/.github/scripts/check-transformers.py transformers/reports/*.xml | |
- name: Clean HF home directory and cache | |
if: ${{ always() }} | |
run: | | |
du -sh ${{ env.HF_HOME }} || true | |
rm -rf ${{ env.HF_HOME }} | |
- name: Print results table | |
if: ${{ ! cancelled() }} | |
run: | | |
# Helper function to return number preceeding given pattern, i.e: | |
# === 25 failed, 11 warnings, 0 errors === | |
# Call as follows: | |
# parse_stat $line "failed" | |
function parse_stat() { | |
stat=$(cat $1 | grep $2 | sed "s/.* \([0-9]*\) $2.*/\1/") | |
if [ -n "$stat" ]; then echo $stat; else echo "0"; fi | |
} | |
cd transformers | |
{ | |
echo "### Results" | |
echo "| Test group | Errors | Failed | Deselected | Passed | Skipped |" | |
echo "| --- | --- | --- | --- | --- | --- |" | |
for stat in $(find reports -name stats.txt); do | |
# Each stat.txt is located in: reports/$test_group/stats.txt | |
test_group=$(echo $stat | cut -f 2 -d/) | |
# Get failed, passed, skipped, etc. counters | |
failed=$(parse_stat $stat failed) | |
passed=$(parse_stat $stat passed) | |
deselected=$(parse_stat $stat deselected) | |
skipped=$(parse_stat $stat skipped) | |
warnings=$(parse_stat $stat warnings) | |
errors=$(parse_stat $stat errors) | |
echo "| $test_group | $errors | $failed | $deselected | $passed | $skipped |" | |
done | |
} >> $GITHUB_STEP_SUMMARY | |
- name: Print baseline difference | |
if: ${{ ! cancelled() }} | |
run: | | |
source activate huggingface_transformers_test | |
python3 torch-xpu-ops/.github/scripts/check-transformers.py transformers/reports/*.xml >> $GITHUB_STEP_SUMMARY || true | |
- name: Print failure lines | |
if: ${{ ! cancelled() }} | |
run: | | |
cd transformers | |
{ | |
echo "### Failure lines" | |
echo "| Test group |File | Error | Comment |" | |
echo "| --- | --- | --- | --- |" | |
rm -rf _failures.txt | |
for failure in $(find reports -name failures_line.txt); do | |
# Each failure_line.txt is located in: reports/$test_group/failure_line.txt | |
test_group=$(echo $failure | cut -f2 -d/) | |
tail -n +2 $failure | sed "s/^/$test_group /" >> _failures.txt | |
done | |
# failures_line.txt file does not have test case information, | |
# so we can just sort the output and report uniq values | |
sort _failures.txt | uniq > _failures_uniq.txt | |
while read line; do | |
test_group=$(echo $line | cut -f1 -d" ") | |
file=$(echo $line | cut -f2 -d" " | sed "s/\(.*\):$/\1/") | |
error=$(echo $line | cut -f3 -d" " | sed "s/\(.*\):$/\1/") | |
# Failure comments often contain special characters which complicate | |
# parsing failure lines. But fortunately we know for sure where comments | |
# start. So we just output all contents starting from this position and | |
# wrap everything in <pre></pre> to avoid collisions with Markdown formatting. | |
comment="<pre>$(echo $line | cut -f4- -d' ' | sed 's/\(.*\):$/\1/')</pre>" | |
echo "| $test_group | $file | $error | $comment |" | |
done <_failures_uniq.txt | |
} >> $GITHUB_STEP_SUMMARY | |
- name: Print not implemented XPU backend ops | |
run: | | |
cd transformers | |
{ | |
echo "### Not implemented ops" | |
echo "| Test group | Operator | Status |" | |
echo "| --- | --- | --- |" | |
rm -rf _ops.txt && touch _ops.txt | |
for log in $(find reports -name failures_line.txt); do | |
# Each failure_line.txt is located in: reports/$test_group/failure_line.txt | |
test_group=$(echo $log | cut -f2 -d/) | |
ops=$(grep NotImplementedError $log | grep "for the XPU device" | sed "s/.*The operator '\(.*\)' is not.*/\1/") | |
for op in $ops; do | |
echo "| $test_group | <pre>$op</pre> | not implemented |" >> _ops.txt | |
done | |
done | |
for log in $(find reports -name warnings.txt); do | |
# Each warnings.txt is located in: reports/$test_group/warnings.txt | |
test_group=$(echo $log | cut -f2 -d/) | |
ops=$(grep UserWarning $log | grep "on the XPU backend" | sed "s/.*The operator '\(.*\) on the XPU.*/\1/") | |
for op in $ops; do | |
echo "| $test_group | <pre>$op</pre> | fallback to CPU happens |" >> _ops.txt | |
done | |
done | |
sort _ops.txt | uniq | |
} >> $GITHUB_STEP_SUMMARY | |
- name: Print annotations | |
if: ${{ ! cancelled() }} | |
run: | | |
source activate huggingface_transformers_test | |
{ | |
echo "### Annotations" | |
echo "| | |" | |
echo "| --- | --- |" | |
echo "| jobs.$GITHUB_JOB.versions.os | $(source /etc/os-release && echo $VERSION_ID) |" | |
echo "| jobs.$GITHUB_JOB.versions.linux-kernel | $(uname -r) |" | |
echo "| jobs.$GITHUB_JOB.versions.python | $(python --version | cut -f2 -d' ') |" | |
packages=" \ | |
level-zero \ | |
libigc1 \ | |
libigc2 \ | |
libze1 \ | |
libze-intel-gpu1 \ | |
intel-i915-dkms \ | |
intel-level-zero-gpu \ | |
intel-opencl-icd" | |
for package in $packages; do | |
package_version=$(dpkg -l | grep $package | grep ii | head -1 | sed "s/ */ /g" | cut -f3 -d" ") | |
echo "| jobs.$GITHUB_JOB.versions.$package | $package_version |" | |
done | |
packages="accelerate \ | |
numpy \ | |
torch \ | |
torchaudio \ | |
torchvision \ | |
transformers" | |
for package in $packages; do | |
package_version=$(python -c "import $package; print($package.__version__)" || true) | |
echo "| jobs.$GITHUB_JOB.versions.$package | $package_version |" | |
done | |
# printing annotations for GPU cards | |
var="[$(cat /sys/class/drm/render*/device/vendor || true)]" | |
echo "| jobs.$GITHUB_JOB.drm.render_nodes_vendor_ids | $(echo $var | sed 's/ /,/g') |" | |
var="[$(cat /sys/class/drm/render*/device/device || true)]" | |
echo "| jobs.$GITHUB_JOB.drm.render_nodes_device_ids | $(echo $var | sed 's/ /,/g') |" | |
var=$(python -c "import torch; print(torch.version.xpu)" || true) | |
echo "| jobs.$GITHUB_JOB.torch.version.xpu | $var |" | |
var=$(python -c "import torch; print(torch.xpu.device_count())" || true) | |
echo "| jobs.$GITHUB_JOB.torch.xpu.device_count | $var |" | |
# printing annotations with key environment variables | |
echo "| jobs.$GITHUB_JOB.env.ZE_AFFINITY_MASK | $ZE_AFFINITY_MASK |" | |
echo "| jobs.$GITHUB_JOB.env.NEOReadDebugKeys | $NEOReadDebugKeys |" | |
echo "| jobs.$GITHUB_JOB.env.PYTORCH_ENABLE_XPU_FALLBACK | $PYTORCH_ENABLE_XPU_FALLBACK |" | |
echo "| jobs.$GITHUB_JOB.env.PYTORCH_DEBUG_XPU_FALLBACK | $PYTORCH_DEBUG_XPU_FALLBACK |" | |
} >> $GITHUB_STEP_SUMMARY | |
- name: Upload Test log | |
if: ${{ ! cancelled() }} | |
uses: actions/upload-artifact@v4 | |
with: | |
name: Torch-XPU-Transformers-Log-${{ github.event.pull_request.number || github.sha }} | |
path: | | |
${{ github.workspace }}/transformers/reports | |
${{ github.workspace }}/transformers/tests_log |