Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor daily CI workflow #30012

Merged
merged 5 commits into from
Apr 5, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions .github/workflows/self-scheduled-caller.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
name: Self-hosted runner (scheduled)

# Caller workflow for the scheduled CI.
#
# Each job below invokes the reusable workflow `.github/workflows/self-scheduled.yml` exactly once and selects a
# single test suite through the `job` input. `secrets: inherit` forwards this workflow's secrets to the called
# workflow.

on:
  # No value on purpose: react to every `repository_dispatch` event, without filtering on activity types.
  repository_dispatch:
  schedule:
    # Daily at 02:17 (GitHub evaluates cron schedules in UTC).
    - cron: "17 2 * * *"
  push:
    branches:
      # Pushing to a branch whose name matches this pattern also triggers the workflow.
      - run_scheduled_ci*

# NOTE(review): every job currently sets `env_name_for_slack_report_channel` to `CI_SLACK_CHANNEL_DUMMY_TESTS`,
# which looks like a placeholder/test channel -- confirm the intended value for each job.
jobs:
  model-ci:
    name: Model CI
    uses: ./.github/workflows/self-scheduled.yml
    with:
      job: run_tests_gpu
      # See the comment for `ENV_NAME_FOR_CI_SLACK_REPORT_CHANNEL_ID` in `.github/workflows/slack-report.yml`.
      env_name_for_slack_report_channel: CI_SLACK_CHANNEL_DUMMY_TESTS
    secrets: inherit

  torch-pipeline:
    name: Torch pipeline CI
    uses: ./.github/workflows/self-scheduled.yml
    with:
      job: run_pipelines_torch_gpu
      # See the comment for `ENV_NAME_FOR_CI_SLACK_REPORT_CHANNEL_ID` in `.github/workflows/slack-report.yml`.
      env_name_for_slack_report_channel: CI_SLACK_CHANNEL_DUMMY_TESTS
    secrets: inherit

  tf-pipeline:
    name: TF pipeline CI
    uses: ./.github/workflows/self-scheduled.yml
    with:
      job: run_pipelines_tf_gpu
      # See the comment for `ENV_NAME_FOR_CI_SLACK_REPORT_CHANNEL_ID` in `.github/workflows/slack-report.yml`.
      env_name_for_slack_report_channel: CI_SLACK_CHANNEL_DUMMY_TESTS
    secrets: inherit

  example-ci:
    name: Example CI
    uses: ./.github/workflows/self-scheduled.yml
    with:
      job: run_examples_gpu
      # See the comment for `ENV_NAME_FOR_CI_SLACK_REPORT_CHANNEL_ID` in `.github/workflows/slack-report.yml`.
      env_name_for_slack_report_channel: CI_SLACK_CHANNEL_DUMMY_TESTS
    secrets: inherit

  deepspeed-ci:
    name: DeepSpeed CI
    uses: ./.github/workflows/self-scheduled.yml
    with:
      job: run_all_tests_torch_cuda_extensions_gpu
      # See the comment for `ENV_NAME_FOR_CI_SLACK_REPORT_CHANNEL_ID` in `.github/workflows/slack-report.yml`.
      env_name_for_slack_report_channel: CI_SLACK_CHANNEL_DUMMY_TESTS
    secrets: inherit

  quantization-ci:
    name: Quantization CI
    uses: ./.github/workflows/self-scheduled.yml
    with:
      job: run_tests_quantization_torch_gpu
      # See the comment for `ENV_NAME_FOR_CI_SLACK_REPORT_CHANNEL_ID` in `.github/workflows/slack-report.yml`.
      env_name_for_slack_report_channel: CI_SLACK_CHANNEL_DUMMY_TESTS
    secrets: inherit
169 changes: 69 additions & 100 deletions .github/workflows/self-scheduled.yml
ydshieh marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@ name: Self-hosted runner (scheduled)
# `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile`

on:
repository_dispatch:
schedule:
- cron: "17 2 * * *"
push:
branches:
- run_scheduled_ci*
workflow_call:
inputs:
job:
required: true
type: string
env_name_for_slack_report_channel:
required: true
type: string

env:
HF_HOME: /mnt/cache
Expand All @@ -31,6 +33,7 @@ env:

jobs:
setup:
if: ${{ inputs.job == 'run_tests_gpu' }}
name: Setup
strategy:
matrix:
Expand Down Expand Up @@ -71,6 +74,7 @@ jobs:
nvidia-smi

run_tests_gpu:
if: ${{ inputs.job == 'run_tests_gpu' }}
name: " "
needs: setup
strategy:
Expand All @@ -85,17 +89,17 @@ jobs:
slice_id: ${{ matrix.slice_id }}
secrets: inherit

run_examples_gpu:
name: Examples directory
run_pipelines_torch_gpu:
if: ${{ inputs.job == 'run_pipelines_torch_gpu' }}
name: PyTorch pipelines
strategy:
fail-fast: false
matrix:
machine_type: [single-gpu]
machine_type: [single-gpu, multi-gpu]
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
container:
image: huggingface/transformers-all-latest-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
needs: setup
image: huggingface/transformers-pytorch-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Update clone
working-directory: /transformers
Expand All @@ -118,39 +122,39 @@ jobs:
working-directory: /transformers
run: pip freeze

- name: Run examples tests on GPU
- name: Run all pipeline tests on GPU
working-directory: /transformers
run: |
pip install -r examples/pytorch/_tests_requirements.txt
python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch
python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines

- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt
run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt

- name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_examples_gpu"
- name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu"
if: ${{ always() }}
uses: actions/upload-artifact@v3
with:
name: ${{ matrix.machine_type }}_run_examples_gpu
path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu
name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu
path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu

run_pipelines_torch_gpu:
name: PyTorch pipelines
run_pipelines_tf_gpu:
if: ${{ inputs.job == 'run_pipelines_tf_gpu' }}
name: TensorFlow pipelines
strategy:
fail-fast: false
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
container:
image: huggingface/transformers-pytorch-gpu
image: huggingface/transformers-tensorflow-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
needs: setup
steps:
- name: Update clone
working-directory: /transformers
run: git fetch && git checkout ${{ github.sha }}
run: |
git fetch && git checkout ${{ github.sha }}

- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
Expand All @@ -172,36 +176,35 @@ jobs:
- name: Run all pipeline tests on GPU
working-directory: /transformers
run: |
python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines
python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_tf_pipeline_gpu tests/pipelines

- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt
if: ${{ always() }}
run: |
cat /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu/failures_short.txt

- name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu"
- name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu"
if: ${{ always() }}
uses: actions/upload-artifact@v3
with:
name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu
path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu
name: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu
path: /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu

run_pipelines_tf_gpu:
name: TensorFlow pipelines
run_examples_gpu:
if: ${{ inputs.job == 'run_examples_gpu' }}
name: Examples directory
strategy:
fail-fast: false
matrix:
machine_type: [single-gpu, multi-gpu]
machine_type: [single-gpu]
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
container:
image: huggingface/transformers-tensorflow-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
needs: setup
image: huggingface/transformers-all-latest-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Update clone
working-directory: /transformers
run: |
git fetch && git checkout ${{ github.sha }}
run: git fetch && git checkout ${{ github.sha }}

- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
Expand All @@ -220,31 +223,32 @@ jobs:
working-directory: /transformers
run: pip freeze

- name: Run all pipeline tests on GPU
- name: Run examples tests on GPU
working-directory: /transformers
run: |
python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_tf_pipeline_gpu tests/pipelines
pip install -r examples/pytorch/_tests_requirements.txt
python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch

- name: Failure short reports
if: ${{ always() }}
run: |
cat /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu/failures_short.txt
if: ${{ failure() }}
continue-on-error: true
run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt

- name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu"
- name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_examples_gpu"
if: ${{ always() }}
uses: actions/upload-artifact@v3
with:
name: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu
path: /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu
name: ${{ matrix.machine_type }}_run_examples_gpu
path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu

run_all_tests_torch_cuda_extensions_gpu:
if: ${{ inputs.job == 'run_all_tests_torch_cuda_extensions_gpu' }}
name: Torch CUDA extension tests
strategy:
fail-fast: false
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
needs: setup
container:
image: huggingface/transformers-pytorch-deepspeed-latest-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
Expand Down Expand Up @@ -298,6 +302,7 @@ jobs:
path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu

run_tests_quantization_torch_gpu:
if: ${{ inputs.job == 'run_tests_quantization_torch_gpu' }}
name: Quantization tests
strategy:
fail-fast: false
Expand All @@ -307,7 +312,6 @@ jobs:
container:
image: huggingface/transformers-quantization-latest-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
needs: setup
steps:
- name: Update clone
working-directory: /transformers
Expand Down Expand Up @@ -348,18 +352,11 @@ jobs:
path: /transformers/reports/${{ matrix.machine_type }}_tests_quantization_torch_gpu

run_extract_warnings:
# Let's only do this for the job `run_tests_gpu` to simplify the (already complex) logic.
if: ${{ always() && inputs.job == 'run_tests_gpu' }}
ydshieh marked this conversation as resolved.
Show resolved Hide resolved
name: Extract warnings in CI artifacts
runs-on: ubuntu-22.04
if: always()
needs: [
setup,
run_tests_gpu,
run_examples_gpu,
run_pipelines_tf_gpu,
run_pipelines_torch_gpu,
run_all_tests_torch_cuda_extensions_gpu,
run_tests_quantization_torch_gpu,
]
needs: [setup, run_tests_gpu]
steps:
- name: Checkout transformers
uses: actions/checkout@v3
Expand Down Expand Up @@ -396,52 +393,24 @@ jobs:
path: warnings_in_ci/selected_warnings.json

send_results:
name: Send results to webhook
runs-on: ubuntu-22.04
if: always()
name: Slack Report
needs: [
setup,
run_tests_gpu,
run_examples_gpu,
run_pipelines_tf_gpu,
run_pipelines_torch_gpu,
run_pipelines_tf_gpu,
run_examples_gpu,
run_all_tests_torch_cuda_extensions_gpu,
run_tests_quantization_torch_gpu,
run_extract_warnings
]
steps:
- name: Preliminary job status
shell: bash
# For the meaning of these environment variables, see the job `Setup`
run: |
echo "Setup status: ${{ needs.setup.result }}"

- uses: actions/checkout@v3
- uses: actions/download-artifact@v3
- name: Send message to Slack
env:
CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
CI_EVENT: scheduled
CI_SHA: ${{ github.sha }}
CI_WORKFLOW_REF: ${{ github.workflow_ref }}
SETUP_STATUS: ${{ needs.setup.result }}
# We pass `needs.setup.outputs.folder_slices` as the argument. `notification_service.py` has to convert
# `models/bert` to `models_bert`, because the artifact names use `_` instead of `/`.
run: |
sudo apt-get install -y curl
pip install slack_sdk
pip show slack_sdk
python utils/notification_service.py "${{ needs.setup.outputs.folder_slices }}"

# Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
- name: Failure table artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v3
with:
name: prev_ci_results
path: prev_ci_results
if: ${{ always() }}
uses: ./.github/workflows/slack-report.yml
with:
job: ${{ inputs.job }}
# This would be `skipped` if `setup` is skipped.
setup_status: ${{ needs.setup.result }}
env_name_for_slack_report_channel: ${{ inputs.env_name_for_slack_report_channel }}
# This would be an empty string if `setup` is skipped.
folder_slices: ${{ needs.setup.outputs.folder_slices }}
secrets: inherit
ydshieh marked this conversation as resolved.
Show resolved Hide resolved
Loading
Loading