From 029dcd5a1cce7442161c560194d8f61893afd8e4 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Wed, 20 Dec 2023 10:49:13 +0100 Subject: [PATCH] =?UTF-8?q?[`bnb`]=C2=A0Add=20bnb=20nightly=20workflow=20(?= =?UTF-8?q?#1282)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add bnb nightly workflow * add matrix strategy * temp * oops * temp * oops * nit * fixes * up * up * up * add pytest cov * up * oops * put correct dir * fix * fix dir in makefile + failing test * revert * Update .github/workflows/nightly.yml * Update nightly-bnb.yml * Update log_reports.py * Update Makefile * Update .github/workflows/nightly-bnb.yml * Update .github/workflows/nightly-bnb.yml * Update .github/workflows/nightly.yml * Update nightly.yml * Update .github/workflows/nightly-bnb.yml * Update nightly-bnb.yml --- .github/workflows/nightly-bnb.yml | 109 +++++++++++++++ Makefile | 16 +++ scripts/log_reports.py | 223 ++++++++++++++++-------------- 3 files changed, 243 insertions(+), 105 deletions(-) create mode 100644 .github/workflows/nightly-bnb.yml diff --git a/.github/workflows/nightly-bnb.yml b/.github/workflows/nightly-bnb.yml new file mode 100644 index 0000000000..206218fdca --- /dev/null +++ b/.github/workflows/nightly-bnb.yml @@ -0,0 +1,109 @@ +name: BNB from source self-hosted runner with slow tests (scheduled) + +on: + workflow_dispatch: + schedule: + - cron: "0 2 * * *" + +env: + RUN_SLOW: "yes" + IS_GITHUB_CI: "1" + # To be able to run tests on CUDA 12.2 + NVIDIA_DISABLE_REQUIRE: "1" + SLACK_API_TOKEN: ${{ secrets.SLACK_API_TOKEN }} + + +jobs: + run_all_tests_single_gpu: + strategy: + fail-fast: false + matrix: + docker-image-name: ["huggingface/peft-gpu-bnb-source:latest", "huggingface/peft-gpu-bnb-latest:latest"] + runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci] + env: + CUDA_VISIBLE_DEVICES: "0" + TEST_TYPE: "single_gpu_${{ matrix.docker-image-name }}" + container: + image: ${{ matrix.docker-image-name }} + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ -e NVIDIA_DISABLE_REQUIRE=true + defaults: + run: + shell: bash + steps: + - uses: actions/checkout@v3 + - name: Pip install + run: | + source activate peft + pip install -e . --no-deps + pip install pytest-reportlog pytest-cov parameterized datasets scipy einops + mkdir transformers-clone && git clone https://github.com/huggingface/transformers.git transformers-clone # rename to transformers clone to avoid modules conflict + + - name: Run examples on single GPU + run: | + source activate peft + make tests_examples_single_gpu_bnb + + - name: Run core tests on single GPU + run: | + source activate peft + make tests_core_single_gpu_bnb + + - name: Run transformers tests on single GPU + run: | + source activate peft + make transformers_tests + + - name: Generate Report + if: always() + run: | + pip install slack_sdk tabulate + python scripts/log_reports.py --slack_channel_name bnb-daily-ci >> $GITHUB_STEP_SUMMARY + + run_all_tests_multi_gpu: + strategy: + fail-fast: false + matrix: + docker-image-name: ["huggingface/peft-gpu-bnb-source:latest", "huggingface/peft-gpu-bnb-latest:latest"] + runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci] + env: + CUDA_VISIBLE_DEVICES: "0,1" + TEST_TYPE: "multi_gpu_${{ matrix.docker-image-name }}" + container: + image: ${{ matrix.docker-image-name }} + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ -e NVIDIA_DISABLE_REQUIRE=true + defaults: + run: + shell: bash + steps: + - uses: actions/checkout@v3 + - name: Pip install + run: | + source activate peft + pip install -e . --no-deps + pip install pytest-reportlog pytest-cov parameterized datasets scipy einops + mkdir transformers-clone && git clone https://github.com/huggingface/transformers.git transformers-clone + + - name: Run core GPU tests on multi-gpu + run: | + source activate peft + + - name: Run examples on multi GPU + run: | + source activate peft + make tests_examples_multi_gpu_bnb + + - name: Run core tests on multi GPU + run: | + source activate peft + make tests_core_multi_gpu_bnb + + - name: Run transformers tests on multi GPU + run: | + source activate peft + make transformers_tests + + - name: Generate Report + if: always() + run: | + pip install slack_sdk tabulate + python scripts/log_reports.py --slack_channel_name bnb-daily-ci >> $GITHUB_STEP_SUMMARY diff --git a/Makefile b/Makefile index e5343c5553..86c5aaba28 100644 --- a/Makefile +++ b/Makefile @@ -35,5 +35,21 @@ tests_common_gpu: python -m pytest tests/test_decoder_models.py $(if $(IS_GITHUB_CI),--report-log "common_decoder.log",) python -m pytest tests/test_encoder_decoder_models.py $(if $(IS_GITHUB_CI),--report-log "common_encoder_decoder.log",) +tests_examples_multi_gpu_bnb: + python -m pytest -m "multi_gpu_tests and bitsandbytes" tests/test_gpu_examples.py $(if $(IS_GITHUB_CI),--report-log "multi_gpu_examples.log",) + +tests_examples_single_gpu_bnb: + python -m pytest -m "single_gpu_tests and bitsandbytes" tests/test_gpu_examples.py $(if $(IS_GITHUB_CI),--report-log "single_gpu_examples.log",) + +tests_core_multi_gpu_bnb: + python -m pytest -m "multi_gpu_tests and bitsandbytes" tests/test_common_gpu.py $(if $(IS_GITHUB_CI),--report-log "core_multi_gpu.log",) + +tests_core_single_gpu_bnb: + python -m pytest -m "single_gpu_tests and bitsandbytes" tests/test_common_gpu.py $(if $(IS_GITHUB_CI),--report-log "core_single_gpu.log",) + +# For testing transformers tests for bnb runners +transformers_tests: + RUN_SLOW=1 python -m pytest transformers-clone/tests/quantization/bnb $(if $(IS_GITHUB_CI),--report-log "transformers_tests.log",) + tests_regression: python -m pytest -s --regression tests/regression/ $(if $(IS_GITHUB_CI),--report-log "regression_tests.log",) diff --git a/scripts/log_reports.py b/scripts/log_reports.py index 12b6d4c1ad..648af3af35 100644 --- a/scripts/log_reports.py +++ b/scripts/log_reports.py @@ -1,4 +1,5 @@ import json, os +import argparse from pathlib import Path from datetime import date from tabulate import tabulate @@ -6,118 +7,130 @@ MAX_LEN_MESSAGE = 2900 # slack endpoint has a limit of 3001 characters -failed = [] -passed = [] - -group_info = [] - -total_num_failed = 0 -empty_file = False or len(list(Path().glob("*.log"))) == 0 - -total_empty_files = [] - -for log in Path().glob("*.log"): - section_num_failed = 0 - i = 0 - with open(log, "r") as f: - for line in f: - line = json.loads(line) - i += 1 - if line.get("nodeid", "") != "": - test = line["nodeid"] - if line.get("duration", None) is not None: - duration = f'{line["duration"]:.4f}' - if line.get("outcome", "") == "failed": - section_num_failed += 1 - failed.append([test, duration, log.name.split('_')[0]]) - total_num_failed += 1 - else: - passed.append([test, duration, log.name.split('_')[0]]) - empty_file = i == 0 - group_info.append([str(log), section_num_failed, failed]) - total_empty_files.append(empty_file) - os.remove(log) +parser = argparse.ArgumentParser() +parser.add_argument( + "--slack_channel_name", + default="peft-ci-daily" +) + + +def main(slack_channel_name=None): failed = [] -no_error_payload = { - "type": "section", - "text": { - "type": "plain_text", - "text": "🌞 There were no failures!" if not any(total_empty_files) else "Something went wrong there is at least one empty file - please check GH action results.", - "emoji": True - } -} + passed = [] + + group_info = [] + + total_num_failed = 0 + empty_file = False or len(list(Path().glob("*.log"))) == 0 -message = "" -payload = [ - { - "type": "header", + total_empty_files = [] + + for log in Path().glob("*.log"): + section_num_failed = 0 + i = 0 + with open(log, "r") as f: + for line in f: + line = json.loads(line) + i += 1 + if line.get("nodeid", "") != "": + test = line["nodeid"] + if line.get("duration", None) is not None: + duration = f'{line["duration"]:.4f}' + if line.get("outcome", "") == "failed": + section_num_failed += 1 + failed.append([test, duration, log.name.split('_')[0]]) + total_num_failed += 1 + else: + passed.append([test, duration, log.name.split('_')[0]]) + empty_file = i == 0 + group_info.append([str(log), section_num_failed, failed]) + total_empty_files.append(empty_file) + os.remove(log) + failed = [] + no_error_payload = { + "type": "section", "text": { "type": "plain_text", - "text": "🤗 Results of the {} PEFT scheduled tests.".format(os.environ.get("TEST_TYPE", "")), - } - }, -] -if total_num_failed > 0: - for i, (name, num_failed, failed_tests) in enumerate(group_info): - if num_failed > 0: - if num_failed == 1: - message += f"*{name}: {num_failed} failed test*\n" - else: - message += f"*{name}: {num_failed} failed tests*\n" - failed_table = [] - for test in failed_tests: - failed_table.append(test[0].split("::")) - failed_table = tabulate(failed_table, headers=["Test Location", "Test Case", "Test Name"], showindex="always", tablefmt="grid", maxcolwidths=[12, 12, 12]) - message += '\n```\n' +failed_table + '\n```' - - if total_empty_files[i]: - message += f"\n*{name}: Warning! Empty file - please check the GitHub action job *\n" - print(f'### {message}') -else: - payload.append(no_error_payload) - -if os.environ.get("TEST_TYPE", "") != "": - from slack_sdk import WebClient - - if len(message) > MAX_LEN_MESSAGE: - print(f"Truncating long message from {len(message)} to {MAX_LEN_MESSAGE}") - message = message[:MAX_LEN_MESSAGE] + "..." - - if len(message) != 0: - md_report = { - "type": "section", - "text": { - "type": "mrkdwn", - "text": message - }, + "text": "🌞 There were no failures!" if not any(total_empty_files) else "Something went wrong there is at least one empty file - please check GH action results.", + "emoji": True } - payload.append(md_report) - action_button = { - "type": "section", + } + + message = "" + payload = [ + { + "type": "header", "text": { - "type": "mrkdwn", - "text": "*For more details:*" - }, - "accessory": { - "type": "button", - "text": {"type": "plain_text", "text": "Check Action results", "emoji": True}, - "url": f"https://github.com/huggingface/peft/actions/runs/{os.environ['GITHUB_RUN_ID']}", - }, + "type": "plain_text", + "text": "🤗 Results of the {} PEFT scheduled tests.".format(os.environ.get("TEST_TYPE", "")), + } + }, + ] + if total_num_failed > 0: + for i, (name, num_failed, failed_tests) in enumerate(group_info): + if num_failed > 0: + if num_failed == 1: + message += f"*{name}: {num_failed} failed test*\n" + else: + message += f"*{name}: {num_failed} failed tests*\n" + failed_table = [] + for test in failed_tests: + failed_table.append(test[0].split("::")) + failed_table = tabulate(failed_table, headers=["Test Location", "Test Case", "Test Name"], showindex="always", tablefmt="grid", maxcolwidths=[12, 12, 12]) + message += '\n```\n' +failed_table + '\n```' + + if total_empty_files[i]: + message += f"\n*{name}: Warning! Empty file - please check the GitHub action job *\n" + print(f'### {message}') + else: + payload.append(no_error_payload) + + if os.environ.get("TEST_TYPE", "") != "": + from slack_sdk import WebClient + + if len(message) > MAX_LEN_MESSAGE: + print(f"Truncating long message from {len(message)} to {MAX_LEN_MESSAGE}") + message = message[:MAX_LEN_MESSAGE] + "..." + + if len(message) != 0: + md_report = { + "type": "section", + "text": { + "type": "mrkdwn", + "text": message + }, + } + payload.append(md_report) + action_button = { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "*For more details:*" + }, + "accessory": { + "type": "button", + "text": {"type": "plain_text", "text": "Check Action results", "emoji": True}, + "url": f"https://github.com/huggingface/peft/actions/runs/{os.environ['GITHUB_RUN_ID']}", + }, + } + payload.append(action_button) + + date_report = { + "type": "context", + "elements": [ + { + "type": "plain_text", + "text": f"Nightly {os.environ.get('TEST_TYPE')} test results for {date.today()}", + }, + ], } - payload.append(action_button) + payload.append(date_report) - date_report = { - "type": "context", - "elements": [ - { - "type": "plain_text", - "text": f"Nightly {os.environ.get('TEST_TYPE')} test results for {date.today()}", - }, - ], - } - payload.append(date_report) + print(payload) - print(payload) + client = WebClient(token=os.environ.get("SLACK_API_TOKEN")) + client.chat_postMessage(channel=f"#{slack_channel_name}", text=message, blocks=payload) - client = WebClient(token=os.environ.get("SLACK_API_TOKEN")) - client.chat_postMessage(channel="#peft-ci-daily", text=message, blocks=payload) +if __name__ == "__main__": + args = parser.parse_args() + main(args.slack_channel_name)