Skip to content

Commit

Permalink
[bnb] Add bnb nightly workflow (#1282)
Browse files · Browse the repository at this point in the history
* add bnb nightly workflow

* add matrix strategy

* temp

* oops

* temp

* oops

* nit

* fixes

* up

* up

* up

* add pytest cov

* up

* oops

* put correct dir

* fix

* fix dir in makefile + failing test

* revert

* Update .github/workflows/nightly.yml

* Update nightly-bnb.yml

* Update log_reports.py

* Update Makefile

* Update .github/workflows/nightly-bnb.yml

* Update .github/workflows/nightly-bnb.yml

* Update .github/workflows/nightly.yml

* Update nightly.yml

* Update .github/workflows/nightly-bnb.yml

* Update nightly-bnb.yml
  • Loading branch information
younesbelkada authored Dec 20, 2023
1 parent 482a2a6 commit 029dcd5
Show file tree
Hide file tree
Showing 3 changed files with 243 additions and 105 deletions.
109 changes: 109 additions & 0 deletions .github/workflows/nightly-bnb.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# Nightly CI for PEFT against bitsandbytes (both the from-source and latest
# Docker images), running the slow GPU test suites on self-hosted runners and
# posting results to the bnb-daily-ci Slack channel via scripts/log_reports.py.
name: BNB from source self-hosted runner with slow tests (scheduled)

on:
  workflow_dispatch:
  schedule:
    - cron: "0 2 * * *"

env:
  RUN_SLOW: "yes"
  IS_GITHUB_CI: "1"
  # To be able to run tests on CUDA 12.2
  NVIDIA_DISABLE_REQUIRE: "1"
  SLACK_API_TOKEN: ${{ secrets.SLACK_API_TOKEN }}

jobs:
  run_all_tests_single_gpu:
    strategy:
      fail-fast: false
      matrix:
        docker-image-name: ["huggingface/peft-gpu-bnb-source:latest", "huggingface/peft-gpu-bnb-latest:latest"]
    runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci]
    env:
      CUDA_VISIBLE_DEVICES: "0"
      # TEST_TYPE is read by scripts/log_reports.py to label the Slack report.
      TEST_TYPE: "single_gpu_${{ matrix.docker-image-name }}"
    container:
      image: ${{ matrix.docker-image-name }}
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ -e NVIDIA_DISABLE_REQUIRE=true
    defaults:
      run:
        shell: bash
    steps:
      - uses: actions/checkout@v3
      - name: Pip install
        run: |
          source activate peft
          pip install -e . --no-deps
          pip install pytest-reportlog pytest-cov parameterized datasets scipy einops
          # clone into "transformers-clone" to avoid a module-name conflict with the installed package
          mkdir transformers-clone && git clone https://github.com/huggingface/transformers.git transformers-clone
      - name: Run examples on single GPU
        run: |
          source activate peft
          make tests_examples_single_gpu_bnb
      - name: Run core tests on single GPU
        run: |
          source activate peft
          make tests_core_single_gpu_bnb
      - name: Run transformers tests on single GPU
        run: |
          source activate peft
          make transformers_tests
      - name: Generate Report
        if: always()
        run: |
          pip install slack_sdk tabulate
          python scripts/log_reports.py --slack_channel_name bnb-daily-ci >> $GITHUB_STEP_SUMMARY

  run_all_tests_multi_gpu:
    strategy:
      fail-fast: false
      matrix:
        docker-image-name: ["huggingface/peft-gpu-bnb-source:latest", "huggingface/peft-gpu-bnb-latest:latest"]
    runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci]
    env:
      CUDA_VISIBLE_DEVICES: "0,1"
      TEST_TYPE: "multi_gpu_${{ matrix.docker-image-name }}"
    container:
      image: ${{ matrix.docker-image-name }}
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ -e NVIDIA_DISABLE_REQUIRE=true
    defaults:
      run:
        shell: bash
    steps:
      - uses: actions/checkout@v3
      - name: Pip install
        run: |
          source activate peft
          pip install -e . --no-deps
          pip install pytest-reportlog pytest-cov parameterized datasets scipy einops
          mkdir transformers-clone && git clone https://github.com/huggingface/transformers.git transformers-clone
      # NOTE(review): the original had a "Run core GPU tests on multi-gpu" step
      # whose run block contained only `source activate peft` and no test
      # command — a no-op leftover; removed.
      - name: Run examples on multi GPU
        run: |
          source activate peft
          make tests_examples_multi_gpu_bnb
      - name: Run core tests on multi GPU
        run: |
          source activate peft
          make tests_core_multi_gpu_bnb
      - name: Run transformers tests on multi GPU
        run: |
          source activate peft
          make transformers_tests
      - name: Generate Report
        if: always()
        run: |
          pip install slack_sdk tabulate
          python scripts/log_reports.py --slack_channel_name bnb-daily-ci >> $GITHUB_STEP_SUMMARY
16 changes: 16 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -35,5 +35,21 @@ tests_common_gpu:
python -m pytest tests/test_decoder_models.py $(if $(IS_GITHUB_CI),--report-log "common_decoder.log",)
python -m pytest tests/test_encoder_decoder_models.py $(if $(IS_GITHUB_CI),--report-log "common_encoder_decoder.log",)

# BNB-specific GPU test targets, invoked by .github/workflows/nightly-bnb.yml.
# Each selects tests by pytest markers; under CI (IS_GITHUB_CI set) a report
# log is written for scripts/log_reports.py to summarize to Slack.
# Declared .PHONY: these are command names, not files — without this a stray
# file with the same name would make the target appear permanently up to date.
.PHONY: tests_examples_multi_gpu_bnb tests_examples_single_gpu_bnb tests_core_multi_gpu_bnb tests_core_single_gpu_bnb transformers_tests tests_regression

tests_examples_multi_gpu_bnb:
	python -m pytest -m "multi_gpu_tests and bitsandbytes" tests/test_gpu_examples.py $(if $(IS_GITHUB_CI),--report-log "multi_gpu_examples.log",)

tests_examples_single_gpu_bnb:
	python -m pytest -m "single_gpu_tests and bitsandbytes" tests/test_gpu_examples.py $(if $(IS_GITHUB_CI),--report-log "single_gpu_examples.log",)

tests_core_multi_gpu_bnb:
	python -m pytest -m "multi_gpu_tests and bitsandbytes" tests/test_common_gpu.py $(if $(IS_GITHUB_CI),--report-log "core_multi_gpu.log",)

tests_core_single_gpu_bnb:
	python -m pytest -m "single_gpu_tests and bitsandbytes" tests/test_common_gpu.py $(if $(IS_GITHUB_CI),--report-log "core_single_gpu.log",)

# For testing transformers tests for bnb runners
# (transformers-clone is created by the workflow's "Pip install" step).
transformers_tests:
	RUN_SLOW=1 python -m pytest transformers-clone/tests/quantization/bnb $(if $(IS_GITHUB_CI),--report-log "transformers_tests.log",)

tests_regression:
	python -m pytest -s --regression tests/regression/ $(if $(IS_GITHUB_CI),--report-log "regression_tests.log",)
223 changes: 118 additions & 105 deletions scripts/log_reports.py
Original file line number Diff line number Diff line change
@@ -1,123 +1,136 @@
"""Summarize pytest report logs (*.log, one JSON record per line, produced by
pytest-reportlog) and post the results to a Slack channel.

Run by the nightly CI workflows after the test jobs complete.
"""
import json, os
import argparse
from pathlib import Path
from datetime import date
from tabulate import tabulate


MAX_LEN_MESSAGE = 2900  # slack endpoint has a limit of 3001 characters

# NOTE(review): the scraped diff also showed the pre-refactor module-level
# parsing loop here (duplicating the body of main()); it was removed by this
# commit and is dropped from the reconstruction.

# CLI: the workflows pass --slack_channel_name (e.g. "bnb-daily-ci").
parser = argparse.ArgumentParser()
parser.add_argument(
    "--slack_channel_name",
    default="peft-ci-daily",
)


def main(slack_channel_name=None):
    """Parse every ``*.log`` pytest-reportlog file in the CWD, build a Slack
    Block Kit payload summarizing failures, and post it.

    Args:
        slack_channel_name: Slack channel (without ``#``) to post to.
            Posting only happens when the ``TEST_TYPE`` env var is set
            (i.e. when running inside CI).

    Side effects: deletes each processed ``*.log`` file; prints the summary
    (consumed by ``$GITHUB_STEP_SUMMARY`` in the workflow).
    """
    failed = []
    passed = []

    # One entry per log file: [file name, number of failures, failed tests].
    group_info = []

    total_num_failed = 0
    # Parallel to group_info: True where a log file contained no records
    # (usually means the test job crashed before pytest could report).
    total_empty_files = []

    for log in Path().glob("*.log"):
        section_num_failed = 0
        i = 0
        with open(log, "r") as f:
            for line in f:
                line = json.loads(line)
                i += 1
                if line.get("nodeid", "") != "":
                    test = line["nodeid"]
                    # Only records with a duration are actual test outcomes.
                    if line.get("duration", None) is not None:
                        duration = f'{line["duration"]:.4f}'
                        if line.get("outcome", "") == "failed":
                            section_num_failed += 1
                            failed.append([test, duration, log.name.split('_')[0]])
                            total_num_failed += 1
                        else:
                            passed.append([test, duration, log.name.split('_')[0]])
        empty_file = i == 0  # no records read -> empty report log
        group_info.append([str(log), section_num_failed, failed])
        total_empty_files.append(empty_file)
        os.remove(log)
        failed = []  # rebind (not clear) so group_info keeps the old list

    no_error_payload = {
        "type": "section",
        "text": {
            "type": "plain_text",
            "text": "🌞 There were no failures!" if not any(total_empty_files) else "Something went wrong there is at least one empty file - please check GH action results.",
            "emoji": True,
        },
    }

    message = ""
    payload = [
        {
            "type": "header",
            "text": {
                "type": "plain_text",
                "text": "🤗 Results of the {} PEFT scheduled tests.".format(os.environ.get("TEST_TYPE", "")),
            },
        },
    ]
    if total_num_failed > 0:
        for i, (name, num_failed, failed_tests) in enumerate(group_info):
            if num_failed > 0:
                if num_failed == 1:
                    message += f"*{name}: {num_failed} failed test*\n"
                else:
                    message += f"*{name}: {num_failed} failed tests*\n"
                failed_table = []
                for test in failed_tests:
                    # nodeid format: "path::class::test" -> three table columns
                    failed_table.append(test[0].split("::"))
                failed_table = tabulate(failed_table, headers=["Test Location", "Test Case", "Test Name"], showindex="always", tablefmt="grid", maxcolwidths=[12, 12, 12])
                message += '\n```\n' + failed_table + '\n```'

            if total_empty_files[i]:
                message += f"\n*{name}: Warning! Empty file - please check the GitHub action job *\n"
        print(f'### {message}')
    else:
        payload.append(no_error_payload)

    # Only post to Slack from CI (TEST_TYPE is set by the workflow env).
    if os.environ.get("TEST_TYPE", "") != "":
        from slack_sdk import WebClient

        if len(message) > MAX_LEN_MESSAGE:
            print(f"Truncating long message from {len(message)} to {MAX_LEN_MESSAGE}")
            message = message[:MAX_LEN_MESSAGE] + "..."

        if len(message) != 0:
            md_report = {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": message,
                },
            }
            payload.append(md_report)
            action_button = {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": "*For more details:*",
                },
                "accessory": {
                    "type": "button",
                    "text": {"type": "plain_text", "text": "Check Action results", "emoji": True},
                    "url": f"https://github.com/huggingface/peft/actions/runs/{os.environ['GITHUB_RUN_ID']}",
                },
            }
            payload.append(action_button)

            date_report = {
                "type": "context",
                "elements": [
                    {
                        "type": "plain_text",
                        "text": f"Nightly {os.environ.get('TEST_TYPE')} test results for {date.today()}",
                    },
                ],
            }
            payload.append(date_report)

        print(payload)

        client = WebClient(token=os.environ.get("SLACK_API_TOKEN"))
        # Use the CLI-provided channel (the pre-refactor code hardcoded
        # "#peft-ci-daily", defeating --slack_channel_name).
        client.chat_postMessage(channel=f"#{slack_channel_name}", text=message, blocks=payload)
if __name__ == "__main__":
    # Forward the CLI-selected Slack channel to the reporting entry point.
    main(parser.parse_args().slack_channel_name)

0 comments on commit 029dcd5

Please sign in to comment.