[bnb] Add bnb nightly workflow (#1282)

* add bnb nightly workflow
* add matrix strategy
* temp
* oops
* temp
* oops
* nit
* fixes
* up
* up
* up
* add pytest cov
* up
* oops
* put correct dir
* fix
* fix dir in makefile + failing test
* revert
* Update .github/workflows/nightly.yml
* Update nightly-bnb.yml
* Update log_reports.py
* Update Makefile
* Update .github/workflows/nightly-bnb.yml
* Update .github/workflows/nightly-bnb.yml
* Update .github/workflows/nightly.yml
* Update nightly.yml
* Update .github/workflows/nightly-bnb.yml
* Update nightly-bnb.yml
huggingface · Dec 20, 2023 · 029dcd5 · 029dcd5
1 parent 482a2a6
commit 029dcd5
Show file tree

Hide file tree

Showing 3 changed files with 243 additions and 105 deletions.
diff --git a/.github/workflows/nightly-bnb.yml b/.github/workflows/nightly-bnb.yml
@@ -0,0 +1,109 @@
+name: BNB from source self-hosted runner with slow tests (scheduled)
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: "0 2 * * *"
+
+env:
+  RUN_SLOW: "yes"
+  IS_GITHUB_CI: "1"
+  # To be able to run tests on CUDA 12.2
+  NVIDIA_DISABLE_REQUIRE: "1"
+  SLACK_API_TOKEN: ${{ secrets.SLACK_API_TOKEN }}
+
+
+jobs:
+  run_all_tests_single_gpu:
+    strategy:
+      fail-fast: false  # keep the other image's job running when one matrix entry fails
+      matrix:
+        docker-image-name: ["huggingface/peft-gpu-bnb-source:latest", "huggingface/peft-gpu-bnb-latest:latest"]
+    runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci]
+    env:
+      CUDA_VISIBLE_DEVICES: "0"
+      TEST_TYPE: "single_gpu_${{ matrix.docker-image-name }}"
+    container:
+      image: ${{ matrix.docker-image-name }}
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ -e NVIDIA_DISABLE_REQUIRE=true
+    defaults:
+      run:
+        shell: bash
+    steps:
+      - uses: actions/checkout@v3
+      - name: Pip install
+        run: |
+          source activate peft
+          pip install -e . --no-deps
+          pip install pytest-reportlog pytest-cov parameterized datasets scipy einops
+          mkdir transformers-clone && git clone https://github.com/huggingface/transformers.git transformers-clone # rename to transformers clone to avoid modules conflict
+
+      - name: Run examples on single GPU
+        run: |
+          source activate peft
+          make tests_examples_single_gpu_bnb
+      
+      - name: Run core tests on single GPU
+        run: |
+          source activate peft
+          make tests_core_single_gpu_bnb
+
+      - name: Run transformers tests on single GPU
+        run: |
+          source activate peft
+          make transformers_tests
+          
+      - name: Generate Report
+        if: always()
+        run: |
+          pip install slack_sdk tabulate
+          python scripts/log_reports.py --slack_channel_name bnb-daily-ci >> $GITHUB_STEP_SUMMARY
+
+  run_all_tests_multi_gpu:
+    strategy:
+      fail-fast: false
+      matrix:
+        docker-image-name: ["huggingface/peft-gpu-bnb-source:latest", "huggingface/peft-gpu-bnb-latest:latest"]
+    runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci]
+    env:
+      CUDA_VISIBLE_DEVICES: "0,1"
+      TEST_TYPE: "multi_gpu_${{ matrix.docker-image-name }}"
+    container:
+      image: ${{ matrix.docker-image-name }}
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ -e NVIDIA_DISABLE_REQUIRE=true
+    defaults:
+      run:
+        shell: bash
+    steps:
+      - uses: actions/checkout@v3
+      - name: Pip install
+        run: |
+          source activate peft
+          pip install -e . --no-deps
+          pip install pytest-reportlog pytest-cov parameterized datasets scipy einops
+          mkdir transformers-clone && git clone https://github.com/huggingface/transformers.git transformers-clone
+
+      - name: Run core GPU tests on multi-gpu  # NOTE(review): no-op step - it only activates the env and runs no tests; the core tests run in the "Run core tests on multi GPU" step. Consider removing.
+        run: |
+          source activate peft
+        
+      - name: Run examples on multi GPU
+        run: |
+          source activate peft
+          make tests_examples_multi_gpu_bnb
+      
+      - name: Run core tests on multi GPU
+        run: |
+          source activate peft
+          make tests_core_multi_gpu_bnb
+
+      - name: Run transformers tests on multi GPU
+        run: |
+          source activate peft
+          make transformers_tests
+          
+      - name: Generate Report
+        if: always()
+        run: |
+          pip install slack_sdk tabulate
+          python scripts/log_reports.py --slack_channel_name bnb-daily-ci >> $GITHUB_STEP_SUMMARY
diff --git a/Makefile b/Makefile
@@ -35,5 +35,21 @@ tests_common_gpu:
 	python -m pytest tests/test_decoder_models.py $(if $(IS_GITHUB_CI),--report-log "common_decoder.log",)
 	python -m pytest tests/test_encoder_decoder_models.py $(if $(IS_GITHUB_CI),--report-log "common_encoder_decoder.log",)
 
+tests_examples_multi_gpu_bnb:
+	python -m pytest -m "multi_gpu_tests and bitsandbytes" tests/test_gpu_examples.py $(if $(IS_GITHUB_CI),--report-log "multi_gpu_examples.log",)
+
+tests_examples_single_gpu_bnb:
+	python -m pytest -m "single_gpu_tests and bitsandbytes" tests/test_gpu_examples.py $(if $(IS_GITHUB_CI),--report-log "single_gpu_examples.log",)
+
+tests_core_multi_gpu_bnb:
+	python -m pytest -m "multi_gpu_tests and bitsandbytes" tests/test_common_gpu.py $(if $(IS_GITHUB_CI),--report-log "core_multi_gpu.log",)
+
+tests_core_single_gpu_bnb:
+	python -m pytest -m "single_gpu_tests and bitsandbytes" tests/test_common_gpu.py $(if $(IS_GITHUB_CI),--report-log "core_single_gpu.log",)
+
+# Run the transformers bitsandbytes quantization tests from the ./transformers-clone checkout (bnb runners)
+transformers_tests:
+	RUN_SLOW=1 python -m pytest transformers-clone/tests/quantization/bnb $(if $(IS_GITHUB_CI),--report-log "transformers_tests.log",)
+
 tests_regression:
 	python -m pytest -s --regression tests/regression/ $(if $(IS_GITHUB_CI),--report-log "regression_tests.log",)
diff --git a/scripts/log_reports.py b/scripts/log_reports.py
@@ -1,123 +1,136 @@
 import json, os
+import argparse
 from pathlib import Path
 from datetime import date
 from tabulate import tabulate
 
 
 MAX_LEN_MESSAGE = 2900  # slack endpoint has a limit of 3001 characters
 
-failed = []
-passed = []
-
-group_info = []
-
-total_num_failed = 0
-empty_file = False or len(list(Path().glob("*.log"))) == 0
-
-total_empty_files = []
-
-for log in Path().glob("*.log"):
-    section_num_failed = 0
-    i = 0
-    with open(log, "r") as f:
-        for line in f:
-            line = json.loads(line)
-            i += 1
-            if line.get("nodeid", "") != "":
-                test = line["nodeid"]
-                if line.get("duration", None) is not None:
-                    duration = f'{line["duration"]:.4f}'
-                    if line.get("outcome", "") == "failed":
-                        section_num_failed += 1
-                        failed.append([test, duration, log.name.split('_')[0]])
-                        total_num_failed += 1
-                    else:
-                        passed.append([test, duration, log.name.split('_')[0]])
-        empty_file = i == 0
-    group_info.append([str(log), section_num_failed, failed])
-    total_empty_files.append(empty_file)
-    os.remove(log)
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--slack_channel_name",
+    default="peft-ci-daily"
+)
+
+
+def main(slack_channel_name=None):
     failed = []
-no_error_payload = {
-    "type": "section",
-    "text": {
-        "type": "plain_text",
-        "text": "🌞 There were no failures!" if not any(total_empty_files) else "Something went wrong there is at least one empty file - please check GH action results.",
-        "emoji": True
-    }
-}
+    passed = []
+
+    group_info = []
+
+    total_num_failed = 0
+    empty_file = False or len(list(Path().glob("*.log"))) == 0
 
-message = ""
-payload = [
-    {
-        "type": "header",
+    total_empty_files = []
+
+    for log in Path().glob("*.log"):
+        section_num_failed = 0
+        i = 0
+        with open(log, "r") as f:
+            for line in f:
+                line = json.loads(line)
+                i += 1
+                if line.get("nodeid", "") != "":
+                    test = line["nodeid"]
+                    if line.get("duration", None) is not None:
+                        duration = f'{line["duration"]:.4f}'
+                        if line.get("outcome", "") == "failed":
+                            section_num_failed += 1
+                            failed.append([test, duration, log.name.split('_')[0]])
+                            total_num_failed += 1
+                        else:
+                            passed.append([test, duration, log.name.split('_')[0]])
+            empty_file = i == 0
+        group_info.append([str(log), section_num_failed, failed])
+        total_empty_files.append(empty_file)
+        os.remove(log)
+        failed = []
+    no_error_payload = {
+        "type": "section",
         "text": {
             "type": "plain_text",
-            "text": "🤗 Results of the {} PEFT scheduled tests.".format(os.environ.get("TEST_TYPE", "")),
-        }
-    },
-]
-if total_num_failed > 0:
-    for i, (name, num_failed, failed_tests) in enumerate(group_info):
-        if num_failed > 0:
-            if num_failed == 1:
-                message += f"*{name}: {num_failed} failed test*\n"
-            else:
-                message += f"*{name}: {num_failed} failed tests*\n"
-            failed_table = []
-            for test in failed_tests:
-                failed_table.append(test[0].split("::"))
-            failed_table = tabulate(failed_table, headers=["Test Location", "Test Case", "Test Name"], showindex="always", tablefmt="grid", maxcolwidths=[12, 12, 12])
-            message += '\n```\n' +failed_table + '\n```'
-
-        if total_empty_files[i]:
-            message += f"\n*{name}: Warning! Empty file - please check the GitHub action job *\n"
-    print(f'### {message}')
-else:
-    payload.append(no_error_payload)
-
-if os.environ.get("TEST_TYPE", "") != "":
-    from slack_sdk import WebClient
-
-    if len(message) > MAX_LEN_MESSAGE:
-        print(f"Truncating long message from {len(message)} to {MAX_LEN_MESSAGE}")
-        message = message[:MAX_LEN_MESSAGE] + "..."
-
-    if len(message) != 0:
-        md_report = {
-            "type": "section",
-            "text": {
-                "type": "mrkdwn",
-                "text": message
-            },
+            "text": "🌞 There were no failures!" if not any(total_empty_files) else "Something went wrong there is at least one empty file - please check GH action results.",
+            "emoji": True
         }
-        payload.append(md_report)
-        action_button = {
-            "type": "section",
+    }
+
+    message = ""
+    payload = [
+        {
+            "type": "header",
             "text": {
-                "type": "mrkdwn",
-                "text": "*For more details:*"
-            },
-            "accessory": {
-                "type": "button",
-                "text": {"type": "plain_text", "text": "Check Action results", "emoji": True},
-                "url": f"https://github.com/huggingface/peft/actions/runs/{os.environ['GITHUB_RUN_ID']}",
-            },
+                "type": "plain_text",
+                "text": "🤗 Results of the {} PEFT scheduled tests.".format(os.environ.get("TEST_TYPE", "")),
+            }
+        },
+    ]
+    if total_num_failed > 0:
+        for i, (name, num_failed, failed_tests) in enumerate(group_info):
+            if num_failed > 0:
+                if num_failed == 1:
+                    message += f"*{name}: {num_failed} failed test*\n"
+                else:
+                    message += f"*{name}: {num_failed} failed tests*\n"
+                failed_table = []
+                for test in failed_tests:
+                    failed_table.append(test[0].split("::"))
+                failed_table = tabulate(failed_table, headers=["Test Location", "Test Case", "Test Name"], showindex="always", tablefmt="grid", maxcolwidths=[12, 12, 12])
+                message += '\n```\n' +failed_table + '\n```'
+
+            if total_empty_files[i]:
+                message += f"\n*{name}: Warning! Empty file - please check the GitHub action job *\n"
+        print(f'### {message}')
+    else:
+        payload.append(no_error_payload)
+
+    if os.environ.get("TEST_TYPE", "") != "":
+        from slack_sdk import WebClient
+
+        if len(message) > MAX_LEN_MESSAGE:
+            print(f"Truncating long message from {len(message)} to {MAX_LEN_MESSAGE}")
+            message = message[:MAX_LEN_MESSAGE] + "..."
+
+        if len(message) != 0:
+            md_report = {
+                "type": "section",
+                "text": {
+                    "type": "mrkdwn",
+                    "text": message
+                },
+            }
+            payload.append(md_report)
+            action_button = {
+                "type": "section",
+                "text": {
+                    "type": "mrkdwn",
+                    "text": "*For more details:*"
+                },
+                "accessory": {
+                    "type": "button",
+                    "text": {"type": "plain_text", "text": "Check Action results", "emoji": True},
+                    "url": f"https://github.com/huggingface/peft/actions/runs/{os.environ['GITHUB_RUN_ID']}",
+                },
+            }
+            payload.append(action_button)
+
+        date_report = {
+            "type": "context",
+            "elements": [
+                {
+                    "type": "plain_text",
+                    "text": f"Nightly {os.environ.get('TEST_TYPE')} test results for {date.today()}",
+                },
+            ],
         }
-        payload.append(action_button)
+        payload.append(date_report)
 
-    date_report = {
-        "type": "context",
-        "elements": [
-            {
-                "type": "plain_text",
-                "text": f"Nightly {os.environ.get('TEST_TYPE')} test results for {date.today()}",
-            },
-        ],
-    }
-    payload.append(date_report)
+        print(payload)
 
-    print(payload)
+        client = WebClient(token=os.environ.get("SLACK_API_TOKEN"))
+        client.chat_postMessage(channel=f"#{slack_channel_name}", text=message, blocks=payload)
 
-    client = WebClient(token=os.environ.get("SLACK_API_TOKEN"))
-    client.chat_postMessage(channel="#peft-ci-daily", text=message, blocks=payload)
+if __name__ == "__main__":
+    args = parser.parse_args()
+    main(args.slack_channel_name)