From 9f1d9c89da44ec433551b499e08d99d22e8d8fc9 Mon Sep 17 00:00:00 2001
From: Marc Sun <57196510+SunMarc@users.noreply.github.com>
Date: Tue, 9 Apr 2024 17:10:29 +0200
Subject: [PATCH] Fix quantization tests (#29914)

* revert back to torch 2.1.1
* run test
* switch to torch 2.2.1
* update dockerfile
* fix awq tests
* fix test
* run quanto tests
* update tests
* split quantization tests
* fix
* fix again
* final fix
* fix report artifact
* build docker again
* Revert "build docker again"

This reverts commit 399a5f9d9308da071d79034f238c719de0f3532e.

* debug
* revert
* style
* new notification system
* testing notification
* rebuild docker
* fix_prev_ci_results
* typo
* remove warning
* fix typo
* fix artifact name
* debug
* issue fixed
* debug again
* fix
* fix time
* test notif with failing test
* typo
* issues again
* final fix ?
* run all quantization tests again
* remove name to clear space
* revert modification done on workflow
* fix
* build docker
* build only quant docker
* fix quantization ci
* fix
* fix report
* better quantization_matrix
* add print
* revert to the basic one
---
 .github/workflows/self-scheduled.yml          |  36 ++-
 .github/workflows/slack-report.yml            |  25 +-
 .../Dockerfile                                |   8 +-
 src/transformers/utils/quantization_config.py |   2 +-
 tests/quantization/autoawq/test_awq.py        |  30 +--
 utils/notification_service.py                 |   2 -
 utils/notification_service_quantization.py    | 251 ++++++++++++++++++
 7 files changed, 324 insertions(+), 30 deletions(-)
 create mode 100644 utils/notification_service_quantization.py

diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml
index 3e563e94e15ca0..81620b740ba81d 100644
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -33,7 +33,6 @@ env:
 
 jobs:
   setup:
-    if: ${{ inputs.job == 'run_tests_gpu' }}
     name: Setup
     strategy:
       matrix:
@@ -45,6 +44,7 @@ jobs:
     outputs:
       folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
       slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
+      quantization_matrix: ${{ steps.set-matrix-quantization.outputs.quantization_matrix }}
     steps:
       - name: Update clone
         working-directory: /transformers
@@ -63,11 +63,19 @@ jobs:
         run: pip freeze
 
       - id: set-matrix
+        if: ${{ inputs.job == 'run_tests_gpu' }}
         name: Identify models to test
         working-directory: /transformers/tests
         run: |
           echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
           echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
+
+      - id: set-matrix-quantization
+        if: ${{ inputs.job == 'run_tests_quantization_torch_gpu' }}
+        name: Identify quantization method to test
+        working-directory: /transformers/tests
+        run: |
+          echo "quantization_matrix=$(python3 -c 'import os; tests = os.getcwd(); quantization_tests = os.listdir(os.path.join(tests, "quantization")); d = sorted(list(filter(os.path.isdir, [f"quantization/{x}" for x in quantization_tests]))) ; print(d)')" >> $GITHUB_OUTPUT
 
       - name: NVIDIA-SMI
         run: |
@@ -303,16 +311,26 @@ jobs:
 
   run_tests_quantization_torch_gpu:
     if: ${{ inputs.job == 'run_tests_quantization_torch_gpu' }}
-    name: Quantization tests
+    name: " "
     strategy:
       fail-fast: false
       matrix:
+        folders: ${{ fromJson(needs.setup.outputs.quantization_matrix) }}
         machine_type: [single-gpu, multi-gpu]
     runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
     container:
       image: huggingface/transformers-quantization-latest-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v
/mnt/cache/.cache/huggingface:/mnt/cache/ steps: + - name: Echo folder ${{ matrix.folders }} + shell: bash + run: | + echo "${{ matrix.folders }}" + matrix_folders=${{ matrix.folders }} + matrix_folders=${matrix_folders/'quantization/'/'quantization_'} + echo "$matrix_folders" + echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV + - name: Update clone working-directory: /transformers run: git fetch && git checkout ${{ github.sha }} @@ -337,19 +355,19 @@ jobs: - name: Run quantization tests on GPU working-directory: /transformers run: | - python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_quantization_torch_gpu tests/quantization + python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_quantization_torch_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} - name: Failure short reports if: ${{ failure() }} continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_tests_quantization_torch_gpu/failures_short.txt + run: cat /transformers/reports/${{ matrix.machine_type }}_tests_quantization_torch_gpu_${{ matrix.folders }}/failures_short.txt - - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_quantization_torch_gpu" + - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_quantization_torch_gpu_${{ env.matrix_folders }}" if: ${{ always() }} uses: actions/upload-artifact@v3 with: - name: ${{ matrix.machine_type }}_run_tests_quantization_torch_gpu - path: /transformers/reports/${{ matrix.machine_type }}_tests_quantization_torch_gpu + name: ${{ matrix.machine_type }}_run_tests_quantization_torch_gpu_${{ env.matrix_folders }} + path: /transformers/reports/${{ matrix.machine_type }}_tests_quantization_torch_gpu_${{ matrix.folders }} run_extract_warnings: # Let's only do this for the job `run_tests_gpu` to simplify the (already complex) logic. @@ -413,4 +431,6 @@ jobs: slack_report_channel: ${{ inputs.slack_report_channel }} # This would be an empty string if `setup` is skipped. folder_slices: ${{ needs.setup.outputs.folder_slices }} - secrets: inherit \ No newline at end of file + quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }} + + secrets: inherit diff --git a/.github/workflows/slack-report.yml b/.github/workflows/slack-report.yml index 0e964e8596a0f5..9e62417c76dfe7 100644 --- a/.github/workflows/slack-report.yml +++ b/.github/workflows/slack-report.yml @@ -15,6 +15,9 @@ on: folder_slices: required: true type: string + quantization_matrix: + required: true + type: string jobs: @@ -32,6 +35,7 @@ jobs: - uses: actions/checkout@v3 - uses: actions/download-artifact@v3 - name: Send message to Slack + if: ${{ inputs.job != 'run_tests_quantization_torch_gpu' }} env: CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }} CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }} @@ -53,7 +57,26 @@ jobs: pip install slack_sdk pip show slack_sdk python utils/notification_service.py "${{ inputs.folder_slices }}" - + + - uses: actions/checkout@v3 + - uses: actions/download-artifact@v3 + - name: Send message to Slack for quantization workflow + if: ${{ inputs.job == 'run_tests_quantization_torch_gpu' }} + env: + CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }} + ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} + SLACK_REPORT_CHANNEL: ${{ inputs.slack_report_channel }} + CI_EVENT: scheduled + CI_SHA: ${{ github.sha }} + SETUP_STATUS: ${{ inputs.setup_status }} + # We pass `needs.setup.outputs.quantization_matrix` as the argument. 
A processing in `notification_service_quantization.py` to change + # `quantization/bnb` to `quantization_bnb` is required, as the artifact names use `_` instead of `/`. + run: | + sudo apt-get install -y curl + pip install slack_sdk + pip show slack_sdk + python utils/notification_service_quantization.py "${{ inputs.quantization_matrix }}" + # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack. - name: Failure table artifacts # Only the model testing job is concerned for this step diff --git a/docker/transformers-quantization-latest-gpu/Dockerfile b/docker/transformers-quantization-latest-gpu/Dockerfile index 8a526c72981620..e1d084c4033902 100644 --- a/docker/transformers-quantization-latest-gpu/Dockerfile +++ b/docker/transformers-quantization-latest-gpu/Dockerfile @@ -9,7 +9,7 @@ SHELL ["sh", "-lc"] # The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant # to be used as arguments for docker build (so far). -ARG PYTORCH='2.2.0' +ARG PYTORCH='2.2.1' # Example: `cu102`, `cu113`, etc. ARG CUDA='cu118' @@ -30,6 +30,9 @@ RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch] RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate +# needed in bnb and awq +RUN python3 -m pip install --no-cache-dir einops + # Add bitsandbytes for mixed int8 testing RUN python3 -m pip install --no-cache-dir bitsandbytes @@ -43,7 +46,8 @@ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/opt RUN python3 -m pip install --no-cache-dir aqlm[gpu]==1.0.2 # Add autoawq for quantization testing -RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.0/autoawq-0.2.0+cu118-cp38-cp38-linux_x86_64.whl +# >=v0.2.3 needed for compatibility with torch 2.2.1 +RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.3/autoawq-0.2.3+cu118-cp38-cp38-linux_x86_64.whl # Add quanto for quantization testing RUN python3 -m pip install --no-cache-dir quanto diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 69bb0d5272be97..d91ecef16e37e1 100644 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -789,7 +789,7 @@ def post_init(self): def get_loading_attributes(self): attibutes_dict = copy.deepcopy(self.__dict__) - loading_attibutes = ["version", "do_fuse", "modules_to_fuse", "fuse_max_seq_len"] + loading_attibutes = ["version", "do_fuse", "modules_to_fuse", "fuse_max_seq_len", "exllama_config"] loading_attibutes_dict = {i: j for i, j in attibutes_dict.items() if i in loading_attibutes} return loading_attibutes_dict diff --git a/tests/quantization/autoawq/test_awq.py b/tests/quantization/autoawq/test_awq.py index 8ed8c394f424c5..8215f3f1458173 100644 --- a/tests/quantization/autoawq/test_awq.py +++ b/tests/quantization/autoawq/test_awq.py @@ -101,7 +101,7 @@ class AwqTest(unittest.TestCase): EXPECTED_OUTPUT = "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Journalism and minoring in Spanish" EXPECTED_OUTPUT_BF16 = "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. 
I am a junior and I am majoring in Exercise and Sport Science with a" - + EXPECTED_OUTPUT_EXLLAMA = "Hello my name is Katie and I am a 20 year old student from the UK. I am currently studying for a degree in English Literature and History at the University of York. I am a very out" device_map = "cuda" # called only once for all test in this class @@ -200,11 +200,11 @@ def test_quantized_model_exllama(self): quantization_config = AwqConfig(version="exllama") quantized_model = AutoModelForCausalLM.from_pretrained( - self.model_name, quantization_config=quantization_config - ).to(torch_device) + self.model_name, quantization_config=quantization_config, device_map=torch_device + ) output = quantized_model.generate(**input_ids, max_new_tokens=40) - self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT) + self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT_EXLLAMA) def test_quantized_model_no_device_map(self): """ @@ -239,7 +239,7 @@ def test_quantized_model_multi_gpu(self): quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, device_map="auto") - self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1, 2, 3}) + self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1}) output = quantized_model.generate(**input_ids, max_new_tokens=40) @@ -272,8 +272,8 @@ class AwqFusedTest(unittest.TestCase): model_name = "TheBloke/Mistral-7B-OpenOrca-AWQ" model_revision = "7048b2af77d0dd1c81b000b19d73f9cc8950b510" - custom_mapping_model_id = "TheBloke/Yi-34B-AWQ" - custom_model_revision = "f1b2cd1b7459ceecfdc1fac5bb8725f13707c589" + custom_mapping_model_id = "TheBloke/Mistral-7B-v0.1-AWQ" + custom_model_revision = "f186bcfa9edbe2a4334262ec1e67f23e53ed1ae7" mixtral_model_name = "casperhansen/mixtral-instruct-awq" mixtral_model_revision = "87dd4ec502dde74fb3a624835c776b000d190c3b" @@ -287,8 +287,8 @@ class AwqFusedTest(unittest.TestCase): "You end up exactly where you started. Where are you?" 
) - EXPECTED_GENERATION = prompt + "\n\nThis is a classic puzzle that has been around for" - EXPECTED_GENERATION_CUSTOM_MODEL = "HelloWorld.java:11)\r\n\tat org" + EXPECTED_GENERATION = prompt + "\n\nYou are at the starting point.\n\nIf" + EXPECTED_GENERATION_CUSTOM_MODEL = "Hello,\n\nI have a problem with my 20" EXPECTED_GENERATION_MIXTRAL = prompt + " You're on the North Pole.\n\nThe" def tearDown(self): @@ -423,28 +423,25 @@ def test_generation_custom_model(self): fuse_max_seq_len=512, modules_to_fuse={ "attention": ["q_proj", "k_proj", "v_proj", "o_proj"], - "layernorm": ["ln1", "ln2", "norm"], "mlp": ["gate_proj", "up_proj", "down_proj"], + "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"], "use_alibi": False, - "num_attention_heads": 56, + "hidden_size": 4096, + "num_attention_heads": 32, "num_key_value_heads": 8, - "hidden_size": 7168, }, ) model = AutoModelForCausalLM.from_pretrained( self.custom_mapping_model_id, quantization_config=quantization_config, - trust_remote_code=True, device_map="balanced", revision=self.custom_model_revision, ) self._check_fused_modules(model) - tokenizer = AutoTokenizer.from_pretrained( - self.custom_mapping_model_id, revision=self.custom_model_revision, trust_remote_code=True - ) + tokenizer = AutoTokenizer.from_pretrained(self.custom_mapping_model_id, revision=self.custom_model_revision) prompt = "Hello" inputs = tokenizer(prompt, return_tensors="pt").to(torch_device) @@ -452,6 +449,7 @@ def test_generation_custom_model(self): outputs = model.generate(**inputs, max_new_tokens=12) self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION_CUSTOM_MODEL) + @unittest.skip("Not enough GPU memory on CI runners") @require_torch_multi_gpu def test_generation_mixtral_fused(self): """ diff --git a/utils/notification_service.py b/utils/notification_service.py index 5378348ee9cc9e..158e01942b81fa 100644 --- a/utils/notification_service.py +++ b/utils/notification_service.py @@ -1056,7 +1056,6 @@ def prepare_reports(title, header, reports, to_truncate=True): "TensorFlow pipelines": "run_tests_tf_pipeline_gpu", "Examples directory": "run_examples_gpu", "Torch CUDA extension tests": "run_tests_torch_cuda_extensions_gpu_test_reports", - "Quantization tests": "run_tests_quantization_torch_gpu", } if ci_event in ["push", "Nightly CI"] or ci_event.startswith("Past CI"): @@ -1077,7 +1076,6 @@ def prepare_reports(title, header, reports, to_truncate=True): "run_pipelines_tf_gpu": "TensorFlow pipelines", "run_examples_gpu": "Examples directory", "run_all_tests_torch_cuda_extensions_gpu": "Torch CUDA extension tests", - "run_tests_quantization_torch_gpu": "Quantization tests", } # Remove some entries in `additional_files` if they are not concerned. diff --git a/utils/notification_service_quantization.py b/utils/notification_service_quantization.py new file mode 100644 index 00000000000000..11bc57e618a7e4 --- /dev/null +++ b/utils/notification_service_quantization.py @@ -0,0 +1,251 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import ast +import json +import os +import sys +import time +from typing import Dict + +from get_ci_error_statistics import get_jobs +from notification_service import ( + Message, + handle_stacktraces, + handle_test_results, + prepare_reports, + retrieve_artifact, + retrieve_available_artifacts, +) +from slack_sdk import WebClient + + +client = WebClient(token=os.environ["CI_SLACK_BOT_TOKEN"]) + + +class QuantizationMessage(Message): + def __init__( + self, + title: str, + results: Dict, + ): + self.title = title + + # Failures and success of the modeling tests + self.n_success = sum(r["success"] for r in results.values()) + self.single_gpu_failures = sum(r["failed"]["single"] for r in results.values()) + self.multi_gpu_failures = sum(r["failed"]["multi"] for r in results.values()) + self.n_failures = self.single_gpu_failures + self.multi_gpu_failures + + self.n_tests = self.n_failures + self.n_success + self.results = results + self.thread_ts = None + + @property + def payload(self) -> str: + blocks = [self.header] + + if self.n_failures > 0: + blocks.append(self.failures_overwiew) + blocks.append(self.failures_detailed) + + if self.n_failures == 0: + blocks.append(self.no_failures) + + return json.dumps(blocks) + + @property + def time(self) -> str: + all_results = self.results.values() + time_spent = [] + for r in all_results: + if len(r["time_spent"]): + time_spent.extend([x for x in r["time_spent"].split(", ") if len(x.strip())]) + total_secs = 0 + + for time in time_spent: + time_parts = time.split(":") + + # Time can be formatted as xx:xx:xx, as .xx, or as x.xx if the time spent was less than a minute. + if len(time_parts) == 1: + time_parts = [0, 0, time_parts[0]] + + hours, minutes, seconds = int(time_parts[0]), int(time_parts[1]), float(time_parts[2]) + total_secs += hours * 3600 + minutes * 60 + seconds + + hours, minutes, seconds = total_secs // 3600, (total_secs % 3600) // 60, total_secs % 60 + return f"{int(hours)}h{int(minutes)}m{int(seconds)}s" + + @property + def failures_overwiew(self) -> Dict: + return { + "type": "section", + "text": { + "type": "plain_text", + "text": ( + f"There were {self.n_failures} failures, out of {self.n_tests} tests.\n" + f"The suite ran in {self.time}." + ), + "emoji": True, + }, + "accessory": { + "type": "button", + "text": {"type": "plain_text", "text": "Check Action results", "emoji": True}, + "url": f"https://github.com/huggingface/transformers/actions/runs/{os.environ['GITHUB_RUN_ID']}", + }, + } + + @property + def failures_detailed(self) -> Dict: + failures = {k: v["failed"] for k, v in self.results.items()} + + individual_reports = [] + for key, value in failures.items(): + device_report = self.get_device_report(value) + if sum(value.values()): + report = f"{device_report}{key}" + individual_reports.append(report) + + header = "Single | Multi | Category\n" + failures_report = prepare_reports( + title="The following quantization tests had failures", header=header, reports=individual_reports + ) + + return {"type": "section", "text": {"type": "mrkdwn", "text": failures_report}} + + def post(self): + payload = self.payload + print("Sending the following payload") + print(json.dumps({"blocks": json.loads(payload)})) + + text = f"{self.n_failures} failures out of {self.n_tests} tests," if self.n_failures else "All tests passed." 
+ + self.thread_ts = client.chat_postMessage( + channel=SLACK_REPORT_CHANNEL_ID, + blocks=payload, + text=text, + ) + + def post_reply(self): + if self.thread_ts is None: + raise ValueError("Can only post reply if a post has been made.") + + for job, job_result in self.results.items(): + if len(job_result["failures"]): + for device, failures in job_result["failures"].items(): + blocks = self.get_reply_blocks( + job, + job_result, + failures, + device, + text=f'Number of failures: {job_result["failed"][device]}', + ) + + print("Sending the following reply") + print(json.dumps({"blocks": blocks})) + + client.chat_postMessage( + channel="#transformers-ci-daily-quantization", + text=f"Results for {job}", + blocks=blocks, + thread_ts=self.thread_ts["ts"], + ) + time.sleep(1) + + +if __name__ == "__main__": + setup_status = os.environ.get("SETUP_STATUS") + SLACK_REPORT_CHANNEL_ID = os.environ["SLACK_REPORT_CHANNEL"] + setup_failed = True if setup_status is not None and setup_status != "success" else False + + # This env. variable is set in workflow file (under the job `send_results`). + ci_event = os.environ["CI_EVENT"] + + title = f"🤗 Results of the {ci_event} tests." + + if setup_failed: + Message.error_out( + title, ci_title="", runner_not_available=False, runner_failed=False, setup_failed=setup_failed + ) + exit(0) + + arguments = sys.argv[1:][0] + try: + quantization_matrix = ast.literal_eval(arguments) + # Need to change from elements like `quantization/bnb` to `quantization_bnb` (the ones used as artifact names). + quantization_matrix = [x.replace("quantization/", "quantization_") for x in quantization_matrix] + except SyntaxError: + Message.error_out(title, ci_title="") + raise ValueError("Errored out.") + + available_artifacts = retrieve_available_artifacts() + + quantization_results = { + quant: { + "failed": {"single": 0, "multi": 0}, + "success": 0, + "time_spent": "", + "failures": {}, + "job_link": {}, + } + for quant in quantization_matrix + if f"run_tests_quantization_torch_gpu_{quant}" in available_artifacts + } + + github_actions_jobs = get_jobs( + workflow_run_id=os.environ["GITHUB_RUN_ID"], token=os.environ["ACCESS_REPO_INFO_TOKEN"] + ) + github_actions_job_links = {job["name"]: job["html_url"] for job in github_actions_jobs} + + artifact_name_to_job_map = {} + for job in github_actions_jobs: + for step in job["steps"]: + if step["name"].startswith("Test suite reports artifacts: "): + artifact_name = step["name"][len("Test suite reports artifacts: ") :] + artifact_name_to_job_map[artifact_name] = job + break + + for quant in quantization_results.keys(): + for artifact_path in available_artifacts[f"run_tests_quantization_torch_gpu_{quant}"].paths: + artifact = retrieve_artifact(artifact_path["path"], artifact_path["gpu"]) + if "stats" in artifact: + # Link to the GitHub Action job + job = artifact_name_to_job_map[artifact_path["path"]] + quantization_results[quant]["job_link"][artifact_path["gpu"]] = job["html_url"] + failed, success, time_spent = handle_test_results(artifact["stats"]) + quantization_results[quant]["failed"][artifact_path["gpu"]] += failed + quantization_results[quant]["success"] += success + quantization_results[quant]["time_spent"] += time_spent[1:-1] + ", " + + stacktraces = handle_stacktraces(artifact["failures_line"]) + + for line in artifact["summary_short"].split("\n"): + if line.startswith("FAILED "): + line = line[len("FAILED ") :] + line = line.split()[0].replace("\n", "") + + if artifact_path["gpu"] not in quantization_results[quant]["failures"]: + 
quantization_results[quant]["failures"][artifact_path["gpu"]] = [] + + quantization_results[quant]["failures"][artifact_path["gpu"]].append( + {"line": line, "trace": stacktraces.pop(0)} + ) + + message = QuantizationMessage( + title, + results=quantization_results, + ) + + message.post() + message.post_reply()