From 63f87e0ca4f67f469b02ebad2c232a72f3b97401 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 19 Feb 2025 22:43:25 +0000 Subject: [PATCH 01/35] [Automated Commit] Format Codebase --- compliance/nvidia/TEST01/verify_performance.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/compliance/nvidia/TEST01/verify_performance.py b/compliance/nvidia/TEST01/verify_performance.py index cc400c8ed7..4b527730b1 100644 --- a/compliance/nvidia/TEST01/verify_performance.py +++ b/compliance/nvidia/TEST01/verify_performance.py @@ -54,12 +54,14 @@ def main(): continue if ref_mode == "SingleStream": - if re.match(".*Early stopping (90th|90.0th|99.9th) percentile estimate", line): + if re.match( + ".*Early stopping (90th|90.0th|99.9th) percentile estimate", line): ref_score = line.split(": ", 1)[1].strip() continue if ref_mode == "MultiStream": - if re.match(".*Early stopping (99th|99.0th) percentile estimate", line): + if re.match( + ".*Early stopping (99th|99.0th) percentile estimate", line): ref_score = line.split(": ", 1)[1].strip() continue @@ -96,7 +98,8 @@ def main(): continue if test_mode == "MultiStream": - if re.match(".*Early stopping (99th|99.0th) percentile estimate", line): + if re.match( + ".*Early stopping (99th|99.0th) percentile estimate", line): test_score = line.split(": ", 1)[1].strip() continue From ec284d277523e5409694b431a94ac0601ab6fc3b Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Mon, 24 Feb 2025 16:37:53 +0000 Subject: [PATCH 02/35] Updated tags for submission checker command in docs --- docs/submission/index.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/submission/index.md b/docs/submission/index.md index da30c18350..6a6bbfb2f2 100644 --- a/docs/submission/index.md +++ b/docs/submission/index.md @@ -184,14 +184,14 @@ Once you have all the results on the system, you can upload them to the MLCommon === "via CLI" You can do the following command which will run the submission checker and upload the results to the MLCommons submission server ``` - mlcr run,submission,checker,inference \ + mlcr run,mlperf,submission,checker,inference \ --submitter_id=<> \ --submission_dir= ``` === "via Browser" You can do the following command to generate the final submission tar file and then upload to the [MLCommons Submission UI](https://submissions-ui.mlcommons.org/submission). 
``` - mlcr run,submission,checker \ + mlcr run,mlperf,submission,checker,inference \ --submission_dir= \ --tar=yes \ --submission_tar_file=mysubmission.tar.gz From 5335553c19fe0aa464c7c466325c48fd60c4b6df Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Tue, 25 Feb 2025 10:44:55 +0000 Subject: [PATCH 03/35] Update mobilenets docs --- .../image_classification/mobilenets.md | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/docs/benchmarks/image_classification/mobilenets.md b/docs/benchmarks/image_classification/mobilenets.md index 7e4605b4b0..9501521b32 100644 --- a/docs/benchmarks/image_classification/mobilenets.md +++ b/docs/benchmarks/image_classification/mobilenets.md @@ -23,10 +23,10 @@ Mobilenet models are not official MLPerf models and so cannot be used for a Clos mlcr run,mobilenet-models,_tflite,_mobilenet-v2 \ --adr.compiler.tags=gcc ``` -=== "Mobilenet-V2" - ### Mobilenet V2 +=== "Mobilenet-V3" + ### Mobilenet V3 ```bash - mlcr run,mobilenet-models,_tflite,_mobilenet-v2 \ + mlcr run,mobilenet-models,_tflite,_mobilenet-v3 \ --adr.compiler.tags=gcc ``` === "Mobilenets" @@ -41,6 +41,12 @@ Mobilenet models are not official MLPerf models and so cannot be used for a Clos mlcr run,mobilenet-models,_tflite,_efficientnet \ --adr.compiler.tags=gcc ``` +=== "Mobilenets and Efficientnet" + ### Mobilenets and Efficientnet + ```bash + mlcr run,mobilenet-models,_tflite \ + --adr.compiler.tags=gcc + ``` ## ARMNN Backend === "Mobilenet-V1" @@ -55,7 +61,7 @@ Mobilenet models are not official MLPerf models and so cannot be used for a Clos mlcr run,mobilenet-models,_tflite,_armnn,_mobilenet-v2 \ --adr.compiler.tags=gcc ``` -=== "Mobilenet-V2" +=== "Mobilenet-V3" ### Mobilenet V2 ```bash mlcr run,mobilenet-models,_tflite,_armnn,_mobilenet-v2 \ @@ -73,4 +79,11 @@ Mobilenet models are not official MLPerf models and so cannot be used for a Clos mlcr run,mobilenet-models,_tflite,_armnn,_efficientnet \ --adr.compiler.tags=gcc ``` +=== "Mobilenets and Efficientnet" + ### Mobilenets and Efficientnet + ```bash + mlcr run,mobilenet-models,_tflite,_armnn \ + --adr.compiler.tags=gcc + ``` + From b9767aa16544a4c759c32767c5ceeb9546b9b195 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Tue, 25 Feb 2025 16:55:02 +0000 Subject: [PATCH 04/35] Update main.py --- main.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index 2ccfab2958..a8fcb30a49 100755 --- a/main.py +++ b/main.py @@ -28,11 +28,13 @@ def mlperf_inference_implementation_readme( content = "" execution_envs = ["Docker", "Native"] - code_version = "r4.1-dev" + code_version = "r5.0-dev" implementation_run_options = [] if model == "rnnt": code_version = "r4.0" + elif implementation == "intel": + code_version = "r4.1-dev" if implementation == "reference": # Tip From f42aeeb8a96c58dd8ffff8cecde1e08aa5f10d41 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Tue, 25 Feb 2025 16:56:51 +0000 Subject: [PATCH 05/35] Update main.py --- main.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/main.py b/main.py index a8fcb30a49..0020d8c4ec 100755 --- a/main.py +++ b/main.py @@ -341,12 +341,7 @@ def mlperf_inference_implementation_readme( and framework.lower() == "deepsparse" ): run_suffix += f"{cur_space3}You can use any model from [NeuralMagic sparse zoo](https://sparsezoo.neuralmagic.com/?modelSet=computer_vision&architectures=resnet_v1) (trained on Imagenet dataset) as --nm_model_zoo_stub" - if ( - "bert" in model.lower() - and framework.lower() == "deepsparse" - ): 
- run_suffix += "You can use any model from [NeuralMagic sparse zoo](https://sparsezoo.neuralmagic.com/?modelSet=computer_vision&architectures=resnet_v1) (trained on Imagenet dataset) as --nm_model_zoo_stub" - if ( + elif ( "bert" in model.lower() and framework.lower() == "deepsparse" ): From c699ce30ad313a5b9e5f91a9874cf1f4e917772e Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Tue, 25 Feb 2025 22:38:06 +0530 Subject: [PATCH 06/35] update dataset download commands - waymo calib (#2130) --- .../get-pointpainting-data.md | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/docs/benchmarks/automotive/3d_object_detection/get-pointpainting-data.md b/docs/benchmarks/automotive/3d_object_detection/get-pointpainting-data.md index 0a1e65c8ea..6331b3535b 100644 --- a/docs/benchmarks/automotive/3d_object_detection/get-pointpainting-data.md +++ b/docs/benchmarks/automotive/3d_object_detection/get-pointpainting-data.md @@ -9,11 +9,21 @@ hide: > **Note:** By default, the waymo dataset is downloaded from the mlcommons official drive. One has to accept the [MLCommons Waymo Open Dataset EULA](https://waymo.mlcommons.org/) to access the dataset files. -The benchmark implementation run command will automatically download the preprocessed dataset. In case you want to download only the datasets, you can use the below command. +The benchmark implementation run command will automatically download the preprocessed dataset. In case you want to download only the datasets, you can use the below commands. -```bash -mlcr get,dataset,waymo -j -``` +=== "Validation" + + ### Get Validation Dataset + ``` + mlcr get,dataset,waymo -j + ``` + +=== "Calibration" + + ### Get Calibration Dataset + ``` + mlcr get,dataset,waymo,calibration -j + ``` - `--outdirname=` could be provided to download the dataset to a specific location. 
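For reference, a minimal sketch of the calibration download shown in the doc above combined with an explicit output directory; the target path is illustrative only and is not part of the patch:

```bash
# Hypothetical destination directory; substitute your own storage location
mlcr get,dataset,waymo,calibration --outdirname=/data/waymo_calibration -j
```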
From 7bf2c5e6bc49e4c814f93e92d77189b0fd83d7ba Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 13 Mar 2025 18:47:52 +0000 Subject: [PATCH 07/35] Merge from Master (#2155) * Update submission_checker.py | Fix open model unit in Results (#2144) * Add Llama 3.1 to special unit dict (#2150) --------- Co-authored-by: Pablo Gonzalez --- tools/submission/submission_checker.py | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index c3c30c14d7..7ac74be40d 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -1914,50 +1914,38 @@ def log_result( notes = notes + system_json.get("sw_notes") special_unit_dict = { "gptj-99": { - "SingleStream": "Latency (ms)", - "MultiStream": "Latency (ms)", "Offline": "Tokens/s", "Server": "Tokens/s", }, "gptj-99.9": { - "SingleStream": "Latency (ms)", - "MultiStream": "Latency (ms)", "Offline": "Tokens/s", "Server": "Tokens/s", }, "llama2-70b-99": { - "SingleStream": "Latency (ms)", - "MultiStream": "Latency (ms)", "Offline": "Tokens/s", "Server": "Tokens/s", }, "llama2-70b-99.9": { - "SingleStream": "Latency (ms)", - "MultiStream": "Latency (ms)", "Offline": "Tokens/s", "Server": "Tokens/s", }, "llama2-70b-interactive-99": { - "SingleStream": "Latency (ms)", - "MultiStream": "Latency (ms)", "Offline": "Tokens/s", "Server": "Tokens/s", }, "llama2-70b-interactive-99.9": { - "SingleStream": "Latency (ms)", - "MultiStream": "Latency (ms)", + "Offline": "Tokens/s", + "Server": "Tokens/s", + }, + "llama3.1-405b": { "Offline": "Tokens/s", "Server": "Tokens/s", }, "mixtral-8x7b": { - "SingleStream": "Latency (ms)", - "MultiStream": "Latency (ms)", "Offline": "Tokens/s", "Server": "Tokens/s", }, "llama3.1-405b": { - "SingleStream": "Latency (ms)", - "MultiStream": "Latency (ms)", "Offline": "Tokens/s", "Server": "Tokens/s", }, @@ -1977,7 +1965,7 @@ def log_result( if config.version == "v4.0": unit = unit_dict[scenario_fixed] else: - unit = special_unit_dict.get(model_name, unit_dict)[scenario_fixed] + unit = special_unit_dict.get(mlperf_model, unit_dict).get(scenario_fixed, unit_dict[scenario_fixed]) power_unit = power_unit_dict[scenario_fixed] if (power_metric <= 0) or ( From 2a73202e8bdf0028dd7f0e78f4f2d569d6fb1561 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 13 Mar 2025 18:48:21 +0000 Subject: [PATCH 08/35] [Automated Commit] Format Codebase --- tools/submission/submission_checker.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index 7ac74be40d..0694273b19 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -1702,7 +1702,7 @@ def get_power_metric(config, scenario_fixed, log_path, is_valid, res): samples_per_query = 8 if (scenario_fixed in ["MultiStream"] - ) and scenario in ["SingleStream"]: + ) and scenario in ["SingleStream"]: power_metric = ( avg_power * power_duration * samples_per_query * 1000 / num_queries ) @@ -1965,7 +1965,9 @@ def log_result( if config.version == "v4.0": unit = unit_dict[scenario_fixed] else: - unit = special_unit_dict.get(mlperf_model, unit_dict).get(scenario_fixed, unit_dict[scenario_fixed]) + unit = special_unit_dict.get( + mlperf_model, unit_dict).get( + scenario_fixed, unit_dict[scenario_fixed]) power_unit = power_unit_dict[scenario_fixed] if (power_metric <= 0) or ( From 
2fb105769406a98ffcd587c3b49c303ba6a9de4b Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Tue, 22 Apr 2025 18:53:22 +0530 Subject: [PATCH 09/35] Inference docs - Update model and dataset download commands (#2153) * Update llama2 70b model download docs * changes in model and dataset download commands --- docs/benchmarks/graph/get-rgat-data.md | 33 +++++++--- .../image_classification/get-resnet50-data.md | 38 +++++++---- docs/benchmarks/language/get-bert-data.md | 19 ++++++ docs/benchmarks/language/get-gptj-data.md | 14 +++- .../language/get-llama2-70b-data.md | 66 ++++++++++++++----- .../language/get-llama3_1-405b-data.md | 29 +++++--- .../language/get-mixtral-8x7b-data.md | 13 +++- .../medical_imaging/get-3d-unet-data.md | 30 +++++++-- .../object_detection/get-retinanet-data.md | 33 +++++++--- .../recommendation/get-dlrm-v2-data.md | 7 +- .../benchmarks/text_to_image/get-sdxl-data.md | 27 ++++++-- 11 files changed, 236 insertions(+), 73 deletions(-) diff --git a/docs/benchmarks/graph/get-rgat-data.md b/docs/benchmarks/graph/get-rgat-data.md index 6ab9515e59..bb719fea2e 100644 --- a/docs/benchmarks/graph/get-rgat-data.md +++ b/docs/benchmarks/graph/get-rgat-data.md @@ -9,22 +9,34 @@ hide: The benchmark implementation run command will automatically download the validation and calibration datasets and do the necessary preprocessing. In case you want to download only the datasets, you can use the below commands. -=== "Full Dataset" - R-GAT validation run uses the IGBH dataset consisting of 547,306,935 nodes and 5,812,005,639 edges. +=== "Validation" - ### Get Full Dataset - ``` - mlcr get,dataset,igbh,_full -j - ``` + === "Full Dataset" + R-GAT validation run uses the IGBH dataset consisting of 547,306,935 nodes and 5,812,005,639 edges. + + ### Get Full Dataset + ``` + mlcr get,dataset,igbh,_full -j + ``` -=== "Debug Dataset" - R-GAT debug run uses the IGBH debug dataset(tiny). + === "Debug Dataset" + R-GAT debug run uses the IGBH debug dataset(tiny). - ### Get Full Dataset + ### Get Full Dataset + ``` + mlcr get,dataset,igbh,_debug -j + ``` + +=== "Calibration" + The calibration dataset contains 5000 nodes from the training paper nodes of the IGBH dataset. IGBH `full` dataset would be downloaded for creating calibration dataset. + + ### Get Calibration Dataset ``` - mlcr get,dataset,igbh,_debug -j + mlcr get,dataset,igbh,_full,_calibration -j ``` +- `--outdirname=` could be provided to download the dataset to a specific location. + ## Model The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. @@ -37,3 +49,4 @@ Get the Official MLPerf R-GAT Model mlcr get,ml-model,rgat -j ``` +- `--outdirname=` could be provided to download the model to a specific location. \ No newline at end of file diff --git a/docs/benchmarks/image_classification/get-resnet50-data.md b/docs/benchmarks/image_classification/get-resnet50-data.md index 771571d5c7..9ecd25c1a5 100644 --- a/docs/benchmarks/image_classification/get-resnet50-data.md +++ b/docs/benchmarks/image_classification/get-resnet50-data.md @@ -9,25 +9,34 @@ hide: The benchmark implementation run command will automatically download the validation and calibration datasets and do the necessary preprocessing. In case you want to download only the datasets, you can use the below commands. 
-=== "Validation" - ResNet50 validation run uses the Imagenet 2012 validation dataset consisting of 50,000 images. +=== "Unprocessed" + === "Validation" + ResNet50 validation run uses the Imagenet 2012 validation dataset consisting of 50,000 images. - ### Get Validation Dataset - ``` - mlcr get,dataset,imagenet,validation -j - ``` -=== "Calibration" - ResNet50 calibration dataset consist of 500 images selected from the Imagenet 2012 validation dataset. There are 2 alternative options for the calibration dataset. + ### Get Validation Dataset + ``` + mlcr get,dataset,imagenet,validation -j + ``` + === "Calibration" + ResNet50 calibration dataset consist of 500 images selected from the Imagenet 2012 validation dataset. There are 2 alternative options for the calibration dataset. + + ### Get Calibration Dataset Using Option 1 + ``` + mlcr get,dataset,imagenet,calibration,_mlperf.option1 -j + ``` + ### Get Calibration Dataset Using Option 2 + ``` + mlcr get,dataset,imagenet,calibration,_mlperf.option2 -j + ``` +=== "Preprocessed" + ### Get ResNet50 preprocessed dataset - ### Get Calibration Dataset Using Option 1 - ``` - mlcr get,dataset,imagenet,calibration,_mlperf.option1 -j - ``` - ### Get Calibration Dataset Using Option 2 ``` - mlcr get,dataset,imagenet,calibration,_mlperf.option2 -j + mlcr get,dataset,image-classification,imagenet,preprocessed,_pytorch -j ``` +- `--outdirname=` could be provided to download the dataset to a specific location. + ## Model The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. @@ -46,3 +55,4 @@ Get the Official MLPerf ResNet50 Model mlcr get,ml-model,resnet50,_onnx -j ``` +- `--outdirname=` could be provided to download the model to a specific location. \ No newline at end of file diff --git a/docs/benchmarks/language/get-bert-data.md b/docs/benchmarks/language/get-bert-data.md index 430031f319..ab3ba9b537 100644 --- a/docs/benchmarks/language/get-bert-data.md +++ b/docs/benchmarks/language/get-bert-data.md @@ -17,6 +17,24 @@ The benchmark implementation run command will automatically download the validat mlcr get,dataset,squad,validation -j ``` +=== "Calibration" + + === "Calibration Set 1" + + ### Get Calibration Dataset + ``` + mlcr get,dataset,squad,_calib1 -j + ``` + + === "Calibration Set 2" + + ### Get Calibration Dataset + ``` + mlcr get,dataset,squad,_calib2 -j + ``` + +- `--outdirname=` could be provided to download the dataset to a specific location. + ## Model The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. @@ -41,3 +59,4 @@ Get the Official MLPerf Bert-Large Model mlcr get,ml-model,bert-large,_tensorflow -j ``` +- `--outdirname=` could be provided to download the model to a specific location. diff --git a/docs/benchmarks/language/get-gptj-data.md b/docs/benchmarks/language/get-gptj-data.md index 34140598e9..60e2568b6e 100644 --- a/docs/benchmarks/language/get-gptj-data.md +++ b/docs/benchmarks/language/get-gptj-data.md @@ -14,9 +14,19 @@ The benchmark implementation run command will automatically download the validat ### Get Validation Dataset ``` - mlcr get,dataset,cnndm,validation -j + mlcr get,dataset,cnndm,_validation -j ``` +=== "Calibration" + GPT-J calibration dataset is extracted from the CNNDM dataset. 
+ + ### Get Validation Dataset + ``` + mlcr get,dataset,cnndm,_calibration -j + ``` + +- `--outdirname=` could be provided to download the dataset to a specific location. + ## Model The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. @@ -28,3 +38,5 @@ Get the Official MLPerf GPT-J Model ``` mlcr get,ml-model,gptj,_pytorch -j ``` + +- `--outdirname=` could be provided to download the model to a specific location. \ No newline at end of file diff --git a/docs/benchmarks/language/get-llama2-70b-data.md b/docs/benchmarks/language/get-llama2-70b-data.md index 2a31370574..ce7cd996eb 100644 --- a/docs/benchmarks/language/get-llama2-70b-data.md +++ b/docs/benchmarks/language/get-llama2-70b-data.md @@ -9,27 +9,63 @@ hide: The benchmark implementation run command will automatically download the validation and calibration datasets and do the necessary preprocessing. In case you want to download only the datasets, you can use the below commands. -=== "Validation" - LLAMA2-70b validation run uses the Open ORCA dataset. +=== "Preprocessed Dataset" - ### Get Validation Dataset - ``` - mlcr get,dataset,openorca,validation -j - ``` + === "Validation" + LLAMA2-70b validation run uses the Open ORCA dataset. + + ### Get Preprocessed Validation Dataset + ``` + mlcr get,dataset,preprocessed,openorca,_validation -j + ``` + + === "Calibration" + + ### Get Preprocessed Calibration dataset + ``` + mlcr get,dataset,preprocessed,openorca,_calibration -j + ``` + +=== "Unprocessed Dataset" + + === "Validation" + LLAMA2-70b validation run uses the Open ORCA dataset. + + ### Get Unprocessed Validation Dataset + ``` + mlcr get,dataset,openorca,_validation -j + ``` + + === "Calibration" + + ### Get Unprocessed Validation Dataset + ``` + mlcr get,dataset,openorca,_validation -j + ``` + +- `--outdirname=` could be provided to download the dataset to a specific location. ## Model The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. -Get the Official MLPerf LLAMA2-70b Model - === "Pytorch" - ### Pytorch - ``` - mlcr get,ml-model,llama2-70b,_pytorch -j --outdirname= - ``` - -!!! tip + === "From MLCOMMONS Google Drive" + + > **Note:** One has to accept the [MLCommons Llama 2 License Confidentiality Notice](https://llama2.mlcommons.org/) to access the model files in MLCOMMONS Google Drive. + + ### Get the Official MLPerf LLAMA2-70B model from MLCOMMONS Google Drive + ``` + mlcr get,ml-model,llama2-70b,_pytorch -j + ``` + + === "From Hugging Face repo" + + > **Note:** Access to the HuggingFace model could be requested [here](https://ai.meta.com/resources/models-and-libraries/llama-downloads/). - [Access Request Link](https://llama2.mlcommons.org/) for MLCommons members + ### Get model from HuggingFace repo + ``` + mlcr get,ml-model,llama2-70b,_hf --hf_token= -j + ``` +- `--outdirname=` could be provided to download the model to a specific location. 
\ No newline at end of file diff --git a/docs/benchmarks/language/get-llama3_1-405b-data.md b/docs/benchmarks/language/get-llama3_1-405b-data.md index 62b7bd088a..ad05ca8610 100644 --- a/docs/benchmarks/language/get-llama3_1-405b-data.md +++ b/docs/benchmarks/language/get-llama3_1-405b-data.md @@ -23,18 +23,29 @@ The benchmark implementation run command will automatically download the validat mlcr get,dataset,mlperf,inference,llama3,_calibration --outdirname= -j ``` +- `--outdirname=` could be provided to download the dataset to a specific location. + ## Model The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. -Get the Official MLPerf LLAMA3.1-405b Model - === "Pytorch" - ### Pytorch - ``` - mlcr get,ml-model,llama3 --outdirname= -j - ``` - -!!! tip + === "From MLCOMMONS Google Drive" + + > **Note:** One has to accept the [MLCommons Llama 3.1 License Confidentiality Notice](http://llama3-1.mlcommons.org/) to access the model files in MLCOMMONS Google Drive. + + ### Get the Official MLPerf LLAMA3.1-405B model from MLCOMMONS Google Drive + ``` + mlcr get,ml-model,llama3 -j + ``` + + === "From Hugging Face repo" + + > **Note:** Access to the HuggingFace model could be requested [here](https://ai.meta.com/resources/models-and-libraries/llama-downloads/). + + ### Get model from HuggingFace repo + ``` + mlcr get,ml-model,llama3,_hf --hf_token= -j + ``` - [Access Request Link](https://llama3-1.mlcommons.org/) for MLCommons members +- `--outdirname=` could be provided to download the model to a specific location. \ No newline at end of file diff --git a/docs/benchmarks/language/get-mixtral-8x7b-data.md b/docs/benchmarks/language/get-mixtral-8x7b-data.md index 81b90cdb57..cf5225843d 100644 --- a/docs/benchmarks/language/get-mixtral-8x7b-data.md +++ b/docs/benchmarks/language/get-mixtral-8x7b-data.md @@ -15,6 +15,15 @@ The benchmark implementation run command will automatically download the preproc mlcr get,dataset-mixtral,openorca-mbxp-gsm8k-combined -j ``` +=== "Calibration" + + ### Get Calibration Dataset + ``` + mlcr get,dataset-mixtral,openorca-mbxp-gsm8k-combined,_calibration -j + ``` + +- `--outdirname=` could be provided to download the dataset to a specific location. + ## Model The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. @@ -25,4 +34,6 @@ Get the Official MLPerf MIXTRAL-8x7b Model ### Pytorch ``` mlcr get,ml-model,mixtral -j - ``` \ No newline at end of file + ``` + +- `--outdirname=` could be provided to download the model to a specific location. \ No newline at end of file diff --git a/docs/benchmarks/medical_imaging/get-3d-unet-data.md b/docs/benchmarks/medical_imaging/get-3d-unet-data.md index d68b769209..9c77fdeaa2 100644 --- a/docs/benchmarks/medical_imaging/get-3d-unet-data.md +++ b/docs/benchmarks/medical_imaging/get-3d-unet-data.md @@ -9,19 +9,34 @@ hide: The benchmark implementation run command will automatically download the validation and calibration datasets and do the necessary preprocessing. In case you want to download only the datasets, you can use the below commands. 
-=== "Validation" - 3d-unet validation run uses the KiTS19 dataset performing [KiTS 2019](https://kits19.grand-challenge.org/) kidney tumor segmentation task - ### Get Validation Dataset(Original) - ``` - mlcr get,dataset,kits19,_validation -j - ``` +=== "Unprocessed Dataset" + + === "Validation" + 3d-unet validation run uses the KiTS19 dataset performing [KiTS 2019](https://kits19.grand-challenge.org/) kidney tumor segmentation task + + ### Get Validation Dataset + ``` + mlcr get,dataset,kits19,_validation -j + ``` + + === "Calibration" - ### Get Validation Dataset(Preprocessed) + ### Get Calibration Dataset + ``` + mlcr get,dataset,kits19,_calibration -j + ``` + +=== "Preprocessed Dataset" + + ### Get Preprocessed Validation Dataset ``` mlcr get,dataset,kits19,preprocessed -j ``` +- `--outdirname=` could be provided to download the dataset to a specific location. + + ## Model The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. @@ -46,3 +61,4 @@ Get the Official MLPerf 3d-unet Model mlcr get,ml-model,3d-unet,_tensorflow -j ``` +- `--outdirname=` could be provided to download the model to a specific location. diff --git a/docs/benchmarks/object_detection/get-retinanet-data.md b/docs/benchmarks/object_detection/get-retinanet-data.md index 6cd677b4e1..6127eed541 100644 --- a/docs/benchmarks/object_detection/get-retinanet-data.md +++ b/docs/benchmarks/object_detection/get-retinanet-data.md @@ -9,20 +9,34 @@ hide: The benchmark implementation run command will automatically download the validation and calibration datasets and do the necessary preprocessing. In case you want to download only the datasets, you can use the below commands. -=== "Validation" - Retinanet validation run uses the OpenImages v6 MLPerf validation dataset resized to 800x800 and consisting of 24,576 images. +=== "Unprocessed" - ### Get Validation Dataset - ``` - mlcr get,dataset,openimages,_validation -j - ``` -=== "Calibration" - Retinanet calibration dataset consist of 500 images selected from the OpenImages v6 dataset. + === "Validation" + Retinanet validation run uses the OpenImages v6 MLPerf validation dataset resized to 800x800 and consisting of 24,576 images. + + ### Get Validation Dataset + ``` + mlcr get,dataset,openimages,_validation -j + ``` + + === "Calibration" + Retinanet calibration dataset consist of 500 images selected from the OpenImages v6 dataset. + ### Get OpenImages Calibration dataset + ``` + mlcr get,dataset,openimages,_calibration -j + ``` + +=== "Preprocessed" + + ### Get Preprocessed OpenImages dataset ``` - mlcr get,dataset,openimages,_calibration -j + get,dataset,object-detection,open-images,openimages,preprocessed,_validation -j ``` +- `--outdirname=` could be provided to download the dataset to a specific location. + + ## Model The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. @@ -41,3 +55,4 @@ Get the Official MLPerf Retinanet Model mlcr get,ml-model,retinanet,_onnx -j ``` +- `--outdirname=` could be provided to download the model to a specific location. 
diff --git a/docs/benchmarks/recommendation/get-dlrm-v2-data.md b/docs/benchmarks/recommendation/get-dlrm-v2-data.md index bb35660b68..8505b31bf4 100644 --- a/docs/benchmarks/recommendation/get-dlrm-v2-data.md +++ b/docs/benchmarks/recommendation/get-dlrm-v2-data.md @@ -16,6 +16,9 @@ The benchmark implementation run command will automatically download the validat ``` mlcr get,dataset,criteo,_validation -j ``` + +- `--outdirname=` could be provided to download the dataset to a specific location. + ## Model The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. @@ -25,6 +28,8 @@ Get the Official MLPerf DLRM v2 Model ### Pytorch ``` - mlcr get,ml-model,dlrm,_pytorch -j + mlcr get,ml-model,dlrm,_pytorch,_weight_sharded,_rclone -j ``` + +- `--outdirname=` could be provided to download the model to a specific location. \ No newline at end of file diff --git a/docs/benchmarks/text_to_image/get-sdxl-data.md b/docs/benchmarks/text_to_image/get-sdxl-data.md index 6d79e331d1..7c5363415c 100644 --- a/docs/benchmarks/text_to_image/get-sdxl-data.md +++ b/docs/benchmarks/text_to_image/get-sdxl-data.md @@ -17,15 +17,30 @@ The benchmark implementation run command will automatically download the validat mlcr get,dataset,coco2014,_validation -j ``` +=== "Calibration" + + ### Get COCO2014 Calibration Dataset + ``` + mlcr get,dataset,coco2014,_calibration -j + ``` + +- `--outdirname=` could be provided to download the dataset to a specific location. + ## Model The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. Get the Official MLPerf Stable Diffusion Model === "Pytorch" - - ### Pytorch - ``` - mlcr get,ml-model,sdxl,_pytorch -j - ``` - + === "FP 16" + ### Pytorch + ``` + mlcr get,ml-model,sdxl,_pytorch,_fp16 -j + ``` + === "FP 32" + ### Pytorch + ``` + mlcr get,ml-model,sdxl,_pytorch,_fp32 -j + ``` + +- `--outdirname=` could be provided to download the model to a specific location. From d8048376f88d3ad6aabbaddd52018cea5263b117 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Tue, 22 Apr 2025 18:53:40 +0530 Subject: [PATCH 10/35] add powershell command to get result folder structure (#2156) --- docs/submission/index.md | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/docs/submission/index.md b/docs/submission/index.md index 6a6bbfb2f2..079a513854 100644 --- a/docs/submission/index.md +++ b/docs/submission/index.md @@ -56,9 +56,15 @@ Please refer to the [installation page](site:inference/install/) to install MLCF === "MLC automation based results" If you have followed the `mlcr` commands under the individual model pages in the [benchmarks](../index.md) directory, all the valid results will get aggregated to the `mlc cache` folder. The following command could be used to browse the structure of inference results folder generated by MLCFlow. 
### Get results folder structure - ```bash - mlc find cache --tags=get,mlperf,inference,results,dir | xargs tree - ``` + + === "Unix Terminal" + ```bash + mlc find cache --tags=get,mlperf,inference,results,dir | xargs tree + ``` + === "Windows PowerShell" + ``` + mlc find cache --tags=get,mlperf,inference,results,dir | ForEach-Object { Get-ChildItem -Recurse $_ } + ``` Once all the results across all the models are ready you can use the following the below section to generate a valid submission tree compliant with the [MLPerf requirements](https://github.com/mlcommons/policies/blob/master/submission_rules.adoc#inference-1). From 5b65f7be79a703fa23034f0d406b2e6f1a6134eb Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 22 Apr 2025 17:00:11 +0000 Subject: [PATCH 11/35] [Automated Commit] Format Codebase --- tools/submission/submission_checker.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index 41116e2620..edda676c9c 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -2094,7 +2094,8 @@ def log_result( if filter_submitter and submitter != filter_submitter: continue results_path = os.path.join(division, submitter, "results") - measurements_path = os.path.join(division, submitter, "measurements") + measurements_path = os.path.join( + division, submitter, "measurements") systems_path = os.path.join(division, submitter, "systems") if not os.path.exists(results_path): continue @@ -2200,7 +2201,8 @@ def log_result( extra_model_mapping = json.load(fp) if not config.skip_all_systems_with_results: - measurement_diff = list(set(list_dir(measurements_path)) - set(list_dir(results_path))) + measurement_diff = list( + set(list_dir(measurements_path)) - set(list_dir(results_path))) systems_diff = list( set( [ @@ -3173,7 +3175,7 @@ def main(): args.extra_model_benchmark_map, ignore_uncommited=args.submission_exceptions, skip_power_check=args.skip_power_check, - skip_all_systems_with_results = args.skip_all_systems_have_results_check + skip_all_systems_with_results=args.skip_all_systems_have_results_check ) if args.scenarios_to_skip: From 70fcbe0c8b4aeadedd7ba40c17b4077530e3019b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 10 Jun 2025 16:59:39 +0000 Subject: [PATCH 12/35] [Automated Commit] Format Codebase --- language/deepseek-r1/backends/__init__.py | 2 +- .../deepseek-r1/backends/pytorch_backend.py | 52 ++--- .../deepseek-r1/backends/sglang_backend.py | 168 +++++++++------ language/deepseek-r1/backends/utils.py | 173 +++++++++------- language/deepseek-r1/backends/vllm_backend.py | 33 +-- language/deepseek-r1/eval_accuracy.py | 74 ++++--- language/deepseek-r1/mlperf/__init__.py | 6 +- language/deepseek-r1/mlperf/base_sut.py | 28 +-- language/deepseek-r1/mlperf/offline_sut.py | 70 ++++--- language/deepseek-r1/mlperf/qsl.py | 25 +-- language/deepseek-r1/mlperf/server_sut.py | 100 +++++---- language/deepseek-r1/mlperf/utils.py | 69 ++++--- language/deepseek-r1/run_eval.py | 165 ++++++++------- language/deepseek-r1/run_eval_mpi.py | 99 +++++---- language/deepseek-r1/run_mlperf.py | 116 ++++++----- language/deepseek-r1/run_mlperf_mpi.py | 195 ++++++++++-------- language/deepseek-r1/utils/__init__.py | 2 +- .../deepseek-r1/utils/backend_registry.py | 11 +- language/deepseek-r1/utils/data_utils.py | 108 +++++----- language/deepseek-r1/utils/error_handling.py | 11 +- language/deepseek-r1/utils/runner_utils.py | 58 +++--- 
language/deepseek-r1/utils/tokenization.py | 87 ++++---- language/deepseek-r1/utils/validation.py | 44 ++-- tools/submission/submission_checker.py | 17 +- 24 files changed, 987 insertions(+), 726 deletions(-) diff --git a/language/deepseek-r1/backends/__init__.py b/language/deepseek-r1/backends/__init__.py index 61ad96a3f2..865ed3bd53 100644 --- a/language/deepseek-r1/backends/__init__.py +++ b/language/deepseek-r1/backends/__init__.py @@ -11,4 +11,4 @@ # to avoid dependency issues when only using certain backends __all__ = [ 'BaseBackend', -] \ No newline at end of file +] diff --git a/language/deepseek-r1/backends/pytorch_backend.py b/language/deepseek-r1/backends/pytorch_backend.py index c1e426185d..0742882bca 100644 --- a/language/deepseek-r1/backends/pytorch_backend.py +++ b/language/deepseek-r1/backends/pytorch_backend.py @@ -1,3 +1,17 @@ +from utils.validation import require_initialized, BackendNotInitializedError +from utils.backend_registry import get_backend_config +from .utils import get_cache_directory +from .base_backend import BaseBackend +from transformers import AutoTokenizer +import torch.distributed as dist +import torch +from pathlib import Path +import asyncio +from typing import Any, Dict, List, Optional +import logging +import json +from ref_dsinfer.inference.model import Transformer, ModelArgs +from safetensors.torch import load_model import os import sys @@ -6,23 +20,6 @@ 'REF_DSINFER_PATH', '/opt/ref_dsinfer/inference') sys.path.append(ref_dsinfer_path) -from safetensors.torch import load_model -from ref_dsinfer.inference.model import Transformer, ModelArgs -import json -import logging -from typing import Any, Dict, List, Optional -import asyncio -from pathlib import Path - -import torch -import torch.distributed as dist -from transformers import AutoTokenizer - -from .base_backend import BaseBackend -from .utils import get_cache_directory -from utils.backend_registry import get_backend_config -from utils.validation import require_initialized, BackendNotInitializedError - logger = logging.getLogger(__name__) @@ -115,8 +112,10 @@ def initialize(self) -> None: with torch.device(self.config['device']): self.model = Transformer(self.model_args) - # Load tokenizer (only rank 0 needs it for MLPerf, but all ranks need it for run_eval_mpi) - self.tokenizer = AutoTokenizer.from_pretrained(str(self.model_path), revision=self.config['model_revision']) + # Load tokenizer (only rank 0 needs it for MLPerf, but all ranks need + # it for run_eval_mpi) + self.tokenizer = AutoTokenizer.from_pretrained( + str(self.model_path), revision=self.config['model_revision']) # Load model weights checkpoint_file = self.model_path / \ @@ -133,7 +132,8 @@ def sample(self, logits: torch.Tensor, temperature: float) -> torch.Tensor: """Sample from logits with temperature.""" logits = logits / max(temperature, 1e-5) probs = torch.softmax(logits, dim=-1) - return probs.div_(torch.empty_like(probs).exponential_(1)).argmax(dim=-1) + return probs.div_(torch.empty_like( + probs).exponential_(1)).argmax(dim=-1) @torch.inference_mode() def _generate_batch( @@ -222,7 +222,8 @@ def _generate_batch( return completion_tokens @require_initialized - def generate(self, tokenized_prompts: List[List[int]], **kwargs) -> List[Dict[str, Any]]: + def generate( + self, tokenized_prompts: List[List[int]], **kwargs) -> List[Dict[str, Any]]: """ Generate responses for a list of pre-tokenized prompts. 
@@ -265,7 +266,8 @@ def generate(self, tokenized_prompts: List[List[int]], **kwargs) -> List[Dict[st return results @require_initialized - def generate_batch_distributed(self, batch_tokens: List[List[int]]) -> List[List[int]]: + def generate_batch_distributed( + self, batch_tokens: List[List[int]]) -> List[List[int]]: """ Generate tokens for a batch in distributed mode. @@ -296,7 +298,8 @@ def generate_batch_distributed(self, batch_tokens: List[List[int]]) -> List[List return [] @require_initialized - def generate_async(self, tokenized_prompts: List[List[int]], **kwargs) -> List[asyncio.Future]: + def generate_async( + self, tokenized_prompts: List[List[int]], **kwargs) -> List[asyncio.Future]: """ Generate responses asynchronously. @@ -331,7 +334,8 @@ async def extract_result(idx): return futures @require_initialized - def generate_batch_distributed_async(self, batch_tokens: List[List[int]]) -> asyncio.Future: + def generate_batch_distributed_async( + self, batch_tokens: List[List[int]]) -> asyncio.Future: """ Generate tokens for a batch in distributed mode asynchronously. diff --git a/language/deepseek-r1/backends/sglang_backend.py b/language/deepseek-r1/backends/sglang_backend.py index 06cf074a96..10be6e1dcd 100644 --- a/language/deepseek-r1/backends/sglang_backend.py +++ b/language/deepseek-r1/backends/sglang_backend.py @@ -66,11 +66,12 @@ def __init__(self, config: Dict[str, Any] = None): # Log monitoring self._log_monitor = None - + # Shared semaphore for async concurrency control self._async_semaphore = None - # Configure logging to suppress httpx INFO logs (only show warnings/errors) + # Configure logging to suppress httpx INFO logs (only show + # warnings/errors) import logging logging.getLogger("httpx").setLevel(logging.WARNING) logging.getLogger("openai").setLevel(logging.WARNING) @@ -128,7 +129,8 @@ def _build_server_command(self) -> List[str]: cmd.append('flashinfer') if self.config['enable_dp_attention']: - cmd.extend(['--enable-dp-attention', '--dp', str(self.config['dp'])]) + cmd.extend(['--enable-dp-attention', + '--dp', str(self.config['dp'])]) # Add performance settings cmd.extend([ @@ -175,7 +177,8 @@ def _wait_for_server_ready(self, timeout: int = None) -> bool: # Update progress indicator every 0.5 seconds if time.time() - last_progress_update >= 0.5: last_progress_update = time.time() - progress_idx = (progress_idx + 1) % len(TerminalDisplay.PROGRESS_CHARS) + progress_idx = ( + progress_idx + 1) % len(TerminalDisplay.PROGRESS_CHARS) minutes = elapsed // 60 seconds = elapsed % 60 # Use carriage return to stay on the same line @@ -192,7 +195,8 @@ def _wait_for_server_ready(self, timeout: int = None) -> bool: if response.status_code == 200: # Health check passed, now try a warmup query print(f"\r{' '*80}\r", end='', flush=True) - print(f"\n[SGLANG] Health check passed, running warmup query...") + print( + f"\n[SGLANG] Health check passed, running warmup query...") # Try to send a simple warmup query using OpenAI client try: @@ -210,7 +214,8 @@ def _wait_for_server_ready(self, timeout: int = None) -> bool: # Send a simple warmup request warmup_response = warmup_client.chat.completions.create( model=self.config['served_model_name'], - messages=[{"role": "user", "content": "Hello"}], + messages=[ + {"role": "user", "content": "Hello"}], temperature=0.0, max_tokens=10, seed=self.config['seed'] @@ -218,23 +223,28 @@ def _wait_for_server_ready(self, timeout: int = None) -> bool: # Check if we got a valid response if warmup_response.choices[0].message.content: - 
print(f"[SGLANG] ✓ Warmup query successful! Response: {warmup_response.choices[0].message.content[:50]}...") + print( + f"[SGLANG] ✓ Warmup query successful! Response: {warmup_response.choices[0].message.content[:50]}...") # Stop log monitoring if self._log_monitor: self._log_monitor.stop() self._log_monitor = None - print(f"\n[SGLANG] " + "="*60) - print(f"[SGLANG] ✓ SERVER READY! (startup took {elapsed}s)") - print(f"[SGLANG] " + "="*60) + print(f"\n[SGLANG] " + "=" * 60) + print( + f"[SGLANG] ✓ SERVER READY! (startup took {elapsed}s)") + print(f"[SGLANG] " + "=" * 60) return True else: - print(f"[SGLANG] Warmup query returned empty response, retrying...") + print( + f"[SGLANG] Warmup query returned empty response, retrying...") except Exception as warmup_error: - print(f"[SGLANG] Warmup query failed: {warmup_error}, retrying...") - # Continue waiting, the server might not be fully ready yet + print( + f"[SGLANG] Warmup query failed: {warmup_error}, retrying...") + # Continue waiting, the server might not be fully + # ready yet except requests.exceptions.RequestException: pass @@ -246,9 +256,11 @@ def _wait_for_server_ready(self, timeout: int = None) -> bool: self._log_monitor = None # Clear progress line print(f"\r{' '*80}\r", end='', flush=True) - print(f"\n[SGLANG] ✗ Server process died with exit code: {self.server_process.returncode}") + print( + f"\n[SGLANG] ✗ Server process died with exit code: {self.server_process.returncode}") if self.server_log_file: - print(f"[SGLANG] Check server logs at: {self.server_log_file}") + print( + f"[SGLANG] Check server logs at: {self.server_log_file}") return False time.sleep(0.1) # Check every 100ms for smoother progress @@ -264,17 +276,21 @@ def _wait_for_server_ready(self, timeout: int = None) -> bool: def _start_server(self) -> None: """Start the SGLang server as a subprocess.""" - print(f"\n[SGLANG] Starting SGLang server for {self.config['model']}...") + print( + f"\n[SGLANG] Starting SGLang server for {self.config['model']}...") print(f"[SGLANG] Configuration:") print(f"[SGLANG] - Port: {self.port}") - print(f"[SGLANG] - Tensor Parallel: {self.config['tensor_parallel_size']}") - print(f"[SGLANG] - Context Length: {self.config['context_length']:,} tokens") + print( + f"[SGLANG] - Tensor Parallel: {self.config['tensor_parallel_size']}") + print( + f"[SGLANG] - Context Length: {self.config['context_length']:,} tokens") print(f"[SGLANG] - dtype: {self.config['dtype']}") # Create log file for server output log_dir = Path("/work/logs") log_dir.mkdir(exist_ok=True) - self.server_log_file = log_dir / f"sglang_server_{self.port}_{int(time.time())}.log" + self.server_log_file = log_dir / \ + f"sglang_server_{self.port}_{int(time.time())}.log" cmd = self._build_server_command() print(f"\n[SGLANG] Command: {' '.join(cmd)}") @@ -315,7 +331,10 @@ def _stop_server(self) -> None: except subprocess.TimeoutExpired: # Force kill if not stopped print("[SGLANG] Server didn't stop gracefully, forcing...") - os.killpg(os.getpgid(self.server_process.pid), signal.SIGKILL) + os.killpg( + os.getpgid( + self.server_process.pid), + signal.SIGKILL) self.server_process.wait() print("[SGLANG] Server force stopped") except ProcessLookupError: @@ -332,7 +351,8 @@ def initialize(self) -> None: try: # Load tokenizer for string conversion print(f"[SGLANG] Loading tokenizer: {self.config['tokenizer']}...") - self.tokenizer = AutoTokenizer.from_pretrained(self.config['tokenizer'], revision=self.config['model_revision']) + self.tokenizer = AutoTokenizer.from_pretrained( + 
self.config['tokenizer'], revision=self.config['model_revision']) # Start SGLang server (with log monitoring) self._start_server() @@ -341,7 +361,8 @@ def initialize(self) -> None: base_url = f"http://localhost:{self.port}/v1" api_key = self.config['api_key'] or "dummy-key" - print(f"[SGLANG] Creating OpenAI clients with base URL: {base_url}") + print( + f"[SGLANG] Creating OpenAI clients with base URL: {base_url}") # Configure timeout settings timeout_config = httpx.Timeout( @@ -371,10 +392,12 @@ def initialize(self) -> None: ) print(f"[SGLANG] Created asynchronous OpenAI client") - + # Create shared semaphore for async concurrency control - self._async_semaphore = asyncio.Semaphore(self.config['max_running_requests']) - print(f"[SGLANG] Created async semaphore with limit: {self.config['max_running_requests']}") + self._async_semaphore = asyncio.Semaphore( + self.config['max_running_requests']) + print( + f"[SGLANG] Created async semaphore with limit: {self.config['max_running_requests']}") # Server readiness was already verified by health endpoint in _wait_for_server_ready() # No need to check models endpoint @@ -403,17 +426,18 @@ def initialize(self) -> None: raise @require_initialized - def generate(self, - tokenized_prompts: Optional[List[List[int]]] = None, - text_prompts: Optional[List[str]] = None, - **kwargs) -> List[Dict[str, Any]]: + def generate(self, + tokenized_prompts: Optional[List[List[int]]] = None, + text_prompts: Optional[List[str]] = None, + **kwargs) -> List[Dict[str, Any]]: """Generate responses synchronously.""" # Check if server process is still alive self._check_server_alive() # Check if client is properly initialized if self.client is None: - raise RuntimeError("SGLang client is not initialized. Server may have failed to start.") + raise RuntimeError( + "SGLang client is not initialized. 
Server may have failed to start.") # Validate prompts using centralized validation validate_prompts_input( @@ -436,7 +460,8 @@ def generate(self, results = [] # Process prompts with progress bar - for prompt in tqdm(prompt_strings, desc="SGLang sync inference", unit="prompt"): + for prompt in tqdm( + prompt_strings, desc="SGLang sync inference", unit="prompt"): try: completion = self.client.chat.completions.create( model=self.config['served_model_name'], @@ -452,7 +477,8 @@ def generate(self, # Validate response is not empty if not generated_text: - raise RuntimeError(f"Empty response received from SGLang server for prompt: {prompt[:100]}...") + raise RuntimeError( + f"Empty response received from SGLang server for prompt: {prompt[:100]}...") # Tokenize the output to get token IDs tokens = self.tokenizer.encode(generated_text) @@ -464,15 +490,18 @@ def generate(self, except Exception as e: print(f"\nError generating completion: {e}") - raise RuntimeError(f"SGLang backend failed to generate tokens for prompt: {prompt[:100]}...") + raise RuntimeError( + f"SGLang backend failed to generate tokens for prompt: {prompt[:100]}...") return results - async def _async_generate_single(self, prompt: str, idx: int, semaphore: asyncio.Semaphore) -> Tuple[int, Dict[str, Any]]: + async def _async_generate_single( + self, prompt: str, idx: int, semaphore: asyncio.Semaphore) -> Tuple[int, Dict[str, Any]]: """Generate a single response asynchronously with semaphore control.""" # Check if async client is properly initialized if self.async_client is None: - raise RuntimeError(f"SGLang async client is not initialized for prompt {idx}") + raise RuntimeError( + f"SGLang async client is not initialized for prompt {idx}") async with semaphore: try: @@ -490,7 +519,8 @@ async def _async_generate_single(self, prompt: str, idx: int, semaphore: asyncio # Validate response is not empty if not generated_text: - raise RuntimeError(f"Empty response received from SGLang server for prompt: {prompt[:100]}...") + raise RuntimeError( + f"Empty response received from SGLang server for prompt: {prompt[:100]}...") # Tokenize the output to get token IDs tokens = self.tokenizer.encode(generated_text) @@ -499,20 +529,22 @@ async def _async_generate_single(self, prompt: str, idx: int, semaphore: asyncio except Exception as e: print(f"\nError generating completion for prompt {idx}: {e}") - raise RuntimeError(f"SGLang backend failed to generate tokens for prompt {idx}: {e}") + raise RuntimeError( + f"SGLang backend failed to generate tokens for prompt {idx}: {e}") @require_initialized - def generate_async(self, - tokenized_prompts: Optional[List[List[int]]] = None, - text_prompts: Optional[List[str]] = None, - **kwargs) -> List[asyncio.Future]: + def generate_async(self, + tokenized_prompts: Optional[List[List[int]]] = None, + text_prompts: Optional[List[str]] = None, + **kwargs) -> List[asyncio.Future]: """Generate responses asynchronously using shared semaphore.""" # Check if server process is still alive self._check_server_alive() # Check if client is properly initialized if self.async_client is None: - raise RuntimeError("SGLang async client is not initialized. Server may have failed to start.") + raise RuntimeError( + "SGLang async client is not initialized. 
Server may have failed to start.") # Validate prompts using centralized validation validate_prompts_input( @@ -542,44 +574,49 @@ def generate_async(self, futures = [] for idx, prompt in enumerate(prompt_strings): # Create a task for each prompt using the shared semaphore - task = asyncio.create_task(self._async_generate_single(prompt, idx, self._async_semaphore)) - + task = asyncio.create_task( + self._async_generate_single( + prompt, idx, self._async_semaphore)) + # Create a future that will hold the result future = asyncio.Future() - + # Setup callback to extract just the result (not the index) def make_callback(future_obj, expected_idx): def callback(task_obj): try: idx, result = task_obj.result() if idx != expected_idx: - future_obj.set_exception(Exception(f"Index mismatch: expected {expected_idx}, got {idx}")) + future_obj.set_exception( + Exception(f"Index mismatch: expected {expected_idx}, got {idx}")) else: future_obj.set_result(result) except Exception as e: future_obj.set_exception(e) return callback - + task.add_done_callback(make_callback(future, idx)) futures.append(future) return futures - async def generate_stream(self, - tokenized_prompts: Optional[List[List[int]]] = None, - text_prompts: Optional[List[str]] = None, - **kwargs) -> List[AsyncIterator[StreamingChunk]]: + async def generate_stream(self, + tokenized_prompts: Optional[List[List[int]]] = None, + text_prompts: Optional[List[str]] = None, + **kwargs) -> List[AsyncIterator[StreamingChunk]]: """Generate responses for a list of prompts with streaming.""" if not self.is_initialized: - raise RuntimeError("Backend not initialized. Call initialize() first.") - + raise RuntimeError( + "Backend not initialized. Call initialize() first.") + # Check if server process is still alive self._check_server_alive() - + # Check if async client is properly initialized if self.async_client is None: - raise RuntimeError("SGLang async client is not initialized. Server may have failed to start.") - + raise RuntimeError( + "SGLang async client is not initialized. 
Server may have failed to start.") + # Validate prompts validate_prompts_input( backend_name='sglang', @@ -587,7 +624,7 @@ async def generate_stream(self, text_prompts=text_prompts, input_type='text' ) - + # SGLang prefers text prompts if text_prompts is None: # Convert tokenized prompts to strings @@ -597,8 +634,9 @@ async def generate_stream(self, ] else: prompt_strings = text_prompts - - async def stream_single_prompt(prompt: str) -> AsyncIterator[StreamingChunk]: + + async def stream_single_prompt( + prompt: str) -> AsyncIterator[StreamingChunk]: try: stream = await self.async_client.chat.completions.create( model=self.config['served_model_name'], @@ -609,14 +647,14 @@ async def stream_single_prompt(prompt: str) -> AsyncIterator[StreamingChunk]: seed=self.config.get('seed'), stream=True ) - + async for chunk in stream: if not chunk.choices: continue - + delta = chunk.choices[0].delta finish_reason = chunk.choices[0].finish_reason - + if delta.content: yield StreamingChunk( token=delta.content, @@ -635,7 +673,7 @@ async def stream_single_prompt(prompt: str) -> AsyncIterator[StreamingChunk]: except Exception as e: print(f"[SGLANG] Streaming error for prompt: {e}") raise - + return [stream_single_prompt(prompt) for prompt in prompt_strings] def shutdown(self) -> None: @@ -650,7 +688,7 @@ def shutdown(self) -> None: # Close clients self.client = None self.async_client = None - + # Clear async semaphore self._async_semaphore = None @@ -665,4 +703,4 @@ def shutdown(self) -> None: torch.cuda.empty_cache() self.is_initialized = False - print("[SGLANG] Backend shutdown complete") \ No newline at end of file + print("[SGLANG] Backend shutdown complete") diff --git a/language/deepseek-r1/backends/utils.py b/language/deepseek-r1/backends/utils.py index 0e4c7732da..ebd6ce3719 100644 --- a/language/deepseek-r1/backends/utils.py +++ b/language/deepseek-r1/backends/utils.py @@ -19,50 +19,50 @@ def get_cache_directory() -> Path: """ Get the cache directory at /raid/data/$USER/.cache - + Returns: Path: The cache directory path """ # Get the current user user = os.environ.get('USER', os.environ.get('USERNAME', 'unknown')) - + # Use /raid/data/$USER/.cache cache_dir = Path(f'/raid/data/{user}/.cache') - + # Create the cache directory if it doesn't exist cache_dir.mkdir(parents=True, exist_ok=True) - + return cache_dir def setup_huggingface_cache() -> Path: """ Set up HuggingFace cache environment variables using the preferred cache directory. - + Returns: Path: The cache directory being used """ cache_dir = get_cache_directory() - + # Set HuggingFace cache environment variables os.environ['HF_HOME'] = str(cache_dir) os.environ['HF_HUB_CACHE'] = str(cache_dir) os.environ['HUGGINGFACE_HUB_CACHE'] = str(cache_dir) - + return cache_dir def find_free_port(start_port: int = 30000, max_attempts: int = 100) -> int: """ Find a free port starting from start_port. 
- + Args: start_port: The port number to start searching from max_attempts: Maximum number of ports to try - + Returns: int: A free port number - + Raises: RuntimeError: If no free port is found after max_attempts """ @@ -75,13 +75,14 @@ def find_free_port(start_port: int = 30000, max_attempts: int = 100) -> int: return port except OSError: continue - raise RuntimeError(f"Could not find free port after {max_attempts} attempts starting from {start_port}") + raise RuntimeError( + f"Could not find free port after {max_attempts} attempts starting from {start_port}") def set_all_seeds(seed: int = 42) -> None: """ Set seeds for all random number generators for reproducibility. - + Args: seed: The seed value to use """ @@ -96,73 +97,76 @@ def set_all_seeds(seed: int = 42) -> None: set_seed(seed) -def validate_prompts(tokenized_prompts: Optional[list] = None, - text_prompts: Optional[list] = None, - backend_type: str = "") -> None: +def validate_prompts(tokenized_prompts: Optional[list] = None, + text_prompts: Optional[list] = None, + backend_type: str = "") -> None: """ Validate that at least one type of prompts is provided. - + Args: tokenized_prompts: List of tokenized prompts text_prompts: List of text prompts backend_type: Name of the backend for error messages - + Raises: ValueError: If neither prompt type is provided """ if tokenized_prompts is None and text_prompts is None: - raise ValueError(f"{backend_type + ' backend' if backend_type else 'Backend'} requires either text_prompts or tokenized_prompts") + raise ValueError( + f"{backend_type + ' backend' if backend_type else 'Backend'} requires either text_prompts or tokenized_prompts") # Terminal display utilities class TerminalDisplay: """ANSI escape codes and utilities for terminal display formatting.""" - + # ANSI escape codes for cursor control CLEAR_SCREEN = "\033[2J" MOVE_CURSOR_UP = "\033[{}A" CLEAR_LINE = "\033[K" SAVE_CURSOR = "\033[s" RESTORE_CURSOR = "\033[u" - + # Progress spinner characters PROGRESS_CHARS = ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏'] - + @staticmethod def clear_lines(num_lines: int) -> None: """Clear the specified number of lines above the cursor.""" - print(TerminalDisplay.MOVE_CURSOR_UP.format(num_lines), end='', flush=True) + print(TerminalDisplay.MOVE_CURSOR_UP.format( + num_lines), end='', flush=True) for _ in range(num_lines): print(TerminalDisplay.CLEAR_LINE) - print(TerminalDisplay.MOVE_CURSOR_UP.format(num_lines), end='', flush=True) - + print(TerminalDisplay.MOVE_CURSOR_UP.format( + num_lines), end='', flush=True) + @staticmethod def save_cursor_position() -> None: """Save the current cursor position.""" print(TerminalDisplay.SAVE_CURSOR, end='', flush=True) - + @staticmethod def restore_cursor_position() -> None: """Restore the previously saved cursor position.""" print(TerminalDisplay.RESTORE_CURSOR, end='', flush=True) - + @staticmethod def clear_current_line() -> None: """Clear the current line.""" print("\r" + " " * 80 + "\r", end='', flush=True) - + @staticmethod def truncate_line(line: str, max_length: int = 110) -> str: """Truncate a line to fit within the specified length.""" if len(line) <= max_length: return line - return line[:max_length - 3] + "..." + return line[:max_length - 3] + "..." 
class LogMonitor: """Real-time log file monitor with terminal display.""" - - def __init__(self, + + def __init__(self, log_file_path: Union[str, Path], prefix: str = "LOG", max_lines: int = 5, @@ -170,7 +174,7 @@ def __init__(self, header_text: Optional[str] = None): """ Initialize the log monitor. - + Args: log_file_path: Path to the log file to monitor prefix: Prefix for display lines (e.g., "[SGLANG]") @@ -183,42 +187,43 @@ def __init__(self, self.max_lines = max_lines self.display_interval = display_interval self.header_text = header_text or f"Server startup logs (last {max_lines} lines):" - + # Threading control self._monitor_thread = None self._stop_event = None self._ready_event = None - + # Display dimensions self.total_lines = max_lines + 3 # 2 header lines + 1 blank separator - - def start(self, wait_for_file: bool = True, file_wait_timeout: float = 30.0) -> bool: + + def start(self, wait_for_file: bool = True, + file_wait_timeout: float = 30.0) -> bool: """ Start the log monitor in a background thread. - + Args: wait_for_file: Whether to wait for the log file to exist file_wait_timeout: How long to wait for the file (seconds) - + Returns: bool: True if monitor started successfully """ if self._monitor_thread is not None: return True # Already running - + self._stop_event = threading.Event() self._ready_event = threading.Event() - + self._monitor_thread = threading.Thread( target=self._monitor_loop, args=(wait_for_file, file_wait_timeout), daemon=True ) self._monitor_thread.start() - + # Wait for the monitor to set up its display area return self._ready_event.wait(timeout=2.0) - + def stop(self) -> None: """Stop the log monitor and clean up display.""" if self._stop_event and self._monitor_thread: @@ -227,36 +232,39 @@ def stop(self) -> None: self._monitor_thread = None self._stop_event = None self._ready_event = None - - def _monitor_loop(self, wait_for_file: bool, file_wait_timeout: float) -> None: + + def _monitor_loop(self, wait_for_file: bool, + file_wait_timeout: float) -> None: """Main monitoring loop that runs in a separate thread.""" # Wait for log file if requested if wait_for_file: start_time = time.time() while not self.log_file_path.exists(): if time.time() - start_time > file_wait_timeout: - print(f"[{self.prefix}] Warning: Log file not found after {file_wait_timeout}s: {self.log_file_path}") + print( + f"[{self.prefix}] Warning: Log file not found after {file_wait_timeout}s: {self.log_file_path}") self._ready_event.set() return time.sleep(0.5) elif not self.log_file_path.exists(): - print(f"[{self.prefix}] Warning: Log file not found: {self.log_file_path}") + print( + f"[{self.prefix}] Warning: Log file not found: {self.log_file_path}") self._ready_event.set() return - + print(f"\n[{self.prefix}] Monitoring logs: {self.log_file_path.name}") - print(f"[{self.prefix}] " + "="*60) - + print(f"[{self.prefix}] " + "=" * 60) + # Initialize display area self._setup_display_area() - + # Signal that we're ready self._ready_event.set() - + # Buffer for log lines line_buffer = [] last_display_time = 0 - + try: # Use tail -f to follow the log file process = subprocess.Popen( @@ -267,11 +275,11 @@ def _monitor_loop(self, wait_for_file: bool, file_wait_timeout: float) -> None: bufsize=1, universal_newlines=True ) - + while not self._stop_event.is_set(): if process.poll() is not None: break - + # Read available lines without blocking line_added = False try: @@ -285,7 +293,7 @@ def _monitor_loop(self, wait_for_file: bool, file_wait_timeout: float) -> None: line_added = True else: 
break - except: + except BaseException: # Fallback for systems without select line = process.stdout.readline() if line: @@ -293,65 +301,69 @@ def _monitor_loop(self, wait_for_file: bool, file_wait_timeout: float) -> None: if len(line_buffer) > self.max_lines: line_buffer.pop(0) line_added = True - + # Update display if needed current_time = time.time() - if line_added or (current_time - last_display_time >= self.display_interval): + if line_added or ( + current_time - last_display_time >= self.display_interval): last_display_time = current_time self._update_display(line_buffer) - + time.sleep(0.1) - + # Clean up process.terminate() try: process.wait(timeout=2) except subprocess.TimeoutExpired: process.kill() - + except Exception as e: print(f"\n[{self.prefix}] Log monitor error: {e}") finally: self._cleanup_display() - + def _setup_display_area(self) -> None: """Reserve and initialize the display area.""" # Reserve space for _ in range(self.total_lines): print() - + # Move back up to start of reserved area - print(TerminalDisplay.MOVE_CURSOR_UP.format(self.total_lines), end='', flush=True) - + print(TerminalDisplay.MOVE_CURSOR_UP.format( + self.total_lines), end='', flush=True) + # Print initial display print(f"\r[{self.prefix}] {self.header_text}", end='') print(TerminalDisplay.CLEAR_LINE, flush=True) - print(f"\r[{self.prefix}] " + "-"*60, end='') + print(f"\r[{self.prefix}] " + "-" * 60, end='') print(TerminalDisplay.CLEAR_LINE, flush=True) - + # Print empty lines for _ in range(self.max_lines): print(f"\r[{self.prefix}] ", end='') print(TerminalDisplay.CLEAR_LINE, flush=True) - + # Print separator print(f"\r", end='') print(TerminalDisplay.CLEAR_LINE, flush=True) - + def _update_display(self, line_buffer: list) -> None: """Update the display with current log lines.""" # Save cursor position print(TerminalDisplay.SAVE_CURSOR, end='', flush=True) - - # Move to start of reserved area (cursor is on progress line, 1 below our area) - print(TerminalDisplay.MOVE_CURSOR_UP.format(self.total_lines + 1), end='', flush=True) - + + # Move to start of reserved area (cursor is on progress line, 1 below + # our area) + print(TerminalDisplay.MOVE_CURSOR_UP.format( + self.total_lines + 1), end='', flush=True) + # Print header print(f"\r[{self.prefix}] {self.header_text}", end='') print(TerminalDisplay.CLEAR_LINE, flush=True) - print(f"\r[{self.prefix}] " + "-"*60, end='') + print(f"\r[{self.prefix}] " + "-" * 60, end='') print(TerminalDisplay.CLEAR_LINE, flush=True) - + # Print log lines for i in range(self.max_lines): if i < len(line_buffer): @@ -360,22 +372,23 @@ def _update_display(self, line_buffer: list) -> None: else: print(f"\r[{self.prefix}] ", end='') print(TerminalDisplay.CLEAR_LINE, flush=True) - + # Print separator print(f"\r", end='') print(TerminalDisplay.CLEAR_LINE, flush=True) - + # Restore cursor position print(TerminalDisplay.RESTORE_CURSOR, end='', flush=True) - + def _cleanup_display(self) -> None: """Clean up the display area on exit.""" print(TerminalDisplay.SAVE_CURSOR, end='', flush=True) - print(TerminalDisplay.MOVE_CURSOR_UP.format(self.total_lines + 1), end='', flush=True) - + print(TerminalDisplay.MOVE_CURSOR_UP.format( + self.total_lines + 1), end='', flush=True) + # Clear all reserved lines for _ in range(self.total_lines): print(f"\r", end='') print(TerminalDisplay.CLEAR_LINE, flush=True) - - print(TerminalDisplay.RESTORE_CURSOR, end='', flush=True) \ No newline at end of file + + print(TerminalDisplay.RESTORE_CURSOR, end='', flush=True) diff --git 
a/language/deepseek-r1/backends/vllm_backend.py b/language/deepseek-r1/backends/vllm_backend.py index 4ac408042f..ec49227f41 100644 --- a/language/deepseek-r1/backends/vllm_backend.py +++ b/language/deepseek-r1/backends/vllm_backend.py @@ -148,9 +148,9 @@ def initialize(self) -> None: @require_initialized def generate(self, - tokenized_prompts: Optional[List[List[int]]] = None, - text_prompts: Optional[List[str]] = None, - **kwargs) -> List[Dict[str, Any]]: + tokenized_prompts: Optional[List[List[int]]] = None, + text_prompts: Optional[List[str]] = None, + **kwargs) -> List[Dict[str, Any]]: """Generate responses synchronously using LLM.generate(). Note: vLLM backend only accepts text_prompts parameter. @@ -177,11 +177,14 @@ def generate(self, if not completion.text: # Get the corresponding prompt for context prompt_idx = outputs.index(output) - prompt_preview = text_prompts[prompt_idx][:100] if len(text_prompts[prompt_idx]) > 100 else text_prompts[prompt_idx] - raise RuntimeError(f"Empty response received from vLLM for prompt: {prompt_preview}...") + prompt_preview = text_prompts[prompt_idx][:100] if len( + text_prompts[prompt_idx]) > 100 else text_prompts[prompt_idx] + raise RuntimeError( + f"Empty response received from vLLM for prompt: {prompt_preview}...") results.append({ - 'tokens': list(completion.token_ids), # Convert tuple to list for .copy() compatibility + # Convert tuple to list for .copy() compatibility + 'tokens': list(completion.token_ids), 'text': completion.text, 'finish_reason': completion.finish_reason }) @@ -190,9 +193,9 @@ def generate(self, @require_initialized def generate_async(self, - tokenized_prompts: Optional[List[List[int]]] = None, - text_prompts: Optional[List[str]] = None, - **kwargs) -> List[asyncio.Future]: + tokenized_prompts: Optional[List[List[int]]] = None, + text_prompts: Optional[List[str]] = None, + **kwargs) -> List[asyncio.Future]: """Generate responses asynchronously, returning futures immediately. Note: vLLM backend only accepts text_prompts parameter. 
@@ -245,11 +248,14 @@ def _generate_batch(self, text_prompts: List[str]) -> List[Dict[str, Any]]: if not completion.text: # Get the corresponding prompt for context prompt_idx = outputs.index(output) - prompt_preview = text_prompts[prompt_idx][:100] if len(text_prompts[prompt_idx]) > 100 else text_prompts[prompt_idx] - raise RuntimeError(f"Empty response received from vLLM for prompt: {prompt_preview}...") + prompt_preview = text_prompts[prompt_idx][:100] if len( + text_prompts[prompt_idx]) > 100 else text_prompts[prompt_idx] + raise RuntimeError( + f"Empty response received from vLLM for prompt: {prompt_preview}...") results.append({ - 'tokens': list(completion.token_ids), # Convert tuple to list for .copy() compatibility + # Convert tuple to list for .copy() compatibility + 'tokens': list(completion.token_ids), 'text': completion.text, 'finish_reason': completion.finish_reason }) @@ -265,7 +271,8 @@ def shutdown(self) -> None: # Access internal executor to ensure proper cleanup if self.llm.llm_engine is not None: try: - # This helps cleanup vLLM's internal Ray/multiprocessing resources + # This helps cleanup vLLM's internal Ray/multiprocessing + # resources del self.llm.llm_engine.model_executor except Exception as e: print(f"Warning: Failed to cleanup model executor: {e}") diff --git a/language/deepseek-r1/eval_accuracy.py b/language/deepseek-r1/eval_accuracy.py index 55647b7ca5..a0b546b600 100644 --- a/language/deepseek-r1/eval_accuracy.py +++ b/language/deepseek-r1/eval_accuracy.py @@ -45,11 +45,11 @@ # ============================================================================= def process_mlperf_log_accuracy(mlperf_log_file: Union[str, Path], - dataset_file: Union[str, Path], - checkpoint_path: str, - dtype: str = "int32", - output_dir: Optional[Union[str, Path]] = None, - base_filename: Optional[str] = None) -> Tuple[pd.DataFrame, str]: + dataset_file: Union[str, Path], + checkpoint_path: str, + dtype: str = "int32", + output_dir: Optional[Union[str, Path]] = None, + base_filename: Optional[str] = None) -> Tuple[pd.DataFrame, str]: """Process MLPerf log accuracy file and evaluate results. 
Args: @@ -68,7 +68,8 @@ def process_mlperf_log_accuracy(mlperf_log_file: Union[str, Path], dataset_file = Path(dataset_file) if not mlperf_log_file.exists(): - raise FileNotFoundError(f"MLPerf log file not found: {mlperf_log_file}") + raise FileNotFoundError( + f"MLPerf log file not found: {mlperf_log_file}") if not dataset_file.exists(): raise FileNotFoundError(f"Dataset file not found: {dataset_file}") @@ -86,7 +87,8 @@ def process_mlperf_log_accuracy(mlperf_log_file: Union[str, Path], ) logger.info("Tokenizer loaded successfully") except Exception as e: - raise RuntimeError(f"Failed to load tokenizer from {checkpoint_path}: {e}") + raise RuntimeError( + f"Failed to load tokenizer from {checkpoint_path}: {e}") # Load ground truth dataset try: @@ -99,14 +101,20 @@ def process_mlperf_log_accuracy(mlperf_log_file: Union[str, Path], elif 'ground_truth' in dataset_df.columns: ground_truths = dataset_df['ground_truth'].tolist() else: - raise ValueError("Dataset must contain 'gt_output' or 'ground_truth' column") + raise ValueError( + "Dataset must contain 'gt_output' or 'ground_truth' column") # Get other required columns with fallbacks if 'dataset' in dataset_df.columns: datasets = dataset_df['dataset'].tolist() elif 'metric' in dataset_df.columns: # Infer dataset from metric names - datasets = [metric.replace('_em', '').replace('_', '') for metric in dataset_df['metric'].tolist()] + datasets = [ + metric.replace( + '_em', + '').replace( + '_', + '') for metric in dataset_df['metric'].tolist()] else: datasets = ['unknown'] * len(ground_truths) @@ -138,7 +146,7 @@ def process_mlperf_log_accuracy(mlperf_log_file: Union[str, Path], # First, check if this is a JSON array format or newline-delimited JSON with open(mlperf_log_file, 'r') as f: first_line = f.readline().strip() - + if first_line == '[': # JSON array format - load the entire file logger.info("Detected JSON array format") @@ -146,8 +154,10 @@ def process_mlperf_log_accuracy(mlperf_log_file: Union[str, Path], try: mlperf_results = json.load(f) except json.JSONDecodeError as e: - # If full file parsing fails, try to parse line by line, skipping brackets - logger.warning(f"Failed to parse as complete JSON array: {e}") + # If full file parsing fails, try to parse line by line, + # skipping brackets + logger.warning( + f"Failed to parse as complete JSON array: {e}") logger.info("Attempting line-by-line parsing") mlperf_results = [] with open(mlperf_log_file, 'r') as f2: @@ -162,7 +172,8 @@ def process_mlperf_log_accuracy(mlperf_log_file: Union[str, Path], try: mlperf_results.append(json.loads(line)) except json.JSONDecodeError as e: - logger.warning(f"Failed to parse line {line_num}: {e}") + logger.warning( + f"Failed to parse line {line_num}: {e}") continue else: # Newline-delimited JSON format @@ -180,7 +191,7 @@ def process_mlperf_log_accuracy(mlperf_log_file: Union[str, Path], except json.JSONDecodeError as e: logger.warning(f"Failed to parse line {line_num}: {e}") continue - + logger.info(f"Loaded {len(mlperf_results)} MLPerf results") except Exception as e: raise RuntimeError(f"Failed to load MLPerf log file: {e}") @@ -220,7 +231,8 @@ def process_mlperf_log_accuracy(mlperf_log_file: Union[str, Path], questions_required.append(questions[qsl_idx]) except Exception as e: - logger.warning(f"Error processing entry with qsl_idx {qsl_idx}: {e}") + logger.warning( + f"Error processing entry with qsl_idx {qsl_idx}: {e}") continue if not preds_token_ids: @@ -271,7 +283,11 @@ def validate_dataframe(df: pd.DataFrame) -> None: if not 
isinstance(df, pd.DataFrame): raise ValueError("Input must be a pandas DataFrame") - required_cols = ['model_output', 'dataset', 'ground_truth', 'tok_model_output_len'] + required_cols = [ + 'model_output', + 'dataset', + 'ground_truth', + 'tok_model_output_len'] missing_cols = [col for col in required_cols if col not in df.columns] if missing_cols: raise ValueError(f"Missing required columns: {missing_cols}") @@ -390,7 +406,8 @@ def parse_code(text: str) -> Optional[str]: # Answer Evaluation Functions # ============================================================================= -def evaluate_multiple_choice(parsed: Optional[str], ground_truth: str, valid_options: str) -> bool: +def evaluate_multiple_choice( + parsed: Optional[str], ground_truth: str, valid_options: str) -> bool: """Evaluate multiple choice answer.""" if not parsed or not ground_truth: return False @@ -414,10 +431,12 @@ def evaluate_math500(parsed: Optional[str], ground_truth: str) -> bool: # Use sys.path approach for proper module importing workspace_path = os.path.dirname(os.path.abspath(__file__)) - prm800k_module_path = os.path.join(workspace_path, "submodules", "prm800k", "prm800k") + prm800k_module_path = os.path.join( + workspace_path, "submodules", "prm800k", "prm800k") if not os.path.exists(prm800k_module_path): - raise FileNotFoundError(f"PRM800K module not found at: {prm800k_module_path}") + raise FileNotFoundError( + f"PRM800K module not found at: {prm800k_module_path}") # Save current directory and sys.path original_cwd = os.getcwd() @@ -427,10 +446,10 @@ def evaluate_math500(parsed: Optional[str], ground_truth: str) -> bool: # Add prm800k module path to sys.path if prm800k_module_path not in sys.path: sys.path.insert(0, prm800k_module_path) - + # Change directory as some imports might use relative paths os.chdir(prm800k_module_path) - + # Now import should work from grading.grader import grade_answer result = grade_answer(given_answer=parsed, ground_truth=ground_truth) @@ -622,7 +641,8 @@ def process_row(row: pd.Series) -> Dict[str, Any]: } -def process_livecodebench_parallel(df: pd.DataFrame, group_indices: pd.Index) -> Tuple[int, int]: +def process_livecodebench_parallel( + df: pd.DataFrame, group_indices: pd.Index) -> Tuple[int, int]: """Process LiveCodeBench items in parallel.""" # Prepare work items work_items = [] @@ -726,7 +746,8 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: # Unified Evaluation Utilities # ============================================================================= -def print_evaluation_results(df_evaluated: pd.DataFrame, logger: Optional[logging.Logger] = None) -> Dict[str, Any]: +def print_evaluation_results(df_evaluated: pd.DataFrame, + logger: Optional[logging.Logger] = None) -> Dict[str, Any]: """Print evaluation results in a unified format. Args: @@ -762,8 +783,8 @@ def print_evaluation_results(df_evaluated: pd.DataFrame, logger: Optional[loggin def process_and_save_dataframe(df: pd.DataFrame, - output_dir: Optional[Union[str, Path]] = None, - base_filename: Optional[str] = None) -> Tuple[pd.DataFrame, str]: + output_dir: Optional[Union[str, Path]] = None, + base_filename: Optional[str] = None) -> Tuple[pd.DataFrame, str]: """Process dataframe for evaluation and save the results. 
Args: @@ -779,7 +800,8 @@ def process_and_save_dataframe(df: pd.DataFrame, # Determine output path if output_dir is None: - # Try to infer from existing path info in the dataframe or use current directory + # Try to infer from existing path info in the dataframe or use current + # directory output_dir = Path.cwd() else: output_dir = Path(output_dir) diff --git a/language/deepseek-r1/mlperf/__init__.py b/language/deepseek-r1/mlperf/__init__.py index 33b3154f6b..bfe95c35e3 100644 --- a/language/deepseek-r1/mlperf/__init__.py +++ b/language/deepseek-r1/mlperf/__init__.py @@ -16,8 +16,8 @@ __all__ = [ # SUTs - 'BaseSUT', - 'OfflineSUT', + 'BaseSUT', + 'OfflineSUT', 'ServerSUT', # QSL 'QuerySampleLibrary', @@ -26,4 +26,4 @@ 'prepare_mlperf_dataset', 'process_mlperf_results', 'create_mlperf_output_dataframe' -] \ No newline at end of file +] diff --git a/language/deepseek-r1/mlperf/base_sut.py b/language/deepseek-r1/mlperf/base_sut.py index 7249207aab..f1d32eb869 100644 --- a/language/deepseek-r1/mlperf/base_sut.py +++ b/language/deepseek-r1/mlperf/base_sut.py @@ -12,65 +12,65 @@ class BaseSUT(abc.ABC): """Base class for MLPerf inference System Under Test (SUT). - + This class defines the interface that all SUTs must implement for MLPerf inference benchmarks. It provides two main methods: - issue_queries: to enqueue prompt tokens - flush_queries: to await completion of all issued queries """ - + def __init__(self, name: str = "BaseSUT"): """Initialize the base SUT. - + Args: name: Name of the SUT for logging purposes """ self.name = name self.sut = None logger.info(f"Initializing {self.name}") - + @abc.abstractmethod def issue_queries(self, query_samples: List[lg.QuerySample]) -> None: """Issue queries to the SUT. - + This method should enqueue the provided query samples for processing. It should return immediately without waiting for completion. - + Args: query_samples: List of MLPerf LoadGen query samples to process """ raise NotImplementedError("Subclasses must implement issue_queries") - + @abc.abstractmethod def flush_queries(self) -> None: """Flush all pending queries. - + This method should wait for all previously issued queries to complete before returning. It's called by LoadGen to ensure all work is done. """ raise NotImplementedError("Subclasses must implement flush_queries") - + def start(self) -> lg.ConstructSUT: """Start the SUT and return the LoadGen SUT handle. 
- + Returns: LoadGen SUT handle for use with LoadGen """ self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries) logger.info(f"{self.name} started") return self.sut - + def stop(self) -> None: """Stop the SUT and clean up resources.""" if self.sut: lg.DestroySUT(self.sut) self.sut = None logger.info(f"{self.name} stopped") - + def __enter__(self): """Context manager entry.""" return self.start() - + def __exit__(self, exc_type, exc_val, exc_tb): """Context manager exit.""" - self.stop() \ No newline at end of file + self.stop() diff --git a/language/deepseek-r1/mlperf/offline_sut.py b/language/deepseek-r1/mlperf/offline_sut.py index db1c4feea1..00382f4660 100644 --- a/language/deepseek-r1/mlperf/offline_sut.py +++ b/language/deepseek-r1/mlperf/offline_sut.py @@ -44,11 +44,15 @@ def __init__(self, self.dataset_strings = dataset_strings # Determine backend type using registry - self.backend_name = getattr(backend, 'backend_name', type(backend).__name__.lower()) + self.backend_name = getattr( + backend, + 'backend_name', + type(backend).__name__.lower()) self.uses_text_prompts = uses_text_input(self.backend_name) if self.uses_text_prompts and dataset_strings is None: - raise ValueError(f"Backend {self.backend_name} requires text prompts but dataset_strings was not provided") + raise ValueError( + f"Backend {self.backend_name} requires text prompts but dataset_strings was not provided") # Async event loop and thread self.loop = None @@ -122,12 +126,15 @@ async def _process_all_queries_async(self): # Prepare prompts for batch processing (like run_eval.py) if self.uses_text_prompts: # Use text prompts for vLLM and SGLang - prompts = [self.dataset_strings[sample.index] for sample in query_samples] + prompts = [self.dataset_strings[sample.index] + for sample in query_samples] futures = self.backend.generate_async(text_prompts=prompts) else: # Use tokenized prompts for other backends - prompts = [self.dataset[sample.index] for sample in query_samples] - futures = self.backend.generate_async(tokenized_prompts=prompts) + prompts = [self.dataset[sample.index] + for sample in query_samples] + futures = self.backend.generate_async( + tokenized_prompts=prompts) logger.info(f"Got {len(futures)} futures from backend") @@ -136,7 +143,8 @@ async def _process_all_queries_async(self): indexed_futures = [(i, future) for i, future in enumerate(futures)] completed_indices = set() - # Use asyncio.wait with FIRST_COMPLETED to handle out-of-order completion + # Use asyncio.wait with FIRST_COMPLETED to handle out-of-order + # completion pending = {future for _, future in indexed_futures} while pending: @@ -153,12 +161,14 @@ async def _process_all_queries_async(self): break if original_idx is None: - logger.error("Could not find original index for completed future") + logger.error( + "Could not find original index for completed future") continue # Check for duplicate completion if original_idx in completed_indices: - logger.warning(f"Prompt {original_idx} completed multiple times!") + logger.warning( + f"Prompt {original_idx} completed multiple times!") continue try: @@ -174,36 +184,44 @@ async def _process_all_queries_async(self): await self._send_result_to_loadgen(sample, result) except Exception as e: - logger.error(f"Error processing prompt {original_idx}: {type(e).__name__}: {e}") + logger.error( + f"Error processing prompt {original_idx}: {type(e).__name__}: {e}") # Raise the error instead of handling empty responses - raise RuntimeError(f"Backend failed to generate tokens for prompt {original_idx}: 
{e}") + raise RuntimeError( + f"Backend failed to generate tokens for prompt {original_idx}: {e}") # Verify all results are populated if len(completed_indices) != len(futures): missing_count = len(futures) - len(completed_indices) - raise RuntimeError(f"Missing results: completed {len(completed_indices)} != {len(futures)} total ({missing_count} missing)") + raise RuntimeError( + f"Missing results: completed {len(completed_indices)} != {len(futures)} total ({missing_count} missing)") for i, result in enumerate(results): if result is None: raise RuntimeError(f"Missing result for prompt {i}") - logger.info(f"Completed all {len(completed_indices)} prompts successfully") + logger.info( + f"Completed all {len(completed_indices)} prompts successfully") except Exception as e: - logger.error(f"Error during batch processing: {type(e).__name__}: {e}") + logger.error( + f"Error during batch processing: {type(e).__name__}: {e}") import traceback traceback.print_exc() raise # Re-raise instead of sending empty responses - async def _send_result_to_loadgen(self, sample: lg.QuerySample, result: Dict[str, Any]): + async def _send_result_to_loadgen( + self, sample: lg.QuerySample, result: Dict[str, Any]): """Send a single result to LoadGen.""" try: # Validate that tokens exist - raise error if missing tokens = result.get('tokens') if tokens is None: - raise ValueError(f"Backend result missing 'tokens' key for query {sample.id}") + raise ValueError( + f"Backend result missing 'tokens' key for query {sample.id}") if not isinstance(tokens, (list, tuple)) or len(tokens) == 0: - raise ValueError(f"Backend returned empty or invalid tokens for query {sample.id}: {tokens}") + raise ValueError( + f"Backend returned empty or invalid tokens for query {sample.id}: {tokens}") # Create a copy of tokens before numpy conversion tokens_copy = tokens.copy() @@ -229,12 +247,15 @@ async def _send_result_to_loadgen(self, sample: lg.QuerySample, result: Dict[str # Send response to LoadGen lg.QuerySamplesComplete([response]) - logger.debug(f"Sent {n_tokens} tokens to LoadGen for query {sample.id}") + logger.debug( + f"Sent {n_tokens} tokens to LoadGen for query {sample.id}") except Exception as e: - logger.error(f"Error sending result to LoadGen for query {sample.id}: {e}") + logger.error( + f"Error sending result to LoadGen for query {sample.id}: {e}") # Raise the error instead of sending empty response - raise RuntimeError(f"Failed to send result to LoadGen for query {sample.id}: {e}") + raise RuntimeError( + f"Failed to send result to LoadGen for query {sample.id}: {e}") def _run_event_loop(self): """Run the async event loop in a separate thread.""" @@ -282,7 +303,8 @@ def get_results(self) -> List[Dict[str, Any]]: # Sort by index to maintain dataset order queried_indices = sorted(self.index_to_id.keys()) - logger.info(f"Retrieving results for {len(queried_indices)} queried samples") + logger.info( + f"Retrieving results for {len(queried_indices)} queried samples") # Process results in order of dataset indices using stored results for i in queried_indices: @@ -296,7 +318,8 @@ def get_results(self) -> List[Dict[str, Any]]: tokens = result['tokens'] output_text = result.get('text', '') if not output_text and self.backend.tokenizer: - output_text = self.backend.tokenizer.decode(result['tokens'], skip_special_tokens=True) + output_text = self.backend.tokenizer.decode( + result['tokens'], skip_special_tokens=True) ordered_results.append({ 'model_output': output_text, @@ -305,6 +328,7 @@ def get_results(self) -> List[Dict[str, Any]]: 
}) else: # No backend result for this sample - raise RuntimeError(f"No backend result stored for dataset index {i}, sample_id {sample_id}") + raise RuntimeError( + f"No backend result stored for dataset index {i}, sample_id {sample_id}") - return ordered_results \ No newline at end of file + return ordered_results diff --git a/language/deepseek-r1/mlperf/qsl.py b/language/deepseek-r1/mlperf/qsl.py index 59bc5e36a8..d4c9405a4e 100644 --- a/language/deepseek-r1/mlperf/qsl.py +++ b/language/deepseek-r1/mlperf/qsl.py @@ -7,12 +7,12 @@ class QuerySampleLibrary: """MLPerf QuerySampleLibrary implementation for single-process execution.""" - - def __init__(self, dataset: List[List[int]], dataset_strings: List[str], + + def __init__(self, dataset: List[List[int]], dataset_strings: List[str], name: str = "QSL"): """ Initialize QSL with dataset. - + Args: dataset: List of tokenized prompts dataset_strings: List of original prompt strings @@ -24,7 +24,7 @@ def __init__(self, dataset: List[List[int]], dataset_strings: List[str], self.perf_count = self.count self.name = name self.logger = logging.getLogger(__name__) - + # Create LoadGen QSL self.qsl = lg.ConstructQSL( self.count, @@ -33,7 +33,7 @@ def __init__(self, dataset: List[List[int]], dataset_strings: List[str], lambda x: None # UnloadSamplesFromRam ) self.logger.info(f"Created {self.name} with {self.count} samples") - + def __del__(self): """Cleanup QSL.""" if self.qsl is not None: @@ -43,12 +43,12 @@ def __del__(self): class DistributedQuerySampleLibrary: """QuerySampleLibrary for distributed execution (MPI/torchrun).""" - + def __init__(self, dataset: List[List[int]], dataset_strings: List[str], rank: int, world_size: int, name: str = "DistributedQSL"): """ Initialize distributed QSL. - + Args: dataset: List of tokenized prompts dataset_strings: List of original prompt strings @@ -64,10 +64,10 @@ def __init__(self, dataset: List[List[int]], dataset_strings: List[str], self.world_size = world_size self.name = name self.logger = logging.getLogger(__name__) - + # Track if this is rank zero explicitly self.is_rank_zero = (self.rank == 0) - + # Only rank 0 creates the actual QSL if self.is_rank_zero: self.qsl = lg.ConstructQSL( @@ -76,12 +76,13 @@ def __init__(self, dataset: List[List[int]], dataset_strings: List[str], lambda x: None, lambda x: None ) - self.logger.info(f"Created {self.name} with {self.count} samples on rank 0") + self.logger.info( + f"Created {self.name} with {self.count} samples on rank 0") else: self.qsl = None - + def __del__(self): """Cleanup QSL on rank 0.""" if self.is_rank_zero and self.qsl is not None: lg.DestroyQSL(self.qsl) - self.logger.info(f"{self.name} destroyed on rank 0") \ No newline at end of file + self.logger.info(f"{self.name} destroyed on rank 0") diff --git a/language/deepseek-r1/mlperf/server_sut.py b/language/deepseek-r1/mlperf/server_sut.py index 75699e208f..e3acb2bde8 100644 --- a/language/deepseek-r1/mlperf/server_sut.py +++ b/language/deepseek-r1/mlperf/server_sut.py @@ -69,11 +69,15 @@ def __init__(self, self.dataset_strings = dataset_strings # Determine backend type using registry - self.backend_name = getattr(backend, 'backend_name', type(backend).__name__.lower()) + self.backend_name = getattr( + backend, + 'backend_name', + type(backend).__name__.lower()) self.uses_text_prompts = uses_text_input(self.backend_name) if self.uses_text_prompts and dataset_strings is None: - raise ValueError(f"Backend {self.backend_name} requires text prompts but dataset_strings was not provided") + raise ValueError( + 
f"Backend {self.backend_name} requires text prompts but dataset_strings was not provided") # Async event loop and thread self.loop = None @@ -91,8 +95,6 @@ def __init__(self, self.all_results: Dict[int, Dict[str, Any]] = {} self.results_lock = asyncio.Lock() - - def issue_queries(self, query_samples: List[lg.QuerySample]) -> None: """Issue queries in streaming mode with batching.""" if not supports_streaming(): @@ -123,7 +125,8 @@ async def _start_streaming_query(self, query_info: QueryInfo) -> None: try: # Verify streaming support if not supports_streaming(): - raise RuntimeError(f"Backend {self.backend_name} does not support streaming required for server mode") + raise RuntimeError( + f"Backend {self.backend_name} does not support streaming required for server mode") # Prepare prompt based on backend type if self.uses_text_prompts: @@ -155,8 +158,10 @@ async def _start_streaming_query(self, query_info: QueryInfo) -> None: task.add_done_callback(self._remove_task_from_active) except Exception as e: - logger.error(f"Error starting stream for query {query_info.query_id}: {e}") - raise RuntimeError(f"Failed to start streaming for query {query_info.query_id}: {e}") + logger.error( + f"Error starting stream for query {query_info.query_id}: {e}") + raise RuntimeError( + f"Failed to start streaming for query {query_info.query_id}: {e}") def _remove_task_from_active(self, task: asyncio.Task) -> None: """Remove a completed task from the active set.""" @@ -181,7 +186,8 @@ async def _process_stream(self, state: StreamingQueryState) -> None: state.accumulated_tokens.extend(chunk.token_ids) # Report first token immediately for TTFT measurement - if not state.first_token_sent and (chunk.token or chunk.token_ids): + if not state.first_token_sent and ( + chunk.token or chunk.token_ids): state.first_token_time = current_time - state.start_time state.first_token_sent = True @@ -197,35 +203,43 @@ async def _process_stream(self, state: StreamingQueryState) -> None: except asyncio.CancelledError: # Task was cancelled, clean up gracefully - logger.debug(f"Stream processing cancelled for query {state.query_info.query_id}") - # Close the async generator properly (assume aclose exists in our containerized environment) + logger.debug( + f"Stream processing cancelled for query {state.query_info.query_id}") + # Close the async generator properly (assume aclose exists in our + # containerized environment) try: await state.stream_gen.aclose() except Exception: pass raise except Exception as e: - logger.error(f"Error processing stream for query {state.query_info.query_id}: {e}") - raise RuntimeError(f"Stream processing failed for query {state.query_info.query_id}: {e}") + logger.error( + f"Error processing stream for query {state.query_info.query_id}: {e}") + raise RuntimeError( + f"Stream processing failed for query {state.query_info.query_id}: {e}") finally: # Clean up active stream async with self.active_streams_lock: self.active_streams.pop(state.query_info.query_id, None) - async def _send_first_token_response(self, state: StreamingQueryState) -> None: + async def _send_first_token_response( + self, state: StreamingQueryState) -> None: """Send first token notification to LoadGen for TTFT measurement.""" - logger.debug(f"First token received for query {state.query_info.query_id} at {state.first_token_time:.3f}s") + logger.debug( + f"First token received for query {state.query_info.query_id} at {state.first_token_time:.3f}s") # Convert first tokens to proper format for LoadGen if state.accumulated_tokens: - 
output_tokens = np.ascontiguousarray(state.accumulated_tokens, dtype=np.int32) + output_tokens = np.ascontiguousarray( + state.accumulated_tokens, dtype=np.int32) else: # If no token IDs available, encode the text if hasattr(self.backend, 'tokenizer') and state.accumulated_text: tokens = self.backend.tokenizer.encode(state.accumulated_text) output_tokens = np.ascontiguousarray(tokens, dtype=np.int32) else: - raise RuntimeError(f"No token IDs available for first token response for query {state.query_info.query_id}") + raise RuntimeError( + f"No token IDs available for first token response for query {state.query_info.query_id}") output_seq_len = len(output_tokens) output_toks_ptr = output_tokens.ctypes.data if output_seq_len > 0 else 0 @@ -248,22 +262,25 @@ async def _send_final_response(self, state: StreamingQueryState) -> None: if state.accumulated_tokens: # Create a copy of tokens before numpy conversion tokens_to_send = state.accumulated_tokens.copy() - token_array = np.array(state.accumulated_tokens, dtype=np.int32) + token_array = np.array( + state.accumulated_tokens, dtype=np.int32) else: # If no tokens, encode the text - if hasattr(self.backend, 'tokenizer') and state.accumulated_text: - tokens = self.backend.tokenizer.encode(state.accumulated_text) + if hasattr(self.backend, + 'tokenizer') and state.accumulated_text: + tokens = self.backend.tokenizer.encode( + state.accumulated_text) # Create a copy of tokens before numpy conversion tokens_to_send = tokens.copy() token_array = np.array(tokens, dtype=np.int32) else: - raise RuntimeError(f"No tokens or tokenizer available for query {state.query_info.query_id}") + raise RuntimeError( + f"No tokens or tokenizer available for query {state.query_info.query_id}") # Validate we have tokens if len(token_array) == 0: - raise RuntimeError(f"No tokens generated for query {state.query_info.query_id}") - - + raise RuntimeError( + f"No tokens generated for query {state.query_info.query_id}") # Create LoadGen response response = lg.QuerySampleResponse( @@ -287,11 +304,14 @@ async def _send_final_response(self, state: StreamingQueryState) -> None: } self.all_results[state.query_info.query_id] = state.query_info.result - logger.debug(f"Sent {len(token_array)} tokens to LoadGen for query {state.query_info.query_id}") + logger.debug( + f"Sent {len(token_array)} tokens to LoadGen for query {state.query_info.query_id}") except Exception as e: - logger.error(f"Error sending final response for query {state.query_info.query_id}: {e}") - raise RuntimeError(f"Failed to send final response for query {state.query_info.query_id}: {e}") + logger.error( + f"Error sending final response for query {state.query_info.query_id}: {e}") + raise RuntimeError( + f"Failed to send final response for query {state.query_info.query_id}: {e}") def flush_queries(self) -> None: """Wait for all active streams to complete.""" @@ -313,13 +333,16 @@ async def wait_for_streams(): async with self.active_streams_lock: if self.active_streams: - logger.warning(f"Timeout: {len(self.active_streams)} streams still active") + logger.warning( + f"Timeout: {len(self.active_streams)} streams still active") # Run the wait task in the event loop if self.loop and not self.loop.is_closed(): - future = asyncio.run_coroutine_threadsafe(wait_for_streams(), self.loop) + future = asyncio.run_coroutine_threadsafe( + wait_for_streams(), self.loop) try: - future.result(timeout=310) # Slightly longer than internal timeout + # Slightly longer than internal timeout + future.result(timeout=310) except Exception 
as e: logger.error(f"Error waiting for streams to complete: {e}") @@ -352,7 +375,8 @@ async def cancel_all_tasks(): tasks_to_cancel = list(self.active_tasks) if tasks_to_cancel: - logger.info(f"Cancelling {len(tasks_to_cancel)} active streaming tasks...") + logger.info( + f"Cancelling {len(tasks_to_cancel)} active streaming tasks...") for task in tasks_to_cancel: task.cancel() @@ -365,7 +389,8 @@ async def cancel_all_tasks(): self.active_tasks.clear() # Run the cancellation in the event loop - future = asyncio.run_coroutine_threadsafe(cancel_all_tasks(), self.loop) + future = asyncio.run_coroutine_threadsafe( + cancel_all_tasks(), self.loop) try: future.result(timeout=10.0) # Give tasks time to cancel except Exception as e: @@ -405,10 +430,12 @@ def get_results(self) -> List[Dict[str, Any]]: # Only process results for samples that were actually queried # Sort by index to maintain dataset order queried_indices = sorted(index_to_result.keys()) - - logger.info(f"Retrieving results for {len(queried_indices)} queried samples") - # Process results in order of dataset indices using stored backend results + logger.info( + f"Retrieving results for {len(queried_indices)} queried samples") + + # Process results in order of dataset indices using stored backend + # results for i in queried_indices: result = index_to_result[i] @@ -416,7 +443,8 @@ def get_results(self) -> List[Dict[str, Any]]: tokens = result['tokens'] output_text = result.get('text', '') if not output_text and self.backend.tokenizer: - output_text = self.backend.tokenizer.decode(result['tokens'], skip_special_tokens=True) + output_text = self.backend.tokenizer.decode( + result['tokens'], skip_special_tokens=True) ordered_results.append({ 'model_output': output_text, @@ -424,4 +452,4 @@ def get_results(self) -> List[Dict[str, Any]]: 'tok_model_output_len': len(tokens) }) - return ordered_results \ No newline at end of file + return ordered_results diff --git a/language/deepseek-r1/mlperf/utils.py b/language/deepseek-r1/mlperf/utils.py index 973e46c201..f4fbedda41 100644 --- a/language/deepseek-r1/mlperf/utils.py +++ b/language/deepseek-r1/mlperf/utils.py @@ -7,15 +7,15 @@ from utils.tokenization import StandardTokenizer -def prepare_mlperf_dataset(input_file: str, - backend_name: Optional[str] = None, - tokenizer: StandardTokenizer = None, - num_samples: Optional[int] = None, - skip_samples: int = 0, - use_chat_template: Optional[bool] = None) -> Dict[str, Any]: +def prepare_mlperf_dataset(input_file: str, + backend_name: Optional[str] = None, + tokenizer: StandardTokenizer = None, + num_samples: Optional[int] = None, + skip_samples: int = 0, + use_chat_template: Optional[bool] = None) -> Dict[str, Any]: """ Prepare dataset for MLPerf inference. - + Args: input_file: Path to input pickle file backend_name: Optional backend name override. If None, uses MLPERF_BACKEND env var. 
@@ -24,29 +24,30 @@ def prepare_mlperf_dataset(input_file: str, num_samples: Number of samples to use skip_samples: Number of samples to skip use_chat_template: Whether to use chat template (if None, determined by registry) - + Returns: Dictionary with prepared dataset components """ if backend_name is None: from utils.backend_registry import detect_backend backend_name = detect_backend() - + # Load and validate dataset df = load_dataset(input_file, num_samples, skip_samples) validate_dataset(df) - + prompts = df['text_input'].tolist() print(f"[MLPerf] Loaded {len(prompts)} prompts from dataset") - + # Check if backend uses text prompts from registry uses_text_prompts = uses_text_input() - + # Determine chat template usage from registry if not specified if use_chat_template is None: use_chat_template = uses_chat_template() - print(f"[MLPerf] Using chat template from registry: {use_chat_template}") - + print( + f"[MLPerf] Using chat template from registry: {use_chat_template}") + if uses_text_prompts: print(f"[MLPerf] Backend {backend_name} uses text prompts directly") return { @@ -62,7 +63,7 @@ def prepare_mlperf_dataset(input_file: str, prompts, use_chat_template ) print(f"[MLPerf] Tokenized {len(tokenized_prompts)} prompts") - + return { 'dataframe': df, 'prompts': prompts, @@ -73,61 +74,63 @@ def prepare_mlperf_dataset(input_file: str, def process_mlperf_results(sut_results: List[Dict[str, Any]], - tokenizer: Optional[StandardTokenizer] = None, - backend_name: Optional[str] = None, - uses_text_prompts: Optional[bool] = None) -> List[Dict[str, Any]]: + tokenizer: Optional[StandardTokenizer] = None, + backend_name: Optional[str] = None, + uses_text_prompts: Optional[bool] = None) -> List[Dict[str, Any]]: """ Process MLPerf SUT results into standardized format. - + Args: sut_results: Raw results from MLPerf SUT tokenizer: StandardTokenizer for decoding backend_name: Optional backend name override. If None, uses MLPERF_BACKEND env var. (Kept for backward compatibility but not used in our codebase) uses_text_prompts: Whether backend uses text prompts (if None, determined by registry) - + Returns: List of processed result dictionaries """ from utils.tokenization import process_inference_results - + if backend_name is None: from utils.backend_registry import detect_backend backend_name = detect_backend() - + # Determine text prompt usage from registry if not specified if uses_text_prompts is None: uses_text_prompts = uses_text_input() - + # Reuse the general inference result processing - return process_inference_results(sut_results, tokenizer, uses_text_prompts=uses_text_prompts) + return process_inference_results( + sut_results, tokenizer, uses_text_prompts=uses_text_prompts) def create_mlperf_output_dataframe(input_df: pd.DataFrame, - results: List[Dict[str, Any]], - backend_name: Optional[str] = None) -> pd.DataFrame: + results: List[Dict[str, Any]], + backend_name: Optional[str] = None) -> pd.DataFrame: """ Create output dataframe with MLPerf results. - + Args: input_df: Input dataframe results: Processed MLPerf results backend_name: Optional backend name override. If None, uses MLPERF_BACKEND env var. 
(Kept for backward compatibility but not used in our codebase) - + Returns: Output dataframe with results """ if backend_name is None: from utils.backend_registry import detect_backend backend_name = detect_backend() - + df_output = input_df.copy() - + # Add result columns df_output['model_output'] = [r['model_output'] for r in results] df_output['tok_model_output'] = [r['tok_model_output'] for r in results] - df_output['tok_model_output_len'] = [r['tok_model_output_len'] for r in results] + df_output['tok_model_output_len'] = [ + r['tok_model_output_len'] for r in results] df_output['model_backend'] = backend_name - - return df_output \ No newline at end of file + + return df_output diff --git a/language/deepseek-r1/run_eval.py b/language/deepseek-r1/run_eval.py index 169b3473e4..8965101bd4 100755 --- a/language/deepseek-r1/run_eval.py +++ b/language/deepseek-r1/run_eval.py @@ -1,4 +1,13 @@ #!/usr/bin/env python3 +from utils import ( + load_dataset, save_results, validate_dataset, generate_timestamped_filename, + validate_runner_for_backend, uses_text_input, uses_chat_template, + StandardTokenizer, process_inference_results, + get_backend_instance, create_base_argument_parser, print_runner_header, + setup_output_paths, validate_runner_args, handle_runner_error, + validate_dataset_extended, supports_async +) +from backends import BaseBackend import argparse import asyncio import os @@ -11,59 +20,51 @@ # Disable tokenizers parallelism to avoid forking issues os.environ["TOKENIZERS_PARALLELISM"] = "false" -from backends import BaseBackend -from utils import ( - load_dataset, save_results, validate_dataset, generate_timestamped_filename, - validate_runner_for_backend, uses_text_input, uses_chat_template, - StandardTokenizer, process_inference_results, - get_backend_instance, create_base_argument_parser, print_runner_header, - setup_output_paths, validate_runner_args, handle_runner_error, - validate_dataset_extended, supports_async -) - def create_argument_parser() -> argparse.ArgumentParser: """Create argument parser with shared arguments only.""" parser = create_base_argument_parser( "Modular backend evaluation system for MLPerf DeepSeek reference implementation" ) - + # Add runner-specific arguments parser.add_argument("--async", action="store_true", - help="Use async generation instead of synchronous") - + help="Use async generation instead of synchronous") + return parser -async def run_async_inference(backend: BaseBackend, - tokenized_prompts: List[List[int]], - text_prompts: Optional[List[str]] = None) -> List[Dict[str, Any]]: +async def run_async_inference(backend: BaseBackend, + tokenized_prompts: List[List[int]], + text_prompts: Optional[List[str]] = None) -> List[Dict[str, Any]]: """Run async inference with proper error handling and progress bar that updates as tasks complete.""" try: # Get futures from backend if uses_text_input(): futures = backend.generate_async(text_prompts=text_prompts) else: - futures = backend.generate_async(tokenized_prompts=tokenized_prompts) - + futures = backend.generate_async( + tokenized_prompts=tokenized_prompts) + # Create a list to store results in order results = [None] * len(futures) - + # Create enumerated futures with their original indices for tracking indexed_futures = [(i, future) for i, future in enumerate(futures)] - + # Track completion for debugging completed_indices = set() - + # Process tasks with progress bar that updates as tasks complete with async_tqdm(total=len(futures), desc="Async inference", unit="prompt") as pbar: - # Use 
asyncio.wait with FIRST_COMPLETED to handle out-of-order completion + # Use asyncio.wait with FIRST_COMPLETED to handle out-of-order + # completion pending = {future for _, future in indexed_futures} - + while pending: # Wait for at least one future to complete done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED) - + # Process all completed futures in this batch for completed_future in done: # Find the original index for this completed future @@ -72,46 +73,51 @@ async def run_async_inference(backend: BaseBackend, if future is completed_future: original_idx = idx break - + if original_idx is None: - print(f"\nWarning: Could not find original index for completed future") + print( + f"\nWarning: Could not find original index for completed future") continue - + # Check for duplicate completion if original_idx in completed_indices: - print(f"\nWarning: Prompt {original_idx} completed multiple times!") + print( + f"\nWarning: Prompt {original_idx} completed multiple times!") continue - + try: # Get the result from the completed future result = await completed_future - + # Store the result in the correct position results[original_idx] = result completed_indices.add(original_idx) - + except Exception as e: - print(f"\nError processing prompt {original_idx}: {type(e).__name__}: {e}") + print( + f"\nError processing prompt {original_idx}: {type(e).__name__}: {e}") import traceback traceback.print_exception(type(e), e, e.__traceback__) - + # Raise the error instead of using empty tokens - raise RuntimeError(f"Backend failed to generate tokens for prompt {original_idx}: {e}") - + raise RuntimeError( + f"Backend failed to generate tokens for prompt {original_idx}: {e}") + # Update progress bar after each completion pbar.update(1) - + # Verify all results are populated if len(completed_indices) != len(futures): missing_count = len(futures) - len(completed_indices) - raise RuntimeError(f"Missing results: completed {len(completed_indices)} != {len(futures)} total ({missing_count} missing)") - + raise RuntimeError( + f"Missing results: completed {len(completed_indices)} != {len(futures)} total ({missing_count} missing)") + for i, result in enumerate(results): if result is None: raise RuntimeError(f"Missing result for prompt {i}") - + print(f"\nCompleted all {len(completed_indices)} prompts successfully") - + return results except Exception as e: print(f"Error during async inference: {type(e).__name__}: {e}") @@ -120,9 +126,9 @@ async def run_async_inference(backend: BaseBackend, raise -def run_sync_inference(backend: BaseBackend, - tokenized_prompts: List[List[int]], - text_prompts: Optional[List[str]] = None) -> List[Dict[str, Any]]: +def run_sync_inference(backend: BaseBackend, + tokenized_prompts: List[List[int]], + text_prompts: Optional[List[str]] = None) -> List[Dict[str, Any]]: """Run sync inference with proper error handling.""" try: if uses_text_input(): @@ -140,46 +146,52 @@ def main(): # Parse arguments parser = create_argument_parser() args = parser.parse_args() - + try: # Validate arguments validate_runner_args(args, 'eval') - + # Detect backend early backend_name = validate_runner_for_backend('eval') - + # Set up output paths output_dir, output_file = setup_output_paths(args) if args.output_file is None: args.output_file = output_file - - # Generate the actual filename with timestamp that will be used for saving - actual_output_file = generate_timestamped_filename(args.output_file, add_timestamp=True) - + + # Generate the actual filename with timestamp that will be 
used for + # saving + actual_output_file = generate_timestamped_filename( + args.output_file, add_timestamp=True) + # Get async flag using getattr since 'async' is a reserved keyword use_async = getattr(args, 'async', False) - + # Check if backend supports async if use_async and not supports_async(): - raise RuntimeError(f"Backend {backend_name} does not support async generation") - + raise RuntimeError( + f"Backend {backend_name} does not support async generation") + # Print header - print_runner_header("Modular Backend Evaluation System", backend_name, args) + print_runner_header( + "Modular Backend Evaluation System", + backend_name, + args) print(f"Mode: {'Async' if use_async else 'Sync'}") print("=" * 80) - + # Load and validate dataset df = load_dataset(args.input_file, args.num_samples, args.skip_samples) validate_dataset_extended(df) - + prompts = df['text_input'].tolist() - + # Initialize tokenizer tokenizer = StandardTokenizer() - + # Determine whether to use chat template based on registry use_chat_template = uses_chat_template() - + # For text-prompt backends, we'll pass the prompts directly # For tokenized-prompt backends, we need to tokenize first if uses_text_input(): @@ -195,19 +207,19 @@ def main(): ) print(f"Tokenized {len(tokenized_prompts)} prompts") print(f"Tokenizer Max length: {tokenizer.max_length}") - + # Initialize backend using registry print(f"\nInitializing {backend_name.upper()} backend...") backend = get_backend_instance(backend_name) - + with backend: # Create new output dataframe with only required columns df_output = pd.DataFrame() - + # Copy all columns from input dataframe first for col in df.columns: df_output[col] = df[col] - + # Run inference with appropriate prompt format if use_async: print("Running async inference...") @@ -217,26 +229,31 @@ def main(): print("Running sync inference...") raw_results = run_sync_inference( backend, tokenized_prompts, text_prompts=prompts) - + # Process raw results into standardized format using shared utility print("Processing results...") standardized_results = process_inference_results( raw_results, tokenizer ) - + # Add generated columns - df_output['model_output'] = [r['model_output'] for r in standardized_results] - df_output['tok_model_output'] = [r['tok_model_output'] for r in standardized_results] - df_output['tok_model_output_len'] = [r['tok_model_output_len'] for r in standardized_results] - df_output['model_backend'] = [r['model_backend'] for r in standardized_results] - + df_output['model_output'] = [r['model_output'] + for r in standardized_results] + df_output['tok_model_output'] = [r['tok_model_output'] + for r in standardized_results] + df_output['tok_model_output_len'] = [ + r['tok_model_output_len'] for r in standardized_results] + df_output['model_backend'] = [r['model_backend'] + for r in standardized_results] + # Save results - output_file = save_results(df_output, args.output_file, add_timestamp=True) - + output_file = save_results( + df_output, args.output_file, add_timestamp=True) + print(f"\nEvaluation completed successfully!") print(f"Results saved to: {output_file}") print(f"Output columns: {list(df_output.columns)}") - + except KeyboardInterrupt: print("\nEvaluation interrupted by user") sys.exit(1) @@ -245,4 +262,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/language/deepseek-r1/run_eval_mpi.py b/language/deepseek-r1/run_eval_mpi.py index 37425526e1..4edeae5f8f 100644 --- a/language/deepseek-r1/run_eval_mpi.py +++ 
b/language/deepseek-r1/run_eval_mpi.py @@ -1,4 +1,11 @@ #!/usr/bin/env python3 +from backends import BaseBackend +from utils.data_utils import load_dataset +from utils.validation import validate_runner_args, ValidationError +from utils.runner_utils import create_base_argument_parser, print_runner_header +from utils.backend_registry import uses_chat_template, get_backend_instance, detect_backend, validate_runner_for_backend +from utils import save_results, generate_timestamped_filename, StandardTokenizer +from backends.pytorch_backend import PyTorchBackend import os import sys import argparse @@ -11,13 +18,6 @@ # Import utilities and backend registry sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from backends.pytorch_backend import PyTorchBackend -from utils import save_results, generate_timestamped_filename, StandardTokenizer -from utils.backend_registry import uses_chat_template, get_backend_instance, detect_backend, validate_runner_for_backend -from utils.runner_utils import create_base_argument_parser, print_runner_header -from utils.validation import validate_runner_args, ValidationError -from utils.data_utils import load_dataset -from backends import BaseBackend def main( @@ -41,7 +41,7 @@ def main( # Detect backend from environment backend_name = detect_backend() - + # Validate backend validate_runner_for_backend('eval_mpi') @@ -49,7 +49,8 @@ def main( use_chat_template = uses_chat_template() # Generate the actual filename with timestamp that will be used for saving - actual_output_file = generate_timestamped_filename(output_pickle_path, add_timestamp=True) + actual_output_file = generate_timestamped_filename( + output_pickle_path, add_timestamp=True) if rank == 0: _print("=" * 80) @@ -63,13 +64,14 @@ def main( _print(f"Sample limit: {num_samples}") if skip_samples: _print(f"Skip samples: {skip_samples}") - _print(f"Chat template: {'enabled' if use_chat_template else 'disabled'} (from registry)") + _print( + f"Chat template: {'enabled' if use_chat_template else 'disabled'} (from registry)") _print("=" * 80) # Initialize PyTorch backend backend = PyTorchBackend() backend.initialize() - + # Initialize StandardTokenizer tokenizer = StandardTokenizer() @@ -82,12 +84,14 @@ def main( _print(f"Loading input DataFrame from {input_pickle_path}...") try: df_for_results = pd.read_pickle(input_pickle_path) - _print(f"Loaded DataFrame with {len(df_for_results)} rows and columns: {df_for_results.columns.tolist()}") - + _print( + f"Loaded DataFrame with {len(df_for_results)} rows and columns: {df_for_results.columns.tolist()}") + # Apply skip_samples if specified if skip_samples > 0: if skip_samples >= len(df_for_results): - _print(f"Error: skip_samples ({skip_samples}) is greater than or equal to total samples ({len(df_for_results)})") + _print( + f"Error: skip_samples ({skip_samples}) is greater than or equal to total samples ({len(df_for_results)})") backend.shutdown() if world_size > 1: dist.destroy_process_group() @@ -96,14 +100,15 @@ def main( df_for_results = df_for_results.iloc[skip_samples:].copy() # Reset index to ensure sequential indices starting from 0 df_for_results = df_for_results.reset_index(drop=True) - + # Apply num_samples limit if specified if num_samples is not None and num_samples < len(df_for_results): - _print(f"Limiting to first {num_samples} samples (out of {len(df_for_results)} total after skipping)") + _print( + f"Limiting to first {num_samples} samples (out of {len(df_for_results)} total after skipping)") df_for_results = 
df_for_results.head(num_samples).copy() # Reset index to ensure sequential indices starting from 0 df_for_results = df_for_results.reset_index(drop=True) - + except Exception as e: _print(f"Error loading input pickle file: {e}") backend.shutdown() @@ -119,21 +124,25 @@ def main( return prompts_text_list = df_for_results['text_input'].tolist() - _print(f"Extracted {len(prompts_text_list)} prompts from 'text_input' column.") + _print( + f"Extracted {len(prompts_text_list)} prompts from 'text_input' column.") # Pre-initialize output columns df_for_results['model_output'] = "" df_for_results['tok_model_output'] = None - df_for_results['tok_model_output'] = df_for_results['tok_model_output'].astype('object') + df_for_results['tok_model_output'] = df_for_results['tok_model_output'].astype( + 'object') df_for_results['tok_model_output_len'] = 0 df_for_results['model_backend'] = backend_name # Broadcast the number of prompts to all ranks if world_size > 1: if rank == 0: - num_prompts_tensor = torch.tensor(len(prompts_text_list), dtype=torch.long, device="cuda") + num_prompts_tensor = torch.tensor( + len(prompts_text_list), dtype=torch.long, device="cuda") else: - num_prompts_tensor = torch.empty(1, dtype=torch.long, device="cuda") + num_prompts_tensor = torch.empty( + 1, dtype=torch.long, device="cuda") dist.broadcast(num_prompts_tensor, src=0) num_total_prompts = num_prompts_tensor.item() else: @@ -148,13 +157,14 @@ def main( current_batch_prompt_tokens = None if rank == 0: - current_batch_prompt_texts = prompts_text_list[i:i+batch_size] + current_batch_prompt_texts = prompts_text_list[i:i + batch_size] # Tokenize on rank 0 using StandardTokenizer current_batch_prompt_tokens, _ = tokenizer.tokenize_prompts( current_batch_prompt_texts, use_chat_template ) - - _print(f"Processing batch {current_batch_num}, size {len(current_batch_prompt_tokens)}") + + _print( + f"Processing batch {current_batch_num}, size {len(current_batch_prompt_tokens)}") # All ranks call generate_batch_distributed generated_tokens_for_batch = backend.generate_batch_distributed( @@ -164,12 +174,14 @@ def main( if rank == 0: # Validate that we received valid tokens if not generated_tokens_for_batch: - raise RuntimeError(f"Backend returned empty tokens for batch {current_batch_num}") - + raise RuntimeError( + f"Backend returned empty tokens for batch {current_batch_num}") + for batch_idx, tokens in enumerate(generated_tokens_for_batch): if not isinstance(tokens, (list, tuple)) or len(tokens) == 0: - raise RuntimeError(f"Backend returned empty or invalid tokens for batch {current_batch_num}, item {batch_idx}: {tokens}") - + raise RuntimeError( + f"Backend returned empty or invalid tokens for batch {current_batch_num}, item {batch_idx}: {tokens}") + # Decode tokens to text using StandardTokenizer decoded_texts_for_batch = tokenizer.batch_decode( generated_tokens_for_batch @@ -183,23 +195,36 @@ def main( original_df_idx = start_index_in_df + batch_idx if original_df_idx < len(df_for_results): # Use at for assignments with list values - df_for_results.at[original_df_idx, 'model_output'] = decoded_texts_for_batch[batch_idx] - df_for_results.at[original_df_idx, 'tok_model_output'] = generated_tokens_for_batch[batch_idx] - df_for_results.at[original_df_idx, 'tok_model_output_len'] = len(generated_tokens_for_batch[batch_idx]) + df_for_results.at[original_df_idx, + 'model_output'] = decoded_texts_for_batch[batch_idx] + df_for_results.at[original_df_idx, + 'tok_model_output'] = generated_tokens_for_batch[batch_idx] + 
df_for_results.at[original_df_idx, 'tok_model_output_len'] = len( + generated_tokens_for_batch[batch_idx]) _print(f"Batch {current_batch_num} completed.") if rank == 0 and df_for_results is not None: _print(f"All batches processed. Saving results...") - + # Keep only required columns in the same order as run_eval.py - output_columns = ['text_input', 'ground_truth', 'question', 'dataset', 'model_output', 'tok_model_output', 'tok_model_output_len', 'model_backend'] + output_columns = [ + 'text_input', + 'ground_truth', + 'question', + 'dataset', + 'model_output', + 'tok_model_output', + 'tok_model_output_len', + 'model_backend'] # Filter to only columns that exist - output_columns = [col for col in output_columns if col in df_for_results.columns] + output_columns = [ + col for col in output_columns if col in df_for_results.columns] df_output = df_for_results[output_columns] - + try: - saved_file = save_results(df_output, output_pickle_path, add_timestamp=True) + saved_file = save_results( + df_output, output_pickle_path, add_timestamp=True) _print(f"Successfully saved results to {saved_file}") except Exception as e: _print(f"Error saving output pickle file: {e}") @@ -234,4 +259,4 @@ def main( args.output_file, args.num_samples, args.skip_samples, - ) \ No newline at end of file + ) diff --git a/language/deepseek-r1/run_mlperf.py b/language/deepseek-r1/run_mlperf.py index 7f484e725e..2345cf5b9b 100755 --- a/language/deepseek-r1/run_mlperf.py +++ b/language/deepseek-r1/run_mlperf.py @@ -1,4 +1,23 @@ #!/usr/bin/env python3 +from eval_accuracy import process_dataframe, print_evaluation_results, process_and_save_dataframe, process_mlperf_log_accuracy +from utils import ( + validate_runner_for_backend, uses_text_input, uses_chat_template, + load_dataset, save_results, print_runner_header, StandardTokenizer, + get_backend_instance, create_base_argument_parser, + setup_output_paths, validate_runner_args, handle_runner_error, + validate_dataset_extended, generate_timestamped_filename +) +from mlperf import ( + OfflineSUT, ServerSUT, BaseSUT, + QuerySampleLibrary, + prepare_mlperf_dataset, + process_mlperf_results, + create_mlperf_output_dataframe +) +from backends import BaseBackend +import pandas as pd +import numpy as np +import mlperf_loadgen as lg import argparse import json import logging @@ -10,26 +29,6 @@ # Disable tokenizers parallelism to avoid forking issues os.environ["TOKENIZERS_PARALLELISM"] = "false" -import mlperf_loadgen as lg -import numpy as np -import pandas as pd - -from backends import BaseBackend -from mlperf import ( - OfflineSUT, ServerSUT, BaseSUT, - QuerySampleLibrary, - prepare_mlperf_dataset, - process_mlperf_results, - create_mlperf_output_dataframe -) -from utils import ( - validate_runner_for_backend, uses_text_input, uses_chat_template, - load_dataset, save_results, print_runner_header, StandardTokenizer, - get_backend_instance, create_base_argument_parser, - setup_output_paths, validate_runner_args, handle_runner_error, - validate_dataset_extended, generate_timestamped_filename -) -from eval_accuracy import process_dataframe, print_evaluation_results, process_and_save_dataframe, process_mlperf_log_accuracy # Configure logging logging.basicConfig( @@ -47,39 +46,39 @@ def create_argument_parser() -> argparse.ArgumentParser: # Scenario selection (no backend argument, auto-detected) parser.add_argument("--mode", type=str, default="offline", - choices=["offline", "server"], - help="MLPerf scenario mode") + choices=["offline", "server"], + help="MLPerf scenario mode") # 
MLPerf configuration parser.add_argument("--mlperf-conf", type=str, default="/inference/mlperf.conf", - help="Path to MLPerf configuration file") + help="Path to MLPerf configuration file") parser.add_argument("--user-conf", type=str, default="mlperf/user.conf", - help="Path to user configuration file") + help="Path to user configuration file") parser.add_argument("--scenario", type=str, default=None, - choices=["Offline", "Server"], - help="MLPerf scenario (overrides --mode)") + choices=["Offline", "Server"], + help="MLPerf scenario (overrides --mode)") parser.add_argument("--accuracy", action="store_true", - help="Run accuracy mode instead of performance") + help="Run accuracy mode instead of performance") # Output configuration parser.add_argument("--output-dir", type=str, default="mlperf_results", - help="Directory for MLPerf output logs") + help="Directory for MLPerf output logs") parser.add_argument("--log-dir", type=str, default=None, - help="Directory for detailed logs") + help="Directory for detailed logs") return parser def configure_loadgen(scenario: str, - accuracy_mode: bool, - mlperf_conf: Optional[str] = None, - user_conf: Optional[str] = None, - log_dir: Optional[str] = None, - model_name: str = "deepseek-r1") -> lg.TestSettings: + accuracy_mode: bool, + mlperf_conf: Optional[str] = None, + user_conf: Optional[str] = None, + log_dir: Optional[str] = None, + model_name: str = "deepseek-r1") -> lg.TestSettings: """Configure LoadGen test settings. Args: @@ -119,9 +118,9 @@ def configure_loadgen(scenario: str, def run_loadgen_test(sut: Union[OfflineSUT, ServerSUT], - qsl: QuerySampleLibrary, - settings: lg.TestSettings, - log_settings: lg.LogSettings) -> None: + qsl: QuerySampleLibrary, + settings: lg.TestSettings, + log_settings: lg.LogSettings) -> None: """Run LoadGen test. 
Args: @@ -162,7 +161,8 @@ def main(): if args.log_dir: log_dir = Path(args.log_dir) else: - log_dir = output_dir / args.mode / ("accuracy" if args.accuracy else "performance") + log_dir = output_dir / args.mode / \ + ("accuracy" if args.accuracy else "performance") log_dir.mkdir(parents=True, exist_ok=True) # Set up output paths with mode information @@ -170,17 +170,21 @@ def main(): if args.output_file is None: # Create output file path in the log directory mode_str = "accuracy" if args.accuracy else "performance" - output_file_base = str(log_dir / f"{backend_name}_mlperf_{args.mode}_{mode_str}_output.pkl") + output_file_base = str( + log_dir / f"{backend_name}_mlperf_{args.mode}_{mode_str}_output.pkl") else: output_file_base = args.output_file - # Generate the actual filename with timestamp that will be used for saving - actual_output_file = generate_timestamped_filename(output_file_base, add_timestamp=True) + # Generate the actual filename with timestamp that will be used for + # saving + actual_output_file = generate_timestamped_filename( + output_file_base, add_timestamp=True) # Ensure the parent directory of the output file exists output_file_parent = Path(actual_output_file).parent output_file_parent.mkdir(parents=True, exist_ok=True) - logger.info(f"Ensured output file directory exists: {output_file_parent}") + logger.info( + f"Ensured output file directory exists: {output_file_parent}") logger.info("=" * 80) logger.info("MLPerf Inference Benchmark Runner (Async Pattern)") @@ -220,13 +224,14 @@ def main(): # For backends that use text prompts, we pass the processed strings # For tokenized backends, we pass the tokenized prompts if uses_text_prompts: - logger.info(f"Backend {backend_name} will use text prompts directly") + logger.info( + f"Backend {backend_name} will use text prompts directly") dataset_for_sut = tokenized_prompts strings_for_sut = processed_strings else: logger.info(f"Backend {backend_name} will use tokenized prompts") dataset_for_sut = tokenized_prompts - strings_for_sut = processed_strings # This is what gets used for generation now + strings_for_sut = processed_strings # This is what gets used for generation now # Create backend using registry logger.info(f"Initializing {backend_name} backend...") @@ -315,7 +320,8 @@ def main(): try: # Get results from SUT - must have valid results if not sut_results: - raise RuntimeError("No results available from SUT - backend failed to generate tokens") + raise RuntimeError( + "No results available from SUT - backend failed to generate tokens") # Process results using new utility processed_results = process_mlperf_results( @@ -347,16 +353,19 @@ def main(): mlperf_log_file = log_dir / "mlperf_log_accuracy.json" if mlperf_log_file.exists(): - logger.info(f"Found MLPerf log accuracy file: {mlperf_log_file}") + logger.info( + f"Found MLPerf log accuracy file: {mlperf_log_file}") logger.info("Using MLPerf log for accuracy evaluation...") # Get checkpoint path from backend configuration backend_config = get_backend_instance(backend_name).config # Determine checkpoint path based on backend type - if hasattr(get_backend_instance(backend_name), 'model_path'): + if hasattr(get_backend_instance( + backend_name), 'model_path'): # PyTorch backend has model_path - checkpoint_path = str(get_backend_instance(backend_name).model_path) + checkpoint_path = str( + get_backend_instance(backend_name).model_path) elif 'model' in backend_config: # Other backends use model name directly checkpoint_path = backend_config['model'] @@ -376,10 +385,13 @@ 
def main(): base_filename="mlperf_accuracy_evaluated.pkl" ) - logger.info(f"MLPerf accuracy evaluation saved to: {evaluated_file}") + logger.info( + f"MLPerf accuracy evaluation saved to: {evaluated_file}") else: - logger.info("No MLPerf log accuracy file found, using standard DataFrame evaluation...") - raise RuntimeError("No MLPerf log accuracy file found, using standard DataFrame evaluation...") + logger.info( + "No MLPerf log accuracy file found, using standard DataFrame evaluation...") + raise RuntimeError( + "No MLPerf log accuracy file found, using standard DataFrame evaluation...") # Ensure clean exit gc.collect() @@ -397,4 +409,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/language/deepseek-r1/run_mlperf_mpi.py b/language/deepseek-r1/run_mlperf_mpi.py index 66196645ed..176be41710 100755 --- a/language/deepseek-r1/run_mlperf_mpi.py +++ b/language/deepseek-r1/run_mlperf_mpi.py @@ -1,4 +1,37 @@ #!/usr/bin/env python3 +from eval_accuracy import process_dataframe, print_evaluation_results, process_and_save_dataframe, process_mlperf_log_accuracy +from utils.data_utils import ( + load_dataset, save_results, + generate_timestamped_filename +) +from utils.validation import ( + validate_runner_args, ValidationError, + validate_dataset_extended +) +from utils.backend_registry import ( + uses_chat_template, get_backend_instance, detect_backend, + validate_runner_for_backend +) +from utils.runner_utils import create_base_argument_parser, print_runner_header +from utils import ( + StandardTokenizer, + validate_dataset, + process_inference_results +) +from mlperf import ( + OfflineSUT, ServerSUT, BaseSUT, + DistributedQuerySampleLibrary, + prepare_mlperf_dataset, + process_mlperf_results, + create_mlperf_output_dataframe +) +from backends.pytorch_backend import PyTorchBackend +from transformers import AutoTokenizer +import torch.distributed as dist +import torch +import pandas as pd +import numpy as np +import mlperf_loadgen as lg import argparse import json import logging @@ -12,41 +45,6 @@ # Disable tokenizers parallelism to avoid forking issues os.environ["TOKENIZERS_PARALLELISM"] = "false" -import mlperf_loadgen as lg -import numpy as np -import pandas as pd -import torch -import torch.distributed as dist -from transformers import AutoTokenizer - -from backends.pytorch_backend import PyTorchBackend -from mlperf import ( - OfflineSUT, ServerSUT, BaseSUT, - DistributedQuerySampleLibrary, - prepare_mlperf_dataset, - process_mlperf_results, - create_mlperf_output_dataframe -) -from utils import ( - StandardTokenizer, - validate_dataset, - process_inference_results -) -from utils.runner_utils import create_base_argument_parser, print_runner_header -from utils.backend_registry import ( - uses_chat_template, get_backend_instance, detect_backend, - validate_runner_for_backend -) -from utils.validation import ( - validate_runner_args, ValidationError, - validate_dataset_extended -) -from utils.data_utils import ( - load_dataset, save_results, - generate_timestamped_filename -) -from eval_accuracy import process_dataframe, print_evaluation_results, process_and_save_dataframe, process_mlperf_log_accuracy - # Configure logging - only for rank 0 def setup_logging(rank: int): @@ -119,7 +117,7 @@ def issue_queries(self, query_samples: List[lg.QuerySample]) -> None: batch_size = self.backend.config['batch_size'] for i in range(0, len(query_samples), batch_size): - batch_samples = query_samples[i:i+batch_size] + batch_samples = query_samples[i:i + 
batch_size] # Prepare batch tokens batch_tokens = [] @@ -141,10 +139,12 @@ def issue_queries(self, query_samples: List[lg.QuerySample]) -> None: # Generate using distributed backend # This will broadcast to all ranks internally - generated_tokens = self.backend.generate_batch_distributed(batch_tokens) + generated_tokens = self.backend.generate_batch_distributed( + batch_tokens) # Process results and send to LoadGen - for j, (sample_id, tokens) in enumerate(zip(batch_ids, generated_tokens)): + for j, (sample_id, tokens) in enumerate( + zip(batch_ids, generated_tokens)): # Create a copy of tokens before numpy conversion tokens_copy = tokens.copy() @@ -219,7 +219,8 @@ def get_results(self) -> List[Dict[str, Any]]: # Decode tokens to get text output output_text = '' if self.backend.tokenizer: - output_text = self.backend.tokenizer.decode(tokens, skip_special_tokens=True) + output_text = self.backend.tokenizer.decode( + tokens, skip_special_tokens=True) ordered_results.append({ 'model_output': output_text, @@ -228,16 +229,16 @@ def get_results(self) -> List[Dict[str, Any]]: }) else: # Result exists but no tokens - this is an error - raise RuntimeError(f"No tokens in result for dataset index {i}, sample_id {sample_id}") + raise RuntimeError( + f"No tokens in result for dataset index {i}, sample_id {sample_id}") else: # No result for this index - this is an error - raise RuntimeError(f"No result for dataset index {i}, sample_id {sample_id}") + raise RuntimeError( + f"No result for dataset index {i}, sample_id {sample_id}") return ordered_results - - def create_argument_parser() -> argparse.ArgumentParser: """Create argument parser for distributed MLPerf runner.""" parser = argparse.ArgumentParser( @@ -247,44 +248,45 @@ def create_argument_parser() -> argparse.ArgumentParser: # Dataset arguments parser.add_argument("--input-file", type=str, - default="data/final_output.pkl", - help="Input pickle file with prompts") + default="data/final_output.pkl", + help="Input pickle file with prompts") # MLPerf configuration parser.add_argument("--mlperf-conf", type=str, default="/inference/mlperf.conf", - help="Path to MLPerf configuration file") + help="Path to MLPerf configuration file") parser.add_argument("--user-conf", type=str, default="mlperf/user.conf", - help="Path to user configuration file") + help="Path to user configuration file") parser.add_argument("--mode", type=str, default="offline", - choices=["offline", "server"], - help="MLPerf scenario mode (only offline supported for distributed)") + choices=["offline", "server"], + help="MLPerf scenario mode (only offline supported for distributed)") parser.add_argument("--accuracy", action="store_true", - help="Run accuracy mode instead of performance") + help="Run accuracy mode instead of performance") # Output configuration parser.add_argument("--output-dir", type=str, default="mlperf_results", - help="Directory for MLPerf output logs") + help="Directory for MLPerf output logs") parser.add_argument("--log-dir", type=str, default=None, - help="Directory for detailed logs") + help="Directory for detailed logs") parser.add_argument("--output-file", type=str, default=None, - help="Output pickle file path (auto-generated if not specified)") + help="Output pickle file path (auto-generated if not specified)") - # Note: --no-chat-template is removed (chat template usage determined by backend registry) + # Note: --no-chat-template is removed (chat template usage determined by + # backend registry) return parser def configure_loadgen(scenario: str, - 
accuracy_mode: bool, - mlperf_conf: Optional[str] = None, - user_conf: Optional[str] = None, - log_dir: Optional[str] = None, - model_name: str = "deepseek-r1") -> lg.TestSettings: + accuracy_mode: bool, + mlperf_conf: Optional[str] = None, + user_conf: Optional[str] = None, + log_dir: Optional[str] = None, + model_name: str = "deepseek-r1") -> lg.TestSettings: """Configure LoadGen test settings. Args: @@ -324,11 +326,11 @@ def configure_loadgen(scenario: str, def run_loadgen_test(sut: DistributedOfflineSUT, - qsl: DistributedQuerySampleLibrary, - settings: lg.TestSettings, - log_settings: lg.LogSettings, - rank: int, - logger) -> None: + qsl: DistributedQuerySampleLibrary, + settings: lg.TestSettings, + log_settings: lg.LogSettings, + rank: int, + logger) -> None: """Run LoadGen test (only on rank 0). Args: @@ -386,7 +388,8 @@ def main(): # Validate mode for distributed if args.mode != "offline": if rank == 0: - logger.error("Only offline mode is supported for distributed execution") + logger.error( + "Only offline mode is supported for distributed execution") sys.exit(1) # Create output directories (only rank 0) @@ -397,7 +400,8 @@ def main(): if args.log_dir: log_dir = Path(args.log_dir) else: - log_dir = output_dir / args.mode / ("accuracy" if args.accuracy else "performance") + log_dir = output_dir / args.mode / \ + ("accuracy" if args.accuracy else "performance") log_dir.mkdir(parents=True, exist_ok=True) # Determine output file path @@ -405,15 +409,18 @@ def main(): output_file_base = args.output_file else: mode_str = "accuracy" if args.accuracy else "performance" - output_file_base = str(log_dir / f"{backend_name}_mlperf_{args.mode}_{mode_str}_output.pkl") + output_file_base = str( + log_dir / f"{backend_name}_mlperf_{args.mode}_{mode_str}_output.pkl") # Generate the actual filename with timestamp - actual_output_file = generate_timestamped_filename(output_file_base, add_timestamp=True) + actual_output_file = generate_timestamped_filename( + output_file_base, add_timestamp=True) # Ensure the parent directory of the output file exists output_file_parent = Path(actual_output_file).parent output_file_parent.mkdir(parents=True, exist_ok=True) - logger.info(f"Ensured output file directory exists: {output_file_parent}") + logger.info( + f"Ensured output file directory exists: {output_file_parent}") logger.info("=" * 80) logger.info("MLPerf Inference Benchmark Runner (Distributed PyTorch)") @@ -425,7 +432,8 @@ def main(): logger.info(f"Input file: {args.input_file}") logger.info(f"Output directory: {output_dir}") logger.info(f"Output file: {actual_output_file}") - logger.info(f"Chat template: {'enabled' if use_chat_template else 'disabled'} (from registry)") + logger.info( + f"Chat template: {'enabled' if use_chat_template else 'disabled'} (from registry)") logger.info("=" * 80) else: log_dir = None @@ -460,7 +468,8 @@ def main(): tokenized_prompts = dataset_info['tokenized_prompts'] processed_strings = dataset_info['processed_strings'] - logger.info(f"Loaded {len(tokenized_prompts)} prompts from dataset") + logger.info( + f"Loaded {len(tokenized_prompts)} prompts from dataset") # Create SUT sut = DistributedOfflineSUT( @@ -511,7 +520,8 @@ def main(): if rank == 0: # Run test (only rank 0) logger.info("Running test...") - run_loadgen_test(sut, qsl, settings, log_settings, rank, logger) + run_loadgen_test( + sut, qsl, settings, log_settings, rank, logger) logger.info("Completed test...") # Ensure all queries are flushed and async operations complete @@ -524,7 +534,8 @@ def main(): 
dist.broadcast_object_list(exit_signal, src=0) else: # Non-rank 0 processes participate in distributed generation - # They wait for signals from rank 0 and participate in generate_batch_distributed + # They wait for signals from rank 0 and participate in + # generate_batch_distributed while True: # First, check if we should exit # We use a separate broadcast to signal exit @@ -536,7 +547,8 @@ def main(): break elif exit_check[0] == "generate": # Signal to participate in generation - # The actual batch tokens will be broadcast inside generate_batch_distributed + # The actual batch tokens will be broadcast inside + # generate_batch_distributed backend.generate_batch_distributed(None) # If exit_check[0] is None, continue waiting finally: @@ -563,9 +575,11 @@ def main(): try: # Get results from SUT (if available) - logger.info("Retrieving results from distributed SUT...") + logger.info( + "Retrieving results from distributed SUT...") sut_results = sut.get_results() - logger.info(f"Retrieved {len(sut_results)} results from distributed SUT") + logger.info( + f"Retrieved {len(sut_results)} results from distributed SUT") # Process results using new utility processed_results = process_mlperf_results( @@ -597,11 +611,19 @@ def main(): mlperf_log_file = log_dir / "mlperf_log_accuracy.json" if mlperf_log_file.exists(): - logger.info(f"Found MLPerf log accuracy file: {mlperf_log_file}") - logger.info("Using MLPerf log for accuracy evaluation...") - - # For PyTorch backend (only one supported in MPI), get model path - checkpoint_path = str(backend.model_path) if hasattr(backend, 'model_path') else backend.config.get('model_name', 'deepseek-ai/DeepSeek-R1') + logger.info( + f"Found MLPerf log accuracy file: {mlperf_log_file}") + logger.info( + "Using MLPerf log for accuracy evaluation...") + + # For PyTorch backend (only one supported in MPI), + # get model path + checkpoint_path = str( + backend.model_path) if hasattr( + backend, + 'model_path') else backend.config.get( + 'model_name', + 'deepseek-ai/DeepSeek-R1') # Process MLPerf log accuracy df_evaluated, evaluated_file = process_mlperf_log_accuracy( @@ -612,10 +634,13 @@ def main(): base_filename="mlperf_accuracy_evaluated.pkl" ) - logger.info(f"MLPerf accuracy evaluation saved to: {evaluated_file}") + logger.info( + f"MLPerf accuracy evaluation saved to: {evaluated_file}") else: - logger.info("No MLPerf log accuracy file found, using standard DataFrame evaluation...") - raise RuntimeError("No MLPerf log accuracy file found, using standard DataFrame evaluation...") + logger.info( + "No MLPerf log accuracy file found, using standard DataFrame evaluation...") + raise RuntimeError( + "No MLPerf log accuracy file found, using standard DataFrame evaluation...") except KeyboardInterrupt: if rank == 0: @@ -639,4 +664,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/language/deepseek-r1/utils/__init__.py b/language/deepseek-r1/utils/__init__.py index ce8d10e9fd..65f575da29 100644 --- a/language/deepseek-r1/utils/__init__.py +++ b/language/deepseek-r1/utils/__init__.py @@ -101,4 +101,4 @@ # Error handling 'handle_backend_error', 'handle_runner_error' -] \ No newline at end of file +] diff --git a/language/deepseek-r1/utils/backend_registry.py b/language/deepseek-r1/utils/backend_registry.py index 73aee587e6..aa8d62a18e 100644 --- a/language/deepseek-r1/utils/backend_registry.py +++ b/language/deepseek-r1/utils/backend_registry.py @@ -173,7 +173,8 @@ def validate_backend(backend: str) -> None: f"Unknown 
backend '{backend}'. Supported backends: {', '.join(supported_backends)}") -def _get_compatibility_error_message(backend: str, runner_type: str, compatible: List[str]) -> str: +def _get_compatibility_error_message( + backend: str, runner_type: str, compatible: List[str]) -> str: """ Generate error message for incompatible backend/runner combinations. @@ -401,7 +402,8 @@ def get_backend_instance(backend_name: Optional[str] = None): return backend_class() -def is_backend_compatible_with_runner(backend_name: Optional[str] = None, runner_type: str = None) -> bool: +def is_backend_compatible_with_runner( + backend_name: Optional[str] = None, runner_type: str = None) -> bool: """Check if a backend is compatible with a specific runner type. Args: @@ -441,7 +443,8 @@ def get_backend_env_vars(backend_name: Optional[str] = None) -> Dict[str, str]: # Get static env vars env_vars = BACKEND_REGISTRY[backend_name]['env_vars'].copy() - # Handle dynamic env vars (e.g., OMP_NUM_THREADS based on tensor_parallel_size) + # Handle dynamic env vars (e.g., OMP_NUM_THREADS based on + # tensor_parallel_size) if backend_name == 'vllm': config = get_backend_config(backend_name) env_vars['OMP_NUM_THREADS'] = str( @@ -461,4 +464,4 @@ def apply_backend_env_vars(backend_name: Optional[str] = None) -> None: env_vars = get_backend_env_vars(backend_name) for key, value in env_vars.items(): - os.environ[key] = value \ No newline at end of file + os.environ[key] = value diff --git a/language/deepseek-r1/utils/data_utils.py b/language/deepseek-r1/utils/data_utils.py index 80acb5c8ce..0eb4cd3dcd 100644 --- a/language/deepseek-r1/utils/data_utils.py +++ b/language/deepseek-r1/utils/data_utils.py @@ -15,54 +15,56 @@ from utils.validation import ValidationError, validate_dataset_extended -def generate_timestamped_filename(output_file: str, add_timestamp: bool = True) -> str: +def generate_timestamped_filename( + output_file: str, add_timestamp: bool = True) -> str: """ Generate the actual filename that will be used when saving, with timestamp if requested. - + Args: output_file: Base output file path add_timestamp: Whether to add timestamp to filename - + Returns: Actual filename that will be used for saving """ if not add_timestamp: return output_file - + timestamp_suffix = time.strftime("%Y%m%d_%H%M%S") base_name, ext = os.path.splitext(output_file) return f"{base_name}_{timestamp_suffix}{ext}" -def load_dataset(file_path: str, num_samples: Optional[int] = None, skip_samples: int = 0) -> pd.DataFrame: +def load_dataset( + file_path: str, num_samples: Optional[int] = None, skip_samples: int = 0) -> pd.DataFrame: """ Load dataset from pickle file. 
- + Args: file_path: Path to the pickle file num_samples: Optional limit on number of samples to load skip_samples: Number of samples to skip from the beginning - + Returns: Loaded DataFrame - + Raises: ValidationError: If file doesn't exist or validation fails Exception: If file can't be loaded """ if not os.path.exists(file_path): raise ValidationError(f"Input file not found: {file_path}") - + print(f"Loading dataset from {file_path}...") - + try: with open(file_path, "rb") as f: df = pd.read_pickle(f) except Exception as e: raise ValidationError(f"Failed to load dataset: {str(e)}") - + print(f"Loaded {len(df)} samples") - + # Skip samples if specified if skip_samples > 0: if skip_samples >= len(df): @@ -71,31 +73,33 @@ def load_dataset(file_path: str, num_samples: Optional[int] = None, skip_samples ) original_length = len(df) df = df.iloc[skip_samples:].reset_index(drop=True) - print(f"Skipped first {skip_samples} samples (from {original_length} total)") - + print( + f"Skipped first {skip_samples} samples (from {original_length} total)") + # Limit number of samples if specified if num_samples is not None: original_length = len(df) df = df.head(num_samples) - print(f"Limited to {len(df)} samples (from {original_length} total after skipping)") - + print( + f"Limited to {len(df)} samples (from {original_length} total after skipping)") + return df -def save_results(df: pd.DataFrame, - output_file: str, - add_timestamp: bool = True) -> str: +def save_results(df: pd.DataFrame, + output_file: str, + add_timestamp: bool = True) -> str: """ Save results DataFrame to pickle file. - + Args: df: DataFrame to save output_file: Output file path add_timestamp: Whether to add timestamp to filename - + Returns: Actual output file path used - + Raises: ValidationError: If save operation fails """ @@ -104,93 +108,99 @@ def save_results(df: pd.DataFrame, timestamp_suffix = time.strftime("%Y%m%d_%H%M%S") base_name, ext = os.path.splitext(output_file) output_file = f"{base_name}_{timestamp_suffix}{ext}" - + # Ensure output directory exists os.makedirs(os.path.dirname(output_file), exist_ok=True) - + print(f"Saving results to {output_file}...") - + # Reset index before saving df_to_save = df.reset_index(drop=True) - + try: with open(output_file, "wb") as f: pickle.dump(df_to_save, f) - print(f"Save completed: {len(df_to_save)} samples saved to {output_file}") + print( + f"Save completed: {len(df_to_save)} samples saved to {output_file}") except Exception as e: raise ValidationError(f"Failed to save results: {str(e)}") - + return output_file -def prepare_output_dataframe(input_df: pd.DataFrame, - backend_name: Optional[str] = None) -> pd.DataFrame: +def prepare_output_dataframe(input_df: pd.DataFrame, + backend_name: Optional[str] = None) -> pd.DataFrame: """ Prepare output DataFrame by cleaning up old columns. - + Args: input_df: Input DataFrame backend_name: Optional backend name override. If None, uses MLPERF_BACKEND env var. 
- + Returns: Cleaned DataFrame ready for new results """ if backend_name is None: from utils.backend_registry import detect_backend backend_name = detect_backend() - + df_output = input_df.copy() - + # Define columns to drop (old model outputs and unwanted columns) columns_to_drop = [ # specify columns to drop here ] - + # Also drop any existing backend-specific columns - backend_columns = [col for col in df_output.columns if col.startswith(f'{backend_name}_')] + backend_columns = [ + col for col in df_output.columns if col.startswith(f'{backend_name}_')] columns_to_drop.extend(backend_columns) - + # Drop columns that exist df_output = df_output.drop( columns=[col for col in columns_to_drop if col in df_output.columns] ) - + return df_output -def add_standardized_columns(df: pd.DataFrame, - results: List[Dict[str, Any]], - tokenized_prompts: List[List[int]] = None) -> pd.DataFrame: +def add_standardized_columns(df: pd.DataFrame, + results: List[Dict[str, Any]], + tokenized_prompts: List[List[int]] = None) -> pd.DataFrame: """ Add standardized output columns to DataFrame. - + Args: df: Input DataFrame results: List of result dictionaries from backend tokenized_prompts: List of tokenized input prompts (deprecated, not used) - + Returns: DataFrame with added standardized columns """ # Add results columns with new naming convention df['model_output'] = [r.get('model_output', '') for r in results] df['tok_model_output'] = [r.get('tok_model_output', []) for r in results] - df['tok_model_output_len'] = [r.get('tok_model_output_len', 0) for r in results] + df['tok_model_output_len'] = [ + r.get( + 'tok_model_output_len', + 0) for r in results] df['model_backend'] = [r.get('model_backend', '') for r in results] - + return df -def validate_dataset(df: pd.DataFrame, backend_name: Optional[str] = None) -> None: +def validate_dataset(df: pd.DataFrame, + backend_name: Optional[str] = None) -> None: """ Validate that the dataset has required columns. - + Args: df: DataFrame to validate backend_name: Optional backend name override. If None, uses MLPERF_BACKEND env var. - + Raises: ValidationError: If required columns are missing or validation fails """ # Use centralized validation function - validate_dataset_extended(df, backend_name) \ No newline at end of file + validate_dataset_extended(df, backend_name) diff --git a/language/deepseek-r1/utils/error_handling.py b/language/deepseek-r1/utils/error_handling.py index 54ca580135..6b588b9c20 100644 --- a/language/deepseek-r1/utils/error_handling.py +++ b/language/deepseek-r1/utils/error_handling.py @@ -5,17 +5,18 @@ from .validation import BackendError, ValidationError -def handle_backend_error(e: Exception, backend_name: str, operation: str) -> None: +def handle_backend_error(e: Exception, backend_name: str, + operation: str) -> None: """ Standardized error handling for backend operations. - + Args: e: The exception that occurred backend_name: Name of the backend operation: Description of the operation that failed """ error_msg = f"\n[{backend_name.upper()}] Error during {operation}: {type(e).__name__}: {str(e)}" - + if isinstance(e, (RuntimeError, ValueError)): # Known errors - just print the message print(error_msg) @@ -28,7 +29,7 @@ def handle_backend_error(e: Exception, backend_name: str, operation: str) -> Non def handle_runner_error(e: Exception, runner_name: str) -> None: """ Standardized error handling for runners. 
- + Args: e: The exception that occurred runner_name: Name of the runner @@ -45,4 +46,4 @@ def handle_runner_error(e: Exception, runner_name: str) -> None: else: print(f"\n{runner_name} failed: {e}") traceback.print_exc() - sys.exit(1) \ No newline at end of file + sys.exit(1) diff --git a/language/deepseek-r1/utils/runner_utils.py b/language/deepseek-r1/utils/runner_utils.py index 8c90deb515..fc2d4ad7f2 100644 --- a/language/deepseek-r1/utils/runner_utils.py +++ b/language/deepseek-r1/utils/runner_utils.py @@ -12,29 +12,31 @@ def create_base_argument_parser(description: str) -> argparse.ArgumentParser: description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter ) - + # Common dataset arguments parser.add_argument("--input-file", type=str, - default="data/final_output.pkl", - help="Input pickle file with prompts") - + default="data/final_output.pkl", + help="Input pickle file with prompts") + parser.add_argument("--output-file", type=str, default=None, - help="Output pickle file path (auto-generated if not specified)") - + help="Output pickle file path (auto-generated if not specified)") + parser.add_argument("--num-samples", type=int, default=None, - help="Number of samples to process from dataset") - + help="Number of samples to process from dataset") + parser.add_argument("--skip-samples", type=int, default=0, - help="Number of samples to skip from the beginning") - - # NOTE: --no-chat-template flag is NOT included (chat template usage determined by backend registry) - + help="Number of samples to skip from the beginning") + + # NOTE: --no-chat-template flag is NOT included (chat template usage + # determined by backend registry) + return parser -def print_runner_header(runner_name: str, backend_name: Optional[str] = None, args: argparse.Namespace = None) -> None: +def print_runner_header( + runner_name: str, backend_name: Optional[str] = None, args: argparse.Namespace = None) -> None: """Print standardized header for runners. - + Args: runner_name: Name of the runner backend_name: Optional backend name override. If None, uses MLPERF_BACKEND env var. @@ -43,7 +45,7 @@ def print_runner_header(runner_name: str, backend_name: Optional[str] = None, ar if backend_name is None: from .backend_registry import detect_backend backend_name = detect_backend() - + print("=" * 80) print(f"{runner_name}") print("=" * 80) @@ -59,22 +61,23 @@ def print_runner_header(runner_name: str, backend_name: Optional[str] = None, ar print("=" * 80) -def setup_output_paths(args: argparse.Namespace, backend_name: Optional[str] = None, mode: Optional[str] = None) -> Tuple[Path, str]: +def setup_output_paths(args: argparse.Namespace, + backend_name: Optional[str] = None, mode: Optional[str] = None) -> Tuple[Path, str]: """ Set up output directories and file paths. - + Args: args: Parsed command line arguments backend_name: Optional backend name override. If None, uses MLPERF_BACKEND env var. 
mode: Optional mode (e.g., 'offline', 'server' for MLPerf) - + Returns: Tuple of (output_dir, output_file_path) """ if backend_name is None: from .backend_registry import detect_backend backend_name = detect_backend() - + # Determine output directory if hasattr(args, 'output_dir') and args.output_dir: output_dir = Path(args.output_dir) @@ -84,9 +87,9 @@ def setup_output_paths(args: argparse.Namespace, backend_name: Optional[str] = N output_dir = Path(f"outputs/{backend_name}/{mode}") else: output_dir = Path(f"outputs/{backend_name}") - + output_dir.mkdir(parents=True, exist_ok=True) - + # Determine output file path if args.output_file: output_file = args.output_file @@ -97,10 +100,13 @@ def setup_output_paths(args: argparse.Namespace, backend_name: Optional[str] = N suffix = f"_{args.num_samples}samples" else: suffix = "_full" - + if mode: - output_file = str(output_dir / f"{backend_name}_{mode}_output_{timestamp}{suffix}.pkl") + output_file = str( + output_dir / + f"{backend_name}_{mode}_output_{timestamp}{suffix}.pkl") else: - output_file = str(output_dir / f"{backend_name}_output_{timestamp}{suffix}.pkl") - - return output_dir, output_file \ No newline at end of file + output_file = str(output_dir / + f"{backend_name}_output_{timestamp}{suffix}.pkl") + + return output_dir, output_file diff --git a/language/deepseek-r1/utils/tokenization.py b/language/deepseek-r1/utils/tokenization.py index c5fa77d69d..ec67e1e2eb 100644 --- a/language/deepseek-r1/utils/tokenization.py +++ b/language/deepseek-r1/utils/tokenization.py @@ -7,15 +7,15 @@ class StandardTokenizer: """Standard tokenizer for DeepSeek models.""" - + # Standard configuration used across all runners DEFAULT_MODEL = "deepseek-ai/DeepSeek-R1" DEFAULT_MAX_LENGTH = 32 * 1024 - + def __init__(self, model_name: str = None, max_length: int = None): """ Initialize tokenizer. - + Args: model_name: HuggingFace model name max_length: Maximum sequence length @@ -23,50 +23,54 @@ def __init__(self, model_name: str = None, max_length: int = None): self.model_name = model_name or self.DEFAULT_MODEL self.max_length = max_length or self.DEFAULT_MAX_LENGTH self._tokenizer = None - + @property def tokenizer(self): """Lazy load tokenizer.""" if self._tokenizer is None: print(f"Loading tokenizer: {self.model_name}") - self._tokenizer = AutoTokenizer.from_pretrained(self.model_name, revision="56d4cbbb4d29f4355bab4b9a39ccb717a14ad5ad") + self._tokenizer = AutoTokenizer.from_pretrained( + self.model_name, revision="56d4cbbb4d29f4355bab4b9a39ccb717a14ad5ad") return self._tokenizer - - def tokenize_prompts(self, prompts: List[str], - use_chat_template: Optional[bool] = None, - backend_name: Optional[str] = None) -> Tuple[List[List[int]], List[str]]: + + def tokenize_prompts(self, prompts: List[str], + use_chat_template: Optional[bool] = None, + backend_name: Optional[str] = None) -> Tuple[List[List[int]], List[str]]: """ Tokenize prompts with backend-specific handling. - + Args: prompts: List of text prompts use_chat_template: Whether to use chat template (if None and backend_name provided, uses registry) backend_name: Optional backend name override. If None, uses MLPERF_BACKEND env var. 
- + Returns: Tuple of (tokenized_prompts, processed_strings) """ # Auto-detect backend if not provided if backend_name is None: backend_name = detect_backend() - + # Determine chat template usage from registry if backend_name provided if use_chat_template is None: use_chat_template = uses_chat_template(backend_name) - print(f"[{backend_name}] Using chat template from registry: {use_chat_template}") - + print( + f"[{backend_name}] Using chat template from registry: {use_chat_template}") + tokenized = [] processed_strings = [] - + for prompt in prompts: - if use_chat_template and hasattr(self.tokenizer, 'apply_chat_template'): + if use_chat_template and hasattr( + self.tokenizer, 'apply_chat_template'): tokens = self.tokenizer.apply_chat_template( [{"role": "user", "content": prompt}], add_generation_prompt=True, max_length=self.max_length, truncation=True ) - processed_string = self.tokenizer.decode(tokens, skip_special_tokens=False) + processed_string = self.tokenizer.decode( + tokens, skip_special_tokens=False) else: tokens = self.tokenizer.encode( prompt, @@ -74,49 +78,52 @@ def tokenize_prompts(self, prompts: List[str], max_length=self.max_length ) processed_string = prompt - + tokenized.append(tokens) processed_strings.append(processed_string) - + return tokenized, processed_strings - - def decode_tokens(self, tokens: List[int], skip_special_tokens: bool = True) -> str: + + def decode_tokens(self, tokens: List[int], + skip_special_tokens: bool = True) -> str: """Decode tokens to text.""" - return self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens) - - def batch_decode(self, token_lists: List[List[int]], - skip_special_tokens: bool = True) -> List[str]: + return self.tokenizer.decode( + tokens, skip_special_tokens=skip_special_tokens) + + def batch_decode(self, token_lists: List[List[int]], + skip_special_tokens: bool = True) -> List[str]: """Batch decode multiple token lists.""" - return self.tokenizer.batch_decode(token_lists, skip_special_tokens=skip_special_tokens) + return self.tokenizer.batch_decode( + token_lists, skip_special_tokens=skip_special_tokens) -def process_inference_results(raw_results: List[dict], - tokenizer: Optional[StandardTokenizer] = None, - backend_name: Optional[str] = None, - uses_text_prompts: bool = False) -> List[dict]: +def process_inference_results(raw_results: List[dict], + tokenizer: Optional[StandardTokenizer] = None, + backend_name: Optional[str] = None, + uses_text_prompts: bool = False) -> List[dict]: """ Process raw inference results into standardized format. - + Args: raw_results: Raw results from backend tokenizer: Tokenizer for decoding backend_name: Optional backend name override. If None, uses MLPERF_BACKEND env var. 
uses_text_prompts: Whether backend uses text prompts - + Returns: List of standardized result dictionaries """ # Auto-detect backend if not provided if backend_name is None: backend_name = detect_backend() - + if backend_name not in get_supported_backends(): raise ValueError(f"Backend {backend_name} is not supported") - + backend_config = get_backend_config(backend_name) - + standardized_results = [] - + for raw_result in raw_results: # Handle text-prompt backends if uses_text_prompts and 'text' in raw_result: @@ -129,9 +136,9 @@ def process_inference_results(raw_results: List[dict], if tokenizer and tokens: try: text = tokenizer.decode_tokens(tokens) - except: + except BaseException: pass - + standardized = { 'model_output': text, 'tok_model_output': tokens, @@ -139,5 +146,5 @@ def process_inference_results(raw_results: List[dict], 'model_backend': backend_name, } standardized_results.append(standardized) - - return standardized_results \ No newline at end of file + + return standardized_results diff --git a/language/deepseek-r1/utils/validation.py b/language/deepseek-r1/utils/validation.py index 29bebef4f1..768427ada1 100644 --- a/language/deepseek-r1/utils/validation.py +++ b/language/deepseek-r1/utils/validation.py @@ -12,8 +12,10 @@ class BackendError(RuntimeError): class BackendNotInitializedError(BackendError): """Raised when backend operation is called before initialization.""" + def __init__(self, backend_name: str = "Backend"): - super().__init__(f"{backend_name} not initialized. Call initialize() first.") + super().__init__( + f"{backend_name} not initialized. Call initialize() first.") class ValidationError(ValueError): @@ -33,9 +35,9 @@ def wrapper(self, *args, **kwargs): def validate_prompts_input(backend_name: Optional[str] = None, - tokenized_prompts: Optional[List[List[int]]] = None, - text_prompts: Optional[List[str]] = None, - input_type: str = None) -> None: + tokenized_prompts: Optional[List[List[int]]] = None, + text_prompts: Optional[List[str]] = None, + input_type: str = None) -> None: """ Centralized prompt validation with backend-specific requirements. 
@@ -53,13 +55,16 @@ def validate_prompts_input(backend_name: Optional[str] = None, backend_name = detect_backend() if tokenized_prompts is None and text_prompts is None: - raise ValidationError(f"{backend_name} backend requires either text_prompts or tokenized_prompts") + raise ValidationError( + f"{backend_name} backend requires either text_prompts or tokenized_prompts") if input_type == 'text' and tokenized_prompts is not None and text_prompts is None: - raise ValidationError(f"{backend_name} backend requires text_prompts, not tokenized_prompts") + raise ValidationError( + f"{backend_name} backend requires text_prompts, not tokenized_prompts") if input_type == 'tokenized' and text_prompts is not None and tokenized_prompts is None: - raise ValidationError(f"{backend_name} backend requires tokenized_prompts, not text_prompts") + raise ValidationError( + f"{backend_name} backend requires tokenized_prompts, not text_prompts") # Additional validation for tokenized prompts if tokenized_prompts is not None: @@ -67,9 +72,11 @@ def validate_prompts_input(backend_name: Optional[str] = None, raise ValidationError("tokenized_prompts cannot be empty") for i, prompt in enumerate(tokenized_prompts): if not isinstance(prompt, list): - raise ValidationError(f"tokenized_prompts[{i}] must be a list of integers") + raise ValidationError( + f"tokenized_prompts[{i}] must be a list of integers") if not prompt: - raise ValidationError(f"tokenized_prompts[{i}] cannot be empty") + raise ValidationError( + f"tokenized_prompts[{i}] cannot be empty") # Additional validation for text prompts if text_prompts is not None: @@ -81,8 +88,8 @@ def validate_prompts_input(backend_name: Optional[str] = None, def validate_dataset_extended(df: pd.DataFrame, - backend_name: Optional[str] = None, - required_columns: Optional[List[str]] = None) -> None: + backend_name: Optional[str] = None, + required_columns: Optional[List[str]] = None) -> None: """ Extended dataset validation with backend-specific requirements. 
@@ -101,9 +108,11 @@ def validate_dataset_extended(df: pd.DataFrame, if required_columns is None: required_columns = ['text_input'] - missing_columns = [col for col in required_columns if col not in df.columns] + missing_columns = [ + col for col in required_columns if col not in df.columns] if missing_columns: - raise ValidationError(f"Dataset missing required columns: {missing_columns}") + raise ValidationError( + f"Dataset missing required columns: {missing_columns}") # Check for empty prompts empty_prompts = df['text_input'].isna().sum() @@ -118,7 +127,8 @@ def validate_dataset_extended(df: pd.DataFrame, config = get_backend_config(backend_name) # Add backend-specific validation based on config if needed - print(f"Dataset validation passed: {len(df)} samples with required columns") + print( + f"Dataset validation passed: {len(df)} samples with required columns") def validate_runner_args(args: argparse.Namespace, runner_type: str) -> None: @@ -133,7 +143,8 @@ def validate_runner_args(args: argparse.Namespace, runner_type: str) -> None: ValidationError: If validation fails """ # Common validations - if hasattr(args, 'num_samples') and args.num_samples is not None and args.num_samples <= 0: + if hasattr( + args, 'num_samples') and args.num_samples is not None and args.num_samples <= 0: raise ValidationError("--num-samples must be positive") if hasattr(args, 'skip_samples') and args.skip_samples < 0: @@ -142,4 +153,5 @@ def validate_runner_args(args: argparse.Namespace, runner_type: str) -> None: # Runner-specific validations if runner_type in ['mlperf', 'mlperf_mpi']: if hasattr(args, 'mode') and args.mode not in ['offline', 'server']: - raise ValidationError(f"Invalid mode: {args.mode}. Must be 'offline' or 'server'") \ No newline at end of file + raise ValidationError( + f"Invalid mode: {args.mode}. Must be 'offline' or 'server'") diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index fa3ae51514..40c83eecc5 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -482,7 +482,7 @@ "rgat": 788379, "pointpainting": 1024, }, - "dataset-size" : { + "dataset-size": { "resnet": 50000, "retinanet": 24781, "bert-99": 10833, @@ -1023,7 +1023,7 @@ def get_min_query_count(self, model, scenario): if model not in self.min_queries: raise ValueError("model not known: " + model) return self.min_queries[model].get(scenario) - + def get_dataset_size(self, model): model = self.get_mlperf_model(model) if model not in self.dataset_size: @@ -2292,9 +2292,12 @@ def log_result( # Check for calibration documentation if not config.skip_calibration_check and division not in ["open"]: - calibration_path_root = os.path.join(division, submitter, "calibration.md") - calibration_path_doc = os.path.join(division, submitter, "documentation", "calibration.md") - if not (os.path.exists(calibration_path_root)) and (not os.path.exists(calibration_path_doc)): + calibration_path_root = os.path.join( + division, submitter, "calibration.md") + calibration_path_doc = os.path.join( + division, submitter, "documentation", "calibration.md") + if not (os.path.exists(calibration_path_root)) and ( + not os.path.exists(calibration_path_doc)): log.error( "%s/%s: has not calibration file. 
One of %s or %s is required", division, @@ -3248,8 +3251,8 @@ def main(): args.extra_model_benchmark_map, ignore_uncommited=args.submission_exceptions, skip_power_check=args.skip_power_check, - skip_all_systems_with_results = args.skip_all_systems_have_results_check, - skip_calibration_check = args.skip_calibration_check + skip_all_systems_with_results=args.skip_all_systems_have_results_check, + skip_calibration_check=args.skip_calibration_check ) if args.scenarios_to_skip: From ac42a2f4b6f7ce7dc18d41939a4da0fe9cfa89f2 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 16 Jun 2025 16:19:58 +0000 Subject: [PATCH 13/35] [Automated Commit] Format Codebase --- speech2text/QSL.py | 18 ++++- speech2text/accuracy_eval.py | 57 ++++++++++++-- speech2text/helpers.py | 45 ++++++++---- speech2text/legacy_helpers.py | 6 +- speech2text/manifest.py | 1 + speech2text/reference_SUT.py | 94 ++++++++++++++++++------ speech2text/reference_mlperf.py | 60 ++++++++++----- speech2text/utils/download_utils.py | 3 +- speech2text/utils/preprocessing_utils.py | 3 +- 9 files changed, 216 insertions(+), 71 deletions(-) diff --git a/speech2text/QSL.py b/speech2text/QSL.py index 19afd49acf..a4882eb32b 100644 --- a/speech2text/QSL.py +++ b/speech2text/QSL.py @@ -30,6 +30,7 @@ Manifest_Global = None max_duration = float(os.environ.get("MAX_DURATION", "30.0")) + def load_sample_from_file(index): global Manifest sample = Manifest_Global[index] @@ -43,12 +44,18 @@ def load_sample_from_file(index): duration = sample['duration'] return prompt + class AudioQSL: def __init__(self, dataset_dir, manifest_filepath, labels, sample_rate=16000, perf_count=None, skip_qsl=False): global Manifest_Global m_paths = [manifest_filepath] - self.manifest = Manifest(dataset_dir, m_paths, labels, len(labels), max_duration=max_duration) + self.manifest = Manifest( + dataset_dir, + m_paths, + labels, + len(labels), + max_duration=max_duration) Manifest_Global = self.manifest self.sample_rate = sample_rate self.count = len(self.manifest) @@ -59,15 +66,15 @@ def __init__(self, dataset_dir, manifest_filepath, labels, self.qsl = None else: self.qsl = lg.ConstructQSL(self.count, perf_count, - self.load_query_samples, - self.unload_query_samples) + self.load_query_samples, + self.unload_query_samples) print( "Dataset loaded with {0:.2f} hours. Filtered {1:.2f} hours. Number of samples: {2}".format( self.manifest.duration / 3600, self.manifest.filtered_duration / 3600, self.count)) - + def load_query_samples(self, sample_list): pass @@ -83,6 +90,8 @@ def __del__(self): # We have no problem fitting all data in memory, so we do that, in # order to speed up execution of the benchmark. 
+ + class AudioQSLInMemory(AudioQSL): def __init__(self, dataset_dir, manifest_filepath, labels, sample_rate=16000, perf_count=None, skip_qsl=True): @@ -104,5 +113,6 @@ def load_query_samples(self, sample_list): def unload_query_samples(self, sample_list): for sample_id in sample_list: del self.sample_id_to_sample[sample_id] + def __del__(self): print("FInished destroying no QSL") diff --git a/speech2text/accuracy_eval.py b/speech2text/accuracy_eval.py index ad741db502..3c7466ff8f 100644 --- a/speech2text/accuracy_eval.py +++ b/speech2text/accuracy_eval.py @@ -28,7 +28,35 @@ max_duration = float(os.environ.get("MAX_DURATION", "30.0")) -labels = [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"] +labels = [ + " ", + "a", + "b", + "c", + "d", + "e", + "f", + "g", + "h", + "i", + "j", + "k", + "l", + "m", + "n", + "o", + "p", + "q", + "r", + "s", + "t", + "u", + "v", + "w", + "x", + "y", + "z", + "'"] dtype_map = { "int8": 'b', "int16": 'h', @@ -36,6 +64,7 @@ "int64": 'q', } + def word_error_rate(hypotheses: List[str], references: List[str]) -> float: """ Computes Average Word Error rate between two texts represented as @@ -61,37 +90,53 @@ def word_error_rate(hypotheses: List[str], references: List[str]) -> float: r = normalizer(r) h_list = h.split() r_list = r.split() - scores_clip, words_clip = compute_wer_with_concatenation(h_list, r_list) + scores_clip, words_clip = compute_wer_with_concatenation( + h_list, r_list) scores += scores_clip words += words_clip wer = scores / words return wer, scores, words + def get_args(): parser = argparse.ArgumentParser() parser.add_argument("--log_dir", required=True) parser.add_argument("--dataset_dir", required=True) parser.add_argument("--manifest", required=True) - parser.add_argument("--output_dtype", default="int64", choices=dtype_map.keys(), help="Output data type") + parser.add_argument( + "--output_dtype", + default="int64", + choices=dtype_map.keys(), + help="Output data type") args = parser.parse_args() return args + def main(): args = get_args() - manifest = Manifest(args.dataset_dir, [args.manifest], labels, len(labels), max_duration=max_duration) + manifest = Manifest(args.dataset_dir, + [args.manifest], + labels, + len(labels), + max_duration=max_duration) with open(os.path.join(args.log_dir, "mlperf_log_accuracy.json")) as fh: results = json.load(fh) hypotheses = [] references = [] for result in results: - hypotheses.append(array.array(dtype_map[args.output_dtype], bytes.fromhex(result["data"])).tolist()) + hypotheses.append(array.array( + dtype_map[args.output_dtype], bytes.fromhex(result["data"])).tolist()) references.append(manifest[result["qsl_idx"]]["transcript"]) references = __gather_predictions([references], labels=labels) hypotheses = __gather_predictions([hypotheses], labels=labels) wer, _, _ = word_error_rate(hypotheses=hypotheses, references=references) - print("Word Error Rate: {:}%, accuracy={:}%".format(wer * 100, (1 - wer) * 100)) + print( + "Word Error Rate: {:}%, accuracy={:}%".format( + wer * 100, + (1 - wer) * 100)) + if __name__ == '__main__': main() diff --git a/speech2text/helpers.py b/speech2text/helpers.py index b89b04e91e..279a3d396c 100644 --- a/speech2text/helpers.py +++ b/speech2text/helpers.py @@ -16,6 +16,7 @@ from typing import List from legacy_helpers import __levenshtein + def compute_wer_with_concatenation(prediction, reference): """ Compute WER considering concatenated words as correct matches using kaldialign @@ 
-46,7 +47,6 @@ def compute_wer_with_concatenation(prediction, reference): ref_concat = ref_words[i] hyp_concat = hyp_words[j] - # Try concatenating up to 3 words ref_match_len = 1 hyp_match_len = 1 @@ -54,7 +54,7 @@ def compute_wer_with_concatenation(prediction, reference): for k in range(1, 4): if i + k <= len(ref_words): - ref_concat = ''.join(ref_words[i:i+k]) + ref_concat = ''.join(ref_words[i:i + k]) if ref_concat == hyp_words[j]: ref_match_len = k hyp_match_len = 1 @@ -62,7 +62,7 @@ def compute_wer_with_concatenation(prediction, reference): break if j + k <= len(hyp_words): - hyp_concat = ''.join(hyp_words[j:j+k]) + hyp_concat = ''.join(hyp_words[j:j + k]) if hyp_concat == ref_words[i]: ref_match_len = 1 hyp_match_len = k @@ -71,8 +71,8 @@ def compute_wer_with_concatenation(prediction, reference): if match_found: # Add concatenated match - alignment.append((' '.join(ref_words[i:i+ref_match_len]), - ' '.join(hyp_words[j:j+hyp_match_len]))) + alignment.append((' '.join(ref_words[i:i + ref_match_len]), + ' '.join(hyp_words[j:j + hyp_match_len]))) i += ref_match_len j += hyp_match_len @@ -91,14 +91,18 @@ def compute_wer_with_concatenation(prediction, reference): j += 1 # Calculate WER using kaldialign - ref_aligned = [x[0].replace(" ", "") for x in alignment if x[0] is not None] - hyp_aligned = [x[1].replace(" ", "") for x in alignment if x[1] is not None] + ref_aligned = [x[0].replace(" ", "") + for x in alignment if x[0] is not None] + hyp_aligned = [x[1].replace(" ", "") + for x in alignment if x[1] is not None] distance = __levenshtein(ref_aligned, hyp_aligned) wer = distance / len(ref_words) if ref_words else 0 return distance, len(ref_words) if ref_words else 0 -def expand_concatenations(words_list: List, reference_dict: dict, reference_list: List): + +def expand_concatenations( + words_list: List, reference_dict: dict, reference_list: List): """ Finds matching compound words in 'words_list' which exist as keys in 'reference_dict', if any. If found, the compound word will be separated using reference_dict if the substitution reduces @@ -113,15 +117,19 @@ def expand_concatenations(words_list: List, reference_dict: dict, reference_list score = __levenshtein(words_list, reference_list) # Searches each word in 'word_list' for separability using the reference list. Once all options are - # considered, the modified 'word_list' is returned. Length of 'word_list' can grow, but not contract. + # considered, the modified 'word_list' is returned. Length of 'word_list' + # can grow, but not contract. 
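    # For example, with words_list = ["the", "icecream", "melted"],
    # reference_dict = {"icecream": ["ice", "cream"]} and
    # reference_list = ["the", "ice", "cream", "melted"], splitting "icecream"
    # drops the Levenshtein distance from 2 to 0, so the expanded list
    # ["the", "ice", "cream", "melted"] is kept.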
i = 0 words_length = len(words_list) while i < words_length: if words_list[i] in reference_dict.keys(): - words_candidate = words_list[:i] + reference_dict[words_list[i]] + words_list[i + 1:] + words_candidate = words_list[:i] + \ + reference_dict[words_list[i]] + words_list[i + 1:] - # If levenshtein distance reduced, cache new word_list and resume search - candidate_levenshtein = __levenshtein(words_candidate, reference_list) + # If levenshtein distance reduced, cache new word_list and resume + # search + candidate_levenshtein = __levenshtein( + words_candidate, reference_list) if candidate_levenshtein < score: words_list = words_candidate words_length = len(words_list) @@ -129,6 +137,7 @@ def expand_concatenations(words_list: List, reference_dict: dict, reference_list i += 1 return words_list + def get_expanded_wordlist(words_list: List, reference_list: List): """ Provided two lists of English words, the two will be compared, and any compound words found in @@ -141,7 +150,8 @@ def get_expanded_wordlist(words_list: List, reference_list: List): List of words modified from 'word_list' after expanding referenced compound words """ - # If levenshtein distance < 2, there cannot be any compound word separation issues. + # If levenshtein distance < 2, there cannot be any compound word + # separation issues. if __levenshtein(words_list, reference_list) < 2: return words_list @@ -153,9 +163,12 @@ def get_expanded_wordlist(words_list: List, reference_list: List): # Adding three-word compounding candidates to checklist for i in range(len(reference_list) - 2): - compound = reference_list[i] + reference_list[i + 1] + reference_list[i + 2] - checklist[compound] = [reference_list[i], reference_list[i + 1], reference_list[i + 2]] + compound = reference_list[i] + \ + reference_list[i + 1] + reference_list[i + 2] + checklist[compound] = [reference_list[i], + reference_list[i + 1], reference_list[i + 2]] # All compiled candidates will be checked, and after checking for minimal Levenshtein - # distance, the modified list (or original if compounding not found) is directly returned + # distance, the modified list (or original if compounding not found) is + # directly returned return expand_concatenations(words_list, checklist, reference_list) diff --git a/speech2text/legacy_helpers.py b/speech2text/legacy_helpers.py index 45065fff72..17687e4230 100644 --- a/speech2text/legacy_helpers.py +++ b/speech2text/legacy_helpers.py @@ -16,6 +16,7 @@ from enum import Enum from typing import List + def __levenshtein(a: List, b: List) -> int: """Calculates the Levenshtein distance between a and b. """ @@ -37,6 +38,7 @@ def __levenshtein(a: List, b: List) -> int: return current[n] + def __whisper_decoder_predictions_tensor(tensor, labels): """ Takes output of greedy whisper decoder and converts to strings. 
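As a quick illustration of how the concatenation-aware scoring above is meant to behave, the sketch below scores a hypothesis that fuses "ice cream" into a single token. It is illustrative only: it assumes `compute_wer_with_concatenation` is importable from the local `helpers` module and that the inputs are already normalized, lower-cased word lists, mirroring how `word_error_rate` calls it.

```
# Hypothetical usage sketch; the expected values assume the concatenation
# matching behaves as described in the helpers above.
from helpers import compute_wer_with_concatenation

hyp = "the icecream melted".split()   # hypothesis fuses "ice cream"
ref = "the ice cream melted".split()  # reference keeps the two words

errors, ref_len = compute_wer_with_concatenation(hyp, ref)
print(errors, ref_len)    # expected: 0 4 (fused token counted as a match)
print(errors / ref_len)   # expected: 0.0, the per-clip WER contribution
```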
@@ -58,6 +60,6 @@ def __whisper_decoder_predictions_tensor(tensor, labels): def __gather_predictions(predictions_list: list, labels: list) -> list: results = [] for prediction in predictions_list: - results += __whisper_decoder_predictions_tensor(prediction, labels=labels) + results += __whisper_decoder_predictions_tensor( + prediction, labels=labels) return results - diff --git a/speech2text/manifest.py b/speech2text/manifest.py index d7c0fc88d6..c54be6923b 100644 --- a/speech2text/manifest.py +++ b/speech2text/manifest.py @@ -16,6 +16,7 @@ import string import os + class Manifest(object): def __init__(self, data_dir, manifest_paths, labels, blank_index, max_duration=None, pad_to_max=False, min_duration=None, sort_by_duration=False, max_utts=0, diff --git a/speech2text/reference_SUT.py b/speech2text/reference_SUT.py index 3031281736..3deeea0cec 100644 --- a/speech2text/reference_SUT.py +++ b/speech2text/reference_SUT.py @@ -43,27 +43,59 @@ logging.basicConfig(level=logging.INFO) log = logging.getLogger("SUT") + def get_start_cores(start_cores="0"): start_cores = start_cores.split(",") start_cores = list(map(int, start_cores)) return start_cores + cores_per_inst = int(os.environ.get("CORES_PER_INST", "1")) num_numa_nodes = int(os.environ.get("NUM_NUMA_NODES", "1")) -nodes_per_inst = int(os.environ["NUM_NUMA_NODES"])/int(os.environ["NUM_INSTS"]) +nodes_per_inst = int(os.environ["NUM_NUMA_NODES"] + ) / int(os.environ["NUM_INSTS"]) insts_per_node = int(os.environ["INSTS_PER_NODE"]) -start_cores = os.environ["START_CORES"] +start_cores = os.environ["START_CORES"] precision = torch.float32 n_mels = 128 sample_rate = 16000 model_path = "openai/whisper-large-v3" -labels = [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"] +labels = [ + " ", + "a", + "b", + "c", + "d", + "e", + "f", + "g", + "h", + "i", + "j", + "k", + "l", + "m", + "n", + "o", + "p", + "q", + "r", + "s", + "t", + "u", + "v", + "w", + "x", + "y", + "z", + "'"] labels_dict = {} for i in range(len(labels)): labels_dict[labels[i]] = i + class Instance(mp.Process): def __init__( self, @@ -123,7 +155,7 @@ def run(self): self.total_sample_count ) - dtype="bfloat16" + dtype = "bfloat16" print(f"Precision: {dtype}") model = LLM( model=model_path, @@ -162,7 +194,7 @@ def process_queries(self): if qitem_list is None: return False - + prompt_list = [] for qitem in qitem_list: prompt = self.qsl[qitem.index] @@ -177,7 +209,8 @@ def process_queries(self): start_time = time.time() outputs = self.model.generate(prompt_list, self.sampling_params) - print(f"Sample number: {self.num_samples} | Step time {time.time()-start_time:.3f}s") + print( + f"Sample number: {self.num_samples} | Step time {time.time()-start_time:.3f}s") for output in outputs: request_id = int(output.request_id) @@ -188,7 +221,7 @@ def process_queries(self): self.num_samples += len(results) - for i,result in enumerate(results): + for i, result in enumerate(results): # Whisper outputs space in the front and capitalizes things result = result.lower().strip() transcript = [] @@ -204,6 +237,7 @@ def process_queries(self): print(f"Finished {qid[i]}") return True + class vllmSUT: def __init__(self, dataset_dir, manifest_filepath, perf_count, num_workers=1, device="cpu"): @@ -211,21 +245,21 @@ def __init__(self, dataset_dir, self.dataset_path = dataset_dir self.manifest_filepath = manifest_filepath self.device = device - self.batch_size = 16 + self.batch_size = 16 self.total_sample_count = 
perf_count self.num_workers = num_workers self.worker_threads = [None] * self.num_workers dataset_vocab = labels - #self.dev = torch.device("cuda:0") if torch.cuda.is_available() and os.environ.get("USE_GPU", "").lower() not in [ "no", "false" ] else torch.device("cpu") + # self.dev = torch.device("cuda:0") if torch.cuda.is_available() and os.environ.get("USE_GPU", "").lower() not in [ "no", "false" ] else torch.device("cpu") self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries) self.qsl = AudioQSL(dataset_dir, - manifest_filepath, - dataset_vocab, - sample_rate, - perf_count) + manifest_filepath, + dataset_vocab, + sample_rate, + perf_count) self.query_queue = mp.JoinableQueue() self.output_queue = mp.Queue() self.alive_counter = mp.Value("i", 0) @@ -235,10 +269,20 @@ def __init__(self, dataset_dir, def start(self): node_start_cores = get_start_cores(start_cores) core_lists = [] - if insts_per_node>0: + if insts_per_node > 0: for i in range(num_numa_nodes): for j in range(insts_per_node): - core_lists.append(list(range(node_start_cores[i]+j*cores_per_inst, node_start_cores[i]+(j+1)*cores_per_inst))) + core_lists.append( + list( + range( + node_start_cores[i] + + j * + cores_per_inst, + node_start_cores[i] + + ( + j + + 1) * + cores_per_inst))) for j in range(self.num_workers): core_list = core_lists[j] @@ -253,18 +297,19 @@ def start(self): rank=j, dtype=precision, core_list=tuple(core_list), - node_list=tuple([math.floor(j*nodes_per_inst)]), - input_queue = self.query_queue, - output_queue = self.output_queue, - cond_var = self.cond_var, - alive_counter = self.alive_counter, - sample_counter = self.sample_counter + node_list=tuple([math.floor(j * nodes_per_inst)]), + input_queue=self.query_queue, + output_queue=self.output_queue, + cond_var=self.cond_var, + alive_counter=self.alive_counter, + sample_counter=self.sample_counter ) worker.start() self.worker_threads[j] = worker with self.cond_var: - self.cond_var.wait_for(lambda: self.alive_counter.value == self.num_workers) + self.cond_var.wait_for( + lambda: self.alive_counter.value == self.num_workers) log.info(f"Starting Loadgen response thread") response_thread = threading.Thread(target=self.response_loadgen) @@ -276,11 +321,12 @@ def issue_queries(self, query_samples): for query_sample in query_samples: # Continuous batching self.query_queue.put([query_sample]) - if len(query_sample_list)>0: + if len(query_sample_list) > 0: self.query_queue.put(query_sample_list) def flush_queries(self): pass + def response_loadgen(self): keep_alive = True while keep_alive: @@ -293,13 +339,13 @@ def response_loadgen(self): response = lg.QuerySampleResponse(qid, bi[0], bi[1] * response_array.itemsize) lg.QuerySamplesComplete([response]) + def stop(self): for i in range(self.num_workers): self.query_queue.put(None) for worker in self.worker_threads: worker.kill() - def __del__(self): lg.DestroySUT(self.sut) print("Finished destroying SUT.") diff --git a/speech2text/reference_mlperf.py b/speech2text/reference_mlperf.py index 7ca4416a14..cee5843a7a 100644 --- a/speech2text/reference_mlperf.py +++ b/speech2text/reference_mlperf.py @@ -21,13 +21,32 @@ import mlperf_loadgen as lg from reference_SUT import vllmSUT + def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--scenario", choices=["Offline", "Server"], default="Offline", help="Scenario") - parser.add_argument("--accuracy", action="store_true", help="enable accuracy pass") - parser.add_argument("--mlperf_conf", default="mlperf.conf", help="mlperf rules config") - 
parser.add_argument("--user_conf", default="user.conf", help="user config for user LoadGen settings such as target QPS") - parser.add_argument("--audit_conf", default="audit.conf", help="audit config for LoadGen settings during compliance runs") + parser.add_argument( + "--scenario", + choices=[ + "Offline", + "Server"], + default="Offline", + help="Scenario") + parser.add_argument( + "--accuracy", + action="store_true", + help="enable accuracy pass") + parser.add_argument( + "--mlperf_conf", + default="mlperf.conf", + help="mlperf rules config") + parser.add_argument( + "--user_conf", + default="user.conf", + help="user config for user LoadGen settings such as target QPS") + parser.add_argument( + "--audit_conf", + default="audit.conf", + help="audit config for LoadGen settings during compliance runs") parser.add_argument("--dataset_dir", required=True) parser.add_argument("--manifest", required=True) parser.add_argument("--perf_count", type=int, default=None) @@ -50,17 +69,16 @@ def main(): log_path = args.log_dir os.makedirs(log_path, exist_ok=True) - - sut = vllmSUT( args.dataset_dir, - args.manifest, - args.perf_count, - num_workers=args.num_workers, - device="cpu") + sut = vllmSUT(args.dataset_dir, + args.manifest, + args.perf_count, + num_workers=args.num_workers, + device="cpu") sut.start() settings = lg.TestSettings() settings.scenario = scenario_map[args.scenario] - #settings.FromConfig(args.mlperf_conf, "whisper", args.scenario) + # settings.FromConfig(args.mlperf_conf, "whisper", args.scenario) settings.FromConfig(args.user_conf, "whisper", args.scenario) if args.accuracy: @@ -75,15 +93,23 @@ def main(): log_settings.log_output = log_output_settings print("Running Loadgen test...") - lg.StartTestWithLogSettings(sut.sut, - sut.qsl.qsl, - settings, - log_settings, + lg.StartTestWithLogSettings(sut.sut, + sut.qsl.qsl, + settings, + log_settings, args.audit_conf) sut.stop() if args.accuracy: - cmd = ["python3", "accuracy_eval.py", "--log_dir", log_path, "--dataset_dir", args.dataset_dir, "--manifest", args.manifest] + cmd = [ + "python3", + "accuracy_eval.py", + "--log_dir", + log_path, + "--dataset_dir", + args.dataset_dir, + "--manifest", + args.manifest] print(f"Running accuracy script: {cmd}") subprocess.check_call(cmd) diff --git a/speech2text/utils/download_utils.py b/speech2text/utils/download_utils.py index bda4193fbb..ba5e15b7f0 100644 --- a/speech2text/utils/download_utils.py +++ b/speech2text/utils/download_utils.py @@ -65,5 +65,6 @@ def extract(fpath, dest_folder): with tarfile.open(fpath, mode) as tar: members = tar.getmembers() - for member in tqdm.tqdm(iterable=members, total=len(members), leave=True): + for member in tqdm.tqdm( + iterable=members, total=len(members), leave=True): tar.extract(path=dest_folder, member=member) diff --git a/speech2text/utils/preprocessing_utils.py b/speech2text/utils/preprocessing_utils.py index 260e860b80..5d6673434c 100644 --- a/speech2text/utils/preprocessing_utils.py +++ b/speech2text/utils/preprocessing_utils.py @@ -68,7 +68,8 @@ def preprocess(data, input_dir, dest_dir, target_sr=None, speed=None, return output_dict -def parallel_preprocess(dataset, input_dir, dest_dir, target_sr, speed, overwrite, parallel): +def parallel_preprocess(dataset, input_dir, dest_dir, + target_sr, speed, overwrite, parallel): with multiprocessing.Pool(parallel) as p: func = functools.partial(preprocess, input_dir=input_dir, dest_dir=dest_dir, From 2b371c9028fbaaaa1e121df7e9c6d360ebd2aac0 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 
16 Jun 2025 18:03:55 +0000 Subject: [PATCH 14/35] [Automated Commit] Format Codebase --- language/llama3.1-8b/ref_eval.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/language/llama3.1-8b/ref_eval.py b/language/llama3.1-8b/ref_eval.py index 0cfc0694d7..bb34847b13 100644 --- a/language/llama3.1-8b/ref_eval.py +++ b/language/llama3.1-8b/ref_eval.py @@ -17,7 +17,8 @@ def rouge(label, pred): def niah_em(label, pred): - label_uuids = re.findall(r'[\w]{8}-[\w]{4}-[\w]{4}-[\w]{4}-[\w]{12}', label) + label_uuids = re.findall( + r'[\w]{8}-[\w]{4}-[\w]{4}-[\w]{4}-[\w]{12}', label) pred_uuids = re.findall(r'[\w]{8}-[\w]{4}-[\w]{4}-[\w]{4}-[\w]{12}', pred) # https://github.com/hsiehjackson/RULER/blob/main/scripts/eval/synthetic/constants.py#L28 @@ -43,7 +44,8 @@ def qa_em(label, pred): return {'exact_match': 100.0} normalized_answer = re.sub(r'\s+', '', answer_substring).lower() - label_entries = [re.sub(r'\s+', '', entry).lower() for entry in label.split('|')] + label_entries = [re.sub(r'\s+', '', entry).lower() + for entry in label.split('|')] match_found = any(entry in normalized_answer for entry in label_entries) return {'exact_match': 100.0 if match_found else 0.0} @@ -63,7 +65,12 @@ def process_row(row): def run_evaluation(df): with Pool(cpu_count()) as pool: - accuracies = list(tqdm(pool.imap(process_row, df.to_dict('records')), total=len(df))) + accuracies = list( + tqdm( + pool.imap( + process_row, + df.to_dict('records')), + total=len(df))) df['accuracy'] = accuracies return df @@ -74,10 +81,10 @@ def run_evaluation(df): df = pd.read_pickle(fname) df = run_evaluation(df) - #df.to_pickle(str(fname).replace(".pkl", "_eval.pkl")) + # df.to_pickle(str(fname).replace(".pkl", "_eval.pkl")) print(f"WROTE: {str(fname).replace('.pkl', '_eval.pkl')}") accuracy = df.accuracy.apply(pd.Series) print(df.dataset.value_counts()) print(accuracy.describe()) - print(df.describe()) \ No newline at end of file + print(df.describe()) From 72d52bb8adb42c9560076194a2c8a82190f37740 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 19 Jun 2025 08:45:56 +0000 Subject: [PATCH 15/35] [Automated Commit] Format Codebase --- speech2text/utils/repackage_librispeech.py | 42 +++++++++++++++------- 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/speech2text/utils/repackage_librispeech.py b/speech2text/utils/repackage_librispeech.py index 7a2b62f421..4889b815a2 100644 --- a/speech2text/utils/repackage_librispeech.py +++ b/speech2text/utils/repackage_librispeech.py @@ -24,6 +24,7 @@ PAD_DURATION = 0.5 SR = 16000 + def get_args(): parser = argparse.ArgumentParser() parser.add_argument("--manifest", required=True) @@ -33,10 +34,12 @@ def get_args(): args = parser.parse_args() return args + def get_source_name(fname): basename_list, _ = os.path.splitext(fname) return "-".join(basename_list.split("-")[:2]) + def prepare_clip(current_entry, new_fname): pad_audio = np.zeros(int(PAD_DURATION * SR)) new_audio = [] @@ -51,6 +54,7 @@ def prepare_clip(current_entry, new_fname): new_json = get_sample_json(new_audio, new_transcript, new_fname) return new_audio, new_json + def get_sample_json(audio, transcript, fname): json_file = { "transcript": transcript, @@ -73,6 +77,7 @@ def get_sample_json(audio, transcript, fname): } return json_file + def main(): args = get_args() with open(args.manifest, "r") as manifest: @@ -84,15 +89,22 @@ def main(): for data in json_data: original_fname = data["files"][0]["fname"] original_transcript = data["transcript"] - original_audio = 
librosa.load(os.path.join(args.data_dir, original_fname), sr=SR)[0] - original_json = get_sample_json(original_audio, original_transcript, original_fname) + original_audio = librosa.load( + os.path.join( + args.data_dir, + original_fname), + sr=SR)[0] + original_json = get_sample_json( + original_audio, original_transcript, original_fname) - source_name = get_source_name(os.path.basename(os.path.basename(original_fname))) + source_name = get_source_name( + os.path.basename( + os.path.basename(original_fname))) if source_name not in catalog: catalog[source_name] = [] - + catalog[source_name].append((original_audio, original_json)) - + full_json = [] for key in catalog.keys(): index = 0 @@ -100,13 +112,17 @@ def main(): current_duration = 0 for entry in catalog[key]: clip_duration = entry[1]["original_duration"] - - # Only considering clips <=30s. If single clip duration > 30s, ignore. + + # Only considering clips <=30s. If single clip duration > 30s, + # ignore. if clip_duration > 30: continue - # If new clip would extend compiled entry to >30s, flush the existing entry - if (len(current_entry) > 0) and (current_duration + PAD_DURATION + clip_duration > 30): - new_fname = os.path.join(args.output_dir, key + "_" + str(index) + ".wav") + # If new clip would extend compiled entry to >30s, flush the + # existing entry + if (len(current_entry) > 0) and ( + current_duration + PAD_DURATION + clip_duration > 30): + new_fname = os.path.join( + args.output_dir, key + "_" + str(index) + ".wav") new_audio, new_json = prepare_clip(current_entry, new_fname) sf.write(new_fname, new_audio, SR) full_json.append(new_json) @@ -120,9 +136,11 @@ def main(): if len(current_entry) > 1: current_duration += PAD_DURATION - # After all key clips are processed, if a remaining entry has content, exports it. + # After all key clips are processed, if a remaining entry has content, + # exports it. if len(current_entry) > 0: - new_fname = os.path.join(args.output_dir, key + "_" + str(index) + ".wav") + new_fname = os.path.join( + args.output_dir, key + "_" + str(index) + ".wav") new_audio, new_json = prepare_clip(current_entry, new_fname) sf.write(new_fname, new_audio, SR) full_json.append(new_json) From f841519e77014c799b0e08ad75b23ca2c534b1aa Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 26 Jun 2025 23:30:42 +0100 Subject: [PATCH 16/35] Fix Typo in Interactive Latencies (#2147) (#2225) * Fix Typo in Interactive Latencies * Update submission_checker.py --- language/llama2-70b/README.md | 2 +- tools/submission/submission_checker.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/language/llama2-70b/README.md b/language/llama2-70b/README.md index bbd9889564..0c0ad21952 100644 --- a/language/llama2-70b/README.md +++ b/language/llama2-70b/README.md @@ -291,7 +291,7 @@ Please see the [new docs site](https://docs.mlcommons.org/inference/submission/) # Run llama2-70b-interactive benchmark -For official, Llama2-70b submissions it is also possible to submit in the interactive category. This sets a more strict latency requirements for Time to First Token (ttft) and Time per Output Token (tpot). Specifically, the interactive category requires loadgen to enforce `ttft <= 450ms` and `ttft <= 40ms` +For official, Llama2-70b submissions it is also possible to submit in the interactive category. This sets a more strict latency requirements for Time to First Token (ttft) and Time per Output Token (tpot). 
Specifically, the interactive category requires loadgen to enforce `ttft <= 450ms` and `tpot <= 40ms` In order to run interactive category, it is sufficient to set the flag `--lg-model-name` as `llama2-70b-interactive` when calling the `main.py` to run the benchmark. For example, to run the server scenario in interactive mode: diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index 69b47d6f17..462f3d56ea 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -808,6 +808,18 @@ "ttft": 450 * 1000000, "tpot": 40 * 1000000 }, }, + "llama2-70b-interactive-99": { + "Server": { + "ttft": 450 * 1000000, "tpot": 40 * 1000000 + }, + }, + # for v5.0 + "llama2-70b-interactive-99.9": { + "Server": { + "ttft": 450 * 1000000, "tpot": 40 * 1000000 + }, + }, + # for v5.0 "mixtral-8x7b": { "Server": { "ttft": 2000 * 1000000, "tpot": 200 * 1000000 From 4e7717712ceb8259a703423898fbfcbb5edd057e Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 26 Jun 2025 23:40:19 +0100 Subject: [PATCH 17/35] Fix Typo in Interactive Latencies (#2147) (#2226) * Fix Typo in Interactive Latencies * Update submission_checker.py --------- Co-authored-by: Miro Co-authored-by: github-actions[bot] From 350032a3f13ff25cd5d54c99c0a604de4a155dbd Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Sun, 6 Jul 2025 01:57:04 +0530 Subject: [PATCH 18/35] Update MLCFlow commands for v5.1 (#2237) --- docs/benchmarks/language/deepseek-r1.md | 11 ++++ .../language/get-deepseek-r1-data.md | 24 ++++++++ .../language/get-llama3_1-8b-data.md | 60 +++++++++++++++++++ docs/benchmarks/language/llama3_1-8b.md | 11 ++++ .../speech_to_text/get-whisper-data.md | 40 +++++++++++++ docs/benchmarks/speech_to_text/whisper.md | 11 ++++ language/deepseek-r1/README.md | 30 ++++++++++ language/llama3.1-8b/README.md | 52 ++++++++++++---- main.py | 33 ++++++---- mkdocs.yml | 4 ++ speech2text/README.md | 46 ++++++++++++++ 11 files changed, 298 insertions(+), 24 deletions(-) create mode 100644 docs/benchmarks/language/deepseek-r1.md create mode 100644 docs/benchmarks/language/get-deepseek-r1-data.md create mode 100644 docs/benchmarks/language/get-llama3_1-8b-data.md create mode 100644 docs/benchmarks/language/llama3_1-8b.md create mode 100644 docs/benchmarks/speech_to_text/get-whisper-data.md create mode 100644 docs/benchmarks/speech_to_text/whisper.md diff --git a/docs/benchmarks/language/deepseek-r1.md b/docs/benchmarks/language/deepseek-r1.md new file mode 100644 index 0000000000..f83fe1bb82 --- /dev/null +++ b/docs/benchmarks/language/deepseek-r1.md @@ -0,0 +1,11 @@ +--- +hide: + - toc +--- + +# Reasoning using DeepSeek-R1 + +=== "MLCommons-Python" + ## MLPerf Reference Implementation in Python + +{{ mlperf_inference_implementation_readme (4, "deepseek-r1", "reference", devices=["CUDA"]) }} \ No newline at end of file diff --git a/docs/benchmarks/language/get-deepseek-r1-data.md b/docs/benchmarks/language/get-deepseek-r1-data.md new file mode 100644 index 0000000000..401c4d27bc --- /dev/null +++ b/docs/benchmarks/language/get-deepseek-r1-data.md @@ -0,0 +1,24 @@ +--- +hide: + - toc +--- + +# Reasoning using DeepSeek R1 + +## Dataset + +The benchmark implementation run command will automatically download the validation and calibration datasets and do the necessary preprocessing. In case you want to download only the datasets, you can use the below commands. 
+ +=== "Validation" + + ### Get Validation Dataset + ``` + mlcr get,preprocessed,dataset,deepseek-r1,_validation,_mlc,_rclone --outdirname= -j + ``` + +=== "Calibration" + + ### Get Calibration Dataset + ``` + mlcr get,preprocessed,dataset,deepseek-r1,_calibration,_mlc,_rclone --outdirname= -j + ``` \ No newline at end of file diff --git a/docs/benchmarks/language/get-llama3_1-8b-data.md b/docs/benchmarks/language/get-llama3_1-8b-data.md new file mode 100644 index 0000000000..e24cc37d44 --- /dev/null +++ b/docs/benchmarks/language/get-llama3_1-8b-data.md @@ -0,0 +1,60 @@ +--- +hide: + - toc +--- + +# Text Summarization using LLAMA3.1-8b + +## Dataset + +The benchmark implementation run command will automatically download the validation and calibration datasets and do the necessary preprocessing. In case you want to download only the datasets, you can use the below commands. + +=== "Validation" + + === "Full dataset (Datacenter)" + + ### Get Validation Dataset + ``` + mlcr get,dataset,cnndm,_validation,_datacenter,_llama3,_mlc,_rclone --outdirname= -j + ``` + + === "5000 samples (Edge)" + + ### Get Validation Dataset + ``` + mlcr get,dataset,cnndm,_validation,_edge,_llama3,_mlc,_rclone --outdirname= -j + ``` + +=== "Calibration" + + ### Get Calibration Dataset + ``` + mlcr get,dataset,cnndm,_calibration,_llama3,_mlc,_rclone --outdirname= -j + ``` + +- `--outdirname=` could be provided to download the dataset to a specific location. + +## Model +The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. + +=== "Pytorch" + + === "From MLCOMMONS Google Drive" + + > **Note:** One has to accept the [MLCommons Llama 3.1 License Confidentiality Notice](http://llama3-1.mlcommons.org/) to access the model files in MLCOMMONS Google Drive. + + ### Get the Official MLPerf LLAMA3.1-405B model from MLCOMMONS Cloudfare R2 + ``` + TBD + ``` + + === "From Hugging Face repo" + + > **Note:** Access to the HuggingFace model could be requested [here](https://ai.meta.com/resources/models-and-libraries/llama-downloads/). + + ### Get model from HuggingFace repo + ``` + mlcr get,ml-model,llama3,_hf,_meta-llama/Llama-3.1-8B-Instruct --hf_token= -j + ``` + +- `--outdirname=` could be provided to download the model to a specific location. \ No newline at end of file diff --git a/docs/benchmarks/language/llama3_1-8b.md b/docs/benchmarks/language/llama3_1-8b.md new file mode 100644 index 0000000000..93f8df2997 --- /dev/null +++ b/docs/benchmarks/language/llama3_1-8b.md @@ -0,0 +1,11 @@ +--- +hide: + - toc +--- + +# Text Summarization using LLAMA3_1-8b + +=== "MLCommons-Python" + ## MLPerf Reference Implementation in Python + +{{ mlperf_inference_implementation_readme (4, "llama3_1-8b", "reference", devices=["CPU","CUDA"]) }} \ No newline at end of file diff --git a/docs/benchmarks/speech_to_text/get-whisper-data.md b/docs/benchmarks/speech_to_text/get-whisper-data.md new file mode 100644 index 0000000000..9bc97ad9a0 --- /dev/null +++ b/docs/benchmarks/speech_to_text/get-whisper-data.md @@ -0,0 +1,40 @@ +--- +hide: + - toc +--- + +# Speech to Text using Whisper + +## Dataset + +The benchmark implementation run command will automatically download the validation and calibration datasets and do the necessary preprocessing. In case you want to download only the datasets, you can use the below commands. 
+ +=== "Validation" + + === "Preprocessed" + + ### Get Preprocessed Validation Dataset + ``` + mlcr get,dataset,whisper,_preprocessed,_mlc,_rclone --outdirname= -j + ``` + + === "Unprocessed" + + ### Get Unprocessed Validation Dataset + ``` + mlcr get,dataset,whisper,_unprocessed --outdirname= -j + ``` + +## Model +The benchmark implementation run command will automatically download the required model and do the necessary conversions if any. In case you want to only download the official model, you can use the below commands. + +=== "Pytorch" + + === "From MLCOMMONS" + + ### Get the Official MLPerf Whisper model from MLCOMMONS Cloudflare R2 + ``` + mlcr get,ml-model,whisper,_rclone,_mlc s-j + ``` + +- `--outdirname=` could be provided to download the model to a specific location. \ No newline at end of file diff --git a/docs/benchmarks/speech_to_text/whisper.md b/docs/benchmarks/speech_to_text/whisper.md new file mode 100644 index 0000000000..fddf37b767 --- /dev/null +++ b/docs/benchmarks/speech_to_text/whisper.md @@ -0,0 +1,11 @@ +--- +hide: + - toc +--- + +# Speech to Text using Whisper + +=== "MLCommons-Python" + ## MLPerf Reference Implementation in Python + +{{ mlperf_inference_implementation_readme (4, "whisper", "reference", devices=["CPU","CUDA"]) }} \ No newline at end of file diff --git a/language/deepseek-r1/README.md b/language/deepseek-r1/README.md index 7c2722c7a6..4e0184ff94 100644 --- a/language/deepseek-r1/README.md +++ b/language/deepseek-r1/README.md @@ -1,5 +1,11 @@ # Mlperf Inference DeepSeek Reference Implementation +## Automated command to run the benchmark via MLFlow + +Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/language/deepseek-r1/) for an automated way to run this benchmark across different available implementations and do an end-to-end submission with or without docker. + +You can also do pip install mlc-scripts and then use `mlcr` commands for downloading the model and datasets using the commands given in the later sections. + ## Model & Dataset Download > **Model**: [deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) (revision: `56d4cbbb4d29f4355bab4b9a39ccb717a14ad5ad`) @@ -11,6 +17,14 @@ ### Preprocessed +**Using MLCFlow Automation** + +``` +mlcr get,dataset,whisper,_preprocessed,_mlc,_rclone --outdirname= -j +``` + +**Using Native method** + You can use Rclone to download the preprocessed dataset from a Cloudflare R2 bucket. To run Rclone on Windows, you can download the executable [here](https://rclone.org/install/#windows). @@ -30,6 +44,14 @@ rclone copy mlc-inference:mlcommons-inference-wg-public/deepseek_r1/mlperf_deeps ### Calibration +**Using MLCFlow Automation** + +``` +mlcr get,preprocessed,dataset,deepseek-r1,_calibration,_mlc,_rclone --outdirname= -j +``` + +**Using Native method** + Download and install Rclone as described in the previous section. 
Then navigate in the terminal to your desired download directory and run the following command to download the dataset: @@ -171,6 +193,14 @@ The following table shows which backends support different evaluation and MLPerf ## Accuracy Evaluation +**Using MLCFlow Automation** + +``` +TBD +``` + +**Using Native method** + Accuracy evaluation is handled uniformly across all backends: ```bash diff --git a/language/llama3.1-8b/README.md b/language/llama3.1-8b/README.md index 5947aa0cc4..2b331c98f9 100644 --- a/language/llama3.1-8b/README.md +++ b/language/llama3.1-8b/README.md @@ -9,7 +9,7 @@ ## Automated command to run the benchmark via MLFlow -Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/language/llama3_1-8b/) (TBD) for an automated way to run this benchmark across different available implementations and do an end-to-end submission with or without docker. +Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/language/llama3_1-8b/) for an automated way to run this benchmark across different available implementations and do an end-to-end submission with or without docker. You can also do pip install mlc-scripts and then use `mlcr` commands for downloading the model and datasets using the commands given in the later sections. @@ -99,7 +99,10 @@ pip install -e ../../loadgen ## Get Model ### MLCommons Members Download (Recommended for official submission) -You need to request for access to [MLCommons](http://llama3-1.mlcommons.org/) and you'll receive an email with the download instructions. You can download the model automatically via the below command +You need to request for access to [MLCommons](http://llama3-1.mlcommons.org/) and you'll receive an email with the download instructions. + +**Official Model download using MLCFlow Automation** +You can download the model automatically via the below command ``` TBD ``` @@ -115,6 +118,12 @@ git clone https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct ${CHECKPOINT_P cd ${CHECKPOINT_PATH} && git checkout be673f326cab4cd22ccfef76109faf68e41aa5f1 ``` +**External Model download using MLCFlow Automation** +You can download the model automatically via the below command +``` +mlcr get,ml-model,llama3,_hf,_meta-llama/Llama-3.1-8B-Instruct --hf_token= -j +``` + ### Download huggingface model through MLC ``` @@ -142,24 +151,39 @@ rclone config create mlc-inference s3 provider=Cloudflare access_key_id=f65ba5ee You can then navigate in the terminal to your desired download directory and run the following command to download the dataset: #### Full dataset (datacenter) + +**Using MLCFlow Automation** +``` +mlcr get,dataset,cnndm,_validation,_datacenter,_llama3,_mlc,_rclone --outdirname= -j +``` + +**Native method** ``` rclone copy mlc-inference:mlcommons-inference-wg-public/llama3.1_8b/cnn_eval.json ./ -P ``` #### 5000 samples (edge) + +**Using MLCFlow Automation** +``` +mlcr get,dataset,cnndm,_validation,_edge,_llama3,_mlc,_rclone --outdirname= -j +``` + +**Native method** ``` rclone copy mlc-inference:mlcommons-inference-wg-public/llama3.1_8b/sample_cnn_eval_5000.json ./ -P ``` #### Calibration + +**Using MLCFlow Automation** ``` -rclone copy mlc-inference:mlcommons-inference-wg-public/llama3.1_8b/cnn_dailymail_calibration.json ./ -P +mlcr get,dataset,cnndm,_calibration,_llama3,_mlc,_rclone --outdirname= -j ``` -**MLC Command** - +**Native method** ``` -TBD +rclone copy mlc-inference:mlcommons-inference-wg-public/llama3.1_8b/cnn_dailymail_calibration.json ./ -P ``` You can also download the 
calibration dataset from the Cloudflare R2 bucket by running the following command: @@ -168,11 +192,6 @@ You can also download the calibration dataset from the Cloudflare R2 bucket by r rclone copy mlc-inference:mlcommons-inference-wg-public/llama3.1_8b/cnn_eval.json ./ -P ``` -**MLC Command** -``` -TBD -``` - ## Run Performance Benchmarks @@ -265,8 +284,17 @@ The ServerSUT was not tested for GPU runs. ### Evaluate the accuracy using MLCFlow You can also evaulate the accuracy from the generated accuracy log by using the following MLC command + +**Full dataset (datacenter)** + ``` -TBD +mlcr run,accuracy,mlperf,_cnndm_llama_3,_edge --result_dir= +``` + +**5000 samples (edge)** + +``` +mlcr run,accuracy,mlperf,_cnndm_llama_3,_datacenter --result_dir= ``` ## Accuracy Target diff --git a/main.py b/main.py index d2f625fcf4..1e5a20bb96 100755 --- a/main.py +++ b/main.py @@ -33,13 +33,15 @@ def mlperf_inference_implementation_readme( if model == "rnnt": code_version = "r4.0" + if "gpt" in model: + code_version = "r5.0-dev" elif implementation == "intel": code_version = "r4.1-dev" if implementation == "reference": # Tip - if model != "rnnt": - code_version = "r5.0-dev" + if model not in ["rnnt", "gptj-99", "gptj-99.9"]: + code_version = "r5.1-dev" if "99.9" not in model and implementation_tips: content += f"\n{pre_space}!!! tip\n\n" content += f"{pre_space} - MLCommons reference implementations are only meant to provide a rules compliant reference implementation for the submitters and in most cases are not best performing. If you want to benchmark any system, it is advisable to use the vendor MLPerf implementation for that system like Nvidia, Intel etc.\n\n" @@ -54,8 +56,12 @@ def mlperf_inference_implementation_readme( frameworks = ["Onnxruntime", "Pytorch"] elif "bert" in model.lower(): frameworks = ["Pytorch", "Deepsparse"] - elif "llama3" in model.lower(): - frameworks = ["Pytorch"] + elif "whisper" in model.lower(): + frameworks = ["vLLM"] + elif "deepseek" in model.lower(): + frameworks = ["vLLM", "Pytorch", "SGLang"] + elif "llama3_1-8b" in model.lower(): + frameworks = ["vLLM"] else: frameworks = ["Pytorch"] @@ -130,12 +136,7 @@ def mlperf_inference_implementation_readme( categories = ["Datacenter"] elif model.lower() in ["pointpainting"]: categories = ["Edge"] - elif ( - "dlrm" in model.lower() - or "llama2" in model.lower() - or "mixtral" in model.lower() - or "llama3" in model.lower() - ): + elif model.lower() in ["bert-99.9", "dlrm", "llama2", "mixtral", "llama3", "deepseek-r1"]: categories = ["Datacenter"] else: categories = ["Edge", "Datacenter"] @@ -153,8 +154,12 @@ def mlperf_inference_implementation_readme( scenarios.append("MultiStream") if model.lower() in ["pointpainting"]: scenarios.remove("Offline") + if model.lower() in ["whisper"]: + scenarios.remove("SingleStream") elif category == "Datacenter": scenarios = ["Offline", "Server"] + if model.lower() in ["whisper"]: + scenarios.remove("Server") if fixed_scenarios: scenarios = [ scenario for scenario in scenarios if scenario in fixed_scenarios] @@ -164,7 +169,7 @@ def mlperf_inference_implementation_readme( cur_space = pre_space + " " scenarios_string = ", ".join(scenarios) - content += f"{cur_space}### {category} category \n\n{cur_space} In the {category.lower()} category, {model} has {scenarios_string} scenarios and all the scenarios are mandatory for a closed division submission.\n\n" + content += f"""{cur_space}### {category} category \n\n{cur_space} In the {category.lower()} category, {model} has {scenarios_string} 
scenario{"s" if len(scenarios)>1 else ""} and {"all of the scenarios are" if len(scenarios)>1 else "the scenario is"} mandatory for a closed division submission.\n\n""" for framework in frameworks: cur_space1 = cur_space + " " @@ -539,7 +544,7 @@ def get_common_info(spaces, implementation, model): info += f"{pre_space} - In valid execution mode, the query count for performance mode can be adjusted using `--env.MLC_MLPERF_LOADGEN_QUERY_COUNT=`.\n\n" if implementation.lower() == "reference" and model.lower() not in [ - "pointpainting"]: + "pointpainting", "llama3_1-8b", "deepseek-r1", "whisper"]: info += f"{pre_space} - `_r4.1-dev` could also be given instead of `_r5.0-dev` if you want to run the benchmark with the MLPerf version being 4.1.\n\n" if model == "rgat": @@ -568,6 +573,10 @@ def get_docker_info(spaces, model, implementation, elif "llama3" in model.lower(): info += f"{pre_space} - `--env.MLC_MLPERF_MODEL_LLAMA3_DOWNLOAD_TO_HOST=yes` option can be used to download the model on the host so that it can be reused across different container lanuches. \n\n" info += f"{pre_space} - `--env.MLC_MLPERF_DATASET_LLAMA3_DOWNLOAD_TO_HOST=yes` option can be used to download the dataset on the host so that it can be reused across different container lanuches. \n\n" + elif model.lower() in ["llama3_1-8b", "whisper", "deepseek-r1"]: + info += f"{pre_space} - `--env.MLC_USE_ML_MODEL_FROM_HOST=yes` option can be used to download the model on the host so that it can be reused across different container lanuches. \n\n" + info += f"{pre_space} - `--env.MLC_USE_DATASET_FROM_HOST=yes` option can be used to download the dataset on the host so that it can be reused across different container lanuches. \n\n" + if implementation.lower() == "nvidia": info += f"{pre_space} - Default batch size is assigned based on [GPU memory](https://github.com/mlcommons/cm4mlops/blob/dd0c35856969c68945524d5c80414c615f5fe42c/script/app-mlperf-inference-nvidia/_cm.yaml#L1129) or the [specified GPU](https://github.com/mlcommons/cm4mlops/blob/dd0c35856969c68945524d5c80414c615f5fe42c/script/app-mlperf-inference-nvidia/_cm.yaml#L1370). 
Please click more option for *docker launch* or *run command* to see how to specify the GPU name.\n\n" info += f"{pre_space} - When run with `--all_models=yes`, all the benchmark models of NVIDIA implementation can be executed within the same container.\n\n" diff --git a/mkdocs.yml b/mkdocs.yml index a0ac88ef98..e4396d2d53 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -46,11 +46,15 @@ nav: - GPT-J: benchmarks/language/gpt-j.md - LLAMA2-70B: benchmarks/language/llama2-70b.md - LLAMA3-405B: benchmarks/language/llama3_1-405b.md + - LLAMA3-8B: benchmarks/language/llama3_1-8b.md - MIXTRAL-8x7B: benchmarks/language/mixtral-8x7b.md + - DeepSeek-R1: benchmarks/language/deepseek-r1.md - Recommendation: - DLRM-v2: benchmarks/recommendation/dlrm-v2.md - Graph Neural Networks: - R-GAT: benchmarks/graph/rgat.md + - Speech to Text: + - Whisper: benchmarks/speech_to_text/whisper.md - Install MLCFlow: - install/index.md - Submission: diff --git a/speech2text/README.md b/speech2text/README.md index 8747cc8557..be325975a2 100644 --- a/speech2text/README.md +++ b/speech2text/README.md @@ -1,5 +1,11 @@ # Reference Implementation for whisper-large-v3 +## Automated command to run the benchmark via MLFlow + +Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/language/whisper/) for an automated way to run this benchmark across different available implementations and do an end-to-end submission with or without docker. + +You can also do pip install mlc-scripts and then use `mlcr` commands for downloading the model and datasets using the commands given in the later sections. + ## Prepare environment ### Docker @@ -92,6 +98,15 @@ VLLM_TARGET_DEVICE=cpu pip install --break-system-packages . --no-build-isolatio ## Get Model ### MLCommons Download +**Official Model download using MLCFlow Automation** + +You can download the model automatically via the below command +``` +mlcr get,ml-model,whisper,_rclone,_mlc --outdirname= -j +``` + +**Official Model download using native method** + You can use Rclone to download the preprocessed dataset from a Cloudflare R2 bucket. To run Rclone on Windows, you can download the executable [here](https://rclone.org/install/#windows). @@ -111,6 +126,15 @@ rclone copy mlc-inference:mlcommons-inference-wg-public/Whisper/model/ ./ -P ### External Download (Not recommended for official submission) +**External Model download using MLCFlow Automation** + +You can download the model automatically via the below command +``` +TBD +``` + +**External Model download using native method** + + Requires Git Large Files Storage ```bash export CHECKPOINT_PATH=whisper-large-v3 @@ -127,6 +151,13 @@ We use dev-clean and dev-other splits, which are approximately 10 hours. ### Preprocessed +**Using MLCFlow Automation** +``` +mlcr get,dataset,whisper,_preprocessed,_mlc,_rclone --outdirname= -j +``` + +**Native method** + Download and install rclone as decribed in the [MLCommons Download section](#mlcommons-download) You can then navigate in the terminal to your desired download directory and run the following command to download the dataset: @@ -136,6 +167,13 @@ rclone copy mlc-inference:mlcommons-inference-wg-public/Whisper/dataset/ ./ -P ### Unprocessed +**Using MLCFlow Automation** +``` +mlcr get,dataset,whisper,_unprocessed --outdirname= -j +``` + +**Native method** + If your are using docker, we provide a script to download and preprocess the dataset from the source. 
You can download it by running: ```bash ./download_dataset.sh @@ -227,6 +265,14 @@ python reference_mlperf.py \ ### Run Accuracy +**Evaluate Accuracy using MLCFlow Automation** + +``` +mlcr run,accuracy,mlperf,_librispeech_whisper,_int32 --result_dir= +``` + +**Evaluate Accuracy using native method** + ```bash python reference_mlperf.py \ --dataset_dir ${DATA_DIR} \ From 3a8595acd31a5f5a401f52d35449374e9d4cd281 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 5 Jul 2025 20:27:26 +0000 Subject: [PATCH 19/35] [Automated Commit] Format Codebase --- main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.py b/main.py index 1e5a20bb96..5b3062fa32 100755 --- a/main.py +++ b/main.py @@ -544,7 +544,7 @@ def get_common_info(spaces, implementation, model): info += f"{pre_space} - In valid execution mode, the query count for performance mode can be adjusted using `--env.MLC_MLPERF_LOADGEN_QUERY_COUNT=`.\n\n" if implementation.lower() == "reference" and model.lower() not in [ - "pointpainting", "llama3_1-8b", "deepseek-r1", "whisper"]: + "pointpainting", "llama3_1-8b", "deepseek-r1", "whisper"]: info += f"{pre_space} - `_r4.1-dev` could also be given instead of `_r5.0-dev` if you want to run the benchmark with the MLPerf version being 4.1.\n\n" if model == "rgat": @@ -576,7 +576,7 @@ def get_docker_info(spaces, model, implementation, elif model.lower() in ["llama3_1-8b", "whisper", "deepseek-r1"]: info += f"{pre_space} - `--env.MLC_USE_ML_MODEL_FROM_HOST=yes` option can be used to download the model on the host so that it can be reused across different container lanuches. \n\n" info += f"{pre_space} - `--env.MLC_USE_DATASET_FROM_HOST=yes` option can be used to download the dataset on the host so that it can be reused across different container lanuches. \n\n" - + if implementation.lower() == "nvidia": info += f"{pre_space} - Default batch size is assigned based on [GPU memory](https://github.com/mlcommons/cm4mlops/blob/dd0c35856969c68945524d5c80414c615f5fe42c/script/app-mlperf-inference-nvidia/_cm.yaml#L1129) or the [specified GPU](https://github.com/mlcommons/cm4mlops/blob/dd0c35856969c68945524d5c80414c615f5fe42c/script/app-mlperf-inference-nvidia/_cm.yaml#L1370). 
Please click more option for *docker launch* or *run command* to see how to specify the GPU name.\n\n" info += f"{pre_space} - When run with `--all_models=yes`, all the benchmark models of NVIDIA implementation can be executed within the same container.\n\n" From 906c0fcb0e43f3e041011baf9a8fb42ade41f7ec Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 15 Jul 2025 22:11:56 +0000 Subject: [PATCH 20/35] [Automated Commit] Format Codebase --- tools/submission/submission_checker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index feeb0e41ae..291f07476a 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -498,7 +498,7 @@ "rgat": ("acc", 0.7286 * 0.99), "pointpainting": ("mAP", 0.5425 * 0.999), "deepseek-r1": ("exact_match", 0.99 * 81.6773, "TOKENS_PER_SAMPLE", 0.9 * 4043.449), - "whisper": ("ACCURACY", (100.0-2.0671) * 0.99), + "whisper": ("ACCURACY", (100.0 - 2.0671) * 0.99), }, "accuracy-upper-limit": { "stable-diffusion-xl": ( From 024e4aded9afdfffd33aeba0271f20f4acc393f7 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Thu, 17 Jul 2025 14:18:54 +0530 Subject: [PATCH 21/35] Update main.py --- main.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/main.py b/main.py index 5b3062fa32..e21582a4af 100755 --- a/main.py +++ b/main.py @@ -66,6 +66,8 @@ def mlperf_inference_implementation_readme( frameworks = ["Pytorch"] elif implementation == "nvidia": + if model in ["retinanet", "resnet50", "3d-unet-99", "3d-unet-99.9]: + code_version = "r5.1-dev" if model in ["mixtral-8x7b"]: return pre_space + " WIP" devices = ["CUDA"] From 9e2c7a982b8d7eed48c508ef6dc0659c6c196db9 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 17 Jul 2025 08:50:04 +0000 Subject: [PATCH 22/35] [Automated Commit] Format Codebase --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index e21582a4af..e22e9f0421 100755 --- a/main.py +++ b/main.py @@ -66,7 +66,7 @@ def mlperf_inference_implementation_readme( frameworks = ["Pytorch"] elif implementation == "nvidia": - if model in ["retinanet", "resnet50", "3d-unet-99", "3d-unet-99.9]: + if model in ["retinanet", "resnet50", "3d-unet-99", "3d - unet - 99.9]: code_version = "r5.1-dev" if model in ["mixtral-8x7b"]: return pre_space + " WIP" From 0928e11461625e39b9380f8505a3d8b02a7d06e5 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Thu, 17 Jul 2025 14:20:51 +0530 Subject: [PATCH 23/35] updating for 5.1-dev (inference doc) --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index e22e9f0421..a4c3c556e8 100755 --- a/main.py +++ b/main.py @@ -66,7 +66,7 @@ def mlperf_inference_implementation_readme( frameworks = ["Pytorch"] elif implementation == "nvidia": - if model in ["retinanet", "resnet50", "3d-unet-99", "3d - unet - 99.9]: + if model in ["retinanet", "resnet50", "3d-unet-99", "3d - unet - 99.9"]: code_version = "r5.1-dev" if model in ["mixtral-8x7b"]: return pre_space + " WIP" From 7069a9e58be3f40d9acbc08942c31a2840e893c4 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 17 Jul 2025 08:51:14 +0000 Subject: [PATCH 24/35] [Automated Commit] Format Codebase --- main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index a4c3c556e8..b4d1f5e74d 100755 --- a/main.py +++ 
b/main.py @@ -66,7 +66,8 @@ def mlperf_inference_implementation_readme( frameworks = ["Pytorch"] elif implementation == "nvidia": - if model in ["retinanet", "resnet50", "3d-unet-99", "3d - unet - 99.9"]: + if model in ["retinanet", "resnet50", + "3d-unet-99", "3d - unet - 99.9"]: code_version = "r5.1-dev" if model in ["mixtral-8x7b"]: return pre_space + " WIP" From 253854ff4f10d73a94f3a071dcaf5fa2e9f02a5b Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Thu, 17 Jul 2025 14:23:03 +0530 Subject: [PATCH 25/35] fix typo --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index b4d1f5e74d..7cdec722ba 100755 --- a/main.py +++ b/main.py @@ -67,7 +67,7 @@ def mlperf_inference_implementation_readme( elif implementation == "nvidia": if model in ["retinanet", "resnet50", - "3d-unet-99", "3d - unet - 99.9"]: + "3d-unet-99", "3d-unet-99.9"]: code_version = "r5.1-dev" if model in ["mixtral-8x7b"]: return pre_space + " WIP" From 371d5835c94c187f5c2af69f9509dbf1b91f4e07 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 21 Jul 2025 13:52:33 +0000 Subject: [PATCH 26/35] [Automated Commit] Format Codebase --- compliance/nvidia/TEST06/run_verification.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/compliance/nvidia/TEST06/run_verification.py b/compliance/nvidia/TEST06/run_verification.py index 70e16f5266..cae64b3f47 100644 --- a/compliance/nvidia/TEST06/run_verification.py +++ b/compliance/nvidia/TEST06/run_verification.py @@ -53,7 +53,12 @@ def get_args(): "--scenario", "-s", required=True, - choices=["Offline", "Server", "Interactive", "SingleStream", "MultiStream"], + choices=[ + "Offline", + "Server", + "Interactive", + "SingleStream", + "MultiStream"], ) args = parser.parse_args() return args From ea86fc011fdeff2f0a1dd462010b4ea16d68c8c5 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Wed, 23 Jul 2025 11:01:30 +0530 Subject: [PATCH 27/35] Update main.py --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index 7cdec722ba..419b76a6da 100755 --- a/main.py +++ b/main.py @@ -190,7 +190,7 @@ def mlperf_inference_implementation_readme( content += f"{cur_space1}=== \"{device}\"\n" content += f"{cur_space2}##### {device} device\n\n" - # minimum system requirements + # get minimum system requirements content += get_min_system_requirements( cur_space2, model, implementation, device From 0828e9ca31c692174d23e1f8baf8eca860381c90 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 24 Jul 2025 18:35:56 +0000 Subject: [PATCH 28/35] [Automated Commit] Format Codebase --- language/deepseek-r1/eval_accuracy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/language/deepseek-r1/eval_accuracy.py b/language/deepseek-r1/eval_accuracy.py index bf537e9d3a..9c103fdcba 100644 --- a/language/deepseek-r1/eval_accuracy.py +++ b/language/deepseek-r1/eval_accuracy.py @@ -773,7 +773,7 @@ def print_evaluation_results(df_evaluated: pd.DataFrame, 'tokens_per_sample': mean_output_len, 'num-samples': len(df_evaluated), } - + print("\nResults\n") print(results) From f8c344fa5f5b312908a888be34f5f3fef615629e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 29 Jul 2025 15:52:53 +0000 Subject: [PATCH 29/35] [Automated Commit] Format Codebase --- language/llama3.1-8b/download_cnndm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/language/llama3.1-8b/download_cnndm.py b/language/llama3.1-8b/download_cnndm.py index d8694be720..90c9ad8d7a 100644 --- a/language/llama3.1-8b/download_cnndm.py +++ b/language/llama3.1-8b/download_cnndm.py @@ -100,8 +100,8 @@ def preprocess_function(sample, padding="max_length"): # create list of samples inputs = [] - #print(f"Num samples: {len(sample[text_column])}") - #for i in range(0, len(sample[text_column])): + # print(f"Num samples: {len(sample[text_column])}") + # for i in range(0, len(sample[text_column])): x = dict() x["instruction"] = instruction_template x["input"] = sample[text_column] @@ -109,7 +109,7 @@ def preprocess_function(sample, padding="max_length"): instruction_template[instruction].format_map(x) ) x["output"] = sample[summary_column] - #inputs.append(x) + # inputs.append(x) model_inputs = dict() model_inputs["text"] = x From d0a2ed48ed75803abf6788d742a1dfd4444885e3 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Wed, 30 Jul 2025 19:39:47 +0530 Subject: [PATCH 30/35] Doc updates (#2292) * improve submission doc * Update index.md * Fix for model and dataset download commands * update submission doc * [Automated Commit] Format Codebase * Update index.md * r2_downloader -> r2-downloader * Update multithreading information about SDXL * [Automated Commit] Format Codebase * .lower() for consistency * [Automated Commit] Format Codebase * updation for llama3_1-8b edge * [Automated Commit] Format Codebase --------- Co-authored-by: github-actions[bot] Co-authored-by: Arjun Suresh --- .../image_classification/get-resnet50-data.md | 6 +++--- .../language/get-deepseek-r1-data.md | 18 +++++++++--------- .../benchmarks/language/get-llama2-70b-data.md | 14 ++++++++++++-- .../language/get-llama3_1-405b-data.md | 8 ++++++++ .../language/get-llama3_1-8b-data.md | 17 +++++++++-------- .../object_detection/get-retinanet-data.md | 6 +++--- .../speech_to_text/get-whisper-data.md | 4 ++-- docs/submission/index.md | 15 ++++++++++++++- main.py | 15 +++++++++------ 9 files changed, 69 insertions(+), 34 deletions(-) diff --git a/docs/benchmarks/image_classification/get-resnet50-data.md b/docs/benchmarks/image_classification/get-resnet50-data.md index 9ecd25c1a5..64e17c8246 100644 --- a/docs/benchmarks/image_classification/get-resnet50-data.md +++ b/docs/benchmarks/image_classification/get-resnet50-data.md @@ -15,7 +15,7 @@ The benchmark implementation run command will automatically download the validat ### Get Validation Dataset ``` - mlcr get,dataset,imagenet,validation -j + mlcr get,dataset,imagenet,validation,_full -j ``` === "Calibration" ResNet50 calibration dataset consist of 500 images selected from the Imagenet 2012 validation dataset. There are 2 alternative options for the calibration dataset. @@ -32,7 +32,7 @@ The benchmark implementation run command will automatically download the validat ### Get ResNet50 preprocessed dataset ``` - mlcr get,dataset,image-classification,imagenet,preprocessed,_pytorch -j + mlcr get,dataset,image-classification,imagenet,preprocessed,_pytorch,_full-j ``` - `--outdirname=` could be provided to download the dataset to a specific location. @@ -52,7 +52,7 @@ Get the Official MLPerf ResNet50 Model ### Onnx ``` - mlcr get,ml-model,resnet50,_onnx -j + mlcr get,ml-model,resnet50,image-classification,_onnx -j ``` - `--outdirname=` could be provided to download the model to a specific location. 
\ No newline at end of file diff --git a/docs/benchmarks/language/get-deepseek-r1-data.md b/docs/benchmarks/language/get-deepseek-r1-data.md index 401c4d27bc..1dbce5cdf7 100644 --- a/docs/benchmarks/language/get-deepseek-r1-data.md +++ b/docs/benchmarks/language/get-deepseek-r1-data.md @@ -11,14 +11,14 @@ The benchmark implementation run command will automatically download the validat === "Validation" - ### Get Validation Dataset - ``` - mlcr get,preprocessed,dataset,deepseek-r1,_validation,_mlc,_rclone --outdirname= -j - ``` -=== "Calibration" + ### Get Validation Dataset + ``` + mlcr get,preprocessed,dataset,deepseek-r1,_validation,_mlc,_r2-downloader --outdirname= -j + ``` - ### Get Calibration Dataset - ``` - mlcr get,preprocessed,dataset,deepseek-r1,_calibration,_mlc,_rclone --outdirname= -j - ``` \ No newline at end of file +=== "Calibration" + + ### Get Calibration Dataset + ``` + mlcr get,preprocessed,dataset,deepseek-r1,_calibration,_mlc,_r2-downloader --outdirname= -j \ No newline at end of file diff --git a/docs/benchmarks/language/get-llama2-70b-data.md b/docs/benchmarks/language/get-llama2-70b-data.md index ce7cd996eb..6c22d3658d 100644 --- a/docs/benchmarks/language/get-llama2-70b-data.md +++ b/docs/benchmarks/language/get-llama2-70b-data.md @@ -16,7 +16,7 @@ The benchmark implementation run command will automatically download the validat ### Get Preprocessed Validation Dataset ``` - mlcr get,dataset,preprocessed,openorca,_validation -j + mlcr get,dataset,preprocessed,openorca,_validation,_mlcommons -j ``` === "Calibration" @@ -56,7 +56,17 @@ The benchmark implementation run command will automatically download the require ### Get the Official MLPerf LLAMA2-70B model from MLCOMMONS Google Drive ``` - mlcr get,ml-model,llama2-70b,_pytorch -j + mlcr get,ml-model,llama2-70b,_rclone,_mlc,_70b -j + ``` + + === "From MLCOMMONS Cloudflare R2" + + > **Note:** One has to accept the [MLCommons Llama 2 License Confidentiality Notice](https://llama2.mlcommons.org/) to access the model files in MLCOMMONS Cloudflare R2. + + ### Get the Official MLPerf LLAMA2-70B model from MLCOMMONS Cloudflare R2 + + ``` + mlcr get,ml-model,llama2-70b,_mlc,_r2-downloader,_70b -j ``` === "From Hugging Face repo" diff --git a/docs/benchmarks/language/get-llama3_1-405b-data.md b/docs/benchmarks/language/get-llama3_1-405b-data.md index ad05ca8610..3257cd17b0 100644 --- a/docs/benchmarks/language/get-llama3_1-405b-data.md +++ b/docs/benchmarks/language/get-llama3_1-405b-data.md @@ -38,6 +38,14 @@ The benchmark implementation run command will automatically download the require ``` mlcr get,ml-model,llama3 -j ``` + + === "From Cloudflare R2" + + > **Note:** One has to accept the [MLCommons Llama 3.1 License Confidentiality Notice](http://llama3-1.mlcommons.org/) to access the model files in MLCOMMONS Cloudflare R2. + + ### Get the Official MLPerf LLAMA3.1-405B model from MLCOMMONS Cloudflare R2 + ``` + mlcr get,ml-model,llama3,_mlc,_405b,_r2-downloader --outdirname= -j === "From Hugging Face repo" diff --git a/docs/benchmarks/language/get-llama3_1-8b-data.md b/docs/benchmarks/language/get-llama3_1-8b-data.md index e24cc37d44..26b3cf11d1 100644 --- a/docs/benchmarks/language/get-llama3_1-8b-data.md +++ b/docs/benchmarks/language/get-llama3_1-8b-data.md @@ -10,26 +10,27 @@ hide: The benchmark implementation run command will automatically download the validation and calibration datasets and do the necessary preprocessing. In case you want to download only the datasets, you can use the below commands.
=== "Validation" - + === "Full dataset (Datacenter)" ### Get Validation Dataset ``` - mlcr get,dataset,cnndm,_validation,_datacenter,_llama3,_mlc,_rclone --outdirname= -j + mlcr get,dataset,cnndm,_validation,_datacenter,_llama3,_mlc,_r2-downloader --outdirname= -j ``` - + === "5000 samples (Edge)" ### Get Validation Dataset ``` - mlcr get,dataset,cnndm,_validation,_edge,_llama3,_mlc,_rclone --outdirname= -j + mlcr get,dataset,cnndm,_validation,_edge,_llama3,_mlc,_r2-downloader --outdirname= -j ``` === "Calibration" - + ``` + ### Get Calibration Dataset ``` - mlcr get,dataset,cnndm,_calibration,_llama3,_mlc,_rclone --outdirname= -j + mlcr get,dataset,cnndm,_calibration,_llama3,_mlc,_r2-downloader --outdirname= -j ``` - `--outdirname=` could be provided to download the dataset to a specific location. @@ -39,13 +40,13 @@ The benchmark implementation run command will automatically download the require === "Pytorch" - === "From MLCOMMONS Google Drive" + === "From Cloudfare R2" > **Note:** One has to accept the [MLCommons Llama 3.1 License Confidentiality Notice](http://llama3-1.mlcommons.org/) to access the model files in MLCOMMONS Google Drive. ### Get the Official MLPerf LLAMA3.1-405B model from MLCOMMONS Cloudfare R2 ``` - TBD + mlcr get,ml-model,llama3,_mlc,_8b,_r2-downloader --outdirname= -j ``` === "From Hugging Face repo" diff --git a/docs/benchmarks/object_detection/get-retinanet-data.md b/docs/benchmarks/object_detection/get-retinanet-data.md index 6127eed541..00c5bf8451 100644 --- a/docs/benchmarks/object_detection/get-retinanet-data.md +++ b/docs/benchmarks/object_detection/get-retinanet-data.md @@ -16,7 +16,7 @@ The benchmark implementation run command will automatically download the validat ### Get Validation Dataset ``` - mlcr get,dataset,openimages,_validation -j + mlcr get,dataset,openimages,original,_validation -j ``` === "Calibration" @@ -24,14 +24,14 @@ The benchmark implementation run command will automatically download the validat ### Get OpenImages Calibration dataset ``` - mlcr get,dataset,openimages,_calibration -j + mlcr get,dataset,openimages,original,_calibration -j ``` === "Preprocessed" ### Get Preprocessed OpenImages dataset ``` - get,dataset,object-detection,open-images,openimages,preprocessed,_validation -j + mlcr get,dataset,object-detection,open-images,openimages,preprocessed,_validation -j ``` - `--outdirname=` could be provided to download the dataset to a specific location. diff --git a/docs/benchmarks/speech_to_text/get-whisper-data.md b/docs/benchmarks/speech_to_text/get-whisper-data.md index 9bc97ad9a0..ed9e3b02b0 100644 --- a/docs/benchmarks/speech_to_text/get-whisper-data.md +++ b/docs/benchmarks/speech_to_text/get-whisper-data.md @@ -15,7 +15,7 @@ The benchmark implementation run command will automatically download the validat ### Get Preprocessed Validation Dataset ``` - mlcr get,dataset,whisper,_preprocessed,_mlc,_rclone --outdirname= -j + mlcr get,dataset,whisper,_preprocessed,_mlc,_r2-downloader --outdirname= -j ``` === "Unprocessed" @@ -34,7 +34,7 @@ The benchmark implementation run command will automatically download the require ### Get the Official MLPerf Whisper model from MLCOMMONS Cloudflare R2 ``` - mlcr get,ml-model,whisper,_rclone,_mlc s-j + mlcr get,ml-model,whisper,_r2-downloader,_mlc -j ``` - `--outdirname=` could be provided to download the model to a specific location. 
\ No newline at end of file diff --git a/docs/submission/index.md b/docs/submission/index.md index 079a513854..56f872fc6e 100644 --- a/docs/submission/index.md +++ b/docs/submission/index.md @@ -158,6 +158,18 @@ If there are multiple systems where MLPerf results are collected, the same proce --commit_message="Results on added by " \ --quiet ``` + + The path to the locally synced submission directory from the output below can be used in the next step by passing it to the `--submission_dir` argument. +
+ Click to see the sample output + ``` + [2025-07-23 16:36:56,399 module.py:2197 INFO] - + + Path to the locally synced submission directory: mysubmissions/mlperf_submission + + + ``` +
```mermaid flowchart LR @@ -193,7 +205,8 @@ Once you have all the results on the system, you can upload them to the MLCommon mlcr run,mlperf,submission,checker,inference \ --submitter_id=<> \ --submission_dir= - ``` + ``` + === "via Browser" You can do the following command to generate the final submission tar file and then upload to the [MLCommons Submission UI](https://submissions-ui.mlcommons.org/submission). ``` diff --git a/main.py b/main.py index 419b76a6da..e4782bf3ac 100755 --- a/main.py +++ b/main.py @@ -45,7 +45,8 @@ def mlperf_inference_implementation_readme( if "99.9" not in model and implementation_tips: content += f"\n{pre_space}!!! tip\n\n" content += f"{pre_space} - MLCommons reference implementations are only meant to provide a rules compliant reference implementation for the submitters and in most cases are not best performing. If you want to benchmark any system, it is advisable to use the vendor MLPerf implementation for that system like Nvidia, Intel etc.\n\n" - + if model.lower() in ["sdxl"]: + content += f"\n{pre_space}> **Note:** {model.upper()} reference implementation does not support multithreading.\n\n" if not devices: devices = ["CPU", "CUDA", "ROCm"] @@ -139,10 +140,10 @@ def mlperf_inference_implementation_readme( categories = ["Datacenter"] elif model.lower() in ["pointpainting"]: categories = ["Edge"] - elif model.lower() in ["bert-99.9", "dlrm", "llama2", "mixtral", "llama3", "deepseek-r1"]: + elif model.lower() in ["bert-99.9", "dlrm", "llama2", "mixtral", "llama3_1-405b-99.9", "llama3_1-405b-99", "deepseek-r1"]: categories = ["Datacenter"] else: - categories = ["Edge", "Datacenter"] + categories = ["Datacenter", "Edge"] # model name content += f"{pre_space}{model.upper()}\n\n" @@ -159,6 +160,8 @@ def mlperf_inference_implementation_readme( scenarios.remove("Offline") if model.lower() in ["whisper"]: scenarios.remove("SingleStream") + if model.lower() == "llama3_1-8b": + model = "llama3_1-8b-edge" elif category == "Datacenter": scenarios = ["Offline", "Server"] if model.lower() in ["whisper"]: @@ -547,7 +550,7 @@ def get_common_info(spaces, implementation, model): info += f"{pre_space} - In valid execution mode, the query count for performance mode can be adjusted using `--env.MLC_MLPERF_LOADGEN_QUERY_COUNT=`.\n\n" if implementation.lower() == "reference" and model.lower() not in [ - "pointpainting", "llama3_1-8b", "deepseek-r1", "whisper"]: + "pointpainting", "llama3_1-8b", "llama3_1-8b-edge", "deepseek-r1", "whisper"]: info += f"{pre_space} - `_r4.1-dev` could also be given instead of `_r5.0-dev` if you want to run the benchmark with the MLPerf version being 4.1.\n\n" if model == "rgat": @@ -573,10 +576,10 @@ def get_docker_info(spaces, model, implementation, if model == "sdxl": info += f"{pre_space} - `--env.MLC_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes` option can be used to download the model on the host so that it can be reused across different container lanuches. \n\n" - elif "llama3" in model.lower(): + elif "llama3_1-405b" in model.lower(): info += f"{pre_space} - `--env.MLC_MLPERF_MODEL_LLAMA3_DOWNLOAD_TO_HOST=yes` option can be used to download the model on the host so that it can be reused across different container lanuches. \n\n" info += f"{pre_space} - `--env.MLC_MLPERF_DATASET_LLAMA3_DOWNLOAD_TO_HOST=yes` option can be used to download the dataset on the host so that it can be reused across different container lanuches. 
\n\n" - elif model.lower() in ["llama3_1-8b", "whisper", "deepseek-r1"]: + elif model.lower() in ["llama3_1-8b", "llama3_1-8b-edge", "whisper", "deepseek-r1"]: info += f"{pre_space} - `--env.MLC_USE_ML_MODEL_FROM_HOST=yes` option can be used to download the model on the host so that it can be reused across different container lanuches. \n\n" info += f"{pre_space} - `--env.MLC_USE_DATASET_FROM_HOST=yes` option can be used to download the dataset on the host so that it can be reused across different container lanuches. \n\n" From 23bd06243d9232728f185bfe39411ec977764712 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Thu, 31 Jul 2025 13:15:03 +0530 Subject: [PATCH 31/35] Add quiet flags to MLC commands (#2309) --- docs/submission/index.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/submission/index.md b/docs/submission/index.md index 56f872fc6e..7b4ab585f5 100644 --- a/docs/submission/index.md +++ b/docs/submission/index.md @@ -204,7 +204,7 @@ Once you have all the results on the system, you can upload them to the MLCommon ``` mlcr run,mlperf,submission,checker,inference \ --submitter_id=<> \ - --submission_dir= + --submission_dir= --quiet ``` === "via Browser" @@ -213,7 +213,7 @@ Once you have all the results on the system, you can upload them to the MLCommon mlcr run,mlperf,submission,checker,inference \ --submission_dir= \ --tar=yes \ - --submission_tar_file=mysubmission.tar.gz + --submission_tar_file=mysubmission.tar.gz --quiet ``` ```mermaid From 6ec49291d40cf427f5c45428e952db6e6349c9c0 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Thu, 31 Jul 2025 23:27:58 +0530 Subject: [PATCH 32/35] Improve docs - submission generation (#2311) --- docs/submission/index.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/submission/index.md b/docs/submission/index.md index 7b4ab585f5..f920bbfa11 100644 --- a/docs/submission/index.md +++ b/docs/submission/index.md @@ -159,7 +159,7 @@ If there are multiple systems where MLPerf results are collected, the same proce --quiet ``` - The path to the locally synced submission directory from the output below can be used in the next step by passing it to the `--submission_dir` argument. + > **Note:** The path to the locally synced submission directory from the output below can be used in the next step by passing it to the `--submission_dir` argument.
Click to see the sample output ``` @@ -204,7 +204,7 @@ Once you have all the results on the system, you can upload them to the MLCommon ``` mlcr run,mlperf,submission,checker,inference \ --submitter_id=<> \ - --submission_dir= --quiet + --submission_dir= --quiet ``` === "via Browser" From ff856b8ff204395338432d7c823f677644c2ffe2 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 31 Jul 2025 22:04:42 +0000 Subject: [PATCH 33/35] [Automated Commit] Format Codebase --- tools/submission/submission_checker.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index f124b808c5..7a0c22c902 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -1487,7 +1487,8 @@ def check_accuracy_dir(config, model, path, verbose): def extra_check_llm(mlperf_log, scenario, model): if mlperf_log["requested_use_token_latencies"]: if scenario not in ["Server", "Interactive"]: - # For offline, singlestream and multistream no further checks are necessary + # For offline, singlestream and multistream no further checks are + # necessary return True else: limits = LLM_LATENCY_LIMITS[model][scenario] @@ -1887,7 +1888,7 @@ def get_power_metric(config, scenario_fixed, log_path, is_valid, res): samples_per_query = 8 if (scenario_fixed in ["MultiStream"] - ) and scenario in ["SingleStream"]: + ) and scenario in ["SingleStream"]: power_metric = ( avg_power * power_duration * samples_per_query * 1000 / num_queries ) From bf1469eccc454c3e74733740ad1be0653d8959d6 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 4 Nov 2025 22:11:55 +0000 Subject: [PATCH 34/35] [Automated Commit] Format Codebase --- speech2text/accuracy_eval.py | 4 ++-- speech2text/reference_SUT.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/speech2text/accuracy_eval.py b/speech2text/accuracy_eval.py index eb6cc53299..c356ab6398 100644 --- a/speech2text/accuracy_eval.py +++ b/speech2text/accuracy_eval.py @@ -57,12 +57,12 @@ "x", "y", "z", - "'", + "'", "0", "1", "2", "3", - "4", + "4", "5", "6", "7", diff --git a/speech2text/reference_SUT.py b/speech2text/reference_SUT.py index 63d491a00f..0b2f02c490 100644 --- a/speech2text/reference_SUT.py +++ b/speech2text/reference_SUT.py @@ -90,12 +90,12 @@ def get_start_cores(start_cores="0"): "x", "y", "z", - "'", + "'", "0", "1", "2", "3", - "4", + "4", "5", "6", "7", From d9f1bc1d28939f3d6b58678aad7184539f7c072c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 25 Nov 2025 16:56:38 +0000 Subject: [PATCH 35/35] [Automated Commit] Format Codebase --- tools/submission/preprocess_submission.py | 19 +++++++++++++------ tools/submission/submission_checker.py | 5 ++--- tools/submission/truncate_accuracy_log.py | 3 ++- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/tools/submission/preprocess_submission.py b/tools/submission/preprocess_submission.py index 34043208c8..df3d748c5c 100644 --- a/tools/submission/preprocess_submission.py +++ b/tools/submission/preprocess_submission.py @@ -99,7 +99,7 @@ def delete_empty_dirs(src): return False -def copy_submission_dir(src, dst, filter_submitter, keep_structure = True): +def copy_submission_dir(src, dst, filter_submitter, keep_structure=True): """ Copies the submission tree to output directory for processing """ @@ -116,15 +116,18 @@ def copy_submission_dir(src, dst, filter_submitter, keep_structure = True): ) else: for dir in os.listdir(os.path.join(src, 
division, submitter)): - if os.path.isdir(os.path.join(src, division, submitter, dir)): - target_dir = "results" if dir in ["compliance", "measurements"] else dir + if os.path.isdir(os.path.join( + src, division, submitter, dir)): + target_dir = "results" if dir in [ + "compliance", "measurements"] else dir shutil.copytree( os.path.join(src, division, submitter, dir), os.path.join(dst, division, submitter, target_dir), - dirs_exist_ok = True + dirs_exist_ok=True ) for file in os.listdir(os.path.join(src, division, submitter)): - if os.path.isfile(os.path.join(src, division, submitter, file)): + if os.path.isfile(os.path.join( + src, division, submitter, file)): shutil.copyfile( os.path.join(src, division, submitter, file), os.path.join(dst, division, submitter, file) @@ -561,7 +564,11 @@ def main(): log.error(f"output directory {args.output} already exists") sys.exit(1) os.makedirs(args.output) - copy_submission_dir(args.input, args.output, args.submitter, args.keep_structure) + copy_submission_dir( + args.input, + args.output, + args.submitter, + args.keep_structure) src_dir = args.output config = checker.Config( diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index 335485c33c..5c2801bacb 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -1061,7 +1061,7 @@ def set_type(self, submission_type): self.optional = self.base["optional-scenarios-datacenter-edge"] else: raise ValueError("invalid system type") - + def skip_calibration(self): return self.skip_calibration_check or self.version in ["v5.0"] @@ -1893,7 +1893,7 @@ def get_power_metric(config, scenario_fixed, log_path, is_valid, res): samples_per_query = 8 if (scenario_fixed in ["MultiStream"] - ) and scenario in ["SingleStream"]: + ) and scenario in ["SingleStream"]: power_metric = ( avg_power * power_duration * samples_per_query * 1000 / num_queries ) @@ -3040,7 +3040,6 @@ def check_measurement_dir( end = len(".json") break - weight_data_types = None if system_file: with open(os.path.join(measurement_dir, system_file), "r") as f: diff --git a/tools/submission/truncate_accuracy_log.py b/tools/submission/truncate_accuracy_log.py index 6c1267fdf8..87bba5ab98 100755 --- a/tools/submission/truncate_accuracy_log.py +++ b/tools/submission/truncate_accuracy_log.py @@ -172,7 +172,8 @@ def truncate_results_dir(filter_submitter, backup, scenarios_to_skip): acc_path, "accuracy.txt") # only TEST01 has an accuracy log - if str(test).startswith("TEST") and test != "TEST01": + if str(test).startswith( + "TEST") and test != "TEST01": continue if not os.path.exists(acc_log): log.error("%s missing", acc_log)
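PATCH 35 reworks `copy_submission_dir` in `preprocess_submission.py` so that, when the original layout is not kept, `compliance` and `measurements` trees are merged into `results` via `shutil.copytree(..., dirs_exist_ok=True)`. The snippet below is a simplified, self-contained sketch of that mapping under assumed paths and names; it is not the actual MLPerf tooling code.

```python
# Simplified sketch of the directory mapping shown in PATCH 35; the demo tree and
# function name are assumptions, while the compliance/measurements -> results merge
# and dirs_exist_ok=True mirror the patched logic.
import os
import shutil
import tempfile


def copy_submitter_dirs(src: str, dst: str, keep_structure: bool = False) -> None:
    for entry in os.listdir(src):
        src_path = os.path.join(src, entry)
        if not os.path.isdir(src_path):
            shutil.copyfile(src_path, os.path.join(dst, entry))
            continue
        # Fold compliance/ and measurements/ into results/ unless told otherwise.
        if not keep_structure and entry in ("compliance", "measurements"):
            target = "results"
        else:
            target = entry
        shutil.copytree(src_path, os.path.join(dst, target), dirs_exist_ok=True)


if __name__ == "__main__":
    src, dst = tempfile.mkdtemp(), tempfile.mkdtemp()
    for d in ("results", "compliance", "measurements"):
        os.makedirs(os.path.join(src, d, "system1"))
    copy_submitter_dirs(src, dst)
    print(sorted(os.listdir(dst)))  # expected: ['results']
```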