From 63f87e0ca4f67f469b02ebad2c232a72f3b97401 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 19 Feb 2025 22:43:25 +0000 Subject: [PATCH 01/35] [Automated Commit] Format Codebase --- compliance/nvidia/TEST01/verify_performance.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/compliance/nvidia/TEST01/verify_performance.py b/compliance/nvidia/TEST01/verify_performance.py index cc400c8ed7..4b527730b1 100644 --- a/compliance/nvidia/TEST01/verify_performance.py +++ b/compliance/nvidia/TEST01/verify_performance.py @@ -54,12 +54,14 @@ def main(): continue if ref_mode == "SingleStream": - if re.match(".*Early stopping (90th|90.0th|99.9th) percentile estimate", line): + if re.match( + ".*Early stopping (90th|90.0th|99.9th) percentile estimate", line): ref_score = line.split(": ", 1)[1].strip() continue if ref_mode == "MultiStream": - if re.match(".*Early stopping (99th|99.0th) percentile estimate", line): + if re.match( + ".*Early stopping (99th|99.0th) percentile estimate", line): ref_score = line.split(": ", 1)[1].strip() continue @@ -96,7 +98,8 @@ def main(): continue if test_mode == "MultiStream": - if re.match(".*Early stopping (99th|99.0th) percentile estimate", line): + if re.match( + ".*Early stopping (99th|99.0th) percentile estimate", line): test_score = line.split(": ", 1)[1].strip() continue From ec284d277523e5409694b431a94ac0601ab6fc3b Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Mon, 24 Feb 2025 16:37:53 +0000 Subject: [PATCH 02/35] Updated tags for submission checker command in docs --- docs/submission/index.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/submission/index.md b/docs/submission/index.md index da30c18350..6a6bbfb2f2 100644 --- a/docs/submission/index.md +++ b/docs/submission/index.md @@ -184,14 +184,14 @@ Once you have all the results on the system, you can upload them to the MLCommon === "via CLI" You can do the following command which will run the submission checker and upload the results to the MLCommons submission server ``` - mlcr run,submission,checker,inference \ + mlcr run,mlperf,submission,checker,inference \ --submitter_id=<> \ --submission_dir= ``` === "via Browser" You can do the following command to generate the final submission tar file and then upload to the [MLCommons Submission UI](https://submissions-ui.mlcommons.org/submission). 
``` - mlcr run,submission,checker \ + mlcr run,mlperf,submission,checker,inference \ --submission_dir= \ --tar=yes \ --submission_tar_file=mysubmission.tar.gz From 5335553c19fe0aa464c7c466325c48fd60c4b6df Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Tue, 25 Feb 2025 10:44:55 +0000 Subject: [PATCH 03/35] Update mobilenets docs --- .../image_classification/mobilenets.md | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/docs/benchmarks/image_classification/mobilenets.md b/docs/benchmarks/image_classification/mobilenets.md index 7e4605b4b0..9501521b32 100644 --- a/docs/benchmarks/image_classification/mobilenets.md +++ b/docs/benchmarks/image_classification/mobilenets.md @@ -23,10 +23,10 @@ Mobilenet models are not official MLPerf models and so cannot be used for a Clos mlcr run,mobilenet-models,_tflite,_mobilenet-v2 \ --adr.compiler.tags=gcc ``` -=== "Mobilenet-V2" - ### Mobilenet V2 +=== "Mobilenet-V3" + ### Mobilenet V3 ```bash - mlcr run,mobilenet-models,_tflite,_mobilenet-v2 \ + mlcr run,mobilenet-models,_tflite,_mobilenet-v3 \ --adr.compiler.tags=gcc ``` === "Mobilenets" @@ -41,6 +41,12 @@ Mobilenet models are not official MLPerf models and so cannot be used for a Clos mlcr run,mobilenet-models,_tflite,_efficientnet \ --adr.compiler.tags=gcc ``` +=== "Mobilenets and Efficientnet" + ### Mobilenets and Efficientnet + ```bash + mlcr run,mobilenet-models,_tflite \ + --adr.compiler.tags=gcc + ``` ## ARMNN Backend === "Mobilenet-V1" @@ -55,7 +61,7 @@ Mobilenet models are not official MLPerf models and so cannot be used for a Clos mlcr run,mobilenet-models,_tflite,_armnn,_mobilenet-v2 \ --adr.compiler.tags=gcc ``` -=== "Mobilenet-V2" +=== "Mobilenet-V3" ### Mobilenet V2 ```bash mlcr run,mobilenet-models,_tflite,_armnn,_mobilenet-v2 \ @@ -73,4 +79,11 @@ Mobilenet models are not official MLPerf models and so cannot be used for a Clos mlcr run,mobilenet-models,_tflite,_armnn,_efficientnet \ --adr.compiler.tags=gcc ``` +=== "Mobilenets and Efficientnet" + ### Mobilenets and Efficientnet + ```bash + mlcr run,mobilenet-models,_tflite,_armnn \ + --adr.compiler.tags=gcc + ``` + From b9767aa16544a4c759c32767c5ceeb9546b9b195 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Tue, 25 Feb 2025 16:55:02 +0000 Subject: [PATCH 04/35] Update main.py --- main.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index 2ccfab2958..a8fcb30a49 100755 --- a/main.py +++ b/main.py @@ -28,11 +28,13 @@ def mlperf_inference_implementation_readme( content = "" execution_envs = ["Docker", "Native"] - code_version = "r4.1-dev" + code_version = "r5.0-dev" implementation_run_options = [] if model == "rnnt": code_version = "r4.0" + elif implementation == "intel": + code_version = "r4.1-dev" if implementation == "reference": # Tip From f42aeeb8a96c58dd8ffff8cecde1e08aa5f10d41 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Tue, 25 Feb 2025 16:56:51 +0000 Subject: [PATCH 05/35] Update main.py --- main.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/main.py b/main.py index a8fcb30a49..0020d8c4ec 100755 --- a/main.py +++ b/main.py @@ -341,12 +341,7 @@ def mlperf_inference_implementation_readme( and framework.lower() == "deepsparse" ): run_suffix += f"{cur_space3}You can use any model from [NeuralMagic sparse zoo](https://sparsezoo.neuralmagic.com/?modelSet=computer_vision&architectures=resnet_v1) (trained on Imagenet dataset) as --nm_model_zoo_stub" - if ( - "bert" in model.lower() - and framework.lower() == "deepsparse" - ): 
- run_suffix += "You can use any model from [NeuralMagic sparse zoo](https://sparsezoo.neuralmagic.com/?modelSet=computer_vision&architectures=resnet_v1) (trained on Imagenet dataset) as --nm_model_zoo_stub" - if ( + elif ( "bert" in model.lower() and framework.lower() == "deepsparse" ): From c699ce30ad313a5b9e5f91a9874cf1f4e917772e Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Tue, 25 Feb 2025 22:38:06 +0530 Subject: [PATCH 06/35] update dataset download commands - waymo calib (#2130) --- .../get-pointpainting-data.md | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/docs/benchmarks/automotive/3d_object_detection/get-pointpainting-data.md b/docs/benchmarks/automotive/3d_object_detection/get-pointpainting-data.md index 0a1e65c8ea..6331b3535b 100644 --- a/docs/benchmarks/automotive/3d_object_detection/get-pointpainting-data.md +++ b/docs/benchmarks/automotive/3d_object_detection/get-pointpainting-data.md @@ -9,11 +9,21 @@ hide: > **Note:** By default, the waymo dataset is downloaded from the mlcommons official drive. One has to accept the [MLCommons Waymo Open Dataset EULA](https://waymo.mlcommons.org/) to access the dataset files. -The benchmark implementation run command will automatically download the preprocessed dataset. In case you want to download only the datasets, you can use the below command. +The benchmark implementation run command will automatically download the preprocessed dataset. In case you want to download only the datasets, you can use the below commands. -```bash -mlcr get,dataset,waymo -j -``` +=== "Validation" + + ### Get Validation Dataset + ``` + mlcr get,dataset,waymo -j + ``` + +=== "Calibration" + + ### Get Calibration Dataset + ``` + mlcr get,dataset,waymo,calibration -j + ``` - `--outdirname=` could be provided to download the dataset to a specific location. 
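For reference, a minimal sketch of the calibration download shown in the doc above combined with an explicit output directory; the target path is illustrative only and is not part of the patch:

```bash
# Hypothetical destination directory; substitute your own storage location
mlcr get,dataset,waymo,calibration --outdirname=/data/waymo_calibration -j
```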
From 7bf2c5e6bc49e4c814f93e92d77189b0fd83d7ba Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 13 Mar 2025 18:47:52 +0000 Subject: [PATCH 07/35] Merge from Master (#2155) * Update submission_checker.py | Fix open model unit in Results (#2144) * Add Llama 3.1 to special unit dict (#2150) --------- Co-authored-by: Pablo Gonzalez --- tools/submission/submission_checker.py | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index c3c30c14d7..7ac74be40d 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -1914,50 +1914,38 @@ def log_result( notes = notes + system_json.get("sw_notes") special_unit_dict = { "gptj-99": { - "SingleStream": "Latency (ms)", - "MultiStream": "Latency (ms)", "Offline": "Tokens/s", "Server": "Tokens/s", }, "gptj-99.9": { - "SingleStream": "Latency (ms)", - "MultiStream": "Latency (ms)", "Offline": "Tokens/s", "Server": "Tokens/s", }, "llama2-70b-99": { - "SingleStream": "Latency (ms)", - "MultiStream": "Latency (ms)", "Offline": "Tokens/s", "Server": "Tokens/s", }, "llama2-70b-99.9": { - "SingleStream": "Latency (ms)", - "MultiStream": "Latency (ms)", "Offline": "Tokens/s", "Server": "Tokens/s", }, "llama2-70b-interactive-99": { - "SingleStream": "Latency (ms)", - "MultiStream": "Latency (ms)", "Offline": "Tokens/s", "Server": "Tokens/s", }, "llama2-70b-interactive-99.9": { - "SingleStream": "Latency (ms)", - "MultiStream": "Latency (ms)", + "Offline": "Tokens/s", + "Server": "Tokens/s", + }, + "llama3.1-405b": { "Offline": "Tokens/s", "Server": "Tokens/s", }, "mixtral-8x7b": { - "SingleStream": "Latency (ms)", - "MultiStream": "Latency (ms)", "Offline": "Tokens/s", "Server": "Tokens/s", }, "llama3.1-405b": { - "SingleStream": "Latency (ms)", - "MultiStream": "Latency (ms)", "Offline": "Tokens/s", "Server": "Tokens/s", }, @@ -1977,7 +1965,7 @@ def log_result( if config.version == "v4.0": unit = unit_dict[scenario_fixed] else: - unit = special_unit_dict.get(model_name, unit_dict)[scenario_fixed] + unit = special_unit_dict.get(mlperf_model, unit_dict).get(scenario_fixed, unit_dict[scenario_fixed]) power_unit = power_unit_dict[scenario_fixed] if (power_metric <= 0) or ( From 2a73202e8bdf0028dd7f0e78f4f2d569d6fb1561 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 13 Mar 2025 18:48:21 +0000 Subject: [PATCH 08/35] [Automated Commit] Format Codebase --- tools/submission/submission_checker.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index 7ac74be40d..0694273b19 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -1702,7 +1702,7 @@ def get_power_metric(config, scenario_fixed, log_path, is_valid, res): samples_per_query = 8 if (scenario_fixed in ["MultiStream"] - ) and scenario in ["SingleStream"]: + ) and scenario in ["SingleStream"]: power_metric = ( avg_power * power_duration * samples_per_query * 1000 / num_queries ) @@ -1965,7 +1965,9 @@ def log_result( if config.version == "v4.0": unit = unit_dict[scenario_fixed] else: - unit = special_unit_dict.get(mlperf_model, unit_dict).get(scenario_fixed, unit_dict[scenario_fixed]) + unit = special_unit_dict.get( + mlperf_model, unit_dict).get( + scenario_fixed, unit_dict[scenario_fixed]) power_unit = power_unit_dict[scenario_fixed] if (power_metric <= 0) or ( From 
2fb105769406a98ffcd587c3b49c303ba6a9de4b Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Tue, 22 Apr 2025 18:53:22 +0530 Subject: [PATCH 09/35] Inference docs - Update model and dataset download commands (#2153) * Update llama2 70b model download docs * changes in model and dataset download commands --- docs/benchmarks/graph/get-rgat-data.md | 33 +++++++--- .../image_classification/get-resnet50-data.md | 38 +++++++---- docs/benchmarks/language/get-bert-data.md | 19 ++++++ docs/benchmarks/language/get-gptj-data.md | 14 +++- .../language/get-llama2-70b-data.md | 66 ++++++++++++++----- .../language/get-llama3_1-405b-data.md | 29 +++++--- .../language/get-mixtral-8x7b-data.md | 13 +++- .../medical_imaging/get-3d-unet-data.md | 30 +++++++-- .../object_detection/get-retinanet-data.md | 33 +++++++--- .../recommendation/get-dlrm-v2-data.md | 7 +- .../benchmarks/text_to_image/get-sdxl-data.md | 27 ++++++-- 11 files changed, 236 insertions(+), 73 deletions(-) diff --git a/docs/benchmarks/graph/get-rgat-data.md b/docs/benchmarks/graph/get-rgat-data.md index 6ab9515e59..bb719fea2e 100644 --- a/docs/benchmarks/graph/get-rgat-data.md +++ b/docs/benchmarks/graph/get-rgat-data.md @@ -9,22 +9,34 @@ hide: The benchmark implementation run command will automatically download the validation and calibration datasets and do the necessary preprocessing. In case you want to download only the datasets, you can use the below commands. -=== "Full Dataset" - R-GAT validation run uses the IGBH dataset consisting of 547,306,935 nodes and 5,812,005,639 edges. +=== "Validation" - ### Get Full Dataset - ``` - mlcr get,dataset,igbh,_full -j - ``` + === "Full Dataset" + R-GAT validation run uses the IGBH dataset consisting of 547,306,935 nodes and 5,812,005,639 edges. + + ### Get Full Dataset + ``` + mlcr get,dataset,igbh,_full -j + ``` -=== "Debug Dataset" - R-GAT debug run uses the IGBH debug dataset(tiny). + === "Debug Dataset" + R-GAT debug run uses the IGBH debug dataset(tiny). - ### Get Full Dataset + ### Get Full Dataset + ``` + mlcr get,dataset,igbh,_debug -j + ``` + +=== "Calibration" + The calibration dataset contains 5000 nodes from the training paper nodes of the IGBH dataset. IGBH `full` dataset would be downloaded for creating calibration dataset. + + ### Get Calibration Dataset ``` - mlcr get,dataset,igbh,_debug -j + mlcr get,dataset,igbh,_full,_calibration -j ``` +- `--outdirname=` could be provided to download the dataset to a specific location. + ## Model The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. @@ -37,3 +49,4 @@ Get the Official MLPerf R-GAT Model mlcr get,ml-model,rgat -j ``` +- `--outdirname=` could be provided to download the model to a specific location. \ No newline at end of file diff --git a/docs/benchmarks/image_classification/get-resnet50-data.md b/docs/benchmarks/image_classification/get-resnet50-data.md index 771571d5c7..9ecd25c1a5 100644 --- a/docs/benchmarks/image_classification/get-resnet50-data.md +++ b/docs/benchmarks/image_classification/get-resnet50-data.md @@ -9,25 +9,34 @@ hide: The benchmark implementation run command will automatically download the validation and calibration datasets and do the necessary preprocessing. In case you want to download only the datasets, you can use the below commands. 
-=== "Validation" - ResNet50 validation run uses the Imagenet 2012 validation dataset consisting of 50,000 images. +=== "Unprocessed" + === "Validation" + ResNet50 validation run uses the Imagenet 2012 validation dataset consisting of 50,000 images. - ### Get Validation Dataset - ``` - mlcr get,dataset,imagenet,validation -j - ``` -=== "Calibration" - ResNet50 calibration dataset consist of 500 images selected from the Imagenet 2012 validation dataset. There are 2 alternative options for the calibration dataset. + ### Get Validation Dataset + ``` + mlcr get,dataset,imagenet,validation -j + ``` + === "Calibration" + ResNet50 calibration dataset consist of 500 images selected from the Imagenet 2012 validation dataset. There are 2 alternative options for the calibration dataset. + + ### Get Calibration Dataset Using Option 1 + ``` + mlcr get,dataset,imagenet,calibration,_mlperf.option1 -j + ``` + ### Get Calibration Dataset Using Option 2 + ``` + mlcr get,dataset,imagenet,calibration,_mlperf.option2 -j + ``` +=== "Preprocessed" + ### Get ResNet50 preprocessed dataset - ### Get Calibration Dataset Using Option 1 - ``` - mlcr get,dataset,imagenet,calibration,_mlperf.option1 -j - ``` - ### Get Calibration Dataset Using Option 2 ``` - mlcr get,dataset,imagenet,calibration,_mlperf.option2 -j + mlcr get,dataset,image-classification,imagenet,preprocessed,_pytorch -j ``` +- `--outdirname=` could be provided to download the dataset to a specific location. + ## Model The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. @@ -46,3 +55,4 @@ Get the Official MLPerf ResNet50 Model mlcr get,ml-model,resnet50,_onnx -j ``` +- `--outdirname=` could be provided to download the model to a specific location. \ No newline at end of file diff --git a/docs/benchmarks/language/get-bert-data.md b/docs/benchmarks/language/get-bert-data.md index 430031f319..ab3ba9b537 100644 --- a/docs/benchmarks/language/get-bert-data.md +++ b/docs/benchmarks/language/get-bert-data.md @@ -17,6 +17,24 @@ The benchmark implementation run command will automatically download the validat mlcr get,dataset,squad,validation -j ``` +=== "Calibration" + + === "Calibration Set 1" + + ### Get Calibration Dataset + ``` + mlcr get,dataset,squad,_calib1 -j + ``` + + === "Calibration Set 2" + + ### Get Calibration Dataset + ``` + mlcr get,dataset,squad,_calib2 -j + ``` + +- `--outdirname=` could be provided to download the dataset to a specific location. + ## Model The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. @@ -41,3 +59,4 @@ Get the Official MLPerf Bert-Large Model mlcr get,ml-model,bert-large,_tensorflow -j ``` +- `--outdirname=` could be provided to download the model to a specific location. diff --git a/docs/benchmarks/language/get-gptj-data.md b/docs/benchmarks/language/get-gptj-data.md index 34140598e9..60e2568b6e 100644 --- a/docs/benchmarks/language/get-gptj-data.md +++ b/docs/benchmarks/language/get-gptj-data.md @@ -14,9 +14,19 @@ The benchmark implementation run command will automatically download the validat ### Get Validation Dataset ``` - mlcr get,dataset,cnndm,validation -j + mlcr get,dataset,cnndm,_validation -j ``` +=== "Calibration" + GPT-J calibration dataset is extracted from the CNNDM dataset. 
+ + ### Get Validation Dataset + ``` + mlcr get,dataset,cnndm,_calibration -j + ``` + +- `--outdirname=` could be provided to download the dataset to a specific location. + ## Model The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. @@ -28,3 +38,5 @@ Get the Official MLPerf GPT-J Model ``` mlcr get,ml-model,gptj,_pytorch -j ``` + +- `--outdirname=` could be provided to download the model to a specific location. \ No newline at end of file diff --git a/docs/benchmarks/language/get-llama2-70b-data.md b/docs/benchmarks/language/get-llama2-70b-data.md index 2a31370574..ce7cd996eb 100644 --- a/docs/benchmarks/language/get-llama2-70b-data.md +++ b/docs/benchmarks/language/get-llama2-70b-data.md @@ -9,27 +9,63 @@ hide: The benchmark implementation run command will automatically download the validation and calibration datasets and do the necessary preprocessing. In case you want to download only the datasets, you can use the below commands. -=== "Validation" - LLAMA2-70b validation run uses the Open ORCA dataset. +=== "Preprocessed Dataset" - ### Get Validation Dataset - ``` - mlcr get,dataset,openorca,validation -j - ``` + === "Validation" + LLAMA2-70b validation run uses the Open ORCA dataset. + + ### Get Preprocessed Validation Dataset + ``` + mlcr get,dataset,preprocessed,openorca,_validation -j + ``` + + === "Calibration" + + ### Get Preprocessed Calibration dataset + ``` + mlcr get,dataset,preprocessed,openorca,_calibration -j + ``` + +=== "Unprocessed Dataset" + + === "Validation" + LLAMA2-70b validation run uses the Open ORCA dataset. + + ### Get Unprocessed Validation Dataset + ``` + mlcr get,dataset,openorca,_validation -j + ``` + + === "Calibration" + + ### Get Unprocessed Validation Dataset + ``` + mlcr get,dataset,openorca,_validation -j + ``` + +- `--outdirname=` could be provided to download the dataset to a specific location. ## Model The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. -Get the Official MLPerf LLAMA2-70b Model - === "Pytorch" - ### Pytorch - ``` - mlcr get,ml-model,llama2-70b,_pytorch -j --outdirname= - ``` - -!!! tip + === "From MLCOMMONS Google Drive" + + > **Note:** One has to accept the [MLCommons Llama 2 License Confidentiality Notice](https://llama2.mlcommons.org/) to access the model files in MLCOMMONS Google Drive. + + ### Get the Official MLPerf LLAMA2-70B model from MLCOMMONS Google Drive + ``` + mlcr get,ml-model,llama2-70b,_pytorch -j + ``` + + === "From Hugging Face repo" + + > **Note:** Access to the HuggingFace model could be requested [here](https://ai.meta.com/resources/models-and-libraries/llama-downloads/). - [Access Request Link](https://llama2.mlcommons.org/) for MLCommons members + ### Get model from HuggingFace repo + ``` + mlcr get,ml-model,llama2-70b,_hf --hf_token= -j + ``` +- `--outdirname=` could be provided to download the model to a specific location. 
\ No newline at end of file diff --git a/docs/benchmarks/language/get-llama3_1-405b-data.md b/docs/benchmarks/language/get-llama3_1-405b-data.md index 62b7bd088a..ad05ca8610 100644 --- a/docs/benchmarks/language/get-llama3_1-405b-data.md +++ b/docs/benchmarks/language/get-llama3_1-405b-data.md @@ -23,18 +23,29 @@ The benchmark implementation run command will automatically download the validat mlcr get,dataset,mlperf,inference,llama3,_calibration --outdirname= -j ``` +- `--outdirname=` could be provided to download the dataset to a specific location. + ## Model The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. -Get the Official MLPerf LLAMA3.1-405b Model - === "Pytorch" - ### Pytorch - ``` - mlcr get,ml-model,llama3 --outdirname= -j - ``` - -!!! tip + === "From MLCOMMONS Google Drive" + + > **Note:** One has to accept the [MLCommons Llama 3.1 License Confidentiality Notice](http://llama3-1.mlcommons.org/) to access the model files in MLCOMMONS Google Drive. + + ### Get the Official MLPerf LLAMA3.1-405B model from MLCOMMONS Google Drive + ``` + mlcr get,ml-model,llama3 -j + ``` + + === "From Hugging Face repo" + + > **Note:** Access to the HuggingFace model could be requested [here](https://ai.meta.com/resources/models-and-libraries/llama-downloads/). + + ### Get model from HuggingFace repo + ``` + mlcr get,ml-model,llama3,_hf --hf_token= -j + ``` - [Access Request Link](https://llama3-1.mlcommons.org/) for MLCommons members +- `--outdirname=` could be provided to download the model to a specific location. \ No newline at end of file diff --git a/docs/benchmarks/language/get-mixtral-8x7b-data.md b/docs/benchmarks/language/get-mixtral-8x7b-data.md index 81b90cdb57..cf5225843d 100644 --- a/docs/benchmarks/language/get-mixtral-8x7b-data.md +++ b/docs/benchmarks/language/get-mixtral-8x7b-data.md @@ -15,6 +15,15 @@ The benchmark implementation run command will automatically download the preproc mlcr get,dataset-mixtral,openorca-mbxp-gsm8k-combined -j ``` +=== "Calibration" + + ### Get Calibration Dataset + ``` + mlcr get,dataset-mixtral,openorca-mbxp-gsm8k-combined,_calibration -j + ``` + +- `--outdirname=` could be provided to download the dataset to a specific location. + ## Model The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. @@ -25,4 +34,6 @@ Get the Official MLPerf MIXTRAL-8x7b Model ### Pytorch ``` mlcr get,ml-model,mixtral -j - ``` \ No newline at end of file + ``` + +- `--outdirname=` could be provided to download the model to a specific location. \ No newline at end of file diff --git a/docs/benchmarks/medical_imaging/get-3d-unet-data.md b/docs/benchmarks/medical_imaging/get-3d-unet-data.md index d68b769209..9c77fdeaa2 100644 --- a/docs/benchmarks/medical_imaging/get-3d-unet-data.md +++ b/docs/benchmarks/medical_imaging/get-3d-unet-data.md @@ -9,19 +9,34 @@ hide: The benchmark implementation run command will automatically download the validation and calibration datasets and do the necessary preprocessing. In case you want to download only the datasets, you can use the below commands. 
-=== "Validation" - 3d-unet validation run uses the KiTS19 dataset performing [KiTS 2019](https://kits19.grand-challenge.org/) kidney tumor segmentation task - ### Get Validation Dataset(Original) - ``` - mlcr get,dataset,kits19,_validation -j - ``` +=== "Unprocessed Dataset" + + === "Validation" + 3d-unet validation run uses the KiTS19 dataset performing [KiTS 2019](https://kits19.grand-challenge.org/) kidney tumor segmentation task + + ### Get Validation Dataset + ``` + mlcr get,dataset,kits19,_validation -j + ``` + + === "Calibration" - ### Get Validation Dataset(Preprocessed) + ### Get Calibration Dataset + ``` + mlcr get,dataset,kits19,_calibration -j + ``` + +=== "Preprocessed Dataset" + + ### Get Preprocessed Validation Dataset ``` mlcr get,dataset,kits19,preprocessed -j ``` +- `--outdirname=` could be provided to download the dataset to a specific location. + + ## Model The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. @@ -46,3 +61,4 @@ Get the Official MLPerf 3d-unet Model mlcr get,ml-model,3d-unet,_tensorflow -j ``` +- `--outdirname=` could be provided to download the model to a specific location. diff --git a/docs/benchmarks/object_detection/get-retinanet-data.md b/docs/benchmarks/object_detection/get-retinanet-data.md index 6cd677b4e1..6127eed541 100644 --- a/docs/benchmarks/object_detection/get-retinanet-data.md +++ b/docs/benchmarks/object_detection/get-retinanet-data.md @@ -9,20 +9,34 @@ hide: The benchmark implementation run command will automatically download the validation and calibration datasets and do the necessary preprocessing. In case you want to download only the datasets, you can use the below commands. -=== "Validation" - Retinanet validation run uses the OpenImages v6 MLPerf validation dataset resized to 800x800 and consisting of 24,576 images. +=== "Unprocessed" - ### Get Validation Dataset - ``` - mlcr get,dataset,openimages,_validation -j - ``` -=== "Calibration" - Retinanet calibration dataset consist of 500 images selected from the OpenImages v6 dataset. + === "Validation" + Retinanet validation run uses the OpenImages v6 MLPerf validation dataset resized to 800x800 and consisting of 24,576 images. + + ### Get Validation Dataset + ``` + mlcr get,dataset,openimages,_validation -j + ``` + + === "Calibration" + Retinanet calibration dataset consist of 500 images selected from the OpenImages v6 dataset. + ### Get OpenImages Calibration dataset + ``` + mlcr get,dataset,openimages,_calibration -j + ``` + +=== "Preprocessed" + + ### Get Preprocessed OpenImages dataset ``` - mlcr get,dataset,openimages,_calibration -j + get,dataset,object-detection,open-images,openimages,preprocessed,_validation -j ``` +- `--outdirname=` could be provided to download the dataset to a specific location. + + ## Model The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. @@ -41,3 +55,4 @@ Get the Official MLPerf Retinanet Model mlcr get,ml-model,retinanet,_onnx -j ``` +- `--outdirname=` could be provided to download the model to a specific location. 
diff --git a/docs/benchmarks/recommendation/get-dlrm-v2-data.md b/docs/benchmarks/recommendation/get-dlrm-v2-data.md index bb35660b68..8505b31bf4 100644 --- a/docs/benchmarks/recommendation/get-dlrm-v2-data.md +++ b/docs/benchmarks/recommendation/get-dlrm-v2-data.md @@ -16,6 +16,9 @@ The benchmark implementation run command will automatically download the validat ``` mlcr get,dataset,criteo,_validation -j ``` + +- `--outdirname=` could be provided to download the dataset to a specific location. + ## Model The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. @@ -25,6 +28,8 @@ Get the Official MLPerf DLRM v2 Model ### Pytorch ``` - mlcr get,ml-model,dlrm,_pytorch -j + mlcr get,ml-model,dlrm,_pytorch,_weight_sharded,_rclone -j ``` + +- `--outdirname=` could be provided to download the model to a specific location. \ No newline at end of file diff --git a/docs/benchmarks/text_to_image/get-sdxl-data.md b/docs/benchmarks/text_to_image/get-sdxl-data.md index 6d79e331d1..7c5363415c 100644 --- a/docs/benchmarks/text_to_image/get-sdxl-data.md +++ b/docs/benchmarks/text_to_image/get-sdxl-data.md @@ -17,15 +17,30 @@ The benchmark implementation run command will automatically download the validat mlcr get,dataset,coco2014,_validation -j ``` +=== "Calibration" + + ### Get COCO2014 Calibration Dataset + ``` + mlcr get,dataset,coco2014,_calibration -j + ``` + +- `--outdirname=` could be provided to download the dataset to a specific location. + ## Model The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. Get the Official MLPerf Stable Diffusion Model === "Pytorch" - - ### Pytorch - ``` - mlcr get,ml-model,sdxl,_pytorch -j - ``` - + === "FP 16" + ### Pytorch + ``` + mlcr get,ml-model,sdxl,_pytorch,_fp16 -j + ``` + === "FP 32" + ### Pytorch + ``` + mlcr get,ml-model,sdxl,_pytorch,_fp32 -j + ``` + +- `--outdirname=` could be provided to download the model to a specific location. From d8048376f88d3ad6aabbaddd52018cea5263b117 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Tue, 22 Apr 2025 18:53:40 +0530 Subject: [PATCH 10/35] add powershell command to get result folder structure (#2156) --- docs/submission/index.md | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/docs/submission/index.md b/docs/submission/index.md index 6a6bbfb2f2..079a513854 100644 --- a/docs/submission/index.md +++ b/docs/submission/index.md @@ -56,9 +56,15 @@ Please refer to the [installation page](site:inference/install/) to install MLCF === "MLC automation based results" If you have followed the `mlcr` commands under the individual model pages in the [benchmarks](../index.md) directory, all the valid results will get aggregated to the `mlc cache` folder. The following command could be used to browse the structure of inference results folder generated by MLCFlow. 
### Get results folder structure - ```bash - mlc find cache --tags=get,mlperf,inference,results,dir | xargs tree - ``` + + === "Unix Terminal" + ```bash + mlc find cache --tags=get,mlperf,inference,results,dir | xargs tree + ``` + === "Windows PowerShell" + ``` + mlc find cache --tags=get,mlperf,inference,results,dir | ForEach-Object { Get-ChildItem -Recurse $_ } + ``` Once all the results across all the models are ready you can use the following the below section to generate a valid submission tree compliant with the [MLPerf requirements](https://github.com/mlcommons/policies/blob/master/submission_rules.adoc#inference-1). From 5b65f7be79a703fa23034f0d406b2e6f1a6134eb Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 22 Apr 2025 17:00:11 +0000 Subject: [PATCH 11/35] [Automated Commit] Format Codebase --- tools/submission/submission_checker.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index 41116e2620..edda676c9c 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -2094,7 +2094,8 @@ def log_result( if filter_submitter and submitter != filter_submitter: continue results_path = os.path.join(division, submitter, "results") - measurements_path = os.path.join(division, submitter, "measurements") + measurements_path = os.path.join( + division, submitter, "measurements") systems_path = os.path.join(division, submitter, "systems") if not os.path.exists(results_path): continue @@ -2200,7 +2201,8 @@ def log_result( extra_model_mapping = json.load(fp) if not config.skip_all_systems_with_results: - measurement_diff = list(set(list_dir(measurements_path)) - set(list_dir(results_path))) + measurement_diff = list( + set(list_dir(measurements_path)) - set(list_dir(results_path))) systems_diff = list( set( [ @@ -3173,7 +3175,7 @@ def main(): args.extra_model_benchmark_map, ignore_uncommited=args.submission_exceptions, skip_power_check=args.skip_power_check, - skip_all_systems_with_results = args.skip_all_systems_have_results_check + skip_all_systems_with_results=args.skip_all_systems_have_results_check ) if args.scenarios_to_skip: From 70fcbe0c8b4aeadedd7ba40c17b4077530e3019b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 10 Jun 2025 16:59:39 +0000 Subject: [PATCH 12/35] [Automated Commit] Format Codebase --- language/deepseek-r1/backends/__init__.py | 2 +- .../deepseek-r1/backends/pytorch_backend.py | 52 ++--- .../deepseek-r1/backends/sglang_backend.py | 168 +++++++++------ language/deepseek-r1/backends/utils.py | 173 +++++++++------- language/deepseek-r1/backends/vllm_backend.py | 33 +-- language/deepseek-r1/eval_accuracy.py | 74 ++++--- language/deepseek-r1/mlperf/__init__.py | 6 +- language/deepseek-r1/mlperf/base_sut.py | 28 +-- language/deepseek-r1/mlperf/offline_sut.py | 70 ++++--- language/deepseek-r1/mlperf/qsl.py | 25 +-- language/deepseek-r1/mlperf/server_sut.py | 100 +++++---- language/deepseek-r1/mlperf/utils.py | 69 ++++--- language/deepseek-r1/run_eval.py | 165 ++++++++------- language/deepseek-r1/run_eval_mpi.py | 99 +++++---- language/deepseek-r1/run_mlperf.py | 116 ++++++----- language/deepseek-r1/run_mlperf_mpi.py | 195 ++++++++++-------- language/deepseek-r1/utils/__init__.py | 2 +- .../deepseek-r1/utils/backend_registry.py | 11 +- language/deepseek-r1/utils/data_utils.py | 108 +++++----- language/deepseek-r1/utils/error_handling.py | 11 +- language/deepseek-r1/utils/runner_utils.py | 58 +++--- 
language/deepseek-r1/utils/tokenization.py | 87 ++++---- language/deepseek-r1/utils/validation.py | 44 ++-- tools/submission/submission_checker.py | 17 +- 24 files changed, 987 insertions(+), 726 deletions(-) diff --git a/language/deepseek-r1/backends/__init__.py b/language/deepseek-r1/backends/__init__.py index 61ad96a3f2..865ed3bd53 100644 --- a/language/deepseek-r1/backends/__init__.py +++ b/language/deepseek-r1/backends/__init__.py @@ -11,4 +11,4 @@ # to avoid dependency issues when only using certain backends __all__ = [ 'BaseBackend', -] \ No newline at end of file +] diff --git a/language/deepseek-r1/backends/pytorch_backend.py b/language/deepseek-r1/backends/pytorch_backend.py index c1e426185d..0742882bca 100644 --- a/language/deepseek-r1/backends/pytorch_backend.py +++ b/language/deepseek-r1/backends/pytorch_backend.py @@ -1,3 +1,17 @@ +from utils.validation import require_initialized, BackendNotInitializedError +from utils.backend_registry import get_backend_config +from .utils import get_cache_directory +from .base_backend import BaseBackend +from transformers import AutoTokenizer +import torch.distributed as dist +import torch +from pathlib import Path +import asyncio +from typing import Any, Dict, List, Optional +import logging +import json +from ref_dsinfer.inference.model import Transformer, ModelArgs +from safetensors.torch import load_model import os import sys @@ -6,23 +20,6 @@ 'REF_DSINFER_PATH', '/opt/ref_dsinfer/inference') sys.path.append(ref_dsinfer_path) -from safetensors.torch import load_model -from ref_dsinfer.inference.model import Transformer, ModelArgs -import json -import logging -from typing import Any, Dict, List, Optional -import asyncio -from pathlib import Path - -import torch -import torch.distributed as dist -from transformers import AutoTokenizer - -from .base_backend import BaseBackend -from .utils import get_cache_directory -from utils.backend_registry import get_backend_config -from utils.validation import require_initialized, BackendNotInitializedError - logger = logging.getLogger(__name__) @@ -115,8 +112,10 @@ def initialize(self) -> None: with torch.device(self.config['device']): self.model = Transformer(self.model_args) - # Load tokenizer (only rank 0 needs it for MLPerf, but all ranks need it for run_eval_mpi) - self.tokenizer = AutoTokenizer.from_pretrained(str(self.model_path), revision=self.config['model_revision']) + # Load tokenizer (only rank 0 needs it for MLPerf, but all ranks need + # it for run_eval_mpi) + self.tokenizer = AutoTokenizer.from_pretrained( + str(self.model_path), revision=self.config['model_revision']) # Load model weights checkpoint_file = self.model_path / \ @@ -133,7 +132,8 @@ def sample(self, logits: torch.Tensor, temperature: float) -> torch.Tensor: """Sample from logits with temperature.""" logits = logits / max(temperature, 1e-5) probs = torch.softmax(logits, dim=-1) - return probs.div_(torch.empty_like(probs).exponential_(1)).argmax(dim=-1) + return probs.div_(torch.empty_like( + probs).exponential_(1)).argmax(dim=-1) @torch.inference_mode() def _generate_batch( @@ -222,7 +222,8 @@ def _generate_batch( return completion_tokens @require_initialized - def generate(self, tokenized_prompts: List[List[int]], **kwargs) -> List[Dict[str, Any]]: + def generate( + self, tokenized_prompts: List[List[int]], **kwargs) -> List[Dict[str, Any]]: """ Generate responses for a list of pre-tokenized prompts. 
@@ -265,7 +266,8 @@ def generate(self, tokenized_prompts: List[List[int]], **kwargs) -> List[Dict[st return results @require_initialized - def generate_batch_distributed(self, batch_tokens: List[List[int]]) -> List[List[int]]: + def generate_batch_distributed( + self, batch_tokens: List[List[int]]) -> List[List[int]]: """ Generate tokens for a batch in distributed mode. @@ -296,7 +298,8 @@ def generate_batch_distributed(self, batch_tokens: List[List[int]]) -> List[List return [] @require_initialized - def generate_async(self, tokenized_prompts: List[List[int]], **kwargs) -> List[asyncio.Future]: + def generate_async( + self, tokenized_prompts: List[List[int]], **kwargs) -> List[asyncio.Future]: """ Generate responses asynchronously. @@ -331,7 +334,8 @@ async def extract_result(idx): return futures @require_initialized - def generate_batch_distributed_async(self, batch_tokens: List[List[int]]) -> asyncio.Future: + def generate_batch_distributed_async( + self, batch_tokens: List[List[int]]) -> asyncio.Future: """ Generate tokens for a batch in distributed mode asynchronously. diff --git a/language/deepseek-r1/backends/sglang_backend.py b/language/deepseek-r1/backends/sglang_backend.py index 06cf074a96..10be6e1dcd 100644 --- a/language/deepseek-r1/backends/sglang_backend.py +++ b/language/deepseek-r1/backends/sglang_backend.py @@ -66,11 +66,12 @@ def __init__(self, config: Dict[str, Any] = None): # Log monitoring self._log_monitor = None - + # Shared semaphore for async concurrency control self._async_semaphore = None - # Configure logging to suppress httpx INFO logs (only show warnings/errors) + # Configure logging to suppress httpx INFO logs (only show + # warnings/errors) import logging logging.getLogger("httpx").setLevel(logging.WARNING) logging.getLogger("openai").setLevel(logging.WARNING) @@ -128,7 +129,8 @@ def _build_server_command(self) -> List[str]: cmd.append('flashinfer') if self.config['enable_dp_attention']: - cmd.extend(['--enable-dp-attention', '--dp', str(self.config['dp'])]) + cmd.extend(['--enable-dp-attention', + '--dp', str(self.config['dp'])]) # Add performance settings cmd.extend([ @@ -175,7 +177,8 @@ def _wait_for_server_ready(self, timeout: int = None) -> bool: # Update progress indicator every 0.5 seconds if time.time() - last_progress_update >= 0.5: last_progress_update = time.time() - progress_idx = (progress_idx + 1) % len(TerminalDisplay.PROGRESS_CHARS) + progress_idx = ( + progress_idx + 1) % len(TerminalDisplay.PROGRESS_CHARS) minutes = elapsed // 60 seconds = elapsed % 60 # Use carriage return to stay on the same line @@ -192,7 +195,8 @@ def _wait_for_server_ready(self, timeout: int = None) -> bool: if response.status_code == 200: # Health check passed, now try a warmup query print(f"\r{' '*80}\r", end='', flush=True) - print(f"\n[SGLANG] Health check passed, running warmup query...") + print( + f"\n[SGLANG] Health check passed, running warmup query...") # Try to send a simple warmup query using OpenAI client try: @@ -210,7 +214,8 @@ def _wait_for_server_ready(self, timeout: int = None) -> bool: # Send a simple warmup request warmup_response = warmup_client.chat.completions.create( model=self.config['served_model_name'], - messages=[{"role": "user", "content": "Hello"}], + messages=[ + {"role": "user", "content": "Hello"}], temperature=0.0, max_tokens=10, seed=self.config['seed'] @@ -218,23 +223,28 @@ def _wait_for_server_ready(self, timeout: int = None) -> bool: # Check if we got a valid response if warmup_response.choices[0].message.content: - 
print(f"[SGLANG] ✓ Warmup query successful! Response: {warmup_response.choices[0].message.content[:50]}...") + print( + f"[SGLANG] ✓ Warmup query successful! Response: {warmup_response.choices[0].message.content[:50]}...") # Stop log monitoring if self._log_monitor: self._log_monitor.stop() self._log_monitor = None - print(f"\n[SGLANG] " + "="*60) - print(f"[SGLANG] ✓ SERVER READY! (startup took {elapsed}s)") - print(f"[SGLANG] " + "="*60) + print(f"\n[SGLANG] " + "=" * 60) + print( + f"[SGLANG] ✓ SERVER READY! (startup took {elapsed}s)") + print(f"[SGLANG] " + "=" * 60) return True else: - print(f"[SGLANG] Warmup query returned empty response, retrying...") + print( + f"[SGLANG] Warmup query returned empty response, retrying...") except Exception as warmup_error: - print(f"[SGLANG] Warmup query failed: {warmup_error}, retrying...") - # Continue waiting, the server might not be fully ready yet + print( + f"[SGLANG] Warmup query failed: {warmup_error}, retrying...") + # Continue waiting, the server might not be fully + # ready yet except requests.exceptions.RequestException: pass @@ -246,9 +256,11 @@ def _wait_for_server_ready(self, timeout: int = None) -> bool: self._log_monitor = None # Clear progress line print(f"\r{' '*80}\r", end='', flush=True) - print(f"\n[SGLANG] ✗ Server process died with exit code: {self.server_process.returncode}") + print( + f"\n[SGLANG] ✗ Server process died with exit code: {self.server_process.returncode}") if self.server_log_file: - print(f"[SGLANG] Check server logs at: {self.server_log_file}") + print( + f"[SGLANG] Check server logs at: {self.server_log_file}") return False time.sleep(0.1) # Check every 100ms for smoother progress @@ -264,17 +276,21 @@ def _wait_for_server_ready(self, timeout: int = None) -> bool: def _start_server(self) -> None: """Start the SGLang server as a subprocess.""" - print(f"\n[SGLANG] Starting SGLang server for {self.config['model']}...") + print( + f"\n[SGLANG] Starting SGLang server for {self.config['model']}...") print(f"[SGLANG] Configuration:") print(f"[SGLANG] - Port: {self.port}") - print(f"[SGLANG] - Tensor Parallel: {self.config['tensor_parallel_size']}") - print(f"[SGLANG] - Context Length: {self.config['context_length']:,} tokens") + print( + f"[SGLANG] - Tensor Parallel: {self.config['tensor_parallel_size']}") + print( + f"[SGLANG] - Context Length: {self.config['context_length']:,} tokens") print(f"[SGLANG] - dtype: {self.config['dtype']}") # Create log file for server output log_dir = Path("/work/logs") log_dir.mkdir(exist_ok=True) - self.server_log_file = log_dir / f"sglang_server_{self.port}_{int(time.time())}.log" + self.server_log_file = log_dir / \ + f"sglang_server_{self.port}_{int(time.time())}.log" cmd = self._build_server_command() print(f"\n[SGLANG] Command: {' '.join(cmd)}") @@ -315,7 +331,10 @@ def _stop_server(self) -> None: except subprocess.TimeoutExpired: # Force kill if not stopped print("[SGLANG] Server didn't stop gracefully, forcing...") - os.killpg(os.getpgid(self.server_process.pid), signal.SIGKILL) + os.killpg( + os.getpgid( + self.server_process.pid), + signal.SIGKILL) self.server_process.wait() print("[SGLANG] Server force stopped") except ProcessLookupError: @@ -332,7 +351,8 @@ def initialize(self) -> None: try: # Load tokenizer for string conversion print(f"[SGLANG] Loading tokenizer: {self.config['tokenizer']}...") - self.tokenizer = AutoTokenizer.from_pretrained(self.config['tokenizer'], revision=self.config['model_revision']) + self.tokenizer = AutoTokenizer.from_pretrained( + 
self.config['tokenizer'], revision=self.config['model_revision']) # Start SGLang server (with log monitoring) self._start_server() @@ -341,7 +361,8 @@ def initialize(self) -> None: base_url = f"http://localhost:{self.port}/v1" api_key = self.config['api_key'] or "dummy-key" - print(f"[SGLANG] Creating OpenAI clients with base URL: {base_url}") + print( + f"[SGLANG] Creating OpenAI clients with base URL: {base_url}") # Configure timeout settings timeout_config = httpx.Timeout( @@ -371,10 +392,12 @@ def initialize(self) -> None: ) print(f"[SGLANG] Created asynchronous OpenAI client") - + # Create shared semaphore for async concurrency control - self._async_semaphore = asyncio.Semaphore(self.config['max_running_requests']) - print(f"[SGLANG] Created async semaphore with limit: {self.config['max_running_requests']}") + self._async_semaphore = asyncio.Semaphore( + self.config['max_running_requests']) + print( + f"[SGLANG] Created async semaphore with limit: {self.config['max_running_requests']}") # Server readiness was already verified by health endpoint in _wait_for_server_ready() # No need to check models endpoint @@ -403,17 +426,18 @@ def initialize(self) -> None: raise @require_initialized - def generate(self, - tokenized_prompts: Optional[List[List[int]]] = None, - text_prompts: Optional[List[str]] = None, - **kwargs) -> List[Dict[str, Any]]: + def generate(self, + tokenized_prompts: Optional[List[List[int]]] = None, + text_prompts: Optional[List[str]] = None, + **kwargs) -> List[Dict[str, Any]]: """Generate responses synchronously.""" # Check if server process is still alive self._check_server_alive() # Check if client is properly initialized if self.client is None: - raise RuntimeError("SGLang client is not initialized. Server may have failed to start.") + raise RuntimeError( + "SGLang client is not initialized. 
Server may have failed to start.") # Validate prompts using centralized validation validate_prompts_input( @@ -436,7 +460,8 @@ def generate(self, results = [] # Process prompts with progress bar - for prompt in tqdm(prompt_strings, desc="SGLang sync inference", unit="prompt"): + for prompt in tqdm( + prompt_strings, desc="SGLang sync inference", unit="prompt"): try: completion = self.client.chat.completions.create( model=self.config['served_model_name'], @@ -452,7 +477,8 @@ def generate(self, # Validate response is not empty if not generated_text: - raise RuntimeError(f"Empty response received from SGLang server for prompt: {prompt[:100]}...") + raise RuntimeError( + f"Empty response received from SGLang server for prompt: {prompt[:100]}...") # Tokenize the output to get token IDs tokens = self.tokenizer.encode(generated_text) @@ -464,15 +490,18 @@ def generate(self, except Exception as e: print(f"\nError generating completion: {e}") - raise RuntimeError(f"SGLang backend failed to generate tokens for prompt: {prompt[:100]}...") + raise RuntimeError( + f"SGLang backend failed to generate tokens for prompt: {prompt[:100]}...") return results - async def _async_generate_single(self, prompt: str, idx: int, semaphore: asyncio.Semaphore) -> Tuple[int, Dict[str, Any]]: + async def _async_generate_single( + self, prompt: str, idx: int, semaphore: asyncio.Semaphore) -> Tuple[int, Dict[str, Any]]: """Generate a single response asynchronously with semaphore control.""" # Check if async client is properly initialized if self.async_client is None: - raise RuntimeError(f"SGLang async client is not initialized for prompt {idx}") + raise RuntimeError( + f"SGLang async client is not initialized for prompt {idx}") async with semaphore: try: @@ -490,7 +519,8 @@ async def _async_generate_single(self, prompt: str, idx: int, semaphore: asyncio # Validate response is not empty if not generated_text: - raise RuntimeError(f"Empty response received from SGLang server for prompt: {prompt[:100]}...") + raise RuntimeError( + f"Empty response received from SGLang server for prompt: {prompt[:100]}...") # Tokenize the output to get token IDs tokens = self.tokenizer.encode(generated_text) @@ -499,20 +529,22 @@ async def _async_generate_single(self, prompt: str, idx: int, semaphore: asyncio except Exception as e: print(f"\nError generating completion for prompt {idx}: {e}") - raise RuntimeError(f"SGLang backend failed to generate tokens for prompt {idx}: {e}") + raise RuntimeError( + f"SGLang backend failed to generate tokens for prompt {idx}: {e}") @require_initialized - def generate_async(self, - tokenized_prompts: Optional[List[List[int]]] = None, - text_prompts: Optional[List[str]] = None, - **kwargs) -> List[asyncio.Future]: + def generate_async(self, + tokenized_prompts: Optional[List[List[int]]] = None, + text_prompts: Optional[List[str]] = None, + **kwargs) -> List[asyncio.Future]: """Generate responses asynchronously using shared semaphore.""" # Check if server process is still alive self._check_server_alive() # Check if client is properly initialized if self.async_client is None: - raise RuntimeError("SGLang async client is not initialized. Server may have failed to start.") + raise RuntimeError( + "SGLang async client is not initialized. 
Server may have failed to start.") # Validate prompts using centralized validation validate_prompts_input( @@ -542,44 +574,49 @@ def generate_async(self, futures = [] for idx, prompt in enumerate(prompt_strings): # Create a task for each prompt using the shared semaphore - task = asyncio.create_task(self._async_generate_single(prompt, idx, self._async_semaphore)) - + task = asyncio.create_task( + self._async_generate_single( + prompt, idx, self._async_semaphore)) + # Create a future that will hold the result future = asyncio.Future() - + # Setup callback to extract just the result (not the index) def make_callback(future_obj, expected_idx): def callback(task_obj): try: idx, result = task_obj.result() if idx != expected_idx: - future_obj.set_exception(Exception(f"Index mismatch: expected {expected_idx}, got {idx}")) + future_obj.set_exception( + Exception(f"Index mismatch: expected {expected_idx}, got {idx}")) else: future_obj.set_result(result) except Exception as e: future_obj.set_exception(e) return callback - + task.add_done_callback(make_callback(future, idx)) futures.append(future) return futures - async def generate_stream(self, - tokenized_prompts: Optional[List[List[int]]] = None, - text_prompts: Optional[List[str]] = None, - **kwargs) -> List[AsyncIterator[StreamingChunk]]: + async def generate_stream(self, + tokenized_prompts: Optional[List[List[int]]] = None, + text_prompts: Optional[List[str]] = None, + **kwargs) -> List[AsyncIterator[StreamingChunk]]: """Generate responses for a list of prompts with streaming.""" if not self.is_initialized: - raise RuntimeError("Backend not initialized. Call initialize() first.") - + raise RuntimeError( + "Backend not initialized. Call initialize() first.") + # Check if server process is still alive self._check_server_alive() - + # Check if async client is properly initialized if self.async_client is None: - raise RuntimeError("SGLang async client is not initialized. Server may have failed to start.") - + raise RuntimeError( + "SGLang async client is not initialized. 
Server may have failed to start.") + # Validate prompts validate_prompts_input( backend_name='sglang', @@ -587,7 +624,7 @@ async def generate_stream(self, text_prompts=text_prompts, input_type='text' ) - + # SGLang prefers text prompts if text_prompts is None: # Convert tokenized prompts to strings @@ -597,8 +634,9 @@ async def generate_stream(self, ] else: prompt_strings = text_prompts - - async def stream_single_prompt(prompt: str) -> AsyncIterator[StreamingChunk]: + + async def stream_single_prompt( + prompt: str) -> AsyncIterator[StreamingChunk]: try: stream = await self.async_client.chat.completions.create( model=self.config['served_model_name'], @@ -609,14 +647,14 @@ async def stream_single_prompt(prompt: str) -> AsyncIterator[StreamingChunk]: seed=self.config.get('seed'), stream=True ) - + async for chunk in stream: if not chunk.choices: continue - + delta = chunk.choices[0].delta finish_reason = chunk.choices[0].finish_reason - + if delta.content: yield StreamingChunk( token=delta.content, @@ -635,7 +673,7 @@ async def stream_single_prompt(prompt: str) -> AsyncIterator[StreamingChunk]: except Exception as e: print(f"[SGLANG] Streaming error for prompt: {e}") raise - + return [stream_single_prompt(prompt) for prompt in prompt_strings] def shutdown(self) -> None: @@ -650,7 +688,7 @@ def shutdown(self) -> None: # Close clients self.client = None self.async_client = None - + # Clear async semaphore self._async_semaphore = None @@ -665,4 +703,4 @@ def shutdown(self) -> None: torch.cuda.empty_cache() self.is_initialized = False - print("[SGLANG] Backend shutdown complete") \ No newline at end of file + print("[SGLANG] Backend shutdown complete") diff --git a/language/deepseek-r1/backends/utils.py b/language/deepseek-r1/backends/utils.py index 0e4c7732da..ebd6ce3719 100644 --- a/language/deepseek-r1/backends/utils.py +++ b/language/deepseek-r1/backends/utils.py @@ -19,50 +19,50 @@ def get_cache_directory() -> Path: """ Get the cache directory at /raid/data/$USER/.cache - + Returns: Path: The cache directory path """ # Get the current user user = os.environ.get('USER', os.environ.get('USERNAME', 'unknown')) - + # Use /raid/data/$USER/.cache cache_dir = Path(f'/raid/data/{user}/.cache') - + # Create the cache directory if it doesn't exist cache_dir.mkdir(parents=True, exist_ok=True) - + return cache_dir def setup_huggingface_cache() -> Path: """ Set up HuggingFace cache environment variables using the preferred cache directory. - + Returns: Path: The cache directory being used """ cache_dir = get_cache_directory() - + # Set HuggingFace cache environment variables os.environ['HF_HOME'] = str(cache_dir) os.environ['HF_HUB_CACHE'] = str(cache_dir) os.environ['HUGGINGFACE_HUB_CACHE'] = str(cache_dir) - + return cache_dir def find_free_port(start_port: int = 30000, max_attempts: int = 100) -> int: """ Find a free port starting from start_port. 
- + Args: start_port: The port number to start searching from max_attempts: Maximum number of ports to try - + Returns: int: A free port number - + Raises: RuntimeError: If no free port is found after max_attempts """ @@ -75,13 +75,14 @@ def find_free_port(start_port: int = 30000, max_attempts: int = 100) -> int: return port except OSError: continue - raise RuntimeError(f"Could not find free port after {max_attempts} attempts starting from {start_port}") + raise RuntimeError( + f"Could not find free port after {max_attempts} attempts starting from {start_port}") def set_all_seeds(seed: int = 42) -> None: """ Set seeds for all random number generators for reproducibility. - + Args: seed: The seed value to use """ @@ -96,73 +97,76 @@ def set_all_seeds(seed: int = 42) -> None: set_seed(seed) -def validate_prompts(tokenized_prompts: Optional[list] = None, - text_prompts: Optional[list] = None, - backend_type: str = "") -> None: +def validate_prompts(tokenized_prompts: Optional[list] = None, + text_prompts: Optional[list] = None, + backend_type: str = "") -> None: """ Validate that at least one type of prompts is provided. - + Args: tokenized_prompts: List of tokenized prompts text_prompts: List of text prompts backend_type: Name of the backend for error messages - + Raises: ValueError: If neither prompt type is provided """ if tokenized_prompts is None and text_prompts is None: - raise ValueError(f"{backend_type + ' backend' if backend_type else 'Backend'} requires either text_prompts or tokenized_prompts") + raise ValueError( + f"{backend_type + ' backend' if backend_type else 'Backend'} requires either text_prompts or tokenized_prompts") # Terminal display utilities class TerminalDisplay: """ANSI escape codes and utilities for terminal display formatting.""" - + # ANSI escape codes for cursor control CLEAR_SCREEN = "\033[2J" MOVE_CURSOR_UP = "\033[{}A" CLEAR_LINE = "\033[K" SAVE_CURSOR = "\033[s" RESTORE_CURSOR = "\033[u" - + # Progress spinner characters PROGRESS_CHARS = ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏'] - + @staticmethod def clear_lines(num_lines: int) -> None: """Clear the specified number of lines above the cursor.""" - print(TerminalDisplay.MOVE_CURSOR_UP.format(num_lines), end='', flush=True) + print(TerminalDisplay.MOVE_CURSOR_UP.format( + num_lines), end='', flush=True) for _ in range(num_lines): print(TerminalDisplay.CLEAR_LINE) - print(TerminalDisplay.MOVE_CURSOR_UP.format(num_lines), end='', flush=True) - + print(TerminalDisplay.MOVE_CURSOR_UP.format( + num_lines), end='', flush=True) + @staticmethod def save_cursor_position() -> None: """Save the current cursor position.""" print(TerminalDisplay.SAVE_CURSOR, end='', flush=True) - + @staticmethod def restore_cursor_position() -> None: """Restore the previously saved cursor position.""" print(TerminalDisplay.RESTORE_CURSOR, end='', flush=True) - + @staticmethod def clear_current_line() -> None: """Clear the current line.""" print("\r" + " " * 80 + "\r", end='', flush=True) - + @staticmethod def truncate_line(line: str, max_length: int = 110) -> str: """Truncate a line to fit within the specified length.""" if len(line) <= max_length: return line - return line[:max_length - 3] + "..." + return line[:max_length - 3] + "..." 
class LogMonitor: """Real-time log file monitor with terminal display.""" - - def __init__(self, + + def __init__(self, log_file_path: Union[str, Path], prefix: str = "LOG", max_lines: int = 5, @@ -170,7 +174,7 @@ def __init__(self, header_text: Optional[str] = None): """ Initialize the log monitor. - + Args: log_file_path: Path to the log file to monitor prefix: Prefix for display lines (e.g., "[SGLANG]") @@ -183,42 +187,43 @@ def __init__(self, self.max_lines = max_lines self.display_interval = display_interval self.header_text = header_text or f"Server startup logs (last {max_lines} lines):" - + # Threading control self._monitor_thread = None self._stop_event = None self._ready_event = None - + # Display dimensions self.total_lines = max_lines + 3 # 2 header lines + 1 blank separator - - def start(self, wait_for_file: bool = True, file_wait_timeout: float = 30.0) -> bool: + + def start(self, wait_for_file: bool = True, + file_wait_timeout: float = 30.0) -> bool: """ Start the log monitor in a background thread. - + Args: wait_for_file: Whether to wait for the log file to exist file_wait_timeout: How long to wait for the file (seconds) - + Returns: bool: True if monitor started successfully """ if self._monitor_thread is not None: return True # Already running - + self._stop_event = threading.Event() self._ready_event = threading.Event() - + self._monitor_thread = threading.Thread( target=self._monitor_loop, args=(wait_for_file, file_wait_timeout), daemon=True ) self._monitor_thread.start() - + # Wait for the monitor to set up its display area return self._ready_event.wait(timeout=2.0) - + def stop(self) -> None: """Stop the log monitor and clean up display.""" if self._stop_event and self._monitor_thread: @@ -227,36 +232,39 @@ def stop(self) -> None: self._monitor_thread = None self._stop_event = None self._ready_event = None - - def _monitor_loop(self, wait_for_file: bool, file_wait_timeout: float) -> None: + + def _monitor_loop(self, wait_for_file: bool, + file_wait_timeout: float) -> None: """Main monitoring loop that runs in a separate thread.""" # Wait for log file if requested if wait_for_file: start_time = time.time() while not self.log_file_path.exists(): if time.time() - start_time > file_wait_timeout: - print(f"[{self.prefix}] Warning: Log file not found after {file_wait_timeout}s: {self.log_file_path}") + print( + f"[{self.prefix}] Warning: Log file not found after {file_wait_timeout}s: {self.log_file_path}") self._ready_event.set() return time.sleep(0.5) elif not self.log_file_path.exists(): - print(f"[{self.prefix}] Warning: Log file not found: {self.log_file_path}") + print( + f"[{self.prefix}] Warning: Log file not found: {self.log_file_path}") self._ready_event.set() return - + print(f"\n[{self.prefix}] Monitoring logs: {self.log_file_path.name}") - print(f"[{self.prefix}] " + "="*60) - + print(f"[{self.prefix}] " + "=" * 60) + # Initialize display area self._setup_display_area() - + # Signal that we're ready self._ready_event.set() - + # Buffer for log lines line_buffer = [] last_display_time = 0 - + try: # Use tail -f to follow the log file process = subprocess.Popen( @@ -267,11 +275,11 @@ def _monitor_loop(self, wait_for_file: bool, file_wait_timeout: float) -> None: bufsize=1, universal_newlines=True ) - + while not self._stop_event.is_set(): if process.poll() is not None: break - + # Read available lines without blocking line_added = False try: @@ -285,7 +293,7 @@ def _monitor_loop(self, wait_for_file: bool, file_wait_timeout: float) -> None: line_added = True else: 
break - except: + except BaseException: # Fallback for systems without select line = process.stdout.readline() if line: @@ -293,65 +301,69 @@ def _monitor_loop(self, wait_for_file: bool, file_wait_timeout: float) -> None: if len(line_buffer) > self.max_lines: line_buffer.pop(0) line_added = True - + # Update display if needed current_time = time.time() - if line_added or (current_time - last_display_time >= self.display_interval): + if line_added or ( + current_time - last_display_time >= self.display_interval): last_display_time = current_time self._update_display(line_buffer) - + time.sleep(0.1) - + # Clean up process.terminate() try: process.wait(timeout=2) except subprocess.TimeoutExpired: process.kill() - + except Exception as e: print(f"\n[{self.prefix}] Log monitor error: {e}") finally: self._cleanup_display() - + def _setup_display_area(self) -> None: """Reserve and initialize the display area.""" # Reserve space for _ in range(self.total_lines): print() - + # Move back up to start of reserved area - print(TerminalDisplay.MOVE_CURSOR_UP.format(self.total_lines), end='', flush=True) - + print(TerminalDisplay.MOVE_CURSOR_UP.format( + self.total_lines), end='', flush=True) + # Print initial display print(f"\r[{self.prefix}] {self.header_text}", end='') print(TerminalDisplay.CLEAR_LINE, flush=True) - print(f"\r[{self.prefix}] " + "-"*60, end='') + print(f"\r[{self.prefix}] " + "-" * 60, end='') print(TerminalDisplay.CLEAR_LINE, flush=True) - + # Print empty lines for _ in range(self.max_lines): print(f"\r[{self.prefix}] ", end='') print(TerminalDisplay.CLEAR_LINE, flush=True) - + # Print separator print(f"\r", end='') print(TerminalDisplay.CLEAR_LINE, flush=True) - + def _update_display(self, line_buffer: list) -> None: """Update the display with current log lines.""" # Save cursor position print(TerminalDisplay.SAVE_CURSOR, end='', flush=True) - - # Move to start of reserved area (cursor is on progress line, 1 below our area) - print(TerminalDisplay.MOVE_CURSOR_UP.format(self.total_lines + 1), end='', flush=True) - + + # Move to start of reserved area (cursor is on progress line, 1 below + # our area) + print(TerminalDisplay.MOVE_CURSOR_UP.format( + self.total_lines + 1), end='', flush=True) + # Print header print(f"\r[{self.prefix}] {self.header_text}", end='') print(TerminalDisplay.CLEAR_LINE, flush=True) - print(f"\r[{self.prefix}] " + "-"*60, end='') + print(f"\r[{self.prefix}] " + "-" * 60, end='') print(TerminalDisplay.CLEAR_LINE, flush=True) - + # Print log lines for i in range(self.max_lines): if i < len(line_buffer): @@ -360,22 +372,23 @@ def _update_display(self, line_buffer: list) -> None: else: print(f"\r[{self.prefix}] ", end='') print(TerminalDisplay.CLEAR_LINE, flush=True) - + # Print separator print(f"\r", end='') print(TerminalDisplay.CLEAR_LINE, flush=True) - + # Restore cursor position print(TerminalDisplay.RESTORE_CURSOR, end='', flush=True) - + def _cleanup_display(self) -> None: """Clean up the display area on exit.""" print(TerminalDisplay.SAVE_CURSOR, end='', flush=True) - print(TerminalDisplay.MOVE_CURSOR_UP.format(self.total_lines + 1), end='', flush=True) - + print(TerminalDisplay.MOVE_CURSOR_UP.format( + self.total_lines + 1), end='', flush=True) + # Clear all reserved lines for _ in range(self.total_lines): print(f"\r", end='') print(TerminalDisplay.CLEAR_LINE, flush=True) - - print(TerminalDisplay.RESTORE_CURSOR, end='', flush=True) \ No newline at end of file + + print(TerminalDisplay.RESTORE_CURSOR, end='', flush=True) diff --git 
a/language/deepseek-r1/backends/vllm_backend.py b/language/deepseek-r1/backends/vllm_backend.py index 4ac408042f..ec49227f41 100644 --- a/language/deepseek-r1/backends/vllm_backend.py +++ b/language/deepseek-r1/backends/vllm_backend.py @@ -148,9 +148,9 @@ def initialize(self) -> None: @require_initialized def generate(self, - tokenized_prompts: Optional[List[List[int]]] = None, - text_prompts: Optional[List[str]] = None, - **kwargs) -> List[Dict[str, Any]]: + tokenized_prompts: Optional[List[List[int]]] = None, + text_prompts: Optional[List[str]] = None, + **kwargs) -> List[Dict[str, Any]]: """Generate responses synchronously using LLM.generate(). Note: vLLM backend only accepts text_prompts parameter. @@ -177,11 +177,14 @@ def generate(self, if not completion.text: # Get the corresponding prompt for context prompt_idx = outputs.index(output) - prompt_preview = text_prompts[prompt_idx][:100] if len(text_prompts[prompt_idx]) > 100 else text_prompts[prompt_idx] - raise RuntimeError(f"Empty response received from vLLM for prompt: {prompt_preview}...") + prompt_preview = text_prompts[prompt_idx][:100] if len( + text_prompts[prompt_idx]) > 100 else text_prompts[prompt_idx] + raise RuntimeError( + f"Empty response received from vLLM for prompt: {prompt_preview}...") results.append({ - 'tokens': list(completion.token_ids), # Convert tuple to list for .copy() compatibility + # Convert tuple to list for .copy() compatibility + 'tokens': list(completion.token_ids), 'text': completion.text, 'finish_reason': completion.finish_reason }) @@ -190,9 +193,9 @@ def generate(self, @require_initialized def generate_async(self, - tokenized_prompts: Optional[List[List[int]]] = None, - text_prompts: Optional[List[str]] = None, - **kwargs) -> List[asyncio.Future]: + tokenized_prompts: Optional[List[List[int]]] = None, + text_prompts: Optional[List[str]] = None, + **kwargs) -> List[asyncio.Future]: """Generate responses asynchronously, returning futures immediately. Note: vLLM backend only accepts text_prompts parameter. 
@@ -245,11 +248,14 @@ def _generate_batch(self, text_prompts: List[str]) -> List[Dict[str, Any]]: if not completion.text: # Get the corresponding prompt for context prompt_idx = outputs.index(output) - prompt_preview = text_prompts[prompt_idx][:100] if len(text_prompts[prompt_idx]) > 100 else text_prompts[prompt_idx] - raise RuntimeError(f"Empty response received from vLLM for prompt: {prompt_preview}...") + prompt_preview = text_prompts[prompt_idx][:100] if len( + text_prompts[prompt_idx]) > 100 else text_prompts[prompt_idx] + raise RuntimeError( + f"Empty response received from vLLM for prompt: {prompt_preview}...") results.append({ - 'tokens': list(completion.token_ids), # Convert tuple to list for .copy() compatibility + # Convert tuple to list for .copy() compatibility + 'tokens': list(completion.token_ids), 'text': completion.text, 'finish_reason': completion.finish_reason }) @@ -265,7 +271,8 @@ def shutdown(self) -> None: # Access internal executor to ensure proper cleanup if self.llm.llm_engine is not None: try: - # This helps cleanup vLLM's internal Ray/multiprocessing resources + # This helps cleanup vLLM's internal Ray/multiprocessing + # resources del self.llm.llm_engine.model_executor except Exception as e: print(f"Warning: Failed to cleanup model executor: {e}") diff --git a/language/deepseek-r1/eval_accuracy.py b/language/deepseek-r1/eval_accuracy.py index 55647b7ca5..a0b546b600 100644 --- a/language/deepseek-r1/eval_accuracy.py +++ b/language/deepseek-r1/eval_accuracy.py @@ -45,11 +45,11 @@ # ============================================================================= def process_mlperf_log_accuracy(mlperf_log_file: Union[str, Path], - dataset_file: Union[str, Path], - checkpoint_path: str, - dtype: str = "int32", - output_dir: Optional[Union[str, Path]] = None, - base_filename: Optional[str] = None) -> Tuple[pd.DataFrame, str]: + dataset_file: Union[str, Path], + checkpoint_path: str, + dtype: str = "int32", + output_dir: Optional[Union[str, Path]] = None, + base_filename: Optional[str] = None) -> Tuple[pd.DataFrame, str]: """Process MLPerf log accuracy file and evaluate results. 
Args: @@ -68,7 +68,8 @@ def process_mlperf_log_accuracy(mlperf_log_file: Union[str, Path], dataset_file = Path(dataset_file) if not mlperf_log_file.exists(): - raise FileNotFoundError(f"MLPerf log file not found: {mlperf_log_file}") + raise FileNotFoundError( + f"MLPerf log file not found: {mlperf_log_file}") if not dataset_file.exists(): raise FileNotFoundError(f"Dataset file not found: {dataset_file}") @@ -86,7 +87,8 @@ def process_mlperf_log_accuracy(mlperf_log_file: Union[str, Path], ) logger.info("Tokenizer loaded successfully") except Exception as e: - raise RuntimeError(f"Failed to load tokenizer from {checkpoint_path}: {e}") + raise RuntimeError( + f"Failed to load tokenizer from {checkpoint_path}: {e}") # Load ground truth dataset try: @@ -99,14 +101,20 @@ def process_mlperf_log_accuracy(mlperf_log_file: Union[str, Path], elif 'ground_truth' in dataset_df.columns: ground_truths = dataset_df['ground_truth'].tolist() else: - raise ValueError("Dataset must contain 'gt_output' or 'ground_truth' column") + raise ValueError( + "Dataset must contain 'gt_output' or 'ground_truth' column") # Get other required columns with fallbacks if 'dataset' in dataset_df.columns: datasets = dataset_df['dataset'].tolist() elif 'metric' in dataset_df.columns: # Infer dataset from metric names - datasets = [metric.replace('_em', '').replace('_', '') for metric in dataset_df['metric'].tolist()] + datasets = [ + metric.replace( + '_em', + '').replace( + '_', + '') for metric in dataset_df['metric'].tolist()] else: datasets = ['unknown'] * len(ground_truths) @@ -138,7 +146,7 @@ def process_mlperf_log_accuracy(mlperf_log_file: Union[str, Path], # First, check if this is a JSON array format or newline-delimited JSON with open(mlperf_log_file, 'r') as f: first_line = f.readline().strip() - + if first_line == '[': # JSON array format - load the entire file logger.info("Detected JSON array format") @@ -146,8 +154,10 @@ def process_mlperf_log_accuracy(mlperf_log_file: Union[str, Path], try: mlperf_results = json.load(f) except json.JSONDecodeError as e: - # If full file parsing fails, try to parse line by line, skipping brackets - logger.warning(f"Failed to parse as complete JSON array: {e}") + # If full file parsing fails, try to parse line by line, + # skipping brackets + logger.warning( + f"Failed to parse as complete JSON array: {e}") logger.info("Attempting line-by-line parsing") mlperf_results = [] with open(mlperf_log_file, 'r') as f2: @@ -162,7 +172,8 @@ def process_mlperf_log_accuracy(mlperf_log_file: Union[str, Path], try: mlperf_results.append(json.loads(line)) except json.JSONDecodeError as e: - logger.warning(f"Failed to parse line {line_num}: {e}") + logger.warning( + f"Failed to parse line {line_num}: {e}") continue else: # Newline-delimited JSON format @@ -180,7 +191,7 @@ def process_mlperf_log_accuracy(mlperf_log_file: Union[str, Path], except json.JSONDecodeError as e: logger.warning(f"Failed to parse line {line_num}: {e}") continue - + logger.info(f"Loaded {len(mlperf_results)} MLPerf results") except Exception as e: raise RuntimeError(f"Failed to load MLPerf log file: {e}") @@ -220,7 +231,8 @@ def process_mlperf_log_accuracy(mlperf_log_file: Union[str, Path], questions_required.append(questions[qsl_idx]) except Exception as e: - logger.warning(f"Error processing entry with qsl_idx {qsl_idx}: {e}") + logger.warning( + f"Error processing entry with qsl_idx {qsl_idx}: {e}") continue if not preds_token_ids: @@ -271,7 +283,11 @@ def validate_dataframe(df: pd.DataFrame) -> None: if not 
isinstance(df, pd.DataFrame): raise ValueError("Input must be a pandas DataFrame") - required_cols = ['model_output', 'dataset', 'ground_truth', 'tok_model_output_len'] + required_cols = [ + 'model_output', + 'dataset', + 'ground_truth', + 'tok_model_output_len'] missing_cols = [col for col in required_cols if col not in df.columns] if missing_cols: raise ValueError(f"Missing required columns: {missing_cols}") @@ -390,7 +406,8 @@ def parse_code(text: str) -> Optional[str]: # Answer Evaluation Functions # ============================================================================= -def evaluate_multiple_choice(parsed: Optional[str], ground_truth: str, valid_options: str) -> bool: +def evaluate_multiple_choice( + parsed: Optional[str], ground_truth: str, valid_options: str) -> bool: """Evaluate multiple choice answer.""" if not parsed or not ground_truth: return False @@ -414,10 +431,12 @@ def evaluate_math500(parsed: Optional[str], ground_truth: str) -> bool: # Use sys.path approach for proper module importing workspace_path = os.path.dirname(os.path.abspath(__file__)) - prm800k_module_path = os.path.join(workspace_path, "submodules", "prm800k", "prm800k") + prm800k_module_path = os.path.join( + workspace_path, "submodules", "prm800k", "prm800k") if not os.path.exists(prm800k_module_path): - raise FileNotFoundError(f"PRM800K module not found at: {prm800k_module_path}") + raise FileNotFoundError( + f"PRM800K module not found at: {prm800k_module_path}") # Save current directory and sys.path original_cwd = os.getcwd() @@ -427,10 +446,10 @@ def evaluate_math500(parsed: Optional[str], ground_truth: str) -> bool: # Add prm800k module path to sys.path if prm800k_module_path not in sys.path: sys.path.insert(0, prm800k_module_path) - + # Change directory as some imports might use relative paths os.chdir(prm800k_module_path) - + # Now import should work from grading.grader import grade_answer result = grade_answer(given_answer=parsed, ground_truth=ground_truth) @@ -622,7 +641,8 @@ def process_row(row: pd.Series) -> Dict[str, Any]: } -def process_livecodebench_parallel(df: pd.DataFrame, group_indices: pd.Index) -> Tuple[int, int]: +def process_livecodebench_parallel( + df: pd.DataFrame, group_indices: pd.Index) -> Tuple[int, int]: """Process LiveCodeBench items in parallel.""" # Prepare work items work_items = [] @@ -726,7 +746,8 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: # Unified Evaluation Utilities # ============================================================================= -def print_evaluation_results(df_evaluated: pd.DataFrame, logger: Optional[logging.Logger] = None) -> Dict[str, Any]: +def print_evaluation_results(df_evaluated: pd.DataFrame, + logger: Optional[logging.Logger] = None) -> Dict[str, Any]: """Print evaluation results in a unified format. Args: @@ -762,8 +783,8 @@ def print_evaluation_results(df_evaluated: pd.DataFrame, logger: Optional[loggin def process_and_save_dataframe(df: pd.DataFrame, - output_dir: Optional[Union[str, Path]] = None, - base_filename: Optional[str] = None) -> Tuple[pd.DataFrame, str]: + output_dir: Optional[Union[str, Path]] = None, + base_filename: Optional[str] = None) -> Tuple[pd.DataFrame, str]: """Process dataframe for evaluation and save the results. 
Args: @@ -779,7 +800,8 @@ def process_and_save_dataframe(df: pd.DataFrame, # Determine output path if output_dir is None: - # Try to infer from existing path info in the dataframe or use current directory + # Try to infer from existing path info in the dataframe or use current + # directory output_dir = Path.cwd() else: output_dir = Path(output_dir) diff --git a/language/deepseek-r1/mlperf/__init__.py b/language/deepseek-r1/mlperf/__init__.py index 33b3154f6b..bfe95c35e3 100644 --- a/language/deepseek-r1/mlperf/__init__.py +++ b/language/deepseek-r1/mlperf/__init__.py @@ -16,8 +16,8 @@ __all__ = [ # SUTs - 'BaseSUT', - 'OfflineSUT', + 'BaseSUT', + 'OfflineSUT', 'ServerSUT', # QSL 'QuerySampleLibrary', @@ -26,4 +26,4 @@ 'prepare_mlperf_dataset', 'process_mlperf_results', 'create_mlperf_output_dataframe' -] \ No newline at end of file +] diff --git a/language/deepseek-r1/mlperf/base_sut.py b/language/deepseek-r1/mlperf/base_sut.py index 7249207aab..f1d32eb869 100644 --- a/language/deepseek-r1/mlperf/base_sut.py +++ b/language/deepseek-r1/mlperf/base_sut.py @@ -12,65 +12,65 @@ class BaseSUT(abc.ABC): """Base class for MLPerf inference System Under Test (SUT). - + This class defines the interface that all SUTs must implement for MLPerf inference benchmarks. It provides two main methods: - issue_queries: to enqueue prompt tokens - flush_queries: to await completion of all issued queries """ - + def __init__(self, name: str = "BaseSUT"): """Initialize the base SUT. - + Args: name: Name of the SUT for logging purposes """ self.name = name self.sut = None logger.info(f"Initializing {self.name}") - + @abc.abstractmethod def issue_queries(self, query_samples: List[lg.QuerySample]) -> None: """Issue queries to the SUT. - + This method should enqueue the provided query samples for processing. It should return immediately without waiting for completion. - + Args: query_samples: List of MLPerf LoadGen query samples to process """ raise NotImplementedError("Subclasses must implement issue_queries") - + @abc.abstractmethod def flush_queries(self) -> None: """Flush all pending queries. - + This method should wait for all previously issued queries to complete before returning. It's called by LoadGen to ensure all work is done. """ raise NotImplementedError("Subclasses must implement flush_queries") - + def start(self) -> lg.ConstructSUT: """Start the SUT and return the LoadGen SUT handle. 
- + Returns: LoadGen SUT handle for use with LoadGen """ self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries) logger.info(f"{self.name} started") return self.sut - + def stop(self) -> None: """Stop the SUT and clean up resources.""" if self.sut: lg.DestroySUT(self.sut) self.sut = None logger.info(f"{self.name} stopped") - + def __enter__(self): """Context manager entry.""" return self.start() - + def __exit__(self, exc_type, exc_val, exc_tb): """Context manager exit.""" - self.stop() \ No newline at end of file + self.stop() diff --git a/language/deepseek-r1/mlperf/offline_sut.py b/language/deepseek-r1/mlperf/offline_sut.py index db1c4feea1..00382f4660 100644 --- a/language/deepseek-r1/mlperf/offline_sut.py +++ b/language/deepseek-r1/mlperf/offline_sut.py @@ -44,11 +44,15 @@ def __init__(self, self.dataset_strings = dataset_strings # Determine backend type using registry - self.backend_name = getattr(backend, 'backend_name', type(backend).__name__.lower()) + self.backend_name = getattr( + backend, + 'backend_name', + type(backend).__name__.lower()) self.uses_text_prompts = uses_text_input(self.backend_name) if self.uses_text_prompts and dataset_strings is None: - raise ValueError(f"Backend {self.backend_name} requires text prompts but dataset_strings was not provided") + raise ValueError( + f"Backend {self.backend_name} requires text prompts but dataset_strings was not provided") # Async event loop and thread self.loop = None @@ -122,12 +126,15 @@ async def _process_all_queries_async(self): # Prepare prompts for batch processing (like run_eval.py) if self.uses_text_prompts: # Use text prompts for vLLM and SGLang - prompts = [self.dataset_strings[sample.index] for sample in query_samples] + prompts = [self.dataset_strings[sample.index] + for sample in query_samples] futures = self.backend.generate_async(text_prompts=prompts) else: # Use tokenized prompts for other backends - prompts = [self.dataset[sample.index] for sample in query_samples] - futures = self.backend.generate_async(tokenized_prompts=prompts) + prompts = [self.dataset[sample.index] + for sample in query_samples] + futures = self.backend.generate_async( + tokenized_prompts=prompts) logger.info(f"Got {len(futures)} futures from backend") @@ -136,7 +143,8 @@ async def _process_all_queries_async(self): indexed_futures = [(i, future) for i, future in enumerate(futures)] completed_indices = set() - # Use asyncio.wait with FIRST_COMPLETED to handle out-of-order completion + # Use asyncio.wait with FIRST_COMPLETED to handle out-of-order + # completion pending = {future for _, future in indexed_futures} while pending: @@ -153,12 +161,14 @@ async def _process_all_queries_async(self): break if original_idx is None: - logger.error("Could not find original index for completed future") + logger.error( + "Could not find original index for completed future") continue # Check for duplicate completion if original_idx in completed_indices: - logger.warning(f"Prompt {original_idx} completed multiple times!") + logger.warning( + f"Prompt {original_idx} completed multiple times!") continue try: @@ -174,36 +184,44 @@ async def _process_all_queries_async(self): await self._send_result_to_loadgen(sample, result) except Exception as e: - logger.error(f"Error processing prompt {original_idx}: {type(e).__name__}: {e}") + logger.error( + f"Error processing prompt {original_idx}: {type(e).__name__}: {e}") # Raise the error instead of handling empty responses - raise RuntimeError(f"Backend failed to generate tokens for prompt {original_idx}: 
{e}") + raise RuntimeError( + f"Backend failed to generate tokens for prompt {original_idx}: {e}") # Verify all results are populated if len(completed_indices) != len(futures): missing_count = len(futures) - len(completed_indices) - raise RuntimeError(f"Missing results: completed {len(completed_indices)} != {len(futures)} total ({missing_count} missing)") + raise RuntimeError( + f"Missing results: completed {len(completed_indices)} != {len(futures)} total ({missing_count} missing)") for i, result in enumerate(results): if result is None: raise RuntimeError(f"Missing result for prompt {i}") - logger.info(f"Completed all {len(completed_indices)} prompts successfully") + logger.info( + f"Completed all {len(completed_indices)} prompts successfully") except Exception as e: - logger.error(f"Error during batch processing: {type(e).__name__}: {e}") + logger.error( + f"Error during batch processing: {type(e).__name__}: {e}") import traceback traceback.print_exc() raise # Re-raise instead of sending empty responses - async def _send_result_to_loadgen(self, sample: lg.QuerySample, result: Dict[str, Any]): + async def _send_result_to_loadgen( + self, sample: lg.QuerySample, result: Dict[str, Any]): """Send a single result to LoadGen.""" try: # Validate that tokens exist - raise error if missing tokens = result.get('tokens') if tokens is None: - raise ValueError(f"Backend result missing 'tokens' key for query {sample.id}") + raise ValueError( + f"Backend result missing 'tokens' key for query {sample.id}") if not isinstance(tokens, (list, tuple)) or len(tokens) == 0: - raise ValueError(f"Backend returned empty or invalid tokens for query {sample.id}: {tokens}") + raise ValueError( + f"Backend returned empty or invalid tokens for query {sample.id}: {tokens}") # Create a copy of tokens before numpy conversion tokens_copy = tokens.copy() @@ -229,12 +247,15 @@ async def _send_result_to_loadgen(self, sample: lg.QuerySample, result: Dict[str # Send response to LoadGen lg.QuerySamplesComplete([response]) - logger.debug(f"Sent {n_tokens} tokens to LoadGen for query {sample.id}") + logger.debug( + f"Sent {n_tokens} tokens to LoadGen for query {sample.id}") except Exception as e: - logger.error(f"Error sending result to LoadGen for query {sample.id}: {e}") + logger.error( + f"Error sending result to LoadGen for query {sample.id}: {e}") # Raise the error instead of sending empty response - raise RuntimeError(f"Failed to send result to LoadGen for query {sample.id}: {e}") + raise RuntimeError( + f"Failed to send result to LoadGen for query {sample.id}: {e}") def _run_event_loop(self): """Run the async event loop in a separate thread.""" @@ -282,7 +303,8 @@ def get_results(self) -> List[Dict[str, Any]]: # Sort by index to maintain dataset order queried_indices = sorted(self.index_to_id.keys()) - logger.info(f"Retrieving results for {len(queried_indices)} queried samples") + logger.info( + f"Retrieving results for {len(queried_indices)} queried samples") # Process results in order of dataset indices using stored results for i in queried_indices: @@ -296,7 +318,8 @@ def get_results(self) -> List[Dict[str, Any]]: tokens = result['tokens'] output_text = result.get('text', '') if not output_text and self.backend.tokenizer: - output_text = self.backend.tokenizer.decode(result['tokens'], skip_special_tokens=True) + output_text = self.backend.tokenizer.decode( + result['tokens'], skip_special_tokens=True) ordered_results.append({ 'model_output': output_text, @@ -305,6 +328,7 @@ def get_results(self) -> List[Dict[str, Any]]: 
}) else: # No backend result for this sample - raise RuntimeError(f"No backend result stored for dataset index {i}, sample_id {sample_id}") + raise RuntimeError( + f"No backend result stored for dataset index {i}, sample_id {sample_id}") - return ordered_results \ No newline at end of file + return ordered_results diff --git a/language/deepseek-r1/mlperf/qsl.py b/language/deepseek-r1/mlperf/qsl.py index 59bc5e36a8..d4c9405a4e 100644 --- a/language/deepseek-r1/mlperf/qsl.py +++ b/language/deepseek-r1/mlperf/qsl.py @@ -7,12 +7,12 @@ class QuerySampleLibrary: """MLPerf QuerySampleLibrary implementation for single-process execution.""" - - def __init__(self, dataset: List[List[int]], dataset_strings: List[str], + + def __init__(self, dataset: List[List[int]], dataset_strings: List[str], name: str = "QSL"): """ Initialize QSL with dataset. - + Args: dataset: List of tokenized prompts dataset_strings: List of original prompt strings @@ -24,7 +24,7 @@ def __init__(self, dataset: List[List[int]], dataset_strings: List[str], self.perf_count = self.count self.name = name self.logger = logging.getLogger(__name__) - + # Create LoadGen QSL self.qsl = lg.ConstructQSL( self.count, @@ -33,7 +33,7 @@ def __init__(self, dataset: List[List[int]], dataset_strings: List[str], lambda x: None # UnloadSamplesFromRam ) self.logger.info(f"Created {self.name} with {self.count} samples") - + def __del__(self): """Cleanup QSL.""" if self.qsl is not None: @@ -43,12 +43,12 @@ def __del__(self): class DistributedQuerySampleLibrary: """QuerySampleLibrary for distributed execution (MPI/torchrun).""" - + def __init__(self, dataset: List[List[int]], dataset_strings: List[str], rank: int, world_size: int, name: str = "DistributedQSL"): """ Initialize distributed QSL. - + Args: dataset: List of tokenized prompts dataset_strings: List of original prompt strings @@ -64,10 +64,10 @@ def __init__(self, dataset: List[List[int]], dataset_strings: List[str], self.world_size = world_size self.name = name self.logger = logging.getLogger(__name__) - + # Track if this is rank zero explicitly self.is_rank_zero = (self.rank == 0) - + # Only rank 0 creates the actual QSL if self.is_rank_zero: self.qsl = lg.ConstructQSL( @@ -76,12 +76,13 @@ def __init__(self, dataset: List[List[int]], dataset_strings: List[str], lambda x: None, lambda x: None ) - self.logger.info(f"Created {self.name} with {self.count} samples on rank 0") + self.logger.info( + f"Created {self.name} with {self.count} samples on rank 0") else: self.qsl = None - + def __del__(self): """Cleanup QSL on rank 0.""" if self.is_rank_zero and self.qsl is not None: lg.DestroyQSL(self.qsl) - self.logger.info(f"{self.name} destroyed on rank 0") \ No newline at end of file + self.logger.info(f"{self.name} destroyed on rank 0") diff --git a/language/deepseek-r1/mlperf/server_sut.py b/language/deepseek-r1/mlperf/server_sut.py index 75699e208f..e3acb2bde8 100644 --- a/language/deepseek-r1/mlperf/server_sut.py +++ b/language/deepseek-r1/mlperf/server_sut.py @@ -69,11 +69,15 @@ def __init__(self, self.dataset_strings = dataset_strings # Determine backend type using registry - self.backend_name = getattr(backend, 'backend_name', type(backend).__name__.lower()) + self.backend_name = getattr( + backend, + 'backend_name', + type(backend).__name__.lower()) self.uses_text_prompts = uses_text_input(self.backend_name) if self.uses_text_prompts and dataset_strings is None: - raise ValueError(f"Backend {self.backend_name} requires text prompts but dataset_strings was not provided") + raise ValueError( + 
f"Backend {self.backend_name} requires text prompts but dataset_strings was not provided") # Async event loop and thread self.loop = None @@ -91,8 +95,6 @@ def __init__(self, self.all_results: Dict[int, Dict[str, Any]] = {} self.results_lock = asyncio.Lock() - - def issue_queries(self, query_samples: List[lg.QuerySample]) -> None: """Issue queries in streaming mode with batching.""" if not supports_streaming(): @@ -123,7 +125,8 @@ async def _start_streaming_query(self, query_info: QueryInfo) -> None: try: # Verify streaming support if not supports_streaming(): - raise RuntimeError(f"Backend {self.backend_name} does not support streaming required for server mode") + raise RuntimeError( + f"Backend {self.backend_name} does not support streaming required for server mode") # Prepare prompt based on backend type if self.uses_text_prompts: @@ -155,8 +158,10 @@ async def _start_streaming_query(self, query_info: QueryInfo) -> None: task.add_done_callback(self._remove_task_from_active) except Exception as e: - logger.error(f"Error starting stream for query {query_info.query_id}: {e}") - raise RuntimeError(f"Failed to start streaming for query {query_info.query_id}: {e}") + logger.error( + f"Error starting stream for query {query_info.query_id}: {e}") + raise RuntimeError( + f"Failed to start streaming for query {query_info.query_id}: {e}") def _remove_task_from_active(self, task: asyncio.Task) -> None: """Remove a completed task from the active set.""" @@ -181,7 +186,8 @@ async def _process_stream(self, state: StreamingQueryState) -> None: state.accumulated_tokens.extend(chunk.token_ids) # Report first token immediately for TTFT measurement - if not state.first_token_sent and (chunk.token or chunk.token_ids): + if not state.first_token_sent and ( + chunk.token or chunk.token_ids): state.first_token_time = current_time - state.start_time state.first_token_sent = True @@ -197,35 +203,43 @@ async def _process_stream(self, state: StreamingQueryState) -> None: except asyncio.CancelledError: # Task was cancelled, clean up gracefully - logger.debug(f"Stream processing cancelled for query {state.query_info.query_id}") - # Close the async generator properly (assume aclose exists in our containerized environment) + logger.debug( + f"Stream processing cancelled for query {state.query_info.query_id}") + # Close the async generator properly (assume aclose exists in our + # containerized environment) try: await state.stream_gen.aclose() except Exception: pass raise except Exception as e: - logger.error(f"Error processing stream for query {state.query_info.query_id}: {e}") - raise RuntimeError(f"Stream processing failed for query {state.query_info.query_id}: {e}") + logger.error( + f"Error processing stream for query {state.query_info.query_id}: {e}") + raise RuntimeError( + f"Stream processing failed for query {state.query_info.query_id}: {e}") finally: # Clean up active stream async with self.active_streams_lock: self.active_streams.pop(state.query_info.query_id, None) - async def _send_first_token_response(self, state: StreamingQueryState) -> None: + async def _send_first_token_response( + self, state: StreamingQueryState) -> None: """Send first token notification to LoadGen for TTFT measurement.""" - logger.debug(f"First token received for query {state.query_info.query_id} at {state.first_token_time:.3f}s") + logger.debug( + f"First token received for query {state.query_info.query_id} at {state.first_token_time:.3f}s") # Convert first tokens to proper format for LoadGen if state.accumulated_tokens: - 
output_tokens = np.ascontiguousarray(state.accumulated_tokens, dtype=np.int32) + output_tokens = np.ascontiguousarray( + state.accumulated_tokens, dtype=np.int32) else: # If no token IDs available, encode the text if hasattr(self.backend, 'tokenizer') and state.accumulated_text: tokens = self.backend.tokenizer.encode(state.accumulated_text) output_tokens = np.ascontiguousarray(tokens, dtype=np.int32) else: - raise RuntimeError(f"No token IDs available for first token response for query {state.query_info.query_id}") + raise RuntimeError( + f"No token IDs available for first token response for query {state.query_info.query_id}") output_seq_len = len(output_tokens) output_toks_ptr = output_tokens.ctypes.data if output_seq_len > 0 else 0 @@ -248,22 +262,25 @@ async def _send_final_response(self, state: StreamingQueryState) -> None: if state.accumulated_tokens: # Create a copy of tokens before numpy conversion tokens_to_send = state.accumulated_tokens.copy() - token_array = np.array(state.accumulated_tokens, dtype=np.int32) + token_array = np.array( + state.accumulated_tokens, dtype=np.int32) else: # If no tokens, encode the text - if hasattr(self.backend, 'tokenizer') and state.accumulated_text: - tokens = self.backend.tokenizer.encode(state.accumulated_text) + if hasattr(self.backend, + 'tokenizer') and state.accumulated_text: + tokens = self.backend.tokenizer.encode( + state.accumulated_text) # Create a copy of tokens before numpy conversion tokens_to_send = tokens.copy() token_array = np.array(tokens, dtype=np.int32) else: - raise RuntimeError(f"No tokens or tokenizer available for query {state.query_info.query_id}") + raise RuntimeError( + f"No tokens or tokenizer available for query {state.query_info.query_id}") # Validate we have tokens if len(token_array) == 0: - raise RuntimeError(f"No tokens generated for query {state.query_info.query_id}") - - + raise RuntimeError( + f"No tokens generated for query {state.query_info.query_id}") # Create LoadGen response response = lg.QuerySampleResponse( @@ -287,11 +304,14 @@ async def _send_final_response(self, state: StreamingQueryState) -> None: } self.all_results[state.query_info.query_id] = state.query_info.result - logger.debug(f"Sent {len(token_array)} tokens to LoadGen for query {state.query_info.query_id}") + logger.debug( + f"Sent {len(token_array)} tokens to LoadGen for query {state.query_info.query_id}") except Exception as e: - logger.error(f"Error sending final response for query {state.query_info.query_id}: {e}") - raise RuntimeError(f"Failed to send final response for query {state.query_info.query_id}: {e}") + logger.error( + f"Error sending final response for query {state.query_info.query_id}: {e}") + raise RuntimeError( + f"Failed to send final response for query {state.query_info.query_id}: {e}") def flush_queries(self) -> None: """Wait for all active streams to complete.""" @@ -313,13 +333,16 @@ async def wait_for_streams(): async with self.active_streams_lock: if self.active_streams: - logger.warning(f"Timeout: {len(self.active_streams)} streams still active") + logger.warning( + f"Timeout: {len(self.active_streams)} streams still active") # Run the wait task in the event loop if self.loop and not self.loop.is_closed(): - future = asyncio.run_coroutine_threadsafe(wait_for_streams(), self.loop) + future = asyncio.run_coroutine_threadsafe( + wait_for_streams(), self.loop) try: - future.result(timeout=310) # Slightly longer than internal timeout + # Slightly longer than internal timeout + future.result(timeout=310) except Exception 
as e: logger.error(f"Error waiting for streams to complete: {e}") @@ -352,7 +375,8 @@ async def cancel_all_tasks(): tasks_to_cancel = list(self.active_tasks) if tasks_to_cancel: - logger.info(f"Cancelling {len(tasks_to_cancel)} active streaming tasks...") + logger.info( + f"Cancelling {len(tasks_to_cancel)} active streaming tasks...") for task in tasks_to_cancel: task.cancel() @@ -365,7 +389,8 @@ async def cancel_all_tasks(): self.active_tasks.clear() # Run the cancellation in the event loop - future = asyncio.run_coroutine_threadsafe(cancel_all_tasks(), self.loop) + future = asyncio.run_coroutine_threadsafe( + cancel_all_tasks(), self.loop) try: future.result(timeout=10.0) # Give tasks time to cancel except Exception as e: @@ -405,10 +430,12 @@ def get_results(self) -> List[Dict[str, Any]]: # Only process results for samples that were actually queried # Sort by index to maintain dataset order queried_indices = sorted(index_to_result.keys()) - - logger.info(f"Retrieving results for {len(queried_indices)} queried samples") - # Process results in order of dataset indices using stored backend results + logger.info( + f"Retrieving results for {len(queried_indices)} queried samples") + + # Process results in order of dataset indices using stored backend + # results for i in queried_indices: result = index_to_result[i] @@ -416,7 +443,8 @@ def get_results(self) -> List[Dict[str, Any]]: tokens = result['tokens'] output_text = result.get('text', '') if not output_text and self.backend.tokenizer: - output_text = self.backend.tokenizer.decode(result['tokens'], skip_special_tokens=True) + output_text = self.backend.tokenizer.decode( + result['tokens'], skip_special_tokens=True) ordered_results.append({ 'model_output': output_text, @@ -424,4 +452,4 @@ def get_results(self) -> List[Dict[str, Any]]: 'tok_model_output_len': len(tokens) }) - return ordered_results \ No newline at end of file + return ordered_results diff --git a/language/deepseek-r1/mlperf/utils.py b/language/deepseek-r1/mlperf/utils.py index 973e46c201..f4fbedda41 100644 --- a/language/deepseek-r1/mlperf/utils.py +++ b/language/deepseek-r1/mlperf/utils.py @@ -7,15 +7,15 @@ from utils.tokenization import StandardTokenizer -def prepare_mlperf_dataset(input_file: str, - backend_name: Optional[str] = None, - tokenizer: StandardTokenizer = None, - num_samples: Optional[int] = None, - skip_samples: int = 0, - use_chat_template: Optional[bool] = None) -> Dict[str, Any]: +def prepare_mlperf_dataset(input_file: str, + backend_name: Optional[str] = None, + tokenizer: StandardTokenizer = None, + num_samples: Optional[int] = None, + skip_samples: int = 0, + use_chat_template: Optional[bool] = None) -> Dict[str, Any]: """ Prepare dataset for MLPerf inference. - + Args: input_file: Path to input pickle file backend_name: Optional backend name override. If None, uses MLPERF_BACKEND env var. 
@@ -24,29 +24,30 @@ def prepare_mlperf_dataset(input_file: str, num_samples: Number of samples to use skip_samples: Number of samples to skip use_chat_template: Whether to use chat template (if None, determined by registry) - + Returns: Dictionary with prepared dataset components """ if backend_name is None: from utils.backend_registry import detect_backend backend_name = detect_backend() - + # Load and validate dataset df = load_dataset(input_file, num_samples, skip_samples) validate_dataset(df) - + prompts = df['text_input'].tolist() print(f"[MLPerf] Loaded {len(prompts)} prompts from dataset") - + # Check if backend uses text prompts from registry uses_text_prompts = uses_text_input() - + # Determine chat template usage from registry if not specified if use_chat_template is None: use_chat_template = uses_chat_template() - print(f"[MLPerf] Using chat template from registry: {use_chat_template}") - + print( + f"[MLPerf] Using chat template from registry: {use_chat_template}") + if uses_text_prompts: print(f"[MLPerf] Backend {backend_name} uses text prompts directly") return { @@ -62,7 +63,7 @@ def prepare_mlperf_dataset(input_file: str, prompts, use_chat_template ) print(f"[MLPerf] Tokenized {len(tokenized_prompts)} prompts") - + return { 'dataframe': df, 'prompts': prompts, @@ -73,61 +74,63 @@ def prepare_mlperf_dataset(input_file: str, def process_mlperf_results(sut_results: List[Dict[str, Any]], - tokenizer: Optional[StandardTokenizer] = None, - backend_name: Optional[str] = None, - uses_text_prompts: Optional[bool] = None) -> List[Dict[str, Any]]: + tokenizer: Optional[StandardTokenizer] = None, + backend_name: Optional[str] = None, + uses_text_prompts: Optional[bool] = None) -> List[Dict[str, Any]]: """ Process MLPerf SUT results into standardized format. - + Args: sut_results: Raw results from MLPerf SUT tokenizer: StandardTokenizer for decoding backend_name: Optional backend name override. If None, uses MLPERF_BACKEND env var. (Kept for backward compatibility but not used in our codebase) uses_text_prompts: Whether backend uses text prompts (if None, determined by registry) - + Returns: List of processed result dictionaries """ from utils.tokenization import process_inference_results - + if backend_name is None: from utils.backend_registry import detect_backend backend_name = detect_backend() - + # Determine text prompt usage from registry if not specified if uses_text_prompts is None: uses_text_prompts = uses_text_input() - + # Reuse the general inference result processing - return process_inference_results(sut_results, tokenizer, uses_text_prompts=uses_text_prompts) + return process_inference_results( + sut_results, tokenizer, uses_text_prompts=uses_text_prompts) def create_mlperf_output_dataframe(input_df: pd.DataFrame, - results: List[Dict[str, Any]], - backend_name: Optional[str] = None) -> pd.DataFrame: + results: List[Dict[str, Any]], + backend_name: Optional[str] = None) -> pd.DataFrame: """ Create output dataframe with MLPerf results. - + Args: input_df: Input dataframe results: Processed MLPerf results backend_name: Optional backend name override. If None, uses MLPERF_BACKEND env var. 
(Kept for backward compatibility but not used in our codebase) - + Returns: Output dataframe with results """ if backend_name is None: from utils.backend_registry import detect_backend backend_name = detect_backend() - + df_output = input_df.copy() - + # Add result columns df_output['model_output'] = [r['model_output'] for r in results] df_output['tok_model_output'] = [r['tok_model_output'] for r in results] - df_output['tok_model_output_len'] = [r['tok_model_output_len'] for r in results] + df_output['tok_model_output_len'] = [ + r['tok_model_output_len'] for r in results] df_output['model_backend'] = backend_name - - return df_output \ No newline at end of file + + return df_output diff --git a/language/deepseek-r1/run_eval.py b/language/deepseek-r1/run_eval.py index 169b3473e4..8965101bd4 100755 --- a/language/deepseek-r1/run_eval.py +++ b/language/deepseek-r1/run_eval.py @@ -1,4 +1,13 @@ #!/usr/bin/env python3 +from utils import ( + load_dataset, save_results, validate_dataset, generate_timestamped_filename, + validate_runner_for_backend, uses_text_input, uses_chat_template, + StandardTokenizer, process_inference_results, + get_backend_instance, create_base_argument_parser, print_runner_header, + setup_output_paths, validate_runner_args, handle_runner_error, + validate_dataset_extended, supports_async +) +from backends import BaseBackend import argparse import asyncio import os @@ -11,59 +20,51 @@ # Disable tokenizers parallelism to avoid forking issues os.environ["TOKENIZERS_PARALLELISM"] = "false" -from backends import BaseBackend -from utils import ( - load_dataset, save_results, validate_dataset, generate_timestamped_filename, - validate_runner_for_backend, uses_text_input, uses_chat_template, - StandardTokenizer, process_inference_results, - get_backend_instance, create_base_argument_parser, print_runner_header, - setup_output_paths, validate_runner_args, handle_runner_error, - validate_dataset_extended, supports_async -) - def create_argument_parser() -> argparse.ArgumentParser: """Create argument parser with shared arguments only.""" parser = create_base_argument_parser( "Modular backend evaluation system for MLPerf DeepSeek reference implementation" ) - + # Add runner-specific arguments parser.add_argument("--async", action="store_true", - help="Use async generation instead of synchronous") - + help="Use async generation instead of synchronous") + return parser -async def run_async_inference(backend: BaseBackend, - tokenized_prompts: List[List[int]], - text_prompts: Optional[List[str]] = None) -> List[Dict[str, Any]]: +async def run_async_inference(backend: BaseBackend, + tokenized_prompts: List[List[int]], + text_prompts: Optional[List[str]] = None) -> List[Dict[str, Any]]: """Run async inference with proper error handling and progress bar that updates as tasks complete.""" try: # Get futures from backend if uses_text_input(): futures = backend.generate_async(text_prompts=text_prompts) else: - futures = backend.generate_async(tokenized_prompts=tokenized_prompts) - + futures = backend.generate_async( + tokenized_prompts=tokenized_prompts) + # Create a list to store results in order results = [None] * len(futures) - + # Create enumerated futures with their original indices for tracking indexed_futures = [(i, future) for i, future in enumerate(futures)] - + # Track completion for debugging completed_indices = set() - + # Process tasks with progress bar that updates as tasks complete with async_tqdm(total=len(futures), desc="Async inference", unit="prompt") as pbar: - # Use 
asyncio.wait with FIRST_COMPLETED to handle out-of-order completion + # Use asyncio.wait with FIRST_COMPLETED to handle out-of-order + # completion pending = {future for _, future in indexed_futures} - + while pending: # Wait for at least one future to complete done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED) - + # Process all completed futures in this batch for completed_future in done: # Find the original index for this completed future @@ -72,46 +73,51 @@ async def run_async_inference(backend: BaseBackend, if future is completed_future: original_idx = idx break - + if original_idx is None: - print(f"\nWarning: Could not find original index for completed future") + print( + f"\nWarning: Could not find original index for completed future") continue - + # Check for duplicate completion if original_idx in completed_indices: - print(f"\nWarning: Prompt {original_idx} completed multiple times!") + print( + f"\nWarning: Prompt {original_idx} completed multiple times!") continue - + try: # Get the result from the completed future result = await completed_future - + # Store the result in the correct position results[original_idx] = result completed_indices.add(original_idx) - + except Exception as e: - print(f"\nError processing prompt {original_idx}: {type(e).__name__}: {e}") + print( + f"\nError processing prompt {original_idx}: {type(e).__name__}: {e}") import traceback traceback.print_exception(type(e), e, e.__traceback__) - + # Raise the error instead of using empty tokens - raise RuntimeError(f"Backend failed to generate tokens for prompt {original_idx}: {e}") - + raise RuntimeError( + f"Backend failed to generate tokens for prompt {original_idx}: {e}") + # Update progress bar after each completion pbar.update(1) - + # Verify all results are populated if len(completed_indices) != len(futures): missing_count = len(futures) - len(completed_indices) - raise RuntimeError(f"Missing results: completed {len(completed_indices)} != {len(futures)} total ({missing_count} missing)") - + raise RuntimeError( + f"Missing results: completed {len(completed_indices)} != {len(futures)} total ({missing_count} missing)") + for i, result in enumerate(results): if result is None: raise RuntimeError(f"Missing result for prompt {i}") - + print(f"\nCompleted all {len(completed_indices)} prompts successfully") - + return results except Exception as e: print(f"Error during async inference: {type(e).__name__}: {e}") @@ -120,9 +126,9 @@ async def run_async_inference(backend: BaseBackend, raise -def run_sync_inference(backend: BaseBackend, - tokenized_prompts: List[List[int]], - text_prompts: Optional[List[str]] = None) -> List[Dict[str, Any]]: +def run_sync_inference(backend: BaseBackend, + tokenized_prompts: List[List[int]], + text_prompts: Optional[List[str]] = None) -> List[Dict[str, Any]]: """Run sync inference with proper error handling.""" try: if uses_text_input(): @@ -140,46 +146,52 @@ def main(): # Parse arguments parser = create_argument_parser() args = parser.parse_args() - + try: # Validate arguments validate_runner_args(args, 'eval') - + # Detect backend early backend_name = validate_runner_for_backend('eval') - + # Set up output paths output_dir, output_file = setup_output_paths(args) if args.output_file is None: args.output_file = output_file - - # Generate the actual filename with timestamp that will be used for saving - actual_output_file = generate_timestamped_filename(args.output_file, add_timestamp=True) - + + # Generate the actual filename with timestamp that will be 
used for + # saving + actual_output_file = generate_timestamped_filename( + args.output_file, add_timestamp=True) + # Get async flag using getattr since 'async' is a reserved keyword use_async = getattr(args, 'async', False) - + # Check if backend supports async if use_async and not supports_async(): - raise RuntimeError(f"Backend {backend_name} does not support async generation") - + raise RuntimeError( + f"Backend {backend_name} does not support async generation") + # Print header - print_runner_header("Modular Backend Evaluation System", backend_name, args) + print_runner_header( + "Modular Backend Evaluation System", + backend_name, + args) print(f"Mode: {'Async' if use_async else 'Sync'}") print("=" * 80) - + # Load and validate dataset df = load_dataset(args.input_file, args.num_samples, args.skip_samples) validate_dataset_extended(df) - + prompts = df['text_input'].tolist() - + # Initialize tokenizer tokenizer = StandardTokenizer() - + # Determine whether to use chat template based on registry use_chat_template = uses_chat_template() - + # For text-prompt backends, we'll pass the prompts directly # For tokenized-prompt backends, we need to tokenize first if uses_text_input(): @@ -195,19 +207,19 @@ def main(): ) print(f"Tokenized {len(tokenized_prompts)} prompts") print(f"Tokenizer Max length: {tokenizer.max_length}") - + # Initialize backend using registry print(f"\nInitializing {backend_name.upper()} backend...") backend = get_backend_instance(backend_name) - + with backend: # Create new output dataframe with only required columns df_output = pd.DataFrame() - + # Copy all columns from input dataframe first for col in df.columns: df_output[col] = df[col] - + # Run inference with appropriate prompt format if use_async: print("Running async inference...") @@ -217,26 +229,31 @@ def main(): print("Running sync inference...") raw_results = run_sync_inference( backend, tokenized_prompts, text_prompts=prompts) - + # Process raw results into standardized format using shared utility print("Processing results...") standardized_results = process_inference_results( raw_results, tokenizer ) - + # Add generated columns - df_output['model_output'] = [r['model_output'] for r in standardized_results] - df_output['tok_model_output'] = [r['tok_model_output'] for r in standardized_results] - df_output['tok_model_output_len'] = [r['tok_model_output_len'] for r in standardized_results] - df_output['model_backend'] = [r['model_backend'] for r in standardized_results] - + df_output['model_output'] = [r['model_output'] + for r in standardized_results] + df_output['tok_model_output'] = [r['tok_model_output'] + for r in standardized_results] + df_output['tok_model_output_len'] = [ + r['tok_model_output_len'] for r in standardized_results] + df_output['model_backend'] = [r['model_backend'] + for r in standardized_results] + # Save results - output_file = save_results(df_output, args.output_file, add_timestamp=True) - + output_file = save_results( + df_output, args.output_file, add_timestamp=True) + print(f"\nEvaluation completed successfully!") print(f"Results saved to: {output_file}") print(f"Output columns: {list(df_output.columns)}") - + except KeyboardInterrupt: print("\nEvaluation interrupted by user") sys.exit(1) @@ -245,4 +262,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/language/deepseek-r1/run_eval_mpi.py b/language/deepseek-r1/run_eval_mpi.py index 37425526e1..4edeae5f8f 100644 --- a/language/deepseek-r1/run_eval_mpi.py +++ 
b/language/deepseek-r1/run_eval_mpi.py @@ -1,4 +1,11 @@ #!/usr/bin/env python3 +from backends import BaseBackend +from utils.data_utils import load_dataset +from utils.validation import validate_runner_args, ValidationError +from utils.runner_utils import create_base_argument_parser, print_runner_header +from utils.backend_registry import uses_chat_template, get_backend_instance, detect_backend, validate_runner_for_backend +from utils import save_results, generate_timestamped_filename, StandardTokenizer +from backends.pytorch_backend import PyTorchBackend import os import sys import argparse @@ -11,13 +18,6 @@ # Import utilities and backend registry sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from backends.pytorch_backend import PyTorchBackend -from utils import save_results, generate_timestamped_filename, StandardTokenizer -from utils.backend_registry import uses_chat_template, get_backend_instance, detect_backend, validate_runner_for_backend -from utils.runner_utils import create_base_argument_parser, print_runner_header -from utils.validation import validate_runner_args, ValidationError -from utils.data_utils import load_dataset -from backends import BaseBackend def main( @@ -41,7 +41,7 @@ def main( # Detect backend from environment backend_name = detect_backend() - + # Validate backend validate_runner_for_backend('eval_mpi') @@ -49,7 +49,8 @@ def main( use_chat_template = uses_chat_template() # Generate the actual filename with timestamp that will be used for saving - actual_output_file = generate_timestamped_filename(output_pickle_path, add_timestamp=True) + actual_output_file = generate_timestamped_filename( + output_pickle_path, add_timestamp=True) if rank == 0: _print("=" * 80) @@ -63,13 +64,14 @@ def main( _print(f"Sample limit: {num_samples}") if skip_samples: _print(f"Skip samples: {skip_samples}") - _print(f"Chat template: {'enabled' if use_chat_template else 'disabled'} (from registry)") + _print( + f"Chat template: {'enabled' if use_chat_template else 'disabled'} (from registry)") _print("=" * 80) # Initialize PyTorch backend backend = PyTorchBackend() backend.initialize() - + # Initialize StandardTokenizer tokenizer = StandardTokenizer() @@ -82,12 +84,14 @@ def main( _print(f"Loading input DataFrame from {input_pickle_path}...") try: df_for_results = pd.read_pickle(input_pickle_path) - _print(f"Loaded DataFrame with {len(df_for_results)} rows and columns: {df_for_results.columns.tolist()}") - + _print( + f"Loaded DataFrame with {len(df_for_results)} rows and columns: {df_for_results.columns.tolist()}") + # Apply skip_samples if specified if skip_samples > 0: if skip_samples >= len(df_for_results): - _print(f"Error: skip_samples ({skip_samples}) is greater than or equal to total samples ({len(df_for_results)})") + _print( + f"Error: skip_samples ({skip_samples}) is greater than or equal to total samples ({len(df_for_results)})") backend.shutdown() if world_size > 1: dist.destroy_process_group() @@ -96,14 +100,15 @@ def main( df_for_results = df_for_results.iloc[skip_samples:].copy() # Reset index to ensure sequential indices starting from 0 df_for_results = df_for_results.reset_index(drop=True) - + # Apply num_samples limit if specified if num_samples is not None and num_samples < len(df_for_results): - _print(f"Limiting to first {num_samples} samples (out of {len(df_for_results)} total after skipping)") + _print( + f"Limiting to first {num_samples} samples (out of {len(df_for_results)} total after skipping)") df_for_results = 
df_for_results.head(num_samples).copy() # Reset index to ensure sequential indices starting from 0 df_for_results = df_for_results.reset_index(drop=True) - + except Exception as e: _print(f"Error loading input pickle file: {e}") backend.shutdown() @@ -119,21 +124,25 @@ def main( return prompts_text_list = df_for_results['text_input'].tolist() - _print(f"Extracted {len(prompts_text_list)} prompts from 'text_input' column.") + _print( + f"Extracted {len(prompts_text_list)} prompts from 'text_input' column.") # Pre-initialize output columns df_for_results['model_output'] = "" df_for_results['tok_model_output'] = None - df_for_results['tok_model_output'] = df_for_results['tok_model_output'].astype('object') + df_for_results['tok_model_output'] = df_for_results['tok_model_output'].astype( + 'object') df_for_results['tok_model_output_len'] = 0 df_for_results['model_backend'] = backend_name # Broadcast the number of prompts to all ranks if world_size > 1: if rank == 0: - num_prompts_tensor = torch.tensor(len(prompts_text_list), dtype=torch.long, device="cuda") + num_prompts_tensor = torch.tensor( + len(prompts_text_list), dtype=torch.long, device="cuda") else: - num_prompts_tensor = torch.empty(1, dtype=torch.long, device="cuda") + num_prompts_tensor = torch.empty( + 1, dtype=torch.long, device="cuda") dist.broadcast(num_prompts_tensor, src=0) num_total_prompts = num_prompts_tensor.item() else: @@ -148,13 +157,14 @@ def main( current_batch_prompt_tokens = None if rank == 0: - current_batch_prompt_texts = prompts_text_list[i:i+batch_size] + current_batch_prompt_texts = prompts_text_list[i:i + batch_size] # Tokenize on rank 0 using StandardTokenizer current_batch_prompt_tokens, _ = tokenizer.tokenize_prompts( current_batch_prompt_texts, use_chat_template ) - - _print(f"Processing batch {current_batch_num}, size {len(current_batch_prompt_tokens)}") + + _print( + f"Processing batch {current_batch_num}, size {len(current_batch_prompt_tokens)}") # All ranks call generate_batch_distributed generated_tokens_for_batch = backend.generate_batch_distributed( @@ -164,12 +174,14 @@ def main( if rank == 0: # Validate that we received valid tokens if not generated_tokens_for_batch: - raise RuntimeError(f"Backend returned empty tokens for batch {current_batch_num}") - + raise RuntimeError( + f"Backend returned empty tokens for batch {current_batch_num}") + for batch_idx, tokens in enumerate(generated_tokens_for_batch): if not isinstance(tokens, (list, tuple)) or len(tokens) == 0: - raise RuntimeError(f"Backend returned empty or invalid tokens for batch {current_batch_num}, item {batch_idx}: {tokens}") - + raise RuntimeError( + f"Backend returned empty or invalid tokens for batch {current_batch_num}, item {batch_idx}: {tokens}") + # Decode tokens to text using StandardTokenizer decoded_texts_for_batch = tokenizer.batch_decode( generated_tokens_for_batch @@ -183,23 +195,36 @@ def main( original_df_idx = start_index_in_df + batch_idx if original_df_idx < len(df_for_results): # Use at for assignments with list values - df_for_results.at[original_df_idx, 'model_output'] = decoded_texts_for_batch[batch_idx] - df_for_results.at[original_df_idx, 'tok_model_output'] = generated_tokens_for_batch[batch_idx] - df_for_results.at[original_df_idx, 'tok_model_output_len'] = len(generated_tokens_for_batch[batch_idx]) + df_for_results.at[original_df_idx, + 'model_output'] = decoded_texts_for_batch[batch_idx] + df_for_results.at[original_df_idx, + 'tok_model_output'] = generated_tokens_for_batch[batch_idx] + 
df_for_results.at[original_df_idx, 'tok_model_output_len'] = len( + generated_tokens_for_batch[batch_idx]) _print(f"Batch {current_batch_num} completed.") if rank == 0 and df_for_results is not None: _print(f"All batches processed. Saving results...") - + # Keep only required columns in the same order as run_eval.py - output_columns = ['text_input', 'ground_truth', 'question', 'dataset', 'model_output', 'tok_model_output', 'tok_model_output_len', 'model_backend'] + output_columns = [ + 'text_input', + 'ground_truth', + 'question', + 'dataset', + 'model_output', + 'tok_model_output', + 'tok_model_output_len', + 'model_backend'] # Filter to only columns that exist - output_columns = [col for col in output_columns if col in df_for_results.columns] + output_columns = [ + col for col in output_columns if col in df_for_results.columns] df_output = df_for_results[output_columns] - + try: - saved_file = save_results(df_output, output_pickle_path, add_timestamp=True) + saved_file = save_results( + df_output, output_pickle_path, add_timestamp=True) _print(f"Successfully saved results to {saved_file}") except Exception as e: _print(f"Error saving output pickle file: {e}") @@ -234,4 +259,4 @@ def main( args.output_file, args.num_samples, args.skip_samples, - ) \ No newline at end of file + ) diff --git a/language/deepseek-r1/run_mlperf.py b/language/deepseek-r1/run_mlperf.py index 7f484e725e..2345cf5b9b 100755 --- a/language/deepseek-r1/run_mlperf.py +++ b/language/deepseek-r1/run_mlperf.py @@ -1,4 +1,23 @@ #!/usr/bin/env python3 +from eval_accuracy import process_dataframe, print_evaluation_results, process_and_save_dataframe, process_mlperf_log_accuracy +from utils import ( + validate_runner_for_backend, uses_text_input, uses_chat_template, + load_dataset, save_results, print_runner_header, StandardTokenizer, + get_backend_instance, create_base_argument_parser, + setup_output_paths, validate_runner_args, handle_runner_error, + validate_dataset_extended, generate_timestamped_filename +) +from mlperf import ( + OfflineSUT, ServerSUT, BaseSUT, + QuerySampleLibrary, + prepare_mlperf_dataset, + process_mlperf_results, + create_mlperf_output_dataframe +) +from backends import BaseBackend +import pandas as pd +import numpy as np +import mlperf_loadgen as lg import argparse import json import logging @@ -10,26 +29,6 @@ # Disable tokenizers parallelism to avoid forking issues os.environ["TOKENIZERS_PARALLELISM"] = "false" -import mlperf_loadgen as lg -import numpy as np -import pandas as pd - -from backends import BaseBackend -from mlperf import ( - OfflineSUT, ServerSUT, BaseSUT, - QuerySampleLibrary, - prepare_mlperf_dataset, - process_mlperf_results, - create_mlperf_output_dataframe -) -from utils import ( - validate_runner_for_backend, uses_text_input, uses_chat_template, - load_dataset, save_results, print_runner_header, StandardTokenizer, - get_backend_instance, create_base_argument_parser, - setup_output_paths, validate_runner_args, handle_runner_error, - validate_dataset_extended, generate_timestamped_filename -) -from eval_accuracy import process_dataframe, print_evaluation_results, process_and_save_dataframe, process_mlperf_log_accuracy # Configure logging logging.basicConfig( @@ -47,39 +46,39 @@ def create_argument_parser() -> argparse.ArgumentParser: # Scenario selection (no backend argument, auto-detected) parser.add_argument("--mode", type=str, default="offline", - choices=["offline", "server"], - help="MLPerf scenario mode") + choices=["offline", "server"], + help="MLPerf scenario mode") # 
MLPerf configuration parser.add_argument("--mlperf-conf", type=str, default="/inference/mlperf.conf", - help="Path to MLPerf configuration file") + help="Path to MLPerf configuration file") parser.add_argument("--user-conf", type=str, default="mlperf/user.conf", - help="Path to user configuration file") + help="Path to user configuration file") parser.add_argument("--scenario", type=str, default=None, - choices=["Offline", "Server"], - help="MLPerf scenario (overrides --mode)") + choices=["Offline", "Server"], + help="MLPerf scenario (overrides --mode)") parser.add_argument("--accuracy", action="store_true", - help="Run accuracy mode instead of performance") + help="Run accuracy mode instead of performance") # Output configuration parser.add_argument("--output-dir", type=str, default="mlperf_results", - help="Directory for MLPerf output logs") + help="Directory for MLPerf output logs") parser.add_argument("--log-dir", type=str, default=None, - help="Directory for detailed logs") + help="Directory for detailed logs") return parser def configure_loadgen(scenario: str, - accuracy_mode: bool, - mlperf_conf: Optional[str] = None, - user_conf: Optional[str] = None, - log_dir: Optional[str] = None, - model_name: str = "deepseek-r1") -> lg.TestSettings: + accuracy_mode: bool, + mlperf_conf: Optional[str] = None, + user_conf: Optional[str] = None, + log_dir: Optional[str] = None, + model_name: str = "deepseek-r1") -> lg.TestSettings: """Configure LoadGen test settings. Args: @@ -119,9 +118,9 @@ def configure_loadgen(scenario: str, def run_loadgen_test(sut: Union[OfflineSUT, ServerSUT], - qsl: QuerySampleLibrary, - settings: lg.TestSettings, - log_settings: lg.LogSettings) -> None: + qsl: QuerySampleLibrary, + settings: lg.TestSettings, + log_settings: lg.LogSettings) -> None: """Run LoadGen test. 
Args: @@ -162,7 +161,8 @@ def main(): if args.log_dir: log_dir = Path(args.log_dir) else: - log_dir = output_dir / args.mode / ("accuracy" if args.accuracy else "performance") + log_dir = output_dir / args.mode / \ + ("accuracy" if args.accuracy else "performance") log_dir.mkdir(parents=True, exist_ok=True) # Set up output paths with mode information @@ -170,17 +170,21 @@ def main(): if args.output_file is None: # Create output file path in the log directory mode_str = "accuracy" if args.accuracy else "performance" - output_file_base = str(log_dir / f"{backend_name}_mlperf_{args.mode}_{mode_str}_output.pkl") + output_file_base = str( + log_dir / f"{backend_name}_mlperf_{args.mode}_{mode_str}_output.pkl") else: output_file_base = args.output_file - # Generate the actual filename with timestamp that will be used for saving - actual_output_file = generate_timestamped_filename(output_file_base, add_timestamp=True) + # Generate the actual filename with timestamp that will be used for + # saving + actual_output_file = generate_timestamped_filename( + output_file_base, add_timestamp=True) # Ensure the parent directory of the output file exists output_file_parent = Path(actual_output_file).parent output_file_parent.mkdir(parents=True, exist_ok=True) - logger.info(f"Ensured output file directory exists: {output_file_parent}") + logger.info( + f"Ensured output file directory exists: {output_file_parent}") logger.info("=" * 80) logger.info("MLPerf Inference Benchmark Runner (Async Pattern)") @@ -220,13 +224,14 @@ def main(): # For backends that use text prompts, we pass the processed strings # For tokenized backends, we pass the tokenized prompts if uses_text_prompts: - logger.info(f"Backend {backend_name} will use text prompts directly") + logger.info( + f"Backend {backend_name} will use text prompts directly") dataset_for_sut = tokenized_prompts strings_for_sut = processed_strings else: logger.info(f"Backend {backend_name} will use tokenized prompts") dataset_for_sut = tokenized_prompts - strings_for_sut = processed_strings # This is what gets used for generation now + strings_for_sut = processed_strings # This is what gets used for generation now # Create backend using registry logger.info(f"Initializing {backend_name} backend...") @@ -315,7 +320,8 @@ def main(): try: # Get results from SUT - must have valid results if not sut_results: - raise RuntimeError("No results available from SUT - backend failed to generate tokens") + raise RuntimeError( + "No results available from SUT - backend failed to generate tokens") # Process results using new utility processed_results = process_mlperf_results( @@ -347,16 +353,19 @@ def main(): mlperf_log_file = log_dir / "mlperf_log_accuracy.json" if mlperf_log_file.exists(): - logger.info(f"Found MLPerf log accuracy file: {mlperf_log_file}") + logger.info( + f"Found MLPerf log accuracy file: {mlperf_log_file}") logger.info("Using MLPerf log for accuracy evaluation...") # Get checkpoint path from backend configuration backend_config = get_backend_instance(backend_name).config # Determine checkpoint path based on backend type - if hasattr(get_backend_instance(backend_name), 'model_path'): + if hasattr(get_backend_instance( + backend_name), 'model_path'): # PyTorch backend has model_path - checkpoint_path = str(get_backend_instance(backend_name).model_path) + checkpoint_path = str( + get_backend_instance(backend_name).model_path) elif 'model' in backend_config: # Other backends use model name directly checkpoint_path = backend_config['model'] @@ -376,10 +385,13 @@ 
def main(): base_filename="mlperf_accuracy_evaluated.pkl" ) - logger.info(f"MLPerf accuracy evaluation saved to: {evaluated_file}") + logger.info( + f"MLPerf accuracy evaluation saved to: {evaluated_file}") else: - logger.info("No MLPerf log accuracy file found, using standard DataFrame evaluation...") - raise RuntimeError("No MLPerf log accuracy file found, using standard DataFrame evaluation...") + logger.info( + "No MLPerf log accuracy file found, using standard DataFrame evaluation...") + raise RuntimeError( + "No MLPerf log accuracy file found, using standard DataFrame evaluation...") # Ensure clean exit gc.collect() @@ -397,4 +409,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/language/deepseek-r1/run_mlperf_mpi.py b/language/deepseek-r1/run_mlperf_mpi.py index 66196645ed..176be41710 100755 --- a/language/deepseek-r1/run_mlperf_mpi.py +++ b/language/deepseek-r1/run_mlperf_mpi.py @@ -1,4 +1,37 @@ #!/usr/bin/env python3 +from eval_accuracy import process_dataframe, print_evaluation_results, process_and_save_dataframe, process_mlperf_log_accuracy +from utils.data_utils import ( + load_dataset, save_results, + generate_timestamped_filename +) +from utils.validation import ( + validate_runner_args, ValidationError, + validate_dataset_extended +) +from utils.backend_registry import ( + uses_chat_template, get_backend_instance, detect_backend, + validate_runner_for_backend +) +from utils.runner_utils import create_base_argument_parser, print_runner_header +from utils import ( + StandardTokenizer, + validate_dataset, + process_inference_results +) +from mlperf import ( + OfflineSUT, ServerSUT, BaseSUT, + DistributedQuerySampleLibrary, + prepare_mlperf_dataset, + process_mlperf_results, + create_mlperf_output_dataframe +) +from backends.pytorch_backend import PyTorchBackend +from transformers import AutoTokenizer +import torch.distributed as dist +import torch +import pandas as pd +import numpy as np +import mlperf_loadgen as lg import argparse import json import logging @@ -12,41 +45,6 @@ # Disable tokenizers parallelism to avoid forking issues os.environ["TOKENIZERS_PARALLELISM"] = "false" -import mlperf_loadgen as lg -import numpy as np -import pandas as pd -import torch -import torch.distributed as dist -from transformers import AutoTokenizer - -from backends.pytorch_backend import PyTorchBackend -from mlperf import ( - OfflineSUT, ServerSUT, BaseSUT, - DistributedQuerySampleLibrary, - prepare_mlperf_dataset, - process_mlperf_results, - create_mlperf_output_dataframe -) -from utils import ( - StandardTokenizer, - validate_dataset, - process_inference_results -) -from utils.runner_utils import create_base_argument_parser, print_runner_header -from utils.backend_registry import ( - uses_chat_template, get_backend_instance, detect_backend, - validate_runner_for_backend -) -from utils.validation import ( - validate_runner_args, ValidationError, - validate_dataset_extended -) -from utils.data_utils import ( - load_dataset, save_results, - generate_timestamped_filename -) -from eval_accuracy import process_dataframe, print_evaluation_results, process_and_save_dataframe, process_mlperf_log_accuracy - # Configure logging - only for rank 0 def setup_logging(rank: int): @@ -119,7 +117,7 @@ def issue_queries(self, query_samples: List[lg.QuerySample]) -> None: batch_size = self.backend.config['batch_size'] for i in range(0, len(query_samples), batch_size): - batch_samples = query_samples[i:i+batch_size] + batch_samples = query_samples[i:i + 
batch_size] # Prepare batch tokens batch_tokens = [] @@ -141,10 +139,12 @@ def issue_queries(self, query_samples: List[lg.QuerySample]) -> None: # Generate using distributed backend # This will broadcast to all ranks internally - generated_tokens = self.backend.generate_batch_distributed(batch_tokens) + generated_tokens = self.backend.generate_batch_distributed( + batch_tokens) # Process results and send to LoadGen - for j, (sample_id, tokens) in enumerate(zip(batch_ids, generated_tokens)): + for j, (sample_id, tokens) in enumerate( + zip(batch_ids, generated_tokens)): # Create a copy of tokens before numpy conversion tokens_copy = tokens.copy() @@ -219,7 +219,8 @@ def get_results(self) -> List[Dict[str, Any]]: # Decode tokens to get text output output_text = '' if self.backend.tokenizer: - output_text = self.backend.tokenizer.decode(tokens, skip_special_tokens=True) + output_text = self.backend.tokenizer.decode( + tokens, skip_special_tokens=True) ordered_results.append({ 'model_output': output_text, @@ -228,16 +229,16 @@ def get_results(self) -> List[Dict[str, Any]]: }) else: # Result exists but no tokens - this is an error - raise RuntimeError(f"No tokens in result for dataset index {i}, sample_id {sample_id}") + raise RuntimeError( + f"No tokens in result for dataset index {i}, sample_id {sample_id}") else: # No result for this index - this is an error - raise RuntimeError(f"No result for dataset index {i}, sample_id {sample_id}") + raise RuntimeError( + f"No result for dataset index {i}, sample_id {sample_id}") return ordered_results - - def create_argument_parser() -> argparse.ArgumentParser: """Create argument parser for distributed MLPerf runner.""" parser = argparse.ArgumentParser( @@ -247,44 +248,45 @@ def create_argument_parser() -> argparse.ArgumentParser: # Dataset arguments parser.add_argument("--input-file", type=str, - default="data/final_output.pkl", - help="Input pickle file with prompts") + default="data/final_output.pkl", + help="Input pickle file with prompts") # MLPerf configuration parser.add_argument("--mlperf-conf", type=str, default="/inference/mlperf.conf", - help="Path to MLPerf configuration file") + help="Path to MLPerf configuration file") parser.add_argument("--user-conf", type=str, default="mlperf/user.conf", - help="Path to user configuration file") + help="Path to user configuration file") parser.add_argument("--mode", type=str, default="offline", - choices=["offline", "server"], - help="MLPerf scenario mode (only offline supported for distributed)") + choices=["offline", "server"], + help="MLPerf scenario mode (only offline supported for distributed)") parser.add_argument("--accuracy", action="store_true", - help="Run accuracy mode instead of performance") + help="Run accuracy mode instead of performance") # Output configuration parser.add_argument("--output-dir", type=str, default="mlperf_results", - help="Directory for MLPerf output logs") + help="Directory for MLPerf output logs") parser.add_argument("--log-dir", type=str, default=None, - help="Directory for detailed logs") + help="Directory for detailed logs") parser.add_argument("--output-file", type=str, default=None, - help="Output pickle file path (auto-generated if not specified)") + help="Output pickle file path (auto-generated if not specified)") - # Note: --no-chat-template is removed (chat template usage determined by backend registry) + # Note: --no-chat-template is removed (chat template usage determined by + # backend registry) return parser def configure_loadgen(scenario: str, - 
accuracy_mode: bool, - mlperf_conf: Optional[str] = None, - user_conf: Optional[str] = None, - log_dir: Optional[str] = None, - model_name: str = "deepseek-r1") -> lg.TestSettings: + accuracy_mode: bool, + mlperf_conf: Optional[str] = None, + user_conf: Optional[str] = None, + log_dir: Optional[str] = None, + model_name: str = "deepseek-r1") -> lg.TestSettings: """Configure LoadGen test settings. Args: @@ -324,11 +326,11 @@ def configure_loadgen(scenario: str, def run_loadgen_test(sut: DistributedOfflineSUT, - qsl: DistributedQuerySampleLibrary, - settings: lg.TestSettings, - log_settings: lg.LogSettings, - rank: int, - logger) -> None: + qsl: DistributedQuerySampleLibrary, + settings: lg.TestSettings, + log_settings: lg.LogSettings, + rank: int, + logger) -> None: """Run LoadGen test (only on rank 0). Args: @@ -386,7 +388,8 @@ def main(): # Validate mode for distributed if args.mode != "offline": if rank == 0: - logger.error("Only offline mode is supported for distributed execution") + logger.error( + "Only offline mode is supported for distributed execution") sys.exit(1) # Create output directories (only rank 0) @@ -397,7 +400,8 @@ def main(): if args.log_dir: log_dir = Path(args.log_dir) else: - log_dir = output_dir / args.mode / ("accuracy" if args.accuracy else "performance") + log_dir = output_dir / args.mode / \ + ("accuracy" if args.accuracy else "performance") log_dir.mkdir(parents=True, exist_ok=True) # Determine output file path @@ -405,15 +409,18 @@ def main(): output_file_base = args.output_file else: mode_str = "accuracy" if args.accuracy else "performance" - output_file_base = str(log_dir / f"{backend_name}_mlperf_{args.mode}_{mode_str}_output.pkl") + output_file_base = str( + log_dir / f"{backend_name}_mlperf_{args.mode}_{mode_str}_output.pkl") # Generate the actual filename with timestamp - actual_output_file = generate_timestamped_filename(output_file_base, add_timestamp=True) + actual_output_file = generate_timestamped_filename( + output_file_base, add_timestamp=True) # Ensure the parent directory of the output file exists output_file_parent = Path(actual_output_file).parent output_file_parent.mkdir(parents=True, exist_ok=True) - logger.info(f"Ensured output file directory exists: {output_file_parent}") + logger.info( + f"Ensured output file directory exists: {output_file_parent}") logger.info("=" * 80) logger.info("MLPerf Inference Benchmark Runner (Distributed PyTorch)") @@ -425,7 +432,8 @@ def main(): logger.info(f"Input file: {args.input_file}") logger.info(f"Output directory: {output_dir}") logger.info(f"Output file: {actual_output_file}") - logger.info(f"Chat template: {'enabled' if use_chat_template else 'disabled'} (from registry)") + logger.info( + f"Chat template: {'enabled' if use_chat_template else 'disabled'} (from registry)") logger.info("=" * 80) else: log_dir = None @@ -460,7 +468,8 @@ def main(): tokenized_prompts = dataset_info['tokenized_prompts'] processed_strings = dataset_info['processed_strings'] - logger.info(f"Loaded {len(tokenized_prompts)} prompts from dataset") + logger.info( + f"Loaded {len(tokenized_prompts)} prompts from dataset") # Create SUT sut = DistributedOfflineSUT( @@ -511,7 +520,8 @@ def main(): if rank == 0: # Run test (only rank 0) logger.info("Running test...") - run_loadgen_test(sut, qsl, settings, log_settings, rank, logger) + run_loadgen_test( + sut, qsl, settings, log_settings, rank, logger) logger.info("Completed test...") # Ensure all queries are flushed and async operations complete @@ -524,7 +534,8 @@ def main(): 
dist.broadcast_object_list(exit_signal, src=0) else: # Non-rank 0 processes participate in distributed generation - # They wait for signals from rank 0 and participate in generate_batch_distributed + # They wait for signals from rank 0 and participate in + # generate_batch_distributed while True: # First, check if we should exit # We use a separate broadcast to signal exit @@ -536,7 +547,8 @@ def main(): break elif exit_check[0] == "generate": # Signal to participate in generation - # The actual batch tokens will be broadcast inside generate_batch_distributed + # The actual batch tokens will be broadcast inside + # generate_batch_distributed backend.generate_batch_distributed(None) # If exit_check[0] is None, continue waiting finally: @@ -563,9 +575,11 @@ def main(): try: # Get results from SUT (if available) - logger.info("Retrieving results from distributed SUT...") + logger.info( + "Retrieving results from distributed SUT...") sut_results = sut.get_results() - logger.info(f"Retrieved {len(sut_results)} results from distributed SUT") + logger.info( + f"Retrieved {len(sut_results)} results from distributed SUT") # Process results using new utility processed_results = process_mlperf_results( @@ -597,11 +611,19 @@ def main(): mlperf_log_file = log_dir / "mlperf_log_accuracy.json" if mlperf_log_file.exists(): - logger.info(f"Found MLPerf log accuracy file: {mlperf_log_file}") - logger.info("Using MLPerf log for accuracy evaluation...") - - # For PyTorch backend (only one supported in MPI), get model path - checkpoint_path = str(backend.model_path) if hasattr(backend, 'model_path') else backend.config.get('model_name', 'deepseek-ai/DeepSeek-R1') + logger.info( + f"Found MLPerf log accuracy file: {mlperf_log_file}") + logger.info( + "Using MLPerf log for accuracy evaluation...") + + # For PyTorch backend (only one supported in MPI), + # get model path + checkpoint_path = str( + backend.model_path) if hasattr( + backend, + 'model_path') else backend.config.get( + 'model_name', + 'deepseek-ai/DeepSeek-R1') # Process MLPerf log accuracy df_evaluated, evaluated_file = process_mlperf_log_accuracy( @@ -612,10 +634,13 @@ def main(): base_filename="mlperf_accuracy_evaluated.pkl" ) - logger.info(f"MLPerf accuracy evaluation saved to: {evaluated_file}") + logger.info( + f"MLPerf accuracy evaluation saved to: {evaluated_file}") else: - logger.info("No MLPerf log accuracy file found, using standard DataFrame evaluation...") - raise RuntimeError("No MLPerf log accuracy file found, using standard DataFrame evaluation...") + logger.info( + "No MLPerf log accuracy file found, using standard DataFrame evaluation...") + raise RuntimeError( + "No MLPerf log accuracy file found, using standard DataFrame evaluation...") except KeyboardInterrupt: if rank == 0: @@ -639,4 +664,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/language/deepseek-r1/utils/__init__.py b/language/deepseek-r1/utils/__init__.py index ce8d10e9fd..65f575da29 100644 --- a/language/deepseek-r1/utils/__init__.py +++ b/language/deepseek-r1/utils/__init__.py @@ -101,4 +101,4 @@ # Error handling 'handle_backend_error', 'handle_runner_error' -] \ No newline at end of file +] diff --git a/language/deepseek-r1/utils/backend_registry.py b/language/deepseek-r1/utils/backend_registry.py index 73aee587e6..aa8d62a18e 100644 --- a/language/deepseek-r1/utils/backend_registry.py +++ b/language/deepseek-r1/utils/backend_registry.py @@ -173,7 +173,8 @@ def validate_backend(backend: str) -> None: f"Unknown 
backend '{backend}'. Supported backends: {', '.join(supported_backends)}") -def _get_compatibility_error_message(backend: str, runner_type: str, compatible: List[str]) -> str: +def _get_compatibility_error_message( + backend: str, runner_type: str, compatible: List[str]) -> str: """ Generate error message for incompatible backend/runner combinations. @@ -401,7 +402,8 @@ def get_backend_instance(backend_name: Optional[str] = None): return backend_class() -def is_backend_compatible_with_runner(backend_name: Optional[str] = None, runner_type: str = None) -> bool: +def is_backend_compatible_with_runner( + backend_name: Optional[str] = None, runner_type: str = None) -> bool: """Check if a backend is compatible with a specific runner type. Args: @@ -441,7 +443,8 @@ def get_backend_env_vars(backend_name: Optional[str] = None) -> Dict[str, str]: # Get static env vars env_vars = BACKEND_REGISTRY[backend_name]['env_vars'].copy() - # Handle dynamic env vars (e.g., OMP_NUM_THREADS based on tensor_parallel_size) + # Handle dynamic env vars (e.g., OMP_NUM_THREADS based on + # tensor_parallel_size) if backend_name == 'vllm': config = get_backend_config(backend_name) env_vars['OMP_NUM_THREADS'] = str( @@ -461,4 +464,4 @@ def apply_backend_env_vars(backend_name: Optional[str] = None) -> None: env_vars = get_backend_env_vars(backend_name) for key, value in env_vars.items(): - os.environ[key] = value \ No newline at end of file + os.environ[key] = value diff --git a/language/deepseek-r1/utils/data_utils.py b/language/deepseek-r1/utils/data_utils.py index 80acb5c8ce..0eb4cd3dcd 100644 --- a/language/deepseek-r1/utils/data_utils.py +++ b/language/deepseek-r1/utils/data_utils.py @@ -15,54 +15,56 @@ from utils.validation import ValidationError, validate_dataset_extended -def generate_timestamped_filename(output_file: str, add_timestamp: bool = True) -> str: +def generate_timestamped_filename( + output_file: str, add_timestamp: bool = True) -> str: """ Generate the actual filename that will be used when saving, with timestamp if requested. - + Args: output_file: Base output file path add_timestamp: Whether to add timestamp to filename - + Returns: Actual filename that will be used for saving """ if not add_timestamp: return output_file - + timestamp_suffix = time.strftime("%Y%m%d_%H%M%S") base_name, ext = os.path.splitext(output_file) return f"{base_name}_{timestamp_suffix}{ext}" -def load_dataset(file_path: str, num_samples: Optional[int] = None, skip_samples: int = 0) -> pd.DataFrame: +def load_dataset( + file_path: str, num_samples: Optional[int] = None, skip_samples: int = 0) -> pd.DataFrame: """ Load dataset from pickle file. 
- + Args: file_path: Path to the pickle file num_samples: Optional limit on number of samples to load skip_samples: Number of samples to skip from the beginning - + Returns: Loaded DataFrame - + Raises: ValidationError: If file doesn't exist or validation fails Exception: If file can't be loaded """ if not os.path.exists(file_path): raise ValidationError(f"Input file not found: {file_path}") - + print(f"Loading dataset from {file_path}...") - + try: with open(file_path, "rb") as f: df = pd.read_pickle(f) except Exception as e: raise ValidationError(f"Failed to load dataset: {str(e)}") - + print(f"Loaded {len(df)} samples") - + # Skip samples if specified if skip_samples > 0: if skip_samples >= len(df): @@ -71,31 +73,33 @@ def load_dataset(file_path: str, num_samples: Optional[int] = None, skip_samples ) original_length = len(df) df = df.iloc[skip_samples:].reset_index(drop=True) - print(f"Skipped first {skip_samples} samples (from {original_length} total)") - + print( + f"Skipped first {skip_samples} samples (from {original_length} total)") + # Limit number of samples if specified if num_samples is not None: original_length = len(df) df = df.head(num_samples) - print(f"Limited to {len(df)} samples (from {original_length} total after skipping)") - + print( + f"Limited to {len(df)} samples (from {original_length} total after skipping)") + return df -def save_results(df: pd.DataFrame, - output_file: str, - add_timestamp: bool = True) -> str: +def save_results(df: pd.DataFrame, + output_file: str, + add_timestamp: bool = True) -> str: """ Save results DataFrame to pickle file. - + Args: df: DataFrame to save output_file: Output file path add_timestamp: Whether to add timestamp to filename - + Returns: Actual output file path used - + Raises: ValidationError: If save operation fails """ @@ -104,93 +108,99 @@ def save_results(df: pd.DataFrame, timestamp_suffix = time.strftime("%Y%m%d_%H%M%S") base_name, ext = os.path.splitext(output_file) output_file = f"{base_name}_{timestamp_suffix}{ext}" - + # Ensure output directory exists os.makedirs(os.path.dirname(output_file), exist_ok=True) - + print(f"Saving results to {output_file}...") - + # Reset index before saving df_to_save = df.reset_index(drop=True) - + try: with open(output_file, "wb") as f: pickle.dump(df_to_save, f) - print(f"Save completed: {len(df_to_save)} samples saved to {output_file}") + print( + f"Save completed: {len(df_to_save)} samples saved to {output_file}") except Exception as e: raise ValidationError(f"Failed to save results: {str(e)}") - + return output_file -def prepare_output_dataframe(input_df: pd.DataFrame, - backend_name: Optional[str] = None) -> pd.DataFrame: +def prepare_output_dataframe(input_df: pd.DataFrame, + backend_name: Optional[str] = None) -> pd.DataFrame: """ Prepare output DataFrame by cleaning up old columns. - + Args: input_df: Input DataFrame backend_name: Optional backend name override. If None, uses MLPERF_BACKEND env var. 
- + Returns: Cleaned DataFrame ready for new results """ if backend_name is None: from utils.backend_registry import detect_backend backend_name = detect_backend() - + df_output = input_df.copy() - + # Define columns to drop (old model outputs and unwanted columns) columns_to_drop = [ # specify columns to drop here ] - + # Also drop any existing backend-specific columns - backend_columns = [col for col in df_output.columns if col.startswith(f'{backend_name}_')] + backend_columns = [ + col for col in df_output.columns if col.startswith(f'{backend_name}_')] columns_to_drop.extend(backend_columns) - + # Drop columns that exist df_output = df_output.drop( columns=[col for col in columns_to_drop if col in df_output.columns] ) - + return df_output -def add_standardized_columns(df: pd.DataFrame, - results: List[Dict[str, Any]], - tokenized_prompts: List[List[int]] = None) -> pd.DataFrame: +def add_standardized_columns(df: pd.DataFrame, + results: List[Dict[str, Any]], + tokenized_prompts: List[List[int]] = None) -> pd.DataFrame: """ Add standardized output columns to DataFrame. - + Args: df: Input DataFrame results: List of result dictionaries from backend tokenized_prompts: List of tokenized input prompts (deprecated, not used) - + Returns: DataFrame with added standardized columns """ # Add results columns with new naming convention df['model_output'] = [r.get('model_output', '') for r in results] df['tok_model_output'] = [r.get('tok_model_output', []) for r in results] - df['tok_model_output_len'] = [r.get('tok_model_output_len', 0) for r in results] + df['tok_model_output_len'] = [ + r.get( + 'tok_model_output_len', + 0) for r in results] df['model_backend'] = [r.get('model_backend', '') for r in results] - + return df -def validate_dataset(df: pd.DataFrame, backend_name: Optional[str] = None) -> None: +def validate_dataset(df: pd.DataFrame, + backend_name: Optional[str] = None) -> None: """ Validate that the dataset has required columns. - + Args: df: DataFrame to validate backend_name: Optional backend name override. If None, uses MLPERF_BACKEND env var. - + Raises: ValidationError: If required columns are missing or validation fails """ # Use centralized validation function - validate_dataset_extended(df, backend_name) \ No newline at end of file + validate_dataset_extended(df, backend_name) diff --git a/language/deepseek-r1/utils/error_handling.py b/language/deepseek-r1/utils/error_handling.py index 54ca580135..6b588b9c20 100644 --- a/language/deepseek-r1/utils/error_handling.py +++ b/language/deepseek-r1/utils/error_handling.py @@ -5,17 +5,18 @@ from .validation import BackendError, ValidationError -def handle_backend_error(e: Exception, backend_name: str, operation: str) -> None: +def handle_backend_error(e: Exception, backend_name: str, + operation: str) -> None: """ Standardized error handling for backend operations. - + Args: e: The exception that occurred backend_name: Name of the backend operation: Description of the operation that failed """ error_msg = f"\n[{backend_name.upper()}] Error during {operation}: {type(e).__name__}: {str(e)}" - + if isinstance(e, (RuntimeError, ValueError)): # Known errors - just print the message print(error_msg) @@ -28,7 +29,7 @@ def handle_backend_error(e: Exception, backend_name: str, operation: str) -> Non def handle_runner_error(e: Exception, runner_name: str) -> None: """ Standardized error handling for runners. 
- + Args: e: The exception that occurred runner_name: Name of the runner @@ -45,4 +46,4 @@ def handle_runner_error(e: Exception, runner_name: str) -> None: else: print(f"\n{runner_name} failed: {e}") traceback.print_exc() - sys.exit(1) \ No newline at end of file + sys.exit(1) diff --git a/language/deepseek-r1/utils/runner_utils.py b/language/deepseek-r1/utils/runner_utils.py index 8c90deb515..fc2d4ad7f2 100644 --- a/language/deepseek-r1/utils/runner_utils.py +++ b/language/deepseek-r1/utils/runner_utils.py @@ -12,29 +12,31 @@ def create_base_argument_parser(description: str) -> argparse.ArgumentParser: description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter ) - + # Common dataset arguments parser.add_argument("--input-file", type=str, - default="data/final_output.pkl", - help="Input pickle file with prompts") - + default="data/final_output.pkl", + help="Input pickle file with prompts") + parser.add_argument("--output-file", type=str, default=None, - help="Output pickle file path (auto-generated if not specified)") - + help="Output pickle file path (auto-generated if not specified)") + parser.add_argument("--num-samples", type=int, default=None, - help="Number of samples to process from dataset") - + help="Number of samples to process from dataset") + parser.add_argument("--skip-samples", type=int, default=0, - help="Number of samples to skip from the beginning") - - # NOTE: --no-chat-template flag is NOT included (chat template usage determined by backend registry) - + help="Number of samples to skip from the beginning") + + # NOTE: --no-chat-template flag is NOT included (chat template usage + # determined by backend registry) + return parser -def print_runner_header(runner_name: str, backend_name: Optional[str] = None, args: argparse.Namespace = None) -> None: +def print_runner_header( + runner_name: str, backend_name: Optional[str] = None, args: argparse.Namespace = None) -> None: """Print standardized header for runners. - + Args: runner_name: Name of the runner backend_name: Optional backend name override. If None, uses MLPERF_BACKEND env var. @@ -43,7 +45,7 @@ def print_runner_header(runner_name: str, backend_name: Optional[str] = None, ar if backend_name is None: from .backend_registry import detect_backend backend_name = detect_backend() - + print("=" * 80) print(f"{runner_name}") print("=" * 80) @@ -59,22 +61,23 @@ def print_runner_header(runner_name: str, backend_name: Optional[str] = None, ar print("=" * 80) -def setup_output_paths(args: argparse.Namespace, backend_name: Optional[str] = None, mode: Optional[str] = None) -> Tuple[Path, str]: +def setup_output_paths(args: argparse.Namespace, + backend_name: Optional[str] = None, mode: Optional[str] = None) -> Tuple[Path, str]: """ Set up output directories and file paths. - + Args: args: Parsed command line arguments backend_name: Optional backend name override. If None, uses MLPERF_BACKEND env var. 
mode: Optional mode (e.g., 'offline', 'server' for MLPerf) - + Returns: Tuple of (output_dir, output_file_path) """ if backend_name is None: from .backend_registry import detect_backend backend_name = detect_backend() - + # Determine output directory if hasattr(args, 'output_dir') and args.output_dir: output_dir = Path(args.output_dir) @@ -84,9 +87,9 @@ def setup_output_paths(args: argparse.Namespace, backend_name: Optional[str] = N output_dir = Path(f"outputs/{backend_name}/{mode}") else: output_dir = Path(f"outputs/{backend_name}") - + output_dir.mkdir(parents=True, exist_ok=True) - + # Determine output file path if args.output_file: output_file = args.output_file @@ -97,10 +100,13 @@ def setup_output_paths(args: argparse.Namespace, backend_name: Optional[str] = N suffix = f"_{args.num_samples}samples" else: suffix = "_full" - + if mode: - output_file = str(output_dir / f"{backend_name}_{mode}_output_{timestamp}{suffix}.pkl") + output_file = str( + output_dir / + f"{backend_name}_{mode}_output_{timestamp}{suffix}.pkl") else: - output_file = str(output_dir / f"{backend_name}_output_{timestamp}{suffix}.pkl") - - return output_dir, output_file \ No newline at end of file + output_file = str(output_dir / + f"{backend_name}_output_{timestamp}{suffix}.pkl") + + return output_dir, output_file diff --git a/language/deepseek-r1/utils/tokenization.py b/language/deepseek-r1/utils/tokenization.py index c5fa77d69d..ec67e1e2eb 100644 --- a/language/deepseek-r1/utils/tokenization.py +++ b/language/deepseek-r1/utils/tokenization.py @@ -7,15 +7,15 @@ class StandardTokenizer: """Standard tokenizer for DeepSeek models.""" - + # Standard configuration used across all runners DEFAULT_MODEL = "deepseek-ai/DeepSeek-R1" DEFAULT_MAX_LENGTH = 32 * 1024 - + def __init__(self, model_name: str = None, max_length: int = None): """ Initialize tokenizer. - + Args: model_name: HuggingFace model name max_length: Maximum sequence length @@ -23,50 +23,54 @@ def __init__(self, model_name: str = None, max_length: int = None): self.model_name = model_name or self.DEFAULT_MODEL self.max_length = max_length or self.DEFAULT_MAX_LENGTH self._tokenizer = None - + @property def tokenizer(self): """Lazy load tokenizer.""" if self._tokenizer is None: print(f"Loading tokenizer: {self.model_name}") - self._tokenizer = AutoTokenizer.from_pretrained(self.model_name, revision="56d4cbbb4d29f4355bab4b9a39ccb717a14ad5ad") + self._tokenizer = AutoTokenizer.from_pretrained( + self.model_name, revision="56d4cbbb4d29f4355bab4b9a39ccb717a14ad5ad") return self._tokenizer - - def tokenize_prompts(self, prompts: List[str], - use_chat_template: Optional[bool] = None, - backend_name: Optional[str] = None) -> Tuple[List[List[int]], List[str]]: + + def tokenize_prompts(self, prompts: List[str], + use_chat_template: Optional[bool] = None, + backend_name: Optional[str] = None) -> Tuple[List[List[int]], List[str]]: """ Tokenize prompts with backend-specific handling. - + Args: prompts: List of text prompts use_chat_template: Whether to use chat template (if None and backend_name provided, uses registry) backend_name: Optional backend name override. If None, uses MLPERF_BACKEND env var. 
- + Returns: Tuple of (tokenized_prompts, processed_strings) """ # Auto-detect backend if not provided if backend_name is None: backend_name = detect_backend() - + # Determine chat template usage from registry if backend_name provided if use_chat_template is None: use_chat_template = uses_chat_template(backend_name) - print(f"[{backend_name}] Using chat template from registry: {use_chat_template}") - + print( + f"[{backend_name}] Using chat template from registry: {use_chat_template}") + tokenized = [] processed_strings = [] - + for prompt in prompts: - if use_chat_template and hasattr(self.tokenizer, 'apply_chat_template'): + if use_chat_template and hasattr( + self.tokenizer, 'apply_chat_template'): tokens = self.tokenizer.apply_chat_template( [{"role": "user", "content": prompt}], add_generation_prompt=True, max_length=self.max_length, truncation=True ) - processed_string = self.tokenizer.decode(tokens, skip_special_tokens=False) + processed_string = self.tokenizer.decode( + tokens, skip_special_tokens=False) else: tokens = self.tokenizer.encode( prompt, @@ -74,49 +78,52 @@ def tokenize_prompts(self, prompts: List[str], max_length=self.max_length ) processed_string = prompt - + tokenized.append(tokens) processed_strings.append(processed_string) - + return tokenized, processed_strings - - def decode_tokens(self, tokens: List[int], skip_special_tokens: bool = True) -> str: + + def decode_tokens(self, tokens: List[int], + skip_special_tokens: bool = True) -> str: """Decode tokens to text.""" - return self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens) - - def batch_decode(self, token_lists: List[List[int]], - skip_special_tokens: bool = True) -> List[str]: + return self.tokenizer.decode( + tokens, skip_special_tokens=skip_special_tokens) + + def batch_decode(self, token_lists: List[List[int]], + skip_special_tokens: bool = True) -> List[str]: """Batch decode multiple token lists.""" - return self.tokenizer.batch_decode(token_lists, skip_special_tokens=skip_special_tokens) + return self.tokenizer.batch_decode( + token_lists, skip_special_tokens=skip_special_tokens) -def process_inference_results(raw_results: List[dict], - tokenizer: Optional[StandardTokenizer] = None, - backend_name: Optional[str] = None, - uses_text_prompts: bool = False) -> List[dict]: +def process_inference_results(raw_results: List[dict], + tokenizer: Optional[StandardTokenizer] = None, + backend_name: Optional[str] = None, + uses_text_prompts: bool = False) -> List[dict]: """ Process raw inference results into standardized format. - + Args: raw_results: Raw results from backend tokenizer: Tokenizer for decoding backend_name: Optional backend name override. If None, uses MLPERF_BACKEND env var. 
uses_text_prompts: Whether backend uses text prompts - + Returns: List of standardized result dictionaries """ # Auto-detect backend if not provided if backend_name is None: backend_name = detect_backend() - + if backend_name not in get_supported_backends(): raise ValueError(f"Backend {backend_name} is not supported") - + backend_config = get_backend_config(backend_name) - + standardized_results = [] - + for raw_result in raw_results: # Handle text-prompt backends if uses_text_prompts and 'text' in raw_result: @@ -129,9 +136,9 @@ def process_inference_results(raw_results: List[dict], if tokenizer and tokens: try: text = tokenizer.decode_tokens(tokens) - except: + except BaseException: pass - + standardized = { 'model_output': text, 'tok_model_output': tokens, @@ -139,5 +146,5 @@ def process_inference_results(raw_results: List[dict], 'model_backend': backend_name, } standardized_results.append(standardized) - - return standardized_results \ No newline at end of file + + return standardized_results diff --git a/language/deepseek-r1/utils/validation.py b/language/deepseek-r1/utils/validation.py index 29bebef4f1..768427ada1 100644 --- a/language/deepseek-r1/utils/validation.py +++ b/language/deepseek-r1/utils/validation.py @@ -12,8 +12,10 @@ class BackendError(RuntimeError): class BackendNotInitializedError(BackendError): """Raised when backend operation is called before initialization.""" + def __init__(self, backend_name: str = "Backend"): - super().__init__(f"{backend_name} not initialized. Call initialize() first.") + super().__init__( + f"{backend_name} not initialized. Call initialize() first.") class ValidationError(ValueError): @@ -33,9 +35,9 @@ def wrapper(self, *args, **kwargs): def validate_prompts_input(backend_name: Optional[str] = None, - tokenized_prompts: Optional[List[List[int]]] = None, - text_prompts: Optional[List[str]] = None, - input_type: str = None) -> None: + tokenized_prompts: Optional[List[List[int]]] = None, + text_prompts: Optional[List[str]] = None, + input_type: str = None) -> None: """ Centralized prompt validation with backend-specific requirements. 
@@ -53,13 +55,16 @@ def validate_prompts_input(backend_name: Optional[str] = None, backend_name = detect_backend() if tokenized_prompts is None and text_prompts is None: - raise ValidationError(f"{backend_name} backend requires either text_prompts or tokenized_prompts") + raise ValidationError( + f"{backend_name} backend requires either text_prompts or tokenized_prompts") if input_type == 'text' and tokenized_prompts is not None and text_prompts is None: - raise ValidationError(f"{backend_name} backend requires text_prompts, not tokenized_prompts") + raise ValidationError( + f"{backend_name} backend requires text_prompts, not tokenized_prompts") if input_type == 'tokenized' and text_prompts is not None and tokenized_prompts is None: - raise ValidationError(f"{backend_name} backend requires tokenized_prompts, not text_prompts") + raise ValidationError( + f"{backend_name} backend requires tokenized_prompts, not text_prompts") # Additional validation for tokenized prompts if tokenized_prompts is not None: @@ -67,9 +72,11 @@ def validate_prompts_input(backend_name: Optional[str] = None, raise ValidationError("tokenized_prompts cannot be empty") for i, prompt in enumerate(tokenized_prompts): if not isinstance(prompt, list): - raise ValidationError(f"tokenized_prompts[{i}] must be a list of integers") + raise ValidationError( + f"tokenized_prompts[{i}] must be a list of integers") if not prompt: - raise ValidationError(f"tokenized_prompts[{i}] cannot be empty") + raise ValidationError( + f"tokenized_prompts[{i}] cannot be empty") # Additional validation for text prompts if text_prompts is not None: @@ -81,8 +88,8 @@ def validate_prompts_input(backend_name: Optional[str] = None, def validate_dataset_extended(df: pd.DataFrame, - backend_name: Optional[str] = None, - required_columns: Optional[List[str]] = None) -> None: + backend_name: Optional[str] = None, + required_columns: Optional[List[str]] = None) -> None: """ Extended dataset validation with backend-specific requirements. 
@@ -101,9 +108,11 @@ def validate_dataset_extended(df: pd.DataFrame, if required_columns is None: required_columns = ['text_input'] - missing_columns = [col for col in required_columns if col not in df.columns] + missing_columns = [ + col for col in required_columns if col not in df.columns] if missing_columns: - raise ValidationError(f"Dataset missing required columns: {missing_columns}") + raise ValidationError( + f"Dataset missing required columns: {missing_columns}") # Check for empty prompts empty_prompts = df['text_input'].isna().sum() @@ -118,7 +127,8 @@ def validate_dataset_extended(df: pd.DataFrame, config = get_backend_config(backend_name) # Add backend-specific validation based on config if needed - print(f"Dataset validation passed: {len(df)} samples with required columns") + print( + f"Dataset validation passed: {len(df)} samples with required columns") def validate_runner_args(args: argparse.Namespace, runner_type: str) -> None: @@ -133,7 +143,8 @@ def validate_runner_args(args: argparse.Namespace, runner_type: str) -> None: ValidationError: If validation fails """ # Common validations - if hasattr(args, 'num_samples') and args.num_samples is not None and args.num_samples <= 0: + if hasattr( + args, 'num_samples') and args.num_samples is not None and args.num_samples <= 0: raise ValidationError("--num-samples must be positive") if hasattr(args, 'skip_samples') and args.skip_samples < 0: @@ -142,4 +153,5 @@ def validate_runner_args(args: argparse.Namespace, runner_type: str) -> None: # Runner-specific validations if runner_type in ['mlperf', 'mlperf_mpi']: if hasattr(args, 'mode') and args.mode not in ['offline', 'server']: - raise ValidationError(f"Invalid mode: {args.mode}. Must be 'offline' or 'server'") \ No newline at end of file + raise ValidationError( + f"Invalid mode: {args.mode}. Must be 'offline' or 'server'") diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index fa3ae51514..40c83eecc5 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -482,7 +482,7 @@ "rgat": 788379, "pointpainting": 1024, }, - "dataset-size" : { + "dataset-size": { "resnet": 50000, "retinanet": 24781, "bert-99": 10833, @@ -1023,7 +1023,7 @@ def get_min_query_count(self, model, scenario): if model not in self.min_queries: raise ValueError("model not known: " + model) return self.min_queries[model].get(scenario) - + def get_dataset_size(self, model): model = self.get_mlperf_model(model) if model not in self.dataset_size: @@ -2292,9 +2292,12 @@ def log_result( # Check for calibration documentation if not config.skip_calibration_check and division not in ["open"]: - calibration_path_root = os.path.join(division, submitter, "calibration.md") - calibration_path_doc = os.path.join(division, submitter, "documentation", "calibration.md") - if not (os.path.exists(calibration_path_root)) and (not os.path.exists(calibration_path_doc)): + calibration_path_root = os.path.join( + division, submitter, "calibration.md") + calibration_path_doc = os.path.join( + division, submitter, "documentation", "calibration.md") + if not (os.path.exists(calibration_path_root)) and ( + not os.path.exists(calibration_path_doc)): log.error( "%s/%s: has not calibration file. 
One of %s or %s is required", division, @@ -3248,8 +3251,8 @@ def main(): args.extra_model_benchmark_map, ignore_uncommited=args.submission_exceptions, skip_power_check=args.skip_power_check, - skip_all_systems_with_results = args.skip_all_systems_have_results_check, - skip_calibration_check = args.skip_calibration_check + skip_all_systems_with_results=args.skip_all_systems_have_results_check, + skip_calibration_check=args.skip_calibration_check ) if args.scenarios_to_skip: From ac42a2f4b6f7ce7dc18d41939a4da0fe9cfa89f2 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 16 Jun 2025 16:19:58 +0000 Subject: [PATCH 13/35] [Automated Commit] Format Codebase --- speech2text/QSL.py | 18 ++++- speech2text/accuracy_eval.py | 57 ++++++++++++-- speech2text/helpers.py | 45 ++++++++---- speech2text/legacy_helpers.py | 6 +- speech2text/manifest.py | 1 + speech2text/reference_SUT.py | 94 ++++++++++++++++++------ speech2text/reference_mlperf.py | 60 ++++++++++----- speech2text/utils/download_utils.py | 3 +- speech2text/utils/preprocessing_utils.py | 3 +- 9 files changed, 216 insertions(+), 71 deletions(-) diff --git a/speech2text/QSL.py b/speech2text/QSL.py index 19afd49acf..a4882eb32b 100644 --- a/speech2text/QSL.py +++ b/speech2text/QSL.py @@ -30,6 +30,7 @@ Manifest_Global = None max_duration = float(os.environ.get("MAX_DURATION", "30.0")) + def load_sample_from_file(index): global Manifest sample = Manifest_Global[index] @@ -43,12 +44,18 @@ def load_sample_from_file(index): duration = sample['duration'] return prompt + class AudioQSL: def __init__(self, dataset_dir, manifest_filepath, labels, sample_rate=16000, perf_count=None, skip_qsl=False): global Manifest_Global m_paths = [manifest_filepath] - self.manifest = Manifest(dataset_dir, m_paths, labels, len(labels), max_duration=max_duration) + self.manifest = Manifest( + dataset_dir, + m_paths, + labels, + len(labels), + max_duration=max_duration) Manifest_Global = self.manifest self.sample_rate = sample_rate self.count = len(self.manifest) @@ -59,15 +66,15 @@ def __init__(self, dataset_dir, manifest_filepath, labels, self.qsl = None else: self.qsl = lg.ConstructQSL(self.count, perf_count, - self.load_query_samples, - self.unload_query_samples) + self.load_query_samples, + self.unload_query_samples) print( "Dataset loaded with {0:.2f} hours. Filtered {1:.2f} hours. Number of samples: {2}".format( self.manifest.duration / 3600, self.manifest.filtered_duration / 3600, self.count)) - + def load_query_samples(self, sample_list): pass @@ -83,6 +90,8 @@ def __del__(self): # We have no problem fitting all data in memory, so we do that, in # order to speed up execution of the benchmark. 
+ + class AudioQSLInMemory(AudioQSL): def __init__(self, dataset_dir, manifest_filepath, labels, sample_rate=16000, perf_count=None, skip_qsl=True): @@ -104,5 +113,6 @@ def load_query_samples(self, sample_list): def unload_query_samples(self, sample_list): for sample_id in sample_list: del self.sample_id_to_sample[sample_id] + def __del__(self): print("FInished destroying no QSL") diff --git a/speech2text/accuracy_eval.py b/speech2text/accuracy_eval.py index ad741db502..3c7466ff8f 100644 --- a/speech2text/accuracy_eval.py +++ b/speech2text/accuracy_eval.py @@ -28,7 +28,35 @@ max_duration = float(os.environ.get("MAX_DURATION", "30.0")) -labels = [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"] +labels = [ + " ", + "a", + "b", + "c", + "d", + "e", + "f", + "g", + "h", + "i", + "j", + "k", + "l", + "m", + "n", + "o", + "p", + "q", + "r", + "s", + "t", + "u", + "v", + "w", + "x", + "y", + "z", + "'"] dtype_map = { "int8": 'b', "int16": 'h', @@ -36,6 +64,7 @@ "int64": 'q', } + def word_error_rate(hypotheses: List[str], references: List[str]) -> float: """ Computes Average Word Error rate between two texts represented as @@ -61,37 +90,53 @@ def word_error_rate(hypotheses: List[str], references: List[str]) -> float: r = normalizer(r) h_list = h.split() r_list = r.split() - scores_clip, words_clip = compute_wer_with_concatenation(h_list, r_list) + scores_clip, words_clip = compute_wer_with_concatenation( + h_list, r_list) scores += scores_clip words += words_clip wer = scores / words return wer, scores, words + def get_args(): parser = argparse.ArgumentParser() parser.add_argument("--log_dir", required=True) parser.add_argument("--dataset_dir", required=True) parser.add_argument("--manifest", required=True) - parser.add_argument("--output_dtype", default="int64", choices=dtype_map.keys(), help="Output data type") + parser.add_argument( + "--output_dtype", + default="int64", + choices=dtype_map.keys(), + help="Output data type") args = parser.parse_args() return args + def main(): args = get_args() - manifest = Manifest(args.dataset_dir, [args.manifest], labels, len(labels), max_duration=max_duration) + manifest = Manifest(args.dataset_dir, + [args.manifest], + labels, + len(labels), + max_duration=max_duration) with open(os.path.join(args.log_dir, "mlperf_log_accuracy.json")) as fh: results = json.load(fh) hypotheses = [] references = [] for result in results: - hypotheses.append(array.array(dtype_map[args.output_dtype], bytes.fromhex(result["data"])).tolist()) + hypotheses.append(array.array( + dtype_map[args.output_dtype], bytes.fromhex(result["data"])).tolist()) references.append(manifest[result["qsl_idx"]]["transcript"]) references = __gather_predictions([references], labels=labels) hypotheses = __gather_predictions([hypotheses], labels=labels) wer, _, _ = word_error_rate(hypotheses=hypotheses, references=references) - print("Word Error Rate: {:}%, accuracy={:}%".format(wer * 100, (1 - wer) * 100)) + print( + "Word Error Rate: {:}%, accuracy={:}%".format( + wer * 100, + (1 - wer) * 100)) + if __name__ == '__main__': main() diff --git a/speech2text/helpers.py b/speech2text/helpers.py index b89b04e91e..279a3d396c 100644 --- a/speech2text/helpers.py +++ b/speech2text/helpers.py @@ -16,6 +16,7 @@ from typing import List from legacy_helpers import __levenshtein + def compute_wer_with_concatenation(prediction, reference): """ Compute WER considering concatenated words as correct matches using kaldialign @@ 
-46,7 +47,6 @@ def compute_wer_with_concatenation(prediction, reference): ref_concat = ref_words[i] hyp_concat = hyp_words[j] - # Try concatenating up to 3 words ref_match_len = 1 hyp_match_len = 1 @@ -54,7 +54,7 @@ def compute_wer_with_concatenation(prediction, reference): for k in range(1, 4): if i + k <= len(ref_words): - ref_concat = ''.join(ref_words[i:i+k]) + ref_concat = ''.join(ref_words[i:i + k]) if ref_concat == hyp_words[j]: ref_match_len = k hyp_match_len = 1 @@ -62,7 +62,7 @@ def compute_wer_with_concatenation(prediction, reference): break if j + k <= len(hyp_words): - hyp_concat = ''.join(hyp_words[j:j+k]) + hyp_concat = ''.join(hyp_words[j:j + k]) if hyp_concat == ref_words[i]: ref_match_len = 1 hyp_match_len = k @@ -71,8 +71,8 @@ def compute_wer_with_concatenation(prediction, reference): if match_found: # Add concatenated match - alignment.append((' '.join(ref_words[i:i+ref_match_len]), - ' '.join(hyp_words[j:j+hyp_match_len]))) + alignment.append((' '.join(ref_words[i:i + ref_match_len]), + ' '.join(hyp_words[j:j + hyp_match_len]))) i += ref_match_len j += hyp_match_len @@ -91,14 +91,18 @@ def compute_wer_with_concatenation(prediction, reference): j += 1 # Calculate WER using kaldialign - ref_aligned = [x[0].replace(" ", "") for x in alignment if x[0] is not None] - hyp_aligned = [x[1].replace(" ", "") for x in alignment if x[1] is not None] + ref_aligned = [x[0].replace(" ", "") + for x in alignment if x[0] is not None] + hyp_aligned = [x[1].replace(" ", "") + for x in alignment if x[1] is not None] distance = __levenshtein(ref_aligned, hyp_aligned) wer = distance / len(ref_words) if ref_words else 0 return distance, len(ref_words) if ref_words else 0 -def expand_concatenations(words_list: List, reference_dict: dict, reference_list: List): + +def expand_concatenations( + words_list: List, reference_dict: dict, reference_list: List): """ Finds matching compound words in 'words_list' which exist as keys in 'reference_dict', if any. If found, the compound word will be separated using reference_dict if the substitution reduces @@ -113,15 +117,19 @@ def expand_concatenations(words_list: List, reference_dict: dict, reference_list score = __levenshtein(words_list, reference_list) # Searches each word in 'word_list' for separability using the reference list. Once all options are - # considered, the modified 'word_list' is returned. Length of 'word_list' can grow, but not contract. + # considered, the modified 'word_list' is returned. Length of 'word_list' + # can grow, but not contract. 
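    # For example, with words_list = ["the", "icecream", "melted"],
    # reference_dict = {"icecream": ["ice", "cream"]} and
    # reference_list = ["the", "ice", "cream", "melted"], splitting "icecream"
    # drops the Levenshtein distance from 2 to 0, so the expanded list
    # ["the", "ice", "cream", "melted"] is kept.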
i = 0 words_length = len(words_list) while i < words_length: if words_list[i] in reference_dict.keys(): - words_candidate = words_list[:i] + reference_dict[words_list[i]] + words_list[i + 1:] + words_candidate = words_list[:i] + \ + reference_dict[words_list[i]] + words_list[i + 1:] - # If levenshtein distance reduced, cache new word_list and resume search - candidate_levenshtein = __levenshtein(words_candidate, reference_list) + # If levenshtein distance reduced, cache new word_list and resume + # search + candidate_levenshtein = __levenshtein( + words_candidate, reference_list) if candidate_levenshtein < score: words_list = words_candidate words_length = len(words_list) @@ -129,6 +137,7 @@ def expand_concatenations(words_list: List, reference_dict: dict, reference_list i += 1 return words_list + def get_expanded_wordlist(words_list: List, reference_list: List): """ Provided two lists of English words, the two will be compared, and any compound words found in @@ -141,7 +150,8 @@ def get_expanded_wordlist(words_list: List, reference_list: List): List of words modified from 'word_list' after expanding referenced compound words """ - # If levenshtein distance < 2, there cannot be any compound word separation issues. + # If levenshtein distance < 2, there cannot be any compound word + # separation issues. if __levenshtein(words_list, reference_list) < 2: return words_list @@ -153,9 +163,12 @@ def get_expanded_wordlist(words_list: List, reference_list: List): # Adding three-word compounding candidates to checklist for i in range(len(reference_list) - 2): - compound = reference_list[i] + reference_list[i + 1] + reference_list[i + 2] - checklist[compound] = [reference_list[i], reference_list[i + 1], reference_list[i + 2]] + compound = reference_list[i] + \ + reference_list[i + 1] + reference_list[i + 2] + checklist[compound] = [reference_list[i], + reference_list[i + 1], reference_list[i + 2]] # All compiled candidates will be checked, and after checking for minimal Levenshtein - # distance, the modified list (or original if compounding not found) is directly returned + # distance, the modified list (or original if compounding not found) is + # directly returned return expand_concatenations(words_list, checklist, reference_list) diff --git a/speech2text/legacy_helpers.py b/speech2text/legacy_helpers.py index 45065fff72..17687e4230 100644 --- a/speech2text/legacy_helpers.py +++ b/speech2text/legacy_helpers.py @@ -16,6 +16,7 @@ from enum import Enum from typing import List + def __levenshtein(a: List, b: List) -> int: """Calculates the Levenshtein distance between a and b. """ @@ -37,6 +38,7 @@ def __levenshtein(a: List, b: List) -> int: return current[n] + def __whisper_decoder_predictions_tensor(tensor, labels): """ Takes output of greedy whisper decoder and converts to strings. 
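As a quick illustration of how the concatenation-aware scoring above is meant to behave, the sketch below scores a hypothesis that fuses "ice cream" into a single token. It is illustrative only: it assumes `compute_wer_with_concatenation` is importable from the local `helpers` module and that the inputs are already normalized, lower-cased word lists, mirroring how `word_error_rate` calls it.

```
# Hypothetical usage sketch; the expected values assume the concatenation
# matching behaves as described in the helpers above.
from helpers import compute_wer_with_concatenation

hyp = "the icecream melted".split()   # hypothesis fuses "ice cream"
ref = "the ice cream melted".split()  # reference keeps the two words

errors, ref_len = compute_wer_with_concatenation(hyp, ref)
print(errors, ref_len)    # expected: 0 4 (fused token counted as a match)
print(errors / ref_len)   # expected: 0.0, the per-clip WER contribution
```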
@@ -58,6 +60,6 @@ def __whisper_decoder_predictions_tensor(tensor, labels): def __gather_predictions(predictions_list: list, labels: list) -> list: results = [] for prediction in predictions_list: - results += __whisper_decoder_predictions_tensor(prediction, labels=labels) + results += __whisper_decoder_predictions_tensor( + prediction, labels=labels) return results - diff --git a/speech2text/manifest.py b/speech2text/manifest.py index d7c0fc88d6..c54be6923b 100644 --- a/speech2text/manifest.py +++ b/speech2text/manifest.py @@ -16,6 +16,7 @@ import string import os + class Manifest(object): def __init__(self, data_dir, manifest_paths, labels, blank_index, max_duration=None, pad_to_max=False, min_duration=None, sort_by_duration=False, max_utts=0, diff --git a/speech2text/reference_SUT.py b/speech2text/reference_SUT.py index 3031281736..3deeea0cec 100644 --- a/speech2text/reference_SUT.py +++ b/speech2text/reference_SUT.py @@ -43,27 +43,59 @@ logging.basicConfig(level=logging.INFO) log = logging.getLogger("SUT") + def get_start_cores(start_cores="0"): start_cores = start_cores.split(",") start_cores = list(map(int, start_cores)) return start_cores + cores_per_inst = int(os.environ.get("CORES_PER_INST", "1")) num_numa_nodes = int(os.environ.get("NUM_NUMA_NODES", "1")) -nodes_per_inst = int(os.environ["NUM_NUMA_NODES"])/int(os.environ["NUM_INSTS"]) +nodes_per_inst = int(os.environ["NUM_NUMA_NODES"] + ) / int(os.environ["NUM_INSTS"]) insts_per_node = int(os.environ["INSTS_PER_NODE"]) -start_cores = os.environ["START_CORES"] +start_cores = os.environ["START_CORES"] precision = torch.float32 n_mels = 128 sample_rate = 16000 model_path = "openai/whisper-large-v3" -labels = [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"] +labels = [ + " ", + "a", + "b", + "c", + "d", + "e", + "f", + "g", + "h", + "i", + "j", + "k", + "l", + "m", + "n", + "o", + "p", + "q", + "r", + "s", + "t", + "u", + "v", + "w", + "x", + "y", + "z", + "'"] labels_dict = {} for i in range(len(labels)): labels_dict[labels[i]] = i + class Instance(mp.Process): def __init__( self, @@ -123,7 +155,7 @@ def run(self): self.total_sample_count ) - dtype="bfloat16" + dtype = "bfloat16" print(f"Precision: {dtype}") model = LLM( model=model_path, @@ -162,7 +194,7 @@ def process_queries(self): if qitem_list is None: return False - + prompt_list = [] for qitem in qitem_list: prompt = self.qsl[qitem.index] @@ -177,7 +209,8 @@ def process_queries(self): start_time = time.time() outputs = self.model.generate(prompt_list, self.sampling_params) - print(f"Sample number: {self.num_samples} | Step time {time.time()-start_time:.3f}s") + print( + f"Sample number: {self.num_samples} | Step time {time.time()-start_time:.3f}s") for output in outputs: request_id = int(output.request_id) @@ -188,7 +221,7 @@ def process_queries(self): self.num_samples += len(results) - for i,result in enumerate(results): + for i, result in enumerate(results): # Whisper outputs space in the front and capitalizes things result = result.lower().strip() transcript = [] @@ -204,6 +237,7 @@ def process_queries(self): print(f"Finished {qid[i]}") return True + class vllmSUT: def __init__(self, dataset_dir, manifest_filepath, perf_count, num_workers=1, device="cpu"): @@ -211,21 +245,21 @@ def __init__(self, dataset_dir, self.dataset_path = dataset_dir self.manifest_filepath = manifest_filepath self.device = device - self.batch_size = 16 + self.batch_size = 16 self.total_sample_count = 
perf_count self.num_workers = num_workers self.worker_threads = [None] * self.num_workers dataset_vocab = labels - #self.dev = torch.device("cuda:0") if torch.cuda.is_available() and os.environ.get("USE_GPU", "").lower() not in [ "no", "false" ] else torch.device("cpu") + # self.dev = torch.device("cuda:0") if torch.cuda.is_available() and os.environ.get("USE_GPU", "").lower() not in [ "no", "false" ] else torch.device("cpu") self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries) self.qsl = AudioQSL(dataset_dir, - manifest_filepath, - dataset_vocab, - sample_rate, - perf_count) + manifest_filepath, + dataset_vocab, + sample_rate, + perf_count) self.query_queue = mp.JoinableQueue() self.output_queue = mp.Queue() self.alive_counter = mp.Value("i", 0) @@ -235,10 +269,20 @@ def __init__(self, dataset_dir, def start(self): node_start_cores = get_start_cores(start_cores) core_lists = [] - if insts_per_node>0: + if insts_per_node > 0: for i in range(num_numa_nodes): for j in range(insts_per_node): - core_lists.append(list(range(node_start_cores[i]+j*cores_per_inst, node_start_cores[i]+(j+1)*cores_per_inst))) + core_lists.append( + list( + range( + node_start_cores[i] + + j * + cores_per_inst, + node_start_cores[i] + + ( + j + + 1) * + cores_per_inst))) for j in range(self.num_workers): core_list = core_lists[j] @@ -253,18 +297,19 @@ def start(self): rank=j, dtype=precision, core_list=tuple(core_list), - node_list=tuple([math.floor(j*nodes_per_inst)]), - input_queue = self.query_queue, - output_queue = self.output_queue, - cond_var = self.cond_var, - alive_counter = self.alive_counter, - sample_counter = self.sample_counter + node_list=tuple([math.floor(j * nodes_per_inst)]), + input_queue=self.query_queue, + output_queue=self.output_queue, + cond_var=self.cond_var, + alive_counter=self.alive_counter, + sample_counter=self.sample_counter ) worker.start() self.worker_threads[j] = worker with self.cond_var: - self.cond_var.wait_for(lambda: self.alive_counter.value == self.num_workers) + self.cond_var.wait_for( + lambda: self.alive_counter.value == self.num_workers) log.info(f"Starting Loadgen response thread") response_thread = threading.Thread(target=self.response_loadgen) @@ -276,11 +321,12 @@ def issue_queries(self, query_samples): for query_sample in query_samples: # Continuous batching self.query_queue.put([query_sample]) - if len(query_sample_list)>0: + if len(query_sample_list) > 0: self.query_queue.put(query_sample_list) def flush_queries(self): pass + def response_loadgen(self): keep_alive = True while keep_alive: @@ -293,13 +339,13 @@ def response_loadgen(self): response = lg.QuerySampleResponse(qid, bi[0], bi[1] * response_array.itemsize) lg.QuerySamplesComplete([response]) + def stop(self): for i in range(self.num_workers): self.query_queue.put(None) for worker in self.worker_threads: worker.kill() - def __del__(self): lg.DestroySUT(self.sut) print("Finished destroying SUT.") diff --git a/speech2text/reference_mlperf.py b/speech2text/reference_mlperf.py index 7ca4416a14..cee5843a7a 100644 --- a/speech2text/reference_mlperf.py +++ b/speech2text/reference_mlperf.py @@ -21,13 +21,32 @@ import mlperf_loadgen as lg from reference_SUT import vllmSUT + def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--scenario", choices=["Offline", "Server"], default="Offline", help="Scenario") - parser.add_argument("--accuracy", action="store_true", help="enable accuracy pass") - parser.add_argument("--mlperf_conf", default="mlperf.conf", help="mlperf rules config") - 
parser.add_argument("--user_conf", default="user.conf", help="user config for user LoadGen settings such as target QPS") - parser.add_argument("--audit_conf", default="audit.conf", help="audit config for LoadGen settings during compliance runs") + parser.add_argument( + "--scenario", + choices=[ + "Offline", + "Server"], + default="Offline", + help="Scenario") + parser.add_argument( + "--accuracy", + action="store_true", + help="enable accuracy pass") + parser.add_argument( + "--mlperf_conf", + default="mlperf.conf", + help="mlperf rules config") + parser.add_argument( + "--user_conf", + default="user.conf", + help="user config for user LoadGen settings such as target QPS") + parser.add_argument( + "--audit_conf", + default="audit.conf", + help="audit config for LoadGen settings during compliance runs") parser.add_argument("--dataset_dir", required=True) parser.add_argument("--manifest", required=True) parser.add_argument("--perf_count", type=int, default=None) @@ -50,17 +69,16 @@ def main(): log_path = args.log_dir os.makedirs(log_path, exist_ok=True) - - sut = vllmSUT( args.dataset_dir, - args.manifest, - args.perf_count, - num_workers=args.num_workers, - device="cpu") + sut = vllmSUT(args.dataset_dir, + args.manifest, + args.perf_count, + num_workers=args.num_workers, + device="cpu") sut.start() settings = lg.TestSettings() settings.scenario = scenario_map[args.scenario] - #settings.FromConfig(args.mlperf_conf, "whisper", args.scenario) + # settings.FromConfig(args.mlperf_conf, "whisper", args.scenario) settings.FromConfig(args.user_conf, "whisper", args.scenario) if args.accuracy: @@ -75,15 +93,23 @@ def main(): log_settings.log_output = log_output_settings print("Running Loadgen test...") - lg.StartTestWithLogSettings(sut.sut, - sut.qsl.qsl, - settings, - log_settings, + lg.StartTestWithLogSettings(sut.sut, + sut.qsl.qsl, + settings, + log_settings, args.audit_conf) sut.stop() if args.accuracy: - cmd = ["python3", "accuracy_eval.py", "--log_dir", log_path, "--dataset_dir", args.dataset_dir, "--manifest", args.manifest] + cmd = [ + "python3", + "accuracy_eval.py", + "--log_dir", + log_path, + "--dataset_dir", + args.dataset_dir, + "--manifest", + args.manifest] print(f"Running accuracy script: {cmd}") subprocess.check_call(cmd) diff --git a/speech2text/utils/download_utils.py b/speech2text/utils/download_utils.py index bda4193fbb..ba5e15b7f0 100644 --- a/speech2text/utils/download_utils.py +++ b/speech2text/utils/download_utils.py @@ -65,5 +65,6 @@ def extract(fpath, dest_folder): with tarfile.open(fpath, mode) as tar: members = tar.getmembers() - for member in tqdm.tqdm(iterable=members, total=len(members), leave=True): + for member in tqdm.tqdm( + iterable=members, total=len(members), leave=True): tar.extract(path=dest_folder, member=member) diff --git a/speech2text/utils/preprocessing_utils.py b/speech2text/utils/preprocessing_utils.py index 260e860b80..5d6673434c 100644 --- a/speech2text/utils/preprocessing_utils.py +++ b/speech2text/utils/preprocessing_utils.py @@ -68,7 +68,8 @@ def preprocess(data, input_dir, dest_dir, target_sr=None, speed=None, return output_dict -def parallel_preprocess(dataset, input_dir, dest_dir, target_sr, speed, overwrite, parallel): +def parallel_preprocess(dataset, input_dir, dest_dir, + target_sr, speed, overwrite, parallel): with multiprocessing.Pool(parallel) as p: func = functools.partial(preprocess, input_dir=input_dir, dest_dir=dest_dir, From 2b371c9028fbaaaa1e121df7e9c6d360ebd2aac0 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 
16 Jun 2025 18:03:55 +0000 Subject: [PATCH 14/35] [Automated Commit] Format Codebase --- language/llama3.1-8b/ref_eval.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/language/llama3.1-8b/ref_eval.py b/language/llama3.1-8b/ref_eval.py index 0cfc0694d7..bb34847b13 100644 --- a/language/llama3.1-8b/ref_eval.py +++ b/language/llama3.1-8b/ref_eval.py @@ -17,7 +17,8 @@ def rouge(label, pred): def niah_em(label, pred): - label_uuids = re.findall(r'[\w]{8}-[\w]{4}-[\w]{4}-[\w]{4}-[\w]{12}', label) + label_uuids = re.findall( + r'[\w]{8}-[\w]{4}-[\w]{4}-[\w]{4}-[\w]{12}', label) pred_uuids = re.findall(r'[\w]{8}-[\w]{4}-[\w]{4}-[\w]{4}-[\w]{12}', pred) # https://github.com/hsiehjackson/RULER/blob/main/scripts/eval/synthetic/constants.py#L28 @@ -43,7 +44,8 @@ def qa_em(label, pred): return {'exact_match': 100.0} normalized_answer = re.sub(r'\s+', '', answer_substring).lower() - label_entries = [re.sub(r'\s+', '', entry).lower() for entry in label.split('|')] + label_entries = [re.sub(r'\s+', '', entry).lower() + for entry in label.split('|')] match_found = any(entry in normalized_answer for entry in label_entries) return {'exact_match': 100.0 if match_found else 0.0} @@ -63,7 +65,12 @@ def process_row(row): def run_evaluation(df): with Pool(cpu_count()) as pool: - accuracies = list(tqdm(pool.imap(process_row, df.to_dict('records')), total=len(df))) + accuracies = list( + tqdm( + pool.imap( + process_row, + df.to_dict('records')), + total=len(df))) df['accuracy'] = accuracies return df @@ -74,10 +81,10 @@ def run_evaluation(df): df = pd.read_pickle(fname) df = run_evaluation(df) - #df.to_pickle(str(fname).replace(".pkl", "_eval.pkl")) + # df.to_pickle(str(fname).replace(".pkl", "_eval.pkl")) print(f"WROTE: {str(fname).replace('.pkl', '_eval.pkl')}") accuracy = df.accuracy.apply(pd.Series) print(df.dataset.value_counts()) print(accuracy.describe()) - print(df.describe()) \ No newline at end of file + print(df.describe()) From 72d52bb8adb42c9560076194a2c8a82190f37740 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 19 Jun 2025 08:45:56 +0000 Subject: [PATCH 15/35] [Automated Commit] Format Codebase --- speech2text/utils/repackage_librispeech.py | 42 +++++++++++++++------- 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/speech2text/utils/repackage_librispeech.py b/speech2text/utils/repackage_librispeech.py index 7a2b62f421..4889b815a2 100644 --- a/speech2text/utils/repackage_librispeech.py +++ b/speech2text/utils/repackage_librispeech.py @@ -24,6 +24,7 @@ PAD_DURATION = 0.5 SR = 16000 + def get_args(): parser = argparse.ArgumentParser() parser.add_argument("--manifest", required=True) @@ -33,10 +34,12 @@ def get_args(): args = parser.parse_args() return args + def get_source_name(fname): basename_list, _ = os.path.splitext(fname) return "-".join(basename_list.split("-")[:2]) + def prepare_clip(current_entry, new_fname): pad_audio = np.zeros(int(PAD_DURATION * SR)) new_audio = [] @@ -51,6 +54,7 @@ def prepare_clip(current_entry, new_fname): new_json = get_sample_json(new_audio, new_transcript, new_fname) return new_audio, new_json + def get_sample_json(audio, transcript, fname): json_file = { "transcript": transcript, @@ -73,6 +77,7 @@ def get_sample_json(audio, transcript, fname): } return json_file + def main(): args = get_args() with open(args.manifest, "r") as manifest: @@ -84,15 +89,22 @@ def main(): for data in json_data: original_fname = data["files"][0]["fname"] original_transcript = data["transcript"] - original_audio = 
librosa.load(os.path.join(args.data_dir, original_fname), sr=SR)[0] - original_json = get_sample_json(original_audio, original_transcript, original_fname) + original_audio = librosa.load( + os.path.join( + args.data_dir, + original_fname), + sr=SR)[0] + original_json = get_sample_json( + original_audio, original_transcript, original_fname) - source_name = get_source_name(os.path.basename(os.path.basename(original_fname))) + source_name = get_source_name( + os.path.basename( + os.path.basename(original_fname))) if source_name not in catalog: catalog[source_name] = [] - + catalog[source_name].append((original_audio, original_json)) - + full_json = [] for key in catalog.keys(): index = 0 @@ -100,13 +112,17 @@ def main(): current_duration = 0 for entry in catalog[key]: clip_duration = entry[1]["original_duration"] - - # Only considering clips <=30s. If single clip duration > 30s, ignore. + + # Only considering clips <=30s. If single clip duration > 30s, + # ignore. if clip_duration > 30: continue - # If new clip would extend compiled entry to >30s, flush the existing entry - if (len(current_entry) > 0) and (current_duration + PAD_DURATION + clip_duration > 30): - new_fname = os.path.join(args.output_dir, key + "_" + str(index) + ".wav") + # If new clip would extend compiled entry to >30s, flush the + # existing entry + if (len(current_entry) > 0) and ( + current_duration + PAD_DURATION + clip_duration > 30): + new_fname = os.path.join( + args.output_dir, key + "_" + str(index) + ".wav") new_audio, new_json = prepare_clip(current_entry, new_fname) sf.write(new_fname, new_audio, SR) full_json.append(new_json) @@ -120,9 +136,11 @@ def main(): if len(current_entry) > 1: current_duration += PAD_DURATION - # After all key clips are processed, if a remaining entry has content, exports it. + # After all key clips are processed, if a remaining entry has content, + # exports it. if len(current_entry) > 0: - new_fname = os.path.join(args.output_dir, key + "_" + str(index) + ".wav") + new_fname = os.path.join( + args.output_dir, key + "_" + str(index) + ".wav") new_audio, new_json = prepare_clip(current_entry, new_fname) sf.write(new_fname, new_audio, SR) full_json.append(new_json) From f841519e77014c799b0e08ad75b23ca2c534b1aa Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 26 Jun 2025 23:30:42 +0100 Subject: [PATCH 16/35] Fix Typo in Interactive Latencies (#2147) (#2225) * Fix Typo in Interactive Latencies * Update submission_checker.py --- language/llama2-70b/README.md | 2 +- tools/submission/submission_checker.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/language/llama2-70b/README.md b/language/llama2-70b/README.md index bbd9889564..0c0ad21952 100644 --- a/language/llama2-70b/README.md +++ b/language/llama2-70b/README.md @@ -291,7 +291,7 @@ Please see the [new docs site](https://docs.mlcommons.org/inference/submission/) # Run llama2-70b-interactive benchmark -For official, Llama2-70b submissions it is also possible to submit in the interactive category. This sets a more strict latency requirements for Time to First Token (ttft) and Time per Output Token (tpot). Specifically, the interactive category requires loadgen to enforce `ttft <= 450ms` and `ttft <= 40ms` +For official, Llama2-70b submissions it is also possible to submit in the interactive category. This sets a more strict latency requirements for Time to First Token (ttft) and Time per Output Token (tpot). 
Specifically, the interactive category requires loadgen to enforce `ttft <= 450ms` and `tpot <= 40ms` In order to run interactive category, it is sufficient to set the flag `--lg-model-name` as `llama2-70b-interactive` when calling the `main.py` to run the benchmark. For example, to run the server scenario in interactive mode: diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index 69b47d6f17..462f3d56ea 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -808,6 +808,18 @@ "ttft": 450 * 1000000, "tpot": 40 * 1000000 }, }, + "llama2-70b-interactive-99": { + "Server": { + "ttft": 450 * 1000000, "tpot": 40 * 1000000 + }, + }, + # for v5.0 + "llama2-70b-interactive-99.9": { + "Server": { + "ttft": 450 * 1000000, "tpot": 40 * 1000000 + }, + }, + # for v5.0 "mixtral-8x7b": { "Server": { "ttft": 2000 * 1000000, "tpot": 200 * 1000000 From 4e7717712ceb8259a703423898fbfcbb5edd057e Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 26 Jun 2025 23:40:19 +0100 Subject: [PATCH 17/35] Fix Typo in Interactive Latencies (#2147) (#2226) * Fix Typo in Interactive Latencies * Update submission_checker.py --------- Co-authored-by: Miro Co-authored-by: github-actions[bot] From 350032a3f13ff25cd5d54c99c0a604de4a155dbd Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Sun, 6 Jul 2025 01:57:04 +0530 Subject: [PATCH 18/35] Update MLCFlow commands for v5.1 (#2237) --- docs/benchmarks/language/deepseek-r1.md | 11 ++++ .../language/get-deepseek-r1-data.md | 24 ++++++++ .../language/get-llama3_1-8b-data.md | 60 +++++++++++++++++++ docs/benchmarks/language/llama3_1-8b.md | 11 ++++ .../speech_to_text/get-whisper-data.md | 40 +++++++++++++ docs/benchmarks/speech_to_text/whisper.md | 11 ++++ language/deepseek-r1/README.md | 30 ++++++++++ language/llama3.1-8b/README.md | 52 ++++++++++++---- main.py | 33 ++++++---- mkdocs.yml | 4 ++ speech2text/README.md | 46 ++++++++++++++ 11 files changed, 298 insertions(+), 24 deletions(-) create mode 100644 docs/benchmarks/language/deepseek-r1.md create mode 100644 docs/benchmarks/language/get-deepseek-r1-data.md create mode 100644 docs/benchmarks/language/get-llama3_1-8b-data.md create mode 100644 docs/benchmarks/language/llama3_1-8b.md create mode 100644 docs/benchmarks/speech_to_text/get-whisper-data.md create mode 100644 docs/benchmarks/speech_to_text/whisper.md diff --git a/docs/benchmarks/language/deepseek-r1.md b/docs/benchmarks/language/deepseek-r1.md new file mode 100644 index 0000000000..f83fe1bb82 --- /dev/null +++ b/docs/benchmarks/language/deepseek-r1.md @@ -0,0 +1,11 @@ +--- +hide: + - toc +--- + +# Reasoning using DeepSeek-R1 + +=== "MLCommons-Python" + ## MLPerf Reference Implementation in Python + +{{ mlperf_inference_implementation_readme (4, "deepseek-r1", "reference", devices=["CUDA"]) }} \ No newline at end of file diff --git a/docs/benchmarks/language/get-deepseek-r1-data.md b/docs/benchmarks/language/get-deepseek-r1-data.md new file mode 100644 index 0000000000..401c4d27bc --- /dev/null +++ b/docs/benchmarks/language/get-deepseek-r1-data.md @@ -0,0 +1,24 @@ +--- +hide: + - toc +--- + +# Reasoning using DeepSeek R1 + +## Dataset + +The benchmark implementation run command will automatically download the validation and calibration datasets and do the necessary preprocessing. In case you want to download only the datasets, you can use the below commands. 
+ +=== "Validation" + + ### Get Validation Dataset + ``` + mlcr get,preprocessed,dataset,deepseek-r1,_validation,_mlc,_rclone --outdirname= -j + ``` + +=== "Calibration" + + ### Get Calibration Dataset + ``` + mlcr get,preprocessed,dataset,deepseek-r1,_calibration,_mlc,_rclone --outdirname= -j + ``` \ No newline at end of file diff --git a/docs/benchmarks/language/get-llama3_1-8b-data.md b/docs/benchmarks/language/get-llama3_1-8b-data.md new file mode 100644 index 0000000000..e24cc37d44 --- /dev/null +++ b/docs/benchmarks/language/get-llama3_1-8b-data.md @@ -0,0 +1,60 @@ +--- +hide: + - toc +--- + +# Text Summarization using LLAMA3.1-8b + +## Dataset + +The benchmark implementation run command will automatically download the validation and calibration datasets and do the necessary preprocessing. In case you want to download only the datasets, you can use the below commands. + +=== "Validation" + + === "Full dataset (Datacenter)" + + ### Get Validation Dataset + ``` + mlcr get,dataset,cnndm,_validation,_datacenter,_llama3,_mlc,_rclone --outdirname= -j + ``` + + === "5000 samples (Edge)" + + ### Get Validation Dataset + ``` + mlcr get,dataset,cnndm,_validation,_edge,_llama3,_mlc,_rclone --outdirname= -j + ``` + +=== "Calibration" + + ### Get Calibration Dataset + ``` + mlcr get,dataset,cnndm,_calibration,_llama3,_mlc,_rclone --outdirname= -j + ``` + +- `--outdirname=` could be provided to download the dataset to a specific location. + +## Model +The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. + +=== "Pytorch" + + === "From MLCOMMONS Google Drive" + + > **Note:** One has to accept the [MLCommons Llama 3.1 License Confidentiality Notice](http://llama3-1.mlcommons.org/) to access the model files in MLCOMMONS Google Drive. + + ### Get the Official MLPerf LLAMA3.1-405B model from MLCOMMONS Cloudfare R2 + ``` + TBD + ``` + + === "From Hugging Face repo" + + > **Note:** Access to the HuggingFace model could be requested [here](https://ai.meta.com/resources/models-and-libraries/llama-downloads/). + + ### Get model from HuggingFace repo + ``` + mlcr get,ml-model,llama3,_hf,_meta-llama/Llama-3.1-8B-Instruct --hf_token= -j + ``` + +- `--outdirname=` could be provided to download the model to a specific location. \ No newline at end of file diff --git a/docs/benchmarks/language/llama3_1-8b.md b/docs/benchmarks/language/llama3_1-8b.md new file mode 100644 index 0000000000..93f8df2997 --- /dev/null +++ b/docs/benchmarks/language/llama3_1-8b.md @@ -0,0 +1,11 @@ +--- +hide: + - toc +--- + +# Text Summarization using LLAMA3_1-8b + +=== "MLCommons-Python" + ## MLPerf Reference Implementation in Python + +{{ mlperf_inference_implementation_readme (4, "llama3_1-8b", "reference", devices=["CPU","CUDA"]) }} \ No newline at end of file diff --git a/docs/benchmarks/speech_to_text/get-whisper-data.md b/docs/benchmarks/speech_to_text/get-whisper-data.md new file mode 100644 index 0000000000..9bc97ad9a0 --- /dev/null +++ b/docs/benchmarks/speech_to_text/get-whisper-data.md @@ -0,0 +1,40 @@ +--- +hide: + - toc +--- + +# Speech to Text using Whisper + +## Dataset + +The benchmark implementation run command will automatically download the validation and calibration datasets and do the necessary preprocessing. In case you want to download only the datasets, you can use the below commands. 
+ +=== "Validation" + + === "Preprocessed" + + ### Get Preprocessed Validation Dataset + ``` + mlcr get,dataset,whisper,_preprocessed,_mlc,_rclone --outdirname= -j + ``` + + === "Unprocessed" + + ### Get Unprocessed Validation Dataset + ``` + mlcr get,dataset,whisper,_unprocessed --outdirname= -j + ``` + +## Model +The benchmark implementation run command will automatically download the required model and do the necessary conversions if any. In case you want to only download the official model, you can use the below commands. + +=== "Pytorch" + + === "From MLCOMMONS" + + ### Get the Official MLPerf Whisper model from MLCOMMONS Cloudflare R2 + ``` + mlcr get,ml-model,whisper,_rclone,_mlc s-j + ``` + +- `--outdirname=` could be provided to download the model to a specific location. \ No newline at end of file diff --git a/docs/benchmarks/speech_to_text/whisper.md b/docs/benchmarks/speech_to_text/whisper.md new file mode 100644 index 0000000000..fddf37b767 --- /dev/null +++ b/docs/benchmarks/speech_to_text/whisper.md @@ -0,0 +1,11 @@ +--- +hide: + - toc +--- + +# Speech to Text using Whisper + +=== "MLCommons-Python" + ## MLPerf Reference Implementation in Python + +{{ mlperf_inference_implementation_readme (4, "whisper", "reference", devices=["CPU","CUDA"]) }} \ No newline at end of file diff --git a/language/deepseek-r1/README.md b/language/deepseek-r1/README.md index 7c2722c7a6..4e0184ff94 100644 --- a/language/deepseek-r1/README.md +++ b/language/deepseek-r1/README.md @@ -1,5 +1,11 @@ # Mlperf Inference DeepSeek Reference Implementation +## Automated command to run the benchmark via MLFlow + +Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/language/deepseek-r1/) for an automated way to run this benchmark across different available implementations and do an end-to-end submission with or without docker. + +You can also do pip install mlc-scripts and then use `mlcr` commands for downloading the model and datasets using the commands given in the later sections. + ## Model & Dataset Download > **Model**: [deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) (revision: `56d4cbbb4d29f4355bab4b9a39ccb717a14ad5ad`) @@ -11,6 +17,14 @@ ### Preprocessed +**Using MLCFlow Automation** + +``` +mlcr get,dataset,whisper,_preprocessed,_mlc,_rclone --outdirname= -j +``` + +**Using Native method** + You can use Rclone to download the preprocessed dataset from a Cloudflare R2 bucket. To run Rclone on Windows, you can download the executable [here](https://rclone.org/install/#windows). @@ -30,6 +44,14 @@ rclone copy mlc-inference:mlcommons-inference-wg-public/deepseek_r1/mlperf_deeps ### Calibration +**Using MLCFlow Automation** + +``` +mlcr get,preprocessed,dataset,deepseek-r1,_calibration,_mlc,_rclone --outdirname= -j +``` + +**Using Native method** + Download and install Rclone as described in the previous section. 
Then navigate in the terminal to your desired download directory and run the following command to download the dataset: @@ -171,6 +193,14 @@ The following table shows which backends support different evaluation and MLPerf ## Accuracy Evaluation +**Using MLCFlow Automation** + +``` +TBD +``` + +**Using Native method** + Accuracy evaluation is handled uniformly across all backends: ```bash diff --git a/language/llama3.1-8b/README.md b/language/llama3.1-8b/README.md index 5947aa0cc4..2b331c98f9 100644 --- a/language/llama3.1-8b/README.md +++ b/language/llama3.1-8b/README.md @@ -9,7 +9,7 @@ ## Automated command to run the benchmark via MLFlow -Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/language/llama3_1-8b/) (TBD) for an automated way to run this benchmark across different available implementations and do an end-to-end submission with or without docker. +Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/language/llama3_1-8b/) for an automated way to run this benchmark across different available implementations and do an end-to-end submission with or without docker. You can also do pip install mlc-scripts and then use `mlcr` commands for downloading the model and datasets using the commands given in the later sections. @@ -99,7 +99,10 @@ pip install -e ../../loadgen ## Get Model ### MLCommons Members Download (Recommended for official submission) -You need to request for access to [MLCommons](http://llama3-1.mlcommons.org/) and you'll receive an email with the download instructions. You can download the model automatically via the below command +You need to request for access to [MLCommons](http://llama3-1.mlcommons.org/) and you'll receive an email with the download instructions. + +**Official Model download using MLCFlow Automation** +You can download the model automatically via the below command ``` TBD ``` @@ -115,6 +118,12 @@ git clone https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct ${CHECKPOINT_P cd ${CHECKPOINT_PATH} && git checkout be673f326cab4cd22ccfef76109faf68e41aa5f1 ``` +**External Model download using MLCFlow Automation** +You can download the model automatically via the below command +``` +mlcr get,ml-model,llama3,_hf,_meta-llama/Llama-3.1-8B-Instruct --hf_token= -j +``` + ### Download huggingface model through MLC ``` @@ -142,24 +151,39 @@ rclone config create mlc-inference s3 provider=Cloudflare access_key_id=f65ba5ee You can then navigate in the terminal to your desired download directory and run the following command to download the dataset: #### Full dataset (datacenter) + +**Using MLCFlow Automation** +``` +mlcr get,dataset,cnndm,_validation,_datacenter,_llama3,_mlc,_rclone --outdirname= -j +``` + +**Native method** ``` rclone copy mlc-inference:mlcommons-inference-wg-public/llama3.1_8b/cnn_eval.json ./ -P ``` #### 5000 samples (edge) + +**Using MLCFlow Automation** +``` +mlcr get,dataset,cnndm,_validation,_edge,_llama3,_mlc,_rclone --outdirname= -j +``` + +**Native method** ``` rclone copy mlc-inference:mlcommons-inference-wg-public/llama3.1_8b/sample_cnn_eval_5000.json ./ -P ``` #### Calibration + +**Using MLCFlow Automation** ``` -rclone copy mlc-inference:mlcommons-inference-wg-public/llama3.1_8b/cnn_dailymail_calibration.json ./ -P +mlcr get,dataset,cnndm,_calibration,_llama3,_mlc,_rclone --outdirname= -j ``` -**MLC Command** - +**Native method** ``` -TBD +rclone copy mlc-inference:mlcommons-inference-wg-public/llama3.1_8b/cnn_dailymail_calibration.json ./ -P ``` You can also download the 
calibration dataset from the Cloudflare R2 bucket by running the following command: @@ -168,11 +192,6 @@ You can also download the calibration dataset from the Cloudflare R2 bucket by r rclone copy mlc-inference:mlcommons-inference-wg-public/llama3.1_8b/cnn_eval.json ./ -P ``` -**MLC Command** -``` -TBD -``` - ## Run Performance Benchmarks @@ -265,8 +284,17 @@ The ServerSUT was not tested for GPU runs. ### Evaluate the accuracy using MLCFlow You can also evaulate the accuracy from the generated accuracy log by using the following MLC command + +**Full dataset (datacenter)** + ``` -TBD +mlcr run,accuracy,mlperf,_cnndm_llama_3,_edge --result_dir= +``` + +**5000 samples (edge)** + +``` +mlcr run,accuracy,mlperf,_cnndm_llama_3,_datacenter --result_dir= ``` ## Accuracy Target diff --git a/main.py b/main.py index d2f625fcf4..1e5a20bb96 100755 --- a/main.py +++ b/main.py @@ -33,13 +33,15 @@ def mlperf_inference_implementation_readme( if model == "rnnt": code_version = "r4.0" + if "gpt" in model: + code_version = "r5.0-dev" elif implementation == "intel": code_version = "r4.1-dev" if implementation == "reference": # Tip - if model != "rnnt": - code_version = "r5.0-dev" + if model not in ["rnnt", "gptj-99", "gptj-99.9"]: + code_version = "r5.1-dev" if "99.9" not in model and implementation_tips: content += f"\n{pre_space}!!! tip\n\n" content += f"{pre_space} - MLCommons reference implementations are only meant to provide a rules compliant reference implementation for the submitters and in most cases are not best performing. If you want to benchmark any system, it is advisable to use the vendor MLPerf implementation for that system like Nvidia, Intel etc.\n\n" @@ -54,8 +56,12 @@ def mlperf_inference_implementation_readme( frameworks = ["Onnxruntime", "Pytorch"] elif "bert" in model.lower(): frameworks = ["Pytorch", "Deepsparse"] - elif "llama3" in model.lower(): - frameworks = ["Pytorch"] + elif "whisper" in model.lower(): + frameworks = ["vLLM"] + elif "deepseek" in model.lower(): + frameworks = ["vLLM", "Pytorch", "SGLang"] + elif "llama3_1-8b" in model.lower(): + frameworks = ["vLLM"] else: frameworks = ["Pytorch"] @@ -130,12 +136,7 @@ def mlperf_inference_implementation_readme( categories = ["Datacenter"] elif model.lower() in ["pointpainting"]: categories = ["Edge"] - elif ( - "dlrm" in model.lower() - or "llama2" in model.lower() - or "mixtral" in model.lower() - or "llama3" in model.lower() - ): + elif model.lower() in ["bert-99.9", "dlrm", "llama2", "mixtral", "llama3", "deepseek-r1"]: categories = ["Datacenter"] else: categories = ["Edge", "Datacenter"] @@ -153,8 +154,12 @@ def mlperf_inference_implementation_readme( scenarios.append("MultiStream") if model.lower() in ["pointpainting"]: scenarios.remove("Offline") + if model.lower() in ["whisper"]: + scenarios.remove("SingleStream") elif category == "Datacenter": scenarios = ["Offline", "Server"] + if model.lower() in ["whisper"]: + scenarios.remove("Server") if fixed_scenarios: scenarios = [ scenario for scenario in scenarios if scenario in fixed_scenarios] @@ -164,7 +169,7 @@ def mlperf_inference_implementation_readme( cur_space = pre_space + " " scenarios_string = ", ".join(scenarios) - content += f"{cur_space}### {category} category \n\n{cur_space} In the {category.lower()} category, {model} has {scenarios_string} scenarios and all the scenarios are mandatory for a closed division submission.\n\n" + content += f"""{cur_space}### {category} category \n\n{cur_space} In the {category.lower()} category, {model} has {scenarios_string} 
scenario{"s" if len(scenarios)>1 else ""} and {"all of the scenarios are" if len(scenarios)>1 else "the scenario is"} mandatory for a closed division submission.\n\n""" for framework in frameworks: cur_space1 = cur_space + " " @@ -539,7 +544,7 @@ def get_common_info(spaces, implementation, model): info += f"{pre_space} - In valid execution mode, the query count for performance mode can be adjusted using `--env.MLC_MLPERF_LOADGEN_QUERY_COUNT=`.\n\n" if implementation.lower() == "reference" and model.lower() not in [ - "pointpainting"]: + "pointpainting", "llama3_1-8b", "deepseek-r1", "whisper"]: info += f"{pre_space} - `_r4.1-dev` could also be given instead of `_r5.0-dev` if you want to run the benchmark with the MLPerf version being 4.1.\n\n" if model == "rgat": @@ -568,6 +573,10 @@ def get_docker_info(spaces, model, implementation, elif "llama3" in model.lower(): info += f"{pre_space} - `--env.MLC_MLPERF_MODEL_LLAMA3_DOWNLOAD_TO_HOST=yes` option can be used to download the model on the host so that it can be reused across different container lanuches. \n\n" info += f"{pre_space} - `--env.MLC_MLPERF_DATASET_LLAMA3_DOWNLOAD_TO_HOST=yes` option can be used to download the dataset on the host so that it can be reused across different container lanuches. \n\n" + elif model.lower() in ["llama3_1-8b", "whisper", "deepseek-r1"]: + info += f"{pre_space} - `--env.MLC_USE_ML_MODEL_FROM_HOST=yes` option can be used to download the model on the host so that it can be reused across different container lanuches. \n\n" + info += f"{pre_space} - `--env.MLC_USE_DATASET_FROM_HOST=yes` option can be used to download the dataset on the host so that it can be reused across different container lanuches. \n\n" + if implementation.lower() == "nvidia": info += f"{pre_space} - Default batch size is assigned based on [GPU memory](https://github.com/mlcommons/cm4mlops/blob/dd0c35856969c68945524d5c80414c615f5fe42c/script/app-mlperf-inference-nvidia/_cm.yaml#L1129) or the [specified GPU](https://github.com/mlcommons/cm4mlops/blob/dd0c35856969c68945524d5c80414c615f5fe42c/script/app-mlperf-inference-nvidia/_cm.yaml#L1370). 
Please click more option for *docker launch* or *run command* to see how to specify the GPU name.\n\n" info += f"{pre_space} - When run with `--all_models=yes`, all the benchmark models of NVIDIA implementation can be executed within the same container.\n\n" diff --git a/mkdocs.yml b/mkdocs.yml index a0ac88ef98..e4396d2d53 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -46,11 +46,15 @@ nav: - GPT-J: benchmarks/language/gpt-j.md - LLAMA2-70B: benchmarks/language/llama2-70b.md - LLAMA3-405B: benchmarks/language/llama3_1-405b.md + - LLAMA3-8B: benchmarks/language/llama3_1-8b.md - MIXTRAL-8x7B: benchmarks/language/mixtral-8x7b.md + - DeepSeek-R1: benchmarks/language/deepseek-r1.md - Recommendation: - DLRM-v2: benchmarks/recommendation/dlrm-v2.md - Graph Neural Networks: - R-GAT: benchmarks/graph/rgat.md + - Speech to Text: + - Whisper: benchmarks/speech_to_text/whisper.md - Install MLCFlow: - install/index.md - Submission: diff --git a/speech2text/README.md b/speech2text/README.md index 8747cc8557..be325975a2 100644 --- a/speech2text/README.md +++ b/speech2text/README.md @@ -1,5 +1,11 @@ # Reference Implementation for whisper-large-v3 +## Automated command to run the benchmark via MLFlow + +Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/language/whisper/) for an automated way to run this benchmark across different available implementations and do an end-to-end submission with or without docker. + +You can also do pip install mlc-scripts and then use `mlcr` commands for downloading the model and datasets using the commands given in the later sections. + ## Prepare environment ### Docker @@ -92,6 +98,15 @@ VLLM_TARGET_DEVICE=cpu pip install --break-system-packages . --no-build-isolatio ## Get Model ### MLCommons Download +**Official Model download using MLCFlow Automation** + +You can download the model automatically via the below command +``` +mlcr get,ml-model,whisper,_rclone,_mlc --outdirname= -j +``` + +**Official Model download using native method** + You can use Rclone to download the preprocessed dataset from a Cloudflare R2 bucket. To run Rclone on Windows, you can download the executable [here](https://rclone.org/install/#windows). @@ -111,6 +126,15 @@ rclone copy mlc-inference:mlcommons-inference-wg-public/Whisper/model/ ./ -P ### External Download (Not recommended for official submission) +**External Model download using MLCFlow Automation** + +You can download the model automatically via the below command +``` +TBD +``` + +**External Model download using native method** + + Requires Git Large Files Storage ```bash export CHECKPOINT_PATH=whisper-large-v3 @@ -127,6 +151,13 @@ We use dev-clean and dev-other splits, which are approximately 10 hours. ### Preprocessed +**Using MLCFlow Automation** +``` +mlcr get,dataset,whisper,_preprocessed,_mlc,_rclone --outdirname= -j +``` + +**Native method** + Download and install rclone as decribed in the [MLCommons Download section](#mlcommons-download) You can then navigate in the terminal to your desired download directory and run the following command to download the dataset: @@ -136,6 +167,13 @@ rclone copy mlc-inference:mlcommons-inference-wg-public/Whisper/dataset/ ./ -P ### Unprocessed +**Using MLCFlow Automation** +``` +mlcr get,dataset,whisper,_unprocessed --outdirname= -j +``` + +**Native method** + If your are using docker, we provide a script to download and preprocess the dataset from the source. 
You can download it by running: ```bash ./download_dataset.sh @@ -227,6 +265,14 @@ python reference_mlperf.py \ ### Run Accuracy +**Evaluate Accuracy using MLCFlow Automation** + +``` +mlcr run,accuracy,mlperf,_librispeech_whisper,_int32 --result_dir= +``` + +**Evaluate Accuracy using native method** + ```bash python reference_mlperf.py \ --dataset_dir ${DATA_DIR} \ From 3a8595acd31a5f5a401f52d35449374e9d4cd281 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 5 Jul 2025 20:27:26 +0000 Subject: [PATCH 19/35] [Automated Commit] Format Codebase --- main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.py b/main.py index 1e5a20bb96..5b3062fa32 100755 --- a/main.py +++ b/main.py @@ -544,7 +544,7 @@ def get_common_info(spaces, implementation, model): info += f"{pre_space} - In valid execution mode, the query count for performance mode can be adjusted using `--env.MLC_MLPERF_LOADGEN_QUERY_COUNT=`.\n\n" if implementation.lower() == "reference" and model.lower() not in [ - "pointpainting", "llama3_1-8b", "deepseek-r1", "whisper"]: + "pointpainting", "llama3_1-8b", "deepseek-r1", "whisper"]: info += f"{pre_space} - `_r4.1-dev` could also be given instead of `_r5.0-dev` if you want to run the benchmark with the MLPerf version being 4.1.\n\n" if model == "rgat": @@ -576,7 +576,7 @@ def get_docker_info(spaces, model, implementation, elif model.lower() in ["llama3_1-8b", "whisper", "deepseek-r1"]: info += f"{pre_space} - `--env.MLC_USE_ML_MODEL_FROM_HOST=yes` option can be used to download the model on the host so that it can be reused across different container lanuches. \n\n" info += f"{pre_space} - `--env.MLC_USE_DATASET_FROM_HOST=yes` option can be used to download the dataset on the host so that it can be reused across different container lanuches. \n\n" - + if implementation.lower() == "nvidia": info += f"{pre_space} - Default batch size is assigned based on [GPU memory](https://github.com/mlcommons/cm4mlops/blob/dd0c35856969c68945524d5c80414c615f5fe42c/script/app-mlperf-inference-nvidia/_cm.yaml#L1129) or the [specified GPU](https://github.com/mlcommons/cm4mlops/blob/dd0c35856969c68945524d5c80414c615f5fe42c/script/app-mlperf-inference-nvidia/_cm.yaml#L1370). 
Please click more option for *docker launch* or *run command* to see how to specify the GPU name.\n\n" info += f"{pre_space} - When run with `--all_models=yes`, all the benchmark models of NVIDIA implementation can be executed within the same container.\n\n" From 906c0fcb0e43f3e041011baf9a8fb42ade41f7ec Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 15 Jul 2025 22:11:56 +0000 Subject: [PATCH 20/35] [Automated Commit] Format Codebase --- tools/submission/submission_checker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index feeb0e41ae..291f07476a 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -498,7 +498,7 @@ "rgat": ("acc", 0.7286 * 0.99), "pointpainting": ("mAP", 0.5425 * 0.999), "deepseek-r1": ("exact_match", 0.99 * 81.6773, "TOKENS_PER_SAMPLE", 0.9 * 4043.449), - "whisper": ("ACCURACY", (100.0-2.0671) * 0.99), + "whisper": ("ACCURACY", (100.0 - 2.0671) * 0.99), }, "accuracy-upper-limit": { "stable-diffusion-xl": ( From 024e4aded9afdfffd33aeba0271f20f4acc393f7 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Thu, 17 Jul 2025 14:18:54 +0530 Subject: [PATCH 21/35] Update main.py --- main.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/main.py b/main.py index 5b3062fa32..e21582a4af 100755 --- a/main.py +++ b/main.py @@ -66,6 +66,8 @@ def mlperf_inference_implementation_readme( frameworks = ["Pytorch"] elif implementation == "nvidia": + if model in ["retinanet", "resnet50", "3d-unet-99", "3d-unet-99.9]: + code_version = "r5.1-dev" if model in ["mixtral-8x7b"]: return pre_space + " WIP" devices = ["CUDA"] From 9e2c7a982b8d7eed48c508ef6dc0659c6c196db9 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 17 Jul 2025 08:50:04 +0000 Subject: [PATCH 22/35] [Automated Commit] Format Codebase --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index e21582a4af..e22e9f0421 100755 --- a/main.py +++ b/main.py @@ -66,7 +66,7 @@ def mlperf_inference_implementation_readme( frameworks = ["Pytorch"] elif implementation == "nvidia": - if model in ["retinanet", "resnet50", "3d-unet-99", "3d-unet-99.9]: + if model in ["retinanet", "resnet50", "3d-unet-99", "3d - unet - 99.9]: code_version = "r5.1-dev" if model in ["mixtral-8x7b"]: return pre_space + " WIP" From 0928e11461625e39b9380f8505a3d8b02a7d06e5 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Thu, 17 Jul 2025 14:20:51 +0530 Subject: [PATCH 23/35] updating for 5.1-dev (inference doc) --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index e22e9f0421..a4c3c556e8 100755 --- a/main.py +++ b/main.py @@ -66,7 +66,7 @@ def mlperf_inference_implementation_readme( frameworks = ["Pytorch"] elif implementation == "nvidia": - if model in ["retinanet", "resnet50", "3d-unet-99", "3d - unet - 99.9]: + if model in ["retinanet", "resnet50", "3d-unet-99", "3d - unet - 99.9"]: code_version = "r5.1-dev" if model in ["mixtral-8x7b"]: return pre_space + " WIP" From 7069a9e58be3f40d9acbc08942c31a2840e893c4 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 17 Jul 2025 08:51:14 +0000 Subject: [PATCH 24/35] [Automated Commit] Format Codebase --- main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index a4c3c556e8..b4d1f5e74d 100755 --- a/main.py +++ 
b/main.py @@ -66,7 +66,8 @@ def mlperf_inference_implementation_readme( frameworks = ["Pytorch"] elif implementation == "nvidia": - if model in ["retinanet", "resnet50", "3d-unet-99", "3d - unet - 99.9"]: + if model in ["retinanet", "resnet50", + "3d-unet-99", "3d - unet - 99.9"]: code_version = "r5.1-dev" if model in ["mixtral-8x7b"]: return pre_space + " WIP" From 253854ff4f10d73a94f3a071dcaf5fa2e9f02a5b Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Thu, 17 Jul 2025 14:23:03 +0530 Subject: [PATCH 25/35] fix typo --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index b4d1f5e74d..7cdec722ba 100755 --- a/main.py +++ b/main.py @@ -67,7 +67,7 @@ def mlperf_inference_implementation_readme( elif implementation == "nvidia": if model in ["retinanet", "resnet50", - "3d-unet-99", "3d - unet - 99.9"]: + "3d-unet-99", "3d-unet-99.9"]: code_version = "r5.1-dev" if model in ["mixtral-8x7b"]: return pre_space + " WIP" From 371d5835c94c187f5c2af69f9509dbf1b91f4e07 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 21 Jul 2025 13:52:33 +0000 Subject: [PATCH 26/35] [Automated Commit] Format Codebase --- compliance/nvidia/TEST06/run_verification.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/compliance/nvidia/TEST06/run_verification.py b/compliance/nvidia/TEST06/run_verification.py index 70e16f5266..cae64b3f47 100644 --- a/compliance/nvidia/TEST06/run_verification.py +++ b/compliance/nvidia/TEST06/run_verification.py @@ -53,7 +53,12 @@ def get_args(): "--scenario", "-s", required=True, - choices=["Offline", "Server", "Interactive", "SingleStream", "MultiStream"], + choices=[ + "Offline", + "Server", + "Interactive", + "SingleStream", + "MultiStream"], ) args = parser.parse_args() return args From ea86fc011fdeff2f0a1dd462010b4ea16d68c8c5 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Wed, 23 Jul 2025 11:01:30 +0530 Subject: [PATCH 27/35] Update main.py --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index 7cdec722ba..419b76a6da 100755 --- a/main.py +++ b/main.py @@ -190,7 +190,7 @@ def mlperf_inference_implementation_readme( content += f"{cur_space1}=== \"{device}\"\n" content += f"{cur_space2}##### {device} device\n\n" - # minimum system requirements + # get minimum system requirements content += get_min_system_requirements( cur_space2, model, implementation, device From 0828e9ca31c692174d23e1f8baf8eca860381c90 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 24 Jul 2025 18:35:56 +0000 Subject: [PATCH 28/35] [Automated Commit] Format Codebase --- language/deepseek-r1/eval_accuracy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/language/deepseek-r1/eval_accuracy.py b/language/deepseek-r1/eval_accuracy.py index bf537e9d3a..9c103fdcba 100644 --- a/language/deepseek-r1/eval_accuracy.py +++ b/language/deepseek-r1/eval_accuracy.py @@ -773,7 +773,7 @@ def print_evaluation_results(df_evaluated: pd.DataFrame, 'tokens_per_sample': mean_output_len, 'num-samples': len(df_evaluated), } - + print("\nResults\n") print(results) From f8c344fa5f5b312908a888be34f5f3fef615629e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 29 Jul 2025 15:52:53 +0000 Subject: [PATCH 29/35] [Automated Commit] Format Codebase --- language/llama3.1-8b/download_cnndm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/language/llama3.1-8b/download_cnndm.py b/language/llama3.1-8b/download_cnndm.py index d8694be720..90c9ad8d7a 100644 --- a/language/llama3.1-8b/download_cnndm.py +++ b/language/llama3.1-8b/download_cnndm.py @@ -100,8 +100,8 @@ def preprocess_function(sample, padding="max_length"): # create list of samples inputs = [] - #print(f"Num samples: {len(sample[text_column])}") - #for i in range(0, len(sample[text_column])): + # print(f"Num samples: {len(sample[text_column])}") + # for i in range(0, len(sample[text_column])): x = dict() x["instruction"] = instruction_template x["input"] = sample[text_column] @@ -109,7 +109,7 @@ def preprocess_function(sample, padding="max_length"): instruction_template[instruction].format_map(x) ) x["output"] = sample[summary_column] - #inputs.append(x) + # inputs.append(x) model_inputs = dict() model_inputs["text"] = x From d0a2ed48ed75803abf6788d742a1dfd4444885e3 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Wed, 30 Jul 2025 19:39:47 +0530 Subject: [PATCH 30/35] Doc updates (#2292) * improve submission doc * Update index.md * Fix for model and dataset download commands * update submission doc * [Automated Commit] Format Codebase * Update index.md * r2_downloader -> r2-downloader * Update multithreading information about SDXL * [Automated Commit] Format Codebase * .lower() for consistency * [Automated Commit] Format Codebase * updation for llama3_1-8b edge * [Automated Commit] Format Codebase --------- Co-authored-by: github-actions[bot] Co-authored-by: Arjun Suresh --- .../image_classification/get-resnet50-data.md | 6 +++--- .../language/get-deepseek-r1-data.md | 18 +++++++++--------- .../benchmarks/language/get-llama2-70b-data.md | 14 ++++++++++++-- .../language/get-llama3_1-405b-data.md | 8 ++++++++ .../language/get-llama3_1-8b-data.md | 17 +++++++++-------- .../object_detection/get-retinanet-data.md | 6 +++--- .../speech_to_text/get-whisper-data.md | 4 ++-- docs/submission/index.md | 15 ++++++++++++++- main.py | 15 +++++++++------ 9 files changed, 69 insertions(+), 34 deletions(-) diff --git a/docs/benchmarks/image_classification/get-resnet50-data.md b/docs/benchmarks/image_classification/get-resnet50-data.md index 9ecd25c1a5..64e17c8246 100644 --- a/docs/benchmarks/image_classification/get-resnet50-data.md +++ b/docs/benchmarks/image_classification/get-resnet50-data.md @@ -15,7 +15,7 @@ The benchmark implementation run command will automatically download the validat ### Get Validation Dataset ``` - mlcr get,dataset,imagenet,validation -j + mlcr get,dataset,imagenet,validation,_full -j ``` === "Calibration" ResNet50 calibration dataset consist of 500 images selected from the Imagenet 2012 validation dataset. There are 2 alternative options for the calibration dataset. @@ -32,7 +32,7 @@ The benchmark implementation run command will automatically download the validat ### Get ResNet50 preprocessed dataset ``` - mlcr get,dataset,image-classification,imagenet,preprocessed,_pytorch -j + mlcr get,dataset,image-classification,imagenet,preprocessed,_pytorch,_full-j ``` - `--outdirname=` could be provided to download the dataset to a specific location. @@ -52,7 +52,7 @@ Get the Official MLPerf ResNet50 Model ### Onnx ``` - mlcr get,ml-model,resnet50,_onnx -j + mlcr get,ml-model,resnet50,image-classification,_onnx -j ``` - `--outdirname=` could be provided to download the model to a specific location. 
\ No newline at end of file diff --git a/docs/benchmarks/language/get-deepseek-r1-data.md b/docs/benchmarks/language/get-deepseek-r1-data.md index 401c4d27bc..1dbce5cdf7 100644 --- a/docs/benchmarks/language/get-deepseek-r1-data.md +++ b/docs/benchmarks/language/get-deepseek-r1-data.md @@ -11,14 +11,14 @@ The benchmark implementation run command will automatically download the validat === "Validation" - ### Get Validation Dataset - ``` - mlcr get,preprocessed,dataset,deepseek-r1,_validation,_mlc,_rclone --outdirname= -j - ``` -=== "Calibration" + ### Get Validation Dataset + ``` + mlcr get,preprocessed,dataset,deepseek-r1,_validation,_mlc,_r2-downloader --outdirname= -j + ``` - ### Get Calibration Dataset - ``` - mlcr get,preprocessed,dataset,deepseek-r1,_calibration,_mlc,_rclone --outdirname= -j - ``` \ No newline at end of file +=== "Calibration" + + ### Get Calibration Dataset + ``` + mlcr get,preprocessed,dataset,deepseek-r1,_calibration,_mlc,_r2-downloader --outdirname= -j \ No newline at end of file diff --git a/docs/benchmarks/language/get-llama2-70b-data.md b/docs/benchmarks/language/get-llama2-70b-data.md index ce7cd996eb..6c22d3658d 100644 --- a/docs/benchmarks/language/get-llama2-70b-data.md +++ b/docs/benchmarks/language/get-llama2-70b-data.md @@ -16,7 +16,7 @@ The benchmark implementation run command will automatically download the validat ### Get Preprocessed Validation Dataset ``` - mlcr get,dataset,preprocessed,openorca,_validation -j + mlcr get,dataset,preprocessed,openorca,_validation,_mlcommons -j ``` === "Calibration" @@ -56,7 +56,17 @@ The benchmark implementation run command will automatically download the require ### Get the Official MLPerf LLAMA2-70B model from MLCOMMONS Google Drive ``` - mlcr get,ml-model,llama2-70b,_pytorch -j + mlcr get,ml-model,llama2-70b,_rclone,_mlc,_70b -j + ``` + + === "From MLCOMMONS Cloudflare R2" + + > **Note:** One has to accept the [MLCommons Llama 2 License Confidentiality Notice](https://llama2.mlcommons.org/) to access the model files in MLCOMMONS Cloudflare R2. + + ### Get the Official MLPerf LLAMA2-70B model from MLCOMMONS Cloudflare R2 + + ``` + mlcr get,ml-model,llama2-70b,_mlc,_r2-downloader,_70b -j ``` === "From Hugging Face repo" diff --git a/docs/benchmarks/language/get-llama3_1-405b-data.md b/docs/benchmarks/language/get-llama3_1-405b-data.md index ad05ca8610..3257cd17b0 100644 --- a/docs/benchmarks/language/get-llama3_1-405b-data.md +++ b/docs/benchmarks/language/get-llama3_1-405b-data.md @@ -38,6 +38,14 @@ The benchmark implementation run command will automatically download the require ``` mlcr get,ml-model,llama3 -j ``` + + === "From Cloudflare R2" + + > **Note:** One has to accept the [MLCommons Llama 3.1 License Confidentiality Notice](http://llama3-1.mlcommons.org/) to access the model files in MLCOMMONS Cloudflare R2. + + ### Get the Official MLPerf LLAMA3.1-405B model from MLCOMMONS Cloudflare R2 + ``` + mlcr get,ml-model,llama3,_mlc,_405b,_r2-downloader --outdirname= -j === "From Hugging Face repo" diff --git a/docs/benchmarks/language/get-llama3_1-8b-data.md b/docs/benchmarks/language/get-llama3_1-8b-data.md index e24cc37d44..26b3cf11d1 100644 --- a/docs/benchmarks/language/get-llama3_1-8b-data.md +++ b/docs/benchmarks/language/get-llama3_1-8b-data.md @@ -10,26 +10,27 @@ hide: The benchmark implementation run command will automatically download the validation and calibration datasets and do the necessary preprocessing. In case you want to download only the datasets, you can use the below commands.
=== "Validation" - + === "Full dataset (Datacenter)" ### Get Validation Dataset ``` - mlcr get,dataset,cnndm,_validation,_datacenter,_llama3,_mlc,_rclone --outdirname= -j + mlcr get,dataset,cnndm,_validation,_datacenter,_llama3,_mlc,_r2-downloader --outdirname= -j ``` - + === "5000 samples (Edge)" ### Get Validation Dataset ``` - mlcr get,dataset,cnndm,_validation,_edge,_llama3,_mlc,_rclone --outdirname= -j + mlcr get,dataset,cnndm,_validation,_edge,_llama3,_mlc,_r2-downloader --outdirname= -j ``` === "Calibration" - + ``` + ### Get Calibration Dataset ``` - mlcr get,dataset,cnndm,_calibration,_llama3,_mlc,_rclone --outdirname= -j + mlcr get,dataset,cnndm,_calibration,_llama3,_mlc,_r2-downloader --outdirname= -j ``` - `--outdirname=` could be provided to download the dataset to a specific location. @@ -39,13 +40,13 @@ The benchmark implementation run command will automatically download the require === "Pytorch" - === "From MLCOMMONS Google Drive" + === "From Cloudfare R2" > **Note:** One has to accept the [MLCommons Llama 3.1 License Confidentiality Notice](http://llama3-1.mlcommons.org/) to access the model files in MLCOMMONS Google Drive. ### Get the Official MLPerf LLAMA3.1-405B model from MLCOMMONS Cloudfare R2 ``` - TBD + mlcr get,ml-model,llama3,_mlc,_8b,_r2-downloader --outdirname= -j ``` === "From Hugging Face repo" diff --git a/docs/benchmarks/object_detection/get-retinanet-data.md b/docs/benchmarks/object_detection/get-retinanet-data.md index 6127eed541..00c5bf8451 100644 --- a/docs/benchmarks/object_detection/get-retinanet-data.md +++ b/docs/benchmarks/object_detection/get-retinanet-data.md @@ -16,7 +16,7 @@ The benchmark implementation run command will automatically download the validat ### Get Validation Dataset ``` - mlcr get,dataset,openimages,_validation -j + mlcr get,dataset,openimages,original,_validation -j ``` === "Calibration" @@ -24,14 +24,14 @@ The benchmark implementation run command will automatically download the validat ### Get OpenImages Calibration dataset ``` - mlcr get,dataset,openimages,_calibration -j + mlcr get,dataset,openimages,original,_calibration -j ``` === "Preprocessed" ### Get Preprocessed OpenImages dataset ``` - get,dataset,object-detection,open-images,openimages,preprocessed,_validation -j + mlcr get,dataset,object-detection,open-images,openimages,preprocessed,_validation -j ``` - `--outdirname=` could be provided to download the dataset to a specific location. diff --git a/docs/benchmarks/speech_to_text/get-whisper-data.md b/docs/benchmarks/speech_to_text/get-whisper-data.md index 9bc97ad9a0..ed9e3b02b0 100644 --- a/docs/benchmarks/speech_to_text/get-whisper-data.md +++ b/docs/benchmarks/speech_to_text/get-whisper-data.md @@ -15,7 +15,7 @@ The benchmark implementation run command will automatically download the validat ### Get Preprocessed Validation Dataset ``` - mlcr get,dataset,whisper,_preprocessed,_mlc,_rclone --outdirname= -j + mlcr get,dataset,whisper,_preprocessed,_mlc,_r2-downloader --outdirname= -j ``` === "Unprocessed" @@ -34,7 +34,7 @@ The benchmark implementation run command will automatically download the require ### Get the Official MLPerf Whisper model from MLCOMMONS Cloudflare R2 ``` - mlcr get,ml-model,whisper,_rclone,_mlc s-j + mlcr get,ml-model,whisper,_r2-downloader,_mlc -j ``` - `--outdirname=` could be provided to download the model to a specific location. 
\ No newline at end of file diff --git a/docs/submission/index.md b/docs/submission/index.md index 079a513854..56f872fc6e 100644 --- a/docs/submission/index.md +++ b/docs/submission/index.md @@ -158,6 +158,18 @@ If there are multiple systems where MLPerf results are collected, the same proce --commit_message="Results on added by " \ --quiet ``` + + The path to the locally synced submission directory from the output below can be used in the next step by passing it to the `--submission_dir` argument. +
+ Click to see the sample output + ``` + [2025-07-23 16:36:56,399 module.py:2197 INFO] - + + Path to the locally synced submission directory: mysubmissions/mlperf_submission + + + ``` +
```mermaid flowchart LR @@ -193,7 +205,8 @@ Once you have all the results on the system, you can upload them to the MLCommon mlcr run,mlperf,submission,checker,inference \ --submitter_id=<> \ --submission_dir= - ``` + ``` + === "via Browser" You can do the following command to generate the final submission tar file and then upload to the [MLCommons Submission UI](https://submissions-ui.mlcommons.org/submission). ``` diff --git a/main.py b/main.py index 419b76a6da..e4782bf3ac 100755 --- a/main.py +++ b/main.py @@ -45,7 +45,8 @@ def mlperf_inference_implementation_readme( if "99.9" not in model and implementation_tips: content += f"\n{pre_space}!!! tip\n\n" content += f"{pre_space} - MLCommons reference implementations are only meant to provide a rules compliant reference implementation for the submitters and in most cases are not best performing. If you want to benchmark any system, it is advisable to use the vendor MLPerf implementation for that system like Nvidia, Intel etc.\n\n" - + if model.lower() in ["sdxl"]: + content += f"\n{pre_space}> **Note:** {model.upper()} reference implementation does not support multithreading.\n\n" if not devices: devices = ["CPU", "CUDA", "ROCm"] @@ -139,10 +140,10 @@ def mlperf_inference_implementation_readme( categories = ["Datacenter"] elif model.lower() in ["pointpainting"]: categories = ["Edge"] - elif model.lower() in ["bert-99.9", "dlrm", "llama2", "mixtral", "llama3", "deepseek-r1"]: + elif model.lower() in ["bert-99.9", "dlrm", "llama2", "mixtral", "llama3_1-405b-99.9", "llama3_1-405b-99", "deepseek-r1"]: categories = ["Datacenter"] else: - categories = ["Edge", "Datacenter"] + categories = ["Datacenter", "Edge"] # model name content += f"{pre_space}{model.upper()}\n\n" @@ -159,6 +160,8 @@ def mlperf_inference_implementation_readme( scenarios.remove("Offline") if model.lower() in ["whisper"]: scenarios.remove("SingleStream") + if model.lower() == "llama3_1-8b": + model = "llama3_1-8b-edge" elif category == "Datacenter": scenarios = ["Offline", "Server"] if model.lower() in ["whisper"]: @@ -547,7 +550,7 @@ def get_common_info(spaces, implementation, model): info += f"{pre_space} - In valid execution mode, the query count for performance mode can be adjusted using `--env.MLC_MLPERF_LOADGEN_QUERY_COUNT=`.\n\n" if implementation.lower() == "reference" and model.lower() not in [ - "pointpainting", "llama3_1-8b", "deepseek-r1", "whisper"]: + "pointpainting", "llama3_1-8b", "llama3_1-8b-edge", "deepseek-r1", "whisper"]: info += f"{pre_space} - `_r4.1-dev` could also be given instead of `_r5.0-dev` if you want to run the benchmark with the MLPerf version being 4.1.\n\n" if model == "rgat": @@ -573,10 +576,10 @@ def get_docker_info(spaces, model, implementation, if model == "sdxl": info += f"{pre_space} - `--env.MLC_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes` option can be used to download the model on the host so that it can be reused across different container lanuches. \n\n" - elif "llama3" in model.lower(): + elif "llama3_1-405b" in model.lower(): info += f"{pre_space} - `--env.MLC_MLPERF_MODEL_LLAMA3_DOWNLOAD_TO_HOST=yes` option can be used to download the model on the host so that it can be reused across different container lanuches. \n\n" info += f"{pre_space} - `--env.MLC_MLPERF_DATASET_LLAMA3_DOWNLOAD_TO_HOST=yes` option can be used to download the dataset on the host so that it can be reused across different container lanuches. 
\n\n" - elif model.lower() in ["llama3_1-8b", "whisper", "deepseek-r1"]: + elif model.lower() in ["llama3_1-8b", "llama3_1-8b-edge", "whisper", "deepseek-r1"]: info += f"{pre_space} - `--env.MLC_USE_ML_MODEL_FROM_HOST=yes` option can be used to download the model on the host so that it can be reused across different container lanuches. \n\n" info += f"{pre_space} - `--env.MLC_USE_DATASET_FROM_HOST=yes` option can be used to download the dataset on the host so that it can be reused across different container lanuches. \n\n" From 23bd06243d9232728f185bfe39411ec977764712 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Thu, 31 Jul 2025 13:15:03 +0530 Subject: [PATCH 31/35] Add quiet flags to MLC commands (#2309) --- docs/submission/index.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/submission/index.md b/docs/submission/index.md index 56f872fc6e..7b4ab585f5 100644 --- a/docs/submission/index.md +++ b/docs/submission/index.md @@ -204,7 +204,7 @@ Once you have all the results on the system, you can upload them to the MLCommon ``` mlcr run,mlperf,submission,checker,inference \ --submitter_id=<> \ - --submission_dir= + --submission_dir= --quiet ``` === "via Browser" @@ -213,7 +213,7 @@ Once you have all the results on the system, you can upload them to the MLCommon mlcr run,mlperf,submission,checker,inference \ --submission_dir= \ --tar=yes \ - --submission_tar_file=mysubmission.tar.gz + --submission_tar_file=mysubmission.tar.gz --quiet ``` ```mermaid From 6ec49291d40cf427f5c45428e952db6e6349c9c0 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Thu, 31 Jul 2025 23:27:58 +0530 Subject: [PATCH 32/35] Improve docs - submission generation (#2311) --- docs/submission/index.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/submission/index.md b/docs/submission/index.md index 7b4ab585f5..f920bbfa11 100644 --- a/docs/submission/index.md +++ b/docs/submission/index.md @@ -159,7 +159,7 @@ If there are multiple systems where MLPerf results are collected, the same proce --quiet ``` - The path to the locally synced submission directory from the output below can be used in the next step by passing it to the `--submission_dir` argument. + > **Note:** The path to the locally synced submission directory from the output below can be used in the next step by passing it to the `--submission_dir` argument.
Click to see the sample output ``` @@ -204,7 +204,7 @@ Once you have all the results on the system, you can upload them to the MLCommon ``` mlcr run,mlperf,submission,checker,inference \ --submitter_id=<> \ - --submission_dir= --quiet + --submission_dir= --quiet ``` === "via Browser" From ff856b8ff204395338432d7c823f677644c2ffe2 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 31 Jul 2025 22:04:42 +0000 Subject: [PATCH 33/35] [Automated Commit] Format Codebase --- tools/submission/submission_checker.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index f124b808c5..7a0c22c902 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -1487,7 +1487,8 @@ def check_accuracy_dir(config, model, path, verbose): def extra_check_llm(mlperf_log, scenario, model): if mlperf_log["requested_use_token_latencies"]: if scenario not in ["Server", "Interactive"]: - # For offline, singlestream and multistream no further checks are necessary + # For offline, singlestream and multistream no further checks are + # necessary return True else: limits = LLM_LATENCY_LIMITS[model][scenario] @@ -1887,7 +1888,7 @@ def get_power_metric(config, scenario_fixed, log_path, is_valid, res): samples_per_query = 8 if (scenario_fixed in ["MultiStream"] - ) and scenario in ["SingleStream"]: + ) and scenario in ["SingleStream"]: power_metric = ( avg_power * power_duration * samples_per_query * 1000 / num_queries ) From bf1469eccc454c3e74733740ad1be0653d8959d6 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 4 Nov 2025 22:11:55 +0000 Subject: [PATCH 34/35] [Automated Commit] Format Codebase --- speech2text/accuracy_eval.py | 4 ++-- speech2text/reference_SUT.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/speech2text/accuracy_eval.py b/speech2text/accuracy_eval.py index eb6cc53299..c356ab6398 100644 --- a/speech2text/accuracy_eval.py +++ b/speech2text/accuracy_eval.py @@ -57,12 +57,12 @@ "x", "y", "z", - "'", + "'", "0", "1", "2", "3", - "4", + "4", "5", "6", "7", diff --git a/speech2text/reference_SUT.py b/speech2text/reference_SUT.py index 63d491a00f..0b2f02c490 100644 --- a/speech2text/reference_SUT.py +++ b/speech2text/reference_SUT.py @@ -90,12 +90,12 @@ def get_start_cores(start_cores="0"): "x", "y", "z", - "'", + "'", "0", "1", "2", "3", - "4", + "4", "5", "6", "7", From d9f1bc1d28939f3d6b58678aad7184539f7c072c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 25 Nov 2025 16:56:38 +0000 Subject: [PATCH 35/35] [Automated Commit] Format Codebase --- tools/submission/preprocess_submission.py | 19 +++++++++++++------ tools/submission/submission_checker.py | 5 ++--- tools/submission/truncate_accuracy_log.py | 3 ++- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/tools/submission/preprocess_submission.py b/tools/submission/preprocess_submission.py index 34043208c8..df3d748c5c 100644 --- a/tools/submission/preprocess_submission.py +++ b/tools/submission/preprocess_submission.py @@ -99,7 +99,7 @@ def delete_empty_dirs(src): return False -def copy_submission_dir(src, dst, filter_submitter, keep_structure = True): +def copy_submission_dir(src, dst, filter_submitter, keep_structure=True): """ Copies the submission tree to output directory for processing """ @@ -116,15 +116,18 @@ def copy_submission_dir(src, dst, filter_submitter, keep_structure = True): ) else: for dir in os.listdir(os.path.join(src, 
division, submitter)): - if os.path.isdir(os.path.join(src, division, submitter, dir)): - target_dir = "results" if dir in ["compliance", "measurements"] else dir + if os.path.isdir(os.path.join( + src, division, submitter, dir)): + target_dir = "results" if dir in [ + "compliance", "measurements"] else dir shutil.copytree( os.path.join(src, division, submitter, dir), os.path.join(dst, division, submitter, target_dir), - dirs_exist_ok = True + dirs_exist_ok=True ) for file in os.listdir(os.path.join(src, division, submitter)): - if os.path.isfile(os.path.join(src, division, submitter, file)): + if os.path.isfile(os.path.join( + src, division, submitter, file)): shutil.copyfile( os.path.join(src, division, submitter, file), os.path.join(dst, division, submitter, file) @@ -561,7 +564,11 @@ def main(): log.error(f"output directory {args.output} already exists") sys.exit(1) os.makedirs(args.output) - copy_submission_dir(args.input, args.output, args.submitter, args.keep_structure) + copy_submission_dir( + args.input, + args.output, + args.submitter, + args.keep_structure) src_dir = args.output config = checker.Config( diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index 335485c33c..5c2801bacb 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -1061,7 +1061,7 @@ def set_type(self, submission_type): self.optional = self.base["optional-scenarios-datacenter-edge"] else: raise ValueError("invalid system type") - + def skip_calibration(self): return self.skip_calibration_check or self.version in ["v5.0"] @@ -1893,7 +1893,7 @@ def get_power_metric(config, scenario_fixed, log_path, is_valid, res): samples_per_query = 8 if (scenario_fixed in ["MultiStream"] - ) and scenario in ["SingleStream"]: + ) and scenario in ["SingleStream"]: power_metric = ( avg_power * power_duration * samples_per_query * 1000 / num_queries ) @@ -3040,7 +3040,6 @@ def check_measurement_dir( end = len(".json") break - weight_data_types = None if system_file: with open(os.path.join(measurement_dir, system_file), "r") as f: diff --git a/tools/submission/truncate_accuracy_log.py b/tools/submission/truncate_accuracy_log.py index 6c1267fdf8..87bba5ab98 100755 --- a/tools/submission/truncate_accuracy_log.py +++ b/tools/submission/truncate_accuracy_log.py @@ -172,7 +172,8 @@ def truncate_results_dir(filter_submitter, backup, scenarios_to_skip): acc_path, "accuracy.txt") # only TEST01 has an accuracy log - if str(test).startswith("TEST") and test != "TEST01": + if str(test).startswith( + "TEST") and test != "TEST01": continue if not os.path.exists(acc_log): log.error("%s missing", acc_log)
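PATCH 35 reworks `copy_submission_dir` in `preprocess_submission.py` so that, when the original layout is not kept, `compliance` and `measurements` trees are merged into `results` via `shutil.copytree(..., dirs_exist_ok=True)`. The snippet below is a simplified, self-contained sketch of that mapping under assumed paths and names; it is not the actual MLPerf tooling code.

```python
# Simplified sketch of the directory mapping shown in PATCH 35; the demo tree and
# function name are assumptions, while the compliance/measurements -> results merge
# and dirs_exist_ok=True mirror the patched logic.
import os
import shutil
import tempfile


def copy_submitter_dirs(src: str, dst: str, keep_structure: bool = False) -> None:
    for entry in os.listdir(src):
        src_path = os.path.join(src, entry)
        if not os.path.isdir(src_path):
            shutil.copyfile(src_path, os.path.join(dst, entry))
            continue
        # Fold compliance/ and measurements/ into results/ unless told otherwise.
        if not keep_structure and entry in ("compliance", "measurements"):
            target = "results"
        else:
            target = entry
        shutil.copytree(src_path, os.path.join(dst, target), dirs_exist_ok=True)


if __name__ == "__main__":
    src, dst = tempfile.mkdtemp(), tempfile.mkdtemp()
    for d in ("results", "compliance", "measurements"):
        os.makedirs(os.path.join(src, d, "system1"))
    copy_submitter_dirs(src, dst)
    print(sorted(os.listdir(dst)))  # expected: ['results']
```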