Merge branch 'mlcommons:master' into master

arjunsuresh authored Jul 16, 2024
2 parents ef51ae3 + c83565d commit 751b9fc
Showing 11 changed files with 102 additions and 22 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test-submission-checker.yml
@@ -32,4 +32,4 @@ jobs:
git clone https://github.com/mlcommons/inference_results_v4.0 --depth 1
- name: Test MLPerf inference submission checker
run: |
cm run script --tags=run,mlperf,inference,submission,checker --adr.inference-src.tags=_branch.${{ github.event.pull_request.head.ref }},_repo.${{ github.event.pull_request.head.repo.html_url }} --adr.inference-src.version=custom --input=`pwd`/inference_results_v4.0 --version=r4.0 --quiet
cm run script --tags=run,mlperf,inference,submission,checker --adr.inference-src.tags=_branch.${{ github.event.pull_request.head.ref }},_repo.${{ github.event.pull_request.head.repo.html_url }} --adr.inference-src.version=custom --input=`pwd`/inference_results_v4.0 --src_version=v4.0 --quiet
2 changes: 2 additions & 0 deletions compliance/nvidia/TEST04/README.md
@@ -25,6 +25,8 @@ This test is not applicable for the following benchmarks whose performance is de
5. retinanet (while retinanet input images are not variable, computation cost varies significantly due to the variance in qualified detections from the detector heads, which affect the NMS runtime)
6. gpt-j

**Warning:** For the stable diffusion benchmark, `min_query_count` is set to 500 to keep the runtime from growing too long. To pick up benchmark-specific configuration variables, load the audit configuration from your SUT code by calling `settings.FromConfig(audit_config, model_name, scenario)`. This cannot happen inside loadgen, because loadgen is agnostic to the model being tested.
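
Here is a minimal sketch of how a SUT harness could apply those overrides, assuming the audit config has been copied into the working directory as `audit.config` (the config paths, model name, and scenario strings are placeholders for your benchmark):

```python
import os
import mlperf_loadgen as lg

settings = lg.TestSettings()
# Base configuration first, then user overrides.
settings.FromConfig("mlperf.conf", "stable-diffusion-xl", "Offline")
settings.FromConfig("user.conf", "stable-diffusion-xl", "Offline")

# Apply the TEST04 audit overrides (e.g. min_query_count) last so they take effect.
audit_config = "audit.config"
if os.path.exists(audit_config):
    settings.FromConfig(audit_config, "stable-diffusion-xl", "Offline")
```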

## Scenarios

- This test is applicable for all scenarios.
5 changes: 5 additions & 0 deletions compliance/nvidia/TEST04/audit.config
@@ -18,3 +18,8 @@
*.Server.performance_issue_unique = 0
*.Server.performance_issue_same = 1
*.Server.performance_issue_same_index = 3
stable-diffusion-xl.Offline.min_query_count = 500
# You can optionally set the target qps so that the expected query count
# over the min duration matches min_query_count. Take your system's
# expected qps into account; the min duration for this test is 10 minutes.
# stable-diffusion-xl.Offline.target_qps = 0.75
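
For a rough sense of how those two settings interact, the target qps that would fit the 500 queries inside the 10-minute minimum duration is just the ratio of the two (illustrative arithmetic only, not a required value):

```python
# Offline qps needed for min_query_count queries to fit in the minimum duration.
min_query_count = 500
min_duration_s = 10 * 60  # 10 minutes

print(min_query_count / min_duration_s)  # ~0.83 qps; if the system's measured qps
# is below this, issuing 500 queries will take longer than the 10-minute minimum.
```
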
1 change: 0 additions & 1 deletion language/llama2-70b/main.py
@@ -53,7 +53,6 @@ def main():

if args.accuracy:
settings.mode = lg.TestMode.AccuracyOnly
log.warning("Accuracy run will generate the accuracy logs, but the evaluation of the log is not completed yet")
else:
settings.mode = lg.TestMode.PerformanceOnly

2 changes: 0 additions & 2 deletions language/mixtral-8x7b/main.py
@@ -117,8 +117,6 @@ def main():

if args.accuracy:
settings.mode = lg.TestMode.AccuracyOnly
log.warning(
"Accuracy run will generate the accuracy logs, but the evaluation of the log is not completed yet")
else:
settings.mode = lg.TestMode.PerformanceOnly

2 changes: 1 addition & 1 deletion loadgen/CMakeLists.txt
@@ -4,7 +4,7 @@ project(mlperf_loadgen)

# The mlperf_loadgen version.
set(mlperf_loadgen_VERSION_MAJOR 4)
set(mlperf_loadgen_VERSION_MINOR 0)
set(mlperf_loadgen_VERSION_MINOR 1)
message("mlperf_loadgen v${mlperf_loadgen_VERSION_MAJOR}.${mlperf_loadgen_VERSION_MINOR}")

# Set build options. NB: CXX_STANDARD is supported since CMake 3.1.
2 changes: 2 additions & 0 deletions loadgen/results.cc
@@ -672,6 +672,8 @@ void PerformanceSummary::LogDetail(AsyncDetail& detail) {
recommendation +=
"The test exited early, before enough queries were issued.";
}
std::replace(recommendation.begin(),
recommendation.end(), '\n', ' ');
MLPERF_LOG(detail, "result_invalid_reason", recommendation);
}
std::replace(early_stopping_recommendation.begin(),
2 changes: 1 addition & 1 deletion loadgen/version_generator.py
@@ -93,7 +93,7 @@ def generate_loadgen_version_definitions(cc_filename, loadgen_root):
ofile.write("// DO NOT EDIT: Autogenerated by version_generator.py.\n\n")
ofile.write("#include <string>\n\n")
ofile.write("namespace mlperf {\n\n")
ofile.write(func_def("Version", "\"4.0\""))
ofile.write(func_def("Version", "\"4.1\""))

date_time_now_local = datetime.datetime.now().isoformat()
date_time_now_utc = datetime.datetime.utcnow().isoformat()
2 changes: 2 additions & 0 deletions text_to_image/main.py
@@ -434,6 +434,8 @@ def flush_queries():
settings = lg.TestSettings()
settings.FromConfig(mlperf_conf, args.model_name, args.scenario)
settings.FromConfig(user_conf, args.model_name, args.scenario)
if os.path.exists(audit_config):
settings.FromConfig(audit_config, args.model_name, args.scenario)
settings.scenario = scenario
settings.mode = lg.TestMode.PerformanceOnly
if args.accuracy:
94 changes: 80 additions & 14 deletions text_to_image/tools/accuracy_coco.py
@@ -15,7 +15,7 @@
import torch
from clip.clip_encoder import CLIPEncoder
from fid.inception import InceptionV3
from fid.fid_score import compute_statistics_of_path, get_activations, calculate_frechet_distance
from fid.fid_score import compute_fid, compute_statistics_of_path, get_activations, calculate_frechet_distance
from tqdm import tqdm
import ijson

@@ -31,6 +31,7 @@ def get_args():
parser.add_argument("--output-file", default="coco-results.json", help="path to output file")
parser.add_argument("--compliance-images-path", required=False, help="path to dump 10 stable diffusion xl compliance images")
parser.add_argument("--device", default="cpu", choices=["gpu", "cpu"])
parser.add_argument("--low_memory", action="store_true", help="If device is has limited memory (<70G), use the memory saving path.")
args = parser.parse_args()
return args

@@ -78,19 +79,84 @@ def main():
caption_file.write(f"{idx} {df_captions.iloc[idx]['caption']}\n")

# Compute accuracy
compute_accuracy(
args.mlperf_accuracy_file,
args.output_file,
device,
dump_compliance_images,
compliance_images_idx_list,
args.compliance_images_path,
df_captions,
statistics_path,
)

if args.low_memory:
print(f"Device has low memory, running memory saving path!")
compute_accuracy_low_memory(
args.mlperf_accuracy_file,
args.output_file,
device,
dump_compliance_images,
compliance_images_idx_list,
args.compliance_images_path,
df_captions,
statistics_path,
)
else:
compute_accuracy(
args.mlperf_accuracy_file,
args.output_file,
device,
dump_compliance_images,
compliance_images_idx_list,
args.compliance_images_path,
df_captions,
statistics_path,
)


def compute_accuracy(
mlperf_accuracy_file,
output_file,
device,
dump_compliance_images,
compliance_images_idx_list,
compliance_images_path,
df_captions,
statistics_path,
):
# Load torchmetrics modules
clip = CLIPEncoder(device=device)
clip_scores = []
seen = set()
result_list = []
result_dict = {}

# Load model outputs
with open(mlperf_accuracy_file, "r") as f:
results = json.load(f)

for j in tqdm(results):
idx = j['qsl_idx']
if idx in seen:
continue
seen.add(idx)

# Load generated image
generated_img = np.frombuffer(bytes.fromhex(j['data']), np.uint8).reshape(1024, 1024, 3)
result_list.append(generated_img)
generated_img = Image.fromarray(generated_img)

# Dump compliance images
if dump_compliance_images and idx in compliance_images_idx_list:
generated_img.save(os.path.join(compliance_images_path, f"{idx}.png"))

# generated_img = torch.Tensor(generated_img).to(torch.uint8).to(device)
# Load Ground Truth
caption = df_captions.iloc[idx]["caption"]
clip_scores.append(
100 * clip.get_clip_score(caption, generated_img).item()
)
fid_score = compute_fid(result_list, statistics_path, device)

result_dict["FID_SCORE"] = fid_score
result_dict["CLIP_SCORE"] = np.mean(clip_scores)
print(f"Accuracy Results: {result_dict}")

with open(output_file, "w") as fp:
json.dump(result_dict, fp, sort_keys=True, indent=4)


def compute_accuracy_low_memory(
mlperf_accuracy_file,
output_file,
device,
@@ -99,7 +165,7 @@ def compute_accuracy(
compliance_images_path,
df_captions,
statistics_path,
batch_size=8,
batch_size=256,
inception_dims=2048,
num_workers=1,
):
@@ -116,7 +182,7 @@ def compute_accuracy(
else:
num_workers = num_workers

# Prepare models
# Load torchmetrics modules
block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[inception_dims]
inception_model = InceptionV3([block_idx]).to(device)
clip_model = CLIPEncoder(device=device)
10 changes: 8 additions & 2 deletions tools/submission/submission_checker.py
@@ -2556,7 +2556,6 @@ def check_compliance_dir(
"gptj-99.9",
"llama2-70b-99",
"llama2-70b-99.9",
"stable-diffusion-xl",
"mixtral-8x7b"
]:
test_list.remove("TEST04")
@@ -2578,7 +2577,14 @@ def check_compliance_dir(
"llama2-70b-99.9",
"mixtral-8x7b"
]:
test_list.remove("TEST01")
test_list.remove("TEST01")

if model in [
"stable-diffusion-xl"
] and config.version in [ "v4.0" ]:
test_list.remove("TEST01")
test_list.remove("TEST04")


if model in [
"llama2-70b-99",
