Merge branch 'mlcommons:master' into master

arjunsuresh authored Jul 16, 2024
2 parents ef51ae3 + c83565d commit 751b9fc
Showing 11 changed files with 102 additions and 22 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test-submission-checker.yml
@@ -32,4 +32,4 @@ jobs:
git clone https://github.com/mlcommons/inference_results_v4.0 --depth 1
- name: Test MLPerf inference submission checker
run: |
cm run script --tags=run,mlperf,inference,submission,checker --adr.inference-src.tags=_branch.${{ github.event.pull_request.head.ref }},_repo.${{ github.event.pull_request.head.repo.html_url }} --adr.inference-src.version=custom --input=`pwd`/inference_results_v4.0 --version=r4.0 --quiet
cm run script --tags=run,mlperf,inference,submission,checker --adr.inference-src.tags=_branch.${{ github.event.pull_request.head.ref }},_repo.${{ github.event.pull_request.head.repo.html_url }} --adr.inference-src.version=custom --input=`pwd`/inference_results_v4.0 --src_version=v4.0 --quiet
2 changes: 2 additions & 0 deletions compliance/nvidia/TEST04/README.md
@@ -25,6 +25,8 @@ This test is not applicable for the following benchmarks whose performance is de
5. retinanet (while retinanet input images are not variable, computation cost varies significantly due to the variance in qualified detections from the detector heads, which affect the NMS runtime)
6. gpt-j

**Warning:** For the stable diffusion benchmark, `min_query_count` is set to 500 to keep the runtime from growing too long. To pick up benchmark-specific configuration variables, load the audit configuration from your SUT code by calling `settings.FromConfig(audit_config, model_name, scenario)`. This cannot happen inside loadgen, because loadgen is agnostic to the model being tested.
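
Here is a minimal sketch of how a SUT harness could apply those overrides, assuming the audit config has been copied into the working directory as `audit.config` (the config paths, model name, and scenario strings are placeholders for your benchmark):

```python
import os
import mlperf_loadgen as lg

settings = lg.TestSettings()
# Base configuration first, then user overrides.
settings.FromConfig("mlperf.conf", "stable-diffusion-xl", "Offline")
settings.FromConfig("user.conf", "stable-diffusion-xl", "Offline")

# Apply the TEST04 audit overrides (e.g. min_query_count) last so they take effect.
audit_config = "audit.config"
if os.path.exists(audit_config):
    settings.FromConfig(audit_config, "stable-diffusion-xl", "Offline")
```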

## Scenarios

- This test is applicable for all scenarios.
5 changes: 5 additions & 0 deletions compliance/nvidia/TEST04/audit.config
@@ -18,3 +18,8 @@
*.Server.performance_issue_unique = 0
*.Server.performance_issue_same = 1
*.Server.performance_issue_same_index = 3
stable-diffusion-xl.Offline.min_query_count = 500
# You can optionally set the target qps so that the expected query count
# over the min duration matches min_query_count. Take your system's
# expected qps into account; the min duration for this test is 10 minutes.
# stable-diffusion-xl.Offline.target_qps = 0.75
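
For a rough sense of how those two settings interact, the target qps that would fit the 500 queries inside the 10-minute minimum duration is just the ratio of the two (illustrative arithmetic only, not a required value):

```python
# Offline qps needed for min_query_count queries to fit in the minimum duration.
min_query_count = 500
min_duration_s = 10 * 60  # 10 minutes

print(min_query_count / min_duration_s)  # ~0.83 qps; if the system's measured qps
# is below this, issuing 500 queries will take longer than the 10-minute minimum.
```
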
1 change: 0 additions & 1 deletion language/llama2-70b/main.py
@@ -53,7 +53,6 @@ def main():

if args.accuracy:
settings.mode = lg.TestMode.AccuracyOnly
log.warning("Accuracy run will generate the accuracy logs, but the evaluation of the log is not completed yet")
else:
settings.mode = lg.TestMode.PerformanceOnly

2 changes: 0 additions & 2 deletions language/mixtral-8x7b/main.py
@@ -117,8 +117,6 @@ def main():

if args.accuracy:
settings.mode = lg.TestMode.AccuracyOnly
log.warning(
"Accuracy run will generate the accuracy logs, but the evaluation of the log is not completed yet")
else:
settings.mode = lg.TestMode.PerformanceOnly

2 changes: 1 addition & 1 deletion loadgen/CMakeLists.txt
@@ -4,7 +4,7 @@ project(mlperf_loadgen)

# The mlperf_loadgen version.
set(mlperf_loadgen_VERSION_MAJOR 4)
set(mlperf_loadgen_VERSION_MINOR 0)
set(mlperf_loadgen_VERSION_MINOR 1)
message("mlperf_loadgen v${mlperf_loadgen_VERSION_MAJOR}.${mlperf_loadgen_VERSION_MINOR}")

# Set build options. NB: CXX_STANDARD is supported since CMake 3.1.
2 changes: 2 additions & 0 deletions loadgen/results.cc
@@ -672,6 +672,8 @@ void PerformanceSummary::LogDetail(AsyncDetail& detail) {
recommendation +=
"The test exited early, before enough queries were issued.";
}
std::replace(recommendation.begin(),
recommendation.end(), '\n', ' ');
MLPERF_LOG(detail, "result_invalid_reason", recommendation);
}
std::replace(early_stopping_recommendation.begin(),
2 changes: 1 addition & 1 deletion loadgen/version_generator.py
@@ -93,7 +93,7 @@ def generate_loadgen_version_definitions(cc_filename, loadgen_root):
ofile.write("// DO NOT EDIT: Autogenerated by version_generator.py.\n\n")
ofile.write("#include <string>\n\n")
ofile.write("namespace mlperf {\n\n")
ofile.write(func_def("Version", "\"4.0\""))
ofile.write(func_def("Version", "\"4.1\""))

date_time_now_local = datetime.datetime.now().isoformat()
date_time_now_utc = datetime.datetime.utcnow().isoformat()
2 changes: 2 additions & 0 deletions text_to_image/main.py
@@ -434,6 +434,8 @@ def flush_queries():
settings = lg.TestSettings()
settings.FromConfig(mlperf_conf, args.model_name, args.scenario)
settings.FromConfig(user_conf, args.model_name, args.scenario)
if os.path.exists(audit_config):
settings.FromConfig(audit_config, args.model_name, args.scenario)
settings.scenario = scenario
settings.mode = lg.TestMode.PerformanceOnly
if args.accuracy:
94 changes: 80 additions & 14 deletions text_to_image/tools/accuracy_coco.py
@@ -15,7 +15,7 @@
import torch
from clip.clip_encoder import CLIPEncoder
from fid.inception import InceptionV3
from fid.fid_score import compute_statistics_of_path, get_activations, calculate_frechet_distance
from fid.fid_score import compute_fid, compute_statistics_of_path, get_activations, calculate_frechet_distance
from tqdm import tqdm
import ijson

@@ -31,6 +31,7 @@ def get_args():
parser.add_argument("--output-file", default="coco-results.json", help="path to output file")
parser.add_argument("--compliance-images-path", required=False, help="path to dump 10 stable diffusion xl compliance images")
parser.add_argument("--device", default="cpu", choices=["gpu", "cpu"])
parser.add_argument("--low_memory", action="store_true", help="If device is has limited memory (<70G), use the memory saving path.")
args = parser.parse_args()
return args

@@ -78,19 +79,84 @@ def main():
caption_file.write(f"{idx} {df_captions.iloc[idx]['caption']}\n")

# Compute accuracy
compute_accuracy(
args.mlperf_accuracy_file,
args.output_file,
device,
dump_compliance_images,
compliance_images_idx_list,
args.compliance_images_path,
df_captions,
statistics_path,
)

if args.low_memory:
print(f"Device has low memory, running memory saving path!")
compute_accuracy_low_memory(
args.mlperf_accuracy_file,
args.output_file,
device,
dump_compliance_images,
compliance_images_idx_list,
args.compliance_images_path,
df_captions,
statistics_path,
)
else:
compute_accuracy(
args.mlperf_accuracy_file,
args.output_file,
device,
dump_compliance_images,
compliance_images_idx_list,
args.compliance_images_path,
df_captions,
statistics_path,
)


def compute_accuracy(
mlperf_accuracy_file,
output_file,
device,
dump_compliance_images,
compliance_images_idx_list,
compliance_images_path,
df_captions,
statistics_path,
):
# Load torchmetrics modules
clip = CLIPEncoder(device=device)
clip_scores = []
seen = set()
result_list = []
result_dict = {}

# Load model outputs
with open(mlperf_accuracy_file, "r") as f:
results = json.load(f)

for j in tqdm(results):
idx = j['qsl_idx']
if idx in seen:
continue
seen.add(idx)

# Load generated image
generated_img = np.frombuffer(bytes.fromhex(j['data']), np.uint8).reshape(1024, 1024, 3)
result_list.append(generated_img)
generated_img = Image.fromarray(generated_img)

# Dump compliance images
if dump_compliance_images and idx in compliance_images_idx_list:
generated_img.save(os.path.join(compliance_images_path, f"{idx}.png"))

# generated_img = torch.Tensor(generated_img).to(torch.uint8).to(device)
# Load Ground Truth
caption = df_captions.iloc[idx]["caption"]
clip_scores.append(
100 * clip.get_clip_score(caption, generated_img).item()
)
fid_score = compute_fid(result_list, statistics_path, device)

result_dict["FID_SCORE"] = fid_score
result_dict["CLIP_SCORE"] = np.mean(clip_scores)
print(f"Accuracy Results: {result_dict}")

with open(output_file, "w") as fp:
json.dump(result_dict, fp, sort_keys=True, indent=4)


def compute_accuracy_low_memory(
mlperf_accuracy_file,
output_file,
device,
@@ -99,7 +165,7 @@ def compute_accuracy(
compliance_images_path,
df_captions,
statistics_path,
batch_size=8,
batch_size=256,
inception_dims=2048,
num_workers=1,
):
@@ -116,7 +182,7 @@ def compute_accuracy(
else:
num_workers = num_workers

# Prepare models
# Load torchmetrics modules
block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[inception_dims]
inception_model = InceptionV3([block_idx]).to(device)
clip_model = CLIPEncoder(device=device)
10 changes: 8 additions & 2 deletions tools/submission/submission_checker.py
@@ -2556,7 +2556,6 @@ def check_compliance_dir(
"gptj-99.9",
"llama2-70b-99",
"llama2-70b-99.9",
"stable-diffusion-xl",
"mixtral-8x7b"
]:
test_list.remove("TEST04")
@@ -2578,7 +2577,14 @@ def check_compliance_dir(
"llama2-70b-99.9",
"mixtral-8x7b"
]:
test_list.remove("TEST01")
test_list.remove("TEST01")

if model in [
"stable-diffusion-xl"
] and config.version in [ "v4.0" ]:
test_list.remove("TEST01")
test_list.remove("TEST04")


if model in [
"llama2-70b-99",
