From f576d52a5b576fef7034033da4e141a2cc745cfe Mon Sep 17 00:00:00 2001 From: nvitramble <84538536+nvitramble@users.noreply.github.com> Date: Tue, 12 Jul 2022 09:30:58 -0700 Subject: [PATCH 1/5] Add new BERT calibration dataset (#1171) --- calibration/SQuAD-v1.1/README.md | 7 +- ...tion.txt => bert_calibration_features.txt} | 0 .../SQuAD-v1.1/bert_calibration_qas_ids.txt | 100 ++++++++++++++++++ 3 files changed, 106 insertions(+), 1 deletion(-) rename calibration/SQuAD-v1.1/{bert-calibration.txt => bert_calibration_features.txt} (100%) create mode 100755 calibration/SQuAD-v1.1/bert_calibration_qas_ids.txt diff --git a/calibration/SQuAD-v1.1/README.md b/calibration/SQuAD-v1.1/README.md index 5f0fad6d3..85b2513d7 100644 --- a/calibration/SQuAD-v1.1/README.md +++ b/calibration/SQuAD-v1.1/README.md @@ -1 +1,6 @@ -The calibration file has 100 randomly selected samples from dev-1.1.json, which contains 10570 samples in total. +The integers in bert_calibration_features.txt correspond to 100 randomly selected indices in the list of features generated from dev-v1.1.json using [convert_examples_to_features()](https://github.com/mlcommons/inference/blob/master/language/bert/create_squad_data.py#L249) with a doc_stride of 128 and a max_seq_len of 384. + +The values in bert_calibration_qas_ids.txt correspond to 100 randomly selected qas ids in the dev-v1.1.json file. + +Please only use at most 1 calibration file from this folder for calibration. + diff --git a/calibration/SQuAD-v1.1/bert-calibration.txt b/calibration/SQuAD-v1.1/bert_calibration_features.txt similarity index 100% rename from calibration/SQuAD-v1.1/bert-calibration.txt rename to calibration/SQuAD-v1.1/bert_calibration_features.txt diff --git a/calibration/SQuAD-v1.1/bert_calibration_qas_ids.txt b/calibration/SQuAD-v1.1/bert_calibration_qas_ids.txt new file mode 100755 index 000000000..5d6f45989 --- /dev/null +++ b/calibration/SQuAD-v1.1/bert_calibration_qas_ids.txt @@ -0,0 +1,100 @@ +573020f7b2c2fd14005688fa +56beb6f23aeaaa14008c92a1 +5737a5931c456719005744e9 +5725d79e89a1e219009abf91 +56e0d9e0231d4119001ac43f +57281ab63acd2414000df496 +57269fab5951b619008f780b +5726400589a1e219009ac5f0 +572fd264b2c2fd14005684aa +56f85e71a6d7ea1400e175c4 +5728804b4b864d1900164a47 +57264cac708984140094c1b4 +5726bf135951b619008f7ceb +5728848cff5b5019007da298 +572fbf21a23a5019007fc93b +5727448b5951b619008f87a1 +5729e1101d04691400779641 +56e11afbcd28a01900c675c9 +5726642f5951b619008f7159 +56e08d32231d4119001ac2b1 +57265d86f1498d1400e8dd50 +56f7eddca6d7ea1400e172d9 +56de1645cffd8e1900b4b5d1 +5726a5525951b619008f78df +56f851b1a6d7ea1400e1755e +572a18a4af94a219006aa7e2 +57286bb84b864d19001649ca +571bb2269499d21900609cab +56d7251d0d65d214001983cc +56f88eafaef2371900626194 +571cde695efbb31900334e16 +57294279af94a219006aa20a +56bec98e3aeaaa14008c9457 +57269656708984140094cb01 +56be54bdacb8001400a50323 +571c9074dd7acb1400e4c100 +56f8b4d79b226e1400dd0e78 +5710f2e2a58dae1900cd6b73 +572683e6f1498d1400e8e24e +56f7f2e0aef2371900625cb3 +572fadcbb2c2fd1400568329 +5725fabc89a1e219009ac12a +5727aa413acd2414000de924 +56e77da237bdd419002c403d +5729e2b76aef0514001550d2 +57265e11708984140094c3bd +5726bf325951b619008f7d01 +57335fcad058e614000b5973 +572663a9f1498d1400e8ddf2 +57299ec43f37b3190047850f +56f80e1daef2371900625d8d +572689b6dd62a815002e8892 +57264a74708984140094c18c +57274d1cdd62a815002e9ab2 +572871bc4b864d1900164a04 +56d7018a0d65d214001982c5 +57111713a58dae1900cd6c02 +56bebbbf3aeaaa14008c9317 +57300e2604bcaa1900d770b7 +56f8074faef2371900625d7a +5727c94bff5b5019007d954b 
+5727ffb5ff5b5019007d9a8d +56e75d5037bdd419002c3ef8 +57273e50dd62a815002e9a05 +5729582b1d046914007792e4 +57290ee2af94a219006aa003 +57286ec63acd2414000df9d4 +572632ceec44d21400f3dc30 +5726f635dd62a815002e9658 +572a1f086aef0514001552c2 +57269344f1498d1400e8e440 +56bec6ac3aeaaa14008c93ff +57283adcff5b5019007d9f96 +5733266d4776f41900660714 +5725d79e89a1e219009abf94 +57280f974b864d1900164372 +570960cf200fba1400367f04 +570d28bdb3d812140066d4a7 +56e1c0f6cd28a01900c67b2e +56bec3153aeaaa14008c938b +57284618ff5b5019007da0ac +571c3e8cdd7acb1400e4c0a7 +5728fb6a1d04691400778ef6 +5726ef12dd62a815002e95a0 +57296f85af94a219006aa404 +572fe288a23a5019007fcadb +5727500f708984140094dbff +572fc659b2c2fd1400568449 +570d3468b3d812140066d545 +572a07c11d046914007796d5 +56e1fc57e3433e140042322c +573098f38ab72b1400f9c5d5 +56e1b355e3433e14004230b2 +57280cac2ca10214002d9cac +57287b4a4b864d1900164a2b +56bf36b93aeaaa14008c9565 +5728202c4b864d19001644ec +5728dab94b864d1900164f99 +57376a1bc3c5551400e51ec5 +57377083c3c5551400e51ee2 From fc02e9b19d1a3dc1b18cc425d6c2d7b25407ea96 Mon Sep 17 00:00:00 2001 From: Bruno Ferreira Date: Mon, 18 Jul 2022 16:31:27 +0100 Subject: [PATCH 2/5] Update CLA bot (#1180) --- .github/workflows/cla.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cla.yml b/.github/workflows/cla.yml index 8880b7449..c0e1544d4 100644 --- a/.github/workflows/cla.yml +++ b/.github/workflows/cla.yml @@ -1,3 +1,4 @@ + name: "cla-bot" on: issue_comment: @@ -12,20 +13,19 @@ jobs: - name: "MLCommons CLA bot check" if: (github.event.comment.body == 'recheck') || github.event_name == 'pull_request_target' # Alpha Release - uses: sub-mod/github-action@v3 + uses: mlcommons/cla-bot@master env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # the below token should have repo scope and must be manually added by you in the repository's secret PERSONAL_ACCESS_TOKEN : ${{ secrets.MLCOMMONS_BOT_CLA_TOKEN }} with: path-to-signatures: 'cla-bot/v1/cla.json' - path-to-document: 'https://github.com/mlcommons/systems/blob/main/mlcommons_cla.txt' # e.g. 
a CLA or a DCO document # branch should not be protected branch: 'main' allowlist: user1,bot* remote-organization-name: mlcommons remote-repository-name: systems - + #below are the optional inputs - If the optional inputs are not given, then default values will be taken #remote-organization-name: enter the remote organization name where the signatures should be stored (Default is storing the signatures in the same repository) #remote-repository-name: enter the remote repository name where the signatures should be stored (Default is storing the signatures in the same repository) From ef8e58e1e612ee991ea537c02e289ae48e8414d2 Mon Sep 17 00:00:00 2001 From: georgelyuan <53881988+georgelyuan@users.noreply.github.com> Date: Mon, 25 Jul 2022 15:40:43 -0700 Subject: [PATCH 3/5] removing obselete audit directory (#1188) --- compliance/audit_v0.5/nvidia/TEST01/README | 61 ------- .../audit_v0.5/nvidia/TEST01/audit.config | 18 --- .../audit_v0.5/nvidia/TEST01/gnmt/README | 25 --- .../TEST01/gnmt/create_accuracy_baseline.sh | 19 --- .../audit_v0.5/nvidia/TEST01/truncate_log.py | 51 ------ .../audit_v0.5/nvidia/TEST01/truncate_log.sh | 11 -- .../nvidia/TEST01/verify_accuracy.py | 103 ------------ .../nvidia/TEST01/verify_performance.py | 126 --------------- compliance/audit_v0.5/nvidia/TEST03/README | 34 ---- .../nvidia/TEST03/download_and_modify_gnmt.sh | 149 ------------------ .../nvidia/TEST03/modify_gnmt_data.py | 147 ----------------- .../nvidia/TEST03/modify_image_data.py | 120 -------------- .../nvidia/TEST03/verify_performance.py | 126 --------------- compliance/audit_v0.5/nvidia/TEST04-A/README | 39 ----- .../audit_v0.5/nvidia/TEST04-A/audit.config | 16 -- .../TEST04-A/verify_test4_performance.py | 135 ---------------- compliance/audit_v0.5/nvidia/TEST04-B/README | 1 - .../audit_v0.5/nvidia/TEST04-B/audit.config | 20 --- compliance/audit_v0.5/nvidia/TEST05/README | 25 --- .../audit_v0.5/nvidia/TEST05/audit.config | 22 --- .../nvidia/TEST05/verify_performance.py | 126 --------------- 21 files changed, 1374 deletions(-) delete mode 100644 compliance/audit_v0.5/nvidia/TEST01/README delete mode 100644 compliance/audit_v0.5/nvidia/TEST01/audit.config delete mode 100644 compliance/audit_v0.5/nvidia/TEST01/gnmt/README delete mode 100644 compliance/audit_v0.5/nvidia/TEST01/gnmt/create_accuracy_baseline.sh delete mode 100755 compliance/audit_v0.5/nvidia/TEST01/truncate_log.py delete mode 100644 compliance/audit_v0.5/nvidia/TEST01/truncate_log.sh delete mode 100644 compliance/audit_v0.5/nvidia/TEST01/verify_accuracy.py delete mode 100644 compliance/audit_v0.5/nvidia/TEST01/verify_performance.py delete mode 100644 compliance/audit_v0.5/nvidia/TEST03/README delete mode 100644 compliance/audit_v0.5/nvidia/TEST03/download_and_modify_gnmt.sh delete mode 100644 compliance/audit_v0.5/nvidia/TEST03/modify_gnmt_data.py delete mode 100644 compliance/audit_v0.5/nvidia/TEST03/modify_image_data.py delete mode 100644 compliance/audit_v0.5/nvidia/TEST03/verify_performance.py delete mode 100644 compliance/audit_v0.5/nvidia/TEST04-A/README delete mode 100644 compliance/audit_v0.5/nvidia/TEST04-A/audit.config delete mode 100644 compliance/audit_v0.5/nvidia/TEST04-A/verify_test4_performance.py delete mode 100644 compliance/audit_v0.5/nvidia/TEST04-B/README delete mode 100644 compliance/audit_v0.5/nvidia/TEST04-B/audit.config delete mode 100644 compliance/audit_v0.5/nvidia/TEST05/README delete mode 100644 compliance/audit_v0.5/nvidia/TEST05/audit.config delete mode 100644 
compliance/audit_v0.5/nvidia/TEST05/verify_performance.py
diff --git a/compliance/audit_v0.5/nvidia/TEST01/README b/compliance/audit_v0.5/nvidia/TEST01/README
deleted file mode 100644
index 1c6c6739b..000000000
--- a/compliance/audit_v0.5/nvidia/TEST01/README
+++ /dev/null
@@ -1,61 +0,0 @@
-The purpose of this test is to ensure that valid inferences are being performed in performance mode. By default,
-the inference result that is returned from the SUT to Loadgen is not dumped to the accuracy JSON file and thus not
-checked for accuracy. In this test, a portion of the results is dumped to the accuracy JSON at random with some
-chosen probability. This accuracy JSON file can then be checked against the accuracy JSON generated in accuracy
-mode.
-
-Note that under the MLPerf v0.5 inference rules, certain forms of non-determinism are acceptable, which can cause
-inference results to differ across runs. It is foreseeable that the results obtained during the accuracy run
-will differ from those obtained during the performance run, which will cause the accuracy checking script
-to report failure. Test failure will automatically result in an objection, but the objection can be overturned
-by comparing the quality of the results generated in performance mode to that obtained in accuracy mode. This
-can be done by using the accuracy measurement scripts provided as part of the repo to ensure that the
-classification accuracy/mAP/BLEU score meets the target. An example is provided for GNMT in the gnmt folder.
-
-If performance with sampling enabled is lower than the submitted performance score, accuracy_log_probability in
-the config file can be reduced from 10 (%) to check that performance approaches the reported score.
-
-Note that for high-performance machines, a logging probability of 10% can result in a massive number of logged
-results. To keep the size of the accuracy log file reasonable, accuracy_log_probability should be reduced to
-keep the total number of logged inferences on the order of ~1000 samples for SSD-Large, and ~10000 samples
-elsewhere. The probability setting can be calculated as follows:
-    accuracy_log_probability = 100% * (1000 or 10000) / <total number of inferences in the performance run>
-Alternatively, if the accuracy log is too large, it can be truncated using the provided truncate_log.sh script
-before uploading results.
-
-UPDATE: An alternate script has been provided for reducing the accuracy log. Check truncate_log.py
-
-The mode is set assuming that an accuracy JSON generated in accuracy mode already exists (i.e. from the submission results)
-that can be used for verification. If not, the mode should be changed from PerformanceOnly (mode=2) to
-AccuracyOnly (mode=1) so that accuracy mode results can be generated.
-
-This test does not use a custom dataset or weights.
-
-Instructions
-
-Part I
-Run the test with the provided audit.config.
-Note that audit.config must be copied to the directory where the benchmark is being run from.
-Verification that audit.config was properly read can be done by checking that the settings in the summary txt
-file match what is in audit.config.
-
-Part II
-The first check is to ensure that accuracy during the performance portion of the run matches that achieved in
-accuracy mode.
-    python verify_accuracy.py -a <path to accuracy-mode accuracy JSON> -p <path to performance-mode accuracy JSON>
-
-Expected outcome:
-    TEST PASS
-
-Part III
-The second check is to ensure that performance with accuracy logging enabled matches the submission performance score.
-
-    python verify_performance.py -r <path to reference submission summary> -t <path to this test's summary>
-
-Expected outcome:
-    TEST PASS
-    for sufficiently small but non-zero accuracy_log_probability
-
-Part IV
-Truncate logs to reduce accuracy log file size in preparation for uploading
-    bash ./truncate_log.sh <path to accuracy log> <number of samples to keep>
diff --git a/compliance/audit_v0.5/nvidia/TEST01/audit.config b/compliance/audit_v0.5/nvidia/TEST01/audit.config
deleted file mode 100644
index 3947cdf5f..000000000
--- a/compliance/audit_v0.5/nvidia/TEST01/audit.config
+++ /dev/null
@@ -1,18 +0,0 @@
-# The format of this config file is 'key = value'.
-# The key has the format 'model.scenario.key'. Value is mostly int64_t.
-# Model maybe '*' as wildcard. In that case the value applies to all models.
-# All times are in milli seconds
-
-# mode dictionary (0 = submission, 1 = accuracy, 2 = performance, 3 = find peak perf)
-*.MultiStream.mode = 2
-*.MultiStream.accuracy_log_rng_seed = 123
-*.MultiStream.accuracy_log_probability = 10
-*.Offline.mode = 2
-*.Offline.accuracy_log_rng_seed = 456
-*.Offline.accuracy_log_probability = 10
-*.SingleStream.mode = 2
-*.SingleStream.accuracy_log_rng_seed = 789
-*.SingleStream.accuracy_log_probability = 10
-*.Server.mode = 2
-*.Server.accuracy_log_rng_seed = 147
-*.Server.accuracy_log_probability = 10
diff --git a/compliance/audit_v0.5/nvidia/TEST01/gnmt/README b/compliance/audit_v0.5/nvidia/TEST01/gnmt/README
deleted file mode 100644
index 21d20bb8c..000000000
--- a/compliance/audit_v0.5/nvidia/TEST01/gnmt/README
+++ /dev/null
@@ -1,25 +0,0 @@
-In the case where performance mode results differ from accuracy mode results,
-which would automatically result in an objection being raised, the objection
-can be overturned by evaluating the quality of the results in the two modes
-and ensuring that accuracy is maintained.
-create_accuracy_baseline.sh attempts to facilitate this by using the perf
-mode results to create a baseline accuracy log from the accuracy mode results
-that contains the same subset of the full dataset as the provided performance
-mode log. This allows for an apples-to-apples comparison using the GNMT
-accuracy checking script provided in the MLPerf inference repository.
-The scores should be reported to the result committee, which will then review
-and evaluate whether the objection will be permitted to be overturned.
- -Note: You may see a python error about: - module 'tensorflow' has no attribute 'gfile' -in which case, replace tf.gfile.GFile with tf.io.gfile.GFile in process_accuracy.py -Usage: -1) bash ./create_accuracy_baseline.sh -2) python inference/v0.5/translation/gnmt/tensorflow/process_accuracy.py \ - --accuracy_log \ - --reference /gnmt/newstest2014.tok.bpe.32000.de -3) python inference/v0.5/translation/gnmt/tensorflow/process_accuracy.py \ - --accuracy_log \ - --reference /gnmt/newstest2014.tok.bpe.32000.de -4) Upload accuracy logs and report BLEU scores - diff --git a/compliance/audit_v0.5/nvidia/TEST01/gnmt/create_accuracy_baseline.sh b/compliance/audit_v0.5/nvidia/TEST01/gnmt/create_accuracy_baseline.sh deleted file mode 100644 index 7fe622efc..000000000 --- a/compliance/audit_v0.5/nvidia/TEST01/gnmt/create_accuracy_baseline.sh +++ /dev/null @@ -1,19 +0,0 @@ -# Usage: -# 1) bash ./create_accuracy_baseline.sh -# 2) python inference/v0.5/translation/gnmt/tensorflow/process_accuracy.py -# 3) python inference/v0.5/translation/gnmt/tensorflow/process_accuracy.py on generated baseline -# 4) Compare BLEU scores - -#!/bin/bash -accuracy_log=$1 -perf_log=$2 -patterns="unique_patterns.txt" -accuracy_baseline=$(basename -- "$accuracy_log") -accuracy_baseline="${accuracy_baseline%.*}"_baseline.json - -cut -d ':' -f 2,3 ${perf_log} | cut -d ',' -f 2- | sort | uniq | grep qsl > ${patterns} -echo '[' > ${accuracy_baseline} -grep -f ${patterns} ${accuracy_log} >> ${accuracy_baseline} -sed -i '$ s/,$/]/g' ${accuracy_baseline} -rm ${patterns} -echo "Created a baseline accuracy file: ${accuracy_baseline}" diff --git a/compliance/audit_v0.5/nvidia/TEST01/truncate_log.py b/compliance/audit_v0.5/nvidia/TEST01/truncate_log.py deleted file mode 100755 index 3c87b0e20..000000000 --- a/compliance/audit_v0.5/nvidia/TEST01/truncate_log.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright 2019 The MLPerf Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================= - -import os -import sys -import argparse -import json - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument('log') - - return parser.parse_args() - -def main(args): - - print('Load log from {0}'.format(args.log)) - with open(args.log, 'r') as f: - results = json.load(f) - - print('Processing log entries') - rmap = {} - truncated_results = [] - for j in results: - idx = j['qsl_idx'] - if idx in rmap and rmap[idx] == j['data']: - continue - else: - truncated_results.append(j) - if idx not in rmap: - rmap[idx] = j['data'] - print('original: {0} => truncated: {1}'.format(len(results), len(truncated_results))) - - print('Write truncated log to {0}.new'.format(args.log)) - with open(args.log+'.new', 'w') as f: - json.dump(truncated_results, f, indent=4) - -if __name__ == '__main__': - main(parse_args()) diff --git a/compliance/audit_v0.5/nvidia/TEST01/truncate_log.sh b/compliance/audit_v0.5/nvidia/TEST01/truncate_log.sh deleted file mode 100644 index 52073d41b..000000000 --- a/compliance/audit_v0.5/nvidia/TEST01/truncate_log.sh +++ /dev/null @@ -1,11 +0,0 @@ -# Usage: -# 1) bash ./truncate_log.sh - -#!/bin/bash -log=$1 -samples=$2 - -head -n $((samples + 1)) ${log} > ${log}.new -sed -i '$ s/,$/]/g' ${log}.new -rm ${log} -mv ${log}.new ${log} diff --git a/compliance/audit_v0.5/nvidia/TEST01/verify_accuracy.py b/compliance/audit_v0.5/nvidia/TEST01/verify_accuracy.py deleted file mode 100644 index 91d0e8c2a..000000000 --- a/compliance/audit_v0.5/nvidia/TEST01/verify_accuracy.py +++ /dev/null @@ -1,103 +0,0 @@ -#! /usr/bin/env python3 -import os -import sys -sys.path.append(os.getcwd()) - -import argparse -import json - -import numpy as np - -dtype_map = { - "byte": np.byte, - "float32": np.float32, - "int32": np.int32, - "int64": np.int64 -} - -def main(): - - py3 = sys.version_info >= (3,0) - # Parse arguments to identify the path to the accuracy logs from - # the accuracy and performance runs - parser = argparse.ArgumentParser() - parser.add_argument( - "--accuracy_log", "-a", - help="Specifies the path to the accuracy log from a submission/accuracy run.", - default="" - ) - parser.add_argument( - "--performance_log", "-p", - help="Specifies the path to the accuracy log from a performance run with accuracy log sampling enabled.", - default="" - ) - parser.add_argument( - "--dtype", default="byte", choices=["byte", "float32", "int32", "int64"], help="data type of the label") - args = parser.parse_args() - - print("Verifying accuracy. 
This might take a while...") - acc_log = args.accuracy_log - perf_log = args.performance_log - with open(acc_log, "r") as acc_json: - acc_data = json.load(acc_json) - - with open(perf_log, "r") as perf_json: - perf_data = json.load(perf_json) - - # read accuracy log json and create a dictionary of qsl_idx/data pairs - results_dict = {} - num_acc_log_duplicate_keys = 0 - num_acc_log_data_mismatch = 0 - num_perf_log_qsl_idx_match = 0 - num_perf_log_data_mismatch = 0 - num_missing_qsl_idxs = 0 - - print("Reading accuracy mode results...") - for sample in acc_data: - #print sample["qsl_idx"] - qsl_idx = sample["qsl_idx"] - data = sample["data"] - if data == '': - data = "" - if qsl_idx in results_dict.keys(): - num_acc_log_duplicate_keys += 1 - if results_dict[qsl_idx] != data: - num_acc_log_data_mismatch += 1 - else: - results_dict[qsl_idx] = data - - print("Reading performance mode results...") - for sample in perf_data: - qsl_idx = sample["qsl_idx"] - data = np.frombuffer(bytes.fromhex(sample['data']), dtype_map[args.dtype]) if py3 == True \ - else np.frombuffer(bytearray.fromhex(sample['data']), dtype_map[args.dtype]) - - if qsl_idx in results_dict.keys(): - num_perf_log_qsl_idx_match += 1 - data_perf = np.frombuffer(bytes.fromhex(results_dict[qsl_idx]), dtype_map[args.dtype]) \ - if py3 == True else np.frombuffer(bytearray.fromhex(results_dict[qsl_idx]), dtype_map[args.dtype]) - if data_perf.size == 0 or data.size == 0: - if data_perf.size != data.size: - num_perf_log_data_mismatch += 1 - elif data[0] != data_perf[0]: - num_perf_log_data_mismatch += 1 - else: - num_missing_qsl_idxs += 1 - - results_dict[sample["qsl_idx"]] = sample["data"] - - - print("num_acc_log_entries = {:}".format(len(acc_data))) - print("num_acc_log_duplicate_keys = {:}".format(num_acc_log_duplicate_keys)) - print("num_acc_log_data_mismatch = {:}".format(num_acc_log_data_mismatch)) - print("num_perf_log_entries = {:}".format(len(perf_data))) - print("num_perf_log_qsl_idx_match = {:}".format(num_perf_log_qsl_idx_match)) - print("num_perf_log_data_mismatch = {:}".format(num_perf_log_data_mismatch)) - print("num_missing_qsl_idxs = {:}".format(num_missing_qsl_idxs)) - if num_perf_log_data_mismatch > 0 : - print("TEST FAIL\n"); - else : - print("TEST PASS\n"); - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/compliance/audit_v0.5/nvidia/TEST01/verify_performance.py b/compliance/audit_v0.5/nvidia/TEST01/verify_performance.py deleted file mode 100644 index 09fcbfb91..000000000 --- a/compliance/audit_v0.5/nvidia/TEST01/verify_performance.py +++ /dev/null @@ -1,126 +0,0 @@ -#! 
/usr/bin/env python3 -import os -import sys -import re -sys.path.append(os.getcwd()) - -import argparse -import json - -def main(): - # Parse arguments to identify the path to the accuracy logs from - # the accuracy and performance runs - parser = argparse.ArgumentParser() - parser.add_argument( - "--reference_summary", "-r", - help="Specifies the path to the summary log for TEST00.", - default="" - ) - parser.add_argument( - "--test_summary", "-t", - help="Specifies the path to the summary log for this test.", - default="" - ) - args = parser.parse_args() - - print("Verifying performance.") - ref_file = open(args.reference_summary, "r") - test_file = open(args.test_summary, "r") - ref_score = 0 - test_score = 0 - ref_mode = '' - test_mode = '' - - for line in ref_file: - if re.match("Scenario", line): - ref_mode = line.split(": ",1)[1].strip() - continue - - if ref_mode == "Single Stream": - if re.match("90th percentile latency", line): - ref_score = line.split(": ",1)[1].strip() - continue - - if ref_mode == "Multi Stream": - if re.match("Samples per query", line): - ref_score = line.split(": ",1)[1].strip() - continue - - if ref_mode == "Server": - if re.match("Scheduled samples per second", line): - ref_score = line.split(": ",1)[1].strip() - continue - - if ref_mode == "Offline": - if re.match("Samples per second", line): - ref_score = line.split(": ",1)[1].strip() - continue - - if re.match("Result is", line): - valid = line.split(": ",1)[1].strip() - if valid == 'INVALID': - sys.exit("TEST FAIL: Reference results are invalid") - - if re.match("\d+ ERROR", line): - error = line.split(" ",1)[0].strip() - print("WARNING: " + error + " ERROR reported in reference results") - - - for line in test_file: - if re.match("Scenario", line): - test_mode = line.split(": ",1)[1].strip() - continue - - if test_mode == "Single Stream": - if re.match("90th percentile latency", line): - test_score = line.split(": ",1)[1].strip() - continue - - if test_mode == "Multi Stream": - if re.match("Samples per query", line): - test_score = line.split(": ",1)[1].strip() - continue - - if test_mode == "Server": - if re.match("Scheduled samples per second", line): - test_score = line.split(": ",1)[1].strip() - continue - - if test_mode == "Offline": - if re.match("Samples per second", line): - test_score = line.split(": ",1)[1].strip() - continue - - if re.match("Result is", line): - valid = line.split(": ",1)[1].strip() - if valid == 'INVALID': - sys.exit("TEST FAIL: Test results are invalid") - - if re.match("\d+ ERROR", line): - error = line.split(" ",1)[0].strip() - print("WARNING: " + error + " ERROR reported in test results") - - if test_mode != ref_mode: - sys.exit("Test and reference scenarios do not match!") - - print("reference score = {}".format(ref_score)) - print("test score = {}".format(test_score)) - - - threshold = 0.10 - - # In single stream mode, latencies can be very short for high performance systems - # and run-to-run variation due to external disturbances (OS) can be significant. 
-
-    # In this case we relax pass threshold to 20%
-
-    if ref_mode == "Single Stream" and float(ref_score) <= 200000:
-        threshold = 0.20
-
-    if float(test_score) < float(ref_score) * (1 + threshold) and float(test_score) > float(ref_score) * (1 - threshold):
-        print("TEST PASS")
-    else:
-        print("TEST FAIL: Test score invalid")
-
-if __name__ == '__main__':
-    main()
-
diff --git a/compliance/audit_v0.5/nvidia/TEST03/README b/compliance/audit_v0.5/nvidia/TEST03/README
deleted file mode 100644
index f582a0444..000000000
--- a/compliance/audit_v0.5/nvidia/TEST03/README
+++ /dev/null
@@ -1,34 +0,0 @@
-The purpose of this test is to ensure that the System-Under-Test (SUT) is not providing precalculated inference
-results. The benchmark should be run in Submission mode with the dataset modified using the given scripts.
-The performance must match the submission and the accuracy should be within an acceptable range based on
-measurements on the reference implementation.
-
-Instructions
-
-Part I
-
-#Generate custom data for imagenet and coco
-Run the script with the path to the original dataset and the new path to store the custom data:
-    python modify_image_data.py -d <original data path> -o <output data path> --dataset [coco|imagenet]
-    "original data path" must contain the ImageNet and/or COCO datasets in JPEG format.
-
-#Generate custom data for GNMT
-This script assumes you already have the original dataset and BPE code files.
-Please change the ORIGINAL_DATASET and CUSTOM_DATASET_OUTPUT variables in the script to point to where your
-original newstest2014 dataset is stored and where you want the custom dataset to be stored, respectively.
-The script stores intermediate files in OUTPUT_DIR, which is set to $PWD/outputs. This may be cleaned up after
-the script completes. The final dataset will be available in $CUSTOM_DATASET_OUTPUT.
-To run the script:
-    ./download_and_modify_gnmt.sh
-
-Part II
-Run the benchmark in the same manner as the original submission, once in AccuracyOnly mode and once in
-SubmissionOnly mode. Ensure that accuracy.txt is generated along with the other mlperf_log_* logs.
-Note that the expected accuracies are lower than the MLPerf targets, so the benchmark may report failure.
-This is expected behavior and does not necessarily mean that the audit has failed.
-
-Part III
-Ensure that performance matches that achieved in the submission run.
-    python verify_performance.py -r <path to reference summary> -t <path to this test's summary>
-
-
diff --git a/compliance/audit_v0.5/nvidia/TEST03/download_and_modify_gnmt.sh b/compliance/audit_v0.5/nvidia/TEST03/download_and_modify_gnmt.sh
deleted file mode 100644
index f81e4a186..000000000
--- a/compliance/audit_v0.5/nvidia/TEST03/download_and_modify_gnmt.sh
+++ /dev/null
@@ -1,149 +0,0 @@
-#! /usr/bin/env bash
-
-# Copyright 2017 Google Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -e
-
-export LANG=C.UTF-8
-export LC_ALL=C.UTF-8
-
-# OUTPUT_DIR=${1:-"data"}
-OUTPUT_DIR="$PWD/outputs"
-echo $OUTPUT_DIR
-
-echo "Writing to ${OUTPUT_DIR}. To change this, set the OUTPUT_DIR environment variable."
- -OUTPUT_DIR_DATA="${OUTPUT_DIR}/data" - -mkdir -p $OUTPUT_DIR_DATA - -echo "Downloading dev/test sets" -wget -nc -nv -O ${OUTPUT_DIR_DATA}/dev.tgz \ - http://data.statmt.org/wmt16/translation-task/dev.tgz - -mkdir -p "${OUTPUT_DIR_DATA}/dev" -tar -xvzf "${OUTPUT_DIR_DATA}/dev.tgz" -C "${OUTPUT_DIR_DATA}/dev" - -# Clone Moses -if [ ! -d "${OUTPUT_DIR}/mosesdecoder" ]; then - echo "Cloning moses for data processing" - git clone https://github.com/moses-smt/mosesdecoder.git "${OUTPUT_DIR}/mosesdecoder" - cd ${OUTPUT_DIR}/mosesdecoder - git reset --hard 8c5eaa1a122236bbf927bde4ec610906fea599e6 - cd - -fi - -# Convert SGM files -# Convert newstest2014 data into raw text format -${OUTPUT_DIR}/mosesdecoder/scripts/ems/support/input-from-sgm.perl \ - < ${OUTPUT_DIR_DATA}/dev/dev/newstest2014-deen-src.de.sgm \ - > ${OUTPUT_DIR_DATA}/dev/dev/newstest2014.de -${OUTPUT_DIR}/mosesdecoder/scripts/ems/support/input-from-sgm.perl \ - < ${OUTPUT_DIR_DATA}/dev/dev/newstest2014-deen-ref.en.sgm \ - > ${OUTPUT_DIR_DATA}/dev/dev/newstest2014.en - -# Copy dev/test data to output dir -cp ${OUTPUT_DIR_DATA}/dev/dev/newstest2014.de ${OUTPUT_DIR} -cp ${OUTPUT_DIR_DATA}/dev/dev/newstest2014.en ${OUTPUT_DIR} - -# Modify dataset -echo "Modifying Dataset..." -python modify_gnmt_data.py --filename="${OUTPUT_DIR}/newstest2014" -mv "${OUTPUT_DIR}/newstest2014.en" "${OUTPUT_DIR}/newstest2014.original.en" -mv "${OUTPUT_DIR}/newstest2014.new.en" "${OUTPUT_DIR}/newstest2014.en" - -# Tokenize data -for f in ${OUTPUT_DIR}/*.de; do - echo "Tokenizing $f..." - ${OUTPUT_DIR}/mosesdecoder/scripts/tokenizer/tokenizer.perl -q -l de -threads 8 < $f > ${f%.*}.tok.de -done - -for f in ${OUTPUT_DIR}/*.en; do - echo "Tokenizing $f..." - ${OUTPUT_DIR}/mosesdecoder/scripts/tokenizer/tokenizer.perl -q -l en -threads 8 < $f > ${f%.*}.tok.en -done - -# Clean all corpora -#for f in ${OUTPUT_DIR}/*.en; do -# fbase=${f%.*} -# echo "Cleaning ${fbase}..." -# ${OUTPUT_DIR}/mosesdecoder/scripts/training/clean-corpus-n.perl $fbase de en "${fbase}.clean" 1 80 -#done - -# # Create dev dataset -# cat "${OUTPUT_DIR}/newstest2015.tok.clean.en" \ -# "${OUTPUT_DIR}/newstest2016.tok.clean.en" \ -# > "${OUTPUT_DIR}/newstest_dev.tok.clean.en" - -# cat "${OUTPUT_DIR}/newstest2015.tok.clean.de" \ -# "${OUTPUT_DIR}/newstest2016.tok.clean.de" \ -# > "${OUTPUT_DIR}/newstest_dev.tok.clean.de" - -# # Filter datasets -# python3 pytorch/scripts/filter_dataset.py -f1 ${OUTPUT_DIR}/train.tok.clean.en -f2 ${OUTPUT_DIR}/train.tok.clean.de -# python3 pytorch/scripts/filter_dataset.py -f1 ${OUTPUT_DIR}/newstest_dev.tok.clean.en -f2 ${OUTPUT_DIR}/newstest_dev.tok.clean.de - -# Generate Subword Units (BPE) -# Clone Subword NMT -if [ ! -d "${OUTPUT_DIR}/subword-nmt" ]; then - git clone https://github.com/rsennrich/subword-nmt.git "${OUTPUT_DIR}/subword-nmt" - cd ${OUTPUT_DIR}/subword-nmt - git reset --hard 48ba99e657591c329e0003f0c6e32e493fa959ef - cd - -fi - -# # Learn Shared BPE -# for merge_ops in 32000; do -# echo "Learning BPE with merge_ops=${merge_ops}. This may take a while..." -# cat "${OUTPUT_DIR}/train.tok.de" "${OUTPUT_DIR}/train.tok.en" | \ -# ${OUTPUT_DIR}/subword-nmt/learn_bpe.py -s $merge_ops > "${OUTPUT_DIR}/bpe.${merge_ops}" - -# echo "Apply BPE with merge_ops=${merge_ops} to tokenized files..." 
-# for lang in en de; dols -# for f in ${OUTPUT_DIR}/*.tok.${lang} ${OUTPUT_DIR}/*.tok.${lang}; do -# outfile="${f%.*}.bpe.${merge_ops}.${lang}" -# ${OUTPUT_DIR}/subword-nmt/apply_bpe.py -c "${OUTPUT_DIR}/bpe.${merge_ops}" < $f > "${outfile}" -# echo ${outfile} -# done -# done - -# # Create vocabulary file for BPE -# cat "${OUTPUT_DIR}/train.tok.bpe.${merge_ops}.en" "${OUTPUT_DIR}/train.tok.bpe.${merge_ops}.de" | \ -# ${OUTPUT_DIR}/subword-nmt/get_vocab.py | cut -f1 -d ' ' > "${OUTPUT_DIR}/vocab.bpe.${merge_ops}" -# done -ORIGINAL_DATASET="/gpfs/fs1/datasets/mlperf_inference/preprocessed_data/nmt/GNMT/" -CUSTOM_DATASET_OUTPUT="/gpfs/fs1/anirbang/custom_datasets/mlperf_inference/preprocessed_data/nmt/GNMT" -BPE_CODES_ORIGINAL="${ORIGINAL_DATASET}/bpe.32000" -BPE_CODES="${OUTPUT_DIR}/bpe.32000" -cp ${BPE_CODES_ORIGINAL} ${BPE_CODES} - -fbase="${OUTPUT_DIR}/newstest2014" -${OUTPUT_DIR}/mosesdecoder/scripts/tokenizer/tokenizer.perl -q -l de -threads 8 < ${fbase}.de > ${fbase}.tok.de -${OUTPUT_DIR}/mosesdecoder/scripts/tokenizer/tokenizer.perl -q -l de -threads 8 < ${fbase}.en > ${fbase}.tok.en -${OUTPUT_DIR}/subword-nmt/subword_nmt/apply_bpe.py -c $BPE_CODES < ${fbase}.tok.en > ${fbase}.tok.bpe.en -${OUTPUT_DIR}/subword-nmt/subword_nmt/apply_bpe.py -c $BPE_CODES < ${fbase}.tok.de > ${fbase}.tok.bpe.de - - -echo "Copying original dataset to custom directory" -cp -r ${ORIGINAL_DATASET}/* ${CUSTOM_DATASET_OUTPUT}/ -echo "Replacing original dataset files with custom dataset files" -cp "${OUTPUT_DIR}/newstest2014.tok.bpe.en" "${CUSTOM_DATASET_OUTPUT}/newstest2014.tok.bpe.32000.en" -cp "${OUTPUT_DIR}/newstest2014.tok.bpe.de" "${CUSTOM_DATASET_OUTPUT}/newstest2014.tok.bpe.32000.de" - -echo "Preparing perf mode dataset" -rm "${CUSTOM_DATASET_OUTPUT}/newstest2014.tok.bpe.32000.en.large" -for i in {1..1300};do cat "${CUSTOM_DATASET_OUTPUT}/newstest2014.tok.bpe.32000.en" >> "${CUSTOM_DATASET_OUTPUT}/newstest2014.tok.bpe.32000.en.large"; done - -echo "All done." 
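A quick sanity check on the modified dataset (an illustrative sketch, not part of the original scripts; the paths assume the OUTPUT_DIR layout used above) is to confirm that the substitution pass preserved the line count while altering most lines:

    wc -l outputs/newstest2014.original.en outputs/newstest2014.en
    # count how many lines the word-substitution pass changed
    diff outputs/newstest2014.original.en outputs/newstest2014.en | grep -c '^<'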
diff --git a/compliance/audit_v0.5/nvidia/TEST03/modify_gnmt_data.py b/compliance/audit_v0.5/nvidia/TEST03/modify_gnmt_data.py deleted file mode 100644 index aa4b034c9..000000000 --- a/compliance/audit_v0.5/nvidia/TEST03/modify_gnmt_data.py +++ /dev/null @@ -1,147 +0,0 @@ -import fileinput -import argparse - -def replace_words(my_dict, filename): - #Read input file - inp_file = open(filename+".en") - out_file = open(filename+".new.en", "w") - count = 0 #Total number of lines modified - count2 = 0 #Total number of lines - #Replace words and write to file - for line in inp_file: - count2 += 1 - newline = line - #print(line) - flag = 0 - for search_str in my_dict: - if search_str in line : - flag = 1 - newline = newline.replace(search_str, my_dict[search_str]) - #print(newline) - if flag == 1: - count += 1 - out_file.write(newline) - #print(count, count2) #Uncomment this line to print the number of modifications done to the file - - -dict2 = { " he ": " she ", - "He " : "She ", - " him " : " her ", - " his " : " her ", - "Him " : "Her ", - " a few " : " many ", - " few " : " many ", - " more ": " less ", - " not " : " ", - " said" : " swims", - " love ": " hate ", - " says" : " swims", - " impossible " : " easy ", - " hard " : " easy ", - "Wednesday" : "Yesterday", - " can " : " can't ", - " will ": " won't ", - "first" : "fifth", - " last " : " fourth ", - " second " : " third ", - "brother" : "cat", - " man " : " woman ", - "men" : "women", - "girlfriend": "cousin", - "today" : "a year back", - "After " : "Before ", - " more " : " less ", - " shops " : " cars ", - " food " : " people ", - " small ": " big ", - "million" : "thousand", - "police" : "apple", - "swimmer" : "police", - " day " : " night ", - " minutes ":" hours ", - " seconds ":" minutes ", - "singing" : "playing", - "Thursday" : "Tuesday", - "money" : "chocolates", - "injured" : "hurt", - "killed" : "awarded", - " months " : " days ", - " year" : " second", - " good " : " bad ", - " gold " : " diamond ", - "phone" : "computer", - "5": "6", - "0": "1", - "9": "2", - "8": "3", - "water" : "juice", - "newspaper" : "story", - " car " : " dog ", - " news ": " car ", - " driver" : " athlete", - " citizen" : " terrorist", - " speak" : " drive", - " ago ": " in the future ", - " difficult " : " annoying ", - " customer " : " baby ", - " announced " : " travelled ", - " billion" : " hundered", - "country" : "street", - "company" : "district", - "government" : "company"} -dict1 = { " he ": " she ", - "He " : "She ", - " him " : " her ", - " his " : " her ", - "Him " : "Her ", - " a few " : " many ", - " few " : " many ", - " more ": " less ", - " not " : " ", - " said" : " thought", - " love ": " hate ", - " says" : " thinks", - " impossible " : " easy ", - " hard " : " easy ", - "Wednesday" : "Friday", - " can " : " can't ", - " will ": " won't ", - "first" : "fifth", - " last " : " fourth ", - " second " : " third ", - "brother" : "sister", - " man " : " woman ", - "men" : "women", - "girlfriend": "boyfriend", - "today" : "yesterday", - "After " : "Before ", - " more " : " less ", - " shops " : " restaurants ", - " food " : " water ", - " small ": " big ", - "million" : "thousand", - "police" : "guard", - "swimmer" : "athlete", - " day " : " night ", - " minutes ":" hours ", - " seconds ":" minutes ", - "singing" : "dancing", - "Thursday" : "Tuesday", - "money" : "silver", - "injured" : "hurt", - "killed" : "injured", - " months " : " days ", - " year" : " month", - " good " : " bad ", - " gold " : " bronze ", - "world" : "house"} - 
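-# dict1 and dict2 above are two alternative substitution tables: dict1 makes
-# milder word swaps, while dict2 also rewrites digits and substitutes more
-# disruptive replacements. The table assigned to my_dict below is the one
-# replace_words() applies to the input file.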
-my_dict = dict2 -parser = argparse.ArgumentParser() -parser.add_argument( - "--filename", "-f", - help="Specifies the name of the english file", - default="" - ) -args = parser.parse_args() -replace_words(my_dict, args.filename) diff --git a/compliance/audit_v0.5/nvidia/TEST03/modify_image_data.py b/compliance/audit_v0.5/nvidia/TEST03/modify_image_data.py deleted file mode 100644 index 7dba27a07..000000000 --- a/compliance/audit_v0.5/nvidia/TEST03/modify_image_data.py +++ /dev/null @@ -1,120 +0,0 @@ -#! /usr/bin/env python3 -import os -import sys -sys.path.append(os.getcwd()) - -import argparse -import numpy as np -import shutil - -#from common import logging -#from PIL import Image -import cv2 -import math - - -def modify_imagenet(data_dir, custom_data_dir): - - #logging.info("Modifying imagenet...") - print("Moidfying imagenet") - dirlist = os.listdir(data_dir) - image_list = [x for x in dirlist if x.endswith(".JPEG")] - - src_dir = data_dir - dst_dir = os.path.join(custom_data_dir, "imagenet") - - if not os.path.exists(dst_dir): - os.makedirs(dst_dir) - - for idx, file_name in enumerate(image_list): - if (idx % 1000) == 0: - print("Processing image No.{:d}/{:d}...".format(idx, len(image_list))) - img_out = os.path.join(dst_dir, file_name) - if not os.path.exists(img_out): - image = cv2.imread(os.path.join(src_dir, file_name)) - #Set pixels to 0 - image[:,:,0] = 0 - #print ("Writing image No.{:d}/{:d}...".format(idx, len(image_list))) - cv2.imwrite(img_out, image) - - -def modify_coco(data_dir, custom_data_dir): - - #logging.info("Preprocessing coco...") - - def modify_coco_helper(src_dir, dst_dir, image_list): - - if not os.path.exists(dst_dir): - os.makedirs(dst_dir) - - for idx, file_name in enumerate(image_list): - #logging.info("Processing image No.{:d}/{:d}...".format(idx, len(image_list))) - img_out = os.path.join(dst_dir, file_name) - if not os.path.exists(img_out): - image_path = os.path.join(src_dir, file_name) - image = cv2.imread(image_path) - #Set pixels to 0 - image[:,:,0] = 0 - cv2.imwrite(img_out, image) - - #Modify the validation set - src_dir = os.path.join(data_dir, "val2017") - dst_dir = os.path.join(custom_data_dir, "coco/val2017/") - - dirlist = os.listdir(src_dir) - image_list = [x for x in dirlist if x.endswith(".jpg")] - modify_coco_helper(src_dir, dst_dir, image_list) - - #Copy the training set - src_dir = os.path.join(data_dir, "train2017") - dst_dir = os.path.join(custom_data_dir, "coco/train2017") - shutil.copytree(src_dir, dst_dir) - -def copy_coco_annotations(data_dir, output_dir): - src_dir = os.path.join(data_dir, "annotations") - dst_dir = os.path.join(output_dir, "coco/annotations") - shutil.copytree(src_dir, dst_dir) - -def main(): - # Parse arguments to identify the data directory with the input images - # and the output directory for the new custom images - parser = argparse.ArgumentParser() - parser.add_argument( - "--data_dir", "-d", - help="Specifies the directory containing the input images.", - default="" - ) - parser.add_argument( - "--output_dir", "-o", - help="Specifies the output directory for the custom data.", - default="" - ) - parser.add_argument( - "--dataset", - help="Specifies the dataset - coco or imagenet", - default="" - ) - args = parser.parse_args() - print ("Running dataset modifer....") - # Now, actually modify the input images - #logging.info("Loading and modifying images. 
This might take a while...") - data_dir = args.data_dir - output_dir = args.output_dir - #while True: - #print ("a") - #pass - if args.dataset == "imagenet": - print("Begin Imagenet") - modify_imagenet(data_dir, output_dir) - print("Imagenet complete") - elif args.dataset == "coco": - modify_coco(data_dir, output_dir) - copy_coco_annotations(data_dir, output_dir) - else: - print("Incorrect dataset") - #logging.info("Incorrect dataset. It can be either coco or imagenet.") - #logging.info("Processing done.") - -if __name__ == '__main__': - main() - diff --git a/compliance/audit_v0.5/nvidia/TEST03/verify_performance.py b/compliance/audit_v0.5/nvidia/TEST03/verify_performance.py deleted file mode 100644 index 09fcbfb91..000000000 --- a/compliance/audit_v0.5/nvidia/TEST03/verify_performance.py +++ /dev/null @@ -1,126 +0,0 @@ -#! /usr/bin/env python3 -import os -import sys -import re -sys.path.append(os.getcwd()) - -import argparse -import json - -def main(): - # Parse arguments to identify the path to the accuracy logs from - # the accuracy and performance runs - parser = argparse.ArgumentParser() - parser.add_argument( - "--reference_summary", "-r", - help="Specifies the path to the summary log for TEST00.", - default="" - ) - parser.add_argument( - "--test_summary", "-t", - help="Specifies the path to the summary log for this test.", - default="" - ) - args = parser.parse_args() - - print("Verifying performance.") - ref_file = open(args.reference_summary, "r") - test_file = open(args.test_summary, "r") - ref_score = 0 - test_score = 0 - ref_mode = '' - test_mode = '' - - for line in ref_file: - if re.match("Scenario", line): - ref_mode = line.split(": ",1)[1].strip() - continue - - if ref_mode == "Single Stream": - if re.match("90th percentile latency", line): - ref_score = line.split(": ",1)[1].strip() - continue - - if ref_mode == "Multi Stream": - if re.match("Samples per query", line): - ref_score = line.split(": ",1)[1].strip() - continue - - if ref_mode == "Server": - if re.match("Scheduled samples per second", line): - ref_score = line.split(": ",1)[1].strip() - continue - - if ref_mode == "Offline": - if re.match("Samples per second", line): - ref_score = line.split(": ",1)[1].strip() - continue - - if re.match("Result is", line): - valid = line.split(": ",1)[1].strip() - if valid == 'INVALID': - sys.exit("TEST FAIL: Reference results are invalid") - - if re.match("\d+ ERROR", line): - error = line.split(" ",1)[0].strip() - print("WARNING: " + error + " ERROR reported in reference results") - - - for line in test_file: - if re.match("Scenario", line): - test_mode = line.split(": ",1)[1].strip() - continue - - if test_mode == "Single Stream": - if re.match("90th percentile latency", line): - test_score = line.split(": ",1)[1].strip() - continue - - if test_mode == "Multi Stream": - if re.match("Samples per query", line): - test_score = line.split(": ",1)[1].strip() - continue - - if test_mode == "Server": - if re.match("Scheduled samples per second", line): - test_score = line.split(": ",1)[1].strip() - continue - - if test_mode == "Offline": - if re.match("Samples per second", line): - test_score = line.split(": ",1)[1].strip() - continue - - if re.match("Result is", line): - valid = line.split(": ",1)[1].strip() - if valid == 'INVALID': - sys.exit("TEST FAIL: Test results are invalid") - - if re.match("\d+ ERROR", line): - error = line.split(" ",1)[0].strip() - print("WARNING: " + error + " ERROR reported in test results") - - if test_mode != ref_mode: - sys.exit("Test and reference 
scenarios do not match!") - - print("reference score = {}".format(ref_score)) - print("test score = {}".format(test_score)) - - - threshold = 0.10 - - # In single stream mode, latencies can be very short for high performance systems - # and run-to-run variation due to external disturbances (OS) can be significant. - # In this case we relax pass threshold to 20% - - if ref_mode == "Single Stream" and float(ref_score) <= 200000: - threshold = 0.20 - - if float(test_score) < float(ref_score) * (1 + threshold) and float(test_score) > float(ref_score) * (1 - threshold): - print("TEST PASS") - else: - print("TEST FAIL: Test score invalid") - -if __name__ == '__main__': - main() - diff --git a/compliance/audit_v0.5/nvidia/TEST04-A/README b/compliance/audit_v0.5/nvidia/TEST04-A/README deleted file mode 100644 index 7df96c5a7..000000000 --- a/compliance/audit_v0.5/nvidia/TEST04-A/README +++ /dev/null @@ -1,39 +0,0 @@ -The purpose of this test is to ensure that results are not cached on the fly when SUT sees duplicate sample IDs. - -By default, QSL loads a subset of the dataset determined by Performance Sample Count (say P) and queries for each scenario are -built using samples from the PerformanceSample implying the same sample can get repeatedly sent to the SUT over -the test duration. - -This test requires measuring & comparing performance of SUT (PerformanceOnly, mode=2) for two audit settings: - (TEST04-A) Issue P unique samples: In Offline scenario, a single query with samples_per_query equivalent to P unique samples is issued. - In Multi-Stream scenario, test ends after #queries = P/samples_per_query have been issued. - In Single-Stream/Server scenario test ends after P unique queries have been issued. - (TEST04-B) Issue same sample P times: In Offline scenario, the same sample is repeated P times to fill the query. This breaks the requirement - of reading contiguous memory locations in Offline mode, but it is normal for an audit test, meant to - stress the SUT in newer ways, to cause performance degradation. - In Multi-Stream scenario, the same query is repeated for #queries (=P/samples_per_query). - In Single-Stream/Server scenario test ends after sending P same queries. -This test is not applicable for: - (1) GNMT benchmark: Performance of GNMT benchmark is dependant on sample sequence length and hence performance for the two cases mentioned above can differ significantly. - (2) For Multi-Stream scenario, if samples_per_query >= P: The two cases above are the same and hence do not require testing. - -Validation checks: - TEST04-B should not be significantly faster than TEST04-A in a fair system which does not cache. - -This test does not use custom dataset or weights. - -Instructions - -Part I - Copy audit.config from TEST04-A folder to the working directory and run test. - -Part II - Copy audit.config from TEST04-B folder to the working directory and run test. - -Part III - Check the performance reported by TEST04-A matches that of TEST04-B by running the script provided - - python verify_test4_performance.py -u -s - -Expected outcome: - TEST PASS diff --git a/compliance/audit_v0.5/nvidia/TEST04-A/audit.config b/compliance/audit_v0.5/nvidia/TEST04-A/audit.config deleted file mode 100644 index cecff59e8..000000000 --- a/compliance/audit_v0.5/nvidia/TEST04-A/audit.config +++ /dev/null @@ -1,16 +0,0 @@ -# The format of this config file is 'key = value'. -# The key has the format 'model.scenario.key'. Value is mostly int64_t. -# Model maybe '*' as wildcard. 
In that case the value applies to all models. -# All times are in milli seconds -*.MultiStream.mode = 2 -*.MultiStream.performance_issue_unique = 1 -*.MultiStream.performance_issue_same = 0 -*.Offline.mode = 2 -*.Offline.performance_issue_unique = 1 -*.Offline.performance_issue_same = 0 -*.SingleStream.mode = 2 -*.SingleStream.performance_issue_unique = 1 -*.SingleStream.performance_issue_same = 0 -*.Server.mode = 2 -*.Server.performance_issue_unique = 1 -*.Server.performance_issue_same = 0 diff --git a/compliance/audit_v0.5/nvidia/TEST04-A/verify_test4_performance.py b/compliance/audit_v0.5/nvidia/TEST04-A/verify_test4_performance.py deleted file mode 100644 index 8a505f7e2..000000000 --- a/compliance/audit_v0.5/nvidia/TEST04-A/verify_test4_performance.py +++ /dev/null @@ -1,135 +0,0 @@ -#! /usr/bin/env python3 -import os -import sys -import re -sys.path.append(os.getcwd()) - -import argparse -import json - -def main(): - # Parse arguments to identify the path to the accuracy logs from - # the accuracy and performance runs - parser = argparse.ArgumentParser() - parser.add_argument( - "--unique_sample", "-u", - help="Specifies the path to the summary log for TEST04-A.", - default="" - ) - parser.add_argument( - "--same_sample", "-s", - help="Specifies the path to the summary log for TEST04-B.", - default="" - ) - args = parser.parse_args() - - print("Verifying performance.") - ref_file = open(args.unique_sample, "r") - test_file = open(args.same_sample, "r") - ref_score = 0 - test_score = 0 - ref_mode = '' - test_mode = '' - performance_issue_unqiue = '' - performance_issue_same = '' - - for line in ref_file: - if re.match("Scenario", line): - ref_mode = line.split(": ",1)[1].strip() - continue - - if ref_mode == "Single Stream": - if re.match("90th percentile latency", line): - ref_score = line.split(": ",1)[1].strip() - continue - - if ref_mode == "Multi Stream": - if re.match("Samples per query", line): - ref_score = line.split(": ",1)[1].strip() - continue - - if ref_mode == "Server": - if re.match("Scheduled samples per second", line): - ref_score = line.split(": ",1)[1].strip() - continue - - if ref_mode == "Offline": - if re.match("Samples per second", line): - ref_score = line.split(": ",1)[1].strip() - continue - - - if re.match("\d+ ERROR", line): - error = line.split(" ",1)[0].strip() - print("WARNING: " + error + " ERROR reported in TEST04-A results") - - if re.match("performance_issue_unique", line): - performance_issue_unique = line.split(": ",1)[1].strip() - if performance_issue_unique == 'false': - sys.exit("TEST FAIL: Invalid test settings in TEST04-A summary.") - break - - for line in test_file: - if re.match("Scenario", line): - test_mode = line.split(": ",1)[1].strip() - continue - - if test_mode == "Single Stream": - if re.match("90th percentile latency", line): - test_score = line.split(": ",1)[1].strip() - continue - - if test_mode == "Multi Stream": - if re.match("Samples per query", line): - test_score = line.split(": ",1)[1].strip() - continue - - if test_mode == "Server": - if re.match("Scheduled samples per second", line): - test_score = line.split(": ",1)[1].strip() - continue - - if test_mode == "Offline": - if re.match("Samples per second", line): - test_score = line.split(": ",1)[1].strip() - continue - - if re.match("\d+ ERROR", line): - error = line.split(" ",1)[0].strip() - print("WARNING: " + error + " ERROR reported in TEST04-B results") - - if re.match("performance_issue_same", line): - performance_issue_same = line.split(": ",1)[1].strip() - if 
performance_issue_same == 'false': - sys.exit("TEST FAIL: Invalid test settings in TEST04-B summary.") - break - - if test_mode != ref_mode: - sys.exit("Test and reference scenarios do not match!") - - print("TEST04-A score = {}".format(ref_score)) - print("TEST04-B score = {}".format(test_score)) - - threshold = 0.10 - - # In single stream mode, latencies can be very short for high performance systems - # and run-to-run variation due to external disturbances (OS) can be significant. - # In this case we relax pass threshold to 20% - - if ref_mode == "Single Stream" and float(ref_score) <= 200000: - threshold = 0.20 - - if float(test_score) < float(ref_score) * (1 + threshold) and float(test_score) > float(ref_score) * (1 - threshold): - print("TEST PASS") - elif (float(test_score) > float(ref_score) and test_mode == "Single Stream"): - print("TEST PASS") - print("Note: TEST04-B is significantly slower than TEST04-A") - elif (float(test_score) < float(ref_score) and test_mode != "Single Stream"): - print("TEST PASS") - print("Note: TEST04-B is significantly slower than TEST04-A") - else: - print("TEST FAIL: Test score invalid") - -if __name__ == '__main__': - main() - diff --git a/compliance/audit_v0.5/nvidia/TEST04-B/README b/compliance/audit_v0.5/nvidia/TEST04-B/README deleted file mode 100644 index 76c8c1b20..000000000 --- a/compliance/audit_v0.5/nvidia/TEST04-B/README +++ /dev/null @@ -1 +0,0 @@ -Refer README & verify performance script provided under TEST04-A. diff --git a/compliance/audit_v0.5/nvidia/TEST04-B/audit.config b/compliance/audit_v0.5/nvidia/TEST04-B/audit.config deleted file mode 100644 index b6e82b2cc..000000000 --- a/compliance/audit_v0.5/nvidia/TEST04-B/audit.config +++ /dev/null @@ -1,20 +0,0 @@ -# The format of this config file is 'key = value'. -# The key has the format 'model.scenario.key'. Value is mostly int64_t. -# Model maybe '*' as wildcard. In that case the value applies to all models. -# All times are in milli seconds -*.MultiStream.mode = 2 -*.MultiStream.performance_issue_unique = 0 -*.MultiStream.performance_issue_same = 1 -*.MultiStream.performance_issue_same_index = 3 -*.Offline.mode = 2 -*.Offline.performance_issue_unique = 0 -*.Offline.performance_issue_same = 1 -*.Offline.performance_issue_same_index = 3 -*.SingleStream.mode = 2 -*.SingleStream.performance_issue_unique = 0 -*.SingleStream.performance_issue_same = 1 -*.SingleStream.performance_issue_same_index = 3 -*.Server.mode = 2 -*.Server.performance_issue_unique = 0 -*.Server.performance_issue_same = 1 -*.Server.performance_issue_same_index = 3 diff --git a/compliance/audit_v0.5/nvidia/TEST05/README b/compliance/audit_v0.5/nvidia/TEST05/README deleted file mode 100644 index 94cef6834..000000000 --- a/compliance/audit_v0.5/nvidia/TEST05/README +++ /dev/null @@ -1,25 +0,0 @@ -The purpose of this test is to ensure that the SUT is not tuned for specific Loadgen RNG seed values. -The pass condition is that performance with non-default RNG seed values should be similar to the submitted -score. - -The seeds that are changed are listed below: - qsl_rng_seed - determines order of samples in QSL - sample_index_rng_seed - determines subset of samples in each loadable set - schedule_rng_seed - determines scheduling of samples in server mode - -This test does not use custom dataset or weights. - -Instructions - -Part I - Run test with provided audit.config in PerformanceOnly mode - -Part II - Ensure that performance with custom RNG seeds matches submission performance score. 
- - python verify_performance.py -r -t - -Expected outcome: - TEST PASS - - diff --git a/compliance/audit_v0.5/nvidia/TEST05/audit.config b/compliance/audit_v0.5/nvidia/TEST05/audit.config deleted file mode 100644 index 44c553667..000000000 --- a/compliance/audit_v0.5/nvidia/TEST05/audit.config +++ /dev/null @@ -1,22 +0,0 @@ -# The format of this config file is 'key = value'. -# The key has the format 'model.scenario.key'. Value is mostly int64_t. -# Model maybe '*' as wildcard. In that case the value applies to all models. -# All times are in milli seconds - -# mode dictionary (0 = submission, 1 = accuracy, 2 = performance, 3 = find peak perf) -*.MultiStream.mode = 2 -*.MultiStream.qsl_rng_seed = 195 -*.MultiStream.sample_index_rng_seed = 235 -*.MultiStream.schedule_rng_seed = 634 -*.Offline.mode = 2 -*.Offline.qsl_rng_seed = 195 -*.Offline.sample_index_rng_seed = 235 -*.Offline.schedule_rng_seed = 634 -*.SingleStream.mode = 2 -*.SingleStream.qsl_rng_seed = 195 -*.SingleStream.sample_index_rng_seed = 235 -*.SingleStream.schedule_rng_seed = 634 -*.Server.mode = 2 -*.Server.qsl_rng_seed = 195 -*.Server.sample_index_rng_seed = 235 -*.Server.schedule_rng_seed = 634 diff --git a/compliance/audit_v0.5/nvidia/TEST05/verify_performance.py b/compliance/audit_v0.5/nvidia/TEST05/verify_performance.py deleted file mode 100644 index 09fcbfb91..000000000 --- a/compliance/audit_v0.5/nvidia/TEST05/verify_performance.py +++ /dev/null @@ -1,126 +0,0 @@ -#! /usr/bin/env python3 -import os -import sys -import re -sys.path.append(os.getcwd()) - -import argparse -import json - -def main(): - # Parse arguments to identify the path to the accuracy logs from - # the accuracy and performance runs - parser = argparse.ArgumentParser() - parser.add_argument( - "--reference_summary", "-r", - help="Specifies the path to the summary log for TEST00.", - default="" - ) - parser.add_argument( - "--test_summary", "-t", - help="Specifies the path to the summary log for this test.", - default="" - ) - args = parser.parse_args() - - print("Verifying performance.") - ref_file = open(args.reference_summary, "r") - test_file = open(args.test_summary, "r") - ref_score = 0 - test_score = 0 - ref_mode = '' - test_mode = '' - - for line in ref_file: - if re.match("Scenario", line): - ref_mode = line.split(": ",1)[1].strip() - continue - - if ref_mode == "Single Stream": - if re.match("90th percentile latency", line): - ref_score = line.split(": ",1)[1].strip() - continue - - if ref_mode == "Multi Stream": - if re.match("Samples per query", line): - ref_score = line.split(": ",1)[1].strip() - continue - - if ref_mode == "Server": - if re.match("Scheduled samples per second", line): - ref_score = line.split(": ",1)[1].strip() - continue - - if ref_mode == "Offline": - if re.match("Samples per second", line): - ref_score = line.split(": ",1)[1].strip() - continue - - if re.match("Result is", line): - valid = line.split(": ",1)[1].strip() - if valid == 'INVALID': - sys.exit("TEST FAIL: Reference results are invalid") - - if re.match("\d+ ERROR", line): - error = line.split(" ",1)[0].strip() - print("WARNING: " + error + " ERROR reported in reference results") - - - for line in test_file: - if re.match("Scenario", line): - test_mode = line.split(": ",1)[1].strip() - continue - - if test_mode == "Single Stream": - if re.match("90th percentile latency", line): - test_score = line.split(": ",1)[1].strip() - continue - - if test_mode == "Multi Stream": - if re.match("Samples per query", line): - test_score = line.split(": ",1)[1].strip() - 
diff --git a/compliance/audit_v0.5/nvidia/TEST05/verify_performance.py b/compliance/audit_v0.5/nvidia/TEST05/verify_performance.py
deleted file mode 100644
index 09fcbfb91..000000000
--- a/compliance/audit_v0.5/nvidia/TEST05/verify_performance.py
+++ /dev/null
@@ -1,126 +0,0 @@
-#! /usr/bin/env python3
-import os
-import sys
-import re
-sys.path.append(os.getcwd())
-
-import argparse
-import json
-
-def main():
-    # Parse arguments to identify the path to the accuracy logs from
-    # the accuracy and performance runs
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--reference_summary", "-r",
-        help="Specifies the path to the summary log for TEST00.",
-        default=""
-    )
-    parser.add_argument(
-        "--test_summary", "-t",
-        help="Specifies the path to the summary log for this test.",
-        default=""
-    )
-    args = parser.parse_args()
-
-    print("Verifying performance.")
-    ref_file = open(args.reference_summary, "r")
-    test_file = open(args.test_summary, "r")
-    ref_score = 0
-    test_score = 0
-    ref_mode = ''
-    test_mode = ''
-
-    for line in ref_file:
-        if re.match("Scenario", line):
-            ref_mode = line.split(": ",1)[1].strip()
-            continue
-
-        if ref_mode == "Single Stream":
-            if re.match("90th percentile latency", line):
-                ref_score = line.split(": ",1)[1].strip()
-                continue
-
-        if ref_mode == "Multi Stream":
-            if re.match("Samples per query", line):
-                ref_score = line.split(": ",1)[1].strip()
-                continue
-
-        if ref_mode == "Server":
-            if re.match("Scheduled samples per second", line):
-                ref_score = line.split(": ",1)[1].strip()
-                continue
-
-        if ref_mode == "Offline":
-            if re.match("Samples per second", line):
-                ref_score = line.split(": ",1)[1].strip()
-                continue
-
-        if re.match("Result is", line):
-            valid = line.split(": ",1)[1].strip()
-            if valid == 'INVALID':
-                sys.exit("TEST FAIL: Reference results are invalid")
-
-        if re.match("\d+ ERROR", line):
-            error = line.split(" ",1)[0].strip()
-            print("WARNING: " + error + " ERROR reported in reference results")
-
-
-    for line in test_file:
-        if re.match("Scenario", line):
-            test_mode = line.split(": ",1)[1].strip()
-            continue
-
-        if test_mode == "Single Stream":
-            if re.match("90th percentile latency", line):
-                test_score = line.split(": ",1)[1].strip()
-                continue
-
-        if test_mode == "Multi Stream":
-            if re.match("Samples per query", line):
-                test_score = line.split(": ",1)[1].strip()
-                continue
-
-        if test_mode == "Server":
-            if re.match("Scheduled samples per second", line):
-                test_score = line.split(": ",1)[1].strip()
-                continue
-
-        if test_mode == "Offline":
-            if re.match("Samples per second", line):
-                test_score = line.split(": ",1)[1].strip()
-                continue
-
-        if re.match("Result is", line):
-            valid = line.split(": ",1)[1].strip()
-            if valid == 'INVALID':
-                sys.exit("TEST FAIL: Test results are invalid")
-
-        if re.match("\d+ ERROR", line):
-            error = line.split(" ",1)[0].strip()
-            print("WARNING: " + error + " ERROR reported in test results")
-
-    if test_mode != ref_mode:
-        sys.exit("Test and reference scenarios do not match!")
-
-    print("reference score = {}".format(ref_score))
-    print("test score = {}".format(test_score))
-
-
-    threshold = 0.10
-
-    # In single stream mode, latencies can be very short for high performance systems
-    # and run-to-run variation due to external disturbances (OS) can be significant.
-    # In this case we relax pass threshold to 20%
-
-    if ref_mode == "Single Stream" and float(ref_score) <= 200000:
-        threshold = 0.20
-
-    if float(test_score) < float(ref_score) * (1 + threshold) and float(test_score) > float(ref_score) * (1 - threshold):
-        print("TEST PASS")
-    else:
-        print("TEST FAIL: Test score invalid")
-
-if __name__ == '__main__':
-    main()

From ef663f8581bd3f02a14483e766e1cc03179c83f9 Mon Sep 17 00:00:00 2001
From: Jinho Suh <83969361+nv-jinhosuh@users.noreply.github.com>
Date: Mon, 25 Jul 2022 17:45:21 -0500
Subject: [PATCH 4/5] [Loadgen for LON] making non-const function for sut->Name() call (#1184)

* [Loadgen for LON] making non-const function for sut->Name() call

sut->Name() call requires dynamic behavior through QDL in LON.

* Removing const from SUT Name()

* Removing const from Name() - missed ones

Co-authored-by: rameshchukka
---
 loadgen/benchmark/repro.cpp         | 6 +++---
 loadgen/bindings/c_api.cc           | 2 +-
 loadgen/bindings/python_api.cc      | 2 +-
 loadgen/system_under_test.h         | 2 +-
 loadgen/tests/perftests_null_sut.cc | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/loadgen/benchmark/repro.cpp b/loadgen/benchmark/repro.cpp
index e724338ac..300a2b555 100644
--- a/loadgen/benchmark/repro.cpp
+++ b/loadgen/benchmark/repro.cpp
@@ -49,7 +49,7 @@ class BasicSUT : public mlperf::SystemUnderTest {
     initResponse(10000);
   }
   ~BasicSUT() override {}
-  const std::string& Name() const override { return mName; }
+  const std::string& Name() override { return mName; }
   void IssueQuery(const std::vector<mlperf::QuerySample>& samples) override {
     int n = samples.size();
     if (n > mResponses.size()) {
@@ -96,7 +96,7 @@ class QueueSUT : public mlperf::SystemUnderTest {
       thread.join();
     }
   }
-  const std::string& Name() const override { return mName; }
+  const std::string& Name() override { return mName; }
   void IssueQuery(const std::vector<mlperf::QuerySample>& samples) override {
     std::unique_lock<std::mutex> lck(mMtx);
     for (const auto& sample : samples) {
@@ -163,7 +163,7 @@ class MultiBasicSUT : public mlperf::SystemUnderTest {
      thread.join();
    }
  }
-  const std::string& Name() const override { return mName; }
+  const std::string& Name() override { return mName; }
   void IssueQuery(const std::vector<mlperf::QuerySample>& samples) override {
     int thread_idx = mThreadMap[std::this_thread::get_id()];
     int n = samples.size();
diff --git a/loadgen/bindings/c_api.cc b/loadgen/bindings/c_api.cc
index f7c7f3cf2..21b2aa96d 100644
--- a/loadgen/bindings/c_api.cc
+++ b/loadgen/bindings/c_api.cc
@@ -36,7 +36,7 @@ class SystemUnderTestTrampoline : public SystemUnderTest {
         flush_queries_cb_(flush_queries_cb) {}
   ~SystemUnderTestTrampoline() override = default;
 
-  const std::string& Name() const override { return name_; }
+  const std::string& Name() override { return name_; }
 
   void IssueQuery(const std::vector<QuerySample>& samples) override {
     (*issue_cb_)(client_data_, samples.data(), samples.size());
diff --git a/loadgen/bindings/python_api.cc b/loadgen/bindings/python_api.cc
index 345a2a03d..9aa2732b8 100644
--- a/loadgen/bindings/python_api.cc
+++ b/loadgen/bindings/python_api.cc
@@ -47,7 +47,7 @@ class SystemUnderTestTrampoline : public SystemUnderTest {
         flush_queries_cb_(flush_queries_cb) {}
   ~SystemUnderTestTrampoline() override = default;
 
-  const std::string& Name() const override { return name_; }
+  const std::string& Name() override { return name_; }
 
   void IssueQuery(const std::vector<QuerySample>& samples) override {
     pybind11::gil_scoped_acquire gil_acquirer;
diff --git a/loadgen/system_under_test.h b/loadgen/system_under_test.h
index eac7f5fde..843453962 100644
--- a/loadgen/system_under_test.h
+++ b/loadgen/system_under_test.h
@@ -38,7 +38,7 @@ class SystemUnderTest {
   virtual ~SystemUnderTest() {}
 
   /// \brief A human-readable string for logging purposes.
-  virtual const std::string& Name() const = 0;
+  virtual const std::string& Name() = 0;
 
   /// \brief Lets the loadgen issue N samples to the SUT.
   /// \details The SUT may either a) return immediately and signal completion
diff --git a/loadgen/tests/perftests_null_sut.cc b/loadgen/tests/perftests_null_sut.cc
index bdcd9d43c..56d562c3e 100644
--- a/loadgen/tests/perftests_null_sut.cc
+++ b/loadgen/tests/perftests_null_sut.cc
@@ -31,7 +31,7 @@ class SystemUnderTestNull : public mlperf::SystemUnderTest {
  public:
   SystemUnderTestNull() = default;
   ~SystemUnderTestNull() override = default;
-  const std::string& Name() const override { return name_; }
+  const std::string& Name() override { return name_; }
   void IssueQuery(const std::vector<mlperf::QuerySample>& samples) override {
     std::vector<mlperf::QuerySampleResponse> responses;
     responses.reserve(samples.size());
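The const qualifier has to go because, under Loadgen-over-Network (LON), the object the loadgen sees as its SUT is a query dispatch library (QDL) whose Name() may have to do real work, such as asking the remote SUT for its name, rather than return a stored string. A hypothetical sketch of such a dispatcher (RemoteSUT and FetchNameOverNetwork are illustrative, not part of the loadgen sources):

    // Hypothetical QDL-style SUT: Name() performs a network round trip on
    // first use and caches the result. That mutates member state, so Name()
    // cannot be a const member function.
    #include <string>
    #include <vector>
    #include "system_under_test.h"

    class RemoteSUT : public mlperf::SystemUnderTest {
     public:
      const std::string& Name() override {
        if (name_.empty()) {
          name_ = FetchNameOverNetwork();  // illustrative helper, defined elsewhere
        }
        return name_;
      }
      void IssueQuery(const std::vector<mlperf::QuerySample>& samples) override {
        // Forward the samples to the remote SUT (omitted).
      }
      void FlushQueries() override {}

     private:
      std::string FetchNameOverNetwork();
      std::string name_;
    };

The alternative, keeping Name() const and marking the cache mutable, would hide the side effect from callers; loosening the interface instead makes the dynamic behavior explicit for every SUT implementation.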
From 7c3c6977cb6bda1f766c6a8e1de0ec8c55151637 Mon Sep 17 00:00:00 2001
From: Arjun Suresh
Date: Mon, 25 Jul 2022 15:45:58 -0700
Subject: [PATCH 5/5] Update run_local.sh (#1181)

Using python3 is easier here, as python2 is no longer used and the python
command gives an error on macOS.

Co-authored-by: rameshchukka
---
 vision/classification_and_detection/run_local.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vision/classification_and_detection/run_local.sh b/vision/classification_and_detection/run_local.sh
index c014fc1a3..e69e3b8b3 100755
--- a/vision/classification_and_detection/run_local.sh
+++ b/vision/classification_and_detection/run_local.sh
@@ -9,5 +9,5 @@ if [ ! -d $OUTPUT_DIR ]; then
     mkdir -p $OUTPUT_DIR
 fi
 
-python python/main.py --profile $profile $common_opt --model $model_path $dataset \
+python3 python/main.py --profile $profile $common_opt --model $model_path $dataset \
     --output $OUTPUT_DIR $EXTRA_OPS $@