
Merge branch 'mlcommons:master' into docs
arjunsuresh authored Aug 13, 2024
2 parents 36af6b4 + 725b3c0 commit 73ce4fd
Showing 96 changed files with 3,019 additions and 649 deletions.
8 changes: 7 additions & 1 deletion .github/workflows/build_wheels.yml
@@ -3,12 +3,18 @@ name: Build loadgen wheels and release them into PYPI
on:
release:
types: [published]
push:
branches:
- master
paths:
- loadgen/setup.py

jobs:
build_wheels:
name: Build wheels on ${{ matrix.os }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, windows-latest, macOS-latest]

@@ -18,7 +24,7 @@ jobs:
- uses: actions/setup-python@v3

- name: Install requirements
run: python -m pip install cibuildwheel==2.16.2 twine==4.0.2
run: python -m pip install cibuildwheel twine

- name: Build wheels
run: python -m cibuildwheel loadgen/ --output-dir wheels
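For contributors reproducing this step locally, a minimal sketch is below. The install and build commands mirror the workflow above; the final upload step is an assumption (it is what `twine` is installed for) and requires PyPI credentials.

```bash
# Local sketch of the wheel-build step (cibuildwheel uses Docker
# for Linux wheels, so Docker must be available on a Linux host).
python -m pip install cibuildwheel twine
python -m cibuildwheel loadgen/ --output-dir wheels

# Hypothetical publish step -- requires PyPI credentials:
python -m twine upload wheels/*
```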
6 changes: 2 additions & 4 deletions .github/workflows/test-bert.yml
@@ -30,9 +30,7 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python3 -m pip install cmind
cm pull repo mlcommons@ck
cm run script --quiet --tags=get,sys-utils-cm
python3 -m pip install cm4mlops
- name: Test BERT and end to end submission generation
run: |
cm run script --tags=run,mlperf,inference,generate-run-cmds,_submission,_short --quiet --submitter="MLCommons" --hw_name=default --model=bert-99 --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --adr.compiler.tags=gcc --adr.inference-src.version=custom --adr.inference-src.env.CM_GIT_CHECKOUT=${{ github.event.pull_request.head.ref }} --adr.inference-src.env.CM_GIT_URL=${{ github.event.pull_request.head.repo.html_url }} --target_qps=1
cm run script --tags=run,mlperf,inference,generate-run-cmds,_submission,_short --quiet --submitter="MLCommons" --hw_name=default --model=bert-99 --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --adr.compiler.tags=gcc --adr.inference-src.version=custom --adr.inference-src.tags=_repo.${{ github.event.pull_request.head.repo.html_url }},_branch.${{ github.event.pull_request.head.ref }}
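The change above swaps the `CM_GIT_CHECKOUT`/`CM_GIT_URL` environment overrides for variation tags on the `inference-src` dependency. A generic sketch of the new syntax, with placeholder fork URL and branch (the tag pattern mirrors the loadgen test below):

```bash
# Point the inference-src dependency at a fork/branch via variation tags.
# <fork-url> and <branch> are placeholders.
cm run script --tags=get,mlperf,inference,loadgen --quiet --version=custom \
  --adr.inference-src.tags=_repo.<fork-url>,_branch.<branch>
```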
5 changes: 2 additions & 3 deletions .github/workflows/test-loadgen.yml
@@ -28,8 +28,7 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python3 -m pip install cmind
cm pull repo mlcommons@ck
python3 -m pip install cm4mlops
- name: Test Loadgen
run: |
cm run script --tags=get,mlperf,inference,loadgen --quiet --version=custom --adr.inference-src.env.CM_GIT_CHECKOUT=${{ github.event.pull_request.head.ref }} --adr.inference-src.env.CM_GIT_URL=${{ github.event.pull_request.head.repo.html_url }}
cm run script --tags=get,mlperf,inference,loadgen --quiet --version=custom --adr.inference-src.tags=_repo.${{ github.event.pull_request.head.repo.html_url }},_branch.${{ github.event.pull_request.head.ref }}
6 changes: 2 additions & 4 deletions .github/workflows/test-resnet50.yml
@@ -30,9 +30,7 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python3 -m pip install cmind
cm pull repo mlcommons@ck
cm run script --quiet --tags=get,sys-utils-cm
python3 -m pip install cm4mlops
- name: Test Resnet50 and end to end submission generation
run: |
cm run script --tags=run,mlperf,inference,generate-run-cmds,_submission,_short --quiet --submitter="MLCommons" --hw_name=default --model=resnet50 --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=500 --adr.compiler.tags=gcc --adr.inference-src.env.CM_GIT_CHECKOUT=${{ github.event.pull_request.head.ref }} --adr.inference-src.version=custom --adr.inference-src.env.CM_GIT_URL=${{ github.event.pull_request.head.repo.html_url }}
cm run script --tags=run,mlperf,inference,generate-run-cmds,_submission,_short --quiet --submitter="MLCommons" --hw_name=default --model=resnet50 --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=500 --adr.compiler.tags=gcc --adr.inference-src.tags=_branch.${{ github.event.pull_request.head.ref }},_repo.${{ github.event.pull_request.head.repo.html_url }} --adr.inference-src.version=custom
6 changes: 2 additions & 4 deletions .github/workflows/test-retinanet.yml
@@ -30,9 +30,7 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python3 -m pip install cmind
cm pull repo mlcommons@ck
cm run script --quiet --tags=get,sys-utils-cm
python3 -m pip install cm4mlops
- name: Test Retinanet and end to end submission generation
run: |
cm run script --tags=run,mlperf,inference,generate-run-cmds,_submission,_short --quiet --submitter="MLCommons" --hw_name=default --model=retinanet --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=10 --adr.compiler.tags=gcc --adr.inference-src.version=custom --adr.inference-src.env.CM_GIT_CHECKOUT=${{ github.event.pull_request.head.ref }} --adr.inference-src.env.CM_GIT_URL=${{ github.event.pull_request.head.repo.html_url }}
cm run script --tags=run,mlperf,inference,generate-run-cmds,_submission,_short --quiet --submitter="MLCommons" --hw_name=default --model=retinanet --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=10 --adr.compiler.tags=gcc --adr.inference-src.version=custom --adr.inference-src.tags=_repo.${{ github.event.pull_request.head.repo.html_url }},_branch.${{ github.event.pull_request.head.ref }}
5 changes: 2 additions & 3 deletions .github/workflows/test-submission-checker.yml
@@ -28,9 +28,8 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python3 -m pip install cmind
cm pull repo mlcommons@ck
python3 -m pip install cm4mlops
git clone https://github.com/mlcommons/inference_results_v4.0 --depth 1
- name: Test MLPerf inference submission checker
run: |
cm run script --tags=run,mlperf,inference,submission,checker --adr.inference-src.tags=_branch.${{ github.event.pull_request.head.ref }},_repo.${{ github.event.pull_request.head.repo.html_url }} --adr.inference-src.version=custom --input=`pwd`/inference_results_v4.0 --quiet
cm run script --tags=run,mlperf,inference,submission,checker --adr.inference-src.tags=_branch.${{ github.event.pull_request.head.ref }},_repo.${{ github.event.pull_request.head.repo.html_url }} --adr.inference-src.version=custom --input=`pwd`/inference_results_v4.0 --src_version=v4.0 --quiet
6 changes: 2 additions & 4 deletions .github/workflows/test-tvm.yml
@@ -30,9 +30,7 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python3 -m pip install cmind
cm pull repo mlcommons@ck
cm run script --quiet --tags=get,sys-utils-cm
python3 -m pip install cm4mlops
- name: Test Resnet50 TVM backend
run: |
cm run script --tags=run,mlperf,inference,generate-run-cmds --quiet --submitter="MLCommons" --hw_name=default --model=resnet50 --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=5 --adr.compiler.tags=gcc --adr.inference-src.version=custom --adr.inference-src.env.CM_GIT_CHECKOUT=${{ github.event.pull_request.head.ref }} --adr.inference-src.env.CM_GIT_URL=${{ github.event.pull_request.head.repo.html_url }} --target_qps=1
cm run script --tags=run,mlperf,inference,generate-run-cmds --quiet --submitter="MLCommons" --hw_name=default --model=resnet50 --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=5 --adr.compiler.tags=gcc --adr.inference-src.version=custom --adr.inference-src.tags=_repo.${{ github.event.pull_request.head.repo.html_url }},_branch.${{ github.event.pull_request.head.ref }}
24 changes: 23 additions & 1 deletion README.md
@@ -13,7 +13,29 @@ Please see the [MLPerf Inference benchmark paper](https://arxiv.org/abs/1911.025
primaryClass={cs.LG}
}
```
## MLPerf Inference v4.0 (submission deadline February 23, 2024)
Please see [here](https://docs.mlcommons.org/inference/benchmarks/) for the MLPerf inference documentation website, which includes automated commands to run MLPerf inference benchmarks using different implementations.

## MLPerf Inference v4.1 (submission deadline July 26, 2024)

For submissions, please use the master branch and any commit since the [4.1 seed release](https://github.com/mlcommons/inference/pull/1736/files), although it is best to use the latest commit. The v4.1 tag will be created from the master branch after the results are published.

For power submissions, please use [SPEC PTD 1.10](https://github.com/mlcommons/power/tree/main/inference_v1.0) (needs special access) and any commit of the power-dev repository after the [code freeze](https://github.com/mlcommons/power-dev/pull/325).

| model | reference app | framework | dataset | category |
| ---- | ---- | ---- | ---- | ---- |
| resnet50-v1.5 | [vision/classification_and_detection](https://github.com/mlcommons/inference/tree/master/vision/classification_and_detection) | tensorflow, onnx, tvm, ncnn | imagenet2012 | edge,datacenter |
| retinanet 800x800 | [vision/classification_and_detection](https://github.com/mlcommons/inference/tree/master/vision/classification_and_detection) | pytorch, onnx | openimages resized to 800x800 | edge,datacenter |
| bert | [language/bert](https://github.com/mlcommons/inference/tree/master/language/bert) | tensorflow, pytorch, onnx | squad-1.1 | edge,datacenter |
| dlrm-v2 | [recommendation/dlrm_v2](https://github.com/mlcommons/inference/tree/master/recommendation/dlrm_v2/pytorch) | pytorch | Multihot Criteo Terabyte | datacenter |
| 3d-unet | [vision/medical_imaging/3d-unet-kits19](https://github.com/mlcommons/inference/tree/master/vision/medical_imaging/3d-unet-kits19) | pytorch, tensorflow, onnx | KiTS19 | edge,datacenter |
| gpt-j | [language/gpt-j](https://github.com/mlcommons/inference/tree/master/language/gpt-j) | pytorch | CNN-Daily Mail | edge,datacenter |
| stable-diffusion-xl | [text_to_image](https://github.com/mlcommons/inference/tree/master/text_to_image) | pytorch | COCO 2014 | edge,datacenter |
| llama2-70b | [language/llama2-70b](https://github.com/mlcommons/inference/tree/master/language/llama2-70b) | pytorch | OpenOrca | datacenter |
| mixtral-8x7b | [language/mixtral-8x7b](https://github.com/mlcommons/inference/tree/master/language/mixtral-8x7b) | pytorch | OpenOrca, MBXP, GSM8K | datacenter |

* Framework here is given for the reference implementation. Submitters are free to use their own frameworks to run the benchmark.
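As an illustrative sketch, a short reference run can be driven end to end through the CM automation. The command mirrors this commit's CI workflows, with the backend fixed to one example value (`onnxruntime`) instead of the CI matrix:

```bash
# Short ResNet50 reference run on CPU via cm4mlops
# (adapted from .github/workflows/test-resnet50.yml in this commit)
python3 -m pip install cm4mlops
cm run script --tags=run,mlperf,inference,generate-run-cmds,_submission,_short \
  --quiet --submitter="MLCommons" --hw_name=default --model=resnet50 \
  --implementation=reference --backend=onnxruntime --device=cpu \
  --scenario=Offline --test_query_count=500 --adr.compiler.tags=gcc
```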

## MLPerf Inference v4.0 (submission deadline February 23, 2024)

There is an extra one-week extension allowed only for the llama2-70b submissions. For submissions, please use the master branch and any commit since the [4.0 seed release](https://github.com/mlcommons/inference/commit/8e36925bd36a503e39fcbbc488e9e46126f079ed), although it is best to use the latest commit. The v4.0 tag will be created from the master branch after the results are published.

22 changes: 8 additions & 14 deletions Submission_Guidelines.md
@@ -18,40 +18,34 @@ The MLPerf inference submission rules are spread between the [MLCommons policies

### Is there an automatic way to run the MLPerf inference benchmarks?

MLPerf inference submissions are expected on different hardware and related software stacks. For this reason, only reference implementations are provided by MLCommons and they can guide submitters to make their own optimal implementations for their software/hardware stack. Also, all the previous implementations are made available in the MLCommons Inference results repositories and they can also guide submitters in doing their own implementations.

[The MLCommons taskforce on automation and reproducibility](https://github.com/mlcommons/ck/blob/master/docs/taskforce.md) has automated all the MLCommons inference tasks using the [MLCommons CM language](https://github.com/mlcommons/ck/blob/master/cm) and [this readme](https://github.com/mlcommons/ck/tree/master/docs/mlperf/inference) can guide you in running the reference implementations with very minimal effort. Currently, this automation supports MLCommons reference implementations, Nvidia implementations, and C++ implementations for Onnxruntime and TFLite. Feel free to join the [taskforce Discord channel](https://discord.gg/8jbEM4J6Ff) if you have any questions.

The previous MLPerf inference results are aggregated in [Collective Knowledge platform (MLCommons CK playground)](platform) as [reproducible experiments](https://access.cknowledge.org/playground/?action=experiments) and can be used by submitters to compare their results with the previous ones while adding various derived metrics (such as performance/watt) and constraints.
MLPerf inference submissions are expected to be run on various hardware and supported software stacks. Therefore, MLCommons provides only reference implementations to guide submitters in creating optimal implementations for their specific software and hardware configurations. Additionally, all implementations used for MLPerf inference submissions are available in the MLCommons [Inference results](https://github.com/orgs/mlcommons/repositories?q=inference_results_v+sort%3Aname) repositories (under the `closed/<submitter>/code` directory), offering further guidance for submitters developing their own implementations.
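For example, a previous round's implementations can be inspected locally. The clone command matches this commit's CI, and the directory layout follows the `closed/<submitter>/code` convention noted above:

```bash
# Fetch a previous round's submissions for reference (shallow clone)
git clone https://github.com/mlcommons/inference_results_v4.0 --depth 1
# List each submitter's implementation code
ls inference_results_v4.0/closed/*/code
```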

### Expected time to do benchmark runs
1. Closed submission under data enter needs offline and server scenario runs with a minimum of ten minutes needed for both.
1. Closed submission under datacenter needs offline and server scenario runs with a minimum of ten minutes needed for both.
2. Closed submission under the edge category needs single stream, multi-stream (only for R50 and retinanet), and offline scenarios. A minimum of ten minutes is needed for each scenario.
3. A further two (three for ResNet50) compliance runs are needed for the closed division, each taking at least 10 minutes per scenario.
4. SingleStream, MultiStream, and Server scenarios use early stopping and so can always finish in around 10 minutes.
5. Offline scenario needs a minimum of 24,756 input queries to be processed; this can take hours for low-throughput models such as 3d-unet and LLMs (see the back-of-envelope sketch after this list).
6. Open division has no accuracy constraints, no required compliance runs, and can be submitted for any single scenario. There is no constraint on the model used except that the model accuracy must be validated on the accuracy dataset used in the corresponding MLPerf inference task [or must be preapproved](https://github.com/mlcommons/inference_policies/blob/master/inference_rules.adoc#412-relaxed-constraints-for-the-open-division).
7. Power submission needs an extra ranging mode to determine the peak current usage and this often doubles the overall experiment run time.
7. Power submission needs an extra ranging mode to determine the peak current usage and this often doubles the overall experiment run time. If this overhead is too much, the ranging run can be reduced to a 5-minute run using mechanisms like [this](https://github.com/mlcommons/cm4mlops/blob/main/script/benchmark-program-mlperf/customize.py#L18).
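As a back-of-envelope sketch for item 5 (the 24,756 figure is the minimum Offline query count cited above; the QPS value is an arbitrary example):

```bash
# Minimum Offline wall-clock time at a given target QPS (illustrative)
python3 -c "q = 24756; qps = 0.5; print(f'{q/qps/3600:.1f} hours at {qps} QPS')"
```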


## Validity of the submission

1. [MLCommons Inference submission checker](https://github.com/mlcommons/inference/blob/master/tools/submission/submission_checker.py) is provided to ensure that all submissions pass the required checks.
2. In the unlikely event that there is an error on the submission checker for your submission, please raise a Github issue [here](https://github.com/mlcommons/inference/issues)
2. In the unlikely event that there is an error on the submission checker for your submission, please raise a GitHub issue [here](https://github.com/mlcommons/inference/issues).
3. Any submission passing the submission checker is valid to go to the review discussions, but submitters are still required to answer any queries and fix any issues reported by other submitters.
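For reference, a sketch of invoking the checker through the CM wrapper; this mirrors the CI command in this commit, with the results directory as a placeholder:

```bash
# Run the submission checker over a local results tree
cm run script --tags=run,mlperf,inference,submission,checker \
  --input=`pwd`/inference_results_v4.0 --src_version=v4.0 --quiet
```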

### Reviewing other submissions
1. Ensure that the `system_desc_id.json` file is having meaningful responses - submission_checker only checks for the existence of the fields.
1. Ensure that the `system_desc_id.json` file has meaningful responses - `submission_checker` only checks for the existence of the fields.
2. For power submissions, `power settings` and `analyzer table` files are to be submitted, and even though the submission checker checks for the existence of these files, the content of [these files](https://github.com/mlcommons/inference_policies/blob/master/power_measurement.adoc#64-power-management-settings) must be checked manually for validity.
3. README files in the submission directory must be checked to make sure that the instructions are reproducible.
4. For closed datacenter submissions, [ECC RAM and Networking requirements](https://github.com/mlcommons/inference_policies/blob/master/inference_rules.adoc#constraints-for-the-closed-division) must be satisfied.
5. The submission checker might report warnings, and some of these warnings can warrant an answer from the submitter.

## Changes from MLCommons Inference 3.0
## Changes from MLCommons Inference 4.0

1. Two new benchmarks GPT-J and GPT-3 and DLRMv2 replacing DLRM
2. Submission checker is now checking for non-empty README files and mandatory system description and power-related fields
3. New script is provided which can be used to infer scenario results and low-accuracy results from a high-accuracy result
4. `min_query_count` is removed for all scenarios except offline due to early stopping. SingleStream now needs a minimum of 64 queries and MultiStream needs 662 queries as mandated by the early stopping criteria.
1. One new benchmark in the datacenter category: Mixtral-8x7B. No changes in the edge category.
2. For power submissions, there is no code change.


7 changes: 4 additions & 3 deletions compliance/nvidia/README.md
@@ -10,7 +10,7 @@ This repository provides the compliance tests that need to be run by the submitt
## Introduction
The purpose of compliance testing is to ensure a basic level of compliance with a subset of the MLPerf rules. The tests are designed to be complementary to third-party auditing which will be introduced in future rounds of MLPerf. The tests are not meant to root-cause issues with the submission, but can help detect anomalies in the submission that need to be investigated further by the submitter.

Each compliance test must be run once for each submission run and the logs from the compliance test run must be uploaded along with the rest of the submission collateral. In MLPerf Inference v0.7, effort has been made to reduce the burden on submitters to perform compliance testing through improvements to documentation, scripting, and LoadGen's compliance functionality. More documentation is provided on the purpose of each test in the corresponding test directory, along with more detailed instructions.

## Test Infrastructure
The compliance tests exercise functionality in LoadGen, triggered through the use of a config file that overrides LoadGen functionality. This enables LoadGen to run in a variety of compliance testing modes. When LoadGen::StartTest() is invoked, LoadGen checks if an `audit.config` file exists in the current working directory. If the file is found, LoadGen will log this event in `mlperf_log_detail.txt`. The LoadGen settings that are used will be logged in `mlperf_log_summary.txt`. The configuration parameters in `audit.config` override any settings set by `mlperf.conf` or `user.conf`.
@@ -37,5 +37,6 @@ The `run_verification.py` found in each test directory will copy the test files
| 3d-unet | [TEST01](./TEST01/), [TEST05](./TEST05/) |
| rnnt | [TEST01](./TEST01/), [TEST05](./TEST05/) |
| gpt-j | - |
| stable-diffusion-xl | - |
| Llama2-70b | [TEST06]() |
| stable-diffusion-xl | [TEST01](./TEST01/), [TEST04](./TEST04/) |
| Llama2-70b | [TEST06](./TEST06/) |
| mixtral-8x7b | [TEST06](./TEST06/) |
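For context, a hedged sketch of invoking a test's verification helper; the flag names are assumptions modeled on the TEST01 helper, so check each test's README for the exact interface:

```bash
# Hypothetical invocation of a compliance test's verification script
# (assumed flags: -r results dir, -c compliance-run dir, -o output dir)
python3 TEST01/run_verification.py -r RESULTS_DIR -c COMPLIANCE_DIR -o OUTPUT_DIR
```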
