Unify llama3 names to llama3.1-405b (#1982)
* Unify llama3 names to llama3.1-405b

* Set mlperf.conf name to llama3_1-405b
pgmpablo157321 authored Dec 19, 2024
1 parent e6069aa commit 00945c3
Showing 17 changed files with 42 additions and 41 deletions.
@@ -44,7 +44,7 @@ WORKDIR /tmp
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-Linux-x86_64.sh \
&& bash Miniconda3-* -b -p /opt/miniconda3
ENV PATH="$PATH:/opt/miniconda3/bin"
RUN conda create -n llama3-405b python=3.10
RUN conda create -n llama3.1-405b python=3.10
RUN chmod -R 777 /opt/miniconda3

# Set the env variable for vLLM
20 changes: 10 additions & 10 deletions language/llama3-405b/README.md → language/llama3.1-405b/README.md
@@ -1,13 +1,13 @@
# Reference Implementation for llama3-405b
# Reference Implementation for llama3.1-405b

**Basic implementation for llama3-405b. A few noteworthy items:**
**Basic implementation for llama3.1-405b. A few noteworthy items:**

+ The streamer used to communicate with loadgen adds noticeable overhead; this implementation is only meant to be functional, not optimized for performance.
+ Custom/optimized implementations of this benchmark must include the following (a minimal sketch follows this list):
  - For the Server scenario, call `lg.FirstTokenComplete(response)` for each query so that the first token is reported and its latency is measured.
  - For all scenarios, every element passed to `lg.QuerySamplesComplete(response)` must be a `lg.QuerySampleResponse` that contains the number of tokens (it can be created as `lg.QuerySampleResponse(qitem.id, bi[0], bi[1], n_tokens)`). The reported number of tokens must match the number of tokens in your answer; this is checked in [TEST06](../../compliance/nvidia/TEST06/).
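
Below is a minimal sketch (not part of the reference code) of how a custom backend might satisfy both requirements. It assumes a Python SUT built on the `mlperf_loadgen` bindings; `generate_first_token` and `generate_remaining_tokens` are hypothetical placeholders for your inference engine.

```python
import array

import numpy as np
import mlperf_loadgen as lg


def issue_queries(query_samples):
    # Callback registered via lg.ConstructSUT(issue_queries, flush_queries).
    for qitem in query_samples:
        # Server scenario: report the first token as soon as it is available
        # so LoadGen can measure time-to-first-token.
        first_token = generate_first_token(qitem.index)  # hypothetical helper
        first_buf = array.array("B", np.array([first_token], np.int32).tobytes())
        bi = first_buf.buffer_info()
        lg.FirstTokenComplete([lg.QuerySampleResponse(qitem.id, bi[0], bi[1])])

        # All scenarios: report the full output together with its token count,
        # which TEST06 later compares against the accuracy log.
        output_tokens = generate_remaining_tokens(qitem.index)  # hypothetical helper
        out_buf = array.array("B", np.array(output_tokens, np.int32).tobytes())
        bi = out_buf.buffer_info()
        lg.QuerySamplesComplete(
            [lg.QuerySampleResponse(qitem.id, bi[0], bi[1], len(output_tokens))]
        )
```

The int32 packing mirrors the other LLM reference implementations; the hard requirements are only the `FirstTokenComplete` call in the Server scenario and the token count in each `QuerySampleResponse`.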

Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/language/llama3-405b) for an automated way to run this benchmark across different available implementations and do an end-to-end submission with or without docker.
Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/language/llama3.1-405b) for an automated way to run this benchmark across different available implementations and do an end-to-end submission with or without docker.


## Prepare environment
@@ -33,9 +33,9 @@ rm ~/miniconda3/miniconda.sh
- Set the following helper variables
```bash
export ROOT=$PWD/inference
export LLAMA_FOLDER=$PWD/inference/language/llama3-405b
export LLAMA_FOLDER=$PWD/inference/language/llama3.1-405b
export LOADGEN_FOLDER=$PWD/inference/loadgen
export DATASET_FOLDER=$PWD/inference/language/llama3-405b/dataset
export DATASET_FOLDER=$PWD/inference/language/llama3.1-405b/dataset
```

- Clone the inference repository:
@@ -46,8 +46,8 @@ git clone --recurse-submodules https://github.com/mlcommons/inference.git \

- Create a conda environment:
```bash
conda create -y -n llama3-405b python=3.10
conda activate llama3-405b
conda create -y -n llama3.1-405b python=3.10
conda activate llama3.1-405b
conda install -y -c conda-forge libstdcxx-ng=12
```

@@ -100,7 +100,7 @@ TODO: Host model and grant access to submitters


### External Download
+ First, go to [llama3-request-link](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and make a request, then sign in to HuggingFace (if you don't have an account, you'll need to create one). **Please note your authentication credentials**, as you may be required to provide them when cloning below.
+ First, go to [llama3.1-request-link](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and make a request, then sign in to HuggingFace (if you don't have an account, you'll need to create one). **Please note your authentication credentials**, as you may be required to provide them when cloning below.
+ Requires Git Large Files Storage
```
export CHECKPOINT_PATH=Meta-Llama-3.1-405B-Instruct
@@ -127,13 +127,13 @@ rclone config create mlc-inference s3 provider=Cloudflare access_key_id=f65ba5ee
You can then navigate in the terminal to your desired download directory and run the following command to download the dataset:

```
rclone copy mlc-inference:mlcommons-inference-wg-public/llama3_405b/mlperf_llama3.1_405b_dataset_8313_processed_fp16_eval.pkl ./ -P
rclone copy mlc-inference:mlcommons-inference-wg-public/llama3.1_405b/mlperf_llama3.1_405b_dataset_8313_processed_fp16_eval.pkl ./ -P
```

You can also download the calibration dataset from the Cloudflare R2 bucket by running the following command:

```
rclone copy mlc-inference:mlcommons-inference-wg-public/llama3_405b/mlperf_llama3.1_405b_calibration_dataset_512_processed_fp16_eval.pkl ./ -P
rclone copy mlc-inference:mlcommons-inference-wg-public/llama3.1_405b/mlperf_llama3.1_405b_calibration_dataset_512_processed_fp16_eval.pkl ./ -P
```

## Run Performance Benchmarks
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -15,7 +15,7 @@ def get_args():
parser.add_argument(
"--checkpoint-path",
default="meta-llama/Meta-Llama-3-8B",
help="Path to Llama3-405b-hf-chat checkpoint"
help="Path to Llama3.1-405b-hf-chat checkpoint"
)
parser.add_argument(
"--mlperf-accuracy-file", required=True, help="path to mlperf_log_accuracy.json"
File renamed without changes.
@@ -136,8 +136,8 @@ def main():
settings = lg.TestSettings()
settings.scenario = scenario_map[args.scenario.lower()]
# mlperf.conf is automatically loaded by the loadgen
# settings.FromConfig(args.mlperf_conf, "llama3-405b", args.scenario)
settings.FromConfig(args.user_conf, "llama3-405b", args.scenario)
# settings.FromConfig(args.mlperf_conf, "llama3_1-405b", args.scenario)
settings.FromConfig(args.user_conf, "llama3_1-405b", args.scenario)

if args.accuracy:
settings.mode = lg.TestMode.AccuracyOnly
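
One detail worth noting from this hunk: the loadgen config key keeps the underscore form (`llama3_1-405b`, as the commit message says), while directories, docs, and the submission checker use the dotted `llama3.1-405b`, and the name passed to `FromConfig` must match the config key. A minimal sketch, assuming the `mlperf_loadgen` Python bindings and a local `user.conf`:

```python
import mlperf_loadgen as lg

settings = lg.TestSettings()
settings.scenario = lg.TestScenario.Server
# The second argument must match the underscore form used in mlperf.conf and
# user.conf ("llama3_1-405b"), not the dotted directory name "llama3.1-405b".
settings.FromConfig("user.conf", "llama3_1-405b", "Server")
```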
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -10,4 +10,4 @@
*.Server.min_duration = 120000
*.Server.min_query_count = 100

llama3-405b.Server.sample_concatenate_permutation = 1
llama3_1-405b.Server.sample_concatenate_permutation = 1
File renamed without changes.
16 changes: 8 additions & 8 deletions loadgen/mlperf.conf
@@ -14,7 +14,7 @@ dlrm-v2.*.performance_sample_count_override = 204800
rnnt.*.performance_sample_count_override = 2513
gptj.*.performance_sample_count_override = 13368
llama2-70b.*.performance_sample_count_override = 24576
llama3-405b.*.performance_sample_count_override = 8313
llama3_1-405b.*.performance_sample_count_override = 8313
stable-diffusion-xl.*.performance_sample_count_override = 5000
rgat.*.performance_sample_count_override = 788379
# set to 0 to let entire sample set to be performance sample
@@ -49,7 +49,7 @@ rgat.*.sample_concatenate_permutation = 1
gptj.*.sample_concatenate_permutation = 1
llama2-70b.*.sample_concatenate_permutation = 1
mixtral-8x7b.*.sample_concatenate_permutation = 1
llama3-405b.*.sample_concatenate_permutation = 1
llama3_1-405b.*.sample_concatenate_permutation = 1

*.Server.target_latency = 10
*.Server.target_latency_percentile = 99
@@ -66,11 +66,11 @@ stable-diffusion-xl.Server.target_latency = 20000
# Benchmarks that measure token latencies
llama2-70b.*.use_token_latencies = 1
mixtral-8x7b.*.use_token_latencies = 1
llama3-405b.*.use_token_latencies = 1
llama3_1-405b.*.use_token_latencies = 1
# gptj benchmark infers token latencies
gptj.*.infer_token_latencies = 1
gptj.*.token_latency_scaling_factor = 69
# Only ttft and tpot are tracked for the llama2-70b, mixtral-8x7B & llama3-405b benchmark therefore target_latency = 0
# Only ttft and tpot are tracked for the llama2-70b, mixtral-8x7B & llama3_1-405b benchmark therefore target_latency = 0
llama2-70b.Server.target_latency = 0
llama2-70b.Server.ttft_latency = 2000
llama2-70b.Server.tpot_latency = 200
@@ -79,9 +79,9 @@ mixtral-8x7b.Server.target_latency = 0
mixtral-8x7b.Server.ttft_latency = 2000
mixtral-8x7b.Server.tpot_latency = 200

llama3-405b.Server.target_latency = 0
llama3-405b.Server.ttft_latency = 6000
llama3-405b.Server.tpot_latency = 175
llama3_1-405b.Server.target_latency = 0
llama3_1-405b.Server.ttft_latency = 6000
llama3_1-405b.Server.tpot_latency = 175

*.Offline.target_latency_percentile = 90
*.Offline.min_duration = 600000
@@ -100,7 +100,7 @@ rnnt.Offline.min_query_count = 2513
3d-unet.Offline.min_query_count = 43
stable-diffusion-xl.Offline.min_query_count = 5000
llama2-70b.Offline.min_query_count = 24576
llama3-405b.Offline.min_query_count = 8313
llama3_1-405b.Offline.min_query_count = 8313
mixtral-8x7b.Offline.min_query_count = 15000
rgat.Offline.min_query_count = 788379

2 changes: 1 addition & 1 deletion tools/submission/generate_final_report.py
@@ -211,7 +211,7 @@ def main():
"llama2-70b-99.9": ["Server", "Offline"],
"mixtral-8x7b": ["Server", "Offline"],
"rgat": ["Offline"],
"llama3-405b": ["Offline", "Server"]
"llama3.1-405b": ["Offline", "Server"]
},
"edge": {
"resnet": ["SingleStream", "MultiStream", "Offline"],
35 changes: 18 additions & 17 deletions tools/submission/submission_checker.py
@@ -194,6 +194,7 @@
"ssd-resnet34": "retinanet",
"mobilenet": "resnet",
"resnet50": "resnet",
"llama3_1-405b": "llama3.1-405b"
},
"seeds": {
"qsl_rng_seed": 3066443479025735752,
@@ -267,7 +268,7 @@
"llama2-70b-99.9",
"stable-diffusion-xl",
"mixtral-8x7b",
"llama3-405b",
"llama3.1-405b",
"rgat",
# TODO: add automotive?
],
@@ -284,7 +285,7 @@
"llama2-70b-99.9": ["Server", "Offline"],
"stable-diffusion-xl": ["Server", "Offline"],
"mixtral-8x7b": ["Server", "Offline"],
"llama3-405b": ["Server", "Offline"],
"llama3.1-405b": ["Server", "Offline"],
"rgat": ["Offline"],
},
"optional-scenarios-datacenter": {},
@@ -315,7 +316,7 @@
"llama2-70b-99.9": ["Server", "Offline"],
"stable-diffusion-xl": ["SingleStream", "Offline", "Server"],
"mixtral-8x7b": ["Server", "Offline"],
"llama3-405b": ["Server", "Offline"],
"llama3.1-405b": ["Server", "Offline"],
"rgat": ["Offline"],
},
"optional-scenarios-datacenter-edge": {},
@@ -389,7 +390,7 @@
"mbxp_accuracy",
60.12 * 0.99,
),
"llama3-405b": (
"llama3.1-405b": (
"ROUGEL",
21.6666 * 0.99,
"exact_match",
@@ -409,7 +410,7 @@
"llama2-70b-99": ("TOKENS_PER_SAMPLE", 294.45 * 1.1),
"llama2-70b-99.9": ("TOKENS_PER_SAMPLE", 294.45 * 1.1),
"mixtral-8x7b": ("TOKENS_PER_SAMPLE", 145.9 * 1.1),
"llama3-405b": ("TOKENS_PER_SAMPLE", 684.68 * 1.1),
"llama3.1-405b": ("TOKENS_PER_SAMPLE", 684.68 * 1.1),
},
"accuracy-delta-perc": {
"stable-diffusion-xl": {"CLIP_SCORE": 1, "FID_SCORE": 2}
@@ -429,7 +430,7 @@
"llama2-70b-99.9": 24576,
"stable-diffusion-xl": 5000,
"mixtral-8x7b": 15000,
"llama3-405b": 8313,
"llama3.1-405b": 8313,
"rgat": 788379

},
@@ -459,7 +460,7 @@
"llama2-70b-99": {"Server": 20000000000},
"llama2-70b-99.9": {"Server": 20000000000},
"mixtral-8x7b": {"Server": 20000000000},
"llama3-405b": {"Server": 60000000000}
"llama3.1-405b": {"Server": 60000000000}
},
"min-queries": {
"resnet": {
@@ -490,7 +491,7 @@
"Offline": 1,
},
"mixtral-8x7b": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
"llama3-405b": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
"llama3.1-405b": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
"rgat": {"SingleStream": 1024, "Offline": 1}
},
},
@@ -579,7 +580,7 @@
"llama2-70b-99.9": 24576,
"stable-diffusion-xl": 5000,
"mixtral-8x7b": 15000,
"llama3-405b": 8313,
"llama3.1-405b": 8313,
"rgat": 788379,
}

@@ -656,7 +657,7 @@
"Offline": "result_tokens_per_second",
"Server": "result_completed_tokens_per_second",
},
"llama3-405b": {
"llama3.1-405b": {
"Offline": "result_tokens_per_second",
"Server": "result_completed_tokens_per_second",
},
@@ -671,7 +672,7 @@
"conversational": {"ttft": 2000 * 1000000, "tpot": 200 * 1000000}
},
"mixtral-8x7b": {"conversational": {"ttft": 2000 * 1000000, "tpot": 200 * 1000000}},
"llama3-405b": {
"llama3.1-405b": {
"conversational": {"ttft": 6000 * 1000000, "tpot": 175 * 1000000}
},
}
@@ -956,7 +957,7 @@ def requires_equal_issue(self, model, division):
"llama2-70b-99",
"llama2-70b-99.9",
"mixtral-8x7b",
"llama3-405b",
"llama3.1-405b",
"rgat",
]
and self.version not in ["v4.0", "v4.1"]
@@ -1325,7 +1326,7 @@ def check_performance_dir(
)

if model in ["llama2-70b-99", "llama2-70b-99.9",
"mixtral-8x7b", "llama3-405b"]:
"mixtral-8x7b", "llama3.1-405b"]:
llama_constraint, is_valid = extra_check_llm(
mlperf_log, scenario_fixed, model)

@@ -1865,7 +1866,7 @@ def log_result(
"Offline": "Tokens/s",
"Server": "Tokens/s",
},
"llama3-405b": {
"llama3.1-405b": {
"SingleStream": "Latency (ms)",
"MultiStream": "Latency (ms)",
"Offline": "Tokens/s",
@@ -2950,7 +2951,7 @@ def check_compliance_dir(
"llama2-70b-99",
"llama2-70b-99.9",
"mixtral-8x7b",
"llama3-405b",
"llama3.1-405b",
"rgat",
]:
test_list.remove("TEST04")
@@ -2971,7 +2972,7 @@
"llama2-70b-99",
"llama2-70b-99.9",
"mixtral-8x7b",
"llama3-405b",
"llama3.1-405b",
]:
test_list.remove("TEST01")

@@ -2980,7 +2981,7 @@
test_list.remove("TEST04")

if model in ["llama2-70b-99", "llama2-70b-99.9",
"mixtral-8x7b", "llama3-405b"]:
"mixtral-8x7b", "llama3.1-405b"]:
test_list.append("TEST06")

if test_list and not os.path.exists(compliance_dir):
