From 00ad41466c914021de07cf3a6ff22ea545e6e7e2 Mon Sep 17 00:00:00 2001
From: DearPlanet <junsong.zhang2021.work@outlook.com>
Date: Wed, 19 Jun 2024 18:19:08 +0800
Subject: [PATCH] [Misc] Add param max-model-len in benchmark_latency.py (#5629)

---
 benchmarks/benchmark_latency.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 98e0be2779922..e9d1048c89b64 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -29,6 +29,7 @@ def main(args: argparse.Namespace):
         tensor_parallel_size=args.tensor_parallel_size,
         trust_remote_code=args.trust_remote_code,
         dtype=args.dtype,
+        max_model_len=args.max_model_len,
         enforce_eager=args.enforce_eager,
         kv_cache_dtype=args.kv_cache_dtype,
         quantization_param_path=args.quantization_param_path,
@@ -150,6 +151,12 @@ def run_to_completion(profile_dir: Optional[str] = None):
     parser.add_argument('--trust-remote-code',
                         action='store_true',
                         help='trust remote code from huggingface')
+    parser.add_argument(
+        '--max-model-len',
+        type=int,
+        default=None,
+        help='Maximum length of a sequence (including prompt and output). '
+        'If None, will be derived from the model.')
     parser.add_argument(
         '--dtype',
         type=str,