From 5abe79362a11347cb80b0d752dd3bf1f1468e6c6 Mon Sep 17 00:00:00 2001
From: yewentao256
Date: Sat, 28 Jun 2025 00:19:52 +0000
Subject: [PATCH 1/2] fix ci issue distributed 4 gpu test

Signed-off-by: yewentao256
---
 examples/offline_inference/data_parallel.py | 25 +++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py
index 3eccb4e11ab6..b56ac41bc397 100644
--- a/examples/offline_inference/data_parallel.py
+++ b/examples/offline_inference/data_parallel.py
@@ -64,6 +64,25 @@ def parse_args():
     parser.add_argument(
         "--trust-remote-code", action="store_true", help="Trust remote code."
     )
+    parser.add_argument(
+        "--max-num-seqs",
+        type=int,
+        default=64,
+        help=(
+            "Maximum number of sequences used during engine warm-up. "
+            "Lowering this value can substantially reduce peak memory "
+            "consumption and help avoid CUDA OOM errors."
+        ),
+    )
+    parser.add_argument(
+        "--gpu-memory-utilization",
+        type=float,
+        default=0.8,
+        help=(
+            "Fraction of GPU memory vLLM is allowed to allocate (0-1). "
+            "Setting a smaller value leaves more free memory headroom."
+        ),
+    )
     return parser.parse_args()
 
 
@@ -77,6 +96,8 @@ def main(
     GPUs_per_dp_rank,
     enforce_eager,
     trust_remote_code,
+    max_num_seqs,
+    gpu_memory_utilization,
 ):
     os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
     os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank)
@@ -127,6 +148,8 @@ def start(rank):
         enforce_eager=enforce_eager,
         enable_expert_parallel=True,
         trust_remote_code=trust_remote_code,
+        max_num_seqs=max_num_seqs,
+        gpu_memory_utilization=gpu_memory_utilization,
     )
     outputs = llm.generate(prompts, sampling_params)
     # Print the outputs.
@@ -181,6 +204,8 @@ def start(rank):
                 tp_size,
                 args.enforce_eager,
                 args.trust_remote_code,
+                args.max_num_seqs,
+                args.gpu_memory_utilization,
             ),
         )
         proc.start()

From 0bc44e79e3753eb9680a16fb07a1ccfe6423dd3b Mon Sep 17 00:00:00 2001
From: yewentao256
Date: Sat, 28 Jun 2025 00:31:00 +0000
Subject: [PATCH 2/2] fix doc

Signed-off-by: yewentao256
---
 examples/offline_inference/data_parallel.py | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py
index b56ac41bc397..dbf8ed58cc47 100644
--- a/examples/offline_inference/data_parallel.py
+++ b/examples/offline_inference/data_parallel.py
@@ -68,20 +68,13 @@ def parse_args():
         "--max-num-seqs",
         type=int,
         default=64,
-        help=(
-            "Maximum number of sequences used during engine warm-up. "
-            "Lowering this value can substantially reduce peak memory "
-            "consumption and help avoid CUDA OOM errors."
-        ),
+        help=("Maximum number of sequences to be processed in a single iteration."),
     )
     parser.add_argument(
         "--gpu-memory-utilization",
         type=float,
         default=0.8,
-        help=(
-            "Fraction of GPU memory vLLM is allowed to allocate (0-1). "
-            "Setting a smaller value leaves more free memory headroom."
-        ),
+        help=("Fraction of GPU memory vLLM is allowed to allocate (0.0, 1.0]."),
    )
     return parser.parse_args()
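
Note for reviewers: the two new flags map one-to-one onto keyword arguments of
vLLM's LLM constructor, which the patch threads from parse_args() through
main() into LLM(...). The snippet below is a minimal standalone sketch of what
each data-parallel rank ends up constructing; it is not part of the patch, and
the model name and prompt are illustrative assumptions, not taken from the diff.

    # Minimal sketch: how --max-num-seqs and --gpu-memory-utilization reach vLLM.
    # Model name and prompt are hypothetical; substitute your own.
    from vllm import LLM, SamplingParams

    llm = LLM(
        model="ibm-research/PowerMoE-3b",  # assumed example model, not from the diff
        max_num_seqs=64,                   # cap on sequences scheduled per engine step
        gpu_memory_utilization=0.8,        # fraction of GPU memory vLLM may reserve
    )
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=16)
    outputs = llm.generate(["Hello, my name is"], sampling_params)
    for output in outputs:
        print(output.outputs[0].text)

Lowering max_num_seqs shrinks the largest batch the scheduler will ever form
(and hence the peak memory probed during engine warm-up), while
gpu_memory_utilization bounds the fraction of device memory vLLM pre-allocates
for weights and KV cache, so the defaults of 64 and 0.8 trade some throughput
for the headroom needed on the 4-GPU CI machines.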