Merged
18 changes: 18 additions & 0 deletions examples/offline_inference/data_parallel.py
@@ -64,6 +64,18 @@ def parse_args():
parser.add_argument(
"--trust-remote-code", action="store_true", help="Trust remote code."
)
parser.add_argument(
"--max-num-seqs",
type=int,
default=64,
help=("Maximum number of sequences to be processed in a single iteration."),
)
Comment on lines +67 to +72

Member:
Are both of these args required to avoid the OOM? 64 is quite small for batch mode; it would be good if we could fix this just with the gpu_memory_utilization reduction...

Collaborator:
Yeah, it also seems that we need much more memory during initialization than before. I was about to investigate this further but didn't get time to do so. Wondering if @yewentao256 could dig further into this?

Member Author:
Yeah, I am happy to dig further, but what is the expected result here? To reduce the memory usage? I am afraid it is something of a tradeoff between speed and memory efficiency.
Basically, the original cause of this OOM issue is #18724, which I think is reasonable to adopt. @houseroad
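
To make the tradeoff under discussion concrete, here is a minimal single-GPU sketch, assuming vLLM's offline LLM API; the model name and the values are illustrative and not taken from this PR.

    from vllm import LLM, SamplingParams

    # First try the reviewer's suggestion: lower only gpu_memory_utilization
    # and leave max_num_seqs at the engine default.
    llm = LLM(
        model="ibm-research/PowerMoE-3b",  # illustrative model choice
        gpu_memory_utilization=0.8,  # cap vLLM's memory pool at 80% of the GPU
        enforce_eager=True,  # skip CUDA graph capture, saving memory at init
    )

    # If that still OOMs, additionally cap the per-step batch; this trades
    # throughput for memory headroom, which is the tradeoff mentioned above:
    # llm = LLM(model=..., gpu_memory_utilization=0.8, max_num_seqs=64)

    outputs = llm.generate(
        ["Hello, my name is"],
        SamplingParams(temperature=0.8, max_tokens=16),
    )
    print(outputs[0].outputs[0].text)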

parser.add_argument(
"--gpu-memory-utilization",
type=float,
default=0.8,
help=("Fraction of GPU memory vLLM is allowed to allocate (0.0, 1.0]."),
)
return parser.parse_args()


@@ -77,6 +89,8 @@ def main(
GPUs_per_dp_rank,
enforce_eager,
trust_remote_code,
max_num_seqs,
gpu_memory_utilization,
):
os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank)
@@ -127,6 +141,8 @@ def start(rank):
enforce_eager=enforce_eager,
enable_expert_parallel=True,
trust_remote_code=trust_remote_code,
max_num_seqs=max_num_seqs,
gpu_memory_utilization=gpu_memory_utilization,
)
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
@@ -181,6 +197,8 @@ def start(rank):
tp_size,
args.enforce_eager,
args.trust_remote_code,
args.max_num_seqs,
args.gpu_memory_utilization,
),
)
proc.start()
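
For completeness, a hedged usage sketch that exercises only the two flags added by this diff; any other flags the script requires are omitted, since they do not appear in the hunks above.

    import subprocess

    # Run the patched example with the two new knobs; both flag names and
    # default values come directly from this diff.
    subprocess.run(
        [
            "python",
            "examples/offline_inference/data_parallel.py",
            "--max-num-seqs", "64",
            "--gpu-memory-utilization", "0.8",
        ],
        check=True,
    )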