From 5abe79362a11347cb80b0d752dd3bf1f1468e6c6 Mon Sep 17 00:00:00 2001
From: yewentao256
Date: Sat, 28 Jun 2025 00:19:52 +0000
Subject: [PATCH 1/2] fix ci issue distributed 4 gpu test

Signed-off-by: yewentao256
---
 examples/offline_inference/data_parallel.py | 25 +++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py
index 3eccb4e11ab6..b56ac41bc397 100644
--- a/examples/offline_inference/data_parallel.py
+++ b/examples/offline_inference/data_parallel.py
@@ -64,6 +64,25 @@ def parse_args():
     parser.add_argument(
         "--trust-remote-code", action="store_true", help="Trust remote code."
     )
+    parser.add_argument(
+        "--max-num-seqs",
+        type=int,
+        default=64,
+        help=(
+            "Maximum number of sequences used during engine warm-up. "
+            "Lowering this value can substantially reduce peak memory "
+            "consumption and help avoid CUDA OOM errors."
+        ),
+    )
+    parser.add_argument(
+        "--gpu-memory-utilization",
+        type=float,
+        default=0.8,
+        help=(
+            "Fraction of GPU memory vLLM is allowed to allocate (0-1). "
+            "Setting a smaller value leaves more free memory headroom."
+        ),
+    )
     return parser.parse_args()
 
 
@@ -77,6 +96,8 @@ def main(
     GPUs_per_dp_rank,
     enforce_eager,
     trust_remote_code,
+    max_num_seqs,
+    gpu_memory_utilization,
 ):
     os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
     os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank)
@@ -127,6 +148,8 @@ def start(rank):
         enforce_eager=enforce_eager,
         enable_expert_parallel=True,
         trust_remote_code=trust_remote_code,
+        max_num_seqs=max_num_seqs,
+        gpu_memory_utilization=gpu_memory_utilization,
     )
     outputs = llm.generate(prompts, sampling_params)
     # Print the outputs.
@@ -181,6 +204,8 @@ def start(rank):
                 tp_size,
                 args.enforce_eager,
                 args.trust_remote_code,
+                args.max_num_seqs,
+                args.gpu_memory_utilization,
             ),
         )
         proc.start()

From 0bc44e79e3753eb9680a16fb07a1ccfe6423dd3b Mon Sep 17 00:00:00 2001
From: yewentao256
Date: Sat, 28 Jun 2025 00:31:00 +0000
Subject: [PATCH 2/2] fix doc

Signed-off-by: yewentao256
---
 examples/offline_inference/data_parallel.py | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py
index b56ac41bc397..dbf8ed58cc47 100644
--- a/examples/offline_inference/data_parallel.py
+++ b/examples/offline_inference/data_parallel.py
@@ -68,20 +68,13 @@ def parse_args():
         "--max-num-seqs",
         type=int,
         default=64,
-        help=(
-            "Maximum number of sequences used during engine warm-up. "
-            "Lowering this value can substantially reduce peak memory "
-            "consumption and help avoid CUDA OOM errors."
-        ),
+        help=("Maximum number of sequences to be processed in a single iteration."),
     )
     parser.add_argument(
         "--gpu-memory-utilization",
         type=float,
         default=0.8,
-        help=(
-            "Fraction of GPU memory vLLM is allowed to allocate (0-1). "
-            "Setting a smaller value leaves more free memory headroom."
-        ),
+        help=("Fraction of GPU memory vLLM is allowed to allocate (0.0, 1.0]."),
    )
     return parser.parse_args()
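
Note for reviewers: the two new flags map one-to-one onto keyword arguments of
vLLM's LLM constructor, which the patch threads from parse_args() through
main() into LLM(...). The snippet below is a minimal standalone sketch of what
each data-parallel rank ends up constructing; it is not part of the patch, and
the model name and prompt are illustrative assumptions, not taken from the diff.

    # Minimal sketch: how --max-num-seqs and --gpu-memory-utilization reach vLLM.
    # Model name and prompt are hypothetical; substitute your own.
    from vllm import LLM, SamplingParams

    llm = LLM(
        model="ibm-research/PowerMoE-3b",  # assumed example model, not from the diff
        max_num_seqs=64,                   # cap on sequences scheduled per engine step
        gpu_memory_utilization=0.8,        # fraction of GPU memory vLLM may reserve
    )
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=16)
    outputs = llm.generate(["Hello, my name is"], sampling_params)
    for output in outputs:
        print(output.outputs[0].text)

Lowering max_num_seqs shrinks the largest batch the scheduler will ever form
(and hence the peak memory probed during engine warm-up), while
gpu_memory_utilization bounds the fraction of device memory vLLM pre-allocates
for weights and KV cache, so the defaults of 64 and 0.8 trade some throughput
for the headroom needed on the 4-GPU CI machines.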