@@ -64,6 +64,18 @@ def parse_args():
6464 parser .add_argument (
6565 "--trust-remote-code" , action = "store_true" , help = "Trust remote code."
6666 )
67+ parser .add_argument (
68+ "--max-num-seqs" ,
69+ type = int ,
70+ default = 64 ,
71+ help = ("Maximum number of sequences to be processed in a single iteration." ),
72+ )
73+ parser .add_argument (
74+ "--gpu-memory-utilization" ,
75+ type = float ,
76+ default = 0.8 ,
77+ help = ("Fraction of GPU memory vLLM is allowed to allocate (0.0, 1.0]." ),
78+ )
6779 return parser .parse_args ()
6880
6981
@@ -77,6 +89,8 @@ def main(
7789 GPUs_per_dp_rank ,
7890 enforce_eager ,
7991 trust_remote_code ,
92+ max_num_seqs ,
93+ gpu_memory_utilization ,
8094):
8195 os .environ ["VLLM_DP_RANK" ] = str (global_dp_rank )
8296 os .environ ["VLLM_DP_RANK_LOCAL" ] = str (local_dp_rank )
@@ -127,6 +141,8 @@ def start(rank):
127141 enforce_eager = enforce_eager ,
128142 enable_expert_parallel = True ,
129143 trust_remote_code = trust_remote_code ,
144+ max_num_seqs = max_num_seqs ,
145+ gpu_memory_utilization = gpu_memory_utilization ,
130146 )
131147 outputs = llm .generate (prompts , sampling_params )
132148 # Print the outputs.
@@ -181,6 +197,8 @@ def start(rank):
181197 tp_size ,
182198 args .enforce_eager ,
183199 args .trust_remote_code ,
200+ args .max_num_seqs ,
201+ args .gpu_memory_utilization ,
184202 ),
185203 )
186204 proc .start ()