@@ -266,24 +266,24 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None:
         if structured_outputs_config.reasoning_parser == "":
             structured_outputs_config.reasoning_parser = "openai_gptoss"
 
-        # Increase the max capture size from 512 to 1024 for performance.
+        # Increase the max capture size from 512 to 992 for performance.
         # NOTE(woosuk): This will increase the number of CUDA graphs
-        # from 67 to 83.
+        # from 67 to 81.
         scheduler_config = vllm_config.scheduler_config
         if len(scheduler_config.cuda_graph_sizes) == 1:
             max_capture_size = scheduler_config.cuda_graph_sizes[0]
             # FIXME(woosuk): When using full cuda graph with FA3, the max
             # supported size is 992.
-            if max_capture_size < 1024:
+            if max_capture_size < 992:
                 cuda_graph_sizes = [1, 2, 4]
                 # Step size 8 for small batch sizes
                 cuda_graph_sizes += [i for i in range(8, 256, 8)]
                 # Step size 16 for larger batch sizes
-                cuda_graph_sizes += [i for i in range(256, 1025, 16)]
+                cuda_graph_sizes += [i for i in range(256, 993, 16)]
                 scheduler_config.cuda_graph_sizes = cuda_graph_sizes
                 logger.info(
                     "Overriding max cuda graph capture size to "
-                    "%d for performance.", 1024)
+                    "%d for performance.", 992)
 
 
 class MambaModelConfig(VerifyAndUpdateConfig):
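For reference, a minimal standalone sketch (the helper name is made up; only the list construction mirrors the diff) that confirms the graph counts quoted in the NOTE comment: capping at 992 yields 81 capture sizes, while the old cap of 1024 yielded 83.

def build_cuda_graph_sizes(max_capture_size: int) -> list[int]:
    # Same construction as in the diff above.
    sizes = [1, 2, 4]
    # Step size 8 for small batch sizes.
    sizes += [i for i in range(8, 256, 8)]
    # Step size 16 for larger batch sizes, up to the cap (inclusive).
    sizes += [i for i in range(256, max_capture_size + 1, 16)]
    return sizes

assert len(build_cuda_graph_sizes(992)) == 81   # new cap (FA3 full cuda graph limit)
assert len(build_cuda_graph_sizes(1024)) == 83  # old cap

The two extra sizes under the old cap were 1008 and 1024, which exceed the 992 limit mentioned in the FIXME for full cuda graph with FA3.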