@@ -266,24 +266,24 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None:
         if structured_outputs_config.reasoning_parser == "":
             structured_outputs_config.reasoning_parser = "openai_gptoss"

-        # Increase the max capture size from 512 to 1024 for performance.
+        # Increase the max capture size from 512 to 992 for performance.
         # NOTE(woosuk): This will increase the number of CUDA graphs
-        # from 67 to 83.
+        # from 67 to 81.
         scheduler_config = vllm_config.scheduler_config
         if len(scheduler_config.cuda_graph_sizes) == 1:
             max_capture_size = scheduler_config.cuda_graph_sizes[0]
             # FIXME(woosuk): When using full cuda graph with FA3, the max
             # supported size is 992.
-            if max_capture_size < 1024:
+            if max_capture_size < 992:
                 cuda_graph_sizes = [1, 2, 4]
                 # Step size 8 for small batch sizes
                 cuda_graph_sizes += [i for i in range(8, 256, 8)]
                 # Step size 16 for larger batch sizes
-                cuda_graph_sizes += [i for i in range(256, 1025, 16)]
+                cuda_graph_sizes += [i for i in range(256, 993, 16)]
                 scheduler_config.cuda_graph_sizes = cuda_graph_sizes
                 logger.info(
                     "Overriding max cuda graph capture size to "
-                    "%d for performance.", 1024)
+                    "%d for performance.", 992)


 class MambaModelConfig(VerifyAndUpdateConfig):
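For reference, here is a minimal standalone sketch (not part of the diff) that reproduces the capture-size schedule built above and checks the graph counts quoted in the comments: the new schedule yields 81 sizes with a maximum of 992 (the FA3 full-cuda-graph limit noted in the FIXME), versus 83 sizes capped at 1024 before this change. The helper name `build_cuda_graph_sizes` is hypothetical, introduced only for illustration.

```python
# Hypothetical helper mirroring the schedule constructed in the diff above.
def build_cuda_graph_sizes(max_size: int) -> list[int]:
    sizes = [1, 2, 4]
    # Step size 8 for small batch sizes
    sizes += list(range(8, 256, 8))
    # Step size 16 for larger batch sizes, up to and including max_size
    sizes += list(range(256, max_size + 1, 16))
    return sizes

old = build_cuda_graph_sizes(1024)  # schedule before this change
new = build_cuda_graph_sizes(992)   # schedule after this change

assert len(old) == 83 and old[-1] == 1024
assert len(new) == 81 and new[-1] == 992
```

The arithmetic behind the comment update: [1, 2, 4] contributes 3 sizes, range(8, 256, 8) contributes 31, and range(256, 993, 16) contributes 47 (256 through 992 inclusive), for 3 + 31 + 47 = 81 total; the previous upper bound of 1025 gave 49 sizes in the last segment, hence 83.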