
Commit a8ffc4f

[Bugfix] Lower gpt-oss max cudagraph size to 992 to be compatible with FA3 (#25508)
Signed-off-by: mgoin <mgoin64@gmail.com>
1 parent: d5944d5

1 file changed (+5 / -5 lines)


vllm/model_executor/models/config.py

Lines changed: 5 additions & 5 deletions
@@ -266,24 +266,24 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None:
         if structured_outputs_config.reasoning_parser == "":
             structured_outputs_config.reasoning_parser = "openai_gptoss"
 
-        # Increase the max capture size from 512 to 1024 for performance.
+        # Increase the max capture size from 512 to 992 for performance.
         # NOTE(woosuk): This will increase the number of CUDA graphs
-        # from 67 to 83.
+        # from 67 to 81.
         scheduler_config = vllm_config.scheduler_config
         if len(scheduler_config.cuda_graph_sizes) == 1:
             max_capture_size = scheduler_config.cuda_graph_sizes[0]
             # FIXME(woosuk): When using full cuda graph with FA3, the max
             # supported size is 992.
-            if max_capture_size < 1024:
+            if max_capture_size < 992:
                 cuda_graph_sizes = [1, 2, 4]
                 # Step size 8 for small batch sizes
                 cuda_graph_sizes += [i for i in range(8, 256, 8)]
                 # Step size 16 for larger batch sizes
-                cuda_graph_sizes += [i for i in range(256, 1025, 16)]
+                cuda_graph_sizes += [i for i in range(256, 993, 16)]
                 scheduler_config.cuda_graph_sizes = cuda_graph_sizes
                 logger.info(
                     "Overriding max cuda graph capture size to "
-                    "%d for performance.", 1024)
+                    "%d for performance.", 992)
 
 
 class MambaModelConfig(VerifyAndUpdateConfig):
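
For quick verification (a standalone sketch, not part of the commit or the vLLM source), the capture-size lists can be rebuilt from the ranges shown in the diff: the old schedule yields 83 sizes ending at 1024, which exceeds FA3's supported maximum, while the new one yields 81 sizes ending at 992, matching both the updated "from 67 to 81" comment and the FIXME's 992 limit.

# Sketch: rebuild both capture-size lists from the ranges in the diff
# and check the counts claimed in the updated comment.
old_sizes = [1, 2, 4] + list(range(8, 256, 8)) + list(range(256, 1025, 16))
new_sizes = [1, 2, 4] + list(range(8, 256, 8)) + list(range(256, 993, 16))
print(len(old_sizes), max(old_sizes))  # 83 1024 -> over FA3's supported max
print(len(new_sizes), max(new_sizes))  # 81 992  -> compatible with FA3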
