@@ -75,11 +75,11 @@ class PassConfig:
     don't all have access to full configuration - that would create a cycle as
     the `PassManager` is set as a property of config."""

-    enable_fusion: bool = field(default_factory=lambda: not envs.VLLM_USE_V1)
+    enable_fusion: bool = field(default_factory=lambda: envs.VLLM_USE_V1)
     """Whether to enable the custom fusion (RMSNorm/SiluMul+quant) pass."""
-    enable_attn_fusion: bool = False
+    enable_attn_fusion: bool = field(default_factory=lambda: envs.VLLM_USE_V1)
     """Whether to enable the custom attention+quant fusion pass."""
-    enable_noop: bool = field(default_factory=lambda: not envs.VLLM_USE_V1)
+    enable_noop: bool = field(default_factory=lambda: envs.VLLM_USE_V1)
     """Whether to enable the custom no-op elimination pass."""
     enable_sequence_parallelism: bool = False
     """Whether to enable sequence parallelism."""
@@ -223,7 +223,7 @@ class CompilationConfig:
     constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""

     # CudaGraph compilation
-    cudagraph_mode: Optional[CUDAGraphMode] = None
+    cudagraph_mode: Optional[CUDAGraphMode] = CUDAGraphMode.FULL
     """
     The mode of the cudagraph:

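The cudagraph default moves from `None` (left to be resolved later) to `CUDAGraphMode.FULL` at construction time. A short sketch, assuming both names are exported from `vllm.config`:

```python
from vllm.config import CompilationConfig, CUDAGraphMode  # import path assumed

cfg = CompilationConfig()
# The attribute is FULL immediately, instead of None until later resolution.
assert cfg.cudagraph_mode == CUDAGraphMode.FULL
```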
@@ -410,6 +410,16 @@ def __post_init__(self) -> None:
         count_all = self.custom_ops.count("all")
         assert count_none + count_all <= 1, "Can only specify 'none' or 'all'"

+        if "+rms_norm" not in self.custom_ops and \
+                "-rms_norm" not in self.custom_ops:
+            self.custom_ops.append("+rms_norm")
+        if "+silu_and_mul" not in self.custom_ops and \
+                "-silu_and_mul" not in self.custom_ops:
+            self.custom_ops.append("+silu_and_mul")
+        if "+quant_fp8" not in self.custom_ops and \
+                "-quant_fp8" not in self.custom_ops:
+            self.custom_ops.append("+quant_fp8")
+
         # TODO(zou3519/luka): There are 2 issues with auto-functionalization V2:
         # 1. A bug in PyTorch, fixed in 2.7:
         #    https://github.com/pytorch/pytorch/issues/147924
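The new `__post_init__` block force-enables the `rms_norm`, `silu_and_mul`, and `quant_fp8` custom ops unless the caller has already taken a position on them (a `+` or `-` entry). A hedged sketch of the effect (assumes `CompilationConfig` accepts `custom_ops` in its constructor, as the docstring earlier in this file suggests for `inductor_passes`):

```python
from vllm.config import CompilationConfig  # import path assumed

# Default construction: the three ops are appended as enabled.
cfg = CompilationConfig()
assert {"+rms_norm", "+silu_and_mul", "+quant_fp8"} <= set(cfg.custom_ops)

# An explicit opt-out is left untouched; only unspecified ops are appended.
cfg = CompilationConfig(custom_ops=["-rms_norm"])
assert "-rms_norm" in cfg.custom_ops and "+rms_norm" not in cfg.custom_ops
```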
@@ -540,28 +550,7 @@ def set_splitting_ops_for_v1(self):
540550 "set_splitting_ops_for_v1 should only be called when "
541551 "level is CompilationLevel.PIECEWISE" )
542552
543- if self .splitting_ops is None :
544- # NOTE: When using full cudagraph, instead of setting an empty
545- # list and capture the full cudagraph inside the flattened fx
546- # graph, we keep the piecewise fx graph structure but capture the
547- # full cudagraph outside the fx graph. This reduces some cpu
548- # overhead when the runtime batch_size is not cudagraph captured.
549- # see https://github.com/vllm-project/vllm/pull/20059 for details.
550- # make a copy to avoid mutating the class-level list via reference.
551- self .splitting_ops = list (self ._attention_ops )
552- elif len (self .splitting_ops ) == 0 :
553- logger .warning_once ("Using piecewise compilation with empty "
554- "splitting_ops." )
555- if self .cudagraph_mode == CUDAGraphMode .PIECEWISE :
556- logger .warning_once (
557- "When compilation level is piecewise with empty "
558- "splitting_ops, PIECEWISE cudagraph_mode will be "
559- "treated as FULL cudagraph_mode. Please ensure you are "
560- "using attention backends that support cudagraph or set "
561- "cudagraph_mode to NONE explicitly if encountering "
562- "any problems." )
563- self .cudagraph_mode = CUDAGraphMode .FULL
564- self .splitting_ops = []
553+ self .splitting_ops = []
565554
566555 if envs .VLLM_ALL2ALL_BACKEND == "deepep_high_throughput" :
567556 # exclude MoE dispatch/combine from capture by ensuring
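`set_splitting_ops_for_v1` no longer special-cases `None` or empty `splitting_ops`: it unconditionally clears the list, dropping both the attention-op default and the PIECEWISE-to-FULL fallback warning. A sketch of the new behavior (the op name passed in and the `CompilationLevel` usage are illustrative assumptions, not taken from the diff):

```python
from vllm.config import CompilationConfig, CompilationLevel  # imports assumed

cfg = CompilationConfig(level=CompilationLevel.PIECEWISE,
                        splitting_ops=["vllm.unified_attention"])  # hypothetical op name
cfg.set_splitting_ops_for_v1()
assert cfg.splitting_ops == []  # always cleared now, whatever was passed in
```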