Commit 081cb98 (1 parent: 08808b4)

Configuration defaults

Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>

2 files changed: +17 −28 lines

vllm/config/compilation.py

Lines changed: 15 additions & 26 deletions
@@ -75,11 +75,11 @@ class PassConfig:
     don't all have access to full configuration - that would create a cycle as
     the `PassManager` is set as a property of config."""
 
-    enable_fusion: bool = field(default_factory=lambda: not envs.VLLM_USE_V1)
+    enable_fusion: bool = field(default_factory=lambda: envs.VLLM_USE_V1)
     """Whether to enable the custom fusion (RMSNorm/SiluMul+quant) pass."""
-    enable_attn_fusion: bool = False
+    enable_attn_fusion: bool = field(default_factory=lambda: envs.VLLM_USE_V1)
     """Whether to enable the custom attention+quant fusion pass."""
-    enable_noop: bool = field(default_factory=lambda: not envs.VLLM_USE_V1)
+    enable_noop: bool = field(default_factory=lambda: envs.VLLM_USE_V1)
     """Whether to enable the custom no-op elimination pass."""
     enable_sequence_parallelism: bool = False
     """Whether to enable sequence parallelism."""
@@ -223,7 +223,7 @@ class CompilationConfig:
     constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""
 
     # CudaGraph compilation
-    cudagraph_mode: Optional[CUDAGraphMode] = None
+    cudagraph_mode: Optional[CUDAGraphMode] = CUDAGraphMode.FULL
     """
     The mode of the cudagraph:
@@ -410,6 +410,16 @@ def __post_init__(self) -> None:
         count_all = self.custom_ops.count("all")
         assert count_none + count_all <= 1, "Can only specify 'none' or 'all'"
 
+        if "+rms_norm" not in self.custom_ops and \
+                "-rms_norm" not in self.custom_ops:
+            self.custom_ops.append("+rms_norm")
+        if "+silu_and_mul" not in self.custom_ops and \
+                "-silu_and_mul" not in self.custom_ops:
+            self.custom_ops.append("+silu_and_mul")
+        if "+quant_fp8" not in self.custom_ops and \
+                "-quant_fp8" not in self.custom_ops:
+            self.custom_ops.append("+quant_fp8")
+
         # TODO(zou3519/luka): There are 2 issues with auto-functionalization V2:
         # 1. A bug in PyTorch, fixed in 2.7:
         # https://github.com/pytorch/pytorch/issues/147924
@@ -540,28 +550,7 @@ def set_splitting_ops_for_v1(self):
             "set_splitting_ops_for_v1 should only be called when "
             "level is CompilationLevel.PIECEWISE")
 
-        if self.splitting_ops is None:
-            # NOTE: When using full cudagraph, instead of setting an empty
-            # list and capture the full cudagraph inside the flattened fx
-            # graph, we keep the piecewise fx graph structure but capture the
-            # full cudagraph outside the fx graph. This reduces some cpu
-            # overhead when the runtime batch_size is not cudagraph captured.
-            # see https://github.com/vllm-project/vllm/pull/20059 for details.
-            # make a copy to avoid mutating the class-level list via reference.
-            self.splitting_ops = list(self._attention_ops)
-        elif len(self.splitting_ops) == 0:
-            logger.warning_once("Using piecewise compilation with empty "
-                                "splitting_ops.")
-            if self.cudagraph_mode == CUDAGraphMode.PIECEWISE:
-                logger.warning_once(
-                    "When compilation level is piecewise with empty "
-                    "splitting_ops, PIECEWISE cudagraph_mode will be "
-                    "treated as FULL cudagraph_mode. Please ensure you are "
-                    "using attention backends that support cudagraph or set "
-                    "cudagraph_mode to NONE explicitly if encountering "
-                    "any problems.")
-                self.cudagraph_mode = CUDAGraphMode.FULL
-            self.splitting_ops = []
+        self.splitting_ops = []
 
         if envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput":
             # exclude MoE dispatch/combine from capture by ensuring
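
The `__post_init__` hunk above force-enables the `rms_norm`, `silu_and_mul`, and `quant_fp8` custom ops whenever the user has not toggled them explicitly with a `+`/`-` prefix. A minimal standalone sketch of that normalization rule (the helper name is illustrative and nothing here imports vLLM; only the op names and the `+`/`-` convention come from the diff):

# Standalone sketch of the custom_ops normalization added in __post_init__:
# append "+<op>" only when the user has neither enabled ("+<op>") nor
# disabled ("-<op>") that op explicitly.
def normalize_custom_ops(custom_ops: list[str]) -> list[str]:
    ops = list(custom_ops)  # copy so the caller's list is not mutated
    for op in ("rms_norm", "silu_and_mul", "quant_fp8"):
        if f"+{op}" not in ops and f"-{op}" not in ops:
            ops.append(f"+{op}")
    return ops

# An explicit "-quant_fp8" is respected; the other two ops are force-enabled.
print(normalize_custom_ops(["-quant_fp8"]))
# -> ['-quant_fp8', '+rms_norm', '+silu_and_mul']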

vllm/envs.py

Lines changed: 2 additions & 2 deletions
@@ -17,7 +17,7 @@
     VLLM_NCCL_SO_PATH: Optional[str] = None
     LD_LIBRARY_PATH: Optional[str] = None
     VLLM_USE_TRITON_FLASH_ATTN: bool = True
-    VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = False
+    VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = True
     VLLM_USE_AITER_UNIFIED_ATTENTION: bool = False
     VLLM_FLASH_ATTN_VERSION: Optional[int] = None
     LOCAL_RANK: int = 0
@@ -416,7 +416,7 @@ def get_vllm_port() -> Optional[int]:
     # the unified triton kernel.
     "VLLM_V1_USE_PREFILL_DECODE_ATTENTION":
     lambda:
-    (os.getenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "False").lower() in
+    (os.getenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "True").lower() in
     ("true", "1")),
 
     # Use AITER triton unified attention for V1 attention
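
The `envs.py` change flips the fallback string from "False" to "True", so the prefill/decode attention path is now on unless the variable is explicitly set to something other than "true"/"1". A small sketch of the same parsing rule (the wrapper function is illustrative; the `os.getenv` expression mirrors the lambda in the diff):

import os

# Mirrors the parsing in the diff: unset now defaults to "True", and only a
# case-insensitive "true" or "1" keeps the feature enabled.
def prefill_decode_attention_enabled() -> bool:
    return (os.getenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "True").lower()
            in ("true", "1"))

print(prefill_decode_attention_enabled())  # True when the variable is unset
os.environ["VLLM_V1_USE_PREFILL_DECODE_ATTENTION"] = "0"
print(prefill_decode_attention_enabled())  # False once explicitly opted out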
