Skip to content

Commit 40f896e

Browse files
committed
refactor attention backend for perf boost
Signed-off-by: ganyi <ygan@amd.com>
1 parent 4e68cc9 commit 40f896e

File tree

6 files changed

+763
-297
lines changed

6 files changed

+763
-297
lines changed

vllm/config/scheduler.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,10 @@ class SchedulerConfig:
137137
structured outputs, speculative decoding, and pipeline parallelism.
138138
"""
139139

140+
split_prefill_from_chunk: bool = False
141+
"""Whether to split the prefill request into pure prefill and chunked
142+
prefill in a single batch."""
143+
140144
def compute_hash(self) -> str:
141145
"""
142146
WARNING: Whenever a new field is added to this config,

vllm/platforms/rocm.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -391,6 +391,11 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
391391
):
392392
compilation_config.custom_ops.append("+rms_norm")
393393

394+
if envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MHA:
395+
# Enable request reordering when the AITER MHA backend
396+
# is used for attention computation.
397+
vllm_config.scheduler_config.split_prefill_from_chunk = True
398+
394399
@classmethod
395400
def verify_model_arch(cls, model_arch: str) -> None:
396401
if model_arch in _ROCM_UNSUPPORTED_MODELS:

0 commit comments

Comments
 (0)