From 33fd0ce60365147e6f5fcc21e7e04c64fa7d04d7 Mon Sep 17 00:00:00 2001
From: mgoin
Date: Wed, 30 Jul 2025 19:03:15 +0000
Subject: [PATCH] Rename CUTLASS_MLA_VLLM_V1 to CUTLASS_MLA

Signed-off-by: mgoin
---
 vllm/engine/arg_utils.py                      |  2 +-
 vllm/platforms/cuda.py                        | 10 +++++-----
 vllm/platforms/interface.py                   |  2 +-
 vllm/v1/attention/backends/mla/cutlass_mla.py |  2 +-
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index ababa49a53ae..c36c79c69317 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1417,7 +1417,7 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
             "PALLAS_VLLM_V1",
             "TRITON_ATTN_VLLM_V1",
             "TRITON_MLA",
-            "CUTLASS_MLA_VLLM_V1",
+            "CUTLASS_MLA",
             "FLASHMLA",
             "FLASHINFER",
             "FLASHINFER_VLLM_V1",
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index c35d22c1d682..87ff6b385809 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -162,7 +162,7 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
                 if cls.is_device_capability(100):
                     # Blackwell => Force CutlassMLA.
                     use_cutlass_mla = True
-                    envs.VLLM_ATTENTION_BACKEND = "CUTLASS_MLA_VLLM_V1"
+                    envs.VLLM_ATTENTION_BACKEND = "CUTLASS_MLA"
                 else:
                     # Not Blackwell
                     use_flashmla = True
@@ -170,7 +170,7 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
                 # Forced case
                 use_flashmla = (envs.VLLM_ATTENTION_BACKEND == "FLASHMLA")
                 use_cutlass_mla = (
-                    envs.VLLM_ATTENTION_BACKEND == "CUTLASS_MLA_VLLM_V1")
+                    envs.VLLM_ATTENTION_BACKEND == "CUTLASS_MLA")
 
             from vllm.attention.ops.flashmla import is_flashmla_supported
             if use_flashmla and is_flashmla_supported()[0] \
@@ -182,7 +182,7 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
             if use_cutlass_mla and cache_config.block_size != 128:
                 cache_config.block_size = 128
                 logger.info("Forcing kv cache block size to 128 for "
-                            "CUTLASS_MLA_VLLM_V1 backend.")
+                            "CUTLASS_MLA backend.")
 
         compilation_config = vllm_config.compilation_config
         if (envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput"
@@ -211,9 +211,9 @@ def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
                              kv_cache_dtype, block_size, use_v1,
                              use_mla) -> str:
         if use_mla:
-            # TODO(lucas): refactor to be more concise 
+            # TODO(lucas): refactor to be more concise
             #  we should probably consider factoring out V1 here
-            if selected_backend == _Backend.CUTLASS_MLA_VLLM_V1:
+            if selected_backend == _Backend.CUTLASS_MLA:
                 if use_v1:
                     logger.info_once("Using Cutlass MLA backend on V1 engine.")
                     return ("vllm.v1.attention.backends.mla."
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 02cc392244ba..6bae0fe25c79 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -53,7 +53,7 @@ class _Backend(enum.Enum):
     TRITON_MLA_VLLM_V1 = enum.auto()
     FLASHMLA_VLLM_V1 = enum.auto()
     FLASHMLA = enum.auto()  # Supported by V1
-    CUTLASS_MLA_VLLM_V1 = enum.auto()
+    CUTLASS_MLA = enum.auto()
     PALLAS = enum.auto()
     PALLAS_VLLM_V1 = enum.auto()
     IPEX = enum.auto()
diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py
index c787f25cd3ad..b23a8f0a5e87 100644
--- a/vllm/v1/attention/backends/mla/cutlass_mla.py
+++ b/vllm/v1/attention/backends/mla/cutlass_mla.py
@@ -21,7 +21,7 @@ class CutlassMLABackend(MLACommonBackend):
 
     @staticmethod
     def get_name() -> str:
-        return "CUTLASS_MLA_VLLM_V1"
+        return "CUTLASS_MLA"
 
     @staticmethod
     def get_impl_cls() -> type["CutlassMLAImpl"]:
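
Usage note (not part of the patch): a minimal sketch of selecting the renamed backend by hand through the VLLM_ATTENTION_BACKEND environment variable that this patch updates, assuming vLLM's offline LLM API; the model name below is illustrative, and any MLA-capable checkpoint could stand in. As the cuda.py hunks show, the override must be in place before platform setup runs, and check_and_update_config then forces a KV-cache block size of 128 for this backend.

    import os

    # Set before creating the engine so platform setup sees the override;
    # the value "CUTLASS_MLA" replaces the old "CUTLASS_MLA_VLLM_V1".
    os.environ["VLLM_ATTENTION_BACKEND"] = "CUTLASS_MLA"

    from vllm import LLM, SamplingParams

    # Illustrative MLA model; per the patch, CUTLASS MLA targets Blackwell
    # (compute capability 10.0) GPUs, and the platform code forces a
    # KV-cache block size of 128 for it.
    llm = LLM(model="deepseek-ai/DeepSeek-V2-Lite")
    outputs = llm.generate(["Hello"], SamplingParams(max_tokens=8))
    print(outputs[0].outputs[0].text)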