@@ -162,15 +162,15 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
             if cls.is_device_capability(100):
                 # Blackwell => Force CutlassMLA.
                 use_cutlass_mla = True
-                envs.VLLM_ATTENTION_BACKEND = "CUTLASS_MLA_VLLM_V1"
+                envs.VLLM_ATTENTION_BACKEND = "CUTLASS_MLA"
             else:
                 # Not Blackwell
                 use_flashmla = True
         else:
             # Forced case
             use_flashmla = (envs.VLLM_ATTENTION_BACKEND == "FLASHMLA")
             use_cutlass_mla = (
-                envs.VLLM_ATTENTION_BACKEND == "CUTLASS_MLA_VLLM_V1")
+                envs.VLLM_ATTENTION_BACKEND == "CUTLASS_MLA")

         from vllm.attention.ops.flashmla import is_flashmla_supported
         if use_flashmla and is_flashmla_supported()[0] \
@@ -182,7 +182,7 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
         if use_cutlass_mla and cache_config.block_size != 128:
             cache_config.block_size = 128
             logger.info("Forcing kv cache block size to 128 for "
-                        "CUTLASS_MLA_VLLM_V1 backend.")
+                        "CUTLASS_MLA backend.")

         compilation_config = vllm_config.compilation_config
         if (envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput"
@@ -211,9 +211,9 @@ def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
                              kv_cache_dtype, block_size, use_v1,
                              use_mla) -> str:
         if use_mla:
-            # TODO(lucas): refactor to be more concise
+            # TODO(lucas): refactor to be more concise
             # we should probably consider factoring out V1 here
-            if selected_backend == _Backend.CUTLASS_MLA_VLLM_V1:
+            if selected_backend == _Backend.CUTLASS_MLA:
                 if use_v1:
                     logger.info_once("Using Cutlass MLA backend on V1 engine.")
                     return ("vllm.v1.attention.backends.mla."
0 commit comments