
Commit ce15379

wangxiyuan authored and ilmarkov committed
[1/N][Platform] Cleanup useless function (vllm-project#26982)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
1 parent b72014b commit ce15379

File tree: 7 files changed, +21 −106 lines


tests/models/quantization/test_fp8.py

Lines changed: 5 additions & 2 deletions
@@ -9,6 +9,7 @@
 import pytest
 
 from tests.quantization.utils import is_quant_method_supported
+from vllm.attention.utils.fa_utils import flash_attn_supports_fp8
 from vllm.platforms import current_platform
 from vllm.utils import STR_BACKEND_ENV_VAR
 
@@ -69,8 +70,10 @@ def test_models(
     if kv_cache_dtype == "fp8_e5m2" and current_platform.is_rocm():
         pytest.skip(f"{kv_cache_dtype} is currently not supported on ROCm/HIP.")
 
-    if not current_platform.is_kv_cache_dtype_supported(kv_cache_dtype, None):
-        pytest.skip(f"{kv_cache_dtype} is not supported on this platform.")
+    if not flash_attn_supports_fp8():
+        pytest.skip(
+            f"{kv_cache_dtype} is not supported on this GPU type with {backend} attention."
+        )
 
     with monkeypatch.context() as m:
         m.setenv("TOKENIZERS_PARALLELISM", "true")
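Note: with the platform-level is_kv_cache_dtype_supported hook gone, the test asks the FlashAttention utility directly. The same helper can also gate an entire test at collection time; a minimal sketch (the test name and marker placement are illustrative, not part of this commit):

# Hypothetical module-level guard, assuming flash_attn_supports_fp8()
# is importable exactly as in the test above.
import pytest

from vllm.attention.utils.fa_utils import flash_attn_supports_fp8


@pytest.mark.skipif(
    not flash_attn_supports_fp8(),
    reason="FP8 KV cache requires FlashAttention FP8 support on this GPU.",
)
def test_fp8_kv_cache_smoke():
    ...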

tests/quantization/test_compressed_tensors.py

Lines changed: 0 additions & 4 deletions
@@ -356,10 +356,6 @@ def check_model(model):
         assert output
 
 
-@pytest.mark.skipif(
-    not current_platform.is_kv_cache_dtype_supported("fp8", None),
-    reason="FP8 KV cache is not supported on this device.",
-)
 @pytest.mark.skipif(
     not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform."
 )

vllm/platforms/cuda.py

Lines changed: 1 addition & 44 deletions
@@ -23,7 +23,7 @@
 
 if TYPE_CHECKING:
     from vllm.attention.backends.registry import _Backend
-    from vllm.config import ModelConfig, VllmConfig
+    from vllm.config import VllmConfig
 else:
     _Backend = None
 
@@ -457,49 +457,6 @@ def get_static_graph_wrapper_cls(cls) -> str:
     def device_count(cls) -> int:
         return cuda_device_count_stateless()
 
-    @classmethod
-    def is_kv_cache_dtype_supported(
-        cls, kv_cache_dtype: str, model_config: "ModelConfig"
-    ) -> bool:
-        fp8_attention = kv_cache_dtype.startswith("fp8")
-        attention_backend = envs.VLLM_ATTENTION_BACKEND
-
-        supported = False
-        if model_config is not None and model_config.use_mla:
-            # Default to CutlassMLA for blackwell,
-            # FlashMLA otherwise
-            if attention_backend is None:
-                if cls.is_device_capability(100):
-                    attention_backend = "CUTLASS_MLA"
-                else:
-                    attention_backend = "FLASHMLA"
-
-            # Only FlashMLA and CUTLASS_MLA support fp8
-            if attention_backend in ["FLASHMLA", "CUTLASS_MLA", "FLASHINFER_MLA"]:
-                supported = True
-            else:
-                supported = not fp8_attention
-        else:
-            # Default to FlashAttention
-            if attention_backend is None:
-                attention_backend = "FLASH_ATTN"
-
-            # All Blackwell backends support fp8
-            if cls.is_device_capability(100):
-                supported = True
-            elif attention_backend == "FLASH_ATTN":
-                if fp8_attention:
-                    from vllm.attention.utils.fa_utils import flash_attn_supports_fp8
-
-                    supported = flash_attn_supports_fp8()
-                else:
-                    supported = True
-            elif attention_backend == "FLASHINFER":
-                supported = True
-            elif attention_backend == "TRITON_ATTN":
-                supported = cls.supports_fp8()
-        return supported
-
     @classmethod
     def check_if_supports_dtype(cls, dtype: torch.dtype):
         if dtype == torch.bfloat16:  # noqa: SIM102
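Note: the removed CUDA heuristic is not relocated anywhere in this commit; callers such as the FP8 test above now query the attention utilities directly. Out-of-tree code that still wants the old decision table could keep a standalone copy. A minimal sketch of the deleted logic (a sketch only, assuming the backend names and the envs / flash_attn_supports_fp8 / supports_fp8 / is_device_capability helpers shown in the removed code remain available):

# Standalone copy of the deleted heuristic, kept only as an illustration;
# it is not part of this commit.
import vllm.envs as envs
from vllm.attention.utils.fa_utils import flash_attn_supports_fp8
from vllm.platforms import current_platform


def kv_cache_dtype_supported(kv_cache_dtype: str, use_mla: bool) -> bool:
    fp8_attention = kv_cache_dtype.startswith("fp8")
    backend = envs.VLLM_ATTENTION_BACKEND

    if use_mla:
        # Default to CutlassMLA on Blackwell, FlashMLA otherwise.
        if backend is None:
            is_blackwell = current_platform.is_device_capability(100)
            backend = "CUTLASS_MLA" if is_blackwell else "FLASHMLA"
        # Only the listed MLA backends supported fp8 KV caches.
        if backend in ("FLASHMLA", "CUTLASS_MLA", "FLASHINFER_MLA"):
            return True
        return not fp8_attention

    # Non-MLA path: every Blackwell backend supported fp8.
    if current_platform.is_device_capability(100):
        return True
    backend = backend or "FLASH_ATTN"
    if backend == "FLASH_ATTN":
        return flash_attn_supports_fp8() if fp8_attention else True
    if backend == "FLASHINFER":
        return True
    if backend == "TRITON_ATTN":
        return current_platform.supports_fp8()
    return False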

vllm/platforms/interface.py

Lines changed: 14 additions & 27 deletions
@@ -7,36 +7,31 @@
 import random
 import sys
 from datetime import timedelta
-from platform import uname
 from typing import TYPE_CHECKING, Any, NamedTuple
 
 import numpy as np
 import torch
-from torch.distributed import PrefixStore, ProcessGroup
 
-from vllm.inputs import ProcessorInputs, PromptType
 from vllm.logger import init_logger
 
 if TYPE_CHECKING:
+    from torch.distributed import PrefixStore, ProcessGroup
+
     from vllm.attention.backends.registry import _Backend
-    from vllm.config import ModelConfig, VllmConfig
+    from vllm.config import VllmConfig
+    from vllm.inputs import ProcessorInputs, PromptType
     from vllm.pooling_params import PoolingParams
     from vllm.sampling_params import SamplingParams
     from vllm.utils import FlexibleArgumentParser
 else:
-    _Backend = object
-    ModelConfig = object
-    VllmConfig = object
-    PoolingParams = object
-    SamplingParams = object
     FlexibleArgumentParser = object
 
 logger = init_logger(__name__)
 
 
 def in_wsl() -> bool:
     # Reference: https://github.com/microsoft/WSL/issues/4071
-    return "microsoft" in " ".join(uname()).lower()
+    return "microsoft" in " ".join(platform.uname()).lower()
 
 
 class PlatformEnum(enum.Enum):
@@ -178,15 +173,16 @@ def import_kernels(cls) -> None:
         import vllm._moe_C  # noqa: F401
 
     @classmethod
-    def get_vit_attn_backend(cls, head_size: int, dtype: torch.dtype) -> _Backend:
+    def get_vit_attn_backend(cls, head_size: int, dtype: torch.dtype) -> "_Backend":
+        # Import _Backend here to avoid circular import.
         from vllm.attention.backends.registry import _Backend
 
         return _Backend.TORCH_SDPA
 
     @classmethod
     def get_attn_backend_cls(
         cls,
-        selected_backend: _Backend,
+        selected_backend: "_Backend",
         head_size: int,
         dtype: torch.dtype,
         kv_cache_dtype: str | None,
@@ -317,7 +313,7 @@ def pre_register_and_update(
         pass
 
     @classmethod
-    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+    def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
         """
         Check and update the configuration for the current platform.
@@ -498,9 +494,9 @@ def opaque_attention_op(cls) -> bool:
     @classmethod
     def validate_request(
         cls,
-        prompt: PromptType,
-        params: SamplingParams | PoolingParams,
-        processed_inputs: ProcessorInputs,
+        prompt: "PromptType",
+        params: "SamplingParams | PoolingParams",
+        processed_inputs: "ProcessorInputs",
     ) -> None:
         """Raises if this request is unsupported on this platform"""
 
@@ -543,25 +539,16 @@ def get_static_graph_wrapper_cls(cls) -> str:
     def stateless_init_device_torch_dist_pg(
        cls,
        backend: str,
-        prefix_store: PrefixStore,
+        prefix_store: "PrefixStore",
        group_rank: int,
        group_size: int,
        timeout: timedelta,
-    ) -> ProcessGroup:
+    ) -> "ProcessGroup":
         """
         Init platform-specific torch distributed process group.
         """
         raise NotImplementedError
 
-    @classmethod
-    def is_kv_cache_dtype_supported(
-        cls, kv_cache_dtype: str, model_config: ModelConfig
-    ) -> bool:
-        """
-        Returns if the kv_cache_dtype is supported by the current platform.
-        """
-        return False
-
     @classmethod
     def check_if_supports_dtype(cls, dtype: torch.dtype):
         """

vllm/platforms/rocm.py

Lines changed: 1 addition & 7 deletions
@@ -15,7 +15,7 @@
 
 if TYPE_CHECKING:
     from vllm.attention.backends.registry import _Backend
-    from vllm.config import ModelConfig, VllmConfig
+    from vllm.config import VllmConfig
 else:
     _Backend = None
 
@@ -474,12 +474,6 @@ def get_static_graph_wrapper_cls(cls) -> str:
     def device_count(cls) -> int:
         return cuda_device_count_stateless()
 
-    @classmethod
-    def is_kv_cache_dtype_supported(
-        cls, kv_cache_dtype: str, model_config: "ModelConfig"
-    ) -> bool:
-        return True
-
     @classmethod
     def check_if_supports_dtype(cls, dtype: torch.dtype):
         if dtype == torch.bfloat16:  # noqa: SIM102

vllm/platforms/tpu.py

Lines changed: 0 additions & 6 deletions
@@ -222,12 +222,6 @@ def validate_request(
         ):
             raise ValueError("Torch XLA does not support per-request seed.")
 
-    @classmethod
-    def is_kv_cache_dtype_supported(
-        cls, kv_cache_dtype: str, model_config: "ModelConfig"
-    ) -> bool:
-        return True
-
     @classmethod
     @torch.compile(backend="openxla")
     def insert_blocks_to_device(

vllm/platforms/xpu.py

Lines changed: 0 additions & 16 deletions
@@ -86,22 +86,6 @@ def get_attn_backend_cls(
         logger.info("Using Flash Attention backend on V1 engine.")
         return "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend"
 
-    @classmethod
-    def is_kv_cache_dtype_supported(
-        cls, kv_cache_dtype: str, model_config: "ModelConfig"
-    ) -> bool:
-        """
-        Check if the kv_cache_dtype is supported.
-        XPU only support fp8 kv cache with triton backend.
-        """
-        if (
-            envs.is_set("VLLM_ATTENTION_BACKEND")
-            and envs.VLLM_ATTENTION_BACKEND == "TRITON_ATTN"
-        ):
-            return kv_cache_dtype in ["fp8_e4m3", "fp8_e5m2", "fp8"]
-
-        return False
-
     @classmethod
     def set_device(cls, device: torch.device) -> None:
         """
