Skip to content

Commit 622f1c4

Browse files
author
maleksan85
committed
Small improvement for linear
1 parent 474014a commit 622f1c4

File tree

2 files changed: +4 additions, −2 deletions

vllm/model_executor/layers/linear.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
+from vllm.model_executor.layers.tuned_gemm import tgemm
 # yapf: disable
 from vllm.model_executor.parameter import (BasevLLMParameter,
                                            BlockQuantScaleParameter,
@@ -137,7 +138,7 @@ def apply(self,
               layer: torch.nn.Module,
               x: torch.Tensor,
               bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-        return torch.mm(x, torch.transpose(layer.weight, 0, 1))
+        return tgemm.mm(x, layer.weight, bias)


 class LinearBase(torch.nn.Module):

vllm/model_executor/layers/tuned_gemm.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,13 @@
 import torch.nn.functional as F

 from vllm import _custom_ops as ops
+from vllm import envs
 from vllm.envs import VLLM_USE_ROCM_SKINNY_GEMM
 from vllm.platforms import current_platform
 from vllm.utils import is_mi250, is_navi

 support_tuned_gemms = False
-if current_platform.is_rocm():
+if current_platform.is_rocm() and not envs.VLLM_USE_V1:
     import vllm._gradlib_C  # noqa: F401
     support_tuned_gemms = True

Comments (0)