Commit 93b9570

switch to xpu kernel for w8a16 gemm (vllm-project#323)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
1 parent: 87b1769

2 files changed (+4, -3 lines)
vllm/model_executor/layers/quantization/ipex_quant.py (3 additions, 3 deletions)

@@ -8,7 +8,7 @@
 from torch.nn import Module
 from torch.nn.parameter import Parameter

-from vllm._ipex_ops import ipex_ops as ops
+from vllm import _custom_ops as ops
 from vllm.model_executor.layers.fused_moe import (FusedMoEMethodBase,
                                                   FusedMoeWeightScaleSupported)
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig

@@ -283,8 +283,8 @@ def apply(self,
               bias: Optional[torch.Tensor] = None) -> torch.Tensor:
         weight = layer.weight.data
         weight_scale = layer.weight_scale.data
-        output = torch.ops.torch_ipex.fp8_gemm_w8a16(x, weight, True,
-                                                     weight_scale, bias)
+        output = torch.ops._xpu_C.fp8_gemm_w8a16(x, weight, True, weight_scale,
+                                                 bias)
         return output

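The apply() change swaps the IPEX-provided op (torch.ops.torch_ipex.fp8_gemm_w8a16) for the op that vllm_xpu_kernels registers under the _xpu_C namespace, keeping the same call signature. Below is a minimal sketch of that calling convention using a pure-PyTorch reference implementation registered under a hypothetical "demo" namespace; the reading of the boolean flag as "weight is transposed" and the per-row scale layout are illustrative assumptions, not taken from the kernel source.

from typing import Optional

import torch


# Illustrative stand-in for the compiled XPU kernel: a pure-PyTorch
# reference registered under a hypothetical "demo" namespace
# (torch.library.custom_op requires torch >= 2.4).
@torch.library.custom_op("demo::fp8_gemm_w8a16", mutates_args=())
def fp8_gemm_w8a16(x: torch.Tensor, weight: torch.Tensor, trans_b: bool,
                   weight_scale: torch.Tensor,
                   bias: Optional[torch.Tensor] = None) -> torch.Tensor:
    # Dequantize the fp8 weight to the activation dtype, then a plain GEMM.
    w = weight.to(x.dtype) * weight_scale
    out = x @ (w.t() if trans_b else w)
    return out if bias is None else out + bias


x = torch.randn(4, 8)
w = torch.randn(16, 8).to(torch.float8_e4m3fn)  # fp8 weight, row-major
scale = torch.full((16, 1), 0.5)                # assumed per-row scale
# Resolved through torch.ops just like torch.ops._xpu_C.fp8_gemm_w8a16:
y = torch.ops.demo.fp8_gemm_w8a16(x, w, True, scale, None)
print(y.shape)  # torch.Size([4, 16])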
vllm/platforms/xpu.py (1 addition, 0 deletions)

@@ -7,6 +7,7 @@
 import torch
 # import custom ops, trigger op registration
 import vllm_xpu_kernels._C  # noqa
+import vllm_xpu_kernels._xpu_C  # noqa

 import vllm.envs as envs
 from vllm.logger import init_logger
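The new import matters because op resolution is lazy: a compiled extension registers its ops (via TORCH_LIBRARY in C++) as a side effect of being imported, and torch.ops._xpu_C.fp8_gemm_w8a16 only resolves after that registration has run. A small sketch of the same mechanism at the Python level, using a hypothetical demo_ns namespace:

import torch

# Before anything registers the namespace, the op lookup fails.
try:
    torch.ops.demo_ns.double_op
except AttributeError:
    print("op not found until a module registers it")

# A compiled extension does this in C++ via TORCH_LIBRARY at import time;
# torch.library.define/impl is the Python-level equivalent.
torch.library.define("demo_ns::double_op", "(Tensor x) -> Tensor")
torch.library.impl("demo_ns::double_op", "cpu", lambda x: x * 2)

print(torch.ops.demo_ns.double_op(torch.ones(2)))  # now resolves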
