Skip to content

Commit 3400df0

Browse files
committed
feat: Improve performance for Atlas 300I series
Signed-off-by: Vincent Yuan <farawayboat@gmail.com>
1 parent 9fb3d55 commit 3400df0

File tree

1 file changed

+16
-0
lines changed

1 file changed

+16
-0
lines changed

vllm_ascend/worker/model_runner_v1.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,10 +89,14 @@
8989
else:
9090
xgr = LazyLoader("xgr", globals(), "xgrammar")
9191

92+
import torch_npu
9293
import vllm.envs as envs_vllm
9394

9495
import vllm_ascend.envs as envs_ascend
9596

97+
if is_310p():
98+
torch_npu.npu.set_compile_mode(jit_compile=False)
99+
96100

97101
@dataclass
98102
class GraphCaptureContext:
@@ -1991,6 +1995,18 @@ def load_model(self) -> None:
19911995

19921996
with DeviceMemoryProfiler() as m: # noqa: SIM117
19931997
self.model = get_model(vllm_config=self.vllm_config)
1998+
1999+
if is_310p():
2000+
from vllm.model_executor.layers.linear import (
2001+
MergedColumnParallelLinear, QKVParallelLinear,
2002+
RowParallelLinear)
2003+
for module in self.model.modules():
2004+
if isinstance(module,
2005+
(MergedColumnParallelLinear,
2006+
QKVParallelLinear, RowParallelLinear)):
2007+
module.weight.data = torch_npu.npu_format_cast(
2008+
module.weight.data, ACL_FORMAT_FRACTAL_NZ)
2009+
19942010
try:
19952011
# For version compatibility; remove this once we drop support for vllm v0.9.1
19962012
from vllm.model_executor.models.interfaces import \

0 commit comments

Comments
 (0)