Skip to content

Commit a8f6e76

Browse files
committed
[1/N][Refactor] torchair model runner refactor
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
1 parent 807f089 commit a8f6e76

File tree

3 files changed: +42 −2 lines changed
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#
2+
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
3+
# Copyright 2023 The vLLM team.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
# This file is a part of the vllm-ascend project.
17+
# Adapted from vllm-project/vllm/vllm/worker/gpu_model_runner.py
18+
#
19+
20+
21+
import torch
22+
import torch._dynamo.cache_size
23+
from vllm.config import VllmConfig
24+
25+
from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
26+
27+
28+
class NPUTorchairModelRunner(NPUModelRunner):
    """Model runner for torchair graph mode on Ascend NPUs.

    Currently a thin placeholder subclass of ``NPUModelRunner``; this commit
    ("[1/N] torchair model runner refactor") only introduces the class, and
    torchair-specific behavior is presumably added in follow-up steps.
    """

    def __init__(self, vllm_config: VllmConfig, device: torch.device) -> None:
        # Delegates all initialization to the base NPU model runner.
        super().__init__(vllm_config, device)

vllm_ascend/torchair/torchair_worker.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from vllm.logger import logger
1818

1919
import vllm_ascend.envs as envs_ascend
20+
from vllm_ascend.torchair.torchair_model_runner import NPUTorchairModelRunner
2021
from vllm_ascend.torchair.utils import (check_kv_cache_bytes_cache_exist,
2122
check_torchair_cache_exist,
2223
delete_torchair_cache_file,
@@ -52,3 +53,9 @@ def determine_available_memory(self) -> int:
5253
self.model_runner.new_kv_cache_bytes = available_kv_cache_memory
5354

5455
return available_kv_cache_memory
56+
57+
def init_device(self):
    """Initialize the NPU device, then attach a torchair-specific runner.

    Overrides the base worker so that ``self.model_runner`` is an
    ``NPUTorchairModelRunner`` instead of the default ``NPUModelRunner``.
    """
    npu_device = self._init_device()
    # The runner is constructed only after _init_device() has finished,
    # so it can rely on the device (and self.device) being set up.
    self.model_runner = NPUTorchairModelRunner(self.vllm_config, npu_device)

vllm_ascend/worker/worker_v1.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -130,17 +130,19 @@ def initialize_cache(self, num_gpu_blocks: int,
130130
self.cache_config.num_gpu_blocks = num_gpu_blocks
131131
self.cache_config.num_cpu_blocks = num_cpu_blocks
132132

133-
def init_device(self):
133+
def _init_device(self):
    """Bind this worker to its local NPU and prepare the runtime state.

    Sets the active device, records the initial free-memory figure,
    brings up the distributed environment, and seeds the RNGs.

    Returns:
        The ``torch.device`` for this worker's local NPU rank.
    """
    npu_device = torch.device(f"npu:{self.local_rank}")
    NPUPlatform.set_device(npu_device)
    NPUPlatform.empty_cache()
    # Snapshot device memory before any model allocations — presumably the
    # free-memory figure (mem_get_info()[0]), used later for profiling.
    self.init_npu_memory = NPUPlatform.mem_get_info()[0]
    # Bring up the distributed environment (process groups etc.).
    self._init_worker_distributed_environment()
    # Seed all RNGs for reproducibility across workers.
    NPUPlatform.seed_everything(self.model_config.seed)
    return npu_device
143143

144+
def init_device(self):
    """Set up the NPU and attach the default ``NPUModelRunner``."""
    npu_device = self._init_device()
    # Create the runner here rather than in __init__ so that device and
    # distributed initialization have already completed (self.device is valid).
    self.model_runner = NPUModelRunner(self.vllm_config, npu_device)
146148

0 commit comments

Comments (0)