4 files changed (+11, -12 lines)
vllm_ascend/platform.py: the clear_npu_memory classmethod is removed from NPUPlatform.

@@ -15,7 +15,6 @@
 # This file is a part of the vllm-ascend project.
 #
 
-import gc
 import os
 from datetime import timedelta
 from typing import TYPE_CHECKING, Optional, Tuple
@@ -107,12 +106,6 @@ def synchronize(cls):
     def mem_get_info(cls) -> Tuple[int, int]:
         return torch.npu.mem_get_info()
 
-    @classmethod
-    def clear_npu_memory(cls):
-        gc.collect()
-        torch.npu.empty_cache()
-        torch.npu.reset_peak_memory_stats()
-
     @classmethod
     def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         # initialize ascend config from vllm additional_config
vllm_ascend/utils.py: clear_npu_memory is added as a module-level helper.

@@ -18,6 +18,7 @@
 #
 
 import atexit
+import gc
 import math
 from contextlib import contextmanager, nullcontext
 from enum import Enum
@@ -294,3 +295,9 @@ def get_fused_moe_state(ep_size: int, with_prefill: bool):
         return FusedMoEState.All2All
     else:
         return FusedMoEState.MC2
+
+
+def clear_npu_memory():
+    gc.collect()
+    torch.npu.empty_cache()
+    torch.npu.reset_peak_memory_stats()
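For context, a minimal sketch of how the relocated helper can be exercised as a plain function, independent of NPUPlatform. This assumes `torch_npu` is installed so the `torch.npu` namespace is available; the `report_free_memory` helper below is hypothetical and only used for illustration.

```python
import torch
import torch_npu  # noqa: F401  # assumed available; registers the torch.npu backend

from vllm_ascend.utils import clear_npu_memory


def report_free_memory() -> None:
    # Hypothetical helper: print free/total device memory in GiB.
    free_b, total_b = torch.npu.mem_get_info()
    print(f"free {free_b / (1 << 30):.2f} GiB / total {total_b / (1 << 30):.2f} GiB")


# Release cached blocks, run Python GC, and reset the peak-memory counters
# before taking a measurement, without going through the NPUPlatform class.
clear_npu_memory()
report_free_memory()
```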
The V0 NPU worker: the torch.distributed import is dropped, the new helper is imported, and determine_num_available_blocks now calls it (see the sketch after this diff).

@@ -23,7 +23,6 @@
 
 import msgpack  # type: ignore
 import torch
-import torch.distributed
 import zmq
 from torch import nn
 from vllm import envs
@@ -51,7 +50,7 @@
 from vllm_ascend.device_allocator.camem import CaMemAllocator
 from vllm_ascend.distributed.parallel_state import init_ascend_model_parallel
 from vllm_ascend.platform import NPUPlatform
-from vllm_ascend.utils import try_register_lib
+from vllm_ascend.utils import clear_npu_memory, try_register_lib
 from vllm_ascend.worker.model_runner import NPUModelRunner
 from vllm_ascend.worker.pooling_model_runner import NPUPoolingModelRunner
 
@@ -280,7 +279,7 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
         """
         # Profile the memory usage of the model and get the maximum number of
         # cache blocks that can be allocated with the remaining free memory.
-        NPUPlatform.empty_cache()
+        clear_npu_memory()
 
         # Execute a forward pass with dummy inputs to profile the memory usage
         # of the model.
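Note that the V0 worker previously called NPUPlatform.empty_cache() here. Assuming that method simply wraps torch.npu.empty_cache() (an assumption, not confirmed by this diff), the switch to clear_npu_memory() adds two extra steps before profiling, roughly:

```python
import gc

import torch

# Before (approximate, assuming NPUPlatform.empty_cache() wraps this call):
# only the caching allocator was flushed.
torch.npu.empty_cache()

# After: clear_npu_memory() also collects unreachable Python objects and
# resets the peak-memory statistics, so peaks recorded before this point do
# not pollute the measurement taken during the dummy forward pass.
gc.collect()
torch.npu.empty_cache()
torch.npu.reset_peak_memory_stats()
```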
The V1 NPU worker: the helper is imported and called in determine_available_memory.

@@ -42,7 +42,7 @@
 from vllm_ascend.device_allocator.camem import CaMemAllocator
 from vllm_ascend.distributed.parallel_state import init_ascend_model_parallel
 from vllm_ascend.platform import NPUPlatform
-from vllm_ascend.utils import try_register_lib
+from vllm_ascend.utils import clear_npu_memory, try_register_lib
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
 
 
@@ -136,7 +136,7 @@ def init_device(self):
     def determine_available_memory(self) -> int:
         # Profile the memory usage of the model and get the maximum number of
         # cache blocks that can be allocated with the remaining free memory.
-        NPUPlatform.clear_npu_memory()
+        clear_npu_memory()
 
         # Execute a forward pass with dummy inputs to profile the memory usage
         # of the model.
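Both call sites follow the usual profiling pattern: clear device memory, run a dummy forward pass, then derive the KV-cache budget from the observed peak. A simplified, hypothetical sketch of that flow is below; `profile_run` and `gpu_memory_utilization` stand in for the real model-runner hook and config value and are not part of this change.

```python
from typing import Callable

import torch

from vllm_ascend.utils import clear_npu_memory


def estimate_available_memory(profile_run: Callable[[], None],
                              gpu_memory_utilization: float) -> int:
    """Hypothetical sketch of the memory-profiling flow in the NPU workers."""
    # Start from a clean slate so the peak counter only reflects the
    # dummy forward pass executed below.
    clear_npu_memory()
    _, total = torch.npu.mem_get_info()

    profile_run()  # dummy forward pass with maximum-size inputs
    torch.npu.synchronize()

    peak = torch.npu.max_memory_allocated()
    # Memory left for the KV cache after honoring the utilization cap.
    return int(total * gpu_memory_utilization) - peak
```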