4 files changed (+11, -12 lines)
vllm_ascend/platform.py: the clear_npu_memory classmethod is removed from NPUPlatform.

@@ -15,7 +15,6 @@
 # This file is a part of the vllm-ascend project.
 #
 
-import gc
 import os
 from datetime import timedelta
 from typing import TYPE_CHECKING, Optional, Tuple
@@ -107,12 +106,6 @@ def synchronize(cls):
     def mem_get_info(cls) -> Tuple[int, int]:
         return torch.npu.mem_get_info()
 
-    @classmethod
-    def clear_npu_memory(cls):
-        gc.collect()
-        torch.npu.empty_cache()
-        torch.npu.reset_peak_memory_stats()
-
     @classmethod
     def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         # initialize ascend config from vllm additional_config
vllm_ascend/utils.py: clear_npu_memory is added as a module-level helper.

@@ -18,6 +18,7 @@
 #
 
 import atexit
+import gc
 import math
 from contextlib import contextmanager, nullcontext
 from enum import Enum
@@ -294,3 +295,9 @@ def get_fused_moe_state(ep_size: int, with_prefill: bool):
         return FusedMoEState.All2All
     else:
         return FusedMoEState.MC2
+
+
+def clear_npu_memory():
+    gc.collect()
+    torch.npu.empty_cache()
+    torch.npu.reset_peak_memory_stats()
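For context, a minimal sketch of how the relocated helper can be exercised as a plain function, independent of NPUPlatform. This assumes `torch_npu` is installed so the `torch.npu` namespace is available; the `report_free_memory` helper below is hypothetical and only used for illustration.

```python
import torch
import torch_npu  # noqa: F401  # assumed available; registers the torch.npu backend

from vllm_ascend.utils import clear_npu_memory


def report_free_memory() -> None:
    # Hypothetical helper: print free/total device memory in GiB.
    free_b, total_b = torch.npu.mem_get_info()
    print(f"free {free_b / (1 << 30):.2f} GiB / total {total_b / (1 << 30):.2f} GiB")


# Release cached blocks, run Python GC, and reset the peak-memory counters
# before taking a measurement, without going through the NPUPlatform class.
clear_npu_memory()
report_free_memory()
```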
The V0 NPU worker: the torch.distributed import is dropped, the new helper is imported, and determine_num_available_blocks now calls it (see the sketch after this diff).

@@ -23,7 +23,6 @@
 
 import msgpack  # type: ignore
 import torch
-import torch.distributed
 import zmq
 from torch import nn
 from vllm import envs
@@ -51,7 +50,7 @@
 from vllm_ascend.device_allocator.camem import CaMemAllocator
 from vllm_ascend.distributed.parallel_state import init_ascend_model_parallel
 from vllm_ascend.platform import NPUPlatform
-from vllm_ascend.utils import try_register_lib
+from vllm_ascend.utils import clear_npu_memory, try_register_lib
 from vllm_ascend.worker.model_runner import NPUModelRunner
 from vllm_ascend.worker.pooling_model_runner import NPUPoolingModelRunner
 
@@ -280,7 +279,7 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
         """
         # Profile the memory usage of the model and get the maximum number of
         # cache blocks that can be allocated with the remaining free memory.
-        NPUPlatform.empty_cache()
+        clear_npu_memory()
 
         # Execute a forward pass with dummy inputs to profile the memory usage
         # of the model.
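Note that the V0 worker previously called NPUPlatform.empty_cache() here. Assuming that method simply wraps torch.npu.empty_cache() (an assumption, not confirmed by this diff), the switch to clear_npu_memory() adds two extra steps before profiling, roughly:

```python
import gc

import torch

# Before (approximate, assuming NPUPlatform.empty_cache() wraps this call):
# only the caching allocator was flushed.
torch.npu.empty_cache()

# After: clear_npu_memory() also collects unreachable Python objects and
# resets the peak-memory statistics, so peaks recorded before this point do
# not pollute the measurement taken during the dummy forward pass.
gc.collect()
torch.npu.empty_cache()
torch.npu.reset_peak_memory_stats()
```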
The V1 NPU worker: the helper is imported and called in determine_available_memory.

@@ -42,7 +42,7 @@
 from vllm_ascend.device_allocator.camem import CaMemAllocator
 from vllm_ascend.distributed.parallel_state import init_ascend_model_parallel
 from vllm_ascend.platform import NPUPlatform
-from vllm_ascend.utils import try_register_lib
+from vllm_ascend.utils import clear_npu_memory, try_register_lib
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
 
 
@@ -136,7 +136,7 @@ def init_device(self):
     def determine_available_memory(self) -> int:
         # Profile the memory usage of the model and get the maximum number of
         # cache blocks that can be allocated with the remaining free memory.
-        NPUPlatform.clear_npu_memory()
+        clear_npu_memory()
 
         # Execute a forward pass with dummy inputs to profile the memory usage
         # of the model.
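Both call sites follow the usual profiling pattern: clear device memory, run a dummy forward pass, then derive the KV-cache budget from the observed peak. A simplified, hypothetical sketch of that flow is below; `profile_run` and `gpu_memory_utilization` stand in for the real model-runner hook and config value and are not part of this change.

```python
from typing import Callable

import torch

from vllm_ascend.utils import clear_npu_memory


def estimate_available_memory(profile_run: Callable[[], None],
                              gpu_memory_utilization: float) -> int:
    """Hypothetical sketch of the memory-profiling flow in the NPU workers."""
    # Start from a clean slate so the peak counter only reflects the
    # dummy forward pass executed below.
    clear_npu_memory()
    _, total = torch.npu.mem_get_info()

    profile_run()  # dummy forward pass with maximum-size inputs
    torch.npu.synchronize()

    peak = torch.npu.max_memory_allocated()
    # Memory left for the KV cache after honoring the utilization cap.
    return int(total * gpu_memory_utilization) - peak
```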