@@ -152,8 +152,13 @@ def __init__(self):
152152 self .pointer_to_data : dict [int , AllocationData ] = {}
153153 self .current_tag : str = CuMemAllocator .default_tag
154154 self .allocator_and_pools : dict [str , Any ] = {}
155+ # Create strong references to the two callbacks here to prevent
156+ # these ephemeral bound-method objects from being garbage collected.
157+ # See discussions in https://github.com/vllm-project/vllm/pull/22724
158+ self .python_malloc_callback = self ._python_malloc_callback
159+ self .python_free_callback = self ._python_free_callback
155160
156- def python_malloc_callback (self , allocation_handle : HandleType ) -> None :
161+ def _python_malloc_callback (self , allocation_handle : HandleType ) -> None :
157162 """
158163 Internal method to store the allocation data
159164 when memory is allocated in the memory pool."""
@@ -162,7 +167,7 @@ def python_malloc_callback(self, allocation_handle: HandleType) -> None:
162167 allocation_handle , self .current_tag )
163168 return
164169
165- def python_free_callback (self , ptr : int ) -> HandleType :
170+ def _python_free_callback (self , ptr : int ) -> HandleType :
166171 """
167172 Internal method to look up the allocation data
168173 when memory is freed in the memory pool."""
@@ -212,9 +217,9 @@ def sleep(
212217 def wake_up (self , tags : Optional [list [str ]] = None ) -> None :
213218 """
214219 Wake up the allocator from sleep mode.
215- All data that is previously offloaded will be loaded back to GPU
220+ All data that is previously offloaded will be loaded back to GPU
216221 memory, and the rest of the data will have empty memory.
217-
222+
218223 :param tags: The tags of the memory allocation that will be loaded
219224 back to GPU memory. If None, all memory allocation will be loaded
220225 back to GPU memory.
0 commit comments