
Commit ddb94c2

[core] Add tags parameter to wake_up() (#15500)
Signed-off-by: Eric <erictang000@gmail.com>
1 parent 90969fb commit ddb94c2

File tree

18 files changed: +143 -70 lines changed


tests/basic_correctness/test_cumem.py

Lines changed: 19 additions & 1 deletion
@@ -155,6 +155,24 @@ def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
 
     llm.wake_up()
     output2 = llm.generate(prompt, sampling_params)
-
     # cmp output
     assert output[0].outputs[0].text == output2[0].outputs[0].text
+
+    llm.sleep(level=1)
+    llm.wake_up(tags=["weights"])
+
+    free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info()
+    used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline
+
+    # should just reallocate memory for weights (1B model, ~2GiB weights)
+    if use_v1:
+        assert used_bytes < 10 * GiB_bytes
+    else:
+        assert used_bytes < 6 * GiB_bytes
+
+    # now allocate kv cache memory
+    llm.wake_up(tags=["kv_cache"])
+    output3 = llm.generate(prompt, sampling_params)
+
+    # cmp output
+    assert output[0].outputs[0].text == output3[0].outputs[0].text
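
Note: the memory assertions above follow a free-memory delta pattern. A standalone sketch of the same measurement, assuming (as the test does) that the baseline was captured with torch.cuda.mem_get_info() before the engine was created:

import torch

GiB_bytes = 1 << 30  # one GiB; stands in for the GiB_bytes constant the test uses

# before creating the engine: record what the rest of the system already uses
free_before, total = torch.cuda.mem_get_info()
used_bytes_baseline = total - free_before

# ... create the LLM, llm.sleep(level=1), llm.wake_up(tags=["weights"]) ...

# after the partial wake-up: the delta is the engine's own allocation
free_after, total = torch.cuda.mem_get_info()
used_bytes = total - free_after - used_bytes_baseline
assert used_bytes < 10 * GiB_bytes  # weights only; kv cache not yet allocated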

tests/entrypoints/openai/test_sleep.py

Lines changed: 25 additions & 4 deletions
@@ -25,16 +25,37 @@ def test_sleep_mode():
                                 "VLLM_SERVER_DEV_MODE": "1",
                                 "CUDA_VISIBLE_DEVICES": "0"
                             }) as remote_server:
+        response = requests.post(remote_server.url_for("sleep"),
+                                 params={"level": "1"})
+        assert response.status_code == 200
+        response = requests.get(remote_server.url_for("is_sleeping"))
+        assert response.status_code == 200
+        assert response.json().get("is_sleeping") is True
+
+        response = requests.post(remote_server.url_for("wake_up"))
+        assert response.status_code == 200
+        response = requests.get(remote_server.url_for("is_sleeping"))
+        assert response.status_code == 200
+        assert response.json().get("is_sleeping") is False
 
-        response = requests.post(remote_server.url_for("/sleep"),
+        # test wake up with tags
+        response = requests.post(remote_server.url_for("sleep"),
                                  params={"level": "1"})
         assert response.status_code == 200
-        response = requests.get(remote_server.url_for("/is_sleeping"))
+
+        response = requests.post(remote_server.url_for("wake_up"),
+                                 params={"tags": ["weights"]})
+        assert response.status_code == 200
+
+        # is sleeping should be false after waking up any part of the engine
+        response = requests.get(remote_server.url_for("is_sleeping"))
         assert response.status_code == 200
         assert response.json().get("is_sleeping") is True
 
-        response = requests.post(remote_server.url_for("/wake_up"))
+        response = requests.post(remote_server.url_for("wake_up"),
+                                 params={"tags": ["kv_cache"]})
         assert response.status_code == 200
-        response = requests.get(remote_server.url_for("/is_sleeping"))
+
+        response = requests.get(remote_server.url_for("is_sleeping"))
         assert response.status_code == 200
         assert response.json().get("is_sleeping") is False
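
Note: outside the test harness, the same dev-mode endpoints can be driven directly. A minimal sketch, assuming a server started with VLLM_SERVER_DEV_MODE=1 and listening on localhost:8000 (the address is illustrative, not part of this commit):

import requests

base = "http://localhost:8000"  # assumed address

requests.post(f"{base}/sleep", params={"level": "1"})
requests.post(f"{base}/wake_up", params={"tags": ["weights"]})
# wake the kv cache as well before sending traffic again
requests.post(f"{base}/wake_up", params={"tags": ["kv_cache"]})
print(requests.get(f"{base}/is_sleeping").json())  # expect is_sleeping == False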

vllm/device_allocator/cumem.py

Lines changed: 19 additions & 13 deletions
@@ -208,22 +208,28 @@ def sleep(
         gc.collect()
         torch.cuda.empty_cache()
 
-    def wake_up(self):
+    def wake_up(self, tags: Optional[list[str]] = None) -> None:
         """
         Wake up the allocator from sleep mode.
-        All data that is previously offloaded will be loaded back to GPU
-        memory, and the rest of the data will have empty memory."""
+        All data that is previously offloaded will be loaded back to GPU
+        memory, and the rest of the data will have empty memory.
+
+        :param tags: The tags of the memory allocation that will be loaded
+            back to GPU memory. If None, all memory allocation will be loaded
+            back to GPU memory.
+        """
         for ptr, data in self.pointer_to_data.items():
-            handle = data.handle
-            create_and_map(handle)
-            if data.cpu_backup_tensor is not None:
-                cpu_backup_tensor = data.cpu_backup_tensor
-                if cpu_backup_tensor is not None:
-                    size_in_bytes = cpu_backup_tensor.numel(
-                    ) * cpu_backup_tensor.element_size()
-                    cpu_ptr = cpu_backup_tensor.data_ptr()
-                    libcudart.cudaMemcpy(ptr, cpu_ptr, size_in_bytes)
-                    data.cpu_backup_tensor = None
+            if tags is None or data.tag in tags:
+                handle = data.handle
+                create_and_map(handle)
+                if data.cpu_backup_tensor is not None:
+                    cpu_backup_tensor = data.cpu_backup_tensor
+                    if cpu_backup_tensor is not None:
+                        size_in_bytes = cpu_backup_tensor.numel(
+                        ) * cpu_backup_tensor.element_size()
+                        cpu_ptr = cpu_backup_tensor.data_ptr()
+                        libcudart.cudaMemcpy(ptr, cpu_ptr, size_in_bytes)
+                        data.cpu_backup_tensor = None
 
     @contextmanager
     def use_memory_pool(self, tag: Optional[str] = None):
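
Note: the only behavioral change in the allocator is the tag filter at the top of the loop; an allocation is remapped (and restored from its CPU backup, if any) only when tags is None or its tag is listed. A reduced sketch of that selection logic, using a hypothetical stand-in for the allocator's per-pointer record:

from dataclasses import dataclass
from typing import Any, Optional

@dataclass
class AllocationData:  # hypothetical stand-in, not the real cumem record
    handle: Any
    tag: str
    cpu_backup_tensor: Optional[Any] = None

def allocations_to_wake(pointer_to_data: dict[int, AllocationData],
                        tags: Optional[list[str]]) -> dict[int, AllocationData]:
    # mirrors the `if tags is None or data.tag in tags` check in wake_up()
    return {ptr: data for ptr, data in pointer_to_data.items()
            if tags is None or data.tag in tags}

table = {0x10: AllocationData("h0", "weights"),
         0x20: AllocationData("h1", "kv_cache")}
assert list(allocations_to_wake(table, ["weights"])) == [0x10]
assert len(allocations_to_wake(table, None)) == 2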

vllm/engine/async_llm_engine.py

Lines changed: 2 additions & 2 deletions
@@ -1225,8 +1225,8 @@ async def reset_prefix_cache(self,
     async def sleep(self, level: int = 1) -> None:
         self.engine.sleep(level)
 
-    async def wake_up(self) -> None:
-        self.engine.wake_up()
+    async def wake_up(self, tags: Optional[list[str]] = None) -> None:
+        self.engine.wake_up(tags)
 
     async def is_sleeping(self) -> bool:
         return self.engine.is_sleeping()

vllm/engine/llm_engine.py

Lines changed: 2 additions & 2 deletions
@@ -1938,10 +1938,10 @@ def sleep(self, level: int = 1) -> None:
             "Sleep mode is not enabled in the model config")
         self.model_executor.sleep(level=level)
 
-    def wake_up(self) -> None:
+    def wake_up(self, tags: Optional[list[str]] = None) -> None:
         assert self.vllm_config.model_config.enable_sleep_mode, (
             "Sleep mode is not enabled in the model config")
-        self.model_executor.wake_up()
+        self.model_executor.wake_up(tags)
 
     def is_sleeping(self) -> bool:
         return self.model_executor.is_sleeping

vllm/engine/multiprocessing/__init__.py

Lines changed: 3 additions & 2 deletions
@@ -133,8 +133,9 @@ class RPCSleepRequest(Enum):
     SLEEP_LEVEL_2 = 2
 
 
-class RPCWakeUpRequest(Enum):
-    WAKE_UP = 1
+@dataclass
+class RPCWakeUpRequest:
+    tags: Optional[list[str]] = None
 
 
 @dataclass
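
Note: RPCWakeUpRequest moves from a payload-less Enum to a @dataclass because the request now has to carry the tag list across the process boundary, and a dataclass serializes its fields along with the type. A round-trip sketch (the explicit pickle call is illustrative of the RPC transport, not code from this commit):

import pickle
from dataclasses import dataclass
from typing import Optional

@dataclass
class RPCWakeUpRequest:
    tags: Optional[list[str]] = None

# the tag list survives serialization, which a bare Enum member could not carry
wire = pickle.dumps(RPCWakeUpRequest(tags=["weights"]))
assert pickle.loads(wire).tags == ["weights"]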

vllm/engine/multiprocessing/client.py

Lines changed: 2 additions & 2 deletions
@@ -697,10 +697,10 @@ async def sleep(self, level: int = 1) -> None:
         return await self._send_one_way_rpc_request(
             request=RPCSleepRequest(level), socket=self.input_socket)
 
-    async def wake_up(self) -> None:
+    async def wake_up(self, tags: Optional[list[str]] = None) -> None:
         """Wake up the engine"""
         return await self._send_one_way_rpc_request(
-            request=RPCWakeUpRequest.WAKE_UP, socket=self.input_socket)
+            request=RPCWakeUpRequest(tags), socket=self.input_socket)
 
     async def is_sleeping(self) -> bool:
         """Check whether the engine is sleeping"""

vllm/engine/multiprocessing/engine.py

Lines changed: 3 additions & 3 deletions
@@ -274,7 +274,7 @@ def handle_new_input(self):
                 elif isinstance(request, RPCSleepRequest):
                     self.sleep(request.value)
                 elif isinstance(request, RPCWakeUpRequest):
-                    self.wake_up()
+                    self.wake_up(request.tags)
                 elif isinstance(request, RPCIsSleepingRequest):
                     self._handle_is_sleeping_request(request)
                 else:

@@ -415,8 +415,8 @@ def reset_prefix_cache(self) -> bool:
     def sleep(self, level: int = 1) -> None:
         self.engine.sleep(level)
 
-    def wake_up(self) -> None:
-        self.engine.wake_up()
+    def wake_up(self, tags: Optional[list[str]] = None) -> None:
+        self.engine.wake_up(tags)
 
     def is_sleeping(self) -> bool:
         return self.engine.is_sleeping()

vllm/engine/protocol.py

Lines changed: 1 addition & 1 deletion
@@ -282,7 +282,7 @@ async def sleep(self, level: int = 1) -> None:
         ...
 
     @abstractmethod
-    async def wake_up(self) -> None:
+    async def wake_up(self, tags: Optional[list[str]] = None) -> None:
         """Wake up the engine"""
         ...

vllm/entrypoints/llm.py

Lines changed: 23 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1200,26 +1200,35 @@ def sleep(self, level: int = 1):
12001200
The caller should guarantee that no requests are being processed
12011201
during the sleep period, before `wake_up` is called.
12021202
1203-
:param level: The sleep level. Level 1 sleep will offload the model
1204-
weights and discard the kv cache. The content of kv cache is
1205-
forgotten. Level 1 sleep is good for sleeping and waking up the
1206-
engine to run the same model again. The model weights are backed
1207-
up in CPU memory. Please make sure there's enough CPU memory to
1208-
store the model weights. Level 2 sleep will discard both the model
1209-
weights and the kv cache. The content of both the model weights
1210-
and kv cache is forgotten. Level 2 sleep is good for sleeping and
1211-
waking up the engine to run a different model or update the model,
1212-
where previous model weights are not needed. It reduces CPU memory
1213-
pressure.
1203+
Args:
1204+
level: The sleep level. Level 1 sleep will offload the model
1205+
weights and discard the kv cache. The content of kv cache
1206+
is forgotten. Level 1 sleep is good for sleeping and waking
1207+
up the engine to run the same model again. The model weights
1208+
are backed up in CPU memory. Please make sure there's enough
1209+
CPU memory to store the model weights. Level 2 sleep will
1210+
discard both the model weights and the kv cache. The content
1211+
of both the model weights and kv cache is forgotten. Level 2
1212+
sleep is good for sleeping and waking up the engine to run a
1213+
different model or update the model, where previous model
1214+
weights are not needed. It reduces CPU memory pressure.
12141215
"""
12151216
self.reset_prefix_cache()
12161217
self.llm_engine.sleep(level=level)
12171218

1218-
def wake_up(self):
1219+
def wake_up(self, tags: Optional[list[str]] = None):
12191220
"""
12201221
Wake up the engine from sleep mode. See the :meth:`sleep` method
1221-
for more details."""
1222-
self.llm_engine.wake_up()
1222+
for more details.
1223+
1224+
Args:
1225+
tags: An optional list of tags to reallocate the engine memory
1226+
for specific memory allocations. Values must be in
1227+
("weights", "kv_cache",). If None, all memory is reallocated.
1228+
wake_up should be called with all tags (or None) before the
1229+
engine is used again.
1230+
"""
1231+
self.llm_engine.wake_up(tags)
12231232

12241233
# LEGACY
12251234
def _convert_v1_inputs(
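
Note: putting the new API together, a usage sketch based on the docstring and tests above (the model name is illustrative; assumes a CUDA device and enable_sleep_mode):

from vllm import LLM, SamplingParams

llm = LLM("Qwen/Qwen2.5-0.5B-Instruct", enable_sleep_mode=True)

llm.sleep(level=1)              # weights offloaded to CPU, kv cache discarded
llm.wake_up(tags=["weights"])   # restore weights only, e.g. to update them in place
llm.wake_up(tags=["kv_cache"])  # all tags woken; the engine is usable again

out = llm.generate(["Hello"], SamplingParams(temperature=0))
print(out[0].outputs[0].text)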
