@@ -1200,26 +1200,35 @@ def sleep(self, level: int = 1):
12001200 The caller should guarantee that no requests are being processed
12011201 during the sleep period, before `wake_up` is called.
12021202
1203- :param level: The sleep level. Level 1 sleep will offload the model
1204- weights and discard the kv cache. The content of kv cache is
1205- forgotten. Level 1 sleep is good for sleeping and waking up the
1206- engine to run the same model again. The model weights are backed
1207- up in CPU memory. Please make sure there's enough CPU memory to
1208- store the model weights. Level 2 sleep will discard both the model
1209- weights and the kv cache. The content of both the model weights
1210- and kv cache is forgotten. Level 2 sleep is good for sleeping and
1211- waking up the engine to run a different model or update the model,
1212- where previous model weights are not needed. It reduces CPU memory
1213- pressure.
1203+ Args:
1204+ level: The sleep level. Level 1 sleep will offload the model
1205+ weights and discard the kv cache. The content of kv cache
1206+ is forgotten. Level 1 sleep is good for sleeping and waking
1207+ up the engine to run the same model again. The model weights
1208+ are backed up in CPU memory. Please make sure there's enough
1209+ CPU memory to store the model weights. Level 2 sleep will
1210+ discard both the model weights and the kv cache. The content
1211+ of both the model weights and kv cache is forgotten. Level 2
1212+ sleep is good for sleeping and waking up the engine to run a
1213+ different model or update the model, where previous model
1214+ weights are not needed. It reduces CPU memory pressure.
12141215 """
12151216 self .reset_prefix_cache ()
12161217 self .llm_engine .sleep (level = level )
12171218
1218- def wake_up (self ):
1219+ def wake_up (self , tags : Optional [ list [ str ]] = None ):
12191220 """
12201221 Wake up the engine from sleep mode. See the :meth:`sleep` method
1221- for more details."""
1222- self .llm_engine .wake_up ()
1222+ for more details.
1223+
1224+ Args:
1225+ tags: An optional list of tags selecting which memory
1226+ allocations to restore. Each value must be one of
1227+ ("weights", "kv_cache"). If None, all memory is reallocated.
1228+ wake_up must be called with every tag (or with None) before
1229+ the engine is used again.
1230+ """
1231+ self .llm_engine .wake_up (tags )
12231232
12241233 # LEGACY
12251234 def _convert_v1_inputs (
0 commit comments