2828 --proc-per-node=2
2929 MOE models:
3030 python examples/offline_external_launcher.py \
31- --model="Qwen/Qwen3-0.6B " \
31+        --model="Qwen/Qwen3-30B-A3B" \
3232 --tp-size=2 \
3333 --proc-per-node=2 \
3434 --enable-expert-parallel
3535
3636Multi-node:
3737 Node 0 (assume the node has ip of 10.99.48.128):
3838 python examples/offline_external_launcher.py \
39- --model="Qwen/Qwen3-0.6B " \
39+            --model="Qwen/Qwen3-30B-A3B" \
4040 --tp-size=2 \
4141 --node-size=2 \
4242 --node-rank=0 \
4646 --master-port=13345
4747 Node 1:
4848 python examples/offline_external_launcher.py \
49- --model="Qwen/Qwen3-0.6B " \
49+            --model="Qwen/Qwen3-30B-A3B" \
5050 --tp-size=2 \
5151 --node-size=2 \
5252 --node-rank=1 \
6666from vllm import LLM , SamplingParams
6767from vllm .distributed .parallel_state import ( # noqa E402
6868 destroy_distributed_environment , destroy_model_parallel , get_tp_group )
69- from vllm .utils import get_open_port
69+ from vllm .utils import get_open_port , GiB_bytes
7070
7171os .environ ["VLLM_USE_MODELSCOPE" ] = "True"
7272os .environ ["VLLM_WORKER_MULTIPROC_METHOD" ] = "spawn"
@@ -114,20 +114,44 @@ def parse_args():
114114 parser .add_argument ("--enable-expert-parallel" ,
115115 action = "store_true" ,
116116 help = "Enable expert parallel, used in MOE models." )
117- return parser .parse_args ()
117+ parser .add_argument ("--enable-sleep-mode" ,
118+ action = "store_true" ,
119+ help = "Enable sleep mode for the engine." )
120+ parser .add_argument ("--temperature" ,
121+ type = float ,
122+ default = 0.8 ,
123+ help = "Float that controls the randomness of the sampling." )
124+ parser .add_argument ("--model-weight-gib" ,
125+ type = float ,
126+ default = None ,
126+                        help="Model weight memory usage in GiB (e.g., 1.0 for a 0.5B model).")
128+
129+    args = parser.parse_args()
130+    if args.enable_sleep_mode:
131+        if args.model_weight_gib is None:
132+            parser.error("model-weight-gib must be provided when enable-sleep-mode is set.")
133+        if args.model_weight_gib <= 0:
134+            parser.error("model-weight-gib must be greater than 0 when enable-sleep-mode is set.")
135+        if args.temperature != 0:
136+            parser.error("temperature must be zero when enable-sleep-mode is set, so outputs can be compared after wake-up.")
137+
138+ return args
118139
119140
120141def main (
121142 local_rank : int ,
122143 rank : int ,
123144 master_addr : str ,
124145 master_port : int ,
146+ model_weight_gib : float ,
125147 model : str = "Qwen/Qwen3-0.6B" ,
126148 world_size : int = 4 ,
127149 tensor_parallel_size : int = 2 ,
128150 enable_expert_parallel : bool = False ,
129151 enforce_eager : bool = False ,
130152 trust_remote_code : bool = True ,
153+ enable_sleep_mode : bool = False ,
154+ temperature : float = 0.8 ,
131155):
132156 os .environ ["MASTER_ADDR" ] = master_addr
133157 os .environ ["MASTER_PORT" ] = str (master_port )
@@ -147,7 +171,7 @@ def main(
147171 "The future of AI is" ,
148172 ] * 10
149173 sampling_params = SamplingParams (
150- temperature = 0.8 ,
174+ temperature = temperature ,
151175 top_p = 0.95 ,
152176 max_tokens = 10 ,
153177 )
@@ -159,10 +183,31 @@ def main(
159183 trust_remote_code = trust_remote_code ,
160184 distributed_executor_backend = "external_launcher" ,
161185 seed = 0 ,
186+ enable_sleep_mode = enable_sleep_mode ,
162187 )
163188 tp_ranks = get_tp_group ().ranks
164189 print (f'TP RANKS: { tp_ranks } ' )
190+
165191 outputs = llm .generate (prompts , sampling_params )
192+
193+ if enable_sleep_mode :
194+ if rank == 0 :
195+ free_bytes_before_sleep , total = torch .npu .mem_get_info ()
196+ llm .sleep (level = 1 )
197+ if rank == 0 :
198+ free_bytes_after_sleep , total = torch .npu .mem_get_info ()
199+ freed_bytes = free_bytes_after_sleep - free_bytes_before_sleep
200+ print (f"Freed memory: { freed_bytes / 1024 ** 3 :.2f} GiB" )
201+ # now the freed memory should be larger than the model weights
202+ assert freed_bytes >= model_weight_gib / tensor_parallel_size * GiB_bytes
203+
204+ llm .wake_up ()
205+ outputs_after_wakeup = llm .generate (prompts , sampling_params )
206+ if rank == 0 :
207+ # cmp output
208+ assert outputs [0 ].outputs [0 ].text == outputs_after_wakeup [0 ].outputs [0 ].text
209+            print("Sleep and wake up succeeded!")
210+
166211 for i , output in enumerate (outputs ):
167212 if i >= 5 :
168213 # print only 5 outputs
@@ -214,12 +259,15 @@ def cleanup_env_and_memory():
214259 rank ,
215260 master_addr ,
216261 master_port ,
262+ args .model_weight_gib ,
217263 args .model ,
218264 world_size ,
219265 tp_size ,
220266 args .enable_expert_parallel ,
221267 args .enforce_eager ,
222268 args .trust_remote_code ,
269+ args .enable_sleep_mode ,
270+ args .temperature ,
223271 ))
224272
225273 proc .start ()
0 commit comments