@@ -117,68 +117,59 @@ def model(x):
 
 @create_new_process_for_each_test()
 @pytest.mark.parametrize(
-    "model, use_v1",
+    "model",
     [
         # sleep mode with safetensors
-        ("meta-llama/Llama-3.2-1B", True),
+        "meta-llama/Llama-3.2-1B",
         # sleep mode with pytorch checkpoint
-        ("facebook/opt-125m", True),
+        "facebook/opt-125m",
     ],
 )
-def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
-    with monkeypatch.context() as m:
-        assert use_v1
-        m.setenv("VLLM_USE_V1", "1")
-        free, total = torch.cuda.mem_get_info()
-        used_bytes_baseline = total - free  # in case other process is running
-        llm = LLM(model, enable_sleep_mode=True)
-        prompt = "How are you?"
-        sampling_params = SamplingParams(temperature=0, max_tokens=10)
-        output = llm.generate(prompt, sampling_params)
-
-        # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
-        # which is difficult to measure in the test. therefore, we only
-        # test sleep level 1 here.
-        llm.sleep(level=1)
-
-        free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
-        used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
-        # now the memory usage is mostly cudagraph memory pool,
-        # and it should be less than the model weights (1B model, 2GiB weights)
-
-        # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size)
-        # is captured but cannot be released from PyTorch due to a known bug,
-        # therefore high memory usage after `llm.sleep` is called is expected.
-        # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
-        # in V1.
-        if use_v1:
-            assert used_bytes < 7 * GiB_bytes
-        else:
-            assert used_bytes < 2 * GiB_bytes
-
-        llm.wake_up()
-        output2 = llm.generate(prompt, sampling_params)
-        # cmp output
-        assert output[0].outputs[0].text == output2[0].outputs[0].text
-
-        llm.sleep(level=1)
-        llm.wake_up(tags=["weights"])
-
-        free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info()
-        used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline
-
-        # should just reallocate memory for weights (1B model, ~2GiB weights)
-        if use_v1:
-            assert used_bytes < 10 * GiB_bytes
-        else:
-            assert used_bytes < 6 * GiB_bytes
-
-        # now allocate kv cache memory
-        llm.wake_up(tags=["kv_cache"])
-        output3 = llm.generate(prompt, sampling_params)
-
-        # cmp output
-        assert output[0].outputs[0].text == output3[0].outputs[0].text
+def test_end_to_end(model: str):
+    free, total = torch.cuda.mem_get_info()
+    used_bytes_baseline = total - free  # in case other process is running
+    llm = LLM(model, enable_sleep_mode=True)
+    prompt = "How are you?"
+    sampling_params = SamplingParams(temperature=0, max_tokens=10)
+    output = llm.generate(prompt, sampling_params)
+
+    # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
+    # which is difficult to measure in the test. therefore, we only
+    # test sleep level 1 here.
+    llm.sleep(level=1)
+
+    free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
+    used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
+    # now the memory usage is mostly cudagraph memory pool,
+    # and it should be less than the model weights (1B model, 2GiB weights)
+
+    # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size)
+    # is captured but cannot be released from PyTorch due to a known bug,
+    # therefore high memory usage after `llm.sleep` is called is expected.
+    # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
+    # in V1.
+    assert used_bytes < 7 * GiB_bytes
+
+    llm.wake_up()
+    output2 = llm.generate(prompt, sampling_params)
+    # cmp output
+    assert output[0].outputs[0].text == output2[0].outputs[0].text
+
+    llm.sleep(level=1)
+    llm.wake_up(tags=["weights"])
+
+    free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info()
+    used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline
+
+    # should just reallocate memory for weights (1B model, ~2GiB weights)
+    assert used_bytes < 10 * GiB_bytes
+
+    # now allocate kv cache memory
+    llm.wake_up(tags=["kv_cache"])
+    output3 = llm.generate(prompt, sampling_params)
+
+    # cmp output
+    assert output[0].outputs[0].text == output3[0].outputs[0].text
 
 
 @create_new_process_for_each_test()
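
For reference, here is a minimal standalone sketch (not part of this commit) of the sleep/wake_up workflow that `test_end_to_end` exercises. It assumes a CUDA GPU and an installed vLLM with sleep-mode support; the model name comes from the parametrization above, and the printed bookkeeping is purely illustrative.

# Minimal sketch of the sleep-mode API exercised by test_end_to_end above.
# Assumptions (not part of this commit): a CUDA GPU, vLLM installed, and the
# Llama-3.2-1B checkpoint available; the printed bookkeeping is illustrative.
import torch
from vllm import LLM, SamplingParams

llm = LLM("meta-llama/Llama-3.2-1B", enable_sleep_mode=True)
params = SamplingParams(temperature=0, max_tokens=10)
baseline = llm.generate("How are you?", params)

# Level-1 sleep keeps the process alive but releases GPU memory:
# weights are offloaded to CPU and the KV cache is discarded.
llm.sleep(level=1)
free, total = torch.cuda.mem_get_info()
print(f"GPU bytes in use while sleeping: {total - free}")

# Wake up in stages: reload the weights first, then reallocate the KV cache,
# and check that greedy decoding reproduces the original output.
llm.wake_up(tags=["weights"])
llm.wake_up(tags=["kv_cache"])
resumed = llm.generate("How are you?", params)
assert baseline[0].outputs[0].text == resumed[0].outputs[0].text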