@@ -47,6 +47,7 @@ def test_vllm_gc_ed():
 @pytest.mark.parametrize("max_tokens", [5])
 @pytest.mark.parametrize("enforce_eager", [False])
 def test_models(
+    monkeypatch: pytest.MonkeyPatch,
     hf_runner,
     model: str,
     backend: str,
@@ -63,31 +64,33 @@ def test_models(
         pytest.skip(
             f"{backend} does not support gemma2 with full context length.")

-    os.environ["VLLM_ATTENTION_BACKEND"] = backend
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_ATTENTION_BACKEND", backend)

-    # 5042 tokens for gemma2
-    # gemma2 has alternating sliding window size of 4096
-    # we need a prompt with more than 4096 tokens to test the sliding window
-    prompt = "The following numbers of the sequence " + ", ".join(
-        str(i) for i in range(1024)) + " are:"
-    example_prompts = [prompt]
+        # 5042 tokens for gemma2
+        # gemma2 has alternating sliding window size of 4096
+        # we need a prompt with more than 4096 tokens to test the sliding window
+        prompt = "The following numbers of the sequence " + ", ".join(
+            str(i) for i in range(1024)) + " are:"
+        example_prompts = [prompt]

-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+        with hf_runner(model, dtype=dtype) as hf_model:
+            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

-    with VllmRunner(model,
-                    max_model_len=8192,
-                    dtype=dtype,
-                    enforce_eager=enforce_eager,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        with VllmRunner(model,
+                        max_model_len=8192,
+                        dtype=dtype,
+                        enforce_eager=enforce_eager,
+                        gpu_memory_utilization=0.7) as vllm_model:
+            vllm_outputs = vllm_model.generate_greedy(example_prompts,
+                                                      max_tokens)

-    check_outputs_equal(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
+        check_outputs_equal(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
+        )


 @multi_gpu_test(num_gpus=2)
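For context on the pattern this hunk introduces: `monkeypatch.context()` yields a `MonkeyPatch` whose changes are undone when the `with` block exits, so `VLLM_ATTENTION_BACKEND` no longer leaks into later parametrized tests the way a bare `os.environ` write does. A minimal standalone sketch of that behavior, not part of this diff and assuming only the stock pytest API (test name and backend value are illustrative):

```python
import os

import pytest


def test_setenv_is_scoped(monkeypatch: pytest.MonkeyPatch):
    # Start from a known state regardless of the outer environment.
    monkeypatch.delenv("VLLM_ATTENTION_BACKEND", raising=False)

    with monkeypatch.context() as m:
        m.setenv("VLLM_ATTENTION_BACKEND", "FLASH_ATTN")
        assert os.environ["VLLM_ATTENTION_BACKEND"] == "FLASH_ATTN"

    # Everything set through `m` is rolled back on exiting the block.
    assert "VLLM_ATTENTION_BACKEND" not in os.environ
```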
@@ -104,6 +107,7 @@ def test_models(
         ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
     ])
 def test_models_distributed(
+    monkeypatch: pytest.MonkeyPatch,
     hf_runner,
     vllm_runner,
     example_prompts,
@@ -116,34 +120,41 @@ def test_models_distributed(
     if test_suite != TARGET_TEST_SUITE:
         pytest.skip(f"Skip test for {test_suite}")

-    if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
-        # test Ray Compiled Graph
-        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
-        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
-
-    if attention_backend:
-        os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
-
-    dtype = "half"
-    max_tokens = 5
-
-    # NOTE: take care of the order. run vLLM first, and then run HF.
-    # vLLM needs a fresh new process without cuda initialization.
-    # if we run HF first, the cuda initialization will be done and it
-    # will hurt multiprocessing backend with fork method (the default method).
-    with vllm_runner(model,
-                     dtype=dtype,
-                     tensor_parallel_size=2,
-                     distributed_executor_backend=distributed_executor_backend
-                     ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-
-    check_outputs_equal(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
+    with monkeypatch.context() as monkeypatch_context:
+        if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
+            # test Ray Compiled Graph
+            monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
+            monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
+
+        if attention_backend:
+            monkeypatch_context.setenv(
+                "VLLM_ATTENTION_BACKEND",
+                attention_backend,
+            )
+
+        dtype = "half"
+        max_tokens = 5
+
+        # NOTE: take care of the order. run vLLM first, and then run HF.
+        # vLLM needs a fresh new process without cuda initialization.
+        # if we run HF first, the cuda initialization will be done and it
+        # will hurt multiprocessing backend with fork method
+        # (the default method).
+        with vllm_runner(
+                model,
+                dtype=dtype,
+                tensor_parallel_size=2,
+                distributed_executor_backend=distributed_executor_backend,
+        ) as vllm_model:
+            vllm_outputs = vllm_model.generate_greedy(example_prompts,
+                                                      max_tokens)
+
+        with hf_runner(model, dtype=dtype) as hf_model:
+            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+
+        check_outputs_equal(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
+        )
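This hunk applies the same idea to the distributed test, where the old code wrote `os.environ` directly and could leave `VLLM_USE_RAY_SPMD_WORKER` / `VLLM_USE_RAY_COMPILED_DAG` set for every later test in the same process. A rough sketch of the difference, illustrative only and with hypothetical test names:

```python
import os

import pytest


def test_direct_write_leaks():
    # A plain assignment survives this test and is visible to whatever
    # test runs next in the same process.
    os.environ["VLLM_USE_RAY_SPMD_WORKER"] = "1"


def test_monkeypatch_write_is_rolled_back(monkeypatch: pytest.MonkeyPatch):
    # pytest undoes this setenv during teardown of this test, so later
    # tests see the environment as it was before.
    monkeypatch.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
    assert os.environ["VLLM_USE_RAY_COMPILED_DAG"] == "1"
```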