@@ -47,6 +47,7 @@ def test_vllm_gc_ed():
 @pytest.mark.parametrize("max_tokens", [5])
 @pytest.mark.parametrize("enforce_eager", [False])
 def test_models(
+    monkeypatch: pytest.MonkeyPatch,
     hf_runner,
     model: str,
     backend: str,
@@ -63,31 +64,33 @@ def test_models(
         pytest.skip(
             f"{backend} does not support gemma2 with full context length.")
 
-    os.environ["VLLM_ATTENTION_BACKEND"] = backend
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_ATTENTION_BACKEND", backend)
 
-    # 5042 tokens for gemma2
-    # gemma2 has alternating sliding window size of 4096
-    # we need a prompt with more than 4096 tokens to test the sliding window
-    prompt = "The following numbers of the sequence " + ", ".join(
-        str(i) for i in range(1024)) + " are:"
-    example_prompts = [prompt]
+        # 5042 tokens for gemma2
+        # gemma2 has alternating sliding window size of 4096
+        # we need a prompt with more than 4096 tokens to test the sliding window
+        prompt = "The following numbers of the sequence " + ", ".join(
+            str(i) for i in range(1024)) + " are:"
+        example_prompts = [prompt]
 
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+        with hf_runner(model, dtype=dtype) as hf_model:
+            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
-    with VllmRunner(model,
-                    max_model_len=8192,
-                    dtype=dtype,
-                    enforce_eager=enforce_eager,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        with VllmRunner(model,
+                        max_model_len=8192,
+                        dtype=dtype,
+                        enforce_eager=enforce_eager,
+                        gpu_memory_utilization=0.7) as vllm_model:
+            vllm_outputs = vllm_model.generate_greedy(example_prompts,
+                                                      max_tokens)
 
-    check_outputs_equal(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
+        check_outputs_equal(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
+        )
 
 
 @multi_gpu_test(num_gpus=2)
@@ -104,6 +107,7 @@ def test_models(
     ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
 ])
 def test_models_distributed(
+    monkeypatch: pytest.MonkeyPatch,
     hf_runner,
     vllm_runner,
     example_prompts,
@@ -116,34 +120,41 @@ def test_models_distributed(
     if test_suite != TARGET_TEST_SUITE:
         pytest.skip(f"Skip test for {test_suite}")
 
-    if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
-        # test Ray Compiled Graph
-        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
-        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
-
-    if attention_backend:
-        os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
-
-    dtype = "half"
-    max_tokens = 5
-
-    # NOTE: take care of the order. run vLLM first, and then run HF.
-    # vLLM needs a fresh new process without cuda initialization.
-    # if we run HF first, the cuda initialization will be done and it
-    # will hurt multiprocessing backend with fork method (the default method).
-    with vllm_runner(model,
-                     dtype=dtype,
-                     tensor_parallel_size=2,
-                     distributed_executor_backend=distributed_executor_backend
-                     ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-
-    check_outputs_equal(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
+    with monkeypatch.context() as monkeypatch_context:
+        if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
+            # test Ray Compiled Graph
+            monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
+            monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
+
+        if attention_backend:
+            monkeypatch_context.setenv(
+                "VLLM_ATTENTION_BACKEND",
+                attention_backend,
+            )
+
+        dtype = "half"
+        max_tokens = 5
+
+        # NOTE: take care of the order. run vLLM first, and then run HF.
+        # vLLM needs a fresh new process without cuda initialization.
+        # if we run HF first, the cuda initialization will be done and it
+        # will hurt multiprocessing backend with fork method
+        # (the default method).
+        with vllm_runner(
+                model,
+                dtype=dtype,
+                tensor_parallel_size=2,
+                distributed_executor_backend=distributed_executor_backend,
+        ) as vllm_model:
+            vllm_outputs = vllm_model.generate_greedy(example_prompts,
+                                                      max_tokens)
+
+        with hf_runner(model, dtype=dtype) as hf_model:
+            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+
+        check_outputs_equal(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
+        )
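
Note on the pattern adopted above: pytest's built-in monkeypatch fixture scopes environment changes to the test, and monkeypatch.context() yields a MonkeyPatch object whose setenv calls are undone when the with block exits, so a backend selected by one parametrized test cannot leak into later tests the way a bare os.environ assignment can. Below is a minimal standalone sketch of that behavior, not vLLM code; read_backend is a hypothetical helper and the chosen backend value is illustrative.

import os

import pytest


def read_backend() -> str:
    # Hypothetical helper: code under test reads the env var at call time.
    return os.environ.get("VLLM_ATTENTION_BACKEND", "default")


def test_backend_env(monkeypatch: pytest.MonkeyPatch):
    # Start from a known state; delenv is itself reverted after the test.
    monkeypatch.delenv("VLLM_ATTENTION_BACKEND", raising=False)

    # setenv inside monkeypatch.context() is rolled back when the block exits,
    # unlike a bare `os.environ[...] = value` assignment.
    with monkeypatch.context() as m:
        m.setenv("VLLM_ATTENTION_BACKEND", "FLASH_ATTN")
        assert read_backend() == "FLASH_ATTN"

    # Once the context closes, the original environment is restored.
    assert read_backend() == "default"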