@@ -14,8 +14,9 @@
 
 import pytest
 
-from vllm.config import TaskOption
+from vllm.config import _FLOAT16_NOT_SUPPORTED_MODELS, TaskOption
 from vllm.logger import init_logger
+from vllm.transformers_utils.config import get_config
 
 from ..models.registry import HF_EXAMPLE_MODELS
 from ..utils import compare_two_settings, create_new_process_for_each_test
@@ -158,7 +159,7 @@ def iter_params(self, model_id: str):
158159 "databricks/dbrx-instruct" : PPTestSettings .fast (load_format = "dummy" ),
159160 "Deci/DeciLM-7B-instruct" : PPTestSettings .fast (),
160161 "deepseek-ai/deepseek-llm-7b-chat" : PPTestSettings .fast (),
161- "deepseek-ai/DeepSeek-V2-Lite-Chat" : PPTestSettings .fast (),
162+ "deepseek-ai/DeepSeek-V2-Lite-Chat" : PPTestSettings .fast (tp_base = 2 ),
162163 "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct" : PPTestSettings .fast (),
163164 "tiiuae/falcon-7b" : PPTestSettings .fast (),
164165 "google/gemma-1.1-2b-it" : PPTestSettings .fast (),
@@ -210,9 +211,11 @@ def iter_params(self, model_id: str):
 
 EMBEDDING_MODELS = {  # type: ignore[var-annotated]
     # [Text-only]
-    "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(),
-    "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(),
-    "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(load_format="dummy"),
+    "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(task="embed"),
+    "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(task="embed"),
+    "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(
+        load_format="dummy", task="embed"
+    ),
 }
 
 MULTIMODAL_MODELS = {
@@ -248,6 +251,7 @@ def iter_params(self, model_id: str):
248251 "meta-llama/Llama-3.2-1B-Instruct" ,
249252 "ArthurZ/Ilama-3.2-1B" ,
250253 "ibm/PowerLM-3b" ,
254+ "deepseek-ai/DeepSeek-V2-Lite-Chat" ,
251255 # [LANGUAGE EMBEDDING]
252256 "intfloat/e5-mistral-7b-instruct" ,
253257 "BAAI/bge-multilingual-gemma2" ,
@@ -287,6 +291,11 @@ def _compare_tp(
     trust_remote_code = model_info.trust_remote_code
     tokenizer_mode = model_info.tokenizer_mode
     hf_overrides = model_info.hf_overrides
+    hf_config = get_config(model_id, trust_remote_code)
+
+    dtype = "float16"
+    if hf_config.model_type in _FLOAT16_NOT_SUPPORTED_MODELS:
+        dtype = "bfloat16"
 
     if load_format == "dummy":
         # Avoid OOM
@@ -316,7 +325,7 @@ def _compare_tp(
     common_args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
-        "float16",
+        dtype,
         "--max-model-len",
         "2048",
         "--max-num-seqs",
@@ -338,6 +347,7 @@ def _compare_tp(
         common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
 
     specific_case = tp_size == 2 and pp_size == 2 and chunked_prefill
+    testing_ray_compiled_graph = False
     if distributed_backend == "ray" and (vllm_major_version == "1"
                                          or specific_case):
         # For V1, test Ray Compiled Graph for all the tests
@@ -351,6 +361,7 @@ def _compare_tp(
         # Temporary. Currently when zeromq + SPMD is used, it does not properly
         # terminate because of a Ray Compiled Graph issue.
         common_args.append("--disable-frontend-multiprocessing")
+        testing_ray_compiled_graph = True
     elif distributed_backend == "mp":
         # Both V0/V1 of multiprocessing executor support PP
         pp_env = {
@@ -394,7 +405,6 @@ def _compare_tp(
                              tp_env,
                              method=method)
     except Exception:
-        testing_ray_compiled_graph = pp_env is not None
         if testing_ray_compiled_graph and vllm_major_version == "0":
             # Ray Compiled Graph tests are flaky for V0,
             # so we don't want to fail the test