
Commit 374b19f

noooop authored and 0xrushi committed
[Model][0/N] Improve all pooling task | clean up (vllm-project#25817)
Signed-off-by: wang.yuqi <noooop@126.com>
Signed-off-by: 0xrushi <6279035+0xrushi@users.noreply.github.com>
1 parent d456373 commit 374b19f

19 files changed (+197, -188 lines)

docs/models/supported_models.md

Lines changed: 1 addition & 1 deletion

@@ -581,7 +581,7 @@ These models primarily support the [`LLM.encode`](./pooling_models.md#llmencode)
 | `ModernBertForTokenClassification` | ModernBERT-based | `disham993/electrical-ner-ModernBERT-base` | | | ✅︎ |

 !!! note
-    Named Entity Recognition (NER) usage, please refer to <gh-file:examples/offline_inference/pooling/ner.py>, <gh-file:examples/online_serving/pooling/ner.py>.
+    Named Entity Recognition (NER) usage, please refer to <gh-file:examples/offline_inference/pooling/ner.py>, <gh-file:examples/online_serving/pooling/ner_client.py>.

 [](){ #supported-mm-models }

examples/online_serving/pooling/README.md

Lines changed: 1 addition & 1 deletion

@@ -15,7 +15,7 @@ python examples/online_serving/pooling/jinaai_rerank_client.py
 ## Named Entity Recognition (NER) usage

 ```bash
-python examples/online_serving/pooling/ner.py
+python examples/online_serving/pooling/ner_client.py
 ```

 ## Openai chat embedding for multimodal usage

tests/ci_envs.py

Lines changed: 6 additions & 0 deletions

@@ -8,6 +8,8 @@
 from collections.abc import Callable
 from typing import TYPE_CHECKING, Any

+from vllm.envs import maybe_convert_bool
+
 if TYPE_CHECKING:
     VLLM_CI_NO_SKIP: bool = False
     VLLM_CI_DTYPE: str | None = None
@@ -25,6 +27,10 @@
     "VLLM_CI_HEAD_DTYPE": lambda: os.getenv("VLLM_CI_HEAD_DTYPE", None),
     # Allow changing the head dtype used by transformers in tests
     "VLLM_CI_HF_DTYPE": lambda: os.getenv("VLLM_CI_HF_DTYPE", None),
+    # Allow control over whether tests use enforce_eager
+    "VLLM_CI_ENFORCE_EAGER": lambda: maybe_convert_bool(
+        os.getenv("VLLM_CI_ENFORCE_EAGER", None)
+    ),
 }
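As a usage sketch (not part of this diff): a test that wants to honor the new switch could read it through `ci_envs` and fall back to its own default when the variable is unset. The `resolve_enforce_eager` helper below is hypothetical.

```python
import tests.ci_envs as ci_envs


def resolve_enforce_eager(default: bool = False) -> bool:
    # Hypothetical helper: VLLM_CI_ENFORCE_EAGER is parsed by
    # maybe_convert_bool, so it is None when unset and a bool otherwise.
    if ci_envs.VLLM_CI_ENFORCE_EAGER is not None:
        return ci_envs.VLLM_CI_ENFORCE_EAGER
    return default


# Example: vllm_extra_kwargs["enforce_eager"] = resolve_enforce_eager(False)
```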

tests/entrypoints/pooling/llm/test_classify.py

Lines changed: 2 additions & 0 deletions

@@ -58,7 +58,9 @@ def get_outputs(activation):
     )


+@pytest.mark.skip_global_cleanup
 def test_encode_api(llm: LLM):
+    # chunked prefill does not support all pooling
     err_msg = "pooling_task must be one of.+"
     with pytest.raises(ValueError, match=err_msg):
         llm.encode(prompts, use_tqdm=False)
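For reference, a minimal sketch of what the test above is checking, assuming `LLM.encode` accepts an explicit `pooling_task` keyword as the error pattern suggests; the value "embed" is an assumed example, not taken from this diff.

```python
# Sketch only: without a pooling task the call is rejected
# (matched by err_msg = "pooling_task must be one of.+").
# llm.encode(prompts, use_tqdm=False)

# Passing a task explicitly is the expected usage; "embed" is an assumed value.
outputs = llm.encode(prompts, pooling_task="embed", use_tqdm=False)
```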

tests/entrypoints/pooling/llm/test_embedding.py

Lines changed: 0 additions & 1 deletion

@@ -35,7 +35,6 @@ def llm():
     cleanup_dist_env_and_memory()


-@pytest.mark.skip_global_cleanup
 def test_pooling_params(llm: LLM):
     def get_outputs(normalize):
         outputs = llm.embed(

tests/entrypoints/pooling/llm/test_encode.py

Lines changed: 0 additions & 1 deletion

@@ -74,7 +74,6 @@ def test_multiple_pooling_params(llm: LLM):
     assert len(PROMPTS) == len(outputs)


-@pytest.mark.skip_global_cleanup
 def test_right_side_truncation(llm: LLM):
     # Embeddings models should truncate the end of the prompt
     tokenizer = llm.get_tokenizer()

tests/entrypoints/pooling/llm/test_score.py

Lines changed: 0 additions & 1 deletion

@@ -33,7 +33,6 @@ def llm():
     cleanup_dist_env_and_memory()


-@pytest.mark.skip_global_cleanup
 def test_pooling_params(llm: LLM):
     def get_outputs(activation):
         text_1 = "What is the capital of France?"

tests/models/language/generation_ppl_test/ppl_utils.py

Lines changed: 6 additions & 20 deletions

@@ -3,12 +3,15 @@
 # Adapted from https://huggingface.co/docs/transformers/perplexity
 from typing import cast

-import pytest
 import torch
 from datasets import load_dataset

 import tests.ci_envs as ci_envs
-from tests.models.utils import GenerateModelInfo, TokensTextLogprobsPromptLogprobs
+from tests.models.utils import (
+    GenerateModelInfo,
+    TokensTextLogprobsPromptLogprobs,
+    get_vllm_extra_kwargs,
+)
 from vllm.logprobs import Logprob

 # See #24485
@@ -25,27 +28,10 @@ def wikitext_ppl_test(
     vllm_extra_kwargs=None,
     atol=PPL_TOL,
 ):
-    # A model family has many models with the same architecture,
-    # and we don't need to test each one.
-    if not ci_envs.VLLM_CI_NO_SKIP and not model_info.enable_test:
-        pytest.skip("Skipping test.")
+    vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)

     dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

-    # Allow vllm to test using the given dtype, such as float32
-    vllm_extra_kwargs = vllm_extra_kwargs or {}
-    vllm_extra_kwargs["dtype"] = ci_envs.VLLM_CI_DTYPE or model_info.dtype
-
-    # Allow vllm to test using hf_overrides
-    if model_info.hf_overrides is not None:
-        vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides
-
-    # Allow changing the head dtype used by vllm in tests
-    if ci_envs.VLLM_CI_HEAD_DTYPE is not None:
-        if "hf_overrides" not in vllm_extra_kwargs:
-            vllm_extra_kwargs["hf_overrides"] = {}
-        vllm_extra_kwargs["hf_overrides"]["head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE
-
     with vllm_runner(
         model_info.name,
         gpu_memory_utilization=0.7,
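The deleted block above is what `get_vllm_extra_kwargs` now centralizes. A minimal sketch of that helper, reconstructed from the removed lines and assuming the real version lives in `tests/models/utils.py` (it may also handle other CI switches, such as enforce_eager):

```python
import pytest

import tests.ci_envs as ci_envs


def get_vllm_extra_kwargs(model_info, vllm_extra_kwargs):
    # A model family has many models with the same architecture,
    # and we don't need to test each one.
    if not ci_envs.VLLM_CI_NO_SKIP and not model_info.enable_test:
        pytest.skip("Skipping test.")

    # Allow vllm to test using the given dtype, such as float32.
    vllm_extra_kwargs = vllm_extra_kwargs or {}
    vllm_extra_kwargs["dtype"] = ci_envs.VLLM_CI_DTYPE or model_info.dtype

    # Allow vllm to test using hf_overrides.
    if model_info.hf_overrides is not None:
        vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides

    # Allow changing the head dtype used by vllm in tests.
    if ci_envs.VLLM_CI_HEAD_DTYPE is not None:
        vllm_extra_kwargs.setdefault("hf_overrides", {})
        vllm_extra_kwargs["hf_overrides"]["head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE

    return vllm_extra_kwargs
```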
Lines changed: 47 additions & 0 deletions

@@ -0,0 +1,47 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch
+from transformers import AutoModelForSequenceClassification
+
+
+@pytest.mark.parametrize(
+    "model",
+    ["nie3e/sentiment-polish-gpt2-small"],
+)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_classify_models(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+) -> None:
+    with hf_runner(
+        model, dtype=dtype, auto_cls=AutoModelForSequenceClassification
+    ) as hf_model:
+        hf_outputs = hf_model.classify(example_prompts)
+
+    for head_dtype_str in ["float32", "model"]:
+        with vllm_runner(
+            model,
+            max_model_len=512,
+            dtype=dtype,
+            hf_overrides={"head_dtype": head_dtype_str},
+        ) as vllm_model:
+            model_config = vllm_model.llm.llm_engine.model_config
+            model_dtype = model_config.dtype
+            head_dtype = model_config.head_dtype
+
+            if head_dtype_str == "float32":
+                assert head_dtype == torch.float32
+            elif head_dtype_str == "model":
+                assert head_dtype == model_dtype
+
+            vllm_outputs = vllm_model.classify(example_prompts)
+
+            for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
+                hf_output = torch.tensor(hf_output).float()
+                vllm_output = torch.tensor(vllm_output).float()
+
+                assert torch.allclose(hf_output, vllm_output, atol=1e-2)
