Merged
2 changes: 1 addition & 1 deletion docs/models/supported_models.md
@@ -581,7 +581,7 @@ These models primarily support the [`LLM.encode`](./pooling_models.md#llmencode)
| `ModernBertForTokenClassification` | ModernBERT-based | `disham993/electrical-ner-ModernBERT-base` | | | ✅︎ |

!!! note
For Named Entity Recognition (NER) usage, please refer to <gh-file:examples/offline_inference/pooling/ner.py> and <gh-file:examples/online_serving/pooling/ner.py>.
For Named Entity Recognition (NER) usage, please refer to <gh-file:examples/offline_inference/pooling/ner.py> and <gh-file:examples/online_serving/pooling/ner_client.py>.

[](){ #supported-mm-models }

2 changes: 1 addition & 1 deletion examples/online_serving/pooling/README.md
@@ -15,7 +15,7 @@ python examples/online_serving/pooling/jinaai_rerank_client.py
## Named Entity Recognition (NER) usage

```bash
python examples/online_serving/pooling/ner.py
python examples/online_serving/pooling/ner_client.py
```

## OpenAI chat embedding for multimodal usage
6 changes: 6 additions & 0 deletions tests/ci_envs.py
@@ -8,6 +8,8 @@
from collections.abc import Callable
from typing import TYPE_CHECKING, Any

from vllm.envs import maybe_convert_bool

if TYPE_CHECKING:
VLLM_CI_NO_SKIP: bool = False
VLLM_CI_DTYPE: str | None = None
@@ -25,6 +27,10 @@
"VLLM_CI_HEAD_DTYPE": lambda: os.getenv("VLLM_CI_HEAD_DTYPE", None),
# Allow changing the head dtype used by transformers in tests
"VLLM_CI_HF_DTYPE": lambda: os.getenv("VLLM_CI_HF_DTYPE", None),
# Allow control over whether tests use enforce_eager
"VLLM_CI_ENFORCE_EAGER": lambda: maybe_convert_bool(
os.getenv("VLLM_CI_ENFORCE_EAGER", None)
),
}
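
A minimal sketch of how a test might consume the new toggle; only `ci_envs.VLLM_CI_ENFORCE_EAGER` (which is `None` when the variable is unset) comes from this diff — the helper name and the kwargs plumbing are illustrative:

```python
# Illustrative only: forward enforce_eager to a runner when the CI variable is set.
import tests.ci_envs as ci_envs


def apply_ci_enforce_eager(vllm_extra_kwargs: dict) -> dict:
    # maybe_convert_bool() yields None when VLLM_CI_ENFORCE_EAGER is unset,
    # so in that case the test's default behaviour is left untouched.
    if ci_envs.VLLM_CI_ENFORCE_EAGER is not None:
        vllm_extra_kwargs["enforce_eager"] = ci_envs.VLLM_CI_ENFORCE_EAGER
    return vllm_extra_kwargs
```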


2 changes: 2 additions & 0 deletions tests/entrypoints/pooling/llm/test_classify.py
@@ -58,7 +58,9 @@ def get_outputs(activation):
)


@pytest.mark.skip_global_cleanup
def test_encode_api(llm: LLM):
# chunked prefill does not support all pooling
err_msg = "pooling_task must be one of.+"
with pytest.raises(ValueError, match=err_msg):
llm.encode(prompts, use_tqdm=False)
1 change: 0 additions & 1 deletion tests/entrypoints/pooling/llm/test_embedding.py
@@ -35,7 +35,6 @@ def llm():
cleanup_dist_env_and_memory()


@pytest.mark.skip_global_cleanup
def test_pooling_params(llm: LLM):
def get_outputs(normalize):
outputs = llm.embed(
1 change: 0 additions & 1 deletion tests/entrypoints/pooling/llm/test_encode.py
@@ -74,7 +74,6 @@ def test_multiple_pooling_params(llm: LLM):
assert len(PROMPTS) == len(outputs)


@pytest.mark.skip_global_cleanup
def test_right_side_truncation(llm: LLM):
# Embeddings models should truncate the end of the prompt
tokenizer = llm.get_tokenizer()
1 change: 0 additions & 1 deletion tests/entrypoints/pooling/llm/test_score.py
@@ -33,7 +33,6 @@ def llm():
cleanup_dist_env_and_memory()


@pytest.mark.skip_global_cleanup
def test_pooling_params(llm: LLM):
def get_outputs(activation):
text_1 = "What is the capital of France?"
26 changes: 6 additions & 20 deletions tests/models/language/generation_ppl_test/ppl_utils.py
@@ -3,12 +3,15 @@
# Adapted from https://huggingface.co/docs/transformers/perplexity
from typing import cast

import pytest
import torch
from datasets import load_dataset

import tests.ci_envs as ci_envs
from tests.models.utils import GenerateModelInfo, TokensTextLogprobsPromptLogprobs
from tests.models.utils import (
GenerateModelInfo,
TokensTextLogprobsPromptLogprobs,
get_vllm_extra_kwargs,
)
from vllm.logprobs import Logprob

# See #24485
@@ -25,27 +28,10 @@ def wikitext_ppl_test(
vllm_extra_kwargs=None,
atol=PPL_TOL,
):
# A model family has many models with the same architecture,
# and we don't need to test each one.
if not ci_envs.VLLM_CI_NO_SKIP and not model_info.enable_test:
pytest.skip("Skipping test.")
vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)

dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

# Allow vllm to test using the given dtype, such as float32
vllm_extra_kwargs = vllm_extra_kwargs or {}
vllm_extra_kwargs["dtype"] = ci_envs.VLLM_CI_DTYPE or model_info.dtype

# Allow vllm to test using hf_overrides
if model_info.hf_overrides is not None:
vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides

# Allow changing the head dtype used by vllm in tests
if ci_envs.VLLM_CI_HEAD_DTYPE is not None:
if "hf_overrides" not in vllm_extra_kwargs:
vllm_extra_kwargs["hf_overrides"] = {}
vllm_extra_kwargs["hf_overrides"]["head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE

with vllm_runner(
model_info.name,
gpu_memory_utilization=0.7,
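
The dtype/hf_overrides/head_dtype plumbing deleted here (and from the MTEB utilities further down) now lives in `get_vllm_extra_kwargs` in `tests/models/utils.py`, which is not shown in this diff. A rough sketch of that helper, reconstructed from the removed lines — the actual implementation may differ, e.g. by also honouring `VLLM_CI_ENFORCE_EAGER`:

```python
# Reconstructed from the logic deleted above; not the verbatim helper.
import pytest

import tests.ci_envs as ci_envs


def get_vllm_extra_kwargs(model_info, vllm_extra_kwargs):
    # A model family has many models with the same architecture,
    # and we don't need to test each one.
    if not ci_envs.VLLM_CI_NO_SKIP and not model_info.enable_test:
        pytest.skip("Skipping test.")

    # Allow vllm to test using the given dtype, such as float32
    vllm_extra_kwargs = vllm_extra_kwargs or {}
    vllm_extra_kwargs["dtype"] = ci_envs.VLLM_CI_DTYPE or model_info.dtype

    # Allow vllm to test using hf_overrides
    if model_info.hf_overrides is not None:
        vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides

    # Allow changing the head dtype used by vllm in tests
    if ci_envs.VLLM_CI_HEAD_DTYPE is not None:
        if "hf_overrides" not in vllm_extra_kwargs:
            vllm_extra_kwargs["hf_overrides"] = {}
        vllm_extra_kwargs["hf_overrides"]["head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE

    return vllm_extra_kwargs
```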
47 changes: 47 additions & 0 deletions tests/models/language/pooling/test_head_dtype.py
@@ -0,0 +1,47 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from transformers import AutoModelForSequenceClassification


@pytest.mark.parametrize(
"model",
["nie3e/sentiment-polish-gpt2-small"],
)
@pytest.mark.parametrize("dtype", ["half"])
def test_classify_models(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
) -> None:
with hf_runner(
model, dtype=dtype, auto_cls=AutoModelForSequenceClassification
) as hf_model:
hf_outputs = hf_model.classify(example_prompts)

for head_dtype_str in ["float32", "model"]:
with vllm_runner(
model,
max_model_len=512,
dtype=dtype,
hf_overrides={"head_dtype": head_dtype_str},
) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
model_dtype = model_config.dtype
head_dtype = model_config.head_dtype

if head_dtype_str == "float32":
assert head_dtype == torch.float32
elif head_dtype_str == "model":
assert head_dtype == model_dtype

vllm_outputs = vllm_model.classify(example_prompts)

for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
hf_output = torch.tensor(hf_output).float()
vllm_output = torch.tensor(vllm_output).float()

assert torch.allclose(hf_output, vllm_output, atol=1e-2)
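
Outside the test harness, the same override can be exercised through vLLM's offline API. A hedged sketch mirroring the test above — `LLM`, `hf_overrides=`, and `classify()` are used as in current vLLM, but verify the exact arguments against the version you are running:

```python
# Illustrative offline usage of the head_dtype override (not part of this PR).
from vllm import LLM

llm = LLM(
    model="nie3e/sentiment-polish-gpt2-small",
    max_model_len=512,
    dtype="half",
    # Keep the classification head in float32 while the backbone runs in fp16.
    hf_overrides={"head_dtype": "float32"},
)
outputs = llm.classify(["Ten film był świetny!"])  # "This movie was great!"
print(outputs[0])
```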
51 changes: 9 additions & 42 deletions tests/models/language/pooling/test_splade_sparse_pooler.py
@@ -3,7 +3,6 @@

import types

import numpy as np
import pytest
import torch
import torch.nn as nn
@@ -14,29 +13,34 @@
)

# ---------------------------------------------------------------------
# 1) Functional test: SPLADE formula correctness (no HF download needed)
# Functional test: SPLADE formula correctness (no HF download needed)
# ---------------------------------------------------------------------


@pytest.mark.parametrize("B,T,H,V", [(2, 3, 5, 7)])
@torch.inference_mode
def test_splade_pooler_matches_reference_formula(B, T, H, V):
"""Ensure SPLADESparsePooler forward() matches the mathematical formula:
log1p(relu(logits)) -> max over sequence length (after masking)."""
torch.manual_seed(0)

# Prepare [B] sequences of shape [T, H]
hs_list = [torch.randn(T, H) for _ in range(B)]
hs_tensor = torch.cat(hs_list)

# Simulate PoolingMetadata (only required fields)
prompt_lens = [T, T - 1]
prompt_lens_tensor = torch.tensor(prompt_lens, dtype=torch.int32)
token_ids = torch.tensor(
[
[101, 5, 102], # Batch 0: [CLS], token, [SEP]
[101, 6, 6], # Batch 1: [CLS], token, token (last token ignored)
],
dtype=torch.long,
)
meta = types.SimpleNamespace(prompt_lens=prompt_lens, prompt_token_ids=token_ids)
meta = types.SimpleNamespace(
prompt_lens=prompt_lens_tensor, prompt_token_ids=token_ids
)

# MLM head (prefer BertMLMHead, fallback to Linear if unavailable)
try:
@@ -46,10 +50,10 @@ def test_splade_pooler_matches_reference_formula(B, T, H, V):

# Forward pass through SPLADE pooler
pooler = SPLADESparsePooler(mlm_head=mlm_head, pooling="max", remove_cls_sep=True)
pooled = pooler(hidden_states=hs_list, pooling_metadata=meta) # list of [V]
pooled = pooler(hidden_states=hs_tensor, pooling_metadata=meta)  # tensor of shape [B, V]

# Basic output checks
assert isinstance(pooled, list) and len(pooled) == B
assert isinstance(pooled, torch.Tensor) and len(pooled) == B
for vec in pooled:
assert vec.shape == (V,)
assert torch.isfinite(vec).all()
@@ -83,40 +87,3 @@ def ref_one(hs: torch.Tensor, L: int, tid_row: torch.Tensor) -> torch.Tensor:
rtol=1e-4,
atol=1e-4,
)


# ---------------------------------------------------------------------
# 2) Integration smoke test: end-to-end embedding path wiring
# ---------------------------------------------------------------------


@pytest.mark.cpu_model
def test_bert_splade_sparse_embed_smoke(vllm_runner, monkeypatch):
"""Ensure BertSpladeSparseEmbeddingModel loads and produces sparse embeddings."""
from transformers import AutoTokenizer

MODEL_ID = "hf-internal-testing/tiny-random-bert"
hf_overrides = {"architectures": ["BertSpladeSparseEmbeddingModel"]}

# Enforce CPU-only execution (optional)
monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "")
monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")

tok = AutoTokenizer.from_pretrained(MODEL_ID)
vocab_size = tok.vocab_size

# The embed path should route through SPLADESparsePooler
with vllm_runner(
MODEL_ID,
runner="pooling",
max_model_len=64,
hf_overrides=hf_overrides,
) as vm:
outs = vm.embed(["hello world", "splade sparse test"])

# Basic sanity checks
assert len(outs) == 2
assert outs[0].shape[0] == vocab_size
assert outs[1].shape[0] == vocab_size
assert np.isfinite(outs[0]).all() and (outs[0] >= 0).all()
assert np.isfinite(outs[1]).all() and (outs[1] >= 0).all()
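
The reduction verified by the functional test above can also be written standalone. A minimal sketch for a single sequence — only the `log1p(relu(·))` → max formula and the `remove_cls_sep` behaviour come from the test; the function name, signature, and explicit CLS/SEP slicing are illustrative:

```python
# Standalone sketch of the SPLADE reference checked by the functional test:
# per-token MLM logits -> log1p(relu(.)) -> max over the kept positions.
import torch


def splade_sparse_vector(
    hidden_states: torch.Tensor,   # [T, H] hidden states of one sequence
    mlm_head: torch.nn.Module,     # maps hidden size H -> vocab size V
    prompt_len: int,
    remove_cls_sep: bool = True,
) -> torch.Tensor:
    token_states = hidden_states[:prompt_len]
    if remove_cls_sep:
        # Drop the [CLS] and [SEP] positions before pooling (illustrative
        # slicing; the real pooler uses the prompt token ids for this).
        token_states = token_states[1:-1]
    logits = mlm_head(token_states)              # [L, V]
    activated = torch.log1p(torch.relu(logits))  # SPLADE activation
    return activated.amax(dim=0)                 # sparse-style vector of size V
```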
55 changes: 14 additions & 41 deletions tests/models/language/pooling_mteb_test/mteb_utils.py
@@ -6,12 +6,16 @@

import mteb
import numpy as np
import pytest
import requests
import torch

import tests.ci_envs as ci_envs
from tests.models.utils import EmbedModelInfo, RerankModelInfo, check_embeddings_close
from tests.models.utils import (
EmbedModelInfo,
RerankModelInfo,
check_embeddings_close,
get_vllm_extra_kwargs,
)

# Most embedding models on the STS12 task (See #17175):
# - Model implementation and minor changes in tensor dtype
@@ -165,28 +169,11 @@ def mteb_test_embed_models(
hf_model_callback=None,
atol=MTEB_EMBED_TOL,
):
# A model family has many models with the same architecture,
# and we don't need to test each one.
if not ci_envs.VLLM_CI_NO_SKIP and not model_info.enable_test:
pytest.skip("Skipping test.")
vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)

# Test embed_dims, isnan and whether to use normalize
example_prompts = ["The chef prepared a delicious meal." * 1000]

# Allow vllm to test using the given dtype, such as float32
vllm_extra_kwargs = vllm_extra_kwargs or {}
vllm_extra_kwargs["dtype"] = ci_envs.VLLM_CI_DTYPE or model_info.dtype

# Allow vllm to test using hf_overrides
if model_info.hf_overrides is not None:
vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides

# Allow changing the head dtype used by vllm in tests
if ci_envs.VLLM_CI_HEAD_DTYPE is not None:
if "hf_overrides" not in vllm_extra_kwargs:
vllm_extra_kwargs["hf_overrides"] = {}
vllm_extra_kwargs["hf_overrides"]["head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE

with vllm_runner(
model_info.name,
runner="pooling",
@@ -212,9 +199,12 @@
vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
head_dtype = model_config.head_dtype

# Test embed_dims, isnan and whether to use normalize
# Test embedding_size, isnan and whether to use normalize
vllm_outputs = vllm_model.embed(example_prompts, truncate_prompt_tokens=-1)
assert not torch.any(torch.isnan(torch.tensor(vllm_outputs)))
outputs_tensor = torch.tensor(vllm_outputs)
assert not torch.any(torch.isnan(outputs_tensor))
embedding_size = model_config.embedding_size
assert outputs_tensor.shape[-1] == embedding_size

# Accelerate mteb test by setting
# SentenceTransformers mteb score to a constant
Expand All @@ -231,7 +221,7 @@ def mteb_test_embed_models(
st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
st_dtype = next(hf_model.model.parameters()).dtype

# Test embed_dims and whether to use normalize
# Check embeddings close to hf outputs
hf_outputs = hf_model.encode(example_prompts)
check_embeddings_close(
embeddings_0_lst=hf_outputs,
@@ -323,24 +313,7 @@ def mteb_test_rerank_models(
vllm_mteb_encoder=VllmMtebEncoder,
atol=MTEB_RERANK_TOL,
):
# A model family has many models with the same architecture,
# and we don't need to test each one.
if not ci_envs.VLLM_CI_NO_SKIP and not model_info.enable_test:
pytest.skip("Skipping test.")

# Allow vllm to test using the given dtype, such as float32
vllm_extra_kwargs = vllm_extra_kwargs or {}
vllm_extra_kwargs["dtype"] = ci_envs.VLLM_CI_DTYPE or model_info.dtype

# Allow vllm to test using hf_overrides
if model_info.hf_overrides is not None:
vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides

# Allow changing the head dtype used by vllm in tests
if ci_envs.VLLM_CI_HEAD_DTYPE is not None:
if "hf_overrides" not in vllm_extra_kwargs:
vllm_extra_kwargs["hf_overrides"] = {}
vllm_extra_kwargs["hf_overrides"]["head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE
vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)

with vllm_runner(
model_info.name,