Merged
2 changes: 1 addition & 1 deletion docs/models/supported_models.md
@@ -581,7 +581,7 @@ These models primarily support the [`LLM.encode`](./pooling_models.md#llmencode)
| `ModernBertForTokenClassification` | ModernBERT-based | `disham993/electrical-ner-ModernBERT-base` | | | ✅︎ |

!!! note
For Named Entity Recognition (NER) usage, please refer to <gh-file:examples/offline_inference/pooling/ner.py> and <gh-file:examples/online_serving/pooling/ner.py>.
For Named Entity Recognition (NER) usage, please refer to <gh-file:examples/offline_inference/pooling/ner.py> and <gh-file:examples/online_serving/pooling/ner_client.py>.

[](){ #supported-mm-models }

2 changes: 1 addition & 1 deletion examples/online_serving/pooling/README.md
@@ -15,7 +15,7 @@ python examples/online_serving/pooling/jinaai_rerank_client.py
## Named Entity Recognition (NER) usage

```bash
python examples/online_serving/pooling/ner.py
python examples/online_serving/pooling/ner_client.py
```

## OpenAI chat embedding for multimodal usage
6 changes: 6 additions & 0 deletions tests/ci_envs.py
@@ -8,6 +8,8 @@
from collections.abc import Callable
from typing import TYPE_CHECKING, Any

from vllm.envs import maybe_convert_bool

if TYPE_CHECKING:
VLLM_CI_NO_SKIP: bool = False
VLLM_CI_DTYPE: str | None = None
@@ -25,6 +27,10 @@
"VLLM_CI_HEAD_DTYPE": lambda: os.getenv("VLLM_CI_HEAD_DTYPE", None),
# Allow changing the head dtype used by transformers in tests
"VLLM_CI_HF_DTYPE": lambda: os.getenv("VLLM_CI_HF_DTYPE", None),
# Allow control over whether tests use enforce_eager
"VLLM_CI_ENFORCE_EAGER": lambda: maybe_convert_bool(
os.getenv("VLLM_CI_ENFORCE_EAGER", None)
),
}
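
A minimal sketch of how a test might consume the new toggle; only `ci_envs.VLLM_CI_ENFORCE_EAGER` (which is `None` when the variable is unset) comes from this diff — the helper name and the kwargs plumbing are illustrative:

```python
# Illustrative only: forward enforce_eager to a runner when the CI variable is set.
import tests.ci_envs as ci_envs


def apply_ci_enforce_eager(vllm_extra_kwargs: dict) -> dict:
    # maybe_convert_bool() yields None when VLLM_CI_ENFORCE_EAGER is unset,
    # so in that case the test's default behaviour is left untouched.
    if ci_envs.VLLM_CI_ENFORCE_EAGER is not None:
        vllm_extra_kwargs["enforce_eager"] = ci_envs.VLLM_CI_ENFORCE_EAGER
    return vllm_extra_kwargs
```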


2 changes: 2 additions & 0 deletions tests/entrypoints/pooling/llm/test_classify.py
@@ -58,7 +58,9 @@ def get_outputs(activation):
)


@pytest.mark.skip_global_cleanup
def test_encode_api(llm: LLM):
# chunked prefill does not support all pooling
err_msg = "pooling_task must be one of.+"
with pytest.raises(ValueError, match=err_msg):
llm.encode(prompts, use_tqdm=False)
1 change: 0 additions & 1 deletion tests/entrypoints/pooling/llm/test_embedding.py
@@ -35,7 +35,6 @@ def llm():
cleanup_dist_env_and_memory()


@pytest.mark.skip_global_cleanup
def test_pooling_params(llm: LLM):
def get_outputs(normalize):
outputs = llm.embed(
1 change: 0 additions & 1 deletion tests/entrypoints/pooling/llm/test_encode.py
@@ -74,7 +74,6 @@ def test_multiple_pooling_params(llm: LLM):
assert len(PROMPTS) == len(outputs)


@pytest.mark.skip_global_cleanup
def test_right_side_truncation(llm: LLM):
# Embeddings models should truncate the end of the prompt
tokenizer = llm.get_tokenizer()
1 change: 0 additions & 1 deletion tests/entrypoints/pooling/llm/test_score.py
@@ -33,7 +33,6 @@ def llm():
cleanup_dist_env_and_memory()


@pytest.mark.skip_global_cleanup
def test_pooling_params(llm: LLM):
def get_outputs(activation):
text_1 = "What is the capital of France?"
26 changes: 6 additions & 20 deletions tests/models/language/generation_ppl_test/ppl_utils.py
@@ -3,12 +3,15 @@
# Adapted from https://huggingface.co/docs/transformers/perplexity
from typing import cast

import pytest
import torch
from datasets import load_dataset

import tests.ci_envs as ci_envs
from tests.models.utils import GenerateModelInfo, TokensTextLogprobsPromptLogprobs
from tests.models.utils import (
GenerateModelInfo,
TokensTextLogprobsPromptLogprobs,
get_vllm_extra_kwargs,
)
from vllm.logprobs import Logprob

# See #24485
@@ -25,27 +28,10 @@ def wikitext_ppl_test(
vllm_extra_kwargs=None,
atol=PPL_TOL,
):
# A model family has many models with the same architecture,
# and we don't need to test each one.
if not ci_envs.VLLM_CI_NO_SKIP and not model_info.enable_test:
pytest.skip("Skipping test.")
vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)

dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

# Allow vllm to test using the given dtype, such as float32
vllm_extra_kwargs = vllm_extra_kwargs or {}
vllm_extra_kwargs["dtype"] = ci_envs.VLLM_CI_DTYPE or model_info.dtype

# Allow vllm to test using hf_overrides
if model_info.hf_overrides is not None:
vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides

# Allow changing the head dtype used by vllm in tests
if ci_envs.VLLM_CI_HEAD_DTYPE is not None:
if "hf_overrides" not in vllm_extra_kwargs:
vllm_extra_kwargs["hf_overrides"] = {}
vllm_extra_kwargs["hf_overrides"]["head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE

with vllm_runner(
model_info.name,
gpu_memory_utilization=0.7,
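
The dtype/hf_overrides/head_dtype plumbing deleted here (and from the MTEB utilities further down) now lives in `get_vllm_extra_kwargs` in `tests/models/utils.py`, which is not shown in this diff. A rough sketch of that helper, reconstructed from the removed lines — the actual implementation may differ, e.g. by also honouring `VLLM_CI_ENFORCE_EAGER`:

```python
# Reconstructed from the logic deleted above; not the verbatim helper.
import pytest

import tests.ci_envs as ci_envs


def get_vllm_extra_kwargs(model_info, vllm_extra_kwargs):
    # A model family has many models with the same architecture,
    # and we don't need to test each one.
    if not ci_envs.VLLM_CI_NO_SKIP and not model_info.enable_test:
        pytest.skip("Skipping test.")

    # Allow vllm to test using the given dtype, such as float32
    vllm_extra_kwargs = vllm_extra_kwargs or {}
    vllm_extra_kwargs["dtype"] = ci_envs.VLLM_CI_DTYPE or model_info.dtype

    # Allow vllm to test using hf_overrides
    if model_info.hf_overrides is not None:
        vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides

    # Allow changing the head dtype used by vllm in tests
    if ci_envs.VLLM_CI_HEAD_DTYPE is not None:
        if "hf_overrides" not in vllm_extra_kwargs:
            vllm_extra_kwargs["hf_overrides"] = {}
        vllm_extra_kwargs["hf_overrides"]["head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE

    return vllm_extra_kwargs
```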
47 changes: 47 additions & 0 deletions tests/models/language/pooling/test_head_dtype.py
@@ -0,0 +1,47 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from transformers import AutoModelForSequenceClassification


@pytest.mark.parametrize(
"model",
["nie3e/sentiment-polish-gpt2-small"],
)
@pytest.mark.parametrize("dtype", ["half"])
def test_classify_models(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
) -> None:
with hf_runner(
model, dtype=dtype, auto_cls=AutoModelForSequenceClassification
) as hf_model:
hf_outputs = hf_model.classify(example_prompts)

for head_dtype_str in ["float32", "model"]:
with vllm_runner(
model,
max_model_len=512,
dtype=dtype,
hf_overrides={"head_dtype": head_dtype_str},
) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
model_dtype = model_config.dtype
head_dtype = model_config.head_dtype

if head_dtype_str == "float32":
assert head_dtype == torch.float32
elif head_dtype_str == "model":
assert head_dtype == model_dtype

vllm_outputs = vllm_model.classify(example_prompts)

for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
hf_output = torch.tensor(hf_output).float()
vllm_output = torch.tensor(vllm_output).float()

assert torch.allclose(hf_output, vllm_output, atol=1e-2)
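
Outside the test harness, the same override can be exercised through vLLM's offline API. A hedged sketch mirroring the test above — `LLM`, `hf_overrides=`, and `classify()` are used as in current vLLM, but verify the exact arguments against the version you are running:

```python
# Illustrative offline usage of the head_dtype override (not part of this PR).
from vllm import LLM

llm = LLM(
    model="nie3e/sentiment-polish-gpt2-small",
    max_model_len=512,
    dtype="half",
    # Keep the classification head in float32 while the backbone runs in fp16.
    hf_overrides={"head_dtype": "float32"},
)
outputs = llm.classify(["Ten film był świetny!"])  # "This movie was great!"
print(outputs[0])
```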
51 changes: 9 additions & 42 deletions tests/models/language/pooling/test_splade_sparse_pooler.py
@@ -3,7 +3,6 @@

import types

import numpy as np
import pytest
import torch
import torch.nn as nn
@@ -14,29 +13,34 @@
)

# ---------------------------------------------------------------------
# 1) Functional test: SPLADE formula correctness (no HF download needed)
# Functional test: SPLADE formula correctness (no HF download needed)
# ---------------------------------------------------------------------


@pytest.mark.parametrize("B,T,H,V", [(2, 3, 5, 7)])
@torch.inference_mode
def test_splade_pooler_matches_reference_formula(B, T, H, V):
"""Ensure SPLADESparsePooler forward() matches the mathematical formula:
log1p(relu(logits)) -> max over sequence length (after masking)."""
torch.manual_seed(0)

# Prepare [B] sequences of shape [T, H]
hs_list = [torch.randn(T, H) for _ in range(B)]
hs_tensor = torch.cat(hs_list)

# Simulate PoolingMetadata (only required fields)
prompt_lens = [T, T - 1]
prompt_lens_tensor = torch.tensor(prompt_lens, dtype=torch.int32)
token_ids = torch.tensor(
[
[101, 5, 102], # Batch 0: [CLS], token, [SEP]
[101, 6, 6], # Batch 1: [CLS], token, token (last token ignored)
],
dtype=torch.long,
)
meta = types.SimpleNamespace(prompt_lens=prompt_lens, prompt_token_ids=token_ids)
meta = types.SimpleNamespace(
prompt_lens=prompt_lens_tensor, prompt_token_ids=token_ids
)

# MLM head (prefer BertMLMHead, fallback to Linear if unavailable)
try:
@@ -46,10 +50,10 @@ def test_splade_pooler_matches_reference_formula(B, T, H, V):

# Forward pass through SPLADE pooler
pooler = SPLADESparsePooler(mlm_head=mlm_head, pooling="max", remove_cls_sep=True)
pooled = pooler(hidden_states=hs_list, pooling_metadata=meta) # list of [V]
pooled = pooler(hidden_states=hs_tensor, pooling_metadata=meta)  # tensor of shape [B, V]

# Basic output checks
assert isinstance(pooled, list) and len(pooled) == B
assert isinstance(pooled, torch.Tensor) and len(pooled) == B
for vec in pooled:
assert vec.shape == (V,)
assert torch.isfinite(vec).all()
@@ -83,40 +87,3 @@ def ref_one(hs: torch.Tensor, L: int, tid_row: torch.Tensor) -> torch.Tensor:
rtol=1e-4,
atol=1e-4,
)


# ---------------------------------------------------------------------
# 2) Integration smoke test: end-to-end embedding path wiring
# ---------------------------------------------------------------------


@pytest.mark.cpu_model
def test_bert_splade_sparse_embed_smoke(vllm_runner, monkeypatch):
"""Ensure BertSpladeSparseEmbeddingModel loads and produces sparse embeddings."""
from transformers import AutoTokenizer

MODEL_ID = "hf-internal-testing/tiny-random-bert"
hf_overrides = {"architectures": ["BertSpladeSparseEmbeddingModel"]}

# Enforce CPU-only execution (optional)
monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "")
monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")

tok = AutoTokenizer.from_pretrained(MODEL_ID)
vocab_size = tok.vocab_size

# The embed path should route through SPLADESparsePooler
with vllm_runner(
MODEL_ID,
runner="pooling",
max_model_len=64,
hf_overrides=hf_overrides,
) as vm:
outs = vm.embed(["hello world", "splade sparse test"])

# Basic sanity checks
assert len(outs) == 2
assert outs[0].shape[0] == vocab_size
assert outs[1].shape[0] == vocab_size
assert np.isfinite(outs[0]).all() and (outs[0] >= 0).all()
assert np.isfinite(outs[1]).all() and (outs[1] >= 0).all()
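
The reduction verified by the functional test above can also be written standalone. A minimal sketch for a single sequence — only the `log1p(relu(·))` → max formula and the `remove_cls_sep` behaviour come from the test; the function name, signature, and explicit CLS/SEP slicing are illustrative:

```python
# Standalone sketch of the SPLADE reference checked by the functional test:
# per-token MLM logits -> log1p(relu(.)) -> max over the kept positions.
import torch


def splade_sparse_vector(
    hidden_states: torch.Tensor,   # [T, H] hidden states of one sequence
    mlm_head: torch.nn.Module,     # maps hidden size H -> vocab size V
    prompt_len: int,
    remove_cls_sep: bool = True,
) -> torch.Tensor:
    token_states = hidden_states[:prompt_len]
    if remove_cls_sep:
        # Drop the [CLS] and [SEP] positions before pooling (illustrative
        # slicing; the real pooler uses the prompt token ids for this).
        token_states = token_states[1:-1]
    logits = mlm_head(token_states)              # [L, V]
    activated = torch.log1p(torch.relu(logits))  # SPLADE activation
    return activated.amax(dim=0)                 # sparse-style vector of size V
```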
55 changes: 14 additions & 41 deletions tests/models/language/pooling_mteb_test/mteb_utils.py
@@ -6,12 +6,16 @@

import mteb
import numpy as np
import pytest
import requests
import torch

import tests.ci_envs as ci_envs
from tests.models.utils import EmbedModelInfo, RerankModelInfo, check_embeddings_close
from tests.models.utils import (
EmbedModelInfo,
RerankModelInfo,
check_embeddings_close,
get_vllm_extra_kwargs,
)

# Most embedding models on the STS12 task (See #17175):
# - Model implementation and minor changes in tensor dtype
@@ -165,28 +169,11 @@ def mteb_test_embed_models(
hf_model_callback=None,
atol=MTEB_EMBED_TOL,
):
# A model family has many models with the same architecture,
# and we don't need to test each one.
if not ci_envs.VLLM_CI_NO_SKIP and not model_info.enable_test:
pytest.skip("Skipping test.")
vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)

# Test embed_dims, isnan and whether to use normalize
example_prompts = ["The chef prepared a delicious meal." * 1000]

# Allow vllm to test using the given dtype, such as float32
vllm_extra_kwargs = vllm_extra_kwargs or {}
vllm_extra_kwargs["dtype"] = ci_envs.VLLM_CI_DTYPE or model_info.dtype

# Allow vllm to test using hf_overrides
if model_info.hf_overrides is not None:
vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides

# Allow changing the head dtype used by vllm in tests
if ci_envs.VLLM_CI_HEAD_DTYPE is not None:
if "hf_overrides" not in vllm_extra_kwargs:
vllm_extra_kwargs["hf_overrides"] = {}
vllm_extra_kwargs["hf_overrides"]["head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE

with vllm_runner(
model_info.name,
runner="pooling",
@@ -212,9 +199,12 @@
vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
head_dtype = model_config.head_dtype

# Test embed_dims, isnan and whether to use normalize
# Test embedding_size, isnan and whether to use normalize
vllm_outputs = vllm_model.embed(example_prompts, truncate_prompt_tokens=-1)
assert not torch.any(torch.isnan(torch.tensor(vllm_outputs)))
outputs_tensor = torch.tensor(vllm_outputs)
assert not torch.any(torch.isnan(outputs_tensor))
embedding_size = model_config.embedding_size
assert outputs_tensor.shape[-1] == embedding_size

# Accelerate mteb test by setting
# SentenceTransformers mteb score to a constant
Expand All @@ -231,7 +221,7 @@ def mteb_test_embed_models(
st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
st_dtype = next(hf_model.model.parameters()).dtype

# Test embed_dims and whether to use normalize
# Check embeddings close to hf outputs
hf_outputs = hf_model.encode(example_prompts)
check_embeddings_close(
embeddings_0_lst=hf_outputs,
@@ -323,24 +313,7 @@ def mteb_test_rerank_models(
vllm_mteb_encoder=VllmMtebEncoder,
atol=MTEB_RERANK_TOL,
):
# A model family has many models with the same architecture,
# and we don't need to test each one.
if not ci_envs.VLLM_CI_NO_SKIP and not model_info.enable_test:
pytest.skip("Skipping test.")

# Allow vllm to test using the given dtype, such as float32
vllm_extra_kwargs = vllm_extra_kwargs or {}
vllm_extra_kwargs["dtype"] = ci_envs.VLLM_CI_DTYPE or model_info.dtype

# Allow vllm to test using hf_overrides
if model_info.hf_overrides is not None:
vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides

# Allow changing the head dtype used by vllm in tests
if ci_envs.VLLM_CI_HEAD_DTYPE is not None:
if "hf_overrides" not in vllm_extra_kwargs:
vllm_extra_kwargs["hf_overrides"] = {}
vllm_extra_kwargs["hf_overrides"]["head_dtype"] = ci_envs.VLLM_CI_HEAD_DTYPE
vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)

with vllm_runner(
model_info.name,