Polishing

noooop · noooop · commit c6d4cb79b7a2 · 2025-09-16T13:53:02.000+08:00
Signed-off-by: wang.yuqi <noooop@126.com>
diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
@@ -554,6 +554,16 @@ If your model is not in the above list, we will try to automatically convert the
     For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
     e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
 
+#### Token Classification
+These models primarily support the [`LLM.encode`](./pooling_models.md#llmencode) API.
+
+| Architecture | Models     | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
+|--------------|------------|-------------------|----------------------|---------------------------|---------------------|
+| `BertForTokenClassification` | bert-based | `boltuix/NeuroBERT-NER` (see note), etc. |  |  |  |
+
+!!! note
+    For Named Entity Recognition (NER) usage, please refer to <gh-file:examples/offline_inference/pooling/ner.py> and <gh-file:examples/online_serving/pooling/ner.py>.
+
 [](){ #supported-mm-models }
 
 ## List of Multimodal Language Models
diff --git a/examples/offline_inference/pooling/README.md b/examples/offline_inference/pooling/README.md
@@ -29,5 +29,11 @@ python examples/offline_inference/pooling/embed_matryoshka_fy.py
 ## Qwen3 reranker usage
 
 ```bash
-python qwen3_reranker.py
+python examples/offline_inference/pooling/qwen3_reranker.py
 ```
+
+## Named Entity Recognition (NER) usage
+
+```bash
+python examples/offline_inference/pooling/ner.py
+```
diff --git a/examples/offline_inference/pooling/ner.py b/examples/offline_inference/pooling/ner.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adapted from https://huggingface.co/boltuix/NeuroBERT-NER
 
 from argparse import Namespace
 
@@ -25,13 +26,13 @@ def main(args: Namespace):
     prompts = ["Barack Obama visited Microsoft headquarters in Seattle on January 2025."]
 
     # Create an LLM.
-    # You should pass runner="pooling" for reward models
     llm = LLM(**vars(args))
-
     tokenizer = llm.get_tokenizer()
     label_map = llm.llm_engine.vllm_config.model_config.hf_config.id2label
 
-    outputs = llm.reward(prompts)
+    # Run inference
+    outputs = llm.encode(prompts)
+
     for prompt, output in zip(prompts, outputs):
         logits = output.outputs.data
         predictions = logits.argmax(dim=-1)
diff --git a/examples/online_serving/pooling/README.md b/examples/online_serving/pooling/README.md
@@ -41,3 +41,9 @@ python examples/online_serving/pooling/openai_embedding_matryoshka_fy.py
 ```bash
 python examples/online_serving/pooling/openai_pooling_client.py
 ```
+
+## Named Entity Recognition (NER) usage
+
+```bash
+python examples/online_serving/pooling/ner.py
+```
diff --git a/examples/online_serving/pooling/ner.py b/examples/online_serving/pooling/ner.py
@@ -1,7 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adapted from https://huggingface.co/boltuix/NeuroBERT-NER
+
 """
-Example online usage of Pooling API.
+Example online usage of Pooling API for Named Entity Recognition (NER).
 
 Run `vllm serve <model> --runner pooling`
 to start up the server in vLLM. e.g.
@@ -10,7 +12,6 @@
 """
 
 import argparse
-import pprint
 
 import requests
 import torch
@@ -36,32 +37,32 @@ def main(args):
     api_url = f"http://{args.host}:{args.port}/pooling"
     model_name = args.model
 
-
+    # Load tokenizer and config
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     config = AutoConfig.from_pretrained(model_name)
     label_map = config.id2label
 
+    # Input text
     text = "Barack Obama visited Microsoft headquarters in Seattle on January 2025."
-
     prompt = {"model": model_name, "input": text}
-    pooling_response = post_http_request(prompt=prompt, api_url=api_url)
 
-    outputs = pooling_response.json()["data"]
-
-    for output in outputs:
-        logits = torch.tensor(output['data'])
-        predictions = logits.argmax(dim=-1)
-
-        inputs = tokenizer(text, return_tensors="pt")
-
-        # Map predictions to labels
-        tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
-        labels = [label_map[p.item()] for p in predictions]
+    pooling_response = post_http_request(prompt=prompt, api_url=api_url)
 
-        # Print results
-        for token, label in zip(tokens, labels):
-            if token not in tokenizer.all_special_tokens:
-                print(f"{token:15} → {label}")
+    # Run inference
+    output = pooling_response.json()["data"][0]
+    logits = torch.tensor(output['data'])
+    predictions = logits.argmax(dim=-1)
+    inputs = tokenizer(text, return_tensors="pt")
+
+    # Map predictions to labels
+    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
+    labels = [label_map[p.item()] for p in predictions]
+    assert len(tokens) == len(predictions)
+
+    # Print results
+    for token, label in zip(tokens, labels):
+        if token not in tokenizer.all_special_tokens:
+            print(f"{token:15} → {label}")
 
 
 if __name__ == "__main__":
diff --git a/tests/models/language/pooling/test_token_classification.py b/tests/models/language/pooling/test_token_classification.py
@@ -0,0 +1,48 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch
+from transformers import AutoModelForTokenClassification
+
+from tests.models.utils import softmax
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        pytest.param("boltuix/NeuroBERT-NER", ),
+    ],
+)
+@pytest.mark.parametrize("dtype", ["float"])
+@torch.inference_mode
+def test_models(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+) -> None:
+    # The float32 is required for this tiny model to pass the test.
+
+    with vllm_runner(model,
+                     max_model_len=None,
+                     dtype=dtype,
+                     enforce_eager=True) as vllm_model:
+        vllm_outputs = vllm_model.encode(example_prompts)
+
+    with hf_runner(model,
+                   dtype=dtype,
+                   auto_cls=AutoModelForTokenClassification) as hf_model:
+        tokenizer = hf_model.tokenizer
+        hf_outputs = []
+        for prompt in example_prompts:
+            inputs = tokenizer([prompt], return_tensors="pt")
+            inputs = hf_model.wrap_device(inputs)
+            output = hf_model.model(**inputs)
+            hf_outputs.append(softmax(output.logits[0]))
+
+    # check logits difference
+    for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
+        hf_output = torch.tensor(hf_output).cpu().float()
+        vllm_output = torch.tensor(vllm_output).cpu().float()
+        assert torch.allclose(hf_output, vllm_output, 1e-2)
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
@@ -932,6 +932,10 @@ def encode(
             considered legacy and may be deprecated in the future. You should
             instead pass them via the `inputs` parameter.
         """
+
+        if self.supported_tasks == ["encode"]:
+            pooling_task = "encode"
+
         if pooling_task is None:
             if "embed" in self.supported_tasks:
                 pooling_task = "embed"