From b6b2e123cd1b4fab1e2c85a51a8541b27bf616af Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Wed, 24 Sep 2025 07:37:29 +0800 Subject: [PATCH 01/25] token_embed & token_classify Signed-off-by: wang.yuqi --- docs/models/pooling_models.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 50982d3d0d0f..ccb4bf2d347e 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -30,11 +30,11 @@ If `--runner pooling` has been set (manually or automatically) but the model doe vLLM will attempt to automatically convert the model according to the architecture names shown in the table below. -| Architecture | `--convert` | Supported pooling tasks | -|-------------------------------------------------|-------------|-------------------------------| -| `*ForTextEncoding`, `*EmbeddingModel`, `*Model` | `embed` | `encode`, `embed` | -| `*For*Classification`, `*ClassificationModel` | `classify` | `encode`, `classify`, `score` | -| `*ForRewardModeling`, `*RewardModel` | `reward` | `encode` | +| Architecture | `--convert` | Supported pooling tasks | +|-------------------------------------------------|-------------|---------------------------------------| +| `*ForTextEncoding`, `*EmbeddingModel`, `*Model` | `embed` | `token_embed`, `embed` | +| `*For*Classification`, `*ClassificationModel` | `classify` | `token_classify`, `classify`, `score` | +| `*ForRewardModeling`, `*RewardModel` | `reward` | `token_classify` | !!! tip You can explicitly set `--convert ` to specify how to convert the model. From dd06fe14a25ecc5774ba66488efb63bac952d019 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 16:19:13 +0800 Subject: [PATCH 02/25] /pooling endpoint support all pooling tasks Signed-off-by: wang.yuqi --- .../pooling/openai/test_classification.py | 76 +++++++++++++++---- .../pooling/openai/test_embedding.py | 53 ++++++++++++- .../entrypoints/pooling/openai/test_rerank.py | 41 +++++++++- vllm/entrypoints/openai/api_server.py | 7 +- vllm/entrypoints/openai/protocol.py | 61 +++++++++++++-- vllm/entrypoints/openai/serving_pooling.py | 23 ++++-- 6 files changed, 223 insertions(+), 38 deletions(-) diff --git a/tests/entrypoints/pooling/openai/test_classification.py b/tests/entrypoints/pooling/openai/test_classification.py index 92d40efad21c..2bffe5dabedc 100644 --- a/tests/entrypoints/pooling/openai/test_classification.py +++ b/tests/entrypoints/pooling/openai/test_classification.py @@ -7,7 +7,7 @@ import torch.nn.functional as F from tests.utils import RemoteOpenAIServer -from vllm.entrypoints.openai.protocol import ClassificationResponse +from vllm.entrypoints.openai.protocol import ClassificationResponse, PoolingResponse MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach" DTYPE = "float32" # Use float32 to avoid NaN issue @@ -191,18 +191,7 @@ async def get_outputs(activation): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -def test_pooling(server: RemoteOpenAIServer, model_name: str): - # pooling api uses ALL pooling, which does not support chunked prefill. 
- response = requests.post( - server.url_for("pooling"), - json={"model": model_name, "input": "test", "encoding_format": "float"}, - ) - assert response.json()["error"]["type"] == "BadRequestError" - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model_name", [MODEL_NAME]) -def test_score(server: RemoteOpenAIServer, model_name: str): +async def test_score(server: RemoteOpenAIServer, model_name: str): # score api is only enabled for num_labels == 1. response = requests.post( server.url_for("score"), @@ -217,7 +206,7 @@ def test_score(server: RemoteOpenAIServer, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -def test_rerank(server: RemoteOpenAIServer, model_name: str): +async def test_rerank(server: RemoteOpenAIServer, model_name: str): # rerank api is only enabled for num_labels == 1. response = requests.post( server.url_for("rerank"), @@ -228,3 +217,62 @@ def test_rerank(server: RemoteOpenAIServer, model_name: str): }, ) assert response.json()["error"]["type"] == "BadRequestError" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_pooling_classify(server: RemoteOpenAIServer, model_name: str): + input_text = "This product was excellent and exceeded my expectations" + response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": input_text, + "encoding_format": "float", + "task": "classify", + }, + ) + poolings = PoolingResponse.model_validate(response.json()) + assert len(poolings.data) == 1 + assert len(poolings.data[0].data) == 2 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_pooling_token_classify(server: RemoteOpenAIServer, model_name: str): + # token_classify uses ALL pooling, which does not support chunked prefill. 
+ task = "token_classify" + response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": "test", + "encoding_format": "float", + "task": task, + }, + ) + assert response.json()["error"]["type"] == "BadRequestError" + assert response.json()["error"]["message"].startswith( + f"Task {task} is not supported" + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("task", ["embed", "token_embed", "plugin"]) +async def test_pooling_not_supported( + server: RemoteOpenAIServer, model_name: str, task: str +): + response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": "test", + "encoding_format": "float", + "task": task, + }, + ) + assert response.json()["error"]["type"] == "BadRequestError" + assert response.json()["error"]["message"].startswith( + f"Task {task} is not supported" + ) diff --git a/tests/entrypoints/pooling/openai/test_embedding.py b/tests/entrypoints/pooling/openai/test_embedding.py index b3f12283fdbd..e971b23e8f1a 100644 --- a/tests/entrypoints/pooling/openai/test_embedding.py +++ b/tests/entrypoints/pooling/openai/test_embedding.py @@ -562,12 +562,40 @@ async def get_outputs(normalize): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_pooling(server: RemoteOpenAIServer, model_name: str): +async def test_pooling_embed(server: RemoteOpenAIServer, model_name: str): + task = "embed" input_text = ["The chef prepared a delicious meal."] response = requests.post( server.url_for("pooling"), - json={"model": model_name, "input": input_text, "encoding_format": "float"}, + json={ + "model": model_name, + "input": input_text, + "encoding_format": "float", + "task": task, + }, + ) + + poolings = PoolingResponse.model_validate(response.json()) + + assert len(poolings.data) == 1 + assert len(poolings.data[0].data) == 384 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_pooling_token_embed(server: RemoteOpenAIServer, model_name: str): + task = "token_embed" + input_text = ["The chef prepared a delicious meal."] + + response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": input_text, + "encoding_format": "float", + "task": task, + }, ) poolings = PoolingResponse.model_validate(response.json()) @@ -575,3 +603,24 @@ async def test_pooling(server: RemoteOpenAIServer, model_name: str): assert len(poolings.data) == 1 assert len(poolings.data[0].data) == 11 assert len(poolings.data[0].data[0]) == 384 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("task", ["classify", "token_classify", "plugin"]) +async def test_pooling_not_supported( + server: RemoteOpenAIServer, model_name: str, task: str +): + response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": "test", + "encoding_format": "float", + "task": task, + }, + ) + assert response.json()["error"]["type"] == "BadRequestError" + assert response.json()["error"]["message"].startswith( + f"Task {task} is not supported" + ) diff --git a/tests/entrypoints/pooling/openai/test_rerank.py b/tests/entrypoints/pooling/openai/test_rerank.py index e43148d25fee..edfb3f7cb4dd 100644 --- a/tests/entrypoints/pooling/openai/test_rerank.py +++ b/tests/entrypoints/pooling/openai/test_rerank.py @@ -163,7 +163,25 @@ async def get_outputs(activation): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) 
-async def test_pooling(server: RemoteOpenAIServer, model_name: str): +async def test_pooling_classify(server: RemoteOpenAIServer, model_name: str): + input_text = "This product was excellent and exceeded my expectations" + response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": input_text, + "encoding_format": "float", + "task": "classify", + }, + ) + poolings = PoolingResponse.model_validate(response.json()) + assert len(poolings.data) == 1 + assert len(poolings.data[0].data) == 1 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_pooling_token_classify(server: RemoteOpenAIServer, model_name: str): input_text = ["The chef prepared a delicious meal."] response = requests.post( @@ -176,3 +194,24 @@ async def test_pooling(server: RemoteOpenAIServer, model_name: str): assert len(poolings.data) == 1 assert len(poolings.data[0].data) == 11 assert len(poolings.data[0].data[0]) == 1 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("task", ["embed", "token_embed", "plugin"]) +async def test_pooling_not_supported( + server: RemoteOpenAIServer, model_name: str, task: str +): + response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": "test", + "encoding_format": "float", + "task": task, + }, + ) + assert response.json()["error"]["type"] == "BadRequestError" + assert response.json()["error"]["message"].startswith( + f"Task {task} is not supported" + ) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 632bd741290b..1dc07d908a8d 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1749,12 +1749,7 @@ async def init_app_state( log_error_stack=args.log_error_stack, ) ) - if ( - any( - task in supported_tasks - for task in ["token_embed", "token_classify", "plugin"] - ) - ) + if supported_tasks else None ) state.openai_serving_embedding = ( diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 0778e4d78790..4bfa7ce9a927 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -49,6 +49,7 @@ ) from openai_harmony import Message as OpenAIHarmonyMessage +from vllm.tasks import PoolingTask from vllm.utils.serial_utils import ( EmbedDType, EncodingFormat, @@ -1669,8 +1670,42 @@ def to_pooling_params(self): EmbeddingRequest: TypeAlias = EmbeddingCompletionRequest | EmbeddingChatRequest -PoolingCompletionRequest = EmbeddingCompletionRequest -PoolingChatRequest = EmbeddingChatRequest + +class PoolingCompletionRequest(EmbeddingCompletionRequest): + task: PoolingTask | None = None + activation: bool | None = Field( + default=None, + description="Whether to use activation for classification outputs. " + "If it is a classify or token_classify task, the default is True; " + "for other tasks, this value should be None.", + ) + + def to_pooling_params(self): + return PoolingParams( + truncate_prompt_tokens=self.truncate_prompt_tokens, + dimensions=self.dimensions, + normalize=self.normalize, + activation=self.activation, + ) + + +class PoolingChatRequest(EmbeddingChatRequest): + task: PoolingTask | None = None + activation: bool | None = Field( + default=None, + description="Whether to use activation for classification outputs. 
" + "If it is a classify or token_classify task, the default is True; " + "for other tasks, this value should be None.", + ) + + def to_pooling_params(self): + return PoolingParams( + truncate_prompt_tokens=self.truncate_prompt_tokens, + dimensions=self.dimensions, + normalize=self.normalize, + activation=self.activation, + ) + T = TypeVar("T") @@ -1686,6 +1721,7 @@ class IOProcessorRequest(OpenAIBaseModel, Generic[T]): """ data: T + task: PoolingTask = "plugin" encoding_format: EncodingFormat = "float" embed_dtype: EmbedDType = Field( default="float32", @@ -1749,8 +1785,11 @@ class ScoreRequest(OpenAIBaseModel): ), ) - activation: bool | None = None - + activation: bool | None = Field( + default=None, + description="Whether to use activation for classification outputs. " + "Default is True.", + ) # --8<-- [end:score-extra-params] def to_pooling_params(self): @@ -1783,8 +1822,11 @@ class RerankRequest(OpenAIBaseModel): ), ) - activation: bool | None = None - + activation: bool | None = Field( + default=None, + description="Whether to use activation for classification outputs. " + "Default is True.", + ) # --8<-- [end:rerank-extra-params] def to_pooling_params(self): @@ -1958,8 +2000,11 @@ class ClassificationRequest(OpenAIBaseModel): ), ) - activation: bool | None = None - + activation: bool | None = Field( + default=None, + description="Whether to use activation for classification outputs. " + "Default is True.", + ) # --8<-- [end:classification-extra-params] def to_pooling_params(self): diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index 568896ccbf1b..0eade272111f 100644 --- a/vllm/entrypoints/openai/serving_pooling.py +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -170,15 +170,24 @@ async def create_pooling( pooling_params = request.to_pooling_params() pooling_task: PoolingTask - if "token_embed" in self.supported_tasks: - pooling_task = "token_embed" - elif "token_classify" in self.supported_tasks: - pooling_task = "token_classify" - elif "plugin" in self.supported_tasks: - pooling_task = "plugin" + if request.task is None: + if "token_embed" in self.supported_tasks: + pooling_task = "token_embed" + elif "token_classify" in self.supported_tasks: + pooling_task = "token_classify" + elif "plugin" in self.supported_tasks: + pooling_task = "plugin" + else: + return self.create_error_response( + f"pooling_task must be one of {self.supported_tasks}." + ) else: + pooling_task = request.task + + if pooling_task not in self.supported_tasks: return self.create_error_response( - f"pooling_task must be one of {self.supported_tasks}." + f"Task {pooling_task} is not supported, it" + f" must be one of {self.supported_tasks}." 
) try: From 064346100f5cdc7a64cf657244638c1e3db25381 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 17:44:45 +0800 Subject: [PATCH 03/25] update Signed-off-by: wang.yuqi --- docs/models/pooling_models.md | 68 +++++++++++++++++++++++++++-------- 1 file changed, 53 insertions(+), 15 deletions(-) diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 21b3b8b0b31e..1a89ccdce9a9 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -45,12 +45,12 @@ Each pooling model in vLLM supports one or more of these tasks according to [Pooler.get_supported_tasks][vllm.model_executor.layers.pooler.Pooler.get_supported_tasks], enabling the corresponding APIs: -| Task | APIs | -|------------|--------------------------------------| -| `encode` | `LLM.reward(...)` | -| `embed` | `LLM.embed(...)`, `LLM.score(...)`\* | -| `classify` | `LLM.classify(...)` | -| `score` | `LLM.score(...)` | +| Task | APIs | +|------------------|--------------------------------------| +| `token_classify` | `LLM.reward(...)` | +| `embed` | `LLM.embed(...)`, `LLM.score(...)`\* | +| `classify` | `LLM.classify(...)` | +| `score` | `LLM.score(...)` | \* The `LLM.score(...)` API falls back to `embed` task if the model does not support `score` task. @@ -91,7 +91,7 @@ It is primarily designed for embedding models. ```python from vllm import LLM -llm = LLM(model="intfloat/e5-small", runner="pooling") +llm = LLM(model="intfloat/e5-small") (output,) = llm.embed("Hello, my name is") embeds = output.outputs.embedding @@ -108,7 +108,7 @@ It is primarily designed for classification models. ```python from vllm import LLM -llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", runner="pooling") +llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach") (output,) = llm.classify("Hello, my name is") probs = output.outputs.probs @@ -129,7 +129,7 @@ It is designed for embedding models and cross-encoder models. Embedding models u ```python from vllm import LLM -llm = LLM(model="BAAI/bge-reranker-v2-m3", runner="pooling") +llm = LLM(model="BAAI/bge-reranker-v2-m3") (output,) = llm.score( "What is the capital of France?", "The capital of Brazil is Brasilia.", @@ -144,12 +144,11 @@ A code example can be found here: [examples/offline_inference/basic/score.py](.. ### `LLM.reward` The [reward][vllm.LLM.reward] method is available to all reward models in vLLM. -It returns the extracted hidden states directly. ```python from vllm import LLM -llm = LLM(model="internlm/internlm2-1_8b-reward", runner="pooling", trust_remote_code=True) +llm = LLM(model="internlm/internlm2-1_8b-reward", trust_remote_code=True) (output,) = llm.reward("Hello, my name is") data = output.outputs.data @@ -161,20 +160,22 @@ A code example can be found here: [examples/offline_inference/basic/reward.py](. ### `LLM.encode` The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM. -It returns the extracted hidden states directly. !!! note Please use one of the more specific methods or set the task directly when using `LLM.encode`: - For embeddings, use `LLM.embed(...)` or `pooling_task="embed"`. - For classification logits, use `LLM.classify(...)` or `pooling_task="classify"`. - - For rewards, use `LLM.reward(...)` or `pooling_task="reward"`. - For similarity scores, use `LLM.score(...)`. + - For rewards, use `LLM.reward(...)` or `pooling_task="token_classify"`. + - For token classification, use `pooling_task="token_classify"`. 
+    - For multi-vector retrieval, use `pooling_task="token_embed"`.
+    - For IO Processor Plugins, use `pooling_task="plugin"`.
 
 ```python
 from vllm import LLM
 
-llm = LLM(model="intfloat/e5-small", runner="pooling")
+llm = LLM(model="intfloat/e5-small")
 (output,) = llm.encode("Hello, my name is", pooling_task="embed")
 
 data = output.outputs.data
@@ -185,10 +186,47 @@ print(f"Data: {data!r}")
 
 Our [OpenAI-Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs:
 
-- [Pooling API](../serving/openai_compatible_server.md#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models.
 - [Embeddings API](../serving/openai_compatible_server.md#embeddings-api) is similar to `LLM.embed`, accepting both text and [multi-modal inputs](../features/multimodal_inputs.md) for embedding models.
 - [Classification API](../serving/openai_compatible_server.md#classification-api) is similar to `LLM.classify` and is applicable to sequence classification models.
 - [Score API](../serving/openai_compatible_server.md#score-api) is similar to `LLM.score` for cross-encoder models.
+- [Pooling API](../serving/openai_compatible_server.md#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models.
+
+!!! note
+    Please use one of the more specific APIs or set the `task` field directly when using the [Pooling API](../serving/openai_compatible_server.md#pooling-api):
+
+    - For embeddings, use the [Embeddings API](../serving/openai_compatible_server.md#embeddings-api) or `"task":"embed"`.
+    - For classification logits, use the [Classification API](../serving/openai_compatible_server.md#classification-api) or `"task":"classify"`.
+    - For similarity scores, use the [Score API](../serving/openai_compatible_server.md#score-api).
+    - For rewards, use `"task":"token_classify"`.
+    - For token classification, use `"task":"token_classify"`.
+    - For multi-vector retrieval, use `"task":"token_embed"`.
+    - For IO Processor Plugins, use `"task":"plugin"`.
+
+```python
+# start a supported embedding model server with `vllm serve`, e.g.
+# vllm serve intfloat/e5-small +import requests + +host = "localhost" +port = "8000" +model_name = "intfloat/e5-small" + +api_url = f"http://{host}:{port}/pooling" + +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +prompt = {"model": model_name, "input": prompts, "task": "embed"} + +response = requests.post(api_url, json=prompt) + +for output in response.json()["data"]: + data = output["data"] + print(f"Data: {data!r} (size={len(data)})") +``` ## Matryoshka Embeddings From ce69d7b5ff04cd2729aa54b4894b14a0fd917a82 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 18:13:33 +0800 Subject: [PATCH 04/25] update examples Signed-off-by: wang.yuqi --- docs/design/io_processor_plugins.md | 2 +- examples/offline_inference/pooling/README.md | 12 ++++++++ examples/offline_inference/pooling/ner.py | 2 +- .../{ => pooling}/prithvi_geospatial_mae.py | 0 .../prithvi_geospatial_mae_io_processor.py | 0 examples/online_serving/pooling/README.md | 30 +++++++++++++++++++ .../openai_cross_encoder_score.py | 0 ...enai_cross_encoder_score_for_multimodal.py | 0 .../{ => pooling}/prithvi_geospatial_mae.py | 0 9 files changed, 44 insertions(+), 2 deletions(-) rename examples/offline_inference/{ => pooling}/prithvi_geospatial_mae.py (100%) rename examples/offline_inference/{ => pooling}/prithvi_geospatial_mae_io_processor.py (100%) rename examples/online_serving/{ => pooling}/openai_cross_encoder_score.py (100%) rename examples/online_serving/{ => pooling}/openai_cross_encoder_score_for_multimodal.py (100%) rename examples/online_serving/{ => pooling}/prithvi_geospatial_mae.py (100%) diff --git a/docs/design/io_processor_plugins.md b/docs/design/io_processor_plugins.md index fb64a7bb9c8f..2f4b17f191a5 100644 --- a/docs/design/io_processor_plugins.md +++ b/docs/design/io_processor_plugins.md @@ -79,7 +79,7 @@ The `post_process*` methods take `PoolingRequestOutput` objects as input and gen The `validate_or_generate_params` method is used for validating with the plugin any `SamplingParameters`/`PoolingParameters` received with the user request, or to generate new ones if none are specified. The function always returns the validated/generated parameters. The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/pooling` serving endpoint is available here [vllm/entrypoints/openai/serving_pooling.py](../../vllm/entrypoints/openai/serving_pooling.py). -An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/IBM/terratorch/tree/main/terratorch/vllm/plugins/segmentation). Please, also refer to our online ([examples/online_serving/prithvi_geospatial_mae.py](../../examples/online_serving/prithvi_geospatial_mae.py)) and offline ([examples/offline_inference/prithvi_geospatial_mae_io_processor.py](../../examples/offline_inference/prithvi_geospatial_mae_io_processor.py)) inference examples. +An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/IBM/terratorch/tree/main/terratorch/vllm/plugins/segmentation). 
Please, also refer to our online ([examples/online_serving/pooling/prithvi_geospatial_mae.py](../../examples/online_serving/pooling/prithvi_geospatial_mae.py)) and offline ([examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py](../../examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py)) inference examples. ## Using an IO Processor plugin diff --git a/examples/offline_inference/pooling/README.md b/examples/offline_inference/pooling/README.md index cd9717122b16..e10606676f06 100644 --- a/examples/offline_inference/pooling/README.md +++ b/examples/offline_inference/pooling/README.md @@ -38,6 +38,18 @@ python examples/offline_inference/pooling/multi_vector_retrieval.py python examples/offline_inference/pooling/ner.py ``` +## prithvi geospatial mae usage + +```bash +python examples/offline_inference/pooling/prithvi_geospatial_mae.py +``` + +## IO Processor Plugins for prithvi geospatial mae usage + +```bash +python examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py +``` + ## Qwen3 reranker usage ```bash diff --git a/examples/offline_inference/pooling/ner.py b/examples/offline_inference/pooling/ner.py index b2dffdd6c5ee..34c80e7ccffd 100644 --- a/examples/offline_inference/pooling/ner.py +++ b/examples/offline_inference/pooling/ner.py @@ -33,7 +33,7 @@ def main(args: Namespace): label_map = llm.llm_engine.vllm_config.model_config.hf_config.id2label # Run inference - outputs = llm.encode(prompts) + outputs = llm.encode(prompts, pooling_task="token_classify") for prompt, output in zip(prompts, outputs): logits = output.outputs.data diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/pooling/prithvi_geospatial_mae.py similarity index 100% rename from examples/offline_inference/prithvi_geospatial_mae.py rename to examples/offline_inference/pooling/prithvi_geospatial_mae.py diff --git a/examples/offline_inference/prithvi_geospatial_mae_io_processor.py b/examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py similarity index 100% rename from examples/offline_inference/prithvi_geospatial_mae_io_processor.py rename to examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py diff --git a/examples/online_serving/pooling/README.md b/examples/online_serving/pooling/README.md index 3b6da20d5f0f..5080716d97ca 100644 --- a/examples/online_serving/pooling/README.md +++ b/examples/online_serving/pooling/README.md @@ -3,36 +3,42 @@ ## Cohere rerank usage ```bash +# vllm serve BAAI/bge-reranker-base python examples/online_serving/pooling/cohere_rerank_client.py ``` ## Embedding requests base64 encoding_format usage ```bash +# vllm serve intfloat/e5-small python examples/online_serving/pooling/embedding_requests_base64_client.py ``` ## Embedding requests bytes encoding_format usage ```bash +# vllm serve intfloat/e5-small python examples/online_serving/pooling/embedding_requests_bytes_client.py ``` ## Jinaai rerank usage ```bash +# vllm serve BAAI/bge-reranker-base python examples/online_serving/pooling/jinaai_rerank_client.py ``` ## Multi vector retrieval usage ```bash +# vllm serve BAAI/bge-m3 python examples/online_serving/pooling/multi_vector_retrieval_client.py ``` ## Named Entity Recognition (NER) usage ```bash +# vllm serve boltuix/NeuroBERT-NER python examples/online_serving/pooling/ner_client.py ``` @@ -45,23 +51,47 @@ python examples/online_serving/pooling/openai_chat_embedding_client_for_multimod ## Openai classification usage ```bash +# vllm serve 
jason9693/Qwen2.5-1.5B-apeach python examples/online_serving/pooling/openai_classification_client.py ``` +## Openai cross_encoder score usage + +```bash +# vllm serve BAAI/bge-reranker-v2-m3 +python examples/online_serving/pooling/openai_cross_encoder_score.py +``` + +## Openai cross_encoder score for multimodal usage + +```bash +# vllm serve jinaai/jina-reranker-m0 +python examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py +``` + ## Openai embedding usage ```bash +# vllm serve intfloat/e5-small python examples/online_serving/pooling/openai_embedding_client.py ``` ## Openai embedding matryoshka dimensions usage ```bash +# vllm serve jinaai/jina-embeddings-v3 --trust-remote-code python examples/online_serving/pooling/openai_embedding_matryoshka_fy.py ``` ## Openai pooling usage ```bash +# vllm serve internlm/internlm2-1_8b-reward --trust-remote-code python examples/online_serving/pooling/openai_pooling_client.py ``` + +## Online prithvi geospatial mae usage + +```bash +python examples/online_serving/pooling/prithvi_geospatial_mae.py +``` \ No newline at end of file diff --git a/examples/online_serving/openai_cross_encoder_score.py b/examples/online_serving/pooling/openai_cross_encoder_score.py similarity index 100% rename from examples/online_serving/openai_cross_encoder_score.py rename to examples/online_serving/pooling/openai_cross_encoder_score.py diff --git a/examples/online_serving/openai_cross_encoder_score_for_multimodal.py b/examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py similarity index 100% rename from examples/online_serving/openai_cross_encoder_score_for_multimodal.py rename to examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py diff --git a/examples/online_serving/prithvi_geospatial_mae.py b/examples/online_serving/pooling/prithvi_geospatial_mae.py similarity index 100% rename from examples/online_serving/prithvi_geospatial_mae.py rename to examples/online_serving/pooling/prithvi_geospatial_mae.py From f9d85cf3f26f7d39ae883b542fd694b450aa76c1 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 18:22:04 +0800 Subject: [PATCH 05/25] update examples Signed-off-by: wang.yuqi --- examples/online_serving/pooling/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/online_serving/pooling/README.md b/examples/online_serving/pooling/README.md index 5080716d97ca..634fa049b048 100644 --- a/examples/online_serving/pooling/README.md +++ b/examples/online_serving/pooling/README.md @@ -94,4 +94,4 @@ python examples/online_serving/pooling/openai_pooling_client.py ```bash python examples/online_serving/pooling/prithvi_geospatial_mae.py -``` \ No newline at end of file +``` From 1ea309d12568755305c65504def4347747da5e30 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 18:31:51 +0800 Subject: [PATCH 06/25] Update vllm/entrypoints/openai/api_server.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Signed-off-by: wang.yuqi --- vllm/entrypoints/openai/api_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 1dc07d908a8d..eaeb6906f499 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1749,7 +1749,7 @@ async def init_app_state( log_error_stack=args.log_error_stack, ) ) - if supported_tasks + if any(task in POOLING_TASKS for task in supported_tasks) else 
None ) state.openai_serving_embedding = ( From cdabfc0d6ebdf43b445781ab8a1fa527ac77e956 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 18:43:52 +0800 Subject: [PATCH 07/25] fix Signed-off-by: wang.yuqi --- vllm/entrypoints/openai/api_server.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index eaeb6906f499..bc7aadc31d3c 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -107,6 +107,7 @@ ) from vllm.logger import init_logger from vllm.reasoning import ReasoningParserManager +from vllm.tasks import POOLING_TASKS from vllm.usage.usage_lib import UsageContext from vllm.utils import Device from vllm.utils.argparse_utils import FlexibleArgumentParser From 986de1ad7727bfc345f2978d19ff11d7eb1960e7 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 19:04:54 +0800 Subject: [PATCH 08/25] Deprecated Feature Signed-off-by: wang.yuqi --- docs/models/pooling_models.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 1a89ccdce9a9..a276eefb6845 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -303,3 +303,13 @@ Expected output: ``` An OpenAI client example can be found here: [examples/online_serving/pooling/openai_embedding_matryoshka_fy.py](../../examples/online_serving/pooling/openai_embedding_matryoshka_fy.py) + +## Deprecated Feature + +### Encode task +Split the encode task into two more specific token wise tasks: token_embed and token_classify: +- token_embed is the same as embed, using normalize as activation. +- token_classify is the same as classify, default using softmax as activation. + +### Remove softmax from PoolingParams +Remove softmax from PoolingParams, prefer using activation, since we actually allow classify and token_classify to use any activation function. 
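The preceding patches wire the new `task` field through the `/pooling` endpoint. A minimal sketch of how a client exercises it, mirroring the tests in this series and assuming a classification model such as `jason9693/Qwen2.5-1.5B-apeach` is already being served on the default host and port:

```python
# Sketch of the new `task` field on the /pooling endpoint, assuming a
# classification model is already being served, e.g.:
#   vllm serve jason9693/Qwen2.5-1.5B-apeach
import requests

base_url = "http://localhost:8000"
model = "jason9693/Qwen2.5-1.5B-apeach"

# "task": "classify" returns one probability vector per input prompt.
response = requests.post(
    f"{base_url}/pooling",
    json={
        "model": model,
        "input": "This product was excellent and exceeded my expectations",
        "encoding_format": "float",
        "task": "classify",
    },
)
print(response.json()["data"][0]["data"])  # one float per label

# A task the served model does not support is rejected with a
# BadRequestError whose message starts with "Task ... is not supported".
response = requests.post(
    f"{base_url}/pooling",
    json={"model": model, "input": "test", "task": "embed"},
)
print(response.json()["error"]["type"])  # BadRequestError
```

The error path here matches the `test_pooling_not_supported` cases added above.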
From 06b191559fda4c05d1cb48730d1ef5c0b91dfabd Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 21:19:53 +0800 Subject: [PATCH 09/25] Update docs/models/pooling_models.md Co-authored-by: Cyrus Leung Signed-off-by: wang.yuqi --- docs/models/pooling_models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index a276eefb6845..ee4f18187a2b 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -304,7 +304,7 @@ Expected output: An OpenAI client example can be found here: [examples/online_serving/pooling/openai_embedding_matryoshka_fy.py](../../examples/online_serving/pooling/openai_embedding_matryoshka_fy.py) -## Deprecated Feature +## Deprecated Features ### Encode task Split the encode task into two more specific token wise tasks: token_embed and token_classify: From 267d03780da04a5bc7c23417200309d0e61bfac0 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 21:20:17 +0800 Subject: [PATCH 10/25] Update docs/models/pooling_models.md Co-authored-by: Cyrus Leung Signed-off-by: wang.yuqi --- docs/models/pooling_models.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index ee4f18187a2b..2933c8440b9f 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -307,9 +307,11 @@ An OpenAI client example can be found here: [examples/online_serving/pooling/ope ## Deprecated Features ### Encode task -Split the encode task into two more specific token wise tasks: token_embed and token_classify: -- token_embed is the same as embed, using normalize as activation. -- token_classify is the same as classify, default using softmax as activation. + +We have split the `encode` task into two more specific token wise tasks: `token_embed` and `token_classify`: +- `token_embed` is the same as embed, using normalize as activation. +- `token_classify` is the same as classify, default using softmax as activation. ### Remove softmax from PoolingParams -Remove softmax from PoolingParams, prefer using activation, since we actually allow classify and token_classify to use any activation function. + +We are going to remove `softmax` from `PoolingParams`. Instead, you should set `activation`, since we actually allow `classify` and `token_classify` to use any activation function. 
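In the offline API, the replacement for the old `encode` behavior is the `pooling_task` argument of `LLM.encode`, as in the NER example updated earlier in this series. A rough sketch, assuming a token-classification model such as `boltuix/NeuroBERT-NER` from the examples:

```python
# Sketch of the token-wise tasks that replace the old `encode` task.
# Assumes a token-classification model, e.g. boltuix/NeuroBERT-NER
# from the NER example.
from vllm import LLM

llm = LLM(model="boltuix/NeuroBERT-NER", runner="pooling")

# `token_classify` yields one row of label scores per input token;
# like `classify`, softmax is applied by default.
(output,) = llm.encode("Hello, my name is", pooling_task="token_classify")
print(output.outputs.data.shape)  # (num_tokens, num_labels)
```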
From 3b13620b49ad481269c20b336d0ac3da5e1e31a5 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 21:20:51 +0800 Subject: [PATCH 11/25] Update examples/offline_inference/pooling/README.md Co-authored-by: Cyrus Leung Signed-off-by: wang.yuqi --- examples/offline_inference/pooling/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/offline_inference/pooling/README.md b/examples/offline_inference/pooling/README.md index e10606676f06..9deca2753090 100644 --- a/examples/offline_inference/pooling/README.md +++ b/examples/offline_inference/pooling/README.md @@ -38,7 +38,7 @@ python examples/offline_inference/pooling/multi_vector_retrieval.py python examples/offline_inference/pooling/ner.py ``` -## prithvi geospatial mae usage +## Prithvi Geospatial MAE usage ```bash python examples/offline_inference/pooling/prithvi_geospatial_mae.py From bb3a6f8abef0e21e7785575cdd0d28a7f7d00efc Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 21:21:08 +0800 Subject: [PATCH 12/25] Update examples/offline_inference/pooling/README.md Co-authored-by: Cyrus Leung Signed-off-by: wang.yuqi --- examples/offline_inference/pooling/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/offline_inference/pooling/README.md b/examples/offline_inference/pooling/README.md index 9deca2753090..ad78be38716b 100644 --- a/examples/offline_inference/pooling/README.md +++ b/examples/offline_inference/pooling/README.md @@ -44,7 +44,7 @@ python examples/offline_inference/pooling/ner.py python examples/offline_inference/pooling/prithvi_geospatial_mae.py ``` -## IO Processor Plugins for prithvi geospatial mae usage +## IO Processor Plugins for Prithvi Geospatial MAE ```bash python examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py From 351d526d762ac30d5c05a925a8793f6f377cd28b Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 22:11:56 +0800 Subject: [PATCH 13/25] Update examples/online_serving/pooling/README.md Co-authored-by: Cyrus Leung Signed-off-by: wang.yuqi --- examples/online_serving/pooling/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/online_serving/pooling/README.md b/examples/online_serving/pooling/README.md index 634fa049b048..6dc0ba6816ca 100644 --- a/examples/online_serving/pooling/README.md +++ b/examples/online_serving/pooling/README.md @@ -90,7 +90,7 @@ python examples/online_serving/pooling/openai_embedding_matryoshka_fy.py python examples/online_serving/pooling/openai_pooling_client.py ``` -## Online prithvi geospatial mae usage +## Online Prithvi Geospatial MAE usage ```bash python examples/online_serving/pooling/prithvi_geospatial_mae.py From 12db9e36b0f9e75df4a299fc27e0046ff57dbb48 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 22:12:06 +0800 Subject: [PATCH 14/25] Update examples/online_serving/pooling/README.md Co-authored-by: Cyrus Leung Signed-off-by: wang.yuqi --- examples/online_serving/pooling/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/online_serving/pooling/README.md b/examples/online_serving/pooling/README.md index 6dc0ba6816ca..8e2878ae4573 100644 --- a/examples/online_serving/pooling/README.md +++ b/examples/online_serving/pooling/README.md @@ -62,7 +62,7 @@ python examples/online_serving/pooling/openai_classification_client.py python examples/online_serving/pooling/openai_cross_encoder_score.py ``` -## Openai cross_encoder score for multimodal usage +## OpenAI cross_encoder 
score for multimodal usage ```bash # vllm serve jinaai/jina-reranker-m0 From 4938636145632101e067cbd497531439dc0697ea Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 22:17:19 +0800 Subject: [PATCH 15/25] Pooling Tasks Signed-off-by: wang.yuqi --- docs/models/pooling_models.md | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 2933c8440b9f..f79e9f44bbfa 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -45,12 +45,14 @@ Each pooling model in vLLM supports one or more of these tasks according to [Pooler.get_supported_tasks][vllm.model_executor.layers.pooler.Pooler.get_supported_tasks], enabling the corresponding APIs: -| Task | APIs | -|------------------|--------------------------------------| -| `token_classify` | `LLM.reward(...)` | -| `embed` | `LLM.embed(...)`, `LLM.score(...)`\* | -| `classify` | `LLM.classify(...)` | -| `score` | `LLM.score(...)` | +| Task | APIs | +|------------------|-------------------------------------------------------------------------------| +| `embed` | `LLM.embed(...)`, `LLM.score(...)`\*, `LLM.encode(..., pooling_task="embed")` | +| `classify` | `LLM.classify(...)`, `LLM.encode(..., pooling_task="classify")` | +| `score` | `LLM.score(...)` | +| `token_classify` | `LLM.reward(...)`, `LLM.encode(..., pooling_task="token_classify")` | +| `token_embed` | `LLM.encode(..., pooling_task="token_embed")` | +| `plugin` | `LLM.encode(..., pooling_task="plugin")` | \* The `LLM.score(...)` API falls back to `embed` task if the model does not support `score` task. From a7ba610b04a4bf878f120fc4497b81a1ae74faa7 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 22:19:03 +0800 Subject: [PATCH 16/25] + runner="pooling" Signed-off-by: wang.yuqi --- docs/models/pooling_models.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index f79e9f44bbfa..f43f19f48fa8 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -93,7 +93,7 @@ It is primarily designed for embedding models. ```python from vllm import LLM -llm = LLM(model="intfloat/e5-small") +llm = LLM(model="intfloat/e5-small", runner="pooling") (output,) = llm.embed("Hello, my name is") embeds = output.outputs.embedding @@ -110,7 +110,7 @@ It is primarily designed for classification models. ```python from vllm import LLM -llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach") +llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", runner="pooling") (output,) = llm.classify("Hello, my name is") probs = output.outputs.probs @@ -131,7 +131,7 @@ It is designed for embedding models and cross-encoder models. Embedding models u ```python from vllm import LLM -llm = LLM(model="BAAI/bge-reranker-v2-m3") +llm = LLM(model="BAAI/bge-reranker-v2-m3", runner="pooling") (output,) = llm.score( "What is the capital of France?", "The capital of Brazil is Brasilia.", @@ -150,7 +150,7 @@ The [reward][vllm.LLM.reward] method is available to all reward models in vLLM. ```python from vllm import LLM -llm = LLM(model="internlm/internlm2-1_8b-reward", trust_remote_code=True) +llm = LLM(model="internlm/internlm2-1_8b-reward", trust_remote_code=True, runner="pooling") (output,) = llm.reward("Hello, my name is") data = output.outputs.data @@ -178,7 +178,7 @@ The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM. 
from vllm import LLM llm = LLM(model="intfloat/e5-small") -(output,) = llm.encode("Hello, my name is", pooling_task="embed") +(output,) = llm.encode("Hello, my name is", pooling_task="embed", runner="pooling") data = output.outputs.data print(f"Data: {data!r}") From 4188194d2616b66498b3e7d0522bd54629efae28 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 22:22:14 +0800 Subject: [PATCH 17/25] Openai -> OpenAI Signed-off-by: wang.yuqi --- examples/online_serving/pooling/README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/online_serving/pooling/README.md b/examples/online_serving/pooling/README.md index 8e2878ae4573..b76ad21f0481 100644 --- a/examples/online_serving/pooling/README.md +++ b/examples/online_serving/pooling/README.md @@ -42,20 +42,20 @@ python examples/online_serving/pooling/multi_vector_retrieval_client.py python examples/online_serving/pooling/ner_client.py ``` -## Openai chat embedding for multimodal usage +## OpenAI chat embedding for multimodal usage ```bash python examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py ``` -## Openai classification usage +## OpenAI classification usage ```bash # vllm serve jason9693/Qwen2.5-1.5B-apeach python examples/online_serving/pooling/openai_classification_client.py ``` -## Openai cross_encoder score usage +## OpenAI cross_encoder score usage ```bash # vllm serve BAAI/bge-reranker-v2-m3 @@ -69,21 +69,21 @@ python examples/online_serving/pooling/openai_cross_encoder_score.py python examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py ``` -## Openai embedding usage +## OpenAI embedding usage ```bash # vllm serve intfloat/e5-small python examples/online_serving/pooling/openai_embedding_client.py ``` -## Openai embedding matryoshka dimensions usage +## OpenAI embedding matryoshka dimensions usage ```bash # vllm serve jinaai/jina-embeddings-v3 --trust-remote-code python examples/online_serving/pooling/openai_embedding_matryoshka_fy.py ``` -## Openai pooling usage +## OpenAI pooling usage ```bash # vllm serve internlm/internlm2-1_8b-reward --trust-remote-code From 86ce4c43590758e1e0e2dbfe9eebeac737c54e6d Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 22:31:47 +0800 Subject: [PATCH 18/25] activation -> use_activation Signed-off-by: wang.yuqi --- docs/models/pooling_models.md | 2 +- tests/entrypoints/pooling/llm/test_classify.py | 10 +++++----- tests/entrypoints/pooling/llm/test_score.py | 10 +++++----- .../test_pooler_config_init_behaviour.py | 4 ++-- tests/test_pooling_params.py | 14 +++++++------- vllm/model_executor/layers/pooler.py | 4 ++-- vllm/model_executor/models/config.py | 4 ++-- vllm/pooling_params.py | 18 +++++++++--------- 8 files changed, 33 insertions(+), 33 deletions(-) diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index f43f19f48fa8..b0a1a15d25ec 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -316,4 +316,4 @@ We have split the `encode` task into two more specific token wise tasks: `token_ ### Remove softmax from PoolingParams -We are going to remove `softmax` from `PoolingParams`. Instead, you should set `activation`, since we actually allow `classify` and `token_classify` to use any activation function. +We are going to remove `softmax` from `PoolingParams`. Instead, you should set `use_activation`, since we actually allow `classify` and `token_classify` to use any activation function. 
diff --git a/tests/entrypoints/pooling/llm/test_classify.py b/tests/entrypoints/pooling/llm/test_classify.py index 96f634ee0a8c..d5b781097b44 100644 --- a/tests/entrypoints/pooling/llm/test_classify.py +++ b/tests/entrypoints/pooling/llm/test_classify.py @@ -37,15 +37,15 @@ def llm(): @pytest.mark.skip_global_cleanup def test_pooling_params(llm: LLM): - def get_outputs(activation): + def get_outputs(use_activation): outputs = llm.classify( - prompts, pooling_params=PoolingParams(activation=activation), use_tqdm=False + prompts, pooling_params=PoolingParams(use_activation=use_activation), use_tqdm=False ) return torch.tensor([x.outputs.probs for x in outputs]) - default = get_outputs(activation=None) - w_activation = get_outputs(activation=True) - wo_activation = get_outputs(activation=False) + default = get_outputs(use_activation=None) + w_activation = get_outputs(use_activation=True) + wo_activation = get_outputs(use_activation=False) assert torch.allclose(default, w_activation, atol=1e-2), ( "Default should use activation." diff --git a/tests/entrypoints/pooling/llm/test_score.py b/tests/entrypoints/pooling/llm/test_score.py index 2df973dd7863..b69c6a47c191 100644 --- a/tests/entrypoints/pooling/llm/test_score.py +++ b/tests/entrypoints/pooling/llm/test_score.py @@ -34,21 +34,21 @@ def llm(): def test_pooling_params(llm: LLM): - def get_outputs(activation): + def get_outputs(use_activation): text_1 = "What is the capital of France?" text_2 = "The capital of France is Paris." outputs = llm.score( text_1, text_2, - pooling_params=PoolingParams(activation=activation), + pooling_params=PoolingParams(use_activation=use_activation), use_tqdm=False, ) return torch.tensor([x.outputs.score for x in outputs]) - default = get_outputs(activation=None) - w_activation = get_outputs(activation=True) - wo_activation = get_outputs(activation=False) + default = get_outputs(use_activation=None) + w_activation = get_outputs(use_activation=True) + wo_activation = get_outputs(use_activation=False) assert torch.allclose(default, w_activation, atol=1e-2), ( "Default should use activation." 
diff --git a/tests/models/language/pooling/test_pooler_config_init_behaviour.py b/tests/models/language/pooling/test_pooler_config_init_behaviour.py index 55663ee3f1b4..61e082f2789f 100644 --- a/tests/models/language/pooling/test_pooler_config_init_behaviour.py +++ b/tests/models/language/pooling/test_pooler_config_init_behaviour.py @@ -24,7 +24,7 @@ def test_classify_models_using_activation( model, max_model_len=512, dtype=dtype, - pooler_config=PoolerConfig(activation=False), + pooler_config=PoolerConfig(use_activation=False), ) as vllm_model: wo_activation_out = vllm_model.classify(example_prompts) @@ -32,7 +32,7 @@ def test_classify_models_using_activation( model, max_model_len=512, dtype=dtype, - pooler_config=PoolerConfig(activation=True), + pooler_config=PoolerConfig(use_activation=True), ) as vllm_model: w_activation_out = vllm_model.classify(example_prompts) diff --git a/tests/test_pooling_params.py b/tests/test_pooling_params.py index e73d7efc1483..7812562c8948 100644 --- a/tests/test_pooling_params.py +++ b/tests/test_pooling_params.py @@ -17,7 +17,7 @@ ), ] -classify_parameters = ["activation"] +classify_parameters = ["use_activation"] embed_parameters = ["dimensions", "normalize"] step_pooling_parameters = ["step_tag_id", "returned_token_ids"] @@ -88,13 +88,13 @@ def test_embed_dimensions(model_info: EmbedModelInfo): def test_classify(task): model_config = MockModelConfig(pooler_config=PoolerConfig(pooling_type="CLS")) - pooling_params = PoolingParams(activation=None) + pooling_params = PoolingParams(use_activation=None) pooling_params.verify(task=task, model_config=model_config) - pooling_params = PoolingParams(activation=True) + pooling_params = PoolingParams(use_activation=True) pooling_params.verify(task=task, model_config=model_config) - pooling_params = PoolingParams(activation=False) + pooling_params = PoolingParams(use_activation=False) pooling_params.verify(task=task, model_config=model_config) invalid_parameters = embed_parameters + step_pooling_parameters @@ -137,13 +137,13 @@ def test_token_classify(pooling_type: str): pooler_config=PoolerConfig(pooling_type=pooling_type) ) - pooling_params = PoolingParams(activation=None) + pooling_params = PoolingParams(use_activation=None) pooling_params.verify(task=task, model_config=model_config) - pooling_params = PoolingParams(activation=True) + pooling_params = PoolingParams(use_activation=True) pooling_params.verify(task=task, model_config=model_config) - pooling_params = PoolingParams(activation=False) + pooling_params = PoolingParams(use_activation=False) pooling_params.verify(task=task, model_config=model_config) invalid_parameters = embed_parameters diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index 145f18f23566..7dd02e32ff21 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -607,7 +607,7 @@ def forward( pooled_data -= self.logit_bias pooling_params = get_pooling_params(pooling_metadata) - flags = [p.activation for p in pooling_params] + flags = [p.use_activation for p in pooling_params] if len(set(flags)) == 1: scores = self.act_fn(pooled_data) if flags[0] else pooled_data @@ -681,7 +681,7 @@ def forward( if self.logit_bias is not None: scores -= self.logit_bias - if pooling_param.activation: + if pooling_param.use_activation: scores = self.act_fn(scores) # scores shape: [n_token, num_labels] diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 493b74bddda7..d1af1b6c802b 100644 --- 
a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -53,8 +53,8 @@ class JambaForSequenceClassificationConfig(VerifyAndUpdateConfig): @staticmethod def verify_and_update_config(vllm_config: "VllmConfig") -> None: pooler_config = vllm_config.model_config.pooler_config - if pooler_config.activation is None: - pooler_config.activation = False + if pooler_config.use_activation is None: + pooler_config.use_activation = False class JinaRobertaModelConfig(VerifyAndUpdateConfig): diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index 090d92414465..0fe6b7c5f301 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -28,7 +28,7 @@ class PoolingParams( normalize: Whether to normalize the embeddings outputs. dimensions: Reduce the dimensions of embeddings if model support matryoshka representation. - activation: Whether to apply activation function to + use_activation: Whether to apply activation function to the classification outputs. """ @@ -44,7 +44,7 @@ class PoolingParams( ## for classification, scoring and rerank # --8<-- [start:classification-pooling-params] - activation: bool | None = None + use_activation: bool | None = None # --8<-- [end:classification-pooling-params] ## for step pooling models @@ -59,16 +59,16 @@ class PoolingParams( @property def all_parameters(self) -> list[str]: - return ["dimensions", "normalize", "activation"] + return ["dimensions", "normalize", "use_activation"] @property def valid_parameters(self): return { "embed": ["dimensions", "normalize"], - "classify": ["activation"], - "score": ["activation"], + "classify": ["use_activation"], + "score": ["use_activation"], "token_embed": ["dimensions", "normalize"], - "token_classify": ["activation"], + "token_classify": ["use_activation"], } def clone(self) -> "PoolingParams": @@ -168,8 +168,8 @@ def _set_default_parameters(self, model_config: Optional["ModelConfig"]): raise ValueError("Dimensions must be greater than 0") elif self.task in ["classify", "score", "token_classify"]: - if self.activation is None: - self.activation = True + if self.use_activation is None: + self.use_activation = True else: raise ValueError(f"Unknown pooling task: {self.task}") @@ -197,7 +197,7 @@ def __repr__(self) -> str: f"task={self.task}, " f"normalize={self.normalize}, " f"dimensions={self.dimensions}, " - f"activation={self.activation}, " + f"use_activation={self.use_activation}, " f"step_tag_id={self.step_tag_id}, " f"returned_token_ids={self.returned_token_ids}, " f"requires_token_ids={self.requires_token_ids}, " From 44c7d8ab9e86a6759b58b4ee34c44a8100b7adad Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 22:34:51 +0800 Subject: [PATCH 19/25] fix Signed-off-by: wang.yuqi --- docs/models/pooling_models.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index b0a1a15d25ec..14cab221adba 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -311,6 +311,7 @@ An OpenAI client example can be found here: [examples/online_serving/pooling/ope ### Encode task We have split the `encode` task into two more specific token wise tasks: `token_embed` and `token_classify`: + - `token_embed` is the same as embed, using normalize as activation. - `token_classify` is the same as classify, default using softmax as activation. 
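The renamed flag is passed the same way the old one was. A short sketch against the updated `PoolingParams`, assuming the `jason9693/Qwen2.5-1.5B-apeach` classifier used in the tests:

```python
# Sketch of the renamed flag: `use_activation` replaces `activation`
# (and the deprecated `softmax`) on PoolingParams. Assumes the
# jason9693/Qwen2.5-1.5B-apeach classifier used in the tests.
from vllm import LLM
from vllm.pooling_params import PoolingParams

llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", runner="pooling")

# use_activation=False returns raw logits; leaving it as None applies
# the activation function, same as use_activation=True.
(output,) = llm.classify(
    "Hello, my name is",
    pooling_params=PoolingParams(use_activation=False),
)
print(output.outputs.probs)  # raw logits when use_activation=False
```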
From d46428a4282302d2746046ee35aee0d18f08a30a Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 22:37:33 +0800 Subject: [PATCH 20/25] fix Signed-off-by: wang.yuqi --- tests/entrypoints/pooling/llm/test_classify.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/entrypoints/pooling/llm/test_classify.py b/tests/entrypoints/pooling/llm/test_classify.py index d5b781097b44..1063c3b6b755 100644 --- a/tests/entrypoints/pooling/llm/test_classify.py +++ b/tests/entrypoints/pooling/llm/test_classify.py @@ -39,7 +39,9 @@ def llm(): def test_pooling_params(llm: LLM): def get_outputs(use_activation): outputs = llm.classify( - prompts, pooling_params=PoolingParams(use_activation=use_activation), use_tqdm=False + prompts, + pooling_params=PoolingParams(use_activation=use_activation), + use_tqdm=False, ) return torch.tensor([x.outputs.probs for x in outputs]) From 90df794a0ed0f07a4282748a4422311b25da4a54 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 22:49:40 +0800 Subject: [PATCH 21/25] activation -> use_activation Signed-off-by: wang.yuqi --- docs/models/pooling_models.md | 6 +++--- tests/entrypoints/pooling/llm/test_reward.py | 12 ++++++----- .../pooling/openai/test_classification.py | 16 +++++++++------ .../entrypoints/pooling/openai/test_rerank.py | 12 +++++------ .../entrypoints/pooling/openai/test_score.py | 16 +++++++-------- vllm/entrypoints/openai/protocol.py | 20 +++++++++---------- 6 files changed, 44 insertions(+), 38 deletions(-) diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 14cab221adba..5c9180c69c4b 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -150,7 +150,7 @@ The [reward][vllm.LLM.reward] method is available to all reward models in vLLM. ```python from vllm import LLM -llm = LLM(model="internlm/internlm2-1_8b-reward", trust_remote_code=True, runner="pooling") +llm = LLM(model="internlm/internlm2-1_8b-reward", runner="pooling", trust_remote_code=True) (output,) = llm.reward("Hello, my name is") data = output.outputs.data @@ -177,8 +177,8 @@ The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM. ```python from vllm import LLM -llm = LLM(model="intfloat/e5-small") -(output,) = llm.encode("Hello, my name is", pooling_task="embed", runner="pooling") +llm = LLM(model="intfloat/e5-small", runner="pooling") +(output,) = llm.encode("Hello, my name is", pooling_task="embed") data = output.outputs.data print(f"Data: {data!r}") diff --git a/tests/entrypoints/pooling/llm/test_reward.py b/tests/entrypoints/pooling/llm/test_reward.py index 81058dbad891..0255704cecd9 100644 --- a/tests/entrypoints/pooling/llm/test_reward.py +++ b/tests/entrypoints/pooling/llm/test_reward.py @@ -37,15 +37,17 @@ def llm(): def test_pooling_params(llm: LLM): - def get_outputs(activation): + def get_outputs(use_activation): outputs = llm.reward( - prompts, pooling_params=PoolingParams(activation=activation), use_tqdm=False + prompts, + pooling_params=PoolingParams(use_activation=use_activation), + use_tqdm=False, ) return torch.cat([x.outputs.data for x in outputs]) - default = get_outputs(activation=None) - w_activation = get_outputs(activation=True) - wo_activation = get_outputs(activation=False) + default = get_outputs(use_activation=None) + w_activation = get_outputs(use_activation=True) + wo_activation = get_outputs(use_activation=False) assert torch.allclose(default, w_activation, atol=1e-2), ( "Default should use activation." 
diff --git a/tests/entrypoints/pooling/openai/test_classification.py b/tests/entrypoints/pooling/openai/test_classification.py index 2bffe5dabedc..671bb948780a 100644 --- a/tests/entrypoints/pooling/openai/test_classification.py +++ b/tests/entrypoints/pooling/openai/test_classification.py @@ -163,20 +163,24 @@ async def test_invocations(server: RemoteOpenAIServer): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_activation(server: RemoteOpenAIServer, model_name: str): +async def test_use_activation(server: RemoteOpenAIServer, model_name: str): input_text = ["This product was excellent and exceeded my expectations"] - async def get_outputs(activation): + async def get_outputs(use_activation): response = requests.post( server.url_for("classify"), - json={"model": model_name, "input": input_text, "activation": activation}, + json={ + "model": model_name, + "input": input_text, + "use_activation": use_activation, + }, ) outputs = response.json() return torch.tensor([x["probs"] for x in outputs["data"]]) - default = await get_outputs(activation=None) - w_activation = await get_outputs(activation=True) - wo_activation = await get_outputs(activation=False) + default = await get_outputs(use_activation=None) + w_activation = await get_outputs(use_activation=True) + wo_activation = await get_outputs(use_activation=False) assert torch.allclose(default, w_activation, atol=1e-2), ( "Default should use activation." diff --git a/tests/entrypoints/pooling/openai/test_rerank.py b/tests/entrypoints/pooling/openai/test_rerank.py index edfb3f7cb4dd..1d85190c12a1 100644 --- a/tests/entrypoints/pooling/openai/test_rerank.py +++ b/tests/entrypoints/pooling/openai/test_rerank.py @@ -125,8 +125,8 @@ def test_invocations(server: RemoteOpenAIServer): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_activation(server: RemoteOpenAIServer, model_name: str): - async def get_outputs(activation): +async def test_use_activation(server: RemoteOpenAIServer, model_name: str): + async def get_outputs(use_activation): query = "What is the capital of France?" documents = [ "The capital of Brazil is Brasilia.", @@ -139,16 +139,16 @@ async def get_outputs(activation): "model": model_name, "query": query, "documents": documents, - "activation": activation, + "use_activation": use_activation, }, ) outputs = response.json() return torch.tensor([x["relevance_score"] for x in outputs["results"]]) - default = await get_outputs(activation=None) - w_activation = await get_outputs(activation=True) - wo_activation = await get_outputs(activation=False) + default = await get_outputs(use_activation=None) + w_activation = await get_outputs(use_activation=True) + wo_activation = await get_outputs(use_activation=False) assert torch.allclose(default, w_activation, atol=1e-2), ( "Default should use activation." 
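The same flag at the HTTP layer, as a hedged sketch (the server URL is an assumption; the tests above drive the identical payload through `RemoteOpenAIServer`):

```python
# Sketch only: assumes a running server for the classification model
# used in these tests, e.g. `vllm serve jason9693/Qwen2.5-1.5B-apeach`.
import requests

resp = requests.post(
    "http://localhost:8000/classify",
    json={
        "model": "jason9693/Qwen2.5-1.5B-apeach",
        "input": ["This product was excellent and exceeded my expectations"],
        "use_activation": False,  # skip the activation; return raw scores
    },
)
print(resp.json()["data"][0]["probs"])
```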
diff --git a/tests/entrypoints/pooling/openai/test_score.py b/tests/entrypoints/pooling/openai/test_score.py
index ef213ab0ea18..b8f796d47efa 100644
--- a/tests/entrypoints/pooling/openai/test_score.py
+++ b/tests/entrypoints/pooling/openai/test_score.py
@@ -218,8 +218,8 @@ def test_invocations(self, server: RemoteOpenAIServer, model: dict[str, Any]):

     # TODO: reset this tolerance to 0.01 once we find
     # an alternative to flash_attn with bfloat16
-    def test_activation(self, server: RemoteOpenAIServer, model: dict[str, Any]):
-        def get_outputs(activation):
+    def test_use_activation(self, server: RemoteOpenAIServer, model: dict[str, Any]):
+        def get_outputs(use_activation):
             text_1 = "What is the capital of France?"
             text_2 = "The capital of France is Paris."
             response = requests.post(
@@ -228,7 +228,7 @@ def get_outputs(activation):
                     "model": model["name"],
                     "text_1": text_1,
                     "text_2": text_2,
-                    "activation": activation,
+                    "use_activation": use_activation,
                 },
             )
             if response.status_code != 200:
@@ -238,9 +238,9 @@ def get_outputs(activation):
             return torch.tensor([x["score"] for x in outputs["data"]])

         if model["is_cross_encoder"]:
-            default = get_outputs(activation=None)
-            w_activation = get_outputs(activation=True)
-            wo_activation = get_outputs(activation=False)
+            default = get_outputs(use_activation=None)
+            w_activation = get_outputs(use_activation=True)
+            wo_activation = get_outputs(use_activation=False)

             assert torch.allclose(default, w_activation, atol=1e-2), (
                 "Default should use activation."
@@ -252,8 +252,8 @@ def get_outputs(activation):
                 "w_activation should be close to activation(wo_activation)."
             )
         else:
-            get_outputs(activation=None)
+            get_outputs(use_activation=None)

-            # The activation parameter only works for the is_cross_encoder model
-            response = get_outputs(activation=True)
+            # The use_activation parameter only works for cross-encoder models
+            response = get_outputs(use_activation=True)
             assert response.status_code == 400
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 4bfa7ce9a927..dfb5995024f4 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -1673,7 +1673,7 @@ def to_pooling_params(self):

 class PoolingCompletionRequest(EmbeddingCompletionRequest):
     task: PoolingTask | None = None
-    activation: bool | None = Field(
+    use_activation: bool | None = Field(
         default=None,
         description="Whether to use activation for classification outputs. "
         "If it is a classify or token_classify task, the default is True; "
         "for other tasks, the default is False.",
     )
@@ -1685,13 +1685,13 @@ def to_pooling_params(self):
             truncate_prompt_tokens=self.truncate_prompt_tokens,
             dimensions=self.dimensions,
             normalize=self.normalize,
-            activation=self.activation,
+            use_activation=self.use_activation,
         )


 class PoolingChatRequest(EmbeddingChatRequest):
     task: PoolingTask | None = None
-    activation: bool | None = Field(
+    use_activation: bool | None = Field(
         default=None,
         description="Whether to use activation for classification outputs. "
         "If it is a classify or token_classify task, the default is True; "
         "for other tasks, the default is False.",
     )
@@ -1703,7 +1703,7 @@ def to_pooling_params(self):
             truncate_prompt_tokens=self.truncate_prompt_tokens,
             dimensions=self.dimensions,
             normalize=self.normalize,
-            activation=self.activation,
+            use_activation=self.use_activation,
         )


@@ -1785,7 +1785,7 @@ class ScoreRequest(OpenAIBaseModel):
         ),
     )

-    activation: bool | None = Field(
+    use_activation: bool | None = Field(
         default=None,
         description="Whether to use activation for classification outputs. 
" "Default is True.", @@ -1795,7 +1795,7 @@ class ScoreRequest(OpenAIBaseModel): def to_pooling_params(self): return PoolingParams( truncate_prompt_tokens=self.truncate_prompt_tokens, - activation=self.activation, + use_activation=self.use_activation, ) @@ -1822,7 +1822,7 @@ class RerankRequest(OpenAIBaseModel): ), ) - activation: bool | None = Field( + use_activation: bool | None = Field( default=None, description="Whether to use activation for classification outputs. " "Default is True.", @@ -1832,7 +1832,7 @@ class RerankRequest(OpenAIBaseModel): def to_pooling_params(self): return PoolingParams( truncate_prompt_tokens=self.truncate_prompt_tokens, - activation=self.activation, + use_activation=self.use_activation, ) @@ -2000,7 +2000,7 @@ class ClassificationRequest(OpenAIBaseModel): ), ) - activation: bool | None = Field( + use_activation: bool | None = Field( default=None, description="Whether to use activation for classification outputs. " "Default is True.", @@ -2010,7 +2010,7 @@ class ClassificationRequest(OpenAIBaseModel): def to_pooling_params(self): return PoolingParams( truncate_prompt_tokens=self.truncate_prompt_tokens, - activation=self.activation, + use_activation=self.use_activation, ) From 90746ca29bcc7680e6126601bd009f5ef885fe54 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 22:52:59 +0800 Subject: [PATCH 22/25] fix Signed-off-by: wang.yuqi --- docs/models/pooling_models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 5c9180c69c4b..bb2a7287f1a0 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -317,4 +317,4 @@ We have split the `encode` task into two more specific token wise tasks: `token_ ### Remove softmax from PoolingParams -We are going to remove `softmax` from `PoolingParams`. Instead, you should set `use_activation`, since we actually allow `classify` and `token_classify` to use any activation function. +We are going to remove `softmax` from `PoolingParams`. Instead, you should set `use_activation`, since we actually allow `classify` and `token_classify` to use any activation function. From 2cf3132fd80c93191af916f4c246d5e9450250c9 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Wed, 29 Oct 2025 00:19:31 +0800 Subject: [PATCH 23/25] fix Signed-off-by: wang.yuqi --- docs/serving/openai_compatible_server.md | 4 ++-- vllm/config/pooler.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index 1414718a697d..e331b3422ea6 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -638,7 +638,7 @@ Usually, the score for a sentence pair refers to the similarity between two sent You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html). 
-Code example: [examples/online_serving/openai_cross_encoder_score.py](../../examples/online_serving/openai_cross_encoder_score.py)
+Code example: [examples/online_serving/pooling/openai_cross_encoder_score.py](../../examples/online_serving/pooling/openai_cross_encoder_score.py)

 #### Single inference

@@ -819,7 +819,7 @@ You can pass multi-modal inputs to scoring models by passing `content` including
     print("Scoring output:", response_json["data"][0]["score"])
     print("Scoring output:", response_json["data"][1]["score"])
 ```
-Full example: [examples/online_serving/openai_cross_encoder_score_for_multimodal.py](../../examples/online_serving/openai_cross_encoder_score_for_multimodal.py)
+Full example: [examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py](../../examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py)

 #### Extra parameters

diff --git a/vllm/config/pooler.py b/vllm/config/pooler.py
index 0590f74aa4c9..426779c23ce5 100644
--- a/vllm/config/pooler.py
+++ b/vllm/config/pooler.py
@@ -48,7 +48,7 @@ class PoolerConfig:
     """

     ## for classification models
-    activation: bool | None = None
+    use_activation: bool | None = None
     """
     Whether to apply activation function to the classification outputs.
     Defaults to True.

From 794669dfe627645b764bcdf262c74f6957d37f56 Mon Sep 17 00:00:00 2001
From: "wang.yuqi"
Date: Wed, 29 Oct 2025 00:20:24 +0800
Subject: [PATCH 24/25] fix

Signed-off-by: wang.yuqi
---
 .../language/pooling/test_pooler_config_init_behaviour.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/models/language/pooling/test_pooler_config_init_behaviour.py b/tests/models/language/pooling/test_pooler_config_init_behaviour.py
index 61e082f2789f..deb5de984d90 100644
--- a/tests/models/language/pooling/test_pooler_config_init_behaviour.py
+++ b/tests/models/language/pooling/test_pooler_config_init_behaviour.py
@@ -104,7 +104,7 @@ def test_reward_models_using_activation(
         model,
         max_model_len=1024,
         dtype=dtype,
-        pooler_config=PoolerConfig(activation=False),
+        pooler_config=PoolerConfig(use_activation=False),
     ) as vllm_model:
         wo_activation = vllm_model.reward(example_prompts)

@@ -112,7 +112,7 @@ def test_reward_models_using_activation(
         model,
         max_model_len=1024,
         dtype=dtype,
-        pooler_config=PoolerConfig(activation=True),
+        pooler_config=PoolerConfig(use_activation=True),
     ) as vllm_model:
         w_activation = vllm_model.reward(example_prompts)

From 4c2a98e4bddc2c892ae97926e79fc4d267f31d47 Mon Sep 17 00:00:00 2001
From: "wang.yuqi"
Date: Thu, 30 Oct 2025 15:58:59 +0800
Subject: [PATCH 25/25] add deprecation warnings

Signed-off-by: wang.yuqi
---
 docs/models/pooling_models.md       |  2 +-
 vllm/config/pooler.py               | 36 +++++++++++++++---
 vllm/entrypoints/openai/protocol.py | 57 ++++++++++++++++++++++++++---
 vllm/pooling_params.py              | 16 +++++---
 4 files changed, 95 insertions(+), 16 deletions(-)

diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md
index bb2a7287f1a0..18bb645ea9a9 100644
--- a/docs/models/pooling_models.md
+++ b/docs/models/pooling_models.md
@@ -317,4 +317,4 @@ We have split the `encode` task into two more specific token-wise tasks: `token_
 ### Remove softmax from PoolingParams

-We are going to remove `softmax` from `PoolingParams`. Instead, you should set `use_activation`, since we actually allow `classify` and `token_classify` to use any activation function.
+We are going to remove `softmax` and `activation` from `PoolingParams`.
Instead, you should set `use_activation`, since we actually allow `classify` and `token_classify` to use any activation function.
diff --git a/vllm/config/pooler.py b/vllm/config/pooler.py
index 426779c23ce5..6bece8d0785b 100644
--- a/vllm/config/pooler.py
+++ b/vllm/config/pooler.py
@@ -7,6 +7,9 @@
 from pydantic.dataclasses import dataclass

 from vllm.config.utils import config
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)


 @config
@@ -48,6 +51,14 @@ class PoolerConfig:
     """

     ## for classification models
+    softmax: bool | None = None
+    """
+    softmax is deprecated, please use use_activation instead.
+    """
+    activation: bool | None = None
+    """
+    activation is deprecated, please use use_activation instead.
+    """
     use_activation: bool | None = None
     """
     Whether to apply activation function to the classification outputs.
     Defaults to True.
@@ -59,11 +70,6 @@ class PoolerConfig:
     """

     ## for reward models
-    softmax: bool | None = None
-    """
-    Whether to apply softmax to the reward outputs.
-    Defaults to True.
-    """
     step_tag_id: int | None = None
     """
     If set, only the score corresponding to the `step_tag_id` in the
@@ -77,6 +83,10 @@ class PoolerConfig:
     `math-shepherd-mistral-7b-prm` model.
     """

+    def __post_init__(self):
+        # emit a deprecation warning if the legacy softmax or
+        # activation field is set
+        self.use_activation = get_use_activation(self)
+
     def compute_hash(self) -> str:
         """
         WARNING: Whenever a new field is added to this config,
@@ -94,3 +104,19 @@ def compute_hash(self) -> str:
         factors: list[Any] = []
         hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
         return hash_str
+
+
+def get_use_activation(o: object) -> bool | None:
+    # note the parentheses: the walrus must bind the attribute value,
+    # not the result of the `is not None` comparison
+    if (softmax := getattr(o, "softmax", None)) is not None:
+        logger.warning_once(
+            "softmax is deprecated, please use use_activation instead."
+        )
+        return softmax
+
+    if (activation := getattr(o, "activation", None)) is not None:
+        logger.warning_once(
+            "activation is deprecated, please use use_activation instead."
+        )
+        return activation
+
+    return getattr(o, "use_activation", None)
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index dfb5995024f4..d0061f9d5b40 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -49,6 +49,7 @@
 )
 from openai_harmony import Message as OpenAIHarmonyMessage

+from vllm.config.pooler import get_use_activation
 from vllm.tasks import PoolingTask
 from vllm.utils.serial_utils import (
     EmbedDType,
@@ -1673,6 +1674,14 @@ def to_pooling_params(self):

 class PoolingCompletionRequest(EmbeddingCompletionRequest):
     task: PoolingTask | None = None
+    softmax: bool | None = Field(
+        default=None,
+        description="softmax is deprecated, please use use_activation instead.",
+    )
+    activation: bool | None = Field(
+        default=None,
+        description="activation is deprecated, please use use_activation instead.",
+    )
     use_activation: bool | None = Field(
         default=None,
         description="Whether to use activation for classification outputs. "
" @@ -1685,12 +1694,20 @@ def to_pooling_params(self): truncate_prompt_tokens=self.truncate_prompt_tokens, dimensions=self.dimensions, normalize=self.normalize, - use_activation=self.use_activation, + use_activation=get_use_activation(self), ) class PoolingChatRequest(EmbeddingChatRequest): task: PoolingTask | None = None + softmax: bool | None = Field( + default=None, + description="softmax will be deprecated, please use use_activation instead.", + ) + activation: bool | None = Field( + default=None, + description="activation will be deprecated, please use use_activation instead.", + ) use_activation: bool | None = Field( default=None, description="Whether to use activation for classification outputs. " @@ -1703,7 +1720,7 @@ def to_pooling_params(self): truncate_prompt_tokens=self.truncate_prompt_tokens, dimensions=self.dimensions, normalize=self.normalize, - use_activation=self.use_activation, + use_activation=get_use_activation(self), ) @@ -1785,6 +1802,16 @@ class ScoreRequest(OpenAIBaseModel): ), ) + softmax: bool | None = Field( + default=None, + description="softmax will be deprecated, please use use_activation instead.", + ) + + activation: bool | None = Field( + default=None, + description="activation will be deprecated, please use use_activation instead.", + ) + use_activation: bool | None = Field( default=None, description="Whether to use activation for classification outputs. " @@ -1795,7 +1822,7 @@ class ScoreRequest(OpenAIBaseModel): def to_pooling_params(self): return PoolingParams( truncate_prompt_tokens=self.truncate_prompt_tokens, - use_activation=self.use_activation, + use_activation=get_use_activation(self), ) @@ -1822,6 +1849,16 @@ class RerankRequest(OpenAIBaseModel): ), ) + softmax: bool | None = Field( + default=None, + description="softmax will be deprecated, please use use_activation instead.", + ) + + activation: bool | None = Field( + default=None, + description="activation will be deprecated, please use use_activation instead.", + ) + use_activation: bool | None = Field( default=None, description="Whether to use activation for classification outputs. " @@ -1832,7 +1869,7 @@ class RerankRequest(OpenAIBaseModel): def to_pooling_params(self): return PoolingParams( truncate_prompt_tokens=self.truncate_prompt_tokens, - use_activation=self.use_activation, + use_activation=get_use_activation(self), ) @@ -2000,6 +2037,16 @@ class ClassificationRequest(OpenAIBaseModel): ), ) + softmax: bool | None = Field( + default=None, + description="softmax will be deprecated, please use use_activation instead.", + ) + + activation: bool | None = Field( + default=None, + description="activation will be deprecated, please use use_activation instead.", + ) + use_activation: bool | None = Field( default=None, description="Whether to use activation for classification outputs. 
" @@ -2010,7 +2057,7 @@ class ClassificationRequest(OpenAIBaseModel): def to_pooling_params(self): return PoolingParams( truncate_prompt_tokens=self.truncate_prompt_tokens, - use_activation=self.use_activation, + use_activation=get_use_activation(self), ) diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index 0fe6b7c5f301..72a8320cc1bf 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -2,16 +2,15 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from copy import deepcopy -from typing import TYPE_CHECKING, Annotated, Any, Optional +from typing import Annotated, Any, Optional import msgspec +from vllm.config import ModelConfig, PoolerConfig +from vllm.config.pooler import get_use_activation from vllm.sampling_params import RequestOutputKind from vllm.tasks import PoolingTask -if TYPE_CHECKING: - from vllm.config import ModelConfig, PoolerConfig - class PoolingParams( msgspec.Struct, @@ -25,9 +24,11 @@ class PoolingParams( Set to -1 to use the model's default truncation size. Set to k to keep only the last k tokens (left truncation). Set to None to disable truncation. - normalize: Whether to normalize the embeddings outputs. dimensions: Reduce the dimensions of embeddings if model support matryoshka representation. + normalize: Whether to normalize the embeddings outputs. + softmax: softmax will be deprecated, please use use_activation instead. + activation: activation will be deprecated, please use use_activation instead. use_activation: Whether to apply activation function to the classification outputs. """ @@ -44,6 +45,8 @@ class PoolingParams( ## for classification, scoring and rerank # --8<-- [start:classification-pooling-params] + softmax: bool | None = None + activation: bool | None = None use_activation: bool | None = None # --8<-- [end:classification-pooling-params] @@ -84,6 +87,9 @@ def verify( msg = f"You cannot overwrite {self.task=!r} with {task=!r}!" raise ValueError(msg) + # raise deprecated warning for softmax and activation + self.use_activation = get_use_activation(self) + # plugin task uses io_processor.parse_request to verify inputs, # skipping PoolingParams verify if self.task == "plugin":