From b6b2e123cd1b4fab1e2c85a51a8541b27bf616af Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Wed, 24 Sep 2025 07:37:29 +0800 Subject: [PATCH 01/25] token_embed & token_classify Signed-off-by: wang.yuqi --- docs/models/pooling_models.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 50982d3d0d0f..ccb4bf2d347e 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -30,11 +30,11 @@ If `--runner pooling` has been set (manually or automatically) but the model doe vLLM will attempt to automatically convert the model according to the architecture names shown in the table below. -| Architecture | `--convert` | Supported pooling tasks | -|-------------------------------------------------|-------------|-------------------------------| -| `*ForTextEncoding`, `*EmbeddingModel`, `*Model` | `embed` | `encode`, `embed` | -| `*For*Classification`, `*ClassificationModel` | `classify` | `encode`, `classify`, `score` | -| `*ForRewardModeling`, `*RewardModel` | `reward` | `encode` | +| Architecture | `--convert` | Supported pooling tasks | +|-------------------------------------------------|-------------|---------------------------------------| +| `*ForTextEncoding`, `*EmbeddingModel`, `*Model` | `embed` | `token_embed`, `embed` | +| `*For*Classification`, `*ClassificationModel` | `classify` | `token_classify`, `classify`, `score` | +| `*ForRewardModeling`, `*RewardModel` | `reward` | `token_classify` | !!! tip You can explicitly set `--convert ` to specify how to convert the model. From dd06fe14a25ecc5774ba66488efb63bac952d019 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 16:19:13 +0800 Subject: [PATCH 02/25] /pooling endpoint support all pooling tasks Signed-off-by: wang.yuqi --- .../pooling/openai/test_classification.py | 76 +++++++++++++++---- .../pooling/openai/test_embedding.py | 53 ++++++++++++- .../entrypoints/pooling/openai/test_rerank.py | 41 +++++++++- vllm/entrypoints/openai/api_server.py | 7 +- vllm/entrypoints/openai/protocol.py | 61 +++++++++++++-- vllm/entrypoints/openai/serving_pooling.py | 23 ++++-- 6 files changed, 223 insertions(+), 38 deletions(-) diff --git a/tests/entrypoints/pooling/openai/test_classification.py b/tests/entrypoints/pooling/openai/test_classification.py index 92d40efad21c..2bffe5dabedc 100644 --- a/tests/entrypoints/pooling/openai/test_classification.py +++ b/tests/entrypoints/pooling/openai/test_classification.py @@ -7,7 +7,7 @@ import torch.nn.functional as F from tests.utils import RemoteOpenAIServer -from vllm.entrypoints.openai.protocol import ClassificationResponse +from vllm.entrypoints.openai.protocol import ClassificationResponse, PoolingResponse MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach" DTYPE = "float32" # Use float32 to avoid NaN issue @@ -191,18 +191,7 @@ async def get_outputs(activation): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -def test_pooling(server: RemoteOpenAIServer, model_name: str): - # pooling api uses ALL pooling, which does not support chunked prefill. 
- response = requests.post( - server.url_for("pooling"), - json={"model": model_name, "input": "test", "encoding_format": "float"}, - ) - assert response.json()["error"]["type"] == "BadRequestError" - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model_name", [MODEL_NAME]) -def test_score(server: RemoteOpenAIServer, model_name: str): +async def test_score(server: RemoteOpenAIServer, model_name: str): # score api is only enabled for num_labels == 1. response = requests.post( server.url_for("score"), @@ -217,7 +206,7 @@ def test_score(server: RemoteOpenAIServer, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -def test_rerank(server: RemoteOpenAIServer, model_name: str): +async def test_rerank(server: RemoteOpenAIServer, model_name: str): # rerank api is only enabled for num_labels == 1. response = requests.post( server.url_for("rerank"), @@ -228,3 +217,62 @@ def test_rerank(server: RemoteOpenAIServer, model_name: str): }, ) assert response.json()["error"]["type"] == "BadRequestError" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_pooling_classify(server: RemoteOpenAIServer, model_name: str): + input_text = "This product was excellent and exceeded my expectations" + response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": input_text, + "encoding_format": "float", + "task": "classify", + }, + ) + poolings = PoolingResponse.model_validate(response.json()) + assert len(poolings.data) == 1 + assert len(poolings.data[0].data) == 2 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_pooling_token_classify(server: RemoteOpenAIServer, model_name: str): + # token_classify uses ALL pooling, which does not support chunked prefill. 
+ task = "token_classify" + response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": "test", + "encoding_format": "float", + "task": task, + }, + ) + assert response.json()["error"]["type"] == "BadRequestError" + assert response.json()["error"]["message"].startswith( + f"Task {task} is not supported" + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("task", ["embed", "token_embed", "plugin"]) +async def test_pooling_not_supported( + server: RemoteOpenAIServer, model_name: str, task: str +): + response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": "test", + "encoding_format": "float", + "task": task, + }, + ) + assert response.json()["error"]["type"] == "BadRequestError" + assert response.json()["error"]["message"].startswith( + f"Task {task} is not supported" + ) diff --git a/tests/entrypoints/pooling/openai/test_embedding.py b/tests/entrypoints/pooling/openai/test_embedding.py index b3f12283fdbd..e971b23e8f1a 100644 --- a/tests/entrypoints/pooling/openai/test_embedding.py +++ b/tests/entrypoints/pooling/openai/test_embedding.py @@ -562,12 +562,40 @@ async def get_outputs(normalize): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_pooling(server: RemoteOpenAIServer, model_name: str): +async def test_pooling_embed(server: RemoteOpenAIServer, model_name: str): + task = "embed" input_text = ["The chef prepared a delicious meal."] response = requests.post( server.url_for("pooling"), - json={"model": model_name, "input": input_text, "encoding_format": "float"}, + json={ + "model": model_name, + "input": input_text, + "encoding_format": "float", + "task": task, + }, + ) + + poolings = PoolingResponse.model_validate(response.json()) + + assert len(poolings.data) == 1 + assert len(poolings.data[0].data) == 384 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_pooling_token_embed(server: RemoteOpenAIServer, model_name: str): + task = "token_embed" + input_text = ["The chef prepared a delicious meal."] + + response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": input_text, + "encoding_format": "float", + "task": task, + }, ) poolings = PoolingResponse.model_validate(response.json()) @@ -575,3 +603,24 @@ async def test_pooling(server: RemoteOpenAIServer, model_name: str): assert len(poolings.data) == 1 assert len(poolings.data[0].data) == 11 assert len(poolings.data[0].data[0]) == 384 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("task", ["classify", "token_classify", "plugin"]) +async def test_pooling_not_supported( + server: RemoteOpenAIServer, model_name: str, task: str +): + response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": "test", + "encoding_format": "float", + "task": task, + }, + ) + assert response.json()["error"]["type"] == "BadRequestError" + assert response.json()["error"]["message"].startswith( + f"Task {task} is not supported" + ) diff --git a/tests/entrypoints/pooling/openai/test_rerank.py b/tests/entrypoints/pooling/openai/test_rerank.py index e43148d25fee..edfb3f7cb4dd 100644 --- a/tests/entrypoints/pooling/openai/test_rerank.py +++ b/tests/entrypoints/pooling/openai/test_rerank.py @@ -163,7 +163,25 @@ async def get_outputs(activation): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) 
-async def test_pooling(server: RemoteOpenAIServer, model_name: str): +async def test_pooling_classify(server: RemoteOpenAIServer, model_name: str): + input_text = "This product was excellent and exceeded my expectations" + response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": input_text, + "encoding_format": "float", + "task": "classify", + }, + ) + poolings = PoolingResponse.model_validate(response.json()) + assert len(poolings.data) == 1 + assert len(poolings.data[0].data) == 1 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_pooling_token_classify(server: RemoteOpenAIServer, model_name: str): input_text = ["The chef prepared a delicious meal."] response = requests.post( @@ -176,3 +194,24 @@ async def test_pooling(server: RemoteOpenAIServer, model_name: str): assert len(poolings.data) == 1 assert len(poolings.data[0].data) == 11 assert len(poolings.data[0].data[0]) == 1 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("task", ["embed", "token_embed", "plugin"]) +async def test_pooling_not_supported( + server: RemoteOpenAIServer, model_name: str, task: str +): + response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": "test", + "encoding_format": "float", + "task": task, + }, + ) + assert response.json()["error"]["type"] == "BadRequestError" + assert response.json()["error"]["message"].startswith( + f"Task {task} is not supported" + ) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 632bd741290b..1dc07d908a8d 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1749,12 +1749,7 @@ async def init_app_state( log_error_stack=args.log_error_stack, ) ) - if ( - any( - task in supported_tasks - for task in ["token_embed", "token_classify", "plugin"] - ) - ) + if supported_tasks else None ) state.openai_serving_embedding = ( diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 0778e4d78790..4bfa7ce9a927 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -49,6 +49,7 @@ ) from openai_harmony import Message as OpenAIHarmonyMessage +from vllm.tasks import PoolingTask from vllm.utils.serial_utils import ( EmbedDType, EncodingFormat, @@ -1669,8 +1670,42 @@ def to_pooling_params(self): EmbeddingRequest: TypeAlias = EmbeddingCompletionRequest | EmbeddingChatRequest -PoolingCompletionRequest = EmbeddingCompletionRequest -PoolingChatRequest = EmbeddingChatRequest + +class PoolingCompletionRequest(EmbeddingCompletionRequest): + task: PoolingTask | None = None + activation: bool | None = Field( + default=None, + description="Whether to use activation for classification outputs. " + "If it is a classify or token_classify task, the default is True; " + "for other tasks, this value should be None.", + ) + + def to_pooling_params(self): + return PoolingParams( + truncate_prompt_tokens=self.truncate_prompt_tokens, + dimensions=self.dimensions, + normalize=self.normalize, + activation=self.activation, + ) + + +class PoolingChatRequest(EmbeddingChatRequest): + task: PoolingTask | None = None + activation: bool | None = Field( + default=None, + description="Whether to use activation for classification outputs. 
" + "If it is a classify or token_classify task, the default is True; " + "for other tasks, this value should be None.", + ) + + def to_pooling_params(self): + return PoolingParams( + truncate_prompt_tokens=self.truncate_prompt_tokens, + dimensions=self.dimensions, + normalize=self.normalize, + activation=self.activation, + ) + T = TypeVar("T") @@ -1686,6 +1721,7 @@ class IOProcessorRequest(OpenAIBaseModel, Generic[T]): """ data: T + task: PoolingTask = "plugin" encoding_format: EncodingFormat = "float" embed_dtype: EmbedDType = Field( default="float32", @@ -1749,8 +1785,11 @@ class ScoreRequest(OpenAIBaseModel): ), ) - activation: bool | None = None - + activation: bool | None = Field( + default=None, + description="Whether to use activation for classification outputs. " + "Default is True.", + ) # --8<-- [end:score-extra-params] def to_pooling_params(self): @@ -1783,8 +1822,11 @@ class RerankRequest(OpenAIBaseModel): ), ) - activation: bool | None = None - + activation: bool | None = Field( + default=None, + description="Whether to use activation for classification outputs. " + "Default is True.", + ) # --8<-- [end:rerank-extra-params] def to_pooling_params(self): @@ -1958,8 +2000,11 @@ class ClassificationRequest(OpenAIBaseModel): ), ) - activation: bool | None = None - + activation: bool | None = Field( + default=None, + description="Whether to use activation for classification outputs. " + "Default is True.", + ) # --8<-- [end:classification-extra-params] def to_pooling_params(self): diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index 568896ccbf1b..0eade272111f 100644 --- a/vllm/entrypoints/openai/serving_pooling.py +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -170,15 +170,24 @@ async def create_pooling( pooling_params = request.to_pooling_params() pooling_task: PoolingTask - if "token_embed" in self.supported_tasks: - pooling_task = "token_embed" - elif "token_classify" in self.supported_tasks: - pooling_task = "token_classify" - elif "plugin" in self.supported_tasks: - pooling_task = "plugin" + if request.task is None: + if "token_embed" in self.supported_tasks: + pooling_task = "token_embed" + elif "token_classify" in self.supported_tasks: + pooling_task = "token_classify" + elif "plugin" in self.supported_tasks: + pooling_task = "plugin" + else: + return self.create_error_response( + f"pooling_task must be one of {self.supported_tasks}." + ) else: + pooling_task = request.task + + if pooling_task not in self.supported_tasks: return self.create_error_response( - f"pooling_task must be one of {self.supported_tasks}." + f"Task {pooling_task} is not supported, it" + f" must be one of {self.supported_tasks}." 
) try: From 064346100f5cdc7a64cf657244638c1e3db25381 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 17:44:45 +0800 Subject: [PATCH 03/25] update Signed-off-by: wang.yuqi --- docs/models/pooling_models.md | 68 +++++++++++++++++++++++++++-------- 1 file changed, 53 insertions(+), 15 deletions(-) diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 21b3b8b0b31e..1a89ccdce9a9 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -45,12 +45,12 @@ Each pooling model in vLLM supports one or more of these tasks according to [Pooler.get_supported_tasks][vllm.model_executor.layers.pooler.Pooler.get_supported_tasks], enabling the corresponding APIs: -| Task | APIs | -|------------|--------------------------------------| -| `encode` | `LLM.reward(...)` | -| `embed` | `LLM.embed(...)`, `LLM.score(...)`\* | -| `classify` | `LLM.classify(...)` | -| `score` | `LLM.score(...)` | +| Task | APIs | +|------------------|--------------------------------------| +| `token_classify` | `LLM.reward(...)` | +| `embed` | `LLM.embed(...)`, `LLM.score(...)`\* | +| `classify` | `LLM.classify(...)` | +| `score` | `LLM.score(...)` | \* The `LLM.score(...)` API falls back to `embed` task if the model does not support `score` task. @@ -91,7 +91,7 @@ It is primarily designed for embedding models. ```python from vllm import LLM -llm = LLM(model="intfloat/e5-small", runner="pooling") +llm = LLM(model="intfloat/e5-small") (output,) = llm.embed("Hello, my name is") embeds = output.outputs.embedding @@ -108,7 +108,7 @@ It is primarily designed for classification models. ```python from vllm import LLM -llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", runner="pooling") +llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach") (output,) = llm.classify("Hello, my name is") probs = output.outputs.probs @@ -129,7 +129,7 @@ It is designed for embedding models and cross-encoder models. Embedding models u ```python from vllm import LLM -llm = LLM(model="BAAI/bge-reranker-v2-m3", runner="pooling") +llm = LLM(model="BAAI/bge-reranker-v2-m3") (output,) = llm.score( "What is the capital of France?", "The capital of Brazil is Brasilia.", @@ -144,12 +144,11 @@ A code example can be found here: [examples/offline_inference/basic/score.py](.. ### `LLM.reward` The [reward][vllm.LLM.reward] method is available to all reward models in vLLM. -It returns the extracted hidden states directly. ```python from vllm import LLM -llm = LLM(model="internlm/internlm2-1_8b-reward", runner="pooling", trust_remote_code=True) +llm = LLM(model="internlm/internlm2-1_8b-reward", trust_remote_code=True) (output,) = llm.reward("Hello, my name is") data = output.outputs.data @@ -161,20 +160,22 @@ A code example can be found here: [examples/offline_inference/basic/reward.py](. ### `LLM.encode` The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM. -It returns the extracted hidden states directly. !!! note Please use one of the more specific methods or set the task directly when using `LLM.encode`: - For embeddings, use `LLM.embed(...)` or `pooling_task="embed"`. - For classification logits, use `LLM.classify(...)` or `pooling_task="classify"`. - - For rewards, use `LLM.reward(...)` or `pooling_task="reward"`. - For similarity scores, use `LLM.score(...)`. + - For rewards, use `LLM.reward(...)` or `pooling_task="token_classify"`. + - For token classification, use `pooling_task="token_classify"`. 
+    - For multi-vector retrieval, use `pooling_task="token_embed"`.
+    - For IO Processor Plugins, use `pooling_task="plugin"`.
 
 ```python
 from vllm import LLM
 
-llm = LLM(model="intfloat/e5-small", runner="pooling")
+llm = LLM(model="intfloat/e5-small")
 (output,) = llm.encode("Hello, my name is", pooling_task="embed")
 
 data = output.outputs.data
@@ -185,10 +186,47 @@ print(f"Data: {data!r}")
 
 Our [OpenAI-Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs:
 
-- [Pooling API](../serving/openai_compatible_server.md#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models.
 - [Embeddings API](../serving/openai_compatible_server.md#embeddings-api) is similar to `LLM.embed`, accepting both text and [multi-modal inputs](../features/multimodal_inputs.md) for embedding models.
 - [Classification API](../serving/openai_compatible_server.md#classification-api) is similar to `LLM.classify` and is applicable to sequence classification models.
 - [Score API](../serving/openai_compatible_server.md#score-api) is similar to `LLM.score` for cross-encoder models.
+- [Pooling API](../serving/openai_compatible_server.md#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models.
+
+!!! note
+    Please use one of the more specific APIs or set the `task` field directly when using the [Pooling API](../serving/openai_compatible_server.md#pooling-api):
+
+    - For embeddings, use the [Embeddings API](../serving/openai_compatible_server.md#embeddings-api) or `"task":"embed"`.
+    - For classification logits, use the [Classification API](../serving/openai_compatible_server.md#classification-api) or `"task":"classify"`.
+    - For similarity scores, use the [Score API](../serving/openai_compatible_server.md#score-api).
+    - For rewards, use `"task":"token_classify"`.
+    - For token classification, use `"task":"token_classify"`.
+    - For multi-vector retrieval, use `"task":"token_embed"`.
+    - For IO Processor Plugins, use `"task":"plugin"`.
+
+```python
+# start a supported embedding model server with `vllm serve`, e.g.
+# vllm serve intfloat/e5-small +import requests + +host = "localhost" +port = "8000" +model_name = "intfloat/e5-small" + +api_url = f"http://{host}:{port}/pooling" + +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +prompt = {"model": model_name, "input": prompts, "task": "embed"} + +response = requests.post(api_url, json=prompt) + +for output in response.json()["data"]: + data = output["data"] + print(f"Data: {data!r} (size={len(data)})") +``` ## Matryoshka Embeddings From ce69d7b5ff04cd2729aa54b4894b14a0fd917a82 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 18:13:33 +0800 Subject: [PATCH 04/25] update examples Signed-off-by: wang.yuqi --- docs/design/io_processor_plugins.md | 2 +- examples/offline_inference/pooling/README.md | 12 ++++++++ examples/offline_inference/pooling/ner.py | 2 +- .../{ => pooling}/prithvi_geospatial_mae.py | 0 .../prithvi_geospatial_mae_io_processor.py | 0 examples/online_serving/pooling/README.md | 30 +++++++++++++++++++ .../openai_cross_encoder_score.py | 0 ...enai_cross_encoder_score_for_multimodal.py | 0 .../{ => pooling}/prithvi_geospatial_mae.py | 0 9 files changed, 44 insertions(+), 2 deletions(-) rename examples/offline_inference/{ => pooling}/prithvi_geospatial_mae.py (100%) rename examples/offline_inference/{ => pooling}/prithvi_geospatial_mae_io_processor.py (100%) rename examples/online_serving/{ => pooling}/openai_cross_encoder_score.py (100%) rename examples/online_serving/{ => pooling}/openai_cross_encoder_score_for_multimodal.py (100%) rename examples/online_serving/{ => pooling}/prithvi_geospatial_mae.py (100%) diff --git a/docs/design/io_processor_plugins.md b/docs/design/io_processor_plugins.md index fb64a7bb9c8f..2f4b17f191a5 100644 --- a/docs/design/io_processor_plugins.md +++ b/docs/design/io_processor_plugins.md @@ -79,7 +79,7 @@ The `post_process*` methods take `PoolingRequestOutput` objects as input and gen The `validate_or_generate_params` method is used for validating with the plugin any `SamplingParameters`/`PoolingParameters` received with the user request, or to generate new ones if none are specified. The function always returns the validated/generated parameters. The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/pooling` serving endpoint is available here [vllm/entrypoints/openai/serving_pooling.py](../../vllm/entrypoints/openai/serving_pooling.py). -An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/IBM/terratorch/tree/main/terratorch/vllm/plugins/segmentation). Please, also refer to our online ([examples/online_serving/prithvi_geospatial_mae.py](../../examples/online_serving/prithvi_geospatial_mae.py)) and offline ([examples/offline_inference/prithvi_geospatial_mae_io_processor.py](../../examples/offline_inference/prithvi_geospatial_mae_io_processor.py)) inference examples. +An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/IBM/terratorch/tree/main/terratorch/vllm/plugins/segmentation). 
Please, also refer to our online ([examples/online_serving/pooling/prithvi_geospatial_mae.py](../../examples/online_serving/pooling/prithvi_geospatial_mae.py)) and offline ([examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py](../../examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py)) inference examples. ## Using an IO Processor plugin diff --git a/examples/offline_inference/pooling/README.md b/examples/offline_inference/pooling/README.md index cd9717122b16..e10606676f06 100644 --- a/examples/offline_inference/pooling/README.md +++ b/examples/offline_inference/pooling/README.md @@ -38,6 +38,18 @@ python examples/offline_inference/pooling/multi_vector_retrieval.py python examples/offline_inference/pooling/ner.py ``` +## prithvi geospatial mae usage + +```bash +python examples/offline_inference/pooling/prithvi_geospatial_mae.py +``` + +## IO Processor Plugins for prithvi geospatial mae usage + +```bash +python examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py +``` + ## Qwen3 reranker usage ```bash diff --git a/examples/offline_inference/pooling/ner.py b/examples/offline_inference/pooling/ner.py index b2dffdd6c5ee..34c80e7ccffd 100644 --- a/examples/offline_inference/pooling/ner.py +++ b/examples/offline_inference/pooling/ner.py @@ -33,7 +33,7 @@ def main(args: Namespace): label_map = llm.llm_engine.vllm_config.model_config.hf_config.id2label # Run inference - outputs = llm.encode(prompts) + outputs = llm.encode(prompts, pooling_task="token_classify") for prompt, output in zip(prompts, outputs): logits = output.outputs.data diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/pooling/prithvi_geospatial_mae.py similarity index 100% rename from examples/offline_inference/prithvi_geospatial_mae.py rename to examples/offline_inference/pooling/prithvi_geospatial_mae.py diff --git a/examples/offline_inference/prithvi_geospatial_mae_io_processor.py b/examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py similarity index 100% rename from examples/offline_inference/prithvi_geospatial_mae_io_processor.py rename to examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py diff --git a/examples/online_serving/pooling/README.md b/examples/online_serving/pooling/README.md index 3b6da20d5f0f..5080716d97ca 100644 --- a/examples/online_serving/pooling/README.md +++ b/examples/online_serving/pooling/README.md @@ -3,36 +3,42 @@ ## Cohere rerank usage ```bash +# vllm serve BAAI/bge-reranker-base python examples/online_serving/pooling/cohere_rerank_client.py ``` ## Embedding requests base64 encoding_format usage ```bash +# vllm serve intfloat/e5-small python examples/online_serving/pooling/embedding_requests_base64_client.py ``` ## Embedding requests bytes encoding_format usage ```bash +# vllm serve intfloat/e5-small python examples/online_serving/pooling/embedding_requests_bytes_client.py ``` ## Jinaai rerank usage ```bash +# vllm serve BAAI/bge-reranker-base python examples/online_serving/pooling/jinaai_rerank_client.py ``` ## Multi vector retrieval usage ```bash +# vllm serve BAAI/bge-m3 python examples/online_serving/pooling/multi_vector_retrieval_client.py ``` ## Named Entity Recognition (NER) usage ```bash +# vllm serve boltuix/NeuroBERT-NER python examples/online_serving/pooling/ner_client.py ``` @@ -45,23 +51,47 @@ python examples/online_serving/pooling/openai_chat_embedding_client_for_multimod ## Openai classification usage ```bash +# vllm serve 
jason9693/Qwen2.5-1.5B-apeach python examples/online_serving/pooling/openai_classification_client.py ``` +## Openai cross_encoder score usage + +```bash +# vllm serve BAAI/bge-reranker-v2-m3 +python examples/online_serving/pooling/openai_cross_encoder_score.py +``` + +## Openai cross_encoder score for multimodal usage + +```bash +# vllm serve jinaai/jina-reranker-m0 +python examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py +``` + ## Openai embedding usage ```bash +# vllm serve intfloat/e5-small python examples/online_serving/pooling/openai_embedding_client.py ``` ## Openai embedding matryoshka dimensions usage ```bash +# vllm serve jinaai/jina-embeddings-v3 --trust-remote-code python examples/online_serving/pooling/openai_embedding_matryoshka_fy.py ``` ## Openai pooling usage ```bash +# vllm serve internlm/internlm2-1_8b-reward --trust-remote-code python examples/online_serving/pooling/openai_pooling_client.py ``` + +## Online prithvi geospatial mae usage + +```bash +python examples/online_serving/pooling/prithvi_geospatial_mae.py +``` \ No newline at end of file diff --git a/examples/online_serving/openai_cross_encoder_score.py b/examples/online_serving/pooling/openai_cross_encoder_score.py similarity index 100% rename from examples/online_serving/openai_cross_encoder_score.py rename to examples/online_serving/pooling/openai_cross_encoder_score.py diff --git a/examples/online_serving/openai_cross_encoder_score_for_multimodal.py b/examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py similarity index 100% rename from examples/online_serving/openai_cross_encoder_score_for_multimodal.py rename to examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py diff --git a/examples/online_serving/prithvi_geospatial_mae.py b/examples/online_serving/pooling/prithvi_geospatial_mae.py similarity index 100% rename from examples/online_serving/prithvi_geospatial_mae.py rename to examples/online_serving/pooling/prithvi_geospatial_mae.py From f9d85cf3f26f7d39ae883b542fd694b450aa76c1 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 18:22:04 +0800 Subject: [PATCH 05/25] update examples Signed-off-by: wang.yuqi --- examples/online_serving/pooling/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/online_serving/pooling/README.md b/examples/online_serving/pooling/README.md index 5080716d97ca..634fa049b048 100644 --- a/examples/online_serving/pooling/README.md +++ b/examples/online_serving/pooling/README.md @@ -94,4 +94,4 @@ python examples/online_serving/pooling/openai_pooling_client.py ```bash python examples/online_serving/pooling/prithvi_geospatial_mae.py -``` \ No newline at end of file +``` From 1ea309d12568755305c65504def4347747da5e30 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 18:31:51 +0800 Subject: [PATCH 06/25] Update vllm/entrypoints/openai/api_server.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Signed-off-by: wang.yuqi --- vllm/entrypoints/openai/api_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 1dc07d908a8d..eaeb6906f499 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1749,7 +1749,7 @@ async def init_app_state( log_error_stack=args.log_error_stack, ) ) - if supported_tasks + if any(task in POOLING_TASKS for task in supported_tasks) else 
None ) state.openai_serving_embedding = ( From cdabfc0d6ebdf43b445781ab8a1fa527ac77e956 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 18:43:52 +0800 Subject: [PATCH 07/25] fix Signed-off-by: wang.yuqi --- vllm/entrypoints/openai/api_server.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index eaeb6906f499..bc7aadc31d3c 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -107,6 +107,7 @@ ) from vllm.logger import init_logger from vllm.reasoning import ReasoningParserManager +from vllm.tasks import POOLING_TASKS from vllm.usage.usage_lib import UsageContext from vllm.utils import Device from vllm.utils.argparse_utils import FlexibleArgumentParser From 986de1ad7727bfc345f2978d19ff11d7eb1960e7 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 19:04:54 +0800 Subject: [PATCH 08/25] Deprecated Feature Signed-off-by: wang.yuqi --- docs/models/pooling_models.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 1a89ccdce9a9..a276eefb6845 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -303,3 +303,13 @@ Expected output: ``` An OpenAI client example can be found here: [examples/online_serving/pooling/openai_embedding_matryoshka_fy.py](../../examples/online_serving/pooling/openai_embedding_matryoshka_fy.py) + +## Deprecated Feature + +### Encode task +Split the encode task into two more specific token wise tasks: token_embed and token_classify: +- token_embed is the same as embed, using normalize as activation. +- token_classify is the same as classify, default using softmax as activation. + +### Remove softmax from PoolingParams +Remove softmax from PoolingParams, prefer using activation, since we actually allow classify and token_classify to use any activation function. 
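The preceding patches wire the new `task` field through the `/pooling` endpoint. A minimal sketch of how a client exercises it, mirroring the tests in this series and assuming a classification model such as `jason9693/Qwen2.5-1.5B-apeach` is already being served on the default host and port:

```python
# Sketch of the new `task` field on the /pooling endpoint, assuming a
# classification model is already being served, e.g.:
#   vllm serve jason9693/Qwen2.5-1.5B-apeach
import requests

base_url = "http://localhost:8000"
model = "jason9693/Qwen2.5-1.5B-apeach"

# "task": "classify" returns one probability vector per input prompt.
response = requests.post(
    f"{base_url}/pooling",
    json={
        "model": model,
        "input": "This product was excellent and exceeded my expectations",
        "encoding_format": "float",
        "task": "classify",
    },
)
print(response.json()["data"][0]["data"])  # one float per label

# A task the served model does not support is rejected with a
# BadRequestError whose message starts with "Task ... is not supported".
response = requests.post(
    f"{base_url}/pooling",
    json={"model": model, "input": "test", "task": "embed"},
)
print(response.json()["error"]["type"])  # BadRequestError
```

The error path here matches the `test_pooling_not_supported` cases added above.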
From 06b191559fda4c05d1cb48730d1ef5c0b91dfabd Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 21:19:53 +0800 Subject: [PATCH 09/25] Update docs/models/pooling_models.md Co-authored-by: Cyrus Leung Signed-off-by: wang.yuqi --- docs/models/pooling_models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index a276eefb6845..ee4f18187a2b 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -304,7 +304,7 @@ Expected output: An OpenAI client example can be found here: [examples/online_serving/pooling/openai_embedding_matryoshka_fy.py](../../examples/online_serving/pooling/openai_embedding_matryoshka_fy.py) -## Deprecated Feature +## Deprecated Features ### Encode task Split the encode task into two more specific token wise tasks: token_embed and token_classify: From 267d03780da04a5bc7c23417200309d0e61bfac0 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 21:20:17 +0800 Subject: [PATCH 10/25] Update docs/models/pooling_models.md Co-authored-by: Cyrus Leung Signed-off-by: wang.yuqi --- docs/models/pooling_models.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index ee4f18187a2b..2933c8440b9f 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -307,9 +307,11 @@ An OpenAI client example can be found here: [examples/online_serving/pooling/ope ## Deprecated Features ### Encode task -Split the encode task into two more specific token wise tasks: token_embed and token_classify: -- token_embed is the same as embed, using normalize as activation. -- token_classify is the same as classify, default using softmax as activation. + +We have split the `encode` task into two more specific token wise tasks: `token_embed` and `token_classify`: +- `token_embed` is the same as embed, using normalize as activation. +- `token_classify` is the same as classify, default using softmax as activation. ### Remove softmax from PoolingParams -Remove softmax from PoolingParams, prefer using activation, since we actually allow classify and token_classify to use any activation function. + +We are going to remove `softmax` from `PoolingParams`. Instead, you should set `activation`, since we actually allow `classify` and `token_classify` to use any activation function. 
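In the offline API, the replacement for the old `encode` behavior is the `pooling_task` argument of `LLM.encode`, as in the NER example updated earlier in this series. A rough sketch, assuming a token-classification model such as `boltuix/NeuroBERT-NER` from the examples:

```python
# Sketch of the token-wise tasks that replace the old `encode` task.
# Assumes a token-classification model, e.g. boltuix/NeuroBERT-NER
# from the NER example.
from vllm import LLM

llm = LLM(model="boltuix/NeuroBERT-NER", runner="pooling")

# `token_classify` yields one row of label scores per input token;
# like `classify`, softmax is applied by default.
(output,) = llm.encode("Hello, my name is", pooling_task="token_classify")
print(output.outputs.data.shape)  # (num_tokens, num_labels)
```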
From 3b13620b49ad481269c20b336d0ac3da5e1e31a5 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 21:20:51 +0800 Subject: [PATCH 11/25] Update examples/offline_inference/pooling/README.md Co-authored-by: Cyrus Leung Signed-off-by: wang.yuqi --- examples/offline_inference/pooling/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/offline_inference/pooling/README.md b/examples/offline_inference/pooling/README.md index e10606676f06..9deca2753090 100644 --- a/examples/offline_inference/pooling/README.md +++ b/examples/offline_inference/pooling/README.md @@ -38,7 +38,7 @@ python examples/offline_inference/pooling/multi_vector_retrieval.py python examples/offline_inference/pooling/ner.py ``` -## prithvi geospatial mae usage +## Prithvi Geospatial MAE usage ```bash python examples/offline_inference/pooling/prithvi_geospatial_mae.py From bb3a6f8abef0e21e7785575cdd0d28a7f7d00efc Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 21:21:08 +0800 Subject: [PATCH 12/25] Update examples/offline_inference/pooling/README.md Co-authored-by: Cyrus Leung Signed-off-by: wang.yuqi --- examples/offline_inference/pooling/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/offline_inference/pooling/README.md b/examples/offline_inference/pooling/README.md index 9deca2753090..ad78be38716b 100644 --- a/examples/offline_inference/pooling/README.md +++ b/examples/offline_inference/pooling/README.md @@ -44,7 +44,7 @@ python examples/offline_inference/pooling/ner.py python examples/offline_inference/pooling/prithvi_geospatial_mae.py ``` -## IO Processor Plugins for prithvi geospatial mae usage +## IO Processor Plugins for Prithvi Geospatial MAE ```bash python examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py From 351d526d762ac30d5c05a925a8793f6f377cd28b Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 22:11:56 +0800 Subject: [PATCH 13/25] Update examples/online_serving/pooling/README.md Co-authored-by: Cyrus Leung Signed-off-by: wang.yuqi --- examples/online_serving/pooling/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/online_serving/pooling/README.md b/examples/online_serving/pooling/README.md index 634fa049b048..6dc0ba6816ca 100644 --- a/examples/online_serving/pooling/README.md +++ b/examples/online_serving/pooling/README.md @@ -90,7 +90,7 @@ python examples/online_serving/pooling/openai_embedding_matryoshka_fy.py python examples/online_serving/pooling/openai_pooling_client.py ``` -## Online prithvi geospatial mae usage +## Online Prithvi Geospatial MAE usage ```bash python examples/online_serving/pooling/prithvi_geospatial_mae.py From 12db9e36b0f9e75df4a299fc27e0046ff57dbb48 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 22:12:06 +0800 Subject: [PATCH 14/25] Update examples/online_serving/pooling/README.md Co-authored-by: Cyrus Leung Signed-off-by: wang.yuqi --- examples/online_serving/pooling/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/online_serving/pooling/README.md b/examples/online_serving/pooling/README.md index 6dc0ba6816ca..8e2878ae4573 100644 --- a/examples/online_serving/pooling/README.md +++ b/examples/online_serving/pooling/README.md @@ -62,7 +62,7 @@ python examples/online_serving/pooling/openai_classification_client.py python examples/online_serving/pooling/openai_cross_encoder_score.py ``` -## Openai cross_encoder score for multimodal usage +## OpenAI cross_encoder 
score for multimodal usage ```bash # vllm serve jinaai/jina-reranker-m0 From 4938636145632101e067cbd497531439dc0697ea Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 22:17:19 +0800 Subject: [PATCH 15/25] Pooling Tasks Signed-off-by: wang.yuqi --- docs/models/pooling_models.md | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 2933c8440b9f..f79e9f44bbfa 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -45,12 +45,14 @@ Each pooling model in vLLM supports one or more of these tasks according to [Pooler.get_supported_tasks][vllm.model_executor.layers.pooler.Pooler.get_supported_tasks], enabling the corresponding APIs: -| Task | APIs | -|------------------|--------------------------------------| -| `token_classify` | `LLM.reward(...)` | -| `embed` | `LLM.embed(...)`, `LLM.score(...)`\* | -| `classify` | `LLM.classify(...)` | -| `score` | `LLM.score(...)` | +| Task | APIs | +|------------------|-------------------------------------------------------------------------------| +| `embed` | `LLM.embed(...)`, `LLM.score(...)`\*, `LLM.encode(..., pooling_task="embed")` | +| `classify` | `LLM.classify(...)`, `LLM.encode(..., pooling_task="classify")` | +| `score` | `LLM.score(...)` | +| `token_classify` | `LLM.reward(...)`, `LLM.encode(..., pooling_task="token_classify")` | +| `token_embed` | `LLM.encode(..., pooling_task="token_embed")` | +| `plugin` | `LLM.encode(..., pooling_task="plugin")` | \* The `LLM.score(...)` API falls back to `embed` task if the model does not support `score` task. From a7ba610b04a4bf878f120fc4497b81a1ae74faa7 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 22:19:03 +0800 Subject: [PATCH 16/25] + runner="pooling" Signed-off-by: wang.yuqi --- docs/models/pooling_models.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index f79e9f44bbfa..f43f19f48fa8 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -93,7 +93,7 @@ It is primarily designed for embedding models. ```python from vllm import LLM -llm = LLM(model="intfloat/e5-small") +llm = LLM(model="intfloat/e5-small", runner="pooling") (output,) = llm.embed("Hello, my name is") embeds = output.outputs.embedding @@ -110,7 +110,7 @@ It is primarily designed for classification models. ```python from vllm import LLM -llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach") +llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", runner="pooling") (output,) = llm.classify("Hello, my name is") probs = output.outputs.probs @@ -131,7 +131,7 @@ It is designed for embedding models and cross-encoder models. Embedding models u ```python from vllm import LLM -llm = LLM(model="BAAI/bge-reranker-v2-m3") +llm = LLM(model="BAAI/bge-reranker-v2-m3", runner="pooling") (output,) = llm.score( "What is the capital of France?", "The capital of Brazil is Brasilia.", @@ -150,7 +150,7 @@ The [reward][vllm.LLM.reward] method is available to all reward models in vLLM. ```python from vllm import LLM -llm = LLM(model="internlm/internlm2-1_8b-reward", trust_remote_code=True) +llm = LLM(model="internlm/internlm2-1_8b-reward", trust_remote_code=True, runner="pooling") (output,) = llm.reward("Hello, my name is") data = output.outputs.data @@ -178,7 +178,7 @@ The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM. 
from vllm import LLM llm = LLM(model="intfloat/e5-small") -(output,) = llm.encode("Hello, my name is", pooling_task="embed") +(output,) = llm.encode("Hello, my name is", pooling_task="embed", runner="pooling") data = output.outputs.data print(f"Data: {data!r}") From 4188194d2616b66498b3e7d0522bd54629efae28 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 22:22:14 +0800 Subject: [PATCH 17/25] Openai -> OpenAI Signed-off-by: wang.yuqi --- examples/online_serving/pooling/README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/online_serving/pooling/README.md b/examples/online_serving/pooling/README.md index 8e2878ae4573..b76ad21f0481 100644 --- a/examples/online_serving/pooling/README.md +++ b/examples/online_serving/pooling/README.md @@ -42,20 +42,20 @@ python examples/online_serving/pooling/multi_vector_retrieval_client.py python examples/online_serving/pooling/ner_client.py ``` -## Openai chat embedding for multimodal usage +## OpenAI chat embedding for multimodal usage ```bash python examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py ``` -## Openai classification usage +## OpenAI classification usage ```bash # vllm serve jason9693/Qwen2.5-1.5B-apeach python examples/online_serving/pooling/openai_classification_client.py ``` -## Openai cross_encoder score usage +## OpenAI cross_encoder score usage ```bash # vllm serve BAAI/bge-reranker-v2-m3 @@ -69,21 +69,21 @@ python examples/online_serving/pooling/openai_cross_encoder_score.py python examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py ``` -## Openai embedding usage +## OpenAI embedding usage ```bash # vllm serve intfloat/e5-small python examples/online_serving/pooling/openai_embedding_client.py ``` -## Openai embedding matryoshka dimensions usage +## OpenAI embedding matryoshka dimensions usage ```bash # vllm serve jinaai/jina-embeddings-v3 --trust-remote-code python examples/online_serving/pooling/openai_embedding_matryoshka_fy.py ``` -## Openai pooling usage +## OpenAI pooling usage ```bash # vllm serve internlm/internlm2-1_8b-reward --trust-remote-code From 86ce4c43590758e1e0e2dbfe9eebeac737c54e6d Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 22:31:47 +0800 Subject: [PATCH 18/25] activation -> use_activation Signed-off-by: wang.yuqi --- docs/models/pooling_models.md | 2 +- tests/entrypoints/pooling/llm/test_classify.py | 10 +++++----- tests/entrypoints/pooling/llm/test_score.py | 10 +++++----- .../test_pooler_config_init_behaviour.py | 4 ++-- tests/test_pooling_params.py | 14 +++++++------- vllm/model_executor/layers/pooler.py | 4 ++-- vllm/model_executor/models/config.py | 4 ++-- vllm/pooling_params.py | 18 +++++++++--------- 8 files changed, 33 insertions(+), 33 deletions(-) diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index f43f19f48fa8..b0a1a15d25ec 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -316,4 +316,4 @@ We have split the `encode` task into two more specific token wise tasks: `token_ ### Remove softmax from PoolingParams -We are going to remove `softmax` from `PoolingParams`. Instead, you should set `activation`, since we actually allow `classify` and `token_classify` to use any activation function. +We are going to remove `softmax` from `PoolingParams`. Instead, you should set `use_activation`, since we actually allow `classify` and `token_classify` to use any activation function. 
diff --git a/tests/entrypoints/pooling/llm/test_classify.py b/tests/entrypoints/pooling/llm/test_classify.py index 96f634ee0a8c..d5b781097b44 100644 --- a/tests/entrypoints/pooling/llm/test_classify.py +++ b/tests/entrypoints/pooling/llm/test_classify.py @@ -37,15 +37,15 @@ def llm(): @pytest.mark.skip_global_cleanup def test_pooling_params(llm: LLM): - def get_outputs(activation): + def get_outputs(use_activation): outputs = llm.classify( - prompts, pooling_params=PoolingParams(activation=activation), use_tqdm=False + prompts, pooling_params=PoolingParams(use_activation=use_activation), use_tqdm=False ) return torch.tensor([x.outputs.probs for x in outputs]) - default = get_outputs(activation=None) - w_activation = get_outputs(activation=True) - wo_activation = get_outputs(activation=False) + default = get_outputs(use_activation=None) + w_activation = get_outputs(use_activation=True) + wo_activation = get_outputs(use_activation=False) assert torch.allclose(default, w_activation, atol=1e-2), ( "Default should use activation." diff --git a/tests/entrypoints/pooling/llm/test_score.py b/tests/entrypoints/pooling/llm/test_score.py index 2df973dd7863..b69c6a47c191 100644 --- a/tests/entrypoints/pooling/llm/test_score.py +++ b/tests/entrypoints/pooling/llm/test_score.py @@ -34,21 +34,21 @@ def llm(): def test_pooling_params(llm: LLM): - def get_outputs(activation): + def get_outputs(use_activation): text_1 = "What is the capital of France?" text_2 = "The capital of France is Paris." outputs = llm.score( text_1, text_2, - pooling_params=PoolingParams(activation=activation), + pooling_params=PoolingParams(use_activation=use_activation), use_tqdm=False, ) return torch.tensor([x.outputs.score for x in outputs]) - default = get_outputs(activation=None) - w_activation = get_outputs(activation=True) - wo_activation = get_outputs(activation=False) + default = get_outputs(use_activation=None) + w_activation = get_outputs(use_activation=True) + wo_activation = get_outputs(use_activation=False) assert torch.allclose(default, w_activation, atol=1e-2), ( "Default should use activation." 
diff --git a/tests/models/language/pooling/test_pooler_config_init_behaviour.py b/tests/models/language/pooling/test_pooler_config_init_behaviour.py index 55663ee3f1b4..61e082f2789f 100644 --- a/tests/models/language/pooling/test_pooler_config_init_behaviour.py +++ b/tests/models/language/pooling/test_pooler_config_init_behaviour.py @@ -24,7 +24,7 @@ def test_classify_models_using_activation( model, max_model_len=512, dtype=dtype, - pooler_config=PoolerConfig(activation=False), + pooler_config=PoolerConfig(use_activation=False), ) as vllm_model: wo_activation_out = vllm_model.classify(example_prompts) @@ -32,7 +32,7 @@ def test_classify_models_using_activation( model, max_model_len=512, dtype=dtype, - pooler_config=PoolerConfig(activation=True), + pooler_config=PoolerConfig(use_activation=True), ) as vllm_model: w_activation_out = vllm_model.classify(example_prompts) diff --git a/tests/test_pooling_params.py b/tests/test_pooling_params.py index e73d7efc1483..7812562c8948 100644 --- a/tests/test_pooling_params.py +++ b/tests/test_pooling_params.py @@ -17,7 +17,7 @@ ), ] -classify_parameters = ["activation"] +classify_parameters = ["use_activation"] embed_parameters = ["dimensions", "normalize"] step_pooling_parameters = ["step_tag_id", "returned_token_ids"] @@ -88,13 +88,13 @@ def test_embed_dimensions(model_info: EmbedModelInfo): def test_classify(task): model_config = MockModelConfig(pooler_config=PoolerConfig(pooling_type="CLS")) - pooling_params = PoolingParams(activation=None) + pooling_params = PoolingParams(use_activation=None) pooling_params.verify(task=task, model_config=model_config) - pooling_params = PoolingParams(activation=True) + pooling_params = PoolingParams(use_activation=True) pooling_params.verify(task=task, model_config=model_config) - pooling_params = PoolingParams(activation=False) + pooling_params = PoolingParams(use_activation=False) pooling_params.verify(task=task, model_config=model_config) invalid_parameters = embed_parameters + step_pooling_parameters @@ -137,13 +137,13 @@ def test_token_classify(pooling_type: str): pooler_config=PoolerConfig(pooling_type=pooling_type) ) - pooling_params = PoolingParams(activation=None) + pooling_params = PoolingParams(use_activation=None) pooling_params.verify(task=task, model_config=model_config) - pooling_params = PoolingParams(activation=True) + pooling_params = PoolingParams(use_activation=True) pooling_params.verify(task=task, model_config=model_config) - pooling_params = PoolingParams(activation=False) + pooling_params = PoolingParams(use_activation=False) pooling_params.verify(task=task, model_config=model_config) invalid_parameters = embed_parameters diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index 145f18f23566..7dd02e32ff21 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -607,7 +607,7 @@ def forward( pooled_data -= self.logit_bias pooling_params = get_pooling_params(pooling_metadata) - flags = [p.activation for p in pooling_params] + flags = [p.use_activation for p in pooling_params] if len(set(flags)) == 1: scores = self.act_fn(pooled_data) if flags[0] else pooled_data @@ -681,7 +681,7 @@ def forward( if self.logit_bias is not None: scores -= self.logit_bias - if pooling_param.activation: + if pooling_param.use_activation: scores = self.act_fn(scores) # scores shape: [n_token, num_labels] diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 493b74bddda7..d1af1b6c802b 100644 --- 
a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -53,8 +53,8 @@ class JambaForSequenceClassificationConfig(VerifyAndUpdateConfig): @staticmethod def verify_and_update_config(vllm_config: "VllmConfig") -> None: pooler_config = vllm_config.model_config.pooler_config - if pooler_config.activation is None: - pooler_config.activation = False + if pooler_config.use_activation is None: + pooler_config.use_activation = False class JinaRobertaModelConfig(VerifyAndUpdateConfig): diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index 090d92414465..0fe6b7c5f301 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -28,7 +28,7 @@ class PoolingParams( normalize: Whether to normalize the embeddings outputs. dimensions: Reduce the dimensions of embeddings if model support matryoshka representation. - activation: Whether to apply activation function to + use_activation: Whether to apply activation function to the classification outputs. """ @@ -44,7 +44,7 @@ class PoolingParams( ## for classification, scoring and rerank # --8<-- [start:classification-pooling-params] - activation: bool | None = None + use_activation: bool | None = None # --8<-- [end:classification-pooling-params] ## for step pooling models @@ -59,16 +59,16 @@ class PoolingParams( @property def all_parameters(self) -> list[str]: - return ["dimensions", "normalize", "activation"] + return ["dimensions", "normalize", "use_activation"] @property def valid_parameters(self): return { "embed": ["dimensions", "normalize"], - "classify": ["activation"], - "score": ["activation"], + "classify": ["use_activation"], + "score": ["use_activation"], "token_embed": ["dimensions", "normalize"], - "token_classify": ["activation"], + "token_classify": ["use_activation"], } def clone(self) -> "PoolingParams": @@ -168,8 +168,8 @@ def _set_default_parameters(self, model_config: Optional["ModelConfig"]): raise ValueError("Dimensions must be greater than 0") elif self.task in ["classify", "score", "token_classify"]: - if self.activation is None: - self.activation = True + if self.use_activation is None: + self.use_activation = True else: raise ValueError(f"Unknown pooling task: {self.task}") @@ -197,7 +197,7 @@ def __repr__(self) -> str: f"task={self.task}, " f"normalize={self.normalize}, " f"dimensions={self.dimensions}, " - f"activation={self.activation}, " + f"use_activation={self.use_activation}, " f"step_tag_id={self.step_tag_id}, " f"returned_token_ids={self.returned_token_ids}, " f"requires_token_ids={self.requires_token_ids}, " From 44c7d8ab9e86a6759b58b4ee34c44a8100b7adad Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 22:34:51 +0800 Subject: [PATCH 19/25] fix Signed-off-by: wang.yuqi --- docs/models/pooling_models.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index b0a1a15d25ec..14cab221adba 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -311,6 +311,7 @@ An OpenAI client example can be found here: [examples/online_serving/pooling/ope ### Encode task We have split the `encode` task into two more specific token wise tasks: `token_embed` and `token_classify`: + - `token_embed` is the same as embed, using normalize as activation. - `token_classify` is the same as classify, default using softmax as activation. 
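The renamed flag is passed the same way the old one was. A short sketch against the updated `PoolingParams`, assuming the `jason9693/Qwen2.5-1.5B-apeach` classifier used in the tests:

```python
# Sketch of the renamed flag: `use_activation` replaces `activation`
# (and the deprecated `softmax`) on PoolingParams. Assumes the
# jason9693/Qwen2.5-1.5B-apeach classifier used in the tests.
from vllm import LLM
from vllm.pooling_params import PoolingParams

llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", runner="pooling")

# use_activation=False returns raw logits; leaving it as None applies
# the activation function, same as use_activation=True.
(output,) = llm.classify(
    "Hello, my name is",
    pooling_params=PoolingParams(use_activation=False),
)
print(output.outputs.probs)  # raw logits when use_activation=False
```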
From d46428a4282302d2746046ee35aee0d18f08a30a Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 22:37:33 +0800 Subject: [PATCH 20/25] fix Signed-off-by: wang.yuqi --- tests/entrypoints/pooling/llm/test_classify.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/entrypoints/pooling/llm/test_classify.py b/tests/entrypoints/pooling/llm/test_classify.py index d5b781097b44..1063c3b6b755 100644 --- a/tests/entrypoints/pooling/llm/test_classify.py +++ b/tests/entrypoints/pooling/llm/test_classify.py @@ -39,7 +39,9 @@ def llm(): def test_pooling_params(llm: LLM): def get_outputs(use_activation): outputs = llm.classify( - prompts, pooling_params=PoolingParams(use_activation=use_activation), use_tqdm=False + prompts, + pooling_params=PoolingParams(use_activation=use_activation), + use_tqdm=False, ) return torch.tensor([x.outputs.probs for x in outputs]) From 90df794a0ed0f07a4282748a4422311b25da4a54 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 22:49:40 +0800 Subject: [PATCH 21/25] activation -> use_activation Signed-off-by: wang.yuqi --- docs/models/pooling_models.md | 6 +++--- tests/entrypoints/pooling/llm/test_reward.py | 12 ++++++----- .../pooling/openai/test_classification.py | 16 +++++++++------ .../entrypoints/pooling/openai/test_rerank.py | 12 +++++------ .../entrypoints/pooling/openai/test_score.py | 16 +++++++-------- vllm/entrypoints/openai/protocol.py | 20 +++++++++---------- 6 files changed, 44 insertions(+), 38 deletions(-) diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 14cab221adba..5c9180c69c4b 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -150,7 +150,7 @@ The [reward][vllm.LLM.reward] method is available to all reward models in vLLM. ```python from vllm import LLM -llm = LLM(model="internlm/internlm2-1_8b-reward", trust_remote_code=True, runner="pooling") +llm = LLM(model="internlm/internlm2-1_8b-reward", runner="pooling", trust_remote_code=True) (output,) = llm.reward("Hello, my name is") data = output.outputs.data @@ -177,8 +177,8 @@ The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM. ```python from vllm import LLM -llm = LLM(model="intfloat/e5-small") -(output,) = llm.encode("Hello, my name is", pooling_task="embed", runner="pooling") +llm = LLM(model="intfloat/e5-small", runner="pooling") +(output,) = llm.encode("Hello, my name is", pooling_task="embed") data = output.outputs.data print(f"Data: {data!r}") diff --git a/tests/entrypoints/pooling/llm/test_reward.py b/tests/entrypoints/pooling/llm/test_reward.py index 81058dbad891..0255704cecd9 100644 --- a/tests/entrypoints/pooling/llm/test_reward.py +++ b/tests/entrypoints/pooling/llm/test_reward.py @@ -37,15 +37,17 @@ def llm(): def test_pooling_params(llm: LLM): - def get_outputs(activation): + def get_outputs(use_activation): outputs = llm.reward( - prompts, pooling_params=PoolingParams(activation=activation), use_tqdm=False + prompts, + pooling_params=PoolingParams(use_activation=use_activation), + use_tqdm=False, ) return torch.cat([x.outputs.data for x in outputs]) - default = get_outputs(activation=None) - w_activation = get_outputs(activation=True) - wo_activation = get_outputs(activation=False) + default = get_outputs(use_activation=None) + w_activation = get_outputs(use_activation=True) + wo_activation = get_outputs(use_activation=False) assert torch.allclose(default, w_activation, atol=1e-2), ( "Default should use activation." 
diff --git a/tests/entrypoints/pooling/openai/test_classification.py b/tests/entrypoints/pooling/openai/test_classification.py index 2bffe5dabedc..671bb948780a 100644 --- a/tests/entrypoints/pooling/openai/test_classification.py +++ b/tests/entrypoints/pooling/openai/test_classification.py @@ -163,20 +163,24 @@ async def test_invocations(server: RemoteOpenAIServer): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_activation(server: RemoteOpenAIServer, model_name: str): +async def test_use_activation(server: RemoteOpenAIServer, model_name: str): input_text = ["This product was excellent and exceeded my expectations"] - async def get_outputs(activation): + async def get_outputs(use_activation): response = requests.post( server.url_for("classify"), - json={"model": model_name, "input": input_text, "activation": activation}, + json={ + "model": model_name, + "input": input_text, + "use_activation": use_activation, + }, ) outputs = response.json() return torch.tensor([x["probs"] for x in outputs["data"]]) - default = await get_outputs(activation=None) - w_activation = await get_outputs(activation=True) - wo_activation = await get_outputs(activation=False) + default = await get_outputs(use_activation=None) + w_activation = await get_outputs(use_activation=True) + wo_activation = await get_outputs(use_activation=False) assert torch.allclose(default, w_activation, atol=1e-2), ( "Default should use activation." diff --git a/tests/entrypoints/pooling/openai/test_rerank.py b/tests/entrypoints/pooling/openai/test_rerank.py index edfb3f7cb4dd..1d85190c12a1 100644 --- a/tests/entrypoints/pooling/openai/test_rerank.py +++ b/tests/entrypoints/pooling/openai/test_rerank.py @@ -125,8 +125,8 @@ def test_invocations(server: RemoteOpenAIServer): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_activation(server: RemoteOpenAIServer, model_name: str): - async def get_outputs(activation): +async def test_use_activation(server: RemoteOpenAIServer, model_name: str): + async def get_outputs(use_activation): query = "What is the capital of France?" documents = [ "The capital of Brazil is Brasilia.", @@ -139,16 +139,16 @@ async def get_outputs(activation): "model": model_name, "query": query, "documents": documents, - "activation": activation, + "use_activation": use_activation, }, ) outputs = response.json() return torch.tensor([x["relevance_score"] for x in outputs["results"]]) - default = await get_outputs(activation=None) - w_activation = await get_outputs(activation=True) - wo_activation = await get_outputs(activation=False) + default = await get_outputs(use_activation=None) + w_activation = await get_outputs(use_activation=True) + wo_activation = await get_outputs(use_activation=False) assert torch.allclose(default, w_activation, atol=1e-2), ( "Default should use activation." 
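The same flag at the HTTP layer, as a hedged sketch (the server URL is an assumption; the tests above drive the identical payload through `RemoteOpenAIServer`):

```python
# Sketch only: assumes a running server for the classification model
# used in these tests, e.g. `vllm serve jason9693/Qwen2.5-1.5B-apeach`.
import requests

resp = requests.post(
    "http://localhost:8000/classify",
    json={
        "model": "jason9693/Qwen2.5-1.5B-apeach",
        "input": ["This product was excellent and exceeded my expectations"],
        "use_activation": False,  # skip the activation; return raw scores
    },
)
print(resp.json()["data"][0]["probs"])
```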
diff --git a/tests/entrypoints/pooling/openai/test_score.py b/tests/entrypoints/pooling/openai/test_score.py
index ef213ab0ea18..b8f796d47efa 100644
--- a/tests/entrypoints/pooling/openai/test_score.py
+++ b/tests/entrypoints/pooling/openai/test_score.py
@@ -218,8 +218,8 @@ def test_invocations(self, server: RemoteOpenAIServer, model: dict[str, Any]):

     # TODO: reset this tolerance to 0.01 once we find
     # an alternative to flash_attn with bfloat16
-    def test_activation(self, server: RemoteOpenAIServer, model: dict[str, Any]):
-        def get_outputs(activation):
+    def test_use_activation(self, server: RemoteOpenAIServer, model: dict[str, Any]):
+        def get_outputs(use_activation):
             text_1 = "What is the capital of France?"
             text_2 = "The capital of France is Paris."
             response = requests.post(
@@ -228,7 +228,7 @@ def get_outputs(activation):
                     "model": model["name"],
                     "text_1": text_1,
                     "text_2": text_2,
-                    "activation": activation,
+                    "use_activation": use_activation,
                 },
             )
             if response.status_code != 200:
@@ -238,9 +238,9 @@ def get_outputs(activation):
             return torch.tensor([x["score"] for x in outputs["data"]])

         if model["is_cross_encoder"]:
-            default = get_outputs(activation=None)
-            w_activation = get_outputs(activation=True)
-            wo_activation = get_outputs(activation=False)
+            default = get_outputs(use_activation=None)
+            w_activation = get_outputs(use_activation=True)
+            wo_activation = get_outputs(use_activation=False)

             assert torch.allclose(default, w_activation, atol=1e-2), (
                 "Default should use activation."
@@ -252,8 +252,8 @@ def get_outputs(activation):
                 "w_activation should be close to activation(wo_activation)."
             )
         else:
-            get_outputs(activation=None)
+            get_outputs(use_activation=None)

-            # The activation parameter only works for the is_cross_encoder model
-            response = get_outputs(activation=True)
+            # The use_activation parameter only works for cross-encoder models
+            response = get_outputs(use_activation=True)
             assert response.status_code == 400
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 4bfa7ce9a927..dfb5995024f4 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -1673,7 +1673,7 @@ def to_pooling_params(self):

 class PoolingCompletionRequest(EmbeddingCompletionRequest):
     task: PoolingTask | None = None
-    activation: bool | None = Field(
+    use_activation: bool | None = Field(
         default=None,
         description="Whether to use activation for classification outputs. "
         "If it is a classify or token_classify task, the default is True; "
         "for other tasks, the default is False.",
     )
@@ -1685,13 +1685,13 @@ def to_pooling_params(self):
             truncate_prompt_tokens=self.truncate_prompt_tokens,
             dimensions=self.dimensions,
             normalize=self.normalize,
-            activation=self.activation,
+            use_activation=self.use_activation,
         )


 class PoolingChatRequest(EmbeddingChatRequest):
     task: PoolingTask | None = None
-    activation: bool | None = Field(
+    use_activation: bool | None = Field(
         default=None,
         description="Whether to use activation for classification outputs. "
         "If it is a classify or token_classify task, the default is True; "
         "for other tasks, the default is False.",
     )
@@ -1703,7 +1703,7 @@ def to_pooling_params(self):
             truncate_prompt_tokens=self.truncate_prompt_tokens,
             dimensions=self.dimensions,
             normalize=self.normalize,
-            activation=self.activation,
+            use_activation=self.use_activation,
         )


@@ -1785,7 +1785,7 @@ class ScoreRequest(OpenAIBaseModel):
         ),
     )

-    activation: bool | None = Field(
+    use_activation: bool | None = Field(
         default=None,
         description="Whether to use activation for classification outputs. 
" "Default is True.", @@ -1795,7 +1795,7 @@ class ScoreRequest(OpenAIBaseModel): def to_pooling_params(self): return PoolingParams( truncate_prompt_tokens=self.truncate_prompt_tokens, - activation=self.activation, + use_activation=self.use_activation, ) @@ -1822,7 +1822,7 @@ class RerankRequest(OpenAIBaseModel): ), ) - activation: bool | None = Field( + use_activation: bool | None = Field( default=None, description="Whether to use activation for classification outputs. " "Default is True.", @@ -1832,7 +1832,7 @@ class RerankRequest(OpenAIBaseModel): def to_pooling_params(self): return PoolingParams( truncate_prompt_tokens=self.truncate_prompt_tokens, - activation=self.activation, + use_activation=self.use_activation, ) @@ -2000,7 +2000,7 @@ class ClassificationRequest(OpenAIBaseModel): ), ) - activation: bool | None = Field( + use_activation: bool | None = Field( default=None, description="Whether to use activation for classification outputs. " "Default is True.", @@ -2010,7 +2010,7 @@ class ClassificationRequest(OpenAIBaseModel): def to_pooling_params(self): return PoolingParams( truncate_prompt_tokens=self.truncate_prompt_tokens, - activation=self.activation, + use_activation=self.use_activation, ) From 90746ca29bcc7680e6126601bd009f5ef885fe54 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Oct 2025 22:52:59 +0800 Subject: [PATCH 22/25] fix Signed-off-by: wang.yuqi --- docs/models/pooling_models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 5c9180c69c4b..bb2a7287f1a0 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -317,4 +317,4 @@ We have split the `encode` task into two more specific token wise tasks: `token_ ### Remove softmax from PoolingParams -We are going to remove `softmax` from `PoolingParams`. Instead, you should set `use_activation`, since we actually allow `classify` and `token_classify` to use any activation function. +We are going to remove `softmax` from `PoolingParams`. Instead, you should set `use_activation`, since we actually allow `classify` and `token_classify` to use any activation function. From 2cf3132fd80c93191af916f4c246d5e9450250c9 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Wed, 29 Oct 2025 00:19:31 +0800 Subject: [PATCH 23/25] fix Signed-off-by: wang.yuqi --- docs/serving/openai_compatible_server.md | 4 ++-- vllm/config/pooler.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index 1414718a697d..e331b3422ea6 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -638,7 +638,7 @@ Usually, the score for a sentence pair refers to the similarity between two sent You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html). 
-Code example: [examples/online_serving/openai_cross_encoder_score.py](../../examples/online_serving/openai_cross_encoder_score.py)
+Code example: [examples/online_serving/pooling/openai_cross_encoder_score.py](../../examples/online_serving/pooling/openai_cross_encoder_score.py)

 #### Single inference

@@ -819,7 +819,7 @@ You can pass multi-modal inputs to scoring models by passing `content` including
     print("Scoring output:", response_json["data"][0]["score"])
     print("Scoring output:", response_json["data"][1]["score"])
 ```
-Full example: [examples/online_serving/openai_cross_encoder_score_for_multimodal.py](../../examples/online_serving/openai_cross_encoder_score_for_multimodal.py)
+Full example: [examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py](../../examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py)

 #### Extra parameters

diff --git a/vllm/config/pooler.py b/vllm/config/pooler.py
index 0590f74aa4c9..426779c23ce5 100644
--- a/vllm/config/pooler.py
+++ b/vllm/config/pooler.py
@@ -48,7 +48,7 @@ class PoolerConfig:
     """

     ## for classification models
-    activation: bool | None = None
+    use_activation: bool | None = None
     """
     Whether to apply activation function to the classification outputs.
     Defaults to True.

From 794669dfe627645b764bcdf262c74f6957d37f56 Mon Sep 17 00:00:00 2001
From: "wang.yuqi"
Date: Wed, 29 Oct 2025 00:20:24 +0800
Subject: [PATCH 24/25] fix

Signed-off-by: wang.yuqi
---
 .../language/pooling/test_pooler_config_init_behaviour.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/models/language/pooling/test_pooler_config_init_behaviour.py b/tests/models/language/pooling/test_pooler_config_init_behaviour.py
index 61e082f2789f..deb5de984d90 100644
--- a/tests/models/language/pooling/test_pooler_config_init_behaviour.py
+++ b/tests/models/language/pooling/test_pooler_config_init_behaviour.py
@@ -104,7 +104,7 @@ def test_reward_models_using_activation(
         model,
         max_model_len=1024,
         dtype=dtype,
-        pooler_config=PoolerConfig(activation=False),
+        pooler_config=PoolerConfig(use_activation=False),
     ) as vllm_model:
         wo_activation = vllm_model.reward(example_prompts)

@@ -112,7 +112,7 @@ def test_reward_models_using_activation(
         model,
         max_model_len=1024,
         dtype=dtype,
-        pooler_config=PoolerConfig(activation=True),
+        pooler_config=PoolerConfig(use_activation=True),
     ) as vllm_model:
         w_activation = vllm_model.reward(example_prompts)

From 4c2a98e4bddc2c892ae97926e79fc4d267f31d47 Mon Sep 17 00:00:00 2001
From: "wang.yuqi"
Date: Thu, 30 Oct 2025 15:58:59 +0800
Subject: [PATCH 25/25] add deprecation warnings

Signed-off-by: wang.yuqi
---
 docs/models/pooling_models.md       |  2 +-
 vllm/config/pooler.py               | 36 +++++++++++++++---
 vllm/entrypoints/openai/protocol.py | 57 ++++++++++++++++++++++++++---
 vllm/pooling_params.py              | 16 +++++---
 4 files changed, 95 insertions(+), 16 deletions(-)

diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md
index bb2a7287f1a0..18bb645ea9a9 100644
--- a/docs/models/pooling_models.md
+++ b/docs/models/pooling_models.md
@@ -317,4 +317,4 @@ We have split the `encode` task into two more specific token-wise tasks: `token_
 ### Remove softmax from PoolingParams

-We are going to remove `softmax` from `PoolingParams`. Instead, you should set `use_activation`, since we actually allow `classify` and `token_classify` to use any activation function.
+We are going to remove `softmax` and `activation` from `PoolingParams`.
Instead, you should set `use_activation`, since we actually allow `classify` and `token_classify` to use any activation function.
diff --git a/vllm/config/pooler.py b/vllm/config/pooler.py
index 426779c23ce5..6bece8d0785b 100644
--- a/vllm/config/pooler.py
+++ b/vllm/config/pooler.py
@@ -7,6 +7,9 @@
 from pydantic.dataclasses import dataclass

 from vllm.config.utils import config
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)


 @config
@@ -48,6 +51,14 @@ class PoolerConfig:
     """

     ## for classification models
+    softmax: bool | None = None
+    """
+    softmax is deprecated, please use use_activation instead.
+    """
+    activation: bool | None = None
+    """
+    activation is deprecated, please use use_activation instead.
+    """
     use_activation: bool | None = None
     """
     Whether to apply activation function to the classification outputs.
     Defaults to True.
@@ -59,11 +70,6 @@ class PoolerConfig:
     """

     ## for reward models
-    softmax: bool | None = None
-    """
-    Whether to apply softmax to the reward outputs.
-    Defaults to True.
-    """
     step_tag_id: int | None = None
     """
     If set, only the score corresponding to the `step_tag_id` in the
@@ -77,6 +83,10 @@ class PoolerConfig:
     `math-shepherd-mistral-7b-prm` model.
     """

+    def __post_init__(self):
+        # emit a deprecation warning if the legacy softmax or
+        # activation field is set
+        self.use_activation = get_use_activation(self)
+
     def compute_hash(self) -> str:
         """
         WARNING: Whenever a new field is added to this config,
@@ -94,3 +104,19 @@ def compute_hash(self) -> str:
         factors: list[Any] = []
         hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
         return hash_str
+
+
+def get_use_activation(o: object) -> bool | None:
+    # note the parentheses: the walrus must bind the attribute value,
+    # not the result of the `is not None` comparison
+    if (softmax := getattr(o, "softmax", None)) is not None:
+        logger.warning_once(
+            "softmax is deprecated, please use use_activation instead."
+        )
+        return softmax
+
+    if (activation := getattr(o, "activation", None)) is not None:
+        logger.warning_once(
+            "activation is deprecated, please use use_activation instead."
+        )
+        return activation
+
+    return getattr(o, "use_activation", None)
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index dfb5995024f4..d0061f9d5b40 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -49,6 +49,7 @@
 )
 from openai_harmony import Message as OpenAIHarmonyMessage

+from vllm.config.pooler import get_use_activation
 from vllm.tasks import PoolingTask
 from vllm.utils.serial_utils import (
     EmbedDType,
@@ -1673,6 +1674,14 @@ def to_pooling_params(self):

 class PoolingCompletionRequest(EmbeddingCompletionRequest):
     task: PoolingTask | None = None
+    softmax: bool | None = Field(
+        default=None,
+        description="softmax is deprecated, please use use_activation instead.",
+    )
+    activation: bool | None = Field(
+        default=None,
+        description="activation is deprecated, please use use_activation instead.",
+    )
     use_activation: bool | None = Field(
         default=None,
         description="Whether to use activation for classification outputs. "
" @@ -1685,12 +1694,20 @@ def to_pooling_params(self): truncate_prompt_tokens=self.truncate_prompt_tokens, dimensions=self.dimensions, normalize=self.normalize, - use_activation=self.use_activation, + use_activation=get_use_activation(self), ) class PoolingChatRequest(EmbeddingChatRequest): task: PoolingTask | None = None + softmax: bool | None = Field( + default=None, + description="softmax will be deprecated, please use use_activation instead.", + ) + activation: bool | None = Field( + default=None, + description="activation will be deprecated, please use use_activation instead.", + ) use_activation: bool | None = Field( default=None, description="Whether to use activation for classification outputs. " @@ -1703,7 +1720,7 @@ def to_pooling_params(self): truncate_prompt_tokens=self.truncate_prompt_tokens, dimensions=self.dimensions, normalize=self.normalize, - use_activation=self.use_activation, + use_activation=get_use_activation(self), ) @@ -1785,6 +1802,16 @@ class ScoreRequest(OpenAIBaseModel): ), ) + softmax: bool | None = Field( + default=None, + description="softmax will be deprecated, please use use_activation instead.", + ) + + activation: bool | None = Field( + default=None, + description="activation will be deprecated, please use use_activation instead.", + ) + use_activation: bool | None = Field( default=None, description="Whether to use activation for classification outputs. " @@ -1795,7 +1822,7 @@ class ScoreRequest(OpenAIBaseModel): def to_pooling_params(self): return PoolingParams( truncate_prompt_tokens=self.truncate_prompt_tokens, - use_activation=self.use_activation, + use_activation=get_use_activation(self), ) @@ -1822,6 +1849,16 @@ class RerankRequest(OpenAIBaseModel): ), ) + softmax: bool | None = Field( + default=None, + description="softmax will be deprecated, please use use_activation instead.", + ) + + activation: bool | None = Field( + default=None, + description="activation will be deprecated, please use use_activation instead.", + ) + use_activation: bool | None = Field( default=None, description="Whether to use activation for classification outputs. " @@ -1832,7 +1869,7 @@ class RerankRequest(OpenAIBaseModel): def to_pooling_params(self): return PoolingParams( truncate_prompt_tokens=self.truncate_prompt_tokens, - use_activation=self.use_activation, + use_activation=get_use_activation(self), ) @@ -2000,6 +2037,16 @@ class ClassificationRequest(OpenAIBaseModel): ), ) + softmax: bool | None = Field( + default=None, + description="softmax will be deprecated, please use use_activation instead.", + ) + + activation: bool | None = Field( + default=None, + description="activation will be deprecated, please use use_activation instead.", + ) + use_activation: bool | None = Field( default=None, description="Whether to use activation for classification outputs. 
" @@ -2010,7 +2057,7 @@ class ClassificationRequest(OpenAIBaseModel): def to_pooling_params(self): return PoolingParams( truncate_prompt_tokens=self.truncate_prompt_tokens, - use_activation=self.use_activation, + use_activation=get_use_activation(self), ) diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index 0fe6b7c5f301..72a8320cc1bf 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -2,16 +2,15 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from copy import deepcopy -from typing import TYPE_CHECKING, Annotated, Any, Optional +from typing import Annotated, Any, Optional import msgspec +from vllm.config import ModelConfig, PoolerConfig +from vllm.config.pooler import get_use_activation from vllm.sampling_params import RequestOutputKind from vllm.tasks import PoolingTask -if TYPE_CHECKING: - from vllm.config import ModelConfig, PoolerConfig - class PoolingParams( msgspec.Struct, @@ -25,9 +24,11 @@ class PoolingParams( Set to -1 to use the model's default truncation size. Set to k to keep only the last k tokens (left truncation). Set to None to disable truncation. - normalize: Whether to normalize the embeddings outputs. dimensions: Reduce the dimensions of embeddings if model support matryoshka representation. + normalize: Whether to normalize the embeddings outputs. + softmax: softmax will be deprecated, please use use_activation instead. + activation: activation will be deprecated, please use use_activation instead. use_activation: Whether to apply activation function to the classification outputs. """ @@ -44,6 +45,8 @@ class PoolingParams( ## for classification, scoring and rerank # --8<-- [start:classification-pooling-params] + softmax: bool | None = None + activation: bool | None = None use_activation: bool | None = None # --8<-- [end:classification-pooling-params] @@ -84,6 +87,9 @@ def verify( msg = f"You cannot overwrite {self.task=!r} with {task=!r}!" raise ValueError(msg) + # raise deprecated warning for softmax and activation + self.use_activation = get_use_activation(self) + # plugin task uses io_processor.parse_request to verify inputs, # skipping PoolingParams verify if self.task == "plugin":