
Commit b074399

noooop authored and albertoperdomo2 committed
[Model] Add num_cached_tokens for PoolingRequestOutput (vllm-project#27378)
Signed-off-by: wang.yuqi <noooop@126.com>
Signed-off-by: Alberto Perdomo <aperdomo@redhat.com>
1 parent 7437ded commit b074399
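
For orientation, a minimal usage sketch of the new field (not part of the diff): the model name and prompts are illustrative placeholders, with prefix caching enabled as in the updated tests below.

# Hedged sketch, not from the commit: model and prompts are placeholders.
from vllm import LLM

llm = LLM(model="Qwen/Qwen3-0.6B", runner="pooling", enable_prefix_caching=True)
prompts = ["vLLM is a fast and easy-to-use library for LLM inference. " * 10] * 4

# First run warms the prefix cache.
llm.encode(prompts, pooling_task="embed")

# Second run over the same prompts should report per-request cache hits.
for out in llm.encode(prompts, pooling_task="embed"):
    print(out.request_id, out.num_cached_tokens)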

File tree

7 files changed: +75 -5 lines changed


tests/models/language/pooling/test_auto_prefix_cache_support.py

Lines changed: 24 additions & 4 deletions
@@ -19,14 +19,25 @@ def test_classify_models(
     model: str,
     dtype: str,
 ) -> None:
-    example_prompts = example_prompts * 2
+    # example_prompts is too short for testing prefix_caching
+    example_prompts = [s * 10 for s in example_prompts]
 
     with vllm_runner(
         model, max_model_len=512, dtype=dtype, enable_prefix_caching=True
     ) as vllm_model:
         cache_config = vllm_model.llm.llm_engine.cache_config
         assert cache_config.enable_prefix_caching
-        vllm_outputs = vllm_model.classify(example_prompts)
+
+        # First Run
+        vllm_model.classify(example_prompts)
+
+        # assert prefix_caching works
+        pooling_outputs = vllm_model.llm.encode(
+            example_prompts, pooling_task="classify"
+        )
+        for output in pooling_outputs:
+            assert output.num_cached_tokens > 0
+        vllm_outputs = [req_output.outputs.data for req_output in pooling_outputs]
 
     with hf_runner(
         model, dtype=dtype, auto_cls=AutoModelForSequenceClassification
@@ -54,7 +65,8 @@ def test_embed_models(
     model: str,
     dtype: str,
 ):
-    example_prompts = [str(s).strip() for s in example_prompts] * 2
+    # example_prompts is too short for testing prefix_caching
+    example_prompts = [str(s).strip() * 10 for s in example_prompts]
 
     with vllm_runner(
         model,
@@ -64,7 +76,15 @@ def test_embed_models(
     ) as vllm_model:
         cache_config = vllm_model.llm.llm_engine.cache_config
         assert cache_config.enable_prefix_caching
-        vllm_outputs = vllm_model.embed(example_prompts)
+
+        # First Run
+        vllm_model.embed(example_prompts)
+
+        # assert prefix_caching works
+        pooling_outputs = vllm_model.llm.encode(example_prompts, pooling_task="embed")
+        for output in pooling_outputs:
+            assert output.num_cached_tokens > 0
+        vllm_outputs = [req_output.outputs.data for req_output in pooling_outputs]
 
     with hf_runner(
         model,

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch
+
+from vllm import TokensPrompt
+
+
+@pytest.mark.parametrize(
+    "model",
+    ["Qwen/Qwen3-0.6B"],
+)
+@torch.inference_mode
+def test_embed_models(hf_runner, vllm_runner, model: str):
+    n_prompt_tokens = [55, 56, 57]
+    token_prompts = [[1024 + i for i in range(n)] for n in n_prompt_tokens]
+
+    with vllm_runner(
+        model,
+        max_model_len=128,
+        enforce_eager=True,
+        runner="pooling",
+        enable_chunked_prefill=False,
+        enable_prefix_caching=False,
+    ) as vllm_model:
+        pooling_outputs = vllm_model.llm.encode(
+            [TokensPrompt(prompt_token_ids=t) for t in token_prompts],
+            pooling_task="token_embed",
+        )
+
+        for n, output in zip(n_prompt_tokens, pooling_outputs):
+            assert len(output.prompt_token_ids) == n
+            assert output.num_cached_tokens == 0
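
The new test pins down the disabled-cache case (num_cached_tokens == 0). Since PoolingRequestOutput now carries both prompt_token_ids and num_cached_tokens, a small hedged helper sketch (not in the commit) can turn the two into a per-request cache hit rate:

# Hedged helper sketch, not part of the commit.
from vllm.outputs import PoolingRequestOutput


def cache_hit_rates(outputs: list[PoolingRequestOutput]) -> list[float]:
    """Fraction of prompt tokens served from the prefix cache, per request."""
    rates: list[float] = []
    for out in outputs:
        n_prompt = len(out.prompt_token_ids)
        # Guard against empty prompt_token_ids; the LLM.encode pooler path
        # below fills in [] for processed outputs.
        rates.append(out.num_cached_tokens / n_prompt if n_prompt else 0.0)
    return rates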

vllm/entrypoints/llm.py

Lines changed: 3 additions & 0 deletions
@@ -1078,6 +1078,9 @@ def encode(
                 PoolingRequestOutput[Any](
                     request_id="",
                     outputs=processed_outputs,
+                    num_cached_tokens=getattr(
+                        processed_outputs, "num_cached_tokens", 0
+                    ),
                     prompt_token_ids=[],
                     finished=True,
                 )
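
The getattr default keeps this post-processing path working when the processed pooler output does not expose a cached-token count, falling back to 0. Callers that need to run against both old and new vLLM versions could use the same defensive pattern, sketched here as an assumption rather than an official API:

# Hedged compatibility sketch, not from the commit.
def cached_tokens_or_zero(request_output) -> int:
    # Older vLLM releases have no num_cached_tokens on pooling outputs.
    return getattr(request_output, "num_cached_tokens", 0)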

vllm/entrypoints/openai/serving_embedding.py

Lines changed: 1 addition & 0 deletions
@@ -583,6 +583,7 @@ async def _collect_batch(
                     request_id=aggregator["request_id"],
                     prompt_token_ids=original_token_ids,
                     outputs=pooling_output_data,
+                    num_cached_tokens=0,
                     finished=True,
                 )
 

vllm/entrypoints/score_utils.py

Lines changed: 1 addition & 0 deletions
@@ -66,6 +66,7 @@ def _cosine_similarity(
                 request_id=f"{emb_1.request_id}_{emb_2.request_id}",
                 outputs=pair_score,
                 prompt_token_ids=tokens,
+                num_cached_tokens=emb_1.num_cached_tokens + emb_2.num_cached_tokens,
                 finished=True,
             )
         )
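
In the embedding-based scoring path, each score is computed from two pooled inputs, so the reported count is the sum of the prefix-cache hits on both sides. A hedged aggregation sketch (not in the commit) that uses this to report total cache hits for a batch of scoring outputs, e.g. from LLM.score():

# Hedged aggregation sketch, not part of the commit.
def total_cached_tokens(score_outputs) -> int:
    # Each scoring output already sums the hits of its two embedded inputs,
    # per the _cosine_similarity change above.
    return sum(out.num_cached_tokens for out in score_outputs)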

vllm/outputs.py

Lines changed: 12 additions & 1 deletion
@@ -201,14 +201,21 @@ class PoolingRequestOutput(Generic[_O]):
         request_id (str): A unique identifier for the pooling request.
         outputs (PoolingOutput): The pooling results for the given input.
         prompt_token_ids (list[int]): A list of token IDs used in the prompt.
+        num_cached_tokens: The number of tokens with prefix cache hit.
         finished (bool): A flag indicating whether the pooling is completed.
     """
 
     def __init__(
-        self, request_id: str, outputs: _O, prompt_token_ids: list[int], finished: bool
+        self,
+        request_id: str,
+        outputs: _O,
+        prompt_token_ids: list[int],
+        num_cached_tokens: int,
+        finished: bool,
     ):
         self.request_id = request_id
         self.prompt_token_ids = prompt_token_ids
+        self.num_cached_tokens = num_cached_tokens
         self.finished = finished
         self.outputs = outputs
 
@@ -217,6 +224,7 @@ def __repr__(self):
             f"{type(self).__name__}(request_id={self.request_id!r}, "
             f"outputs={self.outputs!r}, "
             f"prompt_token_ids={self.prompt_token_ids}, "
+            f"num_cached_tokens={self.num_cached_tokens}, "
             f"finished={self.finished})"
         )
 
@@ -255,6 +263,7 @@ def from_base(request_output: PoolingRequestOutput):
             request_id=request_output.request_id,
             outputs=EmbeddingOutput.from_base(request_output.outputs),
             prompt_token_ids=request_output.prompt_token_ids,
+            num_cached_tokens=request_output.num_cached_tokens,
             finished=request_output.finished,
         )
 
@@ -294,6 +303,7 @@ def from_base(request_output: PoolingRequestOutput):
             request_id=request_output.request_id,
             outputs=ClassificationOutput.from_base(request_output.outputs),
             prompt_token_ids=request_output.prompt_token_ids,
+            num_cached_tokens=request_output.num_cached_tokens,
             finished=request_output.finished,
         )
 
@@ -330,5 +340,6 @@ def from_base(request_output: PoolingRequestOutput):
             request_id=request_output.request_id,
             outputs=ScoringOutput.from_base(request_output.outputs),
             prompt_token_ids=request_output.prompt_token_ids,
+            num_cached_tokens=request_output.num_cached_tokens,
             finished=request_output.finished,
         )
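
num_cached_tokens is now a required parameter of the PoolingRequestOutput constructor and appears in its repr. A hedged construction sketch (not in the commit), assuming PoolingOutput still wraps a single data tensor:

# Hedged construction sketch, not from the commit; the tensor is a placeholder.
import torch

from vllm.outputs import PoolingOutput, PoolingRequestOutput

out = PoolingRequestOutput(
    request_id="req-0",
    outputs=PoolingOutput(data=torch.zeros(4)),
    prompt_token_ids=[1, 2, 3, 4],
    num_cached_tokens=0,
    finished=True,
)
print(out)  # repr now includes num_cached_tokens=0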

vllm/v1/engine/output_processor.py

Lines changed: 1 addition & 0 deletions
@@ -230,6 +230,7 @@ def _new_request_output(
             return PoolingRequestOutput(
                 request_id=request_id,
                 outputs=first_output,
+                num_cached_tokens=self.num_cached_tokens,
                 prompt_token_ids=self.prompt_token_ids,
                 finished=finished,
             )
