
Commit c087238

Kfir Wolfson committed
add threshold to request logger and fix some calls to encode
Signed-off-by: Kfir Wolfson <kfirw@pliops.com>
1 parent 521b173 commit c087238

File tree

vllm/entrypoints/logger.py
vllm/entrypoints/openai/serving_chat.py
vllm/entrypoints/openai/serving_completion.py
vllm/entrypoints/openai/serving_embedding.py
vllm/entrypoints/openai/serving_engine.py

5 files changed: +38 -22 lines changed


vllm/entrypoints/logger.py

Lines changed: 2 additions & 0 deletions
@@ -26,6 +26,7 @@ def log_inputs(
         prompt_embeds: Optional[torch.Tensor],
         params: Optional[Union[SamplingParams, PoolingParams, BeamSearchParams]],
         lora_request: Optional[LoRARequest],
+        cache_hit_threshold: Optional[float],
     ) -> None:
         max_log_len = self.max_log_len
         if max_log_len is not None:
@@ -46,6 +47,7 @@ def log_inputs(
             prompt_token_ids,
             prompt_embeds.shape if prompt_embeds is not None else None,
             lora_request,
+            cache_hit_threshold,
         )

     def log_outputs(
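
For context, after this change a caller passes the threshold straight through to the logger. Below is a hedged usage sketch, not part of the commit: the argument order follows the signature shown in the diff, while the `RequestLogger` constructor argument and all sample values are assumptions for illustration.

from vllm import SamplingParams
from vllm.entrypoints.logger import RequestLogger

# Assumed construction; max_log_len only truncates what gets logged.
request_logger = RequestLogger(max_log_len=2048)

# Log a request together with its optional cache-hit threshold
# (the new argument added by this commit).
request_logger.log_inputs(
    "cmpl-123",                            # request_id (example value)
    "Hello, world",                        # prompt
    [9906, 11, 1917],                      # prompt_token_ids (example ids)
    None,                                  # prompt_embeds
    params=SamplingParams(max_tokens=16),
    lora_request=None,
    cache_hit_threshold=0.8,
)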

vllm/entrypoints/openai/serving_chat.py

Lines changed: 4 additions & 0 deletions
@@ -324,11 +324,14 @@ async def create_chat_completion(
                     self.default_sampling_params,
                 )

+                cache_hit_threshold = request.cache_hit_threshold
+
                 self._log_inputs(
                     request_id,
                     request_prompts[i],
                     params=sampling_params,
                     lora_request=lora_request,
+                    cache_hit_threshold=cache_hit_threshold,
                 )

                 trace_headers = (
@@ -352,6 +355,7 @@ async def create_chat_completion(
                     lora_request=lora_request,
                     trace_headers=trace_headers,
                     priority=request.priority,
+                    cache_hit_threshold=request.cache_hit_threshold,
                 )

                 generator = self.engine_client.generate(
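
On the client side, the handler above reads `request.cache_hit_threshold`, so the value would normally arrive as an extra field on the chat completion request body. A hedged sketch using the OpenAI Python client; the server URL, model name, and the assumption that the field is accepted by the request schema are illustrative and not confirmed by this diff.

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="my-model",  # placeholder model name
    messages=[{"role": "user", "content": "Hello"}],
    # Non-standard fields are sent via extra_body and, assuming the request
    # schema defines it, surface server-side as request.cache_hit_threshold.
    extra_body={"cache_hit_threshold": 0.8},
)
print(response.choices[0].message.content)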

vllm/entrypoints/openai/serving_completion.py

Lines changed: 4 additions & 4 deletions
@@ -182,12 +182,14 @@ async def create_completion(
                 )

                 request_id_item = f"{request_id}-{i}"
+                cache_hit_threshold = request.cache_hit_threshold

                 self._log_inputs(
                     request_id_item,
                     engine_prompt,
                     params=sampling_params,
                     lora_request=lora_request,
+                    cache_hit_threshold=cache_hit_threshold,
                 )

                 trace_headers = (
@@ -215,6 +217,7 @@ async def create_completion(
                     lora_request=lora_request,
                     trace_headers=trace_headers,
                     priority=request.priority,
+                    cache_hit_threshold=cache_hit_threshold,
                 )

                 generator = self.engine_client.generate(
@@ -224,10 +227,7 @@ async def create_completion(
                     lora_request=lora_request,
                     trace_headers=trace_headers,
                     priority=request.priority,
-                    prompt_text=prompt_text,
-                    tokenization_kwargs=tokenization_kwargs,
-                    cache_hit_threshold=request.cache_hit_threshold
-                )
+                    cache_hit_threshold=request.cache_hit_threshold)

                 generators.append(generator)
         except ValueError as e:

vllm/entrypoints/openai/serving_embedding.py

Lines changed: 7 additions & 1 deletion
@@ -205,12 +205,15 @@ async def _process_chunked_request(
                 prompt=chunk_text, prompt_token_ids=chunk_tokens
             )

+            cache_hit_threshold = getattr(ctx.request, "cache_hit_threshold",
+                                          None)
             # Log the chunk
             self._log_inputs(
                 chunk_request_id,
                 chunk_request_prompt,
                 params=pooling_params,
                 lora_request=ctx.lora_request,
+                cache_hit_threshold=cache_hit_threshold,
             )

             # Create generator for this chunk and wrap it to return indices
@@ -221,6 +224,7 @@ async def _process_chunked_request(
                 lora_request=ctx.lora_request,
                 trace_headers=trace_headers,
                 priority=getattr(ctx.request, "priority", 0),
+                cache_hit_threshold=cache_hit_threshold,
             )

             generators.append(original_generator)
@@ -320,12 +324,14 @@ async def _create_single_prompt_generator(
     ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]:
         """Create a generator for a single prompt using standard processing."""
         request_id_item = f"{ctx.request_id}-{prompt_index}"
+        cache_hit_threshold = getattr(ctx.request, "cache_hit_threshold", None)

         self._log_inputs(
             request_id_item,
             engine_prompt,
             params=pooling_params,
             lora_request=ctx.lora_request,
+            cache_hit_threshold=cache_hit_threshold,
         )

         # Return the original generator without wrapping
@@ -336,7 +342,7 @@ async def _create_single_prompt_generator(
             lora_request=ctx.lora_request,
             trace_headers=trace_headers,
             priority=getattr(ctx.request, "priority", 0),
-        )
+            cache_hit_threshold=cache_hit_threshold)

     @override
     async def _prepare_generators(
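
The embedding path fetches the threshold with `getattr(..., None)` rather than plain attribute access, presumably because not every request type routed through here declares the field (the same pattern is already used for `priority` above). A tiny standalone illustration of that defaulting behaviour; the classes below are made up for the example and are not part of the codebase.

class RequestWithThreshold:
    cache_hit_threshold = 0.8

class RequestWithoutThreshold:
    pass  # no cache_hit_threshold attribute

for req in (RequestWithThreshold(), RequestWithoutThreshold()):
    # getattr falls back to None instead of raising AttributeError
    # when the request model does not define the field.
    threshold = getattr(req, "cache_hit_threshold", None)
    print(type(req).__name__, threshold)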

vllm/entrypoints/openai/serving_engine.py

Lines changed: 21 additions & 17 deletions
@@ -420,13 +420,13 @@ async def _prepare_generators(

         for i, engine_prompt in enumerate(ctx.engine_prompts):
             request_id_item = f"{ctx.request_id}-{i}"
-
-            self._log_inputs(
-                request_id_item,
-                engine_prompt,
-                params=pooling_params,
-                lora_request=ctx.lora_request,
-            )
+            cache_hit_threshold = getattr(ctx.request,
+                                          "cache_hit_threshold", None)
+            self._log_inputs(request_id_item,
+                             engine_prompt,
+                             params=pooling_params,
+                             lora_request=ctx.lora_request,
+                             cache_hit_threshold=cache_hit_threshold)

             generator = self.engine_client.encode(
                 engine_prompt,
@@ -435,7 +435,7 @@ async def _prepare_generators(
                 lora_request=ctx.lora_request,
                 trace_headers=trace_headers,
                 priority=getattr(ctx.request, "priority", 0),
-            )
+                cache_hit_threshold=cache_hit_threshold)

             generators.append(generator)

@@ -935,6 +935,7 @@ async def _process_inputs(
         lora_request: Optional[LoRARequest],
         trace_headers: Optional[Mapping[str, str]],
         priority: int,
+        cache_hit_threshold: Optional[float] = None,
     ) -> tuple[EngineCoreRequest, dict[str, Any]]:
         """Use the Processor to process inputs for AsyncLLM."""
         tokenization_kwargs: dict[str, Any] = {}
@@ -951,7 +952,7 @@ async def _process_inputs(
             tokenization_kwargs=tokenization_kwargs,
             trace_headers=trace_headers,
             priority=priority,
-        )
+            cache_hit_threshold=cache_hit_threshold)
         return engine_request, tokenization_kwargs

     async def _generate_with_builtin_tools(
@@ -968,11 +969,13 @@ async def _generate_with_builtin_tools(
         prompt_text, _, _ = self._get_prompt_components(request_prompt)
         orig_priority = priority
         while True:
+            cache_hit_threshold = kwargs.get("cache_hit_threshold")
             self._log_inputs(
                 request_id,
                 request_prompt,
                 params=sampling_params,
                 lora_request=lora_request,
+                cache_hit_threshold=cache_hit_threshold,
             )
             trace_headers = kwargs.get("trace_headers")
             engine_request, tokenization_kwargs = await self._process_inputs(
@@ -982,6 +985,7 @@ async def _generate_with_builtin_tools(
                 lora_request=lora_request,
                 trace_headers=trace_headers,
                 priority=priority,
+                cache_hit_threshold=cache_hit_threshold,
             )

             generator = self.engine_client.generate(
@@ -1036,20 +1040,20 @@ def _log_inputs(
         inputs: Union[RequestPrompt, PromptType],
         params: Optional[Union[SamplingParams, PoolingParams, BeamSearchParams]],
         lora_request: Optional[LoRARequest],
+        cache_hit_threshold: Optional[float] = None,
     ) -> None:
         if self.request_logger is None:
             return

         prompt, prompt_token_ids, prompt_embeds = self._get_prompt_components(inputs)

-        self.request_logger.log_inputs(
-            request_id,
-            prompt,
-            prompt_token_ids,
-            prompt_embeds,
-            params=params,
-            lora_request=lora_request,
-        )
+        self.request_logger.log_inputs(request_id,
+                                       prompt,
+                                       prompt_token_ids,
+                                       prompt_embeds,
+                                       params=params,
+                                       lora_request=lora_request,
+                                       cache_hit_threshold=cache_hit_threshold)

     async def _get_trace_headers(
         self,
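
Note that on `_process_inputs` and `_log_inputs` the new parameter defaults to None, so call sites not updated in this commit keep working unchanged. A minimal standalone sketch of that pattern; the function below is illustrative only, not from the codebase.

from typing import Optional

def log_inputs_sketch(request_id: str,
                      cache_hit_threshold: Optional[float] = None) -> None:
    # Old-style callers can omit the argument; updated callers thread the
    # per-request threshold through explicitly.
    print(f"{request_id}: cache_hit_threshold={cache_hit_threshold}")

log_inputs_sketch("req-old")                           # req-old: cache_hit_threshold=None
log_inputs_sketch("req-new", cache_hit_threshold=0.5)  # req-new: cache_hit_threshold=0.5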
