@@ -420,13 +420,13 @@ async def _prepare_generators(

         for i, engine_prompt in enumerate(ctx.engine_prompts):
             request_id_item = f"{ctx.request_id}-{i}"
-
-            self._log_inputs(
-                request_id_item,
-                engine_prompt,
-                params=pooling_params,
-                lora_request=ctx.lora_request,
-            )
+            cache_hit_threshold = getattr(ctx.request,
+                                          "cache_hit_threshold", None)
+            self._log_inputs(request_id_item,
+                             engine_prompt,
+                             params=pooling_params,
+                             lora_request=ctx.lora_request,
+                             cache_hit_threshold=cache_hit_threshold)

             generator = self.engine_client.encode(
                 engine_prompt,
@@ -435,7 +435,7 @@ async def _prepare_generators(
                 lora_request=ctx.lora_request,
                 trace_headers=trace_headers,
                 priority=getattr(ctx.request, "priority", 0),
-            )
+                cache_hit_threshold=cache_hit_threshold)

             generators.append(generator)

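The pattern worth noting in this hunk is the `getattr(..., None)` lookup: the new field is read defensively, so request types that never gained a `cache_hit_threshold` attribute keep working. Below is a minimal, standalone sketch of that fallback; `OldRequest` and `NewRequest` are hypothetical stand-ins, not vLLM's own request types.

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class OldRequest:
    # A request type that predates the new field.
    priority: int = 0


@dataclass
class NewRequest:
    priority: int = 0
    cache_hit_threshold: Optional[float] = None


# getattr with a default degrades to None instead of raising
# AttributeError, so old request objects pass through unchanged.
print(getattr(OldRequest(), "cache_hit_threshold", None))    # -> None
print(getattr(NewRequest(cache_hit_threshold=0.8),
              "cache_hit_threshold", None))                  # -> 0.8
```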
@@ -935,6 +935,7 @@ async def _process_inputs(
         lora_request: Optional[LoRARequest],
         trace_headers: Optional[Mapping[str, str]],
         priority: int,
+        cache_hit_threshold: Optional[float] = None,
     ) -> tuple[EngineCoreRequest, dict[str, Any]]:
         """Use the Processor to process inputs for AsyncLLM."""
         tokenization_kwargs: dict[str, Any] = {}
@@ -951,7 +952,7 @@ async def _process_inputs(
             tokenization_kwargs=tokenization_kwargs,
             trace_headers=trace_headers,
             priority=priority,
-        )
+            cache_hit_threshold=cache_hit_threshold)
         return engine_request, tokenization_kwargs

     async def _generate_with_builtin_tools(
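Because `cache_hit_threshold` defaults to `None`, the signature change to `_process_inputs` is backward compatible: every existing call site keeps working unmodified. A toy sketch of the pass-through, using an illustrative function rather than vLLM's real Processor API:

```python
from typing import Any, Optional


def process_inputs(prompt: str,
                   priority: int = 0,
                   cache_hit_threshold: Optional[float] = None) -> dict[str, Any]:
    # The threshold is forwarded as-is; None means "not configured",
    # so callers written before the parameter existed are unaffected.
    return {"prompt": prompt,
            "priority": priority,
            "cache_hit_threshold": cache_hit_threshold}


print(process_inputs("hello"))                            # threshold is None
print(process_inputs("hello", cache_hit_threshold=0.5))   # threshold is 0.5
```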
@@ -968,11 +969,13 @@ async def _generate_with_builtin_tools(
         prompt_text, _, _ = self._get_prompt_components(request_prompt)
         orig_priority = priority
         while True:
+            cache_hit_threshold = kwargs.get("cache_hit_threshold")
             self._log_inputs(
                 request_id,
                 request_prompt,
                 params=sampling_params,
                 lora_request=lora_request,
+                cache_hit_threshold=cache_hit_threshold,
             )
             trace_headers = kwargs.get("trace_headers")
             engine_request, tokenization_kwargs = await self._process_inputs(
@@ -982,6 +985,7 @@ async def _generate_with_builtin_tools(
                 lora_request=lora_request,
                 trace_headers=trace_headers,
                 priority=priority,
+                cache_hit_threshold=cache_hit_threshold,
             )

             generator = self.engine_client.generate(
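In `_generate_with_builtin_tools` the threshold arrives through `**kwargs` instead of a request object, so it is fetched once per loop iteration with `kwargs.get`, which yields `None` when the caller never passed it. A small sketch of that lookup; the function name is hypothetical:

```python
from typing import Any


def log_and_generate(request_id: str, **kwargs: Any) -> None:
    # dict.get returns None for a missing key, mirroring the
    # getattr(..., None) fallback used for request objects above.
    cache_hit_threshold = kwargs.get("cache_hit_threshold")
    print(f"{request_id}: cache_hit_threshold={cache_hit_threshold}")


log_and_generate("req-0")                            # None when absent
log_and_generate("req-1", cache_hit_threshold=0.9)   # 0.9 when supplied
```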
@@ -1036,20 +1040,20 @@ def _log_inputs(
         inputs: Union[RequestPrompt, PromptType],
         params: Optional[Union[SamplingParams, PoolingParams, BeamSearchParams]],
         lora_request: Optional[LoRARequest],
+        cache_hit_threshold: Optional[float] = None,
     ) -> None:
         if self.request_logger is None:
             return

         prompt, prompt_token_ids, prompt_embeds = self._get_prompt_components(inputs)

-        self.request_logger.log_inputs(
-            request_id,
-            prompt,
-            prompt_token_ids,
-            prompt_embeds,
-            params=params,
-            lora_request=lora_request,
-        )
+        self.request_logger.log_inputs(request_id,
+                                       prompt,
+                                       prompt_token_ids,
+                                       prompt_embeds,
+                                       params=params,
+                                       lora_request=lora_request,
+                                       cache_hit_threshold=cache_hit_threshold)

     async def _get_trace_headers(
         self,
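Finally, `_log_inputs` forwards the value to `self.request_logger.log_inputs`, which therefore has to accept the extra keyword as well. A hedged sketch of a logger with the matching signature; this `RequestLogger` is a stand-in for the real one, and the log format is illustrative:

```python
import logging
from typing import Optional

logging.basicConfig(level=logging.INFO)


class RequestLogger:
    """Stand-in for the real request logger; only the shape matters."""

    def log_inputs(self,
                   request_id: str,
                   prompt: Optional[str],
                   cache_hit_threshold: Optional[float] = None) -> None:
        # Mention the threshold only when a caller actually supplied one.
        suffix = ("" if cache_hit_threshold is None
                  else f", cache_hit_threshold={cache_hit_threshold}")
        logging.info("Received request %s: prompt=%r%s",
                     request_id, prompt, suffix)


RequestLogger().log_inputs("req-0", "Hello!", cache_hit_threshold=0.8)
```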