1414from vllm .v1 .sample .metadata import SamplingMetadata
1515from vllm .v1 .worker .block_table import BlockTable
1616
17+ _SAMPLING_EPS = 1e-5
18+
1719if TYPE_CHECKING :
1820 from vllm .multimodal .inputs import PlaceholderRange
1921
@@ -120,6 +122,16 @@ def __init__(
120122 self .top_k_cpu = self .top_k_cpu_tensor .numpy ()
121123 self .top_k_reqs : Set [str ] = set ()
122124
125+ self .min_p = torch .empty ((max_num_reqs , ),
126+ dtype = torch .float32 ,
127+ device = device )
128+ self .min_p_cpu_tensor = torch .empty ((max_num_reqs , ),
129+ dtype = torch .float32 ,
130+ device = "cpu" ,
131+ pin_memory = pin_memory )
132+ self .min_p_cpu = self .min_p_cpu_tensor .numpy ()
133+ self .min_p_reqs : Set [str ] = set ()
134+
123135 # Frequency penalty related data structures
124136 self .frequency_penalties = torch .empty ((max_num_reqs , ),
125137 dtype = torch .float ,
@@ -223,8 +235,11 @@ def add_request(
223235 self .top_k_cpu [req_index ] = sampling_params .top_k
224236 if sampling_params .top_k > 0 :
225237 self .top_k_reqs .add (req_id )
238+ self .min_p_cpu [req_index ] = sampling_params .min_p
226239 self .frequency_penalties_cpu [
227240 req_index ] = sampling_params .frequency_penalty
241+ if sampling_params .min_p > _SAMPLING_EPS :
242+ self .min_p_reqs .add (req_id )
228243 if sampling_params .frequency_penalty != 0.0 :
229244 self .frequency_penalties_reqs .add (req_id )
230245 self .presence_penalties_cpu [
@@ -273,6 +288,7 @@ def remove_request(self, req_id: str) -> Optional[int]:
273288 self .random_reqs .discard (req_id )
274289 self .top_p_reqs .discard (req_id )
275290 self .top_k_reqs .discard (req_id )
291+ self .min_p_reqs .discard (req_id )
276292 self .frequency_penalties_reqs .discard (req_id )
277293 self .presence_penalties_reqs .discard (req_id )
278294 self .repetition_penalties_reqs .discard (req_id )
@@ -299,6 +315,7 @@ def clear(self) -> None:
299315 self .random_reqs .clear ()
300316 self .top_p_reqs .clear ()
301317 self .top_k_reqs .clear ()
318+ self .min_p_reqs .clear ()
302319 self .frequency_penalties_reqs .clear ()
303320 self .presence_penalties_reqs .clear ()
304321 self .repetition_penalties_reqs .clear ()
@@ -354,6 +371,7 @@ def condense(self, empty_req_indices: List[int]) -> None:
354371 empty_index ] = self .presence_penalties_cpu [last_req_index ]
355372 self .repetition_penalties_cpu [
356373 empty_index ] = self .repetition_penalties_cpu [last_req_index ]
374+ self .min_p_cpu [empty_index ] = self .min_p_cpu [last_req_index ]
357375 self .min_tokens [empty_index ] = self .min_tokens [last_req_index ]
358376 self .stop_token_ids [empty_index ] = self .stop_token_ids [
359377 last_req_index ]
@@ -381,6 +399,8 @@ def make_sampling_metadata(
381399 self .top_p_cpu_tensor [:self .num_reqs ], non_blocking = True )
382400 self .top_k [:self .num_reqs ].copy_ (
383401 self .top_k_cpu_tensor [:self .num_reqs ], non_blocking = True )
402+ self .min_p [:self .num_reqs ].copy_ (
403+ self .min_p_cpu_tensor [:self .num_reqs ], non_blocking = True )
384404 if not self .no_penalties :
385405 # Since syncing these tensors is expensive only copy them
386406 # if necessary i.e. if there are requests which require
@@ -421,6 +441,8 @@ def make_sampling_metadata(
421441 all_random = self .all_random ,
422442 top_p = self .top_p [:self .num_reqs ],
423443 top_k = self .top_k [:self .num_reqs ],
444+ min_p = self .min_p [:self .num_reqs ],
445+ no_min_p = self .no_min_p ,
424446 no_top_p = self .no_top_p ,
425447 no_top_k = self .no_top_k ,
426448 generators = self .generators ,
@@ -497,6 +519,10 @@ def no_top_p(self) -> bool:
497519 def no_top_k (self ) -> bool :
498520 return len (self .top_k_reqs ) == 0
499521
@property
def no_min_p(self) -> bool:
    """Return True when no in-flight request uses min-p sampling.

    Membership in ``min_p_reqs`` is maintained by ``add_request`` /
    ``remove_request``; an empty set means the min-p filter can be
    skipped entirely when building sampling metadata.
    """
    return not self.min_p_reqs
525+
500526 @property
501527 def no_penalties (self ) -> bool :
502528 return (len (self .presence_penalties_reqs ) == 0
0 commit comments