From 4897f02c42cdc6a6eba6ae6d6d8fca99d6ed35d5 Mon Sep 17 00:00:00 2001 From: Lu Fang Date: Wed, 12 Feb 2025 23:28:15 -0800 Subject: [PATCH 1/4] Introduce allowed_token_ids support in the v1 Sampler Signed-off-by: Lu Fang Complete the main logic for allowed_token_ids in v1 sampler Signed-off-by: Lu Fang lint the code Signed-off-by: Lu Fang Add boundary checks for allowed_token_ids Signed-off-by: Lu Fang address the comments Signed-off-by: Lu Fang --- tests/v1/sample/test_sampler.py | 99 +++++++++++++++++++++---- tests/v1/worker/test_gpu_input_batch.py | 18 +++++ vllm/v1/sample/metadata.py | 8 ++ vllm/v1/sample/sampler.py | 25 ++++++- vllm/v1/worker/gpu_input_batch.py | 52 ++++++++++++- 5 files changed, 181 insertions(+), 21 deletions(-) diff --git a/tests/v1/sample/test_sampler.py b/tests/v1/sample/test_sampler.py index 3f6301c54267..864ac12397fc 100644 --- a/tests/v1/sample/test_sampler.py +++ b/tests/v1/sample/test_sampler.py @@ -57,6 +57,28 @@ def _create_logit_bias( return res +def _create_allowed_token_ids( + batch_size: int, + vocab_size: int, + num_allowed_token_ids: int, + device: torch.device, +) -> Tuple[bool, Optional[torch.Tensor]]: + mask: Optional[torch.Tensor] = None + no_allowed_token_ids = True + for i in range(batch_size): + if i % 2 == 1: + continue + if mask is None: + mask = torch.zeros((batch_size, vocab_size), + dtype=torch.bool, + device=device) + start = min(i, vocab_size - 1) + end = min(i + num_allowed_token_ids, vocab_size - 1) + mask[i, start:end] = True + no_allowed_token_ids = False + return (no_allowed_token_ids, mask) + + def _create_default_sampling_metadata( num_output_tokens: int, batch_size: int, @@ -92,6 +114,8 @@ def _create_default_sampling_metadata( no_penalties=True, min_tokens={}, logit_bias=[None] * batch_size, + no_allowed_token_ids=True, + allowed_token_ids_mask=None, ) return fake_sampling_metadata @@ -253,7 +277,10 @@ def test_sampler_frequency_penalty(device: str, batch_size: int, sampling_metadata.frequency_penalties = _create_penalty_tensor( batch_size, frequency_penalty, torch.device(device)) output_token_ids, sorted_token_ids_in_output = \ - _create_weighted_output_token_list(batch_size, VOCAB_SIZE) + _create_weighted_output_token_list( + batch_size, + VOCAB_SIZE, + ) sampling_metadata.output_token_ids = output_token_ids sampling_metadata.no_penalties = False sampler = Sampler() @@ -262,8 +289,8 @@ def test_sampler_frequency_penalty(device: str, batch_size: int, for batch_idx in range(batch_size): non_penalized_token_id = logits[batch_idx].argmax().item() penalized_token_id = logits[batch_idx].argmin().item() - distinct_sorted_token_ids_in_output = \ - sorted_token_ids_in_output[batch_idx] + distinct_sorted_token_ids_in_output = sorted_token_ids_in_output[ + batch_idx] most_frequent_token_id = distinct_sorted_token_ids_in_output[ len(distinct_sorted_token_ids_in_output) - 1] if frequency_penalty > 0: @@ -272,8 +299,8 @@ def test_sampler_frequency_penalty(device: str, batch_size: int, # non-penalized token ID is not present in the output, while the # most penalized token is the one that occurs most frequently in # the output. 
- assert non_penalized_token_id \ - not in distinct_sorted_token_ids_in_output + assert (non_penalized_token_id + not in distinct_sorted_token_ids_in_output) assert penalized_token_id == most_frequent_token_id elif frequency_penalty < 0: # If `frequency_penalty` is set to < 0, it indicates @@ -282,8 +309,7 @@ def test_sampler_frequency_penalty(device: str, batch_size: int, # in the output, while the penalized token ID is one that has not # yet appeared. assert non_penalized_token_id == most_frequent_token_id - assert penalized_token_id \ - not in distinct_sorted_token_ids_in_output + assert penalized_token_id not in distinct_sorted_token_ids_in_output @pytest.mark.parametrize("device", CUDA_DEVICES) @@ -318,18 +344,18 @@ def test_sampler_repetition_penalty(device: str, batch_size: int, # If `repetition_penalty` > 1.0, verify that the non-penalized # token ID has not been seen before, while the penalized token ID # exists either in the prompt or the output. - assert (non_penalized_token_id not in prompt_tokens and \ - non_penalized_token_id not in output_tokens) - assert (penalized_token_id in prompt_tokens or \ - penalized_token_id in output_tokens) + assert (non_penalized_token_id not in prompt_tokens + and non_penalized_token_id not in output_tokens) + assert (penalized_token_id in prompt_tokens + or penalized_token_id in output_tokens) elif repetition_penalty < 1.0: # If `repetition_penalty` < 1.0, verify that the penalized # token ID has not been seen before, while the non-penalized # token ID exists either in the prompt or the output. - assert (penalized_token_id not in prompt_tokens and \ - penalized_token_id not in output_tokens) - assert (non_penalized_token_id in prompt_tokens or \ - non_penalized_token_id in output_tokens) + assert (penalized_token_id not in prompt_tokens + and penalized_token_id not in output_tokens) + assert (non_penalized_token_id in prompt_tokens + or non_penalized_token_id in output_tokens) @pytest.mark.parametrize("device", CUDA_DEVICES) @@ -404,3 +430,46 @@ def test_sampler_logit_bias(device: str, batch_size: int, bias_value: float): 1e-2) else: assert logits_for_req[token_id] == pytest.approx(1e-2) + + +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("batch_size", [1, 2, 32]) +@pytest.mark.parametrize("num_allowed_token_ids", [0, 1, 2]) +def test_sampler_allowed_token_ids(device: str, batch_size: int, + num_allowed_token_ids: int): + """ + Test to verify that when the repetition penalty is enabled, tokens + are penalized based on their presence in the prompt or the existing + output. + """ + torch.set_default_device(device) + # Create fake logits where each token is assigned the same + # logit value. 
+ fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE) + sampling_metadata = _create_default_sampling_metadata( + NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)) + no_allowed_token_ids, mask = _create_allowed_token_ids( + batch_size=batch_size, + vocab_size=VOCAB_SIZE, + num_allowed_token_ids=num_allowed_token_ids, + device=device, + ) + sampling_metadata.no_allowed_token_ids = no_allowed_token_ids + sampling_metadata.allowed_token_ids_mask = mask + sampler = Sampler() + logits = sampler.apply_allowed_token_ids(fake_logits, sampling_metadata) + logits = logits.cpu() + assert not sampling_metadata.no_allowed_token_ids + for batch_idx in range(batch_size): + logits_for_req = logits[batch_idx] + if batch_idx % 2 == 1: + assert torch.all(logits_for_req != -float("inf")) + continue + for token_id in range(VOCAB_SIZE): + start = min(batch_idx, VOCAB_SIZE - 1) + end = min(batch_idx + num_allowed_token_ids, VOCAB_SIZE - 1) + if token_id >= start and token_id < end: + assert logits_for_req[token_id] == -float( + "inf"), f"{batch_idx}, {token_id}" + else: + assert logits_for_req[token_id] != -float("inf") diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py index cb3b3d21fbb3..692dde27b401 100644 --- a/tests/v1/worker/test_gpu_input_batch.py +++ b/tests/v1/worker/test_gpu_input_batch.py @@ -66,6 +66,11 @@ def _construct_expected_sampling_metadata( temperature = [0.0 for _ in range(num_reqs)] min_tokens = {} logit_bias = [None] * num_reqs + has_allowed_token_ids = [False] * num_reqs + allowed_token_ids_mask = torch.zeros(num_reqs, + VOCAB_SIZE, + dtype=torch.bool, + device=device) for req in reqs: if req.req_id not in req_ids_retained: continue @@ -86,6 +91,11 @@ def _construct_expected_sampling_metadata( req.sampling_params.min_tokens, req.sampling_params.all_stop_token_ids) logit_bias[index_in_input_batch] = req.sampling_params.logit_bias + if req.sampling_params.allowed_token_ids: + has_allowed_token_ids[index_in_input_batch] = True + allowed_token_ids_mask[index_in_input_batch][ + req.sampling_params.allowed_token_ids] = True + return SamplingMetadata( temperature=torch.tensor(temperature, dtype=torch.float, device=device), @@ -121,6 +131,8 @@ def _construct_expected_sampling_metadata( and all(x == 0 for x in frequency_penalties) and all(x == 1 for x in repetition_penalties)), logit_bias=logit_bias, + no_allowed_token_ids=not any(has_allowed_token_ids), + allowed_token_ids_mask=allowed_token_ids_mask, ) @@ -242,3 +254,9 @@ def same(t1: Optional[torch.Tensor], t2: Optional[torch.Tensor]) -> bool: assert expected_sampling_metadata.no_penalties == \ sampling_metadata.no_penalties assert expected_sampling_metadata.logit_bias == sampling_metadata.logit_bias + assert (expected_sampling_metadata.no_allowed_token_ids == + sampling_metadata.no_allowed_token_ids) + if not sampling_metadata.no_allowed_token_ids: + assert torch.allclose( + expected_sampling_metadata.allowed_token_ids_mask, + sampling_metadata.allowed_token_ids_mask) diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index 6d82d3a79c8e..d5421c7d30e6 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -37,3 +37,11 @@ class SamplingMetadata: min_tokens: Dict[int, Tuple[int, Set[int]]] logit_bias: List[Optional[Dict[int, float]]] + + # These two parameters are for allowed_token_ids. + # `no_allowed_token_ids`` is a bool to indicate whether we have + # allowed_token_ids. 
+ # `allowed_token_ids_mask` is a 2D bool tensor of shape (max batch size, + # vocab size). + no_allowed_token_ids: bool + allowed_token_ids_mask: Optional[torch.Tensor] diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index ff978b3b6c41..124a04bac42d 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -47,6 +47,8 @@ def forward( # Use float32 for the logits. logits = logits.to(torch.float32) + # Apply allowed token ids. + logits = self.apply_allowed_token_ids(logits, sampling_metadata) # Apply logits bias. logits = self.apply_logits_bias(logits, sampling_metadata) # Apply penalties (e.g., min_tokens, freq_penalties). @@ -56,8 +58,8 @@ def forward( # Gather the logprobs of the topk and sampled token (if requested). # Get logprobs and rank tensors (if requested) - logprobs_tensors = None if num_logprobs is None else \ - self.gather_logprobs(raw_logprobs, num_logprobs, token_ids=sampled) + logprobs_tensors = (None if num_logprobs is None else \ + self.gather_logprobs(raw_logprobs, num_logprobs, token_ids=sampled)) # Use int32 to reduce the tensor size. sampled = sampled.to(torch.int32) @@ -181,14 +183,17 @@ def apply_penalties( apply_min_token_penalties(logits, sampling_metadata.output_token_ids, sampling_metadata.min_tokens) + if not sampling_metadata.no_penalties: assert sampling_metadata.prompt_token_ids is not None logits = apply_all_penalties( - logits, sampling_metadata.prompt_token_ids, + logits, + sampling_metadata.prompt_token_ids, sampling_metadata.presence_penalties, sampling_metadata.frequency_penalties, sampling_metadata.repetition_penalties, - sampling_metadata.output_token_ids) + sampling_metadata.output_token_ids, + ) return logits def apply_min_p( @@ -226,3 +231,15 @@ def apply_logits_bias( for token_id, bias in logit_bias.items(): logits[i, token_id] += bias return logits + + def apply_allowed_token_ids( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> torch.Tensor: + # One idea is implement this as a PyTorch C++ op, and we may + # even optimize the logit_bias layout. 
+ if not sampling_metadata.no_allowed_token_ids: + logits.masked_fill_(sampling_metadata.allowed_token_ids_mask, + float("-inf")) + return logits diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index bd1c369acb30..dc9561337792 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -143,7 +143,7 @@ def __init__( device="cpu", pin_memory=pin_memory) self.frequency_penalties_cpu = \ - self.frequency_penalties_cpu_tensor.numpy() + self.frequency_penalties_cpu_tensor.numpy() self.frequency_penalties_reqs: Set[str] = set() # Presence penalty related data structures @@ -168,7 +168,7 @@ def __init__( device="cpu", pin_memory=pin_memory) self.repetition_penalties_cpu = \ - self.repetition_penalties_cpu_tensor.numpy() + self.repetition_penalties_cpu_tensor.numpy() self.repetition_penalties_reqs: Set[str] = set() # req_index -> (min_tokens, stop_token_ids) @@ -192,6 +192,9 @@ def __init__( self.logit_bias: List[Optional[Dict[int, float]]] = [None] * max_num_reqs + self.has_allowed_token_ids: List[bool] = [False] * max_num_reqs + self.allowed_token_ids_mask: Optional[torch.Tensor] = None + self.allowed_token_ids_mask_cpu_tensor: Optional[torch.Tensor] = None self.req_output_token_ids: List[Optional[List[int]]] = [] @@ -287,6 +290,28 @@ def add_request( if sampling_params.logit_bias is not None: self.logit_bias[req_index] = sampling_params.logit_bias + if sampling_params.allowed_token_ids: + # NOTE(houseroad): put the check here since no vocab_size info + # available in vllm/sampling_params.py + if not all(0 <= tid < self.vocab_size + for tid in sampling_params.allowed_token_ids): + raise ValueError( + "allowed_token_ids contains out-of-vocab token id") + self.has_allowed_token_ids[req_index] = True + if self.allowed_token_ids_mask_cpu_tensor is None: + # Lazy allocation for this tensor, which can be large. + self.allowed_token_ids_mask = torch.zeros(self.max_num_reqs, + self.vocab_size, + dtype=torch.bool, + device=self.device) + self.allowed_token_ids_mask_cpu_tensor = torch.zeros( + self.max_num_reqs, + self.vocab_size, + dtype=torch.bool, + device="cpu") + self.allowed_token_ids_mask_cpu_tensor[req_index][ + sampling_params.allowed_token_ids] = True + # Add request lora ID if request.lora_request: lora_id = request.lora_request.lora_int_id @@ -332,6 +357,9 @@ def remove_request(self, req_id: str) -> Optional[int]: self.request_lora_mapping[req_index] = 0 self.logit_bias[req_index] = None + self.has_allowed_token_ids[req_index] = False + if self.allowed_token_ids_mask_cpu_tensor is not None: + self.allowed_token_ids_mask_cpu_tensor[req_index].fill_(False) return req_index def condense(self, empty_req_indices: List[int]) -> None: @@ -400,6 +428,13 @@ def condense(self, empty_req_indices: List[int]) -> None: self.logit_bias[empty_index] = self.logit_bias[last_req_index] + self.has_allowed_token_ids[ + empty_index] = self.has_allowed_token_ids[last_req_index] + if self.allowed_token_ids_mask_cpu_tensor is not None: + self.allowed_token_ids_mask_cpu_tensor[ + empty_index] = self.allowed_token_ids_mask_cpu_tensor[ + last_req_index] + # Decrement last_req_index since it is now empty. 
last_req_index -= 1 @@ -442,6 +477,13 @@ def _make_sampling_metadata(self) -> SamplingMetadata: else: prompt_token_ids = None + allowed_token_ids_mask: Optional[torch.Tensor] = None + if not self.no_allowed_token_ids and \ + self.allowed_token_ids_mask is not None: + copy_slice(self.allowed_token_ids_mask_cpu_tensor, + self.allowed_token_ids_mask, num_reqs) + allowed_token_ids_mask = self.allowed_token_ids_mask[:num_reqs] + return SamplingMetadata( temperature=temperature, all_greedy=self.all_greedy, @@ -460,6 +502,8 @@ def _make_sampling_metadata(self) -> SamplingMetadata: min_tokens=self.min_tokens, no_penalties=self.no_penalties, logit_bias=self.logit_bias[:num_reqs], + no_allowed_token_ids=self.no_allowed_token_ids, + allowed_token_ids_mask=allowed_token_ids_mask, ) def get_sampling_metadata( @@ -550,3 +594,7 @@ def max_num_logprobs(self) -> Optional[int]: @property def no_prompt_logprob(self) -> bool: return not self.num_prompt_logprobs + + @property + def no_allowed_token_ids(self) -> bool: + return not any(self.has_allowed_token_ids) From 368369c5e0f72b25bcc13d54667e43900a78b360 Mon Sep 17 00:00:00 2001 From: Lu Fang Date: Thu, 20 Feb 2025 13:54:02 -0800 Subject: [PATCH 2/4] address comments Signed-off-by: Lu Fang --- tests/v1/sample/test_sampler.py | 11 +++-------- tests/v1/worker/test_gpu_input_batch.py | 7 +------ vllm/v1/sample/metadata.py | 4 ---- vllm/v1/sample/sampler.py | 4 +--- vllm/v1/worker/gpu_input_batch.py | 15 ++++++--------- 5 files changed, 11 insertions(+), 30 deletions(-) diff --git a/tests/v1/sample/test_sampler.py b/tests/v1/sample/test_sampler.py index 864ac12397fc..34fba5a9f6d7 100644 --- a/tests/v1/sample/test_sampler.py +++ b/tests/v1/sample/test_sampler.py @@ -62,9 +62,8 @@ def _create_allowed_token_ids( vocab_size: int, num_allowed_token_ids: int, device: torch.device, -) -> Tuple[bool, Optional[torch.Tensor]]: +) -> Optional[torch.Tensor]: mask: Optional[torch.Tensor] = None - no_allowed_token_ids = True for i in range(batch_size): if i % 2 == 1: continue @@ -75,8 +74,7 @@ def _create_allowed_token_ids( start = min(i, vocab_size - 1) end = min(i + num_allowed_token_ids, vocab_size - 1) mask[i, start:end] = True - no_allowed_token_ids = False - return (no_allowed_token_ids, mask) + return mask def _create_default_sampling_metadata( @@ -114,7 +112,6 @@ def _create_default_sampling_metadata( no_penalties=True, min_tokens={}, logit_bias=[None] * batch_size, - no_allowed_token_ids=True, allowed_token_ids_mask=None, ) return fake_sampling_metadata @@ -448,18 +445,16 @@ def test_sampler_allowed_token_ids(device: str, batch_size: int, fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE) sampling_metadata = _create_default_sampling_metadata( NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)) - no_allowed_token_ids, mask = _create_allowed_token_ids( + mask = _create_allowed_token_ids( batch_size=batch_size, vocab_size=VOCAB_SIZE, num_allowed_token_ids=num_allowed_token_ids, device=device, ) - sampling_metadata.no_allowed_token_ids = no_allowed_token_ids sampling_metadata.allowed_token_ids_mask = mask sampler = Sampler() logits = sampler.apply_allowed_token_ids(fake_logits, sampling_metadata) logits = logits.cpu() - assert not sampling_metadata.no_allowed_token_ids for batch_idx in range(batch_size): logits_for_req = logits[batch_idx] if batch_idx % 2 == 1: diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py index 692dde27b401..0aee266264ac 100644 --- a/tests/v1/worker/test_gpu_input_batch.py +++ 
b/tests/v1/worker/test_gpu_input_batch.py @@ -66,7 +66,6 @@ def _construct_expected_sampling_metadata( temperature = [0.0 for _ in range(num_reqs)] min_tokens = {} logit_bias = [None] * num_reqs - has_allowed_token_ids = [False] * num_reqs allowed_token_ids_mask = torch.zeros(num_reqs, VOCAB_SIZE, dtype=torch.bool, @@ -92,7 +91,6 @@ def _construct_expected_sampling_metadata( req.sampling_params.all_stop_token_ids) logit_bias[index_in_input_batch] = req.sampling_params.logit_bias if req.sampling_params.allowed_token_ids: - has_allowed_token_ids[index_in_input_batch] = True allowed_token_ids_mask[index_in_input_batch][ req.sampling_params.allowed_token_ids] = True @@ -131,7 +129,6 @@ def _construct_expected_sampling_metadata( and all(x == 0 for x in frequency_penalties) and all(x == 1 for x in repetition_penalties)), logit_bias=logit_bias, - no_allowed_token_ids=not any(has_allowed_token_ids), allowed_token_ids_mask=allowed_token_ids_mask, ) @@ -254,9 +251,7 @@ def same(t1: Optional[torch.Tensor], t2: Optional[torch.Tensor]) -> bool: assert expected_sampling_metadata.no_penalties == \ sampling_metadata.no_penalties assert expected_sampling_metadata.logit_bias == sampling_metadata.logit_bias - assert (expected_sampling_metadata.no_allowed_token_ids == - sampling_metadata.no_allowed_token_ids) - if not sampling_metadata.no_allowed_token_ids: + if sampling_metadata.allowed_token_ids_mask: assert torch.allclose( expected_sampling_metadata.allowed_token_ids_mask, sampling_metadata.allowed_token_ids_mask) diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index d5421c7d30e6..9f7770bbd078 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -38,10 +38,6 @@ class SamplingMetadata: logit_bias: List[Optional[Dict[int, float]]] - # These two parameters are for allowed_token_ids. - # `no_allowed_token_ids`` is a bool to indicate whether we have - # allowed_token_ids. # `allowed_token_ids_mask` is a 2D bool tensor of shape (max batch size, # vocab size). - no_allowed_token_ids: bool allowed_token_ids_mask: Optional[torch.Tensor] diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 124a04bac42d..0e4cbada915b 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -237,9 +237,7 @@ def apply_allowed_token_ids( logits: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> torch.Tensor: - # One idea is implement this as a PyTorch C++ op, and we may - # even optimize the logit_bias layout. 
- if not sampling_metadata.no_allowed_token_ids: + if sampling_metadata.allowed_token_ids_mask is not None: logits.masked_fill_(sampling_metadata.allowed_token_ids_mask, float("-inf")) return logits diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index dc9561337792..fab11a6035d7 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -192,7 +192,7 @@ def __init__( self.logit_bias: List[Optional[Dict[int, float]]] = [None] * max_num_reqs - self.has_allowed_token_ids: List[bool] = [False] * max_num_reqs + self.has_allowed_token_ids: Set[str] = set() self.allowed_token_ids_mask: Optional[torch.Tensor] = None self.allowed_token_ids_mask_cpu_tensor: Optional[torch.Tensor] = None @@ -297,7 +297,7 @@ def add_request( for tid in sampling_params.allowed_token_ids): raise ValueError( "allowed_token_ids contains out-of-vocab token id") - self.has_allowed_token_ids[req_index] = True + self.has_allowed_token_ids.add(req_id) if self.allowed_token_ids_mask_cpu_tensor is None: # Lazy allocation for this tensor, which can be large. self.allowed_token_ids_mask = torch.zeros(self.max_num_reqs, @@ -357,7 +357,7 @@ def remove_request(self, req_id: str) -> Optional[int]: self.request_lora_mapping[req_index] = 0 self.logit_bias[req_index] = None - self.has_allowed_token_ids[req_index] = False + self.has_allowed_token_ids.discard(req_id) if self.allowed_token_ids_mask_cpu_tensor is not None: self.allowed_token_ids_mask_cpu_tensor[req_index].fill_(False) return req_index @@ -428,8 +428,6 @@ def condense(self, empty_req_indices: List[int]) -> None: self.logit_bias[empty_index] = self.logit_bias[last_req_index] - self.has_allowed_token_ids[ - empty_index] = self.has_allowed_token_ids[last_req_index] if self.allowed_token_ids_mask_cpu_tensor is not None: self.allowed_token_ids_mask_cpu_tensor[ empty_index] = self.allowed_token_ids_mask_cpu_tensor[ @@ -478,8 +476,8 @@ def _make_sampling_metadata(self) -> SamplingMetadata: prompt_token_ids = None allowed_token_ids_mask: Optional[torch.Tensor] = None - if not self.no_allowed_token_ids and \ - self.allowed_token_ids_mask is not None: + if not self.no_allowed_token_ids: + assert self.allowed_token_ids_mask is not None copy_slice(self.allowed_token_ids_mask_cpu_tensor, self.allowed_token_ids_mask, num_reqs) allowed_token_ids_mask = self.allowed_token_ids_mask[:num_reqs] @@ -502,7 +500,6 @@ def _make_sampling_metadata(self) -> SamplingMetadata: min_tokens=self.min_tokens, no_penalties=self.no_penalties, logit_bias=self.logit_bias[:num_reqs], - no_allowed_token_ids=self.no_allowed_token_ids, allowed_token_ids_mask=allowed_token_ids_mask, ) @@ -597,4 +594,4 @@ def no_prompt_logprob(self) -> bool: @property def no_allowed_token_ids(self) -> bool: - return not any(self.has_allowed_token_ids) + return len(self.has_allowed_token_ids) == 0 From 60b439370fa556a6ed6c9fb33dbb54680f5a5acd Mon Sep 17 00:00:00 2001 From: Lu Fang Date: Thu, 20 Feb 2025 19:30:38 -0800 Subject: [PATCH 3/4] move the check in the Processor Signed-off-by: Lu Fang --- vllm/v1/engine/processor.py | 14 ++++++++++++++ vllm/v1/sample/sampler.py | 5 ++--- vllm/v1/worker/gpu_input_batch.py | 6 ------ 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index b7eee5a39972..2547cebaede7 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -83,6 +83,19 @@ def _validate_lora(self, lora_request: Optional[LoRARequest]) -> None: raise ValueError(f"Got lora_request 
{lora_request} but LoRA is " "not enabled!") + def _validate_allowed_token_ids( + self, + params: Union[SamplingParams, PoolingParams], + ) -> None: + if not isinstance(params, SamplingParams): + return + if params.allowed_token_ids is None: + return + if not all(0 <= tid < self.model_config.vocab_size + for tid in params.allowed_token_ids): + raise ValueError( + "allowed_token_ids contains out-of-vocab token id") + def process_inputs( self, request_id: str, @@ -100,6 +113,7 @@ def process_inputs( self._validate_logprobs(params) self._validate_lora(lora_request) + self._validate_allowed_token_ids(params) if arrival_time is None: arrival_time = time.time() diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 0e4cbada915b..47ec26d42024 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -58,8 +58,8 @@ def forward( # Gather the logprobs of the topk and sampled token (if requested). # Get logprobs and rank tensors (if requested) - logprobs_tensors = (None if num_logprobs is None else \ - self.gather_logprobs(raw_logprobs, num_logprobs, token_ids=sampled)) + logprobs_tensors = None if num_logprobs is None else \ + self.gather_logprobs(raw_logprobs, num_logprobs, token_ids=sampled) # Use int32 to reduce the tensor size. sampled = sampled.to(torch.int32) @@ -183,7 +183,6 @@ def apply_penalties( apply_min_token_penalties(logits, sampling_metadata.output_token_ids, sampling_metadata.min_tokens) - if not sampling_metadata.no_penalties: assert sampling_metadata.prompt_token_ids is not None logits = apply_all_penalties( diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index fab11a6035d7..d9fc53490c07 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -291,12 +291,6 @@ def add_request( self.logit_bias[req_index] = sampling_params.logit_bias if sampling_params.allowed_token_ids: - # NOTE(houseroad): put the check here since no vocab_size info - # available in vllm/sampling_params.py - if not all(0 <= tid < self.vocab_size - for tid in sampling_params.allowed_token_ids): - raise ValueError( - "allowed_token_ids contains out-of-vocab token id") self.has_allowed_token_ids.add(req_id) if self.allowed_token_ids_mask_cpu_tensor is None: # Lazy allocation for this tensor, which can be large. From beaba718f45afbc452f97082623aab8e98869e44 Mon Sep 17 00:00:00 2001 From: Lu Fang Date: Fri, 21 Feb 2025 14:00:52 -0800 Subject: [PATCH 4/4] fix more tests Signed-off-by: Lu Fang --- tests/v1/sample/test_rejection_sampler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/v1/sample/test_rejection_sampler.py b/tests/v1/sample/test_rejection_sampler.py index 3e810e525e1c..956d91c6daf7 100644 --- a/tests/v1/sample/test_rejection_sampler.py +++ b/tests/v1/sample/test_rejection_sampler.py @@ -43,6 +43,7 @@ def create_sampling_metadata(spec_tokens: List[List[int]]) -> SamplingMetadata: output_token_ids=[], min_tokens={}, logit_bias=[None] * batch_size, + allowed_token_ids_mask=None, )
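
Illustrative note, not part of the patch series above: end to end, a request supplies SamplingParams(allowed_token_ids=[...]), patch 3 validates the ids in the Processor before they reach the GPU input batch, the input batch builds a per-request boolean mask, and the sampler applies that mask to the logits. The short sketch below mirrors only the request-time check added in patch 3 (_validate_allowed_token_ids); the standalone function name and the example vocab_size are assumptions for illustration, while the predicate and the error message are taken from the diff.

from typing import List


def validate_allowed_token_ids(allowed_token_ids: List[int],
                               vocab_size: int) -> None:
    # Same bounds check as the diff: reject any out-of-vocab token id
    # up front, before the request is admitted to the input batch.
    if not all(0 <= tid < vocab_size for tid in allowed_token_ids):
        raise ValueError(
            "allowed_token_ids contains out-of-vocab token id")


validate_allowed_token_ids([1, 3, 5], vocab_size=32000)  # passes silently
try:
    validate_allowed_token_ids([-1, 99999], vocab_size=32000)
except ValueError as e:
    print(e)  # allowed_token_ids contains out-of-vocab token id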
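
A second sketch, again only illustrative, reproduces the masking mechanics the series adds: InputBatch.add_request flips the entries listed in sampling_params.allowed_token_ids to True in a (num_reqs x vocab_size) bool tensor, and Sampler.apply_allowed_token_ids fills the True positions of the float32 logits with -inf via an in-place masked_fill_. The tiny shapes and the standalone function name are assumptions for illustration; the fill semantics are exactly those of the diff, where positions marked True in the mask are the ones that receive -inf.

from typing import Optional

import torch


def apply_allowed_token_ids_sketch(
        logits: torch.Tensor,
        allowed_token_ids_mask: Optional[torch.Tensor]) -> torch.Tensor:
    # No-op when no request in the batch uses the feature; otherwise an
    # in-place fill, as in the new Sampler.apply_allowed_token_ids.
    if allowed_token_ids_mask is not None:
        logits.masked_fill_(allowed_token_ids_mask, float("-inf"))
    return logits


num_reqs, vocab_size = 2, 8
logits = torch.zeros(num_reqs, vocab_size)

# Build the per-request mask the way the input batch does.
mask = torch.zeros(num_reqs, vocab_size, dtype=torch.bool)
mask[0, [1, 3, 5]] = True  # request 0 supplied allowed_token_ids=[1, 3, 5];
                           # request 1 did not, so its row stays all-False.

out = apply_allowed_token_ids_sketch(logits, mask)
print(out[0])  # positions 1, 3 and 5 are -inf, the rest are untouched
print(out[1])  # request 1's logits are unchanged

In the real code the CPU-side mask is allocated lazily (it can be max_num_reqs x vocab_size), copied to the device tensor in _make_sampling_metadata, and sliced to the live batch size before being handed to the sampler.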