From 4897f02c42cdc6a6eba6ae6d6d8fca99d6ed35d5 Mon Sep 17 00:00:00 2001 From: Lu Fang Date: Wed, 12 Feb 2025 23:28:15 -0800 Subject: [PATCH 1/4] Introduce allowed_token_ids support in the v1 Sampler Signed-off-by: Lu Fang Complete the main logic for allowed_token_ids in v1 sampler Signed-off-by: Lu Fang lint the code Signed-off-by: Lu Fang Add boundary checks for allowed_token_ids Signed-off-by: Lu Fang address the comments Signed-off-by: Lu Fang --- tests/v1/sample/test_sampler.py | 99 +++++++++++++++++++++---- tests/v1/worker/test_gpu_input_batch.py | 18 +++++ vllm/v1/sample/metadata.py | 8 ++ vllm/v1/sample/sampler.py | 25 ++++++- vllm/v1/worker/gpu_input_batch.py | 52 ++++++++++++- 5 files changed, 181 insertions(+), 21 deletions(-) diff --git a/tests/v1/sample/test_sampler.py b/tests/v1/sample/test_sampler.py index 3f6301c54267..864ac12397fc 100644 --- a/tests/v1/sample/test_sampler.py +++ b/tests/v1/sample/test_sampler.py @@ -57,6 +57,28 @@ def _create_logit_bias( return res +def _create_allowed_token_ids( + batch_size: int, + vocab_size: int, + num_allowed_token_ids: int, + device: torch.device, +) -> Tuple[bool, Optional[torch.Tensor]]: + mask: Optional[torch.Tensor] = None + no_allowed_token_ids = True + for i in range(batch_size): + if i % 2 == 1: + continue + if mask is None: + mask = torch.zeros((batch_size, vocab_size), + dtype=torch.bool, + device=device) + start = min(i, vocab_size - 1) + end = min(i + num_allowed_token_ids, vocab_size - 1) + mask[i, start:end] = True + no_allowed_token_ids = False + return (no_allowed_token_ids, mask) + + def _create_default_sampling_metadata( num_output_tokens: int, batch_size: int, @@ -92,6 +114,8 @@ def _create_default_sampling_metadata( no_penalties=True, min_tokens={}, logit_bias=[None] * batch_size, + no_allowed_token_ids=True, + allowed_token_ids_mask=None, ) return fake_sampling_metadata @@ -253,7 +277,10 @@ def test_sampler_frequency_penalty(device: str, batch_size: int, sampling_metadata.frequency_penalties = _create_penalty_tensor( batch_size, frequency_penalty, torch.device(device)) output_token_ids, sorted_token_ids_in_output = \ - _create_weighted_output_token_list(batch_size, VOCAB_SIZE) + _create_weighted_output_token_list( + batch_size, + VOCAB_SIZE, + ) sampling_metadata.output_token_ids = output_token_ids sampling_metadata.no_penalties = False sampler = Sampler() @@ -262,8 +289,8 @@ def test_sampler_frequency_penalty(device: str, batch_size: int, for batch_idx in range(batch_size): non_penalized_token_id = logits[batch_idx].argmax().item() penalized_token_id = logits[batch_idx].argmin().item() - distinct_sorted_token_ids_in_output = \ - sorted_token_ids_in_output[batch_idx] + distinct_sorted_token_ids_in_output = sorted_token_ids_in_output[ + batch_idx] most_frequent_token_id = distinct_sorted_token_ids_in_output[ len(distinct_sorted_token_ids_in_output) - 1] if frequency_penalty > 0: @@ -272,8 +299,8 @@ def test_sampler_frequency_penalty(device: str, batch_size: int, # non-penalized token ID is not present in the output, while the # most penalized token is the one that occurs most frequently in # the output. 
- assert non_penalized_token_id \ - not in distinct_sorted_token_ids_in_output + assert (non_penalized_token_id + not in distinct_sorted_token_ids_in_output) assert penalized_token_id == most_frequent_token_id elif frequency_penalty < 0: # If `frequency_penalty` is set to < 0, it indicates @@ -282,8 +309,7 @@ def test_sampler_frequency_penalty(device: str, batch_size: int, # in the output, while the penalized token ID is one that has not # yet appeared. assert non_penalized_token_id == most_frequent_token_id - assert penalized_token_id \ - not in distinct_sorted_token_ids_in_output + assert penalized_token_id not in distinct_sorted_token_ids_in_output @pytest.mark.parametrize("device", CUDA_DEVICES) @@ -318,18 +344,18 @@ def test_sampler_repetition_penalty(device: str, batch_size: int, # If `repetition_penalty` > 1.0, verify that the non-penalized # token ID has not been seen before, while the penalized token ID # exists either in the prompt or the output. - assert (non_penalized_token_id not in prompt_tokens and \ - non_penalized_token_id not in output_tokens) - assert (penalized_token_id in prompt_tokens or \ - penalized_token_id in output_tokens) + assert (non_penalized_token_id not in prompt_tokens + and non_penalized_token_id not in output_tokens) + assert (penalized_token_id in prompt_tokens + or penalized_token_id in output_tokens) elif repetition_penalty < 1.0: # If `repetition_penalty` < 1.0, verify that the penalized # token ID has not been seen before, while the non-penalized # token ID exists either in the prompt or the output. - assert (penalized_token_id not in prompt_tokens and \ - penalized_token_id not in output_tokens) - assert (non_penalized_token_id in prompt_tokens or \ - non_penalized_token_id in output_tokens) + assert (penalized_token_id not in prompt_tokens + and penalized_token_id not in output_tokens) + assert (non_penalized_token_id in prompt_tokens + or non_penalized_token_id in output_tokens) @pytest.mark.parametrize("device", CUDA_DEVICES) @@ -404,3 +430,46 @@ def test_sampler_logit_bias(device: str, batch_size: int, bias_value: float): 1e-2) else: assert logits_for_req[token_id] == pytest.approx(1e-2) + + +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("batch_size", [1, 2, 32]) +@pytest.mark.parametrize("num_allowed_token_ids", [0, 1, 2]) +def test_sampler_allowed_token_ids(device: str, batch_size: int, + num_allowed_token_ids: int): + """ + Test to verify that when the repetition penalty is enabled, tokens + are penalized based on their presence in the prompt or the existing + output. + """ + torch.set_default_device(device) + # Create fake logits where each token is assigned the same + # logit value. 
+ fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE) + sampling_metadata = _create_default_sampling_metadata( + NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)) + no_allowed_token_ids, mask = _create_allowed_token_ids( + batch_size=batch_size, + vocab_size=VOCAB_SIZE, + num_allowed_token_ids=num_allowed_token_ids, + device=device, + ) + sampling_metadata.no_allowed_token_ids = no_allowed_token_ids + sampling_metadata.allowed_token_ids_mask = mask + sampler = Sampler() + logits = sampler.apply_allowed_token_ids(fake_logits, sampling_metadata) + logits = logits.cpu() + assert not sampling_metadata.no_allowed_token_ids + for batch_idx in range(batch_size): + logits_for_req = logits[batch_idx] + if batch_idx % 2 == 1: + assert torch.all(logits_for_req != -float("inf")) + continue + for token_id in range(VOCAB_SIZE): + start = min(batch_idx, VOCAB_SIZE - 1) + end = min(batch_idx + num_allowed_token_ids, VOCAB_SIZE - 1) + if token_id >= start and token_id < end: + assert logits_for_req[token_id] == -float( + "inf"), f"{batch_idx}, {token_id}" + else: + assert logits_for_req[token_id] != -float("inf") diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py index cb3b3d21fbb3..692dde27b401 100644 --- a/tests/v1/worker/test_gpu_input_batch.py +++ b/tests/v1/worker/test_gpu_input_batch.py @@ -66,6 +66,11 @@ def _construct_expected_sampling_metadata( temperature = [0.0 for _ in range(num_reqs)] min_tokens = {} logit_bias = [None] * num_reqs + has_allowed_token_ids = [False] * num_reqs + allowed_token_ids_mask = torch.zeros(num_reqs, + VOCAB_SIZE, + dtype=torch.bool, + device=device) for req in reqs: if req.req_id not in req_ids_retained: continue @@ -86,6 +91,11 @@ def _construct_expected_sampling_metadata( req.sampling_params.min_tokens, req.sampling_params.all_stop_token_ids) logit_bias[index_in_input_batch] = req.sampling_params.logit_bias + if req.sampling_params.allowed_token_ids: + has_allowed_token_ids[index_in_input_batch] = True + allowed_token_ids_mask[index_in_input_batch][ + req.sampling_params.allowed_token_ids] = True + return SamplingMetadata( temperature=torch.tensor(temperature, dtype=torch.float, device=device), @@ -121,6 +131,8 @@ def _construct_expected_sampling_metadata( and all(x == 0 for x in frequency_penalties) and all(x == 1 for x in repetition_penalties)), logit_bias=logit_bias, + no_allowed_token_ids=not any(has_allowed_token_ids), + allowed_token_ids_mask=allowed_token_ids_mask, ) @@ -242,3 +254,9 @@ def same(t1: Optional[torch.Tensor], t2: Optional[torch.Tensor]) -> bool: assert expected_sampling_metadata.no_penalties == \ sampling_metadata.no_penalties assert expected_sampling_metadata.logit_bias == sampling_metadata.logit_bias + assert (expected_sampling_metadata.no_allowed_token_ids == + sampling_metadata.no_allowed_token_ids) + if not sampling_metadata.no_allowed_token_ids: + assert torch.allclose( + expected_sampling_metadata.allowed_token_ids_mask, + sampling_metadata.allowed_token_ids_mask) diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index 6d82d3a79c8e..d5421c7d30e6 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -37,3 +37,11 @@ class SamplingMetadata: min_tokens: Dict[int, Tuple[int, Set[int]]] logit_bias: List[Optional[Dict[int, float]]] + + # These two parameters are for allowed_token_ids. + # `no_allowed_token_ids`` is a bool to indicate whether we have + # allowed_token_ids. 
+ # `allowed_token_ids_mask` is a 2D bool tensor of shape (max batch size, + # vocab size). + no_allowed_token_ids: bool + allowed_token_ids_mask: Optional[torch.Tensor] diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index ff978b3b6c41..124a04bac42d 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -47,6 +47,8 @@ def forward( # Use float32 for the logits. logits = logits.to(torch.float32) + # Apply allowed token ids. + logits = self.apply_allowed_token_ids(logits, sampling_metadata) # Apply logits bias. logits = self.apply_logits_bias(logits, sampling_metadata) # Apply penalties (e.g., min_tokens, freq_penalties). @@ -56,8 +58,8 @@ def forward( # Gather the logprobs of the topk and sampled token (if requested). # Get logprobs and rank tensors (if requested) - logprobs_tensors = None if num_logprobs is None else \ - self.gather_logprobs(raw_logprobs, num_logprobs, token_ids=sampled) + logprobs_tensors = (None if num_logprobs is None else \ + self.gather_logprobs(raw_logprobs, num_logprobs, token_ids=sampled)) # Use int32 to reduce the tensor size. sampled = sampled.to(torch.int32) @@ -181,14 +183,17 @@ def apply_penalties( apply_min_token_penalties(logits, sampling_metadata.output_token_ids, sampling_metadata.min_tokens) + if not sampling_metadata.no_penalties: assert sampling_metadata.prompt_token_ids is not None logits = apply_all_penalties( - logits, sampling_metadata.prompt_token_ids, + logits, + sampling_metadata.prompt_token_ids, sampling_metadata.presence_penalties, sampling_metadata.frequency_penalties, sampling_metadata.repetition_penalties, - sampling_metadata.output_token_ids) + sampling_metadata.output_token_ids, + ) return logits def apply_min_p( @@ -226,3 +231,15 @@ def apply_logits_bias( for token_id, bias in logit_bias.items(): logits[i, token_id] += bias return logits + + def apply_allowed_token_ids( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> torch.Tensor: + # One idea is implement this as a PyTorch C++ op, and we may + # even optimize the logit_bias layout. 
+ if not sampling_metadata.no_allowed_token_ids: + logits.masked_fill_(sampling_metadata.allowed_token_ids_mask, + float("-inf")) + return logits diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index bd1c369acb30..dc9561337792 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -143,7 +143,7 @@ def __init__( device="cpu", pin_memory=pin_memory) self.frequency_penalties_cpu = \ - self.frequency_penalties_cpu_tensor.numpy() + self.frequency_penalties_cpu_tensor.numpy() self.frequency_penalties_reqs: Set[str] = set() # Presence penalty related data structures @@ -168,7 +168,7 @@ def __init__( device="cpu", pin_memory=pin_memory) self.repetition_penalties_cpu = \ - self.repetition_penalties_cpu_tensor.numpy() + self.repetition_penalties_cpu_tensor.numpy() self.repetition_penalties_reqs: Set[str] = set() # req_index -> (min_tokens, stop_token_ids) @@ -192,6 +192,9 @@ def __init__( self.logit_bias: List[Optional[Dict[int, float]]] = [None] * max_num_reqs + self.has_allowed_token_ids: List[bool] = [False] * max_num_reqs + self.allowed_token_ids_mask: Optional[torch.Tensor] = None + self.allowed_token_ids_mask_cpu_tensor: Optional[torch.Tensor] = None self.req_output_token_ids: List[Optional[List[int]]] = [] @@ -287,6 +290,28 @@ def add_request( if sampling_params.logit_bias is not None: self.logit_bias[req_index] = sampling_params.logit_bias + if sampling_params.allowed_token_ids: + # NOTE(houseroad): put the check here since no vocab_size info + # available in vllm/sampling_params.py + if not all(0 <= tid < self.vocab_size + for tid in sampling_params.allowed_token_ids): + raise ValueError( + "allowed_token_ids contains out-of-vocab token id") + self.has_allowed_token_ids[req_index] = True + if self.allowed_token_ids_mask_cpu_tensor is None: + # Lazy allocation for this tensor, which can be large. + self.allowed_token_ids_mask = torch.zeros(self.max_num_reqs, + self.vocab_size, + dtype=torch.bool, + device=self.device) + self.allowed_token_ids_mask_cpu_tensor = torch.zeros( + self.max_num_reqs, + self.vocab_size, + dtype=torch.bool, + device="cpu") + self.allowed_token_ids_mask_cpu_tensor[req_index][ + sampling_params.allowed_token_ids] = True + # Add request lora ID if request.lora_request: lora_id = request.lora_request.lora_int_id @@ -332,6 +357,9 @@ def remove_request(self, req_id: str) -> Optional[int]: self.request_lora_mapping[req_index] = 0 self.logit_bias[req_index] = None + self.has_allowed_token_ids[req_index] = False + if self.allowed_token_ids_mask_cpu_tensor is not None: + self.allowed_token_ids_mask_cpu_tensor[req_index].fill_(False) return req_index def condense(self, empty_req_indices: List[int]) -> None: @@ -400,6 +428,13 @@ def condense(self, empty_req_indices: List[int]) -> None: self.logit_bias[empty_index] = self.logit_bias[last_req_index] + self.has_allowed_token_ids[ + empty_index] = self.has_allowed_token_ids[last_req_index] + if self.allowed_token_ids_mask_cpu_tensor is not None: + self.allowed_token_ids_mask_cpu_tensor[ + empty_index] = self.allowed_token_ids_mask_cpu_tensor[ + last_req_index] + # Decrement last_req_index since it is now empty. 
last_req_index -= 1 @@ -442,6 +477,13 @@ def _make_sampling_metadata(self) -> SamplingMetadata: else: prompt_token_ids = None + allowed_token_ids_mask: Optional[torch.Tensor] = None + if not self.no_allowed_token_ids and \ + self.allowed_token_ids_mask is not None: + copy_slice(self.allowed_token_ids_mask_cpu_tensor, + self.allowed_token_ids_mask, num_reqs) + allowed_token_ids_mask = self.allowed_token_ids_mask[:num_reqs] + return SamplingMetadata( temperature=temperature, all_greedy=self.all_greedy, @@ -460,6 +502,8 @@ def _make_sampling_metadata(self) -> SamplingMetadata: min_tokens=self.min_tokens, no_penalties=self.no_penalties, logit_bias=self.logit_bias[:num_reqs], + no_allowed_token_ids=self.no_allowed_token_ids, + allowed_token_ids_mask=allowed_token_ids_mask, ) def get_sampling_metadata( @@ -550,3 +594,7 @@ def max_num_logprobs(self) -> Optional[int]: @property def no_prompt_logprob(self) -> bool: return not self.num_prompt_logprobs + + @property + def no_allowed_token_ids(self) -> bool: + return not any(self.has_allowed_token_ids) From 368369c5e0f72b25bcc13d54667e43900a78b360 Mon Sep 17 00:00:00 2001 From: Lu Fang Date: Thu, 20 Feb 2025 13:54:02 -0800 Subject: [PATCH 2/4] address comments Signed-off-by: Lu Fang --- tests/v1/sample/test_sampler.py | 11 +++-------- tests/v1/worker/test_gpu_input_batch.py | 7 +------ vllm/v1/sample/metadata.py | 4 ---- vllm/v1/sample/sampler.py | 4 +--- vllm/v1/worker/gpu_input_batch.py | 15 ++++++--------- 5 files changed, 11 insertions(+), 30 deletions(-) diff --git a/tests/v1/sample/test_sampler.py b/tests/v1/sample/test_sampler.py index 864ac12397fc..34fba5a9f6d7 100644 --- a/tests/v1/sample/test_sampler.py +++ b/tests/v1/sample/test_sampler.py @@ -62,9 +62,8 @@ def _create_allowed_token_ids( vocab_size: int, num_allowed_token_ids: int, device: torch.device, -) -> Tuple[bool, Optional[torch.Tensor]]: +) -> Optional[torch.Tensor]: mask: Optional[torch.Tensor] = None - no_allowed_token_ids = True for i in range(batch_size): if i % 2 == 1: continue @@ -75,8 +74,7 @@ def _create_allowed_token_ids( start = min(i, vocab_size - 1) end = min(i + num_allowed_token_ids, vocab_size - 1) mask[i, start:end] = True - no_allowed_token_ids = False - return (no_allowed_token_ids, mask) + return mask def _create_default_sampling_metadata( @@ -114,7 +112,6 @@ def _create_default_sampling_metadata( no_penalties=True, min_tokens={}, logit_bias=[None] * batch_size, - no_allowed_token_ids=True, allowed_token_ids_mask=None, ) return fake_sampling_metadata @@ -448,18 +445,16 @@ def test_sampler_allowed_token_ids(device: str, batch_size: int, fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE) sampling_metadata = _create_default_sampling_metadata( NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)) - no_allowed_token_ids, mask = _create_allowed_token_ids( + mask = _create_allowed_token_ids( batch_size=batch_size, vocab_size=VOCAB_SIZE, num_allowed_token_ids=num_allowed_token_ids, device=device, ) - sampling_metadata.no_allowed_token_ids = no_allowed_token_ids sampling_metadata.allowed_token_ids_mask = mask sampler = Sampler() logits = sampler.apply_allowed_token_ids(fake_logits, sampling_metadata) logits = logits.cpu() - assert not sampling_metadata.no_allowed_token_ids for batch_idx in range(batch_size): logits_for_req = logits[batch_idx] if batch_idx % 2 == 1: diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py index 692dde27b401..0aee266264ac 100644 --- a/tests/v1/worker/test_gpu_input_batch.py +++ 
b/tests/v1/worker/test_gpu_input_batch.py @@ -66,7 +66,6 @@ def _construct_expected_sampling_metadata( temperature = [0.0 for _ in range(num_reqs)] min_tokens = {} logit_bias = [None] * num_reqs - has_allowed_token_ids = [False] * num_reqs allowed_token_ids_mask = torch.zeros(num_reqs, VOCAB_SIZE, dtype=torch.bool, @@ -92,7 +91,6 @@ def _construct_expected_sampling_metadata( req.sampling_params.all_stop_token_ids) logit_bias[index_in_input_batch] = req.sampling_params.logit_bias if req.sampling_params.allowed_token_ids: - has_allowed_token_ids[index_in_input_batch] = True allowed_token_ids_mask[index_in_input_batch][ req.sampling_params.allowed_token_ids] = True @@ -131,7 +129,6 @@ def _construct_expected_sampling_metadata( and all(x == 0 for x in frequency_penalties) and all(x == 1 for x in repetition_penalties)), logit_bias=logit_bias, - no_allowed_token_ids=not any(has_allowed_token_ids), allowed_token_ids_mask=allowed_token_ids_mask, ) @@ -254,9 +251,7 @@ def same(t1: Optional[torch.Tensor], t2: Optional[torch.Tensor]) -> bool: assert expected_sampling_metadata.no_penalties == \ sampling_metadata.no_penalties assert expected_sampling_metadata.logit_bias == sampling_metadata.logit_bias - assert (expected_sampling_metadata.no_allowed_token_ids == - sampling_metadata.no_allowed_token_ids) - if not sampling_metadata.no_allowed_token_ids: + if sampling_metadata.allowed_token_ids_mask: assert torch.allclose( expected_sampling_metadata.allowed_token_ids_mask, sampling_metadata.allowed_token_ids_mask) diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index d5421c7d30e6..9f7770bbd078 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -38,10 +38,6 @@ class SamplingMetadata: logit_bias: List[Optional[Dict[int, float]]] - # These two parameters are for allowed_token_ids. - # `no_allowed_token_ids`` is a bool to indicate whether we have - # allowed_token_ids. # `allowed_token_ids_mask` is a 2D bool tensor of shape (max batch size, # vocab size). - no_allowed_token_ids: bool allowed_token_ids_mask: Optional[torch.Tensor] diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 124a04bac42d..0e4cbada915b 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -237,9 +237,7 @@ def apply_allowed_token_ids( logits: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> torch.Tensor: - # One idea is implement this as a PyTorch C++ op, and we may - # even optimize the logit_bias layout. 
- if not sampling_metadata.no_allowed_token_ids: + if sampling_metadata.allowed_token_ids_mask is not None: logits.masked_fill_(sampling_metadata.allowed_token_ids_mask, float("-inf")) return logits diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index dc9561337792..fab11a6035d7 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -192,7 +192,7 @@ def __init__( self.logit_bias: List[Optional[Dict[int, float]]] = [None] * max_num_reqs - self.has_allowed_token_ids: List[bool] = [False] * max_num_reqs + self.has_allowed_token_ids: Set[str] = set() self.allowed_token_ids_mask: Optional[torch.Tensor] = None self.allowed_token_ids_mask_cpu_tensor: Optional[torch.Tensor] = None @@ -297,7 +297,7 @@ def add_request( for tid in sampling_params.allowed_token_ids): raise ValueError( "allowed_token_ids contains out-of-vocab token id") - self.has_allowed_token_ids[req_index] = True + self.has_allowed_token_ids.add(req_id) if self.allowed_token_ids_mask_cpu_tensor is None: # Lazy allocation for this tensor, which can be large. self.allowed_token_ids_mask = torch.zeros(self.max_num_reqs, @@ -357,7 +357,7 @@ def remove_request(self, req_id: str) -> Optional[int]: self.request_lora_mapping[req_index] = 0 self.logit_bias[req_index] = None - self.has_allowed_token_ids[req_index] = False + self.has_allowed_token_ids.discard(req_id) if self.allowed_token_ids_mask_cpu_tensor is not None: self.allowed_token_ids_mask_cpu_tensor[req_index].fill_(False) return req_index @@ -428,8 +428,6 @@ def condense(self, empty_req_indices: List[int]) -> None: self.logit_bias[empty_index] = self.logit_bias[last_req_index] - self.has_allowed_token_ids[ - empty_index] = self.has_allowed_token_ids[last_req_index] if self.allowed_token_ids_mask_cpu_tensor is not None: self.allowed_token_ids_mask_cpu_tensor[ empty_index] = self.allowed_token_ids_mask_cpu_tensor[ @@ -478,8 +476,8 @@ def _make_sampling_metadata(self) -> SamplingMetadata: prompt_token_ids = None allowed_token_ids_mask: Optional[torch.Tensor] = None - if not self.no_allowed_token_ids and \ - self.allowed_token_ids_mask is not None: + if not self.no_allowed_token_ids: + assert self.allowed_token_ids_mask is not None copy_slice(self.allowed_token_ids_mask_cpu_tensor, self.allowed_token_ids_mask, num_reqs) allowed_token_ids_mask = self.allowed_token_ids_mask[:num_reqs] @@ -502,7 +500,6 @@ def _make_sampling_metadata(self) -> SamplingMetadata: min_tokens=self.min_tokens, no_penalties=self.no_penalties, logit_bias=self.logit_bias[:num_reqs], - no_allowed_token_ids=self.no_allowed_token_ids, allowed_token_ids_mask=allowed_token_ids_mask, ) @@ -597,4 +594,4 @@ def no_prompt_logprob(self) -> bool: @property def no_allowed_token_ids(self) -> bool: - return not any(self.has_allowed_token_ids) + return len(self.has_allowed_token_ids) == 0 From 60b439370fa556a6ed6c9fb33dbb54680f5a5acd Mon Sep 17 00:00:00 2001 From: Lu Fang Date: Thu, 20 Feb 2025 19:30:38 -0800 Subject: [PATCH 3/4] move the check in the Processor Signed-off-by: Lu Fang --- vllm/v1/engine/processor.py | 14 ++++++++++++++ vllm/v1/sample/sampler.py | 5 ++--- vllm/v1/worker/gpu_input_batch.py | 6 ------ 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index b7eee5a39972..2547cebaede7 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -83,6 +83,19 @@ def _validate_lora(self, lora_request: Optional[LoRARequest]) -> None: raise ValueError(f"Got lora_request 
{lora_request} but LoRA is " "not enabled!") + def _validate_allowed_token_ids( + self, + params: Union[SamplingParams, PoolingParams], + ) -> None: + if not isinstance(params, SamplingParams): + return + if params.allowed_token_ids is None: + return + if not all(0 <= tid < self.model_config.vocab_size + for tid in params.allowed_token_ids): + raise ValueError( + "allowed_token_ids contains out-of-vocab token id") + def process_inputs( self, request_id: str, @@ -100,6 +113,7 @@ def process_inputs( self._validate_logprobs(params) self._validate_lora(lora_request) + self._validate_allowed_token_ids(params) if arrival_time is None: arrival_time = time.time() diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 0e4cbada915b..47ec26d42024 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -58,8 +58,8 @@ def forward( # Gather the logprobs of the topk and sampled token (if requested). # Get logprobs and rank tensors (if requested) - logprobs_tensors = (None if num_logprobs is None else \ - self.gather_logprobs(raw_logprobs, num_logprobs, token_ids=sampled)) + logprobs_tensors = None if num_logprobs is None else \ + self.gather_logprobs(raw_logprobs, num_logprobs, token_ids=sampled) # Use int32 to reduce the tensor size. sampled = sampled.to(torch.int32) @@ -183,7 +183,6 @@ def apply_penalties( apply_min_token_penalties(logits, sampling_metadata.output_token_ids, sampling_metadata.min_tokens) - if not sampling_metadata.no_penalties: assert sampling_metadata.prompt_token_ids is not None logits = apply_all_penalties( diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index fab11a6035d7..d9fc53490c07 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -291,12 +291,6 @@ def add_request( self.logit_bias[req_index] = sampling_params.logit_bias if sampling_params.allowed_token_ids: - # NOTE(houseroad): put the check here since no vocab_size info - # available in vllm/sampling_params.py - if not all(0 <= tid < self.vocab_size - for tid in sampling_params.allowed_token_ids): - raise ValueError( - "allowed_token_ids contains out-of-vocab token id") self.has_allowed_token_ids.add(req_id) if self.allowed_token_ids_mask_cpu_tensor is None: # Lazy allocation for this tensor, which can be large. From beaba718f45afbc452f97082623aab8e98869e44 Mon Sep 17 00:00:00 2001 From: Lu Fang Date: Fri, 21 Feb 2025 14:00:52 -0800 Subject: [PATCH 4/4] fix more tests Signed-off-by: Lu Fang --- tests/v1/sample/test_rejection_sampler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/v1/sample/test_rejection_sampler.py b/tests/v1/sample/test_rejection_sampler.py index 3e810e525e1c..956d91c6daf7 100644 --- a/tests/v1/sample/test_rejection_sampler.py +++ b/tests/v1/sample/test_rejection_sampler.py @@ -43,6 +43,7 @@ def create_sampling_metadata(spec_tokens: List[List[int]]) -> SamplingMetadata: output_token_ids=[], min_tokens={}, logit_bias=[None] * batch_size, + allowed_token_ids_mask=None, )
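
Illustrative note, not part of the patch series above: end to end, a request supplies SamplingParams(allowed_token_ids=[...]), patch 3 validates the ids in the Processor before they reach the GPU input batch, the input batch builds a per-request boolean mask, and the sampler applies that mask to the logits. The short sketch below mirrors only the request-time check added in patch 3 (_validate_allowed_token_ids); the standalone function name and the example vocab_size are assumptions for illustration, while the predicate and the error message are taken from the diff.

from typing import List


def validate_allowed_token_ids(allowed_token_ids: List[int],
                               vocab_size: int) -> None:
    # Same bounds check as the diff: reject any out-of-vocab token id
    # up front, before the request is admitted to the input batch.
    if not all(0 <= tid < vocab_size for tid in allowed_token_ids):
        raise ValueError(
            "allowed_token_ids contains out-of-vocab token id")


validate_allowed_token_ids([1, 3, 5], vocab_size=32000)  # passes silently
try:
    validate_allowed_token_ids([-1, 99999], vocab_size=32000)
except ValueError as e:
    print(e)  # allowed_token_ids contains out-of-vocab token id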
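
A second sketch, again only illustrative, reproduces the masking mechanics the series adds: InputBatch.add_request flips the entries listed in sampling_params.allowed_token_ids to True in a (num_reqs x vocab_size) bool tensor, and Sampler.apply_allowed_token_ids fills the True positions of the float32 logits with -inf via an in-place masked_fill_. The tiny shapes and the standalone function name are assumptions for illustration; the fill semantics are exactly those of the diff, where positions marked True in the mask are the ones that receive -inf.

from typing import Optional

import torch


def apply_allowed_token_ids_sketch(
        logits: torch.Tensor,
        allowed_token_ids_mask: Optional[torch.Tensor]) -> torch.Tensor:
    # No-op when no request in the batch uses the feature; otherwise an
    # in-place fill, as in the new Sampler.apply_allowed_token_ids.
    if allowed_token_ids_mask is not None:
        logits.masked_fill_(allowed_token_ids_mask, float("-inf"))
    return logits


num_reqs, vocab_size = 2, 8
logits = torch.zeros(num_reqs, vocab_size)

# Build the per-request mask the way the input batch does.
mask = torch.zeros(num_reqs, vocab_size, dtype=torch.bool)
mask[0, [1, 3, 5]] = True  # request 0 supplied allowed_token_ids=[1, 3, 5];
                           # request 1 did not, so its row stays all-False.

out = apply_allowed_token_ids_sketch(logits, mask)
print(out[0])  # positions 1, 3 and 5 are -inf, the rest are untouched
print(out[1])  # request 1's logits are unchanged

In the real code the CPU-side mask is allocated lazily (it can be max_num_reqs x vocab_size), copied to the device tensor in _make_sampling_metadata, and sliced to the live batch size before being handed to the sampler.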