|
20 | 20 |
|
21 | 21 | import torch |
22 | 22 |
|
23 | | -from vllm.v1.sample.ops.topk_topp_sampler import \ |
24 | | - apply_top_k_top_p # noqa: F401 |
25 | 23 | from vllm.v1.sample.sampler import Sampler # noqa: F401 |
26 | 24 |
|
27 | 25 | # Set tolerance to 1 for quant ops |
@@ -51,6 +49,49 @@ def apply_min_p_new( |
51 | 49 | return logits |
52 | 50 |
|
53 | 51 |
|
def apply_top_k_top_p(
    logits: torch.Tensor,
    k: Optional[torch.Tensor],
    p: Optional[torch.Tensor],
) -> torch.Tensor:
    """Apply top-k and top-p masks to the logits.

    If a top-p is used, this function will sort the logits tensor,
    which can be slow for large batches.

    The logits tensor may be updated in-place.
    """
    if p is None and k is None:
        return logits
    if p is None:
        # Top-k alone does not require sorting the full vocab.
        return apply_top_k_only(logits, k)

    # Ascending sort: the kept (largest) tokens end up on the right.
    sorted_vals, sort_order = logits.sort(dim=-1, descending=False)

    if k is not None:
        # Index of the k-th largest value within the ascending order.
        cutoff = sorted_vals.size(1) - k.to(torch.long)  # shape: B
        # Per-row threshold: everything strictly below it is dropped.
        threshold = sorted_vals.gather(1, cutoff.unsqueeze(dim=1))
        sorted_vals.masked_fill_(sorted_vals < threshold, -float("inf"))

    # Top-p (nucleus) filtering over the sorted distribution.
    cum_probs = sorted_vals.softmax(dim=-1).cumsum_(dim=-1)
    drop = cum_probs <= 1 - p.unsqueeze(dim=1)
    # Always keep at least one token (the largest).
    drop[:, -1] = False
    sorted_vals.masked_fill_(drop, -float("inf"))

    # Undo the sort so every value returns to its original vocab slot.
    return sorted_vals.scatter(dim=-1, index=sort_order, src=sorted_vals)
| 94 | + |
54 | 95 | def apply_top_k_top_p_new( |
55 | 96 | logits: torch.Tensor, |
56 | 97 | k: Optional[torch.Tensor], |
|
0 commit comments