Commit 355bda6

Fixup imports further, ignore 'frozen' imports in compilation, fix chunked prefill setting

Signed-off-by: Bram Wasti <bwasti@meta.com>

1 parent 2d330a7 commit 355bda6

File tree

7 files changed (+287, -20 lines)

tests/v1/generation/test_batch_invariance.py
Lines changed: 276 additions & 0 deletions

@@ -707,6 +707,282 @@ def test_logprobs_WITHOUT_batch_invariance_should_FAIL(backend):
         os.environ["VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT"] = old_value


+@pytest.mark.skipif(
+    not current_platform.has_device_capability(90),
+    reason="Batch invariance tests only supported on Hopper (SM90)",
+)
+@pytest.mark.skipif(
+    not torch.cuda.is_available(),
+    reason="Requires CUDA to match production inference path.",
+)
+@pytest.mark.parametrize("backend", ["FLASH_ATTN"])
+@pytest.mark.forked
+def test_decode_logprobs_match_prefill_logprobs(backend):
+    """
+    Test that verifies decode logprobs match prefill logprobs.
+
+    For each decoded token at position i:
+    1. Run decode to generate N tokens and collect their logprobs
+    2. For each position i in [0, N):
+       - Take prefix = prompt + tokens[0:i]
+       - Run prefill(prefix + tokens[i]) to get logprob of tokens[i]
+       - Verify prefill logprob matches decode logprob bitwise
+
+    This ensures that the logprobs from decode are consistent with what
+    we would get if we ran prefill on each prefix.
+    """
+    backend = os.getenv("VLLM_ATTENTION_BACKEND", backend)
+    os.environ["VLLM_ATTENTION_BACKEND"] = backend
+
+    seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
+    random.seed(seed)
+    model_name = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B")
+    tp_size = int(os.getenv("VLLM_TEST_TP_SIZE", "1"))
+
+    from vllm.model_executor.layers.batch_invariant import (
+        vllm_kernel_override_batch_invariant,
+    )
+
+    disable_custom_ar = vllm_kernel_override_batch_invariant()
+
+    if disable_custom_ar:
+        print(f"\n{'=' * 80}")
+        print(f"BATCH INVARIANCE MODE: Disabling custom all-reduce (TP={tp_size})")
+        print(f"{'=' * 80}\n")
+
+    llm = LLM(
+        model=model_name,
+        tensor_parallel_size=tp_size,
+        enable_prefix_caching=False,
+        max_num_seqs=32,
+        max_model_len=8192,
+        dtype="bfloat16",
+    )
+
+    # Use a few test prompts
+    num_test_prompts = int(os.getenv("VLLM_DECODE_PREFILL_NUM_PROMPTS", "4"))
+    prompts = [_random_prompt(10, 50) for _ in range(num_test_prompts)]
+
+    # Generate longer sequences to test multiple decode steps
+    max_tokens = int(os.getenv("VLLM_DECODE_PREFILL_MAX_TOKENS", "16"))
+
+    sp = SamplingParams(
+        temperature=0.0,  # Greedy for determinism
+        max_tokens=max_tokens,
+        logprobs=5,
+    )
+
+    print("\n" + "=" * 80)
+    print("STEP 1: Running decode to generate tokens and collect logprobs")
+    print("=" * 80 + "\n")
+
+    # Step 1: Run decode and collect logprobs
+    decode_outputs = llm.generate(prompts, sp, use_tqdm=False)
+
+    failed_comparisons = []
+
+    for prompt_idx, (prompt, decode_output) in enumerate(zip(prompts, decode_outputs)):
+        print(f"\n[Prompt {prompt_idx}] Testing: {prompt[:80]}...")
+
+        # Extract decode logprobs and tokens
+        decode_logprobs, token_ids = _extract_step_logprobs(decode_output)
+        if decode_logprobs is None:
+            pytest.skip(
+                "Logprobs are not available on RequestOutput; "
+                "enable logprobs return to run this test."
+            )
+
+        print(f"[Prompt {prompt_idx}] Generated {len(token_ids)} tokens: {token_ids}")
+        print(f"[Prompt {prompt_idx}] Decode logprobs: {decode_logprobs.tolist()}")
+
+        # Step 2: For each token position, run prefill and compare
+        print(f"\n[Prompt {prompt_idx}] Verifying each token via prefill...")
+
+        for token_idx in range(len(token_ids)):
+            # Construct the prefix up to (but not including) this token
+            current_token = token_ids[token_idx]
+
+            # We need to detokenize to get the text prefix
+            # For this, we'll use the tokenizer from the LLM
+            # However, the LLM API doesn't expose tokenizer easily, so we'll
+            # construct the prefix by decoding from the original prompt
+
+            # Get text up to this point by using the output text
+            # This is approximate but should work for verification
+            if token_idx == 0:
+                prefix_prompt = prompt
+            else:
+                # Use the partial output text up to this token
+                # We'll need to construct this from the full output
+                prefix_output = decode_output.outputs[0]
+                # Get the text for tokens 0 to token_idx-1
+                # Unfortunately, we don't have per-token text, so we'll use
+                # a different approach: run prefill with prompt + tokens[0:token_idx]
+
+                # Actually, we need to get the actual text. Let's use a workaround:
+                # Run a generation with max_tokens = token_idx to get that prefix
+                prefix_sp = SamplingParams(
+                    temperature=0.0,
+                    max_tokens=token_idx,
+                    logprobs=1,
+                )
+                prefix_output = llm.generate([prompt], prefix_sp, use_tqdm=False)[0]
+                prefix_prompt = prompt + prefix_output.outputs[0].text
+
+            # Now run prefill with max_tokens=1 to get the logprob of the next token
+            prefill_sp = SamplingParams(
+                temperature=0.0,
+                max_tokens=1,
+                logprobs=5,
+            )
+
+            print(
+                f" [Token {token_idx}] Running prefill for prefix "
+                f"(len={len(prefix_prompt)})..."
+            )
+            prefill_output = llm.generate([prefix_prompt], prefill_sp, use_tqdm=False)[
+                0
+            ]
+            prefill_logprobs, prefill_token_ids = _extract_step_logprobs(prefill_output)
+
+            if prefill_logprobs is None:
+                print(f" [Token {token_idx}] Warning: No prefill logprobs available")
+                continue
+
+            # The first token from prefill should match the current token
+            prefill_token = prefill_token_ids[0]
+            prefill_logprob = prefill_logprobs[0].item()
+            decode_logprob = decode_logprobs[token_idx].item()
+
+            print(
+                f" [Token {token_idx}] Decode token: {current_token}, "
+                f"logprob: {decode_logprob:.8f}"
+            )
+            print(
+                f" [Token {token_idx}] Prefill token: {prefill_token}, "
+                f"logprob: {prefill_logprob:.8f}"
+            )
+
+            # Check if tokens match
+            if current_token != prefill_token:
+                failed_comparisons.append(
+                    {
+                        "prompt_idx": prompt_idx,
+                        "token_idx": token_idx,
+                        "reason": "Token mismatch",
+                        "decode_token": current_token,
+                        "prefill_token": prefill_token,
+                        "decode_logprob": decode_logprob,
+                        "prefill_logprob": prefill_logprob,
+                        "prompt_text": prompt[:100],
+                        "prefix_text": prefix_prompt[:100],
+                    }
+                )
+                print(f" [Token {token_idx}] ✗ TOKEN MISMATCH!")
+                continue
+
+            # Check if logprobs match bitwise
+            if decode_logprob != prefill_logprob:
+                diff = abs(decode_logprob - prefill_logprob)
+                failed_comparisons.append(
+                    {
+                        "prompt_idx": prompt_idx,
+                        "token_idx": token_idx,
+                        "reason": "Logprob mismatch",
+                        "decode_token": current_token,
+                        "prefill_token": prefill_token,
+                        "decode_logprob": decode_logprob,
+                        "prefill_logprob": prefill_logprob,
+                        "diff": diff,
+                        "prompt_text": prompt[:100],
+                        "prefix_text": prefix_prompt[:100],
+                        "decode_all_tokens": token_ids,
+                        "decode_all_logprobs": decode_logprobs.tolist(),
+                    }
+                )
+                print(f" [Token {token_idx}] ✗ LOGPROB MISMATCH! diff={diff:.8e}")
+            else:
+                print(f" [Token {token_idx}] ✓ Match (bitwise equal)")
+
+    # Print summary
+    print(f"\n{'=' * 80}")
+    if failed_comparisons:
+        print(f"DECODE-PREFILL MISMATCH: {len(failed_comparisons)} failures detected")
+        print(f"{'=' * 80}")
+
+        # Group failures by prompt for better readability
+        failures_by_prompt: dict[int, list[dict]] = {}
+        for fail in failed_comparisons:
+            pid = fail["prompt_idx"]
+            if pid not in failures_by_prompt:
+                failures_by_prompt[pid] = []
+            failures_by_prompt[pid].append(fail)
+
+        for prompt_idx, failures in failures_by_prompt.items():
+            print(f"\n{'=' * 80}")
+            print(f"PROMPT {prompt_idx}: {failures[0]['prompt_text']}...")
+            print(f"{'=' * 80}")
+            print(f"Total failures for this prompt: {len(failures)}")
+
+            # Show where mismatches occur (which token positions)
+            mismatch_positions = [f["token_idx"] for f in failures]
+            print(f"Mismatch at token positions: {mismatch_positions}")
+
+            # Show first few failures in detail
+            for i, fail in enumerate(failures[:5]):  # Show first 5 failures per prompt
+                print(f"\n [Failure {i + 1}] Token position {fail['token_idx']}:")
+                print(f" Reason: {fail['reason']}")
+                print(f" Prefix text: '{fail['prefix_text']}...'")
+                print(
+                    f" Decode: token={fail['decode_token']}, "
+                    f"logprob={fail['decode_logprob']:.10f}"
+                )
+                print(
+                    f" Prefill: token={fail['prefill_token']}, "
+                    f"logprob={fail['prefill_logprob']:.10f}"
+                )
+                if "diff" in fail:
+                    print(f" Difference: {fail['diff']:.10e}")
+                    # Show in hex to see bitwise difference
+                    import struct
+
+                    decode_hex = struct.pack("f", fail["decode_logprob"]).hex()
+                    prefill_hex = struct.pack("f", fail["prefill_logprob"]).hex()
+                    print(f" Decode logprob (hex): 0x{decode_hex}")
+                    print(f" Prefill logprob (hex): 0x{prefill_hex}")
+
+                # If we have all tokens/logprobs, show the context
+                if "decode_all_tokens" in fail and "decode_all_logprobs" in fail:
+                    token_idx = fail["token_idx"]
+                    all_tokens = fail["decode_all_tokens"]
+                    all_logprobs = fail["decode_all_logprobs"]
+
+                    # Show context: 2 tokens before and after
+                    start = max(0, token_idx - 2)
+                    end = min(len(all_tokens), token_idx + 3)
+
+                    print(f" Context (tokens {start} to {end - 1}):")
+                    for j in range(start, end):
+                        marker = " <-- MISMATCH" if j == token_idx else ""
+                        print(
+                            f" [{j}] token={all_tokens[j]}, "
+                            f"logprob={all_logprobs[j]:.8f}{marker}"
+                        )
+
+            if len(failures) > 5:
+                print(f"\n ... and {len(failures) - 5} more failures for this prompt")
+
+        print(f"\n{'=' * 80}\n")
+
+        pytest.fail(
+            f"Decode logprobs do not match prefill logprobs: "
+            f"{len(failed_comparisons)} mismatches found."
+        )
+    else:
+        print("✓ SUCCESS: All decode logprobs match prefill logprobs bitwise!")
+        print(f"{'=' * 80}\n")
+
+
 def LLM_with_max_seqs(
     model: str,
     max_num_seqs: int,
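For reference, a minimal sketch of how the new test could be driven locally. The environment variable names and defaults come from the test above; the programmatic pytest invocation and the idea of running it as a standalone script are assumptions, not part of this commit.

    # Hypothetical runner for the new decode-vs-prefill check (not part of this
    # commit). Env var names/defaults mirror the test; everything else is assumed.
    import os
    import sys

    import pytest

    os.environ.setdefault("VLLM_ATTENTION_BACKEND", "FLASH_ATTN")
    os.environ.setdefault("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B")
    os.environ.setdefault("VLLM_TEST_TP_SIZE", "1")
    os.environ.setdefault("VLLM_TEST_SEED", "12345")
    os.environ.setdefault("VLLM_DECODE_PREFILL_NUM_PROMPTS", "4")
    os.environ.setdefault("VLLM_DECODE_PREFILL_MAX_TOKENS", "16")

    sys.exit(
        pytest.main(
            [
                "tests/v1/generation/test_batch_invariance.py"
                "::test_decode_logprobs_match_prefill_logprobs",
                "-v",
                "-s",
            ]
        )
    )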

vllm/compilation/caching.py
Lines changed: 3 additions & 1 deletion

@@ -3,6 +3,7 @@

 import hashlib
 import inspect
+import os
 import pickle
 from unittest.mock import patch

@@ -168,7 +169,8 @@ def _compute_code_hash(files: set[str]) -> str:
     )
     file_contents = {}
     for filepath in files:
-        if filepath == "<string>":
+        # Skip files that don't exist (e.g., <string>, <frozen modules>, etc.)
+        if not os.path.isfile(filepath):
             file_contents[filepath] = ""
         else:
             with open(filepath) as f:
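A small illustration of why the broader check helps (a sketch, not part of the commit; the candidate paths below are illustrative examples): compiled-in or dynamically created modules report pseudo-filenames that do not exist on disk, and only the literal "<string>" was skipped before.

    # Sketch comparing the old and new skip conditions used in _compute_code_hash.
    import os

    candidates = ["<string>", "<frozen importlib._bootstrap>", __file__]
    for path in candidates:
        old_skip = path == "<string>"        # old check: literal match only
        new_skip = not os.path.isfile(path)  # new check: anything not on disk
        print(f"{path!r}: old_skip={old_skip}, new_skip={new_skip}")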

vllm/config/model.py
Lines changed: 3 additions & 4 deletions

@@ -20,6 +20,9 @@
 from vllm.config.scheduler import RunnerType
 from vllm.config.utils import assert_hashable, config, getattr_iter
 from vllm.logger import init_logger
+from vllm.model_executor.layers.batch_invariant import (
+    vllm_kernel_override_batch_invariant,
+)
 from vllm.platforms import current_platform
 from vllm.transformers_utils.config import (
     ConfigFormat,
@@ -420,10 +423,6 @@ def __post_init__(
         video_pruning_rate: float | None,
     ) -> None:
         # Enable batch invariance settings if requested
-        from vllm.model_executor.layers.batch_invariant import (
-            vllm_kernel_override_batch_invariant,
-        )
-
         if vllm_kernel_override_batch_invariant():
             self.enforce_eager = True


vllm/config/parallel.py
Lines changed: 3 additions & 4 deletions

@@ -14,6 +14,9 @@
 import vllm.envs as envs
 from vllm.config.utils import config
 from vllm.logger import init_logger
+from vllm.model_executor.layers.batch_invariant import (
+    vllm_kernel_override_batch_invariant,
+)
 from vllm.platforms import current_platform
 from vllm.utils import cuda_device_count_stateless, get_open_ports_list

@@ -560,10 +563,6 @@ def use_ray(self) -> bool:
     def _verify_args(self) -> Self:
         # Lazy import to avoid circular import
         from vllm.executor.executor_base import ExecutorBase
-        from vllm.model_executor.layers.batch_invariant import (
-            vllm_kernel_override_batch_invariant,
-        )
-        from vllm.platforms import current_platform

         # Enable batch invariance settings if requested
         if vllm_kernel_override_batch_invariant():
vllm/config/scheduler.py
Lines changed: 0 additions & 8 deletions

@@ -170,20 +170,12 @@ def compute_hash(self) -> str:
         return hash_str

     def __post_init__(self, is_encoder_decoder: bool) -> None:
-        from vllm.model_executor.layers.batch_invariant import (
-            vllm_kernel_override_batch_invariant,
-        )
-
         if self.max_model_len is None:
             self.max_model_len = 8192

         if self.max_num_seqs is None:
             self.max_num_seqs = 128

-        # Enable batch invariance settings if requested
-        if vllm_kernel_override_batch_invariant():
-            self.enable_chunked_prefill = False
-
         if is_encoder_decoder:
             # Chunked prefill should be disabled for encoder-decoder models.
             self.disable_chunked_mm_input = True

vllm/engine/arg_utils.py
Lines changed: 1 addition & 2 deletions

@@ -1698,8 +1698,7 @@ def _set_default_args(
         # for non-pooling tasks.
         # For pooling tasks the default is False
         if model_config.runner_type != "pooling":
-            if self.enable_chunked_prefill is None:
-                self.enable_chunked_prefill = True
+            self.enable_chunked_prefill = True

         # TODO: When prefix caching supports prompt embeds inputs, this
         # check can be removed.

vllm/model_executor/layers/batch_invariant.py
Lines changed: 1 addition & 1 deletion

@@ -755,9 +755,9 @@ def vllm_kernel_override_batch_invariant():
 def override_envs_for_invariance():
     curr_attn_backend = envs.VLLM_ATTENTION_BACKEND
     supported_backends = [
+        "FLASH_ATTN",  # best supported backend
        "FLEX_ATTENTION",
         "FLASHINFER",
-        "FLASH_ATTN",
         "FLASH_ATTN_MLA",
         "TRITON_MLA",
         # Not yet supported MLA backends
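The reorder suggests the list is preference-ordered, with FLASH_ATTN now the first choice. A hypothetical sketch of how such an ordering could be consumed is below; the body of override_envs_for_invariance is not shown in this hunk, so the fallback logic here is an assumption, not the actual implementation.

    # Hypothetical fallback logic (illustration only; not the real
    # override_envs_for_invariance implementation).
    import os

    SUPPORTED_BACKENDS = [
        "FLASH_ATTN",  # best supported backend
        "FLEX_ATTENTION",
        "FLASHINFER",
        "FLASH_ATTN_MLA",
        "TRITON_MLA",
    ]

    def pick_batch_invariant_backend() -> str:
        curr = os.environ.get("VLLM_ATTENTION_BACKEND")
        # Keep the configured backend if supported; otherwise fall back to the
        # first (preferred) entry in the list.
        return curr if curr in SUPPORTED_BACKENDS else SUPPORTED_BACKENDS[0]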
