
Commit b729072

Fix Bug #3 and nucleus collapse: Increase tau_q soft floor to 2.0
ROOT CAUSE: draft_q_soft_temp=0.50 was SHARPENING the distribution instead of softening it (dividing logits by tau < 1.0 amplifies them; at tau = 0.50 it doubles their magnitude). This caused the nucleus to collapse to 1-2 survivors → q ≈ 1.0 → acceptance stuck at ~0.7038 (the average p_target).

FIXES:

1. Config defaults (config.py, arg_utils.py):
   - draft_q_temp_offset: 0.15 → 0.25 (better dynamic range)
   - draft_q_soft_temp: 0.50 → 2.0 (softens instead of sharpens)
   At draft_temp=0.05:
   - Before: tau_q = max(0.05 + 0.15, 0.50) = 0.50 (2x sharper!)
   - After: tau_q = max(0.05 + 0.25, 2.0) = 2.0 (2x softer)

2. Force min_keep=2 in nucleus (eagle.py line 271):
   - Added keep_sorted[..., :2] = True
   - Prevents survivors=1 by construction (defensive programming)

3. Fix smoothing to be uniform over the kept set (eagle.py lines 275-287):
   - Before: mixed with the untempered baseline (wrong approach)
   - After: uniform distribution over survivors only (correct)
   - Prevents q from reaching exactly 1.0 in corner cases

4. Remove dead code (eagle.py line 322):
   - Deleted the unused self._current_sampling_metadata assignment
   - No longer needed with the draft-anchored approach (bug #2 fix)

Expected results:
- tau_q ≥ 2.0 at ultracold temps → softer distribution
- NUC_DEBUG: survivors in the hundreds/thousands (not 1-2)
- Q_DEBUG: q ∈ [0.5, 0.8] (not 0.98-1.0)
- Accept rate: dynamic range restored across the temperature sweep
1 parent 65f57a3 commit b729072
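
For quick reference, a minimal standalone sketch of the temperature arithmetic described in the commit message; the toy logits and the effective_draft_temp helper are illustrative, not names from the vLLM code.

# Minimal sketch (not vLLM code) of tau_q = max(draft_temp + offset, soft_floor)
# and of why dividing logits by tau < 1 sharpens while tau > 1 softens.
import torch

def effective_draft_temp(draft_temp: float, offset: float, soft_floor: float) -> float:
    return max(draft_temp + offset, soft_floor)

logits = torch.tensor([4.0, 2.0, 0.0, -2.0])  # toy ultracold draft logits

tau_old = effective_draft_temp(0.05, 0.15, 0.50)  # = 0.50, 2x sharper than raw
tau_new = effective_draft_temp(0.05, 0.25, 2.0)   # = 2.0, 2x softer than raw

for name, tau in [("raw", 1.0), ("old tau_q=0.50", tau_old), ("new tau_q=2.0", tau_new)]:
    top = torch.softmax(logits / tau, dim=-1).max().item()
    print(f"{name}: top prob = {top:.3f}")  # sharper -> nucleus collapses to ~1 token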

File tree

3 files changed: +18 -16 lines


vllm/engine/arg_utils.py

Lines changed: 2 additions & 2 deletions

@@ -376,8 +376,8 @@ class EngineArgs:
     draft_top_p: float = 0.95
     draft_top_k: int = 0
     # Draft-anchored adaptive temperature settings
-    draft_q_temp_offset: float = 0.15
-    draft_q_soft_temp: float = 0.50
+    draft_q_temp_offset: float = 0.25
+    draft_q_soft_temp: float = 2.0
     draft_mix_lambda_max: float = 0.05
     revision: Optional[str] = ModelConfig.revision
     code_revision: Optional[str] = ModelConfig.code_revision

vllm/v1/spec_decode/config.py

Lines changed: 4 additions & 4 deletions

@@ -28,8 +28,8 @@ class SpecDecodeOptConfig:
     draft_top_k: int = 0  # 0 = disabled

     # Draft-anchored adaptive temperature settings
-    draft_q_temp_offset: float = 0.15  # Offset added to draft_temp
-    draft_q_soft_temp: float = 0.50  # Soft floor to prevent ultra-cold collapse
+    draft_q_temp_offset: float = 0.25  # Offset added to draft_temp
+    draft_q_soft_temp: float = 2.0  # Soft floor to prevent ultra-cold collapse
     draft_mix_lambda_max: float = 0.05  # Tiny smoothing over baseline

     # Debug and profiling settings
@@ -95,12 +95,12 @@ def from_cli_args(cls, vllm_config) -> "SpecDecodeOptConfig":
         if hasattr(vllm_config, 'draft_q_temp_offset'):
             config.draft_q_temp_offset = vllm_config.draft_q_temp_offset
         else:
-            config.draft_q_temp_offset = float(os.environ.get('VLLM_DRAFT_Q_TEMP_OFFSET', '0.15'))
+            config.draft_q_temp_offset = float(os.environ.get('VLLM_DRAFT_Q_TEMP_OFFSET', '0.25'))

         if hasattr(vllm_config, 'draft_q_soft_temp'):
             config.draft_q_soft_temp = vllm_config.draft_q_soft_temp
         else:
-            config.draft_q_soft_temp = float(os.environ.get('VLLM_DRAFT_Q_SOFT_TEMP', '0.50'))
+            config.draft_q_soft_temp = float(os.environ.get('VLLM_DRAFT_Q_SOFT_TEMP', '2.0'))

         if hasattr(vllm_config, 'draft_mix_lambda_max'):
             config.draft_mix_lambda_max = vllm_config.draft_mix_lambda_max
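
As a usage note, the else-branches above fall back to environment variables when vllm_config does not carry the new fields; a small illustrative snippet follows, where the surrounding script is an assumption and only the variable names and defaults come from the diff.

# Illustrative only: mirrors the env-var fallback read in from_cli_args above.
import os

# Override the new defaults before engine construction, e.g. for a temperature sweep:
os.environ["VLLM_DRAFT_Q_TEMP_OFFSET"] = "0.30"
os.environ["VLLM_DRAFT_Q_SOFT_TEMP"] = "1.5"

# When vllm_config lacks the attributes, the config falls back to these reads:
offset = float(os.environ.get("VLLM_DRAFT_Q_TEMP_OFFSET", "0.25"))
soft_temp = float(os.environ.get("VLLM_DRAFT_Q_SOFT_TEMP", "2.0"))
print(offset, soft_temp)  # 0.3 1.5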

vllm/v1/spec_decode/eagle.py

Lines changed: 12 additions & 10 deletions

@@ -268,22 +268,27 @@ def _sample_draft_tokens(
         keep_sorted = torch.zeros_like(sp, dtype=torch.bool)
         keep_sorted[..., 0] = True
         keep_sorted[..., 1:] = csum[..., :-1] < top_p  # STRICTLY below (correct rule)
+        keep_sorted[..., :2] = True  # Force min_keep=2 (prevents survivors=1)
         keep = torch.zeros_like(p, dtype=torch.bool).scatter(-1, si, keep_sorted)
         x = torch.where(keep, x, torch.full_like(x, float("-inf")))

-        # Optional smoothing with untempered baseline
-        probs_full = torch.softmax(x, dim=-1)
+        # Optional smoothing over kept set (uniform mix)
         lam = float(getattr(self.opt_config, "draft_mix_lambda_max", 0.0) or 0.0)
         print(f"[SMOOTH_DEBUG] lambda_max from config: {lam}, will run smoothing: {lam > 0.0}",
               file=sys.stderr, flush=True)
+        logp_full = torch.log_softmax(x, dim=-1)
         if lam > 0.0:
-            base = torch.softmax(logits_f32, dim=-1)  # untempered baseline
-            probs_full = (1.0 - lam) * probs_full + lam * base
-            probs_full = probs_full / probs_full.sum(dim=-1, keepdim=True)
-            logp_full = torch.log(probs_full.clamp_min(1e-20))
+            kept = torch.isfinite(logp_full)
+            p = torch.exp(logp_full)
+            # Uniform over survivors only
+            u = kept.float() / kept.float().sum(dim=-1, keepdim=True).clamp_min(1.0)
+            p = (1.0 - lam) * p + lam * u
+            p = p * kept  # Ensure dropped stay at 0
+            logp_full = torch.log(p.clamp_min(1e-45))

         # Sample token and gather its logp
-        tok = torch.distributions.Categorical(probs=probs_full).sample()
+        cat = torch.distributions.Categorical(logits=logp_full)
+        tok = cat.sample()
         tok_logp = logp_full.gather(-1, tok.unsqueeze(-1)).squeeze(-1)

         # Debug logging
@@ -318,9 +323,6 @@ def propose(
         sampling_metadata: SamplingMetadata,
         mm_embeds: Optional[list[torch.Tensor]] = None,
     ) -> torch.Tensor:
-        # Store sampling_metadata so _sample_draft_tokens() can access target temperature
-        self._current_sampling_metadata = sampling_metadata
-
         num_tokens = target_token_ids.shape[0]
         batch_size = next_token_ids.shape[0]
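
For context, here is a standalone toy version of the kept-set path above (illustrative only: the toy logits, vocabulary size, and the draft_sample wrapper are assumptions, not the vLLM code), showing that min_keep=2 plus the uniform mix keeps q below 1.0 even at ultracold temperatures.

# Toy reproduction of the new sampling path: nucleus mask with forced min_keep=2,
# then uniform smoothing over the surviving tokens only.
import torch

def draft_sample(logits, tau_q, top_p=0.95, lam=0.05):
    x = logits / tau_q
    sp, si = torch.sort(torch.softmax(x, dim=-1), dim=-1, descending=True)
    csum = sp.cumsum(dim=-1)
    keep_sorted = torch.zeros_like(sp, dtype=torch.bool)
    keep_sorted[..., 0] = True
    keep_sorted[..., 1:] = csum[..., :-1] < top_p  # strictly-below nucleus rule
    keep_sorted[..., :2] = True                    # force min_keep=2
    keep = torch.zeros_like(sp, dtype=torch.bool).scatter(-1, si, keep_sorted)
    x = torch.where(keep, x, torch.full_like(x, float("-inf")))

    logp = torch.log_softmax(x, dim=-1)
    if lam > 0.0:                                  # uniform mix over survivors only
        kept = torch.isfinite(logp)
        p = torch.exp(logp)
        u = kept.float() / kept.float().sum(dim=-1, keepdim=True).clamp_min(1.0)
        p = ((1.0 - lam) * p + lam * u) * kept     # dropped tokens stay at 0
        logp = torch.log(p.clamp_min(1e-45))
    tok = torch.distributions.Categorical(logits=logp).sample()
    return tok, logp, keep.sum(dim=-1)

# Ultra-cold draft logits: at tau_q=0.5 only the forced min_keep saves a second
# survivor and q stays near 1.0; at tau_q=2.0 more tokens survive and q drops.
logits = torch.tensor([[8.0, 6.0, 5.0, 4.0, 1.0]])
for tau in (0.5, 2.0):
    tok, logp, survivors = draft_sample(logits, tau)
    print(f"tau_q={tau}: survivors={survivors.item()}, max q={logp.exp().max().item():.3f}")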
