
Commit 20d93dc

Add ngram-eagle SD method
Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
1 parent 23a6c52 commit 20d93dc

7 files changed: +127 -31 lines changed

examples/offline_inference/spec_decode.py

Lines changed: 28 additions & 2 deletions
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
+import json
 from transformers import AutoTokenizer
 
 from vllm import LLM, SamplingParams
@@ -53,9 +53,10 @@ def parse_args():
         "--method",
         type=str,
         default="eagle",
-        choices=["ngram", "eagle", "eagle3", "mtp"],
+        choices=["ngram", "eagle", "eagle3", "mtp", "ngram-eagle"],
     )
     parser.add_argument("--num-spec-tokens", type=int, default=2)
+    parser.add_argument("--num-speculative-tokens-per-method", type=str, default='{"ngram": 2, "eagle": 2}')
     parser.add_argument("--prompt-lookup-max", type=int, default=5)
     parser.add_argument("--prompt-lookup-min", type=int, default=2)
     parser.add_argument("--tp", type=int, default=1)
@@ -118,6 +119,22 @@ def main():
             "prompt_lookup_max": args.prompt_lookup_max,
             "prompt_lookup_min": args.prompt_lookup_min,
         }
+    elif args.method == "ngram-eagle":
+        num_speculative_tokens_per_method = json.loads(args.num_speculative_tokens_per_method)
+        eagle_dir = args.eagle_dir
+        if eagle_dir is None:
+            eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
+        args.num_spec_tokens = max(
+            num_speculative_tokens_per_method["ngram"],
+            num_speculative_tokens_per_method["eagle"],
+        )
+        speculative_config = {
+            "method": "ngram-eagle",
+            "model": eagle_dir,
+            "num_speculative_tokens_per_method": num_speculative_tokens_per_method,
+            "prompt_lookup_max": args.prompt_lookup_max,
+            "prompt_lookup_min": args.prompt_lookup_min,
+        }
     else:
         raise ValueError(f"unknown method: {args.method}")

@@ -150,6 +167,7 @@ def main():
         print("-" * 50)
         print(f"prompt: {output.prompt}")
         print(f"generated text: {output.outputs[0].text}")
+        print(f"num of generated tokens: {len(output.outputs[0].token_ids)}")
         print("-" * 50)
 
     try:
@@ -179,6 +197,10 @@ def main():
             assert isinstance(metric, Vector)
             for pos in range(len(metric.values)):
                 acceptance_counts[pos] += metric.values[pos]
+        elif metric.name == "vllm:generation_tokens":
+            assert isinstance(metric, Counter)
+            print(f"num generation tokens: {metric.value}")
+            total_tokens_generated = metric.value
 
     print("-" * 50)
     print(f"total_num_output_tokens: {total_num_output_tokens}")
@@ -187,6 +209,10 @@ def main():
     print(f"num_accepted_tokens: {num_accepted_tokens}")
     acceptance_length = 1 + (num_accepted_tokens / num_drafts) if num_drafts > 0 else 1
     print(f"mean acceptance length: {acceptance_length:.2f}")
+    num_tokens_generated_without_sd = total_tokens_generated - (num_drafts + num_accepted_tokens)
+    seq_normalized_acceptance_length = total_tokens_generated / (num_drafts + num_tokens_generated_without_sd)
+    print(f"num_tokens_generated_without_sd: {num_tokens_generated_without_sd}")
+    print(f"seq normalized acceptance length: {seq_normalized_acceptance_length:.2f}")
     print("-" * 50)
 
     # print acceptance at each token position
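To make the new metric concrete, here is a worked example with invented numbers, using the formulas from the hunk above:

```python
# Worked example (invented values) of the seq-normalized acceptance length.
total_tokens_generated = 120   # from the vllm:generation_tokens counter
num_drafts = 30                # speculation rounds issued
num_accepted_tokens = 60       # draft tokens accepted across all rounds
# Tokens the target model emitted outside speculative decoding:
num_tokens_generated_without_sd = total_tokens_generated - (num_drafts + num_accepted_tokens)
# The denominator approximates target-model forward passes, so this is
# generated tokens per forward pass:
seq_normalized_acceptance_length = total_tokens_generated / (num_drafts + num_tokens_generated_without_sd)
assert num_tokens_generated_without_sd == 30
assert seq_normalized_acceptance_length == 2.0  # two tokens per forward pass
```

And a minimal sketch of driving the new method programmatically, assuming a vLLM build that includes this commit; the target model name below is a stand-in, while the eagle head and lookup defaults come from the diff:

```python
from vllm import LLM, SamplingParams

# Mirrors the speculative_config the example script builds for "ngram-eagle".
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",  # hypothetical target model
    speculative_config={
        "method": "ngram-eagle",
        "model": "yuhuili/EAGLE-LLaMA3.1-Instruct-8B",  # eagle head from the diff
        "num_speculative_tokens_per_method": {"ngram": 2, "eagle": 2},
        "prompt_lookup_max": 5,
        "prompt_lookup_min": 2,
    },
)
outputs = llm.generate(["The capital of France is"],
                       SamplingParams(temperature=0, max_tokens=32))
print(outputs[0].outputs[0].text)
```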

vllm/benchmarks/datasets.py

Lines changed: 2 additions & 0 deletions
@@ -1278,6 +1278,8 @@ def normalize(d: dict) -> dict[tuple[int, int, int], float]:
 
 
 def get_samples(args, tokenizer) -> list[SampleRequest]:
+    if not hasattr(args, "request_id_prefix"):
+        args.request_id_prefix = ""
     if args.dataset_name == "custom":
         dataset = CustomDataset(dataset_path=args.dataset_path)
         input_requests = dataset.sample(
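A self-contained sketch of the guard added to `get_samples`; `SimpleNamespace` stands in for the parsed benchmark args:

```python
from types import SimpleNamespace

# Callers that never set request_id_prefix now get a safe default.
args = SimpleNamespace(dataset_name="custom", dataset_path="data.jsonl")
if not hasattr(args, "request_id_prefix"):
    args.request_id_prefix = ""
assert args.request_id_prefix == ""
```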

vllm/config/__init__.py

Lines changed: 28 additions & 7 deletions
@@ -1936,7 +1936,7 @@ def __post_init__(self):
         self.device = torch.device(self.device_type)
 
 
-SpeculativeMethod = Literal["ngram", "eagle", "eagle3", "medusa",
+SpeculativeMethod = Literal["ngram", "eagle", "eagle3", "ngram-eagle", "medusa",
                             "mlp_speculator", "draft_model", "deepseek_mtp",
                             "ernie_mtp"]

@@ -1950,6 +1950,9 @@ class SpeculativeConfig:
     num_speculative_tokens: SkipValidation[int] = None  # type: ignore
     """The number of speculative tokens, if provided. It will default to the
     number in the draft model config if present, otherwise, it is required."""
+    num_speculative_tokens_per_method: Optional[dict[str, int]] = None
+    """The number of speculative tokens for each method, if provided. The max
+    of the values will be used if `num_speculative_tokens` is not provided."""
     model: Optional[str] = None
     """The name of the draft model, eagle head, or additional weights, if
     provided."""
@@ -2109,6 +2112,18 @@ def __post_init__(self):
             raise ValueError("num_speculative_tokens was provided without "
                              "speculative model.")
 
+        # Set num_speculative_tokens from num_speculative_tokens_per_method
+        # for methods like ngram-eagle.
+        if self.num_speculative_tokens_per_method is not None:
+            max_num_speculative_tokens = max(
+                self.num_speculative_tokens_per_method.values())
+            if self.num_speculative_tokens is None:
+                self.num_speculative_tokens = max_num_speculative_tokens
+            else:
+                assert self.num_speculative_tokens <= max_num_speculative_tokens, (
+                    "num_speculative_tokens should be None or less than or equal "
+                    "to the max value in num_speculative_tokens_per_method.")
+
         # Automatically configure the method for ngram when "model" is used
         # instead of "method"
         if self.method is None and (self.model is not None
@@ -2118,6 +2133,8 @@ def __post_init__(self):
         if self.method in ("ngram", "[ngram]"):
             # Unified to "ngram" internally
             self.method = "ngram"
+
+        if self.method in ("ngram", "ngram-eagle"):
             # Set default values if not provided
             if (self.prompt_lookup_min is None
                     and self.prompt_lookup_max is None):
@@ -2148,9 +2165,13 @@ def __post_init__(self):
                 # draft related config as None here.
                 self.draft_model_config = self.target_model_config
                 self.draft_parallel_config = self.target_parallel_config
-        else:
-            self.prompt_lookup_max = 0
-            self.prompt_lookup_min = 0
+
+        # allow ngram-eagle to use this code block similar to eagle
+        if self.method != "ngram":
+
+            if self.method != "ngram-eagle":
+                self.prompt_lookup_max = 0
+                self.prompt_lookup_min = 0
 
             if self.model is not None:
                 self.draft_model_config = ModelConfig(
@@ -2179,7 +2200,7 @@ def __post_init__(self):
                 )
 
             # Automatically detect the method
-            if self.method in ('eagle', 'eagle3'):
+            if self.method in ('eagle', 'eagle3', 'ngram-eagle'):
                 pass
             elif "eagle-" in self.draft_model_config.model.lower() or \
                     "eagle3-" in self.draft_model_config.model.lower():
@@ -2216,7 +2237,7 @@ def __post_init__(self):
                     "eagle, or deepseek_mtp.")
 
             # Replace hf_config for EAGLE draft_model
-            if self.method in ("eagle", "eagle3"):
+            if self.method in ("eagle", "eagle3", "ngram-eagle"):
                 if self.enable_chunked_prefill and not envs.VLLM_USE_V1:
                     raise ValueError(
                         "Chunked prefill and EAGLE are not compatible "
@@ -2422,7 +2443,7 @@ def num_lookahead_slots(self) -> int:
         return self.num_speculative_tokens
 
     def use_eagle(self) -> bool:
-        return self.method in ("eagle", "eagle3", "deepseek_mtp", "ernie_mtp")
+        return self.method in ("eagle", "eagle3", "ngram-eagle", "deepseek_mtp", "ernie_mtp")
 
     def __repr__(self) -> str:
         method = self.method
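To make the new precedence concrete, here is a small self-contained sketch (a hypothetical helper, not part of the commit) of how `num_speculative_tokens` is resolved from `num_speculative_tokens_per_method` in `__post_init__`:

```python
from typing import Optional

def resolve_num_speculative_tokens(
        num_speculative_tokens: Optional[int],
        per_method: Optional[dict[str, int]]) -> Optional[int]:
    """Mirrors the __post_init__ logic added above (sketch)."""
    if per_method is None:
        return num_speculative_tokens
    max_tokens = max(per_method.values())
    if num_speculative_tokens is None:
        # Lookahead slots are sized for the largest per-method draft.
        return max_tokens
    assert num_speculative_tokens <= max_tokens
    return num_speculative_tokens

# ngram drafts up to 2 tokens, eagle up to 4 -> the overall budget is 4.
assert resolve_num_speculative_tokens(None, {"ngram": 2, "eagle": 4}) == 4
```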

vllm/transformers_utils/configs/eagle.py

Lines changed: 2 additions & 2 deletions
@@ -46,7 +46,7 @@ def __init__(self,
         # Eagle model name should follow naming convention of
         # LlamaForCausalLM -> EagleLlamaForCausalLM
         # LlamaForCausalLM -> Eagle3LlamaForCausalLM
-        if method == "eagle":
+        if method in ("eagle", "ngram-eagle"):
             assert self.model is not None, \
                 "model should not be None when method is eagle"
             kwargs["architectures"] = [
@@ -62,7 +62,7 @@ def __init__(self,
             ]
         else:
             raise ValueError(f"Invalid method {method}. "
-                             "Supported methods are eagle and eagle3.")
+                             "Supported methods are eagle, ngram-eagle and eagle3.")
 
         super().__init__(**kwargs)

vllm/v1/spec_decode/eagle.py

Lines changed: 10 additions & 2 deletions
@@ -61,8 +61,16 @@ def __init__(
         self.dtype = vllm_config.model_config.dtype
         self.max_model_len = vllm_config.model_config.max_model_len
         self.block_size = vllm_config.cache_config.block_size
-        self.num_speculative_tokens = (
-            self.speculative_config.num_speculative_tokens)
+
+        if self.method == "ngram-eagle":
+            self.num_speculative_tokens = (
+                self.speculative_config.num_speculative_tokens_per_method["eagle"])
+        else:
+            self.num_speculative_tokens = (
+                self.speculative_config.num_speculative_tokens)
+
+        logger.info("EagleProposer: method=%s, num_speculative_tokens=%s",
+                    self.method, self.num_speculative_tokens)
+
         self.max_num_tokens = (
             vllm_config.scheduler_config.max_num_batched_tokens)
         self.token_arange_np = np.arange(self.max_num_tokens)

vllm/v1/spec_decode/ngram_proposer.py

Lines changed: 9 additions & 1 deletion
@@ -6,6 +6,9 @@
 from numba import jit
 
 from vllm.config import VllmConfig
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
 
 
 class NgramProposer:
@@ -22,13 +25,18 @@ def __init__(self, vllm_config: VllmConfig):
         # Number of tokens follow the match. If there are less than k
         # tokens follow the match, we will return the maximum amount of
         # tokens until the end.
-        self.k = vllm_config.speculative_config.num_speculative_tokens
+        self.method = vllm_config.speculative_config.method
+        if self.method == "ngram-eagle":
+            self.k = vllm_config.speculative_config.num_speculative_tokens_per_method["ngram"]
+        else:
+            self.k = vllm_config.speculative_config.num_speculative_tokens
         # Maximum length of the model.
         self.max_model_len = vllm_config.model_config.max_model_len
 
         # Trigger Numba JIT compilation for N-gram proposer.
         # This usually takes less than 1 second.
         self.propose(np.zeros(1024, dtype=np.int32))
+        logger.info("NgramProposer: min_n=%s, max_n=%s, k=%s, max_model_len=%s", self.min_n, self.max_n, self.k, self.max_model_len)  # noqa: E501
 
     def propose(
         self,
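Taken together with the eagle.py change above, each proposer reads its own token budget under "ngram-eagle". A minimal sketch of that selection, with a plain dict standing in for `SpeculativeConfig`:

```python
def draft_budget(spec: dict, proposer: str) -> int:
    """Per-proposer speculative-token budget (illustrative, not vLLM API)."""
    if spec["method"] == "ngram-eagle":
        return spec["num_speculative_tokens_per_method"][proposer]
    return spec["num_speculative_tokens"]

spec = {
    "method": "ngram-eagle",
    "num_speculative_tokens": 3,  # max of the per-method values
    "num_speculative_tokens_per_method": {"ngram": 2, "eagle": 3},
}
assert draft_budget(spec, "ngram") == 2
assert draft_budget(spec, "eagle") == 3
```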

vllm/v1/worker/gpu_model_runner.py

Lines changed: 48 additions & 17 deletions
@@ -185,19 +185,26 @@ def __init__(
         # NOTE(Jiayi): currently we put the entire draft model on
         # the last PP rank. This is not ideal if there are many
         # layers in the draft model.
+        found_draft = False
         if self.speculative_config and get_pp_group().is_last_rank:
-            if self.speculative_config.method == "ngram":
-                self.drafter = NgramProposer(self.vllm_config)
-            elif self.speculative_config.use_eagle():
-                self.drafter = EagleProposer(self.vllm_config, self.device,
+            # Use ifs and not elifs to allow multiple
+            # draft models to be initialized.
+            if self.speculative_config.method == "ngram" \
+                    or self.speculative_config.method == "ngram-eagle":
+                self.drafter_ngram = NgramProposer(self.vllm_config)
+                found_draft = True
+            if self.speculative_config.use_eagle():
+                self.drafter_eagle = EagleProposer(self.vllm_config, self.device,
                                              self)  # type: ignore
                 if self.speculative_config.method == "eagle3":
                     self.use_aux_hidden_state_outputs = True
-            elif self.speculative_config.method == "medusa":
+                found_draft = True
+            if self.speculative_config.method == "medusa":
                 self.drafter = MedusaProposer(
                     vllm_config=self.vllm_config,
                     device=self.device)  # type: ignore
-            else:
+                found_draft = True
+            if not found_draft:
                 raise ValueError("Unknown speculative decoding method: "
                                  f"{self.speculative_config.method}")
         self.rejection_sampler = RejectionSampler()
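The switch from elif-dispatch to independent ifs is what lets a composite method own two drafters. A toy sketch of the pattern, with strings standing in for the proposer objects:

```python
def init_drafters(method: str, use_eagle: bool) -> dict[str, str]:
    """Toy version of the dispatch above; values stand in for proposers."""
    drafters: dict[str, str] = {}
    if method in ("ngram", "ngram-eagle"):
        drafters["ngram"] = "NgramProposer"
    if use_eagle:  # true for eagle, eagle3, ngram-eagle, and the mtp methods
        drafters["eagle"] = "EagleProposer"
    if method == "medusa":
        drafters["medusa"] = "MedusaProposer"
    if not drafters:
        raise ValueError(f"Unknown speculative decoding method: {method}")
    return drafters

assert set(init_drafters("ngram-eagle", use_eagle=True)) == {"ngram", "eagle"}
```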
@@ -1775,10 +1782,12 @@ def propose_draft_token_ids(
         common_attn_metadata: CommonAttentionMetadata,
     ) -> Union[list[list[int]], torch.Tensor]:
         num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
-        if self.speculative_config.method == "ngram":
-            assert isinstance(self.drafter, NgramProposer)
+        if self.speculative_config.method in ("ngram", "ngram-eagle"):
+            assert isinstance(self.drafter_ngram, NgramProposer)
             draft_token_ids = self.propose_ngram_draft_token_ids(
                 sampled_token_ids)
+            if self.speculative_config.method == "ngram-eagle":
+                draft_token_ids_ngram = draft_token_ids
         elif self.speculative_config.method == "medusa":
             assert isinstance(self.drafter, MedusaProposer)
             if sample_hidden_states.shape[0] == len(sampled_token_ids):
@@ -1799,8 +1808,8 @@ def propose_draft_token_ids(
                 target_hidden_states=hidden_states,
                 sampling_metadata=sampling_metadata,
             )
-        elif self.speculative_config.use_eagle():
-            assert isinstance(self.drafter, EagleProposer)
+        if self.speculative_config.use_eagle():
+            assert isinstance(self.drafter_eagle, EagleProposer)
             # TODO(woosuk): Refactor the loop.
             req_ids = self.input_batch.req_ids
             next_token_ids: list[int] = []
@@ -1842,7 +1851,7 @@ def propose_draft_token_ids(
             num_rejected_tokens_cpu = torch.tensor(num_rejected_tokens,
                                                    dtype=torch.int32)
             common_attn_metadata, token_indices =\
-                self.drafter.prepare_inputs(
+                self.drafter_eagle.prepare_inputs(
                 common_attn_metadata, num_rejected_tokens_cpu)
 
             target_token_ids = self.input_ids.gpu[token_indices]
@@ -1858,7 +1867,7 @@ def propose_draft_token_ids(
                 mm_embeds = self._gather_mm_embeddings(scheduler_output,
                                                        shift_computed_tokens=1)
 
-            draft_token_ids = self.drafter.propose(
+            draft_token_ids = self.drafter_eagle.propose(
                 target_token_ids=target_token_ids,
                 target_positions=target_positions,
                 target_hidden_states=target_hidden_states,
@@ -1867,6 +1876,25 @@ def propose_draft_token_ids(
                 common_attn_metadata=common_attn_metadata,
                 mm_embeds=mm_embeds,
             )
+            if self.speculative_config.method == "ngram-eagle":
+                draft_token_ids_eagle = draft_token_ids
+
+        if self.speculative_config.method == "ngram-eagle":
+            assert draft_token_ids_ngram is not None, "ngram proposer failed"
+            assert draft_token_ids_eagle is not None, "eagle proposer failed"
+            # The eagle draft is a torch tensor but we need a list.
+            draft_token_ids_eagle = draft_token_ids_eagle.tolist()
+            draft_token_ids = []
+
+            # Combine the ngram and eagle drafts:
+            # prefer the ngram draft when available and
+            # fall back to the eagle draft when the ngram draft is empty.
+            for bid in range(len(draft_token_ids_ngram)):
+                if len(draft_token_ids_ngram[bid]):
+                    draft_token_ids.append(draft_token_ids_ngram[bid])
+                else:
+                    draft_token_ids.append(draft_token_ids_eagle[bid])
+
         return draft_token_ids
 
     def propose_ngram_draft_token_ids(
@@ -1896,7 +1924,7 @@ def propose_ngram_draft_token_ids(
                 draft_token_ids.append([])
                 continue
 
-            drafter_output = self.drafter.propose(
+            drafter_output = self.drafter_ngram.propose(
                 self.input_batch.token_ids_cpu[i, :num_tokens])
             if drafter_output is None or len(drafter_output) == 0:
                 draft_token_ids.append([])
@@ -1963,6 +1991,9 @@ def load_model(self, eep_scale_up: bool = False) -> None:
         if hasattr(self, "drafter"):
             logger.info("Loading drafter model...")
             self.drafter.load_model(self.model)
+        if hasattr(self, "drafter_eagle"):
+            logger.info("Loading eagle drafter model...")
+            self.drafter_eagle.load_model(self.model)
         if self.use_aux_hidden_state_outputs:
             if supports_eagle3(self.model):
                 self.model.set_aux_hidden_state_layers(
@@ -2379,8 +2410,8 @@ def _dummy_run(
             hidden_states = outputs
 
         if self.speculative_config and self.speculative_config.use_eagle():
-            assert isinstance(self.drafter, EagleProposer)
-            self.drafter.dummy_run(num_tokens)
+            assert isinstance(self.drafter_eagle, EagleProposer)
+            self.drafter_eagle.dummy_run(num_tokens)
 
         # This is necessary to avoid blocking DP.
         # For dummy runs, we typically skip EPLB since we don't have any real
@@ -3133,10 +3164,10 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
         kv_caches = self.initialize_kv_cache_tensors(kv_cache_config)
 
         if self.speculative_config and self.speculative_config.use_eagle():
-            assert isinstance(self.drafter, EagleProposer)
+            assert isinstance(self.drafter_eagle, EagleProposer)
             # validate all draft model layers belong to the same kv cache
             # group
-            self.drafter.validate_same_kv_cache_group(kv_cache_config)
+            self.drafter_eagle.validate_same_kv_cache_group(kv_cache_config)
 
         if has_kv_transfer_group():
             get_kv_transfer_group().register_kv_caches(kv_caches)
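Finally, a standalone sketch of the draft-combination policy introduced in `propose_draft_token_ids` (prefer the ngram draft per request, fall back to the eagle draft when the ngram lookup found no match); the function name and values are illustrative:

```python
def combine_drafts(ngram_drafts: list[list[int]],
                   eagle_drafts: list[list[int]]) -> list[list[int]]:
    """Per request: take the ngram draft if non-empty, else the eagle draft."""
    return [n if n else e for n, e in zip(ngram_drafts, eagle_drafts)]

# Request 0 had an n-gram match; request 1 did not, so eagle fills in.
assert combine_drafts([[5, 6], []], [[7], [8, 9]]) == [[5, 6], [8, 9]]
```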
