From da3dc6155c2796c9317317671a0a7d7ef5d13ddd Mon Sep 17 00:00:00 2001 From: Ronald Xu Date: Mon, 19 May 2025 22:40:49 +0000 Subject: [PATCH 01/21] llama4 type eagle support in v1 Signed-off-by: Ronald Xu --- vllm/model_executor/models/llama4_eagle.py | 257 +++++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + 2 files changed, 258 insertions(+) create mode 100644 vllm/model_executor/models/llama4_eagle.py diff --git a/vllm/model_executor/models/llama4_eagle.py b/vllm/model_executor/models/llama4_eagle.py new file mode 100644 index 000000000000..409baf83871e --- /dev/null +++ b/vllm/model_executor/models/llama4_eagle.py @@ -0,0 +1,257 @@ +# SPDX-License-Identifier: Apache-2.0 + +from collections.abc import Iterable + +import torch +import torch.nn as nn + +from vllm.compilation.decorators import support_torch_compile +from vllm.config import VllmConfig +from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.llama4 import (Llama4DecoderLayer, + Llama4ForCausalLM) +from vllm.model_executor.models.utils import (AutoWeightsLoader, + extract_layer_index, + is_pp_missing_parameter, + maybe_prefix) + + +@support_torch_compile +class EagleLlama4Model(nn.Module): + + def __init__(self, + *, + vllm_config: VllmConfig, + prefix: str = "", + start_layer_id: int = 0): + + super().__init__() + self.config = ( + vllm_config.speculative_config.draft_model_config.hf_config) + self.vocab_size = self.config.vocab_size + self.embed_tokens = VocabParallelEmbedding( + self.config.vocab_size, + self.config.hidden_size, + prefix=maybe_prefix(prefix, "embed_tokens"), + ) + + if vllm_config.speculative_config.quantization: + self.quant_config = vllm_config.quant_config + else: + self.quant_config = None + + self.layers = nn.ModuleList([ + Llama4DecoderLayer( + config=self.config, + quant_config=self.quant_config, + prefix=maybe_prefix(prefix, f"layers.{start_layer_id}"), + ) + ]) + self.fc = torch.nn.Linear(self.config.hidden_size * 2, + self.config.hidden_size, + bias=False) + self.num_experts = self.config.num_local_experts + + self.norm = RMSNorm( + hidden_size=self.config.hidden_size, + eps=self.config.rms_norm_eps, + ) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> torch.Tensor: + input_embeds = self.embed_tokens(input_ids) + hidden_states = self.fc( + torch.cat((input_embeds, hidden_states), dim=-1)) + residual = None + + for layer in self.layers: + hidden_states, residual = layer( + positions, + hidden_states, + residual, + ) + + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states, hidden_states + + def load_moe_expert_weights( + self, + name: str, + loaded_weight: torch.Tensor, + params_dict: dict[str, nn.Parameter], + loaded_params: set[str], + expert_params_mapping: list[tuple[str, str, int, str]], + fused: bool = True, + ) -> bool: + expert_param_loaded = False + if "experts.gate_up_proj" in name: + loaded_weight = loaded_weight.chunk(2, dim=-1) + for (param_name, weight_name, expert_id, + shard_id) in expert_params_mapping: + new_loaded_weight = loaded_weight + if fused: + e_str, _, proj_str, _ = weight_name.split('.') + weight_name = 
f"{e_str}.{proj_str}" + param_name = f"{param_name}weight" + if weight_name not in name: + continue + full_param_name = name.replace(weight_name, param_name) + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): + continue + param = params_dict[full_param_name] + weight_loader = param.weight_loader + if fused: + if "w13" in full_param_name: + shard_idx = 0 if shard_id == "w1" else 1 + new_loaded_weight = new_loaded_weight[shard_idx] + new_loaded_weight = new_loaded_weight.transpose(-1, -2) + layer_idx = extract_layer_index(name) + # EP mapping + expert_map = self.layers[ + layer_idx].feed_forward.experts.expert_map + if expert_map is not None: + local_expert_indices = (expert_map != -1) \ + .nonzero() \ + .flatten() \ + .to(new_loaded_weight.device) + new_loaded_weight = new_loaded_weight[local_expert_indices] + expert_id = local_expert_indices[0].item() + else: + # TODO: add EP support for non fused weights + pass + weight_loader(param, + new_loaded_weight, + full_param_name, + shard_id=shard_id, + expert_id=expert_id) + + loaded_params.add(full_param_name) + expert_param_loaded = True + return expert_param_loaded + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".qkv_proj", ".q_proj", "q"), + (".qkv_proj", ".k_proj", "k"), + (".qkv_proj", ".v_proj", "v"), + (".gate_up_proj", ".gate_proj", 0), + (".gate_up_proj", ".up_proj", 1), + ] + fused_experts_params = False + expert_params_mapping = FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="gate_proj", + ckpt_down_proj_name="down_proj", + ckpt_up_proj_name="up_proj", + num_experts=self.num_experts) + expert_params_mapping_fused = FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="gate_up_proj", + ckpt_down_proj_name="down_proj", + ckpt_up_proj_name="gate_up_proj", + num_experts=1) + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if "experts.gate_up_proj" in name or "experts.down_proj" in name: + fused_experts_params = True + expert_params_mapping = expert_params_mapping_fused + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name or "experts" in name: + continue + name = name.replace(weight_name, param_name) + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + loaded_params.add(name) + break + else: + moe_loaded = self.load_moe_expert_weights( + name, + loaded_weight, + params_dict, + loaded_params, + expert_params_mapping, + fused=fused_experts_params) + + if not moe_loaded: + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + + return loaded_params + + +class 
EagleLlama4ForCausalLM(Llama4ForCausalLM): + + def __init__(self, *, vllm_config: VllmConfig, start_layer_id: int = 0): + nn.Module.__init__(self) + self.config = ( + vllm_config.speculative_config.draft_model_config.hf_config) + if start_layer_id > 0: + original_no_rope_layers = self.config.no_rope_layers + + # If start_layer_id is 0, we will hit NotImplementedError in + # vllm/v1/utils.py. If we don't pad no_rope_layers, will get + # index out of bounds in constructor of Llama4Attention layer. + self.config.no_rope_layers = [None] * start_layer_id + self.config.no_rope_layers.extend(original_no_rope_layers) + + self.model = EagleLlama4Model(vllm_config=vllm_config, + prefix="model", + start_layer_id=start_layer_id) + + self.lm_head = ParallelLMHead(num_embeddings=self.config.vocab_size, + embedding_dim=self.config.hidden_size) + + logit_scale = getattr(self.config, "logit_scale", 1.0) + self.logits_processor = LogitsProcessor(self.config.vocab_size, + scale=logit_scale) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> torch.Tensor: + return self.model(input_ids, positions, hidden_states) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + loader = AutoWeightsLoader(self) + + model_weights = {} + for name, loaded_weight in weights: + if "lm_head" not in name: + name = "model." + name + model_weights[name] = loaded_weight + loader.load_weights(model_weights.items()) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index c55f7ccd344f..b7c53eea1125 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -221,6 +221,7 @@ "MiMoMTPModel": ("mimo_mtp", "MiMoMTP"), "EAGLEModel": ("eagle", "EAGLE"), "EagleLlamaForCausalLM": ("llama_eagle", "EagleLlamaForCausalLM"), + "EagleLlama4ForCausalLM": ("llama4_eagle", "EagleLlama4ForCausalLM"), "Eagle3LlamaForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"), "DeepSeekMTPModel": ("deepseek_mtp", "DeepSeekMTP"), "MedusaModel": ("medusa", "Medusa"), From 06bfb261671f01d9b3d898369bf5c2a872cc7d85 Mon Sep 17 00:00:00 2001 From: Ronald Xu Date: Sun, 15 Jun 2025 00:06:45 +0000 Subject: [PATCH 02/21] updating code to match current standards. 
removed redundant lm_head Signed-off-by: Ronald Xu --- vllm/model_executor/models/llama4_eagle.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/llama4_eagle.py b/vllm/model_executor/models/llama4_eagle.py index 409baf83871e..f8afc3765722 100644 --- a/vllm/model_executor/models/llama4_eagle.py +++ b/vllm/model_executor/models/llama4_eagle.py @@ -214,10 +214,13 @@ def load_weights(self, weights: Iterable[tuple[str, class EagleLlama4ForCausalLM(Llama4ForCausalLM): - def __init__(self, *, vllm_config: VllmConfig, start_layer_id: int = 0): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): nn.Module.__init__(self) self.config = ( vllm_config.speculative_config.draft_model_config.hf_config) + + start_layer_id = vllm_config.model_config.get_num_layers( + vllm_config.parallel_config) if start_layer_id > 0: original_no_rope_layers = self.config.no_rope_layers @@ -231,9 +234,6 @@ def __init__(self, *, vllm_config: VllmConfig, start_layer_id: int = 0): prefix="model", start_layer_id=start_layer_id) - self.lm_head = ParallelLMHead(num_embeddings=self.config.vocab_size, - embedding_dim=self.config.hidden_size) - logit_scale = getattr(self.config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor(self.config.vocab_size, scale=logit_scale) From 40df89deb57098211e3b1c4b4664258c950fd23d Mon Sep 17 00:00:00 2001 From: Ronald Xu Date: Sun, 15 Jun 2025 00:09:26 +0000 Subject: [PATCH 03/21] add spdx filecopyright text Signed-off-by: Ronald Xu --- vllm/model_executor/models/llama4_eagle.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/models/llama4_eagle.py b/vllm/model_executor/models/llama4_eagle.py index f8afc3765722..b7c44cc7e44d 100644 --- a/vllm/model_executor/models/llama4_eagle.py +++ b/vllm/model_executor/models/llama4_eagle.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable From e9d9241b5441f4e9b96b02ef177f6bb48a02e10b Mon Sep 17 00:00:00 2001 From: Ronald Xu Date: Sun, 15 Jun 2025 00:16:24 +0000 Subject: [PATCH 04/21] fix linter Signed-off-by: Ronald Xu --- vllm/model_executor/models/llama4_eagle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/llama4_eagle.py b/vllm/model_executor/models/llama4_eagle.py index b7c44cc7e44d..2671f9ad8622 100644 --- a/vllm/model_executor/models/llama4_eagle.py +++ b/vllm/model_executor/models/llama4_eagle.py @@ -12,7 +12,7 @@ from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.vocab_parallel_embedding import ( - ParallelLMHead, VocabParallelEmbedding) + VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.llama4 import (Llama4DecoderLayer, Llama4ForCausalLM) From 25bf27612a07068ba1373248081a4e2b50ddb5d6 Mon Sep 17 00:00:00 2001 From: Ronald Xu Date: Sat, 21 Jun 2025 07:12:49 +0000 Subject: [PATCH 05/21] tests Signed-off-by: Ronald Xu --- tests/v1/e2e/test_spec_decode.py | 46 ++++++++++++++++++++---------- tests/v1/spec_decode/test_eagle.py | 29 ++++++++++++------- 2 files changed, 49 insertions(+), 26 deletions(-) diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 93e7c12f3a09..6f50bf7e9e9b 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -9,6 
+9,9 @@ from vllm import LLM, SamplingParams +TP8_REQUIRED_MODELS = [ + "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", +] @pytest.fixture def test_prompts(): @@ -53,14 +56,6 @@ def model_name(): return "meta-llama/Llama-3.1-8B-Instruct" -def eagle_model_name(): - return "yuhuili/EAGLE-LLaMA3.1-Instruct-8B" - - -def eagle3_model_name(): - return "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B" - - def test_ngram_correctness( monkeypatch: pytest.MonkeyPatch, test_prompts: list[list[dict[str, Any]]], @@ -105,13 +100,23 @@ def test_ngram_correctness( del spec_llm -@pytest.mark.parametrize("use_eagle3", [False, True], ids=["eagle", "eagle3"]) +@pytest.mark.parametrize("method_model_and_draft_model", + [( + "eagle", "meta-llama/Llama-3.1-8B-Instruct", + "yuhuili/EAGLE-LLaMA3.1-Instruct-8B" + ),( + "eagle", "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "ronaldbxu/EAGLE-Llama-4-Maverick-17B-128E-Instruct" + ),( + "eagle3","meta-llama/Llama-3.1-8B-Instruct", + "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B" + )], + ids=["llama3_eagle", "llama4_eagle", "llama3_eagle3"]) def test_eagle_correctness( monkeypatch: pytest.MonkeyPatch, test_prompts: list[list[dict[str, Any]]], sampling_config: SamplingParams, - model_name: str, - use_eagle3: bool, + method_model_and_draft_model: tuple[str, str], ): ''' Compare the outputs of a original LLM and a speculative LLM @@ -120,17 +125,28 @@ def test_eagle_correctness( with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") - ref_llm = LLM(model=model_name, max_model_len=2048) + model_name = method_model_and_draft_model[1] + + tp = 1 + + if model_name in TP8_REQUIRED_MODELS: + tp = 8 + + ref_llm = LLM(model=model_name, + tensor_parallel_size=tp, + max_model_len=2048) ref_outputs = ref_llm.chat(test_prompts, sampling_config) del ref_llm - spec_model_name = eagle3_model_name( - ) if use_eagle3 else eagle_model_name() + method = method_model_and_draft_model[0] + spec_model_name = method_model_and_draft_model[2] + spec_llm = LLM( model=model_name, trust_remote_code=True, + tensor_parallel_size=tp, speculative_config={ - "method": "eagle3" if use_eagle3 else "eagle", + "method": method, "model": spec_model_name, "num_speculative_tokens": 3, "max_model_len": 2048, diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index c93b7f57c041..ba1107eb756f 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -12,12 +12,15 @@ from vllm.model_executor.models.llama import LlamaForCausalLM from vllm.v1.spec_decode.eagle import EagleProposer -model_dir = "meta-llama/Llama-3.1-8B-Instruct" -eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B" -eagle3_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B" +llama3_model_dir = "meta-llama/Llama-3.1-8B-Instruct" +llama3_eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B" +llama3_eagle3_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B" +llama4_model_dir = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8" +llama4_eagle_dir = "ronaldbxu/EAGLE-Llama-4-Maverick-17B-128E-Instruct" -def _create_proposer(method: str, k: int) -> EagleProposer: + +def _create_proposer(method: str, model_dir: str, draft_model_dir: str, k: int) -> EagleProposer: model_config = ModelConfig(model=model_dir, task="generate", max_model_len=100, @@ -27,9 +30,6 @@ def _create_proposer(method: str, k: int) -> EagleProposer: seed=None, trust_remote_code=False) - # Choose model directory based on method - draft_model_dir = eagle_dir if method == "eagle" else eagle3_dir - speculative_config = SpeculativeConfig( 
target_model_config=model_config, target_parallel_config=ParallelConfig(), @@ -115,8 +115,9 @@ def test_prepare_inputs(): @pytest.mark.parametrize("method,proposer_helper", [ - ("eagle", lambda k: _create_proposer("eagle", k)), - ("eagle3", lambda k: _create_proposer("eagle3", k)), + ("eagle", lambda k: _create_proposer("eagle", llama3_model_dir, llama3_eagle_dir, k)), + ("eagle", lambda k: _create_proposer("eagle", llama4_model_dir, llama4_eagle_dir, k)), + ("eagle3", lambda k: _create_proposer("eagle3", llama3_model_dir, llama3_eagle3_dir, k)), ]) @pytest.mark.parametrize("pp_size", [1, 2]) @pytest.mark.parametrize("use_distinct_embed_tokens", [True, False]) @@ -196,7 +197,13 @@ class _TargetModelStub(LlamaForCausalLM): @pytest.mark.parametrize("num_speculative_tokens", [1, 3, 8]) -def test_propose(num_speculative_tokens): +@pytest.mark.parametrize("model_and_draft_model", [ + (llama3_model_dir, llama3_eagle_dir), + (llama4_model_dir, llama4_eagle_dir) +]) +def test_propose(num_speculative_tokens, model_and_draft_model): + model_dir = model_and_draft_model[0] + draft_model_dir = model_and_draft_model[1] # Use GPU device device = torch.device('cuda') @@ -208,7 +215,7 @@ def test_propose(num_speculative_tokens): vocab_size = 100 # Create proposer first so we can use its actual hidden_size - proposer = _create_proposer("eagle", num_speculative_tokens) + proposer = _create_proposer("eagle", model_dir, draft_model_dir, num_speculative_tokens) # Get the hidden_size from the proposer to ensure consistency hidden_size = proposer.hidden_size From 1868c120c65ce49225c0c966b4e2c2ae9fc26212 Mon Sep 17 00:00:00 2001 From: Ronald Xu Date: Sat, 21 Jun 2025 07:26:18 +0000 Subject: [PATCH 06/21] fix linter Signed-off-by: Ronald Xu --- tests/v1/e2e/test_spec_decode.py | 25 +++++++++++-------------- tests/v1/spec_decode/test_eagle.py | 21 +++++++++++++-------- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 6f50bf7e9e9b..6ac0d082d3d4 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -100,23 +100,20 @@ def test_ngram_correctness( del spec_llm -@pytest.mark.parametrize("method_model_and_draft_model", - [( - "eagle", "meta-llama/Llama-3.1-8B-Instruct", - "yuhuili/EAGLE-LLaMA3.1-Instruct-8B" - ),( - "eagle", "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", - "ronaldbxu/EAGLE-Llama-4-Maverick-17B-128E-Instruct" - ),( - "eagle3","meta-llama/Llama-3.1-8B-Instruct", - "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B" - )], - ids=["llama3_eagle", "llama4_eagle", "llama3_eagle3"]) +@pytest.mark.parametrize( + "method_model_and_draft_model", + [("eagle", "meta-llama/Llama-3.1-8B-Instruct", + "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"), + ("eagle", "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "ronaldbxu/EAGLE-Llama-4-Maverick-17B-128E-Instruct"), + ("eagle3", "meta-llama/Llama-3.1-8B-Instruct", + "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B")], + ids=["llama3_eagle", "llama4_eagle", "llama3_eagle3"]) def test_eagle_correctness( monkeypatch: pytest.MonkeyPatch, test_prompts: list[list[dict[str, Any]]], sampling_config: SamplingParams, - method_model_and_draft_model: tuple[str, str], + method_model_and_draft_model: tuple[str, str, str], ): ''' Compare the outputs of a original LLM and a speculative LLM @@ -132,7 +129,7 @@ def test_eagle_correctness( if model_name in TP8_REQUIRED_MODELS: tp = 8 - ref_llm = LLM(model=model_name, + ref_llm = LLM(model=model_name, tensor_parallel_size=tp, max_model_len=2048) 
ref_outputs = ref_llm.chat(test_prompts, sampling_config) diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index ba1107eb756f..cc031ff8e51d 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -115,9 +115,14 @@ def test_prepare_inputs(): @pytest.mark.parametrize("method,proposer_helper", [ - ("eagle", lambda k: _create_proposer("eagle", llama3_model_dir, llama3_eagle_dir, k)), - ("eagle", lambda k: _create_proposer("eagle", llama4_model_dir, llama4_eagle_dir, k)), - ("eagle3", lambda k: _create_proposer("eagle3", llama3_model_dir, llama3_eagle3_dir, k)), + ("eagle", + lambda k: _create_proposer("eagle", llama3_model_dir, llama3_eagle_dir, k) + ), + ("eagle", + lambda k: _create_proposer("eagle", llama4_model_dir, llama4_eagle_dir, k) + ), + ("eagle3", lambda k: _create_proposer("eagle3", llama3_model_dir, + llama3_eagle3_dir, k)), ]) @pytest.mark.parametrize("pp_size", [1, 2]) @pytest.mark.parametrize("use_distinct_embed_tokens", [True, False]) @@ -197,10 +202,9 @@ class _TargetModelStub(LlamaForCausalLM): @pytest.mark.parametrize("num_speculative_tokens", [1, 3, 8]) -@pytest.mark.parametrize("model_and_draft_model", [ - (llama3_model_dir, llama3_eagle_dir), - (llama4_model_dir, llama4_eagle_dir) -]) +@pytest.mark.parametrize("model_and_draft_model", + [(llama3_model_dir, llama3_eagle_dir), + (llama4_model_dir, llama4_eagle_dir)]) def test_propose(num_speculative_tokens, model_and_draft_model): model_dir = model_and_draft_model[0] draft_model_dir = model_and_draft_model[1] @@ -215,7 +219,8 @@ def test_propose(num_speculative_tokens, model_and_draft_model): vocab_size = 100 # Create proposer first so we can use its actual hidden_size - proposer = _create_proposer("eagle", model_dir, draft_model_dir, num_speculative_tokens) + proposer = _create_proposer("eagle", model_dir, draft_model_dir, + num_speculative_tokens) # Get the hidden_size from the proposer to ensure consistency hidden_size = proposer.hidden_size From 23136c84c5100311f9b2ddc882c7171fbbc375af Mon Sep 17 00:00:00 2001 From: Ronald Xu Date: Sat, 21 Jun 2025 07:38:20 +0000 Subject: [PATCH 07/21] fix linter Signed-off-by: Ronald Xu --- tests/v1/e2e/test_spec_decode.py | 1 + tests/v1/spec_decode/test_eagle.py | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 6ac0d082d3d4..f61a0c52d8b3 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -13,6 +13,7 @@ "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", ] + @pytest.fixture def test_prompts(): prompt_types = ["repeat", "sentence"] diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index cc031ff8e51d..ad9c7a41f6ce 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -20,7 +20,8 @@ llama4_eagle_dir = "ronaldbxu/EAGLE-Llama-4-Maverick-17B-128E-Instruct" -def _create_proposer(method: str, model_dir: str, draft_model_dir: str, k: int) -> EagleProposer: +def _create_proposer(method: str, model_dir: str, draft_model_dir: str, + k: int) -> EagleProposer: model_config = ModelConfig(model=model_dir, task="generate", max_model_len=100, @@ -219,7 +220,7 @@ def test_propose(num_speculative_tokens, model_and_draft_model): vocab_size = 100 # Create proposer first so we can use its actual hidden_size - proposer = _create_proposer("eagle", model_dir, draft_model_dir, + proposer = _create_proposer("eagle", model_dir, 
draft_model_dir, num_speculative_tokens) # Get the hidden_size from the proposer to ensure consistency hidden_size = proposer.hidden_size From 5c65200a04d1a1d0a3231ce37084c34ad1c50367 Mon Sep 17 00:00:00 2001 From: Ronald Xu Date: Sat, 21 Jun 2025 07:46:08 +0000 Subject: [PATCH 08/21] remove whitespace Signed-off-by: Ronald Xu --- tests/v1/spec_decode/test_eagle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index ad9c7a41f6ce..7977b1352074 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -20,7 +20,7 @@ llama4_eagle_dir = "ronaldbxu/EAGLE-Llama-4-Maverick-17B-128E-Instruct" -def _create_proposer(method: str, model_dir: str, draft_model_dir: str, +def _create_proposer(method: str, model_dir: str, draft_model_dir: str, k: int) -> EagleProposer: model_config = ModelConfig(model=model_dir, task="generate", From 1de5b8470d4481f8431bdca673ada3923aef5913 Mon Sep 17 00:00:00 2001 From: Ronald Xu Date: Tue, 24 Jun 2025 23:05:18 +0000 Subject: [PATCH 09/21] split tests Signed-off-by: Ronald Xu --- tests/v1/e2e/test_llama4_eagle.py | 119 ++++++++++++++++++++++++++++++ tests/v1/e2e/test_spec_decode.py | 15 +--- 2 files changed, 120 insertions(+), 14 deletions(-) create mode 100644 tests/v1/e2e/test_llama4_eagle.py diff --git a/tests/v1/e2e/test_llama4_eagle.py b/tests/v1/e2e/test_llama4_eagle.py new file mode 100644 index 000000000000..c517f621ed63 --- /dev/null +++ b/tests/v1/e2e/test_llama4_eagle.py @@ -0,0 +1,119 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# To run this file, run +# pytest /path/to/vllm/tests/v1/e2e/test_llama4_eagle.py + +from __future__ import annotations + +import random +from typing import Any + +import pytest + +from vllm import LLM, SamplingParams + +TP8_REQUIRED_MODELS = [ + "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", +] + + +@pytest.fixture +def test_prompts(): + prompt_types = ["repeat", "sentence"] + num_prompts = 100 + prompts = [] + + random.seed(0) + random_prompt_type_choices = random.choices(prompt_types, k=num_prompts) + + # Generate a mixed batch of prompts, some of which can be easily + # predicted by n-gram matching and some which likely cannot. + for kind in random_prompt_type_choices: + word_choices = ["test", "temp", "hello", "where"] + word = random.choice(word_choices) + if kind == "repeat": + prompt = f""" + please repeat the word '{word}' 10 times. + give no other output than the word at least ten times in a row, + in lowercase with spaces between each word and without quotes. + """ + elif kind == "sentence": + prompt = f""" + please give a ten-word sentence that + uses the word {word} at least once. + give no other output than that simple sentence without quotes. 
+ """ + else: + raise ValueError(f"Unknown prompt type: {kind}") + prompts.append([{"role": "user", "content": prompt}]) + + return prompts + + +@pytest.fixture +def sampling_config(): + return SamplingParams(temperature=0, max_tokens=10, ignore_eos=False) + + +@pytest.mark.parametrize( + "method_model_and_draft_model", + [("eagle", "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "ronaldbxu/EAGLE-Llama-4-Maverick-17B-128E-Instruct")], + ids=["llama4_eagle",]) +def test_eagle_correctness( + monkeypatch: pytest.MonkeyPatch, + test_prompts: list[list[dict[str, Any]]], + sampling_config: SamplingParams, + method_model_and_draft_model: tuple[str, str, str], +): + ''' + Compare the outputs of a original LLM and a speculative LLM + should be the same when using eagle speculative decoding. + ''' + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + model_name = method_model_and_draft_model[1] + + tp = 1 + + if model_name in TP8_REQUIRED_MODELS: + tp = 8 + + ref_llm = LLM(model=model_name, + tensor_parallel_size=tp, + max_model_len=2048) + ref_outputs = ref_llm.chat(test_prompts, sampling_config) + del ref_llm + + method = method_model_and_draft_model[0] + spec_model_name = method_model_and_draft_model[2] + + spec_llm = LLM( + model=model_name, + trust_remote_code=True, + tensor_parallel_size=tp, + speculative_config={ + "method": method, + "model": spec_model_name, + "num_speculative_tokens": 3, + "max_model_len": 2048, + }, + max_model_len=2048, + ) + spec_outputs = spec_llm.chat(test_prompts, sampling_config) + matches = 0 + misses = 0 + for ref_output, spec_output in zip(ref_outputs, spec_outputs): + if ref_output.outputs[0].text == spec_output.outputs[0].text: + matches += 1 + else: + misses += 1 + print(f"ref_output: {ref_output.outputs[0].text}") + print(f"spec_output: {spec_output.outputs[0].text}") + + # Heuristic: expect at least 66% of the prompts to match exactly + # Upon failure, inspect the outputs to check for inaccuracy. 
+ assert matches > int(0.66 * len(ref_outputs)) + del spec_llm diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index f61a0c52d8b3..edf7c59a061b 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -9,10 +9,6 @@ from vllm import LLM, SamplingParams -TP8_REQUIRED_MODELS = [ - "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", -] - @pytest.fixture def test_prompts(): @@ -105,11 +101,9 @@ def test_ngram_correctness( "method_model_and_draft_model", [("eagle", "meta-llama/Llama-3.1-8B-Instruct", "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"), - ("eagle", "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", - "ronaldbxu/EAGLE-Llama-4-Maverick-17B-128E-Instruct"), ("eagle3", "meta-llama/Llama-3.1-8B-Instruct", "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B")], - ids=["llama3_eagle", "llama4_eagle", "llama3_eagle3"]) + ids=["llama3_eagle", "llama3_eagle3"]) def test_eagle_correctness( monkeypatch: pytest.MonkeyPatch, test_prompts: list[list[dict[str, Any]]], @@ -125,13 +119,7 @@ def test_eagle_correctness( model_name = method_model_and_draft_model[1] - tp = 1 - - if model_name in TP8_REQUIRED_MODELS: - tp = 8 - ref_llm = LLM(model=model_name, - tensor_parallel_size=tp, max_model_len=2048) ref_outputs = ref_llm.chat(test_prompts, sampling_config) del ref_llm @@ -142,7 +130,6 @@ def test_eagle_correctness( spec_llm = LLM( model=model_name, trust_remote_code=True, - tensor_parallel_size=tp, speculative_config={ "method": method, "model": spec_model_name, From 89fdd43d849e46dd3552686086fcd03f86d6e8ac Mon Sep 17 00:00:00 2001 From: Ronald Xu Date: Tue, 24 Jun 2025 23:14:19 +0000 Subject: [PATCH 10/21] fix linter Signed-off-by: Ronald Xu --- tests/v1/e2e/test_llama4_eagle.py | 6 ++++-- tests/v1/e2e/test_spec_decode.py | 16 +++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/v1/e2e/test_llama4_eagle.py b/tests/v1/e2e/test_llama4_eagle.py index c517f621ed63..41d534a1dc0f 100644 --- a/tests/v1/e2e/test_llama4_eagle.py +++ b/tests/v1/e2e/test_llama4_eagle.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# To run this file, run +# To run this file, run # pytest /path/to/vllm/tests/v1/e2e/test_llama4_eagle.py from __future__ import annotations @@ -60,7 +60,9 @@ def sampling_config(): "method_model_and_draft_model", [("eagle", "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", "ronaldbxu/EAGLE-Llama-4-Maverick-17B-128E-Instruct")], - ids=["llama4_eagle",]) + ids=[ + "llama4_eagle", + ]) def test_eagle_correctness( monkeypatch: pytest.MonkeyPatch, test_prompts: list[list[dict[str, Any]]], diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index edf7c59a061b..242310f385c4 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -97,13 +97,12 @@ def test_ngram_correctness( del spec_llm -@pytest.mark.parametrize( - "method_model_and_draft_model", - [("eagle", "meta-llama/Llama-3.1-8B-Instruct", - "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"), - ("eagle3", "meta-llama/Llama-3.1-8B-Instruct", - "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B")], - ids=["llama3_eagle", "llama3_eagle3"]) +@pytest.mark.parametrize("method_model_and_draft_model", + [("eagle", "meta-llama/Llama-3.1-8B-Instruct", + "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"), + ("eagle3", "meta-llama/Llama-3.1-8B-Instruct", + "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B")], + ids=["llama3_eagle", "llama3_eagle3"]) def test_eagle_correctness( monkeypatch: 
pytest.MonkeyPatch, test_prompts: list[list[dict[str, Any]]], @@ -119,8 +118,7 @@ def test_eagle_correctness( model_name = method_model_and_draft_model[1] - ref_llm = LLM(model=model_name, - max_model_len=2048) + ref_llm = LLM(model=model_name, max_model_len=2048) ref_outputs = ref_llm.chat(test_prompts, sampling_config) del ref_llm From 74ae303f8e9a889ab545e92d568837d6de85526c Mon Sep 17 00:00:00 2001 From: Ronald Xu Date: Wed, 25 Jun 2025 21:54:18 +0000 Subject: [PATCH 11/21] address comments 1 Signed-off-by: Ronald Xu --- tests/v1/e2e/test_spec_decode.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 242310f385c4..01cac773bc2f 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -116,15 +116,12 @@ def test_eagle_correctness( with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") - model_name = method_model_and_draft_model[1] + method, model_name, spec_model_name = method_model_and_draft_model ref_llm = LLM(model=model_name, max_model_len=2048) ref_outputs = ref_llm.chat(test_prompts, sampling_config) del ref_llm - method = method_model_and_draft_model[0] - spec_model_name = method_model_and_draft_model[2] - spec_llm = LLM( model=model_name, trust_remote_code=True, From ea6cca9cd4fe1b6721d0b814eda0f5a4811cd8b9 Mon Sep 17 00:00:00 2001 From: Ronald Xu Date: Wed, 25 Jun 2025 22:08:40 +0000 Subject: [PATCH 12/21] address comments 2 Signed-off-by: Ronald Xu --- tests/v1/e2e/test_llama4_eagle.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/tests/v1/e2e/test_llama4_eagle.py b/tests/v1/e2e/test_llama4_eagle.py index 41d534a1dc0f..56574ab7af52 100644 --- a/tests/v1/e2e/test_llama4_eagle.py +++ b/tests/v1/e2e/test_llama4_eagle.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project # To run this file, run -# pytest /path/to/vllm/tests/v1/e2e/test_llama4_eagle.py +# pytest -vx /tests/v1/e2e/test_llama4_eagle.py from __future__ import annotations @@ -13,10 +13,6 @@ from vllm import LLM, SamplingParams -TP8_REQUIRED_MODELS = [ - "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", -] - @pytest.fixture def test_prompts(): @@ -76,12 +72,9 @@ def test_eagle_correctness( with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") - model_name = method_model_and_draft_model[1] - - tp = 1 + method, model_name, spec_model_name = method_model_and_draft_model - if model_name in TP8_REQUIRED_MODELS: - tp = 8 + tp = 8 ref_llm = LLM(model=model_name, tensor_parallel_size=tp, @@ -89,9 +82,6 @@ def test_eagle_correctness( ref_outputs = ref_llm.chat(test_prompts, sampling_config) del ref_llm - method = method_model_and_draft_model[0] - spec_model_name = method_model_and_draft_model[2] - spec_llm = LLM( model=model_name, trust_remote_code=True, From e6679185d26562851db224e289f20909d09fbfe1 Mon Sep 17 00:00:00 2001 From: Ronald Xu Date: Thu, 26 Jun 2025 20:26:50 +0000 Subject: [PATCH 13/21] fix registry test Signed-off-by: Ronald Xu --- tests/models/registry.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/models/registry.py b/tests/models/registry.py index 49510af880cf..5986b89fe69f 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -424,6 +424,10 @@ def check_available_online( trust_remote_code=True, speculative_model="yuhuili/EAGLE-LLaMA3-Instruct-8B", tokenizer="meta-llama/Meta-Llama-3-8B-Instruct"), # noqa: E501 + "EagleLlama4ForCausalLM": 
_HfExamplesInfo("ronaldbxu/EAGLE-Llama-4-Maverick-17B-128E-Instruct", + trust_remote_code=True, + speculative_model="ronaldbxu/EAGLE-Llama-4-Maverick-17B-128E-Instruct", + tokenizer="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"), "Eagle3LlamaForCausalLM": _HfExamplesInfo("yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", # noqa: E501 trust_remote_code=True, speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", From d342950f86cbde31f036f836acdce8fd25965672 Mon Sep 17 00:00:00 2001 From: Ronald Xu Date: Thu, 26 Jun 2025 20:34:56 +0000 Subject: [PATCH 14/21] fix linter Signed-off-by: Ronald Xu --- tests/models/registry.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index 5986b89fe69f..b6efd2a25683 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -424,10 +424,10 @@ def check_available_online( trust_remote_code=True, speculative_model="yuhuili/EAGLE-LLaMA3-Instruct-8B", tokenizer="meta-llama/Meta-Llama-3-8B-Instruct"), # noqa: E501 - "EagleLlama4ForCausalLM": _HfExamplesInfo("ronaldbxu/EAGLE-Llama-4-Maverick-17B-128E-Instruct", + "EagleLlama4ForCausalLM": _HfExamplesInfo("ronaldbxu/EAGLE-Llama-4-Maverick-17B-128E-Instruct", # noqa: E501 trust_remote_code=True, - speculative_model="ronaldbxu/EAGLE-Llama-4-Maverick-17B-128E-Instruct", - tokenizer="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"), + speculative_model="ronaldbxu/EAGLE-Llama-4-Maverick-17B-128E-Instruct", # noqa: E501 + tokenizer="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"), # noqa: E501 "Eagle3LlamaForCausalLM": _HfExamplesInfo("yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", # noqa: E501 trust_remote_code=True, speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", From a850d1e9904e6d5145f72f92f2ded8a45e5f7bb7 Mon Sep 17 00:00:00 2001 From: Ronald Xu Date: Sun, 29 Jun 2025 20:33:38 +0000 Subject: [PATCH 15/21] skip initialization test Signed-off-by: Ronald Xu --- tests/models/test_initialization.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index df72607767fd..4d17f9b94f0c 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -25,6 +25,10 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch): if model_arch == "GraniteSpeechForConditionalGeneration": pytest.skip("Avoid OOM") + # FIXME: Enable once V1 is supported + if model_arch == "EagleLlama4ForCausalLM": + pytest.skip("Not supported on V0 engine") + # Avoid OOM and reduce initialization time by only using 1 layer def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig: hf_config.update(model_info.hf_overrides) From 32edc20450ec3cc5578ccf4ee4899e95e8dfe5f2 Mon Sep 17 00:00:00 2001 From: Ronald Xu Date: Mon, 30 Jun 2025 00:29:49 +0000 Subject: [PATCH 16/21] ignore llama4 test Signed-off-by: Ronald Xu --- .buildkite/test-pipeline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index a13e2cb78218..f5ff06961f20 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -260,7 +260,7 @@ steps: - pytest -v -s v1/test_metrics_reader.py # TODO: accuracy does not match, whether setting # VLLM_USE_FLASHINFER_SAMPLER or not on H100. - - pytest -v -s v1/e2e + - pytest -v -s v1/e2e --ignore=v1/e2e/test_llama4_eagle.py # Integration test for streaming correctness (requires special branch). 
- pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine From 07c5c8c20ddd29bff9be4ddf7633f816b22eed32 Mon Sep 17 00:00:00 2001 From: Ronald Xu Date: Mon, 30 Jun 2025 01:30:07 +0000 Subject: [PATCH 17/21] update initialization test Signed-off-by: Ronald Xu --- tests/models/registry.py | 15 +++++++++++++-- tests/models/test_initialization.py | 9 +++++---- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index d31dfc7914c0..5e5778185b1e 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -32,6 +32,12 @@ class _HfExamplesInfo: for speculative decoding. """ + speculative_method: Optional[str] = None + """ + The default speculative method to use for testing this architecture, which + is only used for speculative decoding. + """ + min_transformers_version: Optional[str] = None """ The minimum version of HF Transformers that is required to run this model. @@ -61,6 +67,9 @@ class _HfExamplesInfo: v0_only: bool = False """The model is only available with the vLLM V0 engine.""" + v1_only: bool = False + """The model is only available with the vLLM V1 engine.""" + hf_overrides: dict[str, Any] = field(default_factory=dict) """The ``hf_overrides`` required to load the model.""" @@ -434,10 +443,12 @@ def check_available_online( trust_remote_code=True, speculative_model="yuhuili/EAGLE-LLaMA3-Instruct-8B", tokenizer="meta-llama/Meta-Llama-3-8B-Instruct"), # noqa: E501 - "EagleLlama4ForCausalLM": _HfExamplesInfo("ronaldbxu/EAGLE-Llama-4-Maverick-17B-128E-Instruct", # noqa: E501 + "EagleLlama4ForCausalLM": _HfExamplesInfo("meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", # noqa: E501 trust_remote_code=True, speculative_model="ronaldbxu/EAGLE-Llama-4-Maverick-17B-128E-Instruct", # noqa: E501 - tokenizer="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"), # noqa: E501 + tokenizer="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", # noqa: E501 + speculative_method="eagle", + v1_only=True), "Eagle3LlamaForCausalLM": _HfExamplesInfo("yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", # noqa: E501 trust_remote_code=True, speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index 4d17f9b94f0c..2337bd9fb88d 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -25,10 +25,6 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch): if model_arch == "GraniteSpeechForConditionalGeneration": pytest.skip("Avoid OOM") - # FIXME: Enable once V1 is supported - if model_arch == "EagleLlama4ForCausalLM": - pytest.skip("Not supported on V0 engine") - # Avoid OOM and reduce initialization time by only using 1 layer def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig: hf_config.update(model_info.hf_overrides) @@ -88,12 +84,17 @@ def _initialize_kv_caches_v1(self, vllm_config): _initialize_kv_caches_v1), monkeypatch.context() as m): if model_info.v0_only: m.setenv("VLLM_USE_V1", "0") + if model_info.v1_only: + m.setenv("VLLM_USE_V1", "1") + LLM( model_info.default, tokenizer=model_info.tokenizer, tokenizer_mode=model_info.tokenizer_mode, revision=model_info.revision, speculative_config={ + "method": model_info.speculative_method + if model_info.speculative_method else None, "model": model_info.speculative_model, "num_speculative_tokens": 1, } if 
model_info.speculative_model else None, From 815a8a2a61fbfca7342ec90bb28541bae9ec6f42 Mon Sep 17 00:00:00 2001 From: Ronald Xu Date: Mon, 30 Jun 2025 01:38:41 +0000 Subject: [PATCH 18/21] fix linter Signed-off-by: Ronald Xu --- tests/models/test_initialization.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index 2337bd9fb88d..8e0bee1940dd 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -93,10 +93,13 @@ def _initialize_kv_caches_v1(self, vllm_config): tokenizer_mode=model_info.tokenizer_mode, revision=model_info.revision, speculative_config={ - "method": model_info.speculative_method - if model_info.speculative_method else None, - "model": model_info.speculative_model, - "num_speculative_tokens": 1, + "method": + model_info.speculative_method + if model_info.speculative_method else None, + "model": + model_info.speculative_model, + "num_speculative_tokens": + 1, } if model_info.speculative_model else None, trust_remote_code=model_info.trust_remote_code, max_model_len=model_info.max_model_len, From 96f22bd464f75f6829c2bfbb17591d2418f87b88 Mon Sep 17 00:00:00 2001 From: Ronald Xu Date: Mon, 30 Jun 2025 04:18:54 +0000 Subject: [PATCH 19/21] change to scout Signed-off-by: Ronald Xu --- tests/models/registry.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index 5e5778185b1e..a027db02e868 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -443,11 +443,12 @@ def check_available_online( trust_remote_code=True, speculative_model="yuhuili/EAGLE-LLaMA3-Instruct-8B", tokenizer="meta-llama/Meta-Llama-3-8B-Instruct"), # noqa: E501 - "EagleLlama4ForCausalLM": _HfExamplesInfo("meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", # noqa: E501 + "EagleLlama4ForCausalLM": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501 trust_remote_code=True, speculative_model="ronaldbxu/EAGLE-Llama-4-Maverick-17B-128E-Instruct", # noqa: E501 - tokenizer="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", # noqa: E501 + tokenizer="meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501 speculative_method="eagle", + max_model_len=10240, v1_only=True), "Eagle3LlamaForCausalLM": _HfExamplesInfo("yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", # noqa: E501 trust_remote_code=True, From 963f57c2f00192d1703cc03c2549aab00eec2b53 Mon Sep 17 00:00:00 2001 From: Ronald Xu Date: Mon, 30 Jun 2025 07:42:12 +0000 Subject: [PATCH 20/21] change max model len Signed-off-by: Ronald Xu --- tests/models/registry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index a027db02e868..3d5677a5a957 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -448,7 +448,7 @@ def check_available_online( speculative_model="ronaldbxu/EAGLE-Llama-4-Maverick-17B-128E-Instruct", # noqa: E501 tokenizer="meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501 speculative_method="eagle", - max_model_len=10240, + max_model_len=256, v1_only=True), "Eagle3LlamaForCausalLM": _HfExamplesInfo("yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", # noqa: E501 trust_remote_code=True, From 7ef96beea926d5e88256969485b0f8ae36d0fe8f Mon Sep 17 00:00:00 2001 From: Ronald Xu Date: Tue, 8 Jul 2025 14:08:21 +0000 Subject: [PATCH 21/21] skip test Signed-off-by: Ronald Xu --- tests/models/test_initialization.py | 3 ++- 1 file changed, 2 
insertions(+), 1 deletion(-) diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index 4ed0a484336f..1094f8ad2ceb 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -23,7 +23,8 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch): # FIXME: Possible memory leak in the previous tests? if model_arch in ("GraniteSpeechForConditionalGeneration", - "KimiVLForConditionalGeneration"): + "KimiVLForConditionalGeneration", + "EagleLlama4ForCausalLM"): pytest.skip("Avoid OOM") # Avoid OOM and reduce initialization time by only using 1 layer
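
For reference, a minimal usage sketch of the feature this series adds, assembled from the e2e test introduced in tests/v1/e2e/test_llama4_eagle.py (patch 09). It is a sketch under assumptions, not part of the patches: it assumes vLLM is built from this branch with the V1 engine enabled and that 8 GPUs are available for the FP8 Maverick target model; the draft-model repository name is the one used in the tests and may change.

# Llama 4 EAGLE speculative decoding, mirroring the e2e test in this series.
# Assumes this branch of vLLM, the V1 engine, and 8 GPUs for the target model.
import os

from vllm import LLM, SamplingParams

# The tests in this series force the V1 engine via monkeypatch; do the same here.
os.environ["VLLM_USE_V1"] = "1"

llm = LLM(
    model="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
    tensor_parallel_size=8,
    max_model_len=2048,
    trust_remote_code=True,
    speculative_config={
        "method": "eagle",
        # Draft model used by the tests in this series.
        "model": "ronaldbxu/EAGLE-Llama-4-Maverick-17B-128E-Instruct",
        "num_speculative_tokens": 3,
        "max_model_len": 2048,
    },
)

# Same sampling settings as the test fixture.
params = SamplingParams(temperature=0, max_tokens=10, ignore_eos=False)
outputs = llm.chat(
    [[{"role": "user", "content": "please repeat the word 'test' 10 times."}]],
    params,
)
print(outputs[0].outputs[0].text)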