
Commit a0e0ff6

Merge branch 'main' into fix/issue-2586-logging-test-intermittent-failures
2 parents 2077a3a + 0bd4995

File tree

7 files changed: +19 −155 lines changed

components/backends/trtllm/engine_configs/llama4/eagle/eagle_agg.yaml

Lines changed: 13 additions & 11 deletions
````diff
@@ -14,24 +14,26 @@
 # limitations under the License.

 backend: pytorch
-tensor_parallel_size: 8
-moe_expert_parallel_size: 8
-max_batch_size: 1
-max_num_tokens: 8192
-max_seq_len: 8192
-print_iter_log: true
-disable_overlap_scheduler: true
+tensor_parallel_size: 4
+moe_expert_parallel_size: 4
+max_batch_size: 192
+max_num_tokens: 3072
+disable_overlap_scheduler: false

 # Enable Speculative Decoding in the model engine
 speculative_config:
   decoding_type: Eagle
   max_draft_len: 3
   speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3
-  eagle3_one_model: True
+  eagle3_one_model: true

 kv_cache_config:
-  free_gpu_memory_fraction: 0.5
+  free_gpu_memory_fraction: 0.2
   enable_block_reuse: false

-cache_transceiver_config:
-  backend: default
+cuda_graph_config:
+  enable_padding: true
+  batch_sizes: [1,2,3,4,5,6,7,8,16,32,48,64,128,190,191,192]
+
+print_iter_log: true
+
````
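For reference, a sketch of the aggregated config as it reads after this hunk, reconstructed from the diff above (the license header and other unchanged lines outside the hunk are elided; trailing comments are editorial):

```yaml
backend: pytorch
tensor_parallel_size: 4
moe_expert_parallel_size: 4
max_batch_size: 192
max_num_tokens: 3072
disable_overlap_scheduler: false

# Enable Speculative Decoding in the model engine
speculative_config:
  decoding_type: Eagle
  max_draft_len: 3
  speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3
  eagle3_one_model: true

kv_cache_config:
  free_gpu_memory_fraction: 0.2   # down from 0.5
  enable_block_reuse: false

cuda_graph_config:
  enable_padding: true
  batch_sizes: [1,2,3,4,5,6,7,8,16,32,48,64,128,190,191,192]

print_iter_log: true
```

The lower `free_gpu_memory_fraction` plausibly leaves headroom for the CUDA graphs captured at the larger batch sizes, though the commit does not say so explicitly.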
components/backends/trtllm/engine_configs/llama4/eagle/eagle_decode.yaml

Lines changed: 3 additions & 5 deletions
````diff
@@ -17,23 +17,21 @@ backend: pytorch
 tensor_parallel_size: 4
 moe_expert_parallel_size: 4
 max_batch_size: 256
-max_num_tokens: 512
+max_num_tokens: 1024
 # 8704 = 8192 ISL + 512 OSL
 max_seq_len: 8704
 disable_overlap_scheduler: true
-enable_autotuner: false

 # Enable Speculative Decoding in the model engine
 speculative_config:
   decoding_type: Eagle
-  max_draft_len: 1
+  max_draft_len: 3
   speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3
-  eagle3_one_model: false
+  eagle3_one_model: true

 kv_cache_config:
   free_gpu_memory_fraction: 0.5
   enable_block_reuse: false
-  dtype: fp8

 cuda_graph_config:
   enable_padding: true
````
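The net effect on the decode engine, shown as a sketch of the resulting fragment (comments are editorial annotations, not part of the file):

```yaml
max_batch_size: 256
max_num_tokens: 1024   # doubled from 512
# 8704 = 8192 ISL + 512 OSL
max_seq_len: 8704
disable_overlap_scheduler: true   # enable_autotuner: false was dropped

speculative_config:
  decoding_type: Eagle
  max_draft_len: 3               # was 1
  speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3
  eagle3_one_model: true         # was false (two-engine base/draft mode)

kv_cache_config:
  free_gpu_memory_fraction: 0.5
  enable_block_reuse: false      # dtype: fp8 was dropped
```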

components/backends/trtllm/engine_configs/llama4/eagle/eagle_prefill.yaml

Lines changed: 2 additions & 4 deletions
````diff
@@ -21,19 +21,17 @@ max_num_tokens: 8192
 max_seq_len: 8192
 print_iter_log: true
 disable_overlap_scheduler: true
-enable_autotuner: false

 # Enable Speculative Decoding in the model engine
 speculative_config:
   decoding_type: Eagle
-  max_draft_len: 1
+  max_draft_len: 3
   speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3
-  eagle3_one_model: false
+  eagle3_one_model: true

 kv_cache_config:
   free_gpu_memory_fraction: 0.5
   enable_block_reuse: false
-  dtype: fp8

 cache_transceiver_config:
   backend: default
````
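The prefill engine picks up the same speculative settings as decode; a sketch of the affected fragment after this hunk (comments editorial):

```yaml
disable_overlap_scheduler: true
# speculative_config now matches the decode engine's:
# max_draft_len: 3, eagle3_one_model: true

kv_cache_config:
  free_gpu_memory_fraction: 0.5
  enable_block_reuse: false   # the fp8 kv-cache dtype was dropped

cache_transceiver_config:
  backend: default
```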

components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_agg.yml

Lines changed: 0 additions & 38 deletions
This file was deleted.

components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_decode.yaml

Lines changed: 0 additions & 43 deletions
This file was deleted.

components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_prefill.yaml

Lines changed: 0 additions & 41 deletions
This file was deleted.

components/backends/trtllm/llama4_plus_eagle.md

Lines changed: 1 addition & 13 deletions
````diff
@@ -30,16 +30,7 @@ This guide demonstrates how to deploy Llama 4 Maverick Instruct with Eagle Specu
 For advanced control over how requests are routed between prefill and decode workers in disaggregated mode, refer to the [Disaggregation Strategy](./README.md#disaggregation-strategy) section.

 ## Notes
-* To run Eagle Speculative Decoding with Llama 4, ensure the container meets the following criteria:
-  * Built with a version of TensorRT-LLM based on the 0.21 release [Link](https://github.com/NVIDIA/TensorRT-LLM/tree/release/0.21)
-* If you need to download model weights off huggingface, make sure you run the command `huggingface-cli login` and have access to the necessary gated models.
-
-## Eagle3-one-model
-* Eagle3-one-model (`eagle3_one_model=True`) config is added in `engine_configs/llama4/eagle_one_model`. Build dynamo with the latest commit `66f299a` in TRTLLM 1.0.0.rc2 [Link](https://github.com/NVIDIA/TensorRT-LLM/commits/v1.0.0rc2/).
-* The configs in `engine_configs/llama4/eagle_one_model` are tested with 8xH100 cluster. Be sure to change the `NUM_GPUS_PER_NODE` accordingly or change TP/EP size in config. 1 8xH100 node for aggregated .yml file, 2 8xH100 for prefill/decode .yml file.
-* The current `./multinode/start_frontend_services.sh` may got ran `NUM_GPUS_PER_NODE` times depending on how srun/mpi is launched, beware that the frontend service only needs to be ran once.
-* Eagle3-one-model appends the eagle3 layer at the end of the TRTLLM engine, instead of sending base/draft requests between 2 engines. Visit TRTLLM for more information.
-
+* Make sure the (`eagle3_one_model: true`) is set in the LLM API config inside the `engine_configs/llama4/eagle` folder.

 ## Setup

````
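In practice, the remaining note means each LLM API config under `engine_configs/llama4/eagle` should carry a block like the following (values as set by this commit's YAML changes):

```yaml
speculative_config:
  decoding_type: Eagle
  max_draft_len: 3
  speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3
  eagle3_one_model: true
```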
````diff
@@ -66,7 +57,6 @@ export NUM_NODES=1
 export ENGINE_CONFIG="/mnt/engine_configs/llama4/eagle/eagle_agg.yaml"
 ./multinode/srun_aggregated.sh
 ```
-* Known Issue: In Aggregated Serving, setting `max_num_tokens` to higher values (e.g. `max_num_tokens: 8448`) can lead to Out of Memory (OOM) errors. This is being investigated by the TRTLLM team.

 ## Disaggregated Serving

````
````diff
@@ -77,8 +67,6 @@ export NUM_DECODE_NODES=1
 export DECODE_ENGINE_CONFIG="/mnt/engine_configs/llama4/eagle/eagle_decode.yaml"
 ./multinode/srun_disaggregated.sh
 ```
-* Known Issue: In Aggregated Serving, setting `max_num_tokens` to higher values (e.g. `max_num_tokens: 8448`) can lead to Out of Memory (OOM) errors. This is being investigated by the TRTLLM team.
-

 ## Example Request

````