components/backends/trtllm/engine_configs/decode.yaml (1 addition, 1 deletion)
@@ -28,4 +28,4 @@ kv_cache_config:
   free_gpu_memory_fraction: 0.95
 
 cache_transceiver_config:
-  backend: default
\ No newline at end of file
+  backend: default
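Every config touched by this PR gains the same trailing block, so this first hunk is representative. A minimal sketch of the resulting tail of decode.yaml follows; the comments describing the block's role and the alternative backend values are assumptions drawn from TensorRT-LLM's disaggregated-serving documentation, not from this diff:

```yaml
# Sketch: tail of an engine config after this PR.
kv_cache_config:
  free_gpu_memory_fraction: 0.95

# Assumption: in TensorRT-LLM disaggregated serving, the cache transceiver
# ships KV-cache blocks between prefill and decode engines; "default" lets
# the runtime pick the transport (UCX, NIXL, and MPI are other documented
# backend values).
cache_transceiver_config:
  backend: default
```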
@@ -51,4 +51,7 @@ cuda_graph_config:
     - 128
     - 256
 
-print_iter_log: true
\ No newline at end of file
+print_iter_log: true
+
+cache_transceiver_config:
+  backend: default
@@ -36,3 +36,6 @@ disable_overlap_scheduler: true
 speculative_config:
   decoding_type: MTP
   num_nextn_predict_layers: 1
+
+cache_transceiver_config:
+  backend: default
@@ -55,3 +55,6 @@ cuda_graph_config:
     - 256
 
 print_iter_log: true
+
+cache_transceiver_config:
+  backend: default
@@ -33,4 +33,7 @@ kv_cache_config:
 # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
 # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
 disable_overlap_scheduler: true
-print_iter_log: true
\ No newline at end of file
+print_iter_log: true
+
+cache_transceiver_config:
+  backend: default
@@ -61,3 +61,6 @@ cuda_graph_config:
 
 
 print_iter_log: true
+
+cache_transceiver_config:
+  backend: default
@@ -38,4 +38,7 @@ kv_cache_config:
 # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
 # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
 disable_overlap_scheduler: true
-print_iter_log: true
\ No newline at end of file
+print_iter_log: true
+
+cache_transceiver_config:
+  backend: default
@@ -21,13 +21,13 @@ max_num_tokens: 512
 # 8704 = 8192 ISL + 512 OSL
 max_seq_len: 8704
 disable_overlap_scheduler: true
-autotuner_enabled: false
+enable_autotuner: false
 
 # Enable Speculative Decoding in the model engine
 speculative_config:
   decoding_type: Eagle
   max_draft_len: 1
-  pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3
+  speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3
   eagle3_one_model: false
 
 kv_cache_config:
@@ -49,3 +49,6 @@ cuda_graph_config:
     - 256
 
 print_iter_log: true
+
+cache_transceiver_config:
+  backend: default
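Besides the transceiver block, this file picks up two TensorRT-LLM option renames: `autotuner_enabled` becomes `enable_autotuner`, and the Eagle draft-model path moves from `pytorch_weights_path` to `speculative_model_dir`. A minimal sketch of the resulting speculative-decoding section, using only keys and values visible in this diff:

```yaml
disable_overlap_scheduler: true
enable_autotuner: false  # renamed from autotuner_enabled

# Enable Speculative Decoding in the model engine
speculative_config:
  decoding_type: Eagle
  max_draft_len: 1
  # renamed from pytorch_weights_path; same Eagle3 draft checkpoint
  speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3
  eagle3_one_model: false
```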
@@ -20,17 +20,20 @@ max_batch_size: 1
 max_num_tokens: 8192
 max_seq_len: 8192
 print_iter_log: true
-kv_cache_dtype: fp8
 disable_overlap_scheduler: true
-autotuner_enabled: false
+enable_autotuner: false
 
 # Enable Speculative Decoding in the model engine
 speculative_config:
   decoding_type: Eagle
   max_draft_len: 1
-  pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3
+  speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3
   eagle3_one_model: false
 
 kv_cache_config:
   free_gpu_memory_fraction: 0.5
   enable_block_reuse: false
+  dtype: fp8
+
+cache_transceiver_config:
+  backend: default
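This file also moves the FP8 KV-cache setting from a top-level `kv_cache_dtype` key into the `kv_cache_config` block. Before and after, sketched from the hunk above:

```yaml
# Before (removed in this PR):
# kv_cache_dtype: fp8

# After: dtype is nested under kv_cache_config
kv_cache_config:
  free_gpu_memory_fraction: 0.5
  enable_block_reuse: false
  dtype: fp8
```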
@@ -38,3 +38,6 @@ cuda_graph_config:
   max_batch_size: 256
 
 print_iter_log: true
+
+cache_transceiver_config:
+  backend: default
@@ -32,3 +32,6 @@ speculative_config:
 kv_cache_config:
   free_gpu_memory_fraction: 0.5
   enable_block_reuse: false
+
+cache_transceiver_config:
+  backend: default