5 changes: 4 additions & 1 deletion components/backends/trtllm/engine_configs/agg.yaml
@@ -28,4 +28,7 @@ kv_cache_config:
# NOTE: overlap_scheduler enabled by default since this commit and changed
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
use_cuda_graph: true


cuda_graph_config:
  max_batch_size: 16
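For reviewers skimming the hunk above: the flat use_cuda_graph flag is replaced by a nested cuda_graph_config section, which is where CUDA-graph options live in the newer TensorRT-LLM config schema these files target. A minimal before/after sketch using only the values shown in this file:

# before this PR
use_cuda_graph: true

# after this PR
cuda_graph_config:
  max_batch_size: 16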
9 changes: 7 additions & 2 deletions components/backends/trtllm/engine_configs/decode.yaml
@@ -16,11 +16,16 @@ tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: false
max_num_tokens: 8192
max_batch_size: 16
trust_remote_code: true
backend: pytorch
enable_chunked_prefill: true
disable_overlap_scheduler: false
use_cuda_graph: true

cuda_graph_config:
  max_batch_size: 16

kv_cache_config:
  free_gpu_memory_fraction: 0.95

cache_transceiver_config:
  backend: default
@@ -28,23 +28,24 @@ max_num_tokens: 8448
max_seq_len: 8448
kv_cache_config:
  free_gpu_memory_fraction: 0.30
  dtype: fp8

# Enable MTP (Multi-Token Prediction) in the model engine
speculative_config:
  decoding_type: MTP
  num_nextn_predict_layers: 1

use_cuda_graph: true
cuda_graph_padding_enabled: true
cuda_graph_batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
cuda_graph_config:
  enable_padding: true
  batch_sizes:
  - 1
  - 2
  - 4
  - 8
  - 16
  - 32
  - 64
  - 128
  - 256

print_iter_log: true
kv_cache_dtype: fp8
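Two renames land together in the hunk above: the flat kv_cache_dtype key moves under kv_cache_config as dtype, and the use_cuda_graph / cuda_graph_padding_enabled / cuda_graph_batch_sizes trio collapses into a single cuda_graph_config block. A before/after sketch with values from this file (batch-size list abbreviated, only the renamed keys shown):

# before this PR
use_cuda_graph: true
cuda_graph_padding_enabled: true
cuda_graph_batch_sizes:
- 1
- 2
- 256
kv_cache_dtype: fp8

# after this PR
cuda_graph_config:
  enable_padding: true
  batch_sizes:
  - 1
  - 2
  - 256
kv_cache_config:
  dtype: fp8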
@@ -31,23 +31,24 @@ max_num_tokens: 512
max_seq_len: 8704
kv_cache_config:
  free_gpu_memory_fraction: 0.85
  dtype: fp8

# Enable MTP (Multi-Token Prediction) in the decode model engine
speculative_config:
  decoding_type: MTP
  num_nextn_predict_layers: 1

use_cuda_graph: true
cuda_graph_padding_enabled: true
cuda_graph_batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
print_iter_log: true
kv_cache_dtype: fp8
cuda_graph_config:
  enable_padding: true
  batch_sizes:
  - 1
  - 2
  - 4
  - 8
  - 16
  - 32
  - 64
  - 128
  - 256

print_iter_log: true
@@ -27,8 +27,9 @@ max_num_tokens: 8192
max_seq_len: 8192
kv_cache_config:
  free_gpu_memory_fraction: 0.75
  dtype: fp8

print_iter_log: true
kv_cache_dtype: fp8
disable_overlap_scheduler: true

# Enable MTP (Multi-Token Prediction) in the prefill model engine
@@ -31,24 +31,26 @@ kv_cache_config:
# With dp attention enabled: large ISL at high concurrency may need a
# low free_gpu_memory_fraction to have enough available memory.
# free_gpu_memory_fraction: 0.30
dtype: fp8


# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: overlap_scheduler enabled by default since this commit and changed
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
use_cuda_graph: true
cuda_graph_padding_enabled: true
cuda_graph_config:
  enable_padding: true
# NOTE: For larger max batch size, you may want to add larger cuda graph
# batch sizes below to match.
cuda_graph_batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
  batch_sizes:
  - 1
  - 2
  - 4
  - 8
  - 16
  - 32
  - 64
  - 128
  - 256

print_iter_log: true
kv_cache_dtype: fp8
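As the NOTE inside cuda_graph_config suggests, raising the engine's max batch size usually goes hand in hand with extending batch_sizes so the largest captured CUDA graph still matches. A hypothetical sketch (512 is an illustrative value, not something this PR sets):

max_batch_size: 512
cuda_graph_config:
  enable_padding: true
  batch_sizes:
  - 1
  - 2
  - 4
  - 8
  - 16
  - 32
  - 64
  - 128
  - 256
  - 512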
@@ -31,25 +31,27 @@ kv_cache_config:
# With dp attention enabled: large ISL at high concurrency may need a
# low free_gpu_memory_fraction to have enough available memory.
# free_gpu_memory_fraction: 0.30
dtype: fp8

# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: overlap_scheduler enabled by default since this commit and changed
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
disable_overlap_scheduler: false
use_cuda_graph: true
cuda_graph_padding_enabled: true
# NOTE: For larger max batch size, you may want to add larger cuda graph
# batch sizes below to match.
cuda_graph_batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256

cuda_graph_config:
  enable_padding: true
  # NOTE: For larger max batch size, you may want to
  # add larger cuda graph batch sizes below to match.
  batch_sizes:
  - 1
  - 2
  - 4
  - 8
  - 16
  - 32
  - 64
  - 128
  - 256

print_iter_log: true
kv_cache_dtype: fp8
@@ -26,12 +26,11 @@ max_seq_len: 8192

kv_cache_config:
  free_gpu_memory_fraction: 0.75
  dtype: fp8 # NOTE: This dtype must match in both prefill/decode configs

# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: overlap_scheduler enabled by default since this commit and changed
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
disable_overlap_scheduler: true
print_iter_log: true
# NOTE: This dtype must match in both prefill/decode configs
kv_cache_dtype: fp8
print_iter_log: true
@@ -10,18 +10,20 @@ enable_attention_dp: true
max_batch_size: 256
max_num_tokens: 256
max_seq_len: 8448

kv_cache_config:
  free_gpu_memory_fraction: 0.7
use_cuda_graph: true
cuda_graph_padding_enabled: true
cuda_graph_batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
kv_cache_dtype: fp8
dtype: fp8

cuda_graph_config:
  enable_padding: true
  batch_sizes:
  - 1
  - 2
  - 4
  - 8
  - 16
  - 32
  - 64
  - 128
  - 256
@@ -3,33 +3,37 @@
backend: pytorch

# WideEP related settings
moe_backend: WideEP
# moe_max_num_tokens will default to max_num_tokens if left unspecified.
#
# If you want to set this value explicitly, one recommendation is below:
# moe_max_num_tokens = max_batch_size * moe_expert_parallel_size
# 4096 = 256 * 16
# moe_max_num_tokens: 4096
moe_load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml
moe_config:
  backend: WIDEEP
  # moe_max_num_tokens will default to max_num_tokens if left unspecified.
  #
  # If you want to set this value explicitly, one recommendation is below:
  # moe_max_num_tokens = max_batch_size * moe_expert_parallel_size
  # 4096 = 256 * 16
  # moe_max_num_tokens: 4096
  load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml

tensor_parallel_size: 16
moe_expert_parallel_size: 16

enable_attention_dp: true
max_batch_size: 256
max_num_tokens: 256
max_seq_len: 8448

kv_cache_config:
  free_gpu_memory_fraction: 0.7
use_cuda_graph: true
cuda_graph_padding_enabled: true
cuda_graph_batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
kv_cache_dtype: fp8
free_gpu_memory_fraction: 0.3
dtype: fp8

cuda_graph_config:
  enable_padding: true
  batch_sizes:
  - 1
  - 2
  - 4
  - 8
  - 16
  - 32
  - 64
  - 128
  - 256
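The WideEP settings in this file move from the flat moe_backend / moe_load_balancer keys into a nested moe_config section, with the backend value now spelled WIDEEP. A before/after sketch using the path from this diff:

# before this PR
moe_backend: WideEP
moe_load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml

# after this PR
moe_config:
  backend: WIDEEP
  load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml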
@@ -15,8 +15,9 @@
backend: pytorch

# WideEP related settings
moe_backend: WideEP
moe_load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml
moe_config:
  backend: WIDEEP
  load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml

# TP/EP/PP/DP
tensor_parallel_size: 16
@@ -35,25 +36,28 @@ kv_cache_config:
# With dp attention enabled: large ISL at high concurrency may need a
# low free_gpu_memory_fraction to have enough available memory.
free_gpu_memory_fraction: 0.30
dtype: fp8


# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: overlap_scheduler enabled by default since this commit and changed
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
disable_overlap_scheduler: false
use_cuda_graph: true
cuda_graph_padding_enabled: true
# NOTE: For larger max batch size, you may want to add larger cuda graph
# batch sizes below to match.
cuda_graph_batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
cuda_graph_config:
  enable_padding: true
  # NOTE: For larger max batch size, you may want to
  # add larger cuda graph batch sizes below to match.
  batch_sizes:
  - 1
  - 2
  - 4
  - 8
  - 16
  - 32
  - 64
  - 128
  - 256


print_iter_log: true
kv_cache_dtype: fp8
@@ -15,8 +15,9 @@
backend: pytorch

# WideEP related settings
moe_backend: WideEP
moe_load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml
moe_config:
  backend: WIDEEP
  load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml

# TP/EP/PP/DP
tensor_parallel_size: 16
@@ -29,13 +30,12 @@ max_num_tokens: 8192
max_seq_len: 8192

kv_cache_config:
  free_gpu_memory_fraction: 0.75
  free_gpu_memory_fraction: 0.3
  dtype: fp8 # NOTE: This dtype must match in both prefill/decode configs

# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: overlap_scheduler enabled by default since this commit and changed
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
disable_overlap_scheduler: true
print_iter_log: true
# NOTE: This dtype must match in both prefill/decode configs
kv_cache_dtype: fp8
print_iter_log: true