components/backends/trtllm: 18 files changed, +177 -158 lines

@@ -28,4 +28,7 @@ kv_cache_config:
 # NOTE: overlap_scheduler enabled by default since this commit and changed
 # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
 # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
-use_cuda_graph: true
+
+
+cuda_graph_config:
+  max_batch_size: 16
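
The recurring change in this PR appears here in its simplest form: the flat boolean flag is replaced by a nested cuda_graph_config section. A before/after sketch assembled from the removed and added lines of this hunk (field names exactly as they appear in the diff):

    # Before (removed above):
    #   use_cuda_graph: true

    # After (added above):
    cuda_graph_config:
      max_batch_size: 16
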

@@ -16,11 +16,16 @@ tensor_parallel_size: 1
 moe_expert_parallel_size: 1
 enable_attention_dp: false
 max_num_tokens: 8192
-max_batch_size: 16
 trust_remote_code: true
 backend: pytorch
 enable_chunked_prefill: true
 disable_overlap_scheduler: false
-use_cuda_graph: true
+
+cuda_graph_config:
+  max_batch_size: 16
+
 kv_cache_config:
   free_gpu_memory_fraction: 0.95
+
+cache_transceiver_config:
+  backend: default
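
This hunk also drops the top-level max_batch_size: 16 entry and adds a cache_transceiver_config section. Assembled from the context and added lines above, the tail of this config after the change would read roughly as follows (a sketch; only fields visible in the hunk are shown):

    cuda_graph_config:
      max_batch_size: 16

    kv_cache_config:
      free_gpu_memory_fraction: 0.95

    cache_transceiver_config:
      backend: default
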

@@ -28,23 +28,24 @@ max_num_tokens: 8448
 max_seq_len: 8448
 kv_cache_config:
   free_gpu_memory_fraction: 0.30
+  dtype: fp8

 # Enable the MTP(Multi-Token Prediction) in the model engine
 speculative_config:
   decoding_type: MTP
   num_nextn_predict_layers: 1

-use_cuda_graph: true
-cuda_graph_padding_enabled: true
-cuda_graph_batch_sizes:
-- 1
-- 2
-- 4
-- 8
-- 16
-- 32
-- 64
-- 128
-- 256
+cuda_graph_config:
+  enable_padding: true
+  batch_sizes:
+  - 1
+  - 2
+  - 4
+  - 8
+  - 16
+  - 32
+  - 64
+  - 128
+  - 256
+
 print_iter_log: true
-kv_cache_dtype: fp8
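
This hunk also shows the second rename repeated throughout the PR: the top-level kv_cache_dtype field moves inside kv_cache_config as dtype. A before/after sketch assembled from the removed and added lines above:

    # Before (removed above):
    #   kv_cache_config:
    #     free_gpu_memory_fraction: 0.30
    #   kv_cache_dtype: fp8

    # After (added above): dtype nested inside kv_cache_config
    kv_cache_config:
      free_gpu_memory_fraction: 0.30
      dtype: fp8
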

@@ -31,23 +31,24 @@ max_num_tokens: 512
 max_seq_len: 8704
 kv_cache_config:
   free_gpu_memory_fraction: 0.85
+  dtype: fp8

 # Enable the MTP(Multi-Token Prediction) in decode model engine
 speculative_config:
   decoding_type: MTP
   num_nextn_predict_layers: 1

-use_cuda_graph: true
-cuda_graph_padding_enabled: true
-cuda_graph_batch_sizes:
-- 1
-- 2
-- 4
-- 8
-- 16
-- 32
-- 64
-- 128
-- 256
-print_iter_log: true
-kv_cache_dtype: fp8
+cuda_graph_config:
+  enable_padding: true
+  batch_sizes:
+  - 1
+  - 2
+  - 4
+  - 8
+  - 16
+  - 32
+  - 64
+  - 128
+  - 256
+
+print_iter_log: true

@@ -27,8 +27,9 @@ max_num_tokens: 8192
 max_seq_len: 8192
 kv_cache_config:
   free_gpu_memory_fraction: 0.75
+  dtype: fp8
+
 print_iter_log: true
-kv_cache_dtype: fp8
 disable_overlap_scheduler: true

 # Enable the MTP(Multi-Token Prediction) in the prefill model engine
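
Several configs in this PR carry a NOTE that TensorRT-LLM renamed its overlap-scheduler field; the prefill config above already uses the new spelling. A sketch of the same setting under the old and new field names (old name taken from those NOTE comments; the polarity flips with the rename):

    # Old field name (per the NOTE comments in these configs):
    #   enable_overlap_scheduler: false
    # New field name, same effect, as used in the prefill config above:
    disable_overlap_scheduler: true
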

@@ -31,24 +31,26 @@ kv_cache_config:
   # With dp attention enabled: large ISL at high concurrency may need
   # free_gpu_memory_fraction low to have enough available memory.
   # free_gpu_memory_fraction: 0.30
+  dtype: fp8
+

 # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
 # NOTE: overlap_scheduler enabled by default since this commit and changed
 # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
 # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
-use_cuda_graph: true
-cuda_graph_padding_enabled: true
+cuda_graph_config:
+  enable_padding: true
 # NOTE: For larger max batch size, you may want to add larger cuda graph
 # batch sizes below to match.
-cuda_graph_batch_sizes:
-- 1
-- 2
-- 4
-- 8
-- 16
-- 32
-- 64
-- 128
-- 256
+  batch_sizes:
+  - 1
+  - 2
+  - 4
+  - 8
+  - 16
+  - 32
+  - 64
+  - 128
+  - 256
+
 print_iter_log: true
-kv_cache_dtype: fp8
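
Since the same CUDA-graph rename repeats in nearly every file of this PR, here is the new section once more with comments noting which old top-level key each field replaces (the mapping is taken directly from the removed and added lines in these hunks):

    cuda_graph_config:        # replaces the old top-level use_cuda_graph flag
      enable_padding: true    # was cuda_graph_padding_enabled
      batch_sizes:            # was cuda_graph_batch_sizes; values unchanged
      - 1
      - 2
      - 4
      # ... continuing up to 256, as in the hunks above
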

@@ -31,25 +31,27 @@ kv_cache_config:
   # With dp attention enabled: large ISL at high concurrency may need
   # free_gpu_memory_fraction low to have enough available memory.
   # free_gpu_memory_fraction: 0.30
+  dtype: fp8

 # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
 # NOTE: overlap_scheduler enabled by default since this commit and changed
 # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
 # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
 disable_overlap_scheduler: false
-use_cuda_graph: true
-cuda_graph_padding_enabled: true
-# NOTE: For larger max batch size, you may want to add larger cuda graph
-# batch sizes below to match.
-cuda_graph_batch_sizes:
-- 1
-- 2
-- 4
-- 8
-- 16
-- 32
-- 64
-- 128
-- 256
+
+cuda_graph_config:
+  enable_padding: true
+  # NOTE: For larger max batch size, you may want to
+  # add larger cuda graph batch sizes below to match.
+  batch_sizes:
+  - 1
+  - 2
+  - 4
+  - 8
+  - 16
+  - 32
+  - 64
+  - 128
+  - 256
+
 print_iter_log: true
-kv_cache_dtype: fp8

@@ -26,12 +26,11 @@ max_seq_len: 8192

 kv_cache_config:
   free_gpu_memory_fraction: 0.75
+  dtype: fp8  # NOTE: This dtype must match in both prefill/decode configs

 # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
 # NOTE: overlap_scheduler enabled by default since this commit and changed
 # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
 # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
 disable_overlap_scheduler: true
-print_iter_log: true
-# NOTE: This dtype must match in both prefill/decode configs
-kv_cache_dtype: fp8
+print_iter_log: true
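
The NOTE in this hunk says the KV-cache dtype must match between the prefill and decode engines. Concretely, after this change both configs in a disaggregated pair would carry the same nested setting (value as shown in these hunks):

    # Same value in both the prefill and the decode engine config:
    kv_cache_config:
      dtype: fp8
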

@@ -10,18 +10,20 @@ enable_attention_dp: true
 max_batch_size: 256
 max_num_tokens: 256
 max_seq_len: 8448
+
 kv_cache_config:
   free_gpu_memory_fraction: 0.7
-use_cuda_graph: true
-cuda_graph_padding_enabled: true
-cuda_graph_batch_sizes:
-- 1
-- 2
-- 4
-- 8
-- 16
-- 32
-- 64
-- 128
-- 256
-kv_cache_dtype: fp8
+  dtype: fp8
+
+cuda_graph_config:
+  enable_padding: true
+  batch_sizes:
+  - 1
+  - 2
+  - 4
+  - 8
+  - 16
+  - 32
+  - 64
+  - 128
+  - 256

@@ -3,33 +3,37 @@
 backend: pytorch

 # WideEP related settings
-moe_backend: WideEP
-# moe_max_num_tokens will default to max_num_tokens if left unspecified.
-#
-# If you want to set this value explicitly, one recommendation is below:
-# moe_max_num_tokens = max_batch_size * moe_expert_parallel_size
-# 4096 = 256 * 16
-# moe_max_num_tokens: 4096
-moe_load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml
+moe_config:
+  backend: WIDEEP
+  # moe_max_num_tokens will default to max_num_tokens if left unspecified.
+  #
+  # If you want to set this value explicitly, one recommendation is below:
+  # moe_max_num_tokens = max_batch_size * moe_expert_parallel_size
+  # 4096 = 256 * 16
+  # moe_max_num_tokens: 4096
+  load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml
+
 tensor_parallel_size: 16
 moe_expert_parallel_size: 16

 enable_attention_dp: true
 max_batch_size: 256
 max_num_tokens: 256
 max_seq_len: 8448
+
 kv_cache_config:
-  free_gpu_memory_fraction: 0.7
-use_cuda_graph: true
-cuda_graph_padding_enabled: true
-cuda_graph_batch_sizes:
-- 1
-- 2
-- 4
-- 8
-- 16
-- 32
-- 64
-- 128
-- 256
-kv_cache_dtype: fp8
+  free_gpu_memory_fraction: 0.3
+  dtype: fp8
+
+cuda_graph_config:
+  enable_padding: true
+  batch_sizes:
+  - 1
+  - 2
+  - 4
+  - 8
+  - 16
+  - 32
+  - 64
+  - 128
+  - 256
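
The WideEP settings follow the same nesting pattern: the flat moe_backend and moe_load_balancer keys become a moe_config section, and the backend value is spelled WIDEEP rather than WideEP. This hunk also lowers free_gpu_memory_fraction from 0.7 to 0.3. Old and new forms of the MoE keys, taken from the removed and added lines above:

    # Before (removed above):
    #   moe_backend: WideEP
    #   moe_load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml

    # After (added above):
    moe_config:
      backend: WIDEEP
      load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml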