Commit f10aab3

fix: Migrating trtllm examples from 1.0.0rc0 to 1.0.4rc4 (#2217)
1 parent: 97390ac

18 files changed: +177, -158 lines

components/backends/trtllm/engine_configs/agg.yaml

Lines changed: 4 additions & 1 deletion
@@ -28,4 +28,7 @@ kv_cache_config:
 # NOTE: overlap_scheduler enabled by default since this commit and changed
 # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
 # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
-use_cuda_graph: true
+
+
+cuda_graph_config:
+  max_batch_size: 16
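
For reference, a minimal sketch of the migrated section of agg.yaml after this hunk (indentation assumed, surrounding keys omitted): the top-level use_cuda_graph flag is replaced by a nested cuda_graph_config section.

    # 1.0.4rc4 style: CUDA graph settings are grouped in a nested section
    # instead of the former top-level use_cuda_graph flag.
    cuda_graph_config:
      max_batch_size: 16  # largest batch size covered by CUDA graphs (assumed meaning of the field)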

components/backends/trtllm/engine_configs/decode.yaml

Lines changed: 7 additions & 2 deletions
@@ -16,11 +16,16 @@ tensor_parallel_size: 1
 moe_expert_parallel_size: 1
 enable_attention_dp: false
 max_num_tokens: 8192
-max_batch_size: 16
 trust_remote_code: true
 backend: pytorch
 enable_chunked_prefill: true
 disable_overlap_scheduler: false
-use_cuda_graph: true
+
+cuda_graph_config:
+  max_batch_size: 16
+
 kv_cache_config:
   free_gpu_memory_fraction: 0.95
+
+cache_transceiver_config:
+  backend: default
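
Besides the same cuda_graph_config move, the migrated decode.yaml gains a cache_transceiver_config block; a rough sketch of the new tail of the file, with values taken from the hunk above and indentation assumed:

    cuda_graph_config:
      max_batch_size: 16

    kv_cache_config:
      free_gpu_memory_fraction: 0.95

    # New in this migration: configures the KV-cache transceiver used when
    # transferring cache between workers; 'default' is the value set here.
    cache_transceiver_config:
      backend: default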

components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_agg.yaml

Lines changed: 14 additions & 13 deletions
@@ -28,23 +28,24 @@ max_num_tokens: 8448
 max_seq_len: 8448
 kv_cache_config:
   free_gpu_memory_fraction: 0.30
+  dtype: fp8

 # Enable the MTP(Multi-Token Prediction) in the model engine
 speculative_config:
   decoding_type: MTP
   num_nextn_predict_layers: 1

-use_cuda_graph: true
-cuda_graph_padding_enabled: true
-cuda_graph_batch_sizes:
-- 1
-- 2
-- 4
-- 8
-- 16
-- 32
-- 64
-- 128
-- 256
+cuda_graph_config:
+  enable_padding: true
+  batch_sizes:
+  - 1
+  - 2
+  - 4
+  - 8
+  - 16
+  - 32
+  - 64
+  - 128
+  - 256
+
 print_iter_log: true
-kv_cache_dtype: fp8
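
Two renames repeat across the DeepSeek-R1 configs: the top-level kv_cache_dtype moves under kv_cache_config as dtype, and the flat CUDA graph flags (use_cuda_graph, cuda_graph_padding_enabled, cuda_graph_batch_sizes) collapse into a cuda_graph_config section. A sketch of the resulting mtp_agg.yaml fragment, using an inline list for brevity (indentation and flow style assumed):

    kv_cache_config:
      free_gpu_memory_fraction: 0.30
      dtype: fp8            # was top-level kv_cache_dtype

    cuda_graph_config:
      enable_padding: true  # was cuda_graph_padding_enabled
      batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256]  # was cuda_graph_batch_sizes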

components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_decode.yaml

Lines changed: 15 additions & 14 deletions
@@ -31,23 +31,24 @@ max_num_tokens: 512
 max_seq_len: 8704
 kv_cache_config:
   free_gpu_memory_fraction: 0.85
+  dtype: fp8

 # Enable the MTP(Multi-Token Prediction) in decode model engine
 speculative_config:
   decoding_type: MTP
   num_nextn_predict_layers: 1

-use_cuda_graph: true
-cuda_graph_padding_enabled: true
-cuda_graph_batch_sizes:
-- 1
-- 2
-- 4
-- 8
-- 16
-- 32
-- 64
-- 128
-- 256
-print_iter_log: true
-kv_cache_dtype: fp8
+cuda_graph_config:
+  enable_padding: true
+  batch_sizes:
+  - 1
+  - 2
+  - 4
+  - 8
+  - 16
+  - 32
+  - 64
+  - 128
+  - 256
+
+print_iter_log: true

components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_prefill.yaml

Lines changed: 2 additions & 1 deletion
@@ -27,8 +27,9 @@ max_num_tokens: 8192
 max_seq_len: 8192
 kv_cache_config:
   free_gpu_memory_fraction: 0.75
+  dtype: fp8
+
 print_iter_log: true
-kv_cache_dtype: fp8
 disable_overlap_scheduler: true

 # Enable the MTP(Multi-Token Prediction) in the prefill model engine

components/backends/trtllm/engine_configs/deepseek_r1/simple/agg.yaml

Lines changed: 15 additions & 13 deletions
@@ -31,24 +31,26 @@ kv_cache_config:
   # With dp attention enabled: large ISL at high concurrency may need
   # free_gpu_memory_fraction low to have enough available memory.
   # free_gpu_memory_fraction: 0.30
+  dtype: fp8
+

 # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
 # NOTE: overlap_scheduler enabled by default since this commit and changed
 # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
 # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
-use_cuda_graph: true
-cuda_graph_padding_enabled: true
+cuda_graph_config:
+  enable_padding: true
 # NOTE: For larger max batch size, you may want to add larger cuda graph
 # batch sizes below to match.
-cuda_graph_batch_sizes:
-- 1
-- 2
-- 4
-- 8
-- 16
-- 32
-- 64
-- 128
-- 256
+  batch_sizes:
+  - 1
+  - 2
+  - 4
+  - 8
+  - 16
+  - 32
+  - 64
+  - 128
+  - 256
+
 print_iter_log: true
-kv_cache_dtype: fp8

components/backends/trtllm/engine_configs/deepseek_r1/simple/decode.yaml

Lines changed: 17 additions & 15 deletions
@@ -31,25 +31,27 @@ kv_cache_config:
   # With dp attention enabled: large ISL at high concurrency may need
   # free_gpu_memory_fraction low to have enough available memory.
   # free_gpu_memory_fraction: 0.30
+  dtype: fp8

 # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
 # NOTE: overlap_scheduler enabled by default since this commit and changed
 # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
 # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
 disable_overlap_scheduler: false
-use_cuda_graph: true
-cuda_graph_padding_enabled: true
-# NOTE: For larger max batch size, you may want to add larger cuda graph
-# batch sizes below to match.
-cuda_graph_batch_sizes:
-- 1
-- 2
-- 4
-- 8
-- 16
-- 32
-- 64
-- 128
-- 256
+
+cuda_graph_config:
+  enable_padding: true
+  # NOTE: For larger max batch size, you may want to
+  # add larger cuda graph batch sizes below to match.
+  batch_sizes:
+  - 1
+  - 2
+  - 4
+  - 8
+  - 16
+  - 32
+  - 64
+  - 128
+  - 256
+
 print_iter_log: true
-kv_cache_dtype: fp8

components/backends/trtllm/engine_configs/deepseek_r1/simple/prefill.yaml

Lines changed: 2 additions & 3 deletions
@@ -26,12 +26,11 @@ max_seq_len: 8192

 kv_cache_config:
   free_gpu_memory_fraction: 0.75
+  dtype: fp8 # NOTE: This dtype must match in both prefill/decode configs

 # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
 # NOTE: overlap_scheduler enabled by default since this commit and changed
 # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
 # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
 disable_overlap_scheduler: true
-print_iter_log: true
-# NOTE: This dtype must match in both prefill/decode configs
-kv_cache_dtype: fp8
+print_iter_log: true

components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/dep16_agg.yaml

Lines changed: 15 additions & 13 deletions
@@ -10,18 +10,20 @@ enable_attention_dp: true
 max_batch_size: 256
 max_num_tokens: 256
 max_seq_len: 8448
+
 kv_cache_config:
   free_gpu_memory_fraction: 0.7
-use_cuda_graph: true
-cuda_graph_padding_enabled: true
-cuda_graph_batch_sizes:
-- 1
-- 2
-- 4
-- 8
-- 16
-- 32
-- 64
-- 128
-- 256
-kv_cache_dtype: fp8
+  dtype: fp8
+
+cuda_graph_config:
+  enable_padding: true
+  batch_sizes:
+  - 1
+  - 2
+  - 4
+  - 8
+  - 16
+  - 32
+  - 64
+  - 128
+  - 256

components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml

Lines changed: 26 additions & 22 deletions
@@ -3,33 +3,37 @@
 backend: pytorch

 # WideEP related settings
-moe_backend: WideEP
-# moe_max_num_tokens will default to max_num_tokens if left unspecified.
-#
-# If you want to set this value explicitly, one recommendation is below:
-# moe_max_num_tokens = max_batch_size * moe_expert_parallel_size
-# 4096 = 256 * 16
-# moe_max_num_tokens: 4096
-moe_load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml
+moe_config:
+  backend: WIDEEP
+  # moe_max_num_tokens will default to max_num_tokens if left unspecified.
+  #
+  # If you want to set this value explicitly, one recommendation is below:
+  # moe_max_num_tokens = max_batch_size * moe_expert_parallel_size
+  # 4096 = 256 * 16
+  # moe_max_num_tokens: 4096
+  load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml
+
 tensor_parallel_size: 16
 moe_expert_parallel_size: 16

 enable_attention_dp: true
 max_batch_size: 256
 max_num_tokens: 256
 max_seq_len: 8448
+
 kv_cache_config:
-  free_gpu_memory_fraction: 0.7
-use_cuda_graph: true
-cuda_graph_padding_enabled: true
-cuda_graph_batch_sizes:
-- 1
-- 2
-- 4
-- 8
-- 16
-- 32
-- 64
-- 128
-- 256
-kv_cache_dtype: fp8
+  free_gpu_memory_fraction: 0.3
+  dtype: fp8
+
+cuda_graph_config:
+  enable_padding: true
+  batch_sizes:
+  - 1
+  - 2
+  - 4
+  - 8
+  - 16
+  - 32
+  - 64
+  - 128
+  - 256
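
The WideEP settings follow the same nesting pattern: moe_backend and moe_load_balancer move under a moe_config section, the backend value changes from WideEP to WIDEEP, and moe_load_balancer becomes load_balancer. A sketch of the migrated block (indentation assumed):

    moe_config:
      backend: WIDEEP  # was top-level moe_backend: WideEP
      # was top-level moe_load_balancer
      load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml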
