4 changes: 2 additions & 2 deletions container/build.sh
@@ -88,7 +88,7 @@ TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/"
 # TensorRT-LLM commit to use for building the trtllm wheel if not provided.
 # Important Note: This commit is not used in our CI pipeline. See the CI
 # variables to learn how to run a pipeline with a specific commit.
-TRTLLM_COMMIT=290649b6aaed5f233b0a0adf50edc1347f8d2b14
+TRTLLM_COMMIT="8cb6163a57226e69d8a85788eff542a440ed9c89"
 
 # TensorRT-LLM PyPI index URL
 TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple"
@@ -519,4 +519,4 @@ if [ -z "$RUN_PREFIX" ]; then
     set -x
 fi
 
-{ set +x; } 2>/dev/null
+{ set +x; } 2>/dev/null
@@ -39,6 +39,9 @@ kv_cache_config:
   # free_gpu_memory_fraction: 0.30
 
 pytorch_backend_config:
+  # NOTE: overlap_scheduler enabled by default since this commit and changed
+  # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
+  # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
   use_cuda_graph: true
   cuda_graph_padding_enabled: true
   # NOTE: For larger max batch size, you may want to add larger cuda graph
@@ -54,5 +57,4 @@ pytorch_backend_config:
     - 128
     - 256
   print_iter_log: true
-  enable_overlap_scheduler: true
   kv_cache_dtype: fp8
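To make the NOTE above concrete, here is a minimal sketch (not part of the diff; field values are illustrative) of how the same block is written on either side of the TensorRT-LLM commit linked in the comment:

# Older TensorRT-LLM: the overlap scheduler is opt-in via 'enable_overlap_scheduler'
pytorch_backend_config:
  enable_overlap_scheduler: true
  use_cuda_graph: true
---
# Newer TensorRT-LLM: the overlap scheduler is on by default; only the inverse
# flag 'disable_overlap_scheduler' exists, so the key is omitted unless opting out
pytorch_backend_config:
  # disable_overlap_scheduler: true
  use_cuda_graph: true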
@@ -34,8 +34,13 @@ context_servers:
   pipeline_parallel_size: 1
   enable_attention_dp: true
 
-  free_gpu_memory_fraction: 0.75
+  kv_cache_config:
+    free_gpu_memory_fraction: 0.75
+
   pytorch_backend_config:
+    # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
+    # Overlap scheduler not currently supported in context-only
+    disable_overlap_scheduler: true
     print_iter_log: true
     # NOTE: This dtype must match in both context/generation configs
     kv_cache_dtype: fp8
@@ -54,13 +59,16 @@ generation_servers:
   pipeline_parallel_size: 1
   enable_attention_dp: false
 
-  # With dp attention disabled: high free_gpu_memory_fraction is fine.
-  free_gpu_memory_fraction: 0.85
-  # With dp attention enabled: large ISL at high concurrency may need
-  # free_gpu_memory_fraction low to have enough available memory.
-  # free_gpu_memory_fraction: 0.30
+  kv_cache_config:
+    # With dp attention disabled: high free_gpu_memory_fraction is fine.
+    free_gpu_memory_fraction: 0.85
+    # With dp attention enabled: large ISL at high concurrency may need
+    # free_gpu_memory_fraction low to have enough available memory.
+    # free_gpu_memory_fraction: 0.30
 
   pytorch_backend_config:
+    # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
+    disable_overlap_scheduler: false
     use_cuda_graph: true
     cuda_graph_padding_enabled: true
     # NOTE: For larger max batch size, you may want to add larger cuda graph
@@ -76,6 +84,5 @@
       - 128
       - 256
     print_iter_log: true
-    enable_overlap_scheduler: true
     # NOTE: This dtype must match in both context/generation configs
     kv_cache_dtype: fp8
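A condensed sketch of the two changes applied to this disaggregated config (keys not relevant to the change are omitted; the values shown are the ones from the diff): free_gpu_memory_fraction moves from the server level into a nested kv_cache_config block, and kv_cache_dtype stays identical in both server sections as the NOTE requires:

context_servers:
  kv_cache_config:
    free_gpu_memory_fraction: 0.75   # previously a direct key of context_servers
  pytorch_backend_config:
    kv_cache_dtype: fp8              # must match generation_servers

generation_servers:
  kv_cache_config:
    free_gpu_memory_fraction: 0.85
  pytorch_backend_config:
    kv_cache_dtype: fp8              # must match context_servers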
4 changes: 3 additions & 1 deletion examples/tensorrt_llm/configs/llm_api_config.yaml
@@ -34,5 +34,7 @@ kv_cache_config:
   free_gpu_memory_fraction: 0.95
 
 pytorch_backend_config:
-  enable_overlap_scheduler: true
+  # NOTE: overlap_scheduler enabled by default since this commit and changed
+  # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
+  # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
   use_cuda_graph: true
@@ -36,6 +36,8 @@ kv_cache_config:
   enable_block_reuse: true
 
 pytorch_backend_config:
-  enable_overlap_scheduler: false
-  use_cuda_graph: false
-  enable_iter_perf_stats: true
+  # NOTE: overlap_scheduler enabled by default since this commit and changed
+  # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
+  # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
+  use_cuda_graph: true
+  enable_iter_perf_stats: true
4 changes: 3 additions & 1 deletion examples/tensorrt_llm/configs/llm_api_config_router.yaml
@@ -36,6 +36,8 @@ kv_cache_config:
   enable_block_reuse: true
 
 pytorch_backend_config:
-  enable_overlap_scheduler: true
+  # NOTE: overlap_scheduler enabled by default since this commit and changed
+  # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
+  # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
   use_cuda_graph: true
   enable_iter_perf_stats: true
@@ -34,7 +34,9 @@ context_servers:
   cache_transceiver_config:
     max_num_tokens: 10240
   pytorch_backend_config:
-    enable_overlap_scheduler: false
+    # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
+    # Overlap scheduler not currently supported in context-only
+    disable_overlap_scheduler: true
     use_cuda_graph: false
   urls:
     - "localhost:8001"
@@ -49,7 +51,8 @@ generation_servers:
   cache_transceiver_config:
     max_num_tokens: 256
   pytorch_backend_config:
-    enable_overlap_scheduler: true
+    # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
+    disable_overlap_scheduler: false
     use_cuda_graph: false
   urls:
-    - "localhost:8002"
+    - "localhost:8002"
@@ -36,7 +36,9 @@ context_servers:
   cache_transceiver_config:
     max_num_tokens: 10240
   pytorch_backend_config:
-    enable_overlap_scheduler: false
+    # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
+    # Overlap scheduler not currently supported in context-only
+    disable_overlap_scheduler: true
     use_cuda_graph: false
     enable_iter_perf_stats: true
   urls:
@@ -54,8 +56,9 @@ generation_servers:
   cache_transceiver_config:
     max_num_tokens: 256
   pytorch_backend_config:
-    enable_overlap_scheduler: true
+    # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
+    disable_overlap_scheduler: false
     use_cuda_graph: false
     enable_iter_perf_stats: true
   urls:
-    - "localhost:8002"
+    - "localhost:8002"