4 changes: 2 additions & 2 deletions container/build.sh
@@ -88,7 +88,7 @@ TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/"
 # TensorRT-LLM commit to use for building the trtllm wheel if not provided.
 # Important Note: This commit is not used in our CI pipeline. See the CI
 # variables to learn how to run a pipeline with a specific commit.
-TRTLLM_COMMIT=290649b6aaed5f233b0a0adf50edc1347f8d2b14
+TRTLLM_COMMIT="8cb6163a57226e69d8a85788eff542a440ed9c89"
 
 # TensorRT-LLM PyPI index URL
 TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple"
@@ -519,4 +519,4 @@ if [ -z "$RUN_PREFIX" ]; then
     set -x
 fi
 
-{ set +x; } 2>/dev/null
+{ set +x; } 2>/dev/null
@@ -39,6 +39,9 @@ kv_cache_config:
   # free_gpu_memory_fraction: 0.30
 
 pytorch_backend_config:
+  # NOTE: overlap_scheduler enabled by default since this commit and changed
+  # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
+  # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
   use_cuda_graph: true
   cuda_graph_padding_enabled: true
   # NOTE: For larger max batch size, you may want to add larger cuda graph
@@ -54,5 +57,4 @@ pytorch_backend_config:
     - 128
     - 256
   print_iter_log: true
-  enable_overlap_scheduler: true
   kv_cache_dtype: fp8
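To make the NOTE above concrete, here is a minimal sketch (not part of the diff; field values are illustrative) of how the same block is written on either side of the TensorRT-LLM commit linked in the comment:

# Older TensorRT-LLM: the overlap scheduler is opt-in via 'enable_overlap_scheduler'
pytorch_backend_config:
  enable_overlap_scheduler: true
  use_cuda_graph: true
---
# Newer TensorRT-LLM: the overlap scheduler is on by default; only the inverse
# flag 'disable_overlap_scheduler' exists, so the key is omitted unless opting out
pytorch_backend_config:
  # disable_overlap_scheduler: true
  use_cuda_graph: true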
@@ -34,8 +34,13 @@ context_servers:
   pipeline_parallel_size: 1
   enable_attention_dp: true
 
-  free_gpu_memory_fraction: 0.75
+  kv_cache_config:
+    free_gpu_memory_fraction: 0.75
+
   pytorch_backend_config:
+    # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
+    # Overlap scheduler not currently supported in context-only
+    disable_overlap_scheduler: true
     print_iter_log: true
     # NOTE: This dtype must match in both context/generation configs
     kv_cache_dtype: fp8
@@ -54,13 +59,16 @@ generation_servers:
   pipeline_parallel_size: 1
   enable_attention_dp: false
 
-  # With dp attention disabled: high free_gpu_memory_fraction is fine.
-  free_gpu_memory_fraction: 0.85
-  # With dp attention enabled: large ISL at high concurrency may need
-  # free_gpu_memory_fraction low to have enough available memory.
-  # free_gpu_memory_fraction: 0.30
+  kv_cache_config:
+    # With dp attention disabled: high free_gpu_memory_fraction is fine.
+    free_gpu_memory_fraction: 0.85
+    # With dp attention enabled: large ISL at high concurrency may need
+    # free_gpu_memory_fraction low to have enough available memory.
+    # free_gpu_memory_fraction: 0.30
 
   pytorch_backend_config:
+    # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
+    disable_overlap_scheduler: false
     use_cuda_graph: true
     cuda_graph_padding_enabled: true
     # NOTE: For larger max batch size, you may want to add larger cuda graph
@@ -76,6 +84,5 @@
       - 128
       - 256
     print_iter_log: true
-    enable_overlap_scheduler: true
     # NOTE: This dtype must match in both context/generation configs
     kv_cache_dtype: fp8
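A condensed sketch of the two changes applied to this disaggregated config (keys not relevant to the change are omitted; the values shown are the ones from the diff): free_gpu_memory_fraction moves from the server level into a nested kv_cache_config block, and kv_cache_dtype stays identical in both server sections as the NOTE requires:

context_servers:
  kv_cache_config:
    free_gpu_memory_fraction: 0.75   # previously a direct key of context_servers
  pytorch_backend_config:
    kv_cache_dtype: fp8              # must match generation_servers

generation_servers:
  kv_cache_config:
    free_gpu_memory_fraction: 0.85
  pytorch_backend_config:
    kv_cache_dtype: fp8              # must match context_servers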
4 changes: 3 additions & 1 deletion examples/tensorrt_llm/configs/llm_api_config.yaml
@@ -34,5 +34,7 @@ kv_cache_config:
   free_gpu_memory_fraction: 0.95
 
 pytorch_backend_config:
-  enable_overlap_scheduler: true
+  # NOTE: overlap_scheduler enabled by default since this commit and changed
+  # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
+  # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
   use_cuda_graph: true
@@ -36,6 +36,8 @@ kv_cache_config:
   enable_block_reuse: true
 
 pytorch_backend_config:
-  enable_overlap_scheduler: false
-  use_cuda_graph: false
-  enable_iter_perf_stats: true
+  # NOTE: overlap_scheduler enabled by default since this commit and changed
+  # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
+  # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
+  use_cuda_graph: true
+  enable_iter_perf_stats: true
4 changes: 3 additions & 1 deletion examples/tensorrt_llm/configs/llm_api_config_router.yaml
@@ -36,6 +36,8 @@ kv_cache_config:
   enable_block_reuse: true
 
 pytorch_backend_config:
-  enable_overlap_scheduler: true
+  # NOTE: overlap_scheduler enabled by default since this commit and changed
+  # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
+  # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
   use_cuda_graph: true
   enable_iter_perf_stats: true
@@ -34,7 +34,9 @@ context_servers:
   cache_transceiver_config:
     max_num_tokens: 10240
   pytorch_backend_config:
-    enable_overlap_scheduler: false
+    # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
+    # Overlap scheduler not currently supported in context-only
+    disable_overlap_scheduler: true
     use_cuda_graph: false
   urls:
     - "localhost:8001"
@@ -49,7 +51,8 @@ generation_servers:
   cache_transceiver_config:
     max_num_tokens: 256
   pytorch_backend_config:
-    enable_overlap_scheduler: true
+    # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
+    disable_overlap_scheduler: false
     use_cuda_graph: false
   urls:
-    - "localhost:8002"
+    - "localhost:8002"
@@ -36,7 +36,9 @@ context_servers:
   cache_transceiver_config:
     max_num_tokens: 10240
   pytorch_backend_config:
-    enable_overlap_scheduler: false
+    # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
+    # Overlap scheduler not currently supported in context-only
+    disable_overlap_scheduler: true
     use_cuda_graph: false
     enable_iter_perf_stats: true
   urls:
@@ -54,8 +56,9 @@ generation_servers:
   cache_transceiver_config:
     max_num_tokens: 256
   pytorch_backend_config:
-    enable_overlap_scheduler: true
+    # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
+    disable_overlap_scheduler: false
     use_cuda_graph: false
     enable_iter_perf_stats: true
   urls:
-    - "localhost:8002"
+    - "localhost:8002"