examples/tensorrt_llm/configs/llmapi_disagg_router_configs: 2 files changed, +4 -12. Both disaggregated-router configs raise free_gpu_memory_fraction from 0.40 to 0.75 and drop the explicit cache_transceiver_config block, in the context and generation server sections alike.

First config file:

@@ -30,9 +30,7 @@ context_servers:
   max_batch_size: 16
   enable_chunked_prefill: false
   kv_cache_config:
-    free_gpu_memory_fraction: 0.40
-  cache_transceiver_config:
-    max_num_tokens: 10240
+    free_gpu_memory_fraction: 0.75
   # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
   # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
   # Overlap scheduler not currently supported in context-only
@@ -47,9 +45,7 @@ generation_servers:
   max_num_tokens: 256
   max_batch_size: 256
   kv_cache_config:
-    free_gpu_memory_fraction: 0.40
-  cache_transceiver_config:
-    max_num_tokens: 256
+    free_gpu_memory_fraction: 0.75
   # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
   # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
   disable_overlap_scheduler: false
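
Net effect of the first file's two hunks: the KV cache may use 75% of free GPU memory instead of 40%, and the explicit cache_transceiver_config block with its max_num_tokens cap is gone. A minimal sketch of the resulting server sections, showing only the fields visible in the diff and assuming everything else in the file is unchanged:

context_servers:
  max_batch_size: 16
  enable_chunked_prefill: false
  kv_cache_config:
    free_gpu_memory_fraction: 0.75  # was 0.40
  # cache_transceiver_config (max_num_tokens: 10240) removed; the
  # transceiver presumably falls back to TensorRT-LLM defaults

generation_servers:
  max_num_tokens: 256
  max_batch_size: 256
  kv_cache_config:
    free_gpu_memory_fraction: 0.75  # was 0.40
  disable_overlap_scheduler: false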
Second config file:

@@ -30,11 +30,9 @@ context_servers:
   max_batch_size: 16
   enable_chunked_prefill: false
   kv_cache_config:
-    free_gpu_memory_fraction: 0.40
+    free_gpu_memory_fraction: 0.75
     event_buffer_max_size: 1024
     enable_block_reuse: true
-  cache_transceiver_config:
-    max_num_tokens: 10240
   # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
   # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
   # Overlap scheduler not currently supported in context-only
@@ -50,11 +48,9 @@ generation_servers:
   max_num_tokens: 256
   max_batch_size: 256
   kv_cache_config:
-    free_gpu_memory_fraction: 0.40
+    free_gpu_memory_fraction: 0.75
     event_buffer_max_size: 1024
     enable_block_reuse: true
-  cache_transceiver_config:
-    max_num_tokens: 256
   # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
   # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
   disable_overlap_scheduler: false
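
The second file gets the same treatment, differing only in that its kv_cache_config also carries KV-event settings. A sketch of its resulting kv_cache_config, under the same assumption that fields outside the diff are unchanged:

kv_cache_config:
  free_gpu_memory_fraction: 0.75  # was 0.40; fraction of free GPU memory given to the KV cache
  event_buffer_max_size: 1024     # unchanged
  enable_block_reuse: true        # unchanged
# cache_transceiver_config removed in both server sections
# (previously max_num_tokens: 10240 for context, 256 for generation)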