From 5a211b915ae98e8f12c517570802914b085bbb11 Mon Sep 17 00:00:00 2001
From: Kaiyu <26294424+kaiyux@users.noreply.github.com>
Date: Tue, 23 Apr 2024 23:15:27 -0700
Subject: [PATCH 1/2] Update TensorRT-LLM
---
.gitignore | 2 +
README.md | 460 +-
benchmarks/cpp/README.md | 10 +-
benchmarks/cpp/gptSessionBenchmark.cpp | 8 +
benchmarks/python/allowed_configs.py | 33 +
benchmarks/python/build.py | 46 +
.../tensorrt_llm/batch_manager/GptManager.h | 12 +-
.../batch_manager/kvCacheConfig.h | 2 +
.../batch_manager/kvCacheManager.h | 73 +-
.../tensorrt_llm/batch_manager/llmRequest.h | 16 +-
.../batch_manager/peftCacheManager.h | 25 +-
.../batch_manager/peftCacheManagerConfig.h | 1 +
.../batch_manager/schedulerPolicy.h | 2 +
.../batch_manager/trtGptModelOptionalParams.h | 6 +-
cpp/include/tensorrt_llm/common/mpiUtils.h | 29 +-
cpp/include/tensorrt_llm/executor/executor.h | 54 +-
.../tensorrt_llm/executor/serialization.h | 117 +
cpp/include/tensorrt_llm/executor/types.h | 16 +
.../tensorrt_llm/runtime/decodingMode.h | 38 +-
.../tensorrt_llm/runtime/decodingOutput.h | 61 +-
cpp/include/tensorrt_llm/runtime/gptDecoder.h | 2 +-
.../tensorrt_llm/runtime/gptDecoderBatch.h | 4 +-
.../tensorrt_llm/runtime/gptJsonConfig.h | 12 +-
cpp/include/tensorrt_llm/runtime/gptSession.h | 19 +-
cpp/include/tensorrt_llm/runtime/iBuffer.h | 7 +
.../runtime/iStatefulGptDecoder.h | 2 +-
cpp/include/tensorrt_llm/runtime/loraCache.h | 12 +-
.../{gptModelConfig.h => modelConfig.h} | 38 +-
.../tensorrt_llm/runtime/samplingConfig.h | 85 +-
.../runtime/utils/multiDeviceUtils.h | 0
cpp/tensorrt_llm/CMakeLists.txt | 55 +-
.../libtensorrt_llm_batch_manager_static.a | 4 +-
...sorrt_llm_batch_manager_static.pre_cxx11.a | 4 +-
.../aarch64-linux-gnu/version.txt | 6 +-
.../libtensorrt_llm_batch_manager_static.a | 4 +-
...sorrt_llm_batch_manager_static.pre_cxx11.a | 4 +-
.../tensorrt_llm_batch_manager_static.lib | 4 +-
cpp/tensorrt_llm/common/cudaDriverWrapper.cpp | 10 +
cpp/tensorrt_llm/common/cudaDriverWrapper.h | 9 +
cpp/tensorrt_llm/common/envUtils.cpp | 36 +-
cpp/tensorrt_llm/common/envUtils.h | 10 +
cpp/tensorrt_llm/common/mpiUtils.cpp | 28 +-
cpp/tensorrt_llm/common/workspace.h | 2 +-
.../cutlass_extensions/epilogue_helpers.h | 10 +-
.../gemm/kernel/default_fpA_intB_traits.h | 39 +-
.../gemm/kernel/fpA_intB_gemm.h | 4 +-
.../gemm/kernel/mixed_gemm_B_layout.h | 50 +-
.../gemm/kernel/moe_cutlass_kernel.h | 5 +-
.../threadblock/default_dq_mma_multistage.h | 37 +-
.../threadblock/default_dq_mma_pipelined.h | 70 +-
.../gemm/threadblock/default_mma.h | 67 +-
.../dq_mma_multistage_finegrained.h | 45 +-
.../threadblock/dq_mma_multistage_percol.h | 21 +-
.../gemm/threadblock/dq_mma_pipelined.h | 3 +-
.../gemm/warp/default_mma_tensor_op.h | 2 +-
.../warp/mma_tensorop_compute_B_with_f16.h | 13 +-
.../gemm/warp/mma_tensorop_dequantizer.h | 6 +-
.../include/cutlass_extensions/gemm_configs.h | 13 +
.../libtensorrt_llm_executor_static.a | 4 +-
...ibtensorrt_llm_executor_static.pre_cxx11.a | 4 +-
.../executor/aarch64-linux-gnu/version.txt | 6 +-
.../libtensorrt_llm_executor_static.a | 4 +-
...ibtensorrt_llm_executor_static.pre_cxx11.a | 4 +-
.../tensorrt_llm_executor_static.lib | 4 +-
.../executor_worker/CMakeLists.txt | 26 +
.../executor_worker/executorWorker.cpp | 80 +
cpp/tensorrt_llm/kernels/beamSearchKernels.cu | 15 +-
cpp/tensorrt_llm/kernels/beamSearchKernels.h | 68 +-
.../beamSearchKernelsTemplate.h | 632 +--
.../fmhaRunner.cpp | 3 -
.../kernels/cutlass_kernels/CMakeLists.txt | 115 +-
.../cutlass_kernels/cutlass_heuristic.cpp | 60 +-
.../cutlass_kernels/cutlass_heuristic.h | 5 +-
.../cutlass_kernels/cutlass_preprocessors.cpp | 134 +-
.../cutlass_kernels/cutlass_preprocessors.h | 17 +-
.../cutlass_kernels/cutlass_type_conversion.h | 2 +-
...e4m3_int4_gemm_fg_scalebias_f16_out_f16.cu | 2 +-
...e4m3_int4_gemm_fg_scaleonly_f16_out_f16.cu | 2 +-
.../e4m3_int4_gemm_per_col_f16_out_f16.cu | 2 +-
.../fpA_intB_gemm/fpA_intB_gemm_template.h | 220 +-
.../fpA_intB_gemm_template_sm90.h | 2 +-
.../launchers/fpA_intB_launcher_sm90.inl | 6 +-
.../int8_gemm/int8_gemm_template.h | 13 +-
.../launchers/moe_gemm_launcher_sm90.h | 36 +
.../launchers/moe_gemm_launcher_sm90.inl | 304 ++
.../moe_gemm/moe_gemm_kernels.h | 131 +-
.../moe_gemm/moe_gemm_kernels_fp8_fp8.cu | 25 +
.../moe_gemm/moe_gemm_kernels_template.h | 344 +-
.../moe_gemm/moe_gemm_kernels_template_sm90.h | 214 +
.../moe_gemm/moe_sm90_traits.h | 50 +
.../python/generate_kernels.py | 173 +-
...eam_1_kvt_e4m3_nqpkv_8_m_8_sm_90.cubin.cpp | 3400 +++---------
...m3_pagedKV_128_nqpkv_8_m_8_sm_90.cubin.cpp | 3586 +++---------
...4m3_pagedKV_64_nqpkv_8_m_8_sm_90.cubin.cpp | 3594 +++---------
...eam_1_kvt_e4m3_nqpkv_8_m_8_sm_90.cubin.cpp | 4576 ++++------------
...m3_pagedKV_128_nqpkv_8_m_8_sm_90.cubin.cpp | 4820 ++++-------------
...4m3_pagedKV_64_nqpkv_8_m_8_sm_90.cubin.cpp | 4820 ++++-------------
...eam_1_kvt_e4m3_nqpkv_8_m_8_sm_90.cubin.cpp | 2704 +++------
...m3_pagedKV_128_nqpkv_8_m_8_sm_90.cubin.cpp | 2894 +++-------
...4m3_pagedKV_64_nqpkv_8_m_8_sm_90.cubin.cpp | 2902 +++-------
...eam_1_kvt_e4m3_nqpkv_8_m_8_sm_90.cubin.cpp | 3340 ++++--------
...m3_pagedKV_128_nqpkv_8_m_8_sm_90.cubin.cpp | 3586 ++++--------
...4m3_pagedKV_64_nqpkv_8_m_8_sm_90.cubin.cpp | 3586 ++++--------
.../decoderXQAConstants.h | 14 +-
.../decoderXQAImpl.h | 6 +
.../decoderXQAImplPrecompiled.cpp | 187 +-
.../decoderXQARunner.cpp | 14 +-
cpp/tensorrt_llm/kernels/decodingKernels.cu | 104 +-
cpp/tensorrt_llm/kernels/kvCacheIndex.h | 56 +
cpp/tensorrt_llm/kernels/kvCacheUtils.h | 26 +-
cpp/tensorrt_llm/kernels/lookupKernels.cu | 30 +-
cpp/tensorrt_llm/kernels/lookupKernels.h | 4 +-
cpp/tensorrt_llm/kernels/mambaConv1dKernels.h | 16 +-
.../kernels/mixtureOfExperts/moe_kernels.cu | 592 +-
.../kernels/mixtureOfExperts/moe_kernels.h | 126 +-
.../parallelDecoding/kvCacheUpdateKernels.cu | 6 +-
cpp/tensorrt_llm/kernels/penaltyTypes.h | 14 -
.../kernels/weightOnlyBatchedGemv/details.h | 35 +-
.../kernels/weightOnlyBatchedGemv/kernel.h | 6 +-
.../weightOnlyBatchedGemv/kernelDispatcher.h | 6 +-
...atcherBf16Int4GroupwiseColumnMajorFalse.cu | 3 +-
...Int4GroupwiseColumnMajorInterleavedTrue.cu | 2 +-
...tcherBf16Int4PerChannelColumnMajorFalse.cu | 2 +-
...nt4PerChannelColumnMajorInterleavedTrue.cu | 2 +-
...tcherBf16Int8PerChannelColumnMajorFalse.cu | 2 +-
...nt8PerChannelColumnMajorInterleavedTrue.cu | 2 +-
...atcherFp16Int4GroupwiseColumnMajorFalse.cu | 3 +-
...Int4GroupwiseColumnMajorInterleavedTrue.cu | 5 +-
...tcherFp16Int4PerChannelColumnMajorFalse.cu | 2 +-
...nt4PerChannelColumnMajorInterleavedTrue.cu | 2 +-
...atcherFp16Int4PerChannelColumnMajorTrue.cu | 3 +-
...tcherFp16Int8PerChannelColumnMajorFalse.cu | 2 +-
...nt8PerChannelColumnMajorInterleavedTrue.cu | 2 +-
...atcherFp16Int8PerChannelColumnMajorTrue.cu | 3 +-
.../weightOnlyBatchedGemv/kernelLauncher.h | 13 +-
.../kernels/weightOnlyBatchedGemv/utility.h | 2 +-
cpp/tensorrt_llm/layers/beamSearchLayer.cu | 103 +-
.../layers/defaultDecodingParams.h | 114 +
.../layers/dynamicDecodeLayer.cpp | 41 +-
.../layers/medusaDecodingLayer.cpp | 6 +-
cpp/tensorrt_llm/layers/topKSamplingLayer.cu | 7 +-
cpp/tensorrt_llm/layers/topPSamplingLayer.cu | 21 +-
.../plugins/common/gemmPluginProfiler.cpp | 25 +-
.../plugins/common/gemmPluginProfiler.h | 2 +
.../gptAttentionCommon/gptAttentionCommon.cpp | 9 +-
.../plugins/lookupPlugin/lookupPlugin.cpp | 14 +-
.../mixtureOfExpertsPlugin.cpp | 219 +-
.../mixtureOfExperts/mixtureOfExpertsPlugin.h | 55 +-
.../weightOnlyGroupwiseQuantMatmulPlugin.cpp | 21 +-
.../weightOnlyGroupwiseQuantMatmulPlugin.h | 2 +-
cpp/tensorrt_llm/pybind/bindings.cpp | 93 +-
cpp/tensorrt_llm/pybind/executor/bindings.cpp | 56 +-
cpp/tensorrt_llm/pybind/executor/executor.h | 12 +-
cpp/tensorrt_llm/runtime/decodingOutput.cpp | 80 +-
cpp/tensorrt_llm/runtime/gptDecoder.cpp | 75 +-
cpp/tensorrt_llm/runtime/gptDecoderBatch.cpp | 4 +-
cpp/tensorrt_llm/runtime/gptJsonConfig.cpp | 64 +-
cpp/tensorrt_llm/runtime/gptSession.cpp | 4 +-
cpp/tensorrt_llm/runtime/loraCache.cpp | 8 +-
cpp/tensorrt_llm/runtime/loraManager.cpp | 15 +-
cpp/tensorrt_llm/runtime/loraManager.h | 16 +-
cpp/tensorrt_llm/runtime/loraUtils.cpp | 6 +-
cpp/tensorrt_llm/runtime/loraUtils.h | 4 +-
cpp/tensorrt_llm/runtime/runtimeBuffers.cpp | 16 +-
cpp/tensorrt_llm/runtime/runtimeBuffers.h | 20 +-
cpp/tensorrt_llm/runtime/runtimeKernels.cu | 8 +-
cpp/tensorrt_llm/runtime/runtimeKernels.h | 9 +-
cpp/tensorrt_llm/runtime/ssmStateBuffers.cpp | 14 +-
cpp/tensorrt_llm/runtime/ssmStateBuffers.h | 14 +-
.../runtime/statefulGptDecoder.cpp | 2 +-
cpp/tensorrt_llm/runtime/statefulGptDecoder.h | 2 +-
.../runtime/transformerBuffers.cpp | 32 +-
cpp/tensorrt_llm/runtime/transformerBuffers.h | 22 +-
cpp/tensorrt_llm/thop/dynamicDecodeOp.cpp | 36 +-
cpp/tensorrt_llm/thop/gatherTreeOp.cpp | 48 +-
.../thop/parallelDecodeKVCacheUpdateOp.cpp | 9 +-
cpp/tensorrt_llm/thop/weightOnlyQuantOp.cpp | 32 +-
cpp/tests/CMakeLists.txt | 4 +-
cpp/tests/kernels/mixtureOfExpertsTest.cu | 854 ++-
cpp/tests/kernels/shiftKCacheKernelTest.cu | 2 +-
cpp/tests/resources/.gitignore | 1 +
.../data/test_model_lora_config.json | 72 +
cpp/tests/resources/scripts/test_cpp.py | 16 +
cpp/tests/runtime/gptDecoderBatchTest.cpp | 17 +-
cpp/tests/runtime/gptDecoderTest.cpp | 5 +-
cpp/tests/runtime/gptSessionTest.cpp | 15 +-
cpp/tests/runtime/loraCacheTest.cpp | 14 +-
cpp/tests/runtime/loraManagerTest.cpp | 18 +-
cpp/tests/runtime/loraUtilsTest.cpp | 4 +-
cpp/tests/runtime/medusaModuleTest.cpp | 2 +-
cpp/tests/runtime/transposeKVKernelTest.cpp | 9 +-
.../2023-05-17-how-to-add-a-new-model.md | 17 -
.../batch-manager.md} | 10 +-
docs/source/advanced/expert-parallelism.md | 26 +
.../gpt-attention.md} | 61 +-
.../gpt-runtime.md} | 47 +-
docs/source/{ => advanced}/graph-rewriting.md | 8 +-
.../inference-request.md} | 23 +-
docs/source/{ => advanced}/lora.md | 41 +-
docs/source/architecture/add-model.md | 16 +
docs/source/{ => architecture}/checkpoint.md | 10 +-
.../core-concepts.md} | 80 +-
docs/source/architecture/overview.md | 18 +
docs/source/{ => architecture}/workflow.md | 0
docs/source/blogs/Falcon180B-H200.md | 8 +-
docs/source/blogs/H100vsA100.md | 4 +-
docs/source/blogs/H200launch.md | 4 +-
docs/source/blogs/XQA-kernel.md | 2 +-
docs/source/build_from_source.md | 216 -
docs/source/index.rst | 132 +-
.../installation/build-from-source-linux.md | 167 +
.../installation/build-from-source-windows.md | 182 +
docs/source/installation/linux.md | 33 +
docs/source/installation/windows.md | 65 +
docs/source/overview.md | 45 +
.../perf-analysis.md} | 4 +-
.../perf-best-practices.md} | 2 +
.../perf-overview.md} | 28 +-
.../source/python-api/tensorrt_llm.layers.rst | 2 +-
docs/source/quick-start-guide.md | 140 +
docs/source/{ => reference}/memory.md | 2 +
docs/source/{ => reference}/precision.md | 13 +-
docs/source/reference/support-matrix.md | 111 +
.../troubleshooting.md} | 86 +-
docs/source/release-notes.md | 229 +
examples/baichuan/README.md | 2 +-
examples/baichuan/convert_checkpoint.py | 3 +-
examples/baichuan/requirements.txt | 2 +-
examples/bloom/requirements.txt | 2 +-
examples/chatglm/requirements.txt | 2 +-
examples/cpp/executor/README.md | 7 +
.../cpp/executor/executorExampleAdvanced.cpp | 19 +
examples/dbrx/requirements.txt | 2 +-
examples/falcon/requirements.txt | 2 +-
examples/gemma/README.md | 15 +-
examples/gemma/convert_checkpoint.py | 28 +-
examples/gemma/requirements.txt | 2 +-
examples/gpt/convert_checkpoint.py | 48 +-
examples/gpt/requirements.txt | 2 +-
examples/gptneox/convert_checkpoint.py | 3 +-
examples/gptneox/requirements.txt | 2 +-
examples/high-level-api/requirements.txt | 2 +-
examples/internlm/requirements.txt | 2 +-
examples/llama/requirements.txt | 2 +-
examples/mamba/README.md | 4 +-
examples/mamba/convert_checkpoint.py | 5 +-
examples/mamba/requirements.txt | 2 +-
examples/medusa/requirements.txt | 2 +-
examples/mixtral/README.md | 95 +-
examples/mixtral/requirements.txt | 2 +-
examples/mpt/requirements.txt | 2 +-
examples/opt/requirements.txt | 2 +-
examples/phi/requirements.txt | 2 +-
examples/quantization/README.md | 7 +-
examples/quantization/requirements.txt | 2 +-
examples/qwen/README.md | 30 +-
examples/qwen/convert_checkpoint.py | 7 +
examples/qwen/requirements.txt | 2 +-
examples/qwenvl/requirements.txt | 2 +-
examples/skywork/requirements.txt | 2 +-
examples/smaug/requirements.txt | 2 +-
examples/summarize.py | 12 +-
examples/utils.py | 12 +-
examples/whisper/requirements.txt | 2 +-
requirements-windows.txt | 4 +-
requirements.txt | 6 +-
scripts/build_wheel.py | 10 +-
tensorrt_llm/__init__.py | 4 +-
tensorrt_llm/_utils.py | 25 +
tensorrt_llm/auto_parallel/cluster_info.py | 23 +-
tensorrt_llm/auto_parallel/utils.py | 2 +-
tensorrt_llm/builder.py | 41 +-
tensorrt_llm/commands/build.py | 16 +
tensorrt_llm/executor.py | 449 +-
tensorrt_llm/functional.py | 425 +-
tensorrt_llm/hlapi/_perf_evaluator.py | 8 +-
tensorrt_llm/hlapi/llm.py | 13 +-
tensorrt_llm/hlapi/utils.py | 4 +
tensorrt_llm/layers/moe.py | 228 +-
tensorrt_llm/models/gemma/model.py | 19 +-
tensorrt_llm/models/gemma/smoothquant.py | 7 +-
tensorrt_llm/models/gemma/weight.py | 41 +-
tensorrt_llm/models/llama/convert.py | 24 +-
tensorrt_llm/models/llama/model.py | 1 +
tensorrt_llm/models/llama/weight.py | 19 +-
tensorrt_llm/models/modeling_utils.py | 15 +-
tensorrt_llm/models/qwen/convert.py | 364 +-
tensorrt_llm/models/qwen/model.py | 5 +-
tensorrt_llm/models/qwen/utils.py | 31 +
tensorrt_llm/models/qwen/weight.py | 80 +-
tensorrt_llm/network.py | 20 +-
tensorrt_llm/quantization/quantize.py | 8 +-
tensorrt_llm/quantization/quantize_by_ammo.py | 55 +-
tensorrt_llm/runtime/generation.py | 5 +-
tensorrt_llm/tools/onnx_utils.py | 78 +
tensorrt_llm/version.py | 2 +-
tests/bindings/test_bindings.py | 131 +-
tests/bindings/test_executor_bindings.py | 67 +
tests/bindings/test_gpt_session.py | 2 +-
tests/functional/test_argmax.py | 68 +
tests/functional/test_embedding_single_gpu.py | 4 -
tests/functional/test_gather_nd.py | 242 +
tests/functional/test_logsoftmax.py | 177 +
tests/functional/test_moe.py | 573 +-
tests/functional/test_nonzero.py | 79 +
tests/functional/test_sample.py | 122 +
tests/functional/test_scatter.py | 114 +
tests/functional/test_selective_scan.py | 22 +-
tests/functional/test_squeeze.py | 68 +
tests/functional/test_unsqueeze.py | 68 +
tests/functional/test_where.py | 106 +
tests/hlapi/grid_searcher.py | 45 +-
tests/hlapi/hlapi_evaluator.py | 178 +
tests/hlapi/test_executor.py | 40 +-
...test_weight_only_groupwise_quant_matmul.py | 32 +-
tests/utils/util.py | 35 +-
windows/README.md | 316 +-
windows/setup_env.ps1 | 39 +-
318 files changed, 21365 insertions(+), 37303 deletions(-)
create mode 100644 cpp/include/tensorrt_llm/executor/serialization.h
rename cpp/include/tensorrt_llm/runtime/{gptModelConfig.h => modelConfig.h} (91%)
rename cpp/{ => include}/tensorrt_llm/runtime/utils/multiDeviceUtils.h (100%)
create mode 100644 cpp/tensorrt_llm/executor_worker/CMakeLists.txt
create mode 100644 cpp/tensorrt_llm/executor_worker/executorWorker.cpp
create mode 100644 cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_launcher_sm90.h
create mode 100644 cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_launcher_sm90.inl
create mode 100644 cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp8_fp8.cu
create mode 100644 cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_template_sm90.h
create mode 100644 cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_sm90_traits.h
create mode 100644 cpp/tensorrt_llm/kernels/kvCacheIndex.h
create mode 100644 cpp/tensorrt_llm/layers/defaultDecodingParams.h
create mode 100644 cpp/tests/resources/data/test_model_lora_config.json
delete mode 100644 docs/source/2023-05-17-how-to-add-a-new-model.md
rename docs/source/{batch_manager.md => advanced/batch-manager.md} (97%)
create mode 100644 docs/source/advanced/expert-parallelism.md
rename docs/source/{gpt_attention.md => advanced/gpt-attention.md} (88%)
rename docs/source/{gpt_runtime.md => advanced/gpt-runtime.md} (83%)
rename docs/source/{ => advanced}/graph-rewriting.md (92%)
rename docs/source/{inference_request.md => advanced/inference-request.md} (61%)
rename docs/source/{ => advanced}/lora.md (68%)
create mode 100644 docs/source/architecture/add-model.md
rename docs/source/{ => architecture}/checkpoint.md (93%)
rename docs/source/{architecture.md => architecture/core-concepts.md} (81%)
create mode 100644 docs/source/architecture/overview.md
rename docs/source/{ => architecture}/workflow.md (100%)
delete mode 100644 docs/source/build_from_source.md
create mode 100644 docs/source/installation/build-from-source-linux.md
create mode 100644 docs/source/installation/build-from-source-windows.md
create mode 100644 docs/source/installation/linux.md
create mode 100644 docs/source/installation/windows.md
create mode 100644 docs/source/overview.md
rename docs/source/{performance_analysis.md => performance/perf-analysis.md} (98%)
rename docs/source/{perf_best_practices.md => performance/perf-best-practices.md} (99%)
rename docs/source/{performance.md => performance/perf-overview.md} (92%)
create mode 100644 docs/source/quick-start-guide.md
rename docs/source/{ => reference}/memory.md (99%)
rename docs/source/{ => reference}/precision.md (96%)
create mode 100644 docs/source/reference/support-matrix.md
rename docs/source/{2023-05-19-how-to-debug.md => reference/troubleshooting.md} (67%)
create mode 100644 docs/source/release-notes.md
create mode 100644 tensorrt_llm/tools/onnx_utils.py
create mode 100644 tests/functional/test_argmax.py
create mode 100644 tests/functional/test_gather_nd.py
create mode 100644 tests/functional/test_logsoftmax.py
create mode 100644 tests/functional/test_nonzero.py
create mode 100644 tests/functional/test_sample.py
create mode 100644 tests/functional/test_scatter.py
create mode 100644 tests/functional/test_squeeze.py
create mode 100644 tests/functional/test_unsqueeze.py
create mode 100644 tests/functional/test_where.py
create mode 100644 tests/hlapi/hlapi_evaluator.py
diff --git a/.gitignore b/.gitignore
index cb9aee85b..15e677c07 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,6 +32,8 @@ cpp/.ccache/
tensorrt_llm/libs
tensorrt_llm/bindings.pyi
tensorrt_llm/bindings/*.pyi
+*docs/cpp_docs*
+*docs/source/_cpp_gen*
# Testing
.coverage.*
diff --git a/README.md b/README.md
index 7342b3b96..774dc59d8 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ TensorRT-LLM
[](./setup.py)
[](./LICENSE)
-[Architecture](./docs/source/architecture.md) | [Results](./docs/source/performance.md) | [Examples](./examples/) | [Documentation](./docs/source/)
+[Architecture](./docs/source/architecture/overview.md) | [Results](./docs/source/performance/perf-overview.md) | [Examples](./examples/) | [Documentation](./docs/source/)
---
@@ -29,42 +29,13 @@ TensorRT-LLM
* [2023/10/17] [Large Language Models up to 4x Faster on RTX With TensorRT-LLM for Windows
](https://blogs.nvidia.com/blog/2023/10/17/tensorrt-llm-windows-stable-diffusion-rtx/)
-
-## Table of Contents
-
-- [TensorRT-LLM](#tensorrt-llm)
- - [Latest News](#latest-news)
- - [Table of Contents](#table-of-contents)
- - [TensorRT-LLM Overview](#tensorrt-llm-overview)
- - [Installation](#installation)
- - [Quick Start](#quick-start)
- - [Support Matrix](#support-matrix)
- - [Devices](#devices)
- - [Precision](#precision)
- - [Key Features](#key-features)
- - [Models](#models)
- - [Performance](#performance)
- - [Advanced Topics](#advanced-topics)
- - [Quantization](#quantization)
- - [In-flight Batching](#in-flight-batching)
- - [Attention](#attention)
- - [Graph Rewriting](#graph-rewriting)
- - [Benchmark](#benchmark)
- - [Troubleshooting](#troubleshooting)
- - [Release notes](#release-notes)
- - [Change Log](#change-log)
- - [Versions 0.9.0](#versions-090)
- - [For history change log, please see CHANGELOG.md.](#for-history-change-log-please-see-changelogmd)
- - [Known Issues](#known-issues)
- - [Report Issues](#report-issues)
-
## TensorRT-LLM Overview
-TensorRT-LLM provides users with an easy-to-use Python API to define Large
+TensorRT-LLM is an easy-to-use Python API to define Large
Language Models (LLMs) and build
[TensorRT](https://developer.nvidia.com/tensorrt) engines that contain
state-of-the-art optimizations to perform inference efficiently on NVIDIA GPUs.
-TensorRT-LLM also contains components to create Python and C++ runtimes that
+TensorRT-LLM contains components to create Python and C++ runtimes that
execute those TensorRT engines. It also includes a
[backend](https://github.com/triton-inference-server/tensorrtllm_backend)
for integration with the
@@ -76,8 +47,8 @@ multiple nodes with multiple GPUs (using
and/or
[Pipeline Parallelism](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/nlp/nemo_megatron/parallelisms.html#pipeline-parallelism)).
-The Python API of TensorRT-LLM is architectured to look similar to the
-[PyTorch](https://pytorch.org) API. It provides users with a
+The TensorRT-LLM Python API architecture looks similar to the
+[PyTorch](https://pytorch.org) API. It provides a
[functional](./tensorrt_llm/functional.py) module containing functions like
`einsum`, `softmax`, `matmul` or `view`. The [layers](./tensorrt_llm/layers)
module bundles useful building blocks to assemble LLMs; like an `Attention`
@@ -86,422 +57,21 @@ like `GPTAttention` or `BertAttention`, can be found in the
[models](./tensorrt_llm/models) module.
TensorRT-LLM comes with several popular models pre-defined. They can easily be
-modified and extended to fit custom needs. See below for a list of supported
-[models](#Models).
+modified and extended to fit custom needs. Refer to the [Support Matrix](https://nvidia.github.io/TensorRT-LLM/reference/support-matrix.html) for a list of supported models.
To maximize performance and reduce memory footprint, TensorRT-LLM allows the
-models to be executed using different quantization modes (see
-[`examples/gpt`](./examples/gpt) for concrete examples). TensorRT-LLM supports
+models to be executed using different quantization modes (refer to
+[`support matrix`](https://nvidia.github.io/TensorRT-LLM/reference/support-matrix.html#software)). TensorRT-LLM supports
INT4 or INT8 weights (and FP16 activations; a.k.a. INT4/INT8 weight-only) as
well as a complete implementation of the
[SmoothQuant](https://arxiv.org/abs/2211.10438) technique.
-For a more detailed presentation of the software architecture and the key
-concepts used in TensorRT-LLM, we recommend you to read the following
-[document](./docs/source/architecture.md).
-
-## Installation
-
-After installing the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit),
-please run the following commands to install TensorRT-LLM for x86_64 users.
-
-```bash
-# Obtain and start the basic docker image environment.
-docker run --rm --runtime=nvidia --gpus all --entrypoint /bin/bash -it nvidia/cuda:12.1.0-devel-ubuntu22.04
-
-# Install dependencies, TensorRT-LLM requires Python 3.10
-apt-get update && apt-get -y install python3.10 python3-pip openmpi-bin libopenmpi-dev
-
-# Install the latest preview version (corresponding to the main branch) of TensorRT-LLM.
-# If you want to install the stable version (corresponding to the release branch), please
-# remove the `--pre` option.
-pip3 install tensorrt_llm -U --pre --extra-index-url https://pypi.nvidia.com
-
-# Check installation
-python3 -c "import tensorrt_llm"
-```
-
-For developers who have the best performance requirements, debugging needs, or use the aarch64 architecture,
-please refer to the instructions for [building from source code](docs/source/build_from_source.md).
-
-For Windows installation, see [`Windows`](windows/README.md).
-
-## Quick Start
-
-Please be sure to complete the [installation steps](#installation) before proceeding with the following steps.
-
-To create a TensorRT engine for an existing model, there are 3 steps:
-
-1. Download pre-trained weights,
-2. Build a fully-optimized engine of the model,
-3. Deploy the engine, in other words, run the fully-optimized model.
-
-The following sections show how to use TensorRT-LLM to run the
-[BLOOM-560m](https://huggingface.co/bigscience/bloom-560m) model.
-
-***0. In the BLOOM folder***
-
-Inside the Docker container, you have to install the requirements:
-
-```bash
-pip install -r examples/bloom/requirements.txt
-git lfs install
-```
-
-***1. Download the model weights from HuggingFace***
-
-From the BLOOM example folder, you must download the weights of the model.
-
-```bash
-cd examples/bloom
-rm -rf ./bloom/560M
-mkdir -p ./bloom/560M && git clone https://huggingface.co/bigscience/bloom-560m ./bloom/560M
-
-```
-***2. Build the engine***
-
-```bash
-# Single GPU on BLOOM 560M
-python convert_checkpoint.py --model_dir ./bloom/560M/ \
- --dtype float16 \
- --output_dir ./bloom/560M/trt_ckpt/fp16/1-gpu/
-# May need to add trtllm-build to PATH, export PATH=/usr/local/bin:$PATH
-trtllm-build --checkpoint_dir ./bloom/560M/trt_ckpt/fp16/1-gpu/ \
- --gemm_plugin float16 \
- --output_dir ./bloom/560M/trt_engines/fp16/1-gpu/
-```
-
-See the BLOOM [example](examples/bloom) for more details and options regarding the `trtllm-build` command.
-
-***3. Run***
-
-The `../summarize.py` script can be used to perform the summarization of articles
-from the CNN Daily dataset:
-
-```bash
-python ../summarize.py --test_trt_llm \
- --hf_model_dir ./bloom/560M/ \
- --data_type fp16 \
- --engine_dir ./bloom/560M/trt_engines/fp16/1-gpu/
-```
-
-More details about the script and how to run the BLOOM model can be found in
-the example [folder](examples/bloom). Many more [models](#models) than BLOOM
-are implemented in TensorRT-LLM. They can be found in the
-[examples](./examples/) directory.
-
-Beyond local execution, you can also use the NVIDIA Triton Inference Server to create a production-ready deployment of your LLM as described in this [blog](https://developer.nvidia.com/blog/optimizing-inference-on-llms-with-tensorrt-llm-now-publicly-available/).
-
-## Support Matrix
-
-TensorRT-LLM optimizes the performance of a range of well-known models on
-NVIDIA GPUs. The following sections provide a list of supported GPU
-architectures as well as important features implemented in TensorRT-LLM.
-
-### Devices
-
-TensorRT-LLM supports the following architectures:
-
-* [NVIDIA Hopper](https://www.nvidia.com/en-us/data-center/technologies/hopper-architecture/) (SM90), for example, H200, H100, H20
-* [NVIDIA Ada Lovelace](https://www.nvidia.com/en-us/geforce/ada-lovelace-architecture/) (SM89), for example, L40S, L20, L4
-* [NVIDIA Ampere](https://www.nvidia.com/en-us/data-center/ampere-architecture/) (SM80, SM86), for example, A100, A30, A10G
-* [NVIDIA Turing](https://www.nvidia.com/en-us/geforce/turing/) (SM75), for example, T4
-* [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) (SM70 - experimental), for example, V100
-
-
-It is important to note that TensorRT-LLM is expected to work on all GPUs based on the Volta, Turing, Ampere, Hopper, and Ada Lovelace architectures. Certain limitations may apply.
-
-### Precision
-
-Various numerical precisions are supported in TensorRT-LLM. The support for
-some of those numerical features require specific architectures:
-
-| | FP32 | FP16 | BF16 | FP8 | INT8 | INT4 |
-| :------------------ | :--- | :--- | :--- | :--- | :---- | :---- |
-| Volta (SM70) | Y | Y | N | N | Y (1) | Y (2) |
-| Turing (SM75) | Y | Y | N | N | Y (1) | Y (2) |
-| Ampere (SM80, SM86) | Y | Y | Y | N | Y | Y (3) |
-| Ada-Lovelace (SM89) | Y | Y | Y | Y | Y | Y |
-| Hopper (SM90) | Y | Y | Y | Y | Y | Y |
-
-(1) INT8 SmoothQuant is not supported on SM70 and SM75.
-(2) INT4 AWQ and GPTQ are not supported on SM < 80.
-(3) INT4 AWQ and GPTQ with FP8 activations require SM >= 89.
-
-In this release of TensorRT-LLM, the support for FP8 and quantized data types
-(INT8 or INT4) is not implemented for all the models. See the
-[precision](./docs/source/precision.md) document and the
-[examples](./examples/.) folder for additional details.
-
-### Key Features
-
-TensorRT-LLM contains examples that implement the following features.
-
-* Multi-head Attention([MHA](https://arxiv.org/abs/1706.03762))
-* Multi-query Attention ([MQA](https://arxiv.org/abs/1911.02150))
-* Group-query Attention([GQA](https://arxiv.org/abs/2307.09288))
-* In-flight Batching
-* Paged KV Cache for the Attention
-* Tensor Parallelism
-* Pipeline Parallelism
-* INT4/INT8 Weight-Only Quantization (W4A16 & W8A16)
-* [SmoothQuant](https://arxiv.org/abs/2211.10438)
-* [GPTQ](https://arxiv.org/abs/2210.17323)
-* [AWQ](https://arxiv.org/abs/2306.00978)
-* [FP8](https://arxiv.org/abs/2209.05433)
-* Greedy-search
-* Beam-search
-* RoPE
-
-In this release of TensorRT-LLM, some of the features are not enabled for all
-the models listed in the [examples](examples/.) folder.
-
-### Models
-
-The list of supported models is:
-
-* [Baichuan](examples/baichuan)
-* [BART](examples/enc_dec)
-* [BERT](examples/bert)
-* [Blip2](examples/blip2)
-* [BLOOM](examples/bloom)
-* [ChatGLM](examples/chatglm)
-* [FairSeq NMT](examples/enc_dec/nmt)
-* [Falcon](examples/falcon)
-* [Flan-T5](examples/enc_dec)
-* [GPT](examples/gpt)
-* [GPT-J](examples/gptj)
-* [GPT-Nemo](examples/gpt)
-* [GPT-NeoX](examples/gptneox)
-* [InternLM](examples/internlm)
-* [LLaMA](examples/llama)
-* [LLaMA-v2](examples/llama)
-* [Mamba](examples/mamba)
-* [mBART](examples/enc_dec)
-* [Medusa](examples/medusa)
-* [Mistral](examples/llama#mistral-v01)
-* [MPT](examples/mpt)
-* [mT5](examples/enc_dec)
-* [OPT](examples/opt)
-* [Phi-1.5/Phi-2](examples/phi)
-* [Qwen](examples/qwen)
-* [Replit Code](examples/mpt)
-* [RoBERTa](examples/bert)
-* [SantaCoder](examples/gpt)
-* [StarCoder1/StarCoder2](examples/gpt)
-* [T5](examples/enc_dec)
-* [Whisper](examples/whisper)
-
-Note: [Encoder-Decoder](examples/enc_dec/) provides general encoder-decoder
-functionality that supports many encoder-decoder models such as T5 family, BART family, Whisper family, NMT family, etc. We
-unroll the exact model names in the list above to let users find specific
-models easier.
-
-The list of supported multi-modal models is:
-
-* [BLIP2 w/ OPT-2.7B](examples/multimodal)
-* [BLIP2 w/ T5-XL](examples/multimodal)
-* [LLaVA-v1.5-7B](examples/multimodal)
-* [Nougat family](examples/multimodal) Nougat-small, Nougat-base
-
-Note: Multi-modal provides general multi-modal functionality that supports many multi-modal architectures such as BLIP family, LLaVA family, etc. We unroll the exact model names in the list above to let users find specific models easier.
-
-## Performance
-
-Please refer to the [performance](./docs/source/performance.md) page for
-performance numbers. That page contains measured numbers for four variants of
-popular models (GPT-J, LLAMA-7B, LLAMA-70B, Falcon-180B), measured on the H100,
-L40S and A100 GPU(s).
-
-## Advanced Topics
-
-### Quantization
-
-This [document](./docs/source/precision.md) describes the different
-quantization methods implemented in TensorRT-LLM and contains a support matrix
-for the different models.
-
-### In-flight Batching
-
-TensorRT-LLM supports in-flight batching of requests (also known as continuous
-batching or iteration-level batching). It's a
-[technique](./docs/source/batch_manager.md) that aims at reducing wait
-times in queues, eliminating the need for padding requests and allowing for
-higher GPU utilization.
-
-### Attention
-
-TensorRT-LLM implements several variants of the Attention mechanism that
-appears in most the Large Language Models. This
-[document](./docs/source/gpt_attention.md) summarizes those implementations and
-how they are optimized in TensorRT-LLM.
-
-### Graph Rewriting
-
-TensorRT-LLM uses a declarative approach to define neural networks and contains
-techniques to optimize the underlying graph. For more details, please refer to
-[doc](./docs/source/graph-rewriting.md)
-
-### Benchmark
-
-TensorRT-LLM provides [C++](./benchmarks/cpp/README.md) and
-[Python](./benchmarks/python/README.md) tools to perform benchmarking. Note,
-however, that it is recommended to use the C++ version.
-
-## Troubleshooting
-
-* If you encounter accuracy issues in the generated text, you may want to increase
- the internal precision in the attention layer. For that, pass the `--context_fmha_fp32_acc enable` to
- `trtllm-build`.
-
-* It's recommended to add options `–shm-size=1g –ulimit memlock=-1` to the
- docker or nvidia-docker run command. Otherwise you may see NCCL errors when
- running multiple GPU inferences. See
- https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/troubleshooting.html#errors
- for details.
-
-* When building models, memory-related issues such as
-```
-[09/23/2023-03:13:00] [TRT] [E] 9: GPTLMHeadModel/layers/0/attention/qkv/PLUGIN_V2_Gemm_0: could not find any supported formats consistent with input/output data types
-[09/23/2023-03:13:00] [TRT] [E] 9: [pluginV2Builder.cpp::reportPluginError::24] Error Code 9: Internal Error (GPTLMHeadModel/layers/0/attention/qkv/PLUGIN_V2_Gemm_0: could not find any supported formats consistent with input/output data types)
-```
-may happen. One possible solution is to reduce the amount of memory needed by
-reducing the maximum batch size, input and output lengths. Another option is to
-enable plugins, for example: `--gpt_attention_plugin`.
-
-* MPI + Slurm
-
-TensorRT-LLM is a
-[MPI](https://en.wikipedia.org/wiki/Message_Passing_Interface)-aware package
-that uses [`mpi4py`](https://mpi4py.readthedocs.io/en/stable/). If you are
-running scripts in a [Slurm](https://slurm.schedmd.com/) environment, you might
-encounter interferences:
-```
---------------------------------------------------------------------------
-PMI2_Init failed to initialize. Return code: 14
---------------------------------------------------------------------------
---------------------------------------------------------------------------
-The application appears to have been direct launched using "srun",
-but OMPI was not built with SLURM's PMI support and therefore cannot
-execute. There are several options for building PMI support under
-SLURM, depending upon the SLURM version you are using:
-
- version 16.05 or later: you can use SLURM's PMIx support. This
- requires that you configure and build SLURM --with-pmix.
-
- Versions earlier than 16.05: you must use either SLURM's PMI-1 or
- PMI-2 support. SLURM builds PMI-1 by default, or you can manually
- install PMI-2. You must then build Open MPI using --with-pmi pointing
- to the SLURM PMI library location.
-
-Please configure as appropriate and try again.
---------------------------------------------------------------------------
-```
-As a rule of thumb, if you are running TensorRT-LLM interactively on a Slurm
-node, prefix your commands with `mpirun -n 1` to run TensorRT-LLM in a
-dedicated MPI environment, not the one provided by your Slurm allocation.
-
-For example: `mpirun -n 1 python3 examples/run.py ...`
-
-## Release notes
-
- * TensorRT-LLM requires TensorRT 9.3 and 24.02 containers.
-
-### Change Log
-
-#### Versions 0.9.0
-
-* Model Support
- - Support distil-whisper, thanks to the contribution from @Bhuvanesh09 in PR #1061
- - Support HuggingFace StarCoder2
- - Support VILA
- - Support Smaug-72B-v0.1
- - Migrate BLIP-2 examples to `examples/multimodal`
-* Features
- - Add support to context chunking to work with KV cache reuse
- - Enable different rewind tokens per sequence for Medusa
- - BART LoRA support (limited to the Python runtime)
- - Enable multi-LoRA for BART LoRA
- - Support `early_stopping=False` in beam search for C++ Runtime
- - Add logits post processor to the batch manager (see docs/source/batch_manager.md#logits-post-processor-optional)
- - Support import and convert HuggingFace Gemma checkpoints, thanks for the contribution from @mfuntowicz in #1147
- - Support loading Gemma from HuggingFace
- - Support auto parallelism planner for high-level API and unified builder workflow
- - Support run `GptSession` without OpenMPI #1220
- - [BREAKING CHANGE] TopP sampling optimization with deterministic AIR TopP algorithm is enabled by default
- - Medusa IFB support
- - [Experimental] Support FP8 FMHA, note that the performance is not optimal, and we will keep optimizing it
- - [BREAKING CHANGE] Support embedding sharing for Gemma
- - More head sizes support for LLaMA-like models
- - Ampere (sm80, sm86), Ada (sm89), Hopper(sm90) all support head sizes [32, 40, 64, 80, 96, 104, 128, 160, 256] now.
- - OOTB functionality support
- - T5
- - Mixtral 8x7B
-* API
- - C++ `executor` API
- - Add Python bindings, see documentation and examples in `examples/bindings`
- - Add advanced and multi-GPU examples for Python binding of `executor` C++ API, see `examples/bindings/README.md`
- - Add documents for C++ `executor` API, see `docs/source/executor.md`
- - High-level API (refer to `examples/high-level-api/README.md` for guidance)
- - [BREAKING CHANGE] Reuse the `QuantConfig` used in `trtllm-build` tool, support broader quantization features
- - Support in `LLM()` API to accept engines built by `trtllm-build` command
- - Add support for TensorRT-LLM checkpoint as model input
- - Refine `SamplingConfig` used in `LLM.generate` or `LLM.generate_async` APIs, with the support of beam search, a variety of penalties, and more features
- - Add support for the StreamingLLM feature, enable it by setting `LLM(streaming_llm=...)`
- - Migrate Mixtral to high level API and unified builder workflow
- - [BREAKING CHANGE] Refactored Qwen model to the unified build workflow, see `examples/qwen/README.md` for the latest commands
- - [BREAKING CHANGE] Move LLaMA convert checkpoint script from examples directory into the core library
- - [BREAKING CHANGE] Refactor GPT with unified building workflow, see `examples/gpt/README.md` for the latest commands
- - [BREAKING CHANGE] Removed all the lora related flags from convert_checkpoint.py script and the checkpoint content to `trtllm-build` command, to generalize the feature better to more models
- - [BREAKING CHANGE] Removed the use_prompt_tuning flag and options from convert_checkpoint.py script and the checkpoint content, to generalize the feature better to more models. Use the `trtllm-build --max_prompt_embedding_table_size` instead.
- - [BREAKING CHANGE] Changed the `trtllm-build --world_size` flag to `--auto_parallel` flag, the option is used for auto parallel planner only.
- - [BREAKING CHANGE] `AsyncLLMEngine` is removed, `tensorrt_llm.GenerationExecutor` class is refactored to work with both explicitly launching with `mpirun` in the application level, and accept an MPI communicator created by `mpi4py`
- - [BREAKING CHANGE] `examples/server` are removed, see `examples/app` instead.
- - [BREAKING CHANGE] Remove LoRA related parameters from convert checkpoint scripts
- - [BREAKING CHANGE] Simplify Qwen convert checkpoint script
- - [BREAKING CHANGE] Remove `model` parameter from `gptManagerBenchmark` and `gptSessionBenchmark`
-* Bug fixes
- - Fix a weight-only quant bug for Whisper to make sure that the `encoder_input_len_range` is not 0, thanks to the contribution from @Eddie-Wang1120 in #992
- - Fix the issue that log probabilities in Python runtime are not returned #983
- - Multi-GPU fixes for multimodal examples #1003
- - Fix wrong `end_id` issue for Qwen #987
- - Fix a non-stopping generation issue #1118 #1123
- - Fix wrong link in examples/mixtral/README.md #1181
- - Fix LLaMA2-7B bad results when int8 kv cache and per-channel int8 weight only are enabled #967
- - Fix wrong `head_size` when importing Gemma model from HuggingFace Hub, thanks for the contribution from @mfuntowicz in #1148
- - Fix ChatGLM2-6B building failure on INT8 #1239
- - Fix wrong relative path in Baichuan documentation #1242
- - Fix wrong `SamplingConfig` tensors in `ModelRunnerCpp` #1183
- - Fix error when converting SmoothQuant LLaMA #1267
- - Fix the issue that `examples/run.py` only load one line from `--input_file`
- - Fix the issue that `ModelRunnerCpp` does not transfer `SamplingConfig` tensor fields correctly #1183
-* Benchmark
- - Add emulated static batching in `gptManagerBenchmark`
- - Support arbitrary dataset from HuggingFace for C++ benchmarks, see “Prepare dataset” section in `benchmarks/cpp/README.md`
- - Add percentile latency report to `gptManagerBenchmark`
-* Performance
- - Optimize `gptDecoderBatch` to support batched sampling
- - Enable FMHA for models in BART, Whisper and NMT family
- - Remove router tensor parallelism to improve performance for MoE models, thanks to the contribution from @megha95 in #1091
- - Improve custom all-reduce kernel
-* Infra
- - Base Docker image for TensorRT-LLM is updated to `nvcr.io/nvidia/pytorch:24.02-py3`
- - Base Docker image for TensorRT-LLM backend is updated to `nvcr.io/nvidia/tritonserver:24.02-py3`
- - The dependent TensorRT version is updated to 9.3
- - The dependent PyTorch version is updated to 2.2
- - The dependent CUDA version is updated to 12.3.2 (a.k.a. 12.3 Update 2)
-
-#### For history change log, please see [CHANGELOG.md](./CHANGELOG.md).
-
-### Known Issues
-
- * On windows, running context FMHA plugin with FP16 accumulation on LLaMA, Mistral and Phi models suffers from poor accuracy and the resulting inference output may be garbled. The suggestion to workaround these is to enable FP32 accumulation when building the models, i.e. passing the options `--context_fmha disable --context_fmha_fp32_acc enable` to `trtllm-build` command as a work-around, and this should be fixed in the next version
-
- * The hang reported in issue
- [#149](https://github.com/triton-inference-server/tensorrtllm_backend/issues/149)
- has not been reproduced by the TensorRT-LLM team. If it is caused by a bug
- in TensorRT-LLM, that bug may be present in that release
+## Getting Started
-### Report Issues
+To get started with TensorRT-LLM, visit our documentation:
-You can use GitHub issues to report issues with TensorRT-LLM.
+- [Quick Start Guide](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html)
+- [Release Notes](https://nvidia.github.io/TensorRT-LLM/release-notes.html)
+- [Installation Guide for Linux](https://nvidia.github.io/TensorRT-LLM/installation/linux.html)
+- [Installation Guide for Windows](https://nvidia.github.io/TensorRT-LLM/installation/windows.html)
+- [Supported Hardware, Models, and other Software](https://nvidia.github.io/TensorRT-LLM/reference/support-matrix.html)
diff --git a/benchmarks/cpp/README.md b/benchmarks/cpp/README.md
index 4ff6e7aaf..8408b08e0 100644
--- a/benchmarks/cpp/README.md
+++ b/benchmarks/cpp/README.md
@@ -225,9 +225,7 @@ python examples/llama/convert_checkpoint.py --model_dir ${MODEL_CHECKPOINT} \
--output_dir ${CONVERTED_CHECKPOINT} \
--dtype ${DTYPE} \
--tp_size ${TP} \
- --pp_size 1 \
- --lora_target_modules attn_qkv \
- --max_lora_rank ${MAX_LORA_RANK}
+ --pp_size 1
${HOME}/.local/bin/trtllm-build \
--checkpoint_dir ${CONVERTED_CHECKPOINT} \
@@ -235,13 +233,11 @@ ${HOME}/.local/bin/trtllm-build \
--max_batch_size ${MAX_BATCH} \
--max_input_len $MAX_LEN \
--max_output_len $MAX_LEN \
- --gpt_attention_plugin float16 \
- --paged_kv_cache enable \
- --remove_input_padding enable \
--gemm_plugin float16 \
--lora_plugin float16 \
--use_paged_context_fmha enable \
- --use_custom_all_reduce disable
+ --lora_target_modules attn_qkv \
+ --max_lora_rank ${MAX_LORA_RANK}
NUM_LORAS=(8 16 24 32 64 128 256)
NUM_REQUESTS=1024
diff --git a/benchmarks/cpp/gptSessionBenchmark.cpp b/benchmarks/cpp/gptSessionBenchmark.cpp
index af1ca03d7..bae5d2bcd 100644
--- a/benchmarks/cpp/gptSessionBenchmark.cpp
+++ b/benchmarks/cpp/gptSessionBenchmark.cpp
@@ -14,6 +14,14 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
+/*****************************************************************************
+ *
+ * GptSession is going to be deprecated soon.
+ * Please do not add new functionality in this file!
+ *
+ *****************************************************************************/
+
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/mpiUtils.h"
#include "tensorrt_llm/plugins/api/tllmPlugin.h"
diff --git a/benchmarks/python/allowed_configs.py b/benchmarks/python/allowed_configs.py
index 7ba58c0f0..28e905fd5 100644
--- a/benchmarks/python/allowed_configs.py
+++ b/benchmarks/python/allowed_configs.py
@@ -1127,6 +1127,39 @@ class ModelConfig:
max_output_len=200,
builder_opt=None,
)),
+ "qwen1.5_7b_chat":
+ ModelConfig(name="qwen1.5_7b_chat",
+ family="qwen2",
+ benchmark_type="gpt",
+ build_config=BuildConfig(num_layers=32,
+ num_heads=32,
+ hidden_size=4096,
+ vocab_size=151936,
+ hidden_act='silu',
+ n_positions=8192,
+ inter_size=11008,
+ max_batch_size=128,
+ max_input_len=512,
+ max_output_len=200,
+ builder_opt=None,
+ bias=False)),
+ "qwen1.5_14b_chat":
+ ModelConfig(name="qwen1.5_14b_chat",
+ family="qwen2",
+ benchmark_type="gpt",
+ build_config=BuildConfig(
+ num_layers=40,
+ num_heads=40,
+ hidden_size=5120,
+ vocab_size=152064,
+ hidden_act='silu',
+ n_positions=8192,
+ inter_size=13696,
+ max_batch_size=64,
+ max_input_len=512,
+ max_output_len=200,
+ builder_opt=None,
+ )),
"mamba_2.8b":
ModelConfig(name="mamba_2.8b",
family="mamba",
diff --git a/benchmarks/python/build.py b/benchmarks/python/build.py
index 5f4340411..489da7528 100644
--- a/benchmarks/python/build.py
+++ b/benchmarks/python/build.py
@@ -232,6 +232,7 @@ def build_gpt(args):
builder_config_extra_kwargs['mamba_expand'] = build_config[
'mamba_expand']
builder_config_extra_kwargs['max_beam_width'] = max_beam_width
+ builder_config_extra_kwargs['layer_types'] = ['recurrent']
builder_config = builder.create_builder_config(
name=args.model,
precision=args.dtype,
@@ -715,6 +716,51 @@ def build_gpt(args):
build_config["moe_num_experts"],
'moe_top_k':
build_config["moe_top_k"],
+ 'qwen_type':
+ 'qwen',
+ }
+ config = PretrainedConfig.from_dict(config)
+ tensorrt_llm_model = tensorrt_llm.models.QWenForCausalLM(config)
+ elif family == "qwen2":
+ config = {
+ 'architecture':
+ 'QWenForCausalLM',
+ 'dtype':
+ args.dtype,
+ 'num_hidden_layers':
+ build_config['num_layers'],
+ 'num_attention_heads':
+ build_config['num_heads'],
+ 'num_key_value_heads':
+ build_config['num_heads'] if build_config['num_kv_heads'] is None
+ else build_config['num_kv_heads'],
+ 'hidden_size':
+ build_config['hidden_size'],
+ 'intermediate_size':
+ build_config['inter_size'],
+ 'vocab_size':
+ build_config['vocab_size'],
+ 'position_embedding_type':
+ 'rope_gpt_neox',
+ 'max_position_embeddings':
+ build_config['n_positions'],
+ 'hidden_act':
+ build_config['hidden_act'],
+ 'quantization': {
+ 'group_size': 128,
+ 'quant_algo': quant_algo,
+ 'kv_cache_quant_algo': kv_cache_quant_algo
+ },
+ 'mapping': {
+ 'world_size': world_size,
+ 'tp_size': world_size
+ },
+ 'moe_num_experts':
+ build_config["moe_num_experts"],
+ 'moe_top_k':
+ build_config["moe_top_k"],
+ 'qwen_type':
+ 'qwen2',
}
config = PretrainedConfig.from_dict(config)
tensorrt_llm_model = tensorrt_llm.models.QWenForCausalLM(config)
diff --git a/cpp/include/tensorrt_llm/batch_manager/GptManager.h b/cpp/include/tensorrt_llm/batch_manager/GptManager.h
index f3c413ac5..bf5160e65 100644
--- a/cpp/include/tensorrt_llm/batch_manager/GptManager.h
+++ b/cpp/include/tensorrt_llm/batch_manager/GptManager.h
@@ -21,7 +21,7 @@
#include "tensorrt_llm/batch_manager/llmRequest.h"
#include "tensorrt_llm/batch_manager/schedulerPolicy.h"
#include "tensorrt_llm/batch_manager/trtGptModelOptionalParams.h"
-#include "tensorrt_llm/runtime/gptModelConfig.h"
+#include "tensorrt_llm/runtime/modelConfig.h"
#include "tensorrt_llm/runtime/worldConfig.h"
#include
@@ -79,9 +79,13 @@ class GptManager
virtual ~GptManager();
protected:
+ /* Synchronizes the decoder */
+ virtual BatchManagerErrorCode_t forwardSync();
+
/* Invokes one step of backend
Updates state of all requests */
- virtual BatchManagerErrorCode_t step(RequestList& activeRequests, std::set& activeRequestsIds);
+ virtual BatchManagerErrorCode_t forwardAsync(
+ RequestList& activeRequests, std::unordered_set& activeRequestsIds);
private:
[[nodiscard]] SizeType getMaxInputLen() const;
@@ -89,7 +93,7 @@ class GptManager
[[nodiscard]] SizeType getMaxNumSequences() const;
void validateLlmRequest(
- LlmRequest& newReq, runtime::GptModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig) const;
+ LlmRequest& newReq, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig) const;
static std::shared_ptr fillLlmRequest(std::shared_ptr newReq);
static std::shared_ptr> getReqInputTokens(std::shared_ptr newReq);
static SizeType getMaxNewTokens(std::shared_ptr newReq);
@@ -108,7 +112,7 @@ class GptManager
// List of live requests
RequestList mActiveRequests;
// IDs of live requests
- std::set mActiveRequestsIds;
+ std::unordered_set mActiveRequestsIds;
// Boolean that controls if prompt should be included in output tokens for non-streaming
bool mExcludeInputInOutput;
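The GptManager change above splits the former `step()` into a non-blocking `forwardAsync()` plus a `forwardSync()` that synchronizes the decoder, and switches the active-ID container to `std::unordered_set`. Below is a minimal, hypothetical sketch of that launch-then-synchronize pattern; `FakeManager`, `FakeRequest`, and the loop bodies are invented for illustration and are not the real `GptManager` internals.

```cpp
#include <cstdint>
#include <list>
#include <memory>
#include <unordered_set>

// Invented stand-ins; the real types live in GptManager.h / llmRequest.h.
struct FakeRequest
{
    std::uint64_t id = 0;
    int generatedTokens = 0;
};
using RequestList = std::list<std::shared_ptr<FakeRequest>>;

class FakeManager
{
public:
    // Launch one generation step for all active requests without waiting for it.
    void forwardAsync(RequestList const& active, std::unordered_set<std::uint64_t>& activeIds)
    {
        for (auto const& req : active)
        {
            activeIds.insert(req->id); // track which requests are in flight
            // ... enqueue the engine / decoder work for this step ...
        }
    }

    // Wait for the step launched by forwardAsync and fold results back into the requests.
    void forwardSync(RequestList const& active)
    {
        for (auto const& req : active)
        {
            req->generatedTokens += 1; // consume the tokens produced by the step
        }
    }
};

int main()
{
    FakeManager manager;
    RequestList active;
    active.push_back(std::make_shared<FakeRequest>());
    active.back()->id = 42;

    std::unordered_set<std::uint64_t> activeIds;
    manager.forwardAsync(active, activeIds); // kick off one step
    manager.forwardSync(active);             // synchronize the decoder, update request state
    return activeIds.count(42) == 1 ? 0 : 1;
}
```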
diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheConfig.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheConfig.h
index 154aa352a..47c60ed61 100644
--- a/cpp/include/tensorrt_llm/batch_manager/kvCacheConfig.h
+++ b/cpp/include/tensorrt_llm/batch_manager/kvCacheConfig.h
@@ -63,6 +63,8 @@ class KvCacheConfig
&& hostCacheSize == other.hostCacheSize && onboardBlocks == other.onboardBlocks;
}
+ friend std::ostream& operator<<(std::ostream& os, KvCacheConfig const& self);
+
std::optional maxTokens;
std::optional maxAttentionWindow;
std::optional sinkTokenLength;
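The KvCacheConfig change above only declares the friend `operator<<`. The following is a minimal sketch of how such a stream operator could be implemented for the optional fields visible in this hunk; `KvCacheConfigSketch` is a reduced stand-in, and the real implementation may print additional fields or use different formatting.

```cpp
#include <iostream>
#include <optional>
#include <string>

// Reduced stand-in for KvCacheConfig, limited to fields visible in this hunk.
struct KvCacheConfigSketch
{
    std::optional<int> maxTokens;
    std::optional<int> maxAttentionWindow;
    std::optional<int> sinkTokenLength;
    bool onboardBlocks = true;
};

std::ostream& operator<<(std::ostream& os, KvCacheConfigSketch const& self)
{
    auto printOptional = [&os](char const* name, std::optional<int> const& value)
    { os << name << '=' << (value ? std::to_string(*value) : "nullopt") << ' '; };
    printOptional("maxTokens", self.maxTokens);
    printOptional("maxAttentionWindow", self.maxAttentionWindow);
    printOptional("sinkTokenLength", self.sinkTokenLength);
    return os << "onboardBlocks=" << std::boolalpha << self.onboardBlocks;
}

int main()
{
    KvCacheConfigSketch config{std::nullopt, 4096, 4, true};
    std::cout << config << '\n'; // e.g. "maxTokens=nullopt maxAttentionWindow=4096 sinkTokenLength=4 onboardBlocks=true"
    return 0;
}
```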
diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
index d4cc2c08e..3b34ccd63 100644
--- a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
+++ b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
@@ -18,15 +18,16 @@
#include "tensorrt_llm/batch_manager/kvCacheConfig.h"
#include "tensorrt_llm/batch_manager/llmRequest.h" // TODO forward declare
-#include "tensorrt_llm/common/memoryUtils.h"
+#include "tensorrt_llm/kernels/kvCacheIndex.h"
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/runtime/common.h"
#include "tensorrt_llm/runtime/cudaStream.h"
-#include "tensorrt_llm/runtime/gptModelConfig.h"
#include "tensorrt_llm/runtime/iTensor.h"
+#include "tensorrt_llm/runtime/modelConfig.h"
#include "tensorrt_llm/runtime/worldConfig.h"
#include
+
#include
#include
#include
@@ -89,15 +90,15 @@ struct KvCacheStats
class KVCacheBlock
{
public:
- using OffsetType = std::int32_t;
+ using IdType = std::int32_t;
- explicit KVCacheBlock(OffsetType blockIdx, OffsetType blocksInPrimaryPool);
+ explicit KVCacheBlock(IdType blockId, kernels::KVCacheIndex blockIdx);
void startScheduling();
- [[nodiscard]] OffsetType getBlockIdx() const;
+ [[nodiscard]] IdType getBlockId() const;
- [[nodiscard]] OffsetType getMemoryPoolBlockOffset() const;
+ [[nodiscard]] kernels::KVCacheIndex::UnderlyingType getMemoryPoolBlockIndex() const;
[[nodiscard]] bool isPrimary() const;
@@ -143,11 +144,12 @@ class KVCacheBlock
[[nodiscard]] bool isShared() const;
private:
- // Linear index of block in pool
- OffsetType mBlockIdx;
+ // Linear ID of block independent of pool
+ IdType mBlockId;
- // Block in memory pool backing this block
- OffsetType mMemoryPoolBlockOffset;
+ // Index of block in memory pool backing this block
+ // Choice of pool is encoded into the type
+ kernels::KVCacheIndex mMemoryPoolBlockIndex;
// Number of references to the block
SizeType mRefCount;
@@ -169,9 +171,6 @@ class KVCacheBlock
// Flag indicating if block is full
bool mIsFull;
-
- // Flag indicating mMemoryPoolBlockOffset refers to secondary pool
- static constexpr OffsetType secondaryPoolFlag = static_cast<OffsetType>(1) << (8 * sizeof(OffsetType) - 1);
};
class GenerationRequest
@@ -220,14 +219,14 @@ class GenerationRequest
return mCacheBlockIds;
}
- void addCacheBlock(SizeType beamIdx, SizeType blockIdx)
+ void addCacheBlock(SizeType beamIdx, KVCacheBlock::IdType blockId)
{
- mCacheBlockIds.at(beamIdx).push_back(blockIdx);
+ mCacheBlockIds.at(beamIdx).push_back(blockId);
}
- void changeCacheBlock(SizeType beamIdx, SizeType pagedBlockIdx, SizeType blockIdx)
+ void changeCacheBlock(SizeType beamIdx, SizeType pagedBlockIdx, KVCacheBlock::IdType blockId)
{
- mCacheBlockIds.at(beamIdx).at(pagedBlockIdx) = blockIdx;
+ mCacheBlockIds.at(beamIdx).at(pagedBlockIdx) = blockId;
}
void clearCacheBlocks()
@@ -264,7 +263,7 @@ class GenerationRequest
// Number of beams
SizeType mBeamWidth;
// List of blocks allocated for each beam of the sequence
- std::vector<std::vector<SizeType>> mCacheBlockIds;
+ std::vector<std::vector<KVCacheBlock::IdType>> mCacheBlockIds;
// Number of tokens already in kv cache before context phase.
// A value > 0 indicates cached kv cache blocks were reused.
// One value per beam.
@@ -348,7 +347,7 @@ class BlockManager
[[nodiscard]] SizeType getMaxNumBlocks() const noexcept
{
- return static_cast<SizeType>(mAllBlocksByIdx.size());
+ return static_cast<SizeType>(mAllBlocksById.size());
}
[[nodiscard]] SizeType getTokensPerBlock() const noexcept
@@ -356,7 +355,8 @@ class BlockManager
return mTokensPerBlock;
}
- //! \brief Get size of one field in one layer in one block.
+ //! \brief Get size of one K/V cache block in one layer.
+ //! @details Volume of [numKvHeads, tokensPerBlock, sizePerHead]
[[nodiscard]] SizeType getBlockSize() const
{
return mBlockSize;
@@ -372,10 +372,10 @@ class BlockManager
return mSecondaryPool;
}
- //! \brief Get offset in pool to K or V block.
- //! \param blockIdx the blockIdx as returned by getBlockIdx()
+ //! \brief Get index in pool to K or V block.
+ //! \param blockId the blockId as returned by getBlockId()
//! \param fieldIdx either 0 (K) or 1 (V),
- [[nodiscard]] SizeType getKOrVBlockOffset(SizeType blockIdx, SizeType fieldIdx) const;
+ [[nodiscard]] kernels::KVCacheIndex getKOrVBlockIndex(KVCacheBlock::IdType blockId, SizeType fieldIdx) const;
//! \brief Bring offloaded block from secondary to primary memory.
//! \details Does nothing if the block is already in primary memory.
@@ -442,7 +442,7 @@ class BlockManager
// Number of tokens per one block
SizeType mTokensPerBlock;
// List of all blocks by idx
- std::vector<BlockPtr> mAllBlocksByIdx;
+ std::vector<BlockPtr> mAllBlocksById;
// Dummy block acting as root for BlockToken searches
BlockPtr mCachedBlocksRoot;
// Statistics for block allocations/reuse
@@ -452,7 +452,6 @@ class BlockManager
class KVCacheManager
{
public:
- using OffsetType = KVCacheBlock::OffsetType;
using SizeType = tensorrt_llm::runtime::SizeType;
using SequencesPtr = GenerationRequest::SharedPtr;
using CudaStreamPtr = std::shared_ptr;
@@ -495,12 +494,6 @@ class KVCacheManager
return kvCacheStats;
}
- // Volume of [numKvHeads, tokensPerBlock, sizePerHead]
- [[nodiscard]] SizeType getBlockSize() const
- {
- return mBlockManager.getBlockSize();
- }
-
[[nodiscard]] SizeType getMaxBlocksPerSeq() const
{
return mMaxBlocksPerSeq;
@@ -544,21 +537,21 @@ class KVCacheManager
runtime::ITensor& output, SizeType outputSlotOffset, SizeType seqSlotIdx, SizeType beamWidth) const;
// Volume of [2, numKvHeads, tokensPerBlock, sizePerHead]
- [[nodiscard]] static SizeType constexpr calculatePageSize(tensorrt_llm::runtime::GptModelConfig const& modelConfig)
+ [[nodiscard]] static SizeType constexpr calculatePageSize(tensorrt_llm::runtime::ModelConfig const& modelConfig)
{
return 2 * modelConfig.getNbKvHeads() * modelConfig.getTokensPerBlock() * modelConfig.getSizePerHead();
}
// numLayers * 2 * numKvHeads * sizePerHead
[[nodiscard]] static SizeType constexpr calculateCacheSizePerToken(
- tensorrt_llm::runtime::GptModelConfig const& modelConfig, tensorrt_llm::runtime::WorldConfig const& worldConfig)
+ tensorrt_llm::runtime::ModelConfig const& modelConfig, tensorrt_llm::runtime::WorldConfig const& worldConfig)
{
- return modelConfig.getNbLayers(worldConfig.getPipelineParallelism()) * 2 * modelConfig.getNbKvHeads()
+ return modelConfig.getNbAttentionLayers(worldConfig.getPipelineParallelism()) * 2 * modelConfig.getNbKvHeads()
* modelConfig.getSizePerHead();
}
[[nodiscard]] static std::tuple const calculateMaxNumBlocks(KvCacheConfig const& config,
- nvinfer1::DataType dtype, tensorrt_llm::runtime::GptModelConfig const& modelConfig,
+ nvinfer1::DataType dtype, tensorrt_llm::runtime::ModelConfig const& modelConfig,
tensorrt_llm::runtime::WorldConfig const& worldConfig, runtime::BufferManager const& bufferManager);
[[nodiscard]] SizeType getNumPrepopulatedTokens(SizeType batchSlotIdx, SizeType beamIdx) const
@@ -576,8 +569,8 @@ class KVCacheManager
void rewindKVCache(SizeType seqSlotIdx, SizeType rewindLengths);
private:
- void setOffsets(OffsetType* offsetsPtr, nvinfer1::Dims const& offsetsShape, SizeType seqSlotIdx, SizeType beamIdx,
- SizeType blockIdx, SizeType blockId) const;
+ void setOffsets(kernels::KVCacheIndex* offsetsPtr, nvinfer1::Dims const& offsetsShape, SizeType seqSlotIdx,
+ SizeType beamIdx, SizeType blockIdx, KVCacheBlock::IdType blockId) const;
void resetBlockOffsets(SizeType seqSlotIdx, SizeType beamWidth);
void cacheBlockOffsets(GenerationRequest const& seq, SizeType seqSlotIdx);
@@ -586,8 +579,6 @@ class KVCacheManager
void updateToken(SizeType seqSlotIdx, bool addToken);
private:
- // Number of layers
- SizeType mNumLayers;
// Maximum number of sequences
SizeType mMaxNumSequences;
// Maximum beam width
@@ -607,8 +598,8 @@ class KVCacheManager
BlockManager mBlockManager;
// List of all sequences
std::vector mSequences;
- // buffer for block offsets for all managed sequences
- runtime::ITensor::SharedPtr mSequenceBlockOffsets;
+ // buffer for block indices for all managed sequences
+ runtime::ITensor::SharedPtr mSequenceBlockIndices;
// Whether to cache KV pages for reuse
bool mEnableBlockReuse;
};
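
To sanity-check the two size formulas above, here is a standalone numeric sketch; every number in it (numKvHeads = 8, tokensPerBlock = 64, sizePerHead = 128, 32 attention layers per pipeline rank) is an invented example value, not one taken from this patch.

#include <cstdint>
#include <iostream>

// Minimal sketch of the page/cache-size arithmetic used by KVCacheManager.
// All values are hypothetical example numbers.
int main()
{
    int64_t const numKvHeads = 8;
    int64_t const tokensPerBlock = 64;
    int64_t const sizePerHead = 128;
    int64_t const nbAttentionLayers = 32; // per pipeline-parallel rank

    // Volume of [2, numKvHeads, tokensPerBlock, sizePerHead]
    int64_t const pageSize = 2 * numKvHeads * tokensPerBlock * sizePerHead;

    // nbAttentionLayers * 2 * numKvHeads * sizePerHead; note that the patch switches the
    // layer count from getNbLayers to getNbAttentionLayers, so only attention layers count.
    int64_t const cacheSizePerToken = nbAttentionLayers * 2 * numKvHeads * sizePerHead;

    std::cout << "elements per page:  " << pageSize << '\n'           // 131072
              << "elements per token: " << cacheSizePerToken << '\n'; // 65536
    return 0;
}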
diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h
index c545afa33..f1a5abace 100644
--- a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h
+++ b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h
@@ -92,6 +92,7 @@ class GenericLlmRequest
, mCumLogProbs(samplingConfig.beamWidth)
, mDraftTokens(draftTokens.value_or(std::make_shared()))
, mDraftLogits(draftLogits)
+ , mNumTokensPerIteration(1)
, mReturnContextLogits(returnContextLogits)
, mReturnGenerationLogits(returnGenerationLogits)
, mExcludeInputFromOutput(excludeInputFromOutput)
@@ -189,9 +190,9 @@ class GenericLlmRequest
{
auto const maxNewTokens = maxSequenceLen - mPromptLen;
TLLM_LOG_WARNING(
- "Number of requested output tokens (%d) exceeds maximum sequence length (%d). "
+ "Prompt length + number of requested output tokens (%d + %d) exceeds maximum sequence length (%d). "
"Number of requested output tokens is changed to (%d).",
- mMaxNewTokens, maxSequenceLen, maxNewTokens);
+ mPromptLen, mMaxNewTokens, maxSequenceLen, maxNewTokens);
mMaxNewTokens = maxNewTokens;
}
@@ -494,6 +495,16 @@ class GenericLlmRequest
return mDraftTokens->size();
}
+ void setNumTokensPerIteration(SizeType numTokensPerIteration)
+ {
+ mNumTokensPerIteration = numTokensPerIteration;
+ }
+
+ SizeType getNumTokensPerIteration() const
+ {
+ return mNumTokensPerIteration;
+ }
+
void setReturnContextLogits(bool const returnContextLogits)
{
mReturnContextLogits = returnContextLogits;
@@ -766,6 +777,7 @@ class GenericLlmRequest
VecLogProbs mCumLogProbs; // [beamSize]
std::shared_ptr mDraftTokens;
std::optional mDraftLogits;
+ SizeType mNumTokensPerIteration;
// Save logits
bool mReturnContextLogits;
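
The reworded warning above comes from a simple clamp of the requested output length against the maximum sequence length; a self-contained sketch with made-up numbers:

#include <cstdio>

// Sketch of the maxNewTokens clamp behind the updated warning message.
// promptLen, maxNewTokens and maxSequenceLen are hypothetical example values.
int main()
{
    int const promptLen = 900;
    int maxNewTokens = 300;
    int const maxSequenceLen = 1024;

    if (promptLen + maxNewTokens > maxSequenceLen)
    {
        int const clamped = maxSequenceLen - promptLen; // 124 in this example
        std::printf(
            "Prompt length + number of requested output tokens (%d + %d) exceeds maximum sequence length (%d). "
            "Number of requested output tokens is changed to (%d).\n",
            promptLen, maxNewTokens, maxSequenceLen, clamped);
        maxNewTokens = clamped;
    }
    return 0;
}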
diff --git a/cpp/include/tensorrt_llm/batch_manager/peftCacheManager.h b/cpp/include/tensorrt_llm/batch_manager/peftCacheManager.h
index 024bb07b0..bcd47dd9a 100644
--- a/cpp/include/tensorrt_llm/batch_manager/peftCacheManager.h
+++ b/cpp/include/tensorrt_llm/batch_manager/peftCacheManager.h
@@ -12,10 +12,11 @@
#pragma once
+#include "tensorrt_llm/batch_manager/common.h"
#include "tensorrt_llm/batch_manager/llmRequest.h"
#include "tensorrt_llm/batch_manager/peftCacheManagerConfig.h"
-#include "tensorrt_llm/runtime/gptModelConfig.h"
#include "tensorrt_llm/runtime/loraCache.h"
+#include "tensorrt_llm/runtime/modelConfig.h"
#include "tensorrt_llm/runtime/workerPool.h"
#include "tensorrt_llm/runtime/worldConfig.h"
#include
@@ -23,6 +24,7 @@
#include
#include
#include
+#include
namespace tensorrt_llm::batch_manager
{
@@ -39,7 +41,7 @@ class BasePeftCacheManager
{
public:
using LlmRequestPtr = std::shared_ptr;
- using RequestTable = std::map;
+ using RequestVector = std::vector;
using PeftTable = std::map>>;
/**
@@ -50,13 +52,14 @@ class BasePeftCacheManager
virtual void addRequestPeft(LlmRequestPtr llmRequest, bool tryGpuCache = true) = 0;
/**
- * \brief ensures device cache has all the weights needed to execute batch as specified by requestTable.
+ * \brief Ensures the device cache has all the weights needed to execute the batch specified by scheduledRequests.
* This acts as sync for the copy tasks started by addRequestPeft
- * \param[in] requestTable: current request table
+ * \param[in] scheduledRequests: the scheduled requests for the current batch,
+ * covering both context and generation requests
* \param[in] resetGpuCache: reset (make all tasks evictable)
* \returns -- a PeftTable
*/
- virtual PeftTable ensureBatch(RequestTable const& requestTable, bool resetGpuCache = false) = 0;
+ virtual PeftTable ensureBatch(ScheduledRequests const& scheduledRequests, bool resetGpuCache = false) = 0;
/**
* \brief mark all the tasks in device cache as done
@@ -77,12 +80,12 @@ class BasePeftCacheManager
class PeftCacheManager : public BasePeftCacheManager
{
public:
- PeftCacheManager(PeftCacheManagerConfig const& config, runtime::GptModelConfig const& modelConfig,
+ PeftCacheManager(PeftCacheManagerConfig const& config, runtime::ModelConfig const& modelConfig,
runtime::WorldConfig const& worldConfig, runtime::BufferManager const& bufferManager);
void addRequestPeft(std::shared_ptr llmRequest, bool tryGpuCache = true) override;
- PeftTable ensureBatch(RequestTable const& requestTable, bool resetGpuCache = false) override;
+ PeftTable ensureBatch(ScheduledRequests const& scheduledRequests, bool resetGpuCache = false) override;
[[nodiscard]] bool isTaskCached(uint64_t taskId) const;
@@ -116,7 +119,7 @@ class PeftCacheManager : public BasePeftCacheManager
runtime::BufferManager const& bufferManager);
static std::pair getPageManagerConfig(
- PeftCacheManagerConfig const& config, runtime::GptModelConfig const& modelConfig,
+ PeftCacheManagerConfig const& config, runtime::ModelConfig const& modelConfig,
runtime::WorldConfig const& worldConfig, runtime::BufferManager const& bufferManager);
private:
@@ -133,9 +136,9 @@ class PeftCacheManager : public BasePeftCacheManager
std::unordered_map> mTaskIdToPausedReqIds;
std::tuple>, std::map>> getTaskMaps(
- RequestTable const& requestTable);
+ ScheduledRequests const& scheduledRequests);
- runtime::GptModelConfig mModelConfig;
+ runtime::ModelConfig mModelConfig;
runtime::WorldConfig mWorldConfig;
int mDevice{-1};
@@ -145,7 +148,7 @@ class NoOpPeftCacheManager : public BasePeftCacheManager
{
void addRequestPeft(std::shared_ptr llmRequest, bool tryGpuCache = true) override;
- PeftTable ensureBatch(RequestTable const& requestTable, bool resetGpuCache = false) override;
+ PeftTable ensureBatch(ScheduledRequests const& scheduledRequests, bool resetGpuCache = false) override;
void resetDeviceCache() override;
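
The RequestTable-to-RequestVector switch above drops the map keyed by request id in favor of a plain vector of request pointers; the standalone sketch below illustrates the container change with a hypothetical LlmRequest stub (the real ScheduledRequests type lives in batch_manager/common.h and is not shown in this patch).

#include <cstdint>
#include <map>
#include <memory>
#include <vector>

// Hypothetical stand-in for LlmRequest, used only to make the sketch self-contained.
struct LlmRequestStub
{
    uint64_t requestId;
};

using LlmRequestPtr = std::shared_ptr<LlmRequestStub>;
using RequestTable = std::map<uint64_t, LlmRequestPtr>; // old shape: keyed by request id
using RequestVector = std::vector<LlmRequestPtr>;       // new shape: plain ordered list

int main()
{
    RequestTable table;
    table.emplace(7, std::make_shared<LlmRequestStub>(LlmRequestStub{7}));

    // Converting preserves the requests; only the lookup-by-id indirection is dropped.
    RequestVector requests;
    requests.reserve(table.size());
    for (auto const& entry : table)
    {
        requests.push_back(entry.second);
    }
    return requests.size() == 1 ? 0 : 1;
}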
diff --git a/cpp/include/tensorrt_llm/batch_manager/peftCacheManagerConfig.h b/cpp/include/tensorrt_llm/batch_manager/peftCacheManagerConfig.h
index fbaec751b..df8ccb375 100644
--- a/cpp/include/tensorrt_llm/batch_manager/peftCacheManagerConfig.h
+++ b/cpp/include/tensorrt_llm/batch_manager/peftCacheManagerConfig.h
@@ -60,6 +60,7 @@ struct PeftCacheManagerConfig
, optimalAdapterSize(cfg.getOptimalAdapterSize())
, maxAdapterSize(cfg.getMaxAdapterSize())
, numPutWorkers(cfg.getNumPutWorkers())
+ , numEnsureWorkers(cfg.getNumEnsureWorkers())
, numCopyStreams(cfg.getNumCopyStreams())
, maxPagesPerBlockHost(cfg.getMaxPagesPerBlockHost())
, maxPagesPerBlockDevice(cfg.getMaxPagesPerBlockDevice())
diff --git a/cpp/include/tensorrt_llm/batch_manager/schedulerPolicy.h b/cpp/include/tensorrt_llm/batch_manager/schedulerPolicy.h
index 8910e5a0c..34773afbb 100644
--- a/cpp/include/tensorrt_llm/batch_manager/schedulerPolicy.h
+++ b/cpp/include/tensorrt_llm/batch_manager/schedulerPolicy.h
@@ -31,4 +31,6 @@ SchedulerPolicy execToBatchManagerSchedPolicy(executor::SchedulerPolicy policy);
executor::SchedulerPolicy batchManagerToExecSchedPolicy(SchedulerPolicy policy);
+std::ostream& operator<<(std::ostream& os, SchedulerPolicy policy);
+
} // namespace tensorrt_llm::batch_manager::batch_scheduler
diff --git a/cpp/include/tensorrt_llm/batch_manager/trtGptModelOptionalParams.h b/cpp/include/tensorrt_llm/batch_manager/trtGptModelOptionalParams.h
index c09fa5e4d..2ba1e336f 100644
--- a/cpp/include/tensorrt_llm/batch_manager/trtGptModelOptionalParams.h
+++ b/cpp/include/tensorrt_llm/batch_manager/trtGptModelOptionalParams.h
@@ -57,7 +57,9 @@ class TrtGptModelOptionalParams
explicit TrtGptModelOptionalParams(executor::ExecutorConfig const& executorConfig)
: TrtGptModelOptionalParams(KvCacheConfig(executorConfig.getKvCacheConfig()), false,
executorConfig.getParallelConfig().value_or(executor::ParallelConfig()).getDeviceIds(),
- executorConfig.getNormalizeLogProbs(), executorConfig.getEnableChunkedContext(), std::nullopt,
+ executorConfig.getNormalizeLogProbs(), executorConfig.getEnableChunkedContext(),
+ runtime::DecodingMode::fromExecutor(
+ executorConfig.getDecodingMode().value_or(executor::DecodingMode::kNONE)),
PeftCacheManagerConfig(executorConfig.getPeftCacheConfig().value_or(executor::PeftCacheConfig())),
executorConfig.getMedusaChoices())
{
@@ -70,6 +72,8 @@ class TrtGptModelOptionalParams
&& enableChunkedContext == other.enableChunkedContext && decodingMode == other.decodingMode;
}
+ friend std::ostream& operator<<(std::ostream& os, TrtGptModelOptionalParams const& self);
+
KvCacheConfig kvCacheConfig;
bool enableTrtOverlap;
diff --git a/cpp/include/tensorrt_llm/common/mpiUtils.h b/cpp/include/tensorrt_llm/common/mpiUtils.h
index 51e622e30..fe93d2214 100644
--- a/cpp/include/tensorrt_llm/common/mpiUtils.h
+++ b/cpp/include/tensorrt_llm/common/mpiUtils.h
@@ -17,7 +17,6 @@
#pragma once
#include "tensorrt_llm/common/assert.h"
-#include "tensorrt_llm/runtime/iBuffer.h"
#include "tensorrt_llm/runtime/utils/multiDeviceUtils.h"
#ifdef ENABLE_FP8
@@ -36,6 +35,11 @@
#define MPICHECK(cmd) TLLM_MPI_CHECK(cmd)
+namespace tensorrt_llm::runtime
+{
+class IBuffer;
+}
+
// A wrapper module of the MPI library.
namespace tensorrt_llm::mpi
{
@@ -234,18 +238,11 @@ class MpiComm
std::shared_ptr bcastAsync(void* buffer, size_t size, MpiType dtype, int root) const;
- std::shared_ptr bcastAsync(runtime::IBuffer& buf, int root) const
- {
- TLLM_CHECK(buf.getMemoryType() != runtime::MemoryType::kGPU);
- return bcastAsync(buf.data(), buf.getSizeInBytes(), MpiType::kBYTE, root);
- }
+ std::shared_ptr bcastAsync(runtime::IBuffer& buf, int root) const;
void bcast(void* buffer, size_t size, MpiType dtype, int root) const;
- void bcast(runtime::IBuffer& buf, int root) const
- {
- bcast(buf.data(), buf.getSizeInBytes(), MpiType::kBYTE, root);
- }
+ void bcast(runtime::IBuffer& buf, int root) const;
template
void bcastValue(T& value, int root) const
@@ -281,11 +278,7 @@ class MpiComm
void send(void const* buffer, std::size_t size, MpiType dtype, int dest, int tag) const;
- void send(runtime::IBuffer const& buf, int dest, int tag) const
- {
- TLLM_CHECK(buf.getMemoryType() != runtime::MemoryType::kGPU);
- send(buf.data(), buf.getSizeInBytes(), MpiType::kBYTE, dest, tag);
- }
+ void send(runtime::IBuffer const& buf, int dest, int tag) const;
template
void send(T const& value, int dest, int tag) const
@@ -302,11 +295,7 @@ class MpiComm
MPI_Status recv(void* buffer, size_t size, MpiType dtype, int source, int tag) const;
- MPI_Status recv(runtime::IBuffer& buf, int source, int tag) const
- {
- TLLM_CHECK(buf.getMemoryType() != runtime::MemoryType::kGPU);
- return recv(buf.data(), buf.getSizeInBytes(), MpiType::kBYTE, source, tag);
- }
+ MPI_Status recv(runtime::IBuffer& buf, int source, int tag) const;
template
MPI_Status recv(T& value, int source, int tag) const
diff --git a/cpp/include/tensorrt_llm/executor/executor.h b/cpp/include/tensorrt_llm/executor/executor.h
index c6207b941..100436c59 100644
--- a/cpp/include/tensorrt_llm/executor/executor.h
+++ b/cpp/include/tensorrt_llm/executor/executor.h
@@ -29,6 +29,11 @@
#include
#include
+namespace tensorrt_llm::mpi
+{
+class MpiComm;
+}
+
namespace tensorrt_llm::executor
{
@@ -310,6 +315,7 @@ class Response
[[nodiscard]] Result getResult() const;
private:
+ friend class Serialization;
class Impl;
std::unique_ptr mImpl;
};
@@ -323,6 +329,8 @@ class SchedulerConfig
[[nodiscard]] SchedulerPolicy getPolicy() const;
private:
+ friend class Serialization;
+
/// @brief The scheduler policy. See SchedulerPolicy.
SchedulerPolicy mPolicy;
};
@@ -346,6 +354,8 @@ class KvCacheConfig
[[nodiscard]] bool getOnboardBlocks() const;
private:
+ friend class Serialization;
+
/// @brief Controls if KV cache blocks can be reused for different requests
bool mEnableBlockReuse;
@@ -378,6 +388,26 @@ SizeType const kDefaultIterStatsMaxIterations = 1000;
// Per request stats may have additional overhead due to going through all requests. Turned off by default.
SizeType const kDefaultRequestStatsMaxIterations = 0;
+class OrchestratorConfig
+{
+public:
+ explicit OrchestratorConfig(bool isOrchestrator = true, std::string workerExecutablePath = "",
+ std::shared_ptr orchLeaderComm = nullptr);
+
+ [[nodiscard]] bool getIsOrchestrator() const;
+ [[nodiscard]] std::string getWorkerExecutablePath() const;
+ [[nodiscard]] std::shared_ptr getOrchLeaderComm() const;
+
+ void setIsOrchestrator(bool isOrchestrator);
+ void setWorkerExecutablePath(std::string const& workerExecutablePath);
+ void setOrchLeaderComm(std::shared_ptr const& orchLeaderComm);
+
+private:
+ bool mIsOrchestrator;
+ std::string mWorkerExecutablePath;
+ std::shared_ptr mOrchLeaderComm;
+};
+
/// @brief A configuration class for the parallel execution parameters
/// Currently only supports commType = CommunicationType::kMPI
class ParallelConfig
@@ -392,19 +422,24 @@ class ParallelConfig
explicit ParallelConfig(CommunicationType commType = CommunicationType::kMPI,
CommunicationMode commMode = CommunicationMode::kLEADER,
std::optional> deviceIds = std::nullopt,
- std::optional> participantIds = std::nullopt);
+ std::optional> participantIds = std::nullopt,
+ std::optional const& orchestratorConfig = std::nullopt);
[[nodiscard]] CommunicationType getCommunicationType() const;
[[nodiscard]] CommunicationMode getCommunicationMode() const;
[[nodiscard]] std::optional> getDeviceIds() const;
[[nodiscard]] std::optional> getParticipantIds() const;
+ [[nodiscard]] std::optional getOrchestratorConfig() const;
void setCommunicationType(CommunicationType type);
void setCommunicationMode(CommunicationMode mode);
void setDeviceIds(std::vector const& deviceIds);
void setParticipantIds(std::vector const& participantIds);
+ void setOrchestratorConfig(OrchestratorConfig const& orchestratorConfig);
private:
+ friend class Serialization;
+
/// @brief The type of communication protocol used. Default is MPI.
CommunicationType mCommType;
@@ -416,6 +451,9 @@ class ParallelConfig
/// @brief The participant ids (MPI ranks for example) used for executing this model
std::optional> mParticipantIds;
+
+ /// @brief Optional orchestrator configuration
+ std::optional mOrchestratorConfig;
};
/// @brief config for PeftCacheManager
@@ -428,6 +466,8 @@ class PeftCacheConfig
SizeType maxPagesPerBlockDevice = 8, std::optional const& deviceCachePercent = std::nullopt,
std::optional const& hostCacheSize = std::nullopt);
+ bool operator==(PeftCacheConfig const& other) const;
+
[[nodiscard]] SizeType getNumHostModuleLayer() const;
[[nodiscard]] SizeType getNumDeviceModuleLayer() const;
[[nodiscard]] SizeType getOptimalAdapterSize() const;
@@ -441,6 +481,8 @@ class PeftCacheConfig
[[nodiscard]] std::optional getHostCacheSize() const;
private:
+ friend class Serialization;
+
// number of max sized 1-layer 1-module adapterSize=1 sets of weights that can be stored in host cache
SizeType mNumHostModuleLayer;
// number of max sized 1-layer 1-module sets of weights that can be stored in host cache
@@ -460,7 +502,7 @@ class PeftCacheConfig
// Number of cache pages per allocation block (device)
SizeType mMaxPagesPerBlockDevice;
// percent of memory after engine load to use for cache
- std::optional mDeviceCachePercent;
+ std::optional mDeviceCachePercent;
// size in bytes to use for host cache
std::optional mHostCacheSize;
};
@@ -477,7 +519,8 @@ class ExecutorConfig
std::optional parallelConfig = std::nullopt,
std::optional const& peftCacheConfig = std::nullopt,
std::optional logitsPostProcessorMap = std::nullopt,
- std::optional medusaChoices = std::nullopt);
+ std::optional medusaChoices = std::nullopt,
+ std::optional decodingMode = std::nullopt);
[[nodiscard]] SizeType getMaxBeamWidth() const;
[[nodiscard]] SchedulerConfig getSchedulerConfig() const;
@@ -491,6 +534,7 @@ class ExecutorConfig
[[nodiscard]] std::optional getPeftCacheConfig() const;
[[nodiscard]] std::optional getLogitsPostProcessorMap() const;
[[nodiscard]] std::optional getMedusaChoices() const;
+ [[nodiscard]] std::optional getDecodingMode() const;
void setMaxBeamWidth(SizeType maxBeamWidth);
void setSchedulerConfig(SchedulerConfig const& schedulerConfig);
@@ -504,8 +548,11 @@ class ExecutorConfig
void setPeftCacheConfig(PeftCacheConfig const& peftCacheConfig);
void setLogitsPostProcessorMap(LogitsPostProcessorMap const& logitsPostProcessorMap);
void setMedusaChoices(MedusaChoices const& medusaChoices);
+ void setDecodingMode(DecodingMode decodingMode);
private:
+ friend class Serialization;
+
/// @brief The beam width value of requests that will be sent to the executor
SizeType mMaxBeamWidth;
@@ -535,6 +582,7 @@ class ExecutorConfig
std::optional mPeftCacheConfig;
std::optional mLogitsPostProcessorMap;
std::optional mMedusaChoices;
+ std::optional mDecodingMode;
};
/// @brief The executor is responsible for receiving new requests and sending responses, and running the inference
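
Putting the additions together, a hedged configuration sketch for the new orchestrator mode follows; the worker executable path is a made-up example (the real path depends on the installation), and the enum values are the ones declared in types.h further below.

#include "tensorrt_llm/executor/executor.h"

namespace tle = tensorrt_llm::executor;

// Sketch only: wire an OrchestratorConfig into a ParallelConfig so the executor
// runs in kORCHESTRATOR mode. The worker binary path is a hypothetical example.
tle::ParallelConfig makeOrchestratorParallelConfig()
{
    tle::OrchestratorConfig orchConfig(
        /*isOrchestrator=*/true,
        /*workerExecutablePath=*/"/opt/tensorrt_llm/bin/executorWorker");

    tle::ParallelConfig parallelConfig(tle::CommunicationType::kMPI, tle::CommunicationMode::kORCHESTRATOR);
    parallelConfig.setOrchestratorConfig(orchConfig);
    return parallelConfig;
}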
diff --git a/cpp/include/tensorrt_llm/executor/serialization.h b/cpp/include/tensorrt_llm/executor/serialization.h
new file mode 100644
index 000000000..526832b43
--- /dev/null
+++ b/cpp/include/tensorrt_llm/executor/serialization.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "tensorrt_llm/executor/executor.h"
+#include "tensorrt_llm/executor/tensor.h"
+#include "tensorrt_llm/executor/types.h"
+#include
+#include
+
+namespace tensorrt_llm::executor
+{
+
+class Serialization
+{
+public:
+ // SamplingConfig
+ [[nodiscard]] static SamplingConfig deserializeSamplingConfig(std::istream& is);
+ static void serialize(SamplingConfig const& config, std::ostream& os);
+ [[nodiscard]] static size_t serializedSize(SamplingConfig const& config);
+
+ // OutputConfig
+ [[nodiscard]] static OutputConfig deserializeOutputConfig(std::istream& is);
+ static void serialize(OutputConfig const& config, std::ostream& os);
+ [[nodiscard]] static size_t serializedSize(OutputConfig const& config);
+
+ // SpeculativeDecodingConfig
+ [[nodiscard]] static SpeculativeDecodingConfig deserializeSpeculativeDecodingConfig(std::istream& is);
+ static void serialize(SpeculativeDecodingConfig const& config, std::ostream& os);
+ [[nodiscard]] static size_t serializedSize(SpeculativeDecodingConfig const& config);
+
+ // PromptTuningConfig
+ [[nodiscard]] static PromptTuningConfig deserializePromptTuningConfig(std::istream& is);
+ static void serialize(PromptTuningConfig const& config, std::ostream& os);
+ [[nodiscard]] static size_t serializedSize(PromptTuningConfig const& config);
+
+ // LoraConfig
+ [[nodiscard]] static LoraConfig deserializeLoraConfig(std::istream& is);
+ static void serialize(LoraConfig const& config, std::ostream& os);
+ [[nodiscard]] static size_t serializedSize(LoraConfig const& config);
+
+ // Request
+ [[nodiscard]] static Request deserializeRequest(std::istream& is);
+ static void serialize(Request const& request, std::ostream& os);
+ [[nodiscard]] static size_t serializedSize(Request const& request);
+
+ // Tensor
+ [[nodiscard]] static Tensor deserializeTensor(std::istream& is);
+ static void serialize(Tensor const& tensor, std::ostream& os);
+ [[nodiscard]] static size_t serializedSize(Tensor const& tensor);
+
+ // Result
+ [[nodiscard]] static Result deserializeResult(std::istream& is);
+ static void serialize(Result const& result, std::ostream& os);
+ [[nodiscard]] static size_t serializedSize(Result const& result);
+
+ // Response
+ [[nodiscard]] static Response deserializeResponse(std::istream& is);
+ static void serialize(Response const& response, std::ostream& os);
+ [[nodiscard]] static size_t serializedSize(Response const& response);
+
+ // Vector of responses
+ static std::vector deserializeResponses(std::vector& buffer);
+ static std::vector serialize(std::vector const& responses);
+
+ // KvCacheConfig
+ static KvCacheConfig deserializeKvCacheConfig(std::istream& is);
+ static void serialize(KvCacheConfig const& kvCacheConfig, std::ostream& os);
+ static size_t serializedSize(KvCacheConfig const& kvCacheConfig);
+
+ // SchedulerConfig
+ static SchedulerConfig deserializeSchedulerConfig(std::istream& is);
+ static void serialize(SchedulerConfig const& schedulerConfig, std::ostream& os);
+ static size_t serializedSize(SchedulerConfig const& schedulerConfig);
+
+ // ParallelConfig
+ static ParallelConfig deserializeParallelConfig(std::istream& is);
+ static void serialize(ParallelConfig const& parallelConfig, std::ostream& os);
+ static size_t serializedSize(ParallelConfig const& parallelConfig);
+
+ // PeftCacheConfig
+ static PeftCacheConfig deserializePeftCacheConfig(std::istream& is);
+ static void serialize(PeftCacheConfig const& peftCacheConfig, std::ostream& os);
+ static size_t serializedSize(PeftCacheConfig const& peftCacheConfig);
+
+ // OrchestratorConfig
+ static OrchestratorConfig deserializeOrchestratorConfig(std::istream& is);
+ static void serialize(OrchestratorConfig const& orchestratorConfig, std::ostream& os);
+ static size_t serializedSize(OrchestratorConfig const& orchestratorConfig);
+
+ // ExecutorConfig
+ static ExecutorConfig deserializeExecutorConfig(std::istream& is);
+ static void serialize(ExecutorConfig const& executorConfig, std::ostream& os);
+ static size_t serializedSize(ExecutorConfig const& executorConfig);
+
+ // String
+ static std::string deserializeString(std::istream& is);
+
+ // ModelType
+ static ModelType deserializeModelType(std::istream& is);
+};
+
+} // namespace tensorrt_llm::executor
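
A hedged round-trip sketch of the stream-based API above, using KvCacheConfig as the example payload; how that config is constructed is outside this header, so it is taken as a function argument.

#include "tensorrt_llm/executor/serialization.h"

#include <sstream>
#include <string>

namespace tle = tensorrt_llm::executor;

// Serialize a config into a byte string and read it back with the matching
// deserialize* function; the serialize/deserialize pairs are symmetric.
std::string toBytes(tle::KvCacheConfig const& config)
{
    std::ostringstream os(std::ios::out | std::ios::binary);
    tle::Serialization::serialize(config, os);
    return os.str();
}

tle::KvCacheConfig fromBytes(std::string const& bytes)
{
    std::istringstream is(bytes, std::ios::in | std::ios::binary);
    return tle::Serialization::deserializeKvCacheConfig(is);
}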
diff --git a/cpp/include/tensorrt_llm/executor/types.h b/cpp/include/tensorrt_llm/executor/types.h
index 3b2acb94d..934802f1b 100644
--- a/cpp/include/tensorrt_llm/executor/types.h
+++ b/cpp/include/tensorrt_llm/executor/types.h
@@ -191,6 +191,9 @@ enum class CommunicationMode
kLEADER, // With the leader mode, only the leader can enqueue requests. The requests will be
// broadcasted to the workers. All participants can get response via awaitResponses. The leader is the
// first participant in the provided participant IDs, or 0 if participant IDs are not provided
+ kORCHESTRATOR, // With the orchestrator mode, only the orchestrator can enqueue requests and await responses. The
+ // requests will be broadcasted to the workers. The orchestrator will spawn new processes for the
+ // execution of the model
};
/// @brief Struct that holds the stats of a KV cache manager
@@ -305,4 +308,17 @@ struct RequestStatsPerIteration
std::vector requestStats;
};
+/// @brief Decoding mode
+enum class DecodingMode
+{
+ /// @brief No mode specified. Config will be determined from the beam width of the first request at runtime:
+ /// TopKTopP if beamWidth == 1, BeamSearch otherwise.
+ kNONE,
+ kTOP_K,
+ kTOP_P,
+ kBEAM_SEARCH,
+ kMEDUSA,
+ kTOP_K_TOP_P,
+};
+
} // namespace tensorrt_llm::executor
diff --git a/cpp/include/tensorrt_llm/runtime/decodingMode.h b/cpp/include/tensorrt_llm/runtime/decodingMode.h
index 9c400668f..c697e8cdb 100644
--- a/cpp/include/tensorrt_llm/runtime/decodingMode.h
+++ b/cpp/include/tensorrt_llm/runtime/decodingMode.h
@@ -16,6 +16,8 @@
#pragma once
+#include "tensorrt_llm/executor/executor.h"
+
namespace tensorrt_llm
{
namespace runtime
@@ -54,37 +56,37 @@ class DecodingMode
return DecodingMode{kMedusa};
}
- bool constexpr isNone()
+ bool constexpr isNone() const
{
return mState == 0;
}
- bool constexpr isTopK()
+ bool constexpr isTopK() const
{
return anyBitSet(kTopK);
}
- bool constexpr isTopP()
+ bool constexpr isTopP() const
{
return anyBitSet(kTopP);
}
- bool constexpr isTopKorTopP()
+ bool constexpr isTopKorTopP() const
{
return anyBitSet(kTopKTopP);
}
- bool constexpr isTopKandTopP()
+ bool constexpr isTopKandTopP() const
{
return allBitSet(kTopKTopP);
}
- bool constexpr isBeamSearch()
+ bool constexpr isBeamSearch() const
{
return anyBitSet(kBeamSearch);
}
- bool constexpr isMedusa()
+ bool constexpr isMedusa() const
{
return anyBitSet(kMedusa);
}
@@ -96,6 +98,28 @@ class DecodingMode
return mState == other.mState;
}
+ static DecodingMode fromExecutor(executor::DecodingMode decodingMode)
+ {
+ switch (decodingMode)
+ {
+ case executor::DecodingMode::kNONE: return DecodingMode::None();
+
+ case executor::DecodingMode::kTOP_K: return DecodingMode::TopK();
+
+ case executor::DecodingMode::kTOP_P: return DecodingMode::TopP();
+
+ case executor::DecodingMode::kBEAM_SEARCH: return DecodingMode::BeamSearch();
+
+ case executor::DecodingMode::kMEDUSA: return DecodingMode::Medusa();
+
+ case executor::DecodingMode::kTOP_K_TOP_P: return DecodingMode::TopKTopP();
+
+ default: TLLM_THROW("Invalid decoding mode"); break;
+ }
+ }
+
+ friend std::ostream& operator<<(std::ostream& os, DecodingMode other);
+
private:
constexpr DecodingMode(UnderlyingType state)
: mState(state)
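
A small usage sketch of fromExecutor together with the now-const predicates; it assumes only the two headers touched in this patch.

#include "tensorrt_llm/executor/types.h"
#include "tensorrt_llm/runtime/decodingMode.h"

// Translate the executor-facing enum into the runtime bitmask representation
// and query it through one of the const accessors above.
bool usesTopKOrTopP(tensorrt_llm::executor::DecodingMode executorMode)
{
    auto const runtimeMode = tensorrt_llm::runtime::DecodingMode::fromExecutor(executorMode);
    return runtimeMode.isTopKorTopP();
}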
diff --git a/cpp/include/tensorrt_llm/runtime/decodingOutput.h b/cpp/include/tensorrt_llm/runtime/decodingOutput.h
index 04e250316..4a2f847f4 100644
--- a/cpp/include/tensorrt_llm/runtime/decodingOutput.h
+++ b/cpp/include/tensorrt_llm/runtime/decodingOutput.h
@@ -29,17 +29,21 @@ class DecodingOutput
public:
using TensorPtr = ITensor::SharedPtr;
+ // BS: batch_size, BM: beam_width, MSL: max_seq_length
+ // All TensorPtr members are on gpu unless noted otherwise
+
class BeamHypotheses
{
public:
- TensorPtr outputIdsTgt; // [batchSize, 2 * beamWidth, maxSeqLen]
- TensorPtr sequenceLengthsTgt; // [batchSize, 2 * beamWidth]
- TensorPtr cumLogProbs; // [batchSize, 2 * beamWidth]
- TensorPtr normedScores; // [batchSize, 2 * beamWidth]
- TensorPtr logProbs; // [batchSize, 2 * beamWidth, maxSeqLen]
- TensorPtr minNormedScores; // [batchSize]
- TensorPtr numBeams; // [batchSize]
- TensorPtr isDone; // [batchSize]
+ // The same as cpp/tensorrt_llm/kernels/beamSearchKernels.h
+ TensorPtr outputIdsCBA; // [BS, BM*2, MSL]
+ TensorPtr sequenceLengthsCBA; // [BS, BM]
+ TensorPtr cumLogProbsCBA; // [BS, BM*2]
+ TensorPtr normedScoresCBA; // [BS, BM*2]
+ TensorPtr logProbsCBA; // [BS, BM*2, MSL]
+ TensorPtr minNormedScoresCBA; // [BS]
+ TensorPtr numBeamsCBA; // [BS]
+ TensorPtr batchDones; // [BS]
void empty(BufferManager& manager);
@@ -61,27 +65,26 @@ class DecodingOutput
}
// mandatory parameters
- TensorPtr ids; // [batchSize, beamWidth, maxSeqLen], on gpu, must contain previously generated token ids for all
- // steps before DecodingInput.step
- TensorPtr newTokensSteps; // [maxTokensPerStep, batchSize, beamWidth] new tokens at each generated token of
- // maxTokensPerStep, on gpu.
- TensorPtr newTokens; // [batchSize, beamWidth] usually a view of newTokensSteps for the current token, on gpu.
- std::vector newTokensVec; // vector of size maxTokensPerStep with tensor [batchSize, beamWidth].
- // Vector of views on newTokensSteps for each token. Elements are on gpu.
+ TensorPtr ids; // [BS, BM, MSL], contains previously generated token ids for all
+ // steps before DecodingInput.step
+ TensorPtr newTokensSteps; // [maxTokensPerStep, BS, BM] new tokens at each generated token of
+ // maxTokensPerStep
+ TensorPtr newTokens; // [BS, BM] usually a view of newTokensSteps for the current token
+ std::vector newTokensVec; // vector of size maxTokensPerStep with tensor [BS, BM].
+ // Vector of views on newTokensSteps for each token
// optional parameters
- TensorPtr finished; // [batchSize, beamWidth],
- // Set to true by decoding if any of the stop conditions are met or if DecodingInput.finished is
- // true. In beam search and to determine whether to stop according to
- // DecodingInput.sequenceLimitLength, on gpu
- TensorPtr finishedSum; // [batchSize], the sum of finished sequences per request, in pinned memory
+ TensorPtr finished; // [BS, BM], set to true by decoding if any of the stop conditions are met or if
+ // DecodingInput.finished is true. In beam search and to determine whether to stop according to
+ // DecodingInput.sequenceLimitLength
+ TensorPtr finishedSum; // [BS], the sum of finished sequences per request, in pinned memory
// mandatory parameters for beam search
- TensorPtr logProbs; // [batchSize, beamWidth, maxSeqLen], must be float*, on gpu
- TensorPtr cumLogProbs; // [batchSize, beamWidth], optional for sampling, on gpu
- TensorPtr parentIds; // [batchSize, beamWidth, maxSeqLen], on gpu
- TensorPtr lengths; // [batchSize, beamWidth], total sequence lengths including padding, on gpu
- TensorPtr cacheIndirection; // [batchSize, beamWidth, maxSeqLen], k/v indirection for next generation step, on gpu
+ TensorPtr logProbs; // [BS, BM, MSL], must be float*
+ TensorPtr cumLogProbs; // [BS, BM], optional for sampling
+ TensorPtr parentIds; // [BS, BM, MSL]
+ TensorPtr lengths; // [BS, BM], total sequence lengths including padding
+ TensorPtr cacheIndirection; // [BS, BM, MSL], k/v indirection for next generation step
BeamHypotheses beamHypotheses;
@@ -89,10 +92,10 @@ class DecodingOutput
class MedusaOutputs
{
public:
- TensorPtr medusaNextDraftTokens; // [maxBatchSize, maxTokensPerStep], on gpu
- TensorPtr medusaAcceptedTokensLen; // [maxBatchSize], on gpu
- TensorPtr medusaAcceptedLengthsCumSum; // [maxBatchSize + 1], on gpu
- TensorPtr medusaPathsOffsets; // [maxBatchSize * maxNumHeads], on gpu
+ TensorPtr medusaNextDraftTokens; // [maxBatchSize, maxTokensPerStep]
+ TensorPtr medusaAcceptedTokensLen; // [maxBatchSize]
+ TensorPtr medusaAcceptedLengthsCumSum; // [maxBatchSize + 1]
+ TensorPtr medusaPathsOffsets; // [maxBatchSize * maxNumHeads]
};
std::optional medusaOutputs;
diff --git a/cpp/include/tensorrt_llm/runtime/gptDecoder.h b/cpp/include/tensorrt_llm/runtime/gptDecoder.h
index a353d5974..37cad3e9a 100644
--- a/cpp/include/tensorrt_llm/runtime/gptDecoder.h
+++ b/cpp/include/tensorrt_llm/runtime/gptDecoder.h
@@ -21,7 +21,7 @@
#include "tensorrt_llm/runtime/decodingInput.h"
#include "tensorrt_llm/runtime/decodingMode.h"
#include "tensorrt_llm/runtime/decodingOutput.h"
-#include "tensorrt_llm/runtime/gptModelConfig.h"
+#include "tensorrt_llm/runtime/modelConfig.h"
#include "tensorrt_llm/runtime/samplingConfig.h"
#include "tensorrt_llm/runtime/worldConfig.h"
#include
diff --git a/cpp/include/tensorrt_llm/runtime/gptDecoderBatch.h b/cpp/include/tensorrt_llm/runtime/gptDecoderBatch.h
index 039e71870..df37e0adc 100644
--- a/cpp/include/tensorrt_llm/runtime/gptDecoderBatch.h
+++ b/cpp/include/tensorrt_llm/runtime/gptDecoderBatch.h
@@ -43,7 +43,7 @@ class GptDecoderBatch : public IGptDecoderBatch
//! Setup the decoder before calling `forward()`
void setup(DecodingMode const& mode, SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxAttentionWindow,
SizeType sinkTokenLength, SizeType maxSequenceLength, SizeType maxTokensPerStep, bool fusedDecoder,
- nvinfer1::DataType dtype, GptModelConfig const& modelConfig) override;
+ nvinfer1::DataType dtype, ModelConfig const& modelConfig) override;
void newBatch(
GenerationInput const& inputs, GenerationOutput const& outputs, SamplingConfig const& samplingConfig) override;
@@ -182,7 +182,7 @@ class GptDecoderBatch : public IGptDecoderBatch
void allocateMedusaBuffers();
//! @brief Setup buffers for medusa decoding.
- void setupMedusa(GptModelConfig const& modelConfig);
+ void setupMedusa(ModelConfig const& modelConfig);
//! @brief Sets up decoder internal tensors for a new speculative decoding request
void newRequestSpeculativeDecoding(
diff --git a/cpp/include/tensorrt_llm/runtime/gptJsonConfig.h b/cpp/include/tensorrt_llm/runtime/gptJsonConfig.h
index c82f0e718..7a4bae4ef 100644
--- a/cpp/include/tensorrt_llm/runtime/gptJsonConfig.h
+++ b/cpp/include/tensorrt_llm/runtime/gptJsonConfig.h
@@ -17,7 +17,7 @@
#pragma once
#include "tensorrt_llm/runtime/common.h"
-#include "tensorrt_llm/runtime/gptModelConfig.h"
+#include "tensorrt_llm/runtime/modelConfig.h"
#include "tensorrt_llm/runtime/worldConfig.h"
#include
@@ -32,13 +32,13 @@ class GptJsonConfig
{
public:
GptJsonConfig(std::string name, std::string version, std::string precision, SizeType tensorParallelism,
- SizeType pipelineParallelism, GptModelConfig const& modelConfig)
+ SizeType pipelineParallelism, ModelConfig const& modelConfig)
: mName(std::move(name))
, mVersion(std::move(version))
, mPrecision(std::move(precision))
, mTensorParallelism{tensorParallelism}
, mPipelineParallelism{pipelineParallelism}
- , mGptModelConfig(modelConfig)
+ , mModelConfig(modelConfig)
{
}
@@ -48,9 +48,9 @@ class GptJsonConfig
static GptJsonConfig parse(std::filesystem::path const& path);
- [[nodiscard]] GptModelConfig getModelConfig() const
+ [[nodiscard]] ModelConfig getModelConfig() const
{
- return mGptModelConfig;
+ return mModelConfig;
}
[[nodiscard]] std::string const& getName() const
@@ -96,7 +96,7 @@ class GptJsonConfig
std::string const mPrecision;
SizeType const mTensorParallelism;
SizeType const mPipelineParallelism;
- GptModelConfig const mGptModelConfig;
+ ModelConfig const mModelConfig;
};
} // namespace tensorrt_llm::runtime
diff --git a/cpp/include/tensorrt_llm/runtime/gptSession.h b/cpp/include/tensorrt_llm/runtime/gptSession.h
index fac23508e..1bc7f0b06 100644
--- a/cpp/include/tensorrt_llm/runtime/gptSession.h
+++ b/cpp/include/tensorrt_llm/runtime/gptSession.h
@@ -14,6 +14,13 @@
* limitations under the License.
*/
+/*****************************************************************************
+ *
+ * GptSession is going to be deprecated soon.
+ * Please do not add new functionality in this file!
+ *
+ *****************************************************************************/
+
#pragma once
#include "tensorrt_llm/batch_manager/kvCacheConfig.h"
@@ -23,8 +30,8 @@
#include "tensorrt_llm/runtime/decodingMode.h"
#include "tensorrt_llm/runtime/generationInput.h"
#include "tensorrt_llm/runtime/generationOutput.h"
-#include "tensorrt_llm/runtime/gptModelConfig.h"
#include "tensorrt_llm/runtime/iTensor.h"
+#include "tensorrt_llm/runtime/modelConfig.h"
#include "tensorrt_llm/runtime/samplingConfig.h"
#include "tensorrt_llm/runtime/worldConfig.h"
@@ -150,17 +157,17 @@ class GptSession
//! @param engineBuffer The compiled TensorRT engine (const void*),
//! @param engineSize The size in bytes of the TensorRT engine (size_t),
//! @param logger The optional logger.
- GptSession(Config const& sessionConfig, GptModelConfig const& modelConfig, WorldConfig const& worldConfig,
+ GptSession(Config const& sessionConfig, ModelConfig const& modelConfig, WorldConfig const& worldConfig,
void const* engineBuffer, std::size_t engineSize, LoggerPtr logger = nullptr);
- GptSession(Config const& sessionConfig, GptModelConfig const& modelConfig, WorldConfig const& worldConfig,
+ GptSession(Config const& sessionConfig, ModelConfig const& modelConfig, WorldConfig const& worldConfig,
std::vector const& engineBuffer, LoggerPtr logger = nullptr)
: GptSession(
sessionConfig, modelConfig, worldConfig, engineBuffer.data(), engineBuffer.size(), std::move(logger))
{
}
- GptSession(Config const& sessionConfig, GptModelConfig const& modelConfig, WorldConfig const& worldConfig,
+ GptSession(Config const& sessionConfig, ModelConfig const& modelConfig, WorldConfig const& worldConfig,
std::string const& engineFile, LoggerPtr logger = nullptr)
: GptSession(sessionConfig, modelConfig, worldConfig, utils::loadEngine(engineFile), std::move(logger))
{
@@ -170,7 +177,7 @@ class GptSession
[[nodiscard]] BufferManager const& getBufferManager() const;
- [[nodiscard]] GptModelConfig const& getModelConfig() const
+ [[nodiscard]] ModelConfig const& getModelConfig() const
{
return mModelConfig;
}
@@ -335,7 +342,7 @@ class GptSession
friend class batch_manager::TrtGptModelV1;
private:
- GptModelConfig const mModelConfig;
+ ModelConfig const mModelConfig;
WorldConfig const mWorldConfig;
int mDevice{-1};
std::shared_ptr mPipelineComm;
diff --git a/cpp/include/tensorrt_llm/runtime/iBuffer.h b/cpp/include/tensorrt_llm/runtime/iBuffer.h
index 3ddcf3fe9..5a55c66ff 100644
--- a/cpp/include/tensorrt_llm/runtime/iBuffer.h
+++ b/cpp/include/tensorrt_llm/runtime/iBuffer.h
@@ -18,6 +18,7 @@
#include "tensorrt_llm/common/arrayView.h"
#include "tensorrt_llm/common/dataType.h"
+#include "tensorrt_llm/kernels/kvCacheIndex.h"
#include
@@ -307,6 +308,12 @@ struct TRTDataType<__nv_fp8_e4m3>
};
#endif
+template <>
+struct TRTDataType
+{
+ static constexpr auto value = TRTDataType::value;
+};
+
template <>
struct TRTDataType
{
diff --git a/cpp/include/tensorrt_llm/runtime/iStatefulGptDecoder.h b/cpp/include/tensorrt_llm/runtime/iStatefulGptDecoder.h
index 4b09651e0..90cf0d497 100644
--- a/cpp/include/tensorrt_llm/runtime/iStatefulGptDecoder.h
+++ b/cpp/include/tensorrt_llm/runtime/iStatefulGptDecoder.h
@@ -75,7 +75,7 @@ class IStatefulGptDecoder
//! Setup the decoder before calling `forward()`, also calls reshapeBuffers
virtual void setup(DecodingMode const& mode, SizeType maxBatchSize, SizeType maxBeamWidth,
SizeType maxAttentionWindow, SizeType sinkTokenLength, SizeType maxSequenceLength, SizeType maxTokensPerStep,
- bool fusedDecoder, nvinfer1::DataType dtype, GptModelConfig const& modelConfig)
+ bool fusedDecoder, nvinfer1::DataType dtype, ModelConfig const& modelConfig)
= 0;
//! @brief Initialize the decoder with new batch of inputs.
diff --git a/cpp/include/tensorrt_llm/runtime/loraCache.h b/cpp/include/tensorrt_llm/runtime/loraCache.h
index d9bf51ef9..bfb3c701e 100644
--- a/cpp/include/tensorrt_llm/runtime/loraCache.h
+++ b/cpp/include/tensorrt_llm/runtime/loraCache.h
@@ -18,10 +18,10 @@
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/runtime/common.h"
-#include "tensorrt_llm/runtime/gptModelConfig.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/loraCachePageManagerConfig.h"
#include "tensorrt_llm/runtime/loraModule.h"
+#include "tensorrt_llm/runtime/modelConfig.h"
#include "tensorrt_llm/runtime/worldConfig.h"
#include
#include
@@ -159,11 +159,11 @@ class LoraCache
/**
* param[in] pageManagerConfig: a LoraCachePageManagerConfig
- * param[in] modelConfig: a GptModelConfig
+ * param[in] modelConfig: a ModelConfig
* param[in] worldConfig: a WorldConfig
* param[in] bufferManager: a BufferManager only used to allocate page blocks
*/
- LoraCache(LoraCachePageManagerConfig const& pageManagerConfig, GptModelConfig const& modelConfig,
+ LoraCache(LoraCachePageManagerConfig const& pageManagerConfig, ModelConfig const& modelConfig,
WorldConfig const& worldConfig, BufferManager const& bufferManager);
/**
@@ -277,7 +277,7 @@ class LoraCache
* \brief Copy task weights to cache pages.
* \param[in] weights: task weights
* \param[in] config: task config tensor
- * \param[in] modelConfig: a GptModelConfig
+ * \param[in] modelConfig: a ModelConfig
* \param[in] worldConfig: a WorldConfig
* \param[in] modelIdToModel: map from lora module id to LoraModule
* \param[in] manager: a BufferManager the manager to use to perform the copies
@@ -286,7 +286,7 @@ class LoraCache
* \returns -- list of cache Values objects
*/
static std::vector copyToPages(TensorPtr weights, TensorPtr config,
- GptModelConfig const& modelConfig, WorldConfig const& worldConfig,
+ ModelConfig const& modelConfig, WorldConfig const& worldConfig,
std::unordered_map moduleIdToModel, BufferManager const& manager,
std::vector const& pages, std::vector const& pageIds);
@@ -385,7 +385,7 @@ class LoraCache
};
LoraCachePageManagerConfig mPageManagerConfig;
- GptModelConfig mModelConfig;
+ ModelConfig mModelConfig;
WorldConfig mWorldConfig;
// Protects mCachePageManager
diff --git a/cpp/include/tensorrt_llm/runtime/gptModelConfig.h b/cpp/include/tensorrt_llm/runtime/modelConfig.h
similarity index 91%
rename from cpp/include/tensorrt_llm/runtime/gptModelConfig.h
rename to cpp/include/tensorrt_llm/runtime/modelConfig.h
index 4e069d8fb..6a1189baa 100644
--- a/cpp/include/tensorrt_llm/runtime/gptModelConfig.h
+++ b/cpp/include/tensorrt_llm/runtime/modelConfig.h
@@ -32,7 +32,7 @@ struct MambaConfig
SizeType expand = 0;
};
-class GptModelConfig
+class ModelConfig
{
public:
enum class ModelVariant : std::int32_t
@@ -42,10 +42,11 @@ class GptModelConfig
kMamba = 2, // https://github.com/state-spaces/mamba
};
- explicit GptModelConfig(
- SizeType vocabSize, SizeType nbLayers, SizeType nbHeads, SizeType hiddenSize, nvinfer1::DataType dtype)
+ explicit ModelConfig(SizeType vocabSize, SizeType nbAttentionLayers, SizeType nbSsmLayers, SizeType nbHeads,
+ SizeType hiddenSize, nvinfer1::DataType dtype)
: mVocabSize(vocabSize)
- , mNbLayers(nbLayers)
+ , mNbAttentionLayers(nbAttentionLayers)
+ , mNbSsmLayers(nbSsmLayers)
, mNbHeads(nbHeads)
, mNbKvHeads(nbHeads)
, mHiddenSize(hiddenSize)
@@ -71,6 +72,7 @@ class GptModelConfig
, mMaxDraftLen(0)
, mUseContextFMHAForGeneration(false)
, mPagedContextFMHA(false)
+ , mUseXQA{false}
, mUseLoraPlugin(false)
, mMlpHiddenSize(0)
, mMedusaModule(std::nullopt)
@@ -87,10 +89,16 @@ class GptModelConfig
return (mVocabSize + worldSize - 1) / worldSize * worldSize;
}
- [[nodiscard]] SizeType constexpr getNbLayers(SizeType pipelineParallelism = 1) const
+ [[nodiscard]] SizeType constexpr getNbAttentionLayers(SizeType pipelineParallelism = 1) const
{
- TLLM_CHECK(mNbLayers % pipelineParallelism == 0);
- return mNbLayers / pipelineParallelism;
+ TLLM_CHECK(mNbAttentionLayers % pipelineParallelism == 0);
+ return mNbAttentionLayers / pipelineParallelism;
+ }
+
+ [[nodiscard]] SizeType constexpr getNbSsmLayers(SizeType pipelineParallelism = 1) const
+ {
+ TLLM_CHECK(mNbSsmLayers % pipelineParallelism == 0);
+ return mNbSsmLayers / pipelineParallelism;
}
[[nodiscard]] SizeType constexpr getNbHeads() const noexcept
@@ -344,6 +352,16 @@ class GptModelConfig
return mPagedContextFMHA;
}
+ void constexpr useXQA(bool useXQA) noexcept
+ {
+ mUseXQA = useXQA;
+ }
+
+ [[nodiscard]] bool constexpr useXQA() const noexcept
+ {
+ return mUseXQA;
+ }
+
[[nodiscard]] bool constexpr useLoraPlugin() const noexcept
{
return mUseLoraPlugin;
@@ -354,7 +372,7 @@ class GptModelConfig
mUseLoraPlugin = useLoraPlugin;
}
- std::vector const& getLoraModules() const noexcept
+ [[nodiscard]] std::vector const& getLoraModules() const noexcept
{
return mLoraModules;
}
@@ -442,7 +460,8 @@ class GptModelConfig
private:
SizeType mVocabSize;
- SizeType mNbLayers;
+ SizeType mNbAttentionLayers;
+ SizeType mNbSsmLayers;
SizeType mNbHeads;
SizeType mNbKvHeads;
SizeType mHiddenSize;
@@ -471,6 +490,7 @@ class GptModelConfig
bool mUseContextFMHAForGeneration;
bool mPagedContextFMHA;
+ bool mUseXQA;
bool mUseLoraPlugin;
std::vector mLoraModules;
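
The new per-layer-type accessors split each total evenly across pipeline-parallel ranks; a standalone numeric sketch with invented layer counts:

#include <cassert>

// Sketch of the division performed by getNbAttentionLayers() and getNbSsmLayers().
// The totals below are hypothetical example values.
int main()
{
    int const nbAttentionLayers = 24; // total attention layers in the model
    int const nbSsmLayers = 8;        // total SSM (Mamba) layers in the model
    int const pipelineParallelism = 4;

    // Mirrors the TLLM_CHECKs: each total must divide evenly across the pipeline ranks.
    assert(nbAttentionLayers % pipelineParallelism == 0);
    assert(nbSsmLayers % pipelineParallelism == 0);

    int const attentionPerRank = nbAttentionLayers / pipelineParallelism; // 6
    int const ssmPerRank = nbSsmLayers / pipelineParallelism;             // 2

    return (attentionPerRank == 6 && ssmPerRank == 2) ? 0 : 1;
}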
diff --git a/cpp/include/tensorrt_llm/runtime/samplingConfig.h b/cpp/include/tensorrt_llm/runtime/samplingConfig.h
index be1060853..787766148 100644
--- a/cpp/include/tensorrt_llm/runtime/samplingConfig.h
+++ b/cpp/include/tensorrt_llm/runtime/samplingConfig.h
@@ -17,6 +17,7 @@
#pragma once
#include "tensorrt_llm/executor/executor.h"
+#include "tensorrt_llm/layers/defaultDecodingParams.h"
#include "tensorrt_llm/runtime/common.h"
#include
@@ -36,25 +37,21 @@ class SamplingConfig
template
static OptVec fuseValues(
- std::vector const& configs, std::function(SizeType ci)> accessor)
+ std::vector const& configs, std::function(size_t ci)> accessor, T defaultValue)
{
std::vector values;
- auto const hasValues = accessor(0).has_value();
for (size_t ci = 0; ci < configs.size(); ++ci)
{
+ auto value = defaultValue;
auto const& configValue = accessor(ci);
- TLLM_CHECK(hasValues == configValue.has_value());
- if (hasValues)
+ if (configValue.has_value())
{
TLLM_CHECK(configValue.value().size() == 1);
- values.push_back(configValue.value().front());
+ value = configValue.value().front();
}
+ values.push_back(value);
}
- if (!hasValues)
- {
- return std::nullopt;
- }
return std::make_optional>(values);
}
@@ -72,26 +69,52 @@ class SamplingConfig
TLLM_CHECK(configs.size() > 0);
beamWidth = configs.front().beamWidth;
normalizeLogProbs = configs.front().normalizeLogProbs;
- temperature = fuseValues(configs, [&configs](SizeType ci) { return configs[ci].temperature; });
- minLength = fuseValues(configs, [&configs](SizeType ci) { return configs[ci].minLength; });
- repetitionPenalty
- = fuseValues(configs, [&configs](SizeType ci) { return configs[ci].repetitionPenalty; });
- presencePenalty
- = fuseValues(configs, [&configs](SizeType ci) { return configs[ci].presencePenalty; });
- topK = fuseValues(configs, [&configs](SizeType ci) { return configs[ci].topK; });
- topP = fuseValues(configs, [&configs](SizeType ci) { return configs[ci].topP; });
- randomSeed = fuseValues(configs, [&configs](SizeType ci) { return configs[ci].randomSeed; });
- topPDecay = fuseValues(configs, [&configs](SizeType ci) { return configs[ci].topPDecay; });
- topPMin = fuseValues(configs, [&configs](SizeType ci) { return configs[ci].topPMin; });
- topPResetIds = fuseValues(configs, [&configs](SizeType ci) { return configs[ci].topPResetIds; });
- beamSearchDiversityRate
- = fuseValues(configs, [&configs](SizeType ci) { return configs[ci].beamSearchDiversityRate; });
- lengthPenalty = fuseValues(configs, [&configs](SizeType ci) { return configs[ci].lengthPenalty; });
- earlyStopping = fuseValues(configs, [&configs](SizeType ci) { return configs[ci].earlyStopping; });
- draftAcceptanceThreshold
- = fuseValues(configs, [&configs](SizeType ci) { return configs[ci].draftAcceptanceThreshold; });
- topKMedusaHeads = fuseValues>(
- configs, [&configs](SizeType ci) { return configs[ci].topKMedusaHeads; });
+ temperature = fuseValues(
+ configs, [&configs](size_t ci) { return configs[ci].temperature; },
+ layers::DefaultDecodingParams::getTemperature());
+ minLength = fuseValues(
+ configs, [&configs](size_t ci) { return configs[ci].minLength; },
+ layers::DefaultDecodingParams::getMinLength());
+ repetitionPenalty = fuseValues(
+ configs, [&configs](size_t ci) { return configs[ci].repetitionPenalty; },
+ layers::DefaultDecodingParams::getRepetitionPenalty());
+ presencePenalty = fuseValues(
+ configs, [&configs](size_t ci) { return configs[ci].presencePenalty; },
+ layers::DefaultDecodingParams::getPresencePenalty());
+ frequencyPenalty = fuseValues(
+ configs, [&configs](size_t ci) { return configs[ci].frequencyPenalty; },
+ layers::DefaultDecodingParams::getFrequencyPenalty());
+ topK = fuseValues(
+ configs, [&configs](size_t ci) { return configs[ci].topK; }, layers::DefaultDecodingParams::getTopK());
+ topP = fuseValues(
+ configs, [&configs](size_t ci) { return configs[ci].topP; }, layers::DefaultDecodingParams::getTopP());
+ randomSeed = fuseValues(
+ configs, [&configs](size_t ci) { return configs[ci].randomSeed; },
+ layers::DefaultDecodingParams::getSeed());
+ topPDecay = fuseValues(
+ configs, [&configs](size_t ci) { return configs[ci].topPDecay; },
+ layers::DefaultDecodingParams::getTopPDecay());
+ topPMin = fuseValues(
+ configs, [&configs](size_t ci) { return configs[ci].topPMin; },
+ layers::DefaultDecodingParams::getTopPMin());
+ topPResetIds = fuseValues(
+ configs, [&configs](size_t ci) { return configs[ci].topPResetIds; },
+ layers::DefaultDecodingParams::getTopPResetId());
+ beamSearchDiversityRate = fuseValues(
+ configs, [&configs](size_t ci) { return configs[ci].beamSearchDiversityRate; },
+ layers::DefaultDecodingParams::getBeamSearchDiversity());
+ lengthPenalty = fuseValues(
+ configs, [&configs](size_t ci) { return configs[ci].lengthPenalty; },
+ layers::DefaultDecodingParams::getLengthPenalty());
+ earlyStopping = fuseValues(
+ configs, [&configs](size_t ci) { return configs[ci].earlyStopping; },
+ layers::DefaultDecodingParams::getEarlyStopping());
+ topKMedusaHeads = fuseValues>(
+ configs, [&configs](size_t ci) { return configs[ci].topKMedusaHeads; },
+ layers::DefaultDecodingParams::getTopKMedusaHeads());
+ // Only used for tests.
+ draftAcceptanceThreshold = fuseValues(
+ configs, [&configs](size_t ci) { return configs[ci].draftAcceptanceThreshold; }, 0);
}
explicit SamplingConfig(executor::SamplingConfig const& samplingConfig,
@@ -148,13 +171,13 @@ class SamplingConfig
// beam search layer
OptVec beamSearchDiversityRate; // [1] or [batch_size]
OptVec lengthPenalty; // [1] or [batch_size]
- OptVec earlyStopping; // [1] or [batch_size]
+ OptVec earlyStopping; // [1] or [batch_size]
// speculative decoding, only the first value is used (in gptDecoderBatch.cpp)
OptVec draftAcceptanceThreshold; // [1] or [batch_size]
// medusa params
- OptVec> topKMedusaHeads; // [batchSize, maxMedusaHeads]
+ OptVec> topKMedusaHeads; // [batchSize, maxMedusaHeads]
std::optional normalizeLogProbs;
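
The reworked fuseValues no longer requires every config to agree on whether a field is set; missing entries now fall back to a per-field default. A standalone sketch of that contract, with the default passed explicitly instead of coming from DefaultDecodingParams:

#include <cstddef>
#include <functional>
#include <optional>
#include <vector>

// Standalone sketch of the new fuseValues contract: each config may or may not carry a
// single value; missing entries are filled with the supplied default value.
template <typename T>
std::optional<std::vector<T>> fuseValues(
    std::size_t numConfigs, std::function<std::optional<std::vector<T>>(std::size_t)> accessor, T defaultValue)
{
    std::vector<T> values;
    for (std::size_t ci = 0; ci < numConfigs; ++ci)
    {
        auto value = defaultValue;
        auto const configValue = accessor(ci);
        if (configValue.has_value())
        {
            value = configValue.value().front(); // per-request fields hold exactly one element
        }
        values.push_back(value);
    }
    return std::make_optional(values);
}

int main()
{
    // Two requests: the first sets temperature, the second relies on the default (assumed 1.0f here).
    std::vector<std::optional<std::vector<float>>> temperatures{std::vector<float>{0.7f}, std::nullopt};
    auto const fused = fuseValues<float>(
        temperatures.size(), [&](std::size_t ci) { return temperatures[ci]; }, 1.0f);
    return (fused.has_value() && (*fused)[0] == 0.7f && (*fused)[1] == 1.0f) ? 0 : 1;
}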
diff --git a/cpp/tensorrt_llm/runtime/utils/multiDeviceUtils.h b/cpp/include/tensorrt_llm/runtime/utils/multiDeviceUtils.h
similarity index 100%
rename from cpp/tensorrt_llm/runtime/utils/multiDeviceUtils.h
rename to cpp/include/tensorrt_llm/runtime/utils/multiDeviceUtils.h
diff --git a/cpp/tensorrt_llm/CMakeLists.txt b/cpp/tensorrt_llm/CMakeLists.txt
index 751b43089..97c652352 100644
--- a/cpp/tensorrt_llm/CMakeLists.txt
+++ b/cpp/tensorrt_llm/CMakeLists.txt
@@ -30,6 +30,7 @@ add_subdirectory(common)
add_subdirectory(kernels)
add_subdirectory(layers)
add_subdirectory(runtime)
+add_subdirectory(executor_worker)
set(BATCH_MANAGER_TARGET tensorrt_llm_batch_manager_static)
set(BATCH_MANAGER_TARGET_ARCH "unknown")
@@ -196,8 +197,9 @@ set(TRTLLM_LINK_LIBS
kernels_src
context_attention_src
decoder_attention_src
- cutlass2_src
- cutlass3_src
+ fpA_intB_gemm_src
+ moe_gemm_src
+ cutlass_src
layers_src
runtime_src)
@@ -218,44 +220,31 @@ set_target_properties(
PROPERTIES CXX_STANDARD "17" CXX_STANDARD_REQUIRED "YES" CXX_EXTENSIONS "NO"
LINK_FLAGS "${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}")
-target_link_libraries(${SHARED_TARGET} PUBLIC ${TRTLLM_LINK_LIBS})
-
-if(WIN32)
- target_link_libraries(${SHARED_TARGET}
- PUBLIC $)
- set_target_properties(
- ${SHARED_TARGET} PROPERTIES LINK_FLAGS
- "/WHOLEARCHIVE:${BATCH_MANAGER_TARGET}")
-else()
- # Assume everything else is like gcc
- target_link_libraries(
- ${SHARED_TARGET}
- PRIVATE "-Wl,--whole-archive" $
- "-Wl,--no-whole-archive")
-endif()
-
-if(WIN32)
- target_link_libraries(${SHARED_TARGET}
- PUBLIC $)
- set_target_properties(
- ${SHARED_TARGET} PROPERTIES LINK_FLAGS "/WHOLEARCHIVE:${EXECUTOR_TARGET}")
-else()
- # Assume everything else is like gcc
- target_link_libraries(
- ${SHARED_TARGET}
- PRIVATE "-Wl,--whole-archive" $
- "-Wl,--no-whole-archive")
-endif()
+function(link_whole_archive TARGET LIBRARY_TO_LINK)
+ if(WIN32)
+ target_link_libraries(${TARGET} PUBLIC $)
+ set_target_properties(
+ ${TARGET} PROPERTIES LINK_FLAGS "/WHOLEARCHIVE:${LIBRARY_TO_LINK}")
+ else()
+ # Assume everything else is like gcc
+ target_link_libraries(
+ ${TARGET} PRIVATE "-Wl,--whole-archive" $
+ "-Wl,--no-whole-archive")
+ endif()
+endfunction()
-add_dependencies(${SHARED_TARGET} check_symbol)
-add_dependencies(${SHARED_TARGET} check_symbol_executor)
+target_link_libraries(${SHARED_TARGET} PUBLIC ${TRTLLM_LINK_LIBS})
+link_whole_archive(${SHARED_TARGET} ${BATCH_MANAGER_TARGET})
+link_whole_archive(${SHARED_TARGET} ${EXECUTOR_TARGET})
# Cyclic dependency of batch manager on TRT-LLM
target_link_libraries(${BATCH_MANAGER_TARGET} INTERFACE ${SHARED_TARGET})
-
# Cyclic dependency of executor on TRT-LLM
target_link_libraries(${EXECUTOR_TARGET} INTERFACE ${SHARED_TARGET})
+add_dependencies(${SHARED_TARGET} check_symbol)
+add_dependencies(${SHARED_TARGET} check_symbol_executor)
+
if(BUILD_PYT)
add_subdirectory(thop)
endif()
diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a
index 232995c8c..557044510 100644
--- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a
+++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:6bd5ec7130a703889eb51fe6591c93a079ded644ca089099efe5e3d72474838e
-size 2896708
+oid sha256:d8a083974ff58e74dec95d1ad438bf84be9adeedeb20b5e7254fe56d6a4bf40c
+size 2997970
diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a
index a82c84b2b..1d18f8425 100644
--- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a
+++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:d25d35be9ec13d1f0a0b9f3ed40362879d9ac50bdfcdcb827990554a26ff5c10
-size 2923694
+oid sha256:40cace20ce33a945ed12a2a2e382053aa90113d8bed2623c985dbb60b943251e
+size 3034874
diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt
index 50577e778..c0373a814 100644
--- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt
+++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt
@@ -1,3 +1,3 @@
-cafe56cc4a916b91ea338a8412c79fef libtensorrt_llm_batch_manager_static.a
-3274866669694da8f09e30388939b7dd libtensorrt_llm_batch_manager_static.pre_cxx11.a
-165fe125d6bf55090d8a7dec012d08f8d0e7a54b commit
\ No newline at end of file
+7c5e14e8ed4e3e0641a8aefa659a03c0 libtensorrt_llm_batch_manager_static.a
+79a986633cb1f0dc6621423bbbf21727 libtensorrt_llm_batch_manager_static.pre_cxx11.a
+83029c1606a00e0e4aaf5ea2de17867a6e5ddd9b commit
\ No newline at end of file
diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a
index 532e616e7..e8ac17414 100644
--- a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a
+++ b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:27dbbdae087a946d1762f11efe953a1b1b282e27747708145c405e9380fce287
-size 2822910
+oid sha256:913f548b9f66aaea93baaa40bd7ca37f4fb0b52f5ed0778b1fe52c136141433c
+size 2916334
diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a
index 2b55b566b..7c9e64206 100644
--- a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a
+++ b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:622724d6b9219dd3d4710a822ca92d497c466cdc34149258f9559c08f4470f8e
-size 2796594
+oid sha256:8dd40bb9cafae379971b365c8206fd20addb7816c64953456568110e5f694b0e
+size 2900610
diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib b/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib
index d6a4a05e5..ae9ad3606 100644
--- a/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib
+++ b/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:296c78f2c29774fab2145465a9a515a7e4aaedde96ba3c3f6fa5af91fa92dee6
-size 18976374
+oid sha256:889f62ee370c0a00c1ccfc26e82fcd1410413e44e6d955aca12a90c906e89239
+size 18428048
diff --git a/cpp/tensorrt_llm/common/cudaDriverWrapper.cpp b/cpp/tensorrt_llm/common/cudaDriverWrapper.cpp
index 4c816d44c..072b3c443 100644
--- a/cpp/tensorrt_llm/common/cudaDriverWrapper.cpp
+++ b/cpp/tensorrt_llm/common/cudaDriverWrapper.cpp
@@ -62,6 +62,7 @@ CUDADriverWrapper::CUDADriverWrapper()
*(void**) (&_cuLinkAddData) = load_sym(handle, "cuLinkAddData_v2");
*(void**) (&_cuLaunchCooperativeKernel) = load_sym(handle, "cuLaunchCooperativeKernel");
*(void**) (&_cuLaunchKernel) = load_sym(handle, "cuLaunchKernel");
+ *(void**) (&_cuTensorMapEncodeTiled) = load_sym(handle, "cuTensorMapEncodeTiled");
}
CUDADriverWrapper::~CUDADriverWrapper()
@@ -143,5 +144,14 @@ CUresult CUDADriverWrapper::cuLaunchKernel(CUfunction f, unsigned int gridDimX,
f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra);
}
+CUresult CUDADriverWrapper::cuTensorMapEncodeTiled(CUtensorMap* tensorMap, CUtensorMapDataType tensorDataType,
+ cuuint32_t tensorRank, void* globalAddress, cuuint64_t const* globalDim, cuuint64_t const* globalStrides,
+ cuuint32_t const* boxDim, cuuint32_t const* elementStrides, CUtensorMapInterleave interleave,
+ CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill) const
+{
+ return (*_cuTensorMapEncodeTiled)(tensorMap, tensorDataType, tensorRank, globalAddress, globalDim, globalStrides,
+ boxDim, elementStrides, interleave, swizzle, l2Promotion, oobFill);
+}
+
} // namespace common
} // namespace tensorrt_llm
diff --git a/cpp/tensorrt_llm/common/cudaDriverWrapper.h b/cpp/tensorrt_llm/common/cudaDriverWrapper.h
index d5eb5f2d7..7be5023a1 100644
--- a/cpp/tensorrt_llm/common/cudaDriverWrapper.h
+++ b/cpp/tensorrt_llm/common/cudaDriverWrapper.h
@@ -70,6 +70,11 @@ class CUDADriverWrapper
unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes,
CUstream hStream, void** kernelParams, void** extra) const;
+ CUresult cuTensorMapEncodeTiled(CUtensorMap* tensorMap, CUtensorMapDataType tensorDataType, cuuint32_t tensorRank,
+ void* globalAddress, cuuint64_t const* globalDim, cuuint64_t const* globalStrides, cuuint32_t const* boxDim,
+ cuuint32_t const* elementStrides, CUtensorMapInterleave interleave, CUtensorMapSwizzle swizzle,
+ CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill) const;
+
private:
void* handle;
CUresult (*_cuGetErrorName)(CUresult, char const**);
@@ -89,6 +94,10 @@ class CUDADriverWrapper
CUresult (*_cuLaunchKernel)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes,
CUstream hStream, void** kernelParams, void** extra);
+ CUresult (*_cuTensorMapEncodeTiled)(CUtensorMap* tensorMap, CUtensorMapDataType tensorDataType,
+ cuuint32_t tensorRank, void* globalAddress, cuuint64_t const* globalDim, cuuint64_t const* globalStrides,
+ cuuint32_t const* boxDim, cuuint32_t const* elementStrides, CUtensorMapInterleave interleave,
+ CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill);
};
inline void cuErrCheck_(CUresult stat, CUDADriverWrapper const& wrap, char const* file, int line)
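Note: the wrapper now also resolves cuTensorMapEncodeTiled when the driver library is loaded, so TMA descriptors can be built without linking the driver symbol directly. A minimal, hedged usage sketch follows; the tensor extents, tile shape, and interleave/swizzle modes below are illustrative assumptions, not values taken from this patch.

    // Hedged sketch: encode a 2D fp16 TMA descriptor through the wrapper.
    #include <cuda.h>
    #include "tensorrt_llm/common/cudaDriverWrapper.h"

    CUresult encodeExampleTensorMap(tensorrt_llm::common::CUDADriverWrapper const& driver,
        void* globalAddress, CUtensorMap& tensorMap)
    {
        cuuint64_t globalDim[2] = {1024, 1024};     // assumed tensor extents
        cuuint64_t globalStrides[1] = {1024 * 2};   // assumed row pitch in bytes (fp16)
        cuuint32_t boxDim[2] = {64, 64};            // assumed tile (box) shape
        cuuint32_t elementStrides[2] = {1, 1};

        return driver.cuTensorMapEncodeTiled(&tensorMap, CU_TENSOR_MAP_DATA_TYPE_FLOAT16,
            /*tensorRank=*/2, globalAddress, globalDim, globalStrides, boxDim, elementStrides,
            CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
            CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
    }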
diff --git a/cpp/tensorrt_llm/common/envUtils.cpp b/cpp/tensorrt_llm/common/envUtils.cpp
index 06f87b828..e6764f04a 100644
--- a/cpp/tensorrt_llm/common/envUtils.cpp
+++ b/cpp/tensorrt_llm/common/envUtils.cpp
@@ -22,21 +22,39 @@
namespace tensorrt_llm::common
{
-// XQA kernels (optimized kernels for generation phase).
-bool forceXQAKernels()
+static std::optional<int32_t> getIntEnv(char const* name)
{
- char const* force_xqa_env_var = getenv("TRTLLM_FORCE_XQA");
- static bool forceXQA = false;
- if (force_xqa_env_var != nullptr)
+ char const* const env = std::getenv(name);
+ if (env == nullptr)
{
- if (force_xqa_env_var[0] == '1' && force_xqa_env_var[1] == '\0')
- {
- forceXQA = true;
- }
+ return std::nullopt;
}
+ int32_t const val = std::stoi(env);
+ if (val <= 0)
+ {
+ return std::nullopt;
+ }
+ return {val};
+};
+
+// XQA kernels (optimized kernels for generation phase).
+bool forceXQAKernels()
+{
+ static bool const forceXQA = (getIntEnv("TRTLLM_FORCE_XQA").value_or(0) != 0);
return forceXQA;
}
+int32_t xqaMaxNbCtaPerKVHeadFactor()
+{
+ return envXqaNbCtaPerKVHead().value_or(8);
+}
+
+std::optional<int32_t> envXqaNbCtaPerKVHead()
+{
+ static std::optional<int32_t> const ret = getIntEnv("TRTLLM_XQA_BLOCKS_PER_SEQUENCE");
+ return ret;
+}
+
// Tune the number of blocks per sequence for accuracy/performance purpose.
bool getEnvMmhaMultiblockDebug()
{
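Note: getIntEnv treats an unset variable or a non-positive value as std::nullopt, and both forceXQAKernels and envXqaNbCtaPerKVHead cache their first result in a function-local static. A small illustration-only sketch of the resulting behavior; the main() harness here is an assumption for demonstration, not part of the library.

    // Illustration only: behaviour of the new env helpers at process start.
    #include <cstdlib>
    #include <iostream>
    #include "tensorrt_llm/common/envUtils.h"

    int main()
    {
        // Must be set before the first call, because the result is cached.
        setenv("TRTLLM_FORCE_XQA", "1", /*overwrite=*/1);
        std::cout << std::boolalpha
                  << tensorrt_llm::common::forceXQAKernels() << "\n"; // true

        // "0" (non-positive) resolves to std::nullopt, like an unset variable.
        setenv("TRTLLM_XQA_BLOCKS_PER_SEQUENCE", "0", 1);
        std::cout << tensorrt_llm::common::envXqaNbCtaPerKVHead().has_value()
                  << "\n"; // false
        return 0;
    }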
diff --git a/cpp/tensorrt_llm/common/envUtils.h b/cpp/tensorrt_llm/common/envUtils.h
index 1a5224c4f..16429c74c 100644
--- a/cpp/tensorrt_llm/common/envUtils.h
+++ b/cpp/tensorrt_llm/common/envUtils.h
@@ -16,6 +16,8 @@
*/
#pragma once
+#include <cstdint>
+#include <optional>
namespace tensorrt_llm::common
{
@@ -23,6 +25,14 @@ namespace tensorrt_llm::common
// XQA kernels (optimized kernels for generation phase).
bool forceXQAKernels();
+// Maximum number of CTAs per KV head; running multiple CTAs for one KV head is multi-block mode.
+// This limit applies when both max_batch_size and max_beam_width are reached.
+// If batch_size or beam_width is below its maximum, more CTAs per KV head than this
+// value may be used.
+int32_t xqaMaxNbCtaPerKVHeadFactor();
+
+std::optional<int32_t> envXqaNbCtaPerKVHead();
+
// Tune the number of blocks per sequence for accuracy/performance purpose.
bool getEnvMmhaMultiblockDebug();
diff --git a/cpp/tensorrt_llm/common/mpiUtils.cpp b/cpp/tensorrt_llm/common/mpiUtils.cpp
index 9c8cb2856..0c1aa1769 100644
--- a/cpp/tensorrt_llm/common/mpiUtils.cpp
+++ b/cpp/tensorrt_llm/common/mpiUtils.cpp
@@ -19,6 +19,7 @@
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/runtime/common.h"
+#include "tensorrt_llm/runtime/iBuffer.h"
#include
#include
@@ -35,7 +36,7 @@ namespace tensorrt_llm::mpi
MPI_Datatype getMpiDtype(MpiType dtype)
{
- static const std::unordered_map<MpiType, MPI_Datatype> dtype_map{
+ static std::unordered_map<MpiType, MPI_Datatype> const dtype_map{
{MpiType::kBYTE, MPI_BYTE},
{MpiType::kHALF, MPI_UINT16_T},
@@ -57,7 +58,7 @@ MPI_Datatype getMpiDtype(MpiType dtype)
MPI_Op getMpiOp(MpiOp op)
{
- static const std::unordered_map<MpiOp, MPI_Op> op_map{
+ static std::unordered_map<MpiOp, MPI_Op> const op_map{
{MpiOp::NULLOP, MPI_OP_NULL},
{MpiOp::MAX, MPI_MAX},
{MpiOp::MIN, MPI_MIN},
@@ -122,16 +123,33 @@ std::shared_ptr MpiComm::bcastAsync(void* buffer, size_t size, MpiTy
return r;
}
+std::shared_ptr<MpiRequest> MpiComm::bcastAsync(runtime::IBuffer& buf, int root) const
+{
+ TLLM_CHECK(buf.getMemoryType() != runtime::MemoryType::kGPU);
+ return bcastAsync(buf.data(), buf.getSizeInBytes(), MpiType::kBYTE, root);
+}
+
void MpiComm::bcast(void* buffer, size_t size, MpiType dtype, int root) const
{
MPICHECK(MPI_Bcast(buffer, size, getMpiDtype(dtype), root, mComm));
}
+void MpiComm::bcast(runtime::IBuffer& buf, int root) const
+{
+ bcast(buf.data(), buf.getSizeInBytes(), MpiType::kBYTE, root);
+}
+
void MpiComm::send(void const* buffer, size_t size, MpiType dtype, int dest, int tag) const
{
MPICHECK(MPI_Send(buffer, size, getMpiDtype(dtype), dest, tag, mComm));
}
+void MpiComm::send(runtime::IBuffer const& buf, int dest, int tag) const
+{
+ TLLM_CHECK(buf.getMemoryType() != runtime::MemoryType::kGPU);
+ send(buf.data(), buf.getSizeInBytes(), MpiType::kBYTE, dest, tag);
+}
+
MPI_Status MpiComm::recv(void* buffer, size_t size, MpiType dtype, int source, int tag) const
{
MPI_Status status{};
@@ -139,6 +157,12 @@ MPI_Status MpiComm::recv(void* buffer, size_t size, MpiType dtype, int source, i
return status;
}
+MPI_Status MpiComm::recv(runtime::IBuffer& buf, int source, int tag) const
+{
+ TLLM_CHECK(buf.getMemoryType() != runtime::MemoryType::kGPU);
+ return recv(buf.data(), buf.getSizeInBytes(), MpiType::kBYTE, source, tag);
+}
+
MpiComm MpiComm::split(int color, int key) const
{
MPI_Comm splitComm;
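Note: the new IBuffer overloads forward data()/getSizeInBytes() as MpiType::kBYTE and check that the buffer is not GPU-resident. A hedged sketch of a caller; the helper function below is illustrative, not part of this patch.

    // Sketch: broadcasting host-resident data without spelling out
    // pointer, byte count, and dtype at every call site.
    #include "tensorrt_llm/common/mpiUtils.h"
    #include "tensorrt_llm/runtime/iBuffer.h"

    namespace tllm = tensorrt_llm;

    void broadcastFromRank0(tllm::mpi::MpiComm const& comm, tllm::runtime::IBuffer& hostBuf)
    {
        // Internally TLLM_CHECKs getMemoryType() != MemoryType::kGPU, then forwards to
        // bcast(buf.data(), buf.getSizeInBytes(), MpiType::kBYTE, root).
        comm.bcast(hostBuf, /*root=*/0);
    }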
diff --git a/cpp/tensorrt_llm/common/workspace.h b/cpp/tensorrt_llm/common/workspace.h
index 7620d93d7..471da2ce7 100644
--- a/cpp/tensorrt_llm/common/workspace.h
+++ b/cpp/tensorrt_llm/common/workspace.h
@@ -63,7 +63,7 @@ int8_t* nextWorkspacePtrWithAlignment(
return nextWorkspacePtrCommon(ptr, previousWorkspaceSize, alignment);
}
-size_t calculateTotalWorkspaceSize(size_t* workspaces, int count, const uintptr_t alignment = kCudaMemAlign)
+size_t calculateTotalWorkspaceSize(size_t const* workspaces, int count, const uintptr_t alignment = kCudaMemAlign)
{
size_t total = 0;
for (int i = 0; i < count; i++)
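Note: only the pointer's const-qualification changes here; the accumulation still rounds each entry up to the alignment. A small usage sketch, assuming the header's namespace is tensorrt_llm::common like the other files in this directory:

    // Sketch: total bytes for three sub-workspaces, each padded to kCudaMemAlign.
    size_t const workspaces[] = {1000, 4096, 512};
    size_t const totalBytes
        = tensorrt_llm::common::calculateTotalWorkspaceSize(workspaces, /*count=*/3);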
diff --git a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/epilogue_helpers.h b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/epilogue_helpers.h
index 758ed7d82..233d633a8 100644
--- a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/epilogue_helpers.h
+++ b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/epilogue_helpers.h
@@ -30,6 +30,7 @@
#include "cutlass/epilogue/thread/linear_combination_relu.h"
#include "cutlass/epilogue/thread/linear_combination_silu.h"
#include "cutlass_extensions/epilogue/thread/fused_activations.h"
+#include
namespace tensorrt_llm
{
@@ -48,19 +49,19 @@ struct EpilogueOpBiasFtGelu
{
};
-struct EpilogueOpDefaultSilu
+struct EpilogueOpBias
{
};
-struct EpilogueOpDefaultReLU
+struct EpilogueOpDefaultSilu
{
};
-struct EpilogueOpDefaultFtGelu
+struct EpilogueOpDefaultReLU
{
};
-struct EpilogueOpBias
+struct EpilogueOpDefaultFtGelu
{
};
@@ -71,6 +72,7 @@ struct EpilogueOpDefault
template
struct Epilogue
{
+ static_assert(sizeof(ElementType) == 0, "Unrecognized Epilogue Tag");
};
constexpr auto BiasScaleMode = cutlass::epilogue::thread::ScaleType::NoBetaScaling;
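Note: the primary Epilogue template is now poisoned with a tag-dependent static_assert, so an unsupported tag fails at compile time with a readable message instead of an opaque missing-member error. A generic, self-contained sketch of the same pattern; the names are placeholders, not taken from this file.

    // Generic illustration of a poisoned primary template with explicit specializations.
    template <typename Tag>
    struct Dispatch
    {
        // Dependent on Tag, so it only fires if the primary template is instantiated.
        static_assert(sizeof(Tag) == 0, "Unrecognized tag");
    };

    struct KnownTag
    {
    };

    template <>
    struct Dispatch<KnownTag>
    {
        static constexpr int value = 42;
    };

    static_assert(Dispatch<KnownTag>::value == 42, "specialization is selected");
    // Dispatch<int> would now fail to compile with "Unrecognized tag".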
diff --git a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/kernel/default_fpA_intB_traits.h b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/kernel/default_fpA_intB_traits.h
index 3ba82dd52..ee084116a 100644
--- a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/kernel/default_fpA_intB_traits.h
+++ b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/kernel/default_fpA_intB_traits.h
@@ -36,10 +36,11 @@ namespace kernel
template
struct MixedGemmArchTraits
{
+ static_assert(dependent_false, "Unrecognised parameterization");
};
-template
-struct MixedGemmArchTraits
+template
+struct MixedGemmArchTraits
{
static constexpr int Stages = 2;
using OperatorClass = cutlass::arch::OpClassSimt;
@@ -66,7 +67,7 @@ struct MixedGemmArchTraits::value>::type>
{
private:
- using LayoutDetails = LayoutDetailsB;
+ using LayoutDetails = LayoutDetailsB;
public:
static constexpr int ThreadblockK = LayoutDetails::ThreadblockK;
@@ -92,7 +93,7 @@ struct MixedGemmArchTraits::value>::type>
{
private:
- using LayoutDetails = LayoutDetailsB;
+ using LayoutDetails = LayoutDetailsB;
public:
static constexpr int ThreadblockK = LayoutDetails::ThreadblockK;
@@ -116,7 +117,7 @@ struct MixedGemmArchTraits::value>::type>
{
private:
- using LayoutDetails = LayoutDetailsB;
+ using LayoutDetails = LayoutDetailsB;
public:
static constexpr int ThreadblockK = LayoutDetails::ThreadblockK;
@@ -133,6 +134,34 @@ struct MixedGemmArchTraits
+struct MixedGemmArchTraits::value
+ || cutlass::platform::is_same::value
+#ifdef ENABLE_FP8
+ || cutlass::platform::is_same::value>::type
+#endif
+ >
+{
+private:
+ using LayoutDetails = LayoutDetailsB;
+
+public:
+ static constexpr int ThreadblockK = LayoutDetails::ThreadblockK;
+
+ using OperatorClass = cutlass::arch::OpClassTensorOp;
+ using AccType = float;
+ using LayoutB = typename LayoutDetails::Layout;
+
+ static constexpr int ElementsPerAccessA = 128 / cutlass::sizeof_bits::value;
+ static constexpr int ElementsPerAccessB = LayoutDetails::ElementsPerAccess;
+ static constexpr int ElementsPerAccessC = 128 / cutlass::sizeof_bits::value;
+ using InstructionShape = cutlass::gemm::GemmShape<16, 8, 256 / cutlass::sizeof_bits::value>;
+
+ using Operator = typename LayoutDetails::Operator;
+};
+
} // namespace kernel
} // namespace gemm
} // namespace cutlass
diff --git a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/kernel/fpA_intB_gemm.h b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/kernel/fpA_intB_gemm.h
index 053f73103..63469f225 100644
--- a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/kernel/fpA_intB_gemm.h
+++ b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/kernel/fpA_intB_gemm.h
@@ -546,8 +546,10 @@ struct GemmFpAIntB
run_kernel(params, shared_storage);
#elif (__CUDA_ARCH__ >= 750) && (__CUDA_ARCH__ < 800)
run_kernel(params, shared_storage);
-#elif (__CUDA_ARCH__ >= 800) && (__CUDA_ARCH__ < 900)
+#elif (__CUDA_ARCH__ >= 800) && (__CUDA_ARCH__ < 890)
run_kernel(params, shared_storage);
+#elif (__CUDA_ARCH__ == 890)
+ run_kernel(params, shared_storage);
#elif (__CUDA_ARCH__ >= 900)
CUTLASS_NOT_IMPLEMENTED(); // Don't compile these for Hopper or later. Use CUTLASS 3.x kernels.
#else
diff --git a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/kernel/mixed_gemm_B_layout.h b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/kernel/mixed_gemm_B_layout.h
index 33404721f..a1712431e 100644
--- a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/kernel/mixed_gemm_B_layout.h
+++ b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/kernel/mixed_gemm_B_layout.h
@@ -42,16 +42,16 @@ namespace gemm
namespace kernel
{
-template
+template
struct LayoutDetailsB
{
};
// Volta specialiations. Volta will dequantize before STS, so we need a different operator
-template
-struct LayoutDetailsB
+template
+struct LayoutDetailsB
{
- static constexpr int ThreadblockK = 64;
+ static constexpr int ThreadblockK = 128 * 8 / cutlass::sizeof_bits::value;
using Layout = layout::ColumnMajor;
static constexpr int ElementsPerAccess = 8;
using Operator = cutlass::arch::OpMultiplyAdd;
@@ -59,19 +59,19 @@ struct LayoutDetailsB
// Specializations for Turing+ when B is FP16. These are currently only used for MoE networks.
// TODO - Switch this to column major for weights since gemms should be more performant.
-template
-struct LayoutDetailsB= 75>::type>
+template
+struct LayoutDetailsB= 75>::type>
{
- static constexpr int ThreadblockK = 64;
+ static constexpr int ThreadblockK = 128 * 8 / cutlass::sizeof_bits::value;
using Layout = layout::ColumnMajor;
static constexpr int ElementsPerAccess = 128 / cutlass::sizeof_bits::value;
using Operator = cutlass::arch::OpMultiplyAdd;
};
-template
-struct LayoutDetailsB= 75>::type>
+template
+struct LayoutDetailsB= 75>::type>
{
- static constexpr int ThreadblockK = 64;
+ static constexpr int ThreadblockK = 128 * 8 / cutlass::sizeof_bits::value;
using Layout = layout::ColumnMajor;
static constexpr int ElementsPerAccess = 128 / cutlass::sizeof_bits::value;
using Operator = cutlass::arch::OpMultiplyAdd;
@@ -79,11 +79,12 @@ struct LayoutDetailsB
- struct LayoutDetailsB < uint8_t,
- Arch, typename platform::enable_if= 75 && Arch::kMinComputeCapability<90>::type>
+template
+ struct LayoutDetailsB < TypeA,
+ uint8_t, Arch,
+ typename platform::enable_if= 75 && Arch::kMinComputeCapability<90>::type>
{
- static constexpr int ThreadblockK = 64;
+ static constexpr int ThreadblockK = 128 * 8 / cutlass::sizeof_bits::value;
private:
static constexpr int ElementsPerCacheLine = 128 * 8 / sizeof_bits::value;
@@ -95,11 +96,12 @@ template
using Operator = cutlass::arch::OpMultiplyAddDequantizeInterleavedBToA;
};
-template
- struct LayoutDetailsB < uint4b_t,
- Arch, typename platform::enable_if= 75 && Arch::kMinComputeCapability<90>::type>
+template
+ struct LayoutDetailsB < TypeA,
+ uint4b_t, Arch,
+ typename platform::enable_if= 75 && Arch::kMinComputeCapability<90>::type>
{
- static constexpr int ThreadblockK = 64;
+ static constexpr int ThreadblockK = 128 * 8 / cutlass::sizeof_bits::value;
private:
static constexpr int ElementsPerCacheLine = 128 * 8 / sizeof_bits::value;
@@ -111,19 +113,19 @@ template
using Operator = cutlass::arch::OpMultiplyAddDequantizeInterleavedBToA;
};
-template
-struct LayoutDetailsB= 90>::type>
+template
+struct LayoutDetailsB= 90>::type>
{
- static constexpr int ThreadblockK = 64;
+ static constexpr int ThreadblockK = 128 * 8 / cutlass::sizeof_bits::value;
using Layout = layout::ColumnMajor;
static constexpr int ElementsPerAccess = 128 / cutlass::sizeof_bits::value;
using Operator = cutlass::arch::OpMultiplyAdd;
};
-template