Update TensorRT-LLM (#2413)
kaiyux authored Nov 5, 2024
1 parent f6821ee commit b7868dd
Showing 366 changed files with 109,008 additions and 7,479 deletions.
6 changes: 3 additions & 3 deletions .gitignore
@@ -29,15 +29,15 @@ dump*/
config.json
/*.svg
cpp/cmake-build-*
cpp/.ccache/
cpp/.ccache
tensorrt_llm/bin
tensorrt_llm/libs
tensorrt_llm/bindings.*.so
tensorrt_llm/bindings.pyi
tensorrt_llm/bindings/*.pyi
tensorrt_llm/bindings/**/*.pyi
*docs/cpp_docs*
*docs/source/_cpp_gen*
docs/source/llm-api
docs/source/llm-api/*.rst
docs/source/llm-api-examples/llm_*.rst
*.swp

22 changes: 15 additions & 7 deletions README.md
@@ -11,21 +11,29 @@ TensorRT-LLM
[![version](https://img.shields.io/badge/release-0.15.0.dev-green)](./tensorrt_llm/version.py)
[![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)

[Architecture](./docs/source/architecture/overview.md)   |   [Results](./docs/source/performance/perf-overview.md)   |   [Examples](./examples/)   |   [Documentation](./docs/source/)
[Architecture](./docs/source/architecture/overview.md)   |   [Results](./docs/source/performance/perf-overview.md)   |   [Examples](./examples/)   |   [Documentation](./docs/source/)   |   [Roadmap](https://docs.google.com/presentation/d/1gycPmtdh7uUcH6laOvW65Dbp9F1McUkGDIcAyjicBZs/edit?usp=sharing)

---
<div align="left">

## Latest News

* [2024/11/02] 🌟🌟🌟 NVIDIA and LlamaIndex Developer Contest
🙌 Enter for a chance to win prizes including an NVIDIA® GeForce RTX™ 4080 SUPER GPU, DLI credits, and more🙌
[➡️ link](https://developer.nvidia.com/llamaindex-developer-contest)
<div align="center">
<img src="docs/source/media/image-11-02-2024.png" width="50%">
<div align="left">

* [2024/10/28] 🏎️🏎️🏎️ NVIDIA GH200 Superchip Accelerates Inference by 2x in Multiturn Interactions with Llama Models
[➡️ link](https://developer.nvidia.com/blog/nvidia-gh200-superchip-accelerates-inference-by-2x-in-multiturn-interactions-with-llama-models/)

* [2024/10/22] New 📝 Step-by-step instructions on how to
✅ Optimize LLMs with NVIDIA TensorRT-LLM,
✅ Deploy the optimized models with Triton Inference Server,
✅ Autoscale LLMs deployment in a Kubernetes environment.
🙌 Technical Deep Dive:
[➡️ link](https://nvda.ws/3YgI8UT)
<div align="center">
<img src="docs/source/media/image-10-22-2024.png" width="50%">
<div align="left">

* [2024/10/07] 🚀🚀🚀Optimizing Microsoft Bing Visual Search with NVIDIA Accelerated Libraries
[➡️ link](https://developer.nvidia.com/blog/optimizing-microsoft-bing-visual-search-with-nvidia-accelerated-libraries/)
@@ -45,6 +53,9 @@ TensorRT-LLM
* [2024/09/04] 🏎️🏎️🏎️ Best Practices for Tuning TensorRT-LLM for Optimal Serving with BentoML
[➡️ link](https://www.bentoml.com/blog/tuning-tensor-rt-llm-for-optimal-serving-with-bentoml)

<details close>
<summary>Previous News</summary>

* [2024/08/20] 🏎️SDXL with #TensorRT Model Optimizer ⏱️⚡ 🏁 cache diffusion 🏁 quantization aware training 🏁 QLoRA 🏁 #Python 3.12
[➡️ link](https://developer.nvidia.com/blog/nvidia-tensorrt-model-optimizer-v0-15-boosts-inference-performance-and-expands-model-support/)

Expand All @@ -71,9 +82,6 @@ TensorRT-LLM
* [2024/07/02] Let the @MistralAI MoE tokens fly 📈 🚀 #Mixtral 8x7B with NVIDIA #TensorRT #LLM on #H100.
[➡️ Tech blog](https://developer.nvidia.com/blog/achieving-high-mixtral-8x7b-performance-with-nvidia-h100-tensor-core-gpus-and-tensorrt-llm?ncid=so-twit-928467)

<details close>
<summary>Previous News</summary>

* [2024/06/24] Enhanced with NVIDIA #TensorRT #LLM, @upstage.ai’s solar-10.7B-instruct is ready to power your developer projects through our API catalog 🏎️. ✨[➡️ link](https://build.nvidia.com/upstage/solar-10_7b-instruct?snippet_tab=Try )

* [2024/06/18] CYMI: 🤩 Stable Diffusion 3 dropped last week 🎊 🏎️ Speed up your SD3 with #TensorRT INT8 Quantization[➡️ link](https://build.nvidia.com/upstage/solar-10_7b-instruct?snippet_tab=Try )
2 changes: 1 addition & 1 deletion benchmarks/README.md
@@ -7,6 +7,6 @@ There are currently three workflows to benchmark TensorRT-LLM:
- The recommended workflow; it uses the TensorRT-LLM C++ API and can take advantage of the latest TensorRT-LLM features.
* [Python benchmarks](./python)
- The Python benchmarking scripts can only benchmark the Python runtime, which does not support the latest features, such as in-flight batching.
* [The Python benchmarking suite](https://nvidia.github.io/TensorRT-LLM/performance/perf-benchmarking.html)
* [The Python benchmarking suite](../docs/source/performance/perf-benchmarking.md)
- This benchmarking suite is native to TensorRT-LLM and provides a Python tool for reproducing and testing TensorRT-LLM performance.
- _NOTE_: This benchmarking suite is a current work in progress and is prone to large changes.
14 changes: 13 additions & 1 deletion benchmarks/cpp/gptManagerBenchmark.cpp
@@ -147,6 +147,7 @@ struct BenchmarkParams
std::optional<float> freeGpuMemoryFraction{std::nullopt};
std::optional<float> crossKvCacheFraction{std::nullopt};
bool enableTrtOverlap{false};
bool enableBatchSizeTuning{false};
bool enableBlockReuse{false};
bool enableChunkedContext{false};
bool streaming{false};
@@ -879,8 +880,9 @@ class ExecutorServer
, mShutdown(false)
, mLogIterationData(logIterationData)
{
texec::DynamicBatchConfig dynamicBatchConfig(benchmarkParams.enableBatchSizeTuning);
texec::SchedulerConfig schedulerConfig(capacitySchedulerPolicy, std::nullopt, dynamicBatchConfig);

texec::SchedulerConfig schedulerConfig(capacitySchedulerPolicy);
texec::KvCacheConfig kvCacheConfig(benchmarkParams.enableBlockReuse, benchmarkParams.maxTokensInPagedKvCache,
benchmarkParams.maxAttentionWindowVec, benchmarkParams.sinkTokenLength,
benchmarkParams.freeGpuMemoryFraction, benchmarkParams.kvHostCacheSize, benchmarkParams.kvOnboardBlocks,
@@ -1971,6 +1973,8 @@ int main(int argc, char* argv[])
"max_num_tokens", "The max runtime number of tokens per batch when benchmarking", cxxopts::value<int>());
options.add_options()("enable_trt_overlap", "Overlap TRT context preparation and execution",
cxxopts::value<bool>()->default_value("false"));
options.add_options()(
"enable_batch_size_tuning", "Dynamic tuning of batch size", cxxopts::value<bool>()->default_value("false"));
options.add_options()("enable_exp_delays", "Enables exponential delay distr to mimic real world request arrival",
cxxopts::value<bool>()->default_value("false"));
options.add_options()("streaming", "Operate in streaming mode", cxxopts::value<bool>()->default_value("false"));
@@ -2152,6 +2156,9 @@ int main(int argc, char* argv[])
// Argument: Enable TRT overlap
benchmarkParams.enableTrtOverlap = result["enable_trt_overlap"].as<bool>();

// Argument: Enable dynamic tuning of batch size
benchmarkParams.enableBatchSizeTuning = result["enable_batch_size_tuning"].as<bool>();

// Argument: Enable KV cache reuse
benchmarkParams.enableBlockReuse = result["enable_kv_cache_reuse"].as<bool>();

@@ -2190,6 +2197,11 @@ int main(int argc, char* argv[])
// Argument: Enable batch stats output
bool logIterationData = result["log_iteration_data"].as<bool>();

if (logIterationData)
{
TLLM_LOG_WARNING("Setting log_iteration_data to true adds overheads and may result in lower perf");
}

// Argument: Enable chunked context
benchmarkParams.enableChunkedContext = result["enable_chunked_context"].as<bool>();

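Taken together, the hunks above thread the new enable_batch_size_tuning flag from the command line through BenchmarkParams into the executor's scheduler. A minimal sketch of that wiring in isolation (makeSchedulerConfig is a hypothetical helper; the texec alias, DynamicBatchConfig, and SchedulerConfig constructor are as used above):

#include "tensorrt_llm/executor/executor.h"
#include <optional>

namespace texec = tensorrt_llm::executor;

// Hypothetical helper: builds a scheduler config with dynamic batch-size
// tuning toggled by the benchmark's --enable_batch_size_tuning flag.
texec::SchedulerConfig makeSchedulerConfig(
    texec::CapacitySchedulerPolicy policy, bool enableBatchSizeTuning)
{
    // DynamicBatchConfig carries the tuning switch and is forwarded to the
    // scheduler as its third constructor argument.
    texec::DynamicBatchConfig dynamicBatchConfig(enableBatchSizeTuning);
    return texec::SchedulerConfig(policy, std::nullopt, dynamicBatchConfig);
}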
2 changes: 0 additions & 2 deletions benchmarks/cpp/utils/prepare_real_data.py
@@ -231,8 +231,6 @@ def dataset(root_args, **kwargs):
}, root_args.output)
else:
print_dataset(
task_ids,
input_ids,
output_lens,
tokenizer=None,
)
2 changes: 1 addition & 1 deletion benchmarks/python/all_reduce.py
@@ -41,7 +41,7 @@ def allreduce_benchmark(dtype: str,
torch.cuda.set_device(local_rank)
cudart.cudaSetDevice(local_rank)

mapping = Mapping(world_size, rank, gpus_per_node, world_size)
mapping = Mapping(world_size, rank, gpus_per_node, tp_size=world_size)

if world_size == 1:
raise RuntimeError("Benchmark must run with mpi_world_size > 1")
4 changes: 2 additions & 2 deletions benchmarks/python/enc_dec_benchmark.py
@@ -93,7 +93,7 @@ def read_config(component):

cross_attention = pretrained_config[
"architecture"] == "DecoderModel"
skip_cross_qkv = pretrained_config.get('skip_cross_qkv', False)
skip_cross_kv = pretrained_config.get('skip_cross_kv', False)
has_position_embedding = pretrained_config[
"has_position_embedding"]
has_token_type_embedding = hasattr(pretrained_config,
@@ -138,7 +138,7 @@ def read_config(component):
lora_target_modules=lora_config.get('lora_target_modules'),
trtllm_modules_to_hf_modules=lora_config.get(
'trtllm_modules_to_hf_modules'),
skip_cross_qkv=skip_cross_qkv,
skip_cross_kv=skip_cross_kv,
)

# additional info for benchmark
55 changes: 55 additions & 0 deletions cpp/include/tensorrt_llm/batch_manager/contextProgress.h
@@ -0,0 +1,55 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include "tensorrt_llm/runtime/cudaEvent.h"
#include <atomic>
#include <condition_variable>
#include <mutex>
#include <vector>

namespace tensorrt_llm::batch_manager
{

// Used to track the progress of the context phase in dist-serving
class ContextProgress
{
public:
ContextProgress(int numLayers);

void recordEvent(int layerIdx, cudaStream_t stream);

void wait(int layerIdx);

int getNumLayers() const
{
return mCudaEvents.size();
}

cudaEvent_t getEvent(int layerIdx)
{
return mCudaEvents.at(layerIdx).get();
}

private:
std::mutex mMutex;
std::condition_variable mConditionVariable;
std::unique_ptr<std::atomic_bool[]> mCudaEventsRecorded;
std::vector<runtime::CudaEvent> mCudaEvents;
};

} // namespace tensorrt_llm::batch_manager
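The header above only declares the interface; as an illustrative sketch (not part of this commit), a producer records a CUDA event per layer while a consumer blocks until a given layer has completed. The helper functions below are hypothetical and rely solely on the methods declared above:

#include "tensorrt_llm/batch_manager/contextProgress.h"
#include <cuda_runtime_api.h>

using tensorrt_llm::batch_manager::ContextProgress;

// Producer side: record an event after each layer's context-phase work
// has been enqueued on `stream`.
void recordContextProgress(ContextProgress& progress, cudaStream_t stream)
{
    for (int layer = 0; layer < progress.getNumLayers(); ++layer)
    {
        // ... enqueue layer `layer`'s kernels on `stream` ...
        progress.recordEvent(layer, stream);
    }
}

// Consumer side: block until the producer has recorded `layer`, then use
// the recorded event to make another stream wait on it.
void waitForLayer(ContextProgress& progress, int layer)
{
    progress.wait(layer);
    cudaEvent_t event = progress.getEvent(layer);
    (void) event; // e.g. cudaStreamWaitEvent(copyStream, event, 0);
}

Recording one event per layer allows downstream work (for example, per-layer transfers in disaggregated serving) to overlap with the remainder of the context phase instead of waiting for the whole forward pass.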
164 changes: 164 additions & 0 deletions cpp/include/tensorrt_llm/batch_manager/decoderBuffers.h
@@ -0,0 +1,164 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include "tensorrt_llm/common/mpiUtils.h"
#include "tensorrt_llm/runtime/eagleBuffers.h"
#include "tensorrt_llm/runtime/explicitDraftTokensBuffers.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/lookaheadBuffers.h"
#include "tensorrt_llm/runtime/modelConfig.h"
#include "tensorrt_llm/runtime/worldConfig.h"

#include <optional>
#include <vector>

namespace tensorrt_llm::runtime
{
class TllmRuntime;
} // namespace tensorrt_llm::runtime

namespace tensorrt_llm::batch_manager
{

class DecoderStepAsyncSend
{
public:
using BufferPtr = runtime::IBuffer::SharedPtr;

DecoderStepAsyncSend(std::shared_ptr<mpi::MpiComm> const& commSession, BufferPtr const& newOutputTokensHost,
BufferPtr const& finished, BufferPtr const& sequenceLengthsHost, BufferPtr const& cumLogProbsHost,
BufferPtr const& logProbsHost, BufferPtr const& cacheIndirectionOutput, BufferPtr const& acceptedCumSum,
BufferPtr const& packedPaths, BufferPtr const& finishReasonsHost, int peer);

~DecoderStepAsyncSend();

private:
std::shared_ptr<mpi::MpiRequest> mRequest1;
std::shared_ptr<mpi::MpiRequest> mRequest2;
std::shared_ptr<mpi::MpiRequest> mRequest3;
std::shared_ptr<mpi::MpiRequest> mRequest4;
std::shared_ptr<mpi::MpiRequest> mRequest5;
std::shared_ptr<mpi::MpiRequest> mRequest6;
std::shared_ptr<mpi::MpiRequest> mRequest7;
std::shared_ptr<mpi::MpiRequest> mRequest8;
std::shared_ptr<mpi::MpiRequest> mRequest9;
};

class DecoderSlotAsyncSend
{
public:
using TensorPtr = runtime::ITensor::SharedPtr;

DecoderSlotAsyncSend(std::shared_ptr<mpi::MpiComm> const& commSession, TensorPtr const& outputIdsView,
TensorPtr const& sequenceLengthView, TensorPtr const& cumLogProbsView, TensorPtr const& logProbsView,
bool returnLogProbs, int peer);

~DecoderSlotAsyncSend();

private:
std::shared_ptr<mpi::MpiRequest> mRequest1;
std::shared_ptr<mpi::MpiRequest> mRequest2;
std::shared_ptr<mpi::MpiRequest> mRequest3;
std::shared_ptr<mpi::MpiRequest> mRequest4;
};

class DecoderBuffers
{
public:
using SizeType32 = runtime::SizeType32;
using TensorPtr = runtime::ITensor::SharedPtr;

std::vector<TensorPtr> logits;
TensorPtr slotOutputIds; // [mMaxNumRequests, beamWidth, maxSeqLen], outputIds of all batch slots
TensorPtr slotOutputIdsHost; // [beamWidth, maxSeqLen], outputIds of single batch slot
TensorPtr cacheIndirectionInput;
TensorPtr cacheIndirectionOutput;
TensorPtr sequenceLengths; // [mMaxNumRequests]
TensorPtr sequenceLengthsHost; // [mMaxNumRequests] pinned host tensor
TensorPtr finished; // [mMaxNumRequests] pinned host tensor
TensorPtr newOutputTokens; // [maxTokensPerStep, mMaxNumRequests, beamWidth]
TensorPtr newOutputTokensHost; // [maxTokensPerStep, mMaxNumRequests, beamWidth]
TensorPtr cumLogProbs; // [mMaxNumRequests, beamWidth]
TensorPtr cumLogProbsHost; // [mMaxNumRequests, beamWidth]
TensorPtr logProbs; // [mMaxNumRequests, beamWidth, maxSeqLen]
TensorPtr logProbsHost; // [mMaxNumRequests, beamWidth, maxSeqLen]
TensorPtr finishReasonsHost; // [mMaxNumRequests, beamWidth]

class DraftBuffers
{
public:
TensorPtr nextDraftTokensDevice; // [mMaxNumRequests, maxTokensPerStep-1]
TensorPtr nextDraftTokensHost; // [mMaxNumRequests, maxTokensPerStep-1]
TensorPtr prevDraftTokensLengthsDevice; // [mMaxNumRequests]
TensorPtr prevDraftTokensLengthsHost; // [mMaxNumRequests]
TensorPtr nextDraftTokensLengthsDevice; // [mMaxNumRequests]
TensorPtr nextDraftTokensLengthsHost; // [mMaxNumRequests]
TensorPtr acceptedLengthsCumSumDevice; // [mMaxNumRequests+1]
TensorPtr acceptedPackedPathsDevice; // [mMaxNumRequests * maxAcceptedTokens]
std::vector<std::vector<runtime::ITensor::SharedPtr>>
predictedDraftLogits; // [mMaxNumRequests][mMaxNumHeads][maxDraftTokens + 1, vocabSize]

void create(SizeType32 maxNumSequences, SizeType32 maxTokensPerStep, runtime::TllmRuntime const& runtime,
runtime::ModelConfig const& modelConfig);
};

DraftBuffers draftBuffers;
runtime::ExplicitDraftTokensBuffers::Inputs explicitDraftTokensBuffers;
runtime::EagleBuffers::Inputs eagleBuffers;
std::optional<runtime::LookaheadDecodingBuffers> lookaheadBuffers;

DecoderBuffers(SizeType32 maxNumSequences, SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow,
SizeType32 maxSeqLen, SizeType32 maxTokensPerStep, runtime::TllmRuntime const& runtime,
runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig);

std::unique_ptr<DecoderStepAsyncSend> asyncSend(std::shared_ptr<mpi::MpiComm> const& commSession,
bool returnLogProbs, SizeType32 maxBeamWidth, bool useMedusa, int peer);

void recv(std::shared_ptr<mpi::MpiComm> const& commSession, bool returnLogProbs, SizeType32 maxBeamWidth,
bool useMedusa, int peer);
};

class SlotDecoderBuffers
{
public:
using SizeType32 = runtime::SizeType32;
using TensorPtr = runtime::ITensor::SharedPtr;

TensorPtr outputIds; // [beamWidth, maxSeqLen], outputIds of single batch slot
TensorPtr outputIdsHost; // [beamWidth, maxSeqLen], outputIds of single batch slot
TensorPtr sequenceLengthsHost; // [beamWidth]
TensorPtr cumLogProbs; // [beamWidth]
TensorPtr cumLogProbsHost; // [beamWidth]
TensorPtr logProbs; // [beamWidth, maxSeqLen]
TensorPtr logProbsHost; // [beamWidth, maxSeqLen]
TensorPtr finishReasonsHost; // [beamWidth]

SlotDecoderBuffers(SizeType32 maxBeamWidth, SizeType32 maxSeqLen, runtime::TllmRuntime const& runtime);

static std::unique_ptr<DecoderSlotAsyncSend> asyncSend(std::shared_ptr<mpi::MpiComm> const& commSession,
TensorPtr const& outputIdsView, TensorPtr const& sequenceLengthView, TensorPtr const& cumLogProbsView,
TensorPtr const& logProbsView, bool returnLogProbs, int peer);

std::unique_ptr<DecoderSlotAsyncSend> asyncSend(std::shared_ptr<mpi::MpiComm> const& commSession,
TensorPtr const& sequenceLengthView, bool returnLogProbs, int peer);

void recv(std::shared_ptr<mpi::MpiComm> const& commSession, TensorPtr const& sequenceLengthView,
bool returnLogProbs, int peer);
};

} // namespace tensorrt_llm::batch_manager