
Update TensorRT-LLM #349

Merged 2 commits on Nov 10, 2023.
1 change: 1 addition & 0 deletions README.md
@@ -8,6 +8,7 @@ TensorRT-LLM
[![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/)
[![cuda](https://img.shields.io/badge/cuda-12.2-green)](https://developer.nvidia.com/cuda-downloads)
[![trt](https://img.shields.io/badge/TRT-9.1-green)](https://developer.nvidia.com/tensorrt)
[![version](https://img.shields.io/badge/release-0.5.0-green)](./setup.py)
[![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)

[Architecture](./docs/source/architecture.md)   |   [Results](./docs/source/performance.md)   |   [Examples](./examples/)   |   [Documentation](./docs/source/)
3 changes: 2 additions & 1 deletion benchmarks/python/mem_monitor.py
@@ -18,7 +18,8 @@


def get_memory_info(handle):
mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle,
version=pynvml.nvmlMemory_v2)
total = round(mem_info.total / 1024 / 1024 / 1024, 2)
used = round(mem_info.used / 1024 / 1024 / 1024, 2)
free = round(mem_info.free / 1024 / 1024 / 1024, 2)
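The change above passes version=pynvml.nvmlMemory_v2 so NVML returns its v2 memory struct, which additionally reports driver-reserved memory. For reference, a minimal standalone sketch of the same query through NVML's C API — this is not part of the PR and assumes an NVML installation recent enough to provide nvmlDeviceGetMemoryInfo_v2 (link with -lnvidia-ml):

#include <cstdio>
#include <nvml.h>

int main()
{
    if (nvmlInit_v2() != NVML_SUCCESS)
        return 1;

    nvmlDevice_t device;
    if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS)
    {
        nvmlMemory_v2_t mem{};
        mem.version = nvmlMemory_v2; // request the v2 layout, analogous to pynvml's version= argument
        if (nvmlDeviceGetMemoryInfo_v2(device, &mem) == NVML_SUCCESS)
        {
            constexpr double GiB = 1024.0 * 1024.0 * 1024.0;
            std::printf("total %.2f GiB, used %.2f GiB, free %.2f GiB, reserved %.2f GiB\n",
                mem.total / GiB, mem.used / GiB, mem.free / GiB, mem.reserved / GiB);
        }
    }
    nvmlShutdown();
    return 0;
}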
47 changes: 38 additions & 9 deletions cpp/CMakeLists.txt
@@ -22,6 +22,7 @@ include(CheckLanguage)
include(cmake/modules/set_ifndef.cmake)
include(cmake/modules/find_library_create_target.cmake)
include(cmake/modules/resolve_dirs.cmake)
include(cmake/modules/parse_make_options.cmake)

project(tensorrt_llm LANGUAGES CXX)

@@ -246,6 +247,22 @@ endif()
set(COMMON_HEADER_DIRS ${PROJECT_SOURCE_DIR} ${CUDAToolkit_INCLUDE_DIR})
message(STATUS "COMMON_HEADER_DIRS: ${COMMON_HEADER_DIRS}")

if(NOT WIN32 AND NOT DEFINED USE_CXX11_ABI)
find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
execute_process(
COMMAND ${Python3_EXECUTABLE} "-c"
"import torch; print(torch.compiled_with_cxx11_abi(),end='');"
RESULT_VARIABLE _PYTHON_SUCCESS
OUTPUT_VARIABLE USE_CXX11_ABI)
# Convert the bool variable to integer.
if(USE_CXX11_ABI)
set(USE_CXX11_ABI 1)
else()
set(USE_CXX11_ABI 0)
endif()
message(STATUS "USE_CXX11_ABI is set by python Torch to ${USE_CXX11_ABI}")
endif()

if(BUILD_PYT)
# Build TORCH_CUDA_ARCH_LIST
set(TORCH_CUDA_ARCH_LIST "")
@@ -304,27 +321,39 @@ print(os.path.dirname(torch.__file__),end='');"
message(STATUS "TORCH_CXX_FLAGS: ${TORCH_CXX_FLAGS}")
add_compile_options(${TORCH_CXX_FLAGS})
add_compile_definitions(TORCH_CUDA=1)

if(DEFINED USE_CXX11_ABI)
parse_make_options(${TORCH_CXX_FLAGS} "TORCH_CXX_FLAGS")
if(DEFINED TORCH_CXX_FLAGS__GLIBCXX_USE_CXX11_ABI
AND NOT ${TORCH_CXX_FLAGS__GLIBCXX_USE_CXX11_ABI} EQUAL ${USE_CXX11_ABI})
message(
WARNING
"The libtorch compilation options _GLIBCXX_USE_CXX11_ABI=${TORCH_CXX_FLAGS__GLIBCXX_USE_CXX11_ABI} "
"found by CMake conflict with the project setting USE_CXX11_ABI=${USE_CXX11_ABI}, and the project "
"setting will be discarded.")
endif()
endif()

elseif(NOT WIN32)
if(NOT USE_CXX11_ABI)
add_compile_options("-D_GLIBCXX_USE_CXX11_ABI=0")
endif()
message(STATUS "Build without PyTorch, USE_CXX11_ABI=${USE_CXX11_ABI}")
endif()

file(STRINGS "${TRT_INCLUDE_DIR}/NvInferVersion.h" VERSION_STRINGS
REGEX "#define NV_TENSORRT_.*")
foreach(TYPE MAJOR MINOR PATCH BUILD)
string(REGEX MATCH "NV_TENSORRT_${TYPE} [0-9]" TRT_TYPE_STRING
${VERSION_STRINGS})
string(REGEX MATCH "[0-9]" TRT_${TYPE} ${TRT_TYPE_STRING})
endforeach(TYPE)

foreach(TYPE MAJOR MINOR PATCH)
string(REGEX MATCH "NV_TENSORRT_SONAME_${TYPE} [0-9]" TRT_TYPE_STRING
string(REGEX MATCH "NV_TENSORRT_${TYPE} [0-9]+" TRT_TYPE_STRING
${VERSION_STRINGS})
string(REGEX MATCH "[0-9]" TRT_SO_${TYPE} ${TRT_TYPE_STRING})
string(REGEX MATCH "[0-9]+" TRT_${TYPE} ${TRT_TYPE_STRING})
endforeach(TYPE)

set(TRT_VERSION
"${TRT_MAJOR}.${TRT_MINOR}.${TRT_PATCH}"
CACHE STRING "TensorRT project version")
set(TRT_SOVERSION
"${TRT_SO_MAJOR}"
"${TRT_MAJOR}"
CACHE STRING "TensorRT library so version")
message(
STATUS
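The new top-level block asks the installed torch package whether it was built with the C++11 ABI and keeps USE_CXX11_ABI consistent with libtorch's TORCH_CXX_FLAGS. The flag matters because libstdc++ mangles std::string (and other types such as std::list) differently depending on _GLIBCXX_USE_CXX11_ABI, so a mismatch between TensorRT-LLM and libtorch shows up as unresolved symbols at link time. A small standalone probe, illustrative only and not part of this PR, that prints which ABI a given compiler invocation uses:

#include <iostream>

int main()
{
#ifdef _GLIBCXX_USE_CXX11_ABI
    // 1 selects the new (C++11) std::string ABI, 0 the old copy-on-write one.
    std::cout << "_GLIBCXX_USE_CXX11_ABI=" << _GLIBCXX_USE_CXX11_ABI << "\n";
#else
    std::cout << "_GLIBCXX_USE_CXX11_ABI is not defined (non-libstdc++ standard library)\n";
#endif
    return 0;
}

Compiling the probe once with -D_GLIBCXX_USE_CXX11_ABI=0 and once without it reproduces the two settings that the warning added above guards against.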
28 changes: 28 additions & 0 deletions cpp/cmake/modules/parse_make_options.cmake
@@ -0,0 +1,28 @@
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION &
# AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
#

function(parse_make_options options result)
foreach(option ${options})
string(REGEX REPLACE "(-D|-)" "" option ${option})
string(REPLACE "=" ";" option ${option})
list(GET option 0 option_name)
list(GET option 1 option_value)
set(${result}_${option_name}
${option_value}
PARENT_SCOPE)
endforeach()
endfunction()
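parse_make_options() turns a list of "-DNAME=VALUE" style flags into <result>_NAME variables in the parent scope; cpp/CMakeLists.txt above uses it to read _GLIBCXX_USE_CXX11_ABI out of TORCH_CXX_FLAGS. A rough C++ analogue of the string handling, for illustration only (it strips just a leading -D/- and skips options without an '=', which the CMake version does not):

#include <iostream>
#include <map>
#include <string>
#include <vector>

// Collect "-DNAME=VALUE" options into NAME -> VALUE pairs.
std::map<std::string, std::string> parseMakeOptions(std::vector<std::string> const& options)
{
    std::map<std::string, std::string> parsed;
    for (auto option : options)
    {
        if (option.rfind("-D", 0) == 0)
            option.erase(0, 2);
        else if (!option.empty() && option.front() == '-')
            option.erase(0, 1);
        auto const eq = option.find('=');
        if (eq == std::string::npos)
            continue; // this sketch simply ignores options without a value
        parsed[option.substr(0, eq)] = option.substr(eq + 1);
    }
    return parsed;
}

int main()
{
    for (auto const& [name, value] : parseMakeOptions({"-D_GLIBCXX_USE_CXX11_ABI=1", "-DTORCH_CUDA=1"}))
        std::cout << name << " = " << value << "\n";
    return 0;
}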
1 change: 1 addition & 0 deletions cpp/include/tensorrt_llm/batch_manager/GptManager.h
@@ -86,6 +86,7 @@ class GptManager
std::shared_ptr<TrtGptModel> mTrtGptModel;
SizeType mMaxInputLen;
SizeType mMaxOutputLen;
SizeType mMaxKvCacheLen;
SizeType mMaxNumSequences;
std::optional<uint64_t> mTerminateReqId;

7 changes: 5 additions & 2 deletions cpp/include/tensorrt_llm/batch_manager/kvCacheConfig.h
@@ -29,14 +29,17 @@ class KvCacheConfig
public:
using SizeType = tensorrt_llm::runtime::SizeType;

explicit KvCacheConfig(
std::optional<SizeType> maxTokens = std::nullopt, std::optional<float> freeGpuMemoryFraction = std::nullopt)
explicit KvCacheConfig(std::optional<SizeType> maxTokens = std::nullopt,
std::optional<SizeType> maxKvCacheLength = std::nullopt,
std::optional<float> freeGpuMemoryFraction = std::nullopt)
: maxTokens{maxTokens}
, maxKvCacheLength{maxKvCacheLength}
, freeGpuMemoryFraction{freeGpuMemoryFraction}
{
}

std::optional<SizeType> maxTokens;
std::optional<SizeType> maxKvCacheLength;
std::optional<float> freeGpuMemoryFraction;

static constexpr auto kDefaultGpuMemFraction = 0.85f;
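A hedged usage sketch of the extended constructor — the include path is taken from this header, but the namespace is an assumption and the values are illustrative. Sequences whose KV cache would grow beyond maxKvCacheLength are presumably served by the cyclic KV cache described in kvCacheManager.h below:

#include <optional>

#include "tensorrt_llm/batch_manager/kvCacheConfig.h"

int main()
{
    using tensorrt_llm::batch_manager::kv_cache_manager::KvCacheConfig; // namespace assumed

    // Leave maxTokens unset, cap the per-sequence KV cache at 2048 tokens (illustrative value),
    // and let the manager claim 90% of the free GPU memory.
    KvCacheConfig config(std::nullopt, 2048, 0.9f);

    return config.maxKvCacheLength.value_or(0) == 2048 ? 0 : 1;
}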
5 changes: 4 additions & 1 deletion cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
@@ -217,7 +217,7 @@ class KVCacheManager

KVCacheManager(SizeType numLayers, SizeType numHeads, SizeType numKvHeads, SizeType hiddenSize,
SizeType tokensPerBlock, SizeType maxNumBlocks, SizeType maxBatchSize, SizeType maxBeamWidth,
SizeType maxBlocksPerSeq, nvinfer1::DataType dtype, CudaStreamPtr stream);
SizeType maxBlocksPerSeq, SizeType maxKvCacheLength, nvinfer1::DataType dtype, CudaStreamPtr stream);

void startScheduling();

@@ -330,6 +330,9 @@ class KVCacheManager
SizeType mMaxBeamWidth;
// Maximum number of blocks per sequence
SizeType mMaxBlocksPerSeq;
// Maximum kv cache length per sequence
// Enable cyclic kv cache when it exceeds
SizeType mMaxKvCacheLength;
// Pools
std::vector<runtime::ITensor::SharedPtr> mPools;
// Block manager
4 changes: 3 additions & 1 deletion cpp/include/tensorrt_llm/runtime/decodingInput.h
@@ -29,9 +29,10 @@ class DecodingInput
public:
using TensorPtr = std::shared_ptr<ITensor const>;

DecodingInput(SizeType maxLength, SizeType batchSize, TensorPtr logits, TensorPtr endIds)
DecodingInput(SizeType maxLength, SizeType maxKvCacheLength, SizeType batchSize, TensorPtr logits, TensorPtr endIds)
: step{maxLength}
, maxLength{maxLength}
, maxKvCacheLength{maxKvCacheLength}
, batchSize{batchSize}
, logits{std::move(logits)}
, endIds{std::move(endIds)}
@@ -43,6 +44,7 @@ class DecodingInput
// mandatory parameters
SizeType step;
SizeType maxLength;
SizeType maxKvCacheLength;
SizeType batchSize;
TensorPtr logits; // [batchSize, beamWidth, vocabSizePadded], on gpu
TensorPtr endIds; // [batchSize * beamWidth], on gpu
2 changes: 1 addition & 1 deletion cpp/include/tensorrt_llm/runtime/generationInput.h
@@ -54,7 +54,7 @@ class GenericGenerationInput
bool packed; // indicates if ids are packed or padded to maxInputLength

// optional parameters
TensorPtr embeddingBiasOpt; // [vocabSizePadded], on gpu
TensorPtr embeddingBias; // [vocabSizePadded], on gpu
TensorPtr badWordsList; // [2, badWordsLength] or [batchSize, 2, badWordsLength], on gpu
TensorPtr stopWordsList; // [batchSize, 2, stopWordsLength], on gpu
std::optional<SizeType> maxNewTokens; // max number of tokens to generate
5 changes: 3 additions & 2 deletions cpp/include/tensorrt_llm/runtime/gptDecoderBatch.h
@@ -44,8 +44,8 @@ class GptDecoderBatch
GptDecoderBatch(std::size_t vocabSize, std::size_t vocabSizePadded, CudaStreamPtr stream);

//! Setup the decoder before calling `forward()`
void setup(
SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxSequenceLength, nvinfer1::DataType dtype) override;
void setup(SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxKvCacheLength, SizeType maxSequenceLength,
nvinfer1::DataType dtype) override;

//! @brief Initialize the decoder at `batchIdx` with a new `request`.
void newRequest(
@@ -166,6 +166,7 @@ class GptDecoderBatch
std::vector<SizeType> mMaxNewTokens;
std::vector<SizeType> mBeamWidths;
SizeType mMaxSequenceLength{};
SizeType mMaxKvCacheLength{};
SizeType mActualBatchSize{};
};
} // namespace tensorrt_llm::runtime
7 changes: 4 additions & 3 deletions cpp/include/tensorrt_llm/runtime/gptSession.h
@@ -140,10 +140,10 @@ class GptSession

void createContexts(SizeType numBatchesCtx, SizeType numBatchesGen, bool useCudaGraphs);
void createBuffers(SizeType numMicroBatches);
void createDecoders(SizeType batchSize, SizeType beamWidth, SizeType maxSequenceLength,
void createDecoders(SizeType batchSize, SizeType beamWidth, SizeType maxKvCacheLength, SizeType maxSequenceLength,
nvinfer1::DataType logitsType, bool decoderPerRequest, SizeType numMicroBatches);
void createKvCacheManager(
SizeType batchSize, SizeType beamWidth, SizeType maxSequenceLength, KvCacheConfig const& config);
void createKvCacheManager(SizeType batchSize, SizeType beamWidth, SizeType maxKvCacheLength,
SizeType maxSequenceLength, KvCacheConfig const& config);
void createCustomAllReduceWorkspace(SizeType batchSize, SizeType beamWidth, SizeType maxSequenceLength);

void executeContextStep(std::vector<GenerationInput> const& microBatches,
@@ -258,6 +258,7 @@ class GptSession
std::vector<std::shared_ptr<IpcMemory>> mIpcMemoryHandles;

SizeType mDecoderMaxSequenceLength{};
SizeType mDecoderMaxKvCacheLength{};

LoggerPtr mLogger;
std::shared_ptr<TllmRuntime> mRuntime;
4 changes: 2 additions & 2 deletions cpp/include/tensorrt_llm/runtime/iStatefulGptDecoder.h
@@ -73,8 +73,8 @@ class IStatefulGptDecoder
using TensorPtr = std::shared_ptr<ITensor>;

//! Setup the decoder before calling `forward()`, also calls reshapeBuffers
virtual void setup(
SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxSequenceLength, nvinfer1::DataType dtype)
virtual void setup(SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxKvCacheLength,
SizeType maxSequenceLength, nvinfer1::DataType dtype)
= 0;

//! @brief Initialize the decoder with new batch of inputs.
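Call sites of setup() gain the extra maxKvCacheLength argument (GptDecoderBatch and GptSession above thread it through as well). A self-contained sketch of the new call shape — the class below is a stand-in so the example compiles without TensorRT-LLM, the values are illustrative, and passing maxKvCacheLength equal to maxSequenceLength should simply reproduce the previous non-cyclic behaviour:

#include <cstdint>
#include <iostream>

// Stand-in types so the sketch compiles without TensorRT-LLM.
using SizeType = std::int32_t;
enum class DataType { kHALF, kFLOAT };

struct MockDecoder
{
    void setup(SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxKvCacheLength,
        SizeType maxSequenceLength, DataType /*dtype*/)
    {
        std::cout << "batch=" << maxBatchSize << " beam=" << maxBeamWidth
                  << " kvCacheLen=" << maxKvCacheLength << " seqLen=" << maxSequenceLength << "\n";
    }
};

int main()
{
    MockDecoder decoder;
    // maxKvCacheLength is the argument added by this PR; here it just matches maxSequenceLength.
    decoder.setup(8, 1, 2048, 2048, DataType::kHALF);
    return 0;
}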
44 changes: 20 additions & 24 deletions cpp/include/tensorrt_llm/runtime/iTensor.h
@@ -25,6 +25,7 @@
#include <cstdint>
#include <functional>
#include <initializer_list>
#include <limits>
#include <memory>
#include <numeric>
#include <ostream>
@@ -48,6 +49,9 @@ class ITensor
using UniqueConstPtr = std::unique_ptr<ITensor const>;
using SharedConstPtr = std::shared_ptr<ITensor const>;
using Shape = nvinfer1::Dims;
using DimType = std::remove_reference_t<decltype(Shape::d[0])>;

~ITensor() override = default;

//!
//! \brief Returns the tensor dimensions.
@@ -59,7 +63,13 @@
//!
virtual void reshape(Shape const& dims) = 0;

~ITensor() override = default;
void resize(std::size_t newSize) override
{
if (newSize == getSize())
return;

reshape(makeShape({castSize(newSize)}));
}

//!
//! \brief Not allowed to copy.
@@ -101,18 +111,7 @@
//! \param dim The dimension that should be removed ("squeezed").
//! \return A new shape without the unit dimension.
//!
static Shape squeeze(Shape const& shape, SizeType dim)
{
TLLM_CHECK_WITH_INFO(shape.nbDims > 0, "Cannot squeeze 1-dimensional tensor");
TLLM_CHECK_WITH_INFO(
dim < shape.nbDims, common::fmtstr("Invalid index %d, tensor has %d dimensions", dim, shape.nbDims));
TLLM_CHECK_WITH_INFO(shape.d[dim] == 1, "Can only squeeze dimension of size 1");

Shape newDims{shape.nbDims - 1};
std::copy(shape.d, shape.d + dim, newDims.d);
std::copy(shape.d + dim + 1, shape.d + shape.nbDims, newDims.d + dim);
return newDims;
}
static Shape squeeze(Shape const& shape, SizeType dim);

//!
//! \brief Add a *unit* dimension to `shape` at the specified position.
@@ -121,17 +120,7 @@
//! \param dim The dimension where unit dimension should be added.
//! \return A new shape with the added unit dimension.
//!
static Shape unsqueeze(Shape const& shape, SizeType dim)
{
TLLM_CHECK_WITH_INFO(dim <= shape.nbDims && dim >= 0,
common::fmtstr("Invalid dim %d, tensor has %d dimensions", dim, shape.nbDims));

Shape newDims{shape.nbDims + 1};
std::copy(shape.d, shape.d + dim, newDims.d);
newDims.d[dim] = 1;
std::copy(shape.d + dim, shape.d + shape.nbDims, newDims.d + dim + 1);
return newDims;
}
static Shape unsqueeze(Shape const& shape, SizeType dim);

//!
//! \brief Removes the given *unit* dimensions from this tensor.
@@ -251,6 +240,13 @@

protected:
ITensor() = default;

static DimType castSize(size_t newSize)
{
TLLM_CHECK_WITH_INFO(
newSize <= std::numeric_limits<DimType>::max(), "New size is too large. Use reshape() instead.");
return static_cast<DimType>(newSize);
}
};

//! \brief Utility function to print a shape.
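The inline bodies of squeeze() and unsqueeze() move out of this header into the implementation file, and the header gains a default resize() that reshapes to a one-dimensional tensor guarded by castSize(). The shape semantics are unchanged; a standalone sketch of them on a plain std::vector shape (illustrative only — the real functions operate on nvinfer1::Dims and validate with TLLM_CHECK_WITH_INFO):

#include <cassert>
#include <cstdint>
#include <vector>

using Shape = std::vector<std::int64_t>;

// Remove a unit dimension at position dim, e.g. {1, 4, 8} -> {4, 8}.
Shape squeeze(Shape shape, std::size_t dim)
{
    assert(!shape.empty() && dim < shape.size() && shape[dim] == 1);
    shape.erase(shape.begin() + dim);
    return shape;
}

// Insert a unit dimension at position dim, e.g. {4, 8} -> {1, 4, 8}.
Shape unsqueeze(Shape shape, std::size_t dim)
{
    assert(dim <= shape.size());
    shape.insert(shape.begin() + dim, 1);
    return shape;
}

int main()
{
    Shape s{4, 8};
    s = unsqueeze(s, 0); // {1, 4, 8}
    s = squeeze(s, 0);   // back to {4, 8}
    assert((s == Shape{4, 8}));
    return 0;
}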
2 changes: 1 addition & 1 deletion cpp/include/tensorrt_llm/runtime/promptTuningParams.h
@@ -71,7 +71,7 @@ class PromptTuningParams : public GenericPromptTuningParams<ITensor::SharedPtr>
// Function assumes that the first numContextRequests requests in the batch are context requests
void fillTasksTensor(TensorPtr tasksHost, const SizeType batchSize, const SizeType numContextRequests,
const std::vector<SizeType>& reqBeamWidths, const std::vector<SizeType>& reqPromptLengths,
BufferManager& manager, bool packedInput);
BufferManager const& manager, bool packedInput);
};

} // namespace tensorrt_llm::runtime
8 changes: 0 additions & 8 deletions cpp/tensorrt_llm/CMakeLists.txt
@@ -84,14 +84,6 @@ if(BUILD_BATCH_MANAGER)
else()
add_library(${BATCH_MANAGER_TARGET} STATIC IMPORTED)
if(NOT WIN32) # Linux
execute_process(
COMMAND ${Python3_EXECUTABLE} "-c"
"import torch; print(torch.compiled_with_cxx11_abi(),end='');"
RESULT_VARIABLE _PYTHON_SUCCESS
OUTPUT_VARIABLE USE_CXX11_ABI)

message(STATUS "USE_CXX11_ABI: ${USE_CXX11_ABI}")

if(USE_CXX11_ABI)
set(BATCH_MANAGER_LIB_LOC
"${CMAKE_CURRENT_SOURCE_DIR}/batch_manager/${BATCH_MANAGER_TARGET_ARCH}/libtensorrt_llm_batch_manager_static.a"
Git LFS file not shown
Git LFS file not shown
6 changes: 3 additions & 3 deletions cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt
@@ -1,3 +1,3 @@
e1dc326c0c45864b9e7963b4d92d322f libtensorrt_llm_batch_manager_static.a
d2e9d76efe6b4173270aa6b494dfe59c libtensorrt_llm_batch_manager_static.pre_cxx11.a
07363ea7a6fdd6eeedc1670dedeeaedff7f9a848 commit
0776a4d41c06192c4ca0409ad8b837de libtensorrt_llm_batch_manager_static.a
c901725d5d278fd8d41f524f81fe5170 libtensorrt_llm_batch_manager_static.pre_cxx11.a
b3330c65d9b23d4f20c2b8d5a7c24cd45c910cd4 commit
Git LFS file not shown
Git LFS file not shown