Update TensorRT-LLM (#349)
* Update TensorRT-LLM

---------

Co-authored-by: Shixiaowei02 <39303645+Shixiaowei02@users.noreply.github.com>
kaiyux and Shixiaowei02 authored Nov 10, 2023
1 parent cd6bbab commit b2fd493
Showing 153 changed files with 3,860 additions and 1,807 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -8,6 +8,7 @@ TensorRT-LLM
[![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/)
[![cuda](https://img.shields.io/badge/cuda-12.2-green)](https://developer.nvidia.com/cuda-downloads)
[![trt](https://img.shields.io/badge/TRT-9.1-green)](https://developer.nvidia.com/tensorrt)
[![version](https://img.shields.io/badge/release-0.5.0-green)](./setup.py)
[![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)

[Architecture](./docs/source/architecture.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Results](./docs/source/performance.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Examples](./examples/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Documentation](./docs/source/)
3 changes: 2 additions & 1 deletion benchmarks/python/mem_monitor.py
@@ -18,7 +18,8 @@


def get_memory_info(handle):
- mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
+ mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle,
+                                           version=pynvml.nvmlMemory_v2)
total = round(mem_info.total / 1024 / 1024 / 1024, 2)
used = round(mem_info.used / 1024 / 1024 / 1024, 2)
free = round(mem_info.free / 1024 / 1024 / 1024, 2)
47 changes: 38 additions & 9 deletions cpp/CMakeLists.txt
@@ -22,6 +22,7 @@ include(CheckLanguage)
include(cmake/modules/set_ifndef.cmake)
include(cmake/modules/find_library_create_target.cmake)
include(cmake/modules/resolve_dirs.cmake)
include(cmake/modules/parse_make_options.cmake)

project(tensorrt_llm LANGUAGES CXX)

@@ -246,6 +247,22 @@ endif()
set(COMMON_HEADER_DIRS ${PROJECT_SOURCE_DIR} ${CUDAToolkit_INCLUDE_DIR})
message(STATUS "COMMON_HEADER_DIRS: ${COMMON_HEADER_DIRS}")

if(NOT WIN32 AND NOT DEFINED USE_CXX11_ABI)
find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
execute_process(
COMMAND ${Python3_EXECUTABLE} "-c"
"import torch; print(torch.compiled_with_cxx11_abi(),end='');"
RESULT_VARIABLE _PYTHON_SUCCESS
OUTPUT_VARIABLE USE_CXX11_ABI)
# Convert the bool variable to integer.
if(USE_CXX11_ABI)
set(USE_CXX11_ABI 1)
else()
set(USE_CXX11_ABI 0)
endif()
message(STATUS "USE_CXX11_ABI is set by python Torch to ${USE_CXX11_ABI}")
endif()

if(BUILD_PYT)
# Build TORCH_CUDA_ARCH_LIST
set(TORCH_CUDA_ARCH_LIST "")
@@ -304,27 +321,39 @@ print(os.path.dirname(torch.__file__),end='');"
message(STATUS "TORCH_CXX_FLAGS: ${TORCH_CXX_FLAGS}")
add_compile_options(${TORCH_CXX_FLAGS})
add_compile_definitions(TORCH_CUDA=1)

if(DEFINED USE_CXX11_ABI)
parse_make_options(${TORCH_CXX_FLAGS} "TORCH_CXX_FLAGS")
if(DEFINED TORCH_CXX_FLAGS__GLIBCXX_USE_CXX11_ABI
AND NOT ${TORCH_CXX_FLAGS__GLIBCXX_USE_CXX11_ABI} EQUAL ${USE_CXX11_ABI})
message(
WARNING
"The libtorch compilation options _GLIBCXX_USE_CXX11_ABI=${TORCH_CXX_FLAGS__GLIBCXX_USE_CXX11_ABI} "
"found by CMake conflict with the project setting USE_CXX11_ABI=${USE_CXX11_ABI}, and the project "
"setting will be discarded.")
endif()
endif()

elseif(NOT WIN32)
if(NOT USE_CXX11_ABI)
add_compile_options("-D_GLIBCXX_USE_CXX11_ABI=0")
endif()
message(STATUS "Build without PyTorch, USE_CXX11_ABI=${USE_CXX11_ABI}")
endif()

file(STRINGS "${TRT_INCLUDE_DIR}/NvInferVersion.h" VERSION_STRINGS
REGEX "#define NV_TENSORRT_.*")
- foreach(TYPE MAJOR MINOR PATCH BUILD)
- string(REGEX MATCH "NV_TENSORRT_${TYPE} [0-9]" TRT_TYPE_STRING
- ${VERSION_STRINGS})
- string(REGEX MATCH "[0-9]" TRT_${TYPE} ${TRT_TYPE_STRING})
- endforeach(TYPE)
-
foreach(TYPE MAJOR MINOR PATCH)
- string(REGEX MATCH "NV_TENSORRT_SONAME_${TYPE} [0-9]" TRT_TYPE_STRING
+ string(REGEX MATCH "NV_TENSORRT_${TYPE} [0-9]+" TRT_TYPE_STRING
${VERSION_STRINGS})
- string(REGEX MATCH "[0-9]" TRT_SO_${TYPE} ${TRT_TYPE_STRING})
+ string(REGEX MATCH "[0-9]+" TRT_${TYPE} ${TRT_TYPE_STRING})
endforeach(TYPE)

set(TRT_VERSION
"${TRT_MAJOR}.${TRT_MINOR}.${TRT_PATCH}"
CACHE STRING "TensorRT project version")
set(TRT_SOVERSION
- "${TRT_SO_MAJOR}"
+ "${TRT_MAJOR}"
CACHE STRING "TensorRT library so version")
message(
STATUS
28 changes: 28 additions & 0 deletions cpp/cmake/modules/parse_make_options.cmake
@@ -0,0 +1,28 @@
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION &
# AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
#

function(parse_make_options options result)
foreach(option ${options})
string(REGEX REPLACE "(-D|-)" "" option ${option})
string(REPLACE "=" ";" option ${option})
list(GET option 0 option_name)
list(GET option 1 option_value)
set(${result}_${option_name}
${option_value}
PARENT_SCOPE)
endforeach()
endfunction()
1 change: 1 addition & 0 deletions cpp/include/tensorrt_llm/batch_manager/GptManager.h
@@ -86,6 +86,7 @@ class GptManager
std::shared_ptr<TrtGptModel> mTrtGptModel;
SizeType mMaxInputLen;
SizeType mMaxOutputLen;
SizeType mMaxKvCacheLen;
SizeType mMaxNumSequences;
std::optional<uint64_t> mTerminateReqId;

7 changes: 5 additions & 2 deletions cpp/include/tensorrt_llm/batch_manager/kvCacheConfig.h
@@ -29,14 +29,17 @@ class KvCacheConfig
public:
using SizeType = tensorrt_llm::runtime::SizeType;

- explicit KvCacheConfig(
- std::optional<SizeType> maxTokens = std::nullopt, std::optional<float> freeGpuMemoryFraction = std::nullopt)
+ explicit KvCacheConfig(std::optional<SizeType> maxTokens = std::nullopt,
+ std::optional<SizeType> maxKvCacheLength = std::nullopt,
+ std::optional<float> freeGpuMemoryFraction = std::nullopt)
: maxTokens{maxTokens}
, maxKvCacheLength{maxKvCacheLength}
, freeGpuMemoryFraction{freeGpuMemoryFraction}
{
}

std::optional<SizeType> maxTokens;
std::optional<SizeType> maxKvCacheLength;
std::optional<float> freeGpuMemoryFraction;

static constexpr auto kDefaultGpuMemFraction = 0.85f;
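A minimal sketch of constructing the extended config; the namespace in the using-declaration and every numeric value are assumptions for illustration, not taken from this commit:

#include "tensorrt_llm/batch_manager/kvCacheConfig.h"

#include <optional>

// Assumed namespace; adjust to wherever KvCacheConfig actually lives.
using tensorrt_llm::batch_manager::kv_cache_manager::KvCacheConfig;

KvCacheConfig makeKvCacheConfig()
{
    // Leave maxTokens unset, cap each sequence at 4096 cached positions (the
    // point past which the cyclic KV cache would take over), and use 90% of
    // free GPU memory instead of the 0.85f default.
    return KvCacheConfig(std::nullopt, /*maxKvCacheLength=*/4096,
        /*freeGpuMemoryFraction=*/0.9f);
}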
5 changes: 4 additions & 1 deletion cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
@@ -217,7 +217,7 @@ class KVCacheManager

KVCacheManager(SizeType numLayers, SizeType numHeads, SizeType numKvHeads, SizeType hiddenSize,
SizeType tokensPerBlock, SizeType maxNumBlocks, SizeType maxBatchSize, SizeType maxBeamWidth,
- SizeType maxBlocksPerSeq, nvinfer1::DataType dtype, CudaStreamPtr stream);
+ SizeType maxBlocksPerSeq, SizeType maxKvCacheLength, nvinfer1::DataType dtype, CudaStreamPtr stream);

void startScheduling();

@@ -330,6 +330,9 @@ class KVCacheManager
SizeType mMaxBeamWidth;
// Maximum number of blocks per sequence
SizeType mMaxBlocksPerSeq;
// Maximum kv cache length per sequence
// Enable cyclic kv cache when it exceeds
SizeType mMaxKvCacheLength;
// Pools
std::vector<runtime::ITensor::SharedPtr> mPools;
// Block manager
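For the manager itself, the new maxKvCacheLength argument sits between maxBlocksPerSeq and the dtype/stream pair. A hypothetical construction sketch: the namespace alias, the CudaStreamPtr alias, and all sizes are assumptions; only the argument order comes from the declaration above.

#include "tensorrt_llm/batch_manager/kvCacheManager.h"

#include <memory>
#include <utility>

// Assumed namespace; CudaStreamPtr is assumed to be the alias used by the ctor.
namespace kvm = tensorrt_llm::batch_manager::kv_cache_manager;

auto makeKvCacheManager(kvm::KVCacheManager::CudaStreamPtr stream)
{
    return std::make_shared<kvm::KVCacheManager>(
        /*numLayers=*/32, /*numHeads=*/32, /*numKvHeads=*/32, /*hiddenSize=*/4096,
        /*tokensPerBlock=*/64, /*maxNumBlocks=*/2048, /*maxBatchSize=*/8,
        /*maxBeamWidth=*/1, /*maxBlocksPerSeq=*/128, /*maxKvCacheLength=*/4096,
        nvinfer1::DataType::kHALF, std::move(stream));
}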
4 changes: 3 additions & 1 deletion cpp/include/tensorrt_llm/runtime/decodingInput.h
@@ -29,9 +29,10 @@ class DecodingInput
public:
using TensorPtr = std::shared_ptr<ITensor const>;

- DecodingInput(SizeType maxLength, SizeType batchSize, TensorPtr logits, TensorPtr endIds)
+ DecodingInput(SizeType maxLength, SizeType maxKvCacheLength, SizeType batchSize, TensorPtr logits, TensorPtr endIds)
: step{maxLength}
, maxLength{maxLength}
, maxKvCacheLength{maxKvCacheLength}
, batchSize{batchSize}
, logits{std::move(logits)}
, endIds{std::move(endIds)}
@@ -43,6 +44,7 @@ class DecodingInput
// mandatory parameters
SizeType step;
SizeType maxLength;
SizeType maxKvCacheLength;
SizeType batchSize;
TensorPtr logits; // [batchSize, beamWidth, vocabSizePadded], on gpu
TensorPtr endIds; // [batchSize * beamWidth], on gpu
2 changes: 1 addition & 1 deletion cpp/include/tensorrt_llm/runtime/generationInput.h
@@ -54,7 +54,7 @@ class GenericGenerationInput
bool packed; // indicates if ids are packed or padded to maxInputLength

// optional parameters
- TensorPtr embeddingBiasOpt; // [vocabSizePadded], on gpu
+ TensorPtr embeddingBias; // [vocabSizePadded], on gpu
TensorPtr badWordsList; // [2, badWordsLength] or [batchSize, 2, badWordsLength], on gpu
TensorPtr stopWordsList; // [batchSize, 2, stopWordsLength], on gpu
std::optional<SizeType> maxNewTokens; // max number of tokens to generate
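For callers, the rename from embeddingBiasOpt to embeddingBias is mechanical. A hedged sketch of an adjusted call site, assuming GenerationInput lives in the runtime namespace alongside ITensor:

#include "tensorrt_llm/runtime/generationInput.h"

#include <utility>

using tensorrt_llm::runtime::GenerationInput;
using tensorrt_llm::runtime::ITensor;

// How biasTensor gets produced is left abstract; the point is only the renamed field.
void attachEmbeddingBias(GenerationInput& input, ITensor::SharedPtr biasTensor)
{
    input.embeddingBias = std::move(biasTensor); // previously: input.embeddingBiasOpt
}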
5 changes: 3 additions & 2 deletions cpp/include/tensorrt_llm/runtime/gptDecoderBatch.h
@@ -44,8 +44,8 @@ class GptDecoderBatch
GptDecoderBatch(std::size_t vocabSize, std::size_t vocabSizePadded, CudaStreamPtr stream);

//! Setup the decoder before calling `forward()`
- void setup(
- SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxSequenceLength, nvinfer1::DataType dtype) override;
+ void setup(SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxKvCacheLength, SizeType maxSequenceLength,
+ nvinfer1::DataType dtype) override;

//! @brief Initialize the decoder at `batchIdx` with a new `request`.
void newRequest(
@@ -166,6 +166,7 @@ class GptDecoderBatch
std::vector<SizeType> mMaxNewTokens;
std::vector<SizeType> mBeamWidths;
SizeType mMaxSequenceLength{};
SizeType mMaxKvCacheLength{};
SizeType mActualBatchSize{};
};
} // namespace tensorrt_llm::runtime
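The decoder setup now takes the KV-cache length right after the beam width. A sketch of the updated call, assuming a GptDecoderBatch instance is already constructed; the sizes are illustrative only:

#include "tensorrt_llm/runtime/gptDecoderBatch.h"

using tensorrt_llm::runtime::GptDecoderBatch;

void setupDecoder(GptDecoderBatch& decoder)
{
    // Argument order follows the new signature; the values are made up.
    decoder.setup(/*maxBatchSize=*/8, /*maxBeamWidth=*/4, /*maxKvCacheLength=*/4096,
        /*maxSequenceLength=*/8192, nvinfer1::DataType::kFLOAT);
}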
7 changes: 4 additions & 3 deletions cpp/include/tensorrt_llm/runtime/gptSession.h
@@ -140,10 +140,10 @@ class GptSession

void createContexts(SizeType numBatchesCtx, SizeType numBatchesGen, bool useCudaGraphs);
void createBuffers(SizeType numMicroBatches);
- void createDecoders(SizeType batchSize, SizeType beamWidth, SizeType maxSequenceLength,
+ void createDecoders(SizeType batchSize, SizeType beamWidth, SizeType maxKvCacheLength, SizeType maxSequenceLength,
nvinfer1::DataType logitsType, bool decoderPerRequest, SizeType numMicroBatches);
- void createKvCacheManager(
- SizeType batchSize, SizeType beamWidth, SizeType maxSequenceLength, KvCacheConfig const& config);
+ void createKvCacheManager(SizeType batchSize, SizeType beamWidth, SizeType maxKvCacheLength,
+ SizeType maxSequenceLength, KvCacheConfig const& config);
void createCustomAllReduceWorkspace(SizeType batchSize, SizeType beamWidth, SizeType maxSequenceLength);

void executeContextStep(std::vector<GenerationInput> const& microBatches,
@@ -258,6 +258,7 @@ class GptSession
std::vector<std::shared_ptr<IpcMemory>> mIpcMemoryHandles;

SizeType mDecoderMaxSequenceLength{};
SizeType mDecoderMaxKvCacheLength{};

LoggerPtr mLogger;
std::shared_ptr<TllmRuntime> mRuntime;
4 changes: 2 additions & 2 deletions cpp/include/tensorrt_llm/runtime/iStatefulGptDecoder.h
@@ -73,8 +73,8 @@ class IStatefulGptDecoder
using TensorPtr = std::shared_ptr<ITensor>;

//! Setup the decoder before calling `forward()`, also calls reshapeBuffers
- virtual void setup(
- SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxSequenceLength, nvinfer1::DataType dtype)
+ virtual void setup(SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxKvCacheLength,
+ SizeType maxSequenceLength, nvinfer1::DataType dtype)
= 0;

//! @brief Initialize the decoder with new batch of inputs.
44 changes: 20 additions & 24 deletions cpp/include/tensorrt_llm/runtime/iTensor.h
@@ -25,6 +25,7 @@
#include <cstdint>
#include <functional>
#include <initializer_list>
#include <limits>
#include <memory>
#include <numeric>
#include <ostream>
@@ -48,6 +49,9 @@ class ITensor : virtual public IBuffer
using UniqueConstPtr = std::unique_ptr<ITensor const>;
using SharedConstPtr = std::shared_ptr<ITensor const>;
using Shape = nvinfer1::Dims;
using DimType = std::remove_reference_t<decltype(Shape::d[0])>;

~ITensor() override = default;

//!
//! \brief Returns the tensor dimensions.
@@ -59,7 +63,13 @@
//!
virtual void reshape(Shape const& dims) = 0;

- ~ITensor() override = default;
+ void resize(std::size_t newSize) override
+ {
+ if (newSize == getSize())
+ return;
+
+ reshape(makeShape({castSize(newSize)}));
+ }

//!
//! \brief Not allowed to copy.
@@ -101,18 +111,7 @@
//! \param dim The dimension that should be removed ("squeezed").
//! \return A new shape without the unit dimension.
//!
- static Shape squeeze(Shape const& shape, SizeType dim)
- {
- TLLM_CHECK_WITH_INFO(shape.nbDims > 0, "Cannot squeeze 1-dimensional tensor");
- TLLM_CHECK_WITH_INFO(
- dim < shape.nbDims, common::fmtstr("Invalid index %d, tensor has %d dimensions", dim, shape.nbDims));
- TLLM_CHECK_WITH_INFO(shape.d[dim] == 1, "Can only squeeze dimension of size 1");
-
- Shape newDims{shape.nbDims - 1};
- std::copy(shape.d, shape.d + dim, newDims.d);
- std::copy(shape.d + dim + 1, shape.d + shape.nbDims, newDims.d + dim);
- return newDims;
- }
+ static Shape squeeze(Shape const& shape, SizeType dim);

//!
//! \brief Add a *unit* dimension to `shape` at the specified position.
@@ -121,17 +120,7 @@
//! \param dim The dimension where unit dimension should be added.
//! \return A new shape with the added unit dimension.
//!
- static Shape unsqueeze(Shape const& shape, SizeType dim)
- {
- TLLM_CHECK_WITH_INFO(dim <= shape.nbDims && dim >= 0,
- common::fmtstr("Invalid dim %d, tensor has %d dimensions", dim, shape.nbDims));
-
- Shape newDims{shape.nbDims + 1};
- std::copy(shape.d, shape.d + dim, newDims.d);
- newDims.d[dim] = 1;
- std::copy(shape.d + dim, shape.d + shape.nbDims, newDims.d + dim + 1);
- return newDims;
- }
+ static Shape unsqueeze(Shape const& shape, SizeType dim);

//!
//! \brief Removes the given *unit* dimensions from this tensor.
@@ -251,6 +240,13 @@

protected:
ITensor() = default;

static DimType castSize(size_t newSize)
{
TLLM_CHECK_WITH_INFO(
newSize <= std::numeric_limits<DimType>::max(), "New size is too large. Use reshape() instead.");
return static_cast<DimType>(newSize);
}
};

//! \brief Utility function to print a shape.
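The newly added default resize() routes through each tensor's reshape(), flattening to a 1-D shape when the element count changes, while castSize() rejects counts that do not fit in a single Shape dimension. A small usage sketch; someTensor stands in for any existing tensor and no allocation API from this diff is implied:

#include "tensorrt_llm/runtime/iTensor.h"

using tensorrt_llm::runtime::ITensor;

void demonstrateResize(ITensor& someTensor)
{
    someTensor.resize(someTensor.getSize()); // same element count: early-out, shape untouched
    someTensor.resize(2048);                 // different count: reshape to the flat shape {2048}
    // A count that does not fit in DimType trips the TLLM_CHECK in castSize()
    // instead of silently overflowing.
}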
2 changes: 1 addition & 1 deletion cpp/include/tensorrt_llm/runtime/promptTuningParams.h
@@ -71,7 +71,7 @@ class PromptTuningParams : public GenericPromptTuningParams<ITensor::SharedPtr>
// Function assumes that the first numContextRequests requests in the batch are context requests
void fillTasksTensor(TensorPtr tasksHost, const SizeType batchSize, const SizeType numContextRequests,
const std::vector<SizeType>& reqBeamWidths, const std::vector<SizeType>& reqPromptLengths,
- BufferManager& manager, bool packedInput);
+ BufferManager const& manager, bool packedInput);
};

} // namespace tensorrt_llm::runtime
8 changes: 0 additions & 8 deletions cpp/tensorrt_llm/CMakeLists.txt
@@ -84,14 +84,6 @@ if(BUILD_BATCH_MANAGER)
else()
add_library(${BATCH_MANAGER_TARGET} STATIC IMPORTED)
if(NOT WIN32) # Linux
- execute_process(
- COMMAND ${Python3_EXECUTABLE} "-c"
- "import torch; print(torch.compiled_with_cxx11_abi(),end='');"
- RESULT_VARIABLE _PYTHON_SUCCESS
- OUTPUT_VARIABLE USE_CXX11_ABI)
-
- message(STATUS "USE_CXX11_ABI: ${USE_CXX11_ABI}")
-
if(USE_CXX11_ABI)
set(BATCH_MANAGER_LIB_LOC
"${CMAKE_CURRENT_SOURCE_DIR}/batch_manager/${BATCH_MANAGER_TARGET_ARCH}/libtensorrt_llm_batch_manager_static.a"
Git LFS file not shown
Git LFS file not shown
6 changes: 3 additions & 3 deletions cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt
@@ -1,3 +1,3 @@
- e1dc326c0c45864b9e7963b4d92d322f libtensorrt_llm_batch_manager_static.a
- d2e9d76efe6b4173270aa6b494dfe59c libtensorrt_llm_batch_manager_static.pre_cxx11.a
- 07363ea7a6fdd6eeedc1670dedeeaedff7f9a848 commit
+ 0776a4d41c06192c4ca0409ad8b837de libtensorrt_llm_batch_manager_static.a
+ c901725d5d278fd8d41f524f81fe5170 libtensorrt_llm_batch_manager_static.pre_cxx11.a
+ b3330c65d9b23d4f20c2b8d5a7c24cd45c910cd4 commit
Git LFS file not shown
Git LFS file not shown
