
Update TensorRT-LLM #349

Merged 2 commits on Nov 10, 2023.
1 change: 1 addition & 0 deletions README.md
@@ -8,6 +8,7 @@ TensorRT-LLM
[![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/)
[![cuda](https://img.shields.io/badge/cuda-12.2-green)](https://developer.nvidia.com/cuda-downloads)
[![trt](https://img.shields.io/badge/TRT-9.1-green)](https://developer.nvidia.com/tensorrt)
[![version](https://img.shields.io/badge/release-0.5.0-green)](./setup.py)
[![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)

[Architecture](./docs/source/architecture.md)   |   [Results](./docs/source/performance.md)   |   [Examples](./examples/)   |   [Documentation](./docs/source/)
3 changes: 2 additions & 1 deletion benchmarks/python/mem_monitor.py
@@ -18,7 +18,8 @@


def get_memory_info(handle):
mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle,
version=pynvml.nvmlMemory_v2)
total = round(mem_info.total / 1024 / 1024 / 1024, 2)
used = round(mem_info.used / 1024 / 1024 / 1024, 2)
free = round(mem_info.free / 1024 / 1024 / 1024, 2)
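The change above passes version=pynvml.nvmlMemory_v2 so NVML returns its v2 memory struct, which additionally reports driver-reserved memory. For reference, a minimal standalone sketch of the same query through NVML's C API — this is not part of the PR and assumes an NVML installation recent enough to provide nvmlDeviceGetMemoryInfo_v2 (link with -lnvidia-ml):

#include <cstdio>
#include <nvml.h>

int main()
{
    if (nvmlInit_v2() != NVML_SUCCESS)
        return 1;

    nvmlDevice_t device;
    if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS)
    {
        nvmlMemory_v2_t mem{};
        mem.version = nvmlMemory_v2; // request the v2 layout, analogous to pynvml's version= argument
        if (nvmlDeviceGetMemoryInfo_v2(device, &mem) == NVML_SUCCESS)
        {
            constexpr double GiB = 1024.0 * 1024.0 * 1024.0;
            std::printf("total %.2f GiB, used %.2f GiB, free %.2f GiB, reserved %.2f GiB\n",
                mem.total / GiB, mem.used / GiB, mem.free / GiB, mem.reserved / GiB);
        }
    }
    nvmlShutdown();
    return 0;
}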
47 changes: 38 additions & 9 deletions cpp/CMakeLists.txt
@@ -22,6 +22,7 @@ include(CheckLanguage)
include(cmake/modules/set_ifndef.cmake)
include(cmake/modules/find_library_create_target.cmake)
include(cmake/modules/resolve_dirs.cmake)
include(cmake/modules/parse_make_options.cmake)

project(tensorrt_llm LANGUAGES CXX)

@@ -246,6 +247,22 @@ endif()
set(COMMON_HEADER_DIRS ${PROJECT_SOURCE_DIR} ${CUDAToolkit_INCLUDE_DIR})
message(STATUS "COMMON_HEADER_DIRS: ${COMMON_HEADER_DIRS}")

if(NOT WIN32 AND NOT DEFINED USE_CXX11_ABI)
find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
execute_process(
COMMAND ${Python3_EXECUTABLE} "-c"
"import torch; print(torch.compiled_with_cxx11_abi(),end='');"
RESULT_VARIABLE _PYTHON_SUCCESS
OUTPUT_VARIABLE USE_CXX11_ABI)
# Convert the bool variable to integer.
if(USE_CXX11_ABI)
set(USE_CXX11_ABI 1)
else()
set(USE_CXX11_ABI 0)
endif()
message(STATUS "USE_CXX11_ABI is set by python Torch to ${USE_CXX11_ABI}")
endif()

if(BUILD_PYT)
# Build TORCH_CUDA_ARCH_LIST
set(TORCH_CUDA_ARCH_LIST "")
@@ -304,27 +321,39 @@ print(os.path.dirname(torch.__file__),end='');"
message(STATUS "TORCH_CXX_FLAGS: ${TORCH_CXX_FLAGS}")
add_compile_options(${TORCH_CXX_FLAGS})
add_compile_definitions(TORCH_CUDA=1)

if(DEFINED USE_CXX11_ABI)
parse_make_options(${TORCH_CXX_FLAGS} "TORCH_CXX_FLAGS")
if(DEFINED TORCH_CXX_FLAGS__GLIBCXX_USE_CXX11_ABI
AND NOT ${TORCH_CXX_FLAGS__GLIBCXX_USE_CXX11_ABI} EQUAL ${USE_CXX11_ABI})
message(
WARNING
"The libtorch compilation options _GLIBCXX_USE_CXX11_ABI=${TORCH_CXX_FLAGS__GLIBCXX_USE_CXX11_ABI} "
"found by CMake conflict with the project setting USE_CXX11_ABI=${USE_CXX11_ABI}, and the project "
"setting will be discarded.")
endif()
endif()

elseif(NOT WIN32)
if(NOT USE_CXX11_ABI)
add_compile_options("-D_GLIBCXX_USE_CXX11_ABI=0")
endif()
message(STATUS "Build without PyTorch, USE_CXX11_ABI=${USE_CXX11_ABI}")
endif()

file(STRINGS "${TRT_INCLUDE_DIR}/NvInferVersion.h" VERSION_STRINGS
REGEX "#define NV_TENSORRT_.*")
foreach(TYPE MAJOR MINOR PATCH BUILD)
string(REGEX MATCH "NV_TENSORRT_${TYPE} [0-9]" TRT_TYPE_STRING
${VERSION_STRINGS})
string(REGEX MATCH "[0-9]" TRT_${TYPE} ${TRT_TYPE_STRING})
endforeach(TYPE)

foreach(TYPE MAJOR MINOR PATCH)
string(REGEX MATCH "NV_TENSORRT_SONAME_${TYPE} [0-9]" TRT_TYPE_STRING
string(REGEX MATCH "NV_TENSORRT_${TYPE} [0-9]+" TRT_TYPE_STRING
${VERSION_STRINGS})
string(REGEX MATCH "[0-9]" TRT_SO_${TYPE} ${TRT_TYPE_STRING})
string(REGEX MATCH "[0-9]+" TRT_${TYPE} ${TRT_TYPE_STRING})
endforeach(TYPE)

set(TRT_VERSION
"${TRT_MAJOR}.${TRT_MINOR}.${TRT_PATCH}"
CACHE STRING "TensorRT project version")
set(TRT_SOVERSION
"${TRT_SO_MAJOR}"
"${TRT_MAJOR}"
CACHE STRING "TensorRT library so version")
message(
STATUS
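The new top-level block asks the installed torch package whether it was built with the C++11 ABI and keeps USE_CXX11_ABI consistent with libtorch's TORCH_CXX_FLAGS. The flag matters because libstdc++ mangles std::string (and other types such as std::list) differently depending on _GLIBCXX_USE_CXX11_ABI, so a mismatch between TensorRT-LLM and libtorch shows up as unresolved symbols at link time. A small standalone probe, illustrative only and not part of this PR, that prints which ABI a given compiler invocation uses:

#include <iostream>

int main()
{
#ifdef _GLIBCXX_USE_CXX11_ABI
    // 1 selects the new (C++11) std::string ABI, 0 the old copy-on-write one.
    std::cout << "_GLIBCXX_USE_CXX11_ABI=" << _GLIBCXX_USE_CXX11_ABI << "\n";
#else
    std::cout << "_GLIBCXX_USE_CXX11_ABI is not defined (non-libstdc++ standard library)\n";
#endif
    return 0;
}

Compiling the probe once with -D_GLIBCXX_USE_CXX11_ABI=0 and once without it reproduces the two settings that the warning added above guards against.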
28 changes: 28 additions & 0 deletions cpp/cmake/modules/parse_make_options.cmake
@@ -0,0 +1,28 @@
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION &
# AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
#

function(parse_make_options options result)
foreach(option ${options})
string(REGEX REPLACE "(-D|-)" "" option ${option})
string(REPLACE "=" ";" option ${option})
list(GET option 0 option_name)
list(GET option 1 option_value)
set(${result}_${option_name}
${option_value}
PARENT_SCOPE)
endforeach()
endfunction()
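parse_make_options() turns a list of "-DNAME=VALUE" style flags into <result>_NAME variables in the parent scope; cpp/CMakeLists.txt above uses it to read _GLIBCXX_USE_CXX11_ABI out of TORCH_CXX_FLAGS. A rough C++ analogue of the string handling, for illustration only (it strips just a leading -D/- and skips options without an '=', which the CMake version does not):

#include <iostream>
#include <map>
#include <string>
#include <vector>

// Collect "-DNAME=VALUE" options into NAME -> VALUE pairs.
std::map<std::string, std::string> parseMakeOptions(std::vector<std::string> const& options)
{
    std::map<std::string, std::string> parsed;
    for (auto option : options)
    {
        if (option.rfind("-D", 0) == 0)
            option.erase(0, 2);
        else if (!option.empty() && option.front() == '-')
            option.erase(0, 1);
        auto const eq = option.find('=');
        if (eq == std::string::npos)
            continue; // this sketch simply ignores options without a value
        parsed[option.substr(0, eq)] = option.substr(eq + 1);
    }
    return parsed;
}

int main()
{
    for (auto const& [name, value] : parseMakeOptions({"-D_GLIBCXX_USE_CXX11_ABI=1", "-DTORCH_CUDA=1"}))
        std::cout << name << " = " << value << "\n";
    return 0;
}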
1 change: 1 addition & 0 deletions cpp/include/tensorrt_llm/batch_manager/GptManager.h
@@ -86,6 +86,7 @@ class GptManager
std::shared_ptr<TrtGptModel> mTrtGptModel;
SizeType mMaxInputLen;
SizeType mMaxOutputLen;
SizeType mMaxKvCacheLen;
SizeType mMaxNumSequences;
std::optional<uint64_t> mTerminateReqId;

7 changes: 5 additions & 2 deletions cpp/include/tensorrt_llm/batch_manager/kvCacheConfig.h
@@ -29,14 +29,17 @@ class KvCacheConfig
public:
using SizeType = tensorrt_llm::runtime::SizeType;

explicit KvCacheConfig(
std::optional<SizeType> maxTokens = std::nullopt, std::optional<float> freeGpuMemoryFraction = std::nullopt)
explicit KvCacheConfig(std::optional<SizeType> maxTokens = std::nullopt,
std::optional<SizeType> maxKvCacheLength = std::nullopt,
std::optional<float> freeGpuMemoryFraction = std::nullopt)
: maxTokens{maxTokens}
, maxKvCacheLength{maxKvCacheLength}
, freeGpuMemoryFraction{freeGpuMemoryFraction}
{
}

std::optional<SizeType> maxTokens;
std::optional<SizeType> maxKvCacheLength;
std::optional<float> freeGpuMemoryFraction;

static constexpr auto kDefaultGpuMemFraction = 0.85f;
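A hedged usage sketch of the extended constructor — the include path is taken from this header, but the namespace is an assumption and the values are illustrative. Sequences whose KV cache would grow beyond maxKvCacheLength are presumably served by the cyclic KV cache described in kvCacheManager.h below:

#include <optional>

#include "tensorrt_llm/batch_manager/kvCacheConfig.h"

int main()
{
    using tensorrt_llm::batch_manager::kv_cache_manager::KvCacheConfig; // namespace assumed

    // Leave maxTokens unset, cap the per-sequence KV cache at 2048 tokens (illustrative value),
    // and let the manager claim 90% of the free GPU memory.
    KvCacheConfig config(std::nullopt, 2048, 0.9f);

    return config.maxKvCacheLength.value_or(0) == 2048 ? 0 : 1;
}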
5 changes: 4 additions & 1 deletion cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
@@ -217,7 +217,7 @@ class KVCacheManager

KVCacheManager(SizeType numLayers, SizeType numHeads, SizeType numKvHeads, SizeType hiddenSize,
SizeType tokensPerBlock, SizeType maxNumBlocks, SizeType maxBatchSize, SizeType maxBeamWidth,
SizeType maxBlocksPerSeq, nvinfer1::DataType dtype, CudaStreamPtr stream);
SizeType maxBlocksPerSeq, SizeType maxKvCacheLength, nvinfer1::DataType dtype, CudaStreamPtr stream);

void startScheduling();

@@ -330,6 +330,9 @@ class KVCacheManager
SizeType mMaxBeamWidth;
// Maximum number of blocks per sequence
SizeType mMaxBlocksPerSeq;
// Maximum kv cache length per sequence
// Enable cyclic kv cache when it exceeds
SizeType mMaxKvCacheLength;
// Pools
std::vector<runtime::ITensor::SharedPtr> mPools;
// Block manager
4 changes: 3 additions & 1 deletion cpp/include/tensorrt_llm/runtime/decodingInput.h
@@ -29,9 +29,10 @@ class DecodingInput
public:
using TensorPtr = std::shared_ptr<ITensor const>;

DecodingInput(SizeType maxLength, SizeType batchSize, TensorPtr logits, TensorPtr endIds)
DecodingInput(SizeType maxLength, SizeType maxKvCacheLength, SizeType batchSize, TensorPtr logits, TensorPtr endIds)
: step{maxLength}
, maxLength{maxLength}
, maxKvCacheLength{maxKvCacheLength}
, batchSize{batchSize}
, logits{std::move(logits)}
, endIds{std::move(endIds)}
@@ -43,6 +44,7 @@ class DecodingInput
// mandatory parameters
SizeType step;
SizeType maxLength;
SizeType maxKvCacheLength;
SizeType batchSize;
TensorPtr logits; // [batchSize, beamWidth, vocabSizePadded], on gpu
TensorPtr endIds; // [batchSize * beamWidth], on gpu
2 changes: 1 addition & 1 deletion cpp/include/tensorrt_llm/runtime/generationInput.h
@@ -54,7 +54,7 @@ class GenericGenerationInput
bool packed; // indicates if ids are packed or padded to maxInputLength

// optional parameters
TensorPtr embeddingBiasOpt; // [vocabSizePadded], on gpu
TensorPtr embeddingBias; // [vocabSizePadded], on gpu
TensorPtr badWordsList; // [2, badWordsLength] or [batchSize, 2, badWordsLength], on gpu
TensorPtr stopWordsList; // [batchSize, 2, stopWordsLength], on gpu
std::optional<SizeType> maxNewTokens; // max number of tokens to generate
5 changes: 3 additions & 2 deletions cpp/include/tensorrt_llm/runtime/gptDecoderBatch.h
@@ -44,8 +44,8 @@ class GptDecoderBatch
GptDecoderBatch(std::size_t vocabSize, std::size_t vocabSizePadded, CudaStreamPtr stream);

//! Setup the decoder before calling `forward()`
void setup(
SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxSequenceLength, nvinfer1::DataType dtype) override;
void setup(SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxKvCacheLength, SizeType maxSequenceLength,
nvinfer1::DataType dtype) override;

//! @brief Initialize the decoder at `batchIdx` with a new `request`.
void newRequest(
@@ -166,6 +166,7 @@ class GptDecoderBatch
std::vector<SizeType> mMaxNewTokens;
std::vector<SizeType> mBeamWidths;
SizeType mMaxSequenceLength{};
SizeType mMaxKvCacheLength{};
SizeType mActualBatchSize{};
};
} // namespace tensorrt_llm::runtime
7 changes: 4 additions & 3 deletions cpp/include/tensorrt_llm/runtime/gptSession.h
@@ -140,10 +140,10 @@ class GptSession

void createContexts(SizeType numBatchesCtx, SizeType numBatchesGen, bool useCudaGraphs);
void createBuffers(SizeType numMicroBatches);
void createDecoders(SizeType batchSize, SizeType beamWidth, SizeType maxSequenceLength,
void createDecoders(SizeType batchSize, SizeType beamWidth, SizeType maxKvCacheLength, SizeType maxSequenceLength,
nvinfer1::DataType logitsType, bool decoderPerRequest, SizeType numMicroBatches);
void createKvCacheManager(
SizeType batchSize, SizeType beamWidth, SizeType maxSequenceLength, KvCacheConfig const& config);
void createKvCacheManager(SizeType batchSize, SizeType beamWidth, SizeType maxKvCacheLength,
SizeType maxSequenceLength, KvCacheConfig const& config);
void createCustomAllReduceWorkspace(SizeType batchSize, SizeType beamWidth, SizeType maxSequenceLength);

void executeContextStep(std::vector<GenerationInput> const& microBatches,
@@ -258,6 +258,7 @@ class GptSession
std::vector<std::shared_ptr<IpcMemory>> mIpcMemoryHandles;

SizeType mDecoderMaxSequenceLength{};
SizeType mDecoderMaxKvCacheLength{};

LoggerPtr mLogger;
std::shared_ptr<TllmRuntime> mRuntime;
4 changes: 2 additions & 2 deletions cpp/include/tensorrt_llm/runtime/iStatefulGptDecoder.h
@@ -73,8 +73,8 @@ class IStatefulGptDecoder
using TensorPtr = std::shared_ptr<ITensor>;

//! Setup the decoder before calling `forward()`, also calls reshapeBuffers
virtual void setup(
SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxSequenceLength, nvinfer1::DataType dtype)
virtual void setup(SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxKvCacheLength,
SizeType maxSequenceLength, nvinfer1::DataType dtype)
= 0;

//! @brief Initialize the decoder with new batch of inputs.
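Call sites of setup() gain the extra maxKvCacheLength argument (GptDecoderBatch and GptSession above thread it through as well). A self-contained sketch of the new call shape — the class below is a stand-in so the example compiles without TensorRT-LLM, the values are illustrative, and passing maxKvCacheLength equal to maxSequenceLength should simply reproduce the previous non-cyclic behaviour:

#include <cstdint>
#include <iostream>

// Stand-in types so the sketch compiles without TensorRT-LLM.
using SizeType = std::int32_t;
enum class DataType { kHALF, kFLOAT };

struct MockDecoder
{
    void setup(SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxKvCacheLength,
        SizeType maxSequenceLength, DataType /*dtype*/)
    {
        std::cout << "batch=" << maxBatchSize << " beam=" << maxBeamWidth
                  << " kvCacheLen=" << maxKvCacheLength << " seqLen=" << maxSequenceLength << "\n";
    }
};

int main()
{
    MockDecoder decoder;
    // maxKvCacheLength is the argument added by this PR; here it just matches maxSequenceLength.
    decoder.setup(8, 1, 2048, 2048, DataType::kHALF);
    return 0;
}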
44 changes: 20 additions & 24 deletions cpp/include/tensorrt_llm/runtime/iTensor.h
@@ -25,6 +25,7 @@
#include <cstdint>
#include <functional>
#include <initializer_list>
#include <limits>
#include <memory>
#include <numeric>
#include <ostream>
@@ -48,6 +49,9 @@ class ITensor
using UniqueConstPtr = std::unique_ptr<ITensor const>;
using SharedConstPtr = std::shared_ptr<ITensor const>;
using Shape = nvinfer1::Dims;
using DimType = std::remove_reference_t<decltype(Shape::d[0])>;

~ITensor() override = default;

//!
//! \brief Returns the tensor dimensions.
@@ -59,7 +63,13 @@
//!
virtual void reshape(Shape const& dims) = 0;

~ITensor() override = default;
void resize(std::size_t newSize) override
{
if (newSize == getSize())
return;

reshape(makeShape({castSize(newSize)}));
}

//!
//! \brief Not allowed to copy.
@@ -101,18 +111,7 @@
//! \param dim The dimension that should be removed ("squeezed").
//! \return A new shape without the unit dimension.
//!
static Shape squeeze(Shape const& shape, SizeType dim)
{
TLLM_CHECK_WITH_INFO(shape.nbDims > 0, "Cannot squeeze 1-dimensional tensor");
TLLM_CHECK_WITH_INFO(
dim < shape.nbDims, common::fmtstr("Invalid index %d, tensor has %d dimensions", dim, shape.nbDims));
TLLM_CHECK_WITH_INFO(shape.d[dim] == 1, "Can only squeeze dimension of size 1");

Shape newDims{shape.nbDims - 1};
std::copy(shape.d, shape.d + dim, newDims.d);
std::copy(shape.d + dim + 1, shape.d + shape.nbDims, newDims.d + dim);
return newDims;
}
static Shape squeeze(Shape const& shape, SizeType dim);

//!
//! \brief Add a *unit* dimension to `shape` at the specified position.
@@ -121,17 +120,7 @@
//! \param dim The dimension where unit dimension should be added.
//! \return A new shape with the added unit dimension.
//!
static Shape unsqueeze(Shape const& shape, SizeType dim)
{
TLLM_CHECK_WITH_INFO(dim <= shape.nbDims && dim >= 0,
common::fmtstr("Invalid dim %d, tensor has %d dimensions", dim, shape.nbDims));

Shape newDims{shape.nbDims + 1};
std::copy(shape.d, shape.d + dim, newDims.d);
newDims.d[dim] = 1;
std::copy(shape.d + dim, shape.d + shape.nbDims, newDims.d + dim + 1);
return newDims;
}
static Shape unsqueeze(Shape const& shape, SizeType dim);

//!
//! \brief Removes the given *unit* dimensions from this tensor.
@@ -251,6 +240,13 @@

protected:
ITensor() = default;

static DimType castSize(size_t newSize)
{
TLLM_CHECK_WITH_INFO(
newSize <= std::numeric_limits<DimType>::max(), "New size is too large. Use reshape() instead.");
return static_cast<DimType>(newSize);
}
};

//! \brief Utility function to print a shape.
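The inline bodies of squeeze() and unsqueeze() move out of this header into the implementation file, and the header gains a default resize() that reshapes to a one-dimensional tensor guarded by castSize(). The shape semantics are unchanged; a standalone sketch of them on a plain std::vector shape (illustrative only — the real functions operate on nvinfer1::Dims and validate with TLLM_CHECK_WITH_INFO):

#include <cassert>
#include <cstdint>
#include <vector>

using Shape = std::vector<std::int64_t>;

// Remove a unit dimension at position dim, e.g. {1, 4, 8} -> {4, 8}.
Shape squeeze(Shape shape, std::size_t dim)
{
    assert(!shape.empty() && dim < shape.size() && shape[dim] == 1);
    shape.erase(shape.begin() + dim);
    return shape;
}

// Insert a unit dimension at position dim, e.g. {4, 8} -> {1, 4, 8}.
Shape unsqueeze(Shape shape, std::size_t dim)
{
    assert(dim <= shape.size());
    shape.insert(shape.begin() + dim, 1);
    return shape;
}

int main()
{
    Shape s{4, 8};
    s = unsqueeze(s, 0); // {1, 4, 8}
    s = squeeze(s, 0);   // back to {4, 8}
    assert((s == Shape{4, 8}));
    return 0;
}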
2 changes: 1 addition & 1 deletion cpp/include/tensorrt_llm/runtime/promptTuningParams.h
@@ -71,7 +71,7 @@ class PromptTuningParams : public GenericPromptTuningParams<ITensor::SharedPtr>
// Function assumes that the first numContextRequests requests in the batch are context requests
void fillTasksTensor(TensorPtr tasksHost, const SizeType batchSize, const SizeType numContextRequests,
const std::vector<SizeType>& reqBeamWidths, const std::vector<SizeType>& reqPromptLengths,
BufferManager& manager, bool packedInput);
BufferManager const& manager, bool packedInput);
};

} // namespace tensorrt_llm::runtime
8 changes: 0 additions & 8 deletions cpp/tensorrt_llm/CMakeLists.txt
@@ -84,14 +84,6 @@ if(BUILD_BATCH_MANAGER)
else()
add_library(${BATCH_MANAGER_TARGET} STATIC IMPORTED)
if(NOT WIN32) # Linux
execute_process(
COMMAND ${Python3_EXECUTABLE} "-c"
"import torch; print(torch.compiled_with_cxx11_abi(),end='');"
RESULT_VARIABLE _PYTHON_SUCCESS
OUTPUT_VARIABLE USE_CXX11_ABI)

message(STATUS "USE_CXX11_ABI: ${USE_CXX11_ABI}")

if(USE_CXX11_ABI)
set(BATCH_MANAGER_LIB_LOC
"${CMAKE_CURRENT_SOURCE_DIR}/batch_manager/${BATCH_MANAGER_TARGET_ARCH}/libtensorrt_llm_batch_manager_static.a"
Git LFS file not shown
Git LFS file not shown
6 changes: 3 additions & 3 deletions cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt
@@ -1,3 +1,3 @@
e1dc326c0c45864b9e7963b4d92d322f libtensorrt_llm_batch_manager_static.a
d2e9d76efe6b4173270aa6b494dfe59c libtensorrt_llm_batch_manager_static.pre_cxx11.a
07363ea7a6fdd6eeedc1670dedeeaedff7f9a848 commit
0776a4d41c06192c4ca0409ad8b837de libtensorrt_llm_batch_manager_static.a
c901725d5d278fd8d41f524f81fe5170 libtensorrt_llm_batch_manager_static.pre_cxx11.a
b3330c65d9b23d4f20c2b8d5a7c24cd45c910cd4 commit
Git LFS file not shown
Git LFS file not shown