Update TensorRT-LLM #2297

Merged 1 commit on Oct 8, 2024
3 changes: 3 additions & 0 deletions .gitmodules
@@ -14,3 +14,6 @@
[submodule "3rdparty/ucxx"]
path = 3rdparty/ucxx
url = https://github.com/GuanLuo/ucxx.git
[submodule "3rdparty/pybind11"]
path = 3rdparty/pybind11
url = https://github.com/pybind/pybind11.git
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -46,5 +46,5 @@ repos:
args:
- --skip=".git,3rdparty"
- --exclude-file=examples/whisper/tokenizer.py
- --ignore-words-list=rouge,inout,atleast,strat,nd,subtile,thrid
- --ignore-words-list=rouge,inout,atleast,strat,nd,subtile,thrid,improbe
exclude: 'tests/llm-test-defs/turtle/test_input_files'
1 change: 1 addition & 0 deletions 3rdparty/pybind11
Submodule pybind11 added at f99ffd
2 changes: 1 addition & 1 deletion README.md
@@ -103,7 +103,7 @@ Serverless TensorRT-LLM (LLaMA 3 8B) | Modal Docs [➡️ link](https://modal.co
## TensorRT-LLM Overview

TensorRT-LLM is a library for optimizing Large Language Model (LLM) inference.
It provides state-of-the-art optimziations, including custom attention kernels, inflight batching, paged KV caching, quantization (FP8, INT4 [AWQ](https://arxiv.org/abs/2306.00978), INT8 [SmoothQuant](https://arxiv.org/abs/2211.10438), ++) and much more, to perform inference efficiently on NVIDIA GPUs
It provides state-of-the-art optimizations, including custom attention kernels, inflight batching, paged KV caching, quantization (FP8, INT4 [AWQ](https://arxiv.org/abs/2306.00978), INT8 [SmoothQuant](https://arxiv.org/abs/2211.10438), ++) and much more, to perform inference efficiently on NVIDIA GPUs

TensorRT-LLM provides a Python API to build LLMs into optimized
[TensorRT](https://developer.nvidia.com/tensorrt) engines.
14 changes: 7 additions & 7 deletions benchmarks/Suite.md
@@ -42,7 +42,7 @@ This section covers how to benchmark TensorRT-LLM using inflight batching.
### Quickstart

For this quick start guide, we will focus on running a short max throughput benchmark on
`meta-llama/Llama-2-7b-hf` on a syntehtic dataset with a uniform distribution of prompts with ISL:OSL
`meta-llama/Llama-2-7b-hf` on a synthetic dataset with a uniform distribution of prompts with ISL:OSL
of 128:128. To run the benchmark from start to finish, simply run the following commands:

```shell
# (benchmark commands collapsed in this diff view)
```

@@ -101,12 +101,12 @@ The workflow for `trtllm-bench` is composed of the following steps:
The inflight benchmark utilizes a fixed JSON schema so that it is simple and
straightforward to specify requests. The schema is defined as follows:

| Key | Required | Type | Description |
| :- | :-: | :-: | :- |
| `task_id`| Y | String | Unique identifier for the request. |
| `prompt` | N* | String | Input text for a generation request. |
| `logits` | N* | List[Integer] | List of logits that make up the request prompt. |
| `output_tokens` | Y | Integer | Number of generated tokens for this request. |
| Key | Required | Type | Description |
| :-------------- | :------: | :-----------: | :---------------------------------------------- |
| `task_id` | Y | String | Unique identifier for the request. |
| `prompt` | N* | String | Input text for a generation request. |
| `logits` | N* | List[Integer] | List of logits that make up the request prompt. |
| `output_tokens` | Y | Integer | Number of generated tokens for this request. |

> [!NOTE] Prompt and logits are mutually exclusive*
> While having both `prompt` and `logits` is not required, at least one is required.
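
For illustration only, two request entries that follow the schema above might look like this. The field values are invented, and whether a dataset stores one JSON object per line or a JSON array depends on the benchmark's dataset format:

```json
{"task_id": "42", "prompt": "Summarize the plot of Hamlet in one sentence.", "output_tokens": 128}
{"task_id": "43", "logits": [128000, 791, 3823, 374], "output_tokens": 128}
```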
5 changes: 4 additions & 1 deletion cpp/CMakeLists.txt
@@ -316,14 +316,17 @@ endif()
get_filename_component(TRT_LLM_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR} PATH)

set(3RDPARTY_DIR ${TRT_LLM_ROOT_DIR}/3rdparty)
add_subdirectory(${3RDPARTY_DIR}/pybind11 ${CMAKE_CURRENT_BINARY_DIR}/pybind11)

include_directories(
${CUDAToolkit_INCLUDE_DIRS}
${CUDNN_ROOT_DIR}/include
${NCCL_INCLUDE_DIR}
${3RDPARTY_DIR}/cutlass/include
${3RDPARTY_DIR}/cutlass/tools/util/include
${3RDPARTY_DIR}/NVTX/include
${3RDPARTY_DIR}/json/include)
${3RDPARTY_DIR}/json/include
${3RDPARTY_DIR}/pybind11/include)

# TRT dependencies
set_ifndef(TRT_LIB_DIR ${CMAKE_BINARY_DIR})
187 changes: 187 additions & 0 deletions cpp/include/tensorrt_llm/batch_manager/capacityScheduler.h
@@ -0,0 +1,187 @@
/*
* Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include "common.h"
#include "tensorrt_llm/batch_manager/llmRequest.h"
#include "tensorrt_llm/common/algorithm.h"
#include "tensorrt_llm/runtime/common.h"
#include <variant>

namespace tensorrt_llm::batch_manager
{
namespace kv_cache_manager
{
class KVCacheManager;
}
class BasePeftCacheManager;
} // namespace tensorrt_llm::batch_manager

namespace tensorrt_llm::batch_manager
{

using tensorrt_llm::runtime::SizeType32;

/// @brief This scheduler takes into account the given request capacity and the KV cache capacity.
/// Depending on the CapacitySchedulerPolicy it will schedule already started and new requests,
/// or even pause previously started requests.
class BaseCapacityScheduler
{
public:
explicit BaseCapacityScheduler(LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState)
: mNoScheduleUntilState(noScheduleUntilState)
, mNoScheduleAfterState(noScheduleAfterState)
{
}

[[nodiscard]] LlmRequestState constexpr getNoScheduleUntilState() const noexcept
{
return mNoScheduleUntilState;
}

[[nodiscard]] LlmRequestState constexpr getNoScheduleAfterState() const noexcept
{
return mNoScheduleAfterState;
}

private:
/// The state until/after which the scheduler should not schedule requests
LlmRequestState mNoScheduleUntilState;
LlmRequestState mNoScheduleAfterState;
};

/// @brief Schedule up to maxNumRequests requests
class MaxRequestsScheduler : public BaseCapacityScheduler
{
public:
explicit MaxRequestsScheduler(SizeType32 maxNumRequests,
std::shared_ptr<kv_cache_manager::KVCacheManager> kvCacheManager,
std::shared_ptr<kv_cache_manager::KVCacheManager> crossKvCacheManager,
LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT,
LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE);

/// @brief Takes as input a sorted list of requests and outputs a sorted list of requests
/// to update for the current iteration, and a list of requests to pause
[[nodiscard]] std::tuple<RequestVector, RequestVector> operator()(RequestList const& activeRequests) const;

private:
SizeType32 mMaxNumRequests;
std::shared_ptr<kv_cache_manager::KVCacheManager> mKvCacheManager{nullptr};
std::shared_ptr<kv_cache_manager::KVCacheManager> mCrossKvCacheManager{nullptr};
};

/// @brief Schedule requests using the MAX_UTILIZATION policy
/// @details Try reserving resources to advance requests by one step,
/// may pause previously started requests.
class MaxUtilizationScheduler : public BaseCapacityScheduler
{
public:
MaxUtilizationScheduler(SizeType32 maxNumRequests, std::shared_ptr<kv_cache_manager::KVCacheManager> kvCacheManager,
std::shared_ptr<kv_cache_manager::KVCacheManager> crossKvCacheManager,
std::shared_ptr<BasePeftCacheManager> peftCacheManager, bool manyMicroBatches,
LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT,
LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE);

[[nodiscard]] std::tuple<RequestVector, RequestVector> operator()(RequestList const& activeRequests) const;

private:
/// @return {fitsKvCache, fitsPeft}
std::pair<bool, bool> trySchedulingRequestMaxUtilization(std::shared_ptr<LlmRequest> const& req,
RequestVector& scheduledRequests, SizeType32& numScheduledBlocks, SizeType32& numScheduledPeftPages,
std::unordered_set<uint64_t>& seenTaskIds) const;

SizeType32 mMaxNumRequests;
std::shared_ptr<kv_cache_manager::KVCacheManager> mKvCacheManager{nullptr};
std::shared_ptr<kv_cache_manager::KVCacheManager> mCrossKvCacheManager{nullptr};
std::shared_ptr<BasePeftCacheManager> mPeftCacheManager{nullptr};
/// @brief Boolean that indicates if multiple micro batches might be in flight
bool mManyMicroBatches;
};

/// @brief Schedule requests using the GUARANTEED_NO_EVICT policy
class GuaranteedNoEvictScheduler : public BaseCapacityScheduler
{
public:
GuaranteedNoEvictScheduler(SizeType32 maxNumRequests,
std::shared_ptr<kv_cache_manager::KVCacheManager> kvCacheManager,
std::shared_ptr<kv_cache_manager::KVCacheManager> crossKvCacheManager,
std::shared_ptr<BasePeftCacheManager> peftCacheManager,
LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT,
LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE);

[[nodiscard]] std::tuple<RequestVector, RequestVector> operator()(RequestList const& activeRequests) const;

protected:
[[nodiscard]] std::tuple<RequestVector, RequestVector> forwardImpl(
RequestList const& activeRequests, bool staticBatchScheduling) const;

private:
SizeType32 mMaxNumRequests;
std::shared_ptr<kv_cache_manager::KVCacheManager> mKvCacheManager{nullptr};
std::shared_ptr<kv_cache_manager::KVCacheManager> mCrossKvCacheManager{nullptr};
std::shared_ptr<BasePeftCacheManager> mPeftCacheManager{nullptr};
};

/// @brief Schedule requests using the STATIC_BATCH policy
class StaticBatchScheduler : public GuaranteedNoEvictScheduler
{
public:
StaticBatchScheduler(SizeType32 maxNumRequests, std::shared_ptr<kv_cache_manager::KVCacheManager> kvCacheManager,
std::shared_ptr<kv_cache_manager::KVCacheManager> crossKvCacheManager,
std::shared_ptr<BasePeftCacheManager> peftCacheManager,
LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT,
LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE);

[[nodiscard]] std::tuple<RequestVector, RequestVector> operator()(RequestList const& activeRequests) const;
};

class CapacityScheduler : public Algorithm
{
public:
constexpr static auto name{"CapacityScheduler"};

CapacityScheduler() = default;

CapacityScheduler(SizeType32 maxNumRequests, std::shared_ptr<kv_cache_manager::KVCacheManager> kvCacheManager,
std::shared_ptr<kv_cache_manager::KVCacheManager> crossKvCacheManager,
std::shared_ptr<BasePeftCacheManager> peftCacheManager,
executor::CapacitySchedulerPolicy capacitySchedulerPolicy, bool manyMicroBatches = false,
LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT,
LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE);

static CapacityScheduler make(SizeType32 maxNumRequests,
std::shared_ptr<kv_cache_manager::KVCacheManager> kvCacheManager,
std::shared_ptr<kv_cache_manager::KVCacheManager> crossKvCacheManager,
std::shared_ptr<BasePeftCacheManager> peftCacheManager,
executor::CapacitySchedulerPolicy capacitySchedulerPolicy, bool manyMicroBatches = false,
LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT,
LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE)
{
return CapacityScheduler{maxNumRequests, std::move(kvCacheManager), std::move(crossKvCacheManager),
std::move(peftCacheManager), capacitySchedulerPolicy, manyMicroBatches, noScheduleUntilState,
noScheduleAfterState};
}

[[nodiscard]] std::tuple<RequestVector, RequestVector> operator()(RequestList const& activeRequests) const;

private:
std::variant<std::monostate, MaxRequestsScheduler, MaxUtilizationScheduler, GuaranteedNoEvictScheduler,
StaticBatchScheduler>
mScheduler;
};

} // namespace tensorrt_llm::batch_manager
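
The `CapacityScheduler` facade above can be exercised roughly as follows. This is a minimal sketch, not code from this PR: the helper name, the fixed request budget of 64, and the choice of `kGUARANTEED_NO_EVICT` are illustrative assumptions, and the cache managers are expected to come from the surrounding batch manager.

```cpp
// Illustrative sketch only: drives one scheduling iteration through the
// CapacityScheduler facade declared above. The cache managers, request list,
// and the kGUARANTEED_NO_EVICT policy value are assumptions, not part of this PR.
#include "tensorrt_llm/batch_manager/capacityScheduler.h"

#include <memory>
#include <utility>

namespace tb = tensorrt_llm::batch_manager;

tb::RequestVector scheduleOneIteration(
    std::shared_ptr<tb::kv_cache_manager::KVCacheManager> kvCacheManager,
    std::shared_ptr<tb::BasePeftCacheManager> peftCacheManager,
    tb::RequestList const& activeRequests)
{
    // Build the scheduler with the GUARANTEED_NO_EVICT policy; no cross-attention
    // KV cache manager is used in this sketch.
    auto scheduler = tb::CapacityScheduler::make(/*maxNumRequests=*/64, std::move(kvCacheManager),
        /*crossKvCacheManager=*/nullptr, std::move(peftCacheManager),
        tensorrt_llm::executor::CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT);

    // operator() splits the active requests into those that fit this iteration
    // and those that must be paused until capacity frees up.
    auto [fittingRequests, requestsToPause] = scheduler(activeRequests);
    return fittingRequests;
}
```

The same call shape applies to the other policies; only the `capacitySchedulerPolicy` argument changes.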
118 changes: 118 additions & 0 deletions cpp/include/tensorrt_llm/batch_manager/common.h
@@ -0,0 +1,118 @@
/*
* Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include "tensorrt_llm/runtime/common.h"
#include <cstdint>
#include <list>
#include <memory>
#include <unordered_set>
#include <utility>
#include <vector>

namespace tensorrt_llm::executor
{
class RequestWithId;
}

namespace tensorrt_llm::batch_manager
{
class LlmRequest;

using RequestList = std::list<std::shared_ptr<LlmRequest>>;
using RequestIdType = std::uint64_t;
using RequestVector = std::vector<std::shared_ptr<LlmRequest>>;
using ReqIdsSet = std::unordered_set<RequestIdType>;

class ScheduledRequests
{
public:
/// @brief context phase requests (for decoder-only models) or encoder phase requests (for encoder-decoder models
/// and encoder-only models)
RequestVector contextRequests;

/// @brief generation phase requests (for decoder-only models) or empty for others
RequestVector generationRequests;

ScheduledRequests() = default;

explicit ScheduledRequests(RequestVector contextRequests, RequestVector generationRequests)
: contextRequests{std::move(contextRequests)}
, generationRequests{std::move(generationRequests)}
{
}

[[nodiscard]] bool empty() const
{
return contextRequests.empty() && generationRequests.empty();
}

[[nodiscard]] std::size_t size() const
{
return contextRequests.size() + generationRequests.size();
}
};

class BatchState
{
public:
BatchState() = default;

BatchState(runtime::SizeType32 numCtxRequests, runtime::SizeType32 numGenRequests, runtime::SizeType32 numTokens,
runtime::SizeType32 maxKvCacheLength)
: mNumCtxRequests{numCtxRequests}
, mNumGenRequests{numGenRequests}
, mNumTokens{numTokens}
, mMaxKvCacheLength{maxKvCacheLength}
{
}

bool isAnyContext() const
{
return mNumCtxRequests > 0;
}

bool operator==(BatchState const& other) const
{
return mNumCtxRequests == other.mNumCtxRequests && mNumGenRequests == other.mNumGenRequests
&& mNumTokens == other.mNumTokens && mMaxKvCacheLength == other.mMaxKvCacheLength;
}

size_t hash() const
{
size_t h1 = std::hash<runtime::SizeType32>{}(mNumCtxRequests);
size_t h2 = std::hash<runtime::SizeType32>{}(mNumGenRequests);
size_t h3 = std::hash<runtime::SizeType32>{}(mNumTokens);
size_t h4 = std::hash<runtime::SizeType32>{}(mMaxKvCacheLength);
return h1 ^ h2 ^ h3 ^ h4;
}

runtime::SizeType32 mNumCtxRequests;
runtime::SizeType32 mNumGenRequests;
runtime::SizeType32 mNumTokens;
runtime::SizeType32 mMaxKvCacheLength;
};

struct BatchStateHash
{
size_t operator()(BatchState const& bs) const
{
return bs.hash();
}
};

} // namespace tensorrt_llm::batch_manager
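
To show how the two helpers at the bottom of this header compose, here is a minimal sketch (not code from this PR; the map payload and batch shapes are invented) that uses `BatchState` as a hash-map key via `BatchStateHash`:

```cpp
// Minimal sketch: BatchState plus BatchStateHash can key a hash map, e.g. to
// memoize per-batch-shape data. The labels and batch shapes below are invented.
#include "tensorrt_llm/batch_manager/common.h"

#include <string>
#include <unordered_map>

namespace tb = tensorrt_llm::batch_manager;

int main()
{
    // operator== provides equality and BatchStateHash provides the hash,
    // so BatchState works directly as an unordered_map key.
    std::unordered_map<tb::BatchState, std::string, tb::BatchStateHash> labelByBatchShape;

    // Constructor order: numCtxRequests, numGenRequests, numTokens, maxKvCacheLength.
    tb::BatchState contextHeavy{4, 0, 2048, 4096};
    tb::BatchState generationOnly{0, 16, 16, 4096};

    labelByBatchShape[contextHeavy] = "context phase";
    labelByBatchShape[generationOnly] = "generation phase";

    // isAnyContext() is true only while context-phase requests remain in the batch.
    return (contextHeavy.isAnyContext() && !generationOnly.isAnyContext()) ? 0 : 1;
}
```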