From 8809c33e18c961659a0a69baa873b9b2ba017274 Mon Sep 17 00:00:00 2001 From: Dan Blanaru <48605845+DanBlanaru@users.noreply.github.com> Date: Tue, 8 Oct 2024 08:51:52 +0000 Subject: [PATCH] open source 4dbf696ae9b74a26829d120b67ab8443d70c8e58 Remove cu --- .gitmodules | 3 + .pre-commit-config.yaml | 2 +- 3rdparty/pybind11 | 1 + README.md | 2 +- benchmarks/Suite.md | 14 +- cpp/CMakeLists.txt | 5 +- .../batch_manager/capacityScheduler.h | 187 ++++++ .../tensorrt_llm/batch_manager/common.h | 118 ++++ .../batch_manager/evictionPolicy.h | 74 +++ .../batch_manager/kvCacheManager.h | 152 +++-- .../tensorrt_llm/batch_manager/kvCacheUtils.h | 5 + .../tensorrt_llm/batch_manager/llmRequest.h | 124 ++-- .../batch_manager/microBatchScheduler.h | 108 +++ .../batch_manager/peftCacheManager.h | 2 + .../batch_manager/trtGptModelOptionalParams.h | 15 +- cpp/include/tensorrt_llm/common/algorithm.h | 32 + cpp/include/tensorrt_llm/common/mpiUtils.h | 2 +- cpp/include/tensorrt_llm/executor/executor.h | 51 +- .../tensorrt_llm/executor/serialization.h | 5 + cpp/include/tensorrt_llm/executor/types.h | 33 + .../tensorrt_llm/runtime/decodingInput.h | 16 + .../tensorrt_llm/runtime/decodingOutput.h | 2 +- cpp/include/tensorrt_llm/runtime/gptDecoder.h | 10 - .../tensorrt_llm/runtime/gptDecoderBatched.h | 15 +- .../libtensorrt_llm_batch_manager_static.a | 4 +- ...sorrt_llm_batch_manager_static.pre_cxx11.a | 4 +- .../aarch64-linux-gnu/version.txt | 6 +- .../libtensorrt_llm_batch_manager_static.a | 4 +- ...sorrt_llm_batch_manager_static.pre_cxx11.a | 4 +- .../x86_64-linux-gnu/version.txt | 6 +- .../tensorrt_llm_batch_manager_static.lib | 4 +- .../x86_64-windows-msvc/version.txt | 4 +- .../common/customAllReduceUtils.h | 2 +- cpp/tensorrt_llm/common/mpiUtils.cpp | 12 + cpp/tensorrt_llm/common/reduceKernelUtils.cuh | 6 + .../libtensorrt_llm_executor_static.a | 4 +- ...ibtensorrt_llm_executor_static.pre_cxx11.a | 4 +- .../executor/aarch64-linux-gnu/version.txt | 6 +- .../libtensorrt_llm_executor_static.a | 4 +- ...ibtensorrt_llm_executor_static.pre_cxx11.a | 4 +- .../executor/x86_64-linux-gnu/version.txt | 6 +- .../tensorrt_llm_executor_static.lib | 4 +- .../executor/x86_64-windows-msvc/version.txt | 4 +- .../beamSearchKernelsTemplate.h | 4 +- .../fmhaRunner.cpp | 16 +- .../kernels/customAllReduceKernels.cu | 613 ++++++++++++++++-- .../kernels/customAllReduceKernels.h | 18 +- .../aarch64-linux-gnu/version.txt | 2 +- .../nvrtcWrapper/x86_64-linux-gnu/version.txt | 2 +- .../tensorrt_llm_nvrtc_wrapper.dll | 2 +- .../x86_64-windows-msvc/version.txt | 4 +- cpp/tensorrt_llm/kernels/decodingKernels.h | 11 + cpp/tensorrt_llm/kernels/gptKernels.cu | 2 +- ...orrt_llm_internal_cutlass_kernels_static.a | 2 +- ...nternal_cutlass_kernels_static.pre_cxx11.a | 2 +- .../aarch64-linux-gnu/version.txt | 6 +- ...orrt_llm_internal_cutlass_kernels_static.a | 2 +- ...nternal_cutlass_kernels_static.pre_cxx11.a | 2 +- .../x86_64-linux-gnu/version.txt | 6 +- ...rt_llm_internal_cutlass_kernels_static.lib | 4 +- .../x86_64-windows-msvc/version.txt | 4 +- .../kernels/mixtureOfExperts/moe_kernels.cu | 2 +- .../kernels/samplingAirTopPKernels.cu | 2 +- .../kernels/samplingTopPKernels.cu | 64 +- .../kernels/samplingTopPKernels.h | 21 +- .../externalDraftTokensKernels.cu | 361 ++++++----- .../externalDraftTokensKernels.h | 123 ++-- .../kernels/stopCriteriaKernels.h | 2 +- cpp/tensorrt_llm/layers/decodingLayer.cpp | 50 ++ cpp/tensorrt_llm/layers/decodingLayer.h | 2 +- cpp/tensorrt_llm/layers/decodingParams.h | 36 +- .../layers/externalDraftTokensLayer.cpp | 
514 +++++++++++++++ .../layers/externalDraftTokensLayer.h | 100 +++ cpp/tensorrt_llm/layers/topPSamplingLayer.cpp | 2 +- .../plugins/loraPlugin/loraPlugin.cpp | 4 +- .../plugins/ncclPlugin/allreducePlugin.cpp | 14 +- cpp/tensorrt_llm/pybind/CMakeLists.txt | 47 +- .../pybind/batch_manager/algorithms.cpp | 55 ++ .../pybind/batch_manager/algorithms.h | 28 + .../pybind/batch_manager/bindings.cpp | 41 ++ .../pybind/batch_manager/bindings.h | 28 + .../pybind/batch_manager/gptManager.h | 1 + .../pybind/batch_manager/inferenceRequest.h | 1 + .../pybind/batch_manager/kvCacheManager.cpp | 29 + .../pybind/batch_manager/kvCacheManager.h | 36 + .../pybind/batch_manager/llmRequest.cpp | 145 ++++- .../pybind/batch_manager/llmRequest.h | 11 + .../pybind/batch_manager/namedTensor.h | 1 + cpp/tensorrt_llm/pybind/bindings.cpp | 27 +- .../pybind/common/algorithmBindings.h | 39 ++ .../pybind/common/opaqueBindings.h | 18 + cpp/tensorrt_llm/pybind/executor/bindings.cpp | 7 +- cpp/tensorrt_llm/pybind/executor/bindings.h | 2 + cpp/tensorrt_llm/pybind/executor/executor.h | 2 + .../pybind/executor/streamCaster.h | 4 +- .../pybind/executor/tensorCaster.h | 4 +- cpp/tensorrt_llm/pybind/utils/bindTypes.h | 69 ++ cpp/tensorrt_llm/pybind/utils/pathCaster.h | 1 + cpp/tensorrt_llm/runtime/gptDecoder.cpp | 146 ++--- .../runtime/gptDecoderBatched.cpp | 193 ++---- cpp/tensorrt_llm/runtime/ipcUtils.cpp | 27 +- cpp/tensorrt_llm/runtime/tllmBuffers.h | 2 +- cpp/tensorrt_llm/runtime/tllmRuntime.cpp | 209 +++--- cpp/tensorrt_llm/runtime/tllmRuntime.h | 23 +- cpp/tests/CMakeLists.txt | 8 - .../kernels/allReduce/allReduceKernelTest.cu | 181 ++++-- cpp/tests/kernels/decodingKernelTest.cpp | 468 ++++++++++++- .../kernels/sampling/samplingAirTopPTest.cpp | 2 +- .../kernels/sampling/samplingTopPTest.cpp | 2 +- .../data/test_model_lora_config.json | 1 - .../scripts/build_chatglm_engines.py | 1 - .../resources/scripts/build_gpt_engines.py | 1 - .../resources/scripts/case_report_wrapper.py | 42 -- .../scripts/generate_expected_gpt_output.py | 10 +- cpp/tests/resources/scripts/test_cpp.py | 88 +-- cpp/tests/runtime/gptDecoderBatchedTest.cpp | 7 +- docker/Dockerfile.multi | 4 +- docs/source/advanced/gpt-runtime.md | 22 +- docs/source/advanced/speculative-decoding.md | 10 +- docs/source/architecture/workflow.md | 23 +- docs/source/blogs/quantization-in-TRT-LLM.md | 60 +- examples/baichuan/requirements.txt | 2 +- .../bindings/executor/example_advanced.py | 2 +- examples/bloom/requirements.txt | 2 +- examples/chatglm/requirements.txt | 2 +- examples/dbrx/requirements.txt | 2 +- examples/draft_target_model/README.md | 86 +++ examples/draft_target_model/requirements.txt | 6 + examples/falcon/requirements.txt | 2 +- examples/gemma/requirements.txt | 2 +- examples/gpt/README.md | 3 +- examples/gpt/requirements.txt | 2 +- examples/gptj/requirements.txt | 2 +- examples/gptneox/requirements.txt | 2 +- examples/grok/requirements.txt | 2 +- examples/internlm/requirements.txt | 2 +- examples/jais/requirements.txt | 2 +- examples/llama/convert_checkpoint.py | 15 + examples/llama/requirements.txt | 2 +- examples/llm-api/requirements.txt | 2 +- examples/mamba/requirements.txt | 2 +- examples/medusa/requirements.txt | 2 +- examples/mixtral/README.md | 2 +- examples/mixtral/requirements.txt | 2 +- examples/model_api/README.md | 2 +- examples/model_api/llama.py | 2 - examples/model_api/llama_multi_gpu.py | 1 - examples/mpt/requirements.txt | 2 +- examples/nemotron/requirements.txt | 2 +- examples/opt/requirements.txt | 2 +- examples/phi/requirements.txt | 2 +- 
examples/quantization/quantize.py | 42 +- examples/quantization/requirements.txt | 2 +- examples/qwen/requirements.txt | 2 +- examples/qwenvl/requirements.txt | 2 +- examples/recurrentgemma/requirements.txt | 2 +- examples/redrafter/requirements.txt | 2 +- examples/run.py | 412 +++++++++--- examples/skywork/requirements.txt | 2 +- examples/smaug/requirements.txt | 2 +- examples/summarize.py | 1 + examples/utils.py | 19 +- examples/whisper/requirements.txt | 2 +- requirements-dev.txt | 1 + scripts/build_wheel.py | 10 +- tensorrt_llm/_utils.py | 28 + tensorrt_llm/bench/build/build.py | 22 +- tensorrt_llm/bench/run/run.py | 11 +- tensorrt_llm/bench/utils/data.py | 41 +- tensorrt_llm/bench/utils/tokenize.py | 105 --- tensorrt_llm/builder.py | 28 +- tensorrt_llm/commands/build.py | 11 +- tensorrt_llm/functional.py | 22 +- tensorrt_llm/hlapi/llm_utils.py | 4 +- tensorrt_llm/layers/embedding.py | 2 +- tensorrt_llm/layers/mlp.py | 4 +- tensorrt_llm/layers/moe.py | 6 +- tensorrt_llm/models/convert_utils.py | 1 + tensorrt_llm/models/llama/convert.py | 13 +- tensorrt_llm/models/llama/model.py | 4 +- tensorrt_llm/models/model_weights_loader.py | 2 +- tensorrt_llm/models/modeling_utils.py | 259 ++++++-- .../models/redrafter/redrafter_helper.py | 136 +++- tensorrt_llm/parameter.py | 6 +- tensorrt_llm/plugin/plugin.py | 21 +- tensorrt_llm/quantization/mode.py | 10 +- tensorrt_llm/quantization/quantize.py | 90 ++- .../quantization/quantize_by_modelopt.py | 171 ++++- tensorrt_llm/runtime/model_runner_cpp.py | 245 ++++--- tensorrt_llm/tools/multimodal_builder.py | 3 +- tensorrt_llm/version.py | 2 +- tests/attention/test_gpt_attention.py | 1 - tests/attention/test_gpt_attention_IFB.py | 1 - tests/bindings/test_executor_bindings.py | 15 +- tests/conftest.py | 112 +--- tests/functional/test_moe.py | 3 +- tests/hlapi/test_llm.py | 4 +- tests/hlapi/test_llm_utils.py | 2 +- tests/model/test_gpt_e2e.py | 1 - tests/model/test_mamba.py | 2 +- tests/test_graph_rewriter.py | 2 +- tests/test_layer.py | 5 +- tests/test_model_runner_cpp.py | 84 +++ tests/utils/cpp_paths.py | 5 + tests/utils/util.py | 4 +- 205 files changed, 5553 insertions(+), 1792 deletions(-) create mode 160000 3rdparty/pybind11 create mode 100644 cpp/include/tensorrt_llm/batch_manager/capacityScheduler.h create mode 100644 cpp/include/tensorrt_llm/batch_manager/common.h create mode 100644 cpp/include/tensorrt_llm/batch_manager/evictionPolicy.h create mode 100644 cpp/include/tensorrt_llm/batch_manager/microBatchScheduler.h create mode 100644 cpp/include/tensorrt_llm/common/algorithm.h create mode 100644 cpp/tensorrt_llm/layers/externalDraftTokensLayer.cpp create mode 100644 cpp/tensorrt_llm/layers/externalDraftTokensLayer.h create mode 100644 cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp create mode 100644 cpp/tensorrt_llm/pybind/batch_manager/algorithms.h create mode 100644 cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp create mode 100644 cpp/tensorrt_llm/pybind/batch_manager/bindings.h create mode 100644 cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp create mode 100644 cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.h create mode 100644 cpp/tensorrt_llm/pybind/common/algorithmBindings.h create mode 100644 cpp/tensorrt_llm/pybind/common/opaqueBindings.h create mode 100644 cpp/tensorrt_llm/pybind/utils/bindTypes.h delete mode 100755 cpp/tests/resources/scripts/case_report_wrapper.py create mode 100644 examples/draft_target_model/README.md create mode 100644 examples/draft_target_model/requirements.txt delete mode 100644 
tensorrt_llm/bench/utils/tokenize.py create mode 100644 tests/test_model_runner_cpp.py diff --git a/.gitmodules b/.gitmodules index 6fdb69781..5a0efb0c9 100644 --- a/.gitmodules +++ b/.gitmodules @@ -14,3 +14,6 @@ [submodule "3rdparty/ucxx"] path = 3rdparty/ucxx url = https://github.com/GuanLuo/ucxx.git +[submodule "3rdparty/pybind11"] + path = 3rdparty/pybind11 + url = https://github.com/pybind/pybind11.git diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e4219d974..2dc60bd2d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -46,5 +46,5 @@ repos: args: - --skip=".git,3rdparty" - --exclude-file=examples/whisper/tokenizer.py - - --ignore-words-list=rouge,inout,atleast,strat,nd,subtile,thrid + - --ignore-words-list=rouge,inout,atleast,strat,nd,subtile,thrid,improbe exclude: 'tests/llm-test-defs/turtle/test_input_files' diff --git a/3rdparty/pybind11 b/3rdparty/pybind11 new file mode 160000 index 000000000..f99ffd7e0 --- /dev/null +++ b/3rdparty/pybind11 @@ -0,0 +1 @@ +Subproject commit f99ffd7e03001810a3e722bf48ad1a9e08415d7d diff --git a/README.md b/README.md index 0da5128a5..8f581897b 100644 --- a/README.md +++ b/README.md @@ -103,7 +103,7 @@ Serverless TensorRT-LLM (LLaMA 3 8B) | Modal Docs [➡️ link](https://modal.co ## TensorRT-LLM Overview TensorRT-LLM is a library for optimizing Large Language Model (LLM) inference. -It provides state-of-the-art optimziations, including custom attention kernels, inflight batching, paged KV caching, quantization (FP8, INT4 [AWQ](https://arxiv.org/abs/2306.00978), INT8 [SmoothQuant](https://arxiv.org/abs/2211.10438), ++) and much more, to perform inference efficiently on NVIDIA GPUs +It provides state-of-the-art optimizations, including custom attention kernels, inflight batching, paged KV caching, quantization (FP8, INT4 [AWQ](https://arxiv.org/abs/2306.00978), INT8 [SmoothQuant](https://arxiv.org/abs/2211.10438), ++) and much more, to perform inference efficiently on NVIDIA GPUs TensorRT-LLM provides a Python API to build LLMs into optimized [TensorRT](https://developer.nvidia.com/tensorrt) engines. diff --git a/benchmarks/Suite.md b/benchmarks/Suite.md index 1c81f0767..ea58db807 100644 --- a/benchmarks/Suite.md +++ b/benchmarks/Suite.md @@ -42,7 +42,7 @@ This section covers how to benchmark TensorRT-LLM using inflight batching. ### Quickstart For this quick start guide, we will focus on running a short max throughput benchmark on -`meta-llama/Llama-2-7b-hf` on a syntehtic dataset with a uniform distribution of prompts with ISL:OSL +`meta-llama/Llama-2-7b-hf` on a synthetic dataset with a uniform distribution of prompts with ISL:OSL of 128:128. In order to run the benchmark from start to finish simply run the following commands: ```shell @@ -101,12 +101,12 @@ The workflow for `trtllm-bench` is composed of the following steps: The inflight benchmark utilizes a fixed JSON schema so that it is simple and straightforward to specify requests. The schema is defined as follows: -| Key | Required | Type | Description | -| :- | :-: | :-: | :- | -| `task_id`| Y | String | Unique identifier for the request. | -| `prompt` | N* | String | Input text for a generation request. | -| `logits` | N* | List[Integer] | List of logits that make up the request prompt. | -| `output_tokens` | Y | Integer | Number of generated tokens for this request. 
| +| Key | Required | Type | Description | +| :-------------- | :------: | :-----------: | :---------------------------------------------- | +| `task_id` | Y | String | Unique identifier for the request. | +| `prompt` | N* | String | Input text for a generation request. | +| `logits` | N* | List[Integer] | List of logits that make up the request prompt. | +| `output_tokens` | Y | Integer | Number of generated tokens for this request. | > [!NOTE] Prompt and logits are mutually exclusive* > While having both `prompt` and `logits` is not required, at least one is required. diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 014b7d65b..125526f7e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -316,6 +316,8 @@ endif() get_filename_component(TRT_LLM_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR} PATH) set(3RDPARTY_DIR ${TRT_LLM_ROOT_DIR}/3rdparty) +add_subdirectory(${3RDPARTY_DIR}/pybind11 ${CMAKE_CURRENT_BINARY_DIR}/pybind11) + include_directories( ${CUDAToolkit_INCLUDE_DIRS} ${CUDNN_ROOT_DIR}/include @@ -323,7 +325,8 @@ include_directories( ${3RDPARTY_DIR}/cutlass/include ${3RDPARTY_DIR}/cutlass/tools/util/include ${3RDPARTY_DIR}/NVTX/include - ${3RDPARTY_DIR}/json/include) + ${3RDPARTY_DIR}/json/include + ${3RDPARTY_DIR}/pybind11/include) # TRT dependencies set_ifndef(TRT_LIB_DIR ${CMAKE_BINARY_DIR}) diff --git a/cpp/include/tensorrt_llm/batch_manager/capacityScheduler.h b/cpp/include/tensorrt_llm/batch_manager/capacityScheduler.h new file mode 100644 index 000000000..a08544e2a --- /dev/null +++ b/cpp/include/tensorrt_llm/batch_manager/capacityScheduler.h @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "common.h" +#include "tensorrt_llm/batch_manager/llmRequest.h" +#include "tensorrt_llm/common/algorithm.h" +#include "tensorrt_llm/runtime/common.h" +#include + +namespace tensorrt_llm::batch_manager +{ +namespace kv_cache_manager +{ +class KVCacheManager; +} +class BasePeftCacheManager; +} // namespace tensorrt_llm::batch_manager + +namespace tensorrt_llm::batch_manager +{ + +using tensorrt_llm::runtime::SizeType32; + +/// @brief This scheduler takes into account the given request capacity and the KV cache capacity. +/// Depending on the CapacitySchedulerPolicy it will schedule already started and new requests, +/// or even pause previously started requests. 
+class BaseCapacityScheduler +{ +public: + explicit BaseCapacityScheduler(LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState) + : mNoScheduleUntilState(noScheduleUntilState) + , mNoScheduleAfterState(noScheduleAfterState) + { + } + + [[nodiscard]] LlmRequestState constexpr getNoScheduleUntilState() const noexcept + { + return mNoScheduleUntilState; + } + + [[nodiscard]] LlmRequestState constexpr getNoScheduleAfterState() const noexcept + { + return mNoScheduleAfterState; + } + +private: + /// The state until/after which the scheduler should not schedule requests + LlmRequestState mNoScheduleUntilState; + LlmRequestState mNoScheduleAfterState; +}; + +/// @brief Schedule up to maxNumRequests requests +class MaxRequestsScheduler : public BaseCapacityScheduler +{ +public: + explicit MaxRequestsScheduler(SizeType32 maxNumRequests, + std::shared_ptr kvCacheManager, + std::shared_ptr crossKvCacheManager, + LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT, + LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE); + + /// @brief Takes as input a sorted list of requests and outputs a sorted lists of requests + /// to update for this current iteration, and a map of requests to pause + [[nodiscard]] std::tuple operator()(RequestList const& activeRequests) const; + +private: + SizeType32 mMaxNumRequests; + std::shared_ptr mKvCacheManager{nullptr}; + std::shared_ptr mCrossKvCacheManager{nullptr}; +}; + +/// @brief Schedule requests using the MAX_UTILIZATION policy +/// @details Try reserving resources to advance requests by one step, +/// may pause previously started requests. +class MaxUtilizationScheduler : public BaseCapacityScheduler +{ +public: + MaxUtilizationScheduler(SizeType32 maxNumRequests, std::shared_ptr kvCacheManager, + std::shared_ptr crossKvCacheManager, + std::shared_ptr peftCacheManager, bool manyMicroBatches, + LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT, + LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE); + + [[nodiscard]] std::tuple operator()(RequestList const& activeRequests) const; + +private: + /// @return {fitsKvCache, fitsPeft} + std::pair trySchedulingRequestMaxUtilization(std::shared_ptr const& req, + RequestVector& scheduledRequests, SizeType32& numScheduledBlocks, SizeType32& numScheduledPeftPages, + std::unordered_set& seenTaskIds) const; + + SizeType32 mMaxNumRequests; + std::shared_ptr mKvCacheManager{nullptr}; + std::shared_ptr mCrossKvCacheManager{nullptr}; + std::shared_ptr mPeftCacheManager{nullptr}; + /// @brief Boolean that indicates if multiple micro batches might be in flight + bool mManyMicroBatches; +}; + +/// @brief Schedule requests using the GUARANTEED_NO_EVICT policy +class GuaranteedNoEvictScheduler : public BaseCapacityScheduler +{ +public: + GuaranteedNoEvictScheduler(SizeType32 maxNumRequests, + std::shared_ptr kvCacheManager, + std::shared_ptr crossKvCacheManager, + std::shared_ptr peftCacheManager, + LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT, + LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE); + + [[nodiscard]] std::tuple operator()(RequestList const& activeRequests) const; + +protected: + [[nodiscard]] std::tuple forwardImpl( + RequestList const& activeRequests, bool staticBatchScheduling) const; + +private: + SizeType32 mMaxNumRequests; + std::shared_ptr mKvCacheManager{nullptr}; + std::shared_ptr mCrossKvCacheManager{nullptr}; + std::shared_ptr 
mPeftCacheManager{nullptr}; +}; + +/// @brief Schedule requests using the STATIC_BATCH policy +class StaticBatchScheduler : public GuaranteedNoEvictScheduler +{ +public: + StaticBatchScheduler(SizeType32 maxNumRequests, std::shared_ptr kvCacheManager, + std::shared_ptr crossKvCacheManager, + std::shared_ptr peftCacheManager, + LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT, + LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE); + + [[nodiscard]] std::tuple operator()(RequestList const& activeRequests) const; +}; + +class CapacityScheduler : public Algorithm +{ +public: + constexpr static auto name{"CapacityScheduler"}; + + CapacityScheduler() = default; + + CapacityScheduler(SizeType32 maxNumRequests, std::shared_ptr kvCacheManager, + std::shared_ptr crossKvCacheManager, + std::shared_ptr peftCacheManager, + executor::CapacitySchedulerPolicy capacitySchedulerPolicy, bool manyMicroBatches = false, + LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT, + LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE); + + static CapacityScheduler make(SizeType32 maxNumRequests, + std::shared_ptr kvCacheManager, + std::shared_ptr crossKvCacheManager, + std::shared_ptr peftCacheManager, + executor::CapacitySchedulerPolicy capacitySchedulerPolicy, bool manyMicroBatches = false, + LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT, + LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE) + { + return CapacityScheduler{maxNumRequests, std::move(kvCacheManager), std::move(crossKvCacheManager), + std::move(peftCacheManager), capacitySchedulerPolicy, manyMicroBatches, noScheduleUntilState, + noScheduleAfterState}; + } + + [[nodiscard]] std::tuple operator()(RequestList const& activeRequests) const; + +private: + std::variant + mScheduler; +}; + +} // namespace tensorrt_llm::batch_manager diff --git a/cpp/include/tensorrt_llm/batch_manager/common.h b/cpp/include/tensorrt_llm/batch_manager/common.h new file mode 100644 index 000000000..6e4a76bc4 --- /dev/null +++ b/cpp/include/tensorrt_llm/batch_manager/common.h @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
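A minimal usage sketch for the CapacityScheduler declared above. It assumes the elided shared_ptr element types are the forward-declared kv_cache_manager::KVCacheManager and BasePeftCacheManager, and that operator() returns the requests that fit this iteration together with the requests to pause (the tuple's template arguments are elided in the header text shown). The policy value and request cap are illustrative.

```cpp
#include "tensorrt_llm/batch_manager/capacityScheduler.h"
#include "tensorrt_llm/batch_manager/kvCacheManager.h"
#include "tensorrt_llm/batch_manager/peftCacheManager.h"

namespace tb = tensorrt_llm::batch_manager;

// Schedule one iteration under the GUARANTEED_NO_EVICT policy.
// Assumption: operator() yields (requests that fit this step, requests to pause).
tb::RequestVector scheduleOneStep(tb::RequestList const& activeRequests,
    std::shared_ptr<tb::kv_cache_manager::KVCacheManager> kvCacheManager,
    std::shared_ptr<tb::BasePeftCacheManager> peftCacheManager)
{
    auto const scheduler = tb::CapacityScheduler::make(/*maxNumRequests=*/64, std::move(kvCacheManager),
        /*crossKvCacheManager=*/nullptr, std::move(peftCacheManager),
        tensorrt_llm::executor::CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT);

    auto [fittingRequests, pausedRequests] = scheduler(activeRequests);
    (void) pausedRequests; // the caller would transition these to the paused state
    return fittingRequests;
}
```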
+ */ + +#pragma once + +#include "tensorrt_llm/runtime/common.h" +#include +#include +#include +#include +#include +#include + +namespace tensorrt_llm::executor +{ +class RequestWithId; +} + +namespace tensorrt_llm::batch_manager +{ +class LlmRequest; + +using RequestList = std::list>; +using RequestIdType = std::uint64_t; +using RequestVector = std::vector>; +using ReqIdsSet = std::unordered_set; + +class ScheduledRequests +{ +public: + /// @brief context phase requests (for decoder-only models) or encoder phase requests (for encoder-decoder models + /// and encoder-only models) + RequestVector contextRequests; + + /// @brief generation phase requests (for decoder-only models) or empty for others + RequestVector generationRequests; + + ScheduledRequests() = default; + + explicit ScheduledRequests(RequestVector contextRequests, RequestVector generationRequests) + : contextRequests{std::move(contextRequests)} + , generationRequests{std::move(generationRequests)} + { + } + + [[nodiscard]] bool empty() const + { + return contextRequests.empty() && generationRequests.empty(); + } + + [[nodiscard]] std::size_t size() const + { + return contextRequests.size() + generationRequests.size(); + } +}; + +class BatchState +{ +public: + BatchState() = default; + + BatchState(runtime::SizeType32 numCtxRequests, runtime::SizeType32 numGenRequests, runtime::SizeType32 numTokens, + runtime::SizeType32 maxKvCacheLength) + : mNumCtxRequests{numCtxRequests} + , mNumGenRequests{numGenRequests} + , mNumTokens{numTokens} + , mMaxKvCacheLength{maxKvCacheLength} + { + } + + bool isAnyContext() const + { + return mNumCtxRequests > 0; + } + + bool operator==(BatchState const& other) const + { + return mNumCtxRequests == other.mNumCtxRequests && mNumGenRequests == other.mNumGenRequests + && mNumTokens == other.mNumTokens && mMaxKvCacheLength == other.mMaxKvCacheLength; + } + + size_t hash() const + { + size_t h1 = std::hash{}(mNumCtxRequests); + size_t h2 = std::hash{}(mNumGenRequests); + size_t h3 = std::hash{}(mNumTokens); + size_t h4 = std::hash{}(mMaxKvCacheLength); + return h1 ^ h2 ^ h3 ^ h4; + } + + runtime::SizeType32 mNumCtxRequests; + runtime::SizeType32 mNumGenRequests; + runtime::SizeType32 mNumTokens; + runtime::SizeType32 mMaxKvCacheLength; +}; + +struct BatchStateHash +{ + size_t operator()(BatchState const& bs) const + { + return bs.hash(); + } +}; + +} // namespace tensorrt_llm::batch_manager diff --git a/cpp/include/tensorrt_llm/batch_manager/evictionPolicy.h b/cpp/include/tensorrt_llm/batch_manager/evictionPolicy.h new file mode 100644 index 000000000..a7326eee7 --- /dev/null +++ b/cpp/include/tensorrt_llm/batch_manager/evictionPolicy.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
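BatchState and BatchStateHash above exist so that a batch "shape" can serve as a hash-map key; the actual consumer is not part of this hunk, so the map below is purely illustrative.

```cpp
#include <unordered_map>

#include "tensorrt_llm/batch_manager/common.h"

namespace tb = tensorrt_llm::batch_manager;
namespace tr = tensorrt_llm::runtime;

// Count how often each batch shape occurs; equal shapes share one entry
// because BatchState::operator== compares all four fields.
std::unordered_map<tb::BatchState, int, tb::BatchStateHash> seenShapes;

void recordShape(tr::SizeType32 numCtx, tr::SizeType32 numGen, tr::SizeType32 numTokens, tr::SizeType32 maxKvLen)
{
    tb::BatchState const state{numCtx, numGen, numTokens, maxKvLen};
    ++seenShapes[state];
}
```

Note that hash() combines the four member hashes with a plain XOR, so two states whose context and generation counts are swapped land in the same bucket; that only affects bucket distribution, not correctness, since operator== still distinguishes them.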
+ */ + +#pragma once + +#include "tensorrt_llm/batch_manager/kvCacheManager.h" + +#include + +using namespace tensorrt_llm::batch_manager::kv_cache_manager; + +namespace tensorrt_llm::batch_manager::eviction_policy +{ + +class BaseEvictionPolicy +{ +public: + virtual ~BaseEvictionPolicy() = default; + + virtual void initialize( + std::vector& mAllBlocksById, SizeType32 numPrimaryBlocks, SizeType32 numSecondaryBlocks) + = 0; + + // Get a free block from the primary memory pool + virtual BlockPtr getFreePrimaryBlock() = 0; + // Get a free block from the secondary memory pool + virtual BlockPtr getFreeSecondaryBlock() = 0; + // Release a block. Prioritize the block for eviction if toFront=true + virtual void releaseBlock(BlockPtr block, bool toFront = false) = 0; + // Get the amount of free blocks in the primary memory pool + virtual SizeType32 getNumFreePrimaryBlocks() = 0; + // Get the amount of free blocks in the secondary memory pool + virtual SizeType32 getNumFreeSecondaryBlocks() = 0; + // Claim a free block. Called when the cache manager allocates or reuses a new block + virtual void claimBlock(KVCacheBlock block) = 0; +}; + +class LRUEvictionPolicy : public BaseEvictionPolicy +{ +public: + void initialize( + std::vector& mAllBlocksById, SizeType32 numPrimaryBlocks, SizeType32 numSecondaryBlocks) override; + BlockPtr getFreePrimaryBlock() override; + BlockPtr getFreeSecondaryBlock() override; + void releaseBlock(BlockPtr block, bool toFront = false) override; + SizeType32 getNumFreePrimaryBlocks() override; + SizeType32 getNumFreeSecondaryBlocks() override; + + void claimBlock(KVCacheBlock block); + +private: + FreeBlocksQueue mFreePrimaryBlocks; + FreeBlocksQueue mFreeSecondaryBlocks; + + std::vector> mFreeBlockIterators; + + SizeType32 mFreePrimaryBlocksSize; + SizeType32 mFreeSecondaryBlocksSize; +}; + +} // namespace tensorrt_llm::batch_manager::eviction_policy diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h index 959e2c39c..cc7aa9374 100644 --- a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h +++ b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h @@ -22,6 +22,7 @@ #include "tensorrt_llm/runtime/bufferManager.h" #include "tensorrt_llm/runtime/common.h" #include "tensorrt_llm/runtime/cudaStream.h" +#include "tensorrt_llm/runtime/iBuffer.h" #include "tensorrt_llm/runtime/iTensor.h" #include "tensorrt_llm/runtime/modelConfig.h" #include "tensorrt_llm/runtime/worldConfig.h" @@ -29,13 +30,18 @@ #include #include -#include +#include #include #include #include #include #include +namespace tensorrt_llm::batch_manager::eviction_policy +{ +class BaseEvictionPolicy; +} + namespace tensorrt_llm::batch_manager::kv_cache_manager { @@ -124,6 +130,8 @@ class KVCacheBlock [[nodiscard]] IdType getBlockId() const; + [[nodiscard]] NextBlockMap getNextBlocks() const; + [[nodiscard]] kernels::KVCacheIndex::UnderlyingType getMemoryPoolBlockIndex() const; [[nodiscard]] bool isPrimary() const; @@ -144,22 +152,12 @@ class KVCacheBlock [[nodiscard]] VecUniqueTokens const& getUniqueTokens() const; - void setFreeBlockIterator(FreeBlocksQueue::iterator freeBlockIterator); - - void resetFreeBlockIterator(); - - [[nodiscard]] std::optional const& getFreeBlockIterator() const; - void setPrevBlock(BlockPtr prevBlock); void addNextBlock(BlockKey const& blockKey, BlockPtr block); void removeNextBlock(BlockKey const& blockKey); - static std::shared_ptr findBestGPUBlockToFree(std::shared_ptr searchStart); - - static 
std::shared_ptr findLeafBlock(std::shared_ptr searchStart); - [[nodiscard]] BlockPtr findMatchingBlock(BlockKey const& blockKey) const; //! \brief Free block from previous block if present. @@ -203,14 +201,21 @@ class GenerationRequest { public: using SizeType32 = tensorrt_llm::runtime::SizeType32; - using SharedPtr = std::shared_ptr; - explicit GenerationRequest(SizeType32 seqSlotIdx, SizeType32 numTokens, SizeType32 beamWidth) - : mSeqSlotIdx(seqSlotIdx) + explicit GenerationRequest(LlmRequest::RequestIdType requestId, SizeType32 numTokens, SizeType32 beamWidth, + SizeType32 maxBlocks, SizeType32 numPools = 1) + : mRequestId(requestId) , mNumTokens(numTokens) , mBeamWidth(beamWidth) , mCacheBlockIds(beamWidth) + , mCacheBlockIndices{ + runtime::BufferManager::cpu(runtime::ITensor::makeShape({numPools, beamWidth, 2, maxBlocks}), + runtime::TRTDataType::value)} { + auto cacheBlockIdsRange = runtime::BufferRange(*mCacheBlockIndices); + std::fill(cacheBlockIdsRange.begin(), cacheBlockIdsRange.end(), + tensorrt_llm::kernels::KVCacheIndex{ + std::numeric_limits::max()}); } void addNewTokens(SizeType32 n) @@ -225,9 +230,9 @@ class GenerationRequest mNumTokens -= n; } - [[nodiscard]] SizeType32 getSequenceSlotIdx() const + [[nodiscard]] LlmRequest::RequestIdType getRequestId() const { - return mSeqSlotIdx; + return mRequestId; } [[nodiscard]] SizeType32 getNumTokens() const @@ -245,6 +250,16 @@ class GenerationRequest return mCacheBlockIds; } + [[nodiscard]] runtime::ITensor& getCacheBlockIndices() + { + return *mCacheBlockIndices; + } + + [[nodiscard]] runtime::ITensor const& getCacheBlockIndices() const + { + return *mCacheBlockIndices; + } + void addCacheBlock(SizeType32 beamIdx, KVCacheBlock::IdType blockId) { mCacheBlockIds.at(beamIdx).push_back(blockId); @@ -272,14 +287,16 @@ class GenerationRequest } private: - // Slot id of the sequence - SizeType32 mSeqSlotIdx; + // Request id of the sequence + LlmRequest::RequestIdType mRequestId; // Current number of generated tokens SizeType32 mNumTokens; // Number of beams SizeType32 mBeamWidth; - // List of blocks allocated for each beam of the sequence + // List of block ids allocated for each beam of the sequence std::vector> mCacheBlockIds; + // Tensor of block indices allocated for each beam of the sequence + runtime::ITensor::SharedPtr mCacheBlockIndices; }; // attach metadata to a pool pointer @@ -315,17 +332,19 @@ class KVCacheBlockPool // tokens_per_block, head_size]. The size per block and number of blocks are pre-determined and set in the constructor. // BlockManager maintains a list of free blocks at any time. // Alloc pops off the block at the front, and Free pushes it back to the vector. -// BlockManager maintains a vector of lists of seqSlotIdx to allocated blocks +// BlockManager maintains a vector of lists of request ids to allocated blocks // per sequence. This can be used to Free all blocks belonging to a sequence. 
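A small sketch of the per-sequence bookkeeping introduced above, using only the constructor and accessors shown in this hunk: GenerationRequest is now keyed by the request id rather than a sequence slot, and owns a CPU tensor of block indices shaped [numPools, beamWidth, 2, maxBlocks] that starts out filled with an invalid-index sentinel. The literal values are illustrative.

```cpp
#include "tensorrt_llm/batch_manager/kvCacheManager.h"

using namespace tensorrt_llm::batch_manager::kv_cache_manager;

void inspectSequence()
{
    // Request 42, 128 prompt tokens, beam width 1, room for 16 blocks per beam.
    GenerationRequest seq(/*requestId=*/42, /*numTokens=*/128, /*beamWidth=*/1, /*maxBlocks=*/16);

    auto const& indices = seq.getCacheBlockIndices();
    auto const& shape = indices.getShape(); // [1, 1, 2, 16] with the default single pool
    (void) shape;
    (void) seq.getRequestId(); // 42
}
```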
class BlockManager { public: using SizeType32 = tensorrt_llm::runtime::SizeType32; using CacheType = tensorrt_llm::batch_manager::kv_cache_manager::CacheType; + using BaseEvictionPolicy = tensorrt_llm::batch_manager::eviction_policy::BaseEvictionPolicy; explicit BlockManager(std::vector const& numKvHeadsPerLayer, SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, - std::shared_ptr stream, bool onboardBlocks, CacheType cacheType = CacheType::kSELF); + SizeType32 maxNumSequences, std::shared_ptr stream, bool onboardBlocks, + CacheType cacheType = CacheType::kSELF); ~BlockManager(); @@ -340,10 +359,6 @@ class BlockManager //! \brief Assign blocks for new sequence. Does not try to reuse blocks. void addSequence(GenerationRequest& sequence, SizeType32 numBlocks, SizeType32 unsharedBlockIdx); - //! \brief Release block, which puts it back onto free blocks queue. - //! \details Block appended by default, will be put at front if toFront is true. - void releaseBlock(std::shared_ptr block, bool toFront = false); - //! \brief Allocate new block for each beam of the sequence. //! \details Might free cached blocks if no free blocks are available. void allocateBlock(GenerationRequest& sequence, bool shareAmongBeams = false); @@ -359,10 +374,7 @@ class BlockManager //! \brief Release last block in the sequence void releaseLastBlock(GenerationRequest& sequence); - [[nodiscard]] SizeType32 getNumFreeBlocks() const noexcept - { - return mFreePrimaryBlocksSize; - } + [[nodiscard]] SizeType32 getNumFreeBlocks() const noexcept; [[nodiscard]] SizeType32 getNumAllocTotalBlocks() const { @@ -467,6 +479,11 @@ class BlockManager BlockKey findNewContextBlock( VecUniqueTokens const& uniqueTokens, std::shared_ptr const& llmRequest) const; + [[nodiscard]] runtime::BufferManager const& getBufferManager() const + { + return mBufferManager; + } + private: //! \brief Add single block to beam of sequence and mAllocatedBlocksPerSeq. void addBlockToBeam(BlockPtr& block, GenerationRequest& sequence, SizeType32 beamIdx); @@ -486,17 +503,9 @@ class BlockManager SizeType32 loadOrAllocateBlocks( std::list const& blockKeys, SizeType32 numContextBlocks, GenerationRequest& sequence); - //! \brief Find best primary block to free. - //! \details The best primary block to free is the primary block that appears first in the queue and have no primary - //! block descendants - [[nodiscard]] std::shared_ptr findBestGPUBlockToFree(); - //! \brief Find block least likely to be reused, free it if necessary and return. [[nodiscard]] BlockPtr getFreeBlock(); - //! \brief Claim block if it is in free blocks list. - void claimBlock(KVCacheBlock& block); - //! \brief Free block from previous block and claim it from free blocks list. void claimLeafBlock(KVCacheBlock& block); @@ -511,15 +520,9 @@ class BlockManager // Number of blocks in pools SizeType32 mNumPrimaryBlocks; SizeType32 mNumSecondaryBlocks; - // List of free blocks. Blocks are either backed by fast primary memory or slow secondary memory. - // We maintain separate queues for these. - // We cache size of each queue instead of calling std::list::size, because size is O(N) function. 
- SizeType32 mFreePrimaryBlocksSize; - SizeType32 mFreeSecondaryBlocksSize; - FreeBlocksQueue mFreePrimaryBlocks; - FreeBlocksQueue mFreeSecondaryBlocks; + // List of allocated blocks for each sequences - std::vector> mAllocatedBlocksPerSeq; + std::unordered_map> mAllocatedBlocksPerSeq; // Pool per unique numKvHeads in the model std::vector mPools; @@ -547,6 +550,8 @@ class BlockManager std::size_t mAllocTotalBlocks, mAllocNewBlocks, mReusedBlocks; // KV cache type (self or cross) CacheType mCacheType; + // Eviction Policy + std::shared_ptr mEvictionPolicy; private: friend class KVCacheManager; @@ -555,8 +560,9 @@ class BlockManager class KVCacheManager { public: + friend class KVCacheManagerBindings; + using SizeType32 = tensorrt_llm::runtime::SizeType32; - using SequencesPtr = GenerationRequest::SharedPtr; using CudaStreamPtr = std::shared_ptr; using CacheType = tensorrt_llm::batch_manager::kv_cache_manager::CacheType; @@ -647,10 +653,10 @@ class KVCacheManager /// @return The number of blocks [[nodiscard]] SizeType32 getRemainingBlocksToCompletion(LlmRequest const& req) const; - void addContextTokens(SizeType32 seqSlotIdx, SizeType32 numTokens); + void addContextTokens(LlmRequest::RequestIdType requestId, SizeType32 numTokens); - /// @brief Increase size for request at seqSlotIdx. Allocate new KV cache block(s) if needed. - void addToken(SizeType32 seqSlotIdx); + /// @brief Increase size for request with requestId. Allocate new KV cache block(s) if needed. + void addToken(LlmRequest::RequestIdType requestId); /// @brief Add new request to the KV cache manager. /// @param inputLength Input length for which KV cache need to be allocated. @@ -658,23 +664,29 @@ class KVCacheManager /// @param llmRequest Optional request to use for KV cache lookup. /// @details If llmRequest is supplied and KV cache reuse is enabled, try to recover KV cache blocks for /// inputLength - 1 tokens and populate prepopulatedPromptLen. - void addSequence(SizeType32 seqSlotIdx, SizeType32 inputLength, SizeType32 beamWidth, + void addSequence(LlmRequest::RequestIdType requestId, SizeType32 inputLength, SizeType32 beamWidth, std::shared_ptr const& llmRequest = nullptr); - void removeSequence(SizeType32 seqSlotIdx, std::shared_ptr const& llmRequest = nullptr); + void removeSequence(LlmRequest::RequestIdType requestId, std::shared_ptr const& llmRequest = nullptr); - void schedulingRemoveSequence(SizeType32 seqSlotIdx); + void schedulingRemoveSequence(LlmRequest::RequestIdType requestId); - [[nodiscard]] runtime::ITensor::UniquePtr getBlockPoolPointers() const; + [[nodiscard]] runtime::ITensor::SharedPtr getBlockPoolPointers() const + { + return mBlockPoolPointers; + } - [[nodiscard]] runtime::ITensor::UniquePtr getLayerToPoolMapping() const; + [[nodiscard]] runtime::ITensor::SharedPtr getLayerToPoolMapping() const + { + return mLayerToPoolMapping; + } void getBlockOffsetsOfBatch( runtime::ITensor& output, SizeType32 firstBatchSlotIdx, SizeType32 batchSize, SizeType32 beamWidth) const; //! 
@return maxBlockCount of all beams SizeType32 copyBlockOffsets( - runtime::ITensor& output, SizeType32 outputSlotOffset, SizeType32 seqSlotIdx, SizeType32 beamWidth) const; + runtime::ITensor& output, SizeType32 outputSlotOffset, LlmRequest::RequestIdType requestId) const; // Sum of numLayers * 2 * numKvHeads * sizePerHead for each pool [[nodiscard]] static SizeType32 calculateCacheSizePerToken( @@ -697,10 +709,10 @@ class KVCacheManager return mEnableBlockReuse; } - void removeToken(SizeType32 seqSlotIdx); - void rewindKVCache(SizeType32 seqSlotIdx, SizeType32 rewindLengths); + void removeToken(LlmRequest::RequestIdType requestId); + void rewindKVCache(LlmRequest::RequestIdType requestId, SizeType32 rewindLengths); - [[nodiscard]] GenerationRequest const& getSequence(SizeType32 seqSlotIdx) const; + [[nodiscard]] GenerationRequest const& getSequence(LlmRequest::RequestIdType requestId) const; [[nodiscard]] bool isCrossKv() const { @@ -714,7 +726,7 @@ class KVCacheManager //! \brief Store full context blocks contributed by llmRequest. //! \details These blocks become reusable from next step. - void storeContextBlocks(SizeType32 seqSlotIdx, std::shared_ptr const& llmRequest); + void storeContextBlocks(std::shared_ptr const& llmRequest); [[nodiscard]] static SizeType32 getSinkBubbleLength(SizeType32 sinkTokenLen, SizeType32 tokensPerBlock); @@ -722,14 +734,13 @@ class KVCacheManager SizeType32 tokensPerBlock, SizeType32 maxBeamWidth, SizeType32 sinkTokenLen, bool useOneMoreBlock); private: - void setOffsets(kernels::KVCacheIndex* offsetsPtr, nvinfer1::Dims const& offsetsShape, SizeType32 seqSlotIdx, - SizeType32 beamIdx, SizeType32 blockIdx, KVCacheBlock::IdType blockId) const; + void setOffsets(kernels::KVCacheIndex* offsetsPtr, nvinfer1::Dims const& offsetsShape, SizeType32 beamIdx, + SizeType32 blockIdx, KVCacheBlock::IdType blockId) const; - void resetBlockOffsets(SizeType32 seqSlotIdx, SizeType32 beamWidth); - void cacheBlockOffsets(GenerationRequest const& seq, SizeType32 seqSlotIdx); - void cacheNewBlockOffsets(GenerationRequest const& seq, SizeType32 seqSlotIdx); - void updateNewBlockPointer(GenerationRequest const& seq, SizeType32 seqSlotIdx, SizeType32 blockIdx); - void updateToken(SizeType32 seqSlotIdx, bool addToken); + void cacheBlockOffsets(GenerationRequest& seq); + void cacheNewBlockOffsets(GenerationRequest& seq); + void updateNewBlockPointer(GenerationRequest& seq, SizeType32 blockIdx); + void updateToken(GenerationRequest& sequence, bool addToken); private: // Maximum number of sequences @@ -749,12 +760,13 @@ class KVCacheManager SizeType32 mSinkBlockTokenLength; // Block manager BlockManager mBlockManager; - // List of all sequences - std::vector mSequences; - // buffer for block indices for all managed sequences - runtime::ITensor::SharedPtr mSequenceBlockIndices; + // Map of all sequences + std::unordered_map mSequences; // Whether to cache KV pages for reuse bool mEnableBlockReuse; + // buffers for static tensors, will be created after allocating pools + runtime::ITensor::SharedPtr mBlockPoolPointers; + runtime::ITensor::SharedPtr mLayerToPoolMapping; }; } // namespace tensorrt_llm::batch_manager::kv_cache_manager diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h index 1738cc428..69ca1963b 100644 --- a/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h +++ b/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h @@ -65,6 +65,11 @@ class BlockIterator return ret; } + operator 
runtime::ITensor::SharedPtr() + { + return mCurrent; + } + [[nodiscard]] bool operator==(BlockIterator const& other) const { return mIdx == other.mIdx && mPool.get() == other.mPool.get(); diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h index fed9dd21e..475970b7b 100644 --- a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h +++ b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h @@ -55,6 +55,7 @@ enum class LlmRequestState : int32_t /// Waiting context-only request transmitting the kv cache kDISAGG_CONTEXT_COMPLETE = 8, ///< Context-only request finished kv cache transmission. kDISAGG_GENERATION_TRANS_IN_PROGRESS = 9, ///< For disaggregated serving only: transmitting the kv cache + kWAITING_TO_SEND_LOGITS = 10, ///< Generation phase completed, logits not sent yet }; enum LlmRequestType @@ -132,8 +133,7 @@ class GenericLlmRequest , mLoraWeights(std::move(loraWeights)) , mLoraConfig(std::move(loraConfig)) , mLookaheadConfig(std::move(lookaheadConfig)) - , mContextChunkSize(std::nullopt) - , mContextCurrentPosition(0) + , mContextChunkSize{mPromptLen} , mLogProbs(samplingConfig.beamWidth) , mCumLogProbs(samplingConfig.beamWidth) , mDraftTokens(draftTokens.value_or(std::make_shared())) @@ -186,8 +186,7 @@ class GenericLlmRequest , mLoraWeights(std::nullopt) , mLoraConfig(std::nullopt) , mLookaheadConfig(std::nullopt) - , mContextChunkSize(std::nullopt) - , mContextCurrentPosition(0) + , mContextChunkSize{mPromptLen} , mLogProbs(mSamplingConfig.beamWidth) , mCumLogProbs(mSamplingConfig.beamWidth) , mDraftTokens(std::make_shared()) @@ -392,6 +391,15 @@ class GenericLlmRequest mMaxNewTokens = maxNewTokens; } + if (mNumReturnSequences > 1 && mSamplingConfig.beamWidth > 1) + { + TLLM_THROW( + "Using mNumReturnSequences (%d) > 1 with beam search is currently disabled, since TensorRT-LLM returns " + "a total of mNumReturnSequences x beamWidth beams, rather than limiting the number of returned beams " + "to mNumReturnSequences. This restriction will be removed once the issue is resolved.", + mNumReturnSequences); + } + TLLM_CHECK_WITH_INFO(mSamplingConfig.validate(), "Incorrect sampling config"); // validate extra ids when enabling kv cache reuse with prompt table @@ -722,7 +730,7 @@ class GenericLlmRequest mState = mEncoderTokens.has_value() || mEncoderInputFeatures ? LlmRequestState::kENCODER_INIT : LlmRequestState::kCONTEXT_INIT; mContextCurrentPosition = 0; - mContextChunkSize = std::nullopt; + mContextChunkSize = mPromptLen; mSeqSlot.reset(); } @@ -869,34 +877,33 @@ class GenericLlmRequest return mPromptLen; } + [[nodiscard]] SizeType32 getPrepopulatedPromptLen() const + { + return mPrepopulatedPromptLen; + } + void setPrepopulatedPromptLen(SizeType32 prepopulatedPromptLen, SizeType32 kvTokensPerBlock) { auto const promptLen = getPromptLen(); TLLM_CHECK(prepopulatedPromptLen < promptLen); + mPrepopulatedPromptLen = prepopulatedPromptLen; if (prepopulatedPromptLen > 0) { // Currently, the runtime process is to apply for cache first and then determine prepopulation. // Use the prepopulated length to advance the context position and decrease chunk size if necessary. 
- if (isFullContextRequest()) - { - setContextCurrentPosition(prepopulatedPromptLen); - setContextChunkSize(promptLen); - } - else + auto chunkSize = getContextChunkSize(); + if (prepopulatedPromptLen + chunkSize < promptLen) { - auto chunkSize = getContextChunkSize(); - if (prepopulatedPromptLen + chunkSize < promptLen) - { - // make sure to end at block boundary after current chunk - auto const flooredEndPosition - = (prepopulatedPromptLen + chunkSize) / kvTokensPerBlock * kvTokensPerBlock; - chunkSize = flooredEndPosition - prepopulatedPromptLen; - TLLM_CHECK(chunkSize <= getContextChunkSize()); - } - setContextCurrentPosition(prepopulatedPromptLen); - setContextChunkSize(chunkSize); + // make sure to end at block boundary after current chunk + auto const flooredEndPosition + = (prepopulatedPromptLen + chunkSize) / kvTokensPerBlock * kvTokensPerBlock; + chunkSize = flooredEndPosition - prepopulatedPromptLen; + TLLM_CHECK(chunkSize <= getContextChunkSize()); } + setContextCurrentPosition(prepopulatedPromptLen); + setContextChunkSize(chunkSize); + if (!isLastContextChunk()) { TLLM_CHECK_WITH_INFO((getContextCurrentPosition() + getContextChunkSize()) % kvTokensPerBlock == 0, @@ -1176,6 +1183,11 @@ class GenericLlmRequest return mState == LlmRequestState::kDISAGG_CONTEXT_COMPLETE; } + [[nodiscard]] bool isCompleteWaitingToSendLogits() const noexcept + { + return mState == LlmRequestState::kWAITING_TO_SEND_LOGITS; + } + /// To determine whether the context is unchunked. When a context is chunked into only a part, it /// is still different from the unchunked state, which indicates the initial status. [[nodiscard]] bool isFullContextRequest() const noexcept @@ -1211,12 +1223,11 @@ class GenericLlmRequest return mPromptLen - getContextCurrentPosition(); } - /// To retrieve the context chunk size, throw an exception when the context is not chunked. [[nodiscard]] SizeType32 getContextChunkSize() const { - TLLM_CHECK_WITH_INFO( - isContextInitState() && mContextChunkSize, "The current request is not in context chunking state."); - return mContextChunkSize.value(); + TLLM_CHECK_WITH_INFO(isContextInitState() || isDisaggGenerationInitState(), + "getContextChunkSize is only possible during the context phase."); + return mContextChunkSize; } /// To set the context chunk size, throw an exception when the chunk size is negative. If the chunk @@ -1224,45 +1235,34 @@ class GenericLlmRequest /// remaining length. void setContextChunkSize(SizeType32 size) { - TLLM_CHECK_WITH_INFO(isContextInitState(), "Chunking is only possible during the context phase."); + TLLM_CHECK_WITH_INFO(isContextInitState(), "setContextChunkSize is only possible during the context phase."); TLLM_CHECK_WITH_INFO(size >= 0, "The chunk size of context (%d) can't be negative.", size); mContextChunkSize = std::min(size, getContextRemainingLength()); } /// Determines whether the current position is only one chunk away from the end of the context. - /// It will return true when the context is not chunked. [[nodiscard]] bool isLastContextChunk() const noexcept { - return isFullContextRequest() - || (isContextInitState() && getContextCurrentPosition() + getContextChunkSize() == mPromptLen); + return isDisaggGenerationInitState() || getContextCurrentPosition() + getContextChunkSize() == mPromptLen; } - /// Returns whether the position is at the beginning of the context. It will return true when the - /// context is not chunked. + /// Returns whether the position is at the beginning of the context. 
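The block-boundary flooring in setPrepopulatedPromptLen above is easiest to check with concrete numbers; this standalone snippet replays the same arithmetic with illustrative values (70 reused tokens, a requested 64-token chunk, 32 tokens per block).

```cpp
#include <cassert>

int main()
{
    int const prepopulatedPromptLen = 70; // tokens recovered from reused KV-cache blocks
    int const kvTokensPerBlock = 32;
    int chunkSize = 64;                   // requested context chunk

    // Same flooring as in setPrepopulatedPromptLen: end the chunk on a block boundary.
    int const flooredEndPosition = (prepopulatedPromptLen + chunkSize) / kvTokensPerBlock * kvTokensPerBlock;
    chunkSize = flooredEndPosition - prepopulatedPromptLen;

    assert(chunkSize == 58);                                             // shrunk from 64
    assert((prepopulatedPromptLen + chunkSize) % kvTokensPerBlock == 0); // 128 is block-aligned
    return 0;
}
```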
[[nodiscard]] bool isFirstContextChunk() const noexcept { - return isFullContextRequest() || getContextCurrentPosition() == 0; - } - - [[nodiscard]] executor::PriorityType priority() const noexcept - { - return mPriority; + return getContextCurrentPosition() == 0; } /// Move the cursor forward one chunk. When not chunked, move forward to the end of the context. void moveToNextContextChunk() { TLLM_CHECK_WITH_INFO(isContextInitState(), "Chunking is only possible during the context phase."); - if (mContextChunkSize) - { - mContextCurrentPosition += getContextChunkSize(); - setContextChunkSize(0); - } - else - { - TLLM_CHECK_WITH_INFO(mContextCurrentPosition == 0, "Full context out of bounds."); - mContextCurrentPosition = mPromptLen; - } + mContextCurrentPosition += getContextChunkSize(); + setContextChunkSize(0); + } + + [[nodiscard]] executor::PriorityType priority() const noexcept + { + return mPriority; } /// Increment the counter of decoding iterations. @@ -1282,20 +1282,24 @@ class GenericLlmRequest return static_cast(getMaxNumGeneratedTokens()) / mDecodingIter; } + [[nodiscard]] bool isFinished() const noexcept + { + return isGenerationCompleteState() || isDisaggContextTransmissionState() || isCompleteWaitingToSendLogits(); + } + /// @brief Create a Response from the current state of the request /// @return An optional Response - std::optional createResponse() + std::optional createResponse(bool useFastLogits = false, int32_t mpiWorldRank = 0) { TLLM_CHECK(!isDisaggContextCompleteState()); - if (isGenerationCompleteState() || (mIsStreaming && mState == LlmRequestState::kGENERATION_IN_PROGRESS) - || isDisaggContextTransmissionState()) + if (isFinished() || (mIsStreaming && mState == LlmRequestState::kGENERATION_IN_PROGRESS)) { TLLM_LOG_DEBUG("Creating response for request %lu", mRequestId); executor::Result result; result.sequenceIndex = mSequenceIndex; - result.isSequenceFinal = isGenerationCompleteState() || isDisaggContextTransmissionState(); + result.isSequenceFinal = isFinished(); mSequenceFinalVec->at(mSequenceIndex) = result.isSequenceFinal; result.isFinal = std::all_of(mSequenceFinalVec->begin(), mSequenceFinalVec->end(), @@ -1333,8 +1337,7 @@ class GenericLlmRequest auto const startTokenPos = maxNbTokens - maxNbTokensOut; - auto const shouldSendResponse = isGenerationCompleteState() - || (mIsStreaming && maxNbTokens > getMaxSentTokenLen()) || isDisaggContextTransmissionState(); + auto const shouldSendResponse = isFinished() || (mIsStreaming && maxNbTokens > getMaxSentTokenLen()); if (!shouldSendResponse) { @@ -1374,6 +1377,11 @@ class GenericLlmRequest = runtime::ITensor::slice(getGenerationLogitsHost(), startGenTokenPos, maxNbTokensOut); result.generationLogits = executor::detail::ofITensor(generationLogitsHostCurrentStep); } + else if (useFastLogits) + { + result.specDecFastLogitsInfo + = executor::SpeculativeDecodingFastLogitsInfo{mRequestId, mpiWorldRank}; + } else { result.generationLogits = executor::detail::ofITensor(getGenerationLogitsHost()); @@ -1392,7 +1400,7 @@ class GenericLlmRequest setMaxSentTokenLen(maxNbTokens); auto requestId = isChild() ? mParentRequestId : mRequestId; - auto response = executor::Response(requestId, std::move(result)); + auto response = executor::Response(requestId, std::move(result), mClientId); return response; } @@ -1483,8 +1491,8 @@ class GenericLlmRequest // To enable chunked context, the FHMA paged kv-cache also needs to be enabled. 
Except for the last one, // the size of the context chunk needs to be an integer multiple of the kv-cache block size. The meaning // of null value is that the context is not chunked. - std::optional mContextChunkSize; - SizeType32 mContextCurrentPosition; + SizeType32 mContextChunkSize{0}; + SizeType32 mContextCurrentPosition{0}; std::vector mLogProbs; // [beamSize, seqLen] VecLogProbs mCumLogProbs; // [beamSize] @@ -1636,6 +1644,8 @@ class GenericLlmRequest class LlmRequest : public GenericLlmRequest { + friend class LlmRequestBindings; + public: using Base = GenericLlmRequest; using TensorPtr = Base::TensorPtr; diff --git a/cpp/include/tensorrt_llm/batch_manager/microBatchScheduler.h b/cpp/include/tensorrt_llm/batch_manager/microBatchScheduler.h new file mode 100644 index 000000000..2e932ba23 --- /dev/null +++ b/cpp/include/tensorrt_llm/batch_manager/microBatchScheduler.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "common.h" +#include "tensorrt_llm/batch_manager/llmRequest.h" +#include "tensorrt_llm/common/algorithm.h" +#include "tensorrt_llm/runtime/common.h" + +namespace tensorrt_llm::batch_manager +{ + +namespace batch_scheduler +{ + +struct ContextChunkingConfig +{ + ContextChunkingConfig() = default; + + executor::ContextChunkingPolicy chunkingPolicy; + /// The minimum size, also known as the chunk unit size. It generally + /// needs to be equal to the size of the kv cache block or its integer + /// multiples (except for the last context chunk) to avoid fragmentation. + /// When set to null, it indicates that the context chunk is disabled. + tensorrt_llm::runtime::SizeType32 chunkUnitSize; +}; + +} // namespace batch_scheduler + +/// @brief This scheduler takes into account the desired batch size and limits of the TRT engine to schedule requests. 
+class MicroBatchScheduler : Algorithm +{ +public: + constexpr static auto name{"MicroBatchScheduler"}; + + using SizeType32 = tensorrt_llm::runtime::SizeType32; + using ContextChunkingPolicy = tensorrt_llm::executor::ContextChunkingPolicy; + + MicroBatchScheduler() = default; + + explicit MicroBatchScheduler(SizeType32 maxBatchSize, std::optional maxNumTokens = std::nullopt, + std::optional ctxChunkConfig = std::nullopt, + std::optional maxContextLength = std::nullopt, + LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT, + LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE); + + static MicroBatchScheduler make(SizeType32 maxBatchSize, std::optional maxNumTokens = std::nullopt, + std::optional ctxChunkConfig = std::nullopt, + std::optional maxContextLength = std::nullopt, + LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT, + LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE) + { + return MicroBatchScheduler{ + maxBatchSize, maxNumTokens, ctxChunkConfig, maxContextLength, noScheduleUntilState, noScheduleAfterState}; + } + + std::tuple operator()( + RequestVector const& activeRequests, ReqIdsSet const& inflightReqIds); + + static void setCtxRequestsChunkSize(RequestVector const& contextsToBeChunked, ContextChunkingPolicy ctxChunkPolicy, + std::optional ctxTokensCapacity, SizeType32 chunkUnitSize, + std::optional const& maxContextLength); + +private: + template + static void setCtxRequestsChunkSize(RequestVector const& contextsToBeChunked, + std::optional ctxTokensCapacity, SizeType32 chunkUnitSize, + std::optional const& maxContextLength); + + /// After the chunk sizes have been determined, this function will discard + /// any draft tokens that don't fit. + static void fitDraftTokens(RequestVector const& contextsToBeChunked, std::optional ctxTokensCapacity, + SizeType32 chunkUnitSize, std::optional const& maxContextLength); + + /// The maximum number of requests returned by scheduleRequests + SizeType32 mMaxBatchSize; + + /// The maximum number of tokens to include in a batch + std::optional mMaxNumTokens; + + /// The maximum length of the context. If the context exceeds this length, + /// it must be chunked, otherwise it cannot be processed. Therefore, it + /// needs to be set together with the chunk unit size to make sense. + /// When set to null, it indicates that context length is unlimited. 
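A hedged sketch of driving the MicroBatchScheduler and ContextChunkingConfig declared above. The chunk unit size, token budget, and batch size are illustrative; the chunking-policy enumerator is taken from the executor API; and the tuple returned by operator() is assumed to split the batch into context-phase and generation-phase requests (its element types are elided in the header text shown).

```cpp
#include <tuple>

#include "tensorrt_llm/batch_manager/microBatchScheduler.h"

namespace tb = tensorrt_llm::batch_manager;

std::tuple<tb::RequestVector, tb::RequestVector> pickMicroBatch(
    tb::RequestVector const& activeRequests, tb::ReqIdsSet const& inflightReqIds)
{
    tb::batch_scheduler::ContextChunkingConfig chunkCfg;
    chunkCfg.chunkingPolicy = tensorrt_llm::executor::ContextChunkingPolicy::kFIRST_COME_FIRST_SERVED;
    chunkCfg.chunkUnitSize = 64; // typically the KV-cache block size or a multiple of it

    auto scheduler = tb::MicroBatchScheduler::make(
        /*maxBatchSize=*/32, /*maxNumTokens=*/8192, chunkCfg, /*maxContextLength=*/std::nullopt);

    // Assumption: returns (context-phase requests, generation-phase requests).
    return scheduler(activeRequests, inflightReqIds);
}
```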
+ std::optional mMaxContextLength; + + std::optional mCtxChunkConfig; + + /// The state until/after which the scheduler should not schedule requests + LlmRequestState mNoScheduleUntilState; + LlmRequestState mNoScheduleAfterState; +}; + +} // namespace tensorrt_llm::batch_manager diff --git a/cpp/include/tensorrt_llm/batch_manager/peftCacheManager.h b/cpp/include/tensorrt_llm/batch_manager/peftCacheManager.h index 65808134b..f86e76b4b 100644 --- a/cpp/include/tensorrt_llm/batch_manager/peftCacheManager.h +++ b/cpp/include/tensorrt_llm/batch_manager/peftCacheManager.h @@ -51,6 +51,8 @@ class PeftTaskNotCachedException : public runtime::LoraExpectedException class BasePeftCacheManager { public: + friend class BasePeftCacheManagerBindings; + using LlmRequestPtr = std::shared_ptr; using RequestVector = std::vector; using PeftTable = std::map>>; diff --git a/cpp/include/tensorrt_llm/batch_manager/trtGptModelOptionalParams.h b/cpp/include/tensorrt_llm/batch_manager/trtGptModelOptionalParams.h index a1288c154..4a430d8c1 100644 --- a/cpp/include/tensorrt_llm/batch_manager/trtGptModelOptionalParams.h +++ b/cpp/include/tensorrt_llm/batch_manager/trtGptModelOptionalParams.h @@ -46,7 +46,9 @@ class TrtGptModelOptionalParams executor::SchedulerConfig const& schedulerConfig = executor::SchedulerConfig{}, executor::ExtendedRuntimePerfKnobConfig const& extendedRuntimePerfKnobConfig = executor::ExtendedRuntimePerfKnobConfig{}, - std::optional debugConfig = std::nullopt, uint64_t maxSeqIdleMicroseconds = 180000000) + std::optional debugConfig = std::nullopt, uint64_t maxSeqIdleMicroseconds = 180000000, + std::optional specDecConfig = std::nullopt, + bool isLeaderInOrchMode = false) : kvCacheConfig{kvCacheConfig} , enableTrtOverlap{enableTrtOverlap} , deviceIds(deviceIds) @@ -62,10 +64,12 @@ class TrtGptModelOptionalParams , extendedRuntimePerfKnobConfig(extendedRuntimePerfKnobConfig) , debugConfig{std::move(debugConfig)} , maxSeqIdleMicroseconds{maxSeqIdleMicroseconds} + , speculativeDecodingConfig{std::move(specDecConfig)} + , isLeaderInOrchMode{isLeaderInOrchMode} { } - explicit TrtGptModelOptionalParams(executor::ExecutorConfig const& executorConfig) + explicit TrtGptModelOptionalParams(executor::ExecutorConfig const& executorConfig, bool isLeaderInOrchMode) : TrtGptModelOptionalParams(KvCacheConfig(executorConfig.getKvCacheConfig()), false, executorConfig.getParallelConfig().value_or(executor::ParallelConfig()).getDeviceIds(), executorConfig.getNormalizeLogProbs(), executorConfig.getEnableChunkedContext(), @@ -74,7 +78,7 @@ class TrtGptModelOptionalParams executorConfig.getGpuWeightsPercent(), executorConfig.getMaxBeamWidth(), executorConfig.getMaxBatchSize(), executorConfig.getMaxNumTokens(), executorConfig.getSchedulerConfig(), executorConfig.getExtendedRuntimePerfKnobConfig(), executorConfig.getDebugConfig(), - executorConfig.getMaxSeqIdleMicroseconds()) + executorConfig.getMaxSeqIdleMicroseconds(), executorConfig.getSpecDecConfig(), isLeaderInOrchMode) { } @@ -94,6 +98,8 @@ class TrtGptModelOptionalParams && extendedRuntimePerfKnobConfig == other.extendedRuntimePerfKnobConfig // && debugConfig == other.debugConfig // && maxSeqIdleMicroseconds == other.maxSeqIdleMicroseconds // + && speculativeDecodingConfig == other.speculativeDecodingConfig // + && isLeaderInOrchMode == other.isLeaderInOrchMode // ; } @@ -117,6 +123,9 @@ class TrtGptModelOptionalParams std::optional debugConfig; // Sequence is considered idle if not updated for this amount of time. 
uint64_t maxSeqIdleMicroseconds; + std::optional speculativeDecodingConfig; + // This rank is the leader worker in orchestrator mode + bool isLeaderInOrchMode; }; } // namespace tensorrt_llm::batch_manager diff --git a/cpp/include/tensorrt_llm/common/algorithm.h b/cpp/include/tensorrt_llm/common/algorithm.h new file mode 100644 index 000000000..9363504f7 --- /dev/null +++ b/cpp/include/tensorrt_llm/common/algorithm.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace tensorrt_llm +{ + +// Base class for algorithms +struct Algorithm +{ + Algorithm() = default; + Algorithm(Algorithm&&) = default; + Algorithm& operator=(Algorithm&&) = default; + Algorithm(Algorithm const&) = delete; + Algorithm& operator=(Algorithm const&) = delete; +}; + +} // namespace tensorrt_llm diff --git a/cpp/include/tensorrt_llm/common/mpiUtils.h b/cpp/include/tensorrt_llm/common/mpiUtils.h index 4a7bb53ae..d5801f36c 100644 --- a/cpp/include/tensorrt_llm/common/mpiUtils.h +++ b/cpp/include/tensorrt_llm/common/mpiUtils.h @@ -99,7 +99,6 @@ struct MpiTypeConverter }; template <> - struct MpiTypeConverter { @@ -387,6 +386,7 @@ class MpiComm void barrier() const; void mprobe(int source, int tag, MPI_Message* msg, MPI_Status* status) const; + bool improbe(int source, int tag, MPI_Message* msg, MPI_Status* status) const; //! \brief Returns if a message with the specified source and tag is available bool iprobe(int source, int tag, MPI_Status* status) const; diff --git a/cpp/include/tensorrt_llm/executor/executor.h b/cpp/include/tensorrt_llm/executor/executor.h index 807382c4a..bd832d153 100644 --- a/cpp/include/tensorrt_llm/executor/executor.h +++ b/cpp/include/tensorrt_llm/executor/executor.h @@ -186,11 +186,13 @@ class ExternalDraftTokensConfig { public: explicit ExternalDraftTokensConfig(VecTokens tokens, std::optional logits = std::nullopt, - std::optional const& acceptanceThreshold = std::nullopt); + std::optional const& acceptanceThreshold = std::nullopt, + std::optional const& fastLogits = std::nullopt); [[nodiscard]] VecTokens getTokens() const; [[nodiscard]] std::optional getLogits() const; [[nodiscard]] std::optional getAcceptanceThreshold() const; + [[nodiscard]] std::optional getFastLogits() const; private: friend class Serialization; @@ -200,6 +202,8 @@ class ExternalDraftTokensConfig std::optional mLogits; /// @brief The acceptance threshold. 
Must be > 0.f and <= 1.f std::optional mAcceptanceThreshold; + /// @brief Use direct transfer for draft logits + std::optional mFastLogits; }; /// @brief Configuration for prompt tuning @@ -318,6 +322,18 @@ class ContextPhaseParams StatePtr mState{nullptr, deleter}; }; +/// @brief Configuration for speculative decoding (both draft and target models) +class SpeculativeDecodingConfig +{ +public: + explicit SpeculativeDecodingConfig(bool fastLogits); + + bool operator==(SpeculativeDecodingConfig const& other) const; + + /// @brief Send logits tensor directly from draft to target model. + bool fastLogits; +}; + /// @brief A class that holds information about the request class Request { @@ -437,6 +453,16 @@ class Request std::unique_ptr mImpl; }; +/// @brief Struct that holds the logits information when using direct transfer +struct SpeculativeDecodingFastLogitsInfo +{ + /// @brief Draft request id + uint64_t draftRequestId; + + /// @brief MPI world rank of the draft model leader + int32_t draftParticipantId; +}; + /// @brief Struct that holds the generation result struct Result { @@ -455,11 +481,14 @@ struct Result /// @brief The context logits. Size [promptLen, vocabSizePadded] std::optional contextLogits; - /// @brief The context logits. Size [beamSize, maxNewTokens, vocabSizePadded] (non-streaming) + /// @brief The generation logits. Size [beamSize, maxNewTokens, vocabSizePadded] (non-streaming) /// or [maxNewTokens, beamSize, vocabSizePadded] (streaming and allGeneratedTokens) /// or [1, beamSize, vocabSizePadded] (streaming and non-allGeneratedTokens) std::optional generationLogits; + /// @brief Logits information for direct transfer when using fast logits + std::optional specDecFastLogitsInfo; + /// @brief The encoder output. Size [encoderLen, hiddenSize] std::optional encoderOutput; @@ -484,8 +513,8 @@ struct Result class Response { public: - Response(IdType requestId, std::string errorMsg); - Response(IdType requestId, Result Result); + Response(IdType requestId, std::string errorMsg, std::optional clientId = std::nullopt); + Response(IdType requestId, Result Result, std::optional clientId = std::nullopt); ~Response(); Response(Response const& other); @@ -496,6 +525,9 @@ class Response /// @brief Get the id of the request for which this response was generated [[nodiscard]] IdType getRequestId() const; + /// @brief Get the client id of the request for which this response was generated + [[nodiscard]] std::optional getClientId() const; + /// @brief Indicates if this response has an error or not [[nodiscard]] bool hasError() const; @@ -873,7 +905,8 @@ class ExecutorConfig std::optional maxQueueSize = std::nullopt, ExtendedRuntimePerfKnobConfig const& extendedRuntimePerfKnobConfig = ExtendedRuntimePerfKnobConfig(), std::optional debugConfig = std::nullopt, SizeType32 recvPollPeriodMs = 0, - uint64_t maxSeqIdleMicroseconds = 180000000); + uint64_t maxSeqIdleMicroseconds = 180000000, + std::optional specDecConfig = std::nullopt); [[nodiscard]] SizeType32 getMaxBeamWidth() const; [[nodiscard]] SchedulerConfig getSchedulerConfig() const; @@ -895,6 +928,7 @@ class ExecutorConfig [[nodiscard]] std::optional getDebugConfig() const; [[nodiscard]] SizeType32 getRecvPollPeriodMs() const; [[nodiscard]] uint64_t getMaxSeqIdleMicroseconds() const; + [[nodiscard]] std::optional getSpecDecConfig() const; void setMaxBeamWidth(SizeType32 maxBeamWidth); void setMaxBatchSize(SizeType32 maxBatchSize); @@ -916,6 +950,7 @@ class ExecutorConfig void setDebugConfig(DebugConfig const& debugConfig); void 
setRecvPollPeriodMs(SizeType32 const& recvPollPeriodMs); void setMaxSeqIdleMicroseconds(uint64_t maxNumTokens); + void setSpecDecConfig(SpeculativeDecodingConfig const& specDecConfig); private: friend class Serialization; @@ -978,6 +1013,9 @@ class ExecutorConfig /// @brief The maximum time in microseconds a scheduled request can remain idle before getting terminated. Default /// is 3 minutes. uint64_t mMaxSeqIdleMicroseconds; + + /// @brief The speculative decoding configuration + std::optional mSpeculativeDecodingConfig; }; /// @brief The executor is responsible for receiving new requests and sending responses, and running the inference @@ -1080,6 +1118,9 @@ class Executor /// @brief Indicates if the current process is allowed to enqueueRequests [[nodiscard]] bool canEnqueueRequests() const; + /// @brief Indicates if the current process participates in this executor instance + [[nodiscard]] bool isParticipant() const; + private: class Impl; std::unique_ptr mImpl; diff --git a/cpp/include/tensorrt_llm/executor/serialization.h b/cpp/include/tensorrt_llm/executor/serialization.h index 9fe197dc9..28aba9dc1 100644 --- a/cpp/include/tensorrt_llm/executor/serialization.h +++ b/cpp/include/tensorrt_llm/executor/serialization.h @@ -95,6 +95,11 @@ class Serialization static void serialize(Tensor const& tensor, std::ostream& os); [[nodiscard]] static size_t serializedSize(Tensor const& tensor); + // SpeculativeDecodingFastLogitsInfo + [[nodiscard]] static SpeculativeDecodingFastLogitsInfo deserializeSpecDecFastLogitsInfo(std::istream& is); + static void serialize(SpeculativeDecodingFastLogitsInfo const& info, std::ostream& os); + [[nodiscard]] static size_t serializedSize(SpeculativeDecodingFastLogitsInfo const& info); + // Result [[nodiscard]] static Result deserializeResult(std::istream& is); static void serialize(Result const& result, std::ostream& os); diff --git a/cpp/include/tensorrt_llm/executor/types.h b/cpp/include/tensorrt_llm/executor/types.h index a2476caff..5a8525caf 100644 --- a/cpp/include/tensorrt_llm/executor/types.h +++ b/cpp/include/tensorrt_llm/executor/types.h @@ -446,6 +446,11 @@ class DecodingMode return DecodingMode{kExplicitDraftTokens | kStandardStopCriteria | kUseExplicitEosStop}; } + static auto constexpr ExternalDraftTokens() + { + return DecodingMode{kExternalDraftTokens | kUsePenalties | kUseBanTokens | kStandardStopCriteria}; + } + auto constexpr useTemperature(bool useTemp) { mState = setBitTo(kUseTemperature, useTemp); @@ -563,6 +568,11 @@ class DecodingMode return anyBitSet(kExplicitDraftTokens); } + [[nodiscard]] bool constexpr isExternalDraftTokens() const + { + return anyBitSet(kExternalDraftTokens); + } + [[nodiscard]] bool constexpr isUseTemperature() const { return anyBitSet(kUseTemperature); @@ -676,6 +686,7 @@ class DecodingMode static UnderlyingType constexpr kMedusa{1u << (kNumFlags + 4)}; static UnderlyingType constexpr kLookahead{1u << (kNumFlags + 5)}; static UnderlyingType constexpr kExplicitDraftTokens{1u << (kNumFlags + 6)}; + static UnderlyingType constexpr kExternalDraftTokens{1u << (kNumFlags + 7)}; static UnderlyingType constexpr kTopKTopP{kTopK | kTopP}; [[nodiscard]] bool constexpr anyBitSet(UnderlyingType bits) const @@ -706,6 +717,7 @@ static_assert(!DecodingMode::Auto().isBeamSearch()); static_assert(!DecodingMode::Auto().isMedusa()); static_assert(!DecodingMode::Auto().isLookahead()); static_assert(!DecodingMode::Auto().isExplicitDraftTokens()); +static_assert(!DecodingMode::Auto().isExternalDraftTokens()); 
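Taken together, the executor-side additions above define the public surface for the new external-draft-token path: ExternalDraftTokensConfig gains an optional fastLogits flag, ExecutorConfig gains an optional SpeculativeDecodingConfig, responses carry an optional client id, and DecodingMode learns an ExternalDraftTokens() mode whose feature bits are asserted below. The following sketch is illustrative only; it wires these pieces together using just the constructors and setters visible in this patch, and it assumes ExecutorConfig's remaining parameters keep their defaults.

// Illustrative sketch (not part of the patch): configuring the target-model
// executor for external draft tokens with the fast-logits transfer enabled.
#include <optional>
#include <utility>

#include "tensorrt_llm/executor/executor.h"

namespace texec = tensorrt_llm::executor;

// Enable the direct draft->target logits transfer on the executor.
texec::ExecutorConfig makeTargetExecutorConfig()
{
    texec::ExecutorConfig config; // all other options left at their defaults (assumption)
    config.setSpecDecConfig(texec::SpeculativeDecodingConfig(/*fastLogits=*/true));
    return config;
}

// Per-request draft tokens: no explicit logits tensor is attached here, since
// fastLogits asks for the logits directly from the draft model's leader rank.
texec::ExternalDraftTokensConfig makeDraftTokensConfig(texec::VecTokens draftTokens)
{
    return texec::ExternalDraftTokensConfig(std::move(draftTokens),
        /*logits=*/std::nullopt, /*acceptanceThreshold=*/0.5f, /*fastLogits=*/true);
}

// The new decoding mode advertises exactly the capabilities asserted in types.h.
static_assert(texec::DecodingMode::ExternalDraftTokens().isExternalDraftTokens());
static_assert(!texec::DecodingMode::ExternalDraftTokens().isBeamSearch());
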
static_assert(DecodingMode::TopK().isTopK()); static_assert(DecodingMode::TopK().isTopKorTopP()); @@ -726,6 +738,7 @@ static_assert(!DecodingMode::TopK().isBeamSearch()); static_assert(!DecodingMode::TopK().isMedusa()); static_assert(!DecodingMode::TopK().isLookahead()); static_assert(!DecodingMode::TopK().isExplicitDraftTokens()); +static_assert(!DecodingMode::TopK().isExternalDraftTokens()); static_assert(DecodingMode::TopP().isTopP()); static_assert(DecodingMode::TopP().isTopKorTopP()); @@ -739,6 +752,7 @@ static_assert(!DecodingMode::TopP().isBeamSearch()); static_assert(!DecodingMode::TopP().isMedusa()); static_assert(!DecodingMode::TopP().isLookahead()); static_assert(!DecodingMode::TopP().isExplicitDraftTokens()); +static_assert(!DecodingMode::TopP().isExternalDraftTokens()); static_assert(DecodingMode::TopKTopP().isTopK()); static_assert(DecodingMode::TopKTopP().isTopP()); @@ -752,6 +766,7 @@ static_assert(!DecodingMode::TopKTopP().isBeamSearch()); static_assert(!DecodingMode::TopKTopP().isMedusa()); static_assert(!DecodingMode::TopKTopP().isLookahead()); static_assert(!DecodingMode::TopKTopP().isExplicitDraftTokens()); +static_assert(!DecodingMode::TopKTopP().isExternalDraftTokens()); static_assert(DecodingMode::BeamSearch().isBeamSearch()); static_assert(DecodingMode::BeamSearch().isUseStopCriteria()); @@ -760,6 +775,7 @@ static_assert(!DecodingMode::BeamSearch().isTopKorTopP()); static_assert(!DecodingMode::BeamSearch().isMedusa()); static_assert(!DecodingMode::BeamSearch().isLookahead()); static_assert(!DecodingMode::BeamSearch().isExplicitDraftTokens()); +static_assert(!DecodingMode::BeamSearch().isExternalDraftTokens()); static_assert(!DecodingMode::Medusa().isAuto()); static_assert(!DecodingMode::Medusa().isTopK()); @@ -775,6 +791,7 @@ static_assert(DecodingMode::Medusa().isUseStopCriteria()); static_assert(DecodingMode::Medusa().isUsePenalty()); static_assert(DecodingMode::Medusa().isUseMinLength()); static_assert(DecodingMode::Medusa().isMedusa()); +static_assert(!DecodingMode::Medusa().isExternalDraftTokens()); static_assert(!DecodingMode::Lookahead().isAuto()); static_assert(!DecodingMode::Lookahead().isTopK()); @@ -788,6 +805,7 @@ static_assert(DecodingMode::Lookahead().isUseStopCriteria()); static_assert(DecodingMode::Lookahead().isUseStopWords()); static_assert(DecodingMode::Lookahead().isUseExplicitEosStop()); static_assert(DecodingMode::Lookahead().isLookahead()); +static_assert(!DecodingMode::Lookahead().isExternalDraftTokens()); static_assert(!DecodingMode::ExplicitDraftTokens().isAuto()); static_assert(!DecodingMode::ExplicitDraftTokens().isTopK()); @@ -801,4 +819,19 @@ static_assert(!DecodingMode::ExplicitDraftTokens().isUsePenalty()); static_assert(DecodingMode::ExplicitDraftTokens().isUseStopCriteria()); static_assert(!DecodingMode::ExplicitDraftTokens().isUseBanWords()); static_assert(DecodingMode::ExplicitDraftTokens().isExplicitDraftTokens()); +static_assert(!DecodingMode::ExplicitDraftTokens().isExternalDraftTokens()); + +static_assert(!DecodingMode::ExternalDraftTokens().isTopK()); +static_assert(!DecodingMode::ExternalDraftTokens().isTopP()); +static_assert(!DecodingMode::ExternalDraftTokens().isTopKorTopP()); +static_assert(!DecodingMode::ExternalDraftTokens().isTopKandTopP()); +static_assert(DecodingMode::ExternalDraftTokens().isUseBanWords()); +static_assert(DecodingMode::ExternalDraftTokens().isUseOccurrencePenalty()); +static_assert(DecodingMode::ExternalDraftTokens().isUseStopCriteria()); 
+static_assert(!DecodingMode::ExternalDraftTokens().isAuto()); +static_assert(!DecodingMode::ExternalDraftTokens().isBeamSearch()); +static_assert(!DecodingMode::ExternalDraftTokens().isMedusa()); +static_assert(!DecodingMode::ExternalDraftTokens().isLookahead()); +static_assert(!DecodingMode::ExternalDraftTokens().isExplicitDraftTokens()); +static_assert(DecodingMode::ExternalDraftTokens().isExternalDraftTokens()); } // namespace tensorrt_llm::executor diff --git a/cpp/include/tensorrt_llm/runtime/decodingInput.h b/cpp/include/tensorrt_llm/runtime/decodingInput.h index 68ebf0547..630617b11 100644 --- a/cpp/include/tensorrt_llm/runtime/decodingInput.h +++ b/cpp/include/tensorrt_llm/runtime/decodingInput.h @@ -108,6 +108,20 @@ class DecodingInput TensorConstPtr medusaTargetTokensPerStep; //!< [batchSize], on gpu }; + class ExternalDraftTokensInputs + { + public: + TensorPtr draftLogits; + TensorPtr draftProbs; + TensorPtr targetProbs; + TensorPtr numDraftTokens; + TensorPtr draftTokenIds; + TensorPtr useDraftLogits; + SizeType32 step; + float constantThreshold; + bool useRandomAcceptanceThreshold; + }; + class ExplicitDraftTokensInputs { public: @@ -138,6 +152,8 @@ class DecodingInput std::optional explicitDraftTokensInputs; std::optional lookaheadInputs; + + std::optional externalDraftTokensInputs; }; } // namespace tensorrt_llm::runtime diff --git a/cpp/include/tensorrt_llm/runtime/decodingOutput.h b/cpp/include/tensorrt_llm/runtime/decodingOutput.h index 41062c25f..50a76588a 100644 --- a/cpp/include/tensorrt_llm/runtime/decodingOutput.h +++ b/cpp/include/tensorrt_llm/runtime/decodingOutput.h @@ -95,7 +95,7 @@ class DecodingOutput // mandatory parameters for beam search TensorPtr logProbs; // [BS, BM, MSL], must be float* TensorPtr cumLogProbs; // [BS, BM], optional for sampling - TensorPtr parentIds; // [BS, BM, MSL] + TensorPtr parentIds; // [BS, BM, MSL] index of the beam where the previous token is TensorPtr lengths; // [BS, BM], total sequence lengths including padding TensorPtr cacheIndirection; // [BS, BM, MSL], k/v indirection for next generation step diff --git a/cpp/include/tensorrt_llm/runtime/gptDecoder.h b/cpp/include/tensorrt_llm/runtime/gptDecoder.h index 7e031ca70..f12362ece 100644 --- a/cpp/include/tensorrt_llm/runtime/gptDecoder.h +++ b/cpp/include/tensorrt_llm/runtime/gptDecoder.h @@ -64,16 +64,6 @@ class IGptDecoder virtual SamplingConfig const& getSamplingConfig() = 0; - static void acceptDraftTokensByIds(ITensor const& targetTokenIds, ITensor const& draftTokenIds, - ITensor const& contextLengths, ITensor const& numDraftTokens, ITensor& sequenceLengths, - ITensor const& finishedVec, ITensor& finishedFinal, ITensor& finishedSum, ITensor const& batchSlots, - BufferManager::CudaStreamPtr const& stream); - - static void acceptDraftTokensByLogits(ITensor& draftLogits, ITensor const& targetLogits, ITensor& draftProbs, - ITensor& targetProbs, ITensor const& numDraftTokens, ITensor& finished, ITensor const& batchSlots, - SizeType32 vocabSize, SizeType32 vocabSizePadded, bool useRandomAcceptThreshold, float randomAcceptThreshold, - curandState_t* curandState, BufferManager::CudaStreamPtr const& stream); - static std::unique_ptr create(executor::DecodingMode const& mode, nvinfer1::DataType dtype, size_t maxBatchSize, size_t maxBeamWidth, size_t vocabSize, size_t vocabSizePadded, size_t maxSequenceLength, BufferManager::CudaStreamPtr const& stream, diff --git a/cpp/include/tensorrt_llm/runtime/gptDecoderBatched.h b/cpp/include/tensorrt_llm/runtime/gptDecoderBatched.h index 
2db8fcc18..50bd89924 100644 --- a/cpp/include/tensorrt_llm/runtime/gptDecoderBatched.h +++ b/cpp/include/tensorrt_llm/runtime/gptDecoderBatched.h @@ -245,7 +245,7 @@ class GptDecoderBatched : public IGptDecoderBatched void newRequest(SizeType32 batchSlot, decoder_batch::Request const& request, SamplingConfig const& samplingConfig); //! @brief Allocate buffers for speculative decoding. - void allocateSpeculativeDecodingBuffers(); + void allocateSpeculativeDecodingBuffers(nvinfer1::DataType dtype); //! @brief Setup buffers for speculative decoding. void setupSpeculativeDecoding(ModelConfig const& modelConfig); @@ -300,10 +300,6 @@ class GptDecoderBatched : public IGptDecoderBatched DecodingInputPtr mJointDecodingInput; DecodingOutputPtr mJointDecodingOutput; - std::vector mAcceptByLogits; - TensorPtr mNumDraftTokens; - TensorPtr mCurandStates; - std::vector mNbSteps; std::vector mFinished; TensorPtr mFinishedSum; @@ -313,18 +309,9 @@ class GptDecoderBatched : public IGptDecoderBatched TensorPtr mFinishedSteps; // [maxTokensPerStep, batchSize, beamWidth] finished states of type FinishedState // for each generated token of maxTokensPerStep, on gpu - TensorPtr mDraftProbs; // [batchSize, maxTokensPerEngineStep, beamWidth, vocabPadded], temporary data for - // speculative decoding accept by logits kernel, on gpu - TensorPtr mTargetProbs; // [batchSize, maxTokensPerEngineStep, beamWidth, vocabPadded], temporary data for - // speculative decoding accept by logits kernel, on gpu - TensorPtr mDraftTokenIds; // [batchSize, maxTokensPerEngineStep], draft token indices, on gpu - TensorPtr mDraftLogits; // [batchSize, maxTokensPerEngineStep, vocabSizePadded], draft token logits, on gpu TensorPtr mBatchSlotsSetup; // [maxBatchSize], int32_t, address map, pinned TensorPtr mBatchSlotsDecoder; // [maxTokensPerEngineStep, maxBatchSize], int32_t, address map, pinned - TensorPtr mBatchSlotsAcceptTokens; // [maxTokensPerEngineStep, maxBatchSize], int32_t, address map, pinned - TensorPtr mBatchSlotsAcceptLogits; // [maxTokensPerEngineStep, maxBatchSize], int32_t, address map, pinned - TensorPtr mTargetLogitsPtrs; // [maxBatchSize], float*, pointers to target logits, pinned SizeType32 mMaxSequenceLength{}; SizeType32 mMaxAttentionWindow{}; SizeType32 mSinkTokenLength{}; diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a index c54da94de..d04677b80 100644 --- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a +++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:10b940475c5acd80a61674d8ce4e42cc4ef3d806bafb245bbed26751378274e3 -size 4904726 +oid sha256:1a292517d802f2297c5d12d5d14ab597f47f46ebd31412fac044ceb9ca51a482 +size 5160586 diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a index ac692bb61..462c03949 100644 --- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b2754f7887a1b5c37ba3d589320e16144039cfe5dc6a6c78ee71925861d7d511 -size 5015842 +oid 
sha256:8575fb58200701ae30feb4b8bd3f325f8018aac5505167fdba42e269adb3bd8c +size 5271836 diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt index c8b35c9c0..aff5e53bd 100644 --- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -ff71eabd0ac6ede5398b5b6ce4e26dcf libtensorrt_llm_batch_manager_static.a -846eb112a182973e7c3b0b193300b4b8 libtensorrt_llm_batch_manager_static.pre_cxx11.a -7f370deb0090d885d7518c2b146399ba3933c004 commit \ No newline at end of file +954182e0c057f71f858a84f746201044 libtensorrt_llm_batch_manager_static.a +dfe6ca360cf1d24a3dcae0a2bf8589c0 libtensorrt_llm_batch_manager_static.pre_cxx11.a +4dbf696ae9b74a26829d120b67ab8443d70c8e58 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a index 2b867222c..4e5be000e 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a +++ b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:13b8701dd767b414a5376a91905985979ad9d2b975465ac00835c04656ee6508 -size 4766226 +oid sha256:8fe84073b7ccff8dc361fdee64c3ef30bc523909e0bf9c16547f76a05a53fb5c +size 5009886 diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a index 64680e7ae..46d8c1b5c 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cd0b73a017fc5c663235dcd724eb104ecc49d12ff29b6e3744be6ea952d027db -size 4722522 +oid sha256:6e565c2c3ce58656742772591d992aca91c7e46eb9fc711599d2d51928b88b48 +size 4970532 diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/version.txt b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/version.txt index 833efc826..2c9c2852f 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -1eb5c88f894f3361445d7254cbc29b03 libtensorrt_llm_batch_manager_static.a -4e73341b23e8fb20b732ba08e03a54a8 libtensorrt_llm_batch_manager_static.pre_cxx11.a -7f370deb0090d885d7518c2b146399ba3933c004 commit \ No newline at end of file +61fd34e765788884d42f4ba27f085520 libtensorrt_llm_batch_manager_static.a +e8a64dd19a234304483ef6756e67fd40 libtensorrt_llm_batch_manager_static.pre_cxx11.a +4dbf696ae9b74a26829d120b67ab8443d70c8e58 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib b/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib index 9fd773218..d1664c2e8 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib +++ b/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b4ac61c0b0816477c11bd6c66ec4c2f23f7b6e1400eacd8c07c333f79dec0bea -size 
30794956 +oid sha256:200a6721aa1d6e009c94866adab36ac686eb1beef02df267af7e18e31e11612b +size 32436708 diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/version.txt b/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/version.txt index db6e80406..45482c43b 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/version.txt +++ b/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/version.txt @@ -1,2 +1,2 @@ -eefe7310a60098897724f46cf4aa54f8 tensorrt_llm_batch_manager_static.lib -7f370deb0090d885d7518c2b146399ba3933c004 commit \ No newline at end of file +9485cfa635b17378f23d1624b3acfbaf tensorrt_llm_batch_manager_static.lib +4dbf696ae9b74a26829d120b67ab8443d70c8e58 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/common/customAllReduceUtils.h b/cpp/tensorrt_llm/common/customAllReduceUtils.h index 9f2d93316..d7bf43b40 100644 --- a/cpp/tensorrt_llm/common/customAllReduceUtils.h +++ b/cpp/tensorrt_llm/common/customAllReduceUtils.h @@ -21,7 +21,7 @@ namespace tensorrt_llm::utils::customAllReduceUtils { -constexpr size_t NUM_POINTERS_PER_RANK = 4; +constexpr size_t NUM_POINTERS_PER_RANK = 7; // WARNING: MUST BE KEPT IN SYNC with tensorrt_llm/plugin/plugin.py inline size_t getMaxRequiredWorkspaceSize(int worldSize) noexcept diff --git a/cpp/tensorrt_llm/common/mpiUtils.cpp b/cpp/tensorrt_llm/common/mpiUtils.cpp index b637e57f1..be1de0a9e 100644 --- a/cpp/tensorrt_llm/common/mpiUtils.cpp +++ b/cpp/tensorrt_llm/common/mpiUtils.cpp @@ -335,6 +335,18 @@ void MpiComm::mprobe(int source, int tag, MPI_Message* msg, MPI_Status* status) #endif // ENABLE_MULTI_DEVICE } +bool MpiComm::improbe(int source, int tag, MPI_Message* msg, MPI_Status* status) const +{ +#if ENABLE_MULTI_DEVICE + int flag{0}; + MPICHECK(MPI_Improbe(source, tag, mComm, &flag, msg, status)); + return flag != 0; +#else + TLLM_THROW("Multi device support is disabled."); + return false; +#endif +} + bool MpiComm::iprobe(int source, int tag, MPI_Status* status) const { #if ENABLE_MULTI_DEVICE diff --git a/cpp/tensorrt_llm/common/reduceKernelUtils.cuh b/cpp/tensorrt_llm/common/reduceKernelUtils.cuh index 979d8dd6f..c5a4fe0e2 100644 --- a/cpp/tensorrt_llm/common/reduceKernelUtils.cuh +++ b/cpp/tensorrt_llm/common/reduceKernelUtils.cuh @@ -38,6 +38,12 @@ namespace common template struct BytesToType; +template <> +struct BytesToType<1> +{ + using type = uint8_t; +}; + template <> struct BytesToType<2> { diff --git a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a index d7f58205a..26b60736a 100644 --- a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a +++ b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ebab2cc2c62a826ddec02597178b8e0c9bc316726f37f8eef37c06795aebcf03 -size 1784658 +oid sha256:809a1da76123ec4c640d63efc902209585223b66e23d887db9a198c5836986a2 +size 3349066 diff --git a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a index b8e5962bf..007fa3207 100644 --- a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:4b630f89708614e63c67871e21b6e32bfde71acc51549b650c57048c0fa343e7 -size 1812686 +oid sha256:6846ecefa017d03ab7d853908794c884ab4e92a500e223278b1d64eab59ed061 +size 3376088 diff --git a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt index a4434f2dd..4a30230b9 100644 --- a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -136f1b9d2168cbb9011a341b267af9a2 libtensorrt_llm_executor_static.a -183bd079377d6cd698d46370168a5726 libtensorrt_llm_executor_static.pre_cxx11.a -7f370deb0090d885d7518c2b146399ba3933c004 commit \ No newline at end of file +5a771664fdb75d99ba5fb90249ac26f0 libtensorrt_llm_executor_static.a +3b433ea93b7d1d6fa471b457980f2680 libtensorrt_llm_executor_static.pre_cxx11.a +4dbf696ae9b74a26829d120b67ab8443d70c8e58 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a index d1c437693..7584b1fe6 100644 --- a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a +++ b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e04c76f6441a49db4d3996c62b4055395ae018384d8ee2f02ea5f0c4c0843902 -size 1853180 +oid sha256:479e86f410763445357f5d879cc666d210352dda9709ab5ab56e73591a9e8af8 +size 7851266 diff --git a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a index 61c25133c..0f764244d 100644 --- a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:95ba1a4b6bdcecbb592bbb42b4998bcb0eb1f45a318163635183bcde6950c4bf -size 1764982 +oid sha256:6473c77d18929fa75342d63ffc591df39e8aeba1dda0b920b0187d4888710559 +size 7767384 diff --git a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/version.txt b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/version.txt index ad7ba2bf9..4baf60ba7 100644 --- a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -dfbd0d424c150253ff758aa5bd37a971 libtensorrt_llm_executor_static.a -e82866739fef1d6df8293541967924bf libtensorrt_llm_executor_static.pre_cxx11.a -7f370deb0090d885d7518c2b146399ba3933c004 commit \ No newline at end of file +5424fb0f82076e03b5316f73aed04434 libtensorrt_llm_executor_static.a +d0b1236baf61fc5c43383bbc1cd50fa8 libtensorrt_llm_executor_static.pre_cxx11.a +4dbf696ae9b74a26829d120b67ab8443d70c8e58 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib b/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib index 2799dc524..efd7ecf87 100644 --- a/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib +++ b/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aa8ba34fb98c5407e3d6944245086158c61b2c784b15c7b923fdd156b942224d -size 19670642 +oid sha256:dee57c9257a6678833e3c0d83e8df07aff25c185bc085db75938cec6652044c0 +size 24568210 diff 
--git a/cpp/tensorrt_llm/executor/x86_64-windows-msvc/version.txt b/cpp/tensorrt_llm/executor/x86_64-windows-msvc/version.txt index d2e341ae7..681dc3284 100644 --- a/cpp/tensorrt_llm/executor/x86_64-windows-msvc/version.txt +++ b/cpp/tensorrt_llm/executor/x86_64-windows-msvc/version.txt @@ -1,2 +1,2 @@ -784ad1fabd3d02466f95fbc463b64f5b tensorrt_llm_executor_static.lib -7f370deb0090d885d7518c2b146399ba3933c004 commit \ No newline at end of file +305fac5d046a574ded2d46d968f746b0 tensorrt_llm_executor_static.lib +4dbf696ae9b74a26829d120b67ab8443d70c8e58 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernelsTemplate.h b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernelsTemplate.h index 8b44a419a..895a91483 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernelsTemplate.h +++ b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernelsTemplate.h @@ -630,7 +630,7 @@ void topKSoftMaxKernelLauncher(T const* logits, T const* bias, void* workspace, // ┃ pTemp ┃ BS * PAD_K * VP * (2 * (PAD_K * 2) + 2) | | float | // ┗━━━━━━━━━━┛ -------------------------------------------------------------------------------- - // Stage1: gridDim(BS,BM,nVPart), blockDim(nBlockSize,1,1) + // beamStage1Kernel: gridDim(BS,BM,nVPart), blockDim(nBlockSize,1,1) // Each ThreadBlock takes `nVocabChunk` contiguous elements in logits to do TopK and reduce_md, // then writes output into pTemp. // At end of this kernel, each ThreadBlock holds the indices and values of the top 2*BM elements, @@ -647,7 +647,7 @@ void topKSoftMaxKernelLauncher(T const* logits, T const* bias, void* workspace, // ┃ md ┃ 2 | 2 | float | // ┗━━━━━━━━━━┛ ----------------------------------------- - // Stage2: gridDim(BS,BM,1), blockDim(32/64/128,1,1) + // beamStage2Kernel: gridDim(BS,BM,1), blockDim(32/64/128,1,1) // Each TheadBlock takes `nVPart` contiguous Tiles in pTemp to do reduce_topk and reduce_md, // writes output topk_id into in pTempId, writes topk_value + cumLogProbs into pTempVal. diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp index 5857e927d..0c228692c 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp @@ -165,7 +165,7 @@ void FusedMHARunnerV2::setupKernelParams(MHARunnerParams runnerParams) // Use exp2f optimization for warp-specialized ws kernels on Hopper. if (mLaunchParams.useBase2ExpTrick) { - // The kernel adopts the log2f optimziation. + // The kernel adopts the log2f optimization. constexpr float kLog2e = 1.4426950408889634074; // log_2(e) = M_LOG2E set_alpha(mKernelParams.scale_bmm1, scale_bmm1 * float(kLog2e), DATA_TYPE_FP32); } @@ -364,8 +364,8 @@ void FusedMHARunnerV2::setupLaunchParams(MHARunnerParams runnerParams) void FusedMHARunnerV2::setPackedQkvTmaDescriptors(MHARunnerParams runnerParams) { // split D into multiple groups in order to match the TMA swizzle mode (128B) - const uint32_t d_in_bytes = get_size_in_bytes(mLaunchParams.padded_d, mFixedParams.dataType); - const uint32_t d_groups = d_in_bytes > 128 ? d_in_bytes / 128 : 1; + uint32_t const d_in_bytes = get_size_in_bytes(mLaunchParams.padded_d, mFixedParams.dataType); + uint32_t const d_groups = d_in_bytes > 128 ? 
d_in_bytes / 128 : 1; // separate q, k, v and o tma descriptors Multiple_tma_descriptor<4> qkv_tma_descriptor; @@ -421,8 +421,8 @@ void FusedMHARunnerV2::setPackedQkvTmaDescriptors(MHARunnerParams runnerParams) uint32_t fp32_to_tf32 = 0; // gmma descriptor mode - const uint32_t d_bytes_per_group = d_in_bytes / d_groups; - const cudaTmaDescSwizzle swizzle_mode = (d_bytes_per_group > 64 + uint32_t const d_bytes_per_group = d_in_bytes / d_groups; + cudaTmaDescSwizzle const swizzle_mode = (d_bytes_per_group > 64 ? cudaTmaDescSwizzle::SWIZZLE_128B : (d_bytes_per_group > 32 ? cudaTmaDescSwizzle::SWIZZLE_64B : cudaTmaDescSwizzle::SWIZZLE_32B)); @@ -474,8 +474,8 @@ void FusedMHARunnerV2::setPackedQkvTmaDescriptors(MHARunnerParams runnerParams) void FusedMHARunnerV2::setSeparateQKvTmaDescriptors(MHARunnerParams runnerParams) { // split D into multiple groups in order to match the TMA swizzle mode (128B) - const uint32_t d_in_bytes = get_size_in_bytes(mLaunchParams.padded_d, mFixedParams.dataType); - const uint32_t d_groups = d_in_bytes > 128 ? d_in_bytes / 128 : 1; + uint32_t const d_in_bytes = get_size_in_bytes(mLaunchParams.padded_d, mFixedParams.dataType); + uint32_t const d_groups = d_in_bytes > 128 ? d_in_bytes / 128 : 1; uint32_t q_step = 0, kv_step = 0; xmmaKernel->getStepSize(q_step, kv_step, mKernelParams, mLaunchParams); @@ -518,7 +518,7 @@ void FusedMHARunnerV2::setSeparateQKvTmaDescriptors(MHARunnerParams runnerParams = (get_size_in_bytes(mFixedParams.dataType) == 1) ? cudaTmaDescFormat::U8 : cudaTmaDescFormat::F16_RN; // gmma descriptor mode - const uint32_t d_bytes_per_group = d_in_bytes / d_groups; + uint32_t const d_bytes_per_group = d_in_bytes / d_groups; cudaTmaDescSwizzle const swizzle_mode = (d_bytes_per_group > 64 ? cudaTmaDescSwizzle::SWIZZLE_128B : (d_bytes_per_group > 32 ? 
cudaTmaDescSwizzle::SWIZZLE_64B : cudaTmaDescSwizzle::SWIZZLE_32B)); diff --git a/cpp/tensorrt_llm/kernels/customAllReduceKernels.cu b/cpp/tensorrt_llm/kernels/customAllReduceKernels.cu index 0f2a514bf..d84188139 100644 --- a/cpp/tensorrt_llm/kernels/customAllReduceKernels.cu +++ b/cpp/tensorrt_llm/kernels/customAllReduceKernels.cu @@ -17,8 +17,11 @@ #include "customAllReduceKernels.h" #include "tensorrt_llm/common/cudaBf16Fallbacks.cuh" #include "tensorrt_llm/common/cudaTypeUtils.cuh" +#include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/common/customAllReduceUtils.h" #include "tensorrt_llm/common/dataType.h" #include "tensorrt_llm/common/envUtils.h" +#include #include #include @@ -174,12 +177,6 @@ __inline__ __device__ void block_barrier(uint32_t** signals, uint32_t const flag namespace reduce_fusion { -namespace details -{ -static constexpr int kBytesPerAccess = 16; -static constexpr int kWarpSize = 32; -static constexpr int kMaxCtaSize = 1024; -}; // namespace details inline __device__ float warp_reduce_sum(float val) { @@ -318,7 +315,7 @@ __global__ void rms_norm_kernel(AllReduceParams params) } template -void rms_norm_kernel_launcher(AllReduceParams params, cudaStream_t stream) +void rms_norm_kernel_launcher(AllReduceParams& params, cudaStream_t stream) { static constexpr int kPackedSize = details::kBytesPerAccess / sizeof(T); TLLM_CHECK(params.fusion_params.hidden_size % kPackedSize == 0); @@ -387,6 +384,395 @@ void rms_norm_kernel_launcher(AllReduceParams params, cudaStream_t stream) } } +template +struct NegZero128b +{ + static constexpr int v = static_cast(0x80008000); + static constexpr int4 value = {v, v, v, v}; +}; + +template <> +struct NegZero128b +{ + static constexpr int v = static_cast(0x80000000); + static constexpr int4 value = {v, v, v, v}; +}; + +template +__device__ static constexpr int4 NegZero128b_v = NegZero128b::value; + +template +__device__ __forceinline__ bool is_neg_zero(T& v); + +template <> +__device__ __forceinline__ bool is_neg_zero(float& v) +{ + uint32_t bits = *reinterpret_cast(&v); + return bits == 0x80000000; +} + +template <> +__device__ __forceinline__ bool is_neg_zero(half& v) +{ + uint16_t bits = *reinterpret_cast(&v); + return bits == 0x8000; +} + +template <> +__device__ __forceinline__ bool is_neg_zero<__nv_bfloat16>(__nv_bfloat16& v) +{ + uint16_t bits = *reinterpret_cast(&v); + return bits == 0x8000; +} + +template +__device__ __forceinline__ VecType remove_neg_zero(VecType const& vec) +{ + static constexpr int kIter = sizeof(VecType) / sizeof(ValType); + using ReadOnlyValType = std::add_const_t; + VecType ret; +#pragma unroll + for (int i = 0; i < kIter; ++i) + { + auto val = reinterpret_cast(&vec)[i]; + reinterpret_cast(&ret)[i] = is_neg_zero(val) ? 
static_cast(0.f) : val; + } + return ret; +} + +template +__device__ __forceinline__ bool has_neg_zero(VecType const& vec) +{ + static constexpr int kIter = sizeof(VecType) / sizeof(ValType); + using ReadOnlyValType = std::add_const_t; +#pragma unroll + for (int i = 0; i < kIter; ++i) + { + auto val = reinterpret_cast(&vec)[i]; + if (is_neg_zero(val)) + { + return true; + } + } + return false; +} + +template +__device__ __forceinline__ bool all_neg_zero(VecType const& vec) +{ + static constexpr int kIter = sizeof(VecType) / sizeof(ValType); + using ReadOnlyValType = std::add_const_t; +#pragma unroll + for (int i = 0; i < kIter; ++i) + { + auto val = reinterpret_cast(&vec)[i]; + if (!is_neg_zero(val)) + { + return false; + } + } + return true; +} + +__device__ __forceinline__ void st_global_release(int4 const& val, int4* addr) +{ + asm volatile("st.release.global.sys.v4.b32 [%4], {%0, %1, %2, %3};" ::"r"(val.x), "r"(val.y), "r"(val.z), + "r"(val.w), "l"(addr)); +} + +__device__ __forceinline__ int4 ld_global_acquire(int4* addr) +{ + int4 val; + asm volatile("ld.acquire.global.sys.v4.b32 {%0, %1, %2, %3}, [%4];" + : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) + : "l"(addr)); + return val; +} + +__device__ __forceinline__ void st_global_volatile(int4 const& val, int4* addr) +{ + asm volatile("st.volatile.global.v4.b32 [%4], {%0, %1, %2, %3};" ::"r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w), + "l"(addr)); +} + +__device__ __forceinline__ int4 ld_global_volatile(int4* addr) +{ + int4 val; + asm volatile("ld.volatile.global.v4.b32 {%0, %1, %2, %3}, [%4];" + : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) + : "l"(addr)); + return val; +} + +template +__device__ __forceinline__ void set_neg_zero(int4* addr) +{ + st_global_volatile(NegZero128b_v, addr); +} + +template +struct Reducer; + +template +struct Reducer +{ + static __device__ __forceinline__ int4 allreduce(AllReduceParams& params, int global_offset) + { + using PackedStruct = typename PackedOn16Bytes::Type; + int ping = params.barrier_flag % 3; + int pong = (params.barrier_flag + 2) % 3; + T const* local_input_buffer = reinterpret_cast(params.local_input_buffer_ptr); + T* local_shared_buffer = reinterpret_cast( + params.fusion_params.lamport_peer_comm_buffer_ptrs[params.local_rank + ping * MAX_RANKS_PER_NODE]); + T* local_clean_buffer = reinterpret_cast( + params.fusion_params.lamport_peer_comm_buffer_ptrs[params.local_rank + pong * MAX_RANKS_PER_NODE]); + local_input_buffer += global_offset; + local_shared_buffer += global_offset; + local_clean_buffer += global_offset; + T* buffers[RanksPerNode]; +#pragma unroll + for (int ii = 0; ii < RanksPerNode; ++ii) + { + int rank = (params.local_rank + ii) % RanksPerNode; + buffers[ii] = reinterpret_cast( + params.fusion_params.lamport_peer_comm_buffer_ptrs[rank + ping * MAX_RANKS_PER_NODE]) + + global_offset + params.local_rank * params.elts_total; + } + PackedStruct sum_vec, val; + val.packed = remove_neg_zero(*reinterpret_cast(local_input_buffer)); +#pragma unroll + for (int ii = 1; ii < RanksPerNode; ++ii) + { + st_global_volatile(val.packed, reinterpret_cast(buffers[ii])); + } + sum_vec.packed = val.packed; +#pragma unroll + for (int ii = 1; ii < RanksPerNode; ++ii) + { + int rank = (params.local_rank + ii) % RanksPerNode; + set_neg_zero(reinterpret_cast(local_clean_buffer + rank * params.elts_total)); + } + PackedStruct vals[RanksPerNode - 1]; + bool done = false; + while (!done) + { + done = true; +#pragma unroll + for (int ii = 1; ii < RanksPerNode; ++ii) + { + int rank = 
(params.local_rank + ii) % RanksPerNode; + vals[ii - 1].packed + = ld_global_volatile(reinterpret_cast(local_shared_buffer + rank * params.elts_total)); + } +#pragma unroll + for (int ii = 0; ii < RanksPerNode - 1; ii++) + { + done &= !has_neg_zero(vals[ii].packed); + } + } + +#pragma unroll + for (int ii = 1; ii < RanksPerNode; ++ii) + { + sum_vec.packed = add128b(sum_vec, vals[ii - 1]); + } + return sum_vec.packed; + } +}; + +template +struct Reducer +{ + static __device__ __forceinline__ int4 allreduce(AllReduceParams& params, int global_offset) + { + using PackedStruct = typename PackedOn16Bytes::Type; + int ping = params.barrier_flag % 3; + int pong = (params.barrier_flag + 2) % 3; + T const* local_input_buffer = reinterpret_cast(params.local_input_buffer_ptr); + T* local_shared_buffer = reinterpret_cast( + params.fusion_params.lamport_peer_comm_buffer_ptrs[params.local_rank + ping * MAX_RANKS_PER_NODE]); + T* local_clean_buffer = reinterpret_cast( + params.fusion_params.lamport_peer_comm_buffer_ptrs[params.local_rank + pong * MAX_RANKS_PER_NODE]); + local_input_buffer += global_offset; + local_shared_buffer += global_offset; + local_clean_buffer += global_offset; + T* buffers[RanksPerNode]; +#pragma unroll + for (int ii = 0; ii < RanksPerNode; ++ii) + { + int rank = (params.local_rank + ii) % RanksPerNode; + buffers[ii] = reinterpret_cast( + params.fusion_params.lamport_peer_comm_buffer_ptrs[rank + ping * MAX_RANKS_PER_NODE]) + + global_offset; + } + PackedStruct sum_vec, val; + val.packed = remove_neg_zero(*reinterpret_cast(local_input_buffer)); + st_global_volatile(val.packed, reinterpret_cast(local_shared_buffer)); + sum_vec.packed = val.packed; +#pragma unroll + for (int ii = 1; ii < RanksPerNode; ++ii) + { + do + { + val.packed = ld_global_volatile(reinterpret_cast(buffers[ii])); + } while (has_neg_zero(val.packed)); + sum_vec.packed = add128b(sum_vec, val); + } + set_neg_zero(reinterpret_cast(local_clean_buffer)); + return sum_vec.packed; + } +}; + +template +static __global__ void lamport_style_one_shot_all_reduce_norm_kernel(AllReduceParams params) +{ +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + namespace cg = cooperative_groups; + static_assert(RanksPerNode <= 8); + static constexpr int kPackedSize = details::kBytesPerAccess / sizeof(T); + using PackedStruct = typename PackedOn16Bytes::Type; + + cg::cluster_group cluster = cg::this_cluster(); + + __shared__ float cluster_acc; + + int bid = blockIdx.x, tid = threadIdx.x; + int cluster_id = bid / ClusterSize, cluster_block_rank = bid % ClusterSize; + + int token_id = cluster_id; + int cluster_offset = token_id * params.fusion_params.hidden_size; + int block_offset = cluster_block_rank * params.fusion_params.hidden_size / ClusterSize; + int thread_offset = tid * kPackedSize; + + int inner_token_offset = block_offset + thread_offset; + int global_offset = cluster_offset + inner_token_offset; + + T const* bias_buffer = reinterpret_cast(params.fusion_params.bias_buffer); + T const* residual_buffer = reinterpret_cast(params.fusion_params.residual_buffer); + T const* weight_buffer = reinterpret_cast(params.fusion_params.weight_buffer); + T* local_final_output_buffer = reinterpret_cast(params.local_output_buffer_ptr); + T* intermediate_buffer = reinterpret_cast(params.fusion_params.intermediate_buffer); + + local_final_output_buffer += global_offset; + intermediate_buffer += global_offset; + residual_buffer += global_offset; + bias_buffer += inner_token_offset; + weight_buffer += inner_token_offset; + + PackedStruct 
weight_vec, bias_vec, residual_vec; + residual_vec.packed = *reinterpret_cast(residual_buffer); + if constexpr (Bias) + { + bias_vec.packed = *reinterpret_cast(bias_buffer); + } + if constexpr (Affine) + { + weight_vec.packed = *reinterpret_cast(weight_buffer); + } + + cudaGridDependencySynchronize(); + + float acc = 0.f; + PackedStruct sum_vec; + sum_vec.packed = Reducer::allreduce(params, global_offset); + + if constexpr (Bias) + { + sum_vec.packed = add128b(sum_vec, bias_vec); + } + sum_vec.packed = add128b(sum_vec, residual_vec); + *reinterpret_cast(intermediate_buffer) = sum_vec.packed; + acc = accumulate(acc, sum_vec); + acc = block_reduce_sum(acc); + if (ClusterSize > 1) + { + if (threadIdx.x == 0) + { + cluster_acc = acc; + } + cluster.sync(); + acc = 0.f; +#pragma unroll + for (int ii = 0; ii < ClusterSize; ++ii) + { + acc += *cluster.map_shared_rank(&cluster_acc, ii); + } + } + + float denom = __fsqrt_rn(__fdividef(acc, params.fusion_params.hidden_size) + params.fusion_params.eps); + sum_vec.packed = rms_norm(denom, sum_vec, weight_vec); + *reinterpret_cast(local_final_output_buffer) = sum_vec.packed; + + cudaTriggerProgrammaticLaunchCompletion(); +#endif +} + +int heuristic_min_warp_number(int tp_size, int hidden_size) +{ + if (hidden_size >= 4096) + { + return 4; + } + if (tp_size == 2) + { + return 32; + } + else + { + return 16; + } +} + +template +void lamport_style_one_shot_all_reduce_norm_kernel_launcher(AllReduceParams params, cudaStream_t stream) +{ + static constexpr int kPackedSize = details::kBytesPerAccess / sizeof(T); + TLLM_CHECK(params.fusion_params.hidden_size % kPackedSize == 0); + int threads_per_token = params.fusion_params.hidden_size / kPackedSize; + int warps_per_token = (threads_per_token + details::kWarpSize - 1) / details::kWarpSize; + int token_num = params.elts_total / params.fusion_params.hidden_size; + int warp_min_number = heuristic_min_warp_number(RanksPerNode, params.fusion_params.hidden_size); + int cluster_size = std::min(((warps_per_token + warp_min_number - 1) / warp_min_number), details::kClusterMaxSize); + int cta_size = warps_per_token / cluster_size * details::kWarpSize; + TLLM_CHECK(cta_size <= details::kMaxCtaSize); + int cta_num = token_num * cluster_size; + cudaLaunchConfig_t kernel_config = {0}; + kernel_config.gridDim = cta_num; + kernel_config.blockDim = cta_size; + kernel_config.dynamicSmemBytes = 0; + kernel_config.stream = stream; + + cudaLaunchAttribute attribute[2]; + attribute[0].id = cudaLaunchAttributeClusterDimension; + attribute[0].val.clusterDim.x = cluster_size; + attribute[0].val.clusterDim.y = 1; + attribute[0].val.clusterDim.z = 1; + kernel_config.attrs = attribute; + kernel_config.numAttrs = 1; + if (tensorrt_llm::common::getEnvEnablePDL()) + { + attribute[1].id = cudaLaunchAttributeProgrammaticStreamSerialization; + attribute[1].val.programmaticStreamSerializationAllowed = 1; + kernel_config.numAttrs++; + } +#define LAUNCH_LAMPORT_KERNEL(CLUSTER_SIZE) \ + if (cluster_size == CLUSTER_SIZE) \ + { \ + TLLM_CUDA_CHECK(cudaLaunchKernelEx(&kernel_config, \ + lamport_style_one_shot_all_reduce_norm_kernel, params)); \ + return; \ + } + LAUNCH_LAMPORT_KERNEL(1); + LAUNCH_LAMPORT_KERNEL(2); + LAUNCH_LAMPORT_KERNEL(3); + LAUNCH_LAMPORT_KERNEL(4); + LAUNCH_LAMPORT_KERNEL(5); + LAUNCH_LAMPORT_KERNEL(6); + LAUNCH_LAMPORT_KERNEL(7); + LAUNCH_LAMPORT_KERNEL(8); +#undef LAUNCH_LAMPORT_KERNEL +} + template static __global__ void __launch_bounds__(1024, 1) one_shot_all_reduce_norm_kernel(AllReduceParams params) { @@ -495,80 +881,145 
@@ static __global__ void __launch_bounds__(1024, 1) one_shot_all_reduce_norm_kerne #endif } -template -void one_shot_all_reduce_norm_kernel_launcher(AllReduceParams params, cudaStream_t stream) +template +bool is_lamport_supported(int token_num) { - static constexpr int kPackedSize = details::kBytesPerAccess / sizeof(T); - TLLM_CHECK(params.fusion_params.hidden_size % kPackedSize == 0); - int need_threads = params.fusion_params.hidden_size / kPackedSize; - int cta_size; - if (need_threads <= details::kMaxCtaSize) + static char* disableLamportReduceNormFusionChar = std::getenv("DISABLE_LAMPORT_REDUCE_NORM_FUSION"); + bool disableLamportReduceNormFusion = (disableLamportReduceNormFusionChar != nullptr); + if (disableLamportReduceNormFusion) + return false; + static int sm = tensorrt_llm::common::getSMVersion(); + if (sm < 90) { - cta_size = (need_threads + details::kWarpSize - 1) / details::kWarpSize * details::kWarpSize; + return false; } - else + if (!std::is_same_v && !std::is_same_v) { - cta_size = details::kMaxCtaSize; + return false; } - int norm_num = params.elts_total / params.fusion_params.hidden_size; - int cta_num = std::min(norm_num, static_cast(MAX_ALL_REDUCE_BLOCKS)); - int smem_size = 0; - - if (cta_size * kPackedSize < params.fusion_params.hidden_size) + if (token_num > details::kLamportTokenNumThreshold) { - smem_size = params.fusion_params.hidden_size * sizeof(T); - if (tensorrt_llm::common::getEnvEnablePDL()) - { - TLLM_LOG_DEBUG("Enable PDL in one_shot_all_reduce_norm_kernel"); - - cudaLaunchConfig_t kernelConfig = {0}; - kernelConfig.gridDim = cta_num; - kernelConfig.blockDim = cta_size; - kernelConfig.dynamicSmemBytes = smem_size; - kernelConfig.stream = stream; + return false; + } + return true; +} - cudaLaunchAttribute attribute[1]; - attribute[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; - attribute[0].val.programmaticStreamSerializationAllowed = 1; - kernelConfig.attrs = attribute; - kernelConfig.numAttrs = 1; +bool is_lamport_supported(nvinfer1::DataType dataType, int token_num) +{ + switch (dataType) + { + case nvinfer1::DataType::kFLOAT: return is_lamport_supported(token_num); + case nvinfer1::DataType::kHALF: return is_lamport_supported(token_num); +#ifdef ENABLE_BF16 + case nvinfer1::DataType::kBF16: return is_lamport_supported<__nv_bfloat16>(token_num); +#endif + default: return false; + } +} - TLLM_CUDA_CHECK(cudaLaunchKernelEx( - &kernelConfig, one_shot_all_reduce_norm_kernel, params)); - } - else - { - one_shot_all_reduce_norm_kernel - <<>>(params); - } +template +void one_shot_all_reduce_norm_kernel_launcher(AllReduceParams& params, cudaStream_t stream) +{ + int token_num = params.elts_total / params.fusion_params.hidden_size; + if (is_lamport_supported(token_num)) + { + lamport_style_one_shot_all_reduce_norm_kernel_launcher(params, stream); } else { - if (tensorrt_llm::common::getEnvEnablePDL()) + static constexpr int kPackedSize = details::kBytesPerAccess / sizeof(T); + TLLM_CHECK(params.fusion_params.hidden_size % kPackedSize == 0); + int need_threads = params.fusion_params.hidden_size / kPackedSize; + int cta_size; + if (need_threads <= details::kMaxCtaSize) { - cudaLaunchConfig_t kernelConfig = {0}; - kernelConfig.gridDim = cta_num; - kernelConfig.blockDim = cta_size; - kernelConfig.dynamicSmemBytes = smem_size; - kernelConfig.stream = stream; - - cudaLaunchAttribute attribute[1]; - attribute[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; - attribute[0].val.programmaticStreamSerializationAllowed = 1; - kernelConfig.attrs = 
attribute; - kernelConfig.numAttrs = 1; + cta_size = (need_threads + details::kWarpSize - 1) / details::kWarpSize * details::kWarpSize; + } + else + { + cta_size = details::kMaxCtaSize; + } + int norm_num = params.elts_total / params.fusion_params.hidden_size; + int cta_num = std::min(norm_num, static_cast(MAX_ALL_REDUCE_BLOCKS)); + int smem_size = 0; - TLLM_LOG_DEBUG("Enable PDL in one_shot_all_reduce_norm_kernel"); - TLLM_CUDA_CHECK(cudaLaunchKernelEx( - &kernelConfig, one_shot_all_reduce_norm_kernel, params)); + if (cta_size * kPackedSize < params.fusion_params.hidden_size) + { + smem_size = params.fusion_params.hidden_size * sizeof(T); + if (tensorrt_llm::common::getEnvEnablePDL()) + { + TLLM_LOG_DEBUG("Enable PDL in one_shot_all_reduce_norm_kernel"); + + cudaLaunchConfig_t kernelConfig = {0}; + kernelConfig.gridDim = cta_num; + kernelConfig.blockDim = cta_size; + kernelConfig.dynamicSmemBytes = smem_size; + kernelConfig.stream = stream; + + cudaLaunchAttribute attribute[1]; + attribute[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; + attribute[0].val.programmaticStreamSerializationAllowed = 1; + kernelConfig.attrs = attribute; + kernelConfig.numAttrs = 1; + + TLLM_CUDA_CHECK(cudaLaunchKernelEx( + &kernelConfig, one_shot_all_reduce_norm_kernel, params)); + } + else + { + one_shot_all_reduce_norm_kernel + <<>>(params); + } } else { - one_shot_all_reduce_norm_kernel - <<>>(params); + if (tensorrt_llm::common::getEnvEnablePDL()) + { + cudaLaunchConfig_t kernelConfig = {0}; + kernelConfig.gridDim = cta_num; + kernelConfig.blockDim = cta_size; + kernelConfig.dynamicSmemBytes = smem_size; + kernelConfig.stream = stream; + + cudaLaunchAttribute attribute[1]; + attribute[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; + attribute[0].val.programmaticStreamSerializationAllowed = 1; + kernelConfig.attrs = attribute; + kernelConfig.numAttrs = 1; + + TLLM_LOG_DEBUG("Enable PDL in one_shot_all_reduce_norm_kernel"); + TLLM_CUDA_CHECK(cudaLaunchKernelEx( + &kernelConfig, one_shot_all_reduce_norm_kernel, params)); + } + else + { + one_shot_all_reduce_norm_kernel + <<>>(params); + } } } } + +template +__global__ void lamport_initialize_kernel(T* buffer, size_t size) +{ + static constexpr int kPackedSize = details::kBytesPerAccess / sizeof(T); + using PackedStruct = typename PackedOn16Bytes::Type; + for (size_t offset = (blockIdx.x * blockDim.x + threadIdx.x) * kPackedSize; offset < size; + offset += gridDim.x * blockDim.x * kPackedSize) + { + set_neg_zero(reinterpret_cast(&buffer[offset])); + } +} + +template +void lamport_initialize_kernel_launcher(void* buffer, size_t size, cudaStream_t stream) +{ + static constexpr int kPackedSize = details::kBytesPerAccess / sizeof(T); + int block_size = 1024; + int grid_size = (size + 1024 * kPackedSize - 1) / (1024 * kPackedSize); + lamport_initialize_kernel<<>>(reinterpret_cast(buffer), size); +} }; // namespace reduce_fusion template @@ -1117,13 +1568,24 @@ void AllReduceDispatchType(AllReduceParams& params, AllReduceStrategyType strat, } } -AllReduceParams AllReduceParams::deserialize(int64_t* buffer, size_t tpSize, size_t tpRank) +AllReduceParams AllReduceParams::deserialize( + int64_t* buffer, size_t tpSize, size_t tpRank, nvinfer1::DataType dataType, int token_num, AllReduceFusionOp op) { void* const* buffer_ptrs = reinterpret_cast(buffer); - auto const flag_ptr = &buffer[4 * tpSize]; + int flag_offset; + if (op == AllReduceFusionOp::RESIDUAL_RMS_NORM && reduce_fusion::is_lamport_supported(dataType, token_num)) + { + flag_offset = 0; 
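The flag_offset choice here, like the launcher dispatch above, hinges on reduce_fusion::is_lamport_supported. Condensed into a single predicate, that gate looks roughly like the hedged sketch below; the helper name is illustrative, and the half/bf16 restriction is inferred from the type dispatch in this patch.

#include <cstdlib>

// Fused Lamport reduce+norm is only attempted when it is not disabled via the
// environment, the GPU is SM90 or newer, the element type is 16-bit (half/bf16),
// and the token count does not exceed kLamportTokenNumThreshold (16 in this patch).
bool lamportFusionEnabled(int smVersion, bool isHalfOrBf16, int tokenNum)
{
    if (std::getenv("DISABLE_LAMPORT_REDUCE_NORM_FUSION") != nullptr)
    {
        return false;
    }
    return smVersion >= 90 && isHalfOrBf16 && tokenNum <= 16;
}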
+ } + else + { + flag_offset = 1; + } + auto const flag_ptr + = &buffer[tensorrt_llm::utils::customAllReduceUtils::NUM_POINTERS_PER_RANK * tpSize + flag_offset]; // cannot use 0 since 0 represents released state for barrier *flag_ptr += 1; - TLLM_LOG_TRACE("AllReduceParams's flag value is %d", *flag_ptr); + TLLM_LOG_TRACE("AllReduceParams's flag value is %d, flag offset %d", *flag_ptr, flag_offset); uint32_t flag_value = *flag_ptr; AllReduceParams params; // Even plugins use ping buffers, odd plugins use pong. @@ -1208,4 +1670,25 @@ void residualRmsNorm(kernels::AllReduceParams& params, nvinfer1::DataType dataTy sync_check_cuda_error(); } +void lamportInitialize(void* buffer, size_t size, nvinfer1::DataType dataType, cudaStream_t stream) +{ + sync_check_cuda_error(); + switch (dataType) + { + case nvinfer1::DataType::kFLOAT: + reduce_fusion::lamport_initialize_kernel_launcher(buffer, size, stream); + break; + case nvinfer1::DataType::kHALF: + reduce_fusion::lamport_initialize_kernel_launcher(buffer, size, stream); + break; +#ifdef ENABLE_BF16 + case nvinfer1::DataType::kBF16: + reduce_fusion::lamport_initialize_kernel_launcher<__nv_bfloat16>(buffer, size, stream); + break; +#endif + default: TLLM_THROW("Unsupported dataType for customAllReduce"); + } + sync_check_cuda_error(); +} + } // namespace tensorrt_llm::kernels diff --git a/cpp/tensorrt_llm/kernels/customAllReduceKernels.h b/cpp/tensorrt_llm/kernels/customAllReduceKernels.h index ebe6b8795..6a67ba13e 100644 --- a/cpp/tensorrt_llm/kernels/customAllReduceKernels.h +++ b/cpp/tensorrt_llm/kernels/customAllReduceKernels.h @@ -31,6 +31,15 @@ constexpr size_t MAX_ALL_REDUCE_BLOCKS = 24; constexpr size_t MAX_RANKS_PER_NODE = 8; constexpr size_t DEFAULT_BLOCK_SIZE = 512; +namespace reduce_fusion::details +{ +static constexpr int kBytesPerAccess = 16; +static constexpr int kWarpSize = 32; +static constexpr int kMaxCtaSize = 1024; +static constexpr int kClusterMaxSize = 8; +static constexpr int kLamportTokenNumThreshold = 16; +}; // namespace reduce_fusion::details + // Warning: python definition is in tensorrt_llm/functional.py // they must be kept in sync enum class AllReduceStrategyType : int8_t @@ -73,6 +82,7 @@ struct AllReduceFusionParams float eps; // new residual void* intermediate_buffer; + void* lamport_peer_comm_buffer_ptrs[MAX_RANKS_PER_NODE * 3]; }; struct AllReduceParams @@ -81,7 +91,8 @@ struct AllReduceParams size_t elts_per_rank; size_t elts_per_block; size_t rank_offset; - size_t ranks_per_node, local_rank; + size_t ranks_per_node; + size_t local_rank; uint32_t barrier_flag; uint32_t* peer_barrier_ptrs_in[MAX_RANKS_PER_NODE]; uint32_t* peer_barrier_ptrs_out[MAX_RANKS_PER_NODE]; @@ -91,7 +102,8 @@ struct AllReduceParams AllReduceFusionParams fusion_params; - static AllReduceParams deserialize(int64_t* buffer, size_t tpSize, size_t tpRank); + static AllReduceParams deserialize(int64_t* buffer, size_t tpSize, size_t tpRank, nvinfer1::DataType dataType, + int token_num, AllReduceFusionOp op); }; bool configurationSupported(AllReduceStrategyType algo, size_t msg_size, size_t n_ranks, nvinfer1::DataType type); @@ -101,4 +113,6 @@ void customAllReduce(kernels::AllReduceParams& params, nvinfer1::DataType dataTy void residualRmsNorm(kernels::AllReduceParams& params, nvinfer1::DataType dataType, cudaStream_t stream); +void lamportInitialize(void* buffer, size_t size, nvinfer1::DataType dataType, cudaStream_t stream); + } // namespace tensorrt_llm::kernels diff --git 
a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt index 6f3b1ed98..92ae4d99b 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt @@ -1,2 +1,2 @@ 88c30973b9b3452baa3f063d34d08169 libtensorrt_llm_nvrtc_wrapper.so -7f370deb0090d885d7518c2b146399ba3933c004 commit \ No newline at end of file +4dbf696ae9b74a26829d120b67ab8443d70c8e58 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/version.txt index d3923a7d2..e2ce46ae4 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/version.txt @@ -1,2 +1,2 @@ 95e9f87610383348e444d2d0b8396f2d libtensorrt_llm_nvrtc_wrapper.so -7f370deb0090d885d7518c2b146399ba3933c004 commit \ No newline at end of file +4dbf696ae9b74a26829d120b67ab8443d70c8e58 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll index 643b3b831..3f82a0827 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1471e322bb44cd65b98ee30e0befa32ae4c86e828f0b4fd4f02d4af4e710d08f +oid sha256:db512d533ab4e4a4abd0047a65d891dfd6e1522f2d34c90f29296c3239fd3cc1 size 1128448 diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/version.txt b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/version.txt index 6dded519b..465df4be7 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/version.txt +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/version.txt @@ -1,3 +1,3 @@ b7e624ba775e9f5090ef4b67bcdbd7a2 tensorrt_llm_nvrtc_wrapper.lib -f9b1cc37a27dd0574bb41a2763a97be7 tensorrt_llm_nvrtc_wrapper.dll -7f370deb0090d885d7518c2b146399ba3933c004 commit \ No newline at end of file +d89a0a140d2d427af13c3794a4b21e2c tensorrt_llm_nvrtc_wrapper.dll +4dbf696ae9b74a26829d120b67ab8443d70c8e58 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/decodingKernels.h b/cpp/tensorrt_llm/kernels/decodingKernels.h index fc516fb8d..73e97930d 100644 --- a/cpp/tensorrt_llm/kernels/decodingKernels.h +++ b/cpp/tensorrt_llm/kernels/decodingKernels.h @@ -121,6 +121,17 @@ void 
invokeTransposeLogProbs(float* output_log_probs, float* output_log_probs_ti namespace runtime::kernels { +//! \brief Inserts the running beams into the finished beams stored in the CBA buffers. (beams where the most likely +//! continuation is the end token get stored separately, and another candidate next token is stored). Then sorts the +//! beams according to their cumulative log probs. Note: the kernels in gatherTree modify the buffers inplace. When +//! streaming, we use tmp buffers since beam search kernels expect ungathered data. +//! +//! \param decodingOutput contains a slice of the output buffers to gather. Also contains the +//! DecodingOutput::BeamHypotheses object with the finished beams. +//! \param decodingInput used for endIds and input lengths. +//! \param manager the usual buffer manager. +//! \param samplingConfig the usual buffer samplingConfig. + void gatherTree(DecodingOutput const& decodingOutput, DecodingInput const& decodingInput, BufferManager const& manager, SamplingConfig const& samplingConfig); } // namespace runtime::kernels diff --git a/cpp/tensorrt_llm/kernels/gptKernels.cu b/cpp/tensorrt_llm/kernels/gptKernels.cu index 30ce90e0a..ae4c9d895 100644 --- a/cpp/tensorrt_llm/kernels/gptKernels.cu +++ b/cpp/tensorrt_llm/kernels/gptKernels.cu @@ -228,7 +228,7 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void computeSeqAndPaddingOffsets } } - // Perpare values for fmha. + // Prepare values for fmha. if (threadIdx.x == 0 && blockIdx.x == 0) { // Reset fmha tile counter to 0 before launching fmha kernels. diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a index 36daef37b..70cc1d3d6 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9117f7cf5eef0ed452c0d0bc79242b84def103e7038c9d3df6e366690801ca92 +oid sha256:0814af36fed752bbe70d953cefbb78dd306c42f3d9f6848b7043a865e48f9662 size 25364090 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a index 7eaca6cd9..84879c280 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b04913f9e9029a5ce5a222d5cc7492ff53323a548079d2fb32d5b2aeb0c2268 +oid sha256:ee46f2d1c9162f4302a1031f778fcb7c7110c84110427f97af6532ed9bd342fd size 25768990 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt index ecfff5209..736fddd4a 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -d54fb93f256601f4c4ad7f1c8e6e9919 
libtensorrt_llm_internal_cutlass_kernels_static.a -71028d801074f11138e890391e48591d libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a -7f370deb0090d885d7518c2b146399ba3933c004 commit \ No newline at end of file +90740ead1def66f350e14c133278463d libtensorrt_llm_internal_cutlass_kernels_static.a +b0104227ffd1ce19fc1fdb45e349df36 libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a +4dbf696ae9b74a26829d120b67ab8443d70c8e58 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a index 715fba593..573caf92e 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d8c685f8ea2f84838dfdbf448eab41c76fe88fe29db0d4a511d6d6d241ad1832 +oid sha256:4d9ba0f8b95cf64227cb0b17654fb7c9bc1741fe003889658b305750b388a4dc size 44173632 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a index 4f403b38e..daa8557bd 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b9d75392ba3b59853c43072b4f9949b32cb6724813a39048e4585e9a8fb3e136 +oid sha256:4f848d5beebbd69792047a96b16f7145f8e1e3e311d2a19789ce639ad8149b0e size 43561206 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt index dcd8a686a..0c0c38e19 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -4fc3e1fb0db6a121f88a9141605d9285 libtensorrt_llm_internal_cutlass_kernels_static.a -253731af750407020dbe6f2fbe50fa2b libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a -7f370deb0090d885d7518c2b146399ba3933c004 commit \ No newline at end of file +2aaf05cb84f52b024e89d4fa634d6900 libtensorrt_llm_internal_cutlass_kernels_static.a +f17ce186e9105c594e39d252777ce4c7 libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a +4dbf696ae9b74a26829d120b67ab8443d70c8e58 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/tensorrt_llm_internal_cutlass_kernels_static.lib b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/tensorrt_llm_internal_cutlass_kernels_static.lib index e88023db2..5aa0009ca 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/tensorrt_llm_internal_cutlass_kernels_static.lib +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/tensorrt_llm_internal_cutlass_kernels_static.lib @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:62af58f5e09d1cf5e347b02ef3bd3a186469162fc9645d038fb2cba23b597722 -size 88140804 +oid sha256:c429687e335c75f08186bcd8f629b50467cb0f2e484d755834c5b1cdbb9ecaf3 +size 88140796 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/version.txt b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/version.txt index 5bb9d18b8..e14aff7e8 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/version.txt +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/version.txt @@ -1,2 +1,2 @@ -eb7fc4a105eb6e6f52ba865f2b055233 tensorrt_llm_internal_cutlass_kernels_static.lib -7f370deb0090d885d7518c2b146399ba3933c004 commit \ No newline at end of file +4f663be2b768088805ccec6dc33545fc tensorrt_llm_internal_cutlass_kernels_static.lib +4dbf696ae9b74a26829d120b67ab8443d70c8e58 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu b/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu index bfde5bae3..05cccf03d 100644 --- a/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu +++ b/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu @@ -1458,7 +1458,7 @@ std::vector CutlassMoeFCRunner cons kernel = airTopPSampling; } - kernel<<>>(counters, histograms, countHistograms, params.outputIds, + kernel<<>>(counters, histograms, countHistograms, params.outputIdsPtrs, params.sequenceLength, params.finishedInput, params.finishedOutput, params.cumLogProbs, params.outputLogProbs, params.endIds, params.maxBatchSize, params.skipDecode, pass, buf1, idxBuf1, buf2, idxBuf2, params.batchSlots); diff --git a/cpp/tensorrt_llm/kernels/samplingTopPKernels.cu b/cpp/tensorrt_llm/kernels/samplingTopPKernels.cu index 13da77bdf..5605dbcea 100644 --- a/cpp/tensorrt_llm/kernels/samplingTopPKernels.cu +++ b/cpp/tensorrt_llm/kernels/samplingTopPKernels.cu @@ -196,11 +196,11 @@ __device__ void epilogue(SizeType32 batchId, SizeType32 currentStep, SizeType32 } template -__global__ void topPSsampling(T* sortedProbs, TokenIdType* sortedIdVals, TokenIdType** ids, SizeType32* sequenceLength, - FinishedState const* finishedInput, FinishedState* finishedOutput, float* cumLogProbs, float* outputLogProbs, - SizeType32 const* beginOffsetBuf, SizeType32 const* offsetBuf, SizeType32 vocabSize, curandState_t* curandState, - float const* topPs, TokenIdType const* endIds, SizeType32 maxBatchSize, bool const* skipDecode, - SizeType32 const* batchSlots) +__global__ void topPSsampling(T* sortedProbs, TokenIdType* sortedIdVals, TokenIdType* ids, TokenIdType** idsPtrs, + SizeType32* sequenceLength, FinishedState const* finishedInput, FinishedState* finishedOutput, float* cumLogProbs, + float* outputLogProbs, SizeType32 const* beginOffsetBuf, SizeType32 const* offsetBuf, SizeType32 vocabSize, + curandState_t* curandState, float const* topPs, TokenIdType const* endIds, SizeType32 maxBatchSize, + bool const* skipDecode, SizeType32 const* batchSlots, bool returnAllTopP, SizeType32 maxSeqLen) { /** * Each block processes one request row sorted in descending order by probabilities. @@ -235,14 +235,16 @@ __global__ void topPSsampling(T* sortedProbs, TokenIdType* sortedIdVals, TokenId } auto const probThreshold = topPs[batchSlot]; - auto const currentStep = sequenceLength[batchSlot]; + auto const currentStep = sequenceLength == nullptr ? 0 : sequenceLength[batchSlot]; + auto* outputIdsRequestPtr = idsPtrs == nullptr ? 
ids + batchSlot * maxSeqLen : idsPtrs[batchSlot]; // With P in (0.0; 1.0] we draw a random number P' in range (0.0; P] // We will sum all probs moving from the largest probability to the smallest and // will choose the token which probability makes cumulative probability sum to exceed P' if (threadIdx.x == 0) { - randNumS = curand_uniform(curandState + blockIdx.x) * probThreshold; + // if we want to return all top p indices, we should not do random sampling for probThreshold + randNumS = returnAllTopP ? probThreshold : curand_uniform(curandState + blockIdx.x) * probThreshold; } // if beginOffsetBuf and offsetBuf of sorting have same value, @@ -253,8 +255,15 @@ __global__ void topPSsampling(T* sortedProbs, TokenIdType* sortedIdVals, TokenId if (tid == 0) { auto offset = batchId * vocabSize; - epilogue(batchSlot, currentStep, offset, ids, sortedIdVals, sortedProbs, cumLogProbs, outputLogProbs, - endIds, sequenceLength, finishedOutput, maxBatchSize); + if (returnAllTopP) + { + outputIdsRequestPtr[currentStep] = sortedIdVals[offset]; + } + else + { + epilogue(batchSlot, currentStep, offset, idsPtrs, sortedIdVals, sortedProbs, cumLogProbs, + outputLogProbs, endIds, sequenceLength, finishedOutput, maxBatchSize); + } } return; } @@ -267,7 +276,7 @@ __global__ void topPSsampling(T* sortedProbs, TokenIdType* sortedIdVals, TokenId __syncthreads(); auto offset = batchId * vocabSize; - ids[batchSlot][currentStep] = sortedIdVals[offset]; + outputIdsRequestPtr[currentStep] = sortedIdVals[offset]; auto end = ((vocabSize + blockSize - 1) / blockSize) * blockSize; SizeType32 selectedTokenId = 0; // Cumulative sum @@ -285,11 +294,31 @@ __global__ void topPSsampling(T* sortedProbs, TokenIdType* sortedIdVals, TokenId } } - // select first thread exceeded the prob threshold or the last thread in case of P=1.0f - if (threadIdx.x == min(blockDim.x - count, blockDim.x - 1)) + if (returnAllTopP) + { + __shared__ SizeType32 sharedSelectedTokenId; + if (threadIdx.x == min(blockDim.x - count, blockDim.x - 1)) + { + sharedSelectedTokenId = selectedTokenId; + } + __syncthreads(); + for (int vi = tid; vi <= sharedSelectedTokenId; vi += blockSize) + { + outputIdsRequestPtr[vi] = sortedIdVals[offset + vi]; + } + if (tid == 0 && sharedSelectedTokenId != end - 1) + { + outputIdsRequestPtr[sharedSelectedTokenId + 1] = -1; // a boundary to record the end of all selected top Ps. 
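The -1 written above marks the end of the selected ids when returnAllTopP is set; a hedged host-side sketch of consuming one output row (the function name is illustrative, not part of this patch):

#include <vector>

// Collect the ids chosen by the returnAllTopP path for one request. The row is
// terminated by -1 unless the entire vocabulary fell inside the top-P mass.
std::vector<int> collectTopPIds(int const* outputIdsRequest, int vocabSize)
{
    std::vector<int> ids;
    for (int i = 0; i < vocabSize && outputIdsRequest[i] != -1; ++i)
    {
        ids.push_back(outputIdsRequest[i]);
    }
    return ids;
}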
+ } + } + else { - epilogue(batchSlot, currentStep, offset + selectedTokenId, ids, sortedIdVals, sortedProbs, cumLogProbs, - outputLogProbs, endIds, sequenceLength, finishedOutput, maxBatchSize); + // select first thread exceeded the prob threshold or the last thread in case of P=1.0f + if (threadIdx.x == min(blockDim.x - count, blockDim.x - 1)) + { + epilogue(batchSlot, currentStep, offset + selectedTokenId, idsPtrs, sortedIdVals, sortedProbs, cumLogProbs, + outputLogProbs, endIds, sequenceLength, finishedOutput, maxBatchSize); + } } } @@ -371,9 +400,10 @@ void invokeBatchTopPSampling(TopPSamplingKernelParams const& params, cudaStre dim3 grid(params.batchSize); // Sample with Top P given sorted tokens topPSsampling<<>>(sortedProbs, sortedIdVals, - params.outputIds, params.sequenceLength, params.finishedInput, params.finishedOutput, params.cumLogProbs, - params.outputLogProbs, beginOffsetBuf, offsetBuf + 1, params.vocabSizePadded, params.curandState, params.topPs, - params.endIds, params.maxBatchSize, params.skipDecode, params.batchSlots); + params.outputIds, params.outputIdsPtrs, params.sequenceLength, params.finishedInput, params.finishedOutput, + params.cumLogProbs, params.outputLogProbs, beginOffsetBuf, offsetBuf + 1, params.vocabSizePadded, + params.curandState, params.topPs, params.endIds, params.maxBatchSize, params.skipDecode, params.batchSlots, + params.returnAllTopP, params.maxSeqLen); sync_check_cuda_error(); TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); diff --git a/cpp/tensorrt_llm/kernels/samplingTopPKernels.h b/cpp/tensorrt_llm/kernels/samplingTopPKernels.h index 1cda8bc56..2ab025ba0 100644 --- a/cpp/tensorrt_llm/kernels/samplingTopPKernels.h +++ b/cpp/tensorrt_llm/kernels/samplingTopPKernels.h @@ -28,8 +28,13 @@ struct TopPSamplingKernelParams //! input buffer [batchSize, vocabSizePadded], required. Probabilities of each token in the vocab. T const* probs{nullptr}; - //! output buffer [maxBatchSize][maxSeqLen], required. Contains pointers to rows with output tokens per request. - runtime::TokenIdType** outputIds{nullptr}; + //! output buffer [maxBatchSize][maxSeqLen]. Contains pointers to rows with output tokens per request. + //! If nullptr, outputIds must be provided. + runtime::TokenIdType** outputIdsPtrs{nullptr}; + + //! output buffer [maxBatchSize, maxSeqLen], optional. Tensor to store output tokens. + //! Not used if outputIdsPtrs != nullptr + runtime::TokenIdType* outputIds{nullptr}; //! pointer to the workspace. Has to be pre-allocated by caller. //! Function does not take ownership of the buffer. 
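With the fields added in this header, callers can supply either per-request row pointers (outputIdsPtrs) or a flat [maxBatchSize, maxSeqLen] tensor (outputIds plus maxSeqLen). Below is a hedged sketch of filling the params for the flat layout; the wrapper function is illustrative, the member names are the ones visible in this diff, and all device buffers remain caller-owned.

#include "tensorrt_llm/kernels/samplingTopPKernels.h"

// Configure top-P sampling to write into a flat output tensor and to return every
// id inside the top-P mass (terminated by -1) instead of sampling a single token.
tensorrt_llm::kernels::TopPSamplingKernelParams<float> makeFlatTopPParams(float const* probs,
    tensorrt_llm::runtime::TokenIdType* flatOutputIds, void* workspace, curandState_t* curandStates,
    float const* topPs, tensorrt_llm::runtime::SizeType32 batchSize,
    tensorrt_llm::runtime::SizeType32 vocabSizePadded, tensorrt_llm::runtime::SizeType32 maxSeqLen)
{
    tensorrt_llm::kernels::TopPSamplingKernelParams<float> params{};
    params.probs = probs;
    params.outputIds = flatOutputIds; // flat layout; outputIdsPtrs stays nullptr
    params.maxSeqLen = maxSeqLen;     // required whenever outputIds is used
    params.returnAllTopP = true;      // sequenceLength may then be left null
    params.workspace = workspace;
    params.curandState = curandStates;
    params.topPs = topPs;
    params.batchSize = batchSize;
    params.maxBatchSize = batchSize;
    params.vocabSizePadded = vocabSizePadded;
    return params;
}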
@@ -73,6 +78,9 @@ struct TopPSamplingKernelParams runtime::SizeType32 batchSize{-1}; runtime::SizeType32 maxBatchSize{-1}; runtime::SizeType32 vocabSizePadded{-1}; + runtime::SizeType32 maxSeqLen{-1}; + + bool returnAllTopP{false}; void checkParams() const { @@ -81,12 +89,17 @@ struct TopPSamplingKernelParams TLLM_CHECK(maxBatchSize >= batchSize); TLLM_CHECK(vocabSizePadded > 0); TLLM_CHECK(probs); - TLLM_CHECK(outputIds); + TLLM_CHECK(outputIds || outputIdsPtrs); TLLM_CHECK(workspace); - TLLM_CHECK(sequenceLength); + TLLM_CHECK((sequenceLength != nullptr) || returnAllTopP); TLLM_CHECK(curandState); TLLM_CHECK(topPs); + if (outputIds) + { + TLLM_CHECK(maxSeqLen > 0); + } + TLLM_CHECK(((finishedOutput == nullptr) ^ (endIds == nullptr)) == 0); } }; diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.cu b/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.cu index 6036695cd..19336e2ed 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.cu +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.cu @@ -35,230 +35,281 @@ namespace tensorrt_llm::kernels::speculative_decoding { namespace { -__global__ void acceptDraftTokensByIds(TokenIdType const* draftIds, TokenIdType const* targetIds, - SizeType32 const* contextLengths, SizeType32 const* numsDraftTokens, SizeType32* sequenceLengths, - FinishedState const* finished, FinishedState* finishedFinal, SizeType32* finishedSum, SizeType32 const* batchSlots, - SizeType32 batchSize, SizeType32 maxBatchSize, SizeType32 maxSeqLen, SizeType32 maxDraftTokens) -{ - for (auto batchIdx = static_cast(threadIdx.x); batchIdx < batchSize; batchIdx += blockDim.x) - { - auto const batchSlot = batchSlots[batchIdx]; - auto const numDraftTokens = numsDraftTokens[batchSlot]; - - auto const contextLength = contextLengths[batchSlot]; - auto& sequenceLength = sequenceLengths[batchSlot]; - SizeType32 finishedDraftIdx = 0; - for (auto ti = contextLength; ti < min(sequenceLength, contextLength + numDraftTokens); - ++ti, ++finishedDraftIdx) - { - auto const draftIdx = ti - contextLength; - auto const targetTokenIdx = batchSlot * maxSeqLen + ti; - auto const draftTokenIdx = batchSlot * maxDraftTokens + draftIdx; - // Check if draft tokens are the same as target tokens - bool const accepted = draftIds[draftTokenIdx] == targetIds[targetTokenIdx]; - if (!accepted) - { - // Set sequence length to the numAcceptedTokens + 1 - sequenceLength = min(ti + 1, maxSeqLen); - // FIXME(nkorobov): do we need to set endIds here? 
- break; - } - } - FinishedState finishState = finished[finishedDraftIdx * maxBatchSize + batchSlot]; - finishedFinal[batchSlot] = finishState; - - if (finishedSum) - { - finishedSum[batchSlot] = static_cast(finishState.isFinished()); - } - } -} -} // namespace - -void invokeAcceptDraftTokensByIds(TokenIdType const* draftIds, TokenIdType const* targetIds, - SizeType32 const* contextLengths, SizeType32 const* numsDraftTokens, SizeType32* sequenceLengths, - FinishedState const* finished, FinishedState* finishedFinal, SizeType32* finishedSum, SizeType32 const* batchSlots, - SizeType32 batchSize, SizeType32 maxBatchSize, SizeType32 beamWidth, SizeType32 maxSeqLen, - SizeType32 maxDraftTokens, cudaStream_t stream) -{ - TLLM_CHECK(beamWidth == 1); - dim3 block(min(1024, batchSize)); - dim3 grid(1); - acceptDraftTokensByIds<<>>(draftIds, targetIds, contextLengths, numsDraftTokens, - sequenceLengths, finished, finishedFinal, finishedSum, batchSlots, batchSize, maxBatchSize, maxSeqLen, - maxDraftTokens); -} -namespace -{ template -__global__ void acceptDraftTokensByLogitsKernel(T const* draftProbs, T* targetProbs, SizeType32 const* numsDraftTokens, - FinishedState* finished, curandState_t* curandState, SizeType32 const* batchSlots, SizeType32 batchSize, - SizeType32 maxBatchSize, SizeType32 maxDraftTokens, SizeType32 beamWidth, SizeType32 vocabSize, - bool randomThreshold, float constantThreshold) +__global__ void maskTargetLogitsKernel(T* targetLogits, SizeType32 const* batchSlots, SizeType32 beamWidth, + SizeType32 vocabSize, FinishedState const* finishedInput, SizeType32 maxBatchSize, bool const* batchUseDraftLogits, + SizeType32* outputIdsAfterSampling, SizeType32* targetOutputIds, SizeType32* runtimeTopKDevicePtr, bool* maskBuffer) { + /** + * @brief Masking the selected token to -inf as was done in Huggingface TopK/TopP Logits Warper + * https://github.com/huggingface/transformers/blob/2e24ee4dfa39cc0bc264b89edbccc373c8337086/src/transformers/generation/logits_process.py#L533 + */ + auto const bid = blockIdx.x; - auto const draftTokenIdx = blockIdx.y; auto const batchIdx = bid / beamWidth; - auto const beamIdx = bid % beamWidth; + auto const tid = static_cast(threadIdx.x); auto const batchSlot = batchSlots[batchIdx]; - auto const batchSlotBeamWidth = batchSlot * beamWidth + beamIdx; - auto const numDraftTokens = numsDraftTokens[batchSlotBeamWidth]; + constexpr bool IS_HALF = std::is_same::value; + T const MAX_T_VAL = (IS_HALF) ? HALF_FLT_MAX : FLT_MAX; + + auto targetLogitsBatch = targetLogits + batchIdx * vocabSize; + auto& finishedState = finishedInput[batchSlot]; - if (draftTokenIdx >= numDraftTokens) + auto* outputIdsAfterSamplingPtr = outputIdsAfterSampling + batchSlot * vocabSize; + auto const useDraftLogits = batchUseDraftLogits[batchSlot]; + + if (finishedState.isSkipDecoding()) { return; } - auto const logitsOffset = (batchSlot * maxDraftTokens + draftTokenIdx) * beamWidth * vocabSize; - auto const draftProbsBatch = draftProbs + logitsOffset; - auto const targetProbsBatch = targetProbs + logitsOffset; - auto const vocabSizePadded = static_cast((vocabSize + blockDim.x - 1) / blockDim.x) * blockDim.x; + __shared__ SizeType32 tokensToMask; - struct Candidate candidate; - __shared__ float threshold; - if (threadIdx.x == 0) + if (tid == 0) { - threshold = randomThreshold ? 
curand_uniform(curandState + batchSlot) : constantThreshold; + tokensToMask = runtimeTopKDevicePtr[batchSlot]; } __syncthreads(); - for (auto vIdx = static_cast(threadIdx.x); vIdx < vocabSizePadded; - vIdx += static_cast(blockDim.x)) + for (SizeType32 vIdx = tid; vIdx < vocabSize; vIdx += static_cast(blockDim.x)) { - bool const pred = vIdx < vocabSize; - auto const targetProb = pred ? static_cast(targetProbsBatch[vIdx]) : 1.f; - auto const draftProb = pred ? static_cast(draftProbsBatch[vIdx]) : 0.f; - - if (draftProb > candidate.maxProb) - { - candidate.maxProb = draftProb; - candidate.rateQP = pred ? targetProb / draftProb : 0.f; + if (tokensToMask == 0 && outputIdsAfterSamplingPtr[vIdx] == -1) + { // we need to find the -1 boundary from returnAllTopP outputIds if topK == 0 + tokensToMask = vIdx; } + maskBuffer[vIdx] = false; } + __syncthreads(); - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage reduce_buffer; - Candidate candidate_global = BlockReduce(reduce_buffer).Reduce(candidate, reduce_op); + if (!useDraftLogits && tid == 0) + { + targetOutputIds[batchSlot] = outputIdsAfterSamplingPtr[tokensToMask - 1]; + } + + for (SizeType32 vIdx = tid; vIdx < tokensToMask; vIdx += static_cast(blockDim.x)) + { + auto tokenToMask = outputIdsAfterSamplingPtr[vIdx]; + maskBuffer[tokenToMask] = true; + } + __syncthreads(); - if (threadIdx.x == 0) + for (SizeType32 vIdx = tid; vIdx < vocabSize; vIdx += static_cast(blockDim.x)) { - finished[draftTokenIdx * maxBatchSize * beamWidth + batchSlotBeamWidth] - = candidate_global.rateQP < threshold ? FinishedState::skipDecoding() : FinishedState::empty(); + if (!maskBuffer[vIdx]) + { + targetLogitsBatch[vIdx] = -MAX_T_VAL; + } } } template -__global__ void correctAcceptedStatesAndLogits(T const* draftProbs, T* targetProbs, T** targetLogits, - SizeType32 const* numsDraftTokens, FinishedState* finished, SizeType32 const* batchSlots, SizeType32 batchSize, - SizeType32 maxBatchSize, SizeType32 maxDraftTokens, SizeType32 beamWidth, SizeType32 vocabSize) +__global__ void acceptDraftTokensKernel(T const* draftProbs, T* targetProbs, SizeType32 const* numsDraftTokens, + bool const* batchUseDraftLogits, TokenIdType const* draftIds, FinishedState const* finishedInput, + FinishedState* finishedOutput, curandState_t* curandState, SizeType32 const* batchSlots, SizeType32 maxDraftTokens, + SizeType32 beamWidth, SizeType32 vocabSize, bool randomThreshold, float constantThreshold, SizeType32 step, + bool* batchIsAccepted, SizeType32* targetOutputIds) { auto const bid = blockIdx.x; + auto const draftTokenIdx = step; auto const batchIdx = bid / beamWidth; auto const beamIdx = bid % beamWidth; auto const batchSlot = batchSlots[batchIdx]; auto const batchSlotBeamWidth = batchSlot * beamWidth + beamIdx; + auto const tid = static_cast(threadIdx.x); + auto const numDraftTokens = numsDraftTokens[batchSlotBeamWidth]; + auto const useDraftLogits = batchUseDraftLogits[batchSlotBeamWidth]; - __shared__ SizeType32 numAcceptedTokens; - if (threadIdx.x == 0) + if (draftTokenIdx > numDraftTokens || finishedInput[batchSlot].isSkipDecoding()) { - numAcceptedTokens = numDraftTokens; - bool cummulativeSkipDecoding = false; - for (SizeType32 ti = 0; ti < numDraftTokens + 1; ++ti) + if (tid == 0) { - auto& finishedState = finished[ti * maxBatchSize * beamWidth + batchSlotBeamWidth]; - bool localSkipDecoding = finishedState.isSkipDecoding(); - if (cummulativeSkipDecoding == false && localSkipDecoding == true) + batchIsAccepted[batchSlot] = true; + 
finishedOutput[batchSlot].setSkipDecoding(); + } + return; + } + + auto const logitsOffset = (batchSlot * maxDraftTokens + draftTokenIdx) * beamWidth * vocabSize; + auto const draftProbsBatch = draftProbs + logitsOffset; + auto const targetProbsBatch = targetProbs + (batchIdx * beamWidth * vocabSize); + + __shared__ bool isAccepted; + __shared__ T sSumVal; + if (tid == 0) + { + if (draftTokenIdx < numDraftTokens) + { + auto const draftOutputTokenId = draftIds[batchSlot * maxDraftTokens + draftTokenIdx]; + if (useDraftLogits) { - numAcceptedTokens = ti; + float threshold = randomThreshold ? curand_uniform(curandState + batchSlot) : constantThreshold; + auto const targetProb = static_cast(targetProbsBatch[draftOutputTokenId]); + auto const draftProb = static_cast(draftProbsBatch[draftOutputTokenId]); + auto rateQP = targetProb / draftProb; + if (rateQP < threshold) + { + isAccepted = false; + finishedOutput[batchSlot].setSkipDecoding(); + } + else + { + isAccepted = true; + } } - - finishedState = cummulativeSkipDecoding ? FinishedState::skipDecoding() : FinishedState::empty(); - cummulativeSkipDecoding |= localSkipDecoding; + else + { + // Check if draft tokens are the same as target tokens + isAccepted = targetOutputIds[batchSlot] == draftOutputTokenId; + if (!isAccepted) + { + finishedOutput[batchSlot].setSkipDecoding(); + } + } + } + else + { + isAccepted = false; + finishedOutput[batchSlot].setSkipDecoding(); } + batchIsAccepted[batchSlot] = isAccepted; } + __syncthreads(); - if (numAcceptedTokens < numDraftTokens) + if (!isAccepted) { - auto const logitsIdx = (batchSlot * maxDraftTokens + numAcceptedTokens) * beamWidth * vocabSize; - auto const draftProbBatch = draftProbs + logitsIdx; - auto targetProbBatch = targetProbs + logitsIdx; - auto targetLogitsBatch = targetLogits[bid] + numAcceptedTokens * beamWidth * vocabSize; - - float sumProbs = 0.f; - for (SizeType32 vIdx = static_cast(threadIdx.x); vIdx < vocabSize; - vIdx += static_cast(blockDim.x)) + T const zeroVal = static_cast(0.0f); + T sumVal = zeroVal; + for (SizeType32 vIdx = tid; vIdx < vocabSize; vIdx += static_cast(blockDim.x)) { - auto const correctedProb = max(static_cast(targetProbBatch[vIdx] - draftProbBatch[vIdx]), 0.f); - sumProbs += correctedProb; - targetProbBatch[vIdx] = correctedProb; + targetProbsBatch[vIdx] + -= (draftTokenIdx < numDraftTokens && useDraftLogits) ? draftProbsBatch[vIdx] : zeroVal; + targetProbsBatch[vIdx] = targetProbsBatch[vIdx] >= zeroVal ? 
targetProbsBatch[vIdx] : zeroVal; + sumVal += targetProbsBatch[vIdx]; } - - __shared__ float sumProbsShared; - sumProbs = blockReduceSum((float) sumProbs); - if (threadIdx.x == 0) + sumVal = blockReduceSum(sumVal); + if (tid == 0) { - sumProbsShared = max(sumProbs, 1e-6f); + sSumVal = sumVal; } __syncthreads(); - for (SizeType32 vIdx = static_cast(threadIdx.x); vIdx < vocabSize; - vIdx += static_cast(blockDim.x)) + for (SizeType32 vIdx = tid; vIdx < vocabSize; vIdx += static_cast(blockDim.x)) { - auto const correctedNormProb = static_cast(targetProbBatch[vIdx]) / sumProbsShared; - targetLogitsBatch[vIdx] = __logf(correctedNormProb / (1.f - correctedNormProb)); + targetProbsBatch[vIdx] /= sSumVal; } } } + +__global__ void forwardAcceptedTokensKernel(SizeType32 batchSize, SizeType32 const* batchSlots, bool* batchIsAccepted, + SizeType32* sequenceLengths, TokenIdType const* draftIds, TokenIdType** idsPtrs, SizeType32 step, + SizeType32 maxDraftTokens, TokenIdType const* endIds, FinishedState* finishedOutput) +{ + auto index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + for (SizeType32 bi = index; bi < batchSize; bi += static_cast(gridDim.x * blockDim.x)) + { + auto const batchSlot = batchSlots[bi]; + if (batchIsAccepted[batchSlot] && !finishedOutput[batchSlot].isSkipDecoding()) + { + auto const curSeqLen = sequenceLengths[batchSlot]; + auto const draftTokenIdx = step; + auto const draftOutputTokenId = draftIds[batchSlot * maxDraftTokens + draftTokenIdx]; + auto* outputIdsRequestPtr = idsPtrs[batchSlot]; + auto const outIdx = curSeqLen; + outputIdsRequestPtr[outIdx] = draftOutputTokenId; + if (outputIdsRequestPtr[outIdx] == endIds[batchSlot]) + { + finishedOutput[batchSlot].setFinishedEOS(); + // Do not increase seq len when EOS is generated. 
Seq len should always contain only tokens to be + // outputted + } + else + { + // We don't need to set output finished state as it is assumed to be in non finished state + sequenceLengths[batchSlot] += 1; + } + } + } +} // namespace + } // namespace template -void acceptDraftTokensByLogits(T* draftLogits, T** targetLogits, T* draftProbs, T* targetProbs, - SizeType32 const* numsDraftTokens, FinishedState* finished, curandState_t* curandState, - SizeType32 const* batchSlots, SizeType32 batchSize, SizeType32 maxBatchSize, SizeType32 beamWidth, - SizeType32 vocabSize, SizeType32 vocabSizePadded, SizeType32 maxDraftTokens, bool randomThreshold, - float constantThreshold, cudaStream_t stream) +void invokeMaskTargetLogits(SizeType32 batchSize, T* targetLogits, SizeType32 const* batchSlots, SizeType32 beamWidth, + SizeType32 vocabSizePadded, FinishedState const* finishedInput, SizeType32 maxBatchSize, + bool const* batchUseDraftLogits, SizeType32* outputIdsAfterSampling, SizeType32* targetOutputIds, + SizeType32* runtimeTopKDevicePtr, bool* maskBuffer, cudaStream_t stream) { + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); TLLM_CHECK(beamWidth == 1); - { - invokeAddBiasSoftMax(draftLogits, static_cast(nullptr), draftProbs, static_cast(nullptr), nullptr, - finished, batchSlots, batchSize, maxBatchSize, beamWidth * maxDraftTokens, vocabSize, vocabSizePadded, - /* skip softmax */ false, - /* batchSlotLogits */ true, stream); - invokeAddBiasSoftMax(static_cast(nullptr), targetLogits, targetProbs, static_cast(nullptr), nullptr, - finished, batchSlots, batchSize, maxBatchSize, beamWidth * maxDraftTokens, vocabSize, vocabSizePadded, - /* skip softmax */ false, - /* batchSlotLogits */ true, stream); - } { dim3 block(1024); - dim3 grid(batchSize * beamWidth, maxDraftTokens); - acceptDraftTokensByLogitsKernel<<>>(draftProbs, targetProbs, numsDraftTokens, finished, - curandState, batchSlots, batchSize, maxBatchSize, maxDraftTokens, beamWidth, vocabSizePadded, - randomThreshold, constantThreshold); + dim3 grid(batchSize * beamWidth); + maskTargetLogitsKernel<<>>(targetLogits, batchSlots, beamWidth, vocabSizePadded, + finishedInput, maxBatchSize, batchUseDraftLogits, outputIdsAfterSampling, targetOutputIds, + runtimeTopKDevicePtr, maskBuffer); } + sync_check_cuda_error(); + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +template +void invokeAcceptDraftTokens(SizeType32 batchSize, T* draftProbs, T* targetProbs, SizeType32 const* numsDraftTokens, + bool const* batchUseDraftLogits, TokenIdType const* draftIds, FinishedState const* finishedInput, + FinishedState* finishedOutput, curandState_t* curandState, SizeType32 const* batchSlots, SizeType32 maxDraftTokens, + SizeType32 beamWidth, SizeType32 vocabSizePadded, bool randomThreshold, float constantThreshold, SizeType32 step, + bool* batchIsAccepted, SizeType32* targetOutputIds, cudaStream_t stream) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + TLLM_CHECK(beamWidth == 1); { dim3 block(1024); dim3 grid(batchSize * beamWidth); - correctAcceptedStatesAndLogits<<>>(draftProbs, targetProbs, targetLogits, - numsDraftTokens, finished, batchSlots, batchSize, maxBatchSize, maxDraftTokens, beamWidth, vocabSizePadded); + acceptDraftTokensKernel<<>>(draftProbs, targetProbs, numsDraftTokens, + batchUseDraftLogits, draftIds, finishedInput, finishedOutput, curandState, batchSlots, maxDraftTokens, + beamWidth, vocabSizePadded, randomThreshold, constantThreshold, step, batchIsAccepted, targetOutputIds); } + sync_check_cuda_error(); + TLLM_LOG_TRACE("%s stop", 
__PRETTY_FUNCTION__); } -template void acceptDraftTokensByLogits(float* draftLogits, float** targetLogits, float* draftProbs, float* targetProbs, - SizeType32 const* numsDraftTokens, FinishedState* finished, curandState_t* curandState, - SizeType32 const* batchSlots, SizeType32 batchSize, SizeType32 maxBatchSize, SizeType32 beamWidth, - SizeType32 vocabSize, SizeType32 vocabSizePadded, SizeType32 maxDraftTokens, bool randomThreshold, - float constantThreshold, cudaStream_t stream); -template void acceptDraftTokensByLogits(half* draftLogits, half** targetLogits, half* draftProbs, half* targetProbs, - SizeType32 const* numsDraftTokens, FinishedState* finished, curandState_t* curandState, - SizeType32 const* batchSlots, SizeType32 batchSize, SizeType32 maxBatchSize, SizeType32 beamWidth, - SizeType32 vocabSize, SizeType32 vocabSizePadded, SizeType32 maxDraftTokens, bool randomThreshold, - float constantThreshold, cudaStream_t stream); +template void invokeMaskTargetLogits(SizeType32 batchSize, float* targetLogits, SizeType32 const* batchSlots, + SizeType32 beamWidth, SizeType32 vocabSizePadded, FinishedState const* finishedInput, SizeType32 maxBatchSize, + bool const* batchUseDraftLogits, SizeType32* outputIdsAfterSampling, SizeType32* targetOutputIds, + SizeType32* runtimeTopKDevicePtr, bool* maskBuffer, cudaStream_t stream); +template void invokeMaskTargetLogits(SizeType32 batchSize, half* targetLogits, SizeType32 const* batchSlots, + SizeType32 beamWidth, SizeType32 vocabSizePadded, FinishedState const* finishedInput, SizeType32 maxBatchSize, + bool const* batchUseDraftLogits, SizeType32* outputIdsAfterSampling, SizeType32* targetOutputIds, + SizeType32* runtimeTopKDevicePtr, bool* maskBuffer, cudaStream_t stream); + +template void invokeAcceptDraftTokens(SizeType32 batchSize, float* draftProbs, float* targetProbs, + SizeType32 const* numsDraftTokens, bool const* batchUseDraftLogits, TokenIdType const* draftIds, + FinishedState const* finishedInput, FinishedState* finishedOutput, curandState_t* curandState, + SizeType32 const* batchSlots, SizeType32 maxDraftTokens, SizeType32 beamWidth, SizeType32 vocabSizePadded, + bool randomThreshold, float constantThreshold, SizeType32 step, bool* batchIsAccepted, SizeType32* targetOutputIds, + cudaStream_t stream); +template void invokeAcceptDraftTokens(SizeType32 batchSize, half* draftProbs, half* targetProbs, + SizeType32 const* numsDraftTokens, bool const* batchUseDraftLogits, TokenIdType const* draftIds, + FinishedState const* finishedInput, FinishedState* finishedOutput, curandState_t* curandState, + SizeType32 const* batchSlots, SizeType32 maxDraftTokens, SizeType32 beamWidth, SizeType32 vocabSizePadded, + bool randomThreshold, float constantThreshold, SizeType32 step, bool* batchIsAccepted, SizeType32* targetOutputIds, + cudaStream_t stream); +void invokeForwardAcceptedTokens(SizeType32 batchSize, SizeType32 const* batchSlots, bool* batchIsAccepted, + SizeType32* outputSequenceLengths, TokenIdType const* draftIds, TokenIdType** idsPtrs, SizeType32 step, + SizeType32 maxDraftTokens, TokenIdType const* endIds, FinishedState* finishedOutput, cudaStream_t stream) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + dim3 block(std::min(static_cast(batchSize), 256u)); + dim3 grid(divUp(static_cast(batchSize), block.x)); + forwardAcceptedTokensKernel<<>>(batchSize, batchSlots, batchIsAccepted, + outputSequenceLengths, draftIds, idsPtrs, step, maxDraftTokens, endIds, finishedOutput); + sync_check_cuda_error(); + TLLM_LOG_TRACE("%s stop", 
__PRETTY_FUNCTION__); +} } // namespace tensorrt_llm::kernels::speculative_decoding diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.h b/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.h index 4921e1390..69ee81e40 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.h +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.h @@ -26,84 +26,77 @@ namespace tensorrt_llm::kernels::speculative_decoding { -//! \brief Accepts or rejects draft tokens based on the equality of draft and target tokens -//! for speculative decoding. Target token is accepted if targetToken == draftToken. -//! If number of accepted tokens N < maxDraftTokens, then function accepts N + 1 tokens of target model. -//! sequenceLengths, finishedSum and finishedFinal are modified accordingly. -//! -//! \param draftIds input buffer [batchSize, maxDraftTokens]. -//! Indices of the draft tokens. -//! \param targetIds input buffer [batchSize, maxSeqLen]. Indices of the tokens decoded by the target model -//! \param contextLengths input buffer [batchSize]. Context lengths of the requests without draft tokens -//! \param numsDraftTokens input buffer [batchSize]. Number of draft tokens per request -//! \param sequenceLengths input/output buffer [batchSize] sequence lengths of the requests in batch -//! Modified in-place according to the accepted/rejected tokens -//! \param finished input buffer [maxDraftTokens + 1, batchSize] finished states at each decoding iteration -//! \param finishedFinal output buffer [batchSize] finished states after accepting/rejecting tokens -//! \param finishedSum output buffer [1] total number of requests in batch that finished the execution -//! \param batchSlots input buffer [batchSize], address map from local index -//! to global index [0, batchSize] -> [0, maxBatchSize] -//! \param batchSize current batch size -//! \param maxBatchSize maximum batch size -//! \param beamWidth beam width -//! \param maxSeqLen maximum sequence length -//! \param maxDraftTokens maximum number of draft tokens -//! \param stream stream -void invokeAcceptDraftTokensByIds(runtime::TokenIdType const* draftIds, runtime::TokenIdType const* targetIds, - runtime::SizeType32 const* contextLengths, runtime::SizeType32 const* numsDraftTokens, - runtime::SizeType32* sequenceLengths, FinishedState const* finished, FinishedState* finishedFinal, - runtime::SizeType32* finishedSum, runtime::SizeType32 const* batchSlots, runtime::SizeType32 batchSize, - runtime::SizeType32 maxBatchSize, runtime::SizeType32 beamWidth, runtime::SizeType32 maxSeqLen, - runtime::SizeType32 maxDraftTokens, cudaStream_t stream); - -//! \brief Performs probabilistic acceptance of draft tokens based on their probability distributions. -//! Corrects targetLogits for the next to the last accepted token +//! \brief Accepts or rejects draft tokens based on their probability distributions or the equality of draft and target +//! tokens. Corrects targetLogits for the last accepted token //! according to https://openreview.net/pdf?id=C9NEblP8vS //! -//! \param draftLogits input/output buffer [draftTokens, batchSize, beamWidth, vocabSize]. -//! Initially contains token logits of the draft model. -//! \param targetLogits input/output buffer [batchSize][draftTokens+1, beamWidth, vocabSize]. -//! Vector of pointers to the logits. -//! Initially contains token logits of the target model. -//! It is modified in-place for next to the last accepted token such as -//! 
P'(x) = norm(max(0, P_{n+1}(x) - Q_{n+1}(x))), where N < maxDraftTokens is number of accepted tokens. +//! \param batchSize current batch size //! \param draftProbs output buffer [maxDraftTokens, batchSize, beamWidth, vocabSize]. //! Workspace buffer for token probabilities of the draft model. //! \param targetProbs output buffer [maxDraftTokens+1, batchSize, beamWidth, vocabSize]. //! Workspace buffer for token probabilities of the target model. //! \param numsDraftTokens input buffer [batchSize]. Number of draft tokens per request -//! \param finished output buffer [draftTokens, batchSize, beamWidth]. -//! At each step sets to NOT_FINISHED if token is accepted or SKIP_DECODING if token is not accepted -//! \param curandState input buffer [batchSize]. Curand states properly -//! initialized using invokeCurandInitialize per request. -//! \param batchSlots input buffer [batchSize], address map from local index -//! to global index [0, batchSize] -> [0, maxBatchSize] -//! \param batchSize current batch size -//! \param maxBatchSize maximum batch size -//! \param beamWidth beam width -//! \param vocabSize unpadded vocab size -//! \param vocabSizePadded padded vocab size +//! \param batchUseDraftLogits input buffer [batchSize]. Acceptance logic using draft logits or not, per request +//! \param draftIds input buffer [batchSize, draftTokens]. Pointer to draft token ids. +//! \param finishedInput input buffer [batchSize, beamWidth]. +//! \param finishedOutput output buffer [batchSize, beamWidth]. At each step sets SKIP_DECODING if token is not +//! accepted. +//! \param curandState input buffer [batchSize]. Curand states properly initialized using invokeCurandInitialize +//! per request. +//! \param batchSlots input buffer [batchSize], address map from local index to global index [0, batchSize] -> +//! [0, maxBatchSize]. //! \param maxDraftTokens maximum number of draft tokens +//! \param beamWidth beam width (only beamWidth == 1 supported) +//! \param vocabSizePadded padded vocab size //! \param randomThreshold True if use uniformly sampled threshold for token acceptance //! \param constantThreshold threshold used to accept tokens if randomThreshold is false +//! \param step The current step of decoding (draft token id index) +//! \param batchIsAccepted output buffer [batchSize]. Stores acceptance result for multinomial sampling later or +//! forwarding next step. +//! \param targetOutputIds input/output buffer [batchSize]. Stores target sampling output ids for acceptById +//! logics. //! 
\param stream stream template -void acceptDraftTokensByLogits(T* draftLogits, T** targetLogits, T* draftProbs, T* targetProbs, - runtime::SizeType32 const* numsDraftTokens, FinishedState* finished, curandState_t* curandState, - runtime::SizeType32 const* batchSlots, runtime::SizeType32 batchSize, runtime::SizeType32 maxBatchSize, - runtime::SizeType32 beamWidth, runtime::SizeType32 vocabSize, runtime::SizeType32 vocabSizePadded, - runtime::SizeType32 maxDraftTokens, bool randomThreshold, float constantThreshold, cudaStream_t stream); +void invokeAcceptDraftTokens(runtime::SizeType32 batchSize, T* draftProbs, T* targetProbs, + runtime::SizeType32 const* numsDraftTokens, bool const* batchUseDraftLogits, runtime::TokenIdType const* draftIds, + FinishedState const* finishedInput, FinishedState* finishedOutput, curandState_t* curandState, + runtime::SizeType32 const* batchSlots, runtime::SizeType32 maxDraftTokens, runtime::SizeType32 beamWidth, + runtime::SizeType32 vocabSizePadded, bool randomThreshold, float constantThreshold, runtime::SizeType32 step, + bool* batchIsAccepted, runtime::SizeType32* targetOutputIds, cudaStream_t stream); -struct Candidate // Hold probability maximum and rate of target / dfraft, used in `acceptDraftTokensByLogits` -{ - float maxProb{0.f}; - float rateQP{0.f}; -}; +//! \brief Mask the target logits with -inf for unselected topK/topP token ids. +//! according to +//! https://github.com/huggingface/transformers/blob/2e24ee4dfa39cc0bc264b89edbccc373c8337086/src/transformers/generation/utils.py#L4064 +//! +//! \param batchSize current batch size +//! \param targetLogits input/output buffer [batchSize][draftTokens+1, beamWidth, vocabSize]. +//! Vector of pointers to the logits. (beamWidth == 1) +//! Initially contains token logits of the target model. +//! \param batchSlots input buffer [batchSize], address map from local index to global index [0, batchSize] -> +//! [0, maxBatchSize]. +//! \param beamWidth beam width (only beamWidth == 1 supported) +//! \param vocabSizePadded padded vocab size +//! \param finishedInput input buffer [batchSize, beamWidth]. +//! \param maxBatchSize maximum batch size +//! \param batchUseDraftLogits input buffer [batchSize]. Acceptance logic using draft logits or not, per request +//! \param outputIdsAfterSampling input buffer [batchSize, vocabSize]. Stores all selected IDs from sampling for +//! masking. +//! \param targetOutputIds input/output buffer [batchSize]. Stores target sampling output ids for acceptById +//! logics. +//! \param numsDraftTokens input buffer [batchSize]. Number of draft tokens per request +//! \param runtimeTopKDevicePtr input buffer [batchSize] the topks in sampling step, for porting topK ids out. +//! \param maskBuffer input buffer [batchSize, vocabSize] for masking calculation (index value to position). +//! \param stream stream +template +void invokeMaskTargetLogits(runtime::SizeType32 batchSize, T* targetLogits, runtime::SizeType32 const* batchSlots, + runtime::SizeType32 beamWidth, runtime::SizeType32 vocabSizePadded, FinishedState const* finishedInput, + runtime::SizeType32 maxBatchSize, bool const* batchUseDraftLogits, runtime::SizeType32* outputIdsAfterSampling, + runtime::SizeType32* targetOutputIds, runtime::SizeType32* runtimeTopKDevicePtr, bool* maskBuffer, + cudaStream_t stream); -__device__ __forceinline__ Candidate reduce_op(Candidate const& a, Candidate const& b) -{ - // Max-reduce operator of Candidate - return (a.maxProb > b.maxProb) ? 
a : b; -} +void invokeForwardAcceptedTokens(runtime::SizeType32 batchSize, runtime::SizeType32 const* batchSlots, + bool* batchIsAccepted, runtime::SizeType32* outputSequenceLengths, runtime::TokenIdType const* draftIds, + runtime::TokenIdType** idsPtrs, runtime::SizeType32 step, runtime::SizeType32 maxDraftTokens, + runtime::TokenIdType const* endIds, FinishedState* finishedOutput, cudaStream_t stream); } // namespace tensorrt_llm::kernels::speculative_decoding diff --git a/cpp/tensorrt_llm/kernels/stopCriteriaKernels.h b/cpp/tensorrt_llm/kernels/stopCriteriaKernels.h index 0ba522222..f60ac784e 100644 --- a/cpp/tensorrt_llm/kernels/stopCriteriaKernels.h +++ b/cpp/tensorrt_llm/kernels/stopCriteriaKernels.h @@ -73,7 +73,7 @@ void invokeLengthCriterion(FinishedState* finished, runtime::SizeType32* finishe runtime::SizeType32* numNewTokens, runtime::SizeType32 const* batchSlots, runtime::SizeType32 batchSize, runtime::SizeType32 beamWidth, cudaStream_t stream); -//! \brief Sets finished states based on the endIds and ajusts sequence length to length before the first EOS token. +//! \brief Sets finished states based on the endIds and adjusts sequence length to length before the first EOS token. //! Does not support beamWidth > 1 for now. //! //! \param outputIds input buffer [maxBatchSize][beamWidth, maxSeqLen]. diff --git a/cpp/tensorrt_llm/layers/decodingLayer.cpp b/cpp/tensorrt_llm/layers/decodingLayer.cpp index 1d91a626b..7e5c75964 100644 --- a/cpp/tensorrt_llm/layers/decodingLayer.cpp +++ b/cpp/tensorrt_llm/layers/decodingLayer.cpp @@ -19,6 +19,7 @@ #include "tensorrt_llm/layers/beamSearchLayer.h" #include "tensorrt_llm/layers/decodingParams.h" #include "tensorrt_llm/layers/explicitDraftTokensLayer.h" +#include "tensorrt_llm/layers/externalDraftTokensLayer.h" #include "tensorrt_llm/layers/layerUtils.h" #include "tensorrt_llm/layers/lookaheadDecodingLayer.h" #include "tensorrt_llm/layers/medusaDecodingLayer.h" @@ -96,6 +97,10 @@ DecodingLayer::DecodingLayer(executor::DecodingMode const& mode, DecoderDomai { mDecodingLayer = std::make_unique>(decoderDomain, mBufferManager); } + else if (mDecodingMode.isExternalDraftTokens()) + { + mDecodingLayer = std::make_unique>(mDecodingMode, decoderDomain, mBufferManager); + } else { TLLM_CHECK_WITH_INFO(false, @@ -144,6 +149,12 @@ void DecodingLayer::setup(SizeType32 batchSize, SizeType32 beamWidth, TensorC beamWidth == 1, "Decoding mode is ExplicitDraftTokens, but beamWidth != 1 (%d != 1)", beamWidth); mDecodingLayer->setup(batchSize, beamWidth, batchSlots, setupParams->decodingParams, workspace); } + else if (mDecodingMode.isExternalDraftTokens()) + { + TLLM_CHECK_WITH_INFO( + beamWidth == 1, "Decoding mode is external draft tokens, but beamWidth != 1 (%d != 1)", beamWidth); + mDecodingLayer->setup(batchSize, beamWidth, batchSlots, setupParams->decodingParams, workspace); + } else { TLLM_CHECK_WITH_INFO(false, @@ -249,6 +260,45 @@ std::tuple, std::shared_ptr(baseInputs); + auto const ite = externalDraftTokenParams->ite; + auto const step = externalDraftTokenParams->step; + auto const localBatchSize = static_cast(externalDraftTokenParams->localBatchSize); + + TLLM_CHECK_WITH_INFO(localDecoderDomain.getBeamWidth() == 1, + "Decoding mode is TopK and/or TopP, but beamWidth != 1 (%d != 1)", localDecoderDomain.getBeamWidth()); + + // In sampling, we have supported batch sampling. So, we always compute all + // sentences once. 
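The draft-token fields wired up below feed invokeMaskTargetLogits and invokeAcceptDraftTokens declared earlier. Under the behaviour visible in acceptDraftTokensKernel, the per-token decision reduces to the hedged sketch below (an illustrative free function, not the layer API); on rejection the kernel additionally rewrites targetProbs to the normalized residual max(0, p - q) so a replacement token can be sampled.

// With draft logits, a draft token is kept when p_target / p_draft clears the
// (possibly random) acceptance threshold; without draft logits, it is kept only
// when the target's sampled id equals the draft token id.
bool acceptDraftToken(bool useDraftLogits, float targetProb, float draftProb, float threshold,
    int targetSampledId, int draftTokenId)
{
    if (useDraftLogits)
    {
        return targetProb / draftProb >= threshold;
    }
    return targetSampledId == draftTokenId;
}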
+ TensorConstPtr logitsSlice = ITensor::slice(*externalDraftTokenParams->logits, 0, localBatchSize); + TensorConstPtr endIdSlice = ITensor::slice(endIds, 0, localBatchSize); + auto decodeInputs = std::make_shared( + endIdSlice, externalDraftTokenParams->batchSlots, step, ite, localBatchSize); + + decodeInputs->finished = externalDraftTokenParams->finished; + + decodeInputs->logits = logitsSlice; + + if (externalDraftTokenParams->inputLengths) + { + auto& inputLengths = externalDraftTokenParams->inputLengths.value(); + decodeInputs->inputLengths = ITensor::slice(inputLengths, 0, localBatchSize); + } + decodeInputs->draftLogits = externalDraftTokenParams->draftLogits; + decodeInputs->draftProbs = externalDraftTokenParams->draftProbs; + decodeInputs->targetProbs = externalDraftTokenParams->targetProbs; + decodeInputs->numDraftTokens = externalDraftTokenParams->numDraftTokens; + decodeInputs->draftTokenIds = externalDraftTokenParams->draftTokenIds; + decodeInputs->constantThreshold = externalDraftTokenParams->constantThreshold; + decodeInputs->useRandomAcceptanceThreshold = externalDraftTokenParams->useRandomAcceptanceThreshold; + decodeInputs->step = externalDraftTokenParams->step; + decodeInputs->useDraftLogits = externalDraftTokenParams->useDraftLogits; + + preparedInputs = decodeInputs; + preparedOutputs = baseOutputs; + } else { TLLM_CHECK_WITH_INFO(false, diff --git a/cpp/tensorrt_llm/layers/decodingLayer.h b/cpp/tensorrt_llm/layers/decodingLayer.h index 78cd6b1b5..60780851f 100644 --- a/cpp/tensorrt_llm/layers/decodingLayer.h +++ b/cpp/tensorrt_llm/layers/decodingLayer.h @@ -45,7 +45,7 @@ class DecodingLayer : public BaseLayer std::shared_ptr const& inputs, std::shared_ptr const& workspace) override; - //! \brief Calls forwardSync of configired decoding layer. + //! \brief Calls forwardSync of configured decoding layer. void forwardSync(std::shared_ptr const& outputs, std::shared_ptr const& inputs, std::shared_ptr const& workspace) override; diff --git a/cpp/tensorrt_llm/layers/decodingParams.h b/cpp/tensorrt_llm/layers/decodingParams.h index 07d200704..40dbbba1f 100644 --- a/cpp/tensorrt_llm/layers/decodingParams.h +++ b/cpp/tensorrt_llm/layers/decodingParams.h @@ -212,6 +212,13 @@ struct LookaheadSetupParams : public DecodingSetupParams TensorPtr attentionPackedMasks; }; +class ExternalDraftTokensSetupParams : public DecodingSetupParams +{ +public: + std::optional> runtimeTopK; // [1] or [setupBatchSize] on cpu + std::optional> runtimeTopP; // [1] or [setupBatchSize] on cpu +}; + class BaseDecodingInputs { public: @@ -331,6 +338,33 @@ class SamplingInputs : public DecodingInputs bool probsComputed{}; }; +class ExternalDraftTokensInputs : public DecodingInputs +{ +public: + explicit ExternalDraftTokensInputs(TensorConstPtr endIds, TensorConstPtr batchSlots, runtime::SizeType32 step, + runtime::SizeType32 ite, runtime::SizeType32 localBatchSize) + : DecodingInputs{std::move(endIds), std::move(batchSlots), step, ite, localBatchSize} + { + } + + TensorPtr draftLogits; + TensorPtr draftProbs; + TensorPtr targetProbs; + TensorPtr numDraftTokens; + TensorPtr draftTokenIds; + TensorPtr useDraftLogits; + runtime::SizeType32 step; + float constantThreshold; + bool useRandomAcceptanceThreshold; + + //! optional parameters + //! [localBatchSize] + curandState_t* curandStates{}; + + //! 
Flag to mark that logits tensor contains probabilities + bool probsComputed{}; +}; + // Medusa inputs class MedusaDecodingInputs : public DecodingInputs { @@ -477,7 +511,7 @@ class BeamSearchOutputs : public BaseDecodingOutputs //! {c'} is always accepted and {x', z'} is supposed to be accepted. //! The accepted tokens [c', x', z'] is saved in `outputIds` in-place, starting from `sequenceLength`. //! The `acceptedLength` is 3, and the accepted draft tokens length is 2. -//! `sequenceLength` is also increaded by `acceptedLength` in-place. +//! `sequenceLength` is also increased by `acceptedLength` in-place. //! The pathsOffset is {0, 1, 3} for {c', x', z'}. //! [] for accepted, <> for draft, {} for input/output. //! diff --git a/cpp/tensorrt_llm/layers/externalDraftTokensLayer.cpp b/cpp/tensorrt_llm/layers/externalDraftTokensLayer.cpp new file mode 100644 index 000000000..5f29a9a13 --- /dev/null +++ b/cpp/tensorrt_llm/layers/externalDraftTokensLayer.cpp @@ -0,0 +1,514 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "externalDraftTokensLayer.h" +#include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/kernels/decodingCommon.h" +#include "tensorrt_llm/kernels/samplingTopKKernels.h" +#include "tensorrt_llm/kernels/samplingTopPKernels.h" +#include "tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.h" +#include "tensorrt_llm/layers/defaultDecodingParams.h" +#include "tensorrt_llm/layers/layerUtils.h" +#include "tensorrt_llm/runtime/runtimeKernels.h" + +#include + +namespace tksd = tensorrt_llm::kernels::speculative_decoding; + +using namespace tensorrt_llm::common; +using namespace tensorrt_llm::kernels; +using namespace tensorrt_llm::runtime; + +namespace tensorrt_llm::layers +{ + +template +ExternalDraftTokensLayer::ExternalDraftTokensLayer(executor::DecodingMode const& mode, + DecoderDomain const& decoderDomain, std::shared_ptr bufferManager) + : BaseLayer(decoderDomain, bufferManager) + , mDecodingMode(mode) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + + TLLM_CHECK_WITH_INFO(!mDecodingMode.isBeamSearch(), "ExternalDraftTokensLayer does not support Beam search mode"); + + allocateBuffer(decoderDomain.getBatchSize()); + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +template +void ExternalDraftTokensLayer::allocateBuffer(SizeType32 batchSize) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + + // top k workspace size + auto workspaceSize = getTopKWorkspaceSize(batchSize, 1, TOP_K_MAX, mDecoderDomain.getVocabSizePadded()); + mWorkspaceSize = std::max(workspaceSize, mWorkspaceSize); + // top p workspace size + workspaceSize = getTopPWorkspaceSize(batchSize, mDecoderDomain.getVocabSizePadded()); + mWorkspaceSize = std::max(workspaceSize, mWorkspaceSize); + // multinomial (top p == 1) workspace size + workspaceSize = getTopPWorkspaceSize(batchSize, mDecoderDomain.getVocabSizePadded()); + mWorkspaceSize = std::max(workspaceSize, mWorkspaceSize); + + 
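As a usage sketch for the ExternalDraftTokensSetupParams declared above (assuming the surrounding TensorRT-LLM headers; the layer, batch-slot tensor and decoding workspace are taken as given and their names are illustrative, and the element types SizeType32/float are assumed from the sampling parameters elsewhere), the top-k/top-p vectors live on the CPU and hold either a single broadcast value or one value per request:

    auto setupParams = std::make_shared<ExternalDraftTokensSetupParams>();
    setupParams->runtimeTopK = std::vector<runtime::SizeType32>{4, 1, 8}; // [setupBatchSize]
    setupParams->runtimeTopP = std::vector<float>{0.9f, 0.0f, 0.8f};      // [setupBatchSize]
    // beamWidth must be 1 for external draft tokens.
    externalDraftTokensLayer->setup(/*batchSize=*/3, /*beamWidth=*/1, batchSlots, setupParams, workspace);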
// batchsize here is maxBatchSize + auto const batchSizeShape = ITensor::makeShape({batchSize}); + + mCurandStatesDevice + = mBufferManager->gpu(ITensor::makeShape({batchSize, sizeof(curandState_t)}), TRTDataType::value); + mBatchIsAccepted = mBufferManager->gpu(batchSizeShape, TRTDataType::value); + mRuntimeMultinomialDevice = mBufferManager->gpu(batchSizeShape, TRTDataType::value); + + // host buffers. + mSkipTopKDecodeDevice = mBufferManager->gpu(batchSizeShape, TRTDataType::value); + mSkipTopKDecodeHost = mBufferManager->pinnedPool(batchSizeShape, TRTDataType::value); + mSkipTopPDecodeDevice = mBufferManager->gpu(batchSizeShape, TRTDataType::value); + mSkipTopPDecodeHost = mBufferManager->pinnedPool(batchSizeShape, TRTDataType::value); + auto skipTopPDecodeHostRange = BufferRange(*mSkipTopPDecodeHost); + std::fill(skipTopPDecodeHostRange.begin(), skipTopPDecodeHostRange.end(), true); + + mOutputIdsAfterSampling = mBufferManager->gpu( + ITensor::makeShape({batchSize, mDecoderDomain.getVocabSizePadded()}), TRTDataType::value); + mTargetOutputIds = mBufferManager->gpu(ITensor::makeShape({batchSize}), TRTDataType::value); + + mRuntimeTopKDevice = mBufferManager->gpu(batchSizeShape, TRTDataType::value); + + mRuntimeTopPForTopKDevice = mBufferManager->gpu(batchSizeShape, TRTDataType::value); + + mRuntimeTopPDevice = mBufferManager->gpu(batchSizeShape, TRTDataType::value); + mInitialTopPDevice = mBufferManager->gpu(batchSizeShape, TRTDataType::value); + + mMaskBuffer = mBufferManager->gpu( + ITensor::makeShape({batchSize, mDecoderDomain.getVocabSizePadded()}), TRTDataType::value); + + mSetupWorkspaceSize = std::max({mBatchIsAccepted->getSizeInBytes(), mRuntimeMultinomialDevice->getSizeInBytes(), + mSkipTopKDecodeDevice->getSizeInBytes(), mSkipTopPDecodeDevice->getSizeInBytes(), + mOutputIdsAfterSampling->getSizeInBytes(), mTargetOutputIds->getSizeInBytes(), + mRuntimeTopKDevice->getSizeInBytes(), mRuntimeTopPForTopKDevice->getSizeInBytes(), + mRuntimeTopPDevice->getSizeInBytes(), mInitialTopPDevice->getSizeInBytes(), mMaskBuffer->getSizeInBytes()}); + + mTargetLogits = mBufferManager->gpu( + ITensor::makeShape({batchSize, mDecoderDomain.getVocabSizePadded()}), TRTDataType::value); + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +template +void ExternalDraftTokensLayer::setup(SizeType32 batchSize, SizeType32 beamWidth, TensorConstPtr batchSlots, + std::shared_ptr const& baseSetupParams, + std::shared_ptr const& workspace) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + + auto setupParams = std::dynamic_pointer_cast(baseSetupParams); + + workspace->initializeDeviceCurandStates( + setupParams->randomSeed, batchSize, workspace->getDeviceBatchSlots(), mCurandStatesDevice); + + auto const* batchSlotsDevicePtr = workspace->getDeviceBatchSlotsPtr(); + auto& runtimeMultinomialDeviceTensor = const_cast(*mRuntimeMultinomialDevice); + tensorrt_llm::runtime::kernels::invokeFill(runtimeMultinomialDeviceTensor, 1.0f, mBufferManager->getStream()); + + auto* runtimeTopKDevicePtr = bufferCastOrNull(mRuntimeTopKDevice); + + // Prepare runtime top K + auto constexpr defaultTopK = 1u; + auto runtimeTopK = setupParams->runtimeTopK.value_or(std::vector(batchSize, defaultTopK)); + auto const runtimeTopKSize = runtimeTopK.size(); + for (auto& topK : runtimeTopK) + { + if (topK < 0 || topK > TOP_K_MAX) + { + TLLM_LOG_WARNING( + "TopK (%d) is larger than max supported number (%d). 
Clip to max supported number.", topK, TOP_K_MAX); + topK = std::clamp(topK, 0, static_cast(TOP_K_MAX)); + } + } + + if (runtimeTopKSize > 1) + { + TLLM_CHECK_WITH_INFO(runtimeTopK.size() == batchSize, + fmtstr("runtimeTopK.size() (%lu) == batchSize (%d) is not satisfied!", runtimeTopK.size(), batchSize)); + DecodingLayerWorkspace::copyToWorkspace( + *this->mBufferManager, runtimeTopK, workspace->getWorkspaceDeviceBuffer()); + auto* setupWorkspaceDevicePtr = workspace->getWorkspaceDevicePtrAs(); + // fill top ks into runtimeTopKDevice + invokeScatterDecodingParams( + setupWorkspaceDevicePtr, runtimeTopKDevicePtr, batchSlotsDevicePtr, batchSize, getStream()); + } + + // FIXME(nkorobov): monotonically growing + auto const curMaxTopK = *std::max_element(std::begin(runtimeTopK), std::end(runtimeTopK)); + mRuntimeMaxTopK = std::max(mRuntimeMaxTopK, curMaxTopK); + + auto runtimeTopP = setupParams->runtimeTopP.value_or(std::vector{}); + auto const runtimeTopPSize = runtimeTopP.size(); + auto* runtimeTopPForTopKDevicePtr = bufferCastOrNull(mRuntimeTopPForTopKDevice); + auto* runtimeTopPDevicePtr = bufferCastOrNull(mRuntimeTopPDevice); + auto* skipTopPDecodeHostPtr = bufferCastOrNull(mSkipTopPDecodeHost); + + // if no top P, fill topP skip decode to true + if (runtimeTopPSize == 0) + { + auto const* batchSlotsPtr = bufferCast(*batchSlots); + for (SizeType32 bi = 0; bi < batchSize; ++bi) + { + auto const bid = batchSlotsPtr[bi]; + skipTopPDecodeHostPtr[bid] = true; + } + auto skipTopPDecodeHostSlice = IBuffer::slice(mSkipTopPDecodeHost, 0, mDecoderDomain.getBatchSize()); + mBufferManager->copy(*skipTopPDecodeHostSlice, *mSkipTopPDecodeDevice); + } + else + { + for (auto& topP : runtimeTopP) + { + if (topP < 0.f || topP > 1.0f) + { + TLLM_LOG_WARNING("TopP (%f) is out of range ([0.0, 1.0f]). Clip to closest number.", topP); + topP = std::clamp(topP, 0.f, 1.f); + } + } + if (runtimeTopPSize > 1) + { + TLLM_CHECK_WITH_INFO(runtimeTopP.size() == batchSize, + fmtstr("runtimeTopP.size() (%lu) == batchSize (%d) is not satisfied!", runtimeTopP.size(), batchSize)); + DecodingLayerWorkspace::copyToWorkspace( + *this->mBufferManager, runtimeTopP, workspace->getWorkspaceDeviceBuffer()); + auto* setupWorkspaceDevicePtr = workspace->getWorkspaceDevicePtrAs(); + // fill runtime top p device for top k kernel + invokeScatterDecodingParams( + setupWorkspaceDevicePtr, runtimeTopPForTopKDevicePtr, batchSlotsDevicePtr, batchSize, getStream()); + // fill runtime top p device for top p kernel + invokeScatterDecodingParams( + setupWorkspaceDevicePtr, runtimeTopPDevicePtr, batchSlotsDevicePtr, batchSize, getStream()); + } + } + // if no topP, default topP is 0.0f, but in invokeSetupTopKRuntimeArgs, it gets set to 1.0f if k > 0 + auto const topP = (runtimeTopPSize == 0) ? DefaultDecodingParams::getTopP() : runtimeTopP.front(); + + auto* skipTopKDecodeDevicePtr = bufferCastOrNull(mSkipTopKDecodeDevice); + { + dim3 block(std::min(static_cast(batchSize), 256u)); + dim3 grid(divUp(static_cast(batchSize), block.x)); + // support topK up to TOP_K_MAX. 
+ invokeSetupTopKRuntimeArgs(batchSize, curMaxTopK, runtimeTopKDevicePtr, runtimeTopKSize, topP, + runtimeTopPForTopKDevicePtr, runtimeTopPSize, skipTopKDecodeDevicePtr, batchSlotsDevicePtr, getStream()); + } + auto const skipTopKHostDecodeDeviceSlice = ITensor::slice(mSkipTopKDecodeDevice, 0, mDecoderDomain.getBatchSize()); + auto skipTopKDecodeHostSlice = ITensor::slice(mSkipTopKDecodeHost, 0, mDecoderDomain.getBatchSize()); + mBufferManager->copy(*skipTopKHostDecodeDeviceSlice, *skipTopKDecodeHostSlice); + + auto* skipTopPDecodeDevicePtr = bufferCast(*mSkipTopPDecodeDevice); + { + auto* initialTopPDevicePtr = bufferCast(*mInitialTopPDevice); + invokeSetTopPRuntimeArgs(batchSize, curMaxTopK, runtimeTopKDevicePtr, runtimeTopKSize, topP, + runtimeTopPDevicePtr, runtimeTopPSize, skipTopPDecodeDevicePtr, batchSlotsDevicePtr, initialTopPDevicePtr, + getStream()); + } + auto const skipTopPHostDecodeDeviceSlice = ITensor::slice(mSkipTopPDecodeDevice, 0, mDecoderDomain.getBatchSize()); + auto skipTopPDecodeHostSlice = ITensor::slice(mSkipTopPDecodeHost, 0, mDecoderDomain.getBatchSize()); + mBufferManager->copy(*skipTopPHostDecodeDeviceSlice, *skipTopPDecodeHostSlice); + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +template +void ExternalDraftTokensLayer::forwardAsync(std::shared_ptr const& outputs, + std::shared_ptr const& baseInputs, + std::shared_ptr const& workspace) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + + auto inputs = std::dynamic_pointer_cast(baseInputs); + + auto const batchSize = inputs->logits.value()->getDimension<0>(); + + auto const* endIds = bufferCast(*inputs->endIds); + + FinishedState const* finishedInput = (inputs->finished) + ? reinterpret_cast(bufferCast(*inputs->finished.value())) + : nullptr; + + inputs->curandStates = reinterpret_cast(bufferCast(*mCurandStatesDevice)); + inputs->probsComputed = true; + + auto runtimeLogitsPtr = bufferCast(*workspace->getDeviceRuntimeLogits()); + auto logitsPtrsPtr = static_cast(nullptr); + auto biasPtr = static_cast(nullptr); + auto const* batchSlotsPtr = workspace->getDeviceBatchSlotsPtr(); + mBufferManager->copy(runtimeLogitsPtr, *mTargetLogits); + invokeAddBiasSoftMax(runtimeLogitsPtr, logitsPtrsPtr, runtimeLogitsPtr, biasPtr, endIds, finishedInput, + batchSlotsPtr, batchSize, mDecoderDomain.getBatchSize(), /* bw */ 1, mDecoderDomain.getVocabSize(), + mDecoderDomain.getVocabSizePadded(), /*skipSoftMax*/ false, /* batchSlotLogits */ false, getStream()); + + auto const targetTokenIdsShape = (*outputs->outputIds).getShape(); + + // Fill the buffer for selected ids from sampling with zero. -1 will be set as a boundary if topP kernel is required + auto& outputIdsAfterSamplingTensor = const_cast(*mOutputIdsAfterSampling); + tensorrt_llm::runtime::kernels::invokeFill(outputIdsAfterSamplingTensor, 0, mBufferManager->getStream()); + + // The logits from target engine should go through samplings first. + // gptDecoderBatched.cpp is calling dynamic decoder step by step, in this step, dynamic Decoder already forwarded + // PenaltyLayer, BanWordsLayer. For (TopK > 0) && (TopK == 0 && TopP == 0), we invoke TopK sampling kernel. The same + // logic is implemented in SamplingLayer.cpp + getAllTopKs(outputs, baseInputs, workspace); + + // Only for (TopK == 0 && TopP > 0), we invoke TopP sampling + getAllTopPs(outputs, baseInputs, workspace); + + // After all selected tokens are filled in mOutputIdsAfterSampling by topK, topP kernels, token acceptance logics + // starts. 
First we mask the logits of unselected token id to -inf as HF's TopK, TopP implementation. We compute the + // logit probs of draft and target and go through acceptance logics. + acceptDraftTokens(outputs, baseInputs, workspace); + + // If the token of the sequence is not accepted, a multinomial sampling is required for the bonus token. + // Multinomial sampling is achieved through TopP kernel with TopP = 1 and already weighted-sum target logits. + // The acceptance result of each batch is used as skipDecode in topP kernel. If is accepted, no sampling is needed + // (early exit). Forwarding for the next step is also set in this kernel. + multinomialSampling(outputs, baseInputs, workspace); + + // For the sequence with accepted tokens, we simply forward a step. + forwardAcceptedTokens(outputs, baseInputs, workspace); + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +template +size_t ExternalDraftTokensLayer::getWorkspaceSize() const noexcept +{ + return std::max(mWorkspaceSize, mSetupWorkspaceSize); +} + +template +void ExternalDraftTokensLayer::acceptDraftTokens(std::shared_ptr const& outputs, + std::shared_ptr const& baseInputs, + std::shared_ptr const& workspace) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + auto inputs = std::dynamic_pointer_cast(baseInputs); + + auto const draftLogitsShape = (*inputs->draftLogits).getShape(); + auto const maxBatchSize = mDecoderDomain.getBatchSize(); + auto const maxTokensPerStep = draftLogitsShape.d[1]; // 1 + auto const batchSize = inputs->logits.value()->getDimension<0>(); + auto constexpr beamWidth = 1; + + FinishedState const* finishedInput = (inputs->finished) + ? reinterpret_cast(bufferCastOrNull(inputs->finished)) + : nullptr; + + FinishedState* finishedOutput = (outputs->finished) + ? 
reinterpret_cast(bufferCastOrNull(outputs->finished)) + : nullptr; + + tksd::invokeMaskTargetLogits(batchSize, bufferCast(*mTargetLogits), workspace->getDeviceBatchSlotsPtr(), + beamWidth, mDecoderDomain.getVocabSizePadded(), finishedInput, maxBatchSize, + bufferCast(*inputs->useDraftLogits), bufferCast(*mOutputIdsAfterSampling), + bufferCast(*mTargetOutputIds), bufferCastOrNull(mRuntimeTopKDevice), + bufferCast(*mMaskBuffer), getStream()); + + if (inputs->step == 0) + { + invokeAddBiasSoftMax(bufferCast(*inputs->draftLogits), static_cast(nullptr), + bufferCast(*inputs->draftProbs), static_cast(nullptr), nullptr, finishedInput, + workspace->getDeviceBatchSlotsPtr(), batchSize, maxBatchSize, beamWidth * maxTokensPerStep, + mDecoderDomain.getVocabSize(), mDecoderDomain.getVocabSizePadded(), + /* skip softmax */ false, + /* batchSlotLogits */ true, getStream()); + } + + invokeAddBiasSoftMax(bufferCast(*mTargetLogits), static_cast(nullptr), bufferCast(*inputs->targetProbs), + static_cast(nullptr), nullptr, finishedInput, workspace->getDeviceBatchSlotsPtr(), batchSize, maxBatchSize, + beamWidth /* 1 */, mDecoderDomain.getVocabSize(), mDecoderDomain.getVocabSizePadded(), + /* skip softmax */ false, + /* batchSlotLogits */ false, getStream()); + + sync_check_cuda_error(); + + tksd::invokeAcceptDraftTokens(batchSize, bufferCast(*inputs->draftProbs), bufferCast(*inputs->targetProbs), + bufferCast(*inputs->numDraftTokens), bufferCast(*inputs->useDraftLogits), + bufferCast(*inputs->draftTokenIds), finishedInput, finishedOutput, inputs->curandStates, + workspace->getDeviceBatchSlotsPtr(), maxTokensPerStep, beamWidth, mDecoderDomain.getVocabSizePadded(), + inputs->useRandomAcceptanceThreshold, inputs->constantThreshold, inputs->step, + bufferCast(*mBatchIsAccepted), bufferCast(*mTargetOutputIds), getStream()); + + sync_check_cuda_error(); + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +template +void ExternalDraftTokensLayer::multinomialSampling(std::shared_ptr const& outputs, + std::shared_ptr const& baseInputs, + std::shared_ptr const& workspace) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + auto inputs = std::dynamic_pointer_cast(baseInputs); + + auto const batchSize = inputs->logits.value()->getDimension<0>(); + auto probs = bufferCastOrNull(inputs->targetProbs); + auto* sequenceLength = bufferCastOrNull(outputs->sequenceLength); + auto const* endIds = bufferCastOrNull(inputs->endIds); + + FinishedState* finishedOutput = (outputs->finished) + ? 
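The masking step inside acceptDraftTokens above can be summarized with a short host-side sketch, a simplification of invokeMaskTargetLogits with invented names and a simplified boundary convention: every token id that survived the top-k/top-p passes is marked in a per-request mask, and every other target logit is forced to -inf so that only the selected candidates keep probability mass, mirroring the HuggingFace filtering referenced in the kernel header.

    #include <cstddef>
    #include <limits>
    #include <vector>

    // Host-side sketch: selectedIds holds the ids produced by the top-k/top-p
    // passes for one request (any negative id is treated as the end marker here,
    // which simplifies the real buffer layout); logits has length vocabSizePadded.
    void maskTargetLogitsSketch(std::vector<float>& logits, std::vector<int> const& selectedIds)
    {
        std::vector<bool> keep(logits.size(), false);
        for (int id : selectedIds)
        {
            if (id < 0)
            {
                break;
            }
            keep[static_cast<std::size_t>(id)] = true;
        }
        for (std::size_t v = 0; v < logits.size(); ++v)
        {
            if (!keep[v])
            {
                logits[v] = -std::numeric_limits<float>::infinity();
            }
        }
    }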
reinterpret_cast(bufferCastOrNull(outputs->finished)) + : nullptr; + TopPSamplingKernelParams params{}; + + params.probs = probs; + params.outputIdsPtrs = bufferCastOrNull(outputs->outputIdsPtr); + params.workspace = workspace->getRawWorkspaceDevicePtr(); + params.topPs = bufferCastOrNull(mRuntimeMultinomialDevice); + params.sequenceLength = sequenceLength; + params.endIds = endIds; + params.batchSlots = workspace->getDeviceBatchSlotsPtr(); + params.finishedInput = nullptr; + params.finishedOutput = finishedOutput; + params.skipDecode = bufferCastOrNull(mBatchIsAccepted); + params.cumLogProbs = nullptr; + params.outputLogProbs = nullptr; + params.curandState = inputs->curandStates; + params.batchSize = batchSize; + params.maxBatchSize = mDecoderDomain.getBatchSize(); + params.vocabSizePadded = mDecoderDomain.getVocabSizePadded(); + + invokeBatchTopPSampling(params, getStream()); + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +template +void ExternalDraftTokensLayer::getAllTopKs(std::shared_ptr const& outputs, + std::shared_ptr const& baseInputs, + std::shared_ptr const& workspace) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + auto inputs = std::dynamic_pointer_cast(baseInputs); + + auto logits = bufferCastOrNull(inputs->logits); + + auto const batchSize = inputs->logits.value()->getDimension<0>(); + + auto const* batchSlotsHost = bufferCast(*inputs->batchSlots); + auto* skipDecodeHostPtr = bufferCastOrNull(mSkipTopKDecodeHost); + auto const skip = allOfBatchSlots(batchSlotsHost, skipDecodeHostPtr, batchSize, true); + if (skip) + { + return; + } + + FinishedState const* finishedInput = (inputs->finished) + ? reinterpret_cast(bufferCastOrNull(inputs->finished)) + : nullptr; + + TopKSamplingKernelParams params{}; + params.logProbs = logits; + params.outputIds = bufferCastOrNull(mOutputIdsAfterSampling); + params.workspace = workspace->getRawWorkspaceDevicePtr(); + params.maxTopP = 1.0f; + params.topPs = bufferCastOrNull(mRuntimeTopPForTopKDevice); + params.maxTopK = mRuntimeMaxTopK; + params.topKs = bufferCastOrNull(mRuntimeTopKDevice); + params.batchSlots = workspace->getDeviceBatchSlotsPtr(); + params.finishedInput = finishedInput; + params.skipDecode = bufferCastOrNull(mSkipTopKDecodeDevice); + params.curandState = inputs->curandStates; + params.batchSize = batchSize; + params.maxBatchSize = mDecoderDomain.getBatchSize(); + params.maxTokensPerStep = 1; + params.vocabSizePadded = mDecoderDomain.getVocabSizePadded(); + params.returnAllTopK = true; + params.maxSeqLen = mDecoderDomain.getVocabSizePadded(); // workaround for returning all topKs with outputIds + params.logitsHasProbs = inputs->probsComputed; + + invokeBatchTopKSampling(params, getStream()); + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +template +void ExternalDraftTokensLayer::getAllTopPs(std::shared_ptr const& outputs, + std::shared_ptr const& baseInputs, + std::shared_ptr const& workspace) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + auto inputs = std::dynamic_pointer_cast(baseInputs); + + auto logits = bufferCastOrNull(inputs->logits); + + auto const batchSize = inputs->logits.value()->getDimension<0>(); + + auto const* batchSlotsHost = bufferCast(*inputs->batchSlots); + auto* skipDecodeHostPtr = bufferCastOrNull(mSkipTopPDecodeHost); + auto const skip = allOfBatchSlots(batchSlotsHost, skipDecodeHostPtr, batchSize, true); + if (skip) + { + return; + } + + FinishedState const* finishedInput = (inputs->finished) + ? 
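The "multinomial sampling as top-p with p equal to 1" trick used above is worth spelling out: once the logits have been turned into a proper probability distribution, truncating at cumulative mass 1.0 keeps every token, so the top-p kernel degenerates into plain categorical sampling. A tiny standalone sketch (illustrative, not the kernel):

    #include <cstddef>
    #include <random>
    #include <vector>

    // Inverse-CDF sampling from a probability vector. With topP == 1.0 the whole
    // distribution is kept, which is exactly multinomial sampling.
    int sampleMultinomialSketch(std::vector<float> const& probs, std::mt19937& gen)
    {
        float const u = std::uniform_real_distribution<float>(0.f, 1.f)(gen);
        float cum = 0.f;
        for (std::size_t i = 0; i < probs.size(); ++i)
        {
            cum += probs[i];
            if (u <= cum)
            {
                return static_cast<int>(i);
            }
        }
        return static_cast<int>(probs.size()) - 1; // guard against rounding
    }

This is why mRuntimeMultinomialDevice is filled with 1.0f at setup time and passed as topPs here, while mBatchIsAccepted doubles as skipDecode so that requests whose draft token was accepted skip the bonus sample.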
reinterpret_cast(bufferCastOrNull(inputs->finished)) + : nullptr; + + TopPSamplingKernelParams params{}; + params.probs = logits; + params.outputIds = bufferCastOrNull(mOutputIdsAfterSampling); + params.workspace = workspace->getRawWorkspaceDevicePtr(); + params.topPs = bufferCastOrNull(mRuntimeTopPDevice); + params.batchSlots = workspace->getDeviceBatchSlotsPtr(); + params.finishedInput = finishedInput; + params.skipDecode = bufferCastOrNull(mSkipTopPDecodeDevice); + params.curandState = inputs->curandStates; + params.batchSize = batchSize; + params.maxBatchSize = mDecoderDomain.getBatchSize(); + params.vocabSizePadded = mDecoderDomain.getVocabSizePadded(); + params.returnAllTopP = true; + params.maxSeqLen = mDecoderDomain.getVocabSizePadded(); + + invokeBatchTopPSampling(params, getStream()); + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +template +void ExternalDraftTokensLayer::forwardAcceptedTokens(std::shared_ptr const& outputs, + std::shared_ptr const& baseInputs, + std::shared_ptr const& workspace) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + auto inputs = std::dynamic_pointer_cast(baseInputs); + auto const batchSize = inputs->logits.value()->getDimension<0>(); + + auto const draftLogitsShape = (*inputs->draftLogits).getShape(); + auto const maxTokensPerStep = draftLogitsShape.d[1]; // 1 + + FinishedState* finishedOutput = (outputs->finished) + ? reinterpret_cast(bufferCastOrNull(outputs->finished)) + : nullptr; + + tksd::invokeForwardAcceptedTokens(batchSize, workspace->getDeviceBatchSlotsPtr(), + bufferCast(*mBatchIsAccepted), bufferCastOrNull(outputs->sequenceLength), + bufferCast(*inputs->draftTokenIds), bufferCastOrNull(outputs->outputIdsPtr), + inputs->step, maxTokensPerStep, bufferCastOrNull(inputs->endIds), finishedOutput, getStream()); + + sync_check_cuda_error(); + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +template class ExternalDraftTokensLayer; +template class ExternalDraftTokensLayer; + +} // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/layers/externalDraftTokensLayer.h b/cpp/tensorrt_llm/layers/externalDraftTokensLayer.h new file mode 100644 index 000000000..4122c7c35 --- /dev/null +++ b/cpp/tensorrt_llm/layers/externalDraftTokensLayer.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "tensorrt_llm/executor/types.h" +#include "tensorrt_llm/layers/baseLayer.h" +#include "tensorrt_llm/layers/decodingParams.h" +#include "tensorrt_llm/runtime/common.h" + +#include + +namespace tensorrt_llm::layers +{ + +//! \brief Top class for sampling layers. +//! 
It sets up and executes TopKSamplingLayer and TopPSamplingLayer samplings +template +class ExternalDraftTokensLayer : public BaseLayer +{ +public: + using Base = BaseLayer; + + ExternalDraftTokensLayer(executor::DecodingMode const& mode, DecoderDomain const& decoderDomain, + std::shared_ptr bufferManager); + + void setup(runtime::SizeType32 batchSize, runtime::SizeType32 beamWidth, TensorConstPtr batchSlots, + std::shared_ptr const& setupParams, + std::shared_ptr const& workspace) override; + + void forwardAsync(std::shared_ptr const& outputs, + std::shared_ptr const& inputs, + std::shared_ptr const& workspace) override; + + //! @returns workspace needed for this layer in bytes + [[nodiscard]] size_t getWorkspaceSize() const noexcept override; + +protected: + runtime::SizeType32 mRuntimeMaxTopK{0}; + +private: + using Base::mDecoderDomain; + + executor::DecodingMode mDecodingMode; + + size_t mWorkspaceSize{0}; + size_t mSetupWorkspaceSize{0}; + + TensorPtr mCurandStatesDevice; + TensorPtr mSkipTopKDecodeDevice; + TensorPtr mSkipTopKDecodeHost; + TensorPtr mSkipTopPDecodeDevice; + TensorPtr mSkipTopPDecodeHost; + + TensorPtr mBatchIsAccepted; + TensorPtr mRuntimeMultinomialDevice; + + TensorPtr mOutputIdsAfterSampling; + TensorPtr mTargetOutputIds; + TensorPtr mRuntimeTopKDevice; + TensorPtr mRuntimeTopPForTopKDevice; + TensorPtr mRuntimeTopPDevice; + TensorPtr mInitialTopPDevice; + TensorPtr mMaskBuffer; + + TensorPtr mTargetLogits; + +private: + void allocateBuffer(runtime::SizeType32 batchSize); + void acceptDraftTokens(std::shared_ptr const& outputs, + std::shared_ptr const& baseInputs, + std::shared_ptr const& workspace); + void multinomialSampling(std::shared_ptr const& outputs, + std::shared_ptr const& baseInputs, + std::shared_ptr const& workspace); + void getAllTopKs(std::shared_ptr const& outputs, + std::shared_ptr const& baseInputs, + std::shared_ptr const& workspace); + void getAllTopPs(std::shared_ptr const& outputs, + std::shared_ptr const& baseInputs, + std::shared_ptr const& workspace); + void forwardAcceptedTokens(std::shared_ptr const& outputs, + std::shared_ptr const& baseInputs, + std::shared_ptr const& workspace); +}; + +} // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/layers/topPSamplingLayer.cpp b/cpp/tensorrt_llm/layers/topPSamplingLayer.cpp index fc0774450..f583b0e3e 100644 --- a/cpp/tensorrt_llm/layers/topPSamplingLayer.cpp +++ b/cpp/tensorrt_llm/layers/topPSamplingLayer.cpp @@ -267,7 +267,7 @@ void TopPSamplingLayer::forwardAsync(std::shared_ptr con TopPSamplingKernelParams params{}; params.probs = probs; - params.outputIds = bufferCastOrNull(outputs->outputIdsPtr); + params.outputIdsPtrs = bufferCastOrNull(outputs->outputIdsPtr); params.workspace = workspace->getRawWorkspaceDevicePtr(); params.topPs = bufferCastOrNull(mRuntimeTopPDevice); params.sequenceLength = sequenceLength; diff --git a/cpp/tensorrt_llm/plugins/loraPlugin/loraPlugin.cpp b/cpp/tensorrt_llm/plugins/loraPlugin/loraPlugin.cpp index ffc4b7a8d..6f6512f13 100644 --- a/cpp/tensorrt_llm/plugins/loraPlugin/loraPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/loraPlugin/loraPlugin.cpp @@ -259,7 +259,7 @@ int LoraPlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::P int idx = 0; for (int reqId = 0; reqId < numReqs; reqId++) { - const RequestType reqType = static_cast(reqTypes[reqId]); + RequestType const reqType = static_cast(reqTypes[reqId]); if (reqType == RequestType::kGENERATION) { mExpandLoraWeightPtrs.push_back(reinterpret_cast(loraWeightModulePtrs[reqId * 2])); @@ 
-284,7 +284,7 @@ int LoraPlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::P fmtstr("LoraParams and input dims don't match, lora tokens %d input tokens %d", idx, numTokens)); } - // only used for unifed gemm + // only used for unified gemm auto bestTactic = mPluginProfiler->getBestConfig(numTokens, mGemmId); mLoraImpl->setBestTactic(bestTactic); mLoraImpl->run(numTokens, numReqs, input, mExpandLoraRanks.data(), mExpandLoraWeightPtrs.data(), mWeightIndex, diff --git a/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp b/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp index 784055cc5..f0d700fc1 100644 --- a/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp +++ b/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp @@ -305,14 +305,17 @@ int AllreducePlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfe ++tpRank; } + int token_num = size / inputDesc[0].dims.d[inputDesc[0].dims.nbDims - 1]; + auto params = tensorrt_llm::kernels::AllReduceParams::deserialize( - reinterpret_cast(const_cast(inputs[1])), tpSize, tpRank); + reinterpret_cast(const_cast(inputs[1])), tpSize, tpRank, mType, token_num, mOp); params.local_output_buffer_ptr = outputs[0]; params.local_input_buffer_ptr = inputs[0]; params.elts_total = size; if (mOp == AllReduceFusionOp::RESIDUAL_RMS_NORM) { + int fusion_ptr_idx = 2; params.fusion_params.bias_buffer = mBias ? inputs[fusion_ptr_idx++] : nullptr; params.fusion_params.residual_buffer = inputs[fusion_ptr_idx++]; @@ -320,6 +323,15 @@ int AllreducePlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfe params.fusion_params.hidden_size = inputDesc[0].dims.d[inputDesc[0].dims.nbDims - 1]; params.fusion_params.eps = mEps; params.fusion_params.intermediate_buffer = outputs[1]; + for (int i = 0; i < tpSize; ++i) + { + params.fusion_params.lamport_peer_comm_buffer_ptrs[i] + = reinterpret_cast(const_cast(inputs[1]))[tpSize * 4 + i]; + params.fusion_params.lamport_peer_comm_buffer_ptrs[i + tensorrt_llm::kernels::MAX_RANKS_PER_NODE] + = reinterpret_cast(const_cast(inputs[1]))[tpSize * 5 + i]; + params.fusion_params.lamport_peer_comm_buffer_ptrs[i + tensorrt_llm::kernels::MAX_RANKS_PER_NODE * 2] + = reinterpret_cast(const_cast(inputs[1]))[tpSize * 6 + i]; + } } tensorrt_llm::kernels::customAllReduce(params, mType, runtimeStrategy, mConfig, mOp, stream); } diff --git a/cpp/tensorrt_llm/pybind/CMakeLists.txt b/cpp/tensorrt_llm/pybind/CMakeLists.txt index 1f7bf73fb..daae58398 100755 --- a/cpp/tensorrt_llm/pybind/CMakeLists.txt +++ b/cpp/tensorrt_llm/pybind/CMakeLists.txt @@ -3,35 +3,19 @@ set(TRTLLM_PYBIND_MODULE ${TRTLLM_PYBIND_MODULE} PARENT_SCOPE) -if(NOT BUILD_PYT) - message( - FATAL_ERROR - "Python bindings for C++ runtime require PyTorch. 
Please enable BUILD_PYT" - ) -endif() - -execute_process( - COMMAND ${Python3_EXECUTABLE} "-c" - "import pybind11 as pb11; print(pb11.get_cmake_dir(),end='');" - RESULT_VARIABLE PYBIND_CMAKE_DIR_RET - OUTPUT_VARIABLE PYBIND_CMAKE_DIR) - -if(PYBIND_CMAKE_DIR_RET MATCHES 0) - list(APPEND CMAKE_PREFIX_PATH "${PYBIND_CMAKE_DIR}") -else() - message(ERROR "pybind11 CMake directory not found.") -endif() - -find_package(pybind11 REQUIRED) - set(SRCS - bindings.cpp + batch_manager/algorithms.cpp + batch_manager/bindings.cpp batch_manager/gptManager.cpp - batch_manager/llmRequest.cpp batch_manager/inferenceRequest.cpp + batch_manager/kvCacheManager.cpp + batch_manager/llmRequest.cpp batch_manager/namedTensor.cpp executor/bindings.cpp - executor/executor.cpp) + executor/executor.cpp + bindings.cpp) + +include_directories(${PROJECT_SOURCE_DIR}/include) pybind11_add_module(${TRTLLM_PYBIND_MODULE} ${SRCS}) @@ -41,15 +25,12 @@ set_property(TARGET ${TRTLLM_PYBIND_MODULE} PROPERTY POSITION_INDEPENDENT_CODE target_link_directories(${TRTLLM_PYBIND_MODULE} PUBLIC "${TORCH_INSTALL_PREFIX}/lib") target_link_libraries( - ${TRTLLM_PYBIND_MODULE} PUBLIC ${SHARED_TARGET} ${UNDEFINED_FLAG} - ${NO_AS_NEEDED_FLAG}) -target_link_libraries( - ${TRTLLM_PYBIND_MODULE} PUBLIC ${Python3_LIBRARIES} ${TORCH_LIBRARIES} - torch_python ${UNDEFINED_FLAG}) -target_compile_definitions(${TRTLLM_PYBIND_MODULE} - PUBLIC TRTLLM_PYBIND_MODULE=${TRTLLM_PYBIND_MODULE}) -target_compile_definitions(${TRTLLM_PYBIND_MODULE} - PUBLIC PYBIND11_DETAILED_ERROR_MESSAGES=1) + ${TRTLLM_PYBIND_MODULE} + PUBLIC ${SHARED_TARGET} ${UNDEFINED_FLAG} ${NO_AS_NEEDED_FLAG} + ${Python3_LIBRARIES} ${TORCH_LIBRARIES} torch_python) +target_compile_definitions( + ${TRTLLM_PYBIND_MODULE} PUBLIC TRTLLM_PYBIND_MODULE=${TRTLLM_PYBIND_MODULE} + PYBIND11_DETAILED_ERROR_MESSAGES=1) if(NOT WIN32) set_target_properties( diff --git a/cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp b/cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp new file mode 100644 index 000000000..15fc1ee4f --- /dev/null +++ b/cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp @@ -0,0 +1,55 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "algorithms.h" +#include "tensorrt_llm/batch_manager/capacityScheduler.h" +#include "tensorrt_llm/batch_manager/kvCacheManager.h" +#include "tensorrt_llm/batch_manager/llmRequest.h" +#include "tensorrt_llm/batch_manager/microBatchScheduler.h" +#include "tensorrt_llm/batch_manager/peftCacheManager.h" +#include "tensorrt_llm/pybind/common/algorithmBindings.h" + +namespace py = pybind11; + +using namespace tensorrt_llm::batch_manager; +using namespace PybindUtils; + +void tensorrt_llm::pybind::batch_manager::algorithms::initBindings(pybind11::module_& m) +{ + // Algorithms with custom bindings + py::class_(m, CapacityScheduler::name) + .def_static("make", &CapacityScheduler::make, py::arg("max_num_requests"), py::arg("kv_cache_manager"), + py::arg("cross_kv_cache_manager"), py::arg("peft_cache_manager"), py::arg("capacity_scheduler_policy"), + py::arg("many_micro_batches") = false, + py::arg_v("no_schedule_until_state", LlmRequestState::kCONTEXT_INIT, "LlmRequestState.CONTEXT_INIT"), + py::arg_v("no_schedule_after_state", LlmRequestState::kGENERATION_COMPLETE, + "LlmRequestState.GENERATION_COMPLETE")) + .def(py::init()) + .def("__call__", &CapacityScheduler::operator()) + .def("name", [](CapacityScheduler const&) { return CapacityScheduler::name; }); + + py::class_(m, MicroBatchScheduler::name) + .def_static("make", &MicroBatchScheduler::make, py::arg("max_batch_size"), + py::arg_v("max_num_tokens", std::nullopt, "None"), py::arg_v("ctx_chunk_config", std::nullopt, "None"), + py::arg_v("max_context_length", std::nullopt, "None"), + py::arg_v("no_schedule_until_state", LlmRequestState::kCONTEXT_INIT, "LlmRequestState.CONTEXT_INIT"), + py::arg_v("no_schedule_after_state", LlmRequestState::kGENERATION_COMPLETE, + "LlmRequestState.GENERATION_COMPLETE")) + .def(py::init()) + .def("__call__", &MicroBatchScheduler::operator()) + .def("name", [](MicroBatchScheduler const&) { return MicroBatchScheduler::name; }); +} diff --git a/cpp/tensorrt_llm/pybind/batch_manager/algorithms.h b/cpp/tensorrt_llm/pybind/batch_manager/algorithms.h new file mode 100644 index 000000000..895a4d13e --- /dev/null +++ b/cpp/tensorrt_llm/pybind/batch_manager/algorithms.h @@ -0,0 +1,28 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "tensorrt_llm/pybind/common/opaqueBindings.h" +#include + +namespace tensorrt_llm::pybind::batch_manager::algorithms +{ + +void initBindings(pybind11::module_& m); + +} diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp new file mode 100644 index 000000000..20de984f9 --- /dev/null +++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp @@ -0,0 +1,41 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "bindings.h" +#include "tensorrt_llm/batch_manager/common.h" +#include "tensorrt_llm/batch_manager/microBatchScheduler.h" +#include "tensorrt_llm/pybind/utils/bindTypes.h" + +namespace py = pybind11; +namespace tb = tensorrt_llm::batch_manager; +namespace tle = tensorrt_llm::executor; + +using namespace tensorrt_llm::runtime; + +namespace tensorrt_llm::pybind::batch_manager +{ + +void initBindings(pybind11::module_& m) +{ + py::class_(m, "ContextChunkingConfig") + .def(py::init(), py::arg("chunking_policy"), + py::arg("chunk_unit_size")) + .def_readwrite("chunking_policy", &tb::batch_scheduler::ContextChunkingConfig::chunkingPolicy) + .def_readwrite("chunk_unit_size", &tb::batch_scheduler::ContextChunkingConfig::chunkUnitSize); +} + +} // namespace tensorrt_llm::pybind::batch_manager diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.h b/cpp/tensorrt_llm/pybind/batch_manager/bindings.h new file mode 100644 index 000000000..326143d4f --- /dev/null +++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.h @@ -0,0 +1,28 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "tensorrt_llm/pybind/common/opaqueBindings.h" +#include + +namespace tensorrt_llm::pybind::batch_manager +{ + +void initBindings(pybind11::module_& m); + +} diff --git a/cpp/tensorrt_llm/pybind/batch_manager/gptManager.h b/cpp/tensorrt_llm/pybind/batch_manager/gptManager.h index 3f19dddc7..0c3b81796 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/gptManager.h +++ b/cpp/tensorrt_llm/pybind/batch_manager/gptManager.h @@ -21,6 +21,7 @@ #include "namedTensor.h" #include "tensorrt_llm/batch_manager/GptManager.h" #include "tensorrt_llm/batch_manager/callbacks.h" +#include "tensorrt_llm/pybind/common/opaqueBindings.h" #include #include diff --git a/cpp/tensorrt_llm/pybind/batch_manager/inferenceRequest.h b/cpp/tensorrt_llm/pybind/batch_manager/inferenceRequest.h index 98ae79b34..d30864e6e 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/inferenceRequest.h +++ b/cpp/tensorrt_llm/pybind/batch_manager/inferenceRequest.h @@ -20,6 +20,7 @@ #include "tensorrt_llm/batch_manager/inferenceRequest.h" #include "tensorrt_llm/pybind/batch_manager/llmRequest.h" #include "tensorrt_llm/pybind/batch_manager/namedTensor.h" +#include "tensorrt_llm/pybind/common/opaqueBindings.h" #include #include diff --git a/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp new file mode 100644 index 000000000..1e6e59b42 --- /dev/null +++ b/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp @@ -0,0 +1,29 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#include "kvCacheManager.h" +#include "tensorrt_llm/pybind/utils/bindTypes.h" + +namespace tb = tensorrt_llm::batch_manager; +namespace py = pybind11; + +void tb::kv_cache_manager::KVCacheManagerBindings::initBindings(py::module_& m) +{ + // TODO: Provide proper bindings + py::classh(m, "KVCacheManager"); +} + +void tb::BasePeftCacheManagerBindings::initBindings(py::module_& m) +{ + // TODO: Provide proper bindings + py::classh(m, "BasePeftCacheManager"); +} diff --git a/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.h b/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.h new file mode 100644 index 000000000..7753c684d --- /dev/null +++ b/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.h @@ -0,0 +1,36 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. 
+ */ + +#pragma once + +#include "tensorrt_llm/batch_manager/kvCacheManager.h" +#include "tensorrt_llm/batch_manager/peftCacheManager.h" +#include "tensorrt_llm/pybind/common/opaqueBindings.h" +#include + +namespace tensorrt_llm::batch_manager::kv_cache_manager +{ +class KVCacheManagerBindings +{ +public: + static void initBindings(pybind11::module_& m); +}; +} // namespace tensorrt_llm::batch_manager::kv_cache_manager + +namespace tensorrt_llm::batch_manager +{ +class BasePeftCacheManagerBindings +{ +public: + static void initBindings(pybind11::module_& m); +}; +} // namespace tensorrt_llm::batch_manager diff --git a/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.cpp index 193940083..4ef2e6851 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.cpp @@ -17,22 +17,29 @@ #include "llmRequest.h" #include "tensorrt_llm/batch_manager/llmRequest.h" +#include "tensorrt_llm/pybind/utils/bindTypes.h" #include "tensorrt_llm/runtime/torch.h" #include "tensorrt_llm/runtime/torchUtils.h" #include "tensorrt_llm/runtime/torchView.h" +#include #include #include #include +#include #include #include namespace tb = tensorrt_llm::batch_manager; namespace tr = tensorrt_llm::runtime; +namespace tle = tensorrt_llm::executor; using namespace tensorrt_llm::pybind::batch_manager; +using LlmRequestPtr = std::shared_ptr; +using RequestList = std::list; + namespace { @@ -166,7 +173,6 @@ void LlmRequest::initBindings(py::module_& m) .def_property_readonly("orig_prompt_len", &LlmRequest::getOrigPromptLen) .def("has_draft_tokens", &LlmRequest::hasDraftTokens) .def("move_to_next_context_chunk", &LlmRequest::moveToNextContextChunk) - .def("is_full_context_request", py::overload_cast<>(&LlmRequest::isFullContextRequest, py::const_)) .def("is_last_context_chunk", py::overload_cast<>(&LlmRequest::isLastContextChunk, py::const_)) .def("is_first_context_chunk", py::overload_cast<>(&LlmRequest::isFirstContextChunk, py::const_)) .def("get_context_remaining_length", py::overload_cast<>(&LlmRequest::getContextRemainingLength, py::const_)) @@ -180,3 +186,140 @@ void LlmRequest::initBindings(py::module_& m) { self.setDraftLogits(std::make_optional(logits)); }) .def_property("num_return_sequences", &LlmRequest::getNumReturnSequences, &LlmRequest::setNumReturnSequences); } + +void tb::LlmRequestBindings::initBindings(py::module_& m) +{ + py::classh(m, "PyLlmRequest") + .def("get_num_tokens", &tb::LlmRequest::getNumTokens, py::arg("beam")) + .def_property_readonly("max_beam_num_tokens", &tb::LlmRequest::getMaxBeamNumTokens) + .def("get_token", &tb::LlmRequest::getToken, py::arg("beam"), py::arg("pos")) + .def("get_tokens", py::overload_cast(&tb::LlmRequest::getTokens, py::const_), + py::arg("beam")) + .def("get_tokens", py::overload_cast<>(&tb::LlmRequest::getTokens, py::const_)) + .def_property_readonly("max_num_generated_tokens", &tb::LlmRequest::getMaxNumGeneratedTokens) + .def("add_new_token", &tb::LlmRequest::addNewToken, py::arg("token"), py::arg("beam")) + .def("add_new_tokens", &tb::LlmRequest::addNewTokens, py::arg("beam_tokens")) + .def("set_generated_tokens", &tb::LlmRequest::setGeneratedTokens, py::arg("generated_beam_tokens")) + .def("pause", &tb::LlmRequest::pause, py::arg("max_input_len")) + .def_property("max_sent_token_len", &tb::LlmRequest::getMaxSentTokenLen, &tb::LlmRequest::setMaxSentTokenLen) + .def("prompt_embedding_table", + [](tb::LlmRequest& self) + { + std::optional value{std::nullopt}; 
+ auto tensor = self.getPromptEmbeddingTable(); + if (tensor) + { + value = tr::Torch::tensor(*tensor); + } + return value; + }) + .def("bad_words_list", + [](tb::LlmRequest& self) + { + std::optional value{std::nullopt}; + auto tensor = self.getBadWordsList(); + if (tensor) + { + value = tr::Torch::tensor(*tensor); + } + return value; + }) + .def_property( + "draft_logits", + [](tb::LlmRequest& self) + { + std::optional value{std::nullopt}; + auto tensor = self.getDraftLogits(); + if (tensor) + { + value = tr::Torch::tensor(*tensor); + } + return value; + }, + [](tb::LlmRequest& self, at::Tensor& logits) + { self.setDraftLogits(std::make_optional(tr::TorchView::of(logits))); }) + .def("embedding_bias", + [](tb::LlmRequest& self) + { + std::optional value{std::nullopt}; + auto tensor = self.getEmbeddingBias(); + if (tensor) + { + value = tr::Torch::tensor(*tensor); + } + return value; + }) + .def("lora_config", + [](tb::LlmRequest& self) + { + std::optional value{std::nullopt}; + auto tensor = self.getLoraConfig(); + if (tensor) + { + value = tr::Torch::tensor(*tensor); + } + return value; + }) + .def("lora_weights", + [](tb::LlmRequest& self) + { + std::optional value{std::nullopt}; + auto tensor = self.getLoraWeights(); + if (tensor) + { + value = tr::Torch::tensor(*tensor); + } + return value; + }) + .def("stop_words_list", + [](tb::LlmRequest& self) + { + std::optional value{std::nullopt}; + auto tensor = self.getStopWordsList(); + if (tensor) + { + value = tr::Torch::tensor(*tensor); + } + return value; + }) + .def_property_readonly("prompt_vocab_size", &tb::LlmRequest::getPromptVocabSize) + .def_property_readonly("lora_task_id", &tb::LlmRequest::getLoraTaskId) + .def_property_readonly("lookahead_config", &tb::LlmRequest::getLookaheadConfig) + .def_property_readonly( + "context_current_position", py::overload_cast<>(&tb::LlmRequest::getContextCurrentPosition, py::const_)) + .def_property("context_chunk_size", &tb::LlmRequest::getContextChunkSize, &tb::LlmRequest::setContextChunkSize) + .def_readwrite("request_id", &tb::LlmRequest::mRequestId) + .def_readwrite("prompt_len", &tb::LlmRequest::mPromptLen) + .def_readwrite("max_new_tokens", &tb::LlmRequest::mMaxNewTokens) + .def_readwrite("sampling_config", &tb::LlmRequest::mSamplingConfig) + .def_readwrite("state", &tb::LlmRequest::mState) + .def_readwrite("is_streaming", &tb::LlmRequest::mIsStreaming) + .def_readwrite("end_id", &tb::LlmRequest::mEndId) + .def_readwrite("pad_id", &tb::LlmRequest::mPadId) + .def_readwrite("seq_slot", &tb::LlmRequest::mSeqSlot) + .def_property_readonly("return_log_probs", &tb::LlmRequest::returnLogProbs) + .def_property_readonly("return_context_logits", &tb::LlmRequest::setReturnContextLogits) + .def_property_readonly("return_generation_logits", &tb::LlmRequest::setReturnGenerationLogits) + .def_property_readonly("log_probs", py::overload_cast<>(&tb::LlmRequest::getLogProbs, py::const_)) + .def("get_log_probs", py::overload_cast(&tb::LlmRequest::getLogProbs, py::const_)) + .def("set_log_probs", &tb::LlmRequest::setLogProbs, py::arg("log_probs"), py::arg("beam")) + .def("set_return_encoder_output", &tb::LlmRequest::setReturnEncoderOutput, py::arg("return_encoder_output")) + .def("get_return_encoder_output", &tb::LlmRequest::getReturnEncoderOutput) + .def("priority", py::overload_cast<>(&tb::LlmRequest::priority, py::const_)) + .def("set_priority", py::overload_cast(&tb::LlmRequest::setPriority)) + .def_property_readonly("cum_log_probs", &tb::LlmRequest::getCumLogProbs) + .def("set_cum_log_prob", 
&tb::LlmRequest::setCumLogProb, py::arg("cum_log_prob"), py::arg("beam")) + .def_property_readonly("orig_prompt_len", &tb::LlmRequest::getOrigPromptLen) + .def("has_draft_tokens", &tb::LlmRequest::hasDraftTokens) + .def("move_to_next_context_chunk", &tb::LlmRequest::moveToNextContextChunk) + .def("is_last_context_chunk", py::overload_cast<>(&tb::LlmRequest::isLastContextChunk, py::const_)) + .def("is_first_context_chunk", py::overload_cast<>(&tb::LlmRequest::isFirstContextChunk, py::const_)) + .def( + "get_context_remaining_length", py::overload_cast<>(&tb::LlmRequest::getContextRemainingLength, py::const_)) + .def_property( + "draft_tokens", [](tb::LlmRequest& self) { return *self.getDraftTokens(); }, + [](tb::LlmRequest& self, tb::LlmRequest::VecTokens& draftTokens) + { self.setDraftTokens(std::make_shared(std::move(draftTokens))); }); + + py::bind_vector(m, "RequestVector"); +} diff --git a/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.h b/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.h index 34ea424e6..1bc265600 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.h +++ b/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.h @@ -18,6 +18,7 @@ #pragma once #include "tensorrt_llm/batch_manager/llmRequest.h" +#include "tensorrt_llm/pybind/common/opaqueBindings.h" #include #include @@ -25,6 +26,15 @@ #include #include +namespace tensorrt_llm::batch_manager +{ +class LlmRequestBindings +{ +public: + static void initBindings(pybind11::module_& m); +}; +} // namespace tensorrt_llm::batch_manager + namespace tensorrt_llm::pybind::batch_manager { @@ -91,6 +101,7 @@ class LlmRequest : public tb::GenericLlmRequest std::optional callback); [[nodiscard]] std::shared_ptr toTrtLlm() const; + static void initBindings(pybind11::module_& m); }; diff --git a/cpp/tensorrt_llm/pybind/batch_manager/namedTensor.h b/cpp/tensorrt_llm/pybind/batch_manager/namedTensor.h index 9a0bf661d..522aa52e5 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/namedTensor.h +++ b/cpp/tensorrt_llm/pybind/batch_manager/namedTensor.h @@ -18,6 +18,7 @@ #pragma once #include "tensorrt_llm/batch_manager/namedTensor.h" +#include "tensorrt_llm/pybind/common/opaqueBindings.h" #include diff --git a/cpp/tensorrt_llm/pybind/bindings.cpp b/cpp/tensorrt_llm/pybind/bindings.cpp index 2c74104e9..71950bbe5 100644 --- a/cpp/tensorrt_llm/pybind/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/bindings.cpp @@ -23,18 +23,20 @@ #include #include +#include "tensorrt_llm/batch_manager/BatchManager.h" +#include "tensorrt_llm/batch_manager/kvCacheConfig.h" +#include "tensorrt_llm/batch_manager/trtGptModelOptionalParams.h" +#include "tensorrt_llm/common/mpiUtils.h" +#include "tensorrt_llm/common/quantization.h" +#include "tensorrt_llm/pybind/batch_manager/algorithms.h" +#include "tensorrt_llm/pybind/batch_manager/bindings.h" #include "tensorrt_llm/pybind/batch_manager/gptManager.h" #include "tensorrt_llm/pybind/batch_manager/inferenceRequest.h" +#include "tensorrt_llm/pybind/batch_manager/kvCacheManager.h" #include "tensorrt_llm/pybind/batch_manager/llmRequest.h" #include "tensorrt_llm/pybind/batch_manager/namedTensor.h" #include "tensorrt_llm/pybind/executor/bindings.h" #include "tensorrt_llm/pybind/utils/pathCaster.h" - -#include "tensorrt_llm/batch_manager/BatchManager.h" -#include "tensorrt_llm/batch_manager/kvCacheConfig.h" -#include "tensorrt_llm/batch_manager/trtGptModelOptionalParams.h" -#include "tensorrt_llm/common/mpiUtils.h" -#include "tensorrt_llm/common/quantization.h" #include "tensorrt_llm/runtime/common.h" #include 
"tensorrt_llm/runtime/gptJsonConfig.h" #include "tensorrt_llm/runtime/memoryCounters.h" @@ -333,6 +335,10 @@ PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m) tpb::NamedTensor::initBindings(m); tpb::LlmRequest::initBindings(m); + tb::kv_cache_manager::KVCacheManagerBindings::initBindings(m); + tb::BasePeftCacheManagerBindings::initBindings(m); + + tb::LlmRequestBindings::initBindings(m); auto tensorNames = m.def_submodule("tensor_names"); // Input tensor names @@ -412,8 +418,6 @@ PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m) .def(py::pickle(gptModelParamsGetState, gptModelParamsSetState)) .def("__eq__", &tb::TrtGptModelOptionalParams::operator==); - tpb::GptManager::initBindings(m); - py::class_(m, "MemoryCounters") .def_static("instance", &tr::MemoryCounters::getInstance, py::return_value_policy::reference) .def_property_readonly("gpu", &tr::MemoryCounters::getGpu) @@ -447,4 +451,11 @@ PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m) auto& world = tensorrt_llm::mpi::MpiComm::world(); tensorrt_llm::mpi::MpiComm::setSession(world.split(color, rank)); }); + + auto mInternal = m.def_submodule("internal", "Internal submodule of TRTLLM runtime"); + + tensorrt_llm::pybind::batch_manager::initBindings(mInternal); + tensorrt_llm::pybind::batch_manager::algorithms::initBindings(mInternal); + + tpb::GptManager::initBindings(m); } diff --git a/cpp/tensorrt_llm/pybind/common/algorithmBindings.h b/cpp/tensorrt_llm/pybind/common/algorithmBindings.h new file mode 100644 index 000000000..0a81a4e63 --- /dev/null +++ b/cpp/tensorrt_llm/pybind/common/algorithmBindings.h @@ -0,0 +1,39 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once + +#include "opaqueBindings.h" +#include +#include +#include + +namespace py = pybind11; + +namespace PybindUtils +{ +template +void makeAlgorithmBindings(py::module_& m) +{ + py::class_(m, T::name).def(py::init()).def("forward", &T::forward).def("name", [](T const&) { return T::name; }); +} + +template +void instantiatePybindAlgorithm(py::module_& m); +} // namespace PybindUtils + +#define INSTANTIATE_ALGORITHM(TYPE) \ + template <> \ + void PybindUtils::instantiatePybindAlgorithm(py::module_ & m) \ + { \ + makeAlgorithmBindings(m); \ + }; diff --git a/cpp/tensorrt_llm/pybind/common/opaqueBindings.h b/cpp/tensorrt_llm/pybind/common/opaqueBindings.h new file mode 100644 index 000000000..59f98a76d --- /dev/null +++ b/cpp/tensorrt_llm/pybind/common/opaqueBindings.h @@ -0,0 +1,18 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. 
Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once + +#include "tensorrt_llm/batch_manager/common.h" +#include + +PYBIND11_MAKE_OPAQUE(tensorrt_llm::batch_manager::RequestVector) diff --git a/cpp/tensorrt_llm/pybind/executor/bindings.cpp b/cpp/tensorrt_llm/pybind/executor/bindings.cpp index 4a79a64ee..0d8f5a2ff 100644 --- a/cpp/tensorrt_llm/pybind/executor/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/executor/bindings.cpp @@ -408,9 +408,12 @@ void InitBindings(pybind11::module_& m) .def_readwrite("is_sequence_final", &tle::Result::isSequenceFinal); py::class_(m, "Response") - .def(py::init(), py::arg("request_id"), py::arg("error_msg")) - .def(py::init(), py::arg("request_id"), py::arg("result")) + .def(py::init>(), py::arg("request_id"), py::arg("error_msg"), + py::arg("client_id") = std::nullopt) + .def(py::init>(), py::arg("request_id"), py::arg("result"), + py::arg("client_id") = std::nullopt) .def_property_readonly("request_id", &tle::Response::getRequestId) + .def_property_readonly("client_id", &tle::Response::getClientId) .def("has_error", &tle::Response::hasError) .def_property_readonly("error_msg", &tle::Response::getErrorMsg) .def_property_readonly("result", &tle::Response::getResult); diff --git a/cpp/tensorrt_llm/pybind/executor/bindings.h b/cpp/tensorrt_llm/pybind/executor/bindings.h index 7a686b19b..59916dcd6 100644 --- a/cpp/tensorrt_llm/pybind/executor/bindings.h +++ b/cpp/tensorrt_llm/pybind/executor/bindings.h @@ -16,6 +16,8 @@ */ #pragma once + +#include "tensorrt_llm/pybind/common/opaqueBindings.h" #include namespace tensorrt_llm::pybind::executor diff --git a/cpp/tensorrt_llm/pybind/executor/executor.h b/cpp/tensorrt_llm/pybind/executor/executor.h index 6b3663884..b70ba4c9c 100644 --- a/cpp/tensorrt_llm/pybind/executor/executor.h +++ b/cpp/tensorrt_llm/pybind/executor/executor.h @@ -16,8 +16,10 @@ */ #pragma once + #include "tensorrt_llm/executor/executor.h" #include "tensorrt_llm/executor/types.h" +#include "tensorrt_llm/pybind/common/opaqueBindings.h" #include namespace tle = tensorrt_llm::executor; diff --git a/cpp/tensorrt_llm/pybind/executor/streamCaster.h b/cpp/tensorrt_llm/pybind/executor/streamCaster.h index 4838cc6cc..e0c0ccf01 100644 --- a/cpp/tensorrt_llm/pybind/executor/streamCaster.h +++ b/cpp/tensorrt_llm/pybind/executor/streamCaster.h @@ -17,10 +17,10 @@ #pragma once -#include - #include "tensorrt_llm/executor/types.h" +#include "tensorrt_llm/pybind/common/opaqueBindings.h" #include "tensorrt_llm/runtime/cudaStream.h" +#include namespace PYBIND11_NAMESPACE { diff --git a/cpp/tensorrt_llm/pybind/executor/tensorCaster.h b/cpp/tensorrt_llm/pybind/executor/tensorCaster.h index 894e0af30..e3c596503 100644 --- a/cpp/tensorrt_llm/pybind/executor/tensorCaster.h +++ b/cpp/tensorrt_llm/pybind/executor/tensorCaster.h @@ -17,11 +17,11 @@ #pragma once -#include - #include "tensorrt_llm/executor/tensor.h" +#include "tensorrt_llm/pybind/common/opaqueBindings.h" #include "tensorrt_llm/runtime/torch.h" #include "tensorrt_llm/runtime/torchView.h" +#include #include namespace PYBIND11_NAMESPACE diff --git a/cpp/tensorrt_llm/pybind/utils/bindTypes.h b/cpp/tensorrt_llm/pybind/utils/bindTypes.h new file mode 100644 index 000000000..727c364d9 --- /dev/null +++ b/cpp/tensorrt_llm/pybind/utils/bindTypes.h @@ -0,0 +1,69 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. + * SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once + +#include "tensorrt_llm/pybind/common/opaqueBindings.h" +#include + +namespace PybindUtils +{ + +namespace py = pybind11; + +template +void bindList(py::module& m, std::string const& name) +{ + py::class_(m, name.c_str()) + .def(py::init()) + .def("push_back", [](T& lst, const typename T::value_type& value) { lst.push_back(value); }) + .def("pop_back", [](T& lst) { lst.pop_back(); }) + .def("push_front", [](T& lst, const typename T::value_type& value) { lst.push_front(value); }) + .def("pop_front", [](T& lst) { lst.pop_front(); }) + .def("__len__", [](T const& lst) { return lst.size(); }) + .def( + "__iter__", [](T& lst) { return py::make_iterator(lst.begin(), lst.end()); }, py::keep_alive<0, 1>()) + .def("__getitem__", + [](T const& lst, size_t index) + { + if (index >= lst.size()) + throw py::index_error(); + auto it = lst.begin(); + std::advance(it, index); + return *it; + }) + .def("__setitem__", + [](T& lst, size_t index, const typename T::value_type& value) + { + if (index >= lst.size()) + throw py::index_error(); + auto it = lst.begin(); + std::advance(it, index); + *it = value; + }); +} + +template +void bindSet(py::module& m, std::string const& name) +{ + py::class_(m, name.c_str()) + .def(py::init()) + .def("clear", &T::clear) + .def("size", &T::size) + // .def("insert", py::overload_cast(&T::insert)) + .def("erase", py::overload_cast(&T::erase)) + .def("__contains__", [](T const& s, typename T::value_type x) { return s.find(x) != s.end(); }) + .def( + "__iter__", [](T& s) { return py::make_iterator(s.begin(), s.end()); }, py::keep_alive<0, 1>()); +} + +} // namespace PybindUtils diff --git a/cpp/tensorrt_llm/pybind/utils/pathCaster.h b/cpp/tensorrt_llm/pybind/utils/pathCaster.h index 571be82ad..e74da30dd 100644 --- a/cpp/tensorrt_llm/pybind/utils/pathCaster.h +++ b/cpp/tensorrt_llm/pybind/utils/pathCaster.h @@ -22,6 +22,7 @@ #include "pybind11/detail/descr.h" #include "pybind11/pybind11.h" #include "pybind11/pytypes.h" +#include "tensorrt_llm/pybind/common/opaqueBindings.h" #include namespace PYBIND11_NAMESPACE diff --git a/cpp/tensorrt_llm/runtime/gptDecoder.cpp b/cpp/tensorrt_llm/runtime/gptDecoder.cpp index 80a1284f2..2ce57d5dd 100644 --- a/cpp/tensorrt_llm/runtime/gptDecoder.cpp +++ b/cpp/tensorrt_llm/runtime/gptDecoder.cpp @@ -161,6 +161,19 @@ void GptDecoder::setup(SamplingConfig const& samplingConfig, size_t batchSize lookaheadParams->attentionPackedMasks = output->lookaheadOutputs->packedMasks; setupParams->decodingParams = std::move(lookaheadParams); } + else if (mDecodingMode.isExternalDraftTokens()) + { + auto externalDraftTokensParams = std::make_shared(); + // signed to unsigned + if (mSamplingConfig.topK) + { + auto const& topK = mSamplingConfig.topK.value(); + externalDraftTokensParams->runtimeTopK = std::vector(std::begin(topK), std::end(topK)); + } + + externalDraftTokensParams->runtimeTopP = mSamplingConfig.topP; + setupParams->decodingParams = std::move(externalDraftTokensParams); + } setupParams->decodingParams->randomSeed = 
mSamplingConfig.randomSeed; mDecodingLayerWorkspace->setDeviceBatchSlots(batchSlots); @@ -244,6 +257,27 @@ void prepareMedusaInputs( TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } +void prepareExternalDraftTokensInputs( + DecodingInput const& inputs, size_t maxBatchSize, std::shared_ptr& baseInputs) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + + auto inputParams = std::dynamic_pointer_cast(baseInputs); + + auto const& externalDraftTokensInputs = inputs.externalDraftTokensInputs.value(); + + inputParams->draftLogits = externalDraftTokensInputs.draftLogits; + inputParams->draftProbs = externalDraftTokensInputs.draftProbs; + inputParams->targetProbs = externalDraftTokensInputs.targetProbs; + inputParams->numDraftTokens = externalDraftTokensInputs.numDraftTokens; + inputParams->draftTokenIds = externalDraftTokensInputs.draftTokenIds; + inputParams->constantThreshold = externalDraftTokensInputs.constantThreshold; + inputParams->useRandomAcceptanceThreshold = externalDraftTokensInputs.useRandomAcceptanceThreshold; + inputParams->step = externalDraftTokensInputs.step; + inputParams->useDraftLogits = externalDraftTokensInputs.useDraftLogits; + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + void prepareExplicitDraftTokensInput(DecodingInput const& inputs, std::shared_ptr& baseInputs) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); @@ -316,6 +350,11 @@ std::shared_ptr prepareInputs( forwardParams = std::make_shared(input.endIds, input.batchSlots, input.batchSize); } + else if (decodingMode.isExternalDraftTokens()) + { + forwardParams = std::make_shared( + input.endIds, input.batchSlots, input.step, ite, input.batchSize); + } // No logits for explicit draft tokens if (!decodingMode.isExplicitDraftTokens()) @@ -379,6 +418,11 @@ std::shared_ptr prepareInputs( forwardParams->localBatchSize = input.batchSize; } + if (decodingMode.isExternalDraftTokens()) + { + prepareExternalDraftTokensInputs(input, maxBatchSize, forwardParams); + } + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); return forwardParams; @@ -593,105 +637,3 @@ namespace tensorrt_llm::runtime template class GptDecoder; template class GptDecoder; } // namespace tensorrt_llm::runtime - -void IGptDecoder::acceptDraftTokensByIds(ITensor const& targetTokenIds, ITensor const& draftTokenIds, - ITensor const& contextLengths, ITensor const& numDraftTokens, ITensor& sequenceLengths, ITensor const& finishedVec, - ITensor& finishedFinal, ITensor& finishedSum, ITensor const& batchSlots, BufferManager::CudaStreamPtr const& stream) -{ - TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - - auto const finishedVecShape = finishedVec.getShape(); - auto const maxBatchSize = finishedVecShape.d[1]; - auto const batchSlotsShape = batchSlots.getShape(); - auto const batchSize = batchSlotsShape.d[0]; - auto const targetTokenIdsShape = targetTokenIds.getShape(); - auto const beamWidth = targetTokenIdsShape.d[1]; - auto const maxSeqLength = targetTokenIdsShape.d[2]; - auto const maxDraftTokens = draftTokenIds.getDimension<1>(); - - TLLM_CHECK_WITH_INFO(beamWidth == 1, - common::fmtstr("Beam width (" FMT_DIM ") > 1 is not supported for the speculative decoding", beamWidth)); - - TLLM_CHECK_WITH_INFO(batchSize <= maxBatchSize, - common::fmtstr("Batch size (" FMT_DIM ") is not smaller or equal to max batch size (" FMT_DIM ")", batchSize, - maxBatchSize)); - - TLLM_CHECK_WITH_INFO(draftTokenIds.getDimension<0>() == maxBatchSize, - common::fmtstr("Draft tokens batch size (" FMT_DIM ") is not equal to target batch size (" FMT_DIM ")", - 
draftTokenIds.getDimension<0>(), maxBatchSize)); - - TLLM_CHECK_WITH_INFO(contextLengths.getDimension<0>() == maxBatchSize, - common::fmtstr("Context length batch size (" FMT_DIM ") is not equal to batch size (" FMT_DIM ")", - contextLengths.getDimension<0>(), maxBatchSize)); - - TLLM_CHECK_WITH_INFO(numDraftTokens.getDimension<0>() == maxBatchSize, - common::fmtstr("Num draft tokens batch size (" FMT_DIM ") is not equal to batch size (" FMT_DIM ")", - numDraftTokens.getDimension<0>(), maxBatchSize)); - - TLLM_CHECK_WITH_INFO(sequenceLengths.getDimension<0>() == maxBatchSize, - common::fmtstr("Sequence length batch size (" FMT_DIM ") is not equal to batch size (" FMT_DIM ")", - sequenceLengths.getDimension<0>(), maxBatchSize)); - - tksd::invokeAcceptDraftTokensByIds(bufferCast(draftTokenIds), bufferCast(targetTokenIds), - bufferCast(contextLengths), bufferCast(numDraftTokens), - bufferCast(sequenceLengths), - reinterpret_cast( - bufferCast(finishedVec)), - reinterpret_cast( - bufferCast(finishedFinal)), - bufferCast(finishedSum), bufferCast(batchSlots), batchSize, maxBatchSize, beamWidth, - maxSeqLength, maxDraftTokens, stream->get()); - - sync_check_cuda_error(); - - TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); -} - -void IGptDecoder::acceptDraftTokensByLogits(ITensor& draftLogits, ITensor const& targetLogits, ITensor& draftProbs, - ITensor& targetProbs, ITensor const& numDraftTokens, ITensor& finished, ITensor const& batchSlots, - SizeType32 vocabSize, SizeType32 vocabSizePadded, bool useRandomAcceptThreshold, float randomAcceptThreshold, - curandState_t* curandState, BufferManager::CudaStreamPtr const& stream) -{ - TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - - auto const draftLogitsShape = draftLogits.getShape(); - auto const maxBatchSize = draftLogitsShape.d[0]; - auto const maxTokensPerStep = draftLogitsShape.d[1]; - auto const batchSlotsShape = batchSlots.getShape(); - auto const batchSize = batchSlotsShape.d[0]; - auto constexpr beamWidth = 1; - - TLLM_CHECK_WITH_INFO( - beamWidth == 1, common::fmtstr("Beam width (%d) > 1 is not supported for the speculative decoding", beamWidth)); - - TLLM_CHECK(draftLogitsShape.d[2] == vocabSize); - - if (draftLogits.getDataType() == nvinfer1::DataType::kFLOAT) - { - tksd::acceptDraftTokensByLogits(bufferCast(draftLogits), - const_cast(reinterpret_cast(bufferCast(targetLogits))), - bufferCast(draftProbs), bufferCast(targetProbs), bufferCast(numDraftTokens), - reinterpret_cast( - bufferCast(finished)), - curandState, bufferCast(batchSlots), batchSize, maxBatchSize, beamWidth, vocabSize, - vocabSizePadded, maxTokensPerStep, useRandomAcceptThreshold, randomAcceptThreshold, stream->get()); - } - else if (draftLogits.getDataType() == nvinfer1::DataType::kHALF) - { - tksd::acceptDraftTokensByLogits(bufferCast(draftLogits), - const_cast(reinterpret_cast(bufferCast(targetLogits))), - bufferCast(draftProbs), bufferCast(targetProbs), bufferCast(numDraftTokens), - reinterpret_cast( - bufferCast(finished)), - curandState, bufferCast(batchSlots), batchSize, maxBatchSize, beamWidth, vocabSize, - vocabSizePadded, maxTokensPerStep, useRandomAcceptThreshold, randomAcceptThreshold, stream->get()); - } - else - { - TLLM_THROW("Incorrect logits dtype. 
Only float32 and float16 are supported"); - } - - sync_check_cuda_error(); - - TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); -} diff --git a/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp b/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp index 8e1f57e9f..3930f9aa9 100644 --- a/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp +++ b/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp @@ -93,30 +93,28 @@ GptDecoderBatched::GptDecoderBatched(std::size_t vocabSize, std::size_t vocabSiz auto constexpr nvFloatType = TRTDataType::value; auto& dInput = mJointDecodingInput; - auto dummyLogits = mBufferManager.emptyTensor(MemoryType::kGPU, nvFloatType); - auto endIds = mBufferManager.emptyTensor(MemoryType::kGPU, nvTokenIdType); - auto batchSlots = mBufferManager.emptyTensor(MemoryType::kPINNED, nvSizeType); - dInput - = std::make_unique(0, 0, 0, 0, std::move(dummyLogits), std::move(endIds), std::move(batchSlots)); - + { // prevent reusing these vars after std::move + auto dummyLogits = mBufferManager.emptyTensor(MemoryType::kGPU, nvFloatType); + auto endIds = mBufferManager.emptyTensor(MemoryType::kGPU, nvTokenIdType); + auto batchSlots = mBufferManager.emptyTensor(MemoryType::kPINNED, nvSizeType); + dInput = std::make_unique( + 0, 0, 0, 0, std::move(dummyLogits), std::move(endIds), std::move(batchSlots)); + } dInput->sequenceLimitLength = mBufferManager.emptyTensor(MemoryType::kGPU, nvSizeType); dInput->lengths = mBufferManager.emptyTensor(MemoryType::kGPU, nvSizeType); auto& dOutput = mJointDecodingOutput; - auto outputIds = mBufferManager.emptyTensor(MemoryType::kGPU, nvTokenIdType); - auto gatheredOutputIds = mBufferManager.emptyTensor(MemoryType::kGPU, nvTokenIdType); - dOutput = std::make_unique(std::move(outputIds), std::move(gatheredOutputIds)); - + { // prevent reusing these vars after std::move + auto outputIds = mBufferManager.emptyTensor(MemoryType::kGPU, nvTokenIdType); + auto gatheredOutputIds = mBufferManager.emptyTensor(MemoryType::kGPU, nvTokenIdType); + dOutput = std::make_unique(std::move(outputIds), std::move(gatheredOutputIds)); + } dOutput->newTokensSteps = mBufferManager.emptyTensor(MemoryType::kGPU, nvTokenIdType); - dOutput->parentIds = mBufferManager.emptyTensor(MemoryType::kGPU, nvTokenIdType); + dOutput->parentIds = mBufferManager.emptyTensor(MemoryType::kGPU, nvSizeType); mFinishedSteps = mBufferManager.emptyTensor(MemoryType::kGPU, TRTDataType::value); - mDraftProbs = mBufferManager.emptyTensor(MemoryType::kGPU, nvFloatType); - mTargetProbs = mBufferManager.emptyTensor(MemoryType::kGPU, nvFloatType); - mBatchSlotsSetup = mBufferManager.emptyTensor(MemoryType::kPINNEDPOOL, TRTDataType::value); - mBatchSlotsDecoder = mBufferManager.emptyTensor(MemoryType::kPINNEDPOOL, TRTDataType::value); - mBatchSlotsAcceptTokens = mBufferManager.emptyTensor(MemoryType::kPINNEDPOOL, TRTDataType::value); - mBatchSlotsAcceptLogits = mBufferManager.emptyTensor(MemoryType::kPINNEDPOOL, TRTDataType::value); + mBatchSlotsSetup = mBufferManager.emptyTensor(MemoryType::kPINNEDPOOL, nvSizeType); + mBatchSlotsDecoder = mBufferManager.emptyTensor(MemoryType::kPINNEDPOOL, nvSizeType); // use batchSize many entries instead of the usual 1 dOutput->finishedSum = mBufferManager.emptyTensor(MemoryType::kPINNEDPOOL, nvSizeType); mFinishedSum = BufferManager::pinned(ITensor::makeShape({1}), nvSizeType); @@ -129,16 +127,10 @@ GptDecoderBatched::GptDecoderBatched(std::size_t vocabSize, std::size_t vocabSiz dOutput->logProbsTiled = mBufferManager.emptyTensor(MemoryType::kGPU, TRTDataType::value); - mNumDraftTokens 
= mBufferManager.emptyTensor(MemoryType::kGPU, nvSizeType); - mCurandStates = mBufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT8); - mDraftTokenIds = mBufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32); - mDraftLogits = mBufferManager.emptyTensor(MemoryType::kGPU, nvFloatType); - mTargetLogitsPtrs = mBufferManager.emptyTensor(MemoryType::kPINNEDPOOL, TRTDataType::value); - dInput->stopWordsPtrs = mBufferManager.emptyTensor(MemoryType::kPINNEDPOOL, TRTDataType::value); - dInput->stopWordsLens = mBufferManager.emptyTensor(MemoryType::kPINNEDPOOL, TRTDataType::value); + dInput->stopWordsLens = mBufferManager.emptyTensor(MemoryType::kPINNEDPOOL, nvSizeType); dInput->badWordsPtrs = mBufferManager.emptyTensor(MemoryType::kPINNEDPOOL, TRTDataType::value); - dInput->badWordsLens = mBufferManager.emptyTensor(MemoryType::kPINNEDPOOL, TRTDataType::value); + dInput->badWordsLens = mBufferManager.emptyTensor(MemoryType::kPINNEDPOOL, nvSizeType); dInput->embeddingBias = mBufferManager.emptyTensor(MemoryType::kGPU, dtype); int device; @@ -149,13 +141,13 @@ GptDecoderBatched::GptDecoderBatched(std::size_t vocabSize, std::size_t vocabSiz if (!mSpeculativeDecodingMode.isNone()) { - allocateSpeculativeDecodingBuffers(); + allocateSpeculativeDecodingBuffers(dtype); } TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } -void GptDecoderBatched::allocateSpeculativeDecodingBuffers() +void GptDecoderBatched::allocateSpeculativeDecodingBuffers(nvinfer1::DataType dtype) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); auto constexpr nvSizeType = TRTDataType::value; @@ -201,6 +193,22 @@ void GptDecoderBatched::allocateSpeculativeDecodingBuffers() } dOutput->speculativeDecodingOutputs = speculativeDecodingOutputs; + if (mSpeculativeDecodingMode.isDraftTokensExternal()) + { + DecodingInput::ExternalDraftTokensInputs externalDraftTokensInputs; + + externalDraftTokensInputs.draftLogits = mBufferManager.emptyTensor(MemoryType::kGPU, dtype); + externalDraftTokensInputs.draftProbs = mBufferManager.emptyTensor(MemoryType::kGPU, dtype); + externalDraftTokensInputs.targetProbs = mBufferManager.emptyTensor(MemoryType::kGPU, dtype); + externalDraftTokensInputs.numDraftTokens = mBufferManager.emptyTensor(MemoryType::kGPU, nvSizeType); + externalDraftTokensInputs.useDraftLogits + = mBufferManager.emptyTensor(MemoryType::kGPU, TRTDataType::value); + externalDraftTokensInputs.draftTokenIds + = mBufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32); + + dInput->externalDraftTokensInputs = externalDraftTokensInputs; + } + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } @@ -251,6 +259,7 @@ void GptDecoderBatched::setup(executor::DecodingMode const& mode, SizeType32 max auto const maxTokensPerStepXmaxBatchSizeXmaxBeamWidth = ITensor::makeShape({maxTokensPerEngineStep, maxBatchSize, maxBeamWidth}); auto const maxBatchSizeXmaxTokensPerStep = ITensor::makeShape({maxBatchSize, maxTokensPerEngineStep}); + auto const jointOutputIdsShape = ITensor::makeShape({maxBatchSize, maxBeamWidth, maxSequenceLength}); auto& dInput = *mJointDecodingInput; dInput.maxLength = mMaxSequenceLength; @@ -268,8 +277,6 @@ void GptDecoderBatched::setup(executor::DecodingMode const& mode, SizeType32 max inputLengths.reshape(maxBatchSizeXmaxBeamWidth); mBufferManager.setZero(inputLengths); - auto const jointOutputIdsShape = ITensor::makeShape({maxBatchSize, maxBeamWidth, maxSequenceLength}); - auto& dOutput = *mJointDecodingOutput; dOutput.ids->reshape(jointOutputIdsShape); @@ -296,15 +303,18 @@ void 
GptDecoderBatched::setup(executor::DecodingMode const& mode, SizeType32 max mBatchSlotsSetup->reshape(ITensor::makeShape({maxBatchSize})); mBatchSlotsDecoder->reshape(ITensor::makeShape({maxTokensPerEngineStep, maxBatchSize})); - mBatchSlotsAcceptTokens->reshape(ITensor::makeShape({maxTokensPerEngineStep, maxBatchSize})); - mBatchSlotsAcceptLogits->reshape(ITensor::makeShape({maxTokensPerEngineStep, maxBatchSize})); if (mSpeculativeDecodingMode.isDraftTokensExternal()) { - mDraftProbs->reshape(ITensor::makeShape( + dInput.externalDraftTokensInputs->draftProbs->reshape(ITensor::makeShape( {maxBatchSize, maxTokensPerEngineStep, maxBeamWidth, static_cast(mVocabSizePadded)})); - mTargetProbs->reshape(ITensor::makeShape( + dInput.externalDraftTokensInputs->targetProbs->reshape(ITensor::makeShape( {maxBatchSize, maxTokensPerEngineStep, maxBeamWidth, static_cast(mVocabSizePadded)})); + dInput.externalDraftTokensInputs->draftLogits->reshape( + ITensor::makeShape({maxBatchSize, maxTokensPerEngineStep, static_cast(mVocabSizePadded)})); + dInput.externalDraftTokensInputs->draftTokenIds->reshape(maxBatchSizeXmaxTokensPerStep); + dInput.externalDraftTokensInputs->numDraftTokens->reshape(ITensor::makeShape({maxBatchSize, 1})); + dInput.externalDraftTokensInputs->useDraftLogits->reshape(ITensor::makeShape({maxBatchSize, 1})); } dOutput.parentIds->reshape(jointOutputIdsShape); @@ -317,7 +327,7 @@ void GptDecoderBatched::setup(executor::DecodingMode const& mode, SizeType32 max dOutput.cumLogProbs->reshape(maxBatchSizeXmaxBeamWidth); mBufferManager.setZero(*dOutput.cumLogProbs); - dOutput.logProbs->reshape(ITensor::makeShape({maxBatchSize, maxBeamWidth, mMaxSequenceLength})); + dOutput.logProbs->reshape(jointOutputIdsShape); mBufferManager.setZero(*dOutput.logProbs); if (maxBeamWidth > 1) @@ -328,15 +338,6 @@ void GptDecoderBatched::setup(executor::DecodingMode const& mode, SizeType32 max dOutput.logProbsTiled->reshape(ITensor::makeShape({maxSequenceLength, maxBatchSize, maxBeamWidth})); mBufferManager.setZero(*dOutput.logProbsTiled); - // speculative decoding only works for beam width == 1 - mDraftTokenIds->reshape(maxBatchSizeXmaxTokensPerStep); - mDraftLogits->reshape( - ITensor::makeShape({maxBatchSize, maxTokensPerEngineStep, static_cast(mVocabSizePadded)})); - mAcceptByLogits.resize(maxBatchSize); - mNumDraftTokens->reshape(ITensor::makeShape({maxBatchSize, 1})); - mCurandStates->reshape(ITensor::makeShape({maxBatchSize, sizeof(curandState_t)})); - mTargetLogitsPtrs->reshape(ITensor::makeShape({maxTokensPerEngineStep, maxBatchSize})); - const_cast(*dInput.embeddingBias) .reshape(ITensor::makeShape({maxBatchSize, static_cast(mVocabSizePadded)})); const_cast(*dInput.badWordsPtrs).reshape(ITensor::makeShape({maxBatchSize})); @@ -591,7 +592,6 @@ void GptDecoderBatched::newRequestSpeculativeDecoding( SizeType32 batchIdx, decoder_batch::Request const& request, SamplingConfig const& samplingConfig) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - mAcceptByLogits[batchIdx] = false; if (mSpeculativeDecodingMode.predictsDraftTokens()) { @@ -639,40 +639,41 @@ void GptDecoderBatched::newRequestDraftTokensExternal( auto const& stream = mDecoderStream; BufferManager manager{stream}; - auto constexpr localBatchSize = 1; + auto& dJointInput = *mJointDecodingInput; + auto useDraftLogits = false; auto const numDraftTokens = request.generatedTokensPerEngineStep - 1; if (request.draftLogits.has_value()) { TensorPtr draftLogitsView = ITensor::view(request.draftLogits.value()); - mAcceptByLogits[batchIdx] = true; + 
useDraftLogits = true; - TensorPtr draftLogitsReqBatchSlice = ITensor::slice(mDraftLogits, batchIdx, 1); + TensorPtr draftLogitsReqBatchSlice + = ITensor::slice(dJointInput.externalDraftTokensInputs->draftLogits, batchIdx, 1); draftLogitsReqBatchSlice->squeeze(0); TensorPtr draftLogitsReqTokensSlice = ITensor::slice(draftLogitsReqBatchSlice, 0, numDraftTokens); manager.copy(*draftLogitsView, *draftLogitsReqTokensSlice); } - TensorPtr draftTokensReqBatchSlice = ITensor::slice(mDraftTokenIds, batchIdx, 1); + auto useDraftLogitsView = ITensor::slice(dJointInput.externalDraftTokensInputs->useDraftLogits, batchIdx, 1); + kernels::invokeFill(*useDraftLogitsView, useDraftLogits, *stream); + + TensorPtr draftTokensReqBatchSlice + = ITensor::slice(dJointInput.externalDraftTokensInputs->draftTokenIds, batchIdx, 1); draftTokensReqBatchSlice->squeeze(0); TensorPtr draftTokensReqTokensSlice = ITensor::slice(draftTokensReqBatchSlice, 0, numDraftTokens); TensorPtr draftTokensView = ITensor::view(request.draftTokens, ITensor::makeShape({numDraftTokens})); manager.copy(*draftTokensView, *draftTokensReqTokensSlice); - auto const curandStatesView = ITensor::slice(mCurandStates, batchIdx, 1); - auto curandState = reinterpret_cast(bufferCast(*curandStatesView)); - auto batchSlotsPtr = bufferCast(*ITensor::slice(mBatchSlotsSetup, 0, localBatchSize)); - if (samplingConfig.randomSeed.has_value()) - { - tk::invokeCurandInitialize( - curandState, batchSlotsPtr, localBatchSize, samplingConfig.randomSeed.value()[0], stream->get()); - } - else - { - tk::invokeCurandInitialize(curandState, batchSlotsPtr, localBatchSize, 0, stream->get()); - } - auto numDraftTokensView = ITensor::slice(mNumDraftTokens, batchIdx, 1); + auto numDraftTokensView = ITensor::slice(dJointInput.externalDraftTokensInputs->numDraftTokens, batchIdx, 1); kernels::invokeFill(*numDraftTokensView, numDraftTokens, *stream); + bool const useRandomAcceptanceThreshold = !samplingConfig.draftAcceptanceThreshold.has_value(); + float const constantThreshold + = useRandomAcceptanceThreshold ? 0 : samplingConfig.draftAcceptanceThreshold.value()[0]; + + dJointInput.externalDraftTokensInputs->useRandomAcceptanceThreshold = useRandomAcceptanceThreshold; + dJointInput.externalDraftTokensInputs->constantThreshold = constantThreshold; + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } @@ -838,8 +839,6 @@ void GptDecoderBatched::forwardDecoder( auto batchSlotsDecoderPtr = maxBeamWidth > 1 && input.seqSlots ? 
bufferCast(*input.seqSlots) : bufferCast(*mBatchSlotsDecoder); - auto batchSlotsAcceptTokensPtr = bufferCast(*mBatchSlotsAcceptTokens); - auto batchSlotsAcceptLogitsPtr = bufferCast(*mBatchSlotsAcceptLogits); auto& dInput = *mJointDecodingInput; auto& dOutput = *mJointDecodingOutput; auto& decoder = *mDecoder; @@ -864,26 +863,12 @@ void GptDecoderBatched::forwardDecoder( } SizeType32 localBatchDecoderIdx = 0; - SizeType32 localBatchAcceptTokensIdx = 0; - SizeType32 localBatchAcceptLogitsIdx = 0; for (SizeType32 bi = 0; bi < mActualBatchSize; ++bi) { if (mFinished[bi] || !input.active.at(bi) || step >= mNumDecodingEngineTokens[bi]) { continue; } - - if (!mAcceptByLogits[bi] && mMaxDecodingDecoderTokens == 1 && mNumDecodingEngineTokens[bi] > 1 - && step == mNumDecodingEngineTokens[bi] - 1) - { - batchSlotsAcceptTokensPtr[step * mActualBatchSize + localBatchAcceptTokensIdx] = bi; - localBatchAcceptTokensIdx++; - } - else if (mAcceptByLogits[bi] && mMaxDecodingDecoderTokens == 1 && mNumDecodingEngineTokens[bi] > 1 && step == 0) - { - batchSlotsAcceptLogitsPtr[step * mActualBatchSize + localBatchAcceptLogitsIdx] = bi; - localBatchAcceptLogitsIdx++; - } batchSlotsDecoderPtr[step * mActualBatchSize + localBatchDecoderIdx] = bi; localBatchDecoderIdx++; } @@ -892,9 +877,6 @@ void GptDecoderBatched::forwardDecoder( = *std::max_element(std::begin(mNumDecodingEngineTokens), std::end(mNumDecodingEngineTokens)); std::vector logitsVec; - auto targetLogitsPtrsSlice = ITensor::slice(mTargetLogitsPtrs, step, 1); - auto targetLogitsPtrsSlicePtr = reinterpret_cast(bufferCast(*targetLogitsPtrsSlice)); - SizeType32 targetLogitsIdx = 0; for (SizeType32 bi = 0; bi < mActualBatchSize; ++bi) { if (mFinished[bi] || !input.active.at(bi) || step >= mNumDecodingEngineTokens[bi]) @@ -904,32 +886,6 @@ void GptDecoderBatched::forwardDecoder( auto const& targetLogits = allTargetLogits[bi]; TensorPtr logitsSlice = ITensor::slice(targetLogits, step, singleRequest); logitsVec.push_back(logitsSlice); - targetLogitsPtrsSlicePtr[targetLogitsIdx++] = logitsSlice->data(); - } - - if (async && localBatchAcceptLogitsIdx > 0) - { - // These params are only used for testing. Thus, can be per batch instead of per request - auto const& samplingConfig = decoder.getSamplingConfig(); - bool const useRandomAcceptanceThreshold = !samplingConfig.draftAcceptanceThreshold.has_value(); - float const randomAcceptanceThreshold - = useRandomAcceptanceThreshold ? 
0 : samplingConfig.draftAcceptanceThreshold.value()[0]; - - TensorPtr batchSlotsAcceptLogitsStepSlice = ITensor::slice(mBatchSlotsAcceptLogits, step, 1); - batchSlotsAcceptLogitsStepSlice->squeeze(0); - TensorPtr batchSlotsAcceptLogitsSlice - = ITensor::slice(batchSlotsAcceptLogitsStepSlice, 0, localBatchAcceptLogitsIdx); - - IGptDecoder::acceptDraftTokensByLogits( - /* [maxBatchSize, maxDecodingTokens, vocabPadded] */ *mDraftLogits, - /* [maxBatchSize][maxDecodingTokens, vocabPadded] */ *targetLogitsPtrsSlice, - /* [maxBatchSize, maxDecodingTokens, vocabPadded] */ *mDraftProbs, - /* [maxBatchSize, maxDecodingTokens, vocabPadded] */ *mTargetProbs, - /* [maxBatchSize] */ *mNumDraftTokens, - /* [maxDecodingTokens, maxBatchSize] */ *mFinishedSteps, - /* [bs] */ *batchSlotsAcceptLogitsSlice, static_cast(mVocabSize), - static_cast(mVocabSizePadded), useRandomAcceptanceThreshold, randomAcceptanceThreshold, - reinterpret_cast(bufferCast(*mCurandStates)), stream); } TensorPtr finishedStepsInput = ITensor::slice(mFinishedSteps, step, 1); @@ -958,6 +914,11 @@ void GptDecoderBatched::forwardDecoder( dInput.medusaInputs->medusaLogits = input.predictedDraftLogits; } + if (mSpeculativeDecodingMode.isDraftTokensExternal()) + { + dInput.externalDraftTokensInputs->step = step; + } + dOutput.newTokens = newTokensStepView; dOutput.finishReasons = finishedStepsOutput; dOutput.lengths = sequenceLengths; @@ -987,26 +948,6 @@ void GptDecoderBatched::forwardDecoder( mNbSteps[bi] += 1; mFinished[bi] = mNbSteps[bi] >= mMaxNewTokens[bi]; } - if (async && localBatchAcceptTokensIdx > 0) - { - TensorPtr batchSlotsAcceptTokensStepSlice = ITensor::slice(mBatchSlotsAcceptTokens, step, 1); - batchSlotsAcceptTokensStepSlice->squeeze(0); - auto batchSlotsAcceptTokensSlice - = ITensor::slice(batchSlotsAcceptTokensStepSlice, 0, localBatchAcceptTokensIdx); - - // Update finished state for 0th step - auto finishedFinal = ITensor::slice(mFinishedSteps, step, 1); - IGptDecoder::acceptDraftTokensByIds( - /* [maxBatchSize, maxBeamWidth, maxSeqLen] */ *dOutput.ids, - /* [maxBatchSize, maxDecodingDraftTokens] */ *mDraftTokenIds, - /* [maxBatchSize] */ *dInput.lengths, - /* [maxBatchSize] */ *mNumDraftTokens, - /* [maxBatchSize] */ *dOutput.lengths, - /* [maxDecodingTokens, maxBatchSize] */ *mFinishedSteps, - /* [maxBatchSize] */ *finishedFinal, - /* [maxBatchSize] */ *dOutput.finishedSum, - /* [bs] */ *batchSlotsAcceptTokensSlice, stream); - } // If last iteration if (async && step == maxDecodingEngineTokens - mMaxDecodingDecoderTokens) diff --git a/cpp/tensorrt_llm/runtime/ipcUtils.cpp b/cpp/tensorrt_llm/runtime/ipcUtils.cpp index a1ab91c4a..75af2eb32 100644 --- a/cpp/tensorrt_llm/runtime/ipcUtils.cpp +++ b/cpp/tensorrt_llm/runtime/ipcUtils.cpp @@ -149,19 +149,24 @@ AllReduceBuffers::AllReduceBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWi * std::min( static_cast(maxBatchSize) * maxBeamWidth * maxSequenceLength * hiddenSize * sizeof(float), utils::customAllReduceUtils::getMaxRequiredWorkspaceSize(tpSize)); + auto const lamportBufferSize + = tpSize * tensorrt_llm::kernels::reduce_fusion::details::kLamportTokenNumThreshold * hiddenSize * sizeof(half); auto const flagsSize = IpcMemory::FLAGS_SIZE * tpSize * 2; - for (auto size : {bufferSize, bufferSize, flagsSize, flagsSize}) + for (auto size : + {bufferSize, bufferSize, flagsSize, flagsSize, lamportBufferSize, lamportBufferSize, lamportBufferSize}) { mIpcMemoryHandles.emplace_back(size, manager, worldConfig, isP2pSupported); } mAllReduceCommPtrs - = 
BufferManager::cpu(ITensor::makeShape({static_cast(mIpcMemoryHandles.size()) * tpSize + 1}), + = BufferManager::cpu(ITensor::makeShape({static_cast(mIpcMemoryHandles.size()) * tpSize + 2}), nvinfer1::DataType::kINT64); auto commPtrs = BufferRange(*mAllReduceCommPtrs); - auto const flagPtr = static_cast(mAllReduceCommPtrs->data(mAllReduceCommPtrs->getSize() - 1)); - *flagPtr = 0; + auto const CustomARFlagPtr = static_cast(mAllReduceCommPtrs->data(mAllReduceCommPtrs->getSize() - 1)); + auto const LamportFlagPtr = static_cast(mAllReduceCommPtrs->data(mAllReduceCommPtrs->getSize() - 2)); + *CustomARFlagPtr = 0; + *LamportFlagPtr = 0; for (std::size_t memIdx = 0; memIdx < mIpcMemoryHandles.size(); memIdx++) { @@ -169,6 +174,20 @@ AllReduceBuffers::AllReduceBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWi TLLM_CHECK(memCommPtrs.size() == static_cast(tpSize)); std::copy(memCommPtrs.begin(), memCommPtrs.end(), commPtrs.begin() + memIdx * tpSize); } +#if ENABLE_MULTI_DEVICE + auto rank = worldConfig.getRank(); + auto tp_rank = worldConfig.getTensorParallelRank(); + if (rank == tp_rank) + { + tensorrt_llm::kernels::lamportInitialize( + mIpcMemoryHandles[4].getCommPtrs()[rank], lamportBufferSize / sizeof(half), nvinfer1::DataType::kHALF, 0); + tensorrt_llm::kernels::lamportInitialize( + mIpcMemoryHandles[5].getCommPtrs()[rank], lamportBufferSize / sizeof(half), nvinfer1::DataType::kHALF, 0); + tensorrt_llm::kernels::lamportInitialize( + mIpcMemoryHandles[6].getCommPtrs()[rank], lamportBufferSize / sizeof(half), nvinfer1::DataType::kHALF, 0); + cudaDeviceSynchronize(); + } +#endif TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } diff --git a/cpp/tensorrt_llm/runtime/tllmBuffers.h b/cpp/tensorrt_llm/runtime/tllmBuffers.h index 67a55d3ba..ea6beb7b2 100644 --- a/cpp/tensorrt_llm/runtime/tllmBuffers.h +++ b/cpp/tensorrt_llm/runtime/tllmBuffers.h @@ -216,7 +216,7 @@ class BorrowingAllocator : public BaseAllocator, , mCapacity(capacity) { TLLM_CHECK_WITH_INFO(capacity == 0 || static_cast(mPtr), "Undefined pointer"); - TLLM_CHECK_WITH_INFO(mCapacity >= 0, "Capacity must be non-negative"); + TLLM_CHECK_WITH_INFO(mCapacity >= std::size_t(0), "Capacity must be non-negative"); } protected: diff --git a/cpp/tensorrt_llm/runtime/tllmRuntime.cpp b/cpp/tensorrt_llm/runtime/tllmRuntime.cpp index fe0cf7c8a..a20046079 100644 --- a/cpp/tensorrt_llm/runtime/tllmRuntime.cpp +++ b/cpp/tensorrt_llm/runtime/tllmRuntime.cpp @@ -20,10 +20,12 @@ #include "tensorrt_llm/common/nvtxUtils.h" #include "tensorrt_llm/common/safetensors.h" #include "tensorrt_llm/executor/tensor.h" -#include "tensorrt_llm/layers/lookaheadDecodingUtils.h" #include "tllmLogger.h" +#include +#include #include +#include #include using namespace tensorrt_llm::runtime; @@ -141,6 +143,24 @@ TllmRuntime::TllmRuntime( // Print context memory size for CI/CD to track. 
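    // cacheTensorNames() (defined below) records the engine's input and output tensor names once,
    // so setInputTensors/setOutputTensors can iterate the cached name lists instead of querying the
    // engine on every call.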
TLLM_LOG_INFO("[MemUsageChange] Allocated %.2f MiB for execution context memory.", static_cast(devMemorySize) / 1048576.0); + + cacheTensorNames(); +} + +void TllmRuntime::cacheTensorNames() +{ + for (std::int32_t i = 0; i < mEngine->getNbIOTensors(); ++i) + { + auto const* const name = mEngine->getIOTensorName(i); + if (mEngine->getTensorIOMode(name) == nvinfer1::TensorIOMode::kINPUT) + { + mInputTensorNames.emplace_back(name); + } + else if (mEngine->getTensorIOMode(name) == nvinfer1::TensorIOMode::kOUTPUT) + { + mOutputTensorNames.emplace_back(name); + } + } } nvinfer1::IExecutionContext& TllmRuntime::addContext(std::int32_t profileIndex) @@ -188,68 +208,97 @@ bool TllmRuntime::executeContext(SizeType32 contextIndex) const return res; } -void TllmRuntime::setInputTensors(SizeType32 contextIndex, TensorMap const& tensorMap) +void TllmRuntime::setInputTensorsImpl(SizeType32 contextIndex, TensorMap const& tensorMap, bool throwOnMiss) { - TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); NVTX3_FUNC_RANGE(); auto& context = getContext(contextIndex); - for (std::int32_t i = 0; i < mEngine->getNbIOTensors(); ++i) + for (auto const& name : mInputTensorNames) { - char const* name = mEngine->getIOTensorName(i); - if (mEngine->getTensorIOMode(name) == nvinfer1::TensorIOMode::kINPUT) + auto const pos = tensorMap.find(name); + if (pos == tensorMap.end()) { - auto pos = tensorMap.find(name); - auto posWeight = mManagedWeightsMap.find(name); - if (pos == tensorMap.end() && posWeight == mManagedWeightsMap.end()) + if (throwOnMiss) { - auto expectedShape = mEngine->getTensorShape(name); - TLLM_THROW( - "Input tensor '%s' not found; expected shape: %s", name, ITensor::toString(expectedShape).c_str()); + auto expectedShape = mEngine->getTensorShape(name.c_str()); + TLLM_THROW("Input tensor '%s' not found; expected shape: %s", name.c_str(), + ITensor::toString(expectedShape).c_str()); } - if (posWeight != mManagedWeightsMap.end() && mSetWeights.count(contextIndex) > 0) + else { - continue; // This input tensor is a managed weight, and we have already set it in a previous call. + continue; } + } - auto const& tensor = pos == tensorMap.end() ? posWeight->second : pos->second; - auto const tensorDtype = tensor->getDataType(); - auto const engineDtype = mEngine->getTensorDataType(name); - // WAR: TRT does not support mixed FP8 and FP16 input, so engine expects FP16 tensors. 
- TLLM_CHECK_WITH_INFO(tensorDtype == engineDtype - || (tensorDtype == nvinfer1::DataType::kFP8 && engineDtype == nvinfer1::DataType::kHALF), - "%s: expected type %d, provided type %d", name, static_cast(engineDtype), - static_cast(tensorDtype)); - - auto const tensorShape = tensor->getShape(); - auto const setInputShapeSuccess = context.setInputShape(name, tensorShape); - if (!setInputShapeSuccess) - { - auto const minShape = mEngine->getProfileShape(name, contextIndex, nvinfer1::OptProfileSelector::kMIN); - auto const maxShape = mEngine->getProfileShape(name, contextIndex, nvinfer1::OptProfileSelector::kMAX); - - TLLM_THROW("Tensor '%s' has invalid shape %s, expected in range min %s, max %s", name, - ITensor::toString(tensorShape).c_str(), ITensor::toString(minShape).c_str(), - ITensor::toString(maxShape).c_str()); - } - auto* const data = tensor->data(); - if (data) - { - context.setInputTensorAddress(name, data); - } - else + auto const& tensor = pos->second; + auto const tensorDtype = tensor->getDataType(); + auto const engineDtype = mEngine->getTensorDataType(name.c_str()); + // WAR: TRT does not support mixed FP8 and FP16 input, so engine expects FP16 tensors. + TLLM_CHECK_WITH_INFO(tensorDtype == engineDtype + || (tensorDtype == nvinfer1::DataType::kFP8 && engineDtype == nvinfer1::DataType::kHALF), + "%s: expected type %d, provided type %d", name.c_str(), static_cast(engineDtype), + static_cast(tensorDtype)); + + auto const tensorShape = tensor->getShape(); + auto const setInputShapeSuccess = context.setInputShape(name.c_str(), tensorShape); + if (!setInputShapeSuccess) + { + auto const minShape + = mEngine->getProfileShape(name.c_str(), contextIndex, nvinfer1::OptProfileSelector::kMIN); + auto const maxShape + = mEngine->getProfileShape(name.c_str(), contextIndex, nvinfer1::OptProfileSelector::kMAX); + + TLLM_THROW("Tensor '%s' has invalid shape %s, expected in range min %s, max %s", name.c_str(), + ITensor::toString(tensorShape).c_str(), ITensor::toString(minShape).c_str(), + ITensor::toString(maxShape).c_str()); + } + auto* const data = tensor->data(); + if (data) + { + context.setInputTensorAddress(name.c_str(), data); + } + else + { + TLLM_CHECK_WITH_INFO(tensor->getSize() == 0, std::string("Invalid data for tensor: ") + name.c_str()); + // TensorRT runtime does not support nullptr. + if (!mDummyTensor) { - TLLM_CHECK_WITH_INFO(tensor->getSize() == 0, std::string("Invalid data for tensor: ") + name); - // TensorRT runtime does not support nullptr. 
- if (!mDummyTensor) - { - mDummyTensor = mBufferManager.gpu(ITensor::makeShape({1})); - } - context.setInputTensorAddress(name, mDummyTensor->data()); + mDummyTensor = mBufferManager.gpu(ITensor::makeShape({1})); } + context.setInputTensorAddress(name.c_str(), mDummyTensor->data()); } } +} + +void TllmRuntime::setStaticInputTensors(TensorMap const& tensorMap) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + NVTX3_FUNC_RANGE(); + + TLLM_CHECK_WITH_INFO(getNbContexts() > 0, "Contexts should be created before calling setStaticInputTensors"); + for (auto contextIndex = 0; contextIndex < getNbContexts(); ++contextIndex) + { + setInputTensorsImpl(contextIndex, tensorMap, false); + } - mSetWeights.insert(contextIndex); + // move static input tensor names to separate vector + auto const begin = mInputTensorNames.begin(); + auto end = mInputTensorNames.end(); + for (auto const& [name, tensor] : tensorMap) + { + end = std::remove(begin, end, name); + } + mInputTensorNames.erase(end, mInputTensorNames.end()); + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +void TllmRuntime::setInputTensors(SizeType32 contextIndex, TensorMap const& tensorMap) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + NVTX3_FUNC_RANGE(); + setInputTensorsImpl(contextIndex, tensorMap, true); + + auto& context = getContext(contextIndex); if (mUseShapeInference) { NVTX3_SCOPED_RANGE(infer_shapes); @@ -278,41 +327,37 @@ void TllmRuntime::setOutputTensors(SizeType32 contextIndex, TensorMap& tensorMap TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); NVTX3_FUNC_RANGE(); auto& context = getContext(contextIndex); - for (std::int32_t i = 0; i < mEngine->getNbIOTensors(); ++i) + for (auto const& name : mOutputTensorNames) { - auto const name = mEngine->getIOTensorName(i); - if (mEngine->getTensorIOMode(name) == nvinfer1::TensorIOMode::kOUTPUT) + auto const engineDtype = mEngine->getTensorDataType(name.c_str()); + auto const pos = tensorMap.find(name); + if (pos != tensorMap.end()) { - auto const engineDtype = mEngine->getTensorDataType(name); - auto pos = tensorMap.find(name); - if (pos != tensorMap.end()) - { - auto const& tensor = pos->second; - auto const tensorDtype = tensor->getDataType(); - // WAR: TRT does not support mixed FP8 and FP16 input, so engine expects FP16 tensors. - TLLM_CHECK_WITH_INFO(tensorDtype == engineDtype - || (tensorDtype == nvinfer1::DataType::kFP8 && engineDtype == nvinfer1::DataType::kHALF), - "%s: expected type %d, provided type %d", name, static_cast(engineDtype), - static_cast(tensorDtype)); - - if (mUseShapeInference) - { - auto const dims = context.getTensorShape(name); - tensor->reshape(dims); - } - context.setTensorAddress(name, tensor->data()); - } - else if (mUseShapeInference) - { - auto const dims = context.getTensorShape(name); - auto tensor = ITensor::SharedPtr(mBufferManager.gpu(dims, engineDtype)); - tensorMap.insert(pos, std::make_pair(name, tensor)); - context.setTensorAddress(name, tensor->data()); - } - else + auto const& tensor = pos->second; + auto const tensorDtype = tensor->getDataType(); + // WAR: TRT does not support mixed FP8 and FP16 input, so engine expects FP16 tensors. 
+ TLLM_CHECK_WITH_INFO(tensorDtype == engineDtype + || (tensorDtype == nvinfer1::DataType::kFP8 && engineDtype == nvinfer1::DataType::kHALF), + "%s: expected type %d, provided type %d", name.c_str(), static_cast(engineDtype), + static_cast(tensorDtype)); + + if (mUseShapeInference) { - TLLM_THROW("Tensor %s is not found in tensorMap and shape inference is not allowed", name); + auto const dims = context.getTensorShape(name.c_str()); + tensor->reshape(dims); } + context.setTensorAddress(name.c_str(), tensor->data()); + } + else if (mUseShapeInference) + { + auto const dims = context.getTensorShape(name.c_str()); + auto tensor = ITensor::SharedPtr(mBufferManager.gpu(dims, engineDtype)); + tensorMap.insert(pos, std::make_pair(name, tensor)); + context.setTensorAddress(name.c_str(), tensor->data()); + } + else + { + TLLM_THROW("Tensor %s is not found in tensorMap and shape inference is not allowed", name.c_str()); } } TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); @@ -330,7 +375,7 @@ bool TllmRuntime::hasLayerProfiler(SizeType32 contextId) const void TllmRuntime::setLayerProfiler() { - mLayerProfiler.reset(new LayerProfiler); + mLayerProfiler = std::make_unique(); for (auto& context : mContexts) { context->setProfiler(mLayerProfiler.get()); @@ -351,6 +396,8 @@ void TllmRuntime::reportToProfiler(SizeType32 contextId) void TllmRuntime::loadManagedWeights(RawEngine const& rawEngine, int localRank) { + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + NVTX3_FUNC_RANGE(); auto& engine = getEngine(); auto& manager = getBufferManager(); if (rawEngine.getManagedWeightsMapOpt().has_value()) @@ -386,4 +433,6 @@ void TllmRuntime::loadManagedWeights(RawEngine const& rawEngine, int localRank) mManagedWeightsMap.insert(std::make_pair(name, weightsDevice)); } } + setStaticInputTensors(mManagedWeightsMap); + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } diff --git a/cpp/tensorrt_llm/runtime/tllmRuntime.h b/cpp/tensorrt_llm/runtime/tllmRuntime.h index 80cea9c5f..06d7815cd 100644 --- a/cpp/tensorrt_llm/runtime/tllmRuntime.h +++ b/cpp/tensorrt_llm/runtime/tllmRuntime.h @@ -73,8 +73,20 @@ class TllmRuntime void clearContexts(); + /// @brief Set input tensors from tensorMap for all contexts. + /// @details The function can be used to set static input tensors for all iterations. If a tensor was set this way, + /// it doesn't need to included in calls to setInputTensors anymore. + void setStaticInputTensors(TensorMap const& tensorMap); + + /// @brief Set input tensors from tensorMap for context at contextIndex. + /// @details The function expects that all input tensors (excluding the ones set by setStaticInputTensors) are + /// contained in the tensorMap. If a tensor is missing, has a bad shape or type, it will throw. void setInputTensors(SizeType32 contextIndex, TensorMap const& tensorMap); + /// @brief Set output tensors from tensorMap for context at contextIndex. + /// @details The function expects that all output tensors are contained in the tensorMap. If a tensor is missing and + /// shape inference is enabled, it will allocate the tensor on GPU and insert it into the tensorMap. Otherwise it + /// will throw. 
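    // A minimal usage sketch of the calls documented above; "runtime", "managedWeights",
    // "stepInputs" and "stepOutputs" are hypothetical names, and actual call sites may differ:
    //
    //   runtime.setStaticInputTensors(managedWeights);        // once, for inputs shared by all contexts
    //   runtime.setInputTensors(contextIndex, stepInputs);    // per iteration: remaining input tensors
    //   runtime.setOutputTensors(contextIndex, stepOutputs);  // per iteration: output tensors
    //   runtime.executeContext(contextIndex);                 // run the engine on this context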
void setOutputTensors(SizeType32 contextIndex, TensorMap& tensorMap); bool executeContext(SizeType32 contextIndex) const; @@ -123,6 +135,10 @@ class TllmRuntime void loadManagedWeights(RawEngine const& rawEngine, int localRank); private: + void cacheTensorNames(); + + void setInputTensorsImpl(SizeType32 contextIndex, TensorMap const& tensorMap, bool throwOnMiss); + BufferManager::CudaStreamPtr mStream; BufferManager mBufferManager; std::unique_ptr mRuntime; @@ -133,7 +149,10 @@ class TllmRuntime std::unique_ptr mEngineInspector; std::unique_ptr mLayerProfiler; bool mUseShapeInference; - TensorMap mManagedWeightsMap{}; - std::set mSetWeights; + TensorMap mManagedWeightsMap; + // List of input tensor names. Names of static tensors are removed from this list when setStaticInputTensors is + // called. + std::vector mInputTensorNames; + std::vector mOutputTensorNames; }; } // namespace tensorrt_llm::runtime diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 839136b99..34034fa7d 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -45,20 +45,12 @@ set(TOP_LEVEL_DIR "${PROJECT_SOURCE_DIR}/..") add_custom_target(google-tests) -set(CASE_REPORT_WRAPPER - ${CMAKE_CURRENT_SOURCE_DIR}/resources/scripts/case_report_wrapper.py) - function(add_gtest test_name test_src) set(options NO_GTEST_MAIN NO_TLLM_LINKAGE) cmake_parse_arguments(ARGS "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_executable(${test_name} ${test_src}) - if($ENV{LLM_MEMORY_PROFILING}) - set_property(TARGET ${test_name} PROPERTY TEST_LAUNCHER - ${CASE_REPORT_WRAPPER}) - endif() - target_link_libraries(${test_name} PUBLIC gmock_main nvonnxparser) if(NOT ARGS_NO_GTEST_MAIN) target_link_libraries(${test_name} PUBLIC gtest_main) diff --git a/cpp/tests/kernels/allReduce/allReduceKernelTest.cu b/cpp/tests/kernels/allReduce/allReduceKernelTest.cu index a0f9233df..b6fd6f3b9 100644 --- a/cpp/tests/kernels/allReduce/allReduceKernelTest.cu +++ b/cpp/tests/kernels/allReduce/allReduceKernelTest.cu @@ -51,6 +51,16 @@ void simple_assert(bool flag) } } +void check_last_cuda_error() +{ + auto err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("CUDA error: %s\n", cudaGetErrorString(err)); + exit(-1); + } +} + struct CudaBuffer { void* _data; @@ -85,8 +95,22 @@ struct CudaBuffer }; template -float compare(int rank, void* _pa, void* _pb, int size, float scale) +float compare( + int rank, void* _pa, void* _pb, int size, float scale, bool print_error = false, std::string const& cmp_info = "") { + if (print_error && rank == 0) + { + if (!cmp_info.empty()) + { + printf("compare %s\n", cmp_info.c_str()); + } + else + { + static int cnt = 0; + printf("unnamed compare %d\n", cnt++); + } + } + auto pa = reinterpret_cast(_pa); auto pb = reinterpret_cast(_pb); float max_diff = 0.f, tot_diff = 0.f; @@ -101,6 +125,10 @@ float compare(int rank, void* _pa, void* _pb, int size, float scale) float diff = std::abs(va - vb); if (diff > threshold) { + if (rank == 0 && print_error) + { + printf("err idx %d, value %f vs %f\n", n, va, vb); + } max_diff = std::max(max_diff, diff); tot_diff += diff; ++diff_cnt; @@ -130,7 +158,7 @@ float compare(int rank, void* _pa, void* _pb, int size, float scale) template void random_fill(std::vector& vec, T2 minv, T2 maxv) { - std::mt19937 gen(20240410); + std::mt19937 gen(20240725); std::uniform_real_distribution dis(static_cast(minv), static_cast(maxv)); for (auto& v : vec) { @@ -164,8 +192,64 @@ std::string ar_info(AllReduceStrategyType runtime_strategy, 
AllReduceStrategyCon return info; } -bool test(int token_num, int hidden_size, bool has_bias, bool has_affine, int warmup, int iter, - AllReduceStrategyType runtime_strategy = AllReduceStrategyType::ONESHOT, +struct SetDevice +{ + SetDevice(int device_id) + { + TLLM_CUDA_CHECK(cudaSetDevice(device_id)); + } +}; + +class Workspace +{ +public: + Workspace(int world_size, int rank, int max_token_num, int max_hidden_size) + : world_config(world_size, 1, rank, world_size) + , set_device(world_config.getDevice()) + , p_s(std::make_shared()) + , buf_mgr(p_s) + , buffers(1, 1, max_token_num, max_hidden_size, buf_mgr, world_config) + { + } + + void set_params(AllReduceParams& params) const + { + int world_size = world_config.getSize(); + for (int i = 0; i < world_size; ++i) + { + params.peer_comm_buffer_ptrs[i] = buffers.mIpcMemoryHandles[0].getCommPtrs()[i]; + params.fusion_params.lamport_peer_comm_buffer_ptrs[i] = buffers.mIpcMemoryHandles[4].getCommPtrs()[i]; + params.fusion_params.lamport_peer_comm_buffer_ptrs[i + MAX_RANKS_PER_NODE] + = buffers.mIpcMemoryHandles[5].getCommPtrs()[i]; + params.fusion_params.lamport_peer_comm_buffer_ptrs[i + MAX_RANKS_PER_NODE * 2] + = buffers.mIpcMemoryHandles[6].getCommPtrs()[i]; + } + for (int i = 0; i < world_size; ++i) + { + params.peer_barrier_ptrs_in[i] = reinterpret_cast(buffers.mIpcMemoryHandles[2].getCommPtrs()[i]); + } + for (int i = 0; i < world_size; ++i) + { + params.peer_barrier_ptrs_out[i] + = reinterpret_cast(buffers.mIpcMemoryHandles[3].getCommPtrs()[i]); + } + } + + cudaStream_t get_stream() const + { + return p_s->get(); + } + +protected: + tr::WorldConfig world_config; + SetDevice set_device; + std::shared_ptr p_s; + tr::BufferManager buf_mgr; + tr::AllReduceBuffers buffers; +}; + +bool test(Workspace const& workspace, int token_num, int hidden_size, bool has_bias, bool has_affine, int warmup, + int iter, AllReduceStrategyType runtime_strategy = AllReduceStrategyType::ONESHOT, AllReduceStrategyConfig config = AllReduceStrategyConfig(0), AllReduceFusionOp fusion_op = AllReduceFusionOp::NONE) { std::srand(20240603); @@ -183,9 +267,13 @@ bool test(int token_num, int hidden_size, bool has_bias, bool has_affine, int wa random_fill(residual_buffer, -1, 1); random_fill(weight_buffer, -1, 1); random_fill(bias_buffer, -1, 1); + random_fill(inter_buffer, 0, 0); + random_fill(output_buffer, 0, 0); residual.copy_from(residual_buffer.data()); weight.copy_from(weight_buffer.data()); bias.copy_from(bias_buffer.data()); + inter.copy_from(inter_buffer.data()); + out.copy_from(output_buffer.data()); auto& comm = mpi::MpiComm::world(); auto world_size = comm.getSize(); auto rank = comm.getRank(); @@ -195,40 +283,25 @@ bool test(int token_num, int hidden_size, bool has_bias, bool has_affine, int wa if (fusion_op == AllReduceFusionOp::RESIDUAL_RMS_NORM) { printf( - "Custom All Reduce with Residual Add and RMS Norm, %s, message size %d(token num %d, hidden size %d), " + "Custom All Reduce with Residual Add and RMS Norm, %s, message size %6d(token num %6d, hidden size " + "%6d), " "has bias %d, has affine %d\n", info.c_str(), message_size, token_num, hidden_size, static_cast(has_bias), static_cast(has_affine)); } else { - printf("Custom All Reduce, %s, message size %d(token num %d, hidden size %d), has bias %d, has affine %d\n", + printf( + "Custom All Reduce, %s, message size %d(token num %d, hidden size %6d), has bias %6d, has affine %6d\n", info.c_str(), message_size, token_num, hidden_size, static_cast(has_bias), static_cast(has_affine)); } } - 
random_fill(input_buffer, -1 / world_size, 1 / world_size); + random_fill(input_buffer, -1, 1); in.copy_from(input_buffer.data()); - cudaSetDevice(rank); - - tr::WorldConfig world_config(world_size, 1, rank, world_size); - auto p_s = std::make_shared(); - tr::BufferManager buf_mgr(p_s); - tr::AllReduceBuffers buffers(1, 1, token_num, hidden_size, buf_mgr, world_config); AllReduceParams params; - for (int i = 0; i < world_size; ++i) - { - params.peer_comm_buffer_ptrs[i] = buffers.mIpcMemoryHandles[0].getCommPtrs()[i]; - } - for (int i = 0; i < world_size; ++i) - { - params.peer_barrier_ptrs_in[i] = reinterpret_cast(buffers.mIpcMemoryHandles[2].getCommPtrs()[i]); - } - for (int i = 0; i < world_size; ++i) - { - params.peer_barrier_ptrs_out[i] = reinterpret_cast(buffers.mIpcMemoryHandles[3].getCommPtrs()[i]); - } + workspace.set_params(params); params.barrier_flag = 0; params.ranks_per_node = world_size; params.local_rank = rank; @@ -242,11 +315,18 @@ bool test(int token_num, int hidden_size, bool has_bias, bool has_affine, int wa params.fusion_params.eps = eps; params.fusion_params.intermediate_buffer = inter.data(); - cudaStream_t s; - cudaStreamCreate(&s); + cudaStream_t s = workspace.get_stream(); cudaEvent_t begin, end; cudaEventCreate(&begin); cudaEventCreate(&end); + lamportInitialize( + params.fusion_params.lamport_peer_comm_buffer_ptrs[rank], message_size, nvinfer1::DataType::kHALF, s); + lamportInitialize(params.fusion_params.lamport_peer_comm_buffer_ptrs[rank + MAX_RANKS_PER_NODE], message_size, + nvinfer1::DataType::kHALF, s); + lamportInitialize(params.fusion_params.lamport_peer_comm_buffer_ptrs[rank + MAX_RANKS_PER_NODE * 2], message_size, + nvinfer1::DataType::kHALF, s); + cudaDeviceSynchronize(); + comm.barrier(); for (int i = 0; i < warmup; ++i) { params.barrier_flag += 1; @@ -307,7 +387,7 @@ bool test(int token_num, int hidden_size, bool has_bias, bool has_affine, int wa { printf("\033[31mFAILED\033[0m\n"); } - cudaStreamDestroy(s); + comm.barrier(); return pass; } @@ -315,6 +395,7 @@ TEST(Kernel, AllReduce) { auto& comm = mpi::MpiComm::world(); auto world_size = comm.getSize(); + auto rank = comm.getRank(); if (world_size % 2) return; @@ -331,6 +412,8 @@ TEST(Kernel, AllReduce) }; // clang-format on bool pass = true; + int max_token_num = 1000, max_hidden_size = 8192; + Workspace workspace(world_size, rank, max_token_num, max_hidden_size); for (auto config : configs) { for (auto op : ops) @@ -340,23 +423,23 @@ TEST(Kernel, AllReduce) for (auto has_affine : {false, true}) { pass = pass - && test( - 1, 4096, has_bias, has_affine, warmup, iter, AllReduceStrategyType::ONESHOT, config, op); + && test(workspace, 1, 4096, has_bias, has_affine, warmup, iter, AllReduceStrategyType::ONESHOT, + config, op); pass = pass - && test( - 1, 8192, has_bias, has_affine, warmup, iter, AllReduceStrategyType::ONESHOT, config, op); + && test(workspace, 1, 8192, has_bias, has_affine, warmup, iter, AllReduceStrategyType::ONESHOT, + config, op); pass = pass - && test( - 10, 4096, has_bias, has_affine, warmup, iter, AllReduceStrategyType::ONESHOT, config, op); + && test(workspace, 10, 4096, has_bias, has_affine, warmup, iter, AllReduceStrategyType::ONESHOT, + config, op); pass = pass - && test( - 10, 8192, has_bias, has_affine, warmup, iter, AllReduceStrategyType::ONESHOT, config, op); + && test(workspace, 10, 8192, has_bias, has_affine, warmup, iter, AllReduceStrategyType::ONESHOT, + config, op); pass = pass - && test( - 1000, 4096, has_bias, has_affine, warmup, iter, AllReduceStrategyType::TWOSHOT, 
config, op); + && test(workspace, 1000, 4096, has_bias, has_affine, warmup, iter, + AllReduceStrategyType::TWOSHOT, config, op); pass = pass - && test( - 1000, 8192, has_bias, has_affine, warmup, iter, AllReduceStrategyType::TWOSHOT, config, op); + && test(workspace, 1000, 8192, has_bias, has_affine, warmup, iter, + AllReduceStrategyType::TWOSHOT, config, op); } } } @@ -368,28 +451,22 @@ TEST(Kernel, AllReduceOneShot) { auto& comm = mpi::MpiComm::world(); auto world_size = comm.getSize(); + auto rank = comm.getRank(); if (world_size % 2) return; int warmup = 100, iter = 100; - std::vector candidate_bs{1, 2, 4, 8, 16, 32, 64, 128}; - std::vector candidate_hidden{4096, 8192, 12288, 16384}; + std::vector candidate_bs{1, 2, 4, 8, 16}; + std::vector candidate_hidden{1024, 2048, 4096, 8192}; bool pass = true; + int max_token_num = 16, max_hidden_size = 8192; + Workspace workspace(world_size, rank, max_token_num, max_hidden_size); for (auto bs : candidate_bs) { for (auto hidden : candidate_hidden) { pass = pass - && test(bs, hidden, false, true, warmup, iter, AllReduceStrategyType::ONESHOT, - AllReduceStrategyConfig(0), AllReduceFusionOp::RESIDUAL_RMS_NORM); - pass = pass - && test(bs, hidden, true, true, warmup, iter, AllReduceStrategyType::ONESHOT, - AllReduceStrategyConfig(0), AllReduceFusionOp::RESIDUAL_RMS_NORM); - pass = pass - && test(bs, hidden, false, false, warmup, iter, AllReduceStrategyType::ONESHOT, - AllReduceStrategyConfig(0), AllReduceFusionOp::RESIDUAL_RMS_NORM); - pass = pass - && test(bs, hidden, true, false, warmup, iter, AllReduceStrategyType::ONESHOT, + && test(workspace, bs, hidden, false, true, warmup, iter, AllReduceStrategyType::ONESHOT, AllReduceStrategyConfig(0), AllReduceFusionOp::RESIDUAL_RMS_NORM); } } diff --git a/cpp/tests/kernels/decodingKernelTest.cpp b/cpp/tests/kernels/decodingKernelTest.cpp index 0860326ba..9b9a868b4 100644 --- a/cpp/tests/kernels/decodingKernelTest.cpp +++ b/cpp/tests/kernels/decodingKernelTest.cpp @@ -286,6 +286,438 @@ TEST_F(TestBeamHypothesesCopy, SingleBatchTest) checkAllEqual(); } +/** + * @brief Fills a slice of a tensor with data from a source array. + * + * This function writes to `tensor` from source array `src` at index `idx. + * It optionally flattens the tensor before performing the insertion. + * For example tensor if we wanted to write 5 values in the 3rd row of [1,10,100] + * We will use (tensor, 2, 5, src, true, mBufferManager) where src is a buffer with at least 5 elems. + * + * @tparam T The type of elements in the source array. + * @param tensor A shared pointer to the tensor to be modified. Also need to be of type T. + * @param idx The index at which to start inserting data into the tensor. + * @param insertLen The number of elements to insert from the source array into the tensor. + * @param src An array containing the data to be inserted into the tensor. + * @param flattenFirst A boolean flag indicating whether to flatten the first dimension of the tensor before insertion. + * @param bufferManager A shared pointer to a BufferManager responsible for managing memory operations. 
+ */ +template +void fillTensorAtIndex(ITensor::SharedPtr tensor, SizeType32 idx, std::vector src, bool flattenFirst, + std::shared_ptr bufferManager) +{ + SizeType32 insertLen = src.size(); + ITensor::SharedPtr target = ITensor::view(tensor); + if (flattenFirst) + { + target->squeeze(0); + } + + target = ITensor::slice(target, idx, 1); + target->squeeze(0); + target = ITensor::slice(target, 0, insertLen); + bufferManager->copy(src.data(), *target); +} + +class TestGatherTree : public ::testing::Test +{ +public: + SizeType32 batchSize{1}; + SizeType32 beamWidth{5}; + SizeType32 maxSeqLen{20}; + + using TensorPtr = ITensor::SharedPtr; + + using DecodingOutputPtr = std::unique_ptr; + DecodingOutputPtr decodingOutput{nullptr}; + + SamplingConfig samplingConfig = SamplingConfig(); + + std::shared_ptr mStream{nullptr}; + std::shared_ptr mBufferManager{nullptr}; + + SamplingConfig mSamplingConfig; + + using DecodingInputPtr = std::unique_ptr; + DecodingInputPtr decodingInput{nullptr}; + + TensorPtr targetOut{nullptr}; + + void SetUp() override + { + mStream = std::make_shared(); + mBufferManager = std::make_shared(mStream); + } + + // create the empty buffers with the correct shapes and zero them + void createBuffers() + { + auto constexpr nvTokenIdType = TRTDataType::value; + auto constexpr nvSizeType = TRTDataType::value; + auto constexpr nvFloatType = TRTDataType::value; + + auto const maxBatchSizeShape = ITensor::makeShape({batchSize}); + auto const maxBatchSizeXmaxBeamWidth = ITensor::makeShape({batchSize, beamWidth}); + auto const jointOutputIdsShape = ITensor::makeShape({batchSize, beamWidth, maxSeqLen}); + + { // prevent reusing these vars after std::move + auto dummyLogits = mBufferManager->emptyTensor(MemoryType::kGPU, nvFloatType); + auto endIds = mBufferManager->emptyTensor(MemoryType::kGPU, nvTokenIdType); + auto batchSlots = mBufferManager->emptyTensor(MemoryType::kPINNED, nvSizeType); + decodingInput = std::make_unique( + 0, 0, 0, 0, std::move(dummyLogits), std::move(endIds), std::move(batchSlots)); + } + auto& dInput = *decodingInput; + + dInput.maxLength = maxSeqLen; + + const_cast(*dInput.endIds).reshape(maxBatchSizeShape); + const_cast(*dInput.batchSlots).reshape(maxBatchSizeShape); + const_cast(*dInput.endIds).reshape(maxBatchSizeShape); + const_cast(*dInput.batchSlots).reshape(maxBatchSizeShape); + auto& inputLengths = const_cast(*dInput.lengths); + dInput.lengths = mBufferManager->gpu(maxBatchSizeXmaxBeamWidth, nvSizeType); + mBufferManager->setZero(const_cast(*dInput.lengths)); + + { // prevent reusing these vars after std::move + + auto ids = mBufferManager->gpu(jointOutputIdsShape, nvTokenIdType); + mBufferManager->setZero(*ids); + auto gatheredIds = mBufferManager->gpu(jointOutputIdsShape, nvTokenIdType); + mBufferManager->setZero(*gatheredIds); + + decodingOutput = std::make_unique(std::move(ids), std::move(gatheredIds)); + } + auto& dOutput = *decodingOutput; + + dOutput.logProbs = mBufferManager->gpu(jointOutputIdsShape, nvFloatType); + mBufferManager->setZero(*dOutput.logProbs); + dOutput.logProbsTiled = mBufferManager->gpu(ITensor::makeShape({maxSeqLen, batchSize, beamWidth}), nvFloatType); + mBufferManager->setZero(*dOutput.logProbsTiled); + dOutput.lengths = mBufferManager->gpu(ITensor::makeShape({batchSize, beamWidth}), nvSizeType); + mBufferManager->setZero(*dOutput.lengths); + dOutput.cumLogProbs = mBufferManager->gpu(maxBatchSizeXmaxBeamWidth, nvFloatType); + mBufferManager->setZero(*dOutput.cumLogProbs); + + dOutput.beamHypotheses.empty(*mBufferManager); + 
dOutput.beamHypotheses.reshape(batchSize, beamWidth, maxSeqLen); + + dOutput.finishReasons + = mBufferManager->gpu(maxBatchSizeXmaxBeamWidth, TRTDataType::value); + mBufferManager->setZero(*dOutput.finishReasons); + dOutput.parentIds = mBufferManager->gpu(jointOutputIdsShape, nvTokenIdType); + mBufferManager->setZero(*dOutput.parentIds); + + targetOut = mBufferManager->gpu(jointOutputIdsShape, nvTokenIdType); + mBufferManager->setZero(*targetOut); + } + + // clang-format off + + // hardcode the input data for the output_len = 10 case + // this should not cause any beam swapping from the CBAs, just reorder the beams + void hardcodeBuffersLen10() + { + auto constexpr nvTokenIdType = TRTDataType::value; + auto constexpr nvSizeType = TRTDataType::value; + auto constexpr nvFloatType = TRTDataType::value; + + std::vector len = {3, 3, 3, 3, 3}; + TensorPtr inputLengths{ITensor::slice(constPointerCast(decodingInput->lengths), 0, 1)}; + mBufferManager->copy(len.data(),*inputLengths); + + std::vector> logProbs = + { + {-2.96689, -1.63675, -2.31329, -0.0377979, -2.2442, -1.57552, -0.310524, -0.696636, -2.41985}, + {-2.96689, -1.63675, -2.31329, -0.0377979, -1.31451, -2.63339, -0.534199, -0.493615, -2.61479}, + {-2.96689, -1.63675, -2.31329, -0.0377979, -2.2442, -1.57552, -0.310524, -3.11851, -1.01671}, + {-2.96689, -1.63675, -2.31329, -0.0377979, -1.31451, -2.63339, -0.534199, 0, 0}, + {-2.96689, -1.63675, -2.31329, -0.0377979, -2.2442, -1.57552, -0.310524, -0.696636, -3.62298} + }; + for (SizeType32 it = 0; it < logProbs.size(); it++){ + fillTensorAtIndex(decodingOutput->logProbs, it, logProbs[it], true, mBufferManager); + } + + std::vector> logProbsTiled = + { + {-2.70907, -2.96689, -3.27157, -3.37314, -3.50595}, + {-1.84733, -1.8942, -1.63675, -1.9567, -1.47513}, + {-0.305059, -0.765237, -2.31329, -2.37162, -2.48475}, + {-1.97517, -0.0377979, -2.0169, -2.42439, -2.27471}, + {-1.31451, -2.2442, -1.5831, -2.44732, -2.02409}, + {-1.57552, -2.63339, -2.11286, -2.57304, -3.85214}, + {-0.310524, -0.534199, -0.74379, -2.86232, -1.72914}, + {-0.696636, -0.493615, -0.237725, -3.07164, -3.11851}, + {-2.41985, -2.61479, -1.01671, -3.62298, -1.26586}, + {-0.844337, -0.922832, -0.427682, -0.419985, -1.85996} + }; + TensorPtr logProbsTiledView = ITensor::view(decodingOutput->logProbsTiled,ITensor::makeShape({maxSeqLen*batchSize, beamWidth})); + for (SizeType32 it = 0; it < logProbsTiled.size(); it++){ + auto logProbsSlice = ITensor::slice(logProbsTiledView, it+3,1); + mBufferManager->copy(logProbsTiled[it].data(),*logProbsSlice); + } + + std::vector outputLenghts = {13, 13, 13, 13, 13}; + mBufferManager->copy(outputLenghts.data(),*decodingOutput->lengths); + + std::vector cumLogProbs = {-15.0458, -15.4681, -15.8323, -15.8424, -16.0614}; + mBufferManager->copy(cumLogProbs.data(),*decodingOutput->cumLogProbs); + + std::vector> outputIdsCBA = + { + {1, 864, 304, 367, 263, 760, 310, 278, 3815, 29973}, + {1, 864, 304, 367, 263, 760, 310, 1749, 3815, 29973} + }; + for(SizeType32 it = 0; it < outputIdsCBA.size(); it++) + { + fillTensorAtIndex(decodingOutput->beamHypotheses.outputIdsCBA, it, outputIdsCBA[it], true, mBufferManager); + } + + std::vector> logProbsCBA = + { + {0, 0, 0, -2.96689, -1.63675, -2.31329, -0.0377979, -1.31451, -2.63339, -0.534199, -2.19674}, + {0, 0, 0, -2.96689, -1.63675, -2.31329, -0.0377979, -2.2442, -1.57552, -0.310524, -2.81382,} + }; + for(SizeType32 it = 0; it < logProbsCBA.size(); it++) + { + fillTensorAtIndex(decodingOutput->beamHypotheses.logProbsCBA, it, logProbsCBA[it], true, 
mBufferManager); + } + + std::vector sequenceLengthsCBA = {10, 10, 0, 0, 0, 0, 0, 0, 0, 0}; + mBufferManager->copy(sequenceLengthsCBA.data(), *decodingOutput->beamHypotheses.sequenceLengthsCBA); + + std::vector cumLogProbsCBA = {-13.6336, -13.8988, 0, 0, 0, 0, 0, 0, 0, 0}; + mBufferManager->copy(cumLogProbsCBA.data(), *decodingOutput->beamHypotheses.cumLogProbsCBA); + + std::vector normedScoresCBA = {-1.7042, -1.73735, 0, 0, 0, 0, 0, 0, 0, 0}; + mBufferManager->copy(normedScoresCBA.data(), *decodingOutput->beamHypotheses.normedScoresCBA); + + std::vector numBeamsCBA = {2}; + mBufferManager->copy(numBeamsCBA.data(), *decodingOutput->beamHypotheses.numBeamsCBA); + + std::vector minNormedScoresCBA = {-1.73735}; + mBufferManager->copy(minNormedScoresCBA.data(), *decodingOutput->beamHypotheses.minNormedScoresCBA); + + std::vector batchDones = {0}; + mBufferManager->copy(batchDones.data(), *decodingOutput->beamHypotheses.batchDones); + + std::vector finishReasons = {4, 4, 4, 4, 4}; + mBufferManager->copy(finishReasons.data(), *decodingOutput->finishReasons); + + std::vector> ids = + { + {1, 864, 304, 1073, 825, 1048, 278, 278, 3815, 29973, 13, 4806, 526}, + {1, 864, 304, 367, 920, 304, 310, 1749, 3815, 29973, 13, 4806, 526}, + {1, 864, 304, 679, 263, 760, 679, 263, 29973, 13, 310, 526, 502}, + {1, 864, 304, 1207, 901, 278, 1749, 445, 3889, 393, 591, 13443, 276}, + {1, 864, 304, 1074, 263, 29973, 1207, 263, 2446, 12623, 1334, 29915, 30010} + }; + for(SizeType32 it = 0; it < ids.size(); it++) + { + fillTensorAtIndex(decodingOutput->ids, it, ids[it], true, mBufferManager); + } + + std::vector> parentIds = + { + {0, 0, 0, 0, 0, 3, 0, 1, 1, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 1, 2, 1, 0, 1, 1, 1, 1}, + {0, 0, 0, 0, 1, 2, 1, 4, 3, 2, 4, 4, 3}, + {0, 0, 0, 0, 0, 0, 0, 1, 4, 1, 0, 0, 4}, + {0, 0, 0, 0, 3, 3, 1, 2, 0, 4, 0, 3, 0} + }; + for(SizeType32 it = 0; it < parentIds.size(); it++) + { + fillTensorAtIndex(decodingOutput->parentIds, it, parentIds[it], true, mBufferManager); + } + + std::vector> targetOutput = + { + {1, 864, 304, 367, 263, 760, 310, 1749, 3815, 29973, 13, 4806, 526}, + {1, 864, 304, 367, 263, 760, 310, 278, 3815, 29973, 13, 4806, 526}, + {1, 864, 304, 367, 263, 760, 310, 1749, 3815, 29973, 13, 13443, 502}, + {1, 864, 304, 367, 263, 760, 310, 1749, 3815, 29973, 591, 29915, 276}, + {1, 864, 304, 367, 263, 760, 310, 1749, 3815, 29973, 13, 4806, 30010} + }; + for(SizeType32 it = 0; it < targetOutput.size(); it++) + { + fillTensorAtIndex(targetOut, it, targetOutput[it], true, mBufferManager); + } + } + + // this case has the output_len = 8, and tests that the beams from the CBAs are correctly swapped. 
+ void hardcodeBuffersLen8() + { + auto constexpr nvTokenIdType = TRTDataType::value; + auto constexpr nvSizeType = TRTDataType::value; + auto constexpr nvFloatType = TRTDataType::value; + + std::vector len = {3, 3, 3, 3, 3}; + TensorPtr inputLengths{ITensor::slice(constPointerCast(decodingInput->lengths), 0, 1)}; + mBufferManager->copy(len.data(),*inputLengths); + + std::vector >logProbs = + { + {-2.96689, -1.63675, -2.31329, -0.0377979, -2.2442, -1.57552, -0.310524}, + {-2.96689, -1.63675, -2.31329, -0.0377979, -1.31451, -2.63339, -0.534199}, + {-2.96689, -1.63675, -2.31329, -0.0377979, -2.44732, -2.11286, -0.74379}, + {-2.96689, -1.63675, -2.31329, -0.0377979, -1.31451, -2.63339, -2.86232}, + {-2.96689, -1.63675, -2.31329, -0.0377979, -1.31451, -3.85214, -1.72914} + }; + for (SizeType32 it = 0; it < logProbs.size(); it++){ + fillTensorAtIndex(decodingOutput->logProbs, it, logProbs[it], true, mBufferManager); + } + + std::vector> logProbsTiled = + { + {-2.70907, -2.96689, -3.27157, -3.37314, -3.50595}, + {-1.84733, -1.8942, -1.63675, -1.9567, -1.47513}, + {-0.305059, -0.765237, -2.31329, -2.37162, -2.48475}, + {-1.97517, -0.0377979, -2.0169, -2.42439, -2.27471}, + {-1.31451, -2.2442, -1.5831, -2.44732, -2.02409}, + {-1.57552, -2.63339, -2.11286, -2.57304, -3.85214}, + {-0.310524, -0.534199, -0.74379, -2.86232, -1.72914}, + {-0.696636, -0.493615, -0.237725, -3.07164, -3.11851} + }; + TensorPtr logProbsTiledView = ITensor::view(decodingOutput->logProbsTiled,ITensor::makeShape({maxSeqLen*batchSize, beamWidth})); + for (SizeType32 it = 0; it < logProbsTiled.size(); it++){ + auto logProbsSlice = ITensor::slice(logProbsTiledView, it+3,1); + mBufferManager->copy(logProbsTiled[it].data(),*logProbsSlice); + } + std::vector outputLenghts = {11, 11, 11, 11, 11}; + mBufferManager->copy(outputLenghts.data(),*decodingOutput->lengths); + + std::vector cumLogProbs = {-11.7816, -11.9304, -14.0883, -14.1566, -14.2035}; + mBufferManager->copy(cumLogProbs.data(),*decodingOutput->cumLogProbs); + + std::vector> outputIdsCBA = + { + {1, 864, 304, 367, 263, 760, 310, 278, 3815, 29973}, + {1, 864, 304, 367, 263, 760, 310, 1749, 3815, 29973} + }; + for(SizeType32 it = 0; it < outputIdsCBA.size(); it++) + { + fillTensorAtIndex(decodingOutput->beamHypotheses.outputIdsCBA, it, outputIdsCBA[it], true, mBufferManager); + } + + std::vector> logProbsCBA = + { + {0, 0, 0, -2.96689, -1.63675, -2.31329, -0.0377979, -1.31451, -2.63339, -0.534199, -2.19674}, + {0, 0, 0, -2.96689, -1.63675, -2.31329, -0.0377979, -2.2442, -1.57552, -0.310524, -2.81382,} + }; + for(SizeType32 it = 0; it < logProbsCBA.size(); it++) + { + fillTensorAtIndex(decodingOutput->beamHypotheses.logProbsCBA, it, logProbsCBA[it], true, mBufferManager); + } + + std::vector sequenceLengthsCBA = {10, 10, 0, 0, 0, 0, 0, 0, 0, 0}; + mBufferManager->copy(sequenceLengthsCBA.data(), *decodingOutput->beamHypotheses.sequenceLengthsCBA); + + std::vector cumLogProbsCBA = {-13.6336, -13.8988, 0, 0, 0, 0, 0, 0, 0, 0}; + mBufferManager->copy(cumLogProbsCBA.data(), *decodingOutput->beamHypotheses.cumLogProbsCBA); + + std::vector normedScoresCBA = {-1.7042, -1.73735, 0, 0, 0, 0, 0, 0, 0, 0}; + mBufferManager->copy(normedScoresCBA.data(), *decodingOutput->beamHypotheses.normedScoresCBA); + + std::vector numBeamsCBA = {2}; + mBufferManager->copy(numBeamsCBA.data(), *decodingOutput->beamHypotheses.numBeamsCBA); + + std::vector minNormedScoresCBA = {-1.73735}; + mBufferManager->copy(minNormedScoresCBA.data(), *decodingOutput->beamHypotheses.minNormedScoresCBA); + + std::vector 
batchDones = {0}; + mBufferManager->copy(batchDones.data(), *decodingOutput->beamHypotheses.batchDones); + + std::vector finishReasons = {4, 4, 4, 4, 4}; + mBufferManager->copy(finishReasons.data(), *decodingOutput->finishReasons); + + std::vector> ids = + { + {1, 864, 304, 1073, 825, 1048, 278, 278, 3815, 29973, 13}, + {1, 864, 304, 367, 920, 304, 310, 1749, 3815, 29973, 13}, + {1, 864, 304, 679, 263, 760, 679, 263, 29973, 13, 310}, + {1, 864, 304, 1207, 901, 278, 1749, 445, 3889, 393, 591}, + {1, 864, 304, 1074, 263, 29973, 1207, 263, 2446, 12623, 1334} + }; + for(SizeType32 it = 0; it < ids.size(); it++) + { + fillTensorAtIndex(decodingOutput->ids, it, ids[it], true, mBufferManager); + } + + std::vector> parentIds = + { + {0, 0, 0, 0, 0, 3, 0, 1, 1, 0, 0}, + {0, 0, 0, 0, 0, 1, 2, 1, 0, 1, 1}, + {0, 0, 0, 0, 1, 2, 1, 4, 3, 2, 4}, + {0, 0, 0, 0, 0, 0, 0, 1, 4, 1, 0}, + {0, 0, 0, 0, 3, 3, 1, 2, 0, 4, 0} + }; + for(SizeType32 it = 0; it < parentIds.size(); it++) + { + fillTensorAtIndex(decodingOutput->parentIds, it, parentIds[it], true, mBufferManager); + } + + std::vector> targetOutput = + { + {1, 864, 304, 367, 263, 760, 310, 1749, 3815, 29973, 13}, + {1, 864, 304, 367, 263, 760, 310, 278, 3815, 29973, 13}, + {1, 864, 304, 367, 263, 760, 310, 278, 3815, 29973, 0}, + {1, 864, 304, 367, 263, 760, 310, 1749, 3815, 29973, 0}, + {1, 864, 304, 367, 263, 760, 310, 278, 2446, 12623, 310} + }; + for(SizeType32 it = 0; it < targetOutput.size(); it++) + { + fillTensorAtIndex(targetOut, it, targetOutput[it], true, mBufferManager); + } + } + + // clang-format on + + bool checkResult() + { + + TensorPtr reference = this->mBufferManager->copyFrom((*targetOut), tensorrt_llm::runtime::MemoryType::kCPU); + auto referencePtr = bufferCast(*reference); + + TensorPtr real + = this->mBufferManager->copyFrom((*decodingOutput->gatheredIds), tensorrt_llm::runtime::MemoryType::kCPU); + auto realPtr = bufferCast(*real); + + bool allEqual = true; + for (SizeType32 iAssert = 0; iAssert < batchSize * beamWidth * maxSeqLen; iAssert++) + { + if (referencePtr[iAssert] != realPtr[iAssert]) + { + TLLM_LOG_ERROR("Mismatch input value. 
Position of inputs: %d, expected value: %d, output value: %d", + iAssert, referencePtr[iAssert], realPtr[iAssert]); + allEqual = false; + } + } + return allEqual; + } +}; + +TEST_F(TestGatherTree, GatherTreeNoSwap) +{ + createBuffers(); + hardcodeBuffersLen10(); + cudaDeviceSynchronize(); + kernels::gatherTree(*decodingOutput, *decodingInput, *mBufferManager, mSamplingConfig); + cudaDeviceSynchronize(); + + EXPECT_TRUE(checkResult()); +} + +TEST_F(TestGatherTree, GatherTreeWithSwap) +{ + createBuffers(); + hardcodeBuffersLen8(); + cudaDeviceSynchronize(); + kernels::gatherTree(*decodingOutput, *decodingInput, *mBufferManager, mSamplingConfig); + cudaDeviceSynchronize(); + + EXPECT_TRUE(checkResult()); +} + enum AcceptKernelMode { BY_IDS, @@ -872,24 +1304,24 @@ class DecodingKernelsTest : public testing::Test void callAcceptByIds() { - tksp::invokeAcceptDraftTokensByIds(bufferCast(*mDraftTokens), - bufferCast(*mTargetTokens), bufferCast(*mContextLengths), - bufferCast(*mNumsDraftTokens), bufferCast(*mSequenceLengths), - reinterpret_cast(bufferCast(*mFinishedSteps)), - reinterpret_cast(bufferCast(*mFinishedFinal)), - bufferCast(*mFinishedSum), bufferCast(*mBatchSlots), mBatchSize, mMaxBatchSize, - mBeamWidth, mMaxSeqLen, mMaxDraftTokens, mStream->get()); + // tksp::invokeAcceptDraftTokensByIds(bufferCast(*mDraftTokens), + // bufferCast(*mTargetTokens), bufferCast(*mContextLengths), + // bufferCast(*mNumsDraftTokens), bufferCast(*mSequenceLengths), + // reinterpret_cast(bufferCast(*mFinishedSteps)), + // reinterpret_cast(bufferCast(*mFinishedFinal)), + // bufferCast(*mFinishedSum), bufferCast(*mBatchSlots), mBatchSize, mMaxBatchSize, + // mBeamWidth, mMaxSeqLen, mMaxDraftTokens, mStream->get()); } void callAcceptByLogits() { - tksp::acceptDraftTokensByLogits(bufferCast(*mDraftLogits), - reinterpret_cast(bufferCast(*mTargetLogitsPtrs)), bufferCast(*mDraftProbs), - bufferCast(*mTargetProbs), bufferCast(*mNumsDraftTokens), - reinterpret_cast(bufferCast(*mFinishedSteps)), - reinterpret_cast(bufferCast(*mCurandStates)), bufferCast(*mBatchSlots), - mBatchSize, mMaxBatchSize, mBeamWidth, mVocabSize, mVocabSize, mMaxDraftTokens, false, 0.9f, - mStream->get()); + // tksp::acceptDraftTokensByLogits(bufferCast(*mDraftLogits), + // reinterpret_cast(bufferCast(*mTargetLogitsPtrs)), bufferCast(*mDraftProbs), + // bufferCast(*mTargetProbs), bufferCast(*mNumsDraftTokens), + // reinterpret_cast(bufferCast(*mFinishedSteps)), + // reinterpret_cast(bufferCast(*mCurandStates)), + // bufferCast(*mBatchSlots), mBatchSize, mMaxBatchSize, mBeamWidth, mVocabSize, mVocabSize, + // mMaxDraftTokens, false, 0.9f, mStream->get()); } void callAcceptByIdsWithPaths() @@ -1165,7 +1597,7 @@ typedef testing::Types FloatAndHalfTypes; TYPED_TEST_SUITE(DecodingKernelsTest, FloatAndHalfTypes); -TYPED_TEST(DecodingKernelsTest, acceptDraftTokensByIdsKernelSmall) +TYPED_TEST(DecodingKernelsTest, DISABLED_acceptDraftTokensByIdsKernelSmall) { this->runTest(DecodingKernelTestParam() .setBatchSize(1) @@ -1176,7 +1608,7 @@ TYPED_TEST(DecodingKernelsTest, acceptDraftTokensByIdsKernelSmall) .setAcceptMode(AcceptKernelMode::BY_IDS)); } -TYPED_TEST(DecodingKernelsTest, acceptDraftTokensByIdsKernelLarge) +TYPED_TEST(DecodingKernelsTest, DISABLED_acceptDraftTokensByIdsKernelLarge) { this->runTest(DecodingKernelTestParam() .setBatchSize(128) @@ -1187,7 +1619,7 @@ TYPED_TEST(DecodingKernelsTest, acceptDraftTokensByIdsKernelLarge) .setAcceptMode(AcceptKernelMode::BY_IDS)); } -TYPED_TEST(DecodingKernelsTest, acceptDraftTokensByLogitsKernelSmall) 
+TYPED_TEST(DecodingKernelsTest, DISABLED_acceptDraftTokensByLogitsKernelSmall) { this->runTest(DecodingKernelTestParam() .setBatchSize(1) @@ -1198,7 +1630,7 @@ TYPED_TEST(DecodingKernelsTest, acceptDraftTokensByLogitsKernelSmall) .setAcceptMode(AcceptKernelMode::BY_LOGITS)); } -TYPED_TEST(DecodingKernelsTest, acceptDraftTokensByLogitsKernelLarge) +TYPED_TEST(DecodingKernelsTest, DISABLED_acceptDraftTokensByLogitsKernelLarge) { this->runTest(DecodingKernelTestParam() .setBatchSize(64) diff --git a/cpp/tests/kernels/sampling/samplingAirTopPTest.cpp b/cpp/tests/kernels/sampling/samplingAirTopPTest.cpp index 399595583..402eea153 100644 --- a/cpp/tests/kernels/sampling/samplingAirTopPTest.cpp +++ b/cpp/tests/kernels/sampling/samplingAirTopPTest.cpp @@ -62,7 +62,7 @@ class AirTopPSamplingKernelTest : public SamplingKernelTest tk::TopPSamplingKernelParams kernelParams; kernelParams.probs = bufferCast(*this->mProbsDevice); - kernelParams.outputIds = bufferCast(*this->mIdsPtrHost); + kernelParams.outputIdsPtrs = bufferCast(*this->mIdsPtrHost); kernelParams.workspace = workspaceDevice->data(); kernelParams.topPs = bufferCast(*this->mTopPsDevice); kernelParams.sequenceLength = bufferCast(*this->mSeqLengthsDevice); diff --git a/cpp/tests/kernels/sampling/samplingTopPTest.cpp b/cpp/tests/kernels/sampling/samplingTopPTest.cpp index c09133312..047644319 100644 --- a/cpp/tests/kernels/sampling/samplingTopPTest.cpp +++ b/cpp/tests/kernels/sampling/samplingTopPTest.cpp @@ -53,7 +53,7 @@ class TopPSamplingKernelTest : public SamplingKernelTest tk::TopPSamplingKernelParams kernelParams; kernelParams.probs = bufferCast(*this->mProbsDevice); - kernelParams.outputIds = bufferCast(*this->mIdsPtrHost); + kernelParams.outputIdsPtrs = bufferCast(*this->mIdsPtrHost); kernelParams.workspace = workspaceDevice->data(); kernelParams.topPs = bufferCast(*this->mTopPsDevice); kernelParams.sequenceLength = bufferCast(*this->mSeqLengthsDevice); diff --git a/cpp/tests/resources/data/test_model_lora_config.json b/cpp/tests/resources/data/test_model_lora_config.json index 73a598d01..ea6442186 100644 --- a/cpp/tests/resources/data/test_model_lora_config.json +++ b/cpp/tests/resources/data/test_model_lora_config.json @@ -63,7 +63,6 @@ "gather_context_logits": false, "gather_generation_logits": false, "strongly_typed": true, - "builder_opt": null, "profiling_verbosity": "layer_names_only", "enable_debug_output": false, "max_draft_len": 0, diff --git a/cpp/tests/resources/scripts/build_chatglm_engines.py b/cpp/tests/resources/scripts/build_chatglm_engines.py index e845a0365..530db1d8f 100644 --- a/cpp/tests/resources/scripts/build_chatglm_engines.py +++ b/cpp/tests/resources/scripts/build_chatglm_engines.py @@ -59,7 +59,6 @@ def build_engine(ckpt_dir: str, "--max_seq_len=384", "--gpt_attention_plugin=float16", "--gemm_plugin=float16", - "--builder_opt=0", ] if is_ifb: build_cmd.extend([ diff --git a/cpp/tests/resources/scripts/build_gpt_engines.py b/cpp/tests/resources/scripts/build_gpt_engines.py index 104879be4..7cbc8c382 100755 --- a/cpp/tests/resources/scripts/build_gpt_engines.py +++ b/cpp/tests/resources/scripts/build_gpt_engines.py @@ -63,7 +63,6 @@ def build_engine( f'--max_input_len={max_input_len}', f'--max_seq_len={max_seq_len}', '--max_beam_width=2', - '--builder_opt=0', '--kv_cache_type=continuous', ] legacy_args = [ diff --git a/cpp/tests/resources/scripts/case_report_wrapper.py b/cpp/tests/resources/scripts/case_report_wrapper.py deleted file mode 100755 index b5c99ef13..000000000 --- 
a/cpp/tests/resources/scripts/case_report_wrapper.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import subprocess -import sys -import time - -if __name__ == '__main__': - case = '' - for arg in sys.argv[1:]: - if '--gtest_filter=' in arg: - case = arg.removeprefix('--gtest_filter=') - - gtest = subprocess.Popen(sys.argv[1:]) - - if case: - import multiprocessing.connection - - with multiprocessing.connection.Client("/tmp/profiling_scribe.unix", - "AF_UNIX") as client: - client.send({ - "type": "gtest_case", - "timestamp": time.time(), - "case": case, - "pid": gtest.pid - }) - - gtest.wait() - exit(gtest.returncode) diff --git a/cpp/tests/resources/scripts/generate_expected_gpt_output.py b/cpp/tests/resources/scripts/generate_expected_gpt_output.py index 4037a236f..69607af7c 100755 --- a/cpp/tests/resources/scripts/generate_expected_gpt_output.py +++ b/cpp/tests/resources/scripts/generate_expected_gpt_output.py @@ -151,7 +151,7 @@ def generate_outputs(num_beams): output_logits=True, output_log_probs=True, output_cum_log_probs=True) - # GptExecutorTest.GenerationLogitsEarlyStop requires to use context_fmha_fp32_acc flag in runtime + # GptExecutorTest.GenerationLogitsEarlyStop and several tests require to use context_fmha_fp32_acc flag in runtime model_spec_obj.enable_context_fmha_fp32_acc() generate_output(engine=model_spec_obj.get_model_path(), num_beams=num_beams, @@ -165,6 +165,14 @@ def generate_outputs(num_beams): model_spec_obj.use_gpt_plugin() model_spec_obj.set_kv_cache_type(_tb.KVCacheType.PAGED) model_spec_obj.use_packed_input() + generate_output(engine=model_spec_obj.get_model_path(), + num_beams=num_beams, + input_name=input_name, + model_spec_obj=model_spec_obj, + output_logits=False, + output_log_probs=True, + output_cum_log_probs=True) + model_spec_obj.enable_context_fmha_fp32_acc() generate_output(engine=model_spec_obj.get_model_path(), num_beams=num_beams, input_name=input_name, diff --git a/cpp/tests/resources/scripts/test_cpp.py b/cpp/tests/resources/scripts/test_cpp.py index 7e40a4cd4..0082b02d1 100755 --- a/cpp/tests/resources/scripts/test_cpp.py +++ b/cpp/tests/resources/scripts/test_cpp.py @@ -16,16 +16,13 @@ import argparse as _arg import copy -import functools import glob import logging as _log import os as _os import pathlib as _pl import platform -import signal import subprocess as _sp import sys as _sys -import time as _time import typing as _tp build_script_dir = _pl.Path( @@ -559,31 +556,6 @@ def build_tests(build_dir: _pl.Path): run_command(make_google_tests, cwd=build_dir, timeout=300) -def with_memory_monitor(func): - if not _os.environ.get('LLM_MEMORY_PROFILING', False): - return func - - @functools.wraps(func) - def wrapper(*args, **kwargs): - memory_collector = _sp.Popen([ - "/usr/bin/python3", - find_root_dir() / - 
"tests/llm-test-defs/turtle/defs/memory_collector.py", - "-p", - str(_os.getpid()), - "-i", - "0.2", - ]) - try: - func(*args, **kwargs) - finally: - memory_collector.send_signal(signal.SIGINT) - memory_collector.wait() - - return wrapper - - -@with_memory_monitor def run_unit_tests(build_dir: _pl.Path, timeout=1800): build_tests(build_dir=build_dir) @@ -607,7 +579,6 @@ def run_unit_tests(build_dir: _pl.Path, timeout=1800): parallel_run_ctest(ctest, cwd=build_dir, env=cpp_env, timeout=timeout) -@with_memory_monitor def run_single_gpu_tests(build_dir: _pl.Path, run_gpt, run_gptj, @@ -671,7 +642,7 @@ def run_single_gpu_tests(build_dir: _pl.Path, nranks=2, local_commands=[ "tests/executor/executorTest", - "--gtest_filter=*GptSingleDeviceDisaggExecutorTest*" + "--gtest_filter=*GptSingleDeviceDisaggSymmetricExecutorTest*" ], leader_commands=[f"--gtest_output=xml:{xml_output_file}"]) run_command(trt_model_test, cwd=build_dir, env=cpp_env, timeout=timeout) @@ -686,7 +657,6 @@ def produce_mpirun_command(*, global_commands, nranks, local_commands, return l[:-1] -@with_memory_monitor def run_multi_gpu_tests(build_dir: _pl.Path, timeout=1500): build_tests(build_dir=build_dir) @@ -793,7 +763,8 @@ def run_multi_gpu_tests(build_dir: _pl.Path, timeout=1500): global_commands=["mpirun", "--allow-run-as-root"], nranks=2, local_commands=[ - "executor/executorTest", "--gtest_filter=*DisaggExecutorTest*" + "executor/executorTest", + "--gtest_filter=*DisaggSymmetricExecutorTest*" ], leader_commands=[f"--gtest_output=xml:{xml_output_file}"]) run_command(trt_model_test, cwd=tests_dir, env=new_env, timeout=1500) @@ -805,7 +776,8 @@ def run_multi_gpu_tests(build_dir: _pl.Path, timeout=1500): global_commands=["mpirun", "--allow-run-as-root"], nranks=4, local_commands=[ - "executor/executorTest", "--gtest_filter=*DisaggExecutorTest*" + "executor/executorTest", + "--gtest_filter=*DisaggSymmetricExecutorTest*" ], leader_commands=[f"--gtest_output=xml:{xml_output_file}"]) run_command(trt_model_test, cwd=tests_dir, env=new_env, timeout=1500) @@ -818,7 +790,33 @@ def run_multi_gpu_tests(build_dir: _pl.Path, timeout=1500): nranks=8, local_commands=[ "executor/executorTest", - "--gtest_filter=*LlamaTP2PP2DisaggExecutorTest*" + "--gtest_filter=*LlamaTP2PP2DisaggSymmetricExecutorTest*" + ], + leader_commands=[f"--gtest_output=xml:{xml_output_file}"]) + run_command(trt_model_test, cwd=tests_dir, env=new_env, timeout=1500) + + new_env = copy.copy(cpp_env) + new_env["RUN_LLAMA_MULTI_GPU"] = "true" + xml_output_file = build_dir / "results-multi-gpu-disagg-asymmetric-executor-4-process.xml" + trt_model_test = produce_mpirun_command( + global_commands=["mpirun", "--allow-run-as-root"], + nranks=4, + local_commands=[ + "executor/executorTest", + "--gtest_filter=*DisaggAsymmetricExecutorTest*" + ], + leader_commands=[f"--gtest_output=xml:{xml_output_file}"]) + run_command(trt_model_test, cwd=tests_dir, env=new_env, timeout=1500) + + new_env = copy.copy(cpp_env) + new_env["RUN_LLAMA_MULTI_GPU"] = "true" + xml_output_file = build_dir / "results-multi-gpu-disagg-asymmetric-executor-6-process.xml" + trt_model_test = produce_mpirun_command( + global_commands=["mpirun", "--allow-run-as-root"], + nranks=6, + local_commands=[ + "executor/executorTest", + "--gtest_filter=*DisaggAsymmetricExecutorTest*" ], leader_commands=[f"--gtest_output=xml:{xml_output_file}"]) run_command(trt_model_test, cwd=tests_dir, env=new_env, timeout=1500) @@ -1121,24 +1119,4 @@ def run_benchmarks(model_name: str, python_exe: str, root_dir: _pl.Path, del 
test_args.run_all_models - do_memory_profiling = _os.environ.get('LLM_MEMORY_PROFILING', False) - if do_memory_profiling: - unix_socket = "/tmp/profiling_scribe.unix" - - scribe = _sp.Popen([ - "/usr/bin/python3", - find_root_dir() / - "tests/llm-test-defs/turtle/defs/profiling_scribe.py", "-l", - unix_socket - ]) - - while not _os.path.exists(unix_socket): - _time.sleep(0.1) - - try: - run_tests(**vars(test_args)) - finally: - if do_memory_profiling: - scribe.send_signal(signal.SIGINT) - scribe.wait(timeout=10) - scribe.kill() + run_tests(**vars(test_args)) diff --git a/cpp/tests/runtime/gptDecoderBatchedTest.cpp b/cpp/tests/runtime/gptDecoderBatchedTest.cpp index 7ea3a00ab..42a2fecd0 100644 --- a/cpp/tests/runtime/gptDecoderBatchedTest.cpp +++ b/cpp/tests/runtime/gptDecoderBatchedTest.cpp @@ -489,7 +489,7 @@ void testDecoderDraft(nvinfer1::DataType const dtype, std::vector /etc/ld.so.conf.d/tensorrt_llm.conf && \ ldconfig +# Test LD configuration +RUN ! ( ldd -v bin/executorWorker | grep tensorrt_llm | grep -q "not found" ) + ARG SRC_DIR=/src/tensorrt_llm COPY --from=wheel ${SRC_DIR}/benchmarks benchmarks ARG CPP_BUILD_DIR=${SRC_DIR}/cpp/build diff --git a/docs/source/advanced/gpt-runtime.md b/docs/source/advanced/gpt-runtime.md index 60a881e02..2e8ce590c 100644 --- a/docs/source/advanced/gpt-runtime.md +++ b/docs/source/advanced/gpt-runtime.md @@ -133,14 +133,14 @@ value for a given parameter, the vector can be limited to a single element ***General*** -| Name in TRT-LLM | Description | Data type | Range of value | Default value | Name in HF | -| :-----------------: | :----------------------------------------------------------: | :-----------: | :----------------------------------------------------------: | :---------------------------------------------------: | :--------------------: | -| `temperature` | modulation of logits in sampling workflow | List\[Float\] | \[0.0f, $+\infty$\) | `1.0f` (no modulation) | `temperature` | -| `minLength` | lower-bound on the number of tokens generated | List\[Int\] | \[0, $+\infty$\) | `0` (no effect (the first generated token can be EOS) | `min_length` | -| `repetitionPenalty` | penalize repetitive tokens
multiplicative, irrespective of appearances count | List\[Float\] | \[0.0f, $+\infty$\)
`< 1.0f` encourages repetition
`> 1.0f` discourages it | `1.0f` (no effect) | `repetition_penalty` | -| `presencePenalty` | penalize existed tokens
additive, irrespective of appearances count | List\[Float\] | \($-\infty$, $+\infty$\)
`< 0.0f` encourages repetition
`> 0.0f` discourages it | `0.0f` (no effect) | no | -| `frequencyPenalty` | penalize existed tokens
additive, dependent on appearances count | List\[Float\] | \($-\infty$, $+\infty$\)
`< 0.0f` encourages repetition
`> 0.0f` discourages it | `0.0f` (no effect) | no | -| `noRepeatNgramSize` | | List\[Int\] | \[0, $+\infty$\)
`> 0` all ngrams of that size can only occur once | `0` (no effect) | `no_repeat_ngram_size` | +| Name in TRT-LLM | Description | Data type | Range of value | Default value | Name in HF | +| :-----------------: | :-------------------------------------------------------------------------------: | :-----------: | :---------------------------------------------------------------------------------------: | :---------------------------------------------------: | :--------------------: | +| `temperature` | modulation of logits in sampling workflow | List\[Float\] | \[0.0f, $+\infty$\) | `1.0f` (no modulation) | `temperature` | +| `minLength` | lower-bound on the number of tokens generated | List\[Int\] | \[0, $+\infty$\) | `0` (no effect (the first generated token can be EOS) | `min_length` | +| `repetitionPenalty` | penalize repetitive tokens
multiplicative, irrespective of appearances count | List\[Float\] | \[0.0f, $+\infty$\)
`< 1.0f` encourages repetition
`> 1.0f` discourages it | `1.0f` (no effect) | `repetition_penalty` | +| `presencePenalty` | penalize existed tokens
additive, irrespective of appearances count | List\[Float\] | \($-\infty$, $+\infty$\)
`< 0.0f` encourages repetition
`> 0.0f` discourages it | `0.0f` (no effect) | no | +| `frequencyPenalty` | penalize existed tokens
additive, dependent on appearances count | List\[Float\] | \($-\infty$, $+\infty$\)
`< 0.0f` encourages repetition
`> 0.0f` discourages it | `0.0f` (no effect) | no | +| `noRepeatNgramSize` | | List\[Int\] | \[0, $+\infty$\)
`> 0` all ngrams of that size can only occur once | `0` (no effect) | `no_repeat_ngram_size` | * The tokens of input prompt are included during adopting `repetitionPenalty`, `presencePenalty`, and `frequencyPenalty` onto logits. @@ -158,9 +158,9 @@ value for a given parameter, the vector can be limited to a single element | `topPResetIds` | the decay in the `topP` algorithm | List\[Int\] | \[-1, $+\infty$\) | `-1` (no effect) | no | * If setting `topK = 0` and `topP = 0.0f`, greedy search is performed. - * If setting `topK > 0` and `topP = 0.0f`, `topK` tokens of highest probilities will become the candidates of sampling (named `TopK sampling` in TRT-LLM). - * If setting `topK = 0` and `topP > 0.0f`, tokens will be sorted with probility descendly, then the tokens with highest probilities which the accumulated probility larger than `topP` will become the candidates of sampling (named `TopP sampling` in TRT-LLM). - * If setting `topK > 0` and `topP > 0.0f`, `topK` tokens of highest probilities will be selected, then those selected tokens will be sorted with probility descendly and their probility will be normalized, then the tokens with highest normalized probilities which the accumulated probility larger than `topP` will become the candidates of sampling (named `TopKTopP sampling` in TRT-LLM) + * If setting `topK > 0` and `topP = 0.0f`, `topK` tokens of highest probabilities will become the candidates of sampling (named `TopK sampling` in TRT-LLM). + * If setting `topK = 0` and `topP > 0.0f`, tokens will be sorted with probability descendly, then the tokens with highest probabilities which the accumulated probability larger than `topP` will become the candidates of sampling (named `TopP sampling` in TRT-LLM). + * If setting `topK > 0` and `topP > 0.0f`, `topK` tokens of highest probabilities will be selected, then those selected tokens will be sorted with probability descendly and their probability will be normalized, then the tokens with highest normalized probabilities which the accumulated probability larger than `topP` will become the candidates of sampling (named `TopKTopP sampling` in TRT-LLM) * If different `topK` values are provided for the different sequences in the batch, the performance of the implementation will depend on the largest value. For efficiency reasons, we recommend to batch requests with similar `topK` values together. diff --git a/docs/source/advanced/speculative-decoding.md b/docs/source/advanced/speculative-decoding.md index 2b8db72c8..f59600b86 100644 --- a/docs/source/advanced/speculative-decoding.md +++ b/docs/source/advanced/speculative-decoding.md @@ -32,11 +32,11 @@ may prove simpler than generating a summary for an article. Furthermore, when integrating Medusa with a standard PyTorch model implementation which may not be as finely tuned as TensorRT-LLM, the potential time savings are more pronounced. -## Draft Model Approach +## Draft-Target-Model Approach -The Draft model approach involves the use of two distinct models trained independently -but sharing the same vocabulary: a smaller Draft model and a larger Target model. -For example, a GPT 125M model can serve as the Draft model, while a GPT 6.7B model acts as the Target model. +The Draft-Target-Model involves the use of two distinct models trained independently but sharing the same vocabulary: a smaller Draft model and a larger Target model. For example, GPT 125M / 6.7B models can serve as the Draft / Target model. + +There are two styles of using Draft-Target-Model in TensorRT-LLM now. 
The first one is using TensorRT-LLM-BLS in Triton, for which more information and detailed steps can be found in this document. The second one is using it directly in TensorRT-LLM, the steps for which can be found in [examples/draft_target_model/README.md](../../../examples/draft_target_model/README.md) and the code can be found in [examples/run.py](../../../examples/run.py). The management of Draft and Target models is facilitated through two separate `GptManager` instances. It is essential that you coordinate the interactions between the Draft and Target models effectively. @@ -341,7 +341,7 @@ Each request can be assigned a specific lookahead configuration when input to th ## Build and execute an engine from a model -Vicuna models re-use Llmama Python scripts located in [examples/llama](../../examples/llama). +Vicuna models reuse Llama Python scripts located in [examples/llama](../../examples/llama). ### Convert a model to checkpoint ```bash diff --git a/docs/source/architecture/workflow.md b/docs/source/architecture/workflow.md index 1d366dc5f..6d02e406b 100644 --- a/docs/source/architecture/workflow.md +++ b/docs/source/architecture/workflow.md @@ -10,7 +10,7 @@ The build workflow contains two major steps. To generalize the TensorRT-LLM optimization features to all models, and to share the same workflow between different models for TensorRT-LLM users, TensorRT-LLM has conventions about how the models shall be defined and how the models shall be imported. -TensorRT-LLM checkpoint convention is documented in [checkpoint doc](/docs/source/architecture/checkpoint.md), and all decoder-only models had been migrated to adopt the convention. Model-specific convert_checkpoint.py scripts are shipped as source code in example directories, and a trtllm-build CLI tool had been added. However, there are some disadvantages of providing convert checkpoint scripts outside the core TensorRT-LLM lib as example: +TensorRT-LLM checkpoint convention is documented in [](checkpoint.md) and all decoder-only models have been migrated to adopt the convention. Model-specific convert_checkpoint.py scripts are shipped as source code in example directories, and a trtllm-build CLI tool has been added. However, there are some disadvantages of providing convert checkpoint scripts outside the core TensorRT-LLM lib as examples: 1. TensorRT-LLM evolves so quickly that the model's definition code might have changed for better performance; which means the `convert_checkpoint.py` is out of date. @@ -47,7 +47,9 @@ class LLaMAForCausalLM (DecoderModelForCausalLM): ``` -Then, in the [convert_checkpoint.py](../../../../examples/llama/convert_checkpoint.py) script, the logic can be greatly simplified. Even if the model definition code of TensorRT-LLM LLaMA class is changed due to some reason, the `from_hugging_face` API will keep the same, thus the existing workflow using this interface will not be affected. +Then, in the convert_checkpoint.py script in the +[`examples/llama/`](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llama/) directory of the GitHub repo, +the logic can be greatly simplified. Even if the model definition code of the TensorRT-LLM LLaMA class is changed for some reason, the `from_hugging_face` API will stay the same, thus the existing workflow using this interface will not be affected. ```python @@ -65,7 +67,9 @@ Since LLaMA models were also released with different formats, such as the Meta c In the 0.9 release, only LLaMA is refactored. 
Since popular LLaMA (and its variants) models are released by Hugging Face and Meta checkpoint formats, only these two functions are implemented. -In future releases, there might be `from_jax`, `from_nemo`, `from_keras` or other factory methods for different training checkpoints added, for example the TensorRT-LLM [GEMMA](../../../../examples/gemma/README.md) model supports JAX/Keras formats in addition to huggingface. The model developers can choose to implement **any subset** of these factory methods for the models they contributed to TensorRT-LLM. +In future releases, `from_jax`, `from_nemo`, `from_keras`, or other factory methods for different training checkpoints might be added. +For example, the Gemma 2B model and the convert_checkpoint.py file in the [`examples/gemma`](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/gemma/) +directory support JAX and Keras formats in addition to Hugging Face. The model developers can choose to implement **any subset** of these factory methods for the models they contributed to TensorRT-LLM. For some formats which are not supported by TensorRT-LLM model developers, you still have the freedom to implement your own weights conversion outside the core lib; the flow will look like this: @@ -96,7 +100,9 @@ TensorRT-LLM relies on NVIDIA Modelopt toolkit to support some of the quantizati In TensorRT-LLM 0.8 version: -* For Modelopt-supported quantization algorithms, a standalone script in the example folder [quantize.py](../../../../examples/quantization/quantize.py) shall be executed to export TensorRT-LLM checkpoints, and the trtllm-build command needs to be executed to build the checkpoints to engines. +* For Modelopt-supported quantization algorithms, a standalone script, + [example/quantization/quantize.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/quantize.py), + can export TensorRT-LLM checkpoints, and the trtllm-build command needs to be executed to build the checkpoints to engines. * For the non-Modelopt quantization algorithms, users need to use the per-model convert_checkpoint.py scripts to export TensorRT-LLM checkpoints. @@ -116,8 +122,6 @@ class PretrainedModel: # and save the checkpoint to output_dir ``` -```{note} - * The default implementation only handles the Modelopt supported quantization. The LLaMA class then inherits this `PretrainedModel` and dispatches the Modelopt quantization to the super class's default implementation. * The model developer raises errors in the sub-class implementation if the new model is not supported by Modelopt yet. @@ -145,7 +149,7 @@ class LLaMAForCausalLM: The `quantize` API is designed to take multi-GPU resources internally to make quantization. For example, a LLaMA 70B BF16 takes 140G memory, if we make FP8 quantization, then, another 70G is needed. So, we need at least 210G, 4 * A100(H100) is needed to quantize the LLaMA 70B model. If you want to call `quantize` API inside a MPI program, be cautious and ensure the quantize API is only called by rank 0. -Usage of the `quantize` API in an MPI program looks like this, only rank 0 calls it. In an non-MPI program, and `if rank == 0` and the `mpi_barrier()` is not needed. +Usage of the `quantize` API in an MPI program looks like this; only rank 0 calls it. In a non-MPI program, the `if rank == 0` and the `mpi_barrier()` are not needed. 
```python quant_config = QuantConfig() @@ -179,7 +183,7 @@ engine.save(engine_dir) ``` -The Llama object can be created by any method mentioned in the [conversion APIs](#conversion-apis) and the [quantization APIs](#quantization-apis) section. +The Llama object can be created by any method mentioned in the [](#conversion-apis) or [](#quantization-apis) sections. The `trtllm-build` CLI tool is a thin wrapper around this `tensorrt_llm.build` API. The flags of the CLI tool are kept close to the fields of the `BuildConfig` class. @@ -216,8 +220,7 @@ All the weights conversion, quantization, and build APIs mentioned above have co * A unified quantization script is inside the `examples/quantization/quantize.py` and can be shared by all **supported** models. * A `trtllm-build` CLI tool builds all models from TensorRT-LLM checkpoint. - -```{note} +Refer to the following considerations for the CLI tools: * These scripts and tools should be used for scripting. Do not import the Python functions/class defined in these tools. TensorRT-LLM does not promise the content of these scripts can be compatible with previous versions. The options of these tools may also be changed when it’s not avoidable. diff --git a/docs/source/blogs/quantization-in-TRT-LLM.md b/docs/source/blogs/quantization-in-TRT-LLM.md index bf0cfb1bc..73f18b8f4 100644 --- a/docs/source/blogs/quantization-in-TRT-LLM.md +++ b/docs/source/blogs/quantization-in-TRT-LLM.md @@ -12,31 +12,31 @@ TensorRT-LLM offers a best-in-class unified quantization toolkit to significantl ### Performance In the following benchmark, we highlight the acceleration of a few popular models at a small batch size without imposing latency constraints. It's important to note that in scenarios where there's a latency constraint in your application, TRT-LLM can achieve an even greater performance improvement. Using LLaMA-v2-7B as an example, when the first token latency is constrained to be under 500ms, quantization with FP8 and a batch size of 16 achieves a notable **2.3x inference speedup** compared to FP16 on a H100. -| Model | Batch Size | Speedup (FP8 v.s. FP16) | Speedup (INT8 SQ v.s. FP16) | -|-------------|:----------:|:------------------------:|:---------------------------:| -| GPT-J | 1 | 1.40x | 1.40x | -| GPT-J | 8 | 1.44x | 1.30x | -| LLaMA-v2-7B | 1 | 1.51x | 1.47x | -| LLaMA-v2-7B | 8 | 1.40x | 1.32x | +| Model | Batch Size | Speedup (FP8 v.s. FP16) | Speedup (INT8 SQ v.s. FP16) | +| ----------- | :--------: | :---------------------: | :-------------------------: | +| GPT-J | 1 | 1.40x | 1.40x | +| GPT-J | 8 | 1.44x | 1.30x | +| LLaMA-v2-7B | 1 | 1.51x | 1.47x | +| LLaMA-v2-7B | 8 | 1.40x | 1.32x | *The above benchmarks were run with Input Length=1024, Output Length=128, and TP=1 on H100 80GB. 
### Accuracy -| Model | Quantization Methods | MMLU Baseline (FP16) | MMLU Post-quantization | MMLU Loss | -|--------------|:--------------------:|:--------------------:|:----------------------:|:-------------:| -| Falcon-180B | FP8 | 70.4 | 70.3 | 0.14% | -| | INT8-SQ | 70.4 | 68.6 | 2.56% | -| | INT4-AWQ | 70.4 | 69.8 | 0.85% | -| Falcon-40B | FP8 | 56.1 | 55.6 | 0.89% | -| | INT8-SQ | 56.1 | 54.7 | 2.50% | -| | INT4-AWQ | 56.1 | 55.5 | 1.07% | -| LLaMA-v2-70B | FP8 | 69.1 | 68.5 | 0.87% | -| | INT8-SQ | 69.1 | 67.2 | 2.75% | -| | INT4-AWQ | 69.1 | 68.4 | 1.01% | -| MPT-30B | FP8 | 47.5 | 47.4 | 0.21% | -| | INT8-SQ | 47.5 | 46.8 | 1.47% | -| | INT4-AWQ | 47.5 | 46.5 | 2.11% | +| Model | Quantization Methods | MMLU Baseline (FP16) | MMLU Post-quantization | MMLU Loss | +| ------------ | :------------------: | :------------------: | :--------------------: | :-------: | +| Falcon-180B | FP8 | 70.4 | 70.3 | 0.14% | +| | INT8-SQ | 70.4 | 68.6 | 2.56% | +| | INT4-AWQ | 70.4 | 69.8 | 0.85% | +| Falcon-40B | FP8 | 56.1 | 55.6 | 0.89% | +| | INT8-SQ | 56.1 | 54.7 | 2.50% | +| | INT4-AWQ | 56.1 | 55.5 | 1.07% | +| LLaMA-v2-70B | FP8 | 69.1 | 68.5 | 0.87% | +| | INT8-SQ | 69.1 | 67.2 | 2.75% | +| | INT4-AWQ | 69.1 | 68.4 | 1.01% | +| MPT-30B | FP8 | 47.5 | 47.4 | 0.21% | +| | INT8-SQ | 47.5 | 46.8 | 1.47% | +| | INT4-AWQ | 47.5 | 46.5 | 2.11% | @@ -46,19 +46,19 @@ A quantization method comprises three primary components: 2. Activation precision format 3. Calibration algorithms -Typically, in the context of small-batch inference scenarios (batch size ≤ 4), the key consideration is memory bandwidth, making weight-only quantization methods the preferred choice. Conversely, for large-batch inference scenarios, such as serving scenarios (batch size ≥ 16), both memory bandwidth and computation density become crucial factors. Consequently, it's recommended to opt for a quantization method that has both weight and activation quantized. For batch size ≥ 16, the choice of quantization method can be model speicfic. We suggest to prioritize using FP8 first, as we typically see it offers the best performance and accuracy. If the results do not meet your specific use case, you can further experiment with Int8 SmoothQuant (Int8 SQ) followed by AWQ and/or GPTQ. +Typically, in the context of small-batch inference scenarios (batch size ≤ 4), the key consideration is memory bandwidth, making weight-only quantization methods the preferred choice. Conversely, for large-batch inference scenarios, such as serving scenarios (batch size ≥ 16), both memory bandwidth and computation density become crucial factors. Consequently, it's recommended to opt for a quantization method that has both weight and activation quantized. For batch size ≥ 16, the choice of quantization method can be model specific. We suggest to prioritize using FP8 first, as we typically see it offers the best performance and accuracy. If the results do not meet your specific use case, you can further experiment with Int8 SmoothQuant (Int8 SQ) followed by AWQ and/or GPTQ. Based on specific use cases, users might have different tolerances on accuracy impact and calibration time. The table below summarizes the tradeoffs* to consider when choosing a quantization method. You can also learn more about precision formats in our [documentation](https://nvidia.github.io/TensorRT-LLM/precision.html). 
-| Quantization Methods | Performance Improvement (batch size <= 4) | Performance Improvement (batch size >= 16) | Accuracy Impact | Calibration Time** | -| :--------------------------- | :--------: | :--------------: | :----------: | :--------------: | -| FP8 (W8A8) | Medium | Medium | Very Low | Minutes | -| Int8 SQ (W8A8) | Medium | Medium | Medium | Minutes | -| Int8 weight-only (W8A16) | Medium | Low | Low | Not Required | -| Int4 weight-only (W4A16) | High | Low | High | Not Required | -| Int4 AWQ (W4A16) | High | Low | Low | Tens of Minutes | -| Int4 GPTQ | High | Low | Low | Tens of Minutes | -| Int4-FP8 AWQ (W4A8) | High | Medium | Low | Tens of Minutes | +| Quantization Methods | Performance Improvement (batch size <= 4) | Performance Improvement (batch size >= 16) | Accuracy Impact | Calibration Time** | +| :----------------------- | :---------------------------------------: | :----------------------------------------: | :-------------: | :----------------: | +| FP8 (W8A8) | Medium | Medium | Very Low | Minutes | +| Int8 SQ (W8A8) | Medium | Medium | Medium | Minutes | +| Int8 weight-only (W8A16) | Medium | Low | Low | Not Required | +| Int4 weight-only (W4A16) | High | Low | High | Not Required | +| Int4 AWQ (W4A16) | High | Low | Low | Tens of Minutes | +| Int4 GPTQ | High | Low | Low | Tens of Minutes | +| Int4-FP8 AWQ (W4A8) | High | Medium | Low | Tens of Minutes | \* The performance and impact are measured on 10+ popular LLMs. We'll follow up with more data points. ** Calibration time is subject to the actual model size. diff --git a/examples/baichuan/requirements.txt b/examples/baichuan/requirements.txt index 75b8c7914..02425e2a7 100644 --- a/examples/baichuan/requirements.txt +++ b/examples/baichuan/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100100 +tensorrt_llm==0.14.0.dev2024100800 datasets~=2.15.0 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/bindings/executor/example_advanced.py b/examples/bindings/executor/example_advanced.py index c6b7c31ea..6cd1303ed 100644 --- a/examples/bindings/executor/example_advanced.py +++ b/examples/bindings/executor/example_advanced.py @@ -124,7 +124,7 @@ def write_output_tokens(output_tokens_csv_file: str, request_ids: list[int], default=False, action="store_true", help= - "Exclude input token when writing output toekns. Only has effect for streaming=False since in streaming mode, input tokens are never included in output." + "Exclude input token when writing output tokens. Only has effect for streaming=False since in streaming mode, input tokens are never included in output." 
) parser.add_argument("--max_tokens", type=int,
diff --git a/examples/bloom/requirements.txt b/examples/bloom/requirements.txt index bf948f69f..a3c6da3e0 100644
--- a/examples/bloom/requirements.txt
+++ b/examples/bloom/requirements.txt
@@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com
-tensorrt_llm==0.14.0.dev2024100100
+tensorrt_llm==0.14.0.dev2024100800
datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2
diff --git a/examples/chatglm/requirements.txt b/examples/chatglm/requirements.txt index eeb7788bc..606480701 100644
--- a/examples/chatglm/requirements.txt
+++ b/examples/chatglm/requirements.txt
@@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com
-tensorrt_llm==0.14.0.dev2024100100
+tensorrt_llm==0.14.0.dev2024100800
datasets~=2.14.5 evaluate~=0.4.1 protobuf
diff --git a/examples/dbrx/requirements.txt b/examples/dbrx/requirements.txt index 94f0ed2d2..d881f96af 100644
--- a/examples/dbrx/requirements.txt
+++ b/examples/dbrx/requirements.txt
@@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com
-tensorrt_llm==0.14.0.dev2024100100
+tensorrt_llm==0.14.0.dev2024100800
datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2
diff --git a/examples/draft_target_model/README.md b/examples/draft_target_model/README.md new file mode 100644 index 000000000..debf1cf3b
--- /dev/null
+++ b/examples/draft_target_model/README.md
@@ -0,0 +1,86 @@
+# Draft-Target-Model Speculative Decoding
+
+This document shows how to build and run a model using Draft-Target-Model speculative decoding (also known as `Speculative-Sampling`, [`Paper`](https://arxiv.org/abs/2302.01318)) in TensorRT-LLM on a single GPU, or on a single node with multiple GPUs.
+
+## Overview
+
+The Draft-Target-Model approach involves two distinct models trained independently but sharing the same vocabulary: a smaller Draft model and a larger Target model. For example, GPT 125M / 6.7B models can serve as the Draft / Target model.
+
+There are currently two ways to use Draft-Target-Model in TensorRT-LLM. The first is using TensorRT-LLM-BLS in Triton; more information and detailed steps can be found in the [speculative decoding documentation](../../docs/source/speculative_decoding.md). The second is using it directly in TensorRT-LLM; the steps are described in this document and the code can be found in [examples/run.py](../run.py).
+
+Draft-Target-Model has 4 additional hyperparameters that you need to specify to control the generation process:
+- `draft_len`: the number of tokens the draft model generates in one iteration; values from 4 to 10 are common. Empirically, a larger value yields a higher acceptance ratio but also higher overhead, so the right balance needs to be found for the models and application scenario.
+- `draft_model_device_list`: the index list of device(s) to run the draft model. Its length must match the TP size of the draft model engine. For instance, `draft_model_device_list=[1]` means using tp_size=1 and GPU 1 for the draft model, and `draft_model_device_list=[4,5,6,7]` means using tp_size=4 and GPUs 4 to 7 for the draft model.
+- `target_model_device_list`: the index list of device(s) to run the target model. Its length must match the TP size of the target model engine. For instance, `target_model_device_list=[0]` means using tp_size=1 and GPU 0 for the target model, and `target_model_device_list=[2,3]` means using tp_size=2 and GPUs 2 to 3 for the target model.
+- `use_logits`: selects one of the two methods for accepting tokens proposed by the draft model.
When `use_logits=True`, draft tokens are accepted based on the ratio of the draft and target model logits (the modified rejection sampling method from the original paper); when `use_logits=False`, draft tokens are accepted by per-token comparison with the target predictions, regardless of the logits.
+
+## Support Matrix
+ * GPU Compute Capability >= 8.0 (Ampere or newer)
+ * FP16 / BF16 / FP8 (both draft and target model)
+ * Paged KV Cache
+ * Tensor Parallel
+
+## Usage
+
+### Build draft and target engines
+
++ We use the open-source `llama-v2-7B/13B` models as the draft and target models in this example.
++ `--use_paged_context_fmha=enable` must be specified since KV cache reuse is needed for the draft / target model.
++ `--gather_generation_logits` is optional. In the original paper, tokens are accepted by comparing the logits of the draft and target models, so this parameter is needed. For simplicity, tokens can instead be accepted by comparing the output tokens directly, in which case this parameter can be skipped.
++ `--speculative_decoding_mode=draft_tokens_external` and `--max_draft_len` must be specified for the target model.
+
+```bash
+cd examples/llama
+
+python3 convert_checkpoint.py \
+ --model_dir= \
+ --output_dir=./ckpt-draft \
+ --dtype=float16
+
+python3 convert_checkpoint.py \
+ --model_dir= \
+ --output_dir=./ckpt-target \
+ --dtype=float16
+
+trtllm-build \
+ --checkpoint_dir ./ckpt-draft \
+ --output_dir=./draft-engine \
+ --gemm_plugin=float16 \
+ --use_paged_context_fmha=enable \
+ --gather_generation_logits \
+ --max_batch_size=4 \
+ --max_input_len=3200 \
+ --max_seq_len=4800
+
+trtllm-build \
+ --checkpoint_dir=./ckpt-target \
+ --output_dir=./target-engine \
+ --gemm_plugin=float16 \
+ --use_paged_context_fmha=enable \
+ --gather_generation_logits \
+ --speculative_decoding_mode=draft_tokens_external \
+ --max_draft_len=10 \
+ --max_batch_size=4 \
+ --max_input_len=3200 \
+ --max_seq_len=4800
+```
+
+### Run decoding
+
++ `--draft_engine_dir` and `--engine_dir` must be specified for the draft and target engines.
++ `--draft_target_model_config` is the corresponding configuration for Draft-Target-Model; its definition can be found in [util.py](../util.py).
+ + As an example, `[4,[0],[1],False]` means `draft_len=4`, the draft model runs on `GPU0`, the target model runs on `GPU1`, and tokens rather than logits are used for acceptance.
++ Only the C++ session (using the executor as the low-level API) is supported; the Python session (`--use_py_session`) is not.
+
+```bash
+cd examples/llama
+
+python3 ../run.py \
+ --tokenizer_dir gpt2-medium \
+ --draft_engine_dir ./draft-engine \
+ --engine_dir ./target-engine \
+ --draft_target_model_config="[4,[0],[1],True]" \
+ --kv_cache_free_gpu_memory_fraction=0.4 \
+ --max_output_len=256 \
+ --input_text="How does Draft-Sampling work?"
+``` diff --git a/examples/draft_target_model/requirements.txt b/examples/draft_target_model/requirements.txt new file mode 100644 index 000000000..5ac8a0ffb --- /dev/null +++ b/examples/draft_target_model/requirements.txt @@ -0,0 +1,6 @@ +--extra-index-url https://pypi.nvidia.com +tensorrt_llm==0.14.0.dev2024100800 +datasets~=2.14.5 +rouge_score~=0.1.2 +sentencepiece~=0.1.99 +evaluate~=0.4.1 diff --git a/examples/falcon/requirements.txt b/examples/falcon/requirements.txt index 815dca5b8..cd85a4ef2 100644 --- a/examples/falcon/requirements.txt +++ b/examples/falcon/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100100 +tensorrt_llm==0.14.0.dev2024100800 transformers>=4.31.0 datasets~=2.14.5 evaluate~=0.4.1 diff --git a/examples/gemma/requirements.txt b/examples/gemma/requirements.txt index 0a5ffa77b..dc52c6f39 100644 --- a/examples/gemma/requirements.txt +++ b/examples/gemma/requirements.txt @@ -3,7 +3,7 @@ # WAR the new posting of "nvidia-cudnn-cu12~=9.0". # "jax[cuda12_pip]~=0.4.19" specifies "nvidia-cudnn-cu12>=8.9" but actually requires "nvidia-cudnn-cu12~=8.9". nvidia-cudnn-cu12~=8.9; platform_machine == "x86_64" -tensorrt_llm==0.14.0.dev2024100100 +tensorrt_llm==0.14.0.dev2024100800 flax~=0.8.0 # jax[cuda12_pip]~=0.4.19; platform_system != "Windows" jax~=0.4.19; platform_system == "Windows" diff --git a/examples/gpt/README.md b/examples/gpt/README.md index 2c9699096..7b0b65507 100644 --- a/examples/gpt/README.md +++ b/examples/gpt/README.md @@ -425,8 +425,7 @@ Then, use `trtllm-build` to build engine(s). ```bash trtllm-build --checkpoint_dir starcoder2/trt_ckpt/int8-sq/ \ - --output_dir starcoder2/trt_engine/int8-sq/ \ - --builder_opt 4 + --output_dir starcoder2/trt_engine/int8-sq/ ``` diff --git a/examples/gpt/requirements.txt b/examples/gpt/requirements.txt index c179f7ffc..b1baef82c 100644 --- a/examples/gpt/requirements.txt +++ b/examples/gpt/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100100 +tensorrt_llm==0.14.0.dev2024100800 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/gptj/requirements.txt b/examples/gptj/requirements.txt index 73cd7c4df..ee73d169c 100644 --- a/examples/gptj/requirements.txt +++ b/examples/gptj/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100100 +tensorrt_llm==0.14.0.dev2024100800 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/gptneox/requirements.txt b/examples/gptneox/requirements.txt index 15bcb91c5..593f3125c 100644 --- a/examples/gptneox/requirements.txt +++ b/examples/gptneox/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100100 +tensorrt_llm==0.14.0.dev2024100800 datasets~=2.14.5 rouge_score~=0.1.2 evaluate~=0.4.1 diff --git a/examples/grok/requirements.txt b/examples/grok/requirements.txt index 3019f01b3..4fa525358 100644 --- a/examples/grok/requirements.txt +++ b/examples/grok/requirements.txt @@ -1,6 +1,6 @@ -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100100 +tensorrt_llm==0.14.0.dev2024100800 datasets==2.14.6 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/internlm/requirements.txt b/examples/internlm/requirements.txt index 3078974dd..640b43eac 100644 --- a/examples/internlm/requirements.txt +++ b/examples/internlm/requirements.txt @@ -1,5 +1,5 @@ 
--extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100100 +tensorrt_llm==0.14.0.dev2024100800 datasets==2.14.5 rouge_score~=0.1.2 sentencepiece~=0.1.99 diff --git a/examples/jais/requirements.txt b/examples/jais/requirements.txt index c179f7ffc..b1baef82c 100644 --- a/examples/jais/requirements.txt +++ b/examples/jais/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100100 +tensorrt_llm==0.14.0.dev2024100800 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/llama/convert_checkpoint.py b/examples/llama/convert_checkpoint.py index 17034568f..6443f8513 100644 --- a/examples/llama/convert_checkpoint.py +++ b/examples/llama/convert_checkpoint.py @@ -91,6 +91,19 @@ def parse_arguments(): help= "The huggingface dataset name or the local directory of the dataset for calibration." ) + parser.add_argument( + "--calib_size", + type=int, + default=512, + help= + "Number of samples for calibration. Set to -1 to use the whole dataset.", + ) + parser.add_argument( + "--calib_max_seq_length", + type=int, + default=512, + help="Max Sequence length for calibration", + ) parser.add_argument( "--smoothquant", "-sq", @@ -408,6 +421,8 @@ def convert_and_save_hf(args): quant_config=quant_config, device='cpu' if args.load_model_on_cpu else 'cuda', calib_dataset=args.calib_dataset, + calib_batches=args.calib_size, + calib_max_seq_length=args.calib_max_seq_length, **override_fields) else: # When not loading by shard, preload one complete model and then slice per rank weights from this diff --git a/examples/llama/requirements.txt b/examples/llama/requirements.txt index 0658842e8..44c2d95c5 100644 --- a/examples/llama/requirements.txt +++ b/examples/llama/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100100 +tensorrt_llm==0.14.0.dev2024100800 datasets==2.14.6 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/llm-api/requirements.txt b/examples/llm-api/requirements.txt index 6d9b7b3e5..7ac907aa0 100644 --- a/examples/llm-api/requirements.txt +++ b/examples/llm-api/requirements.txt @@ -1,2 +1,2 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100100 +tensorrt_llm==0.14.0.dev2024100800 diff --git a/examples/mamba/requirements.txt b/examples/mamba/requirements.txt index 08e64bb07..7adcda04f 100644 --- a/examples/mamba/requirements.txt +++ b/examples/mamba/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100100 +tensorrt_llm==0.14.0.dev2024100800 transformers>=4.39.0 datasets~=2.14.5 evaluate diff --git a/examples/medusa/requirements.txt b/examples/medusa/requirements.txt index 82689ec1c..5ac8a0ffb 100644 --- a/examples/medusa/requirements.txt +++ b/examples/medusa/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100100 +tensorrt_llm==0.14.0.dev2024100800 datasets~=2.14.5 rouge_score~=0.1.2 sentencepiece~=0.1.99 diff --git a/examples/mixtral/README.md b/examples/mixtral/README.md index 20f8aa644..2aef5bd51 100644 --- a/examples/mixtral/README.md +++ b/examples/mixtral/README.md @@ -94,7 +94,7 @@ In TP+EP mode, both strategies are used simultaneously. This means each GPU hand You can enable Expert Parallel or hybrid parallel by setting `--moe_tp_size` and `--moe_ep_size` when calling `convert_coneckpoint.py`. 
If only `--moe_tp_size` is provided, TRT-LLM will use Tensor Parallel for the MoE model; if only `--moe_ep_size` is provided, TRT-LLM will use Expert Parallel; if both are provided, the hybrid parallel will be used.
-Be sure that the product of `moe_tp_size` and `moe_ep_size` should equal to `tp_size`, since the total number of MoE paralleism across all GPUs must match the total number of parallelism in other parts of the model.
+Be sure that the product of `moe_tp_size` and `moe_ep_size` equals `tp_size`, since the total MoE parallelism across all GPUs must match the parallelism in other parts of the model.
```bash # Build Mixtral8x7B with Expert Parallelism
diff --git a/examples/mixtral/requirements.txt b/examples/mixtral/requirements.txt index 9edade2c6..7c4024262 100644
--- a/examples/mixtral/requirements.txt
+++ b/examples/mixtral/requirements.txt
@@ -1,4 +1,4 @@ --extra-index-url https://pypi.nvidia.com
-tensorrt_llm==0.14.0.dev2024100100
+tensorrt_llm==0.14.0.dev2024100800
transformers==4.38.2 accelerate==0.25.0
diff --git a/examples/model_api/README.md b/examples/model_api/README.md index 065a81847..258801153 100644
--- a/examples/model_api/README.md
+++ b/examples/model_api/README.md
@@ -42,7 +42,7 @@ python ./llama_quantize.py --hf_model_dir --cache_dir ./llama.aw
## AutoModelForCausalLM
-The API `tensorrt_llm.AutoModelForCausalLM` can read from a Hugging Face model directory, find the correct TRT-LLM model class and dispatch the `from_hugging_face` mothod to the correct TRT-LLM class.
+The API `tensorrt_llm.AutoModelForCausalLM` can read from a Hugging Face model directory, find the correct TRT-LLM model class and dispatch the `from_hugging_face` method to the correct TRT-LLM class.
The following code snippets demonstrated the usage of the `AutoModelForCausalLM` class.
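Since the README hunk above describes `AutoModelForCausalLM` only in prose, the following is a minimal sketch of the flow it refers to, assuming the `from_hugging_face` dispatch described there and the `BuildConfig` / `tensorrt_llm.build` / `engine.save` API used elsewhere in this patch; the model and engine paths and the build sizes are hypothetical.

```python
# Minimal sketch: AutoModelForCausalLM picks the matching TRT-LLM model class
# from a Hugging Face checkpoint; paths and build sizes are hypothetical.
import tensorrt_llm

hf_model_dir = "path/to/hf/model"   # hypothetical Hugging Face model directory
engine_dir = "path/to/engine"       # hypothetical engine output directory

# Dispatches from_hugging_face() to the matching class (e.g. LLaMAForCausalLM).
model = tensorrt_llm.AutoModelForCausalLM.from_hugging_face(hf_model_dir)

build_config = tensorrt_llm.BuildConfig(
    max_input_len=256, max_seq_len=512, max_batch_size=8)
build_config.plugin_config.gemm_plugin = "auto"

engine = tensorrt_llm.build(model, build_config)
engine.save(engine_dir)
```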
diff --git a/examples/model_api/llama.py b/examples/model_api/llama.py index c699ee192..5ca11b98a 100644 --- a/examples/model_api/llama.py +++ b/examples/model_api/llama.py @@ -43,8 +43,6 @@ def main(): build_config = BuildConfig(max_input_len=256, max_seq_len=276, max_batch_size=1) - # just for fast build, not best for production - build_config.builder_opt = 0 build_config.plugin_config.gemm_plugin = 'auto' if args.clean_build or not args.engine_dir.exists(): diff --git a/examples/model_api/llama_multi_gpu.py b/examples/model_api/llama_multi_gpu.py index 388ad3a7f..fe35cbe9a 100644 --- a/examples/model_api/llama_multi_gpu.py +++ b/examples/model_api/llama_multi_gpu.py @@ -28,7 +28,6 @@ def build_and_run_llama(hf_model_dir, engine_dir, tp_size, rank): build_config = BuildConfig(max_input_len=256, max_seq_len=512, max_batch_size=8) - build_config.builder_opt = 0 # fast build for demo, pls avoid using this in production, since inference might be slower build_config.plugin_config.gemm_plugin = 'auto' # for fast build, tune inference perf based on your needs mapping = Mapping(world_size=tp_size, rank=rank, tp_size=tp_size) llama = LLaMAForCausalLM.from_hugging_face(hf_model_dir, mapping=mapping) diff --git a/examples/mpt/requirements.txt b/examples/mpt/requirements.txt index 73cd7c4df..ee73d169c 100644 --- a/examples/mpt/requirements.txt +++ b/examples/mpt/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100100 +tensorrt_llm==0.14.0.dev2024100800 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/nemotron/requirements.txt b/examples/nemotron/requirements.txt index eedf21867..79128ee0d 100644 --- a/examples/nemotron/requirements.txt +++ b/examples/nemotron/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100100 +tensorrt_llm==0.14.0.dev2024100800 nemo-toolkit[all]==2.0.0rc1 megatron-core==0.8.0 datasets~=2.14.5 diff --git a/examples/opt/requirements.txt b/examples/opt/requirements.txt index 73cd7c4df..ee73d169c 100644 --- a/examples/opt/requirements.txt +++ b/examples/opt/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100100 +tensorrt_llm==0.14.0.dev2024100800 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/phi/requirements.txt b/examples/phi/requirements.txt index b711cf2ad..d145002d2 100644 --- a/examples/phi/requirements.txt +++ b/examples/phi/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100100 +tensorrt_llm==0.14.0.dev2024100800 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/quantization/quantize.py b/examples/quantization/quantize.py index f2fe6f3c9..c1380efa3 100644 --- a/examples/quantization/quantize.py +++ b/examples/quantization/quantize.py @@ -55,8 +55,13 @@ help="Quantization format.", default="full_prec", choices=[ - "fp8", "int8_sq", "int4_awq", "w4a8_awq", "int8_wo", "int4_wo", - "full_prec" + "fp8", + "int8_sq", + "int4_awq", + "w4a8_awq", + "int8_wo", + "int4_wo", + "full_prec", ], ) parser.add_argument( @@ -101,15 +106,43 @@ action='store_true', help="whether to quantize the weights of medusa heads") + # auto quantization + parser.add_argument( + '--autoq_format', + default=None, + type=str, + help= + "Specific quantization algorithms will be searched in auto quantization." + "The algorithm must in ['fp8', 'int4_awq', 'w4a8_awq', 'int8_sq']." 
+ "You can use ',' to separate more than one quantization algorithms(e.g. --autoq_format fp8,int4_awq,w4a8_awq)." + "Notice: fp8 and int8_sq can't be used at the same time.") + parser.add_argument( + '--weight_compression', + type=float, + default=None, + help="Percent of compression size when using mix precision quantization." + "The range is [0.0, 1.0], if you only indicate the autoq_format, it will be default to the lowest possible value." + ) + args = parser.parse_args() + # weight_compression check + if args.autoq_format: + lower_bound = 0.25 if '4' in args.autoq_format else 0.5 + if args.weight_compression is None or args.weight_compression < lower_bound: + print( + f"invalid weight_compression value, will be set to {lower_bound}" + ) + args.weight_compression = lower_bound + if args.model_dir is not None: quantize_and_export( model_dir=args.model_dir, device=args.device, calib_dataset=args.calib_dataset, dtype=args.dtype, - qformat=args.qformat, + qformat=args.qformat + if args.weight_compression is None else args.autoq_format, kv_cache_dtype=args.kv_cache_dtype, calib_size=args.calib_size, batch_size=args.batch_size, @@ -125,7 +158,8 @@ max_draft_len=args.max_draft_len, medusa_hidden_act=args.medusa_hidden_act, medusa_model_dir=args.medusa_model_dir, - quant_medusa_head=args.quant_medusa_head) + quant_medusa_head=args.quant_medusa_head, + weight_compression=args.weight_compression) elif args.nemo_ckpt_path is not None: quantize_nemo_and_export(nemo_ckpt_path=args.nemo_ckpt_path, decoder_type=args.decoder_type, diff --git a/examples/quantization/requirements.txt b/examples/quantization/requirements.txt index 4df6d2b70..34dfb19dd 100644 --- a/examples/quantization/requirements.txt +++ b/examples/quantization/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100100 +tensorrt_llm==0.14.0.dev2024100800 datasets>=2.14.4 nemo-toolkit[all]<=1.20.0,>=1.18.0 rouge_score~=0.1.2 diff --git a/examples/qwen/requirements.txt b/examples/qwen/requirements.txt index 4ad36e3f6..46dbe9896 100644 --- a/examples/qwen/requirements.txt +++ b/examples/qwen/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100100 +tensorrt_llm==0.14.0.dev2024100800 datasets~=2.16.0 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/qwenvl/requirements.txt b/examples/qwenvl/requirements.txt index 989f4fb60..9f6671eb3 100644 --- a/examples/qwenvl/requirements.txt +++ b/examples/qwenvl/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100100 +tensorrt_llm==0.14.0.dev2024100800 datasets~=2.16.0 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/recurrentgemma/requirements.txt b/examples/recurrentgemma/requirements.txt index 9303bc411..d589b27fe 100644 --- a/examples/recurrentgemma/requirements.txt +++ b/examples/recurrentgemma/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100100 +tensorrt_llm==0.14.0.dev2024100800 git+https://github.com/google-deepmind/recurrentgemma.git flax>=0.8.2 jax~=0.4.23 diff --git a/examples/redrafter/requirements.txt b/examples/redrafter/requirements.txt index 82689ec1c..5ac8a0ffb 100644 --- a/examples/redrafter/requirements.txt +++ b/examples/redrafter/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100100 +tensorrt_llm==0.14.0.dev2024100800 datasets~=2.14.5 rouge_score~=0.1.2 sentencepiece~=0.1.99 diff --git 
a/examples/run.py b/examples/run.py index cadd6cd19..86292b826 100644 --- a/examples/run.py +++ b/examples/run.py @@ -41,6 +41,11 @@ def parse_arguments(args=None): parser = argparse.ArgumentParser() parser.add_argument('--max_input_length', type=int, default=923) parser.add_argument('--max_output_len', type=int, required=True) + parser.add_argument( + '--draft_engine_dir', + type=str, + default=None, + help='Path to engine of draft model in Draft-Target-Model mode.') parser.add_argument( '--input_text', type=str, @@ -168,6 +173,11 @@ def parse_input(tokenizer, batch_input_ids = [ torch.tensor(x, dtype=torch.int32) for x in batch_input_ids ] + + logger.debug(f"Input token ids (batch_size = {len(batch_input_ids)}):") + for i, input_ids in enumerate(batch_input_ids): + logger.debug(f"Request {i}: {input_ids.tolist()}") + return batch_input_ids @@ -233,6 +243,7 @@ def print_output(tokenizer, if num_return_sequences > 1 else f'Text {batch_idx} Beam {beam}') print(f'Output [{index_str}]: \"{output_text}\"') + logger.debug(str(outputs)) output_ids = output_ids.reshape((-1, output_ids.size(2))) @@ -288,10 +299,230 @@ def print_output(tokenizer, np.save(log_probs_file, log_probs_outputs) +def run_draft_target_model(batch_input_ids, args, runtime_rank, end_id, pad_id, + stop_words_list, bad_words_list, vocab_size): + draft_len, draft_device_list, target_device_list, use_logits = ast.literal_eval( + args.draft_target_model_config) + logger.info(f"draft_len: {draft_len}") + logger.info(f"Device(s) for draft model: {draft_device_list}") + logger.info(f"Device(s) for target model: {target_device_list}") + logger.info(f"Use logits to accept tokens: {use_logits}") + # Variables keeping constant during decoding + input_batch_size = len(batch_input_ids) # Note as `BS` + beam_width = args.num_beams # Note as `BW` + is_compute_acceptance_ratio = logger.level == 'verbose' # Only enable in verbose mode + input_lengths = [len(p) for p in batch_input_ids] + max_seq_lengths = [i + args.max_output_len for i in input_lengths] + # Variables changing during decoding + n_iteration = 0 + prefix = batch_input_ids # Input for draft model + batch_slot = list(range(input_batch_size)) # Index of requests + if is_compute_acceptance_ratio: + n_draft_token = [0 for _ in range(input_batch_size)] + n_accept_token = [0 for _ in range(input_batch_size)] + + # Repack the output like the output of function `generate` + outputs = {} + outputs["output_ids"] = torch.full( + [input_batch_size, beam_width, + max(max_seq_lengths)], + end_id, + dtype=torch.int32) + for bs in range(input_batch_size): + outputs["output_ids"][bs, :, :input_lengths[bs]] = batch_input_ids[bs] + outputs["sequence_lengths"] = torch.full([input_batch_size, beam_width], + 0, + dtype=torch.int32) + outputs["context_logits"] = None + outputs["generation_logits"] = torch.full( + [input_batch_size, beam_width, + max(max_seq_lengths), vocab_size], + 0, + dtype=torch.float16) + outputs['cum_log_probs'] = None + outputs['log_probs'] = None + + # Model runners + common_kwargs = dict( + lora_dir=args.lora_dir, + rank=runtime_rank, + debug_mode=args.debug_mode, + lora_ckpt_source=args.lora_ckpt_source, + gpu_weights_percent=args.gpu_weights_percent, + max_output_len=args.max_output_len, + is_enc_dec=False, + max_batch_size=input_batch_size, + max_input_len=max(input_lengths) + args.max_output_len, + max_beam_width=beam_width, + max_attention_window_size=args.max_attention_window_size, + sink_token_length=args.sink_token_length, + 
max_tokens_in_paged_kv_cache=args.max_tokens_in_paged_kv_cache, + kv_cache_enable_block_reuse=args.kv_cache_enable_block_reuse, + kv_cache_free_gpu_memory_fraction=args. + kv_cache_free_gpu_memory_fraction, + enable_chunked_context=args.enable_chunked_context, + multi_block_mode=args.multi_block_mode, + cuda_graph_mode=args.cuda_graph_mode, + enable_context_fmha_fp32_acc=args.enable_context_fmha_fp32_acc, + ) + draft_runner_kwargs = common_kwargs.copy() + draft_runner_kwargs.update( + engine_dir=args.draft_engine_dir, + device_ids=draft_device_list, + ) + draft_runner = ModelRunnerCpp.from_dir(**draft_runner_kwargs) + target_runner_kwargs = common_kwargs.copy() + target_runner_kwargs.update( + engine_dir=args.engine_dir, + device_ids=target_device_list, + ) + target_runner = ModelRunnerCpp.from_dir(**target_runner_kwargs) + + common_gen_kwargs = dict( + max_attention_window_size=args.max_attention_window_size, + sink_token_length=args.sink_token_length, + end_id=end_id, + pad_id=pad_id, + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + num_beams=beam_width, + num_return_sequences=args.num_return_sequences, + length_penalty=args.length_penalty, + early_stopping=args.early_stopping, + repetition_penalty=args.repetition_penalty, + presence_penalty=args.presence_penalty, + frequency_penalty=args.frequency_penalty, + stop_words_list=stop_words_list, + bad_words_list=bad_words_list, + random_seed=args.random_seed, + streaming=False, + output_sequence_lengths=True, + return_dict=True, + ) + + while True: + n_iteration += 1 + batch_size = len(prefix) + prefix_len = [len(prefix[i]) for i in range(batch_size)] + # Run draft model + draft_generation_kwargs = common_gen_kwargs.copy() + draft_generation_kwargs.update( + batch_input_ids=prefix, + max_new_tokens=draft_len, + streaming=False, + output_sequence_lengths=True, + return_dict=True, + ) + draft = draft_runner.generate(**draft_generation_kwargs) + torch.cuda.synchronize() + + # draft["output_ids"].shape -> [BS, BW, maxSL] + # draft["sequence_lengths"].shape -> [BS, BW] + # draft["generation_logits"].shape -> [BS, BW, draft_len, vocab_size] + # `d_*` means variables from draft model + # Value of `d_seq_len` includes input part, but `draft_len` doesn't + d_seq_len = draft["sequence_lengths"][:, 0].tolist() + d_len = [d_seq_len[bs] - prefix_len[bs] for bs in range(batch_size)] + d_ids = [None] * batch_size + if use_logits: + assert "generation_logits" in draft.keys( + ), "`--gather_generation_logits` must be specified when building TRT engine." 
+ d_logits = [None] * batch_size + else: + d_logits = None + + for bs in range(batch_size): + l = prefix_len[bs] + r = d_seq_len[bs] + d_ids[bs] = draft["output_ids"][bs, 0, l:r].tolist() + if use_logits: + d_logits[bs] = draft["generation_logits"][bs, 0, :, :] + + # Run target model + target_generation_kwargs = common_gen_kwargs.copy() + target_generation_kwargs.update( + batch_input_ids=prefix, + max_new_tokens=draft_len + 1, + draft_tokens_list=d_ids, + draft_logits_list=d_logits, + ) + target = target_runner.generate(**target_generation_kwargs) + torch.cuda.synchronize() + + # `t_*` means variables from target model + # Value of `t_seq_len` and `t_seq_ids` includes input part, but `t_len` or `t_ids` doesn't + t_seq_len = target["sequence_lengths"][:, 0].tolist() + # t_len = [t_seq_len[bs] - prefix_len[bs] for bs in range(batch_size)] + t_seq_ids = [None] * batch_size + t_ids = [None] * batch_size + + # Update output and tokens for next iteration + for bs in range(batch_size): + index = batch_slot[bs] # Get original index in the input batch + l = prefix_len[bs] + r = min(t_seq_len[bs], max_seq_lengths[index]) + t_ids[bs] = target["output_ids"][bs, 0, l:r].tolist() + t_seq_ids[bs] = target["output_ids"][bs, 0, :r] + outputs["output_ids"][index, 0, l:r] = torch.IntTensor(t_ids[bs]) + outputs["sequence_lengths"][index, 0] = r + if use_logits: + outputs["generation_logits"][index, 0, (l - input_lengths[bs]):(r - input_lengths[bs])] = \ + target["generation_logits"][bs][0,:(r-l)].detach().cpu() + if is_compute_acceptance_ratio: + n_draft_token[index] += len(d_ids[bs]) + n_accept_token[index] += sum(d_ids[bs][i] == t_ids[bs][i] \ + for i in range(min(d_len[bs], t_seq_len[bs] - prefix_len[bs], max_seq_lengths[index] - prefix_len[bs]))) + + # yield output if using streaming + if args.streaming and not n_iteration % args.streaming_interval: + yield outputs + + # Evaluate stop criteria and prepare inputs for next iteration + prefix_next = [] + batch_slot_next = [] + for bs in range(batch_size): + # Stop due to output length + if len(t_seq_ids[bs]) >= max_seq_lengths[batch_slot[bs]]: + continue # No need to update for the stopped requests + # Stop due to the same output. Normally target should return 1 more token. + # if (d_ids is not None and np.array_equal(d_ids[bs], t_ids[bs])): + # continue + # Stop due to no change (hit early stopping) + if np.array_equal(t_seq_ids[bs], prefix[bs]): + continue + # Stop due to end words + if end_id in t_seq_ids[bs]: + continue + # TODO: Check bad words and stop words criteria + prefix_next.append(t_seq_ids[bs]) + batch_slot_next.append(bs) + prefix = prefix_next + batch_slot = batch_slot_next + if len(prefix) == 0: # Leave while loop if no request remained + break + + if is_compute_acceptance_ratio: + logger.debug(f"Count of iteration(s): {n_iteration}") + logger.debug(f"Acceptance ratio:") + for i, (a, d) in enumerate(zip(n_accept_token, n_draft_token)): + logger.debug(f"Request {i}: {a / d * 100 :6.2f}%") + + # Return runner in No-Streaming mode + if args.streaming: + yield outputs + else: + yield outputs, target_runner + + def main(args): runtime_rank = tensorrt_llm.mpi_rank() logger.set_level(args.log_level) + if args.draft_target_model_config is not None: + assert args.draft_engine_dir is not None, "Path to draft engine (--draft_engine_dir) must be specified." + assert args.engine_dir is not None, "Path to target engine (--engine_dir) must be specified." 
+ # different handling if encoder-decoder models is_enc_dec = {'encoder', 'decoder'}.issubset({ name @@ -404,88 +635,107 @@ def main(args): ) args.return_all_generated_tokens = True - runner_cls = ModelRunner if args.use_py_session else ModelRunnerCpp - runner_kwargs = dict( - engine_dir=args.engine_dir, - lora_dir=args.lora_dir, - rank=runtime_rank, - debug_mode=args.debug_mode, - lora_ckpt_source=args.lora_ckpt_source, - gpu_weights_percent=args.gpu_weights_percent, - max_output_len=args.max_output_len, - ) - if not args.use_py_session: - runner_kwargs.update(is_enc_dec=is_enc_dec) - if args.medusa_choices is not None: - args.medusa_choices = ast.literal_eval(args.medusa_choices) - assert args.temperature == 1.0, "Medusa should use temperature == 1.0" - assert args.num_beams == 1, "Medusa should use num_beams == 1" - runner_kwargs.update(medusa_choices=args.medusa_choices) - if args.lookahead_config is not None: - args.lookahead_config = ast.literal_eval(args.lookahead_config) - assert len( - args.lookahead_config - ) == 3, "Lookahead needs [max_window_size, max_ngram_size, max_verification_set_size]" - runner_kwargs.update(lookahead_config=args.lookahead_config) - if not args.use_py_session: + logger.info(f"Using {'Python' if args.use_py_session else 'C++'} session") + + if args.draft_target_model_config is None: # Normal run except Draft-Target-Model + runner_cls = ModelRunner if args.use_py_session else ModelRunnerCpp + runner_kwargs = dict( + engine_dir=args.engine_dir, + lora_dir=args.lora_dir, + rank=runtime_rank, + debug_mode=args.debug_mode, + lora_ckpt_source=args.lora_ckpt_source, + gpu_weights_percent=args.gpu_weights_percent, + max_output_len=args.max_output_len, + ) + if not args.use_py_session: + runner_kwargs.update(is_enc_dec=is_enc_dec) + if args.medusa_choices is not None: + args.medusa_choices = ast.literal_eval(args.medusa_choices) + assert args.temperature == 1.0, "Medusa should use temperature == 1.0" + assert args.num_beams == 1, "Medusa should use num_beams == 1" + runner_kwargs.update(medusa_choices=args.medusa_choices) + if args.lookahead_config is not None: + args.lookahead_config = ast.literal_eval(args.lookahead_config) + assert len( + args.lookahead_config + ) == 3, "Lookahead needs [max_window_size, max_ngram_size, max_verification_set_size]" + runner_kwargs.update(lookahead_config=args.lookahead_config) + if not args.use_py_session: + runner_kwargs.update( + max_batch_size=len(batch_input_ids), + max_input_len=max( + encoder_input_lengths if is_enc_dec else input_lengths), + max_beam_width=args.num_beams, + max_attention_window_size=args.max_attention_window_size, + sink_token_length=args.sink_token_length, + max_tokens_in_paged_kv_cache=args.max_tokens_in_paged_kv_cache, + kv_cache_enable_block_reuse=args.kv_cache_enable_block_reuse, + kv_cache_free_gpu_memory_fraction=args. + kv_cache_free_gpu_memory_fraction, + enable_chunked_context=args.enable_chunked_context, + multi_block_mode=args.multi_block_mode, + cuda_graph_mode=args.cuda_graph_mode) runner_kwargs.update( - max_batch_size=len(batch_input_ids), - max_input_len=max( - encoder_input_lengths if is_enc_dec else input_lengths), - max_beam_width=args.num_beams, - max_attention_window_size=args.max_attention_window_size, - sink_token_length=args.sink_token_length, - max_tokens_in_paged_kv_cache=args.max_tokens_in_paged_kv_cache, - kv_cache_enable_block_reuse=args.kv_cache_enable_block_reuse, - kv_cache_free_gpu_memory_fraction=args. 
- kv_cache_free_gpu_memory_fraction, - enable_chunked_context=args.enable_chunked_context, - multi_block_mode=args.multi_block_mode, - cuda_graph_mode=args.cuda_graph_mode) - runner_kwargs.update( - enable_context_fmha_fp32_acc=args.enable_context_fmha_fp32_acc) - runner = runner_cls.from_dir(**runner_kwargs) - - with torch.no_grad(): - outputs = runner.generate( - batch_input_ids=decoder_input_ids - if is_enc_dec else batch_input_ids, - encoder_input_ids=encoder_input_ids if is_enc_dec else None, - encoder_input_features=encoder_input_features - if is_enc_dec else None, - encoder_output_lengths=encoder_output_lengths - if is_enc_dec else None, - max_new_tokens=args.max_output_len, - max_attention_window_size=args.max_attention_window_size, - sink_token_length=args.sink_token_length, - end_id=end_id, - pad_id=pad_id, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - num_beams=args.num_beams, - num_return_sequences=args.num_return_sequences, - length_penalty=args.length_penalty, - early_stopping=args.early_stopping, - repetition_penalty=args.repetition_penalty, - presence_penalty=args.presence_penalty, - frequency_penalty=args.frequency_penalty, - stop_words_list=stop_words_list, - bad_words_list=bad_words_list, - output_cum_log_probs=(args.output_cum_log_probs_npy != None), - output_log_probs=(args.output_log_probs_npy != None), - random_seed=args.random_seed, - lora_uids=args.lora_task_uids, - prompt_table=args.prompt_table_path, - prompt_tasks=args.prompt_tasks, - streaming=args.streaming, - output_sequence_lengths=True, - no_repeat_ngram_size=args.no_repeat_ngram_size, - return_dict=True, - medusa_choices=args.medusa_choices, - return_all_generated_tokens=args.return_all_generated_tokens, - input_token_extra_ids=input_token_extra_ids) - torch.cuda.synchronize() + enable_context_fmha_fp32_acc=args.enable_context_fmha_fp32_acc) + runner = runner_cls.from_dir(**runner_kwargs) + + with torch.no_grad(): + outputs = runner.generate( + batch_input_ids=decoder_input_ids + if is_enc_dec else batch_input_ids, + encoder_input_ids=encoder_input_ids if is_enc_dec else None, + encoder_input_features=encoder_input_features + if is_enc_dec else None, + encoder_output_lengths=encoder_output_lengths + if is_enc_dec else None, + max_new_tokens=args.max_output_len, + max_attention_window_size=args.max_attention_window_size, + sink_token_length=args.sink_token_length, + end_id=end_id, + pad_id=pad_id, + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + num_beams=args.num_beams, + num_return_sequences=args.num_return_sequences, + length_penalty=args.length_penalty, + early_stopping=args.early_stopping, + repetition_penalty=args.repetition_penalty, + presence_penalty=args.presence_penalty, + frequency_penalty=args.frequency_penalty, + stop_words_list=stop_words_list, + bad_words_list=bad_words_list, + output_cum_log_probs=(args.output_cum_log_probs_npy != None), + output_log_probs=(args.output_log_probs_npy != None), + random_seed=args.random_seed, + lora_uids=args.lora_task_uids, + prompt_table=args.prompt_table_path, + prompt_tasks=args.prompt_tasks, + streaming=args.streaming, + output_sequence_lengths=True, + no_repeat_ngram_size=args.no_repeat_ngram_size, + return_dict=True, + medusa_choices=args.medusa_choices, + return_all_generated_tokens=args.return_all_generated_tokens, + input_token_extra_ids=input_token_extra_ids) + torch.cuda.synchronize() + + else: # For Draft-Target-Model + if not args.kv_cache_enable_block_reuse: + logger.warning( + 
"`--kv_cache_enable_block_reuse` must be specified in Draft-Target-Model." + ) + assert not args.use_py_session, "Only CPP session is supported in Draft-Target-Model." + assert not is_enc_dec, "Only decoder model is supported in Draft-Target-Model." + assert args.num_beams == 1, "Beam width > 1 is not supported in Draft-Target-Model." + + outputs = run_draft_target_model(batch_input_ids, args, runtime_rank, + end_id, pad_id, stop_words_list, + bad_words_list, tokenizer.vocab_size) + + if not args.streaming: # Unpack runner from the return value in No-Streaming mode + outputs, runner = list(outputs)[0] if args.streaming: for curr_outputs in throttle_generator(outputs, diff --git a/examples/skywork/requirements.txt b/examples/skywork/requirements.txt index 0ac45f703..065e4c4f4 100644 --- a/examples/skywork/requirements.txt +++ b/examples/skywork/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100100 +tensorrt_llm==0.14.0.dev2024100800 datasets~=2.16.1 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/smaug/requirements.txt b/examples/smaug/requirements.txt index 0658842e8..44c2d95c5 100644 --- a/examples/smaug/requirements.txt +++ b/examples/smaug/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100100 +tensorrt_llm==0.14.0.dev2024100800 datasets==2.14.6 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/summarize.py b/examples/summarize.py index b908c83f6..faa3412d2 100644 --- a/examples/summarize.py +++ b/examples/summarize.py @@ -740,6 +740,7 @@ def eval_hf(datapoint, if __name__ == '__main__': + # see `add_common_args` for extended list of arguments parser = argparse.ArgumentParser() parser.add_argument('--test_hf', action='store_true') parser.add_argument('--test_trt_llm', action='store_true') diff --git a/examples/utils.py b/examples/utils.py index de3361c3a..160494de7 100644 --- a/examples/utils.py +++ b/examples/utils.py @@ -292,7 +292,11 @@ def add_common_args(parser): parser.add_argument('--cuda_graph_mode', action='store_true', help="Enable cuda graphs in the inference.") - parser.add_argument('--log_level', type=str, default='info') + parser.add_argument( + '--log_level', + type=str, + choices=['verbose', 'info', 'warning', 'error', 'internal_error'], + default='info') parser.add_argument( '--no_prompt_template', dest='use_prompt_template', @@ -343,19 +347,26 @@ def add_common_args(parser): help="Number of (default) virtual tokens to prepend to each sentence." " For example, '--num_prepend_vtokens=10' will prepend the tokens" " [vocab_size, vocab_size + 1, ..., vocab_size + 9] to the sentence.") + parser.add_argument( + '--draft_target_model_config', + type=str, + default=None, + help= + "Configuration of Draft-Target-Model decoding, see `examples/draft_target_model/README.md` for more information." + " E.g.: [4, [0], [1], False] for [draft_len, draft_model_device_list, target_model_device_list, use_logits]." + ) parser.add_argument( '--medusa_choices', type=str, default=None, - help="Medusa choice to use, if not none, will use Medusa decoding." + help="Configuration of Medusa decoding." " E.g.: [[0, 0, 0, 0], [0, 1, 0], [1, 0], [1, 1]] for 9 medusa tokens." ) parser.add_argument( '--lookahead_config', type=str, default=None, - help= - "executor and request lookahead config to use, if not none, will use lookahead decoding." + help="Configuration of executor and request lookahead decoding." 
" E.g.: [5, 6, 7] for [max_window_size, max_ngram_size, max_verification_set_size]." ) # model arguments diff --git a/examples/whisper/requirements.txt b/examples/whisper/requirements.txt index 5bf15bc3b..4b3670a10 100644 --- a/examples/whisper/requirements.txt +++ b/examples/whisper/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.14.0.dev2024100100 +tensorrt_llm==0.14.0.dev2024100800 tiktoken datasets kaldialign diff --git a/requirements-dev.txt b/requirements-dev.txt index 3342d10f8..3bf5a427d 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -11,6 +11,7 @@ pytest-cov pytest-forked pytest-xdist pytest-timeout +pytest-split rouge_score cloudpickle typing-extensions==4.8.0 diff --git a/scripts/build_wheel.py b/scripts/build_wheel.py index 0f7e4c750..68ae71881 100755 --- a/scripts/build_wheel.py +++ b/scripts/build_wheel.py @@ -90,7 +90,15 @@ def main(*, os.chdir(project_dir) build_run = partial(run, shell=True, check=True) - if not (project_dir / "3rdparty/cutlass/.git").exists(): + # Get all submodules and check their folder exists. If not, + # invoke git submodule update + with open(project_dir / ".gitmodules", "r") as submodules_f: + submodules = [ + l.split("=")[1].strip() for l in submodules_f.readlines() + if "path = " in l + ] + if any(not (project_dir / submodule / ".git").exists() + for submodule in submodules): build_run('git submodule update --init --recursive') on_windows = platform.system() == "Windows" requirements_filename = "requirements-dev-windows.txt" if on_windows else "requirements-dev.txt" diff --git a/tensorrt_llm/_utils.py b/tensorrt_llm/_utils.py index 3e5029249..3f64b1bc3 100644 --- a/tensorrt_llm/_utils.py +++ b/tensorrt_llm/_utils.py @@ -518,3 +518,31 @@ def supports_inflight_batching(engine_dir): json_config = GptJsonConfig.parse_file(config_path) model_config = json_config.model_config return model_config.supports_inflight_batching + + +class QuantModeWrapper: + + def __init__(self, objs): + self.objs = objs + + def __getattr__(self, name): + + def method_wrapper(*args, **kwargs): + result = False + for obj in self.objs: + attr = getattr(obj, name) + if callable(attr): + result = result | attr(*args, **kwargs) + return result + + return method_wrapper + + def __repr__(self): + return f"QuantModeWrapper: ({self.objs})" + + def __str__(self): + obj_strs = [str(obj) for obj in self.objs] + return f"[{', '.join(obj_strs)}]" + + def __getitem__(self, index): + return self.objs[index] diff --git a/tensorrt_llm/bench/build/build.py b/tensorrt_llm/bench/build/build.py index 4123e870d..9e7719adc 100644 --- a/tensorrt_llm/bench/build/build.py +++ b/tensorrt_llm/bench/build/build.py @@ -174,21 +174,17 @@ def build_command( stdin, ], [], [], 0.0)[0])) - # Initialize the HF tokenizer for the specified model. - tokenizer = initialize_tokenizer(bench_env.model) - # If we are receiving data from a path or stdin, parse and gather metadata. - if dataset_path or data_on_stdin: + if dataset_path: logger.info("Found dataset.") - # Cannot set the data file path and pipe in from stdin. Choose one. - if dataset_path is not None and data_on_stdin: - raise ValueError( - "Cannot provide a dataset on both stdin and by --dataset " - "option. Please pick one.") - stream = stdin if data_on_stdin else open(dataset_path, "r") - # Parse the dataset from stdin and return it plus its metadata. - metadata, _ = \ - create_dataset_from_stream(tokenizer, stream=stream) + # Initialize the HF tokenizer for the specified model. 
+ tokenizer = initialize_tokenizer(bench_env.model) + # Dataset Loading and Preparation + with open(dataset_path, "r") as dataset: + metadata, _ = create_dataset_from_stream( + tokenizer, + dataset, + ) # The max sequence length option for build is the sum of max osl + isl. max_seq_len = metadata.max_sequence_length logger.info(metadata.get_summary_for_print()) diff --git a/tensorrt_llm/bench/run/run.py b/tensorrt_llm/bench/run/run.py index 4cf22d836..b908940ff 100644 --- a/tensorrt_llm/bench/run/run.py +++ b/tensorrt_llm/bench/run/run.py @@ -20,7 +20,8 @@ from tensorrt_llm.bench.run.utils import (ResponseTuple, StatsKeeper, get_executor_request, get_settings_from_engine) -from tensorrt_llm.bench.utils.data import generate_dataset_from_stream +from tensorrt_llm.bench.utils.data import (create_dataset_from_stream, + initialize_tokenizer) from tensorrt_llm.logger import logger @@ -133,9 +134,13 @@ def run_command( # Construct the runtime configuration dataclass. runtime_config = RuntimeConfig(**exec_settings) + # Initialize the HF tokenizer for the specified model. + tokenizer = initialize_tokenizer(bench_env.model) + # Dataset Loading and Preparation - metadata, requests = generate_dataset_from_stream(dataset_path, model, - num_requests) + with open(dataset_path, "r") as dataset: + metadata, requests = create_dataset_from_stream( + tokenizer, dataset, num_requests=num_requests) # TODO: Verify that the engine can handle the max/min ISL/OSL. if metadata.max_sequence_length > engine_max_seq_len: raise RuntimeError( diff --git a/tensorrt_llm/bench/utils/data.py b/tensorrt_llm/bench/utils/data.py index 4f6380325..b6d00a345 100644 --- a/tensorrt_llm/bench/utils/data.py +++ b/tensorrt_llm/bench/utils/data.py @@ -1,8 +1,5 @@ import json -import sys from functools import partial -from pathlib import Path -from select import select from typing import List, TextIO, Tuple from transformers import AutoTokenizer, PreTrainedTokenizer @@ -10,33 +7,6 @@ from tensorrt_llm.bench.dataclasses import DatasetMetadata, InferenceRequest -def generate_dataset_from_stream(dataset_path: Path, - model: str, - num_requests: int = 0): - # Check for data on stdin. - data_on_stdin: bool = bool(len(select([ - sys.stdin, - ], [], [], 0.0)[0])) - - # Cannot set the data file path and pipe in from stdin. Choose one. - if dataset_path is not None and data_on_stdin: - raise ValueError( - "Cannot provide a dataset on both stdin and by --dataset option. " - "Please pick one.") - # If we are receiving data from a path or stdin, parse and gather metadata. - stream = sys.stdin if data_on_stdin else open(dataset_path, "r") - tokenizer = initialize_tokenizer(model) - # Parse the dataset from stdin and return it plus its metadata. - metadata, requests = \ - create_dataset_from_stream( - tokenizer, - stream=stream, - num_requests=num_requests - ) - - return metadata, requests - - def initialize_tokenizer(model_name: str) -> PreTrainedTokenizer: """Initialize a tokenizer. @@ -58,20 +28,23 @@ def initialize_tokenizer(model_name: str) -> PreTrainedTokenizer: def create_dataset_from_stream( tokenizer: PreTrainedTokenizer, + stream: TextIO, max_input_length: int = 0, max_output_length: int = 0, - stream: TextIO = sys.stdin, num_requests: int = 0, ) -> Tuple[DatasetMetadata, List[InferenceRequest]]: """Generate metadata and a list of requests to drive benchmarking. Args: tokenizer (PreTrainedTokenizer): HuggingFace tokenizer. - max_input_length (int): Maximum input length to cap prompts to. + stream (TextIO): Stream of input requests. 
+ max_input_length (int, optional): Maximum input length to cap prompts to. Defaults to 0. + max_output_length (int, optional): Maximum output length to cap prompts to.. Defaults to 0. + num_requests (int, optional): Number of requests to limit to. Defaults to 0. Returns: - DatasetMetadata: Dataclass of dataset statistics. - List[InferenceRequest]: A list of inference requests for benchmarking. + Tuple[DatasetMetadata, List[InferenceRequest]]: A tuple containing a dataclass of dataset + statistics and a list of inference requests for benchmarking. """ # Initialize dataset list, and metadata tracking variables. dataset = [] diff --git a/tensorrt_llm/bench/utils/tokenize.py b/tensorrt_llm/bench/utils/tokenize.py deleted file mode 100644 index 44f04df56..000000000 --- a/tensorrt_llm/bench/utils/tokenize.py +++ /dev/null @@ -1,105 +0,0 @@ -import json -import sys -from functools import partial -from typing import List, TextIO, Tuple - -from transformers import AutoTokenizer, PreTrainedTokenizer - -from tensorrt_llm.bench.dataclasses import DatasetMetadata, InferenceRequest - - -def initialize_tokenizer(model_name: str) -> PreTrainedTokenizer: - """Initialize a tokenizer. - - Args: - model_name (str): The name of the HuggingFace model to pull a - tokenizer from. - - Returns: - PreTrainedTokenizer: An initialized HuggingFace tokenizer. - """ - # Initialize the tokenizer specific to the model that we are planning - # to benchmark. - tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left") - if tokenizer.pad_token_id is None: - tokenizer.add_special_tokens({"pad_token": "[PAD]"}) - - return tokenizer - - -def create_dataset_from_stream( - tokenizer: PreTrainedTokenizer, - max_input_length: int = 0, - max_output_length: int = 0, - stream: TextIO = sys.stdin, -) -> Tuple[DatasetMetadata, List[InferenceRequest]]: - """Generate metadata and a list of requests to drive benchmarking. - - Args: - tokenizer (PreTrainedTokenizer): HuggingFace tokenizer. - max_input_length (int): Maximum input length to cap prompts to. - - Returns: - DatasetMetadata: Dataclass of dataset statistics. - List[InferenceRequest]: A list of inference requests for benchmarking. - """ - # Initialize dataset list, and metadata tracking variables. - dataset = [] - max_isl = 0 - max_osl = 0 - max_sequence = 0 - - # If we're limiting the input length to a certain size, then set up - # a partial to truncate the data down to size. Otherwise, just use the - # unmodified tokenizer callable. - tokenize = (partial( - tokenizer, - padding="max_length", - max_length=max_input_length, - truncation=True, - ) if max_input_length > 0 else tokenizer) - - # If we need to limit the output length, fill in a partial callable - # for max, otherwise a lambda that just returns x with no bounds. - output_limiter = (partial(max, max_output_length) - if max_output_length > 0 else lambda x: x) - - # For each line in the standard input, parse out the JSON string we expect - # to see. - # Note the := walrus -- we're assigning and checking the condition. - while line := stream.readline(): - # We expect the data to come in as a JSON string. - # For example: - # {"prompt": "Generate an infinite response to the following: There once was a man who.", "output_tokens": 1000} - # Each line should be a complete JSON dictionary with no indentation - # or newline characters. 
- data = json.loads(line) - logits = data.get("logits", None) - prompt = data.get("prompt", None) - task_id = data["task_id"] - osl = data["output_tokens"] - # If the request comes in with logits, just use the provided. - # Otherwise we need to tokenize it. - logits = tokenize(prompt)["input_ids"] if logits is None else logits - - request = InferenceRequest( - task_id=task_id, - prompt=prompt, - output_tokens=output_limiter(osl), - logits=logits, - ) - max_isl = max(max_isl, len(logits)) - max_osl = max(max_osl, osl) - max_sequence = max(max_sequence, len(logits) + osl) - dataset.append(request) - - # Fill in basic dataset metrics here - # TODO: Maybe fill this out to be more complete? - metadata = DatasetMetadata( - max_isl=max_isl, - max_osl=max_osl, - max_sequence_length=max_sequence, - num_requests=len(dataset), - ) - - return metadata, dataset diff --git a/tensorrt_llm/builder.py b/tensorrt_llm/builder.py index 5849bc20d..fe951e9a6 100644 --- a/tensorrt_llm/builder.py +++ b/tensorrt_llm/builder.py @@ -141,7 +141,6 @@ def create_builder_config(self, use_refit: bool = False, int8: bool = False, strongly_typed: bool = True, - opt_level: Optional[int] = None, force_num_profiles: Optional[int] = None, profiling_verbosity: str = "layer_names_only", use_strip_plan: bool = False, @@ -191,9 +190,6 @@ def create_builder_config(self, if use_strip_plan: config.set_flag(trt.BuilderFlag.STRIP_PLAN) - if opt_level is not None: - config.builder_optimization_level = opt_level - # Set TRT Engine profiling verbosity if profiling_verbosity == "detailed": config.profiling_verbosity = trt.ProfilingVerbosity.DETAILED @@ -479,7 +475,6 @@ class BuildConfig: gather_context_logits: int = False gather_generation_logits: int = False strongly_typed: bool = True - builder_opt: Optional[int] = None force_num_profiles: Optional[int] = None profiling_verbosity: str = 'layer_names_only' enable_debug_output: bool = False @@ -567,7 +562,6 @@ def from_dict(cls, config, plugin_config=None): gather_context_logits = config.pop('gather_context_logits', False) gather_generation_logits = config.pop('gather_generation_logits', False) strongly_typed = config.pop('strongly_typed', True) - builder_opt = config.pop('builder_opt', None) force_num_profiles = config.pop('force_num_profiles', None) weight_sparsity = config.pop('weight_sparsity', False) profiling_verbosity = config.pop('profiling_verbosity', @@ -608,7 +602,6 @@ def from_dict(cls, config, plugin_config=None): gather_context_logits=gather_context_logits, gather_generation_logits=gather_generation_logits, strongly_typed=strongly_typed, - builder_opt=builder_opt, force_num_profiles=force_num_profiles, profiling_verbosity=profiling_verbosity, enable_debug_output=enable_debug_output, @@ -731,10 +724,24 @@ def save(self, engine_dir: str): if os.path.exists(root_lora_dir) and os.path.isdir(root_lora_dir): shutil.rmtree(root_lora_dir) if self.config.pretrained_config.mapping.rank == 0: + config_dict = self.config.to_dict() + if self.config.pretrained_config.quant_algo == QuantAlgo.MIXED_PRECISION: + quant_dict = { + 'version': self.config.version, + } + quant_dict.update( + config_dict['pretrained_config']['quantization']) + config_dict['pretrained_config']['quantization'].pop( + 'quantized_layers', None) + with open(os.path.join(engine_dir, 'quant_cfg.json'), + "w", + encoding="utf-8") as f: + json.dump(quant_dict, f, indent=4, cls=ConfigEncoder) + with open(os.path.join(engine_dir, 'config.json'), "w", encoding="utf-8") as f: - json.dump(self.config.to_dict(), f, indent=4, 
cls=ConfigEncoder) + json.dump(config_dict, f, indent=4, cls=ConfigEncoder) if self.engine is not None: serialize_engine( self.engine, @@ -807,7 +814,7 @@ def optimize_model_with_config(model: PretrainedModel, use_lora=build_config.plugin_config.lora_plugin is not None, max_lora_rank=build_config.lora_config.max_lora_rank, use_fp8_context_fmha=( - model.config.quantization.quant_algo == QuantAlgo.FP8 + QuantAlgo.FP8 == model.config.quantization.quant_algo and build_config.plugin_config.use_fp8_context_fmha), ) @@ -1053,7 +1060,7 @@ def build(model: PretrainedModel, build_config: BuildConfig) -> Engine: "Paged Context FMHA doesn't work with int8 kv cache currently.") if build_config.plugin_config.manage_weights: - if model.config.quant_mode & QuantMode.INT4_WEIGHTS or model.config.quant_mode & QuantMode.INT8_WEIGHTS: + if model.config.quant_mode.has_weight_quant(): raise RuntimeError( "Managed weights is not supported with int4 or int8 weights.") @@ -1068,7 +1075,6 @@ def build(model: PretrainedModel, build_config: BuildConfig) -> Engine: and not model.config.quant_mode.has_per_group_scaling()) or model.config.quant_mode.has_int8_kv_cache(), strongly_typed=build_config.strongly_typed, - opt_level=build_config.builder_opt, force_num_profiles=build_config.force_num_profiles, profiling_verbosity=build_config.profiling_verbosity, quant_mode=model.config.quant_mode, diff --git a/tensorrt_llm/commands/build.py b/tensorrt_llm/commands/build.py index e4efeb509..3a7772ecc 100644 --- a/tensorrt_llm/commands/build.py +++ b/tensorrt_llm/commands/build.py @@ -152,11 +152,6 @@ def parse_arguments(): type=str, default='model.cache', help="The file path to write the timing cache.") - parser.add_argument('--builder_opt', - type=int, - default=None, - choices=[0, 1, 2, 3, 4, 5], - help="TensorRT builder optimization level.") parser.add_argument( '--profiling_verbosity', type=str, @@ -330,7 +325,6 @@ def build_model( bool = False, # return the modified BuildConfig without actually building the engine **kwargs ) -> Union[Engine, BuildConfig]: - model_config = copy.deepcopy(model_config) logits_dtype = kwargs.get('logits_dtype') @@ -505,12 +499,12 @@ def main(): else: config_path = os.path.join(ckpt_dir_or_model_config, 'config.json') ckpt_dir = ckpt_dir_or_model_config - model_config = PretrainedConfig.from_json_file(config_path) # avoid ValueError if not supported quantization is chosen with use_fused_mlp quant_algo = model_config.quantization.quant_algo - if quant_algo and quant_algo != QuantAlgo.FP8: + if quant_algo and quant_algo not in (QuantAlgo.FP8, + QuantAlgo.MIXED_PRECISION): kwargs['use_fused_mlp'] = False if args.build_config is None: @@ -536,7 +530,6 @@ def main(): 'gather_context_logits': args.gather_context_logits, 'gather_generation_logits': args.gather_generation_logits, 'strongly_typed': True, - 'builder_opt': args.builder_opt, 'force_num_profiles': args.builder_force_num_profiles, 'weight_sparsity': args.weight_sparsity, 'profiling_verbosity': args.profiling_verbosity, diff --git a/tensorrt_llm/functional.py b/tensorrt_llm/functional.py index 082a83561..335f85d7c 100644 --- a/tensorrt_llm/functional.py +++ b/tensorrt_llm/functional.py @@ -27,10 +27,11 @@ from . 
import graph_rewriting as gw from ._common import default_net, default_trtnet, precision -from ._utils import (bf16_array, bool_array, dim_resolve_negative, - dim_to_trt_axes, dims_array, fp16_array, fp32_array, - int32_array, int64_array, np_dtype_to_trt, - str_dtype_to_trt, trt_dtype_to_np, trt_dtype_to_str) +from ._utils import (QuantModeWrapper, bf16_array, bool_array, + dim_resolve_negative, dim_to_trt_axes, dims_array, + fp16_array, fp32_array, int32_array, int64_array, + np_dtype_to_trt, str_dtype_to_trt, trt_dtype_to_np, + trt_dtype_to_str) from .network import PluginInfo, set_np_weight, set_plugin_info from .plugin import TRT_LLM_PLUGIN_NAMESPACE, current_all_reduce_helper from .quantization import QuantMode @@ -4579,7 +4580,7 @@ def gpt_attention( kv_orig_quant_scale: Optional[Tensor] = None, kv_quant_orig_scale: Optional[Tensor] = None, attention_output_orig_quant_scale: Optional[Tensor] = None, - kv_cache_quant_mode: QuantMode = QuantMode(0), + kv_cache_quant_mode: Union[QuantModeWrapper, QuantMode] = QuantMode(0), max_context_length: Optional[int] = None, mask_type: AttentionMaskType = AttentionMaskType.causal, block_sparse_block_size: int = 64, @@ -4997,6 +4998,9 @@ def gpt_attention( trt.PluginFieldType.INT32) tp_rank = trt.PluginField("tp_rank", np.array(tp_rank, dtype=np.int32), trt.PluginFieldType.INT32) + if isinstance(kv_cache_quant_mode, QuantModeWrapper): + # Now in TRT-LLM only use global kv_cache, so it's enough to get the first quant mode from list + kv_cache_quant_mode = kv_cache_quant_mode[0] kv_cache_quant_mode_field = trt.PluginField( "kv_cache_quant_mode", np.array(kv_cache_quant_mode, dtype=np.int32), trt.PluginFieldType.INT32) @@ -6200,7 +6204,7 @@ def rg_lru(input: Tensor, def topk(input: Tensor, - k: int, + k: Union[Tensor, int], dim: int, largest: bool = True) -> Tuple[Tensor, Tensor]: ''' @@ -6241,8 +6245,12 @@ def topk(input: Tensor, layer = default_trtnet().add_topk( input.trt_tensor, trt.TopKOperation.MAX if largest else trt.TopKOperation.MIN, - k=k, + k=k if not isinstance(k, Tensor) else 1, axes=axes) + if isinstance(k, Tensor): + if k.ndim() == 1: + k = squeeze(k, 0) + layer.set_input(1, k.trt_tensor) values = layer.get_output(0) indices = layer.get_output(1) diff --git a/tensorrt_llm/hlapi/llm_utils.py b/tensorrt_llm/hlapi/llm_utils.py index 431450783..a19ac6e0a 100644 --- a/tensorrt_llm/hlapi/llm_utils.py +++ b/tensorrt_llm/hlapi/llm_utils.py @@ -473,7 +473,7 @@ def setup(self): def _perform_config_arbitration(self): ''' Arbitrate the configurations for the model building. The configs between different functional or performance - features might be confilcted, and this method will arbitrate the conflicts and raise errors if necessary. + features might be conflicted, and this method will arbitrate the conflicts and raise errors if necessary. ''' self._config_arbitrator = _ConfigArbitrator() if self.build_config_mutable: @@ -1460,7 +1460,7 @@ def save(self, engine_dir: Path): @dataclass class LlmBuildStats: ''' LlmBuildStats is the statistics for the LLM model building. 
''' - # Whether the cache is hitted for the engine + # Whether the cache is hit for the engine cache_hitted: bool = False cache_info: Optional[str] = None diff --git a/tensorrt_llm/layers/embedding.py b/tensorrt_llm/layers/embedding.py index 06882dfac..f822cfec1 100644 --- a/tensorrt_llm/layers/embedding.py +++ b/tensorrt_llm/layers/embedding.py @@ -128,7 +128,7 @@ def forward(self, tokens, prompt_embedding_table, tasks, task_vocab_size): Parameters: tokens : Tensor - the ids to embbed, size [batch_size, seq_len] + the ids to embed, size [batch_size, seq_len] prompt_embedding_table : Tensor the additional embedding table for prompt-tuned tokens, size [num_tasks * num_tokens_per_task, hidden_size] diff --git a/tensorrt_llm/layers/mlp.py b/tensorrt_llm/layers/mlp.py index 5e64c0a3a..312a841eb 100644 --- a/tensorrt_llm/layers/mlp.py +++ b/tensorrt_llm/layers/mlp.py @@ -262,7 +262,7 @@ def __init__( def fc_gate_plugin(self, hidden_states, lora_layer_params=None): # Combine the following pattern # - # SiLU(FC(x)) + Gate(x) + # SiLU(FC(x)) * Gate(x) # # into: # @@ -319,7 +319,7 @@ def fc_gate_plugin(self, hidden_states, lora_layer_params=None): def fc_gate(self, hidden_states, lora_layer_params=None): # Combine the following pattern # - # SiLU(FC(x)) + Gate(x) + # SiLU(FC(x)) * Gate(x) # # into: # diff --git a/tensorrt_llm/layers/moe.py b/tensorrt_llm/layers/moe.py index ee3742bcc..e05ea6de3 100644 --- a/tensorrt_llm/layers/moe.py +++ b/tensorrt_llm/layers/moe.py @@ -25,7 +25,7 @@ from tensorrt_llm.layers.lora import LoraParams from .._common import default_net, default_trtnet -from .._utils import int32_array +from .._utils import QuantModeWrapper, int32_array from ..functional import (AllReduceFusionParams, _add_plugin_info, _create_tensor, allreduce, cast, concat, constant, div, expand, gather_nd, is_gated_activation, @@ -170,6 +170,10 @@ def from_parameter(x): p_output_type_id = trt.PluginField( "output_type_id", np.array([int(output_dtype)], dtype=np.int32), trt.PluginFieldType.INT32) + + if isinstance(quant_mode, QuantModeWrapper): + # We only need to get one quant mode here for specific moe layer + quant_mode = quant_mode[0] p_quant_mode = trt.PluginField("quant_mode", np.array([int(quant_mode)], dtype=np.int32), trt.PluginFieldType.INT32) diff --git a/tensorrt_llm/models/convert_utils.py b/tensorrt_llm/models/convert_utils.py index 7d25399fe..7e14b50f9 100644 --- a/tensorrt_llm/models/convert_utils.py +++ b/tensorrt_llm/models/convert_utils.py @@ -248,6 +248,7 @@ def has_safetensors(model_dir: str): 'ccdv/cnn_dailymail': ('3.0.0', 'train', 'article'), 'cnn_dailymail': ('3.0.0', 'train', 'article'), 'lambada': (None, 'validation', 'text'), + '': (None, 'train', 'text'), # Default value in HF } diff --git a/tensorrt_llm/models/llama/convert.py b/tensorrt_llm/models/llama/convert.py index cd9bbc63f..3f102712a 100644 --- a/tensorrt_llm/models/llama/convert.py +++ b/tensorrt_llm/models/llama/convert.py @@ -1085,7 +1085,9 @@ def quantize(hf_model_dir: str, config: LLaMAConfig, device: str = 'cuda', calib_dataset: str = 'cnn_dailymail', - trust_remote_code: bool = True): + trust_remote_code: bool = True, + calib_batches: int = 512, + calib_max_seq_length: int = 512): ''' Quantize the save the model as TRT-LLM checkpoint to output_dir ''' @@ -1121,7 +1123,14 @@ def quantize(hf_model_dir: str, dataset = load_calib_dataset(calib_dataset) - act_range = capture_activation_range(hf_model, tokenizer, dataset) + if calib_batches == -1: # use the whole dataset if calib_batches is -1 + calib_batches = 
len(dataset) + + act_range = capture_activation_range(hf_model, + tokenizer, + dataset, + num_samples=calib_batches, + seq_len=calib_max_seq_length) qkv_para, smoother = {}, {} if use_smooth_quant: smooth_llama_model(hf_model, act_range, quant_config.smoothquant_val, diff --git a/tensorrt_llm/models/llama/model.py b/tensorrt_llm/models/llama/model.py index c8e84ba1c..aabcc5265 100644 --- a/tensorrt_llm/models/llama/model.py +++ b/tensorrt_llm/models/llama/model.py @@ -451,7 +451,9 @@ def quantize( config=config, device=device, calib_dataset=calib_dataset, - trust_remote_code=trust_remote_code) + trust_remote_code=trust_remote_code, + calib_batches=calib_batches, + calib_max_seq_length=calib_max_seq_length) else: raise ValueError( f"The quant_config ({quant_config}) does not require calibration, try {cls.__name__}.from_hugging_face instead." diff --git a/tensorrt_llm/models/model_weights_loader.py b/tensorrt_llm/models/model_weights_loader.py index e7613f580..e406e342f 100644 --- a/tensorrt_llm/models/model_weights_loader.py +++ b/tensorrt_llm/models/model_weights_loader.py @@ -71,7 +71,7 @@ def translate_to_external_key( """Translate TRT-LLM key into HF key or HF key list (e.g. QKV/MoE/GPTQ) tllm_key will get translated into HF format section by section. - If one section is responeded with multiple hf_keys in a list, \ + If one section is responded with multiple hf_keys in a list, \ the translated keys will also get multiplied accordingly. tllm_key : "transformer.layers.0.attention. qkv .weight" | | | | | | diff --git a/tensorrt_llm/models/modeling_utils.py b/tensorrt_llm/models/modeling_utils.py index 8ef7f32a4..6b8fca73a 100644 --- a/tensorrt_llm/models/modeling_utils.py +++ b/tensorrt_llm/models/modeling_utils.py @@ -3,6 +3,7 @@ import dataclasses import json import os +import re from enum import IntFlag, auto from functools import cached_property from pathlib import Path @@ -14,8 +15,9 @@ import torch from .._common import default_net -from .._utils import (get_init_params, numpy_to_torch, release_gc, - str_dtype_to_torch, str_dtype_to_trt, trt_dtype_to_torch) +from .._utils import (QuantModeWrapper, get_init_params, numpy_to_torch, + release_gc, str_dtype_to_torch, str_dtype_to_trt, + trt_dtype_to_torch) from ..bindings import KVCacheType from ..functional import (PositionEmbeddingType, Tensor, gather_last_token_logits, tanh) @@ -107,7 +109,17 @@ def use_plugin_sq(self): return self.quant_algo in W8A8_SQ_PLUGIN_LIST @cached_property - def quant_mode(self) -> QuantMode: + def quant_mode(self) -> QuantModeWrapper: + quant_mode_list = [ + QuantMode.from_quant_algo( + self.quant_algo, + self.kv_cache_quant_algo, + ) + ] + return QuantModeWrapper(quant_mode_list) + + @cached_property + def layer_quant_mode(self) -> QuantMode: return QuantMode.from_quant_algo( self.quant_algo, self.kv_cache_quant_algo, @@ -124,7 +136,8 @@ def requires_calibration(self): def requires_modelopt_quantization(self): if self.quant_algo in [ QuantAlgo.W4A16_AWQ, QuantAlgo.FP8, - QuantAlgo.W8A8_SQ_PER_CHANNEL, QuantAlgo.W4A8_AWQ + QuantAlgo.W8A8_SQ_PER_CHANNEL, QuantAlgo.W4A8_AWQ, + QuantAlgo.MIXED_PRECISION ]: return True elif self.quant_algo is None and self.kv_cache_quant_algo == QuantAlgo.FP8: @@ -132,6 +145,9 @@ def requires_modelopt_quantization(self): else: return False + def get_quant_cfg(self, module_name=None): + return self + def get_modelopt_qformat(self): algo_to_modelopt_map = { QuantAlgo.W8A16: "int8_wo", @@ -141,6 +157,7 @@ def get_modelopt_qformat(self): QuantAlgo.FP8: 'fp8', 
QuantAlgo.W8A8_SQ_PER_CHANNEL: 'int8_sq', } + assert self.quant_algo != QuantAlgo.MIXED_PRECISION, f"We don't support mixed precision in QuantConfig" if self.quant_algo is not None: assert self.quant_algo in algo_to_modelopt_map, f"We don't use Modelopt for quantization algorithm {self.quant_algo}, you probably shall not call this" return algo_to_modelopt_map[self.quant_algo] @@ -160,12 +177,102 @@ def get_modelopt_kv_cache_dtype(self): @classmethod def from_dict(cls, config: dict): - return cls(**config) + obj = cls(**config) + return obj def to_dict(self): return dataclasses.asdict(self) +@dataclasses.dataclass +class LayerQuantConfig(QuantConfig): + quant_algo: Optional[QuantConfig] = None + kv_cache_quant_algo: Optional[QuantConfig] = None + quantized_layers: Optional[Dict[str, QuantConfig]] = None + exclude_modules: Optional[List[str]] = None + + def __init__(self, + *, + quant_algo: Optional[QuantConfig] = None, + kv_cache_quant_algo: Optional[QuantConfig] = None, + quantized_layers: Optional[Dict[str, QuantConfig]] = None, + exclude_modules: Optional[List[str]] = None, + **kwargs): + self.quant_algo = quant_algo + self.quantized_layers = quantized_layers + self.kv_cache_quant_algo = kv_cache_quant_algo + self.exclude_modules = exclude_modules + self.auto_quant_mode = {} + for name, layer_config in self.quantized_layers.items(): + self.auto_quant_mode.update({ + name: + QuantMode.from_quant_algo( + layer_config.quant_algo, + self.kv_cache_quant_algo, + ) + }) + for key in kwargs: + logger.warning( + f"Warning: Unrecognized parameter '{key}' with value '{kwargs[key]}'" + ) + + @cached_property + def quant_mode(self): + quant_mode_list = list(set(self.auto_quant_mode.values())) + return QuantModeWrapper(quant_mode_list) + + @property + def layer_quant_mode(self) -> Dict[str, QuantMode]: + return self.auto_quant_mode + + @cached_property + def auto_quant_list(self): + quant_list = [] + for _, layer_config in self.quantized_layers.items(): + quant_list.append(layer_config.quant_algo) + return list(set(quant_list)) + + @classmethod + def from_dict(cls, config: dict): + quantized_layers = config.pop('quantized_layers', {}) + + quantized_layers_dict = { + layer_name: QuantConfig(**layer_config) + for layer_name, layer_config in quantized_layers.items() + } + + obj = cls(quantized_layers=quantized_layers_dict, **config) + return obj + + def get_quant_cfg(self, module_name): + assert module_name in self.quantized_layers.keys(), \ + "module {module_name} should be included in `quantized_layers` in AutoQuant mode" + return self.quantized_layers[module_name] + + def get_modelopt_qformat(self): + algo_to_modelopt_map = { + QuantAlgo.W4A16_AWQ: "int4_awq", + QuantAlgo.W4A8_AWQ: 'w4a8_awq', + QuantAlgo.FP8: 'fp8', + QuantAlgo.W8A8_SQ_PER_CHANNEL: 'int8_sq', + } + assert self.quant_algo == QuantAlgo.MIXED_PRECISION, f"We only support mixed precision quantization in LayerQuantConfig" + autoq_format = ','.join( + [algo_to_modelopt_map[item] for item in self.auto_quant_list]) + return autoq_format + + def to_dict(self): + output = copy.deepcopy(self.__dict__) + output.pop('auto_quant_mode', None) + output.pop('quant_mode', None) + output.pop('exclude_modules', None) + for name, per_layer_config in output['quantized_layers'].items(): + per_layer_config = per_layer_config.to_dict() + per_layer_config.pop('exclude_modules') + output['quantized_layers'][name] = per_layer_config + return output + + class PretrainedConfig: def __init__(self, @@ -269,6 +376,8 @@ def __init__(self, @property def 
kv_dtype(self): + # TODO: need to align the kv dtype + # now assume the kv cache is for all layers if self.quant_mode.has_int8_kv_cache(): return 'int8' elif self.quant_mode.has_fp8_kv_cache(): @@ -302,7 +411,17 @@ def to_dict(self): def from_json_file(cls, config_file: str): with open(config_file) as f: config = json.load(f) - return cls.from_dict(config) + obj = cls.from_dict(config) + if obj.quantization.quant_algo == QuantAlgo.MIXED_PRECISION: + try: + layer_config_path = str(config_file).replace( + 'config.json', 'quant_cfg.json') + obj.to_layer_quant_config(layer_config_path) + except Exception as e: + raise RuntimeError( + f"Encounter error '{e}' for read quantization config '{layer_config_path}'" + ) + return obj @classmethod def from_checkpoint(cls, ckpt_dir: str): @@ -312,10 +431,22 @@ def to_json_file(self, config_file: str): with open(config_file, 'w') as f: json.dump(self.to_dict(), f, indent=4) + def to_layer_quant_config(self, config_file: str): + with open(config_file) as f: + config = json.load(f) + self.quantization = LayerQuantConfig.from_dict(config) + @property def quant_mode(self): return self.quantization.quant_mode + @property + def quant_algo(self): + return self.quantization.quant_algo + + def get_quant_cfg(self, module_name: str): + return self.quantization.get_quant_cfg(module_name) + def set_rank(self, rank): self.mapping = Mapping(self.mapping.world_size, rank=rank, @@ -485,13 +616,14 @@ def from_checkpoint( assert os.path.isfile(weights_path) weights = safetensors.torch.load_file(weights_path) - is_checkpoint_pruned = getattr(config, 'is_pruned', False) if preprocess_weights_hook is not None: weights = preprocess_weights_hook(weights) - preprocess_weights(weights, config, from_pruned=is_checkpoint_pruned) + weights = preprocess_weights(weights, + config, + from_pruned=is_checkpoint_pruned) model = cls(config) model.load(weights, from_pruned=is_checkpoint_pruned) return model @@ -830,11 +962,6 @@ def fuse_gate_mlp( ) -> PretrainedModel: from ..quantization.quantize import fp8_quantize - quant_algo = model.config.quantization.quant_algo - if quant_algo != QuantAlgo.FP8 and quant_algo is not None: - logger.warning("fuse_gate_mlp cannot be done for this model. 
Skipping.") - return model - for name, mlp, layer in model.named_modules_with_parent(): if isinstance(mlp, GatedMLP): init_params = get_init_params(mlp) @@ -849,9 +976,18 @@ def fuse_gate_mlp( init_params["inner_layernorm"] = mlp.inner_layernorm is not None fused_layer = FusedGatedMLP(**init_params) - if quant_algo == QuantAlgo.FP8: - fused_layer = fp8_quantize(fused_layer, - model.config.quantization) + fc_name = name + '.fc' + layer_quant_cfg = model.config.get_quant_cfg(fc_name) + layer_quant_algo = layer_quant_cfg.quant_algo + if layer_quant_algo != QuantAlgo.FP8 and layer_quant_algo is not None: + continue + + if isinstance(model.config.quantization.exclude_modules, list) \ + and fc_name in model.config.quantization.exclude_modules: + layer_quant_algo = None + + if layer_quant_algo == QuantAlgo.FP8: + fused_layer = fp8_quantize(fused_layer, layer_quant_cfg) if isinstance(mlp.dtype, str): dtype = str_dtype_to_torch(mlp.dtype) @@ -904,7 +1040,7 @@ def fuse_gate_mlp( mlp.gate.activation_scaling_factor.raw_value, mlp.fc.activation_scaling_factor.raw_value, ) - elif quant_algo is None: + elif layer_quant_algo is None: fused_layer.fused_fc.weight.value = np.concatenate( [ mlp.gate.weight.raw_value, @@ -917,7 +1053,7 @@ def fuse_gate_mlp( [mlp.gate.bias.raw_value, mlp.fc.bias.raw_value], axis=0) else: - raise ValueError(f'Unsupported quant algo: {quant_algo}') + raise ValueError(f'Unsupported quant algo: {layer_quant_algo}') fused_layer.proj = mlp.proj fused_layer.inner_layernorm = mlp.inner_layernorm @@ -963,9 +1099,10 @@ def unfuse_qkv_gemm(model: PretrainedModel) -> PretrainedModel: layer.tp_size * layer.num_attention_kv_heads * layer.attention_head_size, }) - q = quantize(q, model.config.quantization) - k = quantize(k, model.config.quantization) - v = quantize(v, model.config.quantization) + layer_quant_cfg = model.config.get_quant_cfg(name + '.qkv') + q = quantize(q, layer_quant_cfg) + k = quantize(k, layer_quant_cfg) + v = quantize(v, layer_quant_cfg) out_features = q.out_features + k.out_features + v.out_features if isinstance(layer.qkv, ( WeightOnlyQuantLinear, @@ -1143,7 +1280,8 @@ def share_embedding(model: PretrainedModel) -> PretrainedModel: def set_fp8_context_fhma(model: PretrainedModel) -> PretrainedModel: for name, layer in model.named_modules(): - if isinstance(layer, Attention): + if isinstance(layer, Attention) and hasattr( + layer.dense, 'activation_scaling_factor'): scale = [1.0] / layer.dense.activation_scaling_factor.raw_value layer.attention_output_orig_quant_scale = Parameter( value=scale.astype(np.float32)) @@ -1193,19 +1331,11 @@ def optimize_model( return model -def preprocess_weights(weights: Dict[str, torch.Tensor], - model_config: PretrainedConfig, - from_pruned=False) -> None: - """This function in-place modifies weights and model_config, making them compatible with each other. - - Note: Typically, it should be called before model creation and weight loading. 
For example, - preprocess_weights(weights, model_config) - model = XXXForCausalLM(model_config) - model.load(weights) - """ - quant_algo = model_config.quantization.quant_algo +def preprocess_perlayer_weights(weights, + model_config, + quant_algo, + from_pruned=False): exclude_modules = model_config.quantization.exclude_modules - # INT4_AWQ if quant_algo == QuantAlgo.W4A8_AWQ or quant_algo == QuantAlgo.W4A16_AWQ: preprocessor = torch.ops.trtllm.preprocess_weights_for_mixed_gemm @@ -1280,15 +1410,68 @@ def preprocess_weights(weights: Dict[str, torch.Tensor], exclude_modules=exclude_modules, plugin=True) - # Parallel block rowlinear should not have duplicate bias. - elif model_config.architecture == 'GPTJForCausalLM': - if model_config.mapping.tp_rank > 0: - for name, param in weights.items(): + +def preprocess_weights(weights: Dict[str, torch.Tensor], + model_config: PretrainedConfig, + from_pruned=False) -> None: + """This function in-place modifies weights and model_config, making them compatible with each other. + + Note: Typically, it should be called before model creation and weight loading. For example, + preprocess_weights(weights, model_config) + model = XXXForCausalLM(model_config) + model.load(weights) + """ + quant_config = model_config.quantization + quant_algo = quant_config.quant_algo + + pattern_info = ['fc', 'gate', 'proj', 'qkv', 'dense'] + + per_layer_weights = {} + + for name, param in weights.items(): + in_mode = False + for info in pattern_info: + pattern = rf'(.*?{info}.*?)' + pattern_match = re.match(pattern, name) + if pattern_match: + base_name = pattern_match.group(1) + if base_name not in per_layer_weights.keys(): + per_layer_weights[base_name] = {} + per_layer_weights[base_name][name] = param + in_mode = True + break + if not in_mode: + # [lm_head.weight, ln_f.weight, vocab_embedding.weight] + base_name = name.rsplit('.', 1)[0] + if base_name not in per_layer_weights.keys(): + per_layer_weights[base_name] = {} + per_layer_weights[base_name][name] = param + + new_weights = {} + for base_name, layer_weights in per_layer_weights.items(): + if quant_algo != QuantAlgo.MIXED_PRECISION: + layer_quant_algo = quant_algo + else: + if base_name not in quant_config.quantized_layers.keys(): + new_weights.update(layer_weights) + continue + layer_quant_algo = quant_config.quantized_layers[ + base_name].quant_algo + + preprocess_perlayer_weights(layer_weights, model_config, + layer_quant_algo, from_pruned) + new_weights.update(layer_weights) + + weights = new_weights + for name, param in weights.items(): + if model_config.architecture == 'GPTJForCausalLM': + if model_config.mapping.tp_rank > 0: if 'attention.dense.bias' in name or 'mlp.proj.bias' in name: weights[name] = torch.zeros_like(param) # For share_embedding_table check_share_embedding(weights, model_config) + return weights def check_share_embedding(weights: Dict[str, torch.Tensor], diff --git a/tensorrt_llm/models/redrafter/redrafter_helper.py b/tensorrt_llm/models/redrafter/redrafter_helper.py index 9604f40af..4cbd0b05c 100644 --- a/tensorrt_llm/models/redrafter/redrafter_helper.py +++ b/tensorrt_llm/models/redrafter/redrafter_helper.py @@ -1,3 +1,4 @@ +import warnings from typing import Tuple import numpy as np @@ -11,7 +12,7 @@ div, eq, exp, expand, expand_dims, floordiv, gather, gather_nd, index_select, int32_array, log_softmax, lt, max, maximum, masked_select, minimum, nonzero, not_op, op_and, rand, relu, scatter, select, shape, slice, - softmax, squeeze, stack, sum, topk, transpose, unsqueeze, view, where) + 
silu, softmax, squeeze, stack, sum, topk, transpose, unsqueeze, view, where) # isort: on from tensorrt_llm.layers import Embedding from tensorrt_llm.module import Module @@ -358,9 +359,133 @@ def _unflatten_decoding_dim(x: Tensor, num_beams: int) -> Tensor: return x -def _beam_search_candidates(x: Tensor, init_token: Tensor, embedding: Embedding, - drafter: Module, num_beams: int, beam_length: int, +def _beam_search_candidates(prompt_state: Tensor, init_token: Tensor, + embedding: Embedding, drafter: Module, + num_beams: int, beam_length: int, is_rnn: bool) -> Tuple[Tensor, Tensor]: + """ + This version of beam search matches with ReDrafter GitHub version as of 10/02/2024. + Link: https://github.com/apple/ml-recurrent-drafter/releases/tag/v1.1 + """ + + LOG_0 = -50000.0 + LOG_1 = 0.0 + + def maintain_logits(logits: Tensor) -> Tensor: + max_logits = max(logits, -1, keepdim=True) + max_logits = expand(max_logits, + shape(logits, cast_to_dtype=INT_DTYPE_STR)) + return logits - max_logits + + def warp_logits(logits: Tensor, + top_k: int = 50, + mask_value: float = LOG_0) -> Tensor: + top_k = minimum(top_k, shape(logits, + dim=-1, + cast_to_dtype=INT_DTYPE_STR)) + top_values, _ = topk(logits, k=top_k, dim=-1) # [bs, nb, top_k] + starts = concat([0, 0, top_k - 1]) + sizes = concat([shape(logits, 0), shape(logits, 1), 1]) + lt_mask = logits < slice(top_values, starts=starts, sizes=sizes) + logits = where(lt_mask, + constant_to_tensor_(mask_value, dtype=logits.dtype), + logits) + return logits + + def compute_logits(x: Tensor) -> Tensor: + """ + x: [bs, nb, 2*H] + """ + logits = drafter(x) # [bs, nb, 2*H] => [bs, nb, V] + logits = maintain_logits(logits) # [bs, nb, V] + logits = warp_logits(logits) # [bs, nb, V] + return logits + + assert prompt_state.ndim() == 2 + assert init_token.ndim() == 1 + assert beam_length > 1 + batch_size = shape(prompt_state, 0, INT_DTYPE_STR) + vocab_size = embedding.num_embeddings + dtype = prompt_state.dtype + + log_p_beam = expand( + unsqueeze( + constant( + numpy_array([LOG_1] + [LOG_0] * (num_beams - 1), + trt_dtype=dtype)), 0), # [1, nb] + concat([batch_size, num_beams])) # [bs, nb] + context = _add_decoding_dim(prompt_state, num_beams) # [bs, nb, H] + if init_token.ndim() == 1: + init_token = unsqueeze(init_token, -1) # [bs] => [bs, 1] + beams = _add_decoding_dim(init_token, num_beams) # [bs, nb, 1] + + last_tokens = squeeze(beams, -1) # [bs, nb] + state_shape = shape(context, cast_to_dtype=INT_DTYPE_STR) # [bs, nb, H] + state = expand(expand_dims(constant_to_tensor_(0.0, dtype=dtype), [0, 1]), + state_shape) # [bs, nb, H] + logits_token_in_beam = None + candidate_length = beam_length - 1 + for _ in range(candidate_length): + state = ( + silu(drafter.rnn_w(embedding(last_tokens)) + + drafter.rnn_u(state)) if is_rnn else embedding(last_tokens) + + state) # [bs, nb, H] + + logits_new_token = compute_logits(concat([context, state], + -1)) # [bs, nb, V] + log_p_new_token = log_softmax(logits_new_token, -1) # [bs, nb, V] + + log_p_beam_new_token = log_p_new_token + unsqueeze(log_p_beam, + 2) # [bs, nb, V] + + tokens_times_beams = view(log_p_beam_new_token, + concat([batch_size, num_beams * vocab_size + ])) # [bs, nb*V] + log_p_beam, topk_indices = topk(tokens_times_beams, k=num_beams, + dim=-1) # [bs, nb] + top_beam_indices = topk_indices // vocab_size # [bs, nb] + # Avoid repeated division for: top_token_ids = topk_indices % vocab_size + top_token_ids = topk_indices - (top_beam_indices * vocab_size + ) # [bs, nb] + + # get the common indices to gather beams + 
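# A small PyTorch sketch of the flattened top-k step above: a single top-k over
# num_beams * vocab_size scores selects the best (beam, token) pairs, and the
# source beam and token id are then recovered with integer arithmetic. Shapes
# are illustrative only.
import torch

batch_size, num_beams, vocab_size = 2, 4, 10
log_p_beam_new_token = torch.randn(batch_size, num_beams, vocab_size)

flat = log_p_beam_new_token.view(batch_size, num_beams * vocab_size)
log_p_beam, topk_indices = flat.topk(num_beams, dim=-1)          # [bs, nb]
top_beam_indices = topk_indices // vocab_size                    # beam each winner came from
top_token_ids = topk_indices - top_beam_indices * vocab_size     # same as topk_indices % vocab_size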
gather_indices = _get_indices_for_gather_beams(batch_size, + top_beam_indices, + num_beams) + + # update running beams, state, logits, and last_tokens + prev_top_beams = _gather_beams(beams, gather_indices, batch_size, + num_beams) # [bs, nb] OR [bs, nb, 1+i] + if prev_top_beams.ndim() == 2: + prev_top_beams = unsqueeze(prev_top_beams, -1) # [bs, nb, 1] + new_tokens = unsqueeze(top_token_ids, -1) # [bs, nb, 1] + beams = concat([prev_top_beams, new_tokens], dim=-1) # [bs, nb, 1+i+1] + + state = _gather_beams(state, gather_indices, batch_size, + num_beams) # [bs, nb, H] + + cur_logits_token_in_beam = unsqueeze( + _gather_beams(logits_new_token, gather_indices, batch_size, + num_beams), 2) # [bs, nb, 1, V] + if logits_token_in_beam is None: # first iteration + logits_token_in_beam = cur_logits_token_in_beam + else: + logits_token_in_beam = concat( + [ + _gather_beams(logits_token_in_beam, gather_indices, + batch_size, + num_beams), # prev_top_logits [bs, nb, i, V] + cur_logits_token_in_beam + ], + dim=2) # [bs, nb, i+1, V] + last_tokens = top_token_ids # [bs, nb] + return beams, logits_token_in_beam + + +def _beam_search_candidates_v0(x: Tensor, init_token: Tensor, + embedding: Embedding, drafter: Module, + num_beams: int, beam_length: int, + is_rnn: bool) -> Tuple[Tensor, Tensor]: ''' x: [bs, H] init_token: [bs] @@ -372,6 +497,9 @@ def _beam_search_candidates(x: Tensor, init_token: Tensor, embedding: Embedding, draft_probs: (batch, num_beams, beam_length - 1, vocab_size) Probabilities for the draft_tokens. ''' + warnings.warn( + "This version of beam search is deprecated and will be removed in the future." + ) NEG_INF = -50000.0 batch_size = shape(x, 0, INT_DTYPE_STR) vocab_size = embedding.num_embeddings @@ -408,7 +536,7 @@ def _beam_search_candidates(x: Tensor, init_token: Tensor, embedding: Embedding, h)) # [bs, nb, 2H] => [bs*nb, 2H] => [bs*nb, V] new_flat_log_probs = log_softmax(new_flat_logits, dim=-1) # [bs*nb, V] - # compute probabilties and flatten the beams for topk + # compute probabilities and flatten the beams for topk candidate_log_probs = _unflatten_decoding_dim( new_flat_log_probs, num_beams) # [bs*nb, V] => [bs, nb, V] log_probs = candidate_log_probs + unsqueeze(scores, 2) # [bs, nb, V] diff --git a/tensorrt_llm/parameter.py b/tensorrt_llm/parameter.py index f405387a0..47f719eff 100644 --- a/tensorrt_llm/parameter.py +++ b/tensorrt_llm/parameter.py @@ -86,11 +86,13 @@ def _create_managed_tensor(self, network, need_transpose=False) -> Tensor: if self._value is None or (isinstance(self._value, np.ndarray) and not self._value.flags['C_CONTIGUOUS']): + value_old = self._value + self._value = np.empty(self._shape, trt_dtype_to_np(self._dtype)) network._register_unfilled_weights( # use updated self._shape here name, - np.empty(self._shape, trt_dtype_to_np(self._dtype)), - self._value) + self._value, + value_old) return Tensor(name=name, dtype=self._dtype, shape=shape) def get_managed_tensor(self, diff --git a/tensorrt_llm/plugin/plugin.py b/tensorrt_llm/plugin/plugin.py index 6b0a86814..84441597c 100644 --- a/tensorrt_llm/plugin/plugin.py +++ b/tensorrt_llm/plugin/plugin.py @@ -368,7 +368,7 @@ class CustomAllReduceHelper: - Set custom_all_reduce_helper.workspace with the required tensor. Then, each instance of allreduce will reference that tensor automatically. 
""" - POINTERS_PER_RANK = 4 + POINTERS_PER_RANK = 7 def __init__(self) -> None: self.workspace: Optional[Tensor] = None @@ -377,7 +377,7 @@ def set_workspace_tensor(self, mapping: Mapping, num_profiles: Optional[int] = None): from ..functional import Tensor - workspace_size = self.POINTERS_PER_RANK * mapping.tp_size + 1 + workspace_size = self.POINTERS_PER_RANK * mapping.tp_size + 2 dim_range = None if num_profiles is not None: @@ -412,16 +412,23 @@ def allocate_workspace(mapping: Mapping, ipc_barriers_out = IpcMemory( mapping, IpcMemory.IPC_BARRIERS_SIZE_PER_GPU * mapping.tp_size * 2, is_p2p_supported) + lamport_buffers_0 = IpcMemory(mapping, size * mapping.tp_size, + is_p2p_supported) + lamport_buffers_1 = IpcMemory(mapping, size * mapping.tp_size, + is_p2p_supported) + lamport_buffers_2 = IpcMemory(mapping, size * mapping.tp_size, + is_p2p_supported) buffers = [ - ipc_buffers_ping, - ipc_buffers_pong, - ipc_barriers_in, - ipc_barriers_out, + ipc_buffers_ping, ipc_buffers_pong, ipc_barriers_in, + ipc_barriers_out, lamport_buffers_0, lamport_buffers_1, + lamport_buffers_2 ] return buffers, torch.tensor( ipc_buffers_ping.serialize() + ipc_buffers_pong.serialize() + - ipc_barriers_in.serialize() + ipc_barriers_out.serialize() + [0], + ipc_barriers_in.serialize() + ipc_barriers_out.serialize() + + lamport_buffers_0.serialize() + lamport_buffers_1.serialize() + + lamport_buffers_2.serialize() + [0] + [0], dtype=torch.int64, device="cpu") diff --git a/tensorrt_llm/quantization/mode.py b/tensorrt_llm/quantization/mode.py index 04ffe2fe7..0ececc424 100644 --- a/tensorrt_llm/quantization/mode.py +++ b/tensorrt_llm/quantization/mode.py @@ -34,6 +34,8 @@ class QuantAlgo(StrEnum, metaclass=BaseEnumMeta): FP8 = auto() FP8_PER_CHANNEL_PER_TOKEN = auto() INT8 = auto() + MIXED_PRECISION = auto() + NO_QUANT = auto() QUANT_ALGO_LIST = list(set(QuantAlgo) - {QuantAlgo.INT8}) @@ -82,6 +84,9 @@ class QuantMode(IntFlag): # The mask of all valid flags. VALID_FLAGS = COUNT - 1 + def __deepcopy__(self, memo): + return self + # All the bits set? You can restrict the test to the bits indicated by "mask". 
def _all(self, bits, mask=VALID_FLAGS): return (self & mask) == bits @@ -138,6 +143,9 @@ def has_fp8_qdq(self): def has_fp8_rowwise(self): return self._any(self.FP8_ROWWISE) + def has_weight_quant(self): + return self._any(self.INT4_WEIGHTS | self.INT8_WEIGHTS) + def has_any_quant(self): return self._any(self.INT4_WEIGHTS | self.INT8_WEIGHTS | self.ACTIVATIONS @@ -241,7 +249,7 @@ def use_weight_only(use_int4_weights=False, per_group=False): @staticmethod def from_quant_algo( - quant_algo: Optional[QuantAlgo], + quant_algo: Optional[QuantAlgo] = None, kv_cache_quant_algo: Optional[QuantAlgo] = None, ) -> "QuantMode": assert quant_algo is None or quant_algo in QUANT_ALGO_LIST diff --git a/tensorrt_llm/quantization/quantize.py b/tensorrt_llm/quantization/quantize.py index 25b4bdf6c..92cc7bac8 100644 --- a/tensorrt_llm/quantization/quantize.py +++ b/tensorrt_llm/quantization/quantize.py @@ -1,10 +1,11 @@ import fnmatch +from typing import Union from .._utils import get_init_params from ..layers import (MLP, Attention, ColumnLinear, Embedding, GatedMLP, LayerNorm, RmsNorm, RowLinear) from ..layers.moe import MixtureOfExperts -from ..models.modeling_utils import QuantConfig +from ..models.modeling_utils import LayerQuantConfig, QuantConfig from ..parameter import Parameter from .layers import (FP8Linear, FP8RowLinear, Fp8RowwiseGatedMLP, Fp8RowwiseMLP, Fp8RowwiseRmsNorm, Int8SmoothQuantLinear, @@ -79,9 +80,14 @@ def quantize_layers( return model -def weight_only_quantize(model, quant_config: QuantConfig): +def weight_only_quantize(model, quant_config: QuantConfig, model_config=None): assert quant_config.quant_mode.is_weight_only() + try: + model_cfg = model.config + except Exception: + model_cfg = model_config + quant_map = { ColumnLinear: WeightOnlyQuantColumnLinear, RowLinear: WeightOnlyQuantRowLinear, @@ -93,7 +99,7 @@ def preprocess_init_params(init_params, name, module): if isinstance(module, ColumnLinear): module_name = name.rsplit('.', 1)[-1] init_params["transb"] = module_name == "lm_head" - init_params["tp_rank"] = model.config.mapping.tp_rank + init_params["tp_rank"] = model_cfg.mapping.tp_rank model = quantize_layers( model, @@ -104,9 +110,16 @@ def preprocess_init_params(init_params, name, module): return model -def weight_only_groupwise_quantize(model, quant_config: QuantConfig): +def weight_only_groupwise_quantize(model, + quant_config: QuantConfig, + model_config=None): assert quant_config.quant_mode.is_weight_only() + try: + model_cfg = model.config + except Exception: + model_cfg = model_config + quant_map = { ColumnLinear: WeightOnlyGroupwiseQuantColumnLinear, RowLinear: WeightOnlyGroupwiseQuantRowLinear, @@ -118,7 +131,7 @@ def preprocess_init_params(init_params, name, module): init_params["zero"] = quant_config.has_zero_point init_params[ "use_w4a8_awq"] = quant_config.quant_algo == QuantAlgo.W4A8_AWQ - init_params["tp_rank"] = model.config.mapping.tp_rank + init_params["tp_rank"] = model_cfg.mapping.tp_rank model = quantize_layers( model, @@ -207,9 +220,14 @@ def fp8_quantize(model, quant_config: QuantConfig): return model -def fp8_rowwise_quantize(model, quant_config: QuantConfig): +def fp8_rowwise_quantize(model, quant_config: QuantConfig, model_config=None): assert quant_config.quant_mode.has_fp8_rowwise() + try: + model_cfg = model.config + except Exception: + model_cfg = model_config + quant_map = { RmsNorm: Fp8RowwiseRmsNorm, GatedMLP: Fp8RowwiseGatedMLP, @@ -230,8 +248,8 @@ def extract_layer_idx(name): continue # Meta's Fp8 recipe - mapping = model.config.mapping - 
layers_range = mapping.pp_layers(model.config.num_hidden_layers) + mapping = model_cfg.mapping + layers_range = mapping.pp_layers(model_cfg.num_hidden_layers) is_first_layer = mapping.is_first_pp_rank() and layer_idx == 0 is_last_layer = mapping.is_last_pp_rank( ) and layer_idx == len(layers_range) - 1 @@ -259,30 +277,54 @@ def extract_layer_idx(name): return model -def kv_cache_quantize(model, quant_config: QuantConfig): - assert quant_config.quant_mode.has_kv_cache_quant() +# Now consider the kv cache is enabled for all layers +def kv_cache_quantize(model): for name, module in model.named_modules(): if isinstance(module, (Attention, SmoothQuantAttention)): module.kv_cache_scaling_factor = Parameter(shape=(1, ), dtype='float32') + return model -def quantize(model, quant_config: QuantConfig): - quant_mode = quant_config.quant_mode +def quantize(model, quant_config: Union[QuantConfig, LayerQuantConfig]): + quant_mode = quant_config.layer_quant_mode - if quant_mode.has_fp8_qdq(): - model = fp8_quantize(model, quant_config) - elif quant_mode.has_fp8_rowwise(): - model = fp8_rowwise_quantize(model, quant_config) - elif quant_mode.has_act_and_weight_quant(): - model = smooth_quantize(model, quant_config) - elif quant_mode.is_weight_only(): - if quant_mode.has_per_group_scaling(): - model = weight_only_groupwise_quantize(model, quant_config) + for name, module, parent in model.named_modules_with_parent(): + if quant_config.quant_algo == QuantAlgo.MIXED_PRECISION: + if name in quant_mode.keys(): + layer_quant_mode = quant_mode[name] + else: + continue else: - model = weight_only_quantize(model, quant_config) + layer_quant_mode = quant_mode + if layer_quant_mode == QuantMode(0): + continue + + layer_quant_cfg = quant_config.get_quant_cfg(name) + + if layer_quant_mode.has_fp8_qdq(): + module = fp8_quantize(module, layer_quant_cfg) + elif layer_quant_mode.has_fp8_rowwise(): + module = fp8_rowwise_quantize(module, layer_quant_cfg, model.config) + elif layer_quant_mode.has_act_and_weight_quant(): + module = smooth_quantize(module, layer_quant_cfg) + elif layer_quant_mode.is_weight_only(): + if layer_quant_mode.has_per_group_scaling(): + module = weight_only_groupwise_quantize(module, layer_quant_cfg, + model.config) + else: + module = weight_only_quantize(module, layer_quant_cfg, + model.config) - if quant_mode.has_kv_cache_quant(): - model = kv_cache_quantize(model, quant_config) + if parent is not None: # for per layer + module_name = name.rsplit('.', 1)[-1] + setattr(parent, module_name, module) + else: # for all layer + model = module + break + + if quant_config.quant_mode.has_kv_cache_quant(): + model = kv_cache_quantize(model) + setattr(model, 'quant_mode', quant_config.quant_mode) return model diff --git a/tensorrt_llm/quantization/quantize_by_modelopt.py b/tensorrt_llm/quantization/quantize_by_modelopt.py index 769f0e0f0..8c6869eae 100644 --- a/tensorrt_llm/quantization/quantize_by_modelopt.py +++ b/tensorrt_llm/quantization/quantize_by_modelopt.py @@ -130,6 +130,22 @@ def quant_cfg_choices(): } +class _CustomDataset(torch.utils.data.Dataset): + + def __init__(self, encodings): + self.encodings = encodings + + def __getitem__(self, idx): + item = { + key: torch.tensor(val[idx]) + for key, val in self.encodings.items() + } + return item + + def __len__(self): + return len(self.encodings["input_ids"]) + + def get_tokenizer(ckpt_path, max_seq_length=2048, model_type=None): logger.info(f"Initializing tokenizer from {ckpt_path}") tokenizer = AutoTokenizer.from_pretrained( @@ -218,7 +234,9 @@ def 
get_calib_dataloader(dataset_name_or_dir="cnn_dailymail", tokenizer=None, batch_size=1, calib_size=512, - block_size=512): + block_size=512, + device=None, + include_labels=False): logger.info("Loading calibration dataset") if dataset_name_or_dir == "pileval": dataset = load_dataset( @@ -227,7 +245,11 @@ def get_calib_dataloader(dataset_name_or_dir="cnn_dailymail", split="train") dataset = dataset["text"][:calib_size] elif "cnn_dailymail" in dataset_name_or_dir: - dataset = load_dataset(dataset_name_or_dir, name="3.0.0", split="train") + dataset = load_dataset( + dataset_name_or_dir, + name="3.0.0", + split="train", + ) dataset = dataset["article"][:calib_size] elif os.path.isdir(dataset_name_or_dir): logger.info( @@ -246,7 +268,23 @@ def get_calib_dataloader(dataset_name_or_dir="cnn_dailymail", padding=True, truncation=True, max_length=block_size) - batch_encoded = batch_encoded["input_ids"] + + if device: + batch_encoded = batch_encoded.to(device) + + if include_labels: + # Labels are needed when backward is called in the model. + # The labels should be a shifted version of the input_ids. + # However, we should not shift the input_ids here since the labels are shifted by + # Huggingface models during loss calculation as shown here - + # https://github.com/huggingface/transformers/blob/7f79a97399bb52aad8460e1da2f36577d5dccfed/src/transformers/models/llama/modeling_llama.py#L1093-L1095 + batch_encoded["labels"] = torch.where( + batch_encoded["attention_mask"] > 0.5, batch_encoded["input_ids"], + -100) + batch_encoded = _CustomDataset(batch_encoded) + else: + # For backward compatibility, if labels are not needed, we only return input_ids. + batch_encoded = batch_encoded["input_ids"] calib_dataloader = DataLoader(batch_encoded, batch_size=batch_size, @@ -255,7 +293,8 @@ def get_calib_dataloader(dataset_name_or_dir="cnn_dailymail", return calib_dataloader -def quantize_model(model, quant_cfg, calib_dataloader=None): +def quantize_model(model, quant_cfg, calib_dataloader, batch_size, qformat, + weight_compression): import modelopt.torch.quantization as atq def calibrate_loop(): @@ -268,14 +307,40 @@ def calibrate_loop(): data = data.to(model.device) model(data) + QUANT_CFG_CHOICES = { + "int8": "INT8_DEFAULT_CFG", + "int8_sq": "INT8_SMOOTHQUANT_CFG", + "fp8": "FP8_DEFAULT_CFG", + "int4_awq": "INT4_AWQ_CFG", + "w4a8_awq": "W4A8_AWQ_BETA_CFG", + } + logger.info("Starting quantization...") start_time = time.time() - atq.quantize(model, quant_cfg, forward_loop=calibrate_loop) + if weight_compression: + logger.info("Starting mixed precision quantization...") + model, search_history = atq.auto_quantize( + model, + data_loader=calib_dataloader, + loss_func=lambda output, batch: output.loss, + constraints={"weight_compression": weight_compression}, + quantization_formats=[ + QUANT_CFG_CHOICES[item] for item in qformat.split(",") + ] + [None], + collect_func=lambda x: x, + num_calib_steps=len(calib_dataloader), + num_score_steps=min( + len(calib_dataloader), 128 // batch_size + ), # Limit the number of score steps to avoid long calibration time + verbose=True, + ) + atq.print_quant_summary(model) + else: + atq.quantize(model, quant_cfg, forward_loop=calibrate_loop) end_time = time.time() logger.info( "Quantization done. 
Total time used: {:.2f} s.".format(end_time - start_time)) - return model @@ -367,7 +432,8 @@ def quantize_and_export(*, max_draft_len=None, medusa_hidden_act=None, medusa_model_dir=None, - quant_medusa_head=None): + quant_medusa_head=None, + weight_compression=None): ''' Load model from the model_dir, call Modelopt to quantize the model, and then export the quantized model as TRT-LLM checkpoint @@ -403,7 +469,7 @@ def quantize_and_export(*, ] and kv_cache_dtype is None: logger.info(f"No quantization applied, export {dtype} model") else: - if "awq" in qformat: + if any("awq" in item for item in qformat.split(",")): if calib_size > 32: logger.info( f"AWQ calibration could take longer with calib_size = {calib_size}, Using" @@ -415,34 +481,53 @@ def quantize_and_export(*, " set by adding the argument --batch_size to the command line.\n" ) + # Check if qformat provided is supported. qformat is list of one element for non auto_quant case. + if all(item in quant_cfg_choices() for item in qformat.split(",")): + quant_cfg = quant_cfg_choices()[qformat.split(",")[0]] + else: + raise ValueError(f"Unsupported quantization format: {qformat}") + + # Auto quantize does not use quant_cfg + if not weight_compression and "awq" in qformat: + quant_cfg = copy.deepcopy(quant_cfg_choices()[qformat]) + weight_quantizer = quant_cfg["quant_cfg"]["*weight_quantizer"] + if isinstance(weight_quantizer, list): + weight_quantizer = weight_quantizer[0] + weight_quantizer["block_sizes"][-1] = awq_block_size + + # Coarser optimal scale search seems to resolve the overflow in TRT-LLM for some models + if "w4a8_awq" == qformat and model_type in ["gemma", "mpt"]: + quant_cfg["algorithm"] = {"method": "awq_lite", "alpha_step": 1} + calib_dataloader = get_calib_dataloader( dataset_name_or_dir=calib_dataset, tokenizer=tokenizer, batch_size=batch_size, calib_size=calib_size, block_size=calib_max_seq_length, + device=torch.device("cuda") if weight_compression else None, + include_labels=weight_compression is not None, ) - if qformat in quant_cfg_choices(): - quant_cfg = quant_cfg_choices()[qformat] - else: - raise ValueError(f"Unsupported quantization format: {qformat}") - - if "awq" in qformat: - quant_cfg = copy.deepcopy(quant_cfg_choices()[qformat]) - weight_quantizer = quant_cfg["quant_cfg"][ - "*weight_quantizer"] # type: ignore - if isinstance(weight_quantizer, list): - weight_quantizer = weight_quantizer[0] - weight_quantizer["block_sizes"][-1] = awq_block_size + # Always turn on FP8 kv cache to save memory footprint. + # For int8_sq, we do not quantize kv cache to preserve accuracy. + # We turn off FP8 kv cache for unified_hf checkpoint + enable_quant_kv_cache = "int8" not in qformat + print( + f'{"Enable" if enable_quant_kv_cache else "Disable"} KV cache quantization' + ) + quant_cfg["quant_cfg"]["*output_quantizer"] = { + "num_bits": 8 if qformat == "int8_sq" else (4, 3), + "axis": None, + "enable": enable_quant_kv_cache, + } - if kv_cache_dtype is not None: - if kv_cache_dtype == "fp8": - for value in KV_CACHE_CFG.values(): - value.update({"num_bits": (4, 3)}) # type: ignore - quant_cfg["quant_cfg"].update(KV_CACHE_CFG) # type: ignore + # Gemma 7B has accuracy regression using alpha 1. We set 0.5 instead. 
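# A rough sketch of how a comma-separated qformat string can be validated and
# mapped to ModelOpt config names for the auto-quantize path; the mapping
# mirrors QUANT_CFG_CHOICES above, and the helper name is illustrative.
QFORMAT_TO_CFG = {
    "int8": "INT8_DEFAULT_CFG",
    "int8_sq": "INT8_SMOOTHQUANT_CFG",
    "fp8": "FP8_DEFAULT_CFG",
    "int4_awq": "INT4_AWQ_CFG",
    "w4a8_awq": "W4A8_AWQ_BETA_CFG",
}

def resolve_qformats(qformat: str) -> list:
    formats = qformat.split(",")
    unsupported = [f for f in formats if f not in QFORMAT_TO_CFG]
    if unsupported:
        raise ValueError(f"Unsupported quantization format(s): {unsupported}")
    return [QFORMAT_TO_CFG[f] for f in formats]

# e.g. auto-quantize over FP8 and INT4-AWQ candidates:
assert resolve_qformats("fp8,int4_awq") == ["FP8_DEFAULT_CFG", "INT4_AWQ_CFG"]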
+ if model_type == "gemma" and "int8_sq" in qformat.split(","): + quant_cfg["algorithm"] = {"method": "smoothquant", "alpha": 0.5} - model = quantize_model(model, quant_cfg, calib_dataloader) + model = quantize_model(model, quant_cfg, calib_dataloader, batch_size, + qformat, weight_compression) with torch.inference_mode(): if model_type is None: @@ -454,12 +539,34 @@ def quantize_and_export(*, export_path = output_dir start_time = time.time() - export_tensorrt_llm_checkpoint(model, - model_type, - getattr(torch, dtype), - export_dir=export_path, - inference_tensor_parallel=tp_size, - inference_pipeline_parallel=pp_size) + QUANT_ALGO = { + "int8": "INT8", + "int8_sq": "W8A8_SQ_PER_CHANNEL", + "fp8": "FP8", + "int4_awq": "W4A16_AWQ", + "w4a8_awq": "W4A8_AWQ", + } + + # workaround for old API version + if weight_compression: + export_tensorrt_llm_checkpoint( + model, + model_type, + getattr(torch, dtype), + export_dir=export_path, + inference_tensor_parallel=tp_size, + inference_pipeline_parallel=pp_size, + auto_quant=weight_compression is not None, + ) + else: + export_tensorrt_llm_checkpoint( + model, + model_type, + getattr(torch, dtype), + export_dir=export_path, + inference_tensor_parallel=tp_size, + inference_pipeline_parallel=pp_size, + ) with open(f"{export_path}/config.json", "r") as f: tensorrt_llm_config = json.load(f) diff --git a/tensorrt_llm/runtime/model_runner_cpp.py b/tensorrt_llm/runtime/model_runner_cpp.py index daa5c608d..70d848d7f 100644 --- a/tensorrt_llm/runtime/model_runner_cpp.py +++ b/tensorrt_llm/runtime/model_runner_cpp.py @@ -15,7 +15,7 @@ import copy from pathlib import Path -from typing import List, Optional, Union +from typing import Dict, List, Optional, Union import torch @@ -24,6 +24,7 @@ from ..bindings import (DataType, GptJsonConfig, KVCacheType, ModelConfig, WorldConfig) from ..bindings import executor as trtllm +from ..bindings.executor import ExternalDraftTokensConfig, ParallelConfig from ..builder import EngineConfig from ..logger import logger from ..mapping import Mapping @@ -74,30 +75,34 @@ def __init__(self, self.lora_manager = lora_manager @classmethod - def from_dir(cls, - engine_dir: str, - *, - lora_dir: Optional[str] = None, - rank: int = 0, - max_batch_size: Optional[int] = None, - max_input_len: Optional[int] = None, - max_output_len: Optional[int] = None, - max_beam_width: Optional[int] = None, - max_attention_window_size: Optional[list[int]] = None, - sink_token_length: Optional[int] = None, - kv_cache_free_gpu_memory_fraction: Optional[float] = None, - medusa_choices: list[list[int]] | None = None, - lookahead_config: list[int] | None = None, - debug_mode: bool = False, - lora_ckpt_source: str = "hf", - gpu_weights_percent: float = 1, - max_tokens_in_paged_kv_cache: int | None = None, - kv_cache_enable_block_reuse: bool = False, - enable_chunked_context: bool = False, - is_enc_dec: bool = False, - multi_block_mode: bool = True, - enable_context_fmha_fp32_acc: Optional[bool] = None, - cuda_graph_mode: Optional[bool] = None) -> 'ModelRunnerCpp': + def from_dir( + cls, + engine_dir: str, + *, + lora_dir: Optional[str] = None, + rank: int = 0, + max_batch_size: Optional[int] = None, + max_input_len: Optional[int] = None, + max_output_len: Optional[int] = None, + max_beam_width: Optional[int] = None, + max_attention_window_size: Optional[list[int]] = None, + sink_token_length: Optional[int] = None, + kv_cache_free_gpu_memory_fraction: Optional[float] = None, + medusa_choices: list[list[int]] | None = None, + lookahead_config: list[int] | None 
= None, + debug_mode: bool = False, + lora_ckpt_source: str = "hf", + gpu_weights_percent: float = 1, + max_tokens_in_paged_kv_cache: int | None = None, + kv_cache_enable_block_reuse: bool = False, + enable_chunked_context: bool = False, + is_enc_dec: bool = False, + multi_block_mode: bool = True, + enable_context_fmha_fp32_acc: Optional[bool] = None, + cuda_graph_mode: Optional[bool] = None, + logits_processor_map: Optional[Dict[str, LogitsProcessor]] = None, + device_ids: List[int] | None = None, + ) -> 'ModelRunnerCpp': """ Create a ModelRunnerCpp instance from an engine directory. @@ -150,6 +155,11 @@ def from_dir(cls, Enable FMHA runner FP32 accumulation. cuda_graph_mode (bool): Whether to use cuda graph for inference. + logits_processor_map (Dict[str, LogitsProcessor]) + A map of logits processor functions indexed by names. A name can be provided later to + the generate() function to specify which logits processor to run. + device_ids (List[int]): + Device indices to run the Executor on. Returns: ModelRunnerCpp: An instance of ModelRunnerCpp. """ @@ -336,6 +346,16 @@ def from_dir(cls, gpu_weights_percent=gpu_weights_percent) trtllm_config.enable_chunked_context = enable_chunked_context trtllm_config.extended_runtime_perf_knob_config = extended_runtime_perf_knob_config + trtllm_config.parallel_config = ParallelConfig( + trtllm.CommunicationType.MPI, + trtllm.CommunicationMode.LEADER, + device_ids=device_ids, + orchestrator_config=None) + + logits_proc_config = trtllm.LogitsPostProcessorConfig() + if logits_processor_map is not None: + logits_proc_config.processor_map = logits_processor_map + trtllm_config.logits_post_processor_config = logits_proc_config executor = trtllm.Executor(engine_dir, trtllm.ModelType.DECODER_ONLY, trtllm_config) @@ -437,7 +457,7 @@ def generate( lookahead_config: list[int] | None = None, streaming: bool = False, stopping_criteria: Optional[StoppingCriteria] = None, - logits_processor: Optional[LogitsProcessor] = None, + logits_processor_names: list[str] | None = None, max_new_tokens: int = 1, num_return_sequences: int = 1, end_id: int | None = None, @@ -485,8 +505,8 @@ def generate( Whether or not to use streaming mode for generation. stopping_criteria (StoppingCriteria): Custom stopping criteria. - logits_processor (LogitsProcessor): - Custom logits processors. + logits_processor_names (List[str]): + Custom logits processor names. 
return_all_generated_tokens (bool): Whether the full output is returned at each streaming step num_return_sequences (int): @@ -507,9 +527,6 @@ def generate( if stopping_criteria is not None: raise RuntimeError( "Stopping criteria is not supported in C++ session.") - if logits_processor is not None: - raise RuntimeError( - "Logits processor is not supported in C++ session.") if not self.use_kv_cache and max_new_tokens > 1: raise RuntimeError( @@ -567,6 +584,8 @@ def generate( len(batch_input_ids_list)) bad_words_list = self._prepare_words_list(bad_words_list, len(batch_input_ids_list)) + logits_processor_names = self._prepare_names_list( + logits_processor_names, len(batch_input_ids_list)) lora_configs = self._prepare_lora_configs(lora_uids, len(batch_input_ids_list)) @@ -575,6 +594,29 @@ def generate( [w, n, g] = lookahead_config request_lookahead_config = trtllm.LookaheadDecodingConfig(w, n, g) + # Draft-Target-Model speculative decoding + if "draft_tokens_list" in kwargs.keys() and kwargs[ + "draft_tokens_list"] is not None and "draft_logits_list" in kwargs.keys( + ) and kwargs["draft_logits_list"] is not None: + # Use logits to accept + external_draft_tokens_configs = [ + ExternalDraftTokensConfig(draft_tokens, draft_logits, 1.0e-8) + for draft_tokens, draft_logits in zip( + kwargs["draft_tokens_list"], kwargs["draft_logits_list"]) + ] + is_draft_target_model = True + elif "draft_tokens_list" in kwargs.keys( + ) and kwargs["draft_tokens_list"] is not None: + # Use tokens to accept + external_draft_tokens_configs = [ + ExternalDraftTokensConfig(draft_tokens) + for draft_tokens in kwargs["draft_tokens_list"] + ] + is_draft_target_model = True + else: + external_draft_tokens_configs = [None] * len(batch_input_ids_list) + is_draft_target_model = False + requests = [ trtllm.Request( input_token_ids=input_ids, @@ -598,11 +640,16 @@ def generate( output_config=output_config, prompt_tuning_config=prompt_tuning_config, lora_config=lora_config, - return_all_generated_tokens=return_all_generated_tokens) for i, + return_all_generated_tokens=return_all_generated_tokens, + logits_post_processor_name=logits_post_processor_name, + external_draft_tokens_config=external_draft_tokens_config, + ) for i, (input_ids, stop_words, bad_words, prompt_tuning_config, - lora_config) in enumerate( + lora_config, logits_post_processor_name, + external_draft_tokens_config) in enumerate( zip(batch_input_ids_list, stop_words_list, bad_words_list, - prompt_tuning_configs, lora_configs)) + prompt_tuning_configs, lora_configs, + logits_processor_names, external_draft_tokens_configs)) ] request_ids = self.session.enqueue_requests(requests) @@ -610,14 +657,15 @@ def generate( return self._initialize_and_fill_output( request_ids, end_id, return_dict, output_sequence_lengths, output_log_probs, output_cum_log_probs, batch_input_ids, - streaming, max_new_tokens, num_return_sequences) + streaming, max_new_tokens, num_return_sequences, + is_draft_target_model) else: return self._stream(request_ids, end_id, return_dict, output_sequence_lengths, output_log_probs, output_cum_log_probs, batch_input_ids, batch_input_ids_list, streaming, return_all_generated_tokens, max_new_tokens, - num_return_sequences) + num_return_sequences, is_draft_target_model) def _prepare_words_list(self, words_list: List[List[List[int]]], batch_size: int): @@ -625,6 +673,11 @@ def _prepare_words_list(self, words_list: List[List[List[int]]], return [None] * batch_size return words_list + def _prepare_names_list(self, names_list: List[str], batch_size: int): + 
if names_list is None: + return [None] * batch_size + return names_list + def _prepare_ptuning_executor(self, batch_input_ids_list, prompt_table, prompt_tasks, input_token_extra_ids): if input_token_extra_ids: @@ -666,17 +719,20 @@ def _prepare_lora_configs(self, lora_uids, batch_size): if int(uid) >= 0 else None for uid in lora_uids ] - def _initialize_and_fill_output(self, - request_ids, - end_id, - return_dict, - output_sequence_lengths, - output_log_probs, - output_cum_log_probs, - batch_input_ids, - streaming, - max_new_tokens: int, - num_return_sequences: int = 1): + def _initialize_and_fill_output( + self, + request_ids, + end_id, + return_dict, + output_sequence_lengths, + output_log_probs, + output_cum_log_probs, + batch_input_ids, + streaming, + max_new_tokens: int, + num_return_sequences: int = 1, + is_draft_target_model: bool = False, + ): output_ids = [[[] for _ in range(self.max_beam_width)] for _ in range(len(request_ids) * num_return_sequences)] @@ -689,21 +745,24 @@ def _initialize_and_fill_output(self, output_sequence_lengths, output_log_probs, output_cum_log_probs, batch_input_ids, [], streaming, request_ids, False, max_new_tokens, - num_return_sequences) - - def _stream(self, - request_ids, - end_id, - return_dict, - output_sequence_lengths, - output_log_probs, - output_cum_log_probs, - batch_input_ids, - batch_input_ids_list, - streaming, - return_all_generated_tokens, - max_new_tokens: int, - num_return_sequences: int = 1): + num_return_sequences, is_draft_target_model) + + def _stream( + self, + request_ids, + end_id, + return_dict, + output_sequence_lengths, + output_log_probs, + output_cum_log_probs, + batch_input_ids, + batch_input_ids_list, + streaming, + return_all_generated_tokens, + max_new_tokens: int, + num_return_sequences: int = 1, + is_draft_target_model: bool = False, + ): output_ids = [[] for _ in range(len(request_ids) * num_return_sequences)] @@ -726,14 +785,15 @@ def _stream(self, output_cum_log_probs, batch_input_ids, batch_input_ids_list, streaming, request_ids, return_all_generated_tokens, - max_new_tokens, num_return_sequences) + max_new_tokens, num_return_sequences, + is_draft_target_model) def _fill_output(self, responses, output_ids, end_id, return_dict, output_sequence_lengths, output_log_probs, output_cum_log_probs, batch_input_ids, batch_input_ids_list, streaming, request_ids, return_all_generated_tokens, max_new_tokens, - num_return_sequences): + num_return_sequences, is_draft_target_model): cuda_device = torch.device("cuda") # Total number of output sequences = batch_size * num_return_sequences. 
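A rough sketch, assuming a `runner` built from the target model's engine, of how a caller might drive the draft-target-model plumbing threaded through these helpers; the prompt and draft tokens are illustrative placeholders, not values from this patch.

import torch

prompt_ids = [2061, 318, 262]       # placeholder prompt token ids
draft_ids = [3288, 3303, 7587]      # placeholder tokens proposed by a separate draft model

outputs = runner.generate(
    batch_input_ids=[torch.tensor(prompt_ids, dtype=torch.int32)],
    max_new_tokens=8,
    return_dict=True,
    output_sequence_lengths=True,
    # Token-based acceptance; additionally passing draft_logits_list switches to logits-based acceptance.
    draft_tokens_list=[draft_ids],
)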
@@ -806,32 +866,39 @@ def req_idx(response: trtllm.Response): outputs['context_logits'] = context_logits if self.gather_generation_logits: - if not streaming: - gen_shape = (num_beams, max_new_tokens, vocab_size) - elif streaming and return_all_generated_tokens: - gen_shape = (max_new_tokens, num_beams, vocab_size) - else: # streaming and not return_all_generated_tokens - gen_shape = (1, num_beams, vocab_size) - gen_logits = None - for response in responses: - # gen logits shape: (beam, seq, vocab) - logits = response.result.generation_logits - if logits is None: - continue - num_beams, seq_len, vocab_size = logits.shape - if gen_logits is None: - gen_logits = torch.zeros( - (num_output_sequences, *gen_shape), - dtype=logits.dtype, - device=cuda_device) - batch_idx = request_ids.index(response.request_id) - seq_idx = response.result.sequence_index - reqid_pos = batch_idx * num_return_sequences + seq_idx - if streaming: - gen_logits[reqid_pos, :seq_len, ...] = logits[0] - else: - gen_logits[reqid_pos, :, :seq_len, ...] = logits[0] + if is_draft_target_model: + # Put the outputs in a list rather than a tensor since their + # length may vary among requests in a batch + gen_logits = [ + a.result.generation_logits.cuda() for a in responses + if a.result.generation_logits is not None + ] + else: + for response in responses: + # gen logits shape: (beam, seq, vocab) + logits = response.result.generation_logits + if logits is None: + continue + num_beams, seq_len, vocab_size = logits.shape + if not streaming: + gen_shape = (num_beams, max_new_tokens, vocab_size) + elif streaming and return_all_generated_tokens: + gen_shape = (max_new_tokens, num_beams, vocab_size) + else: # streaming and not return_all_generated_tokens + gen_shape = (1, num_beams, vocab_size) + if gen_logits is None: + gen_logits = torch.zeros( + (num_output_sequences, *gen_shape), + dtype=logits.dtype, + device=cuda_device) + batch_idx = request_ids.index(response.request_id) + seq_idx = response.result.sequence_index + reqid_pos = batch_idx * num_return_sequences + seq_idx + if streaming: + gen_logits[reqid_pos, :seq_len, ...] = logits[0] + else: + gen_logits[reqid_pos, :, :seq_len, ...] = logits[0] outputs['generation_logits'] = gen_logits if output_log_probs: diff --git a/tensorrt_llm/tools/multimodal_builder.py b/tensorrt_llm/tools/multimodal_builder.py index 5981441d5..d504e88c6 100644 --- a/tensorrt_llm/tools/multimodal_builder.py +++ b/tensorrt_llm/tools/multimodal_builder.py @@ -144,7 +144,8 @@ def build_trt_engine(model_type, config_args = { "precision": str(dtype).split('.')[-1], - "model_type": model_type + "model_type": model_type, + "strongly_typed": False } if num_frames is not None: config_args["num_frames"] = num_frames diff --git a/tensorrt_llm/version.py b/tensorrt_llm/version.py index ed0a116b5..789e5f612 100644 --- a/tensorrt_llm/version.py +++ b/tensorrt_llm/version.py @@ -12,4 +12,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
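Because the draft-target path returns logits of varying length per request, the generation logits in the output dict can be either a list or a packed tensor. A hedged sketch of how a consumer might branch on that, assuming the engine gathers generation logits and `outputs` comes from a return_dict=True call:

gen_logits = outputs["generation_logits"]
if isinstance(gen_logits, list):
    # Draft-target-model: one (beam, seq, vocab) tensor per request; lengths may differ.
    for i, logits in enumerate(gen_logits):
        print(f"request {i}: generation logits shape {tuple(logits.shape)}")
else:
    # Regular path: a single pre-allocated tensor covering all output sequences.
    print(f"packed generation logits shape {tuple(gen_logits.shape)}")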
-__version__ = "0.14.0.dev2024100100" +__version__ = "0.14.0.dev2024100800" diff --git a/tests/attention/test_gpt_attention.py b/tests/attention/test_gpt_attention.py index dc7758898..8abbedc0a 100644 --- a/tests/attention/test_gpt_attention.py +++ b/tests/attention/test_gpt_attention.py @@ -680,7 +680,6 @@ def _construct_execution( builder_config = builder.create_builder_config( name=attention_type, precision=dtype, - opt_level=0, int8=int8_trt_flag, quant_mode=quant_mode) diff --git a/tests/attention/test_gpt_attention_IFB.py b/tests/attention/test_gpt_attention_IFB.py index 4e6e67c12..af5b67414 100644 --- a/tests/attention/test_gpt_attention_IFB.py +++ b/tests/attention/test_gpt_attention_IFB.py @@ -480,7 +480,6 @@ def _construct_execution(session, builder_config = builder.create_builder_config( name=attention_type, precision=dtype, - opt_level=0, fp8=use_fp8_context_fmha, int8=int8_trt_flag) if session is None: diff --git a/tests/bindings/test_executor_bindings.py b/tests/bindings/test_executor_bindings.py index 315b55158..5d1310d47 100644 --- a/tests/bindings/test_executor_bindings.py +++ b/tests/bindings/test_executor_bindings.py @@ -478,13 +478,12 @@ def test_get_num_responses_ready(streaming: bool, @pytest.mark.parametrize("return_context_logits", [False, True]) @pytest.mark.parametrize("return_generation_logits", [False, True]) @skip_pre_ampere # ContextFMHAType with fp32 acc is not supported in pre-ampere architecture -def test_token_comparison(batching_type: trtllm.BatchingType, streaming: bool, - beam_width: int, compute_log_probs: bool, - exclude_input_from_output: bool, - return_context_logits: bool, - return_generation_logits: bool, model_files, - model_path, model_path_return_logits, input_data_path, - results_data_path, results_data_path_beam_width_2): +def test_token_comparison( + batching_type: trtllm.BatchingType, streaming: bool, beam_width: int, + compute_log_probs: bool, exclude_input_from_output: bool, + return_context_logits: bool, return_generation_logits: bool, + model_files, model_path, model_path_return_logits, input_data_path, + results_data_path_fmhafp32acc, results_data_path_beam_width_2): if streaming and beam_width > 1: pytest.skip("Test does not support streaming with beam search") @@ -597,7 +596,7 @@ def verify_output(beam_tokens, test_data, given_input_lengths): executor_config) # Load test data - results_path = results_data_path if beam_width == 1 else results_data_path_beam_width_2 + results_path = results_data_path_fmhafp32acc if beam_width == 1 else results_data_path_beam_width_2 given_input, given_input_lengths, max_input_length, test_data = load_test_data( input_data_path, results_path) diff --git a/tests/conftest.py b/tests/conftest.py index b1c967564..1afbe2f35 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -13,94 +13,32 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# # Force resource release after test -import gc -import multiprocessing.connection -import os -import sys -import time - import pytest -memory_profiling_enabled = os.environ.get("LLM_MEMORY_PROFILING", False) - -if memory_profiling_enabled: - - @pytest.hookimpl(trylast=True) - def pytest_sessionstart(session): - import xdist - session.stash["reporter"] = multiprocessing.connection.Client( - "/tmp/profiling_scribe.unix", "AF_UNIX") - session.stash["worker_id"] = xdist.get_xdist_worker_id(session) - session.stash["reporter"].send({ - "type": "identity", - "identifier": "unittest", - "pid": os.getpid(), - "worker_id": session.stash["worker_id"] - }) - - @pytest.hookimpl(trylast=True) - def pytest_collection_modifyitems(session, config, items): - for item in items: - item.stash["reporter"] = session.stash["reporter"] - item.stash["worker_id"] = session.stash["worker_id"] - @pytest.hookimpl(trylast=True) - def pytest_sessionfinish(session): - session.stash["reporter"].close() - - -@pytest.hookimpl(tryfirst=True, wrapper=True) +@pytest.hookimpl(wrapper=True) def pytest_runtest_protocol(item, nextitem): - if memory_profiling_enabled: - path, line, name = item.reportinfo() - item.stash["reporter"].send({ - "type": "unit_case", - "timestamp": time.time(), - "case": { - "path": str(path), - "line": line, - "name": name - }, - "worker_id": item.stash["worker_id"], - "pid": os.getpid() - }) - - result = yield - - if not any(module == 'torch' or module.startswith('torch.') - for module in sys.modules): - return result - - import torch - - if memory_profiling_enabled: - item.stash["reporter"].send({ - "type": "torch_report", - "timestamp": time.time(), - "case": { - "path": str(path), - "line": line, - "name": name - }, - "context": "unit", - "worker_id": item.stash["worker_id"], - "pid": os.getpid(), - "report": { - "allocated": torch.cuda.memory_allocated(), - "max_allocated": torch.cuda.max_memory_allocated(), - "reserved": torch.cuda.memory_reserved(), - "max_reserved": torch.cuda.max_memory_reserved(), - } - }) - - torch.cuda.reset_peak_memory_stats() - - worker_count = int(os.environ.get('PYTEST_XDIST_WORKER_COUNT', 1)) - - if (torch.cuda.memory_reserved(0) + torch.cuda.memory_allocated(0) - ) >= (torch.cuda.get_device_properties(0).total_memory // - worker_count) * 0.9: - gc.collect() - torch.cuda.empty_cache() - - return result + yield + + import sys + for m in sys.modules: + if m == 'torch' or m.startswith('torch.'): + import gc + import os + + import torch + worker_count = int(os.environ.get('PYTEST_XDIST_WORKER_COUNT', 1)) + + if (torch.cuda.memory_reserved(0) + torch.cuda.memory_allocated(0) + ) >= (torch.cuda.get_device_properties(0).total_memory // + worker_count) * 0.9: + gc.collect() + print("torch.cuda.memory_allocated: %fGB" % + (torch.cuda.memory_allocated(0) / 1024 / 1024 / 1024)) + print("torch.cuda.memory_reserved: %fGB" % + (torch.cuda.memory_reserved(0) / 1024 / 1024 / 1024)) + print("torch.cuda.max_memory_reserved: %fGB" % + (torch.cuda.max_memory_reserved(0) / 1024 / 1024 / 1024)) + + torch.cuda.empty_cache() + break diff --git a/tests/functional/test_moe.py b/tests/functional/test_moe.py index c6819f3f1..deedde4f6 100644 --- a/tests/functional/test_moe.py +++ b/tests/functional/test_moe.py @@ -1022,8 +1022,7 @@ def create_trt_session( network, precision=trt_dtype_to_str(dtype), int8=weight_dtype == trt.int8, - quant_mode=quant_mode, - opt_level=4) + quant_mode=quant_mode) return session def generate_reference(self, inputs, k, actfn, weight_dtype, quant_mode, diff --git 
a/tests/hlapi/test_llm.py b/tests/hlapi/test_llm.py index 7ebc7d717..aca7b5437 100644 --- a/tests/hlapi/test_llm.py +++ b/tests/hlapi/test_llm.py @@ -127,7 +127,6 @@ def test_llm_build_config(): # change some building parameters build_config.max_batch_size = 129 build_config.max_beam_width = 4 - build_config.builder_opt = 3 build_config.max_num_tokens = 888 build_config.strongly_typed = True build_config.max_seq_len = 333 @@ -148,7 +147,6 @@ def test_llm_build_config(): build_config1.plugin_config.nccl_plugin = 'float16' assert build_config1.max_batch_size == build_config.max_batch_size assert build_config1.max_beam_width == build_config.max_beam_width - assert build_config1.builder_opt == build_config.builder_opt assert build_config1.max_num_tokens == build_config.max_num_tokens assert build_config1.strongly_typed == build_config.strongly_typed assert build_config1.max_seq_len == build_config.max_seq_len @@ -806,7 +804,7 @@ def second_run(): prompts, ["D E F G H I J K"], sampling_params=sampling_params) - # the cache should be hitted + # the cache should be hit assert llm.llm_build_stats.cache_hitted, llm.llm_build_stats.cache_info del llm release_gc() diff --git a/tests/hlapi/test_llm_utils.py b/tests/hlapi/test_llm_utils.py index 8bc460725..733b424f4 100644 --- a/tests/hlapi/test_llm_utils.py +++ b/tests/hlapi/test_llm_utils.py @@ -13,7 +13,7 @@ def test_ConfigArbitrator_basic(): - # the performance and functionality have conflict plugins config, keep the functionalies and disable the performance's + # the performance and functionality have conflict plugins config, keep the functionalities and disable the performance's arb = _ConfigArbitrator() arb.claim_perf("chunked_context", config_name="plugin_config", diff --git a/tests/model/test_gpt_e2e.py b/tests/model/test_gpt_e2e.py index 936298085..82ee3eb76 100644 --- a/tests/model/test_gpt_e2e.py +++ b/tests/model/test_gpt_e2e.py @@ -62,7 +62,6 @@ def build_engine(checkpoint_dir: str, engine_dir: str, *args): '--max_input_len=40', '--max_seq_len=60', '--max_beam_width=2', - '--builder_opt=0', ] legacy_args = [ "--gpt_attention_plugin=disable", diff --git a/tests/model/test_mamba.py b/tests/model/test_mamba.py index 9f16397d7..60beb572f 100644 --- a/tests/model/test_mamba.py +++ b/tests/model/test_mamba.py @@ -251,7 +251,7 @@ def test_mamba(self, gemm_plugin, mamba_conv1d_plugin, dtype, device=step1_id.device)) gen_ref = hf_outputs.logits[:, -1, :] - # get tensorrt llm mamba rumtime + # get tensorrt llm mamba runtime runtime, _ = self._gen_tensorrt_llm_runtime( log_level, model_name, gemm_plugin, mamba_conv1d_plugin, hf_config, hf_path, hf_mamba, load_mode, batch_size, input_len, output_len, diff --git a/tests/test_graph_rewriter.py b/tests/test_graph_rewriter.py index 2adc8875a..2fb8656e7 100644 --- a/tests/test_graph_rewriter.py +++ b/tests/test_graph_rewriter.py @@ -455,7 +455,7 @@ def match_and_rewrite(self, layer: Layer) -> bool: new_inputs = flayer.clone_inputs() with net_guard(layer.network): - # Step 1: create new inputs and repalce the original arglist + # Step 1: create new inputs and replace the original arglist input = Tensor( name='qkv', dtype=trt.float16, diff --git a/tests/test_layer.py b/tests/test_layer.py index 8546ec725..0219331bb 100644 --- a/tests/test_layer.py +++ b/tests/test_layer.py @@ -1359,7 +1359,6 @@ def test_mamba(self, batch_size, in_seq_len, out_seq_len, d_model, d_state, stream = torch.cuda.current_stream() builder_config = builder.create_builder_config(name='mamba', - opt_level=0, precision=dtype) engine = 
builder.build_engine(net, builder_config) session = tensorrt_llm.runtime.Session.from_serialized_engine(engine) @@ -1695,7 +1694,6 @@ def test_mamba2(self, batch_size, in_seq_len, out_seq_len, d_model, d_state, stream = torch.cuda.current_stream() builder_config = builder.create_builder_config(name='mamba2', - opt_level=0, precision=dtype) engine = builder.build_engine(net, builder_config) session = tensorrt_llm.runtime.Session.from_serialized_engine(engine) @@ -1706,7 +1704,7 @@ def test_mamba2(self, batch_size, in_seq_len, out_seq_len, d_model, d_state, hidden_states_ref, last_token_ids, conv_state_ref, ssm_state_ref, remove_padding, batch_size, seqlen_offset) - dtype_atol = {"float16": 5e-3, "float32": 5e-3, "bfloat16": 5e-2} + dtype_atol = {"float16": 7e-3, "float32": 5e-3, "bfloat16": 5e-2} if not remove_padding: # get out_mask @@ -2045,7 +2043,6 @@ def fuse_rg_lru(recurrent_layer): stream = torch.cuda.current_stream() builder_config = builder.create_builder_config(name='recurrent', - opt_level=0, precision=dtype) engine = builder.build_engine(net, builder_config) session = tensorrt_llm.runtime.Session.from_serialized_engine(engine) diff --git a/tests/test_model_runner_cpp.py b/tests/test_model_runner_cpp.py new file mode 100644 index 000000000..e5bd10459 --- /dev/null +++ b/tests/test_model_runner_cpp.py @@ -0,0 +1,84 @@ +import typing as tp +from pathlib import Path + +import torch +from bindings.binding_test_utils import * +from transformers import AutoTokenizer +from utils.cpp_paths import * +from utils.llm_data import llm_models_root +from utils.util import skip_pre_ampere + +from tensorrt_llm.runtime.model_runner_cpp import ModelRunnerCpp + + +@pytest.fixture +def model_files(llm_root: Path, resource_path: Path, results_data_path: Path): + # Model engines and expected outputs need to be generated. 
+ print(results_data_path) + if not results_data_path.exists(): + model_cache = llm_models_root() + model_cache_arg = ["--model_cache", str(model_cache) + ] if model_cache is not None else [] + prepare_model_tests(llm_root, resource_path, "gpt", model_cache_arg) + + +@skip_pre_ampere # ContextFMHAType with fp32 acc is not supported in pre-ampere architecture +def test_logits_post_processor(model_files, model_path): + + # Define the logits post-processor callback + def logits_post_processor(req_id: int, logits: torch.Tensor, + ids: tp.List[tp.List[int]], stream_ptr: int, + client_id: tp.Optional[int]): + with torch.cuda.stream(torch.cuda.ExternalStream(stream_ptr)): + logits[:] = float("-inf") + logits[..., 42] = 0 + + # Create ModelRunnerCpp + logits_processor_map = {"my_logits_pp": logits_post_processor} + runner = ModelRunnerCpp.from_dir(model_path, + logits_processor_map=logits_processor_map) + + model_root = llm_models_root(check=True) + hf_model_dir = Path(model_root, "gpt2") + + tokenizer = AutoTokenizer.from_pretrained(hf_model_dir, + padding_side="left", + truncation_side="left", + trust_remote_code=True, + use_fast=True) + + input_text = "Born in north-east France, Soyer trained as a" + batch_input_ids = [ + torch.tensor(tokenizer.encode(input_text, + add_special_tokens=True, + truncation=True), + dtype=torch.int32) + ] + + pad_token_id = tokenizer.pad_token_id + if tokenizer.pad_token_id is None: + pad_token_id = tokenizer.eos_token_id + + # Create the request + max_new_tokens = 5 + with torch.no_grad(): + outputs = runner.generate(batch_input_ids=batch_input_ids, + max_new_tokens=max_new_tokens, + end_id=tokenizer.eos_token_id, + pad_id=pad_token_id, + output_sequence_lengths=True, + return_dict=True, + logits_processor_names={"my_logits_pp"}) + + torch.cuda.synchronize() + + # Get the new tokens + tokens = outputs['output_ids'] + sequence_lengths = outputs['sequence_lengths'] + + output_begin = len(batch_input_ids[0]) + output_end = sequence_lengths[0][0] + + # check that all output tokens are 42 + assert tokens[0][0][output_begin:output_end].tolist() == [42 + ] * max_new_tokens diff --git a/tests/utils/cpp_paths.py b/tests/utils/cpp_paths.py index 02a8abff4..7fd5cee3e 100644 --- a/tests/utils/cpp_paths.py +++ b/tests/utils/cpp_paths.py @@ -77,3 +77,8 @@ def results_data_path(data_path: _pl.Path) -> _pl.Path: @pytest.fixture(scope="module") def results_data_path_beam_width_2(data_path: _pl.Path) -> _pl.Path: return data_path / f"gpt2/beam_search_2/{get_base_model_spec().get_results_file()}" + + +@pytest.fixture(scope="module") +def results_data_path_fmhafp32acc(data_path: _pl.Path) -> _pl.Path: + return data_path / f"gpt2/sampling/{get_base_model_spec().enable_context_fmha_fp32_acc().get_results_file()}" diff --git a/tests/utils/util.py b/tests/utils/util.py index 74ac97fe8..8184c9f0f 100644 --- a/tests/utils/util.py +++ b/tests/utils/util.py @@ -200,7 +200,6 @@ def create_session(builder, precision="float32", int8=False, fp8=False, - opt_level=None, memory_pool_limit=None, optimization_profiles=[], quant_mode=QuantMode(0)): @@ -209,14 +208,13 @@ def create_session(builder, Args: network: a tensorrt_llm.Network object precision: the precision of the network, choose from ["float32", "float16", "bfloat16"] - **kwargs: builder flags such as int8, fp8, builder_opt, etc. + **kwargs: builder flags such as int8, fp8, etc. 
     Returns:
         session: a tensorrt_llm.runtime.Session
     """
     builder_config = builder.create_builder_config(precision=precision,
                                                    int8=int8,
                                                    fp8=fp8,
-                                                   opt_level=opt_level,
                                                    quant_mode=quant_mode)
     # Some tests require to set mem pool limit to avoid OOM
     if memory_pool_limit is not None:
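For completeness, a hypothetical call site for the trimmed-down helper; `builder` and `network` are assumed to exist as in the surrounding tests, and the workspace cap is an arbitrary placeholder.

# opt_level is no longer accepted; the remaining flags pass straight through to create_builder_config.
session = create_session(builder,
                         network,
                         precision="float16",
                         memory_pool_limit=4 * 1024**3)  # placeholder 4 GiB workspace limit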