Update TensorRT-LLM #1492

Merged: 2 commits, Apr 24, 2024
2 changes: 2 additions & 0 deletions .gitignore
@@ -32,6 +32,8 @@ cpp/.ccache/
tensorrt_llm/libs
tensorrt_llm/bindings.pyi
tensorrt_llm/bindings/*.pyi
*docs/cpp_docs*
*docs/source/_cpp_gen*

# Testing
.coverage.*
2 changes: 1 addition & 1 deletion 3rdparty/cutlass
Submodule cutlass updated 548 files
460 changes: 15 additions & 445 deletions README.md

Large diffs are not rendered by default.

10 changes: 3 additions & 7 deletions benchmarks/cpp/README.md
@@ -225,23 +225,19 @@ python examples/llama/convert_checkpoint.py --model_dir ${MODEL_CHECKPOINT} \
--output_dir ${CONVERTED_CHECKPOINT} \
--dtype ${DTYPE} \
--tp_size ${TP} \
--pp_size 1 \
--lora_target_modules attn_qkv \
--max_lora_rank ${MAX_LORA_RANK}
--pp_size 1

${HOME}/.local/bin/trtllm-build \
--checkpoint_dir ${CONVERTED_CHECKPOINT} \
--output_dir ${LORA_ENGINE} \
--max_batch_size ${MAX_BATCH} \
--max_input_len $MAX_LEN \
--max_output_len $MAX_LEN \
--gpt_attention_plugin float16 \
--paged_kv_cache enable \
--remove_input_padding enable \
--gemm_plugin float16 \
--lora_plugin float16 \
--use_paged_context_fmha enable \
--use_custom_all_reduce disable
--lora_target_modules attn_qkv \
--max_lora_rank ${MAX_LORA_RANK}

NUM_LORAS=(8 16 24 32 64 128 256)
NUM_REQUESTS=1024
8 changes: 8 additions & 0 deletions benchmarks/cpp/gptSessionBenchmark.cpp
@@ -14,6 +14,14 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/*****************************************************************************
*
* GptSession is going to be deprecated soon.
* Please do not add new functionality in this file!
*
*****************************************************************************/

#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/mpiUtils.h"
#include "tensorrt_llm/plugins/api/tllmPlugin.h"
33 changes: 33 additions & 0 deletions benchmarks/python/allowed_configs.py
@@ -1127,6 +1127,39 @@ class ModelConfig:
max_output_len=200,
builder_opt=None,
)),
"qwen1.5_7b_chat":
ModelConfig(name="qwen1.5_7b_chat",
family="qwen2",
benchmark_type="gpt",
build_config=BuildConfig(num_layers=32,
num_heads=32,
hidden_size=4096,
vocab_size=151936,
hidden_act='silu',
n_positions=8192,
inter_size=11008,
max_batch_size=128,
max_input_len=512,
max_output_len=200,
builder_opt=None,
bias=False)),
"qwen1.5_14b_chat":
ModelConfig(name="qwen1.5_14b_chat",
family="qwen2",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=40,
num_heads=40,
hidden_size=5120,
vocab_size=152064,
hidden_act='silu',
n_positions=8192,
inter_size=13696,
max_batch_size=64,
max_input_len=512,
max_output_len=200,
builder_opt=None,
)),
"mamba_2.8b":
ModelConfig(name="mamba_2.8b",
family="mamba",
46 changes: 46 additions & 0 deletions benchmarks/python/build.py
@@ -232,6 +232,7 @@ def build_gpt(args):
builder_config_extra_kwargs['mamba_expand'] = build_config[
'mamba_expand']
builder_config_extra_kwargs['max_beam_width'] = max_beam_width
builder_config_extra_kwargs['layer_types'] = ['recurrent']
builder_config = builder.create_builder_config(
name=args.model,
precision=args.dtype,
@@ -715,6 +716,51 @@ def build_gpt(args):
build_config["moe_num_experts"],
'moe_top_k':
build_config["moe_top_k"],
'qwen_type':
'qwen',
}
config = PretrainedConfig.from_dict(config)
tensorrt_llm_model = tensorrt_llm.models.QWenForCausalLM(config)
elif family == "qwen2":
config = {
'architecture':
'QWenForCausalLM',
'dtype':
args.dtype,
'num_hidden_layers':
build_config['num_layers'],
'num_attention_heads':
build_config['num_heads'],
'num_key_value_heads':
build_config['num_heads'] if build_config['num_kv_heads'] is None
else build_config['num_kv_heads'],
'hidden_size':
build_config['hidden_size'],
'intermediate_size':
build_config['inter_size'],
'vocab_size':
build_config['vocab_size'],
'position_embedding_type':
'rope_gpt_neox',
'max_position_embeddings':
build_config['n_positions'],
'hidden_act':
build_config['hidden_act'],
'quantization': {
'group_size': 128,
'quant_algo': quant_algo,
'kv_cache_quant_algo': kv_cache_quant_algo
},
'mapping': {
'world_size': world_size,
'tp_size': world_size
},
'moe_num_experts':
build_config["moe_num_experts"],
'moe_top_k':
build_config["moe_top_k"],
'qwen_type':
'qwen2',
}
config = PretrainedConfig.from_dict(config)
tensorrt_llm_model = tensorrt_llm.models.QWenForCausalLM(config)
12 changes: 8 additions & 4 deletions cpp/include/tensorrt_llm/batch_manager/GptManager.h
@@ -21,7 +21,7 @@
#include "tensorrt_llm/batch_manager/llmRequest.h"
#include "tensorrt_llm/batch_manager/schedulerPolicy.h"
#include "tensorrt_llm/batch_manager/trtGptModelOptionalParams.h"
#include "tensorrt_llm/runtime/gptModelConfig.h"
#include "tensorrt_llm/runtime/modelConfig.h"
#include "tensorrt_llm/runtime/worldConfig.h"

#include <atomic>
@@ -79,17 +79,21 @@ class GptManager
virtual ~GptManager();

protected:
/* Synchronizes the decoder */
virtual BatchManagerErrorCode_t forwardSync();

/* Invokes one step of backend
Updates state of all requests */
virtual BatchManagerErrorCode_t step(RequestList& activeRequests, std::set<uint64_t>& activeRequestsIds);
virtual BatchManagerErrorCode_t forwardAsync(
RequestList& activeRequests, std::unordered_set<uint64_t>& activeRequestsIds);

private:
[[nodiscard]] SizeType getMaxInputLen() const;
[[nodiscard]] SizeType getMaxSequenceLen() const;
[[nodiscard]] SizeType getMaxNumSequences() const;

void validateLlmRequest(
LlmRequest& newReq, runtime::GptModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig) const;
LlmRequest& newReq, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig) const;
static std::shared_ptr<LlmRequest> fillLlmRequest(std::shared_ptr<InferenceRequest> newReq);
static std::shared_ptr<std::vector<TokenIdType>> getReqInputTokens(std::shared_ptr<InferenceRequest> newReq);
static SizeType getMaxNewTokens(std::shared_ptr<InferenceRequest> newReq);
@@ -108,7 +112,7 @@ class GptManager
// List of live requests
RequestList mActiveRequests;
// IDs of live requests
std::set<uint64_t> mActiveRequestsIds;
std::unordered_set<uint64_t> mActiveRequestsIds;
// Boolean that controls if prompt should be included in output tokens for non-streaming
bool mExcludeInputInOutput;

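The hunk above replaces the single step() entry point with a forwardAsync()/forwardSync() pair and switches the live-request ID container from std::set to std::unordered_set. The following is a minimal, self-contained C++ sketch of that request lifecycle; ToyManager, Request, and the simplified container types are illustrative stand-ins, not the real GptManager/RequestList API.

// Minimal sketch (hypothetical types): work is enqueued for every active request
// in forwardAsync(), and finished requests are retired after synchronization in
// forwardSync(). Live request IDs are kept in an unordered_set, mirroring the
// switch away from std::set in this diff.
#include <cstdint>
#include <iostream>
#include <list>
#include <unordered_set>

struct Request
{
    uint64_t id;
    bool finished{false};
};

using RequestList = std::list<Request>;

class ToyManager
{
public:
    // Launch one step of the backend for all active requests.
    void forwardAsync(RequestList& activeRequests, std::unordered_set<uint64_t>& activeIds)
    {
        for (auto& req : activeRequests)
        {
            activeIds.insert(req.id);
            // ... enqueue backend work for req here ...
        }
    }

    // Synchronize with the decoder and retire finished requests.
    void forwardSync(RequestList& activeRequests, std::unordered_set<uint64_t>& activeIds)
    {
        activeRequests.remove_if(
            [&](Request const& req)
            {
                if (req.finished)
                {
                    activeIds.erase(req.id);
                    return true;
                }
                return false;
            });
    }
};

int main()
{
    ToyManager mgr;
    RequestList requests{{1}, {2, true}};
    std::unordered_set<uint64_t> ids;
    mgr.forwardAsync(requests, ids);
    mgr.forwardSync(requests, ids);
    std::cout << "live requests: " << requests.size() << "\n"; // prints 1
    return 0;
}

An unordered_set gives amortized O(1) insert and erase for request IDs; the change suggests that ordered iteration over the IDs is not required, though that reasoning is an inference from the diff rather than something stated in it.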
2 changes: 2 additions & 0 deletions cpp/include/tensorrt_llm/batch_manager/kvCacheConfig.h
@@ -63,6 +63,8 @@ class KvCacheConfig
&& hostCacheSize == other.hostCacheSize && onboardBlocks == other.onboardBlocks;
}

friend std::ostream& operator<<(std::ostream& os, KvCacheConfig const& self);

std::optional<SizeType> maxTokens;
std::optional<SizeType> maxAttentionWindow;
std::optional<SizeType> sinkTokenLength;
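The only functional addition in this hunk is the friend operator<< declaration; its definition is not shown here. As a hedged illustration, the sketch below shows one way such a stream operator can be written for a struct of std::optional fields. KvCacheConfigSketch and its printed format are hypothetical and only mirror the field names visible in the diff.

// Minimal sketch (hypothetical type and format): print each optional field of a
// KvCacheConfig-like struct, or "unset" when the field has no value.
#include <iostream>
#include <optional>

using SizeType = int;

struct KvCacheConfigSketch
{
    std::optional<SizeType> maxTokens;
    std::optional<SizeType> maxAttentionWindow;
    std::optional<SizeType> sinkTokenLength;

    friend std::ostream& operator<<(std::ostream& os, KvCacheConfigSketch const& self)
    {
        // Helper that prints "name=value" or "name=unset" followed by a space.
        auto print = [&os](char const* name, std::optional<SizeType> const& value)
        {
            os << name << "=";
            if (value)
                os << *value;
            else
                os << "unset";
            os << " ";
        };
        print("maxTokens", self.maxTokens);
        print("maxAttentionWindow", self.maxAttentionWindow);
        print("sinkTokenLength", self.sinkTokenLength);
        return os;
    }
};

int main()
{
    KvCacheConfigSketch cfg;
    cfg.maxTokens = 4096;
    std::cout << cfg << "\n"; // maxTokens=4096 maxAttentionWindow=unset sinkTokenLength=unset
    return 0;
}

Declaring the operator as a friend keeps the printing logic next to the class while allowing the usual `os << config` call style, which is convenient for logging configuration values.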