Update TensorRT-LLM #1793

Merged · 1 commit · Jun 18, 2024
2 changes: 1 addition & 1 deletion benchmarks/cpp/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ ${HOME}/.local/bin/trtllm-build \
--output_dir ${LORA_ENGINE} \
--max_batch_size ${MAX_BATCH} \
--max_input_len $MAX_LEN \
--max_output_len $MAX_LEN \
--max_seq_len $((2*${MAX_LEN})) \
--gemm_plugin float16 \
--lora_plugin float16 \
--use_paged_context_fmha enable \
Expand Down
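A note on the rename: --max_output_len is dropped in favor of --max_seq_len, which bounds the total tokens per sequence (input plus output). A minimal shell sketch of the sizing arithmetic, assuming MAX_LEN is used for both budgets as in this README (the concrete value is illustrative only):

    # max_seq_len = max input tokens + max output tokens;
    # with equal input/output budgets it is simply twice MAX_LEN.
    MAX_LEN=1024                    # illustrative; the README leaves the value to the user
    MAX_SEQ_LEN=$((2 * MAX_LEN))    # replaces the removed "--max_output_len ${MAX_LEN}"
    echo "--max_seq_len ${MAX_SEQ_LEN}"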
4 changes: 2 additions & 2 deletions benchmarks/cpp/bertBenchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "tensorrt_llm/common/memoryUtils.h"
#include "tensorrt_llm/plugins/api/tllmPlugin.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/rawEngine.h"
#include "tensorrt_llm/runtime/tllmLogger.h"
#include "tensorrt_llm/runtime/tllmRuntime.h"
#include "tensorrt_llm/runtime/worldConfig.h"
Expand Down Expand Up @@ -78,11 +79,10 @@ void benchmarkBert(std::string const& modelName, std::filesystem::path const& da
{
auto const worldConfig = WorldConfig::mpi();
auto const enginePath = dataPath / engineFilename(dataPath, worldConfig, modelName);
auto engineBlob = loadEngine(enginePath.string());

for (float gpuWeightsPercent : gpuWeightsPercents)
{
auto rt = std::make_shared<TllmRuntime>(engineBlob.data(), engineBlob.size(), gpuWeightsPercent, *logger);
auto rt = std::make_shared<TllmRuntime>(RawEngine(enginePath), logger.get(), gpuWeightsPercent);
rt->addContext(0);
for (auto inLen : inLens)
{
Expand Down
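In short, the runtime construction no longer requires the caller to read the serialized engine into memory: the new overload takes a RawEngine wrapper around the engine file path and loads it internally, the logger moves from a reference to a pointer (logger.get()), and gpuWeightsPercent becomes the last argument. The now-redundant loadEngine call is removed accordingly.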
13 changes: 13 additions & 0 deletions benchmarks/cpp/gptManagerBenchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ struct BenchmarkParams
bool streaming{false};
bool enableExpDelays{false};
std::optional<float> requestRate{std::nullopt};
std::optional<SizeType32> maxBatchSize{std::nullopt};
int randomSeed = 430;
std::optional<int> maxAttentionWindow{std::nullopt};

Expand Down Expand Up @@ -785,6 +786,10 @@ class ExecutorServer
executorConfig.setPeftCacheConfig(peftCacheConfig);
executorConfig.setBatchingType(
modelType == TrtGptModelType::V1 ? texec::BatchingType::kSTATIC : texec::BatchingType::kINFLIGHT);
if (benchmarkParams.maxBatchSize)
{
executorConfig.setMaxBatchSize(benchmarkParams.maxBatchSize.value());
}

mExecutor = std::make_unique<texec::Executor>(trtEnginePath, texec::ModelType::kDECODER_ONLY, executorConfig);

Expand Down Expand Up @@ -1339,6 +1344,7 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType
optionalParams.kvCacheConfig.onboardBlocks = benchmarkParams.kvOnboardBlocks;
optionalParams.gpuWeightsPercent = benchmarkParams.gpuWeightsPercent;
optionalParams.maxBeamWidth = beamWidth;
optionalParams.maxBatchSize = benchmarkParams.maxBatchSize;
optionalParams.schedulerConfig = texec::SchedulerConfig{capacitySchedulerPolicy};

auto const jsonConfig = GptJsonConfig::parse(engineDir / "config.json");
Expand Down Expand Up @@ -1628,6 +1634,7 @@ int main(int argc, char* argv[])
options.add_options()("request_rate",
"request rate in reqs/sec. Skipping this arg or negative value will trigger offline/0-delay.",
cxxopts::value<float>());
options.add_options()("max_batch_size", "The max runtime batch size when benchmarking", cxxopts::value<int>());
options.add_options()("enable_trt_overlap", "Overlap TRT context preparation and execution",
cxxopts::value<bool>()->default_value("false"));
options.add_options()("enable_exp_delays", "Enables exponential delay distr to mimic real world request arrival",
@@ -1777,6 +1784,12 @@
         benchmarkParams.requestRate = result["request_rate"].as<float>();
     }

+    // Argument: max batch size
+    if (result.count("max_batch_size"))
+    {
+        benchmarkParams.maxBatchSize = result["max_batch_size"].as<int>();
+    }
+
     benchmarkParams.enableExpDelays = result["enable_exp_delays"].as<bool>();

     // Argument: Enable batch stats output
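Taken together, the gptManagerBenchmark changes thread a new optional max_batch_size from the CLI into both the executor path (via ExecutorConfig::setMaxBatchSize) and the GptManager path (via optionalParams.maxBatchSize), so a single engine can be benchmarked under several runtime batch-size caps without rebuilding. A hypothetical invocation sketch; the --engine_dir and --dataset arguments and their paths are assumptions for illustration, not taken from this diff:

    # Cap the runtime batch size at 64, independent of the engine's
    # build-time max_batch_size. Paths below are placeholders.
    ./benchmarks/gptManagerBenchmark \
        --engine_dir /path/to/engine_dir \
        --dataset /path/to/dataset.json \
        --request_rate 10 \
        --max_batch_size 64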