Update TensorRT-LLM #1793

Merged · 1 commit · Jun 18, 2024
2 changes: 1 addition & 1 deletion benchmarks/cpp/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ ${HOME}/.local/bin/trtllm-build \
--output_dir ${LORA_ENGINE} \
--max_batch_size ${MAX_BATCH} \
--max_input_len $MAX_LEN \
--max_output_len $MAX_LEN \
--max_seq_len $((2*${MAX_LEN})) \
--gemm_plugin float16 \
--lora_plugin float16 \
--use_paged_context_fmha enable \
Expand Down
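A note on the rename: --max_output_len is dropped in favor of --max_seq_len, which bounds the total tokens per sequence (input plus output). A minimal shell sketch of the sizing arithmetic, assuming MAX_LEN is used for both budgets as in this README (the concrete value is illustrative only):

    # max_seq_len = max input tokens + max output tokens;
    # with equal input/output budgets it is simply twice MAX_LEN.
    MAX_LEN=1024                    # illustrative; the README leaves the value to the user
    MAX_SEQ_LEN=$((2 * MAX_LEN))    # replaces the removed "--max_output_len ${MAX_LEN}"
    echo "--max_seq_len ${MAX_SEQ_LEN}"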
4 changes: 2 additions & 2 deletions benchmarks/cpp/bertBenchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "tensorrt_llm/common/memoryUtils.h"
#include "tensorrt_llm/plugins/api/tllmPlugin.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/rawEngine.h"
#include "tensorrt_llm/runtime/tllmLogger.h"
#include "tensorrt_llm/runtime/tllmRuntime.h"
#include "tensorrt_llm/runtime/worldConfig.h"
Expand Down Expand Up @@ -78,11 +79,10 @@ void benchmarkBert(std::string const& modelName, std::filesystem::path const& da
{
auto const worldConfig = WorldConfig::mpi();
auto const enginePath = dataPath / engineFilename(dataPath, worldConfig, modelName);
auto engineBlob = loadEngine(enginePath.string());

for (float gpuWeightsPercent : gpuWeightsPercents)
{
auto rt = std::make_shared<TllmRuntime>(engineBlob.data(), engineBlob.size(), gpuWeightsPercent, *logger);
auto rt = std::make_shared<TllmRuntime>(RawEngine(enginePath), logger.get(), gpuWeightsPercent);
rt->addContext(0);
for (auto inLen : inLens)
{
Expand Down
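In short, the runtime construction no longer requires the caller to read the serialized engine into memory: the new overload takes a RawEngine wrapper around the engine file path and loads it internally, the logger moves from a reference to a pointer (logger.get()), and gpuWeightsPercent becomes the last argument. The now-redundant loadEngine call is removed accordingly.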
13 changes: 13 additions & 0 deletions benchmarks/cpp/gptManagerBenchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ struct BenchmarkParams
bool streaming{false};
bool enableExpDelays{false};
std::optional<float> requestRate{std::nullopt};
std::optional<SizeType32> maxBatchSize{std::nullopt};
int randomSeed = 430;
std::optional<int> maxAttentionWindow{std::nullopt};

Expand Down Expand Up @@ -785,6 +786,10 @@ class ExecutorServer
executorConfig.setPeftCacheConfig(peftCacheConfig);
executorConfig.setBatchingType(
modelType == TrtGptModelType::V1 ? texec::BatchingType::kSTATIC : texec::BatchingType::kINFLIGHT);
if (benchmarkParams.maxBatchSize)
{
executorConfig.setMaxBatchSize(benchmarkParams.maxBatchSize.value());
}

mExecutor = std::make_unique<texec::Executor>(trtEnginePath, texec::ModelType::kDECODER_ONLY, executorConfig);

Expand Down Expand Up @@ -1339,6 +1344,7 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType
optionalParams.kvCacheConfig.onboardBlocks = benchmarkParams.kvOnboardBlocks;
optionalParams.gpuWeightsPercent = benchmarkParams.gpuWeightsPercent;
optionalParams.maxBeamWidth = beamWidth;
optionalParams.maxBatchSize = benchmarkParams.maxBatchSize;
optionalParams.schedulerConfig = texec::SchedulerConfig{capacitySchedulerPolicy};

auto const jsonConfig = GptJsonConfig::parse(engineDir / "config.json");
Expand Down Expand Up @@ -1628,6 +1634,7 @@ int main(int argc, char* argv[])
options.add_options()("request_rate",
"request rate in reqs/sec. Skipping this arg or negative value will trigger offline/0-delay.",
cxxopts::value<float>());
options.add_options()("max_batch_size", "The max runtime batch size when benchmarking", cxxopts::value<int>());
options.add_options()("enable_trt_overlap", "Overlap TRT context preparation and execution",
cxxopts::value<bool>()->default_value("false"));
options.add_options()("enable_exp_delays", "Enables exponential delay distr to mimic real world request arrival",
@@ -1777,6 +1784,12 @@
         benchmarkParams.requestRate = result["request_rate"].as<float>();
     }

+    // Argument: max batch size
+    if (result.count("max_batch_size"))
+    {
+        benchmarkParams.maxBatchSize = result["max_batch_size"].as<int>();
+    }
+
     benchmarkParams.enableExpDelays = result["enable_exp_delays"].as<bool>();

     // Argument: Enable batch stats output
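Taken together, the gptManagerBenchmark changes thread a new optional max_batch_size from the CLI into both the executor path (via ExecutorConfig::setMaxBatchSize) and the GptManager path (via optionalParams.maxBatchSize), so a single engine can be benchmarked under several runtime batch-size caps without rebuilding. A hypothetical invocation sketch; the --engine_dir and --dataset arguments and their paths are assumptions for illustration, not taken from this diff:

    # Cap the runtime batch size at 64, independent of the engine's
    # build-time max_batch_size. Paths below are placeholders.
    ./benchmarks/gptManagerBenchmark \
        --engine_dir /path/to/engine_dir \
        --dataset /path/to/dataset.json \
        --request_rate 10 \
        --max_batch_size 64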