Update TensorRT-LLM #1492

Merged: 2 commits, Apr 24, 2024
2 changes: 2 additions & 0 deletions .gitignore
@@ -32,6 +32,8 @@ cpp/.ccache/
tensorrt_llm/libs
tensorrt_llm/bindings.pyi
tensorrt_llm/bindings/*.pyi
*docs/cpp_docs*
*docs/source/_cpp_gen*

# Testing
.coverage.*
2 changes: 1 addition & 1 deletion 3rdparty/cutlass
Submodule cutlass updated 548 files
460 changes: 15 additions & 445 deletions README.md

Large diffs are not rendered by default.

10 changes: 3 additions & 7 deletions benchmarks/cpp/README.md
@@ -225,23 +225,19 @@ python examples/llama/convert_checkpoint.py --model_dir ${MODEL_CHECKPOINT} \
--output_dir ${CONVERTED_CHECKPOINT} \
--dtype ${DTYPE} \
--tp_size ${TP} \
--pp_size 1 \
--lora_target_modules attn_qkv \
--max_lora_rank ${MAX_LORA_RANK}
--pp_size 1

${HOME}/.local/bin/trtllm-build \
--checkpoint_dir ${CONVERTED_CHECKPOINT} \
--output_dir ${LORA_ENGINE} \
--max_batch_size ${MAX_BATCH} \
--max_input_len $MAX_LEN \
--max_output_len $MAX_LEN \
--gpt_attention_plugin float16 \
--paged_kv_cache enable \
--remove_input_padding enable \
--gemm_plugin float16 \
--lora_plugin float16 \
--use_paged_context_fmha enable \
--use_custom_all_reduce disable
--lora_target_modules attn_qkv \
--max_lora_rank ${MAX_LORA_RANK}

NUM_LORAS=(8 16 24 32 64 128 256)
NUM_REQUESTS=1024
8 changes: 8 additions & 0 deletions benchmarks/cpp/gptSessionBenchmark.cpp
@@ -14,6 +14,14 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/*****************************************************************************
*
* GptSession is going to be deprecated soon.
* Please do not add new functionality in this file!
*
*****************************************************************************/

#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/mpiUtils.h"
#include "tensorrt_llm/plugins/api/tllmPlugin.h"
33 changes: 33 additions & 0 deletions benchmarks/python/allowed_configs.py
@@ -1127,6 +1127,39 @@ class ModelConfig:
max_output_len=200,
builder_opt=None,
)),
"qwen1.5_7b_chat":
ModelConfig(name="qwen1.5_7b_chat",
family="qwen2",
benchmark_type="gpt",
build_config=BuildConfig(num_layers=32,
num_heads=32,
hidden_size=4096,
vocab_size=151936,
hidden_act='silu',
n_positions=8192,
inter_size=11008,
max_batch_size=128,
max_input_len=512,
max_output_len=200,
builder_opt=None,
bias=False)),
"qwen1.5_14b_chat":
ModelConfig(name="qwen1.5_14b_chat",
family="qwen2",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=40,
num_heads=40,
hidden_size=5120,
vocab_size=152064,
hidden_act='silu',
n_positions=8192,
inter_size=13696,
max_batch_size=64,
max_input_len=512,
max_output_len=200,
builder_opt=None,
)),
"mamba_2.8b":
ModelConfig(name="mamba_2.8b",
family="mamba",
46 changes: 46 additions & 0 deletions benchmarks/python/build.py
@@ -232,6 +232,7 @@ def build_gpt(args):
builder_config_extra_kwargs['mamba_expand'] = build_config[
'mamba_expand']
builder_config_extra_kwargs['max_beam_width'] = max_beam_width
builder_config_extra_kwargs['layer_types'] = ['recurrent']
builder_config = builder.create_builder_config(
name=args.model,
precision=args.dtype,
@@ -715,6 +716,51 @@ def build_gpt(args):
build_config["moe_num_experts"],
'moe_top_k':
build_config["moe_top_k"],
'qwen_type':
'qwen',
}
config = PretrainedConfig.from_dict(config)
tensorrt_llm_model = tensorrt_llm.models.QWenForCausalLM(config)
elif family == "qwen2":
config = {
'architecture':
'QWenForCausalLM',
'dtype':
args.dtype,
'num_hidden_layers':
build_config['num_layers'],
'num_attention_heads':
build_config['num_heads'],
'num_key_value_heads':
build_config['num_heads'] if build_config['num_kv_heads'] is None
else build_config['num_kv_heads'],
'hidden_size':
build_config['hidden_size'],
'intermediate_size':
build_config['inter_size'],
'vocab_size':
build_config['vocab_size'],
'position_embedding_type':
'rope_gpt_neox',
'max_position_embeddings':
build_config['n_positions'],
'hidden_act':
build_config['hidden_act'],
'quantization': {
'group_size': 128,
'quant_algo': quant_algo,
'kv_cache_quant_algo': kv_cache_quant_algo
},
'mapping': {
'world_size': world_size,
'tp_size': world_size
},
'moe_num_experts':
build_config["moe_num_experts"],
'moe_top_k':
build_config["moe_top_k"],
'qwen_type':
'qwen2',
}
config = PretrainedConfig.from_dict(config)
tensorrt_llm_model = tensorrt_llm.models.QWenForCausalLM(config)
12 changes: 8 additions & 4 deletions cpp/include/tensorrt_llm/batch_manager/GptManager.h
@@ -21,7 +21,7 @@
#include "tensorrt_llm/batch_manager/llmRequest.h"
#include "tensorrt_llm/batch_manager/schedulerPolicy.h"
#include "tensorrt_llm/batch_manager/trtGptModelOptionalParams.h"
#include "tensorrt_llm/runtime/gptModelConfig.h"
#include "tensorrt_llm/runtime/modelConfig.h"
#include "tensorrt_llm/runtime/worldConfig.h"

#include <atomic>
@@ -79,17 +79,21 @@ class GptManager
virtual ~GptManager();

protected:
/* Synchronizes the decoder */
virtual BatchManagerErrorCode_t forwardSync();

/* Invokes one step of backend
Updates state of all requests */
virtual BatchManagerErrorCode_t step(RequestList& activeRequests, std::set<uint64_t>& activeRequestsIds);
virtual BatchManagerErrorCode_t forwardAsync(
RequestList& activeRequests, std::unordered_set<uint64_t>& activeRequestsIds);

private:
[[nodiscard]] SizeType getMaxInputLen() const;
[[nodiscard]] SizeType getMaxSequenceLen() const;
[[nodiscard]] SizeType getMaxNumSequences() const;

void validateLlmRequest(
LlmRequest& newReq, runtime::GptModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig) const;
LlmRequest& newReq, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig) const;
static std::shared_ptr<LlmRequest> fillLlmRequest(std::shared_ptr<InferenceRequest> newReq);
static std::shared_ptr<std::vector<TokenIdType>> getReqInputTokens(std::shared_ptr<InferenceRequest> newReq);
static SizeType getMaxNewTokens(std::shared_ptr<InferenceRequest> newReq);
@@ -108,7 +112,7 @@ class GptManager
// List of live requests
RequestList mActiveRequests;
// IDs of live requests
std::set<uint64_t> mActiveRequestsIds;
std::unordered_set<uint64_t> mActiveRequestsIds;
// Boolean that controls if prompt should be included in output tokens for non-streaming
bool mExcludeInputInOutput;

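The hunk above replaces the single step() entry point with a forwardAsync()/forwardSync() pair and switches the live-request ID container from std::set to std::unordered_set. The following is a minimal, self-contained C++ sketch of that request lifecycle; ToyManager, Request, and the simplified container types are illustrative stand-ins, not the real GptManager/RequestList API.

// Minimal sketch (hypothetical types): work is enqueued for every active request
// in forwardAsync(), and finished requests are retired after synchronization in
// forwardSync(). Live request IDs are kept in an unordered_set, mirroring the
// switch away from std::set in this diff.
#include <cstdint>
#include <iostream>
#include <list>
#include <unordered_set>

struct Request
{
    uint64_t id;
    bool finished{false};
};

using RequestList = std::list<Request>;

class ToyManager
{
public:
    // Launch one step of the backend for all active requests.
    void forwardAsync(RequestList& activeRequests, std::unordered_set<uint64_t>& activeIds)
    {
        for (auto& req : activeRequests)
        {
            activeIds.insert(req.id);
            // ... enqueue backend work for req here ...
        }
    }

    // Synchronize with the decoder and retire finished requests.
    void forwardSync(RequestList& activeRequests, std::unordered_set<uint64_t>& activeIds)
    {
        activeRequests.remove_if(
            [&](Request const& req)
            {
                if (req.finished)
                {
                    activeIds.erase(req.id);
                    return true;
                }
                return false;
            });
    }
};

int main()
{
    ToyManager mgr;
    RequestList requests{{1}, {2, true}};
    std::unordered_set<uint64_t> ids;
    mgr.forwardAsync(requests, ids);
    mgr.forwardSync(requests, ids);
    std::cout << "live requests: " << requests.size() << "\n"; // prints 1
    return 0;
}

An unordered_set gives amortized O(1) insert and erase for request IDs; the change suggests that ordered iteration over the IDs is not required, though that reasoning is an inference from the diff rather than something stated in it.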
2 changes: 2 additions & 0 deletions cpp/include/tensorrt_llm/batch_manager/kvCacheConfig.h
@@ -63,6 +63,8 @@ class KvCacheConfig
&& hostCacheSize == other.hostCacheSize && onboardBlocks == other.onboardBlocks;
}

friend std::ostream& operator<<(std::ostream& os, KvCacheConfig const& self);

std::optional<SizeType> maxTokens;
std::optional<SizeType> maxAttentionWindow;
std::optional<SizeType> sinkTokenLength;
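The only functional addition in this hunk is the friend operator<< declaration; its definition is not shown here. As a hedged illustration, the sketch below shows one way such a stream operator can be written for a struct of std::optional fields. KvCacheConfigSketch and its printed format are hypothetical and only mirror the field names visible in the diff.

// Minimal sketch (hypothetical type and format): print each optional field of a
// KvCacheConfig-like struct, or "unset" when the field has no value.
#include <iostream>
#include <optional>

using SizeType = int;

struct KvCacheConfigSketch
{
    std::optional<SizeType> maxTokens;
    std::optional<SizeType> maxAttentionWindow;
    std::optional<SizeType> sinkTokenLength;

    friend std::ostream& operator<<(std::ostream& os, KvCacheConfigSketch const& self)
    {
        // Helper that prints "name=value" or "name=unset" followed by a space.
        auto print = [&os](char const* name, std::optional<SizeType> const& value)
        {
            os << name << "=";
            if (value)
                os << *value;
            else
                os << "unset";
            os << " ";
        };
        print("maxTokens", self.maxTokens);
        print("maxAttentionWindow", self.maxAttentionWindow);
        print("sinkTokenLength", self.sinkTokenLength);
        return os;
    }
};

int main()
{
    KvCacheConfigSketch cfg;
    cfg.maxTokens = 4096;
    std::cout << cfg << "\n"; // maxTokens=4096 maxAttentionWindow=unset sinkTokenLength=unset
    return 0;
}

Declaring the operator as a friend keeps the printing logic next to the class while allowing the usual `os << config` call style, which is convenient for logging configuration values.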