Add Layernorm option for Wenet trt #434

Merged
merged 5 commits on Feb 8, 2023
4 changes: 2 additions & 2 deletions src/fastertransformer/models/wenet/WenetEncoder.cc
@@ -213,8 +213,8 @@ void WenetEncoder<T>::allocateBuffer(
inter_conv2_output_buf_, sizeof(T) * batch_size * d_model_ * seq_len2 * feature_size2, false);
inter_fc_input_buf_ = (T*)allocator_->reMalloc(
inter_fc_input_buf_, sizeof(T) * batch_size * seq_len2 * d_model_ * feature_size2, false);
- // Current workspace used for CuDNN Convolution is 1 << 27
- conv_workspace_ = (T*)allocator_->reMalloc(conv_workspace_, 1 << 27, false);
+ // Current workspace used for CuDNN Convolution is 1 << 29
+ conv_workspace_ = (T*)allocator_->reMalloc(conv_workspace_, 1 << 29, false);
// Position Embed

input_hidden_state_ =
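The fixed cuDNN convolution workspace grows here from 1 << 27 bytes (128 MiB) to 1 << 29 bytes (512 MiB). A minimal sketch of an alternative sizing strategy, querying cuDNN for the exact requirement instead of reserving a fixed power of two; every handle and descriptor name below is hypothetical, not an identifier from this PR:

#include <cudnn.h>

// Sketch only: ask cuDNN how many bytes the chosen algorithm needs for this
// exact shape, then grow the buffer to that size on demand.
size_t queryConvWorkspaceBytes(cudnnHandle_t handle,
                               cudnnTensorDescriptor_t x_desc,
                               cudnnFilterDescriptor_t w_desc,
                               cudnnConvolutionDescriptor_t conv_desc,
                               cudnnTensorDescriptor_t y_desc,
                               cudnnConvolutionFwdAlgo_t algo)
{
    size_t ws_size = 0;
    cudnnGetConvolutionForwardWorkspaceSize(handle, x_desc, w_desc, conv_desc, y_desc, algo, &ws_size);
    return ws_size;
}

// e.g. conv_workspace_ = (T*)allocator_->reMalloc(conv_workspace_, ws_size, false);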
3 changes: 1 addition & 2 deletions src/fastertransformer/tensorrt_plugin/wenet/DecoderPlugin.cc
@@ -67,8 +67,7 @@ void WenetDecoderPlugin::CreateFT()
#endif

// Wenet DecoderWeight
std::string weightFilePath = "/weight/dec/";
// std::string weightFilePath = m_.weightFilePath;
std::string weightFilePath = m_.weightFilePath;
FT_LOG_WARNING("The default weight file path is %s. Change it accordingly, otherwise model will fail to load! \n",
weightFilePath.c_str());
if (m_.useFP16) {
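This drops the hardcoded "/weight/dec/" default in favor of the path carried in the plugin's parameter struct. A sketch of how that member is presumably filled from the creator's fields, mirroring the encoder-side handling visible later in this PR; the loop below is illustrative, not copied from the decoder source:

// Illustrative: copy the user-supplied "weightFilePath" plugin field into
// m_.weightFilePath, which CreateFT() now reads instead of "/weight/dec/".
for (int i = 0; i < fc->nbFields; i++) {
    if (strcmp(fc->fields[i].name, "weightFilePath") == 0) {
        strcpy(m_.weightFilePath, (const char*)fc->fields[i].data);
    }
}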
83 changes: 46 additions & 37 deletions src/fastertransformer/tensorrt_plugin/wenet/EncoderPlugin.cc
@@ -38,27 +38,29 @@ WenetEncoderPlugin::WenetEncoderPlugin(const std::string& name,
int sm,
float q_scaling,
const std::string& weightFilePath,
+ int use_layernorm_in_conv_module,
int useFP16):
name_(name)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
WHERE_AM_I();
- m_.max_batch_size = max_batch_size;
- m_.max_seq_len = max_seq_len;
- m_.head_num = head_num;
- m_.size_per_head = size_per_head;
- m_.feature_size = feature_size;
- m_.max_len = max_len;
- m_.inter_size = inter_size;
- m_.d_model = d_model;
- m_.num_layer = num_layer;
- m_.vocab_size = vocab_size;
- m_.conv_module_kernel_size = conv_module_kernel_size;
- m_.sm = sm;
- m_.q_scaling = q_scaling;
- m_.useFP16 = (bool)useFP16;
- m_.batch_size = m_.max_batch_size;
- m_.seq_len = m_.max_seq_len;
+ m_.max_batch_size = max_batch_size;
+ m_.max_seq_len = max_seq_len;
+ m_.head_num = head_num;
+ m_.size_per_head = size_per_head;
+ m_.feature_size = feature_size;
+ m_.max_len = max_len;
+ m_.inter_size = inter_size;
+ m_.d_model = d_model;
+ m_.num_layer = num_layer;
+ m_.vocab_size = vocab_size;
+ m_.conv_module_kernel_size = conv_module_kernel_size;
+ m_.sm = sm;
+ m_.q_scaling = q_scaling;
+ m_.use_layernorm_in_conv_module = (bool)use_layernorm_in_conv_module;
+ m_.useFP16 = (bool)useFP16;
+ m_.batch_size = m_.max_batch_size;
+ m_.seq_len = m_.max_seq_len;
strcpy(m_.weightFilePath, weightFilePath.c_str());

CreateFT();
@@ -74,8 +76,7 @@ void WenetEncoderPlugin::CreateFT()
cudnnCreate(&cudnn_handle_);

// Wenet EncoderWeight
std::string weightFilePath = "/weight/enc/";
// std::string weightFilePath = m_.weightFilePath;
std::string weightFilePath = m_.weightFilePath;
FT_LOG_WARNING("The default weight file path is %s. Change it accordingly, otherwise model will fail to load! \n",
weightFilePath.c_str());
if (m_.useFP16) {
@@ -88,7 +89,8 @@ void WenetEncoderPlugin::CreateFT()
m_.conv_module_kernel_size,
m_.feature_size,
m_.max_len,
- m_.num_layer);
+ m_.num_layer,
+ m_.use_layernorm_in_conv_module);
pWenetEncoderWeightHalf_->loadModel(weightFilePath);
}
else {
@@ -101,7 +103,8 @@ void WenetEncoderPlugin::CreateFT()
m_.conv_module_kernel_size,
m_.feature_size,
m_.max_len,
- m_.num_layer);
+ m_.num_layer,
+ m_.use_layernorm_in_conv_module);
pWenetEncoderWeightFloat_->loadModel(weightFilePath);
}

@@ -143,7 +146,8 @@ void WenetEncoderPlugin::CreateFT()
m_.is_free_buffer_after_forward,
m_.attention_type,
m_.is_sparse,
- m_.activation_type);
+ m_.activation_type,
+ m_.use_layernorm_in_conv_module);
}
else {
pCublasWrapper_->setFP32GemmConfig();
@@ -168,7 +172,8 @@ void WenetEncoderPlugin::CreateFT()
m_.is_free_buffer_after_forward,
m_.attention_type,
m_.is_sparse,
- m_.activation_type);
+ m_.activation_type,
+ m_.use_layernorm_in_conv_module);
}
PRINT_ENCODER(m_.useFP16)
}
@@ -481,6 +486,7 @@ std::vector<PluginField> WenetEncoderPluginCreator::attr_{
{"vocab_size", nullptr, nvinfer1::PluginFieldType::kINT32, 0},
{"conv_module_kernel_size", nullptr, nvinfer1::PluginFieldType::kINT32, 0},
{"sm", nullptr, nvinfer1::PluginFieldType::kINT32, 0},
{"use_layernorm_in_conv_module", nullptr, nvinfer1::PluginFieldType::kINT32, 0},
{"useFP16", nullptr, nvinfer1::PluginFieldType::kINT32, 0},
{"q_scaling", nullptr, nvinfer1::PluginFieldType::kFLOAT32, 0},
{"weightFilePath", nullptr, nvinfer1::PluginFieldType::kCHAR, 0}};
@@ -501,21 +507,22 @@ IPluginV2* WenetEncoderPluginCreator::createPlugin(const char* name, const Plugi
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
WHERE_AM_I();
- int max_batch_size = 128;
- int max_seq_len = 384;
- int head_num = 8;
- int size_per_head = 32;
- int feature_size = 80;
- int max_len = 5000;
- int d_model = head_num * size_per_head;
- int inter_size = d_model * 4;
- int num_layer = 12;
- int vocab_size = 4233;
- int conv_module_kernel_size = 15;
- int sm = -1;
- float q_scaling = 1.0f / (sqrt(size_per_head) * 1.0f);
- std::string weightFilePath = "";
- int useFP16 = 0;
+ int max_batch_size = 128;
+ int max_seq_len = 384;
+ int head_num = 8;
+ int size_per_head = 32;
+ int feature_size = 80;
+ int max_len = 5000;
+ int d_model = head_num * size_per_head;
+ int inter_size = d_model * 4;
+ int num_layer = 12;
+ int vocab_size = 4233;
+ int conv_module_kernel_size = 15;
+ int sm = -1;
+ float q_scaling = 1.0f / (sqrt(size_per_head) * 1.0f);
+ std::string weightFilePath = "";
+ int use_layernorm_in_conv_module = 0;
+ int useFP16 = 0;

struct cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
@@ -534,6 +541,7 @@ IPluginV2* WenetEncoderPluginCreator::createPlugin(const char* name, const Plugi
{"vocab_size", &vocab_size},
{"conv_module_kernel_size", &conv_module_kernel_size},
{"sm", &sm},
{"use_layernorm_in_conv_module", &use_layernorm_in_conv_module},
{"useFP16", &useFP16},
};
for (int i = 0; i < fc->nbFields; i++) {
@@ -562,6 +570,7 @@ IPluginV2* WenetEncoderPluginCreator::createPlugin(const char* name, const Plugi
sm,
q_scaling,
weightFilePath,
+ use_layernorm_in_conv_module,
useFP16);
return p;
}
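With "use_layernorm_in_conv_module" registered as a kINT32 plugin field, a network-building program can enable the LayerNorm path at plugin-creation time. A minimal sketch, assuming the creator is fetched from the TensorRT plugin registry; the plugin name/version strings and field values are placeholders, not taken from this PR:

#include <NvInfer.h>
#include <cstring>

nvinfer1::IPluginV2* makeEncoderPlugin()
{
    using namespace nvinfer1;
    auto* creator = getPluginRegistry()->getPluginCreator("WenetEncoderPlugin", "1");
    int use_layernorm = 1;  // 1: LayerNorm in the conformer conv module
    int useFP16 = 1;
    const char* weightFilePath = "/weight/enc/";  // example path
    PluginField fields[] = {
        {"use_layernorm_in_conv_module", &use_layernorm, PluginFieldType::kINT32, 1},
        {"useFP16", &useFP16, PluginFieldType::kINT32, 1},
        {"weightFilePath", weightFilePath, PluginFieldType::kCHAR,
         (int32_t)std::strlen(weightFilePath)},
    };
    PluginFieldCollection fc{3, fields};
    return creator->createPlugin("wenet_encoder", &fc);
}

Fields left out of the collection simply keep the defaults listed in createPlugin above.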
31 changes: 17 additions & 14 deletions src/fastertransformer/tensorrt_plugin/wenet/EncoderPlugin.h
@@ -45,6 +45,7 @@
printf("\tnum_layer=%ld\n", m_.num_layer); \
printf("\tsm=%d\n", m_.sm); \
printf("\tq_scaling=%f\n", m_.q_scaling); \
printf("\tuse_layernorm_in_cnn_module=%d\n", m_.use_layernorm_in_cnn_module); \
printf("\tuseFP16=%d\n", m_.useFP16); \
printf("\tweightFilePath=%s\n", m_.weightFilePath); \
printf("\tvocab_size=%ld\n", m_.vocab_size); \
@@ -98,20 +99,21 @@ class WenetEncoderPlugin: public IPluginV2DynamicExt {
WenetEncoder<float>* pWenetEncoderFloat_ = nullptr;
struct {
// constructor parameter
- size_t max_batch_size = 16;
- size_t max_seq_len = 256;
- size_t head_num = 8;
- size_t size_per_head = 32;
- size_t feature_size = 80;
- size_t max_len = 5000;
- size_t inter_size = head_num * size_per_head * 4;
- size_t d_model = head_num * size_per_head;
- size_t num_layer = 12;
- size_t vocab_size = 4233;
- size_t conv_module_kernel_size = 15;
- int sm = -1; // assign later
- float q_scaling = 1.0f / (1.0f * sqrt(size_per_head));
- bool useFP16 = false;
+ size_t max_batch_size = 16;
+ size_t max_seq_len = 256;
+ size_t head_num = 8;
+ size_t size_per_head = 32;
+ size_t feature_size = 80;
+ size_t max_len = 5000;
+ size_t inter_size = head_num * size_per_head * 4;
+ size_t d_model = head_num * size_per_head;
+ size_t num_layer = 12;
+ size_t vocab_size = 4233;
+ size_t conv_module_kernel_size = 15;
+ int sm = -1; // assign later
+ float q_scaling = 1.0f / (1.0f * sqrt(size_per_head));
+ bool use_layernorm_in_conv_module = false;
+ bool useFP16 = false;
// internal parameter
bool is_remove_padding = false;
bool is_free_buffer_after_forward = false;
@@ -144,6 +146,7 @@ class WenetEncoderPlugin: public IPluginV2DynamicExt {
int sm,
float q_scaling,
const std::string& weightFilePath,
+ int use_layernorm_in_conv_module,
int useFP16);
WenetEncoderPlugin(const std::string& name, const void* buffer, size_t length);
~WenetEncoderPlugin();
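Because the new flag lives inside the POD parameter struct m_, it travels with the plugin's serialized state automatically, but it also changes the size of that state. A sketch of the memcpy-style (de)serialization FasterTransformer plugins of this kind typically use, showing why engines serialized before this change would no longer deserialize; the method bodies are assumed, not quoted from the PR:

#include <cstring>

// Assumed pattern: the whole struct m_ is written out verbatim, so adding
// use_layernorm_in_conv_module changes getSerializationSize() and engines
// built with the old layout fail the size check on load.
size_t WenetEncoderPlugin::getSerializationSize() const noexcept
{
    return sizeof(m_);
}

void WenetEncoderPlugin::serialize(void* buffer) const noexcept
{
    memcpy(buffer, &m_, sizeof(m_));
}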
4 changes: 2 additions & 2 deletions src/fastertransformer/utils/wenet_conv2d.h
@@ -156,8 +156,8 @@ void conv2d(T* output,
&ws_size));
FT_LOG_DEBUG("Convolution algorithm: %d with workspace size: %d \n", convolution_algorithm_, ws_size);
FT_CHECK_WITH_INFO(
- ws_size <= (1 << 27),
- "Current workspace used for CuDNN Convolution is fixed as 1 << 27, please increase it in WenetEncoder::allocateBuffer!");
+ ws_size <= (1 << 29),
+ "Current workspace used for CuDNN Convolution is fixed as 1 << 29, please increase it in WenetEncoder::allocateBuffer!");
// void *ws_data;
// if (ws_size > 0) {
// check_cuda_error(cudaMalloc(&ws_data, ws_size));
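The same 1 << 29 literal now appears both here and in WenetEncoder::allocateBuffer, and the two sites must be kept in sync by hand. A small sketch of one way to tie them together; kConvWorkspaceBytes is a hypothetical name, not an identifier from this PR:

#include <cstddef>

// One shared constant keeps the allocation and the guard in agreement.
constexpr size_t kConvWorkspaceBytes = size_t(1) << 29;  // 512 MiB

// In WenetEncoder<T>::allocateBuffer():
//   conv_workspace_ = (T*)allocator_->reMalloc(conv_workspace_, kConvWorkspaceBytes, false);
// In conv2d():
//   FT_CHECK_WITH_INFO(ws_size <= kConvWorkspaceBytes, "Increase kConvWorkspaceBytes!");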