Add Layernorm option for Wenet trt (#434)
* update layernorm option

* update layernorm option

* remove hard-coded path

* lint

* lint

---------

Co-authored-by: root <root@node1.cluster.local>
yuekaizhang and root authored Feb 8, 2023
1 parent 7921074 commit 3357ef6
Showing 5 changed files with 68 additions and 57 deletions.
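
Downstream, the new flag is exposed as a regular TensorRT plugin field. The snippet below is a hypothetical usage sketch, not part of this commit: the field names follow WenetEncoderPluginCreator::attr_ as changed in EncoderPlugin.cc, while the plugin registration strings, the helper name, and the surrounding engine-building code are assumptions.

// Hypothetical sketch: passing use_layernorm_in_conv_module through the
// TensorRT plugin creator. "WenetEncoderPlugin"/"1" are assumed registration
// strings; field names match WenetEncoderPluginCreator::attr_ in this commit.
#include <NvInfer.h>
#include <string>
#include <vector>

nvinfer1::IPluginV2* makeWenetEncoderPlugin(const std::string& weightDir, bool useLayerNormInConv)
{
    using namespace nvinfer1;
    int32_t use_ln   = useLayerNormInConv ? 1 : 0;
    int32_t use_fp16 = 1;

    std::vector<PluginField> fields = {
        {"use_layernorm_in_conv_module", &use_ln, PluginFieldType::kINT32, 1},
        {"useFP16", &use_fp16, PluginFieldType::kINT32, 1},
        {"weightFilePath", weightDir.c_str(), PluginFieldType::kCHAR,
         static_cast<int32_t>(weightDir.size()) + 1},  // include the '\0'
    };

    PluginFieldCollection fc;
    fc.nbFields = static_cast<int32_t>(fields.size());
    fc.fields   = fields.data();

    // Adjust the name/version strings to the actual registration of this plugin.
    auto* creator = getPluginRegistry()->getPluginCreator("WenetEncoderPlugin", "1");
    return creator != nullptr ? creator->createPlugin("wenet_encoder", &fc) : nullptr;
}
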
4 changes: 2 additions & 2 deletions src/fastertransformer/models/wenet/WenetEncoder.cc
@@ -213,8 +213,8 @@ void WenetEncoder<T>::allocateBuffer(
inter_conv2_output_buf_, sizeof(T) * batch_size * d_model_ * seq_len2 * feature_size2, false);
inter_fc_input_buf_ = (T*)allocator_->reMalloc(
inter_fc_input_buf_, sizeof(T) * batch_size * seq_len2 * d_model_ * feature_size2, false);
- // Current workspace used for CuDNN Convolution is 1 << 27
- conv_workspace_ = (T*)allocator_->reMalloc(conv_workspace_, 1 << 27, false);
+ // Current workspace used for CuDNN Convolution is 1 << 29
+ conv_workspace_ = (T*)allocator_->reMalloc(conv_workspace_, 1 << 29, false);
// Position Embed

input_hidden_state_ =
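
For scale, the fixed cuDNN convolution workspace grows fourfold with this change: (1 << 27) bytes is 128 MiB, while (1 << 29) bytes is 512 MiB. The matching size check in wenet_conv2d.h is updated below.
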
3 changes: 1 addition & 2 deletions src/fastertransformer/tensorrt_plugin/wenet/DecoderPlugin.cc
@@ -67,8 +67,7 @@ void WenetDecoderPlugin::CreateFT()
#endif

// Wenet DecoderWeight
- std::string weightFilePath = "/weight/dec/";
- // std::string weightFilePath = m_.weightFilePath;
+ std::string weightFilePath = m_.weightFilePath;
FT_LOG_WARNING("The default weight file path is %s. Change it accordingly, otherwise model will fail to load! \n",
weightFilePath.c_str());
if (m_.useFP16) {
83 changes: 46 additions & 37 deletions src/fastertransformer/tensorrt_plugin/wenet/EncoderPlugin.cc
@@ -38,27 +38,29 @@ WenetEncoderPlugin::WenetEncoderPlugin(const std::string& name,
int sm,
float q_scaling,
const std::string& weightFilePath,
+ int use_layernorm_in_conv_module,
int useFP16):
name_(name)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
WHERE_AM_I();
- m_.max_batch_size = max_batch_size;
- m_.max_seq_len = max_seq_len;
- m_.head_num = head_num;
- m_.size_per_head = size_per_head;
- m_.feature_size = feature_size;
- m_.max_len = max_len;
- m_.inter_size = inter_size;
- m_.d_model = d_model;
- m_.num_layer = num_layer;
- m_.vocab_size = vocab_size;
- m_.conv_module_kernel_size = conv_module_kernel_size;
- m_.sm = sm;
- m_.q_scaling = q_scaling;
- m_.useFP16 = (bool)useFP16;
- m_.batch_size = m_.max_batch_size;
- m_.seq_len = m_.max_seq_len;
+ m_.max_batch_size = max_batch_size;
+ m_.max_seq_len = max_seq_len;
+ m_.head_num = head_num;
+ m_.size_per_head = size_per_head;
+ m_.feature_size = feature_size;
+ m_.max_len = max_len;
+ m_.inter_size = inter_size;
+ m_.d_model = d_model;
+ m_.num_layer = num_layer;
+ m_.vocab_size = vocab_size;
+ m_.conv_module_kernel_size = conv_module_kernel_size;
+ m_.sm = sm;
+ m_.q_scaling = q_scaling;
+ m_.use_layernorm_in_conv_module = (bool)use_layernorm_in_conv_module;
+ m_.useFP16 = (bool)useFP16;
+ m_.batch_size = m_.max_batch_size;
+ m_.seq_len = m_.max_seq_len;
strcpy(m_.weightFilePath, weightFilePath.c_str());

CreateFT();
@@ -74,8 +76,7 @@ void WenetEncoderPlugin::CreateFT()
cudnnCreate(&cudnn_handle_);

// Wenet EncoderWeight
- std::string weightFilePath = "/weight/enc/";
- // std::string weightFilePath = m_.weightFilePath;
+ std::string weightFilePath = m_.weightFilePath;
FT_LOG_WARNING("The default weight file path is %s. Change it accordingly, otherwise model will fail to load! \n",
weightFilePath.c_str());
if (m_.useFP16) {
@@ -88,7 +89,8 @@ void WenetEncoderPlugin::CreateFT()
m_.conv_module_kernel_size,
m_.feature_size,
m_.max_len,
- m_.num_layer);
+ m_.num_layer,
+ m_.use_layernorm_in_conv_module);
pWenetEncoderWeightHalf_->loadModel(weightFilePath);
}
else {
@@ -101,7 +103,8 @@ void WenetEncoderPlugin::CreateFT()
m_.conv_module_kernel_size,
m_.feature_size,
m_.max_len,
- m_.num_layer);
+ m_.num_layer,
+ m_.use_layernorm_in_conv_module);
pWenetEncoderWeightFloat_->loadModel(weightFilePath);
}

@@ -143,7 +146,8 @@ void WenetEncoderPlugin::CreateFT()
m_.is_free_buffer_after_forward,
m_.attention_type,
m_.is_sparse,
- m_.activation_type);
+ m_.activation_type,
+ m_.use_layernorm_in_conv_module);
}
else {
pCublasWrapper_->setFP32GemmConfig();
@@ -168,7 +172,8 @@ void WenetEncoderPlugin::CreateFT()
m_.is_free_buffer_after_forward,
m_.attention_type,
m_.is_sparse,
- m_.activation_type);
+ m_.activation_type,
+ m_.use_layernorm_in_conv_module);
}
PRINT_ENCODER(m_.useFP16)
}
@@ -481,6 +486,7 @@ std::vector<PluginField> WenetEncoderPluginCreator::attr_{
{"vocab_size", nullptr, nvinfer1::PluginFieldType::kINT32, 0},
{"conv_module_kernel_size", nullptr, nvinfer1::PluginFieldType::kINT32, 0},
{"sm", nullptr, nvinfer1::PluginFieldType::kINT32, 0},
{"use_layernorm_in_conv_module", nullptr, nvinfer1::PluginFieldType::kINT32, 0},
{"useFP16", nullptr, nvinfer1::PluginFieldType::kINT32, 0},
{"q_scaling", nullptr, nvinfer1::PluginFieldType::kFLOAT32, 0},
{"weightFilePath", nullptr, nvinfer1::PluginFieldType::kCHAR, 0}};
@@ -501,21 +507,22 @@ IPluginV2* WenetEncoderPluginCreator::createPlugin(const char* name, const Plugi
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
WHERE_AM_I();
- int max_batch_size = 128;
- int max_seq_len = 384;
- int head_num = 8;
- int size_per_head = 32;
- int feature_size = 80;
- int max_len = 5000;
- int d_model = head_num * size_per_head;
- int inter_size = d_model * 4;
- int num_layer = 12;
- int vocab_size = 4233;
- int conv_module_kernel_size = 15;
- int sm = -1;
- float q_scaling = 1.0f / (sqrt(size_per_head) * 1.0f);
- std::string weightFilePath = "";
- int useFP16 = 0;
+ int max_batch_size = 128;
+ int max_seq_len = 384;
+ int head_num = 8;
+ int size_per_head = 32;
+ int feature_size = 80;
+ int max_len = 5000;
+ int d_model = head_num * size_per_head;
+ int inter_size = d_model * 4;
+ int num_layer = 12;
+ int vocab_size = 4233;
+ int conv_module_kernel_size = 15;
+ int sm = -1;
+ float q_scaling = 1.0f / (sqrt(size_per_head) * 1.0f);
+ std::string weightFilePath = "";
+ int use_layernorm_in_conv_module = 0;
+ int useFP16 = 0;

struct cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
@@ -534,6 +541,7 @@ IPluginV2* WenetEncoderPluginCreator::createPlugin(const char* name, const Plugi
{"vocab_size", &vocab_size},
{"conv_module_kernel_size", &conv_module_kernel_size},
{"sm", &sm},
{"use_layernorm_in_conv_module", &use_layernorm_in_conv_module},
{"useFP16", &useFP16},
};
for (int i = 0; i < fc->nbFields; i++) {
@@ -562,6 +570,7 @@ IPluginV2* WenetEncoderPluginCreator::createPlugin(const char* name, const Plugi
sm,
q_scaling,
weightFilePath,
+ use_layernorm_in_conv_module,
useFP16);
return p;
}
31 changes: 17 additions & 14 deletions src/fastertransformer/tensorrt_plugin/wenet/EncoderPlugin.h
@@ -45,6 +45,7 @@
printf("\tnum_layer=%ld\n", m_.num_layer); \
printf("\tsm=%d\n", m_.sm); \
printf("\tq_scaling=%f\n", m_.q_scaling); \
printf("\tuse_layernorm_in_cnn_module=%d\n", m_.use_layernorm_in_cnn_module); \
printf("\tuseFP16=%d\n", m_.useFP16); \
printf("\tweightFilePath=%s\n", m_.weightFilePath); \
printf("\tvocab_size=%ld\n", m_.vocab_size); \
@@ -98,20 +99,21 @@ class WenetEncoderPlugin: public IPluginV2DynamicExt {
WenetEncoder<float>* pWenetEncoderFloat_ = nullptr;
struct {
// constructor parameter
- size_t max_batch_size = 16;
- size_t max_seq_len = 256;
- size_t head_num = 8;
- size_t size_per_head = 32;
- size_t feature_size = 80;
- size_t max_len = 5000;
- size_t inter_size = head_num * size_per_head * 4;
- size_t d_model = head_num * size_per_head;
- size_t num_layer = 12;
- size_t vocab_size = 4233;
- size_t conv_module_kernel_size = 15;
- int sm = -1; // assign later
- float q_scaling = 1.0f / (1.0f * sqrt(size_per_head));
- bool useFP16 = false;
+ size_t max_batch_size = 16;
+ size_t max_seq_len = 256;
+ size_t head_num = 8;
+ size_t size_per_head = 32;
+ size_t feature_size = 80;
+ size_t max_len = 5000;
+ size_t inter_size = head_num * size_per_head * 4;
+ size_t d_model = head_num * size_per_head;
+ size_t num_layer = 12;
+ size_t vocab_size = 4233;
+ size_t conv_module_kernel_size = 15;
+ int sm = -1; // assign later
+ float q_scaling = 1.0f / (1.0f * sqrt(size_per_head));
+ bool use_layernorm_in_conv_module = false;
+ bool useFP16 = false;
// internal parameter
bool is_remove_padding = false;
bool is_free_buffer_after_forward = false;
@@ -144,6 +146,7 @@ class WenetEncoderPlugin: public IPluginV2DynamicExt {
int sm,
float q_scaling,
const std::string& weightFilePath,
+ int use_layernorm_in_conv_module,
int useFP16);
WenetEncoderPlugin(const std::string& name, const void* buffer, size_t length);
~WenetEncoderPlugin();
4 changes: 2 additions & 2 deletions src/fastertransformer/utils/wenet_conv2d.h
@@ -156,8 +156,8 @@ void conv2d(T* output,
&ws_size));
FT_LOG_DEBUG("Convolution algorithm: %d with workspace size: %d \n", convolution_algorithm_, ws_size);
FT_CHECK_WITH_INFO(
- ws_size <= (1 << 27),
- "Current workspace used for CuDNN Convolution is fixed as 1 << 27, please increase it in WenetEncoder::allocateBuffer!");
+ ws_size <= (1 << 29),
+ "Current workspace used for CuDNN Convolution is fixed as 1 << 29, please increase it in WenetEncoder::allocateBuffer!");
// void *ws_data;
// if (ws_size > 0) {
// check_cuda_error(cudaMalloc(&ws_data, ws_size));
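
For reference, a minimal sketch of the workspace-size check the assertion above guards, assuming the cuDNN handle, tensor/filter/convolution descriptors, and forward algorithm have already been set up elsewhere; the helper name and the printf are illustrative only, not code from this commit.

// Minimal sketch (not from this commit): query the forward-convolution workspace
// and compare it against the fixed 1 << 29 buffer allocated in WenetEncoder::allocateBuffer.
#include <cudnn.h>
#include <cstdio>

constexpr size_t kConvWorkspaceBytes = size_t(1) << 29;  // 512 MiB

bool convWorkspaceFits(cudnnHandle_t handle,
                       cudnnTensorDescriptor_t x_desc,
                       cudnnFilterDescriptor_t w_desc,
                       cudnnConvolutionDescriptor_t conv_desc,
                       cudnnTensorDescriptor_t y_desc,
                       cudnnConvolutionFwdAlgo_t algo)
{
    size_t ws_size = 0;
    if (cudnnGetConvolutionForwardWorkspaceSize(handle, x_desc, w_desc, conv_desc, y_desc, algo, &ws_size)
        != CUDNN_STATUS_SUCCESS) {
        return false;
    }
    std::printf("conv workspace needed: %zu bytes (fixed buffer: %zu bytes)\n", ws_size, kConvWorkspaceBytes);
    return ws_size <= kConvWorkspaceBytes;  // mirrors FT_CHECK_WITH_INFO above
}
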
