[Paddle Inference] General optimization for no_varlen embedding layernorm (PaddlePaddle#48580)

* general optimization no_varlen embedding layernorm
Wangzheee authored Dec 8, 2022
1 parent c409eca commit f41809b
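
For context, which branch of these fuse passes runs is decided purely by predictor configuration. Below is a minimal sketch of the no_varseqlen path this commit optimizes, assuming the Paddle 2.4-era C++ API (model file names are placeholders; verify the calls against your release):

    #include <memory>

    #include "paddle_inference_api.h"

    int main() {
      paddle_infer::Config config;
      config.SetModel("ernie.pdmodel", "ernie.pdiparams");  // placeholder model files
      config.EnableUseGpu(256 /* initial MB */, 0 /* GPU id */);
      // Build a TensorRT subgraph engine. With this commit, the fused
      // embedding + eltwise + layernorm conversion no longer requires
      // varseqlen and is no longer excluded on Windows.
      config.EnableTensorRtEngine(1 << 30,  // workspace size in bytes
                                  1,        // max batch size
                                  3,        // min subgraph size
                                  paddle_infer::PrecisionType::kHalf,
                                  false,    // use_static
                                  false);   // use_calib_mode
      // Leaving EnableVarseqlen(), pos_id, and mask_id unset selects the
      // no_varseqlen branch of the passes changed below.
      auto predictor = paddle_infer::CreatePredictor(config);
      return predictor ? 0 : 1;
    }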
Showing 20 changed files with 1,357 additions and 963 deletions.
2 changes: 1 addition & 1 deletion paddle/fluid/framework/ir/CMakeLists.txt
@@ -140,7 +140,7 @@ if(WITH_TENSORRT)
   pass_library(preln_layernorm_x_fuse_pass inference)
 endif()
 
-if(WITH_TENSORRT AND NOT WIN32)
+if(WITH_TENSORRT)
   pass_library(trt_embedding_eltwise_layernorm_fuse_pass inference)
   pass_library(preln_embedding_eltwise_layernorm_fuse_pass inference)
 endif()
4 changes: 2 additions & 2 deletions paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc
@@ -1170,14 +1170,14 @@ void TrtMultiHeadMatmulV2FusePass::ApplyImpl(Graph* graph) const {
           "preln_embedding_eltwise_layernorm_fuse_"
           "pass. please use no_varseqlen"));
     }
-  } else if (!use_varseqlen && pos_id == "" && mask_id == "") {
+  } else if (!use_varseqlen && pos_id == "") {
     VLOG(3) << "start no_varseqlen_trt_multihead_matmul_fuse_pass";
   } else {
     PADDLE_THROW(
         platform::errors::Fatal("Use transformer'varseqlen need config: "
                                 "use_varseqlen, set pos_id, set "
                                 "mask_id. Or not use varseqlen, do not set "
-                                "pos_id, set mask_id. Please "
+                                "pos_id. Please "
                                 "reconfig"));
   }
   graph->Set(kMultiheadMatmulPass, new bool(true));
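The reworked condition accepts exactly two configurations: varseqlen with both pos_id and mask_id set, or no varseqlen with pos_id left unset (after this change, mask_id no longer has to be cleared). For the varseqlen side, a hedged sketch extending the config object from the sketch above (setter names follow the 2.4-era AnalysisConfig and should be checked against your release):

    // Varseqlen ("effective transformer") path: all three settings are
    // required together, mirroring the check in
    // TrtMultiHeadMatmulV2FusePass::ApplyImpl.
    config.EnableVarseqlen();
    config.SetTensorRtTransformerPosid("pos_id");    // assumed position-id input name
    config.SetTensorRtTransformerMaskid("mask_id");  // assumed mask input name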
3 changes: 0 additions & 3 deletions paddle/fluid/inference/api/analysis_predictor.cc
@@ -2338,11 +2338,8 @@ USE_TRT_CONVERTER(conv3d_transpose);
 USE_TRT_CONVERTER(mish);
 USE_TRT_CONVERTER(deformable_conv);
 USE_TRT_CONVERTER(pool3d)
-#ifdef _WIN32
-#else
 USE_TRT_CONVERTER(fused_preln_embedding_eltwise_layernorm)
 USE_TRT_CONVERTER(fused_embedding_eltwise_layernorm);
-#endif
 USE_TRT_CONVERTER(preln_skip_layernorm)
 USE_TRT_CONVERTER(preln_residual_bias)
 USE_TRT_CONVERTER(c_allreduce_sum)
56 changes: 25 additions & 31 deletions paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -95,39 +95,33 @@ const std::vector<std::string> kTRTSubgraphPasses({
       "identity_scale_op_clean_pass",  //
       "add_support_int8_pass",         //
       // "fc_fuse_pass",               //
-      "simplify_with_basic_ops_pass",  //
-
-#if defined _WIN32
-#else
+      "simplify_with_basic_ops_pass",                 //
       "trt_embedding_eltwise_layernorm_fuse_pass",    //
       "preln_embedding_eltwise_layernorm_fuse_pass",  //
-#endif
-
-      "delete_c_identity_op_pass",                  //
-      "trt_multihead_matmul_fuse_pass_v2",          //
-      "trt_multihead_matmul_fuse_pass_v3",          //
-      "multihead_matmul_roformer_fuse_pass",        //
-      "constant_folding_pass",                      //
-      "vit_attention_fuse_pass",                    //
-      "trt_skip_layernorm_fuse_pass",               //
-      "preln_skip_layernorm_fuse_pass",             //
-      "layernorm_shift_partition_fuse_pass",        //
-      "merge_layernorm_fuse_pass",                  //
-      "preln_residual_bias_fuse_pass",              //
-      "preln_layernorm_x_fuse_pass",                //
-      "reverse_roll_fuse_pass",                     //
-      // "set_transformer_input_convert_pass",      //
-      "conv_bn_fuse_pass",                          //
-      "unsqueeze2_eltwise_fuse_pass",               //
-      "trt_squeeze2_matmul_fuse_pass",              //
-      "trt_flatten2_matmul_fuse_pass",              //
-      "trt_map_matmul_v2_to_mul_pass",              //
-      "trt_map_matmul_v2_to_matmul_pass",           //
-      "trt_map_matmul_to_mul_pass",                 //
-      "fc_fuse_pass",                               //
-      "conv_elementwise_add_fuse_pass",             //
-      "remove_padding_recover_padding_pass",        //
-      "delete_remove_padding_recover_padding_pass", //
+      "delete_c_identity_op_pass",                  //
+      "trt_multihead_matmul_fuse_pass_v2",          //
+      "trt_multihead_matmul_fuse_pass_v3",          //
+      "multihead_matmul_roformer_fuse_pass",        //
+      "constant_folding_pass",                      //
+      "vit_attention_fuse_pass",                    //
+      "trt_skip_layernorm_fuse_pass",               //
+      "preln_skip_layernorm_fuse_pass",             //
+      "layernorm_shift_partition_fuse_pass",        //
+      "merge_layernorm_fuse_pass",                  //
+      "preln_residual_bias_fuse_pass",              //
+      "preln_layernorm_x_fuse_pass",                //
+      "reverse_roll_fuse_pass",                     //
+      "conv_bn_fuse_pass",                          //
+      "unsqueeze2_eltwise_fuse_pass",               //
+      "trt_squeeze2_matmul_fuse_pass",              //
+      "trt_flatten2_matmul_fuse_pass",              //
+      "trt_map_matmul_v2_to_mul_pass",              //
+      "trt_map_matmul_v2_to_matmul_pass",           //
+      "trt_map_matmul_to_mul_pass",                 //
+      "fc_fuse_pass",                               //
+      "conv_elementwise_add_fuse_pass",             //
+      "remove_padding_recover_padding_pass",        //
+      "delete_remove_padding_recover_padding_pass", //
       // "yolo_box_fuse_pass",                      //
       "dense_fc_to_sparse_pass",                    //
       "dense_multihead_matmul_to_sparse_pass",      //
2 changes: 1 addition & 1 deletion paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -94,7 +94,7 @@ list(
     fused_lookup_tables_op.cc
     expand_v2_op.cc)
 
-if(${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 7 AND NOT WIN32)
+if(${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 7)
   list(APPEND CONVERT_FILES emb_eltwise_layernorm.cc
        preln_emb_eltwise_layernorm.cc)
 endif()
197 changes: 79 additions & 118 deletions paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
@@ -13,7 +13,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/convert/utils.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
-#include "paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h"
+#include "paddle/fluid/inference/tensorrt/plugin/many_emb_layernorm_plugin.h"
+#include "paddle/fluid/inference/tensorrt/plugin/many_emb_layernorm_varseqlen_plugin.h"
 #include "paddle/phi/core/ddim.h"

@@ -36,7 +36,6 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
                   const framework::Scope& scope,
                   bool test_mode) override {
     VLOG(4) << "convert fluid EmbEltwiseLayerNorm op to tensorrt layer";
-
     // get the presistable var's data
     auto GetWeight = [&](const std::string& var_name,
                          framework::DDim* dim) -> TensorRTEngine::Weight {
@@ -47,32 +46,13 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
       return weight;
     };
 
-    auto GetFp16Weight = [&](const std::string& var_name,
-                             framework::DDim* dim) -> TensorRTEngine::Weight {
-      auto* temp_var = scope.FindVar(var_name);
-      auto* temp_tensor = temp_var->GetMutable<phi::DenseTensor>();
-      *dim = temp_tensor->dims();
-      auto weight = engine_->GetFp16TrtWeight(var_name, *temp_tensor);
-      return weight;
-    };
-
-    auto GetFp32Weight = [&](const std::string& var_name,
-                             framework::DDim* dim) -> TensorRTEngine::Weight {
-      auto* temp_var = scope.FindVar(var_name);
-      auto* temp_tensor = temp_var->GetMutable<phi::DenseTensor>();
-      *dim = temp_tensor->dims();
-      auto weight = engine_->GetFp32TrtWeight(var_name, *temp_tensor);
-      return weight;
-    };
-
     framework::OpDesc op_desc(op, nullptr);
     auto pos_id_name = engine_->tensorrt_transformer_posid();
     auto mask_id_name = engine_->tensorrt_transformer_maskid();
     bool flag_varseqlen =
         engine_->use_varseqlen() && pos_id_name != "" && mask_id_name != "";
-    bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
-    int hidden = 0;
-    // Declare inputs
+    // bool with_fp16 = engine_->WithFp16() &&
+    // !engine_->disable_trt_plugin_fp16(); int hidden = 0; Declare inputs
     std::vector<nvinfer1::ITensor*> input_ids;
 
     // Declare inputs_weight
@@ -95,55 +75,6 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
     if (flag_varseqlen) {
       engine_->SetITensor("pos_id", engine_->GetITensor(pos_id_name));
       engine_->SetITensor("mask_id", engine_->GetITensor(mask_id_name));
-
-      auto mask_id_tensor = engine_->GetITensor("mask_id");
-      auto mask_dims = mask_id_tensor->getDimensions();
-      auto slice_start_dims = mask_dims;
-      auto slice_stride_dims = mask_dims;
-
-      for (int i = 0; i < mask_dims.nbDims; i++) {
-        slice_start_dims.d[i] = 0;
-        slice_stride_dims.d[i] = 1;
-      }
-
-      auto* shape_tensor = Shape(mask_id_tensor);
-      std::vector<nvinfer1::ITensor*> size_vec_tensor;
-      std::vector<nvinfer1::ITensor*> start_vec_tensor;
-      for (int i = 0; i < mask_dims.nbDims; i++) {
-        size_vec_tensor.push_back(Add1DConstantLayer(1));
-        start_vec_tensor.push_back(Add1DConstantLayer(0));
-      }
-      size_vec_tensor[1] = GetEleTensorOfShape(shape_tensor, 1);
-      auto size_tensor = Concat(size_vec_tensor);
-      auto start_tensor = Concat(start_vec_tensor);
-
-      auto slice_layer =
-          TRT_ENGINE_ADD_LAYER(engine_,
-                               Slice,
-                               *mask_id_tensor,
-                               slice_start_dims,
-                               slice_start_dims,
-                               slice_stride_dims);  // unuseful slice_start_dims
-      slice_layer->setInput(1, *start_tensor);
-      slice_layer->setInput(2, *size_tensor);
-      slice_layer->setName(
-          ("Embeltwise_slice_layer (Output: slice_max_seqlen " +
-           op_desc.Output("Out")[0] + ")")
-              .c_str());
-      engine_->SetTensorDynamicRange(slice_layer->getOutput(0), 1.0f);
-
-      auto* reshape_layer =
-          TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *slice_layer->getOutput(0));
-      nvinfer1::Dims shape_dim;
-      shape_dim.nbDims = 1;
-      shape_dim.d[0] = -1;
-      reshape_layer->setReshapeDimensions(shape_dim);
-      reshape_layer->setName(("Embeltwise_reshape_layer (Output: max_seqlen " +
-                              op_desc.Output("Out")[0] + ")")
-                                 .c_str());
-      engine_->SetTensorDynamicRange(reshape_layer->getOutput(0), 1.0f);
-      engine_->SetITensor("max_seqlen_tensor", reshape_layer->getOutput(0));
-
       for (int i = 0; i < input_num; i++) {
         auto input_tensor = engine_->GetITensor(id_names[i]);
         weight = GetWeight(emb_names[i], &emb_dims);
@@ -156,7 +87,6 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
         input_embs.push_back(weight.get());
         emb_sizes.push_back(weight.get().count);
       }
-      hidden = emb_dims[1];
     }
     bias_weight = GetWeight(op_desc.Input("Bias").front(), &bias_dims);
     scale_weight = GetWeight(op_desc.Input("Scale").front(), &scale_dims);
@@ -206,26 +136,29 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
       plugin_ptr->fields = fields.data();
 
       std::vector<nvinfer1::ITensor*> plugin_inputs = input_ids;
-      plugin_inputs.emplace_back(engine_->GetITensor(
-          "max_seqlen_tensor"));  // max_seqlen, eval_placeholder_3
-
+      plugin_inputs.emplace_back(
+          engine_->GetITensor("mask_id"));  // input mask_id
       auto creator = GetPluginRegistry()->getPluginCreator(
-          "ManyEmbLayerNormPluginDynamic", "1");
-      auto plugin_obj =
-          creator->createPlugin("ManyEmbLayerNormPluginDynamic", plugin_ptr);
+          "ManyEmbLayerNormVarlenPluginDynamic", "1");
+      auto plugin_obj = creator->createPlugin(
+          "ManyEmbLayerNormVarlenPluginDynamic", plugin_ptr);
 
       auto plugin_layer = engine_->network()->addPluginV2(
           plugin_inputs.data(), plugin_inputs.size(), *plugin_obj);
 
-      plugin_layer->setName(("ManyEmbLayerNormPluginDynamic_V1(Output: " +
+      plugin_layer->setName(("ManyEmbLayerNormVarlenPluginDynamicV1(Output: " +
                              op_desc.Output("Out")[0] + ")")
                                 .c_str());
      free(plugin_ptr);
       if (enable_int8) {
         float out_scale =
             PADDLE_GET_CONST(float, op_desc.GetAttr("out_threshold"));
-        engine_->SetTensorDynamicRange(plugin_layer->getOutput(0), out_scale);
-        engine_->SetTensorDynamicRange(plugin_layer->getOutput(1), out_scale);
+        engine_->SetTensorDynamicRange(plugin_layer->getOutput(0),
+                                       out_scale);  // output
+        engine_->SetTensorDynamicRange(plugin_layer->getOutput(1),
+                                       out_scale);  // mask
+        engine_->SetTensorDynamicRange(plugin_layer->getOutput(2),
+                                       out_scale);  // max seqlen
       }
       if (engine_->with_interleaved()) {
         VLOG(4) << "fused emb_eltwise_layernorm op: use_varseqlen and "
@@ -249,54 +182,82 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
         auto output_name = op_desc.Output("Out")[0];
         RreplenishLayerAndOutput(layer,
                                  "ManyEmbLayerNormPluginDynamic_V1",
-                                 {output_name, std::string("qkv_plugin_mask")},
+                                 {output_name,
+                                  std::string("qkv_plugin_mask"),
+                                  std::string("max_seqlen_tensor")},
                                  test_mode);
       }
     } else {
       for (int i = 0; i < input_num; i++) {
-        if (with_fp16) {
-          weight = GetFp16Weight(emb_names[i], &emb_dims);
-        } else {
-          weight = GetFp32Weight(emb_names[i], &emb_dims);
-        }
-        input_ids.push_back(engine_->GetITensor(id_names[i]));
+        auto input_tensor = engine_->GetITensor(id_names[i]);
+        weight = GetWeight(emb_names[i], &emb_dims);
+        input_ids.push_back(input_tensor);
         input_embs.push_back(weight.get());
         emb_sizes.push_back(weight.get().count);
-        hidden = emb_dims[1];
+        // hidden = emb_dims[1];
       }
-      if (with_fp16) {
-        bias_weight = GetFp16Weight(op_desc.Input("Bias").front(), &bias_dims);
-        scale_weight =
-            GetFp16Weight(op_desc.Input("Scale").front(), &scale_dims);
-      } else {
-        bias_weight = GetFp32Weight(op_desc.Input("Bias").front(), &bias_dims);
-        scale_weight =
-            GetFp32Weight(op_desc.Input("Scale").front(), &scale_dims);
-      }
+      bias_weight = GetWeight(op_desc.Input("Bias").front(), &bias_dims);
+      scale_weight = GetWeight(op_desc.Input("Scale").front(), &scale_dims);
       bias_size = phi::product(bias_dims);
       scale_size = phi::product(scale_dims);
-      float eps = PADDLE_GET_CONST(float, op_desc.GetAttr("epsilon"));
-      plugin::DynamicPluginTensorRT* plugin = nullptr;
-      std::vector<void*> input_embs_data;
-      for (size_t i = 0; i < input_embs.size(); ++i) {
-        input_embs_data.push_back(const_cast<void*>(
-            reinterpret_cast<const void*>(input_embs[i].values)));
-      }
-      plugin = new plugin::EmbEltwiseLayernormPluginDynamic(
-          input_embs_data,
-          const_cast<void*>(static_cast<const void*>(bias_weight.get().values)),
-          const_cast<void*>(
-              static_cast<const void*>(scale_weight.get().values)),
-          emb_sizes,
-          bias_size,
-          scale_size,
-          hidden,
-          eps,
-          with_fp16);
-      layer = engine_->AddDynamicPlugin(input_ids.data(), input_num, plugin);
+
+      int output_fp16 = static_cast<int>((engine_->WithFp16() == 1) ? 1 : 0);
+      if (enable_int8) {
+        output_fp16 = 1;
+      }
+
+      std::vector<nvinfer1::PluginField> fields;
+      std::vector<std::string> temp_fields_keys;
+      fields.emplace_back("bert_embeddings_layernorm_beta",
+                          bias_weight.get().values,
+                          GetPluginFieldType(bias_weight.get().type),
+                          static_cast<int32_t>(bias_size));
+      fields.emplace_back("bert_embeddings_layernorm_gamma",
+                          scale_weight.get().values,
+                          GetPluginFieldType(scale_weight.get().type),
+                          static_cast<int32_t>(scale_size));
+      fields.emplace_back(
+          "output_fp16", &output_fp16, nvinfer1::PluginFieldType::kINT32, 1);
+      for (int i = 0; i < input_num; ++i) {
+        temp_fields_keys.push_back("bert_embeddings_word_embeddings_" +
+                                   std::to_string(i));
+        fields.emplace_back(temp_fields_keys.rbegin()->c_str(),
+                            input_embs[i].values,
+                            GetPluginFieldType(input_embs[i].type),
+                            static_cast<int32_t>(emb_sizes[i]));
+      }
+
+      nvinfer1::PluginFieldCollection* plugin_ptr =
+          static_cast<nvinfer1::PluginFieldCollection*>(
+              malloc(sizeof(*plugin_ptr) +
+                     fields.size() * sizeof(nvinfer1::PluginField)));
+      plugin_ptr->nbFields = static_cast<int>(fields.size());
+      plugin_ptr->fields = fields.data();
+
+      std::vector<nvinfer1::ITensor*> plugin_inputs = input_ids;
+
+      auto creator = GetPluginRegistry()->getPluginCreator(
+          "ManyEmbLayerNormPluginDynamic", "1");
+      auto plugin_obj =
+          creator->createPlugin("ManyEmbLayerNormPluginDynamic", plugin_ptr);
+
+      auto plugin_layer = engine_->network()->addPluginV2(
+          plugin_inputs.data(), plugin_inputs.size(), *plugin_obj);
+
+      plugin_layer->setName(("ManyEmbLayerNormPluginDynamicV1(Output: " +
+                             op_desc.Output("Out")[0] + ")")
+                                .c_str());
+      free(plugin_ptr);
+      if (enable_int8) {
+        float out_scale =
+            PADDLE_GET_CONST(float, op_desc.GetAttr("out_threshold"));
+        engine_->SetTensorDynamicRange(plugin_layer->getOutput(0),
+                                       out_scale);  // output
+      }
+      layer = plugin_layer;
       auto output_name = op_desc.Output("Out")[0];
       RreplenishLayerAndOutput(
-          layer, "emb_eltwise_layernorm", {output_name}, test_mode);
+          layer, "ManyEmbLayerNormPluginDynamicV1", {output_name}, test_mode);
     }
   }
 };

paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc
@@ -194,10 +194,10 @@ class PrelnEmbEltwiseLayerNormOpConverter : public OpConverter {
           "max_seqlen_tensor"));  // max_seqlen, eval_placeholder_3
 
       auto creator = GetPluginRegistry()->getPluginCreator(
-          "ManyEmbLayerNormPluginDynamic", "2");
+          "ManyEmbLayerNormVarlenPluginDynamic", "2");
 
-      auto plugin_obj =
-          creator->createPlugin("ManyEmbLayerNormPluginDynamic", plugin_ptr);
+      auto plugin_obj = creator->createPlugin(
+          "ManyEmbLayerNormVarlenPluginDynamic", plugin_ptr);
 
       auto plugin_layer = engine_->network()->addPluginV2(
           plugin_inputs.data(), plugin_inputs.size(), *plugin_obj);