diff --git a/README.md b/README.md index f1f8f9a58..051363ce6 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ paddle2onnx --model_dir saved_inference_model \ | --opset_version | **[可选]** 配置转换为 ONNX 的 OpSet 版本,目前支持 7~16 等多个版本,默认为 9 | | --enable_onnx_checker | **[可选]** 配置是否检查导出为 ONNX 模型的正确性, 建议打开此开关, 默认为 False | | --enable_auto_update_opset | **[可选]** 是否开启 opset version 自动升级功能,当低版本 opset 无法转换时,自动选择更高版本的 opset进行转换, 默认为 True | -| --deploy_backend | **[可选]** 量化模型部署的推理引擎,支持 onnxruntime、tensorrt 或 others,当选择 others 时,所有的量化信息存储于 max_range.txt 文件中,默认为 onnxruntime | +| --deploy_backend | **[可选]** 量化模型部署的推理引擎,支持 onnxruntime/rknn/tensorrt, 默认为 onnxruntime | | --save_calibration_file | **[可选]** TensorRT 8.X版本部署量化模型需要读取的 cache 文件的保存路径,默认为 calibration.cache | | --version | **[可选]** 查看 paddle2onnx 版本 | | --external_filename | **[可选]** 当导出的 ONNX 模型大于 2G 时,需要设置 external data 的存储路径,推荐设置为:external_data | diff --git a/README_en.md b/README_en.md index 336afba62..41a3df30f 100644 --- a/README_en.md +++ b/README_en.md @@ -57,7 +57,7 @@ The adjustable conversion parameters are listed in the following table: | --opset_version | **[Optional]** Configure the OpSet version converted to ONNX, currently supports multiple versions such as 7~16, the default is 9 | | --enable_onnx_checker | **[Optional]** Configure whether to check the correctness of the exported ONNX model, it is recommended to turn on this switch, the default is False | | --enable_auto_update_opset | **[Optional]** Whether to enable the opset version automatic upgrade function, when the lower version of the opset cannot be converted, automatically select the higher version of the opset for conversion, the default is True | -| --deploy_backend | **[Optional]** Inference engine for quantitative model deployment, supports onnxruntime, tensorrt or others, when other is selected, all quantization information is stored in the max_range.txt file, the default is onnxruntime | +| --deploy_backend | **[Optional]** Inference engine for quantitative model deployment, supports onnxruntime/rknn/tensorrt, the default is onnxruntime | | --save_calibration_file | **[Optional]** TensorRT 8.X version deploys the cache file that needs to be read to save the path of the quantitative model, the default is calibration.cache | | --version | **[Optional]** View paddle2onnx version | | --external_filename | **[Optional]** When the exported ONNX model is larger than 2G, you need to set the storage path of external data, the recommended setting is: external_data | diff --git a/paddle2onnx/mapper/exporter.cc b/paddle2onnx/mapper/exporter.cc index 30af82e85..2b43fed9a 100644 --- a/paddle2onnx/mapper/exporter.cc +++ b/paddle2onnx/mapper/exporter.cc @@ -20,6 +20,10 @@ #include #include "onnxoptimizer/optimize.h" +#include "paddle2onnx/mapper/quantize/ort_quantize_processor.h" +#include "paddle2onnx/mapper/quantize/other_quantize_processor.h" +#include "paddle2onnx/mapper/quantize/rknn_quantize_processor.h" +#include "paddle2onnx/mapper/quantize/tensorrt_quantize_processor.h" #include "paddle2onnx/optimizer/convert_fp32_to_fp16.h" #include "paddle2onnx/optimizer/eliminate_non_transpose.h" #include "paddle2onnx/optimizer/fuse_constant_cast.h" @@ -28,29 +32,24 @@ #include "paddle2onnx/optimizer/fuse_paddle_conv_bias.h" #include "paddle2onnx/optimizer/fuse_unsqueeze_conv2d_squeeze.h" -namespace paddle2onnx -{ - MapperHelper *MapperHelper::helper = nullptr; - int32_t OnnxHelper::opset_version = 7; - - bool ModelExporter::IsOpsRegistered(const PaddleParser &parser, bool 
enable_experimental_op) - { - OnnxHelper temp_helper; - std::set unsupported_ops; - for (auto i = 0; i < parser.NumOfBlocks(); ++i) - { - for (auto j = 0; j < parser.NumOfOps(i); ++j) - { - auto op = parser.GetOpDesc(i, j); - if (op.type() == "feed" || op.type() == "fetch") - { - continue; - } +namespace paddle2onnx { +MapperHelper *MapperHelper::helper = nullptr; +int32_t OnnxHelper::opset_version = 7; + +bool ModelExporter::IsOpsRegistered(const PaddleParser &parser, + bool enable_experimental_op) { + OnnxHelper temp_helper; + std::set unsupported_ops; + for (auto i = 0; i < parser.NumOfBlocks(); ++i) { + for (auto j = 0; j < parser.NumOfOps(i); ++j) { + auto op = parser.GetOpDesc(i, j); + if (op.type() == "feed" || op.type() == "fetch") { + continue; + } - if (op.type() == "conditional_block" || op.type() == "select_input") - { - continue; - } + if (op.type() == "conditional_block" || op.type() == "select_input") { + continue; + } #if 0 if (op.type() == "while" && enable_experimental_op) { @@ -61,77 +60,66 @@ namespace paddle2onnx continue; } #endif - if (custom_ops.find(op.type()) != custom_ops.end()) - { - continue; - } - if (!MapperHelper::Get()->IsRegistered(op.type())) - { + if (custom_ops.find(op.type()) != custom_ops.end()) { + continue; + } + if (!MapperHelper::Get()->IsRegistered(op.type())) { + unsupported_ops.insert(op.type()); + } else if (!enable_experimental_op) { + auto mapper = MapperHelper::Get()->CreateMapper(op.type(), parser, + &temp_helper, i, j); + if (mapper->IsExperimentalOp()) { unsupported_ops.insert(op.type()); } - else if (!enable_experimental_op) - { - auto mapper = MapperHelper::Get()->CreateMapper(op.type(), parser, &temp_helper, i, j); - if (mapper->IsExperimentalOp()) - { - unsupported_ops.insert(op.type()); - } - delete mapper; - } + delete mapper; } } + } - if (unsupported_ops.size() == 0) - { - return true; - } - - auto logger = P2OLogger(); - logger << "Oops, there are some operators not supported yet, including "; - for (auto &item : unsupported_ops) - { - logger << item << ","; - } - logger << std::endl; - return false; + if (unsupported_ops.size() == 0) { + return true; } - int32_t ModelExporter::GetMinOpsetVersion(const PaddleParser &parser) - { - int32_t max_opset = 7; - std::set verbose_log; - OnnxHelper helper; - for (auto i = 0; i < parser.NumOfBlocks(); ++i) - { - for (auto j = 0; j < parser.NumOfOps(i); ++j) - { - auto op = parser.GetOpDesc(i, j); - if (custom_ops.find(op.type()) != custom_ops.end()) - { - continue; - } + auto logger = P2OLogger(); + logger << "Oops, there are some operators not supported yet, including "; + for (auto &item : unsupported_ops) { + logger << item << ","; + } + logger << std::endl; + return false; +} - // Skip the input and output nodes. - if (op.type() == "feed" || op.type() == "fetch" || op.type() == "conditional_block") - { - continue; - } +int32_t ModelExporter::GetMinOpsetVersion(const PaddleParser &parser) { + int32_t max_opset = 7; + std::set verbose_log; + OnnxHelper helper; + for (auto i = 0; i < parser.NumOfBlocks(); ++i) { + for (auto j = 0; j < parser.NumOfOps(i); ++j) { + auto op = parser.GetOpDesc(i, j); + if (custom_ops.find(op.type()) != custom_ops.end()) { + continue; + } - int current_opset = 7; + // Skip the input and output nodes. 
+ if (op.type() == "feed" || op.type() == "fetch" || + op.type() == "conditional_block") { + continue; + } - if (op.type() == "select_input") - { - P2OLogger() << "Detected there's control flow op('conditional_block/select_input') in your model, " - << "this requires the minimal opset version of 11." - << std::endl; - current_opset = 11; - } - else - { - auto mapper = MapperHelper::Get()->CreateMapper(op.type(), parser, &helper, i, j); - current_opset = mapper->GetMinOpsetVersion(verbose_); - delete mapper; - } + int current_opset = 7; + + if (op.type() == "select_input") { + P2OLogger() << "Detected there's control flow " + "op('conditional_block/select_input') in your model, " + << "this requires the minimal opset version of 11." + << std::endl; + current_opset = 11; + } else { + auto mapper = + MapperHelper::Get()->CreateMapper(op.type(), parser, &helper, i, j); + current_opset = mapper->GetMinOpsetVersion(verbose_); + delete mapper; + } #if 0 if (op.type() == "while") { @@ -142,72 +130,67 @@ namespace paddle2onnx } #endif - if (current_opset > max_opset) - { - max_opset = current_opset; - if (current_opset > opset_version_) - { - verbose_log.insert("Due to the operator: " + op.type() + ", " + - "requires opset_version >= " + std::to_string(current_opset) + "."); - } + if (current_opset > max_opset) { + max_opset = current_opset; + if (current_opset > opset_version_) { + verbose_log.insert("Due to the operator: " + op.type() + ", " + + "requires opset_version >= " + + std::to_string(current_opset) + "."); } } } + } - for (auto iter = verbose_log.begin(); iter != verbose_log.end(); ++iter) - { - P2OLogger(verbose_) << *iter << std::endl; - } - return max_opset; + for (auto iter = verbose_log.begin(); iter != verbose_log.end(); ++iter) { + P2OLogger(verbose_) << *iter << std::endl; } + return max_opset; +} - void ModelExporter::SetOpsetVersion(const PaddleParser &parser, bool auto_upgrade_opset) - { - // Set the Opset Version of the ONNX model. - bool opset_is_legal = true; - int32_t min_opset = GetMinOpsetVersion(parser); - if (min_opset < 7 || min_opset >= MAX_ONNX_OPSET_VERSION) - { - P2OLogger(verbose_) << "The Opset Version must be between 7 and " << MAX_ONNX_OPSET_VERSION - 1 << std::endl; +void ModelExporter::SetOpsetVersion(const PaddleParser &parser, + bool auto_upgrade_opset) { + // Set the Opset Version of the ONNX model. + bool opset_is_legal = true; + int32_t min_opset = GetMinOpsetVersion(parser); + if (min_opset < 7 || min_opset >= MAX_ONNX_OPSET_VERSION) { + P2OLogger(verbose_) << "The Opset Version must be between 7 and " + << MAX_ONNX_OPSET_VERSION - 1 << std::endl; + opset_is_legal = false; + } + if (!auto_upgrade_opset) { + if (min_opset > opset_version_) { + P2OLogger(verbose_) << "Please set the opset_version to " + << std::to_string(opset_version_) + << " or set auto_upgrade_opset=true." << std::endl; opset_is_legal = false; } - if (!auto_upgrade_opset) - { - if (min_opset > opset_version_) - { - P2OLogger(verbose_) << "Please set the opset_version to " << std::to_string(opset_version_) - << " or set auto_upgrade_opset=true." 
<< std::endl; - opset_is_legal = false; - } - } - else - { - if (min_opset > opset_version_) - { - P2OLogger(verbose_) << "Opset version will change to " << min_opset << " from " << opset_version_ << std::endl; - opset_version_ = min_opset; - } - } - Assert(opset_is_legal, "Due to opset version, the model exporting is aborted."); - - OnnxHelper::SetOpsetVersion(opset_version_); - - auto opset_import = onnx_model_.add_opset_import(); - opset_import->set_domain(""); - opset_import->set_version(opset_version_); - P2OLogger(verbose_) << "Use opset_version = " << opset_version_ << " for ONNX export." << std::endl; - if (custom_ops.size()) { - auto opset_paddle_id = onnx_model_.add_opset_import(); - opset_paddle_id->set_domain("Paddle"); - opset_paddle_id->set_version(1); + } else { + if (min_opset > opset_version_) { + P2OLogger(verbose_) << "Opset version will change to " << min_opset + << " from " << opset_version_ << std::endl; + opset_version_ = min_opset; } } + Assert(opset_is_legal, + "Due to opset version, the model exporting is aborted."); + + OnnxHelper::SetOpsetVersion(opset_version_); + + auto opset_import = onnx_model_.add_opset_import(); + opset_import->set_domain(""); + opset_import->set_version(opset_version_); + P2OLogger(verbose_) << "Use opset_version = " << opset_version_ + << " for ONNX export." << std::endl; + if (custom_ops.size()) { + auto opset_paddle_id = onnx_model_.add_opset_import(); + opset_paddle_id->set_domain("Paddle"); + opset_paddle_id->set_version(1); + } +} - inline ONNX_NAMESPACE::Version ModelExporter::GetIRVersion() const - { - int ir_version = 0; - switch (opset_version_) - { +inline ONNX_NAMESPACE::Version ModelExporter::GetIRVersion() const { + int ir_version = 0; + switch (opset_version_) { case 7: case 8: ir_version = 3; @@ -240,223 +223,217 @@ namespace paddle2onnx ir_version = 10; break; default: - P2OLogger(verbose_) << "The Opset Version must be between 7 and 21." << std::endl; + P2OLogger(verbose_) << "The Opset Version must be between 7 and 21." 
+ << std::endl; Assert(false, "Due to opset version, the model exporting is aborted."); - } - return static_cast(ir_version); } + return static_cast(ir_version); +} - void ModelExporter::SetIRVersion() - { - onnx_model_.set_ir_version(GetIRVersion()); - } +void ModelExporter::SetIRVersion() { + onnx_model_.set_ir_version(GetIRVersion()); +} - void ModelExporter::ExportInputOutputs(const PaddleParser &parser, - std::vector> &inputs, - std::vector> &outputs) - { - inputs.clear(); - for (auto &item : parser.inputs) - { - auto value_info = MakeValueInfo(item); - inputs.push_back(std::move(value_info)); - } - outputs.clear(); - for (auto &item : parser.outputs) - { - auto value_info = MakeValueInfo(item); - outputs.push_back(std::move(value_info)); - } +void ModelExporter::ExportInputOutputs( + const PaddleParser &parser, + std::vector> &inputs, + std::vector> &outputs) { + inputs.clear(); + for (auto &item : parser.inputs) { + auto value_info = MakeValueInfo(item); + inputs.push_back(std::move(value_info)); } - - void ModelExporter::ExportParameters(const PaddleParser &parser, - std::vector> ¶meters) - { - parameters.clear(); - for (auto &item : parser.params) - { - // TODO(jiangjiajun) I'm not handling use_initializer now, but some day I - // will - auto node = MakeConstant(item.first, item.second); - parameters.push_back(std::move(node)); - } + outputs.clear(); + for (auto &item : parser.outputs) { + auto value_info = MakeValueInfo(item); + outputs.push_back(std::move(value_info)); } +} - ONNX_NAMESPACE::GraphProto ModelExporter::ExportConditionalBlock(const PaddleParser &parser, - int32_t block_id, - int32_t op_id, - const std::string &output_names) - { - auto op = parser.GetOpDesc(block_id, op_id); - - // Get sub_block_idx - int32_t sub_block_idx = -1; - for (size_t i = 0; i < op.attrs_size(); ++i) - { - if (op.attrs(i).name() == "sub_block") - { - sub_block_idx = op.attrs(i).block_idx(); - break; - } - } - Assert(sub_block_idx != -1, "Due to the unsupported sub_block_idx, the conversion is aborted."); - - std::vector> temp_parameters; +void ModelExporter::ExportParameters( + const PaddleParser &parser, + std::vector> ¶meters) { + parameters.clear(); + for (auto &item : parser.params) { + // TODO(jiangjiajun) I'm not handling use_initializer now, but some day I + // will + auto node = MakeConstant(item.first, item.second); + parameters.push_back(std::move(node)); + } +} - std::vector> temp_inputs; - // auto input_info = parser.GetOpInput(block_id, op_id, "Input"); - // for (int index = 0; index < input_info.size(); index++) - // { - // temp_inputs.push_back(std::move(MakeValueInfo(input_info[index]))); - // } +ONNX_NAMESPACE::GraphProto ModelExporter::ExportConditionalBlock( + const PaddleParser &parser, int32_t block_id, int32_t op_id, + const std::string &output_names) { + auto op = parser.GetOpDesc(block_id, op_id); - std::vector> temp_outputs; - auto out_info = parser.GetOpOutput(block_id, op_id, "Out"); - for (int index = 0; index < out_info.size(); index++) - { - if (out_info[index].name != output_names) - { - continue; - } - temp_outputs.push_back(std::move(MakeValueInfo(out_info[index]))); + // Get sub_block_idx + int32_t sub_block_idx = -1; + for (size_t i = 0; i < op.attrs_size(); ++i) { + if (op.attrs(i).name() == "sub_block") { + sub_block_idx = op.attrs(i).block_idx(); + break; } - return std::move(ExportBlock(parser, sub_block_idx, temp_parameters, temp_inputs, temp_outputs)); - } - - ONNX_NAMESPACE::GraphProto ModelExporter::ExportBlock(const PaddleParser &parser, - int32_t 
block_id, - std::vector> ¶meters, - std::vector> &inputs, - std::vector> &outputs) - { - ONNX_NAMESPACE::GraphProto graph; - graph.set_name("PaddlePaddle Graph " + std::to_string(block_id)); - OnnxHelper temp_helper; - auto num_ops = parser.NumOfOps(block_id); - temp_helper.nodes.reserve(num_ops * 3); - temp_helper.Clear(); - for (auto op_id = 0; op_id < num_ops; ++op_id) - { - auto op = parser.GetOpDesc(block_id, op_id); - if (op.type() == "feed") - { - continue; - } - else if (op.type() == "fetch") - { - continue; - } - else if (op.type() == "conditional_block") - { - auto out_info = parser.GetOpOutput(block_id, op_id, "Out"); - for (int index = 0; index < out_info.size(); index++) - { - sub_block_map_[out_info[index].name] = {block_id, op_id}; - } - continue; - } - else if (op.type() == "select_input") - { - // 如果找到,则输出对应的值;否则输出错误信息 - // 遍历输入Tensor - auto input_info = parser.GetOpInput(block_id, op_id, "X"); - - Assert(input_info.size() == 2, "Only support when number of select_input's input_node is 2."); - - // 构建 else 分支图 - auto else_node_name = input_info[0].name; - auto conditional_block_cood_it = sub_block_map_.find(else_node_name); - Assert(conditional_block_cood_it != sub_block_map_.end(), "Don't find select_input else_input node."); - auto conditional_block_cood = conditional_block_cood_it->second; - auto else_graph = ExportConditionalBlock(parser, conditional_block_cood.first, conditional_block_cood.second, else_node_name); - - // 构建 then 分支图 - auto then_node_name = input_info[1].name; - conditional_block_cood_it = sub_block_map_.find(then_node_name); - Assert(conditional_block_cood_it != sub_block_map_.end(), "Don't find select_input then_input node."); - conditional_block_cood = conditional_block_cood_it->second; - auto then_graph = ExportConditionalBlock(parser, conditional_block_cood.first, conditional_block_cood.second, then_node_name); - - auto cond_info = parser.GetOpInput(block_id, op_id, "Mask"); - auto output_info = parser.GetOpOutput(block_id, op_id, "Out"); - auto cond_name = temp_helper.AutoCast(cond_info[0].name, cond_info[0].dtype, P2ODataType::BOOL); - auto node = temp_helper.MakeNode("If", {cond_name}, {output_info[0].name}); - AddAttribute(node, "then_branch", then_graph); - AddAttribute(node, "else_branch", else_graph); - continue; - } - ExportOp(parser, &temp_helper, opset_version_, block_id, op_id, verbose_); + } + Assert(sub_block_idx != -1, + "Due to the unsupported sub_block_idx, the conversion is aborted."); + + std::vector> temp_parameters; + + std::vector> temp_inputs; + // auto input_info = parser.GetOpInput(block_id, op_id, "Input"); + // for (int index = 0; index < input_info.size(); index++) + // { + // temp_inputs.push_back(std::move(MakeValueInfo(input_info[index]))); + // } + + std::vector> temp_outputs; + auto out_info = parser.GetOpOutput(block_id, op_id, "Out"); + for (int index = 0; index < out_info.size(); index++) { + if (out_info[index].name != output_names) { + continue; } + temp_outputs.push_back(std::move(MakeValueInfo(out_info[index]))); + } + return std::move(ExportBlock(parser, sub_block_idx, temp_parameters, + temp_inputs, temp_outputs)); +} - ProcessGraphDumplicateNames(parameters, inputs, outputs, temp_helper.nodes, temp_helper.quantize_info); - if (parser.is_quantized_model) - { - quantize_model_processer.ProcessQuantizeModel(¶meters, - &inputs, - &outputs, - &temp_helper.nodes, - &temp_helper, - deploy_backend_, - parser, - calibration_cache_); - // Update int8 weights in quantized OP to float32 - 
UpdateParameters(temp_helper.updated_params, parameters); +ONNX_NAMESPACE::GraphProto ModelExporter::ExportBlock( + const PaddleParser &parser, int32_t block_id, + std::vector> ¶meters, + std::vector> &inputs, + std::vector> &outputs) { + ONNX_NAMESPACE::GraphProto graph; + graph.set_name("PaddlePaddle Graph " + std::to_string(block_id)); + OnnxHelper temp_helper; + auto num_ops = parser.NumOfOps(block_id); + temp_helper.nodes.reserve(num_ops * 3); + temp_helper.Clear(); + for (auto op_id = 0; op_id < num_ops; ++op_id) { + auto op = parser.GetOpDesc(block_id, op_id); + if (op.type() == "feed") { + continue; + } else if (op.type() == "fetch") { + continue; + } else if (op.type() == "conditional_block") { + auto out_info = parser.GetOpOutput(block_id, op_id, "Out"); + for (int index = 0; index < out_info.size(); index++) { + sub_block_map_[out_info[index].name] = {block_id, op_id}; + } + continue; + } else if (op.type() == "select_input") { + auto input_info = parser.GetOpInput(block_id, op_id, "X"); + + Assert(input_info.size() == 2, + "Only support when number of select_input's input_node is 2."); + + auto else_node_name = input_info[0].name; + auto conditional_block_cood_it = sub_block_map_.find(else_node_name); + Assert(conditional_block_cood_it != sub_block_map_.end(), + "Don't find select_input else_input node."); + auto conditional_block_cood = conditional_block_cood_it->second; + auto else_graph = + ExportConditionalBlock(parser, conditional_block_cood.first, + conditional_block_cood.second, else_node_name); + + auto then_node_name = input_info[1].name; + conditional_block_cood_it = sub_block_map_.find(then_node_name); + Assert(conditional_block_cood_it != sub_block_map_.end(), + "Don't find select_input then_input node."); + conditional_block_cood = conditional_block_cood_it->second; + auto then_graph = + ExportConditionalBlock(parser, conditional_block_cood.first, + conditional_block_cood.second, then_node_name); + + auto cond_info = parser.GetOpInput(block_id, op_id, "Mask"); + auto output_info = parser.GetOpOutput(block_id, op_id, "Out"); + auto cond_name = temp_helper.AutoCast( + cond_info[0].name, cond_info[0].dtype, P2ODataType::BOOL); + auto node = + temp_helper.MakeNode("If", {cond_name}, {output_info[0].name}); + AddAttribute(node, "then_branch", then_graph); + AddAttribute(node, "else_branch", else_graph); + continue; } + ExportOp(parser, &temp_helper, opset_version_, block_id, op_id, verbose_); + } - for (auto &item : parameters) - { - *(graph.add_node()) = *(item.get()); - } + ProcessGraphDumplicateNames(parameters, inputs, outputs, temp_helper.nodes, + temp_helper.quantize_info); + + // Process the model according to deploy_mackend_ + if (parser.is_quantized_model) { + if (deploy_backend_ == "onnxruntime") { + quantize_processer_ = new ORTQuantizeProcessor(); + } else if (deploy_backend_ == "rknn") { + quantize_processer_ = new RKNNQuantizeProcessor(); + } else if (deploy_backend_ == "tensorrt") { + quantize_processer_ = new TensorRTQuantizeProcessor(); + } else if (deploy_backend_ == "other") { + quantize_processer_ = new OtherQuantizeProcessor(); + } else { + Assert(false, + "Only support onnxruntime/rknn/tensorrt/other as backend now, but " + "now the backend is: " + + deploy_backend_ + "."); + } + P2OLogger() << "Deploy backend is: " << deploy_backend_ << std::endl; + quantize_processer_->ProcessQuantizeModel(¶meters, &inputs, &outputs, + &temp_helper.nodes, &temp_helper, + parser, calibration_cache_); + delete quantize_processer_; + quantize_processer_ = nullptr; + 
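// The concrete processors included at the top of this file (ORT/RKNN/TensorRT/
// Other) are only referenced in this hunk; their headers are not shown. A
// minimal sketch of how such a subclass is presumably shaped, assuming each one
// derives from BaseQuantizeProcessor and overrides ProcessQuantizeModel() (the
// element types, the `override`, and the pass names in the body are
// assumptions, reconstructed from the rknn branch removed from
// quantize_helper.cc further down in this diff):
//
//   class RKNNQuantizeProcessor : public BaseQuantizeProcessor {
//    public:
//     void ProcessQuantizeModel(
//         std::vector<std::shared_ptr<ONNX_NAMESPACE::NodeProto>> *parameters,
//         std::vector<std::shared_ptr<ONNX_NAMESPACE::ValueInfoProto>> *inputs,
//         std::vector<std::shared_ptr<ONNX_NAMESPACE::ValueInfoProto>> *outputs,
//         std::vector<std::shared_ptr<ONNX_NAMESPACE::NodeProto>> *nodes,
//         OnnxHelper *helper, const PaddleParser &parser,
//         std::string *calibration_cache) override {
//       // Base implementation caches the parser_/helper_/graph pointers.
//       BaseQuantizeProcessor::ProcessQuantizeModel(
//           parameters, inputs, outputs, nodes, helper, parser,
//           calibration_cache);
//       // RKNN pipeline, mirroring the removed monolithic branch:
//       // QuantizeInfoBroadcast -> RemoveAllQuantizeOps -> RemoveIdentityOp
//       // -> MergeConvAdd -> AddQDQForRKNN -> SortNodes.
//     }
//   };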
+ // Update int8 weights in quantized OP to float32 + UpdateParameters(temp_helper.updated_params, parameters); + } - for (auto &item : inputs) - { - *(graph.add_input()) = *(item.get()); - } + for (auto &item : parameters) { + *(graph.add_node()) = *(item.get()); + } - for (auto &item : outputs) - { - *(graph.add_output()) = (*item.get()); - } + for (auto &item : inputs) { + *(graph.add_input()) = *(item.get()); + } - for (auto &item : temp_helper.nodes) - { - *(graph.add_node()) = (*item.get()); - } + for (auto &item : outputs) { + *(graph.add_output()) = (*item.get()); + } - for (auto &item : temp_helper.value_infos) - { - *(graph.add_value_info()) = (*item.get()); - } + for (auto &item : temp_helper.nodes) { + *(graph.add_node()) = (*item.get()); + } - return std::move(graph); + for (auto &item : temp_helper.value_infos) { + *(graph.add_value_info()) = (*item.get()); } - void ModelExporter::UpdateParameters(const std::map ¶ms, - std::vector> ¶meters) - { - for (auto &item : params) - { - auto node = MakeConstant(item.first, item.second); - bool updated = false; - for (int i = 0; i < parameters.size(); ++i) - { - auto old_node = parameters[i]; - if (old_node->output(0) == item.first) - { - parameters.erase(parameters.begin() + i); - parameters.push_back(std::move(node)); - updated = true; - break; - } - } - if (!updated) - { + return std::move(graph); +} + +void ModelExporter::UpdateParameters( + const std::map ¶ms, + std::vector> ¶meters) { + for (auto &item : params) { + auto node = MakeConstant(item.first, item.second); + bool updated = false; + for (int i = 0; i < parameters.size(); ++i) { + auto old_node = parameters[i]; + if (old_node->output(0) == item.first) { + parameters.erase(parameters.begin() + i); parameters.push_back(std::move(node)); + updated = true; + break; } } + if (!updated) { + parameters.push_back(std::move(node)); + } } -void ModelExporter::CovertCustomOps(const PaddleParser& parser, - OnnxHelper* helper, int64_t block_id, +} +void ModelExporter::CovertCustomOps(const PaddleParser &parser, + OnnxHelper *helper, int64_t block_id, int64_t op_id) { auto op = parser.GetOpDesc(block_id, op_id); std::vector input_strs; @@ -529,324 +506,282 @@ void ModelExporter::CovertCustomOps(const PaddleParser& parser, << custom_ops[op.type()] << std::endl; } - void ModelExporter::ExportOp(const PaddleParser &parser, - OnnxHelper *helper, - int32_t opset_version, - int64_t block_id, - int64_t op_id, - bool verbose) - { - auto op = parser.GetOpDesc(block_id, op_id); +void ModelExporter::ExportOp(const PaddleParser &parser, OnnxHelper *helper, + int32_t opset_version, int64_t block_id, + int64_t op_id, bool verbose) { + auto op = parser.GetOpDesc(block_id, op_id); #if 0 if (op.type() == "while") { return ExportLoop(parser, helper, opset_version, block_id, op_id, verbose); } #endif - if (MapperHelper::Get()->IsRegistered(op.type())) { - auto mapper = MapperHelper::Get()->CreateMapper(op.type(), parser, helper, block_id, op_id); - mapper->deploy_backend = deploy_backend_; - // Some operators will export as custom operator - auto iter = custom_ops.find(op.type()); - if (iter != custom_ops.end()) { - mapper->export_as_custom_op = true; - mapper->custom_op_name = iter->second; - } - mapper->Run(); - delete mapper; - } else if (custom_ops.find(op.type()) != custom_ops.end()) { - CovertCustomOps(parser, helper, block_id, op_id); - } + if (MapperHelper::Get()->IsRegistered(op.type())) { + auto mapper = MapperHelper::Get()->CreateMapper(op.type(), parser, helper, + block_id, op_id); + 
mapper->deploy_backend = deploy_backend_; + // Some operators will export as custom operator + auto iter = custom_ops.find(op.type()); + if (iter != custom_ops.end()) { + mapper->export_as_custom_op = true; + mapper->custom_op_name = iter->second; + } + mapper->Run(); + delete mapper; + } else if (custom_ops.find(op.type()) != custom_ops.end()) { + CovertCustomOps(parser, helper, block_id, op_id); } +} - void ModelExporter::ProcessGraphDumplicateNames(std::vector> ¶meters, - std::vector> &inputs, - std::vector> &outputs, - std::vector> &nodes, - std::map &quantize_info) - { - std::map renamer; - for (auto &item : parameters) - { - for (size_t i = 0; i < item->output_size(); ++i) - { - if (tensor_names_.find(item->output(i)) != tensor_names_.end()) - { - Assert(false, "There's dumplicate names in exported parameters."); - } - tensor_names_.insert(item->output(i)); +void ModelExporter::ProcessGraphDumplicateNames( + std::vector> ¶meters, + std::vector> &inputs, + std::vector> &outputs, + std::vector> &nodes, + std::map &quantize_info) { + std::map renamer; + for (auto &item : parameters) { + for (size_t i = 0; i < item->output_size(); ++i) { + if (tensor_names_.find(item->output(i)) != tensor_names_.end()) { + Assert(false, "There's dumplicate names in exported parameters."); } + tensor_names_.insert(item->output(i)); } + } - for (auto &item : inputs) - { - if (tensor_names_.find(item->name()) != tensor_names_.end()) - { - continue; - // Assert(false, "There's dumplicate names:" + item->name() + " in exported parameters and inputs."); - } - tensor_names_.insert(item->name()); + for (auto &item : inputs) { + if (tensor_names_.find(item->name()) != tensor_names_.end()) { + continue; + // Assert(false, "There's dumplicate names:" + item->name() + " in + // exported parameters and inputs."); } + tensor_names_.insert(item->name()); + } - for (auto &item : nodes) - { - // update node inputs - for (size_t i = 0; i < item->input_size(); ++i) - { - if (renamer.find(item->input(i)) != renamer.end()) - { - auto updated_name = renamer[item->input(i)]; - while (renamer.find(updated_name) != renamer.end()) - { - updated_name = renamer[updated_name]; - } - *(item->mutable_input(i)) = updated_name; + for (auto &item : nodes) { + // update node inputs + for (size_t i = 0; i < item->input_size(); ++i) { + if (renamer.find(item->input(i)) != renamer.end()) { + auto updated_name = renamer[item->input(i)]; + while (renamer.find(updated_name) != renamer.end()) { + updated_name = renamer[updated_name]; } + *(item->mutable_input(i)) = updated_name; } + } - // if there's dumplicate name , it will generate new name and replace the dumplicate name - for (size_t i = 0; i < item->output_size(); ++i) - { - if (tensor_names_.find(item->output(i)) != tensor_names_.end()) - { - std::string renamed_tensor_name = item->output(i); - while (renamer.find(renamed_tensor_name) != renamer.end()) - { - renamed_tensor_name = renamer[renamed_tensor_name]; - } - auto new_tensor_name = MapperHelper::Get()->GenName(renamed_tensor_name); - P2OLogger() << "Find dumplicate output name '" << renamed_tensor_name - << "', it will rename to '" << new_tensor_name << "'." 
- << std::endl; - if (quantize_info.find(renamed_tensor_name) != quantize_info.end()) - { - quantize_info[new_tensor_name] = quantize_info[renamed_tensor_name]; - } - *(item->mutable_output(i)) = new_tensor_name; - renamer[renamed_tensor_name] = new_tensor_name; + // if there's dumplicate name , it will generate new name and replace the + // dumplicate name + for (size_t i = 0; i < item->output_size(); ++i) { + if (tensor_names_.find(item->output(i)) != tensor_names_.end()) { + std::string renamed_tensor_name = item->output(i); + while (renamer.find(renamed_tensor_name) != renamer.end()) { + renamed_tensor_name = renamer[renamed_tensor_name]; } - tensor_names_.insert(item->output(i)); + auto new_tensor_name = + MapperHelper::Get()->GenName(renamed_tensor_name); + // P2OLogger() << "Find dumplicate output name '" << renamed_tensor_name + // << "', it will rename to '" << new_tensor_name << "'." + // << std::endl; + if (quantize_info.find(renamed_tensor_name) != quantize_info.end()) { + quantize_info[new_tensor_name] = quantize_info[renamed_tensor_name]; + } + *(item->mutable_output(i)) = new_tensor_name; + renamer[renamed_tensor_name] = new_tensor_name; } + tensor_names_.insert(item->output(i)); } + } - for (auto &item : outputs) - { - if (renamer.find(item->name()) != renamer.end()) - { - auto updated_name = renamer[item->name()]; - while (renamer.find(updated_name) != renamer.end()) - { - updated_name = renamer[updated_name]; - } - item->set_name(updated_name); + for (auto &item : outputs) { + if (renamer.find(item->name()) != renamer.end()) { + auto updated_name = renamer[item->name()]; + while (renamer.find(updated_name) != renamer.end()) { + updated_name = renamer[updated_name]; } + item->set_name(updated_name); } } +} - void ModelExporter::SaveExternalData(::ONNX_NAMESPACE::GraphProto *graph, - const std::string &external_file_path, - bool *save_external) - { - P2OLogger() << "The exported ONNX model is bigger than 2G, external data " - "will save to file: " - << external_file_path << std::endl; - std::string file_name = GetFilenameFromPath(external_file_path); - if (save_external) - { - *save_external = true; +void ModelExporter::SaveExternalData(::ONNX_NAMESPACE::GraphProto *graph, + const std::string &external_file_path, + bool *save_external) { + P2OLogger() << "The exported ONNX model is bigger than 2G, external data " + "will save to file: " + << external_file_path << std::endl; + std::string file_name = GetFilenameFromPath(external_file_path); + if (save_external) { + *save_external = true; + } + std::fstream f(external_file_path, std::ios::out); + Assert(f.is_open(), "Failed to open: " + external_file_path + + " file to save external data"); + for (auto index = 0; index < graph->node_size(); index++) { + auto node = graph->mutable_node(index); + if (node->op_type() != "Constant") { + continue; } - std::fstream f(external_file_path, std::ios::out); - Assert(f.is_open(), "Failed to open: " + external_file_path + - " file to save external data"); - for (auto index = 0; index < graph->node_size(); index++) - { - auto node = graph->mutable_node(index); - if (node->op_type() != "Constant") - { + for (auto i = 0; i < node->attribute_size(); i++) { + auto attr = node->mutable_attribute(i); + if (attr->name() != "value") { continue; } - for (auto i = 0; i < node->attribute_size(); i++) - { - auto attr = node->mutable_attribute(i); - if (attr->name() != "value") - { - continue; - } - auto tensor = attr->mutable_t(); + auto tensor = attr->mutable_t(); - if (tensor->raw_data().size() <= 
128) - { - continue; - } - - tensor->set_data_location(ONNX_NAMESPACE::TensorProto::EXTERNAL); - auto external_data = tensor->add_external_data(); - external_data->set_key("location"); - external_data->set_value(file_name); - - external_data = tensor->add_external_data(); - external_data->set_key("offset"); - f.seekg(0, std::ios::end); - int64_t offset = f.tellg(); - external_data->set_value(std::to_string(offset)); - auto raw_data = tensor->raw_data(); - f << raw_data; - external_data = tensor->add_external_data(); - external_data->set_key("length"); - int64_t raw_datas_size = raw_data.size(); - external_data->set_value(std::to_string(raw_datas_size)); - tensor->clear_raw_data(); + if (tensor->raw_data().size() <= 128) { + continue; } + + tensor->set_data_location(ONNX_NAMESPACE::TensorProto::EXTERNAL); + auto external_data = tensor->add_external_data(); + external_data->set_key("location"); + external_data->set_value(file_name); + + external_data = tensor->add_external_data(); + external_data->set_key("offset"); + f.seekg(0, std::ios::end); + int64_t offset = f.tellg(); + external_data->set_value(std::to_string(offset)); + auto raw_data = tensor->raw_data(); + f << raw_data; + external_data = tensor->add_external_data(); + external_data->set_key("length"); + int64_t raw_datas_size = raw_data.size(); + external_data->set_value(std::to_string(raw_datas_size)); + tensor->clear_raw_data(); } - f.close(); - } - void ModelExporter::ONNXChecker(const ONNX_NAMESPACE::ModelProto &model, - const bool &verbose) - { - // TODO(jiangjiajun) - // If we need to integrate with framework - // this check will return a information - // to let framework know the conversion is - // pass or fail - try - { - // ONNX_NAMESPACE::checker::check_model(*(model.get())); - ONNX_NAMESPACE::checker::check_model(model); - } - catch (const std::exception &e) - { - P2OLogger(verbose) << "The exported ONNX model is invalid." << std::endl; - P2OLogger(verbose) << "Model checker error log: " << e.what() << std::endl; - } - P2OLogger(verbose) << "PaddlePaddle model is exported as ONNX format now." - << std::endl; - } - - std::string ModelExporter::Run(const PaddleParser &parser, - int opset_version, - bool auto_upgrade_opset, - bool verbose, - bool enable_onnx_checker, - bool enable_experimental_op, - bool enable_optimize, - const std::string &deploy_backend, - std::string *calibration_cache, - const std::string &external_file, - bool *save_external, - bool export_fp16_model, - std::vector disable_fp16_op_types) - { - verbose_ = verbose; - deploy_backend_ = deploy_backend; - calibration_cache_ = calibration_cache; - - // Clear name_counter, this use to generate unique name for intermdiate while converting all the op - MapperHelper::Get()->ClearNameCounter(); - - if (!IsOpsRegistered(parser, enable_experimental_op)) - { - Assert(false, "Due to the unsupported operators, the conversion is aborted."); - } + } + f.close(); +} +void ModelExporter::ONNXChecker(const ONNX_NAMESPACE::ModelProto &model, + const bool &verbose) { + // TODO(jiangjiajun) + // If we need to integrate with framework + // this check will return a information + // to let framework know the conversion is + // pass or fail + try { + // ONNX_NAMESPACE::checker::check_model(*(model.get())); + ONNX_NAMESPACE::checker::check_model(model); + } catch (const std::exception &e) { + P2OLogger(verbose) << "The exported ONNX model is invalid." 
<< std::endl; + P2OLogger(verbose) << "Model checker error log: " << e.what() << std::endl; + } + P2OLogger(verbose) << "PaddlePaddle model is exported as ONNX format now." + << std::endl; +} - // Set ONNX Opset Version - opset_version_ = opset_version; - SetOpsetVersion(parser, auto_upgrade_opset); +std::string ModelExporter::Run( + const PaddleParser &parser, int opset_version, bool auto_upgrade_opset, + bool verbose, bool enable_onnx_checker, bool enable_experimental_op, + bool enable_optimize, const std::string &deploy_backend, + std::string *calibration_cache, const std::string &external_file, + bool *save_external, bool export_fp16_model, + std::vector disable_fp16_op_types) { + verbose_ = verbose; + deploy_backend_ = deploy_backend; + calibration_cache_ = calibration_cache; + + // Clear name_counter, this use to generate unique name for intermdiate while + // converting all the op + MapperHelper::Get()->ClearNameCounter(); + + if (!IsOpsRegistered(parser, enable_experimental_op)) { + Assert(false, + "Due to the unsupported operators, the conversion is aborted."); + } - // Set ONNX IR Version - SetIRVersion(); + // Set ONNX Opset Version + opset_version_ = opset_version; + SetOpsetVersion(parser, auto_upgrade_opset); - // Export Parser Parameters - std::vector> parameters; - ExportParameters(parser, parameters); - // Export Parser Inputs and Outputs - std::vector> inputs; - std::vector> outputs; - ExportInputOutputs(parser, inputs, outputs); - // Export Blocks - tensor_names_.clear(); + // Set ONNX IR Version + SetIRVersion(); - auto share_graph = ExportBlock(parser, 0, parameters, inputs, outputs); - *onnx_model_.mutable_graph() = share_graph; + // Export Parser Parameters + std::vector> parameters; + ExportParameters(parser, parameters); + // Export Parser Inputs and Outputs + std::vector> inputs; + std::vector> outputs; + ExportInputOutputs(parser, inputs, outputs); + // Export Blocks + tensor_names_.clear(); - if (enable_optimize) - { - onnx_model_ = Optimize(onnx_model_); - } + auto share_graph = ExportBlock(parser, 0, parameters, inputs, outputs); + *onnx_model_.mutable_graph() = share_graph; - // convert fp32 model to fp16 - if (export_fp16_model) - { - P2OLogger(verbose) << "Convert FP32 ONNX model to FP16." << std::endl; - ConvertFp32ToFp16 convert; - convert.SetCustomOps(custom_ops); - convert.AddDisabledOpTypes(disable_fp16_op_types); - convert.Convert(&onnx_model_); - } + if (enable_optimize) { + onnx_model_ = Optimize(onnx_model_); + } - // save external data file for big model - std::string external_data_file; - if (onnx_model_.ByteSizeLong() > INT_MAX) - { - if (external_file.empty()) - { - external_data_file = "external_data"; - } - else - { - external_data_file = external_file; - } - } + // convert fp32 model to fp16 + if (export_fp16_model) { + P2OLogger(verbose) << "Convert FP32 ONNX model to FP16." 
<< std::endl; + ConvertFp32ToFp16 convert; + convert.SetCustomOps(custom_ops); + convert.AddDisabledOpTypes(disable_fp16_op_types); + convert.Convert(&onnx_model_); + } - if (external_data_file.size()) - { - SaveExternalData(onnx_model_.mutable_graph(), external_data_file, save_external); + // save external data file for big model + std::string external_data_file; + if (onnx_model_.ByteSizeLong() > INT_MAX) { + if (external_file.empty()) { + external_data_file = "external_data"; + } else { + external_data_file = external_file; } + } - // check model - if (enable_onnx_checker) - { - ONNXChecker(onnx_model_, verbose); - } + if (external_data_file.size()) { + SaveExternalData(onnx_model_.mutable_graph(), external_data_file, + save_external); + } - std::string out; - if (!onnx_model_.SerializeToString(&out)) - { - P2OLogger(verbose) - << "Error happenedd while optimizing the exported ONNX model." - << std::endl; - return ""; - } - return out; - } - - ONNX_NAMESPACE::ModelProto ModelExporter::Optimize( - const ONNX_NAMESPACE::ModelProto &model) - { - ONNX_NAMESPACE::optimization::Optimizer::passes - .registerPass(); - ONNX_NAMESPACE::optimization::Optimizer::passes - .registerPass(); - ONNX_NAMESPACE::optimization::Optimizer::passes - .registerPass(); - ONNX_NAMESPACE::optimization::Optimizer::passes - .registerPass(); - ONNX_NAMESPACE::optimization::Optimizer::passes - .registerPass(); - ONNX_NAMESPACE::optimization::Optimizer::passes - .registerPass(); - std::vector passes = {"eliminate_identity", - "eliminate_deadend", - "eliminate_deadend", - "fuse_constant_reshape", - "fuse_constant_unsqueeze", - "fuse_paddle_conv_bias", - "fuse_consecutive_transposes", - "eliminate_non_transpose", - "fuse_matmul_add_bias_into_gemm", - "eliminate_identity", - "eliminate_deadend", - "eliminate_unused_initializer"}; - return ONNX_NAMESPACE::optimization::Optimize(model, passes); - } - -} // namespace paddle2onnx + // check model + if (enable_onnx_checker) { + ONNXChecker(onnx_model_, verbose); + } + + std::string out; + if (!onnx_model_.SerializeToString(&out)) { + P2OLogger(verbose) + << "Error happenedd while optimizing the exported ONNX model." 
+ << std::endl; + return ""; + } + return out; +} + +ONNX_NAMESPACE::ModelProto ModelExporter::Optimize( + const ONNX_NAMESPACE::ModelProto &model) { + ONNX_NAMESPACE::optimization::Optimizer::passes + .registerPass(); + ONNX_NAMESPACE::optimization::Optimizer::passes + .registerPass(); + ONNX_NAMESPACE::optimization::Optimizer::passes + .registerPass(); + ONNX_NAMESPACE::optimization::Optimizer::passes + .registerPass(); + ONNX_NAMESPACE::optimization::Optimizer::passes + .registerPass(); + ONNX_NAMESPACE::optimization::Optimizer::passes + .registerPass(); + std::vector passes = {"eliminate_identity", + "eliminate_deadend", + "eliminate_deadend", + "fuse_constant_reshape", + "fuse_constant_unsqueeze", + "fuse_paddle_conv_bias", + "fuse_consecutive_transposes", + "eliminate_non_transpose", + "fuse_matmul_add_bias_into_gemm", + "eliminate_identity", + "eliminate_deadend", + "eliminate_unused_initializer"}; + return ONNX_NAMESPACE::optimization::Optimize(model, passes); +} + +} // namespace paddle2onnx diff --git a/paddle2onnx/mapper/exporter.h b/paddle2onnx/mapper/exporter.h index 9467a9cde..a64f6d039 100644 --- a/paddle2onnx/mapper/exporter.h +++ b/paddle2onnx/mapper/exporter.h @@ -19,7 +19,7 @@ #include #include "paddle2onnx/mapper/mapper.h" -#include "paddle2onnx/mapper/quantize_helper.h" +#include "paddle2onnx/mapper/quantize/base_quantize_processor.h" #include "paddle2onnx/parser/parser.h" #ifdef _MSC_VER @@ -28,100 +28,93 @@ #define PATH_SEP "/" #endif -inline std::string GetFilenameFromPath(const std::string &path) -{ +namespace paddle2onnx { +inline std::string GetFilenameFromPath(const std::string &path) { auto pos = path.find_last_of(PATH_SEP); - if (pos == std::string::npos) - { + if (pos == std::string::npos) { return path; } return path.substr(pos + 1); } -namespace paddle2onnx -{ - class ModelExporter - { - public: - // custom operators for export - // - std::map custom_ops; - QuantizeModelProcessor quantize_model_processer; +class ModelExporter { + public: + // custom operators for export + // + std::map custom_ops; - void SaveExternalData(ONNX_NAMESPACE::GraphProto *graph, - const std::string &external_file_path, - bool *save_external = nullptr); + void SaveExternalData(ONNX_NAMESPACE::GraphProto *graph, + const std::string &external_file_path, + bool *save_external = nullptr); - void ONNXChecker(const ONNX_NAMESPACE::ModelProto &model, - const bool &verbose); + void ONNXChecker(const ONNX_NAMESPACE::ModelProto &model, + const bool &verbose); - std::string Run(const PaddleParser &parser, - int opset_version = 9, - bool auto_upgrade_opset = true, - bool verbose = false, - bool enable_onnx_checker = true, - bool enable_experimental_op = false, - bool enable_optimize = true, - const std::string &deploy_backend = "onnxruntime", - std::string *calibration_cache = nullptr, - const std::string &external_file = "", - bool *save_external = nullptr, - bool export_fp16_model = false, - std::vector disable_fp16_op_types = {}); + std::string Run(const PaddleParser &parser, int opset_version = 9, + bool auto_upgrade_opset = true, bool verbose = false, + bool enable_onnx_checker = true, + bool enable_experimental_op = false, + bool enable_optimize = true, + const std::string &deploy_backend = "onnxruntime", + std::string *calibration_cache = nullptr, + const std::string &external_file = "", + bool *save_external = nullptr, bool export_fp16_model = false, + std::vector disable_fp16_op_types = {}); - private: - bool verbose_ = false; - // The _deploy_backend will pass to Mapper to influence 
the conversion - std::string deploy_backend_ = "onnxruntime"; - std::string *calibration_cache_ = nullptr; - int32_t opset_version_ = 7; + private: + bool verbose_ = false; + // The _deploy_backend will pass to Mapper to influence the conversion + std::string deploy_backend_ = "onnxruntime"; + BaseQuantizeProcessor *quantize_processer_ = nullptr; + std::string *calibration_cache_ = nullptr; + int32_t opset_version_ = 7; - bool IsOpsRegistered(const PaddleParser &parser, - bool enable_experimental_op); + bool IsOpsRegistered(const PaddleParser &parser, bool enable_experimental_op); - ONNX_NAMESPACE::ModelProto onnx_model_; - // Opset Version - int32_t GetMinOpsetVersion(const PaddleParser &parser); - void SetOpsetVersion(const PaddleParser &parser, bool auto_upgrade_opset); - // IR Version - inline ONNX_NAMESPACE::Version GetIRVersion() const; - void SetIRVersion(); - // - void ExportInputOutputs(const PaddleParser &parser, - std::vector> &inputs, - std::vector> &outputs); - // - void ExportParameters(const PaddleParser &parser, std::vector> ¶meters); - // Process dumplicate tensor names in paddle model - std::set tensor_names_; - void ProcessGraphDumplicateNames(std::vector> ¶meters, - std::vector> &inputs, - std::vector> &outputs, - std::vector> &nodes, - std::map &quantize_info); - // Update constant node in parameters. When process quantize model, the weight - // dtype may be int8, it should be convet to float32 and use this function to - // update converted params. - void UpdateParameters(const std::map ¶ms, - std::vector> ¶meters); - // - std::map> sub_block_map_; - ONNX_NAMESPACE::GraphProto ExportConditionalBlock(const PaddleParser &parser, - int32_t block_id, - int32_t op_id, - const std::string &output_names); - ONNX_NAMESPACE::GraphProto ExportBlock(const PaddleParser &parser, - int32_t block_id, - std::vector> ¶meters, - std::vector> &inputs, - std::vector> &outputs); + ONNX_NAMESPACE::ModelProto onnx_model_; + // Opset Version + int32_t GetMinOpsetVersion(const PaddleParser &parser); + void SetOpsetVersion(const PaddleParser &parser, bool auto_upgrade_opset); + // IR Version + inline ONNX_NAMESPACE::Version GetIRVersion() const; + void SetIRVersion(); + // + void ExportInputOutputs( + const PaddleParser &parser, + std::vector> &inputs, + std::vector> &outputs); + // + void ExportParameters( + const PaddleParser &parser, + std::vector> ¶meters); + // Process dumplicate tensor names in paddle model + std::set tensor_names_; + void ProcessGraphDumplicateNames( + std::vector> ¶meters, + std::vector> &inputs, + std::vector> &outputs, + std::vector> &nodes, + std::map &quantize_info); + // Update constant node in parameters. When process quantize model, the weight + // dtype may be int8, it should be convet to float32 and use this function to + // update converted params. 
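  // Typical use, matching the call in ExportBlock: after a quantize processor
  // has filled temp_helper.updated_params with re-exported float32 weights,
  //   UpdateParameters(temp_helper.updated_params, parameters);
  // swaps out the corresponding int8 Constant nodes in `parameters`.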
+ void UpdateParameters( + const std::map ¶ms, + std::vector> ¶meters); + // + std::map> sub_block_map_; + ONNX_NAMESPACE::GraphProto ExportConditionalBlock( + const PaddleParser &parser, int32_t block_id, int32_t op_id, + const std::string &output_names); + ONNX_NAMESPACE::GraphProto ExportBlock( + const PaddleParser &parser, int32_t block_id, + std::vector> ¶meters, + std::vector> &inputs, + std::vector> &outputs); - void ExportOp(const PaddleParser &parser, - OnnxHelper *helper, - int32_t opset_version, - int64_t block_id, - int64_t op_id, - bool verbose); + void ExportOp(const PaddleParser &parser, OnnxHelper *helper, + int32_t opset_version, int64_t block_id, int64_t op_id, + bool verbose); #if 0 bool IsLoopSupported(const PaddleParser &parser, const int64_t &block_id, const int64_t &op_id); @@ -129,7 +122,8 @@ namespace paddle2onnx int32_t opset_version, int64_t block_id, int64_t op_id, bool verbose); #endif - ONNX_NAMESPACE::ModelProto Optimize(const ONNX_NAMESPACE::ModelProto &model); - void CovertCustomOps(const PaddleParser& parser, OnnxHelper* helper, int64_t block_id, int64_t op_id); - }; -} // namespace paddle2onnx + ONNX_NAMESPACE::ModelProto Optimize(const ONNX_NAMESPACE::ModelProto &model); + void CovertCustomOps(const PaddleParser &parser, OnnxHelper *helper, + int64_t block_id, int64_t op_id); +}; +} // namespace paddle2onnx diff --git a/paddle2onnx/mapper/quantize_helper.cc b/paddle2onnx/mapper/quantize/base_quantize_processor.cc similarity index 56% rename from paddle2onnx/mapper/quantize_helper.cc rename to paddle2onnx/mapper/quantize/base_quantize_processor.cc index 1e0e2350f..95049d79f 100644 --- a/paddle2onnx/mapper/quantize_helper.cc +++ b/paddle2onnx/mapper/quantize/base_quantize_processor.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,13 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
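// The hunks below rename QuantizeModelProcessor to BaseQuantizeProcessor and
// drop both the deploy_backend argument and the per-backend branching from
// ProcessQuantizeModel(); the base class now only caches the parser/helper/
// graph pointers, while backend selection happens in
// ModelExporter::ExportBlock (see the exporter.cc hunk above). Caller-side
// effect, taken from the two call sites in this diff:
//
//   // before: one processor, backend chosen inside ProcessQuantizeModel
//   quantize_model_processer.ProcessQuantizeModel(
//       &parameters, &inputs, &outputs, &temp_helper.nodes, &temp_helper,
//       deploy_backend_, parser, calibration_cache_);
//
//   // after: the exporter instantiates a concrete processor first
//   quantize_processer_ = new RKNNQuantizeProcessor();  // or ORT/TensorRT/Other
//   quantize_processer_->ProcessQuantizeModel(
//       &parameters, &inputs, &outputs, &temp_helper.nodes, &temp_helper,
//       parser, calibration_cache_);
//   delete quantize_processer_;
//   quantize_processer_ = nullptr;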
-#include "paddle2onnx/mapper/quantize_helper.h" +#include "paddle2onnx/mapper/quantize/base_quantize_processor.h" #include namespace paddle2onnx { -void QuantizeModelProcessor::RemoveNodeByName(const std::string& name, - const bool& update_io) { +void BaseQuantizeProcessor::RemoveNodeByName(const std::string& name, + const bool& update_io) { if (name.empty()) { return; } @@ -35,7 +35,7 @@ void QuantizeModelProcessor::RemoveNodeByName(const std::string& name, } } -void QuantizeModelProcessor::ReplaceInputOfAllNodes( +void BaseQuantizeProcessor::ReplaceInputOfAllNodes( const std::string& old_name, const std::string& new_name, const std::vector>& except_nodes) { @@ -64,7 +64,7 @@ void QuantizeModelProcessor::ReplaceInputOfAllNodes( } } -void QuantizeModelProcessor::UpdateInputNameToNodes() { +void BaseQuantizeProcessor::UpdateInputNameToNodes() { name2node_dict_.clear(); for (auto& node : *nodes_) { for (size_t i = 0; i < node->input_size(); ++i) { @@ -78,115 +78,22 @@ void QuantizeModelProcessor::UpdateInputNameToNodes() { } } -void QuantizeModelProcessor::ProcessQuantizeModel( +void BaseQuantizeProcessor::ProcessQuantizeModel( std::vector>* parameters, std::vector>* inputs, std::vector>* outputs, std::vector>* nodes, - OnnxHelper* helper, const std::string& deploy_backend, - const PaddleParser& parser, std::string* calibration_cache) { - // Determine whether the model contains quantization related OPs, if not, exit - // directly - bool quantized_model = false; - for (auto& node : *nodes) { - if (node->op_type() == "QuantizeLinear" || - node->op_type() == "DequantizeLinear") { - quantized_model = true; - break; - } - } - if (!quantized_model) { - return; - } + OnnxHelper* helper, const PaddleParser& parser, + std::string* calibration_cache) { parser_ = &parser; helper_ = helper; parameters_ = parameters; inputs_ = inputs; outputs_ = outputs; nodes_ = nodes; - P2OLogger() << "[Info] Quantize model deploy backend is: " << deploy_backend - << std::endl; - // Determine the format of the exported ONNX quantization model according to - // the deploy_backend - if (deploy_backend == "others") { - // If deploy_backend is others, the quantization model is exported as a - // float model + quantization table. - RemoveAllQuantizeOps(); - std::ofstream outfile; - outfile.open("max_range.txt", std::ios::out); - if (!outfile.is_open()) { - P2OLogger() << "[WARNING] Quantize model processer failed to write range " - "information in current location." - << std::endl; - return; - } - for (auto iter = helper_->quantize_info.begin(); - iter != helper_->quantize_info.end(); iter++) { - std::string log = iter->first; - auto scale = iter->second.scale_; - if (scale.size() == 1) { - log = log + ": " + std::to_string(scale[0] * 127); - outfile << log << std::endl; - } - } - outfile.close(); - } else if (deploy_backend == "onnxruntime") { - // When deploy_backend is ONNXRuntime, use the follow four steps to process: - // 1. broadcast quantize info - // 2. remove all quantize ops - // 3. merge conv and add - // 4. merge conv and bn - // 5. add Q and DQ according ONNXRuntime quantize OP fuse patten. - // 6. use topo sort in nodes - QuantizeInfoBroadcast(); - RemoveAllQuantizeOps(); - MergeConvAdd(); - MergeConvBN(); - AddQDQForORT(); - SortNodes(); - } else if (deploy_backend == "tensorrt") { - // When deploy_backend is TensorRT, use the follow four steps to process: - // For Explicit Quantization - // 1. broadcast quantize info - // 2. remove all quantize ops - // 3. add Q and DQ before conv and matmul. - // 4. 
use topo sort in nodes - - // For Implicit Quantization - // 1. remove all quantize ops - // 2. broadcast quantize info - // 3. save float onnx model and alibration.cache - QuantizeInfoBroadcast(); - RemoveAllQuantizeOps(); - // Add qdq for Explicit Quantization - // AddTrtQDQ(); - // SortNodes(); - - // Genarate calibration.cache for Implicit Quantization - // convert float to hex - GenerateCache(calibration_cache); - } else if (deploy_backend == "rknn") { - // When deploy_backend is RKNN, use the follow four steps to process: - // 1. broadcast quantize info - // 2. remove all quantize ops - // 3. add Q and DQ - // 4. use topo sort in nodes - QuantizeInfoBroadcast(); - RemoveAllQuantizeOps(); - RemoveIdentityOp(); - MergeConvAdd(); - AddQDQForRKNN(); - SortNodes(); - } else { - Assert(false, - "[QuantizeModelProcessor] Only support 'onnxruntime' / 'tensorrt' " - "/ 'others' as " - "backend now, but now the backend is: " + - deploy_backend + "."); - } } -void QuantizeModelProcessor::RemoveIdentityOp() { +void BaseQuantizeProcessor::RemoveIdentityOp() { UpdateInputNameToNodes(); auto iter = nodes_->begin(); while (iter != nodes_->end()) { @@ -199,360 +106,29 @@ void QuantizeModelProcessor::RemoveIdentityOp() { } } -void QuantizeModelProcessor::AddQDQForRKNN() { - UpdateInputNameToNodes(); - supported_quantize_type_ = {"Abs", - "Acos", - "Add", - "Asin", - "Atan", - "AveragePool", - "BatchNormalization", - "Ceil", - "Clip", - "Conv", - "ConvTranspose", - "Cos", - "Cosh", - "Concat", - "Div", - "Elu", - "Erf", - "Exp", - "Floor", - "Gemm", - "GlobalAveragePool", - "HardSigmoid", - "HardSwish", - "InstanceNormalization", - "IsInf", - "IsNaN", - "Log", - "MatMul", - "MaxPool", - "Mul", - "Neg", - "ReduceMean", - "Relu", - "Reshape", - "Resize", - "Round", - "Shape", - "Sigmoid", - "Sin", - "Sinh", - "Slice", - "Softmax", - "Split", - "Sqrt", - "Tan", - "Tanh", - "Transpose"}; - for (auto iter = nodes_->begin(); iter < nodes_->end(); iter++) { - auto node = *iter; - auto type_iter = std::find(supported_quantize_type_.begin(), supported_quantize_type_.end(), node->op_type()); - if (!supported_quantize_type_.empty() && type_iter == supported_quantize_type_.end()) { - continue; - } - - std::vector tensor_names = {}; - for (size_t i = 0; i < node->input_size(); ++i) { - std::string node_input = node->input(i); - tensor_names.push_back(node_input); - } - for (size_t i = 0; i < node->output_size(); ++i) { - std::string node_output = node->output(i); - tensor_names.push_back(node_output); - } - - if (node->op_type() == "MatMul" || node->op_type() == "Add" || node->op_type() == "Mul") { - for (auto& name : tensor_names) { - if (helper_->quantize_info.find(name) != helper_->quantize_info.end()) { - continue; - } - - std::vector weight; - if (!GetTensorByName(name, &weight)) { - P2OLogger() << "Failed to GetTensorByName: " << node->op_type() << ";" << name << std::endl; - continue; - } - - std::vector weight_shape; - if (!GetTensorShape(name, &weight_shape)) { - P2OLogger() << "Failed to GetTensorShape: " << node->op_type() << ";" << name << std::endl; - continue; - } - - int64_t quantize_axis = 1; - std::vector scale; - std::vector zeros; - GetTensorWiseQuantizeInfo(weight, &scale, &zeros); - - std::string weight_scale_node, weight_zero_node; - weight_scale_node = helper_->Constant({}, ONNX_NAMESPACE::TensorProto::FLOAT, scale[0]); - weight_zero_node = helper_->Constant({}, ONNX_NAMESPACE::TensorProto::INT8, zeros[0]); +void BaseQuantizeProcessor::AddQDQ() { UpdateInputNameToNodes(); } - QuantizeInfo 
matmul_weight_quantize_info(scale, zeros, weight_scale_node, weight_zero_node, quantize_axis); - helper_->quantize_info[name] = matmul_weight_quantize_info; - } - } else if (node->op_type() == "BatchNormalization") { - // BatchNormalization only need quntize X and Y. - // when opset > 9, tensor_names is {X, scale, B, input_mean, input_var, Y, running_mean, running_var} - // when opset <= 9, tensor_names is {X, scale, B, mean, var, Y, mean, var, saved_mean, saved_var} - tensor_names.erase(tensor_names.begin() + 1, tensor_names.begin() + 5); - tensor_names.erase(tensor_names.begin() + 2, tensor_names.end()); - } - - if (!CanBeQuantize(tensor_names)) { - continue; - } - - for (auto& name : tensor_names) { - AppendQuantizeTensor(name); - } - } - - // update name2node_dict for the change of Relu op. - UpdateInputNameToNodes(); - // Add QDQ in model - AddQDQInModel(tensors_to_be_quantize); -} - -void QuantizeModelProcessor::GenerateCache(std::string* calibration_cache) { - union { - float f; - unsigned char farray[4]; - } un; - *calibration_cache += "TRT-8XXX-EntropyCalibration2 \n"; - for (auto iter = helper_->quantize_info.rbegin(); - iter != helper_->quantize_info.rend(); iter++) { - std::string tensor_name = iter->first; - QuantizeInfo quantize_info = iter->second; - if (quantize_info.scale_.size() == 1) { - float val = quantize_info.scale_[0]; - un.f = val; - *calibration_cache += (tensor_name + ": "); - std::stringstream enc; - for (int64_t i = 3; i >= 0; i--) { - enc << std::hex << std::setw(2) << std::setfill('0') - << (int)(un.farray[i]); - } - *calibration_cache = *calibration_cache + enc.str() + "\n"; - } - } -} -// In TensorRT, all quantized op: Conv, ConvTranspose, liner(MatMul), MaxPool, -// AvgPool, AdaptiveAvgPool, rnn(not support now) -// https://github.com/NVIDIA/TensorRT/tree/main/tools/pytorch-quantization/pytorch_quantization/nn/modules -void QuantizeModelProcessor::AddTrtQDQ() { - UpdateInputNameToNodes(); - std::vector - quantize_tensors; // save the tensor names that need add quantize ops - std::vector pool_types = {"MaxPool", "AvgPool", - "AdaptiveAvgPool"}; - for (auto iter = nodes_->begin(); iter < nodes_->end(); iter++) { - quantize_tensors.clear(); - auto node = *iter; - if (node->op_type() == "Conv" || node->op_type() == "ConvTranspose") { - std::vector tensor_names = {node->input(0), node->input(1)}; - if (!CanBeQuantize(tensor_names)) { - continue; - } - quantize_tensors = tensor_names; - } - if (node->op_type() == "MatMul") { - std::vector tensor_names = {node->input(0), node->input(1)}; - for (auto& name : tensor_names) { - if (helper_->quantize_info.find(name) != helper_->quantize_info.end()) { - continue; - } - std::vector matmul_weight; - if (!GetTensorByName(name, &matmul_weight)) { - continue; - } - std::vector matmul_weight_shape; - if (!GetTensorShape(name, &matmul_weight_shape)) { - continue; - } - int64_t quantize_axis = 1; - std::vector scale; - std::vector zeros; - GetChannelWiseQuantizeInfo(matmul_weight, matmul_weight_shape, - quantize_axis, &scale, &zeros); - auto scale_node = - helper_->Constant(ONNX_NAMESPACE::TensorProto::FLOAT, scale); - auto zero_node = - helper_->Constant(ONNX_NAMESPACE::TensorProto::INT8, zeros); - QuantizeInfo matmul_weight_quantize_info(scale, zeros, scale_node, - zero_node, quantize_axis); - helper_->quantize_info[name] = matmul_weight_quantize_info; - } - if (!CanBeQuantize(tensor_names)) { - continue; - } - quantize_tensors = tensor_names; - } - auto type_iter = - std::find(pool_types.begin(), pool_types.end(), 
node->op_type()); - if (type_iter != pool_types.end()) { - std::vector tensor_names = {node->input(0)}; - if (!CanBeQuantize(tensor_names)) { - continue; - } - quantize_tensors = tensor_names; - } - - std::string negative_scale_tensor = ""; - for (std::string& name : quantize_tensors) { - Assert( - helper_->quantize_info.find(name) != helper_->quantize_info.end(), - "[QuantizeModelProcessor] Can not find quantize info for tensor: " + - name); - QuantizeInfo quantize_info = helper_->quantize_info[name]; - std::vector scales = quantize_info.scale_; - for (auto& i : scales) { - if (i <= 1e-10) { - negative_scale_tensor = negative_scale_tensor + " " + name; - } - } - } - if (negative_scale_tensor.size() > 0) { - P2OLogger() - << "[Warning] The scale of tensors: [ " + negative_scale_tensor + - " ] contains negative scale, so this OP will not be quantized." - << std::endl; - continue; - } - // An OP requires a separate quantize op - for (std::string& name : quantize_tensors) { - if (IsGraphOutput(name)) { - continue; - } - QuantizeInfo quantize_info = helper_->quantize_info[name]; - std::string scale_node = quantize_info.scale_node_; - std::string zeros_node = quantize_info.zeros_node_; - int64_t quantize_axis = quantize_info.quantize_axis_; - auto q_node = - helper_->MakeNode("QuantizeLinear", {name, scale_node, zeros_node}); - if (helper_->GetOpsetVersion() >= 13) { - AddAttribute(q_node, "axis", quantize_axis); - } - auto dq_node = helper_->MakeNode( - "DequantizeLinear", {q_node->output(0), scale_node, zeros_node}); - if (helper_->GetOpsetVersion() >= 13) { - AddAttribute(dq_node, "axis", quantize_axis); - } - for (size_t i = 0; i < node->input_size(); ++i) { - if (node->input(i) == name) { - node->set_input(i, dq_node->output(0)); - } - } - } - } -} - -// According to: -// https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc -void QuantizeModelProcessor::AddQDQForORT() { - UpdateInputNameToNodes(); - supported_quantize_type_ = {"Add", - "Conv", - "LeakyRelu" - "MatMul", - "Mul", - "Relu", - "Sigmoid",}; - for (auto iter = nodes_->begin(); iter < nodes_->end(); iter++) { - auto node = *iter; - auto type_iter = std::find(supported_quantize_type_.begin(), - supported_quantize_type_.end(), node->op_type()); - if (!supported_quantize_type_.empty() && - type_iter == supported_quantize_type_.end()) { - continue; - } - if (node->op_type() == "MatMul") { - std::vector tensor_names = {node->input(0), node->input(1), - node->output(0)}; - for (auto& name : tensor_names) { - if (helper_->quantize_info.find(name) != helper_->quantize_info.end()) { - continue; - } - std::vector matmul_weight; - if (!GetTensorByName(name, &matmul_weight)) { - continue; - } - std::vector matmul_weight_shape; - if (!GetTensorShape(name, &matmul_weight_shape)) { - continue; - } - int64_t quantize_axis = 1; - std::vector scale; - std::vector zeros; - GetChannelWiseQuantizeInfo(matmul_weight, matmul_weight_shape, - quantize_axis, &scale, &zeros); - auto scale_node = - helper_->Constant(ONNX_NAMESPACE::TensorProto::FLOAT, scale); - auto zero_node = - helper_->Constant(ONNX_NAMESPACE::TensorProto::INT8, zeros); - QuantizeInfo matmul_weight_quantize_info(scale, zeros, scale_node, - zero_node, quantize_axis); - helper_->quantize_info[name] = matmul_weight_quantize_info; - } - if (!CanBeQuantize(tensor_names)) { - tensor_names.pop_back(); - if (!CanBeQuantize(tensor_names)) { - continue; - } - } - for (auto& name : tensor_names) { - 
AppendQuantizeTensor(name); - } - } - - std::vector tensor_names; - for (size_t i = 0; i < node->input_size(); ++i) { - std::string node_input = node->input(i); - tensor_names.push_back(node_input); - } - for (size_t i = 0; i < node->output_size(); ++i) { - std::string node_output = node->output(i); - tensor_names.push_back(node_output); - } - if (!CanBeQuantize(tensor_names)) { - continue; - } - for (auto& name : tensor_names) { - AppendQuantizeTensor(name); - } - } - // update name2node_dict for the change of Relu op. - UpdateInputNameToNodes(); - // Add QDQ in model - AddQDQInModel(tensors_to_be_quantize); -} - -void QuantizeModelProcessor::AddQDQInModel( - const std::vector& tensors_to_be_quantize) { +void BaseQuantizeProcessor::AddQDQInModel() { // add Q and DQ according to tensors_to_be_quantize - for (auto& name : tensors_to_be_quantize) { + for (auto& name : tensors_to_be_quantize_) { if (IsGraphOutput(name)) { continue; } Assert(helper_->quantize_info.find(name) != helper_->quantize_info.end(), - "[QuantizeModelProcessor] Can not find quantize info for tensor: " + + "[BaseQuantizeProcessor] Can not find quantize info for tensor: " + name); QuantizeInfo quantize_info = helper_->quantize_info[name]; std::string scale_node = quantize_info.scale_node_; std::string zeros_node = quantize_info.zeros_node_; int64_t quantize_axis = quantize_info.quantize_axis_; - auto iter = std::find(only_dequantize_tensors.begin(), - only_dequantize_tensors.end(), name); - if (iter != only_dequantize_tensors.end()) { + auto iter = std::find(only_dequantize_tensors_.begin(), + only_dequantize_tensors_.end(), name); + if (iter != only_dequantize_tensors_.end()) { // if only add DequantizeLinear std::vector scale = quantize_info.scale_; std::vector bias; Assert(GetTensorByName(name, &bias), - "[QuantizeModelProcessor] Can not find bias value: " + name); + "[BaseQuantizeProcessor] Can not find bias value: " + name); std::vector new_bias(bias.size(), 0); for (int64_t i = 0; i < bias.size(); i++) { float scale_val = scale.size() == 1 ? 
scale[0] : scale[i]; @@ -607,7 +183,7 @@ void QuantizeModelProcessor::AddQDQInModel( } } -void QuantizeModelProcessor::MergeConvBN() { +void BaseQuantizeProcessor::MergeConvBN() { UpdateInputNameToNodes(); for (auto iter = nodes_->begin(); iter < nodes_->end(); iter++) { auto conv_node = *iter; @@ -750,7 +326,7 @@ void QuantizeModelProcessor::MergeConvBN() { } } -void QuantizeModelProcessor::MergeConvAdd() { +void BaseQuantizeProcessor::MergeConvAdd() { UpdateInputNameToNodes(); for (auto iter = nodes_->begin(); iter < nodes_->end(); iter++) { auto node = *iter; @@ -758,13 +334,16 @@ void QuantizeModelProcessor::MergeConvAdd() { continue; } // if act input of conv does not have quantize info, continue - bool act_has_quantize_info = helper_->quantize_info.find(node->input(0)) != helper_->quantize_info.end(); + bool act_has_quantize_info = helper_->quantize_info.find(node->input(0)) != + helper_->quantize_info.end(); if (!act_has_quantize_info) { continue; } // if weight of conv does not have quantize info, continue - bool weight_has_quantize_info = helper_->quantize_info.find(node->input(1)) != helper_->quantize_info.end(); + bool weight_has_quantize_info = + helper_->quantize_info.find(node->input(1)) != + helper_->quantize_info.end(); if (!weight_has_quantize_info) { continue; } @@ -809,15 +388,18 @@ void QuantizeModelProcessor::MergeConvAdd() { continue; } // continue if shape_val != [1, bias_val.size(), 1, 1] - std::vector target = {1, static_cast(bias_val.size()), 1, 1}; + std::vector target = {1, static_cast(bias_val.size()), 1, + 1}; if (target != shape_val) { continue; } // remove Reshape op RemoveNodeByName(before_nodes[0]->name()); // add scale for bias - std::vector weight_scale = helper_->quantize_info[node->input(1)].scale_; - std::vector act_scale = helper_->quantize_info[node->input(0)].scale_; + std::vector weight_scale = + helper_->quantize_info[node->input(1)].scale_; + std::vector act_scale = + helper_->quantize_info[node->input(0)].scale_; std::vector bias_scale; for (int64_t i = 0; i < weight_scale.size(); i++) { bias_scale.push_back(weight_scale[i] * act_scale[0]); @@ -828,7 +410,8 @@ void QuantizeModelProcessor::MergeConvAdd() { auto zero_node = helper_->Constant(ONNX_NAMESPACE::TensorProto::INT32, onnx_zeros); - QuantizeInfo quantize_info(bias_scale, onnx_zeros, scale_node, zero_node, 0); + QuantizeInfo quantize_info(bias_scale, onnx_zeros, scale_node, zero_node, + 0); helper_->quantize_info[bias_node] = quantize_info; AppendQuantizeTensor(bias_node, true); @@ -837,7 +420,7 @@ void QuantizeModelProcessor::MergeConvAdd() { } } -void QuantizeModelProcessor::SortNodes() { +void BaseQuantizeProcessor::SortNodes() { // return the topo sort of nodes; // 1. Get i2o_mapper and constant_nodes, i2o_mapper means the node map to its // all output nodes, constant_nodes save all constant nodes. 
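A minimal standalone sketch (not part of the patch) of the topological re-sort that SortNodes performs: Kahn's algorithm driven by an output-adjacency map like the i2o_mapper described above. ToyNode, TopoSort and the field names are illustrative assumptions, not types from this codebase.

#include <deque>
#include <map>
#include <string>
#include <vector>

// A toy stand-in for an ONNX NodeProto: just a name plus input/output tensor names.
struct ToyNode {
  std::string name;
  std::vector<std::string> inputs;
  std::vector<std::string> outputs;
};

// Emit nodes so that every producer of a node's inputs comes before the node itself.
std::vector<ToyNode> TopoSort(const std::vector<ToyNode>& nodes) {
  std::map<std::string, int> producer;         // tensor name -> index of producing node
  std::map<int, std::vector<int>> i2o;         // node -> nodes consuming its outputs
  std::vector<int> indegree(nodes.size(), 0);  // unmet producers per node
  for (size_t i = 0; i < nodes.size(); ++i)
    for (const auto& out : nodes[i].outputs) producer[out] = static_cast<int>(i);
  for (size_t i = 0; i < nodes.size(); ++i)
    for (const auto& in : nodes[i].inputs) {
      auto it = producer.find(in);
      if (it == producer.end()) continue;  // graph input or weight: nothing to wait for
      i2o[it->second].push_back(static_cast<int>(i));
      ++indegree[i];
    }
  std::deque<int> ready;
  for (size_t i = 0; i < nodes.size(); ++i)
    if (indegree[i] == 0) ready.push_back(static_cast<int>(i));
  std::vector<ToyNode> sorted;
  while (!ready.empty()) {
    int cur = ready.front();
    ready.pop_front();
    sorted.push_back(nodes[cur]);
    for (int next : i2o[cur])
      if (--indegree[next] == 0) ready.push_back(next);
  }
  return sorted;  // fewer elements than nodes would indicate a cycle
}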
@@ -929,7 +512,7 @@ void QuantizeModelProcessor::SortNodes() { *nodes_ = new_nodes; } -void QuantizeModelProcessor::RemoveAllQuantizeOps() { +void BaseQuantizeProcessor::RemoveAllQuantizeOps() { UpdateInputNameToNodes(); for (auto iter = nodes_->begin(); iter < nodes_->end(); iter++) { auto node = *iter; @@ -964,7 +547,7 @@ void QuantizeModelProcessor::RemoveAllQuantizeOps() { // Broadcast quantize info between the input and output of the OPs that will not // change quantize info -void QuantizeModelProcessor::QuantizeInfoBroadcast() { +void BaseQuantizeProcessor::QuantizeInfoBroadcast() { UpdateInputNameToNodes(); for (auto iter = nodes_->begin(); iter < nodes_->end(); iter++) { auto node = *iter; @@ -998,7 +581,7 @@ void QuantizeModelProcessor::QuantizeInfoBroadcast() { } } -bool QuantizeModelProcessor::IsGraphOutput(const std::string& name) { +bool BaseQuantizeProcessor::IsGraphOutput(const std::string& name) { for (auto& item : *outputs_) { auto out_node = (*item.get()); if (name == out_node.name()) { @@ -1009,8 +592,8 @@ bool QuantizeModelProcessor::IsGraphOutput(const std::string& name) { } // Try get tensor shape value -bool QuantizeModelProcessor::GetTensorShape(const std::string& name, - std::vector* shape) { +bool BaseQuantizeProcessor::GetTensorShape(const std::string& name, + std::vector* shape) { for (auto& item : *parameters_) { auto node = *(item.get()); if (node.output(0) != name) { @@ -1028,8 +611,7 @@ bool QuantizeModelProcessor::GetTensorShape(const std::string& name, } } - for (auto& item : *nodes_) - { + for (auto& item : *nodes_) { auto node = *(item.get()); if (node.output(0) != name) { continue; @@ -1049,15 +631,13 @@ bool QuantizeModelProcessor::GetTensorShape(const std::string& name, return !shape->empty(); } -void QuantizeModelProcessor::GetTensorWiseQuantizeInfo( +void BaseQuantizeProcessor::GetTensorWiseQuantizeInfo( const std::vector& tensor, std::vector* scale, std::vector* zero) { - float max_val = -1; - for (int64_t i = 0; i < tensor.size(); i++) { - if (fabs(tensor[i]) > max_val) { - max_val = fabs(tensor[i]); - } - } + Assert(!tensor.empty(), + "[GetTensorWiseQuantizeInfo] Require weight is not empty."); + + float max_val = *std::max_element(tensor.begin(), tensor.end()); Assert(max_val >= 0, "[GetTensorWiseQuantizeInfo] Require the scale >= 0, but now it's " + std::to_string(max_val) + "."); @@ -1065,7 +645,7 @@ void QuantizeModelProcessor::GetTensorWiseQuantizeInfo( zero->push_back(0); } -void QuantizeModelProcessor::GetChannelWiseQuantizeInfo( +void BaseQuantizeProcessor::GetChannelWiseQuantizeInfo( const std::vector& tensor, const std::vector& shape, const int64_t& quant_axis, std::vector* scale, std::vector* zero) { @@ -1111,7 +691,7 @@ void QuantizeModelProcessor::GetChannelWiseQuantizeInfo( zero->push_back(0); } else { Assert(false, - "QuantizeModelProcessor::GetChannelWiseQuantizeInfo only supports " + "BaseQuantizeProcessor::GetChannelWiseQuantizeInfo only supports " "quant_axis equals to 0 or 1, but now it's " + std::to_string(quant_axis) + "."); } @@ -1119,8 +699,8 @@ void QuantizeModelProcessor::GetChannelWiseQuantizeInfo( } template -bool QuantizeModelProcessor::GetTensorByName(const std::string& name, - std::vector* value) { +bool BaseQuantizeProcessor::GetTensorByName(const std::string& name, + std::vector* value) { // Find tensor values in the following order, if found, store the data in // value, and return true: // 1. updated_parameters, the weight of conv or matmul. 
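A minimal standalone sketch of the symmetric int8 scale math behind GetTensorWiseQuantizeInfo and GetChannelWiseQuantizeInfo: the zero point is always 0 and each scale is a maximum absolute value divided by 127, as in the fabs()-based loop that the hunk above replaces. The [C_out, C_in, kH, kW] weight layout and quant_axis == 0 in the per-channel helper are assumptions made only for this illustration.

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <vector>

// Tensor-wise scale: max(|x|) / 127 over the whole tensor.
float TensorWiseScale(const std::vector<float>& tensor) {
  assert(!tensor.empty());
  float max_abs = 0.0f;
  for (float v : tensor) max_abs = std::max(max_abs, std::fabs(v));
  return max_abs / 127.0f;
}

// Channel-wise scales for a weight shaped [C_out, C_in, kH, kW] quantized on axis 0:
// one max(|x|) / 127 per output channel.
std::vector<float> ChannelWiseScale(const std::vector<float>& tensor,
                                    const std::vector<int64_t>& shape) {
  assert(!shape.empty() && shape[0] > 0);
  const int64_t channels = shape[0];
  const int64_t inner = static_cast<int64_t>(tensor.size()) / channels;
  std::vector<float> scales(channels, 0.0f);
  for (int64_t c = 0; c < channels; ++c) {
    for (int64_t i = 0; i < inner; ++i) {
      scales[c] = std::max(scales[c], std::fabs(tensor[c * inner + i]));
    }
    scales[c] /= 127.0f;
  }
  return scales;
}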
@@ -1140,7 +720,7 @@ bool QuantizeModelProcessor::GetTensorByName(const std::string& name, return helper_->TryGetTensorValue(name, value); } -bool QuantizeModelProcessor::ConnectToOutput(const std::string& output_name) { +bool BaseQuantizeProcessor::ConnectToOutput(const std::string& output_name) { std::vector names = {output_name}; while (!names.empty()) { std::string name = names[names.size() - 1]; @@ -1158,7 +738,7 @@ bool QuantizeModelProcessor::ConnectToOutput(const std::string& output_name) { return false; } -bool QuantizeModelProcessor::CanBeQuantize( +bool BaseQuantizeProcessor::CanBeQuantize( const std::vector& tensor_names, const std::vector& output_index) { for (auto& tensor : tensor_names) { @@ -1167,7 +747,8 @@ bool QuantizeModelProcessor::CanBeQuantize( } } - // If there is an OP linked to the output by identity, it needs to be skipped, do not quantize the OP + // If there is an OP linked to the output by identity, it needs to be skipped, + // do not quantize the OP for (auto i = 0; i < output_index.size(); i++) { int64_t index = output_index[i]; if (index == -1) { @@ -1183,18 +764,19 @@ bool QuantizeModelProcessor::CanBeQuantize( return true; } -void QuantizeModelProcessor::AppendQuantizeTensor(const std::string& tensor, - const bool& only_dequantize) { +void BaseQuantizeProcessor::AppendQuantizeTensor(const std::string& tensor, + const bool& only_dequantize) { if (only_dequantize) { - if (std::find(only_dequantize_tensors.begin(), - only_dequantize_tensors.end(), - tensor) == only_dequantize_tensors.end()) { - only_dequantize_tensors.push_back(tensor); + if (std::find(only_dequantize_tensors_.begin(), + only_dequantize_tensors_.end(), + tensor) == only_dequantize_tensors_.end()) { + only_dequantize_tensors_.push_back(tensor); } } else { - if (std::find(tensors_to_be_quantize.begin(), tensors_to_be_quantize.end(), - tensor) == tensors_to_be_quantize.end()) { - tensors_to_be_quantize.push_back(tensor); + if (std::find(tensors_to_be_quantize_.begin(), + tensors_to_be_quantize_.end(), + tensor) == tensors_to_be_quantize_.end()) { + tensors_to_be_quantize_.push_back(tensor); } } } diff --git a/paddle2onnx/mapper/quantize/base_quantize_processor.h b/paddle2onnx/mapper/quantize/base_quantize_processor.h new file mode 100644 index 000000000..7ffca264d --- /dev/null +++ b/paddle2onnx/mapper/quantize/base_quantize_processor.h @@ -0,0 +1,104 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include + +#include +#include +#include + +#include "paddle2onnx/mapper/mapper.h" +#include "paddle2onnx/parser/parser.h" + +namespace paddle2onnx { +class BaseQuantizeProcessor { + public: + BaseQuantizeProcessor() = default; + virtual ~BaseQuantizeProcessor() = default; + + // Convert to different model formats based on backend, backend can be + // TensorRT, ONNXRuntime and Others + virtual void ProcessQuantizeModel( + std::vector> *parameters, + std::vector> *inputs, + std::vector> *outputs, + std::vector> *nodes, + OnnxHelper *helper, const PaddleParser &parser, + std::string *calibration_cache = nullptr); + + protected: + const PaddleParser *parser_; + OnnxHelper *helper_; + std::vector> *parameters_; + std::vector> *inputs_; + std::vector> *outputs_; + std::vector> *nodes_; + std::vector tensors_to_be_quantize_; + std::vector supported_quantize_type_; + std::map>> + name2node_dict_; + + void QuantizeInfoBroadcast(); + void RemoveAllQuantizeOps(); + void MergeConvAdd(); + void MergeConvBN(); + + // only_dequantize records those tensors that only need to add the dequantize + // op + void AppendQuantizeTensor(const std::string &tensor, + const bool &only_dequantize = false); + template + bool GetTensorByName(const std::string &name, std::vector *value); + bool GetTensorShape(const std::string &name, std::vector *shape); + // Generate name2node_dict to save input name and its related nodes + void UpdateInputNameToNodes(); + // Perform tensor wise quantization, returning scale and zero + void GetTensorWiseQuantizeInfo(const std::vector &tensor, + std::vector *scale, + std::vector *zero); + // Perform channel wise quantization, returning scale and zero + void GetChannelWiseQuantizeInfo(const std::vector &tensor, + const std::vector &shape, + const int64_t &quant_axis, + std::vector *scale, + std::vector *zero); + // If all tensors in tensor_names have quantize info and all the next nodes + // can be quantized, return True, otherwise + // return false + bool CanBeQuantize(const std::vector &tensor_names, + const std::vector &output_index = {-1}); + // Add quantize related op in model according to tensor names + void AddQDQInModel(); + void RemoveIdentityOp(); + // Determine whether a tensor is an output + bool IsGraphOutput(const std::string &name); + virtual void AddQDQ(); + + // Because processing the quantize model will add new nodes, which will + // destroy the topo sorting of nodes, this function will sort the nodes again + void SortNodes(); + + private: + std::vector only_dequantize_tensors_; + + // Determine if the tensor is directly linked to the output by identity + bool ConnectToOutput(const std::string &output_name); + void RemoveNodeByName(const std::string &name, const bool &update_io = true); + void ReplaceInputOfAllNodes( + const std::string &old_name, const std::string &new_name, + const std::vector> + &except_nodes = {}); +}; +} // namespace paddle2onnx diff --git a/paddle2onnx/mapper/quantize/ort_quantize_processor.cc b/paddle2onnx/mapper/quantize/ort_quantize_processor.cc new file mode 100644 index 000000000..e0601597f --- /dev/null +++ b/paddle2onnx/mapper/quantize/ort_quantize_processor.cc @@ -0,0 +1,125 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle2onnx/mapper/quantize/ort_quantize_processor.h"
+
+namespace paddle2onnx {
+ORTQuantizeProcessor::ORTQuantizeProcessor() {
+  supported_quantize_type_ = {
+      "Add",
+      "Conv",
+      "LeakyRelu",
+      "MatMul",
+      "Mul",
+      "Relu",
+      "Sigmoid",
+  };
+}
+
+// According to:
+// https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc
+void ORTQuantizeProcessor::AddQDQ() {
+  BaseQuantizeProcessor::AddQDQ();
+  for (auto iter = nodes_->begin(); iter < nodes_->end(); iter++) {
+    auto node = *iter;
+    auto type_iter = std::find(supported_quantize_type_.begin(),
+                               supported_quantize_type_.end(), node->op_type());
+    if (!supported_quantize_type_.empty() &&
+        type_iter == supported_quantize_type_.end()) {
+      continue;
+    }
+    if (node->op_type() == "MatMul") {
+      std::vector tensor_names = {node->input(0), node->input(1),
+                                  node->output(0)};
+      for (auto& name : tensor_names) {
+        if (helper_->quantize_info.find(name) != helper_->quantize_info.end()) {
+          continue;
+        }
+        std::vector matmul_weight;
+        if (!GetTensorByName(name, &matmul_weight)) {
+          continue;
+        }
+        std::vector matmul_weight_shape;
+        if (!GetTensorShape(name, &matmul_weight_shape)) {
+          continue;
+        }
+        int64_t quantize_axis = 1;
+        std::vector scale;
+        std::vector zeros;
+        GetChannelWiseQuantizeInfo(matmul_weight, matmul_weight_shape,
+                                   quantize_axis, &scale, &zeros);
+        auto scale_node =
+            helper_->Constant(ONNX_NAMESPACE::TensorProto::FLOAT, scale);
+        auto zero_node =
+            helper_->Constant(ONNX_NAMESPACE::TensorProto::INT8, zeros);
+        QuantizeInfo matmul_weight_quantize_info(scale, zeros, scale_node,
+                                                 zero_node, quantize_axis);
+        helper_->quantize_info[name] = matmul_weight_quantize_info;
+      }
+      if (!CanBeQuantize(tensor_names)) {
+        tensor_names.pop_back();
+        if (!CanBeQuantize(tensor_names)) {
+          continue;
+        }
+      }
+      for (auto& name : tensor_names) {
+        AppendQuantizeTensor(name);
+      }
+    }
+
+    std::vector tensor_names;
+    for (size_t i = 0; i < node->input_size(); ++i) {
+      std::string node_input = node->input(i);
+      tensor_names.push_back(node_input);
+    }
+    for (size_t i = 0; i < node->output_size(); ++i) {
+      std::string node_output = node->output(i);
+      tensor_names.push_back(node_output);
+    }
+    if (!CanBeQuantize(tensor_names)) {
+      continue;
+    }
+    for (auto& name : tensor_names) {
+      AppendQuantizeTensor(name);
+    }
+  }
+}
+
+void ORTQuantizeProcessor::ProcessQuantizeModel(
+    std::vector>* parameters,
+    std::vector>* inputs,
+    std::vector>* outputs,
+    std::vector>* nodes,
+    OnnxHelper* helper, const PaddleParser& parser,
+    std::string* calibration_cache) {
+  BaseQuantizeProcessor::ProcessQuantizeModel(
+      parameters, inputs, outputs, nodes, helper, parser, calibration_cache);
+
+  // When deploy_backend is ONNXRuntime, use the following steps to process:
+  // 1. broadcast quantize info
+  // 2. remove all quantize ops
+  // 3. merge conv and add
+  // 4. merge conv and bn
+  // 5. add Q and DQ according to the ONNXRuntime quantize OP fuse pattern.
+  // 6.
use topo sort in nodes + QuantizeInfoBroadcast(); + RemoveAllQuantizeOps(); + MergeConvAdd(); + MergeConvBN(); + AddQDQ(); + UpdateInputNameToNodes(); + AddQDQInModel(); + SortNodes(); +} +} // namespace paddle2onnx \ No newline at end of file diff --git a/paddle2onnx/mapper/quantize/ort_quantize_processor.h b/paddle2onnx/mapper/quantize/ort_quantize_processor.h new file mode 100644 index 000000000..34d50cb32 --- /dev/null +++ b/paddle2onnx/mapper/quantize/ort_quantize_processor.h @@ -0,0 +1,37 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle2onnx/mapper/quantize/base_quantize_processor.h" + +namespace paddle2onnx { +class ORTQuantizeProcessor : public BaseQuantizeProcessor { + public: + ORTQuantizeProcessor(); + virtual ~ORTQuantizeProcessor() = default; + + // According to: + // https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc + void AddQDQ() override; + + void ProcessQuantizeModel( + std::vector> *parameters, + std::vector> *inputs, + std::vector> *outputs, + std::vector> *nodes, + OnnxHelper *helper, const PaddleParser &parser, + std::string *calibration_cache = nullptr) override; +}; +} // namespace paddle2onnx \ No newline at end of file diff --git a/paddle2onnx/mapper/quantize/other_quantize_processor.cc b/paddle2onnx/mapper/quantize/other_quantize_processor.cc new file mode 100644 index 000000000..a11f52282 --- /dev/null +++ b/paddle2onnx/mapper/quantize/other_quantize_processor.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle2onnx/mapper/quantize/other_quantize_processor.h" + +namespace paddle2onnx { +void OtherQuantizeProcessor::ProcessQuantizeModel( + std::vector>* parameters, + std::vector>* inputs, + std::vector>* outputs, + std::vector>* nodes, + OnnxHelper* helper, const PaddleParser& parser, + std::string* calibration_cache) { + BaseQuantizeProcessor::ProcessQuantizeModel( + parameters, inputs, outputs, nodes, helper, parser, calibration_cache); + + // If deploy_backend is others, the quantization model is exported as a + // float model + quantization table. 
+ RemoveAllQuantizeOps(); + std::ofstream outfile; + outfile.open("max_range.txt", std::ios::out); + if (!outfile.is_open()) { + P2OLogger() << "[WARNING] Quantize model processer failed to write range " + "information in current location." + << std::endl; + return; + } + for (auto iter = helper_->quantize_info.begin(); + iter != helper_->quantize_info.end(); iter++) { + std::string log = iter->first; + auto scale = iter->second.scale_; + if (scale.size() == 1) { + log = log + ": " + std::to_string(scale[0] * 127); + outfile << log << std::endl; + } + } + outfile.close(); +} +} // namespace paddle2onnx \ No newline at end of file diff --git a/paddle2onnx/mapper/quantize/other_quantize_processor.h b/paddle2onnx/mapper/quantize/other_quantize_processor.h new file mode 100644 index 000000000..bf395fceb --- /dev/null +++ b/paddle2onnx/mapper/quantize/other_quantize_processor.h @@ -0,0 +1,33 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle2onnx/mapper/quantize/base_quantize_processor.h" + +namespace paddle2onnx { +class OtherQuantizeProcessor : public BaseQuantizeProcessor { + public: + OtherQuantizeProcessor() = default; + virtual ~OtherQuantizeProcessor() = default; + + void ProcessQuantizeModel( + std::vector> *parameters, + std::vector> *inputs, + std::vector> *outputs, + std::vector> *nodes, + OnnxHelper *helper, const PaddleParser &parser, + std::string *calibration_cache = nullptr) override; +}; +} // namespace paddle2onnx \ No newline at end of file diff --git a/paddle2onnx/mapper/quantize/rknn_quantize_processor.cc b/paddle2onnx/mapper/quantize/rknn_quantize_processor.cc new file mode 100644 index 000000000..1bc019ba2 --- /dev/null +++ b/paddle2onnx/mapper/quantize/rknn_quantize_processor.cc @@ -0,0 +1,244 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle2onnx/mapper/quantize/rknn_quantize_processor.h" + +namespace paddle2onnx { +RKNNQuantizeProcessor::RKNNQuantizeProcessor() { + supported_quantize_type_ = {"Abs", + "Acos", + "Add", + "Asin", + "Atan", + "AveragePool", + "BatchNormalization", + "Ceil", + "Clip", + "Conv", + "ConvTranspose", + "Cos", + "Cosh", + "Concat", + "Div", + "Elu", + "Erf", + "Exp", + "Floor", + "Gemm", + "GlobalAveragePool", + "HardSigmoid", + "HardSwish", + "InstanceNormalization", + "IsInf", + "IsNaN", + "Log", + "MatMul", + "MaxPool", + "Mul", + "Neg", + "ReduceMean", + "Relu", + "Reshape", + "Resize", + "Round", + "Shape", + "Sigmoid", + "Sin", + "Sinh", + "Slice", + "Softmax", + "Split", + "Sqrt", + "Tan", + "Tanh", + "Transpose"}; +} + +void RKNNQuantizeProcessor::AddQDQ() { + BaseQuantizeProcessor::AddQDQ(); + for (auto iter = nodes_->begin(); iter < nodes_->end(); iter++) { + auto node = *iter; + auto type_iter = std::find(supported_quantize_type_.begin(), + supported_quantize_type_.end(), node->op_type()); + if (type_iter == supported_quantize_type_.end()) { + continue; + } + + std::vector tensor_names = {}; + for (size_t i = 0; i < node->input_size(); ++i) { + std::string node_input = node->input(i); + tensor_names.push_back(node_input); + } + for (size_t i = 0; i < node->output_size(); ++i) { + std::string node_output = node->output(i); + tensor_names.push_back(node_output); + } + + if (node->op_type() == "MatMul" || node->op_type() == "Mul") { + for (auto& name : tensor_names) { + if (helper_->quantize_info.find(name) != helper_->quantize_info.end()) { + continue; + } + std::vector matmul_weight; + if (!GetTensorByName(name, &matmul_weight)) { + continue; + } + std::vector matmul_weight_shape; + if (!GetTensorShape(name, &matmul_weight_shape)) { + continue; + } + int64_t quantize_axis = 1; + std::vector scale; + std::vector zeros; + GetChannelWiseQuantizeInfo(matmul_weight, matmul_weight_shape, + quantize_axis, &scale, &zeros); + auto scale_node = + helper_->Constant(ONNX_NAMESPACE::TensorProto::FLOAT, scale); + auto zero_node = + helper_->Constant(ONNX_NAMESPACE::TensorProto::INT8, zeros); + QuantizeInfo matmul_weight_quantize_info(scale, zeros, scale_node, + zero_node, quantize_axis); + helper_->quantize_info[name] = matmul_weight_quantize_info; + } + } else if (node->op_type() == "Add") { + for (auto& name : tensor_names) { + if (helper_->quantize_info.find(name) != helper_->quantize_info.end()) { + continue; + } + + std::vector weight; + if (!GetTensorByName(name, &weight)) { + P2OLogger() << "Failed to GetTensorByName: " << node->name() << ";" + << name << std::endl; + continue; + } + + int64_t quantize_axis = 1; + std::vector scale; + std::vector zeros; + GetTensorWiseQuantizeInfo(weight, &scale, &zeros); + + std::string scale_node, zero_node; + scale_node = + helper_->Constant({}, ONNX_NAMESPACE::TensorProto::FLOAT, scale[0]); + zero_node = + helper_->Constant({}, ONNX_NAMESPACE::TensorProto::INT8, zeros[0]); + + QuantizeInfo quantize_info(scale, zeros, scale_node, zero_node, + quantize_axis); + helper_->quantize_info[name] = quantize_info; + } + } else if (node->op_type() == "BatchNormalization") { + // BatchNormalization only need quntize X and Y. 
+ // when opset > 9, tensor_names is {X, scale, B, input_mean, input_var, Y, + // running_mean, running_var} when opset <= 9, tensor_names is {X, scale, + // B, mean, var, Y, mean, var, saved_mean, saved_var} + tensor_names.erase(tensor_names.begin() + 1, tensor_names.begin() + 5); + tensor_names.erase(tensor_names.begin() + 2, tensor_names.end()); + } + + if (!CanBeQuantize(tensor_names)) { + continue; + } + + for (auto& name : tensor_names) { + AppendQuantizeTensor(name); + } + } +} + +void RKNNQuantizeProcessor::PerchannelToPerlayer() { + UpdateInputNameToNodes(); + for (auto iter = nodes_->begin(); iter < nodes_->end(); iter++) { + auto node = *iter; + + if (node->op_type() != "MatMul" && node->op_type() != "Mul") { + continue; + } + + auto next_nodes = name2node_dict_[node->output(0)]; + if (next_nodes.size() > 1 || IsGraphOutput(node->output(0))) { + P2OLogger() << "Type1" << std::endl; + continue; + } + + auto add_node = next_nodes[0]; + if (add_node->op_type() != "Add" || IsGraphOutput(add_node->output(0))) { + P2OLogger() << "Type2" << std::endl; + continue; + } + + std::vector tensor_names = {}; + for (size_t i = 0; i < node->input_size(); ++i) { + std::string node_input = node->input(i); + tensor_names.push_back(node_input); + } + for (size_t i = 0; i < node->output_size(); ++i) { + std::string node_output = node->output(i); + tensor_names.push_back(node_output); + } + + for (auto& name : tensor_names) { + if (helper_->quantize_info.find(name) == helper_->quantize_info.end()) { + continue; + } + + auto ori_quantize_info = helper_->quantize_info[name]; + auto ori_scale = ori_quantize_info.scale_; + + int64_t now_quantize_axis = 1; + std::vector now_scale = { + *std::max_element(ori_scale.begin(), ori_scale.end())}; + std::vector now_zeros = {0}; + + std::string scale_node, zero_node; + scale_node = helper_->Constant({}, ONNX_NAMESPACE::TensorProto::FLOAT, + now_scale[0]); + zero_node = helper_->Constant({}, ONNX_NAMESPACE::TensorProto::INT8, + now_zeros[0]); + + QuantizeInfo now_quantize_info(now_scale, now_zeros, scale_node, + zero_node, now_quantize_axis); + helper_->quantize_info[name] = now_quantize_info; + } + } +} + +void RKNNQuantizeProcessor::ProcessQuantizeModel( + std::vector>* parameters, + std::vector>* inputs, + std::vector>* outputs, + std::vector>* nodes, + OnnxHelper* helper, const PaddleParser& parser, + std::string* calibration_cache) { + BaseQuantizeProcessor::ProcessQuantizeModel( + parameters, inputs, outputs, nodes, helper, parser, calibration_cache); + + // When deploy_backend is RKNN, use the follow four steps to process: + // 1. broadcast quantize info + // 2. remove all quantize ops + // 3. add Q and DQ + // 4. use topo sort in nodes + QuantizeInfoBroadcast(); + RemoveAllQuantizeOps(); + RemoveIdentityOp(); + // MergeConvAdd(); + // MergeConvBN(); + AddQDQ(); + PerchannelToPerlayer(); + UpdateInputNameToNodes(); + AddQDQInModel(); + SortNodes(); +} +} // namespace paddle2onnx \ No newline at end of file diff --git a/paddle2onnx/mapper/quantize/rknn_quantize_processor.h b/paddle2onnx/mapper/quantize/rknn_quantize_processor.h new file mode 100644 index 000000000..93c72bfcc --- /dev/null +++ b/paddle2onnx/mapper/quantize/rknn_quantize_processor.h @@ -0,0 +1,38 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle2onnx/mapper/quantize/base_quantize_processor.h" + +namespace paddle2onnx { +class RKNNQuantizeProcessor : public BaseQuantizeProcessor { + public: + RKNNQuantizeProcessor(); + virtual ~RKNNQuantizeProcessor() = default; + + void AddQDQ() override; + + void ProcessQuantizeModel( + std::vector> *parameters, + std::vector> *inputs, + std::vector> *outputs, + std::vector> *nodes, + OnnxHelper *helper, const PaddleParser &parser, + std::string *calibration_cache = nullptr) override; + + private: + void PerchannelToPerlayer(); +}; +} // namespace paddle2onnx \ No newline at end of file diff --git a/paddle2onnx/mapper/quantize/tensorrt_quantize_processor.cc b/paddle2onnx/mapper/quantize/tensorrt_quantize_processor.cc new file mode 100644 index 000000000..af4575643 --- /dev/null +++ b/paddle2onnx/mapper/quantize/tensorrt_quantize_processor.cc @@ -0,0 +1,183 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle2onnx/mapper/quantize/tensorrt_quantize_processor.h" + +namespace paddle2onnx { +// In TensorRT, all quantized op: Conv, ConvTranspose, liner(MatMul), MaxPool, +// AvgPool, AdaptiveAvgPool, rnn(not support now) +// According to: +// https://github.com/NVIDIA/TensorRT/tree/main/tools/pytorch-quantization/pytorch_quantization/nn/modules +void TensorRTQuantizeProcessor::AddQDQ() { + BaseQuantizeProcessor::AddQDQ(); + std::vector + quantize_tensors; // save the tensor names that need add quantize ops + std::vector pool_types = {"MaxPool", "AvgPool", + "AdaptiveAvgPool"}; + for (auto iter = nodes_->begin(); iter < nodes_->end(); iter++) { + quantize_tensors.clear(); + auto node = *iter; + if (node->op_type() == "Conv" || node->op_type() == "ConvTranspose") { + std::vector tensor_names = {node->input(0), node->input(1)}; + if (!CanBeQuantize(tensor_names)) { + continue; + } + quantize_tensors = tensor_names; + } + if (node->op_type() == "MatMul") { + std::vector tensor_names = {node->input(0), node->input(1)}; + for (auto& name : tensor_names) { + if (helper_->quantize_info.find(name) != helper_->quantize_info.end()) { + continue; + } + std::vector matmul_weight; + if (!GetTensorByName(name, &matmul_weight)) { + continue; + } + std::vector matmul_weight_shape; + if (!GetTensorShape(name, &matmul_weight_shape)) { + continue; + } + int64_t quantize_axis = 1; + std::vector scale; + std::vector zeros; + GetChannelWiseQuantizeInfo(matmul_weight, matmul_weight_shape, + quantize_axis, &scale, &zeros); + auto scale_node = + helper_->Constant(ONNX_NAMESPACE::TensorProto::FLOAT, scale); + auto zero_node = + helper_->Constant(ONNX_NAMESPACE::TensorProto::INT8, zeros); + QuantizeInfo matmul_weight_quantize_info(scale, zeros, scale_node, + zero_node, quantize_axis); + helper_->quantize_info[name] = matmul_weight_quantize_info; + } + if (!CanBeQuantize(tensor_names)) { + continue; + } + quantize_tensors = tensor_names; + } + auto type_iter = + std::find(pool_types.begin(), pool_types.end(), node->op_type()); + if (type_iter != pool_types.end()) { + std::vector tensor_names = {node->input(0)}; + if (!CanBeQuantize(tensor_names)) { + continue; + } + quantize_tensors = tensor_names; + } + + std::string negative_scale_tensor = ""; + for (std::string& name : quantize_tensors) { + Assert(helper_->quantize_info.find(name) != helper_->quantize_info.end(), + "[BaseQuantizeProcessor] Can not find quantize info for tensor: " + + name); + QuantizeInfo quantize_info = helper_->quantize_info[name]; + std::vector scales = quantize_info.scale_; + for (auto& i : scales) { + if (i <= 1e-10) { + negative_scale_tensor = negative_scale_tensor + " " + name; + } + } + } + if (negative_scale_tensor.size() > 0) { + P2OLogger() + << "[Warning] The scale of tensors: [ " + negative_scale_tensor + + " ] contains negative scale, so this OP will not be quantized." 
+ << std::endl; + continue; + } + // An OP requires a separate quantize op + for (std::string& name : quantize_tensors) { + if (IsGraphOutput(name)) { + continue; + } + QuantizeInfo quantize_info = helper_->quantize_info[name]; + std::string scale_node = quantize_info.scale_node_; + std::string zeros_node = quantize_info.zeros_node_; + int64_t quantize_axis = quantize_info.quantize_axis_; + auto q_node = + helper_->MakeNode("QuantizeLinear", {name, scale_node, zeros_node}); + if (helper_->GetOpsetVersion() >= 13) { + AddAttribute(q_node, "axis", quantize_axis); + } + auto dq_node = helper_->MakeNode( + "DequantizeLinear", {q_node->output(0), scale_node, zeros_node}); + if (helper_->GetOpsetVersion() >= 13) { + AddAttribute(dq_node, "axis", quantize_axis); + } + for (size_t i = 0; i < node->input_size(); ++i) { + if (node->input(i) == name) { + node->set_input(i, dq_node->output(0)); + } + } + } + } +} + +void TensorRTQuantizeProcessor::GenerateCache(std::string* calibration_cache) { + union { + float f; + unsigned char farray[4]; + } un; + *calibration_cache += "TRT-8XXX-EntropyCalibration2 \n"; + for (auto iter = helper_->quantize_info.rbegin(); + iter != helper_->quantize_info.rend(); iter++) { + std::string tensor_name = iter->first; + QuantizeInfo quantize_info = iter->second; + if (quantize_info.scale_.size() == 1) { + float val = quantize_info.scale_[0]; + un.f = val; + *calibration_cache += (tensor_name + ": "); + std::stringstream enc; + for (int64_t i = 3; i >= 0; i--) { + enc << std::hex << std::setw(2) << std::setfill('0') + << (int)(un.farray[i]); + } + *calibration_cache = *calibration_cache + enc.str() + "\n"; + } + } +} + +void TensorRTQuantizeProcessor::ProcessQuantizeModel( + std::vector>* parameters, + std::vector>* inputs, + std::vector>* outputs, + std::vector>* nodes, + OnnxHelper* helper, const PaddleParser& parser, + std::string* calibration_cache) { + BaseQuantizeProcessor::ProcessQuantizeModel( + parameters, inputs, outputs, nodes, helper, parser, calibration_cache); + + // When deploy_backend is TensorRT, use the follow four steps to process: + // For Explicit Quantization + // 1. broadcast quantize info + // 2. remove all quantize ops + // 3. add Q and DQ before conv and matmul. + // 4. use topo sort in nodes + + // For Implicit Quantization + // 1. remove all quantize ops + // 2. broadcast quantize info + // 3. save float onnx model and alibration.cache + QuantizeInfoBroadcast(); + RemoveAllQuantizeOps(); + // Add qdq for Explicit Quantization + // AddTrtQDQ(); + // SortNodes(); + + // Genarate calibration.cache for Implicit Quantization + // convert float to hex + GenerateCache(calibration_cache); +} +} // namespace paddle2onnx \ No newline at end of file diff --git a/paddle2onnx/mapper/quantize/tensorrt_quantize_processor.h b/paddle2onnx/mapper/quantize/tensorrt_quantize_processor.h new file mode 100644 index 000000000..3b207e178 --- /dev/null +++ b/paddle2onnx/mapper/quantize/tensorrt_quantize_processor.h @@ -0,0 +1,42 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle2onnx/mapper/quantize/base_quantize_processor.h" + +namespace paddle2onnx { +class TensorRTQuantizeProcessor : public BaseQuantizeProcessor { + public: + TensorRTQuantizeProcessor() = default; + virtual ~TensorRTQuantizeProcessor() = default; + + void ProcessQuantizeModel( + std::vector> *parameters, + std::vector> *inputs, + std::vector> *outputs, + std::vector> *nodes, + OnnxHelper *helper, const PaddleParser &parser, + std::string *calibration_cache = nullptr) override; + + protected: + // According to: + // https://github.com/NVIDIA/TensorRT/tree/main/tools/pytorch-quantization/pytorch_quantization/nn/modules + void AddQDQ() override; + + private: + // Generate cache file for TensorRT8.X int8 deploy + void GenerateCache(std::string *calibration_cache); +}; +} // namespace paddle2onnx \ No newline at end of file diff --git a/paddle2onnx/mapper/quantize_helper.h b/paddle2onnx/mapper/quantize_helper.h deleted file mode 100755 index 31913b459..000000000 --- a/paddle2onnx/mapper/quantize_helper.h +++ /dev/null @@ -1,135 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#include - -#include -#include -#include - -#include "paddle2onnx/mapper/mapper.h" -#include "paddle2onnx/parser/parser.h" -namespace paddle2onnx { - -struct QuantizeModelProcessor { - public: - std::vector quantize_info; - const PaddleParser* parser_; - OnnxHelper* helper_; - - std::vector>* parameters_; - std::vector>* inputs_; - std::vector>* outputs_; - std::vector>* nodes_; - // All types that support quantization - std::vector supported_quantize_type_; - - std::map>> - name2node_dict_; - std::vector tensors_to_be_quantize; // records those tensors - // that need to add quantize - // and dequantize op - std::vector only_dequantize_tensors; // records those tensors - // that only need to add - // the dequantize op - // Convert to different model formats based on backend, backend can be - // TensorRT, ONNXRuntime and Others - void ProcessQuantizeModel( - std::vector>* parameters, - std::vector>* inputs, - std::vector>* outputs, - std::vector>* nodes, - OnnxHelper* helper, const std::string& deploy_backend, - const PaddleParser& parser, std::string* calibration_cache = nullptr); - - // Remove all Quantize and Dequantize ops - void RemoveAllQuantizeOps(); - - // If all tensors in tensor_names have quantize info and all the next nodes - // can be quantized, return True, otherwise - // return false - bool CanBeQuantize(const std::vector& tensor_names, - const std::vector& output_index = {-1}); - // only_dequantize records those tensors that only need to add the dequantize - // op - void AppendQuantizeTensor(const std::string& tensor, - const bool& only_dequantize = false); - - // Add QDQ for ORT according to: - // https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc - void AddQDQForORT(); - - // Determine if the tensor is directly linked to the output by identity - bool ConnectToOutput(const std::string& output_name); - - // Generate cache file for TensorRT8.X int8 deploy - void GenerateCache(std::string* calibration_cache); - - // Add QDQ for TRT according to: - // https://github.com/NVIDIA/TensorRT/tree/main/tools/pytorch-quantization/pytorch_quantization/nn/modules - void AddTrtQDQ(); - - // Add QDQ for RKNN - void AddQDQForRKNN(); - - void RemoveIdentityOp(); - - // Add quantize related op in model according to tensor names - void AddQDQInModel(const std::vector& tensors_to_be_quantize); - - void QuantizeInfoBroadcast(); - - // merge conv + add - void MergeConvAdd(); - - // merge conv + BN - void MergeConvBN(); - - // Determine whether a tensor is an output - bool IsGraphOutput(const std::string& name); - - // Because processing the quantize model will add new nodes, which will - // destroy the topo sorting of nodes, this function will sort the nodes again - void SortNodes(); - - bool GetTensorShape(const std::string& name, std::vector* shape); - - // return the value of tensor by name - template - bool GetTensorByName(const std::string& name, std::vector* value); - - // Perform tensor wise quantization, returning scale and zero - void GetTensorWiseQuantizeInfo(const std::vector& tensor, - std::vector* scale, - std::vector* zero); - - // Perform channel wise quantization, returning scale and zero - void GetChannelWiseQuantizeInfo(const std::vector& tensor, - const std::vector& shape, - const int64_t& quant_axis, - std::vector* scale, - std::vector* zero); - - // Generate name2node_dict to save input name and its related nodes - void UpdateInputNameToNodes(); - - void 
RemoveNodeByName(const std::string& name, const bool& update_io = true); - - void ReplaceInputOfAllNodes( - const std::string& old_name, const std::string& new_name, - const std::vector>& - except_nodes = {}); -}; -} // namespace paddle2onnx
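A hedged sketch of how a caller could pick one of the processors defined in this patch from the deploy_backend string. SelectProcessor and this dispatch are hypothetical illustrations, not the exporter's actual wiring; the backend names mirror the ones handled by the code above.

#include <memory>
#include <string>

#include "paddle2onnx/mapper/quantize/ort_quantize_processor.h"
#include "paddle2onnx/mapper/quantize/other_quantize_processor.h"
#include "paddle2onnx/mapper/quantize/rknn_quantize_processor.h"
#include "paddle2onnx/mapper/quantize/tensorrt_quantize_processor.h"

namespace paddle2onnx {
// Hypothetical helper: map a deploy_backend string to a backend-specific processor.
std::unique_ptr<BaseQuantizeProcessor> SelectProcessor(
    const std::string& deploy_backend) {
  if (deploy_backend == "onnxruntime") {
    return std::make_unique<ORTQuantizeProcessor>();
  } else if (deploy_backend == "rknn") {
    return std::make_unique<RKNNQuantizeProcessor>();
  } else if (deploy_backend == "tensorrt") {
    return std::make_unique<TensorRTQuantizeProcessor>();
  }
  // "others": export a float model plus a max_range.txt quantization table.
  return std::make_unique<OtherQuantizeProcessor>();
}
}  // namespace paddle2onnx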