New format quant model support for MKLDNN #45416

Merged
18 commits merged on Sep 5, 2022
303 changes: 250 additions & 53 deletions paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc

Large diffs are not rendered by default.

51 changes: 47 additions & 4 deletions paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h
@@ -43,13 +43,34 @@ class QuantDequantMkldnnPass : public FusePassBase {
std::unordered_map<std::string, std::vector<float>>* weight_thresholds)
const;

void CollectInputScalesFromFake(
///
/// \brief Collect weight scale info from onnx_format dequantize_linear ops
/// onnx_format_dequantize_types: the onnx_format dequantize op types
/// weight_thresholds: scale info for weights
/// var_quant_scales: scale info for activations
/// onnx_format_quantize_model: records whether the quantized model is an
/// onnx_format quantized model
///
void CollectWeightScalesInfoFromONNXFormatDequantize(
ir::Graph* graph,
Scope* scope,
std::unordered_map<std::string, std::vector<float>>* weight_thresholds,
std::unordered_map<std::string, std::vector<float>>* var_quant_scales,
bool* onnx_format_quantize_model) const;

void CollectInputScalesFromQuantize(
ir::Graph* graph,
Scope* scope,
const std::unordered_set<std::string>& fake_quantize_types,
std::unordered_map<std::string, std::vector<float>>* var_quant_scales)
const;

void ConvertFromINT8ToFP32(const std::vector<float>& scales,
Tensor* weight_tensor,
int8_t* int8_weight_data,
float* fp32_weight_data,
const std::string& weight_var_name) const;

void CollectOutputScalesFromAttr(
ir::Graph* graph,
std::unordered_map<std::string, std::vector<float>>* var_quant_scales)
@@ -64,12 +85,22 @@ class QuantDequantMkldnnPass : public FusePassBase {
Node* op_node,
std::unordered_set<const Node*>* nodes2rm) const;

///
/// \brief Collect all the onnx_format quantize-related ops to remove
/// nodes2rm: records all quantize-related ops to remove
///
void CollectQuantizeDequantizeOpsFromONNXFormat(
ir::Graph* graph,
Node* op_node,
std::unordered_set<const Node*>* nodes2rm) const;

void RemoveFakeOps(
ir::Graph* graph,
const std::unordered_set<std::string>& fake_quantize_types,
const std::unordered_set<std::string>& fake_dequantize_types,
const std::unordered_set<std::string>& fake_quantize_dequantize_types)
const;
const std::unordered_set<std::string>& fake_quantize_dequantize_types,
const std::unordered_set<std::string>&
onnx_format_quantize_dequantize_types) const;

bool IsInt8Weight(Node* op_node,
Scope* scope,
@@ -85,11 +116,23 @@ class QuantDequantMkldnnPass : public FusePassBase {
const std::unordered_map<std::string, std::vector<float>>&
weight_thresholds) const;

///
/// \brief Dequantize the weights of conv or matmul ops
/// weight_thresholds: recorded scale info for weights
///
void DequantizeOpWeightsFromONNXFormat(
Node* op_node,
Scope* scope,
const std::string& weight_name,
const std::unordered_map<std::string, std::vector<float>>&
weight_thresholds) const;

void DequantizeWeights(
ir::Graph* graph,
Scope* scope,
const std::unordered_map<std::string, std::vector<float>>&
weight_thresholds) const;
weight_thresholds,
const bool& onnx_format_quantize_model) const;

void UpdateActivations(ir::Graph* graph) const;

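The header above only declares the weight-dequantization helpers; as a rough illustration of what ConvertFromINT8ToFP32 is declared to do, here is a minimal standalone sketch that expands int8 weight data back to fp32 using recorded scales. The helper name, the per-output-channel scale layout, and the multiply convention (fp32 = int8 * scale) are assumptions for illustration, not the pass's actual implementation:

// Minimal sketch, not the actual pass code. Assumes one scale per output
// channel with channels stored contiguously; scales.size() == 1 degenerates
// to per-tensor dequantization. Whether the recorded scale is multiplied or
// divided depends on how it was stored; multiplication is assumed here.
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<float> Int8WeightToFp32(const std::vector<int8_t>& int8_weight,
                                    const std::vector<float>& scales) {
  const size_t per_channel = int8_weight.size() / scales.size();
  std::vector<float> fp32_weight(int8_weight.size());
  for (size_t i = 0; i < int8_weight.size(); ++i) {
    fp32_weight[i] =
        static_cast<float>(int8_weight[i]) * scales[i / per_channel];
  }
  return fp32_weight;
}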
3 changes: 3 additions & 0 deletions paddle/fluid/inference/analysis/argument.h
100644 → 100755
@@ -177,6 +177,9 @@ struct Argument {
DECL_ARGUMENT_FIELD(mkldnn_cache_capacity, MkldnnCacheCapacity, int);

#ifdef PADDLE_WITH_MKLDNN
// Calibration file path of the quantized model
DECL_ARGUMENT_FIELD(calibration_file_path, CalibrationFilePath, std::string);

// A set of op types to enable their quantized kernels
DECL_ARGUMENT_FIELD(quantize_enabled_op_types,
QuantizeEnabledOpTypes,
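DECL_ARGUMENT_FIELD is the macro Argument uses to declare a field together with its accessors. Judging purely from the call sites added in this PR, the new calibration_file_path field appears to expose roughly the following surface; this is a sketch of the implied interface, not the macro's real expansion:

// Implied accessor surface (assumption based on call sites in this PR:
// argument->calibration_file_path(), argument_.SetCalibrationFilePath(...),
// argument->Has("calibration_file_path")). Not the real macro expansion.
#include <string>

struct ArgumentSketch {
  std::string calibration_file_path() const;             // getter
  void SetCalibrationFilePath(const std::string& path);  // setter
  bool Has(const std::string& field) const;              // presence check
};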
51 changes: 51 additions & 0 deletions paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc
@@ -20,6 +20,10 @@
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/inference/analysis/ir_pass_manager.h"

#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h"
#endif

namespace paddle {
namespace inference {
namespace analysis {
@@ -32,6 +36,19 @@ void IrAnalysisPass::RunImpl(Argument* argument) {
auto* the_graph = argument->ReleaseMainGraph();
auto graph = std::unique_ptr<Graph>(the_graph);

#ifdef PADDLE_WITH_MKLDNN
if (argument->Has("calibration_file_path")) {
VLOG(5) << "Calibration file path of quantize model: "
<< argument->calibration_file_path();
std::unordered_map<std::string, std::vector<float>> var_quant_scales{};
ReadCalibrationInfo(argument, &var_quant_scales);
// save var_quant_scales in the first op's attr
// for quant_dequant_mkldnn_pass
SaveInfoInTheFirstOp(
the_graph, "has_quant_info", "var_quant_scales", var_quant_scales);
}
#endif

// Apply passes.
IRPassManager the_ir_manager(argument);
graph = the_ir_manager.Apply(std::move(graph));
@@ -44,6 +61,40 @@ void IrAnalysisPass::RunImpl(Argument* argument) {
CollectFusionStatis(argument);
}

void IrAnalysisPass::ReadCalibrationInfo(
Argument* argument,
std::unordered_map<std::string, std::vector<float>>* var_quant_scales) {
std::string calibration_file_path;
#ifdef PADDLE_WITH_MKLDNN
if (argument->Has("calibration_file_path")) {
calibration_file_path = argument->calibration_file_path();
}
#endif
if (calibration_file_path.empty()) {
LOG(INFO) << "argument has no calibration_file_path";
return;
}
std::ifstream calibration_file(calibration_file_path);
std::string one_line;
while (getline(calibration_file, one_line)) {
if (one_line.find(" ") != one_line.npos) {
auto pos = one_line.find(" ");
std::string pre_str = one_line.substr(0, pos);
std::string pos_str = one_line.substr(pos);
if (pre_str.size() && pos_str.size()) {
std::string tensor_name = pre_str;
float scale = std::stod(pos_str);
scale = 1.0 / scale;
if (std::isinf(scale) || std::isnan(scale)) {
continue;
}
std::vector<float> scales = {scale};
(*var_quant_scales)[tensor_name] = scales;
}
}
}
}

void IrAnalysisPass::CollectFusionStatis(Argument* argument) {
if (!argument->main_graph().Has(framework::ir::kFuseStatisAttr)) {
LOG(INFO) << "argument has no fuse statis";
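ReadCalibrationInfo implies a simple line-oriented calibration file: each line is split at its first space into a tensor name and a calibration value, and the stored scale is the reciprocal of that value (lines whose reciprocal comes out inf or nan are skipped). A hypothetical file, with made-up tensor names, could look like:

conv2d_0.tmp_0 0.0156862
pool2d_1.tmp_0 0.0392157
fc_0.tmp_0 0.0078431

For the first line the pass would record var_quant_scales["conv2d_0.tmp_0"] = {1 / 0.0156862 ≈ 63.75}.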
4 changes: 4 additions & 0 deletions paddle/fluid/inference/analysis/passes/ir_analysis_pass.h
@@ -33,6 +33,10 @@ class IrAnalysisPass : public AnalysisPass {

void CollectFusionStatis(Argument* argument);

void ReadCalibrationInfo(
Argument* argument,
std::unordered_map<std::string, std::vector<float>>* var_quant_scales);

std::string repr() const override;
};

15 changes: 15 additions & 0 deletions paddle/fluid/inference/api/analysis_config.cc
@@ -246,6 +246,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(opt_cache_dir_);
CP_MEMBER(prog_file_);
CP_MEMBER(params_file_);
CP_MEMBER(calibration_file_path_);

CP_MEMBER(use_fc_padding_);
// GPU related.
@@ -509,6 +510,14 @@ void AnalysisConfig::EnableMkldnnInt8(
Update();
}

void AnalysisConfig::SetCalibrationFilePath(
const std::string &calibration_file_path) {
calibration_file_path_ = calibration_file_path;
VLOG(1) << "Set calibration file path of quantize model: " +
calibration_file_path_;
Update();
}

MkldnnQuantizerConfig *AnalysisConfig::mkldnn_quantizer_config() const {
PADDLE_ENFORCE_NOT_NULL(mkldnn_quantizer_config_,
platform::errors::PreconditionNotMet(
@@ -805,6 +814,8 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << prog_file_;
ss << params_file_;

ss << calibration_file_path_;

ss << use_gpu_;
ss << use_external_stream_;
ss << exec_stream_;
@@ -987,6 +998,10 @@ std::string AnalysisConfig::Summary() {
os.InsertRow({"model_file", prog_file_});
os.InsertRow({"params_file", params_file_});
}
if (!(calibration_file_path_.empty())) {
os.InsertRow({"calibration_file_path", calibration_file_path_});
}

if (model_from_memory_) {
os.InsertRow({"model_from_memory", params_file_});
}
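Taken together with PrepareArgument below, a hypothetical C++ caller would set the calibration path on the config before building the predictor. This is a usage sketch under assumptions: the model paths and calibration file name are illustrative, and the predictor-creation entry point shown is the paddle_infer one.

#include "paddle_inference_api.h"

int main() {
  paddle_infer::Config config;  // alias of AnalysisConfig
  config.SetModel("model_dir/model.pdmodel", "model_dir/model.pdiparams");
  config.EnableMKLDNN();
  // New in this PR: supply the calibration ranges file that
  // IrAnalysisPass::ReadCalibrationInfo parses into var_quant_scales.
  config.SetCalibrationFilePath("calibration_table.txt");
  auto predictor = paddle_infer::CreatePredictor(config);
  return predictor != nullptr ? 0 : 1;
}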
1 change: 1 addition & 0 deletions paddle/fluid/inference/api/analysis_predictor.cc
100644 → 100755
@@ -1186,6 +1186,7 @@ void AnalysisPredictor::PrepareArgument() {
argument_.SetQuantizeEnabledOpTypes(config_.quantize_enabled_op_types_);
argument_.SetQuantizeExcludedOpIds(config_.quantize_excluded_op_ids_);
argument_.SetQuantVarScales({});
argument_.SetCalibrationFilePath(config_.calibration_file_path_);
}
#endif

13 changes: 13 additions & 0 deletions paddle/fluid/inference/api/paddle_analysis_config.h
100644 → 100755
@@ -755,6 +755,18 @@ struct PD_INFER_DECL AnalysisConfig {
///
void EnableMkldnnQuantizer();

///
/// \brief Set the calibration ranges file path of the quantized model.
///
void SetCalibrationFilePath(const std::string& calibration_file_path = "");

///
/// \brief Return the calibration ranges file path of the quantized model.
///
std::string CalibrationFilePath() { return calibration_file_path_; }

///
/// \brief Turn on MKLDNN int8.
///
@@ -933,6 +945,7 @@ struct PD_INFER_DECL AnalysisConfig {
std::string model_dir_;
mutable std::string prog_file_;
mutable std::string params_file_;
mutable std::string calibration_file_path_;

// Mixed precision.
std::unordered_set<std::string> mixed_black_list_;
3 changes: 3 additions & 0 deletions paddle/fluid/pybind/inference_api.cc
100644 → 100755
@@ -750,6 +750,9 @@ void BindAnalysisConfig(py::module *m) {
.def("to_native_config", &AnalysisConfig::ToNativeConfig)
.def("enable_quantizer", &AnalysisConfig::EnableMkldnnQuantizer)
.def("enable_mkldnn_bfloat16", &AnalysisConfig::EnableMkldnnBfloat16)
.def("set_calibration_file_path",
&AnalysisConfig::SetCalibrationFilePath,
py::arg("calibration_file_path") = std::string(""))
#ifdef PADDLE_WITH_MKLDNN
.def("quantizer_config",
&AnalysisConfig::mkldnn_quantizer_config,
Empty file modified python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
100644 → 100755
10 changes: 10 additions & 0 deletions python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt
100644 → 100755
@@ -4,9 +4,19 @@ file(
"test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")

list(REMOVE_ITEM TEST_OPS "test_onnx_format_quantization_mobilenetv1")

if(WITH_MKLDNN AND NOT WIN32)
list(APPEND TEST_OPS "test_onnx_format_quantization_mobilenetv1")
endif()

foreach(TEST_OP ${TEST_OPS})
py_test_modules(${TEST_OP} MODULES ${TEST_OP})
endforeach()
set_tests_properties(test_concat_mkldnn_op PROPERTIES TIMEOUT 120)
set_tests_properties(test_conv3d_mkldnn_op PROPERTIES TIMEOUT 120)
if(WITH_MKLDNN AND NOT WIN32)
set_tests_properties(test_onnx_format_quantization_mobilenetv1
PROPERTIES TIMEOUT 300)
endif()
set_tests_properties(test_flags_mkldnn_ops_on_off PROPERTIES TIMEOUT 120)