New format quant model support for MKLDNN (#45416)
* support onnx format quantized model

* update code

* add test

* add test

* fix

* fix test

* fix cmake

* update code

* change scale file path to calibration file path

* update code

* update code

* fix build bug

* fix build bugs

* fix

* fix
yeliang2258 authored Sep 5, 2022
1 parent fd56f08 commit 4e4f458
Showing 12 changed files with 889 additions and 57 deletions.
303 changes: 250 additions & 53 deletions paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc

Large diffs are not rendered by default.

51 changes: 47 additions & 4 deletions paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h
@@ -43,13 +43,34 @@ class QuantDequantMkldnnPass : public FusePassBase {
std::unordered_map<std::string, std::vector<float>>* weight_thresholds)
const;

void CollectInputScalesFromFake(
///
/// \brief Collect weight scale info from ONNX-format dequantize_linear ops
/// weight_thresholds: scale info for weights
/// var_quant_scales: scale info for activations
/// onnx_format_quantize_model: records whether the quantized model is an
/// ONNX-format quantized model
///
void CollectWeightScalesInfoFromONNXFormatDequantize(
ir::Graph* graph,
Scope* scope,
std::unordered_map<std::string, std::vector<float>>* weight_thresholds,
std::unordered_map<std::string, std::vector<float>>* var_quant_scales,
bool* onnx_format_quantize_model) const;

void CollectInputScalesFromQuantize(
ir::Graph* graph,
Scope* scope,
const std::unordered_set<std::string>& fake_quantize_types,
std::unordered_map<std::string, std::vector<float>>* var_quant_scales)
const;

void ConvertFromINT8ToFP32(const std::vector<float>& scales,
Tensor* weight_tensor,
int8_t* int8_weight_data,
float* fp32_weight_data,
const std::string& weight_var_name) const;

void CollectOutputScalesFromAttr(
ir::Graph* graph,
std::unordered_map<std::string, std::vector<float>>* var_quant_scales)
@@ -64,12 +85,22 @@ class QuantDequantMkldnnPass : public FusePassBase {
Node* op_node,
std::unordered_set<const Node*>* nodes2rm) const;

///
/// \brief Collect all ONNX-format quantize-related ops to remove
/// nodes2rm: records all quantize-related nodes to be removed
///
void CollectQuantizeDequantizeOpsFromONNXFormat(
ir::Graph* graph,
Node* op_node,
std::unordered_set<const Node*>* nodes2rm) const;

void RemoveFakeOps(
ir::Graph* graph,
const std::unordered_set<std::string>& fake_quantize_types,
const std::unordered_set<std::string>& fake_dequantize_types,
const std::unordered_set<std::string>& fake_quantize_dequantize_types)
const;
const std::unordered_set<std::string>& fake_quantize_dequantize_types,
const std::unordered_set<std::string>&
onnx_format_quantize_dequantize_types) const;

bool IsInt8Weight(Node* op_node,
Scope* scope,
@@ -85,11 +116,23 @@ class QuantDequantMkldnnPass : public FusePassBase {
const std::unordered_map<std::string, std::vector<float>>&
weight_thresholds) const;

///
/// \brief Dequantize weights in conv or matmul ops
/// weight_thresholds: recorded scale info for weights
///
void DequantizeOpWeightsFromONNXFormat(
Node* op_node,
Scope* scope,
const std::string& weight_name,
const std::unordered_map<std::string, std::vector<float>>&
weight_thresholds) const;

void DequantizeWeights(
ir::Graph* graph,
Scope* scope,
const std::unordered_map<std::string, std::vector<float>>&
weight_thresholds) const;
weight_thresholds,
const bool& onnx_format_quantize_model) const;

void UpdateActivations(ir::Graph* graph) const;

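The new ConvertFromINT8ToFP32 and DequantizeOpWeightsFromONNXFormat entry points restore FP32 weights from the stored INT8 data using the collected scales. As a minimal standalone sketch of the underlying arithmetic — not the pass's actual implementation — assuming the common symmetric convention fp32 ≈ int8 × scale, a hypothetical helper name, and a per-output-channel scale layout:

#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical illustration of INT8 -> FP32 weight dequantization.
// `scales` holds either one scale per output channel or a single
// per-tensor scale; `channel_stride` is the number of weight elements
// per output channel (e.g. in_c * kh * kw for a conv filter).
std::vector<float> DequantizeInt8Weights(const std::vector<int8_t>& int8_data,
                                         const std::vector<float>& scales,
                                         std::size_t channel_stride) {
  std::vector<float> fp32_data(int8_data.size());
  for (std::size_t i = 0; i < int8_data.size(); ++i) {
    const float scale =
        scales.size() == 1 ? scales[0] : scales[i / channel_stride];
    fp32_data[i] = static_cast<float>(int8_data[i]) * scale;
  }
  return fp32_data;
}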
3 changes: 3 additions & 0 deletions paddle/fluid/inference/analysis/argument.h
100644 → 100755
@@ -177,6 +177,9 @@ struct Argument {
DECL_ARGUMENT_FIELD(mkldnn_cache_capacity, MkldnnCacheCapacity, int);

#ifdef PADDLE_WITH_MKLDNN
// Calibration file path of the quantized model
DECL_ARGUMENT_FIELD(calibration_file_path, CalibrationFilePath, std::string);

// A set of op types to enable their quantized kernels
DECL_ARGUMENT_FIELD(quantize_enabled_op_types,
QuantizeEnabledOpTypes,
51 changes: 51 additions & 0 deletions paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc
@@ -20,6 +20,10 @@
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/inference/analysis/ir_pass_manager.h"

#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h"
#endif

namespace paddle {
namespace inference {
namespace analysis {
@@ -32,6 +36,19 @@ void IrAnalysisPass::RunImpl(Argument* argument) {
auto* the_graph = argument->ReleaseMainGraph();
auto graph = std::unique_ptr<Graph>(the_graph);

#ifdef PADDLE_WITH_MKLDNN
if (argument->Has("calibration_file_path")) {
VLOG(5) << "Calibration file path of quantize model: "
<< argument->calibration_file_path();
std::unordered_map<std::string, std::vector<float>> var_quant_scales{};
ReadCalibrationInfo(argument, &var_quant_scales);
// save var_quant_scales in the first op's attr
// for quant_dequant_mkldnn_pass
SaveInfoInTheFirstOp(
the_graph, "has_quant_info", "var_quant_scales", var_quant_scales);
}
#endif

// Apply passes.
IRPassManager the_ir_manager(argument);
graph = the_ir_manager.Apply(std::move(graph));
@@ -44,6 +61,40 @@ void IrAnalysisPass::RunImpl(Argument* argument) {
CollectFusionStatis(argument);
}

void IrAnalysisPass::ReadCalibrationInfo(
Argument* argument,
std::unordered_map<std::string, std::vector<float>>* var_quant_scales) {
std::string calibration_file_path;
#ifdef PADDLE_WITH_MKLDNN
if (argument->Has("calibration_file_path")) {
calibration_file_path = argument->calibration_file_path();
}
#endif
if (calibration_file_path.empty()) {
LOG(INFO) << "argument has no calibration_file_path";
return;
}
std::ifstream calibration_file(calibration_file_path);
std::string one_line;
while (getline(calibration_file, one_line)) {
if (one_line.find(" ") != one_line.npos) {
auto pos = one_line.find(" ");
std::string pre_str = one_line.substr(0, pos);
std::string pos_str = one_line.substr(pos);
if (pre_str.size() && pos_str.size()) {
std::string tensor_name = pre_str;
float scale = std::stod(pos_str);
scale = 1.0 / scale;
if (std::isinf(scale) || std::isnan(scale)) {
continue;
}
std::vector<float> scales = {scale};
(*var_quant_scales)[tensor_name] = scales;
}
}
}
}

void IrAnalysisPass::CollectFusionStatis(Argument* argument) {
if (!argument->main_graph().Has(framework::ir::kFuseStatisAttr)) {
LOG(INFO) << "argument has no fuse statis";
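For reference, ReadCalibrationInfo parses a plain-text file with one record per line: a tensor name, a space, and the scale value recorded during calibration. It stores the reciprocal (1.0 / value) in var_quant_scales, skipping any entry whose reciprocal comes out inf or NaN, and ignoring lines without a space. A file this parser would accept might look like the following (tensor names and values are illustrative only):

conv2d_1.tmp_0 0.018375
depthwise_conv2d_3.tmp_0 0.052113
fc_0.tmp_1 0.009842

For the first line, the pass records var_quant_scales["conv2d_1.tmp_0"] = {1.0f / 0.018375f}.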
4 changes: 4 additions & 0 deletions paddle/fluid/inference/analysis/passes/ir_analysis_pass.h
@@ -33,6 +33,10 @@ class IrAnalysisPass : public AnalysisPass {

void CollectFusionStatis(Argument* argument);

void ReadCalibrationInfo(
Argument* argument,
std::unordered_map<std::string, std::vector<float>>* var_quant_scales);

std::string repr() const override;
};

15 changes: 15 additions & 0 deletions paddle/fluid/inference/api/analysis_config.cc
@@ -246,6 +246,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(opt_cache_dir_);
CP_MEMBER(prog_file_);
CP_MEMBER(params_file_);
CP_MEMBER(calibration_file_path_);

CP_MEMBER(use_fc_padding_);
// GPU related.
@@ -516,6 +517,14 @@ void AnalysisConfig::EnableMkldnnInt8(
Update();
}

void AnalysisConfig::SetCalibrationFilePath(
const std::string &calibration_file_path) {
calibration_file_path_ = calibration_file_path;
VLOG(1) << "Set calibration file path of quantize model: " +
calibration_file_path_;
Update();
}

MkldnnQuantizerConfig *AnalysisConfig::mkldnn_quantizer_config() const {
PADDLE_ENFORCE_NOT_NULL(mkldnn_quantizer_config_,
platform::errors::PreconditionNotMet(
@@ -827,6 +836,8 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << prog_file_;
ss << params_file_;

ss << calibration_file_path_;

ss << use_gpu_;
ss << use_external_stream_;
ss << exec_stream_;
@@ -1009,6 +1020,10 @@ std::string AnalysisConfig::Summary() {
os.InsertRow({"model_file", prog_file_});
os.InsertRow({"params_file", params_file_});
}
if (!(calibration_file_path_.empty())) {
os.InsertRow({"calibration_file_path", calibration_file_path_});
}

if (model_from_memory_) {
os.InsertRow({"model_from_memory", params_file_});
}
1 change: 1 addition & 0 deletions paddle/fluid/inference/api/analysis_predictor.cc
100644 → 100755
@@ -1194,6 +1194,7 @@ void AnalysisPredictor::PrepareArgument() {
argument_.SetQuantizeEnabledOpTypes(config_.quantize_enabled_op_types_);
argument_.SetQuantizeExcludedOpIds(config_.quantize_excluded_op_ids_);
argument_.SetQuantVarScales({});
argument_.SetCalibrationFilePath(config_.calibration_file_path_);
}
#endif

13 changes: 13 additions & 0 deletions paddle/fluid/inference/api/paddle_analysis_config.h
100644 → 100755
@@ -763,6 +763,18 @@ struct PD_INFER_DECL AnalysisConfig {
///
void EnableMkldnnQuantizer();

///
/// \brief Set the path of the calibration file for the quantized model.
///
void SetCalibrationFilePath(const std::string& calibration_file_path = "");

///
/// \brief Return the path of the calibration file for the quantized model.
///
std::string CalibrationFilePath() { return calibration_file_path_; }

///
/// \brief Turn on MKLDNN int8.
///
@@ -941,6 +953,7 @@ struct PD_INFER_DECL AnalysisConfig {
std::string model_dir_;
mutable std::string prog_file_;
mutable std::string params_file_;
mutable std::string calibration_file_path_;

// Mixed precision.
std::unordered_set<std::string> mixed_black_list_;
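Taken together, pointing the predictor at a calibration file is a one-line addition to the usual config setup. A minimal usage sketch, assuming hypothetical model and calibration file paths (SetModel and EnableMKLDNN are pre-existing AnalysisConfig methods):

#include "paddle/fluid/inference/api/paddle_analysis_config.h"

int main() {
  paddle::AnalysisConfig config;
  // Hypothetical paths; substitute your quantized model files.
  config.SetModel("inference.pdmodel", "inference.pdiparams");
  config.EnableMKLDNN();
  // New in this commit: hand the calibration scales to the analysis
  // passes, which stash them in the first op's attributes for
  // quant_dequant_mkldnn_pass to consume.
  config.SetCalibrationFilePath("calibration_table.txt");
  return 0;
}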
3 changes: 3 additions & 0 deletions paddle/fluid/pybind/inference_api.cc
100644 → 100755
@@ -759,6 +759,9 @@ void BindAnalysisConfig(py::module *m) {
.def("to_native_config", &AnalysisConfig::ToNativeConfig)
.def("enable_quantizer", &AnalysisConfig::EnableMkldnnQuantizer)
.def("enable_mkldnn_bfloat16", &AnalysisConfig::EnableMkldnnBfloat16)
.def("set_calibration_file_path",
&AnalysisConfig::SetCalibrationFilePath,
py::arg("calibration_file_path") = std::string(""))
#ifdef PADDLE_WITH_MKLDNN
.def("quantizer_config",
&AnalysisConfig::mkldnn_quantizer_config,
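On the Python side, this binding exposes the same setter as config.set_calibration_file_path("calibration_table.txt") (path illustrative), with an empty-string default mirroring the C++ signature.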
Empty file modified python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
100644 → 100755
10 changes: 10 additions & 0 deletions python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt
100644 → 100755
@@ -4,9 +4,19 @@ file(
"test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")

list(REMOVE_ITEM TEST_OPS "test_onnx_format_quantization_mobilenetv1")

if(WITH_MKLDNN AND NOT WIN32)
list(APPEND TEST_OPS "test_onnx_format_quantization_mobilenetv1")
endif()

foreach(TEST_OP ${TEST_OPS})
py_test_modules(${TEST_OP} MODULES ${TEST_OP})
endforeach()
set_tests_properties(test_concat_mkldnn_op PROPERTIES TIMEOUT 120)
set_tests_properties(test_conv3d_mkldnn_op PROPERTIES TIMEOUT 120)
if(WITH_MKLDNN AND NOT WIN32)
set_tests_properties(test_onnx_format_quantization_mobilenetv1
PROPERTIES TIMEOUT 300)
endif()
set_tests_properties(test_flags_mkldnn_ops_on_off PROPERTIES TIMEOUT 120)
