QDQ tool modification part2 #9720

Merged · 9 commits · Nov 30, 2021
Changes from 4 commits
@@ -19,4 +19,7 @@ def quantize(self):
nodes_to_iterate = itertools.chain(node.input, node.output)

for tensor_name in nodes_to_iterate:
self.quantizer.quantize_tensor(tensor_name)
if self.quantizer.is_per_channel():
self.quantizer.quantize_tensor_per_channel(tensor_name, self.quantizer.qdq_channel_axis)
Member: what if the operator doesn't support per-channel?

Member: The CPU EP only supports per-channel for Conv and MatMul now.

Contributor (author) @chilo-ms, Nov 18, 2021: Added a new extra option, OpTypesSupportPerChannelQuantization.

else:
self.quantizer.quantize_tensor(tensor_name)
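
The review above mentions an OpTypesSupportPerChannelQuantization extra option for operators that cannot be quantized per-channel. Below is a minimal, hypothetical sketch of how such a gate could wrap the dispatch shown in this hunk; the allow-list and the helper function are assumptions, not part of the PR.

# Hypothetical sketch only: op types allowed to use per-channel quantization;
# everything else falls back to per-tensor quantize_tensor().
OP_TYPES_SUPPORT_PER_CHANNEL = {"Conv", "MatMul"}  # assumed allow-list (per the CPU EP comment above)

def quantize_tensor_for_node(quantizer, node, tensor_name):
    # quantizer is expected to expose the same methods used in the diff above.
    if quantizer.is_per_channel() and node.op_type in OP_TYPES_SUPPORT_PER_CHANNEL:
        quantizer.quantize_tensor_per_channel(tensor_name, quantizer.qdq_channel_axis)
    else:
        quantizer.quantize_tensor(tensor_name)
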
113 changes: 92 additions & 21 deletions onnxruntime/python/tools/quantization/qdq_quantizer.py
@@ -51,6 +51,28 @@ def __init__(self, model, per_channel, reduce_range, mode, static, weight_qType,
self.add_qdq_pair_to_weight = False if 'AddQDQPairToWeight' not in extra_options \
else extra_options['AddQDQPairToWeight']

# The default behavior is that multiple nodes can share a QDQ pair as their inputs.
# In TRT, a QDQ pair can't be shared between nodes, so this option creates a dedicated QDQ pair for each node.
self.dedicated_qdq_pair = False if 'DedicatedQDQPair' not in extra_options else extra_options['DedicatedQDQPair']
if self.dedicated_qdq_pair:
self.tensor_to_its_receiving_nodes = {}

# Channel axis when per_channel is True
self.qdq_channel_axis = 0 if 'QDQChannelAxis' not in extra_options else extra_options['QDQChannelAxis']

# In TRT, it is recommended to add a QDQ pair to the inputs of an Add node that is followed by a ReduceMean node.
# If True and 'Add' is in op_types_to_quantize, Add nodes that don't meet the requirement above won't get a QDQ pair.
if "Add" in self.op_types_to_quantize:
self.add_qdq_to_add_node_followed_by_redeuce_mean_node = False if 'AddQDQToAddNodeFollowedByReduceMeanNode' not in extra_options \
else extra_options['AddQDQToAddNodeFollowedByReduceMeanNode']
else:
self.add_qdq_to_add_node_followed_by_redeuce_mean_node = False

if self.add_qdq_to_add_node_followed_by_redeuce_mean_node:
self.reduce_mean_nodes = []
self.add_nodes = []
self.add_nodes_to_quantize = []

def quantize_tensor(self, tensor_name):
weight = find_by_name(tensor_name, self.model.initializer())
if weight is not None:
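
For reference, here is a sketch of the extra_options dictionary that the constructor above consumes. Every key is optional and the values below are only illustrative, not recommendations.

# Illustrative extra_options for QDQQuantizer; keys omitted fall back to the
# defaults handled in __init__ above.
extra_options = {
    "AddQDQPairToWeight": False,                      # wrap weights in a QDQ pair instead of pre-quantizing them
    "DedicatedQDQPair": True,                         # one QDQ pair per consuming node (TRT-style)
    "QDQChannelAxis": 0,                              # channel axis used when per_channel is True
    "AddQDQToAddNodeFollowedByReduceMeanNode": True,  # only honored when "Add" is in op_types_to_quantize
}
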
@@ -90,7 +112,31 @@ def remove_node(self, node):
def remove_nodes(self):
self.model.remove_nodes(self.nodes_to_remove)

def pre_quantization_setup(self):
if self.add_qdq_to_add_node_followed_by_redeuce_mean_node:
for node in self.model.nodes():
if node.op_type == "Add":
self.add_nodes.append(node)
if node.op_type == "ReduceMean":
self.reduce_mean_nodes.append(node)

for add_node in self.add_nodes:
for reduce_mean_node in self.reduce_mean_nodes:
if add_node.output == reduce_mean_node.input:
self.add_nodes_to_quantize.append(add_node.name)
if add_node.name not in self.add_nodes_to_quantize:
self.nodes_to_exclude.append(add_node.name)

if self.dedicated_qdq_pair:
for node in self.model.nodes():
if self.should_quantize(node):
for tensor_name in node.input:
if tensor_name not in self.tensor_to_its_receiving_nodes:
self.tensor_to_its_receiving_nodes[tensor_name] = []
self.tensor_to_its_receiving_nodes[tensor_name].append(node)

def quantize_model(self):
self.pre_quantization_setup()
for node in self.model.nodes():
if self.should_quantize(node):
op_quantizer = CreateQDQQuantizer(self, node)
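
To make the bookkeeping above concrete, here is a small self-contained sketch (a toy node list rather than the PR's model API) of the tensor-to-consumers index that pre_quantization_setup builds when DedicatedQDQPair is enabled:

from collections import defaultdict

# Toy graph mirroring the fan-out case the option targets: tensor "T" feeds three MatMuls.
toy_nodes = [
    ("Add",     ["L", "P"]),
    ("MatMul1", ["T", "Q"]),
    ("MatMul2", ["T", "R"]),
    ("MatMul3", ["T", "S"]),
]

tensor_to_receivers = defaultdict(list)
for name, inputs in toy_nodes:
    for tensor in inputs:
        tensor_to_receivers[tensor].append(name)

print(tensor_to_receivers["T"])  # ['MatMul1', 'MatMul2', 'MatMul3'] -> three dedicated QDQ pairs
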
@@ -156,30 +202,55 @@ def quantize_tensors(self):
"In static mode quantization params for inputs and outputs of nodes to be quantized are required."
.format(tensor_name))

q_input = tensor_name
q_output = tensor_name + "_QuantizeLinear"
dq_input = q_output
dq_output = tensor_name + "_DequantizeLinear"
if self.model.is_graph_output(tensor_name):
q_input = tensor_name + "_QuantizeLinearInput"
dq_output = tensor_name
self.model.replace_output_of_all_nodes(tensor_name, q_input)
if self.dedicated_qdq_pair and tensor_name in self.tensor_to_its_receiving_nodes and len(self.tensor_to_its_receiving_nodes[tensor_name]) > 1:
num_dedicated_qdq_pair = len(self.tensor_to_its_receiving_nodes[tensor_name])
for i in range(num_dedicated_qdq_pair):
postfix = str(i+1)
q_input = tensor_name
q_output = tensor_name + "_QuantizeLinear_" + postfix
dq_input = q_output
dq_output = tensor_name + "_DequantizeLinear_" + postfix
quant_node_name = tensor_name + "_QuantizeLinear_" + postfix
dequant_node_name = tensor_name + "_DequantizeLinear_" + postfix
qlinear_node = onnx.helper.make_node("QuantizeLinear", [q_input, scale_name, zp_name],
[q_output], quant_node_name)
dequant_node = onnx.helper.make_node("DequantizeLinear",
[dq_input, scale_name, zp_name],
[dq_output],
dequant_node_name)
self.model.add_nodes([qlinear_node, dequant_node])

node = self.tensor_to_its_receiving_nodes[tensor_name][i]
self.model.replace_node_input(node, tensor_name, dq_output)

quantized_value = QuantizedValue(tensor_name, dq_output, scale_name, zp_name,
QuantizedValueType.Input)
self.quantized_value_map[tensor_name] = quantized_value
else:
self.model.replace_input_of_all_nodes(tensor_name, dq_output)
q_input = tensor_name
q_output = tensor_name + "_QuantizeLinear"
dq_input = q_output
dq_output = tensor_name + "_DequantizeLinear"
if self.model.is_graph_output(tensor_name):
q_input = tensor_name + "_QuantizeLinearInput"
dq_output = tensor_name
self.model.replace_output_of_all_nodes(tensor_name, q_input)
else:
self.model.replace_input_of_all_nodes(tensor_name, dq_output)

quant_node_name = tensor_name + "_QuantizeLinear"
dequant_node_name = tensor_name + "_DequantizeLinear"
qlinear_node = onnx.helper.make_node("QuantizeLinear", [q_input, scale_name, zp_name],
[q_output], quant_node_name)
dequant_node = onnx.helper.make_node("DequantizeLinear",
[dq_input, scale_name, zp_name],
[dq_output],
dequant_node_name)
self.model.add_nodes([qlinear_node, dequant_node])
quant_node_name = tensor_name + "_QuantizeLinear"
dequant_node_name = tensor_name + "_DequantizeLinear"
qlinear_node = onnx.helper.make_node("QuantizeLinear", [q_input, scale_name, zp_name],
[q_output], quant_node_name)
dequant_node = onnx.helper.make_node("DequantizeLinear",
[dq_input, scale_name, zp_name],
[dq_output],
dequant_node_name)
self.model.add_nodes([qlinear_node, dequant_node])

quantized_value = QuantizedValue(tensor_name, dq_output, scale_name, zp_name,
QuantizedValueType.Input)
self.quantized_value_map[tensor_name] = quantized_value
quantized_value = QuantizedValue(tensor_name, dq_output, scale_name, zp_name,
QuantizedValueType.Input)
self.quantized_value_map[tensor_name] = quantized_value

def quantize_bias_tensors(self):
for bias_name, input_name, weight_name in self.bias_to_quantize:
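
As a rough illustration of the naming scheme produced by the DedicatedQDQPair branch above, the sketch below builds per-consumer Q/DQ pairs for one tensor. The tensor, scale, and zero-point names are placeholders; rewiring each consumer is done in the PR via replace_node_input.

from onnx import helper

tensor_name, scale_name, zp_name = "T", "T_scale", "T_zero_point"  # placeholder names
new_nodes = []
for i in range(3):  # assume three consuming nodes
    postfix = str(i + 1)
    q_output = tensor_name + "_QuantizeLinear_" + postfix
    dq_output = tensor_name + "_DequantizeLinear_" + postfix
    new_nodes.append(helper.make_node("QuantizeLinear", [tensor_name, scale_name, zp_name],
                                      [q_output], tensor_name + "_QuantizeLinear_" + postfix))
    new_nodes.append(helper.make_node("DequantizeLinear", [q_output, scale_name, zp_name],
                                      [dq_output], tensor_name + "_DequantizeLinear_" + postfix))
    # Consumer i would then be rewired to read dq_output instead of tensor_name.
print([n.name for n in new_nodes])
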
5 changes: 5 additions & 0 deletions onnxruntime/python/tools/quantization/quantize.py
@@ -194,6 +194,11 @@ def quantize_static(model_input,
inserts both QuantizeLinear/DeQuantizeLinear nodes to weight.
OpTypesToExcludeOutputQuantizatioin = list of op type : Default is []. If any op type is specified, it won't quantize
the output of ops with this specific op types.
Member: As TRT has so many of these options, I'm thinking we may need a config file for different execution providers.

DedicatedQDQPair = True/False : Default is False. By default, multiple nodes can share a single QDQ pair as their inputs.
If True, an identical but dedicated QDQ pair is created for each node.
AddQDQToAddNodeFollowedByReduceMeanNode = True/False : Default is False. By default, QDQ pairs are added to every Add node if the Add op type is in op_types_to_quantize.
If True, only Add nodes followed by a ReduceMean node get a QDQ pair.
Member: This is too specific.

Member: We have a general way, nodes_to_exclude. Why not use it?

Contributor (author): Actually I'm using nodes_to_exclude in qdq_quantizer.py#128. Agreed it's too specific; I'm removing this feature and moving the logic to the e2e example to derive nodes_to_exclude.

QDQChannelAxis = Integer : Default is 0. Channel axis for QDQ pair when per_channel is True.
Member (on QDQChannelAxis): I don't think all the operators can share the same per-channel axis.

Member: Which operators in TRT support per-channel?

Contributor: In TRT, we pretty much only use per-channel for weights. All activations use per-tensor.

'''

mode = QuantizationMode.QLinearOps
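
A hedged end-to-end sketch of how these new options might be passed through quantize_static: the model paths, input name, and shape are placeholders, and the keyword arguments follow the quantize_static signature as of this PR, so treat this as an illustration rather than a verified recipe.

import numpy as np
from onnxruntime.quantization import CalibrationDataReader, QuantFormat, QuantType, quantize_static

class DummyDataReader(CalibrationDataReader):
    """Placeholder calibration reader; replace with real input feeds."""
    def __init__(self, feeds):
        self._iter = iter(feeds)
    def get_next(self):
        return next(self._iter, None)

reader = DummyDataReader([{"input": np.zeros((1, 3, 224, 224), dtype=np.float32)}])
quantize_static("model.onnx", "model_qdq.onnx", reader,
                quant_format=QuantFormat.QDQ,
                per_channel=True,
                op_types_to_quantize=["Conv", "MatMul", "Add"],
                activation_type=QuantType.QInt8,
                weight_type=QuantType.QInt8,
                extra_options={"DedicatedQDQPair": True, "QDQChannelAxis": 0})
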
173 changes: 172 additions & 1 deletion onnxruntime/test/python/quantization/test_qdq.py
@@ -10,7 +10,7 @@
import onnx
import numpy as np
from onnx import helper, TensorProto
from onnxruntime.quantization import quantize_static, QuantType, QuantFormat
from onnxruntime.quantization import quantize_static, QuantType, QuantFormat, QuantizationMode, QDQQuantizer
from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_op_type_order

class TestQDQFormat(unittest.TestCase):
@@ -24,6 +24,177 @@ def input_feeds(self, n, name2shape):
dr = TestDataFeeds(input_data_list)
return dr

class TestQDQExtraOptions(unittest.TestCase):
def test_qdq_extra_options(self):
# (input)
# |
# Add
# |
# ReduceMean
# |
# Add
# |
# (output)

initializers = []

input_tensor = helper.make_tensor_value_info('L', TensorProto.FLOAT, [5, 5])
output_tensor = helper.make_tensor_value_info('O', TensorProto.FLOAT, [5, 5])

add_weight_data_1 = np.random.normal(0, 0.1, [5, 5]).astype(np.float32)
initializers.append(onnx.numpy_helper.from_array(add_weight_data_1, name="M"))
add_weight_data_2 = np.random.normal(0, 0.1, [5, 5]).astype(np.float32)
initializers.append(onnx.numpy_helper.from_array(add_weight_data_2, name="N"))

add_node_1 = onnx.helper.make_node('Add', ['L', 'M'], ['P'], name='Add1')
reduce_mean_node = onnx.helper.make_node('ReduceMean', ['P'], ['Q'], keepdims=1, name='ReduceMean')
add_node_2 = onnx.helper.make_node('Add', ['Q', 'N'], ['O'], name='Add2')

graph = helper.make_graph([add_node_1, reduce_mean_node, add_node_2], 'QDQ_Test_Finetune', [input_tensor], [output_tensor], initializer=initializers)
model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
test_model_path = './test_qdq_finetune.onnx'
onnx.save(model, test_model_path)

compute_range = {
'P': [0.1, 0.1],
'Q': [0.1, 0.1],
'M': [0.1, 0.1],
'N': [0.1, 0.1],
'L': [0.1, 0.1],
'O': [0.1, 0.1],
}

op_types_to_quantize = ['Add']

mode = QuantizationMode.QLinearOps
model = onnx.load_model(test_model_path, False)
quantizer = QDQQuantizer(
model,
True, #per_channel
False, #reduce_range
mode,
True, #static
QuantType.QInt8, #weight_type
QuantType.QInt8, #activation_type
compute_range,
[], #nodes_to_quantize
[], #nodes_to_exclude
op_types_to_quantize,
{'ActivationSymmetric' : True, 'AddQDQPairToWeight' : True, 'AddQDQToAddNodeFollowedByReduceMeanNode': True, 'OpTypesToExcludeOutputQuantizatioin': []}) #extra_options
quantizer.quantize_model()
qdq_model_path = './test_qdq_finetune_qdq.onnx'
quantizer.model.save_model_to_file(qdq_model_path, False)

# QDQ pair should be added to Add1 but not Add2
# QDQ pair should be added to Add1's output as well.
qdq_added_to_node_output_flag = False
for node in quantizer.model.nodes():
if node.name == 'Add1':
for input in node.input:
self.assertTrue("DequantizeLinear" in input)
for output in node.output:
self.assertTrue("QuantizeLinear" not in output)

if node.name == 'Add2':
for input in node.input:
self.assertTrue("DequantizeLinear" not in input)
for output in node.output:
self.assertTrue("QuantizeLinear" not in output)

# This QuantizeLinear node should consume Add1's output (P)
if node.name == 'P_QuantizeLinear':
qdq_added_to_node_output_flag = True
self.assertTrue(node.input[0] == 'P')

self.assertTrue(qdq_added_to_node_output_flag)


def test_qdq_extra_options_2(self):
# (input)
# |
# Add
# / | \
# MatMul MatMul MatMul
# | | |
# (output)(output)(output)

initializers = []

input_tensor = helper.make_tensor_value_info('L', TensorProto.FLOAT, [5, 5])
output_tensor1 = helper.make_tensor_value_info('M', TensorProto.FLOAT, [5, 5])
output_tensor2 = helper.make_tensor_value_info('N', TensorProto.FLOAT, [5, 5])
output_tensor3 = helper.make_tensor_value_info('O', TensorProto.FLOAT, [5, 5])

add_weight_data = np.random.normal(0, 0.1, [5, 5]).astype(np.float32)
initializers.append(onnx.numpy_helper.from_array(add_weight_data, name="P"))
matmul_weight_data_1 = np.random.normal(0, 0.1, [5, 5]).astype(np.float32)
initializers.append(onnx.numpy_helper.from_array(matmul_weight_data_1, name="Q"))
matmul_weight_data_2 = np.random.normal(0, 0.1, [5, 5]).astype(np.float32)
initializers.append(onnx.numpy_helper.from_array(matmul_weight_data_2, name="R"))
matmul_weight_data_3 = np.random.normal(0, 0.1, [5, 5]).astype(np.float32)
initializers.append(onnx.numpy_helper.from_array(matmul_weight_data_2, name="S"))

add_node = onnx.helper.make_node('Add', ['L', 'P'], ['T'], name='Add')
matmul_node_1 = onnx.helper.make_node('MatMul', ['T', 'Q'], ['M'], name='MatMul1')
matmul_node_2 = onnx.helper.make_node('MatMul', ['T', 'R'], ['N'], name='MatMul2')
matmul_node_3 = onnx.helper.make_node('MatMul', ['T', 'S'], ['O'], name='MatMul3')

graph = helper.make_graph([add_node, matmul_node_1, matmul_node_2, matmul_node_3], 'QDQ_Test_Finetune_2', [input_tensor], [output_tensor1, output_tensor2, output_tensor3], initializer=initializers)
model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
test_model_path = './test_qdq_finetune_2.onnx'
onnx.save(model, test_model_path)

compute_range = {
'L': [0.1, 0.1],
'M': [0.1, 0.1],
'N': [0.1, 0.1],
'O': [0.1, 0.1],
'P': [0.1, 0.1],
'Q': [0.1, 0.1],
'R': [0.1, 0.1],
'S': [0.1, 0.1],
'T': [0.1, 0.1],
}

op_types_to_quantize = ['Add', 'MatMul']

mode = QuantizationMode.QLinearOps
model = onnx.load_model(test_model_path, False)
quantizer = QDQQuantizer(
model,
True, #per_channel
False, #reduce_range
mode,
True, #static
QuantType.QInt8, #weight_type
QuantType.QInt8, #activation_type
compute_range,
[], #nodes_to_quantize
[], #nodes_to_exclude
op_types_to_quantize,
{'ActivationSymmetric' : True, 'AddQDQPairToWeight' : True, 'AddQDQToAddNodeFollowedByReduceMeanNode': True, 'OpTypesToExcludeOutputQuantizatioin': op_types_to_quantize, 'DedicatedQDQPair': True}) #extra_options
quantizer.quantize_model()
qdq_model_path = './test_qdq_finetune_qdq_2.onnx'
quantizer.model.save_model_to_file(qdq_model_path, False)

# Three dedicated QDQ pairs should be generated and fed into each MatMul node
# Also QDQ pair should not be added to Add node
# QDQ pair should not be added to the node's output
for node in quantizer.model.nodes():
if node.name == 'MatMul1':
self.assertTrue("T_DequantizeLinear_1" in node.input)
if node.name == 'MatMul2':
self.assertTrue("T_DequantizeLinear_2" in node.input)
if node.name == 'MatMul3':
self.assertTrue("T_DequantizeLinear_3" in node.input)
if node.name == 'Add':
for input in node.input:
self.assertTrue("DequantizeLinear" not in input)

# QDQ pair should not be added to MatMul's output
if node.op_type == 'QuantizeLinear':
self.assertTrue(node.input[0] not in ['M_QuantizeLinearInput', 'N_QuantizeLinearInput', 'O_QuantizeLinearInput'])

class TestQDQFormatConv(TestQDQFormat):
def construct_model_conv(self, output_model_path, input_shape, weight_shape, output_shape, has_bias):
# (input)