
add QLinearMatMul #59

Merged 2 commits on Nov 29, 2018
29 changes: 27 additions & 2 deletions onnxruntime/contrib_ops/contrib_ops.cc
@@ -12,8 +12,8 @@
namespace onnxruntime {
namespace contrib {
using ::ONNX_NAMESPACE::AttributeProto;
-using ::ONNX_NAMESPACE::OpSchema;
using ::ONNX_NAMESPACE::OPTIONAL;
+using ::ONNX_NAMESPACE::OpSchema;

void RegisterContribSchemas() {
ONNX_CONTRIB_OPERATOR_SCHEMA(SampleOp)
@@ -135,6 +135,31 @@ The linear de-quantization operator. It consumes a quantized data, a scale, a zero point ...
The dequantization formula is y = (x - x_zero_point) * x_scale.
Scale and zero point must have same shape. They must be either scalar (per tensor) or 1-D tensor (per 'axis').)DOC");
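
To make the dequantization formula above concrete, here is a minimal sketch for a single element (illustrative only; DequantizeScalar is a hypothetical helper, not an onnxruntime API, and uint8 is just one of the allowed types):

#include <cstdint>

// Sketch of y = (x - x_zero_point) * x_scale for one uint8 element.
// Hypothetical helper, not part of onnxruntime.
float DequantizeScalar(uint8_t x, float x_scale, uint8_t x_zero_point) {
  return static_cast<float>(static_cast<int32_t>(x) - static_cast<int32_t>(x_zero_point)) * x_scale;
}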

ONNX_CONTRIB_OPERATOR_SCHEMA(QLinearMatMul)
Contributor (commenting on QLinearMatMul):
Do we need a GEMM version to support transpose as well?

Contributor Author:
Hmmm, I'm not sure. Should we just do the transpose in the graph?

Contributor:
It depends on what's offered by the BLAS libraries. If a BLAS library provides GEMM with transpose flags, then we should not block users from leveraging it at the op level.

.SetDomain(kMSDomain)
.SinceVersion(1)
.SetDoc(R"DOC(
Matrix product that behaves like numpy.matmul: https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html.
It consumes two quantized input tensors, their scales and zero points, and the output's scale and zero point, and computes
the quantized output. The quantization formula is x_quantized = (x_fp32 / x_scale) + x_zero_point. For (x_fp32 / x_scale),
it computes the nearest integer value to the argument (in floating-point format), rounding halfway cases away from zero.
Scale and zero point must have the same shape. They must be either a scalar (per tensor) or a 1-D tensor (per row for a and per column for b).
If scale and zero point are 1-D tensors, the number of elements of the scale and zero point tensors of input 'a' and output 'y'
should be equal to the number of rows of input 'a', and the number of elements of the scale and zero point tensors of input 'b'
should be equal to the number of columns of input 'b'.)DOC")
.Input(0, "a", "N-dimensional quantized matrix a", "T1")
.Input(1, "a_scale", "scale of quantized input a", "tensor(float)")
.Input(2, "a_zero_point", "zero point of quantized input a", "T1")
.Input(3, "b", "N-dimensional quantized matrix b", "T2")
.Input(4, "b_scale", "scale of quantized input b", "tensor(float)")
.Input(5, "b_zero_point", "zero point of quantized input b", "T2")
.Input(6, "y_scale", "scale of quantized output y", "tensor(float)")
.Input(7, "y_zero_point", "zero point of quantized output y", "T3")
.Output(0, "y", "Quantized matrix multiply results from a * b", "T3")
.TypeConstraint("T1", {"tensor(int8)", "tensor(uint8)"}, "Constrain input a and its zero point data types as 8-bit integer tensor")
Contributor (commenting on the int8 type constraint):

If this op is only for int8, should it be named QLinearMatMul8? Or will it be expanded to int16/int32 in the future?

Contributor Author:
I would expand it to 16/32 later if needed, rather than naming it for a specific width.

.TypeConstraint("T2", {"tensor(int8)", "tensor(uint8)"}, "Constrain input b and its zero point data types as 8-bit integer tensor")
.TypeConstraint("T3", {"tensor(int8)", "tensor(uint8)"}, "Constrain output y and its zero point data types as 8-bit integer tensor.");

const char* auto_pad_doc =
"auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where "
"default value is NOTSET, which means explicit padding is used. "
@@ -323,7 +348,7 @@ The integer convolution operator consumes an input tensor, a filter, and a padding ...
Matrix product that behaves like numpy.matmul: https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html.
The product MUST never overflow. The accumulation may overflow if and only if it is done in 32 bits.)DOC")
.Input(0, "A", "N-dimensional matrix A", "T1")
.Input(0, "B", "N-dimensional matrix B", "T2")
.Input(1, "B", "N-dimensional matrix B", "T2")
.Output(0, "Y", "Matrix multiply results from A * B", "T3")
.TypeConstraint("T1", {"tensor(int8)", "tensor(uint8)"}, "Constrain input A data types as 8-bit integer tensor")
.TypeConstraint("T2", {"tensor(int8)", "tensor(uint8)"}, "Constrain input B data types as 8-bit integer tensor")