
Commit 0f9807e

James Reed authored and facebook-github-bot committed
Enable addmm fusion for ONNX export only (pytorch#12538)
Summary: There are some action-at-a-distance issues, and not having this is disabling quantization in C2 for prod use cases (ref T34831022).

Pull Request resolved: pytorch#12538
Differential Revision: D10302931
Pulled By: jamesr66a
fbshipit-source-id: 700dc8c5c4297e942171992266ffb67b815be754
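For context, the fusion this flag controls rewrites a matrix multiply followed by an add into a single addmm, which the ONNX exporter can then emit as one Gemm node. A minimal sketch of the equivalence (not part of this commit; uses only public torch ops):

import torch

x, y, c = torch.rand(3, 4), torch.rand(4, 5), torch.rand(3, 5)

# Unfused form: what the normal JIT graph keeps (aten::mm followed by aten::add).
unfused = torch.mm(x, y) + c

# Fused form: a single addmm with alpha == beta == 1, which maps onto one ONNX Gemm node.
fused = torch.addmm(c, x, y)

assert torch.allclose(unfused, fused)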
1 parent 7b0f5d6 commit 0f9807e

File tree: 8 files changed, +60 −21 lines changed

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+graph(%0 : Double(*, *)
+      %1 : Double(*, *)
+      %2 : Double(*, *)) {
+  %3 : int = prim::Constant[value=1]()
+  %4 : Double(*, *) = aten::mm(%0, %1)
+  %5 : Double(*, *) = aten::add(%4, %2, %3)
+  return (%5);
+}
Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+ModelProto {
+  producer_name: "pytorch"
+  domain: ""
+  doc_string: ""
+  graph:
+    GraphProto {
+      name: "torch-jit-export"
+      inputs: [{name: "0", type:Tensor dims: 3 4},{name: "1", type:Tensor dims: 4 5},{name: "2", type:Tensor dims: 3 5}]
+      outputs: [{name: "3", type:Tensor dims: 3 5}]
+      initializers: []
+      nodes: [
+        Node {type: "Gemm", inputs: [0,1,2], outputs: [3], attributes: [{ name: 'alpha', type: float, value: 1},{ name: 'beta', type: float, value: 1}]}
+      ]
+    }
+  opset_import: [OperatorSetIdProto { domain: }],
+}
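The two expect files above are the outputs checked by the new test_addmm_fusion test below: the first keeps aten::mm + aten::add in the JIT graph, while the ONNX export collapses them into a single Gemm node. A rough sketch of how such output can be regenerated interactively (file naming and exact formatting aside; assumes the same APIs the test uses):

import io
import torch

class AddmmWrapper(torch.nn.Module):
    def forward(self, x, y, c):
        return torch.mm(x, y) + c

x, y, c = torch.rand(3, 4), torch.rand(4, 5), torch.rand(3, 5)

# ONNX path: peephole runs with addmm_fusion_enabled=True, so a single Gemm appears.
print(torch.onnx.export_to_pretty_string(AddmmWrapper(), (x, y, c), io.BytesIO()))

# JIT path: fusion stays off, so the traced graph still shows aten::mm + aten::add.
traced = torch.jit.trace(AddmmWrapper(), (x, y, c))
print(traced.graph)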

test/expect/TestScript.test_onnx_export_speculate-f2.expect

Lines changed: 6 additions & 12 deletions
@@ -24,25 +24,21 @@ ModelProto {
       GraphProto {
         name: "torch-jit-export2"
         inputs: []
-        outputs: [{name: "11", type:Tensor dims: 1 20}]
+        outputs: [{name: "9", type:Tensor dims: 1 20}]
         initializers: []
         nodes: [
-          Node {type: "Constant", inputs: [], outputs: [9], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: [1]}]},
-          Node {type: "Gemm", inputs: [3,1,9], outputs: [10], attributes: [{ name: 'alpha', type: float, value: 1},{ name: 'beta', type: float, value: 0},{ name: 'transB', type: int, value: 1}]},
-          Node {type: "Add", inputs: [2,10], outputs: [11], attributes: []}
+          Node {type: "Gemm", inputs: [3,1,2], outputs: [9], attributes: [{ name: 'alpha', type: float, value: 1},{ name: 'beta', type: float, value: 1},{ name: 'transB', type: int, value: 1}]}
         ]
       }
 
     },{ name: 'else_branch', type: graph, value:
       GraphProto {
         name: "torch-jit-export3"
         inputs: []
-        outputs: [{name: "14", type:Tensor dims: 1 20}]
+        outputs: [{name: "10", type:Tensor dims: 1 20}]
         initializers: []
         nodes: [
-          Node {type: "Constant", inputs: [], outputs: [12], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: [1]}]},
-          Node {type: "Gemm", inputs: [3,1,12], outputs: [13], attributes: [{ name: 'alpha', type: float, value: 1},{ name: 'beta', type: float, value: 0},{ name: 'transB', type: int, value: 1}]},
-          Node {type: "Add", inputs: [2,13], outputs: [14], attributes: []}
+          Node {type: "Gemm", inputs: [3,1,2], outputs: [10], attributes: [{ name: 'alpha', type: float, value: 1},{ name: 'beta', type: float, value: 1},{ name: 'transB', type: int, value: 1}]}
         ]
       }
 
@@ -54,12 +50,10 @@ ModelProto {
       GraphProto {
         name: "torch-jit-export4"
         inputs: []
-        outputs: [{name: "17", type:Tensor dims: 1 20}]
+        outputs: [{name: "11", type:Tensor dims: 1 20}]
         initializers: []
         nodes: [
-          Node {type: "Constant", inputs: [], outputs: [15], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: [1]}]},
-          Node {type: "Gemm", inputs: [3,1,15], outputs: [16], attributes: [{ name: 'alpha', type: float, value: 1},{ name: 'beta', type: float, value: 0},{ name: 'transB', type: int, value: 1}]},
-          Node {type: "Add", inputs: [2,16], outputs: [17], attributes: []}
+          Node {type: "Gemm", inputs: [3,1,2], outputs: [11], attributes: [{ name: 'alpha', type: float, value: 1},{ name: 'beta', type: float, value: 1},{ name: 'transB', type: int, value: 1}]}
         ]
       }
 

test/test_jit.py

Lines changed: 15 additions & 0 deletions
@@ -7222,6 +7222,21 @@ def elif_test(niter : int):
 
         self.checkScript(code, (101,), name='elif_test', outputs=3028)
 
+    def test_addmm_fusion(self):
+        class AddmmWrapper(torch.nn.Module):
+            def forward(self, x, y, c):
+                return torch.mm(x, y) + c
+
+        # Test addmm fusion is disabled for normal Jit
+        x, y, c = torch.rand(3, 4), torch.rand(4, 5), torch.rand(3, 5)
+        f = io.BytesIO()
+        pretty = torch.onnx.export_to_pretty_string(AddmmWrapper(), (x, y, c), f)
+        self.assertExpected(pretty, 'onnx')
+
+        jit_trace = torch.jit.trace(AddmmWrapper(), (x, y, c))
+        ge_graph = jit_trace.__getattr__('forward').graph_for(x, y, c)
+        self.assertExpectedGraph(ge_graph, 'jit')
+
     def test_weak_script_function(self):
         outer_var = 10
         outer_var2 = 11

torch/csrc/jit/init.cpp

Lines changed: 1 addition & 1 deletion
@@ -90,7 +90,7 @@ void initJITBindings(PyObject *module) {
         return EliminateCommonSubexpression(g); // overload resolution
       })
     .def("_jit_pass_constant_pooling", ConstantPooling)
-    .def("_jit_pass_peephole", PeepholeOptimize)
+    .def("_jit_pass_peephole", PeepholeOptimize, py::arg("graph"), py::arg("addmm_fusion_enabled") = false)
     .def("_jit_pass_canonicalize", [](const std::shared_ptr<Graph>& g) {
         return Canonicalize(g);
       })
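With py::arg("addmm_fusion_enabled") = false registered above, the flag can also be toggled explicitly from Python when experimenting with the pass. A minimal sketch against the private binding (assumes a scripted function and that these internal APIs stay as registered here):

import torch

@torch.jit.script
def mm_add(x, y, c):
    return torch.mm(x, y) + c

g = mm_add.graph
torch._C._jit_pass_peephole(g)        # default: addmm fusion stays off
torch._C._jit_pass_peephole(g, True)  # opt in, as the ONNX exporter now does
print(g)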

torch/csrc/jit/passes/peephole.cpp

Lines changed: 11 additions & 5 deletions
@@ -14,12 +14,19 @@ namespace torch { namespace jit {
 // - Simply x.t().t() to x
 //
 // TODO: Decide what kind of fixed point strategy we will have
-void PeepholeOptimize(Block * block) {
+//
+// The parameter `addmm_fusion_enabled` exists because, as it is today, fusing
+// add + mm has no benefit within PyTorch running ATen ops. However, we rely on
+// seeing the fused version of addmm for ONNX export, since after ONNX translation
+// we would see redundant Gemm ops with sub-optimal inputs. This flag is exposed
+// so that ONNX export can pass `true` to get the fused behavior, but normal
+// JIT peephole optimization is left alone.
+void PeepholeOptimize(Block * block, bool addmm_fusion_enabled) {
   for (auto it = block->nodes().begin(); it != block->nodes().end(); ++it) {
     auto* node = *it;
 
     for (Block * sub_block : node->blocks()) {
-      PeepholeOptimize(sub_block);
+      PeepholeOptimize(sub_block, addmm_fusion_enabled);
     }
 
     // XXX: remember that if you want to simplify an expression by combining multiple nodes
@@ -60,7 +67,6 @@ void PeepholeOptimize(Block * block) {
       // and because it works out of place on C, we're only trading off an explicit add for
       // a copy inside the addmm function. Note that it doesn't even result in fewer reads,
       // because mm won't even load C (because beta == 0 for it).
-      static constexpr bool addmm_fusion_enabled = false;
       if (addmm_fusion_enabled && node->get<at::Scalar>(attr::alpha).value().toDouble() == 1.) {
         // Look for mm from both sides of the add
         for (size_t mm_side = 0; mm_side < 2; mm_side++) {
@@ -123,8 +129,8 @@ void PeepholeOptimize(Block * block) {
   }
 }
 
-void PeepholeOptimize(std::shared_ptr<Graph>& graph) {
-  PeepholeOptimize(graph->block());
+void PeepholeOptimize(std::shared_ptr<Graph>& graph, bool addmm_fusion_enabled) {
+  PeepholeOptimize(graph->block(), addmm_fusion_enabled);
   // Eliminate dead code created by any peephole passes we've just done
   EliminateDeadCode(graph->block());
 }

torch/csrc/jit/passes/peephole.h

Lines changed: 1 addition & 1 deletion
@@ -4,6 +4,6 @@
 
 namespace torch { namespace jit {
 
-TORCH_API void PeepholeOptimize(std::shared_ptr<Graph>& graph);
+TORCH_API void PeepholeOptimize(std::shared_ptr<Graph>& graph, bool addmm_fusion_enabled=false);
 
 }}

torch/onnx/utils.py

Lines changed: 2 additions & 2 deletions
@@ -138,7 +138,7 @@ def _optimize_graph(graph, operator_export_type):
     torch._C._jit_pass_canonicalize_ops(graph)
     torch._C._jit_pass_lint(graph)
 
-    torch._C._jit_pass_peephole(graph)
+    torch._C._jit_pass_peephole(graph, True)
     torch._C._jit_pass_lint(graph)
 
     # onnx only supports tensors, but 1 / 2 = 0.5 and tensor(1) / tensor(2) = 0
@@ -147,7 +147,7 @@ def _optimize_graph(graph, operator_export_type):
     torch._C._jit_pass_erase_number_types(graph)
    # onnx does not support tuples, so try to remove them
     torch._C._jit_pass_lower_all_tuples(graph)
-    torch._C._jit_pass_peephole(graph)
+    torch._C._jit_pass_peephole(graph, True)
     torch._C._jit_pass_lint(graph)
 
     if operator_export_type != OperatorExportTypes.RAW:
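Since _optimize_graph now always passes True, every ONNX export goes through the fused path. A quick end-to-end sanity check (a sketch, assuming the standalone onnx package is available for inspecting the exported proto):

import io
import onnx
import torch

class AddmmWrapper(torch.nn.Module):
    def forward(self, x, y, c):
        return torch.mm(x, y) + c

x, y, c = torch.rand(3, 4), torch.rand(4, 5), torch.rand(3, 5)
buf = io.BytesIO()
torch.onnx.export(AddmmWrapper(), (x, y, c), buf)

model = onnx.load_from_string(buf.getvalue())
# Expect a single Gemm node rather than MatMul + Add.
print([node.op_type for node in model.graph.node])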

0 commit comments
