From e16b5e574e1606544f828bea35f45c6e2fe97119 Mon Sep 17 00:00:00 2001 From: Mark Shields <87091372+mbs-octoml@users.noreply.github.com> Date: Fri, 22 Oct 2021 06:45:28 -0700 Subject: [PATCH] BUG: Look through on_device annotations when looking for shape constants (#9345) https://github.com/apache/tvm/pull/8788 introduced a perf regression since a `shape.as` in `alloc_tensor` was always failing due to the extra `on_device` annotation on the constant. Fixed that, and introduced some helpers to make this situation easier to deal with. (This is CORE-102 in OctoML JIRA). (Second try -- test_crp.py failure seems unrelated) --- src/relay/backend/aot_executor_codegen.cc | 3 +-- src/relay/backend/graph_plan_memory.cc | 5 ++--- src/relay/backend/vm/compiler.cc | 7 +++--- src/relay/op/annotation/annotation.h | 26 +++++++++++++++++++++++ src/relay/op/memory/memory.cc | 10 +++------ src/relay/transforms/pass_utils.h | 5 ++--- tests/python/relay/test_vm.py | 15 ++++++++++++- 7 files changed, 52 insertions(+), 19 deletions(-) diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index 56e008a345de..3c9c35c4f254 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -182,9 +182,8 @@ class AOTOnDemandAllocator : public transform::DeviceAwareExprVisitor { * \return The corresponding token. */ StorageInfo GetStorage(const Expr& expr) { - auto props = GetOnDeviceProps(expr); // See through "on_device" calls. - Expr true_expr = props.body.defined() ? 
props.body : expr; + Expr true_expr = IgnoreOnDevice(expr); VisitExpr(true_expr); auto it = storage_device_map_.find(true_expr); ICHECK(it != storage_device_map_.end()); diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc index 7642f3ccf703..961252a14fa7 100644 --- a/src/relay/backend/graph_plan_memory.cc +++ b/src/relay/backend/graph_plan_memory.cc @@ -146,10 +146,9 @@ class StorageAllocaBaseVisitor : public transform::DeviceAwareExprVisitor { * \return The corresponding token. */ const std::vector<StorageToken*>& GetToken(const Expr& expr) { - this->VisitExpr(expr); // See through on_device calls. - auto props = GetOnDeviceProps(expr); - Expr real_expr = props.body.defined() ? props.body : expr; + Expr real_expr = IgnoreOnDevice(expr); + this->VisitExpr(real_expr); auto it = token_map_.find(real_expr.get()); ICHECK(it != token_map_.end()) << "Expression not found in storage map:" << std::endl << PrettyPrint(real_expr); diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index 70ad2ccc992e..b3c1cd81274f 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -594,8 +594,9 @@ class VMFunctionCompiler : DeviceAwareExprFunctor<void(const Expr&)> { auto offset_register = last_register_; // If the shape is constant then we will emit a static tensor allocation - // instruction. - auto const_shape = args[2].as<ConstantNode>(); + // instruction. It may be wrapped by an on_device, but it will be on the host + // which is assumed by the alloc_tensor instruction anyway. + auto const_shape = AsIgnoringOnDevice<ConstantNode>(args[2]); if (const_shape) { NDArray shape = const_shape->data; @@ -619,7 +620,7 @@ class VMFunctionCompiler : DeviceAwareExprFunctor<void(const Expr&)> { this->VisitExpr(args[0]); auto size_register = last_register_; - ICHECK(args[1].as<ConstantNode>()); + ICHECK(args[1].as<ConstantNode>()); // Always a literal. 
NDArray alignment_arr = args[1].as<ConstantNode>()->data; ICHECK_EQ(alignment_arr->dtype.code, 0U) << "The dtype of constant shape must be int32 or int64, but got " diff --git a/src/relay/op/annotation/annotation.h b/src/relay/op/annotation/annotation.h index b6dff8813fd4..d772df9b023a 100644 --- a/src/relay/op/annotation/annotation.h +++ b/src/relay/op/annotation/annotation.h @@ -85,6 +85,32 @@ OnDeviceProps GetOnDeviceProps(const CallNode* call_node); */ OnDeviceProps GetOnDeviceProps(const Expr& expr); +/*! + * \brief Returns the body of \p expr if it is an "on_device" annotation, otherwise returns + * \p expr directly. + */ +inline Expr IgnoreOnDevice(const Expr& expr) { + OnDeviceProps props = GetOnDeviceProps(expr); + return props.body.defined() ? props.body : expr; +} + +/*! + * \brief Returns \p expr as \p NodeType, or null if it is not of that type. Looks through + * any "on_device" annotations. + */ +template <typename NodeType> +const NodeType* AsIgnoringOnDevice(const Expr& expr) { + const auto* node = expr.as<NodeType>(); + if (node != nullptr) { + return node; + } + OnDeviceProps props = GetOnDeviceProps(expr); + if (!props.body.defined()) { + return nullptr; + } + return props.body.as<NodeType>(); +} + /*! * \brief Returns \p function annotated with "param_device_types" and "result_device_type" * attributes capturing parameter and result devices types respectively. diff --git a/src/relay/op/memory/memory.cc b/src/relay/op/memory/memory.cc index 6b22cfd6bdba..08e92b31965e 100644 --- a/src/relay/op/memory/memory.cc +++ b/src/relay/op/memory/memory.cc @@ -101,13 +101,9 @@ Expr AllocTensor(Expr storage, Expr offset, Expr shape, DataType dtype, attrs->assert_shape = assert_shape; } else { // Look through any on_device for the shape argument expression. - Expr literal_shape = shape; - auto props = GetOnDeviceProps(literal_shape); - if (props.body.defined()) { - // See through on_device calls. 
- literal_shape = props.body; - } - attrs->const_shape = Downcast<Constant>(literal_shape); + const auto* constant_node = AsIgnoringOnDevice<ConstantNode>(shape); + ICHECK(constant_node); + attrs->const_shape = GetRef<Constant>(constant_node); } static const Op& op = Op::Get("memory.alloc_tensor"); return Call(op, {storage, offset, shape}, Attrs(attrs), {}); diff --git a/src/relay/transforms/pass_utils.h b/src/relay/transforms/pass_utils.h index ed9409856871..fd7f0a5594c2 100644 --- a/src/relay/transforms/pass_utils.h +++ b/src/relay/transforms/pass_utils.h @@ -118,9 +118,8 @@ inline Expr TransformF(const std::function<Expr(const Expr&)>& func, const Expr& * is it atomic? * if so, the compute cost of the expression is bounded so it can be copy without graph mode. */ -inline bool IsAtomic(const Expr& e) { - auto props = GetOnDeviceProps(e); - Expr true_expr = props.body.defined() ? props.body : e; +inline bool IsAtomic(const Expr& expr) { + Expr true_expr = IgnoreOnDevice(expr); return true_expr.as<VarNode>() || true_expr.as<OpNode>() || true_expr.as<ConstructorNode>() || true_expr.as<GlobalVarNode>() || true_expr.as<ConstantNode>(); // Constant is always by reference. 
diff --git a/tests/python/relay/test_vm.py b/tests/python/relay/test_vm.py index 42fe1a3cef3a..8ec41523f9dc 100644 --- a/tests/python/relay/test_vm.py +++ b/tests/python/relay/test_vm.py @@ -766,6 +766,19 @@ def test_vm_reshape_tensor(target, dev): check_result(target, dev, [x_np, y_np], x_np.reshape([8, 2, 8]), mod) +def test_vm_reshape_and_copy(target, dev): + """Make sure the compiler notices the reshape result shape is a literal and can use + the immediate-mode alloc_tensor instruction instead of alloc_tensor_reg.""" + x_np = np.random.uniform(size=(1, 1)).astype("float32") + x = relay.var("x", shape=(1, 1), dtype="float32") + mod = tvm.IRModule.from_expr(relay.Function([x], relay.copy(relay.reshape(x, [0, 1])))) + with tvm.transform.PassContext(opt_level=3): + exec = relay.vm.compile(mod, "llvm") + assert "alloc_tensor" in exec.bytecode + assert not "alloc_tensor_reg" in exec.bytecode + check_result(target, dev, [x_np], x_np.reshape([1, 1]), mod) + + def test_vm_reshape_tuple(target, dev, x_shape=(1, 4, 2), y_shape=(1, 2, 10)): tup = relay.var( "tup", @@ -963,4 +976,4 @@ def test_benchmark_end_to_end_rpc(): if __name__ == "__main__": import sys - sys.exit(pytest.main(sys.argv)) + sys.exit(pytest.main([__file__] + sys.argv[1:]))