From e16b5e574e1606544f828bea35f45c6e2fe97119 Mon Sep 17 00:00:00 2001 From: Mark Shields <87091372+mbs-octoml@users.noreply.github.com> Date: Fri, 22 Oct 2021 06:45:28 -0700 Subject: [PATCH] BUG: Look through on_device annotations when looking for shape constants (#9345) https://github.com/apache/tvm/pull/8788 introduced a perf regression since a `shape.as` in `alloc_tensor` was always failing due to the extra `on_device` annotation on the constant. Fixed that, and introduced some helpers to make this situation easier to deal with. (This is CORE-102 in OctoML JIRA). (Second try -- test_crp.py failure seems unrelated) --- src/relay/backend/aot_executor_codegen.cc | 3 +-- src/relay/backend/graph_plan_memory.cc | 5 ++--- src/relay/backend/vm/compiler.cc | 7 +++--- src/relay/op/annotation/annotation.h | 26 +++++++++++++++++++++++ src/relay/op/memory/memory.cc | 10 +++------ src/relay/transforms/pass_utils.h | 5 ++--- tests/python/relay/test_vm.py | 15 ++++++++++++- 7 files changed, 52 insertions(+), 19 deletions(-) diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index 56e008a345de..3c9c35c4f254 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -182,9 +182,8 @@ class AOTOnDemandAllocator : public transform::DeviceAwareExprVisitor { * \return The corresponding token. */ StorageInfo GetStorage(const Expr& expr) { - auto props = GetOnDeviceProps(expr); // See through "on_device" calls. - Expr true_expr = props.body.defined() ? 
props.body : expr; + Expr true_expr = IgnoreOnDevice(expr); VisitExpr(true_expr); auto it = storage_device_map_.find(true_expr); ICHECK(it != storage_device_map_.end()); diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc index 7642f3ccf703..961252a14fa7 100644 --- a/src/relay/backend/graph_plan_memory.cc +++ b/src/relay/backend/graph_plan_memory.cc @@ -146,10 +146,9 @@ class StorageAllocaBaseVisitor : public transform::DeviceAwareExprVisitor { * \return The corresponding token. */ const std::vector<StorageToken*>& GetToken(const Expr& expr) { - this->VisitExpr(expr); // See through on_device calls. - auto props = GetOnDeviceProps(expr); - Expr real_expr = props.body.defined() ? props.body : expr; + Expr real_expr = IgnoreOnDevice(expr); + this->VisitExpr(real_expr); auto it = token_map_.find(real_expr.get()); ICHECK(it != token_map_.end()) << "Expression not found in storage map:" << std::endl << PrettyPrint(real_expr); diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index 70ad2ccc992e..b3c1cd81274f 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -594,8 +594,9 @@ class VMFunctionCompiler : DeviceAwareExprFunctor<void(const Expr&)> { auto offset_register = last_register_; // If the shape is constant then we will emit a static tensor allocation - // instruction. - auto const_shape = args[2].as<ConstantNode>(); + // instruction. It may be wrapped by an on_device, but it will be on the host + // which is assumed by the alloc_tensor instruction anyway. + auto const_shape = AsIgnoringOnDevice<ConstantNode>(args[2]); if (const_shape) { NDArray shape = const_shape->data; @@ -619,7 +620,7 @@ class VMFunctionCompiler : DeviceAwareExprFunctor<void(const Expr&)> { this->VisitExpr(args[0]); auto size_register = last_register_; - ICHECK(args[1].as<ConstantNode>()); + ICHECK(args[1].as<ConstantNode>()); // Always a literal. 
NDArray alignment_arr = args[1].as<ConstantNode>()->data; ICHECK_EQ(alignment_arr->dtype.code, 0U) << "The dtype of constant shape must be int32 or int64, but got " diff --git a/src/relay/op/annotation/annotation.h b/src/relay/op/annotation/annotation.h index b6dff8813fd4..d772df9b023a 100644 --- a/src/relay/op/annotation/annotation.h +++ b/src/relay/op/annotation/annotation.h @@ -85,6 +85,32 @@ OnDeviceProps GetOnDeviceProps(const CallNode* call_node); */ OnDeviceProps GetOnDeviceProps(const Expr& expr); +/*! + * \brief Returns the body of \p expr if it is an "on_device" annotation, otherwise returns + * \p expr directly. + */ +inline Expr IgnoreOnDevice(const Expr& expr) { + OnDeviceProps props = GetOnDeviceProps(expr); + return props.body.defined() ? props.body : expr; +} + +/*! + * \brief Returns \p expr as \p NodeType, or null if it is not of that type. Looks through + * any "on_device" annotations. + */ +template <typename NodeType> +const NodeType* AsIgnoringOnDevice(const Expr& expr) { + const auto* node = expr.as<NodeType>(); + if (node != nullptr) { + return node; + } + OnDeviceProps props = GetOnDeviceProps(expr); + if (!props.body.defined()) { + return nullptr; + } + return props.body.as<NodeType>(); +} + /*! * \brief Returns \p function annotated with "param_device_types" and "result_device_type" * attributes capturing parameter and result devices types respectively. diff --git a/src/relay/op/memory/memory.cc b/src/relay/op/memory/memory.cc index 6b22cfd6bdba..08e92b31965e 100644 --- a/src/relay/op/memory/memory.cc +++ b/src/relay/op/memory/memory.cc @@ -101,13 +101,9 @@ Expr AllocTensor(Expr storage, Expr offset, Expr shape, DataType dtype, attrs->assert_shape = assert_shape; } else { // Look through any on_device for the shape argument expression. - Expr literal_shape = shape; - auto props = GetOnDeviceProps(literal_shape); - if (props.body.defined()) { - // See through on_device calls. 
- literal_shape = props.body; - } - attrs->const_shape = Downcast<Constant>(literal_shape); + const auto* constant_node = AsIgnoringOnDevice<ConstantNode>(shape); + ICHECK(constant_node); + attrs->const_shape = GetRef<Constant>(constant_node); } static const Op& op = Op::Get("memory.alloc_tensor"); return Call(op, {storage, offset, shape}, Attrs(attrs), {}); diff --git a/src/relay/transforms/pass_utils.h b/src/relay/transforms/pass_utils.h index ed9409856871..fd7f0a5594c2 100644 --- a/src/relay/transforms/pass_utils.h +++ b/src/relay/transforms/pass_utils.h @@ -118,9 +118,8 @@ inline Expr TransformF(const std::function<Expr(const Expr&)>& func, const Expr& * is it atomic? * if so, the compute cost of the expression is bounded so it can be copy without graph mode. */ -inline bool IsAtomic(const Expr& e) { - auto props = GetOnDeviceProps(e); - Expr true_expr = props.body.defined() ? props.body : e; +inline bool IsAtomic(const Expr& expr) { + Expr true_expr = IgnoreOnDevice(expr); return true_expr.as<VarNode>() || true_expr.as<OpNode>() || true_expr.as<ConstructorNode>() || true_expr.as<GlobalVarNode>() || true_expr.as<ConstantNode>(); // Constant is always by reference. 
diff --git a/tests/python/relay/test_vm.py b/tests/python/relay/test_vm.py index 42fe1a3cef3a..8ec41523f9dc 100644 --- a/tests/python/relay/test_vm.py +++ b/tests/python/relay/test_vm.py @@ -766,6 +766,19 @@ def test_vm_reshape_tensor(target, dev): check_result(target, dev, [x_np, y_np], x_np.reshape([8, 2, 8]), mod) +def test_vm_reshape_and_copy(target, dev): + """Make sure the compiler notices the reshape result shape is a literal and can use + the immediate-mode alloc_tensor instruction instead of alloc_tensor_reg.""" + x_np = np.random.uniform(size=(1, 1)).astype("float32") + x = relay.var("x", shape=(1, 1), dtype="float32") + mod = tvm.IRModule.from_expr(relay.Function([x], relay.copy(relay.reshape(x, [0, 1])))) + with tvm.transform.PassContext(opt_level=3): + exec = relay.vm.compile(mod, "llvm") + assert "alloc_tensor" in exec.bytecode + assert not "alloc_tensor_reg" in exec.bytecode + check_result(target, dev, [x_np], x_np.reshape([1, 1]), mod) + + def test_vm_reshape_tuple(target, dev, x_shape=(1, 4, 2), y_shape=(1, 2, 10)): tup = relay.var( "tup", @@ -963,4 +976,4 @@ def test_benchmark_end_to_end_rpc(): if __name__ == "__main__": import sys - sys.exit(pytest.main(sys.argv)) + sys.exit(pytest.main([__file__] + sys.argv[1:]))