Skip to content

Commit

Permalink
[Relay][VM] Memory planner (part 1) (apache#5144)
Browse files Browse the repository at this point in the history
* Start on memory planning

WIP

Move to test_memory_passes.py

Work on memory planning

Post-rebase and VM changes

Plumb through the offsets

Basic tests all pass, fix offset to data buffer.

Fix compile errors

Fix ws

Apply suggestions from code review

Co-Authored-By: Haichen Shen <shenhaichen@gmail.com>

Address CR

Update src/runtime/vm/vm.cc

Co-Authored-By: Haichen Shen <shenhaichen@gmail.com>

Fix another comment

Fix lint

Fix

Fix

Fix

Lint is done?

Fix

More fix

Trying to debug

No clue

Fix lint

* Fix docs

* Disable aggressive constant eval

* It works

* Fix lint

* Found issue with dynamic

* Fix the pass, but runtime segfaults

* fix scalar tensor, test_any_elemwise passes

* Fix split pass

* Fix 0-rank issues

* Fix

* debug

* apply Haichen's patch and clean up

* lint

* fix serializer and test_tyck_alloc_tensor test

* Fix the constant lift pass in presence of closures

* Restore old finder

* Fix rebase issues

* Fix

* Fix

* Fix issue coercing the shapes incorrectly from i64 to i32

* Fix linting

* Fix clang format

* Format memory.cc

* Fix 0-rank case

* Add fix for (0,) shape

* Ignore shapes for now

* Apply suggestions from code review

Co-authored-by: Zhi <5145158+zhiics@users.noreply.github.com>

* Update src/runtime/vm/executable.cc

Co-authored-by: Zhi <5145158+zhiics@users.noreply.github.com>

* Fix

* lint

Co-authored-by: Zhi Chen <chzhi@amazon.com>
Co-authored-by: Zhi <5145158+zhiics@users.noreply.github.com>
  • Loading branch information
3 people authored and Trevor Morris committed Jun 9, 2020
1 parent 1c7941c commit 7c4270b
Show file tree
Hide file tree
Showing 16 changed files with 649 additions and 126 deletions.
2 changes: 2 additions & 0 deletions include/tvm/runtime/ndarray.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#define TVM_RUNTIME_NDARRAY_H_

#include <tvm/runtime/c_runtime_api.h>
#include <tvm/runtime/data_type.h>
#include <tvm/runtime/object.h>
#include <tvm/runtime/serializer.h>

Expand Down Expand Up @@ -160,6 +161,7 @@ class NDArray : public ObjectRef {
TVMStreamHandle stream = nullptr);

TVM_DLL std::vector<int64_t> Shape() const;
TVM_DLL runtime::DataType DataType() const;
// internal namespace
struct Internal;

Expand Down
12 changes: 9 additions & 3 deletions include/tvm/runtime/vm.h
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,8 @@ struct Instruction {
struct /* AllocTensor Operands */ {
/*! \brief The storage to allocate from. */
RegName storage;
/*! \brief The offset into the storage to allocate from. */
Index offset;
/*! \brief The number of dimensions. */
uint32_t ndim;
/*! \brief The shape of tensor. */
Expand All @@ -146,6 +148,8 @@ struct Instruction {
struct /* AllocTensorReg Operands */ {
/*! \brief The storage to allocate from. */
RegName storage;
/*! \brief The offset into the storage to allocate from. */
Index offset;
/*! \brief The register to read the shape out of. */
RegName shape_register;
/*! \brief The datatype of tensor to be allocated. */
Expand Down Expand Up @@ -267,23 +271,25 @@ struct Instruction {
/*!
* \brief Construct an allocate tensor instruction with constant shape.
* \param storage The storage to allocate out of.
* \param offset The offset to allocate at.
* \param shape The shape of the tensor.
* \param dtype The dtype of the tensor.
* \param dst The destination register.
* \return The allocate tensor instruction.
*/
static Instruction AllocTensor(RegName storage, const std::vector<int64_t>& shape,
static Instruction AllocTensor(RegName storage, Index offset, const std::vector<int64_t>& shape,
DLDataType dtype, RegName dst);
/*!
* \brief Construct an allocate tensor instruction with register.
* \param storage The storage to allocate out of.
* \param offset The offset into the storage to allocate from.
* \param shape_register The register containing the shape.
* \param dtype The dtype of the tensor.
* \param dst The destination register.
* \return The allocate tensor instruction.
*/
static Instruction AllocTensorReg(RegName storage, RegName shape_register, DLDataType dtype,
RegName dst);
static Instruction AllocTensorReg(RegName storage, Index offset, RegName shape_register,
DLDataType dtype, RegName dst);
/*!
* \brief Construct an allocate datatype instruction.
* \param tag The datatype tag.
Expand Down
6 changes: 6 additions & 0 deletions python/tvm/relay/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,12 @@
# Dialects
from . import qnn

from .scope_builder import ScopeBuilder

# Load Memory Passes
from .transform import memory_alloc
from .transform import memory_plan

# Required to traverse large programs
setrecursionlimit(10000)

Expand Down
1 change: 1 addition & 0 deletions python/tvm/relay/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -504,6 +504,7 @@ def const(value, dtype=None):

if not isinstance(value, _nd.NDArray):
raise ValueError("value has to be scalar or NDArray")

return Constant(value)


Expand Down
7 changes: 5 additions & 2 deletions python/tvm/relay/op/memory/memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,14 +40,17 @@ def invoke_tvm_op(func, inputs, outputs):
"""
return _make.invoke_tvm_op(func, inputs, outputs)

def alloc_tensor(storage, shape, dtype='float32', assert_shape=None):
def alloc_tensor(storage, offset, shape, dtype='float32', assert_shape=None):
"""Allocate a tensor with the provided shape, and dtype.
Parameters
----------
storage : tvm.relay.Expr
The storage to allocate from.
offset : tvm.relay.Expr
The offset to allocate from.
shape : tvm.relay.Expr
The shape of the tensor to allocate.
Expand All @@ -61,7 +64,7 @@ def alloc_tensor(storage, shape, dtype='float32', assert_shape=None):
result : tvm.relay.Expr
The alloc_tensor expression.
"""
return _make.alloc_tensor(storage, shape, dtype, assert_shape)
return _make.alloc_tensor(storage, offset, shape, dtype, assert_shape)

def alloc_storage(size, alignment, ctx, dtype_hint='float32'):
"""Allocate a piece of tensor storage.
Expand Down
1 change: 0 additions & 1 deletion python/tvm/relay/transform/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,4 @@
"""The Relay IR namespace containing transformations."""
# transformation passes
from .transform import *

from . import memory_alloc
22 changes: 12 additions & 10 deletions python/tvm/relay/transform/memory_alloc.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,19 +28,21 @@
from ..backend import compile_engine
from ..op.memory import flatten_tuple_type, from_tuple_type, to_tuple_type
from ...import cpu
from ..op.memory import alloc_storage

def alloc_tensor(storage, shape, dtype='float32', assert_shape=None):
    """Allocate a tensor at the start of `storage`.

    Thin wrapper over op.memory.alloc_tensor that fixes the storage
    offset to a constant 0 (int64); memory planning may later rewrite
    allocations to use non-zero offsets.
    """
    return op.memory.alloc_tensor(
        storage, expr.const(0, dtype="int64"), shape, dtype, assert_shape)

def is_primitive(call):
    """Return True iff `call` looks like a call to a primitive (fused) op.

    A node qualifies when it has an `op` attribute whose `attrs` carry a
    `Primitive` flag that converts to the integer 1. Each attribute is
    checked before access, so any non-call node simply yields False.
    """
    if not hasattr(call, 'op'):
        return False
    if not hasattr(call.op, 'attrs'):
        return False
    if not hasattr(call.op.attrs, 'Primitive'):
        return False
    return int(call.op.attrs.Primitive) == 1

class ManifestAllocPass(ExprMutator):
"""A pass for explictly manifesting all memory allocations in Relay."""
"""A pass for explicitly manifesting all memory allocations in Relay."""

def __init__(self, target_host):
self.invoke_tvm = op.memory.invoke_tvm_op
self.alloc_storage = op.memory.alloc_storage
self.alloc_tensor = op.memory.alloc_tensor
self.shape_func = op.memory.shape_func
self.scopes = [ScopeBuilder()]
self.target_host = target_host
Expand Down Expand Up @@ -94,17 +96,16 @@ def make_static_allocation(self, scope, tensor_type, i):
"""Allocate a tensor with a statically known shape."""
shape = [int(sh) for sh in tensor_type.shape]
if len(shape) == 0:
shape = expr.const(np.array([]).astype(
self.compute_dtype), dtype=self.compute_dtype)
shape = expr.const(np.empty((), dtype=self.compute_dtype), dtype=self.compute_dtype)
else:
shape = expr.const(np.array(shape), dtype=self.compute_dtype)
size = self.compute_storage(tensor_type)
alignment = self.compute_alignment(tensor_type.dtype)
dtype = tensor_type.dtype
sto = scope.let("storage_{0}".format(i), self.alloc_storage(
sto = scope.let("storage_{0}".format(i), alloc_storage(
size, alignment, self.default_context, dtype))
# TODO(@jroesch): There is a bug with typing based on the constant shape.
tensor = self.alloc_tensor(sto, shape, dtype, tensor_type.shape)
tensor = alloc_tensor(sto, shape, dtype, tensor_type.shape)
return scope.let("tensor_{0}".format(i), tensor)

def visit_let(self, let):
Expand Down Expand Up @@ -172,14 +173,14 @@ def dynamic_invoke(self, scope, func, ins, new_args, out_types, ret_type):
size = self.compute_storage_in_relay(
out_shape, out_type.dtype)
alignment = self.compute_alignment(out_type.dtype)
sto = scope.let("storage_{i}".format(i=i), self.alloc_storage(
sto = scope.let("storage_{i}".format(i=i), alloc_storage(
size, alignment, self.default_context, out_type.dtype))
storages.append(sto)

outs = []
sh_ty_storage = zip(out_shapes, out_types, storages)
for i, (out_shape, out_type, storage) in enumerate(sh_ty_storage):
alloc = self.alloc_tensor(
alloc = alloc_tensor(
storage,
out_shape,
out_type.dtype,
Expand All @@ -204,6 +205,7 @@ def visit_call(self, call):
# Because we are in ANF we do not need to visit the arguments.
scope = self.current_scope()
new_args = [self.visit(arg) for arg in call.args]

ins = expr.Tuple(new_args)
ret_type = call.checked_type
out_types = flatten_tuple_type(ret_type)
Expand Down Expand Up @@ -233,7 +235,7 @@ def __init__(self, target_host):
self.target_host = target_host

def transform_function(self, func, mod, _):
# TODO(@jroesch): Is there a way to do one shot initilization?
# TODO(@jroesch): Is there a way to do one shot initialization?
# can we have def pass_init?
mod.import_from_std("core.rly")
ea = ManifestAllocPass(self.target_host)
Expand Down
Loading

0 comments on commit 7c4270b

Please sign in to comment.