Add memory manager (#1)
icemelon authored and jroesch committed Apr 26, 2019
1 parent bf2335a commit 9afa9b0
Showing 10 changed files with 331 additions and 40 deletions.
11 changes: 9 additions & 2 deletions include/tvm/relay/vm/vm.h
@@ -6,8 +6,10 @@
#ifndef TVM_RELAY_RUNTIME_H_
#define TVM_RELAY_RUNTIME_H_

#include <vector>
#include <memory>
#include <tvm/relay/expr_functor.h>
#include <tvm/runtime/memory_manager.h>

namespace tvm {
namespace relay {
@@ -161,6 +163,8 @@ struct VirtualMachine {
const Instruction* code;
size_t pc;
size_t bp;

std::vector<TVMContext> ctxs;

// Interface debugging.
std::unordered_map<GlobalVar, size_t, NodeHash, NodeEqual> global_map;
@@ -177,7 +181,10 @@
functions(), frames(), stack(),
func_index(0), code(nullptr), pc(0), bp(0) {}

static VirtualMachine FromModule(const Module& module);
void Init(const std::vector<TVMContext>& ctxs);

static VirtualMachine FromModule(const Module& module,
const std::vector<TVMContext>& ctxs);
};

VirtualMachine CompileModule(const Module& mod);
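Taken together, the new API splits VM construction from device setup: FromModule compiles the module and then calls Init with the requested contexts. A minimal usage sketch (hypothetical driver code, not part of this commit; it assumes a compiled Relay Module `mod` and VM-compatible arguments are already in scope):

    // Editor's sketch, not part of the diff.
    TVMContext ctx;
    ctx.device_type = DLDeviceType::kDLCPU;
    ctx.device_id = 0;
    VirtualMachine vm = VirtualMachine::FromModule(mod, {ctx});
    std::vector<VMObject> args;  // filled with VMTensor values in practice
    VMObject result = vm.Invoke(vm.functions[0], args);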
75 changes: 75 additions & 0 deletions include/tvm/runtime/memory_manager.h
@@ -0,0 +1,75 @@
/*!
* Copyright (c) 2019 by Contributors
* \file tvm/runtime/memory_manager.h
* \brief Abstract device memory management API
*/
#ifndef TVM_RUNTIME_MEMORY_MANAGER_H_
#define TVM_RUNTIME_MEMORY_MANAGER_H_

#include <memory>
#include <mutex>
#include <vector>
#include <unordered_map>
#include "c_runtime_api.h"

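// These std::hash / std::equal_to specializations let TVMContext be used as
// an unordered_map key (see MemoryManager::allocators_ below).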
namespace std {
template<>
struct hash<TVMContext> {
std::size_t operator()(const TVMContext& ctx) const {
return ((ctx.device_id << 8) | ctx.device_type);
}
};

template<>
struct equal_to<TVMContext> {
bool operator()(const TVMContext& lhs, const TVMContext& rhs) const {
return (lhs.device_type == rhs.device_type &&
lhs.device_id == rhs.device_id);
}
};

} // namespace std

namespace tvm {
namespace runtime {

struct Buffer {
// data pointer
void* data{nullptr};
// Buffer size in bytes
size_t size{0};
// TVM Context
TVMContext ctx;
};

class Allocator {
public:
explicit Allocator(TVMContext ctx) : ctx_(ctx) {}

virtual Buffer Alloc(size_t nbytes, size_t alignment, TVMType type_hint) = 0;
virtual void Free(const Buffer& buffer) = 0;
virtual size_t UsedMemory() = 0;
virtual ~Allocator() = default;

protected:
TVMContext ctx_;
};

class MemoryManager {
public:
static MemoryManager* Global();

Allocator* GetAllocator(TVMContext ctx);

private:
MemoryManager() {}

private:
std::mutex mu_;
std::unordered_map<TVMContext, std::unique_ptr<Allocator>> allocators_;
};

} // namespace runtime
} // namespace tvm

#endif // TVM_RUNTIME_MEMORY_MANAGER_H_
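The manager is a process-wide singleton that lazily creates one Allocator per context. A minimal sketch of the intended call pattern (hypothetical code, not part of this commit; TVMType follows DLDataType's code/bits/lanes layout):

    // Editor's sketch, not part of the diff.
    TVMContext ctx;
    ctx.device_type = kDLCPU;
    ctx.device_id = 0;
    tvm::runtime::Allocator* alloc =
        tvm::runtime::MemoryManager::Global()->GetAllocator(ctx);
    // 1 KiB, 64-byte aligned, hinted as float32 elements.
    TVMType hint{kDLFloat, 32, 1};
    tvm::runtime::Buffer buf = alloc->Alloc(1024, 64, hint);
    // ... use buf.data ...
    alloc->Free(buf);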
17 changes: 16 additions & 1 deletion include/tvm/runtime/ndarray.h
@@ -28,6 +28,7 @@
#include <vector>
#include <utility>
#include "c_runtime_api.h"
#include "memory_manager.h"
#include "serializer.h"

namespace tvm {
@@ -167,7 +168,8 @@ class NDArray {
*/
TVM_DLL static NDArray Empty(std::vector<int64_t> shape,
DLDataType dtype,
DLContext ctx);
DLContext ctx,
Allocator* allocator = nullptr);
/*!
* \brief Create a NDArray backed by a dlpack tensor.
*
@@ -309,6 +311,19 @@ class NDArray::Container {
}
}
}

private:
friend class NDArray;
friend class RPCWrappedFunc;
/*!
* \brief The shape container,
can be used for shape data.
*/
std::vector<int64_t> shape_;
/*! \brief The internal reference counter */
std::atomic<int> ref_counter_{0};
/*! \brief Buffer allocated by allocator */
Buffer* buffer_;
};

// implementations of inline functions
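Because the new parameter defaults to nullptr, existing Empty call sites compile unchanged, while the VM can pass an allocator so the array's backing Buffer is tracked. A hedged sketch of the two call styles (hypothetical code; assumes a CPU context `ctx` as above):

    // Editor's sketch, not part of the diff.
    DLDataType f32{kDLFloat, 32, 1};
    // Old path: no allocator, behavior as before this change.
    NDArray a = NDArray::Empty({2, 3}, f32, ctx);
    // New path: allocation routed through the context's allocator.
    auto* alloc = tvm::runtime::MemoryManager::Global()->GetAllocator(ctx);
    NDArray b = NDArray::Empty({2, 3}, f32, ctx, alloc);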
4 changes: 2 additions & 2 deletions python/tvm/relay/vm.py
@@ -50,7 +50,7 @@ def convert(args):
_convert(arg, cargs)
return cargs

def eval_vm(expr_or_mod, *args):
def eval_vm(expr_or_mod, ctx, *args):
if isinstance(expr_or_mod, Expr):
mod = Module.from_expr(expr_or_mod)
else:
@@ -67,4 +67,4 @@ def eval_vm(expr_or_mod, *args):

cargs = convert(list(args))
return _evaluate_vm(mod, cargs)
return _evaluate_vm(mod, ctx.device_type, ctx.device_id, cargs)
38 changes: 27 additions & 11 deletions src/relay/vm/vm.cc
@@ -4,14 +4,18 @@
* \brief Abstract device memory management API
*/

#include <tvm/runtime/memory_manager.h>
#include <tvm/relay/expr_functor.h>
#include <tvm/relay/vm/vm.h>
#include <tvm/relay/interpreter.h>
#include "../backend/compile_engine.h"
#include "../../runtime/naive_allocator.h"

#include <vector>
#include <iostream>

using namespace tvm::runtime;

namespace tvm {
namespace relay {
namespace vm {
@@ -254,7 +258,7 @@ struct VMCompiler : ExprFunctor<void(const Expr& expr)> {
auto it = this->context->global_map.find(global);
CHECK(it != this->context->global_map.end());
CHECK(it->second < 5);
std::cout << "Invoke with: " << it->second;
std::cout << "Invoke with: " << global->name_hint << "(func idx" << it->second << ")" << std::endl;
Emit(Invoke(it->second));
}

@@ -474,6 +478,8 @@ void VirtualMachine::InvokeGlobal(const VMFunction& func, const std::vector<VMOb
VMObject VirtualMachine::Invoke(const VMFunction& func, const std::vector<VMObject>& args) {
InvokeGlobal(func, args);
Run();
auto alloc = MemoryManager::Global()->GetAllocator(ctxs[0]);
std::cout << "Memory used: " << alloc->UsedMemory() << " B\n";
// std::cout << "final stack size: " << stack.size() << "bp: " << bp << std::endl;
return stack.back();
}
@@ -504,6 +510,10 @@ void InvokePacked(const PackedFunc& func, size_t arg_count, std::vector<VMObject
stack.resize(stack.size() - arg_count + 1);
}

void VirtualMachine::Init(const std::vector<TVMContext>& ctxs) {
this->ctxs = ctxs;
}

static int trip_counter = 0;

void VirtualMachine::Run() {
@@ -573,12 +583,10 @@ void VirtualMachine::Run() {
}
case Opcode::AllocTensor: {
const auto& ti = instr.tensor_info;
DLContext ctx;
ctx.device_type = DLDeviceType::kDLCPU;
ctx.device_id = 0;
auto shape = std::vector<int64_t>(ti.ndim);
shape.assign(ti.shape, ti.shape + ti.ndim);
auto data = NDArray::Empty(shape, ti.dtype, ctx);
auto allocator = MemoryManager::Global()->GetAllocator(ctxs[0]);
auto data = NDArray::Empty(shape, ti.dtype, ctxs[0], allocator);
stack.push_back(VMTensor(data));
pc++;
goto main_loop;
@@ -607,8 +615,11 @@
}
}

VirtualMachine VirtualMachine::FromModule(const Module& module) {
return CompileModule(module);
VirtualMachine VirtualMachine::FromModule(const Module& module,
const std::vector<TVMContext>& ctxs) {
auto vm = CompileModule(module);
vm.Init(ctxs);
return vm;
}

/*! \brief Convert from an array of relay.Value into VM compatible objects.
Expand Down Expand Up @@ -648,8 +659,9 @@ Value ConvertVMToValue(VMObject obj) {
}
}

VMObject EvaluateModule(const Module& module, const std::vector<VMObject>& vm_args) {
VirtualMachine vm = VirtualMachine::FromModule(module);
VMObject EvaluateModule(const Module& module, const std::vector<TVMContext> ctxs,
const std::vector<VMObject>& vm_args) {
VirtualMachine vm = VirtualMachine::FromModule(module, ctxs);
std::cout << "--------------------------" << std::endl;
VMFunctionPrint(vm.functions[0]);
std::cout << "--------------------------" << std::endl;
@@ -659,6 +671,10 @@ VMObject EvaluateModule(const Module& module, const std::vector<VMObject>& vm_ar
TVM_REGISTER_API("relay._vm._evaluate_vm")
.set_body([](TVMArgs args, TVMRetValue* ret) {
NodeRef to_compile = args[0];
TVMContext ctx;
int dev_type = args[1];
ctx.device_type = static_cast<DLDeviceType>(dev_type);
ctx.device_id = args[2];

Module module;
if (to_compile.as<FunctionNode>()) {
@@ -670,8 +686,8 @@ TVM_REGISTER_API("relay._vm._evaluate_vm")
LOG(FATAL) << "expected function or module";
}

std::vector<VMObject> vm_args = ConvertArgsToVM(args[1]);
auto result = EvaluateModule(module, vm_args);
std::vector<VMObject> vm_args = ConvertArgsToVM(args[3]);
auto result = EvaluateModule(module, {ctx}, vm_args);
*ret = ConvertVMToValue(result);
});

26 changes: 26 additions & 0 deletions src/runtime/memory_manager.cc
@@ -0,0 +1,26 @@
#include <tvm/runtime/memory_manager.h>
#include "naive_allocator.h"
#include "pooled_allocator.h"

namespace tvm {
namespace runtime {

MemoryManager* MemoryManager::Global() {
static MemoryManager memory_manager;
return &memory_manager;
}

Allocator* MemoryManager::GetAllocator(TVMContext ctx) {
std::lock_guard<std::mutex> lock(mu_);
if (allocators_.find(ctx) == allocators_.end()) {
LOG(INFO) << "New allocator for " << DeviceName(ctx.device_type) << "("
<< ctx.device_id << ")";
std::unique_ptr<Allocator> alloc(new NaiveAllocator(ctx));
// std::unique_ptr<Allocator> alloc(new PooledAllocator(ctx, 128));
allocators_.emplace(ctx, std::move(alloc));
}
return allocators_.at(ctx).get();
}

} // namespace runtime
} // namespace tvm
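GetAllocator currently hard-codes NaiveAllocator; the commented-out line points at a PooledAllocator(ctx, 128) alternative whose header is included above but whose diff is not shown on this page. Purely as an illustration of what a page-based pool behind the same Allocator interface could look like (an editor's sketch, not the commit's actual pooled_allocator.h):

    // Illustrative sketch only -- the real pooled_allocator.h is not shown here.
    class PooledAllocator final : public Allocator {
     public:
      explicit PooledAllocator(TVMContext ctx, size_t page_size)
          : Allocator(ctx), page_size_(page_size), used_memory_(0) {}

      Buffer Alloc(size_t nbytes, size_t alignment, TVMType type_hint) override {
        // Round up to whole pages so freed buffers can be recycled by size.
        size_t size = ((nbytes + page_size_ - 1) / page_size_) * page_size_;
        auto& pool = free_list_[size];
        if (!pool.empty()) {
          Buffer buf = pool.back();
          pool.pop_back();
          return buf;
        }
        Buffer buf;
        buf.ctx = ctx_;
        buf.size = size;
        buf.data = DeviceAPI::Get(ctx_)->AllocDataSpace(ctx_, size, alignment, type_hint);
        used_memory_.fetch_add(size, std::memory_order_relaxed);
        return buf;
      }

      void Free(const Buffer& buffer) override {
        // Recycle into the pool; memory is only returned to the device at teardown.
        free_list_[buffer.size].push_back(buffer);
      }

      size_t UsedMemory() override {
        return used_memory_.load(std::memory_order_relaxed);
      }

     private:
      size_t page_size_;
      std::atomic<size_t> used_memory_;
      std::unordered_map<size_t, std::vector<Buffer>> free_list_;
    };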
47 changes: 47 additions & 0 deletions src/runtime/naive_allocator.h
@@ -0,0 +1,47 @@
/*!
* Copyright (c) 2019 by Contributors
* \file runtime/naive_allocator.h
*/
#ifndef TVM_RUNTIME_NAIVE_ALLOCATOR_H_
#define TVM_RUNTIME_NAIVE_ALLOCATOR_H_

#include <atomic>
#include <tvm/runtime/device_api.h>
#include <tvm/runtime/memory_manager.h>

namespace tvm {
namespace runtime {

class NaiveAllocator final : public Allocator {
public:
explicit NaiveAllocator(TVMContext ctx) : Allocator(ctx), used_memory_(0) {}

Buffer Alloc(size_t nbytes, size_t alignment, TVMType type_hint) override {
Buffer buf;
buf.ctx = ctx_;
buf.size = nbytes;
buf.data = DeviceAPI::Get(ctx_)->AllocDataSpace(
ctx_, nbytes, alignment, type_hint);
used_memory_.fetch_add(nbytes, std::memory_order_relaxed);
LOG(INFO) << "allocate " << nbytes << " B, used memory " << used_memory_ << " B";
return buf;
}

void Free(const Buffer& buffer) override {
DeviceAPI::Get(ctx_)->FreeDataSpace(buffer.ctx, buffer.data);
used_memory_.fetch_sub(buffer.size, std::memory_order_relaxed);
LOG(INFO) << "free " << buffer.size << " B, used memory " << used_memory_ << " B";
}

size_t UsedMemory() override {
return used_memory_.load(std::memory_order_relaxed);
}

private:
std::atomic<size_t> used_memory_;
};

} // namespace runtime
} // namespace tvm

#endif // TVM_RUNTIME_NAIVE_ALLOCATOR_H_
(Diffs for the remaining changed files are not shown on this page.)
