Skip to content

Commit

Permalink
[Runtime] Special Memory Scope Support (#7488)
Browse files Browse the repository at this point in the history
  • Loading branch information
ZihengJiang authored Feb 28, 2021
1 parent 485dfd6 commit 2673309
Show file tree
Hide file tree
Showing 24 changed files with 577 additions and 387 deletions.
35 changes: 22 additions & 13 deletions include/tvm/runtime/c_runtime_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -559,6 +559,23 @@ TVM_DLL int TVMByteArrayFree(TVMByteArray* arr);
TVM_DLL int TVMDeviceAllocDataSpace(DLContext ctx, size_t nbytes, size_t alignment,
DLDataType type_hint, void** out_data);

/*!
* \brief Allocate a data space on device with special memory scope.
* \note The memory could use a special multi-dimensional memory layout.
* That is why we pass shape and dtype instead of raw number of bytes.
* \param ctx The device context to perform operation.
* \param ndim The number of dimension of the tensor.
* \param shape The shape of the tensor.
* \param dtype The type of elements.
* \param mem_scope The memory scope of the tensor,
 *        can be nullptr, which indicates the default global DRAM
* \param out_data The allocated device pointer.
* \return 0 when success, -1 when failure happens
*/
TVM_DLL int TVMDeviceAllocDataSpaceWithScope(DLContext ctx, int ndim, const int64_t* shape,
DLDataType dtype, const char* mem_scope,
void** out_data);

/*!
* \brief Free a data space on device.
* \param ctx The device context to perform operation.
Expand All @@ -569,22 +586,14 @@ TVM_DLL int TVMDeviceFreeDataSpace(TVMContext ctx, void* ptr);

/*!
* \brief Copy data from one place to another.
* \param from The source array.
* \param from_offset The byte offeset in the from.
* \param to The target array.
* \param to_offset The byte offset in the to.
* \param num_bytes The size of the memory in bytes
* \param ctx_from The source context
* \param ctx_to The target context
* \param type_hint The type of elements, only neded by certain backends.
* can be useful for cross device endian converison.
* \note This API is designed to support special memory with shape dependent layout.
* We pass in DLTensor* with shape information to support these cases.
* \param from The source tensor.
* \param to The target tensor.
* \param stream Optional stream object.
* \return 0 when success, -1 when failure happens.
*/
TVM_DLL int TVMDeviceCopyDataFromTo(const void* from, size_t from_offset, void* to,
size_t to_offset, size_t num_bytes, TVMContext ctx_from,
TVMContext ctx_to, DLDataType type_hint,
TVMStreamHandle stream);
TVM_DLL int TVMDeviceCopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream);

/*!
* \brief Check that an object is derived from another.
Expand Down
42 changes: 32 additions & 10 deletions include/tvm/runtime/device_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,17 @@ class TVM_DLL DeviceAPI {
*/
virtual void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment,
DLDataType type_hint) = 0;
/*!
* \brief Allocate a data space on device with memory scope support.
* \param ctx The device context to perform operation.
* \param ndim The number of dimension of allocated tensor.
* \param shape The shape of allocated tensor.
* \param dtype The type of elements.
* \param mem_scope The memory scope of allocated tensor.
* \return The allocated device pointer.
*/
virtual void* AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape, DLDataType dtype,
Optional<String> mem_scope = NullOpt);
/*!
* \brief Free a data space on device.
* \param ctx The device context to perform operation.
Expand All @@ -98,20 +109,13 @@ class TVM_DLL DeviceAPI {
virtual void FreeDataSpace(TVMContext ctx, void* ptr) = 0;
/*!
* \brief copy data from one place to another
* \note This API is designed to support special memory with shape dependent layout.
* We pass in DLTensor* with shape information to support these cases.
* \param from The source array.
* \param from_offset The byte offeset in the from.
* \param to The target array.
* \param to_offset The byte offset in the to.
* \param num_bytes The size of the memory in bytes
* \param ctx_from The source context
* \param ctx_to The target context
* \param type_hint The type of elements, only neded by certain backends.
* can be useful for cross device endian converison.
* \param stream Optional stream object.
*/
virtual void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
DLDataType type_hint, TVMStreamHandle stream) = 0;
virtual void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream);
/*!
* \brief Create a new stream of execution.
*
Expand Down Expand Up @@ -194,6 +198,24 @@ class TVM_DLL DeviceAPI {
static bool NeedSetDeviceContext(int device_type) {
return device_type != kDLCPU && device_type != kDLMicroDev;
}

protected:
/*!
* \brief copy data from one place to another
* \param from The source array.
 * \param from_offset The byte offset in the from.
* \param to The target array.
* \param to_offset The byte offset in the to.
* \param num_bytes The size of the memory in bytes
* \param ctx_from The source context
* \param ctx_to The target context
 * \param type_hint The type of elements, only needed by certain backends.
 *                  can be useful for cross device endian conversion.
* \param stream Optional stream object.
*/
virtual void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
DLDataType type_hint, TVMStreamHandle stream);
};

/*! \brief The device type bigger than this is RPC device */
Expand Down
7 changes: 5 additions & 2 deletions include/tvm/runtime/ndarray.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#define TVM_RUNTIME_NDARRAY_H_

#include <tvm/runtime/c_runtime_api.h>
#include <tvm/runtime/container.h>
#include <tvm/runtime/data_type.h>
#include <tvm/runtime/object.h>
#include <tvm/runtime/serializer.h>
Expand Down Expand Up @@ -133,10 +134,12 @@ class NDArray : public ObjectRef {
* \brief Create an empty NDArray.
* \param shape The shape of the new array.
* \param dtype The data type of the new array.
* \param ctx The context of the Array.
* \param ctx The context of the array.
* \param mem_scope The memory scope of the array.
* \return The created Array
*/
TVM_DLL static NDArray Empty(std::vector<int64_t> shape, DLDataType dtype, DLContext ctx);
TVM_DLL static NDArray Empty(std::vector<int64_t> shape, DLDataType dtype, DLContext ctx,
Optional<String> mem_scope = NullOpt);
/*!
* \brief Create a NDArray backed by a dlpack tensor.
*
Expand Down
38 changes: 19 additions & 19 deletions python/tvm/runtime/ndarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from tvm._ffi.base import _LIB, check_call, c_array, string_types, _FFI_MODE
from tvm._ffi.runtime_ctypes import DataType, TVMContext, TVMArray, TVMArrayHandle
from tvm._ffi.runtime_ctypes import DataTypeCode, tvm_shape_index_t
from . import _ffi_api

try:
# pylint: disable=wrong-import-position
Expand Down Expand Up @@ -253,42 +254,41 @@ def numpyasarray(np_data):
return arr, shape


def empty(shape, dtype="float32", ctx=context(1, 0)):
def empty(shape, dtype="float32", ctx=context(1, 0), mem_scope=None):
"""Create an empty array given shape and device
Parameters
----------
shape : tuple of int
The shape of the array
The shape of the array.
dtype : type or str
The data type of the array.
ctx : TVMContext
The context of the array
The context of the array.
mem_scope : Optional[str]
The memory scope of the array.
Returns
-------
arr : tvm.nd.NDArray
The array tvm supported.
"""
shape = c_array(tvm_shape_index_t, shape)
ndim = ctypes.c_int(len(shape))
handle = TVMArrayHandle()
shape_imm = []
for s in shape:
if isinstance(s, tvm.tir.IntImm):
shape_imm.append(s.value)
else:
shape_imm.append(int(s))
arr = np.array(shape_imm, "int64")
ptr = arr.ctypes.data_as(ctypes.POINTER(ctypes.c_int64))
shape_ptr = ctypes.cast(ptr, ctypes.c_void_p)
ndim = len(shape_imm)
dtype = DataType(dtype)
check_call(
_LIB.TVMArrayAlloc(
shape,
ndim,
ctypes.c_int(dtype.type_code),
ctypes.c_int(dtype.bits),
ctypes.c_int(dtype.lanes),
ctx.device_type,
ctx.device_id,
ctypes.byref(handle),
)
)
return _make_array(handle, False, False)
arr = _ffi_api.TVMArrayAllocWithScope(shape_ptr, ndim, dtype, ctx, mem_scope)
return arr


def from_dlpack(dltensor):
Expand Down
64 changes: 59 additions & 5 deletions src/runtime/c_runtime_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,50 @@ void* DeviceAPI::AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hin
return AllocDataSpace(ctx, size, kTempAllocaAlignment, type_hint);
}

// Compute the allocation alignment for a tensor element type: the element
// size in bytes, clamped below by the runtime's minimum allocation alignment.
static size_t GetDataAlignment(const DLDataType dtype) {
  const size_t elem_bytes = (dtype.bits / 8) * dtype.lanes;
  return elem_bytes < kAllocAlignment ? kAllocAlignment : elem_bytes;
}

// Default implementation of the scope-aware allocator. Only the default
// ("global") scope is handled generically, by lowering onto the flat
// nbytes/alignment allocator; devices with special scopes must override this.
void* DeviceAPI::AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape, DLDataType dtype,
                                Optional<String> mem_scope) {
  if (mem_scope.defined() && mem_scope.value() != "global") {
    LOG(FATAL) << "Device does not support allocate data space with "
               << "specified memory scope: " << mem_scope.value();
    return nullptr;
  }
  // Assemble a temporary tensor descriptor so GetDataSize can compute the
  // flat size from shape and dtype.
  DLTensor temp;
  temp.data = nullptr;
  temp.ctx = ctx;
  temp.ndim = ndim;
  temp.dtype = dtype;
  temp.shape = const_cast<int64_t*>(shape);
  temp.strides = nullptr;
  temp.byte_offset = 0;
  const size_t nbytes = GetDataSize(temp);
  return AllocDataSpace(ctx, nbytes, GetDataAlignment(temp.dtype), dtype);
}

// Default tensor-to-tensor copy: lower onto the flat byte-range copy hook.
// Requires both tensors to be densely packed and of equal byte size.
void DeviceAPI::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) {
  const size_t num_bytes = GetDataSize(*from);
  ICHECK_EQ(num_bytes, GetDataSize(*to));

  ICHECK(IsContiguous(*from) && IsContiguous(*to))
      << "CopyDataFromTo only support contiguous array for now";
  // Forward to the protected flat-memory overload with explicit offsets.
  CopyDataFromTo(from->data, from->byte_offset, to->data, to->byte_offset, num_bytes, from->ctx,
                 to->ctx, from->dtype, stream);
}

// Base implementation of the flat-memory copy hook that the public DLTensor*
// overload lowers onto. Concrete backends override it (e.g. the CPU backend's
// memcpy version); a device whose API reaches here has no copy support.
void DeviceAPI::CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
                               size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
                               DLDataType type_hint, TVMStreamHandle stream) {
  LOG(FATAL) << "Device does not support CopyDataFromTo.";
}

void DeviceAPI::FreeWorkspace(TVMContext ctx, void* ptr) { FreeDataSpace(ctx, ptr); }

TVMStreamHandle DeviceAPI::CreateStream(TVMContext ctx) {
Expand Down Expand Up @@ -553,19 +597,29 @@ int TVMDeviceAllocDataSpace(DLContext ctx, size_t nbytes, size_t alignment, DLDa
API_END();
}

// C API entry: allocate a data space with an optional memory scope, routed
// through the DeviceAPI registered for the context's device type.
int TVMDeviceAllocDataSpaceWithScope(DLContext ctx, int ndim, const int64_t* shape,
                                     DLDataType dtype, const char* mem_scope, void** out_data) {
  API_BEGIN();
  // A NULL mem_scope denotes the default scope; leave the Optional empty.
  Optional<String> scope;
  if (mem_scope != nullptr) {
    scope = String(std::string(mem_scope));
  }
  *out_data = DeviceAPIManager::Get(ctx)->AllocDataSpace(ctx, ndim, shape, dtype, scope);
  API_END();
}

// C API entry: free a device data space through the registered DeviceAPI.
int TVMDeviceFreeDataSpace(DLContext ctx, void* ptr) {
  API_BEGIN();
  DeviceAPIManager::Get(ctx)->FreeDataSpace(ctx, ptr);
  API_END();
}

int TVMDeviceCopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
DLDataType type_hint, TVMStreamHandle stream) {
// C API entry: copy one tensor's contents to another. The diff rendering
// fused the removed 9-argument dispatch with the new tensor-based one; only
// the tensor-based call belongs in the body.
int TVMDeviceCopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) {
  API_BEGIN();
  TVMContext ctx_from = from->ctx;
  TVMContext ctx_to = to->ctx;
  // Dispatch via whichever endpoint is a device (non-CPU) context, falling
  // back to the destination when the source lives on the CPU.
  TVMContext ctx = ctx_from.device_type != kDLCPU ? ctx_from : ctx_to;
  DeviceAPIManager::Get(ctx)->CopyDataFromTo(from, to, stream);
  API_END();
}

Expand Down
13 changes: 7 additions & 6 deletions src/runtime/cpu_device_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -69,12 +69,6 @@ class CPUDeviceAPI final : public DeviceAPI {
#endif
}

void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
TVMStreamHandle stream) final {
memcpy(static_cast<char*>(to) + to_offset, static_cast<const char*>(from) + from_offset, size);
}

void StreamSync(TVMContext ctx, TVMStreamHandle stream) final {}

void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final;
Expand All @@ -86,6 +80,13 @@ class CPUDeviceAPI final : public DeviceAPI {
static auto* inst = new CPUDeviceAPI();
return inst;
}

protected:
// Flat host-to-host copy: both endpoints are in CPU memory, so the copy is a
// single memcpy honoring the byte offsets. type_hint/contexts are unused here.
void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
                    TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
                    TVMStreamHandle stream) final {
  memcpy(static_cast<char*>(to) + to_offset, static_cast<const char*>(from) + from_offset, size);
}
};

struct CPUWorkspacePool : public WorkspacePool {
Expand Down
39 changes: 34 additions & 5 deletions src/runtime/crt/common/crt_runtime_api.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <assert.h>
#include <inttypes.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
Expand Down Expand Up @@ -87,16 +88,44 @@ int TVMDeviceAllocDataSpace(DLContext ctx, size_t nbytes, size_t alignment, DLDa
if (alignment != 1) {
nbytes = (nbytes + alignment - 1) / alignment * alignment;
}

return TVMPlatformMemoryAllocate(nbytes, ctx, out_data);
}

// Allocate a tensor data space with an optional memory scope. The CRT only
// supports the default flat scope; per c_runtime_api.h, a NULL mem_scope
// denotes the default ("global") scope. Previously any special scope was
// silently ignored and flat memory handed back; now it is rejected, matching
// the C++ runtime, which refuses unknown scopes.
// Returns 0 on success, -1 on failure.
int TVMDeviceAllocDataSpaceWithScope(DLContext ctx, int ndim, const int64_t* shape,
                                     DLDataType dtype, const char* mem_scope, void** out_data) {
  if (mem_scope != NULL && strcmp(mem_scope, "global") != 0) {
    return -1;
  }
  // Flat size: number of elements times the rounded-up element size in bytes.
  size_t nbytes = 1;
  for (int i = 0; i < ndim; ++i) {
    nbytes *= shape[i];
  }
  nbytes *= (dtype.bits * dtype.lanes + 7) / 8;

  // Align to the element size, but never below the minimum alignment used by
  // the flat allocator.
  const size_t kAllocAlignment = 128;
  size_t align = (dtype.bits / 8) * dtype.lanes;
  if (align < kAllocAlignment) align = kAllocAlignment;
  return TVMDeviceAllocDataSpace(ctx, nbytes, align, dtype, out_data);
}

int TVMDeviceFreeDataSpace(TVMContext ctx, void* ptr) { return TVMPlatformMemoryFree(ptr, ctx); }

int TVMDeviceCopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
DLDataType type_hint, TVMStreamHandle stream) {
memcpy(((uint8_t*)to) + to_offset, ((uint8_t*)from) + from_offset, num_bytes);
// Return true when the tensor's memory is densely packed (no gaps between
// elements). A NULL strides array means the canonical compact layout.
static bool IsContiguous(const DLTensor* arr) {
  if (arr->strides == NULL) return true;
  // Walk from innermost to outermost dimension, tracking the stride a
  // compact layout would have at each step.
  int64_t compact_stride = 1;
  for (int32_t k = arr->ndim - 1; k >= 0; --k) {
    if (arr->strides[k] != compact_stride) return false;
    compact_stride *= arr->shape[k];
  }
  return true;
}

// Copy the contents of one tensor to another with a plain memcpy between the
// two contiguous buffers. Previously the byte count was derived from `from`
// alone; now the destination size is computed too and checked for equality
// (mirroring the ICHECK_EQ in the C++ runtime's CopyDataFromTo) so a
// mismatched destination cannot be silently overrun.
int TVMDeviceCopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) {
  assert(IsContiguous(from) && IsContiguous(to));

  // Byte size of the source: element count times rounded-up element bytes.
  size_t size = 1;
  for (int i = 0; i < from->ndim; ++i) {
    size *= from->shape[i];
  }
  size *= (from->dtype.bits * from->dtype.lanes + 7) / 8;

  // Byte size of the destination, computed the same way.
  size_t to_size = 1;
  for (int i = 0; i < to->ndim; ++i) {
    to_size *= to->shape[i];
  }
  to_size *= (to->dtype.bits * to->dtype.lanes + 7) / 8;
  assert(size == to_size);

  memcpy(((uint8_t*)to->data) + to->byte_offset, ((uint8_t*)from->data) + from->byte_offset, size);
  return 0;
}

Expand Down
2 changes: 2 additions & 0 deletions src/runtime/cuda/cuda_device_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ class CUDADeviceAPI final : public DeviceAPI {
}
}

protected:
void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
TVMStreamHandle stream) final {
Expand Down Expand Up @@ -166,6 +167,7 @@ class CUDADeviceAPI final : public DeviceAPI {
}
}

public:
TVMStreamHandle CreateStream(TVMContext ctx) {
CUDA_CALL(cudaSetDevice(ctx.device_id));
cudaStream_t retval;
Expand Down
Loading

0 comments on commit 2673309

Please sign in to comment.