Skip to content

Commit

Permalink
[Runtime] Special Memory Scope Support (#7488)
Browse files Browse the repository at this point in the history
  • Loading branch information
ZihengJiang authored Feb 28, 2021
1 parent 485dfd6 commit 2673309
Show file tree
Hide file tree
Showing 24 changed files with 577 additions and 387 deletions.
35 changes: 22 additions & 13 deletions include/tvm/runtime/c_runtime_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -559,6 +559,23 @@ TVM_DLL int TVMByteArrayFree(TVMByteArray* arr);
TVM_DLL int TVMDeviceAllocDataSpace(DLContext ctx, size_t nbytes, size_t alignment,
DLDataType type_hint, void** out_data);

/*!
* \brief Allocate a data space on device with special memory scope.
* \note The memory could use a special multi-dimensional memory layout.
* That is why we pass shape and dtype instead of raw number of bytes.
* \param ctx The device context to perform operation.
* \param ndim The number of dimension of the tensor.
* \param shape The shape of the tensor.
* \param dtype The type of elements.
* \param mem_scope The memory scope of the tensor,
 *        can be nullptr, which indicates the default global DRAM
* \param out_data The allocated device pointer.
* \return 0 when success, -1 when failure happens
*/
TVM_DLL int TVMDeviceAllocDataSpaceWithScope(DLContext ctx, int ndim, const int64_t* shape,
DLDataType dtype, const char* mem_scope,
void** out_data);

/*!
* \brief Free a data space on device.
* \param ctx The device context to perform operation.
Expand All @@ -569,22 +586,14 @@ TVM_DLL int TVMDeviceFreeDataSpace(TVMContext ctx, void* ptr);

/*!
* \brief Copy data from one place to another.
* \param from The source array.
* \param from_offset The byte offeset in the from.
* \param to The target array.
* \param to_offset The byte offset in the to.
* \param num_bytes The size of the memory in bytes
* \param ctx_from The source context
* \param ctx_to The target context
* \param type_hint The type of elements, only neded by certain backends.
* can be useful for cross device endian converison.
* \note This API is designed to support special memory with shape dependent layout.
* We pass in DLTensor* with shape information to support these cases.
* \param from The source tensor.
* \param to The target tensor.
* \param stream Optional stream object.
* \return 0 when success, -1 when failure happens.
*/
TVM_DLL int TVMDeviceCopyDataFromTo(const void* from, size_t from_offset, void* to,
size_t to_offset, size_t num_bytes, TVMContext ctx_from,
TVMContext ctx_to, DLDataType type_hint,
TVMStreamHandle stream);
TVM_DLL int TVMDeviceCopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream);

/*!
* \brief Check that an object is derived from another.
Expand Down
42 changes: 32 additions & 10 deletions include/tvm/runtime/device_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,17 @@ class TVM_DLL DeviceAPI {
*/
virtual void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment,
DLDataType type_hint) = 0;
/*!
* \brief Allocate a data space on device with memory scope support.
* \param ctx The device context to perform operation.
* \param ndim The number of dimension of allocated tensor.
* \param shape The shape of allocated tensor.
* \param dtype The type of elements.
* \param mem_scope The memory scope of allocated tensor.
* \return The allocated device pointer.
*/
virtual void* AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape, DLDataType dtype,
Optional<String> mem_scope = NullOpt);
/*!
* \brief Free a data space on device.
* \param ctx The device context to perform operation.
Expand All @@ -98,20 +109,13 @@ class TVM_DLL DeviceAPI {
virtual void FreeDataSpace(TVMContext ctx, void* ptr) = 0;
/*!
* \brief copy data from one place to another
* \note This API is designed to support special memory with shape dependent layout.
* We pass in DLTensor* with shape information to support these cases.
* \param from The source array.
* \param from_offset The byte offeset in the from.
* \param to The target array.
* \param to_offset The byte offset in the to.
* \param num_bytes The size of the memory in bytes
* \param ctx_from The source context
* \param ctx_to The target context
* \param type_hint The type of elements, only neded by certain backends.
* can be useful for cross device endian converison.
* \param stream Optional stream object.
*/
virtual void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
DLDataType type_hint, TVMStreamHandle stream) = 0;
virtual void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream);
/*!
* \brief Create a new stream of execution.
*
Expand Down Expand Up @@ -194,6 +198,24 @@ class TVM_DLL DeviceAPI {
static bool NeedSetDeviceContext(int device_type) {
return device_type != kDLCPU && device_type != kDLMicroDev;
}

protected:
/*!
* \brief copy data from one place to another
* \param from The source array.
 * \param from_offset The byte offset in the from.
* \param to The target array.
* \param to_offset The byte offset in the to.
* \param num_bytes The size of the memory in bytes
* \param ctx_from The source context
* \param ctx_to The target context
 * \param type_hint The type of elements, only needed by certain backends.
 *                  can be useful for cross device endian conversion.
* \param stream Optional stream object.
*/
virtual void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
DLDataType type_hint, TVMStreamHandle stream);
};

/*! \brief The device type bigger than this is RPC device */
Expand Down
7 changes: 5 additions & 2 deletions include/tvm/runtime/ndarray.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#define TVM_RUNTIME_NDARRAY_H_

#include <tvm/runtime/c_runtime_api.h>
#include <tvm/runtime/container.h>
#include <tvm/runtime/data_type.h>
#include <tvm/runtime/object.h>
#include <tvm/runtime/serializer.h>
Expand Down Expand Up @@ -133,10 +134,12 @@ class NDArray : public ObjectRef {
* \brief Create an empty NDArray.
* \param shape The shape of the new array.
* \param dtype The data type of the new array.
* \param ctx The context of the Array.
* \param ctx The context of the array.
* \param mem_scope The memory scope of the array.
* \return The created Array
*/
TVM_DLL static NDArray Empty(std::vector<int64_t> shape, DLDataType dtype, DLContext ctx);
TVM_DLL static NDArray Empty(std::vector<int64_t> shape, DLDataType dtype, DLContext ctx,
Optional<String> mem_scope = NullOpt);
/*!
* \brief Create a NDArray backed by a dlpack tensor.
*
Expand Down
38 changes: 19 additions & 19 deletions python/tvm/runtime/ndarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from tvm._ffi.base import _LIB, check_call, c_array, string_types, _FFI_MODE
from tvm._ffi.runtime_ctypes import DataType, TVMContext, TVMArray, TVMArrayHandle
from tvm._ffi.runtime_ctypes import DataTypeCode, tvm_shape_index_t
from . import _ffi_api

try:
# pylint: disable=wrong-import-position
Expand Down Expand Up @@ -253,42 +254,41 @@ def numpyasarray(np_data):
return arr, shape


def empty(shape, dtype="float32", ctx=context(1, 0)):
def empty(shape, dtype="float32", ctx=context(1, 0), mem_scope=None):
"""Create an empty array given shape and device
Parameters
----------
shape : tuple of int
The shape of the array
The shape of the array.
dtype : type or str
The data type of the array.
ctx : TVMContext
The context of the array
The context of the array.
mem_scope : Optional[str]
The memory scope of the array.
Returns
-------
arr : tvm.nd.NDArray
The array tvm supported.
"""
shape = c_array(tvm_shape_index_t, shape)
ndim = ctypes.c_int(len(shape))
handle = TVMArrayHandle()
shape_imm = []
for s in shape:
if isinstance(s, tvm.tir.IntImm):
shape_imm.append(s.value)
else:
shape_imm.append(int(s))
arr = np.array(shape_imm, "int64")
ptr = arr.ctypes.data_as(ctypes.POINTER(ctypes.c_int64))
shape_ptr = ctypes.cast(ptr, ctypes.c_void_p)
ndim = len(shape_imm)
dtype = DataType(dtype)
check_call(
_LIB.TVMArrayAlloc(
shape,
ndim,
ctypes.c_int(dtype.type_code),
ctypes.c_int(dtype.bits),
ctypes.c_int(dtype.lanes),
ctx.device_type,
ctx.device_id,
ctypes.byref(handle),
)
)
return _make_array(handle, False, False)
arr = _ffi_api.TVMArrayAllocWithScope(shape_ptr, ndim, dtype, ctx, mem_scope)
return arr


def from_dlpack(dltensor):
Expand Down
64 changes: 59 additions & 5 deletions src/runtime/c_runtime_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,50 @@ void* DeviceAPI::AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hin
return AllocDataSpace(ctx, size, kTempAllocaAlignment, type_hint);
}

// Compute the allocation alignment for a tensor element type: the element
// size in bytes, clamped below by the runtime's minimum allocation alignment.
static size_t GetDataAlignment(const DLDataType dtype) {
  const size_t elem_bytes = (dtype.bits / 8) * dtype.lanes;
  return elem_bytes < kAllocAlignment ? kAllocAlignment : elem_bytes;
}

// Default implementation of the scope-aware allocator. Only the default
// ("global") scope is handled generically, by lowering onto the flat
// nbytes/alignment allocator; devices with special scopes must override this.
void* DeviceAPI::AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape, DLDataType dtype,
                                Optional<String> mem_scope) {
  if (mem_scope.defined() && mem_scope.value() != "global") {
    LOG(FATAL) << "Device does not support allocate data space with "
               << "specified memory scope: " << mem_scope.value();
    return nullptr;
  }
  // Assemble a temporary tensor descriptor so GetDataSize can compute the
  // flat size from shape and dtype.
  DLTensor temp;
  temp.data = nullptr;
  temp.ctx = ctx;
  temp.ndim = ndim;
  temp.dtype = dtype;
  temp.shape = const_cast<int64_t*>(shape);
  temp.strides = nullptr;
  temp.byte_offset = 0;
  const size_t nbytes = GetDataSize(temp);
  return AllocDataSpace(ctx, nbytes, GetDataAlignment(temp.dtype), dtype);
}

// Default tensor-to-tensor copy: lower onto the flat byte-range copy hook.
// Requires both tensors to be densely packed and of equal byte size.
void DeviceAPI::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) {
  const size_t num_bytes = GetDataSize(*from);
  ICHECK_EQ(num_bytes, GetDataSize(*to));

  ICHECK(IsContiguous(*from) && IsContiguous(*to))
      << "CopyDataFromTo only support contiguous array for now";
  // Forward to the protected flat-memory overload with explicit offsets.
  CopyDataFromTo(from->data, from->byte_offset, to->data, to->byte_offset, num_bytes, from->ctx,
                 to->ctx, from->dtype, stream);
}

// Base implementation of the flat-memory copy hook that the public DLTensor*
// overload lowers onto. Concrete backends override it (e.g. the CPU backend's
// memcpy version); a device whose API reaches here has no copy support.
void DeviceAPI::CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
                               size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
                               DLDataType type_hint, TVMStreamHandle stream) {
  LOG(FATAL) << "Device does not support CopyDataFromTo.";
}

void DeviceAPI::FreeWorkspace(TVMContext ctx, void* ptr) { FreeDataSpace(ctx, ptr); }

TVMStreamHandle DeviceAPI::CreateStream(TVMContext ctx) {
Expand Down Expand Up @@ -553,19 +597,29 @@ int TVMDeviceAllocDataSpace(DLContext ctx, size_t nbytes, size_t alignment, DLDa
API_END();
}

// C API entry: allocate a data space with an optional memory scope, routed
// through the DeviceAPI registered for the context's device type.
int TVMDeviceAllocDataSpaceWithScope(DLContext ctx, int ndim, const int64_t* shape,
                                     DLDataType dtype, const char* mem_scope, void** out_data) {
  API_BEGIN();
  // A NULL mem_scope denotes the default scope; leave the Optional empty.
  Optional<String> scope;
  if (mem_scope != nullptr) {
    scope = String(std::string(mem_scope));
  }
  *out_data = DeviceAPIManager::Get(ctx)->AllocDataSpace(ctx, ndim, shape, dtype, scope);
  API_END();
}

// C API entry: free a device data space through the registered DeviceAPI.
int TVMDeviceFreeDataSpace(DLContext ctx, void* ptr) {
  API_BEGIN();
  DeviceAPIManager::Get(ctx)->FreeDataSpace(ctx, ptr);
  API_END();
}

int TVMDeviceCopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
DLDataType type_hint, TVMStreamHandle stream) {
// C API entry: copy one tensor's contents to another. The diff rendering
// fused the removed 9-argument dispatch with the new tensor-based one; only
// the tensor-based call belongs in the body.
int TVMDeviceCopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) {
  API_BEGIN();
  TVMContext ctx_from = from->ctx;
  TVMContext ctx_to = to->ctx;
  // Dispatch via whichever endpoint is a device (non-CPU) context, falling
  // back to the destination when the source lives on the CPU.
  TVMContext ctx = ctx_from.device_type != kDLCPU ? ctx_from : ctx_to;
  DeviceAPIManager::Get(ctx)->CopyDataFromTo(from, to, stream);
  API_END();
}

Expand Down
13 changes: 7 additions & 6 deletions src/runtime/cpu_device_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -69,12 +69,6 @@ class CPUDeviceAPI final : public DeviceAPI {
#endif
}

void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
TVMStreamHandle stream) final {
memcpy(static_cast<char*>(to) + to_offset, static_cast<const char*>(from) + from_offset, size);
}

void StreamSync(TVMContext ctx, TVMStreamHandle stream) final {}

void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final;
Expand All @@ -86,6 +80,13 @@ class CPUDeviceAPI final : public DeviceAPI {
static auto* inst = new CPUDeviceAPI();
return inst;
}

protected:
// Flat host-to-host copy: both endpoints are in CPU memory, so the copy is a
// single memcpy honoring the byte offsets. type_hint/contexts are unused here.
void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
                    TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
                    TVMStreamHandle stream) final {
  memcpy(static_cast<char*>(to) + to_offset, static_cast<const char*>(from) + from_offset, size);
}
};

struct CPUWorkspacePool : public WorkspacePool {
Expand Down
39 changes: 34 additions & 5 deletions src/runtime/crt/common/crt_runtime_api.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <assert.h>
#include <inttypes.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
Expand Down Expand Up @@ -87,16 +88,44 @@ int TVMDeviceAllocDataSpace(DLContext ctx, size_t nbytes, size_t alignment, DLDa
if (alignment != 1) {
nbytes = (nbytes + alignment - 1) / alignment * alignment;
}

return TVMPlatformMemoryAllocate(nbytes, ctx, out_data);
}

// Allocate a tensor data space with an optional memory scope. The CRT only
// supports the default flat scope; per c_runtime_api.h, a NULL mem_scope
// denotes the default ("global") scope. Previously any special scope was
// silently ignored and flat memory handed back; now it is rejected, matching
// the C++ runtime, which refuses unknown scopes.
// Returns 0 on success, -1 on failure.
int TVMDeviceAllocDataSpaceWithScope(DLContext ctx, int ndim, const int64_t* shape,
                                     DLDataType dtype, const char* mem_scope, void** out_data) {
  if (mem_scope != NULL && strcmp(mem_scope, "global") != 0) {
    return -1;
  }
  // Flat size: number of elements times the rounded-up element size in bytes.
  size_t nbytes = 1;
  for (int i = 0; i < ndim; ++i) {
    nbytes *= shape[i];
  }
  nbytes *= (dtype.bits * dtype.lanes + 7) / 8;

  // Align to the element size, but never below the minimum alignment used by
  // the flat allocator.
  const size_t kAllocAlignment = 128;
  size_t align = (dtype.bits / 8) * dtype.lanes;
  if (align < kAllocAlignment) align = kAllocAlignment;
  return TVMDeviceAllocDataSpace(ctx, nbytes, align, dtype, out_data);
}

int TVMDeviceFreeDataSpace(TVMContext ctx, void* ptr) { return TVMPlatformMemoryFree(ptr, ctx); }

int TVMDeviceCopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
DLDataType type_hint, TVMStreamHandle stream) {
memcpy(((uint8_t*)to) + to_offset, ((uint8_t*)from) + from_offset, num_bytes);
// Return true when the tensor's memory is densely packed (no gaps between
// elements). A NULL strides array means the canonical compact layout.
static bool IsContiguous(const DLTensor* arr) {
  if (arr->strides == NULL) return true;
  // Walk from innermost to outermost dimension, tracking the stride a
  // compact layout would have at each step.
  int64_t compact_stride = 1;
  for (int32_t k = arr->ndim - 1; k >= 0; --k) {
    if (arr->strides[k] != compact_stride) return false;
    compact_stride *= arr->shape[k];
  }
  return true;
}

// Copy the contents of one tensor to another with a plain memcpy between the
// two contiguous buffers. Previously the byte count was derived from `from`
// alone; now the destination size is computed too and checked for equality
// (mirroring the ICHECK_EQ in the C++ runtime's CopyDataFromTo) so a
// mismatched destination cannot be silently overrun.
int TVMDeviceCopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) {
  assert(IsContiguous(from) && IsContiguous(to));

  // Byte size of the source: element count times rounded-up element bytes.
  size_t size = 1;
  for (int i = 0; i < from->ndim; ++i) {
    size *= from->shape[i];
  }
  size *= (from->dtype.bits * from->dtype.lanes + 7) / 8;

  // Byte size of the destination, computed the same way.
  size_t to_size = 1;
  for (int i = 0; i < to->ndim; ++i) {
    to_size *= to->shape[i];
  }
  to_size *= (to->dtype.bits * to->dtype.lanes + 7) / 8;
  assert(size == to_size);

  memcpy(((uint8_t*)to->data) + to->byte_offset, ((uint8_t*)from->data) + from->byte_offset, size);
  return 0;
}

Expand Down
2 changes: 2 additions & 0 deletions src/runtime/cuda/cuda_device_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ class CUDADeviceAPI final : public DeviceAPI {
}
}

protected:
void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
TVMStreamHandle stream) final {
Expand Down Expand Up @@ -166,6 +167,7 @@ class CUDADeviceAPI final : public DeviceAPI {
}
}

public:
TVMStreamHandle CreateStream(TVMContext ctx) {
CUDA_CALL(cudaSetDevice(ctx.device_id));
cudaStream_t retval;
Expand Down
Loading

0 comments on commit 2673309

Please sign in to comment.