diff --git a/apps/android_rpc/app/src/main/jni/tvm_runtime.h b/apps/android_rpc/app/src/main/jni/tvm_runtime.h index fb14d84b794f..7b4ced7c9c0d 100644 --- a/apps/android_rpc/app/src/main/jni/tvm_runtime.h +++ b/apps/android_rpc/app/src/main/jni/tvm_runtime.h @@ -66,7 +66,6 @@ #include "../src/runtime/opencl/opencl_device_api.cc" #include "../src/runtime/opencl/opencl_module.cc" #include "../src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc" -#include "../src/runtime/opencl/texture_pool.cc" #include "../src/runtime/source_utils.cc" #endif diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h index c33606d98ed3..f27bfdacb570 100644 --- a/include/tvm/runtime/device_api.h +++ b/include/tvm/runtime/device_api.h @@ -52,6 +52,7 @@ enum DeviceAttrKind : int { kL2CacheSizeBytes = 13, kTotalGlobalMemory = 14, kAvailableGlobalMemory = 15, + kImagePitchAlignment = 16, }; #ifdef TVM_KALLOC_ALIGNMENT diff --git a/include/tvm/runtime/memory/memory_manager.h b/include/tvm/runtime/memory/memory_manager.h index 0c4647e6fa5a..ab1e6b5c9f6d 100644 --- a/include/tvm/runtime/memory/memory_manager.h +++ b/include/tvm/runtime/memory/memory_manager.h @@ -87,7 +87,26 @@ class Allocator { * \return A sized allocation in the form of a buffer. */ TVM_DLL virtual Buffer Alloc(Device dev, ShapeTuple shape, DLDataType type_hint, - const std::string& mem_scope = "") = 0; + const std::string& mem_scope = ""); + + /*! \brief Create a view for the buffer given a shape, type and scope. + * \param buffer The existing buffer upon which we need to create a view. + * \param shape The shape of the view. + * \param type_hint A type hint to the view. + * \param mem_scope A memory scope of the view. + * \return A device pointer to the created view. + */ + TVM_DLL virtual void* CreateView(const Buffer& buffer, ShapeTuple shape, DLDataType type_hint, + const std::string& mem_scope = "global") { + return buffer.data; + } + + /*! \brief Release the view. + * \param dev The device where this view was created. + * \param data The view pointer to be freed. + */ + TVM_DLL virtual void FreeView(Device dev, void* data) {} + /*! \brief Free a buffer allocated by the allocator. * \param buffer The buffer to free. */ @@ -147,6 +166,13 @@ class StorageObj : public Object { /*! \brief Allocate an NDArray from a given piece of storage. */ TVM_DLL NDArray AllocNDArray(int64_t offset, ShapeTuple shape, DLDataType dtype); + /*! \brief Allocate an NDArray with memory scope from a given piece of storage. */ + TVM_DLL NDArray AllocNDArrayScoped(int64_t offset, ShapeTuple shape, DLDataType dtype, + String scope = "global"); + + /*! \brief The deleter for a scoped NDArray allocated from underlying storage. */ + static void ScopedDeleter(Object* ptr); + /*! \brief The deleter for an NDArray when allocated from underlying storage.
*/ static void Deleter(Object* ptr); @@ -170,6 +196,12 @@ class Storage : public ObjectRef { }; } // namespace memory + +using memory::Allocator; +using memory::AllocatorType; +using memory::MemoryManager; +using memory::StorageObj; + } // namespace runtime } // namespace tvm diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc index d85ffd78291c..33b3adea5f2f 100644 --- a/src/relay/backend/graph_plan_memory.cc +++ b/src/relay/backend/graph_plan_memory.cc @@ -229,6 +229,16 @@ class StorageAllocator : public StorageAllocaBaseVisitor { VLOG_CONTEXT << "StorageAllocator"; VLOG(1) << "planning:" << std::endl << PrettyPrint(func); prototype_ = StorageAllocaInit(&arena_).GetInitTokenMap(func); + // Back up the virtual devices, since token reuse might lose the original memory scope + std::unordered_map<const ExprNode*, std::vector<VirtualDevice>> virtual_device_map_; + for (const auto& kv : prototype_) { + std::vector<VirtualDevice> virtual_devices; + virtual_devices.reserve(kv.second.size()); + for (StorageToken* tok : kv.second) { + virtual_devices.push_back(tok->virtual_device); + } + virtual_device_map_.insert({kv.first, virtual_devices}); + } this->Run(func); // The value of smap contains two integer arrays where the first array @@ -252,9 +262,13 @@ class StorageAllocator : public StorageAllocaBaseVisitor { } num_nodes++; storage_ids.push_back(tok->storage_id); - virtual_devices.push_back(tok->virtual_device); sid_sizes_byte.push_back(allocator_.GetMemorySize(tok)); } + ICHECK(kv.second.size() == virtual_device_map_[kv.first].size()) << "Mismatch of tokens and virtual devices"; + for (auto vdev : virtual_device_map_[kv.first]) { + virtual_devices.push_back(vdev); + } auto storage_info = backend::StorageInfo(std::move(storage_ids), std::move(virtual_devices), std::move(sid_sizes_byte)); smap.Set(GetRef<Expr>(kv.first), storage_info); @@ -356,25 +370,19 @@ class StorageAllocator : public StorageAllocaBaseVisitor { class TokenAllocator { public: - StorageToken* Alloc(StorageToken* proto) { - return Is2DStorage(proto) ? token_2d_.Alloc(proto, storage_ids_++) - : token_1d_.Alloc(proto, storage_ids_++); - } + StorageToken* Alloc(StorageToken* proto) { return token_mixed_.Alloc(proto, storage_ids_++); } StorageToken* Request(StorageToken* proto) { - StorageToken* token = - Is2DStorage(proto) ? token_2d_.Request(proto) : token_1d_.Request(proto); + StorageToken* token = token_mixed_.Request(proto); return token ? token : this->Alloc(proto); } - void CheckForRelease(StorageToken* tok) { - return Is2DStorage(tok) ? token_2d_.CheckForRelease(tok) : token_1d_.CheckForRelease(tok); - } + void CheckForRelease(StorageToken* tok) { return token_mixed_.CheckForRelease(tok); } size_t GetMemorySize(StorageToken* tok) { // TODO(amalyshe): figure out who requires sizes and for what // size in case of texture is not enough - we can return any value if it // is assumed to be used for memory allocation, or we can return the real size // if it is just for information - return Is2DStorage(tok) ? 0 : token_1d_.GetMemorySize(tok); + return token_mixed_.GetMemorySize(tok); } static bool Is2DStorage(StorageToken* tok) { return relay::Is2DStorage(tok->virtual_device->memory_scope); @@ -382,8 +390,7 @@ class StorageAllocator : public StorageAllocaBaseVisitor { private: int64_t storage_ids_{0}; - TokenAllocator1D token_1d_; - TokenAllocator2D token_2d_; + TokenAllocatorMixed token_mixed_; }; private: diff --git a/src/relay/backend/token_allocator.cc b/src/relay/backend/token_allocator.cc index bdecba9afad7..e974944b33b0 100644 --- a/src/relay/backend/token_allocator.cc +++ b/src/relay/backend/token_allocator.cc @@ -31,22 +31,45 @@ namespace tvm { namespace relay { +constexpr auto Is2DStorage = runtime::IsTextureStorage; -size_t TokenAllocator1D::GetMemorySize(StorageToken* prototype) { +/* + * Mixed mode memory allocator + */ +size_t TokenAllocatorMixed::GetMemorySize(StorageToken* prototype) { TensorType ttype = prototype->ttype; ICHECK(ttype.defined()); size_t size = 1; - for (IndexExpr dim : ttype->shape) { - const int64_t* pval = tir::as_const_int(dim); - ICHECK(pval != nullptr) << "Cannot allocate memory symbolic tensor shape " << ttype->shape; - ICHECK_GE(*pval, 0) << "Cannot allocate memory for tensor with negative shape" << *pval; - size *= static_cast<size_t>(pval[0]); + if (relay::Is2DStorage(prototype->virtual_device->memory_scope)) { + size = GetSize2D(prototype); + } else { + for (IndexExpr dim : ttype->shape) { + const int64_t* pval = tir::as_const_int(dim); + ICHECK(pval != nullptr) << "Cannot allocate memory symbolic tensor shape " << ttype->shape; + ICHECK_GE(*pval, 0) << "Cannot allocate memory for tensor with negative shape " << *pval; + size *= static_cast<size_t>(pval[0]); + } + size *= DivRoundUp(ttype->dtype.bits() * ttype->dtype.lanes(), 8); } - size *= DivRoundUp(ttype->dtype.bits() * ttype->dtype.lanes(), 8); return size; } -StorageToken* TokenAllocator1D::Request(StorageToken* prototype) { +String GetDeviceCompatibleToken(StorageToken* tok) { + Target null_tgt{nullptr}; + if (null_tgt == tok->virtual_device->target) { + return tok->virtual_device->memory_scope; + } + std::string dev_kind = tok->virtual_device->target->kind->name; + auto* device_scope_handler = tvm::runtime::Registry::Get("DeviceScopeCompatibility."
+ dev_kind); + if (device_scope_handler) { + String dev_scope = + (*device_scope_handler)(tok->virtual_device->target, tok->virtual_device->memory_scope); + return dev_scope; + } + return tok->virtual_device->memory_scope; +} + +StorageToken* TokenAllocatorMixed::Request(StorageToken* prototype) { // calculate the size; size_t size = GetMemorySize(prototype); // search memory block in [size / match_range_, size * match_range_) @@ -59,32 +82,42 @@ StorageToken* TokenAllocator1D::Request(StorageToken* prototype) { // search for memory blocks larger than requested for (auto it = mid; it != end; ++it) { StorageToken* tok = it->second; - if (!tok->is_compatible(*prototype)) continue; - ICHECK_EQ(tok->ref_counter, 0); - // Use exect matching strategy - tok->max_bytes = std::max(size, tok->max_bytes); - tok->ref_counter = prototype->ref_counter; - // find a exact match, erase from map and return - free_.erase(it); - return tok; + bool dev_compatible = (GetDeviceCompatibleToken(tok) == GetDeviceCompatibleToken(prototype)); + if (tok->is_compatible(*prototype) || (dev_compatible)) { + ICHECK_EQ(tok->ref_counter, 0); + // Use exact matching strategy + if (size > tok->max_bytes) { + tok->max_bytes = size; + tok->ttype = prototype->ttype; + } + tok->ref_counter = prototype->ref_counter; + // find an exact match, erase from map and return + free_.erase(it); + return tok; + } } // then search for memory blocks smaller than requested space for (auto it = mid; it != begin;) { --it; StorageToken* tok = it->second; - if (!tok->is_compatible(*prototype)) continue; - ICHECK_EQ(tok->ref_counter, 0); - // Use exect matching strategy - tok->max_bytes = std::max(size, tok->max_bytes); - tok->ref_counter = prototype->ref_counter; - // erase from map and return - free_.erase(it); - return tok; + bool dev_compatible = (GetDeviceCompatibleToken(tok) == GetDeviceCompatibleToken(prototype)); + if (tok->is_compatible(*prototype) || (dev_compatible)) { + ICHECK_EQ(tok->ref_counter, 0); + // Use exact matching strategy + if (size > tok->max_bytes) { + tok->max_bytes = size; + tok->ttype = prototype->ttype; + } + tok->ref_counter = prototype->ref_counter; + // erase from map and return + free_.erase(it); + return tok; + } } return nullptr; } -StorageToken* TokenAllocator1D::Alloc(StorageToken* prototype, int64_t storage_id) { +StorageToken* TokenAllocatorMixed::Alloc(StorageToken* prototype, int64_t storage_id) { size_t size = GetMemorySize(prototype); prototype->max_bytes = size; prototype->storage_id = storage_id; @@ -92,7 +125,7 @@ StorageToken* TokenAllocator1D::Alloc(StorageToken* prototype, int64_t storage_i return prototype; } -void TokenAllocator1D::CheckForRelease(StorageToken* tok) { +void TokenAllocatorMixed::CheckForRelease(StorageToken* tok) { ICHECK_GE(tok->storage_id, 0); ICHECK_GE(tok->ref_counter, 0); if (tok->ref_counter == 0) { @@ -100,101 +133,22 @@ } } -StorageToken* TokenAllocator2D::Request(StorageToken* prototype) { - auto shape = GetSize2D(prototype); - const int64_t max_ratio = 5; - int64_t min_added_size_x = std::numeric_limits<int64_t>::max(); - int64_t min_added_size_y = std::numeric_limits<int64_t>::max(); - int64_t min_wasted_size_x = std::numeric_limits<int64_t>::max(); - int64_t min_wasted_size_y = std::numeric_limits<int64_t>::max(); - int64_t best_storage_id = -1; - MemBlock new_mem; - for (int64_t free_id : free_list_) { - MemBlock& cached = blocks_[free_id]; - // Can only reuse texture 2d blocks of the same type - if (cached.token_->ttype->dtype !=
prototype->ttype->dtype) { - continue; - } - // Can only reuse texture 2d blocks of the same scope - // Because reusing textures with different memory scope may lead to - // accuracy issues, because the data will be packed in a different way for - // different memory scopes. - if (cached.token_->virtual_device->memory_scope != prototype->virtual_device->memory_scope) { - continue; - } - // avoid reusing too small and too big textures - if (shape.width / cached.x_ > max_ratio || cached.x_ / shape.width > max_ratio || - shape.height / cached.y_ > max_ratio || cached.y_ / shape.height > max_ratio) { - continue; - } - int64_t new_width = std::max(cached.x_, shape.width); - int64_t new_height = std::max(cached.y_, shape.height); - int64_t added_size_x = new_width - cached.x_; - int64_t added_size_y = new_height - cached.y_; - int64_t wasted_size_x = new_width - shape.width; - int64_t wasted_size_y = new_height - shape.height; - // Prioritize minimization of added size first, then minimize - // wasted size among blocks which would not require expansion - if ((min_added_size_x > 0 && added_size_x < min_added_size_x) || - (min_added_size_y > 0 && added_size_y < min_added_size_y) || - (min_added_size_x == added_size_x && wasted_size_x < min_wasted_size_x) || - (min_added_size_y == added_size_y && wasted_size_y < min_wasted_size_y)) { - min_added_size_x = added_size_x; - min_added_size_y = added_size_y; - min_wasted_size_x = wasted_size_x; - min_wasted_size_y = wasted_size_y; - best_storage_id = free_id; - new_mem.x_ = new_width; - new_mem.y_ = new_height; - } - } - - if (min_added_size_x == 0 && min_added_size_y == 0) { - // use existing block - free_list_.erase(best_storage_id); - blocks_[best_storage_id].token_->ref_counter += prototype->ref_counter; - return blocks_[best_storage_id].token_; - } else if (min_added_size_x <= shape.width || min_added_size_y <= shape.height) { - // Reset the reference counter of the now live token - free_list_.erase(best_storage_id); - new_mem.token_ = prototype; - new_mem.token_->ref_counter += 1; - new_mem.token_->storage_id = best_storage_id; - blocks_[best_storage_id] = new_mem; - return new_mem.token_; - } - return nullptr; -} - -StorageToken* TokenAllocator2D::Alloc(StorageToken* prototype, int64_t storage_id) { - auto shape = GetSize2D(prototype); - MemBlock block; - block.x_ = shape.width; - block.y_ = shape.height; - prototype->storage_id = storage_id; - block.token_ = prototype; - blocks_[prototype->storage_id] = block; - return prototype; -} - -void TokenAllocator2D::CheckForRelease(StorageToken* tok) { - ICHECK_GE(tok->storage_id, 0); - ICHECK_GE(tok->ref_counter, 0); - if (tok->ref_counter == 0) { - free_list_.insert(tok->storage_id); - } -} - -runtime::Texture2DShape<int64_t> TokenAllocator2D::GetSize2D(StorageToken* prototype) { +size_t TokenAllocatorMixed::GetSize2D(StorageToken* prototype) { TensorType ttype = prototype->ttype; ICHECK(ttype.defined()); - size_t axis = runtime::DefaultTextureLayoutSeparator(ttype->shape.size(), - prototype->virtual_device->memory_scope); struct Shape { const Array<IndexExpr>& shape; int64_t operator[](size_t i) const { return *tir::as_const_int(shape[i]); } + int size() { return this->shape.size(); } }; - return runtime::ApplyTexture2DFlattening<int64_t>(Shape{ttype->shape}, ttype->shape.size(), axis); + auto shape = Shape{ttype->shape}; + int image_row_align = + prototype->virtual_device->target->GetAttr<Integer>("image_base_address_alignment") + .value_or(Integer(64)) + ->value; + return runtime::GetTextureMemorySize(shape, ttype->dtype.bits(),
ttype->dtype.lanes(), + prototype->virtual_device->memory_scope, + image_row_align); } } // namespace relay diff --git a/src/relay/backend/token_allocator.h b/src/relay/backend/token_allocator.h index 3aebd71b6c2b..5524e6b2c634 100644 --- a/src/relay/backend/token_allocator.h +++ b/src/relay/backend/token_allocator.h @@ -66,9 +66,9 @@ struct StorageToken { }; /** - * @brief Memory manager for flattened 1d memory (buffers) + * @brief Memory manager for mixed mode memory types */ -class TokenAllocator1D { +class TokenAllocatorMixed { public: /*! * \brief ceil(size/word_size) to get number of words. @@ -105,54 +105,22 @@ class TokenAllocator1D { * \param tok The token to be released. */ void CheckForRelease(StorageToken* tok); - - private: - // scale used for rough match - const size_t match_range_{16}; - // free list of storage entry - std::multimap free_; - // all the storage resources available - std::vector data_; -}; - -/** - * @brief Memory manager for 2d memory (textures) - */ -class TokenAllocator2D { - public: - /*! - * \brief Request a storage token for a given prototype. - * \param prototype. The prototype storage token. - * \return The result token. - */ - StorageToken* Request(StorageToken* prototype); - /*! - * \brief Alloacte a storage token by consuming prototype - * \param prototype The prototype token. - * \param size The size of memory being requested. - */ - StorageToken* Alloc(StorageToken* prototype, int64_t storage_id); - /*! - * \brief Check if we can release token. - * \param tok The token to be released. - */ - void CheckForRelease(StorageToken* tok); /*! * \brief Get the texture 2d size requirement * \param prototype The prototype token. - * \return The required texture 2d memory size in (width, height, channel). + * \return The physical memory size. 
*/ - runtime::Texture2DShape<int64_t> GetSize2D(StorageToken* prototype); + size_t GetSize2D(StorageToken* prototype); protected: - struct MemBlock { - StorageToken* token_; - int64_t x_; - int64_t y_; - }; + // free list of storage entry + std::multimap<size_t, StorageToken*> free_; + // all the storage resources available + std::vector<StorageToken*> data_; - std::unordered_map<int64_t, MemBlock> blocks_; - std::unordered_set<int64_t> free_list_; + private: + // scale used for rough match + const size_t match_range_{16}; }; } // namespace relay diff --git a/src/runtime/contrib/clml/clml_runtime.cc b/src/runtime/contrib/clml/clml_runtime.cc index d8c0075fcdc1..fa7338177cbe 100644 --- a/src/runtime/contrib/clml/clml_runtime.cc +++ b/src/runtime/contrib/clml/clml_runtime.cc @@ -54,7 +54,7 @@ CLMLWorkspace::CLMLWorkspace() { tentry = workspace->GetThreadEntry(); device_id = workspace->GetCLDeviceID(tentry->device.device_id); - platform_id = workspace->device_to_platform[device_id]; + platform_id = workspace->device_info[device_id].platform_id; // Print extensions size_t reqd_size = 0; diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc index 33908d750d6d..82b8d9062615 100644 --- a/src/runtime/cuda/cuda_device_api.cc +++ b/src/runtime/cuda/cuda_device_api.cc @@ -127,6 +127,8 @@ class CUDADeviceAPI final : public DeviceAPI { *rv = static_cast<int64_t>(free_mem); return; } + case kImagePitchAlignment: + return; } *rv = value; } diff --git a/src/runtime/graph_executor/graph_executor.cc b/src/runtime/graph_executor/graph_executor.cc index 1b1051322c49..3cc3ea396e17 100644 --- a/src/runtime/graph_executor/graph_executor.cc +++ b/src/runtime/graph_executor/graph_executor.cc @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -424,36 +425,31 @@ void GraphExecutor::SetupStorage() { } pool_entry[sid].param_data_entry = i; pool_entry[sid].device_type = device_type; - pool_entry[sid].scope = storage_scope; DLDataType t = vtype[i]; - if (!details::Is2DStorage(storage_scope)) { - size_t size = 1; - for (int64_t sz : attrs_.shape[i]) { - size *= static_cast<size_t>(sz); - } - size_t bits = t.bits * t.lanes; - ICHECK(bits % 8U == 0U || bits == 1U || bits == 4U); - int64_t bytes = ((bits + 7U) / 8U) * size; - pool_entry[sid].shape[0] = std::max(pool_entry[sid].shape[0], bytes); - pool_entry[sid].dtype = DLDataType{kDLFloat, 32, 1}; - } else { - if (pool_entry[sid].shape.size() == 1) { - pool_entry[sid].shape.resize(3, 0); - } - size_t axis = runtime::DefaultTextureLayoutSeparator(attrs_.shape[i].size(), storage_scope); - auto shape = ApplyTexture2DFlattening<int64_t>(attrs_.shape[i], attrs_.shape[i].size(), axis); - pool_entry[sid].shape[0] = std::max(pool_entry[sid].shape[0], shape.height); - pool_entry[sid].shape[1] = std::max(pool_entry[sid].shape[1], shape.width); - CHECK(pool_entry[sid].shape[2] == 0 || pool_entry[sid].shape[2] == shape.channel) - << pool_entry[sid].shape[2] << " != " << shape.channel - << ", texture channel length must be consistent within a storage pool"; - pool_entry[sid].shape[2] = shape.channel; - CHECK(pool_entry[sid].dtype.bits == 0 || TypeEqual(pool_entry[sid].dtype, t)) - << DLDataType2String(pool_entry[sid].dtype) << " != " << DLDataType2String(t) - << ", pool entry for 2d texure allocations must be of the same type;" - << " downstream error from memory planner likely"; + + auto dev_type = pool_entry[sid].device_type; + const auto& cit = std::find_if(devices_.begin(), devices_.end(), [&dev_type](const Device& d) { + return dev_type == static_cast<int>(d.device_type); + }); + Device dev = cit == devices_.end() ? devices_[0] : *cit; + + DLTensor temp; + temp.data = nullptr; + temp.device = dev; + temp.ndim = attrs_.shape[i].size(); + temp.dtype = t; + temp.shape = static_cast<int64_t*>(attrs_.shape[i].data()); + temp.strides = nullptr; + temp.byte_offset = 0; + + int64_t alloc_size = DeviceAPI::Get(dev)->GetDataSize(temp, String(storage_scope)); + + if (pool_entry[sid].alloc_size < alloc_size) { pool_entry[sid].dtype = t; + pool_entry[sid].shape = attrs_.shape[i]; + pool_entry[sid].alloc_size = alloc_size; + pool_entry[sid].scope = storage_scope; } } @@ -466,18 +462,14 @@ void GraphExecutor::SetupStorage() { }); Device dev = cit == devices_.end() ? devices_[0] : *cit; if (pit.linked_param.defined()) { - storage_pool_.push_back(pit.linked_param); + ndarray_pool_.push_back(pit.linked_param); } else { std::vector<int64_t> shape = pit.shape; - if (shape.size() == 1) { - shape[0] = (shape[0] + 3) / 4; - } - Optional<String> mem_scope; - if (!pit.scope.empty()) { - mem_scope = String(pit.scope); - } - storage_pool_.push_back(MemoryManager::GetOrCreateAllocator(dev, AllocatorType::kNaive) ->Empty(shape, pit.dtype, dev, mem_scope)); + String mem_scope = pit.scope.empty() ? "global" : String(pit.scope); + auto allocator = MemoryManager::GetOrCreateAllocator(dev, AllocatorType::kPooled); + auto buffer = allocator->Alloc(dev, pit.alloc_size, kAllocAlignment, pit.dtype); + auto stor = Storage(buffer, allocator); + storage_pool_.push_back(stor); } } @@ -486,16 +478,22 @@ // is mapped to this pool. data_entry_.resize(num_node_entries()); data_alignment_.resize(num_node_entries()); - // sid_to_eid has a size of storage_id's size, which is the size of storage_pool_. - sid_to_eid_.resize(storage_pool_.size()); - for (size_t i = 0; i < data_entry_.size(); ++i) { + // sid_to_eid has a size of storage_id's size, which is the size of pool_entry. + sid_to_eid_.resize(pool_entry.size()); + for (size_t i = 0, j = 0; i < data_entry_.size(); ++i) { int storage_id = attrs_.storage_id[i]; // Update "storage_id -> entry_id" pair. sid_to_eid_[storage_id].push_back(i); - ICHECK_LT(static_cast<size_t>(storage_id), storage_pool_.size()); - data_entry_[i] = storage_pool_[storage_id].CreateView(attrs_.shape[i], vtype[i]); + ICHECK_LT(static_cast<size_t>(storage_id), pool_entry.size()); + if (pool_entry[storage_id].linked_param.defined()) { + data_entry_[i] = ndarray_pool_[j++]; + } else { + std::string storage_scope = attrs_.storage_scope.empty() ? "global" : attrs_.storage_scope[i]; + data_entry_[i] = storage_pool_[storage_id]->AllocNDArrayScoped(0, ShapeTuple(attrs_.shape[i]), + vtype[i], storage_scope); + } const DLTensor* tmp = data_entry_[i].operator->(); data_alignment_[i] = details::GetDataAlignment(*tmp); } diff --git a/src/runtime/graph_executor/graph_executor.h b/src/runtime/graph_executor/graph_executor.h index cfdba8916baa..e1c61001f1d9 100644 --- a/src/runtime/graph_executor/graph_executor.h +++ b/src/runtime/graph_executor/graph_executor.h @@ -45,6 +45,7 @@ namespace runtime { using memory::AllocatorType; using memory::MemoryManager; +using tvm::runtime::memory::Storage; /*! \brief macro to do C API call */ #define TVM_CCALL(func) \ @@ -224,6 +225,7 @@ class TVM_DLL GraphExecutor : public ModuleNode { int param_data_entry; NDArray linked_param; std::string scope; + int64_t alloc_size{-1}; // PoolEntry(int s, int dev_type, void* pre_linked_param) : // size(s), device_type(dev_type), pre_linked_param(std::move(pre_linked_param)) {} }; @@ -483,7 +485,9 @@ class TVM_DLL GraphExecutor : public ModuleNode { /*!
\brief Execution context of all devices including the host. */ std::vector<Device> devices_; /*! \brief Common storage pool for all devices. */ - std::vector<NDArray> storage_pool_; + std::vector<Storage> storage_pool_; + /*! \brief Common NDArray pool for all devices. */ + std::vector<NDArray> ndarray_pool_; /*! \brief Data entry of each node. */ std::vector<NDArray> data_entry_; /*! \brief Data alignment of each node. */ diff --git a/src/runtime/memory/memory_manager.cc b/src/runtime/memory/memory_manager.cc index 0607697e6b83..a4b8e15943bd 100644 --- a/src/runtime/memory/memory_manager.cc +++ b/src/runtime/memory/memory_manager.cc @@ -84,6 +84,37 @@ inline size_t GetDataAlignment(const DLTensor& arr) { return align; } +void StorageObj::ScopedDeleter(Object* obj) { + auto* ptr = static_cast<NDArray::Container*>(obj); + StorageObj* storage = reinterpret_cast<StorageObj*>(ptr->manager_ctx); + + // Let the device handle proper cleanup of view + storage->allocator->FreeView(ptr->dl_tensor.device, ptr->dl_tensor.data); + storage->DecRef(); + delete ptr; +} + +NDArray StorageObj::AllocNDArrayScoped(int64_t offset, ShapeTuple shape, DLDataType dtype, + String scope) { + if (scope == "global" || scope.empty()) { + return AllocNDArray(offset, shape, dtype); + } + VerifyDataType(dtype); + void* data = this->allocator->CreateView(this->buffer, shape, dtype, scope); + NDArray::Container* container = new NDArray::Container(data, shape, dtype, this->buffer.device); + container->dl_tensor.byte_offset = offset; + container->SetDeleter(StorageObj::ScopedDeleter); + size_t needed_size = DeviceAPI::Get(this->buffer.device)->GetDataSize(container->dl_tensor); + this->IncRef(); + container->manager_ctx = reinterpret_cast<void*>(this); + NDArray ret(GetObjectPtr<Object>(container)); + // RAII in effect, now run the check. + ICHECK(offset + needed_size <= this->buffer.size) + << "storage allocation failure, attempted to allocate " << needed_size << " at offset " + << offset << " in region that is " << this->buffer.size << " bytes"; + return ret; +} + NDArray StorageObj::AllocNDArray(int64_t offset, ShapeTuple shape, DLDataType dtype) { VerifyDataType(dtype); @@ -128,38 +159,62 @@ MemoryManager* MemoryManager::Global() { return inst; } -Allocator* MemoryManager::GetOrCreateAllocator(Device dev, AllocatorType type) { - MemoryManager* m = MemoryManager::Global(); - std::lock_guard<std::mutex> lock(m->mu_); - if (m->allocators_.find(dev) == m->allocators_.end()) { - m->allocators_.emplace(dev, std::unordered_map<AllocatorType, std::unique_ptr<Allocator>>()); +std::string DeviceTypeStr(DLDeviceType type) { switch (type) { case kDLOpenCL: return "opencl"; break; case kDLVulkan: return "vulkan"; break; default: return ""; } - if (m->allocators_.at(dev).find(type) == m->allocators_.at(dev).end()) { - std::unique_ptr<Allocator> alloc; +} + +Allocator* GetDeviceSpecificAllocator(Device dev, AllocatorType type) { + std::string dev_str = DeviceTypeStr(dev.device_type); + auto* device_alloc_helper = tvm::runtime::Registry::Get("DeviceAllocator." + dev_str); + void* valloc; + Allocator* allocator = nullptr; + if (device_alloc_helper) { + valloc = (*device_alloc_helper)(dev, static_cast<int>(type)); + allocator = static_cast<Allocator*>(valloc); + } + if (nullptr == allocator) { switch (type) { case kNaive: { VLOG(1) << "New naive allocator for " << dev; - alloc.reset(new NaiveAllocator()); + allocator = new NaiveAllocator(); break; } case kPooled: { VLOG(1) << "New pooled allocator for " << dev; - alloc.reset(new PooledAllocator()); + allocator = new PooledAllocator(); break; } default: LOG(FATAL) << "Unknown allocator type: " << type; } + } + return allocator; +} + +Allocator* MemoryManager::GetOrCreateAllocator(Device dev, AllocatorType type) { + MemoryManager* m = MemoryManager::Global(); + std::lock_guard<std::mutex> lock(m->mu_); + if (m->allocators_.find(dev) == m->allocators_.end()) { + m->allocators_.emplace(dev, std::unordered_map<AllocatorType, std::unique_ptr<Allocator>>()); + } + if (m->allocators_.at(dev).find(type) == m->allocators_.at(dev).end()) { + std::unique_ptr<Allocator> alloc; + alloc.reset(GetDeviceSpecificAllocator(dev, type)); auto ret = alloc.get(); m->allocators_.at(dev).emplace(type, std::move(alloc)); return ret; } auto alloc = m->allocators_.at(dev).at(type).get(); - /*if (alloc->type() != type) { - LOG(WARNING) << "The type of existing allocator for " << dev - << " is different from the request type (" << alloc->type() << " vs " << type - << ")"; - }*/ + return alloc; } @@ -191,7 +246,7 @@ NDArray Allocator::Empty(ShapeTuple shape, DLDataType dtype, DLDevice dev, VerifyDataType(dtype); NDArray::Container* container = new NDArray::Container(nullptr, shape, dtype, dev); container->SetDeleter(BufferDeleter); - size_t size = DeviceAPI::Get(dev)->GetDataSize(container->dl_tensor); + size_t size = DeviceAPI::Get(dev)->GetDataSize(container->dl_tensor, mem_scope); size_t alignment = GetDataAlignment(container->dl_tensor); Buffer* buffer = new Buffer; if (!mem_scope.defined() || mem_scope.value().empty() || mem_scope.value() == "global") { diff --git a/src/runtime/metal/metal_device_api.mm b/src/runtime/metal/metal_device_api.mm index f2e8c4ab0b75..cbdacb5c096f 100644 --- a/src/runtime/metal/metal_device_api.mm +++ b/src/runtime/metal/metal_device_api.mm @@ -95,6 +95,8 @@ *rv = static_cast<int64_t>([devices[dev.device_id] recommendedMaxWorkingSetSize]); return; } + case kImagePitchAlignment: + return; } }; } diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h index 2e9b05edcb58..94ab736f5ed5 100644 --- a/src/runtime/opencl/opencl_common.h +++ b/src/runtime/opencl/opencl_common.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -74,12 +75,13 @@ #include "../pack_args.h" #include "../texture.h" #include "../thread_storage_scope.h" -#include "../workspace_pool.h" namespace tvm { namespace runtime { namespace cl { +using tvm::runtime::memory::Buffer; + static_assert(sizeof(cl_mem) == sizeof(void*), "Required to store cl_mem inside void*"); inline const char* CLGetErrorString(cl_int error) { @@ -221,6 +223,12 @@ inline cl_channel_type DTypeToOpenCLChannelType(DLDataType data_type) { class OpenCLThreadEntry; struct BufferDescriptor; +struct CLDeviceInfo { + cl_platform_id platform_id; // platform Id + cl_uint image_row_align; // CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR + bool image_from_buffer_support; // extn: cl_khr_image2d_from_buffer +}; + /*! * \brief Process global OpenCL workspace. */ @@ -234,8 +242,8 @@ class OpenCLWorkspace : public DeviceAPI { std::unordered_map<cl_platform_id, cl_context> contexts; // whether the workspace is initialized.
bool initialized_{false}; - // map device to platform - std::unordered_map<cl_device_id, cl_platform_id> device_to_platform; + // map device to various device information + std::unordered_map<cl_device_id, CLDeviceInfo> device_info; // the devices std::vector<cl_device_id> devices; // the queues @@ -251,6 +259,7 @@ std::vector<size_t> free_kernel_ids; // the mutex for initialization std::mutex mu; + // destructor ~OpenCLWorkspace() { for (auto& it : contexts) { @@ -284,6 +293,15 @@ << "Invalid OpenCL device_id=" << dev.device_id << ". " << GetError(); return events[dev.device_id]; } + bool IsOpenCLExtensionSupported(cl_device_id did, const std::string& name) { + size_t reqd_size = 0; + OPENCL_CALL(clGetDeviceInfo(did, CL_DEVICE_EXTENSIONS, 0, nullptr, &reqd_size)); + std::vector<char> extn_buf(reqd_size); + OPENCL_CALL(clGetDeviceInfo(did, CL_DEVICE_EXTENSIONS, reqd_size, extn_buf.data(), nullptr)); + std::string extensions(extn_buf.data()); + return (extensions.find(name) != std::string::npos); + } + // is current clCommandQueue in profiling mode bool IsProfiling(Device dev) { cl_command_queue queue = GetQueue(dev); @@ -309,12 +327,22 @@ OPENCL_CALL(clReleaseCommandQueue(queue)); cl_int err_code; cl_device_id did = cl::OpenCLWorkspace::Global()->GetCLDeviceID(dev.device_id); - cl_platform_id platform = cl::OpenCLWorkspace::Global()->device_to_platform[did]; + cl_platform_id platform = cl::OpenCLWorkspace::Global()->device_info[did].platform_id; auto profiling_queue = clCreateCommandQueue(cl::OpenCLWorkspace::Global()->contexts[platform], did, prop, &err_code); OPENCL_CHECK_ERROR(err_code); cl::OpenCLWorkspace::Global()->queues[dev.device_id] = profiling_queue; } + cl_uint GetImageAlignment(int device_id) { + return device_info[GetCLDeviceID(device_id)].image_row_align; + } + bool IsBufferToImageSupported(int device_id) { + return device_info[GetCLDeviceID(device_id)].image_from_buffer_support; + } + + void* AllocDataSpaceView(Device dev, void* data, ShapeTuple shape, DLDataType dtype, + Optional<String> mem_scope = NullOpt); + void FreeDataSpaceView(Device dev, void* ptr); cl_device_id GetCLDeviceID(int device_id); // override device API @@ -323,6 +351,8 @@ void* AllocDataSpace(Device dev, size_t size, size_t alignment, DLDataType type_hint) final; void* AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, Optional<String> mem_scope = NullOpt) final; + void* AllocDataSpace(Device dev, size_t width, size_t height, DLDataType type_hint, + Optional<String> mem_scope = NullOpt); void* GetNativePtr(const tvm::runtime::NDArray& narr); void SetNativePtr(const tvm::runtime::NDArray& narr, void* host_ptr, size_t buf_size); void SetPerfHint(Device dev, cl_uint perf_hint); @@ -330,11 +360,12 @@ void StreamSync(Device dev, TVMStreamHandle stream) final; void* AllocWorkspace(Device dev, size_t size, DLDataType type_hint) final; void FreeWorkspace(Device dev, void* data) final; + size_t GetDataSize(const DLTensor& arr, Optional<String> mem_scope = NullOpt) final; - // Texture (image2d_t) alloca APIs - cl_mem AllocTexture(Device dev, size_t width, size_t height, DLDataType type_hint); - void* AllocTextureWorkspace(Device dev, size_t width, size_t height, DLDataType type_hint); - void FreeTextureWorkspace(Device dev, void* data); + // cl_mem alloc utils + void* AllocCLBuffer(Device dev, size_t size, size_t alignment, DLDataType type_hint); + void* AllocCLImage(Device dev, void*
back_buffer, size_t width, size_t height, size_t row_pitch, + DLDataType type_hint, Optional<String> mem_scope); /*! * \brief Get the thread local ThreadEntry @@ -370,13 +401,8 @@ class OpenCLThreadEntry { Device device; /*! \brief The thread-local kernel table */ std::vector<KTRefEntry> kernel_table; - /*! \brief workspace pool */ - WorkspacePool pool; - /*! \brief texture pool */ - TexturePool texture_pool; // constructor - OpenCLThreadEntry(DLDeviceType device_type, DeviceAPI* device_api) - : pool(device_type, device_api), texture_pool(device_type, device_api) { + OpenCLThreadEntry(DLDeviceType device_type, DeviceAPI* device_api) { device.device_id = 0; device.device_type = device_type; } @@ -414,9 +440,14 @@ struct BufferDescriptor { static MemoryLayout MemoryLayoutFromScope(Optional<String> mem_scope); static String ScopeFromMemoryLayout(MemoryLayout mem_scope); + /* clBuffer object */ + // buffer should be the first element here cl_mem buffer{nullptr}; + cl::BufferDescriptor* back_buffer{nullptr}; cl_uchar* host_ptr{nullptr}; MemoryLayout layout{MemoryLayout::kBuffer1D}; + Buffer mbuf{nullptr}; // MemoryManager ref. + bool is_compat_view{false}; }; } // namespace cl diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index 5c5873b67f74..06f966e5f438 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -27,6 +27,7 @@ #include +#include "../memory/pooled_allocator.h" #include "opencl_common.h" #ifdef OPENCL_ENABLE_HOST_PTR @@ -103,6 +104,19 @@ String cl::BufferDescriptor::ScopeFromMemoryLayout(cl::BufferDescriptor::MemoryL return ""; } +static size_t GetMemObjectSize(Device dev, int ndim, const int64_t* shape, DLDataType dtype) { + DLTensor temp; + temp.data = nullptr; + temp.device = dev; + temp.ndim = ndim; + temp.dtype = dtype; + temp.shape = const_cast<int64_t*>(shape); + temp.strides = nullptr; + temp.byte_offset = 0; + size_t size = DeviceAPI::Get(dev)->GetDataSize(temp); + return size; +} + OpenCLThreadEntry* OpenCLWorkspace::GetThreadEntry() { return OpenCLThreadEntry::ThreadLocal(); } OpenCLWorkspace* OpenCLWorkspace::Global() { @@ -220,6 +234,10 @@ void OpenCLWorkspace::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) // https://stackoverflow.com/a/3568223, may not be implementable // at all through OpenCL API.
break; + case kImagePitchAlignment: { + *rv = static_cast<int64_t>(device_info[device_id].image_row_align); + break; + } } } @@ -238,8 +256,55 @@ void* OpenCLWorkspace::CreateHostPtrIfEnabled(cl::BufferDescriptor* desc, Device void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t size, size_t alignment, DLDataType type_hint) { this->Init(); + return AllocCLBuffer(dev, size, alignment, type_hint); +} + +void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t width, size_t height, DLDataType type_hint, + Optional<String> mem_scope) { + // Texture allocation given width and height + cl_uint row_align = GetImageAlignment(dev.device_id); + size_t pixel_size = (type_hint.bits * type_hint.lanes + 7) / 8; + size_t row_pitch = ALIGN_UP(width * pixel_size * 4, row_align); // CL_RGBA = 4 + size_t mem_size = row_pitch * height; + + // Alloc back buffer from pool + cl::BufferDescriptor* back_buffer = nullptr; + if (IsBufferToImageSupported(dev.device_id)) { + auto buf = MemoryManager::GetOrCreateAllocator(dev, AllocatorType::kPooled) + ->Alloc(dev, mem_size, kTempAllocaAlignment, type_hint); + back_buffer = static_cast<cl::BufferDescriptor*>(buf.data); + back_buffer->mbuf = buf; + } + + if (!mem_scope.defined()) { + mem_scope = String("global.texture"); + } + return AllocCLImage(dev, back_buffer, width, height, row_pitch, type_hint, mem_scope); +} + +void* OpenCLWorkspace::AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, + Optional<String> mem_scope) { + this->Init(); + if (!mem_scope.defined() || mem_scope.value().empty() || mem_scope.value() == "global") { + size_t size = GetMemObjectSize(dev, ndim, shape, dtype); + cl::BufferDescriptor* ret_buffer = nullptr; + auto buf = MemoryManager::GetOrCreateAllocator(dev, AllocatorType::kPooled) + ->Alloc(dev, size, kTempAllocaAlignment, dtype); + ret_buffer = static_cast<cl::BufferDescriptor*>(buf.data); + ret_buffer->mbuf = buf; + return ret_buffer; + } + size_t axis = DefaultTextureLayoutSeparator(ndim, mem_scope.value()); + auto texture = ApplyTexture2DFlattening<int64_t>(shape, ndim, axis); + + return AllocDataSpace(dev, texture.width, texture.height, dtype, mem_scope); +} + +void* OpenCLWorkspace::AllocCLBuffer(Device dev, size_t size, size_t alignment, + DLDataType type_hint) { + this->Init(); cl_device_id device_id = GetCLDeviceID(dev.device_id); - auto platform = device_to_platform[device_id]; + auto platform = device_info[device_id].platform_id; cl_int err_code; cl::BufferDescriptor* desc = new cl::BufferDescriptor; // CL_INVALID_BUFFER_SIZE if size is 0.
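The texture path above sizes the backing buffer from a padded row pitch. The arithmetic is shown standalone below; only the ALIGN_UP macro comes from this patch (src/runtime/texture.h), the function name and values are illustrative:

    #include <cstddef>
    #define ALIGN_UP(num, align) (((num) + ((align)-1)) & ~((align)-1))

    // Physical bytes backing a width x height RGBA image whose elements are
    // elem_bytes wide, with each row padded out to row_align bytes.
    size_t TexturePhysicalSize(size_t width, size_t height, size_t elem_bytes, size_t row_align) {
      size_t row_pitch = ALIGN_UP(width * elem_bytes * 4, row_align);  // 4 elements per CL_RGBA pixel
      return row_pitch * height;
    }
    // e.g. width=100, fp16 elements (2 bytes), row_align=64: row_pitch = ALIGN_UP(800, 64) = 832
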
@@ -253,25 +318,121 @@ void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t size, size_t alignment, return CreateHostPtrIfEnabled(desc, dev, size); } -void* OpenCLWorkspace::AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, - Optional<String> mem_scope) { - if (!mem_scope.defined() || mem_scope.value().empty() || mem_scope.value() == "global") { - return DeviceAPI::AllocDataSpace(dev, ndim, shape, dtype, mem_scope); - } - ICHECK(IsTextureStorage(std::string(mem_scope.value()))) - << "Device does not support allocate data space with " - << "specified memory scope: " << mem_scope.value(); +void* OpenCLWorkspace::AllocCLImage(Device dev, void* back_buffer, size_t width, size_t height, + size_t row_pitch, DLDataType type_hint, + Optional<String> mem_scope) { + this->Init(); + ICHECK(std::string(mem_scope.value()).find("texture") != std::string::npos) + << "Expect texture scope while creating an Image object"; + cl::BufferDescriptor* back_desc = static_cast<cl::BufferDescriptor*>(back_buffer); + cl_device_id device_id = GetCLDeviceID(dev.device_id); + auto platform = device_info[device_id].platform_id; + cl_int err_code; + cl_channel_type cl_type = DTypeToOpenCLChannelType(type_hint); + cl_image_format format = {CL_RGBA, cl_type}; + cl_image_desc descriptor = {CL_MEM_OBJECT_IMAGE2D, width, height, 0, 0, 0, 0, 0, 0}; - ICHECK(ndim > 2) << "Shape for texture allocation must be at least rank 3; " - << "provided shape is rank " << ndim; + if (IsBufferToImageSupported(dev.device_id)) { + descriptor.image_row_pitch = row_pitch; + descriptor.buffer = back_desc->buffer; + } + cl_mem mptr = clCreateImage(this->contexts[platform], CL_MEM_CREATE_FLAGS, &format, &descriptor, + nullptr, &err_code); + OPENCL_CHECK_ERROR(err_code); cl::BufferDescriptor* desc = new cl::BufferDescriptor(mem_scope); - size_t axis = DefaultTextureLayoutSeparator(ndim, mem_scope.value()); - auto texture = ApplyTexture2DFlattening<int64_t>(shape, ndim, axis); - desc->buffer = AllocTexture(dev, texture.width, texture.height, dtype); + desc->buffer = mptr; + desc->back_buffer = back_desc; + return desc; } +size_t OpenCLWorkspace::GetDataSize(const DLTensor& arr, Optional<String> mem_scope) { + if (!mem_scope.defined() || mem_scope.value().empty() || mem_scope.value() == "global") { + return DeviceAPI::GetDataSize(arr); + } + cl_uint row_align = GetImageAlignment(GetThreadEntry()->device.device_id); + std::vector<int64_t> shape; + shape.assign(arr.shape, arr.shape + arr.ndim); + return runtime::GetTextureMemorySize<std::vector<int64_t>>(shape, arr.dtype.bits, arr.dtype.lanes, + mem_scope.value(), row_align); +} + +void* OpenCLWorkspace::AllocDataSpaceView(Device dev, void* data, ShapeTuple shape, + DLDataType dtype, Optional<String> mem_scope) { + cl::BufferDescriptor* desc = static_cast<cl::BufferDescriptor*>(data); + + // Fall back for devices w/o "cl_khr_image2d_from_buffer" + if (!IsBufferToImageSupported(dev.device_id)) { + cl::BufferDescriptor* ret_desc = desc; // buffer -> buffer + if (!mem_scope.defined() || mem_scope.value().empty() || mem_scope.value() == "global") { + if (desc->layout != cl::BufferDescriptor::MemoryLayout::kBuffer1D) { + // image -> buffer + size_t nbytes = GetMemObjectSize(dev, shape.size(), shape.data(), dtype); + ret_desc = static_cast<cl::BufferDescriptor*>( + OpenCLWorkspace::AllocCLBuffer(dev, nbytes, kTempAllocaAlignment, dtype)); + ret_desc->is_compat_view = true; + } + } else { + // Any -> Image + size_t axis = DefaultTextureLayoutSeparator(shape.size(), mem_scope.value()); + auto texture = ApplyTexture2DFlattening<int64_t>(shape.data(), shape.size(), axis); + cl_uint row_align = GetImageAlignment(dev.device_id);
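+ // Rows of the rebuilt image hold texture.width CL_RGBA pixels (4 elements + // each), padded out to the device pitch alignment queried above.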
+ size_t pixel_size = (dtype.bits * dtype.lanes + 7) / 8; + size_t row_pitch = ALIGN_UP(texture.width * pixel_size * 4, row_align); // CL_RGBA = 4 + + ret_desc = static_cast<cl::BufferDescriptor*>(OpenCLWorkspace::Global()->AllocCLImage( + dev, nullptr, texture.width, texture.height, row_pitch, dtype, mem_scope)); + ret_desc->is_compat_view = true; + } + return ret_desc; + } + + if (!mem_scope.defined() || mem_scope.value().empty() || mem_scope.value() == "global") { + if (desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D) { + // buffer -> buffer + return desc; + } else { + // image -> buffer + return desc->back_buffer; + } + } + size_t axis = DefaultTextureLayoutSeparator(shape.size(), mem_scope.value()); + auto texture = ApplyTexture2DFlattening<int64_t>(shape.data(), shape.size(), axis); + cl_uint row_align = GetImageAlignment(dev.device_id); + size_t pixel_size = (dtype.bits * dtype.lanes + 7) / 8; + size_t row_pitch = ALIGN_UP(texture.width * pixel_size * 4, row_align); // CL_RGBA = 4 + + cl::BufferDescriptor* back_buffer; + if (desc->back_buffer) { + // image -> image + back_buffer = desc->back_buffer; + } else { + // buffer -> image + back_buffer = desc; + } + + return (cl::BufferDescriptor*)AllocCLImage(dev, back_buffer, texture.width, texture.height, + row_pitch, dtype, mem_scope); +} + +void OpenCLWorkspace::FreeDataSpaceView(Device dev, void* ptr) { + auto* desc = static_cast<cl::BufferDescriptor*>(ptr); + // Handle the fall back + if (!IsBufferToImageSupported(dev.device_id)) { + if (desc->is_compat_view) { + OPENCL_CALL(clReleaseMemObject(desc->buffer)); + delete desc; + } + return; + } + + if (desc->layout != cl::BufferDescriptor::MemoryLayout::kBuffer1D) { + OPENCL_CALL(clReleaseMemObject(desc->buffer)); + delete desc; + } +} + void* OpenCLWorkspace::GetNativePtr(const tvm::runtime::NDArray& narr) { cl::BufferDescriptor* desc = static_cast<cl::BufferDescriptor*>(narr.operator->()->data); return desc->host_ptr; @@ -286,9 +447,8 @@ void OpenCLWorkspace::SetNativePtr(const tvm::runtime::NDArray& narr, void* host #ifdef USE_OPENCL_EXTN_QCOM Device dev = narr.operator->()->device; cl_device_id device_id = GetCLDeviceID(dev.device_id); - auto platform = device_to_platform[device_id]; + auto platform = device_info[device_id].platform_id; - OPENCL_CALL(clFinish(this->GetQueue(dev))); if (desc->host_ptr) { OPENCL_CALL(clEnqueueUnmapMemObject(this->GetQueue(dev), desc->buffer, reinterpret_cast<void*>(desc->host_ptr), 0, nullptr, nullptr)); @@ -313,48 +473,35 @@ void OpenCLWorkspace::SetPerfHint(Device dev, cl_uint perf_hint) { #ifdef CL_CONTEXT_PERF_HINT_QCOM cl_device_id device_id = GetCLDeviceID(dev.device_id); - auto platform = device_to_platform[device_id]; + auto platform = device_info[device_id].platform_id; OPENCL_CALL(clSetPerfHintQCOM(this->contexts[platform], perf_hint)); #endif } void OpenCLWorkspace::FreeDataSpace(Device dev, void* ptr) { - // We have to make sure that the memory object is not in the command queue - // for some OpenCL platforms.
- OPENCL_CALL(clFinish(this->GetQueue(dev))); - cl::BufferDescriptor* desc = static_cast(ptr); - if (desc->host_ptr) { - OPENCL_CALL(clEnqueueUnmapMemObject(this->GetQueue(dev), desc->buffer, - reinterpret_cast(desc->host_ptr), 0, nullptr, - nullptr)); + if (desc->back_buffer) { + // 2D Image w/ back buffer allocated from pool + OPENCL_CALL(clReleaseMemObject(desc->buffer)); + MemoryManager::GetAllocator(dev, desc->back_buffer->mbuf.alloc_type) + ->Free(desc->back_buffer->mbuf); + delete desc; + } else { + if (desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D) { + // 1D buffer allocated from pool + if (desc->host_ptr) { + clEnqueueUnmapMemObject(this->GetQueue(dev), desc->buffer, + reinterpret_cast(desc->host_ptr), 0, nullptr, nullptr); + } + OPENCL_CALL(clReleaseMemObject(desc->buffer)); + delete desc; + } else if (!IsBufferToImageSupported(dev.device_id)) { + // 2D Image allocated w/o pool + OPENCL_CALL(clReleaseMemObject(desc->buffer)); + delete desc; + return; + } } - OPENCL_CALL(clReleaseMemObject(desc->buffer)); - delete desc; -} - -cl_mem OpenCLWorkspace::AllocTexture(Device dev, size_t width, size_t height, - DLDataType type_hint) { - this->Init(); - cl_device_id device_id = GetCLDeviceID(dev.device_id); - auto platform = device_to_platform[device_id]; - cl_int err_code; - cl_channel_type cl_type = DTypeToOpenCLChannelType(type_hint); - cl_image_format format = {CL_RGBA, cl_type}; - cl_image_desc descriptor = {CL_MEM_OBJECT_IMAGE2D, width, height, 0, 0, 0, 0, 0, 0}; - cl_mem mptr = clCreateImage(this->contexts[platform], CL_MEM_READ_WRITE, &format, &descriptor, - nullptr, &err_code); - OPENCL_CHECK_ERROR(err_code); - return mptr; -} - -void* OpenCLWorkspace::AllocTextureWorkspace(Device dev, size_t width, size_t height, - DLDataType type_hint) { - return GetThreadEntry()->texture_pool.AllocTexture(dev, width, height, type_hint); -} - -void OpenCLWorkspace::FreeTextureWorkspace(Device dev, void* ptr) { - GetThreadEntry()->texture_pool.FreeTexture(dev, ptr); } void OpenCLWorkspace::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) { @@ -444,11 +591,18 @@ void OpenCLWorkspace::StreamSync(Device dev, TVMStreamHandle stream) { } void* OpenCLWorkspace::AllocWorkspace(Device dev, size_t size, DLDataType type_hint) { - return GetThreadEntry()->pool.AllocWorkspace(dev, size); + this->Init(); + cl::BufferDescriptor* ret_buffer = nullptr; + auto buf = MemoryManager::GetOrCreateAllocator(dev, AllocatorType::kPooled) + ->Alloc(dev, size, kTempAllocaAlignment, type_hint); + ret_buffer = static_cast(buf.data); + ret_buffer->mbuf = buf; + return ret_buffer; } void OpenCLWorkspace::FreeWorkspace(Device dev, void* data) { - GetThreadEntry()->pool.FreeWorkspace(dev, data); + cl::BufferDescriptor* desc = static_cast(data); + MemoryManager::GetAllocator(dev, desc->mbuf.alloc_type)->Free(desc->mbuf); } typedef dmlc::ThreadLocalStore OpenCLThreadStore; @@ -585,9 +739,20 @@ void OpenCLWorkspace::Init(const std::string& type_key, const std::string& devic this->devices.insert(this->devices.end(), devices.begin(), devices.end()); for (size_t i = 0; i < devices.size(); ++i) { cl_device_id did = devices[i]; - device_to_platform[did] = platform; + CLDeviceInfo dev_info; + dev_info.platform_id = platform; this->queues.push_back(clCreateCommandQueue(this->contexts[platform], did, 0, &err_code)); OPENCL_CHECK_ERROR(err_code); + cl_uint row_pitch; + OPENCL_CALL(clGetDeviceInfo(did, CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR, sizeof(row_pitch), + &row_pitch, nullptr)); + if (0 == row_pitch) { 
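+ // Some drivers report a zero pitch alignment when images are unsupported; + // fall back to the default allocation alignment in that case.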
+ row_pitch = kAllocAlignment; // Fallback + } + dev_info.image_row_align = row_pitch; + dev_info.image_from_buffer_support = + IsOpenCLExtensionSupported(did, "cl_khr_image2d_from_buffer"); + device_info.insert({did, dev_info}); } OPENCL_CHECK_ERROR(err_code); } @@ -617,9 +782,9 @@ TVM_REGISTER_GLOBAL("device_api.opencl.alloc_nd").set_body([](TVMArgs args, TVMR type_hint.bits = static_cast<uint8_t>(dtype_bits_hint); type_hint.lanes = 1; - OpenCLWorkspace* ptr = OpenCLWorkspace::Global(); - *rv = ptr->AllocTextureWorkspace(dev, static_cast<size_t>(width), static_cast<size_t>(height), - type_hint); + *rv = OpenCLWorkspace::Global()->AllocDataSpace(dev, static_cast<size_t>(width), + static_cast<size_t>(height), type_hint, + Optional<String>("global.texture")); }); TVM_REGISTER_GLOBAL("device_api.opencl.free_nd").set_body([](TVMArgs args, TVMRetValue* rv) { @@ -632,7 +797,7 @@ TVM_REGISTER_GLOBAL("device_api.opencl.free_nd").set_body([](TVMArgs args, TVMRe Device dev; dev.device_type = static_cast<DLDeviceType>(device_type); dev.device_id = device_id; - ptr->FreeTextureWorkspace(dev, data); + ptr->FreeDataSpace(dev, data); *rv = static_cast<int32_t>(0); }); @@ -647,6 +812,92 @@ TVM_REGISTER_GLOBAL("profiling.timer.opencl").set_body_typed([](Device dev) { return Timer(make_object<OpenCLTimerNode>(dev)); }); +class OpenCLPooledAllocator final : public memory::PooledAllocator { + public: + explicit OpenCLPooledAllocator() : PooledAllocator() {} + + bool AllowMemoryScope(const std::string& mem_scope) const final { + return ((mem_scope.find("texture") != std::string::npos) || mem_scope.empty() || + ("global" == mem_scope)); + } + + Buffer Alloc(Device dev, size_t nbytes, size_t alignment, DLDataType type_hint) override { + std::lock_guard<std::mutex> lock(mu_); + size_t size = ((nbytes + page_size_ - 1) / page_size_) * page_size_; + auto&& it = memory_pool_.find(size); + if (it != memory_pool_.end() && !it->second.empty()) { + auto&& pool = it->second; + auto ret = pool.back(); + pool.pop_back(); + return ret; + } + Buffer buf; + buf.device = dev; + buf.size = size; + buf.alloc_type = AllocatorType::kPooled; + try { + buf.data = DeviceAllocDataSpace(dev, size, alignment, type_hint); + } catch (InternalError& err) { + LOG(WARNING) << "PooledAllocator got InternalError during allocation: " << err.message(); + LOG(WARNING) << "Trying to release all unused memory and reallocate..."; + ReleaseAll(); + buf.data = DeviceAllocDataSpace(dev, size, alignment, type_hint); + } + + used_memory_.fetch_add(size, std::memory_order_relaxed); + VLOG(1) << "allocate " << size << " B, used memory " << used_memory_ << " B"; + return buf; + } + + Buffer Alloc(Device dev, ShapeTuple shape, DLDataType type_hint, + const std::string& mem_scope) override { + if (AllowMemoryScope(mem_scope)) { + NDArray::Container container(nullptr, shape, type_hint, dev); + size_t size = DeviceAPI::Get(dev)->GetDataSize(container.dl_tensor); + Buffer buf; + buf.device = dev; + buf.size = size; + buf.alloc_type = AllocatorType::kPooled; + buf.data = DeviceAPI::Get(dev)->AllocDataSpace(dev, shape.size(), shape.data(), type_hint, + String(mem_scope)); + if (mem_scope.find("texture") == std::string::npos) { + // All textures are backed by buffers - don't count in total memory + used_memory_.fetch_add(size, std::memory_order_relaxed); + } + DLOG(INFO) << "allocate " << size << " B, used memory " << used_memory_ << " B"; + return buf; + } + LOG(FATAL) << "Unsupported memory scope for this Allocator: " << mem_scope; + return {}; + } + + void Free(const Buffer& buffer) override { + std::lock_guard<std::mutex> lock(mu_); + if (memory_pool_.find(buffer.size) == memory_pool_.end()) { + memory_pool_.emplace(buffer.size, std::vector<Buffer>{}); + } + memory_pool_.at(buffer.size).push_back(buffer); + VLOG(1) << "reclaim buffer " << buffer.size; + } + + void* CreateView(const Buffer& buffer, ShapeTuple shape, DLDataType type_hint, + const std::string& mem_scope) final { + OpenCLWorkspace* ws_ = OpenCLWorkspace::Global(); + return ws_->AllocDataSpaceView(buffer.device, buffer.data, shape, type_hint, + Optional<String>(mem_scope)); + } + + void FreeView(Device dev, void* data) final { + OpenCLWorkspace* ws_ = OpenCLWorkspace::Global(); + return ws_->FreeDataSpaceView(dev, data); + } +}; + +TVM_REGISTER_GLOBAL("DeviceAllocator.opencl").set_body([](TVMArgs args, TVMRetValue* rv) { + Allocator* alloc = new OpenCLPooledAllocator(); + *rv = static_cast<void*>(alloc); +}); + } // namespace cl size_t OpenCLTimerNode::count_timer_execs = 0; std::vector<size_t> OpenCLTimerNode::event_start_idxs; diff --git a/src/runtime/opencl/opencl_module.cc b/src/runtime/opencl/opencl_module.cc index 567b7ad88a9e..77c50b23895c 100644 --- a/src/runtime/opencl/opencl_module.cc +++ b/src/runtime/opencl/opencl_module.cc @@ -225,7 +225,7 @@ cl_kernel OpenCLModuleNode::InstallKernel(cl::OpenCLWorkspace* w, cl::OpenCLThre std::lock_guard<std::mutex> lock(build_lock_); int device_id = t->device.device_id; auto did = w->GetCLDeviceID(device_id); - auto platform = w->device_to_platform[did]; + auto platform = w->device_info[did].platform_id; if (!IsProgramCreated(func_name, device_id)) { // create program if (fmt_ == "cl") { @@ -294,7 +294,7 @@ void OpenCLModuleNode::SetPreCompiledPrograms(const std::string& bytes) { const unsigned char* programBinary = bin_vector.data(); cl_device_id dev = workspace_->GetCLDeviceID(device_id); - auto platform = workspace_->device_to_platform[dev]; + auto platform = workspace_->device_info[dev].platform_id; programs_[name][device_id] = clCreateProgramWithBinary(workspace_->contexts[platform], 1, &dev, &binarySize, &programBinary, &binaryStatus, &err); diff --git a/src/runtime/opencl/opencl_module_spirv.cc b/src/runtime/opencl/opencl_module_spirv.cc index 7e52b7057bc7..28e02a4e3749 100644 --- a/src/runtime/opencl/opencl_module_spirv.cc +++ b/src/runtime/opencl/opencl_module_spirv.cc @@ -96,7 +96,7 @@ cl_kernel OpenCLSPIRVModuleNode::InstallKernel(cl::OpenCLWorkspace* w, cl::OpenC size_t len = it->second.data.size() * sizeof(uint32_t); cl_int err; cl_device_id dev = w->devices[device_id]; - auto platform = w->device_to_platform[dev]; + auto platform = w->device_info[dev].platform_id; programs_[func_name][device_id] = clCreateProgramWithBinary(w->contexts[platform], 1, &dev, &len, &s, nullptr, &err); OPENCL_CHECK_ERROR(err); diff --git a/src/runtime/opencl/texture_pool.cc b/src/runtime/opencl/texture_pool.cc deleted file mode 100644 index 0b9477f2d4ea..000000000000 --- a/src/runtime/opencl/texture_pool.cc +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License.
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file texture_pool.h - * \brief Texture pool utility. - */ -#include -#include - -#include "../texture.h" - -namespace tvm { -namespace runtime { - -void* Pool2D::Alloc(Device dev, DeviceAPI* device, size_t width, size_t height, - DLDataType type_hint) { - Entry e; - Entry new_mem; - // Processed several experiments and found that when we are trying to fit - // small texture to too big texture then it may lead to the performance - // degradation. - // Coefficient at 5 looks like robust variant for reusing textures. - const int64_t max_ratio = 5; - e.data = nullptr; - std::vector::iterator best_mem; - if (free_list_.size() != 0) { - int64_t min_added_size_x = std::numeric_limits::max(); - int64_t min_added_size_y = std::numeric_limits::max(); - int64_t min_wasted_size_x = std::numeric_limits::max(); - int64_t min_wasted_size_y = std::numeric_limits::max(); - for (auto it = free_list_.begin(); it != free_list_.end(); ++it) { - if (it->type.code != type_hint.code) { - continue; - } - // avoid reusing too small and too big textures - if (width / it->x > max_ratio || it->x / width > max_ratio || height / it->y > max_ratio || - it->y / height > max_ratio) { - continue; - } - int64_t new_width = std::max(it->x, width); - int64_t new_height = std::max(it->y, height); - int64_t added_size_x = new_width - it->x; - int64_t added_size_y = new_height - it->y; - int64_t wasted_size_x = new_width - width; - int64_t wasted_size_y = new_height - height; - // Minimize added size first and wasted size thereafter - if ((min_added_size_x > 0 && added_size_x < min_added_size_x) || - (min_added_size_y > 0 && added_size_y < min_added_size_y) || - (min_added_size_x == added_size_x && wasted_size_x < min_wasted_size_x) || - (min_added_size_y == added_size_y && wasted_size_y < min_wasted_size_y)) { - min_added_size_x = added_size_x; - min_added_size_y = added_size_y; - min_wasted_size_x = wasted_size_x; - min_wasted_size_y = wasted_size_y; - best_mem = it; - new_mem.x = new_width; - new_mem.y = new_height; - } - } - - if (min_added_size_x == 0 && min_added_size_y == 0) { - // use existing block - e = *best_mem; - free_list_.erase(best_mem); - } else if (static_cast(min_added_size_x) <= width || - static_cast(min_added_size_y) <= height) { - // if added size is less or equal to - // what is needed by alloc, then grow entry - device->FreeDataSpace(dev, best_mem->data); - free_list_.erase(best_mem); - new_mem.type = type_hint; - std::vector shape{int64_t(new_mem.y), int64_t(new_mem.x), 4}; - new_mem.data = device->AllocDataSpace(dev, shape.size(), shape.data(), new_mem.type, - Optional("global.texture")); - e = new_mem; - } - } - - if (e.data == nullptr) { - // create new block - std::vector shape{int64_t(height), int64_t(width), 4}; - e.data = device->AllocDataSpace(dev, shape.size(), shape.data(), type_hint, - Optional("global.texture")); - e.x = width; - e.y = height; - e.type = type_hint; - } - - allocated_.push_back(e); - return e.data; -} - -void Pool2D::Free(void* data) { - Entry e; - if (allocated_.back().data == data) { - // quick path, last allocated. 
- e = allocated_.back(); - allocated_.pop_back(); - } else { - int index = static_cast(allocated_.size()) - 2; - for (; index >= 0 && allocated_[index].data != data; --index) { - } - ICHECK_GE(index, 0) << "Attempt to free texture that has not been allocated"; - e = allocated_[index]; - allocated_.erase(allocated_.begin() + index); - } - free_list_.push_back(e); -} - -// Release all resources immediately -void Pool2D::Release(Device dev, DeviceAPI* device) { - for (auto& e : allocated_) { - device->FreeDataSpace(dev, e.data); - } - for (auto& e : free_list_) { - device->FreeDataSpace(dev, e.data); - } - allocated_.clear(); - free_list_.clear(); -} - -TexturePool::TexturePool(DLDeviceType device_type, DeviceAPI* device) - : device_type_(device_type), device_(device) {} - -TexturePool::~TexturePool() { - for (size_t i = 0; i < array_.size(); ++i) { - if (array_[i] != nullptr) { - Device dev; - dev.device_type = device_type_; - dev.device_id = static_cast(i); - array_[i]->Release(dev, device_); - delete array_[i]; - } - } -} - -void* TexturePool::AllocTexture(Device dev, size_t width, size_t height, DLDataType type_hint) { - if (static_cast(dev.device_id) >= array_.size()) { - array_.resize(dev.device_id + 1, nullptr); - } - if (array_[dev.device_id] == nullptr) { - array_[dev.device_id] = new Pool2D(); - } - return array_[dev.device_id]->Alloc(dev, device_, width, height, type_hint); -} - -void TexturePool::FreeTexture(Device dev, void* ptr) { - ICHECK(static_cast(dev.device_id) < array_.size() && array_[dev.device_id] != nullptr) - << "Attempt to free texture from null texture pool"; - array_[dev.device_id]->Free(ptr); -} - -} // namespace runtime -} // namespace tvm diff --git a/src/runtime/rocm/rocm_device_api.cc b/src/runtime/rocm/rocm_device_api.cc index ebfd312595a3..e7f103daadc9 100644 --- a/src/runtime/rocm/rocm_device_api.cc +++ b/src/runtime/rocm/rocm_device_api.cc @@ -136,7 +136,8 @@ class ROCMDeviceAPI final : public DeviceAPI { *rv = total_global_memory; return; } - + case kImagePitchAlignment: + return; case kAvailableGlobalMemory: // Not currently implemented. *rv = nullptr; diff --git a/src/runtime/texture.h b/src/runtime/texture.h index dc38101f0cd4..f3a827aa8792 100644 --- a/src/runtime/texture.h +++ b/src/runtime/texture.h @@ -30,6 +30,8 @@ #include #include +#define ALIGN_UP(num, align) (((num) + ((align)-1)) & ~((align)-1)) + namespace tvm { namespace runtime { @@ -94,74 +96,26 @@ inline bool IsTextureStorage(std::string scope) { return scope.find("texture") != std::string::npos; } -class TVM_DLL Pool2D { - public: - Pool2D() = default; - void* Alloc(Device dev, DeviceAPI* device, size_t width, size_t height, DLDataType type_hint); - void Free(void* data); - // Release all resources immediately - void Release(Device dev, DeviceAPI* device); - - protected: - struct Entry { - void* data; - size_t x; - size_t y; - DLDataType type; - }; - std::vector free_list_; - std::vector allocated_; -}; - /*! - * \brief A two dimensional storage pool that recycles temporal workspace - * allocations for dynamically allocated texture. See AllocTexture docstring - * for approach to allocation and reuse. + * \brief Returns the physical backing memory size required for given specification + * \param shape shape of tensor + * \param bits dtype bits + * \param lanes vectorization lanes + * \param mem_scope the memory scope info + * \param image_row_align image rowwise alignment size + * \return returns the backing memory size */ -class TVM_DLL TexturePool { - public: - /*! 
- * \brief Create pool with specific device type and device. - * \param device_type The device type. - * \param device_api The device API. - */ - TexturePool(DLDeviceType device_type, DeviceAPI* device_api); - /*! \brief destructor */ - ~TexturePool(); - - /*! - * \brief Allocate a two dimensional temporal texture workspace on device - * - * \note Two dimensional texture workspaces will be grown and reused - * according to the following strategy: - * - Choose the workspace which minimizes the amount of memory required to - * grow the workspace to fit the request. - * - If a set of workspaces exist that fit the current request without - * expansion, choose the workspace of that set which most closely - * matches the request size, minimizing wasted space. - * - * \param dev The context of allocation. - * \param width The width of the 2d texture to be allocated. - * \param height The height of the 2d texture to be allocated. - * \param type_hint The type of elements. - */ - void* AllocTexture(Device dev, size_t width, size_t height, DLDataType type_hint); - /*! - * \brief Free temporal texture in backend execution. - * - * \param dev The context of allocation. - * \param ptr The pointer to be freed. - */ - void FreeTexture(Device dev, void* ptr); +template <typename T> +size_t GetTextureMemorySize(T shape, int bits, int lanes, std::string mem_scope, + int image_row_align) { + size_t axis = DefaultTextureLayoutSeparator(shape.size(), mem_scope); + auto tshape = ApplyTexture2DFlattening(shape, shape.size(), axis); - private: - /*! \brief pool of device local array */ - std::vector<Pool2D*> array_; - /*! \brief device type this pool support */ - DLDeviceType device_type_; - /*! \brief The device API */ - DeviceAPI* device_; -}; + auto pack_size = shape[shape.size() - 1]; + auto pixel_size = (bits * lanes + 7) / 8; + size_t row_pitch = ALIGN_UP(tshape.width * pixel_size * pack_size, image_row_align); + return row_pitch * tshape.height; +} } // namespace runtime } // namespace tvm diff --git a/src/runtime/vulkan/vulkan_device_api.cc b/src/runtime/vulkan/vulkan_device_api.cc index 483668a2a75f..af7b35e85ec5 100644 --- a/src/runtime/vulkan/vulkan_device_api.cc +++ b/src/runtime/vulkan/vulkan_device_api.cc @@ -168,11 +168,12 @@ void VulkanDeviceAPI::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) *rv = device(index).compute_memory_size; return; } - case kAvailableGlobalMemory: // Not currently implemented. Will only be implementable for // devices that support the VK_EXT_memory_budget extension.
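
For intuition, the value GetTextureMemorySize returns can be checked by hand. A worked example for the {1, 22, 20, 20, 4} float32 token used in the allocator tests further down, taking a 64-byte image pitch alignment as an illustrative value:

// scope "global.texture-nhwc" flattens to height = 22, width = 20 * 20 = 400,
// with pack_size = 4 (the trailing dimension) and pixel_size = (32 * 1 + 7) / 8 = 4 bytes.
size_t row_pitch = ALIGN_UP(400 * 4 * 4, 64);  // ALIGN_UP(6400, 64) = 6400, already aligned
size_t total = row_pitch * 22;                 // 140800 bytes, matching EXPECT_EQ(size2d, 140800)
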
break; + case kImagePitchAlignment: + return; } } diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 5933c9582cec..b447c0729746 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -674,5 +674,19 @@ runtime::Module BuildOpenCL(IRModule mod, Target target) { } TVM_REGISTER_GLOBAL("target.build.opencl").set_body_typed(BuildOpenCL); + +String DeviceScopeCompatibilityFromTarget(Target target, String memory_scope) { + auto prototype_keys = target->GetKeys(); + bool is_adreno = + std::find(prototype_keys.begin(), prototype_keys.end(), "adreno") != prototype_keys.end(); + if (is_adreno) { + return String("global"); + } + return memory_scope; +} + +TVM_REGISTER_GLOBAL("DeviceScopeCompatibility.opencl") + .set_body_typed(DeviceScopeCompatibilityFromTarget); + } // namespace codegen } // namespace tvm diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc index e0a0ad23a1b6..e12c18e5ac73 100644 --- a/src/target/target_kind.cc +++ b/src/target/target_kind.cc @@ -366,6 +366,7 @@ TVM_REGISTER_TARGET_KIND("opencl", kDLOpenCL) // specify any limitations on the number of kernel arguments. max_function_args // equals to 128 looks like a reasonable number of kernel arguments. .add_attr_option<runtime::Int>("max_function_args", runtime::Int(128)) + .add_attr_option<runtime::Int>("image_base_address_alignment", runtime::Int(64)) .set_default_keys({"opencl", "gpu"}); // The metal has some limitations on the number of input parameters. This is why attribute diff --git a/tests/cpp-runtime/opencl/opencl_texture_pool_test.cc b/tests/cpp-runtime/opencl/opencl_texture_pool_test.cc deleted file mode 100644 index 2d3f43ddce6d..000000000000 --- a/tests/cpp-runtime/opencl/opencl_texture_pool_test.cc +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include -#include - -#include "../src/runtime/opencl/opencl_common.h" -#include "../src/runtime/texture.h" - -using namespace tvm::runtime; -using namespace tvm::runtime::cl; - -// PoolWrapper is necessary because in class Pool2D we don't have an access to
In this class we add new methods which allow us to -// get and check internal state of class Pool -class PoolWrapper : public Pool2D { - public: - inline size_t FreeListSize() const { return free_list_.size(); } - inline size_t AllocatedListSize() const { return allocated_.size(); } - inline std::pair FreeListItemSize(size_t idx) const { - return std::make_pair(free_list_[idx].x, free_list_[idx].y); - } - inline std::pair AllocatedListItemSize(size_t idx) const { - return std::make_pair(allocated_[idx].x, allocated_[idx].y); - } -}; - -TEST(OpenCLTexturePool, textures_reallocation_optimal_size) { - OpenCLWorkspace* workspace = OpenCLWorkspace::Global(); - OpenCLThreadEntry* t = workspace->GetThreadEntry(); - PoolWrapper pool; - EXPECT_EQ(pool.AllocatedListSize(), 0); - EXPECT_EQ(pool.FreeListSize(), 0); - - DLDataType type{kDLFloat, 16, 1}; - void* data1 = pool.Alloc(t->device, workspace, 1024, 768, type); - EXPECT_EQ(pool.AllocatedListSize(), 1); - EXPECT_EQ(pool.FreeListSize(), 0); - auto item = pool.AllocatedListItemSize(0); - EXPECT_EQ(item.first, 1024); - EXPECT_EQ(item.second, 768); - - pool.Alloc(t->device, workspace, 64, 12455, type); - EXPECT_EQ(pool.AllocatedListSize(), 2); - EXPECT_EQ(pool.FreeListSize(), 0); - item = pool.AllocatedListItemSize(1); - EXPECT_EQ(item.first, 64); - EXPECT_EQ(item.second, 12455); - - pool.Free(data1); - EXPECT_EQ(pool.AllocatedListSize(), 1); - EXPECT_EQ(pool.FreeListSize(), 1); - item = pool.AllocatedListItemSize(0); - EXPECT_EQ(item.first, 64); - EXPECT_EQ(item.second, 12455); - item = pool.FreeListItemSize(0); - EXPECT_EQ(item.first, 1024); - EXPECT_EQ(item.second, 768); - - pool.Alloc(t->device, workspace, 768, 1024, type); - EXPECT_EQ(pool.AllocatedListSize(), 2); - EXPECT_EQ(pool.FreeListSize(), 0); - item = pool.AllocatedListItemSize(0); - EXPECT_EQ(item.first, 64); - EXPECT_EQ(item.second, 12455); - item = pool.AllocatedListItemSize(1); - EXPECT_EQ(item.first, 1024); - EXPECT_EQ(item.second, 1024); -} - -TEST(OpenCLTexturePool, avoid_reusing_too_big_textures) { - OpenCLWorkspace* workspace = OpenCLWorkspace::Global(); - OpenCLThreadEntry* t = workspace->GetThreadEntry(); - PoolWrapper pool; - EXPECT_EQ(pool.AllocatedListSize(), 0); - EXPECT_EQ(pool.FreeListSize(), 0); - - DLDataType type{kDLFloat, 16, 1}; - void* data1 = pool.Alloc(t->device, workspace, 12455, 64, type); - EXPECT_EQ(pool.AllocatedListSize(), 1); - EXPECT_EQ(pool.FreeListSize(), 0); - auto item = pool.AllocatedListItemSize(0); - EXPECT_EQ(item.first, 12455); - EXPECT_EQ(item.second, 64); - - pool.Free(data1); - EXPECT_EQ(pool.AllocatedListSize(), 0); - EXPECT_EQ(pool.FreeListSize(), 1); - item = pool.FreeListItemSize(0); - EXPECT_EQ(item.first, 12455); - EXPECT_EQ(item.second, 64); - - pool.Alloc(t->device, workspace, 1024, 768, type); - EXPECT_EQ(pool.AllocatedListSize(), 1); - EXPECT_EQ(pool.FreeListSize(), 1); - item = pool.FreeListItemSize(0); - EXPECT_EQ(item.first, 12455); - EXPECT_EQ(item.second, 64); - item = pool.AllocatedListItemSize(0); - EXPECT_EQ(item.first, 1024); - EXPECT_EQ(item.second, 768); -} - -TEST(OpenCLTexturePool, avoid_reusing_too_small_textures) { - OpenCLWorkspace* workspace = OpenCLWorkspace::Global(); - OpenCLThreadEntry* t = workspace->GetThreadEntry(); - PoolWrapper pool; - EXPECT_EQ(pool.AllocatedListSize(), 0); - EXPECT_EQ(pool.FreeListSize(), 0); - - DLDataType type{kDLFloat, 16, 1}; - void* data1 = pool.Alloc(t->device, workspace, 1024, 64, type); - EXPECT_EQ(pool.AllocatedListSize(), 1); - EXPECT_EQ(pool.FreeListSize(), 0); - auto item = 
pool.AllocatedListItemSize(0); - EXPECT_EQ(item.first, 1024); - EXPECT_EQ(item.second, 64); - - pool.Free(data1); - EXPECT_EQ(pool.AllocatedListSize(), 0); - EXPECT_EQ(pool.FreeListSize(), 1); - item = pool.FreeListItemSize(0); - EXPECT_EQ(item.first, 1024); - EXPECT_EQ(item.second, 64); - - pool.Alloc(t->device, workspace, 12544, 64, type); - EXPECT_EQ(pool.AllocatedListSize(), 1); - EXPECT_EQ(pool.FreeListSize(), 1); - item = pool.FreeListItemSize(0); - EXPECT_EQ(item.first, 1024); - EXPECT_EQ(item.second, 64); - item = pool.AllocatedListItemSize(0); - EXPECT_EQ(item.first, 12544); - EXPECT_EQ(item.second, 64); -} diff --git a/tests/cpp-runtime/opencl/opencl_timer_test.cc b/tests/cpp-runtime/opencl/opencl_timer_test.cc index 1753300d3a09..ec038be5406c 100644 --- a/tests/cpp-runtime/opencl/opencl_timer_test.cc +++ b/tests/cpp-runtime/opencl/opencl_timer_test.cc @@ -37,7 +37,7 @@ TEST(OpenCLTimerNode, nested_timers) { int64_t nested_time_sum = 0; auto did = workspace->GetCLDeviceID(thr->device.device_id); - auto platform = workspace->device_to_platform[did]; + auto platform = workspace->device_info[did].platform_id; Timer init_timer = Timer::Start(thr->device); for (int i = 0; i < NUM_REPEAT; ++i) { Timer nested_timer = Timer::Start(thr->device); diff --git a/tests/cpp-runtime/opencl/texture_copy_test.cc b/tests/cpp-runtime/opencl/texture_copy_test.cc new file mode 100644 index 000000000000..23b490f695e2 --- /dev/null +++ b/tests/cpp-runtime/opencl/texture_copy_test.cc @@ -0,0 +1,322 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include +#include +#include + +#include +#include + +#include "../src/runtime/opencl/opencl_common.h" + +using tvm::runtime::kAllocAlignment; +using tvm::runtime::memory::AllocatorType; +using tvm::runtime::memory::Buffer; +using tvm::runtime::memory::MemoryManager; +using tvm::runtime::memory::Storage; + +class TextureCopyTest : public ::testing::Test { + protected: + void SetUp() override { + bool enabled = tvm::runtime::RuntimeEnabled("opencl"); + if (!enabled) { + GTEST_SKIP() << "Skip texture copy test because opencl runtime is disabled.\n"; + } + // Check hardware support + tvm::runtime::cl::OpenCLWorkspace* workspace = tvm::runtime::cl::OpenCLWorkspace::Global(); + tvm::runtime::cl::OpenCLThreadEntry* thr = workspace->GetThreadEntry(); + if (!workspace->IsBufferToImageSupported(thr->device.device_id)) { + GTEST_SKIP() << "Skip test case as BufferToImage is not supported \n"; + } + (void)tvm::runtime::memory::MemoryManager::GetOrCreateAllocator( + thr->device, tvm::runtime::memory::AllocatorType::kPooled); + } +}; + +TEST(TextureCopy, HostDeviceRT) { + using namespace tvm; + bool enabled = tvm::runtime::RuntimeEnabled("opencl"); + if (!enabled) { + GTEST_SKIP() << "Skip texture copy test because opencl runtime is disabled.\n"; + } + tvm::runtime::cl::OpenCLWorkspace* workspace = tvm::runtime::cl::OpenCLWorkspace::Global(); + tvm::runtime::cl::OpenCLThreadEntry* thr = workspace->GetThreadEntry(); + (void)tvm::runtime::memory::MemoryManager::GetOrCreateAllocator( + thr->device, tvm::runtime::memory::AllocatorType::kPooled); + std::vector shape{16, 16, 4}; + auto cpu_arr0 = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + auto cpu_arr1 = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + String mem_scope = "global.texture"; + auto opencl_txarr0 = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLOpenCL, 0}, mem_scope); + + size_t size = 1; + for (size_t i = 0; i < shape.size(); ++i) { + size *= static_cast(shape[i]); + } + + std::random_device dev; + std::mt19937 mt(dev()); + std::uniform_real_distribution<> random(-10.0, 10.0); + + // Random initialize host ndarray + for (size_t i = 0; i < size; i++) { + static_cast(cpu_arr0->data)[i] = random(mt); + } + + // Do a roundtrip from host storage to opencl texture storage and back + cpu_arr0.CopyTo(opencl_txarr0); + opencl_txarr0.CopyTo(cpu_arr1); + for (size_t i = 0; i < size; ++i) { + ICHECK_LT( + std::fabs(static_cast(cpu_arr1->data)[i] - static_cast(cpu_arr0->data)[i]), + 1e-5); + } +} + +TEST_F(TextureCopyTest, ViewBufferAsBuffer) { + using namespace tvm; + std::vector shape{1, 16, 16, 8}; + std::vector same_shape{1, 8, 16, 16}; + auto cpu_arr = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + auto cpu_arr_ret = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + + String mem_scope = "global"; + + DLDevice cl_dev = {kDLOpenCL, 0}; + auto allocator = MemoryManager::GetOrCreateAllocator(cl_dev, AllocatorType::kPooled); + auto buffer = allocator->Alloc(cl_dev, ShapeTuple(shape), {kDLFloat, 32, 1}); + auto stor = Storage(buffer, allocator); + + auto opencl_memobj = stor->AllocNDArrayScoped(0, ShapeTuple(shape), {kDLFloat, 32, 1}, mem_scope); + auto opencl_memview = + stor->AllocNDArrayScoped(0, ShapeTuple(same_shape), {kDLFloat, 32, 1}, mem_scope); + + std::random_device dev; + std::mt19937 mt(dev()); + std::uniform_real_distribution<> random(-10.0, 10.0); + + size_t size = 1; + for (size_t i = 0; i < shape.size(); ++i) { + size *= static_cast(shape[i]); + } + + /* 
Check original object round trip */ + // Random initialize host pool storage + for (size_t i = 0; i < size; i++) { + static_cast(cpu_arr->data)[i] = random(mt); + } + // Copy to OpenCLBuffer + cpu_arr.CopyTo(opencl_memobj); + // Copy from OpenCLBuffer + opencl_memobj.CopyTo(cpu_arr_ret); + for (size_t i = 0; i < size; i++) { + ICHECK_LT(std::fabs(static_cast(cpu_arr->data)[i] - + static_cast(cpu_arr_ret->data)[i]), + 1e-5); + } + + /* Check view object round trip */ + // Random initialize host pool storage + for (size_t i = 0; i < size; i++) { + static_cast(cpu_arr->data)[i] = random(mt); + } + // Copy to OpenCLBuffer + cpu_arr.CopyTo(opencl_memview); + // Copy from OpenCLBuffer + opencl_memview.CopyTo(cpu_arr_ret); + for (size_t i = 0; i < size; i++) { + ICHECK_LT(std::fabs(static_cast(cpu_arr->data)[i] - + static_cast(cpu_arr_ret->data)[i]), + 1e-5); + } +} + +TEST_F(TextureCopyTest, ViewBufferAsImage) { + using namespace tvm; + // Shape that doesn't cause padding for image row + std::vector shape{1, 16, 16, 8, 4}; + std::vector same_shape{1, 8, 16, 16, 4}; + auto cpu_arr = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + auto cpu_arr_ret = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + + DLDevice cl_dev = {kDLOpenCL, 0}; + auto allocator = MemoryManager::GetOrCreateAllocator(cl_dev, AllocatorType::kPooled); + auto buffer = allocator->Alloc(cl_dev, ShapeTuple(shape), {kDLFloat, 32, 1}); + auto stor = Storage(buffer, allocator); + + auto opencl_buf_obj = stor->AllocNDArrayScoped(0, ShapeTuple(shape), {kDLFloat, 32, 1}, "global"); + auto opencl_img_obj = + stor->AllocNDArrayScoped(0, ShapeTuple(same_shape), {kDLFloat, 32, 1}, "global.texture"); + + std::random_device dev; + std::mt19937 mt(dev()); + std::uniform_real_distribution<> random(-10.0, 10.0); + + size_t size = 1; + for (size_t i = 0; i < shape.size(); ++i) { + size *= static_cast(shape[i]); + } + + /* Check original object round trip */ + // Random initialize host pool storage + for (size_t i = 0; i < size; i++) { + static_cast(cpu_arr->data)[i] = random(mt); + } + // Copy to OpenCLBuffer + cpu_arr.CopyTo(opencl_buf_obj); + // Copy from OpenCLBuffer + opencl_buf_obj.CopyTo(cpu_arr_ret); + for (size_t i = 0; i < size; i++) { + ICHECK_LT(std::fabs(static_cast(cpu_arr->data)[i] - + static_cast(cpu_arr_ret->data)[i]), + 1e-5); + } + + /* Check view object round trip */ + // Random initialize host pool storage + for (size_t i = 0; i < size; i++) { + static_cast(cpu_arr->data)[i] = random(mt); + } + // Copy to OpenCLBuffer + cpu_arr.CopyTo(opencl_img_obj); + // Copy from OpenCLBuffer + opencl_img_obj.CopyTo(cpu_arr_ret); + for (size_t i = 0; i < size; i++) { + ICHECK_LT(std::fabs(static_cast(cpu_arr->data)[i] - + static_cast(cpu_arr_ret->data)[i]), + 1e-5); + } +} + +TEST_F(TextureCopyTest, ViewImageAsBuffer) { + using namespace tvm; + // Shape that doesn't cause padding for image row + std::vector shape{1, 16, 16, 8, 4}; + std::vector same_shape{1, 8, 16, 16, 4}; + auto cpu_arr = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + auto cpu_arr_ret = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + + DLDevice cl_dev = {kDLOpenCL, 0}; + auto allocator = MemoryManager::GetOrCreateAllocator(cl_dev, AllocatorType::kPooled); + auto buffer = allocator->Alloc(cl_dev, ShapeTuple(shape), {kDLFloat, 32, 1}); + auto stor = Storage(buffer, allocator); + + auto opencl_img_obj = + stor->AllocNDArrayScoped(0, ShapeTuple(shape), {kDLFloat, 32, 1}, "global.texture"); + auto 
opencl_buf_obj = + stor->AllocNDArrayScoped(0, ShapeTuple(same_shape), {kDLFloat, 32, 1}, "global"); + + std::random_device dev; + std::mt19937 mt(dev()); + std::uniform_real_distribution<> random(-10.0, 10.0); + + size_t size = 1; + for (size_t i = 0; i < shape.size(); ++i) { + size *= static_cast(shape[i]); + } + + /* Check original object round trip */ + // Random initialize host pool storage + for (size_t i = 0; i < size; i++) { + static_cast(cpu_arr->data)[i] = random(mt); + } + // Copy to OpenCLBuffer + cpu_arr.CopyTo(opencl_buf_obj); + // Copy from OpenCLBuffer + opencl_buf_obj.CopyTo(cpu_arr_ret); + for (size_t i = 0; i < size; i++) { + ICHECK_LT(std::fabs(static_cast(cpu_arr->data)[i] - + static_cast(cpu_arr_ret->data)[i]), + 1e-5); + } + + /* Check view object round trip */ + // Random initialize host pool storage + for (size_t i = 0; i < size; i++) { + static_cast(cpu_arr->data)[i] = random(mt); + } + // Copy to OpenCLBuffer + cpu_arr.CopyTo(opencl_img_obj); + // Copy from OpenCLBuffer + opencl_img_obj.CopyTo(cpu_arr_ret); + for (size_t i = 0; i < size; i++) { + ICHECK_LT(std::fabs(static_cast(cpu_arr->data)[i] - + static_cast(cpu_arr_ret->data)[i]), + 1e-5); + } +} + +TEST_F(TextureCopyTest, ViewImageAsImage) { + using namespace tvm; + // Shape that doesn't cause padding for image row + std::vector shape{1, 16, 16, 8, 4}; + std::vector same_shape{1, 8, 16, 16, 4}; + auto cpu_arr = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + auto cpu_arr_ret = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + + DLDevice cl_dev = {kDLOpenCL, 0}; + auto allocator = MemoryManager::GetOrCreateAllocator(cl_dev, AllocatorType::kPooled); + auto buffer = allocator->Alloc(cl_dev, ShapeTuple(shape), {kDLFloat, 32, 1}); + auto stor = Storage(buffer, allocator); + + auto opencl_img_obj_1 = + stor->AllocNDArrayScoped(0, ShapeTuple(shape), {kDLFloat, 32, 1}, "global.texture"); + auto opencl_img_obj_2 = + stor->AllocNDArrayScoped(0, ShapeTuple(same_shape), {kDLFloat, 32, 1}, "global.texture"); + + std::random_device dev; + std::mt19937 mt(dev()); + std::uniform_real_distribution<> random(-10.0, 10.0); + + size_t size = 1; + for (size_t i = 0; i < shape.size(); ++i) { + size *= static_cast(shape[i]); + } + + /* Check original object round trip */ + // Random initialize host pool storage + for (size_t i = 0; i < size; i++) { + static_cast(cpu_arr->data)[i] = random(mt); + } + // Copy to OpenCLBuffer + cpu_arr.CopyTo(opencl_img_obj_1); + // Copy from OpenCLBuffer + opencl_img_obj_1.CopyTo(cpu_arr_ret); + for (size_t i = 0; i < size; i++) { + ICHECK_LT(std::fabs(static_cast(cpu_arr->data)[i] - + static_cast(cpu_arr_ret->data)[i]), + 1e-5); + } + + /* Check view object round trip */ + // Random initialize host pool storage + for (size_t i = 0; i < size; i++) { + static_cast(cpu_arr->data)[i] = random(mt); + } + // Copy to OpenCLBuffer + cpu_arr.CopyTo(opencl_img_obj_2); + // Copy from OpenCLBuffer + opencl_img_obj_2.CopyTo(cpu_arr_ret); + for (size_t i = 0; i < size; i++) { + ICHECK_LT(std::fabs(static_cast(cpu_arr->data)[i] - + static_cast(cpu_arr_ret->data)[i]), + 1e-5); + } +} diff --git a/tests/cpp/relay/backend/graph_plan_token_alloc.cc b/tests/cpp/relay/backend/graph_plan_token_alloc.cc index 4641da2cb8b5..7fca4b26a985 100644 --- a/tests/cpp/relay/backend/graph_plan_token_alloc.cc +++ b/tests/cpp/relay/backend/graph_plan_token_alloc.cc @@ -24,23 +24,24 @@ namespace tvm { namespace relay { -// TokenAllocator2d is necessary because in class TokenAllocator2D we don't +// 
TokenAllocatorMixed is necessary because in class TokenAllocatorMixed we don't // have an access to its protected members. In this class we add new methods -// which allow us to get and check internal state of class TokenAllocator2D -class TokenAllocator2DWrapper : public TokenAllocator2D { +// which allow us to get and check internal state of class TokenAllocatorMixed +class TokenAllocatorMixedWrapper : public TokenAllocatorMixed { public: - inline size_t FreeListSize() const { return free_list_.size(); } - inline size_t BlockMapSize() const { return blocks_.size(); } + inline size_t FreeListSize() const { return free_.size(); } + inline size_t AllocListSize() const { return data_.size(); } }; -TEST(Token2DAlloc, OneToken) { - TokenAllocator2DWrapper alloc; +TEST(TokenMixedAlloc, TextureOneToken) { + TokenAllocatorMixedWrapper alloc; int storage_ids = 0; - EXPECT_EQ(alloc.BlockMapSize(), 0); + EXPECT_EQ(alloc.AllocListSize(), 0); EXPECT_EQ(alloc.FreeListSize(), 0); TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1)); - VirtualDevice vd1(kDLOpenCL, 0, {}, MemoryScope("global.texture-nhwc")); + VirtualDevice vd1(kDLOpenCL, 0, Target("opencl -device=adreno"), + MemoryScope("global.texture-nhwc")); StorageToken tok1 = { 1, // ref_counter 0, // max bytes @@ -49,29 +50,28 @@ TEST(Token2DAlloc, OneToken) { -1 // storage_id }; auto size2d = alloc.GetSize2D(&tok1); - EXPECT_EQ(size2d.channel, 4); - EXPECT_EQ(size2d.height, 22); - EXPECT_EQ(size2d.width, 400); + EXPECT_EQ(size2d, 140800); EXPECT_EQ(alloc.Request(&tok1), nullptr); alloc.Alloc(&tok1, storage_ids++); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 0); tok1.ref_counter -= 1; alloc.CheckForRelease(&tok1); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 1); } -TEST(Token2DAlloc, EqualSizeTokenReuse) { - TokenAllocator2DWrapper alloc; +TEST(TokenMixedAlloc, TextureEqualSizeTokenReuse) { + TokenAllocatorMixedWrapper alloc; int storage_ids = 0; - EXPECT_EQ(alloc.BlockMapSize(), 0); + EXPECT_EQ(alloc.AllocListSize(), 0); EXPECT_EQ(alloc.FreeListSize(), 0); TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1)); - VirtualDevice vd1(kDLOpenCL, 0, {}, MemoryScope("global.texture-nhwc")); + VirtualDevice vd1(kDLOpenCL, 0, Target("opencl -device=adreno"), + MemoryScope("global.texture-nhwc")); StorageToken tok1 = { 1, // ref_counter 0, // max bytes @@ -80,18 +80,16 @@ TEST(Token2DAlloc, EqualSizeTokenReuse) { -1 // storage_id }; auto size2d = alloc.GetSize2D(&tok1); - EXPECT_EQ(size2d.channel, 4); - EXPECT_EQ(size2d.height, 22); - EXPECT_EQ(size2d.width, 400); + EXPECT_EQ(size2d, 140800); EXPECT_EQ(alloc.Request(&tok1), nullptr); alloc.Alloc(&tok1, storage_ids++); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 0); tok1.ref_counter -= 1; alloc.CheckForRelease(&tok1); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 1); StorageToken tok2 = { @@ -103,24 +101,51 @@ TEST(Token2DAlloc, EqualSizeTokenReuse) { }; auto req = alloc.Request(&tok2); EXPECT_NE(req, nullptr); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 0); EXPECT_EQ(req->storage_id, storage_ids - 1); EXPECT_EQ(req->ref_counter, 1); auto sizeReq = alloc.GetSize2D(req); - EXPECT_EQ(sizeReq.channel, 4); - EXPECT_EQ(sizeReq.height, 22); - EXPECT_EQ(sizeReq.width, 400); + 
EXPECT_EQ(sizeReq, 140800); + + req->ref_counter -= 1; + alloc.CheckForRelease(req); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 1); + + // Try reuse of the texture memory for buffer object + VirtualDevice vd2(kDLOpenCL, 0, Target("opencl -device=adreno"), MemoryScope("global")); + StorageToken tok3 = { + 1, // ref_counter + 0, // max bytes + tt1, // tensor type + vd2, // virtual device + -1 // storage_id + }; + auto req1 = alloc.Request(&tok3); + EXPECT_NE(req1, nullptr); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 0); + EXPECT_EQ(req1->storage_id, storage_ids - 1); + EXPECT_EQ(req1->ref_counter, 1); + sizeReq = alloc.GetSize2D(req1); + EXPECT_EQ(sizeReq, 140800); + + req1->ref_counter -= 1; + alloc.CheckForRelease(req1); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 1); } -TEST(Token2DAlloc, EqualSizeDiffTypes) { - TokenAllocator2DWrapper alloc; +TEST(TokenMixedAlloc, TextureEqualSizeDiffTypes) { + TokenAllocatorMixedWrapper alloc; int storage_ids = 0; - EXPECT_EQ(alloc.BlockMapSize(), 0); + EXPECT_EQ(alloc.AllocListSize(), 0); EXPECT_EQ(alloc.FreeListSize(), 0); TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1)); - VirtualDevice vd1(kDLOpenCL, 0, {}, MemoryScope("global.texture-nhwc")); + VirtualDevice vd1(kDLOpenCL, 0, Target("opencl -device=adreno"), + MemoryScope("global.texture-nhwc")); StorageToken tok1 = { 1, // ref_counter 0, // max bytes @@ -128,19 +153,17 @@ TEST(Token2DAlloc, EqualSizeDiffTypes) { vd1, // virtual device -1 // storage_id }; - auto size2d = alloc.GetSize2D(&tok1); - EXPECT_EQ(size2d.channel, 4); - EXPECT_EQ(size2d.height, 22); - EXPECT_EQ(size2d.width, 400); + auto sizeReq = alloc.GetSize2D(&tok1); + EXPECT_EQ(sizeReq, 140800); EXPECT_EQ(alloc.Request(&tok1), nullptr); alloc.Alloc(&tok1, storage_ids++); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 0); tok1.ref_counter -= 1; alloc.CheckForRelease(&tok1); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 1); TensorType tt2({1, 22, 20, 20, 4}, DataType(kDLFloat, 16, 1)); @@ -151,28 +174,27 @@ TEST(Token2DAlloc, EqualSizeDiffTypes) { vd1, // virtual device -1 // storage_id }; - EXPECT_EQ(alloc.Request(&tok2), nullptr); - EXPECT_EQ(alloc.BlockMapSize(), 1); - EXPECT_EQ(alloc.FreeListSize(), 1); - alloc.Alloc(&tok2, storage_ids++); - EXPECT_EQ(alloc.BlockMapSize(), 2); - EXPECT_EQ(alloc.FreeListSize(), 1); + auto req1 = alloc.Request(&tok2); + EXPECT_NE(req1, nullptr); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 0); - tok2.ref_counter -= 1; - alloc.CheckForRelease(&tok2); - EXPECT_EQ(alloc.BlockMapSize(), 2); - EXPECT_EQ(alloc.FreeListSize(), 2); + req1->ref_counter -= 1; + alloc.CheckForRelease(req1); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 1); } -TEST(Token2DAlloc, DifferentSizesTokenReuse) { - TokenAllocator2DWrapper alloc; +TEST(TokenMixedAlloc, TextureDifferentSizesTokenReuse) { + TokenAllocatorMixedWrapper alloc; int storage_ids = 0; - EXPECT_EQ(alloc.BlockMapSize(), 0); + EXPECT_EQ(alloc.AllocListSize(), 0); EXPECT_EQ(alloc.FreeListSize(), 0); TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1)); - VirtualDevice vd1(kDLOpenCL, 0, {}, MemoryScope("global.texture-nhwc")); + VirtualDevice vd1(kDLOpenCL, 0, Target("opencl -device=adreno"), + MemoryScope("global.texture-nhwc")); StorageToken tok1 = { 1, // ref_counter 0, // max 
bytes @@ -180,19 +202,17 @@ TEST(Token2DAlloc, DifferentSizesTokenReuse) { vd1, // virtual device -1 // storage_id }; - auto size2d = alloc.GetSize2D(&tok1); - EXPECT_EQ(size2d.channel, 4); - EXPECT_EQ(size2d.height, 22); - EXPECT_EQ(size2d.width, 400); + auto sizeReq = alloc.GetSize2D(&tok1); + EXPECT_EQ(sizeReq, 140800); EXPECT_EQ(alloc.Request(&tok1), nullptr); alloc.Alloc(&tok1, storage_ids++); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 0); tok1.ref_counter -= 1; alloc.CheckForRelease(&tok1); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 1); TensorType tt2({1, 40, 30, 30, 4}, DataType(kDLFloat, 32, 1)); @@ -205,19 +225,16 @@ TEST(Token2DAlloc, DifferentSizesTokenReuse) { }; auto req = alloc.Request(&tok2); EXPECT_NE(req, nullptr); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 0); EXPECT_EQ(req->storage_id, storage_ids - 1); - EXPECT_EQ(req->ref_counter, 2); - auto sizeReq = alloc.GetSize2D(req); - EXPECT_EQ(sizeReq.channel, 4); - EXPECT_EQ(sizeReq.height, 40); - EXPECT_EQ(sizeReq.width, 900); + EXPECT_EQ(req->ref_counter, 1); + sizeReq = alloc.GetSize2D(req); + EXPECT_EQ(sizeReq, 576000); - tok2.ref_counter -= 1; req->ref_counter -= 1; - alloc.CheckForRelease(&tok1); - EXPECT_EQ(alloc.BlockMapSize(), 1); + alloc.CheckForRelease(req); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 1); TensorType tt3({1, 25, 30, 30, 4}, DataType(kDLFloat, 32, 1)); @@ -230,24 +247,23 @@ TEST(Token2DAlloc, DifferentSizesTokenReuse) { }; auto req2 = alloc.Request(&tok3); EXPECT_NE(req2, nullptr); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 0); EXPECT_EQ(req2->storage_id, storage_ids - 1); EXPECT_EQ(req2->ref_counter, 1); - auto sizeReq2 = alloc.GetSize2D(req2); - EXPECT_EQ(sizeReq2.channel, 4); - EXPECT_EQ(sizeReq2.height, 40); - EXPECT_EQ(sizeReq2.width, 900); + sizeReq = alloc.GetSize2D(req2); + EXPECT_EQ(sizeReq, 576000); } -TEST(Token2DAlloc, DifferentSizesTokenReuse2) { - TokenAllocator2DWrapper alloc; +TEST(TokenMixedAlloc, TextureDifferentSizesTokenReuse2) { + TokenAllocatorMixedWrapper alloc; int storage_ids = 0; - EXPECT_EQ(alloc.BlockMapSize(), 0); + EXPECT_EQ(alloc.AllocListSize(), 0); EXPECT_EQ(alloc.FreeListSize(), 0); TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1)); - VirtualDevice vd1(kDLOpenCL, 0, {}, MemoryScope("global.texture-nhwc")); + VirtualDevice vd1(kDLOpenCL, 0, Target("opencl -device=adreno"), + MemoryScope("global.texture-nhwc")); StorageToken tok1 = { 1, // ref_counter 0, // max bytes @@ -255,19 +271,17 @@ TEST(Token2DAlloc, DifferentSizesTokenReuse2) { vd1, // virtual device -1 // storage_id }; - auto size2d = alloc.GetSize2D(&tok1); - EXPECT_EQ(size2d.channel, 4); - EXPECT_EQ(size2d.height, 22); - EXPECT_EQ(size2d.width, 400); + auto sizeReq = alloc.GetSize2D(&tok1); + EXPECT_EQ(sizeReq, 140800); EXPECT_EQ(alloc.Request(&tok1), nullptr); alloc.Alloc(&tok1, storage_ids++); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 0); tok1.ref_counter -= 1; alloc.CheckForRelease(&tok1); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 1); TensorType tt2({1, 5, 30, 20, 4}, DataType(kDLFloat, 32, 1)); @@ -280,24 +294,23 @@ TEST(Token2DAlloc, DifferentSizesTokenReuse2) { }; auto req = 
alloc.Request(&tok2); EXPECT_NE(req, nullptr); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 0); EXPECT_EQ(req->storage_id, storage_ids - 1); - EXPECT_EQ(req->ref_counter, 2); - auto sizeReq = alloc.GetSize2D(req); - EXPECT_EQ(sizeReq.channel, 4); - EXPECT_EQ(sizeReq.height, 5); - EXPECT_EQ(sizeReq.width, 600); + EXPECT_EQ(req->ref_counter, 1); + sizeReq = alloc.GetSize2D(req); + EXPECT_EQ(sizeReq, 140800); } -TEST(Token2DAlloc, SameSizesButDiffMemoryScopes) { - TokenAllocator2DWrapper alloc; +TEST(TokenMixedAlloc, TextureSameSizesButDiffMemoryScopes) { + TokenAllocatorMixedWrapper alloc; int storage_ids = 0; - EXPECT_EQ(alloc.BlockMapSize(), 0); + EXPECT_EQ(alloc.AllocListSize(), 0); EXPECT_EQ(alloc.FreeListSize(), 0); TensorType tt1({28, 676, 1, 1, 4}, DataType(kDLFloat, 32, 1)); - VirtualDevice vd1(kDLOpenCL, 0, {}, MemoryScope("global.texture-weight")); + VirtualDevice vd1(kDLOpenCL, 0, Target("opencl -device=adreno"), + MemoryScope("global.texture-weight")); StorageToken tok1 = { 1, // ref_counter 0, // max bytes @@ -305,23 +318,22 @@ TEST(Token2DAlloc, SameSizesButDiffMemoryScopes) { vd1, // virtual device -1 // storage_id }; - auto size2d = alloc.GetSize2D(&tok1); - EXPECT_EQ(size2d.channel, 4); - EXPECT_EQ(size2d.height, 28); - EXPECT_EQ(size2d.width, 676); + auto sizeReq = alloc.GetSize2D(&tok1); + EXPECT_EQ(sizeReq, 302848); EXPECT_EQ(alloc.Request(&tok1), nullptr); alloc.Alloc(&tok1, storage_ids++); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 0); tok1.ref_counter -= 1; alloc.CheckForRelease(&tok1); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 1); TensorType tt2({1, 28, 26, 26, 4}, DataType(kDLFloat, 32, 1)); - VirtualDevice vd2(kDLOpenCL, 0, {}, MemoryScope("global.texture-nhwc")); + VirtualDevice vd2(kDLOpenCL, 0, Target("opencl -device=adreno"), + MemoryScope("global.texture-nhwc")); StorageToken tok2 = { 1, // ref_counter 0, // max bytes @@ -330,22 +342,199 @@ TEST(Token2DAlloc, SameSizesButDiffMemoryScopes) { -1 // storage_id }; auto tok2Size = alloc.GetSize2D(&tok2); - EXPECT_EQ(tok2Size.channel, 4); - EXPECT_EQ(tok2Size.height, 28); - EXPECT_EQ(tok2Size.width, 676); + EXPECT_EQ(tok2Size, 302848); - EXPECT_EQ(alloc.Request(&tok2), nullptr); - EXPECT_EQ(alloc.BlockMapSize(), 1); + auto req = alloc.Request(&tok2); + EXPECT_NE(req, nullptr); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 0); + + req->ref_counter -= 1; + alloc.CheckForRelease(req); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 1); +} + +TEST(TokenMixedAlloc, OneToken) { + TokenAllocatorMixedWrapper alloc; + int storage_ids = 0; + EXPECT_EQ(alloc.AllocListSize(), 0); + EXPECT_EQ(alloc.FreeListSize(), 0); - alloc.Alloc(&tok2, storage_ids++); - EXPECT_EQ(alloc.BlockMapSize(), 2); + TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1)); + VirtualDevice vd1(kDLOpenCL, 0, Target("opencl")); + StorageToken tok1 = { + 1, // ref_counter + 0, // max bytes + tt1, // tensor type + vd1, // virtual device + -1 // storage_id + }; + EXPECT_EQ(alloc.Request(&tok1), nullptr); + + alloc.Alloc(&tok1, storage_ids++); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 0); + + tok1.ref_counter -= 1; + alloc.CheckForRelease(&tok1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 1); +} + +TEST(TokenMixedAlloc, EqualSizeTokenReuse) { + 
TokenAllocatorMixedWrapper alloc; + int storage_ids = 0; + EXPECT_EQ(alloc.AllocListSize(), 0); + EXPECT_EQ(alloc.FreeListSize(), 0); + + TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1)); + VirtualDevice vd1(kDLOpenCL, 0, Target("opencl")); + StorageToken tok1 = { + 1, // ref_counter + 0, // max bytes + tt1, // tensor type + vd1, // virtual device + -1 // storage_id + }; + EXPECT_EQ(alloc.Request(&tok1), nullptr); - tok2.ref_counter -= 1; - alloc.CheckForRelease(&tok2); - EXPECT_EQ(alloc.BlockMapSize(), 2); - EXPECT_EQ(alloc.FreeListSize(), 2); + alloc.Alloc(&tok1, storage_ids++); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 0); + + tok1.ref_counter -= 1; + alloc.CheckForRelease(&tok1); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 1); + + StorageToken tok2 = { + 1, // ref_counter + 0, // max bytes + tt1, // tensor type + vd1, // virtual device + -1 // storage_id + }; + auto req = alloc.Request(&tok2); + EXPECT_NE(req, nullptr); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 0); + EXPECT_EQ(req->storage_id, storage_ids - 1); + EXPECT_EQ(req->ref_counter, 1); + + req->ref_counter -= 1; + alloc.CheckForRelease(req); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 1); } + +TEST(TokenMixedAlloc, EqualSizeDiffTypes) { + TokenAllocatorMixedWrapper alloc; + int storage_ids = 0; + EXPECT_EQ(alloc.AllocListSize(), 0); + EXPECT_EQ(alloc.FreeListSize(), 0); + + TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1)); + VirtualDevice vd1(kDLOpenCL, 0, Target("opencl")); + StorageToken tok1 = { + 1, // ref_counter + 0, // max bytes + tt1, // tensor type + vd1, // virtual device + -1 // storage_id + }; + EXPECT_EQ(alloc.Request(&tok1), nullptr); + + alloc.Alloc(&tok1, storage_ids++); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 0); + + tok1.ref_counter -= 1; + alloc.CheckForRelease(&tok1); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 1); + + TensorType tt2({1, 22, 20, 20, 4}, DataType(kDLFloat, 16, 1)); + StorageToken tok2 = { + 1, // ref_counter + 0, // max bytes + tt2, // tensor type + vd1, // virtual device + -1 // storage_id + }; + + auto req1 = alloc.Request(&tok2); + EXPECT_NE(req1, nullptr); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 0); + + req1->ref_counter -= 1; + alloc.CheckForRelease(req1); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 1); +} + +TEST(TokenMixedAlloc, DifferentSizesTokenReuse) { + TokenAllocatorMixedWrapper alloc; + int storage_ids = 0; + EXPECT_EQ(alloc.AllocListSize(), 0); + EXPECT_EQ(alloc.FreeListSize(), 0); + + TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1)); + VirtualDevice vd1(kDLOpenCL, 0, Target("opencl")); + StorageToken tok1 = { + 1, // ref_counter + 0, // max bytes + tt1, // tensor type + vd1, // virtual device + -1 // storage_id + }; + EXPECT_EQ(alloc.Request(&tok1), nullptr); + + alloc.Alloc(&tok1, storage_ids++); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 0); + + tok1.ref_counter -= 1; + alloc.CheckForRelease(&tok1); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 1); + + TensorType tt2({1, 40, 30, 30, 4}, DataType(kDLFloat, 32, 1)); + StorageToken tok2 = { + 1, // ref_counter + 0, // max bytes + tt2, // tensor type + vd1, // virtual device + -1 // storage_id + }; + auto req = alloc.Request(&tok2); + EXPECT_NE(req, nullptr); + EXPECT_EQ(alloc.AllocListSize(), 
1); + EXPECT_EQ(alloc.FreeListSize(), 0); + EXPECT_EQ(req->storage_id, storage_ids - 1); + EXPECT_EQ(req->ref_counter, 1); + + req->ref_counter -= 1; + alloc.CheckForRelease(req); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 1); + + TensorType tt3({1, 25, 30, 30, 4}, DataType(kDLFloat, 32, 1)); + StorageToken tok3 = { + 1, // ref_counter + 0, // max bytes + tt3, // tensor type + vd1, // virtual device + -1 // storage_id + }; + auto req2 = alloc.Request(&tok3); + EXPECT_NE(req2, nullptr); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 0); + EXPECT_EQ(req2->storage_id, storage_ids - 1); + EXPECT_EQ(req2->ref_counter, 1); +} + } // namespace relay } // namespace tvm diff --git a/tests/cpp/runtime/memory/memory_manager_tests.cc b/tests/cpp/runtime/memory/memory_manager_tests.cc index aea37bf7fbfe..47146d2000fc 100644 --- a/tests/cpp/runtime/memory/memory_manager_tests.cc +++ b/tests/cpp/runtime/memory/memory_manager_tests.cc @@ -85,6 +85,38 @@ TEST_F(TvmVMMemoryManagerTest, NaiveEmptyBasic) { EXPECT_EQ(allocator->UsedMemory(), 0); } +TEST_F(TvmVMMemoryManagerTest, BothAllocatorsCoexists) { + Device dev = {kDLCPU, 0}; + // Initialize and use Naive allocator + Allocator* nallocator = MemoryManagerWrapper::GetOrCreateAllocator(dev, kNaive); + EXPECT_EQ(nallocator->UsedMemory(), 0); + auto dt = DataType::Float(32); + size_t nbytes = 1 * 3 * 6 * 6 * dt.bytes(); + ShapeTuple shape = {1, 3, 6, 6}; + { + auto ndarray = nallocator->Empty(shape, dt, dev); + EXPECT_EQ(nallocator->UsedMemory(), nbytes); + } + EXPECT_EQ(nallocator->UsedMemory(), 0); + auto naive_buff = nallocator->Alloc(dev, shape, dt); + EXPECT_EQ(nallocator->UsedMemory(), nbytes); + + // Initialize and use Pooled allocator + Allocator* pallocator = MemoryManagerWrapper::GetOrCreateAllocator(dev, kPooled); + EXPECT_EQ(pallocator->UsedMemory(), 0); + auto pooled_buff = pallocator->Alloc(dev, shape, dt); + EXPECT_NE(pallocator->UsedMemory(), 0); + + // Operate on Naive allocator + EXPECT_EQ(nallocator->UsedMemory(), nbytes); + nallocator->Free(naive_buff); + EXPECT_EQ(nallocator->UsedMemory(), 0); + + // Operate on Pooled allocator + pallocator->Free(pooled_buff); + EXPECT_NE(pallocator->UsedMemory(), 0); +} + TEST_F(TvmVMMemoryManagerTest, PooledEmptyBasic) { Device dev = {kDLCPU, 0}; Allocator* allocator = MemoryManagerWrapper::GetOrCreateAllocator(dev, kPooled); @@ -150,58 +182,6 @@ TEST_F(TvmVMMemoryManagerTest, PooledAllocWithShape) { } } -TEST_F(TvmVMMemoryManagerTest, NaiveAllocOpenCLTexture) { - bool enabled = tvm::runtime::RuntimeEnabled("opencl"); - if (!enabled) { - LOG(INFO) << "Skip OpenCL Texture alloc test because opencl runtime is disabled.\n"; - return; - } - Device dev = {kDLOpenCL, 0}; - Allocator* allocator = MemoryManagerWrapper::GetOrCreateAllocator(dev, kNaive); - EXPECT_EQ(allocator->UsedMemory(), 0); - auto dt = DataType::Float(32); - size_t nbytes = 1 * 3 * 6 * 6 * dt.bytes(); - ShapeTuple shape = {1, 3, 6, 6}; - auto buff = allocator->Alloc(dev, shape, dt); - EXPECT_EQ(allocator->UsedMemory(), nbytes); - allocator->Free(buff); - EXPECT_EQ(allocator->UsedMemory(), 0); - - auto texture = allocator->Alloc(dev, shape, dt, "global.texture"); - EXPECT_EQ(allocator->UsedMemory(), nbytes); - allocator->Free(texture); - EXPECT_EQ(allocator->UsedMemory(), 0); -} - -TEST_F(TvmVMMemoryManagerTest, PooledAllocOpenCLTexture) { - bool enabled = tvm::runtime::RuntimeEnabled("opencl"); - if (!enabled) { - LOG(INFO) << "Skip OpenCL Texture alloc test because opencl runtime is 
disabled.\n"; - return; - } - Device dev = {kDLOpenCL, 0}; - Allocator* allocator = MemoryManagerWrapper::GetOrCreateAllocator(dev, kPooled); - EXPECT_EQ(allocator->UsedMemory(), 0); - auto dt = DataType::Float(32); - size_t nbytes = 1 * 3 * 6 * 6 * dt.bytes(); - size_t page_size = PooledAllocator::kDefaultPageSize; - size_t size = ((nbytes + page_size - 1) / page_size) * page_size; - ShapeTuple shape = {1, 3, 6, 6}; - auto buff = allocator->Alloc(dev, shape, dt); - EXPECT_EQ(allocator->UsedMemory(), size); - allocator->Free(buff); - EXPECT_EQ(allocator->UsedMemory(), size); - - try { - auto texture = allocator->Alloc(dev, shape, dt, "global.texture"); - (void)texture; - FAIL(); - } catch (std::exception& e) { - std::string pattern = "This alloc should be implemented"; - std::string what = e.what(); - EXPECT_NE(what.find(pattern), std::string::npos) << what; - } -} } // namespace memory } // namespace runtime } // namespace tvm diff --git a/tests/cpp/texture_copy_test.cc b/tests/cpp/texture_copy_test.cc index 92c12bafdd9a..63e2ac1a0af4 100644 --- a/tests/cpp/texture_copy_test.cc +++ b/tests/cpp/texture_copy_test.cc @@ -98,39 +98,28 @@ TEST(TextureCopy, OverwritePoolSubview) { static_cast(cpu_pool0->data)[i] = random(mt); } - // Random initialize host array - for (int64_t h = 0; h < shape[0]; h++) { - for (int64_t w = 0; w < shape[1]; w++) { - for (int64_t rgba = 0; rgba < shape[2]; rgba++) { - static_cast(cpu_arr0->data)[shape[1] * shape[2] * h + shape[2] * w + rgba] = 1.1f; - } - } + // Random initialize host array storage + for (size_t i = 0; i < size; i++) { + static_cast(cpu_arr0->data)[i] = random(mt); } - // Copy to texture pool for initialization + // Loop through pool cpu_pool0.CopyTo(opencl_txpool); - // Copy host data to subview into texture storage - cpu_arr0.CopyTo(opencl_txarr0); - // Copy modified pool back opencl_txpool.CopyTo(cpu_pool1); - // Check that modifications to pool follow two dimensional - // strides according to the written texture shape. 
- for (int64_t h = 0; h < shape_pool[0]; h++) { - for (int64_t w = 0; w < shape_pool[1]; w++) { - for (int64_t rgba = 0; rgba < shape_pool[2]; rgba++) { - size_t i = shape_pool[1] * shape_pool[2] * h + shape_pool[2] * w + rgba; - if (h < shape[0] && w < shape[1] && rgba < shape[2]) { - size_t j = shape[1] * shape[2] * h + shape[2] * w + rgba; - ICHECK_LT(std::fabs(static_cast(cpu_pool1->data)[i] - - static_cast(cpu_arr0->data)[j]), - 1e-5); - } else { - ICHECK_LT(std::fabs(static_cast(cpu_pool1->data)[i] - - static_cast(cpu_pool0->data)[i]), - 1e-5); - } - } - } + for (size_t i = 0; i < size_pool; i++) { + ICHECK_LT(std::fabs(static_cast(cpu_pool0->data)[i] - + static_cast(cpu_pool1->data)[i]), + 1e-5); + } + + // Loop through view + cpu_arr0.CopyTo(opencl_txarr0); + opencl_txarr0.CopyTo(cpu_arr1); + + for (size_t i = 0; i < size; i++) { + ICHECK_LT( + std::fabs(static_cast(cpu_arr0->data)[i] - static_cast(cpu_arr1->data)[i]), + 1e-5); } } diff --git a/tests/python/relay/test_backend_graph_executor.py b/tests/python/relay/test_backend_graph_executor.py index d7b6e13c18b6..133fcd191961 100644 --- a/tests/python/relay/test_backend_graph_executor.py +++ b/tests/python/relay/test_backend_graph_executor.py @@ -179,7 +179,7 @@ def test_plan_memory(): assert ( storage_sizes[0][0] == 40 and storage_sizes[1][0] == 4 - and storage_sizes[2][0] == 4 + and storage_sizes[2][0] == 40 and storage_sizes[3][0] == 40 ) diff --git a/tests/scripts/task_python_integration.sh b/tests/scripts/task_python_integration.sh index 51ef86d05ec7..3202839e50ed 100755 --- a/tests/scripts/task_python_integration.sh +++ b/tests/scripts/task_python_integration.sh @@ -43,8 +43,13 @@ TVM_TEST_TARGETS="${TVM_RELAY_TEST_TARGETS:-llvm;cuda}" \ run_pytest ${TVM_INTEGRATION_TESTSUITE_NAME}-relay tests/python/relay --ignore=tests/python/relay/aot # OpenCL texture test. Deselected specific tests that fails in CI -TVM_TEST_TARGETS="${TVM_RELAY_OPENCL_TEXTURE_TARGETS:-opencl}" \ - run_pytest ${TVM_INTEGRATION_TESTSUITE_NAME}-opencl-texture tests/python/relay/opencl_texture +TEXTURE_TESTS=$(ls tests/python/relay/opencl_texture/test_*) +i=0 +for TEST in $TEXTURE_TESTS; do + TVM_TEST_TARGETS="${TVM_RELAY_OPENCL_TEXTURE_TARGETS:-opencl}" \ + run_pytest "${TVM_INTEGRATION_TESTSUITE_NAME}-opencl-texture-$i" "$TEST" + i=$((i+1)) +done # Command line driver test run_pytest ${TVM_INTEGRATION_TESTSUITE_NAME}-driver tests/python/driver
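
Taken together, the pattern the new runtime tests exercise is: allocate one pooled Buffer, wrap it in a Storage, and carve scoped NDArray views out of it. A condensed sketch under the same assumptions as the earlier one; names are illustrative:

#include <tvm/runtime/memory/memory_manager.h>

void StorageViewSketch() {
  using namespace tvm::runtime;
  DLDevice dev{kDLOpenCL, 0};
  memory::Allocator* alloc =
      memory::MemoryManager::GetOrCreateAllocator(dev, memory::AllocatorType::kPooled);
  ShapeTuple shape = {1, 16, 16, 8, 4};
  memory::Buffer buffer = alloc->Alloc(dev, shape, {kDLFloat, 32, 1});
  memory::Storage stor(buffer, alloc);
  // Two scoped views over the same storage: a flat buffer and a 2D image.
  NDArray flat = stor->AllocNDArrayScoped(0, shape, {kDLFloat, 32, 1}, "global");
  NDArray img = stor->AllocNDArrayScoped(0, shape, {kDLFloat, 32, 1}, "global.texture");
  // Host round trips through either view (cpu.CopyTo(flat); flat.CopyTo(cpu);)
  // land in the same pooled allocation.
}
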