From f745f5dc93d7e58f58431ce5c0f7bc223276e03d Mon Sep 17 00:00:00 2001
From: Siva
Date: Tue, 24 Dec 2024 10:30:11 +0530
Subject: [PATCH 01/14] [OPENCL][TEXTURE] Improved texture memory planning

Motivated by the fact that textures can be allocated over a clBuffer
object, and that the size of the backing clBuffer can be computed from
the hardware image pitch alignment. This optimizes overall memory
allocation on the device and greatly helps models with large memory
requirements.

Reworked the graph memory planner so that it no longer differentiates
buffer and texture storage tokens and can reuse them interchangeably.
The texture pool in the OpenCL runtime is rebranded as a memory pool
that handles allocation of both buffer and image objects.

The NDArray-to-DeviceAPI interface is extended with AllocDataSpaceView
and FreeDataSpaceView. These new APIs accommodate accessing the same
physical memory as clBuffer / clImage objects.
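Illustrative usage of the new view APIs (editor's sketch, not part of
this change set; the device, shape, and scope values below are examples
only):

    // Allocate a flat clBuffer-backed tensor, then alias it as a 2D image.
    Device dev{kDLOpenCL, 0};
    DLDataType f32{kDLFloat, 32, 1};
    std::vector<int64_t> shape{1, 64, 64, 4};  // rank >= 3 for texture flattening
    DeviceAPI* api = DeviceAPI::Get(dev);
    void* buf = api->AllocDataSpace(dev, static_cast<int>(shape.size()), shape.data(),
                                    f32, String("global"));
    // The view aliases the same physical memory as an image2d_t object.
    void* img = api->AllocDataSpaceView(dev, buf, ShapeTuple(shape), f32,
                                        String("global.texture"));
    // ... launch kernels against the image view ...
    api->FreeDataSpaceView(dev, img);
    api->FreeDataSpace(dev, buf);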
---
 .../app/src/main/jni/tvm_runtime.h            |   1 -
 include/tvm/runtime/device_api.h              |  21 +
 include/tvm/runtime/memory/memory_manager.h   |  12 +-
 include/tvm/runtime/ndarray.h                 |   5 +-
 src/relay/backend/graph_plan_memory.cc        |  33 +-
 src/relay/backend/token_allocator.cc          | 184 +++----
 src/relay/backend/token_allocator.h           |  54 +--
 src/runtime/c_runtime_api.cc                  |   7 +
 src/runtime/contrib/clml/clml_runtime.cc      |   2 +-
 src/runtime/cuda/cuda_device_api.cc           |   2 +
 src/runtime/graph_executor/graph_executor.cc  |  61 ++-
 src/runtime/graph_executor/graph_executor.h   |   1 +
 src/runtime/memory/memory_manager.cc          |  47 +-
 src/runtime/memory/naive_allocator.h          |  31 +-
 src/runtime/memory/pooled_allocator.h         |  14 +-
 src/runtime/metal/metal_device_api.mm         |   2 +
 src/runtime/ndarray.cc                        |  10 +-
 src/runtime/opencl/opencl_common.h            |  58 ++-
 src/runtime/opencl/opencl_device_api.cc       | 276 +++++++++---
 src/runtime/opencl/opencl_module.cc           |   4 +-
 src/runtime/opencl/opencl_module_spirv.cc     |   2 +-
 src/runtime/opencl/texture_pool.cc            | 171 --------
 src/runtime/rocm/rocm_device_api.cc           |   3 +-
 src/runtime/texture.h                         |  84 +---
 src/runtime/vulkan/vulkan_device_api.cc       |   3 +-
 src/target/source/codegen_opencl.cc           |  14 +
 src/target/target_kind.cc                     |   1 +
 .../opencl/opencl_texture_pool_test.cc        | 151 -------
 tests/cpp-runtime/opencl/opencl_timer_test.cc |   2 +-
 tests/cpp-runtime/opencl/texture_copy_test.cc | 295 +++++++++++++
 .../relay/backend/graph_plan_token_alloc.cc   | 407 +++++++++++++-----
 .../runtime/memory/memory_manager_tests.cc    |  66 ++-
 tests/cpp/texture_copy_test.cc                |  47 +-
 .../relay/test_backend_graph_executor.py      |   2 +-
 34 files changed, 1196 insertions(+), 877 deletions(-)
 delete mode 100644 src/runtime/opencl/texture_pool.cc
 delete mode 100644 tests/cpp-runtime/opencl/opencl_texture_pool_test.cc
 create mode 100644 tests/cpp-runtime/opencl/texture_copy_test.cc

diff --git a/apps/android_rpc/app/src/main/jni/tvm_runtime.h b/apps/android_rpc/app/src/main/jni/tvm_runtime.h
index fb14d84b794f..7b4ced7c9c0d 100644
--- a/apps/android_rpc/app/src/main/jni/tvm_runtime.h
+++ b/apps/android_rpc/app/src/main/jni/tvm_runtime.h
@@ -66,7 +66,6 @@
 #include "../src/runtime/opencl/opencl_device_api.cc"
 #include "../src/runtime/opencl/opencl_module.cc"
 #include "../src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc"
-#include "../src/runtime/opencl/texture_pool.cc"
 #include "../src/runtime/source_utils.cc"
 #endif

diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h
index c33606d98ed3..5396e7342ad0 100644
--- a/include/tvm/runtime/device_api.h
+++ b/include/tvm/runtime/device_api.h
@@ -52,6 +52,7 @@ enum DeviceAttrKind : int {
   kL2CacheSizeBytes = 13,
   kTotalGlobalMemory = 14,
   kAvailableGlobalMemory = 15,
+  kImagePitchAlignment = 16,
 };

 #ifdef TVM_KALLOC_ALIGNMENT
@@ -135,12 +136,32 @@ class TVM_DLL DeviceAPI {
   */
  virtual void* AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype,
                               Optional<String> mem_scope = NullOpt);
+
+  /*!
+   * \brief Create a new view with the given spec over an existing tensor.
+   * \param dev The device on which to perform the operation.
+   * \param data The source array.
+   * \param shape The shape of the view.
+   * \param dtype The type of elements.
+   * \param mem_scope The memory scope of the view.
+   * \return The allocated device pointer.
+   */
+  virtual void* AllocDataSpaceView(Device dev, void* data, ShapeTuple shape, DLDataType dtype,
+                                   Optional<String> mem_scope = NullOpt);
  /*!
   * \brief Free a data space on device.
   * \param dev The device device to perform operation.
   * \param ptr The data space.
   */
  virtual void FreeDataSpace(Device dev, void* ptr) = 0;
+
+  /*!
+   * \brief Free a data space view on device.
+   * \param dev The device on which to perform the operation.
+   * \param ptr The data space view.
+   */
+  virtual void FreeDataSpaceView(Device dev, void* ptr);
+
  /*!
   * \brief copy data from one place to another
   * \note This API is designed to support special memory with shape dependent layout.

diff --git a/include/tvm/runtime/memory/memory_manager.h b/include/tvm/runtime/memory/memory_manager.h
index 0c4647e6fa5a..7386c812fa08 100644
--- a/include/tvm/runtime/memory/memory_manager.h
+++ b/include/tvm/runtime/memory/memory_manager.h
@@ -42,6 +42,7 @@ namespace memory {
 enum AllocatorType {
   kNaive = 1,
   kPooled,
+  kAny,
 };

 struct Buffer {
@@ -87,7 +88,7 @@ class Allocator {
   * \return A sized allocation in the form of a buffer.
   */
  TVM_DLL virtual Buffer Alloc(Device dev, ShapeTuple shape, DLDataType type_hint,
-                              const std::string& mem_scope = "") = 0;
+                              const std::string& mem_scope = "");
  /*! \brief Free a buffer allocated by the allocator.
   * \param buffer The buffer to free.
   */
@@ -102,6 +103,7 @@ class Allocator {
 protected:
  /*! \brief Check if the given memory scope is allowed to allocate by the allocator. */
  TVM_DLL virtual bool AllowMemoryScope(const std::string& mem_scope) const;
+  std::atomic<size_t> used_memory_;

 private:
  AllocatorType type_;
@@ -123,7 +125,7 @@ class MemoryManager {
   * \param type The allocator type
   * \return The memory allocator.
   */
-  TVM_DLL static Allocator* GetAllocator(Device dev, AllocatorType type);
+  TVM_DLL static Allocator* GetAllocator(Device dev, AllocatorType type = AllocatorType::kAny);

  /*! \brief Clear the allocators. */
  static void Clear();
@@ -170,6 +172,12 @@ class Storage : public ObjectRef {
 };

 }  // namespace memory
+
+using memory::Allocator;
+using memory::AllocatorType;
+using memory::MemoryManager;
+using memory::StorageObj;
+
 }  // namespace runtime
 }  // namespace tvm

diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h
index fef61a753103..1f8d48cec66e 100644
--- a/include/tvm/runtime/ndarray.h
+++ b/include/tvm/runtime/ndarray.h
@@ -137,6 +137,8 @@ class NDArray : public ObjectRef {
   * \param relative_byte_offset The offset of the output NDArray,
   *     relative to the current byte offset.
   *
+  * \param mem_scope The memory scope of the view.
+  *
   * By default, the offset of the view is the same as the offset
   * of the current array.
   *
@@ -147,7 +149,8 @@ class NDArray : public ObjectRef {
   *     outside the bounds of the current array, this function will
   *     raise an exception.
   */
-  TVM_DLL NDArray CreateView(ShapeTuple shape, DLDataType dtype, uint64_t relative_byte_offset = 0);
+  TVM_DLL NDArray CreateView(ShapeTuple shape, DLDataType dtype, uint64_t relative_byte_offset = 0,
+                             Optional<String> mem_scope = NullOpt);

  /*!
   * \brief Create a reference view of NDArray that
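(Editor's illustration, not part of the patch: with the added mem_scope
parameter a caller can request a texture-scoped view over an existing
array; the shapes and device below are invented.)

    NDArray flat = NDArray::Empty({1, 16, 16, 4}, {kDLFloat, 32, 1}, {kDLOpenCL, 0});
    // Same storage, viewed with a 2D texture layout.
    NDArray tex = flat.CreateView({1, 16, 16, 4}, {kDLFloat, 32, 1},
                                  /*relative_byte_offset=*/0, String("global.texture"));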
diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc
index d85ffd78291c..33b3adea5f2f 100644
--- a/src/relay/backend/graph_plan_memory.cc
+++ b/src/relay/backend/graph_plan_memory.cc
@@ -229,6 +229,16 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
    VLOG_CONTEXT << "StorageAllocator";
    VLOG(1) << "planning:" << std::endl << PrettyPrint(func);
    prototype_ = StorageAllocaInit(&arena_).GetInitTokenMap(func);
+    // Back up the virtual devices, since token reuse might lose the original memory scope
+    std::unordered_map<const ExprNode*, std::vector<VirtualDevice>> virtual_device_map_;
+    for (const auto& kv : prototype_) {
+      std::vector<VirtualDevice> virtual_devices;
+      virtual_devices.reserve(kv.second.size());
+      for (StorageToken* tok : kv.second) {
+        virtual_devices.push_back(tok->virtual_device);
+      }
+      virtual_device_map_.insert({kv.first, virtual_devices});
+    }
    this->Run(func);

    // The value of smap contains two integer arrays where the first array
@@ -252,9 +262,13 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
      }
      num_nodes++;
      storage_ids.push_back(tok->storage_id);
-      virtual_devices.push_back(tok->virtual_device);
      sid_sizes_byte.push_back(allocator_.GetMemorySize(tok));
    }
+    ICHECK(kv.second.size() == virtual_device_map_[kv.first].size())
+        << "Mismatch of tokens and virtual devices";
+    for (auto vdev : virtual_device_map_[kv.first]) {
+      virtual_devices.push_back(vdev);
+    }
    auto storage_info = backend::StorageInfo(std::move(storage_ids), std::move(virtual_devices),
                                             std::move(sid_sizes_byte));
    smap.Set(GetRef<Expr>(kv.first), storage_info);
@@ -356,25 +370,19 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
  class TokenAllocator {
   public:
-    StorageToken* Alloc(StorageToken* proto) {
-      return Is2DStorage(proto) ? token_2d_.Alloc(proto, storage_ids_++)
-                                : token_1d_.Alloc(proto, storage_ids_++);
-    }
+    StorageToken* Alloc(StorageToken* proto) { return token_mixed_.Alloc(proto, storage_ids_++); }
    StorageToken* Request(StorageToken* proto) {
-      StorageToken* token =
-          Is2DStorage(proto) ? token_2d_.Request(proto) : token_1d_.Request(proto);
+      StorageToken* token = token_mixed_.Request(proto);
      return token ? token : this->Alloc(proto);
    }
-    void CheckForRelease(StorageToken* tok) {
-      return Is2DStorage(tok) ? token_2d_.CheckForRelease(tok) : token_1d_.CheckForRelease(tok);
-    }
+    void CheckForRelease(StorageToken* tok) { return token_mixed_.CheckForRelease(tok); }
    size_t GetMemorySize(StorageToken* tok) {
      // TODO(amalyshe): figure out who requries sizes and for what
      // size in case of texture is not enough - we can return any value if it
      // assumed to be used for memory allocatoion or we can return real size
      // if it is just for information
-      return Is2DStorage(tok) ?
0 : token_1d_.GetMemorySize(tok); + return token_mixed_.GetMemorySize(tok); } static bool Is2DStorage(StorageToken* tok) { return relay::Is2DStorage(tok->virtual_device->memory_scope); @@ -382,8 +390,7 @@ class StorageAllocator : public StorageAllocaBaseVisitor { private: int64_t storage_ids_{0}; - TokenAllocator1D token_1d_; - TokenAllocator2D token_2d_; + TokenAllocatorMixed token_mixed_; }; private: diff --git a/src/relay/backend/token_allocator.cc b/src/relay/backend/token_allocator.cc index bdecba9afad7..e974944b33b0 100644 --- a/src/relay/backend/token_allocator.cc +++ b/src/relay/backend/token_allocator.cc @@ -31,22 +31,45 @@ namespace tvm { namespace relay { +constexpr auto Is2DStorage = runtime::IsTextureStorage; -size_t TokenAllocator1D::GetMemorySize(StorageToken* prototype) { +/* + * Mixed mode memory allocator + */ +size_t TokenAllocatorMixed::GetMemorySize(StorageToken* prototype) { TensorType ttype = prototype->ttype; ICHECK(ttype.defined()); size_t size = 1; - for (IndexExpr dim : ttype->shape) { - const int64_t* pval = tir::as_const_int(dim); - ICHECK(pval != nullptr) << "Cannot allocate memory symbolic tensor shape " << ttype->shape; - ICHECK_GE(*pval, 0) << "Cannot allocate memory for tensor with negative shape" << *pval; - size *= static_cast(pval[0]); + if (relay::Is2DStorage(prototype->virtual_device->memory_scope)) { + size = GetSize2D(prototype); + } else { + for (IndexExpr dim : ttype->shape) { + const int64_t* pval = tir::as_const_int(dim); + ICHECK(pval != nullptr) << "Cannot allocate memory symbolic tensor shape " << ttype->shape; + ICHECK_GE(*pval, 0) << "Cannot allocate memory for tensor with negative shape" << *pval; + size *= static_cast(pval[0]); + } + size *= DivRoundUp(ttype->dtype.bits() * ttype->dtype.lanes(), 8); } - size *= DivRoundUp(ttype->dtype.bits() * ttype->dtype.lanes(), 8); return size; } -StorageToken* TokenAllocator1D::Request(StorageToken* prototype) { +String GetDeviceCompatibleToken(StorageToken* tok) { + Target null_tgt{nullptr}; + if (null_tgt == tok->virtual_device->target) { + return tok->virtual_device->memory_scope; + } + std::string dev_kind = tok->virtual_device->target->kind->name; + auto* device_scope_handler = tvm::runtime::Registry::Get("DeviceScopeCompatibility." 
+ dev_kind);
+  if (device_scope_handler) {
+    String dev_scope =
+        (*device_scope_handler)(tok->virtual_device->target, tok->virtual_device->memory_scope);
+    return dev_scope;
+  }
+  return tok->virtual_device->memory_scope;
+}
+
-StorageToken* TokenAllocator1D::Request(StorageToken* prototype) {
+StorageToken* TokenAllocatorMixed::Request(StorageToken* prototype) {
   // calculate the size;
   size_t size = GetMemorySize(prototype);
   // search memory block in [size / match_range_, size * match_range_)
@@ -59,32 +82,42 @@ StorageToken* TokenAllocator1D::Request(StorageToken* prototype) {
   // search for memory blocks larger than requested
   for (auto it = mid; it != end; ++it) {
     StorageToken* tok = it->second;
-    if (!tok->is_compatible(*prototype)) continue;
-    ICHECK_EQ(tok->ref_counter, 0);
-    // Use exect matching strategy
-    tok->max_bytes = std::max(size, tok->max_bytes);
-    tok->ref_counter = prototype->ref_counter;
-    // find a exact match, erase from map and return
-    free_.erase(it);
-    return tok;
+    bool dev_compatible = (GetDeviceCompatibleToken(tok) == GetDeviceCompatibleToken(prototype));
+    if (tok->is_compatible(*prototype) || (dev_compatible)) {
+      ICHECK_EQ(tok->ref_counter, 0);
+      // Use exact matching strategy
+      if (size > tok->max_bytes) {
+        tok->max_bytes = size;
+        tok->ttype = prototype->ttype;
+      }
+      tok->ref_counter = prototype->ref_counter;
+      // found an exact match; erase from map and return
+      free_.erase(it);
+      return tok;
+    }
   }
   // then search for memory blocks smaller than requested space
   for (auto it = mid; it != begin;) {
     --it;
     StorageToken* tok = it->second;
-    if (!tok->is_compatible(*prototype)) continue;
-    ICHECK_EQ(tok->ref_counter, 0);
-    // Use exect matching strategy
-    tok->max_bytes = std::max(size, tok->max_bytes);
-    tok->ref_counter = prototype->ref_counter;
-    // erase from map and return
-    free_.erase(it);
-    return tok;
+    bool dev_compatible = (GetDeviceCompatibleToken(tok) == GetDeviceCompatibleToken(prototype));
+    if (tok->is_compatible(*prototype) || (dev_compatible)) {
+      ICHECK_EQ(tok->ref_counter, 0);
+      // Use exact matching strategy
+      if (size > tok->max_bytes) {
+        tok->max_bytes = size;
+        tok->ttype = prototype->ttype;
+      }
+      tok->ref_counter = prototype->ref_counter;
+      // erase from map and return
+      free_.erase(it);
+      return tok;
+    }
   }
   return nullptr;
 }

-StorageToken* TokenAllocator1D::Alloc(StorageToken* prototype, int64_t storage_id) {
+StorageToken* TokenAllocatorMixed::Alloc(StorageToken* prototype, int64_t storage_id) {
   size_t size = GetMemorySize(prototype);
   prototype->max_bytes = size;
   prototype->storage_id = storage_id;
@@ -92,7 +125,7 @@ StorageToken* TokenAllocator1D::Alloc(StorageToken* prototype, int64_t storage_i
   return prototype;
 }

-void TokenAllocator1D::CheckForRelease(StorageToken* tok) {
+void TokenAllocatorMixed::CheckForRelease(StorageToken* tok) {
   ICHECK_GE(tok->storage_id, 0);
   ICHECK_GE(tok->ref_counter, 0);
   if (tok->ref_counter == 0) {
@@ -100,101 +133,22 @@ void TokenAllocator1D::CheckForRelease(StorageToken* tok) {
   }
 }

-StorageToken* TokenAllocator2D::Request(StorageToken* prototype) {
-  auto shape = GetSize2D(prototype);
-  const int64_t max_ratio = 5;
-  int64_t min_added_size_x = std::numeric_limits<int64_t>::max();
-  int64_t min_added_size_y = std::numeric_limits<int64_t>::max();
-  int64_t min_wasted_size_x = std::numeric_limits<int64_t>::max();
-  int64_t min_wasted_size_y = std::numeric_limits<int64_t>::max();
-  int64_t best_storage_id = -1;
-  MemBlock new_mem;
-  for (int64_t free_id : free_list_) {
-    MemBlock& cached = blocks_[free_id];
-    // Can only reuse texture 2d blocks of the same type
-    if (cached.token_->ttype->dtype !=
prototype->ttype->dtype) { - continue; - } - // Can only reuse texture 2d blocks of the same scope - // Because reusing textures with different memory scope may lead to - // accuracy issues, because the data will be packed in a different way for - // different memory scopes. - if (cached.token_->virtual_device->memory_scope != prototype->virtual_device->memory_scope) { - continue; - } - // avoid reusing too small and too big textures - if (shape.width / cached.x_ > max_ratio || cached.x_ / shape.width > max_ratio || - shape.height / cached.y_ > max_ratio || cached.y_ / shape.height > max_ratio) { - continue; - } - int64_t new_width = std::max(cached.x_, shape.width); - int64_t new_height = std::max(cached.y_, shape.height); - int64_t added_size_x = new_width - cached.x_; - int64_t added_size_y = new_height - cached.y_; - int64_t wasted_size_x = new_width - shape.width; - int64_t wasted_size_y = new_height - shape.height; - // Prioritize minimization of added size first, then minimize - // wasted size among blocks which would not require expansion - if ((min_added_size_x > 0 && added_size_x < min_added_size_x) || - (min_added_size_y > 0 && added_size_y < min_added_size_y) || - (min_added_size_x == added_size_x && wasted_size_x < min_wasted_size_x) || - (min_added_size_y == added_size_y && wasted_size_y < min_wasted_size_y)) { - min_added_size_x = added_size_x; - min_added_size_y = added_size_y; - min_wasted_size_x = wasted_size_x; - min_wasted_size_y = wasted_size_y; - best_storage_id = free_id; - new_mem.x_ = new_width; - new_mem.y_ = new_height; - } - } - - if (min_added_size_x == 0 && min_added_size_y == 0) { - // use existing block - free_list_.erase(best_storage_id); - blocks_[best_storage_id].token_->ref_counter += prototype->ref_counter; - return blocks_[best_storage_id].token_; - } else if (min_added_size_x <= shape.width || min_added_size_y <= shape.height) { - // Reset the reference counter of the now live token - free_list_.erase(best_storage_id); - new_mem.token_ = prototype; - new_mem.token_->ref_counter += 1; - new_mem.token_->storage_id = best_storage_id; - blocks_[best_storage_id] = new_mem; - return new_mem.token_; - } - return nullptr; -} - -StorageToken* TokenAllocator2D::Alloc(StorageToken* prototype, int64_t storage_id) { - auto shape = GetSize2D(prototype); - MemBlock block; - block.x_ = shape.width; - block.y_ = shape.height; - prototype->storage_id = storage_id; - block.token_ = prototype; - blocks_[prototype->storage_id] = block; - return prototype; -} - -void TokenAllocator2D::CheckForRelease(StorageToken* tok) { - ICHECK_GE(tok->storage_id, 0); - ICHECK_GE(tok->ref_counter, 0); - if (tok->ref_counter == 0) { - free_list_.insert(tok->storage_id); - } -} - -runtime::Texture2DShape TokenAllocator2D::GetSize2D(StorageToken* prototype) { +size_t TokenAllocatorMixed::GetSize2D(StorageToken* prototype) { TensorType ttype = prototype->ttype; ICHECK(ttype.defined()); - size_t axis = runtime::DefaultTextureLayoutSeparator(ttype->shape.size(), - prototype->virtual_device->memory_scope); struct Shape { const Array& shape; int64_t operator[](size_t i) const { return *tir::as_const_int(shape[i]); } + int size() { return this->shape.size(); } }; - return runtime::ApplyTexture2DFlattening(Shape{ttype->shape}, ttype->shape.size(), axis); + auto shape = Shape{ttype->shape}; + int image_row_align = + prototype->virtual_device->target->GetAttr("image_base_address_alignment") + .value_or(Integer(64)) + ->value; + return runtime::GetTextureMemorySize(shape, ttype->dtype.bits(), 
ttype->dtype.lanes(), + prototype->virtual_device->memory_scope, + image_row_align); } } // namespace relay diff --git a/src/relay/backend/token_allocator.h b/src/relay/backend/token_allocator.h index 3aebd71b6c2b..5524e6b2c634 100644 --- a/src/relay/backend/token_allocator.h +++ b/src/relay/backend/token_allocator.h @@ -66,9 +66,9 @@ struct StorageToken { }; /** - * @brief Memory manager for flattened 1d memory (buffers) + * @brief Memory manager for mixed mode memory types */ -class TokenAllocator1D { +class TokenAllocatorMixed { public: /*! * \brief ceil(size/word_size) to get number of words. @@ -105,54 +105,22 @@ class TokenAllocator1D { * \param tok The token to be released. */ void CheckForRelease(StorageToken* tok); - - private: - // scale used for rough match - const size_t match_range_{16}; - // free list of storage entry - std::multimap free_; - // all the storage resources available - std::vector data_; -}; - -/** - * @brief Memory manager for 2d memory (textures) - */ -class TokenAllocator2D { - public: - /*! - * \brief Request a storage token for a given prototype. - * \param prototype. The prototype storage token. - * \return The result token. - */ - StorageToken* Request(StorageToken* prototype); - /*! - * \brief Alloacte a storage token by consuming prototype - * \param prototype The prototype token. - * \param size The size of memory being requested. - */ - StorageToken* Alloc(StorageToken* prototype, int64_t storage_id); - /*! - * \brief Check if we can release token. - * \param tok The token to be released. - */ - void CheckForRelease(StorageToken* tok); /*! * \brief Get the texture 2d size requirement * \param prototype The prototype token. - * \return The required texture 2d memory size in (width, height, channel). + * \return The physical memory size. */ - runtime::Texture2DShape GetSize2D(StorageToken* prototype); + size_t GetSize2D(StorageToken* prototype); protected: - struct MemBlock { - StorageToken* token_; - int64_t x_; - int64_t y_; - }; + // free list of storage entry + std::multimap free_; + // all the storage resources available + std::vector data_; - std::unordered_map blocks_; - std::unordered_set free_list_; + private: + // scale used for rough match + const size_t match_range_{16}; }; } // namespace relay diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc index ea22b89dd771..45a394e733b5 100644 --- a/src/runtime/c_runtime_api.cc +++ b/src/runtime/c_runtime_api.cc @@ -187,6 +187,11 @@ void* DeviceAPI::AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDa return nullptr; } +void* DeviceAPI::AllocDataSpaceView(Device dev, void* data, ShapeTuple shape, DLDataType dtype, + Optional mem_scope) { + return data; +} + void DeviceAPI::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) { // by default, we can always redirect to the flat memory copy operation. 
size_t nbytes = GetDataSize(*from); @@ -206,6 +211,8 @@ void DeviceAPI::CopyDataFromTo(const void* from, size_t from_offset, void* to, s void DeviceAPI::FreeWorkspace(Device dev, void* ptr) { FreeDataSpace(dev, ptr); } +void DeviceAPI::FreeDataSpaceView(Device dev, void* ptr) {} + TVMStreamHandle DeviceAPI::CreateStream(Device dev) { return nullptr; } void DeviceAPI::FreeStream(Device dev, TVMStreamHandle stream) {} diff --git a/src/runtime/contrib/clml/clml_runtime.cc b/src/runtime/contrib/clml/clml_runtime.cc index d8c0075fcdc1..fa7338177cbe 100644 --- a/src/runtime/contrib/clml/clml_runtime.cc +++ b/src/runtime/contrib/clml/clml_runtime.cc @@ -54,7 +54,7 @@ CLMLWorkspace::CLMLWorkspace() { tentry = workspace->GetThreadEntry(); device_id = workspace->GetCLDeviceID(tentry->device.device_id); - platform_id = workspace->device_to_platform[device_id]; + platform_id = workspace->device_info[device_id].platform_id; // Print extensions size_t reqd_size = 0; diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc index 33908d750d6d..82b8d9062615 100644 --- a/src/runtime/cuda/cuda_device_api.cc +++ b/src/runtime/cuda/cuda_device_api.cc @@ -127,6 +127,8 @@ class CUDADeviceAPI final : public DeviceAPI { *rv = static_cast(free_mem); return; } + case kImagePitchAlignment: + return; } *rv = value; } diff --git a/src/runtime/graph_executor/graph_executor.cc b/src/runtime/graph_executor/graph_executor.cc index 1b1051322c49..1757aae50663 100644 --- a/src/runtime/graph_executor/graph_executor.cc +++ b/src/runtime/graph_executor/graph_executor.cc @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -424,36 +425,31 @@ void GraphExecutor::SetupStorage() { } pool_entry[sid].param_data_entry = i; pool_entry[sid].device_type = device_type; - pool_entry[sid].scope = storage_scope; DLDataType t = vtype[i]; - if (!details::Is2DStorage(storage_scope)) { - size_t size = 1; - for (int64_t sz : attrs_.shape[i]) { - size *= static_cast(sz); - } - size_t bits = t.bits * t.lanes; - ICHECK(bits % 8U == 0U || bits == 1U || bits == 4U); - int64_t bytes = ((bits + 7U) / 8U) * size; - pool_entry[sid].shape[0] = std::max(pool_entry[sid].shape[0], bytes); - pool_entry[sid].dtype = DLDataType{kDLFloat, 32, 1}; - } else { - if (pool_entry[sid].shape.size() == 1) { - pool_entry[sid].shape.resize(3, 0); - } - size_t axis = runtime::DefaultTextureLayoutSeparator(attrs_.shape[i].size(), storage_scope); - auto shape = ApplyTexture2DFlattening(attrs_.shape[i], attrs_.shape[i].size(), axis); - pool_entry[sid].shape[0] = std::max(pool_entry[sid].shape[0], shape.height); - pool_entry[sid].shape[1] = std::max(pool_entry[sid].shape[1], shape.width); - CHECK(pool_entry[sid].shape[2] == 0 || pool_entry[sid].shape[2] == shape.channel) - << pool_entry[sid].shape[2] << " != " << shape.channel - << ", texture channel length must be consistent within a storage pool"; - pool_entry[sid].shape[2] = shape.channel; - CHECK(pool_entry[sid].dtype.bits == 0 || TypeEqual(pool_entry[sid].dtype, t)) - << DLDataType2String(pool_entry[sid].dtype) << " != " << DLDataType2String(t) - << ", pool entry for 2d texure allocations must be of the same type;" - << " downstream error from memory planner likely"; + + auto dev_type = pool_entry[sid].device_type; + const auto& cit = std::find_if(devices_.begin(), devices_.end(), [&dev_type](const Device& d) { + return dev_type == static_cast(d.device_type); + }); + Device dev = cit == devices_.end() ? 
devices_[0] : *cit; + + DLTensor temp; + temp.data = nullptr; + temp.device = dev; + temp.ndim = attrs_.shape[i].size(); + temp.dtype = t; + temp.shape = static_cast(attrs_.shape[i].data()); + temp.strides = nullptr; + temp.byte_offset = 0; + + int64_t alloc_size = DeviceAPI::Get(dev)->GetDataSize(temp, String(storage_scope)); + + if (pool_entry[sid].alloc_size < alloc_size) { pool_entry[sid].dtype = t; + pool_entry[sid].shape = attrs_.shape[i]; + pool_entry[sid].alloc_size = alloc_size; + pool_entry[sid].scope = storage_scope; } } @@ -469,9 +465,6 @@ void GraphExecutor::SetupStorage() { storage_pool_.push_back(pit.linked_param); } else { std::vector shape = pit.shape; - if (shape.size() == 1) { - shape[0] = (shape[0] + 3) / 4; - } Optional mem_scope; if (!pit.scope.empty()) { mem_scope = String(pit.scope); @@ -494,8 +487,12 @@ void GraphExecutor::SetupStorage() { sid_to_eid_[storage_id].push_back(i); ICHECK_LT(static_cast(storage_id), storage_pool_.size()); - data_entry_[i] = storage_pool_[storage_id].CreateView(attrs_.shape[i], vtype[i]); - + std::string storage_scope = attrs_.storage_scope.empty() ? "" : attrs_.storage_scope[i]; + Optional mem_scope; + if (!storage_scope.empty()) { + mem_scope = String(storage_scope); + } + data_entry_[i] = storage_pool_[storage_id].CreateView(attrs_.shape[i], vtype[i], 0, mem_scope); const DLTensor* tmp = data_entry_[i].operator->(); data_alignment_[i] = details::GetDataAlignment(*tmp); } diff --git a/src/runtime/graph_executor/graph_executor.h b/src/runtime/graph_executor/graph_executor.h index cfdba8916baa..d9f0e0aec34a 100644 --- a/src/runtime/graph_executor/graph_executor.h +++ b/src/runtime/graph_executor/graph_executor.h @@ -224,6 +224,7 @@ class TVM_DLL GraphExecutor : public ModuleNode { int param_data_entry; NDArray linked_param; std::string scope; + int64_t alloc_size{-1}; // PoolEntry(int s, int dev_type, void* pre_linked_param) : // size(s), device_type(dev_type), pre_linked_param(std::move(pre_linked_param)) {} }; diff --git a/src/runtime/memory/memory_manager.cc b/src/runtime/memory/memory_manager.cc index 0607697e6b83..510f9a13be7b 100644 --- a/src/runtime/memory/memory_manager.cc +++ b/src/runtime/memory/memory_manager.cc @@ -131,9 +131,21 @@ MemoryManager* MemoryManager::Global() { Allocator* MemoryManager::GetOrCreateAllocator(Device dev, AllocatorType type) { MemoryManager* m = MemoryManager::Global(); std::lock_guard lock(m->mu_); - if (m->allocators_.find(dev) == m->allocators_.end()) { + auto it = m->allocators_.find(dev); + if (it == m->allocators_.end()) { m->allocators_.emplace(dev, std::unordered_map>()); } + + // Look for any available, else create Naive. 
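+  // AllocatorType::kAny deliberately reuses whichever allocator was created
+  // for this device earlier, so existing pools keep serving requests; a
+  // kNaive allocator is created only when the device has none yet.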
+ if (type == AllocatorType::kAny) { + it = m->allocators_.find(dev); + if (it->second.begin() != it->second.end()) { + return it->second.begin()->second.get(); + } else { + type = AllocatorType::kNaive; + } + } + if (m->allocators_.at(dev).find(type) == m->allocators_.at(dev).end()) { std::unique_ptr alloc; switch (type) { @@ -155,11 +167,6 @@ Allocator* MemoryManager::GetOrCreateAllocator(Device dev, AllocatorType type) { return ret; } auto alloc = m->allocators_.at(dev).at(type).get(); - /*if (alloc->type() != type) { - LOG(WARNING) << "The type of existing allocator for " << dev - << " is different from the request type (" << alloc->type() << " vs " << type - << ")"; - }*/ return alloc; } @@ -191,16 +198,12 @@ NDArray Allocator::Empty(ShapeTuple shape, DLDataType dtype, DLDevice dev, VerifyDataType(dtype); NDArray::Container* container = new NDArray::Container(nullptr, shape, dtype, dev); container->SetDeleter(BufferDeleter); - size_t size = DeviceAPI::Get(dev)->GetDataSize(container->dl_tensor); + size_t size = DeviceAPI::Get(dev)->GetDataSize(container->dl_tensor, mem_scope); size_t alignment = GetDataAlignment(container->dl_tensor); Buffer* buffer = new Buffer; - if (!mem_scope.defined() || mem_scope.value().empty() || mem_scope.value() == "global") { - *buffer = this->Alloc(dev, size, alignment, dtype); - } else { - *buffer = this->Alloc(dev, shape, dtype, mem_scope.value()); - } - container->manager_ctx = reinterpret_cast(buffer); + *buffer = this->Alloc(dev, size, alignment, dtype); container->dl_tensor.data = buffer->data; + container->manager_ctx = reinterpret_cast(buffer); return NDArray(GetObjectPtr(container)); } @@ -210,16 +213,22 @@ bool Allocator::AllowMemoryScope(const std::string& mem_scope) const { Buffer Allocator::Alloc(Device dev, ShapeTuple shape, DLDataType type_hint, const std::string& mem_scope) { + NDArray::Container container(nullptr, shape, type_hint, dev); + size_t size = DeviceAPI::Get(dev)->GetDataSize(container.dl_tensor); + if (AllowMemoryScope(mem_scope)) { - // by default, we can always redirect to the flat memory allocations - NDArray::Container container(nullptr, shape, type_hint, dev); - size_t size = DeviceAPI::Get(dev)->GetDataSize(container.dl_tensor); size_t alignment = GetDataAlignment(container.dl_tensor); return Alloc(dev, size, alignment, type_hint); } - LOG(FATAL) << "Allocator cannot allocate data space with " - << "specified memory scope: " << mem_scope; - return {}; + Buffer buf; + buf.device = dev; + buf.size = size; + buf.alloc_type = type_; + buf.data = DeviceAPI::Get(dev)->AllocDataSpace(dev, shape.size(), shape.data(), type_hint, + String(mem_scope)); + used_memory_.fetch_add(size, std::memory_order_relaxed); + DLOG(INFO) << "allocate " << size << " B, used memory " << used_memory_ << " B"; + return buf; } void Allocator::Clear() { diff --git a/src/runtime/memory/naive_allocator.h b/src/runtime/memory/naive_allocator.h index 6d8e90fed9f2..62d8e8f06c80 100644 --- a/src/runtime/memory/naive_allocator.h +++ b/src/runtime/memory/naive_allocator.h @@ -35,7 +35,7 @@ namespace memory { class NaiveAllocator final : public Allocator { public: - explicit NaiveAllocator() : Allocator(kNaive), used_memory_(0) {} + explicit NaiveAllocator() : Allocator(kNaive) { used_memory_ = 0; } Buffer Alloc(Device dev, size_t nbytes, size_t alignment, DLDataType type_hint) final { Buffer buf; @@ -48,32 +48,6 @@ class NaiveAllocator final : public Allocator { return buf; } - Buffer Alloc(Device dev, ShapeTuple shape, DLDataType type_hint, - const std::string& 
mem_scope) final { - Buffer buf; - size_t nbytes = 1; - for (int i = 0; i < static_cast(shape.size()); ++i) { - nbytes *= static_cast(shape[i]); - } - nbytes *= (type_hint.bits * type_hint.lanes + 7) / 8; - buf.device = dev; - if (AllowMemoryScope(mem_scope)) { - auto tmp_buf = Allocator::Alloc(dev, shape, type_hint, mem_scope); - buf.size = tmp_buf.size; - buf.data = tmp_buf.data; - buf.alloc_type = kNaive; - return buf; - } - - buf.size = nbytes; - buf.data = DeviceAPI::Get(dev)->AllocDataSpace(dev, shape.size(), shape.data(), type_hint, - String(mem_scope)); - used_memory_.fetch_add(nbytes, std::memory_order_relaxed); - DLOG(INFO) << "allocate " << nbytes << " B, used memory " << used_memory_ << " B"; - buf.alloc_type = kNaive; - return buf; - } - void Free(const Buffer& buffer) override { DeviceAPI::Get(buffer.device)->FreeDataSpace(buffer.device, buffer.data); used_memory_.fetch_sub(buffer.size, std::memory_order_relaxed); @@ -81,9 +55,6 @@ class NaiveAllocator final : public Allocator { } size_t UsedMemory() const override { return used_memory_.load(std::memory_order_relaxed); } - - private: - std::atomic used_memory_; }; } // namespace memory diff --git a/src/runtime/memory/pooled_allocator.h b/src/runtime/memory/pooled_allocator.h index c96c87a73a13..7bc73fd234b1 100644 --- a/src/runtime/memory/pooled_allocator.h +++ b/src/runtime/memory/pooled_allocator.h @@ -41,7 +41,9 @@ class PooledAllocator : public Allocator { static constexpr size_t kDefaultPageSize = 4096; explicit PooledAllocator(size_t page_size = kDefaultPageSize) - : Allocator(kPooled), page_size_(page_size), used_memory_(0) {} + : Allocator(kPooled), page_size_(page_size) { + used_memory_ = 0; + } ~PooledAllocator() { ReleaseAll(); } @@ -73,15 +75,6 @@ class PooledAllocator : public Allocator { return buf; } - Buffer Alloc(Device dev, ShapeTuple shape, DLDataType type_hint, - const std::string& mem_scope) override { - if (AllowMemoryScope(mem_scope)) { - return Allocator::Alloc(dev, shape, type_hint, mem_scope); - } - LOG(FATAL) << "This alloc should be implemented"; - return {}; - } - void Free(const Buffer& buffer) override { std::lock_guard lock(mu_); if (memory_pool_.find(buffer.size) == memory_pool_.end()) { @@ -120,7 +113,6 @@ class PooledAllocator : public Allocator { protected: size_t page_size_; - std::atomic used_memory_; std::unordered_map> memory_pool_; std::recursive_mutex mu_; }; diff --git a/src/runtime/metal/metal_device_api.mm b/src/runtime/metal/metal_device_api.mm index f2e8c4ab0b75..cbdacb5c096f 100644 --- a/src/runtime/metal/metal_device_api.mm +++ b/src/runtime/metal/metal_device_api.mm @@ -95,6 +95,8 @@ *rv = static_cast([devices[dev.device_id] recommendedMaxWorkingSetSize]); return; } + case kImagePitchAlignment: + return; } }; } diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc index c2cf5f388a21..83711a617f69 100644 --- a/src/runtime/ndarray.cc +++ b/src/runtime/ndarray.cc @@ -102,6 +102,8 @@ struct NDArray::Internal { auto* ptr = static_cast(ptr_obj); if (ptr->manager_ctx != nullptr) { static_cast(ptr->manager_ctx)->DecRef(); + tvm::runtime::DeviceAPI::Get(ptr->dl_tensor.device) + ->FreeDataSpaceView(ptr->dl_tensor.device, ptr->dl_tensor.data); } else if (ptr->dl_tensor.data != nullptr) { tvm::runtime::DeviceAPI::Get(ptr->dl_tensor.device) ->FreeDataSpace(ptr->dl_tensor.device, ptr->dl_tensor.data); @@ -179,7 +181,8 @@ struct NDArray::Internal { } }; -NDArray NDArray::CreateView(ShapeTuple shape, DLDataType dtype, uint64_t relative_byte_offset) { +NDArray 
NDArray::CreateView(ShapeTuple shape, DLDataType dtype, uint64_t relative_byte_offset,
+                            Optional<String> mem_scope) {
   ICHECK(data_ != nullptr);

   const DLTensor& orig = get_mutable()->dl_tensor;
@@ -223,7 +226,10 @@ NDArray NDArray::CreateView(ShapeTuple shape, DLDataType dtype, uint64_t relativ
   // increase ref count
   get_mutable()->IncRef();
   ret.get_mutable()->manager_ctx = get_mutable();
-  ret.get_mutable()->dl_tensor.data = get_mutable()->dl_tensor.data;
+  ret.get_mutable()->dl_tensor.data =
+      DeviceAPI::Get(get_mutable()->dl_tensor.device)
+          ->AllocDataSpaceView(get_mutable()->dl_tensor.device, get_mutable()->dl_tensor.data,
+                               shape, dtype, mem_scope);
   ret.get_mutable()->dl_tensor.byte_offset =
       get_mutable()->dl_tensor.byte_offset + relative_byte_offset;
   return ret;

diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h
index 2e9b05edcb58..b9cf671a643c 100644
--- a/src/runtime/opencl/opencl_common.h
+++ b/src/runtime/opencl/opencl_common.h
@@ -27,6 +27,7 @@
 #include <tvm/runtime/c_runtime_api.h>
 #include <tvm/runtime/device_api.h>
 #include <tvm/runtime/logging.h>
+#include <tvm/runtime/memory/memory_manager.h>
 #include <tvm/runtime/ndarray.h>
 #include <tvm/runtime/packed_func.h>
 #include <tvm/runtime/profiling.h>
@@ -74,12 +75,13 @@
 #include "../pack_args.h"
 #include "../texture.h"
 #include "../thread_storage_scope.h"
-#include "../workspace_pool.h"

 namespace tvm {
 namespace runtime {
 namespace cl {

+using tvm::runtime::memory::Buffer;
+
 static_assert(sizeof(cl_mem) == sizeof(void*), "Required to store cl_mem inside void*");

 inline const char* CLGetErrorString(cl_int error) {
@@ -221,6 +223,12 @@ inline cl_channel_type DTypeToOpenCLChannelType(DLDataType data_type) {
 class OpenCLThreadEntry;
 struct BufferDescriptor;

+struct CLDeviceInfo {
+  cl_platform_id platform_id;      // platform id
+  cl_uint image_row_align;         // CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR
+  bool image_from_buffer_support;  // extension: cl_khr_image2d_from_buffer
+};
+
 /*!
  * \brief Process global OpenCL workspace.
  */
@@ -234,8 +242,8 @@ class OpenCLWorkspace : public DeviceAPI {
  std::unordered_map<cl_platform_id, cl_context> contexts;
  // whether the workspace it initialized.
  bool initialized_{false};
-  // map device to platform
-  std::unordered_map<cl_device_id, cl_platform_id> device_to_platform;
+  // map device to various device information
+  std::unordered_map<cl_device_id, CLDeviceInfo> device_info;
  // the devices
  std::vector<cl_device_id> devices;
  // the queues
@@ -251,6 +259,7 @@ class OpenCLWorkspace : public DeviceAPI {
  std::vector<size_t> free_kernel_ids;
  // the mutex for initialization
  std::mutex mu;
+
  // destructor
  ~OpenCLWorkspace() {
    for (auto& it : contexts) {
@@ -284,6 +293,15 @@ class OpenCLWorkspace : public DeviceAPI {
        << "Invalid OpenCL device_id=" << dev.device_id << ". 
" << GetError(); return events[dev.device_id]; } + bool IsOpenCLExtensionSupported(cl_device_id did, const std::string& name) { + size_t reqd_size = 0; + OPENCL_CALL(clGetDeviceInfo(did, CL_DEVICE_EXTENSIONS, 0, nullptr, &reqd_size)); + std::vector extn_buf(reqd_size); + OPENCL_CALL(clGetDeviceInfo(did, CL_DEVICE_EXTENSIONS, reqd_size, extn_buf.data(), nullptr)); + std::string extensions(extn_buf.data()); + return (extensions.find(name) != std::string::npos); + } + // is current clCommandQueue in profiling mode bool IsProfiling(Device dev) { cl_command_queue queue = GetQueue(dev); @@ -309,12 +327,18 @@ class OpenCLWorkspace : public DeviceAPI { OPENCL_CALL(clReleaseCommandQueue(queue)); cl_int err_code; cl_device_id did = cl::OpenCLWorkspace::Global()->GetCLDeviceID(dev.device_id); - cl_platform_id platform = cl::OpenCLWorkspace::Global()->device_to_platform[did]; + cl_platform_id platform = cl::OpenCLWorkspace::Global()->device_info[did].platform_id; auto profiling_queue = clCreateCommandQueue(cl::OpenCLWorkspace::Global()->contexts[platform], did, prop, &err_code); OPENCL_CHECK_ERROR(err_code); cl::OpenCLWorkspace::Global()->queues[dev.device_id] = profiling_queue; } + cl_uint GetImageAlignment(int device_id) { + return device_info[GetCLDeviceID(device_id)].image_row_align; + } + bool IsBufferToImageSupported(int device_id) { + return device_info[GetCLDeviceID(device_id)].image_from_buffer_support; + } cl_device_id GetCLDeviceID(int device_id); // override device API @@ -323,18 +347,24 @@ class OpenCLWorkspace : public DeviceAPI { void* AllocDataSpace(Device dev, size_t size, size_t alignment, DLDataType type_hint) final; void* AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, Optional mem_scope = NullOpt) final; + void* AllocDataSpace(Device dev, size_t width, size_t height, DLDataType type_hint, + Optional mem_scope = NullOpt); + void* AllocDataSpaceView(Device dev, void* data, ShapeTuple shape, DLDataType dtype, + Optional mem_scope = NullOpt) final; void* GetNativePtr(const tvm::runtime::NDArray& narr); void SetNativePtr(const tvm::runtime::NDArray& narr, void* host_ptr, size_t buf_size); void SetPerfHint(Device dev, cl_uint perf_hint); void FreeDataSpace(Device dev, void* ptr) final; + void FreeDataSpaceView(Device dev, void* ptr) final; void StreamSync(Device dev, TVMStreamHandle stream) final; void* AllocWorkspace(Device dev, size_t size, DLDataType type_hint) final; void FreeWorkspace(Device dev, void* data) final; + size_t GetDataSize(const DLTensor& arr, Optional mem_scope = NullOpt) final; - // Texture (image2d_t) alloca APIs - cl_mem AllocTexture(Device dev, size_t width, size_t height, DLDataType type_hint); - void* AllocTextureWorkspace(Device dev, size_t width, size_t height, DLDataType type_hint); - void FreeTextureWorkspace(Device dev, void* data); + // cl_mem alloc utils + void* AllocCLBuffer(Device dev, size_t size, size_t alignment, DLDataType type_hint); + void* AllocCLImage(Device dev, void* back_buffer, size_t width, size_t height, size_t row_pitch, + DLDataType type_hint, Optional mem_scope); /*! * \brief Get the thread local ThreadEntry @@ -370,13 +400,8 @@ class OpenCLThreadEntry { Device device; /*! \brief The thread-local kernel table */ std::vector kernel_table; - /*! \brief workspace pool */ - WorkspacePool pool; - /*! 
\brief texture pool */ - TexturePool texture_pool; // constructor - OpenCLThreadEntry(DLDeviceType device_type, DeviceAPI* device_api) - : pool(device_type, device_api), texture_pool(device_type, device_api) { + OpenCLThreadEntry(DLDeviceType device_type, DeviceAPI* device_api) { device.device_id = 0; device.device_type = device_type; } @@ -414,9 +439,14 @@ struct BufferDescriptor { static MemoryLayout MemoryLayoutFromScope(Optional mem_scope); static String ScopeFromMemoryLayout(MemoryLayout mem_scope); + /* clBuffer object */ + // buffer should be the first element here cl_mem buffer{nullptr}; + cl::BufferDescriptor* back_buffer{nullptr}; cl_uchar* host_ptr{nullptr}; MemoryLayout layout{MemoryLayout::kBuffer1D}; + Buffer mbuf{nullptr}; // MemoryManager ref. + bool is_compat_view{false}; }; } // namespace cl diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index 5c5873b67f74..adcb72043fc6 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -103,6 +103,19 @@ String cl::BufferDescriptor::ScopeFromMemoryLayout(cl::BufferDescriptor::MemoryL return ""; } +static size_t GetMemObjectSize(Device dev, int ndim, const int64_t* shape, DLDataType dtype) { + DLTensor temp; + temp.data = nullptr; + temp.device = dev; + temp.ndim = ndim; + temp.dtype = dtype; + temp.shape = const_cast(shape); + temp.strides = nullptr; + temp.byte_offset = 0; + size_t size = DeviceAPI::Get(dev)->GetDataSize(temp); + return size; +} + OpenCLThreadEntry* OpenCLWorkspace::GetThreadEntry() { return OpenCLThreadEntry::ThreadLocal(); } OpenCLWorkspace* OpenCLWorkspace::Global() { @@ -220,6 +233,10 @@ void OpenCLWorkspace::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) // https://stackoverflow.com/a/3568223, may not be implementable // at all through OpenCL API. 
break; + case kImagePitchAlignment: { + *rv = static_cast(device_info[device_id].image_row_align); + break; + } } } @@ -238,8 +255,55 @@ void* OpenCLWorkspace::CreateHostPtrIfEnabled(cl::BufferDescriptor* desc, Device void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t size, size_t alignment, DLDataType type_hint) { this->Init(); + return AllocCLBuffer(dev, size, alignment, type_hint); +} + +void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t width, size_t height, DLDataType type_hint, + Optional mem_scope) { + // Texture allocation given width and height + cl_uint row_align = GetImageAlignment(dev.device_id); + size_t pixel_size = (type_hint.bits * type_hint.lanes + 7) / 8; + size_t row_pitch = ALIGN_UP(width * pixel_size * 4, row_align); // CL_RGBA = 4 + size_t mem_size = row_pitch * height; + + // Alloc back buffer from pool + cl::BufferDescriptor* back_buffer = nullptr; + if (IsBufferToImageSupported(dev.device_id)) { + auto buf = MemoryManager::GetOrCreateAllocator(dev, AllocatorType::kAny) + ->Alloc(dev, mem_size, kTempAllocaAlignment, type_hint); + back_buffer = static_cast(buf.data); + back_buffer->mbuf = buf; + } + + if (!mem_scope.defined()) { + mem_scope = String("global.texture"); + } + return AllocCLImage(dev, back_buffer, width, height, row_pitch, type_hint, mem_scope); +} + +void* OpenCLWorkspace::AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, + Optional mem_scope) { + this->Init(); + if (!mem_scope.defined() || mem_scope.value().empty() || mem_scope.value() == "global") { + size_t size = GetMemObjectSize(dev, ndim, shape, dtype); + cl::BufferDescriptor* ret_buffer = nullptr; + auto buf = MemoryManager::GetOrCreateAllocator(dev, AllocatorType::kAny) + ->Alloc(dev, size, kTempAllocaAlignment, dtype); + ret_buffer = static_cast(buf.data); + ret_buffer->mbuf = buf; + return ret_buffer; + } + size_t axis = DefaultTextureLayoutSeparator(ndim, mem_scope.value()); + auto texture = ApplyTexture2DFlattening(shape, ndim, axis); + + return AllocDataSpace(dev, texture.width, texture.height, dtype, mem_scope); +} + +void* OpenCLWorkspace::AllocCLBuffer(Device dev, size_t size, size_t alignment, + DLDataType type_hint) { + this->Init(); cl_device_id device_id = GetCLDeviceID(dev.device_id); - auto platform = device_to_platform[device_id]; + auto platform = device_info[device_id].platform_id; cl_int err_code; cl::BufferDescriptor* desc = new cl::BufferDescriptor; // CL_INVALID_BUFFER_SIZE if size is 0. 
@@ -253,25 +317,125 @@ void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t size, size_t alignment, return CreateHostPtrIfEnabled(desc, dev, size); } -void* OpenCLWorkspace::AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, - Optional mem_scope) { - if (!mem_scope.defined() || mem_scope.value().empty() || mem_scope.value() == "global") { - return DeviceAPI::AllocDataSpace(dev, ndim, shape, dtype, mem_scope); - } - ICHECK(IsTextureStorage(std::string(mem_scope.value()))) - << "Device does not support allocate data space with " - << "specified memory scope: " << mem_scope.value(); +void* OpenCLWorkspace::AllocCLImage(Device dev, void* back_buffer, size_t width, size_t height, + size_t row_pitch, DLDataType type_hint, + Optional mem_scope) { + this->Init(); + ICHECK(std::string(mem_scope.value()).find("texture") != std::string::npos) + << "Expect texture scope while creating an Image object"; + cl::BufferDescriptor* back_desc = static_cast(back_buffer); + cl_device_id device_id = GetCLDeviceID(dev.device_id); + auto platform = device_info[device_id].platform_id; + cl_int err_code; + cl_channel_type cl_type = DTypeToOpenCLChannelType(type_hint); + cl_image_format format = {CL_RGBA, cl_type}; + cl_image_desc descriptor = {CL_MEM_OBJECT_IMAGE2D, width, height, 0, 0, 0, 0, 0, 0}; - ICHECK(ndim > 2) << "Shape for texture allocation must be at least rank 3; " - << "provided shape is rank " << ndim; + if (IsBufferToImageSupported(dev.device_id)) { + descriptor.image_row_pitch = row_pitch; + descriptor.buffer = back_desc->buffer; + } + cl_mem mptr = clCreateImage(this->contexts[platform], CL_MEM_CREATE_FLAGS, &format, &descriptor, + nullptr, &err_code); + OPENCL_CHECK_ERROR(err_code); cl::BufferDescriptor* desc = new cl::BufferDescriptor(mem_scope); - size_t axis = DefaultTextureLayoutSeparator(ndim, mem_scope.value()); - auto texture = ApplyTexture2DFlattening(shape, ndim, axis); - desc->buffer = AllocTexture(dev, texture.width, texture.height, dtype); + desc->buffer = mptr; + desc->back_buffer = back_desc; + return desc; } +size_t OpenCLWorkspace::GetDataSize(const DLTensor& arr, Optional mem_scope) { + if (!mem_scope.defined() || mem_scope.value().empty() || mem_scope.value() == "global") { + return DeviceAPI::GetDataSize(arr); + } + cl_uint row_align = GetImageAlignment(GetThreadEntry()->device.device_id); + std::vector shape; + shape.assign(arr.shape, arr.shape + arr.ndim); + return runtime::GetTextureMemorySize>(shape, arr.dtype.bits, arr.dtype.lanes, + mem_scope.value(), row_align); +} + +void* OpenCLWorkspace::AllocDataSpaceView(Device dev, void* data, ShapeTuple shape, + DLDataType dtype, Optional mem_scope) { + cl::BufferDescriptor* desc = static_cast(data); + + // Fall back for devices w/o "cl_khr_image2d_from_buffer" + if (!IsBufferToImageSupported(dev.device_id)) { + cl::BufferDescriptor* ret_desc = desc; // buffer -> buffer + if (!mem_scope.defined() || mem_scope.value().empty() || mem_scope.value() == "global") { + if (desc->layout != cl::BufferDescriptor::MemoryLayout::kBuffer1D) { + // image -> buffer + size_t nbytes = GetMemObjectSize(dev, shape.size(), shape.data(), dtype); + ret_desc = static_cast( + OpenCLWorkspace::AllocCLBuffer(dev, nbytes, kTempAllocaAlignment, dtype)); + ret_desc->is_compat_view = true; + } + } else { + // Any -> Image + size_t axis = DefaultTextureLayoutSeparator(shape.size(), mem_scope.value()); + auto texture = ApplyTexture2DFlattening(shape.data(), shape.size(), axis); + cl_uint row_align = GetImageAlignment(dev.device_id); 
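+      // The row pitch is the byte width of one RGBA row (4 channels per
+      // texel) rounded up to the device image pitch alignment queried at
+      // workspace initialization.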
+ size_t pixel_size = (dtype.bits * dtype.lanes + 7) / 8; + size_t row_pitch = ALIGN_UP(texture.width * pixel_size * 4, row_align); // CL_RGBA = 4 + + ret_desc = static_cast(OpenCLWorkspace::Global()->AllocCLImage( + dev, nullptr, texture.width, texture.height, row_pitch, dtype, mem_scope)); + ret_desc->is_compat_view = true; + } + return ret_desc; + } + + if (!mem_scope.defined() || mem_scope.value().empty() || mem_scope.value() == "global") { + if (desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D) { + // buffer -> buffer + return desc; + } else { + // image -> buffer + return desc->back_buffer; + } + } + size_t axis = DefaultTextureLayoutSeparator(shape.size(), mem_scope.value()); + auto texture = ApplyTexture2DFlattening(shape.data(), shape.size(), axis); + cl_uint row_align = GetImageAlignment(dev.device_id); + size_t pixel_size = (dtype.bits * dtype.lanes + 7) / 8; + size_t row_pitch = ALIGN_UP(texture.width * pixel_size * 4, row_align); // CL_RGBA = 4 + + cl::BufferDescriptor* back_buffer; + if (desc->back_buffer) { + // image -> image + back_buffer = desc->back_buffer; + } else { + // buffer -> image + back_buffer = desc; + } + + return (cl::BufferDescriptor*)AllocCLImage(dev, back_buffer, texture.width, texture.height, + row_pitch, dtype, mem_scope); +} + +void OpenCLWorkspace::FreeDataSpaceView(Device dev, void* ptr) { + auto* desc = static_cast(ptr); + // Handle the fall back + if (!IsBufferToImageSupported(dev.device_id)) { + if (desc->is_compat_view) { + // TODO(Siva): Do we need this waiting for entire queue ? + OPENCL_CALL(clFinish(this->GetQueue(dev))); + OPENCL_CALL(clReleaseMemObject(desc->buffer)); + delete desc; + } + return; + } + + if (desc->layout != cl::BufferDescriptor::MemoryLayout::kBuffer1D) { + // TODO(Siva): Do we need this waiting for entire queue ? 
+ OPENCL_CALL(clFinish(this->GetQueue(dev))); + OPENCL_CALL(clReleaseMemObject(desc->buffer)); + delete desc; + } +} + void* OpenCLWorkspace::GetNativePtr(const tvm::runtime::NDArray& narr) { cl::BufferDescriptor* desc = static_cast(narr.operator->()->data); return desc->host_ptr; @@ -324,37 +488,29 @@ void OpenCLWorkspace::FreeDataSpace(Device dev, void* ptr) { OPENCL_CALL(clFinish(this->GetQueue(dev))); cl::BufferDescriptor* desc = static_cast(ptr); - if (desc->host_ptr) { - OPENCL_CALL(clEnqueueUnmapMemObject(this->GetQueue(dev), desc->buffer, - reinterpret_cast(desc->host_ptr), 0, nullptr, - nullptr)); + if (desc->back_buffer) { + // 2D Image w/ back buffer allocated from pool + OPENCL_CALL(clReleaseMemObject(desc->buffer)); + MemoryManager::GetAllocator(dev, desc->back_buffer->mbuf.alloc_type) + ->Free(desc->back_buffer->mbuf); + delete desc; + } else { + if (desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D) { + // 1D buffer allocated from pool + if (desc->host_ptr) { + clEnqueueUnmapMemObject(this->GetQueue(dev), desc->buffer, + reinterpret_cast(desc->host_ptr), 0, nullptr, nullptr); + } + OPENCL_CALL(clFinish(this->GetQueue(dev))); + OPENCL_CALL(clReleaseMemObject(desc->buffer)); + delete desc; + } else if (!IsBufferToImageSupported(dev.device_id)) { + // 2D Image allocated w/o pool + OPENCL_CALL(clReleaseMemObject(desc->buffer)); + delete desc; + return; + } } - OPENCL_CALL(clReleaseMemObject(desc->buffer)); - delete desc; -} - -cl_mem OpenCLWorkspace::AllocTexture(Device dev, size_t width, size_t height, - DLDataType type_hint) { - this->Init(); - cl_device_id device_id = GetCLDeviceID(dev.device_id); - auto platform = device_to_platform[device_id]; - cl_int err_code; - cl_channel_type cl_type = DTypeToOpenCLChannelType(type_hint); - cl_image_format format = {CL_RGBA, cl_type}; - cl_image_desc descriptor = {CL_MEM_OBJECT_IMAGE2D, width, height, 0, 0, 0, 0, 0, 0}; - cl_mem mptr = clCreateImage(this->contexts[platform], CL_MEM_READ_WRITE, &format, &descriptor, - nullptr, &err_code); - OPENCL_CHECK_ERROR(err_code); - return mptr; -} - -void* OpenCLWorkspace::AllocTextureWorkspace(Device dev, size_t width, size_t height, - DLDataType type_hint) { - return GetThreadEntry()->texture_pool.AllocTexture(dev, width, height, type_hint); -} - -void OpenCLWorkspace::FreeTextureWorkspace(Device dev, void* ptr) { - GetThreadEntry()->texture_pool.FreeTexture(dev, ptr); } void OpenCLWorkspace::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) { @@ -444,11 +600,18 @@ void OpenCLWorkspace::StreamSync(Device dev, TVMStreamHandle stream) { } void* OpenCLWorkspace::AllocWorkspace(Device dev, size_t size, DLDataType type_hint) { - return GetThreadEntry()->pool.AllocWorkspace(dev, size); + this->Init(); + cl::BufferDescriptor* ret_buffer = nullptr; + auto buf = MemoryManager::GetOrCreateAllocator(dev, AllocatorType::kAny) + ->Alloc(dev, size, kTempAllocaAlignment, type_hint); + ret_buffer = static_cast(buf.data); + ret_buffer->mbuf = buf; + return ret_buffer; } void OpenCLWorkspace::FreeWorkspace(Device dev, void* data) { - GetThreadEntry()->pool.FreeWorkspace(dev, data); + cl::BufferDescriptor* desc = static_cast(data); + MemoryManager::GetAllocator(dev, desc->mbuf.alloc_type)->Free(desc->mbuf); } typedef dmlc::ThreadLocalStore OpenCLThreadStore; @@ -585,9 +748,20 @@ void OpenCLWorkspace::Init(const std::string& type_key, const std::string& devic this->devices.insert(this->devices.end(), devices.begin(), devices.end()); for (size_t i = 0; i < devices.size(); ++i) { 
cl_device_id did = devices[i]; - device_to_platform[did] = platform; + CLDeviceInfo dev_info; + dev_info.platform_id = platform; this->queues.push_back(clCreateCommandQueue(this->contexts[platform], did, 0, &err_code)); OPENCL_CHECK_ERROR(err_code); + cl_uint row_pitch; + OPENCL_CALL(clGetDeviceInfo(did, CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR, sizeof(row_pitch), + &row_pitch, nullptr)); + if (0 == row_pitch) { + row_pitch = kAllocAlignment; // Fallback + } + dev_info.image_row_align = row_pitch; + dev_info.image_from_buffer_support = + IsOpenCLExtensionSupported(did, "cl_khr_image2d_from_buffer"); + device_info.insert({did, dev_info}); } OPENCL_CHECK_ERROR(err_code); } @@ -617,9 +791,9 @@ TVM_REGISTER_GLOBAL("device_api.opencl.alloc_nd").set_body([](TVMArgs args, TVMR type_hint.bits = static_cast(dtype_bits_hint); type_hint.lanes = 1; - OpenCLWorkspace* ptr = OpenCLWorkspace::Global(); - *rv = ptr->AllocTextureWorkspace(dev, static_cast(width), static_cast(height), - type_hint); + *rv = OpenCLWorkspace::Global()->AllocDataSpace(dev, static_cast(width), + static_cast(height), type_hint, + Optional("global.texture")); }); TVM_REGISTER_GLOBAL("device_api.opencl.free_nd").set_body([](TVMArgs args, TVMRetValue* rv) { @@ -632,7 +806,7 @@ TVM_REGISTER_GLOBAL("device_api.opencl.free_nd").set_body([](TVMArgs args, TVMRe Device dev; dev.device_type = static_cast(device_type); dev.device_id = device_id; - ptr->FreeTextureWorkspace(dev, data); + ptr->FreeDataSpace(dev, data); *rv = static_cast(0); }); diff --git a/src/runtime/opencl/opencl_module.cc b/src/runtime/opencl/opencl_module.cc index 567b7ad88a9e..77c50b23895c 100644 --- a/src/runtime/opencl/opencl_module.cc +++ b/src/runtime/opencl/opencl_module.cc @@ -225,7 +225,7 @@ cl_kernel OpenCLModuleNode::InstallKernel(cl::OpenCLWorkspace* w, cl::OpenCLThre std::lock_guard lock(build_lock_); int device_id = t->device.device_id; auto did = w->GetCLDeviceID(device_id); - auto platform = w->device_to_platform[did]; + auto platform = w->device_info[did].platform_id; if (!IsProgramCreated(func_name, device_id)) { // create program if (fmt_ == "cl") { @@ -294,7 +294,7 @@ void OpenCLModuleNode::SetPreCompiledPrograms(const std::string& bytes) { const unsigned char* programBinary = bin_vector.data(); cl_device_id dev = workspace_->GetCLDeviceID(device_id); - auto platform = workspace_->device_to_platform[dev]; + auto platform = workspace_->device_info[dev].platform_id; programs_[name][device_id] = clCreateProgramWithBinary(workspace_->contexts[platform], 1, &dev, &binarySize, &programBinary, &binaryStatus, &err); diff --git a/src/runtime/opencl/opencl_module_spirv.cc b/src/runtime/opencl/opencl_module_spirv.cc index 7e52b7057bc7..28e02a4e3749 100644 --- a/src/runtime/opencl/opencl_module_spirv.cc +++ b/src/runtime/opencl/opencl_module_spirv.cc @@ -96,7 +96,7 @@ cl_kernel OpenCLSPIRVModuleNode::InstallKernel(cl::OpenCLWorkspace* w, cl::OpenC size_t len = it->second.data.size() * sizeof(uint32_t); cl_int err; cl_device_id dev = w->devices[device_id]; - auto platform = w->device_to_platform[dev]; + auto platform = w->device_info[dev].platform_id; programs_[func_name][device_id] = clCreateProgramWithBinary(w->contexts[platform], 1, &dev, &len, &s, nullptr, &err); OPENCL_CHECK_ERROR(err); diff --git a/src/runtime/opencl/texture_pool.cc b/src/runtime/opencl/texture_pool.cc deleted file mode 100644 index 0b9477f2d4ea..000000000000 --- a/src/runtime/opencl/texture_pool.cc +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one 
- * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file texture_pool.h - * \brief Texture pool utility. - */ -#include -#include - -#include "../texture.h" - -namespace tvm { -namespace runtime { - -void* Pool2D::Alloc(Device dev, DeviceAPI* device, size_t width, size_t height, - DLDataType type_hint) { - Entry e; - Entry new_mem; - // Processed several experiments and found that when we are trying to fit - // small texture to too big texture then it may lead to the performance - // degradation. - // Coefficient at 5 looks like robust variant for reusing textures. - const int64_t max_ratio = 5; - e.data = nullptr; - std::vector::iterator best_mem; - if (free_list_.size() != 0) { - int64_t min_added_size_x = std::numeric_limits::max(); - int64_t min_added_size_y = std::numeric_limits::max(); - int64_t min_wasted_size_x = std::numeric_limits::max(); - int64_t min_wasted_size_y = std::numeric_limits::max(); - for (auto it = free_list_.begin(); it != free_list_.end(); ++it) { - if (it->type.code != type_hint.code) { - continue; - } - // avoid reusing too small and too big textures - if (width / it->x > max_ratio || it->x / width > max_ratio || height / it->y > max_ratio || - it->y / height > max_ratio) { - continue; - } - int64_t new_width = std::max(it->x, width); - int64_t new_height = std::max(it->y, height); - int64_t added_size_x = new_width - it->x; - int64_t added_size_y = new_height - it->y; - int64_t wasted_size_x = new_width - width; - int64_t wasted_size_y = new_height - height; - // Minimize added size first and wasted size thereafter - if ((min_added_size_x > 0 && added_size_x < min_added_size_x) || - (min_added_size_y > 0 && added_size_y < min_added_size_y) || - (min_added_size_x == added_size_x && wasted_size_x < min_wasted_size_x) || - (min_added_size_y == added_size_y && wasted_size_y < min_wasted_size_y)) { - min_added_size_x = added_size_x; - min_added_size_y = added_size_y; - min_wasted_size_x = wasted_size_x; - min_wasted_size_y = wasted_size_y; - best_mem = it; - new_mem.x = new_width; - new_mem.y = new_height; - } - } - - if (min_added_size_x == 0 && min_added_size_y == 0) { - // use existing block - e = *best_mem; - free_list_.erase(best_mem); - } else if (static_cast(min_added_size_x) <= width || - static_cast(min_added_size_y) <= height) { - // if added size is less or equal to - // what is needed by alloc, then grow entry - device->FreeDataSpace(dev, best_mem->data); - free_list_.erase(best_mem); - new_mem.type = type_hint; - std::vector shape{int64_t(new_mem.y), int64_t(new_mem.x), 4}; - new_mem.data = device->AllocDataSpace(dev, shape.size(), shape.data(), new_mem.type, - Optional("global.texture")); - e = new_mem; - } - } - - if (e.data == nullptr) { - // create new block - std::vector shape{int64_t(height), int64_t(width), 4}; - e.data = 
device->AllocDataSpace(dev, shape.size(), shape.data(), type_hint, - Optional("global.texture")); - e.x = width; - e.y = height; - e.type = type_hint; - } - - allocated_.push_back(e); - return e.data; -} - -void Pool2D::Free(void* data) { - Entry e; - if (allocated_.back().data == data) { - // quick path, last allocated. - e = allocated_.back(); - allocated_.pop_back(); - } else { - int index = static_cast(allocated_.size()) - 2; - for (; index >= 0 && allocated_[index].data != data; --index) { - } - ICHECK_GE(index, 0) << "Attempt to free texture that has not been allocated"; - e = allocated_[index]; - allocated_.erase(allocated_.begin() + index); - } - free_list_.push_back(e); -} - -// Release all resources immediately -void Pool2D::Release(Device dev, DeviceAPI* device) { - for (auto& e : allocated_) { - device->FreeDataSpace(dev, e.data); - } - for (auto& e : free_list_) { - device->FreeDataSpace(dev, e.data); - } - allocated_.clear(); - free_list_.clear(); -} - -TexturePool::TexturePool(DLDeviceType device_type, DeviceAPI* device) - : device_type_(device_type), device_(device) {} - -TexturePool::~TexturePool() { - for (size_t i = 0; i < array_.size(); ++i) { - if (array_[i] != nullptr) { - Device dev; - dev.device_type = device_type_; - dev.device_id = static_cast(i); - array_[i]->Release(dev, device_); - delete array_[i]; - } - } -} - -void* TexturePool::AllocTexture(Device dev, size_t width, size_t height, DLDataType type_hint) { - if (static_cast(dev.device_id) >= array_.size()) { - array_.resize(dev.device_id + 1, nullptr); - } - if (array_[dev.device_id] == nullptr) { - array_[dev.device_id] = new Pool2D(); - } - return array_[dev.device_id]->Alloc(dev, device_, width, height, type_hint); -} - -void TexturePool::FreeTexture(Device dev, void* ptr) { - ICHECK(static_cast(dev.device_id) < array_.size() && array_[dev.device_id] != nullptr) - << "Attempt to free texture from null texture pool"; - array_[dev.device_id]->Free(ptr); -} - -} // namespace runtime -} // namespace tvm diff --git a/src/runtime/rocm/rocm_device_api.cc b/src/runtime/rocm/rocm_device_api.cc index ebfd312595a3..e7f103daadc9 100644 --- a/src/runtime/rocm/rocm_device_api.cc +++ b/src/runtime/rocm/rocm_device_api.cc @@ -136,7 +136,8 @@ class ROCMDeviceAPI final : public DeviceAPI { *rv = total_global_memory; return; } - + case kImagePitchAlignment: + return; case kAvailableGlobalMemory: // Not currently implemented. *rv = nullptr; diff --git a/src/runtime/texture.h b/src/runtime/texture.h index dc38101f0cd4..f3a827aa8792 100644 --- a/src/runtime/texture.h +++ b/src/runtime/texture.h @@ -30,6 +30,8 @@ #include #include +#define ALIGN_UP(num, align) (((num) + ((align)-1)) & ~((align)-1)) + namespace tvm { namespace runtime { @@ -94,74 +96,26 @@ inline bool IsTextureStorage(std::string scope) { return scope.find("texture") != std::string::npos; } -class TVM_DLL Pool2D { - public: - Pool2D() = default; - void* Alloc(Device dev, DeviceAPI* device, size_t width, size_t height, DLDataType type_hint); - void Free(void* data); - // Release all resources immediately - void Release(Device dev, DeviceAPI* device); - - protected: - struct Entry { - void* data; - size_t x; - size_t y; - DLDataType type; - }; - std::vector free_list_; - std::vector allocated_; -}; - /*! - * \brief A two dimensional storage pool that recycles temporal workspace - * allocations for dynamically allocated texture. See AllocTexture docstring - * for approach to allocation and reuse. 
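For reference, the replacement helper below reduces the old 2D pool bookkeeping to a single byte-size computation. A minimal standalone sketch of that math, with numbers taken from the token-allocator tests later in this series: a {1, 22, 20, 20, 4} FP32 tensor in "global.texture-nhwc" scope flattens to a 400x22 image of 16-byte RGBA/FP32 texels, and the 64-byte pitch alignment is an assumed device value.

#include <cstddef>

// Standalone sketch of the row-pitch math in GetTextureMemorySize.
static size_t AlignUp(size_t num, size_t align) {
  return (num + align - 1) & ~(align - 1);
}

int main() {
  const size_t width = 400;          // flattened image width in texels
  const size_t height = 22;          // flattened image height
  const size_t texel_bytes = 4 * 4;  // 4 lanes x 4 bytes each (RGBA, FP32)
  const size_t pitch_align = 64;     // assumed hardware pitch alignment
  const size_t row_pitch = AlignUp(width * texel_bytes, pitch_align);  // 6400
  return row_pitch * height == 140800 ? 0 : 1;  // the GetSize2D value in the tests
}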
+ * \brief Returns the physical backing memory size required for given specification + * \param shape shape of tensor + * \param bits dtype bits + * \param lanes vectorization lanes + * \param mem_scope the memory scope info + * \param image_row_align image rowwise alignment size + * \return returns the backing memory size */ -class TVM_DLL TexturePool { - public: - /*! - * \brief Create pool with specific device type and device. - * \param device_type The device type. - * \param device_api The device API. - */ - TexturePool(DLDeviceType device_type, DeviceAPI* device_api); - /*! \brief destructor */ - ~TexturePool(); - - /*! - * \brief Allocate a two dimensional temporal texture workspace on device - * - * \note Two dimensional texture workspaces will be grown and reused - * according to the following strategy: - * - Choose the workspace which minimizes the amount of memory required to - * grow the workspace to fit the request. - * - If a set of workspaces exist that fit the current request without - * expansion, choose the workspace of that set which most closely - * matches the request size, minimizing wasted space. - * - * \param dev The context of allocation. - * \param width The width of the 2d texture to be allocated. - * \param height The height of the 2d texture to be allocated. - * \param type_hint The type of elements. - */ - void* AllocTexture(Device dev, size_t width, size_t height, DLDataType type_hint); - /*! - * \brief Free temporal texture in backend execution. - * - * \param dev The context of allocation. - * \param ptr The pointer to be freed. - */ - void FreeTexture(Device dev, void* ptr); +template +size_t GetTextureMemorySize(T shape, int bits, int lanes, std::string mem_scope, + int image_row_align) { + size_t axis = DefaultTextureLayoutSeparator(shape.size(), mem_scope); + auto tshape = ApplyTexture2DFlattening(shape, shape.size(), axis); - private: - /*! \brief pool of device local array */ - std::vector array_; - /*! \brief device type this pool support */ - DLDeviceType device_type_; - /*! \brief The device API */ - DeviceAPI* device_; -}; + auto pack_size = shape[shape.size() - 1]; + auto pixel_size = (bits * lanes + 7) / 8; + size_t row_pitch = ALIGN_UP(tshape.width * pixel_size * pack_size, image_row_align); + return row_pitch * tshape.height; +} } // namespace runtime } // namespace tvm diff --git a/src/runtime/vulkan/vulkan_device_api.cc b/src/runtime/vulkan/vulkan_device_api.cc index 483668a2a75f..af7b35e85ec5 100644 --- a/src/runtime/vulkan/vulkan_device_api.cc +++ b/src/runtime/vulkan/vulkan_device_api.cc @@ -168,11 +168,12 @@ void VulkanDeviceAPI::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) *rv = device(index).compute_memory_size; return; } - case kAvailableGlobalMemory: // Not currently implemented. Will only be implementable for // devices that support the VK_EXT_memory_budget extension. 
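The kImagePitchAlignment cases added to the ROCm and Vulkan backends here deliberately leave the return value unset, so callers need a fallback. An illustrative sketch (not part of this patch) of how a consumer might read the attribute; the 64-byte default is an assumption mirroring the OpenCL workspace's kAllocAlignment fallback earlier in this patch:

#include <tvm/runtime/device_api.h>
#include <tvm/runtime/packed_func.h>
using namespace tvm::runtime;

// Hypothetical helper: query the pitch alignment, defaulting when the
// backend answers the attribute as a no-op.
int ImagePitchAlignOrDefault(Device dev) {
  TVMRetValue rv;
  DeviceAPI::Get(dev)->GetAttr(dev, kImagePitchAlignment, &rv);
  if (rv.type_code() == kTVMNullptr) return 64;  // assumed default
  int align = rv;
  return align > 0 ? align : 64;
}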
break; + case kImagePitchAlignment: + return; } } diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 5933c9582cec..b447c0729746 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -674,5 +674,19 @@ runtime::Module BuildOpenCL(IRModule mod, Target target) { } TVM_REGISTER_GLOBAL("target.build.opencl").set_body_typed(BuildOpenCL); + +String DeviceScopeCompatibilityFromTarget(Target target, String memory_scope) { + auto prototype_keys = target->GetKeys(); + bool is_adreno = + std::find(prototype_keys.begin(), prototype_keys.end(), "adreno") != prototype_keys.end(); + if (is_adreno) { + return String("global"); + } + return memory_scope; +} + +TVM_REGISTER_GLOBAL("DeviceScopeCompatibility.opencl") + .set_body_typed(DeviceScopeCompatibilityFromTarget); + } // namespace codegen } // namespace tvm diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc index e0a0ad23a1b6..e12c18e5ac73 100644 --- a/src/target/target_kind.cc +++ b/src/target/target_kind.cc @@ -366,6 +366,7 @@ TVM_REGISTER_TARGET_KIND("opencl", kDLOpenCL) // specify any limitations on the number of kernel arguments. max_function_args // equals to 128 looks like a reasonable number of kernel arguments. .add_attr_option("max_function_args", runtime::Int(128)) + .add_attr_option("image_base_address_alignment", runtime::Int(64)) .set_default_keys({"opencl", "gpu"}); // The metal has some limitations on the number of input parameters. This is why attribute diff --git a/tests/cpp-runtime/opencl/opencl_texture_pool_test.cc b/tests/cpp-runtime/opencl/opencl_texture_pool_test.cc deleted file mode 100644 index 2d3f43ddce6d..000000000000 --- a/tests/cpp-runtime/opencl/opencl_texture_pool_test.cc +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include -#include - -#include "../src/runtime/opencl/opencl_common.h" -#include "../src/runtime/texture.h" - -using namespace tvm::runtime; -using namespace tvm::runtime::cl; - -// PoolWrapper is necessary because in class Pool2D we don't have an access to -// its protected members. 
In this class we add new methods which allow us to -// get and check internal state of class Pool -class PoolWrapper : public Pool2D { - public: - inline size_t FreeListSize() const { return free_list_.size(); } - inline size_t AllocatedListSize() const { return allocated_.size(); } - inline std::pair FreeListItemSize(size_t idx) const { - return std::make_pair(free_list_[idx].x, free_list_[idx].y); - } - inline std::pair AllocatedListItemSize(size_t idx) const { - return std::make_pair(allocated_[idx].x, allocated_[idx].y); - } -}; - -TEST(OpenCLTexturePool, textures_reallocation_optimal_size) { - OpenCLWorkspace* workspace = OpenCLWorkspace::Global(); - OpenCLThreadEntry* t = workspace->GetThreadEntry(); - PoolWrapper pool; - EXPECT_EQ(pool.AllocatedListSize(), 0); - EXPECT_EQ(pool.FreeListSize(), 0); - - DLDataType type{kDLFloat, 16, 1}; - void* data1 = pool.Alloc(t->device, workspace, 1024, 768, type); - EXPECT_EQ(pool.AllocatedListSize(), 1); - EXPECT_EQ(pool.FreeListSize(), 0); - auto item = pool.AllocatedListItemSize(0); - EXPECT_EQ(item.first, 1024); - EXPECT_EQ(item.second, 768); - - pool.Alloc(t->device, workspace, 64, 12455, type); - EXPECT_EQ(pool.AllocatedListSize(), 2); - EXPECT_EQ(pool.FreeListSize(), 0); - item = pool.AllocatedListItemSize(1); - EXPECT_EQ(item.first, 64); - EXPECT_EQ(item.second, 12455); - - pool.Free(data1); - EXPECT_EQ(pool.AllocatedListSize(), 1); - EXPECT_EQ(pool.FreeListSize(), 1); - item = pool.AllocatedListItemSize(0); - EXPECT_EQ(item.first, 64); - EXPECT_EQ(item.second, 12455); - item = pool.FreeListItemSize(0); - EXPECT_EQ(item.first, 1024); - EXPECT_EQ(item.second, 768); - - pool.Alloc(t->device, workspace, 768, 1024, type); - EXPECT_EQ(pool.AllocatedListSize(), 2); - EXPECT_EQ(pool.FreeListSize(), 0); - item = pool.AllocatedListItemSize(0); - EXPECT_EQ(item.first, 64); - EXPECT_EQ(item.second, 12455); - item = pool.AllocatedListItemSize(1); - EXPECT_EQ(item.first, 1024); - EXPECT_EQ(item.second, 1024); -} - -TEST(OpenCLTexturePool, avoid_reusing_too_big_textures) { - OpenCLWorkspace* workspace = OpenCLWorkspace::Global(); - OpenCLThreadEntry* t = workspace->GetThreadEntry(); - PoolWrapper pool; - EXPECT_EQ(pool.AllocatedListSize(), 0); - EXPECT_EQ(pool.FreeListSize(), 0); - - DLDataType type{kDLFloat, 16, 1}; - void* data1 = pool.Alloc(t->device, workspace, 12455, 64, type); - EXPECT_EQ(pool.AllocatedListSize(), 1); - EXPECT_EQ(pool.FreeListSize(), 0); - auto item = pool.AllocatedListItemSize(0); - EXPECT_EQ(item.first, 12455); - EXPECT_EQ(item.second, 64); - - pool.Free(data1); - EXPECT_EQ(pool.AllocatedListSize(), 0); - EXPECT_EQ(pool.FreeListSize(), 1); - item = pool.FreeListItemSize(0); - EXPECT_EQ(item.first, 12455); - EXPECT_EQ(item.second, 64); - - pool.Alloc(t->device, workspace, 1024, 768, type); - EXPECT_EQ(pool.AllocatedListSize(), 1); - EXPECT_EQ(pool.FreeListSize(), 1); - item = pool.FreeListItemSize(0); - EXPECT_EQ(item.first, 12455); - EXPECT_EQ(item.second, 64); - item = pool.AllocatedListItemSize(0); - EXPECT_EQ(item.first, 1024); - EXPECT_EQ(item.second, 768); -} - -TEST(OpenCLTexturePool, avoid_reusing_too_small_textures) { - OpenCLWorkspace* workspace = OpenCLWorkspace::Global(); - OpenCLThreadEntry* t = workspace->GetThreadEntry(); - PoolWrapper pool; - EXPECT_EQ(pool.AllocatedListSize(), 0); - EXPECT_EQ(pool.FreeListSize(), 0); - - DLDataType type{kDLFloat, 16, 1}; - void* data1 = pool.Alloc(t->device, workspace, 1024, 64, type); - EXPECT_EQ(pool.AllocatedListSize(), 1); - EXPECT_EQ(pool.FreeListSize(), 0); - auto item = 
pool.AllocatedListItemSize(0); - EXPECT_EQ(item.first, 1024); - EXPECT_EQ(item.second, 64); - - pool.Free(data1); - EXPECT_EQ(pool.AllocatedListSize(), 0); - EXPECT_EQ(pool.FreeListSize(), 1); - item = pool.FreeListItemSize(0); - EXPECT_EQ(item.first, 1024); - EXPECT_EQ(item.second, 64); - - pool.Alloc(t->device, workspace, 12544, 64, type); - EXPECT_EQ(pool.AllocatedListSize(), 1); - EXPECT_EQ(pool.FreeListSize(), 1); - item = pool.FreeListItemSize(0); - EXPECT_EQ(item.first, 1024); - EXPECT_EQ(item.second, 64); - item = pool.AllocatedListItemSize(0); - EXPECT_EQ(item.first, 12544); - EXPECT_EQ(item.second, 64); -} diff --git a/tests/cpp-runtime/opencl/opencl_timer_test.cc b/tests/cpp-runtime/opencl/opencl_timer_test.cc index 1753300d3a09..ec038be5406c 100644 --- a/tests/cpp-runtime/opencl/opencl_timer_test.cc +++ b/tests/cpp-runtime/opencl/opencl_timer_test.cc @@ -37,7 +37,7 @@ TEST(OpenCLTimerNode, nested_timers) { int64_t nested_time_sum = 0; auto did = workspace->GetCLDeviceID(thr->device.device_id); - auto platform = workspace->device_to_platform[did]; + auto platform = workspace->device_info[did].platform_id; Timer init_timer = Timer::Start(thr->device); for (int i = 0; i < NUM_REPEAT; ++i) { Timer nested_timer = Timer::Start(thr->device); diff --git a/tests/cpp-runtime/opencl/texture_copy_test.cc b/tests/cpp-runtime/opencl/texture_copy_test.cc new file mode 100644 index 000000000000..0e7d4b4862f4 --- /dev/null +++ b/tests/cpp-runtime/opencl/texture_copy_test.cc @@ -0,0 +1,295 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include +#include +#include + +#include +#include + +#include "../src/runtime/opencl/opencl_common.h" + +class TextureCopyTest : public ::testing::Test { + protected: + void SetUp() override { + bool enabled = tvm::runtime::RuntimeEnabled("opencl"); + if (!enabled) { + GTEST_SKIP() << "Skip texture copy test because opencl runtime is disabled.\n"; + } + // Check hardware support + tvm::runtime::cl::OpenCLWorkspace* workspace = tvm::runtime::cl::OpenCLWorkspace::Global(); + tvm::runtime::cl::OpenCLThreadEntry* thr = workspace->GetThreadEntry(); + if (!workspace->IsBufferToImageSupported(thr->device.device_id)) { + GTEST_SKIP() << "Skip test case as BufferToImage is not supported \n"; + } + (void)tvm::runtime::memory::MemoryManager::GetOrCreateAllocator( + thr->device, tvm::runtime::memory::AllocatorType::kNaive); + } +}; + +TEST(TextureCopy, HostDeviceRT) { + using namespace tvm; + bool enabled = tvm::runtime::RuntimeEnabled("opencl"); + if (!enabled) { + GTEST_SKIP() << "Skip texture copy test because opencl runtime is disabled.\n"; + } + tvm::runtime::cl::OpenCLWorkspace* workspace = tvm::runtime::cl::OpenCLWorkspace::Global(); + tvm::runtime::cl::OpenCLThreadEntry* thr = workspace->GetThreadEntry(); + (void)tvm::runtime::memory::MemoryManager::GetOrCreateAllocator( + thr->device, tvm::runtime::memory::AllocatorType::kNaive); + std::vector shape{16, 16, 4}; + auto cpu_arr0 = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + auto cpu_arr1 = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + String mem_scope = "global.texture"; + auto opencl_txarr0 = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLOpenCL, 0}, mem_scope); + + size_t size = 1; + for (size_t i = 0; i < shape.size(); ++i) { + size *= static_cast(shape[i]); + } + + std::random_device dev; + std::mt19937 mt(dev()); + std::uniform_real_distribution<> random(-10.0, 10.0); + + // Random initialize host ndarray + for (size_t i = 0; i < size; i++) { + static_cast(cpu_arr0->data)[i] = random(mt); + } + + // Do a roundtrip from host storage to opencl texture storage and back + cpu_arr0.CopyTo(opencl_txarr0); + opencl_txarr0.CopyTo(cpu_arr1); + for (size_t i = 0; i < size; ++i) { + ICHECK_LT( + std::fabs(static_cast(cpu_arr1->data)[i] - static_cast(cpu_arr0->data)[i]), + 1e-5); + } +} + +TEST_F(TextureCopyTest, ViewBufferAsBuffer) { + using namespace tvm; + std::vector shape{1, 16, 16, 8}; + std::vector same_shape{1, 8, 16, 16}; + auto cpu_arr = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + auto cpu_arr_ret = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + + String mem_scope = "global"; + auto opencl_memobj = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLOpenCL, 0}, mem_scope); + auto opencl_memview = opencl_memobj.CreateView(same_shape, {kDLFloat, 32, 1}); + + std::random_device dev; + std::mt19937 mt(dev()); + std::uniform_real_distribution<> random(-10.0, 10.0); + + size_t size = 1; + for (size_t i = 0; i < shape.size(); ++i) { + size *= static_cast(shape[i]); + } + + /* Check original object round trip */ + // Random initialize host pool storage + for (size_t i = 0; i < size; i++) { + static_cast(cpu_arr->data)[i] = random(mt); + } + // Copy to OpenCLBuffer + cpu_arr.CopyTo(opencl_memobj); + // Copy from OpenCLBuffer + opencl_memobj.CopyTo(cpu_arr_ret); + for (size_t i = 0; i < size; i++) { + ICHECK_LT(std::fabs(static_cast(cpu_arr->data)[i] - + static_cast(cpu_arr_ret->data)[i]), + 1e-5); + } + + /* Check view object round trip */ + // Random 
initialize host pool storage + for (size_t i = 0; i < size; i++) { + static_cast(cpu_arr->data)[i] = random(mt); + } + // Copy to OpenCLBuffer + cpu_arr.CopyTo(opencl_memview); + // Copy from OpenCLBuffer + opencl_memview.CopyTo(cpu_arr_ret); + for (size_t i = 0; i < size; i++) { + ICHECK_LT(std::fabs(static_cast(cpu_arr->data)[i] - + static_cast(cpu_arr_ret->data)[i]), + 1e-5); + } +} + +TEST_F(TextureCopyTest, ViewBufferAsImage) { + using namespace tvm; + // Shape that doesn't cause padding for image row + std::vector shape{1, 16, 16, 8, 4}; + std::vector same_shape{1, 8, 16, 16, 4}; + auto cpu_arr = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + auto cpu_arr_ret = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + + auto opencl_buf_obj = + runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLOpenCL, 0}, String("global")); + auto opencl_img_obj = + opencl_buf_obj.CreateView(same_shape, {kDLFloat, 32, 1}, 0, String("global.texture")); + + std::random_device dev; + std::mt19937 mt(dev()); + std::uniform_real_distribution<> random(-10.0, 10.0); + + size_t size = 1; + for (size_t i = 0; i < shape.size(); ++i) { + size *= static_cast(shape[i]); + } + + /* Check original object round trip */ + // Random initialize host pool storage + for (size_t i = 0; i < size; i++) { + static_cast(cpu_arr->data)[i] = random(mt); + } + // Copy to OpenCLBuffer + cpu_arr.CopyTo(opencl_buf_obj); + // Copy from OpenCLBuffer + opencl_buf_obj.CopyTo(cpu_arr_ret); + for (size_t i = 0; i < size; i++) { + ICHECK_LT(std::fabs(static_cast(cpu_arr->data)[i] - + static_cast(cpu_arr_ret->data)[i]), + 1e-5); + } + + /* Check view object round trip */ + // Random initialize host pool storage + for (size_t i = 0; i < size; i++) { + static_cast(cpu_arr->data)[i] = random(mt); + } + // Copy to OpenCLBuffer + cpu_arr.CopyTo(opencl_img_obj); + // Copy from OpenCLBuffer + opencl_img_obj.CopyTo(cpu_arr_ret); + for (size_t i = 0; i < size; i++) { + ICHECK_LT(std::fabs(static_cast(cpu_arr->data)[i] - + static_cast(cpu_arr_ret->data)[i]), + 1e-5); + } +} + +TEST_F(TextureCopyTest, ViewImageAsBuffer) { + using namespace tvm; + // Shape that doesn't cause padding for image row + std::vector shape{1, 16, 16, 8, 4}; + std::vector same_shape{1, 8, 16, 16, 4}; + auto cpu_arr = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + auto cpu_arr_ret = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + + auto opencl_img_obj = + runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLOpenCL, 0}, String("global.texture")); + auto opencl_buf_obj = + opencl_img_obj.CreateView(same_shape, {kDLFloat, 32, 1}, 0, String("global")); + + std::random_device dev; + std::mt19937 mt(dev()); + std::uniform_real_distribution<> random(-10.0, 10.0); + + size_t size = 1; + for (size_t i = 0; i < shape.size(); ++i) { + size *= static_cast(shape[i]); + } + + /* Check original object round trip */ + // Random initialize host pool storage + for (size_t i = 0; i < size; i++) { + static_cast(cpu_arr->data)[i] = random(mt); + } + // Copy to OpenCLBuffer + cpu_arr.CopyTo(opencl_buf_obj); + // Copy from OpenCLBuffer + opencl_buf_obj.CopyTo(cpu_arr_ret); + for (size_t i = 0; i < size; i++) { + ICHECK_LT(std::fabs(static_cast(cpu_arr->data)[i] - + static_cast(cpu_arr_ret->data)[i]), + 1e-5); + } + + /* Check view object round trip */ + // Random initialize host pool storage + for (size_t i = 0; i < size; i++) { + static_cast(cpu_arr->data)[i] = random(mt); + } + // Copy to OpenCLBuffer + 
cpu_arr.CopyTo(opencl_img_obj); + // Copy from OpenCLBuffer + opencl_img_obj.CopyTo(cpu_arr_ret); + for (size_t i = 0; i < size; i++) { + ICHECK_LT(std::fabs(static_cast(cpu_arr->data)[i] - + static_cast(cpu_arr_ret->data)[i]), + 1e-5); + } +} + +TEST_F(TextureCopyTest, ViewImageAsImage) { + using namespace tvm; + // Shape that doesn't cause padding for image row + std::vector shape{1, 16, 16, 8, 4}; + std::vector same_shape{1, 8, 16, 16, 4}; + auto cpu_arr = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + auto cpu_arr_ret = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + + auto opencl_img_obj_1 = + runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLOpenCL, 0}, String("global.texture")); + auto opencl_img_obj_2 = + opencl_img_obj_1.CreateView(same_shape, {kDLFloat, 32, 1}, 0, String("global.texture")); + + std::random_device dev; + std::mt19937 mt(dev()); + std::uniform_real_distribution<> random(-10.0, 10.0); + + size_t size = 1; + for (size_t i = 0; i < shape.size(); ++i) { + size *= static_cast(shape[i]); + } + + /* Check original object round trip */ + // Random initialize host pool storage + for (size_t i = 0; i < size; i++) { + static_cast(cpu_arr->data)[i] = random(mt); + } + // Copy to OpenCLBuffer + cpu_arr.CopyTo(opencl_img_obj_1); + // Copy from OpenCLBuffer + opencl_img_obj_1.CopyTo(cpu_arr_ret); + for (size_t i = 0; i < size; i++) { + ICHECK_LT(std::fabs(static_cast(cpu_arr->data)[i] - + static_cast(cpu_arr_ret->data)[i]), + 1e-5); + } + + /* Check view object round trip */ + // Random initialize host pool storage + for (size_t i = 0; i < size; i++) { + static_cast(cpu_arr->data)[i] = random(mt); + } + // Copy to OpenCLBuffer + cpu_arr.CopyTo(opencl_img_obj_2); + // Copy from OpenCLBuffer + opencl_img_obj_2.CopyTo(cpu_arr_ret); + for (size_t i = 0; i < size; i++) { + ICHECK_LT(std::fabs(static_cast(cpu_arr->data)[i] - + static_cast(cpu_arr_ret->data)[i]), + 1e-5); + } +} diff --git a/tests/cpp/relay/backend/graph_plan_token_alloc.cc b/tests/cpp/relay/backend/graph_plan_token_alloc.cc index 4641da2cb8b5..7fca4b26a985 100644 --- a/tests/cpp/relay/backend/graph_plan_token_alloc.cc +++ b/tests/cpp/relay/backend/graph_plan_token_alloc.cc @@ -24,23 +24,24 @@ namespace tvm { namespace relay { -// TokenAllocator2d is necessary because in class TokenAllocator2D we don't +// TokenAllocatorMixed is necessary because in class TokenAllocatorMixed we don't // have an access to its protected members. 
In this class we add new methods -// which allow us to get and check internal state of class TokenAllocator2D -class TokenAllocator2DWrapper : public TokenAllocator2D { +// which allow us to get and check internal state of class TokenAllocatorMixed +class TokenAllocatorMixedWrapper : public TokenAllocatorMixed { public: - inline size_t FreeListSize() const { return free_list_.size(); } - inline size_t BlockMapSize() const { return blocks_.size(); } + inline size_t FreeListSize() const { return free_.size(); } + inline size_t AllocListSize() const { return data_.size(); } }; -TEST(Token2DAlloc, OneToken) { - TokenAllocator2DWrapper alloc; +TEST(TokenMixedAlloc, TextureOneToken) { + TokenAllocatorMixedWrapper alloc; int storage_ids = 0; - EXPECT_EQ(alloc.BlockMapSize(), 0); + EXPECT_EQ(alloc.AllocListSize(), 0); EXPECT_EQ(alloc.FreeListSize(), 0); TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1)); - VirtualDevice vd1(kDLOpenCL, 0, {}, MemoryScope("global.texture-nhwc")); + VirtualDevice vd1(kDLOpenCL, 0, Target("opencl -device=adreno"), + MemoryScope("global.texture-nhwc")); StorageToken tok1 = { 1, // ref_counter 0, // max bytes @@ -49,29 +50,28 @@ TEST(Token2DAlloc, OneToken) { -1 // storage_id }; auto size2d = alloc.GetSize2D(&tok1); - EXPECT_EQ(size2d.channel, 4); - EXPECT_EQ(size2d.height, 22); - EXPECT_EQ(size2d.width, 400); + EXPECT_EQ(size2d, 140800); EXPECT_EQ(alloc.Request(&tok1), nullptr); alloc.Alloc(&tok1, storage_ids++); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 0); tok1.ref_counter -= 1; alloc.CheckForRelease(&tok1); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 1); } -TEST(Token2DAlloc, EqualSizeTokenReuse) { - TokenAllocator2DWrapper alloc; +TEST(TokenMixedAlloc, TextureEqualSizeTokenReuse) { + TokenAllocatorMixedWrapper alloc; int storage_ids = 0; - EXPECT_EQ(alloc.BlockMapSize(), 0); + EXPECT_EQ(alloc.AllocListSize(), 0); EXPECT_EQ(alloc.FreeListSize(), 0); TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1)); - VirtualDevice vd1(kDLOpenCL, 0, {}, MemoryScope("global.texture-nhwc")); + VirtualDevice vd1(kDLOpenCL, 0, Target("opencl -device=adreno"), + MemoryScope("global.texture-nhwc")); StorageToken tok1 = { 1, // ref_counter 0, // max bytes @@ -80,18 +80,16 @@ TEST(Token2DAlloc, EqualSizeTokenReuse) { -1 // storage_id }; auto size2d = alloc.GetSize2D(&tok1); - EXPECT_EQ(size2d.channel, 4); - EXPECT_EQ(size2d.height, 22); - EXPECT_EQ(size2d.width, 400); + EXPECT_EQ(size2d, 140800); EXPECT_EQ(alloc.Request(&tok1), nullptr); alloc.Alloc(&tok1, storage_ids++); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 0); tok1.ref_counter -= 1; alloc.CheckForRelease(&tok1); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 1); StorageToken tok2 = { @@ -103,24 +101,51 @@ TEST(Token2DAlloc, EqualSizeTokenReuse) { }; auto req = alloc.Request(&tok2); EXPECT_NE(req, nullptr); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 0); EXPECT_EQ(req->storage_id, storage_ids - 1); EXPECT_EQ(req->ref_counter, 1); auto sizeReq = alloc.GetSize2D(req); - EXPECT_EQ(sizeReq.channel, 4); - EXPECT_EQ(sizeReq.height, 22); - EXPECT_EQ(sizeReq.width, 400); + EXPECT_EQ(sizeReq, 140800); + + req->ref_counter -= 1; + alloc.CheckForRelease(req); + EXPECT_EQ(alloc.AllocListSize(), 1); + 
EXPECT_EQ(alloc.FreeListSize(), 1); + + // Try reuse of the texture memory for buffer object + VirtualDevice vd2(kDLOpenCL, 0, Target("opencl -device=adreno"), MemoryScope("global")); + StorageToken tok3 = { + 1, // ref_counter + 0, // max bytes + tt1, // tensor type + vd2, // virtual device + -1 // storage_id + }; + auto req1 = alloc.Request(&tok3); + EXPECT_NE(req1, nullptr); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 0); + EXPECT_EQ(req1->storage_id, storage_ids - 1); + EXPECT_EQ(req1->ref_counter, 1); + sizeReq = alloc.GetSize2D(req1); + EXPECT_EQ(sizeReq, 140800); + + req1->ref_counter -= 1; + alloc.CheckForRelease(req1); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 1); } -TEST(Token2DAlloc, EqualSizeDiffTypes) { - TokenAllocator2DWrapper alloc; +TEST(TokenMixedAlloc, TextureEqualSizeDiffTypes) { + TokenAllocatorMixedWrapper alloc; int storage_ids = 0; - EXPECT_EQ(alloc.BlockMapSize(), 0); + EXPECT_EQ(alloc.AllocListSize(), 0); EXPECT_EQ(alloc.FreeListSize(), 0); TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1)); - VirtualDevice vd1(kDLOpenCL, 0, {}, MemoryScope("global.texture-nhwc")); + VirtualDevice vd1(kDLOpenCL, 0, Target("opencl -device=adreno"), + MemoryScope("global.texture-nhwc")); StorageToken tok1 = { 1, // ref_counter 0, // max bytes @@ -128,19 +153,17 @@ TEST(Token2DAlloc, EqualSizeDiffTypes) { vd1, // virtual device -1 // storage_id }; - auto size2d = alloc.GetSize2D(&tok1); - EXPECT_EQ(size2d.channel, 4); - EXPECT_EQ(size2d.height, 22); - EXPECT_EQ(size2d.width, 400); + auto sizeReq = alloc.GetSize2D(&tok1); + EXPECT_EQ(sizeReq, 140800); EXPECT_EQ(alloc.Request(&tok1), nullptr); alloc.Alloc(&tok1, storage_ids++); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 0); tok1.ref_counter -= 1; alloc.CheckForRelease(&tok1); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 1); TensorType tt2({1, 22, 20, 20, 4}, DataType(kDLFloat, 16, 1)); @@ -151,28 +174,27 @@ TEST(Token2DAlloc, EqualSizeDiffTypes) { vd1, // virtual device -1 // storage_id }; - EXPECT_EQ(alloc.Request(&tok2), nullptr); - EXPECT_EQ(alloc.BlockMapSize(), 1); - EXPECT_EQ(alloc.FreeListSize(), 1); - alloc.Alloc(&tok2, storage_ids++); - EXPECT_EQ(alloc.BlockMapSize(), 2); - EXPECT_EQ(alloc.FreeListSize(), 1); + auto req1 = alloc.Request(&tok2); + EXPECT_NE(req1, nullptr); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 0); - tok2.ref_counter -= 1; - alloc.CheckForRelease(&tok2); - EXPECT_EQ(alloc.BlockMapSize(), 2); - EXPECT_EQ(alloc.FreeListSize(), 2); + req1->ref_counter -= 1; + alloc.CheckForRelease(req1); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 1); } -TEST(Token2DAlloc, DifferentSizesTokenReuse) { - TokenAllocator2DWrapper alloc; +TEST(TokenMixedAlloc, TextureDifferentSizesTokenReuse) { + TokenAllocatorMixedWrapper alloc; int storage_ids = 0; - EXPECT_EQ(alloc.BlockMapSize(), 0); + EXPECT_EQ(alloc.AllocListSize(), 0); EXPECT_EQ(alloc.FreeListSize(), 0); TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1)); - VirtualDevice vd1(kDLOpenCL, 0, {}, MemoryScope("global.texture-nhwc")); + VirtualDevice vd1(kDLOpenCL, 0, Target("opencl -device=adreno"), + MemoryScope("global.texture-nhwc")); StorageToken tok1 = { 1, // ref_counter 0, // max bytes @@ -180,19 +202,17 @@ TEST(Token2DAlloc, DifferentSizesTokenReuse) { vd1, // virtual device -1 // storage_id }; - auto 
size2d = alloc.GetSize2D(&tok1); - EXPECT_EQ(size2d.channel, 4); - EXPECT_EQ(size2d.height, 22); - EXPECT_EQ(size2d.width, 400); + auto sizeReq = alloc.GetSize2D(&tok1); + EXPECT_EQ(sizeReq, 140800); EXPECT_EQ(alloc.Request(&tok1), nullptr); alloc.Alloc(&tok1, storage_ids++); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 0); tok1.ref_counter -= 1; alloc.CheckForRelease(&tok1); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 1); TensorType tt2({1, 40, 30, 30, 4}, DataType(kDLFloat, 32, 1)); @@ -205,19 +225,16 @@ TEST(Token2DAlloc, DifferentSizesTokenReuse) { }; auto req = alloc.Request(&tok2); EXPECT_NE(req, nullptr); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 0); EXPECT_EQ(req->storage_id, storage_ids - 1); - EXPECT_EQ(req->ref_counter, 2); - auto sizeReq = alloc.GetSize2D(req); - EXPECT_EQ(sizeReq.channel, 4); - EXPECT_EQ(sizeReq.height, 40); - EXPECT_EQ(sizeReq.width, 900); + EXPECT_EQ(req->ref_counter, 1); + sizeReq = alloc.GetSize2D(req); + EXPECT_EQ(sizeReq, 576000); - tok2.ref_counter -= 1; req->ref_counter -= 1; - alloc.CheckForRelease(&tok1); - EXPECT_EQ(alloc.BlockMapSize(), 1); + alloc.CheckForRelease(req); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 1); TensorType tt3({1, 25, 30, 30, 4}, DataType(kDLFloat, 32, 1)); @@ -230,24 +247,23 @@ TEST(Token2DAlloc, DifferentSizesTokenReuse) { }; auto req2 = alloc.Request(&tok3); EXPECT_NE(req2, nullptr); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 0); EXPECT_EQ(req2->storage_id, storage_ids - 1); EXPECT_EQ(req2->ref_counter, 1); - auto sizeReq2 = alloc.GetSize2D(req2); - EXPECT_EQ(sizeReq2.channel, 4); - EXPECT_EQ(sizeReq2.height, 40); - EXPECT_EQ(sizeReq2.width, 900); + sizeReq = alloc.GetSize2D(req2); + EXPECT_EQ(sizeReq, 576000); } -TEST(Token2DAlloc, DifferentSizesTokenReuse2) { - TokenAllocator2DWrapper alloc; +TEST(TokenMixedAlloc, TextureDifferentSizesTokenReuse2) { + TokenAllocatorMixedWrapper alloc; int storage_ids = 0; - EXPECT_EQ(alloc.BlockMapSize(), 0); + EXPECT_EQ(alloc.AllocListSize(), 0); EXPECT_EQ(alloc.FreeListSize(), 0); TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1)); - VirtualDevice vd1(kDLOpenCL, 0, {}, MemoryScope("global.texture-nhwc")); + VirtualDevice vd1(kDLOpenCL, 0, Target("opencl -device=adreno"), + MemoryScope("global.texture-nhwc")); StorageToken tok1 = { 1, // ref_counter 0, // max bytes @@ -255,19 +271,17 @@ TEST(Token2DAlloc, DifferentSizesTokenReuse2) { vd1, // virtual device -1 // storage_id }; - auto size2d = alloc.GetSize2D(&tok1); - EXPECT_EQ(size2d.channel, 4); - EXPECT_EQ(size2d.height, 22); - EXPECT_EQ(size2d.width, 400); + auto sizeReq = alloc.GetSize2D(&tok1); + EXPECT_EQ(sizeReq, 140800); EXPECT_EQ(alloc.Request(&tok1), nullptr); alloc.Alloc(&tok1, storage_ids++); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 0); tok1.ref_counter -= 1; alloc.CheckForRelease(&tok1); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 1); TensorType tt2({1, 5, 30, 20, 4}, DataType(kDLFloat, 32, 1)); @@ -280,24 +294,23 @@ TEST(Token2DAlloc, DifferentSizesTokenReuse2) { }; auto req = alloc.Request(&tok2); EXPECT_NE(req, nullptr); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); 
EXPECT_EQ(alloc.FreeListSize(), 0); EXPECT_EQ(req->storage_id, storage_ids - 1); - EXPECT_EQ(req->ref_counter, 2); - auto sizeReq = alloc.GetSize2D(req); - EXPECT_EQ(sizeReq.channel, 4); - EXPECT_EQ(sizeReq.height, 5); - EXPECT_EQ(sizeReq.width, 600); + EXPECT_EQ(req->ref_counter, 1); + sizeReq = alloc.GetSize2D(req); + EXPECT_EQ(sizeReq, 140800); } -TEST(Token2DAlloc, SameSizesButDiffMemoryScopes) { - TokenAllocator2DWrapper alloc; +TEST(TokenMixedAlloc, TextureSameSizesButDiffMemoryScopes) { + TokenAllocatorMixedWrapper alloc; int storage_ids = 0; - EXPECT_EQ(alloc.BlockMapSize(), 0); + EXPECT_EQ(alloc.AllocListSize(), 0); EXPECT_EQ(alloc.FreeListSize(), 0); TensorType tt1({28, 676, 1, 1, 4}, DataType(kDLFloat, 32, 1)); - VirtualDevice vd1(kDLOpenCL, 0, {}, MemoryScope("global.texture-weight")); + VirtualDevice vd1(kDLOpenCL, 0, Target("opencl -device=adreno"), + MemoryScope("global.texture-weight")); StorageToken tok1 = { 1, // ref_counter 0, // max bytes @@ -305,23 +318,22 @@ TEST(Token2DAlloc, SameSizesButDiffMemoryScopes) { vd1, // virtual device -1 // storage_id }; - auto size2d = alloc.GetSize2D(&tok1); - EXPECT_EQ(size2d.channel, 4); - EXPECT_EQ(size2d.height, 28); - EXPECT_EQ(size2d.width, 676); + auto sizeReq = alloc.GetSize2D(&tok1); + EXPECT_EQ(sizeReq, 302848); EXPECT_EQ(alloc.Request(&tok1), nullptr); alloc.Alloc(&tok1, storage_ids++); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 0); tok1.ref_counter -= 1; alloc.CheckForRelease(&tok1); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 1); TensorType tt2({1, 28, 26, 26, 4}, DataType(kDLFloat, 32, 1)); - VirtualDevice vd2(kDLOpenCL, 0, {}, MemoryScope("global.texture-nhwc")); + VirtualDevice vd2(kDLOpenCL, 0, Target("opencl -device=adreno"), + MemoryScope("global.texture-nhwc")); StorageToken tok2 = { 1, // ref_counter 0, // max bytes @@ -330,22 +342,199 @@ TEST(Token2DAlloc, SameSizesButDiffMemoryScopes) { -1 // storage_id }; auto tok2Size = alloc.GetSize2D(&tok2); - EXPECT_EQ(tok2Size.channel, 4); - EXPECT_EQ(tok2Size.height, 28); - EXPECT_EQ(tok2Size.width, 676); + EXPECT_EQ(tok2Size, 302848); - EXPECT_EQ(alloc.Request(&tok2), nullptr); - EXPECT_EQ(alloc.BlockMapSize(), 1); + auto req = alloc.Request(&tok2); + EXPECT_NE(req, nullptr); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 0); + + req->ref_counter -= 1; + alloc.CheckForRelease(req); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 1); +} + +TEST(TokenMixedAlloc, OneToken) { + TokenAllocatorMixedWrapper alloc; + int storage_ids = 0; + EXPECT_EQ(alloc.AllocListSize(), 0); + EXPECT_EQ(alloc.FreeListSize(), 0); - alloc.Alloc(&tok2, storage_ids++); - EXPECT_EQ(alloc.BlockMapSize(), 2); + TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1)); + VirtualDevice vd1(kDLOpenCL, 0, Target("opencl")); + StorageToken tok1 = { + 1, // ref_counter + 0, // max bytes + tt1, // tensor type + vd1, // virtual device + -1 // storage_id + }; + EXPECT_EQ(alloc.Request(&tok1), nullptr); + + alloc.Alloc(&tok1, storage_ids++); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 0); + + tok1.ref_counter -= 1; + alloc.CheckForRelease(&tok1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 1); +} + +TEST(TokenMixedAlloc, EqualSizeTokenReuse) { + TokenAllocatorMixedWrapper alloc; + int storage_ids = 0; + EXPECT_EQ(alloc.AllocListSize(), 0); + EXPECT_EQ(alloc.FreeListSize(), 0); 
+ + TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1)); + VirtualDevice vd1(kDLOpenCL, 0, Target("opencl")); + StorageToken tok1 = { + 1, // ref_counter + 0, // max bytes + tt1, // tensor type + vd1, // virtual device + -1 // storage_id + }; + EXPECT_EQ(alloc.Request(&tok1), nullptr); - tok2.ref_counter -= 1; - alloc.CheckForRelease(&tok2); - EXPECT_EQ(alloc.BlockMapSize(), 2); - EXPECT_EQ(alloc.FreeListSize(), 2); + alloc.Alloc(&tok1, storage_ids++); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 0); + + tok1.ref_counter -= 1; + alloc.CheckForRelease(&tok1); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 1); + + StorageToken tok2 = { + 1, // ref_counter + 0, // max bytes + tt1, // tensor type + vd1, // virtual device + -1 // storage_id + }; + auto req = alloc.Request(&tok2); + EXPECT_NE(req, nullptr); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 0); + EXPECT_EQ(req->storage_id, storage_ids - 1); + EXPECT_EQ(req->ref_counter, 1); + + req->ref_counter -= 1; + alloc.CheckForRelease(req); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 1); } + +TEST(TokenMixedAlloc, EqualSizeDiffTypes) { + TokenAllocatorMixedWrapper alloc; + int storage_ids = 0; + EXPECT_EQ(alloc.AllocListSize(), 0); + EXPECT_EQ(alloc.FreeListSize(), 0); + + TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1)); + VirtualDevice vd1(kDLOpenCL, 0, Target("opencl")); + StorageToken tok1 = { + 1, // ref_counter + 0, // max bytes + tt1, // tensor type + vd1, // virtual device + -1 // storage_id + }; + EXPECT_EQ(alloc.Request(&tok1), nullptr); + + alloc.Alloc(&tok1, storage_ids++); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 0); + + tok1.ref_counter -= 1; + alloc.CheckForRelease(&tok1); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 1); + + TensorType tt2({1, 22, 20, 20, 4}, DataType(kDLFloat, 16, 1)); + StorageToken tok2 = { + 1, // ref_counter + 0, // max bytes + tt2, // tensor type + vd1, // virtual device + -1 // storage_id + }; + + auto req1 = alloc.Request(&tok2); + EXPECT_NE(req1, nullptr); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 0); + + req1->ref_counter -= 1; + alloc.CheckForRelease(req1); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 1); +} + +TEST(TokenMixedAlloc, DifferentSizesTokenReuse) { + TokenAllocatorMixedWrapper alloc; + int storage_ids = 0; + EXPECT_EQ(alloc.AllocListSize(), 0); + EXPECT_EQ(alloc.FreeListSize(), 0); + + TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1)); + VirtualDevice vd1(kDLOpenCL, 0, Target("opencl")); + StorageToken tok1 = { + 1, // ref_counter + 0, // max bytes + tt1, // tensor type + vd1, // virtual device + -1 // storage_id + }; + EXPECT_EQ(alloc.Request(&tok1), nullptr); + + alloc.Alloc(&tok1, storage_ids++); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 0); + + tok1.ref_counter -= 1; + alloc.CheckForRelease(&tok1); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 1); + + TensorType tt2({1, 40, 30, 30, 4}, DataType(kDLFloat, 32, 1)); + StorageToken tok2 = { + 1, // ref_counter + 0, // max bytes + tt2, // tensor type + vd1, // virtual device + -1 // storage_id + }; + auto req = alloc.Request(&tok2); + EXPECT_NE(req, nullptr); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 0); + EXPECT_EQ(req->storage_id, storage_ids - 1); + EXPECT_EQ(req->ref_counter, 1); + + 
req->ref_counter -= 1; + alloc.CheckForRelease(req); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 1); + + TensorType tt3({1, 25, 30, 30, 4}, DataType(kDLFloat, 32, 1)); + StorageToken tok3 = { + 1, // ref_counter + 0, // max bytes + tt3, // tensor type + vd1, // virtual device + -1 // storage_id + }; + auto req2 = alloc.Request(&tok3); + EXPECT_NE(req2, nullptr); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 0); + EXPECT_EQ(req2->storage_id, storage_ids - 1); + EXPECT_EQ(req2->ref_counter, 1); +} + } // namespace relay } // namespace tvm diff --git a/tests/cpp/runtime/memory/memory_manager_tests.cc b/tests/cpp/runtime/memory/memory_manager_tests.cc index aea37bf7fbfe..e7579a2cabe9 100644 --- a/tests/cpp/runtime/memory/memory_manager_tests.cc +++ b/tests/cpp/runtime/memory/memory_manager_tests.cc @@ -48,6 +48,26 @@ class TvmVMMemoryManagerTest : public ::testing::Test { } }; +TEST_F(TvmVMMemoryManagerTest, AnyAllocatorNaiveAutoCreate) { + Device dev = {kDLCPU, 0}; + Allocator* allocator = MemoryManagerWrapper::GetOrCreateAllocator(dev, kAny); + EXPECT_EQ(allocator->type(), kNaive); +} + +TEST_F(TvmVMMemoryManagerTest, AnyAllocatorNaiveReuse) { + Device dev = {kDLCPU, 0}; + Allocator* allocator = MemoryManagerWrapper::GetOrCreateAllocator(dev, kNaive); + allocator = MemoryManagerWrapper::GetOrCreateAllocator(dev, kAny); + EXPECT_EQ(allocator->type(), kNaive); +} + +TEST_F(TvmVMMemoryManagerTest, AnyAllocatorPooled) { + Device dev = {kDLCPU, 0}; + Allocator* allocator = MemoryManagerWrapper::GetOrCreateAllocator(dev, kPooled); + allocator = MemoryManagerWrapper::GetOrCreateAllocator(dev, kAny); + EXPECT_EQ(allocator->type(), kPooled); +} + TEST_F(TvmVMMemoryManagerTest, NaiveAllocBasic) { Device dev = {kDLCPU, 0}; Allocator* allocator = MemoryManagerWrapper::GetOrCreateAllocator(dev, kNaive); @@ -85,6 +105,38 @@ TEST_F(TvmVMMemoryManagerTest, NaiveEmptyBasic) { EXPECT_EQ(allocator->UsedMemory(), 0); } +TEST_F(TvmVMMemoryManagerTest, BothAllocatorsCoexists) { + Device dev = {kDLCPU, 0}; + // Initialize and use Naive allocator + Allocator* nallocator = MemoryManagerWrapper::GetOrCreateAllocator(dev, kNaive); + EXPECT_EQ(nallocator->UsedMemory(), 0); + auto dt = DataType::Float(32); + size_t nbytes = 1 * 3 * 6 * 6 * dt.bytes(); + ShapeTuple shape = {1, 3, 6, 6}; + { + auto ndarray = nallocator->Empty(shape, dt, dev); + EXPECT_EQ(nallocator->UsedMemory(), nbytes); + } + EXPECT_EQ(nallocator->UsedMemory(), 0); + auto naive_buff = nallocator->Alloc(dev, shape, dt); + EXPECT_EQ(nallocator->UsedMemory(), nbytes); + + // Initialize and use Pooled allocator + Allocator* pallocator = MemoryManagerWrapper::GetOrCreateAllocator(dev, kPooled); + EXPECT_EQ(pallocator->UsedMemory(), 0); + auto pooled_buff = pallocator->Alloc(dev, shape, dt); + EXPECT_NE(pallocator->UsedMemory(), 0); + + // Operate on Naive allocator + EXPECT_EQ(nallocator->UsedMemory(), nbytes); + nallocator->Free(naive_buff); + EXPECT_EQ(nallocator->UsedMemory(), 0); + + // Operate on Pooled allocator + pallocator->Free(pooled_buff); + EXPECT_NE(pallocator->UsedMemory(), 0); +} + TEST_F(TvmVMMemoryManagerTest, PooledEmptyBasic) { Device dev = {kDLCPU, 0}; Allocator* allocator = MemoryManagerWrapper::GetOrCreateAllocator(dev, kPooled); @@ -144,7 +196,8 @@ TEST_F(TvmVMMemoryManagerTest, PooledAllocWithShape) { (void)texture; FAIL(); } catch (std::exception& e) { - std::string pattern = "This alloc should be implemented"; + std::string pattern = + "Device does not support allocate data 
space with specified memory scope: global.texture"; std::string what = e.what(); EXPECT_NE(what.find(pattern), std::string::npos) << what; } @@ -192,15 +245,8 @@ TEST_F(TvmVMMemoryManagerTest, PooledAllocOpenCLTexture) { allocator->Free(buff); EXPECT_EQ(allocator->UsedMemory(), size); - try { - auto texture = allocator->Alloc(dev, shape, dt, "global.texture"); - (void)texture; - FAIL(); - } catch (std::exception& e) { - std::string pattern = "This alloc should be implemented"; - std::string what = e.what(); - EXPECT_NE(what.find(pattern), std::string::npos) << what; - } + auto texture = allocator->Alloc(dev, shape, dt, "global.texture"); + allocator->Free(texture); } } // namespace memory } // namespace runtime diff --git a/tests/cpp/texture_copy_test.cc b/tests/cpp/texture_copy_test.cc index 92c12bafdd9a..63e2ac1a0af4 100644 --- a/tests/cpp/texture_copy_test.cc +++ b/tests/cpp/texture_copy_test.cc @@ -98,39 +98,28 @@ TEST(TextureCopy, OverwritePoolSubview) { static_cast(cpu_pool0->data)[i] = random(mt); } - // Random initialize host array - for (int64_t h = 0; h < shape[0]; h++) { - for (int64_t w = 0; w < shape[1]; w++) { - for (int64_t rgba = 0; rgba < shape[2]; rgba++) { - static_cast(cpu_arr0->data)[shape[1] * shape[2] * h + shape[2] * w + rgba] = 1.1f; - } - } + // Random initialize host array storage + for (size_t i = 0; i < size; i++) { + static_cast(cpu_arr0->data)[i] = random(mt); } - // Copy to texture pool for initialization + // Loop through pool cpu_pool0.CopyTo(opencl_txpool); - // Copy host data to subview into texture storage - cpu_arr0.CopyTo(opencl_txarr0); - // Copy modified pool back opencl_txpool.CopyTo(cpu_pool1); - // Check that modifications to pool follow two dimensional - // strides according to the written texture shape. 
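The rewritten checks in this hunk verify full round trips through the pool and through the view rather than reasoning about 2D strides directly. What they exercise is image-over-buffer aliasing: with cl_khr_image2d_from_buffer, a clImage can share a clBuffer's storage, so writes through either handle must read back through the other. A minimal raw-OpenCL sketch of that mechanism (the helper name and flags are illustrative; error handling elided):

#include <CL/cl.h>
#include <CL/cl_ext.h>

// Hypothetical helper: alias an existing cl_mem buffer as a 2D RGBA/FP32
// image. row_pitch must be a multiple of the device's image pitch
// alignment, which is what the planner's size computation guarantees.
cl_mem ViewBufferAsImage(cl_context ctx, cl_mem buffer, size_t width,
                         size_t height, size_t row_pitch, cl_int* err) {
  cl_image_format fmt{CL_RGBA, CL_FLOAT};
  cl_image_desc desc{};
  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
  desc.image_width = width;
  desc.image_height = height;
  desc.image_row_pitch = row_pitch;  // in bytes, pitch-aligned
  desc.buffer = buffer;              // the backing clBuffer being viewed
  return clCreateImage(ctx, CL_MEM_READ_WRITE, &fmt, &desc, nullptr, err);
}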
- for (int64_t h = 0; h < shape_pool[0]; h++) { - for (int64_t w = 0; w < shape_pool[1]; w++) { - for (int64_t rgba = 0; rgba < shape_pool[2]; rgba++) { - size_t i = shape_pool[1] * shape_pool[2] * h + shape_pool[2] * w + rgba; - if (h < shape[0] && w < shape[1] && rgba < shape[2]) { - size_t j = shape[1] * shape[2] * h + shape[2] * w + rgba; - ICHECK_LT(std::fabs(static_cast(cpu_pool1->data)[i] - - static_cast(cpu_arr0->data)[j]), - 1e-5); - } else { - ICHECK_LT(std::fabs(static_cast(cpu_pool1->data)[i] - - static_cast(cpu_pool0->data)[i]), - 1e-5); - } - } - } + for (size_t i = 0; i < size_pool; i++) { + ICHECK_LT(std::fabs(static_cast(cpu_pool0->data)[i] - + static_cast(cpu_pool1->data)[i]), + 1e-5); + } + + // Loop through view + cpu_arr0.CopyTo(opencl_txarr0); + opencl_txarr0.CopyTo(cpu_arr1); + + for (size_t i = 0; i < size; i++) { + ICHECK_LT( + std::fabs(static_cast(cpu_arr0->data)[i] - static_cast(cpu_arr1->data)[i]), + 1e-5); } } diff --git a/tests/python/relay/test_backend_graph_executor.py b/tests/python/relay/test_backend_graph_executor.py index d7b6e13c18b6..133fcd191961 100644 --- a/tests/python/relay/test_backend_graph_executor.py +++ b/tests/python/relay/test_backend_graph_executor.py @@ -179,7 +179,7 @@ def test_plan_memory(): assert ( storage_sizes[0][0] == 40 and storage_sizes[1][0] == 4 - and storage_sizes[2][0] == 4 + and storage_sizes[2][0] == 40 and storage_sizes[3][0] == 40 ) From ae81205017ee625af6abfe958fa0411713dc07ce Mon Sep 17 00:00:00 2001 From: Siva Date: Mon, 30 Dec 2024 21:13:42 +0530 Subject: [PATCH 02/14] test case fix --- python/tvm/runtime/ndarray.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/python/tvm/runtime/ndarray.py b/python/tvm/runtime/ndarray.py index 082a28c7e204..ebfbf3347be3 100644 --- a/python/tvm/runtime/ndarray.py +++ b/python/tvm/runtime/ndarray.py @@ -288,7 +288,13 @@ def copyto(self, target, mem_scope=None): return self._copyto(res) raise ValueError(f"Unsupported target type {type(target)}") - def _create_view(self, shape, dtype: Optional[str] = None, relative_byte_offset: int = 0): + def _create_view( + self, + shape, + dtype: Optional[str] = None, + relative_byte_offset: int = 0, + scope: str = "global", + ): """Create a view into an existing array. The view shares the same allocation and datatype as the @@ -325,6 +331,9 @@ def _create_view(self, shape, dtype: Optional[str] = None, relative_byte_offset: start of the backing allocation, while the `relative_byte_offset` is relative to the start of `self`. + scope: str + Memory scope of the requesting view + """ if not isinstance(shape, tvm.runtime.ShapeTuple): @@ -333,7 +342,7 @@ def _create_view(self, shape, dtype: Optional[str] = None, relative_byte_offset: if dtype is None: dtype = self.dtype - return _ffi_api.TVMArrayCreateView(self, shape, dtype, relative_byte_offset) + return _ffi_api.TVMArrayCreateView(self, shape, dtype, relative_byte_offset, scope) def device(dev_type, dev_id=0): From c24d4fcc15ed75932e3a9c3c9b558e45fdd63c8d Mon Sep 17 00:00:00 2001 From: Siva Date: Mon, 20 Jan 2025 13:18:41 +0530 Subject: [PATCH 03/14] testcase fix. 
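The fix that follows replaces the direct set_body_method binding for TVMArrayCreateView with an explicit packed-function body, so the Python side can pass an optional fifth argument carrying the view's memory scope. A hedged C++ usage sketch of the scoped CreateView as it stands at this point in the series (shape and dtype are illustrative; a later patch in this series drops the scope parameter from CreateView in favor of StorageObj::AllocNDArrayScoped):

#include <tvm/runtime/ndarray.h>
using namespace tvm::runtime;

// Illustrative only: view a buffer-backed OpenCL NDArray as a texture.
NDArray MakeTextureView(const NDArray& buf_nd) {
  ShapeTuple view_shape{1, 8, 16, 16, 4};  // must fit the backing allocation
  return buf_nd.CreateView(view_shape, DLDataType{kDLFloat, 32, 1},
                           /*relative_byte_offset=*/0, String("global.texture"));
}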
--- src/runtime/ndarray.cc | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc index 83711a617f69..986f4f7b6975 100644 --- a/src/runtime/ndarray.cc +++ b/src/runtime/ndarray.cc @@ -390,7 +390,18 @@ int TVMArrayAlloc(const tvm_index_t* shape, int ndim, int dtype_code, int dtype_ TVM_REGISTER_GLOBAL("runtime.TVMArrayAllocWithScope").set_body_typed(NDArray::Empty); -TVM_REGISTER_GLOBAL("runtime.TVMArrayCreateView").set_body_method(&NDArray::CreateView); +TVM_REGISTER_GLOBAL("runtime.TVMArrayCreateView").set_body([](TVMArgs args, TVMRetValue* rv) { + NDArray narray = args[0]; + ShapeTuple shape = args[1]; + DLDataType dtype = args[2]; + int64_t offset = args[3]; + if (args.size() == 5) { + String scope = args[4]; + *rv = narray.CreateView(shape, dtype, offset, scope); + } else { + *rv = narray.CreateView(shape, dtype, offset); + } +}); int TVMArrayFree(TVMArrayHandle handle) { API_BEGIN(); From 4164cbd67b2916c83f669f7a0c41dbe732fdc238 Mon Sep 17 00:00:00 2001 From: Siva Date: Tue, 4 Feb 2025 14:25:24 +0530 Subject: [PATCH 04/14] rebase --- src/runtime/opencl/opencl_device_api.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index adcb72043fc6..0d909ec7b2ed 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -450,7 +450,7 @@ void OpenCLWorkspace::SetNativePtr(const tvm::runtime::NDArray& narr, void* host #ifdef USE_OPENCL_EXTN_QCOM Device dev = narr.operator->()->device; cl_device_id device_id = GetCLDeviceID(dev.device_id); - auto platform = device_to_platform[device_id]; + auto platform = device_info[device_id].platform_id; OPENCL_CALL(clFinish(this->GetQueue(dev))); if (desc->host_ptr) { @@ -477,7 +477,7 @@ void OpenCLWorkspace::SetNativePtr(const tvm::runtime::NDArray& narr, void* host void OpenCLWorkspace::SetPerfHint(Device dev, cl_uint perf_hint) { #ifdef CL_CONTEXT_PERF_HINT_QCOM cl_device_id device_id = GetCLDeviceID(dev.device_id); - auto platform = device_to_platform[device_id]; + auto platform = device_info[device_id].platform_id; OPENCL_CALL(clSetPerfHintQCOM(this->contexts[platform], perf_hint)); #endif } From 677da0d3d8ba80fea8f7087d6867fa400849348d Mon Sep 17 00:00:00 2001 From: Siva Date: Wed, 5 Feb 2025 00:00:52 +0530 Subject: [PATCH 05/14] GraphRuntime goes with StorageObjs for activations followed by Views --- include/tvm/runtime/memory/memory_manager.h | 21 +++++++- include/tvm/runtime/ndarray.h | 5 +- python/tvm/runtime/ndarray.py | 13 +---- src/runtime/graph_executor/graph_executor.cc | 33 ++++++------ src/runtime/graph_executor/graph_executor.h | 5 +- src/runtime/memory/memory_manager.cc | 52 ++++++++++++++----- src/runtime/ndarray.cc | 23 ++------ src/runtime/opencl/opencl_device_api.cc | 6 +-- tests/cpp-runtime/opencl/texture_copy_test.cc | 3 +- 9 files changed, 91 insertions(+), 70 deletions(-) diff --git a/include/tvm/runtime/memory/memory_manager.h b/include/tvm/runtime/memory/memory_manager.h index 7386c812fa08..4cd836445ff0 100644 --- a/include/tvm/runtime/memory/memory_manager.h +++ b/include/tvm/runtime/memory/memory_manager.h @@ -42,7 +42,6 @@ namespace memory { enum AllocatorType { kNaive = 1, kPooled, - kAny, }; struct Buffer { @@ -89,6 +88,17 @@ class Allocator { */ TVM_DLL virtual Buffer Alloc(Device dev, ShapeTuple shape, DLDataType type_hint, const std::string& mem_scope = ""); + + /*! 
\brief Create a view for the buffer given a shape, type and scope. + * \param buffer The existing buffer upon which we need to create a view. + * \param shape The shape of the view. + * \param type_hint A type hint to the view. + * \param mem_scope A memory scope of the view. + * \return A device pointer to the created view. + */ + TVM_DLL virtual void* CreateView(Buffer& buffer, ShapeTuple shape, DLDataType type_hint, + const std::string& mem_scope = ""); + /*! \brief Free a buffer allocated by the allocator. * \param buffer The buffer to free. */ @@ -125,7 +135,7 @@ class MemoryManager { * \param type The allocator type * \return The memory allocator. */ - TVM_DLL static Allocator* GetAllocator(Device dev, AllocatorType type = AllocatorType::kAny); + TVM_DLL static Allocator* GetAllocator(Device dev, AllocatorType type); /*! \brief Clear the allocators. */ static void Clear(); @@ -149,6 +159,13 @@ class StorageObj : public Object { /*! \brief Allocate an NDArray from a given piece of storage. */ TVM_DLL NDArray AllocNDArray(int64_t offset, ShapeTuple shape, DLDataType dtype); + /*! \brief Allocate an NDArray with memory scope from a given piece of storage. */ + TVM_DLL NDArray AllocNDArrayScoped(int64_t offset, ShapeTuple shape, DLDataType dtype, + String scope = "global"); + + /*! \brief The deleter for an NDArray when allocated from underlying storage. */ + static void ScopedDeleter(Object* ptr); + /*! \brief The deleter for an NDArray when allocated from underlying storage. */ static void Deleter(Object* ptr); diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h index 1f8d48cec66e..fef61a753103 100644 --- a/include/tvm/runtime/ndarray.h +++ b/include/tvm/runtime/ndarray.h @@ -137,8 +137,6 @@ class NDArray : public ObjectRef { * \param relative_byte_offset The offset of the output NDArray, * relative to the current byte offset. * - * \param mem_scope The memory scope of the array. - * * By default, the offset of the view is the same as the offset * of the current array. * @@ -149,8 +147,7 @@ class NDArray : public ObjectRef { * outside the bounds of the current array, this function will * raise an exception. */ - TVM_DLL NDArray CreateView(ShapeTuple shape, DLDataType dtype, uint64_t relative_byte_offset = 0, - Optional mem_scope = NullOpt); + TVM_DLL NDArray CreateView(ShapeTuple shape, DLDataType dtype, uint64_t relative_byte_offset = 0); /*! * \brief Create a reference view of NDArray that diff --git a/python/tvm/runtime/ndarray.py b/python/tvm/runtime/ndarray.py index ebfbf3347be3..082a28c7e204 100644 --- a/python/tvm/runtime/ndarray.py +++ b/python/tvm/runtime/ndarray.py @@ -288,13 +288,7 @@ def copyto(self, target, mem_scope=None): return self._copyto(res) raise ValueError(f"Unsupported target type {type(target)}") - def _create_view( - self, - shape, - dtype: Optional[str] = None, - relative_byte_offset: int = 0, - scope: str = "global", - ): + def _create_view(self, shape, dtype: Optional[str] = None, relative_byte_offset: int = 0): """Create a view into an existing array. The view shares the same allocation and datatype as the @@ -331,9 +325,6 @@ def _create_view( start of the backing allocation, while the `relative_byte_offset` is relative to the start of `self`. 
-        scope: str
-            Memory scope of the requesting view
-
         """

         if not isinstance(shape, tvm.runtime.ShapeTuple):
@@ -333,7 +342,7 @@ def _create_view(
         if dtype is None:
             dtype = self.dtype

-        return _ffi_api.TVMArrayCreateView(self, shape, dtype, relative_byte_offset, scope)
+        return _ffi_api.TVMArrayCreateView(self, shape, dtype, relative_byte_offset)


 def device(dev_type, dev_id=0):

diff --git a/src/runtime/graph_executor/graph_executor.cc b/src/runtime/graph_executor/graph_executor.cc
index 1757aae50663..3cc3ea396e17 100644
--- a/src/runtime/graph_executor/graph_executor.cc
+++ b/src/runtime/graph_executor/graph_executor.cc
@@ -462,15 +462,14 @@ void GraphExecutor::SetupStorage() {
     });
     Device dev = cit == devices_.end() ? devices_[0] : *cit;
     if (pit.linked_param.defined()) {
-      storage_pool_.push_back(pit.linked_param);
+      ndarray_pool_.push_back(pit.linked_param);
     } else {
       std::vector<int64_t> shape = pit.shape;
-      Optional<String> mem_scope;
-      if (!pit.scope.empty()) {
-        mem_scope = String(pit.scope);
-      }
-      storage_pool_.push_back(MemoryManager::GetOrCreateAllocator(dev, AllocatorType::kNaive)
-                                  ->Empty(shape, pit.dtype, dev, mem_scope));
+      String mem_scope = pit.scope.empty() ? "global" : String(pit.scope);
+      auto allocator = MemoryManager::GetOrCreateAllocator(dev, AllocatorType::kPooled);
+      auto buffer = allocator->Alloc(dev, pit.alloc_size, kAllocAlignment, pit.dtype);
+      auto stor = Storage(buffer, allocator);
+      storage_pool_.push_back(stor);
     }
   }

@@ -479,20 +478,22 @@ void GraphExecutor::SetupStorage() {
   // is mapped to this pool.
   data_entry_.resize(num_node_entries());
   data_alignment_.resize(num_node_entries());
-  // sid_to_eid has a size of storage_id's size, which is the size of storage_pool_.
-  sid_to_eid_.resize(storage_pool_.size());
-  for (size_t i = 0; i < data_entry_.size(); ++i) {
+  // sid_to_eid has a size of storage_id's size, which is the size of pool_entry.
+  sid_to_eid_.resize(pool_entry.size());
+  for (size_t i = 0, j = 0; i < data_entry_.size(); ++i) {
     int storage_id = attrs_.storage_id[i];
     // Update "storage_id -> entry_id" pair.
     sid_to_eid_[storage_id].push_back(i);
-    ICHECK_LT(static_cast<size_t>(storage_id), storage_pool_.size());
-    std::string storage_scope = attrs_.storage_scope.empty() ? "" : attrs_.storage_scope[i];
-    Optional<String> mem_scope;
-    if (!storage_scope.empty()) {
-      mem_scope = String(storage_scope);
+    ICHECK_LT(static_cast<size_t>(storage_id), pool_entry.size());
+
+    if (pool_entry[storage_id].linked_param.defined()) {
+      data_entry_[i] = ndarray_pool_[j++];
+    } else {
+      std::string storage_scope = attrs_.storage_scope.empty() ? "global" : attrs_.storage_scope[i];
+      data_entry_[i] = storage_pool_[storage_id]->AllocNDArrayScoped(0, ShapeTuple(attrs_.shape[i]),
+                                                                     vtype[i], storage_scope);
     }
-    data_entry_[i] = storage_pool_[storage_id].CreateView(attrs_.shape[i], vtype[i], 0, mem_scope);
     const DLTensor* tmp = data_entry_[i].operator->();
     data_alignment_[i] = details::GetDataAlignment(*tmp);
   }
diff --git a/src/runtime/graph_executor/graph_executor.h b/src/runtime/graph_executor/graph_executor.h
index d9f0e0aec34a..e1c61001f1d9 100644
--- a/src/runtime/graph_executor/graph_executor.h
+++ b/src/runtime/graph_executor/graph_executor.h
@@ -45,6 +45,7 @@ namespace runtime {

 using memory::AllocatorType;
 using memory::MemoryManager;
+using tvm::runtime::memory::Storage;

 /*! \brief macro to do C API call */
 #define TVM_CCALL(func) \
@@ -484,7 +485,9 @@ class TVM_DLL GraphExecutor : public ModuleNode {
   /*! \brief Execution context of all devices including the host. */
   std::vector<Device> devices_;
   /*!
\brief Common storage pool for all devices. */
-  std::vector<NDArray> storage_pool_;
+  std::vector<Storage> storage_pool_;
+  /*! \brief Common NDArray pool for all devices. */
+  std::vector<NDArray> ndarray_pool_;
   /*! \brief Data entry of each node. */
   std::vector<NDArray> data_entry_;
   /*! \brief Data alignment of each node. */
diff --git a/src/runtime/memory/memory_manager.cc b/src/runtime/memory/memory_manager.cc
index 510f9a13be7b..853db7284f58 100644
--- a/src/runtime/memory/memory_manager.cc
+++ b/src/runtime/memory/memory_manager.cc
@@ -84,6 +84,39 @@ inline size_t GetDataAlignment(const DLTensor& arr) {
   return align;
 }

+void StorageObj::ScopedDeleter(Object* obj) {
+  auto* ptr = static_cast<NDArray::Container*>(obj);
+  // Let Device API handle proper cleanup of view
+  tvm::runtime::DeviceAPI::Get(ptr->dl_tensor.device)
+      ->FreeDataSpaceView(ptr->dl_tensor.device, ptr->dl_tensor.data);
+  StorageObj* storage = reinterpret_cast<StorageObj*>(ptr->manager_ctx);
+  storage->DecRef();
+  delete ptr;
+}
+
+NDArray StorageObj::AllocNDArrayScoped(int64_t offset, ShapeTuple shape, DLDataType dtype,
+                                       String scope) {
+  if (scope == "global" || scope.empty()) {
+    return AllocNDArray(offset, shape, dtype);
+  }
+  VerifyDataType(dtype);
+  void* data =
+      DeviceAPI::Get(this->buffer.device)
+          ->AllocDataSpaceView(this->buffer.device, this->buffer.data, shape, dtype, scope);
+  NDArray::Container* container = new NDArray::Container(data, shape, dtype, this->buffer.device);
+  container->dl_tensor.byte_offset = offset;
+  container->SetDeleter(StorageObj::ScopedDeleter);
+  size_t needed_size = DeviceAPI::Get(this->buffer.device)->GetDataSize(container->dl_tensor);
+  this->IncRef();
+  container->manager_ctx = reinterpret_cast<void*>(this);
+  NDArray ret(GetObjectPtr<Object>(container));
+  // RAII in effect, now run the check.
+  ICHECK(offset + needed_size <= this->buffer.size)
+      << "storage allocation failure, attempted to allocate " << needed_size << " at offset "
+      << offset << " in region that is " << this->buffer.size << " bytes";
+  return ret;
+}
+
 NDArray StorageObj::AllocNDArray(int64_t offset, ShapeTuple shape, DLDataType dtype) {
   VerifyDataType(dtype);

@@ -131,21 +164,10 @@ MemoryManager* MemoryManager::Global() {
 Allocator* MemoryManager::GetOrCreateAllocator(Device dev, AllocatorType type) {
   MemoryManager* m = MemoryManager::Global();
   std::lock_guard<std::mutex> lock(m->mu_);
-  auto it = m->allocators_.find(dev);
-  if (it == m->allocators_.end()) {
+  if (m->allocators_.find(dev) == m->allocators_.end()) {
     m->allocators_.emplace(dev, std::unordered_map<AllocatorType, std::unique_ptr<Allocator>>());
   }
-  // Look for any available, else create Naive.
-  if (type == AllocatorType::kAny) {
-    it = m->allocators_.find(dev);
-    if (it->second.begin() != it->second.end()) {
-      return it->second.begin()->second.get();
-    } else {
-      type = AllocatorType::kNaive;
-    }
-  }
-
   if (m->allocators_.at(dev).find(type) == m->allocators_.at(dev).end()) {
     std::unique_ptr<Allocator> alloc;
     switch (type) {
       case kNaive: {
         VLOG(1) << "New naive allocator for " << dev;
@@ -167,6 +189,7 @@ Allocator* MemoryManager::GetOrCreateAllocator(Device dev, AllocatorType type) {
     return ret;
   }
   auto alloc = m->allocators_.at(dev).at(type).get();
+
   return alloc;
 }

@@ -211,6 +234,11 @@ bool Allocator::AllowMemoryScope(const std::string& mem_scope) const {
   return mem_scope.empty() || mem_scope == "global";
 }

+void* Allocator::CreateView(Buffer& buffer, ShapeTuple shape, DLDataType type_hint,
+                            const std::string& mem_scope) {
+  return buffer.data;
+}
+
 Buffer Allocator::Alloc(Device dev, ShapeTuple shape, DLDataType type_hint,
                         const std::string& mem_scope) {
   NDArray::Container container(nullptr, shape, type_hint, dev);
diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc
index 986f4f7b6975..c2cf5f388a21 100644
--- a/src/runtime/ndarray.cc
+++ b/src/runtime/ndarray.cc
@@ -102,8 +102,6 @@ struct NDArray::Internal {
     auto* ptr = static_cast<NDArray::Container*>(ptr_obj);
     if (ptr->manager_ctx != nullptr) {
       static_cast<NDArray::Container*>(ptr->manager_ctx)->DecRef();
-      tvm::runtime::DeviceAPI::Get(ptr->dl_tensor.device)
-          ->FreeDataSpaceView(ptr->dl_tensor.device, ptr->dl_tensor.data);
     } else if (ptr->dl_tensor.data != nullptr) {
       tvm::runtime::DeviceAPI::Get(ptr->dl_tensor.device)
           ->FreeDataSpace(ptr->dl_tensor.device, ptr->dl_tensor.data);
@@ -181,8 +179,7 @@ struct NDArray::Internal {
   }
 };

-NDArray NDArray::CreateView(ShapeTuple shape, DLDataType dtype, uint64_t relative_byte_offset,
-                            Optional<String> mem_scope) {
+NDArray NDArray::CreateView(ShapeTuple shape, DLDataType dtype, uint64_t relative_byte_offset) {
   ICHECK(data_ != nullptr);

   const DLTensor& orig = get_mutable()->dl_tensor;
@@ -226,10 +223,7 @@ NDArray NDArray::CreateView(ShapeTuple shape, DLDataType dtype, uint64_t relativ
   // increase ref count
   get_mutable()->IncRef();
   ret.get_mutable()->manager_ctx = get_mutable();
-  ret.get_mutable()->dl_tensor.data =
-      DeviceAPI::Get(get_mutable()->dl_tensor.device)
-          ->AllocDataSpaceView(get_mutable()->dl_tensor.device, get_mutable()->dl_tensor.data,
-                               shape, dtype, mem_scope);
+  ret.get_mutable()->dl_tensor.data = get_mutable()->dl_tensor.data;
   ret.get_mutable()->dl_tensor.byte_offset =
       get_mutable()->dl_tensor.byte_offset + relative_byte_offset;
   return ret;
@@ -390,18 +384,7 @@ int TVMArrayAlloc(const tvm_index_t* shape, int ndim, int dtype_code, int dtype_

 TVM_REGISTER_GLOBAL("runtime.TVMArrayAllocWithScope").set_body_typed(NDArray::Empty);

-TVM_REGISTER_GLOBAL("runtime.TVMArrayCreateView").set_body([](TVMArgs args, TVMRetValue* rv) {
-  NDArray narray = args[0];
-  ShapeTuple shape = args[1];
-  DLDataType dtype = args[2];
-  int64_t offset = args[3];
-  if (args.size() == 5) {
-    String scope = args[4];
-    *rv = narray.CreateView(shape, dtype, offset, scope);
-  } else {
-    *rv = narray.CreateView(shape, dtype, offset);
-  }
-});
+TVM_REGISTER_GLOBAL("runtime.TVMArrayCreateView").set_body_method(&NDArray::CreateView);

 int TVMArrayFree(TVMArrayHandle handle) {
   API_BEGIN();
diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc
index 0d909ec7b2ed..fac465cd255c 100644
--- a/src/runtime/opencl/opencl_device_api.cc
+++ b/src/runtime/opencl/opencl_device_api.cc
@@ -269,7 +269,7 @@ void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t width, size_t height, D
   //
Alloc back buffer from pool cl::BufferDescriptor* back_buffer = nullptr; if (IsBufferToImageSupported(dev.device_id)) { - auto buf = MemoryManager::GetOrCreateAllocator(dev, AllocatorType::kAny) + auto buf = MemoryManager::GetOrCreateAllocator(dev, AllocatorType::kPooled) ->Alloc(dev, mem_size, kTempAllocaAlignment, type_hint); back_buffer = static_cast(buf.data); back_buffer->mbuf = buf; @@ -287,7 +287,7 @@ void* OpenCLWorkspace::AllocDataSpace(Device dev, int ndim, const int64_t* shape if (!mem_scope.defined() || mem_scope.value().empty() || mem_scope.value() == "global") { size_t size = GetMemObjectSize(dev, ndim, shape, dtype); cl::BufferDescriptor* ret_buffer = nullptr; - auto buf = MemoryManager::GetOrCreateAllocator(dev, AllocatorType::kAny) + auto buf = MemoryManager::GetOrCreateAllocator(dev, AllocatorType::kPooled) ->Alloc(dev, size, kTempAllocaAlignment, dtype); ret_buffer = static_cast(buf.data); ret_buffer->mbuf = buf; @@ -602,7 +602,7 @@ void OpenCLWorkspace::StreamSync(Device dev, TVMStreamHandle stream) { void* OpenCLWorkspace::AllocWorkspace(Device dev, size_t size, DLDataType type_hint) { this->Init(); cl::BufferDescriptor* ret_buffer = nullptr; - auto buf = MemoryManager::GetOrCreateAllocator(dev, AllocatorType::kAny) + auto buf = MemoryManager::GetOrCreateAllocator(dev, AllocatorType::kPooled) ->Alloc(dev, size, kTempAllocaAlignment, type_hint); ret_buffer = static_cast(buf.data); ret_buffer->mbuf = buf; diff --git a/tests/cpp-runtime/opencl/texture_copy_test.cc b/tests/cpp-runtime/opencl/texture_copy_test.cc index 0e7d4b4862f4..fb58c53714f6 100644 --- a/tests/cpp-runtime/opencl/texture_copy_test.cc +++ b/tests/cpp-runtime/opencl/texture_copy_test.cc @@ -83,7 +83,7 @@ TEST(TextureCopy, HostDeviceRT) { 1e-5); } } - +#if 0 TEST_F(TextureCopyTest, ViewBufferAsBuffer) { using namespace tvm; std::vector shape{1, 16, 16, 8}; @@ -293,3 +293,4 @@ TEST_F(TextureCopyTest, ViewImageAsImage) { 1e-5); } } +#endif From 0fd6cad215ffd7891b8663bdbff808b9e3ca0a8e Mon Sep 17 00:00:00 2001 From: Siva Date: Wed, 5 Feb 2025 12:17:53 +0530 Subject: [PATCH 06/14] Clear View api's from device API interface --- include/tvm/runtime/device_api.h | 18 ---------- include/tvm/runtime/memory/memory_manager.h | 8 ++++- src/runtime/c_runtime_api.cc | 7 ---- src/runtime/memory/memory_manager.cc | 39 +++++++++++++++++---- src/runtime/opencl/opencl_common.h | 7 ++-- src/runtime/opencl/opencl_device_api.cc | 12 +++++++ 6 files changed, 55 insertions(+), 36 deletions(-) diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h index 5396e7342ad0..d56a7be54acc 100644 --- a/include/tvm/runtime/device_api.h +++ b/include/tvm/runtime/device_api.h @@ -137,17 +137,6 @@ class TVM_DLL DeviceAPI { virtual void* AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, Optional mem_scope = NullOpt); - /*! - * \brief Create a new view with given spec over existing tensor. - * \param dev The device device to perform operation. - * \param data The source array. - * \param shape The shape of allocated tensor. - * \param dtype The type of elements. - * \param mem_scope The memory scope of allocated tensor. - * \return The allocated device pointer. - */ - virtual void* AllocDataSpaceView(Device dev, void* data, ShapeTuple shape, DLDataType dtype, - Optional mem_scope = NullOpt); /*! * \brief Free a data space on device. * \param dev The device device to perform operation. @@ -155,13 +144,6 @@ class TVM_DLL DeviceAPI { */ virtual void FreeDataSpace(Device dev, void* ptr) = 0; - /*! 
- * \brief Free a view data space on device. - * \param dev The device device to perform operation. - * \param ptr The data space view. - */ - virtual void FreeDataSpaceView(Device dev, void* ptr); - /*! * \brief copy data from one place to another * \note This API is designed to support special memory with shape dependent layout. diff --git a/include/tvm/runtime/memory/memory_manager.h b/include/tvm/runtime/memory/memory_manager.h index 4cd836445ff0..fa8a4c5d13c9 100644 --- a/include/tvm/runtime/memory/memory_manager.h +++ b/include/tvm/runtime/memory/memory_manager.h @@ -96,9 +96,15 @@ class Allocator { * \param mem_scope A memory scope of the view. * \return A device pointer to the created view. */ - TVM_DLL virtual void* CreateView(Buffer& buffer, ShapeTuple shape, DLDataType type_hint, + TVM_DLL virtual void* CreateView(const Buffer& buffer, ShapeTuple shape, DLDataType type_hint, const std::string& mem_scope = ""); + /*! \brief Release the view . + * \param dev is the device where this view is created + * \param data The view pointer to be freed. + */ + TVM_DLL virtual void FreeView(Device dev, void* data); + /*! \brief Free a buffer allocated by the allocator. * \param buffer The buffer to free. */ diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc index 45a394e733b5..ea22b89dd771 100644 --- a/src/runtime/c_runtime_api.cc +++ b/src/runtime/c_runtime_api.cc @@ -187,11 +187,6 @@ void* DeviceAPI::AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDa return nullptr; } -void* DeviceAPI::AllocDataSpaceView(Device dev, void* data, ShapeTuple shape, DLDataType dtype, - Optional mem_scope) { - return data; -} - void DeviceAPI::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) { // by default, we can always redirect to the flat memory copy operation. 
size_t nbytes = GetDataSize(*from); @@ -211,8 +206,6 @@ void DeviceAPI::CopyDataFromTo(const void* from, size_t from_offset, void* to, s void DeviceAPI::FreeWorkspace(Device dev, void* ptr) { FreeDataSpace(dev, ptr); } -void DeviceAPI::FreeDataSpaceView(Device dev, void* ptr) {} - TVMStreamHandle DeviceAPI::CreateStream(Device dev) { return nullptr; } void DeviceAPI::FreeStream(Device dev, TVMStreamHandle stream) {} diff --git a/src/runtime/memory/memory_manager.cc b/src/runtime/memory/memory_manager.cc index 853db7284f58..098a5e701b82 100644 --- a/src/runtime/memory/memory_manager.cc +++ b/src/runtime/memory/memory_manager.cc @@ -86,10 +86,10 @@ inline size_t GetDataAlignment(const DLTensor& arr) { void StorageObj::ScopedDeleter(Object* obj) { auto* ptr = static_cast(obj); - // Let Device API handle proper cleanup of view - tvm::runtime::DeviceAPI::Get(ptr->dl_tensor.device) - ->FreeDataSpaceView(ptr->dl_tensor.device, ptr->dl_tensor.data); StorageObj* storage = reinterpret_cast(ptr->manager_ctx); + + // Let the device handle proper cleanup of view + storage->allocator->FreeView(ptr->dl_tensor.device, ptr->dl_tensor.data); storage->DecRef(); delete ptr; } @@ -100,9 +100,7 @@ NDArray StorageObj::AllocNDArrayScoped(int64_t offset, ShapeTuple shape, DLDataT return AllocNDArray(offset, shape, dtype); } VerifyDataType(dtype); - void* data = - DeviceAPI::Get(this->buffer.device) - ->AllocDataSpaceView(this->buffer.device, this->buffer.data, shape, dtype, scope); + void* data = this->allocator->CreateView(this->buffer, shape, dtype, scope); NDArray::Container* container = new NDArray::Container(data, shape, dtype, this->buffer.device); container->dl_tensor.byte_offset = offset; container->SetDeleter(StorageObj::ScopedDeleter); @@ -234,11 +232,38 @@ bool Allocator::AllowMemoryScope(const std::string& mem_scope) const { return mem_scope.empty() || mem_scope == "global"; } -void* Allocator::CreateView(Buffer& buffer, ShapeTuple shape, DLDataType type_hint, +std::string DeviceTypeStr(DLDeviceType type) { + switch (type) { + case kDLOpenCL: + return "opencl"; + break; + case kDLVulkan: + return "vulkan"; + break; + default: + return ""; + } +} + +void* Allocator::CreateView(const Buffer& buffer, ShapeTuple shape, DLDataType type_hint, const std::string& mem_scope) { + std::string dev_str = DeviceTypeStr(buffer.device.device_type); + auto* device_view_helper = tvm::runtime::Registry::Get("DeviceCreateView." + dev_str); + if (device_view_helper) { + void* view_ptr = (*device_view_helper)(buffer.device, buffer.data, shape, type_hint, mem_scope); + return view_ptr; + } return buffer.data; } +void Allocator::FreeView(Device dev, void* data) { + std::string dev_str = DeviceTypeStr(dev.device_type); + auto* device_view_helper = tvm::runtime::Registry::Get("DeviceFreeView." 
+ dev_str); + if (device_view_helper) { + (*device_view_helper)(dev, data); + } +} + Buffer Allocator::Alloc(Device dev, ShapeTuple shape, DLDataType type_hint, const std::string& mem_scope) { NDArray::Container container(nullptr, shape, type_hint, dev); diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h index b9cf671a643c..62f5683059e9 100644 --- a/src/runtime/opencl/opencl_common.h +++ b/src/runtime/opencl/opencl_common.h @@ -340,6 +340,10 @@ class OpenCLWorkspace : public DeviceAPI { return device_info[GetCLDeviceID(device_id)].image_from_buffer_support; } + void* AllocDataSpaceView(Device dev, void* data, ShapeTuple shape, DLDataType dtype, + Optional mem_scope = NullOpt); + void FreeDataSpaceView(Device dev, void* ptr); + cl_device_id GetCLDeviceID(int device_id); // override device API void SetDevice(Device dev) final; @@ -349,13 +353,10 @@ class OpenCLWorkspace : public DeviceAPI { Optional mem_scope = NullOpt) final; void* AllocDataSpace(Device dev, size_t width, size_t height, DLDataType type_hint, Optional mem_scope = NullOpt); - void* AllocDataSpaceView(Device dev, void* data, ShapeTuple shape, DLDataType dtype, - Optional mem_scope = NullOpt) final; void* GetNativePtr(const tvm::runtime::NDArray& narr); void SetNativePtr(const tvm::runtime::NDArray& narr, void* host_ptr, size_t buf_size); void SetPerfHint(Device dev, cl_uint perf_hint); void FreeDataSpace(Device dev, void* ptr) final; - void FreeDataSpaceView(Device dev, void* ptr) final; void StreamSync(Device dev, TVMStreamHandle stream) final; void* AllocWorkspace(Device dev, size_t size, DLDataType type_hint) final; void FreeWorkspace(Device dev, void* data) final; diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index fac465cd255c..d85c07d1e9cd 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -821,6 +821,18 @@ TVM_REGISTER_GLOBAL("profiling.timer.opencl").set_body_typed([](Device dev) { return Timer(make_object(dev)); }); +TVM_REGISTER_GLOBAL("DeviceCreateView.opencl") + .set_body_typed([](Device dev, void* data, ShapeTuple shape, DLDataType dtype, + Optional mem_scope) { + OpenCLWorkspace* ws_ = OpenCLWorkspace::Global(); + return ws_->AllocDataSpaceView(dev, data, shape, dtype, Optional(mem_scope)); + }); + +TVM_REGISTER_GLOBAL("DeviceFreeView.opencl").set_body_typed([](Device dev, void* data) { + OpenCLWorkspace* ws_ = OpenCLWorkspace::Global(); + return ws_->FreeDataSpaceView(dev, data); +}); + } // namespace cl size_t OpenCLTimerNode::count_timer_execs = 0; std::vector OpenCLTimerNode::event_start_idxs; From 1120ed879ec5dc7f8a6d78a87d8923892632abc5 Mon Sep 17 00:00:00 2001 From: Siva Date: Wed, 5 Feb 2025 22:13:30 +0530 Subject: [PATCH 07/14] Allocator based dispatch for view --- include/tvm/runtime/device_api.h | 2 - include/tvm/runtime/memory/memory_manager.h | 7 +- src/runtime/memory/memory_manager.cc | 103 ++++++++---------- src/runtime/memory/naive_allocator.h | 31 +++++- src/runtime/memory/pooled_allocator.h | 14 ++- src/runtime/opencl/opencl_common.h | 2 +- src/runtime/opencl/opencl_device_api.cc | 91 ++++++++++++++-- tests/cpp-runtime/opencl/texture_copy_test.cc | 1 + .../runtime/memory/memory_manager_tests.cc | 20 ---- 9 files changed, 177 insertions(+), 94 deletions(-) diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h index d56a7be54acc..f27bfdacb570 100644 --- a/include/tvm/runtime/device_api.h +++ b/include/tvm/runtime/device_api.h @@ -136,14 
+136,12 @@ class TVM_DLL DeviceAPI { */ virtual void* AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, Optional mem_scope = NullOpt); - /*! * \brief Free a data space on device. * \param dev The device device to perform operation. * \param ptr The data space. */ virtual void FreeDataSpace(Device dev, void* ptr) = 0; - /*! * \brief copy data from one place to another * \note This API is designed to support special memory with shape dependent layout. diff --git a/include/tvm/runtime/memory/memory_manager.h b/include/tvm/runtime/memory/memory_manager.h index fa8a4c5d13c9..f54a9d5a2054 100644 --- a/include/tvm/runtime/memory/memory_manager.h +++ b/include/tvm/runtime/memory/memory_manager.h @@ -97,13 +97,15 @@ class Allocator { * \return A device pointer to the created view. */ TVM_DLL virtual void* CreateView(const Buffer& buffer, ShapeTuple shape, DLDataType type_hint, - const std::string& mem_scope = ""); + const std::string& mem_scope = "global") { + return buffer.data; + }; /*! \brief Release the view . * \param dev is the device where this view is created * \param data The view pointer to be freed. */ - TVM_DLL virtual void FreeView(Device dev, void* data); + TVM_DLL virtual void FreeView(Device dev, void* data){}; /*! \brief Free a buffer allocated by the allocator. * \param buffer The buffer to free. @@ -119,7 +121,6 @@ class Allocator { protected: /*! \brief Check if the given memory scope is allowed to allocate by the allocator. */ TVM_DLL virtual bool AllowMemoryScope(const std::string& mem_scope) const; - std::atomic used_memory_; private: AllocatorType type_; diff --git a/src/runtime/memory/memory_manager.cc b/src/runtime/memory/memory_manager.cc index 098a5e701b82..a4b8e15943bd 100644 --- a/src/runtime/memory/memory_manager.cc +++ b/src/runtime/memory/memory_manager.cc @@ -159,29 +159,56 @@ MemoryManager* MemoryManager::Global() { return inst; } -Allocator* MemoryManager::GetOrCreateAllocator(Device dev, AllocatorType type) { - MemoryManager* m = MemoryManager::Global(); - std::lock_guard lock(m->mu_); - if (m->allocators_.find(dev) == m->allocators_.end()) { - m->allocators_.emplace(dev, std::unordered_map>()); +std::string DeviceTypeStr(DLDeviceType type) { + switch (type) { + case kDLOpenCL: + return "opencl"; + break; + case kDLVulkan: + return "vulkan"; + break; + default: + return ""; } +} - if (m->allocators_.at(dev).find(type) == m->allocators_.at(dev).end()) { - std::unique_ptr alloc; +Allocator* GetDeviceSpecificAllocator(Device dev, AllocatorType type) { + std::string dev_str = DeviceTypeStr(dev.device_type); + auto* device_alloc_helper = tvm::runtime::Registry::Get("DeviceAllocator." 
+ dev_str); + void* valloc; + Allocator* allocator = nullptr; + if (device_alloc_helper) { + valloc = (*device_alloc_helper)(dev, static_cast(type)); + allocator = static_cast(valloc); + } + if (nullptr == allocator) { switch (type) { case kNaive: { VLOG(1) << "New naive allocator for " << dev; - alloc.reset(new NaiveAllocator()); + allocator = new NaiveAllocator(); break; } case kPooled: { VLOG(1) << "New pooled allocator for " << dev; - alloc.reset(new PooledAllocator()); + allocator = new PooledAllocator(); break; } default: LOG(FATAL) << "Unknown allocator type: " << type; } + } + return allocator; +} + +Allocator* MemoryManager::GetOrCreateAllocator(Device dev, AllocatorType type) { + MemoryManager* m = MemoryManager::Global(); + std::lock_guard lock(m->mu_); + if (m->allocators_.find(dev) == m->allocators_.end()) { + m->allocators_.emplace(dev, std::unordered_map>()); + } + if (m->allocators_.at(dev).find(type) == m->allocators_.at(dev).end()) { + std::unique_ptr alloc; + alloc.reset(GetDeviceSpecificAllocator(dev, type)); auto ret = alloc.get(); m->allocators_.at(dev).emplace(type, std::move(alloc)); return ret; @@ -222,9 +249,13 @@ NDArray Allocator::Empty(ShapeTuple shape, DLDataType dtype, DLDevice dev, size_t size = DeviceAPI::Get(dev)->GetDataSize(container->dl_tensor, mem_scope); size_t alignment = GetDataAlignment(container->dl_tensor); Buffer* buffer = new Buffer; - *buffer = this->Alloc(dev, size, alignment, dtype); - container->dl_tensor.data = buffer->data; + if (!mem_scope.defined() || mem_scope.value().empty() || mem_scope.value() == "global") { + *buffer = this->Alloc(dev, size, alignment, dtype); + } else { + *buffer = this->Alloc(dev, shape, dtype, mem_scope.value()); + } container->manager_ctx = reinterpret_cast(buffer); + container->dl_tensor.data = buffer->data; return NDArray(GetObjectPtr(container)); } @@ -232,56 +263,18 @@ bool Allocator::AllowMemoryScope(const std::string& mem_scope) const { return mem_scope.empty() || mem_scope == "global"; } -std::string DeviceTypeStr(DLDeviceType type) { - switch (type) { - case kDLOpenCL: - return "opencl"; - break; - case kDLVulkan: - return "vulkan"; - break; - default: - return ""; - } -} - -void* Allocator::CreateView(const Buffer& buffer, ShapeTuple shape, DLDataType type_hint, - const std::string& mem_scope) { - std::string dev_str = DeviceTypeStr(buffer.device.device_type); - auto* device_view_helper = tvm::runtime::Registry::Get("DeviceCreateView." + dev_str); - if (device_view_helper) { - void* view_ptr = (*device_view_helper)(buffer.device, buffer.data, shape, type_hint, mem_scope); - return view_ptr; - } - return buffer.data; -} - -void Allocator::FreeView(Device dev, void* data) { - std::string dev_str = DeviceTypeStr(dev.device_type); - auto* device_view_helper = tvm::runtime::Registry::Get("DeviceFreeView." 
+ dev_str); - if (device_view_helper) { - (*device_view_helper)(dev, data); - } -} - Buffer Allocator::Alloc(Device dev, ShapeTuple shape, DLDataType type_hint, const std::string& mem_scope) { - NDArray::Container container(nullptr, shape, type_hint, dev); - size_t size = DeviceAPI::Get(dev)->GetDataSize(container.dl_tensor); - if (AllowMemoryScope(mem_scope)) { + // by default, we can always redirect to the flat memory allocations + NDArray::Container container(nullptr, shape, type_hint, dev); + size_t size = DeviceAPI::Get(dev)->GetDataSize(container.dl_tensor); size_t alignment = GetDataAlignment(container.dl_tensor); return Alloc(dev, size, alignment, type_hint); } - Buffer buf; - buf.device = dev; - buf.size = size; - buf.alloc_type = type_; - buf.data = DeviceAPI::Get(dev)->AllocDataSpace(dev, shape.size(), shape.data(), type_hint, - String(mem_scope)); - used_memory_.fetch_add(size, std::memory_order_relaxed); - DLOG(INFO) << "allocate " << size << " B, used memory " << used_memory_ << " B"; - return buf; + LOG(FATAL) << "Allocator cannot allocate data space with " + << "specified memory scope: " << mem_scope; + return {}; } void Allocator::Clear() { diff --git a/src/runtime/memory/naive_allocator.h b/src/runtime/memory/naive_allocator.h index 62d8e8f06c80..6d8e90fed9f2 100644 --- a/src/runtime/memory/naive_allocator.h +++ b/src/runtime/memory/naive_allocator.h @@ -35,7 +35,7 @@ namespace memory { class NaiveAllocator final : public Allocator { public: - explicit NaiveAllocator() : Allocator(kNaive) { used_memory_ = 0; } + explicit NaiveAllocator() : Allocator(kNaive), used_memory_(0) {} Buffer Alloc(Device dev, size_t nbytes, size_t alignment, DLDataType type_hint) final { Buffer buf; @@ -48,6 +48,32 @@ class NaiveAllocator final : public Allocator { return buf; } + Buffer Alloc(Device dev, ShapeTuple shape, DLDataType type_hint, + const std::string& mem_scope) final { + Buffer buf; + size_t nbytes = 1; + for (int i = 0; i < static_cast(shape.size()); ++i) { + nbytes *= static_cast(shape[i]); + } + nbytes *= (type_hint.bits * type_hint.lanes + 7) / 8; + buf.device = dev; + if (AllowMemoryScope(mem_scope)) { + auto tmp_buf = Allocator::Alloc(dev, shape, type_hint, mem_scope); + buf.size = tmp_buf.size; + buf.data = tmp_buf.data; + buf.alloc_type = kNaive; + return buf; + } + + buf.size = nbytes; + buf.data = DeviceAPI::Get(dev)->AllocDataSpace(dev, shape.size(), shape.data(), type_hint, + String(mem_scope)); + used_memory_.fetch_add(nbytes, std::memory_order_relaxed); + DLOG(INFO) << "allocate " << nbytes << " B, used memory " << used_memory_ << " B"; + buf.alloc_type = kNaive; + return buf; + } + void Free(const Buffer& buffer) override { DeviceAPI::Get(buffer.device)->FreeDataSpace(buffer.device, buffer.data); used_memory_.fetch_sub(buffer.size, std::memory_order_relaxed); @@ -55,6 +81,9 @@ class NaiveAllocator final : public Allocator { } size_t UsedMemory() const override { return used_memory_.load(std::memory_order_relaxed); } + + private: + std::atomic used_memory_; }; } // namespace memory diff --git a/src/runtime/memory/pooled_allocator.h b/src/runtime/memory/pooled_allocator.h index 7bc73fd234b1..c96c87a73a13 100644 --- a/src/runtime/memory/pooled_allocator.h +++ b/src/runtime/memory/pooled_allocator.h @@ -41,9 +41,7 @@ class PooledAllocator : public Allocator { static constexpr size_t kDefaultPageSize = 4096; explicit PooledAllocator(size_t page_size = kDefaultPageSize) - : Allocator(kPooled), page_size_(page_size) { - used_memory_ = 0; - } + : Allocator(kPooled), 
page_size_(page_size), used_memory_(0) {} ~PooledAllocator() { ReleaseAll(); } @@ -75,6 +73,15 @@ class PooledAllocator : public Allocator { return buf; } + Buffer Alloc(Device dev, ShapeTuple shape, DLDataType type_hint, + const std::string& mem_scope) override { + if (AllowMemoryScope(mem_scope)) { + return Allocator::Alloc(dev, shape, type_hint, mem_scope); + } + LOG(FATAL) << "This alloc should be implemented"; + return {}; + } + void Free(const Buffer& buffer) override { std::lock_guard lock(mu_); if (memory_pool_.find(buffer.size) == memory_pool_.end()) { @@ -113,6 +120,7 @@ class PooledAllocator : public Allocator { protected: size_t page_size_; + std::atomic used_memory_; std::unordered_map> memory_pool_; std::recursive_mutex mu_; }; diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h index 62f5683059e9..94ab736f5ed5 100644 --- a/src/runtime/opencl/opencl_common.h +++ b/src/runtime/opencl/opencl_common.h @@ -242,7 +242,7 @@ class OpenCLWorkspace : public DeviceAPI { std::unordered_map contexts; // whether the workspace it initialized. bool initialized_{false}; - // map device to varius device informations + // map device to various device informations std::unordered_map device_info; // the devices std::vector devices; diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index d85c07d1e9cd..4348c2ef8dfe 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -27,6 +27,7 @@ #include +#include "../memory/pooled_allocator.h" #include "opencl_common.h" #ifdef OPENCL_ENABLE_HOST_PTR @@ -821,16 +822,88 @@ TVM_REGISTER_GLOBAL("profiling.timer.opencl").set_body_typed([](Device dev) { return Timer(make_object(dev)); }); -TVM_REGISTER_GLOBAL("DeviceCreateView.opencl") - .set_body_typed([](Device dev, void* data, ShapeTuple shape, DLDataType dtype, - Optional mem_scope) { - OpenCLWorkspace* ws_ = OpenCLWorkspace::Global(); - return ws_->AllocDataSpaceView(dev, data, shape, dtype, Optional(mem_scope)); - }); +class OpenCLPooledAllocator final : public memory::PooledAllocator { + public: + explicit OpenCLPooledAllocator() : PooledAllocator() {} -TVM_REGISTER_GLOBAL("DeviceFreeView.opencl").set_body_typed([](Device dev, void* data) { - OpenCLWorkspace* ws_ = OpenCLWorkspace::Global(); - return ws_->FreeDataSpaceView(dev, data); + bool AllowMemoryScope(const std::string& mem_scope) const final { + return ((mem_scope.find("texture") != std::string::npos) || mem_scope.empty() || + ("global" == mem_scope)); + } + + Buffer Alloc(Device dev, size_t nbytes, size_t alignment, DLDataType type_hint) override { + std::lock_guard lock(mu_); + size_t size = ((nbytes + page_size_ - 1) / page_size_) * page_size_; + auto&& it = memory_pool_.find(size); + if (it != memory_pool_.end() && !it->second.empty()) { + auto&& pool = it->second; + auto ret = pool.back(); + pool.pop_back(); + return ret; + } + Buffer buf; + buf.device = dev; + buf.size = size; + buf.alloc_type = AllocatorType::kPooled; + try { + buf.data = DeviceAllocDataSpace(dev, size, alignment, type_hint); + } catch (InternalError& err) { + LOG(WARNING) << "PooledAllocator got InternalError during allocation: " << err.message(); + LOG(WARNING) << "Trying to release all unused memory and reallocate..."; + ReleaseAll(); + buf.data = DeviceAllocDataSpace(dev, size, alignment, type_hint); + } + + used_memory_.fetch_add(size, std::memory_order_relaxed); + VLOG(1) << "allocate " << size << " B, used memory " << used_memory_ << " B"; + 
return buf; + } + + Buffer Alloc(Device dev, ShapeTuple shape, DLDataType type_hint, + const std::string& mem_scope) override { + if (AllowMemoryScope(mem_scope)) { + NDArray::Container container(nullptr, shape, type_hint, dev); + size_t size = DeviceAPI::Get(dev)->GetDataSize(container.dl_tensor); + Buffer buf; + buf.device = dev; + buf.size = size; + buf.alloc_type = AllocatorType::kPooled; + buf.data = DeviceAPI::Get(dev)->AllocDataSpace(dev, shape.size(), shape.data(), type_hint, + String(mem_scope)); + used_memory_.fetch_add(size, std::memory_order_relaxed); + DLOG(INFO) << "allocate " << size << " B, used memory " << used_memory_ << " B"; + return buf; + } + LOG(FATAL) << "Unsupported memory scope for this Allocator:" << mem_scope; + return {}; + } + + void Free(const Buffer& buffer) override { + std::lock_guard lock(mu_); + if (memory_pool_.find(buffer.size) == memory_pool_.end()) { + memory_pool_.emplace(buffer.size, std::vector{}); + } + memory_pool_.at(buffer.size).push_back(buffer); + VLOG(1) << "reclaim buffer " << buffer.size; + } + + void* CreateView(const Buffer& buffer, ShapeTuple shape, DLDataType type_hint, + const std::string& mem_scope) final { + LOG(WARNING) << "OpenCL View:" << mem_scope; + OpenCLWorkspace* ws_ = OpenCLWorkspace::Global(); + return ws_->AllocDataSpaceView(buffer.device, buffer.data, shape, type_hint, + Optional(mem_scope)); + } + + void FreeView(Device dev, void* data) final { + OpenCLWorkspace* ws_ = OpenCLWorkspace::Global(); + return ws_->FreeDataSpaceView(dev, data); + } +}; + +TVM_REGISTER_GLOBAL("DeviceAllocator.opencl").set_body([](TVMArgs args, TVMRetValue* rv) { + Allocator* alloc = new OpenCLPooledAllocator(); + *rv = static_cast(alloc); }); } // namespace cl diff --git a/tests/cpp-runtime/opencl/texture_copy_test.cc b/tests/cpp-runtime/opencl/texture_copy_test.cc index fb58c53714f6..981b9b5aa782 100644 --- a/tests/cpp-runtime/opencl/texture_copy_test.cc +++ b/tests/cpp-runtime/opencl/texture_copy_test.cc @@ -83,6 +83,7 @@ TEST(TextureCopy, HostDeviceRT) { 1e-5); } } + #if 0 TEST_F(TextureCopyTest, ViewBufferAsBuffer) { using namespace tvm; diff --git a/tests/cpp/runtime/memory/memory_manager_tests.cc b/tests/cpp/runtime/memory/memory_manager_tests.cc index e7579a2cabe9..75dabb42598b 100644 --- a/tests/cpp/runtime/memory/memory_manager_tests.cc +++ b/tests/cpp/runtime/memory/memory_manager_tests.cc @@ -48,26 +48,6 @@ class TvmVMMemoryManagerTest : public ::testing::Test { } }; -TEST_F(TvmVMMemoryManagerTest, AnyAllocatorNaiveAutoCreate) { - Device dev = {kDLCPU, 0}; - Allocator* allocator = MemoryManagerWrapper::GetOrCreateAllocator(dev, kAny); - EXPECT_EQ(allocator->type(), kNaive); -} - -TEST_F(TvmVMMemoryManagerTest, AnyAllocatorNaiveReuse) { - Device dev = {kDLCPU, 0}; - Allocator* allocator = MemoryManagerWrapper::GetOrCreateAllocator(dev, kNaive); - allocator = MemoryManagerWrapper::GetOrCreateAllocator(dev, kAny); - EXPECT_EQ(allocator->type(), kNaive); -} - -TEST_F(TvmVMMemoryManagerTest, AnyAllocatorPooled) { - Device dev = {kDLCPU, 0}; - Allocator* allocator = MemoryManagerWrapper::GetOrCreateAllocator(dev, kPooled); - allocator = MemoryManagerWrapper::GetOrCreateAllocator(dev, kAny); - EXPECT_EQ(allocator->type(), kPooled); -} - TEST_F(TvmVMMemoryManagerTest, NaiveAllocBasic) { Device dev = {kDLCPU, 0}; Allocator* allocator = MemoryManagerWrapper::GetOrCreateAllocator(dev, kNaive); From 45c06f2aee0e2287e5e37a1b38018a7cc1cf006b Mon Sep 17 00:00:00 2001 From: Siva Date: Thu, 6 Feb 2025 05:47:19 +0530 Subject: [PATCH 08/14] Testcase 
fix --- include/tvm/runtime/memory/memory_manager.h | 4 ++-- tests/cpp/runtime/memory/memory_manager_tests.cc | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/tvm/runtime/memory/memory_manager.h b/include/tvm/runtime/memory/memory_manager.h index f54a9d5a2054..05c1854b9de7 100644 --- a/include/tvm/runtime/memory/memory_manager.h +++ b/include/tvm/runtime/memory/memory_manager.h @@ -99,13 +99,13 @@ class Allocator { TVM_DLL virtual void* CreateView(const Buffer& buffer, ShapeTuple shape, DLDataType type_hint, const std::string& mem_scope = "global") { return buffer.data; - }; + } /*! \brief Release the view . * \param dev is the device where this view is created * \param data The view pointer to be freed. */ - TVM_DLL virtual void FreeView(Device dev, void* data){}; + TVM_DLL virtual void FreeView(Device dev, void* data){} /*! \brief Free a buffer allocated by the allocator. * \param buffer The buffer to free. diff --git a/tests/cpp/runtime/memory/memory_manager_tests.cc b/tests/cpp/runtime/memory/memory_manager_tests.cc index 75dabb42598b..70806fa9364b 100644 --- a/tests/cpp/runtime/memory/memory_manager_tests.cc +++ b/tests/cpp/runtime/memory/memory_manager_tests.cc @@ -177,7 +177,7 @@ TEST_F(TvmVMMemoryManagerTest, PooledAllocWithShape) { FAIL(); } catch (std::exception& e) { std::string pattern = - "Device does not support allocate data space with specified memory scope: global.texture"; + "This alloc should be implemented"; std::string what = e.what(); EXPECT_NE(what.find(pattern), std::string::npos) << what; } From abd63ca9da71dab0f861ad2ef140877c1716deed Mon Sep 17 00:00:00 2001 From: Siva Date: Thu, 6 Feb 2025 13:00:24 +0530 Subject: [PATCH 09/14] tests --- include/tvm/runtime/memory/memory_manager.h | 2 +- src/runtime/opencl/opencl_device_api.cc | 6 ++- tests/cpp-runtime/opencl/texture_copy_test.cc | 51 ++++++++++++++----- 3 files changed, 43 insertions(+), 16 deletions(-) diff --git a/include/tvm/runtime/memory/memory_manager.h b/include/tvm/runtime/memory/memory_manager.h index 05c1854b9de7..ab1e6b5c9f6d 100644 --- a/include/tvm/runtime/memory/memory_manager.h +++ b/include/tvm/runtime/memory/memory_manager.h @@ -105,7 +105,7 @@ class Allocator { * \param dev is the device where this view is created * \param data The view pointer to be freed. */ - TVM_DLL virtual void FreeView(Device dev, void* data){} + TVM_DLL virtual void FreeView(Device dev, void* data) {} /*! \brief Free a buffer allocated by the allocator. * \param buffer The buffer to free. 
diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index 4348c2ef8dfe..41acc63374e3 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -870,7 +870,10 @@ class OpenCLPooledAllocator final : public memory::PooledAllocator { buf.alloc_type = AllocatorType::kPooled; buf.data = DeviceAPI::Get(dev)->AllocDataSpace(dev, shape.size(), shape.data(), type_hint, String(mem_scope)); - used_memory_.fetch_add(size, std::memory_order_relaxed); + if (mem_scope.find("texture") == std::string::npos) { + // All textures are backed by buffers - don't count in total memory + used_memory_.fetch_add(size, std::memory_order_relaxed); + } DLOG(INFO) << "allocate " << size << " B, used memory " << used_memory_ << " B"; return buf; } @@ -889,7 +892,6 @@ class OpenCLPooledAllocator final : public memory::PooledAllocator { void* CreateView(const Buffer& buffer, ShapeTuple shape, DLDataType type_hint, const std::string& mem_scope) final { - LOG(WARNING) << "OpenCL View:" << mem_scope; OpenCLWorkspace* ws_ = OpenCLWorkspace::Global(); return ws_->AllocDataSpaceView(buffer.device, buffer.data, shape, type_hint, Optional(mem_scope)); diff --git a/tests/cpp-runtime/opencl/texture_copy_test.cc b/tests/cpp-runtime/opencl/texture_copy_test.cc index 981b9b5aa782..23b490f695e2 100644 --- a/tests/cpp-runtime/opencl/texture_copy_test.cc +++ b/tests/cpp-runtime/opencl/texture_copy_test.cc @@ -26,6 +26,12 @@ #include "../src/runtime/opencl/opencl_common.h" +using tvm::runtime::kAllocAlignment; +using tvm::runtime::memory::AllocatorType; +using tvm::runtime::memory::Buffer; +using tvm::runtime::memory::MemoryManager; +using tvm::runtime::memory::Storage; + class TextureCopyTest : public ::testing::Test { protected: void SetUp() override { @@ -40,7 +46,7 @@ class TextureCopyTest : public ::testing::Test { GTEST_SKIP() << "Skip test case as BufferToImage is not supported \n"; } (void)tvm::runtime::memory::MemoryManager::GetOrCreateAllocator( - thr->device, tvm::runtime::memory::AllocatorType::kNaive); + thr->device, tvm::runtime::memory::AllocatorType::kPooled); } }; @@ -53,7 +59,7 @@ TEST(TextureCopy, HostDeviceRT) { tvm::runtime::cl::OpenCLWorkspace* workspace = tvm::runtime::cl::OpenCLWorkspace::Global(); tvm::runtime::cl::OpenCLThreadEntry* thr = workspace->GetThreadEntry(); (void)tvm::runtime::memory::MemoryManager::GetOrCreateAllocator( - thr->device, tvm::runtime::memory::AllocatorType::kNaive); + thr->device, tvm::runtime::memory::AllocatorType::kPooled); std::vector shape{16, 16, 4}; auto cpu_arr0 = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); auto cpu_arr1 = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); @@ -84,7 +90,6 @@ TEST(TextureCopy, HostDeviceRT) { } } -#if 0 TEST_F(TextureCopyTest, ViewBufferAsBuffer) { using namespace tvm; std::vector shape{1, 16, 16, 8}; @@ -93,8 +98,15 @@ TEST_F(TextureCopyTest, ViewBufferAsBuffer) { auto cpu_arr_ret = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); String mem_scope = "global"; - auto opencl_memobj = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLOpenCL, 0}, mem_scope); - auto opencl_memview = opencl_memobj.CreateView(same_shape, {kDLFloat, 32, 1}); + + DLDevice cl_dev = {kDLOpenCL, 0}; + auto allocator = MemoryManager::GetOrCreateAllocator(cl_dev, AllocatorType::kPooled); + auto buffer = allocator->Alloc(cl_dev, ShapeTuple(shape), {kDLFloat, 32, 1}); + auto stor = Storage(buffer, allocator); + + auto opencl_memobj = 
stor->AllocNDArrayScoped(0, ShapeTuple(shape), {kDLFloat, 32, 1}, mem_scope); + auto opencl_memview = + stor->AllocNDArrayScoped(0, ShapeTuple(same_shape), {kDLFloat, 32, 1}, mem_scope); std::random_device dev; std::mt19937 mt(dev()); @@ -144,10 +156,14 @@ TEST_F(TextureCopyTest, ViewBufferAsImage) { auto cpu_arr = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); auto cpu_arr_ret = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); - auto opencl_buf_obj = - runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLOpenCL, 0}, String("global")); + DLDevice cl_dev = {kDLOpenCL, 0}; + auto allocator = MemoryManager::GetOrCreateAllocator(cl_dev, AllocatorType::kPooled); + auto buffer = allocator->Alloc(cl_dev, ShapeTuple(shape), {kDLFloat, 32, 1}); + auto stor = Storage(buffer, allocator); + + auto opencl_buf_obj = stor->AllocNDArrayScoped(0, ShapeTuple(shape), {kDLFloat, 32, 1}, "global"); auto opencl_img_obj = - opencl_buf_obj.CreateView(same_shape, {kDLFloat, 32, 1}, 0, String("global.texture")); + stor->AllocNDArrayScoped(0, ShapeTuple(same_shape), {kDLFloat, 32, 1}, "global.texture"); std::random_device dev; std::mt19937 mt(dev()); @@ -197,10 +213,15 @@ TEST_F(TextureCopyTest, ViewImageAsBuffer) { auto cpu_arr = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); auto cpu_arr_ret = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + DLDevice cl_dev = {kDLOpenCL, 0}; + auto allocator = MemoryManager::GetOrCreateAllocator(cl_dev, AllocatorType::kPooled); + auto buffer = allocator->Alloc(cl_dev, ShapeTuple(shape), {kDLFloat, 32, 1}); + auto stor = Storage(buffer, allocator); + auto opencl_img_obj = - runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLOpenCL, 0}, String("global.texture")); + stor->AllocNDArrayScoped(0, ShapeTuple(shape), {kDLFloat, 32, 1}, "global.texture"); auto opencl_buf_obj = - opencl_img_obj.CreateView(same_shape, {kDLFloat, 32, 1}, 0, String("global")); + stor->AllocNDArrayScoped(0, ShapeTuple(same_shape), {kDLFloat, 32, 1}, "global"); std::random_device dev; std::mt19937 mt(dev()); @@ -250,10 +271,15 @@ TEST_F(TextureCopyTest, ViewImageAsImage) { auto cpu_arr = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); auto cpu_arr_ret = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + DLDevice cl_dev = {kDLOpenCL, 0}; + auto allocator = MemoryManager::GetOrCreateAllocator(cl_dev, AllocatorType::kPooled); + auto buffer = allocator->Alloc(cl_dev, ShapeTuple(shape), {kDLFloat, 32, 1}); + auto stor = Storage(buffer, allocator); + auto opencl_img_obj_1 = - runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLOpenCL, 0}, String("global.texture")); + stor->AllocNDArrayScoped(0, ShapeTuple(shape), {kDLFloat, 32, 1}, "global.texture"); auto opencl_img_obj_2 = - opencl_img_obj_1.CreateView(same_shape, {kDLFloat, 32, 1}, 0, String("global.texture")); + stor->AllocNDArrayScoped(0, ShapeTuple(same_shape), {kDLFloat, 32, 1}, "global.texture"); std::random_device dev; std::mt19937 mt(dev()); @@ -294,4 +320,3 @@ TEST_F(TextureCopyTest, ViewImageAsImage) { 1e-5); } } -#endif From 6e3da8cef8e832c965ca7b5f9e762fa14c7f217a Mon Sep 17 00:00:00 2001 From: Siva Date: Thu, 6 Feb 2025 14:46:59 +0530 Subject: [PATCH 10/14] Lint --- tests/cpp/runtime/memory/memory_manager_tests.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/cpp/runtime/memory/memory_manager_tests.cc b/tests/cpp/runtime/memory/memory_manager_tests.cc index 70806fa9364b..06af7ec1d25d 100644 --- 
a/tests/cpp/runtime/memory/memory_manager_tests.cc +++ b/tests/cpp/runtime/memory/memory_manager_tests.cc @@ -176,8 +176,7 @@ TEST_F(TvmVMMemoryManagerTest, PooledAllocWithShape) { (void)texture; FAIL(); } catch (std::exception& e) { - std::string pattern = - "This alloc should be implemented"; + std::string pattern = "This alloc should be implemented"; std::string what = e.what(); EXPECT_NE(what.find(pattern), std::string::npos) << what; } From 559b91f357fbb82174240ac4b2ca59def254f6f5 Mon Sep 17 00:00:00 2001 From: Siva Date: Thu, 6 Feb 2025 20:06:52 +0530 Subject: [PATCH 11/14] Test --- .../runtime/memory/memory_manager_tests.cc | 45 ------------------- 1 file changed, 45 deletions(-) diff --git a/tests/cpp/runtime/memory/memory_manager_tests.cc b/tests/cpp/runtime/memory/memory_manager_tests.cc index 06af7ec1d25d..47146d2000fc 100644 --- a/tests/cpp/runtime/memory/memory_manager_tests.cc +++ b/tests/cpp/runtime/memory/memory_manager_tests.cc @@ -182,51 +182,6 @@ TEST_F(TvmVMMemoryManagerTest, PooledAllocWithShape) { } } -TEST_F(TvmVMMemoryManagerTest, NaiveAllocOpenCLTexture) { - bool enabled = tvm::runtime::RuntimeEnabled("opencl"); - if (!enabled) { - LOG(INFO) << "Skip OpenCL Texture alloc test because opencl runtime is disabled.\n"; - return; - } - Device dev = {kDLOpenCL, 0}; - Allocator* allocator = MemoryManagerWrapper::GetOrCreateAllocator(dev, kNaive); - EXPECT_EQ(allocator->UsedMemory(), 0); - auto dt = DataType::Float(32); - size_t nbytes = 1 * 3 * 6 * 6 * dt.bytes(); - ShapeTuple shape = {1, 3, 6, 6}; - auto buff = allocator->Alloc(dev, shape, dt); - EXPECT_EQ(allocator->UsedMemory(), nbytes); - allocator->Free(buff); - EXPECT_EQ(allocator->UsedMemory(), 0); - - auto texture = allocator->Alloc(dev, shape, dt, "global.texture"); - EXPECT_EQ(allocator->UsedMemory(), nbytes); - allocator->Free(texture); - EXPECT_EQ(allocator->UsedMemory(), 0); -} - -TEST_F(TvmVMMemoryManagerTest, PooledAllocOpenCLTexture) { - bool enabled = tvm::runtime::RuntimeEnabled("opencl"); - if (!enabled) { - LOG(INFO) << "Skip OpenCL Texture alloc test because opencl runtime is disabled.\n"; - return; - } - Device dev = {kDLOpenCL, 0}; - Allocator* allocator = MemoryManagerWrapper::GetOrCreateAllocator(dev, kPooled); - EXPECT_EQ(allocator->UsedMemory(), 0); - auto dt = DataType::Float(32); - size_t nbytes = 1 * 3 * 6 * 6 * dt.bytes(); - size_t page_size = PooledAllocator::kDefaultPageSize; - size_t size = ((nbytes + page_size - 1) / page_size) * page_size; - ShapeTuple shape = {1, 3, 6, 6}; - auto buff = allocator->Alloc(dev, shape, dt); - EXPECT_EQ(allocator->UsedMemory(), size); - allocator->Free(buff); - EXPECT_EQ(allocator->UsedMemory(), size); - - auto texture = allocator->Alloc(dev, shape, dt, "global.texture"); - allocator->Free(texture); -} } // namespace memory } // namespace runtime } // namespace tvm From 37b0ea27d05f927177688274d56b7fc0013595d7 Mon Sep 17 00:00:00 2001 From: Siva Date: Tue, 11 Feb 2025 15:34:40 +0530 Subject: [PATCH 12/14] Sync call not required according to OpenCL spec. 
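
Per the OpenCL spec, clReleaseMemObject() only decrements the reference
count of a memory object; actual deletion is deferred until every command
already enqueued against the object has completed. The explicit clFinish()
calls removed below are therefore redundant. A minimal sketch of the
guarantee relied upon here (queue, src, dst and nbytes are illustrative
names, not from this patch):

    void ReleaseAfterEnqueue(cl_command_queue queue, cl_mem src, cl_mem dst, size_t nbytes) {
      OPENCL_CALL(clEnqueueCopyBuffer(queue, src, dst, 0, 0, nbytes, 0, nullptr, nullptr));
      // Safe without an intervening clFinish(): the in-flight copy keeps the
      // object alive until it finishes executing on the device.
      OPENCL_CALL(clReleaseMemObject(src));
    }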
--- src/runtime/opencl/opencl_device_api.cc | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index 41acc63374e3..06f966e5f438 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -421,8 +421,6 @@ void OpenCLWorkspace::FreeDataSpaceView(Device dev, void* ptr) { // Handle the fall back if (!IsBufferToImageSupported(dev.device_id)) { if (desc->is_compat_view) { - // TODO(Siva): Do we need this waiting for entire queue ? - OPENCL_CALL(clFinish(this->GetQueue(dev))); OPENCL_CALL(clReleaseMemObject(desc->buffer)); delete desc; } @@ -430,8 +428,6 @@ void OpenCLWorkspace::FreeDataSpaceView(Device dev, void* ptr) { } if (desc->layout != cl::BufferDescriptor::MemoryLayout::kBuffer1D) { - // TODO(Siva): Do we need this waiting for entire queue ? - OPENCL_CALL(clFinish(this->GetQueue(dev))); OPENCL_CALL(clReleaseMemObject(desc->buffer)); delete desc; } @@ -453,7 +449,6 @@ void OpenCLWorkspace::SetNativePtr(const tvm::runtime::NDArray& narr, void* host cl_device_id device_id = GetCLDeviceID(dev.device_id); auto platform = device_info[device_id].platform_id; - OPENCL_CALL(clFinish(this->GetQueue(dev))); if (desc->host_ptr) { OPENCL_CALL(clEnqueueUnmapMemObject(this->GetQueue(dev), desc->buffer, reinterpret_cast(desc->host_ptr), 0, nullptr, @@ -484,10 +479,6 @@ void OpenCLWorkspace::SetPerfHint(Device dev, cl_uint perf_hint) { } void OpenCLWorkspace::FreeDataSpace(Device dev, void* ptr) { - // We have to make sure that the memory object is not in the command queue - // for some OpenCL platforms. - OPENCL_CALL(clFinish(this->GetQueue(dev))); - cl::BufferDescriptor* desc = static_cast(ptr); if (desc->back_buffer) { // 2D Image w/ back buffer allocated from pool @@ -502,7 +493,6 @@ void OpenCLWorkspace::FreeDataSpace(Device dev, void* ptr) { clEnqueueUnmapMemObject(this->GetQueue(dev), desc->buffer, reinterpret_cast(desc->host_ptr), 0, nullptr, nullptr); } - OPENCL_CALL(clFinish(this->GetQueue(dev))); OPENCL_CALL(clReleaseMemObject(desc->buffer)); delete desc; } else if (!IsBufferToImageSupported(dev.device_id)) { From ba9b2fda3952129080749cdd7467412bc967ccc5 Mon Sep 17 00:00:00 2001 From: Siva Date: Thu, 13 Feb 2025 16:46:57 +0530 Subject: [PATCH 13/14] Test to check CI fails --- tests/scripts/task_python_integration.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/scripts/task_python_integration.sh b/tests/scripts/task_python_integration.sh index 51ef86d05ec7..f639fcb9a6bd 100755 --- a/tests/scripts/task_python_integration.sh +++ b/tests/scripts/task_python_integration.sh @@ -44,7 +44,7 @@ TVM_TEST_TARGETS="${TVM_RELAY_TEST_TARGETS:-llvm;cuda}" \ # OpenCL texture test. 
Deselected specific tests that fails in CI TVM_TEST_TARGETS="${TVM_RELAY_OPENCL_TEXTURE_TARGETS:-opencl}" \ - run_pytest ${TVM_INTEGRATION_TESTSUITE_NAME}-opencl-texture tests/python/relay/opencl_texture + run_pytest ${TVM_INTEGRATION_TESTSUITE_NAME}-opencl-texture tests/python/relay/opencl_texture/test_conv2d_transpose_nchw_texture.py # Command line driver test run_pytest ${TVM_INTEGRATION_TESTSUITE_NAME}-driver tests/python/driver From e6ced507a79496c277fcb38aaeb0c3f248c53eea Mon Sep 17 00:00:00 2001 From: Siva Date: Thu, 13 Feb 2025 20:22:17 +0530 Subject: [PATCH 14/14] Run tests in batches --- tests/scripts/task_python_integration.sh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/scripts/task_python_integration.sh b/tests/scripts/task_python_integration.sh index f639fcb9a6bd..3202839e50ed 100755 --- a/tests/scripts/task_python_integration.sh +++ b/tests/scripts/task_python_integration.sh @@ -43,8 +43,13 @@ TVM_TEST_TARGETS="${TVM_RELAY_TEST_TARGETS:-llvm;cuda}" \ run_pytest ${TVM_INTEGRATION_TESTSUITE_NAME}-relay tests/python/relay --ignore=tests/python/relay/aot # OpenCL texture test. Deselected specific tests that fails in CI -TVM_TEST_TARGETS="${TVM_RELAY_OPENCL_TEXTURE_TARGETS:-opencl}" \ - run_pytest ${TVM_INTEGRATION_TESTSUITE_NAME}-opencl-texture tests/python/relay/opencl_texture/test_conv2d_transpose_nchw_texture.py +TEXTURE_TESTS=$(ls tests/python/relay/opencl_texture/test_*) +i=0 +for TEST in $TEXTURE_TESTS; do + TVM_TEST_TARGETS="${TVM_RELAY_OPENCL_TEXTURE_TARGETS:-opencl}" \ + run_pytest "${TVM_INTEGRATION_TESTSUITE_NAME}-opencl-texture-$i" "$TEST" + i=$((i+1)) +done # Command line driver test run_pytest ${TVM_INTEGRATION_TESTSUITE_NAME}-driver tests/python/driver
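
With the series applied, graph activations are backed by pooled Storage
objects, and buffer/texture NDArrays are materialized as scoped views over
the same physical clBuffer. A condensed sketch of the resulting allocation
flow, mirroring tests/cpp-runtime/opencl/texture_copy_test.cc (shape and
device index are illustrative):

    using tvm::runtime::NDArray;
    using tvm::runtime::ShapeTuple;
    using tvm::runtime::memory::AllocatorType;
    using tvm::runtime::memory::Buffer;
    using tvm::runtime::memory::MemoryManager;
    using tvm::runtime::memory::Storage;

    DLDevice cl_dev = {kDLOpenCL, 0};
    auto* allocator = MemoryManager::GetOrCreateAllocator(cl_dev, AllocatorType::kPooled);
    // One pooled clBuffer backs every view created below.
    Buffer buffer = allocator->Alloc(cl_dev, ShapeTuple({1, 16, 16, 4}), {kDLFloat, 32, 1});
    Storage storage(buffer, allocator);
    // Both NDArrays alias the same device memory; the texture view is an
    // image object created over the buffer via AllocDataSpaceView.
    NDArray flat =
        storage->AllocNDArrayScoped(0, ShapeTuple({1, 16, 16, 4}), {kDLFloat, 32, 1}, "global");
    NDArray tex = storage->AllocNDArrayScoped(0, ShapeTuple({1, 16, 16, 4}), {kDLFloat, 32, 1},
                                              "global.texture");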