diff --git a/apps/android_rpc/app/src/main/jni/tvm_runtime.h b/apps/android_rpc/app/src/main/jni/tvm_runtime.h index fb14d84b794f..7b4ced7c9c0d 100644 --- a/apps/android_rpc/app/src/main/jni/tvm_runtime.h +++ b/apps/android_rpc/app/src/main/jni/tvm_runtime.h @@ -66,7 +66,6 @@ #include "../src/runtime/opencl/opencl_device_api.cc" #include "../src/runtime/opencl/opencl_module.cc" #include "../src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc" -#include "../src/runtime/opencl/texture_pool.cc" #include "../src/runtime/source_utils.cc" #endif diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h index c33606d98ed3..f27bfdacb570 100644 --- a/include/tvm/runtime/device_api.h +++ b/include/tvm/runtime/device_api.h @@ -52,6 +52,7 @@ enum DeviceAttrKind : int { kL2CacheSizeBytes = 13, kTotalGlobalMemory = 14, kAvailableGlobalMemory = 15, + kImagePitchAlignment = 16, }; #ifdef TVM_KALLOC_ALIGNMENT diff --git a/include/tvm/runtime/memory/memory_manager.h b/include/tvm/runtime/memory/memory_manager.h index 0c4647e6fa5a..ab1e6b5c9f6d 100644 --- a/include/tvm/runtime/memory/memory_manager.h +++ b/include/tvm/runtime/memory/memory_manager.h @@ -87,7 +87,26 @@ class Allocator { * \return A sized allocation in the form of a buffer. */ TVM_DLL virtual Buffer Alloc(Device dev, ShapeTuple shape, DLDataType type_hint, - const std::string& mem_scope = "") = 0; + const std::string& mem_scope = ""); + + /*! \brief Create a view for the buffer given a shape, type and scope. + * \param buffer The existing buffer upon which we need to create a view. + * \param shape The shape of the view. + * \param type_hint A type hint to the view. + * \param mem_scope A memory scope of the view. + * \return A device pointer to the created view. + */ + TVM_DLL virtual void* CreateView(const Buffer& buffer, ShapeTuple shape, DLDataType type_hint, + const std::string& mem_scope = "global") { + return buffer.data; + } + + /*! \brief Release the view. + * \param dev The device where this view was created. + * \param data The view pointer to be freed. + */ + TVM_DLL virtual void FreeView(Device dev, void* data) {} + /*! \brief Free a buffer allocated by the allocator. * \param buffer The buffer to free. */ @@ -147,6 +166,13 @@ class StorageObj : public Object { /*! \brief Allocate an NDArray from a given piece of storage. */ TVM_DLL NDArray AllocNDArray(int64_t offset, ShapeTuple shape, DLDataType dtype); + /*! \brief Allocate an NDArray with memory scope from a given piece of storage. */ + TVM_DLL NDArray AllocNDArrayScoped(int64_t offset, ShapeTuple shape, DLDataType dtype, + String scope = "global"); + + /*! \brief The deleter for a scoped NDArray allocated from underlying storage. */ + static void ScopedDeleter(Object* ptr); + /*! \brief The deleter for an NDArray when allocated from underlying storage.
*/ static void Deleter(Object* ptr); @@ -170,6 +196,12 @@ class Storage : public ObjectRef { }; } // namespace memory + +using memory::Allocator; +using memory::AllocatorType; +using memory::MemoryManager; +using memory::StorageObj; + } // namespace runtime } // namespace tvm diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc index d85ffd78291c..33b3adea5f2f 100644 --- a/src/relay/backend/graph_plan_memory.cc +++ b/src/relay/backend/graph_plan_memory.cc @@ -229,6 +229,16 @@ class StorageAllocator : public StorageAllocaBaseVisitor { VLOG_CONTEXT << "StorageAllocator"; VLOG(1) << "planning:" << std::endl << PrettyPrint(func); prototype_ = StorageAllocaInit(&arena_).GetInitTokenMap(func); + // Back up the virtual devices, since token reuse might lose the original memory scope + std::unordered_map<const ExprNode*, std::vector<VirtualDevice>> virtual_device_map_; + for (const auto& kv : prototype_) { + std::vector<VirtualDevice> virtual_devices; + virtual_devices.reserve(kv.second.size()); + for (StorageToken* tok : kv.second) { + virtual_devices.push_back(tok->virtual_device); + } + virtual_device_map_.insert({kv.first, virtual_devices}); + } this->Run(func); // The value of smap contains two integer arrays where the first array @@ -252,9 +262,13 @@ class StorageAllocator : public StorageAllocaBaseVisitor { } num_nodes++; storage_ids.push_back(tok->storage_id); - virtual_devices.push_back(tok->virtual_device); sid_sizes_byte.push_back(allocator_.GetMemorySize(tok)); } + ICHECK(kv.second.size() == virtual_device_map_[kv.first].size()) << "Mismatch of tokens and virtual devices"; + for (auto vdev : virtual_device_map_[kv.first]) { + virtual_devices.push_back(vdev); + } auto storage_info = backend::StorageInfo(std::move(storage_ids), std::move(virtual_devices), std::move(sid_sizes_byte)); smap.Set(GetRef<Expr>(kv.first), storage_info); @@ -356,25 +370,19 @@ class StorageAllocator : public StorageAllocaBaseVisitor { class TokenAllocator { public: - StorageToken* Alloc(StorageToken* proto) { - return Is2DStorage(proto) ? token_2d_.Alloc(proto, storage_ids_++) - : token_1d_.Alloc(proto, storage_ids_++); - } + StorageToken* Alloc(StorageToken* proto) { return token_mixed_.Alloc(proto, storage_ids_++); } StorageToken* Request(StorageToken* proto) { - StorageToken* token = - Is2DStorage(proto) ? token_2d_.Request(proto) : token_1d_.Request(proto); + StorageToken* token = token_mixed_.Request(proto); return token ? token : this->Alloc(proto); } - void CheckForRelease(StorageToken* tok) { - return Is2DStorage(tok) ? token_2d_.CheckForRelease(tok) : token_1d_.CheckForRelease(tok); - } + void CheckForRelease(StorageToken* tok) { return token_mixed_.CheckForRelease(tok); } size_t GetMemorySize(StorageToken* tok) { // TODO(amalyshe): figure out who requires sizes and for what // size in case of texture is not enough - we can return any value if it // is assumed to be used for memory allocation, or we can return the real size // if it is just for information - return Is2DStorage(tok) ? 0 : token_1d_.GetMemorySize(tok); + return token_mixed_.GetMemorySize(tok); } static bool Is2DStorage(StorageToken* tok) { return relay::Is2DStorage(tok->virtual_device->memory_scope); @@ -382,8 +390,7 @@ class StorageAllocator : public StorageAllocaBaseVisitor { private: int64_t storage_ids_{0}; - TokenAllocator1D token_1d_; - TokenAllocator2D token_2d_; + TokenAllocatorMixed token_mixed_; }; private: diff --git a/src/relay/backend/token_allocator.cc b/src/relay/backend/token_allocator.cc index bdecba9afad7..e974944b33b0 100644 --- a/src/relay/backend/token_allocator.cc +++ b/src/relay/backend/token_allocator.cc @@ -31,22 +31,45 @@ namespace tvm { namespace relay { +constexpr auto Is2DStorage = runtime::IsTextureStorage; -size_t TokenAllocator1D::GetMemorySize(StorageToken* prototype) { +/* + * Mixed mode memory allocator + */ +size_t TokenAllocatorMixed::GetMemorySize(StorageToken* prototype) { TensorType ttype = prototype->ttype; ICHECK(ttype.defined()); size_t size = 1; - for (IndexExpr dim : ttype->shape) { - const int64_t* pval = tir::as_const_int(dim); - ICHECK(pval != nullptr) << "Cannot allocate memory symbolic tensor shape " << ttype->shape; - ICHECK_GE(*pval, 0) << "Cannot allocate memory for tensor with negative shape" << *pval; - size *= static_cast<size_t>(pval[0]); + if (relay::Is2DStorage(prototype->virtual_device->memory_scope)) { + size = GetSize2D(prototype); + } else { + for (IndexExpr dim : ttype->shape) { + const int64_t* pval = tir::as_const_int(dim); + ICHECK(pval != nullptr) << "Cannot allocate memory symbolic tensor shape " << ttype->shape; + ICHECK_GE(*pval, 0) << "Cannot allocate memory for tensor with negative shape " << *pval; + size *= static_cast<size_t>(pval[0]); + } + size *= DivRoundUp(ttype->dtype.bits() * ttype->dtype.lanes(), 8); } - size *= DivRoundUp(ttype->dtype.bits() * ttype->dtype.lanes(), 8); return size; } -StorageToken* TokenAllocator1D::Request(StorageToken* prototype) { +String GetDeviceCompatibleToken(StorageToken* tok) { + Target null_tgt{nullptr}; + if (null_tgt == tok->virtual_device->target) { + return tok->virtual_device->memory_scope; + } + std::string dev_kind = tok->virtual_device->target->kind->name; + auto* device_scope_handler = tvm::runtime::Registry::Get("DeviceScopeCompatibility."
+ dev_kind); + if (device_scope_handler) { + String dev_scope = + (*device_scope_handler)(tok->virtual_device->target, tok->virtual_device->memory_scope); + return dev_scope; + } + return tok->virtual_device->memory_scope; +} + +StorageToken* TokenAllocatorMixed::Request(StorageToken* prototype) { // calculate the size; size_t size = GetMemorySize(prototype); // search memory block in [size / match_range_, size * match_range_) @@ -59,32 +82,42 @@ StorageToken* TokenAllocator1D::Request(StorageToken* prototype) { // search for memory blocks larger than requested for (auto it = mid; it != end; ++it) { StorageToken* tok = it->second; - if (!tok->is_compatible(*prototype)) continue; - ICHECK_EQ(tok->ref_counter, 0); - // Use exect matching strategy - tok->max_bytes = std::max(size, tok->max_bytes); - tok->ref_counter = prototype->ref_counter; - // find a exact match, erase from map and return - free_.erase(it); - return tok; + bool dev_compatible = (GetDeviceCompatibleToken(tok) == GetDeviceCompatibleToken(prototype)); + if (tok->is_compatible(*prototype) || (dev_compatible)) { + ICHECK_EQ(tok->ref_counter, 0); + // Use exact matching strategy + if (size > tok->max_bytes) { + tok->max_bytes = size; + tok->ttype = prototype->ttype; + } + tok->ref_counter = prototype->ref_counter; + // find an exact match, erase from map and return + free_.erase(it); + return tok; + } } // then search for memory blocks smaller than requested space for (auto it = mid; it != begin;) { --it; StorageToken* tok = it->second; - if (!tok->is_compatible(*prototype)) continue; - ICHECK_EQ(tok->ref_counter, 0); - // Use exect matching strategy - tok->max_bytes = std::max(size, tok->max_bytes); - tok->ref_counter = prototype->ref_counter; - // erase from map and return - free_.erase(it); - return tok; + bool dev_compatible = (GetDeviceCompatibleToken(tok) == GetDeviceCompatibleToken(prototype)); + if (tok->is_compatible(*prototype) || (dev_compatible)) { + ICHECK_EQ(tok->ref_counter, 0); + // Use exact matching strategy + if (size > tok->max_bytes) { + tok->max_bytes = size; + tok->ttype = prototype->ttype; + } + tok->ref_counter = prototype->ref_counter; + // erase from map and return + free_.erase(it); + return tok; + } } return nullptr; } -StorageToken* TokenAllocator1D::Alloc(StorageToken* prototype, int64_t storage_id) { +StorageToken* TokenAllocatorMixed::Alloc(StorageToken* prototype, int64_t storage_id) { size_t size = GetMemorySize(prototype); prototype->max_bytes = size; prototype->storage_id = storage_id; @@ -92,7 +125,7 @@ StorageToken* TokenAllocator1D::Alloc(StorageToken* prototype, int64_t storage_i return prototype; } -void TokenAllocator1D::CheckForRelease(StorageToken* tok) { +void TokenAllocatorMixed::CheckForRelease(StorageToken* tok) { ICHECK_GE(tok->storage_id, 0); ICHECK_GE(tok->ref_counter, 0); if (tok->ref_counter == 0) { @@ -100,101 +133,22 @@ } } -StorageToken* TokenAllocator2D::Request(StorageToken* prototype) { - auto shape = GetSize2D(prototype); - const int64_t max_ratio = 5; - int64_t min_added_size_x = std::numeric_limits<int64_t>::max(); - int64_t min_added_size_y = std::numeric_limits<int64_t>::max(); - int64_t min_wasted_size_x = std::numeric_limits<int64_t>::max(); - int64_t min_wasted_size_y = std::numeric_limits<int64_t>::max(); - int64_t best_storage_id = -1; - MemBlock new_mem; - for (int64_t free_id : free_list_) { - MemBlock& cached = blocks_[free_id]; - // Can only reuse texture 2d blocks of the same type - if (cached.token_->ttype->dtype !=
prototype->ttype->dtype) { - continue; - } - // Can only reuse texture 2d blocks of the same scope - // Because reusing textures with different memory scope may lead to - // accuracy issues, because the data will be packed in a different way for - // different memory scopes. - if (cached.token_->virtual_device->memory_scope != prototype->virtual_device->memory_scope) { - continue; - } - // avoid reusing too small and too big textures - if (shape.width / cached.x_ > max_ratio || cached.x_ / shape.width > max_ratio || - shape.height / cached.y_ > max_ratio || cached.y_ / shape.height > max_ratio) { - continue; - } - int64_t new_width = std::max(cached.x_, shape.width); - int64_t new_height = std::max(cached.y_, shape.height); - int64_t added_size_x = new_width - cached.x_; - int64_t added_size_y = new_height - cached.y_; - int64_t wasted_size_x = new_width - shape.width; - int64_t wasted_size_y = new_height - shape.height; - // Prioritize minimization of added size first, then minimize - // wasted size among blocks which would not require expansion - if ((min_added_size_x > 0 && added_size_x < min_added_size_x) || - (min_added_size_y > 0 && added_size_y < min_added_size_y) || - (min_added_size_x == added_size_x && wasted_size_x < min_wasted_size_x) || - (min_added_size_y == added_size_y && wasted_size_y < min_wasted_size_y)) { - min_added_size_x = added_size_x; - min_added_size_y = added_size_y; - min_wasted_size_x = wasted_size_x; - min_wasted_size_y = wasted_size_y; - best_storage_id = free_id; - new_mem.x_ = new_width; - new_mem.y_ = new_height; - } - } - - if (min_added_size_x == 0 && min_added_size_y == 0) { - // use existing block - free_list_.erase(best_storage_id); - blocks_[best_storage_id].token_->ref_counter += prototype->ref_counter; - return blocks_[best_storage_id].token_; - } else if (min_added_size_x <= shape.width || min_added_size_y <= shape.height) { - // Reset the reference counter of the now live token - free_list_.erase(best_storage_id); - new_mem.token_ = prototype; - new_mem.token_->ref_counter += 1; - new_mem.token_->storage_id = best_storage_id; - blocks_[best_storage_id] = new_mem; - return new_mem.token_; - } - return nullptr; -} - -StorageToken* TokenAllocator2D::Alloc(StorageToken* prototype, int64_t storage_id) { - auto shape = GetSize2D(prototype); - MemBlock block; - block.x_ = shape.width; - block.y_ = shape.height; - prototype->storage_id = storage_id; - block.token_ = prototype; - blocks_[prototype->storage_id] = block; - return prototype; -} - -void TokenAllocator2D::CheckForRelease(StorageToken* tok) { - ICHECK_GE(tok->storage_id, 0); - ICHECK_GE(tok->ref_counter, 0); - if (tok->ref_counter == 0) { - free_list_.insert(tok->storage_id); - } -} - -runtime::Texture2DShape<int64_t> TokenAllocator2D::GetSize2D(StorageToken* prototype) { +size_t TokenAllocatorMixed::GetSize2D(StorageToken* prototype) { TensorType ttype = prototype->ttype; ICHECK(ttype.defined()); - size_t axis = runtime::DefaultTextureLayoutSeparator(ttype->shape.size(), - prototype->virtual_device->memory_scope); struct Shape { const Array<IndexExpr>& shape; int64_t operator[](size_t i) const { return *tir::as_const_int(shape[i]); } + int size() { return this->shape.size(); } }; - return runtime::ApplyTexture2DFlattening<int64_t>(Shape{ttype->shape}, ttype->shape.size(), axis); + auto shape = Shape{ttype->shape}; + int image_row_align = + prototype->virtual_device->target->GetAttr<Integer>("image_base_address_alignment") + .value_or(Integer(64)) + ->value; + return runtime::GetTextureMemorySize(shape, ttype->dtype.bits(),
ttype->dtype.lanes(), + prototype->virtual_device->memory_scope, + image_row_align); } } // namespace relay diff --git a/src/relay/backend/token_allocator.h b/src/relay/backend/token_allocator.h index 3aebd71b6c2b..5524e6b2c634 100644 --- a/src/relay/backend/token_allocator.h +++ b/src/relay/backend/token_allocator.h @@ -66,9 +66,9 @@ struct StorageToken { }; /** - * @brief Memory manager for flattened 1d memory (buffers) + * @brief Memory manager for mixed mode memory types */ -class TokenAllocator1D { +class TokenAllocatorMixed { public: /*! * \brief ceil(size/word_size) to get number of words. @@ -105,54 +105,22 @@ class TokenAllocator1D { * \param tok The token to be released. */ void CheckForRelease(StorageToken* tok); - - private: - // scale used for rough match - const size_t match_range_{16}; - // free list of storage entry - std::multimap free_; - // all the storage resources available - std::vector data_; -}; - -/** - * @brief Memory manager for 2d memory (textures) - */ -class TokenAllocator2D { - public: - /*! - * \brief Request a storage token for a given prototype. - * \param prototype. The prototype storage token. - * \return The result token. - */ - StorageToken* Request(StorageToken* prototype); - /*! - * \brief Alloacte a storage token by consuming prototype - * \param prototype The prototype token. - * \param size The size of memory being requested. - */ - StorageToken* Alloc(StorageToken* prototype, int64_t storage_id); - /*! - * \brief Check if we can release token. - * \param tok The token to be released. - */ - void CheckForRelease(StorageToken* tok); /*! * \brief Get the texture 2d size requirement * \param prototype The prototype token. - * \return The required texture 2d memory size in (width, height, channel). + * \return The physical memory size. 
*/ - runtime::Texture2DShape<int64_t> GetSize2D(StorageToken* prototype); + size_t GetSize2D(StorageToken* prototype); protected: - struct MemBlock { - StorageToken* token_; - int64_t x_; - int64_t y_; - }; + // free list of storage entry + std::multimap<size_t, StorageToken*> free_; + // all the storage resources available + std::vector<StorageToken*> data_; - std::unordered_map<int64_t, MemBlock> blocks_; - std::unordered_set<int64_t> free_list_; + private: + // scale used for rough match + const size_t match_range_{16}; }; } // namespace relay diff --git a/src/runtime/contrib/clml/clml_runtime.cc b/src/runtime/contrib/clml/clml_runtime.cc index d8c0075fcdc1..fa7338177cbe 100644 --- a/src/runtime/contrib/clml/clml_runtime.cc +++ b/src/runtime/contrib/clml/clml_runtime.cc @@ -54,7 +54,7 @@ CLMLWorkspace::CLMLWorkspace() { tentry = workspace->GetThreadEntry(); device_id = workspace->GetCLDeviceID(tentry->device.device_id); - platform_id = workspace->device_to_platform[device_id]; + platform_id = workspace->device_info[device_id].platform_id; // Print extensions size_t reqd_size = 0; diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc index 33908d750d6d..82b8d9062615 100644 --- a/src/runtime/cuda/cuda_device_api.cc +++ b/src/runtime/cuda/cuda_device_api.cc @@ -127,6 +127,8 @@ class CUDADeviceAPI final : public DeviceAPI { *rv = static_cast<int64_t>(free_mem); return; } + case kImagePitchAlignment: + return; } *rv = value; } diff --git a/src/runtime/graph_executor/graph_executor.cc b/src/runtime/graph_executor/graph_executor.cc index 1b1051322c49..3cc3ea396e17 100644 --- a/src/runtime/graph_executor/graph_executor.cc +++ b/src/runtime/graph_executor/graph_executor.cc @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -424,36 +425,31 @@ void GraphExecutor::SetupStorage() { } pool_entry[sid].param_data_entry = i; pool_entry[sid].device_type = device_type; - pool_entry[sid].scope = storage_scope; DLDataType t = vtype[i]; - if (!details::Is2DStorage(storage_scope)) { - size_t size = 1; - for (int64_t sz : attrs_.shape[i]) { - size *= static_cast<size_t>(sz); - } - size_t bits = t.bits * t.lanes; - ICHECK(bits % 8U == 0U || bits == 1U || bits == 4U); - int64_t bytes = ((bits + 7U) / 8U) * size; - pool_entry[sid].shape[0] = std::max(pool_entry[sid].shape[0], bytes); - pool_entry[sid].dtype = DLDataType{kDLFloat, 32, 1}; - } else { - if (pool_entry[sid].shape.size() == 1) { - pool_entry[sid].shape.resize(3, 0); - } - size_t axis = runtime::DefaultTextureLayoutSeparator(attrs_.shape[i].size(), storage_scope); - auto shape = ApplyTexture2DFlattening<int64_t>(attrs_.shape[i], attrs_.shape[i].size(), axis); - pool_entry[sid].shape[0] = std::max(pool_entry[sid].shape[0], shape.height); - pool_entry[sid].shape[1] = std::max(pool_entry[sid].shape[1], shape.width); - CHECK(pool_entry[sid].shape[2] == 0 || pool_entry[sid].shape[2] == shape.channel) - << pool_entry[sid].shape[2] << " != " << shape.channel - << ", texture channel length must be consistent within a storage pool"; - pool_entry[sid].shape[2] = shape.channel; - CHECK(pool_entry[sid].dtype.bits == 0 || TypeEqual(pool_entry[sid].dtype, t)) - << DLDataType2String(pool_entry[sid].dtype) << " != " << DLDataType2String(t) - << ", pool entry for 2d texure allocations must be of the same type;" - << " downstream error from memory planner likely"; + + auto dev_type = pool_entry[sid].device_type; + const auto& cit = std::find_if(devices_.begin(), devices_.end(), [&dev_type](const Device& d) { + return dev_type == static_cast<int>(d.device_type); + }); + Device dev = cit == devices_.end() ? devices_[0] : *cit; + + DLTensor temp; + temp.data = nullptr; + temp.device = dev; + temp.ndim = attrs_.shape[i].size(); + temp.dtype = t; + temp.shape = static_cast<int64_t*>(attrs_.shape[i].data()); + temp.strides = nullptr; + temp.byte_offset = 0; + + int64_t alloc_size = DeviceAPI::Get(dev)->GetDataSize(temp, String(storage_scope)); + + if (pool_entry[sid].alloc_size < alloc_size) { pool_entry[sid].dtype = t; + pool_entry[sid].shape = attrs_.shape[i]; + pool_entry[sid].alloc_size = alloc_size; + pool_entry[sid].scope = storage_scope; } } @@ -466,18 +462,14 @@ void GraphExecutor::SetupStorage() { }); Device dev = cit == devices_.end() ? devices_[0] : *cit; if (pit.linked_param.defined()) { - storage_pool_.push_back(pit.linked_param); + ndarray_pool_.push_back(pit.linked_param); } else { std::vector<int64_t> shape = pit.shape; - if (shape.size() == 1) { - shape[0] = (shape[0] + 3) / 4; - } - Optional<String> mem_scope; - if (!pit.scope.empty()) { - mem_scope = String(pit.scope); - } - storage_pool_.push_back(MemoryManager::GetOrCreateAllocator(dev, AllocatorType::kNaive) ->Empty(shape, pit.dtype, dev, mem_scope)); + String mem_scope = pit.scope.empty() ? "global" : String(pit.scope); + auto allocator = MemoryManager::GetOrCreateAllocator(dev, AllocatorType::kPooled); + auto buffer = allocator->Alloc(dev, pit.alloc_size, kAllocAlignment, pit.dtype); + auto stor = Storage(buffer, allocator); + storage_pool_.push_back(stor); } } @@ -486,16 +478,22 @@ // is mapped to this pool. data_entry_.resize(num_node_entries()); data_alignment_.resize(num_node_entries()); - // sid_to_eid has a size of storage_id's size, which is the size of storage_pool_. - sid_to_eid_.resize(storage_pool_.size()); - for (size_t i = 0; i < data_entry_.size(); ++i) { + // sid_to_eid has a size of storage_id's size, which is the size of pool_entry. + sid_to_eid_.resize(pool_entry.size()); + for (size_t i = 0, j = 0; i < data_entry_.size(); ++i) { int storage_id = attrs_.storage_id[i]; // Update "storage_id -> entry_id" pair. sid_to_eid_[storage_id].push_back(i); - ICHECK_LT(static_cast<size_t>(storage_id), storage_pool_.size()); - data_entry_[i] = storage_pool_[storage_id].CreateView(attrs_.shape[i], vtype[i]); + ICHECK_LT(static_cast<size_t>(storage_id), pool_entry.size()); + if (pool_entry[storage_id].linked_param.defined()) { + data_entry_[i] = ndarray_pool_[j++]; + } else { + std::string storage_scope = attrs_.storage_scope.empty() ? "global" : attrs_.storage_scope[i]; + data_entry_[i] = storage_pool_[storage_id]->AllocNDArrayScoped(0, ShapeTuple(attrs_.shape[i]), + vtype[i], storage_scope); + } const DLTensor* tmp = data_entry_[i].operator->(); data_alignment_[i] = details::GetDataAlignment(*tmp); } diff --git a/src/runtime/graph_executor/graph_executor.h b/src/runtime/graph_executor/graph_executor.h index cfdba8916baa..e1c61001f1d9 100644 --- a/src/runtime/graph_executor/graph_executor.h +++ b/src/runtime/graph_executor/graph_executor.h @@ -45,6 +45,7 @@ namespace runtime { using memory::AllocatorType; using memory::MemoryManager; +using tvm::runtime::memory::Storage; /*! \brief macro to do C API call */ #define TVM_CCALL(func) \ @@ -224,6 +225,7 @@ class TVM_DLL GraphExecutor : public ModuleNode { int param_data_entry; NDArray linked_param; std::string scope; + int64_t alloc_size{-1}; // PoolEntry(int s, int dev_type, void* pre_linked_param) : // size(s), device_type(dev_type), pre_linked_param(std::move(pre_linked_param)) {} }; @@ -483,7 +485,9 @@ class TVM_DLL GraphExecutor : public ModuleNode { /*!
\brief Execution context of all devices including the host. */ std::vector<Device> devices_; /*! \brief Common storage pool for all devices. */ - std::vector<NDArray> storage_pool_; + std::vector<Storage> storage_pool_; + /*! \brief Common NDArray pool for all devices. */ + std::vector<NDArray> ndarray_pool_; /*! \brief Data entry of each node. */ std::vector<NDArray> data_entry_; /*! \brief Data alignment of each node. */ diff --git a/src/runtime/memory/memory_manager.cc b/src/runtime/memory/memory_manager.cc index 0607697e6b83..a4b8e15943bd 100644 --- a/src/runtime/memory/memory_manager.cc +++ b/src/runtime/memory/memory_manager.cc @@ -84,6 +84,37 @@ inline size_t GetDataAlignment(const DLTensor& arr) { return align; } +void StorageObj::ScopedDeleter(Object* obj) { + auto* ptr = static_cast<NDArray::Container*>(obj); + StorageObj* storage = reinterpret_cast<StorageObj*>(ptr->manager_ctx); + + // Let the device handle proper cleanup of view + storage->allocator->FreeView(ptr->dl_tensor.device, ptr->dl_tensor.data); + storage->DecRef(); + delete ptr; +} + +NDArray StorageObj::AllocNDArrayScoped(int64_t offset, ShapeTuple shape, DLDataType dtype, + String scope) { + if (scope == "global" || scope.empty()) { + return AllocNDArray(offset, shape, dtype); + } + VerifyDataType(dtype); + void* data = this->allocator->CreateView(this->buffer, shape, dtype, scope); + NDArray::Container* container = new NDArray::Container(data, shape, dtype, this->buffer.device); + container->dl_tensor.byte_offset = offset; + container->SetDeleter(StorageObj::ScopedDeleter); + size_t needed_size = DeviceAPI::Get(this->buffer.device)->GetDataSize(container->dl_tensor); + this->IncRef(); + container->manager_ctx = reinterpret_cast<void*>(this); + NDArray ret(GetObjectPtr<Object>(container)); + // RAII in effect, now run the check. + ICHECK(offset + needed_size <= this->buffer.size) + << "storage allocation failure, attempted to allocate " << needed_size << " at offset " + << offset << " in region that is " << this->buffer.size << " bytes"; + return ret; +} + NDArray StorageObj::AllocNDArray(int64_t offset, ShapeTuple shape, DLDataType dtype) { VerifyDataType(dtype); @@ -128,38 +159,62 @@ MemoryManager* MemoryManager::Global() { return inst; } -Allocator* MemoryManager::GetOrCreateAllocator(Device dev, AllocatorType type) { - MemoryManager* m = MemoryManager::Global(); - std::lock_guard<std::mutex> lock(m->mu_); - if (m->allocators_.find(dev) == m->allocators_.end()) { - m->allocators_.emplace(dev, std::unordered_map<AllocatorType, std::unique_ptr<Allocator>>()); +std::string DeviceTypeStr(DLDeviceType type) { switch (type) { case kDLOpenCL: return "opencl"; break; case kDLVulkan: return "vulkan"; break; default: return ""; } - if (m->allocators_.at(dev).find(type) == m->allocators_.at(dev).end()) { - std::unique_ptr<Allocator> alloc; +} + +Allocator* GetDeviceSpecificAllocator(Device dev, AllocatorType type) { + std::string dev_str = DeviceTypeStr(dev.device_type); + auto* device_alloc_helper = tvm::runtime::Registry::Get("DeviceAllocator." + dev_str); + void* valloc; + Allocator* allocator = nullptr; + if (device_alloc_helper) { + valloc = (*device_alloc_helper)(dev, static_cast<int>(type)); + allocator = static_cast<Allocator*>(valloc); + } + if (nullptr == allocator) { switch (type) { case kNaive: { VLOG(1) << "New naive allocator for " << dev; - alloc.reset(new NaiveAllocator()); + allocator = new NaiveAllocator(); break; } case kPooled: { VLOG(1) << "New pooled allocator for " << dev; - alloc.reset(new PooledAllocator()); + allocator = new PooledAllocator(); break; } default: LOG(FATAL) << "Unknown allocator type: " << type; } + } + return allocator; +} + +Allocator* MemoryManager::GetOrCreateAllocator(Device dev, AllocatorType type) { + MemoryManager* m = MemoryManager::Global(); + std::lock_guard<std::mutex> lock(m->mu_); + if (m->allocators_.find(dev) == m->allocators_.end()) { + m->allocators_.emplace(dev, std::unordered_map<AllocatorType, std::unique_ptr<Allocator>>()); + } + if (m->allocators_.at(dev).find(type) == m->allocators_.at(dev).end()) { + std::unique_ptr<Allocator> alloc; + alloc.reset(GetDeviceSpecificAllocator(dev, type)); auto ret = alloc.get(); m->allocators_.at(dev).emplace(type, std::move(alloc)); return ret; } auto alloc = m->allocators_.at(dev).at(type).get(); - /*if (alloc->type() != type) { - LOG(WARNING) << "The type of existing allocator for " << dev - << " is different from the request type (" << alloc->type() << " vs " << type - << ")"; - }*/ + return alloc; } @@ -191,7 +246,7 @@ NDArray Allocator::Empty(ShapeTuple shape, DLDataType dtype, DLDevice dev, VerifyDataType(dtype); NDArray::Container* container = new NDArray::Container(nullptr, shape, dtype, dev); container->SetDeleter(BufferDeleter); - size_t size = DeviceAPI::Get(dev)->GetDataSize(container->dl_tensor); + size_t size = DeviceAPI::Get(dev)->GetDataSize(container->dl_tensor, mem_scope); size_t alignment = GetDataAlignment(container->dl_tensor); Buffer* buffer = new Buffer; if (!mem_scope.defined() || mem_scope.value().empty() || mem_scope.value() == "global") { diff --git a/src/runtime/metal/metal_device_api.mm b/src/runtime/metal/metal_device_api.mm index f2e8c4ab0b75..cbdacb5c096f 100644 --- a/src/runtime/metal/metal_device_api.mm +++ b/src/runtime/metal/metal_device_api.mm @@ -95,6 +95,8 @@ *rv = static_cast<int64_t>([devices[dev.device_id] recommendedMaxWorkingSetSize]); return; } + case kImagePitchAlignment: + return; } }; } diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h index 2e9b05edcb58..94ab736f5ed5 100644 --- a/src/runtime/opencl/opencl_common.h +++ b/src/runtime/opencl/opencl_common.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -74,12 +75,13 @@ #include "../pack_args.h" #include "../texture.h" #include "../thread_storage_scope.h" -#include "../workspace_pool.h" namespace tvm { namespace runtime { namespace cl { +using tvm::runtime::memory::Buffer; + static_assert(sizeof(cl_mem) == sizeof(void*), "Required to store cl_mem inside void*"); inline const char* CLGetErrorString(cl_int error) { @@ -221,6 +223,12 @@ inline cl_channel_type DTypeToOpenCLChannelType(DLDataType data_type) { class OpenCLThreadEntry; struct BufferDescriptor; +struct CLDeviceInfo { + cl_platform_id platform_id; // platform Id + cl_uint image_row_align; // CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR + bool image_from_buffer_support; // extn: cl_khr_image2d_from_buffer +}; + /*! * \brief Process global OpenCL workspace. */ @@ -234,8 +242,8 @@ class OpenCLWorkspace : public DeviceAPI { std::unordered_map<cl_platform_id, cl_context> contexts; // whether the workspace is initialized.
bool initialized_{false}; - // map device to platform - std::unordered_map<cl_device_id, cl_platform_id> device_to_platform; + // map device to various device information + std::unordered_map<cl_device_id, CLDeviceInfo> device_info; // the devices std::vector<cl_device_id> devices; // the queues @@ -251,6 +259,7 @@ std::vector<size_t> free_kernel_ids; // the mutex for initialization std::mutex mu; + // destructor ~OpenCLWorkspace() { for (auto& it : contexts) { @@ -284,6 +293,15 @@ << "Invalid OpenCL device_id=" << dev.device_id << ". " << GetError(); return events[dev.device_id]; } + bool IsOpenCLExtensionSupported(cl_device_id did, const std::string& name) { + size_t reqd_size = 0; + OPENCL_CALL(clGetDeviceInfo(did, CL_DEVICE_EXTENSIONS, 0, nullptr, &reqd_size)); + std::vector<char> extn_buf(reqd_size); + OPENCL_CALL(clGetDeviceInfo(did, CL_DEVICE_EXTENSIONS, reqd_size, extn_buf.data(), nullptr)); + std::string extensions(extn_buf.data()); + return (extensions.find(name) != std::string::npos); + } + // is current clCommandQueue in profiling mode bool IsProfiling(Device dev) { cl_command_queue queue = GetQueue(dev); @@ -309,12 +327,22 @@ OPENCL_CALL(clReleaseCommandQueue(queue)); cl_int err_code; cl_device_id did = cl::OpenCLWorkspace::Global()->GetCLDeviceID(dev.device_id); - cl_platform_id platform = cl::OpenCLWorkspace::Global()->device_to_platform[did]; + cl_platform_id platform = cl::OpenCLWorkspace::Global()->device_info[did].platform_id; auto profiling_queue = clCreateCommandQueue(cl::OpenCLWorkspace::Global()->contexts[platform], did, prop, &err_code); OPENCL_CHECK_ERROR(err_code); cl::OpenCLWorkspace::Global()->queues[dev.device_id] = profiling_queue; } + cl_uint GetImageAlignment(int device_id) { + return device_info[GetCLDeviceID(device_id)].image_row_align; + } + bool IsBufferToImageSupported(int device_id) { + return device_info[GetCLDeviceID(device_id)].image_from_buffer_support; + } + + void* AllocDataSpaceView(Device dev, void* data, ShapeTuple shape, DLDataType dtype, + Optional<String> mem_scope = NullOpt); + void FreeDataSpaceView(Device dev, void* ptr); cl_device_id GetCLDeviceID(int device_id); // override device API @@ -323,6 +351,8 @@ void* AllocDataSpace(Device dev, size_t size, size_t alignment, DLDataType type_hint) final; void* AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, Optional<String> mem_scope = NullOpt) final; + void* AllocDataSpace(Device dev, size_t width, size_t height, DLDataType type_hint, + Optional<String> mem_scope = NullOpt); void* GetNativePtr(const tvm::runtime::NDArray& narr); void SetNativePtr(const tvm::runtime::NDArray& narr, void* host_ptr, size_t buf_size); void SetPerfHint(Device dev, cl_uint perf_hint); @@ -330,11 +360,12 @@ void StreamSync(Device dev, TVMStreamHandle stream) final; void* AllocWorkspace(Device dev, size_t size, DLDataType type_hint) final; void FreeWorkspace(Device dev, void* data) final; + size_t GetDataSize(const DLTensor& arr, Optional<String> mem_scope = NullOpt) final; - // Texture (image2d_t) alloca APIs - cl_mem AllocTexture(Device dev, size_t width, size_t height, DLDataType type_hint); - void* AllocTextureWorkspace(Device dev, size_t width, size_t height, DLDataType type_hint); - void FreeTextureWorkspace(Device dev, void* data); + // cl_mem alloc utils + void* AllocCLBuffer(Device dev, size_t size, size_t alignment, DLDataType type_hint); + void* AllocCLImage(Device dev, void*
back_buffer, size_t width, size_t height, size_t row_pitch, + DLDataType type_hint, Optional<String> mem_scope); /*! * \brief Get the thread local ThreadEntry @@ -370,13 +401,8 @@ class OpenCLThreadEntry { Device device; /*! \brief The thread-local kernel table */ std::vector<KTRefEntry> kernel_table; - /*! \brief workspace pool */ - WorkspacePool pool; - /*! \brief texture pool */ - TexturePool texture_pool; // constructor - OpenCLThreadEntry(DLDeviceType device_type, DeviceAPI* device_api) - : pool(device_type, device_api), texture_pool(device_type, device_api) { + OpenCLThreadEntry(DLDeviceType device_type, DeviceAPI* device_api) { device.device_id = 0; device.device_type = device_type; } @@ -414,9 +440,14 @@ struct BufferDescriptor { static MemoryLayout MemoryLayoutFromScope(Optional<String> mem_scope); static String ScopeFromMemoryLayout(MemoryLayout mem_scope); + /* clBuffer object */ + // buffer should be the first element here cl_mem buffer{nullptr}; + cl::BufferDescriptor* back_buffer{nullptr}; cl_uchar* host_ptr{nullptr}; MemoryLayout layout{MemoryLayout::kBuffer1D}; + Buffer mbuf{nullptr}; // MemoryManager ref. + bool is_compat_view{false}; }; } // namespace cl diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index 5c5873b67f74..06f966e5f438 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -27,6 +27,7 @@ #include +#include "../memory/pooled_allocator.h" #include "opencl_common.h" #ifdef OPENCL_ENABLE_HOST_PTR @@ -103,6 +104,19 @@ String cl::BufferDescriptor::ScopeFromMemoryLayout(cl::BufferDescriptor::MemoryL return ""; } +static size_t GetMemObjectSize(Device dev, int ndim, const int64_t* shape, DLDataType dtype) { + DLTensor temp; + temp.data = nullptr; + temp.device = dev; + temp.ndim = ndim; + temp.dtype = dtype; + temp.shape = const_cast<int64_t*>(shape); + temp.strides = nullptr; + temp.byte_offset = 0; + size_t size = DeviceAPI::Get(dev)->GetDataSize(temp); + return size; +} + OpenCLThreadEntry* OpenCLWorkspace::GetThreadEntry() { return OpenCLThreadEntry::ThreadLocal(); } OpenCLWorkspace* OpenCLWorkspace::Global() { @@ -220,6 +234,10 @@ void OpenCLWorkspace::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) // https://stackoverflow.com/a/3568223, may not be implementable // at all through OpenCL API.
break; + case kImagePitchAlignment: { + *rv = static_cast<int64_t>(device_info[device_id].image_row_align); + break; + } } } @@ -238,8 +256,55 @@ void* OpenCLWorkspace::CreateHostPtrIfEnabled(cl::BufferDescriptor* desc, Device void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t size, size_t alignment, DLDataType type_hint) { this->Init(); + return AllocCLBuffer(dev, size, alignment, type_hint); +} + +void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t width, size_t height, DLDataType type_hint, + Optional<String> mem_scope) { + // Texture allocation given width and height + cl_uint row_align = GetImageAlignment(dev.device_id); + size_t pixel_size = (type_hint.bits * type_hint.lanes + 7) / 8; + size_t row_pitch = ALIGN_UP(width * pixel_size * 4, row_align); // CL_RGBA = 4 + size_t mem_size = row_pitch * height; + + // Alloc back buffer from pool + cl::BufferDescriptor* back_buffer = nullptr; + if (IsBufferToImageSupported(dev.device_id)) { + auto buf = MemoryManager::GetOrCreateAllocator(dev, AllocatorType::kPooled) + ->Alloc(dev, mem_size, kTempAllocaAlignment, type_hint); + back_buffer = static_cast<cl::BufferDescriptor*>(buf.data); + back_buffer->mbuf = buf; + } + + if (!mem_scope.defined()) { + mem_scope = String("global.texture"); + } + return AllocCLImage(dev, back_buffer, width, height, row_pitch, type_hint, mem_scope); +} + +void* OpenCLWorkspace::AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, + Optional<String> mem_scope) { + this->Init(); + if (!mem_scope.defined() || mem_scope.value().empty() || mem_scope.value() == "global") { + size_t size = GetMemObjectSize(dev, ndim, shape, dtype); + cl::BufferDescriptor* ret_buffer = nullptr; + auto buf = MemoryManager::GetOrCreateAllocator(dev, AllocatorType::kPooled) + ->Alloc(dev, size, kTempAllocaAlignment, dtype); + ret_buffer = static_cast<cl::BufferDescriptor*>(buf.data); + ret_buffer->mbuf = buf; + return ret_buffer; + } + size_t axis = DefaultTextureLayoutSeparator(ndim, mem_scope.value()); + auto texture = ApplyTexture2DFlattening<int64_t>(shape, ndim, axis); + + return AllocDataSpace(dev, texture.width, texture.height, dtype, mem_scope); +} + +void* OpenCLWorkspace::AllocCLBuffer(Device dev, size_t size, size_t alignment, + DLDataType type_hint) { + this->Init(); cl_device_id device_id = GetCLDeviceID(dev.device_id); - auto platform = device_to_platform[device_id]; + auto platform = device_info[device_id].platform_id; cl_int err_code; cl::BufferDescriptor* desc = new cl::BufferDescriptor; // CL_INVALID_BUFFER_SIZE if size is 0.
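The texture path above sizes the backing buffer from a padded row pitch. The arithmetic is shown standalone below; only the ALIGN_UP macro comes from this patch (src/runtime/texture.h), the function name and values are illustrative:

    #include <cstddef>
    #define ALIGN_UP(num, align) (((num) + ((align)-1)) & ~((align)-1))

    // Physical bytes backing a width x height RGBA image whose elements are
    // elem_bytes wide, with each row padded out to row_align bytes.
    size_t TexturePhysicalSize(size_t width, size_t height, size_t elem_bytes, size_t row_align) {
      size_t row_pitch = ALIGN_UP(width * elem_bytes * 4, row_align);  // 4 elements per CL_RGBA pixel
      return row_pitch * height;
    }
    // e.g. width=100, fp16 elements (2 bytes), row_align=64: row_pitch = ALIGN_UP(800, 64) = 832
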
@@ -253,25 +318,121 @@ void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t size, size_t alignment, return CreateHostPtrIfEnabled(desc, dev, size); } -void* OpenCLWorkspace::AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, - Optional<String> mem_scope) { - if (!mem_scope.defined() || mem_scope.value().empty() || mem_scope.value() == "global") { - return DeviceAPI::AllocDataSpace(dev, ndim, shape, dtype, mem_scope); - } - ICHECK(IsTextureStorage(std::string(mem_scope.value()))) - << "Device does not support allocate data space with " - << "specified memory scope: " << mem_scope.value(); +void* OpenCLWorkspace::AllocCLImage(Device dev, void* back_buffer, size_t width, size_t height, + size_t row_pitch, DLDataType type_hint, + Optional<String> mem_scope) { + this->Init(); + ICHECK(std::string(mem_scope.value()).find("texture") != std::string::npos) + << "Expect texture scope while creating an Image object"; + cl::BufferDescriptor* back_desc = static_cast<cl::BufferDescriptor*>(back_buffer); + cl_device_id device_id = GetCLDeviceID(dev.device_id); + auto platform = device_info[device_id].platform_id; + cl_int err_code; + cl_channel_type cl_type = DTypeToOpenCLChannelType(type_hint); + cl_image_format format = {CL_RGBA, cl_type}; + cl_image_desc descriptor = {CL_MEM_OBJECT_IMAGE2D, width, height, 0, 0, 0, 0, 0, 0}; - ICHECK(ndim > 2) << "Shape for texture allocation must be at least rank 3; " - << "provided shape is rank " << ndim; + if (IsBufferToImageSupported(dev.device_id)) { + descriptor.image_row_pitch = row_pitch; + descriptor.buffer = back_desc->buffer; + } + cl_mem mptr = clCreateImage(this->contexts[platform], CL_MEM_CREATE_FLAGS, &format, &descriptor, + nullptr, &err_code); + OPENCL_CHECK_ERROR(err_code); cl::BufferDescriptor* desc = new cl::BufferDescriptor(mem_scope); - size_t axis = DefaultTextureLayoutSeparator(ndim, mem_scope.value()); - auto texture = ApplyTexture2DFlattening<int64_t>(shape, ndim, axis); - desc->buffer = AllocTexture(dev, texture.width, texture.height, dtype); + desc->buffer = mptr; + desc->back_buffer = back_desc; + return desc; } +size_t OpenCLWorkspace::GetDataSize(const DLTensor& arr, Optional<String> mem_scope) { + if (!mem_scope.defined() || mem_scope.value().empty() || mem_scope.value() == "global") { + return DeviceAPI::GetDataSize(arr); + } + cl_uint row_align = GetImageAlignment(GetThreadEntry()->device.device_id); + std::vector<int64_t> shape; + shape.assign(arr.shape, arr.shape + arr.ndim); + return runtime::GetTextureMemorySize<std::vector<int64_t>>(shape, arr.dtype.bits, arr.dtype.lanes, + mem_scope.value(), row_align); +} + +void* OpenCLWorkspace::AllocDataSpaceView(Device dev, void* data, ShapeTuple shape, + DLDataType dtype, Optional<String> mem_scope) { + cl::BufferDescriptor* desc = static_cast<cl::BufferDescriptor*>(data); + + // Fall back for devices w/o "cl_khr_image2d_from_buffer" + if (!IsBufferToImageSupported(dev.device_id)) { + cl::BufferDescriptor* ret_desc = desc; // buffer -> buffer + if (!mem_scope.defined() || mem_scope.value().empty() || mem_scope.value() == "global") { + if (desc->layout != cl::BufferDescriptor::MemoryLayout::kBuffer1D) { + // image -> buffer + size_t nbytes = GetMemObjectSize(dev, shape.size(), shape.data(), dtype); + ret_desc = static_cast<cl::BufferDescriptor*>( + OpenCLWorkspace::AllocCLBuffer(dev, nbytes, kTempAllocaAlignment, dtype)); + ret_desc->is_compat_view = true; + } + } else { + // Any -> Image + size_t axis = DefaultTextureLayoutSeparator(shape.size(), mem_scope.value()); + auto texture = ApplyTexture2DFlattening<int64_t>(shape.data(), shape.size(), axis); + cl_uint row_align = GetImageAlignment(dev.device_id);
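+ // Rows of the rebuilt image hold texture.width CL_RGBA pixels (4 elements + // each), padded out to the device pitch alignment queried above.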
+ size_t pixel_size = (dtype.bits * dtype.lanes + 7) / 8; + size_t row_pitch = ALIGN_UP(texture.width * pixel_size * 4, row_align); // CL_RGBA = 4 + + ret_desc = static_cast<cl::BufferDescriptor*>(OpenCLWorkspace::Global()->AllocCLImage( + dev, nullptr, texture.width, texture.height, row_pitch, dtype, mem_scope)); + ret_desc->is_compat_view = true; + } + return ret_desc; + } + + if (!mem_scope.defined() || mem_scope.value().empty() || mem_scope.value() == "global") { + if (desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D) { + // buffer -> buffer + return desc; + } else { + // image -> buffer + return desc->back_buffer; + } + } + size_t axis = DefaultTextureLayoutSeparator(shape.size(), mem_scope.value()); + auto texture = ApplyTexture2DFlattening<int64_t>(shape.data(), shape.size(), axis); + cl_uint row_align = GetImageAlignment(dev.device_id); + size_t pixel_size = (dtype.bits * dtype.lanes + 7) / 8; + size_t row_pitch = ALIGN_UP(texture.width * pixel_size * 4, row_align); // CL_RGBA = 4 + + cl::BufferDescriptor* back_buffer; + if (desc->back_buffer) { + // image -> image + back_buffer = desc->back_buffer; + } else { + // buffer -> image + back_buffer = desc; + } + + return (cl::BufferDescriptor*)AllocCLImage(dev, back_buffer, texture.width, texture.height, + row_pitch, dtype, mem_scope); +} + +void OpenCLWorkspace::FreeDataSpaceView(Device dev, void* ptr) { + auto* desc = static_cast<cl::BufferDescriptor*>(ptr); + // Handle the fall back + if (!IsBufferToImageSupported(dev.device_id)) { + if (desc->is_compat_view) { + OPENCL_CALL(clReleaseMemObject(desc->buffer)); + delete desc; + } + return; + } + + if (desc->layout != cl::BufferDescriptor::MemoryLayout::kBuffer1D) { + OPENCL_CALL(clReleaseMemObject(desc->buffer)); + delete desc; + } +} + void* OpenCLWorkspace::GetNativePtr(const tvm::runtime::NDArray& narr) { cl::BufferDescriptor* desc = static_cast<cl::BufferDescriptor*>(narr.operator->()->data); return desc->host_ptr; @@ -286,9 +447,8 @@ void OpenCLWorkspace::SetNativePtr(const tvm::runtime::NDArray& narr, void* host #ifdef USE_OPENCL_EXTN_QCOM Device dev = narr.operator->()->device; cl_device_id device_id = GetCLDeviceID(dev.device_id); - auto platform = device_to_platform[device_id]; + auto platform = device_info[device_id].platform_id; - OPENCL_CALL(clFinish(this->GetQueue(dev))); if (desc->host_ptr) { OPENCL_CALL(clEnqueueUnmapMemObject(this->GetQueue(dev), desc->buffer, reinterpret_cast<void*>(desc->host_ptr), 0, nullptr, nullptr)); @@ -313,48 +473,35 @@ void OpenCLWorkspace::SetPerfHint(Device dev, cl_uint perf_hint) { #ifdef CL_CONTEXT_PERF_HINT_QCOM cl_device_id device_id = GetCLDeviceID(dev.device_id); - auto platform = device_to_platform[device_id]; + auto platform = device_info[device_id].platform_id; OPENCL_CALL(clSetPerfHintQCOM(this->contexts[platform], perf_hint)); #endif } void OpenCLWorkspace::FreeDataSpace(Device dev, void* ptr) { - // We have to make sure that the memory object is not in the command queue - // for some OpenCL platforms.
- OPENCL_CALL(clFinish(this->GetQueue(dev))); - cl::BufferDescriptor* desc = static_cast(ptr); - if (desc->host_ptr) { - OPENCL_CALL(clEnqueueUnmapMemObject(this->GetQueue(dev), desc->buffer, - reinterpret_cast(desc->host_ptr), 0, nullptr, - nullptr)); + if (desc->back_buffer) { + // 2D Image w/ back buffer allocated from pool + OPENCL_CALL(clReleaseMemObject(desc->buffer)); + MemoryManager::GetAllocator(dev, desc->back_buffer->mbuf.alloc_type) + ->Free(desc->back_buffer->mbuf); + delete desc; + } else { + if (desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D) { + // 1D buffer allocated from pool + if (desc->host_ptr) { + clEnqueueUnmapMemObject(this->GetQueue(dev), desc->buffer, + reinterpret_cast(desc->host_ptr), 0, nullptr, nullptr); + } + OPENCL_CALL(clReleaseMemObject(desc->buffer)); + delete desc; + } else if (!IsBufferToImageSupported(dev.device_id)) { + // 2D Image allocated w/o pool + OPENCL_CALL(clReleaseMemObject(desc->buffer)); + delete desc; + return; + } } - OPENCL_CALL(clReleaseMemObject(desc->buffer)); - delete desc; -} - -cl_mem OpenCLWorkspace::AllocTexture(Device dev, size_t width, size_t height, - DLDataType type_hint) { - this->Init(); - cl_device_id device_id = GetCLDeviceID(dev.device_id); - auto platform = device_to_platform[device_id]; - cl_int err_code; - cl_channel_type cl_type = DTypeToOpenCLChannelType(type_hint); - cl_image_format format = {CL_RGBA, cl_type}; - cl_image_desc descriptor = {CL_MEM_OBJECT_IMAGE2D, width, height, 0, 0, 0, 0, 0, 0}; - cl_mem mptr = clCreateImage(this->contexts[platform], CL_MEM_READ_WRITE, &format, &descriptor, - nullptr, &err_code); - OPENCL_CHECK_ERROR(err_code); - return mptr; -} - -void* OpenCLWorkspace::AllocTextureWorkspace(Device dev, size_t width, size_t height, - DLDataType type_hint) { - return GetThreadEntry()->texture_pool.AllocTexture(dev, width, height, type_hint); -} - -void OpenCLWorkspace::FreeTextureWorkspace(Device dev, void* ptr) { - GetThreadEntry()->texture_pool.FreeTexture(dev, ptr); } void OpenCLWorkspace::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) { @@ -444,11 +591,18 @@ void OpenCLWorkspace::StreamSync(Device dev, TVMStreamHandle stream) { } void* OpenCLWorkspace::AllocWorkspace(Device dev, size_t size, DLDataType type_hint) { - return GetThreadEntry()->pool.AllocWorkspace(dev, size); + this->Init(); + cl::BufferDescriptor* ret_buffer = nullptr; + auto buf = MemoryManager::GetOrCreateAllocator(dev, AllocatorType::kPooled) + ->Alloc(dev, size, kTempAllocaAlignment, type_hint); + ret_buffer = static_cast(buf.data); + ret_buffer->mbuf = buf; + return ret_buffer; } void OpenCLWorkspace::FreeWorkspace(Device dev, void* data) { - GetThreadEntry()->pool.FreeWorkspace(dev, data); + cl::BufferDescriptor* desc = static_cast(data); + MemoryManager::GetAllocator(dev, desc->mbuf.alloc_type)->Free(desc->mbuf); } typedef dmlc::ThreadLocalStore OpenCLThreadStore; @@ -585,9 +739,20 @@ void OpenCLWorkspace::Init(const std::string& type_key, const std::string& devic this->devices.insert(this->devices.end(), devices.begin(), devices.end()); for (size_t i = 0; i < devices.size(); ++i) { cl_device_id did = devices[i]; - device_to_platform[did] = platform; + CLDeviceInfo dev_info; + dev_info.platform_id = platform; this->queues.push_back(clCreateCommandQueue(this->contexts[platform], did, 0, &err_code)); OPENCL_CHECK_ERROR(err_code); + cl_uint row_pitch; + OPENCL_CALL(clGetDeviceInfo(did, CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR, sizeof(row_pitch), + &row_pitch, nullptr)); + if (0 == row_pitch) { 
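+ // Some drivers report a zero pitch alignment when images are unsupported; + // fall back to the default allocation alignment in that case.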
+ row_pitch = kAllocAlignment; // Fallback + } + dev_info.image_row_align = row_pitch; + dev_info.image_from_buffer_support = + IsOpenCLExtensionSupported(did, "cl_khr_image2d_from_buffer"); + device_info.insert({did, dev_info}); } OPENCL_CHECK_ERROR(err_code); } @@ -617,9 +782,9 @@ TVM_REGISTER_GLOBAL("device_api.opencl.alloc_nd").set_body([](TVMArgs args, TVMR type_hint.bits = static_cast<uint8_t>(dtype_bits_hint); type_hint.lanes = 1; - OpenCLWorkspace* ptr = OpenCLWorkspace::Global(); - *rv = ptr->AllocTextureWorkspace(dev, static_cast<size_t>(width), static_cast<size_t>(height), - type_hint); + *rv = OpenCLWorkspace::Global()->AllocDataSpace(dev, static_cast<size_t>(width), + static_cast<size_t>(height), type_hint, + Optional<String>("global.texture")); }); TVM_REGISTER_GLOBAL("device_api.opencl.free_nd").set_body([](TVMArgs args, TVMRetValue* rv) { @@ -632,7 +797,7 @@ TVM_REGISTER_GLOBAL("device_api.opencl.free_nd").set_body([](TVMArgs args, TVMRe Device dev; dev.device_type = static_cast<DLDeviceType>(device_type); dev.device_id = device_id; - ptr->FreeTextureWorkspace(dev, data); + ptr->FreeDataSpace(dev, data); *rv = static_cast<int32_t>(0); }); @@ -647,6 +812,92 @@ TVM_REGISTER_GLOBAL("profiling.timer.opencl").set_body_typed([](Device dev) { return Timer(make_object<OpenCLTimerNode>(dev)); }); +class OpenCLPooledAllocator final : public memory::PooledAllocator { + public: + explicit OpenCLPooledAllocator() : PooledAllocator() {} + + bool AllowMemoryScope(const std::string& mem_scope) const final { + return ((mem_scope.find("texture") != std::string::npos) || mem_scope.empty() || + ("global" == mem_scope)); + } + + Buffer Alloc(Device dev, size_t nbytes, size_t alignment, DLDataType type_hint) override { + std::lock_guard<std::mutex> lock(mu_); + size_t size = ((nbytes + page_size_ - 1) / page_size_) * page_size_; + auto&& it = memory_pool_.find(size); + if (it != memory_pool_.end() && !it->second.empty()) { + auto&& pool = it->second; + auto ret = pool.back(); + pool.pop_back(); + return ret; + } + Buffer buf; + buf.device = dev; + buf.size = size; + buf.alloc_type = AllocatorType::kPooled; + try { + buf.data = DeviceAllocDataSpace(dev, size, alignment, type_hint); + } catch (InternalError& err) { + LOG(WARNING) << "PooledAllocator got InternalError during allocation: " << err.message(); + LOG(WARNING) << "Trying to release all unused memory and reallocate..."; + ReleaseAll(); + buf.data = DeviceAllocDataSpace(dev, size, alignment, type_hint); + } + + used_memory_.fetch_add(size, std::memory_order_relaxed); + VLOG(1) << "allocate " << size << " B, used memory " << used_memory_ << " B"; + return buf; + } + + Buffer Alloc(Device dev, ShapeTuple shape, DLDataType type_hint, + const std::string& mem_scope) override { + if (AllowMemoryScope(mem_scope)) { + NDArray::Container container(nullptr, shape, type_hint, dev); + size_t size = DeviceAPI::Get(dev)->GetDataSize(container.dl_tensor); + Buffer buf; + buf.device = dev; + buf.size = size; + buf.alloc_type = AllocatorType::kPooled; + buf.data = DeviceAPI::Get(dev)->AllocDataSpace(dev, shape.size(), shape.data(), type_hint, + String(mem_scope)); + if (mem_scope.find("texture") == std::string::npos) { + // All textures are backed by buffers - don't count in total memory + used_memory_.fetch_add(size, std::memory_order_relaxed); + } + DLOG(INFO) << "allocate " << size << " B, used memory " << used_memory_ << " B"; + return buf; + } + LOG(FATAL) << "Unsupported memory scope for this Allocator: " << mem_scope; + return {}; + } + + void Free(const Buffer& buffer) override { + std::lock_guard<std::mutex> lock(mu_); + if (memory_pool_.find(buffer.size) == memory_pool_.end()) { + memory_pool_.emplace(buffer.size, std::vector<Buffer>{}); + } + memory_pool_.at(buffer.size).push_back(buffer); + VLOG(1) << "reclaim buffer " << buffer.size; + } + + void* CreateView(const Buffer& buffer, ShapeTuple shape, DLDataType type_hint, + const std::string& mem_scope) final { + OpenCLWorkspace* ws_ = OpenCLWorkspace::Global(); + return ws_->AllocDataSpaceView(buffer.device, buffer.data, shape, type_hint, + Optional<String>(mem_scope)); + } + + void FreeView(Device dev, void* data) final { + OpenCLWorkspace* ws_ = OpenCLWorkspace::Global(); + return ws_->FreeDataSpaceView(dev, data); + } +}; + +TVM_REGISTER_GLOBAL("DeviceAllocator.opencl").set_body([](TVMArgs args, TVMRetValue* rv) { + Allocator* alloc = new OpenCLPooledAllocator(); + *rv = static_cast<void*>(alloc); +}); + } // namespace cl size_t OpenCLTimerNode::count_timer_execs = 0; std::vector<size_t> OpenCLTimerNode::event_start_idxs; diff --git a/src/runtime/opencl/opencl_module.cc b/src/runtime/opencl/opencl_module.cc index 567b7ad88a9e..77c50b23895c 100644 --- a/src/runtime/opencl/opencl_module.cc +++ b/src/runtime/opencl/opencl_module.cc @@ -225,7 +225,7 @@ cl_kernel OpenCLModuleNode::InstallKernel(cl::OpenCLWorkspace* w, cl::OpenCLThre std::lock_guard<std::mutex> lock(build_lock_); int device_id = t->device.device_id; auto did = w->GetCLDeviceID(device_id); - auto platform = w->device_to_platform[did]; + auto platform = w->device_info[did].platform_id; if (!IsProgramCreated(func_name, device_id)) { // create program if (fmt_ == "cl") { @@ -294,7 +294,7 @@ void OpenCLModuleNode::SetPreCompiledPrograms(const std::string& bytes) { const unsigned char* programBinary = bin_vector.data(); cl_device_id dev = workspace_->GetCLDeviceID(device_id); - auto platform = workspace_->device_to_platform[dev]; + auto platform = workspace_->device_info[dev].platform_id; programs_[name][device_id] = clCreateProgramWithBinary(workspace_->contexts[platform], 1, &dev, &binarySize, &programBinary, &binaryStatus, &err); diff --git a/src/runtime/opencl/opencl_module_spirv.cc b/src/runtime/opencl/opencl_module_spirv.cc index 7e52b7057bc7..28e02a4e3749 100644 --- a/src/runtime/opencl/opencl_module_spirv.cc +++ b/src/runtime/opencl/opencl_module_spirv.cc @@ -96,7 +96,7 @@ cl_kernel OpenCLSPIRVModuleNode::InstallKernel(cl::OpenCLWorkspace* w, cl::OpenC size_t len = it->second.data.size() * sizeof(uint32_t); cl_int err; cl_device_id dev = w->devices[device_id]; - auto platform = w->device_to_platform[dev]; + auto platform = w->device_info[dev].platform_id; programs_[func_name][device_id] = clCreateProgramWithBinary(w->contexts[platform], 1, &dev, &len, &s, nullptr, &err); OPENCL_CHECK_ERROR(err); diff --git a/src/runtime/opencl/texture_pool.cc b/src/runtime/opencl/texture_pool.cc deleted file mode 100644 index 0b9477f2d4ea..000000000000 --- a/src/runtime/opencl/texture_pool.cc +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License.
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file texture_pool.h - * \brief Texture pool utility. - */ -#include -#include - -#include "../texture.h" - -namespace tvm { -namespace runtime { - -void* Pool2D::Alloc(Device dev, DeviceAPI* device, size_t width, size_t height, - DLDataType type_hint) { - Entry e; - Entry new_mem; - // Processed several experiments and found that when we are trying to fit - // small texture to too big texture then it may lead to the performance - // degradation. - // Coefficient at 5 looks like robust variant for reusing textures. - const int64_t max_ratio = 5; - e.data = nullptr; - std::vector::iterator best_mem; - if (free_list_.size() != 0) { - int64_t min_added_size_x = std::numeric_limits::max(); - int64_t min_added_size_y = std::numeric_limits::max(); - int64_t min_wasted_size_x = std::numeric_limits::max(); - int64_t min_wasted_size_y = std::numeric_limits::max(); - for (auto it = free_list_.begin(); it != free_list_.end(); ++it) { - if (it->type.code != type_hint.code) { - continue; - } - // avoid reusing too small and too big textures - if (width / it->x > max_ratio || it->x / width > max_ratio || height / it->y > max_ratio || - it->y / height > max_ratio) { - continue; - } - int64_t new_width = std::max(it->x, width); - int64_t new_height = std::max(it->y, height); - int64_t added_size_x = new_width - it->x; - int64_t added_size_y = new_height - it->y; - int64_t wasted_size_x = new_width - width; - int64_t wasted_size_y = new_height - height; - // Minimize added size first and wasted size thereafter - if ((min_added_size_x > 0 && added_size_x < min_added_size_x) || - (min_added_size_y > 0 && added_size_y < min_added_size_y) || - (min_added_size_x == added_size_x && wasted_size_x < min_wasted_size_x) || - (min_added_size_y == added_size_y && wasted_size_y < min_wasted_size_y)) { - min_added_size_x = added_size_x; - min_added_size_y = added_size_y; - min_wasted_size_x = wasted_size_x; - min_wasted_size_y = wasted_size_y; - best_mem = it; - new_mem.x = new_width; - new_mem.y = new_height; - } - } - - if (min_added_size_x == 0 && min_added_size_y == 0) { - // use existing block - e = *best_mem; - free_list_.erase(best_mem); - } else if (static_cast(min_added_size_x) <= width || - static_cast(min_added_size_y) <= height) { - // if added size is less or equal to - // what is needed by alloc, then grow entry - device->FreeDataSpace(dev, best_mem->data); - free_list_.erase(best_mem); - new_mem.type = type_hint; - std::vector shape{int64_t(new_mem.y), int64_t(new_mem.x), 4}; - new_mem.data = device->AllocDataSpace(dev, shape.size(), shape.data(), new_mem.type, - Optional("global.texture")); - e = new_mem; - } - } - - if (e.data == nullptr) { - // create new block - std::vector shape{int64_t(height), int64_t(width), 4}; - e.data = device->AllocDataSpace(dev, shape.size(), shape.data(), type_hint, - Optional("global.texture")); - e.x = width; - e.y = height; - e.type = type_hint; - } - - allocated_.push_back(e); - return e.data; -} - -void Pool2D::Free(void* data) { - Entry e; - if (allocated_.back().data == data) { - // quick path, last allocated. 
- e = allocated_.back(); - allocated_.pop_back(); - } else { - int index = static_cast(allocated_.size()) - 2; - for (; index >= 0 && allocated_[index].data != data; --index) { - } - ICHECK_GE(index, 0) << "Attempt to free texture that has not been allocated"; - e = allocated_[index]; - allocated_.erase(allocated_.begin() + index); - } - free_list_.push_back(e); -} - -// Release all resources immediately -void Pool2D::Release(Device dev, DeviceAPI* device) { - for (auto& e : allocated_) { - device->FreeDataSpace(dev, e.data); - } - for (auto& e : free_list_) { - device->FreeDataSpace(dev, e.data); - } - allocated_.clear(); - free_list_.clear(); -} - -TexturePool::TexturePool(DLDeviceType device_type, DeviceAPI* device) - : device_type_(device_type), device_(device) {} - -TexturePool::~TexturePool() { - for (size_t i = 0; i < array_.size(); ++i) { - if (array_[i] != nullptr) { - Device dev; - dev.device_type = device_type_; - dev.device_id = static_cast(i); - array_[i]->Release(dev, device_); - delete array_[i]; - } - } -} - -void* TexturePool::AllocTexture(Device dev, size_t width, size_t height, DLDataType type_hint) { - if (static_cast(dev.device_id) >= array_.size()) { - array_.resize(dev.device_id + 1, nullptr); - } - if (array_[dev.device_id] == nullptr) { - array_[dev.device_id] = new Pool2D(); - } - return array_[dev.device_id]->Alloc(dev, device_, width, height, type_hint); -} - -void TexturePool::FreeTexture(Device dev, void* ptr) { - ICHECK(static_cast(dev.device_id) < array_.size() && array_[dev.device_id] != nullptr) - << "Attempt to free texture from null texture pool"; - array_[dev.device_id]->Free(ptr); -} - -} // namespace runtime -} // namespace tvm diff --git a/src/runtime/rocm/rocm_device_api.cc b/src/runtime/rocm/rocm_device_api.cc index ebfd312595a3..e7f103daadc9 100644 --- a/src/runtime/rocm/rocm_device_api.cc +++ b/src/runtime/rocm/rocm_device_api.cc @@ -136,7 +136,8 @@ class ROCMDeviceAPI final : public DeviceAPI { *rv = total_global_memory; return; } - + case kImagePitchAlignment: + return; case kAvailableGlobalMemory: // Not currently implemented. *rv = nullptr; diff --git a/src/runtime/texture.h b/src/runtime/texture.h index dc38101f0cd4..f3a827aa8792 100644 --- a/src/runtime/texture.h +++ b/src/runtime/texture.h @@ -30,6 +30,8 @@ #include #include +#define ALIGN_UP(num, align) (((num) + ((align)-1)) & ~((align)-1)) + namespace tvm { namespace runtime { @@ -94,74 +96,26 @@ inline bool IsTextureStorage(std::string scope) { return scope.find("texture") != std::string::npos; } -class TVM_DLL Pool2D { - public: - Pool2D() = default; - void* Alloc(Device dev, DeviceAPI* device, size_t width, size_t height, DLDataType type_hint); - void Free(void* data); - // Release all resources immediately - void Release(Device dev, DeviceAPI* device); - - protected: - struct Entry { - void* data; - size_t x; - size_t y; - DLDataType type; - }; - std::vector free_list_; - std::vector allocated_; -}; - /*! - * \brief A two dimensional storage pool that recycles temporal workspace - * allocations for dynamically allocated texture. See AllocTexture docstring - * for approach to allocation and reuse. + * \brief Returns the physical backing memory size required for given specification + * \param shape shape of tensor + * \param bits dtype bits + * \param lanes vectorization lanes + * \param mem_scope the memory scope info + * \param image_row_align image rowwise alignment size + * \return returns the backing memory size */ -class TVM_DLL TexturePool { - public: - /*! 
- * \brief Create pool with specific device type and device. - * \param device_type The device type. - * \param device_api The device API. - */ - TexturePool(DLDeviceType device_type, DeviceAPI* device_api); - /*! \brief destructor */ - ~TexturePool(); - - /*! - * \brief Allocate a two dimensional temporal texture workspace on device - * - * \note Two dimensional texture workspaces will be grown and reused - * according to the following strategy: - * - Choose the workspace which minimizes the amount of memory required to - * grow the workspace to fit the request. - * - If a set of workspaces exist that fit the current request without - * expansion, choose the workspace of that set which most closely - * matches the request size, minimizing wasted space. - * - * \param dev The context of allocation. - * \param width The width of the 2d texture to be allocated. - * \param height The height of the 2d texture to be allocated. - * \param type_hint The type of elements. - */ - void* AllocTexture(Device dev, size_t width, size_t height, DLDataType type_hint); - /*! - * \brief Free temporal texture in backend execution. - * - * \param dev The context of allocation. - * \param ptr The pointer to be freed. - */ - void FreeTexture(Device dev, void* ptr); +template <typename T> +size_t GetTextureMemorySize(T shape, int bits, int lanes, std::string mem_scope, + int image_row_align) { + size_t axis = DefaultTextureLayoutSeparator(shape.size(), mem_scope); + auto tshape = ApplyTexture2DFlattening(shape, shape.size(), axis); - private: - /*! \brief pool of device local array */ - std::vector<Pool2D*> array_; - /*! \brief device type this pool support */ - DLDeviceType device_type_; - /*! \brief The device API */ - DeviceAPI* device_; -}; + auto pack_size = shape[shape.size() - 1]; + auto pixel_size = (bits * lanes + 7) / 8; + size_t row_pitch = ALIGN_UP(tshape.width * pixel_size * pack_size, image_row_align); + return row_pitch * tshape.height; +} } // namespace runtime } // namespace tvm diff --git a/src/runtime/vulkan/vulkan_device_api.cc b/src/runtime/vulkan/vulkan_device_api.cc index 483668a2a75f..af7b35e85ec5 100644 --- a/src/runtime/vulkan/vulkan_device_api.cc +++ b/src/runtime/vulkan/vulkan_device_api.cc @@ -168,11 +168,12 @@ void VulkanDeviceAPI::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) *rv = device(index).compute_memory_size; return; } - case kAvailableGlobalMemory: // Not currently implemented. Will only be implementable for // devices that support the VK_EXT_memory_budget extension.
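
For intuition, the value GetTextureMemorySize returns can be checked by hand. A worked example for the {1, 22, 20, 20, 4} float32 token used in the allocator tests further down, taking a 64-byte image pitch alignment as an illustrative value:

// scope "global.texture-nhwc" flattens to height = 22, width = 20 * 20 = 400,
// with pack_size = 4 (the trailing dimension) and pixel_size = (32 * 1 + 7) / 8 = 4 bytes.
size_t row_pitch = ALIGN_UP(400 * 4 * 4, 64);  // ALIGN_UP(6400, 64) = 6400, already aligned
size_t total = row_pitch * 22;                 // 140800 bytes, matching EXPECT_EQ(size2d, 140800)
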
break; + case kImagePitchAlignment: + return; } } diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 5933c9582cec..b447c0729746 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -674,5 +674,19 @@ runtime::Module BuildOpenCL(IRModule mod, Target target) { } TVM_REGISTER_GLOBAL("target.build.opencl").set_body_typed(BuildOpenCL); + +String DeviceScopeCompatibilityFromTarget(Target target, String memory_scope) { + auto prototype_keys = target->GetKeys(); + bool is_adreno = + std::find(prototype_keys.begin(), prototype_keys.end(), "adreno") != prototype_keys.end(); + if (is_adreno) { + return String("global"); + } + return memory_scope; +} + +TVM_REGISTER_GLOBAL("DeviceScopeCompatibility.opencl") + .set_body_typed(DeviceScopeCompatibilityFromTarget); + } // namespace codegen } // namespace tvm diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc index e0a0ad23a1b6..e12c18e5ac73 100644 --- a/src/target/target_kind.cc +++ b/src/target/target_kind.cc @@ -366,6 +366,7 @@ TVM_REGISTER_TARGET_KIND("opencl", kDLOpenCL) // specify any limitations on the number of kernel arguments. max_function_args // equals to 128 looks like a reasonable number of kernel arguments. .add_attr_option<runtime::Int>("max_function_args", runtime::Int(128)) + .add_attr_option<runtime::Int>("image_base_address_alignment", runtime::Int(64)) .set_default_keys({"opencl", "gpu"}); // The metal has some limitations on the number of input parameters. This is why attribute diff --git a/tests/cpp-runtime/opencl/opencl_texture_pool_test.cc b/tests/cpp-runtime/opencl/opencl_texture_pool_test.cc deleted file mode 100644 index 2d3f43ddce6d..000000000000 --- a/tests/cpp-runtime/opencl/opencl_texture_pool_test.cc +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include -#include - -#include "../src/runtime/opencl/opencl_common.h" -#include "../src/runtime/texture.h" - -using namespace tvm::runtime; -using namespace tvm::runtime::cl; - -// PoolWrapper is necessary because in class Pool2D we don't have an access to
In this class we add new methods which allow us to -// get and check internal state of class Pool -class PoolWrapper : public Pool2D { - public: - inline size_t FreeListSize() const { return free_list_.size(); } - inline size_t AllocatedListSize() const { return allocated_.size(); } - inline std::pair FreeListItemSize(size_t idx) const { - return std::make_pair(free_list_[idx].x, free_list_[idx].y); - } - inline std::pair AllocatedListItemSize(size_t idx) const { - return std::make_pair(allocated_[idx].x, allocated_[idx].y); - } -}; - -TEST(OpenCLTexturePool, textures_reallocation_optimal_size) { - OpenCLWorkspace* workspace = OpenCLWorkspace::Global(); - OpenCLThreadEntry* t = workspace->GetThreadEntry(); - PoolWrapper pool; - EXPECT_EQ(pool.AllocatedListSize(), 0); - EXPECT_EQ(pool.FreeListSize(), 0); - - DLDataType type{kDLFloat, 16, 1}; - void* data1 = pool.Alloc(t->device, workspace, 1024, 768, type); - EXPECT_EQ(pool.AllocatedListSize(), 1); - EXPECT_EQ(pool.FreeListSize(), 0); - auto item = pool.AllocatedListItemSize(0); - EXPECT_EQ(item.first, 1024); - EXPECT_EQ(item.second, 768); - - pool.Alloc(t->device, workspace, 64, 12455, type); - EXPECT_EQ(pool.AllocatedListSize(), 2); - EXPECT_EQ(pool.FreeListSize(), 0); - item = pool.AllocatedListItemSize(1); - EXPECT_EQ(item.first, 64); - EXPECT_EQ(item.second, 12455); - - pool.Free(data1); - EXPECT_EQ(pool.AllocatedListSize(), 1); - EXPECT_EQ(pool.FreeListSize(), 1); - item = pool.AllocatedListItemSize(0); - EXPECT_EQ(item.first, 64); - EXPECT_EQ(item.second, 12455); - item = pool.FreeListItemSize(0); - EXPECT_EQ(item.first, 1024); - EXPECT_EQ(item.second, 768); - - pool.Alloc(t->device, workspace, 768, 1024, type); - EXPECT_EQ(pool.AllocatedListSize(), 2); - EXPECT_EQ(pool.FreeListSize(), 0); - item = pool.AllocatedListItemSize(0); - EXPECT_EQ(item.first, 64); - EXPECT_EQ(item.second, 12455); - item = pool.AllocatedListItemSize(1); - EXPECT_EQ(item.first, 1024); - EXPECT_EQ(item.second, 1024); -} - -TEST(OpenCLTexturePool, avoid_reusing_too_big_textures) { - OpenCLWorkspace* workspace = OpenCLWorkspace::Global(); - OpenCLThreadEntry* t = workspace->GetThreadEntry(); - PoolWrapper pool; - EXPECT_EQ(pool.AllocatedListSize(), 0); - EXPECT_EQ(pool.FreeListSize(), 0); - - DLDataType type{kDLFloat, 16, 1}; - void* data1 = pool.Alloc(t->device, workspace, 12455, 64, type); - EXPECT_EQ(pool.AllocatedListSize(), 1); - EXPECT_EQ(pool.FreeListSize(), 0); - auto item = pool.AllocatedListItemSize(0); - EXPECT_EQ(item.first, 12455); - EXPECT_EQ(item.second, 64); - - pool.Free(data1); - EXPECT_EQ(pool.AllocatedListSize(), 0); - EXPECT_EQ(pool.FreeListSize(), 1); - item = pool.FreeListItemSize(0); - EXPECT_EQ(item.first, 12455); - EXPECT_EQ(item.second, 64); - - pool.Alloc(t->device, workspace, 1024, 768, type); - EXPECT_EQ(pool.AllocatedListSize(), 1); - EXPECT_EQ(pool.FreeListSize(), 1); - item = pool.FreeListItemSize(0); - EXPECT_EQ(item.first, 12455); - EXPECT_EQ(item.second, 64); - item = pool.AllocatedListItemSize(0); - EXPECT_EQ(item.first, 1024); - EXPECT_EQ(item.second, 768); -} - -TEST(OpenCLTexturePool, avoid_reusing_too_small_textures) { - OpenCLWorkspace* workspace = OpenCLWorkspace::Global(); - OpenCLThreadEntry* t = workspace->GetThreadEntry(); - PoolWrapper pool; - EXPECT_EQ(pool.AllocatedListSize(), 0); - EXPECT_EQ(pool.FreeListSize(), 0); - - DLDataType type{kDLFloat, 16, 1}; - void* data1 = pool.Alloc(t->device, workspace, 1024, 64, type); - EXPECT_EQ(pool.AllocatedListSize(), 1); - EXPECT_EQ(pool.FreeListSize(), 0); - auto item = 
pool.AllocatedListItemSize(0); - EXPECT_EQ(item.first, 1024); - EXPECT_EQ(item.second, 64); - - pool.Free(data1); - EXPECT_EQ(pool.AllocatedListSize(), 0); - EXPECT_EQ(pool.FreeListSize(), 1); - item = pool.FreeListItemSize(0); - EXPECT_EQ(item.first, 1024); - EXPECT_EQ(item.second, 64); - - pool.Alloc(t->device, workspace, 12544, 64, type); - EXPECT_EQ(pool.AllocatedListSize(), 1); - EXPECT_EQ(pool.FreeListSize(), 1); - item = pool.FreeListItemSize(0); - EXPECT_EQ(item.first, 1024); - EXPECT_EQ(item.second, 64); - item = pool.AllocatedListItemSize(0); - EXPECT_EQ(item.first, 12544); - EXPECT_EQ(item.second, 64); -} diff --git a/tests/cpp-runtime/opencl/opencl_timer_test.cc b/tests/cpp-runtime/opencl/opencl_timer_test.cc index 1753300d3a09..ec038be5406c 100644 --- a/tests/cpp-runtime/opencl/opencl_timer_test.cc +++ b/tests/cpp-runtime/opencl/opencl_timer_test.cc @@ -37,7 +37,7 @@ TEST(OpenCLTimerNode, nested_timers) { int64_t nested_time_sum = 0; auto did = workspace->GetCLDeviceID(thr->device.device_id); - auto platform = workspace->device_to_platform[did]; + auto platform = workspace->device_info[did].platform_id; Timer init_timer = Timer::Start(thr->device); for (int i = 0; i < NUM_REPEAT; ++i) { Timer nested_timer = Timer::Start(thr->device); diff --git a/tests/cpp-runtime/opencl/texture_copy_test.cc b/tests/cpp-runtime/opencl/texture_copy_test.cc new file mode 100644 index 000000000000..23b490f695e2 --- /dev/null +++ b/tests/cpp-runtime/opencl/texture_copy_test.cc @@ -0,0 +1,322 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include +#include +#include + +#include +#include + +#include "../src/runtime/opencl/opencl_common.h" + +using tvm::runtime::kAllocAlignment; +using tvm::runtime::memory::AllocatorType; +using tvm::runtime::memory::Buffer; +using tvm::runtime::memory::MemoryManager; +using tvm::runtime::memory::Storage; + +class TextureCopyTest : public ::testing::Test { + protected: + void SetUp() override { + bool enabled = tvm::runtime::RuntimeEnabled("opencl"); + if (!enabled) { + GTEST_SKIP() << "Skip texture copy test because opencl runtime is disabled.\n"; + } + // Check hardware support + tvm::runtime::cl::OpenCLWorkspace* workspace = tvm::runtime::cl::OpenCLWorkspace::Global(); + tvm::runtime::cl::OpenCLThreadEntry* thr = workspace->GetThreadEntry(); + if (!workspace->IsBufferToImageSupported(thr->device.device_id)) { + GTEST_SKIP() << "Skip test case as BufferToImage is not supported \n"; + } + (void)tvm::runtime::memory::MemoryManager::GetOrCreateAllocator( + thr->device, tvm::runtime::memory::AllocatorType::kPooled); + } +}; + +TEST(TextureCopy, HostDeviceRT) { + using namespace tvm; + bool enabled = tvm::runtime::RuntimeEnabled("opencl"); + if (!enabled) { + GTEST_SKIP() << "Skip texture copy test because opencl runtime is disabled.\n"; + } + tvm::runtime::cl::OpenCLWorkspace* workspace = tvm::runtime::cl::OpenCLWorkspace::Global(); + tvm::runtime::cl::OpenCLThreadEntry* thr = workspace->GetThreadEntry(); + (void)tvm::runtime::memory::MemoryManager::GetOrCreateAllocator( + thr->device, tvm::runtime::memory::AllocatorType::kPooled); + std::vector shape{16, 16, 4}; + auto cpu_arr0 = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + auto cpu_arr1 = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + String mem_scope = "global.texture"; + auto opencl_txarr0 = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLOpenCL, 0}, mem_scope); + + size_t size = 1; + for (size_t i = 0; i < shape.size(); ++i) { + size *= static_cast(shape[i]); + } + + std::random_device dev; + std::mt19937 mt(dev()); + std::uniform_real_distribution<> random(-10.0, 10.0); + + // Random initialize host ndarray + for (size_t i = 0; i < size; i++) { + static_cast(cpu_arr0->data)[i] = random(mt); + } + + // Do a roundtrip from host storage to opencl texture storage and back + cpu_arr0.CopyTo(opencl_txarr0); + opencl_txarr0.CopyTo(cpu_arr1); + for (size_t i = 0; i < size; ++i) { + ICHECK_LT( + std::fabs(static_cast(cpu_arr1->data)[i] - static_cast(cpu_arr0->data)[i]), + 1e-5); + } +} + +TEST_F(TextureCopyTest, ViewBufferAsBuffer) { + using namespace tvm; + std::vector shape{1, 16, 16, 8}; + std::vector same_shape{1, 8, 16, 16}; + auto cpu_arr = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + auto cpu_arr_ret = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + + String mem_scope = "global"; + + DLDevice cl_dev = {kDLOpenCL, 0}; + auto allocator = MemoryManager::GetOrCreateAllocator(cl_dev, AllocatorType::kPooled); + auto buffer = allocator->Alloc(cl_dev, ShapeTuple(shape), {kDLFloat, 32, 1}); + auto stor = Storage(buffer, allocator); + + auto opencl_memobj = stor->AllocNDArrayScoped(0, ShapeTuple(shape), {kDLFloat, 32, 1}, mem_scope); + auto opencl_memview = + stor->AllocNDArrayScoped(0, ShapeTuple(same_shape), {kDLFloat, 32, 1}, mem_scope); + + std::random_device dev; + std::mt19937 mt(dev()); + std::uniform_real_distribution<> random(-10.0, 10.0); + + size_t size = 1; + for (size_t i = 0; i < shape.size(); ++i) { + size *= static_cast(shape[i]); + } + + /* 
Check original object round trip */ + // Random initialize host pool storage + for (size_t i = 0; i < size; i++) { + static_cast(cpu_arr->data)[i] = random(mt); + } + // Copy to OpenCLBuffer + cpu_arr.CopyTo(opencl_memobj); + // Copy from OpenCLBuffer + opencl_memobj.CopyTo(cpu_arr_ret); + for (size_t i = 0; i < size; i++) { + ICHECK_LT(std::fabs(static_cast(cpu_arr->data)[i] - + static_cast(cpu_arr_ret->data)[i]), + 1e-5); + } + + /* Check view object round trip */ + // Random initialize host pool storage + for (size_t i = 0; i < size; i++) { + static_cast(cpu_arr->data)[i] = random(mt); + } + // Copy to OpenCLBuffer + cpu_arr.CopyTo(opencl_memview); + // Copy from OpenCLBuffer + opencl_memview.CopyTo(cpu_arr_ret); + for (size_t i = 0; i < size; i++) { + ICHECK_LT(std::fabs(static_cast(cpu_arr->data)[i] - + static_cast(cpu_arr_ret->data)[i]), + 1e-5); + } +} + +TEST_F(TextureCopyTest, ViewBufferAsImage) { + using namespace tvm; + // Shape that doesn't cause padding for image row + std::vector shape{1, 16, 16, 8, 4}; + std::vector same_shape{1, 8, 16, 16, 4}; + auto cpu_arr = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + auto cpu_arr_ret = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + + DLDevice cl_dev = {kDLOpenCL, 0}; + auto allocator = MemoryManager::GetOrCreateAllocator(cl_dev, AllocatorType::kPooled); + auto buffer = allocator->Alloc(cl_dev, ShapeTuple(shape), {kDLFloat, 32, 1}); + auto stor = Storage(buffer, allocator); + + auto opencl_buf_obj = stor->AllocNDArrayScoped(0, ShapeTuple(shape), {kDLFloat, 32, 1}, "global"); + auto opencl_img_obj = + stor->AllocNDArrayScoped(0, ShapeTuple(same_shape), {kDLFloat, 32, 1}, "global.texture"); + + std::random_device dev; + std::mt19937 mt(dev()); + std::uniform_real_distribution<> random(-10.0, 10.0); + + size_t size = 1; + for (size_t i = 0; i < shape.size(); ++i) { + size *= static_cast(shape[i]); + } + + /* Check original object round trip */ + // Random initialize host pool storage + for (size_t i = 0; i < size; i++) { + static_cast(cpu_arr->data)[i] = random(mt); + } + // Copy to OpenCLBuffer + cpu_arr.CopyTo(opencl_buf_obj); + // Copy from OpenCLBuffer + opencl_buf_obj.CopyTo(cpu_arr_ret); + for (size_t i = 0; i < size; i++) { + ICHECK_LT(std::fabs(static_cast(cpu_arr->data)[i] - + static_cast(cpu_arr_ret->data)[i]), + 1e-5); + } + + /* Check view object round trip */ + // Random initialize host pool storage + for (size_t i = 0; i < size; i++) { + static_cast(cpu_arr->data)[i] = random(mt); + } + // Copy to OpenCLBuffer + cpu_arr.CopyTo(opencl_img_obj); + // Copy from OpenCLBuffer + opencl_img_obj.CopyTo(cpu_arr_ret); + for (size_t i = 0; i < size; i++) { + ICHECK_LT(std::fabs(static_cast(cpu_arr->data)[i] - + static_cast(cpu_arr_ret->data)[i]), + 1e-5); + } +} + +TEST_F(TextureCopyTest, ViewImageAsBuffer) { + using namespace tvm; + // Shape that doesn't cause padding for image row + std::vector shape{1, 16, 16, 8, 4}; + std::vector same_shape{1, 8, 16, 16, 4}; + auto cpu_arr = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + auto cpu_arr_ret = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + + DLDevice cl_dev = {kDLOpenCL, 0}; + auto allocator = MemoryManager::GetOrCreateAllocator(cl_dev, AllocatorType::kPooled); + auto buffer = allocator->Alloc(cl_dev, ShapeTuple(shape), {kDLFloat, 32, 1}); + auto stor = Storage(buffer, allocator); + + auto opencl_img_obj = + stor->AllocNDArrayScoped(0, ShapeTuple(shape), {kDLFloat, 32, 1}, "global.texture"); + auto 
opencl_buf_obj = + stor->AllocNDArrayScoped(0, ShapeTuple(same_shape), {kDLFloat, 32, 1}, "global"); + + std::random_device dev; + std::mt19937 mt(dev()); + std::uniform_real_distribution<> random(-10.0, 10.0); + + size_t size = 1; + for (size_t i = 0; i < shape.size(); ++i) { + size *= static_cast(shape[i]); + } + + /* Check original object round trip */ + // Random initialize host pool storage + for (size_t i = 0; i < size; i++) { + static_cast(cpu_arr->data)[i] = random(mt); + } + // Copy to OpenCLBuffer + cpu_arr.CopyTo(opencl_buf_obj); + // Copy from OpenCLBuffer + opencl_buf_obj.CopyTo(cpu_arr_ret); + for (size_t i = 0; i < size; i++) { + ICHECK_LT(std::fabs(static_cast(cpu_arr->data)[i] - + static_cast(cpu_arr_ret->data)[i]), + 1e-5); + } + + /* Check view object round trip */ + // Random initialize host pool storage + for (size_t i = 0; i < size; i++) { + static_cast(cpu_arr->data)[i] = random(mt); + } + // Copy to OpenCLBuffer + cpu_arr.CopyTo(opencl_img_obj); + // Copy from OpenCLBuffer + opencl_img_obj.CopyTo(cpu_arr_ret); + for (size_t i = 0; i < size; i++) { + ICHECK_LT(std::fabs(static_cast(cpu_arr->data)[i] - + static_cast(cpu_arr_ret->data)[i]), + 1e-5); + } +} + +TEST_F(TextureCopyTest, ViewImageAsImage) { + using namespace tvm; + // Shape that doesn't cause padding for image row + std::vector shape{1, 16, 16, 8, 4}; + std::vector same_shape{1, 8, 16, 16, 4}; + auto cpu_arr = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + auto cpu_arr_ret = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0}); + + DLDevice cl_dev = {kDLOpenCL, 0}; + auto allocator = MemoryManager::GetOrCreateAllocator(cl_dev, AllocatorType::kPooled); + auto buffer = allocator->Alloc(cl_dev, ShapeTuple(shape), {kDLFloat, 32, 1}); + auto stor = Storage(buffer, allocator); + + auto opencl_img_obj_1 = + stor->AllocNDArrayScoped(0, ShapeTuple(shape), {kDLFloat, 32, 1}, "global.texture"); + auto opencl_img_obj_2 = + stor->AllocNDArrayScoped(0, ShapeTuple(same_shape), {kDLFloat, 32, 1}, "global.texture"); + + std::random_device dev; + std::mt19937 mt(dev()); + std::uniform_real_distribution<> random(-10.0, 10.0); + + size_t size = 1; + for (size_t i = 0; i < shape.size(); ++i) { + size *= static_cast(shape[i]); + } + + /* Check original object round trip */ + // Random initialize host pool storage + for (size_t i = 0; i < size; i++) { + static_cast(cpu_arr->data)[i] = random(mt); + } + // Copy to OpenCLBuffer + cpu_arr.CopyTo(opencl_img_obj_1); + // Copy from OpenCLBuffer + opencl_img_obj_1.CopyTo(cpu_arr_ret); + for (size_t i = 0; i < size; i++) { + ICHECK_LT(std::fabs(static_cast(cpu_arr->data)[i] - + static_cast(cpu_arr_ret->data)[i]), + 1e-5); + } + + /* Check view object round trip */ + // Random initialize host pool storage + for (size_t i = 0; i < size; i++) { + static_cast(cpu_arr->data)[i] = random(mt); + } + // Copy to OpenCLBuffer + cpu_arr.CopyTo(opencl_img_obj_2); + // Copy from OpenCLBuffer + opencl_img_obj_2.CopyTo(cpu_arr_ret); + for (size_t i = 0; i < size; i++) { + ICHECK_LT(std::fabs(static_cast(cpu_arr->data)[i] - + static_cast(cpu_arr_ret->data)[i]), + 1e-5); + } +} diff --git a/tests/cpp/relay/backend/graph_plan_token_alloc.cc b/tests/cpp/relay/backend/graph_plan_token_alloc.cc index 4641da2cb8b5..7fca4b26a985 100644 --- a/tests/cpp/relay/backend/graph_plan_token_alloc.cc +++ b/tests/cpp/relay/backend/graph_plan_token_alloc.cc @@ -24,23 +24,24 @@ namespace tvm { namespace relay { -// TokenAllocator2d is necessary because in class TokenAllocator2D we don't +// 
TokenAllocatorMixed is necessary because in class TokenAllocatorMixed we don't // have an access to its protected members. In this class we add new methods -// which allow us to get and check internal state of class TokenAllocator2D -class TokenAllocator2DWrapper : public TokenAllocator2D { +// which allow us to get and check internal state of class TokenAllocatorMixed +class TokenAllocatorMixedWrapper : public TokenAllocatorMixed { public: - inline size_t FreeListSize() const { return free_list_.size(); } - inline size_t BlockMapSize() const { return blocks_.size(); } + inline size_t FreeListSize() const { return free_.size(); } + inline size_t AllocListSize() const { return data_.size(); } }; -TEST(Token2DAlloc, OneToken) { - TokenAllocator2DWrapper alloc; +TEST(TokenMixedAlloc, TextureOneToken) { + TokenAllocatorMixedWrapper alloc; int storage_ids = 0; - EXPECT_EQ(alloc.BlockMapSize(), 0); + EXPECT_EQ(alloc.AllocListSize(), 0); EXPECT_EQ(alloc.FreeListSize(), 0); TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1)); - VirtualDevice vd1(kDLOpenCL, 0, {}, MemoryScope("global.texture-nhwc")); + VirtualDevice vd1(kDLOpenCL, 0, Target("opencl -device=adreno"), + MemoryScope("global.texture-nhwc")); StorageToken tok1 = { 1, // ref_counter 0, // max bytes @@ -49,29 +50,28 @@ TEST(Token2DAlloc, OneToken) { -1 // storage_id }; auto size2d = alloc.GetSize2D(&tok1); - EXPECT_EQ(size2d.channel, 4); - EXPECT_EQ(size2d.height, 22); - EXPECT_EQ(size2d.width, 400); + EXPECT_EQ(size2d, 140800); EXPECT_EQ(alloc.Request(&tok1), nullptr); alloc.Alloc(&tok1, storage_ids++); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 0); tok1.ref_counter -= 1; alloc.CheckForRelease(&tok1); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 1); } -TEST(Token2DAlloc, EqualSizeTokenReuse) { - TokenAllocator2DWrapper alloc; +TEST(TokenMixedAlloc, TextureEqualSizeTokenReuse) { + TokenAllocatorMixedWrapper alloc; int storage_ids = 0; - EXPECT_EQ(alloc.BlockMapSize(), 0); + EXPECT_EQ(alloc.AllocListSize(), 0); EXPECT_EQ(alloc.FreeListSize(), 0); TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1)); - VirtualDevice vd1(kDLOpenCL, 0, {}, MemoryScope("global.texture-nhwc")); + VirtualDevice vd1(kDLOpenCL, 0, Target("opencl -device=adreno"), + MemoryScope("global.texture-nhwc")); StorageToken tok1 = { 1, // ref_counter 0, // max bytes @@ -80,18 +80,16 @@ TEST(Token2DAlloc, EqualSizeTokenReuse) { -1 // storage_id }; auto size2d = alloc.GetSize2D(&tok1); - EXPECT_EQ(size2d.channel, 4); - EXPECT_EQ(size2d.height, 22); - EXPECT_EQ(size2d.width, 400); + EXPECT_EQ(size2d, 140800); EXPECT_EQ(alloc.Request(&tok1), nullptr); alloc.Alloc(&tok1, storage_ids++); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 0); tok1.ref_counter -= 1; alloc.CheckForRelease(&tok1); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 1); StorageToken tok2 = { @@ -103,24 +101,51 @@ TEST(Token2DAlloc, EqualSizeTokenReuse) { }; auto req = alloc.Request(&tok2); EXPECT_NE(req, nullptr); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 0); EXPECT_EQ(req->storage_id, storage_ids - 1); EXPECT_EQ(req->ref_counter, 1); auto sizeReq = alloc.GetSize2D(req); - EXPECT_EQ(sizeReq.channel, 4); - EXPECT_EQ(sizeReq.height, 22); - EXPECT_EQ(sizeReq.width, 400); + 
EXPECT_EQ(sizeReq, 140800); + + req->ref_counter -= 1; + alloc.CheckForRelease(req); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 1); + + // Try reuse of the texture memory for buffer object + VirtualDevice vd2(kDLOpenCL, 0, Target("opencl -device=adreno"), MemoryScope("global")); + StorageToken tok3 = { + 1, // ref_counter + 0, // max bytes + tt1, // tensor type + vd2, // virtual device + -1 // storage_id + }; + auto req1 = alloc.Request(&tok3); + EXPECT_NE(req1, nullptr); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 0); + EXPECT_EQ(req1->storage_id, storage_ids - 1); + EXPECT_EQ(req1->ref_counter, 1); + sizeReq = alloc.GetSize2D(req1); + EXPECT_EQ(sizeReq, 140800); + + req1->ref_counter -= 1; + alloc.CheckForRelease(req1); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 1); } -TEST(Token2DAlloc, EqualSizeDiffTypes) { - TokenAllocator2DWrapper alloc; +TEST(TokenMixedAlloc, TextureEqualSizeDiffTypes) { + TokenAllocatorMixedWrapper alloc; int storage_ids = 0; - EXPECT_EQ(alloc.BlockMapSize(), 0); + EXPECT_EQ(alloc.AllocListSize(), 0); EXPECT_EQ(alloc.FreeListSize(), 0); TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1)); - VirtualDevice vd1(kDLOpenCL, 0, {}, MemoryScope("global.texture-nhwc")); + VirtualDevice vd1(kDLOpenCL, 0, Target("opencl -device=adreno"), + MemoryScope("global.texture-nhwc")); StorageToken tok1 = { 1, // ref_counter 0, // max bytes @@ -128,19 +153,17 @@ TEST(Token2DAlloc, EqualSizeDiffTypes) { vd1, // virtual device -1 // storage_id }; - auto size2d = alloc.GetSize2D(&tok1); - EXPECT_EQ(size2d.channel, 4); - EXPECT_EQ(size2d.height, 22); - EXPECT_EQ(size2d.width, 400); + auto sizeReq = alloc.GetSize2D(&tok1); + EXPECT_EQ(sizeReq, 140800); EXPECT_EQ(alloc.Request(&tok1), nullptr); alloc.Alloc(&tok1, storage_ids++); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 0); tok1.ref_counter -= 1; alloc.CheckForRelease(&tok1); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 1); TensorType tt2({1, 22, 20, 20, 4}, DataType(kDLFloat, 16, 1)); @@ -151,28 +174,27 @@ TEST(Token2DAlloc, EqualSizeDiffTypes) { vd1, // virtual device -1 // storage_id }; - EXPECT_EQ(alloc.Request(&tok2), nullptr); - EXPECT_EQ(alloc.BlockMapSize(), 1); - EXPECT_EQ(alloc.FreeListSize(), 1); - alloc.Alloc(&tok2, storage_ids++); - EXPECT_EQ(alloc.BlockMapSize(), 2); - EXPECT_EQ(alloc.FreeListSize(), 1); + auto req1 = alloc.Request(&tok2); + EXPECT_NE(req1, nullptr); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 0); - tok2.ref_counter -= 1; - alloc.CheckForRelease(&tok2); - EXPECT_EQ(alloc.BlockMapSize(), 2); - EXPECT_EQ(alloc.FreeListSize(), 2); + req1->ref_counter -= 1; + alloc.CheckForRelease(req1); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 1); } -TEST(Token2DAlloc, DifferentSizesTokenReuse) { - TokenAllocator2DWrapper alloc; +TEST(TokenMixedAlloc, TextureDifferentSizesTokenReuse) { + TokenAllocatorMixedWrapper alloc; int storage_ids = 0; - EXPECT_EQ(alloc.BlockMapSize(), 0); + EXPECT_EQ(alloc.AllocListSize(), 0); EXPECT_EQ(alloc.FreeListSize(), 0); TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1)); - VirtualDevice vd1(kDLOpenCL, 0, {}, MemoryScope("global.texture-nhwc")); + VirtualDevice vd1(kDLOpenCL, 0, Target("opencl -device=adreno"), + MemoryScope("global.texture-nhwc")); StorageToken tok1 = { 1, // ref_counter 0, // max 
bytes @@ -180,19 +202,17 @@ TEST(Token2DAlloc, DifferentSizesTokenReuse) { vd1, // virtual device -1 // storage_id }; - auto size2d = alloc.GetSize2D(&tok1); - EXPECT_EQ(size2d.channel, 4); - EXPECT_EQ(size2d.height, 22); - EXPECT_EQ(size2d.width, 400); + auto sizeReq = alloc.GetSize2D(&tok1); + EXPECT_EQ(sizeReq, 140800); EXPECT_EQ(alloc.Request(&tok1), nullptr); alloc.Alloc(&tok1, storage_ids++); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 0); tok1.ref_counter -= 1; alloc.CheckForRelease(&tok1); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 1); TensorType tt2({1, 40, 30, 30, 4}, DataType(kDLFloat, 32, 1)); @@ -205,19 +225,16 @@ TEST(Token2DAlloc, DifferentSizesTokenReuse) { }; auto req = alloc.Request(&tok2); EXPECT_NE(req, nullptr); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 0); EXPECT_EQ(req->storage_id, storage_ids - 1); - EXPECT_EQ(req->ref_counter, 2); - auto sizeReq = alloc.GetSize2D(req); - EXPECT_EQ(sizeReq.channel, 4); - EXPECT_EQ(sizeReq.height, 40); - EXPECT_EQ(sizeReq.width, 900); + EXPECT_EQ(req->ref_counter, 1); + sizeReq = alloc.GetSize2D(req); + EXPECT_EQ(sizeReq, 576000); - tok2.ref_counter -= 1; req->ref_counter -= 1; - alloc.CheckForRelease(&tok1); - EXPECT_EQ(alloc.BlockMapSize(), 1); + alloc.CheckForRelease(req); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 1); TensorType tt3({1, 25, 30, 30, 4}, DataType(kDLFloat, 32, 1)); @@ -230,24 +247,23 @@ TEST(Token2DAlloc, DifferentSizesTokenReuse) { }; auto req2 = alloc.Request(&tok3); EXPECT_NE(req2, nullptr); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 0); EXPECT_EQ(req2->storage_id, storage_ids - 1); EXPECT_EQ(req2->ref_counter, 1); - auto sizeReq2 = alloc.GetSize2D(req2); - EXPECT_EQ(sizeReq2.channel, 4); - EXPECT_EQ(sizeReq2.height, 40); - EXPECT_EQ(sizeReq2.width, 900); + sizeReq = alloc.GetSize2D(req2); + EXPECT_EQ(sizeReq, 576000); } -TEST(Token2DAlloc, DifferentSizesTokenReuse2) { - TokenAllocator2DWrapper alloc; +TEST(TokenMixedAlloc, TextureDifferentSizesTokenReuse2) { + TokenAllocatorMixedWrapper alloc; int storage_ids = 0; - EXPECT_EQ(alloc.BlockMapSize(), 0); + EXPECT_EQ(alloc.AllocListSize(), 0); EXPECT_EQ(alloc.FreeListSize(), 0); TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1)); - VirtualDevice vd1(kDLOpenCL, 0, {}, MemoryScope("global.texture-nhwc")); + VirtualDevice vd1(kDLOpenCL, 0, Target("opencl -device=adreno"), + MemoryScope("global.texture-nhwc")); StorageToken tok1 = { 1, // ref_counter 0, // max bytes @@ -255,19 +271,17 @@ TEST(Token2DAlloc, DifferentSizesTokenReuse2) { vd1, // virtual device -1 // storage_id }; - auto size2d = alloc.GetSize2D(&tok1); - EXPECT_EQ(size2d.channel, 4); - EXPECT_EQ(size2d.height, 22); - EXPECT_EQ(size2d.width, 400); + auto sizeReq = alloc.GetSize2D(&tok1); + EXPECT_EQ(sizeReq, 140800); EXPECT_EQ(alloc.Request(&tok1), nullptr); alloc.Alloc(&tok1, storage_ids++); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 0); tok1.ref_counter -= 1; alloc.CheckForRelease(&tok1); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 1); TensorType tt2({1, 5, 30, 20, 4}, DataType(kDLFloat, 32, 1)); @@ -280,24 +294,23 @@ TEST(Token2DAlloc, DifferentSizesTokenReuse2) { }; auto req = 
alloc.Request(&tok2); EXPECT_NE(req, nullptr); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 0); EXPECT_EQ(req->storage_id, storage_ids - 1); - EXPECT_EQ(req->ref_counter, 2); - auto sizeReq = alloc.GetSize2D(req); - EXPECT_EQ(sizeReq.channel, 4); - EXPECT_EQ(sizeReq.height, 5); - EXPECT_EQ(sizeReq.width, 600); + EXPECT_EQ(req->ref_counter, 1); + sizeReq = alloc.GetSize2D(req); + EXPECT_EQ(sizeReq, 140800); } -TEST(Token2DAlloc, SameSizesButDiffMemoryScopes) { - TokenAllocator2DWrapper alloc; +TEST(TokenMixedAlloc, TextureSameSizesButDiffMemoryScopes) { + TokenAllocatorMixedWrapper alloc; int storage_ids = 0; - EXPECT_EQ(alloc.BlockMapSize(), 0); + EXPECT_EQ(alloc.AllocListSize(), 0); EXPECT_EQ(alloc.FreeListSize(), 0); TensorType tt1({28, 676, 1, 1, 4}, DataType(kDLFloat, 32, 1)); - VirtualDevice vd1(kDLOpenCL, 0, {}, MemoryScope("global.texture-weight")); + VirtualDevice vd1(kDLOpenCL, 0, Target("opencl -device=adreno"), + MemoryScope("global.texture-weight")); StorageToken tok1 = { 1, // ref_counter 0, // max bytes @@ -305,23 +318,22 @@ TEST(Token2DAlloc, SameSizesButDiffMemoryScopes) { vd1, // virtual device -1 // storage_id }; - auto size2d = alloc.GetSize2D(&tok1); - EXPECT_EQ(size2d.channel, 4); - EXPECT_EQ(size2d.height, 28); - EXPECT_EQ(size2d.width, 676); + auto sizeReq = alloc.GetSize2D(&tok1); + EXPECT_EQ(sizeReq, 302848); EXPECT_EQ(alloc.Request(&tok1), nullptr); alloc.Alloc(&tok1, storage_ids++); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 0); tok1.ref_counter -= 1; alloc.CheckForRelease(&tok1); - EXPECT_EQ(alloc.BlockMapSize(), 1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 1); TensorType tt2({1, 28, 26, 26, 4}, DataType(kDLFloat, 32, 1)); - VirtualDevice vd2(kDLOpenCL, 0, {}, MemoryScope("global.texture-nhwc")); + VirtualDevice vd2(kDLOpenCL, 0, Target("opencl -device=adreno"), + MemoryScope("global.texture-nhwc")); StorageToken tok2 = { 1, // ref_counter 0, // max bytes @@ -330,22 +342,199 @@ TEST(Token2DAlloc, SameSizesButDiffMemoryScopes) { -1 // storage_id }; auto tok2Size = alloc.GetSize2D(&tok2); - EXPECT_EQ(tok2Size.channel, 4); - EXPECT_EQ(tok2Size.height, 28); - EXPECT_EQ(tok2Size.width, 676); + EXPECT_EQ(tok2Size, 302848); - EXPECT_EQ(alloc.Request(&tok2), nullptr); - EXPECT_EQ(alloc.BlockMapSize(), 1); + auto req = alloc.Request(&tok2); + EXPECT_NE(req, nullptr); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 0); + + req->ref_counter -= 1; + alloc.CheckForRelease(req); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 1); +} + +TEST(TokenMixedAlloc, OneToken) { + TokenAllocatorMixedWrapper alloc; + int storage_ids = 0; + EXPECT_EQ(alloc.AllocListSize(), 0); + EXPECT_EQ(alloc.FreeListSize(), 0); - alloc.Alloc(&tok2, storage_ids++); - EXPECT_EQ(alloc.BlockMapSize(), 2); + TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1)); + VirtualDevice vd1(kDLOpenCL, 0, Target("opencl")); + StorageToken tok1 = { + 1, // ref_counter + 0, // max bytes + tt1, // tensor type + vd1, // virtual device + -1 // storage_id + }; + EXPECT_EQ(alloc.Request(&tok1), nullptr); + + alloc.Alloc(&tok1, storage_ids++); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 0); + + tok1.ref_counter -= 1; + alloc.CheckForRelease(&tok1); + EXPECT_EQ(alloc.AllocListSize(), 1); EXPECT_EQ(alloc.FreeListSize(), 1); +} + +TEST(TokenMixedAlloc, EqualSizeTokenReuse) { + 
TokenAllocatorMixedWrapper alloc; + int storage_ids = 0; + EXPECT_EQ(alloc.AllocListSize(), 0); + EXPECT_EQ(alloc.FreeListSize(), 0); + + TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1)); + VirtualDevice vd1(kDLOpenCL, 0, Target("opencl")); + StorageToken tok1 = { + 1, // ref_counter + 0, // max bytes + tt1, // tensor type + vd1, // virtual device + -1 // storage_id + }; + EXPECT_EQ(alloc.Request(&tok1), nullptr); - tok2.ref_counter -= 1; - alloc.CheckForRelease(&tok2); - EXPECT_EQ(alloc.BlockMapSize(), 2); - EXPECT_EQ(alloc.FreeListSize(), 2); + alloc.Alloc(&tok1, storage_ids++); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 0); + + tok1.ref_counter -= 1; + alloc.CheckForRelease(&tok1); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 1); + + StorageToken tok2 = { + 1, // ref_counter + 0, // max bytes + tt1, // tensor type + vd1, // virtual device + -1 // storage_id + }; + auto req = alloc.Request(&tok2); + EXPECT_NE(req, nullptr); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 0); + EXPECT_EQ(req->storage_id, storage_ids - 1); + EXPECT_EQ(req->ref_counter, 1); + + req->ref_counter -= 1; + alloc.CheckForRelease(req); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 1); } + +TEST(TokenMixedAlloc, EqualSizeDiffTypes) { + TokenAllocatorMixedWrapper alloc; + int storage_ids = 0; + EXPECT_EQ(alloc.AllocListSize(), 0); + EXPECT_EQ(alloc.FreeListSize(), 0); + + TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1)); + VirtualDevice vd1(kDLOpenCL, 0, Target("opencl")); + StorageToken tok1 = { + 1, // ref_counter + 0, // max bytes + tt1, // tensor type + vd1, // virtual device + -1 // storage_id + }; + EXPECT_EQ(alloc.Request(&tok1), nullptr); + + alloc.Alloc(&tok1, storage_ids++); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 0); + + tok1.ref_counter -= 1; + alloc.CheckForRelease(&tok1); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 1); + + TensorType tt2({1, 22, 20, 20, 4}, DataType(kDLFloat, 16, 1)); + StorageToken tok2 = { + 1, // ref_counter + 0, // max bytes + tt2, // tensor type + vd1, // virtual device + -1 // storage_id + }; + + auto req1 = alloc.Request(&tok2); + EXPECT_NE(req1, nullptr); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 0); + + req1->ref_counter -= 1; + alloc.CheckForRelease(req1); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 1); +} + +TEST(TokenMixedAlloc, DifferentSizesTokenReuse) { + TokenAllocatorMixedWrapper alloc; + int storage_ids = 0; + EXPECT_EQ(alloc.AllocListSize(), 0); + EXPECT_EQ(alloc.FreeListSize(), 0); + + TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1)); + VirtualDevice vd1(kDLOpenCL, 0, Target("opencl")); + StorageToken tok1 = { + 1, // ref_counter + 0, // max bytes + tt1, // tensor type + vd1, // virtual device + -1 // storage_id + }; + EXPECT_EQ(alloc.Request(&tok1), nullptr); + + alloc.Alloc(&tok1, storage_ids++); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 0); + + tok1.ref_counter -= 1; + alloc.CheckForRelease(&tok1); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 1); + + TensorType tt2({1, 40, 30, 30, 4}, DataType(kDLFloat, 32, 1)); + StorageToken tok2 = { + 1, // ref_counter + 0, // max bytes + tt2, // tensor type + vd1, // virtual device + -1 // storage_id + }; + auto req = alloc.Request(&tok2); + EXPECT_NE(req, nullptr); + EXPECT_EQ(alloc.AllocListSize(), 
1); + EXPECT_EQ(alloc.FreeListSize(), 0); + EXPECT_EQ(req->storage_id, storage_ids - 1); + EXPECT_EQ(req->ref_counter, 1); + + req->ref_counter -= 1; + alloc.CheckForRelease(req); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 1); + + TensorType tt3({1, 25, 30, 30, 4}, DataType(kDLFloat, 32, 1)); + StorageToken tok3 = { + 1, // ref_counter + 0, // max bytes + tt3, // tensor type + vd1, // virtual device + -1 // storage_id + }; + auto req2 = alloc.Request(&tok3); + EXPECT_NE(req2, nullptr); + EXPECT_EQ(alloc.AllocListSize(), 1); + EXPECT_EQ(alloc.FreeListSize(), 0); + EXPECT_EQ(req2->storage_id, storage_ids - 1); + EXPECT_EQ(req2->ref_counter, 1); +} + } // namespace relay } // namespace tvm diff --git a/tests/cpp/runtime/memory/memory_manager_tests.cc b/tests/cpp/runtime/memory/memory_manager_tests.cc index aea37bf7fbfe..47146d2000fc 100644 --- a/tests/cpp/runtime/memory/memory_manager_tests.cc +++ b/tests/cpp/runtime/memory/memory_manager_tests.cc @@ -85,6 +85,38 @@ TEST_F(TvmVMMemoryManagerTest, NaiveEmptyBasic) { EXPECT_EQ(allocator->UsedMemory(), 0); } +TEST_F(TvmVMMemoryManagerTest, BothAllocatorsCoexists) { + Device dev = {kDLCPU, 0}; + // Initialize and use Naive allocator + Allocator* nallocator = MemoryManagerWrapper::GetOrCreateAllocator(dev, kNaive); + EXPECT_EQ(nallocator->UsedMemory(), 0); + auto dt = DataType::Float(32); + size_t nbytes = 1 * 3 * 6 * 6 * dt.bytes(); + ShapeTuple shape = {1, 3, 6, 6}; + { + auto ndarray = nallocator->Empty(shape, dt, dev); + EXPECT_EQ(nallocator->UsedMemory(), nbytes); + } + EXPECT_EQ(nallocator->UsedMemory(), 0); + auto naive_buff = nallocator->Alloc(dev, shape, dt); + EXPECT_EQ(nallocator->UsedMemory(), nbytes); + + // Initialize and use Pooled allocator + Allocator* pallocator = MemoryManagerWrapper::GetOrCreateAllocator(dev, kPooled); + EXPECT_EQ(pallocator->UsedMemory(), 0); + auto pooled_buff = pallocator->Alloc(dev, shape, dt); + EXPECT_NE(pallocator->UsedMemory(), 0); + + // Operate on Naive allocator + EXPECT_EQ(nallocator->UsedMemory(), nbytes); + nallocator->Free(naive_buff); + EXPECT_EQ(nallocator->UsedMemory(), 0); + + // Operate on Pooled allocator + pallocator->Free(pooled_buff); + EXPECT_NE(pallocator->UsedMemory(), 0); +} + TEST_F(TvmVMMemoryManagerTest, PooledEmptyBasic) { Device dev = {kDLCPU, 0}; Allocator* allocator = MemoryManagerWrapper::GetOrCreateAllocator(dev, kPooled); @@ -150,58 +182,6 @@ TEST_F(TvmVMMemoryManagerTest, PooledAllocWithShape) { } } -TEST_F(TvmVMMemoryManagerTest, NaiveAllocOpenCLTexture) { - bool enabled = tvm::runtime::RuntimeEnabled("opencl"); - if (!enabled) { - LOG(INFO) << "Skip OpenCL Texture alloc test because opencl runtime is disabled.\n"; - return; - } - Device dev = {kDLOpenCL, 0}; - Allocator* allocator = MemoryManagerWrapper::GetOrCreateAllocator(dev, kNaive); - EXPECT_EQ(allocator->UsedMemory(), 0); - auto dt = DataType::Float(32); - size_t nbytes = 1 * 3 * 6 * 6 * dt.bytes(); - ShapeTuple shape = {1, 3, 6, 6}; - auto buff = allocator->Alloc(dev, shape, dt); - EXPECT_EQ(allocator->UsedMemory(), nbytes); - allocator->Free(buff); - EXPECT_EQ(allocator->UsedMemory(), 0); - - auto texture = allocator->Alloc(dev, shape, dt, "global.texture"); - EXPECT_EQ(allocator->UsedMemory(), nbytes); - allocator->Free(texture); - EXPECT_EQ(allocator->UsedMemory(), 0); -} - -TEST_F(TvmVMMemoryManagerTest, PooledAllocOpenCLTexture) { - bool enabled = tvm::runtime::RuntimeEnabled("opencl"); - if (!enabled) { - LOG(INFO) << "Skip OpenCL Texture alloc test because opencl runtime is 
disabled.\n"; - return; - } - Device dev = {kDLOpenCL, 0}; - Allocator* allocator = MemoryManagerWrapper::GetOrCreateAllocator(dev, kPooled); - EXPECT_EQ(allocator->UsedMemory(), 0); - auto dt = DataType::Float(32); - size_t nbytes = 1 * 3 * 6 * 6 * dt.bytes(); - size_t page_size = PooledAllocator::kDefaultPageSize; - size_t size = ((nbytes + page_size - 1) / page_size) * page_size; - ShapeTuple shape = {1, 3, 6, 6}; - auto buff = allocator->Alloc(dev, shape, dt); - EXPECT_EQ(allocator->UsedMemory(), size); - allocator->Free(buff); - EXPECT_EQ(allocator->UsedMemory(), size); - - try { - auto texture = allocator->Alloc(dev, shape, dt, "global.texture"); - (void)texture; - FAIL(); - } catch (std::exception& e) { - std::string pattern = "This alloc should be implemented"; - std::string what = e.what(); - EXPECT_NE(what.find(pattern), std::string::npos) << what; - } -} } // namespace memory } // namespace runtime } // namespace tvm diff --git a/tests/cpp/texture_copy_test.cc b/tests/cpp/texture_copy_test.cc index 92c12bafdd9a..63e2ac1a0af4 100644 --- a/tests/cpp/texture_copy_test.cc +++ b/tests/cpp/texture_copy_test.cc @@ -98,39 +98,28 @@ TEST(TextureCopy, OverwritePoolSubview) { static_cast(cpu_pool0->data)[i] = random(mt); } - // Random initialize host array - for (int64_t h = 0; h < shape[0]; h++) { - for (int64_t w = 0; w < shape[1]; w++) { - for (int64_t rgba = 0; rgba < shape[2]; rgba++) { - static_cast(cpu_arr0->data)[shape[1] * shape[2] * h + shape[2] * w + rgba] = 1.1f; - } - } + // Random initialize host array storage + for (size_t i = 0; i < size; i++) { + static_cast(cpu_arr0->data)[i] = random(mt); } - // Copy to texture pool for initialization + // Loop through pool cpu_pool0.CopyTo(opencl_txpool); - // Copy host data to subview into texture storage - cpu_arr0.CopyTo(opencl_txarr0); - // Copy modified pool back opencl_txpool.CopyTo(cpu_pool1); - // Check that modifications to pool follow two dimensional - // strides according to the written texture shape. 
- for (int64_t h = 0; h < shape_pool[0]; h++) { - for (int64_t w = 0; w < shape_pool[1]; w++) { - for (int64_t rgba = 0; rgba < shape_pool[2]; rgba++) { - size_t i = shape_pool[1] * shape_pool[2] * h + shape_pool[2] * w + rgba; - if (h < shape[0] && w < shape[1] && rgba < shape[2]) { - size_t j = shape[1] * shape[2] * h + shape[2] * w + rgba; - ICHECK_LT(std::fabs(static_cast(cpu_pool1->data)[i] - - static_cast(cpu_arr0->data)[j]), - 1e-5); - } else { - ICHECK_LT(std::fabs(static_cast(cpu_pool1->data)[i] - - static_cast(cpu_pool0->data)[i]), - 1e-5); - } - } - } + for (size_t i = 0; i < size_pool; i++) { + ICHECK_LT(std::fabs(static_cast(cpu_pool0->data)[i] - + static_cast(cpu_pool1->data)[i]), + 1e-5); + } + + // Loop through view + cpu_arr0.CopyTo(opencl_txarr0); + opencl_txarr0.CopyTo(cpu_arr1); + + for (size_t i = 0; i < size; i++) { + ICHECK_LT( + std::fabs(static_cast(cpu_arr0->data)[i] - static_cast(cpu_arr1->data)[i]), + 1e-5); } } diff --git a/tests/python/relay/test_backend_graph_executor.py b/tests/python/relay/test_backend_graph_executor.py index d7b6e13c18b6..133fcd191961 100644 --- a/tests/python/relay/test_backend_graph_executor.py +++ b/tests/python/relay/test_backend_graph_executor.py @@ -179,7 +179,7 @@ def test_plan_memory(): assert ( storage_sizes[0][0] == 40 and storage_sizes[1][0] == 4 - and storage_sizes[2][0] == 4 + and storage_sizes[2][0] == 40 and storage_sizes[3][0] == 40 ) diff --git a/tests/scripts/task_python_integration.sh b/tests/scripts/task_python_integration.sh index 51ef86d05ec7..3202839e50ed 100755 --- a/tests/scripts/task_python_integration.sh +++ b/tests/scripts/task_python_integration.sh @@ -43,8 +43,13 @@ TVM_TEST_TARGETS="${TVM_RELAY_TEST_TARGETS:-llvm;cuda}" \ run_pytest ${TVM_INTEGRATION_TESTSUITE_NAME}-relay tests/python/relay --ignore=tests/python/relay/aot # OpenCL texture test. Deselected specific tests that fails in CI -TVM_TEST_TARGETS="${TVM_RELAY_OPENCL_TEXTURE_TARGETS:-opencl}" \ - run_pytest ${TVM_INTEGRATION_TESTSUITE_NAME}-opencl-texture tests/python/relay/opencl_texture +TEXTURE_TESTS=$(ls tests/python/relay/opencl_texture/test_*) +i=0 +for TEST in $TEXTURE_TESTS; do + TVM_TEST_TARGETS="${TVM_RELAY_OPENCL_TEXTURE_TARGETS:-opencl}" \ + run_pytest "${TVM_INTEGRATION_TESTSUITE_NAME}-opencl-texture-$i" "$TEST" + i=$((i+1)) +done # Command line driver test run_pytest ${TVM_INTEGRATION_TESTSUITE_NAME}-driver tests/python/driver
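
Taken together, the pattern the new runtime tests exercise is: allocate one pooled Buffer, wrap it in a Storage, and carve scoped NDArray views out of it. A condensed sketch under the same assumptions as the earlier one; names are illustrative:

#include <tvm/runtime/memory/memory_manager.h>

void StorageViewSketch() {
  using namespace tvm::runtime;
  DLDevice dev{kDLOpenCL, 0};
  memory::Allocator* alloc =
      memory::MemoryManager::GetOrCreateAllocator(dev, memory::AllocatorType::kPooled);
  ShapeTuple shape = {1, 16, 16, 8, 4};
  memory::Buffer buffer = alloc->Alloc(dev, shape, {kDLFloat, 32, 1});
  memory::Storage stor(buffer, alloc);
  // Two scoped views over the same storage: a flat buffer and a 2D image.
  NDArray flat = stor->AllocNDArrayScoped(0, shape, {kDLFloat, 32, 1}, "global");
  NDArray img = stor->AllocNDArrayScoped(0, shape, {kDLFloat, 32, 1}, "global.texture");
  // Host round trips through either view (cpu.CopyTo(flat); flat.CopyTo(cpu);)
  // land in the same pooled allocation.
}
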