1 change: 0 additions & 1 deletion apps/android_rpc/app/src/main/jni/tvm_runtime.h
@@ -66,7 +66,6 @@
#include "../src/runtime/opencl/opencl_device_api.cc"
#include "../src/runtime/opencl/opencl_module.cc"
#include "../src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc"
#include "../src/runtime/opencl/texture_pool.cc"
#include "../src/runtime/source_utils.cc"
#endif

1 change: 1 addition & 0 deletions include/tvm/runtime/device_api.h
@@ -52,6 +52,7 @@ enum DeviceAttrKind : int {
kL2CacheSizeBytes = 13,
kTotalGlobalMemory = 14,
kAvailableGlobalMemory = 15,
kImagePitchAlignment = 16,
};

#ifdef TVM_KALLOC_ALIGNMENT
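
The new kImagePitchAlignment attribute lets callers ask a device for the row-pitch alignment its image (texture) allocations require. A minimal sketch of how an OpenCL backend might answer that query — the helper below is illustrative, not code from this PR:

#include <CL/cl.h>
#include <cstdint>

// Illustrative helper (assumed, not TVM's implementation): fetch the value
// that kImagePitchAlignment is meant to expose. CL_DEVICE_IMAGE_PITCH_ALIGNMENT
// is a standard OpenCL 2.0 device query returning the required row-pitch
// alignment, in pixels, for images created from a buffer.
int64_t QueryImagePitchAlignment(cl_device_id device) {
  cl_uint pitch_align = 0;
  clGetDeviceInfo(device, CL_DEVICE_IMAGE_PITCH_ALIGNMENT,
                  sizeof(pitch_align), &pitch_align, nullptr);
  return static_cast<int64_t>(pitch_align);
}
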
34 changes: 33 additions & 1 deletion include/tvm/runtime/memory/memory_manager.h
@@ -87,7 +87,26 @@ class Allocator {
* \return A sized allocation in the form of a buffer.
*/
TVM_DLL virtual Buffer Alloc(Device dev, ShapeTuple shape, DLDataType type_hint,
const std::string& mem_scope = "") = 0;
const std::string& mem_scope = "");

/*! \brief Create a view for the buffer given a shape, type and scope.
* \param buffer The existing buffer upon which we need to create a view.
* \param shape The shape of the view.
* \param type_hint A type hint to the view.
* \param mem_scope A memory scope of the view.
* \return A device pointer to the created view.
*/
TVM_DLL virtual void* CreateView(const Buffer& buffer, ShapeTuple shape, DLDataType type_hint,
const std::string& mem_scope = "global") {
return buffer.data;
}

/*! \brief Release the view.
* \param dev The device where the view was created.
* \param data The view pointer to be freed.
*/
TVM_DLL virtual void FreeView(Device dev, void* data) {}

/*! \brief Free a buffer allocated by the allocator.
* \param buffer The buffer to free.
*/
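
These CreateView/FreeView hooks give allocators an optional aliasing mechanism: the base class just returns the buffer's existing pointer, while a texture-capable allocator can return an image view over the same storage. A hedged usage sketch — the "global.texture" scope and the call pattern are assumptions for illustration, not mandated by this header:

#include <tvm/runtime/memory/memory_manager.h>

// Assumed consumer of the new hooks: borrow a texture view over an existing
// buffer, use it, and release it. With the default base implementation the
// view is simply buf.data; an OpenCL allocator would return an image object.
void UseAsTexture(tvm::runtime::memory::Allocator* alloc,
                  const tvm::runtime::memory::Buffer& buf,
                  tvm::runtime::ShapeTuple shape, DLDataType dtype) {
  void* view = alloc->CreateView(buf, shape, dtype, "global.texture");
  // ... launch kernels that read or write through `view` ...
  alloc->FreeView(buf.device, view);
}
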
@@ -147,6 +166,13 @@ class StorageObj : public Object {
/*! \brief Allocate an NDArray from a given piece of storage. */
TVM_DLL NDArray AllocNDArray(int64_t offset, ShapeTuple shape, DLDataType dtype);

/*! \brief Allocate an NDArray with memory scope from a given piece of storage. */
TVM_DLL NDArray AllocNDArrayScoped(int64_t offset, ShapeTuple shape, DLDataType dtype,
String scope = "global");

/*! \brief The deleter for a scoped NDArray allocated from underlying storage. */
static void ScopedDeleter(Object* ptr);

/*! \brief The deleter for an NDArray when allocated from underlying storage. */
static void Deleter(Object* ptr);

@@ -170,6 +196,12 @@ class Storage : public ObjectRef {
};

} // namespace memory

using memory::Allocator;
using memory::AllocatorType;
using memory::MemoryManager;
using memory::StorageObj;

} // namespace runtime
} // namespace tvm
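
AllocNDArrayScoped extends AllocNDArray with a memory-scope argument, so a view carved out of pooled storage can carry a non-default scope and be cleaned up by the matching ScopedDeleter. A brief usage sketch; the scope string and arguments are illustrative:

#include <tvm/runtime/memory/memory_manager.h>

// Assumed usage (not from this PR): carve a texture-scoped NDArray out of an
// existing Storage object at byte offset 0.
tvm::runtime::NDArray MakeScopedView(tvm::runtime::memory::Storage storage,
                                     tvm::runtime::ShapeTuple shape,
                                     DLDataType dtype) {
  return storage->AllocNDArrayScoped(/*offset=*/0, shape, dtype,
                                     "global.texture");
}
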

33 changes: 20 additions & 13 deletions src/relay/backend/graph_plan_memory.cc
@@ -229,6 +229,16 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
VLOG_CONTEXT << "StorageAllocator";
VLOG(1) << "planning:" << std::endl << PrettyPrint(func);
prototype_ = StorageAllocaInit(&arena_).GetInitTokenMap(func);
// Back up the virtual devices, as token reuse might lose the original memory scope
std::unordered_map<const ExprNode*, std::vector<VirtualDevice>> virtual_device_map_;
for (const auto& kv : prototype_) {
std::vector<VirtualDevice> virtual_devices;
virtual_devices.reserve(kv.second.size());
for (StorageToken* tok : kv.second) {
virtual_devices.push_back(tok->virtual_device);
}
virtual_device_map_.insert({kv.first, virtual_devices});
}
this->Run(func);

// The value of smap contains two integer arrays where the first array
@@ -252,9 +262,13 @@
}
num_nodes++;
storage_ids.push_back(tok->storage_id);
virtual_devices.push_back(tok->virtual_device);
sid_sizes_byte.push_back(allocator_.GetMemorySize(tok));
}
ICHECK(kv.second.size() == virtual_device_map_[kv.first].size())
<< "Mismatch of tokens and virtual devices";
for (auto vdev : virtual_device_map_[kv.first]) {
virtual_devices.push_back(vdev);
}
auto storage_info = backend::StorageInfo(std::move(storage_ids), std::move(virtual_devices),
std::move(sid_sizes_byte));
smap.Set(GetRef<Expr>(kv.first), storage_info);
@@ -356,34 +370,27 @@ class StorageAllocator : public StorageAllocaBaseVisitor {

class TokenAllocator {
public:
StorageToken* Alloc(StorageToken* proto) {
return Is2DStorage(proto) ? token_2d_.Alloc(proto, storage_ids_++)
: token_1d_.Alloc(proto, storage_ids_++);
}
StorageToken* Alloc(StorageToken* proto) { return token_mixed_.Alloc(proto, storage_ids_++); }
StorageToken* Request(StorageToken* proto) {
StorageToken* token =
Is2DStorage(proto) ? token_2d_.Request(proto) : token_1d_.Request(proto);
StorageToken* token = token_mixed_.Request(proto);
return token ? token : this->Alloc(proto);
}
void CheckForRelease(StorageToken* tok) {
return Is2DStorage(tok) ? token_2d_.CheckForRelease(tok) : token_1d_.CheckForRelease(tok);
}
void CheckForRelease(StorageToken* tok) { return token_mixed_.CheckForRelease(tok); }

size_t GetMemorySize(StorageToken* tok) {
// TODO(amalyshe): figure out who requires sizes and for what.
// A plain size is not enough for textures - we can return any value if it
// is assumed to be used for memory allocation, or we can return the real size
// if it is just for information.
return Is2DStorage(tok) ? 0 : token_1d_.GetMemorySize(tok);
return token_mixed_.GetMemorySize(tok);
}
static bool Is2DStorage(StorageToken* tok) {
return relay::Is2DStorage(tok->virtual_device->memory_scope);
}

private:
int64_t storage_ids_{0};
TokenAllocator1D token_1d_;
TokenAllocator2D token_2d_;
TokenAllocatorMixed token_mixed_;
};

private:
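
To see why the planner now snapshots virtual devices before running token reuse, consider this toy reduction (plain C++, no TVM types, purely illustrative): once two expressions share a pooled token, reading the scope back off the token reports the pool's scope rather than what each expression originally requested.

#include <cassert>
#include <string>
#include <vector>

struct Token { std::string scope; };  // stand-in for StorageToken

int main() {
  Token pooled{"global.texture"};  // the first request fixes the token's scope
  // A second request, asking for scope "global", reuses the same token:
  std::vector<Token*> per_expr = {&pooled, &pooled};
  std::vector<std::string> requested = {"global.texture", "global"};
  // Without a snapshot, expression 1's scope would be reported incorrectly:
  assert(per_expr[1]->scope != requested[1]);
  // The fix above records `requested` up front and emits it into StorageInfo.
  return 0;
}
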
184 changes: 69 additions & 115 deletions src/relay/backend/token_allocator.cc
@@ -31,22 +31,45 @@

namespace tvm {
namespace relay {
constexpr auto Is2DStorage = runtime::IsTextureStorage;

size_t TokenAllocator1D::GetMemorySize(StorageToken* prototype) {
/*
* Mixed-mode memory allocator
*/
size_t TokenAllocatorMixed::GetMemorySize(StorageToken* prototype) {
TensorType ttype = prototype->ttype;
ICHECK(ttype.defined());
size_t size = 1;
for (IndexExpr dim : ttype->shape) {
const int64_t* pval = tir::as_const_int(dim);
ICHECK(pval != nullptr) << "Cannot allocate memory for symbolic tensor shape " << ttype->shape;
ICHECK_GE(*pval, 0) << "Cannot allocate memory for tensor with negative shape " << *pval;
size *= static_cast<size_t>(pval[0]);
if (relay::Is2DStorage(prototype->virtual_device->memory_scope)) {
size = GetSize2D(prototype);
} else {
for (IndexExpr dim : ttype->shape) {
const int64_t* pval = tir::as_const_int(dim);
ICHECK(pval != nullptr) << "Cannot allocate memory for symbolic tensor shape " << ttype->shape;
ICHECK_GE(*pval, 0) << "Cannot allocate memory for tensor with negative shape " << *pval;
size *= static_cast<size_t>(pval[0]);
}
size *= DivRoundUp(ttype->dtype.bits() * ttype->dtype.lanes(), 8);
}
size *= DivRoundUp(ttype->dtype.bits() * ttype->dtype.lanes(), 8);
return size;
}

StorageToken* TokenAllocator1D::Request(StorageToken* prototype) {
String GetDeviceCompatibleToken(StorageToken* tok) {
Target null_tgt{nullptr};
if (null_tgt == tok->virtual_device->target) {
return tok->virtual_device->memory_scope;
}
std::string dev_kind = tok->virtual_device->target->kind->name;
auto* device_scope_handler = tvm::runtime::Registry::Get("DeviceScopeCompatibility." + dev_kind);
if (device_scope_handler) {
String dev_scope =
(*device_scope_handler)(tok->virtual_device->target, tok->virtual_device->memory_scope);
return dev_scope;
}
return tok->virtual_device->memory_scope;
}
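
GetDeviceCompatibleToken consults an optional per-target handler registered under "DeviceScopeCompatibility.<target kind>" to decide which memory scopes may share storage. A hedged sketch of how a backend could register such a handler — the collapsing rule below is an assumption for illustration, not necessarily what TVM's OpenCL backend installs:

#include <string>
#include <tvm/runtime/registry.h>
#include <tvm/target/target.h>

// Illustrative handler: treat every texture flavor as one reusable scope.
TVM_REGISTER_GLOBAL("DeviceScopeCompatibility.opencl")
    .set_body_typed([](tvm::Target target, tvm::runtime::String scope) {
      std::string s = scope;
      if (s.find("texture") != std::string::npos) {
        return tvm::runtime::String("global.texture");
      }
      return scope;
    });
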

StorageToken* TokenAllocatorMixed::Request(StorageToken* prototype) {
// calculate the size
size_t size = GetMemorySize(prototype);
// search memory block in [size / match_range_, size * match_range_)
@@ -59,142 +82,73 @@ StorageToken* TokenAllocator1D::Request(StorageToken* prototype) {
// search for memory blocks larger than requested
for (auto it = mid; it != end; ++it) {
StorageToken* tok = it->second;
if (!tok->is_compatible(*prototype)) continue;
ICHECK_EQ(tok->ref_counter, 0);
// Use exact matching strategy
tok->max_bytes = std::max(size, tok->max_bytes);
tok->ref_counter = prototype->ref_counter;
// find an exact match, erase from map and return
free_.erase(it);
return tok;
bool dev_compatible = (GetDeviceCompatibleToken(tok) == GetDeviceCompatibleToken(prototype));
if (tok->is_compatible(*prototype) || (dev_compatible)) {
ICHECK_EQ(tok->ref_counter, 0);
// Use exact matching strategy
if (size > tok->max_bytes) {
tok->max_bytes = size;
tok->ttype = prototype->ttype;
}
tok->ref_counter = prototype->ref_counter;
// find an exact match, erase from map and return
free_.erase(it);
return tok;
}
}
// then search for memory blocks smaller than requested space
for (auto it = mid; it != begin;) {
--it;
StorageToken* tok = it->second;
if (!tok->is_compatible(*prototype)) continue;
ICHECK_EQ(tok->ref_counter, 0);
// Use exact matching strategy
tok->max_bytes = std::max(size, tok->max_bytes);
tok->ref_counter = prototype->ref_counter;
// erase from map and return
free_.erase(it);
return tok;
bool dev_compatible = (GetDeviceCompatibleToken(tok) == GetDeviceCompatibleToken(prototype));
if (tok->is_compatible(*prototype) || (dev_compatible)) {
ICHECK_EQ(tok->ref_counter, 0);
// Use exact matching strategy
if (size > tok->max_bytes) {
tok->max_bytes = size;
tok->ttype = prototype->ttype;
}
tok->ref_counter = prototype->ref_counter;
// erase from map and return
free_.erase(it);
return tok;
}
}
return nullptr;
}

StorageToken* TokenAllocator1D::Alloc(StorageToken* prototype, int64_t storage_id) {
StorageToken* TokenAllocatorMixed::Alloc(StorageToken* prototype, int64_t storage_id) {
size_t size = GetMemorySize(prototype);
prototype->max_bytes = size;
prototype->storage_id = storage_id;
data_.push_back(prototype);
return prototype;
}

void TokenAllocator1D::CheckForRelease(StorageToken* tok) {
void TokenAllocatorMixed::CheckForRelease(StorageToken* tok) {
ICHECK_GE(tok->storage_id, 0);
ICHECK_GE(tok->ref_counter, 0);
if (tok->ref_counter == 0) {
free_.insert({tok->max_bytes, tok});
}
}

StorageToken* TokenAllocator2D::Request(StorageToken* prototype) {
auto shape = GetSize2D(prototype);
const int64_t max_ratio = 5;
int64_t min_added_size_x = std::numeric_limits<int64_t>::max();
int64_t min_added_size_y = std::numeric_limits<int64_t>::max();
int64_t min_wasted_size_x = std::numeric_limits<int64_t>::max();
int64_t min_wasted_size_y = std::numeric_limits<int64_t>::max();
int64_t best_storage_id = -1;
MemBlock new_mem;
for (int64_t free_id : free_list_) {
MemBlock& cached = blocks_[free_id];
// Can only reuse texture 2d blocks of the same type
if (cached.token_->ttype->dtype != prototype->ttype->dtype) {
continue;
}
// Can only reuse texture 2d blocks of the same scope
// Because reusing textures with different memory scope may lead to
// accuracy issues, because the data will be packed in a different way for
// different memory scopes.
if (cached.token_->virtual_device->memory_scope != prototype->virtual_device->memory_scope) {
continue;
}
// avoid reusing too small and too big textures
if (shape.width / cached.x_ > max_ratio || cached.x_ / shape.width > max_ratio ||
shape.height / cached.y_ > max_ratio || cached.y_ / shape.height > max_ratio) {
continue;
}
int64_t new_width = std::max(cached.x_, shape.width);
int64_t new_height = std::max(cached.y_, shape.height);
int64_t added_size_x = new_width - cached.x_;
int64_t added_size_y = new_height - cached.y_;
int64_t wasted_size_x = new_width - shape.width;
int64_t wasted_size_y = new_height - shape.height;
// Prioritize minimization of added size first, then minimize
// wasted size among blocks which would not require expansion
if ((min_added_size_x > 0 && added_size_x < min_added_size_x) ||
(min_added_size_y > 0 && added_size_y < min_added_size_y) ||
(min_added_size_x == added_size_x && wasted_size_x < min_wasted_size_x) ||
(min_added_size_y == added_size_y && wasted_size_y < min_wasted_size_y)) {
min_added_size_x = added_size_x;
min_added_size_y = added_size_y;
min_wasted_size_x = wasted_size_x;
min_wasted_size_y = wasted_size_y;
best_storage_id = free_id;
new_mem.x_ = new_width;
new_mem.y_ = new_height;
}
}

if (min_added_size_x == 0 && min_added_size_y == 0) {
// use existing block
free_list_.erase(best_storage_id);
blocks_[best_storage_id].token_->ref_counter += prototype->ref_counter;
return blocks_[best_storage_id].token_;
} else if (min_added_size_x <= shape.width || min_added_size_y <= shape.height) {
// Reset the reference counter of the now live token
free_list_.erase(best_storage_id);
new_mem.token_ = prototype;
new_mem.token_->ref_counter += 1;
new_mem.token_->storage_id = best_storage_id;
blocks_[best_storage_id] = new_mem;
return new_mem.token_;
}
return nullptr;
}

StorageToken* TokenAllocator2D::Alloc(StorageToken* prototype, int64_t storage_id) {
auto shape = GetSize2D(prototype);
MemBlock block;
block.x_ = shape.width;
block.y_ = shape.height;
prototype->storage_id = storage_id;
block.token_ = prototype;
blocks_[prototype->storage_id] = block;
return prototype;
}

void TokenAllocator2D::CheckForRelease(StorageToken* tok) {
ICHECK_GE(tok->storage_id, 0);
ICHECK_GE(tok->ref_counter, 0);
if (tok->ref_counter == 0) {
free_list_.insert(tok->storage_id);
}
}

runtime::Texture2DShape<int64_t> TokenAllocator2D::GetSize2D(StorageToken* prototype) {
size_t TokenAllocatorMixed::GetSize2D(StorageToken* prototype) {
TensorType ttype = prototype->ttype;
ICHECK(ttype.defined());
size_t axis = runtime::DefaultTextureLayoutSeparator(ttype->shape.size(),
prototype->virtual_device->memory_scope);
struct Shape {
const Array<PrimExpr>& shape;
int64_t operator[](size_t i) const { return *tir::as_const_int(shape[i]); }
int size() { return this->shape.size(); }
};
return runtime::ApplyTexture2DFlattening<int64_t>(Shape{ttype->shape}, ttype->shape.size(), axis);
auto shape = Shape{ttype->shape};
int image_row_align =
prototype->virtual_device->target->GetAttr<Integer>("image_base_address_alignment")
.value_or(Integer(64))
->value;
return runtime::GetTextureMemorySize<Shape>(shape, ttype->dtype.bits(), ttype->dtype.lanes(),
prototype->virtual_device->memory_scope,
image_row_align);
}

} // namespace relay
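
GetSize2D now delegates to runtime::GetTextureMemorySize, passing a row alignment read from the target's image_base_address_alignment attribute (default 64). The computation it relies on amounts to padding each image row up to that alignment before multiplying by the number of rows; a simplified model under that assumption (the real helper may account for more):

#include <cstddef>
#include <cstdint>

// Simplified texture-size model: each row of `width` texels (`lanes` channels
// of `bits` bits each) is padded so the next row starts on a `row_align`-byte
// boundary.
size_t TextureBytes(int64_t width, int64_t height, int bits, int lanes,
                    int64_t row_align) {
  int64_t row_bytes = width * lanes * ((bits + 7) / 8);
  int64_t row_pitch = (row_bytes + row_align - 1) / row_align * row_align;
  return static_cast<size_t>(row_pitch * height);
}
// Example: a 100x64 RGBA fp16 image with 64-byte alignment gives
// row_bytes = 100 * 4 * 2 = 800, row_pitch = 832, total = 832 * 64 = 53248.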