From c12fb2b8ec40610db470803051736e2137ffd3cd Mon Sep 17 00:00:00 2001
From: jim19930609 <jim19930609@gmail.com>
Date: Wed, 8 Jun 2022 14:00:18 +0800
Subject: [PATCH 1/7] [aot] [llvm] Implemented FieldCacheData and refactored
 initialize_llvm_runtime_snodes()

---
 taichi/llvm/llvm_offline_cache.h |  31 +++++-
 taichi/llvm/llvm_program.cpp     | 177 ++++++++++++++++++-------------
 taichi/llvm/llvm_program.h       |   6 +-
 3 files changed, 139 insertions(+), 75 deletions(-)
diff --git a/taichi/llvm/llvm_offline_cache.h b/taichi/llvm/llvm_offline_cache.h
index c82837e42521e..fe9eca8f8ba96 100644
--- a/taichi/llvm/llvm_offline_cache.h
+++ b/taichi/llvm/llvm_offline_cache.h
@@ -42,7 +42,36 @@ struct LlvmOfflineCache {
     TI_IO_DEF(kernel_key, args, offloaded_task_list);
   };
 
-  std::unordered_map<std::string, KernelCacheData> kernels;
+  struct FieldCacheData {
+    struct SNodeCacheData {
+      int id;
+      int type;
+      size_t cell_size_bytes;
+      size_t chunk_size;
+
+      TI_IO_DEF(id, type, cell_size_bytes, chunk_size);
+    };
+
+    int tree_id;
+    size_t root_size;
+    std::vector<SNodeCacheData> snode_metas;
+
+    TI_IO_DEF(tree_id, root_size, snode_metas);
+
+    // TODO(zhanlue)
+    //  Serialize/Deserialize the llvm::Module from StructCompiler
+    //  At runtime, make sure loaded Field-Modules and Kernel-Modules are linked
+    //  altogether.
+  };
+
+  // TODO(zhanlue): we need a better identifier for each FieldCacheData
+  // (SNodeTree) Given that snode_tree_id is not continuous, it is ridiculous to
+  // ask the users to remember each of the snode_tree_ids
+  // ** Find a way to name each SNodeTree **
+  std::unordered_map<int, FieldCacheData> fields;  // key = snode_tree_id
+
+  std::unordered_map<std::string, KernelCacheData>
+      kernels;  // key = kernel_name
 
   TI_IO_DEF(kernels);
 };
diff --git a/taichi/llvm/llvm_program.cpp b/taichi/llvm/llvm_program.cpp
index 65bc1a75e12e1..e922a9ebe8026 100644
--- a/taichi/llvm/llvm_program.cpp
+++ b/taichi/llvm/llvm_program.cpp
@@ -158,95 +158,105 @@ LlvmProgramImpl::clone_struct_compiler_initial_context(
   return tlctx->clone_runtime_module();
 }
 
-void LlvmProgramImpl::initialize_llvm_runtime_snodes(const SNodeTree *tree,
-                                                     StructCompiler *scomp,
-                                                     uint64 *result_buffer) {
-  TaichiLLVMContext *tlctx = nullptr;
-  if (config->arch == Arch::cuda) {
+void LlvmProgramImpl::initialize_llvm_runtime_snodes(
+    const LlvmOfflineCache::FieldCacheData &field_cache_data,
+    uint64 *result_buffer);
+TaichiLLVMContext *tlctx = nullptr;
+if (config->arch == Arch::cuda) {
 #if defined(TI_WITH_CUDA)
-    tlctx = llvm_context_device_.get();
+  tlctx = llvm_context_device_.get();
 #else
-    TI_NOT_IMPLEMENTED
+  TI_NOT_IMPLEMENTED
 #endif
-  } else {
-    tlctx = llvm_context_host_.get();
-  }
+} else {
+  tlctx = llvm_context_host_.get();
+}
 
-  auto *const runtime_jit = tlctx->runtime_jit_module;
-  // By the time this creator is called, "this" is already destroyed.
-  // Therefore it is necessary to capture members by values.
-  const auto snodes = scomp->snodes;
-  const int root_id = tree->root()->id;
-
-  TI_TRACE("Allocating data structure of size {} bytes", scomp->root_size);
-  std::size_t rounded_size =
-      taichi::iroundup(scomp->root_size, taichi_page_size);
-
-  Ptr root_buffer = snode_tree_buffer_manager_->allocate(
-      runtime_jit, llvm_runtime_, rounded_size, taichi_page_size, tree->id(),
-      result_buffer);
-  if (config->arch == Arch::cuda) {
+auto *const runtime_jit = tlctx -> runtime_jit_module;
+// By the time this creator is called, "this" is already destroyed.
+// Therefore it is necessary to capture members by values.
+size_t root_size = field_cache_data.root_size;
+const auto snode_metas = field_cache_data.snode_metas;
+const int root_id = field_cache_data.tree_id;
+
+TI_TRACE("Allocating data structure of size {} bytes", root_size);
+std::size_t rounded_size = taichi::iroundup(root_size, taichi_page_size);
+
+Ptr root_buffer = snode_tree_buffer_manager_->allocate(runtime_jit,
+                                                       llvm_runtime_,
+                                                       rounded_size,
+                                                       taichi_page_size,
+                                                       root_id,
+                                                       result_buffer);
+if (config->arch == Arch::cuda) {
 #if defined(TI_WITH_CUDA)
-    CUDADriver::get_instance().memset(root_buffer, 0, rounded_size);
+  CUDADriver::get_instance().memset(root_buffer, 0, rounded_size);
 #else
-    TI_NOT_IMPLEMENTED
+  TI_NOT_IMPLEMENTED
 #endif
-  } else {
-    std::memset(root_buffer, 0, rounded_size);
-  }
+} else {
+  std::memset(root_buffer, 0, rounded_size);
+}
 
-  DeviceAllocation alloc{kDeviceNullAllocation};
+DeviceAllocation alloc{kDeviceNullAllocation};
 
-  if (config->arch == Arch::cuda) {
+if (config->arch == Arch::cuda) {
 #if defined(TI_WITH_CUDA)
-    alloc = cuda_device()->import_memory(root_buffer, rounded_size);
+  alloc = cuda_device()->import_memory(root_buffer, rounded_size);
 #else
-    TI_NOT_IMPLEMENTED
+  TI_NOT_IMPLEMENTED
 #endif
-  } else {
-    alloc = cpu_device()->import_memory(root_buffer, rounded_size);
-  }
+} else {
+  alloc = cpu_device()->import_memory(root_buffer, rounded_size);
+}
 
-  snode_tree_allocs_[tree->id()] = alloc;
+snode_tree_allocs_[tree->id()] = alloc;
 
-  bool all_dense = config->demote_dense_struct_fors;
-  for (int i = 0; i < (int)snodes.size(); i++) {
-    if (snodes[i]->type != SNodeType::dense &&
-        snodes[i]->type != SNodeType::place &&
-        snodes[i]->type != SNodeType::root) {
-      all_dense = false;
-      break;
-    }
+bool all_dense = config->demote_dense_struct_fors;
+for (size_t i = 0; i < snode_metas.size(); i++) {
+  if (snode_metas[i]->type != SNodeType::dense &&
+      snode_metas[i]->type != SNodeType::place &&
+      snode_metas[i]->type != SNodeType::root) {
+    all_dense = false;
+    break;
   }
+}
 
-  runtime_jit->call<void *, std::size_t, int, int, int, std::size_t, Ptr>(
-      "runtime_initialize_snodes", llvm_runtime_, scomp->root_size, root_id,
-      (int)snodes.size(), tree->id(), rounded_size, root_buffer, all_dense);
-
-  for (int i = 0; i < (int)snodes.size(); i++) {
-    if (is_gc_able(snodes[i]->type)) {
-      const auto snode_id = snodes[i]->id;
-      std::size_t node_size;
-      auto element_size = snodes[i]->cell_size_bytes;
-      if (snodes[i]->type == SNodeType::pointer) {
-        // pointer. Allocators are for single elements
-        node_size = element_size;
-      } else {
-        // dynamic. Allocators are for the chunks
-        node_size = sizeof(void *) + element_size * snodes[i]->chunk_size;
-      }
-      TI_TRACE("Initializing allocator for snode {} (node size {})", snode_id,
-               node_size);
-      auto rt = llvm_runtime_;
-      runtime_jit->call<void *, int, std::size_t>(
-          "runtime_NodeAllocator_initialize", rt, snode_id, node_size);
-      TI_TRACE("Allocating ambient element for snode {} (node size {})",
-               snode_id, node_size);
-      runtime_jit->call<void *, int>("runtime_allocate_ambient", rt, snode_id,
-                                     node_size);
+runtime_jit->call<void *, std::size_t, int, int, int, std::size_t, Ptr>(
+    "runtime_initialize_snodes",
+    llvm_runtime_,
+    root_size,
+    root_id,
+    (int)snode_metas.size(),
+    root_id,
+    rounded_size,
+    root_buffer,
+    all_dense);
+
+for (size_t i = 0; i < snode_metas.size(); i++) {
+  if (is_gc_able(snode_metas[i]->type)) {
+    const auto snode_id = snode_metas[i].id;
+    std::size_t node_size;
+    auto element_size = snode_metas[i].cell_size_bytes;
+    if (snode_metas[i].type == SNodeType::pointer) {
+      // pointer. Allocators are for single elements
+      node_size = element_size;
+    } else {
+      // dynamic. Allocators are for the chunks
+      node_size = sizeof(void *) + element_size * snode_metas[i].chunk_size;
     }
+    TI_TRACE("Initializing allocator for snode {} (node size {})", snode_id,
+             node_size);
+    auto rt = llvm_runtime_;
+    runtime_jit->call<void *, int, std::size_t>(
+        "runtime_NodeAllocator_initialize", rt, snode_id, node_size);
+    TI_TRACE("Allocating ambient element for snode {} (node size {})", snode_id,
+             node_size);
+    runtime_jit->call<void *, int>("runtime_allocate_ambient", rt, snode_id,
+                                   node_size);
   }
 }
+}
 
 std::unique_ptr<StructCompiler> LlvmProgramImpl::compile_snode_tree_types_impl(
     SNodeTree *tree) {
@@ -275,10 +285,35 @@ void LlvmProgramImpl::compile_snode_tree_types(SNodeTree *tree) {
   compile_snode_tree_types_impl(tree);
 }
 
+static LlvmOfflineCache::FieldCacheData construct_filed_cache_data(
+    const SNodeTree &tree,
+    const StructCompiler &struct_compiler) {
+  TI_ASSERT(tree.id == tree.root()->id);
+
+  LlvmOfflineCache::FieldCacheData ret;
+  ret.tree_id = tree.id;
+  ret.root_size = struct_compiler.root_size;
+
+  const auto &snodes = struct_compiler.snodes;
+  for (size_t i = 0; i < snodes.size(); i++) {
+    LlvmOfflineCache::FieldCacheData::SNodeCacheData snode_cache_data;
+    snode_cache_data.id = snodes[i]->id;
+    snode_cache_data.type = snodes[i]->type;
+    snode_cache_data.cell_size_bytes = snodes[i]->cell_size_bytes;
+    snode_cache_data.chunk_size = snodes[i]->chunk_size;
+
+    ret.snode_metas.emplace_back(std::move(snode_cache_data));
+  }
+
+  return ret;
+}
+
 void LlvmProgramImpl::materialize_snode_tree(SNodeTree *tree,
                                              uint64 *result_buffer) {
   auto struct_compiler = compile_snode_tree_types_impl(tree);
-  initialize_llvm_runtime_snodes(tree, struct_compiler.get(), result_buffer);
+
+  auto field_cache_data = construct_filed_cache_data(*tree, *struct_compiler);
+  initialize_llvm_runtime_snodes(field_cache_data, result_buffer);
 }
 
 uint64 LlvmProgramImpl::fetch_result_uint64(int i, uint64 *result_buffer) {
diff --git a/taichi/llvm/llvm_program.h b/taichi/llvm/llvm_program.h
index c9029bbcd85f0..69378ee660bf1 100644
--- a/taichi/llvm/llvm_program.h
+++ b/taichi/llvm/llvm_program.h
@@ -132,9 +132,9 @@ class LlvmProgramImpl : public ProgramImpl {
   /**
    * Initializes the SNodes for LLVM based backends.
    */
-  void initialize_llvm_runtime_snodes(const SNodeTree *tree,
-                                      StructCompiler *scomp,
-                                      uint64 *result_buffer);
+  void initialize_llvm_runtime_snodes(
+      const LlvmOfflineCache::FieldCacheData &field_cache_data,
+      uint64 *result_buffer);
 
   uint64 fetch_result_uint64(int i, uint64 *result_buffer);
 

From e3f1ab8ac70798f8b14e5e4750c9ea43c3544b74 Mon Sep 17 00:00:00 2001
From: jim19930609 <jim19930609@gmail.com>
Date: Wed, 8 Jun 2022 14:15:47 +0800
Subject: [PATCH 2/7] Addressed compilation errors

---
 taichi/llvm/llvm_offline_cache.h |   3 +-
 taichi/llvm/llvm_program.cpp     | 152 +++++++++++++++----------------
 2 files changed, 73 insertions(+), 82 deletions(-)

diff --git a/taichi/llvm/llvm_offline_cache.h b/taichi/llvm/llvm_offline_cache.h
index fe9eca8f8ba96..91386c2bffb31 100644
--- a/taichi/llvm/llvm_offline_cache.h
+++ b/taichi/llvm/llvm_offline_cache.h
@@ -45,7 +45,7 @@ struct LlvmOfflineCache {
   struct FieldCacheData {
     struct SNodeCacheData {
       int id;
-      int type;
+      SNodeType type;
       size_t cell_size_bytes;
       size_t chunk_size;
 
@@ -53,6 +53,7 @@ struct LlvmOfflineCache {
     };
 
     int tree_id;
+    int root_id;
     size_t root_size;
     std::vector<SNodeCacheData> snode_metas;
 
diff --git a/taichi/llvm/llvm_program.cpp b/taichi/llvm/llvm_program.cpp
index e922a9ebe8026..eea60dad165f7 100644
--- a/taichi/llvm/llvm_program.cpp
+++ b/taichi/llvm/llvm_program.cpp
@@ -160,103 +160,94 @@ LlvmProgramImpl::clone_struct_compiler_initial_context(
 
 void LlvmProgramImpl::initialize_llvm_runtime_snodes(
     const LlvmOfflineCache::FieldCacheData &field_cache_data,
-    uint64 *result_buffer);
-TaichiLLVMContext *tlctx = nullptr;
-if (config->arch == Arch::cuda) {
+    uint64 *result_buffer) {
+  TaichiLLVMContext *tlctx = nullptr;
+  if (config->arch == Arch::cuda) {
 #if defined(TI_WITH_CUDA)
-  tlctx = llvm_context_device_.get();
+    tlctx = llvm_context_device_.get();
 #else
-  TI_NOT_IMPLEMENTED
+    TI_NOT_IMPLEMENTED
 #endif
-} else {
-  tlctx = llvm_context_host_.get();
-}
+  } else {
+    tlctx = llvm_context_host_.get();
+  }
 
-auto *const runtime_jit = tlctx -> runtime_jit_module;
-// By the time this creator is called, "this" is already destroyed.
-// Therefore it is necessary to capture members by values.
-size_t root_size = field_cache_data.root_size;
-const auto snode_metas = field_cache_data.snode_metas;
-const int root_id = field_cache_data.tree_id;
-
-TI_TRACE("Allocating data structure of size {} bytes", root_size);
-std::size_t rounded_size = taichi::iroundup(root_size, taichi_page_size);
-
-Ptr root_buffer = snode_tree_buffer_manager_->allocate(runtime_jit,
-                                                       llvm_runtime_,
-                                                       rounded_size,
-                                                       taichi_page_size,
-                                                       root_id,
-                                                       result_buffer);
-if (config->arch == Arch::cuda) {
+  auto *const runtime_jit = tlctx->runtime_jit_module;
+  // By the time this creator is called, "this" is already destroyed.
+  // Therefore it is necessary to capture members by values.
+  size_t root_size = field_cache_data.root_size;
+  const auto snode_metas = field_cache_data.snode_metas;
+  const int tree_id = field_cache_data.tree_id;
+  const int root_id = field_cache_data.root_id;
+
+  TI_TRACE("Allocating data structure of size {} bytes", root_size);
+  std::size_t rounded_size = taichi::iroundup(root_size, taichi_page_size);
+
+  Ptr root_buffer = snode_tree_buffer_manager_->allocate(
+      runtime_jit, llvm_runtime_, rounded_size, taichi_page_size, tree_id,
+      result_buffer);
+  if (config->arch == Arch::cuda) {
 #if defined(TI_WITH_CUDA)
-  CUDADriver::get_instance().memset(root_buffer, 0, rounded_size);
+    CUDADriver::get_instance().memset(root_buffer, 0, rounded_size);
 #else
-  TI_NOT_IMPLEMENTED
+    TI_NOT_IMPLEMENTED
 #endif
-} else {
-  std::memset(root_buffer, 0, rounded_size);
-}
+  } else {
+    std::memset(root_buffer, 0, rounded_size);
+  }
 
-DeviceAllocation alloc{kDeviceNullAllocation};
+  DeviceAllocation alloc{kDeviceNullAllocation};
 
-if (config->arch == Arch::cuda) {
+  if (config->arch == Arch::cuda) {
 #if defined(TI_WITH_CUDA)
-  alloc = cuda_device()->import_memory(root_buffer, rounded_size);
+    alloc = cuda_device()->import_memory(root_buffer, rounded_size);
 #else
-  TI_NOT_IMPLEMENTED
+    TI_NOT_IMPLEMENTED
 #endif
-} else {
-  alloc = cpu_device()->import_memory(root_buffer, rounded_size);
-}
+  } else {
+    alloc = cpu_device()->import_memory(root_buffer, rounded_size);
+  }
 
-snode_tree_allocs_[tree->id()] = alloc;
+  snode_tree_allocs_[tree_id] = alloc;
 
-bool all_dense = config->demote_dense_struct_fors;
-for (size_t i = 0; i < snode_metas.size(); i++) {
-  if (snode_metas[i]->type != SNodeType::dense &&
-      snode_metas[i]->type != SNodeType::place &&
-      snode_metas[i]->type != SNodeType::root) {
-    all_dense = false;
-    break;
+  bool all_dense = config->demote_dense_struct_fors;
+  for (size_t i = 0; i < snode_metas.size(); i++) {
+    if (snode_metas[i].type != SNodeType::dense &&
+        snode_metas[i].type != SNodeType::place &&
+        snode_metas[i].type != SNodeType::root) {
+      all_dense = false;
+      break;
+    }
   }
-}
 
-runtime_jit->call<void *, std::size_t, int, int, int, std::size_t, Ptr>(
-    "runtime_initialize_snodes",
-    llvm_runtime_,
-    root_size,
-    root_id,
-    (int)snode_metas.size(),
-    root_id,
-    rounded_size,
-    root_buffer,
-    all_dense);
-
-for (size_t i = 0; i < snode_metas.size(); i++) {
-  if (is_gc_able(snode_metas[i]->type)) {
-    const auto snode_id = snode_metas[i].id;
-    std::size_t node_size;
-    auto element_size = snode_metas[i].cell_size_bytes;
-    if (snode_metas[i].type == SNodeType::pointer) {
-      // pointer. Allocators are for single elements
-      node_size = element_size;
-    } else {
-      // dynamic. Allocators are for the chunks
-      node_size = sizeof(void *) + element_size * snode_metas[i].chunk_size;
+  runtime_jit->call<void *, std::size_t, int, int, int, std::size_t, Ptr>(
+      "runtime_initialize_snodes", llvm_runtime_, root_size, root_id,
+      (int)snode_metas.size(), tree_id, rounded_size, root_buffer, all_dense);
+
+  for (size_t i = 0; i < snode_metas.size(); i++) {
+    if (is_gc_able(snode_metas[i].type)) {
+      const auto snode_id = snode_metas[i].id;
+      std::size_t node_size;
+      auto element_size = snode_metas[i].cell_size_bytes;
+      if (snode_metas[i].type == SNodeType::pointer) {
+        // pointer. Allocators are for single elements
+        node_size = element_size;
+      } else {
+        // dynamic. Allocators are for the chunks
+        node_size = sizeof(void *) + element_size * snode_metas[i].chunk_size;
+      }
+      TI_TRACE("Initializing allocator for snode {} (node size {})", snode_id,
+               node_size);
+      auto rt = llvm_runtime_;
+      runtime_jit->call<void *, int, std::size_t>(
+          "runtime_NodeAllocator_initialize", rt, snode_id, node_size);
+      TI_TRACE("Allocating ambient element for snode {} (node size {})",
+               snode_id, node_size);
+      runtime_jit->call<void *, int>("runtime_allocate_ambient", rt, snode_id,
+                                     node_size);
     }
-    TI_TRACE("Initializing allocator for snode {} (node size {})", snode_id,
-             node_size);
-    auto rt = llvm_runtime_;
-    runtime_jit->call<void *, int, std::size_t>(
-        "runtime_NodeAllocator_initialize", rt, snode_id, node_size);
-    TI_TRACE("Allocating ambient element for snode {} (node size {})", snode_id,
-             node_size);
-    runtime_jit->call<void *, int>("runtime_allocate_ambient", rt, snode_id,
-                                   node_size);
   }
 }
-}
 
 std::unique_ptr<StructCompiler> LlvmProgramImpl::compile_snode_tree_types_impl(
     SNodeTree *tree) {
@@ -288,10 +279,9 @@ void LlvmProgramImpl::compile_snode_tree_types(SNodeTree *tree) {
 static LlvmOfflineCache::FieldCacheData construct_filed_cache_data(
     const SNodeTree &tree,
     const StructCompiler &struct_compiler) {
-  TI_ASSERT(tree.id == tree.root()->id);
-
   LlvmOfflineCache::FieldCacheData ret;
-  ret.tree_id = tree.id;
+  ret.tree_id = tree.id();
+  ret.root_id = tree.root()->id;
   ret.root_size = struct_compiler.root_size;
 
   const auto &snodes = struct_compiler.snodes;

From cf72ff8c7bd7eab6203b51bdb0107d7a69453d73 Mon Sep 17 00:00:00 2001
From: jim19930609 <jim19930609@gmail.com>
Date: Wed, 8 Jun 2022 16:06:02 +0800
Subject: [PATCH 3/7] [aot] [llvm] LLVM AOT Field #1: Adjust
 serialization/deserialization logics for FieldCacheData

---
 taichi/llvm/llvm_offline_cache.cpp | 14 +++++++++++++
 taichi/llvm/llvm_offline_cache.h   | 32 ++++++++++++++++++++++++++----
 2 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/taichi/llvm/llvm_offline_cache.cpp b/taichi/llvm/llvm_offline_cache.cpp
index 542f02aebc2c7..92994fe9aa2f0 100644
--- a/taichi/llvm/llvm_offline_cache.cpp
+++ b/taichi/llvm/llvm_offline_cache.cpp
@@ -51,6 +51,20 @@ LlvmOfflineCacheFileReader::LlvmOfflineCacheFileReader(
     : path_(path), data_(std::move(data)), format_(format) {
 }
 
+bool LlvmOfflineCacheFileReader::get_field_cache(
+    LlvmOfflineCache::FieldCacheData &res,
+    int snode_tree_id) {
+  auto itr = data_.fields.find(snode_tree_id);
+  if (itr == data_.fields.end()) {
+    TI_DEBUG("Cannot find field with snode_tree_id={}", snode_tree_id);
+    return false;
+  }
+
+  const auto &loaded_field_cache = itr->second;
+  res = loaded_field_cache;  // copy assign
+  return true;
+}
+
 bool LlvmOfflineCacheFileReader::get_kernel_cache(
     LlvmOfflineCache::KernelCacheData &res,
     const std::string &key,
diff --git a/taichi/llvm/llvm_offline_cache.h b/taichi/llvm/llvm_offline_cache.h
index 91386c2bffb31..1193365fe30e3 100644
--- a/taichi/llvm/llvm_offline_cache.h
+++ b/taichi/llvm/llvm_offline_cache.h
@@ -59,10 +59,31 @@ struct LlvmOfflineCache {
 
     TI_IO_DEF(tree_id, root_size, snode_metas);
 
-    // TODO(zhanlue)
-    //  Serialize/Deserialize the llvm::Module from StructCompiler
-    //  At runtime, make sure loaded Field-Modules and Kernel-Modules are linked
-    //  altogether.
+    // TODO(zhanlue): refactor llvm::Modules
+    //
+    // struct_module will eventually get cloned into each kernel_module,
+    // so there's no need to serialize it here.
+    //
+    // We have three different types of llvm::Module
+    // 1. runtime_module: contains runtime functions.
+    // 2. struct_module: contains compiled SNodeTree in llvm::Type.
+    // 3. kernel_modules: contains compiled kernel codes.
+    //
+    // The way those modules work relies on a recursive clone mechanism:
+    //   runtime_module = load("runtime.bc")
+    //   struct_module = clone(runtime_module) + compiled-SNodeTree
+    //   kernel_module = clone(struct_module) + compiled-Kernel
+    //
+    // As a result, every kernel_module contains a copy of struct_module +
+    // runtime_module.
+    //
+    // This recursive clone mechanism is super fragile,
+    // which potentially causes inconsistency between modules if not handled
+    // properly.
+    //
+    // Let's turn to use llvm::link to bind the modules,
+    // and make runtime_module, struct_module, kernel_module independent of each
+    // other
   };
 
   // TODO(zhanlue): we need a better identifier for each FieldCacheData
@@ -83,6 +104,9 @@ class LlvmOfflineCacheFileReader {
                         const std::string &key,
                         llvm::LLVMContext &llvm_ctx);
 
+  bool get_field_cache(LlvmOfflineCache::FieldCacheData &res,
+                       int snode_tree_id);
+
   static std::unique_ptr<LlvmOfflineCacheFileReader> make(
       const std::string &path,
       LlvmOfflineCache::Format format = LlvmOfflineCache::Format::LL);

From dc0e12b31a1e8ac6f496823481236a0e470a0f93 Mon Sep 17 00:00:00 2001
From: jim19930609 <jim19930609@gmail.com>
Date: Thu, 9 Jun 2022 15:59:51 +0800
Subject: [PATCH 4/7] [llvm] [aot] Added Field support for LLVM AOT

---
 taichi/aot/module_builder.h             |  8 ++++
 taichi/ir/snode.cpp                     |  2 +-
 taichi/ir/snode.h                       |  2 +-
 taichi/llvm/llvm_aot_module_builder.cpp | 34 ++++++++++++++
 taichi/llvm/llvm_aot_module_builder.h   |  8 ++++
 taichi/llvm/llvm_aot_module_loader.cpp  | 50 ++++++++++++++++++++
 taichi/llvm/llvm_aot_module_loader.h    | 10 ++++
 taichi/llvm/llvm_program.cpp            | 62 +++++++++++++++----------
 taichi/llvm/llvm_program.h              | 23 ++++++---
 taichi/program/program.cpp              |  4 +-
 10 files changed, 169 insertions(+), 34 deletions(-)

diff --git a/taichi/aot/module_builder.h b/taichi/aot/module_builder.h
index 02cdb9a83ded4..b7da8d76cb6d0 100644
--- a/taichi/aot/module_builder.h
+++ b/taichi/aot/module_builder.h
@@ -40,6 +40,13 @@ class AotModuleBuilder {
 
   void add_graph(const std::string &name, const aot::CompiledGraph &graph);
 
+  void set_program(Program *prog) {
+    prog_ = prog;
+  }
+  Program *get_mutable_program() {
+    return prog_;
+  }
+
  protected:
   /**
    * Intended to be overriden by each backend's implementation.
@@ -81,6 +88,7 @@ class AotModuleBuilder {
 
  private:
   std::unordered_map<std::string, aot::CompiledGraph> graphs_;
+  Program *prog_ = nullptr;
 };
 
 }  // namespace lang
diff --git a/taichi/ir/snode.cpp b/taichi/ir/snode.cpp
index 1a583cda431b5..f36511cb27b5f 100644
--- a/taichi/ir/snode.cpp
+++ b/taichi/ir/snode.cpp
@@ -326,7 +326,7 @@ void SNode::set_snode_tree_id(int id) {
   snode_tree_id_ = id;
 }
 
-int SNode::get_snode_tree_id() {
+int SNode::get_snode_tree_id() const {
   return snode_tree_id_;
 }
 
diff --git a/taichi/ir/snode.h b/taichi/ir/snode.h
index 8a21721c2a7bc..da7560501d97f 100644
--- a/taichi/ir/snode.h
+++ b/taichi/ir/snode.h
@@ -354,7 +354,7 @@ class SNode {
 
   void set_snode_tree_id(int id);
 
-  int get_snode_tree_id();
+  int get_snode_tree_id() const;
 
   static void reset_counter() {
     counter = 0;
diff --git a/taichi/llvm/llvm_aot_module_builder.cpp b/taichi/llvm/llvm_aot_module_builder.cpp
index d23ee5c47c564..910465fb43fb0 100644
--- a/taichi/llvm/llvm_aot_module_builder.cpp
+++ b/taichi/llvm/llvm_aot_module_builder.cpp
@@ -2,6 +2,7 @@
 
 #include <algorithm>
 #include "taichi/llvm/launch_arg_info.h"
+#include "taichi/llvm/llvm_program.h"
 
 namespace taichi {
 namespace lang {
@@ -34,5 +35,38 @@ void LlvmAotModuleBuilder::add_per_backend(const std::string &identifier,
   cache_.kernels[identifier] = std::move(kcache);
 }
 
+void LlvmAotModuleBuilder::add_field_per_backend(const std::string &identifier,
+                                                 const SNode *rep_snode,
+                                                 bool is_scalar,
+                                                 DataType dt,
+                                                 std::vector<int> shape,
+                                                 int row_num,
+                                                 int column_num) {
+  // Field refers to a leaf node(Place SNode) in a SNodeTree.
+  // It makes no sense to just serialize the leaf node or its corresponding
+  // branch. Instead, the minimal unit we have to serialize is the entire
+  // SNodeTree. Note that SNodeTree uses snode_tree_id as its identifier,
+  // rather than the field's name. (multiple fields may end up referring to the
+  // same SNodeTree)
+
+  // 1. Find snode_tree_id
+  int snode_tree_id = rep_snode->get_snode_tree_id();
+
+  // 2. Fetch Cache from the Program
+  // Kernel compilation is not allowed until all the Fields are finalized,
+  // so we finished SNodeTree compilation during AOTModuleBuilder construction.
+  //
+  // By the time "add_field_per_backend()" is called,
+  // SNodeTrees should have already been finalized,
+  // with compiled info stored in LlvmProgramImpl::cache_data_.
+  const LlvmProgramImpl *prog =
+      this->get_mutable_program()->get_llvm_program_impl();
+  LlvmOfflineCache::FieldCacheData field_cache =
+      prog->get_cached_field(snode_tree_id);
+
+  // 3. Update AOT Cache
+  cache_.fields[snode_tree_id] = std::move(field_cache);
+}
+
 }  // namespace lang
 }  // namespace taichi
diff --git a/taichi/llvm/llvm_aot_module_builder.h b/taichi/llvm/llvm_aot_module_builder.h
index b88133a761783..a5fa2558b4a16 100644
--- a/taichi/llvm/llvm_aot_module_builder.h
+++ b/taichi/llvm/llvm_aot_module_builder.h
@@ -16,6 +16,14 @@ class LlvmAotModuleBuilder : public AotModuleBuilder {
   void add_per_backend(const std::string &identifier, Kernel *kernel) override;
   virtual CodeGenLLVM::CompiledData compile_kernel(Kernel *kernel) = 0;
 
+  void add_field_per_backend(const std::string &identifier,
+                             const SNode *rep_snode,
+                             bool is_scalar,
+                             DataType dt,
+                             std::vector<int> shape,
+                             int row_num,
+                             int column_num) override;
+
  private:
   mutable LlvmOfflineCache cache_;
 };
diff --git a/taichi/llvm/llvm_aot_module_loader.cpp b/taichi/llvm/llvm_aot_module_loader.cpp
index 5d725927388d7..a7ffa168cf117 100644
--- a/taichi/llvm/llvm_aot_module_loader.cpp
+++ b/taichi/llvm/llvm_aot_module_loader.cpp
@@ -17,6 +17,24 @@ class KernelImpl : public aot::Kernel {
   FunctionType fn_;
 };
 
+class FieldImpl : public aot::Field {
+ public:
+  explicit FieldImpl(const LlvmOfflineCache::FieldCacheData &field)
+      : field_(field) {
+  }
+
+  explicit FieldImpl(LlvmOfflineCache::FieldCacheData &&field)
+      : field_(std::move(field)) {
+  }
+
+  LlvmOfflineCache::FieldCacheData get_field() const {
+    return field_;
+  }
+
+ private:
+  LlvmOfflineCache::FieldCacheData field_;
+};
+
 }  // namespace
 
 LlvmOfflineCache::KernelCacheData LlvmAotModule::load_kernel_from_cache(
@@ -37,5 +55,37 @@ std::unique_ptr<aot::Kernel> LlvmAotModule::make_new_kernel(
   return std::make_unique<KernelImpl>(fn);
 }
 
+std::unique_ptr<aot::Field> LlvmAotModule::make_new_field(
+    const std::string &name) {
+  // Check if "name" represents snode_tree_id.
+  // Avoid using std::atoi due to its poor error handling.
+  char *end;
+  int snode_tree_id = static_cast<int>(strtol(name.c_str(), &end, 10 /*base*/));
+
+  TI_ASSERT(end != name.c_str());
+  TI_ASSERT(*end == '\0');
+
+  // Load FieldCache
+  LlvmOfflineCache::FieldCacheData loaded;
+  auto ok = cache_reader_->get_field_cache(loaded, snode_tree_id);
+  TI_ERROR_IF(!ok, "Failed to load field with id={}", snode_tree_id);
+
+  return std::make_unique<FieldImpl>(std::move(loaded));
+}
+
+void finalize_aot_field(aot::Module *aot_module,
+                        aot::Field *aot_field,
+                        uint64 *result_buffer) {
+  auto *llvm_aot_module = dynamic_cast<LlvmAotModule *>(aot_module);
+  auto *aot_field_impl = dynamic_cast<FieldImpl *>(aot_field);
+
+  TI_ASSERT(llvm_aot_module != nullptr);
+  TI_ASSERT(aot_field_impl != nullptr);
+
+  auto *llvm_prog = llvm_aot_module->get_program();
+  const auto &field_cache = aot_field_impl->get_field();
+  llvm_prog->initialize_llvm_runtime_snodes(field_cache, result_buffer);
+}
+
 }  // namespace lang
 }  // namespace taichi
diff --git a/taichi/llvm/llvm_aot_module_loader.h b/taichi/llvm/llvm_aot_module_loader.h
index b5e8f527cea67..21e2be11ee221 100644
--- a/taichi/llvm/llvm_aot_module_loader.h
+++ b/taichi/llvm/llvm_aot_module_loader.h
@@ -6,6 +6,10 @@
 namespace taichi {
 namespace lang {
 
+TI_DLL_EXPORT void finalize_aot_field(aot::Module *aot_module,
+                                      aot::Field *aot_field,
+                                      uint64 *result_buffer);
+
 class LlvmAotModule : public aot::Module {
  public:
   explicit LlvmAotModule(const std::string &module_path,
@@ -27,6 +31,10 @@ class LlvmAotModule : public aot::Module {
     return 0;
   }
 
+  LlvmProgramImpl *const get_program() {
+    return program_;
+  }
+
  protected:
   virtual FunctionType convert_module_to_function(
       const std::string &name,
@@ -38,6 +46,8 @@ class LlvmAotModule : public aot::Module {
   std::unique_ptr<aot::Kernel> make_new_kernel(
       const std::string &name) override;
 
+  std::unique_ptr<aot::Field> make_new_field(const std::string &name) override;
+
   LlvmProgramImpl *const program_{nullptr};
   std::unique_ptr<LlvmOfflineCacheFileReader> cache_reader_{nullptr};
 };
diff --git a/taichi/llvm/llvm_program.cpp b/taichi/llvm/llvm_program.cpp
index eea60dad165f7..30adf5f9ccf79 100644
--- a/taichi/llvm/llvm_program.cpp
+++ b/taichi/llvm/llvm_program.cpp
@@ -273,37 +273,22 @@ std::unique_ptr<StructCompiler> LlvmProgramImpl::compile_snode_tree_types_impl(
 }
 
 void LlvmProgramImpl::compile_snode_tree_types(SNodeTree *tree) {
-  compile_snode_tree_types_impl(tree);
-}
-
-static LlvmOfflineCache::FieldCacheData construct_filed_cache_data(
-    const SNodeTree &tree,
-    const StructCompiler &struct_compiler) {
-  LlvmOfflineCache::FieldCacheData ret;
-  ret.tree_id = tree.id();
-  ret.root_id = tree.root()->id;
-  ret.root_size = struct_compiler.root_size;
-
-  const auto &snodes = struct_compiler.snodes;
-  for (size_t i = 0; i < snodes.size(); i++) {
-    LlvmOfflineCache::FieldCacheData::SNodeCacheData snode_cache_data;
-    snode_cache_data.id = snodes[i]->id;
-    snode_cache_data.type = snodes[i]->type;
-    snode_cache_data.cell_size_bytes = snodes[i]->cell_size_bytes;
-    snode_cache_data.chunk_size = snodes[i]->chunk_size;
-
-    ret.snode_metas.emplace_back(std::move(snode_cache_data));
-  }
+  auto struct_compiler = compile_snode_tree_types_impl(tree);
+  int snode_tree_id = tree->id();
+  int root_id = tree->root()->id;
 
-  return ret;
+  // Add compiled result to Cache
+  cache_field(snode_tree_id, root_id, *struct_compiler);
 }
 
 void LlvmProgramImpl::materialize_snode_tree(SNodeTree *tree,
                                              uint64 *result_buffer) {
-  auto struct_compiler = compile_snode_tree_types_impl(tree);
+  compile_snode_tree_types(tree);
+  int snode_tree_id = tree->id();
 
-  auto field_cache_data = construct_filed_cache_data(*tree, *struct_compiler);
-  initialize_llvm_runtime_snodes(field_cache_data, result_buffer);
+  const auto it = cache_data_.fields.find(snode_tree_id);  // single lookup
+  TI_ASSERT(it != cache_data_.fields.end());  // cached by the compile step
+  initialize_llvm_runtime_snodes(it->second, result_buffer);
 }
 
 uint64 LlvmProgramImpl::fetch_result_uint64(int i, uint64 *result_buffer) {
@@ -701,6 +686,33 @@ void LlvmProgramImpl::cache_kernel(
   kernel_cache.offloaded_task_list = std::move(offloaded_task_list);
 }
 
+// Stores the compiled layout of one SNodeTree in cache_data_.fields, keyed by
+// snode_tree_id. An entry that is already cached is left unchanged.
+void LlvmProgramImpl::cache_field(int snode_tree_id,
+                                  int root_id,
+                                  const StructCompiler &struct_compiler) {
+  if (cache_data_.fields.count(snode_tree_id) != 0) {
+    // [TODO] check and update the Cache, instead of simply returning.
+    return;
+  }
+
+  LlvmOfflineCache::FieldCacheData field_data;
+  field_data.tree_id = snode_tree_id;
+  field_data.root_id = root_id;
+  field_data.root_size = struct_compiler.root_size;
+
+  field_data.snode_metas.reserve(struct_compiler.snodes.size());
+  for (const auto &snode : struct_compiler.snodes) {
+    LlvmOfflineCache::FieldCacheData::SNodeCacheData meta;
+    meta.id = snode->id;
+    meta.type = snode->type;
+    meta.cell_size_bytes = snode->cell_size_bytes;
+    meta.chunk_size = snode->chunk_size;
+    field_data.snode_metas.emplace_back(std::move(meta));
+  }
+  cache_data_.fields.emplace(snode_tree_id, std::move(field_data));
+}
+
 void LlvmProgramImpl::dump_cache_data_to_disk() {
   if (config->offline_cache && !cache_data_.kernels.empty()) {
     LlvmOfflineCacheFileWriter writer{};
diff --git a/taichi/llvm/llvm_program.h b/taichi/llvm/llvm_program.h
index 69378ee660bf1..2eec64dd8e7bd 100644
--- a/taichi/llvm/llvm_program.h
+++ b/taichi/llvm/llvm_program.h
@@ -118,10 +118,27 @@ class LlvmProgramImpl : public ProgramImpl {
                     std::vector<LlvmOfflineCache::OffloadedTaskCacheData>
                         &&offloaded_task_list);
 
+  void cache_field(int snode_tree_id,
+                   int root_id,
+                   const StructCompiler &struct_compiler);
+
+  LlvmOfflineCache::FieldCacheData get_cached_field(int snode_tree_id) const {
+    const auto it = cache_data_.fields.find(snode_tree_id);  // single lookup
+    TI_ASSERT(it != cache_data_.fields.end());  // must already be cached
+    return it->second;  // returned by value, i.e. a copy of the cached entry
+  }
+
   Device *get_compute_device() override {
     return device_.get();
   }
 
+  /**
+   * Initializes the SNodes for LLVM based backends.
+   */
+  void initialize_llvm_runtime_snodes(
+      const LlvmOfflineCache::FieldCacheData &field_cache_data,
+      uint64 *result_buffer);
+
  private:
   std::unique_ptr<llvm::Module> clone_struct_compiler_initial_context(
       bool has_multiple_snode_trees,
@@ -129,12 +146,6 @@ class LlvmProgramImpl : public ProgramImpl {
 
   std::unique_ptr<StructCompiler> compile_snode_tree_types_impl(
       SNodeTree *tree);
-  /**
-   * Initializes the SNodes for LLVM based backends.
-   */
-  void initialize_llvm_runtime_snodes(
-      const LlvmOfflineCache::FieldCacheData &field_cache_data,
-      uint64 *result_buffer);
 
   uint64 fetch_result_uint64(int i, uint64 *result_buffer);
 
diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp
index 994fcbfbdf351..f55ebab88b096 100644
--- a/taichi/program/program.cpp
+++ b/taichi/program/program.cpp
@@ -610,7 +610,9 @@ std::unique_ptr<AotModuleBuilder> Program::make_aot_module_builder(Arch arch) {
   }
   if (arch_uses_llvm(config.arch) || config.arch == Arch::metal ||
       config.arch == Arch::vulkan || config.arch == Arch::opengl) {
-    return program_impl_->make_aot_module_builder();
+    auto aot_builder = program_impl_->make_aot_module_builder();
+    aot_builder->set_program(this);
+    return aot_builder;
   }
   return nullptr;
 }

From 417f3fb3c60122f5f3878760fd66e86fedfc265a Mon Sep 17 00:00:00 2001
From: jim19930609 <jim19930609@gmail.com>
Date: Thu, 9 Jun 2022 18:00:35 +0800
Subject: [PATCH 5/7] [aot] [llvm] LLVM AOT Field #2: Updated LLVM
 AOTModuleLoader & AOTModuleBuilder to support Fields

---
 taichi/backends/cpu/aot_module_loader_impl.cpp  |  5 -----
 taichi/backends/cuda/aot_module_loader_impl.cpp |  5 -----
 taichi/llvm/llvm_aot_module_loader.cpp          |  7 ++++++-
 taichi/llvm/llvm_aot_module_loader.h            | 11 +++++++++++
 taichi/llvm/llvm_offline_cache.h                |  2 +-
 5 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/taichi/backends/cpu/aot_module_loader_impl.cpp b/taichi/backends/cpu/aot_module_loader_impl.cpp
index e2ff3b2ecf0f6..16c297dced325 100644
--- a/taichi/backends/cpu/aot_module_loader_impl.cpp
+++ b/taichi/backends/cpu/aot_module_loader_impl.cpp
@@ -44,11 +44,6 @@ class AotModuleImpl : public LlvmAotModule {
     TI_NOT_IMPLEMENTED;
     return nullptr;
   }
-
-  std::unique_ptr<aot::Field> make_new_field(const std::string &name) override {
-    TI_NOT_IMPLEMENTED;
-    return nullptr;
-  }
 };
 
 }  // namespace
diff --git a/taichi/backends/cuda/aot_module_loader_impl.cpp b/taichi/backends/cuda/aot_module_loader_impl.cpp
index b08efdc9632da..69bf52d749772 100644
--- a/taichi/backends/cuda/aot_module_loader_impl.cpp
+++ b/taichi/backends/cuda/aot_module_loader_impl.cpp
@@ -44,11 +44,6 @@ class AotModuleImpl : public LlvmAotModule {
     TI_NOT_IMPLEMENTED;
     return nullptr;
   }
-
-  std::unique_ptr<aot::Field> make_new_field(const std::string &name) override {
-    TI_NOT_IMPLEMENTED;
-    return nullptr;
-  }
 };
 
 }  // namespace
diff --git a/taichi/llvm/llvm_aot_module_loader.cpp b/taichi/llvm/llvm_aot_module_loader.cpp
index a7ffa168cf117..99ca51f665363 100644
--- a/taichi/llvm/llvm_aot_module_loader.cpp
+++ b/taichi/llvm/llvm_aot_module_loader.cpp
@@ -84,7 +84,12 @@ void finalize_aot_field(aot::Module *aot_module,
 
   auto *llvm_prog = llvm_aot_module->get_program();
   const auto &field_cache = aot_field_impl->get_field();
-  llvm_prog->initialize_llvm_runtime_snodes(field_cache, result_buffer);
+
+  int snode_tree_id = field_cache.tree_id;
+  if (!llvm_aot_module->is_snode_tree_initialized(snode_tree_id)) {
+    llvm_prog->initialize_llvm_runtime_snodes(field_cache, result_buffer);
+    llvm_aot_module->set_initialized_snode_tree(snode_tree_id);
+  }
 }
 
 }  // namespace lang
diff --git a/taichi/llvm/llvm_aot_module_loader.h b/taichi/llvm/llvm_aot_module_loader.h
index 21e2be11ee221..1e4e093bcfc2c 100644
--- a/taichi/llvm/llvm_aot_module_loader.h
+++ b/taichi/llvm/llvm_aot_module_loader.h
@@ -35,6 +35,14 @@ class LlvmAotModule : public aot::Module {
     return program_;
   }
 
+  void set_initialized_snode_tree(int snode_tree_id) {
+    initialized_snode_tree_ids.emplace(snode_tree_id);  // no-op if present
+  }
+
+  bool is_snode_tree_initialized(int snode_tree_id) const {  // read-only query
+    return initialized_snode_tree_ids.count(snode_tree_id) > 0;
+  }
+
  protected:
   virtual FunctionType convert_module_to_function(
       const std::string &name,
@@ -50,6 +58,9 @@ class LlvmAotModule : public aot::Module {
 
   LlvmProgramImpl *const program_{nullptr};
   std::unique_ptr<LlvmOfflineCacheFileReader> cache_reader_{nullptr};
+
+  // To prevent repeated SNodeTree initialization
+  std::unordered_set<int> initialized_snode_tree_ids;
 };
 
 }  // namespace lang
diff --git a/taichi/llvm/llvm_offline_cache.h b/taichi/llvm/llvm_offline_cache.h
index 1193365fe30e3..bd7f7900cfb95 100644
--- a/taichi/llvm/llvm_offline_cache.h
+++ b/taichi/llvm/llvm_offline_cache.h
@@ -95,7 +95,7 @@ struct LlvmOfflineCache {
   std::unordered_map<std::string, KernelCacheData>
       kernels;  // key = kernel_name
 
-  TI_IO_DEF(kernels);
+  TI_IO_DEF(fields, kernels);
 };
 
 class LlvmOfflineCacheFileReader {

From 1480736b797a02ba6327aaa2f96aa0952977413a Mon Sep 17 00:00:00 2001
From: jim19930609 <jim19930609@gmail.com>
Date: Fri, 10 Jun 2022 15:34:34 +0800
Subject: [PATCH 6/7] Fixed merge issues

---
 taichi/llvm/llvm_program.cpp | 22 ----------------------
 1 file changed, 22 deletions(-)

diff --git a/taichi/llvm/llvm_program.cpp b/taichi/llvm/llvm_program.cpp
index 00a5a9935c011..30adf5f9ccf79 100644
--- a/taichi/llvm/llvm_program.cpp
+++ b/taichi/llvm/llvm_program.cpp
@@ -281,28 +281,6 @@ void LlvmProgramImpl::compile_snode_tree_types(SNodeTree *tree) {
   cache_field(snode_tree_id, root_id, *struct_compiler);
 }
 
-static LlvmOfflineCache::FieldCacheData construct_filed_cache_data(
-    const SNodeTree &tree,
-    const StructCompiler &struct_compiler) {
-  LlvmOfflineCache::FieldCacheData ret;
-  ret.tree_id = tree.id();
-  ret.root_id = tree.root()->id;
-  ret.root_size = struct_compiler.root_size;
-
-  const auto &snodes = struct_compiler.snodes;
-  for (size_t i = 0; i < snodes.size(); i++) {
-    LlvmOfflineCache::FieldCacheData::SNodeCacheData snode_cache_data;
-    snode_cache_data.id = snodes[i]->id;
-    snode_cache_data.type = snodes[i]->type;
-    snode_cache_data.cell_size_bytes = snodes[i]->cell_size_bytes;
-    snode_cache_data.chunk_size = snodes[i]->chunk_size;
-
-    ret.snode_metas.emplace_back(std::move(snode_cache_data));
-  }
-
-  return ret;
-}
-
 void LlvmProgramImpl::materialize_snode_tree(SNodeTree *tree,
                                              uint64 *result_buffer) {
   compile_snode_tree_types(tree);

From 52872e6f6f7cff2c3cd7fc8695a7bf5ad72ebc4c Mon Sep 17 00:00:00 2001
From: jim19930609 <jim19930609@gmail.com>
Date: Mon, 13 Jun 2022 14:35:07 +0800
Subject: [PATCH 7/7] Stopped abusing Program*

---
 taichi/aot/module_builder.h                    | 8 --------
 taichi/backends/cpu/aot_module_builder_impl.h  | 5 +++++
 taichi/backends/cuda/aot_module_builder_impl.h | 5 +++++
 taichi/llvm/llvm_aot_module_builder.cpp        | 5 ++---
 taichi/llvm/llvm_aot_module_builder.h          | 4 ++++
 taichi/llvm/llvm_program.cpp                   | 4 ++--
 taichi/program/program.cpp                     | 4 +---
 7 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/taichi/aot/module_builder.h b/taichi/aot/module_builder.h
index b7da8d76cb6d0..02cdb9a83ded4 100644
--- a/taichi/aot/module_builder.h
+++ b/taichi/aot/module_builder.h
@@ -40,13 +40,6 @@ class AotModuleBuilder {
 
   void add_graph(const std::string &name, const aot::CompiledGraph &graph);
 
-  void set_program(Program *prog) {
-    prog_ = prog;
-  }
-  Program *get_mutable_program() {
-    return prog_;
-  }
-
  protected:
   /**
    * Intended to be overriden by each backend's implementation.
@@ -88,7 +81,6 @@ class AotModuleBuilder {
 
  private:
   std::unordered_map<std::string, aot::CompiledGraph> graphs_;
-  Program *prog_ = nullptr;
 };
 
 }  // namespace lang
diff --git a/taichi/backends/cpu/aot_module_builder_impl.h b/taichi/backends/cpu/aot_module_builder_impl.h
index 1d81fa41d7c2e..039174aa88503 100644
--- a/taichi/backends/cpu/aot_module_builder_impl.h
+++ b/taichi/backends/cpu/aot_module_builder_impl.h
@@ -9,6 +9,11 @@ namespace lang {
 namespace cpu {
 
 class AotModuleBuilderImpl : public LlvmAotModuleBuilder {
+ public:
+  explicit AotModuleBuilderImpl(LlvmProgramImpl *prog)
+      : LlvmAotModuleBuilder(prog) {
+  }
+
  private:
   CodeGenLLVM::CompiledData compile_kernel(Kernel *kernel) override;
 };
diff --git a/taichi/backends/cuda/aot_module_builder_impl.h b/taichi/backends/cuda/aot_module_builder_impl.h
index f0fdc74e14f9c..94ac89380d1e0 100644
--- a/taichi/backends/cuda/aot_module_builder_impl.h
+++ b/taichi/backends/cuda/aot_module_builder_impl.h
@@ -9,6 +9,11 @@ namespace lang {
 namespace cuda {
 
 class AotModuleBuilderImpl : public LlvmAotModuleBuilder {
+ public:
+  explicit AotModuleBuilderImpl(LlvmProgramImpl *prog)
+      : LlvmAotModuleBuilder(prog) {
+  }
+
  private:
   CodeGenLLVM::CompiledData compile_kernel(Kernel *kernel) override;
 };
diff --git a/taichi/llvm/llvm_aot_module_builder.cpp b/taichi/llvm/llvm_aot_module_builder.cpp
index 910465fb43fb0..664ee933893c9 100644
--- a/taichi/llvm/llvm_aot_module_builder.cpp
+++ b/taichi/llvm/llvm_aot_module_builder.cpp
@@ -59,10 +59,9 @@ void LlvmAotModuleBuilder::add_field_per_backend(const std::string &identifier,
   // By the time "add_field_per_backend()" is called,
   // SNodeTrees should have already been finalized,
   // with compiled info stored in LlvmProgramImpl::cache_data_.
-  const LlvmProgramImpl *prog =
-      this->get_mutable_program()->get_llvm_program_impl();
+  TI_ASSERT(prog_ != nullptr);
   LlvmOfflineCache::FieldCacheData field_cache =
-      prog->get_cached_field(snode_tree_id);
+      prog_->get_cached_field(snode_tree_id);
 
   // 3. Update AOT Cache
   cache_.fields[snode_tree_id] = std::move(field_cache);
diff --git a/taichi/llvm/llvm_aot_module_builder.h b/taichi/llvm/llvm_aot_module_builder.h
index a5fa2558b4a16..857f237c4a73c 100644
--- a/taichi/llvm/llvm_aot_module_builder.h
+++ b/taichi/llvm/llvm_aot_module_builder.h
@@ -9,6 +9,9 @@ namespace lang {
 
 class LlvmAotModuleBuilder : public AotModuleBuilder {
  public:
+  explicit LlvmAotModuleBuilder(LlvmProgramImpl *prog) : prog_(prog) {
+  }
+
   void dump(const std::string &output_dir,
             const std::string &filename) const override;
 
@@ -26,6 +29,7 @@ class LlvmAotModuleBuilder : public AotModuleBuilder {
 
  private:
   mutable LlvmOfflineCache cache_;
+  LlvmProgramImpl *prog_ = nullptr;
 };
 
 }  // namespace lang
diff --git a/taichi/llvm/llvm_program.cpp b/taichi/llvm/llvm_program.cpp
index 30adf5f9ccf79..a805ade265e37 100644
--- a/taichi/llvm/llvm_program.cpp
+++ b/taichi/llvm/llvm_program.cpp
@@ -350,12 +350,12 @@ void LlvmProgramImpl::print_list_manager_info(void *list_manager,
 
 std::unique_ptr<AotModuleBuilder> LlvmProgramImpl::make_aot_module_builder() {
   if (config->arch == Arch::x64 || config->arch == Arch::arm64) {
-    return std::make_unique<cpu::AotModuleBuilderImpl>();
+    return std::make_unique<cpu::AotModuleBuilderImpl>(this);
   }
 
 #if defined(TI_WITH_CUDA)
   if (config->arch == Arch::cuda) {
-    return std::make_unique<cuda::AotModuleBuilderImpl>();
+    return std::make_unique<cuda::AotModuleBuilderImpl>(this);
   }
 #endif
 
diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp
index 62562bcc004a1..d94b6b6ddff2b 100644
--- a/taichi/program/program.cpp
+++ b/taichi/program/program.cpp
@@ -610,9 +610,7 @@ std::unique_ptr<AotModuleBuilder> Program::make_aot_module_builder(Arch arch) {
   }
   if (arch_uses_llvm(config.arch) || config.arch == Arch::metal ||
       config.arch == Arch::vulkan || config.arch == Arch::opengl) {
-    auto aot_builder = program_impl_->make_aot_module_builder();
-    aot_builder->set_program(this);
-    return aot_builder;
+    return program_impl_->make_aot_module_builder();
   }
   return nullptr;
 }