diff --git a/taichi/backends/metal/codegen_metal.cpp b/taichi/backends/metal/codegen_metal.cpp
index 7909d2a8c35f2d..b37d9421f3f3a8 100644
--- a/taichi/backends/metal/codegen_metal.cpp
+++ b/taichi/backends/metal/codegen_metal.cpp
@@ -290,19 +290,6 @@ class KernelCodegenImpl : public IRVisitor {
       ScopedIndent s(current_appender());
       const auto &parent = stmt->ptr->raw_name();
       const bool is_dynamic = (stmt->snode->type == SNodeType::dynamic);
-      std::string ch_id;
-      if (is_dynamic &&
-          (opty == SNodeOpType::deactivate || opty == SNodeOpType::append ||
-           opty == SNodeOpType::length)) {
-        // For these ops, `dynamic` is a special case because |stmt| doesn't
-        // contain an index to its cells. Setting it to zero to store the
-        // address of the first child into |ch_addr|.
-        ch_id = "0";
-      } else {
-        ch_id = stmt->val->raw_name();
-      }
-      const std::string ch_addr =
-          fmt::format("{}.children({}).addr()", stmt->ptr->raw_name(), ch_id);
       if (opty == SNodeOpType::is_active) {
         emit("{} = {}.is_active({});", result_var, parent,
              stmt->val->raw_name());
@@ -619,7 +606,7 @@ class KernelCodegenImpl : public IRVisitor {
     } else if (stmt->task_type == Type::listgen) {
       add_runtime_list_op_kernel(stmt);
     } else if (stmt->task_type == Type::gc) {
-      // Ignored
+      add_gc_op_kernels(stmt);
     } else {
       TI_ERROR("Unsupported offload type={} on Metal arch", stmt->task_name());
     }
@@ -1284,6 +1271,41 @@ class KernelCodegenImpl : public IRVisitor {
     used_features()->sparse = true;
   }
 
+  void add_gc_op_kernels(OffloadedStmt *stmt) {
+    TI_ASSERT(stmt->task_type == OffloadedTaskType::gc);
+
+    auto *const sn = stmt->snode;
+    const auto &sn_descs = compiled_structs_->snode_descriptors;
+    // common attributes shared among the 3-stage GC kernels
+    KernelAttributes ka;
+    ka.task_type = OffloadedTaskType::gc;
+    ka.gc_op_attribs = KernelAttributes::GcOpAttributes();
+    ka.gc_op_attribs->snode = sn;
+    ka.buffers = {BuffersEnum::Runtime, BuffersEnum::Context};
+    current_kernel_attribs_ = nullptr;
+    // stage 1 specific
+    ka.name = "gc_compact_free_list";
+    ka.advisory_total_num_threads =
+        std::min(total_num_self_from_root(sn_descs, sn->id),
+                 kMaxNumThreadsGridStrideLoop);
+    ka.advisory_num_threads_per_group = stmt->block_dim;
+    mtl_kernels_attribs()->push_back(ka);
+    // stage 2 specific
+    ka.name = "gc_reset_free_list";
+    ka.advisory_total_num_threads = 1;
+    ka.advisory_num_threads_per_group = 1;
+    mtl_kernels_attribs()->push_back(ka);
+    // stage 3 specific
+    ka.name = "gc_move_recycled_to_free";
+    ka.advisory_total_num_threads =
+        std::min(total_num_self_from_root(sn_descs, sn->id),
+                 kMaxNumThreadsGridStrideLoop);
+    ka.advisory_num_threads_per_group = stmt->block_dim;
+    mtl_kernels_attribs()->push_back(ka);
+
+    used_features()->sparse = true;
+  }
+
   std::string inject_load_global_tmp(int offset,
                                      DataType dt = PrimitiveType::i32) {
     const auto vt = TypeFactory::create_vector_or_scalar_type(1, dt);
diff --git a/taichi/backends/metal/features.h b/taichi/backends/metal/features.h
index 95615056c7051b..c6f3886716c80c 100644
--- a/taichi/backends/metal/features.h
+++ b/taichi/backends/metal/features.h
@@ -8,7 +8,8 @@ TLANG_NAMESPACE_BEGIN
 namespace metal {
 
 inline bool is_supported_sparse_type(SNodeType t) {
-  return t == SNodeType::bitmasked || t == SNodeType::dynamic;
+  return t == SNodeType::bitmasked || t == SNodeType::dynamic ||
+         t == SNodeType::pointer;
 }
 
 }  // namespace metal
diff --git a/taichi/backends/metal/kernel_manager.cpp b/taichi/backends/metal/kernel_manager.cpp
index d2381ccdb8c7dd..20c0012a765b80 100644
--- a/taichi/backends/metal/kernel_manager.cpp
+++ b/taichi/backends/metal/kernel_manager.cpp
@@ -8,6 +8,7 @@
 #include <string_view>
 
 #include "taichi/backends/metal/constants.h"
+#include "taichi/backends/metal/features.h"
 #include "taichi/inc/constants.h"
 #include "taichi/math/arithmetic.h"
 #include "taichi/program/py_print_buffer.h"
@@ -25,7 +26,8 @@
 #include "taichi/program/program.h"
 #endif  // TI_PLATFORM_OSX
 
-TLANG_NAMESPACE_BEGIN
+namespace taichi {
+namespace lang {
 namespace metal {
 
 #ifdef TI_PLATFORM_OSX
@@ -221,41 +223,20 @@ class UserMtlKernel : public CompiledMtlKernelBase {
 };
 
 // Internal Metal kernel used to maintain the kernel runtime data
-class RuntimeListOpsMtlKernel : public CompiledMtlKernelBase {
+class SparseRuntimeMtlKernelBase : public CompiledMtlKernelBase {
  public:
   struct Params : public CompiledMtlKernelBase::Params {
     MemoryPool *mem_pool = nullptr;
-    const SNodeDescriptorsMap *snode_descriptors = nullptr;
-
-    const SNode *snode() const {
-      return kernel_attribs->runtime_list_op_attribs->snode;
-    }
   };
 
-  explicit RuntimeListOpsMtlKernel(Params &params)
+  explicit SparseRuntimeMtlKernelBase(Params &params, int args_size)
       : CompiledMtlKernelBase(params),
-        parent_snode_id_(params.snode()->parent->id),
-        child_snode_id_(params.snode()->id),
-        args_mem_(std::make_unique<BufferMemoryView>(
-            /*size=*/sizeof(int32_t) * 3,
-            params.mem_pool)),
+        args_mem_(
+            std::make_unique<BufferMemoryView>(args_size, params.mem_pool)),
         args_buffer_(new_mtl_buffer_no_copy(params.device,
                                             args_mem_->ptr(),
                                             args_mem_->size())) {
     TI_ASSERT(args_buffer_ != nullptr);
-    auto *mem = reinterpret_cast<int32_t *>(args_mem_->ptr());
-    mem[0] = parent_snode_id_;
-    mem[1] = child_snode_id_;
-    const auto &sn_descs = *params.snode_descriptors;
-    mem[2] = total_num_self_from_root(sn_descs, child_snode_id_);
-    TI_DEBUG(
-        "Registered RuntimeListOpsMtlKernel: name={} num_threads={} "
-        "parent_snode={} "
-        "child_snode={} max_num_elems={} ",
-        params.kernel_attribs->name,
-        params.kernel_attribs->advisory_total_num_threads, mem[0], mem[1],
-        mem[2]);
-    did_modify_range(args_buffer_.get(), /*location=*/0, args_mem_->size());
   }
 
   void launch(InputBuffersMap &input_buffers,
@@ -271,20 +252,67 @@ class RuntimeListOpsMtlKernel : public CompiledMtlKernelBase {
     launch_if_not_empty(std::move(buffers), command_buffer);
   }
 
- private:
-  const int parent_snode_id_;
-  const int child_snode_id_;
-  // For such Metal kernels, it always takes in an args buffer of two int32's:
-  // args[0] = parent_snode_id
-  // args[1] = child_snode_id
-  // args[2] = child_snode.total_num_self_from_root
-  // Note that this args buffer has nothing to do with the one passed to Taichi
-  // kernel.
-  // See taichi/backends/metal/shaders/runtime_kernels.metal.h
+ protected:
   std::unique_ptr<BufferMemoryView> args_mem_;
   nsobj_unique_ptr<MTLBuffer> args_buffer_;
 };
 
+class ListgenOpMtlKernel : public SparseRuntimeMtlKernelBase {
+ public:
+  struct Params : public SparseRuntimeMtlKernelBase::Params {
+    const SNodeDescriptorsMap *snode_descriptors{nullptr};
+
+    const SNode *snode() const {
+      return kernel_attribs->runtime_list_op_attribs->snode;
+    }
+  };
+
+  explicit ListgenOpMtlKernel(Params &params)
+      : SparseRuntimeMtlKernelBase(params, /*args_size=*/sizeof(int32_t) * 3) {
+    // For such Metal kernels, it always takes in an args buffer of 3 int32's:
+    // args[0] = parent_snode_id
+    // args[1] = child_snode_id
+    // args[2] = child_snode.total_num_self_from_root
+    // Note that this args buffer has nothing to do with the one passed to
+    // Taichi kernel. See taichi/backends/metal/shaders/runtime_kernels.metal.h
+    const int parent_snode_id = params.snode()->parent->id;
+    const int child_snode_id = params.snode()->id;
+    auto *mem = reinterpret_cast<int32_t *>(args_mem_->ptr());
+    mem[0] = parent_snode_id;
+    mem[1] = child_snode_id;
+    const auto &sn_descs = *params.snode_descriptors;
+    mem[2] = total_num_self_from_root(sn_descs, child_snode_id);
+    TI_DEBUG(
+        "Registered ListgenOpMtlKernel: name={} num_threads={} "
+        "parent_snode={} "
+        "child_snode={} max_num_elems={} ",
+        params.kernel_attribs->name,
+        params.kernel_attribs->advisory_total_num_threads, mem[0], mem[1],
+        mem[2]);
+    did_modify_range(args_buffer_.get(), /*location=*/0, args_mem_->size());
+  }
+};
+
+class GcOpMtlKernel : public SparseRuntimeMtlKernelBase {
+ public:
+  struct Params : public SparseRuntimeMtlKernelBase::Params {
+    const SNode *snode() const {
+      return kernel_attribs->gc_op_attribs->snode;
+    }
+  };
+
+  explicit GcOpMtlKernel(Params &params)
+      : SparseRuntimeMtlKernelBase(params, /*args_size=*/sizeof(int32_t)) {
+    const int snode_id = params.snode()->id;
+    auto *mem = reinterpret_cast<int32_t *>(args_mem_->ptr());
+    mem[0] = snode_id;
+    TI_DEBUG("Registered GcOpMtlKernel: name={} num_threads={} snode_id={}",
+             params.kernel_attribs->name,
+             params.kernel_attribs->advisory_total_num_threads, mem[0]);
+    did_modify_range(args_buffer_.get(), /*location=*/0, args_mem_->size());
+  }
+};
+
 // Info for launching a compiled Taichi kernel, which consists of a series of
 // compiled Metal kernels.
 class CompiledTaichiKernel {
@@ -327,7 +355,7 @@ class CompiledTaichiKernel {
       std::unique_ptr<CompiledMtlKernelBase> kernel = nullptr;
       const auto ktype = ka.task_type;
       if (ktype == KernelTaskType::listgen) {
-        RuntimeListOpsMtlKernel::Params kparams;
+        ListgenOpMtlKernel::Params kparams;
         kparams.kernel_attribs = &ka;
         kparams.is_jit_evaluator = false;
         kparams.config = params.compile_config;
@@ -335,7 +363,16 @@ class CompiledTaichiKernel {
         kparams.mtl_func = mtl_func.get();
         kparams.mem_pool = params.mem_pool;
         kparams.snode_descriptors = params.snode_descriptors;
-        kernel = std::make_unique<RuntimeListOpsMtlKernel>(kparams);
+        kernel = std::make_unique<ListgenOpMtlKernel>(kparams);
+      } else if (ktype == KernelTaskType::gc) {
+        GcOpMtlKernel::Params kparams;
+        kparams.kernel_attribs = &ka;
+        kparams.is_jit_evaluator = false;
+        kparams.config = params.compile_config;
+        kparams.device = device;
+        kparams.mtl_func = mtl_func.get();
+        kparams.mem_pool = params.mem_pool;
+        kernel = std::make_unique<GcOpMtlKernel>(kparams);
       } else {
         UserMtlKernel::Params kparams;
         kparams.kernel_attribs = &ka;
@@ -701,6 +738,18 @@ class KernelManager::Impl {
     return &print_strtable_;
   }
 
+  std::size_t get_snode_num_dynamically_allocated(SNode *snode) {
+    // TODO(k-ye): Have a generic way for querying these sparse runtime stats.
+    mac::ScopedAutoreleasePool pool;
+    blit_buffers_and_sync({runtime_buffer_.get()});
+    auto *sna = dev_runtime_mirror_.snode_allocators + snode->id;
+    // WHY -1?
+    //
+    // We allocate one ambient element for each `pointer` SNode from its
+    // corresponding snode_allocator |sna|. Therefore the count starts at 1.
+    return sna->data_list.next - 1;
+  }
+
  private:
   void init_runtime(int root_id) {
     using namespace shaders;
@@ -709,6 +758,7 @@ class KernelManager::Impl {
     const int max_snodes = compiled_structs_.max_snodes;
     const auto &snode_descriptors = compiled_structs_.snode_descriptors;
     // init snode_metas
+    dev_runtime_mirror_.snode_metas = (SNodeMeta *)addr;
     for (int i = 0; i < max_snodes; ++i) {
       auto iter = snode_descriptors.find(i);
       if (iter == snode_descriptors.end()) {
@@ -755,6 +805,7 @@ class KernelManager::Impl {
     TI_DEBUG("Initialized SNodeMeta, size={} accumulated={}", addr_offset,
              (addr - addr_begin));
     // init snode_extractors
+    dev_runtime_mirror_.snode_extractors = (SNodeExtractors *)addr;
     for (int i = 0; i < max_snodes; ++i) {
       auto iter = snode_descriptors.find(i);
       if (iter == snode_descriptors.end()) {
@@ -778,6 +829,7 @@ class KernelManager::Impl {
     TI_DEBUG("Initialized SNodeExtractors, size={} accumulated={}", addr_offset,
              (addr - addr_begin));
     // init snode_lists
+    dev_runtime_mirror_.snode_lists = (ListManagerData *)addr;
     ListManagerData *const rtm_list_begin =
         reinterpret_cast<ListManagerData *>(addr);
     for (int i = 0; i < max_snodes; ++i) {
@@ -800,11 +852,58 @@ class KernelManager::Impl {
     addr += addr_offset;
     TI_DEBUG("Initialized ListManagerData, size={} accumulated={}", addr_offset,
              (addr - addr_begin));
-    // TODO(k-ye): Initialize these
+    // init snode_allocators
+    dev_runtime_mirror_.snode_allocators = (NodeManagerData *)addr;
+    auto init_node_mgr = [&snode_descriptors](const SNodeDescriptor &sn_desc,
+                                              NodeManagerData *nm_data) {
+      nm_data->data_list.element_stride = sn_desc.element_stride;
+      const int num_elems_per_chunk = compute_num_elems_per_chunk(
+          sn_desc.total_num_self_from_root(snode_descriptors));
+      const int log2num = log2int(num_elems_per_chunk);
+      nm_data->data_list.log2_num_elems_per_chunk = log2num;
+      nm_data->data_list.next = 0;
+
+      nm_data->free_list.element_stride = sizeof(int32_t);
+      nm_data->free_list.log2_num_elems_per_chunk = log2num;
+      nm_data->free_list.next = 0;
+
+      nm_data->recycled_list.element_stride = sizeof(int32_t);
+      nm_data->recycled_list.log2_num_elems_per_chunk = log2num;
+      nm_data->recycled_list.next = 0;
+
+      nm_data->recycled_list_size_backup = 0;
+      TI_DEBUG(
+          "NodetManagerData\n  id={}\n  element_stride={}\n  "
+          "num_elems_per_chunk={}\n",
+          sn_desc.snode->id, nm_data->data_list.element_stride,
+          num_elems_per_chunk);
+    };
+    std::vector<std::pair<int, NodeManagerData *>> snode_id_to_nodemgrs;
+    for (int i = 0; i < max_snodes; ++i) {
+      auto iter = snode_descriptors.find(i);
+      if (iter == snode_descriptors.end()) {
+        continue;
+      }
+      const SNodeDescriptor &sn_desc = iter->second;
+      NodeManagerData *nm_data = reinterpret_cast<NodeManagerData *>(addr) + i;
+      init_node_mgr(sn_desc, nm_data);
+      snode_id_to_nodemgrs.push_back(std::make_pair(i, nm_data));
+    }
     addr_offset = sizeof(NodeManagerData) * max_snodes;
     addr += addr_offset;
+    TI_DEBUG("Initialized NodeManagerData, size={} accumulated={}", addr_offset,
+             (addr - addr_begin));
+    // ambient_indices initialization has to be delayed, because it relies on
+    // the initialization of MemoryAllocator.
+    auto *const ambient_indices_begin =
+        reinterpret_cast<NodeManagerData::ElemIndex *>(addr);
+    dev_runtime_mirror_.ambient_indices = ambient_indices_begin;
     addr_offset = sizeof(NodeManagerData::ElemIndex) * max_snodes;
     addr += addr_offset;
+    TI_DEBUG(
+        "Delayed the initialization of SNode ambient elements, size={} "
+        "accumulated={}",
+        addr_offset, (addr - addr_begin));
     // init rand_seeds
     // TODO(k-ye): Provide a way to use a fixed seed in dev mode.
     std::mt19937 generator(
@@ -839,6 +938,18 @@ class KernelManager::Impl {
     for (int i = 0; i < taichi_max_num_indices; ++i) {
       root_elem.coords.at[i] = 0;
     }
+    ListManager root_lm;
+    root_lm.lm_data = rtm_list_begin + root_id;
+    root_lm.mem_alloc = mem_alloc;
+    root_lm.append(root_elem);
+    // initialize all the ambient elements
+    for (const auto &p : snode_id_to_nodemgrs) {
+      NodeManager nm;
+      nm.nm_data = p.second;
+      nm.mem_alloc = mem_alloc;
+      ambient_indices_begin[p.first] = nm.allocate();
+    }
+
     ListManager root_lm;
     root_lm.lm_data = rtm_list_begin + root_id;
     root_lm.mem_alloc = mem_alloc;
@@ -874,6 +985,32 @@ class KernelManager::Impl {
     wait_until_completed(cur_command_buffer_.get());
     create_new_command_buffer();
     profiler_->stop();
+
+    print_runtime_debug();
+  }
+
+  void print_runtime_debug() {
+    const auto &sn_descs = compiled_structs_.snode_descriptors;
+    for (int i = 0; i < compiled_structs_.max_snodes; ++i) {
+      auto iter = sn_descs.find(i);
+      if (iter == sn_descs.end()) {
+        continue;
+      }
+      // const SNodeDescriptor &sn_desc = iter->second;
+      shaders::ListManager lm;
+      lm.lm_data = (dev_runtime_mirror_.snode_lists + i);
+      lm.mem_alloc = dev_mem_alloc_mirror_;
+      // lm.num_active();
+      TI_INFO("ListManager for SNode={} num_active={}", i, lm.num_active());
+      for (int j = 0; j < lm.num_active(); ++j) {
+        const auto elem = lm.get<shaders::ListgenElement>(j);
+        TI_INFO(
+            "  [{}] coord={} mem_offset={} in_root_buffer={} nodemgr: id={} "
+            "elem_idx_raw={}",
+            j, elem.coords.at[0], elem.mem_offset, elem.in_root_buffer(),
+            elem.belonged_nodemgr.id, elem.belonged_nodemgr.elem_idx.value());
+      }
+    }
   }
 
   void check_assertion_failure() {
@@ -1003,6 +1140,18 @@ class KernelManager::Impl {
   std::unordered_map<std::string, std::unique_ptr<CompiledTaichiKernel>>
       compiled_taichi_kernels_;
   PrintStringTable print_strtable_;
+
+  // The |dev_*_mirror_|s are the data structures stored in the Metal device
+  // side that get mirrored to the host side. This is possible because the
+  // underlying memory between host and device is unified. However, make sure
+  // to do a dev <-> host buffer synchronization before reading from/after
+  // writing to these mirrors.
+  //
+  // TODO(k-ye): These mirrors are really just a few pointers into the memory
+  // region maintained by |runtime_mem_|. Maybe create a view wrapper directly
+  // on top of |runtime_mem_|?
+  shaders::Runtime dev_runtime_mirror_;
+  shaders::MemoryAllocator *dev_mem_alloc_mirror_{nullptr};
 };
 
 #else
@@ -1037,6 +1186,11 @@ class KernelManager::Impl {
     TI_ERROR("Metal not supported on the current OS");
     return nullptr;
   }
+
+  std::size_t get_snode_num_dynamically_allocated(SNode *) {
+    TI_ERROR("Metal not supported on the current OS");
+    return 0;
+  }
 };
 
 #endif  // TI_PLATFORM_OSX
@@ -1074,5 +1228,10 @@ PrintStringTable *KernelManager::print_strtable() {
   return impl_->print_strtable();
 }
 
+std::size_t KernelManager::get_snode_num_dynamically_allocated(SNode *snode) {
+  return impl_->get_snode_num_dynamically_allocated(snode);
+}
+
 }  // namespace metal
-TLANG_NAMESPACE_END
+}  // namespace lang
+}  // namespace taichi
diff --git a/taichi/backends/metal/kernel_manager.h b/taichi/backends/metal/kernel_manager.h
index b1caf355d6e91e..aaf36ec6db7882 100644
--- a/taichi/backends/metal/kernel_manager.h
+++ b/taichi/backends/metal/kernel_manager.h
@@ -12,7 +12,8 @@
 #include "taichi/program/kernel_profiler.h"
 #include "taichi/system/memory_pool.h"
 
-TLANG_NAMESPACE_BEGIN
+namespace taichi {
+namespace lang {
 
 struct Context;
 
@@ -64,12 +65,16 @@ class KernelManager {
 
   PrintStringTable *print_strtable();
 
+  // For debugging purpose
+  std::size_t get_snode_num_dynamically_allocated(SNode *snode);
+
  private:
   // Use Pimpl so that we can expose this interface without conditionally
   // compiling on TI_PLATFORM_OSX
   class Impl;
-  std::unique_ptr<Impl> impl_;
+  std::unique_ptr<Impl> impl_{nullptr};
 };
 
 }  // namespace metal
-TLANG_NAMESPACE_END
+}  // namespace lang
+}  // namespace taichi
diff --git a/taichi/backends/metal/kernel_utils.cpp b/taichi/backends/metal/kernel_utils.cpp
index f241c8bf9cdd3e..2c7b9c2c9e28cb 100644
--- a/taichi/backends/metal/kernel_utils.cpp
+++ b/taichi/backends/metal/kernel_utils.cpp
@@ -7,7 +7,8 @@
 #include "taichi/program/context.h"
 #undef TI_RUNTIME_HOST
 
-TLANG_NAMESPACE_BEGIN
+namespace taichi {
+namespace lang {
 
 namespace metal {
 
@@ -52,6 +53,8 @@ std::string KernelAttributes::debug_string() const {
   // TODO(k-ye): show range_for
   if (task_type == OffloadedTaskType::listgen) {
     result += fmt::format(" snode={}", runtime_list_op_attribs->snode->id);
+  } else if (task_type == OffloadedTaskType::gc) {
+    result += fmt::format(" snode={}", gc_op_attribs->snode->id);
   }
   result += ">";
   return result;
@@ -119,4 +122,5 @@ KernelContextAttributes::KernelContextAttributes(const Kernel &kernel)
 
 }  // namespace metal
 
-TLANG_NAMESPACE_END
+}  // namespace lang
+}  // namespace taichi
diff --git a/taichi/backends/metal/kernel_utils.h b/taichi/backends/metal/kernel_utils.h
index 6594746daeea5c..aa4192ede75a56 100644
--- a/taichi/backends/metal/kernel_utils.h
+++ b/taichi/backends/metal/kernel_utils.h
@@ -73,11 +73,16 @@ struct KernelAttributes {
   struct RuntimeListOpAttributes {
     const SNode *snode = nullptr;
   };
+  struct GcOpAttributes {
+    const SNode *snode = nullptr;
+  };
   std::vector<Buffers> buffers;
-  // Only valid when |task_type| is range_for.
+  // Only valid when |task_type| is `range_for`.
   std::optional<RangeForAttributes> range_for_attribs;
-  // Only valid when |task_type| is {clear_list, listgen}.
+  // Only valid when |task_type| is `listgen`.
   std::optional<RuntimeListOpAttributes> runtime_list_op_attribs;
+  // Only valid when |task_type| is `gc`.
+  std::optional<GcOpAttributes> gc_op_attribs;
 
   static std::string buffers_name(Buffers b);
   std::string debug_string() const;
diff --git a/taichi/program/extension.cpp b/taichi/program/extension.cpp
index fb3a2d416b4716..ce121f81d4cccb 100644
--- a/taichi/program/extension.cpp
+++ b/taichi/program/extension.cpp
@@ -22,7 +22,7 @@ bool is_extension_supported(Arch arch, Extension ext) {
         Extension::bls, Extension::assertion}},
       {Arch::metal,
        {Extension::adstack, Extension::assertion, Extension::quant_basic,
-        Extension::async_mode}},
+        Extension::async_mode, Extension::sparse}},
       {Arch::opengl, {Extension::extfunc}},
       {Arch::cc, {Extension::data64, Extension::extfunc, Extension::adstack}},
   };
diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp
index 753a3234b4244e..413f6f54df9ab4 100644
--- a/taichi/program/program.cpp
+++ b/taichi/program/program.cpp
@@ -877,6 +877,9 @@ void Program::print_memory_profiler_info() {
 }
 
 std::size_t Program::get_snode_num_dynamically_allocated(SNode *snode) {
+  if (config.arch == Arch::metal) {
+    return metal_kernel_mgr_->get_snode_num_dynamically_allocated(snode);
+  }
   auto node_allocator = runtime_query<void *>("LLVMRuntime_get_node_allocators",
                                               llvm_runtime, snode->id);
   auto data_list =
diff --git a/tests/python/test_function.py b/tests/python/test_function.py
index 84393c3ba33249..4b532b939cc684 100644
--- a/tests/python/test_function.py
+++ b/tests/python/test_function.py
@@ -1,5 +1,5 @@
 import taichi as ti
-
+import pytest
 
 @ti.test(experimental_real_function=True)
 def test_function_without_return():
@@ -148,7 +148,8 @@ def run(self) -> ti.i32:
     assert x[None] == 0
 
 
-@ti.test(experimental_real_function=True, exclude=[ti.opengl, ti.cc])
+@pytest.mark.skip(reason='https://github.com/taichi-dev/taichi/issues/2442')
+@ti.test(experimental_real_function=True, debug=True)
 def test_templates():
     x = ti.field(ti.i32, shape=())
     y = ti.field(ti.i32, shape=())
diff --git a/tests/python/test_gc.py b/tests/python/test_gc.py
index 9d890a14512896..bedfdcf2c2b88e 100644
--- a/tests/python/test_gc.py
+++ b/tests/python/test_gc.py
@@ -1,7 +1,6 @@
 import taichi as ti
 
 
-@ti.test(require=ti.extension.sparse)
 def _test_block_gc():
     N = 100000
 
diff --git a/tests/python/test_sparse_basics.py b/tests/python/test_sparse_basics.py
index 742d4f51c35c94..2cece435142c47 100644
--- a/tests/python/test_sparse_basics.py
+++ b/tests/python/test_sparse_basics.py
@@ -1,7 +1,7 @@
 import taichi as ti
 
 
-@ti.archs_support_sparse
+@ti.test(require=ti.extension.sparse)
 def test_pointer():
     x = ti.field(ti.f32)
     s = ti.field(ti.i32)
@@ -24,7 +24,7 @@ def func():
     assert s[None] == 256
 
 
-@ti.archs_support_sparse
+@ti.test(require=ti.extension.sparse)
 def test_pointer_is_active():
     x = ti.field(ti.f32)
     s = ti.field(ti.i32)
@@ -47,7 +47,7 @@ def func():
     assert s[None] == 256
 
 
-@ti.archs_support_sparse
+@ti.test(require=ti.extension.sparse)
 def test_pointer2():
     x = ti.field(ti.f32)
     s = ti.field(ti.i32)
diff --git a/tests/python/test_sparse_deactivate.py b/tests/python/test_sparse_deactivate.py
index c2f6fce616a09e..f2961859364500 100644
--- a/tests/python/test_sparse_deactivate.py
+++ b/tests/python/test_sparse_deactivate.py
@@ -156,16 +156,19 @@ def clear_temp():
     fill2()
     clear_temp()
 
-    for iter in range(100):
+    for itr in range(100):
+        if itr % 10 == 0:
+            print('test itr=', itr)
         copy_to_temp()
         clear()
         copy_from_temp()
         clear_temp()
 
+        xn = x.to_numpy()
         for j in range(n * n):
             for i in range(n * n):
                 if i + j < 100:
-                    assert x[i, j] == i + j
+                    assert xn[i, j] == i + j
 
 
 @ti.test(require=ti.extension.sparse)
diff --git a/tests/python/test_sparse_parallel.py b/tests/python/test_sparse_parallel.py
index 54c5fe445af49c..4680396bc570dd 100644
--- a/tests/python/test_sparse_parallel.py
+++ b/tests/python/test_sparse_parallel.py
@@ -1,7 +1,7 @@
 import taichi as ti
 
 
-@ti.archs_support_sparse
+@ti.test(require=ti.extension.sparse)
 def test_pointer():
     x = ti.field(ti.f32)
     s = ti.field(ti.i32)
@@ -26,7 +26,7 @@ def func():
     assert s[None] == n * n
 
 
-@ti.archs_support_sparse
+@ti.test(require=ti.extension.sparse)
 def test_pointer2():
     x = ti.field(ti.f32)
     s = ti.field(ti.i32)
@@ -52,7 +52,7 @@ def func():
     assert s[None] == N * (N - 1) / 2
 
 
-@ti.archs_support_sparse
+@ti.test(require=ti.extension.sparse)
 def test_nested_struct_fill_and_clear():
     a = ti.field(dtype=ti.f32)
     N = 512
diff --git a/tests/python/test_struct_for.py b/tests/python/test_struct_for.py
index 36223c1a74c451..646078eef906e8 100644
--- a/tests/python/test_struct_for.py
+++ b/tests/python/test_struct_for.py
@@ -210,7 +210,7 @@ def fill():
         assert x[i] == i
 
 
-@ti.archs_support_sparse
+@ti.test(require=ti.extension.sparse)
 def test_struct_for_branching():
     # Related issue: https://github.com/taichi-dev/taichi/issues/704
     x = ti.field(dtype=ti.i32)
diff --git a/tests/python/test_struct_for_intermediate.py b/tests/python/test_struct_for_intermediate.py
index 00794e6dfc09ba..2683149282d9c5 100644
--- a/tests/python/test_struct_for_intermediate.py
+++ b/tests/python/test_struct_for_intermediate.py
@@ -1,8 +1,10 @@
 import taichi as ti
 
 
-@ti.test(require=ti.extension.sparse, demote_dense_struct_fors=False)
+# @ti.test(require=ti.extension.sparse, demote_dense_struct_fors=False)
 def test_nested():
+    ti.init(ti.metal, demote_dense_struct_fors=False)
+    ti.set_logging_level(ti.TRACE)
     x = ti.field(ti.i32)
 
     p, q = 3, 7
@@ -13,14 +15,16 @@ def test_nested():
     @ti.kernel
     def iterate():
         for i, j in x.parent():
+            print('i=', i, ' j=', j)
             x[i, j] += 1
 
     iterate()
+    # print(x.to_numpy())
+    # for i in range(p):
+    #     for j in range(q):
+    #         assert x[i * n, j * m] == 1, (i, j)
 
-    for i in range(p):
-        for j in range(q):
-            assert x[i * n, j * m] == 1
-
+test_nested()
 
 @ti.test()
 def test_nested_demote():
diff --git a/tests/python/test_test.py b/tests/python/test_test.py
index 7652a72fd0399f..eb78459bf3659a 100644
--- a/tests/python/test_test.py
+++ b/tests/python/test_test.py
@@ -53,7 +53,7 @@ def test_init_args():
 
 @ti.test(require=ti.extension.sparse)
 def test_require_extensions_1():
-    assert ti.cfg.arch in [ti.cpu, ti.cuda]
+    assert ti.cfg.arch in [ti.cpu, ti.cuda, ti.metal]
 
 
 @ti.test(arch=[ti.cpu, ti.opengl], require=ti.extension.sparse)