Skip to content

Commit

Permalink
speedup find_copy a little bit more
Browse files Browse the repository at this point in the history
  • Loading branch information
Ahdhn committed Jul 9, 2023
1 parent d095ec9 commit 2a83cf8
Show file tree
Hide file tree
Showing 4 changed files with 109 additions and 64 deletions.
29 changes: 20 additions & 9 deletions include/rxmesh/cavity_manager.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ struct CavityManager
: m_write_to_gmem(true),
m_s_num_cavities(nullptr),
m_s_cavity_size_prefix(nullptr),
m_s_q_correspondence_e(nullptr),
m_s_q_correspondence_vf(nullptr),
m_correspondence_size_e(0),
m_correspondence_size_vf(0),
m_s_readd_to_queue(nullptr),
m_s_ev(nullptr),
m_s_fe(nullptr),
Expand Down Expand Up @@ -543,17 +547,19 @@ struct CavityManager
const LPPair* s_stash);

/**
* @brief give a patch q, set a bit in s_in_patch if element x appears in
* s_table/s_stash.
     * @brief given a patch q, we store the corresponding element in p in
     * s_correspondence. Thus, s_correspondence is indexed via q's index space
*/
template <typename HandleT>
__device__ __inline__ void populate_in_patch(
__device__ __inline__ void populate_correspondence(
cooperative_groups::thread_block& block,
uint8_t q_stash,
Bitmask& s_in_patch,
const uint8_t q_stash,
uint16_t* s_correspondence,
const uint16_t s_correspondence_size,
const LPPair* s_table,
const LPPair* s_stash);


// indicate if this block can write its updates to global memory during
// epilogue
bool m_write_to_gmem;
Expand Down Expand Up @@ -603,10 +609,15 @@ struct CavityManager
// indicate if the mesh element is in the interior of the cavity
Bitmask m_s_in_cavity_v, m_s_in_cavity_e, m_s_in_cavity_f;

// given a patch q, this bitmask store whether a vertex/edge/face in q is
// stored in the hashtable of this patch (p). Thus, this bitmask used the
// index space of q
Bitmask m_s_in_patch_v, m_s_in_patch_e, m_s_in_patch_f;
    // given a patch q, this buffer stores p's local index corresponding to
    // an element in q. Thus, this buffer is indexed using q's index space.
    // We either need this for (vertices and edges) or (edges and faces) at the
    // same time. Thus, the buffer used for vertices/faces is recycled to
    // serve both
uint16_t* m_s_q_correspondence_e;
uint16_t* m_s_q_correspondence_vf;
uint16_t m_correspondence_size_e;
uint16_t m_correspondence_size_vf;

bool* m_s_readd_to_queue;

Expand Down
121 changes: 73 additions & 48 deletions include/rxmesh/cavity_manager_impl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,16 @@ CavityManager<blockThreads, cop>::alloc_shared_memory(
const uint16_t edge_cap = m_patch_info.edges_capacity[0];
const uint16_t face_cap = m_patch_info.faces_capacity[0];

const uint16_t max_vertex_cap = static_cast<uint16_t>(
m_context.m_capacity_factor *
static_cast<float>(m_context.m_max_num_vertices[0]));
const uint16_t max_edge_cap =
static_cast<uint16_t>(m_context.m_capacity_factor *
static_cast<float>(m_context.m_max_num_edges[0]));
const uint16_t max_face_cap =
static_cast<uint16_t>(m_context.m_capacity_factor *
static_cast<float>(m_context.m_max_num_faces[0]));

// load EV and FE
m_s_ev = shrd_alloc.alloc<uint16_t>(2 * edge_cap);
m_s_fe = shrd_alloc.alloc<uint16_t>(3 * face_cap);
Expand Down Expand Up @@ -169,7 +179,6 @@ CavityManager<blockThreads, cop>::alloc_shared_memory(
m_s_src_mask_v = Bitmask(m_context.m_max_num_vertices[0], shrd_alloc);
m_s_src_connect_mask_v =
Bitmask(m_context.m_max_num_vertices[0], shrd_alloc);
m_s_in_patch_v = Bitmask(m_context.m_max_num_vertices[0], shrd_alloc);


// edges masks
Expand All @@ -180,12 +189,8 @@ CavityManager<blockThreads, cop>::alloc_shared_memory(
m_s_in_cavity_e,
m_patch_info.owned_mask_e,
m_patch_info.active_mask_e);
const uint16_t max_edge_cap =
static_cast<uint16_t>(m_context.m_capacity_factor *
static_cast<float>(m_context.m_max_num_edges[0]));
m_s_src_mask_e = Bitmask(std::max(max_edge_cap, edge_cap), shrd_alloc);
m_s_src_connect_mask_e = Bitmask(m_context.m_max_num_edges[0], shrd_alloc);
m_s_in_patch_e = Bitmask(m_context.m_max_num_edges[0], shrd_alloc);

// faces masks
alloc_masks(face_cap,
Expand All @@ -195,7 +200,14 @@ CavityManager<blockThreads, cop>::alloc_shared_memory(
m_s_in_cavity_f,
m_patch_info.owned_mask_f,
m_patch_info.active_mask_f);
m_s_in_patch_f = Bitmask(m_context.m_max_num_faces[0], shrd_alloc);

// correspondence
m_correspondence_size_e = max_edge_cap;
m_s_q_correspondence_e =
shrd_alloc.alloc<uint16_t>(m_correspondence_size_e);
m_correspondence_size_vf = std::max(max_face_cap, max_vertex_cap);
m_s_q_correspondence_vf =
shrd_alloc.alloc<uint16_t>(m_correspondence_size_vf);

// patch to lock
__shared__ uint32_t p_to_lock[PatchStash::stash_size];
Expand Down Expand Up @@ -1459,20 +1471,6 @@ __device__ __inline__ bool CavityManager<blockThreads, cop>::migrate_from_patch(
// they mark patches they read from in m_s_patches_to_lock_mask.
    // At the end of every round, one thread makes sure that all
// patches marked in m_s_patches_to_lock_mask are actually locked.
assert(m_s_in_patch_v.m_bitmask != nullptr);
assert(m_s_in_patch_e.m_bitmask != nullptr);
assert(m_s_in_patch_f.m_bitmask != nullptr);
m_s_in_patch_v.reset(block);
m_s_in_patch_e.reset(block);
m_s_in_patch_f.reset(block);
block.sync();

populate_in_patch<VertexHandle>(
block, q_stash_id, m_s_in_patch_v, m_s_table_v, m_s_table_stash_v);
populate_in_patch<EdgeHandle>(
block, q_stash_id, m_s_in_patch_e, m_s_table_e, m_s_table_stash_e);
populate_in_patch<FaceHandle>(
block, q_stash_id, m_s_in_patch_f, m_s_table_f, m_s_table_stash_f);

PatchInfo q_patch_info = m_context.m_patches_info[q];

Expand Down Expand Up @@ -1504,6 +1502,13 @@ __device__ __inline__ bool CavityManager<blockThreads, cop>::migrate_from_patch(
}
}
}

populate_correspondence<VertexHandle>(block,
q_stash_id,
m_s_q_correspondence_vf,
m_correspondence_size_vf,
m_s_table_v,
m_s_table_stash_v);
block.sync();

// 3. make sure there is a copy in p for any vertex in
Expand Down Expand Up @@ -1542,8 +1547,9 @@ __device__ __inline__ bool CavityManager<blockThreads, cop>::migrate_from_patch(
lp, m_s_table_v, m_s_table_stash_v);
if (lp.patch_stash_id() == q_stash_id) {
assert(lp.local_id_in_owner_patch() <
m_s_in_patch_v.size());
m_s_in_patch_v.set(lp.local_id_in_owner_patch(), true);
m_correspondence_size_vf);
m_s_q_correspondence_vf[lp.local_id_in_owner_patch()] =
lp.key();
}
assert(inserted);
}
Expand All @@ -1555,6 +1561,14 @@ __device__ __inline__ bool CavityManager<blockThreads, cop>::migrate_from_patch(
return false;
}

populate_correspondence<EdgeHandle>(block,
q_stash_id,
m_s_q_correspondence_e,
m_correspondence_size_e,
m_s_table_e,
m_s_table_stash_e);
block.sync();

// same story as with the loop that adds vertices
const uint16_t q_num_edges_up =
ROUND_UP_TO_NEXT_MULTIPLE(q_num_edges, blockThreads);
Expand Down Expand Up @@ -1591,8 +1605,9 @@ __device__ __inline__ bool CavityManager<blockThreads, cop>::migrate_from_patch(
lp, m_s_table_e, m_s_table_stash_e);
if (lp.patch_stash_id() == q_stash_id) {
assert(lp.local_id_in_owner_patch() <
m_s_in_patch_e.size());
m_s_in_patch_e.set(lp.local_id_in_owner_patch(), true);
m_correspondence_size_e);
m_s_q_correspondence_e[lp.local_id_in_owner_patch()] =
lp.key();
}
assert(inserted);
}
Expand Down Expand Up @@ -1665,8 +1680,9 @@ __device__ __inline__ bool CavityManager<blockThreads, cop>::migrate_from_patch(
lp, m_s_table_e, m_s_table_stash_e);
if (lp.patch_stash_id() == q_stash_id) {
assert(lp.local_id_in_owner_patch() <
m_s_in_patch_e.size());
m_s_in_patch_e.set(lp.local_id_in_owner_patch(), true);
m_correspondence_size_e);
m_s_q_correspondence_e[lp.local_id_in_owner_patch()] =
lp.key();
}
assert(inserted);
}
Expand All @@ -1677,6 +1693,14 @@ __device__ __inline__ bool CavityManager<blockThreads, cop>::migrate_from_patch(
return false;
}

populate_correspondence<FaceHandle>(block,
q_stash_id,
m_s_q_correspondence_vf,
m_correspondence_size_vf,
m_s_table_f,
m_s_table_stash_f);
block.sync();

// same story as with the loop that adds vertices
const uint16_t q_num_faces_up =
ROUND_UP_TO_NEXT_MULTIPLE(q_num_faces, blockThreads);
Expand Down Expand Up @@ -1705,8 +1729,9 @@ __device__ __inline__ bool CavityManager<blockThreads, cop>::migrate_from_patch(
lp, m_s_table_f, m_s_table_stash_f);
if (lp.patch_stash_id() == q_stash_id) {
assert(lp.local_id_in_owner_patch() <
m_s_in_patch_f.size());
m_s_in_patch_f.set(lp.local_id_in_owner_patch(), true);
m_correspondence_size_vf);
m_s_q_correspondence_vf[lp.local_id_in_owner_patch()] =
lp.key();
}
assert(inserted);
}
Expand Down Expand Up @@ -2082,22 +2107,12 @@ __device__ __inline__ uint16_t CavityManager<blockThreads, cop>::find_copy(
src_patch = owner.patch_id();
lid = owner.local_id();
} else {
if constexpr (std::is_same_v<HandleT, VertexHandle>) {
if (!m_s_in_patch_v(lid)) {
return INVALID16;
}
}

if constexpr (std::is_same_v<HandleT, EdgeHandle>) {
if (!m_s_in_patch_e(lid)) {
return INVALID16;
}
}

if constexpr (std::is_same_v<HandleT, FaceHandle>) {
if (!m_s_in_patch_f(lid)) {
return INVALID16;
}
assert(lid < m_correspondence_size_e);
return m_s_q_correspondence_e[lid];
} else {
assert(lid < m_correspondence_size_vf);
return m_s_q_correspondence_vf[lid];
}
}

Expand Down Expand Up @@ -2137,19 +2152,27 @@ __device__ __inline__ uint16_t CavityManager<blockThreads, cop>::find_copy(

template <uint32_t blockThreads, CavityOp cop>
template <typename HandleT>
__device__ __inline__ void CavityManager<blockThreads, cop>::populate_in_patch(
__device__ __inline__ void
CavityManager<blockThreads, cop>::populate_correspondence(
cooperative_groups::thread_block& block,
uint8_t q_stash,
Bitmask& s_in_patch,
const uint8_t q_stash,
uint16_t* s_correspondence,
const uint16_t s_correspondence_size,
const LPPair* s_table,
const LPPair* s_stash)
{

fill_n<blockThreads>(
s_correspondence, s_correspondence_size, uint16_t(INVALID16));
block.sync();

LPHashTable lp = m_patch_info.get_lp<HandleT>();

for (uint16_t b = threadIdx.x; b < lp.m_capacity; b += blockThreads) {
const auto pair = s_table[b];
if (pair.patch_stash_id() == q_stash) {
s_in_patch.set(pair.local_id_in_owner_patch(), true);
assert(pair.local_id_in_owner_patch() < s_correspondence_size);
s_correspondence[pair.local_id_in_owner_patch()] = pair.key();
}
}

Expand All @@ -2158,11 +2181,13 @@ __device__ __inline__ void CavityManager<blockThreads, cop>::populate_in_patch(
b += blockThreads) {
auto pair = s_stash[b];
if (pair.patch_stash_id() == q_stash) {
s_in_patch.set(pair.local_id_in_owner_patch(), true);
assert(pair.local_id_in_owner_patch() < s_correspondence_size);
s_correspondence[pair.local_id_in_owner_patch()] = pair.key();
}
}
}


template <uint32_t blockThreads, CavityOp cop>
template <typename HandleT>
__device__ __inline__ bool CavityManager<blockThreads, cop>::ensure_ownership(
Expand Down
1 change: 1 addition & 0 deletions include/rxmesh/lp_hashtable.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ struct LPHashTable

__device__ __host__ LPHashTable()
: m_table(nullptr),
m_stash(nullptr),
m_capacity(0),
m_max_cuckoo_chains(0),
m_is_on_device(false)
Expand Down
22 changes: 15 additions & 7 deletions include/rxmesh/rxmesh_dynamic.h
Original file line number Diff line number Diff line change
Expand Up @@ -459,23 +459,31 @@ class RXMeshDynamic : public RXMeshStatic
// owned_cavity_bdry (for vertices only), ribbonize (for vertices only)
// added_to_lp, in_cavity
size_t bitmasks_shmem = 0;
bitmasks_shmem += 11 * detail::mask_num_bytes(vertex_cap) +
11 * ShmemAllocator::default_alignment;
bitmasks_shmem += 8 * detail::mask_num_bytes(edge_cap) +
8 * ShmemAllocator::default_alignment;
bitmasks_shmem += 6 * detail::mask_num_bytes(face_cap) +
6 * ShmemAllocator::default_alignment;
bitmasks_shmem += 10 * detail::mask_num_bytes(vertex_cap) +
10 * ShmemAllocator::default_alignment;
bitmasks_shmem += 7 * detail::mask_num_bytes(edge_cap) +
7 * ShmemAllocator::default_alignment;
bitmasks_shmem += 5 * detail::mask_num_bytes(face_cap) +
5 * ShmemAllocator::default_alignment;

// active cavity bitmask
bitmasks_shmem += detail::mask_num_bytes(face_cap / 2);


// correspondence buffer
size_t cv = sizeof(uint16_t) * vertex_cap;
size_t ce = sizeof(uint16_t) * edge_cap;
size_t cf = sizeof(uint16_t) * face_cap;
size_t correspond_shmem =
std::max(cv + ce, ce + cf) + 2 * ShmemAllocator::default_alignment;

// shared memory is the max of 1. static query shared memory + the
// cavity ID shared memory (since we need to mark seed elements) 2.
// dynamic rxmesh shared memory which includes cavity ID shared memory
// and other things
launch_box.smem_bytes_dyn =
std::max(connectivity_shmem + cavity_id_shmem + cavity_bdr_shmem +
cavity_size_shmem + bitmasks_shmem,
cavity_size_shmem + bitmasks_shmem + correspond_shmem,
static_shmem + cavity_id_shmem);

if (with_vertex_valence) {
Expand Down

0 comments on commit 2a83cf8

Please sign in to comment.