Skip to content

Commit

Permalink
speedup find_copy a little bit more
Browse files Browse the repository at this point in the history
  • Loading branch information
Ahdhn committed Jul 9, 2023
1 parent d095ec9 commit 2a83cf8
Show file tree
Hide file tree
Showing 4 changed files with 109 additions and 64 deletions.
29 changes: 20 additions & 9 deletions include/rxmesh/cavity_manager.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ struct CavityManager
: m_write_to_gmem(true),
m_s_num_cavities(nullptr),
m_s_cavity_size_prefix(nullptr),
m_s_q_correspondence_e(nullptr),
m_s_q_correspondence_vf(nullptr),
m_correspondence_size_e(0),
m_correspondence_size_vf(0),
m_s_readd_to_queue(nullptr),
m_s_ev(nullptr),
m_s_fe(nullptr),
Expand Down Expand Up @@ -543,17 +547,19 @@ struct CavityManager
const LPPair* s_stash);

/**
* @brief give a patch q, set a bit in s_in_patch if element x appears in
* s_table/s_stash.
     * @brief given a patch q, we store the corresponding element in p in
     * s_correspondence. Thus, s_correspondence is indexed via q's index space
*/
template <typename HandleT>
__device__ __inline__ void populate_in_patch(
__device__ __inline__ void populate_correspondence(
cooperative_groups::thread_block& block,
uint8_t q_stash,
Bitmask& s_in_patch,
const uint8_t q_stash,
uint16_t* s_correspondence,
const uint16_t s_correspondence_size,
const LPPair* s_table,
const LPPair* s_stash);


// indicate if this block can write its updates to global memory during
// epilogue
bool m_write_to_gmem;
Expand Down Expand Up @@ -603,10 +609,15 @@ struct CavityManager
// indicate if the mesh element is in the interior of the cavity
Bitmask m_s_in_cavity_v, m_s_in_cavity_e, m_s_in_cavity_f;

// given a patch q, this bitmask store whether a vertex/edge/face in q is
// stored in the hashtable of this patch (p). Thus, this bitmask used the
// index space of q
Bitmask m_s_in_patch_v, m_s_in_patch_e, m_s_in_patch_f;
    // given a patch q, this buffer stores p's local index corresponding to
    // an element in q. Thus, this buffer is indexed using q's index space.
    // We either need this for (vertices and edges) or (edges and faces) at the
    // same time. Thus, the buffer used for vertices/faces is recycled to
    // serve both
uint16_t* m_s_q_correspondence_e;
uint16_t* m_s_q_correspondence_vf;
uint16_t m_correspondence_size_e;
uint16_t m_correspondence_size_vf;

bool* m_s_readd_to_queue;

Expand Down
121 changes: 73 additions & 48 deletions include/rxmesh/cavity_manager_impl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,16 @@ CavityManager<blockThreads, cop>::alloc_shared_memory(
const uint16_t edge_cap = m_patch_info.edges_capacity[0];
const uint16_t face_cap = m_patch_info.faces_capacity[0];

const uint16_t max_vertex_cap = static_cast<uint16_t>(
m_context.m_capacity_factor *
static_cast<float>(m_context.m_max_num_vertices[0]));
const uint16_t max_edge_cap =
static_cast<uint16_t>(m_context.m_capacity_factor *
static_cast<float>(m_context.m_max_num_edges[0]));
const uint16_t max_face_cap =
static_cast<uint16_t>(m_context.m_capacity_factor *
static_cast<float>(m_context.m_max_num_faces[0]));

// load EV and FE
m_s_ev = shrd_alloc.alloc<uint16_t>(2 * edge_cap);
m_s_fe = shrd_alloc.alloc<uint16_t>(3 * face_cap);
Expand Down Expand Up @@ -169,7 +179,6 @@ CavityManager<blockThreads, cop>::alloc_shared_memory(
m_s_src_mask_v = Bitmask(m_context.m_max_num_vertices[0], shrd_alloc);
m_s_src_connect_mask_v =
Bitmask(m_context.m_max_num_vertices[0], shrd_alloc);
m_s_in_patch_v = Bitmask(m_context.m_max_num_vertices[0], shrd_alloc);


// edges masks
Expand All @@ -180,12 +189,8 @@ CavityManager<blockThreads, cop>::alloc_shared_memory(
m_s_in_cavity_e,
m_patch_info.owned_mask_e,
m_patch_info.active_mask_e);
const uint16_t max_edge_cap =
static_cast<uint16_t>(m_context.m_capacity_factor *
static_cast<float>(m_context.m_max_num_edges[0]));
m_s_src_mask_e = Bitmask(std::max(max_edge_cap, edge_cap), shrd_alloc);
m_s_src_connect_mask_e = Bitmask(m_context.m_max_num_edges[0], shrd_alloc);
m_s_in_patch_e = Bitmask(m_context.m_max_num_edges[0], shrd_alloc);

// faces masks
alloc_masks(face_cap,
Expand All @@ -195,7 +200,14 @@ CavityManager<blockThreads, cop>::alloc_shared_memory(
m_s_in_cavity_f,
m_patch_info.owned_mask_f,
m_patch_info.active_mask_f);
m_s_in_patch_f = Bitmask(m_context.m_max_num_faces[0], shrd_alloc);

// correspondence
m_correspondence_size_e = max_edge_cap;
m_s_q_correspondence_e =
shrd_alloc.alloc<uint16_t>(m_correspondence_size_e);
m_correspondence_size_vf = std::max(max_face_cap, max_vertex_cap);
m_s_q_correspondence_vf =
shrd_alloc.alloc<uint16_t>(m_correspondence_size_vf);

// patch to lock
__shared__ uint32_t p_to_lock[PatchStash::stash_size];
Expand Down Expand Up @@ -1459,20 +1471,6 @@ __device__ __inline__ bool CavityManager<blockThreads, cop>::migrate_from_patch(
// they mark patches they read from in m_s_patches_to_lock_mask.
    // At the end of every round, one thread makes sure that all
// patches marked in m_s_patches_to_lock_mask are actually locked.
assert(m_s_in_patch_v.m_bitmask != nullptr);
assert(m_s_in_patch_e.m_bitmask != nullptr);
assert(m_s_in_patch_f.m_bitmask != nullptr);
m_s_in_patch_v.reset(block);
m_s_in_patch_e.reset(block);
m_s_in_patch_f.reset(block);
block.sync();

populate_in_patch<VertexHandle>(
block, q_stash_id, m_s_in_patch_v, m_s_table_v, m_s_table_stash_v);
populate_in_patch<EdgeHandle>(
block, q_stash_id, m_s_in_patch_e, m_s_table_e, m_s_table_stash_e);
populate_in_patch<FaceHandle>(
block, q_stash_id, m_s_in_patch_f, m_s_table_f, m_s_table_stash_f);

PatchInfo q_patch_info = m_context.m_patches_info[q];

Expand Down Expand Up @@ -1504,6 +1502,13 @@ __device__ __inline__ bool CavityManager<blockThreads, cop>::migrate_from_patch(
}
}
}

populate_correspondence<VertexHandle>(block,
q_stash_id,
m_s_q_correspondence_vf,
m_correspondence_size_vf,
m_s_table_v,
m_s_table_stash_v);
block.sync();

// 3. make sure there is a copy in p for any vertex in
Expand Down Expand Up @@ -1542,8 +1547,9 @@ __device__ __inline__ bool CavityManager<blockThreads, cop>::migrate_from_patch(
lp, m_s_table_v, m_s_table_stash_v);
if (lp.patch_stash_id() == q_stash_id) {
assert(lp.local_id_in_owner_patch() <
m_s_in_patch_v.size());
m_s_in_patch_v.set(lp.local_id_in_owner_patch(), true);
m_correspondence_size_vf);
m_s_q_correspondence_vf[lp.local_id_in_owner_patch()] =
lp.key();
}
assert(inserted);
}
Expand All @@ -1555,6 +1561,14 @@ __device__ __inline__ bool CavityManager<blockThreads, cop>::migrate_from_patch(
return false;
}

populate_correspondence<EdgeHandle>(block,
q_stash_id,
m_s_q_correspondence_e,
m_correspondence_size_e,
m_s_table_e,
m_s_table_stash_e);
block.sync();

// same story as with the loop that adds vertices
const uint16_t q_num_edges_up =
ROUND_UP_TO_NEXT_MULTIPLE(q_num_edges, blockThreads);
Expand Down Expand Up @@ -1591,8 +1605,9 @@ __device__ __inline__ bool CavityManager<blockThreads, cop>::migrate_from_patch(
lp, m_s_table_e, m_s_table_stash_e);
if (lp.patch_stash_id() == q_stash_id) {
assert(lp.local_id_in_owner_patch() <
m_s_in_patch_e.size());
m_s_in_patch_e.set(lp.local_id_in_owner_patch(), true);
m_correspondence_size_e);
m_s_q_correspondence_e[lp.local_id_in_owner_patch()] =
lp.key();
}
assert(inserted);
}
Expand Down Expand Up @@ -1665,8 +1680,9 @@ __device__ __inline__ bool CavityManager<blockThreads, cop>::migrate_from_patch(
lp, m_s_table_e, m_s_table_stash_e);
if (lp.patch_stash_id() == q_stash_id) {
assert(lp.local_id_in_owner_patch() <
m_s_in_patch_e.size());
m_s_in_patch_e.set(lp.local_id_in_owner_patch(), true);
m_correspondence_size_e);
m_s_q_correspondence_e[lp.local_id_in_owner_patch()] =
lp.key();
}
assert(inserted);
}
Expand All @@ -1677,6 +1693,14 @@ __device__ __inline__ bool CavityManager<blockThreads, cop>::migrate_from_patch(
return false;
}

populate_correspondence<FaceHandle>(block,
q_stash_id,
m_s_q_correspondence_vf,
m_correspondence_size_vf,
m_s_table_f,
m_s_table_stash_f);
block.sync();

// same story as with the loop that adds vertices
const uint16_t q_num_faces_up =
ROUND_UP_TO_NEXT_MULTIPLE(q_num_faces, blockThreads);
Expand Down Expand Up @@ -1705,8 +1729,9 @@ __device__ __inline__ bool CavityManager<blockThreads, cop>::migrate_from_patch(
lp, m_s_table_f, m_s_table_stash_f);
if (lp.patch_stash_id() == q_stash_id) {
assert(lp.local_id_in_owner_patch() <
m_s_in_patch_f.size());
m_s_in_patch_f.set(lp.local_id_in_owner_patch(), true);
m_correspondence_size_vf);
m_s_q_correspondence_vf[lp.local_id_in_owner_patch()] =
lp.key();
}
assert(inserted);
}
Expand Down Expand Up @@ -2082,22 +2107,12 @@ __device__ __inline__ uint16_t CavityManager<blockThreads, cop>::find_copy(
src_patch = owner.patch_id();
lid = owner.local_id();
} else {
if constexpr (std::is_same_v<HandleT, VertexHandle>) {
if (!m_s_in_patch_v(lid)) {
return INVALID16;
}
}

if constexpr (std::is_same_v<HandleT, EdgeHandle>) {
if (!m_s_in_patch_e(lid)) {
return INVALID16;
}
}

if constexpr (std::is_same_v<HandleT, FaceHandle>) {
if (!m_s_in_patch_f(lid)) {
return INVALID16;
}
assert(lid < m_correspondence_size_e);
return m_s_q_correspondence_e[lid];
} else {
assert(lid < m_correspondence_size_vf);
return m_s_q_correspondence_vf[lid];
}
}

Expand Down Expand Up @@ -2137,19 +2152,27 @@ __device__ __inline__ uint16_t CavityManager<blockThreads, cop>::find_copy(

template <uint32_t blockThreads, CavityOp cop>
template <typename HandleT>
__device__ __inline__ void CavityManager<blockThreads, cop>::populate_in_patch(
__device__ __inline__ void
CavityManager<blockThreads, cop>::populate_correspondence(
cooperative_groups::thread_block& block,
uint8_t q_stash,
Bitmask& s_in_patch,
const uint8_t q_stash,
uint16_t* s_correspondence,
const uint16_t s_correspondence_size,
const LPPair* s_table,
const LPPair* s_stash)
{

fill_n<blockThreads>(
s_correspondence, s_correspondence_size, uint16_t(INVALID16));
block.sync();

LPHashTable lp = m_patch_info.get_lp<HandleT>();

for (uint16_t b = threadIdx.x; b < lp.m_capacity; b += blockThreads) {
const auto pair = s_table[b];
if (pair.patch_stash_id() == q_stash) {
s_in_patch.set(pair.local_id_in_owner_patch(), true);
assert(pair.local_id_in_owner_patch() < s_correspondence_size);
s_correspondence[pair.local_id_in_owner_patch()] = pair.key();
}
}

Expand All @@ -2158,11 +2181,13 @@ __device__ __inline__ void CavityManager<blockThreads, cop>::populate_in_patch(
b += blockThreads) {
auto pair = s_stash[b];
if (pair.patch_stash_id() == q_stash) {
s_in_patch.set(pair.local_id_in_owner_patch(), true);
assert(pair.local_id_in_owner_patch() < s_correspondence_size);
s_correspondence[pair.local_id_in_owner_patch()] = pair.key();
}
}
}


template <uint32_t blockThreads, CavityOp cop>
template <typename HandleT>
__device__ __inline__ bool CavityManager<blockThreads, cop>::ensure_ownership(
Expand Down
1 change: 1 addition & 0 deletions include/rxmesh/lp_hashtable.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ struct LPHashTable

__device__ __host__ LPHashTable()
: m_table(nullptr),
m_stash(nullptr),
m_capacity(0),
m_max_cuckoo_chains(0),
m_is_on_device(false)
Expand Down
22 changes: 15 additions & 7 deletions include/rxmesh/rxmesh_dynamic.h
Original file line number Diff line number Diff line change
Expand Up @@ -459,23 +459,31 @@ class RXMeshDynamic : public RXMeshStatic
// owned_cavity_bdry (for vertices only), ribbonize (for vertices only)
// added_to_lp, in_cavity
size_t bitmasks_shmem = 0;
bitmasks_shmem += 11 * detail::mask_num_bytes(vertex_cap) +
11 * ShmemAllocator::default_alignment;
bitmasks_shmem += 8 * detail::mask_num_bytes(edge_cap) +
8 * ShmemAllocator::default_alignment;
bitmasks_shmem += 6 * detail::mask_num_bytes(face_cap) +
6 * ShmemAllocator::default_alignment;
bitmasks_shmem += 10 * detail::mask_num_bytes(vertex_cap) +
10 * ShmemAllocator::default_alignment;
bitmasks_shmem += 7 * detail::mask_num_bytes(edge_cap) +
7 * ShmemAllocator::default_alignment;
bitmasks_shmem += 5 * detail::mask_num_bytes(face_cap) +
5 * ShmemAllocator::default_alignment;

// active cavity bitmask
bitmasks_shmem += detail::mask_num_bytes(face_cap / 2);


// correspondence buffer
size_t cv = sizeof(uint16_t) * vertex_cap;
size_t ce = sizeof(uint16_t) * edge_cap;
size_t cf = sizeof(uint16_t) * face_cap;
size_t correspond_shmem =
std::max(cv + ce, ce + cf) + 2 * ShmemAllocator::default_alignment;

// shared memory is the max of 1. static query shared memory + the
// cavity ID shared memory (since we need to mark seed elements) 2.
// dynamic rxmesh shared memory which includes cavity ID shared memory
// and other things
launch_box.smem_bytes_dyn =
std::max(connectivity_shmem + cavity_id_shmem + cavity_bdr_shmem +
cavity_size_shmem + bitmasks_shmem,
cavity_size_shmem + bitmasks_shmem + correspond_shmem,
static_shmem + cavity_id_shmem);

if (with_vertex_valence) {
Expand Down

0 comments on commit 2a83cf8

Please sign in to comment.