Fix memory allocation

Make elements allocation based on either the max element per patch or the standard patch
owensgroup · Nov 3, 2023 · db5fd8e · db5fd8e
1 parent d13531b
commit db5fd8e
Show file tree

Hide file tree

Showing 4 changed files with 108 additions and 85 deletions.
diff --git a/apps/Remesh/remesh_kernels.cuh b/apps/Remesh/remesh_kernels.cuh
@@ -47,7 +47,7 @@ __global__ static void edge_split(rxmesh::Context context,
  ShmemAllocator shrd_alloc;
 
  CavityManager<blockThreads, CavityOp::E> cavity(
- block, context, shrd_alloc, false);
+ block, context, shrd_alloc, true);
 
 
  if (cavity.patch_id() == INVALID32) {

diff --git a/include/rxmesh/rxmesh.cpp b/include/rxmesh/rxmesh.cpp
@@ -70,9 +70,26 @@ void RXMesh::init(const std::vector<std::vector<uint32_t>>& fv,
  }
 
  build(fv, patcher_file);
+
+ // calc max elements for use in build_device (which populates
+ // m_h_patches_info and thus we can not use calc_max_elements now)
+ m_max_vertices_per_patch = 0;
+ m_max_edges_per_patch = 0;
+ m_max_faces_per_patch = 0;
+ for (uint32_t p = 0; p < get_num_patches(); ++p) {
+ m_max_vertices_per_patch =
+ std::max(m_max_vertices_per_patch,
+ static_cast<uint32_t>(m_h_patches_ltog_v[p].size()));
+ m_max_edges_per_patch =
+ std::max(m_max_edges_per_patch,
+ static_cast<uint32_t>(m_h_patches_ltog_e[p].size()));
+ m_max_faces_per_patch =
+ std::max(m_max_faces_per_patch,
+ static_cast<uint32_t>(m_h_patches_ltog_f[p].size()));
+ }
+
  build_device();
 
- calc_max_elements();
 
  PatchScheduler sch;
  sch.init(get_max_num_patches());
@@ -117,12 +134,6 @@ void RXMesh::init(const std::vector<std::vector<uint32_t>>& fv,
  RXMESH_TRACE("per-patch maximum edge count = {}", m_max_edges_per_patch);
  RXMESH_TRACE("per-patch maximum vertex count = {}",
  m_max_vertices_per_patch);
- // RXMESH_TRACE("per-patch maximum not-owned face count = {}",
- // m_max_not_owned_faces);
- // RXMESH_TRACE("per-patch maximum not-owned edge count = {}",
- // m_max_not_owned_edges);
- // RXMESH_TRACE("per-patch maximum not-owned vertex count = {}",
- // m_max_not_owned_vertices);
 }
 
 RXMesh::~RXMesh()
@@ -425,9 +436,6 @@ void RXMesh::calc_input_statistics(const std::vector<std::vector<uint32_t>>& fv,
 
 void RXMesh::calc_max_elements()
 {
- // m_max_not_owned_vertices = 0;
- // m_max_not_owned_edges = 0;
- // m_max_not_owned_faces = 0;
  m_max_vertices_per_patch = 0;
  m_max_edges_per_patch = 0;
  m_max_faces_per_patch = 0;
@@ -437,28 +445,10 @@ void RXMesh::calc_max_elements()
  m_max_vertices_per_patch =
  std::max(m_max_vertices_per_patch,
  uint32_t(m_h_patches_info[p].num_vertices[0]));
-
  m_max_edges_per_patch = std::max(
  m_max_edges_per_patch, uint32_t(m_h_patches_info[p].num_edges[0]));
-
  m_max_faces_per_patch = std::max(
  m_max_faces_per_patch, uint32_t(m_h_patches_info[p].num_faces[0]));
-
-
- // m_max_not_owned_vertices = std::max(
- // m_max_not_owned_vertices,
- // detail::count_zero_bits(m_h_patches_info[p].num_vertices[0],
- // m_h_patches_info[p].owned_mask_v));
- //
- // m_max_not_owned_edges =
- // std::max(m_max_not_owned_edges,
- // detail::count_zero_bits(m_h_patches_info[p].num_edges[0],
- // m_h_patches_info[p].owned_mask_e));
- //
- // m_max_not_owned_faces =
- // std::max(m_max_not_owned_faces,
- // detail::count_zero_bits(m_h_patches_info[p].num_faces[0],
- // m_h_patches_info[p].owned_mask_f));
  }
 }
 
@@ -768,11 +758,54 @@ uint32_t RXMesh::get_edge_id(const std::pair<uint32_t, uint32_t>& edge) const
  return edge_id;
 }
 
+uint32_t RXMesh::get_standard_patch_num_vertices() const
+{
+ return DIVIDE_UP(get_standard_patch_num_faces(), 2);
+}
+uint32_t RXMesh::get_standard_patch_num_edges() const
+{
+ return 3 * get_standard_patch_num_vertices();
+}
+uint32_t RXMesh::get_standard_patch_num_faces() const
+{
+ return m_patch_size;
+}
+
+uint16_t RXMesh::get_per_patch_max_vertex_capacity() const
+{
+ // the capacity size of a patch is based on the maximum of
+ // 1. the standard patch
+ // 2. the per-patch max number of elements
+ // we then pre-multiply this number by the capacity factor increase
+
+ const float max_cap_vertices = static_cast<float>(std::max(
+ get_standard_patch_num_vertices(), get_per_patch_max_vertices()));
+
+ return static_cast<uint16_t>(
+ std::ceil(m_capacity_factor * max_cap_vertices));
+}
+uint16_t RXMesh::get_per_patch_max_edge_capacity() const
+{
+ const float max_cap_edges = static_cast<float>(
+ std::max(get_standard_patch_num_edges(), get_per_patch_max_edges()));
+
+ return static_cast<uint16_t>(std::ceil(m_capacity_factor * max_cap_edges));
+}
+uint16_t RXMesh::get_per_patch_max_face_capacity() const
+{
+ const float max_cap_faces = static_cast<float>(
+ std::max(get_standard_patch_num_faces(), get_per_patch_max_faces()));
+
+ return static_cast<uint16_t>(std::ceil(m_capacity_factor * max_cap_faces));
+}
+
+
 void RXMesh::build_device()
 {
  CUDA_ERROR(cudaMalloc((void**)&m_d_patches_info,
  get_max_num_patches() * sizeof(PatchInfo)));
 
+
 #pragma omp parallel for
  for (int p = 0; p < static_cast<int>(get_num_patches()); ++p) {
 
@@ -783,20 +816,13 @@ void RXMesh::build_device()
  const uint16_t p_num_faces =
  static_cast<uint16_t>(m_h_patches_ltog_f[p].size());
 
- const uint16_t p_vertices_capacity = static_cast<uint16_t>(
- std::ceil(m_capacity_factor * static_cast<float>(p_num_vertices)));
- const uint16_t p_edges_capacity = static_cast<uint16_t>(
- std::ceil(m_capacity_factor * static_cast<float>(p_num_edges)));
- const uint16_t p_faces_capacity = static_cast<uint16_t>(
- std::ceil(m_capacity_factor * static_cast<float>(p_num_faces)));
-
  build_device_single_patch(p,
  p_num_vertices,
  p_num_edges,
  p_num_faces,
- p_vertices_capacity,
- p_edges_capacity,
- p_faces_capacity,
+ get_per_patch_max_vertex_capacity(),
+ get_per_patch_max_edge_capacity(),
+ get_per_patch_max_face_capacity(),
  m_h_num_owned_v[p],
  m_h_num_owned_e[p],
  m_h_num_owned_f[p],
@@ -921,8 +947,13 @@ void RXMesh::build_device_single_patch(const uint32_t patch_id,
  cudaMemcpyHostToDevice));
 
  // allocate and copy patch topology to the device
+ // we realloc the host h_patch_info EV and FE to ensure that both host and
+ // device has the same capacity
  CUDA_ERROR(cudaMalloc((void**)&d_patch.ev,
  p_edges_capacity * 2 * sizeof(LocalVertexT)));
+ h_patch_info.ev = (LocalVertexT*)realloc(
+ h_patch_info.ev, p_edges_capacity * 2 * sizeof(LocalVertexT));
+
  if (p_num_edges > 0) {
  CUDA_ERROR(cudaMemcpy(d_patch.ev,
  h_patch_info.ev,
@@ -932,6 +963,9 @@ void RXMesh::build_device_single_patch(const uint32_t patch_id,
 
  CUDA_ERROR(cudaMalloc((void**)&d_patch.fe,
  p_faces_capacity * 3 * sizeof(LocalEdgeT)));
+ h_patch_info.fe = (LocalEdgeT*)realloc(
+ h_patch_info.fe, p_faces_capacity * 3 * sizeof(LocalEdgeT));
+
  if (p_num_faces > 0) {
  CUDA_ERROR(cudaMemcpy(d_patch.fe,
  h_patch_info.fe,
@@ -1156,6 +1190,10 @@ void RXMesh::build_device_single_patch(const uint32_t patch_id,
 void RXMesh::allocate_extra_patches()
 {
 
+ const uint16_t p_vertices_capacity = get_per_patch_max_vertex_capacity();
+ const uint16_t p_edges_capacity = get_per_patch_max_edge_capacity();
+ const uint16_t p_faces_capacity = get_per_patch_max_face_capacity();
+
 #pragma omp parallel for
  for (int p = get_num_patches(); p < static_cast<int>(get_max_num_patches());
  ++p) {
@@ -1164,11 +1202,6 @@ void RXMesh::allocate_extra_patches()
  const uint16_t p_num_edges = 0;
  const uint16_t p_num_faces = 0;
 
- const uint16_t p_vertices_capacity =
- get_per_patch_max_vertices_capacity();
- const uint16_t p_edges_capacity = get_per_patch_max_edges_capacity();
- const uint16_t p_faces_capacity = get_per_patch_max_faces_capacity();
-
  m_h_patches_info[p].ev =
  (LocalVertexT*)malloc(2 * p_edges_capacity * sizeof(LocalVertexT));
  m_h_patches_info[p].fe =

diff --git a/include/rxmesh/rxmesh.h b/include/rxmesh/rxmesh.h
@@ -172,33 +172,6 @@ class RXMesh
  return m_max_faces_per_patch;
  }
 
- /**
- * @brief Maximum capacity number of vertices in a patch
- */
- uint16_t get_per_patch_max_vertices_capacity() const
- {
- return static_cast<uint16_t>(std::ceil(
- m_capacity_factor * static_cast<float>(m_max_vertices_per_patch)));
- }
-
- /**
- * @brief Maximum capacity number of edges in a patch
- */
- uint32_t get_per_patch_max_edges_capacity() const
- {
- return static_cast<uint16_t>(std::ceil(
- m_capacity_factor * static_cast<float>(m_max_edges_per_patch)));
- }
-
- /**
- * @brief Maximum capacity number of faces in a patch
- */
- uint32_t get_per_patch_max_faces_capacity() const
- {
- return static_cast<uint16_t>(std::ceil(
- m_capacity_factor * static_cast<float>(m_max_faces_per_patch)));
- }
-
  /**
  * @brief The time used to construct the patches on the GPU
  */
@@ -409,6 +382,18 @@ class RXMesh
  const std::vector<std::vector<uint32_t>>& fv,
  const uint32_t patch_id);
 
+ // standard patch is a patch that has m_patch_size faces
+ // we apply Euler characteristic to get the number of vertices and edges in
+ // such a patch
+ uint32_t get_standard_patch_num_vertices() const;
+ uint32_t get_standard_patch_num_edges() const;
+ uint32_t get_standard_patch_num_faces() const;
+
+ // get the max vertex/edge/face capacity i.e., the max number of
+ // vertices/edges/faces allowed in a patch (for allocation purposes)
+ uint16_t get_per_patch_max_vertex_capacity() const;
+ uint16_t get_per_patch_max_edge_capacity() const;
+ uint16_t get_per_patch_max_face_capacity() const;
 
  void build_device();
  void build_device_single_patch(const uint32_t patch_id,

diff --git a/include/rxmesh/rxmesh_dynamic.h b/include/rxmesh/rxmesh_dynamic.h
@@ -165,8 +165,8 @@ __global__ static void slice_patches(Context context,
  ::atomicAdd(context.m_num_patches, uint32_t(1));
  assert(s_new_patch_id < context.m_max_num_patches);
  } else {
- s_new_patch_id = INVALID32;
- pi.should_slice = false;
+ s_new_patch_id  = INVALID32;
+ context.m_patches_info[pid].should_slice = false;
  }
  }
  Bitmask s_owned_v, s_owned_e, s_owned_f;
@@ -515,17 +515,9 @@ class RXMeshDynamic : public RXMeshStatic
  }
 
  if (is_dyn) {
- uint16_t vertex_cap = static_cast<uint16_t>(
- this->m_capacity_factor *
- static_cast<float>(this->m_max_vertices_per_patch));
-
- uint16_t edge_cap = static_cast<uint16_t>(
- this->m_capacity_factor *
- static_cast<float>(this->m_max_edges_per_patch));
-
- uint16_t face_cap = static_cast<uint16_t>(
- this->m_capacity_factor *
- static_cast<float>(this->m_max_faces_per_patch));
+ uint16_t vertex_cap = get_per_patch_max_vertex_capacity();
+ uint16_t edge_cap = get_per_patch_max_edge_capacity();
+ uint16_t face_cap = get_per_patch_max_face_capacity();
 
  // connecivity (FE and EV) shared memory
  size_t connectivity_shmem = 0;
@@ -602,7 +594,8 @@ class RXMeshDynamic : public RXMeshStatic
  // memory and other things
  launch_box.smem_bytes_dyn = std::max(
  connectivity_shmem + cavity_id_shmem + cavity_bdr_shmem +
- cavity_size_shmem + bitmasks_shmem + correspond_shmem,
+ cavity_size_shmem + bitmasks_shmem + correspond_shmem +
+ cavity_creator_shmem,
  static_shmem + cavity_id_shmem + cavity_creator_shmem);
  } else {
  launch_box.smem_bytes_dyn = static_shmem;
@@ -672,6 +665,18 @@ class RXMeshDynamic : public RXMeshStatic
  constexpr uint32_t block_size = 256;
  const uint32_t grid_size = get_num_patches();
 
+ CUDA_ERROR(cudaMemcpy(&this->m_max_vertices_per_patch,
+ this->m_rxmesh_context.m_max_num_vertices,
+ sizeof(uint32_t),
+ cudaMemcpyDeviceToHost));
+ CUDA_ERROR(cudaMemcpy(&this->m_max_edges_per_patch,
+ this->m_rxmesh_context.m_max_num_edges,
+ sizeof(uint32_t),
+ cudaMemcpyDeviceToHost));
+ CUDA_ERROR(cudaMemcpy(&this->m_max_faces_per_patch,
+ this->m_rxmesh_context.m_max_num_faces,
+ sizeof(uint32_t),
+ cudaMemcpyDeviceToHost));
 
  // ev, fe
  uint32_t dyn_shmem =