From 8aeab463df12fd21c1ba2a86147f259554ed3d0e Mon Sep 17 00:00:00 2001
From: "Samuel K. Gutierrez"
Date: Fri, 26 Jul 2024 15:06:20 -0600
Subject: [PATCH] Rework qvi_scope_split_agg_s.

Rework qvi_scope_split_agg_s in preparation for hardware mapping improvements.

Signed-off-by: Samuel K. Gutierrez
---
 src/qvi-bbuff-rmi.h     |   4 +-
 src/qvi-group-mpi.h     |   4 +-
 src/qvi-group-omp.h     |   4 +-
 src/qvi-group-process.h |   4 +-
 src/qvi-group-pthread.h |   4 +-
 src/qvi-group.h         |   4 +-
 src/qvi-hwpool.cc       |  18 +-
 src/qvi-hwpool.h        |   2 +-
 src/qvi-scope.cc        | 983 +++++++++++++++++++---------------
 9 files changed, 480 insertions(+), 547 deletions(-)

diff --git a/src/qvi-bbuff-rmi.h b/src/qvi-bbuff-rmi.h
index 665db9e..1d69966 100644
--- a/src/qvi-bbuff-rmi.h
+++ b/src/qvi-bbuff-rmi.h
@@ -825,11 +825,11 @@ qvi_bbuff_rmi_unpack_item(
 }
 
 /**
- * Unpacks qvi_hwpool_dev_s *
+ * Unpacks qvi_hwpool_dev_s &
  */
 inline int
 qvi_bbuff_rmi_unpack_item(
-    qvi_hwpool_dev_s *dev,
+    qvi_hwpool_dev_s &dev,
     byte_t *buffpos,
     size_t *bytes_written
 ) {
diff --git a/src/qvi-group-mpi.h b/src/qvi-group-mpi.h
index f0b79c0..8722b6c 100644
--- a/src/qvi-group-mpi.h
+++ b/src/qvi-group-mpi.h
@@ -46,13 +46,13 @@ struct qvi_group_mpi_s : public qvi_group_s {
     }
 
     virtual int
-    rank(void)
+    rank(void) const
     {
         return qvi_mpi_group_rank(m_mpi_group);
     }
 
     virtual int
-    size(void)
+    size(void) const
     {
         return qvi_mpi_group_size(m_mpi_group);
     }
diff --git a/src/qvi-group-omp.h b/src/qvi-group-omp.h
index 3e51c79..799e7c2 100644
--- a/src/qvi-group-omp.h
+++ b/src/qvi-group-omp.h
@@ -43,13 +43,13 @@ struct qvi_group_omp_s : public qvi_group_s {
     }
 
     virtual int
-    rank(void)
+    rank(void) const
    {
         return qvi_omp_group_id(m_ompgroup);
     }
 
     virtual int
-    size(void)
+    size(void) const
     {
         return qvi_omp_group_size(m_ompgroup);
     }
diff --git a/src/qvi-group-process.h b/src/qvi-group-process.h
index d32795e..a4be770 100644
--- a/src/qvi-group-process.h
+++ b/src/qvi-group-process.h
@@ -37,13 +37,13 @@ struct qvi_group_process_s : public qvi_group_s {
     }
 
     virtual int
-    rank(void)
+    rank(void) const
     {
         return qvi_process_group_id(m_proc_group);
     }
 
     virtual int
-    size(void)
+    size(void) const
     {
         return qvi_process_group_size(m_proc_group);
     }
diff --git a/src/qvi-group-pthread.h b/src/qvi-group-pthread.h
index 95dffe1..0edeb32 100644
--- a/src/qvi-group-pthread.h
+++ b/src/qvi-group-pthread.h
@@ -37,13 +37,13 @@ struct qvi_group_pthread_s : public qvi_group_s {
     }
 
     virtual int
-    rank(void)
+    rank(void) const
     {
         return thgroup->rank();
     }
 
     virtual int
-    size(void)
+    size(void) const
     {
         return thgroup->size();
     }
diff --git a/src/qvi-group.h b/src/qvi-group.h
index a05d68f..f9fdaab 100644
--- a/src/qvi-group.h
+++ b/src/qvi-group.h
@@ -39,10 +39,10 @@ struct qvi_group_s : qvi_refc_s {
     hwloc(void);
     /** Returns the number of members in this group. */
     virtual int
-    size(void) = 0;
+    size(void) const = 0;
     /** Returns the caller's group rank. */
     virtual int
-    rank(void) = 0;
+    rank(void) const = 0;
     /** Performs node-local group barrier. */
     virtual int
     barrier(void) = 0;
diff --git a/src/qvi-hwpool.cc b/src/qvi-hwpool.cc
index a54ce2f..7bc980f 100644
--- a/src/qvi-hwpool.cc
+++ b/src/qvi-hwpool.cc
@@ -260,47 +260,47 @@ int
 qvi_hwpool_dev_s::unpack(
     byte_t *buffpos,
     size_t *bytes_written,
-    qvi_hwpool_dev_s *dev
+    qvi_hwpool_dev_s &dev
 ) {
     size_t bw = 0, total_bw = 0;
 
     int rc = qvi_bbuff_rmi_unpack_item(
-        &dev->m_hints, buffpos, &bw
+        &dev.m_hints, buffpos, &bw
     );
     if (qvi_unlikely(rc != QV_SUCCESS)) goto out;
     total_bw += bw;
     buffpos += bw;
 
     rc = qvi_bbuff_rmi_unpack_item(
-        dev->m_affinity, buffpos, &bw
+        dev.m_affinity, buffpos, &bw
     );
     if (qvi_unlikely(rc != QV_SUCCESS)) goto out;
     total_bw += bw;
     buffpos += bw;
 
     rc = qvi_bbuff_rmi_unpack_item(
-        &dev->m_type, buffpos, &bw
+        &dev.m_type, buffpos, &bw
     );
     if (qvi_unlikely(rc != QV_SUCCESS)) goto out;
     total_bw += bw;
     buffpos += bw;
 
     rc = qvi_bbuff_rmi_unpack_item(
-        &dev->m_id, buffpos, &bw
+        &dev.m_id, buffpos, &bw
     );
     if (qvi_unlikely(rc != QV_SUCCESS)) goto out;
     total_bw += bw;
     buffpos += bw;
 
     rc = qvi_bbuff_rmi_unpack_item(
-        dev->m_pci_bus_id, buffpos, &bw
+        dev.m_pci_bus_id, buffpos, &bw
     );
     if (qvi_unlikely(rc != QV_SUCCESS)) goto out;
     total_bw += bw;
     buffpos += bw;
 
     rc = qvi_bbuff_rmi_unpack_item(
-        dev->m_uuid, buffpos, &bw
+        dev.m_uuid, buffpos, &bw
     );
     if (qvi_unlikely(rc != QV_SUCCESS)) goto out;
     total_bw += bw;
@@ -453,9 +453,7 @@ qvi_hwpool_s::unpack(
     // Unpack and add the devices.
     for (size_t i = 0; i < ndev; ++i) {
         qvi_hwpool_dev_s dev;
-        rc = qvi_bbuff_rmi_unpack_item(
-            &dev, buffpos, &bw
-        );
+        rc = qvi_bbuff_rmi_unpack_item(dev, buffpos, &bw);
         if (qvi_unlikely(rc != QV_SUCCESS)) break;
         total_bw += bw;
         buffpos += bw;
diff --git a/src/qvi-hwpool.h b/src/qvi-hwpool.h
index 037c279..6500947 100644
--- a/src/qvi-hwpool.h
+++ b/src/qvi-hwpool.h
@@ -122,7 +122,7 @@ struct qvi_hwpool_dev_s : qvi_hwpool_res_s {
     unpack(
         byte_t *buffpos,
         size_t *bytes_written,
-        qvi_hwpool_dev_s *dev
+        qvi_hwpool_dev_s &dev
     );
 };
 
diff --git a/src/qvi-scope.cc b/src/qvi-scope.cc
index 41d9619..007f358 100644
--- a/src/qvi-scope.cc
+++ b/src/qvi-scope.cc
@@ -465,21 +465,13 @@ bcast_value(
  * qvi_scope_split_agg_s, but that isn't a requirement.
  */
 struct qvi_scope_split_agg_s {
-    /**
-     * A pointer to my RMI.
-     */
+    /** A pointer to my RMI. */
     qvi_rmi_client_t *rmi = nullptr;
-    /**
-     * The base hardware pool we are splitting.
-     */
+    /** The base hardware pool we are splitting. */
     qvi_hwpool_s *base_hwpool = nullptr;
-    /**
-     * The number of members that are part of the split.
-     */
+    /** The number of members that are part of the split. */
     uint_t group_size = 0;
-    /**
-     * The number of pieces in the split.
-     */
+    /** The number of pieces in the split. */
     uint_t split_size = 0;
     /**
      * The potential hardware resource that we are splitting at. QV_HW_OBJ_LAST
@@ -487,66 +479,450 @@ struct qvi_scope_split_agg_s {
      * resource type indicates that we are splitting at that type: called from a
      * split_at() context.
      */
-    qv_hw_obj_type_t split_at_type{};
+    qv_hw_obj_type_t split_at_type;
     /**
      * Vector of task IDs, one for each member of the group. Note that the
      * number of task IDs will always match the group size and that their array
     * index corresponds to a task ID. It is handy to have the task IDs for
     * splitting so we can query task characteristics during a splitting.
     */
-    std::vector taskids{};
+    std::vector taskids;
    /**
     * Vector of hardware pools, one for each member of the group.
Note that the * number of hardware pools will always match the group size and that their * array index corresponds to a task ID: 0 ... group_size - 1. */ - std::vector hwpools{}; + std::vector hwpools; /** * Vector of colors, one for each member of the group. Note that the number * of colors will always match the group size and that their array index * corresponds to a task ID. */ - std::vector colors{}; + std::vector colors; + /** Vector of task affinities. */ + qvi_hwloc_cpusets_t affinities; + /** Constructor. */ + qvi_scope_split_agg_s(void) = default; + /** Constructor. */ + qvi_scope_split_agg_s( + qv_scope_t *parent, + uint_t group_size_a, + uint_t split_size_a, + qv_hw_obj_type_t split_at_type_a + ) : rmi(parent->group->task()->rmi()) + , base_hwpool(parent->hwpool) + , group_size(group_size_a) + , split_size(split_size_a) + , split_at_type(split_at_type_a) + { + // To save memory we don't eagerly resize our vectors to group_size + // since most processes will not use the storage. For example, in the + // collective case the root ID process will be the only one needing + // group_size elements in our vectors. We'll let the call paths enforce + // appropriate vector sizing. + } + /** Destructor. */ + ~qvi_scope_split_agg_s(void) + { + for (auto &hwpool : hwpools) { + qvi_delete(&hwpool); + } + } /** - * Vector of task affinities. + * Resizes the relevant containers to make + * room for |group size| number of elements. */ - qvi_hwloc_cpusets_t affinities{}; + void + reserve(void) + { + taskids.resize(group_size); + hwpools.resize(group_size); + colors.resize(group_size); + affinities.resize(group_size); + } /** - * Constructor. + * Returns a copy of the aggregate cpuset. Note that the cpuset will be shared + * among the aggregate, but other resources may be distributed differently. + * For example, some hardware pools may have GPUs, while others may not. */ - qvi_scope_split_agg_s(void) = default; + qvi_hwloc_bitmap_s + cpuset(void) const + { + // This shouldn't happen. + assert(hwpools.size() != 0); + return hwpools[0]->cpuset(); + } /** - * Destructor + * Performs a straightforward splitting of the provided cpuset: split the + * provided base cpuset into splitagg.split_size distinct pieces. */ - ~qvi_scope_split_agg_s(void) + int + split_cpuset( + qvi_hwloc_cpusets_t &result + ) const { + // The cpuset that we are going to split. + const qvi_hwloc_bitmap_s base_cpuset = cpuset(); + // Pointer to my hwloc instance. + qvi_hwloc_t *const hwloc = qvi_rmi_client_hwloc(rmi); + // Holds the host's split cpusets. + result.resize(split_size); + // Notice that we do not go through the RMI for this because this is just an + // local, temporary splitting that is ultimately fed to another splitting + // algorithm. + int rc = QV_SUCCESS; + for (uint_t chunkid = 0; chunkid < split_size; ++chunkid) { + rc = qvi_hwloc_split_cpuset_by_chunk_id( + hwloc, base_cpuset.cdata(), split_size, + chunkid, result[chunkid].data() + ); + if (rc != QV_SUCCESS) break; + } + return rc; + } + /** + * Returns device affinities that are part of the split. + */ + int + osdev_cpusets( + qvi_hwloc_cpusets_t &result + ) const { + // Get the number of devices we have available in the provided scope. + int nobj = 0; + int rc = base_hwpool->nobjects( + qvi_rmi_client_hwloc(rmi), split_at_type, &nobj + ); + if (rc != QV_SUCCESS) return rc; + // Holds the device affinities used for the split. 
+ result.resize(nobj); + uint_t affi = 0; + for (const auto &dinfo : base_hwpool->devices()) { + // Not the type we are looking to split. + if (split_at_type != dinfo.first) continue; + // Copy the device's affinity to our list of device affinities. + result[affi++] = dinfo.second->affinity(); + } + return rc; + } + + int + primary_cpusets( + qvi_hwloc_cpusets_t &result + ) const { + // We were provided a real host resource type that we have to split. Or + // QV_HW_OBJ_LAST is instead provided to indicate that we were called from a + // split() context, which uses the host's cpuset to split the resources. + // TODO(skg) This looks suspicious to me. Make sure we want to do this. + // What about getting called from a split context for devices? + if (qvi_hwloc_obj_type_is_host_resource(split_at_type) || + split_at_type == QV_HW_OBJ_LAST) { + return split_cpuset(result); + } + // An OS device. + else { + return osdev_cpusets(result); + } + } + + qvi_map_fn_t + affinity_preserving_policy(void) const { + switch (split_at_type) { + // For split() + case QV_HW_OBJ_LAST: + return qvi_map_packed; + // For split_at() + default: + return qvi_map_spread; + } + } + /** Releases all devices contained in the provided split aggregate. */ + int + release_devices(void) + { + int rc = QV_SUCCESS; for (auto &hwpool : hwpools) { - qvi_delete(&hwpool); + rc = hwpool->release_devices(); + if (qvi_unlikely(rc != QV_SUCCESS)) return rc; } + return rc; } /** - * Minimally initializes instance. + * Straightforward user-defined device splitting. */ + // TODO(skg) Plenty of opportunity for optimization. + // TODO(skg) Move lots of logic to map int - init( - qvi_rmi_client_t *rmi_a, - qvi_hwpool_s *base_hwpool_a, - uint_t group_size_a, - uint_t split_size_a, - qv_hw_obj_type_t split_at_type_a + split_devices_user_defined(void) + { + // Release devices from the hardware pools because + // they will be redistributed in the next step. + int rc = release_devices(); + if (rc != QV_SUCCESS) return rc; + // Determine mapping of colors to task IDs. The array index i of colors is + // the color requested by task i. Also determine the number of distinct + // colors provided in the colors array. + std::set color_set(colors.begin(), colors.end()); + // Adjust the color set so that the distinct colors provided + // fall within the range of the number of splits requested. + std::set color_setp; + uint_t ncolors_chosen = 0; + for (const auto &c : color_set) { + if (ncolors_chosen >= split_size) break; + color_setp.insert(c); + ncolors_chosen++; + } + // Cache all device infos associated with the parent hardware pool. + auto dinfos = base_hwpool->devices(); + // Iterate over the supported device types and split them up round-robin. + // TODO(skg) Should this be a mapping operation in qvi-map? + for (const auto devt : qvi_hwloc_supported_devices()) { + // Get the number of devices. + const uint_t ndevs = dinfos.count(devt); + // Store device infos. + std::vector devs; + for (const auto &dinfo : dinfos) { + // Not the type we are currently dealing with. + if (devt != dinfo.first) continue; + devs.push_back(dinfo.second.get()); + } + // Maps colors to device information. + id_devinfo_multimap_t devmap; + uint_t devi = 0; + while (devi < ndevs) { + for (const auto &c : color_setp) { + if (devi >= ndevs) break; + devmap.insert(std::make_pair(c, devs[devi++])); + } + } + // Now that we have the mapping of colors to devices, assign devices to + // the associated hardware pools. 
+ for (uint_t i = 0; i < group_size; ++i) { + const int color = colors[i]; + for (const auto &c2d : devmap) { + if (c2d.first != color) continue; + rc = hwpools[i]->add_device(*c2d.second); + if (rc != QV_SUCCESS) break; + } + if (rc != QV_SUCCESS) break; + } + if (rc != QV_SUCCESS) break; + } + return rc; + } + /** + * Affinity preserving device splitting. + */ + int + split_devices_affinity_preserving(void) + { + // Release devices from the hardware pools because + // they will be redistributed in the next step. + int rc = release_devices(); + if (rc != QV_SUCCESS) return rc; + // Get a pointer to device infos associated with the parent hardware pool. + auto dinfos = base_hwpool->devices(); + // Iterate over the supported device types and split them up. + for (const auto devt : qvi_hwloc_supported_devices()) { + // Store device infos. + std::vector devs; + for (const auto &dinfo : dinfos) { + // Not the type we are currently dealing with. + if (devt != dinfo.first) continue; + devs.push_back(dinfo.second.get()); + } + // Store device affinities. + qvi_hwloc_cpusets_t devaffs; + for (auto &dev : devs) { + devaffs.push_back(dev->affinity()); + } + + qvi_map_t map; + const auto policy = affinity_preserving_policy(); + rc = qvi_map_affinity_preserving( + map, policy, devaffs, affinities + ); + if (rc != QV_SUCCESS) return rc; + //qvi_map_debug_dump(map); + // Now that we have the mapping, assign + // devices to the associated hardware pools. + for (const auto &mi : map) { + const uint_t devid = mi.first; + const uint_t pooli = mi.second; + rc = hwpools[pooli]->add_device(*devs[devid]); + if (rc != QV_SUCCESS) break; + } + if (rc != QV_SUCCESS) break; + } + return rc; + } + + static int + apply_cpuset_mapping( + qvi_hwloc_t *hwloc, + const qvi_map_t &map, + const qvi_hwloc_cpusets_t cpusets, + std::vector &hwpools, + std::vector &colors ) { - rmi = rmi_a; - base_hwpool = base_hwpool_a; - group_size = group_size_a; - split_size = split_size_a; - split_at_type = split_at_type_a; - // To save memory we don't eagerly resize our vectors to group_size - // since most processes will not use the storage. For example, in the - // collective case the root ID process will be the only one needing - // group_size elements in our vectors. We'll let the call paths enforce - // appropriate vector sizing. + int rc = QV_SUCCESS; + + const uint_t npools = hwpools.size(); + for (uint_t pid = 0; pid < npools; ++pid) { + rc = hwpools.at(pid)->initialize( + hwloc, qvi_map_cpuset_at(map, cpusets, pid) + ); + if (rc != QV_SUCCESS) break; + } + if (rc != QV_SUCCESS) { + // Invalidate colors + colors.clear(); + } + else { + colors = qvi_map_flatten_to_colors(map); + } + return rc; + } + /** + * User-defined split. + */ + int + split_user_defined(void) + { + // Split the base cpuset into the requested number of pieces. + qvi_hwloc_cpusets_t cpusets; + int rc = split_cpuset(cpusets); + if (rc != QV_SUCCESS) return rc; + // Developer sanity check. + assert(cpusets.size() == split_size); + // Maintains the mapping between task (consumer) IDs and resource IDs. + qvi_map_t map{}; + rc = qvi_map_colors(map, colors, cpusets); + if (rc != QV_SUCCESS) return rc; + qvi_hwloc_t *const hwloc = qvi_rmi_client_hwloc(rmi); + // Update the hardware pools and colors to reflect the new mapping. + rc = apply_cpuset_mapping( + hwloc, map, cpusets, hwpools, colors + ); + if (rc != QV_SUCCESS) return rc; + // Use a straightforward device splitting algorithm based on user's request. 
+ return split_devices_user_defined(); + } + + int + split_affinity_preserving_pass1(void) + { + // cpusets used for first mapping pass. + qvi_hwloc_cpusets_t cpusets{}; + // Get the primary cpusets used for the first pass of mapping. + int rc = primary_cpusets(cpusets); + if (rc != QV_SUCCESS) return rc; + // Maintains the mapping between task (consumer) IDs and resource IDs. + qvi_map_t map{}; + // Map tasks based on their affinity to resources encoded by the cpusets. + const auto policy = affinity_preserving_policy(); + rc = qvi_map_affinity_preserving( + map, policy, affinities, cpusets + ); + if (rc != QV_SUCCESS) return rc; + // Make sure that we mapped all the tasks. If not, this is a bug. + if (qvi_map_nfids_mapped(map) != group_size) { + qvi_abort(); + } + qvi_hwloc_t *const hwloc = qvi_rmi_client_hwloc(rmi); + // Update the hardware pools and colors to reflect the new mapping. + return apply_cpuset_mapping( + hwloc, map, cpusets, hwpools, colors + ); + } + + /** + * Affinity preserving split. + */ + // TODO(skg) This needs more work. + int + split_affinity_preserving(void) + { + int rc = split_affinity_preserving_pass1(); + if (rc != QV_SUCCESS) return rc; + // Finally, split the devices. + return split_devices_affinity_preserving(); + } + /** + * Takes a vector of colors and clamps their values to [0, ndc) + * in place, where ndc is the number of distinct numbers found in values. + */ + static int + clamp_colors( + std::vector &values + ) { + // Recall: sets are ordered. + std::set valset(values.begin(), values.end()); + // Maps the input vector colors to their clamped values. + std::map colors2clamped; + // color': the clamped color. + int colorp = 0; + for (auto val : valset) { + colors2clamped.insert({val, colorp++}); + } + for (uint_t i = 0; i < values.size(); ++i) { + values[i] = colors2clamped[values[i]]; + } return QV_SUCCESS; } + + /** + * Splits aggregate scope data. + */ + int + split(void) + { + int rc = QV_SUCCESS; + bool auto_split = false; + // Make sure that the supplied colors are consistent and determine the type + // of coloring we are using. Positive values denote an explicit coloring + // provided by the caller. Negative values are reserved for internal + // use and shall be constants defined in quo-vadis.h. Note we don't sort the + // splitagg's colors directly because they are ordered by task ID. + std::vector tcolors(colors); + std::sort(tcolors.begin(), tcolors.end()); + // We have a few possibilities here: + // * The values are all positive: user-defined split, but we have to clamp + // their values to a usable range for internal consumption. + // * The values are negative and equal: + // - All the same, valid auto split constant: auto split + // - All the same, undefined constant: user-defined split, but this is a + // strange case since all participants will get empty sets. + // * A mix if positive and negative values: + // - A strict subset is QV_SCOPE_SPLIT_UNDEFINED: user-defined split + // - A strict subset is not QV_SCOPE_SPLIT_UNDEFINED: return error code. + + // All colors are positive. + if (tcolors.front() >= 0) { + rc = clamp_colors(colors); + if (rc != QV_SUCCESS) return rc; + } + // Some values are negative. + else { + // TODO(skg) Implement the rest. + if (tcolors.front() != tcolors.back()) { + return QV_ERR_INVLD_ARG; + } + auto_split = true; + } + // User-defined splitting. + if (!auto_split) { + return split_user_defined(); + } + // Automatic splitting. 
+ switch (colors[0]) { + case QV_SCOPE_SPLIT_AFFINITY_PRESERVING: + return split_affinity_preserving(); + default: + rc = QV_ERR_INVLD_ARG; + break; + } + return rc; + } }; /** @@ -560,40 +936,32 @@ struct qvi_scope_split_coll_s { * Note: We use 0 as the root because 0 will always exist. */ static constexpr int rootid = 0; - /** - * Points to the parent scope that we are splitting. - */ - qv_scope_t *parent_scope = nullptr; - /** - * My color. - */ + /** Points to the parent scope that we are splitting. */ + qv_scope_t *parent = nullptr; + /** My color. */ int mycolor = 0; /** * Stores group-global split information brought together by collective * operations across the members in parent_scope. */ - qvi_scope_split_agg_s gsplit{}; - /** - * Initializes instance. - */ - int - init( - qv_scope_t *parent_scope_a, + qvi_scope_split_agg_s gsplit; + /** Constructor. */ + qvi_scope_split_coll_s(void) = delete; + /** Constructor. */ + qvi_scope_split_coll_s( + qv_scope_t *parent_a, uint_t split_size_a, int mycolor_a, qv_hw_obj_type_t split_at_type_a - ) { - const int myid = parent_scope_a->group->rank(); - parent_scope = parent_scope_a; - mycolor = mycolor_a; - if (myid == qvi_scope_split_coll_s::rootid) { - const uint_t group_size = parent_scope->group->size(); - gsplit.init( - parent_scope->group->task()->rmi(), parent_scope->hwpool, - group_size, split_size_a, split_at_type_a + ) : parent(parent_a) + , mycolor(mycolor_a) + { + const qvi_group_t *const pgroup = parent->group; + if (pgroup->rank() == qvi_scope_split_coll_s::rootid) { + gsplit = qvi_scope_split_agg_s( + parent, pgroup->size(), split_size_a, split_at_type_a ); } - return QV_SUCCESS; } }; @@ -601,7 +969,7 @@ static int scope_split_coll_gather( qvi_scope_split_coll_s &splitcoll ) { - qv_scope_t *const parent = splitcoll.parent_scope; + qv_scope_t *const parent = splitcoll.parent; int rc = gather_values( parent->group, @@ -633,10 +1001,7 @@ scope_split_coll_gather( splitcoll.gsplit.affinities.resize(group_size); for (uint_t tid = 0; tid < group_size; ++tid) { hwloc_cpuset_t cpuset = nullptr; - rc = qvi_rmi_cpubind( - parent->group->task()->rmi(), - splitcoll.gsplit.taskids[tid], &cpuset - ); + rc = parent->group->task()->bind_top(&cpuset); if (rc != QV_SUCCESS) break; rc = splitcoll.gsplit.affinities[tid].set(cpuset); @@ -655,7 +1020,7 @@ scope_split_coll_scatter( qvi_hwpool_s **result ) { const int rc = scatter_values( - splitcoll.parent_scope->group, + splitcoll.parent->group, qvi_scope_split_coll_s::rootid, splitcoll.gsplit.colors, colorp @@ -663,430 +1028,13 @@ scope_split_coll_scatter( if (qvi_unlikely(rc != QV_SUCCESS)) return rc; return scatter_hwpools( - splitcoll.parent_scope->group, + splitcoll.parent->group, qvi_scope_split_coll_s::rootid, splitcoll.gsplit.hwpools, result ); } -/** - * Returns a copy of the aggregate cpuset. Note that the cpuset will be shared - * among the aggregate, but other resources may be distributed differently. - * For example, some hardware pools may have GPUs, while others may not. - */ -static int -qvi_scope_split_agg_cpuset_dup( - const qvi_scope_split_agg_s &splitagg, - qvi_hwloc_bitmap_s &result -) { - // This shouldn't happen. 
- assert(splitagg.hwpools.size() != 0); - - result = splitagg.hwpools[0]->cpuset(); - return QV_SUCCESS; -} - -static int -apply_cpuset_mapping( - qvi_hwloc_t *hwloc, - const qvi_map_t &map, - const qvi_hwloc_cpusets_t cpusets, - std::vector &hwpools, - std::vector &colors -) { - int rc = QV_SUCCESS; - - const uint_t npools = hwpools.size(); - for (uint_t pid = 0; pid < npools; ++pid) { - rc = hwpools.at(pid)->initialize( - hwloc, qvi_map_cpuset_at(map, cpusets, pid) - ); - if (rc != QV_SUCCESS) break; - } - if (rc != QV_SUCCESS) { - // Invalidate colors - colors.clear(); - } - else { - colors = qvi_map_flatten_to_colors(map); - } - return rc; -} - -static qvi_map_fn_t -split_agg_get_affinity_preserving_policy( - const qvi_scope_split_agg_s &splitagg -) { - switch (splitagg.split_at_type) { - // For split() - case QV_HW_OBJ_LAST: - return qvi_map_packed; - // For split_at() - default: - return qvi_map_spread; - } -} - -/** - * Releases all devices contained in the provided split aggregate. - */ -static int -agg_split_release_devices( - qvi_scope_split_agg_s &splitagg -) { - int rc = QV_SUCCESS; - for (auto &hwpool : splitagg.hwpools) { - rc = hwpool->release_devices(); - if (rc != QV_SUCCESS) return rc; - } - return rc; -} - -/** - * Straightforward user-defined device splitting. - */ -// TODO(skg) Plenty of opportunity for optimization. -// TODO(skg) Move lots of logic to map -static int -agg_split_devices_user_defined( - qvi_scope_split_agg_s &splitagg -) { - const uint_t group_size = splitagg.group_size; - // Release devices from the hardware pools because - // they will be redistributed in the next step. - int rc = agg_split_release_devices(splitagg); - if (rc != QV_SUCCESS) return rc; - // Determine mapping of colors to task IDs. The array index i of colors is - // the color requested by task i. Also determine the number of distinct - // colors provided in the colors array. - std::set color_set(splitagg.colors.begin(), splitagg.colors.end()); - // Adjust the color set so that the distinct colors provided - // fall within the range of the number of splits requested. - std::set color_setp; - uint_t ncolors_chosen = 0; - for (const auto &c : color_set) { - if (ncolors_chosen >= splitagg.split_size) break; - color_setp.insert(c); - ncolors_chosen++; - } - // Cache all device infos associated with the parent hardware pool. - auto dinfos = splitagg.base_hwpool->devices(); - // Iterate over the supported device types and split them up round-robin. - // TODO(skg) Should this be a mapping operation in qvi-map? - for (const auto devt : qvi_hwloc_supported_devices()) { - // Get the number of devices. - const uint_t ndevs = dinfos.count(devt); - // Store device infos. - std::vector devs; - for (const auto &dinfo : dinfos) { - // Not the type we are currently dealing with. - if (devt != dinfo.first) continue; - devs.push_back(dinfo.second.get()); - } - // Maps colors to device information. - id_devinfo_multimap_t devmap; - uint_t devi = 0; - while (devi < ndevs) { - for (const auto &c : color_setp) { - if (devi >= ndevs) break; - devmap.insert(std::make_pair(c, devs[devi++])); - } - } - // Now that we have the mapping of colors to devices, assign devices to - // the associated hardware pools. 
- for (uint_t i = 0; i < group_size; ++i) { - const int color = splitagg.colors[i]; - for (const auto &c2d : devmap) { - if (c2d.first != color) continue; - rc = splitagg.hwpools[i]->add_device(*c2d.second); - if (rc != QV_SUCCESS) break; - } - if (rc != QV_SUCCESS) break; - } - if (rc != QV_SUCCESS) break; - } - return rc; -} - -/** - * Affinity preserving device splitting. - */ -static int -agg_split_devices_affinity_preserving( - qvi_scope_split_agg_s &splitagg -) { - // Release devices from the hardware pools because - // they will be redistributed in the next step. - int rc = agg_split_release_devices(splitagg); - if (rc != QV_SUCCESS) return rc; - // Get a pointer to device infos associated with the parent hardware pool. - auto dinfos = splitagg.base_hwpool->devices(); - // Iterate over the supported device types and split them up. - for (const auto devt : qvi_hwloc_supported_devices()) { - // Store device infos. - std::vector devs; - for (const auto &dinfo : dinfos) { - // Not the type we are currently dealing with. - if (devt != dinfo.first) continue; - devs.push_back(dinfo.second.get()); - } - // Store device affinities. - qvi_hwloc_cpusets_t devaffs; - for (auto &dev : devs) { - devaffs.push_back(dev->affinity()); - } - - qvi_map_t map; - const auto policy = split_agg_get_affinity_preserving_policy(splitagg); - rc = qvi_map_affinity_preserving( - map, policy, devaffs, splitagg.affinities - ); - if (rc != QV_SUCCESS) return rc; -#if 0 - qvi_map_debug_dump(map); -#endif - // Now that we have the mapping, assign - // devices to the associated hardware pools. - for (const auto &mi : map) { - const uint_t devid = mi.first; - const uint_t pooli = mi.second; - rc = splitagg.hwpools[pooli]->add_device(*devs[devid]); - if (rc != QV_SUCCESS) break; - } - if (rc != QV_SUCCESS) break; - } - return rc; -} - -/** - * Performs a straightforward splitting of the provided cpuset: split the - * provided base cpuset into splitagg.split_size distinct pieces. - */ -static int -agg_split_cpuset( - const qvi_scope_split_agg_s &splitagg, - qvi_hwloc_cpusets_t &result -) { - int rc = QV_SUCCESS; - // The cpuset that we are going to split. - qvi_hwloc_bitmap_s base_cpuset; - rc = qvi_scope_split_agg_cpuset_dup(splitagg, base_cpuset); - if (rc != QV_SUCCESS) return rc; - // Pointer to my hwloc instance. - qvi_hwloc_t *const hwloc = qvi_rmi_client_hwloc(splitagg.rmi); - // Holds the host's split cpusets. - result.resize(splitagg.split_size); - // Notice that we do not go through the RMI for this because this is just an - // local, temporary splitting that is ultimately fed to another splitting - // algorithm. - for (uint_t chunkid = 0; chunkid < splitagg.split_size; ++chunkid) { - rc = qvi_hwloc_split_cpuset_by_chunk_id( - hwloc, base_cpuset.cdata(), splitagg.split_size, - chunkid, result[chunkid].data() - ); - if (rc != QV_SUCCESS) break; - } - return rc; -} - -/** - * User-defined split. - */ -static int -agg_split_user_defined( - qvi_scope_split_agg_s &splitagg -) { - const uint_t split_size = splitagg.split_size; - // Split the base cpuset into the requested number of pieces. - qvi_hwloc_cpusets_t cpusets; - int rc = agg_split_cpuset(splitagg, cpusets); - if (rc != QV_SUCCESS) return rc; - // Developer sanity check. - if (cpusets.size() != split_size) { - qvi_abort(); - } - // Maintains the mapping between task (consumer) IDs and resource IDs. 
- qvi_map_t map{}; - rc = qvi_map_colors(map, splitagg.colors, cpusets); - if (rc != QV_SUCCESS) return rc; - qvi_hwloc_t *const hwloc = qvi_rmi_client_hwloc(splitagg.rmi); - // Update the hardware pools and colors to reflect the new mapping. - rc = apply_cpuset_mapping( - hwloc, map, cpusets, splitagg.hwpools, splitagg.colors - ); - if (rc != QV_SUCCESS) return rc; - // Use a straightforward device splitting algorithm based on user's request. - return agg_split_devices_user_defined(splitagg); -} - -static int -agg_split_get_new_osdev_cpusets( - const qvi_scope_split_agg_s &splitagg, - qvi_hwloc_cpusets_t &result -) { - // The target object type. - const qv_hw_obj_type_t obj_type = splitagg.split_at_type; - // Get the number of devices we have available in the provided scope. - int nobj = 0; - int rc = splitagg.base_hwpool->nobjects( - qvi_rmi_client_hwloc(splitagg.rmi), obj_type, &nobj - ); - if (rc != QV_SUCCESS) return rc; - // Holds the device affinities used for the split. - result.resize(nobj); - // Get a pointer to device infos associated with the base hardware pool. - auto dinfos = splitagg.base_hwpool->devices(); - uint_t affi = 0; - for (const auto &dinfo : dinfos) { - // Not the type we are looking to split. - if (obj_type != dinfo.first) continue; - // Copy the device's affinity to our list of device affinities. - result[affi++] = dinfo.second->affinity(); - } - return rc; -} - -static int -agg_split_get_primary_cpusets( - const qvi_scope_split_agg_s &splitagg, - qvi_hwloc_cpusets_t &result -) { - const qv_hw_obj_type_t obj_type = splitagg.split_at_type; - // We were provided a real host resource type that we have to split. Or - // QV_HW_OBJ_LAST is instead provided to indicate that we were called from a - // split() context, which uses the host's cpuset to split the resources. - if (qvi_hwloc_obj_type_is_host_resource(obj_type) || - obj_type == QV_HW_OBJ_LAST) { - return agg_split_cpuset(splitagg, result); - } - // An OS device. - else { - return agg_split_get_new_osdev_cpusets(splitagg, result); - } -} - -static int -agg_split_affinity_preserving_pass1( - qvi_scope_split_agg_s &splitagg -) { - const uint_t group_size = splitagg.group_size; - // cpusets used for first mapping pass. - qvi_hwloc_cpusets_t cpusets{}; - // Get the primary cpusets used for the first pass of mapping. - int rc = agg_split_get_primary_cpusets(splitagg, cpusets); - if (rc != QV_SUCCESS) return rc; - // Maintains the mapping between task (consumer) IDs and resource IDs. - qvi_map_t map{}; - // Map tasks based on their affinity to resources encoded by the cpusets. - const auto policy = split_agg_get_affinity_preserving_policy(splitagg); - rc = qvi_map_affinity_preserving( - map, policy, splitagg.affinities, cpusets - ); - if (rc != QV_SUCCESS) return rc; - // Make sure that we mapped all the tasks. If not, this is a bug. - if (qvi_map_nfids_mapped(map) != group_size) { - qvi_abort(); - } - qvi_hwloc_t *const hwloc = qvi_rmi_client_hwloc(splitagg.rmi); - // Update the hardware pools and colors to reflect the new mapping. - return apply_cpuset_mapping( - hwloc, map, cpusets, splitagg.hwpools, splitagg.colors - ); -} - -/** - * Affinity preserving split. - */ -// TODO(skg) This needs more work. -static int -agg_split_affinity_preserving( - qvi_scope_split_agg_s &splitagg -) { - int rc = agg_split_affinity_preserving_pass1(splitagg); - if (rc != QV_SUCCESS) return rc; - // Finally, split the devices. 
- return agg_split_devices_affinity_preserving(splitagg); -} - -/** - * Takes a vector of colors and clamps their values to [0, ndc) - * in place, where ndc is the number of distinct numbers found in values. - */ -static int -clamp_colors( - std::vector &values -) { - // Recall: sets are ordered. - std::set valset(values.begin(), values.end()); - // Maps the input vector colors to their clamped values. - std::map colors2clamped; - // color': the clamped color. - int colorp = 0; - for (auto val : valset) { - colors2clamped.insert({val, colorp++}); - } - for (uint_t i = 0; i < values.size(); ++i) { - values[i] = colors2clamped[values[i]]; - } - return QV_SUCCESS; -} - -/** - * Splits aggregate scope data. - */ -static int -agg_split( - qvi_scope_split_agg_s &splitagg -) { - int rc = QV_SUCCESS; - bool auto_split = false; - // Make sure that the supplied colors are consistent and determine the type - // of coloring we are using. Positive values denote an explicit coloring - // provided by the caller. Negative values are reserved for internal - // use and shall be constants defined in quo-vadis.h. Note we don't sort the - // splitagg's colors directly because they are ordered by task ID. - std::vector tcolors(splitagg.colors); - std::sort(tcolors.begin(), tcolors.end()); - // We have a few possibilities here: - // * The values are all positive: user-defined split, but we have to clamp - // their values to a usable range for internal consumption. - // * The values are negative and equal: - // - All the same, valid auto split constant: auto split - // - All the same, undefined constant: user-defined split, but this is a - // strange case since all participants will get empty sets. - // * A mix if positive and negative values: - // - A strict subset is QV_SCOPE_SPLIT_UNDEFINED: user-defined split - // - A strict subset is not QV_SCOPE_SPLIT_UNDEFINED: return error code. - - // All colors are positive. - if (tcolors.front() >= 0) { - rc = clamp_colors(splitagg.colors); - if (rc != QV_SUCCESS) return rc; - } - // Some values are negative. - else { - // TODO(skg) Implement the rest. - if (tcolors.front() != tcolors.back()) { - return QV_ERR_INVLD_ARG; - } - auto_split = true; - } - // User-defined splitting. - if (!auto_split) { - return agg_split_user_defined(splitagg); - } - // Automatic splitting. - switch (splitagg.colors[0]) { - case QV_SCOPE_SPLIT_AFFINITY_PRESERVING: - return agg_split_affinity_preserving(splitagg); - default: - rc = QV_ERR_INVLD_ARG; - break; - } - return rc; -} - /** * Split the hardware resources based on the provided split parameters: * - npieces: The number of splits requested. @@ -1107,31 +1055,31 @@ coll_split_hardware_resources( int *colorp, qvi_hwpool_s **result ) { - int rc = QV_SUCCESS, rc2 = QV_SUCCESS; + int rc2 = QV_SUCCESS; const int rootid = qvi_scope_split_coll_s::rootid, myid = parent->group->rank(); // Information relevant to hardware resource splitting. Note that // aggregated data are only valid for the task whose id is equal to // qvi_global_split_t::rootid after gather has completed. - qvi_scope_split_coll_s splitcoll{}; - rc = splitcoll.init(parent, npieces, color, maybe_obj_type); - if (rc != QV_SUCCESS) goto out; + qvi_scope_split_coll_s splitcoll( + parent, npieces, color, maybe_obj_type + ); // First consolidate the provided information, as this is coming from a // SPMD-like context (e.g., splitting a resource shared by MPI processes). 
// In most cases it is easiest to have a single task calculate the split // based on global knowledge and later redistribute the calculated result to // its group members. - rc = scope_split_coll_gather(splitcoll); + int rc = scope_split_coll_gather(splitcoll); if (rc != QV_SUCCESS) goto out; // The root does this calculation. if (myid == rootid) { - rc2 = agg_split(splitcoll.gsplit); + rc2 = splitcoll.gsplit.split(); } // Wait for the split information. Explicitly barrier here in case the // underlying broadcast implementation polls heavily for completion. - rc = splitcoll.parent_scope->group->barrier(); + rc = splitcoll.parent->group->barrier(); if (rc != QV_SUCCESS) goto out; // To avoid hangs in split error paths, share the split rc with everyone. - rc = bcast_value(splitcoll.parent_scope->group, rootid, &rc2); + rc = bcast_value(splitcoll.parent->group, rootid, &rc2); if (rc != QV_SUCCESS) goto out; // If the split failed, return the error to all callers. if (rc2 != QV_SUCCESS) { @@ -1190,29 +1138,23 @@ qvi_scope_thsplit( ) { *thchildren = nullptr; + qvi_group_t *const pgroup = parent->group; const uint_t group_size = k; - qvi_scope_split_agg_s splitagg{}; - int rc = splitagg.init( - parent->group->task()->rmi(), parent->hwpool, - group_size, npieces, maybe_obj_type + + qvi_scope_split_agg_s splitagg( + parent, group_size, npieces, maybe_obj_type ); - if (rc != QV_SUCCESS) return rc; - // Since this is called by a single task, get its ID and associated hardware - // affinity here, and replicate them in the following loop that populates - // splitagg. + // Eagerly make room for the group member information. + splitagg.reserve(); + // Since this is called by a single task, get its ID and associated + // hardware affinity here, and replicate them in the following loop + // that populates splitagg. //No point in doing this in a loop. const pid_t taskid = qvi_task_t::mytid(); hwloc_cpuset_t task_affinity = nullptr; - rc = qvi_rmi_cpubind( - parent->group->task()->rmi(), - taskid, &task_affinity - ); + // Get the task's current affinity. + int rc = pgroup->task()->bind_top(&task_affinity); if (rc != QV_SUCCESS) return rc; - // Now populate the relevant data before attempting a split. - splitagg.colors.resize(group_size); - splitagg.hwpools.resize(group_size); - splitagg.taskids.resize(group_size); - splitagg.affinities.resize(group_size); for (uint_t i = 0; i < group_size; ++i) { // Store requested colors in aggregate. splitagg.colors[i] = kcolors[i]; @@ -1225,18 +1167,17 @@ qvi_scope_thsplit( // Same goes for the task's affinity. splitagg.affinities[i].set(task_affinity); } - if (rc != QV_SUCCESS) return rc; // Cleanup: we don't need task_affinity anymore. qvi_hwloc_bitmap_delete(&task_affinity); if (rc != QV_SUCCESS) return rc; // Split the hardware resources based on the provided split parameters. - rc = agg_split(splitagg); + rc = splitagg.split(); if (rc != QV_SUCCESS) return rc; // Split off from our parent group. This call is called from a context in // which a process is splitting its resources across threads, so create a // new thread group for each child. qvi_group_t *thgroup = nullptr; - rc = parent->group->thsplit(group_size, &thgroup); + rc = pgroup->thsplit(group_size, &thgroup); if (rc != QV_SUCCESS) return rc; // Now create and populate the children. qv_scope_t **ithchildren = new qv_scope_t *[group_size]; @@ -1244,17 +1185,11 @@ qvi_scope_thsplit( // Copy out, since the hardware pools in splitagg will get freed. 
qvi_hwpool_s *hwpool = nullptr; rc = qvi_dup(*splitagg.hwpools[i], &hwpool); - if (rc != QV_SUCCESS) { - qvi_delete(&thgroup); - break; - } + if (rc != QV_SUCCESS) break; // Create and initialize the new scope. qv_scope_t *child = nullptr; rc = scope_new(thgroup, hwpool, &child); - if (rc != QV_SUCCESS) { - qvi_delete(&thgroup); - break; - } + if (rc != QV_SUCCESS) break; thgroup->retain(); ithchildren[i] = child; }
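
Note (illustrative, not part of the patch): after this rework, the splitting state and behavior that used to live in free functions (agg_split_cpuset, agg_split, and friends) are member functions of qvi_scope_split_agg_s, so the collective path (coll_split_hardware_resources) and the single-task thread-split path (qvi_scope_thsplit) drive one shared implementation. The sketch below mirrors the call sequence qvi_scope_thsplit() uses in the patch; it is a minimal sketch under stated assumptions, not part of the change. The function name example_thsplit_driver is hypothetical, error handling is abbreviated, the per-member taskid/hwpool bookkeeping paraphrases the existing loop, and the project-internal declarations used by qvi-scope.cc are assumed to be in scope.

    // Illustrative sketch only. Mirrors the post-rework call sequence in
    // qvi_scope_thsplit(); names marked hypothetical are not in the patch.
    static int
    example_thsplit_driver(          // hypothetical driver name
        qv_scope_t *parent,
        uint_t npieces,
        uint_t k,
        const int *kcolors,
        qv_hw_obj_type_t maybe_obj_type
    ) {
        // The aggregate is now built directly from the parent scope instead
        // of being default-constructed and then init()'d.
        qvi_scope_split_agg_s splitagg(parent, k, npieces, maybe_obj_type);
        // Vector sizing is explicit and opt-in via reserve().
        splitagg.reserve();
        // A single caller replicates its ID, color, pool, and affinity for
        // every member it is splitting on behalf of.
        const pid_t taskid = qvi_task_t::mytid();
        hwloc_cpuset_t task_affinity = nullptr;
        int rc = parent->group->task()->bind_top(&task_affinity);
        if (rc != QV_SUCCESS) return rc;
        for (uint_t i = 0; i < k; ++i) {
            splitagg.colors[i] = kcolors[i];
            splitagg.taskids[i] = taskid;
            rc = qvi_dup(*parent->hwpool, &splitagg.hwpools[i]);
            if (rc != QV_SUCCESS) break;
            splitagg.affinities[i].set(task_affinity);
        }
        qvi_hwloc_bitmap_delete(&task_affinity);
        if (rc != QV_SUCCESS) return rc;
        // Color clamping, cpuset mapping, and device distribution all happen
        // behind this single member call.
        return splitagg.split();
    }

The same split() entry point is what the root rank invokes on gsplit in coll_split_hardware_resources(), which is why the color-clamping and device-mapping logic no longer needs separate static helpers.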