From 8ab5b0e3160ce6935717df5a2e4e50b19ed779e0 Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Tue, 20 Oct 2020 14:48:00 -0700 Subject: [PATCH 01/36] #875: phase: start with a very simple PhaseManager --- src/CMakeLists.txt | 1 + src/vt/phase/phase_manager.h | 83 ++++++++++++++++++++++++++ src/vt/runtime/runtime.cc | 13 +++- src/vt/runtime/runtime.h | 1 + src/vt/runtime/runtime_component_fwd.h | 3 + src/vt/runtime/runtime_get.cc | 2 + 6 files changed, 101 insertions(+), 2 deletions(-) create mode 100644 src/vt/phase/phase_manager.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1cfd0be3d5..3c294a8246 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -49,6 +49,7 @@ set( termination/interval termination/graph messaging/envelope messaging/message + phase pool/static_sized pool/header rdma/channel rdma/collection rdma/group rdma/state rdmahandle diff --git a/src/vt/phase/phase_manager.h b/src/vt/phase/phase_manager.h new file mode 100644 index 0000000000..7263c730af --- /dev/null +++ b/src/vt/phase/phase_manager.h @@ -0,0 +1,83 @@ +/* +//@HEADER +// ***************************************************************************** +// +// phase_manager.h +// DARMA Toolkit v. 1.0.0 +// DARMA/vt => Virtual Transport +// +// Copyright 2019 National Technology & Engineering Solutions of Sandia, LLC +// (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. +// Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from this +// software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact darma@sandia.gov +// +// ***************************************************************************** +//@HEADER +*/ + +#if !defined INCLUDED_VT_PHASE_PHASE_MANAGER_H +#define INCLUDED_VT_PHASE_PHASE_MANAGER_H + +#include "vt/configs/types/types_type.h" +#include "vt/runtime/component/component_pack.h" + +namespace vt { namespace phase { + +/** + * \struct PhaseManager + * + * \brief General management of phases in applications + */ +struct PhaseManager : runtime::component::Component { + + PhaseManager() = default; + + std::string name() override { return "PhaseManager"; } + + /** + * \brief Get the current phase + * + * \return the current phase + */ + PhaseType getCurrentPhase() const { return cur_phase_; } + +private: + PhaseType cur_phase_ = 0; /**< Current phase on this node */ +}; + +}} /* end namespace vt::phase */ + +namespace vt { + +extern phase::PhaseManager* thePhase(); + +} /* end namespace vt */ + +#endif /*INCLUDED_VT_PHASE_PHASE_MANAGER_H*/ diff --git a/src/vt/runtime/runtime.cc b/src/vt/runtime/runtime.cc index ce45aa81b3..4d451d901c 100644 --- a/src/vt/runtime/runtime.cc +++ b/src/vt/runtime/runtime.cc @@ -830,7 +830,8 @@ void Runtime::initializeComponents() { p_->registerComponent( &theNodeStats, Deps< - ctx::Context // Everything depends on theContext + ctx::Context, // Everything depends on theContext + phase::PhaseManager // For phase structure >{} ); @@ -845,7 +846,8 @@ void Runtime::initializeComponents() { &theLBManager, Deps< ctx::Context, // Everything depends on theContext util::memory::MemoryUsage, // Output mem usage on phase change - vrt::collection::balance::NodeStats // For stat collection + vrt::collection::balance::NodeStats, // For stat collection + phase::PhaseManager // For phase structure >{} ); @@ -855,6 +857,12 @@ void Runtime::initializeComponents() { >{} ); + p_->registerComponent( + &thePhase, Deps< + ctx::Context // Everything depends on theContext + >{} + ); + p_->add(); p_->add(); p_->add(); @@ -888,6 +896,7 @@ void Runtime::initializeComponents() { p_->add(); p_->add(); p_->add(); + p_->add(); if (addStatsRestartReader) { p_->add(); diff --git a/src/vt/runtime/runtime.h b/src/vt/runtime/runtime.h index 8c7269da64..585b5792d7 100644 --- a/src/vt/runtime/runtime.h +++ b/src/vt/runtime/runtime.h @@ -405,6 +405,7 @@ struct Runtime { ComponentPtrType theStatsReader = nullptr; ComponentPtrType theLBManager = nullptr; ComponentPtrType theTimeTrigger = nullptr; + ComponentPtrType thePhase = nullptr; // Node-level worker-based components for vt (these are optional) ComponentPtrType theWorkerGrp = nullptr; diff --git a/src/vt/runtime/runtime_component_fwd.h b/src/vt/runtime/runtime_component_fwd.h index 6976f29440..56441acc13 100644 --- a/src/vt/runtime/runtime_component_fwd.h +++ b/src/vt/runtime/runtime_component_fwd.h @@ -111,6 +111,9 @@ struct MemoryUsage; namespace timetrigger { struct TimeTriggerManager; } +namespace phase { +struct PhaseManager; +} #if vt_check_enabled(trace_enabled) namespace trace { diff --git a/src/vt/runtime/runtime_get.cc b/src/vt/runtime/runtime_get.cc index d49f798854..045634c49f 100644 --- a/src/vt/runtime/runtime_get.cc +++ b/src/vt/runtime/runtime_get.cc @@ -68,6 +68,7 @@ #include "vt/pipe/pipe_headers.h" #include "vt/objgroup/headers.h" #include "vt/timetrigger/time_trigger_manager.h" +#include "vt/phase/phase_manager.h" #include @@ -134,6 +135,7 @@ vrt::collection::balance::StatsRestartReader* theStatsReader() { return CUR_RT-> vrt::collection::balance::LBManager* theLBManager() { return CUR_RT->theLBManager; } timetrigger::TimeTriggerManager* theTimeTrigger() { return CUR_RT->theTimeTrigger; } vt::arguments::AppConfig* theConfig() { return &CUR_RT->theArgConfig->config_; } +vt::phase::PhaseManager* thePhase() { return CUR_RT->thePhase; } #if vt_check_enabled(trace_enabled) trace::Trace* theTrace() { return CUR_RT->theTrace; } From be10a085cc042a6682ca07791027d715878fd772 Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Tue, 20 Oct 2020 14:56:59 -0700 Subject: [PATCH 02/36] #875: phase: add cc file, promote to objgroup --- src/vt/phase/phase_manager.cc | 57 +++++++++++++++++++++++++++++++++++ src/vt/phase/phase_manager.h | 11 ++++++- src/vt/runtime/runtime.cc | 1 + 3 files changed, 68 insertions(+), 1 deletion(-) create mode 100644 src/vt/phase/phase_manager.cc diff --git a/src/vt/phase/phase_manager.cc b/src/vt/phase/phase_manager.cc new file mode 100644 index 0000000000..f7a6247a37 --- /dev/null +++ b/src/vt/phase/phase_manager.cc @@ -0,0 +1,57 @@ +/* +//@HEADER +// ***************************************************************************** +// +// phase_manager.cc +// DARMA Toolkit v. 1.0.0 +// DARMA/vt => Virtual Transport +// +// Copyright 2019 National Technology & Engineering Solutions of Sandia, LLC +// (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. +// Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from this +// software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact darma@sandia.gov +// +// ***************************************************************************** +//@HEADER +*/ + +#include "vt/phase/phase_manager.h" +#include "vt/objgroup/headers.h" + +namespace vt { namespace phase { + +/*static*/ std::unique_ptr PhaseManager::construct() { + auto ptr = std::make_unique(); + auto proxy = theObjGroup()->makeCollective(ptr.get()); + proxy.get()->proxy_ = proxy.getProxy();; + return ptr; +} + +}} /* end namespace vt::phase */ diff --git a/src/vt/phase/phase_manager.h b/src/vt/phase/phase_manager.h index 7263c730af..36809add97 100644 --- a/src/vt/phase/phase_manager.h +++ b/src/vt/phase/phase_manager.h @@ -61,6 +61,14 @@ struct PhaseManager : runtime::component::Component { std::string name() override { return "PhaseManager"; } + /** + * \internal + * \brief Construct a new \c PhaseManager as an objgroup + * + * \return unique pointer to the new manager + */ + static std::unique_ptr construct(); + /** * \brief Get the current phase * @@ -69,7 +77,8 @@ struct PhaseManager : runtime::component::Component { PhaseType getCurrentPhase() const { return cur_phase_; } private: - PhaseType cur_phase_ = 0; /**< Current phase on this node */ + PhaseType cur_phase_ = 0; /**< Current phase */ + ObjGroupProxyType proxy_ = no_obj_group; /**< Objgroup proxy */ }; }} /* end namespace vt::phase */ diff --git a/src/vt/runtime/runtime.cc b/src/vt/runtime/runtime.cc index 4d451d901c..25a542134e 100644 --- a/src/vt/runtime/runtime.cc +++ b/src/vt/runtime/runtime.cc @@ -71,6 +71,7 @@ #include "vt/utils/mpi_limits/mpi_max_tag.h" #include "vt/vrt/collection/balance/stats_restart_reader.h" #include "vt/timetrigger/time_trigger_manager.h" +#include "vt/phase/phase_manager.h" #include "vt/configs/arguments/app_config.h" #include "vt/configs/arguments/args.h" From b7ed578e7a5622ad01a915c01f81edffa9b2b30a Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Tue, 20 Oct 2020 16:14:54 -0700 Subject: [PATCH 03/36] #875: phase: implement hooks, (de-)registration --- src/vt/phase/phase_hook_enum.h | 64 ++++++++++++++++++ src/vt/phase/phase_manager.cc | 18 +++++ src/vt/phase/phase_manager.h | 28 +++++++- src/vt/phase/registered_phase_hook.h | 99 ++++++++++++++++++++++++++++ 4 files changed, 207 insertions(+), 2 deletions(-) create mode 100644 src/vt/phase/phase_hook_enum.h create mode 100644 src/vt/phase/registered_phase_hook.h diff --git a/src/vt/phase/phase_hook_enum.h b/src/vt/phase/phase_hook_enum.h new file mode 100644 index 0000000000..d94c3f2bfb --- /dev/null +++ b/src/vt/phase/phase_hook_enum.h @@ -0,0 +1,64 @@ +/* +//@HEADER +// ***************************************************************************** +// +// phase_hook_enum.h +// DARMA Toolkit v. 1.0.0 +// DARMA/vt => Virtual Transport +// +// Copyright 2019 National Technology & Engineering Solutions of Sandia, LLC +// (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. +// Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from this +// software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact darma@sandia.gov +// +// ***************************************************************************** +//@HEADER +*/ + +#if !defined INCLUDED_VT_PHASE_PHASE_HOOK_ENUM_H +#define INCLUDED_VT_PHASE_PHASE_HOOK_ENUM_H + +namespace vt { namespace phase { + +/** + * \enum PhaseHook + * + * \brief Different times in phase execution one can hook triggered actions into + * the \c PhaseManager + */ +enum struct PhaseHook : int8_t { + Start, /**< Before a phase starts */ + End, /**< After a phase ends */ + EndPostMigration /**< After a phase ends after all migrations */ +}; + +}} /* end namespace vt::phase */ + +#endif /*INCLUDED_VT_PHASE_PHASE_HOOK_ENUM_H*/ diff --git a/src/vt/phase/phase_manager.cc b/src/vt/phase/phase_manager.cc index f7a6247a37..d1567c0efd 100644 --- a/src/vt/phase/phase_manager.cc +++ b/src/vt/phase/phase_manager.cc @@ -54,4 +54,22 @@ namespace vt { namespace phase { return ptr; } +PhaseHookID PhaseManager::registerHook(PhaseHook type, ActionType trigger) { + auto const type_bits = static_cast(type); + auto const hook_id = next_hook_id_++; + hooks_[type_bits][hook_id] = trigger; + return PhaseHookID{type, hook_id}; +} + +void PhaseManager::unregisterHook(PhaseHookID hook) { + auto const type = static_cast(hook.getType()); + auto const id = hook.getID(); + auto iter = hooks_[type].find(id); + if (iter != hooks_[type].end()) { + hooks_[type].erase(iter); + } else { + vtAssert(false, "Could not find registered hook ID to erase"); + } +} + }} /* end namespace vt::phase */ diff --git a/src/vt/phase/phase_manager.h b/src/vt/phase/phase_manager.h index 36809add97..db27a9711f 100644 --- a/src/vt/phase/phase_manager.h +++ b/src/vt/phase/phase_manager.h @@ -47,6 +47,8 @@ #include "vt/configs/types/types_type.h" #include "vt/runtime/component/component_pack.h" +#include "vt/phase/phase_hook_enum.h" +#include "vt/phase/registered_phase_hook.h" namespace vt { namespace phase { @@ -56,6 +58,9 @@ namespace vt { namespace phase { * \brief General management of phases in applications */ struct PhaseManager : runtime::component::Component { + using HookIDType = typename std::underlying_type::type; + using HookMapType = std::unordered_map; + using HookIDMapType = std::unordered_map; PhaseManager() = default; @@ -76,9 +81,28 @@ struct PhaseManager : runtime::component::Component { */ PhaseType getCurrentPhase() const { return cur_phase_; } + /** + * \brief Register a phase hook that triggers depending on the type of hook + * + * \param[in] type the type of trigger to register + * \param[in] trigger the action to trigger + * + * \return registered ID that can be used to unregister the hook + */ + PhaseHookID registerHook(PhaseHook type, ActionType trigger); + + /** + * \brief Unregister an existing hook + * + * \param[in] hook the id of the hook to unregister + */ + void unregisterHook(PhaseHookID hook); + private: - PhaseType cur_phase_ = 0; /**< Current phase */ - ObjGroupProxyType proxy_ = no_obj_group; /**< Objgroup proxy */ + PhaseType cur_phase_ = 0; /**< Current phase */ + ObjGroupProxyType proxy_ = no_obj_group; /**< Objgroup proxy */ + HookIDMapType hooks_; /**< Map of regisstered hooks */ + std::size_t next_hook_id_ = 1; /**< Next ID for hook registration */ }; }} /* end namespace vt::phase */ diff --git a/src/vt/phase/registered_phase_hook.h b/src/vt/phase/registered_phase_hook.h new file mode 100644 index 0000000000..d8f94196be --- /dev/null +++ b/src/vt/phase/registered_phase_hook.h @@ -0,0 +1,99 @@ +/* +//@HEADER +// ***************************************************************************** +// +// registered_phase_hook.h +// DARMA Toolkit v. 1.0.0 +// DARMA/vt => Virtual Transport +// +// Copyright 2019 National Technology & Engineering Solutions of Sandia, LLC +// (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. +// Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from this +// software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact darma@sandia.gov +// +// ***************************************************************************** +//@HEADER +*/ + +#if !defined INCLUDED_VT_PHASE_REGISTERED_PHASE_HOOK_H +#define INCLUDED_VT_PHASE_REGISTERED_PHASE_HOOK_H + +#include "vt/phase/phase_hook_enum.h" + +namespace vt { namespace phase { + +// forward-decl for friendship +struct PhaseManager; + +/** + * \struct PhaseHookID + * + * \brief A registered phase hook used to identify it and unregister it. + */ +struct PhaseHookID { + +private: + /** + * \internal + * \brief Used by the system to create a new phase hook ID + * + * \param[in] in_type the type of hook + * \param[in] in_id the registered ID + */ + PhaseHookID(PhaseHook in_type, std::size_t in_id) + : type_(in_type), + id_(in_id) + { } + + friend struct PhaseManager; + +public: + /** + * \brief Get the type of hook + * + * \return the type of hook + */ + PhaseHook getType() const { return type_; } + + /** + * \brief Get the ID of the registered hook + * + * \return the registered hook ID + */ + std::size_t getID() const { return id_; } + +private: + PhaseHook type_; + std::size_t id_ = 0; +}; + +}} /* end namespace vt::phase */ + +#endif /*INCLUDED_VT_PHASE_REGISTERED_PHASE_HOOK_H*/ From a783b6f02fa7757e80d5bedf82675d40e9fa6370 Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Wed, 21 Oct 2020 17:11:51 -0700 Subject: [PATCH 04/36] #875: phase: add reduction for nextPhaseCollective --- src/vt/phase/phase_manager.cc | 28 ++++++++++++++++++++++++++++ src/vt/phase/phase_manager.h | 23 +++++++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/src/vt/phase/phase_manager.cc b/src/vt/phase/phase_manager.cc index d1567c0efd..313de1295c 100644 --- a/src/vt/phase/phase_manager.cc +++ b/src/vt/phase/phase_manager.cc @@ -44,6 +44,7 @@ #include "vt/phase/phase_manager.h" #include "vt/objgroup/headers.h" +#include "vt/pipe/pipe_manager.h" namespace vt { namespace phase { @@ -72,4 +73,31 @@ void PhaseManager::unregisterHook(PhaseHookID hook) { } } +struct NextMsg : collective::ReduceNoneMsg {}; + +void PhaseManager::nextPhaseCollective() { + vtAbortIf( + in_next_phase_collective_, + "A call to nextPhaseCollective has already been invoked." + " It must return before it is invoked again" + ); + in_next_phase_collective_ = true; + + // Convert bits to typed proxy + auto proxy = objgroup::proxy::Proxy(proxy_); + + // Start with a reduction to sure all nodes are ready for this + auto cb = theCB()->makeBcast(proxy); + auto msg = makeMessage(); + proxy.reduce(msg.get(), cb); + + theSched()->runSchedulerWhile([this]{ return not reduce_next_phase_done_; }); + + in_next_phase_collective_ = false; +} + +void PhaseManager::nextPhaseReduce(NextMsg* msg) { + reduce_next_phase_done_ = true; +} + }} /* end namespace vt::phase */ diff --git a/src/vt/phase/phase_manager.h b/src/vt/phase/phase_manager.h index db27a9711f..f5d247a663 100644 --- a/src/vt/phase/phase_manager.h +++ b/src/vt/phase/phase_manager.h @@ -52,6 +52,9 @@ namespace vt { namespace phase { +// fwd-decl for reduce messasge +struct NextMsg; + /** * \struct PhaseManager * @@ -98,11 +101,31 @@ struct PhaseManager : runtime::component::Component { */ void unregisterHook(PhaseHookID hook); + /** + * \brief Start the next phase collectively. + * + * \note Performs a reduction to coordinate across nodes and then triggers + * post-phase triggerable actions. This function does not return until the any + * post-phase actions, including migrations as a result, are terminated. + */ + void nextPhaseCollective(); + +private: + /** + * \internal + * \brief Reduce handler to kick off the next phase + * + * \param[in] msg the (empty) next phase message + */ + void nextPhaseReduce(NextMsg* msg); + private: PhaseType cur_phase_ = 0; /**< Current phase */ ObjGroupProxyType proxy_ = no_obj_group; /**< Objgroup proxy */ HookIDMapType hooks_; /**< Map of regisstered hooks */ std::size_t next_hook_id_ = 1; /**< Next ID for hook registration */ + bool in_next_phase_collective_ = false; /**< Whether blocked in next phase */ + bool reduce_next_phase_done_ = false; /**< Whether reduce is complete */ }; }} /* end namespace vt::phase */ From 77c67039208aa1b22682d346812c268f9ea4fbb8 Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Wed, 21 Oct 2020 19:35:27 -0700 Subject: [PATCH 05/36] #875: phase: make hooks collective/rooted --- src/vt/phase/phase_manager.cc | 69 +++++++++++++++++++++++++--- src/vt/phase/phase_manager.h | 51 ++++++++++++++++---- src/vt/phase/registered_phase_hook.h | 13 +++++- 3 files changed, 115 insertions(+), 18 deletions(-) diff --git a/src/vt/phase/phase_manager.cc b/src/vt/phase/phase_manager.cc index 313de1295c..abbc4e058b 100644 --- a/src/vt/phase/phase_manager.cc +++ b/src/vt/phase/phase_manager.cc @@ -55,19 +55,44 @@ namespace vt { namespace phase { return ptr; } -PhaseHookID PhaseManager::registerHook(PhaseHook type, ActionType trigger) { +PhaseHookID +PhaseManager::registerHookCollective(PhaseHook type, ActionType trigger) { + vtAssertNot( + in_next_phase_collective_, "Must not be in next phase to register" + ); + + bool const is_collective = true; + auto const type_bits = static_cast(type); + auto const hook_id = next_collective_hook_id_++; + collective_hooks_[type_bits][hook_id] = trigger; + return PhaseHookID{type, hook_id, is_collective}; +} + +PhaseHookID +PhaseManager::registerHookRooted(PhaseHook type, ActionType trigger) { + vtAssertNot( + in_next_phase_collective_, "Must not be in next phase to register" + ); + + bool const is_collective = true; auto const type_bits = static_cast(type); - auto const hook_id = next_hook_id_++; - hooks_[type_bits][hook_id] = trigger; - return PhaseHookID{type, hook_id}; + auto const hook_id = next_rooted_hook_id_++; + rooted_hooks_[type_bits][hook_id] = trigger; + return PhaseHookID{type, hook_id, is_collective}; } void PhaseManager::unregisterHook(PhaseHookID hook) { + vtAssertNot( + in_next_phase_collective_, "Must not be in next phase to unregister" + ); + auto const type = static_cast(hook.getType()); auto const id = hook.getID(); - auto iter = hooks_[type].find(id); - if (iter != hooks_[type].end()) { - hooks_[type].erase(iter); + auto const is_collective = hook.getIsCollective(); + auto& hooks = is_collective ? collective_hooks_ : rooted_hooks_; + auto iter = hooks[type].find(id); + if (iter != hooks[type].end()) { + hooks[type].erase(iter); } else { vtAssert(false, "Could not find registered hook ID to erase"); } @@ -92,6 +117,7 @@ void PhaseManager::nextPhaseCollective() { proxy.reduce(msg.get(), cb); theSched()->runSchedulerWhile([this]{ return not reduce_next_phase_done_; }); + reduce_next_phase_done_ = false; in_next_phase_collective_ = false; } @@ -100,4 +126,33 @@ void PhaseManager::nextPhaseReduce(NextMsg* msg) { reduce_next_phase_done_ = true; } +void PhaseManager::runHooks(PhaseHook type) { + auto const type_bits = static_cast(type); + + // start out running all rooted hooks of a particular type + { + auto iter = rooted_hooks_.find(type_bits); + if (iter != rooted_hooks_.end()) { + if (iter->second.size() > 0) { + for (auto&& fn : iter->second) { + runInEpochRooted([=]{ fn.second(); }); + } + } + } + } + + // then, run collective hooks that should be symmetric across nodes + { + auto iter = collective_hooks_.find(type_bits); + if (iter != collective_hooks_.end()) { + if (iter->second.size() > 0) { + // note, this second is a map, so they are ordered across nodes + for (auto&& fn : iter->second) { + runInEpochCollective([=]{ fn.second(); }); + } + } + } + } +} + }} /* end namespace vt::phase */ diff --git a/src/vt/phase/phase_manager.h b/src/vt/phase/phase_manager.h index f5d247a663..9c60df966a 100644 --- a/src/vt/phase/phase_manager.h +++ b/src/vt/phase/phase_manager.h @@ -50,6 +50,9 @@ #include "vt/phase/phase_hook_enum.h" #include "vt/phase/registered_phase_hook.h" +#include +#include + namespace vt { namespace phase { // fwd-decl for reduce messasge @@ -62,7 +65,7 @@ struct NextMsg; */ struct PhaseManager : runtime::component::Component { using HookIDType = typename std::underlying_type::type; - using HookMapType = std::unordered_map; + using HookMapType = std::map; using HookIDMapType = std::unordered_map; PhaseManager() = default; @@ -85,18 +88,38 @@ struct PhaseManager : runtime::component::Component { PhaseType getCurrentPhase() const { return cur_phase_; } /** - * \brief Register a phase hook that triggers depending on the type of hook + * \brief Collectively register a phase hook that triggers depending on the + * type of hook + * + * \note These must be registered across all nodes as they will be run in a + * collective epoch. This is for synchronized phase actions. * * \param[in] type the type of trigger to register * \param[in] trigger the action to trigger * * \return registered ID that can be used to unregister the hook */ - PhaseHookID registerHook(PhaseHook type, ActionType trigger); + PhaseHookID registerHookCollective(PhaseHook type, ActionType trigger); + + /** + * \brief Rooted register a phase hook that triggers depending on the type of + * hook + * + * \note This is an independent hook that runs on this node only + * + * \param[in] type the type of trigger to register + * \param[in] trigger the action to trigger + * + * \return registered ID that can be used to unregister the hook + */ + PhaseHookID registerHookRooted(PhaseHook type, ActionType trigger); /** * \brief Unregister an existing hook * + * \warning For collective hooks, they must all be unregistered across all + * nodes before the next \c nextPhaseCollective is invoked. + * * \param[in] hook the id of the hook to unregister */ void unregisterHook(PhaseHookID hook); @@ -119,13 +142,23 @@ struct PhaseManager : runtime::component::Component { */ void nextPhaseReduce(NextMsg* msg); + /** + * \internal + * \brief Run all the hooks registered here of a certain variety + * + * \param[in] type type of hook to run designated by the enum \c PhsaeHook + */ + void runHooks(PhaseHook type); + private: - PhaseType cur_phase_ = 0; /**< Current phase */ - ObjGroupProxyType proxy_ = no_obj_group; /**< Objgroup proxy */ - HookIDMapType hooks_; /**< Map of regisstered hooks */ - std::size_t next_hook_id_ = 1; /**< Next ID for hook registration */ - bool in_next_phase_collective_ = false; /**< Whether blocked in next phase */ - bool reduce_next_phase_done_ = false; /**< Whether reduce is complete */ + PhaseType cur_phase_ = 0; /**< Current phase */ + ObjGroupProxyType proxy_ = no_obj_group; /**< Objgroup proxy */ + HookIDMapType collective_hooks_; /**< Collective regisstered hooks */ + HookIDMapType rooted_hooks_; /**< Rooted regisstered hooks */ + std::size_t next_collective_hook_id_ = 1; /**< Next ID for collective hooks */ + std::size_t next_rooted_hook_id_ = 1; /**< Next ID for rooted hooks */ + bool in_next_phase_collective_ = false; /**< Whether blocked in next phase */ + bool reduce_next_phase_done_ = false; /**< Whether reduce is complete */ }; }} /* end namespace vt::phase */ diff --git a/src/vt/phase/registered_phase_hook.h b/src/vt/phase/registered_phase_hook.h index d8f94196be..9eeacbe2ee 100644 --- a/src/vt/phase/registered_phase_hook.h +++ b/src/vt/phase/registered_phase_hook.h @@ -67,9 +67,10 @@ struct PhaseHookID { * \param[in] in_type the type of hook * \param[in] in_id the registered ID */ - PhaseHookID(PhaseHook in_type, std::size_t in_id) + PhaseHookID(PhaseHook in_type, std::size_t in_id, bool in_is_collective) : type_(in_type), - id_(in_id) + id_(in_id), + is_collective_(in_is_collective) { } friend struct PhaseManager; @@ -89,9 +90,17 @@ struct PhaseHookID { */ std::size_t getID() const { return id_; } + /** + * \brief Get whether the hook is collective or not + * + * \return whether it is collective + */ + std::size_t getIsCollective() const { return is_collective_; } + private: PhaseHook type_; std::size_t id_ = 0; + bool is_collective_ = false; }; }} /* end namespace vt::phase */ From 3289ed9ee4c40c4e07980d74e3b0f7e7b61a9944 Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Wed, 21 Oct 2020 19:59:49 -0700 Subject: [PATCH 06/36] #875: phase: add call to invoke the hooks --- src/vt/phase/phase_manager.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/vt/phase/phase_manager.cc b/src/vt/phase/phase_manager.cc index abbc4e058b..a283eec431 100644 --- a/src/vt/phase/phase_manager.cc +++ b/src/vt/phase/phase_manager.cc @@ -119,6 +119,12 @@ void PhaseManager::nextPhaseCollective() { theSched()->runSchedulerWhile([this]{ return not reduce_next_phase_done_; }); reduce_next_phase_done_ = false; + runHooks(PhaseHook::End); + runHooks(PhaseHook::EndPostMigration); + + cur_phase_++; + runHooks(PhaseHook::Start); + in_next_phase_collective_ = false; } From 1b18e41dc21ddf7366a0022db455cea836e64599 Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Wed, 21 Oct 2020 20:00:04 -0700 Subject: [PATCH 07/36] #875: runtime: change deps for startup order --- src/vt/runtime/runtime.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/vt/runtime/runtime.cc b/src/vt/runtime/runtime.cc index 25a542134e..a34869fcf9 100644 --- a/src/vt/runtime/runtime.cc +++ b/src/vt/runtime/runtime.cc @@ -652,7 +652,8 @@ void Runtime::initializeComponents() { ); p_->registerComponent(&theMemUsage, Deps< - ctx::Context // Everything depends on theContext + ctx::Context, // Everything depends on theContext + phase::PhaseManager // For outputting memory at phase boundaries >{}); p_->registerComponent(&theRegistry, Deps< @@ -860,7 +861,8 @@ void Runtime::initializeComponents() { p_->registerComponent( &thePhase, Deps< - ctx::Context // Everything depends on theContext + ctx::Context, // Everything depends on theContext + objgroup::ObjGroupManager // Since it's an objgroup >{} ); From fe077e87668e276993ebb4eda19975e27c92e440 Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Wed, 21 Oct 2020 20:00:34 -0700 Subject: [PATCH 08/36] #875: utils: make memory usage component use hook for printing memory usage --- src/vt/utils/memory/memory_usage.cc | 24 ++++++++++++++++++++++++ src/vt/utils/memory/memory_usage.h | 2 ++ 2 files changed, 26 insertions(+) diff --git a/src/vt/utils/memory/memory_usage.cc b/src/vt/utils/memory/memory_usage.cc index 16bd227dba..f203d6430d 100644 --- a/src/vt/utils/memory/memory_usage.cc +++ b/src/vt/utils/memory/memory_usage.cc @@ -45,6 +45,7 @@ #include "vt/config.h" #include "vt/configs/arguments/app_config.h" #include "vt/utils/memory/memory_usage.h" +#include "vt/phase/phase_manager.h" #include "vt/context/context.h" #include @@ -355,6 +356,29 @@ MemoryUsage::MemoryUsage() { getFirstUsage(); } +void MemoryUsage::initialize() { + if (theConfig()->vt_print_memory_each_phase) { + auto this_node = theContext()->getNode(); + if ( + "all" == theConfig()->vt_print_memory_node or + std::to_string(this_node) == theConfig()->vt_print_memory_node + ) { + if (theMemUsage()->hasWorkingReporter()) { + thePhase()->registerHookRooted( + phase::PhaseHook::EndPostMigration, []{ + auto cur_phase = thePhase()->getCurrentPhase(); + auto memory_usage_str = fmt::format( + "Memory Usage: phase={}: {}\n", cur_phase, + theMemUsage()->getUsageAll() + ); + vt_print(gen, memory_usage_str); + } + ); + } + } + } +} + std::size_t MemoryUsage::getAverageUsage() { std::size_t usage = 0; std::size_t num_valid = 0; diff --git a/src/vt/utils/memory/memory_usage.h b/src/vt/utils/memory/memory_usage.h index cf617c62b6..51e0bd6885 100644 --- a/src/vt/utils/memory/memory_usage.h +++ b/src/vt/utils/memory/memory_usage.h @@ -125,6 +125,8 @@ struct MemoryUsage : runtime::component::Component { */ MemoryUsage(); + void initialize() override; + std::string name() override { return "MemoryUsage"; } /** From 11714f3f6368fa1ae054deb8f97c3280e690dfb1 Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Wed, 21 Oct 2020 20:05:32 -0700 Subject: [PATCH 09/36] #875: phase: add initial time start hooks are invoked --- src/vt/phase/phase_manager.cc | 5 +++++ src/vt/phase/phase_manager.h | 2 ++ 2 files changed, 7 insertions(+) diff --git a/src/vt/phase/phase_manager.cc b/src/vt/phase/phase_manager.cc index a283eec431..d3a104d9d7 100644 --- a/src/vt/phase/phase_manager.cc +++ b/src/vt/phase/phase_manager.cc @@ -98,6 +98,11 @@ void PhaseManager::unregisterHook(PhaseHookID hook) { } } +void PhaseManager::startup() { + // This is the last chance to fire any starting hooks for the very first phase + runHooks(PhaseHook::Start); +} + struct NextMsg : collective::ReduceNoneMsg {}; void PhaseManager::nextPhaseCollective() { diff --git a/src/vt/phase/phase_manager.h b/src/vt/phase/phase_manager.h index 9c60df966a..faa954de5b 100644 --- a/src/vt/phase/phase_manager.h +++ b/src/vt/phase/phase_manager.h @@ -72,6 +72,8 @@ struct PhaseManager : runtime::component::Component { std::string name() override { return "PhaseManager"; } + void startup() override; + /** * \internal * \brief Construct a new \c PhaseManager as an objgroup From 66d8270ace3ce80775e213ba46582cf501a0a2dc Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Wed, 21 Oct 2020 20:10:15 -0700 Subject: [PATCH 10/36] #875: trace: add trace hooks for flushing and toggling --- src/vt/trace/trace.cc | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/vt/trace/trace.cc b/src/vt/trace/trace.cc index cebbec98f3..af5f390527 100644 --- a/src/vt/trace/trace.cc +++ b/src/vt/trace/trace.cc @@ -52,6 +52,7 @@ #include "vt/trace/file_spec/spec.h" #include "vt/objgroup/headers.h" #include "vt/utils/memory/memory_usage.h" +#include "vt/phase/phase_manager.h" #include #include @@ -148,6 +149,15 @@ void Trace::startup() /*override*/ { theSched()->registerTrigger( sched::SchedulerEvent::EndIdle, [this]{ endIdle(); } ); + + thePhase()->registerHookRooted(phase::PhaseHook::End, []{ + auto const phase = thePhase()->getCurrentPhase(); + theTrace()->setTraceEnabledCurrentPhase(phase + 1); + }); + + thePhase()->registerHookCollective(phase::PhaseHook::EndPostMigration, []{ + theTrace()->flushTracesFile(false); + }); #endif } From 3d0fda413590e5e3101e3e2590457d0021a8d322 Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Wed, 21 Oct 2020 22:30:28 -0700 Subject: [PATCH 11/36] #875: phase: masssive simplification of phase code --- src/vt/vrt/collection/balance/elm_stats.h | 2 +- .../vrt/collection/balance/elm_stats.impl.h | 44 +-- .../balance/lb_invoke/lb_manager.cc | 180 +++-------- .../collection/balance/lb_invoke/lb_manager.h | 88 ++---- src/vt/vrt/collection/balance/node_stats.cc | 9 - src/vt/vrt/collection/balance/node_stats.h | 5 - src/vt/vrt/collection/balance/phase_msg.h | 12 + src/vt/vrt/collection/balance/proxy/lbable.h | 91 ------ .../collection/balance/proxy/lbable.impl.h | 142 --------- src/vt/vrt/collection/holders/base_holder.h | 1 - src/vt/vrt/collection/holders/col_holder.h | 7 - .../vrt/collection/holders/col_holder.impl.h | 10 - src/vt/vrt/collection/holders/entire_holder.h | 7 - .../collection/holders/entire_holder.impl.h | 43 --- src/vt/vrt/collection/holders/holder.h | 41 --- src/vt/vrt/collection/holders/holder.impl.h | 37 --- src/vt/vrt/collection/manager.cc | 59 +--- src/vt/vrt/collection/manager.fwd.h | 2 - src/vt/vrt/collection/manager.h | 168 +---------- src/vt/vrt/collection/manager.impl.h | 285 +----------------- .../proxy_traits/proxy_elm_traits.h | 7 +- 21 files changed, 112 insertions(+), 1128 deletions(-) delete mode 100644 src/vt/vrt/collection/balance/proxy/lbable.h delete mode 100644 src/vt/vrt/collection/balance/proxy/lbable.impl.h diff --git a/src/vt/vrt/collection/balance/elm_stats.h b/src/vt/vrt/collection/balance/elm_stats.h index f42185a5da..22c75d4e04 100644 --- a/src/vt/vrt/collection/balance/elm_stats.h +++ b/src/vt/vrt/collection/balance/elm_stats.h @@ -102,7 +102,7 @@ struct ElementStats { public: template - static void syncNextPhase(PhaseMsg* msg, ColT* col); + static void syncNextPhase(CollectStatsMsg* msg, ColT* col); friend struct collection::Migratable; diff --git a/src/vt/vrt/collection/balance/elm_stats.impl.h b/src/vt/vrt/collection/balance/elm_stats.impl.h index 178fc54744..4e15927962 100644 --- a/src/vt/vrt/collection/balance/elm_stats.impl.h +++ b/src/vt/vrt/collection/balance/elm_stats.impl.h @@ -71,7 +71,8 @@ void ElementStats::serialize(Serializer& s) { } template -/*static*/ void ElementStats::syncNextPhase(PhaseMsg* msg, ColT* col) { +/*static*/ +void ElementStats::syncNextPhase(CollectStatsMsg* msg, ColT* col) { auto& stats = col->stats_; vt_debug_print( @@ -85,52 +86,15 @@ template stats.updatePhase(1); auto const& cur_phase = msg->getPhase(); - auto const& proxy = col->getCollectionProxy(); auto const& untyped_proxy = col->getProxy(); auto const& total_load = stats.getLoad(cur_phase, getFocusedSubPhase(untyped_proxy)); auto const& subphase_loads = stats.subphase_timings_.at(cur_phase); auto const& comm = stats.getComm(cur_phase); auto const& subphase_comm = stats.getSubphaseComm(cur_phase); - auto const& idx = col->getIndex(); - auto const& elm_proxy = proxy[idx]; - theNodeStats()->addNodeStats(col, cur_phase, total_load, subphase_loads, comm, subphase_comm); - - auto const before_ready = theCollection()->numReadyCollections(); - theCollection()->makeCollectionReady(untyped_proxy); - auto const after_ready = theCollection()->numReadyCollections(); - auto const ready = theCollection()->readyNextPhase(); - - vt_debug_print( - lb, node, - "ElementStats: syncNextPhase: before_ready={}, after_ready={}, ready={}\n", - before_ready, after_ready, ready + theNodeStats()->addNodeStats( + col, cur_phase, total_load, subphase_loads, comm, subphase_comm ); - - auto lb_man = theLBManager()->getProxy(); - - auto const single_node = theContext()->getNumNodes() == 1; - auto const lb = lb_man.get()->decideLBToRun(cur_phase); - bool const must_run_lb = lb != LBType::NoLB and not single_node; - auto const num_collections = theCollection()->numCollections<>(); - auto const do_sync = msg->doSync(); - auto nmsg = makeMessage(cur_phase,lb,msg->manual(),num_collections); - - if (must_run_lb) { - auto cb = theCB()->makeBcast(lb_man); - proxy.reduce(nmsg.get(),cb); - } else { - - // Preemptively release the element directly, doing cleanup later after a - // collection reduction. This allows work to start early while still - // releasing the node-level LB continuations needed for cleanup - if (lb == LBType::NoLB and not do_sync) { - theCollection()->elmFinishedLB(elm_proxy,cur_phase); - } - - auto cb = theCB()->makeBcast(lb_man); - proxy.reduce(nmsg.get(),cb); - } } }}}} /* end namespace vt::vrt::collection::balance */ diff --git a/src/vt/vrt/collection/balance/lb_invoke/lb_manager.cc b/src/vt/vrt/collection/balance/lb_invoke/lb_manager.cc index 6c5c43a4ea..cd6111969c 100644 --- a/src/vt/vrt/collection/balance/lb_invoke/lb_manager.cc +++ b/src/vt/vrt/collection/balance/lb_invoke/lb_manager.cc @@ -185,114 +185,71 @@ LBManager::runLB(LBProxyType base_proxy, PhaseType phase) { "LBManager: finished migrations\n" ); theNodeStats()->startIterCleanup(phase, model_->getNumPastPhasesNeeded()); - this->finishedRunningLB(phase); + this->finishedLB(phase); }); } -void LBManager::collectiveImpl( - PhaseType phase, LBType lb, bool manual, std::size_t num_calls -) { +void LBManager::selectStartLB(PhaseType phase) { + LBType lb = decideLBToRun(phase, true); + startLB(phase, lb); +} + +void LBManager::startLB(PhaseType phase, LBType lb) { vt_debug_print( lb, node, - "collectiveImpl: phase={}, manual={}, num_invocations_={}, num_calls={}, " - "num_release={}\n", - phase, manual, num_invocations_, num_calls, num_release_ + "LBManager::startLB: phase={}\n", phase ); - num_invocations_++; + auto const& this_node = theContext()->getNode(); - if (num_invocations_ == num_calls) { - auto const& this_node = theContext()->getNode(); + if (this_node == 0 and not theConfig()->vt_lb_quiet) { + vt_debug_print( + lb, node, + "LBManager::startLB: phase={}, balancer={}, name={}\n", + phase, + static_cast::type>(lb), + lb_names_[lb] + ); + } - if (this_node == 0 and not theConfig()->vt_lb_quiet) { - vt_debug_print( - lb, node, - "LBManager::collectiveImpl: phase={}, balancer={}, name={}\n", - phase, - static_cast::type>(lb), - lb_names_[lb] - ); - } + if (lb == LBType::NoLB) { + // perform cleanup actions and then return out + theNodeStats()->startIterCleanup(phase, model_->getNumPastPhasesNeeded()); + finishedLB(phase); + return; + } - switch (lb) { - case LBType::HierarchicalLB: lb_instances_["chosen"] = makeLB(); break; - case LBType::GreedyLB: lb_instances_["chosen"] = makeLB(); break; - case LBType::RotateLB: lb_instances_["chosen"] = makeLB(); break; - case LBType::GossipLB: lb_instances_["chosen"] = makeLB(); break; - case LBType::StatsMapLB: lb_instances_["chosen"] = makeLB(); break; - case LBType::RandomLB: lb_instances_["chosen"] = makeLB(); break; + switch (lb) { + case LBType::HierarchicalLB: lb_instances_["chosen"] = makeLB(); break; + case LBType::GreedyLB: lb_instances_["chosen"] = makeLB(); break; + case LBType::RotateLB: lb_instances_["chosen"] = makeLB(); break; + case LBType::GossipLB: lb_instances_["chosen"] = makeLB(); break; + case LBType::StatsMapLB: lb_instances_["chosen"] = makeLB(); break; + case LBType::RandomLB: lb_instances_["chosen"] = makeLB(); break; # if vt_check_enabled(zoltan) - case LBType::ZoltanLB: lb_instances_["chosen"] = makeLB(); break; + case LBType::ZoltanLB: lb_instances_["chosen"] = makeLB(); break; # endif - case LBType::NoLB: - vtAssert(false, "LBType::NoLB is not a valid LB for collectiveImpl"); - break; - default: - vtAssert(false, "A valid LB must be passed to collectiveImpl"); - break; - } - - LBProxyType base_proxy = lb_instances_["chosen"]; - - runLB(base_proxy, phase); + case LBType::NoLB: + vtAssert(false, "LBType::NoLB is not a valid LB for collectiveImpl"); + break; + default: + vtAssert(false, "A valid LB must be passed to collectiveImpl"); + break; } -} - -void LBManager::waitLBCollective() { - vt_debug_print( - lb, node, - "waitLBCollective (begin)\n" - ); - - // - // The invocation should only happen collectively across the whole all nodes. - // - theTerm()->produce(); - theSched()->runSchedulerWhile([this]{ return synced_in_lb_; }); - synced_in_lb_ = true; - theTerm()->consume(); - - vt_debug_print( - lb, node, - "waitLBCollective (end)\n" - ); -} - -void LBManager::finishedRunningLB(PhaseType phase) { - vt_debug_print( - lb, node, - "finishedRunningLB\n" - ); - releaseImpl(phase); -} - -void LBManager::releaseImpl(PhaseType phase, std::size_t num_calls) { - vt_debug_print( - lb, node, - "releaseImpl: phase={}, num_invocations_={}, num_calls={}, num_release={}\n", - phase, num_invocations_, num_calls, num_release_ - ); - vtAssert( - num_calls != 0 or - num_invocations_ > 0, "Must be automatically invoked to releaseImpl" - ); - num_release_++; - if (num_release_ == num_calls or num_release_ == num_invocations_) { - releaseNow(phase); - } + LBProxyType base_proxy = lb_instances_["chosen"]; + runLB(base_proxy, phase); } -void LBManager::releaseNow(PhaseType phase) { - vt_debug_print(lb, node, "releaseNow\n"); +void LBManager::finishedLB(PhaseType phase) { + vt_debug_print(lb, node, "finishedLB\n"); auto this_node = theContext()->getNode(); if (this_node == 0) { vt_print( lb, - "LBManager::releaseNow: finished LB, phase={}, invocations={}\n", - phase, num_invocations_ + "LBManager::finishedLB, phase={}\n", phase ); } @@ -304,42 +261,8 @@ void LBManager::releaseNow(PhaseType phase) { if (destroy_lb_ != nullptr) { triggerListeners(phase); destroy_lb_(); - printMemoryUsage(phase); destroy_lb_ = nullptr; } - releaseLBPhase(msg.get()); - synced_in_lb_ = false; - num_invocations_ = num_release_ = 0; -} - -void LBManager::sysLB(InvokeMsg* msg) { - vt_debug_print(lb, node, "sysLB\n"); - printMemoryUsage(msg->phase_); - flushTraceNextPhase(); - setTraceEnabledNextPhase(msg->phase_); - return collectiveImpl(msg->phase_, msg->lb_, msg->manual_, msg->num_collections_); -} - -void LBManager::sysReleaseLB(InvokeMsg* msg) { - vt_debug_print(lb, node, "sysReleaseLB\n"); - printMemoryUsage(msg->phase_); - flushTraceNextPhase(); - setTraceEnabledNextPhase(msg->phase_); - return releaseImpl(msg->phase_, msg->num_collections_); -} - -void LBManager::setTraceEnabledNextPhase(PhaseType phase) { - // Set if tracing is enabled for this next phase. Do this immediately before - // LB runs so LB is always instrumented as the beginning of the next phase -#if vt_check_enabled(trace_enabled) - theTrace()->setTraceEnabledCurrentPhase(phase + 1); -# endif -} - -void LBManager::flushTraceNextPhase() { -#if vt_check_enabled(trace_enabled) - theTrace()->flushTracesFile(false); -# endif } int LBManager::registerListenerAfterLB(ListenerFnType fn) { @@ -354,23 +277,6 @@ void LBManager::unregisterListenerAfterLB(int element) { listeners_[element] = nullptr; } -void LBManager::printMemoryUsage(PhaseType phase) { - if (theConfig()->vt_print_memory_each_phase) { - auto this_node = theContext()->getNode(); - if ( - "all" == theConfig()->vt_print_memory_node or - std::to_string(this_node) == theConfig()->vt_print_memory_node - ) { - if (theMemUsage()->hasWorkingReporter()) { - auto memory_usage_str = fmt::format( - "Memory Usage: phase={}: {}\n", phase, theMemUsage()->getUsageAll() - ); - vt_print(gen, memory_usage_str); - } - } - } -} - void LBManager::triggerListeners(PhaseType phase) { for (auto&& l : listeners_) { if (l) { diff --git a/src/vt/vrt/collection/balance/lb_invoke/lb_manager.h b/src/vt/vrt/collection/balance/lb_invoke/lb_manager.h index bb8838909d..8696c06d43 100644 --- a/src/vt/vrt/collection/balance/lb_invoke/lb_manager.h +++ b/src/vt/vrt/collection/balance/lb_invoke/lb_manager.h @@ -86,7 +86,8 @@ struct LBManager : runtime::component::Component { public: /** - * \internal \brief Decide which LB to invoke given a certain phase + * \internal + * \brief Decide which LB to invoke given a certain phase * * \param[in] phase the phase in question * \param[in] try_file whether to try to read from file @@ -96,13 +97,8 @@ struct LBManager : runtime::component::Component { LBType decideLBToRun(PhaseType phase, bool try_file = true); /** - * \internal \brief Collectively wait for LB, used to invoke without - * consideration of readiness of the state of live collections - */ - void waitLBCollective(); - - /** - * \internal \brief Get the proxy for the LBManager + * \internal + * \brief Get the proxy for the LBManager * * \return proxy to the \c LBManager */ @@ -111,7 +107,8 @@ struct LBManager : runtime::component::Component { } /** - * \internal \brief Setup the proxy for the LBManager + * \internal + * \brief Setup the proxy for the LBManager * * \param[in] proxy the proxy to set */ @@ -120,81 +117,30 @@ struct LBManager : runtime::component::Component { } /** - * \internal \brief Tell the manager the LB is finished. - * - * \warning This should *not* be called by the - * user, only by load balancers. Not private/protected as friending every LBs - * adds too much overhead - - * \param[in] phase the phase that is finished - */ - void finishedRunningLB(PhaseType phase); - -protected: - /** - * \internal \brief Collectively start load balancing - * - * \param[in] phase the phase - * \param[in] lb the load balancer to run - * \param[in] manual whether it's manual or invoked from a collection - * \param[in] num_calls number of calls required to start - */ - void collectiveImpl( - PhaseType phase, LBType lb, bool manual, std::size_t num_calls = 1 - ); - - /** - * \internal \brief Release control back to user - * - * \param[in] phase the phase - * \param[in] num_calls number of calls required to start - */ - void releaseImpl(PhaseType phase, std::size_t num_calls = 0); - - /** - * \internal \brief Release control back to user now (without counting down) + * \internal + * \brief Collectively start load balancing after deciding which to run * * \param[in] phase the phase */ - void releaseNow(PhaseType phase); + void selectStartLB(PhaseType phase); -public: /** - * \internal \brief Print the memory usage for a phase + * \internal + * \brief Collectively start load balancing * * \param[in] phase the phase + * \param[in] lb the load balancer to run */ - void printMemoryUsage(PhaseType phase); + void startLB(PhaseType phase, LBType lb); +protected: /** - * \internal \brief Communicate to the trace component that a new phase - * occurred, so tracing can be enabled or disabled + * \internal + * \brief Call when LB is finished to complete post-LB actions * * \param[in] phase the phase */ - void setTraceEnabledNextPhase(PhaseType phase); - - /** - * \internal \brief Communicate to the trace component that a new phase - * occurred so flushing of traces can occur if required - */ - void flushTraceNextPhase(); - - /** - * \internal \brief Tell the manager that a collection has hit \c nextPhase so - * load balancing can begin once all collections enter - * - * \param[in] msg the LB message - */ - void sysLB(InvokeMsg* msg); - - /** - * \internal \brief Tell the manager that a collection has hit \c nextPhase, - * choosing to skip load balancing - * - * \param[in] msg the LB message - */ - void sysReleaseLB(InvokeMsg* msg); + void finishedLB(PhaseType phase); public: /** diff --git a/src/vt/vrt/collection/balance/node_stats.cc b/src/vt/vrt/collection/balance/node_stats.cc index 3da01d8c4b..aa46e376d7 100644 --- a/src/vt/vrt/collection/balance/node_stats.cc +++ b/src/vt/vrt/collection/balance/node_stats.cc @@ -165,15 +165,6 @@ ElementIDType NodeStats::getNextElm() { return (elm << 32) | this_node; } -void NodeStats::releaseLB() { - using MsgType = CollectionPhaseMsg; - auto msg = makeMessage(); - auto msg_hold = promoteMsg(msg.get()); - theMsg()->broadcastMsg(msg); - - CollectionManager::releaseLBPhase(msg_hold.get()); -} - void NodeStats::initialize() { #if vt_check_enabled(lblite) if (theConfig()->vt_lb_stats) { diff --git a/src/vt/vrt/collection/balance/node_stats.h b/src/vt/vrt/collection/balance/node_stats.h index ab73c031c7..0e3689071f 100644 --- a/src/vt/vrt/collection/balance/node_stats.h +++ b/src/vt/vrt/collection/balance/node_stats.h @@ -128,11 +128,6 @@ struct NodeStats : runtime::component::Component { */ void startIterCleanup(PhaseType phase, unsigned int look_back); - /** - * \internal \brief Release collection after LB runs for this phase - */ - void releaseLB(); - /** * \internal \brief Output stats file for given phase based on instrumented data * diff --git a/src/vt/vrt/collection/balance/phase_msg.h b/src/vt/vrt/collection/balance/phase_msg.h index e0be4be54d..86a82b4e2a 100644 --- a/src/vt/vrt/collection/balance/phase_msg.h +++ b/src/vt/vrt/collection/balance/phase_msg.h @@ -84,6 +84,18 @@ using PhaseReduceMsg = PhaseMsgBase< ColT,collective::ReduceTMsg >; +template +struct CollectStatsMsg : CollectionMessage { + CollectStatsMsg(PhaseType in_phase) + : phase_(in_phase) + { } + + PhaseType getPhase() const { return phase_; } + +private: + PhaseType phase_ = fst_lb_phase; +}; + }}}} /* end namespace vt::vrt::collection::balance */ #endif /*INCLUDED_VRT_COLLECTION_BALANCE_PHASE_MSG_H*/ diff --git a/src/vt/vrt/collection/balance/proxy/lbable.h b/src/vt/vrt/collection/balance/proxy/lbable.h deleted file mode 100644 index f9e4d46e7c..0000000000 --- a/src/vt/vrt/collection/balance/proxy/lbable.h +++ /dev/null @@ -1,91 +0,0 @@ -/* -//@HEADER -// ***************************************************************************** -// -// lbable.h -// DARMA Toolkit v. 1.0.0 -// DARMA/vt => Virtual Transport -// -// Copyright 2019 National Technology & Engineering Solutions of Sandia, LLC -// (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. -// Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// * Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from this -// software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact darma@sandia.gov -// -// ***************************************************************************** -//@HEADER -*/ - -#if !defined INCLUDED_VT_VRT_COLLECTION_BALANCE_PROXY_LBABLE_H -#define INCLUDED_VT_VRT_COLLECTION_BALANCE_PROXY_LBABLE_H - -#include "vt/config.h" -#include "vt/messaging/message/smart_ptr.h" - -#include - -namespace vt { namespace vrt { namespace collection { - -template -struct LBable : BaseProxyT { - using FinishedLBType = std::function; - - LBable() = default; - LBable( - typename BaseProxyT::ProxyType const& in_proxy, - typename BaseProxyT::ElementProxyType const& in_elm - ); - - template - void serialize(SerializerT& s); - - template < - typename MsgT, ActiveColMemberTypedFnType f, typename... Args - > - void LBsync(Args&&... args) const; - template f> - void LBsync(MsgT* msg, PhaseType p = no_lb_phase) const; - template f> - void LBsync(MsgSharedPtr msg, PhaseType p = no_lb_phase) const; - void LBsync(FinishedLBType cont, PhaseType p = no_lb_phase) const; - - template < - typename MsgT, ActiveColMemberTypedFnType f, typename... Args - > - void LB(Args&&... args) const; - template f> - void LB(MsgT* msg, PhaseType p = no_lb_phase) const; - template f> - void LB(MsgSharedPtr msg, PhaseType p = no_lb_phase) const; - void LB(FinishedLBType cont, PhaseType p = no_lb_phase) const; -}; - -}}} /* end namespace vt::vrt::collection */ - -#endif /*INCLUDED_VT_VRT_COLLECTION_BALANCE_PROXY_LBABLE_H*/ diff --git a/src/vt/vrt/collection/balance/proxy/lbable.impl.h b/src/vt/vrt/collection/balance/proxy/lbable.impl.h deleted file mode 100644 index f6a712df9d..0000000000 --- a/src/vt/vrt/collection/balance/proxy/lbable.impl.h +++ /dev/null @@ -1,142 +0,0 @@ -/* -//@HEADER -// ***************************************************************************** -// -// lbable.impl.h -// DARMA Toolkit v. 1.0.0 -// DARMA/vt => Virtual Transport -// -// Copyright 2019 National Technology & Engineering Solutions of Sandia, LLC -// (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. -// Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// * Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from this -// software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact darma@sandia.gov -// -// ***************************************************************************** -//@HEADER -*/ - -#if !defined INCLUDED_VT_VRT_COLLECTION_BALANCE_PROXY_LBABLE_IMPL_H -#define INCLUDED_VT_VRT_COLLECTION_BALANCE_PROXY_LBABLE_IMPL_H - -#include "vt/config.h" -#include "vt/vrt/collection/balance/proxy/lbable.h" -#include "vt/vrt/collection/manager.h" - -namespace vt { namespace vrt { namespace collection { - -template -LBable::LBable( - typename BaseProxyT::ProxyType const& in_proxy, - typename BaseProxyT::ElementProxyType const& in_elm -) : BaseProxyT(in_proxy, in_elm) -{ } - -template -template -void LBable::serialize(SerializerT& s) { - BaseProxyT::serialize(s); -} - -template -template f> -void LBable::LBsync(MsgT* msg, PhaseType p) const { - auto col_proxy = this->getCollectionProxy(); - auto elm_proxy = this->getElementProxy(); - auto proxy = VrtElmProxy(col_proxy,elm_proxy); - return theCollection()->elmReadyLB(proxy,p,msg,true); -} - - -template -template < - typename MsgT, ActiveColMemberTypedFnType f, typename... Args -> -void LBable::LBsync(Args&&... args) const { - return LBsync(makeMessage(args...)); -} - - -template -template f> -void LBable::LBsync( - MsgSharedPtr msg, PhaseType p -) const { - return LBsync(msg.get(),p); -} - -template -void LBable::LBsync( - FinishedLBType cont, PhaseType p -) const { - auto col_proxy = this->getCollectionProxy(); - auto elm_proxy = this->getElementProxy(); - auto proxy = VrtElmProxy(col_proxy,elm_proxy); - return theCollection()->elmReadyLB(proxy,p,true,cont); -} - -template -template f> -void LBable::LB(MsgT* msg, PhaseType p) const { - auto col_proxy = this->getCollectionProxy(); - auto elm_proxy = this->getElementProxy(); - auto proxy = VrtElmProxy(col_proxy,elm_proxy); - return theCollection()->elmReadyLB(proxy,p,msg,false); -} - -template -template < - typename MsgT, ActiveColMemberTypedFnType f, typename... Args -> -void LBable::LB(Args&&... args) const { - return LB(makeMessage(args...)); -} - -template -template f> -void LBable::LB( - MsgSharedPtr msg, PhaseType p -) const { - return LB(msg.get(),p); -} - -template -void LBable::LB( - FinishedLBType cont, PhaseType p -) const { - auto col_proxy = this->getCollectionProxy(); - auto elm_proxy = this->getElementProxy(); - auto proxy = VrtElmProxy(col_proxy,elm_proxy); - return theCollection()->elmReadyLB(proxy,p,false,cont); -} - - -}}} /* end namespace vt::vrt::collection */ - -#endif /*INCLUDED_VT_VRT_COLLECTION_BALANCE_PROXY_LBABLE_IMPL_H*/ diff --git a/src/vt/vrt/collection/holders/base_holder.h b/src/vt/vrt/collection/holders/base_holder.h index e6f09e13c4..d989c00a85 100644 --- a/src/vt/vrt/collection/holders/base_holder.h +++ b/src/vt/vrt/collection/holders/base_holder.h @@ -53,7 +53,6 @@ namespace vt { namespace vrt { namespace collection { struct BaseHolder { BaseHolder() = default; - virtual void runLB(PhaseType phase) = 0; virtual void destroy() = 0; }; diff --git a/src/vt/vrt/collection/holders/col_holder.h b/src/vt/vrt/collection/holders/col_holder.h index 56742900d9..5095244795 100644 --- a/src/vt/vrt/collection/holders/col_holder.h +++ b/src/vt/vrt/collection/holders/col_holder.h @@ -83,13 +83,6 @@ struct CollectionHolder : BaseHolder { */ void destroy() override; - /** - * \internal \brief Run LB continuations on all collection elements - * - * \param[in] phase the phase - */ - void runLB(PhaseType phase) override; - bool is_static_ = false; /**< Whether is static sized */ HandlerType map_fn = uninitialized_handler; /**< The map function */ IndexT max_idx; /**< Index range for collection */ diff --git a/src/vt/vrt/collection/holders/col_holder.impl.h b/src/vt/vrt/collection/holders/col_holder.impl.h index 34c56dbe29..93b65d4188 100644 --- a/src/vt/vrt/collection/holders/col_holder.impl.h +++ b/src/vt/vrt/collection/holders/col_holder.impl.h @@ -61,16 +61,6 @@ void CollectionHolder::destroy() { holder_.destroyAll(); } -template -void CollectionHolder::runLB(PhaseType cur_phase) { - holder_.foreach([=](IndexT const& idx, CollectionBase* base){ - auto proxy = base->getCollectionProxy(); - auto phase = cur_phase == no_lb_phase ? base->getStats().getPhase() : cur_phase; - auto phase_msg = makeMessage>(phase,proxy,true,true); - balance::ElementStats::syncNextPhase(phase_msg.get(), static_cast(base)); - }); -} - }}} /* end namespace vt::vrt::collection */ #endif /*INCLUDED_VRT_COLLECTION_HOLDERS_COL_HOLDER_IMPL_H*/ diff --git a/src/vt/vrt/collection/holders/entire_holder.h b/src/vt/vrt/collection/holders/entire_holder.h index 3e3eaca75b..56f9f65d7a 100644 --- a/src/vt/vrt/collection/holders/entire_holder.h +++ b/src/vt/vrt/collection/holders/entire_holder.h @@ -59,12 +59,6 @@ template struct UniversalIndexHolder { static void destroyAllLive(); static void destroyCollection(VirtualProxyType const proxy); - static bool readyNextPhase(); - static void makeCollectionReady(VirtualProxyType const proxy); - static void runLB(PhaseType phase); - static void resetPhase(); - static std::size_t getNumCollections(); - static std::size_t getNumReadyCollections(); static void insertMap( VirtualProxyType const proxy, HandlerType const& han, EpochType const& insert_epoch = no_epoch @@ -76,7 +70,6 @@ struct UniversalIndexHolder { ); static EpochType insertGetEpoch(VirtualProxyType const proxy); static std::unordered_map insert_epoch_; - static std::unordered_set ready_collections_; static std::unordered_map< VirtualProxyType,std::shared_ptr > live_collections_; diff --git a/src/vt/vrt/collection/holders/entire_holder.impl.h b/src/vt/vrt/collection/holders/entire_holder.impl.h index f7b2635607..0e835f5f99 100644 --- a/src/vt/vrt/collection/holders/entire_holder.impl.h +++ b/src/vt/vrt/collection/holders/entire_holder.impl.h @@ -64,14 +64,6 @@ template live_collections_.clear(); } -template -/*static*/ void UniversalIndexHolder::runLB(PhaseType phase) { - for (auto&& elm : live_collections_) { - auto base_holder = elm.second; - base_holder->runLB(phase); - } -} - template /*static*/ void UniversalIndexHolder::destroyCollection( VirtualProxyType const proxy @@ -82,37 +74,6 @@ template } } -template -/*static*/ bool UniversalIndexHolder::readyNextPhase() { - auto const ready_coll = getNumReadyCollections(); - auto const total_coll = getNumCollections(); - return ready_coll == total_coll; -} - -template -/*static*/ void UniversalIndexHolder::makeCollectionReady( - VirtualProxyType const proxy -) { - ready_collections_.insert(proxy); -} - -template -/*static*/ void UniversalIndexHolder::resetPhase() { - ready_collections_.clear(); -} - -template -/*static*/ std::size_t -UniversalIndexHolder::getNumCollections() { - return live_collections_.size(); -} - -template -/*static*/ std::size_t -UniversalIndexHolder::getNumReadyCollections() { - return ready_collections_.size(); -} - template /*static*/ void UniversalIndexHolder::insertMap( VirtualProxyType const proxy, HandlerType const& han, @@ -165,10 +126,6 @@ template /*static*/ std::unordered_map UniversalIndexHolder::live_collections_map_; -template -/*static*/ std::unordered_set -UniversalIndexHolder::ready_collections_ = {}; - template /*static*/ std::unordered_map UniversalIndexHolder::insert_epoch_ = {}; diff --git a/src/vt/vrt/collection/holders/holder.h b/src/vt/vrt/collection/holders/holder.h index 580860ca45..0ab0eee8d7 100644 --- a/src/vt/vrt/collection/holders/holder.h +++ b/src/vt/vrt/collection/holders/holder.h @@ -226,45 +226,6 @@ struct Holder { */ void setGroupRoot(NodeType const root) { group_root_ = root; } - /** - * \brief Get number of elements ready for LB - * - * \return number of ready elements - */ - CountType numReady() const { return elements_ready_; } - - /** - * \brief Add number of ready elements for LB - * - * \param[in] num number of ready elements - */ - void addReady(CountType num = 1) { elements_ready_ += 1; } - - /** - * \brief Clear all ready elements - */ - void clearReady() { elements_ready_ = 0; } - - /** - * \brief Add a LB continuation for an element - * - * \param[in] idx the index - * \param[in] fn the continuation - */ - void addLBCont(IndexT const& idx, LBContFnType fn); - - /** - * \brief Run LB continuations for element - * - * \param[in] idx the index - */ - void runLBCont(IndexT const& idx); - - /** - * \brief Run all LB continuations - */ - void runLBCont(); - /** * \brief Add element-specific listener * @@ -297,14 +258,12 @@ struct Holder { std::unordered_map*> bcasts_ = {}; EpochType cur_bcast_epoch_ = 1; TypedIndexContainer vc_container_ = {}; - TypedLBContainer vc_lb_continuation_ = {}; bool is_destroyed_ = false; GroupType cur_group_ = no_group; bool use_group_ = false; bool group_ready_ = false; NodeType group_root_ = 0; CountType num_erased_not_removed_ = 0; - CountType elements_ready_ = 0; std::vector> event_listeners_ = {}; }; diff --git a/src/vt/vrt/collection/holders/holder.impl.h b/src/vt/vrt/collection/holders/holder.impl.h index 0d51052d61..481387661a 100644 --- a/src/vt/vrt/collection/holders/holder.impl.h +++ b/src/vt/vrt/collection/holders/holder.impl.h @@ -188,43 +188,6 @@ Holder::numElementsExpr(FuncExprType fn) const { return num_in; } -template -void Holder::addLBCont(IndexT const& idx, LBContFnType fn) { - using MappedType = typename TypedLBContainer::mapped_type; - auto iter = vc_lb_continuation_.find(idx); - if (iter == vc_lb_continuation_.end()) { - vc_lb_continuation_.emplace( - std::piecewise_construct, - std::forward_as_tuple(idx), - std::forward_as_tuple(MappedType{}) - ); - iter = vc_lb_continuation_.find(idx); - } - vtAssert(iter != vc_lb_continuation_.end(), "Key must exist in map"); - iter->second.push_back(fn); -} - -template -void Holder::runLBCont(IndexT const& idx) { - auto iter = vc_lb_continuation_.find(idx); - if (iter != vc_lb_continuation_.end()) { - for (auto&& elm : iter->second) { - elm(); - } - vc_lb_continuation_.erase(iter); - } -} - -template -void Holder::runLBCont() { - for (auto&& idxc : vc_lb_continuation_) { - for (auto&& elm : idxc.second) { - elm(); - } - } - vc_lb_continuation_.clear(); -} - template int Holder::addListener(listener::ListenFnType fn) { event_listeners_.push_back(fn); diff --git a/src/vt/vrt/collection/manager.cc b/src/vt/vrt/collection/manager.cc index 0d15168387..e61446c3b3 100644 --- a/src/vt/vrt/collection/manager.cc +++ b/src/vt/vrt/collection/manager.cc @@ -60,47 +60,22 @@ void CollectionManager::finalize() { /*virtual*/ CollectionManager::~CollectionManager() { } +void CollectionManager::startup() { #if vt_check_enabled(lblite) -struct StartRootedMsg : vt::Message { - StartRootedMsg() = default; - explicit StartRootedMsg(PhaseType in_phase) : phase_(in_phase) { } - PhaseType phase_; -}; - -static void startRootedBroadcast(StartRootedMsg* msg) { - theCollection()->startPhaseCollective(nullptr, msg->phase_); -} -#endif - -void CollectionManager::startPhaseRooted( - ActionFinishedLBType fn, PhaseType lb_phase -) { -#if vt_check_enabled(lblite) - auto msg = makeMessage(lb_phase); - theMsg()->broadcastMsg(msg); - startPhaseCollective(fn, lb_phase); -#else - if (fn != nullptr) { - fn(); - } -#endif -} - -void CollectionManager::startPhaseCollective( - ActionFinishedLBType fn, PhaseType lb_phase -) { -#if vt_check_enabled(lblite) - UniversalIndexHolder<>::runLB(lb_phase); - if (fn != nullptr) { - theTerm()->produce(term::any_epoch_sentinel); - lb_continuations_.push_back(fn); - } else { - theLBManager()->waitLBCollective(); - } -#else - if (fn != nullptr) { - fn(); - } + // First hook, do all stat manipulation + thePhase()->registerHookCollective(phase::PhaseHook::End, []{ + auto const& map = theCollection()->collect_stats_for_lb_; + for (auto&& elm : map) { + // this will trigger all the data collection required for LB + elm.second(); + } + }); + + // Second hook, select and then potentially start the LB + thePhase()->registerHookCollective(phase::PhaseHook::End, []{ + auto const cur_phase = thePhase()->getCurrentPhase(); + theLBManager()->selectStartLB(cur_phase); + }); #endif } @@ -109,10 +84,6 @@ getDispatcher(auto_registry::AutoHandlerType const& han) { return theCollection()->getDispatcher(han); } -void releaseLBPhase(CollectionPhaseMsg* msg) { - CollectionManager::releaseLBPhase<>(msg); -} - balance::ElementIDType CollectionManager::getCurrentContextPerm() const { return cur_context_perm_elm_id_; } diff --git a/src/vt/vrt/collection/manager.fwd.h b/src/vt/vrt/collection/manager.fwd.h index 1d2f59ce56..7f1b769848 100644 --- a/src/vt/vrt/collection/manager.fwd.h +++ b/src/vt/vrt/collection/manager.fwd.h @@ -56,8 +56,6 @@ struct CollectionPhaseMsg; DispatchBasePtrType getDispatcher(auto_registry::AutoHandlerType const& han); -void releaseLBPhase(CollectionPhaseMsg* msg); - }}} /* end namespace vt::vrt::collection */ namespace vt { diff --git a/src/vt/vrt/collection/manager.h b/src/vt/vrt/collection/manager.h index f4111fbea3..ef6085788f 100644 --- a/src/vt/vrt/collection/manager.h +++ b/src/vt/vrt/collection/manager.h @@ -110,8 +110,6 @@ struct CollectionManager template using VirtualPtrType = typename Holder::VirtualPtrType; using ActionProxyType = std::function; - using ActionFinishedLBType = std::function; - using NoElementActionType = std::function; template using ReduceIdxFuncType = std::function; using ReduceVirtualIDType = collective::reduce::ReduceVirtualIDType; @@ -162,6 +160,8 @@ struct CollectionManager virtual ~CollectionManager(); + void startup() override; + void finalize() override; std::string name() override { return "CollectionManager"; } @@ -1381,166 +1381,6 @@ struct CollectionManager NodeType const& migrated_from = uninitialized_destination ); - /* - * ====================================================================== - * LB-related operations on the collection - * ====================================================================== - */ - - /** - * \brief Rooted call to start the next phase of the collection, starting LB - * if necessary. - * - * The \c nextPhase function is called by a single node on the whole - * collection to indicate that LB is ready. All collections must invoke this - * for LB to start. - * - * \param[in] proxy the collection proxy - * \param[in] cur_phase the phase that finished - * \param[in] continuation continuation to fire after LB is done - */ - template - void nextPhase( - CollectionProxyWrapType const& proxy, - PhaseType const& cur_phase, ActionFinishedLBType continuation = nullptr - ); - - /** - * \brief Rooted call to start LB, collection independent - * - * The function \c startPhaseRooted starts, in a rooted manner, the next phase - * of the LB without inquiring about the state of the collections. A single - * node calls it and the registered runLB funcs are invoked to collect up - * statistics and run appropriately. Continuation runs when LB is complete. - * - * \param[in] fn continuation to fire after LB is done - * \param[in] lb_phase the phase that finished - */ - void startPhaseRooted( - ActionFinishedLBType fn, PhaseType lb_phase = no_lb_phase - ); - - /** - * \brief Collective call to start LB, collective independent - * - * The function \c startPhaseCollective starts, in a collective manner, the - * next phase of the LB without inquiring about the state of the - * collections. The LB starts immediately after collecting statistics and the - * continuation executes at completion - * - * \param[in] fn action to trigger when LB is finished - * \param[in] lb_phase (optional) the LB phase - */ - void startPhaseCollective( - ActionFinishedLBType fn, PhaseType lb_phase = no_lb_phase - ); - - /** - * \internal \brief Indicate that a collection element is ready for LB - * - * The \c elmReady function is called by every element of every collection at - * the phase boundary for each local element residing on a node. Once all - * elements have invoked it, LB will commence. - * - * \param[in] proxy the collection element proxy - * \param[in] phase the phase that is finished - * \param[in] do_sync whether LB should do a sync - * \param[in] continuation continuation to fire after LB is done - */ - template - void elmReadyLB( - VirtualElmProxyType const& proxy, PhaseType phase, - bool do_sync, ActionFinishedLBType continuation - ); - - /** - * \internal \brief Indicate that a collection element is ready for LB - * - * The \c elmReady function is called by every element of every collection at - * the phase boundary for each local element residing on a node. Once all - * elements have invoked it, LB will commence. - * - * \param[in] proxy the collection element proxy - * \param[in] phase the phase that is finished - * \param[in] msg message passed by the user to deliver after LB finishes - * \param[in] do_sync whether LB should do a sync - */ - template < - typename MsgT, typename ColT, ActiveColMemberTypedFnType f - > - void elmReadyLB( - VirtualElmProxyType const& proxy, PhaseType phase, MsgT* msg, - bool do_sync - ); - - /** - * \internal \brief Tell an element that LB has completed - * - * \param[in] proxy the collection element proxy - * \param[in] phase the phase that is finished - */ - template - void elmFinishedLB(VirtualElmProxyType const& proxy, PhaseType phase); - - /** - * \internal \brief Release control flow after LB with type-erased - * continuations - * - * \param[in] msg the phase message - */ - template - static void releaseLBPhase(CollectionPhaseMsg* msg); - - /** - * \brief Get number of live collection - * - * \return number of collections - */ - template - std::size_t numCollections(); - - /** - * \brief Get number of live collections that are ready for LB - * - * \return number of collections - */ - template - std::size_t numReadyCollections(); - - /** - * \brief Query if all collections are ready to start LB - * - * \return whether live collections == live ready collections - */ - template - bool readyNextPhase(); - - /** - * \internal \brief Reset ready collections for LB - */ - template - void resetReadyPhase(); - - /** - * \internal \brief Release LB continuations after LB - */ - template - void releaseLBContinuation(); - - /** - * \internal \brief Make a collection ready for LB - * - * \param[in] coll the collection proxy bits - */ - template - void makeCollectionReady(VirtualProxyType const coll); - - /** - * \internal \brief Check if a collection has no elements but wants to reduce - */ - template - void checkReduceNoElements(); - private: /** * \internal \brief Get the entire collection system holder @@ -2125,8 +1965,7 @@ struct CollectionManager CleanupListFnType cleanup_fns_; std::unordered_set constructed_; std::unordered_map reduce_cur_stamp_; - std::vector lb_continuations_ = {}; - std::unordered_map lb_no_elm_ = {}; + std::unordered_map collect_stats_for_lb_; std::unordered_map insert_finished_action_ = {}; std::unordered_map user_insert_action_ = {}; std::unordered_map dist_tag_id_ = {}; @@ -2172,7 +2011,6 @@ namespace details #include "vt/vrt/collection/dispatch/registry.impl.h" #include "vt/vrt/collection/staged_token/token.impl.h" #include "vt/vrt/collection/types/base.impl.h" -#include "vt/vrt/collection/balance/proxy/lbable.impl.h" #include "vt/rdmahandle/manager.collection.impl.h" #include "vt/vrt/proxy/collection_proxy.impl.h" diff --git a/src/vt/vrt/collection/manager.impl.h b/src/vt/vrt/collection/manager.impl.h index cb66f597a5..e7484d1ea1 100644 --- a/src/vt/vrt/collection/manager.impl.h +++ b/src/vt/vrt/collection/manager.impl.h @@ -82,6 +82,7 @@ #include "vt/group/group_headers.h" #include "vt/pipe/pipe_headers.h" #include "vt/scheduler/scheduler.h" +#include "vt/phase/phase_manager.h" #include #include @@ -1979,6 +1980,21 @@ template * messages can be forwarded properly */ theLocMan()->getCollectionLM(proxy); + + /** + * Type-erase some lambdas for doing the collective broadcast that collects up + * the statistics on each node for each collection element + */ + theCollection()->collect_stats_for_lb_[proxy] = [bits=proxy]{ + using namespace balance; + using MsgType = CollectStatsMsg; + auto const phase = thePhase()->getCurrentPhase(); + CollectionProxyWrapType p{bits}; + p.template broadcastCollective>( + phase + ); + }; + vt_debug_print( vrt_coll, node, "addToState: proxy={:x}, AfterMeta\n", proxy @@ -2869,275 +2885,6 @@ Holder* CollectionManager::findElmHolder( return findElmHolder(proxy.getProxy()); } -template -std::size_t CollectionManager::numCollections() { - return UniversalIndexHolder<>::getNumCollections(); -} - -template -std::size_t CollectionManager::numReadyCollections() { - return UniversalIndexHolder<>::getNumReadyCollections(); -} - -template -void CollectionManager::resetReadyPhase() { - UniversalIndexHolder<>::resetPhase(); -} - -template -bool CollectionManager::readyNextPhase() { - auto const ready = UniversalIndexHolder<>::readyNextPhase(); - return ready; -} - -template -void CollectionManager::makeCollectionReady(VirtualProxyType const proxy) { - UniversalIndexHolder<>::makeCollectionReady(proxy); -} - -template -void CollectionManager::elmFinishedLB( - VirtualElmProxyType const& proxy, PhaseType phase -) { - auto const& col_proxy = proxy.getCollectionProxy(); - auto const& idx = proxy.getElementProxy().getIndex(); - auto elm_holder = findElmHolder(col_proxy); - vtAssertInfo( - elm_holder != nullptr, "Must find element holder at elmFinishedLB", - col_proxy, phase - ); - elm_holder->runLBCont(idx); -} - -template < - typename MsgT, typename ColT, ActiveColMemberTypedFnType f -> -void CollectionManager::elmReadyLB( - VirtualElmProxyType const& proxy, PhaseType phase, MsgT* msg, - bool do_sync -) { - auto lb_han = auto_registry::makeAutoHandlerCollectionMem(); - auto pmsg = promoteMsg(msg); - -#if !vt_check_enabled(lblite) - theCollection()->sendMsgUntypedHandler(proxy, pmsg.get(), lb_han,true); - return; -#endif - - auto const col_proxy = proxy.getCollectionProxy(); - auto const idx = proxy.getElementProxy().getIndex(); - auto elm_holder = findElmHolder(col_proxy); - vtAssertInfo( - elm_holder != nullptr, "Must find element holder at elmReadyLB", - col_proxy, phase - ); - - auto const cur_epoch = theMsg()->getEpochContextMsg(msg); - - vt_debug_print( - lb, node, - "elmReadyLB: proxy={:x}, idx={}, phase={}, msg={}, epoch={:x}\n", - col_proxy, idx, phase, pmsg, cur_epoch - ); - - theTerm()->produce(cur_epoch); - elm_holder->addLBCont(idx,[pmsg,proxy,lb_han,cur_epoch]{ - theCollection()->sendMsgUntypedHandler(proxy,pmsg.get(),lb_han,true); - theTerm()->consume(cur_epoch); - }); - - auto iter = release_lb_.find(col_proxy); - if (iter == release_lb_.end()) { - release_lb_[col_proxy] = [this,col_proxy]{ - auto cur_elm_holder = findElmHolder(col_proxy); - cur_elm_holder->runLBCont(); - }; - } - - elmReadyLB(proxy,phase,do_sync,nullptr); -} - -template -void CollectionManager::elmReadyLB( - VirtualElmProxyType const& proxy, PhaseType in_phase, - bool do_sync, ActionFinishedLBType cont -) { - - vt_debug_print( - lb, node, - "elmReadyLB: index={} ready at sync={}, phase={}\n", - proxy.getElementProxy().getIndex(), do_sync, in_phase - ); - -#if !vt_check_enabled(lblite) - cont(); - return; -#endif - - auto const col_proxy = proxy.getCollectionProxy(); - auto const idx = proxy.getElementProxy().getIndex(); - auto elm_holder = findElmHolder(col_proxy); - - PhaseType phase = in_phase; - if (phase == no_lb_phase) { - bool const elm_exists = elm_holder->exists(idx); - if (!(elm_exists)) fmt::print("Element must exist idx={}\n", idx); - auto& holder = elm_holder->lookup(idx); - auto elm = holder.getCollection(); - if (!(elm != nullptr)) fmt::print("Must have valid element"); - phase = elm->stats_.getPhase(); - } - - vtAssertInfo( - elm_holder != nullptr, "Must find element holder at elmReadyLB", - col_proxy, phase - ); - - vt_debug_print( - lb, node, - "elmReadyLB: proxy={:x}, idx={} ready at phase={}\n", - col_proxy, idx, phase - ); - - if (cont != nullptr) { - theTerm()->produce(term::any_epoch_sentinel); - lb_continuations_.push_back(cont); - } - if (elm_holder) { - vt_debug_print(lb, node, "has elm_holder: exists={}\n", elm_holder->exists(idx)); - - vtAssert( - elm_holder->exists(idx), - "Collection element must be local and currently reside on this node" - ); - elm_holder->addReady(); - auto const num_ready = elm_holder->numReady(); - auto const num_total = elm_holder->numElements(); - - vt_debug_print( - lb, node, - "elmReadyLB: proxy={:x}, ready={}, total={} at phase={}\n", - col_proxy, num_ready, num_total, phase - ); - - if (num_ready == num_total) { - elm_holder->clearReady(); - - vt_debug_print( - lb, node, - "elmReadyLB: all local elements of proxy={:x} ready at phase={}\n", - col_proxy, phase - ); - } - - using namespace balance; - CollectionProxyWrapType cur_proxy(col_proxy); - using MsgType = PhaseMsg; - auto msg = makeMessage(phase, cur_proxy, do_sync, false); - -#if vt_check_enabled(lblite) - msg->setLBLiteInstrument(false); -#endif - - vt_debug_print( - lb, node, - "elmReadyLB: invoking syncNextPhase on proxy={:x}, at phase={}\n", - col_proxy, phase - ); - - theCollection()->sendMsg>( - cur_proxy[idx], msg.get() - ); - } -} - -template -void CollectionManager::nextPhase( - CollectionProxyWrapType const& proxy, - PhaseType const& cur_phase, ActionFinishedLBType continuation -) { - using namespace balance; - using MsgType = PhaseMsg; - auto msg = makeMessage(cur_phase, proxy, true, false); - auto const instrument = false; - - vt_debug_print( - vrt_coll, node, - "nextPhase: broadcasting: cur_phase={}\n", - cur_phase - ); - - if (continuation != nullptr) { - theTerm()->produce(term::any_epoch_sentinel); - lb_continuations_.push_back(continuation); - - auto const& untyped_proxy = proxy.getProxy(); - auto iter = lb_no_elm_.find(untyped_proxy); - if (iter ==lb_no_elm_.end()) { - lb_no_elm_.emplace( - std::piecewise_construct, - std::forward_as_tuple(untyped_proxy), - std::forward_as_tuple([this,untyped_proxy]{ - auto elm_holder = - findElmHolder(untyped_proxy); - auto const& num_elms = elm_holder->numElements(); - // this collection manager does not participate in reduction - if (num_elms == 0) { - /* - * @todo: Implement child elision in reduction tree and up - * propagation - */ - } - }) - ); - } - } - - #if vt_check_enabled(lblite) - msg->setLBLiteInstrument(instrument); - vt_debug_print( - vrt_coll, node, - "nextPhase: broadcasting: instrument={}, cur_phase={}\n", - msg->lbLiteInstrument(), cur_phase - ); - #endif - - theCollection()->broadcastMsg>( - proxy, msg.get(), instrument - ); -} - -template -void CollectionManager::checkReduceNoElements() { - // @todo -} - -template -/*static*/ void CollectionManager::releaseLBPhase(CollectionPhaseMsg* msg) { - theCollection()->releaseLBContinuation(); -} - -template -void CollectionManager::releaseLBContinuation() { - vt_debug_print( - lb, node, - "releaseLBContinuation\n" - ); - UniversalIndexHolder<>::resetPhase(); - if (lb_continuations_.size() > 0) { - auto continuations = lb_continuations_; - lb_continuations_.clear(); - for (auto&& elm : continuations) { - theTerm()->consume(term::any_epoch_sentinel); - elm(); - } - } - for (auto&& elm : release_lb_) { - elm.second(); - } - release_lb_.clear(); -} - template CollectionManager::DispatchHandlerType CollectionManager::getDispatchHandler() { diff --git a/src/vt/vrt/collection/proxy_traits/proxy_elm_traits.h b/src/vt/vrt/collection/proxy_traits/proxy_elm_traits.h index 48324be400..adb2ae466a 100644 --- a/src/vt/vrt/collection/proxy_traits/proxy_elm_traits.h +++ b/src/vt/vrt/collection/proxy_traits/proxy_elm_traits.h @@ -49,20 +49,15 @@ #include "vt/vrt/proxy/base_collection_elm_proxy.h" #include "vt/vrt/proxy/base_elm_proxy.h" #include "vt/vrt/collection/send/sendable.h" -#include "vt/vrt/collection/balance/proxy/lbable.h" #include "vt/vrt/collection/gettable/gettable.h" #include "vt/vrt/collection/insert/insertable.h" -#include "vt/vrt/collection/balance/proxy/lbable.h" namespace vt { namespace vrt { namespace collection { namespace elm_proxy { template -using Chain4 = LBable>; - -template -using Chain3 = Gettable>; +using Chain3 = Gettable>; template using Chain2 = ElmInsertable>; From 7b463441c410a548b6276ef114e6d790f5c47455 Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Wed, 21 Oct 2020 22:30:49 -0700 Subject: [PATCH 12/36] #875: example: conform lb_iter to new interface --- examples/collection/lb_iter.cc | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/examples/collection/lb_iter.cc b/examples/collection/lb_iter.cc index ad3f03c5aa..478698116a 100644 --- a/examples/collection/lb_iter.cc +++ b/examples/collection/lb_iter.cc @@ -50,8 +50,6 @@ static int32_t num_iter = 8; struct IterCol : vt::Collection { IterCol() = default; - using EmptyMsg = vt::CollectionMessage; - struct IterMsg : vt::CollectionMessage { IterMsg() = default; explicit IterMsg(int64_t const in_work_amt, int64_t const in_iter) @@ -64,14 +62,6 @@ struct IterCol : vt::Collection { void iterWork(IterMsg* msg); - void runLB(EmptyMsg* msg) { - auto const idx = getIndex(); - auto proxy = getCollectionProxy(); - proxy[idx].LB(); - } - - void doneLB(EmptyMsg* msg) { } - template void serialize(SerializerT& s) { vt::Collection::serialize(s); @@ -135,8 +125,7 @@ int main(int argc, char** argv) { auto cur_time = vt::timing::Timing::getCurrentTime(); vt::runInEpochCollective([=]{ - if (this_node == 0) - proxy.broadcast(10, i); + proxy.broadcastCollective(10, i); }); auto total_time = vt::timing::Timing::getCurrentTime() - cur_time; @@ -144,11 +133,7 @@ int main(int argc, char** argv) { fmt::print("iteration: iter={},time={}\n", i, total_time); } - vt::runInEpochCollective([=]{ - if (this_node == 0) - proxy.broadcast(); - }); - + vt::thePhase()->nextPhaseCollective(); } vt::finalize(); From 2c33fec2311aebc728b31d87fdd66bf75527644e Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Wed, 21 Oct 2020 22:32:34 -0700 Subject: [PATCH 13/36] #875: phase: add new debug print category --- cmake/define_build_types.cmake | 1 + src/vt/configs/arguments/app_config.h | 1 + src/vt/configs/arguments/args.cc | 3 +++ src/vt/configs/debug/debug_config.h | 4 +++- src/vt/phase/phase_manager.cc | 18 ++++++++++++++++++ src/vt/runtime/runtime_banner.cc | 1 + 6 files changed, 27 insertions(+), 1 deletion(-) diff --git a/cmake/define_build_types.cmake b/cmake/define_build_types.cmake index f6ff34d842..66006ada73 100644 --- a/cmake/define_build_types.cmake +++ b/cmake/define_build_types.cmake @@ -53,6 +53,7 @@ set( CatEnum::lb | \ CatEnum::vrt_coll | \ CatEnum::group | \ + CatEnum::phase | \ CatEnum::broadcast \ " ) diff --git a/src/vt/configs/arguments/app_config.h b/src/vt/configs/arguments/app_config.h index 448bbf0f5f..878d24c86f 100644 --- a/src/vt/configs/arguments/app_config.h +++ b/src/vt/configs/arguments/app_config.h @@ -195,6 +195,7 @@ struct AppConfig { bool vt_debug_group = false; bool vt_debug_broadcast = false; bool vt_debug_objgroup = false; + bool vt_debug_phase = false; bool vt_debug_print_flush = false; diff --git a/src/vt/configs/arguments/args.cc b/src/vt/configs/arguments/args.cc index 2710841ea6..ec421ef026 100644 --- a/src/vt/configs/arguments/args.cc +++ b/src/vt/configs/arguments/args.cc @@ -245,6 +245,7 @@ void ArgConfig::addDebugPrintArgs(CLI::App& app) { auto bbp = "Enable debug_group = \"" debug_pp(group) "\""; auto cbp = "Enable debug_broadcast = \"" debug_pp(broadcast) "\""; auto dbp = "Enable debug_objgroup = \"" debug_pp(objgroup) "\""; + auto dcp = "Enable debug_phase = \"" debug_pp(phase) "\""; auto r = app.add_flag("--vt_debug_all", config_.vt_debug_all, rp); auto r1 = app.add_flag("--vt_debug_verbose", config_.vt_debug_verbose, rq); @@ -279,6 +280,7 @@ void ArgConfig::addDebugPrintArgs(CLI::App& app) { auto bb = app.add_flag("--vt_debug_group", config_.vt_debug_group, bbp); auto cb = app.add_flag("--vt_debug_broadcast", config_.vt_debug_broadcast, cbp); auto db = app.add_flag("--vt_debug_objgroup", config_.vt_debug_objgroup, dbp); + auto dc = app.add_flag("--vt_debug_phase", config_.vt_debug_phase, dcp); auto debugGroup = "Debug Print Configuration (must be compile-time enabled)"; r->group(debugGroup); r1->group(debugGroup); @@ -313,6 +315,7 @@ void ArgConfig::addDebugPrintArgs(CLI::App& app) { bb->group(debugGroup); cb->group(debugGroup); db->group(debugGroup); + dc->group(debugGroup); auto dbq = "Always flush VT runtime prints"; auto eb = app.add_flag("--vt_debug_print_flush", config_.vt_debug_print_flush, dbq); diff --git a/src/vt/configs/debug/debug_config.h b/src/vt/configs/debug/debug_config.h index 4264ff27ef..866041f26a 100644 --- a/src/vt/configs/debug/debug_config.h +++ b/src/vt/configs/debug/debug_config.h @@ -80,7 +80,8 @@ enum CatEnum : uint64_t { group = 1ull<<27, broadcast = 1ull<<28, objgroup = 1ull<<29, - gossiplb = 1ull<<30 + gossiplb = 1ull<<30, + phase = 1ull<<31, }; enum CtxEnum : uint64_t { @@ -129,6 +130,7 @@ vt_option_category_pretty_print(lb, "lb") vt_option_category_pretty_print(location, "location") vt_option_category_pretty_print(objgroup, "objgroup") vt_option_category_pretty_print(param, "parameterization") +vt_option_category_pretty_print(phase, "phase") vt_option_category_pretty_print(pipe, "pipe") vt_option_category_pretty_print(pool, "pool") vt_option_category_pretty_print(reduce, "reduce") diff --git a/src/vt/phase/phase_manager.cc b/src/vt/phase/phase_manager.cc index d3a104d9d7..8d0ba90091 100644 --- a/src/vt/phase/phase_manager.cc +++ b/src/vt/phase/phase_manager.cc @@ -113,6 +113,11 @@ void PhaseManager::nextPhaseCollective() { ); in_next_phase_collective_ = true; + vt_debug_print( + phase, node, + "PhaseManager::nextPhaseCollective: cur_phase_={}\n", cur_phase_ + ); + // Convert bits to typed proxy auto proxy = objgroup::proxy::Proxy(proxy_); @@ -124,10 +129,23 @@ void PhaseManager::nextPhaseCollective() { theSched()->runSchedulerWhile([this]{ return not reduce_next_phase_done_; }); reduce_next_phase_done_ = false; + vt_debug_print( + phase, node, + "PhaseManager::nextPhaseCollective: cur_phase_={}, reduce done, running " + "hooks\n", cur_phase_ + ); + runHooks(PhaseHook::End); runHooks(PhaseHook::EndPostMigration); cur_phase_++; + + vt_debug_print( + phase, node, + "PhaseManager::nextPhaseCollective: starting next phase: cur_phase_={}\n" + "hooks\n", cur_phase_ + ); + runHooks(PhaseHook::Start); in_next_phase_collective_ = false; diff --git a/src/vt/runtime/runtime_banner.cc b/src/vt/runtime/runtime_banner.cc index 2567d98955..e14b490161 100644 --- a/src/vt/runtime/runtime_banner.cc +++ b/src/vt/runtime/runtime_banner.cc @@ -808,6 +808,7 @@ void Runtime::printStartupBanner() { vt_runtime_debug_warn_compile(group) vt_runtime_debug_warn_compile(broadcast) vt_runtime_debug_warn_compile(objgroup) + vt_runtime_debug_warn_compile(phase) auto arg_str = [](std::vector const& args) -> std::string { std::stringstream ss; From e78c6de2e1b284f342670d584c04cb772f9951cc Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Wed, 21 Oct 2020 22:36:51 -0700 Subject: [PATCH 14/36] #875: phase: get rid of old, unneeded phase messages --- .../balance/lb_invoke/lb_manager.cc | 2 -- src/vt/vrt/collection/balance/phase_msg.h | 32 ------------------- src/vt/vrt/collection/manager.fwd.h | 1 - .../vrt/collection/messages/system_create.h | 5 --- 4 files changed, 40 deletions(-) diff --git a/src/vt/vrt/collection/balance/lb_invoke/lb_manager.cc b/src/vt/vrt/collection/balance/lb_invoke/lb_manager.cc index cd6111969c..ac8abea528 100644 --- a/src/vt/vrt/collection/balance/lb_invoke/lb_manager.cc +++ b/src/vt/vrt/collection/balance/lb_invoke/lb_manager.cc @@ -255,8 +255,6 @@ void LBManager::finishedLB(PhaseType phase) { theNodeStats()->outputStatsForPhase(phase); - auto msg = makeMessage(); - // Destruct the objgroup that was used for LB if (destroy_lb_ != nullptr) { triggerListeners(phase); diff --git a/src/vt/vrt/collection/balance/phase_msg.h b/src/vt/vrt/collection/balance/phase_msg.h index 86a82b4e2a..54d6f363f0 100644 --- a/src/vt/vrt/collection/balance/phase_msg.h +++ b/src/vt/vrt/collection/balance/phase_msg.h @@ -52,38 +52,6 @@ namespace vt { namespace vrt { namespace collection { namespace balance { -template -struct PhaseMsgBase : BaseMsgT { - using ProxyType = typename ColT::CollectionProxyType; - PhaseMsgBase() = default; - - PhaseMsgBase( - PhaseType const in_cur_phase, ProxyType const in_proxy, - bool in_do_sync, bool in_manual - ) : proxy_(in_proxy), cur_phase_(in_cur_phase), do_sync_(in_do_sync), - manual_(in_manual) - { } - - ProxyType getProxy() const { return proxy_; } - PhaseType getPhase() const { return cur_phase_; } - bool doSync() const { return do_sync_; } - bool manual() const { return manual_; } - -private: - ProxyType proxy_ = {}; - PhaseType cur_phase_ = fst_lb_phase; - bool do_sync_ = true; - bool manual_ = false; -}; - -template -using PhaseMsg = PhaseMsgBase>; - -template -using PhaseReduceMsg = PhaseMsgBase< - ColT,collective::ReduceTMsg ->; - template struct CollectStatsMsg : CollectionMessage { CollectStatsMsg(PhaseType in_phase) diff --git a/src/vt/vrt/collection/manager.fwd.h b/src/vt/vrt/collection/manager.fwd.h index 7f1b769848..7d0fbc19b6 100644 --- a/src/vt/vrt/collection/manager.fwd.h +++ b/src/vt/vrt/collection/manager.fwd.h @@ -52,7 +52,6 @@ namespace vt { namespace vrt { namespace collection { struct CollectionManager; -struct CollectionPhaseMsg; DispatchBasePtrType getDispatcher(auto_registry::AutoHandlerType const& han); diff --git a/src/vt/vrt/collection/messages/system_create.h b/src/vt/vrt/collection/messages/system_create.h index 1fcc066f0a..46284288a0 100644 --- a/src/vt/vrt/collection/messages/system_create.h +++ b/src/vt/vrt/collection/messages/system_create.h @@ -125,11 +125,6 @@ struct FinishedUpdateMsg : ::vt::collective::reduce::ReduceMsg { VirtualProxyType proxy_ = {}; }; -struct CollectionPhaseMsg : ::vt::Message { - using MessageParentType = ::vt::Message; - vt_msg_serialize_prohibited(); -}; - template struct InsertMsg : ::vt::Message { using MessageParentType = ::vt::Message; From 77f79f7e0a28abcb9a4cd397025bde503baba1b0 Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Wed, 21 Oct 2020 22:41:32 -0700 Subject: [PATCH 15/36] #875: phase: fix typo in debug print statement --- src/vt/phase/phase_manager.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/vt/phase/phase_manager.cc b/src/vt/phase/phase_manager.cc index 8d0ba90091..b01dc45c2a 100644 --- a/src/vt/phase/phase_manager.cc +++ b/src/vt/phase/phase_manager.cc @@ -142,8 +142,8 @@ void PhaseManager::nextPhaseCollective() { vt_debug_print( phase, node, - "PhaseManager::nextPhaseCollective: starting next phase: cur_phase_={}\n" - "hooks\n", cur_phase_ + "PhaseManager::nextPhaseCollective: starting next phase: cur_phase_={}\n", + cur_phase_ ); runHooks(PhaseHook::Start); From 35be795593e34fc210da4d2f663e5a0a8fa9936f Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Wed, 21 Oct 2020 22:41:57 -0700 Subject: [PATCH 16/36] #875: test: massively simplify test with new phase management --- .../unit/collection/test_lb_lite.extended.cc | 66 +++++-------------- 1 file changed, 16 insertions(+), 50 deletions(-) diff --git a/tests/unit/collection/test_lb_lite.extended.cc b/tests/unit/collection/test_lb_lite.extended.cc index ea0322609a..9bba80c5e2 100644 --- a/tests/unit/collection/test_lb_lite.extended.cc +++ b/tests/unit/collection/test_lb_lite.extended.cc @@ -101,39 +101,11 @@ struct IterMsg : CollectionMessage { int32_t iter_ = 0; }; using ColProxyType = CollectionIndexProxy; -struct IterReduceMsg : collective::ReduceTMsg { - IterReduceMsg() = default; - IterReduceMsg(ColProxyType in_proxy, int32_t in_iter) - : proxy_(in_proxy), cur_iter_(in_iter) - {} - ColProxyType proxy_ = {}; - int32_t cur_iter_ = 0; -}; static void startIter(int32_t const iter, ColProxyType proxy); static TimeType cur_time = 0; static double weight = 1.0f; static int32_t num_iter = 8; -struct FinishedIter { - void operator()(IterReduceMsg* raw_msg) { - auto msg = promoteMsg(raw_msg); - auto const new_time = ::vt::timing::Timing::getCurrentTime(); - ::fmt::print( - "finished iteration: iter={}, num_iter={}, time={}\n", - msg->cur_iter_,num_iter,new_time-cur_time - ); - auto const iter = msg->cur_iter_; - cur_time = new_time; - if (iter < num_iter) { - theCollection()->nextPhase(msg->proxy_,iter,[=]{ - startIter(iter+1, msg->proxy_); - }); - } else { - msg->proxy_.destroy(); - } - } -}; - /*static*/ void LBTest::iterWork(IterMsg* msg, LBTest* col) { double val = 0.1f; double val2 = 0.4f; @@ -154,34 +126,28 @@ struct FinishedIter { } else { col->assertValues(); } - auto proxy = col->getCollectionProxy(); - auto reduce_msg = makeMessage(proxy,iter); - theCollection()->reduceMsg< - LBTest, - IterReduceMsg, - IterReduceMsg::template msgHandler< - IterReduceMsg, collective::PlusOp, FinishedIter - > - >(proxy, reduce_msg.get()); -} - -static void startIter(int32_t const iter, ColProxyType proxy) { - ::fmt::print( - "startIter: iter={}, cur_iter={}\n", iter, iter - ); - - proxy.broadcast(iter); } struct TestLB : TestParallelHarness { }; TEST_F(TestLB, test_lb_1) { auto const& this_node = theContext()->getNode(); - if (this_node == 0) { - auto const& range = Index1D(32); - auto proxy = theCollection()->construct(range); - cur_time = ::vt::timing::Timing::getCurrentTime(); - startIter(0,proxy); + auto const& range = Index1D(32); + auto proxy = theCollection()->constructCollective(range); + + for (int i = 0; i < num_iter; i++) { + auto cur_time = vt::timing::Timing::getCurrentTime(); + + vt::runInEpochCollective([=]{ + proxy.broadcastCollective(10, i); + }); + + auto total_time = vt::timing::Timing::getCurrentTime() - cur_time; + if (this_node == 0) { + fmt::print("iteration: iter={},time={}\n", i, total_time); + } + + vt::thePhase()->nextPhaseCollective(); } } From 0f6f0e85a532830995802676b84882f1af08b80d Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Wed, 21 Oct 2020 22:52:01 -0700 Subject: [PATCH 17/36] #875: test: conform all tests to use new interface --- tests/unit/collection/test_lb.extended.cc | 16 ++++------------ tests/unit/collection/test_lb_lite.extended.cc | 4 +--- .../test_model_per_collection.extended.cc | 11 +++-------- 3 files changed, 8 insertions(+), 23 deletions(-) diff --git a/tests/unit/collection/test_lb.extended.cc b/tests/unit/collection/test_lb.extended.cc index 93b2e74281..737dc60f0c 100644 --- a/tests/unit/collection/test_lb.extended.cc +++ b/tests/unit/collection/test_lb.extended.cc @@ -105,16 +105,11 @@ TEST_P(TestLoadBalancer, test_load_balancer_1) { for (int phase = 0; phase < num_phases; phase++) { // Do some work. runInEpochCollective([&]{ - auto this_node = vt::theContext()->getNode(); - if (this_node == 0) { - proxy.broadcast(); - } + proxy.broadcastCollective(); }); // Go to the next phase. - runInEpochCollective([&]{ - vt::theCollection()->startPhaseCollective(nullptr); - }); + vt::thePhase()->nextPhaseCollective(); } } @@ -173,14 +168,11 @@ TEST_P(TestNodeStatsDumper, test_node_stats_dumping_with_interval) { for (int phase = 0; phase < num_phases; phase++) { // Do some work runInEpochCollective([&] { - if (vt::theContext()->getNode() == 0) { - proxy.broadcast(); - } + proxy.broadcastCollective(); }); // Go to the next phase - runInEpochCollective( - [&] { vt::theCollection()->startPhaseCollective(nullptr); }); + vt::thePhase()->nextPhaseCollective(); } auto const file_name = fmt::format( diff --git a/tests/unit/collection/test_lb_lite.extended.cc b/tests/unit/collection/test_lb_lite.extended.cc index 9bba80c5e2..967fae21dd 100644 --- a/tests/unit/collection/test_lb_lite.extended.cc +++ b/tests/unit/collection/test_lb_lite.extended.cc @@ -101,8 +101,6 @@ struct IterMsg : CollectionMessage { int32_t iter_ = 0; }; using ColProxyType = CollectionIndexProxy; -static void startIter(int32_t const iter, ColProxyType proxy); -static TimeType cur_time = 0; static double weight = 1.0f; static int32_t num_iter = 8; @@ -139,7 +137,7 @@ TEST_F(TestLB, test_lb_1) { auto cur_time = vt::timing::Timing::getCurrentTime(); vt::runInEpochCollective([=]{ - proxy.broadcastCollective(10, i); + proxy.broadcastCollective(i); }); auto total_time = vt::timing::Timing::getCurrentTime() - cur_time; diff --git a/tests/unit/collection/test_model_per_collection.extended.cc b/tests/unit/collection/test_model_per_collection.extended.cc index 25798ef4d7..3309dfe002 100644 --- a/tests/unit/collection/test_model_per_collection.extended.cc +++ b/tests/unit/collection/test_model_per_collection.extended.cc @@ -136,17 +136,12 @@ TEST_F(TestModelPerCollection, test_model_per_collection_1) { // Do some work. runInEpochCollective([&]{ - auto this_node = vt::theContext()->getNode(); - if (this_node == 0) { - proxy1.broadcast, colHandler>(); - proxy2.broadcast, colHandler>(); - } + proxy1.broadcastCollective, colHandler>(); + proxy2.broadcastCollective, colHandler>(); }); // Go to the next phase. - runInEpochCollective([&]{ - vt::theCollection()->startPhaseCollective(nullptr); - }); + vt::thePhase()->nextPhaseCollective(); // LB control flow means that there will be no recorded phase for // this to even look up objects in, causing failure From c38889f439066789169d731efead8a15327f65a8 Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Thu, 22 Oct 2020 00:50:40 -0700 Subject: [PATCH 18/36] #875: test: fix one last test --- tests/unit/location/test_hops.extended.cc | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/tests/unit/location/test_hops.extended.cc b/tests/unit/location/test_hops.extended.cc index 3a6dae7ada..253ba0b1a2 100644 --- a/tests/unit/location/test_hops.extended.cc +++ b/tests/unit/location/test_hops.extended.cc @@ -120,12 +120,6 @@ struct TestColl : Collection { EXPECT_TRUE(not m->do_check_ or m->getHops() <= 1); } - void dolb(TestMsg* m) { - auto idx = this->getIndex(); - auto proxy = getCollectionProxy(); - proxy[idx].LB(); - } - void cont(TestMsg* m) { } @@ -166,11 +160,8 @@ TEST_F(TestHops, test_hops_1) { if (this_node == 0) { vt_print(gen, "Running LB for iter={}\n", i); } - runInEpochCollective([&]{ - if (this_node == 0) { - proxy.broadcast(); - } - }); + + thePhase()->nextPhaseCollective(); } } From 01e8bb715e11b37e4aa7853d7fee8911758c0915 Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Thu, 22 Oct 2020 11:37:18 -0700 Subject: [PATCH 19/36] #875: phase: remove listeners, add RDMA hook for collections --- src/vt/rdmahandle/manager.collection.impl.h | 3 +- .../balance/lb_invoke/lb_manager.cc | 46 ++++++------------- .../collection/balance/lb_invoke/lb_manager.h | 27 +---------- 3 files changed, 18 insertions(+), 58 deletions(-) diff --git a/src/vt/rdmahandle/manager.collection.impl.h b/src/vt/rdmahandle/manager.collection.impl.h index d90c022735..898626891c 100644 --- a/src/vt/rdmahandle/manager.collection.impl.h +++ b/src/vt/rdmahandle/manager.collection.impl.h @@ -49,6 +49,7 @@ #include "vt/rdmahandle/manager.h" #include "vt/rdmahandle/sub_handle.h" #include "vt/vrt/collection/manager.h" +#include "vt/phase/phase_manager.h" namespace vt { namespace rdma { @@ -140,7 +141,7 @@ Handle Manager::makeCollectionHandles( rdma, node, "CollectionHandle: registering LB listener\n" ); - theLBManager()->registerListenerAfterLB([=](PhaseType){ + thePhase()->registerHookCollective(phase::PhaseHook::EndPostMigration, [=]{ sub_proxy.get()->afterLB(); }); # endif diff --git a/src/vt/vrt/collection/balance/lb_invoke/lb_manager.cc b/src/vt/vrt/collection/balance/lb_invoke/lb_manager.cc index ac8abea528..20819abffc 100644 --- a/src/vt/vrt/collection/balance/lb_invoke/lb_manager.cc +++ b/src/vt/vrt/collection/balance/lb_invoke/lb_manager.cc @@ -63,6 +63,7 @@ #include "vt/vrt/collection/balance/model/load_model.h" #include "vt/vrt/collection/balance/model/naive_persistence.h" #include "vt/vrt/collection/balance/model/raw_data.h" +#include "vt/phase/phase_manager.h" namespace vt { namespace vrt { namespace collection { namespace balance { @@ -179,14 +180,10 @@ LBManager::runLB(LBProxyType base_proxy, PhaseType phase) { strat->applyMigrations(strat->getTransfers()); }); - runInEpochCollective([=] { - vt_debug_print( - lb, node, - "LBManager: finished migrations\n" - ); - theNodeStats()->startIterCleanup(phase, model_->getNumPastPhasesNeeded()); - this->finishedLB(phase); - }); + vt_debug_print( + lb, node, + "LBManager: finished migrations\n" + ); } void LBManager::selectStartLB(PhaseType phase) { @@ -213,9 +210,7 @@ void LBManager::startLB(PhaseType phase, LBType lb) { } if (lb == LBType::NoLB) { - // perform cleanup actions and then return out - theNodeStats()->startIterCleanup(phase, model_->getNumPastPhasesNeeded()); - finishedLB(phase); + // nothing to do return; } @@ -241,6 +236,13 @@ void LBManager::startLB(PhaseType phase, LBType lb) { runLB(base_proxy, phase); } +void LBManager::startup() { + thePhase()->registerHookCollective(phase::PhaseHook::EndPostMigration, []{ + auto const phase = thePhase()->getCurrentPhase(); + theLBManager()->finishedLB(phase); + }); +} + void LBManager::finishedLB(PhaseType phase) { vt_debug_print(lb, node, "finishedLB\n"); @@ -253,34 +255,14 @@ void LBManager::finishedLB(PhaseType phase) { ); } + theNodeStats()->startIterCleanup(phase, model_->getNumPastPhasesNeeded()); theNodeStats()->outputStatsForPhase(phase); // Destruct the objgroup that was used for LB if (destroy_lb_ != nullptr) { - triggerListeners(phase); destroy_lb_(); destroy_lb_ = nullptr; } } -int LBManager::registerListenerAfterLB(ListenerFnType fn) { - listeners_.push_back(fn); - return static_cast(listeners_.size() - 1); -} - -void LBManager::unregisterListenerAfterLB(int element) { - vtAssert( - listeners_.size() > static_cast(element), "Listener must exist" - ); - listeners_[element] = nullptr; -} - -void LBManager::triggerListeners(PhaseType phase) { - for (auto&& l : listeners_) { - if (l) { - l(phase); - } - } -} - }}}} /* end namespace vt::vrt::collection::balance */ diff --git a/src/vt/vrt/collection/balance/lb_invoke/lb_manager.h b/src/vt/vrt/collection/balance/lb_invoke/lb_manager.h index 8696c06d43..eed8c991bf 100644 --- a/src/vt/vrt/collection/balance/lb_invoke/lb_manager.h +++ b/src/vt/vrt/collection/balance/lb_invoke/lb_manager.h @@ -69,7 +69,6 @@ class LoadModel; * and invocation. */ struct LBManager : runtime::component::Component { - using ListenerFnType = std::function; using LBProxyType = objgroup::proxy::Proxy; /** @@ -82,6 +81,8 @@ struct LBManager : runtime::component::Component { std::string name() override { return "LBManager"; } + void startup() override; + static std::unique_ptr construct(); public: @@ -143,29 +144,6 @@ struct LBManager : runtime::component::Component { void finishedLB(PhaseType phase); public: - /** - * \brief Register a listener to trigger after LB completes - * - * \param[in] fn the listener - * - * \return the ID of the registration - */ - int registerListenerAfterLB(ListenerFnType fn); - - /** - * \brief Unregister a listener to trigger after LB completes - * - * \param[in] element the registration ID - */ - void unregisterListenerAfterLB(int element); - - /** - * \internal \brief Trigger all after-LB listeners - * - * \param[in] phase the phase - */ - void triggerListeners(PhaseType phase); - /** * \brief Set a model of expected object loads to use in place of * the current installed model @@ -206,7 +184,6 @@ struct LBManager : runtime::component::Component { LBType cached_lb_ = LBType::NoLB; std::function destroy_lb_ = nullptr; bool synced_in_lb_ = true; - std::vector listeners_ = {}; objgroup::proxy::Proxy proxy_; std::shared_ptr base_model_; std::shared_ptr model_; From a2f3af3f80018fc344480112888e9e46611fe32e Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Thu, 22 Oct 2020 11:45:27 -0700 Subject: [PATCH 20/36] #875: test: fix test using a hook for trigger so stats exists --- .../test_model_per_collection.extended.cc | 34 +++++++++++-------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/tests/unit/collection/test_model_per_collection.extended.cc b/tests/unit/collection/test_model_per_collection.extended.cc index 3309dfe002..21996b847a 100644 --- a/tests/unit/collection/test_model_per_collection.extended.cc +++ b/tests/unit/collection/test_model_per_collection.extended.cc @@ -140,24 +140,28 @@ TEST_F(TestModelPerCollection, test_model_per_collection_1) { proxy2.broadcastCollective, colHandler>(); }); + // Add a hook for after LB runs, but before instrumentation is cleared + thePhase()->registerHookCollective(phase::PhaseHook::End, [=]{ + // LB control flow means that there will be no recorded phase for + // this to even look up objects in, causing failure +#if vt_check_enabled(lblite) + // Test the model, which should be per-collection and return the proxy. + auto model = theLBManager()->getLoadModel(); + // Call updateLoads manually, since it won't be called by the LB + // infrastructure when the LB hasn't run, and we need this for the + // model to function + model->updateLoads(0); + for (auto&& obj : *model) { + auto work_val = model->getWork(obj, {PhaseOffset::NEXT_PHASE, PhaseOffset::WHOLE_PHASE}); + EXPECT_DOUBLE_EQ(work_val, static_cast(id_proxy_map[obj])); + //fmt::print("{:x} {}\n", obj, work_val); + } +#endif + }); + // Go to the next phase. vt::thePhase()->nextPhaseCollective(); - // LB control flow means that there will be no recorded phase for - // this to even look up objects in, causing failure -#if vt_check_enabled(lblite) - // Test the model, which should be per-collection and return the proxy. - auto model = theLBManager()->getLoadModel(); - // Call updateLoads manually, since it won't be called by the LB - // infrastructure when the LB hasn't run, and we need this for the - // model to function - model->updateLoads(0); - for (auto&& obj : *model) { - auto work_val = model->getWork(obj, {PhaseOffset::NEXT_PHASE, PhaseOffset::WHOLE_PHASE}); - EXPECT_EQ(work_val, static_cast(id_proxy_map[obj])); - //fmt::print("{:x} {}\n", obj, work_val); - } -#endif } }}}} // end namespace vt::tests::unit::per From 6d55b063a6199a5dd66dcd4625f6f41c2cb9ff4f Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Thu, 22 Oct 2020 11:46:28 -0700 Subject: [PATCH 21/36] #875: phase: add clarifying note --- src/vt/phase/phase_manager.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/vt/phase/phase_manager.h b/src/vt/phase/phase_manager.h index faa954de5b..6aafa9a1c5 100644 --- a/src/vt/phase/phase_manager.h +++ b/src/vt/phase/phase_manager.h @@ -94,7 +94,9 @@ struct PhaseManager : runtime::component::Component { * type of hook * * \note These must be registered across all nodes as they will be run in a - * collective epoch. This is for synchronized phase actions. + * collective epoch. This is for synchronized phase actions. The order in + * which hooks are collectively registered dictate the order they are + * executed. * * \param[in] type the type of trigger to register * \param[in] trigger the action to trigger From c471aab38cdff92375536879c164c94d338455a8 Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Thu, 22 Oct 2020 11:48:09 -0700 Subject: [PATCH 22/36] #875: phase: fix copy-paste bug --- src/vt/phase/phase_manager.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vt/phase/phase_manager.cc b/src/vt/phase/phase_manager.cc index b01dc45c2a..8a5e20a1b9 100644 --- a/src/vt/phase/phase_manager.cc +++ b/src/vt/phase/phase_manager.cc @@ -74,7 +74,7 @@ PhaseManager::registerHookRooted(PhaseHook type, ActionType trigger) { in_next_phase_collective_, "Must not be in next phase to register" ); - bool const is_collective = true; + bool const is_collective = false; auto const type_bits = static_cast(type); auto const hook_id = next_rooted_hook_id_++; rooted_hooks_[type_bits][hook_id] = trigger; From 10bf14b6c64c126d4a3bbaef60f1a58365a6a4c2 Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Thu, 22 Oct 2020 12:01:12 -0700 Subject: [PATCH 23/36] #875: test: remove trigger --- tests/unit/rdma/test_rdma_collection_handle.extended.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/rdma/test_rdma_collection_handle.extended.cc b/tests/unit/rdma/test_rdma_collection_handle.extended.cc index 2364034a40..bd6a47096c 100644 --- a/tests/unit/rdma/test_rdma_collection_handle.extended.cc +++ b/tests/unit/rdma/test_rdma_collection_handle.extended.cc @@ -134,7 +134,7 @@ struct TestCol : vt::Collection, vt::Index2D> { if (not triggered_lb) { triggered_lb = true; //fmt::print("{}: triggering listeners\n", theContext()->getNode()); - theLBManager()->triggerListeners(0); + // theLBManager()->triggerListeners(0); } auto proxy = this->getCollectionProxy(); auto cb = theCB()->makeBcast< From d97aa0beeda68928369b36dcb34c28a3a2cc6022 Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Thu, 22 Oct 2020 13:12:20 -0700 Subject: [PATCH 24/36] #875: test: fix a small bug in the diagnotistic test --- tests/unit/diagnostics/test_diagnostic_value.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/unit/diagnostics/test_diagnostic_value.cc b/tests/unit/diagnostics/test_diagnostic_value.cc index dd0c4f1d4a..fbd578747e 100644 --- a/tests/unit/diagnostics/test_diagnostic_value.cc +++ b/tests/unit/diagnostics/test_diagnostic_value.cc @@ -136,6 +136,10 @@ TEST_F(TestDiagnosticValue, test_diagnostic_value_2) { ValueType snapshot = 0; auto this_node = theContext()->getNode(); + auto num_nodes = theContext()->getNumNodes(); + if (num_nodes != 2) { + return; + } double num_to_set = this_node == 0 ? 100 : 175; From 879c45ff5505641322afeddd285f03deac7ee980 Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Thu, 22 Oct 2020 13:29:25 -0700 Subject: [PATCH 25/36] #875: docs: write documentation for phase management --- docs/md/lb-manager.md | 7 ++++--- docs/md/phase.md | 17 +++++++++++++++++ docs/md/vt.md | 1 + src/vt/phase/phase_manager.h | 17 ++++++++++++++++- 4 files changed, 38 insertions(+), 4 deletions(-) create mode 100644 docs/md/phase.md diff --git a/docs/md/lb-manager.md b/docs/md/lb-manager.md index a9ab43e4f4..72c297047d 100644 --- a/docs/md/lb-manager.md +++ b/docs/md/lb-manager.md @@ -3,9 +3,10 @@ The LB manager component `vt::vrt::collection::balance::LBManager`, accessed via `vt::theLBManager()` manages and coordinates instances of load balancers. It -counts collections as they call `nextPhase` to ensure they are all ready before -load balancing begins. It reads the command-line arguments or LB specification -file to determine which load balancer to run. +potentially with start load balancing after a "phase" is completed; refer to +\ref phase for details about how to delineate phases in an application. The LB +manager reads command-line arguments or an LB specification file to determine which +load balancer to run at a given phase. To enable load balancing, the cmake flag \code{.cmake} -Dvt_lb_enabled=1 \endcode should be passed during building. This also enables automatic diff --git a/docs/md/phase.md b/docs/md/phase.md new file mode 100644 index 0000000000..d497cbe2c7 --- /dev/null +++ b/docs/md/phase.md @@ -0,0 +1,17 @@ +\page phase Phase Manager +\brief Manage phases of time + +The phase manager component `vt::phase::PhaseManager`, accessed via +`vt::thePhase()` allows the delineation of collective intervals of time across +all nodes. Load balancing, as well as other components, use phases as a boundary +to perform many operations over an application's execution such as work +redistribution, outputting of statistical data, or flushing trace data. + +The main user interface is a call to `thePhase()->nextPhaseCollective()` which +starts the next phase after performing a reduction. Thus, any work that belongs +in the preceding phase should be synchronized by the user before this is called +(e.g., by calling `vt::runInEpochCollective`). + +System components along with applications can register hooks with the phase +manager to determine when a new phase is starting, ending, and after migrations +have occurred. diff --git a/docs/md/vt.md b/docs/md/vt.md index 1b22fd7313..7aa5d384e0 100644 --- a/docs/md/vt.md +++ b/docs/md/vt.md @@ -53,6 +53,7 @@ management. | \subpage param | `vt::theParam()` | \copybrief param | @m_class{m-label m-danger} **Experimental** | | \subpage pipe | `vt::theCB()` | \copybrief pipe | @m_class{m-label m-success} **Core** | | \subpage node-stats | `vt::theNodeStats()` | \copybrief node-stats | @m_class{m-label m-warning} **Optional** | +| \subpage phase | `vt::thePhase()` | \copybrief phase | @m_class{m-label m-success} **Core** | | \subpage pool | `vt::thePool()` | \copybrief pool | @m_class{m-label m-success} **Core** | | \subpage rdma | `vt::theRDMA()` | \copybrief rdma | @m_class{m-label m-danger} **Experimental** | | \subpage rdmahandle | `vt::theHandleRDMA()` | \copybrief rdmahandle | @m_class{m-label m-warning} **Optional** | diff --git a/src/vt/phase/phase_manager.h b/src/vt/phase/phase_manager.h index 6aafa9a1c5..283b923192 100644 --- a/src/vt/phase/phase_manager.h +++ b/src/vt/phase/phase_manager.h @@ -61,7 +61,22 @@ struct NextMsg; /** * \struct PhaseManager * - * \brief General management of phases in applications + * \brief General management of phases in an application to delineate collective + * intervals of time across nodes. + * + * Many system components use phases as a natural boundary for performing + * incremental operations as the runtime makes progress. For instance, traces + * may be flushed at phases boundaries and the load balancing framework might + * apply at strategy between phases. + * + * The main interface for users is invoking + * \c vt::thePhase()->nextPhaseCollective() to start the next phase. The system + * performs a reduction and blocks completion inside this call. Any work + * that belongs in the preceding phase should be synchronized before this is + * called. The runtime system or users can register hooks when a phase starts, + * ends, or after any migrations are complete. Hooks may be collective or + * rooted; collective hooks are invoked in the order in which they are + * registered and are always run in a collective epoch. */ struct PhaseManager : runtime::component::Component { using HookIDType = typename std::underlying_type::type; From c6211f0b8d1e2f8710bae1670aeed7853e9e03eb Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Thu, 22 Oct 2020 15:57:49 -0700 Subject: [PATCH 26/36] #875: docs: fix typos in docs --- docs/md/lb-manager.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/md/lb-manager.md b/docs/md/lb-manager.md index 72c297047d..dffd54263a 100644 --- a/docs/md/lb-manager.md +++ b/docs/md/lb-manager.md @@ -3,10 +3,10 @@ The LB manager component `vt::vrt::collection::balance::LBManager`, accessed via `vt::theLBManager()` manages and coordinates instances of load balancers. It -potentially with start load balancing after a "phase" is completed; refer to +will potentially start load balancing after a "phase" is completed; refer to \ref phase for details about how to delineate phases in an application. The LB -manager reads command-line arguments or an LB specification file to determine which -load balancer to run at a given phase. +manager reads command-line arguments or an LB specification file to determine +which load balancer to run at a given phase. To enable load balancing, the cmake flag \code{.cmake} -Dvt_lb_enabled=1 \endcode should be passed during building. This also enables automatic From 4e22bfb4bdf13c512fe636c68d1c1d20507d4427 Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Thu, 22 Oct 2020 15:59:30 -0700 Subject: [PATCH 27/36] #875: docs: fix another typo --- src/vt/phase/phase_manager.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vt/phase/phase_manager.h b/src/vt/phase/phase_manager.h index 283b923192..2e955e8947 100644 --- a/src/vt/phase/phase_manager.h +++ b/src/vt/phase/phase_manager.h @@ -165,7 +165,7 @@ struct PhaseManager : runtime::component::Component { * \internal * \brief Run all the hooks registered here of a certain variety * - * \param[in] type type of hook to run designated by the enum \c PhsaeHook + * \param[in] type type of hook to run designated by the enum \c PhaseHook */ void runHooks(PhaseHook type); From 3c516f5af86bad13fe1cc5e9cb507d304a2292da Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Thu, 22 Oct 2020 16:00:41 -0700 Subject: [PATCH 28/36] #875: test: combine into one condition --- src/vt/phase/phase_manager.cc | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/src/vt/phase/phase_manager.cc b/src/vt/phase/phase_manager.cc index 8a5e20a1b9..28d9d39174 100644 --- a/src/vt/phase/phase_manager.cc +++ b/src/vt/phase/phase_manager.cc @@ -161,11 +161,9 @@ void PhaseManager::runHooks(PhaseHook type) { // start out running all rooted hooks of a particular type { auto iter = rooted_hooks_.find(type_bits); - if (iter != rooted_hooks_.end()) { - if (iter->second.size() > 0) { - for (auto&& fn : iter->second) { - runInEpochRooted([=]{ fn.second(); }); - } + if (iter != rooted_hooks_.end() and iter->second.size() > 0) { + for (auto&& fn : iter->second) { + runInEpochRooted([=]{ fn.second(); }); } } } @@ -173,12 +171,10 @@ void PhaseManager::runHooks(PhaseHook type) { // then, run collective hooks that should be symmetric across nodes { auto iter = collective_hooks_.find(type_bits); - if (iter != collective_hooks_.end()) { - if (iter->second.size() > 0) { - // note, this second is a map, so they are ordered across nodes - for (auto&& fn : iter->second) { - runInEpochCollective([=]{ fn.second(); }); - } + if (iter != collective_hooks_.end() and iter->second.size() > 0) { + // note, this second is a map, so they are ordered across nodes + for (auto&& fn : iter->second) { + runInEpochCollective([=]{ fn.second(); }); } } } From 2c92f94d9c303676f3812be74264bc0f8328dadf Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Thu, 22 Oct 2020 16:03:51 -0700 Subject: [PATCH 29/36] #875: phase: add manual run for hooks for testing purposes --- src/vt/phase/phase_manager.cc | 4 ++++ src/vt/phase/phase_manager.h | 9 +++++++++ 2 files changed, 13 insertions(+) diff --git a/src/vt/phase/phase_manager.cc b/src/vt/phase/phase_manager.cc index 28d9d39174..748000e11e 100644 --- a/src/vt/phase/phase_manager.cc +++ b/src/vt/phase/phase_manager.cc @@ -180,4 +180,8 @@ void PhaseManager::runHooks(PhaseHook type) { } } +void PhaseManager::runHooksManual(PhaseHook type) { + runHooks(type); +} + }} /* end namespace vt::phase */ diff --git a/src/vt/phase/phase_manager.h b/src/vt/phase/phase_manager.h index 2e955e8947..41d493af0d 100644 --- a/src/vt/phase/phase_manager.h +++ b/src/vt/phase/phase_manager.h @@ -169,6 +169,15 @@ struct PhaseManager : runtime::component::Component { */ void runHooks(PhaseHook type); +public: + /** + * \brief Run hooks manually + * + * \warning This is only intended to be used for testing and non-standard + * use cases where they need to be run. + */ + void runHooksManual(PhaseHook type); + private: PhaseType cur_phase_ = 0; /**< Current phase */ ObjGroupProxyType proxy_ = no_obj_group; /**< Objgroup proxy */ From d9755967d1ad8e738f1134d6e5f22aca80ae43e7 Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Thu, 22 Oct 2020 16:05:36 -0700 Subject: [PATCH 30/36] #875: test: adapt rdma test to run hooks the new way --- tests/unit/rdma/test_rdma_collection_handle.extended.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/rdma/test_rdma_collection_handle.extended.cc b/tests/unit/rdma/test_rdma_collection_handle.extended.cc index bd6a47096c..1badc02416 100644 --- a/tests/unit/rdma/test_rdma_collection_handle.extended.cc +++ b/tests/unit/rdma/test_rdma_collection_handle.extended.cc @@ -133,8 +133,8 @@ struct TestCol : vt::Collection, vt::Index2D> { void afterMigratePost(TestMsg*) { if (not triggered_lb) { triggered_lb = true; - //fmt::print("{}: triggering listeners\n", theContext()->getNode()); - // theLBManager()->triggerListeners(0); + //fmt::print("{}: run post migration hooks\n", theContext()->getNode()); + vt::thePhase()->runHooksManual(vt::phase::PhaseHook::EndPostMigration); } auto proxy = this->getCollectionProxy(); auto cb = theCB()->makeBcast< From 523b678d276a6ee1905a956aaadef63cf7da7a91 Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Thu, 22 Oct 2020 16:19:09 -0700 Subject: [PATCH 31/36] #875: test: add new test for phase management --- tests/unit/phase/test_phase_management.cc | 110 ++++++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 tests/unit/phase/test_phase_management.cc diff --git a/tests/unit/phase/test_phase_management.cc b/tests/unit/phase/test_phase_management.cc new file mode 100644 index 0000000000..2de5c616ff --- /dev/null +++ b/tests/unit/phase/test_phase_management.cc @@ -0,0 +1,110 @@ +/* +//@HEADER +// ***************************************************************************** +// +// test_phase_management.cc +// DARMA Toolkit v. 1.0.0 +// DARMA/vt => Virtual Transport +// +// Copyright 2019 National Technology & Engineering Solutions of Sandia, LLC +// (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. +// Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from this +// software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact darma@sandia.gov +// +// ***************************************************************************** +//@HEADER +*/ + +#include + +#include "test_parallel_harness.h" + +#include + +namespace vt { namespace tests { namespace unit { + +using TestPhaseManagement = TestParallelHarness; + +TEST_F(TestPhaseManagement, test_phase_manager_1) { + auto phase_mgr = phase::PhaseManager::construct(); + + // start with phase 0 + EXPECT_EQ(phase_mgr->getCurrentPhase(), 0); + + int start_hooks = 0; + int end_hooks = 0; + int end_post_hooks = 0; + + auto start = [&]{ start_hooks++; }; + auto end = [&]{ end_hooks++; }; + auto endpost = [&]{ end_post_hooks++; }; + + // register a starting hook + auto hookid = phase_mgr->registerHookRooted(phase::PhaseHook::Start, start); + + // unregister it, make sure it doesn't run + phase_mgr->unregisterHook(hookid); + + // run startup function, which will trigger starting hooks + phase_mgr->startup(); + EXPECT_EQ(start_hooks, 0); + + auto hookid2 = phase_mgr->registerHookRooted(phase::PhaseHook::Start, start); + + // run startup function, which will trigger starting hooks + phase_mgr->startup(); + EXPECT_EQ(start_hooks, 1); + + auto hookid3 = phase_mgr->registerHookCollective(phase::PhaseHook::Start, start); + phase_mgr->unregisterHook(hookid3); + + // run startup function, which will trigger starting hooks + phase_mgr->startup(); + EXPECT_EQ(start_hooks, 2); + phase_mgr->unregisterHook(hookid2); + + phase_mgr->registerHookCollective(phase::PhaseHook::Start, start); + phase_mgr->registerHookCollective(phase::PhaseHook::End, end); + phase_mgr->registerHookCollective(phase::PhaseHook::EndPostMigration, endpost); + + phase_mgr->nextPhaseCollective(); + EXPECT_EQ(start_hooks, 3); + EXPECT_EQ(end_hooks, 1); + EXPECT_EQ(end_post_hooks, 1); + EXPECT_EQ(phase_mgr->getCurrentPhase(), 1); + + phase_mgr->nextPhaseCollective(); + EXPECT_EQ(start_hooks, 4); + EXPECT_EQ(end_hooks, 2); + EXPECT_EQ(end_post_hooks, 2); + EXPECT_EQ(phase_mgr->getCurrentPhase(), 2); +} + +}}} // end namespace vt::tests::unit From dad95153a4eb803794e52ff35c750bd48bcf9878 Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Fri, 23 Oct 2020 12:24:26 -0700 Subject: [PATCH 32/36] #875: phase: change name of file to phase_hook_id --- src/vt/phase/{registered_phase_hook.h => phase_hook_id.h} | 8 ++++---- src/vt/phase/phase_manager.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) rename src/vt/phase/{registered_phase_hook.h => phase_hook_id.h} (93%) diff --git a/src/vt/phase/registered_phase_hook.h b/src/vt/phase/phase_hook_id.h similarity index 93% rename from src/vt/phase/registered_phase_hook.h rename to src/vt/phase/phase_hook_id.h index 9eeacbe2ee..887193858a 100644 --- a/src/vt/phase/registered_phase_hook.h +++ b/src/vt/phase/phase_hook_id.h @@ -2,7 +2,7 @@ //@HEADER // ***************************************************************************** // -// registered_phase_hook.h +// phase_hook_id.h // DARMA Toolkit v. 1.0.0 // DARMA/vt => Virtual Transport // @@ -42,8 +42,8 @@ //@HEADER */ -#if !defined INCLUDED_VT_PHASE_REGISTERED_PHASE_HOOK_H -#define INCLUDED_VT_PHASE_REGISTERED_PHASE_HOOK_H +#if !defined INCLUDED_VT_PHASE_PHASE_HOOK_ID_H +#define INCLUDED_VT_PHASE_PHASE_HOOK_ID_H #include "vt/phase/phase_hook_enum.h" @@ -105,4 +105,4 @@ struct PhaseHookID { }} /* end namespace vt::phase */ -#endif /*INCLUDED_VT_PHASE_REGISTERED_PHASE_HOOK_H*/ +#endif /*INCLUDED_VT_PHASE_PHASE_HOOK_ID_H*/ diff --git a/src/vt/phase/phase_manager.h b/src/vt/phase/phase_manager.h index 41d493af0d..633b8a203c 100644 --- a/src/vt/phase/phase_manager.h +++ b/src/vt/phase/phase_manager.h @@ -48,7 +48,7 @@ #include "vt/configs/types/types_type.h" #include "vt/runtime/component/component_pack.h" #include "vt/phase/phase_hook_enum.h" -#include "vt/phase/registered_phase_hook.h" +#include "vt/phase/phase_hook_id.h" #include #include From cc64a8bcb2001628cbf78230a86c298a0fb5a399 Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Fri, 23 Oct 2020 12:25:51 -0700 Subject: [PATCH 33/36] #875: phase: fix typos --- src/vt/phase/phase_manager.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/vt/phase/phase_manager.h b/src/vt/phase/phase_manager.h index 633b8a203c..ea5db9701c 100644 --- a/src/vt/phase/phase_manager.h +++ b/src/vt/phase/phase_manager.h @@ -66,8 +66,8 @@ struct NextMsg; * * Many system components use phases as a natural boundary for performing * incremental operations as the runtime makes progress. For instance, traces - * may be flushed at phases boundaries and the load balancing framework might - * apply at strategy between phases. + * may be flushed at phase boundaries and the load balancing framework might + * apply a strategy between phases. * * The main interface for users is invoking * \c vt::thePhase()->nextPhaseCollective() to start the next phase. The system From 6f457f237055ec8b2bac5d6d0ef048b26dd26896 Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Fri, 23 Oct 2020 12:30:15 -0700 Subject: [PATCH 34/36] #875: phase: remove some extra conditions not needed --- src/vt/phase/phase_manager.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/vt/phase/phase_manager.cc b/src/vt/phase/phase_manager.cc index 748000e11e..962134950d 100644 --- a/src/vt/phase/phase_manager.cc +++ b/src/vt/phase/phase_manager.cc @@ -161,7 +161,7 @@ void PhaseManager::runHooks(PhaseHook type) { // start out running all rooted hooks of a particular type { auto iter = rooted_hooks_.find(type_bits); - if (iter != rooted_hooks_.end() and iter->second.size() > 0) { + if (iter != rooted_hooks_.end()) { for (auto&& fn : iter->second) { runInEpochRooted([=]{ fn.second(); }); } @@ -171,7 +171,7 @@ void PhaseManager::runHooks(PhaseHook type) { // then, run collective hooks that should be symmetric across nodes { auto iter = collective_hooks_.find(type_bits); - if (iter != collective_hooks_.end() and iter->second.size() > 0) { + if (iter != collective_hooks_.end()) { // note, this second is a map, so they are ordered across nodes for (auto&& fn : iter->second) { runInEpochCollective([=]{ fn.second(); }); From 620d4b0127573d6230ebfc214fb4f705125fc4d9 Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Fri, 23 Oct 2020 13:41:33 -0700 Subject: [PATCH 35/36] #875: phase: change assert to abort --- src/vt/phase/phase_manager.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vt/phase/phase_manager.cc b/src/vt/phase/phase_manager.cc index 962134950d..53f3861f04 100644 --- a/src/vt/phase/phase_manager.cc +++ b/src/vt/phase/phase_manager.cc @@ -94,7 +94,7 @@ void PhaseManager::unregisterHook(PhaseHookID hook) { if (iter != hooks[type].end()) { hooks[type].erase(iter); } else { - vtAssert(false, "Could not find registered hook ID to erase"); + vtAbort("Could not find registered hook ID to erase"); } } From 9ee2b0b4fea1c17768450851f8237022b795d069 Mon Sep 17 00:00:00 2001 From: Jonathan Lifflander Date: Fri, 23 Oct 2020 13:57:40 -0700 Subject: [PATCH 36/36] #875: phase: add macro with date for phase manager feature --- src/vt/phase/phase_manager.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/vt/phase/phase_manager.h b/src/vt/phase/phase_manager.h index ea5db9701c..2f423d8228 100644 --- a/src/vt/phase/phase_manager.h +++ b/src/vt/phase/phase_manager.h @@ -53,6 +53,8 @@ #include #include +#define VT_PHASE_MANAGER 20201023 + namespace vt { namespace phase { // fwd-decl for reduce messasge