Skip to content

Commit

Permalink
#1014 dump node stats before trimming
Browse files Browse the repository at this point in the history
  • Loading branch information
Jakub Strzebonski committed Sep 22, 2020
1 parent 1c22e6a commit ef7a932
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 67 deletions.
134 changes: 77 additions & 57 deletions src/vt/vrt/collection/balance/node_stats.cc
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,11 @@ void NodeStats::startIterCleanup(PhaseType phase, int look_back) {
}
node_data_[phase] = std::move(new_data);

// Statistics output when LB is enabled and appropriate flag is enabled
if (theConfig()->vt_lb_stats) {
outputStatsForPhase(phase);
}

if (phase - look_back >= 0) {
node_data_.erase(phase - look_back);
node_subphase_data_.erase(phase - look_back);
Expand Down Expand Up @@ -170,6 +175,14 @@ void NodeStats::releaseLB() {
CollectionManager::releaseLBPhase(msg_hold.get());
}

/**
 * \internal \brief Component startup hook
 *
 * When built with lblite support and the \c vt_lb_stats option is set, the
 * stats output file is created up front; outputStatsForPhase() asserts that
 * the file is already open.
 */
void NodeStats::initialize() {
#if vt_check_enabled(lblite)
  if (theConfig()->vt_lb_stats) {
    // Act on this instance directly, as finalize() does, instead of going
    // through the global theNodeStats() accessor to reach the same object.
    createStatsFile();
  }
#endif
}

void NodeStats::createStatsFile() {
auto const node = theContext()->getNode();
auto const base_file = std::string(theConfig()->vt_lb_stats_file);
Expand Down Expand Up @@ -198,6 +211,17 @@ void NodeStats::createStatsFile() {
}

stats_file_ = fopen(file_name.c_str(), "w+");
vtAssertExpr(stats_file_ != nullptr);
}

/**
 * \internal \brief Component shutdown hook
 *
 * With lblite support compiled in and \c vt_lb_stats set, closes the stats
 * output file and then clears the collected stats. Otherwise does nothing.
 */
void NodeStats::finalize() {
#if vt_check_enabled(lblite)
  // Stats were never collected to file unless the option was requested
  if (!theConfig()->vt_lb_stats) {
    return;
  }

  closeStatsFile();
  clearStats();
#endif
}

void NodeStats::closeStatsFile() {
Expand All @@ -207,74 +231,70 @@ void NodeStats::closeStatsFile() {
}
}

void NodeStats::outputStatsFile() {
if (stats_file_ == nullptr) {
createStatsFile();
}

void NodeStats::outputStatsForPhase(PhaseType phase) {
vtAssertExpr(stats_file_ != nullptr);

auto const num_iters = node_data_.size();
vt_print(lb, "NodeStats::outputStatsForPhase: file={}, phase={}\n", print_ptr(stats_file_), phase);

vt_print(lb, "NodeStats::outputStatsFile: file={}, iter={}\n", print_ptr(stats_file_), num_iters);
auto i = phase;
for (auto&& elm : node_data_.at(i)) {
ElementIDType id = elm.first;
TimeType time = elm.second;
const auto& subphase_times = node_subphase_data_.at(i)[id];
size_t subphases = subphase_times.size();

for (size_t i = 0; i < num_iters; i++) {
for (auto&& elm : node_data_.at(i)) {
ElementIDType id = elm.first;
TimeType time = elm.second;
const auto& subphase_times = node_subphase_data_.at(i)[id];
size_t subphases = subphase_times.size();
auto obj_str = fmt::format("{},{},{},{},[", i, id, time, subphases);

auto obj_str = fmt::format("{},{},{},{},[", i, id, time, subphases);
for (size_t s = 0; s < subphases; s++) {
obj_str += std::to_string(subphase_times[s]);
if (s != subphases - 1)
obj_str += ",";
for (size_t s = 0; s < subphases; s++) {
if (s > 0) {
obj_str += ",";
}

obj_str += "]\n";

fprintf(stats_file_, "%s", obj_str.c_str());
}
for (auto&& elm : node_comm_.at(i)) {
using E = typename std::underlying_type<CommCategory>::type;

auto const& key = elm.first;
auto const& val = elm.second;
auto const cat = static_cast<E>(key.cat_);

if (
key.cat_ == CommCategory::SendRecv or
key.cat_ == CommCategory::Broadcast
) {
auto const to = key.toObj();
auto const from = key.fromObj();
auto obj_str = fmt::format("{},{},{},{},{}\n", i, to, from, val.bytes, cat);
fprintf(stats_file_, "%s", obj_str.c_str());
} else if (
key.cat_ == CommCategory::NodeToCollection or
key.cat_ == CommCategory::NodeToCollectionBcast
) {
auto const to = key.toObj();
auto const from = key.fromNode();
auto obj_str = fmt::format("{},{},{},{},{}\n", i, to, from, val.bytes, cat);
fprintf(stats_file_, "%s", obj_str.c_str());
} else if (
key.cat_ == CommCategory::CollectionToNode or
key.cat_ == CommCategory::CollectionToNodeBcast
) {
auto const to = key.toNode();
auto const from = key.fromObj();
auto obj_str = fmt::format("{},{},{},{},{}\n", i, to, from, val.bytes, cat);
fprintf(stats_file_, "%s", obj_str.c_str());
} else {
vtAssert(false, "Invalid balance::CommCategory enum value");
}
obj_str += std::to_string(subphase_times[s]);
}

obj_str += "]\n";

fprintf(stats_file_, "%s", obj_str.c_str());
}

for (auto&& elm : node_comm_.at(i)) {
using E = typename std::underlying_type<CommCategory>::type;

auto const& comm = elm.first;
auto const recvSend = getRecvSend(comm);
auto const cat = static_cast<E>(comm.cat_);
auto obj_str = fmt::format("{},{},{},{},{}\n", i, recvSend.first, recvSend.second, elm.second.bytes, cat);
fprintf(stats_file_, "%s", obj_str.c_str());
}

fflush(stats_file_);
}

/**
 * \internal \brief Select the (recipient, sender) pair for a communication key
 *
 * The accessors used depend on the key's category:
 *  - SendRecv / Broadcast: toObj(), fromObj()
 *  - NodeToCollection / NodeToCollectionBcast: toObj(), fromNode()
 *  - CollectionToNode / CollectionToNodeBcast: toNode(), fromObj()
 *
 * Any other category triggers an assertion.
 *
 * \param[in] comm the communication key to inspect
 *
 * \return pair whose first member is the recipient and second the sender
 */
std::pair<ElementIDType, ElementIDType> NodeStats::getRecvSend(CommKeyType const& comm) {
  if (
    comm.cat_ == CommCategory::SendRecv or
    comm.cat_ == CommCategory::Broadcast
  ) {
    return std::make_pair(comm.toObj(), comm.fromObj());
  }

  if (
    comm.cat_ == CommCategory::NodeToCollection or
    comm.cat_ == CommCategory::NodeToCollectionBcast
  ) {
    return std::make_pair(comm.toObj(), comm.fromNode());
  }

  if (
    comm.cat_ == CommCategory::CollectionToNode or
    comm.cat_ == CommCategory::CollectionToNodeBcast
  ) {
    return std::make_pair(comm.toNode(), comm.fromObj());
  }

  // This is a static helper with no instance, so it performs no file
  // handling; it only reports the invalid category.
  vtAssert(false, "Invalid balance::CommCategory enum value");
  return std::make_pair(ElementIDType{}, ElementIDType{});
}

ElementIDType NodeStats::addNodeStats(
Expand Down
9 changes: 7 additions & 2 deletions src/vt/vrt/collection/balance/node_stats.h
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ struct NodeStats : runtime::component::Component<NodeStats> {
void releaseLB();

/**
* \internal \brief Output stats file based on instrumented data
* \internal \brief Output stats file for given phase based on instrumented data
*
* The contents of the file consist of a series of records separated
* by newlines. Each record consists of comma separated fields. The
Expand Down Expand Up @@ -161,7 +161,7 @@ struct NodeStats : runtime::component::Component<NodeStats> {
* recipient and distinguishing point-to-point messages from
* broadcasts, as a decimal integer.
*/
void outputStatsFile();
void outputStatsForPhase(PhaseType phase);

/**
* \internal \brief Generate the next object element ID for LB
Expand Down Expand Up @@ -237,6 +237,9 @@ struct NodeStats : runtime::component::Component<NodeStats> {
*/
VirtualProxyType getCollectionProxyForElement(ElementIDType temp_id) const;

/**
 * \internal \brief Component startup hook; creates the stats file when
 * lblite is compiled in and \c vt_lb_stats is set
 */
void initialize() override;
/**
 * \internal \brief Component shutdown hook; closes the stats file and clears
 * the stats when lblite is compiled in and \c vt_lb_stats is set
 */
void finalize() override;

private:
/**
* \internal \brief Create the stats file
Expand All @@ -248,6 +251,8 @@ struct NodeStats : runtime::component::Component<NodeStats> {
*/
void closeStatsFile();

/**
 * \internal \brief Get the (recipient, sender) pair for a communication key
 *
 * \param[in] comm the communication key
 *
 * \return pair of recipient (first) and sender (second), chosen according to
 * the key's communication category
 */
static std::pair<ElementIDType, ElementIDType> getRecvSend(CommKeyType const& comm);

private:
/// Local proxy to objgroup
objgroup::proxy::Proxy<NodeStats> proxy_;
Expand Down
8 changes: 0 additions & 8 deletions src/vt/vrt/collection/manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,6 @@ CollectionManager::CollectionManager() { }

/**
 * Shutdown hook for the collection manager: runs cleanupAll<>().
 *
 * LB statistics are deliberately not written from here. NodeStats writes each
 * phase from startIterCleanup() before older phases are erased, and closes
 * its file and clears its data in NodeStats::finalize(). A whole-run dump at
 * this point would look up phases that have already been trimmed.
 */
void CollectionManager::finalize() {
  cleanupAll<>();
}

/*virtual*/ CollectionManager::~CollectionManager() { }
Expand Down

0 comments on commit ef7a932

Please sign in to comment.