Skip to content

Commit

Permalink
#1014 dump node stats before trimming
Browse files Browse the repository at this point in the history
  • Loading branch information
Jakub Strzebonski committed Sep 8, 2020
1 parent f9b7e74 commit 9e5ce14
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 56 deletions.
8 changes: 8 additions & 0 deletions src/vt/vrt/collection/balance/lb_invoke/lb_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,14 @@ LBManager::makeLB(MsgSharedPtr<StartLBMsg> msg) {
lb, node,
"LBManager: finished migrations\n"
);

// Statistics output when LB is enabled and appropriate flag is enabled
#if vt_check_enabled(lblite)
if (theConfig()->vt_lb_stats) {
theNodeStats()->outputStatsForPhase(phase);
}
#endif

theNodeStats()->startIterCleanup(phase, model_->getNumPastPhasesNeeded());
this->finishedRunningLB(phase);
});
Expand Down
125 changes: 73 additions & 52 deletions src/vt/vrt/collection/balance/node_stats.cc
Original file line number Diff line number Diff line change
Expand Up @@ -198,78 +198,99 @@ void NodeStats::createStatsFile() {
stats_file_ = fopen(file_name.c_str(), "w+");
}

void NodeStats::openStatsFile() {
auto const node = theContext()->getNode();
auto const base_file = std::string(theConfig()->vt_lb_stats_file);
auto const dir = std::string(theConfig()->vt_lb_stats_dir);
auto const file = fmt::format("{}.{}.out", base_file, node);
auto const file_name = fmt::format("{}/{}", dir, file);

vt_debug_print(
lb, node,
"NodeStats: openStatsFile file={}\n", file_name
);

stats_file_ = fopen(file_name.c_str(), "a+");
}

/**
 * Close the stats file if one is currently open.
 *
 * After closing, stats_file_ is reset to nullptr so that a later call is a
 * no-op and callers can test the handle to decide whether to reopen.
 */
void NodeStats::closeStatsFile() {
  // Nothing to do when no file is open.
  if (stats_file_ == nullptr) {
    return;
  }

  fclose(stats_file_);
  stats_file_ = nullptr;
}

void NodeStats::outputStatsFile() {
void NodeStats::outputStatsForPhase(PhaseType phase) {
// TODO (STRZ) - directory cannot be created here. So where?
// if (stats_file_ == nullptr) {
// phase == 0
// ? createStatsFile()
// : openStatsFile();
// }

if (stats_file_ == nullptr) {
createStatsFile();
openStatsFile();
}

vtAssertExpr(stats_file_ != nullptr);

auto const num_iters = node_data_.size();
vt_print(lb, "NodeStats::outputStatsFile: file={}, phase={}\n", print_ptr(stats_file_), phase);

auto i = phase;
for (auto&& elm : node_data_.at(i)) {
ElementIDType id = elm.first;
TimeType time = elm.second;
const auto& subphase_times = node_subphase_data_.at(i)[id];
size_t subphases = subphase_times.size();

auto obj_str = fmt::format("{},{},{},{},[", i, id, time, subphases);
for (size_t s = 0; s < subphases; s++) {
obj_str += std::to_string(subphase_times[s]);
if (s != subphases - 1)
obj_str += ",";
}

vt_print(lb, "NodeStats::outputStatsFile: file={}, iter={}\n", print_ptr(stats_file_), num_iters);
obj_str += "]\n";

for (size_t i = 0; i < num_iters; i++) {
for (auto&& elm : node_data_.at(i)) {
ElementIDType id = elm.first;
TimeType time = elm.second;
const auto& subphase_times = node_subphase_data_.at(i)[id];
size_t subphases = subphase_times.size();
fprintf(stats_file_, "%s", obj_str.c_str());
}

auto obj_str = fmt::format("{},{},{},{},[", i, id, time, subphases);
for (size_t s = 0; s < subphases; s++) {
obj_str += std::to_string(subphase_times[s]);
if (s != subphases - 1)
obj_str += ",";
}
for (auto&& elm : node_comm_.at(i)) {
using E = typename std::underlying_type<CommCategory>::type;

obj_str += "]\n";
auto const& key = elm.first;
auto const& val = elm.second;
auto const cat = static_cast<E>(key.cat_);

if (
key.cat_ == CommCategory::SendRecv or
key.cat_ == CommCategory::Broadcast
) {
auto const to = key.toObj();
auto const from = key.fromObj();
auto obj_str = fmt::format("{},{},{},{},{}\n", i, to, from, val.bytes, cat);
fprintf(stats_file_, "%s", obj_str.c_str());
}
for (auto&& elm : node_comm_.at(i)) {
using E = typename std::underlying_type<CommCategory>::type;

auto const& key = elm.first;
auto const& val = elm.second;
auto const cat = static_cast<E>(key.cat_);

if (
key.cat_ == CommCategory::SendRecv or
key.cat_ == CommCategory::Broadcast
) {
auto const to = key.toObj();
auto const from = key.fromObj();
auto obj_str = fmt::format("{},{},{},{},{}\n", i, to, from, val.bytes, cat);
fprintf(stats_file_, "%s", obj_str.c_str());
} else if (
key.cat_ == CommCategory::NodeToCollection or
key.cat_ == CommCategory::NodeToCollectionBcast
) {
auto const to = key.toObj();
auto const from = key.fromNode();
auto obj_str = fmt::format("{},{},{},{},{}\n", i, to, from, val.bytes, cat);
fprintf(stats_file_, "%s", obj_str.c_str());
} else if (
key.cat_ == CommCategory::CollectionToNode or
key.cat_ == CommCategory::CollectionToNodeBcast
) {
auto const to = key.toNode();
auto const from = key.fromObj();
auto obj_str = fmt::format("{},{},{},{},{}\n", i, to, from, val.bytes, cat);
fprintf(stats_file_, "%s", obj_str.c_str());
} else {
vtAssert(false, "Invalid balance::CommCategory enum value");
}
} else if (
key.cat_ == CommCategory::NodeToCollection or
key.cat_ == CommCategory::NodeToCollectionBcast
) {
auto const to = key.toObj();
auto const from = key.fromNode();
auto obj_str = fmt::format("{},{},{},{},{}\n", i, to, from, val.bytes, cat);
fprintf(stats_file_, "%s", obj_str.c_str());
} else if (
key.cat_ == CommCategory::CollectionToNode or
key.cat_ == CommCategory::CollectionToNodeBcast
) {
auto const to = key.toNode();
auto const from = key.fromObj();
auto obj_str = fmt::format("{},{},{},{},{}\n", i, to, from, val.bytes, cat);
fprintf(stats_file_, "%s", obj_str.c_str());
} else {
vtAssert(false, "Invalid balance::CommCategory enum value");
}
}

fflush(stats_file_);

closeStatsFile();
Expand Down
9 changes: 7 additions & 2 deletions src/vt/vrt/collection/balance/node_stats.h
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ struct NodeStats : runtime::component::Component<NodeStats> {
void releaseLB();

/**
* \internal \brief Output stats file based on instrumented data
* \internal \brief Output stats file for given phase based on instrumented data
*
* The contents of the file consist of a series of records separated
* by newlines. Each record consists of comma separated fields. The
Expand Down Expand Up @@ -161,7 +161,7 @@ struct NodeStats : runtime::component::Component<NodeStats> {
* recipient and distinguishing point-to-point messages from
* broadcasts, as a decimal integer.
*/
void outputStatsFile();
void outputStatsForPhase(PhaseType phase);

/**
* \internal \brief Generate the next object element ID for LB
Expand Down Expand Up @@ -243,6 +243,11 @@ struct NodeStats : runtime::component::Component<NodeStats> {
*/
void createStatsFile();

/**
* \internal \brief Open the stats file
*/
void openStatsFile();

/**
* \internal \brief Close the stats file
*/
Expand Down
3 changes: 1 addition & 2 deletions src/vt/vrt/collection/manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,9 @@ CollectionManager::CollectionManager() { }
void CollectionManager::finalize() {
cleanupAll<>();

// Statistics output when LB is enabled and appropriate flag is enabled
// If statistics are enabled, clear them while finalizing
#if vt_check_enabled(lblite)
if (theConfig()->vt_lb_stats) {
theNodeStats()->outputStatsFile();
theNodeStats()->clearStats();
}
#endif
Expand Down

0 comments on commit 9e5ce14

Please sign in to comment.