Skip to content

Commit

Permalink
#1014 dump node stats before trimming
Browse files Browse the repository at this point in the history
  • Loading branch information
Jakub Strzebonski committed Sep 10, 2020
1 parent 46a8d3e commit 1703aec
Show file tree
Hide file tree
Showing 5 changed files with 73 additions and 77 deletions.
8 changes: 8 additions & 0 deletions src/vt/vrt/collection/balance/lb_invoke/lb_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,14 @@ LBManager::makeLB(MsgSharedPtr<StartLBMsg> msg) {
lb, node,
"LBManager: finished migrations\n"
);

// Write out this phase's statistics (when LB support is compiled in and stats output is enabled) before the cleanup below
#if vt_check_enabled(lblite)
if (theConfig()->vt_lb_stats) {
theNodeStats()->outputStatsForPhase(phase);
}
#endif

theNodeStats()->startIterCleanup(phase, model_->getNumPastPhasesNeeded());
this->finishedRunningLB(phase);
});
Expand Down
122 changes: 50 additions & 72 deletions src/vt/vrt/collection/balance/node_stats.cc
Original file line number Diff line number Diff line change
Expand Up @@ -182,21 +182,6 @@ void NodeStats::createStatsFile() {
"NodeStats: createStatsFile file={}\n", file_name
);

// Node 0 creates the directory
if (not created_dir_ and node == 0) {
mkdir(dir.c_str(), S_IRWXU);
created_dir_ = true;
}

// Barrier: wait for node 0 to create directory before trying to put a file in
// the stats destination directory
if (curRT) {
curRT->systemSync();
} else {
// Something is wrong
vtAssert(false, "Trying to dump stats when VT runtime is deallocated?");
}

stats_file_ = fopen(file_name.c_str(), "w+");
}

Expand All @@ -207,74 +192,67 @@ void NodeStats::closeStatsFile() {
}
}

void NodeStats::outputStatsFile() {
if (stats_file_ == nullptr) {
createStatsFile();
}

void NodeStats::outputStatsForPhase(PhaseType phase) {
vtAssertExpr(stats_file_ != nullptr);

auto const num_iters = node_data_.size();
vt_print(lb, "NodeStats::outputStatsFile: file={}, phase={}\n", print_ptr(stats_file_), phase);

vt_print(lb, "NodeStats::outputStatsFile: file={}, iter={}\n", print_ptr(stats_file_), num_iters);
auto i = phase;
for (auto&& elm : node_data_.at(i)) {
ElementIDType id = elm.first;
TimeType time = elm.second;
const auto& subphase_times = node_subphase_data_.at(i)[id];
size_t subphases = subphase_times.size();

for (size_t i = 0; i < num_iters; i++) {
for (auto&& elm : node_data_.at(i)) {
ElementIDType id = elm.first;
TimeType time = elm.second;
const auto& subphase_times = node_subphase_data_.at(i)[id];
size_t subphases = subphase_times.size();
auto obj_str = fmt::format("{},{},{},{},[", i, id, time, subphases);
for (size_t s = 0; s < subphases; s++) {
obj_str += std::to_string(subphase_times[s]);
if (s != subphases - 1)
obj_str += ",";
}

auto obj_str = fmt::format("{},{},{},{},[", i, id, time, subphases);
for (size_t s = 0; s < subphases; s++) {
obj_str += std::to_string(subphase_times[s]);
if (s != subphases - 1)
obj_str += ",";
}
obj_str += "]\n";

obj_str += "]\n";
fprintf(stats_file_, "%s", obj_str.c_str());
}

for (auto&& elm : node_comm_.at(i)) {
using E = typename std::underlying_type<CommCategory>::type;

auto const& key = elm.first;
auto const& val = elm.second;
auto const cat = static_cast<E>(key.cat_);

if (
key.cat_ == CommCategory::SendRecv or
key.cat_ == CommCategory::Broadcast
) {
auto const to = key.toObj();
auto const from = key.fromObj();
auto obj_str = fmt::format("{},{},{},{},{}\n", i, to, from, val.bytes, cat);
fprintf(stats_file_, "%s", obj_str.c_str());
}
for (auto&& elm : node_comm_.at(i)) {
using E = typename std::underlying_type<CommCategory>::type;

auto const& key = elm.first;
auto const& val = elm.second;
auto const cat = static_cast<E>(key.cat_);

if (
key.cat_ == CommCategory::SendRecv or
key.cat_ == CommCategory::Broadcast
) {
auto const to = key.toObj();
auto const from = key.fromObj();
auto obj_str = fmt::format("{},{},{},{},{}\n", i, to, from, val.bytes, cat);
fprintf(stats_file_, "%s", obj_str.c_str());
} else if (
key.cat_ == CommCategory::NodeToCollection or
key.cat_ == CommCategory::NodeToCollectionBcast
) {
auto const to = key.toObj();
auto const from = key.fromNode();
auto obj_str = fmt::format("{},{},{},{},{}\n", i, to, from, val.bytes, cat);
fprintf(stats_file_, "%s", obj_str.c_str());
} else if (
key.cat_ == CommCategory::CollectionToNode or
key.cat_ == CommCategory::CollectionToNodeBcast
) {
auto const to = key.toNode();
auto const from = key.fromObj();
auto obj_str = fmt::format("{},{},{},{},{}\n", i, to, from, val.bytes, cat);
fprintf(stats_file_, "%s", obj_str.c_str());
} else {
vtAssert(false, "Invalid balance::CommCategory enum value");
}
} else if (
key.cat_ == CommCategory::NodeToCollection or
key.cat_ == CommCategory::NodeToCollectionBcast
) {
auto const to = key.toObj();
auto const from = key.fromNode();
auto obj_str = fmt::format("{},{},{},{},{}\n", i, to, from, val.bytes, cat);
fprintf(stats_file_, "%s", obj_str.c_str());
} else if (
key.cat_ == CommCategory::CollectionToNode or
key.cat_ == CommCategory::CollectionToNodeBcast
) {
auto const to = key.toNode();
auto const from = key.fromObj();
auto obj_str = fmt::format("{},{},{},{},{}\n", i, to, from, val.bytes, cat);
fprintf(stats_file_, "%s", obj_str.c_str());
} else {
vtAssert(false, "Invalid balance::CommCategory enum value");
}
}
fflush(stats_file_);

closeStatsFile();
fflush(stats_file_);
}

ElementIDType NodeStats::addNodeStats(
Expand Down
5 changes: 2 additions & 3 deletions src/vt/vrt/collection/balance/node_stats.h
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ struct NodeStats : runtime::component::Component<NodeStats> {
void releaseLB();

/**
* \internal \brief Output stats file based on instrumented data
* \internal \brief Output stats file for given phase based on instrumented data
*
* The contents of the file consist of a series of records separated
* by newlines. Each record consists of comma separated fields. The
Expand Down Expand Up @@ -161,7 +161,7 @@ struct NodeStats : runtime::component::Component<NodeStats> {
* recipient and distinguishing point-to-point messages from
* broadcasts, as a decimal integer.
*/
void outputStatsFile();
void outputStatsForPhase(PhaseType phase);

/**
* \internal \brief Generate the next object element ID for LB
Expand Down Expand Up @@ -237,7 +237,6 @@ struct NodeStats : runtime::component::Component<NodeStats> {
*/
VirtualProxyType getCollectionProxyForElement(ElementIDType temp_id) const;

private:
/**
* \internal \brief Create the stats file
*/
Expand Down
14 changes: 12 additions & 2 deletions src/vt/vrt/collection/manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,23 @@ namespace vt { namespace vrt { namespace collection {

CollectionManager::CollectionManager() { }

/**
 * Component start-up hook for the collection manager.
 *
 * When load-balancing support is compiled in (lblite) and the vt_lb_stats
 * configuration flag is set, this makes the configured statistics directory
 * and then asks NodeStats to create its stats file. Otherwise it does nothing.
 */
void CollectionManager::initialize() {
#if vt_check_enabled(lblite)
  // Nothing to set up unless statistics output was requested
  if (not theConfig()->vt_lb_stats) {
    return;
  }

  // Create the destination directory, accessible to the owner only. The
  // result of mkdir is not checked here, so a failure (for example because
  // the directory already exists) is silently ignored.
  auto const& stats_dir = theConfig()->vt_lb_stats_dir;
  mkdir(stats_dir.c_str(), S_IRWXU);

  // Create the stats file up front, before any phase is written out
  theNodeStats()->createStatsFile();
#endif
}

void CollectionManager::finalize() {
cleanupAll<>();

// Statistics output when LB is enabled and appropriate flag is enabled
// If statistics are enabled, close output file and clear stats
#if vt_check_enabled(lblite)
if (theConfig()->vt_lb_stats) {
theNodeStats()->outputStatsFile();
theNodeStats()->closeStatsFile();
theNodeStats()->clearStats();
}
#endif
Expand Down
1 change: 1 addition & 0 deletions src/vt/vrt/collection/manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ struct CollectionManager

virtual ~CollectionManager();

void initialize() override;
void finalize() override;

std::string name() override { return "CollectionManager"; }
Expand Down

0 comments on commit 1703aec

Please sign in to comment.