Skip to content

Commit

Permalink
LightGBM would hang on MPI run if only some nodes have a fatal error. (
Browse files Browse the repository at this point in the history
…#2600)

* LightGBM would hang on MPI run if only some nodes have a fatal error. The issue is that the destructor of Application calls MPI_Finalize(), which will cause the program to hand and prevent from exiting. So we move the network finalization out of the destructor and call MPI_Finalize() or MPI_Abort() based on whether there was an unhandled exception.

* Minor updates: Remove excess debug logging, whitespaces.

* Add comments for new functions.
  • Loading branch information
ashok-ponnuswami-msft authored and guolinke committed Dec 3, 2019
1 parent 51ceef8 commit 6129208
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 4 deletions.
19 changes: 16 additions & 3 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,39 @@
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#include <LightGBM/application.h>
#include "network/linkers.h"

#include <iostream>

int main(int argc, char** argv) {
bool success = false;
try {
LightGBM::Application app(argc, argv);
app.Run();

#ifdef USE_MPI
LightGBM::Linkers::MpiFinalizeIfIsParallel();
#endif

success = true;
}
catch (const std::exception& ex) {
std::cerr << "Met Exceptions:" << std::endl;
std::cerr << ex.what() << std::endl;
exit(-1);
}
catch (const std::string& ex) {
std::cerr << "Met Exceptions:" << std::endl;
std::cerr << ex << std::endl;
exit(-1);
}
catch (...) {
std::cerr << "Unknown Exceptions" << std::endl;
}

if (!success) {
#ifdef USE_MPI
LightGBM::Linkers::MpiAbortIfIsParallel();
#endif

exit(-1);
}
}
}
17 changes: 17 additions & 0 deletions src/network/linkers.h
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,24 @@ class Linkers {

#endif // USE_SOCKET

#ifdef USE_MPI

/*!
* \brief Check if MPI has been initialized
*/
static bool IsMpiInitialized();

/*!
* \brief Finalize the MPI session if it was initialized
*/
static void MpiFinalizeIfIsParallel();

/*!
* \brief Abort the MPI session if it was initialized (called in case there was a error that needs abrupt ending)
*/
static void MpiAbortIfIsParallel();

#endif
private:
/*! \brief Rank of local machine */
int rank_;
Expand Down
26 changes: 25 additions & 1 deletion src/network/linkers_mpi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,35 @@ Linkers::Linkers(Config) {
}

Linkers::~Linkers() {
if (is_init_) {
// Don't call MPI_Finalize() here: If the destructor was called because only this node had an exception, calling MPI_Finalize() will cause all nodes to hang.
// Instead we will handle finalize/abort for MPI in main().
}

bool Linkers::IsMpiInitialized() {
int is_mpi_init;
MPI_SAFE_CALL(MPI_Initialized(&is_mpi_init));
return is_mpi_init;
}

void Linkers::MpiFinalizeIfIsParallel() {
if (IsMpiInitialized()) {
Log::Debug("Finalizing MPI session.");
MPI_SAFE_CALL(MPI_Finalize());
}
}

void Linkers::MpiAbortIfIsParallel() {
try {
if (IsMpiInitialized()) {
std::cerr << "Aborting MPI communication." << std::endl << std::flush;
MPI_SAFE_CALL(MPI_Abort(MPI_COMM_WORLD, -1));;
}
}
catch (...) {
std::cerr << "Exception was raised before aborting MPI. Aborting process..." << std::endl << std::flush;
abort();
}
}

} // namespace LightGBM
#endif // USE_MPI

0 comments on commit 6129208

Please sign in to comment.