-
Notifications
You must be signed in to change notification settings - Fork 89
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Updating the event manager for MPI communication #136
Changes from 7 commits
bbb025f
4334656
2aed13c
bc1b042
b88a5f0
bfdf835
99c9a9e
1adf510
ba28f93
a0c459b
3df819a
ef00dde
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -166,6 +166,17 @@ void EventManager::Run(dataRepository::ManagedGroup * domain) | |
integer const verbosity = this->getReference<integer>(viewKeys.verbosity); | ||
integer exitFlag = 0; | ||
|
||
integer rank = 0; | ||
integer comm_size = 1; | ||
real64 *send_buffer = nullptr; | ||
real64 *receive_buffer = nullptr; | ||
#if USE_MPI | ||
MPI_Comm_rank(MPI_COMM_WORLD, &rank); | ||
MPI_Comm_size(MPI_COMM_WORLD, &comm_size); | ||
send_buffer = static_cast<real64 *>(malloc(2 * sizeof(real64))); | ||
receive_buffer = static_cast<real64 *>(malloc(2 * sizeof(real64) * comm_size)); | ||
#endif | ||
|
||
// Setup event targets | ||
this->forSubGroups<EventBase>([]( EventBase * subEvent ) -> void | ||
{ | ||
|
@@ -176,13 +187,20 @@ void EventManager::Run(dataRepository::ManagedGroup * domain) | |
while((time < maxTime) && (cycle < maxCycle) && (exitFlag == 0)) | ||
{ | ||
real64 nextDt = std::numeric_limits<real64>::max(); | ||
std::cout << "Time: " << time << "s, dt:" << dt << "s, Cycle: " << cycle << std::endl; | ||
|
||
if (rank == 0) | ||
{ | ||
std::cout << "Time: " << time << "s, dt:" << dt << "s, Cycle: " << cycle << std::endl; | ||
} | ||
|
||
this->forSubGroups<EventBase>([&]( EventBase * subEvent ) -> void | ||
{ | ||
// Calculate the event and sub-event forecasts | ||
// Note: because events can be nested, the mpi reduce for event | ||
// forecasts need to happen in EventBase. | ||
subEvent->CheckEvents(time, dt, cycle, domain); | ||
integer eventForecast = subEvent->GetForecast(); | ||
|
||
// Execute, signal events | ||
if (eventForecast == 1) | ||
{ | ||
subEvent->SignalToPrepareForExecution(time, dt, cycle, domain); | ||
|
@@ -193,31 +211,58 @@ void EventManager::Run(dataRepository::ManagedGroup * domain) | |
subEvent->Execute(time, dt, cycle, domain); | ||
} | ||
|
||
real64 requestedDt = 1e6; | ||
// Estimate the time-step for the next cycle | ||
if (eventForecast <= 1) | ||
{ | ||
requestedDt = subEvent->GetTimestepRequest(time + dt); | ||
real64 requestedDt = subEvent->GetTimestepRequest(time + dt); | ||
nextDt = std::min(requestedDt, nextDt); | ||
} | ||
nextDt = std::min(requestedDt, nextDt); | ||
|
||
// Check the exit flag | ||
exitFlag += subEvent->GetExitFlag(); | ||
|
||
// Debug information | ||
if (verbosity > 0) | ||
if ((verbosity > 0) && (rank == 0)) | ||
{ | ||
std::cout << " Event: " << subEvent->getName() << ", f=" << eventForecast << ", dt_r=" << requestedDt << std::endl; | ||
std::cout << " Event: " << subEvent->getName() << ", f=" << eventForecast << std::endl; | ||
} | ||
}); | ||
|
||
time += dt; | ||
++cycle; | ||
dt = nextDt; | ||
dt = (time + dt > maxTime) ? (maxTime - time) : dt; | ||
|
||
#if USE_MPI | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should be more efficient than the two allreduce calls for the dt and exitFlag |
||
send_buffer[0] = dt; | ||
send_buffer[1] = static_cast<real64>(exitFlag); | ||
MPI_Gather(send_buffer, 2, MPI_DOUBLE, receive_buffer, 2, MPI_DOUBLE, 0, MPI_COMM_WORLD); | ||
|
||
if (rank == 0) | ||
{ | ||
for (integer ii=0; ii<comm_size; ii++) | ||
{ | ||
send_buffer[0] = std::min(send_buffer[0], receive_buffer[2*ii]); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. just use dt instead of send_buffer[0] as the argument? |
||
send_buffer[1] += receive_buffer[2*ii + 1]; | ||
} | ||
} | ||
|
||
MPI_Bcast(send_buffer, 2, MPI_DOUBLE, 0, MPI_COMM_WORLD); | ||
dt = send_buffer[0]; | ||
if (send_buffer[1] > 0.5) | ||
{ | ||
exitFlag = 1; | ||
} | ||
#endif | ||
} | ||
|
||
|
||
// Cleanup | ||
std::cout << "Cleaning up events" << std::endl; | ||
if (rank == 0) | ||
{ | ||
std::cout << "Cleaning up events" << std::endl; | ||
} | ||
|
||
this->forSubGroups<EventBase>([&]( EventBase * subEvent ) -> void | ||
{ | ||
subEvent->Cleanup(time, cycle, domain); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -306,6 +306,8 @@ void EventBase::Step(real64 const time, | |
integer const cycle, | ||
dataRepository::ManagedGroup * domain ) | ||
{ | ||
// Note: do we need an mpi barrier here? | ||
|
||
if (m_target != nullptr) | ||
{ | ||
m_target->Execute(time, dt, cycle, domain); | ||
|
@@ -377,4 +379,15 @@ void EventBase::Cleanup(real64 const& time_n, | |
|
||
|
||
|
||
integer EventBase::GetExitFlag() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We were not propagating any exit flags upwards to the main run loop. |
||
{ | ||
this->forSubGroups<EventBase>([&]( EventBase * subEvent ) -> void | ||
{ | ||
m_exitFlag += subEvent->GetExitFlag(); | ||
}); | ||
|
||
return m_exitFlag; | ||
} | ||
|
||
|
||
} /* namespace geosx */ |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -86,6 +86,14 @@ void HaltEvent::EstimateEventTiming(real64 const time, | |
m_realDt = currentTime - m_lastTime; | ||
m_lastTime = currentTime; | ||
integer forecast = static_cast<integer>((maxRuntime - (currentTime - m_startTime)) / m_realDt); | ||
|
||
// The timing for the ranks may differ slightly, so synchronize | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This Allreduce can happen here, or we could store a common time-stamp for each of the ranks somewhere. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How will these be different? It seems that currentTime and m_lastTime will always be in sync?? You could sync up m_startTime in the constructor when it is set....wait what is going on here? Why do you care about the real world runtime? Is this for a maximum wall time? |
||
#if USE_MPI | ||
integer forecast_global; | ||
MPI_Allreduce(&forecast, &forecast_global, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); | ||
forecast = forecast_global; | ||
#endif | ||
|
||
SetForecast(forecast); | ||
|
||
if (this->GetForecast() <= 0) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -245,6 +245,14 @@ void PeriodicEvent::CheckOptionalFunctionThreshold(real64 const time, | |
// Find the function (min, average, max) | ||
real64_array stats = function->EvaluateStats(m_functionTarget, time, mySet); | ||
result = stats[functionStatOption]; | ||
|
||
// Because the function applied to an object may differ by rank, synchronize | ||
// (Note: this shouldn't occur very often, since it is only called if the base forecast <= 0) | ||
#if USE_MPI | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This allreduce should be fairly uncommon, since it will only be applied when the following apply:
|
||
real64 result_global; | ||
MPI_Allreduce(&result, &result_global, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); | ||
result = result_global; | ||
#endif | ||
} | ||
|
||
// Forcast event | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
/* | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This solver is meant to toy around with random time-step requests. I've been using it to make sure that the event manager, mpi communications are handling the time-stepping correctly. |
||
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
* Copyright (c) 2018, Lawrence Livermore National Security, LLC. | ||
* | ||
* Produced at the Lawrence Livermore National Laboratory | ||
* | ||
* LLNL-CODE-746361 | ||
* | ||
* All rights reserved. See COPYRIGHT for details. | ||
* | ||
* This file is part of the GEOSX Simulation Framework. | ||
* | ||
* GEOSX is a free software; you can redistribute it and/or modify it under | ||
* the terms of the GNU Lesser General Public License (as published by the | ||
* Free Software Foundation) version 2.1 dated February 1999. | ||
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
*/ | ||
|
||
|
||
#include "DummySolver.hpp" | ||
#include "dataRepository/ManagedGroup.hpp" | ||
#include <thread> | ||
#include <chrono> | ||
|
||
namespace geosx | ||
{ | ||
|
||
|
||
using namespace dataRepository; | ||
|
||
|
||
DummySolver::DummySolver( const std::string& name, | ||
ManagedGroup * const parent ): | ||
SolverBase( name, parent ) | ||
{} | ||
|
||
|
||
|
||
DummySolver::~DummySolver() | ||
{ | ||
// TODO Auto-generated destructor stub | ||
} | ||
|
||
|
||
void DummySolver::FillDocumentationNode() | ||
{ | ||
cxx_utilities::DocumentationNode * const docNode = this->getDocumentationNode(); | ||
SolverBase::FillDocumentationNode(); | ||
|
||
docNode->setName(this->CatalogName()); | ||
docNode->setSchemaType("Node"); | ||
docNode->setShortDescription("Dummy solver for testing time-stepping behavior"); | ||
|
||
docNode->AllocateChildNode( viewKeys.rand_scale.Key(), | ||
viewKeys.rand_scale.Key(), | ||
-1, | ||
"real64", | ||
"real64", | ||
"Scale for modifying requested dt", | ||
"Scale for modifying requested dt", | ||
"1e-9", | ||
"", | ||
1, | ||
1, | ||
0 ); | ||
|
||
} | ||
|
||
|
||
void DummySolver::Initialize( ManagedGroup * const problemManager ) | ||
{ | ||
integer rank = 0; | ||
#if USE_MPI | ||
MPI_Comm_rank(MPI_COMM_WORLD, &rank); | ||
#endif | ||
std::srand(rank * 12345); | ||
} | ||
|
||
|
||
real64 DummySolver::SolverStep( real64 const& time_n, | ||
real64 const& dt, | ||
const int cycleNumber, | ||
DomainPartition * domain ) | ||
{ | ||
std::this_thread::sleep_for(std::chrono::seconds(1)); | ||
return dt; | ||
} | ||
|
||
|
||
real64 DummySolver::GetTimestepRequest(real64 const time) | ||
{ | ||
integer rank = 0; | ||
#if USE_MPI | ||
MPI_Comm_rank(MPI_COMM_WORLD, &rank); | ||
#endif | ||
|
||
real64 const rand_scale = this->getReference<real64>(viewKeys.rand_scale); | ||
real64 dt_request = std::rand() * rand_scale; | ||
|
||
std::cout << "time=" << time << ", solver=" << this->getName() << ", rank=" << rank << ", dt_r=" << dt_request << std::endl; | ||
|
||
return dt_request; | ||
} | ||
|
||
|
||
REGISTER_CATALOG_ENTRY( SolverBase, DummySolver, std::string const &, ManagedGroup * const ) | ||
} /* namespace ANST */ |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
/* | ||
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
* Copyright (c) 2018, Lawrence Livermore National Security, LLC. | ||
* | ||
* Produced at the Lawrence Livermore National Laboratory | ||
* | ||
* LLNL-CODE-746361 | ||
* | ||
* All rights reserved. See COPYRIGHT for details. | ||
* | ||
* This file is part of the GEOSX Simulation Framework. | ||
* | ||
* GEOSX is a free software; you can redistribute it and/or modify it under | ||
* the terms of the GNU Lesser General Public License (as published by the | ||
* Free Software Foundation) version 2.1 dated February 1999. | ||
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
*/ | ||
|
||
|
||
#ifndef DUMMYSOLVER_HPP_ | ||
#define DUMMYSOLVER_HPP_ | ||
|
||
#include "physicsSolvers/SolverBase.hpp" | ||
|
||
namespace geosx | ||
{ | ||
namespace dataRepository | ||
{ | ||
class ManagedGroup; | ||
} | ||
class DomainPartition; | ||
|
||
class DummySolver : public SolverBase | ||
{ | ||
public: | ||
DummySolver( const std::string& name, | ||
ManagedGroup * const parent ); | ||
|
||
|
||
virtual ~DummySolver() override; | ||
|
||
static string CatalogName() { return "DummySolver"; } | ||
|
||
virtual void FillDocumentationNode() override; | ||
|
||
virtual void Initialize( ManagedGroup * const problemManager ) override final; | ||
|
||
virtual real64 SolverStep( real64 const& time_n, | ||
real64 const& dt, | ||
integer const cycleNumber, | ||
DomainPartition * domain ) override; | ||
|
||
virtual real64 GetTimestepRequest(real64 const time) override; | ||
|
||
struct viewKeysStruct | ||
{ | ||
dataRepository::ViewKey rand_scale = { "rand_scale" }; | ||
} viewKeys; | ||
|
||
}; | ||
|
||
} /* namespace geosx */ | ||
|
||
#endif /* DUMMYSOLVER_HPP_ */ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not deallocated anywhere that I can see??
It is better just to do something like
real64 send_buffer[2];
array1d< real64 > receive_buffer( 2 * comm_size )