Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SEAS checkpointing of PETSc TS object #59

Open
wants to merge 36 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
e284992
Added basic functionality required to checkpoint a SEAS model. This c…
Dec 2, 2021
b0380fe
Remove guard on files
dmay23 Dec 2, 2021
d881b36
Clean utils.
dmay23 Dec 2, 2021
741e3c2
SEAS checkpoint TS solver.
dmay23 Dec 3, 2021
5dc8b70
solve(): break earlier if the number of requested steps have been per…
dmay23 Dec 6, 2021
80b1982
ts_utils: Output more members from TS and TSAdapt in the hope of pres…
dmay23 Dec 6, 2021
f40a8dd
Bugfix: Do not overwrite user vector
uphoffc Dec 7, 2021
aef2891
PetscTimeSolver: Add getter/setter for checkpoint filename and frequency
dmay23 Dec 7, 2021
cf05335
SeasConfig: New TOML parameters to control checkpointing of time inte…
dmay23 Dec 7, 2021
fad1cf9
Add safety of operator overload
Oct 5, 2023
0801af7
Updated checkpointing functionality. The major change relates to stor…
Oct 5, 2023
db8979b
Halt execution if a file failed to load when a restart was requested
Oct 5, 2023
3b489de
More error checking
Oct 5, 2023
c93f97c
Allow creating output file when simulation starts at non-zero time
Oct 12, 2023
7510663
Allow creating outputs file when simulation starts at non-zero time
Oct 12, 2023
8596c33
corrected typo
Oct 13, 2023
1dc36db
Merge pull request #60 from JeenaYun/dmay/seas-checkpoint
hpc4geo Oct 13, 2023
5cbb2aa
Removed toml options for ts checkpointing
Nov 2, 2023
1e9238d
Added alternative storage modes. Changed default settings
Nov 2, 2023
ce8c13c
Per checkpoint write, output (i) a csv with temporal info and (ii) a …
Nov 3, 2023
f6587fe
Removed unused checkpointing methods and variables
Nov 3, 2023
67cb17a
Removed useless code copied from PETSc source
Nov 3, 2023
3463e0a
Added more safety when loading TSAdapt
Nov 3, 2023
8458c1d
Merge remote-tracking branch 'origin/main' into dmay/seas-checkpoint
Thomas-Ulrich Nov 7, 2023
8a87903
fix compilation
Thomas-Ulrich Nov 7, 2023
64046d5
- this small change fixes for me an error that arises when compiling …
cpranger Nov 2, 2022
65d2480
add missing include
Thomas-Ulrich Mar 30, 2023
4c6f720
fix compilation
Thomas-Ulrich Nov 7, 2023
2ac9e62
ini commit toml
Thomas-Ulrich Nov 7, 2023
b8dc246
checkpoint parameters directly on the main toml node
Thomas-Ulrich Nov 7, 2023
0590fe7
re-instate last_checkpoint
Thomas-Ulrich Nov 8, 2023
43c0b2b
use the TOML sub-node for checkpointing
Thomas-Ulrich Nov 8, 2023
b8ebe24
Synchronize using cpu time from rank 0. Failure to synchronize may re…
hpc4geo Jan 30, 2024
c22b346
Force the generate flag to be syncd across comm
hpc4geo Feb 2, 2024
a4c44d2
Merge pull request #61 from Thomas-Ulrich/thomas/test_toml_cp
AliceGabriel Jun 6, 2024
254657f
Merge remote-tracking branch 'origin/main' into dmay/seas-checkpoint
Thomas-Ulrich Jun 25, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions app/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,8 @@ set(APP_COMMON_SRCS
localoperator/RateAndStateBase.cpp
#pc/lspoly.c
pc/register.cpp
common/ts_util.c
common/vecnest_util.c
)
if(${LAPACK_FOUND})
list(APPEND APP_COMMON_SRCS
Expand Down
5 changes: 5 additions & 0 deletions app/common/CmdLine.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,11 @@ readFromConfigurationFileAndCmdLine(TableSchema<T> const& schema, argparse::Argu
<< err << std::endl;
return std::nullopt;
}
if (!rawCfg.contains("ts_checkpoint")) {
// If the configuration has no "ts_checkpoint" entry, insert an empty one so that
// the tandem app can fall back to its default checkpoint options.
rawCfg.insert_or_assign("ts_checkpoint", toml::array{});
}

T cfg;
try {
Expand Down
9 changes: 8 additions & 1 deletion app/common/PetscTimeSolver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,19 @@

namespace tndm {

PetscTimeSolverBase::PetscTimeSolverBase(MPI_Comm comm) {
PetscTimeSolverBase::PetscTimeSolverBase(MPI_Comm comm, Config const& cfg) {
CHKERRTHROW(TSCreate(comm, &ts_));
CHKERRTHROW(TSSetProblemType(ts_, TS_NONLINEAR));
CHKERRTHROW(TSSetExactFinalTime(ts_, TS_EXACTFINALTIME_MATCHSTEP));
CHKERRTHROW(TSSetFromOptions(ts_));

auto const& cfgcp = cfg.ts_checkpoint_config;

CHKERRTHROW(
ts_checkpoint_configure(ts_, cfgcp.save_directory.c_str(), cfgcp.frequency_step,
cfgcp.frequency_cputime_minutes, cfgcp.frequency_time_physical,
static_cast<int>(cfgcp.storage_type), cfgcp.storage_limited_size));

TSType time_scheme;
CHKERRTHROW(TSGetType(ts_, &time_scheme));

Expand Down
51 changes: 47 additions & 4 deletions app/common/PetscTimeSolver.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@

#include "common/PetscUtil.h"
#include "common/PetscVector.h"

#include "tandem/SeasConfig.h"
extern "C" {
#include "ts_util.h"
#include "vecnest_util.h"
}
#include <petscsystypes.h>
#include <petscts.h>
#include <petscvec.h>
Expand All @@ -17,7 +21,7 @@ namespace tndm {

class PetscTimeSolverBase {
public:
PetscTimeSolverBase(MPI_Comm comm);
PetscTimeSolverBase(MPI_Comm comm, Config const& cfg);
~PetscTimeSolverBase();

std::size_t get_step_number() const;
Expand All @@ -33,15 +37,19 @@ class PetscTimeSolverBase {
template <std::size_t NumStateVecs> class PetscTimeSolver : public PetscTimeSolverBase {
public:
template <typename TimeOp>
PetscTimeSolver(TimeOp& timeop, std::array<std::unique_ptr<PetscVector>, NumStateVecs> state)
: PetscTimeSolverBase(timeop.comm()), state_(std::move(state)) {
PetscTimeSolver(TimeOp& timeop, std::array<std::unique_ptr<PetscVector>, NumStateVecs> state,
Config const& cfg)
: PetscTimeSolverBase(timeop.comm(), cfg), state_(std::move(state)),
ts_checkpoint_load_directory(std::move(cfg.ts_checkpoint_config.load_directory)),
comm(timeop.comm()) {

Vec x[NumStateVecs];
for (std::size_t n = 0; n < NumStateVecs; ++n) {
x[n] = state_[n]->vec();
}
MPI_Comm comm;
CHKERRTHROW(VecCreateNest(timeop.comm(), NumStateVecs, nullptr, x, &ts_state_));
CHKERRTHROW(VecNestUpgradeOperations(ts_state_));

std::apply([&timeop](auto&... x) { timeop.initial_condition((*x)...); }, state_);

Expand All @@ -51,6 +59,39 @@ template <std::size_t NumStateVecs> class PetscTimeSolver : public PetscTimeSolv
~PetscTimeSolver() { VecDestroy(&ts_state_); }

void solve(double upcoming_time) {
CHKERRTHROW(TSSetUp(ts_));

if (ts_checkpoint_load_directory.has_value()) {
int rank;
MPI_Comm_rank(comm, &rank);
std::string sload;
if (rank == 0) {
sload = ts_checkpoint_load_directory.value();
if (std::filesystem::is_regular_file(sload)) {
std::cout << "Retrieving the name of the last checkpoint from " << sload
<< std::endl;
std::ifstream file(sload);
if (file.is_open()) {
if (std::getline(file, sload)) {
if (not std::filesystem::is_directory(sload)) {
throw std::runtime_error(
"The first line of the file does not point to an existing "
"directory.");
}
} else {
throw std::runtime_error("The file is empty.");
}
file.close();
} else {
throw std::runtime_error("Failed to open the file.");
}
}
} else {
MPI_Bcast(&sload[0], sload.size(), MPI_CHAR, 0, MPI_COMM_WORLD);
}
const char* loadDirectory = sload.c_str();
CHKERRTHROW(ts_checkpoint_restart(ts_, loadDirectory));
}
CHKERRTHROW(TSSetMaxTime(ts_, upcoming_time));
CHKERRTHROW(TSSolve(ts_, ts_state_));
}
Expand Down Expand Up @@ -108,6 +149,8 @@ template <std::size_t NumStateVecs> class PetscTimeSolver : public PetscTimeSolv

std::array<std::unique_ptr<PetscVector>, NumStateVecs> state_;
Vec ts_state_ = nullptr;
std::optional<std::string> ts_checkpoint_load_directory;
MPI_Comm comm;
};

} // namespace tndm
Expand Down
Loading