Skip to content

Commit

Permalink
Utilities and checkpoints (#131)
Browse files Browse the repository at this point in the history
* Add savefig for loss

* Add checkpointing

* Skip createdata if data already there
  • Loading branch information
SCiarella authored Dec 20, 2024
1 parent d42f0f5 commit 7340045
Show file tree
Hide file tree
Showing 16 changed files with 284 additions and 108 deletions.
4 changes: 2 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
Glob = "c27321d9-0574-5035-807b-f59d2c89b15c"
JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
JuliaFormatter = "98e50ef6-434e-11e9-1051-2b60c6c9e899"
Juno = "e5e0dc1b-0480-54bc-9374-aad01c23163d"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306"
Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
Expand All @@ -38,12 +37,13 @@ LuxCUDA = "d0bbae9a-e099-4d5b-a835-1c6931763bda"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
Tullio = "bc48ee85-29a4-5162-ae0b-a64e1601d4bc"
cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"
Juno = "e5e0dc1b-0480-54bc-9374-aad01c23163d"

[sources]
NeuralClosure = {rev = "main", url = "https://github.com/DEEPDIP-project/NeuralClosure.jl.git"}

[extensions]
CoupledNODECUDA = ["CUDA", "cuDNN", "LuxCUDA"]
CoupledNODECUDA = ["CUDA"]
NavierStokes = ["IncompressibleNavierStokes", "NeuralClosure"]

[compat]
Expand Down
7 changes: 6 additions & 1 deletion ext/CoupledNODECUDA.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
module CoupledNODECUDA

using CoupledNODE
using CUDA: CUDA
ArrayType = CUDA.functional() ? CUDA.CuArray : Array
function ArrayType()
return CUDA.functional() ? CUDA.CuArray : Array
end

allowscalar = deepcopy(CUDA.allowscalar)

end
24 changes: 19 additions & 5 deletions ext/NavierStokes/callback.jl
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,21 @@ The callback function is used during training to compute and log validation and
"""
function create_callback(
model, θ, val_io_data, loss_function, st;
callbackstate = (;
θmin = θ, loss_min = eltype(θ)(Inf), lhist_val = [],
lhist_train = [], lhist_nomodel = []),
nunroll = nothing, batch_size = nothing, rng = Random.Xoshiro(123), do_plot = true,
plot_train = true, plot_every = 10, average_window = 25, device = identity)
callbackstate = nothing,
nunroll = nothing,
batch_size = nothing,
rng = Random.Xoshiro(123),
do_plot = true,
plot_train = true,
plot_every = 10,
average_window = 25,
device = identity,
figfile = nothing)
if callbackstate === nothing
# Initialize the callback state
callbackstate = (; θmin = θ, loss_min = eltype(θ)(Inf), lhist_val = [],
lhist_train = [], lhist_nomodel = [])
end
if nunroll === nothing && batch_size === nothing
error("Either nunroll or batch_size must be provided")
elseif nunroll !== nothing
Expand Down Expand Up @@ -95,6 +105,10 @@ function create_callback(

CairoMakie.axislegend(ax)
display(fig)

if figfile !== nothing
CairoMakie.save(figfile, fig)
end
end
end
callbackstate
Expand Down
2 changes: 2 additions & 0 deletions simulations/Benchmark/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
slurm*
update_julia.out
84 changes: 47 additions & 37 deletions simulations/Benchmark/benchmark.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,35 @@ if false #src
end #src

@info "Script started"
@info VERSION

using Pkg
@info Pkg.status()

# Color palette for consistent theme throughout paper
palette = (; color = ["#3366cc", "#cc0000", "#669900", "#ff9900"])

########################################################################## #src
# Read the configuration file
using IncompressibleNavierStokes
using NeuralClosure
using CoupledNODE
NS = Base.get_extension(CoupledNODE, :NavierStokes)
global conf
try
conf = NS.read_config(ENV["CONF_FILE"])
@info "Reading configuration file from ENV"
catch
@info "Reading configuration file from default"
conf = NS.read_config("configs/conf.yaml")
end
########################################################################## #src

# Choose where to put output
basedir = haskey(ENV, "DEEPDIP") ? ENV["DEEPDIP"] : @__DIR__
outdir = joinpath(basedir, "output", "kolmogorov")
closure_name = conf["closure"]["name"]
outdir_model = joinpath(outdir, closure_name)
plotdir = joinpath(outdir, closure_name, "plots")
logdir = joinpath(outdir, closure_name, "logs")
ispath(outdir) || mkpath(outdir)
Expand Down Expand Up @@ -51,31 +72,22 @@ setsnelliuslogger(logfile)

using Accessors
using Adapt
# using GLMakie
using CairoMakie
using CoupledNODE
using CoupledNODE: loss_priori_lux, create_loss_post_lux
using CoupledNODE.NavierStokes: create_right_hand_side, create_right_hand_side_with_closure
using CUDA
using DifferentialEquations
using IncompressibleNavierStokes
using IncompressibleNavierStokes.RKMethods
using JLD2
using LaTeXStrings
using LinearAlgebra
using Lux
using LuxCUDA
using NeuralClosure
using NNlib
using Optimisers
using ParameterSchedulers
using Random
using SparseArrays

########################################################################## #src
# Read the configuration file
conf = read_config("test_conf.yaml")
########################################################################## #src

# ## Random number seeds
#
Expand All @@ -90,7 +102,7 @@ conf = read_config("test_conf.yaml")
#
# We define all the seeds here.

seeds = load_seeds(conf)
seeds = NS.load_seeds(conf)

########################################################################## #src

Expand All @@ -116,9 +128,10 @@ else
device = identity
clean() = nothing
end
conf["params"]["backend"] = deepcopy(backend)
@info backend
@info CUDA.versioninfo()

#add backend to conf
conf["params"]["backend"] = backend

########################################################################## #src

Expand All @@ -127,7 +140,8 @@ conf["params"]["backend"] = backend
# Create filtered DNS data for training, validation, and testing.

# Parameters
params = load_params(conf)
params = NS.load_params(conf)
@info params

# DNS seeds
ntrajectory = conf["ntrajectory"]
Expand All @@ -139,6 +153,7 @@ dns_seeds_test = dns_seeds[ntrajectory:ntrajectory]
# Create data
docreatedata = conf["docreatedata"]
docreatedata && createdata(; params, seeds = dns_seeds, outdir, taskid)
@info "Data generated"

# Computational time
docomp = conf["docomp"]
Expand Down Expand Up @@ -169,7 +184,7 @@ setups = map(nles -> getsetup(; params, nles), params.nles);
# All training sessions will start from the same θ₀
# for a fair comparison.

closure, θ_start, st = load_model(conf)
closure, θ_start, st = NS.load_model(conf)
# same model structure in INS format
closure_INS, θ_INS = cnn(;
setup = setups[1],
Expand All @@ -179,7 +194,7 @@ closure_INS, θ_INS = cnn(;
use_bias = [true,true, true,true, false],
rng = Xoshiro(seeds.θ_start),
)
#@assert θ_start == θ_INS
@assert θ_start == θ_INS

@info "Initialized CNN with $(length(θ_start)) parameters"

Expand Down Expand Up @@ -207,8 +222,6 @@ end
# Save parameters to disk after each run.
# Plot training progress (for a validation data batch).

# Parameter save files

# Train
let
dotrain = conf["priori"]["dotrain"]
Expand Down Expand Up @@ -288,18 +301,18 @@ end
# Save parameters to disk after each combination.
# Plot training progress (for a validation data batch).
#
# The time stepper `RKProject` allows for choosing when to project.
# [INS] The time stepper `RKProject` allows for choosing when to project.
# [CNODE] Only DCF (last) is supported since it appears to be the best one.

# First = DIF (Bad!)
# Last = DCF
projectorders = (ProjectOrder.Last, )
# I think that in practice we can only do DCF
projectorders = eval(Meta.parse(conf["posteriori"]["projectorders"]))
nprojectorders = length(projectorders)
@assert nprojectorders == 1 "Only DCF should be done"

# Train
let
dotrain = true
nepoch = 100
dotrain = conf["posteriori"]["dotrain"]
nepoch = conf["posteriori"]["nepoch"]
nepoch = 40
dotrain && trainpost(;
params,
projectorders,
Expand All @@ -309,14 +322,14 @@ let
postseed = seeds.post,
dns_seeds_train,
dns_seeds_valid,
nunroll = 5,
nunroll = conf["posteriori"]["nunroll"],
closure,
θ_start = θ_cnn_prior,
st,
opt = ClipAdam = OptimiserChain(Adam(T(1.0e-3)), ClipGrad(1)),
nunroll_valid = 10,
opt = eval(Meta.parse(conf["posteriori"]["opt"])),
nunroll_valid = conf["posteriori"]["nunroll_valid"],
nepoch,
dt = T(1e-3),
dt = eval(Meta.parse(conf["posteriori"]["dt"])),
)
end

Expand Down Expand Up @@ -404,11 +417,11 @@ let
eprior.post[ig, ifil, iorder] = priori_err(device(θ_cnn_post[ig, ifil, iorder]))[1]
end
end
jldsave(joinpath(outdir, "eprior.jld2"); eprior...)
jldsave(joinpath(outdir_model, "eprior.jld2"); eprior...)
end
clean()

eprior = namedtupleload(joinpath(outdir, "eprior.jld2"))
eprior = namedtupleload(joinpath(outdir_model, "eprior.jld2"))

########################################################################## #src

Expand Down Expand Up @@ -445,26 +458,23 @@ let
dt = T(1e-3)

## No model
dudt_nomod = create_right_hand_side(
dudt_nomod = NS.create_right_hand_side(
setup, psolver)
err_post = create_loss_post_lux(dudt_nomod; sciml_solver = Tsit5(), dt = dt)
epost.nomodel[I] = err_post(closure, θ_cnn_post[I].*0 , st, data)[1]
# with closure
dudt = create_right_hand_side_with_closure(
dudt = NS.create_right_hand_side_with_closure(
setup, psolver, closure, st)
err_post = create_loss_post_lux(dudt; sciml_solver = Tsit5(), dt = dt)
epost.cnn_prior[I] = err_post(closure, device(θ_cnn_prior[ig, ifil]), st, data)[1]
epost.cnn_post[I] = err_post(closure, device(θ_cnn_post[I]), st, data)[1]
clean()
end
jldsave(joinpath(outdir, "epost.jld2"); epost...)
jldsave(joinpath(outdir_model, "epost.jld2"); epost...)
end

epost = namedtupleload(joinpath(outdir, "epost.jld2"))
epost = namedtupleload(joinpath(outdir_model, "epost.jld2"))

epost.nomodel
epost.cnn_prior
epost.cnn_post

########################################################################## #src

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ seeds:
prior: 345
post: 456
closure:
name: "CNN0"
name: "cnn_0"
type: cnn
radii: [2, 2, 2, 2, 2]
channels: [24, 24, 24, 24, 2]
Expand All @@ -33,6 +33,14 @@ closure:
rng: "Xoshiro(seeds.θ_start)"
priori:
dotrain: true
nepoch: 100
nepoch: 500
batchsize: 32
opt: "OptimiserChain(Adam(T(1.0e-2)), ClipGrad(1))"
opt: "OptimiserChain(Adam(T(1.0e-2)), ClipGrad(1))"
posteriori:
dotrain: true
projectorders: "(ProjectOrder.Last, )"
nepoch: 200
opt: "OptimiserChain(Adam(T(1.0e-3)), ClipGrad(1))"
nunroll: 5
nunroll_valid: 10
dt: T(1e-3)
48 changes: 48 additions & 0 deletions simulations/Benchmark/configs/conf_2.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
docreatedata: false
docomp: true
ntrajectory: 8
T: "Float32"
params:
D: 2
lims: [0.0, 1.0]
Re: 6000.0
tburn: 0.5
tsim: 5.0
savefreq: 10
#ndns: 2048
#nles: [128]
ndns: 256
nles: [64]
filters: ["FaceAverage()"]
icfunc: "(setup, psolver, rng) -> random_field(setup, T(0); kp=20, psolver, rng)"
method: "RKMethods.Wray3(; T)"
bodyforce: "(dim, x, y, t) -> (dim == 1) * 5 * sinpi(8 * y)"
issteadybodyforce: true
processors: "(; log = timelogger(; nupdate=100))"
Δt: 0.001
seeds:
dns: 123
θ_start: 234
prior: 345
post: 456
closure:
name: "cnn_1"
type: cnn
radii: [2, 2, 2, 2, 2]
channels: [24, 24, 24, 24, 2]
activations: ["relu", "relu", "relu", "relu", "identity"]
use_bias: [true, true, true, true, false]
rng: "Xoshiro(seeds.θ_start)"
priori:
dotrain: true
nepoch: 500
batchsize: 32
opt: "OptimiserChain(Adam(T(1.0e-2)), ClipGrad(1))"
posteriori:
dotrain: true
projectorders: "(ProjectOrder.Last, )"
nepoch: 200
opt: "OptimiserChain(Adam(T(1.0e-3)), ClipGrad(1))"
nunroll: 5
nunroll_valid: 10
dt: T(1e-3)
8 changes: 5 additions & 3 deletions simulations/Benchmark/job_a100.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@
#SBATCH --partition=gpu_a100
#SBATCH --time=05:00:00
#SBATCH --mail-type=BEGIN,END
#SBATCH --mail-user=s.ciarella@esciencecenter.nl
#SBATCH --array=1-8
# #SBATCH --mail-user=s.ciarella@esciencecenter.nl
#SBATCH --array=1-1
# #SBATCH --array=1-8

# Note:
# - gpu_a100: 18 cores
Expand All @@ -20,7 +21,8 @@ mkdir -p /scratch-shared/$USER
echo "Slurm job ID: $SLURM_JOB_ID"
echo "Slurm array task ID: $SLURM_ARRAY_TASK_ID"

export JULIA_DEPOT_PATH=/scratch-shared/$USER/.julia_a100:
export JULIA_DEPOT_PATH=/scratch-shared/$USER/.julia_a100
export CONF_FILE=$1

cd $HOME/CoupledNODE.jl/simulations/Benchmark

Expand Down
Loading

0 comments on commit 7340045

Please sign in to comment.