Skip to content

Commit

Permalink
Utilities and checkpoints (#131)
Browse files Browse the repository at this point in the history
* Add savefig for loss

* Add checkpointing

* Skip createdata if data already there
  • Loading branch information
SCiarella authored Dec 20, 2024
1 parent d42f0f5 commit 7340045
Show file tree
Hide file tree
Showing 16 changed files with 284 additions and 108 deletions.
4 changes: 2 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
Glob = "c27321d9-0574-5035-807b-f59d2c89b15c"
JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
JuliaFormatter = "98e50ef6-434e-11e9-1051-2b60c6c9e899"
Juno = "e5e0dc1b-0480-54bc-9374-aad01c23163d"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306"
Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
Expand All @@ -38,12 +37,13 @@ LuxCUDA = "d0bbae9a-e099-4d5b-a835-1c6931763bda"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
Tullio = "bc48ee85-29a4-5162-ae0b-a64e1601d4bc"
cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"
Juno = "e5e0dc1b-0480-54bc-9374-aad01c23163d"

[sources]
NeuralClosure = {rev = "main", url = "https://github.com/DEEPDIP-project/NeuralClosure.jl.git"}

[extensions]
CoupledNODECUDA = ["CUDA", "cuDNN", "LuxCUDA"]
CoupledNODECUDA = ["CUDA"]
NavierStokes = ["IncompressibleNavierStokes", "NeuralClosure"]

[compat]
Expand Down
7 changes: 6 additions & 1 deletion ext/CoupledNODECUDA.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
module CoupledNODECUDA

using CoupledNODE
using CUDA: CUDA
ArrayType = CUDA.functional() ? CUDA.CuArray : Array
function ArrayType()
return CUDA.functional() ? CUDA.CuArray : Array
end

allowscalar = deepcopy(CUDA.allowscalar)

end
24 changes: 19 additions & 5 deletions ext/NavierStokes/callback.jl
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,21 @@ The callback function is used during training to compute and log validation and
"""
function create_callback(
model, θ, val_io_data, loss_function, st;
callbackstate = (;
θmin = θ, loss_min = eltype(θ)(Inf), lhist_val = [],
lhist_train = [], lhist_nomodel = []),
nunroll = nothing, batch_size = nothing, rng = Random.Xoshiro(123), do_plot = true,
plot_train = true, plot_every = 10, average_window = 25, device = identity)
callbackstate = nothing,
nunroll = nothing,
batch_size = nothing,
rng = Random.Xoshiro(123),
do_plot = true,
plot_train = true,
plot_every = 10,
average_window = 25,
device = identity,
figfile = nothing)
if callbackstate === nothing
# Initialize the callback state
callbackstate = (; θmin = θ, loss_min = eltype(θ)(Inf), lhist_val = [],
lhist_train = [], lhist_nomodel = [])
end
if nunroll === nothing && batch_size === nothing
error("Either nunroll or batch_size must be provided")
elseif nunroll !== nothing
Expand Down Expand Up @@ -95,6 +105,10 @@ function create_callback(

CairoMakie.axislegend(ax)
display(fig)

if figfile !== nothing
CairoMakie.save(figfile, fig)
end
end
end
callbackstate
Expand Down
2 changes: 2 additions & 0 deletions simulations/Benchmark/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
slurm*
update_julia.out
84 changes: 47 additions & 37 deletions simulations/Benchmark/benchmark.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,35 @@ if false #src
end #src

@info "Script started"
@info VERSION

using Pkg
@info Pkg.status()

# Color palette for consistent theme throughout paper
palette = (; color = ["#3366cc", "#cc0000", "#669900", "#ff9900"])

########################################################################## #src
# Read the configuration file
using IncompressibleNavierStokes
using NeuralClosure
using CoupledNODE
NS = Base.get_extension(CoupledNODE, :NavierStokes)
global conf
try
conf = NS.read_config(ENV["CONF_FILE"])
@info "Reading configuration file from ENV"
catch
@info "Reading configuration file from default"
conf = NS.read_config("configs/conf.yaml")
end
########################################################################## #src

# Choose where to put output
basedir = haskey(ENV, "DEEPDIP") ? ENV["DEEPDIP"] : @__DIR__
outdir = joinpath(basedir, "output", "kolmogorov")
closure_name = conf["closure"]["name"]
outdir_model = joinpath(outdir, closure_name)
plotdir = joinpath(outdir, closure_name, "plots")
logdir = joinpath(outdir, closure_name, "logs")
ispath(outdir) || mkpath(outdir)
Expand Down Expand Up @@ -51,31 +72,22 @@ setsnelliuslogger(logfile)

using Accessors
using Adapt
# using GLMakie
using CairoMakie
using CoupledNODE
using CoupledNODE: loss_priori_lux, create_loss_post_lux
using CoupledNODE.NavierStokes: create_right_hand_side, create_right_hand_side_with_closure
using CUDA
using DifferentialEquations
using IncompressibleNavierStokes
using IncompressibleNavierStokes.RKMethods
using JLD2
using LaTeXStrings
using LinearAlgebra
using Lux
using LuxCUDA
using NeuralClosure
using NNlib
using Optimisers
using ParameterSchedulers
using Random
using SparseArrays

########################################################################## #src
# Read the configuration file
conf = read_config("test_conf.yaml")
########################################################################## #src

# ## Random number seeds
#
Expand All @@ -90,7 +102,7 @@ conf = read_config("test_conf.yaml")
#
# We define all the seeds here.

seeds = load_seeds(conf)
seeds = NS.load_seeds(conf)

########################################################################## #src

Expand All @@ -116,9 +128,10 @@ else
device = identity
clean() = nothing
end
conf["params"]["backend"] = deepcopy(backend)
@info backend
@info CUDA.versioninfo()

#add backend to conf
conf["params"]["backend"] = backend

########################################################################## #src

Expand All @@ -127,7 +140,8 @@ conf["params"]["backend"] = backend
# Create filtered DNS data for training, validation, and testing.

# Parameters
params = load_params(conf)
params = NS.load_params(conf)
@info params

# DNS seeds
ntrajectory = conf["ntrajectory"]
Expand All @@ -139,6 +153,7 @@ dns_seeds_test = dns_seeds[ntrajectory:ntrajectory]
# Create data
docreatedata = conf["docreatedata"]
docreatedata && createdata(; params, seeds = dns_seeds, outdir, taskid)
@info "Data generated"

# Computational time
docomp = conf["docomp"]
Expand Down Expand Up @@ -169,7 +184,7 @@ setups = map(nles -> getsetup(; params, nles), params.nles);
# All training sessions will start from the same θ₀
# for a fair comparison.

closure, θ_start, st = load_model(conf)
closure, θ_start, st = NS.load_model(conf)
# same model structure in INS format
closure_INS, θ_INS = cnn(;
setup = setups[1],
Expand All @@ -179,7 +194,7 @@ closure_INS, θ_INS = cnn(;
use_bias = [true,true, true,true, false],
rng = Xoshiro(seeds.θ_start),
)
#@assert θ_start == θ_INS
@assert θ_start == θ_INS

@info "Initialized CNN with $(length(θ_start)) parameters"

Expand Down Expand Up @@ -207,8 +222,6 @@ end
# Save parameters to disk after each run.
# Plot training progress (for a validation data batch).

# Parameter save files

# Train
let
dotrain = conf["priori"]["dotrain"]
Expand Down Expand Up @@ -288,18 +301,18 @@ end
# Save parameters to disk after each combination.
# Plot training progress (for a validation data batch).
#
# The time stepper `RKProject` allows for choosing when to project.
# [INS] The time stepper `RKProject` allows for choosing when to project.
# [CNODE] Only DCF (last) is supported since it appears to be the best one.

# First = DIF (Bad!)
# Last = DCF
projectorders = (ProjectOrder.Last, )
# I think that in practice we can only do DCF
projectorders = eval(Meta.parse(conf["posteriori"]["projectorders"]))
nprojectorders = length(projectorders)
@assert nprojectorders == 1 "Only DCF should be done"

# Train
let
dotrain = true
nepoch = 100
dotrain = conf["posteriori"]["dotrain"]
nepoch = conf["posteriori"]["nepoch"]
nepoch = 40
dotrain && trainpost(;
params,
projectorders,
Expand All @@ -309,14 +322,14 @@ let
postseed = seeds.post,
dns_seeds_train,
dns_seeds_valid,
nunroll = 5,
nunroll = conf["posteriori"]["nunroll"],
closure,
θ_start = θ_cnn_prior,
st,
opt = ClipAdam = OptimiserChain(Adam(T(1.0e-3)), ClipGrad(1)),
nunroll_valid = 10,
opt = eval(Meta.parse(conf["posteriori"]["opt"])),
nunroll_valid = conf["posteriori"]["nunroll_valid"],
nepoch,
dt = T(1e-3),
dt = eval(Meta.parse(conf["posteriori"]["dt"])),
)
end

Expand Down Expand Up @@ -404,11 +417,11 @@ let
eprior.post[ig, ifil, iorder] = priori_err(device(θ_cnn_post[ig, ifil, iorder]))[1]
end
end
jldsave(joinpath(outdir, "eprior.jld2"); eprior...)
jldsave(joinpath(outdir_model, "eprior.jld2"); eprior...)
end
clean()

eprior = namedtupleload(joinpath(outdir, "eprior.jld2"))
eprior = namedtupleload(joinpath(outdir_model, "eprior.jld2"))

########################################################################## #src

Expand Down Expand Up @@ -445,26 +458,23 @@ let
dt = T(1e-3)

## No model
dudt_nomod = create_right_hand_side(
dudt_nomod = NS.create_right_hand_side(
setup, psolver)
err_post = create_loss_post_lux(dudt_nomod; sciml_solver = Tsit5(), dt = dt)
epost.nomodel[I] = err_post(closure, θ_cnn_post[I].*0 , st, data)[1]
# with closure
dudt = create_right_hand_side_with_closure(
dudt = NS.create_right_hand_side_with_closure(
setup, psolver, closure, st)
err_post = create_loss_post_lux(dudt; sciml_solver = Tsit5(), dt = dt)
epost.cnn_prior[I] = err_post(closure, device(θ_cnn_prior[ig, ifil]), st, data)[1]
epost.cnn_post[I] = err_post(closure, device(θ_cnn_post[I]), st, data)[1]
clean()
end
jldsave(joinpath(outdir, "epost.jld2"); epost...)
jldsave(joinpath(outdir_model, "epost.jld2"); epost...)
end

epost = namedtupleload(joinpath(outdir, "epost.jld2"))
epost = namedtupleload(joinpath(outdir_model, "epost.jld2"))

epost.nomodel
epost.cnn_prior
epost.cnn_post

########################################################################## #src

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ seeds:
prior: 345
post: 456
closure:
name: "CNN0"
name: "cnn_0"
type: cnn
radii: [2, 2, 2, 2, 2]
channels: [24, 24, 24, 24, 2]
Expand All @@ -33,6 +33,14 @@ closure:
rng: "Xoshiro(seeds.θ_start)"
priori:
dotrain: true
nepoch: 100
nepoch: 500
batchsize: 32
opt: "OptimiserChain(Adam(T(1.0e-2)), ClipGrad(1))"
opt: "OptimiserChain(Adam(T(1.0e-2)), ClipGrad(1))"
posteriori:
dotrain: true
projectorders: "(ProjectOrder.Last, )"
nepoch: 200
opt: "OptimiserChain(Adam(T(1.0e-3)), ClipGrad(1))"
nunroll: 5
nunroll_valid: 10
dt: T(1e-3)
48 changes: 48 additions & 0 deletions simulations/Benchmark/configs/conf_2.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
docreatedata: false
docomp: true
ntrajectory: 8
T: "Float32"
params:
D: 2
lims: [0.0, 1.0]
Re: 6000.0
tburn: 0.5
tsim: 5.0
savefreq: 10
#ndns: 2048
#nles: [128]
ndns: 256
nles: [64]
filters: ["FaceAverage()"]
icfunc: "(setup, psolver, rng) -> random_field(setup, T(0); kp=20, psolver, rng)"
method: "RKMethods.Wray3(; T)"
bodyforce: "(dim, x, y, t) -> (dim == 1) * 5 * sinpi(8 * y)"
issteadybodyforce: true
processors: "(; log = timelogger(; nupdate=100))"
Δt: 0.001
seeds:
dns: 123
θ_start: 234
prior: 345
post: 456
closure:
name: "cnn_1"
type: cnn
radii: [2, 2, 2, 2, 2]
channels: [24, 24, 24, 24, 2]
activations: ["relu", "relu", "relu", "relu", "identity"]
use_bias: [true, true, true, true, false]
rng: "Xoshiro(seeds.θ_start)"
priori:
dotrain: true
nepoch: 500
batchsize: 32
opt: "OptimiserChain(Adam(T(1.0e-2)), ClipGrad(1))"
posteriori:
dotrain: true
projectorders: "(ProjectOrder.Last, )"
nepoch: 200
opt: "OptimiserChain(Adam(T(1.0e-3)), ClipGrad(1))"
nunroll: 5
nunroll_valid: 10
dt: T(1e-3)
8 changes: 5 additions & 3 deletions simulations/Benchmark/job_a100.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@
#SBATCH --partition=gpu_a100
#SBATCH --time=05:00:00
#SBATCH --mail-type=BEGIN,END
#SBATCH --mail-user=s.ciarella@esciencecenter.nl
#SBATCH --array=1-8
# #SBATCH --mail-user=s.ciarella@esciencecenter.nl
#SBATCH --array=1-1
# #SBATCH --array=1-8

# Note:
# - gpu_a100: 18 cores
Expand All @@ -20,7 +21,8 @@ mkdir -p /scratch-shared/$USER
echo "Slurm job ID: $SLURM_JOB_ID"
echo "Slurm array task ID: $SLURM_ARRAY_TASK_ID"

export JULIA_DEPOT_PATH=/scratch-shared/$USER/.julia_a100:
export JULIA_DEPOT_PATH=/scratch-shared/$USER/.julia_a100
export CONF_FILE=$1

cd $HOME/CoupledNODE.jl/simulations/Benchmark

Expand Down
Loading

0 comments on commit 7340045

Please sign in to comment.