Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Checkpointing, NetCDF, and golden master tests #140

Merged
merged 21 commits into from
Mar 21, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
001c2d9
Checkpointer constructor with kwargs.
ali-ramadhan Mar 19, 2019
f66808f
Fixed bug with filename function for ::Checkpointer.
ali-ramadhan Mar 19, 2019
774a181
Fixed FFT plan reconstruction during checkpointing.
ali-ramadhan Mar 19, 2019
e62c8c2
Basic checkpointing test.
ali-ramadhan Mar 19, 2019
755000c
Cooling disk temperature forcing function.
ali-ramadhan Mar 19, 2019
fdaa3d7
Set forcing functions to nothing when restoring.
ali-ramadhan Mar 19, 2019
205aec8
Thermal bubble checkpointing test works!
ali-ramadhan Mar 19, 2019
a3bdf43
Saving velocity fields at xC, yC, zC.
ali-ramadhan Mar 20, 2019
5b3fc4c
NetCDF thermal bubble integration test.
ali-ramadhan Mar 20, 2019
b880a9b
Forgot to commit the actual test.
ali-ramadhan Mar 20, 2019
8786ac3
Forgot to describe the NetCDF integration test.
ali-ramadhan Mar 20, 2019
34fca53
Thermal bubble golden master test.
ali-ramadhan Mar 20, 2019
ae3e5ba
Thermal bubble golden master checkpoint file.
ali-ramadhan Mar 20, 2019
5dfbb4f
Adding Random to manifest.
ali-ramadhan Mar 20, 2019
dca7290
Deep convection golden master test.
ali-ramadhan Mar 20, 2019
a8b6b6c
Deep convection golden master checkpoint file.
ali-ramadhan Mar 20, 2019
3b16880
Updating environment project manifests.
ali-ramadhan Mar 20, 2019
0d132d1
Golden master output is now stored as NetCDF.
ali-ramadhan Mar 21, 2019
354ab07
Merge branch 'master' into system-tests
ali-ramadhan Mar 21, 2019
5d6cec8
Fixed missing async kwarg for NetCDFOutputWriter constructor
ali-ramadhan Mar 21, 2019
371af73
Skipping deep convection golden master test.
ali-ramadhan Mar 21, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ GPUifyLoops = "ba82f77b-6841-5d2e-bd9f-4daf811aec27"
JLD = "4138dd39-2aa7-5051-a626-17a0bb65d9c8"
NetCDF = "30363a11-5582-574a-97bb-aa9a979735b9"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Expand Down
22 changes: 17 additions & 5 deletions env/cpu/Manifest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,9 @@ uuid = "8bf52ea8-c179-5cab-976a-9e18b702a9bc"

[[Compat]]
deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
git-tree-sha1 = "49269e311ffe11ac5b334681d212329002a9832a"
git-tree-sha1 = "195a3ffcb8b0762684b6821de18f83a16455c6ea"
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
version = "1.5.1"
version = "2.0.0"

[[Conda]]
deps = ["Compat", "JSON", "VersionParsing"]
Expand All @@ -66,6 +66,12 @@ git-tree-sha1 = "3b868935adf4ce2115f5487e789553507739014c"
uuid = "a9693cdc-2bc8-5703-a9cd-1da358117377"
version = "0.1.0"

[[Crayons]]
deps = ["Test"]
git-tree-sha1 = "416737eea5c50ee5a08c588ea73d77d5eebc94e7"
uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
version = "3.0.0"

[[Dates]]
deps = ["Printf"]
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
Expand All @@ -75,7 +81,7 @@ deps = ["Mmap"]
uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"

[[Distributed]]
deps = ["LinearAlgebra", "Random", "Serialization", "Sockets"]
deps = ["Random", "Serialization", "Sockets"]
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"

[[FFTW]]
Expand Down Expand Up @@ -121,7 +127,7 @@ uuid = "d9be37ee-ecc9-5288-90f1-b9ca67657a75"
version = "0.7.1"

[[InteractiveUtils]]
deps = ["LinearAlgebra", "Markdown"]
deps = ["Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"

[[JLD]]
Expand Down Expand Up @@ -261,14 +267,20 @@ uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[[TimerOutputs]]
deps = ["Crayons", "Printf", "Test", "Unicode"]
git-tree-sha1 = "b80671c06f8f8bae08c55d67b5ce292c5ae2660c"
uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
version = "0.5.0"

[[URIParser]]
deps = ["Test", "Unicode"]
git-tree-sha1 = "6ddf8244220dfda2f17539fa8c9de20d6c575b69"
uuid = "30578b45-9adc-5946-b283-645ec420af67"
version = "0.4.0"

[[UUIDs]]
deps = ["Random"]
deps = ["Random", "SHA"]
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"

[[Unicode]]
Expand Down
3 changes: 3 additions & 0 deletions env/cpu/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,8 @@ GPUifyLoops = "ba82f77b-6841-5d2e-bd9f-4daf811aec27"
JLD = "4138dd39-2aa7-5051-a626-17a0bb65d9c8"
NetCDF = "30363a11-5582-574a-97bb-aa9a979735b9"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
20 changes: 16 additions & 4 deletions env/gpu/Manifest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,9 @@ version = "0.2.0"

[[Compat]]
deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
git-tree-sha1 = "49269e311ffe11ac5b334681d212329002a9832a"
git-tree-sha1 = "195a3ffcb8b0762684b6821de18f83a16455c6ea"
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
version = "1.5.1"
version = "2.0.0"

[[Conda]]
deps = ["Compat", "JSON", "VersionParsing"]
Expand All @@ -96,6 +96,12 @@ git-tree-sha1 = "3b868935adf4ce2115f5487e789553507739014c"
uuid = "a9693cdc-2bc8-5703-a9cd-1da358117377"
version = "0.1.0"

[[Crayons]]
deps = ["Test"]
git-tree-sha1 = "416737eea5c50ee5a08c588ea73d77d5eebc94e7"
uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
version = "3.0.0"

[[CuArrays]]
deps = ["AbstractFFTs", "Adapt", "CUDAapi", "CUDAdrv", "CUDAnative", "DiffRules", "ForwardDiff", "GPUArrays", "LinearAlgebra", "MacroTools", "NNlib", "Pkg", "Printf", "Random", "SparseArrays", "Test"]
git-tree-sha1 = "c1cd8792ca783987fcba2ed0d6b3b58176e6b13e"
Expand Down Expand Up @@ -140,9 +146,9 @@ version = "1.0.5"

[[FillArrays]]
deps = ["LinearAlgebra", "Random", "SparseArrays", "Test"]
git-tree-sha1 = "471b7e33dc9c9c5b9170045dd57c8ba0927b2918"
git-tree-sha1 = "2def0123a4f3572234405b0e3d80bfe5d3e1a2a4"
uuid = "1a297f60-69ca-5386-bcde-b61e274b549b"
version = "0.4.0"
version = "0.5.0"

[[Formatting]]
deps = ["Compat"]
Expand Down Expand Up @@ -357,6 +363,12 @@ uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[[TimerOutputs]]
deps = ["Crayons", "Printf", "Test", "Unicode"]
git-tree-sha1 = "b80671c06f8f8bae08c55d67b5ce292c5ae2660c"
uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
version = "0.5.0"

[[URIParser]]
deps = ["Test", "Unicode"]
git-tree-sha1 = "6ddf8244220dfda2f17539fa8c9de20d6c575b69"
Expand Down
4 changes: 4 additions & 0 deletions env/gpu/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ authors = ["Ali Ramadhan <ali.hh.ramadhan@gmail.com>"]
version = "0.1.0"

[deps]
CUDAapi = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3"
CUDAdrv = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde"
CUDAnative = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17"
CuArrays = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"
Expand All @@ -12,5 +13,8 @@ GPUifyLoops = "ba82f77b-6841-5d2e-bd9f-4daf811aec27"
JLD = "4138dd39-2aa7-5051-a626-17a0bb65d9c8"
NetCDF = "30363a11-5582-574a-97bb-aa9a979735b9"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
22 changes: 20 additions & 2 deletions examples/deep_convection.jl
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,20 @@ function impose_cooling_disk!(model::Model)
# geometries, probably by being able to define, e.g. a forcing only at the
# top, etc.
@inline function cooling_disk(u, v, w, T, S, Nx, Ny, Nz, Δx, Δy, Δz, i, j, k)
ifelse(k == 1 && 0.2Nx < i < 0.8Nx && 0.2Ny < j < 0.8Ny, -4.5e-6, 0)
if k == 1
x = i*Δx
y = j*Δy
Lx = Nx*Δx
Ly = Ny*Δy
r² = (x - Lx/2)^2 + (y - Ly/2)^2
if r² < 600^2
return -4.5e-6
else
return 0
end
else
return 0
end
end

model.forcing = Forcing(nothing, nothing, nothing, cooling_disk, nothing)
Expand All @@ -75,6 +88,11 @@ impose_cooling_disk!(model)
nc_writer = NetCDFOutputWriter(dir=".", prefix="deep_convection_", frequency=20)
push!(model.output_writers, nc_writer)

time_step!(model; Nt=Nt, Δt=Δt)
# time_step!(model; Nt=Nt, Δt=Δt)
# Advance the model one iteration per pass and print the model clock together
# with the wall-clock duration of that single step.
for _ in 1:Nt
    tic = time_ns()
    time_step!(model, 1, Δt)
    println("Time: $(model.clock.time) [$(prettytime(time_ns()-tic))]")
end

make_vertical_slice_movie(model, nc_writer, "T", Nt, Δt, 293.15, ceil(Int, Ny/2))
2 changes: 1 addition & 1 deletion src/models.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ mutable struct Model
pressures::PressureFields
G::SourceTerms
Gp::SourceTerms
forcing::Forcing
forcing # ::Forcing # No type so we can set to nothing while checkpointing.
stepper_tmp::StepperTemporaryFields
poisson_solver # ::PoissonSolver or ::PoissonSolverGPU
clock::Clock
Expand Down
56 changes: 36 additions & 20 deletions src/output_writers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ mutable struct BinaryOutputWriter <: OutputWriter
padding::Int
end

"""
    Checkpointer(; dir=".", prefix="", frequency=1, padding=9)

Keyword-argument convenience constructor. Forwards `dir`, `prefix`, `frequency`
and `padding`, in that order, to the positional `Checkpointer` constructor.
"""
function Checkpointer(; dir=".", prefix="", frequency=1, padding=9)
    return Checkpointer(dir, prefix, frequency, padding)
end

"""
    NetCDFOutputWriter(; dir=".", prefix="", frequency=1, padding=9,
                         compression=5, async=false)

Keyword-argument convenience constructor. Forwards `dir`, `prefix`, `frequency`,
`padding`, `compression` and `async`, in that order, to the positional
`NetCDFOutputWriter` constructor.
"""
function NetCDFOutputWriter(;
    dir=".", prefix="", frequency=1, padding=9, compression=5, async=false
)
    return NetCDFOutputWriter(dir, prefix, frequency, padding, compression, async)
end
Expand All @@ -39,7 +43,7 @@ ext(fw::NetCDFOutputWriter) = ".nc"
ext(fw::Checkpointer) = ".jld"

"""
    filename(fw, name, iteration)

Return the output file name for `fw`: `fw.filename_prefix`, then `name`, then
`iteration` left-padded with `"0"` to `fw.padding` characters, then `ext(fw)`.
"""
function filename(fw, name, iteration)
    return fw.filename_prefix *
           name *
           lpad(iteration, fw.padding, "0") *
           ext(fw)
end
filename(fw::Checkpointer, name, iteration) = filename(fw, "model_checkpoint", iteration)
filename(fw::Checkpointer, iteration) = filename(fw, "model_checkpoint_", iteration)

#
# Checkpointing functions
Expand All @@ -48,25 +52,34 @@ filename(fw::Checkpointer, name, iteration) = filename(fw, "model_checkpoint", i
function write_output(model::Model, chk::Checkpointer)
filepath = joinpath(chk.dir, filename(chk, model.clock.iteration))

# Do not include the spectral solver parameters. We want to avoid serializing
forcing_functions = model.forcing

# Do not include forcing functions and FFT plans. We want to avoid serializing
# FFTW and CuFFT plans as serializing functions is not supported by JLD, and
# seems like a tricky business in general.
model.forcing = nothing
model.poisson_solver = nothing

println("WARNING: Forcing functions are not serialized!")

println("[Checkpointer] Serializing model to disk: $filepath")
f = JLD.jldopen(filepath, "w", compress=true)
JLD.@write f model
close(f)

# Reconstruct PoissonSolver struct with FFT plans ?
println("[Checkpointer] Reconstructing FFT plans...")
metadata, grid, stepper_tmp = model.metadata, model.grid, model.stepper_tmp
if metadata.arch == :cpu
if metadata.arch == :CPU
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we save FFT plans?

It makes sense that a user should plan new FFTs if they are going to run a model, even if they are restarting a checkpoint on the same computer.

@christophernhill what do you think?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't. We set `model.poisson_solver = nothing` before checkpointing, then reconstruct the FFT plans afterwards.

So you're saying the user should be manually planning the FFTs themselves?

stepper_tmp.fCC1.data .= rand(metadata.float_type, grid.Nx, grid.Ny, grid.Nz)
poisson_solver = PoissonSolver(grid, stepper_tmp.fCC1, FFTW.PATIENT; verbose=true)
elseif metadata.arch == :gpu
model.poisson_solver = PoissonSolver(grid, stepper_tmp.fCC1, FFTW.PATIENT)
elseif metadata.arch == :GPU
stepper_tmp.fCC1.data .= CuArray{Complex{Float64}}(rand(metadata.float_type, grid.Nx, grid.Ny, grid.Nz))
poisson_solver = PoissonSolverGPU(grid, stepper_tmp.fCC1)
model.poisson_solver = PoissonSolverGPU(grid, stepper_tmp.fCC1)
end

# Putting back in the forcing functions.
model.forcing = forcing_functions

return nothing
end

Expand All @@ -76,16 +89,19 @@ function restore_from_checkpoint(filepath)
model = read(f, "model");
close(f)

# Reconstruct PoissonSolver struct with FFT plans.
println("Reconstructing FFT plans...")
metadata, grid, stepper_tmp = model.metadata, model.grid, model.stepper_tmp
if metadata.arch == :cpu
if metadata.arch == :CPU
stepper_tmp.fCC1.data .= rand(metadata.float_type, grid.Nx, grid.Ny, grid.Nz)
poisson_solver = PoissonSolver(grid, stepper_tmp.fCC1, FFTW.PATIENT; verbose=true)
elseif metadata.arch == :gpu
model.poisson_solver = PoissonSolver(grid, stepper_tmp.fCC1, FFTW.PATIENT)
elseif metadata.arch == :GPU
stepper_tmp.fCC1.data .= CuArray{Complex{Float64}}(rand(metadata.float_type, grid.Nx, grid.Ny, grid.Nz))
poisson_solver = PoissonSolverGPU(grid, stepper_tmp.fCC1)
model.poisson_solver = PoissonSolverGPU(grid, stepper_tmp.fCC1)
end

model.forcing = Forcing(nothing, nothing, nothing, nothing, nothing)
println("WARNING: Forcing functions have been set to nothing!")

return model
end

Expand All @@ -99,9 +115,9 @@ function write_output(model::Model, fw::BinaryOutputWriter)
filepath = joinpath(fw.dir, filename(fw, field_name, model.clock.iteration))

println("[BinaryOutputWriter] Writing $field_name to disk: $filepath")
if model.metadata == :cpu
if model.metadata == :CPU
write(filepath, field.data)
elseif model.metadata == :gpu
elseif model.metadata == :GPU
write(filepath, Array(field.data))
end
end
Expand Down Expand Up @@ -139,7 +155,7 @@ function write_output(model::Model, fw::NetCDFOutputWriter)
"T" => Array(model.tracers.T.data),
"S" => Array(model.tracers.S.data)
)

if fw.async
# Execute asynchronously on worker 2.
println("Using @async...")
Expand All @@ -156,10 +172,10 @@ end
function write_output_netcdf(fw::NetCDFOutputWriter, fields, iteration)
xC, yC, zC = fields["xC"], fields["yC"], fields["zC"]
xF, yF, zF = fields["xF"], fields["yF"], fields["zF"]

u, v, w = fields["u"], fields["v"], fields["w"]
T, S = fields["T"], fields["S"]

xC_attr = Dict("longname" => "Locations of the cell centers in the x-direction.", "units" => "m")
yC_attr = Dict("longname" => "Locations of the cell centers in the y-direction.", "units" => "m")
zC_attr = Dict("longname" => "Locations of the cell centers in the z-direction.", "units" => "m")
Expand All @@ -184,19 +200,19 @@ function write_output_netcdf(fw::NetCDFOutputWriter, fields, iteration)

isfile(filepath) && rm(filepath)

nccreate(filepath, "u", "xF", xF, xF_attr,
nccreate(filepath, "u", "xF", xC, xC_attr,
"yC", yC, yC_attr,
"zC", zC, zC_attr,
atts=u_attr, compress=fw.compression)

nccreate(filepath, "v", "xC", xC, xC_attr,
"yF", yF, yC_attr,
"yF", yC, yC_attr,
"zC", zC, zC_attr,
atts=v_attr, compress=fw.compression)

nccreate(filepath, "w", "xC", xC, xC_attr,
"yC", yC, yC_attr,
"zF", zF, zF_attr,
"zF", zC, zC_attr,
atts=w_attr, compress=fw.compression)

nccreate(filepath, "T", "xC", xC, xC_attr,
Expand Down
Binary file added test/deep_convection_golden_master_10.nc
Binary file not shown.
24 changes: 24 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -239,4 +239,28 @@ float_types = [Float32, Float64]
end
end
end

# Output-writer tests: one nested testset for checkpointing and one for NetCDF.
# The include must come before the calls; it presumably defines the
# run_thermal_bubble_* functions used below — TODO confirm.
@testset "Output writers" begin
include("test_output_writers.jl")

@testset "Checkpointing" begin
run_thermal_bubble_checkpointer_tests()
end

@testset "NetCDF" begin
run_thermal_bubble_netcdf_tests()
end
end

# Golden master tests: one nested testset per scenario (thermal bubble, deep
# convection). The include must come before the calls; it presumably defines the
# run_*_golden_master_tests functions used below — TODO confirm.
@testset "Golden master tests" begin
include("test_golden_master.jl")

@testset "Thermal bubble" begin
run_thermal_bubble_golden_master_tests()
end

@testset "Deep convection" begin
run_deep_convection_golden_master_tests()
end
end
end # Oceananigans tests
Loading