Force stencil (#41)

Compares several implementation of the Laplace operator. The test uses the Gray-Scott model on grid of different sizes to compare the performances of different algorithms: * Old_Laplace is our current implementation that involves reshaping * *_c* algorithm that use circshift * stencil uses matrix multiplications for the finite differences * conv uses the convolution with a kernel * *Lux* use Lux framework to perform the convolution as a NN We also tested the gpu implementation. --------- Co-authored-by: Luisa Orozco <l.orozco@esciencecenter.nl>
DEEPDIP-project · Jul 23, 2024 · 490cc62 · 490cc62
1 parent 55eea4a
commit 490cc62
Show file tree

Hide file tree

Showing 9 changed files with 519 additions and 154 deletions.
diff --git a/Project.toml b/Project.toml
@@ -5,11 +5,13 @@ version = "1.0.0-DEV"
 
 [deps]
 AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 BlockDiagonals = "0a1fb500-61f7-11e9-3c65-f5ef3456f9f0"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66"
+DSP = "717857b8-e6f2-59f4-9121-6e50c889abd2"
 DiffEqFlux = "aae7a2af-3d4f-5e19-a356-7da93b79d9d0"
 DiffEqGPU = "071ae1c0-96b5-11e9-1965-c90190d839ea"
 DifferentialEquations = "0c46a032-eb83-5123-abaf-570d42b7fbaa"
@@ -26,19 +28,22 @@ Images = "916415d5-f1e6-5110-898d-aaa5f9f070e0"
 Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
 IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153"
 JuliaFormatter = "98e50ef6-434e-11e9-1051-2b60c6c9e899"
+Kronecker = "2c470bb0-bcc8-11e8-3dad-c9649493f05e"
 LaTeXStrings = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f"
 LineSearches = "d3d80556-e9d4-5f37-9878-2ab0fcc64255"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306"
 Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
 LuxCUDA = "d0bbae9a-e099-4d5b-a835-1c6931763bda"
+MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
 Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a"
 Metaheuristics = "bcdb8e00-2c21-11e9-3065-2b553b22f898"
 MethodOfLines = "94925ecb-adb7-4558-8ed8-f975c56a0bf4"
 ModelingToolkit = "961ee093-0014-501f-94e3-6117800e7a78"
 MultivariateStats = "6f286f6a-111f-5878-ab1e-185364afe411"
 NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
 NaNMath = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3"
+OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
 Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
 OptimizationCMAEvolutionStrategy = "bd407f91-200f-4536-9381-e4ba712f53f8"
 OptimizationEvolutionary = "cb963754-43f6-435e-8d4b-99009ff27753"
@@ -54,9 +59,11 @@ RecursiveArrayTools = "731186ca-8d62-57ce-b412-fbd966d074cd"
 SciMLSensitivity = "1ed8b502-d754-442c-8d5d-10ac956f44a1"
 ShiftedArrays = "1277b4bf-5013-50f5-be3d-901d8477a67a"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
+StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 Tullio = "bc48ee85-29a4-5162-ae0b-a64e1601d4bc"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
+cShiftedArrays = "afc28532-05cf-429b-8852-6ba2afc35909"
 cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"
 
 [compat]

diff --git a/examples/plots/derivatives_CPUbenchmark_1024_1024_10.png b/examples/plots/derivatives_CPUbenchmark_1024_1024_10.png
diff --git a/examples/plots/derivatives_CPUbenchmark_1024_1024_200.png b/examples/plots/derivatives_CPUbenchmark_1024_1024_200.png
diff --git a/examples/plots/derivatives_CPUbenchmark_64_64_10.png b/examples/plots/derivatives_CPUbenchmark_64_64_10.png
diff --git a/examples/plots/derivatives_CPUbenchmark_64_64_1000.png b/examples/plots/derivatives_CPUbenchmark_64_64_1000.png
diff --git a/examples/src/test_GPU.jl b/examples/src/test_GPU.jl
@@ -1,101 +1,102 @@
-using Lux
-using LuxCUDA
-using SciMLSensitivity
-using DiffEqFlux
-using DifferentialEquations
-using Plots
-#using Plots.PlotMeasures
-using Zygote
-using Random
-rng = Random.seed!(1234)
-#using OptimizationOptimisers
-#using Statistics
-using ComponentArrays
 using CUDA
-#using Images
-#using Interpolations
-#using NNlib
-#using FFTW
-using DiffEqGPU
-# Test if CUDA is running
+CUDA.memory_status()
 CUDA.functional()
 CUDA.allowscalar(false)
 const ArrayType = CUDA.functional() ? CuArray : Array;
 const z = CUDA.functional() ? CUDA.zeros : (s...) -> zeros(Float32, s...);
-const solver_algo = CUDA.functional() ? GPUTsit5() : Tsit5();
 const MY_DEVICE = CUDA.functional() ? cu : identity;
 # and remember to use float32 if you plan to use a GPU
 const MY_TYPE = Float32;
-## Import our custom backend functions
-include("coupling_functions/functions_NODE.jl")
-include("coupling_functions/functions_CNODE_loss.jl")
-include("coupling_functions/functions_FDderivatives.jl");
+
+using Random
+rng = Random.seed!(1234)
 
 # ## Testing SciML on GPU
 
 # In this example, we want to test the GPU implementation of SciML. We also use float32 to speed up the computation on the GPU. 
+## on snellius test nodes, this is the largest grid that fits on the GPU
+## [notice that for smaller grids it is not convenient to use the GPU at all]
+#nux = nuy = nvx = nvy = 50;
+include("./../../src/grid.jl")
+dux = duy = dvx = dvy = 1.0f0
+nux = nuy = nvx = nvy = 1024
 
-dux = duy = dvx = dvy = 1.0f0;
-# on snellius test nodes, this is the largest grid that fits on the GPU
-# [notice that for smaller grids it is not convenient to use the GPU at all]
-nux = nuy = nvx = nvy = 50;
-grid = Grid(dux, duy, nux, nuy, dvx, dvy, nvx, nvy, convert_to_float32 = true);
-
-# Initial condition
-function initial_condition(grid, U₀, V₀, ε_u, ε_v; nsimulations = 1)
-    u_init = MY_TYPE.(U₀ .+ ε_u .* randn(grid.nux, grid.nuy, nsimulations))
-    v_init = MY_TYPE.(V₀ .+ ε_v .* randn(grid.nvx, grid.nvy, nsimulations))
+# Definition of the initial condition as a random perturbation over a constant background to add variety. 
+import Random
+function initial_condition(U₀, V₀, ε_u, ε_v; nsimulations = 1)
+    u_init = U₀ .+ ε_u .* Random.randn(Float32, nux, nuy, nsimulations)
+    v_init = V₀ .+ ε_v .* Random.randn(Float32, nvx, nvy, nsimulations)
     return u_init, v_init
 end
-U₀ = 0.5f0;
-V₀ = 0.25f0;
-ε_u = 0.05f0;
-ε_v = 0.1f0;
-u_initial, v_initial = initial_condition(grid, U₀, V₀, ε_u, ε_v, nsimulations = 4);
-uv0 = MY_TYPE.(vcat(
-    reshape(u_initial, grid.Nu, :), reshape(v_initial, grid.nvx * grid.nvy, :)));
-
-# RHS of GS model
-const D_u = 0.16f0;
-const D_v = 0.08f0;
-const f = 0.055f0;
-const k = 0.062f0;
-function create_functions(D_u, D_v, f, k, grid)
-    dux2 = grid.dux^2
-    duy2 = grid.duy^2
-    dvx2 = grid.dvx^2
-    dvy2 = grid.dvy^2
-    F_u(u, v) = D_u * Laplacian(u, dux2, duy2) .- u .* v .^ 2 .+ f .* (1.0f0 .- u)
-    G_v(u, v) = D_v * Laplacian(v, dvx2, dvy2) .+ u .* v .^ 2 .- (f + k) .* v
-    return F_u, G_v
-end
-F_u, G_v = create_functions(D_u, D_v, f, k, grid)
+U₀ = 0.5f0    # initial concentration of u
+V₀ = 0.25f0   # initial concentration of v
+ε_u = 0.05f0  # magnitude of the perturbation on u
+ε_v = 0.1f0   # magnitude of the perturbation on v
+nsim = 1
+u_initial, v_initial = initial_condition(U₀, V₀, ε_u, ε_v, nsimulations = nsim);
+
+# Declare the grid object 
+grid_GS_u = make_grid(dim = 2, dtype = MY_TYPE, dx = dux, nx = nux, dy = duy,
+    ny = nuy, nsim = nsim, grid_data = u_initial)
+grid_GS_v = make_grid(dim = 2, dtype = MY_TYPE, dx = dvx, nx = nvx, dy = dvy,
+    ny = nvy, nsim = nsim, grid_data = v_initial)
+
+# These are the GS parameters (also used in example 02.01) that we will try to learn
+D_u = 0.16f0
+D_v = 0.08f0
+f = 0.055f0
+k = 0.062f0;
+
+CUDA.memory_status()
+
+using ShiftedArrays
+include("./../../src/derivatives.jl")
+F_u(gu, gv) =  D_u * Laplacian(gu.grid_data, gu.dx, gv.dy) .- gu.grid_data .* gv.grid_data .^ 2 .+
+    f .* (1.0f0 .- gu.grid_data)
 
 using Profile
 # Trigger and test if the force keeps the array on the GPU
-@time zz = F_u(u_initial, v_initial);
-cuu = cu(u_initial);
-cuv = cu(v_initial);
-@time zz = F_u(cuu, cuv);
+@time zz = F_u(grid_GS_u, grid_GS_v);
+CUDA.reclaim()
+CUDA.memory_status()
+# Create the GPU grid
+cugu = make_grid(dim = 2, dtype = MY_TYPE, dx = cu(dux), nx = cu(nux), dy = cu(duy),
+    ny = cu(nuy), nsim = nsim, grid_data = cu(u_initial))
+typeof(cu(cugu.dx))
+cugv = make_grid(dim = 2, dtype = MY_TYPE, dx = cu(dvx), nx = cu(nvx), dy = cu(dvy),
+    ny = cu(nvy), nsim = cu(nsim), grid_data = cu(v_initial))
+CUDA.memory_status()
+@time zz = F_u(cugu, cugv);
 @time for i in 1:1000
     ##@profview for i in 1:100
     ##CUDA.@profile for i in 1:100
-    zz = F_u(u_initial, v_initial)
+    zz = F_u(grid_GS_u.grid_data, grid_GS_v.grid_data);
 end
 @time for i in 1:1000
     ##CUDA.@profile for i in 1:100
     ##@profview for i in 1:100
-    zz = F_u(cuu, cuv)
+    zz = F_u(cugu.grid_data, cugv.grid_data);
 end
+typeof(zz)
+
+
+
+using Lux
+using LuxCUDA
+using SciMLSensitivity
+using DiffEqFlux
+using DifferentialEquations
+using Zygote
+using DiffEqGPU
 
 # Typical cnode
-#f_CNODE_cpu = create_f_CNODE(F_u, G_v, grid; is_closed = false);
-f_CNODE_cpu = create_f_CNODE(create_functions, D_u, D_v, f, k, grid; is_closed = false);
+f_CNODE_cpu = create_f_CNODE((F_u, G_v), (grid_GS_u, grid_GS_v); is_closed = false)
 θ_cpu, st_cpu = Lux.setup(rng, f_CNODE_cpu);
 # the only difference for the gpu is that the grid lives on the gpu now
 #f_CNODE_gpu = create_f_CNODE(F_u, G_v, cu(grid); is_closed = false);
 f_CNODE_gpu = create_f_CNODE(
     create_functions, D_u, D_v, f, k, cu(grid); is_closed = false, gpu_mode = true);
+f_CNODE_gpu = create_f_CNODE((F_u, G_v), (grid_GS_u, grid_GS_v); is_closed = false)
 #f_CNODE_gpu = f_CNODE_gpu |> gpu;
 θ_gpu, st_gpu = Lux.setup(rng, f_CNODE_gpu);
 #θ_gpu = θ_gpu |> gpu;