From ef6c0a1bf040e4f6c1aff309f1faec514658b693 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olav=20M=C3=B8yner?= Date: Wed, 23 Oct 2024 20:22:31 +0200 Subject: [PATCH 01/32] Add a basic reservoir version of CUDA linear solver --- Project.toml | 6 +- ext/JutulDarcyCUDAExt/JutulDarcyCUDAExt.jl | 6 + ext/JutulDarcyCUDAExt/cuda_utils.jl | 31 ++++ ext/JutulDarcyCUDAExt/ilu0.jl | 166 +++++++++++++++++++++ ext/JutulDarcyCUDAExt/krylov.jl | 0 src/ext/ext.jl | 1 + src/ext/ext_cuda.jl | 111 ++++++++++++++ test/gpu.jl | 117 +++++++-------- 8 files changed, 374 insertions(+), 64 deletions(-) create mode 100644 ext/JutulDarcyCUDAExt/JutulDarcyCUDAExt.jl create mode 100644 ext/JutulDarcyCUDAExt/cuda_utils.jl create mode 100644 ext/JutulDarcyCUDAExt/ilu0.jl create mode 100644 ext/JutulDarcyCUDAExt/krylov.jl create mode 100644 src/ext/ext_cuda.jl diff --git a/Project.toml b/Project.toml index 985523ca..bbcf5bbc 100644 --- a/Project.toml +++ b/Project.toml @@ -34,6 +34,7 @@ TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" Tullio = "bc48ee85-29a4-5162-ae0b-a64e1601d4bc" [weakdeps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a" HYPRE = "b5ffcf37-a2bd-41ab-a3da-4bd9bc8ad771" MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" @@ -41,6 +42,7 @@ Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a" PartitionedArrays = "5a9dfac6-5c52-46f7-8278-5e2210713be9" [extensions] +JutulDarcyCUDAExt = "CUDA" JutulDarcyGLMakieExt = "GLMakie" JutulDarcyMakieExt = "Makie" JutulDarcyPartitionedArraysExt = ["PartitionedArrays", "MPI", "HYPRE"] @@ -48,6 +50,7 @@ JutulDarcyPartitionedArraysExt = ["PartitionedArrays", "MPI", "HYPRE"] [compat] AlgebraicMultigrid = "0.5.1, 0.6.0" Artifacts = "1" +CUDA = "5" DataStructures = "0.18.13" Dates = "1" DelimitedFiles = "1.6" @@ -78,6 +81,7 @@ Tullio = "0.3.4" julia = "1.7" [extras] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a" HYPRE = "b5ffcf37-a2bd-41ab-a3da-4bd9bc8ad771" MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" @@ -87,4 +91,4 @@ TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a" TestItems = "1c621080-faea-4a02-84b6-bbd5e436b8fe" [targets] -test = ["Test", "TestItemRunner", "TestItems", "HYPRE", "MPI", "PartitionedArrays"] +test = ["Test", "CUDA", "TestItemRunner", "TestItems", "HYPRE", "MPI", "PartitionedArrays"] diff --git a/ext/JutulDarcyCUDAExt/JutulDarcyCUDAExt.jl b/ext/JutulDarcyCUDAExt/JutulDarcyCUDAExt.jl new file mode 100644 index 00000000..4086db57 --- /dev/null +++ b/ext/JutulDarcyCUDAExt/JutulDarcyCUDAExt.jl @@ -0,0 +1,6 @@ +module JutulDarcyCUDAExt + using Jutul, JutulDarcy, CUDA, LinearAlgebra, SparseArrays + include("ilu0.jl") + include("krylov.jl") + include("cuda_utils.jl") +end diff --git a/ext/JutulDarcyCUDAExt/cuda_utils.jl b/ext/JutulDarcyCUDAExt/cuda_utils.jl new file mode 100644 index 00000000..18bc2c66 --- /dev/null +++ b/ext/JutulDarcyCUDAExt/cuda_utils.jl @@ -0,0 +1,31 @@ +function JutulDarcy.build_gpu_block_system(Ti, Tv, sz::Tuple{Int, Int}, blockDim::Int, rowptr, colval, nzval, r0) + rowPtr = CuVector{Ti}(rowptr) + colVal = CuVector{Ti}(colval) + nzVal = CuVector{Tv}(nzval) + dims = blockDim.*sz + dir = 'C' + nnz = length(nzVal)÷(blockDim^2) + J_bsr = CUDA.CUSPARSE.CuSparseMatrixBSR{Tv, Ti}(rowPtr, colVal, nzVal, dims, blockDim, dir, nnz) + r_cu = CuVector{Tv}(vec(r0)) + return (J_bsr, r_cu) +end + +function JutulDarcy.update_gpu_block_system!(J, blockDim, nzval, ϵ = 1e-12) + if ϵ > 0.0 + for (i, v) in enumerate(nzval) + if abs(v) < ϵ + if v > 0 + v = ϵ + else + v = ϵ + end + nzval[i] = v + end + end + end + copyto!(J.nzVal, nzval) +end + +function JutulDarcy.update_gpu_block_residual!(r_cu, blockDim, r) + copyto!(r_cu, r) +end diff --git a/ext/JutulDarcyCUDAExt/ilu0.jl b/ext/JutulDarcyCUDAExt/ilu0.jl new file mode 100644 index 00000000..08e99ad6 --- /dev/null +++ b/ext/JutulDarcyCUDAExt/ilu0.jl @@ -0,0 +1,166 @@ +function ilu0_invert!(y, A, Tv, c_char, data) + if Tv == Float32 + bname = CUDA.CUSPARSE.cusparseSbsrsv2_bufferSize + aname = CUDA.CUSPARSE.cusparseSbsrsv2_analysis + sname = CUDA.CUSPARSE.cusparseSbsrsv2_solve + else + bname = CUDA.CUSPARSE.cusparseDbsrsv2_bufferSize + aname = CUDA.CUSPARSE.cusparseDbsrsv2_analysis + sname = CUDA.CUSPARSE.cusparseDbsrsv2_solve + end + if c_char == 'U' + d_char = 'N' + kval = :ilu_upper + else + @assert c_char == 'L' + d_char = 'U' + kval = :ilu_lower + end + transa = 'N' + uplo = c_char + diag = d_char + alpha = 1.0 + index = 'O' + X = y + m,n = size(A) + if m != n + throw(DimensionMismatch("A must be square, but has dimensions ($m,$n)!")) + end + + mb = cld(m, A.blockDim) + + is_analysed = haskey(data, kval) + # is_analysed = false + if is_analysed + info, desc = data[kval] + else + info = CUDA.CUSPARSE.bsrsv2Info_t[0] + desc = CUDA.CUSPARSE.CuMatrixDescriptor('G', uplo, diag, index) + CUDA.CUSPARSE.cusparseCreateBsrsv2Info(info) + data[kval] = (info, desc) + end + + function bufferSize() + out = Ref{Cint}(1) + bname(CUDA.CUSPARSE.handle(), A.dir, transa, mb, A.nnzb, + desc, nonzeros(A), A.rowPtr, A.colVal, A.blockDim, + info[1], out) + return out[] + end + CUDA.CUSPARSE.with_workspace(bufferSize) do buffer + if !is_analysed + aname(CUDA.CUSPARSE.handle(), A.dir, transa, mb, A.nnzb, + desc, nonzeros(A), A.rowPtr, A.colVal, A.blockDim, + info[1], CUDA.CUSPARSE.CUSPARSE_SOLVE_POLICY_USE_LEVEL, buffer) + posit = Ref{Cint}(1) + CUDA.CUSPARSE.cusparseXbsrsv2_zeroPivot(CUDA.CUSPARSE.handle(), info[1], posit) + if posit[] >= 0 + error("Structural/numerical zero in A at ($(posit[]),$(posit[])))") + end + end + sname(CUDA.CUSPARSE.handle(), A.dir, transa, mb, A.nnzb, + alpha, desc, nonzeros(A), A.rowPtr, A.colVal, + A.blockDim, info[1], X, X, + CUDA.CUSPARSE.CUSPARSE_SOLVE_POLICY_USE_LEVEL, buffer) + end + # Alternatively we could not cache setup and do the following: + # if c_char == 'U' + # CUDA.CUSPARSE.sv2!('N', 'U', 'N', 1.0, A, y, 'O') + # else + # @assert c_char == 'L' + # CUDA.CUSPARSE.sv2!('N', 'L', 'U', 1.0, A, y, 'O') + # end + return y +end + +function ilu0_gpu_update!(A, Tv, data) + if Tv == Float32 + bname = CUDA.CUSPARSE.cusparseSbsrilu02_bufferSize + aname = CUDA.CUSPARSE.cusparseSbsrilu02_analysis + sname = CUDA.CUSPARSE.cusparseSbsrilu02 + else + bname = CUDA.CUSPARSE.cusparseDbsrilu02_bufferSize + aname = CUDA.CUSPARSE.cusparseDbsrilu02_analysis + sname = CUDA.CUSPARSE.cusparseDbsrilu02 + end + mb = div(size(A, 1), A.blockDim) + ilu0_is_analyzed = haskey(data, :ilu0) + if ilu0_is_analyzed + info, desc = data[:ilu0] + else + info = CUDA.CUSPARSE.ILU0InfoBSR() + desc = CUDA.CUSPARSE.CuMatrixDescriptor('G', 'U', 'N', 'O') + data[:ilu0] = (info, desc) + end + + function bufferSize() + out = Ref{Cint}(1) + bname(CUDA.CUSPARSE.handle(), A.dir, mb, nnz(A), desc, nonzeros(A), + A.rowPtr, A.colVal, A.blockDim, info, out) + return out[] + end + + policy = CUDA.CUSPARSE.CUSPARSE_SOLVE_POLICY_USE_LEVEL + CUDA.CUSPARSE.with_workspace(bufferSize) do buffer + if !ilu0_is_analyzed + aname( + CUDA.CUSPARSE.handle(), + A.dir, + mb, + nnz(A), + desc, + nonzeros(A), + A.rowPtr, + A.colVal, + A.blockDim, + info, + policy, + buffer + ) + posit = Ref{Cint}(1) + CUDA.CUSPARSE.cusparseXbsrilu02_zeroPivot(CUDA.CUSPARSE.handle(), info, posit) + if posit[] >= 0 + error("Structural/numerical zero in A at ($(posit[]),$(posit[])))") + end + end + sname( + CUDA.CUSPARSE.handle(), + A.dir, + mb, + SparseArrays.nnz(A), + desc, + nonzeros(A), + A.rowPtr, + A.colVal, + A.blockDim, + info, + policy, + buffer + ) + end +end + + +function Jutul.update_preconditioner!(ilu::ILUZeroPreconditioner, J_bsr::CUDA.CUSPARSE.CuSparseMatrixBSR{Tv, Ti}, b, context, executor) where {Tv, Ti} + if isnothing(ilu.factor) + data = Dict() + data[:ilu0_gpu] = copy(J_bsr) + ilu.dim = size(J_bsr) + ilu.factor = data + end + data = ilu.factor + ilu_gpu = data[:ilu0_gpu] + copyto!(ilu_gpu.nzVal, J_bsr.nzVal) + ilu0_gpu_update!(ilu_gpu, Tv, data) +end + +function Jutul.apply!(y::CuVector{Tv}, f::ILUZeroPreconditioner, x::CuVector{Tv}) where Tv + copyto!(y, x) + # Equivialent but does not skip analyse + # sv2!('N', 'L', 'U', 1.0, P, y, 'O') + # sv2!('N', 'U', 'N', 1.0, P, y, 'O') + P = f.factor[:ilu0_gpu] + ilu0_invert!(y, P, Tv, 'L', f.factor) + ilu0_invert!(y, P, Tv, 'U', f.factor) + CUDA.synchronize() +end diff --git a/ext/JutulDarcyCUDAExt/krylov.jl b/ext/JutulDarcyCUDAExt/krylov.jl new file mode 100644 index 00000000..e69de29b diff --git a/src/ext/ext.jl b/src/ext/ext.jl index f1700836..0b716955 100644 --- a/src/ext/ext.jl +++ b/src/ext/ext.jl @@ -1,3 +1,4 @@ include("ext_makie.jl") include("ext_partitionedarrays.jl") +include("ext_cuda.jl") diff --git a/src/ext/ext_cuda.jl b/src/ext/ext_cuda.jl new file mode 100644 index 00000000..593766a4 --- /dev/null +++ b/src/ext/ext_cuda.jl @@ -0,0 +1,111 @@ +mutable struct CUDAReservoirKrylov{Tv, Ti} <: Jutul.AbstractKrylov + solver::Symbol + config::IterativeSolverConfig + preconditioner + data + storage +end + +function CUDAReservoirKrylov(solver = :gmres, prec = ILUZeroPreconditioner(); + Float_t = Float64, + Int_t = Int32, + kwarg... + ) + cfg = IterativeSolverConfig(; kwarg...) + CUDAReservoirKrylov{Float_t, Int_t}(solver, cfg, prec, Dict{Symbol, Any}(), nothing) +end + +function Jutul.linear_solve!(lsys::Jutul.LinearizedSystem, + krylov::CUDAReservoirKrylov{Tv, Ti}, + model, + storage = nothing, + dt = nothing, + recorder = ProgressRecorder(), + executor = default_executor(); + dx = lsys.dx_buffer, + r = Jutul.vector_residual(lsys), + atol = Jutul.linear_solver_tolerance(krylov, :absolute), + rtol = Jutul.linear_solver_tolerance(krylov, :relative) + ) where {Tv, Ti} + cfg = krylov.config + + atol = convert(Tv, atol) + rtol = convert(Tv, rtol) + t_prep0 = @elapsed Jutul.prepare_linear_solve!(lsys) + + L = lsys[1,1] + J = L.jac + J::Jutul.StaticSparsityMatrixCSR + r0 = L.r_buffer + bz = Jutul.block_size(lsys) + sz = size(J) + n, m = sz + @assert n == m + is_first = !haskey(krylov.data, :J) + t_setup = @elapsed if is_first + krylov.data[:J], krylov.data[:r] = build_gpu_block_system(Ti, Tv, sz, bz, J.At.colptr, J.At.rowval, L.jac_buffer, r0) + end + J_bsr = krylov.data[:J] + r_cu = krylov.data[:r] + + t_gpu_update = @elapsed begin + update_gpu_block_residual!(r_cu, bz, r0) + update_gpu_block_system!(J_bsr, bz, L.jac_buffer) + end + t_prep = t_gpu_update + t_setup + t_prep0 + max_it = krylov.config.max_iterations + + Jutul.update_preconditioner!(krylov.preconditioner, J_bsr, r_cu, model.context, executor) + prec_op = Jutul.preconditioner(krylov, lsys, model, storage, recorder, Tv) + if cfg.precond_side == :right + preconditioner_arg = (N = prec_op, ) + else + preconditioner_arg = (M = prec_op, ) + end + solve_f, F = Jutul.krylov_jl_solve_function(krylov, J_bsr, r_cu) + solve_f(F, J_bsr, r_cu; + itmax = max_it, + preconditioner_arg..., + verbose = 0, + ldiv = false, + history = true, + rtol = rtol, + atol = atol + ) + dx_gpu, stats = (krylov.storage.x, krylov.storage.stats) + + copyto!(dx, dx_gpu) + @. dx = -dx + + res = stats.residuals + solved = stats.solved + n = length(res) - 1 + + t_prec = 0.0 + if prec_op isa Jutul.PrecondWrapper + t_prec_op = prec_op.time + t_prec_count = prec_op.count + else + t_prec_op = 0.0 + t_prec_count = 0 + end + return Jutul.linear_solve_return(solved, n, stats, precond = t_prec_op, precond_count = t_prec_count, prepare = t_prec + t_prep) +end + +function Base.show(io::IO, krylov::CUDAReservoirKrylov) + rtol = linear_solver_tolerance(krylov.config, :relative) + atol = linear_solver_tolerance(krylov.config, :absolute) + print(io, "CUDAReservoirKrylov using $(krylov.solver) (ϵₐ=$atol, ϵ=$rtol) with prec = $(typeof(krylov.preconditioner))") +end + +function build_gpu_block_system + +end + +function update_gpu_block_system! + +end + +function update_gpu_block_residual! + +end diff --git a/test/gpu.jl b/test/gpu.jl index 19590eb8..75de7e17 100644 --- a/test/gpu.jl +++ b/test/gpu.jl @@ -1,69 +1,60 @@ -using Jutul, JutulDarcy -using LinearAlgebra, Krylov -using Printf -using ForwardDiff -using CUDA, Test - -function test_single_phase_gpu(casename = "pico"; float_type = Float32, pvfrac=0.05, tstep = [1.0])#[1.0, 2.0]) - G, mrst_data = get_minimal_tpfa_grid_from_mrst(casename, extraout = true) - nc = number_of_cells(G) - # Parameters +using Jutul, JutulDarcy, CUDA, Test + +function solve_bl_lsolve(; nx = 10, ny = 1, nstep = nx*ny, lsolve = missing, backend = :csr, step_limit = nothing, kwarg...) + time = 1.0 + T = time + tstep = repeat([T/nstep], nstep) + mesh = CartesianMesh((nx, ny)) + domain = reservoir_domain(mesh) + nc = number_of_cells(domain) + timesteps = tstep*3600*24 bar = 1e5 - p0 = 100*bar # 100 bar - - # Single-phase liquid system (compressible pressure equation) - phase = LiquidPhase() - sys = SinglePhaseSystem(phase) - # Simulation model wraps grid and system together with context (which will be used for GPU etc) - ctx = SingleCUDAContext(float_type) - linsolve = GenericKrylov(Krylov.dqgmres, preconditioner = ILUZeroPreconditioner(), relative_tolerance = 0.01) - model = SimulationModel(G, sys, context = ctx) - - - s = model.secondary_variables - - mu_c = transfer(ctx, repeat([1e-3], 1, nc)) - mu = ConstantVariables(mu_c, Cells(), single_entity = false) - s[:PhaseViscosities] = mu - - s_d = transfer(ctx, ones(1, nc)) - sat = ConstantVariables(s_d, Cells(), single_entity = false) - s[:Saturations] = sat - s[:RelativePermeabilities] = sat - # System state - pv = physical_representation(model).pore_volumes - timesteps = tstep*3600*24 # 1 day, 2 days - tot_time = sum(timesteps) - irate = pvfrac*sum(pv)/tot_time - src = [SourceTerm(1, irate), - SourceTerm(nc, -irate)] - src = CuArray(src) - forces = setup_forces(model, sources = src) - - # State is dict with pressure in each cell - init = Dict(:Pressure => p0) - state0 = setup_state(model, init) - # Model parameters + p0 = 100*bar + sys = ImmiscibleSystem((LiquidPhase(), VaporPhase()), reference_densities = [100.0, 100.0]) + model = setup_reservoir_model(domain, sys, extra_out = false, backend = backend) + kr = BrooksCoreyRelativePermeabilities(sys, [2.0, 2.0], [0.2, 0.2]) + rmodel = reservoir_model(model) + c = [1e-3, 1e-3]/bar + density = ConstantCompressibilityDensities( + p_ref = 100*bar, + density_ref = [100.0, 100.0], + compressibility = c + ) + replace_variables!(rmodel, RelativePermeabilities = kr, PhaseMassDensities = density) + tot_time = sum(timesteps) + pv = pore_volume(domain) + irate = 500*sum(pv)/tot_time + src = SourceTerm(nc, irate, fractional_flow = [0.75, 0.25]) + bc = FlowBoundaryCondition(1, p0/2) + forces = setup_reservoir_forces(model, sources = src, bc = bc) parameters = setup_parameters(model) - parameters[:reference_densities] = CuArray(float_type.([1.0])) - - # linsolve = nothing - sim = Simulator(model, state0 = state0, parameters = parameters) - info_level = 2 - debug_level = 2 - simulate(sim, timesteps, forces = forces, linear_solver = linsolve, - debug_level = debug_level, info_level = info_level, output_states = false) - return true + state0 = setup_reservoir_state(model, Pressure = p0, Saturations = [0.25, 0.75]) + if ismissing(lsolve) + lsolve = reservoir_linsolve(model) + end + if !isnothing(step_limit) + timesteps = timesteps[1:step_limit] + end + states, report = simulate(state0, model, timesteps; failure_cuts_timestep = false, + forces = forces, parameters = parameters, linear_solver = lsolve, kwarg...) + return states[end][:Reservoir][:Saturations][1, :] end -if has_cuda_gpu() - CUDA.allowscalar(false) - casename = "pico" - @testset "GPU multiphase" begin - @testset "Basic flow - single precision" begin - @test test_single_phase_gpu(casename, float_type = Float32) - end - @testset "Basic flow - double precision" begin - @test test_single_phase_gpu(casename, float_type = Float64) + +if CUDA.functional() + do_solve(x) = solve_bl_lsolve( + lsolve = x, + info_level = -1 + ) + @testset "SimulationModel" begin + krylov_cpu = GenericKrylov(:bicgstab, preconditioner = ILUZeroPreconditioner()) + s_cpu = do_solve(krylov_cpu) + + for T in [Float32, Float64] + for solver in [:bicgstab, :gmres] + krylov_cu = JutulDarcy.CUDAReservoirKrylov(solver, Float_t = T) + s_cu = do_solve(krylov_cu) + @test s_cpu ≈ s_cu + end end end end From f778b4ff0cba9d24d0691e4864151c3a6fd6f038 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olav=20M=C3=B8yner?= Date: Thu, 24 Oct 2024 11:26:46 +0200 Subject: [PATCH 02/32] Update utils.jl --- src/utils.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/utils.jl b/src/utils.jl index ee8c7742..cbad05ef 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -630,18 +630,17 @@ function setup_reservoir_simulator(case::JutulCase; failure_cuts_timestep = true, max_timestep_cuts = 25, inc_tol_dz = Inf, - set_linear_solver = true, timesteps = :auto, relaxation = false, presolve_wells = false, parray_arg = Dict{Symbol, Any}(), + set_linear_solver = linear_solver isa Symbol || ismissing(linear_solver), linear_solver_arg = Dict{Symbol, Any}(), extra_timing_setup = false, nldd_partition = missing, nldd_arg = Dict{Symbol, Any}(), kwarg... ) - set_linear_solver = set_linear_solver || linear_solver isa Symbol # Handle old kwarg... max_timestep = min(max_dt, max_timestep) extra_kwarg = Dict{Symbol, Any}() @@ -699,6 +698,8 @@ function setup_reservoir_simulator(case::JutulCase; if set_linear_solver if info_level < 1 v = -1 + elseif info_level > 4 + v = 1 else v = 0 end From e2ff69fe6abe55446ea2c8ecbdb549c60aa1e213 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olav=20M=C3=B8yner?= Date: Thu, 24 Oct 2024 13:52:01 +0200 Subject: [PATCH 03/32] Generalize CUDA a bit --- src/ext/ext_cuda.jl | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/ext/ext_cuda.jl b/src/ext/ext_cuda.jl index 593766a4..cc764948 100644 --- a/src/ext/ext_cuda.jl +++ b/src/ext/ext_cuda.jl @@ -15,7 +15,7 @@ function CUDAReservoirKrylov(solver = :gmres, prec = ILUZeroPreconditioner(); CUDAReservoirKrylov{Float_t, Int_t}(solver, cfg, prec, Dict{Symbol, Any}(), nothing) end -function Jutul.linear_solve!(lsys::Jutul.LinearizedSystem, +function Jutul.linear_solve!(lsys::Jutul.LSystem, krylov::CUDAReservoirKrylov{Tv, Ti}, model, storage = nothing, @@ -36,21 +36,22 @@ function Jutul.linear_solve!(lsys::Jutul.LinearizedSystem, L = lsys[1,1] J = L.jac J::Jutul.StaticSparsityMatrixCSR - r0 = L.r_buffer - bz = Jutul.block_size(lsys) + csr_block_buffer = L.jac_buffer + + bz = Jutul.block_size(L) sz = size(J) n, m = sz @assert n == m is_first = !haskey(krylov.data, :J) t_setup = @elapsed if is_first - krylov.data[:J], krylov.data[:r] = build_gpu_block_system(Ti, Tv, sz, bz, J.At.colptr, J.At.rowval, L.jac_buffer, r0) + krylov.data[:J], krylov.data[:r] = build_gpu_block_system(Ti, Tv, sz, bz, J.At.colptr, J.At.rowval, csr_block_buffer, r) end J_bsr = krylov.data[:J] r_cu = krylov.data[:r] t_gpu_update = @elapsed begin - update_gpu_block_residual!(r_cu, bz, r0) - update_gpu_block_system!(J_bsr, bz, L.jac_buffer) + update_gpu_block_residual!(r_cu, bz, r) + update_gpu_block_system!(J_bsr, bz, csr_block_buffer) end t_prep = t_gpu_update + t_setup + t_prep0 max_it = krylov.config.max_iterations From 93bb1e1394685d6ee6d410f13c53c405bb0c87ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olav=20M=C3=B8yner?= Date: Thu, 24 Oct 2024 14:55:37 +0200 Subject: [PATCH 04/32] Put in place some Schur stuff --- ext/JutulDarcyCUDAExt/cuda_utils.jl | 59 +++++++++++++++++++++++++++++ src/ext/ext_cuda.jl | 7 ++++ 2 files changed, 66 insertions(+) diff --git a/ext/JutulDarcyCUDAExt/cuda_utils.jl b/ext/JutulDarcyCUDAExt/cuda_utils.jl index 18bc2c66..dcffacb7 100644 --- a/ext/JutulDarcyCUDAExt/cuda_utils.jl +++ b/ext/JutulDarcyCUDAExt/cuda_utils.jl @@ -29,3 +29,62 @@ end function JutulDarcy.update_gpu_block_residual!(r_cu, blockDim, r) copyto!(r_cu, r) end + +function JutulDarcy.build_gpu_schur_system(Ti, Tv, bz, lsys::LinearizedSystem) + return nothing +end + +function JutulDarcy.build_gpu_schur_system(Ti, Tv, bz, lsys::MultiLinearizedSystem) + @assert size(lsys.subsystems) == (2, 2) + @assert length(lsys.schur_buffer) == 2 + buf_full_cpu = lsys.schur_buffer[1] + buf_1_cpu, buf_2_cpu = lsys.schur_buffer[2] + B_cpu = lsys[1, 1].jac # Already transferred, not needed + C_cpu = lsys[1, 2].jac + D_cpu = lsys[2, 1].jac + E_cpu = lsys[2, 2].jac + + E_factor = only(lsys.factor.factor) + E_L_cpu = E_factor.L + E_U_cpu = E_factor.U + + to_gpu(x) = copy_to_gpu(x, Tv, Ti) + + D = to_gpu(D_cpu) + E_L = to_gpu(E_L_cpu) + E_U = to_gpu(E_U_cpu) + C = to_gpu(C_cpu) + + buf_1 = to_gpu(buf_1_cpu) + buf_2 = to_gpu(buf_2_cpu) + # Operations and format: + # B C + # D E + # mul!(b_buf_2, D_i, x) + # ldiv!(b_buf_1, E_i, b_buf_2) + # mul!(res, C_i, b_buf_1, -α, true) + + return Dict( + :C => C, + :D => D, + :buf_1 => buf_1, + :buf_2 => buf_2, + :E_L => E_L, + :E_U => E_U + ) +end + +function copy_to_gpu(x, Ti, Tv) + error("Internal converter not set up for $(typeof(x))") +end + +function copy_to_gpu(x::SparseMatrixCSC{Tvc, Tic}, Tv, Ti) where {Tvc, Tic} + if Tvc != Tv || Tic != Ti + x = SparseMatrixCSC{Tv, Ti}(x) + end + return CUDA.CUSPARSE.CuSparseMatrixCSC(x) +end + +function copy_to_gpu(x::Vector{Tvc}, Tv, Ti) where {Tvc} + return convert(CuVector{Tv}, x) +end diff --git a/src/ext/ext_cuda.jl b/src/ext/ext_cuda.jl index cc764948..c7f96ff8 100644 --- a/src/ext/ext_cuda.jl +++ b/src/ext/ext_cuda.jl @@ -33,6 +33,8 @@ function Jutul.linear_solve!(lsys::Jutul.LSystem, rtol = convert(Tv, rtol) t_prep0 = @elapsed Jutul.prepare_linear_solve!(lsys) + is_single_linear_system = lsys isa LinearizedSystem + L = lsys[1,1] J = L.jac J::Jutul.StaticSparsityMatrixCSR @@ -45,6 +47,7 @@ function Jutul.linear_solve!(lsys::Jutul.LSystem, is_first = !haskey(krylov.data, :J) t_setup = @elapsed if is_first krylov.data[:J], krylov.data[:r] = build_gpu_block_system(Ti, Tv, sz, bz, J.At.colptr, J.At.rowval, csr_block_buffer, r) + krylov.data[:schur] = build_gpu_schur_system(Ti, Tv, bz, lsys) end J_bsr = krylov.data[:J] r_cu = krylov.data[:r] @@ -107,6 +110,10 @@ function update_gpu_block_system! end +function build_gpu_schur_system + +end + function update_gpu_block_residual! end From b55ba61c4969ddcfc89c1ac23f652862eb75b0c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olav=20M=C3=B8yner?= Date: Thu, 24 Oct 2024 15:07:56 +0200 Subject: [PATCH 05/32] Put in place some more factorization updates --- ext/JutulDarcyCUDAExt/cuda_utils.jl | 15 +++++++++++++-- src/ext/ext_cuda.jl | 12 ++++++++++-- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/ext/JutulDarcyCUDAExt/cuda_utils.jl b/ext/JutulDarcyCUDAExt/cuda_utils.jl index dcffacb7..8713879d 100644 --- a/ext/JutulDarcyCUDAExt/cuda_utils.jl +++ b/ext/JutulDarcyCUDAExt/cuda_utils.jl @@ -39,10 +39,10 @@ function JutulDarcy.build_gpu_schur_system(Ti, Tv, bz, lsys::MultiLinearizedSyst @assert length(lsys.schur_buffer) == 2 buf_full_cpu = lsys.schur_buffer[1] buf_1_cpu, buf_2_cpu = lsys.schur_buffer[2] - B_cpu = lsys[1, 1].jac # Already transferred, not needed + # B_cpu = lsys[1, 1].jac # Already transferred, not needed C_cpu = lsys[1, 2].jac D_cpu = lsys[2, 1].jac - E_cpu = lsys[2, 2].jac + # E_cpu = lsys[2, 2].jac # Only factorization needed I think. E_factor = only(lsys.factor.factor) E_L_cpu = E_factor.L @@ -74,6 +74,17 @@ function JutulDarcy.build_gpu_schur_system(Ti, Tv, bz, lsys::MultiLinearizedSyst ) end +function JutulDarcy.update_gpu_schur_system!(schur, lsys) + C_cpu = lsys[1, 2].jac + copyto!(schur[:C].nzVal, C_cpu.nzval) + D_cpu = lsys[2, 1].jac + copyto!(schur[:D].nzVal, D_cpu.nzval) + E_factor = only(lsys.factor.factor) + copyto!(schur[:E_L].nzVal, E_factor.L.nzval) + copyto!(schur[:E_U].nzVal, E_factor.U.nzval) + return schur +end + function copy_to_gpu(x, Ti, Tv) error("Internal converter not set up for $(typeof(x))") end diff --git a/src/ext/ext_cuda.jl b/src/ext/ext_cuda.jl index c7f96ff8..8a94b9c6 100644 --- a/src/ext/ext_cuda.jl +++ b/src/ext/ext_cuda.jl @@ -51,10 +51,14 @@ function Jutul.linear_solve!(lsys::Jutul.LSystem, end J_bsr = krylov.data[:J] r_cu = krylov.data[:r] + schur = krylov.data[:schur] t_gpu_update = @elapsed begin - update_gpu_block_residual!(r_cu, bz, r) - update_gpu_block_system!(J_bsr, bz, csr_block_buffer) + if !is_first + update_gpu_block_residual!(r_cu, bz, r) + update_gpu_block_system!(J_bsr, bz, csr_block_buffer) + update_gpu_schur_system!(schur, lsys) + end end t_prep = t_gpu_update + t_setup + t_prep0 max_it = krylov.config.max_iterations @@ -117,3 +121,7 @@ end function update_gpu_block_residual! end + +function update_gpu_schur_system! + +end From e3c230e62680bfb64f109167301b43b6fe849be0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olav=20M=C3=B8yner?= Date: Thu, 24 Oct 2024 15:17:17 +0200 Subject: [PATCH 06/32] Schur additions --- ext/JutulDarcyCUDAExt/cuda_utils.jl | 4 ++++ src/ext/ext_cuda.jl | 30 +++++++++++++++++++++++++++-- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/ext/JutulDarcyCUDAExt/cuda_utils.jl b/ext/JutulDarcyCUDAExt/cuda_utils.jl index 8713879d..04d19163 100644 --- a/ext/JutulDarcyCUDAExt/cuda_utils.jl +++ b/ext/JutulDarcyCUDAExt/cuda_utils.jl @@ -74,6 +74,10 @@ function JutulDarcy.build_gpu_schur_system(Ti, Tv, bz, lsys::MultiLinearizedSyst ) end +function JutulDarcy.update_gpu_schur_system!(schur::Nothing, lsys) + return schur +end + function JutulDarcy.update_gpu_schur_system!(schur, lsys) C_cpu = lsys[1, 2].jac copyto!(schur[:C].nzVal, C_cpu.nzval) diff --git a/src/ext/ext_cuda.jl b/src/ext/ext_cuda.jl index 8a94b9c6..665e4cfd 100644 --- a/src/ext/ext_cuda.jl +++ b/src/ext/ext_cuda.jl @@ -70,8 +70,9 @@ function Jutul.linear_solve!(lsys::Jutul.LSystem, else preconditioner_arg = (M = prec_op, ) end - solve_f, F = Jutul.krylov_jl_solve_function(krylov, J_bsr, r_cu) - solve_f(F, J_bsr, r_cu; + operator = gpu_system_linear_operator(J_bsr, schur, Tv) + solve_f, F = Jutul.krylov_jl_solve_function(krylov, operator, r_cu) + solve_f(F, operator, r_cu; itmax = max_it, preconditioner_arg..., verbose = 0, @@ -106,6 +107,31 @@ function Base.show(io::IO, krylov::CUDAReservoirKrylov) print(io, "CUDAReservoirKrylov using $(krylov.solver) (ϵₐ=$atol, ϵ=$rtol) with prec = $(typeof(krylov.preconditioner))") end +function gpu_system_linear_operator(J, schur::Nothing, Tv) + return Jutul.LinearOperators.LinearOperator(J) +end + +function gpu_system_linear_operator(J, schur, Tv) + n, m = size(J) + C = schur[:C] + D = schur[:D] + E_L = schur[:E_L] + E_U = schur[:E_U] + buf_1 = schur[:buf_1] + buf_2 = schur[:buf_2] + mul!(x, y) = schur_mul_gpu!(x, y, J, C, D, buf_1, buf_2, E_L, E_U) + return Jutul.LinearOperators.LinearOperator(Tv, n, m, false, false, mul!) +end + +function schur_mul_gpu!(x, y, J, C, D, buf_1, buf_2, E_L, E_U) + mul!(buf_2, D, x) + ldiv!(buf_1, E_L, buf_2) + ldiv!(buf_2, E_U, buf_1) + mul!(res, C, buf_2, -α, true) + + error() +end + function build_gpu_block_system end From ca0572fdced0507da0ff8b4672c44380e48faa21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olav=20M=C3=B8yner?= Date: Thu, 24 Oct 2024 15:33:15 +0200 Subject: [PATCH 07/32] Update mul --- ext/JutulDarcyCUDAExt/cuda_utils.jl | 10 ++++++++++ src/ext/ext_cuda.jl | 9 ++------- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/ext/JutulDarcyCUDAExt/cuda_utils.jl b/ext/JutulDarcyCUDAExt/cuda_utils.jl index 04d19163..18b16210 100644 --- a/ext/JutulDarcyCUDAExt/cuda_utils.jl +++ b/ext/JutulDarcyCUDAExt/cuda_utils.jl @@ -103,3 +103,13 @@ end function copy_to_gpu(x::Vector{Tvc}, Tv, Ti) where {Tvc} return convert(CuVector{Tv}, x) end + +function JutulDarcy.schur_mul_gpu!(x, y, α, β, J, C, D, buf_1, buf_2, E_L, E_U) + mul!(buf_2, D, x) + # ldiv!(buf_1, E_L, buf_2) + CUDA.CUSPARSE.sv2!('N', 'L', 'N', 1.0, E_L, buf_2, 'O') + # ldiv!(buf_2, E_U, buf_1) + CUDA.CUSPARSE.sv2!('N', 'L', 'N', 1.0, E_U, buf_2, 'O') + mul!(res, C, buf_2, -α, true) + error() +end diff --git a/src/ext/ext_cuda.jl b/src/ext/ext_cuda.jl index 665e4cfd..a7763e7f 100644 --- a/src/ext/ext_cuda.jl +++ b/src/ext/ext_cuda.jl @@ -119,17 +119,12 @@ function gpu_system_linear_operator(J, schur, Tv) E_U = schur[:E_U] buf_1 = schur[:buf_1] buf_2 = schur[:buf_2] - mul!(x, y) = schur_mul_gpu!(x, y, J, C, D, buf_1, buf_2, E_L, E_U) + mul!(x, y, α, β) = schur_mul_gpu!(x, y, α, β, J, C, D, buf_1, buf_2, E_L, E_U) return Jutul.LinearOperators.LinearOperator(Tv, n, m, false, false, mul!) end -function schur_mul_gpu!(x, y, J, C, D, buf_1, buf_2, E_L, E_U) - mul!(buf_2, D, x) - ldiv!(buf_1, E_L, buf_2) - ldiv!(buf_2, E_U, buf_1) - mul!(res, C, buf_2, -α, true) +function schur_mul_gpu! - error() end function build_gpu_block_system From 267a1cac2bfa5d4e2817532dfb12dd58b531d7e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olav=20M=C3=B8yner?= Date: Thu, 24 Oct 2024 20:52:19 +0200 Subject: [PATCH 08/32] Rough draft of schur mostly on GPU --- ext/JutulDarcyCUDAExt/cuda_utils.jl | 42 ++++++++++++++--------------- ext/JutulDarcyCUDAExt/ilu0.jl | 2 +- src/ext/ext_cuda.jl | 18 ++++++++----- 3 files changed, 33 insertions(+), 29 deletions(-) diff --git a/ext/JutulDarcyCUDAExt/cuda_utils.jl b/ext/JutulDarcyCUDAExt/cuda_utils.jl index 18b16210..e3409ada 100644 --- a/ext/JutulDarcyCUDAExt/cuda_utils.jl +++ b/ext/JutulDarcyCUDAExt/cuda_utils.jl @@ -45,18 +45,18 @@ function JutulDarcy.build_gpu_schur_system(Ti, Tv, bz, lsys::MultiLinearizedSyst # E_cpu = lsys[2, 2].jac # Only factorization needed I think. E_factor = only(lsys.factor.factor) - E_L_cpu = E_factor.L - E_U_cpu = E_factor.U + # E_L_cpu = E_factor.L + # E_U_cpu = E_factor.U to_gpu(x) = copy_to_gpu(x, Tv, Ti) D = to_gpu(D_cpu) - E_L = to_gpu(E_L_cpu) - E_U = to_gpu(E_U_cpu) + # E_L = to_gpu(E_L_cpu) + # E_U = to_gpu(E_U_cpu) C = to_gpu(C_cpu) - buf_1 = to_gpu(buf_1_cpu) - buf_2 = to_gpu(buf_2_cpu) + buf = to_gpu(buf_1_cpu) + # buf_2 = to_gpu(buf_2_cpu) # Operations and format: # B C # D E @@ -67,10 +67,10 @@ function JutulDarcy.build_gpu_schur_system(Ti, Tv, bz, lsys::MultiLinearizedSyst return Dict( :C => C, :D => D, - :buf_1 => buf_1, - :buf_2 => buf_2, - :E_L => E_L, - :E_U => E_U + :buf => buf, + :buf_1_cpu => buf_1_cpu, + :buf_2_cpu => buf_2_cpu, + :E_factor => E_factor ) end @@ -83,9 +83,6 @@ function JutulDarcy.update_gpu_schur_system!(schur, lsys) copyto!(schur[:C].nzVal, C_cpu.nzval) D_cpu = lsys[2, 1].jac copyto!(schur[:D].nzVal, D_cpu.nzval) - E_factor = only(lsys.factor.factor) - copyto!(schur[:E_L].nzVal, E_factor.L.nzval) - copyto!(schur[:E_U].nzVal, E_factor.U.nzval) return schur end @@ -104,12 +101,15 @@ function copy_to_gpu(x::Vector{Tvc}, Tv, Ti) where {Tvc} return convert(CuVector{Tv}, x) end -function JutulDarcy.schur_mul_gpu!(x, y, α, β, J, C, D, buf_1, buf_2, E_L, E_U) - mul!(buf_2, D, x) - # ldiv!(buf_1, E_L, buf_2) - CUDA.CUSPARSE.sv2!('N', 'L', 'N', 1.0, E_L, buf_2, 'O') - # ldiv!(buf_2, E_U, buf_1) - CUDA.CUSPARSE.sv2!('N', 'L', 'N', 1.0, E_U, buf_2, 'O') - mul!(res, C, buf_2, -α, true) - error() +function JutulDarcy.schur_mul_gpu!(y, x, α, β, J, C, D, buf::CuVector, buf1_cpu::Vector, buf2_cpu::Vector, E_factor) + # Working on GPU + mul!(y, J, x, α, β) + mul!(buf, D, x) + # Move over to CPU for this part + copyto!(buf1_cpu, buf) + ldiv!(buf2_cpu, E_factor, buf1_cpu) + # Back to GPU for the big array... + copyto!(buf, buf2_cpu) + mul!(y, C, buf, -α, true) + return y end diff --git a/ext/JutulDarcyCUDAExt/ilu0.jl b/ext/JutulDarcyCUDAExt/ilu0.jl index 08e99ad6..d7e1fd99 100644 --- a/ext/JutulDarcyCUDAExt/ilu0.jl +++ b/ext/JutulDarcyCUDAExt/ilu0.jl @@ -1,4 +1,4 @@ -function ilu0_invert!(y, A, Tv, c_char, data) +function ilu0_invert!(y, A::CUDA.CUSPARSE.CuSparseMatrixBSR, Tv, c_char, data) if Tv == Float32 bname = CUDA.CUSPARSE.cusparseSbsrsv2_bufferSize aname = CUDA.CUSPARSE.cusparseSbsrsv2_analysis diff --git a/src/ext/ext_cuda.jl b/src/ext/ext_cuda.jl index a7763e7f..af312023 100644 --- a/src/ext/ext_cuda.jl +++ b/src/ext/ext_cuda.jl @@ -48,10 +48,12 @@ function Jutul.linear_solve!(lsys::Jutul.LSystem, t_setup = @elapsed if is_first krylov.data[:J], krylov.data[:r] = build_gpu_block_system(Ti, Tv, sz, bz, J.At.colptr, J.At.rowval, csr_block_buffer, r) krylov.data[:schur] = build_gpu_schur_system(Ti, Tv, bz, lsys) + krylov.data[:dx_cpu] = zeros(n*bz) end J_bsr = krylov.data[:J] r_cu = krylov.data[:r] schur = krylov.data[:schur] + dx_cpu = krylov.data[:dx_cpu] t_gpu_update = @elapsed begin if !is_first @@ -83,8 +85,9 @@ function Jutul.linear_solve!(lsys::Jutul.LSystem, ) dx_gpu, stats = (krylov.storage.x, krylov.storage.stats) - copyto!(dx, dx_gpu) - @. dx = -dx + copyto!(dx_cpu, dx_gpu) + Jutul.update_dx_from_vector!(lsys, dx_cpu, dx = dx) + # @. dx = -dx res = stats.residuals solved = stats.solved @@ -115,11 +118,12 @@ function gpu_system_linear_operator(J, schur, Tv) n, m = size(J) C = schur[:C] D = schur[:D] - E_L = schur[:E_L] - E_U = schur[:E_U] - buf_1 = schur[:buf_1] - buf_2 = schur[:buf_2] - mul!(x, y, α, β) = schur_mul_gpu!(x, y, α, β, J, C, D, buf_1, buf_2, E_L, E_U) + E_factor = schur[:E_factor] + buf = schur[:buf] + buf_1 = schur[:buf_1_cpu] + buf_2 = schur[:buf_2_cpu] + + mul!(x, y, α, β) = schur_mul_gpu!(x, y, α, β, J, C, D, buf, buf_1, buf_2, E_factor) return Jutul.LinearOperators.LinearOperator(Tv, n, m, false, false, mul!) end From 391b16ff505e15809140a6d1b5bc2eef436a425e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olav=20M=C3=B8yner?= Date: Thu, 24 Oct 2024 21:22:32 +0200 Subject: [PATCH 09/32] Make GPU and CPU operations independent in schur multiply --- ext/JutulDarcyCUDAExt/cuda_utils.jl | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/ext/JutulDarcyCUDAExt/cuda_utils.jl b/ext/JutulDarcyCUDAExt/cuda_utils.jl index e3409ada..d0b1f75a 100644 --- a/ext/JutulDarcyCUDAExt/cuda_utils.jl +++ b/ext/JutulDarcyCUDAExt/cuda_utils.jl @@ -102,14 +102,16 @@ function copy_to_gpu(x::Vector{Tvc}, Tv, Ti) where {Tvc} end function JutulDarcy.schur_mul_gpu!(y, x, α, β, J, C, D, buf::CuVector, buf1_cpu::Vector, buf2_cpu::Vector, E_factor) - # Working on GPU - mul!(y, J, x, α, β) - mul!(buf, D, x) - # Move over to CPU for this part - copyto!(buf1_cpu, buf) - ldiv!(buf2_cpu, E_factor, buf1_cpu) - # Back to GPU for the big array... - copyto!(buf, buf2_cpu) + @sync begin + # Working on GPU + @async mul!(y, J, x, α, β) + mul!(buf, D, x) + # Move over to CPU for this part + copyto!(buf1_cpu, buf) + ldiv!(buf2_cpu, E_factor, buf1_cpu) + # Back to GPU for the big array... + copyto!(buf, buf2_cpu) + end mul!(y, C, buf, -α, true) return y end From a5d12e8bbfdbc4d865b9bc3c8204e43cd02b9b6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olav=20M=C3=B8yner?= Date: Thu, 24 Oct 2024 21:49:48 +0200 Subject: [PATCH 10/32] Add timing to CUDA calls --- ext/JutulDarcyCUDAExt/JutulDarcyCUDAExt.jl | 4 ++++ ext/JutulDarcyCUDAExt/cuda_utils.jl | 2 +- src/ext/ext_cuda.jl | 17 +++++++++-------- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/ext/JutulDarcyCUDAExt/JutulDarcyCUDAExt.jl b/ext/JutulDarcyCUDAExt/JutulDarcyCUDAExt.jl index 4086db57..c401cbf8 100644 --- a/ext/JutulDarcyCUDAExt/JutulDarcyCUDAExt.jl +++ b/ext/JutulDarcyCUDAExt/JutulDarcyCUDAExt.jl @@ -1,5 +1,9 @@ module JutulDarcyCUDAExt using Jutul, JutulDarcy, CUDA, LinearAlgebra, SparseArrays + import Jutul: @tic + + timeit_debug_enabled() = Jutul.timeit_debug_enabled() + include("ilu0.jl") include("krylov.jl") include("cuda_utils.jl") diff --git a/ext/JutulDarcyCUDAExt/cuda_utils.jl b/ext/JutulDarcyCUDAExt/cuda_utils.jl index d0b1f75a..1da70ab0 100644 --- a/ext/JutulDarcyCUDAExt/cuda_utils.jl +++ b/ext/JutulDarcyCUDAExt/cuda_utils.jl @@ -102,7 +102,7 @@ function copy_to_gpu(x::Vector{Tvc}, Tv, Ti) where {Tvc} end function JutulDarcy.schur_mul_gpu!(y, x, α, β, J, C, D, buf::CuVector, buf1_cpu::Vector, buf2_cpu::Vector, E_factor) - @sync begin + @tic "schur apply" @sync begin # Working on GPU @async mul!(y, J, x, α, β) mul!(buf, D, x) diff --git a/src/ext/ext_cuda.jl b/src/ext/ext_cuda.jl index af312023..57004655 100644 --- a/src/ext/ext_cuda.jl +++ b/src/ext/ext_cuda.jl @@ -31,7 +31,7 @@ function Jutul.linear_solve!(lsys::Jutul.LSystem, atol = convert(Tv, atol) rtol = convert(Tv, rtol) - t_prep0 = @elapsed Jutul.prepare_linear_solve!(lsys) + t_prep0 = @elapsed @tic "prepare" Jutul.prepare_linear_solve!(lsys) is_single_linear_system = lsys isa LinearizedSystem @@ -45,7 +45,7 @@ function Jutul.linear_solve!(lsys::Jutul.LSystem, n, m = sz @assert n == m is_first = !haskey(krylov.data, :J) - t_setup = @elapsed if is_first + t_setup = @elapsed @tic "initial_gpu" if is_first krylov.data[:J], krylov.data[:r] = build_gpu_block_system(Ti, Tv, sz, bz, J.At.colptr, J.At.rowval, csr_block_buffer, r) krylov.data[:schur] = build_gpu_schur_system(Ti, Tv, bz, lsys) krylov.data[:dx_cpu] = zeros(n*bz) @@ -55,7 +55,7 @@ function Jutul.linear_solve!(lsys::Jutul.LSystem, schur = krylov.data[:schur] dx_cpu = krylov.data[:dx_cpu] - t_gpu_update = @elapsed begin + t_gpu_update = @elapsed @tic "to_gpu" begin if !is_first update_gpu_block_residual!(r_cu, bz, r) update_gpu_block_system!(J_bsr, bz, csr_block_buffer) @@ -65,7 +65,7 @@ function Jutul.linear_solve!(lsys::Jutul.LSystem, t_prep = t_gpu_update + t_setup + t_prep0 max_it = krylov.config.max_iterations - Jutul.update_preconditioner!(krylov.preconditioner, J_bsr, r_cu, model.context, executor) + @tic "precond" Jutul.update_preconditioner!(krylov.preconditioner, J_bsr, r_cu, model.context, executor) prec_op = Jutul.preconditioner(krylov, lsys, model, storage, recorder, Tv) if cfg.precond_side == :right preconditioner_arg = (N = prec_op, ) @@ -74,7 +74,7 @@ function Jutul.linear_solve!(lsys::Jutul.LSystem, end operator = gpu_system_linear_operator(J_bsr, schur, Tv) solve_f, F = Jutul.krylov_jl_solve_function(krylov, operator, r_cu) - solve_f(F, operator, r_cu; + @tic "solve" solve_f(F, operator, r_cu; itmax = max_it, preconditioner_arg..., verbose = 0, @@ -85,9 +85,10 @@ function Jutul.linear_solve!(lsys::Jutul.LSystem, ) dx_gpu, stats = (krylov.storage.x, krylov.storage.stats) - copyto!(dx_cpu, dx_gpu) - Jutul.update_dx_from_vector!(lsys, dx_cpu, dx = dx) - # @. dx = -dx + @tic "update dx" begin + copyto!(dx_cpu, dx_gpu) + Jutul.update_dx_from_vector!(lsys, dx_cpu, dx = dx) + end res = stats.residuals solved = stats.solved From 281e6524b07f7e4fb7ec03b286fca820d2a273fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olav=20M=C3=B8yner?= Date: Fri, 25 Oct 2024 10:09:38 +0200 Subject: [PATCH 11/32] Update interface --- ext/JutulDarcyCUDAExt/ilu0.jl | 6 +++++- src/ext/ext_cuda.jl | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/ext/JutulDarcyCUDAExt/ilu0.jl b/ext/JutulDarcyCUDAExt/ilu0.jl index d7e1fd99..7577a1a3 100644 --- a/ext/JutulDarcyCUDAExt/ilu0.jl +++ b/ext/JutulDarcyCUDAExt/ilu0.jl @@ -162,5 +162,9 @@ function Jutul.apply!(y::CuVector{Tv}, f::ILUZeroPreconditioner, x::CuVector{Tv} P = f.factor[:ilu0_gpu] ilu0_invert!(y, P, Tv, 'L', f.factor) ilu0_invert!(y, P, Tv, 'U', f.factor) - CUDA.synchronize() + # CUDA.synchronize() +end + +function JutulDarcy.gpu_update_preconditioner!(prec::ILUZeroPreconditioner, sys, model, storage, recorder, executor, krylov, J_bsr, r_cu) + Jutul.update_preconditioner!(krylov.preconditioner, J_bsr, r_cu, model.context, executor) end diff --git a/src/ext/ext_cuda.jl b/src/ext/ext_cuda.jl index 57004655..e444d960 100644 --- a/src/ext/ext_cuda.jl +++ b/src/ext/ext_cuda.jl @@ -65,7 +65,7 @@ function Jutul.linear_solve!(lsys::Jutul.LSystem, t_prep = t_gpu_update + t_setup + t_prep0 max_it = krylov.config.max_iterations - @tic "precond" Jutul.update_preconditioner!(krylov.preconditioner, J_bsr, r_cu, model.context, executor) + @tic "precond" gpu_update_preconditioner!(krylov.preconditioner, lsys, model, storage, recorder, executor, krylov, J_bsr, r_cu) prec_op = Jutul.preconditioner(krylov, lsys, model, storage, recorder, Tv) if cfg.precond_side == :right preconditioner_arg = (N = prec_op, ) @@ -105,6 +105,10 @@ function Jutul.linear_solve!(lsys::Jutul.LSystem, return Jutul.linear_solve_return(solved, n, stats, precond = t_prec_op, precond_count = t_prec_count, prepare = t_prec + t_prep) end +function gpu_update_preconditioner! + +end + function Base.show(io::IO, krylov::CUDAReservoirKrylov) rtol = linear_solver_tolerance(krylov.config, :relative) atol = linear_solver_tolerance(krylov.config, :absolute) From e5e20eaec66ec6dc1625f7c88e6a6b0e8905d6d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olav=20M=C3=B8yner?= Date: Fri, 25 Oct 2024 10:37:40 +0200 Subject: [PATCH 12/32] Draft extension to upcoming AMGX release --- Project.toml | 4 ++++ ext/JutulDarcyAMGXExt/JutulDarcyAMGXExt.jl | 12 ++++++++++++ src/ext/ext.jl | 2 +- src/ext/ext_amgx.jl | 16 ++++++++++++++++ 4 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 ext/JutulDarcyAMGXExt/JutulDarcyAMGXExt.jl create mode 100644 src/ext/ext_amgx.jl diff --git a/Project.toml b/Project.toml index bbcf5bbc..a10fc989 100644 --- a/Project.toml +++ b/Project.toml @@ -34,6 +34,7 @@ TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" Tullio = "bc48ee85-29a4-5162-ae0b-a64e1601d4bc" [weakdeps] +AMGX = "c963dde9-0319-47f5-bf0c-b07d3c80ffa6" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a" HYPRE = "b5ffcf37-a2bd-41ab-a3da-4bd9bc8ad771" @@ -42,6 +43,7 @@ Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a" PartitionedArrays = "5a9dfac6-5c52-46f7-8278-5e2210713be9" [extensions] +JutulDarcyAMGXExt = ["AMGX", "CUDA"] JutulDarcyCUDAExt = "CUDA" JutulDarcyGLMakieExt = "GLMakie" JutulDarcyMakieExt = "Makie" @@ -50,6 +52,7 @@ JutulDarcyPartitionedArraysExt = ["PartitionedArrays", "MPI", "HYPRE"] [compat] AlgebraicMultigrid = "0.5.1, 0.6.0" Artifacts = "1" +AMGX = "0.2" CUDA = "5" DataStructures = "0.18.13" Dates = "1" @@ -81,6 +84,7 @@ Tullio = "0.3.4" julia = "1.7" [extras] +AMGX = "c963dde9-0319-47f5-bf0c-b07d3c80ffa6" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a" HYPRE = "b5ffcf37-a2bd-41ab-a3da-4bd9bc8ad771" diff --git a/ext/JutulDarcyAMGXExt/JutulDarcyAMGXExt.jl b/ext/JutulDarcyAMGXExt/JutulDarcyAMGXExt.jl new file mode 100644 index 00000000..b896355f --- /dev/null +++ b/ext/JutulDarcyAMGXExt/JutulDarcyAMGXExt.jl @@ -0,0 +1,12 @@ +module JutulDarcyAMGXExt + using Jutul, JutulDarcy, CUDA, LinearAlgebra, SparseArrays, AMGX + import Jutul: @tic + + timeit_debug_enabled() = Jutul.timeit_debug_enabled() + + function JutulDarcy.gpu_update_preconditioner!(prec::CPRPreconditioner{JutulDarcy.AMGXPreconditioner, <:Any}, sys, model, storage, recorder, executor, krylov, J_bsr, r_cu) + error() + Jutul.update_preconditioner!(krylov.preconditioner, J_bsr, r_cu, model.context, executor) + end + +end diff --git a/src/ext/ext.jl b/src/ext/ext.jl index 0b716955..aaa978d2 100644 --- a/src/ext/ext.jl +++ b/src/ext/ext.jl @@ -1,4 +1,4 @@ include("ext_makie.jl") include("ext_partitionedarrays.jl") include("ext_cuda.jl") - +include("ext_amgx.jl") diff --git a/src/ext/ext_amgx.jl b/src/ext/ext_amgx.jl new file mode 100644 index 00000000..fb6035ec --- /dev/null +++ b/src/ext/ext_amgx.jl @@ -0,0 +1,16 @@ +struct AMGXPreconditioner <: Jutul.JutulPreconditioner + settings::Dict{Symbol, Any} + data::Dict{Symbol, Any} + function AMGXPreconditioner(settings::Dict{Symbol, Any}) + data = Dict{Symbol, Any}() + new(settings, data) + end +end + +function AMGXPreconditioner(; kwarg...) + settings = Dict{Symbol, Any}() + for (k, v) in kwarg + settings[k] = v + end + return AMGXPreconditioner(settings) +end From 5b3251b1b307051d79a083580fc5c16e2ac40d57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olav=20M=C3=B8yner?= Date: Fri, 25 Oct 2024 12:59:43 +0200 Subject: [PATCH 13/32] Add some AMGX init code --- Project.toml | 2 +- ext/JutulDarcyAMGXExt/JutulDarcyAMGXExt.jl | 12 ++-- ext/JutulDarcyAMGXExt/cpr.jl | 71 ++++++++++++++++++++++ src/ext/ext_amgx.jl | 19 ++++++ 4 files changed, 98 insertions(+), 6 deletions(-) create mode 100644 ext/JutulDarcyAMGXExt/cpr.jl diff --git a/Project.toml b/Project.toml index a10fc989..20bf60e1 100644 --- a/Project.toml +++ b/Project.toml @@ -43,7 +43,7 @@ Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a" PartitionedArrays = "5a9dfac6-5c52-46f7-8278-5e2210713be9" [extensions] -JutulDarcyAMGXExt = ["AMGX", "CUDA"] +JutulDarcyAMGXExt = "AMGX" JutulDarcyCUDAExt = "CUDA" JutulDarcyGLMakieExt = "GLMakie" JutulDarcyMakieExt = "Makie" diff --git a/ext/JutulDarcyAMGXExt/JutulDarcyAMGXExt.jl b/ext/JutulDarcyAMGXExt/JutulDarcyAMGXExt.jl index b896355f..b4baa274 100644 --- a/ext/JutulDarcyAMGXExt/JutulDarcyAMGXExt.jl +++ b/ext/JutulDarcyAMGXExt/JutulDarcyAMGXExt.jl @@ -1,12 +1,14 @@ module JutulDarcyAMGXExt - using Jutul, JutulDarcy, CUDA, LinearAlgebra, SparseArrays, AMGX + using JutulDarcy, Jutul, AMGX, AMGX.CUDA, LinearAlgebra, SparseArrays + import JutulDarcy: AMGXPreconditioner import Jutul: @tic timeit_debug_enabled() = Jutul.timeit_debug_enabled() - function JutulDarcy.gpu_update_preconditioner!(prec::CPRPreconditioner{JutulDarcy.AMGXPreconditioner, <:Any}, sys, model, storage, recorder, executor, krylov, J_bsr, r_cu) - error() - Jutul.update_preconditioner!(krylov.preconditioner, J_bsr, r_cu, model.context, executor) - end + include("cpr.jl") + function __init__() + # TODO: Figure out a way to not always do this? + AMGX.initialize() + end end diff --git a/ext/JutulDarcyAMGXExt/cpr.jl b/ext/JutulDarcyAMGXExt/cpr.jl new file mode 100644 index 00000000..c0d97a50 --- /dev/null +++ b/ext/JutulDarcyAMGXExt/cpr.jl @@ -0,0 +1,71 @@ +function JutulDarcy.update_amgx_pressure_system!(amgx::AMGXPreconditioner, A_p::Jutul.StaticCSR.StaticSparsityMatrixCSR) + data = amgx.data + is_first = !haskey(data, :A_p) + if is_first + initialize_amgx_structure!(amgx, A_p) + end + error() +end + +mutable struct AMGXStorage{C, R, V, M, S} + config::C + resources::R + r::V + x::V + matrix::M + solver::S + function AMGXStorage(config, amgx_mode = AMGX.dDDI) + resources = AMGX.Resources(config) + r = AMGX.AMGXVector(resources, amgx_mode) + x = AMGX.AMGXVector(resources, amgx_mode) + matrix = AMGX.AMGXMatrix(resources, amgx_mode) + solver = AMGX.Solver(resources, amgx_mode, config) + function finalize_storage!(amgx_s::AMGXStorage) + close(amgx_s.solver) + close(amgx_s.matrix) + close(amgx_s.r) + close(amgx_s.x) + close(amgx_s.resources) + close(amgx_s.config) + end + s = new{ + typeof(config), + typeof(resources), + typeof(r), + typeof(matrix), + typeof(solver) + }(config, resources, r, x, matrix, solver) + return finalizer(finalize_storage!, s) + end +end + +function initialize_amgx_structure!(amgx::AMGXPreconditioner, A::Jutul.StaticCSR.StaticSparsityMatrixCSR{Tv, <:Any}) where Tv + if Tv == Float64 + amgx_mode = AMGX.dDDI + else + amgx_mode = AMGX.dFFI + end + # TODO: Tv isn't really coming from the matrix, it should be set beforehand in the krylov solver. + n, m = size(A) + @assert n == m + config = AMGX.Config(amgx.settings) + s = AMGXStorage(config, amgx_mode) + # RHS and solution vectors to right size just in case + AMGX.set_zero!(s.x, n) + AMGX.set_zero!(s.r, n) + + row_ptr = Cint.(A.At.colptr .- 1) + colval = Cint.(A.At.rowval .- 1) + nzval = A.At.nzval + nzval = Tv.(nzval) + + AMGX.upload!(s.matrix, + row_ptr, + colval, + nzval + ) + + amgx.data[:storage] = s + return amgx +end + diff --git a/src/ext/ext_amgx.jl b/src/ext/ext_amgx.jl index fb6035ec..d1fae233 100644 --- a/src/ext/ext_amgx.jl +++ b/src/ext/ext_amgx.jl @@ -14,3 +14,22 @@ function AMGXPreconditioner(; kwarg...) end return AMGXPreconditioner(settings) end + +function update_preconditioner!(amg::AMGXPreconditioner, A::Jutul.StaticSparsityMatrixCSR, b, context::ParallelCSRContext, executor) + # Intentionally do nothing - a bit hackish +end + +function JutulDarcy.gpu_update_preconditioner!(cpr::CPRPreconditioner{JutulDarcy.AMGXPreconditioner, <:Any}, lsys, model, storage, recorder, executor, krylov, J_bsr, r_cu) + Jutul.update_preconditioner!(cpr, lsys, model, storage, recorder, executor) + + JutulDarcy.update_amgx_pressure_system!(cpr.pressure_precond, cpr.storage.A_p) + # Next transfer the coarse system to GPU + # Update if needed + # Then do the whole apply here + error() + # Jutul.update_preconditioner!(krylov.preconditioner, J_bsr, r_cu, model.context, executor) +end + +function update_amgx_pressure_system! + +end \ No newline at end of file From d914ceb609807fc8c3d143227ee961257e5423e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olav=20M=C3=B8yner?= Date: Fri, 25 Oct 2024 13:22:34 +0200 Subject: [PATCH 14/32] More CPR testing --- ext/JutulDarcyAMGXExt/cpr.jl | 63 +++++++++++++++++++----------------- src/cpr.jl | 6 ++-- src/ext/ext_amgx.jl | 28 ++++++++++------ 3 files changed, 57 insertions(+), 40 deletions(-) diff --git a/ext/JutulDarcyAMGXExt/cpr.jl b/ext/JutulDarcyAMGXExt/cpr.jl index c0d97a50..078a95ce 100644 --- a/ext/JutulDarcyAMGXExt/cpr.jl +++ b/ext/JutulDarcyAMGXExt/cpr.jl @@ -1,10 +1,8 @@ function JutulDarcy.update_amgx_pressure_system!(amgx::AMGXPreconditioner, A_p::Jutul.StaticCSR.StaticSparsityMatrixCSR) data = amgx.data - is_first = !haskey(data, :A_p) - if is_first - initialize_amgx_structure!(amgx, A_p) - end - error() + initialize_amgx_structure!(amgx, A_p) + update_pressure_system!(amgx, A_p) + return amgx end mutable struct AMGXStorage{C, R, V, M, S} @@ -39,33 +37,40 @@ mutable struct AMGXStorage{C, R, V, M, S} end end -function initialize_amgx_structure!(amgx::AMGXPreconditioner, A::Jutul.StaticCSR.StaticSparsityMatrixCSR{Tv, <:Any}) where Tv - if Tv == Float64 - amgx_mode = AMGX.dDDI - else - amgx_mode = AMGX.dFFI - end - # TODO: Tv isn't really coming from the matrix, it should be set beforehand in the krylov solver. - n, m = size(A) - @assert n == m - config = AMGX.Config(amgx.settings) - s = AMGXStorage(config, amgx_mode) - # RHS and solution vectors to right size just in case - AMGX.set_zero!(s.x, n) - AMGX.set_zero!(s.r, n) +function update_pressure_system!(amgx, A_cpu) + A_gpu = amgx.data[:storage].matrix + AMGX.replace_coefficients!(A_gpu, A_cpu.At.nzval) + return amgx +end - row_ptr = Cint.(A.At.colptr .- 1) - colval = Cint.(A.At.rowval .- 1) - nzval = A.At.nzval - nzval = Tv.(nzval) +function initialize_amgx_structure!(amgx::AMGXPreconditioner, A::Jutul.StaticCSR.StaticSparsityMatrixCSR{Tv, <:Any}) where Tv + if !haskey(amgx.data, :storage) + if Tv == Float64 + amgx_mode = AMGX.dDDI + else + amgx_mode = AMGX.dFFI + end + # TODO: Tv isn't really coming from the matrix, it should be set beforehand in the krylov solver. + n, m = size(A) + @assert n == m + config = AMGX.Config(amgx.settings) + s = AMGXStorage(config, amgx_mode) + # RHS and solution vectors to right size just in case + AMGX.set_zero!(s.x, n) + AMGX.set_zero!(s.r, n) - AMGX.upload!(s.matrix, - row_ptr, - colval, - nzval - ) + row_ptr = Cint.(A.At.colptr .- 1) + colval = Cint.(A.At.rowval .- 1) + nzval = A.At.nzval + nzval = Tv.(nzval) - amgx.data[:storage] = s + AMGX.upload!(s.matrix, + row_ptr, + colval, + nzval + ) + amgx.data[:storage] = s + end return amgx end diff --git a/src/cpr.jl b/src/cpr.jl index a6b91fea..23233348 100644 --- a/src/cpr.jl +++ b/src/cpr.jl @@ -168,11 +168,13 @@ function default_psolve(; max_levels = 10, max_coarse = 10, amgcl_type = :amg, t end end -function update_preconditioner!(cpr::CPRPreconditioner, lsys, model, storage, recorder, executor) +function update_preconditioner!(cpr::CPRPreconditioner, lsys, model, storage, recorder, executor; update_system_precond = true) rmodel = reservoir_model(model, type = :flow) ctx = rmodel.context update_p = update_cpr_internals!(cpr, lsys, model, storage, recorder, executor) - @tic "s-precond" update_preconditioner!(cpr.system_precond, lsys, model, storage, recorder, executor) + if update_system_precond + @tic "s-precond" update_preconditioner!(cpr.system_precond, lsys, model, storage, recorder, executor) + end if update_p @tic "p-precond" update_preconditioner!(cpr.pressure_precond, cpr.storage.A_p, cpr.storage.r_p, ctx, executor) elseif should_update_cpr(cpr, recorder, :partial) diff --git a/src/ext/ext_amgx.jl b/src/ext/ext_amgx.jl index d1fae233..45a48100 100644 --- a/src/ext/ext_amgx.jl +++ b/src/ext/ext_amgx.jl @@ -15,21 +15,31 @@ function AMGXPreconditioner(; kwarg...) return AMGXPreconditioner(settings) end +const AMGXCPR = CPRPreconditioner{JutulDarcy.AMGXPreconditioner, <:Any} + function update_preconditioner!(amg::AMGXPreconditioner, A::Jutul.StaticSparsityMatrixCSR, b, context::ParallelCSRContext, executor) # Intentionally do nothing - a bit hackish end -function JutulDarcy.gpu_update_preconditioner!(cpr::CPRPreconditioner{JutulDarcy.AMGXPreconditioner, <:Any}, lsys, model, storage, recorder, executor, krylov, J_bsr, r_cu) - Jutul.update_preconditioner!(cpr, lsys, model, storage, recorder, executor) - +function JutulDarcy.gpu_update_preconditioner!(cpr::AMGXCPR, lsys, model, storage, recorder, executor, krylov, J_bsr, r_cu) + Jutul.update_preconditioner!(cpr, lsys, model, storage, recorder, executor, update_system_precond = false) + # Transfer pressure system to GPU JutulDarcy.update_amgx_pressure_system!(cpr.pressure_precond, cpr.storage.A_p) - # Next transfer the coarse system to GPU - # Update if needed - # Then do the whole apply here - error() - # Jutul.update_preconditioner!(krylov.preconditioner, J_bsr, r_cu, model.context, executor) end function update_amgx_pressure_system! -end \ No newline at end of file +end + +function Jutul.apply!(x, cpr::AMGXCPR, r) + # Apply smoother + cpr_s = cpr.storage + bz = cpr_s.block_size + + Jutul.apply!(x, cpr.system_precond, r) + # Correct the residual + # Construct pressure residual + # Apply pressure preconditioner + # Update increments for pressure + error() +end From 253ff5c04310d9decac109dc0c14d296174d071d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olav=20M=C3=B8yner?= Date: Fri, 25 Oct 2024 19:46:40 +0200 Subject: [PATCH 15/32] More work on GPU CPR --- ext/JutulDarcyAMGXExt/cpr.jl | 31 +++++++++++++++++++--- ext/JutulDarcyCUDAExt/JutulDarcyCUDAExt.jl | 1 + ext/JutulDarcyCUDAExt/cpr.jl | 21 +++++++++++++++ ext/JutulDarcyCUDAExt/ilu0.jl | 4 +-- src/ext/ext_amgx.jl | 23 +++++++++++++--- src/ext/ext_cuda.jl | 9 +++++-- 6 files changed, 77 insertions(+), 12 deletions(-) create mode 100644 ext/JutulDarcyCUDAExt/cpr.jl diff --git a/ext/JutulDarcyAMGXExt/cpr.jl b/ext/JutulDarcyAMGXExt/cpr.jl index 078a95ce..03f16bf4 100644 --- a/ext/JutulDarcyAMGXExt/cpr.jl +++ b/ext/JutulDarcyAMGXExt/cpr.jl @@ -1,6 +1,6 @@ -function JutulDarcy.update_amgx_pressure_system!(amgx::AMGXPreconditioner, A_p::Jutul.StaticCSR.StaticSparsityMatrixCSR) +function JutulDarcy.update_amgx_pressure_system!(amgx::AMGXPreconditioner, A_p::Jutul.StaticCSR.StaticSparsityMatrixCSR, Tv) data = amgx.data - initialize_amgx_structure!(amgx, A_p) + initialize_amgx_structure!(amgx, A_p, Tv) update_pressure_system!(amgx, A_p) return amgx end @@ -43,14 +43,13 @@ function update_pressure_system!(amgx, A_cpu) return amgx end -function initialize_amgx_structure!(amgx::AMGXPreconditioner, A::Jutul.StaticCSR.StaticSparsityMatrixCSR{Tv, <:Any}) where Tv +function initialize_amgx_structure!(amgx::AMGXPreconditioner, A::Jutul.StaticCSR.StaticSparsityMatrixCSR, Tv) if !haskey(amgx.data, :storage) if Tv == Float64 amgx_mode = AMGX.dDDI else amgx_mode = AMGX.dFFI end - # TODO: Tv isn't really coming from the matrix, it should be set beforehand in the krylov solver. n, m = size(A) @assert n == m config = AMGX.Config(amgx.settings) @@ -70,7 +69,31 @@ function initialize_amgx_structure!(amgx::AMGXPreconditioner, A::Jutul.StaticCSR nzval ) amgx.data[:storage] = s + amgx.data[:buffer_p] = AMGX.CUDA.CuVector{Tv}(undef, n) end return amgx end +function JutulDarcy.gpu_cpr_setup_buffers!(cpr, J_bsr, r_cu, op) + # TODO: Rename this. + data = cpr.pressure_precond.data + if !haskey(data, :w_p) + Tv = eltype(r_cu) + n = length(r_cu) + cpr_s = cpr.storage + bz = cpr_s.block_size + + # cpr.pressure_precond.data[:buffer_full] = AMGX.CUDA.CuVector{Tv}(undef, n) + bz_w, n_w = size(cpr_s.w_p) + @assert n == bz*n_w + data[:w_p] = AMGX.CUDA.CuMatrix{Tv}(undef, bz_w, n_w) + data[:main_system] = J_bsr + data[:operator] = op + end + # Put updated weights on GPU + w_p_cpu = cpr_s.w_p + w_p_gpu = data[:w_p] + @assert size(w_p_cpu) == size(w_p_gpu) + copyto!(w_p_gpu, w_p_cpu) + return cpr +end diff --git a/ext/JutulDarcyCUDAExt/JutulDarcyCUDAExt.jl b/ext/JutulDarcyCUDAExt/JutulDarcyCUDAExt.jl index c401cbf8..5c467057 100644 --- a/ext/JutulDarcyCUDAExt/JutulDarcyCUDAExt.jl +++ b/ext/JutulDarcyCUDAExt/JutulDarcyCUDAExt.jl @@ -7,4 +7,5 @@ module JutulDarcyCUDAExt include("ilu0.jl") include("krylov.jl") include("cuda_utils.jl") + include("cpr.jl") end diff --git a/ext/JutulDarcyCUDAExt/cpr.jl b/ext/JutulDarcyCUDAExt/cpr.jl new file mode 100644 index 00000000..73e6a871 --- /dev/null +++ b/ext/JutulDarcyCUDAExt/cpr.jl @@ -0,0 +1,21 @@ +function JutulDarcy.gpu_reduce_residual!(r_p, w_p, r) + # TODO: Actually use CUDA threads here. + @assert eltype(r_p) == eltype(w_p) == eltype(r) + CUDA.@cuda reduce_residual_kernel!(r_p, w_p, r) + return r_p +end + +function reduce_residual_kernel!(r_p, w_p, r_full) + index = threadIdx().x # this example only requires linear indexing, so just use `x` + stride = blockDim().x + T = eltype(r_p) + ncomp = size(w_p, 1) + for cell in index:stride:length(r_p) + v = zero(T) + for comp in axes(w_p, 1) + v += w_p[comp, cell] * r_full[(cell-1)*ncomp + comp] + end + r_p[cell] = v + end + return nothing +end diff --git a/ext/JutulDarcyCUDAExt/ilu0.jl b/ext/JutulDarcyCUDAExt/ilu0.jl index 7577a1a3..8553ccd5 100644 --- a/ext/JutulDarcyCUDAExt/ilu0.jl +++ b/ext/JutulDarcyCUDAExt/ilu0.jl @@ -165,6 +165,6 @@ function Jutul.apply!(y::CuVector{Tv}, f::ILUZeroPreconditioner, x::CuVector{Tv} # CUDA.synchronize() end -function JutulDarcy.gpu_update_preconditioner!(prec::ILUZeroPreconditioner, sys, model, storage, recorder, executor, krylov, J_bsr, r_cu) - Jutul.update_preconditioner!(krylov.preconditioner, J_bsr, r_cu, model.context, executor) +function JutulDarcy.gpu_update_preconditioner!(prec::ILUZeroPreconditioner, sys, model, storage, recorder, executor, krylov, J_bsr, r_cu, op) + Jutul.update_preconditioner!(prec, J_bsr, r_cu, model.context, executor) end diff --git a/src/ext/ext_amgx.jl b/src/ext/ext_amgx.jl index 45a48100..f8d137de 100644 --- a/src/ext/ext_amgx.jl +++ b/src/ext/ext_amgx.jl @@ -21,24 +21,39 @@ function update_preconditioner!(amg::AMGXPreconditioner, A::Jutul.StaticSparsity # Intentionally do nothing - a bit hackish end -function JutulDarcy.gpu_update_preconditioner!(cpr::AMGXCPR, lsys, model, storage, recorder, executor, krylov, J_bsr, r_cu) +function JutulDarcy.gpu_update_preconditioner!(cpr::AMGXCPR, lsys, model, storage, recorder, executor, krylov, J_bsr, r_cu, op) Jutul.update_preconditioner!(cpr, lsys, model, storage, recorder, executor, update_system_precond = false) # Transfer pressure system to GPU - JutulDarcy.update_amgx_pressure_system!(cpr.pressure_precond, cpr.storage.A_p) + JutulDarcy.gpu_update_preconditioner!(cpr.system_precond, lsys, model, storage, recorder, executor, krylov, J_bsr, r_cu, op) + JutulDarcy.update_amgx_pressure_system!(cpr.pressure_precond, cpr.storage.A_p, eltype(J_bsr)) + # How to get the linear operator in here? + gpu_cpr_setup_buffers!(cpr, J_bsr, r_cu, op) end function update_amgx_pressure_system! end +function gpu_cpr_setup_buffers! + +end + function Jutul.apply!(x, cpr::AMGXCPR, r) # Apply smoother - cpr_s = cpr.storage - bz = cpr_s.block_size Jutul.apply!(x, cpr.system_precond, r) # Correct the residual + A_ps = cpr.pressure_precond.data[:operator] + # buf = cpr.pressure_precond.data[:buffer_full] + # buf_p = cpr.pressure_precond.data[:buffer_p] + JutulDarcy.correct_residual!(r, A_ps, x) # Construct pressure residual + r_p = cpr.pressure_precond.data[:buffer_p] + w_p = cpr.pressure_precond.data[:w_p] + ncomp, ncell = size(w_p) + gpu_reduce_residual!(r_p, w_p, r) + # @info "??" typeof(r_p) typeof(w_p) typeof(r) + # @tullio r_p[cell] = w_p[comp, cell] * r_tmp[comp, cell] # Apply pressure preconditioner # Update increments for pressure error() diff --git a/src/ext/ext_cuda.jl b/src/ext/ext_cuda.jl index e444d960..33e123ee 100644 --- a/src/ext/ext_cuda.jl +++ b/src/ext/ext_cuda.jl @@ -65,14 +65,15 @@ function Jutul.linear_solve!(lsys::Jutul.LSystem, t_prep = t_gpu_update + t_setup + t_prep0 max_it = krylov.config.max_iterations - @tic "precond" gpu_update_preconditioner!(krylov.preconditioner, lsys, model, storage, recorder, executor, krylov, J_bsr, r_cu) + operator = gpu_system_linear_operator(J_bsr, schur, Tv) + + @tic "precond" gpu_update_preconditioner!(krylov.preconditioner, lsys, model, storage, recorder, executor, krylov, J_bsr, r_cu, operator) prec_op = Jutul.preconditioner(krylov, lsys, model, storage, recorder, Tv) if cfg.precond_side == :right preconditioner_arg = (N = prec_op, ) else preconditioner_arg = (M = prec_op, ) end - operator = gpu_system_linear_operator(J_bsr, schur, Tv) solve_f, F = Jutul.krylov_jl_solve_function(krylov, operator, r_cu) @tic "solve" solve_f(F, operator, r_cu; itmax = max_it, @@ -155,3 +156,7 @@ end function update_gpu_schur_system! end + +function gpu_reduce_residual! + +end From 7ee843ded74ecebf0adecf6d76a5671c02552668 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olav=20M=C3=B8yner?= Date: Fri, 25 Oct 2024 20:05:03 +0200 Subject: [PATCH 16/32] CPR apply seems to run once... --- ext/JutulDarcyAMGXExt/cpr.jl | 15 ++++++++++++++- ext/JutulDarcyCUDAExt/cpr.jl | 19 ++++++++++++++++++- src/ext/ext_amgx.jl | 12 +++++++----- src/ext/ext_cuda.jl | 4 ++++ 4 files changed, 43 insertions(+), 7 deletions(-) diff --git a/ext/JutulDarcyAMGXExt/cpr.jl b/ext/JutulDarcyAMGXExt/cpr.jl index 03f16bf4..5a33c96a 100644 --- a/ext/JutulDarcyAMGXExt/cpr.jl +++ b/ext/JutulDarcyAMGXExt/cpr.jl @@ -2,6 +2,8 @@ function JutulDarcy.update_amgx_pressure_system!(amgx::AMGXPreconditioner, A_p:: data = amgx.data initialize_amgx_structure!(amgx, A_p, Tv) update_pressure_system!(amgx, A_p) + s = amgx.data[:storage] + AMGX.setup!(s.solver, s.matrix) return amgx end @@ -38,11 +40,22 @@ mutable struct AMGXStorage{C, R, V, M, S} end function update_pressure_system!(amgx, A_cpu) - A_gpu = amgx.data[:storage].matrix + s = amgx.data[:storage] + A_gpu = s.matrix AMGX.replace_coefficients!(A_gpu, A_cpu.At.nzval) return amgx end +function JutulDarcy.gpu_amgx_solve!(amgx::AMGXPreconditioner, r_p) + s = amgx.data[:storage] + n = length(r_p) + AMGX.upload!(s.r, r_p) + AMGX.set_zero!(s.x, n) + AMGX.solve!(s.x, s.solver, s.r) + AMGX.download!(r_p, s.x) + return r_p +end + function initialize_amgx_structure!(amgx::AMGXPreconditioner, A::Jutul.StaticCSR.StaticSparsityMatrixCSR, Tv) if !haskey(amgx.data, :storage) if Tv == Float64 diff --git a/ext/JutulDarcyCUDAExt/cpr.jl b/ext/JutulDarcyCUDAExt/cpr.jl index 73e6a871..254e30e0 100644 --- a/ext/JutulDarcyCUDAExt/cpr.jl +++ b/ext/JutulDarcyCUDAExt/cpr.jl @@ -6,7 +6,7 @@ function JutulDarcy.gpu_reduce_residual!(r_p, w_p, r) end function reduce_residual_kernel!(r_p, w_p, r_full) - index = threadIdx().x # this example only requires linear indexing, so just use `x` + index = threadIdx().x stride = blockDim().x T = eltype(r_p) ncomp = size(w_p, 1) @@ -19,3 +19,20 @@ function reduce_residual_kernel!(r_p, w_p, r_full) end return nothing end + +function JutulDarcy.gpu_increment_pressure!(x, dp) + n = length(x) + bz = div(n, length(dp)) + @assert bz > 0 + CUDA.@cuda reduce_residual_kernel!(x, dp, bz) + return x +end + +function gpu_increment_pressure_kernel!(x, dp, bz) + index = threadIdx().x + stride = blockDim().x + for cell in index:stride:length(dp) + x[(cell-1)*bz + 1] = dp[cell] + end + return nothing +end diff --git a/src/ext/ext_amgx.jl b/src/ext/ext_amgx.jl index f8d137de..e5588df7 100644 --- a/src/ext/ext_amgx.jl +++ b/src/ext/ext_amgx.jl @@ -38,23 +38,25 @@ function gpu_cpr_setup_buffers! end +function gpu_amgx_solve! + +end + function Jutul.apply!(x, cpr::AMGXCPR, r) # Apply smoother - Jutul.apply!(x, cpr.system_precond, r) # Correct the residual A_ps = cpr.pressure_precond.data[:operator] # buf = cpr.pressure_precond.data[:buffer_full] - # buf_p = cpr.pressure_precond.data[:buffer_p] JutulDarcy.correct_residual!(r, A_ps, x) # Construct pressure residual r_p = cpr.pressure_precond.data[:buffer_p] w_p = cpr.pressure_precond.data[:w_p] ncomp, ncell = size(w_p) gpu_reduce_residual!(r_p, w_p, r) - # @info "??" typeof(r_p) typeof(w_p) typeof(r) - # @tullio r_p[cell] = w_p[comp, cell] * r_tmp[comp, cell] # Apply pressure preconditioner + amgx = cpr.pressure_precond + dp = gpu_amgx_solve!(amgx, r_p) # Update increments for pressure - error() + gpu_increment_pressure!(x, dp) end diff --git a/src/ext/ext_cuda.jl b/src/ext/ext_cuda.jl index 33e123ee..3c8289f1 100644 --- a/src/ext/ext_cuda.jl +++ b/src/ext/ext_cuda.jl @@ -160,3 +160,7 @@ end function gpu_reduce_residual! end + +function gpu_increment_pressure! + +end From 03a56551cc712c429e59240a75c0f75bca62d85a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olav=20M=C3=B8yner?= Date: Fri, 25 Oct 2024 20:48:00 +0200 Subject: [PATCH 17/32] Fixes to kernel launches --- ext/JutulDarcyAMGXExt/cpr.jl | 2 +- ext/JutulDarcyCUDAExt/cpr.jl | 14 ++++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/ext/JutulDarcyAMGXExt/cpr.jl b/ext/JutulDarcyAMGXExt/cpr.jl index 5a33c96a..4dab6e10 100644 --- a/ext/JutulDarcyAMGXExt/cpr.jl +++ b/ext/JutulDarcyAMGXExt/cpr.jl @@ -90,10 +90,10 @@ end function JutulDarcy.gpu_cpr_setup_buffers!(cpr, J_bsr, r_cu, op) # TODO: Rename this. data = cpr.pressure_precond.data + cpr_s = cpr.storage if !haskey(data, :w_p) Tv = eltype(r_cu) n = length(r_cu) - cpr_s = cpr.storage bz = cpr_s.block_size # cpr.pressure_precond.data[:buffer_full] = AMGX.CUDA.CuVector{Tv}(undef, n) diff --git a/ext/JutulDarcyCUDAExt/cpr.jl b/ext/JutulDarcyCUDAExt/cpr.jl index 254e30e0..065a46e2 100644 --- a/ext/JutulDarcyCUDAExt/cpr.jl +++ b/ext/JutulDarcyCUDAExt/cpr.jl @@ -1,19 +1,24 @@ function JutulDarcy.gpu_reduce_residual!(r_p, w_p, r) # TODO: Actually use CUDA threads here. @assert eltype(r_p) == eltype(w_p) == eltype(r) + @assert length(w_p) == length(r) + @assert size(w_p, 2) == length(r_p) CUDA.@cuda reduce_residual_kernel!(r_p, w_p, r) return r_p end -function reduce_residual_kernel!(r_p, w_p, r_full) +function reduce_residual_kernel!(r_p, w_p, r_ps) index = threadIdx().x stride = blockDim().x T = eltype(r_p) ncomp = size(w_p, 1) for cell in index:stride:length(r_p) v = zero(T) - for comp in axes(w_p, 1) - v += w_p[comp, cell] * r_full[(cell-1)*ncomp + comp] + for comp in 1:ncomp + ix = (cell-1)*ncomp + comp + wi = w_p[comp, cell] + ri = r_ps[ix] + v += wi*ri end r_p[cell] = v end @@ -24,7 +29,8 @@ function JutulDarcy.gpu_increment_pressure!(x, dp) n = length(x) bz = div(n, length(dp)) @assert bz > 0 - CUDA.@cuda reduce_residual_kernel!(x, dp, bz) + CUDA.@cuda gpu_increment_pressure_kernel!(x, dp, bz) + CUDA.synchronize() return x end From eba4fb63bc417599fac87be4f67a0c40d12343cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olav=20M=C3=B8yner?= Date: Fri, 25 Oct 2024 21:01:42 +0200 Subject: [PATCH 18/32] Sort-of-working GPU CPR with wells --- ext/JutulDarcyCUDAExt/cpr.jl | 2 +- src/ext/ext_amgx.jl | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/ext/JutulDarcyCUDAExt/cpr.jl b/ext/JutulDarcyCUDAExt/cpr.jl index 065a46e2..b7dfbe18 100644 --- a/ext/JutulDarcyCUDAExt/cpr.jl +++ b/ext/JutulDarcyCUDAExt/cpr.jl @@ -38,7 +38,7 @@ function gpu_increment_pressure_kernel!(x, dp, bz) index = threadIdx().x stride = blockDim().x for cell in index:stride:length(dp) - x[(cell-1)*bz + 1] = dp[cell] + x[(cell-1)*bz + 1] += dp[cell] end return nothing end diff --git a/src/ext/ext_amgx.jl b/src/ext/ext_amgx.jl index e5588df7..acda3bbc 100644 --- a/src/ext/ext_amgx.jl +++ b/src/ext/ext_amgx.jl @@ -47,7 +47,15 @@ function Jutul.apply!(x, cpr::AMGXCPR, r) Jutul.apply!(x, cpr.system_precond, r) # Correct the residual A_ps = cpr.pressure_precond.data[:operator] - # buf = cpr.pressure_precond.data[:buffer_full] + if false + if !haskey(cpr.pressure_precond.data, :buffer_full) + cpr.pressure_precond.data[:buffer_full] = copy(r) + end + buf = cpr.pressure_precond.data[:buffer_full] + copyto!(buf, r) + r = buf + end + # r = copy(r) JutulDarcy.correct_residual!(r, A_ps, x) # Construct pressure residual r_p = cpr.pressure_precond.data[:buffer_p] From d0cffe61fb07a96f8e8c415570479ea5c5098f5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olav=20M=C3=B8yner?= Date: Fri, 25 Oct 2024 21:06:47 +0200 Subject: [PATCH 19/32] Update format for AMGX config --- src/ext/ext_amgx.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ext/ext_amgx.jl b/src/ext/ext_amgx.jl index acda3bbc..ea7ba92d 100644 --- a/src/ext/ext_amgx.jl +++ b/src/ext/ext_amgx.jl @@ -1,16 +1,16 @@ struct AMGXPreconditioner <: Jutul.JutulPreconditioner - settings::Dict{Symbol, Any} + settings::Dict{String, Any} data::Dict{Symbol, Any} - function AMGXPreconditioner(settings::Dict{Symbol, Any}) + function AMGXPreconditioner(settings::Dict{String, Any}) data = Dict{Symbol, Any}() new(settings, data) end end function AMGXPreconditioner(; kwarg...) - settings = Dict{Symbol, Any}() + settings = Dict{String, Any}() for (k, v) in kwarg - settings[k] = v + settings["$k"] = v end return AMGXPreconditioner(settings) end From 38332514269d7a677086c1f6f575ce0ff97d9cd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olav=20M=C3=B8yner?= Date: Sun, 27 Oct 2024 18:44:55 +0100 Subject: [PATCH 20/32] Update copy of buffer in CUDA --- ext/JutulDarcyAMGXExt/cpr.jl | 3 +-- src/ext/ext_amgx.jl | 12 ++---------- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/ext/JutulDarcyAMGXExt/cpr.jl b/ext/JutulDarcyAMGXExt/cpr.jl index 4dab6e10..2fc46496 100644 --- a/ext/JutulDarcyAMGXExt/cpr.jl +++ b/ext/JutulDarcyAMGXExt/cpr.jl @@ -95,8 +95,7 @@ function JutulDarcy.gpu_cpr_setup_buffers!(cpr, J_bsr, r_cu, op) Tv = eltype(r_cu) n = length(r_cu) bz = cpr_s.block_size - - # cpr.pressure_precond.data[:buffer_full] = AMGX.CUDA.CuVector{Tv}(undef, n) + cpr.pressure_precond.data[:buffer_full] = similar(r_cu) bz_w, n_w = size(cpr_s.w_p) @assert n == bz*n_w data[:w_p] = AMGX.CUDA.CuMatrix{Tv}(undef, bz_w, n_w) diff --git a/src/ext/ext_amgx.jl b/src/ext/ext_amgx.jl index ea7ba92d..93d53f9e 100644 --- a/src/ext/ext_amgx.jl +++ b/src/ext/ext_amgx.jl @@ -47,16 +47,8 @@ function Jutul.apply!(x, cpr::AMGXCPR, r) Jutul.apply!(x, cpr.system_precond, r) # Correct the residual A_ps = cpr.pressure_precond.data[:operator] - if false - if !haskey(cpr.pressure_precond.data, :buffer_full) - cpr.pressure_precond.data[:buffer_full] = copy(r) - end - buf = cpr.pressure_precond.data[:buffer_full] - copyto!(buf, r) - r = buf - end - # r = copy(r) - JutulDarcy.correct_residual!(r, A_ps, x) + r_corrected = cpr.pressure_precond.data[:buffer_full] + JutulDarcy.correct_residual!(r_corrected, A_ps, x) # Construct pressure residual r_p = cpr.pressure_precond.data[:buffer_p] w_p = cpr.pressure_precond.data[:w_p] From b5ab2cc5118517c665b553481c5743b73967051f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olav=20M=C3=B8yner?= Date: Sun, 27 Oct 2024 19:50:47 +0100 Subject: [PATCH 21/32] Updates to CPR code --- ext/JutulDarcyAMGXExt/cpr.jl | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/ext/JutulDarcyAMGXExt/cpr.jl b/ext/JutulDarcyAMGXExt/cpr.jl index 2fc46496..40ba2230 100644 --- a/ext/JutulDarcyAMGXExt/cpr.jl +++ b/ext/JutulDarcyAMGXExt/cpr.jl @@ -1,9 +1,6 @@ function JutulDarcy.update_amgx_pressure_system!(amgx::AMGXPreconditioner, A_p::Jutul.StaticCSR.StaticSparsityMatrixCSR, Tv) data = amgx.data - initialize_amgx_structure!(amgx, A_p, Tv) - update_pressure_system!(amgx, A_p) - s = amgx.data[:storage] - AMGX.setup!(s.solver, s.matrix) + update_pressure_system!(amgx, A_p, Tv) return amgx end @@ -21,6 +18,7 @@ mutable struct AMGXStorage{C, R, V, M, S} matrix = AMGX.AMGXMatrix(resources, amgx_mode) solver = AMGX.Solver(resources, amgx_mode, config) function finalize_storage!(amgx_s::AMGXStorage) + @async println("Finalizing AMGXStorage") close(amgx_s.solver) close(amgx_s.matrix) close(amgx_s.r) @@ -39,13 +37,6 @@ mutable struct AMGXStorage{C, R, V, M, S} end end -function update_pressure_system!(amgx, A_cpu) - s = amgx.data[:storage] - A_gpu = s.matrix - AMGX.replace_coefficients!(A_gpu, A_cpu.At.nzval) - return amgx -end - function JutulDarcy.gpu_amgx_solve!(amgx::AMGXPreconditioner, r_p) s = amgx.data[:storage] n = length(r_p) @@ -56,7 +47,7 @@ function JutulDarcy.gpu_amgx_solve!(amgx::AMGXPreconditioner, r_p) return r_p end -function initialize_amgx_structure!(amgx::AMGXPreconditioner, A::Jutul.StaticCSR.StaticSparsityMatrixCSR, Tv) +function update_pressure_system!(amgx::AMGXPreconditioner, A::Jutul.StaticCSR.StaticSparsityMatrixCSR, Tv) if !haskey(amgx.data, :storage) if Tv == Float64 amgx_mode = AMGX.dDDI @@ -74,16 +65,23 @@ function initialize_amgx_structure!(amgx::AMGXPreconditioner, A::Jutul.StaticCSR row_ptr = Cint.(A.At.colptr .- 1) colval = Cint.(A.At.rowval .- 1) nzval = A.At.nzval - nzval = Tv.(nzval) - - AMGX.upload!(s.matrix, + # TODO: Support for other types than Float64, should be done in setup of + # pressure system + AMGX.pin_memory(nzval) + AMGX.upload!(s.matrix, row_ptr, colval, nzval ) amgx.data[:storage] = s amgx.data[:buffer_p] = AMGX.CUDA.CuVector{Tv}(undef, n) + else + s = amgx.data[:storage] + A_gpu = s.matrix + @assert nnz(A) == nnz(A_gpu) + AMGX.replace_coefficients!(A_gpu, A.At.nzval) end + AMGX.setup!(s.solver, s.matrix) return amgx end From e54d62b175cebd51b3f9d203532be0e3b12d4b10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olav=20M=C3=B8yner?= Date: Sun, 27 Oct 2024 20:28:50 +0100 Subject: [PATCH 22/32] Updates to CUDA --- ext/JutulDarcyAMGXExt/cpr.jl | 2 ++ ext/JutulDarcyCUDAExt/cpr.jl | 1 - src/ext/ext_cuda.jl | 6 +++--- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/ext/JutulDarcyAMGXExt/cpr.jl b/ext/JutulDarcyAMGXExt/cpr.jl index 40ba2230..ff0a60e4 100644 --- a/ext/JutulDarcyAMGXExt/cpr.jl +++ b/ext/JutulDarcyAMGXExt/cpr.jl @@ -38,6 +38,8 @@ mutable struct AMGXStorage{C, R, V, M, S} end function JutulDarcy.gpu_amgx_solve!(amgx::AMGXPreconditioner, r_p) + # TODO: This sync call is maybe needd? + AMGX.CUDA.synchronize() s = amgx.data[:storage] n = length(r_p) AMGX.upload!(s.r, r_p) diff --git a/ext/JutulDarcyCUDAExt/cpr.jl b/ext/JutulDarcyCUDAExt/cpr.jl index b7dfbe18..8df6c932 100644 --- a/ext/JutulDarcyCUDAExt/cpr.jl +++ b/ext/JutulDarcyCUDAExt/cpr.jl @@ -30,7 +30,6 @@ function JutulDarcy.gpu_increment_pressure!(x, dp) bz = div(n, length(dp)) @assert bz > 0 CUDA.@cuda gpu_increment_pressure_kernel!(x, dp, bz) - CUDA.synchronize() return x end diff --git a/src/ext/ext_cuda.jl b/src/ext/ext_cuda.jl index 3c8289f1..15c2f21e 100644 --- a/src/ext/ext_cuda.jl +++ b/src/ext/ext_cuda.jl @@ -12,7 +12,7 @@ function CUDAReservoirKrylov(solver = :gmres, prec = ILUZeroPreconditioner(); kwarg... ) cfg = IterativeSolverConfig(; kwarg...) - CUDAReservoirKrylov{Float_t, Int_t}(solver, cfg, prec, Dict{Symbol, Any}(), nothing) + return CUDAReservoirKrylov{Float_t, Int_t}(solver, cfg, prec, Dict{Symbol, Any}(), nothing) end function Jutul.linear_solve!(lsys::Jutul.LSystem, @@ -111,8 +111,8 @@ function gpu_update_preconditioner! end function Base.show(io::IO, krylov::CUDAReservoirKrylov) - rtol = linear_solver_tolerance(krylov.config, :relative) - atol = linear_solver_tolerance(krylov.config, :absolute) + rtol = Jutul.linear_solver_tolerance(krylov.config, :relative) + atol = Jutul.linear_solver_tolerance(krylov.config, :absolute) print(io, "CUDAReservoirKrylov using $(krylov.solver) (ϵₐ=$atol, ϵ=$rtol) with prec = $(typeof(krylov.preconditioner))") end From f7f7d39ac1571f8d27b36e67e6b955a86e9eb751 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olav=20M=C3=B8yner?= Date: Mon, 28 Oct 2024 08:23:10 +0100 Subject: [PATCH 23/32] Fix for wells --- src/types.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/types.jl b/src/types.jl index 41cb3585..12209d2b 100644 --- a/src/types.jl +++ b/src/types.jl @@ -417,10 +417,12 @@ function SimpleWell( name = :Well, explicit_dp = true, surface_conditions = default_surface_cond(), - reference_depth = 0, + reference_depth = 0.0, volume = 1000.0, # Regularization volume for well, not a real volume kwarg... ) + reference_depth = convert(Float64, reference_depth) + volume = convert(Float64, volume) nr = length(reservoir_cells) WI, gdz = common_well_setup(nr; kwarg...) perf = (self = ones(Int64, nr), reservoir = vec(reservoir_cells), WI = WI, gdz = gdz) From a2c05c7f20f2521d2c08bac9c84b7c9ea5f3ead7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olav=20M=C3=B8yner?= Date: Mon, 28 Oct 2024 08:23:29 +0100 Subject: [PATCH 24/32] Some CUDA code backup --- ext/JutulDarcyAMGXExt/cpr.jl | 1 + ext/JutulDarcyCUDAExt/cpr.jl | 23 ++++++++++++++++++----- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/ext/JutulDarcyAMGXExt/cpr.jl b/ext/JutulDarcyAMGXExt/cpr.jl index ff0a60e4..561ca597 100644 --- a/ext/JutulDarcyAMGXExt/cpr.jl +++ b/ext/JutulDarcyAMGXExt/cpr.jl @@ -78,6 +78,7 @@ function update_pressure_system!(amgx::AMGXPreconditioner, A::Jutul.StaticCSR.St amgx.data[:storage] = s amgx.data[:buffer_p] = AMGX.CUDA.CuVector{Tv}(undef, n) else + pb = amgx.data[:buffer_p] s = amgx.data[:storage] A_gpu = s.matrix @assert nnz(A) == nnz(A_gpu) diff --git a/ext/JutulDarcyCUDAExt/cpr.jl b/ext/JutulDarcyCUDAExt/cpr.jl index 8df6c932..8560f018 100644 --- a/ext/JutulDarcyCUDAExt/cpr.jl +++ b/ext/JutulDarcyCUDAExt/cpr.jl @@ -16,11 +16,11 @@ function reduce_residual_kernel!(r_p, w_p, r_ps) v = zero(T) for comp in 1:ncomp ix = (cell-1)*ncomp + comp - wi = w_p[comp, cell] - ri = r_ps[ix] + @inbounds wi = w_p[comp, cell] + @inbounds ri = r_ps[ix] v += wi*ri end - r_p[cell] = v + @inbounds r_p[cell] = v end return nothing end @@ -29,7 +29,12 @@ function JutulDarcy.gpu_increment_pressure!(x, dp) n = length(x) bz = div(n, length(dp)) @assert bz > 0 - CUDA.@cuda gpu_increment_pressure_kernel!(x, dp, bz) + CUDA.@sync CUDA.@cuda gpu_increment_pressure_kernel!(x, dp, bz) + # kernel = CUDA.@cuda launch=false gpu_increment_pressure_kernel!(x, dp, bz) + # threads, blocks = kernel_helper(kernel, x) + # CUDA.@sync begin + # kernel(x, dp, bz; threads, blocks) + # end return x end @@ -37,7 +42,15 @@ function gpu_increment_pressure_kernel!(x, dp, bz) index = threadIdx().x stride = blockDim().x for cell in index:stride:length(dp) - x[(cell-1)*bz + 1] += dp[cell] + @inbounds x[(cell-1)*bz + 1] += dp[cell] end return nothing end + +function kernel_helper(kernel, V) + config = launch_configuration(kernel.fun) + threads = min(length(V), config.threads) + blocks = cld(length(V), threads) + + return (threads, blocks) +end \ No newline at end of file From 017c1efe1e73c03cb91d8b3ee7d1c045c8ef55ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olav=20M=C3=B8yner?= Date: Mon, 28 Oct 2024 09:06:46 +0100 Subject: [PATCH 25/32] Use CUDA threads in CPR code --- ext/JutulDarcyCUDAExt/cpr.jl | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/ext/JutulDarcyCUDAExt/cpr.jl b/ext/JutulDarcyCUDAExt/cpr.jl index 8560f018..daad2d2f 100644 --- a/ext/JutulDarcyCUDAExt/cpr.jl +++ b/ext/JutulDarcyCUDAExt/cpr.jl @@ -1,9 +1,12 @@ function JutulDarcy.gpu_reduce_residual!(r_p, w_p, r) - # TODO: Actually use CUDA threads here. @assert eltype(r_p) == eltype(w_p) == eltype(r) @assert length(w_p) == length(r) @assert size(w_p, 2) == length(r_p) - CUDA.@cuda reduce_residual_kernel!(r_p, w_p, r) + kernel = CUDA.@cuda launch = false reduce_residual_kernel!(r_p, w_p, r) + threads, blocks = kernel_helper(kernel, r_p) + CUDA.@sync begin + kernel(r_p, w_p, r; threads, blocks) + end return r_p end @@ -29,12 +32,11 @@ function JutulDarcy.gpu_increment_pressure!(x, dp) n = length(x) bz = div(n, length(dp)) @assert bz > 0 - CUDA.@sync CUDA.@cuda gpu_increment_pressure_kernel!(x, dp, bz) - # kernel = CUDA.@cuda launch=false gpu_increment_pressure_kernel!(x, dp, bz) - # threads, blocks = kernel_helper(kernel, x) - # CUDA.@sync begin - # kernel(x, dp, bz; threads, blocks) - # end + kernel = CUDA.@cuda launch=false gpu_increment_pressure_kernel!(x, dp, bz) + threads, blocks = kernel_helper(kernel, dp) + CUDA.@sync begin + kernel(x, dp, bz; threads, blocks) + end return x end @@ -42,7 +44,7 @@ function gpu_increment_pressure_kernel!(x, dp, bz) index = threadIdx().x stride = blockDim().x for cell in index:stride:length(dp) - @inbounds x[(cell-1)*bz + 1] += dp[cell] + x[(cell-1)*bz + 1] += dp[cell] end return nothing end From c6f15b187ca979db26c2e438ff534452b3778aa8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olav=20M=C3=B8yner?= Date: Mon, 28 Oct 2024 09:29:18 +0100 Subject: [PATCH 26/32] Use view instead of custom kernel for CPR dp --- ext/JutulDarcyCUDAExt/cpr.jl | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/ext/JutulDarcyCUDAExt/cpr.jl b/ext/JutulDarcyCUDAExt/cpr.jl index daad2d2f..23e05e69 100644 --- a/ext/JutulDarcyCUDAExt/cpr.jl +++ b/ext/JutulDarcyCUDAExt/cpr.jl @@ -32,10 +32,17 @@ function JutulDarcy.gpu_increment_pressure!(x, dp) n = length(x) bz = div(n, length(dp)) @assert bz > 0 - kernel = CUDA.@cuda launch=false gpu_increment_pressure_kernel!(x, dp, bz) - threads, blocks = kernel_helper(kernel, dp) - CUDA.@sync begin - kernel(x, dp, bz; threads, blocks) + if false + kernel = CUDA.@cuda launch=false gpu_increment_pressure_kernel!(x, dp, bz) + threads, blocks = kernel_helper(kernel, dp) + threads = blocks = 1 + + CUDA.@sync begin + kernel(x, dp, bz; threads, blocks) + end + else + x_view = view(x, 1:bz:(n+1-bz)) + @. x_view += dp end return x end From b1a070ce552dbaeb9351f376c390556d48e592a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olav=20M=C3=B8yner?= Date: Mon, 28 Oct 2024 10:07:22 +0100 Subject: [PATCH 27/32] Performance tweaks --- ext/JutulDarcyCUDAExt/cuda_utils.jl | 2 +- src/cpr.jl | 4 ++++ src/flux.jl | 4 ++-- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/ext/JutulDarcyCUDAExt/cuda_utils.jl b/ext/JutulDarcyCUDAExt/cuda_utils.jl index 1da70ab0..2a9904cf 100644 --- a/ext/JutulDarcyCUDAExt/cuda_utils.jl +++ b/ext/JutulDarcyCUDAExt/cuda_utils.jl @@ -19,7 +19,7 @@ function JutulDarcy.update_gpu_block_system!(J, blockDim, nzval, ϵ = 1e-12) else v = ϵ end - nzval[i] = v + @inbounds nzval[i] = v end end end diff --git a/src/cpr.jl b/src/cpr.jl index 23233348..3d8653a7 100644 --- a/src/cpr.jl +++ b/src/cpr.jl @@ -271,6 +271,10 @@ function update_pressure_system!(A_p::Jutul.StaticSparsityMatrixCSR, p_prec, A:: ncomp = size(w_p, 1) N = Val(ncomp) is_adjoint = Val(Jutul.represented_as_adjoint(matrix_layout(ctx))) + update_rows_csr(nz, A_p, w_p, cols, nz_s, N, is_adjoint, n) +end + +function update_rows_csr(nz, A_p, w_p, cols, nz_s, N, is_adjoint, n) @batch minbatch=tb for row in 1:n update_row_csr!(nz, A_p, w_p, cols, nz_s, row, N, is_adjoint) end diff --git a/src/flux.jl b/src/flux.jl index b8fb843b..6cd8b073 100644 --- a/src/flux.jl +++ b/src/flux.jl @@ -59,7 +59,7 @@ end return q end -function face_average_density(model, state, tpfa, phase) +@inline function face_average_density(model, state, tpfa, phase) ρ = state.PhaseMassDensities l = tpfa.left r = tpfa.right @@ -68,7 +68,7 @@ function face_average_density(model, state, tpfa, phase) return 0.5*(ρ_i + ρ_c) end -function face_average_density(model::CompositionalModel, state, tpfa, phase) +@inline function face_average_density(model::CompositionalModel, state, tpfa, phase) sys = flow_system(model.system) ρ = state.PhaseMassDensities l = tpfa.left From a58af9cf5f2478d2c33ec4ca6eab9dec136f00c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olav=20M=C3=B8yner?= Date: Mon, 28 Oct 2024 10:21:42 +0100 Subject: [PATCH 28/32] Remove some code duplication --- src/flux.jl | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/flux.jl b/src/flux.jl index 6cd8b073..dfe12b01 100644 --- a/src/flux.jl +++ b/src/flux.jl @@ -61,11 +61,7 @@ end @inline function face_average_density(model, state, tpfa, phase) ρ = state.PhaseMassDensities - l = tpfa.left - r = tpfa.right - @inbounds ρ_c = ρ[phase, l] - @inbounds ρ_i = ρ[phase, r] - return 0.5*(ρ_i + ρ_c) + return phase_face_average(ρ, tpfa, phase) end @inline function face_average_density(model::CompositionalModel, state, tpfa, phase) From 96c9495c6e0274bb073c58e4f6f1c948222dac6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olav=20M=C3=B8yner?= Date: Mon, 28 Oct 2024 10:55:09 +0100 Subject: [PATCH 29/32] Reorder destructors for AMGX --- ext/JutulDarcyAMGXExt/cpr.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/JutulDarcyAMGXExt/cpr.jl b/ext/JutulDarcyAMGXExt/cpr.jl index 561ca597..8f902ea7 100644 --- a/ext/JutulDarcyAMGXExt/cpr.jl +++ b/ext/JutulDarcyAMGXExt/cpr.jl @@ -20,9 +20,9 @@ mutable struct AMGXStorage{C, R, V, M, S} function finalize_storage!(amgx_s::AMGXStorage) @async println("Finalizing AMGXStorage") close(amgx_s.solver) - close(amgx_s.matrix) close(amgx_s.r) close(amgx_s.x) + close(amgx_s.matrix) close(amgx_s.resources) close(amgx_s.config) end From 0824cfcea3788b11c7093325c342e18ddcc4881b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olav=20M=C3=B8yner?= Date: Mon, 28 Oct 2024 15:13:25 +0100 Subject: [PATCH 30/32] Bump Jutul compat & version --- Project.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index 20bf60e1..28158464 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "JutulDarcy" uuid = "82210473-ab04-4dce-b31b-11573c4f8e0a" authors = ["Olav Møyner "] -version = "0.2.35" +version = "0.2.36" [deps] AlgebraicMultigrid = "2169fc97-5a83-5252-b627-83903c6c433c" @@ -62,7 +62,7 @@ ForwardDiff = "0.10.30" GLMakie = "0.10.13" GeoEnergyIO = "1.1.12" HYPRE = "1.6.0, 1.7" -Jutul = "0.2.40" +Jutul = "0.2.41" Krylov = "0.9.1" LazyArtifacts = "1" LinearAlgebra = "1" From c3d398c867152b955e40f0d06db71ac2a09c4905 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olav=20M=C3=B8yner?= Date: Mon, 28 Oct 2024 20:10:15 +0100 Subject: [PATCH 31/32] Add simple preconditioner version of AMGX --- ext/JutulDarcyAMGXExt/JutulDarcyAMGXExt.jl | 1 + ext/JutulDarcyAMGXExt/cpr.jl | 12 ++--- ext/JutulDarcyAMGXExt/precond.jl | 53 ++++++++++++++++++++++ ext/JutulDarcyCUDAExt/cuda_utils.jl | 2 +- ext/JutulDarcyCUDAExt/ilu0.jl | 1 + 5 files changed, 59 insertions(+), 10 deletions(-) create mode 100644 ext/JutulDarcyAMGXExt/precond.jl diff --git a/ext/JutulDarcyAMGXExt/JutulDarcyAMGXExt.jl b/ext/JutulDarcyAMGXExt/JutulDarcyAMGXExt.jl index b4baa274..b1df4954 100644 --- a/ext/JutulDarcyAMGXExt/JutulDarcyAMGXExt.jl +++ b/ext/JutulDarcyAMGXExt/JutulDarcyAMGXExt.jl @@ -6,6 +6,7 @@ module JutulDarcyAMGXExt timeit_debug_enabled() = Jutul.timeit_debug_enabled() include("cpr.jl") + include("precond.jl") function __init__() # TODO: Figure out a way to not always do this? diff --git a/ext/JutulDarcyAMGXExt/cpr.jl b/ext/JutulDarcyAMGXExt/cpr.jl index 8f902ea7..bf6d7498 100644 --- a/ext/JutulDarcyAMGXExt/cpr.jl +++ b/ext/JutulDarcyAMGXExt/cpr.jl @@ -38,15 +38,7 @@ mutable struct AMGXStorage{C, R, V, M, S} end function JutulDarcy.gpu_amgx_solve!(amgx::AMGXPreconditioner, r_p) - # TODO: This sync call is maybe needd? - AMGX.CUDA.synchronize() - s = amgx.data[:storage] - n = length(r_p) - AMGX.upload!(s.r, r_p) - AMGX.set_zero!(s.x, n) - AMGX.solve!(s.x, s.solver, s.r) - AMGX.download!(r_p, s.x) - return r_p + Jutul.apply!(r_p, amgx, r_p) end function update_pressure_system!(amgx::AMGXPreconditioner, A::Jutul.StaticCSR.StaticSparsityMatrixCSR, Tv) @@ -76,6 +68,8 @@ function update_pressure_system!(amgx::AMGXPreconditioner, A::Jutul.StaticCSR.St nzval ) amgx.data[:storage] = s + amgx.data[:block_size] = 1 + amgx.data[:n] = n amgx.data[:buffer_p] = AMGX.CUDA.CuVector{Tv}(undef, n) else pb = amgx.data[:buffer_p] diff --git a/ext/JutulDarcyAMGXExt/precond.jl b/ext/JutulDarcyAMGXExt/precond.jl new file mode 100644 index 00000000..a4b42e50 --- /dev/null +++ b/ext/JutulDarcyAMGXExt/precond.jl @@ -0,0 +1,53 @@ +function JutulDarcy.gpu_update_preconditioner!(prec::JutulDarcy.AMGXPreconditioner, lsys, model, storage, recorder, executor, krylov, J_bsr, r_cu, op) + nzval = nonzeros(J_bsr) + bz = Jutul.block_size(lsys[1, 1]) + if haskey(prec.data, :storage) + s = prec.data[:storage] + AMGX.replace_coefficients!(s.matrix, nzval) + else + Tv = eltype(J_bsr) + if Tv == Float64 + amgx_mode = AMGX.dDDI + else + amgx_mode = AMGX.dFFI + end + n, m = size(J_bsr) + @assert n == m + N = n ÷ bz + config = AMGX.Config(prec.settings) + s = AMGXStorage(config, amgx_mode) + + AMGX.set_zero!(s.x, N, block_dim = bz) + AMGX.set_zero!(s.r, N, block_dim = bz) + + row_ptr = J_bsr.rowPtr + colval = J_bsr.colVal + AMGX.upload!( + s.matrix, + row_ptr, + colval, + nzval, + block_dims = (bz, bz) + ) + prec.data[:storage] = s + prec.data[:n] = N + prec.data[:block_size] = bz + end + AMGX.setup!(s.solver, s.matrix) +end + +function Jutul.operator_nrows(prec::JutulDarcy.AMGXPreconditioner) + return prec.data[:n]*prec.data[:block_size] +end + +function Jutul.apply!(x, amgx::JutulDarcy.AMGXPreconditioner, r) + # TODO: This sync call is maybe needd? + AMGX.CUDA.synchronize() + s = amgx.data[:storage] + bz = amgx.data[:block_size]::Int + n = amgx.data[:n]::Int + AMGX.upload!(s.r, r, block_dim = bz) + AMGX.set_zero!(s.x, n, block_dim = bz) + AMGX.solve!(s.x, s.solver, s.r) + AMGX.download!(x, s.x) +end diff --git a/ext/JutulDarcyCUDAExt/cuda_utils.jl b/ext/JutulDarcyCUDAExt/cuda_utils.jl index 2a9904cf..cb5f2e42 100644 --- a/ext/JutulDarcyCUDAExt/cuda_utils.jl +++ b/ext/JutulDarcyCUDAExt/cuda_utils.jl @@ -11,7 +11,7 @@ function JutulDarcy.build_gpu_block_system(Ti, Tv, sz::Tuple{Int, Int}, blockDim end function JutulDarcy.update_gpu_block_system!(J, blockDim, nzval, ϵ = 1e-12) - if ϵ > 0.0 + if ϵ > 0.0 && false for (i, v) in enumerate(nzval) if abs(v) < ϵ if v > 0 diff --git a/ext/JutulDarcyCUDAExt/ilu0.jl b/ext/JutulDarcyCUDAExt/ilu0.jl index 8553ccd5..9bc1e925 100644 --- a/ext/JutulDarcyCUDAExt/ilu0.jl +++ b/ext/JutulDarcyCUDAExt/ilu0.jl @@ -85,6 +85,7 @@ function ilu0_gpu_update!(A, Tv, data) end mb = div(size(A, 1), A.blockDim) ilu0_is_analyzed = haskey(data, :ilu0) + ilu0_is_analyzed = false if ilu0_is_analyzed info, desc = data[:ilu0] else From 240b37c3d7096b0e051521be60e2017efd1fffe8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olav=20M=C3=B8yner?= Date: Tue, 29 Oct 2024 12:11:43 +0100 Subject: [PATCH 32/32] Clean up linear solver set option and re-enable disabled test --- src/test_utils/setup_multimodel.jl | 6 ------ src/utils.jl | 5 ++++- test/multimodel.jl | 15 ++++++--------- 3 files changed, 10 insertions(+), 16 deletions(-) diff --git a/src/test_utils/setup_multimodel.jl b/src/test_utils/setup_multimodel.jl index b1793224..4483837a 100644 --- a/src/test_utils/setup_multimodel.jl +++ b/src/test_utils/setup_multimodel.jl @@ -2,7 +2,6 @@ function simulate_mini_wellcase(::Val{:compositional_2ph_3c}; dims = (3, 1, 1), setuparg = NamedTuple(), output_path = nothing, - default_linsolve = true, nstep = 12*5, total_time = 30.0*si_unit(:day)*nstep, kwarg... @@ -52,7 +51,6 @@ function simulate_mini_wellcase(::Val{:compositional_2ph_3c}; sim, config = setup_reservoir_simulator( model, state0, parameters; info_level = -1, - set_linear_solver = !default_linsolve, error_on_incomplete = true, output_path = output_path, setuparg..., @@ -76,7 +74,6 @@ function simulate_mini_wellcase(::Val{:immiscible_2ph}; permeability = 0.1*9.869232667160130e-13, nstep = 12*5, total_time = 30.0*si_unit(:day)*nstep, - default_linsolve = true, kwarg...) # Some useful constants day = 3600*24 @@ -125,7 +122,6 @@ function simulate_mini_wellcase(::Val{:immiscible_2ph}; sim, config = setup_reservoir_simulator( model, state0, parameters; info_level = -1, - set_linear_solver = !default_linsolve, output_path = output_path, error_on_incomplete = true, setuparg... @@ -145,7 +141,6 @@ function simulate_mini_wellcase(::Val{:bo_spe1}; dims = (3, 1, 1), setuparg = NamedTuple(), output_path = nothing, - default_linsolve = true, nstep = 12*5, total_time = 30.0*si_unit(:day)*nstep, kwarg... @@ -199,7 +194,6 @@ function simulate_mini_wellcase(::Val{:bo_spe1}; sim, config = setup_reservoir_simulator( model, state0, parameters; info_level = -1, - set_linear_solver = !default_linsolve, output_path = output_path, error_on_incomplete = true, setuparg... diff --git a/src/utils.jl b/src/utils.jl index cbad05ef..d51a5336 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -634,13 +634,16 @@ function setup_reservoir_simulator(case::JutulCase; relaxation = false, presolve_wells = false, parray_arg = Dict{Symbol, Any}(), - set_linear_solver = linear_solver isa Symbol || ismissing(linear_solver), + set_linear_solver = missing, linear_solver_arg = Dict{Symbol, Any}(), extra_timing_setup = false, nldd_partition = missing, nldd_arg = Dict{Symbol, Any}(), kwarg... ) + if ismissing(set_linear_solver) + set_linear_solver = linear_solver isa Symbol || ismissing(linear_solver) + end # Handle old kwarg... max_timestep = min(max_dt, max_timestep) extra_kwarg = Dict{Symbol, Any}() diff --git a/test/multimodel.jl b/test/multimodel.jl index f989ab32..71474230 100644 --- a/test/multimodel.jl +++ b/test/multimodel.jl @@ -127,15 +127,12 @@ end for b in [false, true] for backend in [:csr, :csc] for gen_ad in [true, false] - for default_linsolve in [false, true] - @testset "Block=$b, backend=$b, defaulted linsolve=$default_linsolve" begin - arg = (general_ad = gen_ad, backend = backend, block_backend = b, default_linsolve = default_linsolve) - test_compositional_with_wells(; arg...) - # Disabled test until Jutul is updated. - # test_compositional_with_wells(; fast_flash = true, arg...) - test_immiscible_with_wells(; arg...) - test_blackoil_with_wells(; arg...) - end + @testset "Block=$b, backend=$b" begin + arg = (general_ad = gen_ad, backend = backend, block_backend = b) + test_compositional_with_wells(; arg...) + test_compositional_with_wells(; fast_flash = true, arg...) + test_immiscible_with_wells(; arg...) + test_blackoil_with_wells(; arg...) end end end