From ef6c0a1bf040e4f6c1aff309f1faec514658b693 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Olav=20M=C3=B8yner?= <olav.moyner@gmail.com>
Date: Wed, 23 Oct 2024 20:22:31 +0200
Subject: [PATCH 01/32] Add a basic reservoir version of CUDA linear solver

---
 Project.toml                               |   6 +-
 ext/JutulDarcyCUDAExt/JutulDarcyCUDAExt.jl |   6 +
 ext/JutulDarcyCUDAExt/cuda_utils.jl        |  31 ++++
 ext/JutulDarcyCUDAExt/ilu0.jl              | 166 +++++++++++++++++++++
 ext/JutulDarcyCUDAExt/krylov.jl            |   0
 src/ext/ext.jl                             |   1 +
 src/ext/ext_cuda.jl                        | 111 ++++++++++++++
 test/gpu.jl                                | 117 +++++++--------
 8 files changed, 374 insertions(+), 64 deletions(-)
 create mode 100644 ext/JutulDarcyCUDAExt/JutulDarcyCUDAExt.jl
 create mode 100644 ext/JutulDarcyCUDAExt/cuda_utils.jl
 create mode 100644 ext/JutulDarcyCUDAExt/ilu0.jl
 create mode 100644 ext/JutulDarcyCUDAExt/krylov.jl
 create mode 100644 src/ext/ext_cuda.jl

diff --git a/Project.toml b/Project.toml
index 985523ca..bbcf5bbc 100644
--- a/Project.toml
+++ b/Project.toml
@@ -34,6 +34,7 @@ TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
 Tullio = "bc48ee85-29a4-5162-ae0b-a64e1601d4bc"
 
 [weakdeps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
 HYPRE = "b5ffcf37-a2bd-41ab-a3da-4bd9bc8ad771"
 MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
@@ -41,6 +42,7 @@ Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a"
 PartitionedArrays = "5a9dfac6-5c52-46f7-8278-5e2210713be9"
 
 [extensions]
+JutulDarcyCUDAExt = "CUDA"
 JutulDarcyGLMakieExt = "GLMakie"
 JutulDarcyMakieExt = "Makie"
 JutulDarcyPartitionedArraysExt = ["PartitionedArrays", "MPI", "HYPRE"]
@@ -48,6 +50,7 @@ JutulDarcyPartitionedArraysExt = ["PartitionedArrays", "MPI", "HYPRE"]
 [compat]
 AlgebraicMultigrid = "0.5.1, 0.6.0"
 Artifacts = "1"
+CUDA = "5"
 DataStructures = "0.18.13"
 Dates = "1"
 DelimitedFiles = "1.6"
@@ -78,6 +81,7 @@ Tullio = "0.3.4"
 julia = "1.7"
 
 [extras]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
 HYPRE = "b5ffcf37-a2bd-41ab-a3da-4bd9bc8ad771"
 MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
@@ -87,4 +91,4 @@ TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a"
 TestItems = "1c621080-faea-4a02-84b6-bbd5e436b8fe"
 
 [targets]
-test = ["Test", "TestItemRunner", "TestItems", "HYPRE", "MPI", "PartitionedArrays"]
+test = ["Test", "CUDA", "TestItemRunner", "TestItems", "HYPRE", "MPI", "PartitionedArrays"]
diff --git a/ext/JutulDarcyCUDAExt/JutulDarcyCUDAExt.jl b/ext/JutulDarcyCUDAExt/JutulDarcyCUDAExt.jl
new file mode 100644
index 00000000..4086db57
--- /dev/null
+++ b/ext/JutulDarcyCUDAExt/JutulDarcyCUDAExt.jl
@@ -0,0 +1,6 @@
+module JutulDarcyCUDAExt
+    using Jutul, JutulDarcy, CUDA, LinearAlgebra, SparseArrays
+    include("ilu0.jl")
+    include("krylov.jl")
+    include("cuda_utils.jl")
+end
diff --git a/ext/JutulDarcyCUDAExt/cuda_utils.jl b/ext/JutulDarcyCUDAExt/cuda_utils.jl
new file mode 100644
index 00000000..18bc2c66
--- /dev/null
+++ b/ext/JutulDarcyCUDAExt/cuda_utils.jl
@@ -0,0 +1,31 @@
+function JutulDarcy.build_gpu_block_system(Ti, Tv, sz::Tuple{Int, Int}, blockDim::Int, rowptr, colval, nzval, r0)
+    rowPtr = CuVector{Ti}(rowptr)
+    colVal = CuVector{Ti}(colval)
+    nzVal = CuVector{Tv}(nzval)
+    dims = blockDim.*sz
+    dir = 'C'
+    nnz = length(nzVal)÷(blockDim^2)
+    J_bsr = CUDA.CUSPARSE.CuSparseMatrixBSR{Tv, Ti}(rowPtr, colVal, nzVal, dims, blockDim, dir, nnz)
+    r_cu = CuVector{Tv}(vec(r0))
+    return (J_bsr, r_cu)
+end
+
+function JutulDarcy.update_gpu_block_system!(J, blockDim, nzval, ϵ = 1e-12)
+    if ϵ > 0.0
+        for (i, v) in enumerate(nzval)
+            if abs(v) < ϵ
+                if v > 0
+                    v = ϵ
+                else
+                    v = ϵ
+                end
+                nzval[i] = v
+            end
+        end
+    end
+    copyto!(J.nzVal, nzval)
+end
+
+function JutulDarcy.update_gpu_block_residual!(r_cu, blockDim, r)
+    copyto!(r_cu, r)
+end
diff --git a/ext/JutulDarcyCUDAExt/ilu0.jl b/ext/JutulDarcyCUDAExt/ilu0.jl
new file mode 100644
index 00000000..08e99ad6
--- /dev/null
+++ b/ext/JutulDarcyCUDAExt/ilu0.jl
@@ -0,0 +1,166 @@
+function ilu0_invert!(y, A, Tv, c_char, data)
+    if Tv == Float32
+        bname = CUDA.CUSPARSE.cusparseSbsrsv2_bufferSize
+        aname = CUDA.CUSPARSE.cusparseSbsrsv2_analysis
+        sname = CUDA.CUSPARSE.cusparseSbsrsv2_solve
+    else
+        bname = CUDA.CUSPARSE.cusparseDbsrsv2_bufferSize
+        aname = CUDA.CUSPARSE.cusparseDbsrsv2_analysis
+        sname = CUDA.CUSPARSE.cusparseDbsrsv2_solve
+    end
+    if c_char == 'U'
+        d_char = 'N'
+        kval = :ilu_upper
+    else
+        @assert c_char == 'L'
+        d_char = 'U'
+        kval = :ilu_lower
+    end
+    transa = 'N'
+    uplo = c_char
+    diag = d_char
+    alpha = 1.0
+    index = 'O'
+    X = y
+    m,n = size(A)
+    if m != n
+        throw(DimensionMismatch("A must be square, but has dimensions ($m,$n)!"))
+    end
+
+    mb = cld(m, A.blockDim)
+
+    is_analysed = haskey(data, kval)
+    # is_analysed = false
+    if is_analysed
+        info, desc = data[kval]
+    else
+        info = CUDA.CUSPARSE.bsrsv2Info_t[0]
+        desc = CUDA.CUSPARSE.CuMatrixDescriptor('G', uplo, diag, index)
+        CUDA.CUSPARSE.cusparseCreateBsrsv2Info(info)
+        data[kval] = (info, desc)
+    end
+
+    function bufferSize()
+        out = Ref{Cint}(1)
+        bname(CUDA.CUSPARSE.handle(), A.dir, transa, mb, A.nnzb,
+            desc, nonzeros(A), A.rowPtr, A.colVal, A.blockDim,
+            info[1], out)
+        return out[]
+    end
+    CUDA.CUSPARSE.with_workspace(bufferSize) do buffer
+        if !is_analysed
+            aname(CUDA.CUSPARSE.handle(), A.dir, transa, mb, A.nnzb,
+                    desc, nonzeros(A), A.rowPtr, A.colVal, A.blockDim,
+                    info[1], CUDA.CUSPARSE.CUSPARSE_SOLVE_POLICY_USE_LEVEL, buffer)
+            posit = Ref{Cint}(1)
+            CUDA.CUSPARSE.cusparseXbsrsv2_zeroPivot(CUDA.CUSPARSE.handle(), info[1], posit)
+            if posit[] >= 0
+                error("Structural/numerical zero in A at ($(posit[]),$(posit[])))")
+            end
+        end
+        sname(CUDA.CUSPARSE.handle(), A.dir, transa, mb, A.nnzb,
+                alpha, desc, nonzeros(A), A.rowPtr, A.colVal,
+                A.blockDim, info[1], X, X,
+                CUDA.CUSPARSE.CUSPARSE_SOLVE_POLICY_USE_LEVEL, buffer)
+    end
+    # Alternatively we could not cache setup and do the following:
+    # if c_char == 'U'
+    #     CUDA.CUSPARSE.sv2!('N', 'U', 'N', 1.0, A, y, 'O')
+    # else
+    #     @assert c_char == 'L'
+    #     CUDA.CUSPARSE.sv2!('N', 'L', 'U', 1.0, A, y, 'O')
+    # end
+    return y
+end
+
+function ilu0_gpu_update!(A, Tv, data)
+    if Tv == Float32
+        bname = CUDA.CUSPARSE.cusparseSbsrilu02_bufferSize
+        aname = CUDA.CUSPARSE.cusparseSbsrilu02_analysis
+        sname = CUDA.CUSPARSE.cusparseSbsrilu02
+    else
+        bname = CUDA.CUSPARSE.cusparseDbsrilu02_bufferSize
+        aname = CUDA.CUSPARSE.cusparseDbsrilu02_analysis
+        sname = CUDA.CUSPARSE.cusparseDbsrilu02
+    end
+    mb = div(size(A, 1), A.blockDim)
+    ilu0_is_analyzed = haskey(data, :ilu0)
+    if ilu0_is_analyzed
+        info, desc = data[:ilu0]
+    else
+        info = CUDA.CUSPARSE.ILU0InfoBSR()
+        desc = CUDA.CUSPARSE.CuMatrixDescriptor('G', 'U', 'N', 'O')
+        data[:ilu0] = (info, desc)
+    end
+
+    function bufferSize()
+        out = Ref{Cint}(1)
+        bname(CUDA.CUSPARSE.handle(), A.dir, mb, nnz(A), desc, nonzeros(A),
+               A.rowPtr, A.colVal, A.blockDim, info, out)
+        return out[]
+    end
+
+    policy = CUDA.CUSPARSE.CUSPARSE_SOLVE_POLICY_USE_LEVEL
+    CUDA.CUSPARSE.with_workspace(bufferSize) do buffer
+        if !ilu0_is_analyzed
+            aname(
+                CUDA.CUSPARSE.handle(),
+                A.dir,
+                mb,
+                nnz(A),
+                desc,
+                nonzeros(A),
+                A.rowPtr,
+                A.colVal,
+                A.blockDim,
+                info,
+                policy,
+                buffer
+            )
+            posit = Ref{Cint}(1)
+            CUDA.CUSPARSE.cusparseXbsrilu02_zeroPivot(CUDA.CUSPARSE.handle(), info, posit)
+            if posit[] >= 0
+                error("Structural/numerical zero in A at ($(posit[]),$(posit[])))")
+            end
+        end
+        sname(
+            CUDA.CUSPARSE.handle(),
+            A.dir,
+            mb,
+            SparseArrays.nnz(A),
+            desc,
+            nonzeros(A),
+            A.rowPtr,
+            A.colVal,
+            A.blockDim,
+            info,
+            policy,
+            buffer
+        )
+    end
+end
+
+
+function Jutul.update_preconditioner!(ilu::ILUZeroPreconditioner, J_bsr::CUDA.CUSPARSE.CuSparseMatrixBSR{Tv, Ti}, b, context, executor) where {Tv, Ti}
+    if isnothing(ilu.factor)
+        data = Dict()
+        data[:ilu0_gpu] = copy(J_bsr)
+        ilu.dim = size(J_bsr)
+        ilu.factor = data
+    end
+    data = ilu.factor
+    ilu_gpu = data[:ilu0_gpu]
+    copyto!(ilu_gpu.nzVal, J_bsr.nzVal)
+    ilu0_gpu_update!(ilu_gpu, Tv, data)
+end
+
+function Jutul.apply!(y::CuVector{Tv}, f::ILUZeroPreconditioner, x::CuVector{Tv}) where Tv
+    copyto!(y, x)
+    # Equivialent but does not skip analyse
+    # sv2!('N', 'L', 'U', 1.0, P, y, 'O')
+    # sv2!('N', 'U', 'N', 1.0, P, y, 'O')
+    P = f.factor[:ilu0_gpu]
+    ilu0_invert!(y, P, Tv, 'L', f.factor)
+    ilu0_invert!(y, P, Tv, 'U', f.factor)
+    CUDA.synchronize()
+end
diff --git a/ext/JutulDarcyCUDAExt/krylov.jl b/ext/JutulDarcyCUDAExt/krylov.jl
new file mode 100644
index 00000000..e69de29b
diff --git a/src/ext/ext.jl b/src/ext/ext.jl
index f1700836..0b716955 100644
--- a/src/ext/ext.jl
+++ b/src/ext/ext.jl
@@ -1,3 +1,4 @@
 include("ext_makie.jl")
 include("ext_partitionedarrays.jl")
+include("ext_cuda.jl")
 
diff --git a/src/ext/ext_cuda.jl b/src/ext/ext_cuda.jl
new file mode 100644
index 00000000..593766a4
--- /dev/null
+++ b/src/ext/ext_cuda.jl
@@ -0,0 +1,111 @@
+mutable struct CUDAReservoirKrylov{Tv, Ti} <: Jutul.AbstractKrylov
+    solver::Symbol
+    config::IterativeSolverConfig
+    preconditioner
+    data
+    storage
+end
+
+function CUDAReservoirKrylov(solver = :gmres, prec = ILUZeroPreconditioner();
+        Float_t = Float64,
+        Int_t = Int32,
+        kwarg...
+    )
+    cfg = IterativeSolverConfig(; kwarg...)
+    CUDAReservoirKrylov{Float_t, Int_t}(solver, cfg, prec, Dict{Symbol, Any}(), nothing)
+end
+
+function Jutul.linear_solve!(lsys::Jutul.LinearizedSystem,
+        krylov::CUDAReservoirKrylov{Tv, Ti},
+        model,
+        storage = nothing,
+        dt = nothing,
+        recorder = ProgressRecorder(),
+        executor = default_executor();
+        dx = lsys.dx_buffer,
+        r = Jutul.vector_residual(lsys),
+        atol = Jutul.linear_solver_tolerance(krylov, :absolute),
+        rtol = Jutul.linear_solver_tolerance(krylov, :relative)
+    ) where {Tv, Ti}
+    cfg = krylov.config
+
+    atol = convert(Tv, atol)
+    rtol = convert(Tv, rtol)
+    t_prep0 = @elapsed Jutul.prepare_linear_solve!(lsys)
+
+    L = lsys[1,1]
+    J = L.jac
+    J::Jutul.StaticSparsityMatrixCSR
+    r0 = L.r_buffer
+    bz = Jutul.block_size(lsys)
+    sz = size(J)
+    n, m = sz
+    @assert n == m
+    is_first = !haskey(krylov.data, :J)
+    t_setup = @elapsed if is_first
+        krylov.data[:J], krylov.data[:r] = build_gpu_block_system(Ti, Tv, sz, bz, J.At.colptr, J.At.rowval, L.jac_buffer, r0)
+    end
+    J_bsr = krylov.data[:J]
+    r_cu = krylov.data[:r]
+
+    t_gpu_update = @elapsed begin
+        update_gpu_block_residual!(r_cu, bz, r0)
+        update_gpu_block_system!(J_bsr, bz, L.jac_buffer)
+    end
+    t_prep = t_gpu_update + t_setup + t_prep0
+    max_it = krylov.config.max_iterations
+
+    Jutul.update_preconditioner!(krylov.preconditioner, J_bsr, r_cu, model.context, executor)
+    prec_op = Jutul.preconditioner(krylov, lsys, model, storage, recorder, Tv)
+    if cfg.precond_side == :right
+        preconditioner_arg = (N = prec_op, )
+    else
+        preconditioner_arg = (M = prec_op, )
+    end
+    solve_f, F = Jutul.krylov_jl_solve_function(krylov, J_bsr, r_cu)
+    solve_f(F, J_bsr, r_cu;
+        itmax = max_it,
+        preconditioner_arg...,
+        verbose = 0,
+        ldiv = false,
+        history = true,
+        rtol = rtol,
+        atol = atol
+    )
+    dx_gpu, stats = (krylov.storage.x, krylov.storage.stats)
+
+    copyto!(dx, dx_gpu)
+    @. dx = -dx
+
+    res = stats.residuals
+    solved = stats.solved
+    n = length(res) - 1
+
+    t_prec = 0.0
+    if prec_op isa Jutul.PrecondWrapper
+        t_prec_op = prec_op.time
+        t_prec_count = prec_op.count
+    else
+        t_prec_op = 0.0
+        t_prec_count = 0
+    end
+    return Jutul.linear_solve_return(solved, n, stats, precond = t_prec_op, precond_count = t_prec_count, prepare = t_prec + t_prep)
+end
+
+function Base.show(io::IO, krylov::CUDAReservoirKrylov)
+    rtol = linear_solver_tolerance(krylov.config, :relative)
+    atol = linear_solver_tolerance(krylov.config, :absolute)
+    print(io, "CUDAReservoirKrylov using $(krylov.solver) (ϵₐ=$atol, ϵ=$rtol) with prec = $(typeof(krylov.preconditioner))")
+end
+
+function build_gpu_block_system
+
+end
+
+function update_gpu_block_system!
+
+end
+
+function update_gpu_block_residual!
+
+end
diff --git a/test/gpu.jl b/test/gpu.jl
index 19590eb8..75de7e17 100644
--- a/test/gpu.jl
+++ b/test/gpu.jl
@@ -1,69 +1,60 @@
-using Jutul, JutulDarcy
-using LinearAlgebra, Krylov
-using Printf
-using ForwardDiff
-using CUDA, Test
-
-function test_single_phase_gpu(casename = "pico"; float_type = Float32, pvfrac=0.05, tstep = [1.0])#[1.0, 2.0])
-    G, mrst_data = get_minimal_tpfa_grid_from_mrst(casename, extraout = true)
-    nc = number_of_cells(G)
-    # Parameters
+using Jutul, JutulDarcy, CUDA, Test
+
+function solve_bl_lsolve(; nx = 10, ny = 1, nstep = nx*ny, lsolve = missing, backend = :csr, step_limit = nothing, kwarg...)
+    time = 1.0
+    T = time
+    tstep = repeat([T/nstep], nstep)
+    mesh = CartesianMesh((nx, ny))
+    domain = reservoir_domain(mesh)
+    nc = number_of_cells(domain)
+    timesteps = tstep*3600*24
     bar = 1e5
-    p0 = 100*bar # 100 bar
-
-    # Single-phase liquid system (compressible pressure equation)
-    phase = LiquidPhase()
-    sys = SinglePhaseSystem(phase)
-    # Simulation model wraps grid and system together with context (which will be used for GPU etc)
-    ctx = SingleCUDAContext(float_type)
-    linsolve = GenericKrylov(Krylov.dqgmres, preconditioner = ILUZeroPreconditioner(), relative_tolerance = 0.01)
-    model = SimulationModel(G, sys, context = ctx)
-
-
-    s = model.secondary_variables
-
-    mu_c = transfer(ctx, repeat([1e-3], 1, nc))
-    mu = ConstantVariables(mu_c, Cells(), single_entity = false)
-    s[:PhaseViscosities] = mu
-
-    s_d = transfer(ctx, ones(1, nc))
-    sat = ConstantVariables(s_d, Cells(), single_entity = false)
-    s[:Saturations] = sat
-    s[:RelativePermeabilities] = sat
-    # System state
-    pv = physical_representation(model).pore_volumes
-    timesteps = tstep*3600*24 # 1 day, 2 days
-    tot_time = sum(timesteps)  
-    irate = pvfrac*sum(pv)/tot_time
-    src = [SourceTerm(1, irate), 
-           SourceTerm(nc, -irate)]
-    src = CuArray(src)
-    forces = setup_forces(model, sources = src)
-
-    # State is dict with pressure in each cell
-    init = Dict(:Pressure => p0)
-    state0 = setup_state(model, init)
-    # Model parameters
+    p0 = 100*bar
+    sys = ImmiscibleSystem((LiquidPhase(), VaporPhase()), reference_densities = [100.0, 100.0])
+    model = setup_reservoir_model(domain, sys, extra_out = false, backend = backend)
+    kr = BrooksCoreyRelativePermeabilities(sys, [2.0, 2.0], [0.2, 0.2])
+    rmodel = reservoir_model(model)
+    c = [1e-3, 1e-3]/bar
+    density = ConstantCompressibilityDensities(
+        p_ref = 100*bar,
+        density_ref = [100.0, 100.0],
+        compressibility = c
+    )
+    replace_variables!(rmodel, RelativePermeabilities = kr, PhaseMassDensities = density)
+    tot_time = sum(timesteps)
+    pv = pore_volume(domain)
+    irate = 500*sum(pv)/tot_time
+    src  = SourceTerm(nc, irate, fractional_flow = [0.75, 0.25])
+    bc = FlowBoundaryCondition(1, p0/2)
+    forces = setup_reservoir_forces(model, sources = src, bc = bc)
     parameters = setup_parameters(model)
-    parameters[:reference_densities] = CuArray(float_type.([1.0]))
-
-    # linsolve = nothing
-    sim = Simulator(model, state0 = state0, parameters = parameters)
-    info_level = 2
-    debug_level = 2
-    simulate(sim, timesteps, forces = forces, linear_solver = linsolve, 
-            debug_level = debug_level, info_level = info_level, output_states = false)
-    return true
+    state0 = setup_reservoir_state(model, Pressure = p0, Saturations = [0.25, 0.75])
+    if ismissing(lsolve)
+        lsolve = reservoir_linsolve(model)
+    end
+    if !isnothing(step_limit)
+        timesteps = timesteps[1:step_limit]
+    end
+    states, report = simulate(state0, model, timesteps; failure_cuts_timestep = false,
+        forces = forces, parameters = parameters, linear_solver = lsolve, kwarg...)
+    return states[end][:Reservoir][:Saturations][1, :]
 end
-if has_cuda_gpu()
-    CUDA.allowscalar(false)
-    casename = "pico"
-    @testset "GPU multiphase" begin
-        @testset "Basic flow - single precision" begin
-            @test test_single_phase_gpu(casename, float_type = Float32)
-        end
-        @testset "Basic flow - double precision" begin
-            @test test_single_phase_gpu(casename, float_type = Float64)
+
+if CUDA.functional()
+    do_solve(x) = solve_bl_lsolve(
+        lsolve = x,
+        info_level = -1
+    )
+    @testset "SimulationModel" begin
+        krylov_cpu = GenericKrylov(:bicgstab, preconditioner = ILUZeroPreconditioner())
+        s_cpu = do_solve(krylov_cpu)
+
+        for T in [Float32, Float64]
+            for solver in [:bicgstab, :gmres]
+                krylov_cu = JutulDarcy.CUDAReservoirKrylov(solver, Float_t = T)
+                s_cu = do_solve(krylov_cu)
+                @test s_cpu ≈ s_cu
+            end
         end
     end
 end

From f778b4ff0cba9d24d0691e4864151c3a6fd6f038 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Olav=20M=C3=B8yner?= <olav.moyner@gmail.com>
Date: Thu, 24 Oct 2024 11:26:46 +0200
Subject: [PATCH 02/32] Update utils.jl

---
 src/utils.jl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/utils.jl b/src/utils.jl
index ee8c7742..cbad05ef 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -630,18 +630,17 @@ function setup_reservoir_simulator(case::JutulCase;
         failure_cuts_timestep = true,
         max_timestep_cuts = 25,
         inc_tol_dz = Inf,
-        set_linear_solver = true,
         timesteps = :auto,
         relaxation = false,
         presolve_wells = false,
         parray_arg = Dict{Symbol, Any}(),
+        set_linear_solver = linear_solver isa Symbol || ismissing(linear_solver),
         linear_solver_arg = Dict{Symbol, Any}(),
         extra_timing_setup = false,
         nldd_partition = missing,
         nldd_arg = Dict{Symbol, Any}(),
         kwarg...
     )
-    set_linear_solver = set_linear_solver || linear_solver isa Symbol
     # Handle old kwarg...
     max_timestep = min(max_dt, max_timestep)
     extra_kwarg = Dict{Symbol, Any}()
@@ -699,6 +698,8 @@ function setup_reservoir_simulator(case::JutulCase;
     if set_linear_solver
         if info_level < 1
             v = -1
+        elseif info_level > 4
+            v = 1
         else
             v = 0
         end

From e2ff69fe6abe55446ea2c8ecbdb549c60aa1e213 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Olav=20M=C3=B8yner?= <olav.moyner@gmail.com>
Date: Thu, 24 Oct 2024 13:52:01 +0200
Subject: [PATCH 03/32] Generalize CUDA a bit

---
 src/ext/ext_cuda.jl | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/ext/ext_cuda.jl b/src/ext/ext_cuda.jl
index 593766a4..cc764948 100644
--- a/src/ext/ext_cuda.jl
+++ b/src/ext/ext_cuda.jl
@@ -15,7 +15,7 @@ function CUDAReservoirKrylov(solver = :gmres, prec = ILUZeroPreconditioner();
     CUDAReservoirKrylov{Float_t, Int_t}(solver, cfg, prec, Dict{Symbol, Any}(), nothing)
 end
 
-function Jutul.linear_solve!(lsys::Jutul.LinearizedSystem,
+function Jutul.linear_solve!(lsys::Jutul.LSystem,
         krylov::CUDAReservoirKrylov{Tv, Ti},
         model,
         storage = nothing,
@@ -36,21 +36,22 @@ function Jutul.linear_solve!(lsys::Jutul.LinearizedSystem,
     L = lsys[1,1]
     J = L.jac
     J::Jutul.StaticSparsityMatrixCSR
-    r0 = L.r_buffer
-    bz = Jutul.block_size(lsys)
+    csr_block_buffer = L.jac_buffer
+
+    bz = Jutul.block_size(L)
     sz = size(J)
     n, m = sz
     @assert n == m
     is_first = !haskey(krylov.data, :J)
     t_setup = @elapsed if is_first
-        krylov.data[:J], krylov.data[:r] = build_gpu_block_system(Ti, Tv, sz, bz, J.At.colptr, J.At.rowval, L.jac_buffer, r0)
+        krylov.data[:J], krylov.data[:r] = build_gpu_block_system(Ti, Tv, sz, bz, J.At.colptr, J.At.rowval, csr_block_buffer, r)
     end
     J_bsr = krylov.data[:J]
     r_cu = krylov.data[:r]
 
     t_gpu_update = @elapsed begin
-        update_gpu_block_residual!(r_cu, bz, r0)
-        update_gpu_block_system!(J_bsr, bz, L.jac_buffer)
+        update_gpu_block_residual!(r_cu, bz, r)
+        update_gpu_block_system!(J_bsr, bz, csr_block_buffer)
     end
     t_prep = t_gpu_update + t_setup + t_prep0
     max_it = krylov.config.max_iterations

From 93bb1e1394685d6ee6d410f13c53c405bb0c87ee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Olav=20M=C3=B8yner?= <olav.moyner@gmail.com>
Date: Thu, 24 Oct 2024 14:55:37 +0200
Subject: [PATCH 04/32] Put in place some Schur stuff

---
 ext/JutulDarcyCUDAExt/cuda_utils.jl | 59 +++++++++++++++++++++++++++++
 src/ext/ext_cuda.jl                 |  7 ++++
 2 files changed, 66 insertions(+)

diff --git a/ext/JutulDarcyCUDAExt/cuda_utils.jl b/ext/JutulDarcyCUDAExt/cuda_utils.jl
index 18bc2c66..dcffacb7 100644
--- a/ext/JutulDarcyCUDAExt/cuda_utils.jl
+++ b/ext/JutulDarcyCUDAExt/cuda_utils.jl
@@ -29,3 +29,62 @@ end
 function JutulDarcy.update_gpu_block_residual!(r_cu, blockDim, r)
     copyto!(r_cu, r)
 end
+
+function JutulDarcy.build_gpu_schur_system(Ti, Tv, bz, lsys::LinearizedSystem)
+    return nothing
+end
+
+function JutulDarcy.build_gpu_schur_system(Ti, Tv, bz, lsys::MultiLinearizedSystem)
+    @assert size(lsys.subsystems) == (2, 2)
+    @assert length(lsys.schur_buffer) == 2
+    buf_full_cpu = lsys.schur_buffer[1]
+    buf_1_cpu, buf_2_cpu = lsys.schur_buffer[2]
+    B_cpu = lsys[1, 1].jac # Already transferred, not needed
+    C_cpu = lsys[1, 2].jac
+    D_cpu = lsys[2, 1].jac
+    E_cpu = lsys[2, 2].jac
+
+    E_factor = only(lsys.factor.factor)
+    E_L_cpu = E_factor.L
+    E_U_cpu = E_factor.U
+
+    to_gpu(x) = copy_to_gpu(x, Tv, Ti)
+
+    D = to_gpu(D_cpu)
+    E_L = to_gpu(E_L_cpu)
+    E_U = to_gpu(E_U_cpu)
+    C = to_gpu(C_cpu)
+
+    buf_1 = to_gpu(buf_1_cpu)
+    buf_2 = to_gpu(buf_2_cpu)
+    # Operations and format:
+    # B C
+    # D E
+    # mul!(b_buf_2, D_i, x)
+    # ldiv!(b_buf_1, E_i, b_buf_2)
+    # mul!(res, C_i, b_buf_1, -α, true)
+
+    return Dict(
+        :C => C,
+        :D => D,
+        :buf_1 => buf_1,
+        :buf_2 => buf_2,
+        :E_L => E_L,
+        :E_U => E_U
+    )
+end
+
+function copy_to_gpu(x, Ti, Tv)
+    error("Internal converter not set up for $(typeof(x))")
+end
+
+function copy_to_gpu(x::SparseMatrixCSC{Tvc, Tic}, Tv, Ti) where {Tvc, Tic}
+    if Tvc != Tv || Tic != Ti
+        x = SparseMatrixCSC{Tv, Ti}(x)
+    end
+    return CUDA.CUSPARSE.CuSparseMatrixCSC(x)
+end
+
+function copy_to_gpu(x::Vector{Tvc}, Tv, Ti) where {Tvc}
+    return convert(CuVector{Tv}, x)
+end
diff --git a/src/ext/ext_cuda.jl b/src/ext/ext_cuda.jl
index cc764948..c7f96ff8 100644
--- a/src/ext/ext_cuda.jl
+++ b/src/ext/ext_cuda.jl
@@ -33,6 +33,8 @@ function Jutul.linear_solve!(lsys::Jutul.LSystem,
     rtol = convert(Tv, rtol)
     t_prep0 = @elapsed Jutul.prepare_linear_solve!(lsys)
 
+    is_single_linear_system = lsys isa LinearizedSystem
+
     L = lsys[1,1]
     J = L.jac
     J::Jutul.StaticSparsityMatrixCSR
@@ -45,6 +47,7 @@ function Jutul.linear_solve!(lsys::Jutul.LSystem,
     is_first = !haskey(krylov.data, :J)
     t_setup = @elapsed if is_first
         krylov.data[:J], krylov.data[:r] = build_gpu_block_system(Ti, Tv, sz, bz, J.At.colptr, J.At.rowval, csr_block_buffer, r)
+        krylov.data[:schur] = build_gpu_schur_system(Ti, Tv, bz, lsys)
     end
     J_bsr = krylov.data[:J]
     r_cu = krylov.data[:r]
@@ -107,6 +110,10 @@ function update_gpu_block_system!
 
 end
 
+function build_gpu_schur_system
+
+end
+
 function update_gpu_block_residual!
 
 end

From b55ba61c4969ddcfc89c1ac23f652862eb75b0c6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Olav=20M=C3=B8yner?= <olav.moyner@gmail.com>
Date: Thu, 24 Oct 2024 15:07:56 +0200
Subject: [PATCH 05/32] Put in place some more factorization updates

---
 ext/JutulDarcyCUDAExt/cuda_utils.jl | 15 +++++++++++++--
 src/ext/ext_cuda.jl                 | 12 ++++++++++--
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/ext/JutulDarcyCUDAExt/cuda_utils.jl b/ext/JutulDarcyCUDAExt/cuda_utils.jl
index dcffacb7..8713879d 100644
--- a/ext/JutulDarcyCUDAExt/cuda_utils.jl
+++ b/ext/JutulDarcyCUDAExt/cuda_utils.jl
@@ -39,10 +39,10 @@ function JutulDarcy.build_gpu_schur_system(Ti, Tv, bz, lsys::MultiLinearizedSyst
     @assert length(lsys.schur_buffer) == 2
     buf_full_cpu = lsys.schur_buffer[1]
     buf_1_cpu, buf_2_cpu = lsys.schur_buffer[2]
-    B_cpu = lsys[1, 1].jac # Already transferred, not needed
+    # B_cpu = lsys[1, 1].jac # Already transferred, not needed
     C_cpu = lsys[1, 2].jac
     D_cpu = lsys[2, 1].jac
-    E_cpu = lsys[2, 2].jac
+    # E_cpu = lsys[2, 2].jac # Only factorization needed I think.
 
     E_factor = only(lsys.factor.factor)
     E_L_cpu = E_factor.L
@@ -74,6 +74,17 @@ function JutulDarcy.build_gpu_schur_system(Ti, Tv, bz, lsys::MultiLinearizedSyst
     )
 end
 
+function JutulDarcy.update_gpu_schur_system!(schur, lsys)
+    C_cpu = lsys[1, 2].jac
+    copyto!(schur[:C].nzVal, C_cpu.nzval)
+    D_cpu = lsys[2, 1].jac
+    copyto!(schur[:D].nzVal, D_cpu.nzval)
+    E_factor = only(lsys.factor.factor)
+    copyto!(schur[:E_L].nzVal, E_factor.L.nzval)
+    copyto!(schur[:E_U].nzVal, E_factor.U.nzval)
+    return schur
+end
+
 function copy_to_gpu(x, Ti, Tv)
     error("Internal converter not set up for $(typeof(x))")
 end
diff --git a/src/ext/ext_cuda.jl b/src/ext/ext_cuda.jl
index c7f96ff8..8a94b9c6 100644
--- a/src/ext/ext_cuda.jl
+++ b/src/ext/ext_cuda.jl
@@ -51,10 +51,14 @@ function Jutul.linear_solve!(lsys::Jutul.LSystem,
     end
     J_bsr = krylov.data[:J]
     r_cu = krylov.data[:r]
+    schur = krylov.data[:schur]
 
     t_gpu_update = @elapsed begin
-        update_gpu_block_residual!(r_cu, bz, r)
-        update_gpu_block_system!(J_bsr, bz, csr_block_buffer)
+        if !is_first
+            update_gpu_block_residual!(r_cu, bz, r)
+            update_gpu_block_system!(J_bsr, bz, csr_block_buffer)
+            update_gpu_schur_system!(schur, lsys)
+        end
     end
     t_prep = t_gpu_update + t_setup + t_prep0
     max_it = krylov.config.max_iterations
@@ -117,3 +121,7 @@ end
 function update_gpu_block_residual!
 
 end
+
+function update_gpu_schur_system!
+
+end

From e3c230e62680bfb64f109167301b43b6fe849be0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Olav=20M=C3=B8yner?= <olav.moyner@gmail.com>
Date: Thu, 24 Oct 2024 15:17:17 +0200
Subject: [PATCH 06/32] Schur additions

---
 ext/JutulDarcyCUDAExt/cuda_utils.jl |  4 ++++
 src/ext/ext_cuda.jl                 | 30 +++++++++++++++++++++++++++--
 2 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/ext/JutulDarcyCUDAExt/cuda_utils.jl b/ext/JutulDarcyCUDAExt/cuda_utils.jl
index 8713879d..04d19163 100644
--- a/ext/JutulDarcyCUDAExt/cuda_utils.jl
+++ b/ext/JutulDarcyCUDAExt/cuda_utils.jl
@@ -74,6 +74,10 @@ function JutulDarcy.build_gpu_schur_system(Ti, Tv, bz, lsys::MultiLinearizedSyst
     )
 end
 
+function JutulDarcy.update_gpu_schur_system!(schur::Nothing, lsys)
+    return schur
+end
+
 function JutulDarcy.update_gpu_schur_system!(schur, lsys)
     C_cpu = lsys[1, 2].jac
     copyto!(schur[:C].nzVal, C_cpu.nzval)
diff --git a/src/ext/ext_cuda.jl b/src/ext/ext_cuda.jl
index 8a94b9c6..665e4cfd 100644
--- a/src/ext/ext_cuda.jl
+++ b/src/ext/ext_cuda.jl
@@ -70,8 +70,9 @@ function Jutul.linear_solve!(lsys::Jutul.LSystem,
     else
         preconditioner_arg = (M = prec_op, )
     end
-    solve_f, F = Jutul.krylov_jl_solve_function(krylov, J_bsr, r_cu)
-    solve_f(F, J_bsr, r_cu;
+    operator = gpu_system_linear_operator(J_bsr, schur, Tv)
+    solve_f, F = Jutul.krylov_jl_solve_function(krylov, operator, r_cu)
+    solve_f(F, operator, r_cu;
         itmax = max_it,
         preconditioner_arg...,
         verbose = 0,
@@ -106,6 +107,31 @@ function Base.show(io::IO, krylov::CUDAReservoirKrylov)
     print(io, "CUDAReservoirKrylov using $(krylov.solver) (ϵₐ=$atol, ϵ=$rtol) with prec = $(typeof(krylov.preconditioner))")
 end
 
+function gpu_system_linear_operator(J, schur::Nothing, Tv)
+    return Jutul.LinearOperators.LinearOperator(J)
+end
+
+function gpu_system_linear_operator(J, schur, Tv)
+    n, m = size(J)
+    C = schur[:C]
+    D = schur[:D]
+    E_L = schur[:E_L]
+    E_U = schur[:E_U]
+    buf_1 = schur[:buf_1]
+    buf_2 = schur[:buf_2]
+    mul!(x, y) = schur_mul_gpu!(x, y, J, C, D, buf_1, buf_2, E_L, E_U)
+    return Jutul.LinearOperators.LinearOperator(Tv, n, m, false, false, mul!)
+end
+
+function schur_mul_gpu!(x, y, J, C, D, buf_1, buf_2, E_L, E_U)
+    mul!(buf_2, D, x)
+    ldiv!(buf_1, E_L, buf_2)
+    ldiv!(buf_2, E_U, buf_1)
+    mul!(res, C, buf_2, -α, true)
+
+    error()
+end
+
 function build_gpu_block_system
 
 end

From ca0572fdced0507da0ff8b4672c44380e48faa21 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Olav=20M=C3=B8yner?= <olav.moyner@gmail.com>
Date: Thu, 24 Oct 2024 15:33:15 +0200
Subject: [PATCH 07/32] Update mul

---
 ext/JutulDarcyCUDAExt/cuda_utils.jl | 10 ++++++++++
 src/ext/ext_cuda.jl                 |  9 ++-------
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/ext/JutulDarcyCUDAExt/cuda_utils.jl b/ext/JutulDarcyCUDAExt/cuda_utils.jl
index 04d19163..18b16210 100644
--- a/ext/JutulDarcyCUDAExt/cuda_utils.jl
+++ b/ext/JutulDarcyCUDAExt/cuda_utils.jl
@@ -103,3 +103,13 @@ end
 function copy_to_gpu(x::Vector{Tvc}, Tv, Ti) where {Tvc}
     return convert(CuVector{Tv}, x)
 end
+
+function JutulDarcy.schur_mul_gpu!(x, y, α, β, J, C, D, buf_1, buf_2, E_L, E_U)
+    mul!(buf_2, D, x)
+    # ldiv!(buf_1, E_L, buf_2)
+    CUDA.CUSPARSE.sv2!('N', 'L', 'N', 1.0, E_L, buf_2, 'O')
+    # ldiv!(buf_2, E_U, buf_1)
+    CUDA.CUSPARSE.sv2!('N', 'L', 'N', 1.0, E_U, buf_2, 'O')
+    mul!(res, C, buf_2, -α, true)
+    error()
+end
diff --git a/src/ext/ext_cuda.jl b/src/ext/ext_cuda.jl
index 665e4cfd..a7763e7f 100644
--- a/src/ext/ext_cuda.jl
+++ b/src/ext/ext_cuda.jl
@@ -119,17 +119,12 @@ function gpu_system_linear_operator(J, schur, Tv)
     E_U = schur[:E_U]
     buf_1 = schur[:buf_1]
     buf_2 = schur[:buf_2]
-    mul!(x, y) = schur_mul_gpu!(x, y, J, C, D, buf_1, buf_2, E_L, E_U)
+    mul!(x, y, α, β) = schur_mul_gpu!(x, y, α, β, J, C, D, buf_1, buf_2, E_L, E_U)
     return Jutul.LinearOperators.LinearOperator(Tv, n, m, false, false, mul!)
 end
 
-function schur_mul_gpu!(x, y, J, C, D, buf_1, buf_2, E_L, E_U)
-    mul!(buf_2, D, x)
-    ldiv!(buf_1, E_L, buf_2)
-    ldiv!(buf_2, E_U, buf_1)
-    mul!(res, C, buf_2, -α, true)
+function schur_mul_gpu!
 
-    error()
 end
 
 function build_gpu_block_system

From 267a1cac2bfa5d4e2817532dfb12dd58b531d7e6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Olav=20M=C3=B8yner?= <olav.moyner@gmail.com>
Date: Thu, 24 Oct 2024 20:52:19 +0200
Subject: [PATCH 08/32] Rough draft of schur mostly on GPU

---
 ext/JutulDarcyCUDAExt/cuda_utils.jl | 42 ++++++++++++++---------------
 ext/JutulDarcyCUDAExt/ilu0.jl       |  2 +-
 src/ext/ext_cuda.jl                 | 18 ++++++++-----
 3 files changed, 33 insertions(+), 29 deletions(-)

diff --git a/ext/JutulDarcyCUDAExt/cuda_utils.jl b/ext/JutulDarcyCUDAExt/cuda_utils.jl
index 18b16210..e3409ada 100644
--- a/ext/JutulDarcyCUDAExt/cuda_utils.jl
+++ b/ext/JutulDarcyCUDAExt/cuda_utils.jl
@@ -45,18 +45,18 @@ function JutulDarcy.build_gpu_schur_system(Ti, Tv, bz, lsys::MultiLinearizedSyst
     # E_cpu = lsys[2, 2].jac # Only factorization needed I think.
 
     E_factor = only(lsys.factor.factor)
-    E_L_cpu = E_factor.L
-    E_U_cpu = E_factor.U
+    # E_L_cpu = E_factor.L
+    # E_U_cpu = E_factor.U
 
     to_gpu(x) = copy_to_gpu(x, Tv, Ti)
 
     D = to_gpu(D_cpu)
-    E_L = to_gpu(E_L_cpu)
-    E_U = to_gpu(E_U_cpu)
+    # E_L = to_gpu(E_L_cpu)
+    # E_U = to_gpu(E_U_cpu)
     C = to_gpu(C_cpu)
 
-    buf_1 = to_gpu(buf_1_cpu)
-    buf_2 = to_gpu(buf_2_cpu)
+    buf = to_gpu(buf_1_cpu)
+    # buf_2 = to_gpu(buf_2_cpu)
     # Operations and format:
     # B C
     # D E
@@ -67,10 +67,10 @@ function JutulDarcy.build_gpu_schur_system(Ti, Tv, bz, lsys::MultiLinearizedSyst
     return Dict(
         :C => C,
         :D => D,
-        :buf_1 => buf_1,
-        :buf_2 => buf_2,
-        :E_L => E_L,
-        :E_U => E_U
+        :buf => buf,
+        :buf_1_cpu => buf_1_cpu,
+        :buf_2_cpu => buf_2_cpu,
+        :E_factor => E_factor
     )
 end
 
@@ -83,9 +83,6 @@ function JutulDarcy.update_gpu_schur_system!(schur, lsys)
     copyto!(schur[:C].nzVal, C_cpu.nzval)
     D_cpu = lsys[2, 1].jac
     copyto!(schur[:D].nzVal, D_cpu.nzval)
-    E_factor = only(lsys.factor.factor)
-    copyto!(schur[:E_L].nzVal, E_factor.L.nzval)
-    copyto!(schur[:E_U].nzVal, E_factor.U.nzval)
     return schur
 end
 
@@ -104,12 +101,15 @@ function copy_to_gpu(x::Vector{Tvc}, Tv, Ti) where {Tvc}
     return convert(CuVector{Tv}, x)
 end
 
-function JutulDarcy.schur_mul_gpu!(x, y, α, β, J, C, D, buf_1, buf_2, E_L, E_U)
-    mul!(buf_2, D, x)
-    # ldiv!(buf_1, E_L, buf_2)
-    CUDA.CUSPARSE.sv2!('N', 'L', 'N', 1.0, E_L, buf_2, 'O')
-    # ldiv!(buf_2, E_U, buf_1)
-    CUDA.CUSPARSE.sv2!('N', 'L', 'N', 1.0, E_U, buf_2, 'O')
-    mul!(res, C, buf_2, -α, true)
-    error()
+function JutulDarcy.schur_mul_gpu!(y, x, α, β, J, C, D, buf::CuVector, buf1_cpu::Vector, buf2_cpu::Vector, E_factor)
+    # Working on GPU
+    mul!(y, J, x, α, β)
+    mul!(buf, D, x)
+    # Move over to CPU for this part
+    copyto!(buf1_cpu, buf)
+    ldiv!(buf2_cpu, E_factor, buf1_cpu)
+    # Back to GPU for the big array...
+    copyto!(buf, buf2_cpu)
+    mul!(y, C, buf, -α, true)
+    return y
 end
diff --git a/ext/JutulDarcyCUDAExt/ilu0.jl b/ext/JutulDarcyCUDAExt/ilu0.jl
index 08e99ad6..d7e1fd99 100644
--- a/ext/JutulDarcyCUDAExt/ilu0.jl
+++ b/ext/JutulDarcyCUDAExt/ilu0.jl
@@ -1,4 +1,4 @@
-function ilu0_invert!(y, A, Tv, c_char, data)
+function ilu0_invert!(y, A::CUDA.CUSPARSE.CuSparseMatrixBSR, Tv, c_char, data)
     if Tv == Float32
         bname = CUDA.CUSPARSE.cusparseSbsrsv2_bufferSize
         aname = CUDA.CUSPARSE.cusparseSbsrsv2_analysis
diff --git a/src/ext/ext_cuda.jl b/src/ext/ext_cuda.jl
index a7763e7f..af312023 100644
--- a/src/ext/ext_cuda.jl
+++ b/src/ext/ext_cuda.jl
@@ -48,10 +48,12 @@ function Jutul.linear_solve!(lsys::Jutul.LSystem,
     t_setup = @elapsed if is_first
         krylov.data[:J], krylov.data[:r] = build_gpu_block_system(Ti, Tv, sz, bz, J.At.colptr, J.At.rowval, csr_block_buffer, r)
         krylov.data[:schur] = build_gpu_schur_system(Ti, Tv, bz, lsys)
+        krylov.data[:dx_cpu] = zeros(n*bz)
     end
     J_bsr = krylov.data[:J]
     r_cu = krylov.data[:r]
     schur = krylov.data[:schur]
+    dx_cpu = krylov.data[:dx_cpu]
 
     t_gpu_update = @elapsed begin
         if !is_first
@@ -83,8 +85,9 @@ function Jutul.linear_solve!(lsys::Jutul.LSystem,
     )
     dx_gpu, stats = (krylov.storage.x, krylov.storage.stats)
 
-    copyto!(dx, dx_gpu)
-    @. dx = -dx
+    copyto!(dx_cpu, dx_gpu)
+    Jutul.update_dx_from_vector!(lsys, dx_cpu, dx = dx)
+    # @. dx = -dx
 
     res = stats.residuals
     solved = stats.solved
@@ -115,11 +118,12 @@ function gpu_system_linear_operator(J, schur, Tv)
     n, m = size(J)
     C = schur[:C]
     D = schur[:D]
-    E_L = schur[:E_L]
-    E_U = schur[:E_U]
-    buf_1 = schur[:buf_1]
-    buf_2 = schur[:buf_2]
-    mul!(x, y, α, β) = schur_mul_gpu!(x, y, α, β, J, C, D, buf_1, buf_2, E_L, E_U)
+    E_factor = schur[:E_factor]
+    buf = schur[:buf]
+    buf_1 = schur[:buf_1_cpu]
+    buf_2 = schur[:buf_2_cpu]
+
+    mul!(x, y, α, β) = schur_mul_gpu!(x, y, α, β, J, C, D, buf, buf_1, buf_2, E_factor)
     return Jutul.LinearOperators.LinearOperator(Tv, n, m, false, false, mul!)
 end
 

From 391b16ff505e15809140a6d1b5bc2eef436a425e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Olav=20M=C3=B8yner?= <olav.moyner@gmail.com>
Date: Thu, 24 Oct 2024 21:22:32 +0200
Subject: [PATCH 09/32] Make GPU and CPU operations independent in schur
 multiply

---
 ext/JutulDarcyCUDAExt/cuda_utils.jl | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/ext/JutulDarcyCUDAExt/cuda_utils.jl b/ext/JutulDarcyCUDAExt/cuda_utils.jl
index e3409ada..d0b1f75a 100644
--- a/ext/JutulDarcyCUDAExt/cuda_utils.jl
+++ b/ext/JutulDarcyCUDAExt/cuda_utils.jl
@@ -102,14 +102,16 @@ function copy_to_gpu(x::Vector{Tvc}, Tv, Ti) where {Tvc}
 end
 
 function JutulDarcy.schur_mul_gpu!(y, x, α, β, J, C, D, buf::CuVector, buf1_cpu::Vector, buf2_cpu::Vector, E_factor)
-    # Working on GPU
-    mul!(y, J, x, α, β)
-    mul!(buf, D, x)
-    # Move over to CPU for this part
-    copyto!(buf1_cpu, buf)
-    ldiv!(buf2_cpu, E_factor, buf1_cpu)
-    # Back to GPU for the big array...
-    copyto!(buf, buf2_cpu)
+    @sync begin 
+        # Working on GPU
+        @async mul!(y, J, x, α, β)
+        mul!(buf, D, x)
+        # Move over to CPU for this part
+        copyto!(buf1_cpu, buf)
+        ldiv!(buf2_cpu, E_factor, buf1_cpu)
+        # Back to GPU for the big array...
+        copyto!(buf, buf2_cpu)
+    end
     mul!(y, C, buf, -α, true)
     return y
 end

From a5d12e8bbfdbc4d865b9bc3c8204e43cd02b9b6e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Olav=20M=C3=B8yner?= <olav.moyner@gmail.com>
Date: Thu, 24 Oct 2024 21:49:48 +0200
Subject: [PATCH 10/32] Add timing to CUDA calls

---
 ext/JutulDarcyCUDAExt/JutulDarcyCUDAExt.jl |  4 ++++
 ext/JutulDarcyCUDAExt/cuda_utils.jl        |  2 +-
 src/ext/ext_cuda.jl                        | 17 +++++++++--------
 3 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/ext/JutulDarcyCUDAExt/JutulDarcyCUDAExt.jl b/ext/JutulDarcyCUDAExt/JutulDarcyCUDAExt.jl
index 4086db57..c401cbf8 100644
--- a/ext/JutulDarcyCUDAExt/JutulDarcyCUDAExt.jl
+++ b/ext/JutulDarcyCUDAExt/JutulDarcyCUDAExt.jl
@@ -1,5 +1,9 @@
 module JutulDarcyCUDAExt
     using Jutul, JutulDarcy, CUDA, LinearAlgebra, SparseArrays
+    import Jutul: @tic
+
+    timeit_debug_enabled() = Jutul.timeit_debug_enabled()
+
     include("ilu0.jl")
     include("krylov.jl")
     include("cuda_utils.jl")
diff --git a/ext/JutulDarcyCUDAExt/cuda_utils.jl b/ext/JutulDarcyCUDAExt/cuda_utils.jl
index d0b1f75a..1da70ab0 100644
--- a/ext/JutulDarcyCUDAExt/cuda_utils.jl
+++ b/ext/JutulDarcyCUDAExt/cuda_utils.jl
@@ -102,7 +102,7 @@ function copy_to_gpu(x::Vector{Tvc}, Tv, Ti) where {Tvc}
 end
 
 function JutulDarcy.schur_mul_gpu!(y, x, α, β, J, C, D, buf::CuVector, buf1_cpu::Vector, buf2_cpu::Vector, E_factor)
-    @sync begin 
+    @tic "schur apply" @sync begin 
         # Working on GPU
         @async mul!(y, J, x, α, β)
         mul!(buf, D, x)
diff --git a/src/ext/ext_cuda.jl b/src/ext/ext_cuda.jl
index af312023..57004655 100644
--- a/src/ext/ext_cuda.jl
+++ b/src/ext/ext_cuda.jl
@@ -31,7 +31,7 @@ function Jutul.linear_solve!(lsys::Jutul.LSystem,
 
     atol = convert(Tv, atol)
     rtol = convert(Tv, rtol)
-    t_prep0 = @elapsed Jutul.prepare_linear_solve!(lsys)
+    t_prep0 = @elapsed @tic "prepare" Jutul.prepare_linear_solve!(lsys)
 
     is_single_linear_system = lsys isa LinearizedSystem
 
@@ -45,7 +45,7 @@ function Jutul.linear_solve!(lsys::Jutul.LSystem,
     n, m = sz
     @assert n == m
     is_first = !haskey(krylov.data, :J)
-    t_setup = @elapsed if is_first
+    t_setup = @elapsed @tic "initial_gpu" if is_first
         krylov.data[:J], krylov.data[:r] = build_gpu_block_system(Ti, Tv, sz, bz, J.At.colptr, J.At.rowval, csr_block_buffer, r)
         krylov.data[:schur] = build_gpu_schur_system(Ti, Tv, bz, lsys)
         krylov.data[:dx_cpu] = zeros(n*bz)
@@ -55,7 +55,7 @@ function Jutul.linear_solve!(lsys::Jutul.LSystem,
     schur = krylov.data[:schur]
     dx_cpu = krylov.data[:dx_cpu]
 
-    t_gpu_update = @elapsed begin
+    t_gpu_update = @elapsed @tic "to_gpu" begin
         if !is_first
             update_gpu_block_residual!(r_cu, bz, r)
             update_gpu_block_system!(J_bsr, bz, csr_block_buffer)
@@ -65,7 +65,7 @@ function Jutul.linear_solve!(lsys::Jutul.LSystem,
     t_prep = t_gpu_update + t_setup + t_prep0
     max_it = krylov.config.max_iterations
 
-    Jutul.update_preconditioner!(krylov.preconditioner, J_bsr, r_cu, model.context, executor)
+    @tic "precond" Jutul.update_preconditioner!(krylov.preconditioner, J_bsr, r_cu, model.context, executor)
     prec_op = Jutul.preconditioner(krylov, lsys, model, storage, recorder, Tv)
     if cfg.precond_side == :right
         preconditioner_arg = (N = prec_op, )
@@ -74,7 +74,7 @@ function Jutul.linear_solve!(lsys::Jutul.LSystem,
     end
     operator = gpu_system_linear_operator(J_bsr, schur, Tv)
     solve_f, F = Jutul.krylov_jl_solve_function(krylov, operator, r_cu)
-    solve_f(F, operator, r_cu;
+    @tic "solve" solve_f(F, operator, r_cu;
         itmax = max_it,
         preconditioner_arg...,
         verbose = 0,
@@ -85,9 +85,10 @@ function Jutul.linear_solve!(lsys::Jutul.LSystem,
     )
     dx_gpu, stats = (krylov.storage.x, krylov.storage.stats)
 
-    copyto!(dx_cpu, dx_gpu)
-    Jutul.update_dx_from_vector!(lsys, dx_cpu, dx = dx)
-    # @. dx = -dx
+    @tic "update dx" begin
+        copyto!(dx_cpu, dx_gpu)
+        Jutul.update_dx_from_vector!(lsys, dx_cpu, dx = dx)
+    end
 
     res = stats.residuals
     solved = stats.solved

From 281e6524b07f7e4fb7ec03b286fca820d2a273fb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Olav=20M=C3=B8yner?= <olav.moyner@gmail.com>
Date: Fri, 25 Oct 2024 10:09:38 +0200
Subject: [PATCH 11/32] Update interface

---
 ext/JutulDarcyCUDAExt/ilu0.jl | 6 +++++-
 src/ext/ext_cuda.jl           | 6 +++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/ext/JutulDarcyCUDAExt/ilu0.jl b/ext/JutulDarcyCUDAExt/ilu0.jl
index d7e1fd99..7577a1a3 100644
--- a/ext/JutulDarcyCUDAExt/ilu0.jl
+++ b/ext/JutulDarcyCUDAExt/ilu0.jl
@@ -162,5 +162,9 @@ function Jutul.apply!(y::CuVector{Tv}, f::ILUZeroPreconditioner, x::CuVector{Tv}
     P = f.factor[:ilu0_gpu]
     ilu0_invert!(y, P, Tv, 'L', f.factor)
     ilu0_invert!(y, P, Tv, 'U', f.factor)
-    CUDA.synchronize()
+    # CUDA.synchronize()
+end
+
+function JutulDarcy.gpu_update_preconditioner!(prec::ILUZeroPreconditioner, sys, model, storage, recorder, executor, krylov, J_bsr, r_cu)
+    Jutul.update_preconditioner!(krylov.preconditioner, J_bsr, r_cu, model.context, executor)
 end
diff --git a/src/ext/ext_cuda.jl b/src/ext/ext_cuda.jl
index 57004655..e444d960 100644
--- a/src/ext/ext_cuda.jl
+++ b/src/ext/ext_cuda.jl
@@ -65,7 +65,7 @@ function Jutul.linear_solve!(lsys::Jutul.LSystem,
     t_prep = t_gpu_update + t_setup + t_prep0
     max_it = krylov.config.max_iterations
 
-    @tic "precond" Jutul.update_preconditioner!(krylov.preconditioner, J_bsr, r_cu, model.context, executor)
+    @tic "precond" gpu_update_preconditioner!(krylov.preconditioner, lsys, model, storage, recorder, executor, krylov, J_bsr, r_cu)
     prec_op = Jutul.preconditioner(krylov, lsys, model, storage, recorder, Tv)
     if cfg.precond_side == :right
         preconditioner_arg = (N = prec_op, )
@@ -105,6 +105,10 @@ function Jutul.linear_solve!(lsys::Jutul.LSystem,
     return Jutul.linear_solve_return(solved, n, stats, precond = t_prec_op, precond_count = t_prec_count, prepare = t_prec + t_prep)
 end
 
+function gpu_update_preconditioner!
+
+end
+
 function Base.show(io::IO, krylov::CUDAReservoirKrylov)
     rtol = linear_solver_tolerance(krylov.config, :relative)
     atol = linear_solver_tolerance(krylov.config, :absolute)

From e5e20eaec66ec6dc1625f7c88e6a6b0e8905d6d3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Olav=20M=C3=B8yner?= <olav.moyner@gmail.com>
Date: Fri, 25 Oct 2024 10:37:40 +0200
Subject: [PATCH 12/32] Draft extension to upcoming AMGX release

---
 Project.toml                               |  4 ++++
 ext/JutulDarcyAMGXExt/JutulDarcyAMGXExt.jl | 12 ++++++++++++
 src/ext/ext.jl                             |  2 +-
 src/ext/ext_amgx.jl                        | 16 ++++++++++++++++
 4 files changed, 33 insertions(+), 1 deletion(-)
 create mode 100644 ext/JutulDarcyAMGXExt/JutulDarcyAMGXExt.jl
 create mode 100644 src/ext/ext_amgx.jl

diff --git a/Project.toml b/Project.toml
index bbcf5bbc..a10fc989 100644
--- a/Project.toml
+++ b/Project.toml
@@ -34,6 +34,7 @@ TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
 Tullio = "bc48ee85-29a4-5162-ae0b-a64e1601d4bc"
 
 [weakdeps]
+AMGX = "c963dde9-0319-47f5-bf0c-b07d3c80ffa6"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
 HYPRE = "b5ffcf37-a2bd-41ab-a3da-4bd9bc8ad771"
@@ -42,6 +43,7 @@ Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a"
 PartitionedArrays = "5a9dfac6-5c52-46f7-8278-5e2210713be9"
 
 [extensions]
+JutulDarcyAMGXExt = ["AMGX", "CUDA"]
 JutulDarcyCUDAExt = "CUDA"
 JutulDarcyGLMakieExt = "GLMakie"
 JutulDarcyMakieExt = "Makie"
@@ -50,6 +52,7 @@ JutulDarcyPartitionedArraysExt = ["PartitionedArrays", "MPI", "HYPRE"]
 [compat]
 AlgebraicMultigrid = "0.5.1, 0.6.0"
 Artifacts = "1"
+AMGX = "0.2"
 CUDA = "5"
 DataStructures = "0.18.13"
 Dates = "1"
@@ -81,6 +84,7 @@ Tullio = "0.3.4"
 julia = "1.7"
 
 [extras]
+AMGX = "c963dde9-0319-47f5-bf0c-b07d3c80ffa6"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
 HYPRE = "b5ffcf37-a2bd-41ab-a3da-4bd9bc8ad771"
diff --git a/ext/JutulDarcyAMGXExt/JutulDarcyAMGXExt.jl b/ext/JutulDarcyAMGXExt/JutulDarcyAMGXExt.jl
new file mode 100644
index 00000000..b896355f
--- /dev/null
+++ b/ext/JutulDarcyAMGXExt/JutulDarcyAMGXExt.jl
@@ -0,0 +1,12 @@
+module JutulDarcyAMGXExt
+    using Jutul, JutulDarcy, CUDA, LinearAlgebra, SparseArrays, AMGX
+    import Jutul: @tic
+
+    timeit_debug_enabled() = Jutul.timeit_debug_enabled()
+
+    function JutulDarcy.gpu_update_preconditioner!(prec::CPRPreconditioner{JutulDarcy.AMGXPreconditioner, <:Any}, sys, model, storage, recorder, executor, krylov, J_bsr, r_cu)
+        error()
+        Jutul.update_preconditioner!(krylov.preconditioner, J_bsr, r_cu, model.context, executor)
+    end
+
+end
diff --git a/src/ext/ext.jl b/src/ext/ext.jl
index 0b716955..aaa978d2 100644
--- a/src/ext/ext.jl
+++ b/src/ext/ext.jl
@@ -1,4 +1,4 @@
 include("ext_makie.jl")
 include("ext_partitionedarrays.jl")
 include("ext_cuda.jl")
-
+include("ext_amgx.jl")
diff --git a/src/ext/ext_amgx.jl b/src/ext/ext_amgx.jl
new file mode 100644
index 00000000..fb6035ec
--- /dev/null
+++ b/src/ext/ext_amgx.jl
@@ -0,0 +1,16 @@
+struct AMGXPreconditioner <: Jutul.JutulPreconditioner
+    settings::Dict{Symbol, Any}
+    data::Dict{Symbol, Any}
+    function AMGXPreconditioner(settings::Dict{Symbol, Any})
+        data = Dict{Symbol, Any}()
+        new(settings, data)
+    end
+end
+
+function AMGXPreconditioner(; kwarg...)
+    settings = Dict{Symbol, Any}()
+    for (k, v) in kwarg
+        settings[k] = v
+    end
+    return AMGXPreconditioner(settings)
+end

From 5b3251b1b307051d79a083580fc5c16e2ac40d57 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Olav=20M=C3=B8yner?= <olav.moyner@gmail.com>
Date: Fri, 25 Oct 2024 12:59:43 +0200
Subject: [PATCH 13/32] Add some AMGX init code

---
 Project.toml                               |  2 +-
 ext/JutulDarcyAMGXExt/JutulDarcyAMGXExt.jl | 12 ++--
 ext/JutulDarcyAMGXExt/cpr.jl               | 71 ++++++++++++++++++++++
 src/ext/ext_amgx.jl                        | 19 ++++++
 4 files changed, 98 insertions(+), 6 deletions(-)
 create mode 100644 ext/JutulDarcyAMGXExt/cpr.jl

diff --git a/Project.toml b/Project.toml
index a10fc989..20bf60e1 100644
--- a/Project.toml
+++ b/Project.toml
@@ -43,7 +43,7 @@ Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a"
 PartitionedArrays = "5a9dfac6-5c52-46f7-8278-5e2210713be9"
 
 [extensions]
-JutulDarcyAMGXExt = ["AMGX", "CUDA"]
+JutulDarcyAMGXExt = "AMGX"
 JutulDarcyCUDAExt = "CUDA"
 JutulDarcyGLMakieExt = "GLMakie"
 JutulDarcyMakieExt = "Makie"
diff --git a/ext/JutulDarcyAMGXExt/JutulDarcyAMGXExt.jl b/ext/JutulDarcyAMGXExt/JutulDarcyAMGXExt.jl
index b896355f..b4baa274 100644
--- a/ext/JutulDarcyAMGXExt/JutulDarcyAMGXExt.jl
+++ b/ext/JutulDarcyAMGXExt/JutulDarcyAMGXExt.jl
@@ -1,12 +1,14 @@
 module JutulDarcyAMGXExt
-    using Jutul, JutulDarcy, CUDA, LinearAlgebra, SparseArrays, AMGX
+    using JutulDarcy, Jutul, AMGX, AMGX.CUDA, LinearAlgebra, SparseArrays
+    import JutulDarcy: AMGXPreconditioner
     import Jutul: @tic
 
     timeit_debug_enabled() = Jutul.timeit_debug_enabled()
 
-    function JutulDarcy.gpu_update_preconditioner!(prec::CPRPreconditioner{JutulDarcy.AMGXPreconditioner, <:Any}, sys, model, storage, recorder, executor, krylov, J_bsr, r_cu)
-        error()
-        Jutul.update_preconditioner!(krylov.preconditioner, J_bsr, r_cu, model.context, executor)
-    end
+    include("cpr.jl")
 
+    function __init__()
+        # TODO: Figure out a way to not always do this?
+        AMGX.initialize()
+    end
 end
diff --git a/ext/JutulDarcyAMGXExt/cpr.jl b/ext/JutulDarcyAMGXExt/cpr.jl
new file mode 100644
index 00000000..c0d97a50
--- /dev/null
+++ b/ext/JutulDarcyAMGXExt/cpr.jl
@@ -0,0 +1,71 @@
+function JutulDarcy.update_amgx_pressure_system!(amgx::AMGXPreconditioner, A_p::Jutul.StaticCSR.StaticSparsityMatrixCSR)
+    data = amgx.data
+    is_first = !haskey(data, :A_p)
+    if is_first
+        initialize_amgx_structure!(amgx, A_p)
+    end
+    error()
+end
+
+mutable struct AMGXStorage{C, R, V, M, S}
+    config::C
+    resources::R
+    r::V
+    x::V
+    matrix::M
+    solver::S
+    function AMGXStorage(config, amgx_mode = AMGX.dDDI)
+        resources = AMGX.Resources(config)
+        r = AMGX.AMGXVector(resources, amgx_mode)
+        x = AMGX.AMGXVector(resources, amgx_mode)
+        matrix = AMGX.AMGXMatrix(resources, amgx_mode)
+        solver = AMGX.Solver(resources, amgx_mode, config)
+        function finalize_storage!(amgx_s::AMGXStorage)
+            close(amgx_s.solver)
+            close(amgx_s.matrix)
+            close(amgx_s.r)
+            close(amgx_s.x)
+            close(amgx_s.resources)
+            close(amgx_s.config)
+        end
+        s = new{
+            typeof(config),
+            typeof(resources),
+            typeof(r),
+            typeof(matrix),
+            typeof(solver)
+        }(config, resources, r, x, matrix, solver)
+        return finalizer(finalize_storage!, s)
+    end
+end
+
+function initialize_amgx_structure!(amgx::AMGXPreconditioner, A::Jutul.StaticCSR.StaticSparsityMatrixCSR{Tv, <:Any}) where Tv
+    if Tv == Float64
+        amgx_mode = AMGX.dDDI
+    else
+        amgx_mode = AMGX.dFFI
+    end
+    # TODO: Tv isn't really coming from the matrix, it should be set beforehand in the krylov solver.
+    n, m = size(A)
+    @assert n == m
+    config = AMGX.Config(amgx.settings)
+    s = AMGXStorage(config, amgx_mode)
+    # RHS and solution vectors to right size just in case
+    AMGX.set_zero!(s.x, n)
+    AMGX.set_zero!(s.r, n)
+
+    row_ptr = Cint.(A.At.colptr .- 1)
+    colval = Cint.(A.At.rowval .- 1)
+    nzval = A.At.nzval
+    nzval = Tv.(nzval)
+
+    AMGX.upload!(s.matrix, 
+        row_ptr,
+        colval,
+        nzval
+    )
+
+    amgx.data[:storage] = s
+    return amgx
+end
+
diff --git a/src/ext/ext_amgx.jl b/src/ext/ext_amgx.jl
index fb6035ec..d1fae233 100644
--- a/src/ext/ext_amgx.jl
+++ b/src/ext/ext_amgx.jl
@@ -14,3 +14,22 @@ function AMGXPreconditioner(; kwarg...)
     end
     return AMGXPreconditioner(settings)
 end
+
+function update_preconditioner!(amg::AMGXPreconditioner, A::Jutul.StaticSparsityMatrixCSR, b, context::ParallelCSRContext, executor)
+    # Intentionally do nothing - a bit hackish
+end
+
+function JutulDarcy.gpu_update_preconditioner!(cpr::CPRPreconditioner{JutulDarcy.AMGXPreconditioner, <:Any}, lsys, model, storage, recorder, executor, krylov, J_bsr, r_cu)
+    Jutul.update_preconditioner!(cpr, lsys, model, storage, recorder, executor)
+    
+    JutulDarcy.update_amgx_pressure_system!(cpr.pressure_precond, cpr.storage.A_p)
+    # Next transfer the coarse system to GPU
+    # Update if needed
+    # Then do the whole apply here
+    error()
+    # Jutul.update_preconditioner!(krylov.preconditioner, J_bsr, r_cu, model.context, executor)
+end
+
+function update_amgx_pressure_system!
+
+end
\ No newline at end of file

From d914ceb609807fc8c3d143227ee961257e5423e3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Olav=20M=C3=B8yner?= <olav.moyner@gmail.com>
Date: Fri, 25 Oct 2024 13:22:34 +0200
Subject: [PATCH 14/32] More CPR testing

---
 ext/JutulDarcyAMGXExt/cpr.jl | 63 +++++++++++++++++++-----------------
 src/cpr.jl                   |  6 ++--
 src/ext/ext_amgx.jl          | 28 ++++++++++------
 3 files changed, 57 insertions(+), 40 deletions(-)

diff --git a/ext/JutulDarcyAMGXExt/cpr.jl b/ext/JutulDarcyAMGXExt/cpr.jl
index c0d97a50..078a95ce 100644
--- a/ext/JutulDarcyAMGXExt/cpr.jl
+++ b/ext/JutulDarcyAMGXExt/cpr.jl
@@ -1,10 +1,8 @@
 function JutulDarcy.update_amgx_pressure_system!(amgx::AMGXPreconditioner, A_p::Jutul.StaticCSR.StaticSparsityMatrixCSR)
     data = amgx.data
-    is_first = !haskey(data, :A_p)
-    if is_first
-        initialize_amgx_structure!(amgx, A_p)
-    end
-    error()
+    initialize_amgx_structure!(amgx, A_p)
+    update_pressure_system!(amgx, A_p)
+    return amgx
 end
 
 mutable struct AMGXStorage{C, R, V, M, S}
@@ -39,33 +37,40 @@ mutable struct AMGXStorage{C, R, V, M, S}
     end
 end
 
-function initialize_amgx_structure!(amgx::AMGXPreconditioner, A::Jutul.StaticCSR.StaticSparsityMatrixCSR{Tv, <:Any}) where Tv
-    if Tv == Float64
-        amgx_mode = AMGX.dDDI
-    else
-        amgx_mode = AMGX.dFFI
-    end
-    # TODO: Tv isn't really coming from the matrix, it should be set beforehand in the krylov solver.
-    n, m = size(A)
-    @assert n == m
-    config = AMGX.Config(amgx.settings)
-    s = AMGXStorage(config, amgx_mode)
-    # RHS and solution vectors to right size just in case
-    AMGX.set_zero!(s.x, n)
-    AMGX.set_zero!(s.r, n)
+function update_pressure_system!(amgx, A_cpu)
+    A_gpu = amgx.data[:storage].matrix
+    AMGX.replace_coefficients!(A_gpu, A_cpu.At.nzval)
+    return amgx
+end
 
-    row_ptr = Cint.(A.At.colptr .- 1)
-    colval = Cint.(A.At.rowval .- 1)
-    nzval = A.At.nzval
-    nzval = Tv.(nzval)
+function initialize_amgx_structure!(amgx::AMGXPreconditioner, A::Jutul.StaticCSR.StaticSparsityMatrixCSR{Tv, <:Any}) where Tv
+    if !haskey(amgx.data, :storage)
+        if Tv == Float64
+            amgx_mode = AMGX.dDDI
+        else
+            amgx_mode = AMGX.dFFI
+        end
+        # TODO: Tv isn't really coming from the matrix, it should be set beforehand in the krylov solver.
+        n, m = size(A)
+        @assert n == m
+        config = AMGX.Config(amgx.settings)
+        s = AMGXStorage(config, amgx_mode)
+        # RHS and solution vectors to right size just in case
+        AMGX.set_zero!(s.x, n)
+        AMGX.set_zero!(s.r, n)
 
-    AMGX.upload!(s.matrix, 
-        row_ptr,
-        colval,
-        nzval
-    )
+        row_ptr = Cint.(A.At.colptr .- 1)
+        colval = Cint.(A.At.rowval .- 1)
+        nzval = A.At.nzval
+        nzval = Tv.(nzval)
 
-    amgx.data[:storage] = s
+        AMGX.upload!(s.matrix, 
+            row_ptr,
+            colval,
+            nzval
+        )
+        amgx.data[:storage] = s
+    end
     return amgx
 end
 
diff --git a/src/cpr.jl b/src/cpr.jl
index a6b91fea..23233348 100644
--- a/src/cpr.jl
+++ b/src/cpr.jl
@@ -168,11 +168,13 @@ function default_psolve(; max_levels = 10, max_coarse = 10, amgcl_type = :amg, t
     end
 end
 
-function update_preconditioner!(cpr::CPRPreconditioner, lsys, model, storage, recorder, executor)
+function update_preconditioner!(cpr::CPRPreconditioner, lsys, model, storage, recorder, executor; update_system_precond = true)
     rmodel = reservoir_model(model, type = :flow)
     ctx = rmodel.context
     update_p = update_cpr_internals!(cpr, lsys, model, storage, recorder, executor)
-    @tic "s-precond" update_preconditioner!(cpr.system_precond, lsys, model, storage, recorder, executor)
+    if update_system_precond
+        @tic "s-precond" update_preconditioner!(cpr.system_precond, lsys, model, storage, recorder, executor)
+    end
     if update_p
         @tic "p-precond" update_preconditioner!(cpr.pressure_precond, cpr.storage.A_p, cpr.storage.r_p, ctx, executor)
     elseif should_update_cpr(cpr, recorder, :partial)
diff --git a/src/ext/ext_amgx.jl b/src/ext/ext_amgx.jl
index d1fae233..45a48100 100644
--- a/src/ext/ext_amgx.jl
+++ b/src/ext/ext_amgx.jl
@@ -15,21 +15,31 @@ function AMGXPreconditioner(; kwarg...)
     return AMGXPreconditioner(settings)
 end
 
+const AMGXCPR = CPRPreconditioner{JutulDarcy.AMGXPreconditioner, <:Any}
+
 function update_preconditioner!(amg::AMGXPreconditioner, A::Jutul.StaticSparsityMatrixCSR, b, context::ParallelCSRContext, executor)
     # Intentionally do nothing - a bit hackish
 end
 
-function JutulDarcy.gpu_update_preconditioner!(cpr::CPRPreconditioner{JutulDarcy.AMGXPreconditioner, <:Any}, lsys, model, storage, recorder, executor, krylov, J_bsr, r_cu)
-    Jutul.update_preconditioner!(cpr, lsys, model, storage, recorder, executor)
-    
+function JutulDarcy.gpu_update_preconditioner!(cpr::AMGXCPR, lsys, model, storage, recorder, executor, krylov, J_bsr, r_cu)
+    Jutul.update_preconditioner!(cpr, lsys, model, storage, recorder, executor, update_system_precond = false)
+    # Transfer pressure system to GPU
     JutulDarcy.update_amgx_pressure_system!(cpr.pressure_precond, cpr.storage.A_p)
-    # Next transfer the coarse system to GPU
-    # Update if needed
-    # Then do the whole apply here
-    error()
-    # Jutul.update_preconditioner!(krylov.preconditioner, J_bsr, r_cu, model.context, executor)
 end
 
 function update_amgx_pressure_system!
 
-end
\ No newline at end of file
+end
+
+function Jutul.apply!(x, cpr::AMGXCPR, r)
+    # Apply smoother
+    cpr_s = cpr.storage
+    bz = cpr_s.block_size
+
+    Jutul.apply!(x, cpr.system_precond, r)
+    # Correct the residual
+    # Construct pressure residual
+    # Apply pressure preconditioner
+    # Update increments for pressure
+    error()
+end

From 253ff5c04310d9decac109dc0c14d296174d071d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Olav=20M=C3=B8yner?= <olav.moyner@gmail.com>
Date: Fri, 25 Oct 2024 19:46:40 +0200
Subject: [PATCH 15/32] More work on GPU CPR

---
 ext/JutulDarcyAMGXExt/cpr.jl               | 31 +++++++++++++++++++---
 ext/JutulDarcyCUDAExt/JutulDarcyCUDAExt.jl |  1 +
 ext/JutulDarcyCUDAExt/cpr.jl               | 21 +++++++++++++++
 ext/JutulDarcyCUDAExt/ilu0.jl              |  4 +--
 src/ext/ext_amgx.jl                        | 23 +++++++++++++---
 src/ext/ext_cuda.jl                        |  9 +++++--
 6 files changed, 77 insertions(+), 12 deletions(-)
 create mode 100644 ext/JutulDarcyCUDAExt/cpr.jl

diff --git a/ext/JutulDarcyAMGXExt/cpr.jl b/ext/JutulDarcyAMGXExt/cpr.jl
index 078a95ce..03f16bf4 100644
--- a/ext/JutulDarcyAMGXExt/cpr.jl
+++ b/ext/JutulDarcyAMGXExt/cpr.jl
@@ -1,6 +1,6 @@
-function JutulDarcy.update_amgx_pressure_system!(amgx::AMGXPreconditioner, A_p::Jutul.StaticCSR.StaticSparsityMatrixCSR)
+function JutulDarcy.update_amgx_pressure_system!(amgx::AMGXPreconditioner, A_p::Jutul.StaticCSR.StaticSparsityMatrixCSR, Tv)
     data = amgx.data
-    initialize_amgx_structure!(amgx, A_p)
+    initialize_amgx_structure!(amgx, A_p, Tv)
     update_pressure_system!(amgx, A_p)
     return amgx
 end
@@ -43,14 +43,13 @@ function update_pressure_system!(amgx, A_cpu)
     return amgx
 end
 
-function initialize_amgx_structure!(amgx::AMGXPreconditioner, A::Jutul.StaticCSR.StaticSparsityMatrixCSR{Tv, <:Any}) where Tv
+function initialize_amgx_structure!(amgx::AMGXPreconditioner, A::Jutul.StaticCSR.StaticSparsityMatrixCSR, Tv)
     if !haskey(amgx.data, :storage)
         if Tv == Float64
             amgx_mode = AMGX.dDDI
         else
             amgx_mode = AMGX.dFFI
         end
-        # TODO: Tv isn't really coming from the matrix, it should be set beforehand in the krylov solver.
         n, m = size(A)
         @assert n == m
         config = AMGX.Config(amgx.settings)
@@ -70,7 +69,31 @@ function initialize_amgx_structure!(amgx::AMGXPreconditioner, A::Jutul.StaticCSR
             nzval
         )
         amgx.data[:storage] = s
+        amgx.data[:buffer_p] = AMGX.CUDA.CuVector{Tv}(undef, n)
     end
     return amgx
 end
 
+function JutulDarcy.gpu_cpr_setup_buffers!(cpr, J_bsr, r_cu, op)
+    # TODO: Rename this.
+    data = cpr.pressure_precond.data
+    if !haskey(data, :w_p)
+        Tv = eltype(r_cu)
+        n = length(r_cu)
+        cpr_s = cpr.storage
+        bz = cpr_s.block_size
+
+        # cpr.pressure_precond.data[:buffer_full] = AMGX.CUDA.CuVector{Tv}(undef, n)
+        bz_w, n_w = size(cpr_s.w_p)
+        @assert n == bz*n_w
+        data[:w_p] = AMGX.CUDA.CuMatrix{Tv}(undef, bz_w, n_w)
+        data[:main_system] = J_bsr
+        data[:operator] = op
+    end
+    # Put updated weights on GPU
+    w_p_cpu = cpr_s.w_p
+    w_p_gpu = data[:w_p]
+    @assert size(w_p_cpu) == size(w_p_gpu)
+    copyto!(w_p_gpu, w_p_cpu)
+    return cpr
+end
diff --git a/ext/JutulDarcyCUDAExt/JutulDarcyCUDAExt.jl b/ext/JutulDarcyCUDAExt/JutulDarcyCUDAExt.jl
index c401cbf8..5c467057 100644
--- a/ext/JutulDarcyCUDAExt/JutulDarcyCUDAExt.jl
+++ b/ext/JutulDarcyCUDAExt/JutulDarcyCUDAExt.jl
@@ -7,4 +7,5 @@ module JutulDarcyCUDAExt
     include("ilu0.jl")
     include("krylov.jl")
     include("cuda_utils.jl")
+    include("cpr.jl")
 end
diff --git a/ext/JutulDarcyCUDAExt/cpr.jl b/ext/JutulDarcyCUDAExt/cpr.jl
new file mode 100644
index 00000000..73e6a871
--- /dev/null
+++ b/ext/JutulDarcyCUDAExt/cpr.jl
@@ -0,0 +1,21 @@
+function JutulDarcy.gpu_reduce_residual!(r_p, w_p, r)
+    # TODO: Actually use CUDA threads here.
+    @assert eltype(r_p) == eltype(w_p) == eltype(r)
+    CUDA.@cuda reduce_residual_kernel!(r_p, w_p, r)
+    return r_p
+end
+
+function reduce_residual_kernel!(r_p, w_p, r_full)
+    index = threadIdx().x    # this example only requires linear indexing, so just use `x`
+    stride = blockDim().x
+    T = eltype(r_p)
+    ncomp = size(w_p, 1)
+    for cell in index:stride:length(r_p)
+        v = zero(T)
+        for comp in axes(w_p, 1)
+            v += w_p[comp, cell] * r_full[(cell-1)*ncomp + comp]
+        end
+        r_p[cell] = v
+    end
+    return nothing
+end
diff --git a/ext/JutulDarcyCUDAExt/ilu0.jl b/ext/JutulDarcyCUDAExt/ilu0.jl
index 7577a1a3..8553ccd5 100644
--- a/ext/JutulDarcyCUDAExt/ilu0.jl
+++ b/ext/JutulDarcyCUDAExt/ilu0.jl
@@ -165,6 +165,6 @@ function Jutul.apply!(y::CuVector{Tv}, f::ILUZeroPreconditioner, x::CuVector{Tv}
     # CUDA.synchronize()
 end
 
-function JutulDarcy.gpu_update_preconditioner!(prec::ILUZeroPreconditioner, sys, model, storage, recorder, executor, krylov, J_bsr, r_cu)
-    Jutul.update_preconditioner!(krylov.preconditioner, J_bsr, r_cu, model.context, executor)
+function JutulDarcy.gpu_update_preconditioner!(prec::ILUZeroPreconditioner, sys, model, storage, recorder, executor, krylov, J_bsr, r_cu, op)
+    Jutul.update_preconditioner!(prec, J_bsr, r_cu, model.context, executor)
 end
diff --git a/src/ext/ext_amgx.jl b/src/ext/ext_amgx.jl
index 45a48100..f8d137de 100644
--- a/src/ext/ext_amgx.jl
+++ b/src/ext/ext_amgx.jl
@@ -21,24 +21,39 @@ function update_preconditioner!(amg::AMGXPreconditioner, A::Jutul.StaticSparsity
     # Intentionally do nothing - a bit hackish
 end
 
-function JutulDarcy.gpu_update_preconditioner!(cpr::AMGXCPR, lsys, model, storage, recorder, executor, krylov, J_bsr, r_cu)
+function JutulDarcy.gpu_update_preconditioner!(cpr::AMGXCPR, lsys, model, storage, recorder, executor, krylov, J_bsr, r_cu, op)
     Jutul.update_preconditioner!(cpr, lsys, model, storage, recorder, executor, update_system_precond = false)
     # Transfer pressure system to GPU
-    JutulDarcy.update_amgx_pressure_system!(cpr.pressure_precond, cpr.storage.A_p)
+    JutulDarcy.gpu_update_preconditioner!(cpr.system_precond, lsys, model, storage, recorder, executor, krylov, J_bsr, r_cu, op)
+    JutulDarcy.update_amgx_pressure_system!(cpr.pressure_precond, cpr.storage.A_p, eltype(J_bsr))
+    # How to get the linear operator in here?
+    gpu_cpr_setup_buffers!(cpr, J_bsr, r_cu, op)
 end
 
 function update_amgx_pressure_system!
 
 end
 
+function gpu_cpr_setup_buffers!
+
+end
+
 function Jutul.apply!(x, cpr::AMGXCPR, r)
     # Apply smoother
-    cpr_s = cpr.storage
-    bz = cpr_s.block_size
 
     Jutul.apply!(x, cpr.system_precond, r)
     # Correct the residual
+    A_ps = cpr.pressure_precond.data[:operator]
+    # buf = cpr.pressure_precond.data[:buffer_full]
+    # buf_p = cpr.pressure_precond.data[:buffer_p]
+    JutulDarcy.correct_residual!(r, A_ps, x)
     # Construct pressure residual
+    r_p = cpr.pressure_precond.data[:buffer_p]
+    w_p = cpr.pressure_precond.data[:w_p]
+    ncomp, ncell = size(w_p)
+    gpu_reduce_residual!(r_p, w_p, r)
+    # @info "??" typeof(r_p) typeof(w_p) typeof(r)
+    # @tullio r_p[cell] = w_p[comp, cell] * r_tmp[comp, cell]
     # Apply pressure preconditioner
     # Update increments for pressure
     error()
diff --git a/src/ext/ext_cuda.jl b/src/ext/ext_cuda.jl
index e444d960..33e123ee 100644
--- a/src/ext/ext_cuda.jl
+++ b/src/ext/ext_cuda.jl
@@ -65,14 +65,15 @@ function Jutul.linear_solve!(lsys::Jutul.LSystem,
     t_prep = t_gpu_update + t_setup + t_prep0
     max_it = krylov.config.max_iterations
 
-    @tic "precond" gpu_update_preconditioner!(krylov.preconditioner, lsys, model, storage, recorder, executor, krylov, J_bsr, r_cu)
+    operator = gpu_system_linear_operator(J_bsr, schur, Tv)
+
+    @tic "precond" gpu_update_preconditioner!(krylov.preconditioner, lsys, model, storage, recorder, executor, krylov, J_bsr, r_cu, operator)
     prec_op = Jutul.preconditioner(krylov, lsys, model, storage, recorder, Tv)
     if cfg.precond_side == :right
         preconditioner_arg = (N = prec_op, )
     else
         preconditioner_arg = (M = prec_op, )
     end
-    operator = gpu_system_linear_operator(J_bsr, schur, Tv)
     solve_f, F = Jutul.krylov_jl_solve_function(krylov, operator, r_cu)
     @tic "solve" solve_f(F, operator, r_cu;
         itmax = max_it,
@@ -155,3 +156,7 @@ end
 function update_gpu_schur_system!
 
 end
+
+function gpu_reduce_residual!
+
+end

From 7ee843ded74ecebf0adecf6d76a5671c02552668 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Olav=20M=C3=B8yner?= <olav.moyner@gmail.com>
Date: Fri, 25 Oct 2024 20:05:03 +0200
Subject: [PATCH 16/32] CPR apply seems to run once...

---
 ext/JutulDarcyAMGXExt/cpr.jl | 15 ++++++++++++++-
 ext/JutulDarcyCUDAExt/cpr.jl | 19 ++++++++++++++++++-
 src/ext/ext_amgx.jl          | 12 +++++++-----
 src/ext/ext_cuda.jl          |  4 ++++
 4 files changed, 43 insertions(+), 7 deletions(-)

diff --git a/ext/JutulDarcyAMGXExt/cpr.jl b/ext/JutulDarcyAMGXExt/cpr.jl
index 03f16bf4..5a33c96a 100644
--- a/ext/JutulDarcyAMGXExt/cpr.jl
+++ b/ext/JutulDarcyAMGXExt/cpr.jl
@@ -2,6 +2,8 @@ function JutulDarcy.update_amgx_pressure_system!(amgx::AMGXPreconditioner, A_p::
     data = amgx.data
     initialize_amgx_structure!(amgx, A_p, Tv)
     update_pressure_system!(amgx, A_p)
+    s = amgx.data[:storage]
+    AMGX.setup!(s.solver, s.matrix)
     return amgx
 end
 
@@ -38,11 +40,22 @@ mutable struct AMGXStorage{C, R, V, M, S}
 end
 
 function update_pressure_system!(amgx, A_cpu)
-    A_gpu = amgx.data[:storage].matrix
+    s = amgx.data[:storage]
+    A_gpu = s.matrix
     AMGX.replace_coefficients!(A_gpu, A_cpu.At.nzval)
     return amgx
 end
 
+function JutulDarcy.gpu_amgx_solve!(amgx::AMGXPreconditioner, r_p)
+    s = amgx.data[:storage]
+    n = length(r_p)
+    AMGX.upload!(s.r, r_p)
+    AMGX.set_zero!(s.x, n)
+    AMGX.solve!(s.x, s.solver, s.r)
+    AMGX.download!(r_p, s.x)
+    return r_p
+end
+
 function initialize_amgx_structure!(amgx::AMGXPreconditioner, A::Jutul.StaticCSR.StaticSparsityMatrixCSR, Tv)
     if !haskey(amgx.data, :storage)
         if Tv == Float64
diff --git a/ext/JutulDarcyCUDAExt/cpr.jl b/ext/JutulDarcyCUDAExt/cpr.jl
index 73e6a871..254e30e0 100644
--- a/ext/JutulDarcyCUDAExt/cpr.jl
+++ b/ext/JutulDarcyCUDAExt/cpr.jl
@@ -6,7 +6,7 @@ function JutulDarcy.gpu_reduce_residual!(r_p, w_p, r)
 end
 
 function reduce_residual_kernel!(r_p, w_p, r_full)
-    index = threadIdx().x    # this example only requires linear indexing, so just use `x`
+    index = threadIdx().x
     stride = blockDim().x
     T = eltype(r_p)
     ncomp = size(w_p, 1)
@@ -19,3 +19,20 @@ function reduce_residual_kernel!(r_p, w_p, r_full)
     end
     return nothing
 end
+
+function JutulDarcy.gpu_increment_pressure!(x, dp)
+    n = length(x)
+    bz = div(n, length(dp))
+    @assert bz > 0
+    CUDA.@cuda reduce_residual_kernel!(x, dp, bz)
+    return x
+end
+
+function gpu_increment_pressure_kernel!(x, dp, bz)
+    index = threadIdx().x
+    stride = blockDim().x
+    for cell in index:stride:length(dp)
+        x[(cell-1)*bz + 1] = dp[cell]
+    end
+    return nothing
+end
diff --git a/src/ext/ext_amgx.jl b/src/ext/ext_amgx.jl
index f8d137de..e5588df7 100644
--- a/src/ext/ext_amgx.jl
+++ b/src/ext/ext_amgx.jl
@@ -38,23 +38,25 @@ function gpu_cpr_setup_buffers!
 
 end
 
+function gpu_amgx_solve!
+
+end
+
 function Jutul.apply!(x, cpr::AMGXCPR, r)
     # Apply smoother
-
     Jutul.apply!(x, cpr.system_precond, r)
     # Correct the residual
     A_ps = cpr.pressure_precond.data[:operator]
     # buf = cpr.pressure_precond.data[:buffer_full]
-    # buf_p = cpr.pressure_precond.data[:buffer_p]
     JutulDarcy.correct_residual!(r, A_ps, x)
     # Construct pressure residual
     r_p = cpr.pressure_precond.data[:buffer_p]
     w_p = cpr.pressure_precond.data[:w_p]
     ncomp, ncell = size(w_p)
     gpu_reduce_residual!(r_p, w_p, r)
-    # @info "??" typeof(r_p) typeof(w_p) typeof(r)
-    # @tullio r_p[cell] = w_p[comp, cell] * r_tmp[comp, cell]
     # Apply pressure preconditioner
+    amgx = cpr.pressure_precond
+    dp = gpu_amgx_solve!(amgx, r_p)
     # Update increments for pressure
-    error()
+    gpu_increment_pressure!(x, dp)
 end
diff --git a/src/ext/ext_cuda.jl b/src/ext/ext_cuda.jl
index 33e123ee..3c8289f1 100644
--- a/src/ext/ext_cuda.jl
+++ b/src/ext/ext_cuda.jl
@@ -160,3 +160,7 @@ end
 function gpu_reduce_residual!
 
 end
+
+function gpu_increment_pressure!
+
+end

From 03a56551cc712c429e59240a75c0f75bca62d85a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Olav=20M=C3=B8yner?= <olav.moyner@gmail.com>
Date: Fri, 25 Oct 2024 20:48:00 +0200
Subject: [PATCH 17/32] Fixes to kernel launches

---
 ext/JutulDarcyAMGXExt/cpr.jl |  2 +-
 ext/JutulDarcyCUDAExt/cpr.jl | 14 ++++++++++----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/ext/JutulDarcyAMGXExt/cpr.jl b/ext/JutulDarcyAMGXExt/cpr.jl
index 5a33c96a..4dab6e10 100644
--- a/ext/JutulDarcyAMGXExt/cpr.jl
+++ b/ext/JutulDarcyAMGXExt/cpr.jl
@@ -90,10 +90,10 @@ end
 function JutulDarcy.gpu_cpr_setup_buffers!(cpr, J_bsr, r_cu, op)
     # TODO: Rename this.
     data = cpr.pressure_precond.data
+    cpr_s = cpr.storage
     if !haskey(data, :w_p)
         Tv = eltype(r_cu)
         n = length(r_cu)
-        cpr_s = cpr.storage
         bz = cpr_s.block_size
 
         # cpr.pressure_precond.data[:buffer_full] = AMGX.CUDA.CuVector{Tv}(undef, n)
diff --git a/ext/JutulDarcyCUDAExt/cpr.jl b/ext/JutulDarcyCUDAExt/cpr.jl
index 254e30e0..065a46e2 100644
--- a/ext/JutulDarcyCUDAExt/cpr.jl
+++ b/ext/JutulDarcyCUDAExt/cpr.jl
@@ -1,19 +1,24 @@
 function JutulDarcy.gpu_reduce_residual!(r_p, w_p, r)
     # TODO: Actually use CUDA threads here.
     @assert eltype(r_p) == eltype(w_p) == eltype(r)
+    @assert length(w_p) == length(r)
+    @assert size(w_p, 2) == length(r_p)
     CUDA.@cuda reduce_residual_kernel!(r_p, w_p, r)
     return r_p
 end
 
-function reduce_residual_kernel!(r_p, w_p, r_full)
+function reduce_residual_kernel!(r_p, w_p, r_ps)
     index = threadIdx().x
     stride = blockDim().x
     T = eltype(r_p)
     ncomp = size(w_p, 1)
     for cell in index:stride:length(r_p)
         v = zero(T)
-        for comp in axes(w_p, 1)
-            v += w_p[comp, cell] * r_full[(cell-1)*ncomp + comp]
+        for comp in 1:ncomp
+            ix = (cell-1)*ncomp + comp
+            wi = w_p[comp, cell]
+            ri = r_ps[ix]
+            v += wi*ri
         end
         r_p[cell] = v
     end
@@ -24,7 +29,8 @@ function JutulDarcy.gpu_increment_pressure!(x, dp)
     n = length(x)
     bz = div(n, length(dp))
     @assert bz > 0
-    CUDA.@cuda reduce_residual_kernel!(x, dp, bz)
+    CUDA.@cuda gpu_increment_pressure_kernel!(x, dp, bz)
+    CUDA.synchronize()
     return x
 end
 

From eba4fb63bc417599fac87be4f67a0c40d12343cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Olav=20M=C3=B8yner?= <olav.moyner@gmail.com>
Date: Fri, 25 Oct 2024 21:01:42 +0200
Subject: [PATCH 18/32] Sort-of-working GPU CPR with wells

---
 ext/JutulDarcyCUDAExt/cpr.jl |  2 +-
 src/ext/ext_amgx.jl          | 10 +++++++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/ext/JutulDarcyCUDAExt/cpr.jl b/ext/JutulDarcyCUDAExt/cpr.jl
index 065a46e2..b7dfbe18 100644
--- a/ext/JutulDarcyCUDAExt/cpr.jl
+++ b/ext/JutulDarcyCUDAExt/cpr.jl
@@ -38,7 +38,7 @@ function gpu_increment_pressure_kernel!(x, dp, bz)
     index = threadIdx().x
     stride = blockDim().x
     for cell in index:stride:length(dp)
-        x[(cell-1)*bz + 1] = dp[cell]
+        x[(cell-1)*bz + 1] += dp[cell]
     end
     return nothing
 end
diff --git a/src/ext/ext_amgx.jl b/src/ext/ext_amgx.jl
index e5588df7..acda3bbc 100644
--- a/src/ext/ext_amgx.jl
+++ b/src/ext/ext_amgx.jl
@@ -47,7 +47,15 @@ function Jutul.apply!(x, cpr::AMGXCPR, r)
     Jutul.apply!(x, cpr.system_precond, r)
     # Correct the residual
     A_ps = cpr.pressure_precond.data[:operator]
-    # buf = cpr.pressure_precond.data[:buffer_full]
+    if false
+        if !haskey(cpr.pressure_precond.data, :buffer_full)
+            cpr.pressure_precond.data[:buffer_full] = copy(r)
+        end
+        buf = cpr.pressure_precond.data[:buffer_full]
+        copyto!(buf, r)
+        r = buf
+    end
+    # r = copy(r)
     JutulDarcy.correct_residual!(r, A_ps, x)
     # Construct pressure residual
     r_p = cpr.pressure_precond.data[:buffer_p]

From d0cffe61fb07a96f8e8c415570479ea5c5098f5e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Olav=20M=C3=B8yner?= <olav.moyner@gmail.com>
Date: Fri, 25 Oct 2024 21:06:47 +0200
Subject: [PATCH 19/32] Update format for AMGX config

---
 src/ext/ext_amgx.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/ext/ext_amgx.jl b/src/ext/ext_amgx.jl
index acda3bbc..ea7ba92d 100644
--- a/src/ext/ext_amgx.jl
+++ b/src/ext/ext_amgx.jl
@@ -1,16 +1,16 @@
 struct AMGXPreconditioner <: Jutul.JutulPreconditioner
-    settings::Dict{Symbol, Any}
+    settings::Dict{String, Any}
     data::Dict{Symbol, Any}
-    function AMGXPreconditioner(settings::Dict{Symbol, Any})
+    function AMGXPreconditioner(settings::Dict{String, Any})
         data = Dict{Symbol, Any}()
         new(settings, data)
     end
 end
 
 function AMGXPreconditioner(; kwarg...)
-    settings = Dict{Symbol, Any}()
+    settings = Dict{String, Any}()
     for (k, v) in kwarg
-        settings[k] = v
+        settings["$k"] = v
     end
     return AMGXPreconditioner(settings)
 end

From 38332514269d7a677086c1f6f575ce0ff97d9cd1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Olav=20M=C3=B8yner?= <olav.moyner@gmail.com>
Date: Sun, 27 Oct 2024 18:44:55 +0100
Subject: [PATCH 20/32] Update copy of buffer in CUDA

---
 ext/JutulDarcyAMGXExt/cpr.jl |  3 +--
 src/ext/ext_amgx.jl          | 12 ++----------
 2 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/ext/JutulDarcyAMGXExt/cpr.jl b/ext/JutulDarcyAMGXExt/cpr.jl
index 4dab6e10..2fc46496 100644
--- a/ext/JutulDarcyAMGXExt/cpr.jl
+++ b/ext/JutulDarcyAMGXExt/cpr.jl
@@ -95,8 +95,7 @@ function JutulDarcy.gpu_cpr_setup_buffers!(cpr, J_bsr, r_cu, op)
         Tv = eltype(r_cu)
         n = length(r_cu)
         bz = cpr_s.block_size
-
-        # cpr.pressure_precond.data[:buffer_full] = AMGX.CUDA.CuVector{Tv}(undef, n)
+        cpr.pressure_precond.data[:buffer_full] = similar(r_cu)
         bz_w, n_w = size(cpr_s.w_p)
         @assert n == bz*n_w
         data[:w_p] = AMGX.CUDA.CuMatrix{Tv}(undef, bz_w, n_w)
diff --git a/src/ext/ext_amgx.jl b/src/ext/ext_amgx.jl
index ea7ba92d..93d53f9e 100644
--- a/src/ext/ext_amgx.jl
+++ b/src/ext/ext_amgx.jl
@@ -47,16 +47,8 @@ function Jutul.apply!(x, cpr::AMGXCPR, r)
     Jutul.apply!(x, cpr.system_precond, r)
     # Correct the residual
     A_ps = cpr.pressure_precond.data[:operator]
-    if false
-        if !haskey(cpr.pressure_precond.data, :buffer_full)
-            cpr.pressure_precond.data[:buffer_full] = copy(r)
-        end
-        buf = cpr.pressure_precond.data[:buffer_full]
-        copyto!(buf, r)
-        r = buf
-    end
-    # r = copy(r)
-    JutulDarcy.correct_residual!(r, A_ps, x)
+    r_corrected = cpr.pressure_precond.data[:buffer_full]
+    JutulDarcy.correct_residual!(r_corrected, A_ps, x)
     # Construct pressure residual
     r_p = cpr.pressure_precond.data[:buffer_p]
     w_p = cpr.pressure_precond.data[:w_p]

From b5ab2cc5118517c665b553481c5743b73967051f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Olav=20M=C3=B8yner?= <olav.moyner@gmail.com>
Date: Sun, 27 Oct 2024 19:50:47 +0100
Subject: [PATCH 21/32] Updates to CPR code

---
 ext/JutulDarcyAMGXExt/cpr.jl | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/ext/JutulDarcyAMGXExt/cpr.jl b/ext/JutulDarcyAMGXExt/cpr.jl
index 2fc46496..40ba2230 100644
--- a/ext/JutulDarcyAMGXExt/cpr.jl
+++ b/ext/JutulDarcyAMGXExt/cpr.jl
@@ -1,9 +1,6 @@
 function JutulDarcy.update_amgx_pressure_system!(amgx::AMGXPreconditioner, A_p::Jutul.StaticCSR.StaticSparsityMatrixCSR, Tv)
     data = amgx.data
-    initialize_amgx_structure!(amgx, A_p, Tv)
-    update_pressure_system!(amgx, A_p)
-    s = amgx.data[:storage]
-    AMGX.setup!(s.solver, s.matrix)
+    update_pressure_system!(amgx, A_p, Tv)
     return amgx
 end
 
@@ -21,6 +18,7 @@ mutable struct AMGXStorage{C, R, V, M, S}
         matrix = AMGX.AMGXMatrix(resources, amgx_mode)
         solver = AMGX.Solver(resources, amgx_mode, config)
         function finalize_storage!(amgx_s::AMGXStorage)
+            @async println("Finalizing AMGXStorage")
             close(amgx_s.solver)
             close(amgx_s.matrix)
             close(amgx_s.r)
@@ -39,13 +37,6 @@ mutable struct AMGXStorage{C, R, V, M, S}
     end
 end
 
-function update_pressure_system!(amgx, A_cpu)
-    s = amgx.data[:storage]
-    A_gpu = s.matrix
-    AMGX.replace_coefficients!(A_gpu, A_cpu.At.nzval)
-    return amgx
-end
-
 function JutulDarcy.gpu_amgx_solve!(amgx::AMGXPreconditioner, r_p)
     s = amgx.data[:storage]
     n = length(r_p)
@@ -56,7 +47,7 @@ function JutulDarcy.gpu_amgx_solve!(amgx::AMGXPreconditioner, r_p)
     return r_p
 end
 
-function initialize_amgx_structure!(amgx::AMGXPreconditioner, A::Jutul.StaticCSR.StaticSparsityMatrixCSR, Tv)
+function update_pressure_system!(amgx::AMGXPreconditioner, A::Jutul.StaticCSR.StaticSparsityMatrixCSR, Tv)
     if !haskey(amgx.data, :storage)
         if Tv == Float64
             amgx_mode = AMGX.dDDI
@@ -74,16 +65,23 @@ function initialize_amgx_structure!(amgx::AMGXPreconditioner, A::Jutul.StaticCSR
         row_ptr = Cint.(A.At.colptr .- 1)
         colval = Cint.(A.At.rowval .- 1)
         nzval = A.At.nzval
-        nzval = Tv.(nzval)
-
-        AMGX.upload!(s.matrix, 
+        # TODO: Support for other types than Float64, should be done in setup of
+        # pressure system
+        AMGX.pin_memory(nzval)
+        AMGX.upload!(s.matrix,
             row_ptr,
             colval,
             nzval
         )
         amgx.data[:storage] = s
         amgx.data[:buffer_p] = AMGX.CUDA.CuVector{Tv}(undef, n)
+    else
+        s = amgx.data[:storage]
+        A_gpu = s.matrix
+        @assert nnz(A) == nnz(A_gpu)
+        AMGX.replace_coefficients!(A_gpu, A.At.nzval)
     end
+    AMGX.setup!(s.solver, s.matrix)
     return amgx
 end
 

From e54d62b175cebd51b3f9d203532be0e3b12d4b10 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Olav=20M=C3=B8yner?= <olav.moyner@gmail.com>
Date: Sun, 27 Oct 2024 20:28:50 +0100
Subject: [PATCH 22/32] Updates to CUDA

---
 ext/JutulDarcyAMGXExt/cpr.jl | 2 ++
 ext/JutulDarcyCUDAExt/cpr.jl | 1 -
 src/ext/ext_cuda.jl          | 6 +++---
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/ext/JutulDarcyAMGXExt/cpr.jl b/ext/JutulDarcyAMGXExt/cpr.jl
index 40ba2230..ff0a60e4 100644
--- a/ext/JutulDarcyAMGXExt/cpr.jl
+++ b/ext/JutulDarcyAMGXExt/cpr.jl
@@ -38,6 +38,8 @@ mutable struct AMGXStorage{C, R, V, M, S}
 end
 
 function JutulDarcy.gpu_amgx_solve!(amgx::AMGXPreconditioner, r_p)
+    # TODO: This sync call is maybe needd?
+    AMGX.CUDA.synchronize()
     s = amgx.data[:storage]
     n = length(r_p)
     AMGX.upload!(s.r, r_p)
diff --git a/ext/JutulDarcyCUDAExt/cpr.jl b/ext/JutulDarcyCUDAExt/cpr.jl
index b7dfbe18..8df6c932 100644
--- a/ext/JutulDarcyCUDAExt/cpr.jl
+++ b/ext/JutulDarcyCUDAExt/cpr.jl
@@ -30,7 +30,6 @@ function JutulDarcy.gpu_increment_pressure!(x, dp)
     bz = div(n, length(dp))
     @assert bz > 0
     CUDA.@cuda gpu_increment_pressure_kernel!(x, dp, bz)
-    CUDA.synchronize()
     return x
 end
 
diff --git a/src/ext/ext_cuda.jl b/src/ext/ext_cuda.jl
index 3c8289f1..15c2f21e 100644
--- a/src/ext/ext_cuda.jl
+++ b/src/ext/ext_cuda.jl
@@ -12,7 +12,7 @@ function CUDAReservoirKrylov(solver = :gmres, prec = ILUZeroPreconditioner();
         kwarg...
     )
     cfg = IterativeSolverConfig(; kwarg...)
-    CUDAReservoirKrylov{Float_t, Int_t}(solver, cfg, prec, Dict{Symbol, Any}(), nothing)
+    return CUDAReservoirKrylov{Float_t, Int_t}(solver, cfg, prec, Dict{Symbol, Any}(), nothing)
 end
 
 function Jutul.linear_solve!(lsys::Jutul.LSystem,
@@ -111,8 +111,8 @@ function gpu_update_preconditioner!
 end
 
 function Base.show(io::IO, krylov::CUDAReservoirKrylov)
-    rtol = linear_solver_tolerance(krylov.config, :relative)
-    atol = linear_solver_tolerance(krylov.config, :absolute)
+    rtol = Jutul.linear_solver_tolerance(krylov.config, :relative)
+    atol = Jutul.linear_solver_tolerance(krylov.config, :absolute)
     print(io, "CUDAReservoirKrylov using $(krylov.solver) (ϵₐ=$atol, ϵ=$rtol) with prec = $(typeof(krylov.preconditioner))")
 end
 

From f7f7d39ac1571f8d27b36e67e6b955a86e9eb751 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Olav=20M=C3=B8yner?= <olav.moyner@gmail.com>
Date: Mon, 28 Oct 2024 08:23:10 +0100
Subject: [PATCH 23/32] Fix for wells

---
 src/types.jl | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/types.jl b/src/types.jl
index 41cb3585..12209d2b 100644
--- a/src/types.jl
+++ b/src/types.jl
@@ -417,10 +417,12 @@ function SimpleWell(
         name = :Well,
         explicit_dp = true,
         surface_conditions = default_surface_cond(),
-        reference_depth = 0,
+        reference_depth = 0.0,
         volume = 1000.0, # Regularization volume for well, not a real volume
         kwarg...
     )
+    reference_depth = convert(Float64, reference_depth)
+    volume = convert(Float64, volume)
     nr = length(reservoir_cells)
     WI, gdz = common_well_setup(nr; kwarg...)
     perf = (self = ones(Int64, nr), reservoir = vec(reservoir_cells), WI = WI, gdz = gdz)

From a2c05c7f20f2521d2c08bac9c84b7c9ea5f3ead7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Olav=20M=C3=B8yner?= <olav.moyner@gmail.com>
Date: Mon, 28 Oct 2024 08:23:29 +0100
Subject: [PATCH 24/32] Some CUDA code backup

---
 ext/JutulDarcyAMGXExt/cpr.jl |  1 +
 ext/JutulDarcyCUDAExt/cpr.jl | 23 ++++++++++++++++++-----
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/ext/JutulDarcyAMGXExt/cpr.jl b/ext/JutulDarcyAMGXExt/cpr.jl
index ff0a60e4..561ca597 100644
--- a/ext/JutulDarcyAMGXExt/cpr.jl
+++ b/ext/JutulDarcyAMGXExt/cpr.jl
@@ -78,6 +78,7 @@ function update_pressure_system!(amgx::AMGXPreconditioner, A::Jutul.StaticCSR.St
         amgx.data[:storage] = s
         amgx.data[:buffer_p] = AMGX.CUDA.CuVector{Tv}(undef, n)
     else
+        pb = amgx.data[:buffer_p]
         s = amgx.data[:storage]
         A_gpu = s.matrix
         @assert nnz(A) == nnz(A_gpu)
diff --git a/ext/JutulDarcyCUDAExt/cpr.jl b/ext/JutulDarcyCUDAExt/cpr.jl
index 8df6c932..8560f018 100644
--- a/ext/JutulDarcyCUDAExt/cpr.jl
+++ b/ext/JutulDarcyCUDAExt/cpr.jl
@@ -16,11 +16,11 @@ function reduce_residual_kernel!(r_p, w_p, r_ps)
         v = zero(T)
         for comp in 1:ncomp
             ix = (cell-1)*ncomp + comp
-            wi = w_p[comp, cell]
-            ri = r_ps[ix]
+            @inbounds wi = w_p[comp, cell]
+            @inbounds ri = r_ps[ix]
             v += wi*ri
         end
-        r_p[cell] = v
+        @inbounds r_p[cell] = v
     end
     return nothing
 end
@@ -29,7 +29,12 @@ function JutulDarcy.gpu_increment_pressure!(x, dp)
     n = length(x)
     bz = div(n, length(dp))
     @assert bz > 0
-    CUDA.@cuda gpu_increment_pressure_kernel!(x, dp, bz)
+    CUDA.@sync CUDA.@cuda gpu_increment_pressure_kernel!(x, dp, bz)
+    # kernel = CUDA.@cuda launch=false gpu_increment_pressure_kernel!(x, dp, bz)
+    # threads, blocks = kernel_helper(kernel, x)
+    # CUDA.@sync begin
+    #    kernel(x, dp, bz; threads, blocks)
+    # end
     return x
 end
 
@@ -37,7 +42,15 @@ function gpu_increment_pressure_kernel!(x, dp, bz)
     index = threadIdx().x
     stride = blockDim().x
     for cell in index:stride:length(dp)
-        x[(cell-1)*bz + 1] += dp[cell]
+        @inbounds x[(cell-1)*bz + 1] += dp[cell]
     end
     return nothing
 end
+
+function kernel_helper(kernel, V)
+    config = launch_configuration(kernel.fun)
+    threads = min(length(V), config.threads)
+    blocks = cld(length(V), threads)
+
+    return (threads, blocks)
+end
\ No newline at end of file

From 017c1efe1e73c03cb91d8b3ee7d1c045c8ef55ae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Olav=20M=C3=B8yner?= <olav.moyner@gmail.com>
Date: Mon, 28 Oct 2024 09:06:46 +0100
Subject: [PATCH 25/32] Use CUDA threads in CPR code

---
 ext/JutulDarcyCUDAExt/cpr.jl | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/ext/JutulDarcyCUDAExt/cpr.jl b/ext/JutulDarcyCUDAExt/cpr.jl
index 8560f018..daad2d2f 100644
--- a/ext/JutulDarcyCUDAExt/cpr.jl
+++ b/ext/JutulDarcyCUDAExt/cpr.jl
@@ -1,9 +1,12 @@
 function JutulDarcy.gpu_reduce_residual!(r_p, w_p, r)
-    # TODO: Actually use CUDA threads here.
     @assert eltype(r_p) == eltype(w_p) == eltype(r)
     @assert length(w_p) == length(r)
     @assert size(w_p, 2) == length(r_p)
-    CUDA.@cuda reduce_residual_kernel!(r_p, w_p, r)
+    kernel = CUDA.@cuda launch = false reduce_residual_kernel!(r_p, w_p, r)
+    threads, blocks = kernel_helper(kernel, r_p)
+    CUDA.@sync begin
+        kernel(r_p, w_p, r; threads, blocks)
+    end
     return r_p
 end
 
@@ -29,12 +32,11 @@ function JutulDarcy.gpu_increment_pressure!(x, dp)
     n = length(x)
     bz = div(n, length(dp))
     @assert bz > 0
-    CUDA.@sync CUDA.@cuda gpu_increment_pressure_kernel!(x, dp, bz)
-    # kernel = CUDA.@cuda launch=false gpu_increment_pressure_kernel!(x, dp, bz)
-    # threads, blocks = kernel_helper(kernel, x)
-    # CUDA.@sync begin
-    #    kernel(x, dp, bz; threads, blocks)
-    # end
+    kernel = CUDA.@cuda launch=false gpu_increment_pressure_kernel!(x, dp, bz)
+    threads, blocks = kernel_helper(kernel, dp)
+    CUDA.@sync begin
+       kernel(x, dp, bz; threads, blocks)
+    end
     return x
 end
 
@@ -42,7 +44,7 @@ function gpu_increment_pressure_kernel!(x, dp, bz)
     index = threadIdx().x
     stride = blockDim().x
     for cell in index:stride:length(dp)
-        @inbounds x[(cell-1)*bz + 1] += dp[cell]
+        x[(cell-1)*bz + 1] += dp[cell]
     end
     return nothing
 end

From c6f15b187ca979db26c2e438ff534452b3778aa8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Olav=20M=C3=B8yner?= <olav.moyner@gmail.com>
Date: Mon, 28 Oct 2024 09:29:18 +0100
Subject: [PATCH 26/32] Use view instead of custom kernel for CPR dp

---
 ext/JutulDarcyCUDAExt/cpr.jl | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/ext/JutulDarcyCUDAExt/cpr.jl b/ext/JutulDarcyCUDAExt/cpr.jl
index daad2d2f..23e05e69 100644
--- a/ext/JutulDarcyCUDAExt/cpr.jl
+++ b/ext/JutulDarcyCUDAExt/cpr.jl
@@ -32,10 +32,17 @@ function JutulDarcy.gpu_increment_pressure!(x, dp)
     n = length(x)
     bz = div(n, length(dp))
     @assert bz > 0
-    kernel = CUDA.@cuda launch=false gpu_increment_pressure_kernel!(x, dp, bz)
-    threads, blocks = kernel_helper(kernel, dp)
-    CUDA.@sync begin
-       kernel(x, dp, bz; threads, blocks)
+    if false
+        kernel = CUDA.@cuda launch=false gpu_increment_pressure_kernel!(x, dp, bz)
+        threads, blocks = kernel_helper(kernel, dp)
+        threads = blocks = 1
+
+        CUDA.@sync begin
+        kernel(x, dp, bz; threads, blocks)
+        end
+    else
+        x_view = view(x, 1:bz:(n+1-bz))
+        @. x_view += dp
     end
     return x
 end

From b1a070ce552dbaeb9351f376c390556d48e592a4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Olav=20M=C3=B8yner?= <olav.moyner@gmail.com>
Date: Mon, 28 Oct 2024 10:07:22 +0100
Subject: [PATCH 27/32] Performance tweaks

---
 ext/JutulDarcyCUDAExt/cuda_utils.jl | 2 +-
 src/cpr.jl                          | 4 ++++
 src/flux.jl                         | 4 ++--
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/ext/JutulDarcyCUDAExt/cuda_utils.jl b/ext/JutulDarcyCUDAExt/cuda_utils.jl
index 1da70ab0..2a9904cf 100644
--- a/ext/JutulDarcyCUDAExt/cuda_utils.jl
+++ b/ext/JutulDarcyCUDAExt/cuda_utils.jl
@@ -19,7 +19,7 @@ function JutulDarcy.update_gpu_block_system!(J, blockDim, nzval, ϵ = 1e-12)
                 else
                     v = ϵ
                 end
-                nzval[i] = v
+                @inbounds nzval[i] = v
             end
         end
     end
diff --git a/src/cpr.jl b/src/cpr.jl
index 23233348..3d8653a7 100644
--- a/src/cpr.jl
+++ b/src/cpr.jl
@@ -271,6 +271,10 @@ function update_pressure_system!(A_p::Jutul.StaticSparsityMatrixCSR, p_prec, A::
     ncomp = size(w_p, 1)
     N = Val(ncomp)
     is_adjoint = Val(Jutul.represented_as_adjoint(matrix_layout(ctx)))
+    update_rows_csr(nz, A_p, w_p, cols, nz_s, N, is_adjoint, n)
+end
+
+function update_rows_csr(nz, A_p, w_p, cols, nz_s, N, is_adjoint, n)
     @batch minbatch=tb for row in 1:n
         update_row_csr!(nz, A_p, w_p, cols, nz_s, row, N, is_adjoint)
     end
diff --git a/src/flux.jl b/src/flux.jl
index b8fb843b..6cd8b073 100644
--- a/src/flux.jl
+++ b/src/flux.jl
@@ -59,7 +59,7 @@ end
     return q
 end
 
-function face_average_density(model, state, tpfa, phase)
+@inline function face_average_density(model, state, tpfa, phase)
     ρ = state.PhaseMassDensities
     l = tpfa.left
     r = tpfa.right
@@ -68,7 +68,7 @@ function face_average_density(model, state, tpfa, phase)
     return 0.5*(ρ_i + ρ_c)
 end
 
-function face_average_density(model::CompositionalModel, state, tpfa, phase)
+@inline function face_average_density(model::CompositionalModel, state, tpfa, phase)
     sys = flow_system(model.system)
     ρ = state.PhaseMassDensities
     l = tpfa.left

From a58af9cf5f2478d2c33ec4ca6eab9dec136f00c3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Olav=20M=C3=B8yner?= <olav.moyner@gmail.com>
Date: Mon, 28 Oct 2024 10:21:42 +0100
Subject: [PATCH 28/32] Remove some code duplication

---
 src/flux.jl | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/flux.jl b/src/flux.jl
index 6cd8b073..dfe12b01 100644
--- a/src/flux.jl
+++ b/src/flux.jl
@@ -61,11 +61,7 @@ end
 
 @inline function face_average_density(model, state, tpfa, phase)
     ρ = state.PhaseMassDensities
-    l = tpfa.left
-    r = tpfa.right
-    @inbounds ρ_c = ρ[phase, l]
-    @inbounds ρ_i = ρ[phase, r]
-    return 0.5*(ρ_i + ρ_c)
+    return phase_face_average(ρ, tpfa, phase)
 end
 
 @inline function face_average_density(model::CompositionalModel, state, tpfa, phase)

From 96c9495c6e0274bb073c58e4f6f1c948222dac6a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Olav=20M=C3=B8yner?= <olav.moyner@gmail.com>
Date: Mon, 28 Oct 2024 10:55:09 +0100
Subject: [PATCH 29/32] Reorder destructors for AMGX

---
 ext/JutulDarcyAMGXExt/cpr.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ext/JutulDarcyAMGXExt/cpr.jl b/ext/JutulDarcyAMGXExt/cpr.jl
index 561ca597..8f902ea7 100644
--- a/ext/JutulDarcyAMGXExt/cpr.jl
+++ b/ext/JutulDarcyAMGXExt/cpr.jl
@@ -20,9 +20,9 @@ mutable struct AMGXStorage{C, R, V, M, S}
         function finalize_storage!(amgx_s::AMGXStorage)
             @async println("Finalizing AMGXStorage")
             close(amgx_s.solver)
-            close(amgx_s.matrix)
             close(amgx_s.r)
             close(amgx_s.x)
+            close(amgx_s.matrix)
             close(amgx_s.resources)
             close(amgx_s.config)
         end

From 0824cfcea3788b11c7093325c342e18ddcc4881b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Olav=20M=C3=B8yner?= <olav.moyner@gmail.com>
Date: Mon, 28 Oct 2024 15:13:25 +0100
Subject: [PATCH 30/32] Bump Jutul compat & version

---
 Project.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Project.toml b/Project.toml
index 20bf60e1..28158464 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "JutulDarcy"
 uuid = "82210473-ab04-4dce-b31b-11573c4f8e0a"
 authors = ["Olav Møyner <olav.moyner@gmail.com>"]
-version = "0.2.35"
+version = "0.2.36"
 
 [deps]
 AlgebraicMultigrid = "2169fc97-5a83-5252-b627-83903c6c433c"
@@ -62,7 +62,7 @@ ForwardDiff = "0.10.30"
 GLMakie = "0.10.13"
 GeoEnergyIO = "1.1.12"
 HYPRE = "1.6.0, 1.7"
-Jutul = "0.2.40"
+Jutul = "0.2.41"
 Krylov = "0.9.1"
 LazyArtifacts = "1"
 LinearAlgebra = "1"

From c3d398c867152b955e40f0d06db71ac2a09c4905 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Olav=20M=C3=B8yner?= <olav.moyner@gmail.com>
Date: Mon, 28 Oct 2024 20:10:15 +0100
Subject: [PATCH 31/32] Add simple preconditioner version of AMGX

---
 ext/JutulDarcyAMGXExt/JutulDarcyAMGXExt.jl |  1 +
 ext/JutulDarcyAMGXExt/cpr.jl               | 12 ++---
 ext/JutulDarcyAMGXExt/precond.jl           | 53 ++++++++++++++++++++++
 ext/JutulDarcyCUDAExt/cuda_utils.jl        |  2 +-
 ext/JutulDarcyCUDAExt/ilu0.jl              |  1 +
 5 files changed, 59 insertions(+), 10 deletions(-)
 create mode 100644 ext/JutulDarcyAMGXExt/precond.jl

diff --git a/ext/JutulDarcyAMGXExt/JutulDarcyAMGXExt.jl b/ext/JutulDarcyAMGXExt/JutulDarcyAMGXExt.jl
index b4baa274..b1df4954 100644
--- a/ext/JutulDarcyAMGXExt/JutulDarcyAMGXExt.jl
+++ b/ext/JutulDarcyAMGXExt/JutulDarcyAMGXExt.jl
@@ -6,6 +6,7 @@ module JutulDarcyAMGXExt
     timeit_debug_enabled() = Jutul.timeit_debug_enabled()
 
     include("cpr.jl")
+    include("precond.jl")
 
     function __init__()
         # TODO: Figure out a way to not always do this?
diff --git a/ext/JutulDarcyAMGXExt/cpr.jl b/ext/JutulDarcyAMGXExt/cpr.jl
index 8f902ea7..bf6d7498 100644
--- a/ext/JutulDarcyAMGXExt/cpr.jl
+++ b/ext/JutulDarcyAMGXExt/cpr.jl
@@ -38,15 +38,7 @@ mutable struct AMGXStorage{C, R, V, M, S}
 end
 
 function JutulDarcy.gpu_amgx_solve!(amgx::AMGXPreconditioner, r_p)
-    # TODO: This sync call is maybe needd?
-    AMGX.CUDA.synchronize()
-    s = amgx.data[:storage]
-    n = length(r_p)
-    AMGX.upload!(s.r, r_p)
-    AMGX.set_zero!(s.x, n)
-    AMGX.solve!(s.x, s.solver, s.r)
-    AMGX.download!(r_p, s.x)
-    return r_p
+    Jutul.apply!(r_p, amgx, r_p)
 end
 
 function update_pressure_system!(amgx::AMGXPreconditioner, A::Jutul.StaticCSR.StaticSparsityMatrixCSR, Tv)
@@ -76,6 +68,8 @@ function update_pressure_system!(amgx::AMGXPreconditioner, A::Jutul.StaticCSR.St
             nzval
         )
         amgx.data[:storage] = s
+        amgx.data[:block_size] = 1
+        amgx.data[:n] = n
         amgx.data[:buffer_p] = AMGX.CUDA.CuVector{Tv}(undef, n)
     else
         pb = amgx.data[:buffer_p]
diff --git a/ext/JutulDarcyAMGXExt/precond.jl b/ext/JutulDarcyAMGXExt/precond.jl
new file mode 100644
index 00000000..a4b42e50
--- /dev/null
+++ b/ext/JutulDarcyAMGXExt/precond.jl
@@ -0,0 +1,53 @@
+function JutulDarcy.gpu_update_preconditioner!(prec::JutulDarcy.AMGXPreconditioner, lsys, model, storage, recorder, executor, krylov, J_bsr, r_cu, op)
+    nzval = nonzeros(J_bsr)
+    bz = Jutul.block_size(lsys[1, 1])
+    if haskey(prec.data, :storage)
+        s = prec.data[:storage]
+        AMGX.replace_coefficients!(s.matrix, nzval)
+    else
+        Tv = eltype(J_bsr)
+        if Tv == Float64
+            amgx_mode = AMGX.dDDI
+        else
+            amgx_mode = AMGX.dFFI
+        end
+        n, m = size(J_bsr)
+        @assert n == m
+        N = n ÷ bz
+        config = AMGX.Config(prec.settings)
+        s = AMGXStorage(config, amgx_mode)
+
+        AMGX.set_zero!(s.x, N, block_dim = bz)
+        AMGX.set_zero!(s.r, N, block_dim = bz)
+
+        row_ptr = J_bsr.rowPtr
+        colval = J_bsr.colVal
+        AMGX.upload!(
+            s.matrix,
+            row_ptr,
+            colval,
+            nzval,
+            block_dims = (bz, bz)
+        )
+        prec.data[:storage] = s
+        prec.data[:n] = N
+        prec.data[:block_size] = bz
+    end
+    AMGX.setup!(s.solver, s.matrix)
+end
+
+function Jutul.operator_nrows(prec::JutulDarcy.AMGXPreconditioner)
+    return prec.data[:n]*prec.data[:block_size]
+end
+
+function Jutul.apply!(x, amgx::JutulDarcy.AMGXPreconditioner, r)
+    # TODO: This sync call is maybe needd?
+    AMGX.CUDA.synchronize()
+    s = amgx.data[:storage]
+    bz = amgx.data[:block_size]::Int
+    n = amgx.data[:n]::Int
+    AMGX.upload!(s.r, r, block_dim = bz)
+    AMGX.set_zero!(s.x, n, block_dim = bz)
+    AMGX.solve!(s.x, s.solver, s.r)
+    AMGX.download!(x, s.x)
+end
diff --git a/ext/JutulDarcyCUDAExt/cuda_utils.jl b/ext/JutulDarcyCUDAExt/cuda_utils.jl
index 2a9904cf..cb5f2e42 100644
--- a/ext/JutulDarcyCUDAExt/cuda_utils.jl
+++ b/ext/JutulDarcyCUDAExt/cuda_utils.jl
@@ -11,7 +11,7 @@ function JutulDarcy.build_gpu_block_system(Ti, Tv, sz::Tuple{Int, Int}, blockDim
 end
 
 function JutulDarcy.update_gpu_block_system!(J, blockDim, nzval, ϵ = 1e-12)
-    if ϵ > 0.0
+    if ϵ > 0.0 && false
         for (i, v) in enumerate(nzval)
             if abs(v) < ϵ
                 if v > 0
diff --git a/ext/JutulDarcyCUDAExt/ilu0.jl b/ext/JutulDarcyCUDAExt/ilu0.jl
index 8553ccd5..9bc1e925 100644
--- a/ext/JutulDarcyCUDAExt/ilu0.jl
+++ b/ext/JutulDarcyCUDAExt/ilu0.jl
@@ -85,6 +85,7 @@ function ilu0_gpu_update!(A, Tv, data)
     end
     mb = div(size(A, 1), A.blockDim)
     ilu0_is_analyzed = haskey(data, :ilu0)
+    ilu0_is_analyzed = false
     if ilu0_is_analyzed
         info, desc = data[:ilu0]
     else

From 240b37c3d7096b0e051521be60e2017efd1fffe8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Olav=20M=C3=B8yner?= <olav.moyner@gmail.com>
Date: Tue, 29 Oct 2024 12:11:43 +0100
Subject: [PATCH 32/32] Clean up linear solver set option and re-enable
 disabled test

---
 src/test_utils/setup_multimodel.jl |  6 ------
 src/utils.jl                       |  5 ++++-
 test/multimodel.jl                 | 15 ++++++---------
 3 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/src/test_utils/setup_multimodel.jl b/src/test_utils/setup_multimodel.jl
index b1793224..4483837a 100644
--- a/src/test_utils/setup_multimodel.jl
+++ b/src/test_utils/setup_multimodel.jl
@@ -2,7 +2,6 @@ function simulate_mini_wellcase(::Val{:compositional_2ph_3c};
         dims = (3, 1, 1),
         setuparg = NamedTuple(),
         output_path = nothing,
-        default_linsolve = true,
         nstep = 12*5,
         total_time = 30.0*si_unit(:day)*nstep,
         kwarg...
@@ -52,7 +51,6 @@ function simulate_mini_wellcase(::Val{:compositional_2ph_3c};
     sim, config = setup_reservoir_simulator(
         model, state0, parameters;
         info_level = -1,
-        set_linear_solver = !default_linsolve,
         error_on_incomplete = true,
         output_path = output_path,
         setuparg...,
@@ -76,7 +74,6 @@ function simulate_mini_wellcase(::Val{:immiscible_2ph};
         permeability = 0.1*9.869232667160130e-13,
         nstep = 12*5,
         total_time = 30.0*si_unit(:day)*nstep,
-        default_linsolve = true,
         kwarg...)
     # Some useful constants
     day = 3600*24
@@ -125,7 +122,6 @@ function simulate_mini_wellcase(::Val{:immiscible_2ph};
     sim, config = setup_reservoir_simulator(
         model, state0, parameters;
         info_level = -1,
-        set_linear_solver = !default_linsolve,
         output_path = output_path,
         error_on_incomplete = true,
         setuparg...
@@ -145,7 +141,6 @@ function simulate_mini_wellcase(::Val{:bo_spe1};
         dims = (3, 1, 1),
         setuparg = NamedTuple(),
         output_path = nothing,
-        default_linsolve = true,
         nstep = 12*5,
         total_time = 30.0*si_unit(:day)*nstep,
         kwarg...
@@ -199,7 +194,6 @@ function simulate_mini_wellcase(::Val{:bo_spe1};
     sim, config = setup_reservoir_simulator(
         model, state0, parameters;
         info_level = -1,
-        set_linear_solver = !default_linsolve,
         output_path = output_path,
         error_on_incomplete = true,
         setuparg...
diff --git a/src/utils.jl b/src/utils.jl
index cbad05ef..d51a5336 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -634,13 +634,16 @@ function setup_reservoir_simulator(case::JutulCase;
         relaxation = false,
         presolve_wells = false,
         parray_arg = Dict{Symbol, Any}(),
-        set_linear_solver = linear_solver isa Symbol || ismissing(linear_solver),
+        set_linear_solver = missing,
         linear_solver_arg = Dict{Symbol, Any}(),
         extra_timing_setup = false,
         nldd_partition = missing,
         nldd_arg = Dict{Symbol, Any}(),
         kwarg...
     )
+    if ismissing(set_linear_solver)
+        set_linear_solver = linear_solver isa Symbol || ismissing(linear_solver)
+    end
     # Handle old kwarg...
     max_timestep = min(max_dt, max_timestep)
     extra_kwarg = Dict{Symbol, Any}()
diff --git a/test/multimodel.jl b/test/multimodel.jl
index f989ab32..71474230 100644
--- a/test/multimodel.jl
+++ b/test/multimodel.jl
@@ -127,15 +127,12 @@ end
     for b in [false, true]
         for backend in [:csr, :csc]
             for gen_ad in [true, false]
-                for default_linsolve in [false, true]
-                    @testset "Block=$b, backend=$b, defaulted linsolve=$default_linsolve" begin
-                        arg = (general_ad = gen_ad, backend = backend, block_backend = b, default_linsolve = default_linsolve)
-                        test_compositional_with_wells(; arg...)
-                        # Disabled test until Jutul is updated.
-                        # test_compositional_with_wells(; fast_flash = true, arg...)
-                        test_immiscible_with_wells(; arg...)
-                        test_blackoil_with_wells(; arg...)
-                    end
+                @testset "Block=$b, backend=$b" begin
+                    arg = (general_ad = gen_ad, backend = backend, block_backend = b)
+                    test_compositional_with_wells(; arg...)
+                    test_compositional_with_wells(; fast_flash = true, arg...)
+                    test_immiscible_with_wells(; arg...)
+                    test_blackoil_with_wells(; arg...)
                 end
             end
         end