Add kernel-based matrix cumsum (#416)

* Add kernel-based matrix cumsum * Fix some minor issues * Add tests to increase code coverage * Fix print devices for AMD * Expand tests and exclude Metal and kernel functions from coverage / formatting
JuliaHealth · Jun 21, 2024 · d624e8f · d624e8f
1 parent 13c382d
commit d624e8f
Show file tree

Hide file tree

Showing 9 changed files with 105 additions and 17 deletions.
diff --git a/KomaMRICore/ext/KomaAMDGPUExt.jl b/KomaMRICore/ext/KomaAMDGPUExt.jl
@@ -18,7 +18,7 @@ end
 
 function KomaMRICore._print_devices(::ROCBackend)
     devices = [
-        Symbol("($(i-1)$(i == 1 ? "*" : " "))") => d.name for
+        Symbol("($(i-1)$(i == 1 ? "*" : " "))") => AMDGPU.HIP.name(d) for
         (i, d) in enumerate(AMDGPU.devices())
     ]
     @info "$(length(AMDGPU.devices())) AMD capable device(s)." devices...

diff --git a/KomaMRICore/ext/KomaMetalExt.jl b/KomaMRICore/ext/KomaMetalExt.jl
@@ -1,3 +1,5 @@
+## COV_EXCL_START
+
 module KomaMetalExt
 
 using Metal
@@ -32,4 +34,6 @@ function __init__()
     @warn "Metal does not support all array operations used by KomaMRI (https://github.com/JuliaGPU/Metal.jl/issues/348). GPU performance may be slower than expected"
 end
 
-end
+end
+
+## COV_EXCL_STOP
diff --git a/KomaMRICore/src/simulation/Bloch/BlochDictSimulationMethod.jl b/KomaMRICore/src/simulation/Bloch/BlochDictSimulationMethod.jl
@@ -35,6 +35,7 @@ function run_spin_precession!(
     sig::AbstractArray{Complex{T}},
     M::Mag{T},
     sim_method::BlochDict,
+    backend::KA.Backend
 ) where {T<:Real}
     #Simulation
     #Motion
@@ -43,7 +44,7 @@ function run_spin_precession!(
     Bz = x .* seq.Gx' .+ y .* seq.Gy' .+ z .* seq.Gz' .+ p.Δw / T(2π * γ)
     #Rotation
     if is_ADC_on(seq)
-        ϕ = T(-2π * γ) .* cumtrapz(seq.Δt', Bz)
+        ϕ = T(-2π * γ) .* KomaMRIBase.cumtrapz(seq.Δt', Bz, backend)
     else
         ϕ = T(-2π * γ) .* trapz(seq.Δt', Bz)
     end

diff --git a/KomaMRICore/src/simulation/Bloch/BlochSimulationMethod.jl b/KomaMRICore/src/simulation/Bloch/BlochSimulationMethod.jl
@@ -44,6 +44,7 @@ function run_spin_precession!(
     sig::AbstractArray{Complex{T}},
     M::Mag{T},
     sim_method::SimulationMethod,
+    backend::KA.Backend
 ) where {T<:Real}
     #Simulation
     #Motion
@@ -52,7 +53,7 @@ function run_spin_precession!(
     Bz = x .* seq.Gx' .+ y .* seq.Gy' .+ z .* seq.Gz' .+ p.Δw / T(2π * γ)
     #Rotation
     if is_ADC_on(seq)
-        ϕ = T(-2π * γ) .* cumtrapz(seq.Δt', Bz)
+        ϕ = T(-2π * γ) .* KomaMRIBase.cumtrapz(seq.Δt', Bz, backend)
     else
         ϕ = T(-2π * γ) .* trapz(seq.Δt', Bz)
     end

diff --git a/KomaMRICore/src/simulation/Bloch/KernelFunctions.jl b/KomaMRICore/src/simulation/Bloch/KernelFunctions.jl
@@ -0,0 +1,50 @@
+using KernelAbstractions: @index, @kernel
+
+## COV_EXCL_START
+#! format: off
+
+"""
+    cumsum2_kernel
+
+Simple kernel function, computes the cumulative sum of each row of a matrix. Operates
+in-place on the input matrix without allocating additional memory.
+
+# Arguments
+- 'A': matrix to compute cumsum on
+"""
+@kernel function cumsum_matrix_rows_kernel!(A)
+    i = @index(Global)
+
+    for k ∈ 2:size(A, 2)
+        @inbounds A[i, k] += A[i, k-1]
+    end
+end
+
+#! format: on
+## COV_EXCL_STOP
+
+"""
+    cumtrapz
+
+A more efficient GPU implementation of the cumtrapz method defined in TrapezoidalIntegration.jl.
+Uses a kernel to compute cumsum along the second dimension.
+
+# Arguments
+- `Δt`: (`1 x NΔt ::Matrix{Float64}`, `[s]`) delta time 1-row array
+- `x`: (`Ns x (NΔt+1) ::Matrix{Float64}`, `[T]`) magnitude of the field Gx * x + Gy * y +
+    Gz * z
+
+# Returns
+- `y`: (`Ns x NΔt ::Matrix{Float64}`, `[T*s]`) matrix where every column is the
+    cumulative integration over time of (Gx * x + Gy * y + Gz * z) * Δt for every spin of a
+    phantom
+"""
+function KomaMRIBase.cumtrapz(Δt::AbstractArray{T}, x::AbstractArray{T}, backend::KA.GPU) where {T<:Real}
+    y = (x[:, 2:end] .+ x[:, 1:end-1]) .* (Δt / 2)
+    cumsum_matrix_rows_kernel!(backend)(y, ndrange=size(y,1))
+    KA.synchronize(backend)
+    return y
+end
+
+#If on CPU, forward call to cumtrapz in KomaMRIBase
+KomaMRIBase.cumtrapz(Δt::AbstractArray{T}, x::AbstractArray{T}, backend::KA.CPU) where {T<:Real} = KomaMRIBase.cumtrapz(Δt, x)
diff --git a/KomaMRICore/src/simulation/Functors.jl b/KomaMRICore/src/simulation/Functors.jl
@@ -22,7 +22,7 @@ See also [`f32`](@ref) and [`f64`](@ref) to change element type only.
 # Examples
 ```julia
 using CUDA
-x = gpu(x)
+x = x |> gpu
 ```
 """
 function gpu(x)
@@ -31,7 +31,8 @@ function gpu(x)
     if (BACKEND[] isa KA.GPU)
         return gpu(x, BACKEND[])
     else
-        @error "function 'gpu' called with no functional backends available. Add 'using CUDA / Metal / AMDGPU / oneAPI' to your code and try again"
+        @warn "function 'gpu' called with no functional backends available. Add 'using CUDA / Metal / AMDGPU / oneAPI' to your code and try again"
+        return x
     end
 end
 

diff --git a/KomaMRICore/src/simulation/SimulatorCore.jl b/KomaMRICore/src/simulation/SimulatorCore.jl
@@ -2,6 +2,7 @@ abstract type SimulationMethod end #get all available types by using subtypes(Ko
 abstract type SpinStateRepresentation{T<:Real} end #get all available types by using subtypes(KomaMRI.SpinStateRepresentation)
 
 #Defined methods:
+include("Bloch/KernelFunctions.jl")
 include("Bloch/BlochSimulationMethod.jl")       #Defines Bloch simulation method
 include("Bloch/BlochDictSimulationMethod.jl")   #Defines BlochDict simulation method
 
@@ -82,15 +83,16 @@ function run_spin_precession_parallel!(
     seq::DiscreteSequence{T},
     sig::AbstractArray{Complex{T}},
     Xt::SpinStateRepresentation{T},
-    sim_method::SimulationMethod;
+    sim_method::SimulationMethod,
+    backend::KA.Backend;
     Nthreads=Threads.nthreads(),
 ) where {T<:Real}
     parts = kfoldperm(length(obj), Nthreads)
     dims = [Colon() for i in 1:(ndims(sig) - 1)] # :,:,:,... Ndim times
 
     ThreadsX.foreach(enumerate(parts)) do (i, p)
         run_spin_precession!(
-            @view(obj[p]), seq, @view(sig[dims..., i]), @view(Xt[p]), sim_method
+            @view(obj[p]), seq, @view(sig[dims..., i]), @view(Xt[p]), sim_method, backend
         )
     end
 
@@ -166,7 +168,8 @@ function run_sim_time_iter!(
     seq::DiscreteSequence,
     sig::AbstractArray{Complex{T}},
     Xt::SpinStateRepresentation{T},
-    sim_method::SimulationMethod;
+    sim_method::SimulationMethod,
+    backend::KA.Backend;
     Nblocks=1,
     Nthreads=Threads.nthreads(),
     parts=[1:length(seq)],
@@ -193,7 +196,7 @@ function run_sim_time_iter!(
             rfs += 1
         else
             run_spin_precession_parallel!(
-                obj, seq_block, @view(sig[acq_samples, dims...]), Xt, sim_method; Nthreads
+                obj, seq_block, @view(sig[acq_samples, dims...]), Xt, sim_method, backend; Nthreads
             )
         end
         samples += Nadc
@@ -357,10 +360,10 @@ function simulate(
     if backend isa KA.GPU
         isnothing(sim_params["gpu_device"]) || set_device!(backend, sim_params["gpu_device"])
         gpu_name = device_name(backend)
-        obj = gpu(obj, backend) #Phantom
-        seqd = gpu(seqd, backend) #DiscreteSequence
-        Xt = gpu(Xt, backend) #SpinStateRepresentation
-        sig = gpu(sig, backend) #Signal
+        obj = obj |> gpu #Phantom
+        seqd = seqd |> gpu #DiscreteSequence
+        Xt = Xt |> gpu #SpinStateRepresentation
+        sig = sig |> gpu #Signal
     end
 
     # Simulation
@@ -373,7 +376,8 @@ function simulate(
         seqd,
         sig,
         Xt,
-        sim_params["sim_method"];
+        sim_params["sim_method"],
+        backend;
         Nblocks=length(parts),
         Nthreads=sim_params["Nthreads"],
         parts,

diff --git a/KomaMRICore/test/Project.toml b/KomaMRICore/test/Project.toml
@@ -1,5 +1,6 @@
 [deps]
 HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 KomaMRIBase = "d0bc0b20-b151-4d03-b2a4-6ca51751cb9c"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Suppressor = "fd094767-a336-5f1f-9728-57cf17d0bbfb"

diff --git a/KomaMRICore/test/runtests.jl b/KomaMRICore/test/runtests.jl
@@ -1,8 +1,9 @@
 using TestItems, TestItemRunner
 
 ### NOTE: by default, tests are run on the CPU with the number of threads set to 
-#   Threads.nthreads(). To run on a specific GPU backend, pass the name of the 
-#   trigger package ("AMDGPU", "CUDA", "Metal", or "oneAPI") as a test argument. 
+#   Threads.nthreads(). To run on a specific GPU backend, add the name of the 
+#   backend package ("AMDGPU", "CUDA", "Metal", or "oneAPI") to the test/Project.toml
+#   file in KomaMRICore and pass the name as a test argument.
 #   
 #   Example:
 #
@@ -385,3 +386,28 @@ end
     # For the time being, always pass the test
     @test true
 end
+
+@testitem "GPU Functions" tags=[:core] begin
+    using Suppressor
+    import KernelAbstractions as KA
+    include("initialize.jl")
+
+    x = ones(Float32, 1000)
+    @suppress begin
+        y = x |> gpu
+        if USE_GPU
+            @test KA.get_backend(y) isa KA.GPU
+            y = y |> cpu
+            @test KA.get_backend(y) isa KA.CPU
+        else
+            # Test that gpu and cpu are no-ops
+            @test y == x
+            y = y |> cpu
+            @test y == x
+        end
+    end
+
+
+    @suppress print_devices()
+    @test true
+end