JuliaHealth · rkierulf · Jul 31, 2024 · Jul 22, 2024 · Jul 22, 2024 · Jul 25, 2024
@@ -14,7 +14,7 @@ using MRIBase
     Profile, RawAcquisitionData, AcquisitionData, AcquisitionHeader, EncodingCounters, Limit
 using MAT   # For loading example phantoms
 
-global γ = 42.5774688e6 # Hz/T gyromagnetic constant for H1, JEMRIS uses 42.5756 MHz/T
+const global γ = 42.5774688e6 # Hz/T gyromagnetic constant for H1, JEMRIS uses 42.5756 MHz/T
 
 # Hardware
 include("datatypes/Scanner.jl")
@@ -43,7 +43,7 @@ export discretize, get_adc_phase_compensation, get_adc_sampling_times
 export is_Gx_on, is_Gy_on, is_Gz_on, is_RF_on, is_ADC_on
 export times, ampls, freqs
 # This are also used for simulation
-export kfoldperm, trapz, cumtrapz
+export kfoldperm, trapz, cumtrapz, trapz!, cumtrapz!
 # Phantom
 export brain_phantom2D, brain_phantom3D, pelvis_phantom2D, heart_phantom
 # Motion

diff --git a/KomaMRIBase/src/timing/TrapezoidalIntegration.jl b/KomaMRIBase/src/timing/TrapezoidalIntegration.jl
@@ -28,6 +28,10 @@
     y = sum(y)
     return y
 end
+@inline function trapz!(ϕ::AbstractVector{T}, Tz::AbstractMatrix{TX}, Δt::AbstractMatrix{T}, x::AbstractMatrix{TX}) where {T<:Real, TX<:Union{T, Complex{T}}} # in-place: Tz receives the per-interval terms (x[k] + x[k+1]) * Δt * (-πγ), ϕ receives their sum along dims=2
+    @views Tz .= (x[:, 2:end] .+ x[:, 1:end-1]) .* Δt .* T(-π .* γ) # @views: read the two shifted column ranges without copying them first
+    ϕ .= sum(Tz, dims=2)
+end
 
 """
     y = cumtrapz(Δt, x)
@@ -45,12 +49,16 @@
     phantom
 """
 function cumtrapz(Δt::AbstractArray{T}, x::AbstractArray{T}) where {T<:Real}
-    y =  (x[:, 2:end] .+ x[:, 1:end-1]) .* (Δt / 2)
+    y =  (x[:, 2:end] .+ x[:, 1:end-1]) .* (Δt ./ 2)
     y = cumsum(y, dims=2)
     return y
 end
 function cumtrapz(Δt::AbstractVector{T}, x::AbstractVector{T}) where {T<:Real}
-    y = (x[2:end] .+ x[1:end-1]) .* (Δt / 2)
+    y = (x[2:end] .+ x[1:end-1]) .* (Δt ./ 2)
     y = cumsum(y)
     return y
 end
+@inline function cumtrapz!(ϕ::AbstractArray{T}, Tz::AbstractArray{T}, Δt::AbstractArray{T}, Bz::AbstractArray{T}) where {T<:Real} # in-place: Tz receives the per-interval terms (Bz[k] + Bz[k+1]) * Δt * (-πγ), ϕ receives their running sum along dims=2
+    @views Tz .= (Bz[:,2:end] .+ Bz[:,1:end-1]) .* Δt .* T(-π .* γ) # @views: read the two shifted column ranges without copying them first
+    cumsum!(ϕ, Tz, dims=2)
+end
diff --git a/KomaMRICore/Project.toml b/KomaMRICore/Project.toml
@@ -31,7 +31,7 @@ CUDA = "3, 4, 5"
 Functors = "0.4"
 KernelAbstractions = "0.9"
 KomaMRIBase = "0.9"
-Metal = "1"
+Metal = "1.2"
 ProgressMeter = "1"
 Reexport = "1"
 ThreadsX = "0.1"

@@ -9,6 +9,7 @@
 KomaMRICore.set_device!(::ROCBackend, dev_idx::Integer) = AMDGPU.device_id!(dev_idx)
 KomaMRICore.set_device!(::ROCBackend, dev::AMDGPU.HIPDevice) = AMDGPU.device!(dev)
 KomaMRICore.device_name(::ROCBackend) = AMDGPU.HIP.name(AMDGPU.device())
+@inline KomaMRICore._cis(x) = cis(x) # AMDGPU extension's method for the KomaMRICore._cis hook: forwards to cis
 
 function KomaMRICore._print_devices(::ROCBackend)
     devices = [

@@ -8,6 +8,7 @@
 KomaMRICore.isfunctional(::CUDABackend) = CUDA.functional()
 KomaMRICore.set_device!(::CUDABackend, val) = CUDA.device!(val)
 KomaMRICore.device_name(::CUDABackend) = CUDA.name(CUDA.device())
+@inline KomaMRICore._cis(x) = cis(x) # CUDA extension's method for the KomaMRICore._cis hook: forwards to cis
 
 function KomaMRICore._print_devices(::CUDABackend)
     devices = [

@@ -11,21 +11,14 @@ KomaMRICore.isfunctional(::MetalBackend) = Metal.functional()
 KomaMRICore.set_device!(::MetalBackend, device_index::Integer) = device_index == 1 || @warn "Metal does not support multiple gpu devices. Ignoring the device setting."
 KomaMRICore.set_device!(::MetalBackend, dev::Metal.MTLDevice) = Metal.device!(dev)
 KomaMRICore.device_name(::MetalBackend) = String(Metal.current_device().name)
+@inline KomaMRICore._cis(x) = cis(x) # Metal extension's method for the KomaMRICore._cis hook: forwards to cis
 
 function KomaMRICore._print_devices(::MetalBackend)
     @info "Metal device type: $(KomaMRICore.device_name(MetalBackend()))"
 end
 
-#Temporary workaround for https://github.com/JuliaGPU/Metal.jl/issues/348
-#Once run_spin_excitation! and run_spin_precession! are kernel-based, this code
-#can be removed
-Base.cumsum(x::MtlVector{T}) where T = convert(MtlVector{T}, cumsum(KomaMRICore.cpu(x)))
-Base.cumsum(x::MtlArray{T}; dims) where T = convert(MtlArray{T}, cumsum(KomaMRICore.cpu(x), dims=dims))
-Base.findall(x::MtlVector{Bool}) = convert(MtlVector, findall(KomaMRICore.cpu(x)))
-
 function __init__()
     push!(KomaMRICore.LOADED_BACKENDS[], MetalBackend())
-    @warn "Metal does not support all array operations used by KomaMRI (https://github.com/JuliaGPU/Metal.jl/issues/348). GPU performance may be slower than expected"
 end
 
 end

@@ -8,6 +8,7 @@
 KomaMRICore.isfunctional(::oneAPIBackend) = oneAPI.functional()
 KomaMRICore.set_device!(::oneAPIBackend, val) = oneAPI.device!(val)
 KomaMRICore.device_name(::oneAPIBackend) = oneAPI.properties(oneAPI.device()).name
+@inline KomaMRICore._cis(x) = cos(x) + im * sin(x) # oneAPI extension's method for the KomaMRICore._cis hook, written as cos + im*sin instead of calling cis. NOTE(review): the CUDA, AMDGPU and Metal extensions define the same untyped _cis(x) signature — confirm that loading two backends together cannot overwrite this method
 
 function KomaMRICore._print_devices(::oneAPIBackend)
     devices = [
@@ -20,9 +21,41 @@
 #Temporary workaround since oneAPI.jl (similar to Metal) does not support some array operations
 #Once run_spin_excitation! and run_spin_precession! are kernel-based, this code can be removed
 Base.cumsum(x::oneVector{T}) where T = convert(oneVector{T}, cumsum(KomaMRICore.cpu(x)))
-Base.cumsum(x::oneArray{T}; dims) where T = convert(oneArray{T}, cumsum(KomaMRICore.cpu(x), dims=dims))
 Base.findall(x::oneVector{Bool}) = convert(oneVector, findall(KomaMRICore.cpu(x)))
 
+using KernelAbstractions: @index, @kernel, @Const, synchronize
+
+"""Naive cumsum along `dims=2` for a oneAPI matrix, parallelized along the first dimension (one work-item per row). Throws `ArgumentError` for any other `dims`."""
+Base.cumsum(A::oneArray{T}; dims) where T = begin
+    # Fail loudly: the kernel only scans along the second dimension, so merely logging would silently return a dims=2 result
+    dims == 2 || throw(ArgumentError("oneAPI cumsum implementation only supports keyword argument dims=2"))
+    backend = oneAPIBackend()
+    B = similar(A)
+    naive_cumsum!(backend)(B, A, ndrange=size(A,1))
+    synchronize(backend) # wait for the kernel to finish before handing B back
+    return B
+end
+
+Base.cumsum!(B::oneArray{T}, A::oneArray{T}; dims) where T = begin # in-place cumsum of A along dims=2 written into B; returns B, as Base.cumsum! does
+    dims == 2 || throw(ArgumentError("oneAPI cumsum implementation only supports keyword argument dims=2")) # throw rather than log, otherwise a dims=2 scan is silently returned
+    backend = oneAPIBackend()
+    naive_cumsum!(backend)(B, A, ndrange=size(A,1))
+    synchronize(backend) # wait for the kernel to finish before B is used
+    return B
+end
+
+## COV_EXCL_START
+@kernel function naive_cumsum!(B, @Const(A)) # each work-item i scans row i of A left to right, writing the running sum into row i of B
+    i = @index(Global)
+
+    cur_val = zero(eltype(A)) # accumulator takes A's element type, so Float64 or complex inputs keep a single type through the loop
+    for k ∈ 1:size(A, 2)
+        @inbounds cur_val += A[i, k]
+        @inbounds B[i, k] = cur_val
+    end
+end
+## COV_EXCL_STOP
+
 function __init__()
     push!(KomaMRICore.LOADED_BACKENDS[], oneAPIBackend())
     @warn "oneAPI does not support all array operations used by KomaMRI. GPU performance may be slower than expected"

diff --git a/KomaMRICore/src/simulation/GPUFunctions.jl b/KomaMRICore/src/simulation/GPUFunctions.jl
@@ -9,6 +9,10 @@ _print_devices(::KA.CPU) = @info "CPU: $(length(Sys.cpu_info())) x $(Sys.cpu_inf
 name(::KA.CPU) = "CPU"
 set_device!(backend, val) = @error "set_device! called with invalid parameter types: '$(typeof(backend))', '$(typeof(val))'" 
 
+#oneAPI.jl doesn't support cis (https://github.com/JuliaGPU/oneAPI.jl/pull/443), so
+#for now each backend extension provides its own implementation of _cis
+function _cis end
+
 """
     get_backend(use_gpu)
 

diff --git a/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl b/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl
@@ -1,11 +1,11 @@
-"""Stores preallocated structs for use in Bloch CPU run_spin_precession function."""
+"""Stores preallocated structs for use in Bloch CPU run_spin_precession! and run_spin_excitation! functions."""
 struct BlochCPUPrealloc{T} <: PreallocResult{T}
     M::Mag{T}                               # Mag{T}
     Bz_old::AbstractVector{T}               # Vector{T}(Nspins x 1)
     Bz_new::AbstractVector{T}               # Vector{T}(Nspins x 1)
     ϕ::AbstractVector{T}                    # Vector{T}(Nspins x 1)
-    φ::AbstractVector{T}                    # Vector{T}(Nspins x 1)
     Rot::Spinor{T}                          # Spinor{T}
+    scaled_Δw::AbstractVector{T}            # Vector{T}(Nspins x 1)
 end
 
 Base.view(p::BlochCPUPrealloc, i::UnitRange) = begin
@@ -14,13 +14,13 @@ Base.view(p::BlochCPUPrealloc, i::UnitRange) = begin
         p.Bz_old[i],
         p.Bz_new[i],
         p.ϕ[i],
-        p.φ[i],
-        p.Rot[i]
+        p.Rot[i],
+        p.scaled_Δw[i]
     )
 end
 
-"""Preallocates arrays for use in run_spin_precession."""
-function prealloc(sim_method::Bloch, backend::KA.CPU, obj::Phantom{T}, M::Mag{T}) where {T<:Real}
+"""Preallocates arrays for use in run_spin_precession! and run_spin_excitation!."""
+function prealloc(sim_method::Bloch, backend::KA.CPU, obj::Phantom{T}, M::Mag{T}, max_block_length::Integer, precalc) where {T<:Real}
     return BlochCPUPrealloc(
         Mag(
             similar(M.xy),
@@ -29,11 +29,11 @@ function prealloc(sim_method::Bloch, backend::KA.CPU, obj::Phantom{T}, M::Mag{T}
         similar(obj.x),
         similar(obj.x),
         similar(obj.x),
-        similar(obj.x),
         Spinor(
             similar(M.xy),
             similar(M.xy)
-        )
+        ),
+        obj.Δw ./ T(2π .* γ)
     )
 end
 
@@ -63,8 +63,9 @@ function run_spin_precession!(
     Bz_new = prealloc.Bz_new
     ϕ = prealloc.ϕ
     Mxy = prealloc.M.xy
+    scaled_Δw = prealloc.scaled_Δw
     fill!(ϕ, zero(T))
-    @. Bz_old = x[:,1] * seq.Gx[1] + y[:,1] * seq.Gy[1] + z[:,1] * seq.Gz[1] + p.Δw / T(2π * γ)
+    @. Bz_old = x[:,1] * seq.Gx[1] + y[:,1] * seq.Gy[1] + z[:,1] * seq.Gz[1] + scaled_Δw
 
     # Fill sig[1] if needed
     ADC_idx = 1
@@ -79,7 +80,7 @@ function run_spin_precession!(
         t_seq += seq.Δt[seq_idx-1]
 
         #Effective Field
-        @. Bz_new = x * seq.Gx[seq_idx] + y * seq.Gy[seq_idx] + z * seq.Gz[seq_idx] + p.Δw / T(2π * γ)
+        @. Bz_new = x * seq.Gx[seq_idx] + y * seq.Gy[seq_idx] + z * seq.Gz[seq_idx] + scaled_Δw
 
         #Rotation
         @. ϕ += (Bz_old + Bz_new) * T(-π * γ) * seq.Δt[seq_idx-1]
@@ -116,24 +117,21 @@ function run_spin_excitation!(
     backend::KA.CPU,
     prealloc::BlochCPUPrealloc
 ) where {T<:Real}
-    ΔBz = prealloc.Bz_old
-    Bz = prealloc.Bz_new
-    B = prealloc.ϕ
-    φ = prealloc.φ
+    Bz = prealloc.Bz_old
+    B = prealloc.Bz_new
+    φ = prealloc.ϕ
     α = prealloc.Rot.α
     β = prealloc.Rot.β
+    scaled_Δw = prealloc.scaled_Δw
     Maux_xy = prealloc.M.xy
     Maux_z = prealloc.M.z
 
-    #Can be calculated outside of loop
-    @. ΔBz = p.Δw / T(2π * γ)
-
     #Simulation
     for s in seq #This iterates over seq, "s = seq[i,:]"
         #Motion
         x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, s.t)
         #Effective field
-        @. Bz = (s.Gx * x + s.Gy * y + s.Gz * z) + ΔBz - s.Δf / T(γ) # ΔB_0 = (B_0 - ω_rf/γ), Need to add a component here to model scanner's dB0(x,y,z)
+        @. Bz = (s.Gx * x + s.Gy * y + s.Gz * z) + scaled_Δw - s.Δf / T(γ) # ΔB_0 = (B_0 - ω_rf/γ), Need to add a component here to model scanner's dB0(x,y,z)
         @. B = sqrt(abs(s.B1)^2 + abs(Bz)^2)
         @. B[B == 0] = eps(T)
         #Spinor Rotation