Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize run_spin_precession! for GPU #459

Merged
merged 13 commits into from
Jul 31, 2024
4 changes: 2 additions & 2 deletions KomaMRIBase/src/KomaMRIBase.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ using MRIBase
Profile, RawAcquisitionData, AcquisitionData, AcquisitionHeader, EncodingCounters, Limit
using MAT # For loading example phantoms

global γ = 42.5774688e6 # Hz/T gyromagnetic constant for H1, JEMRIS uses 42.5756 MHz/T
const global γ = 42.5774688e6 # Hz/T gyromagnetic constant for H1, JEMRIS uses 42.5756 MHz/T

# Hardware
include("datatypes/Scanner.jl")
Expand Down Expand Up @@ -43,7 +43,7 @@ export discretize, get_adc_phase_compensation, get_adc_sampling_times
export is_Gx_on, is_Gy_on, is_Gz_on, is_RF_on, is_ADC_on
export times, ampls, freqs
# This are also used for simulation
export kfoldperm, trapz, cumtrapz
export kfoldperm, trapz, cumtrapz, trapz!, cumtrapz!
# Phantom
export brain_phantom2D, brain_phantom3D, pelvis_phantom2D, heart_phantom
# Motion
Expand Down
12 changes: 10 additions & 2 deletions KomaMRIBase/src/timing/TrapezoidalIntegration.jl
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@
y = sum(y)
return y
end
@inline function trapz!(ϕ::AbstractVector{T}, Tz::AbstractMatrix{TX}, Δt::AbstractMatrix{T}, x::AbstractMatrix{TX}) where {T<:Real, TX<:Union{T, Complex{T}}}
Tz .= (x[:, 2:end] .+ x[:, 1:end-1]) .* Δt .* T(-π .* γ)
ϕ .= sum(Tz, dims=2)

Check warning on line 33 in KomaMRIBase/src/timing/TrapezoidalIntegration.jl

View check run for this annotation

Codecov / codecov/patch

KomaMRIBase/src/timing/TrapezoidalIntegration.jl#L31-L33

Added lines #L31 - L33 were not covered by tests
end
cncastillo marked this conversation as resolved.
Show resolved Hide resolved

"""
y = cumtrapz(Δt, x)
Expand All @@ -45,12 +49,16 @@
phantom
"""
function cumtrapz(Δt::AbstractArray{T}, x::AbstractArray{T}) where {T<:Real}
y = (x[:, 2:end] .+ x[:, 1:end-1]) .* (Δt / 2)
y = (x[:, 2:end] .+ x[:, 1:end-1]) .* (Δt ./ 2)
y = cumsum(y, dims=2)
return y
end
function cumtrapz(Δt::AbstractVector{T}, x::AbstractVector{T}) where {T<:Real}
y = (x[2:end] .+ x[1:end-1]) .* (Δt / 2)
y = (x[2:end] .+ x[1:end-1]) .* (Δt ./ 2)

Check warning on line 57 in KomaMRIBase/src/timing/TrapezoidalIntegration.jl

View check run for this annotation

Codecov / codecov/patch

KomaMRIBase/src/timing/TrapezoidalIntegration.jl#L57

Added line #L57 was not covered by tests
y = cumsum(y)
return y
end
@inline function cumtrapz!(ϕ::AbstractArray{T}, Tz::AbstractArray{T}, Δt::AbstractArray{T}, Bz::AbstractArray{T}) where {T<:Real}
Tz .= (Bz[:,2:end] .+ Bz[:,1:end-1]) .* Δt .* T(-π .* γ)
cumsum!(ϕ, Tz, dims=2)

Check warning on line 63 in KomaMRIBase/src/timing/TrapezoidalIntegration.jl

View check run for this annotation

Codecov / codecov/patch

KomaMRIBase/src/timing/TrapezoidalIntegration.jl#L61-L63

Added lines #L61 - L63 were not covered by tests
end
cncastillo marked this conversation as resolved.
Show resolved Hide resolved
2 changes: 1 addition & 1 deletion KomaMRICore/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ CUDA = "3, 4, 5"
Functors = "0.4"
KernelAbstractions = "0.9"
KomaMRIBase = "0.9"
Metal = "1"
Metal = "1.2"
ProgressMeter = "1"
Reexport = "1"
ThreadsX = "0.1"
Expand Down
1 change: 1 addition & 0 deletions KomaMRICore/ext/KomaAMDGPUExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
KomaMRICore.set_device!(::ROCBackend, dev_idx::Integer) = AMDGPU.device_id!(dev_idx)
KomaMRICore.set_device!(::ROCBackend, dev::AMDGPU.HIPDevice) = AMDGPU.device!(dev)
KomaMRICore.device_name(::ROCBackend) = AMDGPU.HIP.name(AMDGPU.device())
@inline KomaMRICore._cis(x) = cis(x)

Check warning on line 12 in KomaMRICore/ext/KomaAMDGPUExt.jl

View check run for this annotation

Codecov / codecov/patch

KomaMRICore/ext/KomaAMDGPUExt.jl#L12

Added line #L12 was not covered by tests

function KomaMRICore._print_devices(::ROCBackend)
devices = [
Expand Down
1 change: 1 addition & 0 deletions KomaMRICore/ext/KomaCUDAExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
KomaMRICore.isfunctional(::CUDABackend) = CUDA.functional()
KomaMRICore.set_device!(::CUDABackend, val) = CUDA.device!(val)
KomaMRICore.device_name(::CUDABackend) = CUDA.name(CUDA.device())
@inline KomaMRICore._cis(x) = cis(x)

Check warning on line 11 in KomaMRICore/ext/KomaCUDAExt.jl

View check run for this annotation

Codecov / codecov/patch

KomaMRICore/ext/KomaCUDAExt.jl#L11

Added line #L11 was not covered by tests

function KomaMRICore._print_devices(::CUDABackend)
devices = [
Expand Down
9 changes: 1 addition & 8 deletions KomaMRICore/ext/KomaMetalExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,21 +11,14 @@ KomaMRICore.isfunctional(::MetalBackend) = Metal.functional()
KomaMRICore.set_device!(::MetalBackend, device_index::Integer) = device_index == 1 || @warn "Metal does not support multiple gpu devices. Ignoring the device setting."
KomaMRICore.set_device!(::MetalBackend, dev::Metal.MTLDevice) = Metal.device!(dev)
KomaMRICore.device_name(::MetalBackend) = String(Metal.current_device().name)
@inline KomaMRICore._cis(x) = cis(x)

function KomaMRICore._print_devices(::MetalBackend)
@info "Metal device type: $(KomaMRICore.device_name(MetalBackend()))"
end

#Temporary workaround for https://github.com/JuliaGPU/Metal.jl/issues/348
#Once run_spin_excitation! and run_spin_precession! are kernel-based, this code
#can be removed
Base.cumsum(x::MtlVector{T}) where T = convert(MtlVector{T}, cumsum(KomaMRICore.cpu(x)))
Base.cumsum(x::MtlArray{T}; dims) where T = convert(MtlArray{T}, cumsum(KomaMRICore.cpu(x), dims=dims))
Base.findall(x::MtlVector{Bool}) = convert(MtlVector, findall(KomaMRICore.cpu(x)))

function __init__()
push!(KomaMRICore.LOADED_BACKENDS[], MetalBackend())
@warn "Metal does not support all array operations used by KomaMRI (https://github.com/JuliaGPU/Metal.jl/issues/348). GPU performance may be slower than expected"
end

end
Expand Down
35 changes: 34 additions & 1 deletion KomaMRICore/ext/KomaoneAPIExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
KomaMRICore.isfunctional(::oneAPIBackend) = oneAPI.functional()
KomaMRICore.set_device!(::oneAPIBackend, val) = oneAPI.device!(val)
KomaMRICore.device_name(::oneAPIBackend) = oneAPI.properties(oneAPI.device()).name
@inline KomaMRICore._cis(x) = cos(x) + im * sin(x)

Check warning on line 11 in KomaMRICore/ext/KomaoneAPIExt.jl

View check run for this annotation

Codecov / codecov/patch

KomaMRICore/ext/KomaoneAPIExt.jl#L11

Added line #L11 was not covered by tests

function KomaMRICore._print_devices(::oneAPIBackend)
devices = [
Expand All @@ -20,9 +21,41 @@
#Temporary workaround since oneAPI.jl (similar to Metal) does not support some array operations
#Once run_spin_excitation! and run_spin_precession! are kernel-based, this code can be removed
Base.cumsum(x::oneVector{T}) where T = convert(oneVector{T}, cumsum(KomaMRICore.cpu(x)))
Base.cumsum(x::oneArray{T}; dims) where T = convert(oneArray{T}, cumsum(KomaMRICore.cpu(x), dims=dims))
Base.findall(x::oneVector{Bool}) = convert(oneVector, findall(KomaMRICore.cpu(x)))

using KernelAbstractions: @index, @kernel, @Const, synchronize

"""Naive cumsum implementation for matrix, parallelizes along the first dimension"""
Base.cumsum(A::oneArray{T}; dims) where T = begin
dims == 2 || @error "oneAPI cumsum implementation only supports keyword argument dims=2"
backend = oneAPIBackend()
B = similar(A)
cumsum_kernel = naive_cumsum!(backend)
cumsum_kernel(B, A, ndrange=size(A,1))
synchronize(backend)
return B
end

Base.cumsum!(B::oneArray{T}, A::oneArray{T}; dims) where T = begin
dims == 2 || @error "oneAPI cumsum implementation only supports keyword argument dims=2"
backend = oneAPIBackend()
cumsum_kernel = naive_cumsum!(backend)
cumsum_kernel(B, A, ndrange=size(A,1))
synchronize(backend)
end

## COV_EXCL_START
@kernel function naive_cumsum!(B, @Const(A))
i = @index(Global)

cur_val = 0.0f0
for k ∈ 1:size(A, 2)
@inbounds cur_val += A[i, k]
@inbounds B[i, k] = cur_val
end
end
## COV_EXCL_STOP

function __init__()
push!(KomaMRICore.LOADED_BACKENDS[], oneAPIBackend())
@warn "oneAPI does not support all array operations used by KomaMRI. GPU performance may be slower than expected"
Expand Down
4 changes: 4 additions & 0 deletions KomaMRICore/src/simulation/GPUFunctions.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ _print_devices(::KA.CPU) = @info "CPU: $(length(Sys.cpu_info())) x $(Sys.cpu_inf
name(::KA.CPU) = "CPU"
set_device!(backend, val) = @error "set_device! called with invalid parameter types: '$(typeof(backend))', '$(typeof(val))'"

#oneAPI.jl doesn't support cis (https://github.com/JuliaGPU/oneAPI.jl/pull/443), so
#for now we use a custom function for each backend to implement
function _cis end

"""
get_backend(use_gpu)

Expand Down
34 changes: 16 additions & 18 deletions KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
"""Stores preallocated structs for use in Bloch CPU run_spin_precession function."""
"""Stores preallocated structs for use in Bloch CPU run_spin_precession! and run_spin_excitation! functions."""
struct BlochCPUPrealloc{T} <: PreallocResult{T}
M::Mag{T} # Mag{T}
Bz_old::AbstractVector{T} # Vector{T}(Nspins x 1)
Bz_new::AbstractVector{T} # Vector{T}(Nspins x 1)
ϕ::AbstractVector{T} # Vector{T}(Nspins x 1)
φ::AbstractVector{T} # Vector{T}(Nspins x 1)
Rot::Spinor{T} # Spinor{T}
scaled_Δw::AbstractVector{T} # Vector{T}(Nspins x 1)
end
cncastillo marked this conversation as resolved.
Show resolved Hide resolved

Base.view(p::BlochCPUPrealloc, i::UnitRange) = begin
Expand All @@ -14,13 +14,13 @@ Base.view(p::BlochCPUPrealloc, i::UnitRange) = begin
p.Bz_old[i],
p.Bz_new[i],
p.ϕ[i],
p.φ[i],
p.Rot[i]
p.Rot[i],
p.scaled_Δw[i]
)
end

"""Preallocates arrays for use in run_spin_precession."""
function prealloc(sim_method::Bloch, backend::KA.CPU, obj::Phantom{T}, M::Mag{T}) where {T<:Real}
"""Preallocates arrays for use in run_spin_precession! and run_spin_excitation!."""
function prealloc(sim_method::Bloch, backend::KA.CPU, obj::Phantom{T}, M::Mag{T}, max_block_length::Integer, precalc) where {T<:Real}
return BlochCPUPrealloc(
Mag(
similar(M.xy),
Expand All @@ -29,11 +29,11 @@ function prealloc(sim_method::Bloch, backend::KA.CPU, obj::Phantom{T}, M::Mag{T}
similar(obj.x),
similar(obj.x),
similar(obj.x),
similar(obj.x),
Spinor(
similar(M.xy),
similar(M.xy)
)
),
obj.Δw ./ T(2π .* γ)
)
end

Expand Down Expand Up @@ -63,8 +63,9 @@ function run_spin_precession!(
Bz_new = prealloc.Bz_new
ϕ = prealloc.ϕ
Mxy = prealloc.M.xy
scaled_Δw = prealloc.scaled_Δw
fill!(ϕ, zero(T))
@. Bz_old = x[:,1] * seq.Gx[1] + y[:,1] * seq.Gy[1] + z[:,1] * seq.Gz[1] + p.Δw / T(2π * γ)
@. Bz_old = x[:,1] * seq.Gx[1] + y[:,1] * seq.Gy[1] + z[:,1] * seq.Gz[1] + scaled_Δw

# Fill sig[1] if needed
ADC_idx = 1
Expand All @@ -79,7 +80,7 @@ function run_spin_precession!(
t_seq += seq.Δt[seq_idx-1]

#Effective Field
@. Bz_new = x * seq.Gx[seq_idx] + y * seq.Gy[seq_idx] + z * seq.Gz[seq_idx] + p.Δw / T(2π * γ)
@. Bz_new = x * seq.Gx[seq_idx] + y * seq.Gy[seq_idx] + z * seq.Gz[seq_idx] + scaled_Δw

#Rotation
@. ϕ += (Bz_old + Bz_new) * T(-π * γ) * seq.Δt[seq_idx-1]
Expand Down Expand Up @@ -116,24 +117,21 @@ function run_spin_excitation!(
backend::KA.CPU,
prealloc::BlochCPUPrealloc
) where {T<:Real}
ΔBz = prealloc.Bz_old
Bz = prealloc.Bz_new
B = prealloc.ϕ
φ = prealloc.φ
Bz = prealloc.Bz_old
B = prealloc.Bz_new
φ = prealloc.ϕ
α = prealloc.Rot.α
β = prealloc.Rot.β
scaled_Δw = prealloc.scaled_Δw
Maux_xy = prealloc.M.xy
Maux_z = prealloc.M.z

#Can be calculated outside of loop
@. ΔBz = p.Δw / T(2π * γ)

#Simulation
for s in seq #This iterates over seq, "s = seq[i,:]"
#Motion
x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, s.t)
#Effective field
@. Bz = (s.Gx * x + s.Gy * y + s.Gz * z) + ΔBz - s.Δf / T(γ) # ΔB_0 = (B_0 - ω_rf/γ), Need to add a component here to model scanner's dB0(x,y,z)
@. Bz = (s.Gx * x + s.Gy * y + s.Gz * z) + scaled_Δw - s.Δf / T(γ) # ΔB_0 = (B_0 - ω_rf/γ), Need to add a component here to model scanner's dB0(x,y,z)
@. B = sqrt(abs(s.B1)^2 + abs(Bz)^2)
@. B[B == 0] = eps(T)
#Spinor Rotation
Expand Down
Loading