Add Distributed Examples to Documentation (#468)
* Create 4-run-distributed-simulations.md

Add distributed examples in "how-to" section

* Update 4-run-distributed-simulations.md

Correct typo

* Add space

* Make use case for multi-node simulation clearer

* Add set_device! override without backend parameter

* Add function for combining two RawAcquisitionData structs

* Simplify example scripts

* Remove sentence

* Add isapprox function for testing

* Add RawAcquisitionData test

* Add images to assets folder

* Update examples

* Delete docs/src/assets/KomamultiGPU.svg

* Delete docs/src/assets/KomamultiNode.svg

* Delete docs/src/assets/KomamultiNodeCPU.svg

* Add files via upload

* Fix broken image

* Use correct image for multiGPU

* Update to use distributed macro
rkierulf authored Aug 23, 2024
1 parent 57f8698 commit a26d598
Showing 7 changed files with 149 additions and 2 deletions.
19 changes: 19 additions & 0 deletions KomaMRICore/src/rawdata/ISMRMRD.jl
@@ -238,3 +238,22 @@ Base.show(io::IO, raw::RawAcquisitionData) = begin
print(io, "RawAcqData[$seq_name | $(length(raw.profiles)) Profile(s) of $Nt×$Nc]")
end
end

# Sum two acquired signals by adding their profile data element-wise.
# Assumes both signals come from the same sequence, so profiles line up.
Base.:+(sig1::RawAcquisitionData, sig2::RawAcquisitionData) = RawAcquisitionData(
    sig1.params,
    [Profile(
        sig1.profiles[i].head,
        sig1.profiles[i].traj,
        sig1.profiles[i].data .+ sig2.profiles[i].data
    ) for i=1:length(sig1.profiles)]
)

# Approximate equality for acquired signals: true only if every pair of
# corresponding profiles has data matching within the given tolerance.
Base.isapprox(sig1::RawAcquisitionData, sig2::RawAcquisitionData; kwargs...) = begin
    (length(sig1.profiles) == length(sig2.profiles)) || return false

    for i=1:length(sig1.profiles)
        isapprox(sig1.profiles[i].data, sig2.profiles[i].data; kwargs...) || return false
    end

    return true
end
3 changes: 2 additions & 1 deletion KomaMRICore/src/simulation/GPUFunctions.jl
@@ -8,6 +8,7 @@ _print_devices(backend) = @error "_print_devices called with invalid backend typ
_print_devices(::KA.CPU) = @info "CPU: $(length(Sys.cpu_info())) x $(Sys.cpu_info()[1].model)"
name(::KA.CPU) = "CPU"
set_device!(backend, val) = @error "set_device! called with invalid parameter types: '$(typeof(backend))', '$(typeof(val))'"
set_device!(val) = set_device!(get_backend(true), val)

#oneAPI.jl doesn't support cis (https://github.com/JuliaGPU/oneAPI.jl/pull/443), so
#for now we use a custom function for each backend to implement
@@ -96,4 +97,4 @@ function print_devices(use_gpu = true)
_print_devices(backend)
end

-export print_devices
+export print_devices, set_device!
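With the new single-argument method, callers can select a device by index without first obtaining the backend. A minimal sketch of its use (assuming a functional GPU backend such as CUDA.jl is loaded; `0` is a hypothetical device index, and CUDA devices are indexed from 0):

```julia
using KomaMRICore, CUDA

print_devices()   # list the devices visible to this process
set_device!(0)    # select the first CUDA device
```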
21 changes: 20 additions & 1 deletion KomaMRICore/test/runtests.jl
@@ -169,7 +169,26 @@ const CI = get(ENV, "CI", nothing)
@test true
end

# Test ISMRMRD
@testitem "ISMRMRD" tags=[:core] begin
using Suppressor
include("initialize_backend.jl")

seq = PulseDesigner.EPI_example()
sys = Scanner()
obj = brain_phantom2D()
parts = kfoldperm(length(obj), 2)

sim_params = KomaMRICore.default_sim_params()
sim_params["return_type"] = "raw"
sim_params["gpu"] = USE_GPU

sig1 = @suppress simulate(obj[parts[1]], seq, sys; sim_params)
sig2 = @suppress simulate(obj[parts[2]], seq, sys; sim_params)
sig = @suppress simulate(obj, seq, sys; sim_params)

@test isapprox(sig, sig1 + sig2; rtol=0.001)
end

@testitem "signal_to_raw_data" tags=[:core] begin
using Suppressor
include("initialize_backend.jl")
1 change: 1 addition & 0 deletions docs/src/assets/KomamultiGPU.svg
1 change: 1 addition & 0 deletions docs/src/assets/KomamultiNode.svg
1 change: 1 addition & 0 deletions docs/src/assets/KomamultiNodeCPU.svg
105 changes: 105 additions & 0 deletions docs/src/how-to/4-run-distributed-simulations.md
@@ -0,0 +1,105 @@
# Run Distributed Simulations

While KomaMRI provides built-in support for CPU and GPU parallelization, it is sometimes desirable to distribute simulation work even further, across multiple GPUs or compute nodes. This can be done with Distributed.jl by exploiting the independent spin property: each spin in the system evolves independently of the rest, so the phantom can be subdivided into separate simulations whose results are then recombined, as in the diagram below:

```@raw html
<p align="center"><img width="90%" src="../../assets/KomamultiNode.svg"/></p>
```
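As a minimal sketch of this split-and-recombine idea on a single process (based on the test added in this commit; it uses the `"raw"` return type, for which `+` on `RawAcquisitionData` sums the partial signals):

```julia
using KomaMRI

sys = Scanner()
seq = PulseDesigner.EPI_example()
obj = brain_phantom2D()

#Split the phantom spins into two disjoint parts
parts = kfoldperm(length(obj), 2)

#Simulate each part separately, returning raw signals that support +
sim_params = KomaMRICore.default_sim_params()
sim_params["return_type"] = "raw"
sig1 = simulate(obj[parts[1]], seq, sys; sim_params)
sig2 = simulate(obj[parts[2]], seq, sys; sim_params)

#Recombining the partial signals reproduces the full simulation
sig = sig1 + sig2
```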

The following two examples demonstrate how to use Distributed.jl to run a simulation across multiple GPUs, and across multiple nodes in an HPC cluster.

## Using Multiple GPUs

To run a simulation using multiple GPUs, the phantom object can be divided using the kfoldperm function. Distributed.jl can then be used to start one Julia worker process per available device so that each device simulates a different part of the object. The results can then be fetched asynchronously by the main process and combined to produce a final signal. This process is shown in the diagram below:

```@raw html
<p align="center"><img width="90%" src="../../assets/KomamultiGPU.svg"/></p>
```

The code for doing so is shown below:

!!! details "SLURM Script Requesting Multiple GPUs"

    ```sh
    #!/bin/bash
    #SBATCH --job-name          # Enter job name
    #SBATCH -t                  # Enter max runtime for job
    #SBATCH -p                  # Enter partition on which to run the job
    #SBATCH --cpus-per-task=1   # Request 1 CPU
    #SBATCH --gpus=             # Enter number of GPUs to request
    #SBATCH -o                  # Enter file path to write stdout to
    #SBATCH -e                  # Enter file path to write stderr to

    julia script.jl
    ```

```julia
using Distributed
using CUDA

#Add workers based on the number of available devices
addprocs(length(devices()))

#Define inputs on each worker process
@everywhere begin
    using KomaMRI, CUDA
    sys = Scanner()
    seq = PulseDesigner.EPI_example()
    obj = brain_phantom2D()
    #Divide phantom
    parts = kfoldperm(length(obj), nworkers())
end

#Distribute simulation across workers
raw = Distributed.@distributed (+) for i=1:nworkers()
    KomaMRICore.set_device!(i-1) #Set the device for this worker (CUDA devices are indexed from 0)
    simulate(obj[parts[i]], seq, sys)
end
```
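The `(+)` reducer works because `Base.:+` is defined for `RawAcquisitionData`, so the partial raw signals are summed as the main process fetches them. To confirm which device each worker selected, a small diagnostic sketch (assuming the CUDA backend; `CUDA.device()` returns the worker's active device):

```julia
#Check the active device on each worker
for w in workers()
    @show remotecall_fetch(device, w)
end
```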

## Using Multiple Nodes in an HPC Cluster

The script below uses the ClusterManagers.jl package to initialize worker processes on a SLURM cluster, based on the number of tasks specified in the #SBATCH --ntasks directive. This is useful for dividing simulation work among multiple compute nodes when the problem is too large to fit into memory on a single machine, or when the desired number of workers exceeds the CPU cores available on one. An illustration of this is shown below:

```@raw html
<p align="center"><img width="90%" src="../../assets/KomamultiNodeCPU.svg"/></p>
```

!!! details "SLURM Script Requesting Multiple Nodes"

    ```sh
    #!/bin/bash
    #SBATCH --job-name           # Enter job name here
    #SBATCH -t                   # Enter max runtime for job
    #SBATCH -p                   # Enter partition on which to run the job
    #SBATCH --nodes              # Enter number of nodes on which to run the job
    #SBATCH --ntasks             # Should be equal to number of nodes
    #SBATCH --ntasks-per-node=1  # Run each task on a separate node
    #SBATCH --cpus-per-task      # Enter number of CPU threads to use per node
    #SBATCH -o                   # Enter file path to write stdout to
    #SBATCH -e                   # Enter file path to write stderr to

    julia script.jl
    ```

```julia
using Distributed
using ClusterManagers

#Add workers based on the specified number of SLURM tasks
addprocs(SlurmManager(parse(Int, ENV["SLURM_NTASKS"])))

#Define inputs on each worker process
@everywhere begin
    using KomaMRI
    sys = Scanner()
    seq = PulseDesigner.EPI_example()
    obj = brain_phantom2D()
    parts = kfoldperm(length(obj), nworkers())
end

#Distribute simulation across workers
raw = Distributed.@distributed (+) for i=1:nworkers()
    simulate(obj[parts[i]], seq, sys)
end
```
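To validate a distributed run, the `isapprox` method added in this commit compares two raw signals profile by profile; a quick check against a single-process reference (the `rtol=0.001` tolerance mirrors the new test):

```julia
#Compare the distributed result against a single-process reference
raw_ref = simulate(obj, seq, sys)
@assert isapprox(raw_ref, raw; rtol=0.001)
```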

1 comment on commit a26d598

@github-actions (Contributor)


KomaMRI Benchmarks

| Benchmark suite | Current: a26d598 | Previous: 57f8698 | Ratio |
| --- | --- | --- | --- |
| MRI Lab/Bloch/CPU/2 thread(s) | 224991845.5 ns | 226618506 ns | 0.99 |
| MRI Lab/Bloch/CPU/4 thread(s) | 174863887 ns | 174536994 ns | 1.00 |
| MRI Lab/Bloch/CPU/8 thread(s) | 90572834 ns | 146360095.5 ns | 0.62 |
| MRI Lab/Bloch/CPU/1 thread(s) | 347400444 ns | 347644824 ns | 1.00 |
| MRI Lab/Bloch/GPU/CUDA | 57092571.5 ns | 57253633 ns | 1.00 |
| MRI Lab/Bloch/GPU/oneAPI | 522866366 ns | 515042255.5 ns | 1.02 |
| MRI Lab/Bloch/GPU/Metal | 568303458 ns | 541353541 ns | 1.05 |
| MRI Lab/Bloch/GPU/AMDGPU | 36981171 ns | 37619574.5 ns | 0.98 |
| Slice Selection 3D/Bloch/CPU/2 thread(s) | 1151416902 ns | 1024148878 ns | 1.12 |
| Slice Selection 3D/Bloch/CPU/4 thread(s) | 580233533 ns | 580936747 ns | 1.00 |
| Slice Selection 3D/Bloch/CPU/8 thread(s) | 341164837 ns | 386777586 ns | 0.88 |
| Slice Selection 3D/Bloch/CPU/1 thread(s) | 1930414196.5 ns | 1925568005.5 ns | 1.00 |
| Slice Selection 3D/Bloch/GPU/CUDA | 101438582.5 ns | 100754922 ns | 1.01 |
| Slice Selection 3D/Bloch/GPU/oneAPI | 636188156.5 ns | 654922437.5 ns | 0.97 |
| Slice Selection 3D/Bloch/GPU/Metal | 565478250 ns | 564653500 ns | 1.00 |
| Slice Selection 3D/Bloch/GPU/AMDGPU | 60929383 ns | 60779232 ns | 1.00 |

This comment was automatically generated by workflow using github-action-benchmark.
