Merge pull request #84 from eth-cscs/gpu-exts
Use extensions for GPU dependencies
omlins authored Jan 22, 2024
2 parents 13a00ff + 65e3e49 commit d6d5c03
Showing 29 changed files with 816 additions and 507 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
@@ -20,9 +20,9 @@ jobs:
fail-fast: false
matrix:
version:
# - '1.7' # Skipping this version because of AMDGPU deps compat issue (rocBLAS_jll)
- '1.9' # Minimum version supporting extensions
- '1' # Latest stable 1.x release of Julia
- 'nightly'
# - 'nightly'
os:
- ubuntu-latest
- macOS-latest
8 changes: 5 additions & 3 deletions Project.toml
@@ -4,15 +4,17 @@ uuid = "4d7a3746-15be-11ea-1130-334b0c4f5fa0"
version = "0.14.0"

[deps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"

[weakdeps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"

[extensions]
ImplicitGlobalGrid_LoopVectorizationExt = "LoopVectorization"
ImplicitGlobalGrid_AMDGPUExt = "AMDGPU"
ImplicitGlobalGrid_CUDAExt = "CUDA"

[compat]
AMDGPU = "0.5, 0.6, 0.7, 0.8"
@@ -27,4 +29,4 @@ MPIPreferences = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test", "MPIPreferences", "LoopVectorization"]
test = ["Test", "MPIPreferences", "AMDGPU", "CUDA", "LoopVectorization"]
17 changes: 11 additions & 6 deletions README.md
@@ -45,7 +45,8 @@ The following Multi-GPU 3-D heat diffusion solver illustrates how these function
## 50-lines Multi-GPU example
This simple Multi-GPU 3-D heat diffusion solver uses ImplicitGlobalGrid. It relies fully on the broadcasting capabilities of [CUDA.jl]'s `CuArray` type to perform the stencil computations with maximal simplicity ([CUDA.jl] also enables writing explicit GPU kernels, which can lead to significantly better performance for these computations).
```julia
using ImplicitGlobalGrid, CUDA
using CUDA # Import CUDA before ImplicitGlobalGrid to activate its CUDA device support
using ImplicitGlobalGrid

@views d_xa(A) = A[2:end , : , : ] .- A[1:end-1, : , : ];
@views d_xi(A) = A[2:end ,2:end-1,2:end-1] .- A[1:end-1,2:end-1,2:end-1];
@@ -108,7 +109,8 @@ ImplicitGlobalGrid provides a function to gather an array from each process into

This enables straightforward in-situ visualization or monitoring of Multi-GPU/CPU applications using e.g. the [Julia Plots package] as shown in the following (the GR backend is used as it is particularly fast according to the [Julia Plots documentation]). It is enough to add a couple of lines to the previous example (omitted unmodified lines are represented with `#(...)`):
```julia
using ImplicitGlobalGrid, CUDA, Plots
using CUDA # Import CUDA before ImplicitGlobalGrid to activate its CUDA device support
using ImplicitGlobalGrid, Plots
#(...)

@views function diffusion3D()
@@ -230,12 +232,15 @@ search: ImplicitGlobalGrid
To see a description of a function type ?<functionname>.
│ Activation of device support
│ The support for a device type (CUDA or AMDGPU) is activated by importing the corresponding module (CUDA or AMDGPU) before
│ importing ImplicitGlobalGrid (the corresponding extension will be loaded).
│ Performance note
│ If the system supports CUDA-aware MPI (for Nvidia GPUs) or
│ ROCm-aware MPI (for AMD GPUs), it may be activated for
│ ImplicitGlobalGrid by setting one of the following environment
│ variables (at latest before the call to init_global_grid):
│ If the system supports CUDA-aware MPI (for Nvidia GPUs) or ROCm-aware MPI (for AMD GPUs), it may be activated for
│ ImplicitGlobalGrid by setting one of the following environment variables (at latest before the call to init_global_grid):
│ shell> export IGG_CUDAAWARE_MPI=1
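
Since the docstring states that the environment variable must be set at the latest before the call to `init_global_grid`, it can also be set from within the Julia session instead of the shell. A hedged sketch, assuming the variable is read from the process environment no earlier than `init_global_grid` (as the docstring above implies):

```julia
# Sketch only: enable CUDA-aware MPI from within Julia instead of the shell.
using CUDA                             # import before ImplicitGlobalGrid to activate CUDA support
using ImplicitGlobalGrid

ENV["IGG_CUDAAWARE_MPI"] = "1"         # must happen before init_global_grid
me, dims = init_global_grid(64, 64, 64)
# ... allocate fields, compute, call update_halo!, ...
finalize_global_grid()
```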
3 changes: 2 additions & 1 deletion examples/diffusion3D_multigpu_CuArrays.jl
@@ -1,4 +1,5 @@
using ImplicitGlobalGrid, CUDA, Plots
using CUDA # Import CUDA before ImplicitGlobalGrid to activate its CUDA device support
using ImplicitGlobalGrid, Plots

@views d_xa(A) = A[2:end , : , : ] .- A[1:end-1, : , : ];
@views d_xi(A) = A[2:end ,2:end-1,2:end-1] .- A[1:end-1,2:end-1,2:end-1];
3 changes: 2 additions & 1 deletion examples/diffusion3D_multigpu_CuArrays_novis.jl
@@ -1,4 +1,5 @@
using ImplicitGlobalGrid, CUDA
using CUDA # Import CUDA before ImplicitGlobalGrid to activate its CUDA device support
using ImplicitGlobalGrid

@views d_xa(A) = A[2:end , : , : ] .- A[1:end-1, : , : ];
@views d_xi(A) = A[2:end ,2:end-1,2:end-1] .- A[1:end-1,2:end-1,2:end-1];
3 changes: 2 additions & 1 deletion examples/diffusion3D_multigpu_CuArrays_onlyvis.jl
@@ -1,4 +1,5 @@
using ImplicitGlobalGrid, CUDA, Plots
using CUDA # Import CUDA before ImplicitGlobalGrid to activate its CUDA device support
using ImplicitGlobalGrid, Plots
#(...)

@views function diffusion3D()
5 changes: 5 additions & 0 deletions ext/ImplicitGlobalGrid_AMDGPUExt.jl
@@ -0,0 +1,5 @@
module ImplicitGlobalGrid_AMDGPUExt
include(joinpath(@__DIR__, "..", "src", "AMDGPUExt", "shared.jl"))
include(joinpath(@__DIR__, "..", "src", "AMDGPUExt", "select_device.jl"))
include(joinpath(@__DIR__, "..", "src", "AMDGPUExt", "update_halo.jl"))
end
5 changes: 5 additions & 0 deletions ext/ImplicitGlobalGrid_CUDAExt.jl
@@ -0,0 +1,5 @@
module ImplicitGlobalGrid_CUDAExt
include(joinpath(@__DIR__, "..", "src", "CUDAExt", "shared.jl"))
include(joinpath(@__DIR__, "..", "src", "CUDAExt", "select_device.jl"))
include(joinpath(@__DIR__, "..", "src", "CUDAExt", "update_halo.jl"))
end
22 changes: 22 additions & 0 deletions src/AMDGPUExt/defaults.jl
@@ -0,0 +1,22 @@
# shared.jl

is_rocarray(A::GGArray) = false


# select_device.jl

function nb_rocdevices end
function rocdevice! end


# update_halo.jl

function free_update_halo_rocbuffers end
function init_rocbufs_arrays end
function init_rocbufs end
function reinterpret_rocbufs end
function reallocate_undersized_rocbufs end
function reregister_rocbufs end
function get_rocsendbufs_raw end
function get_rocrecvbufs_raw end
function allocate_rocstreams end
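
These defaults live in the parent package: `is_rocarray` gets a CPU fallback, while the remaining names are declared as empty function stubs whose methods are added by the AMDGPU extension (see `src/AMDGPUExt/select_device.jl` and `shared.jl` below). A minimal sketch of this stub-plus-extension pattern, with hypothetical module names standing in for ImplicitGlobalGrid and its extension:

```julia
# Hypothetical illustration of the pattern used by this commit (not the package's API).
module MiniParent
    is_gpuarray(A) = false          # fallback, mirrors `is_rocarray(A::GGArray) = false` above
    function nb_gpudevices end      # stub: no methods until an extension defines one
end

module MiniGPUExt                   # stands in for ImplicitGlobalGrid_AMDGPUExt
    import ..MiniParent
    struct FakeROCArray end         # stands in for AMDGPU.ROCArray
    MiniParent.is_gpuarray(::FakeROCArray) = true
    MiniParent.nb_gpudevices() = 1  # the real extension returns length(AMDGPU.devices())
end

MiniParent.is_gpuarray([1.0, 2.0])                 # false: CPU fallback
MiniParent.is_gpuarray(MiniGPUExt.FakeROCArray())  # true: method added by the "extension"
MiniParent.nb_gpudevices()                         # 1
```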
2 changes: 2 additions & 0 deletions src/AMDGPUExt/select_device.jl
@@ -0,0 +1,2 @@
ImplicitGlobalGrid.nb_rocdevices() = length(AMDGPU.devices())
ImplicitGlobalGrid.rocdevice!(device_id) = AMDGPU.device_id!(device_id)
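
These two one-liners are the AMDGPU-specific hooks; the device selection logic itself stays in the parent package and is not part of this diff. A hypothetical illustration of how such hooks could be used to bind each MPI rank to a GPU in round-robin fashion (a real setup would typically use the node-local rank):

```julia
# Hypothetical round-robin GPU selection per MPI rank (illustration only).
using MPI
using AMDGPU                 # import before ImplicitGlobalGrid to activate AMDGPU support
using ImplicitGlobalGrid

MPI.Init()
rank      = MPI.Comm_rank(MPI.COMM_WORLD)
ndev      = ImplicitGlobalGrid.nb_rocdevices()   # forwards to length(AMDGPU.devices())
device_id = mod(rank, ndev) + 1                  # AMDGPU device ids are 1-based
ImplicitGlobalGrid.rocdevice!(device_id)         # forwards to AMDGPU.device_id!(device_id)
```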
42 changes: 42 additions & 0 deletions src/AMDGPUExt/shared.jl
@@ -0,0 +1,42 @@
import ImplicitGlobalGrid
import ImplicitGlobalGrid: GGArray, GGField, GGNumber, halosize, ol, amdgpuaware_MPI, sendranges, recvranges, sendbuf_flat, recvbuf_flat, write_d2x!, read_x2d!, write_d2h_async!, read_h2d_async!, register, is_rocarray
import ImplicitGlobalGrid: NNEIGHBORS_PER_DIM, GG_ALLOC_GRANULARITY
using AMDGPU


##------
## TYPES

const ROCField{T,N} = GGField{T,N,ROCArray{T,N}}


##------------------------------------
## HANDLING OF CUDA AND AMDGPU SUPPORT

ImplicitGlobalGrid.is_loaded(::Val{:ImplicitGlobalGrid_AMDGPUExt}) = true
ImplicitGlobalGrid.is_functional(::Val{:AMDGPU}) = AMDGPU.functional()


##-------------
## SYNTAX SUGAR

ImplicitGlobalGrid.is_rocarray(A::ROCArray) = true #NOTE: this function is only to be used when multiple dispatch on the type of the array seems overkill (in particular when something needs to be done only for the GPU case, but nothing for the CPU case) and as long as performance does not suffer.


##--------------------------------------------------------------------------------
## FUNCTIONS FOR WRAPPING ARRAYS AND FIELDS AND DEFINING ARRAY PROPERTY BASE METHODS

ImplicitGlobalGrid.wrap_field(A::ROCArray, hw::Tuple) = ROCField{eltype(A), ndims(A)}((A, hw))

Base.size(A::ROCField) = Base.size(A.A)
Base.size(A::ROCField, args...) = Base.size(A.A, args...)
Base.length(A::ROCField) = Base.length(A.A)
Base.ndims(A::ROCField) = Base.ndims(A.A)
Base.eltype(A::ROCField) = Base.eltype(A.A)

##---------------
## AMDGPU functions

function ImplicitGlobalGrid.register(::Type{<:ROCArray},buf::Array{T}) where T <: GGNumber
return unsafe_wrap(ROCArray, pointer(buf), size(buf))
end
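
The extension wraps a `ROCArray` together with the tuple `hw` (presumably the per-dimension halo widths) into a `ROCField` and forwards the basic `Base` array queries to the wrapped array stored in the field `A`. A small sketch of how the wrapper behaves (the field name `A` and the constructor form are taken from this diff; the halo-width values are illustrative):

```julia
# Sketch only: wrapping a ROCArray into a ROCField (assumes a functional AMDGPU setup).
using AMDGPU
using ImplicitGlobalGrid

A  = AMDGPU.zeros(Float64, 8, 8, 8)
hw = (1, 1, 1)                                # per-dimension halo widths (illustrative values)
F  = ImplicitGlobalGrid.wrap_field(A, hw)     # -> ROCField{Float64,3}, a GGField backed by a ROCArray

size(F) == size(A)          # true: the Base methods defined above forward to F.A
eltype(F) == Float64        # true
F.A === A                   # the wrapped ROCArray itself
```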