From 4b64b427d4139bb8bd5e560809ee02080d7cdf3c Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Wed, 17 Jan 2024 15:58:07 +0100 Subject: [PATCH 01/34] update project for using gpu extensions --- Project.toml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Project.toml b/Project.toml index 739de78..0d0aa91 100644 --- a/Project.toml +++ b/Project.toml @@ -4,15 +4,17 @@ uuid = "4d7a3746-15be-11ea-1130-334b0c4f5fa0" version = "0.14.0" [deps] -AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" [weakdeps] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" [extensions] ImplicitGlobalGrid_LoopVectorizationExt = "LoopVectorization" +ImplicitGlobalGrid_AMDGPUExt = "AMDGPU" +ImplicitGlobalGrid_CUDAExt = "CUDA" [compat] AMDGPU = "0.5, 0.6, 0.7, 0.8" @@ -27,4 +29,4 @@ MPIPreferences = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Test", "MPIPreferences", "LoopVectorization"] +test = ["Test", "MPIPreferences", "AMDGPU", "CUDA", "LoopVectorization"] From 8b3d6697db2cddfa06a2b16beaa65a8c7d220cac Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Wed, 17 Jan 2024 15:59:47 +0100 Subject: [PATCH 02/34] add defaults file include for extensions --- src/ImplicitGlobalGrid.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ImplicitGlobalGrid.jl b/src/ImplicitGlobalGrid.jl index 628d799..3721eed 100644 --- a/src/ImplicitGlobalGrid.jl +++ b/src/ImplicitGlobalGrid.jl @@ -42,6 +42,8 @@ using .Exceptions include("shared.jl") ## Alphabetical include of defaults for extensions +include(joinpath("AMDGPUExt", "defaults.jl")) +include(joinpath("CUDAExt", "defaults.jl")) include(joinpath("LoopVectorizationExt", "memcopy_LV_default.jl")) ## Alphabetical include of files From 61d9ccd93c792a10509b99c8847caa8d1c5b6e83 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Wed, 17 Jan 2024 16:03:10 +0100 Subject: [PATCH 03/34] move types and more in shared.jl to extensions --- src/AMDGPUExt/shared.jl | 18 ++++++++++++++++++ src/CUDAExt/shared.jl | 21 +++++++++++++++++++++ src/shared.jl | 37 +++++++++---------------------------- 3 files changed, 48 insertions(+), 28 deletions(-) create mode 100644 src/AMDGPUExt/shared.jl create mode 100644 src/CUDAExt/shared.jl diff --git a/src/AMDGPUExt/shared.jl b/src/AMDGPUExt/shared.jl new file mode 100644 index 0000000..17c588f --- /dev/null +++ b/src/AMDGPUExt/shared.jl @@ -0,0 +1,18 @@ +const ROCField{T,N} = GGField{T,N,ROCArray{T,N}} + +is_rocarray(A::GGArray) = typeof(A) <: ROCArray #NOTE: this function is only to be used when multiple dispatch on the type of the array seems an overkill (in particular when only something needs to be done for the GPU case, but nothing for the CPU case) and as long as performance does not suffer. + +wrap_field(A::ROCArray, hw::Tuple) = ROCField{eltype(A), ndims(A)}((A, hw)) + +Base.size(A::ROCField) = Base.size(A.A) +Base.size(A::ROCField, args...) = Base.size(A.A, args...) 
+Base.length(A::ROCField) = Base.length(A.A) +Base.ndims(A::ROCField) = Base.ndims(A.A) +Base.eltype(A::ROCField) = Base.eltype(A.A) + +##--------------- +## AMDGPU functions + +function register(::Type{<:ROCArray},buf::Array{T}) where T <: GGNumber + return unsafe_wrap(ROCArray, pointer(buf), size(buf)) +end diff --git a/src/CUDAExt/shared.jl b/src/CUDAExt/shared.jl new file mode 100644 index 0000000..d7ca46b --- /dev/null +++ b/src/CUDAExt/shared.jl @@ -0,0 +1,21 @@ +const CuField{T,N} = GGField{T,N,CuArray{T,N}} + +is_cuarray(A::GGArray) = typeof(A) <: CuArray #NOTE: this function is only to be used when multiple dispatch on the type of the array seems an overkill (in particular when only something needs to be done for the GPU case, but nothing for the CPU case) and as long as performance does not suffer. + +wrap_field(A::CuArray, hw::Tuple) = CuField{eltype(A), ndims(A)}((A, hw)) + +Base.size(A::CuField) = Base.size(A.A) +Base.size(A::CuField, args...) = Base.size(A.A, args...) +Base.length(A::CuField) = Base.length(A.A) +Base.ndims(A::CuField) = Base.ndims(A.A) +Base.eltype(A::CuField) = Base.eltype(A.A) + + +##--------------- +## CUDA functions + +function register(::Type{<:CuArray},buf::Array{T}) where T <: GGNumber + rbuf = CUDA.Mem.register(CUDA.Mem.Host, pointer(buf), sizeof(buf), CUDA.Mem.HOSTREGISTER_DEVICEMAP); + rbuf_d = convert(CuPtr{T}, rbuf); + return unsafe_wrap(CuArray, rbuf_d, size(buf)), rbuf; +end diff --git a/src/shared.jl b/src/shared.jl index b9a9907..7d89f7f 100644 --- a/src/shared.jl +++ b/src/shared.jl @@ -40,13 +40,11 @@ const DEVICE_TYPE_AMDGPU = "AMDGPU" const GGInt = Cint const GGNumber = Number -const GGArray{T,N} = Union{Array{T,N}, CuArray{T,N}, ROCArray{T,N}} +const GGArray{T,N} = DenseArray{T,N} # TODO: was Union{Array{T,N}, CuArray{T,N}, ROCArray{T,N}} const GGField{T,N,T_array} = NamedTuple{(:A, :halowidths), Tuple{T_array, Tuple{GGInt,GGInt,GGInt}}} where {T_array<:GGArray{T,N}} const GGFieldConvertible{T,N,T_array} = NamedTuple{(:A, :halowidths), <:Tuple{T_array, Tuple{T2,T2,T2}}} where {T_array<:GGArray{T,N}, T2<:Integer} const GGField{}(t::NamedTuple) = GGField{eltype(t.A),ndims(t.A),typeof(t.A)}((t.A, GGInt.(t.halowidths))) const CPUField{T,N} = GGField{T,N,Array{T,N}} -const CuField{T,N} = GGField{T,N,CuArray{T,N}} -const ROCField{T,N} = GGField{T,N,ROCArray{T,N}} "An GlobalGrid struct contains information on the grid and the corresponding MPI communicator." # Note: type GlobalGrid is immutable, i.e. users can only read, but not modify it (except the actual entries of arrays can be modified, e.g. dims .= dims - useful for writing tests) struct GlobalGrid @@ -116,8 +114,6 @@ any_array(fields::GGField...) = any([is_array(A.A) for A in fields]) any_cuarray(fields::GGField...) = any([is_cuarray(A.A) for A in fields]) any_rocarray(fields::GGField...) = any([is_rocarray(A.A) for A in fields]) is_array(A::GGArray) = typeof(A) <: Array -is_cuarray(A::GGArray) = typeof(A) <: CuArray #NOTE: this function is only to be used when multiple dispatch on the type of the array seems an overkill (in particular when only something needs to be done for the GPU case, but nothing for the CPU case) and as long as performance does not suffer. -is_rocarray(A::GGArray) = typeof(A) <: ROCArray #NOTE: this function is only to be used when multiple dispatch on the type of the array seems an overkill (in particular when only something needs to be done for the GPU case, but nothing for the CPU case) and as long as performance does not suffer. 
##-------------------------------------------------------------------------------- @@ -126,31 +122,16 @@ is_rocarray(A::GGArray) = typeof(A) <: ROCArray #NOTE: this func wrap_field(A::GGField) = A wrap_field(A::GGFieldConvertible) = GGField(A) wrap_field(A::Array, hw::Tuple) = CPUField{eltype(A), ndims(A)}((A, hw)) -wrap_field(A::CuArray, hw::Tuple) = CuField{eltype(A), ndims(A)}((A, hw)) -wrap_field(A::ROCArray, hw::Tuple) = ROCField{eltype(A), ndims(A)}((A, hw)) wrap_field(A::GGArray, hw::Integer...) = wrap_field(A, hw) wrap_field(A::GGArray) = wrap_field(A, hw_default()...) -Base.size(A::Union{GGField, CPUField, CuField, ROCField}) = Base.size(A.A) -Base.size(A::Union{GGField, CPUField, CuField, ROCField}, args...) = Base.size(A.A, args...) -Base.length(A::Union{GGField, CPUField, CuField, ROCField}) = Base.length(A.A) -Base.ndims(A::Union{GGField, CPUField, CuField, ROCField}) = Base.ndims(A.A) -Base.eltype(A::Union{GGField, CPUField, CuField, ROCField}) = Base.eltype(A.A) +Base.size(A::Union{GGField, CPUField}) = Base.size(A.A) +Base.size(A::Union{GGField, CPUField}, args...) = Base.size(A.A, args...) +Base.length(A::Union{GGField, CPUField}) = Base.length(A.A) +Base.ndims(A::Union{GGField, CPUField}) = Base.ndims(A.A) +Base.eltype(A::Union{GGField, CPUField}) = Base.eltype(A.A) -##--------------- -## CUDA functions - -function register(::Type{<:CuArray},buf::Array{T}) where T <: GGNumber - rbuf = CUDA.Mem.register(CUDA.Mem.Host, pointer(buf), sizeof(buf), CUDA.Mem.HOSTREGISTER_DEVICEMAP); - rbuf_d = convert(CuPtr{T}, rbuf); - return unsafe_wrap(CuArray, rbuf_d, size(buf)), rbuf; -end - - -##--------------- -## AMDGPU functions - -function register(::Type{<:ROCArray},buf::Array{T}) where T <: GGNumber - return unsafe_wrap(ROCArray, pointer(buf), size(buf)) -end +##------------------------------------------ +## CUDA AND AMDGPU COMMON EXTENSION DEFAULTS +function register end \ No newline at end of file From 6c148c0feece90a3d6aa1ca9a79356ee3136df26 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Wed, 17 Jan 2024 16:04:08 +0100 Subject: [PATCH 04/34] add defaults file include for extensions --- src/AMDGPUExt/defaults.jl | 1 + src/CUDAExt/defaults.jl | 1 + 2 files changed, 2 insertions(+) create mode 100644 src/AMDGPUExt/defaults.jl create mode 100644 src/CUDAExt/defaults.jl diff --git a/src/AMDGPUExt/defaults.jl b/src/AMDGPUExt/defaults.jl new file mode 100644 index 0000000..627a6fe --- /dev/null +++ b/src/AMDGPUExt/defaults.jl @@ -0,0 +1 @@ +is_rocarray(A::GGArray) = false \ No newline at end of file diff --git a/src/CUDAExt/defaults.jl b/src/CUDAExt/defaults.jl new file mode 100644 index 0000000..184f015 --- /dev/null +++ b/src/CUDAExt/defaults.jl @@ -0,0 +1 @@ +is_cuarray(A::GGArray) = false From d4da7bf4d0a2629afee7e00af38f43202f50ea03 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Wed, 17 Jan 2024 16:04:48 +0100 Subject: [PATCH 05/34] create extension modules --- ext/ImplicitGlobalGrid_AMDGPUExt.jl | 3 +++ ext/ImplicitGlobalGrid_CUDAExt.jl | 3 +++ 2 files changed, 6 insertions(+) create mode 100644 ext/ImplicitGlobalGrid_AMDGPUExt.jl create mode 100644 ext/ImplicitGlobalGrid_CUDAExt.jl diff --git a/ext/ImplicitGlobalGrid_AMDGPUExt.jl b/ext/ImplicitGlobalGrid_AMDGPUExt.jl new file mode 100644 index 0000000..b0bf2e8 --- /dev/null +++ b/ext/ImplicitGlobalGrid_AMDGPUExt.jl @@ -0,0 +1,3 @@ +module ImplicitGlobalGrid_AMDGPUExt + include(joinpath(@__DIR__, "..", "src", "AMDGPUExt", "update_halo.jl")) +end \ No newline at end of file diff --git a/ext/ImplicitGlobalGrid_CUDAExt.jl 
b/ext/ImplicitGlobalGrid_CUDAExt.jl new file mode 100644 index 0000000..2d1b311 --- /dev/null +++ b/ext/ImplicitGlobalGrid_CUDAExt.jl @@ -0,0 +1,3 @@ +module ImplicitGlobalGrid_CUDAExt + include(joinpath(@__DIR__, "..", "src", "CUDAExt", "update_halo.jl")) +end \ No newline at end of file From cc3767b31d2582d8f10bff93fe85525f11777b74 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Wed, 17 Jan 2024 16:06:16 +0100 Subject: [PATCH 06/34] move code related to buffer allocation to extensions --- src/AMDGPUExt/update_halo.jl | 99 +++++++++++++++++++++ src/CUDAExt/update_halo.jl | 111 ++++++++++++++++++++++++ src/update_halo.jl | 161 +++-------------------------------- 3 files changed, 221 insertions(+), 150 deletions(-) create mode 100644 src/AMDGPUExt/update_halo.jl create mode 100644 src/CUDAExt/update_halo.jl diff --git a/src/AMDGPUExt/update_halo.jl b/src/AMDGPUExt/update_halo.jl new file mode 100644 index 0000000..acbdea7 --- /dev/null +++ b/src/AMDGPUExt/update_halo.jl @@ -0,0 +1,99 @@ +##--------------------------------------- +## FUNCTIONS RELATED TO BUFFER ALLOCATION + +# NOTE: CUDA and AMDGPU buffers live and are dealt with independently, enabling the support of usage of CUDA and AMD GPUs at the same time. + +let + global free_update_halo_rocbuffers, reset_roc_buffers, free_rocbufs + global gpusendbuf, gpurecvbuf, gpusendbuf_flat, gpurecvbuf_flat + rocsendbufs_raw = nothing + rocrecvbufs_raw = nothing + # INFO: no need for roc host buffers + + function free_update_halo_rocbuffers() + free_rocbufs(rocsendbufs_raw) + free_rocbufs(rocrecvbufs_raw) + # INFO: no need for roc host buffers + reset_roc_buffers() + end + + function free_rocbufs(bufs) + if (bufs !== nothing) + for i = 1:length(bufs) + for n = 1:length(bufs[i]) + if is_rocarray(bufs[i][n]) AMDGPU.unsafe_free!(bufs[i][n]); bufs[i][n] = []; end # DEBUG: unsafe_free should be managed in AMDGPU + end + end + end + end + + # INFO: no need for roc host buffers + # function unregister_rocbufs(bufs) + # end + + function reset_roc_buffers() + rocsendbufs_raw = nothing + rocrecvbufs_raw = nothing + # INFO: no need for roc host buffers + end + + + # (AMDGPU functions) + + function init_rocbufs_arrays() + rocsendbufs_raw = Array{Array{Any,1},1}(); + rocrecvbufs_raw = Array{Array{Any,1},1}(); + # INFO: no need for roc host buffers + end + + function init_rocbufs(T::DataType, fields::GGField...) + while (length(rocsendbufs_raw) < length(fields)) push!(rocsendbufs_raw, [ROCArray{T}(undef,0), ROCArray{T}(undef,0)]); end + while (length(rocrecvbufs_raw) < length(fields)) push!(rocrecvbufs_raw, [ROCArray{T}(undef,0), ROCArray{T}(undef,0)]); end + # INFO: no need for roc host buffers + end + + function reinterpret_rocbufs(T::DataType, i::Integer, n::Integer) + if (eltype(rocsendbufs_raw[i][n]) != T) rocsendbufs_raw[i][n] = reinterpret(T, rocsendbufs_raw[i][n]); end + if (eltype(rocrecvbufs_raw[i][n]) != T) rocrecvbufs_raw[i][n] = reinterpret(T, rocrecvbufs_raw[i][n]); end + end + + function reallocate_rocbufs(T::DataType, i::Integer, n::Integer, max_halo_elems::Integer) + rocsendbufs_raw[i][n] = AMDGPU.zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY); # Ensure that the amount of allocated memory is a multiple of 4*sizeof(T) (sizeof(Float64)/sizeof(Float16) = 4). So, we can always correctly reinterpret the raw buffers even if next time sizeof(T) is greater. 
+        rocrecvbufs_raw[i][n] = AMDGPU.zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY);
+    end
+
+    function reregister_rocbufs(T::DataType, i::Integer, n::Integer)
+        # INFO: no need for roc host buffers
+        rocsendbufs_raw[i][n] = register(ROCArray,sendbufs_raw[i][n]);
+        rocrecvbufs_raw[i][n] = register(ROCArray,recvbufs_raw[i][n]);
+    end
+
+
+    # (AMDGPU functions)
+
+    function gpusendbuf_flat(n::Integer, dim::Integer, i::Integer, A::ROCField{T}) where T <: GGNumber
+        return view(rocsendbufs_raw[i][n]::ROCVector{T},1:prod(halosize(dim,A)));
+    end
+
+    function gpurecvbuf_flat(n::Integer, dim::Integer, i::Integer, A::ROCField{T}) where T <: GGNumber
+        return view(rocrecvbufs_raw[i][n]::ROCVector{T},1:prod(halosize(dim,A)));
+    end
+
+
+    # (GPU functions)
+
+    #TODO: see if remove T here and in other cases for CuArray, ROCArray or Array (but then it does not verify that CuArray/ROCArray is of type GGNumber) or if I should instead change GGArray to GGArrayUnion and create: GGArray = Array{T} where T <: GGNumber and GGCuArray = CuArray{T} where T <: GGNumber; This is however more difficult to read and understand for others.
+    function gpusendbuf(n::Integer, dim::Integer, i::Integer, A::ROCField{T}) where T <: GGNumber
+        return reshape(gpusendbuf_flat(n,dim,i,A), halosize(dim,A));
+    end
+
+    function gpurecvbuf(n::Integer, dim::Integer, i::Integer, A::ROCField{T}) where T <: GGNumber
+        return reshape(gpurecvbuf_flat(n,dim,i,A), halosize(dim,A));
+    end
+
+
+    # Make sendbufs_raw and recvbufs_raw accessible for unit testing.
+    global get_rocsendbufs_raw, get_rocrecvbufs_raw
+    get_rocsendbufs_raw() = deepcopy(rocsendbufs_raw)
+    get_rocrecvbufs_raw() = deepcopy(rocrecvbufs_raw)
+end
\ No newline at end of file
diff --git a/src/CUDAExt/update_halo.jl b/src/CUDAExt/update_halo.jl
new file mode 100644
index 0000000..3124a85
--- /dev/null
+++ b/src/CUDAExt/update_halo.jl
@@ -0,0 +1,111 @@
+##---------------------------------------
+## FUNCTIONS RELATED TO BUFFER ALLOCATION
+
+# NOTE: CUDA and AMDGPU buffers live and are dealt with independently, enabling the support of usage of CUDA and AMD GPUs at the same time.
+
+let
+    global free_update_halo_cubuffers, reset_cu_buffers, free_cubufs, unregister_cubufs
+    global gpusendbuf, gpurecvbuf, gpusendbuf_flat, gpurecvbuf_flat
+    cusendbufs_raw = nothing
+    curecvbufs_raw = nothing
+    cusendbufs_raw_h = nothing
+    curecvbufs_raw_h = nothing
+
+    function free_update_halo_cubuffers()
+        free_cubufs(cusendbufs_raw)
+        free_cubufs(curecvbufs_raw)
+        unregister_cubufs(cusendbufs_raw_h)
+        unregister_cubufs(curecvbufs_raw_h)
+        reset_cu_buffers()
+    end
+
+    function free_cubufs(bufs)
+        if (bufs !== nothing)
+            for i = 1:length(bufs)
+                for n = 1:length(bufs[i])
+                    if is_cuarray(bufs[i][n]) CUDA.unsafe_free!(bufs[i][n]); bufs[i][n] = []; end
+                end
+            end
+        end
+    end
+
+    function unregister_cubufs(bufs)
+        if (bufs !== nothing)
+            for i = 1:length(bufs)
+                for n = 1:length(bufs[i])
+                    if (isa(bufs[i][n],CUDA.Mem.HostBuffer)) CUDA.Mem.unregister(bufs[i][n]); bufs[i][n] = []; end
+                end
+            end
+        end
+    end
+
+    function reset_cu_buffers()
+        cusendbufs_raw = nothing
+        curecvbufs_raw = nothing
+        cusendbufs_raw_h = nothing
+        curecvbufs_raw_h = nothing
+    end
+
+
+    # (CUDA functions)
+
+    function init_cubufs_arrays()
+        cusendbufs_raw = Array{Array{Any,1},1}();
+        curecvbufs_raw = Array{Array{Any,1},1}();
+        cusendbufs_raw_h = Array{Array{Any,1},1}();
+        curecvbufs_raw_h = Array{Array{Any,1},1}();
+    end
+
+    function init_cubufs(T::DataType, fields::GGField...)
+        while (length(cusendbufs_raw) < length(fields)) push!(cusendbufs_raw, [CuArray{T}(undef,0), CuArray{T}(undef,0)]); end
+        while (length(curecvbufs_raw) < length(fields)) push!(curecvbufs_raw, [CuArray{T}(undef,0), CuArray{T}(undef,0)]); end
+        while (length(cusendbufs_raw_h) < length(fields)) push!(cusendbufs_raw_h, [[], []]); end
+        while (length(curecvbufs_raw_h) < length(fields)) push!(curecvbufs_raw_h, [[], []]); end
+    end
+
+    function reinterpret_cubufs(T::DataType, i::Integer, n::Integer)
+        if (eltype(cusendbufs_raw[i][n]) != T) cusendbufs_raw[i][n] = reinterpret(T, cusendbufs_raw[i][n]); end
+        if (eltype(curecvbufs_raw[i][n]) != T) curecvbufs_raw[i][n] = reinterpret(T, curecvbufs_raw[i][n]); end
+    end
+
+    function reallocate_cubufs(T::DataType, i::Integer, n::Integer, max_halo_elems::Integer)
+        cusendbufs_raw[i][n] = CUDA.zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY); # Ensure that the amount of allocated memory is a multiple of 4*sizeof(T) (sizeof(Float64)/sizeof(Float16) = 4). So, we can always correctly reinterpret the raw buffers even if next time sizeof(T) is greater.
+        curecvbufs_raw[i][n] = CUDA.zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY);
+    end
+
+    function reregister_cubufs(T::DataType, i::Integer, n::Integer)
+        if (isa(cusendbufs_raw_h[i][n],CUDA.Mem.HostBuffer)) CUDA.Mem.unregister(cusendbufs_raw_h[i][n]); cusendbufs_raw_h[i][n] = []; end # It is always initialized registered... if (cusendbufs_raw_h[i][n].bytesize > 32*sizeof(T))
+        if (isa(curecvbufs_raw_h[i][n],CUDA.Mem.HostBuffer)) CUDA.Mem.unregister(curecvbufs_raw_h[i][n]); curecvbufs_raw_h[i][n] = []; end # It is always initialized registered... if (curecvbufs_raw_h[i][n].bytesize > 32*sizeof(T))
+        cusendbufs_raw[i][n], cusendbufs_raw_h[i][n] = register(CuArray,sendbufs_raw[i][n]);
+        curecvbufs_raw[i][n], curecvbufs_raw_h[i][n] = register(CuArray,recvbufs_raw[i][n]);
+    end
+
+
+    # (CUDA functions)
+
+    function gpusendbuf_flat(n::Integer, dim::Integer, i::Integer, A::CuField{T}) where T <: GGNumber
+        return view(cusendbufs_raw[i][n]::CuVector{T},1:prod(halosize(dim,A)));
+    end
+
+    function gpurecvbuf_flat(n::Integer, dim::Integer, i::Integer, A::CuField{T}) where T <: GGNumber
+        return view(curecvbufs_raw[i][n]::CuVector{T},1:prod(halosize(dim,A)));
+    end
+
+
+    # (GPU functions)
+
+    #TODO: see if remove T here and in other cases for CuArray, ROCArray or Array (but then it does not verify that CuArray/ROCArray is of type GGNumber) or if I should instead change GGArray to GGArrayUnion and create: GGArray = Array{T} where T <: GGNumber and GGCuArray = CuArray{T} where T <: GGNumber; This is however more difficult to read and understand for others.
+    function gpusendbuf(n::Integer, dim::Integer, i::Integer, A::CuField{T}) where T <: GGNumber
+        return reshape(gpusendbuf_flat(n,dim,i,A), halosize(dim,A));
+    end
+
+    function gpurecvbuf(n::Integer, dim::Integer, i::Integer, A::CuField{T}) where T <: GGNumber
+        return reshape(gpurecvbuf_flat(n,dim,i,A), halosize(dim,A));
+    end
+
+
+    # Make sendbufs_raw and recvbufs_raw accessible for unit testing.
+    global get_cusendbufs_raw, get_curecvbufs_raw
+    get_cusendbufs_raw() = deepcopy(cusendbufs_raw)
+    get_curecvbufs_raw() = deepcopy(curecvbufs_raw)
+end
\ No newline at end of file
diff --git a/src/update_halo.jl b/src/update_halo.jl
index 4be9d87..2c38461 100644
--- a/src/update_halo.jl
+++ b/src/update_halo.jl
@@ -95,60 +95,25 @@ halosize(dim::Integer, A::GGField) = (dim==1) ?
 # NOTE: CUDA and AMDGPU buffers live and are dealt with independently, enabling the support of usage of CUDA and AMD GPUs at the same time.
 
 let
+    #TODO: this was: global free_update_halo_buffers, allocate_bufs, sendbuf, recvbuf, sendbuf_flat, recvbuf_flat, gpusendbuf, gpurecvbuf, gpusendbuf_flat, gpurecvbuf_flat, rocsendbuf, rocrecvbuf, rocsendbuf_flat, rocrecvbuf_flat
     global free_update_halo_buffers, allocate_bufs, sendbuf, recvbuf, sendbuf_flat, recvbuf_flat, gpusendbuf, gpurecvbuf, gpusendbuf_flat, gpurecvbuf_flat, rocsendbuf, rocrecvbuf, rocsendbuf_flat, rocrecvbuf_flat
     sendbufs_raw = nothing
     recvbufs_raw = nothing
-    cusendbufs_raw = nothing
-    curecvbufs_raw = nothing
-    cusendbufs_raw_h = nothing
-    curecvbufs_raw_h = nothing
-    rocsendbufs_raw = nothing
-    rocrecvbufs_raw = nothing
-    # INFO: no need for roc host buffers
 
     function free_update_halo_buffers()
-        if (cuda_enabled() && any(cudaaware_MPI())) free_gpubufs(cusendbufs_raw) end
-        if (cuda_enabled() && any(cudaaware_MPI())) free_gpubufs(curecvbufs_raw) end
-        if (cuda_enabled() && none(cudaaware_MPI())) unregister_gpubufs(cusendbufs_raw_h) end
-        if (cuda_enabled() && none(cudaaware_MPI())) unregister_gpubufs(curecvbufs_raw_h) end
-        if (amdgpu_enabled() && any(amdgpuaware_MPI())) free_gpubufs(rocsendbufs_raw) end
-        if (amdgpu_enabled() && any(amdgpuaware_MPI())) free_gpubufs(rocrecvbufs_raw) end
-        # INFO: no need to unregister roc host buffers
-        sendbufs_raw = nothing
-        recvbufs_raw = nothing
-        cusendbufs_raw = nothing
-        curecvbufs_raw = nothing
-        cusendbufs_raw_h = nothing
-        curecvbufs_raw_h = nothing
-        rocsendbufs_raw = nothing
-        rocrecvbufs_raw = nothing
-        # INFO: no need for roc host buffers
-        GC.gc()
+        free_update_halo_cpubuffers()
+        if (cuda_enabled()) free_update_halo_cubuffers() end
+        if (amdgpu_enabled()) free_update_halo_rocbuffers() end
+        GC.gc() #TODO: see how to modify this!
     end
 
-
-    # (CUDA, AMDGPU functions)
-
-    function free_gpubufs(bufs)
-        if (bufs !== nothing)
-            for i = 1:length(bufs)
-                for n = 1:length(bufs[i])
-                    if is_cuarray(bufs[i][n]) CUDA.unsafe_free!(bufs[i][n]); bufs[i][n] = []; end
-                    if is_rocarray(bufs[i][n]) AMDGPU.unsafe_free!(bufs[i][n]); bufs[i][n] = []; end # DEBUG: unsafe_free should be managed in AMDGPU
-                end
-            end
-        end
+    function free_update_halo_cpubuffers()
+        reset_cpu_buffers();
     end
 
-    function unregister_gpubufs(bufs)
-        if (bufs !== nothing)
-            for i = 1:length(bufs)
-                for n = 1:length(bufs[i])
-                    if (isa(bufs[i][n],CUDA.Mem.HostBuffer)) CUDA.Mem.unregister(bufs[i][n]); bufs[i][n] = []; end
-                    # INFO: no need for roc host buffers
-                end
-            end
-        end
+    function reset_cpu_buffers()
+        sendbufs_raw = nothing
+        recvbufs_raw = nothing
     end
 
     # Allocate for each field two send and recv buffers (one for the left and one for the right neighbour of a dimension). The required length of the buffer is given by the maximal number of halo elements in any of the dimensions. Note that buffers are not allocated separately for each dimension, as the updates are performed one dimension at a time (required for correctness).
@@ -215,71 +180,6 @@ let
     end
 
-    # (CUDA functions)
-
-    function init_cubufs_arrays()
-        cusendbufs_raw = Array{Array{Any,1},1}();
-        curecvbufs_raw = Array{Array{Any,1},1}();
-        cusendbufs_raw_h = Array{Array{Any,1},1}();
-        curecvbufs_raw_h = Array{Array{Any,1},1}();
-    end
-
-    function init_cubufs(T::DataType, fields::GGField...)
- while (length(cusendbufs_raw) < length(fields)) push!(cusendbufs_raw, [CuArray{T}(undef,0), CuArray{T}(undef,0)]); end - while (length(curecvbufs_raw) < length(fields)) push!(curecvbufs_raw, [CuArray{T}(undef,0), CuArray{T}(undef,0)]); end - while (length(cusendbufs_raw_h) < length(fields)) push!(cusendbufs_raw_h, [[], []]); end - while (length(curecvbufs_raw_h) < length(fields)) push!(curecvbufs_raw_h, [[], []]); end - end - - function reinterpret_cubufs(T::DataType, i::Integer, n::Integer) - if (eltype(cusendbufs_raw[i][n]) != T) cusendbufs_raw[i][n] = reinterpret(T, cusendbufs_raw[i][n]); end - if (eltype(curecvbufs_raw[i][n]) != T) curecvbufs_raw[i][n] = reinterpret(T, curecvbufs_raw[i][n]); end - end - - function reallocate_cubufs(T::DataType, i::Integer, n::Integer, max_halo_elems::Integer) - cusendbufs_raw[i][n] = CUDA.zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY); # Ensure that the amount of allocated memory is a multiple of 4*sizeof(T) (sizeof(Float64)/sizeof(Float16) = 4). So, we can always correctly reinterpret the raw buffers even if next time sizeof(T) is greater. - curecvbufs_raw[i][n] = CUDA.zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY); - end - - function reregister_cubufs(T::DataType, i::Integer, n::Integer) - if (isa(cusendbufs_raw_h[i][n],CUDA.Mem.HostBuffer)) CUDA.Mem.unregister(cusendbufs_raw_h[i][n]); cusendbufs_raw_h[i][n] = []; end # It is always initialized registered... if (cusendbufs_raw_h[i][n].bytesize > 32*sizeof(T)) - if (isa(curecvbufs_raw_h[i][n],CUDA.Mem.HostBuffer)) CUDA.Mem.unregister(curecvbufs_raw_h[i][n]); curecvbufs_raw_h[i][n] = []; end # It is always initialized registered... if (curecvbufs_raw_h[i][n].bytesize > 32*sizeof(T)) - cusendbufs_raw[i][n], cusendbufs_raw_h[i][n] = register(CuArray,sendbufs_raw[i][n]); - curecvbufs_raw[i][n], curecvbufs_raw_h[i][n] = register(CuArray,recvbufs_raw[i][n]); - end - - - # (AMDGPU functions) - - function init_rocbufs_arrays() - rocsendbufs_raw = Array{Array{Any,1},1}(); - rocrecvbufs_raw = Array{Array{Any,1},1}(); - # INFO: no need for roc host buffers - end - - function init_rocbufs(T::DataType, fields::GGField...) - while (length(rocsendbufs_raw) < length(fields)) push!(rocsendbufs_raw, [ROCArray{T}(undef,0), ROCArray{T}(undef,0)]); end - while (length(rocrecvbufs_raw) < length(fields)) push!(rocrecvbufs_raw, [ROCArray{T}(undef,0), ROCArray{T}(undef,0)]); end - # INFO: no need for roc host buffers - end - - function reinterpret_rocbufs(T::DataType, i::Integer, n::Integer) - if (eltype(rocsendbufs_raw[i][n]) != T) rocsendbufs_raw[i][n] = reinterpret(T, rocsendbufs_raw[i][n]); end - if (eltype(rocrecvbufs_raw[i][n]) != T) rocrecvbufs_raw[i][n] = reinterpret(T, rocrecvbufs_raw[i][n]); end - end - - function reallocate_rocbufs(T::DataType, i::Integer, n::Integer, max_halo_elems::Integer) - rocsendbufs_raw[i][n] = AMDGPU.zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY); # Ensure that the amount of allocated memory is a multiple of 4*sizeof(T) (sizeof(Float64)/sizeof(Float16) = 4). So, we can always correctly reinterpret the raw buffers even if next time sizeof(T) is greater. 
- rocrecvbufs_raw[i][n] = AMDGPU.zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY); - end - - function reregister_rocbufs(T::DataType, i::Integer, n::Integer) - # INFO: no need for roc host buffers - rocsendbufs_raw[i][n] = register(ROCArray,sendbufs_raw[i][n]); - rocrecvbufs_raw[i][n] = register(ROCArray,recvbufs_raw[i][n]); - end - - # (CPU functions) function sendbuf_flat(n::Integer, dim::Integer, i::Integer, A::GGField{T}) where T <: GGNumber @@ -298,49 +198,10 @@ let return reshape(recvbuf_flat(n,dim,i,A), halosize(dim,A)); end - - # (CUDA functions) - - function gpusendbuf_flat(n::Integer, dim::Integer, i::Integer, A::CuField{T}) where T <: GGNumber - return view(cusendbufs_raw[i][n]::CuVector{T},1:prod(halosize(dim,A))); - end - - function gpurecvbuf_flat(n::Integer, dim::Integer, i::Integer, A::CuField{T}) where T <: GGNumber - return view(curecvbufs_raw[i][n]::CuVector{T},1:prod(halosize(dim,A))); - end - - - # (AMDGPU functions) - - function gpusendbuf_flat(n::Integer, dim::Integer, i::Integer, A::ROCField{T}) where T <: GGNumber - return view(rocsendbufs_raw[i][n]::ROCVector{T},1:prod(halosize(dim,A))); - end - - function gpurecvbuf_flat(n::Integer, dim::Integer, i::Integer, A::ROCField{T}) where T <: GGNumber - return view(rocrecvbufs_raw[i][n]::ROCVector{T},1:prod(halosize(dim,A))); - end - - - # (GPU functions) - - #TODO: see if remove T here and in other cases for CuArray, ROCArray or Array (but then it does not verify that CuArray/ROCArray is of type GGNumber) or if I should instead change GGArray to GGArrayUnion and create: GGArray = Array{T} where T <: GGNumber and GGCuArray = CuArray{T} where T <: GGNumber; This is however more difficult to read and understand for others. - function gpusendbuf(n::Integer, dim::Integer, i::Integer, A::Union{CuField{T}, ROCField{T}}) where T <: GGNumber - return reshape(gpusendbuf_flat(n,dim,i,A), halosize(dim,A)); - end - - function gpurecvbuf(n::Integer, dim::Integer, i::Integer, A::Union{CuField{T}, ROCField{T}}) where T <: GGNumber - return reshape(gpurecvbuf_flat(n,dim,i,A), halosize(dim,A)); - end - - # Make sendbufs_raw and recvbufs_raw accessible for unit testing. 
- global get_sendbufs_raw, get_recvbufs_raw, get_cusendbufs_raw, get_curecvbufs_raw, get_rocsendbufs_raw, get_rocrecvbufs_raw + global get_sendbufs_raw, get_recvbufs_raw get_sendbufs_raw() = deepcopy(sendbufs_raw) get_recvbufs_raw() = deepcopy(recvbufs_raw) - get_cusendbufs_raw() = deepcopy(cusendbufs_raw) - get_curecvbufs_raw() = deepcopy(curecvbufs_raw) - get_rocsendbufs_raw() = deepcopy(rocsendbufs_raw) - get_rocrecvbufs_raw() = deepcopy(rocrecvbufs_raw) end From 2c6e065b25bb258fb67366430b0f966948c80914 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Wed, 17 Jan 2024 16:41:09 +0100 Subject: [PATCH 07/34] move code related to accessing buffers to extensions --- src/AMDGPUExt/update_halo.jl | 135 ++++++++++++++++++- src/CUDAExt/update_halo.jl | 125 +++++++++++++++++- src/update_halo.jl | 250 ----------------------------------- 3 files changed, 258 insertions(+), 252 deletions(-) diff --git a/src/AMDGPUExt/update_halo.jl b/src/AMDGPUExt/update_halo.jl index acbdea7..381f5f4 100644 --- a/src/AMDGPUExt/update_halo.jl +++ b/src/AMDGPUExt/update_halo.jl @@ -96,4 +96,137 @@ let global get_rocsendbufs_raw, get_rocrecvbufs_raw get_rocsendbufs_raw() = deepcopy(rocsendbufs_raw) get_rocrecvbufs_raw() = deepcopy(rocrecvbufs_raw) -end \ No newline at end of file +end + + +##---------------------------------------------- +## FUNCTIONS TO WRITE AND READ SEND/RECV BUFFERS + +function allocate_rocstreams(fields::GGField...) + allocate_rocstreams_iwrite(fields...); + allocate_rocstreams_iread(fields...); +end + +let + global iwrite_sendbufs!, allocate_rocstreams_iwrite, wait_iwrite + + rocstreams = Array{AMDGPU.HIPStream}(undef, NNEIGHBORS_PER_DIM, 0) + + wait_iwrite(n::Integer, A::ROCField{T}, i::Integer) where T <: GGNumber = AMDGPU.synchronize(rocstreams[n,i]); + + function allocate_rocstreams_iwrite(fields::GGField...) + if length(fields) > size(rocstreams,2) # Note: for simplicity, we create a stream for every field even if it is not a ROCField + rocstreams = [rocstreams [AMDGPU.HIPStream(:high) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(rocstreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. + end + end + + function iwrite_sendbufs!(n::Integer, dim::Integer, F::ROCField{T}, i::Integer) where T <: GGNumber + A, halowidths = F; + if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... + # DEBUG: the follow section needs perf testing + # DEBUG 2: commenting read_h2d_async! for now + # if dim == 1 || amdgpuaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). + ranges = sendranges(n, dim, F); + nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1); + halosize = [r[end] - r[1] + 1 for r in ranges]; + nblocks = Tuple(ceil.(Int, halosize./nthreads)); + @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] write_d2x!(gpusendbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); + # else + # write_d2h_async!(sendbuf_flat(n,dim,i,F), A, sendranges(n,dim,F), rocstreams[n,i]); + # end + end + end +end + +let + global iread_recvbufs!, allocate_rocstreams_iread, wait_iread + + rocstreams = Array{AMDGPU.HIPStream}(undef, NNEIGHBORS_PER_DIM, 0) + + wait_iread(n::Integer, A::ROCField{T}, i::Integer) where T <: GGNumber = AMDGPU.synchronize(rocstreams[n,i]); + + function allocate_rocstreams_iread(fields::GGField...) 
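+        # Lazily grow the stream matrix so that there is one stream per neighbor (rows) and per field (columns).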
+ if length(fields) > size(rocstreams,2) # Note: for simplicity, we create a stream for every field even if it is not a ROCField + rocstreams = [rocstreams [AMDGPU.HIPStream(:high) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(rocstreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. + end + end + + function iread_recvbufs!(n::Integer, dim::Integer, F::ROCField{T}, i::Integer) where T <: GGNumber + A, halowidths = F; + if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... + # DEBUG: the follow section needs perf testing + # DEBUG 2: commenting read_h2d_async! for now + # if dim == 1 || amdgpuaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). + ranges = recvranges(n, dim, F); + nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1); + halosize = [r[end] - r[1] + 1 for r in ranges]; + nblocks = Tuple(ceil.(Int, halosize./nthreads)); + @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); + # else + # read_h2d_async!(recvbuf_flat(n,dim,i,F), A, recvranges(n,dim,F), rocstreams[n,i]); + # end + end + end + +end + + +# (AMDGPU functions) + +# Write to the send buffer on the host or device from the array on the device (d2x). +function write_d2x!(gpusendbuf::ROCDeviceArray{T}, A::ROCDeviceArray{T}, sendrangex::UnitRange{Int64}, sendrangey::UnitRange{Int64}, sendrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber + ix = (AMDGPU.workgroupIdx().x-1) * AMDGPU.workgroupDim().x + AMDGPU.workitemIdx().x + sendrangex[1] - 1 + iy = (AMDGPU.workgroupIdx().y-1) * AMDGPU.workgroupDim().y + AMDGPU.workitemIdx().y + sendrangey[1] - 1 + iz = (AMDGPU.workgroupIdx().z-1) * AMDGPU.workgroupDim().z + AMDGPU.workitemIdx().z + sendrangez[1] - 1 + if !(ix in sendrangex && iy in sendrangey && iz in sendrangez) return nothing; end + gpusendbuf[ix-(sendrangex[1]-1),iy-(sendrangey[1]-1),iz-(sendrangez[1]-1)] = A[ix,iy,iz]; + return nothing +end + +# Read from the receive buffer on the host or device and store on the array on the device (x2d). +function read_x2d!(gpurecvbuf::ROCDeviceArray{T}, A::ROCDeviceArray{T}, recvrangex::UnitRange{Int64}, recvrangey::UnitRange{Int64}, recvrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber + ix = (AMDGPU.workgroupIdx().x-1) * AMDGPU.workgroupDim().x + AMDGPU.workitemIdx().x + recvrangex[1] - 1 + iy = (AMDGPU.workgroupIdx().y-1) * AMDGPU.workgroupDim().y + AMDGPU.workitemIdx().y + recvrangey[1] - 1 + iz = (AMDGPU.workgroupIdx().z-1) * AMDGPU.workgroupDim().z + AMDGPU.workitemIdx().z + recvrangez[1] - 1 + if !(ix in recvrangex && iy in recvrangey && iz in recvrangez) return nothing; end + A[ix,iy,iz] = gpurecvbuf[ix-(recvrangex[1]-1),iy-(recvrangey[1]-1),iz-(recvrangez[1]-1)]; + return nothing +end + +# Write to the send buffer on the host from the array on the device (d2h). 
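+# (Asynchronous 3-D copy on the given HIP stream, allowing the transfer to overlap with computation.)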
+function write_d2h_async!(sendbuf::AbstractArray{T}, A::ROCArray{T}, sendranges::Array{UnitRange{T2},1}, rocstream::AMDGPU.HIPStream) where T <: GGNumber where T2 <: Integer + buf_view = reshape(sendbuf, Tuple(length.(sendranges))) + AMDGPU.Mem.unsafe_copy3d!( + pointer(sendbuf), AMDGPU.Mem.HostBuffer, + pointer(A), typeof(A.buf), + length(sendranges[1]), length(sendranges[2]), length(sendranges[3]); + srcPos=(sendranges[1][1], sendranges[2][1], sendranges[3][1]), + dstPitch=sizeof(T) * size(buf_view, 1), dstHeight=size(buf_view, 2), + srcPitch=sizeof(T) * size(A, 1), srcHeight=size(A, 2), + async=true, stream=rocstream + ) + return nothing +end + +# Read from the receive buffer on the host and store on the array on the device (h2d). +function read_h2d_async!(recvbuf::AbstractArray{T}, A::ROCArray{T}, recvranges::Array{UnitRange{T2},1}, rocstream::AMDGPU.HIPStream) where T <: GGNumber where T2 <: Integer + buf_view = reshape(recvbuf, Tuple(length.(recvranges))) + AMDGPU.Mem.unsafe_copy3d!( + pointer(A), typeof(A.buf), + pointer(recvbuf), AMDGPU.Mem.HostBuffer, + length(recvranges[1]), length(recvranges[2]), length(recvranges[3]); + dstPos=(recvranges[1][1], recvranges[2][1], recvranges[3][1]), + dstPitch=sizeof(T) * size(A, 1), dstHeight=size(A, 2), + srcPitch=sizeof(T) * size(buf_view, 1), srcHeight=size(buf_view, 2), + async=true, stream=rocstream + ) + return nothing +end + + +##------------------------------ +## FUNCTIONS TO SEND/RECV FIELDS + +function gpumemcopy!(dst::ROCArray{T}, src::ROCArray{T}) where T <: GGNumber + @inbounds AMDGPU.copyto!(dst, src) +end diff --git a/src/CUDAExt/update_halo.jl b/src/CUDAExt/update_halo.jl index 3124a85..bd58653 100644 --- a/src/CUDAExt/update_halo.jl +++ b/src/CUDAExt/update_halo.jl @@ -108,4 +108,127 @@ let global get_cusendbufs_raw, get_curecvbufs_raw get_cusendbufs_raw() = deepcopy(cusendbufs_raw) get_curecvbufs_raw() = deepcopy(curecvbufs_raw) -end \ No newline at end of file +end + + +##---------------------------------------------- +## FUNCTIONS TO WRITE AND READ SEND/RECV BUFFERS + +function allocate_custreams(fields::GGField...) + allocate_custreams_iwrite(fields...); + allocate_custreams_iread(fields...); +end + +let + global iwrite_sendbufs!, allocate_custreams_iwrite, wait_iwrite + + custreams = Array{CuStream}(undef, NNEIGHBORS_PER_DIM, 0) + + wait_iwrite(n::Integer, A::CuField{T}, i::Integer) where T <: GGNumber = CUDA.synchronize(custreams[n,i]); + + function allocate_custreams_iwrite(fields::GGField...) + if length(fields) > size(custreams,2) # Note: for simplicity, we create a stream for every field even if it is not a CuField + custreams = [custreams [CuStream(; flags=CUDA.STREAM_NON_BLOCKING, priority=CUDA.priority_range()[end]) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(custreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. + end + end + + function iwrite_sendbufs!(n::Integer, dim::Integer, F::CuField{T}, i::Integer) where T <: GGNumber + A, halowidths = F; + if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... + if dim == 1 || cudaaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). + ranges = sendranges(n, dim, F); + nthreads = (dim==1) ? 
(1, 32, 1) : (32, 1, 1); + halosize = [r[end] - r[1] + 1 for r in ranges]; + nblocks = Tuple(ceil.(Int, halosize./nthreads)); + @cuda blocks=nblocks threads=nthreads stream=custreams[n,i] write_d2x!(gpusendbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); + else + write_d2h_async!(sendbuf_flat(n,dim,i,F), A, sendranges(n,dim,F), custreams[n,i]); + end + end + end +end + +let + global iread_recvbufs!, allocate_custreams_iread, wait_iread + + custreams = Array{CuStream}(undef, NNEIGHBORS_PER_DIM, 0) + + wait_iread(n::Integer, A::CuField{T}, i::Integer) where T <: GGNumber = CUDA.synchronize(custreams[n,i]); + + function allocate_custreams_iread(fields::GGField...) + if length(fields) > size(custreams,2) # Note: for simplicity, we create a stream for every field even if it is not a CuField + custreams = [custreams [CuStream(; flags=CUDA.STREAM_NON_BLOCKING, priority=CUDA.priority_range()[end]) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(custreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. + end + end + + function iread_recvbufs!(n::Integer, dim::Integer, F::CuField{T}, i::Integer) where T <: GGNumber + A, halowidths = F; + if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... + if dim == 1 || cudaaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). + ranges = recvranges(n, dim, F); + nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1); + halosize = [r[end] - r[1] + 1 for r in ranges]; + nblocks = Tuple(ceil.(Int, halosize./nthreads)); + @cuda blocks=nblocks threads=nthreads stream=custreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); + else + read_h2d_async!(recvbuf_flat(n,dim,i,F), A, recvranges(n,dim,F), custreams[n,i]); + end + end + end +end + + +# (CUDA functions) + +# Write to the send buffer on the host or device from the array on the device (d2x). +function write_d2x!(gpusendbuf::CuDeviceArray{T}, A::CuDeviceArray{T}, sendrangex::UnitRange{Int64}, sendrangey::UnitRange{Int64}, sendrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber + ix = (CUDA.blockIdx().x-1) * CUDA.blockDim().x + CUDA.threadIdx().x + sendrangex[1] - 1 + iy = (CUDA.blockIdx().y-1) * CUDA.blockDim().y + CUDA.threadIdx().y + sendrangey[1] - 1 + iz = (CUDA.blockIdx().z-1) * CUDA.blockDim().z + CUDA.threadIdx().z + sendrangez[1] - 1 + if !(ix in sendrangex && iy in sendrangey && iz in sendrangez) return nothing; end + gpusendbuf[ix-(sendrangex[1]-1),iy-(sendrangey[1]-1),iz-(sendrangez[1]-1)] = A[ix,iy,iz]; + return nothing +end + +# Read from the receive buffer on the host or device and store on the array on the device (x2d). 
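+# (One GPU thread per halo element; threads outside the receive ranges return without storing.)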
+function read_x2d!(gpurecvbuf::CuDeviceArray{T}, A::CuDeviceArray{T}, recvrangex::UnitRange{Int64}, recvrangey::UnitRange{Int64}, recvrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber + ix = (CUDA.blockIdx().x-1) * CUDA.blockDim().x + CUDA.threadIdx().x + recvrangex[1] - 1 + iy = (CUDA.blockIdx().y-1) * CUDA.blockDim().y + CUDA.threadIdx().y + recvrangey[1] - 1 + iz = (CUDA.blockIdx().z-1) * CUDA.blockDim().z + CUDA.threadIdx().z + recvrangez[1] - 1 + if !(ix in recvrangex && iy in recvrangey && iz in recvrangez) return nothing; end + A[ix,iy,iz] = gpurecvbuf[ix-(recvrangex[1]-1),iy-(recvrangey[1]-1),iz-(recvrangez[1]-1)]; + return nothing +end + +# Write to the send buffer on the host from the array on the device (d2h). +function write_d2h_async!(sendbuf::AbstractArray{T}, A::CuArray{T}, sendranges::Array{UnitRange{T2},1}, custream::CuStream) where T <: GGNumber where T2 <: Integer + CUDA.Mem.unsafe_copy3d!( + pointer(sendbuf), CUDA.Mem.Host, pointer(A), CUDA.Mem.Device, + length(sendranges[1]), length(sendranges[2]), length(sendranges[3]); + srcPos=(sendranges[1][1], sendranges[2][1], sendranges[3][1]), + srcPitch=sizeof(T)*size(A,1), srcHeight=size(A,2), + dstPitch=sizeof(T)*length(sendranges[1]), dstHeight=length(sendranges[2]), + async=true, stream=custream + ) +end + +# Read from the receive buffer on the host and store on the array on the device (h2d). +function read_h2d_async!(recvbuf::AbstractArray{T}, A::CuArray{T}, recvranges::Array{UnitRange{T2},1}, custream::CuStream) where T <: GGNumber where T2 <: Integer + CUDA.Mem.unsafe_copy3d!( + pointer(A), CUDA.Mem.Device, pointer(recvbuf), CUDA.Mem.Host, + length(recvranges[1]), length(recvranges[2]), length(recvranges[3]); + dstPos=(recvranges[1][1], recvranges[2][1], recvranges[3][1]), + srcPitch=sizeof(T)*length(recvranges[1]), srcHeight=length(recvranges[2]), + dstPitch=sizeof(T)*size(A,1), dstHeight=size(A,2), + async=true, stream=custream + ) +end + + +##------------------------------ +## FUNCTIONS TO SEND/RECV FIELDS + +function gpumemcopy!(dst::CuArray{T}, src::CuArray{T}) where T <: GGNumber + @inbounds CUDA.copyto!(dst, src) +end + diff --git a/src/update_halo.jl b/src/update_halo.jl index 2c38461..3413973 100644 --- a/src/update_halo.jl +++ b/src/update_halo.jl @@ -273,143 +273,6 @@ let end -# (CUDA functions) - -function allocate_custreams(fields::GGField...) - allocate_custreams_iwrite(fields...); - allocate_custreams_iread(fields...); -end - -let - global iwrite_sendbufs!, allocate_custreams_iwrite, wait_iwrite - - custreams = Array{CuStream}(undef, NNEIGHBORS_PER_DIM, 0) - - wait_iwrite(n::Integer, A::CuField{T}, i::Integer) where T <: GGNumber = CUDA.synchronize(custreams[n,i]); - - function allocate_custreams_iwrite(fields::GGField...) - if length(fields) > size(custreams,2) # Note: for simplicity, we create a stream for every field even if it is not a CuField - custreams = [custreams [CuStream(; flags=CUDA.STREAM_NON_BLOCKING, priority=CUDA.priority_range()[end]) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(custreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. - end - end - - function iwrite_sendbufs!(n::Integer, dim::Integer, F::CuField{T}, i::Integer) where T <: GGNumber - A, halowidths = F; - if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... 
- if dim == 1 || cudaaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). - ranges = sendranges(n, dim, F); - nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1); - halosize = [r[end] - r[1] + 1 for r in ranges]; - nblocks = Tuple(ceil.(Int, halosize./nthreads)); - @cuda blocks=nblocks threads=nthreads stream=custreams[n,i] write_d2x!(gpusendbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); - else - write_d2h_async!(sendbuf_flat(n,dim,i,F), A, sendranges(n,dim,F), custreams[n,i]); - end - end - end -end - -let - global iread_recvbufs!, allocate_custreams_iread, wait_iread - - custreams = Array{CuStream}(undef, NNEIGHBORS_PER_DIM, 0) - - wait_iread(n::Integer, A::CuField{T}, i::Integer) where T <: GGNumber = CUDA.synchronize(custreams[n,i]); - - function allocate_custreams_iread(fields::GGField...) - if length(fields) > size(custreams,2) # Note: for simplicity, we create a stream for every field even if it is not a CuField - custreams = [custreams [CuStream(; flags=CUDA.STREAM_NON_BLOCKING, priority=CUDA.priority_range()[end]) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(custreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. - end - end - - function iread_recvbufs!(n::Integer, dim::Integer, F::CuField{T}, i::Integer) where T <: GGNumber - A, halowidths = F; - if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... - if dim == 1 || cudaaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). - ranges = recvranges(n, dim, F); - nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1); - halosize = [r[end] - r[1] + 1 for r in ranges]; - nblocks = Tuple(ceil.(Int, halosize./nthreads)); - @cuda blocks=nblocks threads=nthreads stream=custreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); - else - read_h2d_async!(recvbuf_flat(n,dim,i,F), A, recvranges(n,dim,F), custreams[n,i]); - end - end - end -end - - -# (AMDGPU functions) - -function allocate_rocstreams(fields::GGField...) - allocate_rocstreams_iwrite(fields...); - allocate_rocstreams_iread(fields...); -end - -let - global iwrite_sendbufs!, allocate_rocstreams_iwrite, wait_iwrite - - rocstreams = Array{AMDGPU.HIPStream}(undef, NNEIGHBORS_PER_DIM, 0) - - wait_iwrite(n::Integer, A::ROCField{T}, i::Integer) where T <: GGNumber = AMDGPU.synchronize(rocstreams[n,i]); - - function allocate_rocstreams_iwrite(fields::GGField...) - if length(fields) > size(rocstreams,2) # Note: for simplicity, we create a stream for every field even if it is not a ROCField - rocstreams = [rocstreams [AMDGPU.HIPStream(:high) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(rocstreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. - end - end - - function iwrite_sendbufs!(n::Integer, dim::Integer, F::ROCField{T}, i::Integer) where T <: GGNumber - A, halowidths = F; - if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... - # DEBUG: the follow section needs perf testing - # DEBUG 2: commenting read_h2d_async! 
for now - # if dim == 1 || amdgpuaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). - ranges = sendranges(n, dim, F); - nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1); - halosize = [r[end] - r[1] + 1 for r in ranges]; - nblocks = Tuple(ceil.(Int, halosize./nthreads)); - @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] write_d2x!(gpusendbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); - # else - # write_d2h_async!(sendbuf_flat(n,dim,i,F), A, sendranges(n,dim,F), rocstreams[n,i]); - # end - end - end -end - -let - global iread_recvbufs!, allocate_rocstreams_iread, wait_iread - - rocstreams = Array{AMDGPU.HIPStream}(undef, NNEIGHBORS_PER_DIM, 0) - - wait_iread(n::Integer, A::ROCField{T}, i::Integer) where T <: GGNumber = AMDGPU.synchronize(rocstreams[n,i]); - - function allocate_rocstreams_iread(fields::GGField...) - if length(fields) > size(rocstreams,2) # Note: for simplicity, we create a stream for every field even if it is not a ROCField - rocstreams = [rocstreams [AMDGPU.HIPStream(:high) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(rocstreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. - end - end - - function iread_recvbufs!(n::Integer, dim::Integer, F::ROCField{T}, i::Integer) where T <: GGNumber - A, halowidths = F; - if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... - # DEBUG: the follow section needs perf testing - # DEBUG 2: commenting read_h2d_async! for now - # if dim == 1 || amdgpuaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). - ranges = recvranges(n, dim, F); - nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1); - halosize = [r[end] - r[1] + 1 for r in ranges]; - nblocks = Tuple(ceil.(Int, halosize./nthreads)); - @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); - # else - # read_h2d_async!(recvbuf_flat(n,dim,i,F), A, recvranges(n,dim,F), rocstreams[n,i]); - # end - end - end - -end - - # (CPU/GPU functions) # Return the ranges from A to be sent. It will always return ranges for the dimensions x,y and z even if the A is 1D or 2D (for 2D, the 3rd range is 1:1; for 1D, the 2nd and 3rd range are 1:1). @@ -472,105 +335,6 @@ function read_h2h!(recvbuf::AbstractArray{T}, A::Array{T}, recvranges::Array{Uni end -# (CUDA functions) - -# Write to the send buffer on the host or device from the array on the device (d2x). -function write_d2x!(gpusendbuf::CuDeviceArray{T}, A::CuDeviceArray{T}, sendrangex::UnitRange{Int64}, sendrangey::UnitRange{Int64}, sendrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber - ix = (CUDA.blockIdx().x-1) * CUDA.blockDim().x + CUDA.threadIdx().x + sendrangex[1] - 1 - iy = (CUDA.blockIdx().y-1) * CUDA.blockDim().y + CUDA.threadIdx().y + sendrangey[1] - 1 - iz = (CUDA.blockIdx().z-1) * CUDA.blockDim().z + CUDA.threadIdx().z + sendrangez[1] - 1 - if !(ix in sendrangex && iy in sendrangey && iz in sendrangez) return nothing; end - gpusendbuf[ix-(sendrangex[1]-1),iy-(sendrangey[1]-1),iz-(sendrangez[1]-1)] = A[ix,iy,iz]; - return nothing -end - -# Read from the receive buffer on the host or device and store on the array on the device (x2d). 
-function read_x2d!(gpurecvbuf::CuDeviceArray{T}, A::CuDeviceArray{T}, recvrangex::UnitRange{Int64}, recvrangey::UnitRange{Int64}, recvrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber - ix = (CUDA.blockIdx().x-1) * CUDA.blockDim().x + CUDA.threadIdx().x + recvrangex[1] - 1 - iy = (CUDA.blockIdx().y-1) * CUDA.blockDim().y + CUDA.threadIdx().y + recvrangey[1] - 1 - iz = (CUDA.blockIdx().z-1) * CUDA.blockDim().z + CUDA.threadIdx().z + recvrangez[1] - 1 - if !(ix in recvrangex && iy in recvrangey && iz in recvrangez) return nothing; end - A[ix,iy,iz] = gpurecvbuf[ix-(recvrangex[1]-1),iy-(recvrangey[1]-1),iz-(recvrangez[1]-1)]; - return nothing -end - -# Write to the send buffer on the host from the array on the device (d2h). -function write_d2h_async!(sendbuf::AbstractArray{T}, A::CuArray{T}, sendranges::Array{UnitRange{T2},1}, custream::CuStream) where T <: GGNumber where T2 <: Integer - CUDA.Mem.unsafe_copy3d!( - pointer(sendbuf), CUDA.Mem.Host, pointer(A), CUDA.Mem.Device, - length(sendranges[1]), length(sendranges[2]), length(sendranges[3]); - srcPos=(sendranges[1][1], sendranges[2][1], sendranges[3][1]), - srcPitch=sizeof(T)*size(A,1), srcHeight=size(A,2), - dstPitch=sizeof(T)*length(sendranges[1]), dstHeight=length(sendranges[2]), - async=true, stream=custream - ) -end - -# Read from the receive buffer on the host and store on the array on the device (h2d). -function read_h2d_async!(recvbuf::AbstractArray{T}, A::CuArray{T}, recvranges::Array{UnitRange{T2},1}, custream::CuStream) where T <: GGNumber where T2 <: Integer - CUDA.Mem.unsafe_copy3d!( - pointer(A), CUDA.Mem.Device, pointer(recvbuf), CUDA.Mem.Host, - length(recvranges[1]), length(recvranges[2]), length(recvranges[3]); - dstPos=(recvranges[1][1], recvranges[2][1], recvranges[3][1]), - srcPitch=sizeof(T)*length(recvranges[1]), srcHeight=length(recvranges[2]), - dstPitch=sizeof(T)*size(A,1), dstHeight=size(A,2), - async=true, stream=custream - ) -end - - -# (AMDGPU functions) - -# Write to the send buffer on the host or device from the array on the device (d2x). -function write_d2x!(gpusendbuf::ROCDeviceArray{T}, A::ROCDeviceArray{T}, sendrangex::UnitRange{Int64}, sendrangey::UnitRange{Int64}, sendrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber - ix = (AMDGPU.workgroupIdx().x-1) * AMDGPU.workgroupDim().x + AMDGPU.workitemIdx().x + sendrangex[1] - 1 - iy = (AMDGPU.workgroupIdx().y-1) * AMDGPU.workgroupDim().y + AMDGPU.workitemIdx().y + sendrangey[1] - 1 - iz = (AMDGPU.workgroupIdx().z-1) * AMDGPU.workgroupDim().z + AMDGPU.workitemIdx().z + sendrangez[1] - 1 - if !(ix in sendrangex && iy in sendrangey && iz in sendrangez) return nothing; end - gpusendbuf[ix-(sendrangex[1]-1),iy-(sendrangey[1]-1),iz-(sendrangez[1]-1)] = A[ix,iy,iz]; - return nothing -end - -# Read from the receive buffer on the host or device and store on the array on the device (x2d). 
-function read_x2d!(gpurecvbuf::ROCDeviceArray{T}, A::ROCDeviceArray{T}, recvrangex::UnitRange{Int64}, recvrangey::UnitRange{Int64}, recvrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber - ix = (AMDGPU.workgroupIdx().x-1) * AMDGPU.workgroupDim().x + AMDGPU.workitemIdx().x + recvrangex[1] - 1 - iy = (AMDGPU.workgroupIdx().y-1) * AMDGPU.workgroupDim().y + AMDGPU.workitemIdx().y + recvrangey[1] - 1 - iz = (AMDGPU.workgroupIdx().z-1) * AMDGPU.workgroupDim().z + AMDGPU.workitemIdx().z + recvrangez[1] - 1 - if !(ix in recvrangex && iy in recvrangey && iz in recvrangez) return nothing; end - A[ix,iy,iz] = gpurecvbuf[ix-(recvrangex[1]-1),iy-(recvrangey[1]-1),iz-(recvrangez[1]-1)]; - return nothing -end - -# Write to the send buffer on the host from the array on the device (d2h). -function write_d2h_async!(sendbuf::AbstractArray{T}, A::ROCArray{T}, sendranges::Array{UnitRange{T2},1}, rocstream::AMDGPU.HIPStream) where T <: GGNumber where T2 <: Integer - buf_view = reshape(sendbuf, Tuple(length.(sendranges))) - AMDGPU.Mem.unsafe_copy3d!( - pointer(sendbuf), AMDGPU.Mem.HostBuffer, - pointer(A), typeof(A.buf), - length(sendranges[1]), length(sendranges[2]), length(sendranges[3]); - srcPos=(sendranges[1][1], sendranges[2][1], sendranges[3][1]), - dstPitch=sizeof(T) * size(buf_view, 1), dstHeight=size(buf_view, 2), - srcPitch=sizeof(T) * size(A, 1), srcHeight=size(A, 2), - async=true, stream=rocstream - ) - return nothing -end - -# Read from the receive buffer on the host and store on the array on the device (h2d). -function read_h2d_async!(recvbuf::AbstractArray{T}, A::ROCArray{T}, recvranges::Array{UnitRange{T2},1}, rocstream::AMDGPU.HIPStream) where T <: GGNumber where T2 <: Integer - buf_view = reshape(recvbuf, Tuple(length.(recvranges))) - AMDGPU.Mem.unsafe_copy3d!( - pointer(A), typeof(A.buf), - pointer(recvbuf), AMDGPU.Mem.HostBuffer, - length(recvranges[1]), length(recvranges[2]), length(recvranges[3]); - dstPos=(recvranges[1][1], recvranges[2][1], recvranges[3][1]), - dstPitch=sizeof(T) * size(A, 1), dstHeight=size(A, 2), - srcPitch=sizeof(T) * size(buf_view, 1), srcHeight=size(buf_view, 2), - async=true, stream=rocstream - ) - return nothing -end - ##------------------------------ ## FUNCTIONS TO SEND/RECV FIELDS @@ -643,20 +407,6 @@ function memcopy_threads!(dst::AbstractArray{T}, src::AbstractArray{T}) where T end -# (CUDA functions) - -function gpumemcopy!(dst::CuArray{T}, src::CuArray{T}) where T <: GGNumber - @inbounds CUDA.copyto!(dst, src) -end - - -# (AMDGPU functions) - -function gpumemcopy!(dst::ROCArray{T}, src::ROCArray{T}) where T <: GGNumber - @inbounds AMDGPU.copyto!(dst, src) -end - - ##------------------------------------------- ## FUNCTIONS FOR CHECKING THE INPUT ARGUMENTS From 88408c0cfd3153e6080dfc0c109bddc4684a3737 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Wed, 17 Jan 2024 17:08:38 +0100 Subject: [PATCH 08/34] clean up shared --- src/AMDGPUExt/shared.jl | 16 ++++++++++++++++ src/CUDAExt/shared.jl | 16 ++++++++++++++++ src/shared.jl | 4 ++-- 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/src/AMDGPUExt/shared.jl b/src/AMDGPUExt/shared.jl index 17c588f..af60b24 100644 --- a/src/AMDGPUExt/shared.jl +++ b/src/AMDGPUExt/shared.jl @@ -1,7 +1,23 @@ +import ImplicitGlobalGrid +import ImplicitGlobalGrid: GGArray, GGField, GGNumber +using AMDGPU + + +##------ +## TYPES + const ROCField{T,N} = GGField{T,N,ROCArray{T,N}} + +##------------- +## SYNTAX SUGAR + is_rocarray(A::GGArray) = typeof(A) <: ROCArray #NOTE: this function is only to be used when 
multiple dispatch on the type of the array seems an overkill (in particular when only something needs to be done for the GPU case, but nothing for the CPU case) and as long as performance does not suffer. + +##-------------------------------------------------------------------------------- +## FUNCTIONS FOR WRAPPING ARRAYS AND FIELDS AND DEFINE ARRAY PROPERTY BASE METHODS + wrap_field(A::ROCArray, hw::Tuple) = ROCField{eltype(A), ndims(A)}((A, hw)) Base.size(A::ROCField) = Base.size(A.A) diff --git a/src/CUDAExt/shared.jl b/src/CUDAExt/shared.jl index d7ca46b..6ac949d 100644 --- a/src/CUDAExt/shared.jl +++ b/src/CUDAExt/shared.jl @@ -1,7 +1,23 @@ +import ImplicitGlobalGrid +import ImplicitGlobalGrid: GGArray, GGField, GGNumber +using CUDA + + +##------ +## TYPES + const CuField{T,N} = GGField{T,N,CuArray{T,N}} + +##------------- +## SYNTAX SUGAR + is_cuarray(A::GGArray) = typeof(A) <: CuArray #NOTE: this function is only to be used when multiple dispatch on the type of the array seems an overkill (in particular when only something needs to be done for the GPU case, but nothing for the CPU case) and as long as performance does not suffer. + +##-------------------------------------------------------------------------------- +## FUNCTIONS FOR WRAPPING ARRAYS AND FIELDS AND DEFINE ARRAY PROPERTY BASE METHODS + wrap_field(A::CuArray, hw::Tuple) = CuField{eltype(A), ndims(A)}((A, hw)) Base.size(A::CuField) = Base.size(A.A) diff --git a/src/shared.jl b/src/shared.jl index 7d89f7f..82106ff 100644 --- a/src/shared.jl +++ b/src/shared.jl @@ -1,6 +1,6 @@ import MPI -using CUDA -using AMDGPU +using CUDA #TODO: to be removed! +using AMDGPU #TODO: to be removed! using Base.Threads From 9562e1d9ab74a5d9af812d54a42a3ad99009d336 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Wed, 17 Jan 2024 17:16:00 +0100 Subject: [PATCH 09/34] clean up shared --- src/AMDGPUExt/defaults.jl | 2 ++ src/AMDGPUExt/shared.jl | 4 ++-- src/CUDAExt/defaults.jl | 2 ++ src/CUDAExt/shared.jl | 4 ++-- src/shared.jl | 2 +- 5 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/AMDGPUExt/defaults.jl b/src/AMDGPUExt/defaults.jl index 627a6fe..b2bcf49 100644 --- a/src/AMDGPUExt/defaults.jl +++ b/src/AMDGPUExt/defaults.jl @@ -1 +1,3 @@ +# shared.jl + is_rocarray(A::GGArray) = false \ No newline at end of file diff --git a/src/AMDGPUExt/shared.jl b/src/AMDGPUExt/shared.jl index af60b24..8a0af8e 100644 --- a/src/AMDGPUExt/shared.jl +++ b/src/AMDGPUExt/shared.jl @@ -12,13 +12,13 @@ const ROCField{T,N} = GGField{T,N,ROCArray{T,N}} ##------------- ## SYNTAX SUGAR -is_rocarray(A::GGArray) = typeof(A) <: ROCArray #NOTE: this function is only to be used when multiple dispatch on the type of the array seems an overkill (in particular when only something needs to be done for the GPU case, but nothing for the CPU case) and as long as performance does not suffer. +ImplicitGlobalGrid.is_rocarray(A::GGArray) = typeof(A) <: ROCArray #NOTE: this function is only to be used when multiple dispatch on the type of the array seems an overkill (in particular when only something needs to be done for the GPU case, but nothing for the CPU case) and as long as performance does not suffer. 
##-------------------------------------------------------------------------------- ## FUNCTIONS FOR WRAPPING ARRAYS AND FIELDS AND DEFINE ARRAY PROPERTY BASE METHODS -wrap_field(A::ROCArray, hw::Tuple) = ROCField{eltype(A), ndims(A)}((A, hw)) +ImplicitGlobalGrid.wrap_field(A::ROCArray, hw::Tuple) = ROCField{eltype(A), ndims(A)}((A, hw)) Base.size(A::ROCField) = Base.size(A.A) Base.size(A::ROCField, args...) = Base.size(A.A, args...) diff --git a/src/CUDAExt/defaults.jl b/src/CUDAExt/defaults.jl index 184f015..3bd9ecb 100644 --- a/src/CUDAExt/defaults.jl +++ b/src/CUDAExt/defaults.jl @@ -1 +1,3 @@ +# shared.jl + is_cuarray(A::GGArray) = false diff --git a/src/CUDAExt/shared.jl b/src/CUDAExt/shared.jl index 6ac949d..4781fc4 100644 --- a/src/CUDAExt/shared.jl +++ b/src/CUDAExt/shared.jl @@ -12,13 +12,13 @@ const CuField{T,N} = GGField{T,N,CuArray{T,N}} ##------------- ## SYNTAX SUGAR -is_cuarray(A::GGArray) = typeof(A) <: CuArray #NOTE: this function is only to be used when multiple dispatch on the type of the array seems an overkill (in particular when only something needs to be done for the GPU case, but nothing for the CPU case) and as long as performance does not suffer. +ImplicitGlobalGrid.is_cuarray(A::GGArray) = typeof(A) <: CuArray #NOTE: this function is only to be used when multiple dispatch on the type of the array seems an overkill (in particular when only something needs to be done for the GPU case, but nothing for the CPU case) and as long as performance does not suffer. ##-------------------------------------------------------------------------------- ## FUNCTIONS FOR WRAPPING ARRAYS AND FIELDS AND DEFINE ARRAY PROPERTY BASE METHODS -wrap_field(A::CuArray, hw::Tuple) = CuField{eltype(A), ndims(A)}((A, hw)) +ImplicitGlobalGrid.wrap_field(A::CuArray, hw::Tuple) = CuField{eltype(A), ndims(A)}((A, hw)) Base.size(A::CuField) = Base.size(A.A) Base.size(A::CuField, args...) = Base.size(A.A, args...) 
diff --git a/src/shared.jl b/src/shared.jl
index 82106ff..27104c8 100644
--- a/src/shared.jl
+++ b/src/shared.jl
@@ -134,4 +134,4 @@ Base.eltype(A::Union{GGField, CPUField}) = Base.eltype(A.A)
 ##------------------------------------------
 ## CUDA AND AMDGPU COMMON EXTENSION DEFAULTS
 
-function register end
\ No newline at end of file
+# TODO: this should not be required as only called from the extensions #function register end
\ No newline at end of file

From 5cc267e6795d303bfe7412403cd3c19682285568 Mon Sep 17 00:00:00 2001
From: Samuel Omlin
Date: Wed, 17 Jan 2024 17:19:41 +0100
Subject: [PATCH 10/34] include shared.jl in extension module

---
 ext/ImplicitGlobalGrid_AMDGPUExt.jl | 1 +
 ext/ImplicitGlobalGrid_CUDAExt.jl | 1 +
 2 files changed, 2 insertions(+)

diff --git a/ext/ImplicitGlobalGrid_AMDGPUExt.jl b/ext/ImplicitGlobalGrid_AMDGPUExt.jl
index b0bf2e8..dbc5bf3 100644
--- a/ext/ImplicitGlobalGrid_AMDGPUExt.jl
+++ b/ext/ImplicitGlobalGrid_AMDGPUExt.jl
@@ -1,3 +1,4 @@
 module ImplicitGlobalGrid_AMDGPUExt
+    include(joinpath(@__DIR__, "..", "src", "AMDGPUExt", "shared.jl"))
     include(joinpath(@__DIR__, "..", "src", "AMDGPUExt", "update_halo.jl"))
 end
\ No newline at end of file
diff --git a/ext/ImplicitGlobalGrid_CUDAExt.jl b/ext/ImplicitGlobalGrid_CUDAExt.jl
index 2d1b311..381fd59 100644
--- a/ext/ImplicitGlobalGrid_CUDAExt.jl
+++ b/ext/ImplicitGlobalGrid_CUDAExt.jl
@@ -1,3 +1,4 @@
 module ImplicitGlobalGrid_CUDAExt
+    include(joinpath(@__DIR__, "..", "src", "CUDAExt", "shared.jl"))
     include(joinpath(@__DIR__, "..", "src", "CUDAExt", "update_halo.jl"))
 end
\ No newline at end of file

From 1867033380b0cc3182c48a5323ca02ddd5ad8220 Mon Sep 17 00:00:00 2001
From: Samuel Omlin
Date: Wed, 17 Jan 2024 19:11:22 +0100
Subject: [PATCH 11/34] create defaults and extension methods related to buffer allocation

---
 src/AMDGPUExt/defaults.jl | 14 +++++++++++-
 src/AMDGPUExt/update_halo.jl | 29 +++++++++++++++++++++----
 src/CUDAExt/defaults.jl | 12 +++++++++++
 src/CUDAExt/update_halo.jl | 29 +++++++++++++++++++++----
 src/update_halo.jl | 41 +++++++++++++++++++-----------------
 5 files changed, 97 insertions(+), 28 deletions(-)

diff --git a/src/AMDGPUExt/defaults.jl b/src/AMDGPUExt/defaults.jl
index b2bcf49..ce33e7a 100644
--- a/src/AMDGPUExt/defaults.jl
+++ b/src/AMDGPUExt/defaults.jl
@@ -1,3 +1,15 @@
 # shared.jl
 
-is_rocarray(A::GGArray) = false
\ No newline at end of file
+is_rocarray(A::GGArray) = false
+
+
+# update_halo.jl
+
+function free_update_halo_rocbuffers end
+function init_rocbufs_arrays end
+function init_rocbufs end
+function reinterpret_rocbufs end
+function reallocate_undersized_rocbufs end
+function reregister_rocbufs end
+function get_rocsendbufs_raw end
+function get_rocrecvbufs_raw end
\ No newline at end of file
diff --git a/src/AMDGPUExt/update_halo.jl b/src/AMDGPUExt/update_halo.jl
index 381f5f4..b8110a5 100644
--- a/src/AMDGPUExt/update_halo.jl
+++ b/src/AMDGPUExt/update_halo.jl
@@ -3,8 +3,21 @@
 # NOTE: CUDA and AMDGPU buffers live and are dealt with independently, enabling the support of usage of CUDA and AMD GPUs at the same time.
 
+ImplicitGlobalGrid.free_update_halo_rocbuffers(args...) = free_update_halo_rocbuffers(args...)
+ImplicitGlobalGrid.init_rocbufs_arrays(args...) = init_rocbufs_arrays(args...)
+ImplicitGlobalGrid.init_rocbufs(args...) = init_rocbufs(args...)
+ImplicitGlobalGrid.reinterpret_rocbufs(args...) = reinterpret_rocbufs(args...)
+ImplicitGlobalGrid.reallocate_undersized_rocbufs(args...) = reallocate_undersized_rocbufs(args...)
+ImplicitGlobalGrid.reregister_rocbufs(args...) = reregister_rocbufs(args...)
+ImplicitGlobalGrid.get_rocsendbufs_raw(args...) = get_rocsendbufs_raw(args...)
+ImplicitGlobalGrid.get_rocrecvbufs_raw(args...) = get_rocrecvbufs_raw(args...)
+ImplicitGlobalGrid.gpusendbuf(args..., A::ROCField) = gpusendbuf(args..., A)
+ImplicitGlobalGrid.gpurecvbuf(args..., A::ROCField) = gpurecvbuf(args..., A)
+ImplicitGlobalGrid.gpusendbuf_flat(args..., A::ROCField) = gpusendbuf_flat(args..., A)
+ImplicitGlobalGrid.gpurecvbuf_flat(args..., A::ROCField) = gpurecvbuf_flat(args..., A)
+
 let
-    global free_update_halo_rocbuffers, reset_roc_buffers, free_rocbufs
+    global free_update_halo_rocbuffers, init_rocbufs_arrays, init_rocbufs, reinterpret_rocbufs, reregister_rocbufs, reallocate_undersized_rocbufs
     global gpusendbuf, gpurecvbuf, gpusendbuf_flat, gpurecvbuf_flat
     rocsendbufs_raw = nothing
     rocrecvbufs_raw = nothing
@@ -57,12 +70,20 @@ let
         if (eltype(rocrecvbufs_raw[i][n]) != T) rocrecvbufs_raw[i][n] = reinterpret(T, rocrecvbufs_raw[i][n]); end
     end
 
+    function reallocate_undersized_rocbufs(T::DataType, i::Integer, max_halo_elems::Integer)
+        if (!isnothing(rocsendbufs_raw) && length(rocsendbufs_raw[i][1]) < max_halo_elems)
+            for n = 1:NNEIGHBORS_PER_DIM
+                reallocate_rocbufs(T, i, n, max_halo_elems); GC.gc(); # Too small buffers had been replaced with larger ones; free the unused memory immediately. (NOTE: the caller already guarantees is_rocarray(A) and any(amdgpuaware_MPI()); no array A is in scope here.)
+            end
+        end
+    end
+
     function reallocate_rocbufs(T::DataType, i::Integer, n::Integer, max_halo_elems::Integer)
         rocsendbufs_raw[i][n] = AMDGPU.zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY); # Ensure that the amount of allocated memory is a multiple of 4*sizeof(T) (sizeof(Float64)/sizeof(Float16) = 4). So, we can always correctly reinterpret the raw buffers even if next time sizeof(T) is greater.
         rocrecvbufs_raw[i][n] = AMDGPU.zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY);
     end
 
-    function reregister_rocbufs(T::DataType, i::Integer, n::Integer)
+    function reregister_rocbufs(T::DataType, i::Integer, n::Integer, sendbufs_raw, recvbufs_raw)
         # INFO: no need for roc host buffers
         rocsendbufs_raw[i][n] = register(ROCArray,sendbufs_raw[i][n]);
         rocrecvbufs_raw[i][n] = register(ROCArray,recvbufs_raw[i][n]);
@@ -83,11 +104,11 @@ let
 
     # (GPU functions)
 
    #TODO: see if remove T here and in other cases for CuArray, ROCArray or Array (but then it does not verify that CuArray/ROCArray is of type GGNumber) or if I should instead change GGArray to GGArrayUnion and create: GGArray = Array{T} where T <: GGNumber and GGCuArray = CuArray{T} where T <: GGNumber; This is however more difficult to read and understand for others.
-    function gpusendbuf(n::Integer, dim::Integer, i::Integer, A::ROCField{T} where T <: GGNumber
+    function gpusendbuf(n::Integer, dim::Integer, i::Integer, A::ROCField{T}) where T <: GGNumber
         return reshape(gpusendbuf_flat(n,dim,i,A), halosize(dim,A));
     end
 
-    function gpurecvbuf(n::Integer, dim::Integer, i::Integer, A::ROCField{T} where T <: GGNumber
+    function gpurecvbuf(n::Integer, dim::Integer, i::Integer, A::ROCField{T}) where T <: GGNumber
         return reshape(gpurecvbuf_flat(n,dim,i,A), halosize(dim,A));
     end
 
diff --git a/src/CUDAExt/defaults.jl b/src/CUDAExt/defaults.jl
index 3bd9ecb..957178b 100644
--- a/src/CUDAExt/defaults.jl
+++ b/src/CUDAExt/defaults.jl
@@ -1,3 +1,15 @@
 # shared.jl
 
 is_cuarray(A::GGArray) = false
+
+
+# update_halo.jl
+
+function free_update_halo_cubuffers end
+function init_cubufs_arrays end
+function init_cubufs end
+function reinterpret_cubufs end
+function reallocate_undersized_cubufs end
+function reregister_cubufs end
+function get_cusendbufs_raw end
+function get_curecvbufs_raw end
\ No newline at end of file
diff --git a/src/CUDAExt/update_halo.jl b/src/CUDAExt/update_halo.jl
index bd58653..5c5c9aa 100644
--- a/src/CUDAExt/update_halo.jl
+++ b/src/CUDAExt/update_halo.jl
@@ -3,8 +3,21 @@
 # NOTE: CUDA and AMDGPU buffers live and are dealt with independently, enabling the support of usage of CUDA and AMD GPUs at the same time.
 
+ImplicitGlobalGrid.free_update_halo_cubuffers(args...) = free_update_halo_cubuffers(args...)
+ImplicitGlobalGrid.init_cubufs_arrays(args...) = init_cubufs_arrays(args...)
+ImplicitGlobalGrid.init_cubufs(args...) = init_cubufs(args...)
+ImplicitGlobalGrid.reinterpret_cubufs(args...) = reinterpret_cubufs(args...)
+ImplicitGlobalGrid.reallocate_undersized_cubufs(args...) = reallocate_undersized_cubufs(args...)
+ImplicitGlobalGrid.reregister_cubufs(args...) = reregister_cubufs(args...)
+ImplicitGlobalGrid.get_cusendbufs_raw(args...) = get_cusendbufs_raw(args...)
+ImplicitGlobalGrid.get_curecvbufs_raw(args...) = get_curecvbufs_raw(args...)
+ImplicitGlobalGrid.gpusendbuf(args..., A::CuField)= gpusendbuf(args..., A)
+ImplicitGlobalGrid.gpurecvbuf(args..., A::CuField)= gpurecvbuf(args..., A)
+ImplicitGlobalGrid.gpusendbuf_flat(args..., A::CuField)= gpusendbuf_flat(args..., A)
+ImplicitGlobalGrid.gpurecvbuf_flat(args..., A::CuField)= gpurecvbuf_flat(args..., A)
+
 let
-    global free_update_halo_cubuffers, reset_cu_buffers, free_cubufs, unregister_cubufs
+    global free_update_halo_cubuffers, init_cubufs_arrays, init_cubufs, reinterpret_cubufs, reregister_cubufs, reallocate_undersized_cubufs
     global gpusendbuf, gpurecvbuf, gpusendbuf_flat, gpurecvbuf_flat
     cusendbufs_raw = nothing
     curecvbufs_raw = nothing
@@ -68,12 +81,20 @@ let
         if (eltype(curecvbufs_raw[i][n]) != T) curecvbufs_raw[i][n] = reinterpret(T, curecvbufs_raw[i][n]); end
     end
 
+    function reallocate_undersized_cubufs(T::DataType, i::Integer, max_halo_elems::Integer)
+        if (!isnothing(cusendbufs_raw) && length(cusendbufs_raw[i][1]) < max_halo_elems)
+            for n = 1:NNEIGHBORS_PER_DIM
+                reallocate_cubufs(T, i, n, max_halo_elems); GC.gc(); # Too small buffers had been replaced with larger ones; free the unused memory immediately. (NOTE: the caller already guarantees is_cuarray(A) and any(cudaaware_MPI()); no array A is in scope here.)
+            end
+        end
+    end
+
     function reallocate_cubufs(T::DataType, i::Integer, n::Integer, max_halo_elems::Integer)
         cusendbufs_raw[i][n] = CUDA.zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY); # Ensure that the amount of allocated memory is a multiple of 4*sizeof(T) (sizeof(Float64)/sizeof(Float16) = 4).
So, we can always correctly reinterpret the raw buffers even if next time sizeof(T) is greater. curecvbufs_raw[i][n] = CUDA.zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY); end - function reregister_cubufs(T::DataType, i::Integer, n::Integer) + function reregister_cubufs(T::DataType, i::Integer, n::Integer, sendbufs_raw, recvbufs_raw) if (isa(cusendbufs_raw_h[i][n],CUDA.Mem.HostBuffer)) CUDA.Mem.unregister(cusendbufs_raw_h[i][n]); cusendbufs_raw_h[i][n] = []; end # It is always initialized registered... if (cusendbufs_raw_h[i][n].bytesize > 32*sizeof(T)) if (isa(curecvbufs_raw_h[i][n],CUDA.Mem.HostBuffer)) CUDA.Mem.unregister(curecvbufs_raw_h[i][n]); curecvbufs_raw_h[i][n] = []; end # It is always initialized registered... if (curecvbufs_raw_h[i][n].bytesize > 32*sizeof(T)) cusendbufs_raw[i][n], cusendbufs_raw_h[i][n] = register(CuArray,sendbufs_raw[i][n]); @@ -95,11 +116,11 @@ let # (GPU functions) #TODO: see if remove T here and in other cases for CuArray, ROCArray or Array (but then it does not verify that CuArray/ROCArray is of type GGNumber) or if I should instead change GGArray to GGArrayUnion and create: GGArray = Array{T} where T <: GGNumber and GGCuArray = CuArray{T} where T <: GGNumber; This is however more difficult to read and understand for others. - function gpusendbuf(n::Integer, dim::Integer, i::Integer, A::CuField{T} where T <: GGNumber + function gpusendbuf(n::Integer, dim::Integer, i::Integer, A::CuField{T}) where T <: GGNumber return reshape(gpusendbuf_flat(n,dim,i,A), halosize(dim,A)); end - function gpurecvbuf(n::Integer, dim::Integer, i::Integer, A::CuField{T} where T <: GGNumber + function gpurecvbuf(n::Integer, dim::Integer, i::Integer, A::CuField{T}) where T <: GGNumber return reshape(gpurecvbuf_flat(n,dim,i,A), halosize(dim,A)); end diff --git a/src/update_halo.jl b/src/update_halo.jl index 3413973..fc6bb9d 100644 --- a/src/update_halo.jl +++ b/src/update_halo.jl @@ -96,7 +96,8 @@ halosize(dim::Integer, A::GGField) = (dim==1) ? (A.halowidths[1], size(A,2), siz let #TODO: this was: global free_update_halo_buffers, allocate_bufs, sendbuf, recvbuf, sendbuf_flat, recvbuf_flat, gpusendbuf, gpurecvbuf, gpusendbuf_flat, gpurecvbuf_flat, rocsendbuf, rocrecvbuf, rocsendbuf_flat, rocrecvbuf_flat - global free_update_halo_buffers, allocate_bufs, sendbuf, recvbuf, sendbuf_flat, recvbuf_flat, gpusendbuf, gpurecvbuf, gpusendbuf_flat, gpurecvbuf_flat, rocsendbuf, rocrecvbuf, rocsendbuf_flat, rocrecvbuf_flat + global gpusendbuf, gpurecvbuf, gpusendbuf_flat, gpurecvbuf_flat # TODO: this is to be removed if the corresponding functions are moved. + global free_update_halo_buffers, allocate_bufs, sendbuf, recvbuf, sendbuf_flat, recvbuf_flat sendbufs_raw = nothing recvbufs_raw = nothing @@ -135,24 +136,9 @@ let if amdgpu_enabled() reinterpret_rocbufs(T, i, n); end end max_halo_elems = maximum((size(A,1)*size(A,2)*halowidths[3], size(A,1)*size(A,3)*halowidths[2], size(A,2)*size(A,3)*halowidths[1])); - if (length(sendbufs_raw[i][1]) < max_halo_elems) - for n = 1:NNEIGHBORS_PER_DIM - reallocate_bufs(T, i, n, max_halo_elems); - if (is_cuarray(A) && none(cudaaware_MPI())) reregister_cubufs(T, i, n); end # Host memory is page-locked (and mapped to device memory) to ensure optimal access performance (from kernel or with 3-D memcopy). - if (is_rocarray(A) && none(amdgpuaware_MPI())) reregister_rocbufs(T, i, n); end # ... - end - GC.gc(); # Too small buffers had been replaced with larger ones; free the now unused memory. 
- end - if (!isnothing(cusendbufs_raw) && length(cusendbufs_raw[i][1]) < max_halo_elems) - for n = 1:NNEIGHBORS_PER_DIM - if (is_cuarray(A) && any(cudaaware_MPI())) reallocate_cubufs(T, i, n, max_halo_elems); GC.gc(); end # Too small buffers had been replaced with larger ones; free the unused memory immediately. - end - end - if (!isnothing(rocsendbufs_raw) && length(rocsendbufs_raw[i][1]) < max_halo_elems) - for n = 1:NNEIGHBORS_PER_DIM - if (is_rocarray(A) && any(amdgpuaware_MPI())) reallocate_rocbufs(T, i, n, max_halo_elems); GC.gc(); end # Too small buffers had been replaced with larger ones; free the unused memory immediately. - end - end + reallocate_undersized_hostbufs(T, i, max_halo_elems); + if (is_cuarray(A) && any(cudaaware_MPI())) reallocate_undersized_cubufs(T, i, max_halo_elems) end + if (is_rocarray(A) && any(amdgpuaware_MPI())) reallocate_undersized_rocbufs(T, i, max_halo_elems) end end end @@ -174,6 +160,17 @@ let if (eltype(recvbufs_raw[i][n]) != T) recvbufs_raw[i][n] = reinterpret(T, recvbufs_raw[i][n]); end end + function reallocate_undersized_hostbufs(T::DataType, i::Integer, max_halo_elems::Integer) + if (length(sendbufs_raw[i][1]) < max_halo_elems) + for n = 1:NNEIGHBORS_PER_DIM + reallocate_bufs(T, i, n, max_halo_elems); + if (is_cuarray(A) && none(cudaaware_MPI())) reregister_cubufs(T, i, n, sendbufs_raw, recvbufs_raw); end # Host memory is page-locked (and mapped to device memory) to ensure optimal access performance (from kernel or with 3-D memcopy). + if (is_rocarray(A) && none(amdgpuaware_MPI())) reregister_rocbufs(T, i, n, sendbufs_raw, recvbufs_raw); end # ... + end + GC.gc(); # Too small buffers had been replaced with larger ones; free the now unused memory. + end + end + function reallocate_bufs(T::DataType, i::Integer, n::Integer, max_halo_elems::Integer) sendbufs_raw[i][n] = zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY); # Ensure that the amount of allocated memory is a multiple of 4*sizeof(T) (sizeof(Float64)/sizeof(Float16) = 4). So, we can always correctly reinterpret the raw buffers even if next time sizeof(T) is greater. recvbufs_raw[i][n] = zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY); @@ -198,6 +195,12 @@ let return reshape(recvbuf_flat(n,dim,i,A), halosize(dim,A)); end + # (GPU defaults) #TODO: see where to move this! Maybe to shared_defaults.jl in src? + function gpusendbuf end + function gpurecvbuf end + function gpusendbuf_flat end + function gpurecvbuf_flat end + # Make sendbufs_raw and recvbufs_raw accessible for unit testing. 
global get_sendbufs_raw, get_recvbufs_raw get_sendbufs_raw() = deepcopy(sendbufs_raw) From 82a3cc228e3a204184ec3448c617915e9e84059f Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Wed, 17 Jan 2024 19:18:36 +0100 Subject: [PATCH 12/34] import constants related to buffer allocation --- src/AMDGPUExt/shared.jl | 1 + src/CUDAExt/shared.jl | 1 + 2 files changed, 2 insertions(+) diff --git a/src/AMDGPUExt/shared.jl b/src/AMDGPUExt/shared.jl index 8a0af8e..f8d2617 100644 --- a/src/AMDGPUExt/shared.jl +++ b/src/AMDGPUExt/shared.jl @@ -1,5 +1,6 @@ import ImplicitGlobalGrid import ImplicitGlobalGrid: GGArray, GGField, GGNumber +import ImplicitGlobalGrid: NNEIGHBORS_PER_DIM, GG_ALLOC_GRANULARITY using AMDGPU diff --git a/src/CUDAExt/shared.jl b/src/CUDAExt/shared.jl index 4781fc4..6b56438 100644 --- a/src/CUDAExt/shared.jl +++ b/src/CUDAExt/shared.jl @@ -1,5 +1,6 @@ import ImplicitGlobalGrid import ImplicitGlobalGrid: GGArray, GGField, GGNumber +import ImplicitGlobalGrid: NNEIGHBORS_PER_DIM, GG_ALLOC_GRANULARITY using CUDA From 9efc8ab0cac441b7791fef566c1402ab791e7a1f Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Wed, 17 Jan 2024 19:36:07 +0100 Subject: [PATCH 13/34] create defaults and extension methods related to buffer accessing --- src/AMDGPUExt/defaults.jl | 3 ++- src/AMDGPUExt/update_halo.jl | 12 ++++++------ src/CUDAExt/defaults.jl | 3 ++- src/CUDAExt/update_halo.jl | 12 ++++++------ src/update_halo.jl | 8 ++++++++ 5 files changed, 24 insertions(+), 14 deletions(-) diff --git a/src/AMDGPUExt/defaults.jl b/src/AMDGPUExt/defaults.jl index ce33e7a..50a1523 100644 --- a/src/AMDGPUExt/defaults.jl +++ b/src/AMDGPUExt/defaults.jl @@ -12,4 +12,5 @@ function reinterpret_rocbufs end function reallocate_undersized_rocbufs end function reregister_rocbufs end function get_rocsendbufs_raw end -function get_rocrecvbufs_raw end \ No newline at end of file +function get_rocrecvbufs_raw end +function allocate_rocstreams end \ No newline at end of file diff --git a/src/AMDGPUExt/update_halo.jl b/src/AMDGPUExt/update_halo.jl index b8110a5..0cc6b75 100644 --- a/src/AMDGPUExt/update_halo.jl +++ b/src/AMDGPUExt/update_halo.jl @@ -123,7 +123,7 @@ end ##---------------------------------------------- ## FUNCTIONS TO WRITE AND READ SEND/RECV BUFFERS -function allocate_rocstreams(fields::GGField...) +function ImplicitGlobalGrid.allocate_rocstreams(fields::GGField...) allocate_rocstreams_iwrite(fields...); allocate_rocstreams_iread(fields...); end @@ -195,7 +195,7 @@ end # (AMDGPU functions) # Write to the send buffer on the host or device from the array on the device (d2x). -function write_d2x!(gpusendbuf::ROCDeviceArray{T}, A::ROCDeviceArray{T}, sendrangex::UnitRange{Int64}, sendrangey::UnitRange{Int64}, sendrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber +function ImplicitGlobalGrid.write_d2x!(gpusendbuf::ROCDeviceArray{T}, A::ROCDeviceArray{T}, sendrangex::UnitRange{Int64}, sendrangey::UnitRange{Int64}, sendrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber ix = (AMDGPU.workgroupIdx().x-1) * AMDGPU.workgroupDim().x + AMDGPU.workitemIdx().x + sendrangex[1] - 1 iy = (AMDGPU.workgroupIdx().y-1) * AMDGPU.workgroupDim().y + AMDGPU.workitemIdx().y + sendrangey[1] - 1 iz = (AMDGPU.workgroupIdx().z-1) * AMDGPU.workgroupDim().z + AMDGPU.workitemIdx().z + sendrangez[1] - 1 @@ -205,7 +205,7 @@ function write_d2x!(gpusendbuf::ROCDeviceArray{T}, A::ROCDeviceArray{T}, sendran end # Read from the receive buffer on the host or device and store on the array on the device (x2d). 
-function read_x2d!(gpurecvbuf::ROCDeviceArray{T}, A::ROCDeviceArray{T}, recvrangex::UnitRange{Int64}, recvrangey::UnitRange{Int64}, recvrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber +function ImplicitGlobalGrid.read_x2d!(gpurecvbuf::ROCDeviceArray{T}, A::ROCDeviceArray{T}, recvrangex::UnitRange{Int64}, recvrangey::UnitRange{Int64}, recvrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber ix = (AMDGPU.workgroupIdx().x-1) * AMDGPU.workgroupDim().x + AMDGPU.workitemIdx().x + recvrangex[1] - 1 iy = (AMDGPU.workgroupIdx().y-1) * AMDGPU.workgroupDim().y + AMDGPU.workitemIdx().y + recvrangey[1] - 1 iz = (AMDGPU.workgroupIdx().z-1) * AMDGPU.workgroupDim().z + AMDGPU.workitemIdx().z + recvrangez[1] - 1 @@ -215,7 +215,7 @@ function read_x2d!(gpurecvbuf::ROCDeviceArray{T}, A::ROCDeviceArray{T}, recvrang end # Write to the send buffer on the host from the array on the device (d2h). -function write_d2h_async!(sendbuf::AbstractArray{T}, A::ROCArray{T}, sendranges::Array{UnitRange{T2},1}, rocstream::AMDGPU.HIPStream) where T <: GGNumber where T2 <: Integer +function ImplicitGlobalGrid.write_d2h_async!(sendbuf::AbstractArray{T}, A::ROCArray{T}, sendranges::Array{UnitRange{T2},1}, rocstream::AMDGPU.HIPStream) where T <: GGNumber where T2 <: Integer buf_view = reshape(sendbuf, Tuple(length.(sendranges))) AMDGPU.Mem.unsafe_copy3d!( pointer(sendbuf), AMDGPU.Mem.HostBuffer, @@ -230,7 +230,7 @@ function write_d2h_async!(sendbuf::AbstractArray{T}, A::ROCArray{T}, sendranges: end # Read from the receive buffer on the host and store on the array on the device (h2d). -function read_h2d_async!(recvbuf::AbstractArray{T}, A::ROCArray{T}, recvranges::Array{UnitRange{T2},1}, rocstream::AMDGPU.HIPStream) where T <: GGNumber where T2 <: Integer +function ImplicitGlobalGrid.read_h2d_async!(recvbuf::AbstractArray{T}, A::ROCArray{T}, recvranges::Array{UnitRange{T2},1}, rocstream::AMDGPU.HIPStream) where T <: GGNumber where T2 <: Integer buf_view = reshape(recvbuf, Tuple(length.(recvranges))) AMDGPU.Mem.unsafe_copy3d!( pointer(A), typeof(A.buf), @@ -248,6 +248,6 @@ end ##------------------------------ ## FUNCTIONS TO SEND/RECV FIELDS -function gpumemcopy!(dst::ROCArray{T}, src::ROCArray{T}) where T <: GGNumber +function ImplicitGlobalGrid.gpumemcopy!(dst::ROCArray{T}, src::ROCArray{T}) where T <: GGNumber @inbounds AMDGPU.copyto!(dst, src) end diff --git a/src/CUDAExt/defaults.jl b/src/CUDAExt/defaults.jl index 957178b..5389086 100644 --- a/src/CUDAExt/defaults.jl +++ b/src/CUDAExt/defaults.jl @@ -12,4 +12,5 @@ function reinterpret_cubufs end function reallocate_undersized_cubufs end function reregister_cubufs end function get_cusendbufs_raw end -function get_curecvbufs_raw end \ No newline at end of file +function get_curecvbufs_raw end +function allocate_custreams end \ No newline at end of file diff --git a/src/CUDAExt/update_halo.jl b/src/CUDAExt/update_halo.jl index 5c5c9aa..ea5e999 100644 --- a/src/CUDAExt/update_halo.jl +++ b/src/CUDAExt/update_halo.jl @@ -135,7 +135,7 @@ end ##---------------------------------------------- ## FUNCTIONS TO WRITE AND READ SEND/RECV BUFFERS -function allocate_custreams(fields::GGField...) +function ImplicitGlobalGrid.allocate_custreams(fields::GGField...) allocate_custreams_iwrite(fields...); allocate_custreams_iread(fields...); end @@ -202,7 +202,7 @@ end # (CUDA functions) # Write to the send buffer on the host or device from the array on the device (d2x). 
-function write_d2x!(gpusendbuf::CuDeviceArray{T}, A::CuDeviceArray{T}, sendrangex::UnitRange{Int64}, sendrangey::UnitRange{Int64}, sendrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber +function ImplicitGlobalGrid.write_d2x!(gpusendbuf::CuDeviceArray{T}, A::CuDeviceArray{T}, sendrangex::UnitRange{Int64}, sendrangey::UnitRange{Int64}, sendrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber ix = (CUDA.blockIdx().x-1) * CUDA.blockDim().x + CUDA.threadIdx().x + sendrangex[1] - 1 iy = (CUDA.blockIdx().y-1) * CUDA.blockDim().y + CUDA.threadIdx().y + sendrangey[1] - 1 iz = (CUDA.blockIdx().z-1) * CUDA.blockDim().z + CUDA.threadIdx().z + sendrangez[1] - 1 @@ -212,7 +212,7 @@ function write_d2x!(gpusendbuf::CuDeviceArray{T}, A::CuDeviceArray{T}, sendrange end # Read from the receive buffer on the host or device and store on the array on the device (x2d). -function read_x2d!(gpurecvbuf::CuDeviceArray{T}, A::CuDeviceArray{T}, recvrangex::UnitRange{Int64}, recvrangey::UnitRange{Int64}, recvrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber +function ImplicitGlobalGrid.read_x2d!(gpurecvbuf::CuDeviceArray{T}, A::CuDeviceArray{T}, recvrangex::UnitRange{Int64}, recvrangey::UnitRange{Int64}, recvrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber ix = (CUDA.blockIdx().x-1) * CUDA.blockDim().x + CUDA.threadIdx().x + recvrangex[1] - 1 iy = (CUDA.blockIdx().y-1) * CUDA.blockDim().y + CUDA.threadIdx().y + recvrangey[1] - 1 iz = (CUDA.blockIdx().z-1) * CUDA.blockDim().z + CUDA.threadIdx().z + recvrangez[1] - 1 @@ -222,7 +222,7 @@ function read_x2d!(gpurecvbuf::CuDeviceArray{T}, A::CuDeviceArray{T}, recvrangex end # Write to the send buffer on the host from the array on the device (d2h). -function write_d2h_async!(sendbuf::AbstractArray{T}, A::CuArray{T}, sendranges::Array{UnitRange{T2},1}, custream::CuStream) where T <: GGNumber where T2 <: Integer +function ImplicitGlobalGrid.write_d2h_async!(sendbuf::AbstractArray{T}, A::CuArray{T}, sendranges::Array{UnitRange{T2},1}, custream::CuStream) where T <: GGNumber where T2 <: Integer CUDA.Mem.unsafe_copy3d!( pointer(sendbuf), CUDA.Mem.Host, pointer(A), CUDA.Mem.Device, length(sendranges[1]), length(sendranges[2]), length(sendranges[3]); @@ -234,7 +234,7 @@ function write_d2h_async!(sendbuf::AbstractArray{T}, A::CuArray{T}, sendranges:: end # Read from the receive buffer on the host and store on the array on the device (h2d). -function read_h2d_async!(recvbuf::AbstractArray{T}, A::CuArray{T}, recvranges::Array{UnitRange{T2},1}, custream::CuStream) where T <: GGNumber where T2 <: Integer +function ImplicitGlobalGrid.read_h2d_async!(recvbuf::AbstractArray{T}, A::CuArray{T}, recvranges::Array{UnitRange{T2},1}, custream::CuStream) where T <: GGNumber where T2 <: Integer CUDA.Mem.unsafe_copy3d!( pointer(A), CUDA.Mem.Device, pointer(recvbuf), CUDA.Mem.Host, length(recvranges[1]), length(recvranges[2]), length(recvranges[3]); @@ -249,7 +249,7 @@ end ##------------------------------ ## FUNCTIONS TO SEND/RECV FIELDS -function gpumemcopy!(dst::CuArray{T}, src::CuArray{T}) where T <: GGNumber +function ImplicitGlobalGrid.gpumemcopy!(dst::CuArray{T}, src::CuArray{T}) where T <: GGNumber @inbounds CUDA.copyto!(dst, src) end diff --git a/src/update_halo.jl b/src/update_halo.jl index fc6bb9d..aad7008 100644 --- a/src/update_halo.jl +++ b/src/update_halo.jl @@ -276,6 +276,14 @@ let end +# TODO: see where to move this! Maybe to shared_defaults.jl in src? +function write_d2x! end +function read_x2d! end +function write_d2h_async! 
end +function read_h2d_async! end + +function gpumemcopy! end + # (CPU/GPU functions) # Return the ranges from A to be sent. It will always return ranges for the dimensions x,y and z even if the A is 1D or 2D (for 2D, the 3rd range is 1:1; for 1D, the 2nd and 3rd range are 1:1). From 0a4dc181130c5d228635b8a105bb948b356b4595 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Thu, 18 Jan 2024 13:25:28 +0100 Subject: [PATCH 14/34] move shared default methods to defaults_shared.jl --- src/ImplicitGlobalGrid.jl | 1 + src/defaults_shared.jl | 13 +++++++++++++ src/update_halo.jl | 15 --------------- 3 files changed, 14 insertions(+), 15 deletions(-) create mode 100644 src/defaults_shared.jl diff --git a/src/ImplicitGlobalGrid.jl b/src/ImplicitGlobalGrid.jl index 3721eed..d460765 100644 --- a/src/ImplicitGlobalGrid.jl +++ b/src/ImplicitGlobalGrid.jl @@ -42,6 +42,7 @@ using .Exceptions include("shared.jl") ## Alphabetical include of defaults for extensions +include("defaults_shared.jl") include(joinpath("AMDGPUExt", "defaults.jl")) include(joinpath("CUDAExt", "defaults.jl")) include(joinpath("LoopVectorizationExt", "memcopy_LV_default.jl")) diff --git a/src/defaults_shared.jl b/src/defaults_shared.jl new file mode 100644 index 0000000..7fe1e81 --- /dev/null +++ b/src/defaults_shared.jl @@ -0,0 +1,13 @@ +# update_halo.jl + +function gpusendbuf end +function gpurecvbuf end +function gpusendbuf_flat end +function gpurecvbuf_flat end + +function write_d2x! end +function read_x2d! end +function write_d2h_async! end +function read_h2d_async! end + +function gpumemcopy! end diff --git a/src/update_halo.jl b/src/update_halo.jl index aad7008..ddcac51 100644 --- a/src/update_halo.jl +++ b/src/update_halo.jl @@ -96,7 +96,6 @@ halosize(dim::Integer, A::GGField) = (dim==1) ? (A.halowidths[1], size(A,2), siz let #TODO: this was: global free_update_halo_buffers, allocate_bufs, sendbuf, recvbuf, sendbuf_flat, recvbuf_flat, gpusendbuf, gpurecvbuf, gpusendbuf_flat, gpurecvbuf_flat, rocsendbuf, rocrecvbuf, rocsendbuf_flat, rocrecvbuf_flat - global gpusendbuf, gpurecvbuf, gpusendbuf_flat, gpurecvbuf_flat # TODO: this is to be removed if the corresponding functions are moved. global free_update_halo_buffers, allocate_bufs, sendbuf, recvbuf, sendbuf_flat, recvbuf_flat sendbufs_raw = nothing recvbufs_raw = nothing @@ -195,12 +194,6 @@ let return reshape(recvbuf_flat(n,dim,i,A), halosize(dim,A)); end - # (GPU defaults) #TODO: see where to move this! Maybe to shared_defaults.jl in src? - function gpusendbuf end - function gpurecvbuf end - function gpusendbuf_flat end - function gpurecvbuf_flat end - # Make sendbufs_raw and recvbufs_raw accessible for unit testing. global get_sendbufs_raw, get_recvbufs_raw get_sendbufs_raw() = deepcopy(sendbufs_raw) @@ -276,14 +269,6 @@ let end -# TODO: see where to move this! Maybe to shared_defaults.jl in src? -function write_d2x! end -function read_x2d! end -function write_d2h_async! end -function read_h2d_async! end - -function gpumemcopy! end - # (CPU/GPU functions) # Return the ranges from A to be sent. It will always return ranges for the dimensions x,y and z even if the A is 1D or 2D (for 2D, the 3rd range is 1:1; for 1D, the 2nd and 3rd range are 1:1). 
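The patches up to this point all apply the same two-part idiom for moving GPU-specific code behind package extensions: the parent module declares method-less function stubs as defaults (`function foo end`), and the extension adds methods to them under their qualified name (`ImplicitGlobalGrid.foo(...) = ...`), forwarding to a local implementation. A minimal, self-contained sketch of the idiom follows; `MainPkg`, `FakeGPUExt` and all other names in it are illustrative stand-ins, not code from ImplicitGlobalGrid:

    # Parent package: generic entry points plus CPU defaults.
    module MainPkg
        is_gpuarray(A) = false      # default: with no extension loaded, nothing is a GPU array
        function gpumemcopy! end    # stub: has no methods until an extension adds one
    end

    # Stand-in for a package extension (in a real package this module would live in ext/
    # and be loaded automatically once the weak dependency is imported).
    module FakeGPUExt
        import ..MainPkg
        struct FakeGPUArray; data::Vector{Float64}; end   # mock device array type
        MainPkg.is_gpuarray(A::FakeGPUArray) = true       # more specific method overrides the default
        MainPkg.gpumemcopy!(dst::FakeGPUArray, src::FakeGPUArray) = copyto!(dst.data, src.data)
    end

    A = FakeGPUExt.FakeGPUArray(zeros(3)); B = FakeGPUExt.FakeGPUArray(ones(3))
    MainPkg.is_gpuarray(A)      # true; the default method would answer false without the extension
    MainPkg.gpumemcopy!(A, B)   # dispatches to the method added by the "extension"

Loading the extension thus changes behavior purely through dispatch, without runtime flags in the parent package; the `args...` forwarding definitions in the patches above are the same idea applied wholesale to the private buffer-management functions.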
From 441aacdfb9924bc81dfc75a6ca3c681b395a5c4b Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Thu, 18 Jan 2024 15:45:57 +0100 Subject: [PATCH 15/34] update handling of gpu support checking if extension is loaded --- src/AMDGPUExt/shared.jl | 6 ++++++ src/CUDAExt/shared.jl | 6 ++++++ src/init_global_grid.jl | 12 +++++++----- src/select_device.jl | 4 +--- src/shared.jl | 26 +++++++++++--------------- src/update_halo.jl | 3 +-- 6 files changed, 32 insertions(+), 25 deletions(-) diff --git a/src/AMDGPUExt/shared.jl b/src/AMDGPUExt/shared.jl index f8d2617..4b30446 100644 --- a/src/AMDGPUExt/shared.jl +++ b/src/AMDGPUExt/shared.jl @@ -10,6 +10,12 @@ using AMDGPU const ROCField{T,N} = GGField{T,N,ROCArray{T,N}} +##------------------------------------ +## HANDLING OF CUDA AND AMDGPU SUPPORT + +ImplicitGlobalGrid.is_loaded(::Val{:ImplicitGlobalGrid_AMDGPUExt}) = (@assert AMDGPU.functional(); return true) + + ##------------- ## SYNTAX SUGAR diff --git a/src/CUDAExt/shared.jl b/src/CUDAExt/shared.jl index 6b56438..d8f7a95 100644 --- a/src/CUDAExt/shared.jl +++ b/src/CUDAExt/shared.jl @@ -10,6 +10,12 @@ using CUDA const CuField{T,N} = GGField{T,N,CuArray{T,N}} +##------------------------------------ +## HANDLING OF CUDA AND AMDGPU SUPPORT + +ImplicitGlobalGrid.is_loaded(::Val{:ImplicitGlobalGrid_CUDAExt}) = (@assert CUDA.functional(true); return true) + + ##------------- ## SYNTAX SUGAR diff --git a/src/init_global_grid.jl b/src/init_global_grid.jl index 62656cc..0e3ed41 100644 --- a/src/init_global_grid.jl +++ b/src/init_global_grid.jl @@ -18,8 +18,8 @@ Initialize a Cartesian grid of MPI processes (and also MPI itself by default) de - `reorder::Integer=1`: the reorder argument to `MPI.Cart_create` in order to create the Cartesian process topology. - `comm::MPI.Comm=MPI.COMM_WORLD`: the input communicator argument to `MPI.Cart_create` in order to create the Cartesian process topology. - `init_MPI::Bool=true`: whether to initialize MPI (`true`) or not (`false`). - - `device_type::String="auto"`: the type of the device to be used if available: `"CUDA"`, `"AMDGPU"`, `"none"` or `"auto"`. Set `device_type="none"` if you want to use only CPUs on a system having also GPUs. If `device_type` is `"auto"` (default), it is automatically determined, depending on which of the modules used for programming the devices (CUDA.jl or AMDGPU.jl) is functional; if both are functional, an error will be given if `device_type` is set as `"auto"`. - - `select_device::Bool=true`: whether to automatically select the device (GPU) (`true`) or not (`false`) if CUDA or AMDGPU is functional and `device_type` not `"none"`. If `true`, it selects the device corresponding to the node-local MPI rank. This method of device selection suits both single and multi-device compute nodes and is recommended in general. It is also the default method of device selection of the *function* [`select_device`](@ref). + - `device_type::String="auto"`: the type of the device to be used if available: `"CUDA"`, `"AMDGPU"`, `"none"` or `"auto"`. Set `device_type="none"` if you want to use only CPUs on a system having also GPUs. If `device_type` is `"auto"` (default), it is automatically determined, depending on which of the modules used for programming the devices (CUDA.jl or AMDGPU.jl) was imported before ImplicitGlobalGrid; if both were imported, an error will be given if `device_type` is set as `"auto"`. 
+ - `select_device::Bool=true`: whether to automatically select the device (GPU) (`true`) or not (`false`) if CUDA or AMDGPU was imported and `device_type` is not `"none"`. If `true`, it selects the device corresponding to the node-local MPI rank. This method of device selection suits both single and multi-device compute nodes and is recommended in general. It is also the default method of device selection of the *function* [`select_device`](@ref). For more information, refer to the documentation of MPI.jl / MPI. # Return values @@ -40,6 +40,8 @@ See also: [`finalize_global_grid`](@ref), [`select_device`](@ref) """ function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0, dimy::Integer=0, dimz::Integer=0, periodx::Integer=0, periody::Integer=0, periodz::Integer=0, overlaps::Tuple{Int,Int,Int}=(2,2,2), halowidths::Tuple{Int,Int,Int}=max.(1,overlaps.÷2), disp::Integer=1, reorder::Integer=1, comm::MPI.Comm=MPI.COMM_WORLD, init_MPI::Bool=true, device_type::String=DEVICE_TYPE_AUTO, select_device::Bool=true, quiet::Bool=false) if grid_is_initialized() error("The global grid has already been initialized.") end + set_cuda_loaded() + set_amdgpu_loaded() nxyz = [nx, ny, nz]; dims = [dimx, dimy, dimz]; periods = [periodx, periody, periodz]; @@ -69,10 +71,10 @@ function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0 if haskey(ENV, "IGG_LOOPVECTORIZATION_DIMZ") loopvectorization[3] = (parse(Int64, ENV["IGG_LOOPVECTORIZATION_DIMZ"]) > 0); end end if !(device_type in [DEVICE_TYPE_NONE, DEVICE_TYPE_AUTO, DEVICE_TYPE_CUDA, DEVICE_TYPE_AMDGPU]) error("Argument `device_type`: invalid value obtained ($device_type). Valid values are: $DEVICE_TYPE_CUDA, $DEVICE_TYPE_AMDGPU, $DEVICE_TYPE_NONE, $DEVICE_TYPE_AUTO") end - if ((device_type == DEVICE_TYPE_AUTO) && cuda_functional() && amdgpu_functional()) error("Automatic detection of the device type to be used not possible: both CUDA and AMDGPU are functional. Set keyword argument `device_type` to $DEVICE_TYPE_CUDA or $DEVICE_TYPE_AMDGPU.") end + if ((device_type == DEVICE_TYPE_AUTO) && cuda_loaded() && amdgpu_loaded()) error("Automatic detection of the device type to be used not possible: both CUDA and AMDGPU extensions are loaded. Set keyword argument `device_type` to $DEVICE_TYPE_CUDA or $DEVICE_TYPE_AMDGPU.") end if (device_type != DEVICE_TYPE_NONE) - if (device_type in [DEVICE_TYPE_CUDA, DEVICE_TYPE_AUTO]) cuda_enabled = cuda_functional() end # NOTE: cuda could be enabled/disabled depending on some additional criteria. - if (device_type in [DEVICE_TYPE_AMDGPU, DEVICE_TYPE_AUTO]) amdgpu_enabled = amdgpu_functional() end # NOTE: amdgpu could be enabled/disabled depending on some additional criteria. + if (device_type in [DEVICE_TYPE_CUDA, DEVICE_TYPE_AUTO]) cuda_enabled = cuda_loaded() end # NOTE: cuda could be enabled/disabled depending on some additional criteria. + if (device_type in [DEVICE_TYPE_AMDGPU, DEVICE_TYPE_AUTO]) amdgpu_enabled = amdgpu_loaded() end # NOTE: amdgpu could be enabled/disabled depending on some additional criteria. 
end if (any(nxyz .< 1)) error("Invalid arguments: nx, ny, and nz cannot be less than 1."); end if (any(dims .< 0)) error("Invalid arguments: dimx, dimy, and dimz cannot be negative."); end diff --git a/src/select_device.jl b/src/select_device.jl index a571c7e..984e672 100644 --- a/src/select_device.jl +++ b/src/select_device.jl @@ -16,10 +16,8 @@ function select_device() if cuda_enabled() || amdgpu_enabled() check_initialized(); if cuda_enabled() - @assert CUDA.functional(true) nb_devices = length(CUDA.devices()) elseif amdgpu_enabled() - @assert AMDGPU.functional() nb_devices = length(AMDGPU.devices()) end comm_l = MPI.Comm_split_type(comm(), MPI.COMM_TYPE_SHARED, me()) @@ -31,7 +29,7 @@ function select_device() end return device_id else - error("Cannot select a device because neither CUDA nor AMDGPU is enabled (possibly detected non functional when the ImplicitGlobalGrid module was loaded).") + error("Cannot select a device because neither CUDA nor AMDGPU is enabled (meaning that the corresponding module was not imported before ImplicitGlobalGrid).") end end diff --git a/src/shared.jl b/src/shared.jl index 27104c8..f75a135 100644 --- a/src/shared.jl +++ b/src/shared.jl @@ -1,24 +1,20 @@ import MPI -using CUDA #TODO: to be removed! -using AMDGPU #TODO: to be removed! using Base.Threads -##------------------------- +##------------------------------------ ## HANDLING OF CUDA AND AMDGPU SUPPORT -let - global cuda_functional, amdgpu_functional, set_cuda_functional, set_amdgpu_functional - _cuda_functional::Bool = false - _amdgpu_functional::Bool = false - cuda_functional()::Bool = _cuda_functional - amdgpu_functional()::Bool = _amdgpu_functional - set_cuda_functional(val::Bool) = (_cuda_functional = val;) - set_amdgpu_functional(val::Bool) = (_amdgpu_functional = val;) -end -function __init__() - set_cuda_functional(CUDA.functional()) - set_amdgpu_functional(AMDGPU.functional()) +is_loaded(arg) = false #TODO: this would not work as it should be the caller module...: (Base.get_extension(@__MODULE__, ext) !== nothing) + +let + global cuda_loaded, amdgpu_loaded, set_cuda_loaded, set_amdgpu_loaded + _cuda_loaded::Bool = false + _amdgpu_loaded::Bool = false + cuda_loaded()::Bool = _cuda_loaded + amdgpu_loaded()::Bool = _amdgpu_loaded + set_cuda_loaded() = (_cuda_loaded = is_loaded(Val(:ImplicitGlobalGrid_CUDAExt))) + set_amdgpu_loaded() = (_amdgpu_loaded = is_loaded(Val(:ImplicitGlobalGrid_AMDGPUExt))) end diff --git a/src/update_halo.jl b/src/update_halo.jl index ddcac51..9b1c48b 100644 --- a/src/update_halo.jl +++ b/src/update_halo.jl @@ -35,8 +35,7 @@ function update_halo!(A::Union{GGArray, GGField, GGFieldConvertible}...) end function _update_halo!(fields::GGField...) - if (any_cuarray(fields...) && !cuda_enabled()) error("CUDA is not enabled (possibly detected non functional when the ImplicitGlobalGrid module was loaded)."); end #NOTE: in the following, it is only required to check for `cuda_enabled()` when the context does not imply `any_cuarray(fields...)` or `is_cuarray(A)`. - if (any_rocarray(fields...) && !amdgpu_enabled()) error("AMDGPU is not enabled (possibly detected non functional when the ImplicitGlobalGrid module was loaded)."); end #NOTE: in the following, it is only required to check for `amdgpu_enabled()` when the context does not imply `any_rocarray(fields...)` or `is_rocarray(A)`. 
+    if (!cuda_enabled() && !amdgpu_enabled() && !all_arrays(fields...)) error("not all arrays are CPU arrays, but no GPU extension is loaded.") end #NOTE: in the following, it is only required to check for `cuda_enabled()`/`amdgpu_enabled()` when the context does not imply `any_cuarray(fields...)` or `is_cuarray(A)` or the corresponding for AMDGPU. # NOTE: the case where only one of the two extensions is loaded, but an array that would require the other extension is passed, is very unlikely and therefore not explicitly checked here (but could be added later).
     allocate_bufs(fields...);
     if any_array(fields...) allocate_tasks(fields...); end
     if any_cuarray(fields...) allocate_custreams(fields...); end

From ead45ca2e4f266036f53a9502ab23d800deefaf1 Mon Sep 17 00:00:00 2001
From: Samuel Omlin
Date: Thu, 18 Jan 2024 15:47:14 +0100
Subject: [PATCH 16/34] move module imports triggering extensions before ImplicitGlobalGrid

---
 test/test_finalize_global_grid.jl | 2 +-
 test/test_gather.jl | 2 +-
 test/test_init_global_grid.jl | 2 +-
 test/test_select_device.jl | 5 ++---
 test/test_tools.jl | 2 +-
 test/test_update_halo.jl | 6 ++----
 6 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/test/test_finalize_global_grid.jl b/test/test_finalize_global_grid.jl
index dac6678..d80bfce 100644
--- a/test/test_finalize_global_grid.jl
+++ b/test/test_finalize_global_grid.jl
@@ -1,7 +1,7 @@
 push!(LOAD_PATH, "../src")
 using Test
+import MPI, CUDA, AMDGPU
 using ImplicitGlobalGrid; GG = ImplicitGlobalGrid
-import MPI
 import ImplicitGlobalGrid: @require
 
diff --git a/test/test_gather.jl b/test/test_gather.jl
index 42cc4af..9debc3a 100644
--- a/test/test_gather.jl
+++ b/test/test_gather.jl
@@ -1,7 +1,7 @@
 push!(LOAD_PATH, "../src")
 using Test
+import MPI, CUDA, AMDGPU
 using ImplicitGlobalGrid; GG = ImplicitGlobalGrid
-import MPI
 import ImplicitGlobalGrid: @require
 
diff --git a/test/test_init_global_grid.jl b/test/test_init_global_grid.jl
index 9076a94..19c1d02 100644
--- a/test/test_init_global_grid.jl
+++ b/test/test_init_global_grid.jl
@@ -1,7 +1,7 @@
 push!(LOAD_PATH, "../src")
 using Test
+import MPI, CUDA, AMDGPU
 using ImplicitGlobalGrid; GG = ImplicitGlobalGrid
-import MPI
 import ImplicitGlobalGrid: @require
 
diff --git a/test/test_select_device.jl b/test/test_select_device.jl
index 4a5b37a..79c2c7a 100644
--- a/test/test_select_device.jl
+++ b/test/test_select_device.jl
@@ -1,10 +1,9 @@
 # NOTE: All tests of this file can be run with any number of processes.
push!(LOAD_PATH, "../src") using Test -using ImplicitGlobalGrid; GG = ImplicitGlobalGrid import MPI -using CUDA -using AMDGPU +using CUDA, AMDGPU +using ImplicitGlobalGrid; GG = ImplicitGlobalGrid import ImplicitGlobalGrid: @require test_cuda = CUDA.functional() diff --git a/test/test_tools.jl b/test/test_tools.jl index fdcf432..d2785d2 100644 --- a/test/test_tools.jl +++ b/test/test_tools.jl @@ -1,7 +1,7 @@ push!(LOAD_PATH, "../src") using Test +import MPI, CUDA, AMDGPU using ImplicitGlobalGrid; GG = ImplicitGlobalGrid -import MPI import ImplicitGlobalGrid: @require macro coords(i) :(GG.global_grid().coords[$i]) end diff --git a/test/test_update_halo.jl b/test/test_update_halo.jl index fb57038..1bff613 100644 --- a/test/test_update_halo.jl +++ b/test/test_update_halo.jl @@ -4,11 +4,9 @@ push!(LOAD_PATH, "../src") using Test -import LoopVectorization +import MPI, LoopVectorization +using CUDA, AMDGPU using ImplicitGlobalGrid; GG = ImplicitGlobalGrid -import MPI -using CUDA -using AMDGPU import ImplicitGlobalGrid: @require, longnameof test_cuda = CUDA.functional() From f9832aa69ee1edc425dc1540654c7082dd65477f Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Thu, 18 Jan 2024 16:05:32 +0100 Subject: [PATCH 17/34] fix error check methods --- src/AMDGPUExt/shared.jl | 2 +- src/CUDAExt/shared.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/AMDGPUExt/shared.jl b/src/AMDGPUExt/shared.jl index 4b30446..24c7b1c 100644 --- a/src/AMDGPUExt/shared.jl +++ b/src/AMDGPUExt/shared.jl @@ -19,7 +19,7 @@ ImplicitGlobalGrid.is_loaded(::Val{:ImplicitGlobalGrid_AMDGPUExt}) = (@assert AM ##------------- ## SYNTAX SUGAR -ImplicitGlobalGrid.is_rocarray(A::GGArray) = typeof(A) <: ROCArray #NOTE: this function is only to be used when multiple dispatch on the type of the array seems an overkill (in particular when only something needs to be done for the GPU case, but nothing for the CPU case) and as long as performance does not suffer. +ImplicitGlobalGrid.is_rocarray(A::ROCArray) = true #NOTE: this function is only to be used when multiple dispatch on the type of the array seems an overkill (in particular when only something needs to be done for the GPU case, but nothing for the CPU case) and as long as performance does not suffer. ##-------------------------------------------------------------------------------- diff --git a/src/CUDAExt/shared.jl b/src/CUDAExt/shared.jl index d8f7a95..ed97b93 100644 --- a/src/CUDAExt/shared.jl +++ b/src/CUDAExt/shared.jl @@ -19,7 +19,7 @@ ImplicitGlobalGrid.is_loaded(::Val{:ImplicitGlobalGrid_CUDAExt}) = (@assert CUDA ##------------- ## SYNTAX SUGAR -ImplicitGlobalGrid.is_cuarray(A::GGArray) = typeof(A) <: CuArray #NOTE: this function is only to be used when multiple dispatch on the type of the array seems an overkill (in particular when only something needs to be done for the GPU case, but nothing for the CPU case) and as long as performance does not suffer. +ImplicitGlobalGrid.is_cuarray(A::CuArray) = true #NOTE: this function is only to be used when multiple dispatch on the type of the array seems an overkill (in particular when only something needs to be done for the GPU case, but nothing for the CPU case) and as long as performance does not suffer. 
##-------------------------------------------------------------------------------- From 48f6737f589ced931063fbf960eda73d1d9a2290 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Thu, 18 Jan 2024 16:25:47 +0100 Subject: [PATCH 18/34] fix gpusendbuf methods --- src/AMDGPUExt/update_halo.jl | 8 ++++---- src/CUDAExt/update_halo.jl | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/AMDGPUExt/update_halo.jl b/src/AMDGPUExt/update_halo.jl index 0cc6b75..2b26056 100644 --- a/src/AMDGPUExt/update_halo.jl +++ b/src/AMDGPUExt/update_halo.jl @@ -11,10 +11,10 @@ ImplicitGlobalGrid.reallocate_undersized_rocbufs(args...) = reallocate_undersize ImplicitGlobalGrid.reregister_rocbufs(args...) = reregister_rocbufs(args...) ImplicitGlobalGrid.get_rocsendbufs_raw(args...) = get_rocsendbufs_raw(args...) ImplicitGlobalGrid.get_rocrecvbufs_raw(args...) = get_rocrecvbufs_raw(args...) -ImplicitGlobalGrid.gpusendbuf(args..., A::ROCField) = gpusendbuf(args..., A) -ImplicitGlobalGrid.gpurecvbuf(args..., A::ROCField) = gpurecvbuf(args..., A) -ImplicitGlobalGrid.gpusendbuf_flat(args..., A::ROCField) = gpusendbuf_flat(args..., A) -ImplicitGlobalGrid.gpurecvbuf_flat(args..., A::ROCField) = gpurecvbuf_flat(args..., A) +ImplicitGlobalGrid.gpusendbuf(n::Integer, dim::Integer, i::Integer, A::ROCField{T}) where {T <: GGNumber} = gpusendbuf(n,dim,i,A) +ImplicitGlobalGrid.gpurecvbuf(n::Integer, dim::Integer, i::Integer, A::ROCField{T}) where {T <: GGNumber} = gpurecvbuf(n,dim,i,A) +ImplicitGlobalGrid.gpusendbuf_flat(n::Integer, dim::Integer, i::Integer, A::ROCField{T}) where {T <: GGNumber} = gpusendbuf_flat(n,dim,i,A) +ImplicitGlobalGrid.gpurecvbuf_flat(n::Integer, dim::Integer, i::Integer, A::ROCField{T}) where {T <: GGNumber} = gpurecvbuf_flat(n,dim,i,A) let global free_update_halo_rocbuffers, init_rocbufs_arrays, init_rocbufs, reinterpret_rocbufs, reregister_rocbufs, reallocate_undersized_rocbufs diff --git a/src/CUDAExt/update_halo.jl b/src/CUDAExt/update_halo.jl index ea5e999..5469aba 100644 --- a/src/CUDAExt/update_halo.jl +++ b/src/CUDAExt/update_halo.jl @@ -11,10 +11,10 @@ ImplicitGlobalGrid.reallocate_undersized_cubufs(args...) = reallocate_undersized ImplicitGlobalGrid.reregister_cubufs(args...) = reregister_cubufs(args...) ImplicitGlobalGrid.get_cusendbufs_raw(args...) = get_cusendbufs_raw(args...) ImplicitGlobalGrid.get_curecvbufs_raw(args...) = get_curecvbufs_raw(args...) 
-ImplicitGlobalGrid.gpusendbuf(args..., A::CuField)= gpusendbuf(args..., A) -ImplicitGlobalGrid.gpurecvbuf(args..., A::CuField)= gpurecvbuf(args..., A) -ImplicitGlobalGrid.gpusendbuf_flat(args..., A::CuField)= gpusendbuf_flat(args..., A) -ImplicitGlobalGrid.gpurecvbuf_flat(args..., A::CuField)= gpurecvbuf_flat(args..., A) +ImplicitGlobalGrid.gpusendbuf(n::Integer, dim::Integer, i::Integer, A::CuField{T}) where {T <: GGNumber} = gpusendbuf(n,dim,i,A) +ImplicitGlobalGrid.gpurecvbuf(n::Integer, dim::Integer, i::Integer, A::CuField{T}) where {T <: GGNumber} = gpurecvbuf(n,dim,i,A) +ImplicitGlobalGrid.gpusendbuf_flat(n::Integer, dim::Integer, i::Integer, A::CuField{T}) where {T <: GGNumber} = gpusendbuf_flat(n,dim,i,A) +ImplicitGlobalGrid.gpurecvbuf_flat(n::Integer, dim::Integer, i::Integer, A::CuField{T}) where {T <: GGNumber} = gpurecvbuf_flat(n,dim,i,A) let global free_update_halo_cubuffers, init_cubufs_arrays, init_cubufs, reinterpret_cubufs, reregister_cubufs, reallocate_undersized_cubufs From 87a9a40a2cb2f6062fdc13168668018263a53eaa Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Thu, 18 Jan 2024 17:30:45 +0100 Subject: [PATCH 19/34] move functional check to select device enabling cpu-only without functional and unit tests with one functional --- src/AMDGPUExt/shared.jl | 2 +- src/CUDAExt/shared.jl | 2 +- src/select_device.jl | 3 +++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/AMDGPUExt/shared.jl b/src/AMDGPUExt/shared.jl index 24c7b1c..50f7d3c 100644 --- a/src/AMDGPUExt/shared.jl +++ b/src/AMDGPUExt/shared.jl @@ -13,7 +13,7 @@ const ROCField{T,N} = GGField{T,N,ROCArray{T,N}} ##------------------------------------ ## HANDLING OF CUDA AND AMDGPU SUPPORT -ImplicitGlobalGrid.is_loaded(::Val{:ImplicitGlobalGrid_AMDGPUExt}) = (@assert AMDGPU.functional(); return true) +ImplicitGlobalGrid.is_loaded(::Val{:ImplicitGlobalGrid_AMDGPUExt}) = true ##------------- diff --git a/src/CUDAExt/shared.jl b/src/CUDAExt/shared.jl index ed97b93..3f6930a 100644 --- a/src/CUDAExt/shared.jl +++ b/src/CUDAExt/shared.jl @@ -13,7 +13,7 @@ const CuField{T,N} = GGField{T,N,CuArray{T,N}} ##------------------------------------ ## HANDLING OF CUDA AND AMDGPU SUPPORT -ImplicitGlobalGrid.is_loaded(::Val{:ImplicitGlobalGrid_CUDAExt}) = (@assert CUDA.functional(true); return true) +ImplicitGlobalGrid.is_loaded(::Val{:ImplicitGlobalGrid_CUDAExt}) = true ##------------- diff --git a/src/select_device.jl b/src/select_device.jl index 984e672..f97817d 100644 --- a/src/select_device.jl +++ b/src/select_device.jl @@ -13,11 +13,14 @@ Select the device (GPU) corresponding to the node-local MPI rank and return its See also: [`init_global_grid`](@ref) """ function select_device() + if (cuda_enabled() && amdgpu_enabled()) error("Cannot select a device because both CUDA and AMDGPU are enabled (meaning that both modules were imported before ImplicitGlobalGrid).") if cuda_enabled() || amdgpu_enabled() check_initialized(); if cuda_enabled() + @assert CUDA.functional(true) nb_devices = length(CUDA.devices()) elseif amdgpu_enabled() + @assert AMDGPU.functional() nb_devices = length(AMDGPU.devices()) end comm_l = MPI.Comm_split_type(comm(), MPI.COMM_TYPE_SHARED, me()) From 180f060c5d93fc42c37d77f33a6991a21a9c3470 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Thu, 18 Jan 2024 17:32:59 +0100 Subject: [PATCH 20/34] move functional check to select device enabling cpu-only without functional and unit tests with one functional --- src/select_device.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/select_device.jl b/src/select_device.jl index f97817d..fc3d5c0 100644 --- a/src/select_device.jl +++ b/src/select_device.jl @@ -13,7 +13,7 @@ Select the device (GPU) corresponding to the node-local MPI rank and return its See also: [`init_global_grid`](@ref) """ function select_device() - if (cuda_enabled() && amdgpu_enabled()) error("Cannot select a device because both CUDA and AMDGPU are enabled (meaning that both modules were imported before ImplicitGlobalGrid).") + if (cuda_enabled() && amdgpu_enabled()) error("Cannot select a device because both CUDA and AMDGPU are enabled (meaning that both modules were imported before ImplicitGlobalGrid).") end if cuda_enabled() || amdgpu_enabled() check_initialized(); if cuda_enabled() From 4d9ebea32dab4f6199d5b0b1a4529ac11edcaa3f Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Thu, 18 Jan 2024 19:19:18 +0100 Subject: [PATCH 21/34] introduce functional check and make select_device extension compatible --- ext/ImplicitGlobalGrid_AMDGPUExt.jl | 1 + ext/ImplicitGlobalGrid_CUDAExt.jl | 1 + src/AMDGPUExt/defaults.jl | 6 ++++++ src/AMDGPUExt/select_device.jl | 2 ++ src/AMDGPUExt/shared.jl | 1 + src/CUDAExt/defaults.jl | 6 ++++++ src/CUDAExt/select_device.jl | 2 ++ src/CUDAExt/shared.jl | 1 + src/defaults_shared.jl | 6 ++++++ src/init_global_grid.jl | 8 +++++--- src/select_device.jl | 14 +++++++------- src/shared.jl | 22 +++++++++++++--------- 12 files changed, 51 insertions(+), 19 deletions(-) create mode 100644 src/AMDGPUExt/select_device.jl create mode 100644 src/CUDAExt/select_device.jl diff --git a/ext/ImplicitGlobalGrid_AMDGPUExt.jl b/ext/ImplicitGlobalGrid_AMDGPUExt.jl index dbc5bf3..5ac806f 100644 --- a/ext/ImplicitGlobalGrid_AMDGPUExt.jl +++ b/ext/ImplicitGlobalGrid_AMDGPUExt.jl @@ -1,4 +1,5 @@ module ImplicitGlobalGrid_AMDGPUExt include(joinpath(@__DIR__, "..", "src", "AMDGPUExt", "shared.jl")) + include(joinpath(@__DIR__, "..", "src", "AMDGPUExt", "select_device.jl")) include(joinpath(@__DIR__, "..", "src", "AMDGPUExt", "update_halo.jl")) end \ No newline at end of file diff --git a/ext/ImplicitGlobalGrid_CUDAExt.jl b/ext/ImplicitGlobalGrid_CUDAExt.jl index 381fd59..58775fd 100644 --- a/ext/ImplicitGlobalGrid_CUDAExt.jl +++ b/ext/ImplicitGlobalGrid_CUDAExt.jl @@ -1,4 +1,5 @@ module ImplicitGlobalGrid_CUDAExt include(joinpath(@__DIR__, "..", "src", "CUDAExt", "shared.jl")) + include(joinpath(@__DIR__, "..", "src", "CUDAExt", "select_device.jl")) include(joinpath(@__DIR__, "..", "src", "CUDAExt", "update_halo.jl")) end \ No newline at end of file diff --git a/src/AMDGPUExt/defaults.jl b/src/AMDGPUExt/defaults.jl index 50a1523..9fec08b 100644 --- a/src/AMDGPUExt/defaults.jl +++ b/src/AMDGPUExt/defaults.jl @@ -3,6 +3,12 @@ is_rocarray(A::GGArray) = false +# select_device.jl + +function nb_rocdevices end +function rocdevice! 
From 4d9ebea32dab4f6199d5b0b1a4529ac11edcaa3f Mon Sep 17 00:00:00 2001
From: Samuel Omlin
Date: Thu, 18 Jan 2024 19:19:18 +0100
Subject: [PATCH 21/34] introduce functional check and make select_device
 extension compatible

---
 ext/ImplicitGlobalGrid_AMDGPUExt.jl |  1 +
 ext/ImplicitGlobalGrid_CUDAExt.jl   |  1 +
 src/AMDGPUExt/defaults.jl           |  6 ++++++
 src/AMDGPUExt/select_device.jl      |  2 ++
 src/AMDGPUExt/shared.jl             |  1 +
 src/CUDAExt/defaults.jl             |  6 ++++++
 src/CUDAExt/select_device.jl        |  2 ++
 src/CUDAExt/shared.jl               |  1 +
 src/defaults_shared.jl              |  6 ++++++
 src/init_global_grid.jl             |  8 +++++---
 src/select_device.jl                | 14 +++++++-------
 src/shared.jl                       | 22 +++++++++++++---------
 12 files changed, 51 insertions(+), 19 deletions(-)
 create mode 100644 src/AMDGPUExt/select_device.jl
 create mode 100644 src/CUDAExt/select_device.jl

diff --git a/ext/ImplicitGlobalGrid_AMDGPUExt.jl b/ext/ImplicitGlobalGrid_AMDGPUExt.jl
index dbc5bf3..5ac806f 100644
--- a/ext/ImplicitGlobalGrid_AMDGPUExt.jl
+++ b/ext/ImplicitGlobalGrid_AMDGPUExt.jl
@@ -1,4 +1,5 @@
 module ImplicitGlobalGrid_AMDGPUExt
     include(joinpath(@__DIR__, "..", "src", "AMDGPUExt", "shared.jl"))
+    include(joinpath(@__DIR__, "..", "src", "AMDGPUExt", "select_device.jl"))
     include(joinpath(@__DIR__, "..", "src", "AMDGPUExt", "update_halo.jl"))
 end
\ No newline at end of file
diff --git a/ext/ImplicitGlobalGrid_CUDAExt.jl b/ext/ImplicitGlobalGrid_CUDAExt.jl
index 381fd59..58775fd 100644
--- a/ext/ImplicitGlobalGrid_CUDAExt.jl
+++ b/ext/ImplicitGlobalGrid_CUDAExt.jl
@@ -1,4 +1,5 @@
 module ImplicitGlobalGrid_CUDAExt
     include(joinpath(@__DIR__, "..", "src", "CUDAExt", "shared.jl"))
+    include(joinpath(@__DIR__, "..", "src", "CUDAExt", "select_device.jl"))
     include(joinpath(@__DIR__, "..", "src", "CUDAExt", "update_halo.jl"))
 end
\ No newline at end of file
diff --git a/src/AMDGPUExt/defaults.jl b/src/AMDGPUExt/defaults.jl
index 50a1523..9fec08b 100644
--- a/src/AMDGPUExt/defaults.jl
+++ b/src/AMDGPUExt/defaults.jl
@@ -3,6 +3,12 @@
 is_rocarray(A::GGArray) = false
 
 
+# select_device.jl
+
+function nb_rocdevices end
+function rocdevice! end
+
+
 # update_halo.jl
 
 function free_update_halo_rocbuffers end
diff --git a/src/AMDGPUExt/select_device.jl b/src/AMDGPUExt/select_device.jl
new file mode 100644
index 0000000..cb8cce3
--- /dev/null
+++ b/src/AMDGPUExt/select_device.jl
@@ -0,0 +1,2 @@
+ImplicitGlobalGrid.nb_rocdevices() = length(AMDGPU.devices())
+ImplicitGlobalGrid.rocdevice!(device_id) = AMDGPU.device_id!(device_id)
\ No newline at end of file
diff --git a/src/AMDGPUExt/shared.jl b/src/AMDGPUExt/shared.jl
index 50f7d3c..402cdc2 100644
--- a/src/AMDGPUExt/shared.jl
+++ b/src/AMDGPUExt/shared.jl
@@ -14,6 +14,7 @@ const ROCField{T,N} = GGField{T,N,ROCArray{T,N}}
 ## HANDLING OF CUDA AND AMDGPU SUPPORT
 
 ImplicitGlobalGrid.is_loaded(::Val{:ImplicitGlobalGrid_AMDGPUExt}) = true
+ImplicitGlobalGrid.is_functional(::Val{:AMDGPU}) = AMDGPU.functional()
 
 
 ##-------------
diff --git a/src/CUDAExt/defaults.jl b/src/CUDAExt/defaults.jl
index 5389086..187f4c5 100644
--- a/src/CUDAExt/defaults.jl
+++ b/src/CUDAExt/defaults.jl
@@ -3,6 +3,12 @@
 is_cuarray(A::GGArray) = false
 
 
+# select_device.jl
+
+function nb_cudevices end
+function cudevice! end
+
+
 # update_halo.jl
 
 function free_update_halo_cubuffers end
diff --git a/src/CUDAExt/select_device.jl b/src/CUDAExt/select_device.jl
new file mode 100644
index 0000000..bcffa29
--- /dev/null
+++ b/src/CUDAExt/select_device.jl
@@ -0,0 +1,2 @@
+ImplicitGlobalGrid.nb_cudevices() = length(CUDA.devices())
+ImplicitGlobalGrid.cudevice!(device_id) = CUDA.device!(device_id)
\ No newline at end of file
diff --git a/src/CUDAExt/shared.jl b/src/CUDAExt/shared.jl
index 3f6930a..93fc6dc 100644
--- a/src/CUDAExt/shared.jl
+++ b/src/CUDAExt/shared.jl
@@ -14,6 +14,7 @@ const CuField{T,N} = GGField{T,N,CuArray{T,N}}
 ## HANDLING OF CUDA AND AMDGPU SUPPORT
 
 ImplicitGlobalGrid.is_loaded(::Val{:ImplicitGlobalGrid_CUDAExt}) = true
+ImplicitGlobalGrid.is_functional(::Val{:CUDA}) = CUDA.functional(true)
 
 
 ##-------------
diff --git a/src/defaults_shared.jl b/src/defaults_shared.jl
index 7fe1e81..334dae9 100644
--- a/src/defaults_shared.jl
+++ b/src/defaults_shared.jl
@@ -1,3 +1,9 @@
+# shared.jl
+
+is_loaded(arg) = false #TODO: this would not work as it should be the caller module...: (Base.get_extension(@__MODULE__, ext) !== nothing)
+is_functional(arg) = false
+
+
 # update_halo.jl
 
 function gpusendbuf end
diff --git a/src/init_global_grid.jl b/src/init_global_grid.jl
index 0e3ed41..4f8ba63 100644
--- a/src/init_global_grid.jl
+++ b/src/init_global_grid.jl
@@ -41,7 +41,9 @@ See also: [`finalize_global_grid`](@ref), [`select_device`](@ref)
 function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0, dimy::Integer=0, dimz::Integer=0, periodx::Integer=0, periody::Integer=0, periodz::Integer=0, overlaps::Tuple{Int,Int,Int}=(2,2,2), halowidths::Tuple{Int,Int,Int}=max.(1,overlaps.÷2), disp::Integer=1, reorder::Integer=1, comm::MPI.Comm=MPI.COMM_WORLD, init_MPI::Bool=true, device_type::String=DEVICE_TYPE_AUTO, select_device::Bool=true, quiet::Bool=false)
     if grid_is_initialized() error("The global grid has already been initialized.") end
     set_cuda_loaded()
+    set_cuda_functional()
     set_amdgpu_loaded()
+    set_amdgpu_functional()
     nxyz    = [nx, ny, nz];
     dims    = [dimx, dimy, dimz];
     periods = [periodx, periody, periodz];
@@ -71,10 +73,10 @@ function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0
         if haskey(ENV, "IGG_LOOPVECTORIZATION_DIMZ") loopvectorization[3] = (parse(Int64, ENV["IGG_LOOPVECTORIZATION_DIMZ"]) > 0); end
     end
     if !(device_type in [DEVICE_TYPE_NONE, DEVICE_TYPE_AUTO, DEVICE_TYPE_CUDA, DEVICE_TYPE_AMDGPU]) error("Argument `device_type`: invalid value obtained ($device_type). Valid values are: $DEVICE_TYPE_CUDA, $DEVICE_TYPE_AMDGPU, $DEVICE_TYPE_NONE, $DEVICE_TYPE_AUTO") end
-    if ((device_type == DEVICE_TYPE_AUTO) && cuda_loaded() && amdgpu_loaded()) error("Automatic detection of the device type to be used not possible: both CUDA and AMDGPU extensions are loaded. Set keyword argument `device_type` to $DEVICE_TYPE_CUDA or $DEVICE_TYPE_AMDGPU.") end
+    if ((device_type == DEVICE_TYPE_AUTO) && cuda_loaded() && cuda_functional() && amdgpu_loaded() && amdgpu_functional()) error("Automatic detection of the device type to be used not possible: both CUDA and AMDGPU extensions are loaded and functional. Set keyword argument `device_type` to $DEVICE_TYPE_CUDA or $DEVICE_TYPE_AMDGPU.") end
     if (device_type != DEVICE_TYPE_NONE)
-        if (device_type in [DEVICE_TYPE_CUDA, DEVICE_TYPE_AUTO]) cuda_enabled = cuda_loaded() end # NOTE: cuda could be enabled/disabled depending on some additional criteria.
-        if (device_type in [DEVICE_TYPE_AMDGPU, DEVICE_TYPE_AUTO]) amdgpu_enabled = amdgpu_loaded() end # NOTE: amdgpu could be enabled/disabled depending on some additional criteria.
+        if (device_type in [DEVICE_TYPE_CUDA, DEVICE_TYPE_AUTO]) cuda_enabled = cuda_loaded() && cuda_functional() end # NOTE: cuda could be enabled/disabled depending on some additional criteria.
+        if (device_type in [DEVICE_TYPE_AMDGPU, DEVICE_TYPE_AUTO]) amdgpu_enabled = amdgpu_loaded() && amdgpu_functional() end # NOTE: amdgpu could be enabled/disabled depending on some additional criteria.
     end
     if (any(nxyz .< 1)) error("Invalid arguments: nx, ny, and nz cannot be less than 1."); end
     if (any(dims .< 0)) error("Invalid arguments: dimx, dimy, and dimz cannot be negative."); end
diff --git a/src/select_device.jl b/src/select_device.jl
index fc3d5c0..5df62cf 100644
--- a/src/select_device.jl
+++ b/src/select_device.jl
@@ -13,22 +13,22 @@ Select the device (GPU) corresponding to the node-local MPI rank and return its
 See also: [`init_global_grid`](@ref)
 """
 function select_device()
+    check_initialized()
     if (cuda_enabled() && amdgpu_enabled()) error("Cannot select a device because both CUDA and AMDGPU are enabled (meaning that both modules were imported before ImplicitGlobalGrid).") end
     if cuda_enabled() || amdgpu_enabled()
-        check_initialized();
         if cuda_enabled()
-            @assert CUDA.functional(true)
-            nb_devices = length(CUDA.devices())
+            @assert cuda_functional()
+            nb_devices = nb_cudevices()
         elseif amdgpu_enabled()
-            @assert AMDGPU.functional()
-            nb_devices = length(AMDGPU.devices())
+            @assert amdgpu_functional()
+            nb_devices = nb_rocdevices()
         end
         comm_l = MPI.Comm_split_type(comm(), MPI.COMM_TYPE_SHARED, me())
         if (MPI.Comm_size(comm_l) > nb_devices) error("More processes have been launched per node than there are GPUs available."); end
         me_l      = MPI.Comm_rank(comm_l)
         device_id = amdgpu_enabled() ? me_l+1 : me_l
-        if     cuda_enabled()   CUDA.device!(device_id)
-        elseif amdgpu_enabled() AMDGPU.device_id!(device_id)
+        if     cuda_enabled()   cudevice!(device_id)
+        elseif amdgpu_enabled() rocdevice!(device_id)
         end
         return device_id
     else
diff --git a/src/shared.jl b/src/shared.jl
index f75a135..1e6a4a9 100644
--- a/src/shared.jl
+++ b/src/shared.jl
@@ -5,16 +5,20 @@ using Base.Threads
 
 ##------------------------------------
 ## HANDLING OF CUDA AND AMDGPU SUPPORT
 
-is_loaded(arg) = false #TODO: this would not work as it should be the caller module...: (Base.get_extension(@__MODULE__, ext) !== nothing)
-
 let
-    global cuda_loaded, amdgpu_loaded, set_cuda_loaded, set_amdgpu_loaded
-    _cuda_loaded::Bool   = false
-    _amdgpu_loaded::Bool = false
-    cuda_loaded()::Bool   = _cuda_loaded
-    amdgpu_loaded()::Bool = _amdgpu_loaded
-    set_cuda_loaded()   = (_cuda_loaded = is_loaded(Val(:ImplicitGlobalGrid_CUDAExt)))
-    set_amdgpu_loaded() = (_amdgpu_loaded = is_loaded(Val(:ImplicitGlobalGrid_AMDGPUExt)))
+    global cuda_loaded, cuda_functional, amdgpu_loaded, amdgpu_functional, set_cuda_loaded, set_cuda_functional, set_amdgpu_loaded, set_amdgpu_functional
+    _cuda_loaded::Bool       = false
+    _cuda_functional::Bool   = false
+    _amdgpu_loaded::Bool     = false
+    _amdgpu_functional::Bool = false
+    cuda_loaded()::Bool       = _cuda_loaded
+    cuda_functional()::Bool   = _cuda_functional
+    amdgpu_loaded()::Bool     = _amdgpu_loaded
+    amdgpu_functional()::Bool = _amdgpu_functional
+    set_cuda_loaded()       = (_cuda_loaded = is_loaded(Val(:ImplicitGlobalGrid_CUDAExt)))
+    set_cuda_functional()   = (_cuda_functional = is_functional(Val(:CUDA)))
+    set_amdgpu_loaded()     = (_amdgpu_loaded = is_loaded(Val(:ImplicitGlobalGrid_AMDGPUExt)))
+    set_amdgpu_functional() = (_amdgpu_functional = is_functional(Val(:AMDGPU)))
 end
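The `function nb_rocdevices end` and `function cudevice! end` lines above declare zero-method function stubs in the parent package, so that `select_device` can call them unconditionally; the extension then attaches the real methods (`ImplicitGlobalGrid.nb_cudevices() = length(CUDA.devices())`, etc.) once the GPU package is imported. A self-contained sketch of this stub-plus-extension pattern (illustrative names):

```julia
module Parent
function nb_devices end                              # zero-method stub
device_report() = "found $(nb_devices()) device(s)"  # parent code may already reference it
end

# A package extension would add this method when the GPU package is loaded;
# here we attach it by hand to demonstrate:
Parent.nb_devices() = 4

@assert Parent.device_report() == "found 4 device(s)"
```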
From 25f127be390dc6e2f8de65cd3086e85b2d31a77c Mon Sep 17 00:00:00 2001
From: Samuel Omlin
Date: Thu, 18 Jan 2024 19:44:19 +0100
Subject: [PATCH 22/34] fix reallocation

---
 src/AMDGPUExt/update_halo.jl | 2 +-
 src/CUDAExt/update_halo.jl   | 2 +-
 src/update_halo.jl           | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/AMDGPUExt/update_halo.jl b/src/AMDGPUExt/update_halo.jl
index 2b26056..267d421 100644
--- a/src/AMDGPUExt/update_halo.jl
+++ b/src/AMDGPUExt/update_halo.jl
@@ -73,7 +73,7 @@ let
     function reallocate_undersized_rocbufs(T::DataType, i::Integer, max_halo_elems::Integer)
         if (!isnothing(rocsendbufs_raw) && length(rocsendbufs_raw[i][1]) < max_halo_elems)
             for n = 1:NNEIGHBORS_PER_DIM
-                if (is_rocarray(A) && any(amdgpuaware_MPI())) reallocate_rocbufs(T, i, n, max_halo_elems); GC.gc(); end # Too small buffers had been replaced with larger ones; free the unused memory immediately.
+                reallocate_rocbufs(T, i, n, max_halo_elems); GC.gc(); # Too small buffers had been replaced with larger ones; free the unused memory immediately.
             end
         end
     end
diff --git a/src/CUDAExt/update_halo.jl b/src/CUDAExt/update_halo.jl
index 5469aba..8b6d957 100644
--- a/src/CUDAExt/update_halo.jl
+++ b/src/CUDAExt/update_halo.jl
@@ -84,7 +84,7 @@ let
     function reallocate_undersized_cubufs(T::DataType, i::Integer, max_halo_elems::Integer)
         if (!isnothing(cusendbufs_raw) && length(cusendbufs_raw[i][1]) < max_halo_elems)
             for n = 1:NNEIGHBORS_PER_DIM
-                if (is_cuarray(A) && any(cudaaware_MPI())) reallocate_cubufs(T, i, n, max_halo_elems); GC.gc(); end # Too small buffers had been replaced with larger ones; free the unused memory immediately.
+                reallocate_cubufs(T, i, n, max_halo_elems); GC.gc(); # Too small buffers had been replaced with larger ones; free the unused memory immediately.
             end
         end
     end
diff --git a/src/update_halo.jl b/src/update_halo.jl
index 9b1c48b..7661ae6 100644
--- a/src/update_halo.jl
+++ b/src/update_halo.jl
@@ -134,7 +134,7 @@ let
             if amdgpu_enabled() reinterpret_rocbufs(T, i, n); end
         end
         max_halo_elems = maximum((size(A,1)*size(A,2)*halowidths[3], size(A,1)*size(A,3)*halowidths[2], size(A,2)*size(A,3)*halowidths[1]));
-        reallocate_undersized_hostbufs(T, i, max_halo_elems);
+        reallocate_undersized_hostbufs(T, i, max_halo_elems, A);
         if (is_cuarray(A) && any(cudaaware_MPI())) reallocate_undersized_cubufs(T, i, max_halo_elems) end
         if (is_rocarray(A) && any(amdgpuaware_MPI())) reallocate_undersized_rocbufs(T, i, max_halo_elems) end
     end
@@ -158,7 +158,7 @@ let
             if (eltype(recvbufs_raw[i][n]) != T) recvbufs_raw[i][n] = reinterpret(T, recvbufs_raw[i][n]); end
         end
 
-    function reallocate_undersized_hostbufs(T::DataType, i::Integer, max_halo_elems::Integer)
+    function reallocate_undersized_hostbufs(T::DataType, i::Integer, max_halo_elems::Integer, A::GGArray)
         if (length(sendbufs_raw[i][1]) < max_halo_elems)
             for n = 1:NNEIGHBORS_PER_DIM
                 reallocate_bufs(T, i, n, max_halo_elems);
From 92138e2af94a785f2ca803955b6d41d5617ad4ce Mon Sep 17 00:00:00 2001
From: Samuel Omlin
Date: Thu, 18 Jan 2024 21:11:17 +0100
Subject: [PATCH 23/34] add missing imports in extensions

---
 src/AMDGPUExt/shared.jl    | 5 +++--
 src/CUDAExt/shared.jl      | 4 ++--
 src/CUDAExt/update_halo.jl | 5 +++++
 src/defaults_shared.jl     | 1 +
 src/shared.jl              | 1 +
 5 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/AMDGPUExt/shared.jl b/src/AMDGPUExt/shared.jl
index 402cdc2..40bc7a8 100644
--- a/src/AMDGPUExt/shared.jl
+++ b/src/AMDGPUExt/shared.jl
@@ -1,5 +1,6 @@
 import ImplicitGlobalGrid
-import ImplicitGlobalGrid: GGArray, GGField, GGNumber
+# import ImplicitGlobalGrid: GGArray, GGField, GGNumber, halosize, ol, cudaaware_MPI, sendranges, recvranges, sendbuf_flat, recvbuf_flat, write_d2x!, read_x2d!, write_d2h_async!, read_h2d_async!, is_cuarray, register
+import ImplicitGlobalGrid: GGArray, GGField, GGNumber, halosize, ol, cudaaware_MPI, sendranges, recvranges, sendbuf_flat, recvbuf_flat, write_d2x!, read_x2d!, write_d2h_async!, read_h2d_async!, register, is_rocarray
 import ImplicitGlobalGrid: NNEIGHBORS_PER_DIM, GG_ALLOC_GRANULARITY
 using AMDGPU
 
@@ -37,6 +38,6 @@ Base.eltype(A::ROCField) = Base.eltype(A.A)
 ##---------------
 ## AMDGPU functions
 
-function register(::Type{<:ROCArray},buf::Array{T}) where T <: GGNumber
+function ImplicitGlobalGrid.register(::Type{<:ROCArray},buf::Array{T}) where T <: GGNumber
     return unsafe_wrap(ROCArray, pointer(buf), size(buf))
 end
diff --git a/src/CUDAExt/shared.jl b/src/CUDAExt/shared.jl
index 93fc6dc..880f0ff 100644
--- a/src/CUDAExt/shared.jl
+++ b/src/CUDAExt/shared.jl
@@ -1,5 +1,5 @@
 import ImplicitGlobalGrid
-import ImplicitGlobalGrid: GGArray, GGField, GGNumber
+import ImplicitGlobalGrid: GGArray, GGField, GGNumber, halosize, ol, cudaaware_MPI, sendranges, recvranges, sendbuf_flat, recvbuf_flat, write_d2x!, read_x2d!, write_d2h_async!, read_h2d_async!, register, is_cuarray
 import ImplicitGlobalGrid: NNEIGHBORS_PER_DIM, GG_ALLOC_GRANULARITY
 using CUDA
 
@@ -38,7 +38,7 @@ Base.eltype(A::CuField) = Base.eltype(A.A)
 ##---------------
 ## CUDA functions
 
-function register(::Type{<:CuArray},buf::Array{T}) where T <: GGNumber
+function ImplicitGlobalGrid.register(::Type{<:CuArray},buf::Array{T}) where T <: GGNumber
     rbuf = CUDA.Mem.register(CUDA.Mem.Host, pointer(buf), sizeof(buf), CUDA.Mem.HOSTREGISTER_DEVICEMAP);
     rbuf_d = convert(CuPtr{T}, rbuf);
     return unsafe_wrap(CuArray, rbuf_d, size(buf)), rbuf;
diff --git a/src/CUDAExt/update_halo.jl b/src/CUDAExt/update_halo.jl
index 8b6d957..27bdcf2 100644
--- a/src/CUDAExt/update_halo.jl
+++ b/src/CUDAExt/update_halo.jl
@@ -140,6 +140,11 @@ function ImplicitGlobalGrid.allocate_custreams(fields::GGField...)
     allocate_custreams_iread(fields...);
 end
 
+ImplicitGlobalGrid.iwrite_sendbufs!(n::Integer, dim::Integer, F::CuField{T}, i::Integer) where {T <: GGNumber} = iwrite_sendbufs!(n,dim,F,i)
+ImplicitGlobalGrid.iread_recvbufs!(n::Integer, dim::Integer, F::CuField{T}, i::Integer) where {T <: GGNumber} = iread_recvbufs!(n,dim,F,i)
+ImplicitGlobalGrid.wait_iwrite(n::Integer, A::CuField{T}, i::Integer) where {T <: GGNumber} = wait_iwrite(n,A,i)
+ImplicitGlobalGrid.wait_iread(n::Integer, A::CuField{T}, i::Integer) where {T <: GGNumber} = wait_iread(n,A,i)
+
 let
     global iwrite_sendbufs!, allocate_custreams_iwrite, wait_iwrite
diff --git a/src/defaults_shared.jl b/src/defaults_shared.jl
index 334dae9..97a58e8 100644
--- a/src/defaults_shared.jl
+++ b/src/defaults_shared.jl
@@ -2,6 +2,7 @@
 
 is_loaded(arg) = false #TODO: this would not work as it should be the caller module...: (Base.get_extension(@__MODULE__, ext) !== nothing)
 is_functional(arg) = false
+function register end
 
 
 # update_halo.jl
diff --git a/src/shared.jl b/src/shared.jl
index 1e6a4a9..47fbe89 100644
--- a/src/shared.jl
+++ b/src/shared.jl
@@ -33,6 +33,7 @@ const DEVICE_TYPE_NONE = "none"
 const DEVICE_TYPE_AUTO   = "auto"
 const DEVICE_TYPE_CUDA   = "CUDA"
 const DEVICE_TYPE_AMDGPU = "AMDGPU"
+const SUPPORTED_DEVICE_TYPES = [DEVICE_TYPE_CUDA, DEVICE_TYPE_AMDGPU]
 
 
 ##------
From 60ed94564dd923c0957df4e40e30940166463824 Mon Sep 17 00:00:00 2001
From: Samuel Omlin
Date: Thu, 18 Jan 2024 21:11:57 +0100
Subject: [PATCH 24/34] update runtests file to give a warning if tests are
 skipped

---
 test/runtests.jl | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index a6a5800..fd8a36d 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -2,17 +2,29 @@ push!(LOAD_PATH, "../src") # FIXME: to be removed everywhere?
 
 import ImplicitGlobalGrid # Precompile it.
+import ImplicitGlobalGrid: SUPPORTED_DEVICE_TYPES, DEVICE_TYPE_CUDA, DEVICE_TYPE_AMDGPU
+@static if (DEVICE_TYPE_CUDA   in SUPPORTED_DEVICE_TYPES) import CUDA end
+@static if (DEVICE_TYPE_AMDGPU in SUPPORTED_DEVICE_TYPES) import AMDGPU end
 
 excludedfiles = ["test_excluded.jl"];
 
 function runtests()
     exename   = joinpath(Sys.BINDIR, Base.julia_exename())
     testdir   = pwd()
-    istest(f) = endswith(f, ".jl") && startswith(f, "test_")
-    testfiles = sort(filter(istest, readdir(testdir)))
+    istest(f) = endswith(f, ".jl") && startswith(basename(f), "test_")
+    testfiles = sort(filter(istest, vcat([joinpath.(root, files) for (root, dirs, files) in walkdir(testdir)]...)))
 
     nfail = 0
     printstyled("Testing package ImplicitGlobalGrid.jl\n"; bold=true, color=:white)
+
+    if (DEVICE_TYPE_CUDA in SUPPORTED_DEVICE_TYPES && !CUDA.functional())
+        @warn "Test Skip: All CUDA tests will be skipped because CUDA is not functional (if this is unexpected, type `import CUDA; CUDA.functional(true)` to debug your CUDA installation)."
+    end
+
+    if (DEVICE_TYPE_AMDGPU in SUPPORTED_DEVICE_TYPES && !AMDGPU.functional())
+        @warn "Test Skip: All AMDGPU tests will be skipped because AMDGPU is not functional (if this is unexpected, type `import AMDGPU; AMDGPU.functional()` to debug your AMDGPU installation)."
+    end
+
     for f in testfiles
         println("")
        if f ∈ excludedfiles
@@ -28,4 +40,5 @@ function runtests()
     end
     return nfail
 end
+
 exit(runtests())
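The switch from `readdir` to `walkdir` in patch 24 makes test discovery recursive, and `basename(f)` becomes necessary because the collected entries are now full paths rather than bare file names. A quick self-contained check of that one-liner (with temporary files; illustrative only):

```julia
# Recursive test discovery, as in the runtests.jl change above:
testdir = mktempdir()
mkpath(joinpath(testdir, "subdir"))
touch(joinpath(testdir, "test_a.jl"))
touch(joinpath(testdir, "subdir", "test_b.jl"))
touch(joinpath(testdir, "not_a_test.jl"))
istest(f) = endswith(f, ".jl") && startswith(basename(f), "test_")
testfiles = sort(filter(istest, vcat([joinpath.(root, files) for (root, dirs, files) in walkdir(testdir)]...)))
@assert length(testfiles) == 2   # test_a.jl and subdir/test_b.jl
```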
From a128dfa07a4bbe9fd73aba598c7f14c0520ba9e9 Mon Sep 17 00:00:00 2001
From: Samuel Omlin
Date: Fri, 19 Jan 2024 12:12:18 +0100
Subject: [PATCH 25/34] fix AMDGPUExt imports

---
 src/AMDGPUExt/shared.jl | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/AMDGPUExt/shared.jl b/src/AMDGPUExt/shared.jl
index 40bc7a8..d0e102c 100644
--- a/src/AMDGPUExt/shared.jl
+++ b/src/AMDGPUExt/shared.jl
@@ -1,6 +1,5 @@
 import ImplicitGlobalGrid
-# import ImplicitGlobalGrid: GGArray, GGField, GGNumber, halosize, ol, cudaaware_MPI, sendranges, recvranges, sendbuf_flat, recvbuf_flat, write_d2x!, read_x2d!, write_d2h_async!, read_h2d_async!, is_cuarray, register
-import ImplicitGlobalGrid: GGArray, GGField, GGNumber, halosize, ol, cudaaware_MPI, sendranges, recvranges, sendbuf_flat, recvbuf_flat, write_d2x!, read_x2d!, write_d2h_async!, read_h2d_async!, register, is_rocarray
+import ImplicitGlobalGrid: GGArray, GGField, GGNumber, halosize, ol, amdgpuaware_MPI, sendranges, recvranges, sendbuf_flat, recvbuf_flat, write_d2x!, read_x2d!, write_d2h_async!, read_h2d_async!, register, is_rocarray
 import ImplicitGlobalGrid: NNEIGHBORS_PER_DIM, GG_ALLOC_GRANULARITY
 using AMDGPU
From 1246bc8b54717fd7e650cd841fa25da44577af0e Mon Sep 17 00:00:00 2001
From: Samuel Omlin
Date: Fri, 19 Jan 2024 12:26:40 +0100
Subject: [PATCH 26/34] add device support info in init

---
 src/init_global_grid.jl | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/init_global_grid.jl b/src/init_global_grid.jl
index 4f8ba63..e81fb26 100644
--- a/src/init_global_grid.jl
+++ b/src/init_global_grid.jl
@@ -105,7 +105,11 @@ function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0
     end
     nxyz_g = dims.*(nxyz.-overlaps) .+ overlaps.*(periods.==0); # E.g. for dimension x with ol=2 and periodx=0: dimx*(nx-2)+2
     set_global_grid(GlobalGrid(nxyz_g, nxyz, dims, overlaps, halowidths, nprocs, me, coords, neighbors, periods, disp, reorder, comm_cart, cuda_enabled, amdgpu_enabled, cudaaware_MPI, amdgpuaware_MPI, loopvectorization, quiet));
-    if (!quiet && me==0) println("Global grid: $(nxyz_g[1])x$(nxyz_g[2])x$(nxyz_g[3]) (nprocs: $nprocs, dims: $(dims[1])x$(dims[2])x$(dims[3]))"); end
+    cuda_support_string   = (cuda_enabled && all(cudaaware_MPI)) ? "CUDA-aware" : (cuda_enabled && any(cudaaware_MPI)) ? "CUDA(-aware)" : (cuda_enabled) ? "CUDA" : "";
+    amdgpu_support_string = (amdgpu_enabled && all(amdgpuaware_MPI)) ? "AMDGPU-aware" : (amdgpu_enabled && any(amdgpuaware_MPI)) ? "AMDGPU(-aware)" : (amdgpu_enabled) ? "AMDGPU" : "";
+    gpu_support_string    = join(filter(!isempty, [cuda_support_string, amdgpu_support_string]), ", ");
+    support_string        = isempty(gpu_support_string) ? "CPU-only" : gpu_support_string;
+    if (!quiet && me==0) println("Global grid: $(nxyz_g[1])x$(nxyz_g[2])x$(nxyz_g[3]) (nprocs: $nprocs, dims: $(dims[1])x$(dims[2])x$(dims[3]); device support: $support_string)"); end
     if ((cuda_enabled || amdgpu_enabled) && select_device) _select_device() end
     init_timing_functions();
     return me, dims, nprocs, coords, comm_cart; # The typical use case requires only these variables; the remaining can be obtained calling get_global_grid() if needed.
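The chained ternaries in patch 26 rank the support level per backend ("X-aware" if MPI is GPU-aware in all dimensions, "X(-aware)" if only in some, plain "X" otherwise). The same logic in isolation, with made-up flag values (in the package, `cudaaware_MPI` comes from the global grid state):

```julia
cuda_enabled  = true
cudaaware_MPI = (false, false, false)   # hypothetical per-dimension flags
cuda_support_string = (cuda_enabled && all(cudaaware_MPI)) ? "CUDA-aware" :
                      (cuda_enabled && any(cudaaware_MPI)) ? "CUDA(-aware)" :
                      (cuda_enabled)                       ? "CUDA" : ""
@assert cuda_support_string == "CUDA"   # enabled, but MPI is not CUDA-aware
```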
"CPU-only" : gpu_support_string; + if (!quiet && me==0) println("Global grid: $(nxyz_g[1])x$(nxyz_g[2])x$(nxyz_g[3]) (nprocs: $nprocs, dims: $(dims[1])x$(dims[2])x$(dims[3]); device support: $support_string)"); end if ((cuda_enabled || amdgpu_enabled) && select_device) _select_device() end init_timing_functions(); return me, dims, nprocs, coords, comm_cart; # The typical use case requires only these variables; the remaining can be obtained calling get_global_grid() if needed. From 9516e2614f335f501770d43c8deef672d9bc300c Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Fri, 19 Jan 2024 12:29:38 +0100 Subject: [PATCH 27/34] add device support info in init --- src/CUDAExt/shared.jl | 2 +- src/init_global_grid.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/CUDAExt/shared.jl b/src/CUDAExt/shared.jl index 880f0ff..af2408b 100644 --- a/src/CUDAExt/shared.jl +++ b/src/CUDAExt/shared.jl @@ -14,7 +14,7 @@ const CuField{T,N} = GGField{T,N,CuArray{T,N}} ## HANDLING OF CUDA AND AMDGPU SUPPORT ImplicitGlobalGrid.is_loaded(::Val{:ImplicitGlobalGrid_CUDAExt}) = true -ImplicitGlobalGrid.is_functional(::Val{:CUDA}) = CUDA.functional(true) +ImplicitGlobalGrid.is_functional(::Val{:CUDA}) = CUDA.functional() ##------------- diff --git a/src/init_global_grid.jl b/src/init_global_grid.jl index e81fb26..3be1986 100644 --- a/src/init_global_grid.jl +++ b/src/init_global_grid.jl @@ -108,7 +108,7 @@ function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0 cuda_support_string = (cuda_enabled && all(cudaaware_MPI)) ? "CUDA-aware" : (cuda_enabled && any(cudaaware_MPI)) ? "CUDA(-aware)" : (cuda_enabled) ? "CUDA" : ""; amdgpu_support_string = (amdgpu_enabled && all(amdgpuaware_MPI)) ? "AMDGPU-aware" : (amdgpu_enabled && any(amdgpuaware_MPI)) ? "AMDGPU(-aware)" : (amdgpu_enabled) ? "AMDGPU" : ""; gpu_support_string = join(filter(!isempty, [cuda_support_string, amdgpu_support_string]), ", "); - support_string = isempty(gpu_support_string) ? "CPU-only" : gpu_support_string; + support_string = isempty(gpu_support_string) ? "none" : gpu_support_string; if (!quiet && me==0) println("Global grid: $(nxyz_g[1])x$(nxyz_g[2])x$(nxyz_g[3]) (nprocs: $nprocs, dims: $(dims[1])x$(dims[2])x$(dims[3]); device support: $support_string)"); end if ((cuda_enabled || amdgpu_enabled) && select_device) _select_device() end init_timing_functions(); From 08ce3a45ed892cb3cdb6f79394559d4196cfc37a Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Fri, 19 Jan 2024 12:58:30 +0100 Subject: [PATCH 28/34] add note about importing module to load extension in module doc string --- src/ImplicitGlobalGrid.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/ImplicitGlobalGrid.jl b/src/ImplicitGlobalGrid.jl index d460765..d844f45 100644 --- a/src/ImplicitGlobalGrid.jl +++ b/src/ImplicitGlobalGrid.jl @@ -23,6 +23,9 @@ https://github.com/eth-cscs/ImplicitGlobalGrid.jl To see a description of a function type `?`. +!!! note "Activation of device support" + The support for a device type (CUDA or AMDGPU) is activated by importing the corresponding module (CUDA or AMDGPU) before importing ImplicitGlobalGrid (the corresponding extension will be loaded). + !!! 
note "Performance note" If the system supports CUDA-aware MPI (for Nvidia GPUs) or ROCm-aware MPI (for AMD GPUs), it may be activated for ImplicitGlobalGrid by setting one of the following environment variables (at latest before the call to `init_global_grid`): ```shell From 120ef10e03cde00dcdeea3aeacb7f5c32db84e0f Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Fri, 19 Jan 2024 12:59:34 +0100 Subject: [PATCH 29/34] update examples to import CUDA before ImplicitGlobalGrid --- examples/diffusion3D_multigpu_CuArrays.jl | 3 ++- examples/diffusion3D_multigpu_CuArrays_novis.jl | 3 ++- examples/diffusion3D_multigpu_CuArrays_onlyvis.jl | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/diffusion3D_multigpu_CuArrays.jl b/examples/diffusion3D_multigpu_CuArrays.jl index 778288c..b9293e8 100644 --- a/examples/diffusion3D_multigpu_CuArrays.jl +++ b/examples/diffusion3D_multigpu_CuArrays.jl @@ -1,4 +1,5 @@ -using ImplicitGlobalGrid, CUDA, Plots +using CUDA # Import CUDA before ImplicitGlobalGrid to activate its CUDA device support +using ImplicitGlobalGrid, Plots @views d_xa(A) = A[2:end , : , : ] .- A[1:end-1, : , : ]; @views d_xi(A) = A[2:end ,2:end-1,2:end-1] .- A[1:end-1,2:end-1,2:end-1]; diff --git a/examples/diffusion3D_multigpu_CuArrays_novis.jl b/examples/diffusion3D_multigpu_CuArrays_novis.jl index 3302391..57c88a6 100644 --- a/examples/diffusion3D_multigpu_CuArrays_novis.jl +++ b/examples/diffusion3D_multigpu_CuArrays_novis.jl @@ -1,4 +1,5 @@ -using ImplicitGlobalGrid, CUDA +using CUDA # Import CUDA before ImplicitGlobalGrid to activate its CUDA device support +using ImplicitGlobalGrid @views d_xa(A) = A[2:end , : , : ] .- A[1:end-1, : , : ]; @views d_xi(A) = A[2:end ,2:end-1,2:end-1] .- A[1:end-1,2:end-1,2:end-1]; diff --git a/examples/diffusion3D_multigpu_CuArrays_onlyvis.jl b/examples/diffusion3D_multigpu_CuArrays_onlyvis.jl index 9a185d7..6cd0b9f 100644 --- a/examples/diffusion3D_multigpu_CuArrays_onlyvis.jl +++ b/examples/diffusion3D_multigpu_CuArrays_onlyvis.jl @@ -1,4 +1,5 @@ -using ImplicitGlobalGrid, CUDA, Plots +using CUDA # Import CUDA before ImplicitGlobalGrid to activate its CUDA device support +using ImplicitGlobalGrid, Plots #(...) @views function diffusion3D() From c58947daa079fd52c97dc3b43ee9b0810ee08fa4 Mon Sep 17 00:00:00 2001 From: Samuel Omlin Date: Fri, 19 Jan 2024 13:00:20 +0100 Subject: [PATCH 30/34] update examples in documentation to import CUDA before ImplicitGlobalGrid --- README.md | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index bd061a2..669fdd0 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,8 @@ The following Multi-GPU 3-D heat diffusion solver illustrates how these function ## 50-lines Multi-GPU example This simple Multi-GPU 3-D heat diffusion solver uses ImplicitGlobalGrid. It relies fully on the broadcasting capabilities of [CUDA.jl]'s `CuArray` type to perform the stencil-computations with maximal simplicity ([CUDA.jl] enables also writing explicit GPU kernels which can lead to significantly better performance for these computations). 
From c58947daa079fd52c97dc3b43ee9b0810ee08fa4 Mon Sep 17 00:00:00 2001
From: Samuel Omlin
Date: Fri, 19 Jan 2024 13:00:20 +0100
Subject: [PATCH 30/34] update examples in documentation to import CUDA before
 ImplicitGlobalGrid

---
 README.md | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index bd061a2..669fdd0 100644
--- a/README.md
+++ b/README.md
@@ -45,7 +45,8 @@ The following Multi-GPU 3-D heat diffusion solver illustrates how these function
 ## 50-lines Multi-GPU example
 This simple Multi-GPU 3-D heat diffusion solver uses ImplicitGlobalGrid. It relies fully on the broadcasting capabilities of [CUDA.jl]'s `CuArray` type to perform the stencil-computations with maximal simplicity ([CUDA.jl] enables also writing explicit GPU kernels which can lead to significantly better performance for these computations).
 ```julia
-using ImplicitGlobalGrid, CUDA
+using CUDA # Import CUDA before ImplicitGlobalGrid to activate its CUDA device support
+using ImplicitGlobalGrid
 
 @views d_xa(A) = A[2:end  , :     , :     ] .- A[1:end-1, :     , :     ];
 @views d_xi(A) = A[2:end  ,2:end-1,2:end-1] .- A[1:end-1,2:end-1,2:end-1];
@@ -108,7 +109,8 @@ ImplicitGlobalGrid provides a function to gather an array from each process into
 This enables straightforward in-situ visualization or monitoring of Multi-GPU/CPU applications using e.g. the [Julia Plots package] as shown in the following (the GR backend is used as it is particularly fast according to the [Julia Plots documentation]). It is enough to add a couple of lines to the previous example (omitted unmodified lines are represented with `#(...)`):
 ```julia
-using ImplicitGlobalGrid, CUDA, Plots
+using CUDA # Import CUDA before ImplicitGlobalGrid to activate its CUDA device support
+using ImplicitGlobalGrid, Plots
 #(...)
 
 @views function diffusion3D()
@@ -230,12 +232,15 @@ search: ImplicitGlobalGrid
 
   To see a description of a function type ?<functionname>.
 
+  │ Activation of device support
+  │
+  │ The support for a device type (CUDA or AMDGPU) is activated by importing the corresponding module (CUDA or AMDGPU) before
+  │ importing ImplicitGlobalGrid (the corresponding extension will be loaded).
+
   │ Performance note
   │
-  │ If the system supports CUDA-aware MPI (for Nvidia GPUs) or
-  │ ROCm-aware MPI (for AMD GPUs), it may be activated for
-  │ ImplicitGlobalGrid by setting one of the following environment
-  │ variables (at latest before the call to init_global_grid):
+  │ If the system supports CUDA-aware MPI (for Nvidia GPUs) or ROCm-aware MPI (for AMD GPUs), it may be activated for
+  │ ImplicitGlobalGrid by setting one of the following environment variables (at latest before the call to init_global_grid):
 
   │
   │ shell> export IGG_CUDAAWARE_MPI=1
  │

From 3d5d80538216e56ca749b3f17498e987a0b9d69e Mon Sep 17 00:00:00 2001
From: Samuel Omlin
Date: Fri, 19 Jan 2024 13:08:07 +0100
Subject: [PATCH 31/34] add all_arrays

---
 src/shared.jl | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/shared.jl b/src/shared.jl
index 47fbe89..b2bfad3 100644
--- a/src/shared.jl
+++ b/src/shared.jl
@@ -114,6 +114,9 @@ has_neighbor(n::Integer, dim::Integer) = neighbor(n, dim) != MPI.PROC_NULL
 any_array(fields::GGField...)     = any([is_array(A.A) for A in fields])
 any_cuarray(fields::GGField...)   = any([is_cuarray(A.A) for A in fields])
 any_rocarray(fields::GGField...)  = any([is_rocarray(A.A) for A in fields])
+all_arrays(fields::GGField...)    = all([is_array(A.A) for A in fields])
+all_cuarrays(fields::GGField...)  = all([is_cuarray(A.A) for A in fields])
+all_rocarrays(fields::GGField...) = all([is_rocarray(A.A) for A in fields])
 is_array(A::GGArray) = typeof(A) <: Array
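The new `all_*` predicates in patch 31 complement the existing `any_*` ones: `all_*` requires every field to live on the given array type, `any_*` just one. Reduced to plain arrays (leaving out the `GGField` wrapper), the semantics are:

```julia
is_plain_array(A)    = A isa Array
any_plain(fields...) = any(is_plain_array, fields)
all_plain(fields...) = all(is_plain_array, fields)

@assert any_plain(zeros(2, 2), 1:4)    # one Array suffices
@assert !all_plain(zeros(2, 2), 1:4)   # a UnitRange is not an Array
@assert all_plain(zeros(2), ones(3))
```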
From c8712e68fc7220cd002165377be8eb6d1fb34a9b Mon Sep 17 00:00:00 2001
From: Samuel Omlin
Date: Fri, 19 Jan 2024 15:24:52 +0100
Subject: [PATCH 32/34] add Julia 1.9 and remove nightly from CI

---
 .github/workflows/ci.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 7c37789..b261117 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -20,9 +20,9 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          # - '1.7' # Skipping this version because of AMDGPU deps compat issue (rocBLAS_jll)
+          - '1.9' # Minimum version supporting extensions
           - '1'   # Latest stable 1.x release of Julia
-          - 'nightly'
+          # - 'nightly'
         os:
          - ubuntu-latest
          - macOS-latest

From 040facc48a725dd1d83c1f542f92201d72d64c37 Mon Sep 17 00:00:00 2001
From: Samuel Omlin
Date: Mon, 22 Jan 2024 15:56:56 +0100
Subject: [PATCH 33/34] add missing AMDGPU methods

---
 src/AMDGPUExt/update_halo.jl | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/AMDGPUExt/update_halo.jl b/src/AMDGPUExt/update_halo.jl
index 267d421..b06f861 100644
--- a/src/AMDGPUExt/update_halo.jl
+++ b/src/AMDGPUExt/update_halo.jl
@@ -128,6 +128,11 @@ function ImplicitGlobalGrid.allocate_rocstreams(fields::GGField...)
     allocate_rocstreams_iread(fields...);
 end
 
+ImplicitGlobalGrid.iwrite_sendbufs!(n::Integer, dim::Integer, F::ROCField{T}, i::Integer) where {T <: GGNumber} = iwrite_sendbufs!(n,dim,F,i)
+ImplicitGlobalGrid.iread_recvbufs!(n::Integer, dim::Integer, F::ROCField{T}, i::Integer) where {T <: GGNumber} = iread_recvbufs!(n,dim,F,i)
+ImplicitGlobalGrid.wait_iwrite(n::Integer, A::ROCField{T}, i::Integer) where {T <: GGNumber} = wait_iwrite(n,A,i)
+ImplicitGlobalGrid.wait_iread(n::Integer, A::ROCField{T}, i::Integer) where {T <: GGNumber} = wait_iread(n,A,i)
+
 let
     global iwrite_sendbufs!, allocate_rocstreams_iwrite, wait_iwrite

From 65e3e498fe9ef6959351113f7918947a46c3d626 Mon Sep 17 00:00:00 2001
From: Samuel Omlin
Date: Mon, 22 Jan 2024 15:57:46 +0100
Subject: [PATCH 34/34] fix AMDGPU device selection test

---
 test/test_select_device.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_select_device.jl b/test/test_select_device.jl
index 79c2c7a..10cd4d7 100644
--- a/test/test_select_device.jl
+++ b/test/test_select_device.jl
@@ -33,13 +33,13 @@ nprocs = MPI.Comm_size(MPI.COMM_WORLD); # NOTE: these tests can run with any num
     @testset "\"AMDGPU\"" begin
         me, = init_global_grid(3, 4, 5; quiet=true, init_MPI=false, device_type="AMDGPU");
         gpu_id = select_device();
-        @test gpu_id < length(AMDGPU.devices())
+        @test gpu_id <= length(AMDGPU.devices())
         finalize_global_grid(finalize_MPI=false);
     end;
     @testset "\"auto\"" begin
         me, = init_global_grid(3, 4, 5; quiet=true, init_MPI=false, device_type="auto");
         gpu_id = select_device();
-        @test gpu_id < length(AMDGPU.devices())
+        @test gpu_id <= length(AMDGPU.devices())
         finalize_global_grid(finalize_MPI=false);
     end;
 end
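The change from `<` to `<=` in patch 34 follows from the device-id convention set in `select_device` (`device_id = amdgpu_enabled() ? me_l+1 : me_l`): ids are 0-based for CUDA but 1-based for AMDGPU, so with N devices the valid AMDGPU ids run from 1 to N inclusive. A worked check of the two bounds (numbers are illustrative):

```julia
nb_devices = 4
me_l       = 3            # node-local MPI rank (0-based)
cuda_id    = me_l         # CUDA:   ids 0..nb_devices-1  ->  cuda_id   <  nb_devices
amdgpu_id  = me_l + 1     # AMDGPU: ids 1..nb_devices    ->  amdgpu_id <= nb_devices
@assert cuda_id < nb_devices && amdgpu_id <= nb_devices
```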