diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7c37789..b261117 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -20,9 +20,9 @@ jobs: fail-fast: false matrix: version: - # - '1.7' # Skipping this version because of AMDGPU deps compat issue (rocBLAS_jll) + - '1.9' # Minimum version supporting extensions - '1' # Latest stable 1.x release of Julia - - 'nightly' + # - 'nightly' os: - ubuntu-latest - macOS-latest diff --git a/Project.toml b/Project.toml index 739de78..0d0aa91 100644 --- a/Project.toml +++ b/Project.toml @@ -4,15 +4,17 @@ uuid = "4d7a3746-15be-11ea-1130-334b0c4f5fa0" version = "0.14.0" [deps] -AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" [weakdeps] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" [extensions] ImplicitGlobalGrid_LoopVectorizationExt = "LoopVectorization" +ImplicitGlobalGrid_AMDGPUExt = "AMDGPU" +ImplicitGlobalGrid_CUDAExt = "CUDA" [compat] AMDGPU = "0.5, 0.6, 0.7, 0.8" @@ -27,4 +29,4 @@ MPIPreferences = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Test", "MPIPreferences", "LoopVectorization"] +test = ["Test", "MPIPreferences", "AMDGPU", "CUDA", "LoopVectorization"] diff --git a/README.md b/README.md index bd061a2..669fdd0 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,8 @@ The following Multi-GPU 3-D heat diffusion solver illustrates how these function ## 50-lines Multi-GPU example This simple Multi-GPU 3-D heat diffusion solver uses ImplicitGlobalGrid. It relies fully on the broadcasting capabilities of [CUDA.jl]'s `CuArray` type to perform the stencil-computations with maximal simplicity ([CUDA.jl] enables also writing explicit GPU kernels which can lead to significantly better performance for these computations). ```julia -using ImplicitGlobalGrid, CUDA +using CUDA # Import CUDA before ImplicitGlobalGrid to activate its CUDA device support +using ImplicitGlobalGrid @views d_xa(A) = A[2:end , : , : ] .- A[1:end-1, : , : ]; @views d_xi(A) = A[2:end ,2:end-1,2:end-1] .- A[1:end-1,2:end-1,2:end-1]; @@ -108,7 +109,8 @@ ImplicitGlobalGrid provides a function to gather an array from each process into This enables straightforward in-situ visualization or monitoring of Multi-GPU/CPU applications using e.g. the [Julia Plots package] as shown in the following (the GR backend is used as it is particularly fast according to the [Julia Plots documentation]). It is enough to add a couple of lines to the previous example (omitted unmodified lines are represented with `#(...)`): ```julia -using ImplicitGlobalGrid, CUDA, Plots +using CUDA # Import CUDA before ImplicitGlobalGrid to activate its CUDA device support +using ImplicitGlobalGrid, Plots #(...) @views function diffusion3D() @@ -230,12 +232,15 @@ search: ImplicitGlobalGrid To see a description of a function type ?. + │ Activation of device support + │ + │ The support for a device type (CUDA or AMDGPU) is activated by importing the corresponding module (CUDA or AMDGPU) before + │ importing ImplicitGlobalGrid (the corresponding extension will be loaded). 
+ │ Performance note │ - │ If the system supports CUDA-aware MPI (for Nvidia GPUs) or - │ ROCm-aware MPI (for AMD GPUs), it may be activated for - │ ImplicitGlobalGrid by setting one of the following environment - │ variables (at latest before the call to init_global_grid): + │ If the system supports CUDA-aware MPI (for Nvidia GPUs) or ROCm-aware MPI (for AMD GPUs), it may be activated for + │ ImplicitGlobalGrid by setting one of the following environment variables (at latest before the call to init_global_grid): │ │ shell> export IGG_CUDAAWARE_MPI=1 │ diff --git a/examples/diffusion3D_multigpu_CuArrays.jl b/examples/diffusion3D_multigpu_CuArrays.jl index 778288c..b9293e8 100644 --- a/examples/diffusion3D_multigpu_CuArrays.jl +++ b/examples/diffusion3D_multigpu_CuArrays.jl @@ -1,4 +1,5 @@ -using ImplicitGlobalGrid, CUDA, Plots +using CUDA # Import CUDA before ImplicitGlobalGrid to activate its CUDA device support +using ImplicitGlobalGrid, Plots @views d_xa(A) = A[2:end , : , : ] .- A[1:end-1, : , : ]; @views d_xi(A) = A[2:end ,2:end-1,2:end-1] .- A[1:end-1,2:end-1,2:end-1]; diff --git a/examples/diffusion3D_multigpu_CuArrays_novis.jl b/examples/diffusion3D_multigpu_CuArrays_novis.jl index 3302391..57c88a6 100644 --- a/examples/diffusion3D_multigpu_CuArrays_novis.jl +++ b/examples/diffusion3D_multigpu_CuArrays_novis.jl @@ -1,4 +1,5 @@ -using ImplicitGlobalGrid, CUDA +using CUDA # Import CUDA before ImplicitGlobalGrid to activate its CUDA device support +using ImplicitGlobalGrid @views d_xa(A) = A[2:end , : , : ] .- A[1:end-1, : , : ]; @views d_xi(A) = A[2:end ,2:end-1,2:end-1] .- A[1:end-1,2:end-1,2:end-1]; diff --git a/examples/diffusion3D_multigpu_CuArrays_onlyvis.jl b/examples/diffusion3D_multigpu_CuArrays_onlyvis.jl index 9a185d7..6cd0b9f 100644 --- a/examples/diffusion3D_multigpu_CuArrays_onlyvis.jl +++ b/examples/diffusion3D_multigpu_CuArrays_onlyvis.jl @@ -1,4 +1,5 @@ -using ImplicitGlobalGrid, CUDA, Plots +using CUDA # Import CUDA before ImplicitGlobalGrid to activate its CUDA device support +using ImplicitGlobalGrid, Plots #(...) @views function diffusion3D() diff --git a/ext/ImplicitGlobalGrid_AMDGPUExt.jl b/ext/ImplicitGlobalGrid_AMDGPUExt.jl new file mode 100644 index 0000000..5ac806f --- /dev/null +++ b/ext/ImplicitGlobalGrid_AMDGPUExt.jl @@ -0,0 +1,5 @@ +module ImplicitGlobalGrid_AMDGPUExt + include(joinpath(@__DIR__, "..", "src", "AMDGPUExt", "shared.jl")) + include(joinpath(@__DIR__, "..", "src", "AMDGPUExt", "select_device.jl")) + include(joinpath(@__DIR__, "..", "src", "AMDGPUExt", "update_halo.jl")) +end \ No newline at end of file diff --git a/ext/ImplicitGlobalGrid_CUDAExt.jl b/ext/ImplicitGlobalGrid_CUDAExt.jl new file mode 100644 index 0000000..58775fd --- /dev/null +++ b/ext/ImplicitGlobalGrid_CUDAExt.jl @@ -0,0 +1,5 @@ +module ImplicitGlobalGrid_CUDAExt + include(joinpath(@__DIR__, "..", "src", "CUDAExt", "shared.jl")) + include(joinpath(@__DIR__, "..", "src", "CUDAExt", "select_device.jl")) + include(joinpath(@__DIR__, "..", "src", "CUDAExt", "update_halo.jl")) +end \ No newline at end of file diff --git a/src/AMDGPUExt/defaults.jl b/src/AMDGPUExt/defaults.jl new file mode 100644 index 0000000..9fec08b --- /dev/null +++ b/src/AMDGPUExt/defaults.jl @@ -0,0 +1,22 @@ +# shared.jl + +is_rocarray(A::GGArray) = false + + +# select_device.jl + +function nb_rocdevices end +function rocdevice! 
end + + +# update_halo.jl + +function free_update_halo_rocbuffers end +function init_rocbufs_arrays end +function init_rocbufs end +function reinterpret_rocbufs end +function reallocate_undersized_rocbufs end +function reregister_rocbufs end +function get_rocsendbufs_raw end +function get_rocrecvbufs_raw end +function allocate_rocstreams end \ No newline at end of file diff --git a/src/AMDGPUExt/select_device.jl b/src/AMDGPUExt/select_device.jl new file mode 100644 index 0000000..cb8cce3 --- /dev/null +++ b/src/AMDGPUExt/select_device.jl @@ -0,0 +1,2 @@ +ImplicitGlobalGrid.nb_rocdevices() = length(AMDGPU.devices()) +ImplicitGlobalGrid.rocdevice!(device_id) = AMDGPU.device_id!(device_id) \ No newline at end of file diff --git a/src/AMDGPUExt/shared.jl b/src/AMDGPUExt/shared.jl new file mode 100644 index 0000000..d0e102c --- /dev/null +++ b/src/AMDGPUExt/shared.jl @@ -0,0 +1,42 @@ +import ImplicitGlobalGrid +import ImplicitGlobalGrid: GGArray, GGField, GGNumber, halosize, ol, amdgpuaware_MPI, sendranges, recvranges, sendbuf_flat, recvbuf_flat, write_d2x!, read_x2d!, write_d2h_async!, read_h2d_async!, register, is_rocarray +import ImplicitGlobalGrid: NNEIGHBORS_PER_DIM, GG_ALLOC_GRANULARITY +using AMDGPU + + +##------ +## TYPES + +const ROCField{T,N} = GGField{T,N,ROCArray{T,N}} + + +##------------------------------------ +## HANDLING OF CUDA AND AMDGPU SUPPORT + +ImplicitGlobalGrid.is_loaded(::Val{:ImplicitGlobalGrid_AMDGPUExt}) = true +ImplicitGlobalGrid.is_functional(::Val{:AMDGPU}) = AMDGPU.functional() + + +##------------- +## SYNTAX SUGAR + +ImplicitGlobalGrid.is_rocarray(A::ROCArray) = true #NOTE: this function is only to be used when multiple dispatch on the type of the array seems an overkill (in particular when only something needs to be done for the GPU case, but nothing for the CPU case) and as long as performance does not suffer. + + +##-------------------------------------------------------------------------------- +## FUNCTIONS FOR WRAPPING ARRAYS AND FIELDS AND DEFINE ARRAY PROPERTY BASE METHODS + +ImplicitGlobalGrid.wrap_field(A::ROCArray, hw::Tuple) = ROCField{eltype(A), ndims(A)}((A, hw)) + +Base.size(A::ROCField) = Base.size(A.A) +Base.size(A::ROCField, args...) = Base.size(A.A, args...) +Base.length(A::ROCField) = Base.length(A.A) +Base.ndims(A::ROCField) = Base.ndims(A.A) +Base.eltype(A::ROCField) = Base.eltype(A.A) + +##--------------- +## AMDGPU functions + +function ImplicitGlobalGrid.register(::Type{<:ROCArray},buf::Array{T}) where T <: GGNumber + return unsafe_wrap(ROCArray, pointer(buf), size(buf)) +end diff --git a/src/AMDGPUExt/update_halo.jl b/src/AMDGPUExt/update_halo.jl new file mode 100644 index 0000000..b06f861 --- /dev/null +++ b/src/AMDGPUExt/update_halo.jl @@ -0,0 +1,258 @@ +##--------------------------------------- +## FUNCTIONS RELATED TO BUFFER ALLOCATION + +# NOTE: CUDA and AMDGPU buffers live and are dealt with independently, enabling the support of usage of CUDA and AMD GPUs at the same time. + +ImplicitGlobalGrid.free_update_halo_rocbuffers(args...) = free_update_halo_rocbuffers(args...) +ImplicitGlobalGrid.init_rocbufs_arrays(args...) = init_rocbufs_arrays(args...) +ImplicitGlobalGrid.init_rocbufs(args...) = init_rocbufs(args...) +ImplicitGlobalGrid.reinterpret_rocbufs(args...) = reinterpret_rocbufs(args...) +ImplicitGlobalGrid.reallocate_undersized_rocbufs(args...) = reallocate_undersized_rocbufs(args...) +ImplicitGlobalGrid.reregister_rocbufs(args...) = reregister_rocbufs(args...) +ImplicitGlobalGrid.get_rocsendbufs_raw(args...) 
= get_rocsendbufs_raw(args...) +ImplicitGlobalGrid.get_rocrecvbufs_raw(args...) = get_rocrecvbufs_raw(args...) +ImplicitGlobalGrid.gpusendbuf(n::Integer, dim::Integer, i::Integer, A::ROCField{T}) where {T <: GGNumber} = gpusendbuf(n,dim,i,A) +ImplicitGlobalGrid.gpurecvbuf(n::Integer, dim::Integer, i::Integer, A::ROCField{T}) where {T <: GGNumber} = gpurecvbuf(n,dim,i,A) +ImplicitGlobalGrid.gpusendbuf_flat(n::Integer, dim::Integer, i::Integer, A::ROCField{T}) where {T <: GGNumber} = gpusendbuf_flat(n,dim,i,A) +ImplicitGlobalGrid.gpurecvbuf_flat(n::Integer, dim::Integer, i::Integer, A::ROCField{T}) where {T <: GGNumber} = gpurecvbuf_flat(n,dim,i,A) + +let + global free_update_halo_rocbuffers, init_rocbufs_arrays, init_rocbufs, reinterpret_rocbufs, reregister_rocbufs, reallocate_undersized_rocbufs + global gpusendbuf, gpurecvbuf, gpusendbuf_flat, gpurecvbuf_flat + rocsendbufs_raw = nothing + rocrecvbufs_raw = nothing + # INFO: no need for roc host buffers + + function free_update_halo_rocbuffers() + free_rocbufs(rocsendbufs_raw) + free_rocbufs(rocrecvbufs_raw) + # INFO: no need for roc host buffers + reset_roc_buffers() + end + + function free_rocbufs(bufs) + if (bufs !== nothing) + for i = 1:length(bufs) + for n = 1:length(bufs[i]) + if is_rocarray(bufs[i][n]) AMDGPU.unsafe_free!(bufs[i][n]); bufs[i][n] = []; end # DEBUG: unsafe_free should be managed in AMDGPU + end + end + end + end + + # INFO: no need for roc host buffers + # function unregister_rocbufs(bufs) + # end + + function reset_roc_buffers() + rocsendbufs_raw = nothing + rocrecvbufs_raw = nothing + # INFO: no need for roc host buffers + end + + + # (AMDGPU functions) + + function init_rocbufs_arrays() + rocsendbufs_raw = Array{Array{Any,1},1}(); + rocrecvbufs_raw = Array{Array{Any,1},1}(); + # INFO: no need for roc host buffers + end + + function init_rocbufs(T::DataType, fields::GGField...) + while (length(rocsendbufs_raw) < length(fields)) push!(rocsendbufs_raw, [ROCArray{T}(undef,0), ROCArray{T}(undef,0)]); end + while (length(rocrecvbufs_raw) < length(fields)) push!(rocrecvbufs_raw, [ROCArray{T}(undef,0), ROCArray{T}(undef,0)]); end + # INFO: no need for roc host buffers + end + + function reinterpret_rocbufs(T::DataType, i::Integer, n::Integer) + if (eltype(rocsendbufs_raw[i][n]) != T) rocsendbufs_raw[i][n] = reinterpret(T, rocsendbufs_raw[i][n]); end + if (eltype(rocrecvbufs_raw[i][n]) != T) rocrecvbufs_raw[i][n] = reinterpret(T, rocrecvbufs_raw[i][n]); end + end + + function reallocate_undersized_rocbufs(T::DataType, i::Integer, max_halo_elems::Integer) + if (!isnothing(rocsendbufs_raw) && length(rocsendbufs_raw[i][1]) < max_halo_elems) + for n = 1:NNEIGHBORS_PER_DIM + reallocate_rocbufs(T, i, n, max_halo_elems); GC.gc(); # Too small buffers had been replaced with larger ones; free the unused memory immediately. + end + end + end + + function reallocate_rocbufs(T::DataType, i::Integer, n::Integer, max_halo_elems::Integer) + rocsendbufs_raw[i][n] = AMDGPU.zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY); # Ensure that the amount of allocated memory is a multiple of 4*sizeof(T) (sizeof(Float64)/sizeof(Float16) = 4). So, we can always correctly reinterpret the raw buffers even if next time sizeof(T) is greater. 
+ rocrecvbufs_raw[i][n] = AMDGPU.zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY); + end + + function reregister_rocbufs(T::DataType, i::Integer, n::Integer, sendbufs_raw, recvbufs_raw) + # INFO: no need for roc host buffers + rocsendbufs_raw[i][n] = register(ROCArray,sendbufs_raw[i][n]); + rocrecvbufs_raw[i][n] = register(ROCArray,recvbufs_raw[i][n]); + end + + + # (AMDGPU functions) + + function gpusendbuf_flat(n::Integer, dim::Integer, i::Integer, A::ROCField{T}) where T <: GGNumber + return view(rocsendbufs_raw[i][n]::ROCVector{T},1:prod(halosize(dim,A))); + end + + function gpurecvbuf_flat(n::Integer, dim::Integer, i::Integer, A::ROCField{T}) where T <: GGNumber + return view(rocrecvbufs_raw[i][n]::ROCVector{T},1:prod(halosize(dim,A))); + end + + + # (GPU functions) + + #TODO: see if remove T here and in other cases for CuArray, ROCArray or Array (but then it does not verify that CuArray/ROCArray is of type GGNumber) or if I should instead change GGArray to GGArrayUnion and create: GGArray = Array{T} where T <: GGNumber and GGCuArray = CuArray{T} where T <: GGNumber; This is however more difficult to read and understand for others. + function gpusendbuf(n::Integer, dim::Integer, i::Integer, A::ROCField{T}) where T <: GGNumber + return reshape(gpusendbuf_flat(n,dim,i,A), halosize(dim,A)); + end + + function gpurecvbuf(n::Integer, dim::Integer, i::Integer, A::ROCField{T}) where T <: GGNumber + return reshape(gpurecvbuf_flat(n,dim,i,A), halosize(dim,A)); + end + + + # Make sendbufs_raw and recvbufs_raw accessible for unit testing. + global get_rocsendbufs_raw, get_rocrecvbufs_raw + get_rocsendbufs_raw() = deepcopy(rocsendbufs_raw) + get_rocrecvbufs_raw() = deepcopy(rocrecvbufs_raw) +end + + +##---------------------------------------------- +## FUNCTIONS TO WRITE AND READ SEND/RECV BUFFERS + +function ImplicitGlobalGrid.allocate_rocstreams(fields::GGField...) + allocate_rocstreams_iwrite(fields...); + allocate_rocstreams_iread(fields...); +end + +ImplicitGlobalGrid.iwrite_sendbufs!(n::Integer, dim::Integer, F::ROCField{T}, i::Integer) where {T <: GGNumber} = iwrite_sendbufs!(n,dim,F,i) +ImplicitGlobalGrid.iread_recvbufs!(n::Integer, dim::Integer, F::ROCField{T}, i::Integer) where {T <: GGNumber} = iread_recvbufs!(n,dim,F,i) +ImplicitGlobalGrid.wait_iwrite(n::Integer, A::ROCField{T}, i::Integer) where {T <: GGNumber} = wait_iwrite(n,A,i) +ImplicitGlobalGrid.wait_iread(n::Integer, A::ROCField{T}, i::Integer) where {T <: GGNumber} = wait_iread(n,A,i) + +let + global iwrite_sendbufs!, allocate_rocstreams_iwrite, wait_iwrite + + rocstreams = Array{AMDGPU.HIPStream}(undef, NNEIGHBORS_PER_DIM, 0) + + wait_iwrite(n::Integer, A::ROCField{T}, i::Integer) where T <: GGNumber = AMDGPU.synchronize(rocstreams[n,i]); + + function allocate_rocstreams_iwrite(fields::GGField...) + if length(fields) > size(rocstreams,2) # Note: for simplicity, we create a stream for every field even if it is not a ROCField + rocstreams = [rocstreams [AMDGPU.HIPStream(:high) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(rocstreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. + end + end + + function iwrite_sendbufs!(n::Integer, dim::Integer, F::ROCField{T}, i::Integer) where T <: GGNumber + A, halowidths = F; + if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... 
+            # DEBUG: the following section needs perf testing
+            # DEBUG 2: commenting read_h2d_async! for now
+            # if dim == 1 || amdgpuaware_MPI(dim)  # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case).
+                ranges = sendranges(n, dim, F);
+                nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1);
+                halosize = [r[end] - r[1] + 1 for r in ranges];
+                nblocks = Tuple(ceil.(Int, halosize./nthreads));
+                @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] write_d2x!(gpusendbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim);
+            # else
+            #     write_d2h_async!(sendbuf_flat(n,dim,i,F), A, sendranges(n,dim,F), rocstreams[n,i]);
+            # end
+        end
+    end
+end
+
+let
+    global iread_recvbufs!, allocate_rocstreams_iread, wait_iread
+
+    rocstreams = Array{AMDGPU.HIPStream}(undef, NNEIGHBORS_PER_DIM, 0)
+
+    wait_iread(n::Integer, A::ROCField{T}, i::Integer) where T <: GGNumber = AMDGPU.synchronize(rocstreams[n,i]);
+
+    function allocate_rocstreams_iread(fields::GGField...)
+        if length(fields) > size(rocstreams,2)  # Note: for simplicity, we create a stream for every field even if it is not a ROCField
+            rocstreams = [rocstreams [AMDGPU.HIPStream(:high) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(rocstreams,2))]];  # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels.
+        end
+    end
+
+    function iread_recvbufs!(n::Integer, dim::Integer, F::ROCField{T}, i::Integer) where T <: GGNumber
+        A, halowidths = F;
+        if ol(dim,A) >= 2*halowidths[dim]  # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth...
+            # DEBUG: the following section needs perf testing
+            # DEBUG 2: commenting read_h2d_async! for now
+            # if dim == 1 || amdgpuaware_MPI(dim)  # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case).
+                ranges = recvranges(n, dim, F);
+                nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1);
+                halosize = [r[end] - r[1] + 1 for r in ranges];
+                nblocks = Tuple(ceil.(Int, halosize./nthreads));
+                @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim);
+            # else
+            #     read_h2d_async!(recvbuf_flat(n,dim,i,F), A, recvranges(n,dim,F), rocstreams[n,i]);
+            # end
+        end
+    end
+
+end
+
+
+# (AMDGPU functions)
+
+# Write to the send buffer on the host or device from the array on the device (d2x).
+function ImplicitGlobalGrid.write_d2x!(gpusendbuf::ROCDeviceArray{T}, A::ROCDeviceArray{T}, sendrangex::UnitRange{Int64}, sendrangey::UnitRange{Int64}, sendrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber
+    ix = (AMDGPU.workgroupIdx().x-1) * AMDGPU.workgroupDim().x + AMDGPU.workitemIdx().x + sendrangex[1] - 1
+    iy = (AMDGPU.workgroupIdx().y-1) * AMDGPU.workgroupDim().y + AMDGPU.workitemIdx().y + sendrangey[1] - 1
+    iz = (AMDGPU.workgroupIdx().z-1) * AMDGPU.workgroupDim().z + AMDGPU.workitemIdx().z + sendrangez[1] - 1
+    if !(ix in sendrangex && iy in sendrangey && iz in sendrangez) return nothing; end
+    gpusendbuf[ix-(sendrangex[1]-1),iy-(sendrangey[1]-1),iz-(sendrangez[1]-1)] = A[ix,iy,iz];
+    return nothing
+end
+
+# Read from the receive buffer on the host or device and store on the array on the device (x2d).
+function ImplicitGlobalGrid.read_x2d!(gpurecvbuf::ROCDeviceArray{T}, A::ROCDeviceArray{T}, recvrangex::UnitRange{Int64}, recvrangey::UnitRange{Int64}, recvrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber + ix = (AMDGPU.workgroupIdx().x-1) * AMDGPU.workgroupDim().x + AMDGPU.workitemIdx().x + recvrangex[1] - 1 + iy = (AMDGPU.workgroupIdx().y-1) * AMDGPU.workgroupDim().y + AMDGPU.workitemIdx().y + recvrangey[1] - 1 + iz = (AMDGPU.workgroupIdx().z-1) * AMDGPU.workgroupDim().z + AMDGPU.workitemIdx().z + recvrangez[1] - 1 + if !(ix in recvrangex && iy in recvrangey && iz in recvrangez) return nothing; end + A[ix,iy,iz] = gpurecvbuf[ix-(recvrangex[1]-1),iy-(recvrangey[1]-1),iz-(recvrangez[1]-1)]; + return nothing +end + +# Write to the send buffer on the host from the array on the device (d2h). +function ImplicitGlobalGrid.write_d2h_async!(sendbuf::AbstractArray{T}, A::ROCArray{T}, sendranges::Array{UnitRange{T2},1}, rocstream::AMDGPU.HIPStream) where T <: GGNumber where T2 <: Integer + buf_view = reshape(sendbuf, Tuple(length.(sendranges))) + AMDGPU.Mem.unsafe_copy3d!( + pointer(sendbuf), AMDGPU.Mem.HostBuffer, + pointer(A), typeof(A.buf), + length(sendranges[1]), length(sendranges[2]), length(sendranges[3]); + srcPos=(sendranges[1][1], sendranges[2][1], sendranges[3][1]), + dstPitch=sizeof(T) * size(buf_view, 1), dstHeight=size(buf_view, 2), + srcPitch=sizeof(T) * size(A, 1), srcHeight=size(A, 2), + async=true, stream=rocstream + ) + return nothing +end + +# Read from the receive buffer on the host and store on the array on the device (h2d). +function ImplicitGlobalGrid.read_h2d_async!(recvbuf::AbstractArray{T}, A::ROCArray{T}, recvranges::Array{UnitRange{T2},1}, rocstream::AMDGPU.HIPStream) where T <: GGNumber where T2 <: Integer + buf_view = reshape(recvbuf, Tuple(length.(recvranges))) + AMDGPU.Mem.unsafe_copy3d!( + pointer(A), typeof(A.buf), + pointer(recvbuf), AMDGPU.Mem.HostBuffer, + length(recvranges[1]), length(recvranges[2]), length(recvranges[3]); + dstPos=(recvranges[1][1], recvranges[2][1], recvranges[3][1]), + dstPitch=sizeof(T) * size(A, 1), dstHeight=size(A, 2), + srcPitch=sizeof(T) * size(buf_view, 1), srcHeight=size(buf_view, 2), + async=true, stream=rocstream + ) + return nothing +end + + +##------------------------------ +## FUNCTIONS TO SEND/RECV FIELDS + +function ImplicitGlobalGrid.gpumemcopy!(dst::ROCArray{T}, src::ROCArray{T}) where T <: GGNumber + @inbounds AMDGPU.copyto!(dst, src) +end diff --git a/src/CUDAExt/defaults.jl b/src/CUDAExt/defaults.jl new file mode 100644 index 0000000..187f4c5 --- /dev/null +++ b/src/CUDAExt/defaults.jl @@ -0,0 +1,22 @@ +# shared.jl + +is_cuarray(A::GGArray) = false + + +# select_device.jl + +function nb_cudevices end +function cudevice! 
end + + +# update_halo.jl + +function free_update_halo_cubuffers end +function init_cubufs_arrays end +function init_cubufs end +function reinterpret_cubufs end +function reallocate_undersized_cubufs end +function reregister_cubufs end +function get_cusendbufs_raw end +function get_curecvbufs_raw end +function allocate_custreams end \ No newline at end of file diff --git a/src/CUDAExt/select_device.jl b/src/CUDAExt/select_device.jl new file mode 100644 index 0000000..bcffa29 --- /dev/null +++ b/src/CUDAExt/select_device.jl @@ -0,0 +1,2 @@ +ImplicitGlobalGrid.nb_cudevices() = length(CUDA.devices()) +ImplicitGlobalGrid.cudevice!(device_id) = CUDA.device!(device_id) \ No newline at end of file diff --git a/src/CUDAExt/shared.jl b/src/CUDAExt/shared.jl new file mode 100644 index 0000000..af2408b --- /dev/null +++ b/src/CUDAExt/shared.jl @@ -0,0 +1,45 @@ +import ImplicitGlobalGrid +import ImplicitGlobalGrid: GGArray, GGField, GGNumber, halosize, ol, cudaaware_MPI, sendranges, recvranges, sendbuf_flat, recvbuf_flat, write_d2x!, read_x2d!, write_d2h_async!, read_h2d_async!, register, is_cuarray +import ImplicitGlobalGrid: NNEIGHBORS_PER_DIM, GG_ALLOC_GRANULARITY +using CUDA + + +##------ +## TYPES + +const CuField{T,N} = GGField{T,N,CuArray{T,N}} + + +##------------------------------------ +## HANDLING OF CUDA AND AMDGPU SUPPORT + +ImplicitGlobalGrid.is_loaded(::Val{:ImplicitGlobalGrid_CUDAExt}) = true +ImplicitGlobalGrid.is_functional(::Val{:CUDA}) = CUDA.functional() + + +##------------- +## SYNTAX SUGAR + +ImplicitGlobalGrid.is_cuarray(A::CuArray) = true #NOTE: this function is only to be used when multiple dispatch on the type of the array seems an overkill (in particular when only something needs to be done for the GPU case, but nothing for the CPU case) and as long as performance does not suffer. + + +##-------------------------------------------------------------------------------- +## FUNCTIONS FOR WRAPPING ARRAYS AND FIELDS AND DEFINE ARRAY PROPERTY BASE METHODS + +ImplicitGlobalGrid.wrap_field(A::CuArray, hw::Tuple) = CuField{eltype(A), ndims(A)}((A, hw)) + +Base.size(A::CuField) = Base.size(A.A) +Base.size(A::CuField, args...) = Base.size(A.A, args...) +Base.length(A::CuField) = Base.length(A.A) +Base.ndims(A::CuField) = Base.ndims(A.A) +Base.eltype(A::CuField) = Base.eltype(A.A) + + +##--------------- +## CUDA functions + +function ImplicitGlobalGrid.register(::Type{<:CuArray},buf::Array{T}) where T <: GGNumber + rbuf = CUDA.Mem.register(CUDA.Mem.Host, pointer(buf), sizeof(buf), CUDA.Mem.HOSTREGISTER_DEVICEMAP); + rbuf_d = convert(CuPtr{T}, rbuf); + return unsafe_wrap(CuArray, rbuf_d, size(buf)), rbuf; +end diff --git a/src/CUDAExt/update_halo.jl b/src/CUDAExt/update_halo.jl new file mode 100644 index 0000000..27bdcf2 --- /dev/null +++ b/src/CUDAExt/update_halo.jl @@ -0,0 +1,260 @@ +##--------------------------------------- +## FUNCTIONS RELATED TO BUFFER ALLOCATION + +# NOTE: CUDA and AMDGPU buffers live and are dealt with independently, enabling the support of usage of CUDA and AMD GPUs at the same time. + +ImplicitGlobalGrid.free_update_halo_cubuffers(args...) = free_update_halo_cubuffers(args...) +ImplicitGlobalGrid.init_cubufs_arrays(args...) = init_cubufs_arrays(args...) +ImplicitGlobalGrid.init_cubufs(args...) = init_cubufs(args...) +ImplicitGlobalGrid.reinterpret_cubufs(args...) = reinterpret_cubufs(args...) +ImplicitGlobalGrid.reallocate_undersized_cubufs(args...) = reallocate_undersized_cubufs(args...) +ImplicitGlobalGrid.reregister_cubufs(args...) 
= reregister_cubufs(args...) +ImplicitGlobalGrid.get_cusendbufs_raw(args...) = get_cusendbufs_raw(args...) +ImplicitGlobalGrid.get_curecvbufs_raw(args...) = get_curecvbufs_raw(args...) +ImplicitGlobalGrid.gpusendbuf(n::Integer, dim::Integer, i::Integer, A::CuField{T}) where {T <: GGNumber} = gpusendbuf(n,dim,i,A) +ImplicitGlobalGrid.gpurecvbuf(n::Integer, dim::Integer, i::Integer, A::CuField{T}) where {T <: GGNumber} = gpurecvbuf(n,dim,i,A) +ImplicitGlobalGrid.gpusendbuf_flat(n::Integer, dim::Integer, i::Integer, A::CuField{T}) where {T <: GGNumber} = gpusendbuf_flat(n,dim,i,A) +ImplicitGlobalGrid.gpurecvbuf_flat(n::Integer, dim::Integer, i::Integer, A::CuField{T}) where {T <: GGNumber} = gpurecvbuf_flat(n,dim,i,A) + +let + global free_update_halo_cubuffers, init_cubufs_arrays, init_cubufs, reinterpret_cubufs, reregister_cubufs, reallocate_undersized_cubufs + global gpusendbuf, gpurecvbuf, gpusendbuf_flat, gpurecvbuf_flat + cusendbufs_raw = nothing + curecvbufs_raw = nothing + cusendbufs_raw_h = nothing + curecvbufs_raw_h = nothing + + function free_update_halo_cubuffers() + free_cubufs(cusendbufs_raw) + free_cubufs(curecvbufs_raw) + unregister_cubufs(cusendbufs_raw_h) + unregister_cubufs(curecvbufs_raw_h) + reset_cu_buffers() + end + + function free_cubufs(bufs) + if (bufs !== nothing) + for i = 1:length(bufs) + for n = 1:length(bufs[i]) + if is_cuarray(bufs[i][n]) CUDA.unsafe_free!(bufs[i][n]); bufs[i][n] = []; end + end + end + end + end + + function unregister_cubufs(bufs) + if (bufs !== nothing) + for i = 1:length(bufs) + for n = 1:length(bufs[i]) + if (isa(bufs[i][n],CUDA.Mem.HostBuffer)) CUDA.Mem.unregister(bufs[i][n]); bufs[i][n] = []; end + end + end + end + end + + function reset_cu_buffers() + cusendbufs_raw = nothing + curecvbufs_raw = nothing + cusendbufs_raw_h = nothing + curecvbufs_raw_h = nothing + end + + + # (CUDA functions) + + function init_cubufs_arrays() + cusendbufs_raw = Array{Array{Any,1},1}(); + curecvbufs_raw = Array{Array{Any,1},1}(); + cusendbufs_raw_h = Array{Array{Any,1},1}(); + curecvbufs_raw_h = Array{Array{Any,1},1}(); + end + + function init_cubufs(T::DataType, fields::GGField...) + while (length(cusendbufs_raw) < length(fields)) push!(cusendbufs_raw, [CuArray{T}(undef,0), CuArray{T}(undef,0)]); end + while (length(curecvbufs_raw) < length(fields)) push!(curecvbufs_raw, [CuArray{T}(undef,0), CuArray{T}(undef,0)]); end + while (length(cusendbufs_raw_h) < length(fields)) push!(cusendbufs_raw_h, [[], []]); end + while (length(curecvbufs_raw_h) < length(fields)) push!(curecvbufs_raw_h, [[], []]); end + end + + function reinterpret_cubufs(T::DataType, i::Integer, n::Integer) + if (eltype(cusendbufs_raw[i][n]) != T) cusendbufs_raw[i][n] = reinterpret(T, cusendbufs_raw[i][n]); end + if (eltype(curecvbufs_raw[i][n]) != T) curecvbufs_raw[i][n] = reinterpret(T, curecvbufs_raw[i][n]); end + end + + function reallocate_undersized_cubufs(T::DataType, i::Integer, max_halo_elems::Integer) + if (!isnothing(cusendbufs_raw) && length(cusendbufs_raw[i][1]) < max_halo_elems) + for n = 1:NNEIGHBORS_PER_DIM + reallocate_cubufs(T, i, n, max_halo_elems); GC.gc(); # Too small buffers had been replaced with larger ones; free the unused memory immediately. 
+ end + end + end + + function reallocate_cubufs(T::DataType, i::Integer, n::Integer, max_halo_elems::Integer) + cusendbufs_raw[i][n] = CUDA.zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY); # Ensure that the amount of allocated memory is a multiple of 4*sizeof(T) (sizeof(Float64)/sizeof(Float16) = 4). So, we can always correctly reinterpret the raw buffers even if next time sizeof(T) is greater. + curecvbufs_raw[i][n] = CUDA.zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY); + end + + function reregister_cubufs(T::DataType, i::Integer, n::Integer, sendbufs_raw, recvbufs_raw) + if (isa(cusendbufs_raw_h[i][n],CUDA.Mem.HostBuffer)) CUDA.Mem.unregister(cusendbufs_raw_h[i][n]); cusendbufs_raw_h[i][n] = []; end # It is always initialized registered... if (cusendbufs_raw_h[i][n].bytesize > 32*sizeof(T)) + if (isa(curecvbufs_raw_h[i][n],CUDA.Mem.HostBuffer)) CUDA.Mem.unregister(curecvbufs_raw_h[i][n]); curecvbufs_raw_h[i][n] = []; end # It is always initialized registered... if (curecvbufs_raw_h[i][n].bytesize > 32*sizeof(T)) + cusendbufs_raw[i][n], cusendbufs_raw_h[i][n] = register(CuArray,sendbufs_raw[i][n]); + curecvbufs_raw[i][n], curecvbufs_raw_h[i][n] = register(CuArray,recvbufs_raw[i][n]); + end + + + # (CUDA functions) + + function gpusendbuf_flat(n::Integer, dim::Integer, i::Integer, A::CuField{T}) where T <: GGNumber + return view(cusendbufs_raw[i][n]::CuVector{T},1:prod(halosize(dim,A))); + end + + function gpurecvbuf_flat(n::Integer, dim::Integer, i::Integer, A::CuField{T}) where T <: GGNumber + return view(curecvbufs_raw[i][n]::CuVector{T},1:prod(halosize(dim,A))); + end + + + # (GPU functions) + + #TODO: see if remove T here and in other cases for CuArray, ROCArray or Array (but then it does not verify that CuArray/ROCArray is of type GGNumber) or if I should instead change GGArray to GGArrayUnion and create: GGArray = Array{T} where T <: GGNumber and GGCuArray = CuArray{T} where T <: GGNumber; This is however more difficult to read and understand for others. + function gpusendbuf(n::Integer, dim::Integer, i::Integer, A::CuField{T}) where T <: GGNumber + return reshape(gpusendbuf_flat(n,dim,i,A), halosize(dim,A)); + end + + function gpurecvbuf(n::Integer, dim::Integer, i::Integer, A::CuField{T}) where T <: GGNumber + return reshape(gpurecvbuf_flat(n,dim,i,A), halosize(dim,A)); + end + + + # Make sendbufs_raw and recvbufs_raw accessible for unit testing. + global get_cusendbufs_raw, get_curecvbufs_raw + get_cusendbufs_raw() = deepcopy(cusendbufs_raw) + get_curecvbufs_raw() = deepcopy(curecvbufs_raw) +end + + +##---------------------------------------------- +## FUNCTIONS TO WRITE AND READ SEND/RECV BUFFERS + +function ImplicitGlobalGrid.allocate_custreams(fields::GGField...) 
+ allocate_custreams_iwrite(fields...); + allocate_custreams_iread(fields...); +end + +ImplicitGlobalGrid.iwrite_sendbufs!(n::Integer, dim::Integer, F::CuField{T}, i::Integer) where {T <: GGNumber} = iwrite_sendbufs!(n,dim,F,i) +ImplicitGlobalGrid.iread_recvbufs!(n::Integer, dim::Integer, F::CuField{T}, i::Integer) where {T <: GGNumber} = iread_recvbufs!(n,dim,F,i) +ImplicitGlobalGrid.wait_iwrite(n::Integer, A::CuField{T}, i::Integer) where {T <: GGNumber} = wait_iwrite(n,A,i) +ImplicitGlobalGrid.wait_iread(n::Integer, A::CuField{T}, i::Integer) where {T <: GGNumber} = wait_iread(n,A,i) + +let + global iwrite_sendbufs!, allocate_custreams_iwrite, wait_iwrite + + custreams = Array{CuStream}(undef, NNEIGHBORS_PER_DIM, 0) + + wait_iwrite(n::Integer, A::CuField{T}, i::Integer) where T <: GGNumber = CUDA.synchronize(custreams[n,i]); + + function allocate_custreams_iwrite(fields::GGField...) + if length(fields) > size(custreams,2) # Note: for simplicity, we create a stream for every field even if it is not a CuField + custreams = [custreams [CuStream(; flags=CUDA.STREAM_NON_BLOCKING, priority=CUDA.priority_range()[end]) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(custreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. + end + end + + function iwrite_sendbufs!(n::Integer, dim::Integer, F::CuField{T}, i::Integer) where T <: GGNumber + A, halowidths = F; + if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... + if dim == 1 || cudaaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). + ranges = sendranges(n, dim, F); + nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1); + halosize = [r[end] - r[1] + 1 for r in ranges]; + nblocks = Tuple(ceil.(Int, halosize./nthreads)); + @cuda blocks=nblocks threads=nthreads stream=custreams[n,i] write_d2x!(gpusendbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); + else + write_d2h_async!(sendbuf_flat(n,dim,i,F), A, sendranges(n,dim,F), custreams[n,i]); + end + end + end +end + +let + global iread_recvbufs!, allocate_custreams_iread, wait_iread + + custreams = Array{CuStream}(undef, NNEIGHBORS_PER_DIM, 0) + + wait_iread(n::Integer, A::CuField{T}, i::Integer) where T <: GGNumber = CUDA.synchronize(custreams[n,i]); + + function allocate_custreams_iread(fields::GGField...) + if length(fields) > size(custreams,2) # Note: for simplicity, we create a stream for every field even if it is not a CuField + custreams = [custreams [CuStream(; flags=CUDA.STREAM_NON_BLOCKING, priority=CUDA.priority_range()[end]) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(custreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. + end + end + + function iread_recvbufs!(n::Integer, dim::Integer, F::CuField{T}, i::Integer) where T <: GGNumber + A, halowidths = F; + if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... + if dim == 1 || cudaaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). + ranges = recvranges(n, dim, F); + nthreads = (dim==1) ? 
(1, 32, 1) : (32, 1, 1); + halosize = [r[end] - r[1] + 1 for r in ranges]; + nblocks = Tuple(ceil.(Int, halosize./nthreads)); + @cuda blocks=nblocks threads=nthreads stream=custreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); + else + read_h2d_async!(recvbuf_flat(n,dim,i,F), A, recvranges(n,dim,F), custreams[n,i]); + end + end + end +end + + +# (CUDA functions) + +# Write to the send buffer on the host or device from the array on the device (d2x). +function ImplicitGlobalGrid.write_d2x!(gpusendbuf::CuDeviceArray{T}, A::CuDeviceArray{T}, sendrangex::UnitRange{Int64}, sendrangey::UnitRange{Int64}, sendrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber + ix = (CUDA.blockIdx().x-1) * CUDA.blockDim().x + CUDA.threadIdx().x + sendrangex[1] - 1 + iy = (CUDA.blockIdx().y-1) * CUDA.blockDim().y + CUDA.threadIdx().y + sendrangey[1] - 1 + iz = (CUDA.blockIdx().z-1) * CUDA.blockDim().z + CUDA.threadIdx().z + sendrangez[1] - 1 + if !(ix in sendrangex && iy in sendrangey && iz in sendrangez) return nothing; end + gpusendbuf[ix-(sendrangex[1]-1),iy-(sendrangey[1]-1),iz-(sendrangez[1]-1)] = A[ix,iy,iz]; + return nothing +end + +# Read from the receive buffer on the host or device and store on the array on the device (x2d). +function ImplicitGlobalGrid.read_x2d!(gpurecvbuf::CuDeviceArray{T}, A::CuDeviceArray{T}, recvrangex::UnitRange{Int64}, recvrangey::UnitRange{Int64}, recvrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber + ix = (CUDA.blockIdx().x-1) * CUDA.blockDim().x + CUDA.threadIdx().x + recvrangex[1] - 1 + iy = (CUDA.blockIdx().y-1) * CUDA.blockDim().y + CUDA.threadIdx().y + recvrangey[1] - 1 + iz = (CUDA.blockIdx().z-1) * CUDA.blockDim().z + CUDA.threadIdx().z + recvrangez[1] - 1 + if !(ix in recvrangex && iy in recvrangey && iz in recvrangez) return nothing; end + A[ix,iy,iz] = gpurecvbuf[ix-(recvrangex[1]-1),iy-(recvrangey[1]-1),iz-(recvrangez[1]-1)]; + return nothing +end + +# Write to the send buffer on the host from the array on the device (d2h). +function ImplicitGlobalGrid.write_d2h_async!(sendbuf::AbstractArray{T}, A::CuArray{T}, sendranges::Array{UnitRange{T2},1}, custream::CuStream) where T <: GGNumber where T2 <: Integer + CUDA.Mem.unsafe_copy3d!( + pointer(sendbuf), CUDA.Mem.Host, pointer(A), CUDA.Mem.Device, + length(sendranges[1]), length(sendranges[2]), length(sendranges[3]); + srcPos=(sendranges[1][1], sendranges[2][1], sendranges[3][1]), + srcPitch=sizeof(T)*size(A,1), srcHeight=size(A,2), + dstPitch=sizeof(T)*length(sendranges[1]), dstHeight=length(sendranges[2]), + async=true, stream=custream + ) +end + +# Read from the receive buffer on the host and store on the array on the device (h2d). 
+function ImplicitGlobalGrid.read_h2d_async!(recvbuf::AbstractArray{T}, A::CuArray{T}, recvranges::Array{UnitRange{T2},1}, custream::CuStream) where T <: GGNumber where T2 <: Integer + CUDA.Mem.unsafe_copy3d!( + pointer(A), CUDA.Mem.Device, pointer(recvbuf), CUDA.Mem.Host, + length(recvranges[1]), length(recvranges[2]), length(recvranges[3]); + dstPos=(recvranges[1][1], recvranges[2][1], recvranges[3][1]), + srcPitch=sizeof(T)*length(recvranges[1]), srcHeight=length(recvranges[2]), + dstPitch=sizeof(T)*size(A,1), dstHeight=size(A,2), + async=true, stream=custream + ) +end + + +##------------------------------ +## FUNCTIONS TO SEND/RECV FIELDS + +function ImplicitGlobalGrid.gpumemcopy!(dst::CuArray{T}, src::CuArray{T}) where T <: GGNumber + @inbounds CUDA.copyto!(dst, src) +end + diff --git a/src/ImplicitGlobalGrid.jl b/src/ImplicitGlobalGrid.jl index 628d799..d844f45 100644 --- a/src/ImplicitGlobalGrid.jl +++ b/src/ImplicitGlobalGrid.jl @@ -23,6 +23,9 @@ https://github.com/eth-cscs/ImplicitGlobalGrid.jl To see a description of a function type `?`. +!!! note "Activation of device support" + The support for a device type (CUDA or AMDGPU) is activated by importing the corresponding module (CUDA or AMDGPU) before importing ImplicitGlobalGrid (the corresponding extension will be loaded). + !!! note "Performance note" If the system supports CUDA-aware MPI (for Nvidia GPUs) or ROCm-aware MPI (for AMD GPUs), it may be activated for ImplicitGlobalGrid by setting one of the following environment variables (at latest before the call to `init_global_grid`): ```shell @@ -42,6 +45,9 @@ using .Exceptions include("shared.jl") ## Alphabetical include of defaults for extensions +include("defaults_shared.jl") +include(joinpath("AMDGPUExt", "defaults.jl")) +include(joinpath("CUDAExt", "defaults.jl")) include(joinpath("LoopVectorizationExt", "memcopy_LV_default.jl")) ## Alphabetical include of files diff --git a/src/defaults_shared.jl b/src/defaults_shared.jl new file mode 100644 index 0000000..97a58e8 --- /dev/null +++ b/src/defaults_shared.jl @@ -0,0 +1,20 @@ +# shared.jl + +is_loaded(arg) = false #TODO: this would not work as it should be the caller module...: (Base.get_extension(@__MODULE__, ext) !== nothing) +is_functional(arg) = false +function register end + + +# update_halo.jl + +function gpusendbuf end +function gpurecvbuf end +function gpusendbuf_flat end +function gpurecvbuf_flat end + +function write_d2x! end +function read_x2d! end +function write_d2h_async! end +function read_h2d_async! end + +function gpumemcopy! end diff --git a/src/init_global_grid.jl b/src/init_global_grid.jl index 62656cc..3be1986 100644 --- a/src/init_global_grid.jl +++ b/src/init_global_grid.jl @@ -18,8 +18,8 @@ Initialize a Cartesian grid of MPI processes (and also MPI itself by default) de - `reorder::Integer=1`: the reorder argument to `MPI.Cart_create` in order to create the Cartesian process topology. - `comm::MPI.Comm=MPI.COMM_WORLD`: the input communicator argument to `MPI.Cart_create` in order to create the Cartesian process topology. - `init_MPI::Bool=true`: whether to initialize MPI (`true`) or not (`false`). - - `device_type::String="auto"`: the type of the device to be used if available: `"CUDA"`, `"AMDGPU"`, `"none"` or `"auto"`. Set `device_type="none"` if you want to use only CPUs on a system having also GPUs. 
If `device_type` is `"auto"` (default), it is automatically determined, depending on which of the modules used for programming the devices (CUDA.jl or AMDGPU.jl) is functional; if both are functional, an error will be given if `device_type` is set as `"auto"`. - - `select_device::Bool=true`: whether to automatically select the device (GPU) (`true`) or not (`false`) if CUDA or AMDGPU is functional and `device_type` not `"none"`. If `true`, it selects the device corresponding to the node-local MPI rank. This method of device selection suits both single and multi-device compute nodes and is recommended in general. It is also the default method of device selection of the *function* [`select_device`](@ref). + - `device_type::String="auto"`: the type of the device to be used if available: `"CUDA"`, `"AMDGPU"`, `"none"` or `"auto"`. Set `device_type="none"` if you want to use only CPUs on a system having also GPUs. If `device_type` is `"auto"` (default), it is automatically determined, depending on which of the modules used for programming the devices (CUDA.jl or AMDGPU.jl) was imported before ImplicitGlobalGrid; if both were imported, an error will be given if `device_type` is set as `"auto"`. + - `select_device::Bool=true`: whether to automatically select the device (GPU) (`true`) or not (`false`) if CUDA or AMDGPU was imported and `device_type` is not `"none"`. If `true`, it selects the device corresponding to the node-local MPI rank. This method of device selection suits both single and multi-device compute nodes and is recommended in general. It is also the default method of device selection of the *function* [`select_device`](@ref). For more information, refer to the documentation of MPI.jl / MPI. # Return values @@ -40,6 +40,10 @@ See also: [`finalize_global_grid`](@ref), [`select_device`](@ref) """ function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0, dimy::Integer=0, dimz::Integer=0, periodx::Integer=0, periody::Integer=0, periodz::Integer=0, overlaps::Tuple{Int,Int,Int}=(2,2,2), halowidths::Tuple{Int,Int,Int}=max.(1,overlaps.÷2), disp::Integer=1, reorder::Integer=1, comm::MPI.Comm=MPI.COMM_WORLD, init_MPI::Bool=true, device_type::String=DEVICE_TYPE_AUTO, select_device::Bool=true, quiet::Bool=false) if grid_is_initialized() error("The global grid has already been initialized.") end + set_cuda_loaded() + set_cuda_functional() + set_amdgpu_loaded() + set_amdgpu_functional() nxyz = [nx, ny, nz]; dims = [dimx, dimy, dimz]; periods = [periodx, periody, periodz]; @@ -69,10 +73,10 @@ function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0 if haskey(ENV, "IGG_LOOPVECTORIZATION_DIMZ") loopvectorization[3] = (parse(Int64, ENV["IGG_LOOPVECTORIZATION_DIMZ"]) > 0); end end if !(device_type in [DEVICE_TYPE_NONE, DEVICE_TYPE_AUTO, DEVICE_TYPE_CUDA, DEVICE_TYPE_AMDGPU]) error("Argument `device_type`: invalid value obtained ($device_type). Valid values are: $DEVICE_TYPE_CUDA, $DEVICE_TYPE_AMDGPU, $DEVICE_TYPE_NONE, $DEVICE_TYPE_AUTO") end - if ((device_type == DEVICE_TYPE_AUTO) && cuda_functional() && amdgpu_functional()) error("Automatic detection of the device type to be used not possible: both CUDA and AMDGPU are functional. 
Set keyword argument `device_type` to $DEVICE_TYPE_CUDA or $DEVICE_TYPE_AMDGPU.") end + if ((device_type == DEVICE_TYPE_AUTO) && cuda_loaded() && cuda_functional() && amdgpu_loaded() && amdgpu_functional()) error("Automatic detection of the device type to be used not possible: both CUDA and AMDGPU extensions are loaded and functional. Set keyword argument `device_type` to $DEVICE_TYPE_CUDA or $DEVICE_TYPE_AMDGPU.") end if (device_type != DEVICE_TYPE_NONE) - if (device_type in [DEVICE_TYPE_CUDA, DEVICE_TYPE_AUTO]) cuda_enabled = cuda_functional() end # NOTE: cuda could be enabled/disabled depending on some additional criteria. - if (device_type in [DEVICE_TYPE_AMDGPU, DEVICE_TYPE_AUTO]) amdgpu_enabled = amdgpu_functional() end # NOTE: amdgpu could be enabled/disabled depending on some additional criteria. + if (device_type in [DEVICE_TYPE_CUDA, DEVICE_TYPE_AUTO]) cuda_enabled = cuda_loaded() && cuda_functional() end # NOTE: cuda could be enabled/disabled depending on some additional criteria. + if (device_type in [DEVICE_TYPE_AMDGPU, DEVICE_TYPE_AUTO]) amdgpu_enabled = amdgpu_loaded() && amdgpu_functional() end # NOTE: amdgpu could be enabled/disabled depending on some additional criteria. end if (any(nxyz .< 1)) error("Invalid arguments: nx, ny, and nz cannot be less than 1."); end if (any(dims .< 0)) error("Invalid arguments: dimx, dimy, and dimz cannot be negative."); end @@ -101,7 +105,11 @@ function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0 end nxyz_g = dims.*(nxyz.-overlaps) .+ overlaps.*(periods.==0); # E.g. for dimension x with ol=2 and periodx=0: dimx*(nx-2)+2 set_global_grid(GlobalGrid(nxyz_g, nxyz, dims, overlaps, halowidths, nprocs, me, coords, neighbors, periods, disp, reorder, comm_cart, cuda_enabled, amdgpu_enabled, cudaaware_MPI, amdgpuaware_MPI, loopvectorization, quiet)); - if (!quiet && me==0) println("Global grid: $(nxyz_g[1])x$(nxyz_g[2])x$(nxyz_g[3]) (nprocs: $nprocs, dims: $(dims[1])x$(dims[2])x$(dims[3]))"); end + cuda_support_string = (cuda_enabled && all(cudaaware_MPI)) ? "CUDA-aware" : (cuda_enabled && any(cudaaware_MPI)) ? "CUDA(-aware)" : (cuda_enabled) ? "CUDA" : ""; + amdgpu_support_string = (amdgpu_enabled && all(amdgpuaware_MPI)) ? "AMDGPU-aware" : (amdgpu_enabled && any(amdgpuaware_MPI)) ? "AMDGPU(-aware)" : (amdgpu_enabled) ? "AMDGPU" : ""; + gpu_support_string = join(filter(!isempty, [cuda_support_string, amdgpu_support_string]), ", "); + support_string = isempty(gpu_support_string) ? "none" : gpu_support_string; + if (!quiet && me==0) println("Global grid: $(nxyz_g[1])x$(nxyz_g[2])x$(nxyz_g[3]) (nprocs: $nprocs, dims: $(dims[1])x$(dims[2])x$(dims[3]); device support: $support_string)"); end if ((cuda_enabled || amdgpu_enabled) && select_device) _select_device() end init_timing_functions(); return me, dims, nprocs, coords, comm_cart; # The typical use case requires only these variables; the remaining can be obtained calling get_global_grid() if needed. 
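As an illustration of the new `device support` summary printed by `init_global_grid` in the hunk above, here is a standalone sketch with hypothetical flag and grid values (in the package these are set internally from the enabled/aware state):

```julia
# Illustration only (hypothetical values): CUDA extension loaded, CUDA-aware MPI disabled.
cuda_enabled,   cudaaware_MPI   = true,  (false, false, false)
amdgpu_enabled, amdgpuaware_MPI = false, (false, false, false)

cuda_support_string   = (cuda_enabled   && all(cudaaware_MPI))   ? "CUDA-aware"   : (cuda_enabled   && any(cudaaware_MPI))   ? "CUDA(-aware)"   : (cuda_enabled)   ? "CUDA"   : ""
amdgpu_support_string = (amdgpu_enabled && all(amdgpuaware_MPI)) ? "AMDGPU-aware" : (amdgpu_enabled && any(amdgpuaware_MPI)) ? "AMDGPU(-aware)" : (amdgpu_enabled) ? "AMDGPU" : ""
gpu_support_string    = join(filter(!isempty, [cuda_support_string, amdgpu_support_string]), ", ")
support_string        = isempty(gpu_support_string) ? "none" : gpu_support_string  # -> "CUDA"

println("Global grid: 128x128x128 (nprocs: 8, dims: 2x2x2; device support: $support_string)")  # same format as in the hunk above
```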
diff --git a/src/select_device.jl b/src/select_device.jl index a571c7e..5df62cf 100644 --- a/src/select_device.jl +++ b/src/select_device.jl @@ -13,25 +13,26 @@ Select the device (GPU) corresponding to the node-local MPI rank and return its See also: [`init_global_grid`](@ref) """ function select_device() + check_initialized() + if (cuda_enabled() && amdgpu_enabled()) error("Cannot select a device because both CUDA and AMDGPU are enabled (meaning that both modules were imported before ImplicitGlobalGrid).") end if cuda_enabled() || amdgpu_enabled() - check_initialized(); if cuda_enabled() - @assert CUDA.functional(true) - nb_devices = length(CUDA.devices()) + @assert cuda_functional() + nb_devices = nb_cudevices() elseif amdgpu_enabled() - @assert AMDGPU.functional() - nb_devices = length(AMDGPU.devices()) + @assert amdgpu_functional() + nb_devices = nb_rocdevices() end comm_l = MPI.Comm_split_type(comm(), MPI.COMM_TYPE_SHARED, me()) if (MPI.Comm_size(comm_l) > nb_devices) error("More processes have been launched per node than there are GPUs available."); end me_l = MPI.Comm_rank(comm_l) device_id = amdgpu_enabled() ? me_l+1 : me_l - if cuda_enabled() CUDA.device!(device_id) - elseif amdgpu_enabled() AMDGPU.device_id!(device_id) + if cuda_enabled() cudevice!(device_id) + elseif amdgpu_enabled() rocdevice!(device_id) end return device_id else - error("Cannot select a device because neither CUDA nor AMDGPU is enabled (possibly detected non functional when the ImplicitGlobalGrid module was loaded).") + error("Cannot select a device because neither CUDA nor AMDGPU is enabled (meaning that the corresponding module was not imported before ImplicitGlobalGrid).") end end diff --git a/src/shared.jl b/src/shared.jl index b9a9907..b2bfad3 100644 --- a/src/shared.jl +++ b/src/shared.jl @@ -1,24 +1,24 @@ import MPI -using CUDA -using AMDGPU using Base.Threads -##------------------------- +##------------------------------------ ## HANDLING OF CUDA AND AMDGPU SUPPORT -let - global cuda_functional, amdgpu_functional, set_cuda_functional, set_amdgpu_functional - _cuda_functional::Bool = false - _amdgpu_functional::Bool = false - cuda_functional()::Bool = _cuda_functional - amdgpu_functional()::Bool = _amdgpu_functional - set_cuda_functional(val::Bool) = (_cuda_functional = val;) - set_amdgpu_functional(val::Bool) = (_amdgpu_functional = val;) -end -function __init__() - set_cuda_functional(CUDA.functional()) - set_amdgpu_functional(AMDGPU.functional()) +let + global cuda_loaded, cuda_functional, amdgpu_loaded, amdgpu_functional, set_cuda_loaded, set_cuda_functional, set_amdgpu_loaded, set_amdgpu_functional + _cuda_loaded::Bool = false + _cuda_functional::Bool = false + _amdgpu_loaded::Bool = false + _amdgpu_functional::Bool = false + cuda_loaded()::Bool = _cuda_loaded + cuda_functional()::Bool = _cuda_functional + amdgpu_loaded()::Bool = _amdgpu_loaded + amdgpu_functional()::Bool = _amdgpu_functional + set_cuda_loaded() = (_cuda_loaded = is_loaded(Val(:ImplicitGlobalGrid_CUDAExt))) + set_cuda_functional() = (_cuda_functional = is_functional(Val(:CUDA))) + set_amdgpu_loaded() = (_amdgpu_loaded = is_loaded(Val(:ImplicitGlobalGrid_AMDGPUExt))) + set_amdgpu_functional() = (_amdgpu_functional = is_functional(Val(:AMDGPU))) end @@ -33,6 +33,7 @@ const DEVICE_TYPE_NONE = "none" const DEVICE_TYPE_AUTO = "auto" const DEVICE_TYPE_CUDA = "CUDA" const DEVICE_TYPE_AMDGPU = "AMDGPU" +const SUPPORTED_DEVICE_TYPES = [DEVICE_TYPE_CUDA, DEVICE_TYPE_AMDGPU] ##------ @@ -40,13 +41,11 @@ const DEVICE_TYPE_AMDGPU = "AMDGPU" 
const GGInt = Cint const GGNumber = Number -const GGArray{T,N} = Union{Array{T,N}, CuArray{T,N}, ROCArray{T,N}} +const GGArray{T,N} = DenseArray{T,N} # TODO: was Union{Array{T,N}, CuArray{T,N}, ROCArray{T,N}} const GGField{T,N,T_array} = NamedTuple{(:A, :halowidths), Tuple{T_array, Tuple{GGInt,GGInt,GGInt}}} where {T_array<:GGArray{T,N}} const GGFieldConvertible{T,N,T_array} = NamedTuple{(:A, :halowidths), <:Tuple{T_array, Tuple{T2,T2,T2}}} where {T_array<:GGArray{T,N}, T2<:Integer} const GGField{}(t::NamedTuple) = GGField{eltype(t.A),ndims(t.A),typeof(t.A)}((t.A, GGInt.(t.halowidths))) const CPUField{T,N} = GGField{T,N,Array{T,N}} -const CuField{T,N} = GGField{T,N,CuArray{T,N}} -const ROCField{T,N} = GGField{T,N,ROCArray{T,N}} "An GlobalGrid struct contains information on the grid and the corresponding MPI communicator." # Note: type GlobalGrid is immutable, i.e. users can only read, but not modify it (except the actual entries of arrays can be modified, e.g. dims .= dims - useful for writing tests) struct GlobalGrid @@ -115,9 +114,10 @@ has_neighbor(n::Integer, dim::Integer) = neighbor(n, dim) != MPI.PROC_NULL any_array(fields::GGField...) = any([is_array(A.A) for A in fields]) any_cuarray(fields::GGField...) = any([is_cuarray(A.A) for A in fields]) any_rocarray(fields::GGField...) = any([is_rocarray(A.A) for A in fields]) +all_arrays(fields::GGField...) = all([is_array(A.A) for A in fields]) +all_cuarrays(fields::GGField...) = all([is_cuarray(A.A) for A in fields]) +all_rocarrays(fields::GGField...) = all([is_rocarray(A.A) for A in fields]) is_array(A::GGArray) = typeof(A) <: Array -is_cuarray(A::GGArray) = typeof(A) <: CuArray #NOTE: this function is only to be used when multiple dispatch on the type of the array seems an overkill (in particular when only something needs to be done for the GPU case, but nothing for the CPU case) and as long as performance does not suffer. -is_rocarray(A::GGArray) = typeof(A) <: ROCArray #NOTE: this function is only to be used when multiple dispatch on the type of the array seems an overkill (in particular when only something needs to be done for the GPU case, but nothing for the CPU case) and as long as performance does not suffer. ##-------------------------------------------------------------------------------- @@ -126,31 +126,16 @@ is_rocarray(A::GGArray) = typeof(A) <: ROCArray #NOTE: this func wrap_field(A::GGField) = A wrap_field(A::GGFieldConvertible) = GGField(A) wrap_field(A::Array, hw::Tuple) = CPUField{eltype(A), ndims(A)}((A, hw)) -wrap_field(A::CuArray, hw::Tuple) = CuField{eltype(A), ndims(A)}((A, hw)) -wrap_field(A::ROCArray, hw::Tuple) = ROCField{eltype(A), ndims(A)}((A, hw)) wrap_field(A::GGArray, hw::Integer...) = wrap_field(A, hw) wrap_field(A::GGArray) = wrap_field(A, hw_default()...) -Base.size(A::Union{GGField, CPUField, CuField, ROCField}) = Base.size(A.A) -Base.size(A::Union{GGField, CPUField, CuField, ROCField}, args...) = Base.size(A.A, args...) -Base.length(A::Union{GGField, CPUField, CuField, ROCField}) = Base.length(A.A) -Base.ndims(A::Union{GGField, CPUField, CuField, ROCField}) = Base.ndims(A.A) -Base.eltype(A::Union{GGField, CPUField, CuField, ROCField}) = Base.eltype(A.A) +Base.size(A::Union{GGField, CPUField}) = Base.size(A.A) +Base.size(A::Union{GGField, CPUField}, args...) = Base.size(A.A, args...) 
+Base.length(A::Union{GGField, CPUField}) = Base.length(A.A)
+Base.ndims(A::Union{GGField, CPUField}) = Base.ndims(A.A)
+Base.eltype(A::Union{GGField, CPUField}) = Base.eltype(A.A)
-##---------------
-## CUDA functions
-
-function register(::Type{<:CuArray},buf::Array{T}) where T <: GGNumber
-    rbuf = CUDA.Mem.register(CUDA.Mem.Host, pointer(buf), sizeof(buf), CUDA.Mem.HOSTREGISTER_DEVICEMAP);
-    rbuf_d = convert(CuPtr{T}, rbuf);
-    return unsafe_wrap(CuArray, rbuf_d, size(buf)), rbuf;
-end
-
-
-##---------------
-## AMDGPU functions
-
-function register(::Type{<:ROCArray},buf::Array{T}) where T <: GGNumber
-    return unsafe_wrap(ROCArray, pointer(buf), size(buf))
-end
+##------------------------------------------
+## CUDA AND AMDGPU COMMON EXTENSION DEFAULTS
+# TODO: this should not be required as only called from the extensions  #function register end
\ No newline at end of file
diff --git a/src/update_halo.jl b/src/update_halo.jl
index 4be9d87..7661ae6 100644
--- a/src/update_halo.jl
+++ b/src/update_halo.jl
@@ -35,8 +35,7 @@ function update_halo!(A::Union{GGArray, GGField, GGFieldConvertible}...)
 end
 
 function _update_halo!(fields::GGField...)
-    if (any_cuarray(fields...) && !cuda_enabled()) error("CUDA is not enabled (possibly detected non functional when the ImplicitGlobalGrid module was loaded)."); end #NOTE: in the following, it is only required to check for `cuda_enabled()` when the context does not imply `any_cuarray(fields...)` or `is_cuarray(A)`.
-    if (any_rocarray(fields...) && !amdgpu_enabled()) error("AMDGPU is not enabled (possibly detected non functional when the ImplicitGlobalGrid module was loaded)."); end #NOTE: in the following, it is only required to check for `amdgpu_enabled()` when the context does not imply `any_rocarray(fields...)` or `is_rocarray(A)`.
+    if (!cuda_enabled() && !amdgpu_enabled() && !all_arrays(fields...)) error("not all arrays are CPU arrays, but no GPU extension is loaded.") end #NOTE: in the following, it is only required to check for `cuda_enabled()`/`amdgpu_enabled()` when the context does not imply `any_cuarray(fields...)` or `is_cuarray(A)` or the corresponding for AMDGPU. # NOTE: the case where only one of the two extensions is loaded, but an array that would be for the other extension is passed, is very unlikely and therefore not explicitly checked here (but could be added later).
     allocate_bufs(fields...);
     if any_array(fields...) allocate_tasks(fields...); end
     if any_cuarray(fields...) allocate_custreams(fields...); end
@@ -95,60 +94,25 @@ halosize(dim::Integer, A::GGField) = (dim==1) ? (A.halowidths[1], size(A,2), siz
 # NOTE: CUDA and AMDGPU buffers live and are dealt with independently, enabling the support of usage of CUDA and AMD GPUs at the same time.
let - global free_update_halo_buffers, allocate_bufs, sendbuf, recvbuf, sendbuf_flat, recvbuf_flat, gpusendbuf, gpurecvbuf, gpusendbuf_flat, gpurecvbuf_flat, rocsendbuf, rocrecvbuf, rocsendbuf_flat, rocrecvbuf_flat + #TODO: this was: global free_update_halo_buffers, allocate_bufs, sendbuf, recvbuf, sendbuf_flat, recvbuf_flat, gpusendbuf, gpurecvbuf, gpusendbuf_flat, gpurecvbuf_flat, rocsendbuf, rocrecvbuf, rocsendbuf_flat, rocrecvbuf_flat + global free_update_halo_buffers, allocate_bufs, sendbuf, recvbuf, sendbuf_flat, recvbuf_flat sendbufs_raw = nothing recvbufs_raw = nothing - cusendbufs_raw = nothing - curecvbufs_raw = nothing - cusendbufs_raw_h = nothing - curecvbufs_raw_h = nothing - rocsendbufs_raw = nothing - rocrecvbufs_raw = nothing - # INFO: no need for roc host buffers function free_update_halo_buffers() - if (cuda_enabled() && any(cudaaware_MPI())) free_gpubufs(cusendbufs_raw) end - if (cuda_enabled() && any(cudaaware_MPI())) free_gpubufs(curecvbufs_raw) end - if (cuda_enabled() && none(cudaaware_MPI())) unregister_gpubufs(cusendbufs_raw_h) end - if (cuda_enabled() && none(cudaaware_MPI())) unregister_gpubufs(curecvbufs_raw_h) end - if (amdgpu_enabled() && any(amdgpuaware_MPI())) free_gpubufs(rocsendbufs_raw) end - if (amdgpu_enabled() && any(amdgpuaware_MPI())) free_gpubufs(rocrecvbufs_raw) end - # INFO: no need to unregister roc host buffers - sendbufs_raw = nothing - recvbufs_raw = nothing - cusendbufs_raw = nothing - curecvbufs_raw = nothing - cusendbufs_raw_h = nothing - curecvbufs_raw_h = nothing - rocsendbufs_raw = nothing - rocrecvbufs_raw = nothing - # INFO: no need for roc host buffers - GC.gc() + free_update_halo_cpubuffers() + if (cuda_enabled() && none(cudaaware_MPI())) free_update_halo_cubuffers() end + if (amdgpu_enabled() && none(amdgpuaware_MPI())) free_update_halo_rocbuffers() end + GC.gc() #TODO: see how to modify this! end - - # (CUDA, AMDGPU functions) - - function free_gpubufs(bufs) - if (bufs !== nothing) - for i = 1:length(bufs) - for n = 1:length(bufs[i]) - if is_cuarray(bufs[i][n]) CUDA.unsafe_free!(bufs[i][n]); bufs[i][n] = []; end - if is_rocarray(bufs[i][n]) AMDGPU.unsafe_free!(bufs[i][n]); bufs[i][n] = []; end # DEBUG: unsafe_free should be managed in AMDGPU - end - end - end + function free_update_halo_cpubuffers() + reset_cpu_buffers(); end - function unregister_gpubufs(bufs) - if (bufs !== nothing) - for i = 1:length(bufs) - for n = 1:length(bufs[i]) - if (isa(bufs[i][n],CUDA.Mem.HostBuffer)) CUDA.Mem.unregister(bufs[i][n]); bufs[i][n] = []; end - # INFO: no need for roc host buffers - end - end - end + function reset_cpu_buffers() + sendbufs_raw = nothing + recvbufs_raw = nothing end # Allocate for each field two send and recv buffers (one for the left and one for the right neighbour of a dimension). The required length of the buffer is given by the maximal number of halo elements in any of the dimensions. Note that buffers are not allocated separately for each dimension, as the updates are performed one dimension at a time (required for correctness). 
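The allocation strategy described in the comment above leans on a granularity trick that appears in the reallocation code below: raw buffers are always sized in multiples of `GG_ALLOC_GRANULARITY` elements, so a buffer allocated for one element type can later be reinterpreted as a wider one without a length mismatch. A standalone sketch (constant and function names are illustrative):

```julia
const SKETCH_GRANULARITY = 4                  # corresponds to sizeof(Float64) / sizeof(Float16)

# Round a requested element count up to the next multiple of the granularity.
rounded_len(nelems) = Int(ceil(nelems / SKETCH_GRANULARITY)) * SKETCH_GRANULARITY

buf   = zeros(Float16, rounded_len(10))       # 12 Float16 elements = 24 bytes
buf64 = reinterpret(Float64, buf)             # 3 Float64 elements, no leftover bytes
@assert length(buf) % SKETCH_GRANULARITY == 0
@assert length(buf64) == length(buf) * sizeof(Float16) ÷ sizeof(Float64)
```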
@@ -170,24 +134,9 @@ let if amdgpu_enabled() reinterpret_rocbufs(T, i, n); end end max_halo_elems = maximum((size(A,1)*size(A,2)*halowidths[3], size(A,1)*size(A,3)*halowidths[2], size(A,2)*size(A,3)*halowidths[1])); - if (length(sendbufs_raw[i][1]) < max_halo_elems) - for n = 1:NNEIGHBORS_PER_DIM - reallocate_bufs(T, i, n, max_halo_elems); - if (is_cuarray(A) && none(cudaaware_MPI())) reregister_cubufs(T, i, n); end # Host memory is page-locked (and mapped to device memory) to ensure optimal access performance (from kernel or with 3-D memcopy). - if (is_rocarray(A) && none(amdgpuaware_MPI())) reregister_rocbufs(T, i, n); end # ... - end - GC.gc(); # Too small buffers had been replaced with larger ones; free the now unused memory. - end - if (!isnothing(cusendbufs_raw) && length(cusendbufs_raw[i][1]) < max_halo_elems) - for n = 1:NNEIGHBORS_PER_DIM - if (is_cuarray(A) && any(cudaaware_MPI())) reallocate_cubufs(T, i, n, max_halo_elems); GC.gc(); end # Too small buffers had been replaced with larger ones; free the unused memory immediately. - end - end - if (!isnothing(rocsendbufs_raw) && length(rocsendbufs_raw[i][1]) < max_halo_elems) - for n = 1:NNEIGHBORS_PER_DIM - if (is_rocarray(A) && any(amdgpuaware_MPI())) reallocate_rocbufs(T, i, n, max_halo_elems); GC.gc(); end # Too small buffers had been replaced with larger ones; free the unused memory immediately. - end - end + reallocate_undersized_hostbufs(T, i, max_halo_elems, A); + if (is_cuarray(A) && any(cudaaware_MPI())) reallocate_undersized_cubufs(T, i, max_halo_elems) end + if (is_rocarray(A) && any(amdgpuaware_MPI())) reallocate_undersized_rocbufs(T, i, max_halo_elems) end end end @@ -209,77 +158,23 @@ let if (eltype(recvbufs_raw[i][n]) != T) recvbufs_raw[i][n] = reinterpret(T, recvbufs_raw[i][n]); end end + function reallocate_undersized_hostbufs(T::DataType, i::Integer, max_halo_elems::Integer, A::GGArray) + if (length(sendbufs_raw[i][1]) < max_halo_elems) + for n = 1:NNEIGHBORS_PER_DIM + reallocate_bufs(T, i, n, max_halo_elems); + if (is_cuarray(A) && none(cudaaware_MPI())) reregister_cubufs(T, i, n, sendbufs_raw, recvbufs_raw); end # Host memory is page-locked (and mapped to device memory) to ensure optimal access performance (from kernel or with 3-D memcopy). + if (is_rocarray(A) && none(amdgpuaware_MPI())) reregister_rocbufs(T, i, n, sendbufs_raw, recvbufs_raw); end # ... + end + GC.gc(); # Too small buffers had been replaced with larger ones; free the now unused memory. + end + end + function reallocate_bufs(T::DataType, i::Integer, n::Integer, max_halo_elems::Integer) sendbufs_raw[i][n] = zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY); # Ensure that the amount of allocated memory is a multiple of 4*sizeof(T) (sizeof(Float64)/sizeof(Float16) = 4). So, we can always correctly reinterpret the raw buffers even if next time sizeof(T) is greater. recvbufs_raw[i][n] = zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY); end - # (CUDA functions) - - function init_cubufs_arrays() - cusendbufs_raw = Array{Array{Any,1},1}(); - curecvbufs_raw = Array{Array{Any,1},1}(); - cusendbufs_raw_h = Array{Array{Any,1},1}(); - curecvbufs_raw_h = Array{Array{Any,1},1}(); - end - - function init_cubufs(T::DataType, fields::GGField...) 
- while (length(cusendbufs_raw) < length(fields)) push!(cusendbufs_raw, [CuArray{T}(undef,0), CuArray{T}(undef,0)]); end - while (length(curecvbufs_raw) < length(fields)) push!(curecvbufs_raw, [CuArray{T}(undef,0), CuArray{T}(undef,0)]); end - while (length(cusendbufs_raw_h) < length(fields)) push!(cusendbufs_raw_h, [[], []]); end - while (length(curecvbufs_raw_h) < length(fields)) push!(curecvbufs_raw_h, [[], []]); end - end - - function reinterpret_cubufs(T::DataType, i::Integer, n::Integer) - if (eltype(cusendbufs_raw[i][n]) != T) cusendbufs_raw[i][n] = reinterpret(T, cusendbufs_raw[i][n]); end - if (eltype(curecvbufs_raw[i][n]) != T) curecvbufs_raw[i][n] = reinterpret(T, curecvbufs_raw[i][n]); end - end - - function reallocate_cubufs(T::DataType, i::Integer, n::Integer, max_halo_elems::Integer) - cusendbufs_raw[i][n] = CUDA.zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY); # Ensure that the amount of allocated memory is a multiple of 4*sizeof(T) (sizeof(Float64)/sizeof(Float16) = 4). So, we can always correctly reinterpret the raw buffers even if next time sizeof(T) is greater. - curecvbufs_raw[i][n] = CUDA.zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY); - end - - function reregister_cubufs(T::DataType, i::Integer, n::Integer) - if (isa(cusendbufs_raw_h[i][n],CUDA.Mem.HostBuffer)) CUDA.Mem.unregister(cusendbufs_raw_h[i][n]); cusendbufs_raw_h[i][n] = []; end # It is always initialized registered... if (cusendbufs_raw_h[i][n].bytesize > 32*sizeof(T)) - if (isa(curecvbufs_raw_h[i][n],CUDA.Mem.HostBuffer)) CUDA.Mem.unregister(curecvbufs_raw_h[i][n]); curecvbufs_raw_h[i][n] = []; end # It is always initialized registered... if (curecvbufs_raw_h[i][n].bytesize > 32*sizeof(T)) - cusendbufs_raw[i][n], cusendbufs_raw_h[i][n] = register(CuArray,sendbufs_raw[i][n]); - curecvbufs_raw[i][n], curecvbufs_raw_h[i][n] = register(CuArray,recvbufs_raw[i][n]); - end - - - # (AMDGPU functions) - - function init_rocbufs_arrays() - rocsendbufs_raw = Array{Array{Any,1},1}(); - rocrecvbufs_raw = Array{Array{Any,1},1}(); - # INFO: no need for roc host buffers - end - - function init_rocbufs(T::DataType, fields::GGField...) - while (length(rocsendbufs_raw) < length(fields)) push!(rocsendbufs_raw, [ROCArray{T}(undef,0), ROCArray{T}(undef,0)]); end - while (length(rocrecvbufs_raw) < length(fields)) push!(rocrecvbufs_raw, [ROCArray{T}(undef,0), ROCArray{T}(undef,0)]); end - # INFO: no need for roc host buffers - end - - function reinterpret_rocbufs(T::DataType, i::Integer, n::Integer) - if (eltype(rocsendbufs_raw[i][n]) != T) rocsendbufs_raw[i][n] = reinterpret(T, rocsendbufs_raw[i][n]); end - if (eltype(rocrecvbufs_raw[i][n]) != T) rocrecvbufs_raw[i][n] = reinterpret(T, rocrecvbufs_raw[i][n]); end - end - - function reallocate_rocbufs(T::DataType, i::Integer, n::Integer, max_halo_elems::Integer) - rocsendbufs_raw[i][n] = AMDGPU.zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY); # Ensure that the amount of allocated memory is a multiple of 4*sizeof(T) (sizeof(Float64)/sizeof(Float16) = 4). So, we can always correctly reinterpret the raw buffers even if next time sizeof(T) is greater. 
- rocrecvbufs_raw[i][n] = AMDGPU.zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY); - end - - function reregister_rocbufs(T::DataType, i::Integer, n::Integer) - # INFO: no need for roc host buffers - rocsendbufs_raw[i][n] = register(ROCArray,sendbufs_raw[i][n]); - rocrecvbufs_raw[i][n] = register(ROCArray,recvbufs_raw[i][n]); - end - - # (CPU functions) function sendbuf_flat(n::Integer, dim::Integer, i::Integer, A::GGField{T}) where T <: GGNumber @@ -298,49 +193,10 @@ let return reshape(recvbuf_flat(n,dim,i,A), halosize(dim,A)); end - - # (CUDA functions) - - function gpusendbuf_flat(n::Integer, dim::Integer, i::Integer, A::CuField{T}) where T <: GGNumber - return view(cusendbufs_raw[i][n]::CuVector{T},1:prod(halosize(dim,A))); - end - - function gpurecvbuf_flat(n::Integer, dim::Integer, i::Integer, A::CuField{T}) where T <: GGNumber - return view(curecvbufs_raw[i][n]::CuVector{T},1:prod(halosize(dim,A))); - end - - - # (AMDGPU functions) - - function gpusendbuf_flat(n::Integer, dim::Integer, i::Integer, A::ROCField{T}) where T <: GGNumber - return view(rocsendbufs_raw[i][n]::ROCVector{T},1:prod(halosize(dim,A))); - end - - function gpurecvbuf_flat(n::Integer, dim::Integer, i::Integer, A::ROCField{T}) where T <: GGNumber - return view(rocrecvbufs_raw[i][n]::ROCVector{T},1:prod(halosize(dim,A))); - end - - - # (GPU functions) - - #TODO: see if remove T here and in other cases for CuArray, ROCArray or Array (but then it does not verify that CuArray/ROCArray is of type GGNumber) or if I should instead change GGArray to GGArrayUnion and create: GGArray = Array{T} where T <: GGNumber and GGCuArray = CuArray{T} where T <: GGNumber; This is however more difficult to read and understand for others. - function gpusendbuf(n::Integer, dim::Integer, i::Integer, A::Union{CuField{T}, ROCField{T}}) where T <: GGNumber - return reshape(gpusendbuf_flat(n,dim,i,A), halosize(dim,A)); - end - - function gpurecvbuf(n::Integer, dim::Integer, i::Integer, A::Union{CuField{T}, ROCField{T}}) where T <: GGNumber - return reshape(gpurecvbuf_flat(n,dim,i,A), halosize(dim,A)); - end - - # Make sendbufs_raw and recvbufs_raw accessible for unit testing. - global get_sendbufs_raw, get_recvbufs_raw, get_cusendbufs_raw, get_curecvbufs_raw, get_rocsendbufs_raw, get_rocrecvbufs_raw + global get_sendbufs_raw, get_recvbufs_raw get_sendbufs_raw() = deepcopy(sendbufs_raw) get_recvbufs_raw() = deepcopy(recvbufs_raw) - get_cusendbufs_raw() = deepcopy(cusendbufs_raw) - get_curecvbufs_raw() = deepcopy(curecvbufs_raw) - get_rocsendbufs_raw() = deepcopy(rocsendbufs_raw) - get_rocrecvbufs_raw() = deepcopy(rocrecvbufs_raw) end @@ -412,143 +268,6 @@ let end -# (CUDA functions) - -function allocate_custreams(fields::GGField...) - allocate_custreams_iwrite(fields...); - allocate_custreams_iread(fields...); -end - -let - global iwrite_sendbufs!, allocate_custreams_iwrite, wait_iwrite - - custreams = Array{CuStream}(undef, NNEIGHBORS_PER_DIM, 0) - - wait_iwrite(n::Integer, A::CuField{T}, i::Integer) where T <: GGNumber = CUDA.synchronize(custreams[n,i]); - - function allocate_custreams_iwrite(fields::GGField...) - if length(fields) > size(custreams,2) # Note: for simplicity, we create a stream for every field even if it is not a CuField - custreams = [custreams [CuStream(; flags=CUDA.STREAM_NON_BLOCKING, priority=CUDA.priority_range()[end]) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(custreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. 
- end - end - - function iwrite_sendbufs!(n::Integer, dim::Integer, F::CuField{T}, i::Integer) where T <: GGNumber - A, halowidths = F; - if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... - if dim == 1 || cudaaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). - ranges = sendranges(n, dim, F); - nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1); - halosize = [r[end] - r[1] + 1 for r in ranges]; - nblocks = Tuple(ceil.(Int, halosize./nthreads)); - @cuda blocks=nblocks threads=nthreads stream=custreams[n,i] write_d2x!(gpusendbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); - else - write_d2h_async!(sendbuf_flat(n,dim,i,F), A, sendranges(n,dim,F), custreams[n,i]); - end - end - end -end - -let - global iread_recvbufs!, allocate_custreams_iread, wait_iread - - custreams = Array{CuStream}(undef, NNEIGHBORS_PER_DIM, 0) - - wait_iread(n::Integer, A::CuField{T}, i::Integer) where T <: GGNumber = CUDA.synchronize(custreams[n,i]); - - function allocate_custreams_iread(fields::GGField...) - if length(fields) > size(custreams,2) # Note: for simplicity, we create a stream for every field even if it is not a CuField - custreams = [custreams [CuStream(; flags=CUDA.STREAM_NON_BLOCKING, priority=CUDA.priority_range()[end]) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(custreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. - end - end - - function iread_recvbufs!(n::Integer, dim::Integer, F::CuField{T}, i::Integer) where T <: GGNumber - A, halowidths = F; - if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... - if dim == 1 || cudaaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). - ranges = recvranges(n, dim, F); - nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1); - halosize = [r[end] - r[1] + 1 for r in ranges]; - nblocks = Tuple(ceil.(Int, halosize./nthreads)); - @cuda blocks=nblocks threads=nthreads stream=custreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); - else - read_h2d_async!(recvbuf_flat(n,dim,i,F), A, recvranges(n,dim,F), custreams[n,i]); - end - end - end -end - - -# (AMDGPU functions) - -function allocate_rocstreams(fields::GGField...) - allocate_rocstreams_iwrite(fields...); - allocate_rocstreams_iread(fields...); -end - -let - global iwrite_sendbufs!, allocate_rocstreams_iwrite, wait_iwrite - - rocstreams = Array{AMDGPU.HIPStream}(undef, NNEIGHBORS_PER_DIM, 0) - - wait_iwrite(n::Integer, A::ROCField{T}, i::Integer) where T <: GGNumber = AMDGPU.synchronize(rocstreams[n,i]); - - function allocate_rocstreams_iwrite(fields::GGField...) - if length(fields) > size(rocstreams,2) # Note: for simplicity, we create a stream for every field even if it is not a ROCField - rocstreams = [rocstreams [AMDGPU.HIPStream(:high) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(rocstreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. 
- end - end - - function iwrite_sendbufs!(n::Integer, dim::Integer, F::ROCField{T}, i::Integer) where T <: GGNumber - A, halowidths = F; - if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... - # DEBUG: the follow section needs perf testing - # DEBUG 2: commenting read_h2d_async! for now - # if dim == 1 || amdgpuaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). - ranges = sendranges(n, dim, F); - nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1); - halosize = [r[end] - r[1] + 1 for r in ranges]; - nblocks = Tuple(ceil.(Int, halosize./nthreads)); - @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] write_d2x!(gpusendbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); - # else - # write_d2h_async!(sendbuf_flat(n,dim,i,F), A, sendranges(n,dim,F), rocstreams[n,i]); - # end - end - end -end - -let - global iread_recvbufs!, allocate_rocstreams_iread, wait_iread - - rocstreams = Array{AMDGPU.HIPStream}(undef, NNEIGHBORS_PER_DIM, 0) - - wait_iread(n::Integer, A::ROCField{T}, i::Integer) where T <: GGNumber = AMDGPU.synchronize(rocstreams[n,i]); - - function allocate_rocstreams_iread(fields::GGField...) - if length(fields) > size(rocstreams,2) # Note: for simplicity, we create a stream for every field even if it is not a ROCField - rocstreams = [rocstreams [AMDGPU.HIPStream(:high) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(rocstreams,2))]]; # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. - end - end - - function iread_recvbufs!(n::Integer, dim::Integer, F::ROCField{T}, i::Integer) where T <: GGNumber - A, halowidths = F; - if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth... - # DEBUG: the follow section needs perf testing - # DEBUG 2: commenting read_h2d_async! for now - # if dim == 1 || amdgpuaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case). - ranges = recvranges(n, dim, F); - nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1); - halosize = [r[end] - r[1] + 1 for r in ranges]; - nblocks = Tuple(ceil.(Int, halosize./nthreads)); - @roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim); - # else - # read_h2d_async!(recvbuf_flat(n,dim,i,F), A, recvranges(n,dim,F), rocstreams[n,i]); - # end - end - end - -end - - # (CPU/GPU functions) # Return the ranges from A to be sent. It will always return ranges for the dimensions x,y and z even if the A is 1D or 2D (for 2D, the 3rd range is 1:1; for 1D, the 2nd and 3rd range are 1:1). @@ -611,105 +330,6 @@ function read_h2h!(recvbuf::AbstractArray{T}, A::Array{T}, recvranges::Array{Uni end -# (CUDA functions) - -# Write to the send buffer on the host or device from the array on the device (d2x). 
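The removed stream and `iwrite_sendbufs!`/`iread_recvbufs!` code above launches copy kernels (`write_d2x!`, `read_x2d!`, removed below) that move a halo slice between the 3-D array and a flat buffer. A plain-Julia CPU sketch of the gather direction, purely for illustration (the real kernels parallelize this loop over GPU threads):

```julia
# Copy the halo slice selected by three index ranges from a 3-D array into a flat
# send buffer, in column-major order (x fastest).
function gather_halo!(sendbuf::AbstractVector, A::AbstractArray{<:Any,3}, rx, ry, rz)
    i = 0
    for iz in rz, iy in ry, ix in rx
        sendbuf[i += 1] = A[ix, iy, iz]
    end
    return sendbuf
end

A   = reshape(1.0:64.0, 4, 4, 4)
buf = zeros(16)                          # room for one 4x4 halo plane
gather_halo!(buf, A, 2:2, 1:4, 1:4)      # e.g. the plane ix == 2
@assert buf == vec(A[2, 1:4, 1:4])
```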
-function write_d2x!(gpusendbuf::CuDeviceArray{T}, A::CuDeviceArray{T}, sendrangex::UnitRange{Int64}, sendrangey::UnitRange{Int64}, sendrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber - ix = (CUDA.blockIdx().x-1) * CUDA.blockDim().x + CUDA.threadIdx().x + sendrangex[1] - 1 - iy = (CUDA.blockIdx().y-1) * CUDA.blockDim().y + CUDA.threadIdx().y + sendrangey[1] - 1 - iz = (CUDA.blockIdx().z-1) * CUDA.blockDim().z + CUDA.threadIdx().z + sendrangez[1] - 1 - if !(ix in sendrangex && iy in sendrangey && iz in sendrangez) return nothing; end - gpusendbuf[ix-(sendrangex[1]-1),iy-(sendrangey[1]-1),iz-(sendrangez[1]-1)] = A[ix,iy,iz]; - return nothing -end - -# Read from the receive buffer on the host or device and store on the array on the device (x2d). -function read_x2d!(gpurecvbuf::CuDeviceArray{T}, A::CuDeviceArray{T}, recvrangex::UnitRange{Int64}, recvrangey::UnitRange{Int64}, recvrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber - ix = (CUDA.blockIdx().x-1) * CUDA.blockDim().x + CUDA.threadIdx().x + recvrangex[1] - 1 - iy = (CUDA.blockIdx().y-1) * CUDA.blockDim().y + CUDA.threadIdx().y + recvrangey[1] - 1 - iz = (CUDA.blockIdx().z-1) * CUDA.blockDim().z + CUDA.threadIdx().z + recvrangez[1] - 1 - if !(ix in recvrangex && iy in recvrangey && iz in recvrangez) return nothing; end - A[ix,iy,iz] = gpurecvbuf[ix-(recvrangex[1]-1),iy-(recvrangey[1]-1),iz-(recvrangez[1]-1)]; - return nothing -end - -# Write to the send buffer on the host from the array on the device (d2h). -function write_d2h_async!(sendbuf::AbstractArray{T}, A::CuArray{T}, sendranges::Array{UnitRange{T2},1}, custream::CuStream) where T <: GGNumber where T2 <: Integer - CUDA.Mem.unsafe_copy3d!( - pointer(sendbuf), CUDA.Mem.Host, pointer(A), CUDA.Mem.Device, - length(sendranges[1]), length(sendranges[2]), length(sendranges[3]); - srcPos=(sendranges[1][1], sendranges[2][1], sendranges[3][1]), - srcPitch=sizeof(T)*size(A,1), srcHeight=size(A,2), - dstPitch=sizeof(T)*length(sendranges[1]), dstHeight=length(sendranges[2]), - async=true, stream=custream - ) -end - -# Read from the receive buffer on the host and store on the array on the device (h2d). -function read_h2d_async!(recvbuf::AbstractArray{T}, A::CuArray{T}, recvranges::Array{UnitRange{T2},1}, custream::CuStream) where T <: GGNumber where T2 <: Integer - CUDA.Mem.unsafe_copy3d!( - pointer(A), CUDA.Mem.Device, pointer(recvbuf), CUDA.Mem.Host, - length(recvranges[1]), length(recvranges[2]), length(recvranges[3]); - dstPos=(recvranges[1][1], recvranges[2][1], recvranges[3][1]), - srcPitch=sizeof(T)*length(recvranges[1]), srcHeight=length(recvranges[2]), - dstPitch=sizeof(T)*size(A,1), dstHeight=size(A,2), - async=true, stream=custream - ) -end - - -# (AMDGPU functions) - -# Write to the send buffer on the host or device from the array on the device (d2x). 
-function write_d2x!(gpusendbuf::ROCDeviceArray{T}, A::ROCDeviceArray{T}, sendrangex::UnitRange{Int64}, sendrangey::UnitRange{Int64}, sendrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber - ix = (AMDGPU.workgroupIdx().x-1) * AMDGPU.workgroupDim().x + AMDGPU.workitemIdx().x + sendrangex[1] - 1 - iy = (AMDGPU.workgroupIdx().y-1) * AMDGPU.workgroupDim().y + AMDGPU.workitemIdx().y + sendrangey[1] - 1 - iz = (AMDGPU.workgroupIdx().z-1) * AMDGPU.workgroupDim().z + AMDGPU.workitemIdx().z + sendrangez[1] - 1 - if !(ix in sendrangex && iy in sendrangey && iz in sendrangez) return nothing; end - gpusendbuf[ix-(sendrangex[1]-1),iy-(sendrangey[1]-1),iz-(sendrangez[1]-1)] = A[ix,iy,iz]; - return nothing -end - -# Read from the receive buffer on the host or device and store on the array on the device (x2d). -function read_x2d!(gpurecvbuf::ROCDeviceArray{T}, A::ROCDeviceArray{T}, recvrangex::UnitRange{Int64}, recvrangey::UnitRange{Int64}, recvrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber - ix = (AMDGPU.workgroupIdx().x-1) * AMDGPU.workgroupDim().x + AMDGPU.workitemIdx().x + recvrangex[1] - 1 - iy = (AMDGPU.workgroupIdx().y-1) * AMDGPU.workgroupDim().y + AMDGPU.workitemIdx().y + recvrangey[1] - 1 - iz = (AMDGPU.workgroupIdx().z-1) * AMDGPU.workgroupDim().z + AMDGPU.workitemIdx().z + recvrangez[1] - 1 - if !(ix in recvrangex && iy in recvrangey && iz in recvrangez) return nothing; end - A[ix,iy,iz] = gpurecvbuf[ix-(recvrangex[1]-1),iy-(recvrangey[1]-1),iz-(recvrangez[1]-1)]; - return nothing -end - -# Write to the send buffer on the host from the array on the device (d2h). -function write_d2h_async!(sendbuf::AbstractArray{T}, A::ROCArray{T}, sendranges::Array{UnitRange{T2},1}, rocstream::AMDGPU.HIPStream) where T <: GGNumber where T2 <: Integer - buf_view = reshape(sendbuf, Tuple(length.(sendranges))) - AMDGPU.Mem.unsafe_copy3d!( - pointer(sendbuf), AMDGPU.Mem.HostBuffer, - pointer(A), typeof(A.buf), - length(sendranges[1]), length(sendranges[2]), length(sendranges[3]); - srcPos=(sendranges[1][1], sendranges[2][1], sendranges[3][1]), - dstPitch=sizeof(T) * size(buf_view, 1), dstHeight=size(buf_view, 2), - srcPitch=sizeof(T) * size(A, 1), srcHeight=size(A, 2), - async=true, stream=rocstream - ) - return nothing -end - -# Read from the receive buffer on the host and store on the array on the device (h2d). 
-function read_h2d_async!(recvbuf::AbstractArray{T}, A::ROCArray{T}, recvranges::Array{UnitRange{T2},1}, rocstream::AMDGPU.HIPStream) where T <: GGNumber where T2 <: Integer - buf_view = reshape(recvbuf, Tuple(length.(recvranges))) - AMDGPU.Mem.unsafe_copy3d!( - pointer(A), typeof(A.buf), - pointer(recvbuf), AMDGPU.Mem.HostBuffer, - length(recvranges[1]), length(recvranges[2]), length(recvranges[3]); - dstPos=(recvranges[1][1], recvranges[2][1], recvranges[3][1]), - dstPitch=sizeof(T) * size(A, 1), dstHeight=size(A, 2), - srcPitch=sizeof(T) * size(buf_view, 1), srcHeight=size(buf_view, 2), - async=true, stream=rocstream - ) - return nothing -end - ##------------------------------ ## FUNCTIONS TO SEND/RECV FIELDS @@ -782,20 +402,6 @@ function memcopy_threads!(dst::AbstractArray{T}, src::AbstractArray{T}) where T end -# (CUDA functions) - -function gpumemcopy!(dst::CuArray{T}, src::CuArray{T}) where T <: GGNumber - @inbounds CUDA.copyto!(dst, src) -end - - -# (AMDGPU functions) - -function gpumemcopy!(dst::ROCArray{T}, src::ROCArray{T}) where T <: GGNumber - @inbounds AMDGPU.copyto!(dst, src) -end - - ##------------------------------------------- ## FUNCTIONS FOR CHECKING THE INPUT ARGUMENTS diff --git a/test/runtests.jl b/test/runtests.jl index a6a5800..fd8a36d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,17 +2,29 @@ push!(LOAD_PATH, "../src") # FIXME: to be removed everywhere? import ImplicitGlobalGrid # Precompile it. +import ImplicitGlobalGrid: SUPPORTED_DEVICE_TYPES, DEVICE_TYPE_CUDA, DEVICE_TYPE_AMDGPU +@static if (DEVICE_TYPE_CUDA in SUPPORTED_DEVICE_TYPES) import CUDA end +@static if (DEVICE_TYPE_AMDGPU in SUPPORTED_DEVICE_TYPES) import AMDGPU end excludedfiles = ["test_excluded.jl"]; function runtests() exename = joinpath(Sys.BINDIR, Base.julia_exename()) testdir = pwd() - istest(f) = endswith(f, ".jl") && startswith(f, "test_") - testfiles = sort(filter(istest, readdir(testdir))) + istest(f) = endswith(f, ".jl") && startswith(basename(f), "test_") + testfiles = sort(filter(istest, vcat([joinpath.(root, files) for (root, dirs, files) in walkdir(testdir)]...))) nfail = 0 printstyled("Testing package ImplicitGlobalGrid.jl\n"; bold=true, color=:white) + + if (DEVICE_TYPE_CUDA in SUPPORTED_DEVICE_TYPES && !CUDA.functional()) + @warn "Test Skip: All CUDA tests will be skipped because CUDA is not functional (if this is unexpected type `import CUDA; CUDA.functional(true)` to debug your CUDA installation)." + end + + if (DEVICE_TYPE_AMDGPU in SUPPORTED_DEVICE_TYPES && !AMDGPU.functional()) + @warn "Test Skip: All AMDGPU tests will be skipped because AMDGPU is not functional (if this is unexpected type `import AMDGPU; AMDGPU.functional()` to debug your AMDGPU installation)." 
+ end + for f in testfiles println("") if f ∈ excludedfiles @@ -28,4 +40,5 @@ function runtests() end return nfail end + exit(runtests()) diff --git a/test/test_finalize_global_grid.jl b/test/test_finalize_global_grid.jl index dac6678..d80bfce 100644 --- a/test/test_finalize_global_grid.jl +++ b/test/test_finalize_global_grid.jl @@ -1,7 +1,7 @@ push!(LOAD_PATH, "../src") using Test +import MPI, CUDA, AMDGPU using ImplicitGlobalGrid; GG = ImplicitGlobalGrid -import MPI import ImplicitGlobalGrid: @require diff --git a/test/test_gather.jl b/test/test_gather.jl index 42cc4af..9debc3a 100644 --- a/test/test_gather.jl +++ b/test/test_gather.jl @@ -1,7 +1,7 @@ push!(LOAD_PATH, "../src") using Test +import MPI, CUDA, AMDGPU using ImplicitGlobalGrid; GG = ImplicitGlobalGrid -import MPI import ImplicitGlobalGrid: @require diff --git a/test/test_init_global_grid.jl b/test/test_init_global_grid.jl index 9076a94..19c1d02 100644 --- a/test/test_init_global_grid.jl +++ b/test/test_init_global_grid.jl @@ -1,7 +1,7 @@ push!(LOAD_PATH, "../src") using Test +import MPI, CUDA, AMDGPU using ImplicitGlobalGrid; GG = ImplicitGlobalGrid -import MPI import ImplicitGlobalGrid: @require diff --git a/test/test_select_device.jl b/test/test_select_device.jl index 4a5b37a..10cd4d7 100644 --- a/test/test_select_device.jl +++ b/test/test_select_device.jl @@ -1,10 +1,9 @@ # NOTE: All tests of this file can be run with any number of processes. push!(LOAD_PATH, "../src") using Test -using ImplicitGlobalGrid; GG = ImplicitGlobalGrid import MPI -using CUDA -using AMDGPU +using CUDA, AMDGPU +using ImplicitGlobalGrid; GG = ImplicitGlobalGrid import ImplicitGlobalGrid: @require test_cuda = CUDA.functional() @@ -34,13 +33,13 @@ nprocs = MPI.Comm_size(MPI.COMM_WORLD); # NOTE: these tests can run with any num @testset "\"AMDGPU\"" begin me, = init_global_grid(3, 4, 5; quiet=true, init_MPI=false, device_type="AMDGPU"); gpu_id = select_device(); - @test gpu_id < length(AMDGPU.devices()) + @test gpu_id <= length(AMDGPU.devices()) finalize_global_grid(finalize_MPI=false); end; @testset "\"auto\"" begin me, = init_global_grid(3, 4, 5; quiet=true, init_MPI=false, device_type="auto"); gpu_id = select_device(); - @test gpu_id < length(AMDGPU.devices()) + @test gpu_id <= length(AMDGPU.devices()) finalize_global_grid(finalize_MPI=false); end; end diff --git a/test/test_tools.jl b/test/test_tools.jl index fdcf432..d2785d2 100644 --- a/test/test_tools.jl +++ b/test/test_tools.jl @@ -1,7 +1,7 @@ push!(LOAD_PATH, "../src") using Test +import MPI, CUDA, AMDGPU using ImplicitGlobalGrid; GG = ImplicitGlobalGrid -import MPI import ImplicitGlobalGrid: @require macro coords(i) :(GG.global_grid().coords[$i]) end diff --git a/test/test_update_halo.jl b/test/test_update_halo.jl index fb57038..1bff613 100644 --- a/test/test_update_halo.jl +++ b/test/test_update_halo.jl @@ -4,11 +4,9 @@ push!(LOAD_PATH, "../src") using Test -import LoopVectorization +import MPI, LoopVectorization +using CUDA, AMDGPU using ImplicitGlobalGrid; GG = ImplicitGlobalGrid -import MPI -using CUDA -using AMDGPU import ImplicitGlobalGrid: @require, longnameof test_cuda = CUDA.functional()
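The test-file changes above import the backend packages up front and gate the corresponding test sets on `CUDA.functional()` / `AMDGPU.functional()`. A hedged sketch of that skip pattern (the flags and the `array_types` list are illustrative stand-ins, not the test suite's actual variables):

```julia
using Test

test_cuda   = false    # would be CUDA.functional() once CUDA is imported
test_amdgpu = false    # would be AMDGPU.functional() once AMDGPU is imported

array_types = ["CPU"]
if test_cuda   push!(array_types, "CUDA")   end
if test_amdgpu push!(array_types, "AMDGPU") end

@testset "halo update ($atype arrays)" for atype in array_types
    @test true         # placeholder for the real assertions
end
```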