From 2044c45c37f7c789391d5410c4817a6804e363c8 Mon Sep 17 00:00:00 2001 From: Philip Fackler Date: Fri, 18 Oct 2024 15:15:53 -0500 Subject: [PATCH 01/14] Refactor array and function dispatching --- ext/JACCAMDGPU/JACCAMDGPU.jl | 19 +++++++------ ext/JACCAMDGPU/array.jl | 4 +-- ext/JACCCUDA/JACCCUDA.jl | 18 ++++++------ ext/JACCCUDA/JACCMULTI.jl | 12 ++++---- ext/JACCCUDA/array.jl | 4 +-- ext/JACCONEAPI/JACCONEAPI.jl | 16 +++++------ ext/JACCONEAPI/array.jl | 4 +-- src/JACC.jl | 55 ++++++++++++++++++++++++++++++------ src/JACCMULTI.jl | 29 +++++++++++++++++-- src/JACCPreferences.jl | 1 + src/array.jl | 8 ++++-- 11 files changed, 118 insertions(+), 52 deletions(-) diff --git a/ext/JACCAMDGPU/JACCAMDGPU.jl b/ext/JACCAMDGPU/JACCAMDGPU.jl index 32e408d..8df67f4 100644 --- a/ext/JACCAMDGPU/JACCAMDGPU.jl +++ b/ext/JACCAMDGPU/JACCAMDGPU.jl @@ -9,7 +9,11 @@ include("array.jl") include("JACCEXPERIMENTAL.jl") using .experimental -function JACC.parallel_for(N::I, f::F, x...) where {I <: Integer, F <: Function} +const AMDGPUBackend = ROCBackend + +JACC.get_backend(::Val{:amdgpu}) = AMDGPUBackend() + +function JACC.parallel_for(::AMDGPUBackend, N::I, f::F, x...) where {I <: Integer, F <: Function} numThreads = 512 threads = min(N, numThreads) blocks = ceil(Int, N / threads) @@ -21,7 +25,7 @@ function JACC.parallel_for(N::I, f::F, x...) where {I <: Integer, F <: Function} end function JACC.parallel_for( - (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} + ::AMDGPUBackend, (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} numThreads = 16 Mthreads = min(M, numThreads) Nthreads = min(N, numThreads) @@ -35,7 +39,7 @@ function JACC.parallel_for( end function JACC.parallel_for( - (L, M, N)::Tuple{I, I, I}, f::F, x...) where { + ::AMDGPUBackend, (L, M, N)::Tuple{I, I, I}, f::F, x...) where { I <: Integer, F <: Function} numThreads = 32 Lthreads = min(L, numThreads) @@ -52,7 +56,7 @@ function JACC.parallel_for( end function JACC.parallel_reduce( - N::I, f::F, x...) where {I <: Integer, F <: Function} + ::AMDGPUBackend, N::I, f::F, x...) where {I <: Integer, F <: Function} numThreads = 512 threads = min(N, numThreads) blocks = ceil(Int, N / threads) @@ -68,7 +72,7 @@ function JACC.parallel_reduce( end function JACC.parallel_reduce( - (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} + ::AMDGPUBackend, (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} numThreads = 16 Mthreads = min(M, numThreads) Nthreads = min(N, numThreads) @@ -389,9 +393,6 @@ function JACC.shared(x::ROCDeviceArray{T,N}) where {T,N} return shmem end - -function __init__() - const JACC.Array = AMDGPU.ROCArray{T, N} where {T, N} -end +JACC.array_type(::AMDGPUBackend) = AMDGPU.ROCArray{T, N} where {T, N} end # module JACCAMDGPU diff --git a/ext/JACCAMDGPU/array.jl b/ext/JACCAMDGPU/array.jl index b826080..baeebcf 100644 --- a/ext/JACCAMDGPU/array.jl +++ b/ext/JACCAMDGPU/array.jl @@ -1,8 +1,8 @@ -function JACC.zeros(T, dims...) +function JACC.zeros(::AMDGPUBackend, T, dims...) return AMDGPU.zeros(T, dims...) end -function JACC.ones(T, dims...) +function JACC.ones(::AMDGPUBackend, T, dims...) return AMDGPU.ones(T, dims...) 
end diff --git a/ext/JACCCUDA/JACCCUDA.jl b/ext/JACCCUDA/JACCCUDA.jl index 20aa6a1..517f3e8 100644 --- a/ext/JACCCUDA/JACCCUDA.jl +++ b/ext/JACCCUDA/JACCCUDA.jl @@ -5,7 +5,6 @@ using JACC, CUDA # overloaded array functions include("array.jl") - include("JACCMULTI.jl") using .multi @@ -13,7 +12,9 @@ using .multi include("JACCEXPERIMENTAL.jl") using .experimental -function JACC.parallel_for(N::I, f::F, x...) where {I <: Integer, F <: Function} +JACC.get_backend(::Val{:cuda}) = CUDABackend() + +function JACC.parallel_for(::CUDABackend, N::I, f::F, x...) where {I <: Integer, F <: Function} #parallel_args = (N, f, x...) #parallel_kargs = cudaconvert.(parallel_args) #parallel_tt = Tuple{Core.Typeof.(parallel_kargs)...} @@ -28,7 +29,7 @@ function JACC.parallel_for(N::I, f::F, x...) where {I <: Integer, F <: Function} end function JACC.parallel_for( - (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} + ::CUDABackend, (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} #To use JACC.shared, it is recommended to use a high number of threads per block to maximize the # potential benefit from using shared memory. #numThreads = 32 @@ -42,7 +43,7 @@ function JACC.parallel_for( end function JACC.parallel_for( - (L, M, N)::Tuple{I, I, I}, f::F, x...) where { + ::CUDABackend, (L, M, N)::Tuple{I, I, I}, f::F, x...) where { I <: Integer, F <: Function} #To use JACC.shared, it is recommended to use a high number of threads per block to maximize the # potential benefit from using shared memory. @@ -58,7 +59,7 @@ function JACC.parallel_for( end function JACC.parallel_reduce( - N::I, f::F, x...) where {I <: Integer, F <: Function} + ::CUDABackend, N::I, f::F, x...) where {I <: Integer, F <: Function} numThreads = 512 threads = min(N, numThreads) blocks = ceil(Int, N / threads) @@ -72,7 +73,7 @@ function JACC.parallel_reduce( end function JACC.parallel_reduce( - (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} + ::CUDABackend, (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} numThreads = 16 Mthreads = min(M, numThreads) Nthreads = min(N, numThreads) @@ -402,9 +403,6 @@ function JACC.shared(x::CuDeviceArray{T,N}) where {T,N} return shmem end - -function __init__() - const JACC.Array = CUDA.CuArray{T, N} where {T, N} -end +JACC.array_type(::CUDABackend) = CUDA.CuArray{T, N} where {T, N} end # module JACCCUDA diff --git a/ext/JACCCUDA/JACCMULTI.jl b/ext/JACCCUDA/JACCMULTI.jl index 73e75a2..c6e94c9 100644 --- a/ext/JACCCUDA/JACCMULTI.jl +++ b/ext/JACCCUDA/JACCMULTI.jl @@ -2,7 +2,7 @@ module multi using JACC, CUDA -function JACC.multi.Array(x::Base.Array{T,N}) where {T,N} +function JACC.multi.Array(::CUDABackend, x::Base.Array{T,N}) where {T,N} ndev = length(devices()) ret = Vector{Any}(undef, 2) @@ -52,7 +52,7 @@ function JACC.multi.Array(x::Base.Array{T,N}) where {T,N} end -function JACC.multi.copy(x::Vector{Any}, y::Vector{Any}) +function JACC.multi.copy(::CUDABackend, x::Vector{Any}, y::Vector{Any}) device!(0) ndev = length(devices()) @@ -74,7 +74,7 @@ function JACC.multi.copy(x::Vector{Any}, y::Vector{Any}) end -function JACC.multi.parallel_for(N::I, f::F, x...) where {I <: Integer, F <: Function} +function JACC.multi.parallel_for(::CUDABackend, N::I, f::F, x...) where {I <: Integer, F <: Function} device!(0) ndev = length(devices()) @@ -98,7 +98,7 @@ function JACC.multi.parallel_for(N::I, f::F, x...) where {I <: Integer, F <: Fun end -function JACC.multi.parallel_reduce(N::I, f::F, x...) 
where {I <: Integer, F <: Function} +function JACC.multi.parallel_reduce(::CUDABackend, N::I, f::F, x...) where {I <: Integer, F <: Function} device!(0) ndev = length(devices()) @@ -139,7 +139,7 @@ function JACC.multi.parallel_reduce(N::I, f::F, x...) where {I <: Integer, F <: return final_rret end -function JACC.multi.parallel_for( +function JACC.multi.parallel_for(::CUDABackend, (M, N)::Tuple{I,I}, f::F, x...) where {I <: Integer, F <: Function} ndev = length(devices()) @@ -165,7 +165,7 @@ function JACC.multi.parallel_for( end -function JACC.multi.parallel_reduce( +function JACC.multi.parallel_reduce(::CUDABackend, (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} ndev = length(devices()) diff --git a/ext/JACCCUDA/array.jl b/ext/JACCCUDA/array.jl index 5cf21e0..53fbf5d 100644 --- a/ext/JACCCUDA/array.jl +++ b/ext/JACCCUDA/array.jl @@ -1,8 +1,8 @@ -function JACC.zeros(T, dims...) +function JACC.zeros(::CUDABackend, T, dims...) return CUDA.zeros(T, dims...) end -function JACC.ones(T, dims...) +function JACC.ones(::CUDABackend, T, dims...) return CUDA.ones(T, dims...) end diff --git a/ext/JACCONEAPI/JACCONEAPI.jl b/ext/JACCONEAPI/JACCONEAPI.jl index b865a75..929146c 100644 --- a/ext/JACCONEAPI/JACCONEAPI.jl +++ b/ext/JACCONEAPI/JACCONEAPI.jl @@ -9,7 +9,9 @@ include("array.jl") include("JACCEXPERIMENTAL.jl") using .experimental -function JACC.parallel_for(N::I, f::F, x...) where {I <: Integer, F <: Function} +JACC.get_backend(::Val{:oneapi}) = oneAPIBackend() + +function JACC.parallel_for(::oneAPIBackend, N::I, f::F, x...) where {I <: Integer, F <: Function} #maxPossibleItems = oneAPI.oneL0.compute_properties(device().maxTotalGroupSize) maxPossibleItems = 256 items = min(N, maxPossibleItems) @@ -22,7 +24,7 @@ function JACC.parallel_for(N::I, f::F, x...) where {I <: Integer, F <: Function} end function JACC.parallel_for( - (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} + ::oneAPIBackend, (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} maxPossibleItems = 16 Mitems = min(M, maxPossibleItems) Nitems = min(N, maxPossibleItems) @@ -33,7 +35,7 @@ function JACC.parallel_for( end function JACC.parallel_for( - (L, M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} + ::oneAPIBackend, (L, M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} maxPossibleItems = 16 Litems = min(M, maxPossibleItems) Mitems = min(M, maxPossibleItems) @@ -47,7 +49,7 @@ function JACC.parallel_for( end function JACC.parallel_reduce( - N::I, f::F, x...) where {I <: Integer, F <: Function} + ::oneAPIBackend, N::I, f::F, x...) where {I <: Integer, F <: Function} numItems = 256 items = min(N, numItems) groups = ceil(Int, N / items) @@ -60,7 +62,7 @@ function JACC.parallel_reduce( end function JACC.parallel_reduce( - (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} + ::oneAPIBackend, (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} numItems = 16 Mitems = min(M, numItems) Nitems = min(N, numItems) @@ -379,8 +381,6 @@ function JACC.shared(x::oneDeviceArray{T,N}) where {T,N} return shmem end -function __init__() - const JACC.Array = oneAPI.oneArray{T, N} where {T, N} -end +JACC.array_type(::oneAPIBackend) = oneAPI.oneArray{T, N} where {T, N} end # module JACCONEAPI diff --git a/ext/JACCONEAPI/array.jl b/ext/JACCONEAPI/array.jl index de479f2..a5a9a0a 100644 --- a/ext/JACCONEAPI/array.jl +++ b/ext/JACCONEAPI/array.jl @@ -1,8 +1,8 @@ -function JACC.zeros(T, dims...) 
+function JACC.zeros(::oneAPIBackend, T, dims...) return oneAPI.zeros(T, dims...) end -function JACC.ones(T, dims...) +function JACC.ones(::oneAPIBackend, T, dims...) return oneAPI.ones(T, dims...) end diff --git a/src/JACC.jl b/src/JACC.jl index 333d2c0..78dd67e 100644 --- a/src/JACC.jl +++ b/src/JACC.jl @@ -1,9 +1,12 @@ -__precompile__(false) module JACC import Atomix: @atomic -# module to set back end preferences + +# module to set backend preferences include("JACCPreferences.jl") + +struct ThreadsBackend end + include("helper.jl") # overloaded array functions include("array.jl") @@ -17,19 +20,22 @@ using .multi include("JACCEXPERIMENTAL.jl") using .experimental +get_backend(::Val{:threads}) = ThreadsBackend() + export Array, @atomic export parallel_for +export parallel_reduce global Array -function parallel_for(N::I, f::F, x...) where {I <: Integer, F <: Function} +function parallel_for(::ThreadsBackend, N::I, f::F, x...) where {I <: Integer, F <: Function} @maybe_threaded for i in 1:N f(i, x...) end end function parallel_for( - (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} + ::ThreadsBackend, (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} @maybe_threaded for j in 1:N for i in 1:M f(i, j, x...) @@ -38,7 +44,7 @@ function parallel_for( end function parallel_for( - (L, M, N)::Tuple{I, I, I}, f::F, x...) where { + ::ThreadsBackend, (L, M, N)::Tuple{I, I, I}, f::F, x...) where { I <: Integer, F <: Function} # only threaded at the first level (no collapse equivalent) @maybe_threaded for k in 1:N @@ -50,7 +56,7 @@ function parallel_for( end end -function parallel_reduce(N::I, f::F, x...) where {I <: Integer, F <: Function} +function parallel_reduce(::ThreadsBackend, N::I, f::F, x...) where {I <: Integer, F <: Function} tmp = zeros(Threads.nthreads()) ret = zeros(1) @maybe_threaded for i in 1:N @@ -63,7 +69,7 @@ function parallel_reduce(N::I, f::F, x...) where {I <: Integer, F <: Function} end function parallel_reduce( - (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} + ::ThreadsBackend, (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} tmp = zeros(Threads.nthreads()) ret = zeros(1) @maybe_threaded for j in 1:N @@ -77,12 +83,43 @@ function parallel_reduce( return ret end +array_type(::ThreadsBackend) = Base.Array{T, N} where {T, N} + function shared(x::Base.Array{T,N}) where {T,N} return x end -function __init__() - const JACC.Array = Base.Array{T, N} where {T, N} +struct Array{T, N} end +(::Type{Array{T, N}})(args...; kwargs...) where {T, N} = + array_type(){T, N}(args...; kwargs...) +(::Type{Array{T}})(args...; kwargs...) where {T} = + array_type(){T}(args...; kwargs...) +(::Type{Array})(args...; kwargs...) = + array_type()(args...; kwargs...) + +default_backend() = get_backend(JACCPreferences._backend_dispatchable) + +array_type() = array_type(default_backend()) + +function parallel_for(N::I, f::F, x...) where {I <: Integer, F <: Function} + return parallel_for(default_backend(), N, f, x...) +end + +function parallel_for( + (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} + return parallel_for(default_backend(), (M, N), f, x...) +end + +function parallel_for((L, M, N)::Tuple{I, I, I}, f::F, x...) where {I <: Integer, F <: Function} + return parallel_for(default_backend(), (L, M, N), f, x...) +end + +function parallel_reduce(N::I, f::F, x...) where {I <: Integer, F <: Function} + return parallel_reduce(default_backend(), N, f, x...) 
+end + +function parallel_reduce((M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} + return parallel_reduce(default_backend(), (M, N), f, x...) end end # module JACC diff --git a/src/JACCMULTI.jl b/src/JACCMULTI.jl index b86be05..8ec15bb 100644 --- a/src/JACCMULTI.jl +++ b/src/JACCMULTI.jl @@ -1,24 +1,49 @@ module multi using JACC +import JACC: ThreadsBackend -function Array(x::Base.Array{T,N}) where {T,N} +function Array(::ThreadsBackend, x::Base.Array{T,N}) where {T,N} return x end +function copy(::ThreadsBackend, x::Vector{Any}, y::Vector{Any}) +end + +function parallel_for(::ThreadsBackend, N::I, f::F, x...) where {I <: Integer, F <: Function} +end + +function parallel_for(::ThreadsBackend, (M, N)::Tuple{I,I}, f::F, x...) where {I <: Integer, F <: Function} +end + +function parallel_reduce(::ThreadsBackend, N::I, f::F, x...) where {I <: Integer, F <: Function} +end + +function parallel_reduce(::ThreadsBackend, (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} +end + +function Array(x::Base.Array{T,N}) where {T,N} + return Array(default_backend(), x) +end + function copy(x::Vector{Any}, y::Vector{Any}) + return copy(default_backend(), x, y) end function parallel_for(N::I, f::F, x...) where {I <: Integer, F <: Function} + return parallel_for(default_backend(), N, f, x...) end -function parallel_for((M, N)::Tuple{I,I}, f::F, x...) where {I <: Integer, F <: Function} +function parallel_for((M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} + return parallel_for(default_backend(), (M, N), f, x...) end function parallel_reduce(N::I, f::F, x...) where {I <: Integer, F <: Function} + return parallel_reduce(default_backend(), N, f, x...) end function parallel_reduce((M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} + return parallel_reduce(default_backend(), (M, N), f, x...) end end # module multi diff --git a/src/JACCPreferences.jl b/src/JACCPreferences.jl index f66ddce..45bfb2c 100644 --- a/src/JACCPreferences.jl +++ b/src/JACCPreferences.jl @@ -16,5 +16,6 @@ function set_backend(new_backend::String) end const backend = @load_preference("backend", "threads") +const _backend_dispatchable = Val{Symbol(backend)}() end # module JACCPreferences diff --git a/src/array.jl b/src/array.jl index 1c1e703..9f7f581 100644 --- a/src/array.jl +++ b/src/array.jl @@ -1,8 +1,12 @@ -function zeros(T, dims...) +function zeros(::ThreadsBackend, T, dims...) return Base.zeros(T, dims...) end -function ones(T, dims...) +function ones(::ThreadsBackend, T, dims...) return Base.ones(T, dims...) end + +zeros(T, dims...) = zeros(default_backend(), T, dims...) + +ones(T, dims...) = ones(default_backend(), T, dims...) From b7ae0d446033d4083732faf9764a00e461a06c48 Mon Sep 17 00:00:00 2001 From: pedrovalerolara Date: Mon, 21 Oct 2024 09:48:53 -0400 Subject: [PATCH 02/14] Added JACC.multi implementation for the AMDGPU backend, non-included the special function for ghost cell management. Tests don't work, we need more than one GPU for this. I tested the test codes in Frontier. It worked well there. Also, the tests do not work on MI100-cousteau, even using the last version of AMDGPU. I changed Project. 
toml to include a more moderm version of AMDGPU.jl which we need for multi GPU support --- Project.toml | 3 +- ext/JACCAMDGPU/JACCAMDGPU.jl | 3 + ext/JACCAMDGPU/JACCMULTI.jl | 508 +++++++++++++++++++++++++++++++++++ ext/JACCCUDA/JACCCUDA.jl | 1 - ext/JACCCUDA/JACCMULTI.jl | 4 + src/JACCMULTI.jl | 3 + test/tests_amdgpu.jl | 41 +++ 7 files changed, 561 insertions(+), 2 deletions(-) create mode 100644 ext/JACCAMDGPU/JACCMULTI.jl diff --git a/Project.toml b/Project.toml index 52629e0..35cdfb6 100644 --- a/Project.toml +++ b/Project.toml @@ -4,6 +4,7 @@ authors = ["pedrovalerolara ", "williamfgc 512 + while ii <= N + tmp += @inbounds red[ii] + ii += 512 + end + elseif (i <= N) + tmp = @inbounds red[i] + end + shared_mem[workitemIdx().x] = tmp + AMDGPU.sync_workgroup() + if (i <= 256) + shared_mem[i] += shared_mem[i + 256] + end + AMDGPU.sync_workgroup() + if (i <= 128) + shared_mem[i] += shared_mem[i + 128] + end + AMDGPU.sync_workgroup() + if (i <= 64) + shared_mem[i] += shared_mem[i + 64] + end + AMDGPU.sync_workgroup() + if (i <= 32) + shared_mem[i] += shared_mem[i + 32] + end + AMDGPU.sync_workgroup() + if (i <= 16) + shared_mem[i] += shared_mem[i + 16] + end + AMDGPU.sync_workgroup() + if (i <= 8) + shared_mem[i] += shared_mem[i + 8] + end + AMDGPU.sync_workgroup() + if (i <= 4) + shared_mem[i] += shared_mem[i + 4] + end + AMDGPU.sync_workgroup() + if (i <= 2) + shared_mem[i] += shared_mem[i + 2] + end + AMDGPU.sync_workgroup() + if (i == 1) + shared_mem[i] += shared_mem[i + 1] + ret[1] = shared_mem[1] + end + AMDGPU.sync_workgroup() + return nothing +end + +function _multi_parallel_reduce_amdgpu_MN((M, N), dev_id, ret, f, x...) + shared_mem = @ROCStaticLocalArray(Float64, 16*16) + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + j = (workgroupIdx().y - 1) * workgroupDim().y + workitemIdx().y + ti = workitemIdx().x + tj = workitemIdx().y + bi = workgroupIdx().x + bj = workgroupIdx().y + + tmp::Float64 = 0.0 + shared_mem[((ti - 1) * 16) + tj] = tmp + + if (i <= M && j <= N) + tmp = @inbounds f(dev_id, i, j, x...) 
+ shared_mem[(ti - 1) * 16 + tj] = tmp + end + AMDGPU.sync_workgroup() + if (ti <= 8 && tj <= 8 && ti + 8 <= M && tj + 8 <= N) + shared_mem[((ti - 1) * 16) + tj] += shared_mem[((ti + 7) * 16) + (tj + 8)] + shared_mem[((ti - 1) * 16) + tj] += shared_mem[((ti - 1) * 16) + (tj + 8)] + shared_mem[((ti - 1) * 16) + tj] += shared_mem[((ti + 7) * 16) + tj] + end + AMDGPU.sync_workgroup() + if (ti <= 4 && tj <= 4 && ti + 4 <= M && tj + 4 <= N) + shared_mem[((ti - 1) * 16) + tj] += shared_mem[((ti + 3) * 16) + (tj + 4)] + shared_mem[((ti - 1) * 16) + tj] += shared_mem[((ti - 1) * 16) + (tj + 4)] + shared_mem[((ti - 1) * 16) + tj] += shared_mem[((ti + 3) * 16) + tj] + end + AMDGPU.sync_workgroup() + if (ti <= 2 && tj <= 2 && ti + 2 <= M && tj + 2 <= N) + shared_mem[((ti - 1) * 16) + tj] += shared_mem[((ti + 1) * 16) + (tj + 2)] + shared_mem[((ti - 1) * 16) + tj] += shared_mem[((ti - 1) * 16) + (tj + 2)] + shared_mem[((ti - 1) * 16) + tj] += shared_mem[((ti + 1) * 16) + tj] + end + AMDGPU.sync_workgroup() + if (ti == 1 && tj == 1 && ti + 1 <= M && tj + 1 <= N) + shared_mem[((ti - 1) * 16) + tj] += shared_mem[ti * 16 + (tj + 1)] + shared_mem[((ti - 1) * 16) + tj] += shared_mem[((ti - 1) * 16) + (tj + 1)] + shared_mem[((ti - 1) * 16) + tj] += shared_mem[ti * 16 + tj] + ret[bi, bj] = shared_mem[((ti - 1) * 16) + tj] + end + return nothing +end + +function _multi_reduce_kernel_amdgpu_MN((M, N), red, ret) + shared_mem = @ROCStaticLocalArray(Float64, 16*16) + i = workitemIdx().x + j = workitemIdx().y + ii = i + jj = j + + tmp::Float64 = 0.0 + shared_mem[(i - 1) * 16 + j] = tmp + + if M > 16 && N > 16 + while ii <= M + jj = workitemIdx().y + while jj <= N + tmp = tmp + @inbounds red[ii, jj] + jj += 16 + end + ii += 16 + end + elseif M > 16 + while ii <= N + tmp = tmp + @inbounds red[ii, jj] + ii += 16 + end + elseif N > 16 + while jj <= N + tmp = tmp + @inbounds red[ii, jj] + jj += 16 + end + elseif M <= 16 && N <= 16 + if i <= M && j <= N + tmp = tmp + @inbounds red[i, j] + end + end + shared_mem[(i - 1) * 16 + j] = tmp + red[i, j] = shared_mem[(i - 1) * 16 + j] + AMDGPU.sync_workgroup() + if (i <= 8 && j <= 8) + if (i + 8 <= M && j + 8 <= N) + shared_mem[((i - 1) * 16) + j] += shared_mem[((i + 7) * 16) + (j + 8)] + end + if (i <= M && j + 8 <= N) + shared_mem[((i - 1) * 16) + j] += shared_mem[((i - 1) * 16) + (j + 8)] + end + if (i + 8 <= M && j <= N) + shared_mem[((i - 1) * 16) + j] += shared_mem[((i + 7) * 16) + j] + end + end + AMDGPU.sync_workgroup() + if (i <= 4 && j <= 4) + if (i + 4 <= M && j + 4 <= N) + shared_mem[((i - 1) * 16) + j] += shared_mem[((i + 3) * 16) + (j + 4)] + end + if (i <= M && j + 4 <= N) + shared_mem[((i - 1) * 16) + j] += shared_mem[((i - 1) * 16) + (j + 4)] + end + if (i + 4 <= M && j <= N) + shared_mem[((i - 1) * 16) + j] += shared_mem[((i + 3) * 16) + j] + end + end + AMDGPU.sync_workgroup() + if (i <= 2 && j <= 2) + if (i + 2 <= M && j + 2 <= N) + shared_mem[((i - 1) * 16) + j] += shared_mem[((i + 1) * 16) + (j + 2)] + end + if (i <= M && j + 2 <= N) + shared_mem[((i - 1) * 16) + j] += shared_mem[((i - 1) * 16) + (j + 2)] + end + if (i + 2 <= M && j <= N) + shared_mem[((i - 1) * 16) + j] += shared_mem[((i + 1) * 16) + j] + end + end + AMDGPU.sync_workgroup() + if (i == 1 && j == 1) + if (i + 1 <= M && j + 1 <= N) + shared_mem[((i - 1) * 16) + j] += shared_mem[i * 16 + (j + 1)] + end + if (i <= M && j + 1 <= N) + shared_mem[((i - 1) * 16) + j] += shared_mem[((i - 1) * 16) + (j + 1)] + end + if (i + 1 <= M && j <= N) + shared_mem[((i - 1) * 16) + j] += shared_mem[i * 16 + j] 
+ end + ret[1] = shared_mem[((i - 1) * 16) + j] + end + return nothing +end + + + + +end # module multi diff --git a/ext/JACCCUDA/JACCCUDA.jl b/ext/JACCCUDA/JACCCUDA.jl index 20aa6a1..4c77d88 100644 --- a/ext/JACCCUDA/JACCCUDA.jl +++ b/ext/JACCCUDA/JACCCUDA.jl @@ -5,7 +5,6 @@ using JACC, CUDA # overloaded array functions include("array.jl") - include("JACCMULTI.jl") using .multi diff --git a/ext/JACCCUDA/JACCMULTI.jl b/ext/JACCCUDA/JACCMULTI.jl index 73e75a2..bdd6ef3 100644 --- a/ext/JACCCUDA/JACCMULTI.jl +++ b/ext/JACCCUDA/JACCMULTI.jl @@ -2,6 +2,10 @@ module multi using JACC, CUDA +function JACC.multi.ndev() + return length(devices()) +end + function JACC.multi.Array(x::Base.Array{T,N}) where {T,N} ndev = length(devices()) diff --git a/src/JACCMULTI.jl b/src/JACCMULTI.jl index b86be05..6dd4c10 100644 --- a/src/JACCMULTI.jl +++ b/src/JACCMULTI.jl @@ -2,6 +2,9 @@ module multi using JACC +function ndev() +end + function Array(x::Base.Array{T,N}) where {T,N} return x end diff --git a/test/tests_amdgpu.jl b/test/tests_amdgpu.jl index 669476e..d2364aa 100644 --- a/test/tests_amdgpu.jl +++ b/test/tests_amdgpu.jl @@ -265,3 +265,44 @@ end C_expected = Float32(2.0) .* ones(Float32, L, M, N) @test Array(C)≈C_expected rtol=1e-5 end + +#@testset "JACC.multi" begin + +# x = ones(1_000) +# y = ones(1_000) +# jx = JACC.multi.Array(x) +# jy = JACC.multi.Array(y) +# jxresult = ones(1_000) +# alpha = 2.0 + +# function seq_axpy(N, alpha, x, y) +# for i in 1:N +# @inbounds x[i] += alpha * y[i] +# end +# end + +# function multi_axpy(dev_id, i, alpha, x, y) +# @inbounds x[dev_id][i] += alpha * y[dev_id][i] +# end + +# function seq_dot(N, x, y) +# r = 0.0 +# for i in 1:N +# @inbounds r += x[i] * y[i] +# end +# return r +# end + +# function multi_dot(dev_id, i, x, y) +# @inbounds x[dev_id][i] * y[dev_id][i] +# end + + #ref_result = seq_axpy(1_000, x, y) + #jresult = JACC.multi.parallel_reduce(1_000, multi_dot, jx[1], jy[1]) +# seq_axpy(1_000, alpha, x, y) +# JACC.multi.parallel_for(1_000, multi_axpy, alpha, jx[1], jy[1]) + + #result = Base.Array(jresult) + + #@test jresult[1]≈ref_result rtol=1e-8 +#end From 1d1c415c26dadc8620da96fb11c0bee3cd824c0b Mon Sep 17 00:00:00 2001 From: pedrovalerolara Date: Mon, 21 Oct 2024 10:15:51 -0400 Subject: [PATCH 03/14] Update JACCMULTI.jl --- ext/JACCAMDGPU/JACCMULTI.jl | 3 --- 1 file changed, 3 deletions(-) diff --git a/ext/JACCAMDGPU/JACCMULTI.jl b/ext/JACCAMDGPU/JACCMULTI.jl index a1cfaf3..e7fb66a 100644 --- a/ext/JACCAMDGPU/JACCMULTI.jl +++ b/ext/JACCAMDGPU/JACCMULTI.jl @@ -502,7 +502,4 @@ function _multi_reduce_kernel_amdgpu_MN((M, N), red, ret) return nothing end - - - end # module multi From 6e04356a4f276e512f530e3cdd7f98ba05c4ecb1 Mon Sep 17 00:00:00 2001 From: William F Godoy Date: Mon, 21 Oct 2024 16:24:59 -0400 Subject: [PATCH 04/14] Update to latest checkout action --- .github/workflows/ci-gpu-AMD.yaml | 8 ++++---- .github/workflows/ci-gpu-NVIDIA.yaml | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ci-gpu-AMD.yaml b/.github/workflows/ci-gpu-AMD.yaml index e9a5c58..49a293b 100644 --- a/.github/workflows/ci-gpu-AMD.yaml +++ b/.github/workflows/ci-gpu-AMD.yaml @@ -36,11 +36,11 @@ jobs: - name: GitHub API Request if: steps.check.outputs.triggered == 'true' id: request - uses: octokit/request-action@v2.1.9 + uses: octokit/request-action@v2.x with: route: ${{github.event.issue.pull_request.url}} env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} # Create a separate PR status pointing at 
GitHub Actions tab URL # just like any other third-party service @@ -67,7 +67,7 @@ jobs: if: steps.check.outputs.triggered == 'true' uses: actions/checkout@v4 with: - token: ${{ secrets.GITHUB_TOKEN }} + token: ${{secrets.GITHUB_TOKEN}} repository: ${{fromJson(steps.request.outputs.data).head.repo.full_name}} ref: ${{steps.pr_data.outputs.branch}} @@ -92,7 +92,7 @@ jobs: if: always() && steps.check.outputs.triggered == 'true' uses: geekdude/github-status-action-v2@v1.1.10 with: - authToken: ${{ secrets.GITHUB_TOKEN }} + authToken: ${{secrets.GITHUB_TOKEN}} context: "ci-GPU-AMD ${{matrix.jobname}}" state: ${{job.status}} sha: ${{fromJson(steps.request.outputs.data).head.sha}} diff --git a/.github/workflows/ci-gpu-NVIDIA.yaml b/.github/workflows/ci-gpu-NVIDIA.yaml index 0b67c93..8ebe935 100644 --- a/.github/workflows/ci-gpu-NVIDIA.yaml +++ b/.github/workflows/ci-gpu-NVIDIA.yaml @@ -36,7 +36,7 @@ jobs: - name: GitHub API Request if: steps.check.outputs.triggered == 'true' id: request - uses: octokit/request-action@v2.1.9 + uses: octokit/request-action@v2.x with: route: ${{github.event.issue.pull_request.url}} env: @@ -49,7 +49,7 @@ jobs: uses: geekdude/github-status-action-v2@v1.1.10 with: authToken: ${{secrets.GITHUB_TOKEN}} - context: "ci-gpu-nvidia-ornl ${{ matrix.jobname }}" + context: "ci-gpu-NVIDIA ${{ matrix.jobname }}" state: "pending" sha: ${{fromJson(steps.request.outputs.data).head.sha}} target_url: https://github.com/${{github.repository}}/actions/runs/${{github.run_id}} @@ -86,7 +86,7 @@ jobs: uses: geekdude/github-status-action-v2@v1.1.10 with: authToken: ${{secrets.GITHUB_TOKEN}} - context: "ci-gpu-nvidia-ornl ${{matrix.jobname}}" + context: "ci-gpu-NVIDIA ${{matrix.jobname}}" state: ${{job.status}} sha: ${{fromJson(steps.request.outputs.data).head.sha}} target_url: https://github.com/${{github.repository}}/actions/runs/${{github.run_id}} From ae4759bddb1e4266ea60568d2dfc1b900ca4ac7e Mon Sep 17 00:00:00 2001 From: Philip Fackler Date: Mon, 28 Oct 2024 09:36:38 -0500 Subject: [PATCH 05/14] Revert AMDGPU to v1.0 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 52629e0..419afe9 100644 --- a/Project.toml +++ b/Project.toml @@ -18,7 +18,7 @@ JACCCUDA = ["CUDA"] JACCONEAPI = ["oneAPI"] [compat] -AMDGPU = "0.8" +AMDGPU = "1.0" Atomix = "0.1" CUDA = "5" Preferences = "1.4.0" From 67bc4989cc0ef2cf28de3489cd1296ff6f480781 Mon Sep 17 00:00:00 2001 From: Philip Fackler Date: Mon, 28 Oct 2024 09:53:53 -0500 Subject: [PATCH 06/14] Remove version from runtests --- test/runtests.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 73df172..b17a5c9 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -11,7 +11,8 @@ const backend = JACC.JACCPreferences.backend include("tests_cuda.jl") elseif backend == "amdgpu" - Pkg.add(; name = "AMDGPU", version = "v0.8.6") + # Pkg.add(; name = "AMDGPU", version = "v0.8.6") + Pkg.add("AMDGPU") println("AMDGPU backend loaded") include("tests_amdgpu.jl") From a58d1ada6d3246eb85e74c13eb03ba86afcaf1f5 Mon Sep 17 00:00:00 2001 From: Philip Fackler Date: Mon, 28 Oct 2024 10:01:36 -0500 Subject: [PATCH 07/14] Allow more recent versions of julia --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 419afe9..599c8f7 100644 --- a/Project.toml +++ b/Project.toml @@ -22,4 +22,4 @@ AMDGPU = "1.0" Atomix = "0.1" CUDA = "5" Preferences = "1.4.0" -julia = "1.9.0" +julia = ">= 1.9.0" 
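A minimal usage sketch of the backend-dispatch layer introduced in PATCH 01, assuming the default threads backend selected through JACCPreferences; the axpy and dot kernels below are illustrative and not part of these patches. The same calls dispatch to the CUDA, AMDGPU, or oneAPI extension methods once the corresponding backend preference is set.

using JACC

# user kernels: the first argument is the (thread- or GPU-) local index
axpy(i, alpha, x, y) = @inbounds x[i] += alpha * y[i]
dotprod(i, x, y) = @inbounds x[i] * y[i]

N = 1_000
x = JACC.ones(Float64, N)    # allocated through array_type(default_backend())
y = JACC.ones(Float64, N)

JACC.parallel_for(N, axpy, 2.0, x, y)         # forwards to parallel_for(default_backend(), ...)
r = JACC.parallel_reduce(N, dotprod, x, y)    # sum reduction; result is the one-element array r
# r[1] == 3.0 * N for this data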
From c327dcaf29737804349f9c212b41ab9584079629 Mon Sep 17 00:00:00 2001 From: pedrovalerolara Date: Tue, 29 Oct 2024 12:10:39 -0400 Subject: [PATCH 08/14] Added stencil-aware funtions for JACC.multi on CUDA back ends. There are still a couple of things to implement, such as the ghost copy functions for 2D arrays as well as capability for 3D Arrays --- ext/JACCCUDA/JACCMULTI.jl | 288 +++++++++++++++++++++++++++++++++++++- src/JACCMULTI.jl | 15 ++ 2 files changed, 300 insertions(+), 3 deletions(-) diff --git a/ext/JACCCUDA/JACCMULTI.jl b/ext/JACCCUDA/JACCMULTI.jl index bdd6ef3..a43800a 100644 --- a/ext/JACCCUDA/JACCMULTI.jl +++ b/ext/JACCCUDA/JACCMULTI.jl @@ -56,24 +56,235 @@ function JACC.multi.Array(x::Base.Array{T,N}) where {T,N} end +function JACC.multi.gArray(x::Base.Array{T,N}) where {T,N} + + ndev = length(devices()) + ret = Vector{Any}(undef, 2) + + if ndims(x) == 1 + + device!(0) + s_array = length(x) + s_arrays = ceil(Int, s_array/ndev) + array_ret = Vector{Any}(undef, ndev) + pointer_ret = Vector{CuDeviceVector{T,CUDA.AS.Global}}(undef, ndev) + + for i in 1:ndev + device!(i-1) + if i == 1 + array_ret[i] = CuArray(x[((i-1)*s_arrays)+1:(i*s_arrays)+1]) + elseif i == ndev + array_ret[i] = CuArray(x[(((i-1)*s_arrays)+1)-1:i*s_arrays]) + else + array_ret[i] = CuArray(x[(((i-1)*s_arrays)+1)-1:(i*s_arrays)+1]) + end + pointer_ret[i] = cudaconvert(array_ret[i]) + end + + device!(0) + cuda_pointer_ret = CuArray(pointer_ret) + ret[1] = cuda_pointer_ret + ret[2] = array_ret + + elseif ndims(x) == 2 + + device!(0) + s_col_array = size(x,2) + s_col_arrays = ceil(Int, s_col_array/ndev) + array_ret = Vector{Any}(undef, ndev) + pointer_ret = Vector{CuDeviceMatrix{T,CUDA.AS.Global}}(undef, ndev) + + for i in 1:ndev + device!(i-1) + array_ret[i] = CuArray(x[:,((i-1)*s_col_arrays)+1:i*s_col_arrays]) + pointer_ret[i] = cudaconvert(array_ret[i]) + end + + device!(0) + + cuda_pointer_ret = CuArray(pointer_ret) + ret[1] = cuda_pointer_ret + ret[2] = array_ret + + end + + return ret + +end + function JACC.multi.copy(x::Vector{Any}, y::Vector{Any}) + + device!(0) + ndev = length(devices()) + + if ndims(x[2][1]) == 1 + + for i in 1:ndev + device!(i-1) + size = length(x[2][i]) + numThreads = 512 + threads = min(size, numThreads) + blocks = ceil(Int, size / threads) + @cuda threads=threads blocks=blocks _multi_copy(i, x[1], y[1]) + end + + for i in 1:ndev + device!(i-1) + synchronize() + end + + elseif ndims(x[2][1]) == 2 + + for i in 1:ndev + device!(i-1) + ssize = size(x[2][i]) + numThreads = 16 + Mthreads = min(ssize[1], numThreads) + Mblocks = ceil(Int, ssize[1] / Mthreads) + Nthreads = min(ssize[2], numThreads) + Nblocks = ceil(Int, ssize[2] / Nthreads) + @cuda threads=(Mthreads,Nthreads) blocks=(Mblocks,Nblocks) _multi_copy_2d(i, x[1], y[1]) + end + + for i in 1:ndev + device!(i-1) + synchronize() + end + + end + + device!(0) + +end + +function JACC.multi.gid(dev_id::I, i::I, ndev::I) where{I <: Integer} + ind = 0 + if dev_id == 1 + ind = i + elseif dev_id == ndev + ind = i + 1 + else + ind = i + 1 + end + return ind +end + + +function JACC.multi.gswap(x::Vector{Any}) + device!(0) ndev = length(devices()) + + if ndims(x[2][1]) == 1 + + #Left to right swapping + for i in 1:ndev-1 + device!(i-1) + tmp = Base.Array(x[2][i]) + size = length(tmp) + ghost_lr = tmp[size-1] + device!(i) + @cuda threads=32 blocks=1 _multi_swap_ghost_lr(i+1, x[1], ndev, size, ghost_lr) + end + + #Right to left swapping + for i in 2:ndev + device!(i-1) + tmp = Base.Array(x[2][i]) + size = length(tmp) + ghost_rl = tmp[2] + 
device!(i-2) + @cuda threads=32 blocks=1 _multi_swap_ghost_rl(i-1, x[1], ndev, size, ghost_rl) + end + + for i in 1:ndev + device!(i-1) + synchronize() + end + + elseif ndims(x[2][1]) == 2 + #Left to right swapping + for i in 1:ndev-1 + device!(i-1) + dim = size(x[2][i]) + tmp = Base.Array(x[2][i][:,dim[2]-1]) + device!(i) + ghost_lr = CuArray(tmp) + numThreads = 512 + threads = min(dim[1], numThreads) + blocks = ceil(Int, dim[1] / threads) + @cuda threads=threads blocks=blocks _multi_swap_2d_ghost_lr(i+1, x[1], ndev, dim[1], ghost_lr) + end + + #Right to left swapping + for i in 2:ndev + device!(i-1) + tmp = Base.Array(x[2][i][:,2]) + device!(i-2) + dim = size(x[2][i-1]) + ghost_rl = CuArray(tmp) + numThreads = 512 + threads = min(dim[1], numThreads) + blocks = ceil(Int, dim[1] / threads) + @cuda threads=threads blocks=blocks _multi_swap_2d_ghost_rl(i-1, x[1], ndev, dim[1], dim[2], ghost_rl) + end + + for i in 1:ndev + device!(i-1) + synchronize() + end + + end + + device!(0) + +end + +function JACC.multi.gcopytoarray(x::Vector{Any}, y::Vector{Any}) + #x is the array and y is the ghost array + device!(0) + ndev = length(devices()) + for i in 1:ndev device!(i-1) - size = length(x[2][i]) + size = length(y[2][i]) numThreads = 512 threads = min(size, numThreads) blocks = ceil(Int, size / threads) - @cuda threads=threads blocks=blocks _multi_copy(i, x[1], y[1]) + @cuda threads=threads blocks=blocks _multi_copy_ghosttoarray(i, x[1], y[1], size, ndev) + #synchronize() end - + for i in 1:ndev device!(i-1) synchronize() end + + device!(0) + +end +function JACC.multi.copytogarray(x::Vector{Any}, y::Vector{Any}) + #x is the ghost array and y is the array + device!(0) + ndev = length(devices()) + + for i in 1:ndev + device!(i-1) + size = length(x[2][i]) + numThreads = 512 + threads = min(size, numThreads) + blocks = ceil(Int, size / threads) + @cuda threads=threads blocks=blocks _multi_copy_arraytoghost(i, x[1], y[1], size, ndev) + #synchronize() + end + + for i in 1:ndev + device!(i-1) + synchronize() + end + device!(0) end @@ -215,6 +426,77 @@ function JACC.multi.parallel_reduce( end +function _multi_copy(dev_id, x, y) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + @inbounds x[dev_id][i] = y[dev_id][i] + return nothing +end + +function _multi_copy_2d(dev_id, x, y) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + @inbounds x[dev_id][i,j] = y[dev_id][i,j] + return nothing +end + +function _multi_copy_ghosttoarray(dev_id, x, y, size, ndev) + #x is the array and y is the ghost array + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if dev_id == 1 && i < size + @inbounds x[dev_id][i] = y[dev_id][i] + elseif dev_id == ndev && i > 1 + @inbounds x[dev_id][i-1] = y[dev_id][i] + elseif i > 1 && i < size + @inbounds x[dev_id][i-1] = y[dev_id][i] + end + return nothing +end + +function _multi_copy_arraytoghost(dev_id, x, y, size, ndev) + #x is the ghost array and y is the array + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if dev_id == 1 && i < size + @inbounds x[dev_id][i] = y[dev_id][i] + elseif dev_id == ndev && i < size + @inbounds x[dev_id][i+1] = y[dev_id][i] + elseif i > 1 && i < size + @inbounds x[dev_id][i] = y[dev_id][i] + end + return nothing +end + +function _multi_swap_ghost_lr(dev_id, x, ndev, size, ghost) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if i == 1 + x[dev_id][i] = ghost + end + return nothing +end + +function _multi_swap_2d_ghost_lr(dev_id, x, ndev, size, ghost) + i = 
(blockIdx().x - 1) * blockDim().x + threadIdx().x + if i < size + 1 + x[dev_id][i, 1] = ghost[i] + end + return nothing +end + +function _multi_swap_ghost_rl(dev_id, x, ndev, size, ghost) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if i == 1 + x[dev_id][size] = ghost + end + return nothing +end + +function _multi_swap_2d_ghost_rl(dev_id, x, ndev, size, col, ghost) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if i < size + 1 + x[dev_id][i, col] = ghost[i] + end + return nothing +end + function _multi_parallel_for_cuda(N, dev_id, f, x...) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x if i <= N diff --git a/src/JACCMULTI.jl b/src/JACCMULTI.jl index 6dd4c10..3457d3c 100644 --- a/src/JACCMULTI.jl +++ b/src/JACCMULTI.jl @@ -9,6 +9,21 @@ function Array(x::Base.Array{T,N}) where {T,N} return x end +function gArray(x::Base.Array{T,N}) where {T,N} +end + +function gid(dev_id::I, i::I, ndev::I) where{I <: Integer} +end + +function gswap(x::Vector{Any}) +end + +function gcopytoarray(x::Vector{Any}, y::Vector{Any}) +end + +function copytogarray(x::Vector{Any}, y::Vector{Any}) +end + function copy(x::Vector{Any}, y::Vector{Any}) end From ffe94481ca2397af194c4d82a5eee3ddf618c013 Mon Sep 17 00:00:00 2001 From: Philip Fackler Date: Wed, 30 Oct 2024 10:45:53 -0500 Subject: [PATCH 09/14] Fix errors and restrict version --- Project.toml | 2 +- ext/JACCAMDGPU/JACCMULTI.jl | 1 + test/runtests.jl | 4 +- test/tests_amdgpu.jl | 104 ++++++++++++++++++------------------ 4 files changed, 56 insertions(+), 55 deletions(-) diff --git a/Project.toml b/Project.toml index 599c8f7..6e04426 100644 --- a/Project.toml +++ b/Project.toml @@ -18,7 +18,7 @@ JACCCUDA = ["CUDA"] JACCONEAPI = ["oneAPI"] [compat] -AMDGPU = "1.0" +AMDGPU = "0.8" Atomix = "0.1" CUDA = "5" Preferences = "1.4.0" diff --git a/ext/JACCAMDGPU/JACCMULTI.jl b/ext/JACCAMDGPU/JACCMULTI.jl index c5aefce..8af4bd3 100644 --- a/ext/JACCAMDGPU/JACCMULTI.jl +++ b/ext/JACCAMDGPU/JACCMULTI.jl @@ -1,6 +1,7 @@ module multi using JACC, AMDGPU +using JACCAMDGPU: AMDGPUBackend function JACC.multi.ndev(::AMDGPUBackend) return length(AMDGPU.devices()) diff --git a/test/runtests.jl b/test/runtests.jl index b17a5c9..e9a04e7 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -11,8 +11,8 @@ const backend = JACC.JACCPreferences.backend include("tests_cuda.jl") elseif backend == "amdgpu" - # Pkg.add(; name = "AMDGPU", version = "v0.8.6") - Pkg.add("AMDGPU") + Pkg.add(; name = "AMDGPU", version = "v0.8.6") + # Pkg.add("AMDGPU") println("AMDGPU backend loaded") include("tests_amdgpu.jl") diff --git a/test/tests_amdgpu.jl b/test/tests_amdgpu.jl index d2364aa..61a6742 100644 --- a/test/tests_amdgpu.jl +++ b/test/tests_amdgpu.jl @@ -147,57 +147,57 @@ end # result = Array(jresult) # @test result[1]≈ref_result rtol=1e-8 - x = ones(1_000) - y = ones(1_000) - y1 = y*2 - jx = JACC.ones(1_000) - jy = JACC.ones(1_000) - jy1 = jy*2 - alpha = 2.0 - - function seq_axpy(N, alpha, x, y) - for i in 1:N - @inbounds x[i] += alpha * y[i] - end - end - - function seq_dot(N, x, y) - r = 0.0 - for i in 1:N - @inbounds r += x[i] * y[i] - end - return r - end - function seq_scal(N, alpha, x) - for i in 1:N - @inbounds x[i] = alpha * x[i] - end - end - - function seq_asum(N, x) - r = 0.0 - for i in 1:N - @inbounds r += abs(x[i]) - end - return r - end - - function seq_nrm2(N, x) - sum_sq = 0.0 - for i in 1:N - @inbounds sum_sq += x[i]*x[i] - end - r = sqrt(sum_sq) - return r - end - - function seq_swap(N, x, y1) - for i in 1:N - @inbounds t = x[i] - @inbounds x[i] = 
y1[i] - @inbounds y1[i] = t - end - end +# x = ones(1_000) +# y = ones(1_000) +# y1 = y*2 +# jx = JACC.ones(1_000) +# jy = JACC.ones(1_000) +# jy1 = jy*2 +# alpha = 2.0 +# +# function seq_axpy(N, alpha, x, y) +# for i in 1:N +# @inbounds x[i] += alpha * y[i] +# end +# end +# +# function seq_dot(N, x, y) +# r = 0.0 +# for i in 1:N +# @inbounds r += x[i] * y[i] +# end +# return r +# end +# function seq_scal(N, alpha, x) +# for i in 1:N +# @inbounds x[i] = alpha * x[i] +# end +# end +# +# function seq_asum(N, x) +# r = 0.0 +# for i in 1:N +# @inbounds r += abs(x[i]) +# end +# return r +# end +# +# function seq_nrm2(N, x) +# sum_sq = 0.0 +# for i in 1:N +# @inbounds sum_sq += x[i]*x[i] +# end +# r = sqrt(sum_sq) +# return r +# end +# +# function seq_swap(N, x, y1) +# for i in 1:N +# @inbounds t = x[i] +# @inbounds x[i] = y1[i] +# @inbounds y1[i] = t +# end +# end # ref_result = seq_axpy(1_000, alpha, x, y) # ref_result = seq_dot(1_000, x, y) @@ -229,7 +229,7 @@ end # @test x == Array(jx) # @test y1 == Array(jy1) -end +# end @testset "Add-2D" begin function add!(i, j, A, B, C) From 7ca63bbb74ff6140a1c1661be3a5e88ffd92efa8 Mon Sep 17 00:00:00 2001 From: William F Godoy Date: Wed, 30 Oct 2024 14:31:45 -0400 Subject: [PATCH 10/14] Bump Julia to 1.10 on AMD GPU CI --- .github/workflows/ci-gpu-AMD.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci-gpu-AMD.yaml b/.github/workflows/ci-gpu-AMD.yaml index 49a293b..815f26e 100644 --- a/.github/workflows/ci-gpu-AMD.yaml +++ b/.github/workflows/ci-gpu-AMD.yaml @@ -19,7 +19,7 @@ jobs: strategy: fail-fast: false matrix: - jobname: [ROCM6-JULIA1_9_1-AMDGPU0_8_6] + jobname: [ROCM6-JULIA1_10_4-AMDGPU0_8_6] steps: # Only trigger CI for certain "actors" (those commenting the PR, not the PR originator) @@ -75,7 +75,7 @@ jobs: if: steps.check.outputs.triggered == 'true' run: | source /etc/profile.d/lmod.sh - module load julia/1.9.1 + module load julia/1.10.4 module load rocm julia --project -e 'using Pkg; Pkg.instantiate()' julia --project -e 'using JACC.JACCPreferences; JACCPreferences.set_backend("AMDGPU")' @@ -84,7 +84,7 @@ jobs: if: steps.check.outputs.triggered == 'true' run: | source /etc/profile.d/lmod.sh - module load julia/1.9.1 + module load julia/1.10.4 module load rocm julia --project -e 'using Pkg; Pkg.test()' From c286f946e15603415dd42c205893905f3a8cf868 Mon Sep 17 00:00:00 2001 From: Philip Fackler Date: Wed, 30 Oct 2024 14:31:17 -0500 Subject: [PATCH 11/14] Added range checks to parallel_for implementations --- ext/JACCAMDGPU/JACCAMDGPU.jl | 18 ++++++++++++------ ext/JACCCUDA/JACCCUDA.jl | 22 +++++++++++++--------- ext/JACCONEAPI/JACCONEAPI.jl | 18 ++++++++++++------ 3 files changed, 37 insertions(+), 21 deletions(-) diff --git a/ext/JACCAMDGPU/JACCAMDGPU.jl b/ext/JACCAMDGPU/JACCAMDGPU.jl index 2b901c6..35f69d0 100644 --- a/ext/JACCAMDGPU/JACCAMDGPU.jl +++ b/ext/JACCAMDGPU/JACCAMDGPU.jl @@ -23,7 +23,7 @@ function JACC.parallel_for(::AMDGPUBackend, N::I, f::F, x...) where {I <: Intege # shmem_size = attribute(device(),CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK) # We must know how to get the max shared memory to be used in AMDGPU as it is done in CUDA shmem_size = 2 * threads * sizeof(Float64) - @roc groupsize = threads gridsize = blocks shmem = shmem_size _parallel_for_amdgpu(f, x...) + @roc groupsize = threads gridsize = blocks shmem = shmem_size _parallel_for_amdgpu(N, f, x...) 
AMDGPU.synchronize() end @@ -37,7 +37,7 @@ function JACC.parallel_for( # shmem_size = attribute(device(),CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK) # We must know how to get the max shared memory to be used in AMDGPU as it is done in CUDA shmem_size = 2 * Mthreads * Nthreads * sizeof(Float64) - @roc groupsize = (Mthreads, Nthreads) gridsize = (Mblocks, Nblocks) shmem = shmem_size _parallel_for_amdgpu_MN(f, x...) + @roc groupsize = (Mthreads, Nthreads) gridsize = (Mblocks, Nblocks) shmem = shmem_size _parallel_for_amdgpu_MN((M,N), f, x...) AMDGPU.synchronize() end @@ -54,7 +54,7 @@ function JACC.parallel_for( # shmem_size = attribute(device(),CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK) # We must know how to get the max shared memory to be used in AMDGPU as it is done in CUDA shmem_size = 2 * Lthreads * Mthreads * Nthreads * sizeof(Float64) - @roc groupsize = (Lthreads, Mthreads, Nthreads) gridsize = (Lblocks, Mblocks, Nblocks) shmem = shmem_size _parallel_for_amdgpu_LMN(f, x...) + @roc groupsize = (Lthreads, Mthreads, Nthreads) gridsize = (Lblocks, Mblocks, Nblocks) shmem = shmem_size _parallel_for_amdgpu_LMN((L,M,N), f, x...) AMDGPU.synchronize() end @@ -92,23 +92,29 @@ function JACC.parallel_reduce( return rret end -function _parallel_for_amdgpu(f, x...) +function _parallel_for_amdgpu(N, f, x...) i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + i > N && return nothing f(i, x...) return nothing end -function _parallel_for_amdgpu_MN(f, x...) +function _parallel_for_amdgpu_MN((M,N), f, x...) i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x j = (workgroupIdx().y - 1) * workgroupDim().y + workitemIdx().y + i > M && return nothing + j > N && return nothing f(i, j, x...) return nothing end -function _parallel_for_amdgpu_LMN(f, x...) +function _parallel_for_amdgpu_LMN((L,M,N), f, x...) i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x j = (workgroupIdx().y - 1) * workgroupDim().y + workitemIdx().y k = (workgroupIdx().z - 1) * workgroupDim().z + workitemIdx().z + i > L && return nothing + j > M && return nothing + k > N && return nothing f(i, j, k, x...) return nothing end diff --git a/ext/JACCCUDA/JACCCUDA.jl b/ext/JACCCUDA/JACCCUDA.jl index 517f3e8..f3fd39c 100644 --- a/ext/JACCCUDA/JACCCUDA.jl +++ b/ext/JACCCUDA/JACCCUDA.jl @@ -39,7 +39,7 @@ function JACC.parallel_for( Mblocks = ceil(Int, M / Mthreads) Nblocks = ceil(Int, N / Nthreads) shmem_size = attribute(device(),CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK) - CUDA.@sync @cuda threads = (Mthreads, Nthreads) blocks = (Mblocks, Nblocks) shmem = shmem_size _parallel_for_cuda_MN(f, x...) + CUDA.@sync @cuda threads = (Mthreads, Nthreads) blocks = (Mblocks, Nblocks) shmem = shmem_size _parallel_for_cuda_MN((M,N), f, x...) end function JACC.parallel_for( @@ -55,7 +55,7 @@ function JACC.parallel_for( Mblocks = ceil(Int, M / Mthreads) Nblocks = ceil(Int, N / Nthreads) shmem_size = attribute(device(),CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK) - CUDA.@sync @cuda threads = (Lthreads, Mthreads, Nthreads) blocks = (Lblocks, Mblocks, Nblocks) shmem = shmem_size _parallel_for_cuda_LMN(f, x...) + CUDA.@sync @cuda threads = (Lthreads, Mthreads, Nthreads) blocks = (Lblocks, Mblocks, Nblocks) shmem = shmem_size _parallel_for_cuda_LMN((L,M,N), f, x...) end function JACC.parallel_reduce( @@ -93,23 +93,27 @@ end function _parallel_for_cuda(N, f, x...) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - if i <= N - f(i, x...) - end + i > N && return nothing + f(i, x...) 
return nothing end -function _parallel_for_cuda_MN(f, x...) - j = (blockIdx().x - 1) * blockDim().x + threadIdx().x - i = (blockIdx().y - 1) * blockDim().y + threadIdx().y +function _parallel_for_cuda_MN((M,N), f, x...) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + i > M && return nothing + j > N && return nothing f(i, j, x...) return nothing end -function _parallel_for_cuda_LMN(f, x...) +function _parallel_for_cuda_LMN((L,M,N), f, x...) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x j = (blockIdx().y - 1) * blockDim().y + threadIdx().y k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + i > L && return nothing + j > M && return nothing + k > N && return nothing f(i, j, k, x...) return nothing end diff --git a/ext/JACCONEAPI/JACCONEAPI.jl b/ext/JACCONEAPI/JACCONEAPI.jl index 929146c..8d9ad14 100644 --- a/ext/JACCONEAPI/JACCONEAPI.jl +++ b/ext/JACCONEAPI/JACCONEAPI.jl @@ -20,7 +20,7 @@ function JACC.parallel_for(::oneAPIBackend, N::I, f::F, x...) where {I <: Intege # We must know how to get the max shared memory to be used in oneAPI as it is done in CUDA #shmem_size = 2 * threads * sizeof(Float64) #oneAPI.@sync @oneapi items = items groups = groups shmem = shmem_size _parallel_for_oneapi(f, x...) - oneAPI.@sync @oneapi items = items groups = groups _parallel_for_oneapi(f, x...) + oneAPI.@sync @oneapi items = items groups = groups _parallel_for_oneapi(N, f, x...) end function JACC.parallel_for( @@ -30,7 +30,7 @@ function JACC.parallel_for( Nitems = min(N, maxPossibleItems) Mgroups = ceil(Int, M / Mitems) Ngroups = ceil(Int, N / Nitems) - oneAPI.@sync @oneapi items=(Mitems, Nitems) groups=(Mgroups, Ngroups) _parallel_for_oneapi_MN( + oneAPI.@sync @oneapi items=(Mitems, Nitems) groups=(Mgroups, Ngroups) _parallel_for_oneapi_MN((M,N), f, x...) end @@ -44,7 +44,7 @@ function JACC.parallel_for( Mgroups = ceil(Int, M / Mitems) Ngroups = ceil(Int, N / Nitems) oneAPI.@sync @oneapi items=(Litems, Mitems, Nitems) groups=( - Lgroups, Mgroups, Ngroups) _parallel_for_oneapi_LMN( + Lgroups, Mgroups, Ngroups) _parallel_for_oneapi_LMN((L,M,N), f, x...) end @@ -77,23 +77,29 @@ function JACC.parallel_reduce( return rret end -function _parallel_for_oneapi(f, x...) +function _parallel_for_oneapi(N, f, x...) i = get_global_id() + i > N && return nothing f(i, x...) return nothing end -function _parallel_for_oneapi_MN(f, x...) +function _parallel_for_oneapi_MN((M,N), f, x...) j = get_global_id(0) i = get_global_id(1) + i > M && return nothing + j > N && return nothing f(i, j, x...) return nothing end -function _parallel_for_oneapi_LMN(f, x...) +function _parallel_for_oneapi_LMN((L,M,N), f, x...) i = get_global_id(0) j = get_global_id(1) k = get_global_id(2) + i > L && return nothing + j > M && return nothing + k > N && return nothing f(i, j, k, x...) return nothing end From 242418e0fe035ccccdc4c6d3f22fc3c51f0435e7 Mon Sep 17 00:00:00 2001 From: pedrovalerolara Date: Thu, 31 Oct 2024 11:30:54 -0400 Subject: [PATCH 12/14] Added ghost cells support for JACC.multi on AMDGPU backend. 
There is still work to do to support bidimensional and tridimensional arrays --- ext/JACCAMDGPU/JACCMULTI.jl | 322 +++++++++++++++++++++++++++++++++++- ext/JACCCUDA/JACCMULTI.jl | 15 ++ 2 files changed, 335 insertions(+), 2 deletions(-) diff --git a/ext/JACCAMDGPU/JACCMULTI.jl b/ext/JACCAMDGPU/JACCMULTI.jl index 8af4bd3..c42cf15 100644 --- a/ext/JACCAMDGPU/JACCMULTI.jl +++ b/ext/JACCAMDGPU/JACCMULTI.jl @@ -69,8 +69,255 @@ function JACC.multi.Array(::AMDGPUBackend, x::Base.Array{T,N}) where {T,N} end +function JACC.multi.gArray(::AMDGPUBackend, x::Base.Array{T,N}) where {T,N} + + ndev = length(AMDGPU.devices()) + ret = Vector{Any}(undef, 2) + + if ndims(x) == 1 + + AMDGPU.device!(AMDGPU.device(1)) + s_array = length(x) + s_arrays = ceil(Int, s_array/ndev) + array_ret = Vector{Any}(undef, ndev) + pointer_ret = Vector{AMDGPU.Device.ROCDeviceVector{T,AMDGPU.AS.Global}}(undef, ndev) + + for i in 1:ndev + AMDGPU.device!(AMDGPU.device(i)) + if i == 1 + array_ret[i] = ROCArray(x[((i-1)*s_arrays)+1:(i*s_arrays)+1]) + elseif i == ndev + array_ret[i] = ROCArray(x[(((i-1)*s_arrays)+1)-1:i*s_arrays]) + else + array_ret[i] = ROCArray(x[(((i-1)*s_arrays)+1)-1:(i*s_arrays)+1]) + end + pointer_ret[i] = AMDGPU.rocconvert(array_ret[i]) + end + + AMDGPU.device!(AMDGPU.device(1)) + #amdgpu_pointer_ret = ROCArray(pointer_ret) + amdgpu_pointer_ret = get_portable_rocarray(pointer_ret) + copyto!(amdgpu_pointer_ret, pointer_ret) + ret[1] = amdgpu_pointer_ret + ret[2] = array_ret + + elseif ndims(x) == 2 + + AMDGPU.device!(AMDGPU.device(1)) + #s_row_array = size(x,1) + s_col_array = size(x,2) + s_col_arrays = ceil(Int, s_col_array/ndev) + #println(s_col_arrays) + array_ret = Vector{Any}(undef, ndev) + pointer_ret = Vector{AMDGPU.Device.ROCDeviceMatrix{T,1}}(undef, ndev) + + i_col_arrays = floor(Int, s_col_array/ndev) + for i in 1:ndev + AMDGPU.device!(AMDGPU.device(i)) + if i == 1 + array_ret[i] = ROCArray(x[:,((i-1)*s_col_arrays)+1:i*s_col_arrays+1]) + elseif i == ndev + array_ret[i] = ROCArray(x[:,(((i-1)*s_col_arrays)+1)-1:i*s_col_arrays]) + else + array_ret[i] = ROCArray(x[:,(((i-1)*s_col_arrays)+1)-1:i*s_col_arrays+1]) + end + pointer_ret[i] = AMDGPU.rocconvert(array_ret[i]) + end + + AMDGPU.device!(AMDGPU.device(1)) + #amdgpu_pointer_ret = ROCArray(pointer_ret) + amdgpu_pointer_ret = get_portable_rocarray(pointer_ret) + copyto!(amdgpu_pointer_ret, pointer_ret) + ret[1] = amdgpu_pointer_ret + ret[2] = array_ret + + end + + return ret + +end + function JACC.multi.copy(::AMDGPUBackend, x::Vector{Any}, y::Vector{Any}) + AMDGPU.device!(AMDGPU.device(1)) + ndev = length(AMDGPU.devices()) + + if ndims(x[2][1]) == 1 + + for i in 1:ndev + AMDGPU.device!(AMDGPU.device(i)) + size = length(x[2][i]) + numThreads = 512 + threads = min(size, numThreads) + blocks = ceil(Int, size / threads) + @roc groupsize=threads gridsize=blocks _multi_copy(i, x[1], y[1]) + end + + for i in 1:ndev + AMDGPU.device!(AMDGPU.device(i)) + AMDGPU.synchronize() + end + + elseif ndims(x[2][1]) == 2 + + for i in 1:ndev + AMDGPU.device!(AMDGPU.device(i)) + ssize = size(x[2][i]) + numThreads = 16 + Mthreads = min(ssize[1], numThreads) + Mblocks = ceil(Int, ssize[1] / Mthreads) + Nthreads = min(ssize[2], numThreads) + Nblocks = ceil(Int, ssize[2] / Nthreads) + @roc groupsize=(Mthreads,Nthreads) gridsize=(Mblocks,Nblocks) _multi_copy_2d(i, x[1], y[1]) + end + + for i in 1:ndev + AMDGPU.device!(AMDGPU.device(i)) + AMDGPU.synchronize() + end + + end + + AMDGPU.device!(AMDGPU.device(1)) + +end + +function JACC.multi.gid(::AMDGPUBackend, dev_id::I, i::I, 
ndev::I) where{I <: Integer} + + ind = 0 + + if dev_id == 1 + ind = i + elseif dev_id == ndev + ind = i + 1 + else + ind = i + 1 + end + + return ind +end + +function JACC.multi.gid(::AMDGPUBackend, dev_id::I, (i,j)::Tuple{I,I}, ndev::I) where{I <: Integer} + + ind = (0, 0) + + if dev_id == 1 + ind = (i, j) + elseif dev_id == ndev + ind = (i, j+1) + else + ind = (i, j+1) + end + + return ind + +end + +function JACC.multi.gswap(::AMDGPUBackend, x::Vector{Any}) + + AMDGPU.device!(AMDGPU.device(1)) + ndev = length(AMDGPU.devices()) + + if ndims(x[2][1]) == 1 + + #Left to right swapping + for i in 1:ndev-1 + AMDGPU.device!(AMDGPU.device(i)) + tmp = Base.Array(x[2][i]) + size = length(tmp) + ghost_lr = tmp[size-1] + AMDGPU.device!(AMDGPU.device(i+1)) + @roc groupsize=32 gridsize=1 _multi_swap_ghost_lr(i+1, x[1], ndev, size, ghost_lr) + end + + #Right to left swapping + for i in 2:ndev + AMDGPU.device!(AMDGPU.device(i)) + tmp = Base.Array(x[2][i]) + if (i - 1) == 1 + size = length(tmp) - 1 + else + size = length(tmp) + end + ghost_rl = tmp[2] + AMDGPU.device!(AMDGPU.device(i-1)) + @roc groupsize=32 gridsize=1 _multi_swap_ghost_rl(i-1, x[1], ndev, size, ghost_rl) + end + + for i in 1:ndev + AMDGPU.device!(AMDGPU.device(i)) + AMDGPU.synchronize() + end + + elseif ndims(x[2][1]) == 2 + + #Left to right swapping + for i in 1:ndev-1 + AMDGPU.device!(AMDGPU.device(i)) + dim = size(x[2][i]) + tmp = Base.Array(x[2][i][:,dim[2]-1]) + AMDGPU.device!(AMDGPU.device(i+1)) + ghost_lr = ROCArray(tmp) + numThreads = 512 + threads = min(dim[1], numThreads) + blocks = ceil(Int, dim[1] / threads) + #x[2][i+1][:,1] = ghost_lr + @roc groupsize=threads gridsize=blocks _multi_swap_2d_ghost_lr(i+1, x[1], ndev, dim[1], ghost_lr) + #AMDGPU.synchronize() + end + + #Right to left swapping + for i in 2:ndev + AMDGPU.device!(AMDGPU.device(i)) + tmp = Base.Array(x[2][i][:,2]) + AMDGPU.device!(AMDGPU.device(i-1)) + dim = size(x[2][i-1]) + ghost_rl = ROCArray(tmp) + numThreads = 512 + threads = min(dim[1], numThreads) + blocks = ceil(Int, dim[1] / threads) + @roc groupsize=threads gridsize=blocks _multi_swap_2d_ghost_rl(i-1, x[1], ndev, dim[1], dim[2], ghost_rl) + #AMDGPU.synchronize() + end + + for i in 1:ndev + AMDGPU.device!(AMDGPU.device(i)) + AMDGPU.synchronize() + end + + end + + AMDGPU.device!(AMDGPU.device(1)) + +end + +function JACC.multi.gcopytoarray(::AMDGPUBackend, x::Vector{Any}, y::Vector{Any}) + + #x is the array and y is the ghost array + AMDGPU.device!(AMDGPU.device(1)) + ndev = length(AMDGPU.devices()) + + for i in 1:ndev + AMDGPU.device!(AMDGPU.device(i)) + size = length(y[2][i]) + numThreads = 512 + threads = min(size, numThreads) + blocks = ceil(Int, size / threads) + @roc groupsize=threads gridsize=blocks _multi_copy_ghosttoarray(i, x[1], y[1], size, ndev) + end + + for i in 1:ndev + AMDGPU.device!(AMDGPU.device(i)) + AMDGPU.synchronize() + end + + AMDGPU.device!(AMDGPU.device(1)) + +end + +function JACC.multi.copytogarray(::AMDGPUBackend, x::Vector{Any}, y::Vector{Any}) + + #x is the ghost array and y is the array AMDGPU.device!(AMDGPU.device(1)) ndev = length(AMDGPU.devices()) @@ -80,8 +327,7 @@ function JACC.multi.copy(::AMDGPUBackend, x::Vector{Any}, y::Vector{Any}) numThreads = 512 threads = min(size, numThreads) blocks = ceil(Int, size / threads) - @roc groupsize=threads gridsize=blocks _multi_copy(i, x[1], y[1]) - #AMDGPU.synchronize() + @roc groupsize=threads gridsize=blocks _multi_copy_arraytoghost(i, x[1], y[1], size, ndev) end for i in 1:ndev @@ -248,6 +494,78 @@ function 
JACC.multi.parallel_reduce(::AMDGPUBackend, end +function _multi_copy(dev_id, x, y) + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + @inbounds x[dev_id][i] = y[dev_id][i] + return nothing +end + +function _multi_copy_2d(dev_id, x, y) + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + j = (workgroupIdx().y - 1) * workgroupDim().y + workitemIdx().y + @inbounds x[dev_id][i,j] = y[dev_id][i,j] + return nothing +end + +function _multi_copy_ghosttoarray(dev_id, x, y, size, ndev) + #x is the array and y is the ghost array + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + if dev_id == 1 && i < size + @inbounds x[dev_id][i] = y[dev_id][i] + elseif dev_id == ndev && i > 1 + @inbounds x[dev_id][i-1] = y[dev_id][i] + elseif i > 1 && i < size + @inbounds x[dev_id][i-1] = y[dev_id][i] + end + return nothing +end + +function _multi_copy_arraytoghost(dev_id, x, y, size, ndev) + #x is the ghost array and y is the array + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + if dev_id == 1 && i < size + @inbounds x[dev_id][i] = y[dev_id][i] + elseif dev_id == ndev && i < size + @inbounds x[dev_id][i+1] = y[dev_id][i] + elseif i > 1 && i < size + @inbounds x[dev_id][i] = y[dev_id][i] + end + return nothing +end + +function _multi_swap_ghost_lr(dev_id, x, ndev, size, ghost) + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + if i == 1 + x[dev_id][i] = ghost + end + return nothing +end + +function _multi_swap_2d_ghost_lr(dev_id, x, ndev, size, ghost) + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + if i < size + 1 + x[dev_id][i, 1] = ghost[i] + end + return nothing +end + +function _multi_swap_ghost_rl(dev_id, x, ndev, size, ghost) + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + if i == 1 + #x[dev_id][120] = ghost + x[dev_id][size] = ghost + end + return nothing +end + +function _multi_swap_2d_ghost_rl(dev_id, x, ndev, size, col, ghost) + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + if i < size + 1 + x[dev_id][i, col] = ghost[i] + end + return nothing +end + function _multi_parallel_for_amdgpu(N, dev_id, f, x...) i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x if i <= N diff --git a/ext/JACCCUDA/JACCMULTI.jl b/ext/JACCCUDA/JACCMULTI.jl index 50f855b..85a0375 100644 --- a/ext/JACCCUDA/JACCMULTI.jl +++ b/ext/JACCCUDA/JACCMULTI.jl @@ -169,6 +169,21 @@ function JACC.multi.gid(::CUDABackend, dev_id::I, i::I, ndev::I) where{I <: Inte return ind end +function JACC.multi.gid(::CUDABackend, dev_id::I, (i,j)::Tuple{I,I}, ndev::I) where{I <: Integer} + + ind = (0, 0) + + if dev_id == 1 + ind = (i, j) + elseif dev_id == ndev + ind = (i, j+1) + else + ind = (i, j+1) + end + + return ind + +end function JACC.multi.gswap(::CUDABackend, x::Vector{Any}) From bd447d148cf6a1865c7f263209154239943accd6 Mon Sep 17 00:00:00 2001 From: pedrovalerolara Date: Thu, 31 Oct 2024 16:12:33 -0400 Subject: [PATCH 13/14] Fixed JACCMULTI module for the new precompiling code. 
Note that we need AMDGPU version 1.0 or above for JACC.multi --- Project.toml | 2 +- src/JACCMULTI.jl | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/Project.toml b/Project.toml index 6e04426..599c8f7 100644 --- a/Project.toml +++ b/Project.toml @@ -18,7 +18,7 @@ JACCCUDA = ["CUDA"] JACCONEAPI = ["oneAPI"] [compat] -AMDGPU = "0.8" +AMDGPU = "1.0" Atomix = "0.1" CUDA = "5" Preferences = "1.4.0" diff --git a/src/JACCMULTI.jl b/src/JACCMULTI.jl index 2260d94..fb0a7e1 100644 --- a/src/JACCMULTI.jl +++ b/src/JACCMULTI.jl @@ -41,51 +41,51 @@ function parallel_reduce(::ThreadsBackend, (M, N)::Tuple{I, I}, f::F, x...) wher end function ndev() - return ndev(default_backend()) + return ndev(JACC.default_backend()) end function Array(x::Base.Array{T,N}) where {T,N} - return Array(default_backend(), x) + return Array(JACC.default_backend(), x) end function gArray(x::Base.Array{T,N}) where {T,N} - return gArray(default_backend(), x) + return gArray(JACC.default_backend(), x) end function gid(dev_id::I, i::I, ndev::I) where{I <: Integer} - return gid(default_backend(), dev_id, i, ndev) + return gid(JACC.default_backend(), dev_id, i, ndev) end function gswap(x::Vector{Any}) - return gswap(default_backend(), x) + return gswap(JACC.default_backend(), x) end function gcopytoarray(x::Vector{Any}, y::Vector{Any}) - return gcopytoarray(default_backend(), x, y) + return gcopytoarray(JACC.default_backend(), x, y) end function copytogarray(x::Vector{Any}, y::Vector{Any}) - return copytogarray(default_backend(), x, y) + return copytogarray(JACC.default_backend(), x, y) end function copy(x::Vector{Any}, y::Vector{Any}) - return copy(default_backend(), x, y) + return copy(JACC.default_backend(), x, y) end function parallel_for(N::I, f::F, x...) where {I <: Integer, F <: Function} - return parallel_for(default_backend(), N, f, x...) + return parallel_for(JACC.default_backend(), N, f, x...) end function parallel_for((M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} - return parallel_for(default_backend(), (M, N), f, x...) + return parallel_for(JACC.default_backend(), (M, N), f, x...) end function parallel_reduce(N::I, f::F, x...) where {I <: Integer, F <: Function} - return parallel_reduce(default_backend(), N, f, x...) + return parallel_reduce(JACC.default_backend(), N, f, x...) end function parallel_reduce((M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} - return parallel_reduce(default_backend(), (M, N), f, x...) 
end end # module multi From bdfd909c3666318e32bdfbbee7dc2292132f17f1 Mon Sep 17 00:00:00 2001 From: Philip Fackler Date: Fri, 1 Nov 2024 12:18:29 -0500 Subject: [PATCH 14/14] Moved and refactored most tests as common portable versions --- src/JACC.jl | 4 +- test/runtests.jl | 2 + test/tests_amdgpu.jl | 136 +------------- test/tests_cuda.jl | 307 +----------------------------- test/tests_oneapi.jl | 99 ---------- test/tests_threads.jl | 396 +-------------------------------------- test/unittests.jl | 421 ++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 433 insertions(+), 932 deletions(-) create mode 100644 test/unittests.jl diff --git a/src/JACC.jl b/src/JACC.jl index 78dd67e..b986383 100644 --- a/src/JACC.jl +++ b/src/JACC.jl @@ -5,6 +5,8 @@ import Atomix: @atomic # module to set backend preferences include("JACCPreferences.jl") +default_backend() = get_backend(JACCPreferences._backend_dispatchable) + struct ThreadsBackend end include("helper.jl") @@ -97,8 +99,6 @@ struct Array{T, N} end (::Type{Array})(args...; kwargs...) = array_type()(args...; kwargs...) -default_backend() = get_backend(JACCPreferences._backend_dispatchable) - array_type() = array_type(default_backend()) function parallel_for(N::I, f::F, x...) where {I <: Integer, F <: Function} diff --git a/test/runtests.jl b/test/runtests.jl index e9a04e7..8ae0d30 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -25,3 +25,5 @@ elseif backend == "threads" println("Threads backend loaded") include("tests_threads.jl") end + +include("unittests.jl") diff --git a/test/tests_amdgpu.jl b/test/tests_amdgpu.jl index 61a6742..5707691 100644 --- a/test/tests_amdgpu.jl +++ b/test/tests_amdgpu.jl @@ -6,117 +6,20 @@ using Test @test JACC.JACCPreferences.backend == "amdgpu" end -@testset "VectorAddLambda" begin - function f(i, a) - @inbounds a[i] += 5.0 - end - - N = 10 - dims = (N) - a = round.(rand(Float32, dims) * 100) - - a_device = JACC.Array(a) - JACC.parallel_for(N, f, a_device) - - a_expected = a .+ 5.0 - @test Array(a_device)≈a_expected rtol=1e-5 -end - -@testset "AXPY" begin - function axpy(i, alpha, x, y) - @inbounds x[i] += alpha * y[i] - end - - function seq_axpy(N, alpha, x, y) - @inbounds for i in 1:N - x[i] += alpha * y[i] - end - end - - N = 10 - # Generate random vectors x and y of length N for the interval [0, 100] - x = round.(rand(Float32, N) * 100) - y = round.(rand(Float32, N) * 100) - alpha = 2.5 - - x_device = JACC.Array(x) - y_device = JACC.Array(y) - JACC.parallel_for(N, axpy, alpha, x_device, y_device) - - x_expected = x - seq_axpy(N, alpha, x_expected, y) - - @test Array(x_device)≈x_expected rtol=1e-1 -end - -@testset "AtomicCounter" begin - function axpy_counter!(i, alpha, x, y, counter) - @inbounds x[i] += alpha * y[i] - JACC.@atomic counter[1] += 1 - end - - N = Int32(10) - # Generate random vectors x and y of length N for the interval [0, 100] - alpha = 2.5 - - x = JACC.Array(round.(rand(Float32, N) * 100)) - y = JACC.Array(round.(rand(Float32, N) * 100)) - counter = JACC.Array{Int32}([0]) - JACC.parallel_for(N, axpy_counter!, alpha, x, y, counter) - - @test Array(counter)[1] == N -end - -@testset "zeros" begin +@testset "zeros_type" begin N = 10 x = JACC.zeros(Float64, N) @test typeof(x) == AMDGPU.ROCArray{Float64, 1, AMDGPU.Runtime.Mem.HIPBuffer} @test eltype(x) == Float64 - @test zeros(N)≈Array(x) rtol=1e-5 - - function add_one(i, x) - @inbounds x[i] += 1 - end - - JACC.parallel_for(N, add_one, x) - @test ones(N)≈Array(x) rtol=1e-5 end -@testset "ones" begin +@testset "ones_type" begin N = 10 x = 
JACC.ones(Float64, N) @test typeof(x) == AMDGPU.ROCArray{Float64, 1, AMDGPU.Runtime.Mem.HIPBuffer} @test eltype(x) == Float64 - @test ones(N)≈Array(x) rtol=1e-5 - - function minus_one(i, x) - @inbounds x[i] -= 1 - end - - JACC.parallel_for(N, minus_one, x) - @test zeros(N)≈Array(x) rtol=1e-5 end -@testset "shared" begin - N = 100 - alpha = 2.5 - x = JACC.ones(Float64, N) - x_shared = JACC.ones(Float64, N) - y = JACC.ones(Float64, N) - - function scal(i, x, y, alpha) - @inbounds x[i] = y[i] * alpha - end - - function scal_shared(i, x, y, alpha) - y_shared = JACC.shared(y) - @inbounds x[i] = y_shared[i] * alpha - end - - JACC.parallel_for(N, scal, x, y, alpha) - JACC.parallel_for(N, scal_shared, x_shared, y, alpha) - @test x≈x_shared rtol=1e-8 -end #@testset "JACC.BLAS" begin # function seq_axpy(N, alpha, x, y) @@ -231,41 +134,6 @@ end # end -@testset "Add-2D" begin - function add!(i, j, A, B, C) - @inbounds C[i, j] = A[i, j] + B[i, j] - end - - M = 10 - N = 10 - A = JACC.Array(ones(Float32, M, N)) - B = JACC.Array(ones(Float32, M, N)) - C = JACC.Array(zeros(Float32, M, N)) - - JACC.parallel_for((M, N), add!, A, B, C) - - C_expected = Float32(2.0) .* ones(Float32, M, N) - @test Array(C)≈C_expected rtol=1e-5 -end - -@testset "Add-3D" begin - function add!(i, j, k, A, B, C) - @inbounds C[i, j, k] = A[i, j, k] + B[i, j, k] - end - - L = 10 - M = 10 - N = 10 - A = JACC.Array(ones(Float32, L, M, N)) - B = JACC.Array(ones(Float32, L, M, N)) - C = JACC.Array(zeros(Float32, L, M, N)) - - JACC.parallel_for((L, M, N), add!, A, B, C) - - C_expected = Float32(2.0) .* ones(Float32, L, M, N) - @test Array(C)≈C_expected rtol=1e-5 -end - #@testset "JACC.multi" begin # x = ones(1_000) diff --git a/test/tests_cuda.jl b/test/tests_cuda.jl index 0dab756..2e71272 100644 --- a/test/tests_cuda.jl +++ b/test/tests_cuda.jl @@ -6,160 +6,20 @@ using Test @test JACC.JACCPreferences.backend == "cuda" end -@testset "VectorAddLambda" begin - function f(i, a) - @inbounds a[i] += 5.0 - end - - N = 10 - dims = (N) - a = round.(rand(Float32, dims) * 100) - - a_device = JACC.Array(a) - JACC.parallel_for(N, f, a_device) - - a_expected = a .+ 5.0 - @test Array(a_device)≈a_expected rtol=1e-5 -end - -@testset "AXPY" begin - function axpy(i, alpha, x, y) - @inbounds x[i] += alpha * y[i] - end - - function seq_axpy(N, alpha, x, y) - @inbounds for i in 1:N - x[i] += alpha * y[i] - end - end - - N = 10 - # Generate random vectors x and y of length N for the interval [0, 100] - x = round.(rand(Float32, N) * 100) - y = round.(rand(Float32, N) * 100) - alpha = 2.5 - - x_device = JACC.Array(x) - y_device = JACC.Array(y) - JACC.parallel_for(N, axpy, alpha, x_device, y_device) - - x_expected = x - seq_axpy(N, alpha, x_expected, y) - - @test Array(x_device)≈x_expected rtol=1e-1 -end - -@testset "AtomicCounter" begin - function axpy_counter!(i, alpha, x, y, counter) - @inbounds x[i] += alpha * y[i] - JACC.@atomic counter[1] += 1 - end - - N = Int32(10) - # Generate random vectors x and y of length N for the interval [0, 100] - alpha = 2.5 - - x = JACC.Array(round.(rand(Float32, N) * 100)) - y = JACC.Array(round.(rand(Float32, N) * 100)) - counter = JACC.Array{Int32}([0]) - JACC.parallel_for(N, axpy_counter!, alpha, x, y, counter) - - @test Array(counter)[1] == N -end - -@testset "zeros" begin +@testset "zeros_type" begin N = 10 x = JACC.zeros(Float64, N) - @test typeof(x) == CUDA.CuArray{Float64, 1, CUDA.Mem.DeviceBuffer} + @test typeof(x) == CUDA.CuArray{Float64, 1, CUDA.DeviceMemory} @test eltype(x) == Float64 - @test zeros(N)≈Array(x) 
rtol=1e-5 - - function add_one(i, x) - @inbounds x[i] += 1 - end - - JACC.parallel_for(N, add_one, x) - @test ones(N)≈Array(x) rtol=1e-5 end -@testset "ones" begin +@testset "ones_type" begin N = 10 x = JACC.ones(Float64, N) - @test typeof(x) == CUDA.CuArray{Float64, 1, CUDA.Mem.DeviceBuffer} + @test typeof(x) == CUDA.CuArray{Float64, 1, CUDA.DeviceMemory} @test eltype(x) == Float64 - @test ones(N)≈Array(x) rtol=1e-5 - - function minus_one(i, x) - @inbounds x[i] -= 1 - end - - JACC.parallel_for(N, minus_one, x) - @test zeros(N)≈Array(x) rtol=1e-5 end -# @testset "CG" begin - -# function matvecmul(i, a1, a2, a3, x, y, SIZE) -# if i == 1 -# y[i] = a2[i] * x[i] + a1[i] * x[i+1] -# elseif i == SIZE -# y[i] = a3[i] * x[i-1] + a2[i] * x[i] -# elseif i > 1 && i < SIZE -# y[i] = a3[i] * x[i-1] + a1[i] * +x[i] + a1[i] * +x[i+1] -# end -# end - -# function dot(i, x, y) -# @inbounds return x[i] * y[i] -# end - -# function axpy(i, alpha, x, y) -# @inbounds x[i] += alpha[1, 1] * y[i] -# end - -# SIZE = 10 -# a0 = JACC.ones(Float64, SIZE) -# a1 = JACC.ones(Float64, SIZE) -# a2 = JACC.ones(Float64, SIZE) -# r = JACC.ones(Float64, SIZE) -# p = JACC.ones(Float64, SIZE) -# s = JACC.zeros(Float64, SIZE) -# x = JACC.zeros(Float64, SIZE) -# r_old = JACC.zeros(Float64, SIZE) -# r_aux = JACC.zeros(Float64, SIZE) -# a1 = a1 * 4 -# r = r * 0.5 -# p = p * 0.5 -# global cond = one(Float64) - -# while cond[1, 1] >= 1e-14 - -# r_old = copy(r) - -# JACC.parallel_for(SIZE, matvecmul, a0, a1, a2, p, s, SIZE) - -# alpha0 = JACC.parallel_reduce(SIZE, dot, r, r) -# alpha1 = JACC.parallel_reduce(SIZE, dot, p, s) - -# alpha = alpha0 / alpha1 -# negative_alpha = alpha * (-1.0) - -# JACC.parallel_for(SIZE, axpy, negative_alpha, r, s) -# JACC.parallel_for(SIZE, axpy, alpha, x, p) - -# beta0 = JACC.parallel_reduce(SIZE, dot, r, r) -# beta1 = JACC.parallel_reduce(SIZE, dot, r_old, r_old) -# beta = beta0 / beta1 - -# r_aux = copy(r) - -# JACC.parallel_for(SIZE, axpy, beta, r_aux, p) -# ccond = JACC.parallel_reduce(SIZE, dot, r, r) -# global cond = ccond -# p = copy(r_aux) -# end -# @test cond[1, 1] <= 1e-14 -# end # @testset "LBM" begin @@ -274,26 +134,6 @@ end # end -@testset "shared" begin - N = 100 - alpha = 2.5 - x = JACC.ones(Float64, N) - x_shared = JACC.ones(Float64, N) - y = JACC.ones(Float64, N) - - function scal(i, x, y, alpha) - @inbounds x[i] = y[i] * alpha - end - - function scal_shared(i, x, y, alpha) - y_shared = JACC.shared(y) - @inbounds x[i] = y_shared[i] * alpha - end - - JACC.parallel_for(N, scal, x, y, alpha) - JACC.parallel_for(N, scal_shared, x_shared, y, alpha) - @test x≈x_shared rtol=1e-8 -end # @testset VectorAddLoop begin # N = 1024 @@ -332,145 +172,6 @@ end # end # end -@testset "JACC.BLAS" begin - - x = ones(1_000) - y = ones(1_000) - y1 = y*2 - jx = JACC.ones(1_000) - jy = JACC.ones(1_000) - jy1 = jy*2 - alpha = 2.0 - - function seq_axpy(N, alpha, x, y) - for i in 1:N - @inbounds x[i] += alpha * y[i] - end - end - - - function seq_dot(N, x, y) - r = 0.0 - for i in 1:N - @inbounds r += x[i] * y[i] - end - return r - end - - x = ones(1_000) - y = ones(1_000) - jx = JACC.ones(1_000) - jy = JACC.ones(1_000) - alpha = 2.0 - - seq_axpy(1_000, alpha, x, y) - ref_result = seq_dot(1_000, x, y) - - JACC.BLAS.axpy(1_000, alpha, jx, jy) - jresult = JACC.BLAS.dot(1_000, jx, jy) - result = Array(jresult) - - @test result[1]≈ref_result rtol=1e-8 -end - -@testset "Add-2D" begin - function add!(i, j, A, B, C) - @inbounds C[i, j] = A[i, j] + B[i, j] - end - - M = 10 - N = 10 - A = JACC.Array(ones(Float32, M, N)) - B = 
JACC.Array(ones(Float32, M, N)) - C = JACC.Array(zeros(Float32, M, N)) - - JACC.parallel_for((M, N), add!, A, B, C) - - C_expected = Float32(2.0) .* ones(Float32, M, N) - @test Array(C)≈C_expected rtol=1e-5 -end - -@testset "Add-3D" begin - function add!(i, j, k, A, B, C) - @inbounds C[i, j, k] = A[i, j, k] + B[i, j, k] - end - - L = 10 - M = 10 - N = 10 - A = JACC.Array(ones(Float32, L, M, N)) - B = JACC.Array(ones(Float32, L, M, N)) - C = JACC.Array(zeros(Float32, L, M, N)) - - JACC.parallel_for((L, M, N), add!, A, B, C) - - C_expected = Float32(2.0) .* ones(Float32, L, M, N) - @test Array(C)≈C_expected rtol=1e-5 - - function seq_scal(N, alpha, x) - for i in 1:N - @inbounds x[i] = alpha * x[i] - end - end - - function seq_asum(N, x) - r = 0.0 - for i in 1:N - @inbounds r += abs(x[i]) - end - return r - end - - function seq_nrm2(N, x) - sum_sq = 0.0 - for i in 1:N - @inbounds sum_sq += x[i]*x[i] - end - r = sqrt(sum_sq) - return r - end - - function seq_swap(N, x, y1) - for i in 1:N - @inbounds t = x[i] - @inbounds x[i] = y1[i] - @inbounds y1[i] = t - end - end - - # Comparing JACC.BLAS with regular seuential functions - # seq_axpy(1_000, alpha, x, y) - # ref_result = seq_dot(1_000, x, y) - # JACC.BLAS.axpy(1_000, alpha, jx, jy) - # jresult = JACC.BLAS.dot(1_000, jx, jy) - # result = Array(jresult) - # @test result[1]≈ref_result rtol=1e-8 - - - # seq_scal(1_000, alpha, x) - # JACC.BLAS.scal(1_000, alpha, jx) - # @test x≈Array(jx) atol=1e-8 - - # seq_axpy(1_000, alpha, x, y) - # JACC.BLAS.axpy(1_000, alpha, jx, jy) - # @test x≈Array(jx) atol=1e-8 - - # r1 = seq_dot(1_000, x, y) - # r2 = JACC.BLAS.dot(1_000, jx, jy) - # @test r1≈Array(r2)[1] atol=1e-8 - - # r1 = seq_asum(1_000, x) - # r2 = JACC.BLAS.asum(1_000, jx) - # r1 = seq_nrm2(1_000, x) - # r2 = JACC.BLAS.nrm2(1_000, jx) - # @test r1≈Array(r2)[1] atol=1e-8 - - #seq_swap(1_000, x, y1) - #JACC.BLAS.swap(1_000, jx, jy1) - #@test x == Array(jx) - #@test y1 == Array(jy1) - -end #@testset "JACC.multi" begin diff --git a/test/tests_oneapi.jl b/test/tests_oneapi.jl index 2ea6767..07a2f0d 100644 --- a/test/tests_oneapi.jl +++ b/test/tests_oneapi.jl @@ -5,102 +5,3 @@ using Test @testset "TestBackend" begin @test JACC.JACCPreferences.backend == "oneapi" end - -@testset "VectorAddLambda" begin - function f(i, a) - @inbounds a[i] += 5.0 - end - - N = 10 - dims = (N) - a = round.(rand(Float32, dims) * 100) - - a_device = JACC.Array(a) - JACC.parallel_for(N, f, a_device) - - a_expected = a .+ 5.0 - @test Array(a_device)≈a_expected rtol=1e-5 -end - -@testset "AXPY" begin - function axpy(i, alpha, x, y) - @inbounds x[i] += alpha * y[i] - end - - function seq_axpy(N, alpha, x, y) - @inbounds for i in 1:N - x[i] += alpha * y[i] - end - end - - N = 10 - # Generate random vectors x and y of length N for the interval [0, 100] - x = round.(rand(Float32, N) * 100) - y = round.(rand(Float32, N) * 100) - alpha::Float32 = 2.5 - - x_device = JACC.Array(x) - y_device = JACC.Array(y) - JACC.parallel_for(N, axpy, alpha, x_device, y_device) - - x_expected = x - seq_axpy(N, alpha, x_expected, y) - - @test Array(x_device)≈x_expected rtol=1e-1 -end - -@testset "shared" begin - N::Int32 = 100 - alpha::Float32 = 2.5 - x = JACC.ones(Float32, N) - x_shared = JACC.ones(Float32, N) - y = JACC.ones(Float32, N) - - function scal(i, x, y, alpha) - @inbounds x[i] = y[i] * alpha - end - - function scal_shared(i, x, y, alpha) - y_shared = JACC.shared(y) - @inbounds x[i] = y_shared[i] * alpha - end - - JACC.parallel_for(N, scal, x, y, alpha) - JACC.parallel_for(N, scal_shared, 
x_shared, y, alpha) - @test x≈x_shared rtol=1e-8 -end - -#@testset "JACC.BLAS" begin - -# function seq_axpy(N, alpha, x, y) -# for i in 1:N -# @inbounds x[i] += alpha * y[i] -# end -# end - -# function seq_dot(N, x, y) -# r = 0.0 -# for i in 1:N -# @inbounds r += x[i] * y[i] -# end -# return r -# end - -# SIZE = Int32(1_000) -# x = ones(Float32, SIZE) -# y = ones(Float32, SIZE) -# jx = JACC.ones(Float32, SIZE) -# jy = JACC.ones(Float32, SIZE) -# alpha = Float32(2.0) - -# seq_axpy(SIZE, alpha, x, y) -# ref_result = seq_dot(SIZE, x, y) - -# JACC.BLAS.axpy(SIZE, alpha, jx, jy) -# jresult = JACC.BLAS.dot(SIZE, jx, jy) -# result = Array(jresult) - -# @test result[1]≈ref_result rtol=1e-8 - -#end - diff --git a/test/tests_threads.jl b/test/tests_threads.jl index 99cf13b..2fd35be 100644 --- a/test/tests_threads.jl +++ b/test/tests_threads.jl @@ -5,408 +5,16 @@ using Test @test JACC.JACCPreferences.backend == "threads" end -@testset "VectorAddLambda" begin - function f(x, a) - @inbounds a[x] += 5.0 - end - - dims = (10) - a = round.(rand(Float32, dims) * 100) - a_expected = a .+ 5.0 - - JACC.parallel_for(10, f, a) - - @test a≈a_expected rtol=1e-5 -end - -@testset "AXPY" begin - function seq_axpy(N, alpha, x, y) - Threads.@threads for i in 1:N - @inbounds x[i] += alpha * y[i] - end - end - - function axpy(i, alpha, x, y) - if i <= length(x) - @inbounds x[i] += alpha * y[i] - end - end - - N = 10 - # Generate random vectors x and y of length N for the interval [0, 100] - x = round.(rand(Float32, N) * 100) - y = round.(rand(Float32, N) * 100) - alpha = 2.5 - - x_host_JACC = JACC.Array(x) - y_host_JACC = JACC.Array(y) - JACC.parallel_for(N, axpy, alpha, x_host_JACC, y_host_JACC) - - x_expected = x - seq_axpy(N, alpha, x_expected, y) - - @test x_host_JACC≈x_expected rtol=1e-1 -end - -@testset "AtomicCounter" begin - function axpy_counter!(i, alpha, x, y, counter) - @inbounds x[i] += alpha * y[i] - JACC.@atomic counter[1] += 1 - end - - N = Int32(10) - # Generate random vectors x and y of length N for the interval [0, 100] - alpha = 2.5 - counter = zeros(Int32, 1) - - x_device = JACC.Array(round.(rand(Float32, N) * 100)) - y_device = JACC.Array(round.(rand(Float32, N) * 100)) - counter = JACC.Array{Int32}([0]) - JACC.parallel_for(N, axpy_counter!, alpha, x_device, y_device, counter) - - @test counter[1] == N -end - -@testset "zeros" begin +@testset "zeros_type" begin N = 10 x = JACC.zeros(Float32, N) @test typeof(x) == Vector{Float32} @test eltype(x) == Float32 - @test zeros(N)≈x rtol=1e-5 - - function add_one(i, x) - @inbounds x[i] += 1 - end - - JACC.parallel_for(N, add_one, x) - @test ones(N)≈x rtol=1e-5 end -@testset "ones" begin +@testset "ones_type" begin N = 10 x = JACC.ones(Float64, N) @test typeof(x) == Vector{Float64} @test eltype(x) == Float64 - @test ones(N)≈x rtol=1e-5 - - function minus_one(i, x) - @inbounds x[i] -= 1 - end - - JACC.parallel_for(N, minus_one, x) - @test zeros(N)≈x rtol=1e-5 -end - -@testset "CG" begin - function matvecmul(i, a1, a2, a3, x, y, SIZE) - if i == 1 - y[i] = a2[i] * x[i] + a1[i] * x[i + 1] - elseif i == SIZE - y[i] = a3[i] * x[i - 1] + a2[i] * x[i] - elseif i > 1 && i < SIZE - y[i] = a3[i] * x[i - 1] + a1[i] * +x[i] + a1[i] * +x[i + 1] - end - end - - function dot(i, x, y) - @inbounds return x[i] * y[i] - end - - function axpy(i, alpha, x, y) - @inbounds x[i] += alpha[1, 1] * y[i] - end - - SIZE = 10 - a0 = JACC.ones(Float64, SIZE) - a1 = JACC.ones(Float64, SIZE) - a2 = JACC.ones(Float64, SIZE) - r = JACC.ones(Float64, SIZE) - p = JACC.ones(Float64, SIZE) - s = 
JACC.zeros(Float64, SIZE) - x = JACC.zeros(Float64, SIZE) - r_old = JACC.zeros(Float64, SIZE) - r_aux = JACC.zeros(Float64, SIZE) - a1 = a1 * 4 - r = r * 0.5 - p = p * 0.5 - global cond = one(Float64) - - while cond[1, 1] >= 1e-14 - r_old = copy(r) - - JACC.parallel_for(SIZE, matvecmul, a0, a1, a2, p, s, SIZE) - - alpha0 = JACC.parallel_reduce(SIZE, dot, r, r) - alpha1 = JACC.parallel_reduce(SIZE, dot, p, s) - - alpha = alpha0 / alpha1 - negative_alpha = alpha * (-1.0) - - JACC.parallel_for(SIZE, axpy, negative_alpha, r, s) - JACC.parallel_for(SIZE, axpy, alpha, x, p) - - beta0 = JACC.parallel_reduce(SIZE, dot, r, r) - beta1 = JACC.parallel_reduce(SIZE, dot, r_old, r_old) - beta = beta0 / beta1 - - r_aux = copy(r) - - JACC.parallel_for(SIZE, axpy, beta, r_aux, p) - ccond = JACC.parallel_reduce(SIZE, dot, r, r) - global cond = ccond - p = copy(r_aux) - end - @test cond[1, 1] <= 1e-14 -end - -@testset "LBM" begin - function lbm_kernel(x, y, f, f1, f2, t, w, cx, cy, SIZE) - u = 0.0 - v = 0.0 - p = 0.0 - x_stream = 0 - y_stream = 0 - - if x > 1 && x < SIZE && y > 1 && y < SIZE - for k in 1:9 - @inbounds x_stream = x - cx[k] - @inbounds y_stream = y - cy[k] - ind = (k - 1) * SIZE * SIZE + x * SIZE + y - iind = (k - 1) * SIZE * SIZE + x_stream * SIZE + y_stream - @inbounds f[trunc(Int, ind)] = f1[trunc(Int, iind)] - end - for k in 1:9 - ind = (k - 1) * SIZE * SIZE + x * SIZE + y - @inbounds p = p[1, 1] + f[ind] - @inbounds u = u[1, 1] + f[ind] * cx[k] - @inbounds v = v[1, 1] + f[ind] * cy[k] - end - u = u / p - v = v / p - for k in 1:9 - @inbounds cu = cx[k] * u + cy[k] * v - @inbounds feq = w[k] * p * - (1.0 + 3.0 * cu + cu * cu - - 1.5 * ((u * u) + (v * v))) - ind = (k - 1) * SIZE * SIZE + x * SIZE + y - @inbounds f2[trunc(Int, ind)] = f[trunc(Int, ind)] * - (1.0 - 1.0 / t) + feq * 1 / t - end - end - end - - function lbm_threads(f, f1, f2, t, w, cx, cy, SIZE) - Threads.@sync Threads.@threads for x in 1:SIZE - for y in 1:SIZE - u = 0.0 - v = 0.0 - p = 0.0 - x_stream = 0 - y_stream = 0 - - if x > 1 && x < SIZE && y > 1 && y < SIZE - for k in 1:9 - @inbounds x_stream = x - cx[k] - @inbounds y_stream = y - cy[k] - ind = (k - 1) * SIZE * SIZE + x * SIZE + y - iind = (k - 1) * SIZE * SIZE + x_stream * SIZE + - y_stream - @inbounds f[trunc(Int, ind)] = f1[trunc(Int, iind)] - end - for k in 1:9 - ind = (k - 1) * SIZE * SIZE + x * SIZE + y - @inbounds p = p[1, 1] + f[ind] - @inbounds u = u[1, 1] + f[ind] * cx[k] - @inbounds v = v[1, 1] + f[ind] * cy[k] - end - u = u / p - v = v / p - for k in 1:9 - @inbounds cu = cx[k] * u + cy[k] * v - @inbounds feq = w[k] * p * - (1.0 + 3.0 * cu + cu * cu - - 1.5 * ((u * u) + (v * v))) - ind = (k - 1) * SIZE * SIZE + x * SIZE + y - @inbounds f2[trunc(Int, ind)] = f[trunc(Int, ind)] * - (1.0 - 1.0 / t) + - feq * 1 / t - end - end - end - end - end - - SIZE = 10 - f = ones(SIZE * SIZE * 9) .* 2.0 - f1 = ones(SIZE * SIZE * 9) .* 3.0 - f2 = ones(SIZE * SIZE * 9) .* 4.0 - cx = zeros(9) - cy = zeros(9) - cx[1] = 0 - cy[1] = 0 - cx[2] = 1 - cy[2] = 0 - cx[3] = -1 - cy[3] = 0 - cx[4] = 0 - cy[4] = 1 - cx[5] = 0 - cy[5] = -1 - cx[6] = 1 - cy[6] = 1 - cx[7] = -1 - cy[7] = 1 - cx[8] = -1 - cy[8] = -1 - cx[9] = 1 - cy[9] = -1 - w = ones(9) - t = 1.0 - - df = JACC.Array(f) - df1 = JACC.Array(f1) - df2 = JACC.Array(f2) - dcx = JACC.Array(cx) - dcy = JACC.Array(cy) - dw = JACC.Array(w) - - JACC.parallel_for( - (SIZE, SIZE), lbm_kernel, df, df1, df2, t, dw, dcx, dcy, SIZE) - - lbm_threads(f, f1, f2, t, w, cx, cy, SIZE) - - @test f2≈df2 rtol=1e-1 -end - -@testset "JACC.BLAS" begin - 
x = ones(1_000) - y = ones(1_000) - y1 = y*2 - jx = JACC.ones(1_000) - jy = JACC.ones(1_000) - jy1 = jy*2 - alpha = 2.0 - - function seq_axpy(N, alpha, x, y) - for i in 1:N - @inbounds x[i] += alpha * y[i] - end - end - - function seq_dot(N, x, y) - r = 0.0 - for i in 1:N - @inbounds r += x[i] * y[i] - end - return r - end - - seq_axpy(1_000, alpha, x, y) - ref_result = seq_dot(1_000, x, y) - - JACC.BLAS.axpy(1_000, alpha, jx, jy) - jresult = JACC.BLAS.dot(1_000, jx, jy) - result = jresult[1] - - @test result≈ref_result rtol=1e-8 -end - -# 2D -@testset "Add-2D" begin - function add!(i, j, A, B, C) - @inbounds C[i, j] = A[i, j] + B[i, j] - end - - M = 10 - N = 10 - A = JACC.Array(ones(Float32, M, N)) - B = JACC.Array(ones(Float32, M, N)) - C = JACC.Array(zeros(Float32, M, N)) - - JACC.parallel_for((M, N), add!, A, B, C) - - C_expected = Float32(2.0) .* ones(Float32, M, N) - @test C≈C_expected rtol=1e-5 -end - -@testset "Add-3D" begin - function add!(i, j, k, A, B, C) - @inbounds C[i, j, k] = A[i, j, k] + B[i, j, k] - end - - L = 10 - M = 10 - N = 10 - A = JACC.Array(ones(Float32, L, M, N)) - B = JACC.Array(ones(Float32, L, M, N)) - C = JACC.Array(zeros(Float32, L, M, N)) - - for i in 1:L - for j in 1:M - for k in 1:N - C[i, j, k] = A[i, j, k] + B[i, j, k] - end - end - end - - JACC.parallel_for((L, M, N), add!, A, B, C) - - C_expected = Float32(2.0) .* ones(Float32, L, M, N) - @test C≈C_expected rtol=1e-5 - - function seq_scal(N, alpha, x) - for i in 1:N - @inbounds x[i] = alpha * x[i] - end - end - - function seq_asum(N, x) - r = 0.0 - for i in 1:N - @inbounds r += abs(x[i]) - end - return r - end - - function seq_nrm2(N, x) - sum_sq = 0.0 - for i in 1:N - @inbounds sum_sq += x[i]*x[i] - end - r = sqrt(sum_sq) - return r - end - - function seq_swap(N, x, y1) - for i in 1:N - @inbounds t = x[i] - @inbounds x[i] = y1[i] - @inbounds y1[i] = t - end - end - - # Comparing JACC.BLAS with regular seuential functions - - # seq_scal(1_000, alpha, x) - # JACC.BLAS.scal(1_000, alpha, jx) - # @test x≈jx atol=1e-8 - - # seq_axpy(1_000, alpha, x, y) - # JACC.BLAS.axpy(1_000, alpha, jx, jy) - # @test x≈jx atol=1e-8 - - # r1 = seq_dot(1_000, x, y) - # r2 = JACC.BLAS.dot(1_000, jx, jy) - # @test r1≈r2 atol=1e-8 - - # r1 = seq_asum(1_000, x) - # r2 = JACC.BLAS.asum(1_000, jx) - # r1 = seq_nrm2(1_000, x) - # r2 = JACC.BLAS.nrm2(1_000, jx) - # @test r1≈r2[1] atol=1e-8 - - # seq_swap(1_000, x, y1) - # JACC.BLAS.swap(1_000, jx, jy1) - # @test x == jx - # @test y1 == jy1 end diff --git a/test/unittests.jl b/test/unittests.jl new file mode 100644 index 0000000..02fc51a --- /dev/null +++ b/test/unittests.jl @@ -0,0 +1,421 @@ +import JACC +using Test + +@testset "VectorAddLambda" begin + function f(i, a) + @inbounds a[i] += 5.0 + end + + N = 10 + dims = (N) + a = round.(rand(Float32, dims) * 100) + + a_device = JACC.Array(a) + JACC.parallel_for(N, f, a_device) + + a_expected = a .+ 5.0 + @test Array(a_device)≈a_expected rtol=1e-5 +end + +@testset "AXPY" begin + function axpy(i, alpha, x, y) + @inbounds x[i] += alpha * y[i] + end + + function seq_axpy(N, alpha, x, y) + @inbounds for i in 1:N + x[i] += alpha * y[i] + end + end + + N = 10 + # Generate random vectors x and y of length N for the interval [0, 100] + x = round.(rand(Float32, N) * 100) + y = round.(rand(Float32, N) * 100) + alpha = 2.5 + + x_device = JACC.Array(x) + y_device = JACC.Array(y) + JACC.parallel_for(N, axpy, alpha, x_device, y_device) + + x_expected = x + seq_axpy(N, alpha, x_expected, y) + + @test Array(x_device)≈x_expected rtol=1e-1 +end + 
+@testset "zeros" begin + N = 10 + x = JACC.zeros(Float64, N) + @test eltype(x) == Float64 + @test zeros(N)≈Array(x) rtol=1e-5 + + function add_one(i, x) + @inbounds x[i] += 1 + end + + JACC.parallel_for(N, add_one, x) + @test ones(N) ≈ Core.Array(x) rtol=1e-5 +end + +@testset "ones" begin + N = 10 + x = JACC.ones(Float64, N) + @test eltype(x) == Float64 + @test ones(N)≈Array(x) rtol=1e-5 + + function minus_one(i, x) + @inbounds x[i] -= 1 + end + + JACC.parallel_for(N, minus_one, x) + @test zeros(N) ≈ Array(x) rtol=1e-5 +end + +@testset "AtomicCounter" begin + function axpy_counter!(i, alpha, x, y, counter) + @inbounds x[i] += alpha * y[i] + JACC.@atomic counter[1] += 1 + end + + N = Int32(10) + # Generate random vectors x and y of length N for the interval [0, 100] + alpha = 2.5 + + x = JACC.Array(round.(rand(Float32, N) * 100)) + y = JACC.Array(round.(rand(Float32, N) * 100)) + counter = JACC.Array{Int32}([0]) + JACC.parallel_for(N, axpy_counter!, alpha, x, y, counter) + + @test Array(counter)[1] == N +end + +@testset "shared" begin + N = 100 + alpha = 2.5 + x = JACC.ones(Float64, N) + x_shared = JACC.ones(Float64, N) + y = JACC.ones(Float64, N) + + function scal(i, x, y, alpha) + @inbounds x[i] = y[i] * alpha + end + + function scal_shared(i, x, y, alpha) + y_shared = JACC.shared(y) + @inbounds x[i] = y_shared[i] * alpha + end + + JACC.parallel_for(N, scal, x, y, alpha) + JACC.parallel_for(N, scal_shared, x_shared, y, alpha) + @test x≈x_shared rtol=1e-8 +end + +@testset "JACC.BLAS" begin + x = ones(1_000) + y = ones(1_000) + jx = JACC.ones(1_000) + jy = JACC.ones(1_000) + alpha = 2.0 + + function seq_axpy(N, alpha, x, y) + for i in 1:N + @inbounds x[i] += alpha * y[i] + end + end + + function seq_dot(N, x, y) + r = 0.0 + for i in 1:N + @inbounds r += x[i] * y[i] + end + return r + end + + seq_axpy(1_000, alpha, x, y) + ref_result = seq_dot(1_000, x, y) + + JACC.BLAS.axpy(1_000, alpha, jx, jy) + jresult = JACC.BLAS.dot(1_000, jx, jy) + + @test Array(jresult)[1]≈ref_result rtol=1e-8 +end + +@testset "Add-2D" begin + function add!(i, j, A, B, C) + @inbounds C[i, j] = A[i, j] + B[i, j] + end + + M = 10 + N = 10 + A = JACC.ones(Float32, M, N) + B = JACC.ones(Float32, M, N) + C = JACC.zeros(Float32, M, N) + + JACC.parallel_for((M, N), add!, A, B, C) + + C_expected = Float32(2.0) .* ones(Float32, M, N) + @test Array(C)≈C_expected rtol=1e-5 +end + +@testset "Add-3D" begin + function add!(i, j, k, A, B, C) + @inbounds C[i, j, k] = A[i, j, k] + B[i, j, k] + end + + L = 10 + M = 10 + N = 10 + A = JACC.ones(Float32, L, M, N) + B = JACC.ones(Float32, L, M, N) + C = JACC.zeros(Float32, L, M, N) + + JACC.parallel_for((L, M, N), add!, A, B, C) + + C_expected = Float32(2.0) .* ones(Float32, L, M, N) + @test Array(C)≈C_expected rtol=1e-5 + + function seq_scal(N, alpha, x) + for i in 1:N + @inbounds x[i] = alpha * x[i] + end + end + + function seq_asum(N, x) + r = 0.0 + for i in 1:N + @inbounds r += abs(x[i]) + end + return r + end + + function seq_nrm2(N, x) + sum_sq = 0.0 + for i in 1:N + @inbounds sum_sq += x[i]*x[i] + end + r = sqrt(sum_sq) + return r + end + + function seq_swap(N, x, y1) + for i in 1:N + @inbounds t = x[i] + @inbounds x[i] = y1[i] + @inbounds y1[i] = t + end + end + + # Comparing JACC.BLAS with regular seuential functions + # seq_axpy(1_000, alpha, x, y) + # ref_result = seq_dot(1_000, x, y) + # JACC.BLAS.axpy(1_000, alpha, jx, jy) + # jresult = JACC.BLAS.dot(1_000, jx, jy) + # result = Array(jresult) + # @test result[1]≈ref_result rtol=1e-8 + + + # seq_scal(1_000, alpha, x) + # 
JACC.BLAS.scal(1_000, alpha, jx) + # @test x≈Array(jx) atol=1e-8 + + # seq_axpy(1_000, alpha, x, y) + # JACC.BLAS.axpy(1_000, alpha, jx, jy) + # @test x≈Array(jx) atol=1e-8 + + # r1 = seq_dot(1_000, x, y) + # r2 = JACC.BLAS.dot(1_000, jx, jy) + # @test r1≈Array(r2)[1] atol=1e-8 + + # r1 = seq_asum(1_000, x) + # r2 = JACC.BLAS.asum(1_000, jx) + # r1 = seq_nrm2(1_000, x) + # r2 = JACC.BLAS.nrm2(1_000, jx) + # @test r1≈Array(r2)[1] atol=1e-8 + + #seq_swap(1_000, x, y1) + #JACC.BLAS.swap(1_000, jx, jy1) + #@test x == Array(jx) + #@test y1 == Array(jy1) +end + +@testset "CG" begin + function matvecmul(i, a1, a2, a3, x, y, SIZE) + if i == 1 + y[i] = a2[i] * x[i] + a1[i] * x[i + 1] + elseif i == SIZE + y[i] = a3[i] * x[i - 1] + a2[i] * x[i] + elseif i > 1 && i < SIZE + y[i] = a3[i] * x[i - 1] + a1[i] * +x[i] + a1[i] * +x[i + 1] + end + end + + function dot(i, x, y) + @inbounds return x[i] * y[i] + end + + function axpy(i, alpha, x, y) + @inbounds x[i] += alpha[1, 1] * y[i] + end + + SIZE = 10 + a0 = JACC.ones(Float64, SIZE) + a1 = JACC.ones(Float64, SIZE) + a2 = JACC.ones(Float64, SIZE) + r = JACC.ones(Float64, SIZE) + p = JACC.ones(Float64, SIZE) + s = JACC.zeros(Float64, SIZE) + x = JACC.zeros(Float64, SIZE) + r_old = JACC.zeros(Float64, SIZE) + r_aux = JACC.zeros(Float64, SIZE) + a1 = a1 * 4 + r = r * 0.5 + p = p * 0.5 + global cond = one(Float64) + + while cond[1, 1] >= 1e-14 + r_old = copy(r) + + JACC.parallel_for(SIZE, matvecmul, a0, a1, a2, p, s, SIZE) + + alpha0 = JACC.parallel_reduce(SIZE, dot, r, r) + alpha1 = JACC.parallel_reduce(SIZE, dot, p, s) + + alpha = alpha0 / alpha1 + negative_alpha = alpha * (-1.0) + + JACC.parallel_for(SIZE, axpy, negative_alpha, r, s) + JACC.parallel_for(SIZE, axpy, alpha, x, p) + + beta0 = JACC.parallel_reduce(SIZE, dot, r, r) + beta1 = JACC.parallel_reduce(SIZE, dot, r_old, r_old) + beta = beta0 / beta1 + + r_aux = copy(r) + + JACC.parallel_for(SIZE, axpy, beta, r_aux, p) + ccond = JACC.parallel_reduce(SIZE, dot, r, r) + global cond = Array(ccond) + p = copy(r_aux) + end + @test cond[1, 1] <= 1e-14 +end + +@testset "LBM" begin + function lbm_kernel(x, y, f, f1, f2, t, w, cx, cy, SIZE) + u = 0.0 + v = 0.0 + p = 0.0 + x_stream = 0 + y_stream = 0 + + if x > 1 && x < SIZE && y > 1 && y < SIZE + for k in 1:9 + @inbounds x_stream = x - cx[k] + @inbounds y_stream = y - cy[k] + ind = (k - 1) * SIZE * SIZE + x * SIZE + y + iind = (k - 1) * SIZE * SIZE + x_stream * SIZE + y_stream + @inbounds f[trunc(Int, ind)] = f1[trunc(Int, iind)] + end + for k in 1:9 + ind = (k - 1) * SIZE * SIZE + x * SIZE + y + @inbounds p = p[1, 1] + f[ind] + @inbounds u = u[1, 1] + f[ind] * cx[k] + @inbounds v = v[1, 1] + f[ind] * cy[k] + end + u = u / p + v = v / p + for k in 1:9 + @inbounds cu = cx[k] * u + cy[k] * v + @inbounds feq = w[k] * p * + (1.0 + 3.0 * cu + cu * cu - + 1.5 * ((u * u) + (v * v))) + ind = (k - 1) * SIZE * SIZE + x * SIZE + y + @inbounds f2[trunc(Int, ind)] = f[trunc(Int, ind)] * + (1.0 - 1.0 / t) + feq * 1 / t + end + end + end + + function lbm_threads(f, f1, f2, t, w, cx, cy, SIZE) + Threads.@sync Threads.@threads for x in 1:SIZE + for y in 1:SIZE + u = 0.0 + v = 0.0 + p = 0.0 + x_stream = 0 + y_stream = 0 + + if x > 1 && x < SIZE && y > 1 && y < SIZE + for k in 1:9 + @inbounds x_stream = x - cx[k] + @inbounds y_stream = y - cy[k] + ind = (k - 1) * SIZE * SIZE + x * SIZE + y + iind = (k - 1) * SIZE * SIZE + x_stream * SIZE + + y_stream + @inbounds f[trunc(Int, ind)] = f1[trunc(Int, iind)] + end + for k in 1:9 + ind = (k - 1) * SIZE * SIZE + x * SIZE + y + @inbounds 
p = p[1, 1] + f[ind] + @inbounds u = u[1, 1] + f[ind] * cx[k] + @inbounds v = v[1, 1] + f[ind] * cy[k] + end + u = u / p + v = v / p + for k in 1:9 + @inbounds cu = cx[k] * u + cy[k] * v + @inbounds feq = w[k] * p * + (1.0 + 3.0 * cu + cu * cu - + 1.5 * ((u * u) + (v * v))) + ind = (k - 1) * SIZE * SIZE + x * SIZE + y + @inbounds f2[trunc(Int, ind)] = f[trunc(Int, ind)] * + (1.0 - 1.0 / t) + + feq * 1 / t + end + end + end + end + end + + SIZE = 10 + f = ones(SIZE * SIZE * 9) .* 2.0 + f1 = ones(SIZE * SIZE * 9) .* 3.0 + f2 = ones(SIZE * SIZE * 9) .* 4.0 + cx = zeros(9) + cy = zeros(9) + cx[1] = 0 + cy[1] = 0 + cx[2] = 1 + cy[2] = 0 + cx[3] = -1 + cy[3] = 0 + cx[4] = 0 + cy[4] = 1 + cx[5] = 0 + cy[5] = -1 + cx[6] = 1 + cy[6] = 1 + cx[7] = -1 + cy[7] = 1 + cx[8] = -1 + cy[8] = -1 + cx[9] = 1 + cy[9] = -1 + w = ones(9) + t = 1.0 + + df = JACC.Array(f) + df1 = JACC.Array(f1) + df2 = JACC.Array(f2) + dcx = JACC.Array(cx) + dcy = JACC.Array(cy) + dw = JACC.Array(w) + + JACC.parallel_for( + (SIZE, SIZE), lbm_kernel, df, df1, df2, t, dw, dcx, dcy, SIZE) + + lbm_threads(f, f1, f2, t, w, cx, cy, SIZE) + + @test f2≈Array(df2) rtol=1e-1 +end
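
For reference, here is a minimal host-side sketch of how the multi-GPU ghost-array helpers added in this series (JACC.multi.gArray, gswap, gcopytoarray, copytogarray) might be used together. This is only an illustration of the intended call sequence, not part of the patch: it assumes the "amdgpu" backend is selected and at least two devices are visible, and the names u, u_dist and u_ghost are made up for the example.

    import JACC

    u = ones(Float64, 1_000)               # host data to be partitioned

    u_dist  = JACC.multi.Array(u)          # split across all visible devices
    u_ghost = JACC.multi.gArray(u)         # split with one-element halos between neighbouring devices

    # Exchange the halo cells between neighbouring devices, then copy the
    # ghosted partitions back into the plain partitioned array.
    JACC.multi.gswap(u_ghost)
    JACC.multi.gcopytoarray(u_dist, u_ghost)   # first argument: array, second: ghost array

    # Refill the ghosted copy from the plain array before the next exchange.
    JACC.multi.copytogarray(u_ghost, u_dist)   # first argument: ghost array, second: array

In the 1D case gswap moves just the boundary element of each partition into the neighbour's halo slot (a boundary column in the 2D case), so a stencil-style kernel launched per device via JACC.multi.parallel_for never needs a full copy of its neighbours' data; JACC.multi.gid translates a per-device index into the halo-adjusted index inside each partition.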