From 2044c45c37f7c789391d5410c4817a6804e363c8 Mon Sep 17 00:00:00 2001 From: Philip Fackler Date: Fri, 18 Oct 2024 15:15:53 -0500 Subject: [PATCH 01/14] Refactor array and function dispatching --- ext/JACCAMDGPU/JACCAMDGPU.jl | 19 +++++++------ ext/JACCAMDGPU/array.jl | 4 +-- ext/JACCCUDA/JACCCUDA.jl | 18 ++++++------ ext/JACCCUDA/JACCMULTI.jl | 12 ++++---- ext/JACCCUDA/array.jl | 4 +-- ext/JACCONEAPI/JACCONEAPI.jl | 16 +++++------ ext/JACCONEAPI/array.jl | 4 +-- src/JACC.jl | 55 ++++++++++++++++++++++++++++++------ src/JACCMULTI.jl | 29 +++++++++++++++++-- src/JACCPreferences.jl | 1 + src/array.jl | 8 ++++-- 11 files changed, 118 insertions(+), 52 deletions(-) diff --git a/ext/JACCAMDGPU/JACCAMDGPU.jl b/ext/JACCAMDGPU/JACCAMDGPU.jl index 32e408d..8df67f4 100644 --- a/ext/JACCAMDGPU/JACCAMDGPU.jl +++ b/ext/JACCAMDGPU/JACCAMDGPU.jl @@ -9,7 +9,11 @@ include("array.jl") include("JACCEXPERIMENTAL.jl") using .experimental -function JACC.parallel_for(N::I, f::F, x...) where {I <: Integer, F <: Function} +const AMDGPUBackend = ROCBackend + +JACC.get_backend(::Val{:amdgpu}) = AMDGPUBackend() + +function JACC.parallel_for(::AMDGPUBackend, N::I, f::F, x...) where {I <: Integer, F <: Function} numThreads = 512 threads = min(N, numThreads) blocks = ceil(Int, N / threads) @@ -21,7 +25,7 @@ function JACC.parallel_for(N::I, f::F, x...) where {I <: Integer, F <: Function} end function JACC.parallel_for( - (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} + ::AMDGPUBackend, (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} numThreads = 16 Mthreads = min(M, numThreads) Nthreads = min(N, numThreads) @@ -35,7 +39,7 @@ function JACC.parallel_for( end function JACC.parallel_for( - (L, M, N)::Tuple{I, I, I}, f::F, x...) where { + ::AMDGPUBackend, (L, M, N)::Tuple{I, I, I}, f::F, x...) where { I <: Integer, F <: Function} numThreads = 32 Lthreads = min(L, numThreads) @@ -52,7 +56,7 @@ function JACC.parallel_for( end function JACC.parallel_reduce( - N::I, f::F, x...) where {I <: Integer, F <: Function} + ::AMDGPUBackend, N::I, f::F, x...) where {I <: Integer, F <: Function} numThreads = 512 threads = min(N, numThreads) blocks = ceil(Int, N / threads) @@ -68,7 +72,7 @@ function JACC.parallel_reduce( end function JACC.parallel_reduce( - (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} + ::AMDGPUBackend, (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} numThreads = 16 Mthreads = min(M, numThreads) Nthreads = min(N, numThreads) @@ -389,9 +393,6 @@ function JACC.shared(x::ROCDeviceArray{T,N}) where {T,N} return shmem end - -function __init__() - const JACC.Array = AMDGPU.ROCArray{T, N} where {T, N} -end +JACC.array_type(::AMDGPUBackend) = AMDGPU.ROCArray{T, N} where {T, N} end # module JACCAMDGPU diff --git a/ext/JACCAMDGPU/array.jl b/ext/JACCAMDGPU/array.jl index b826080..baeebcf 100644 --- a/ext/JACCAMDGPU/array.jl +++ b/ext/JACCAMDGPU/array.jl @@ -1,8 +1,8 @@ -function JACC.zeros(T, dims...) +function JACC.zeros(::AMDGPUBackend, T, dims...) return AMDGPU.zeros(T, dims...) end -function JACC.ones(T, dims...) +function JACC.ones(::AMDGPUBackend, T, dims...) return AMDGPU.ones(T, dims...) 
end diff --git a/ext/JACCCUDA/JACCCUDA.jl b/ext/JACCCUDA/JACCCUDA.jl index 20aa6a1..517f3e8 100644 --- a/ext/JACCCUDA/JACCCUDA.jl +++ b/ext/JACCCUDA/JACCCUDA.jl @@ -5,7 +5,6 @@ using JACC, CUDA # overloaded array functions include("array.jl") - include("JACCMULTI.jl") using .multi @@ -13,7 +12,9 @@ using .multi include("JACCEXPERIMENTAL.jl") using .experimental -function JACC.parallel_for(N::I, f::F, x...) where {I <: Integer, F <: Function} +JACC.get_backend(::Val{:cuda}) = CUDABackend() + +function JACC.parallel_for(::CUDABackend, N::I, f::F, x...) where {I <: Integer, F <: Function} #parallel_args = (N, f, x...) #parallel_kargs = cudaconvert.(parallel_args) #parallel_tt = Tuple{Core.Typeof.(parallel_kargs)...} @@ -28,7 +29,7 @@ function JACC.parallel_for(N::I, f::F, x...) where {I <: Integer, F <: Function} end function JACC.parallel_for( - (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} + ::CUDABackend, (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} #To use JACC.shared, it is recommended to use a high number of threads per block to maximize the # potential benefit from using shared memory. #numThreads = 32 @@ -42,7 +43,7 @@ function JACC.parallel_for( end function JACC.parallel_for( - (L, M, N)::Tuple{I, I, I}, f::F, x...) where { + ::CUDABackend, (L, M, N)::Tuple{I, I, I}, f::F, x...) where { I <: Integer, F <: Function} #To use JACC.shared, it is recommended to use a high number of threads per block to maximize the # potential benefit from using shared memory. @@ -58,7 +59,7 @@ function JACC.parallel_for( end function JACC.parallel_reduce( - N::I, f::F, x...) where {I <: Integer, F <: Function} + ::CUDABackend, N::I, f::F, x...) where {I <: Integer, F <: Function} numThreads = 512 threads = min(N, numThreads) blocks = ceil(Int, N / threads) @@ -72,7 +73,7 @@ function JACC.parallel_reduce( end function JACC.parallel_reduce( - (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} + ::CUDABackend, (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} numThreads = 16 Mthreads = min(M, numThreads) Nthreads = min(N, numThreads) @@ -402,9 +403,6 @@ function JACC.shared(x::CuDeviceArray{T,N}) where {T,N} return shmem end - -function __init__() - const JACC.Array = CUDA.CuArray{T, N} where {T, N} -end +JACC.array_type(::CUDABackend) = CUDA.CuArray{T, N} where {T, N} end # module JACCCUDA diff --git a/ext/JACCCUDA/JACCMULTI.jl b/ext/JACCCUDA/JACCMULTI.jl index 73e75a2..c6e94c9 100644 --- a/ext/JACCCUDA/JACCMULTI.jl +++ b/ext/JACCCUDA/JACCMULTI.jl @@ -2,7 +2,7 @@ module multi using JACC, CUDA -function JACC.multi.Array(x::Base.Array{T,N}) where {T,N} +function JACC.multi.Array(::CUDABackend, x::Base.Array{T,N}) where {T,N} ndev = length(devices()) ret = Vector{Any}(undef, 2) @@ -52,7 +52,7 @@ function JACC.multi.Array(x::Base.Array{T,N}) where {T,N} end -function JACC.multi.copy(x::Vector{Any}, y::Vector{Any}) +function JACC.multi.copy(::CUDABackend, x::Vector{Any}, y::Vector{Any}) device!(0) ndev = length(devices()) @@ -74,7 +74,7 @@ function JACC.multi.copy(x::Vector{Any}, y::Vector{Any}) end -function JACC.multi.parallel_for(N::I, f::F, x...) where {I <: Integer, F <: Function} +function JACC.multi.parallel_for(::CUDABackend, N::I, f::F, x...) where {I <: Integer, F <: Function} device!(0) ndev = length(devices()) @@ -98,7 +98,7 @@ function JACC.multi.parallel_for(N::I, f::F, x...) where {I <: Integer, F <: Fun end -function JACC.multi.parallel_reduce(N::I, f::F, x...) 
where {I <: Integer, F <: Function} +function JACC.multi.parallel_reduce(::CUDABackend, N::I, f::F, x...) where {I <: Integer, F <: Function} device!(0) ndev = length(devices()) @@ -139,7 +139,7 @@ function JACC.multi.parallel_reduce(N::I, f::F, x...) where {I <: Integer, F <: return final_rret end -function JACC.multi.parallel_for( +function JACC.multi.parallel_for(::CUDABackend, (M, N)::Tuple{I,I}, f::F, x...) where {I <: Integer, F <: Function} ndev = length(devices()) @@ -165,7 +165,7 @@ function JACC.multi.parallel_for( end -function JACC.multi.parallel_reduce( +function JACC.multi.parallel_reduce(::CUDABackend, (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} ndev = length(devices()) diff --git a/ext/JACCCUDA/array.jl b/ext/JACCCUDA/array.jl index 5cf21e0..53fbf5d 100644 --- a/ext/JACCCUDA/array.jl +++ b/ext/JACCCUDA/array.jl @@ -1,8 +1,8 @@ -function JACC.zeros(T, dims...) +function JACC.zeros(::CUDABackend, T, dims...) return CUDA.zeros(T, dims...) end -function JACC.ones(T, dims...) +function JACC.ones(::CUDABackend, T, dims...) return CUDA.ones(T, dims...) end diff --git a/ext/JACCONEAPI/JACCONEAPI.jl b/ext/JACCONEAPI/JACCONEAPI.jl index b865a75..929146c 100644 --- a/ext/JACCONEAPI/JACCONEAPI.jl +++ b/ext/JACCONEAPI/JACCONEAPI.jl @@ -9,7 +9,9 @@ include("array.jl") include("JACCEXPERIMENTAL.jl") using .experimental -function JACC.parallel_for(N::I, f::F, x...) where {I <: Integer, F <: Function} +JACC.get_backend(::Val{:oneapi}) = oneAPIBackend() + +function JACC.parallel_for(::oneAPIBackend, N::I, f::F, x...) where {I <: Integer, F <: Function} #maxPossibleItems = oneAPI.oneL0.compute_properties(device().maxTotalGroupSize) maxPossibleItems = 256 items = min(N, maxPossibleItems) @@ -22,7 +24,7 @@ function JACC.parallel_for(N::I, f::F, x...) where {I <: Integer, F <: Function} end function JACC.parallel_for( - (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} + ::oneAPIBackend, (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} maxPossibleItems = 16 Mitems = min(M, maxPossibleItems) Nitems = min(N, maxPossibleItems) @@ -33,7 +35,7 @@ function JACC.parallel_for( end function JACC.parallel_for( - (L, M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} + ::oneAPIBackend, (L, M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} maxPossibleItems = 16 Litems = min(M, maxPossibleItems) Mitems = min(M, maxPossibleItems) @@ -47,7 +49,7 @@ function JACC.parallel_for( end function JACC.parallel_reduce( - N::I, f::F, x...) where {I <: Integer, F <: Function} + ::oneAPIBackend, N::I, f::F, x...) where {I <: Integer, F <: Function} numItems = 256 items = min(N, numItems) groups = ceil(Int, N / items) @@ -60,7 +62,7 @@ function JACC.parallel_reduce( end function JACC.parallel_reduce( - (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} + ::oneAPIBackend, (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} numItems = 16 Mitems = min(M, numItems) Nitems = min(N, numItems) @@ -379,8 +381,6 @@ function JACC.shared(x::oneDeviceArray{T,N}) where {T,N} return shmem end -function __init__() - const JACC.Array = oneAPI.oneArray{T, N} where {T, N} -end +JACC.array_type(::oneAPIBackend) = oneAPI.oneArray{T, N} where {T, N} end # module JACCONEAPI diff --git a/ext/JACCONEAPI/array.jl b/ext/JACCONEAPI/array.jl index de479f2..a5a9a0a 100644 --- a/ext/JACCONEAPI/array.jl +++ b/ext/JACCONEAPI/array.jl @@ -1,8 +1,8 @@ -function JACC.zeros(T, dims...) 
+function JACC.zeros(::oneAPIBackend, T, dims...) return oneAPI.zeros(T, dims...) end -function JACC.ones(T, dims...) +function JACC.ones(::oneAPIBackend, T, dims...) return oneAPI.ones(T, dims...) end diff --git a/src/JACC.jl b/src/JACC.jl index 333d2c0..78dd67e 100644 --- a/src/JACC.jl +++ b/src/JACC.jl @@ -1,9 +1,12 @@ -__precompile__(false) module JACC import Atomix: @atomic -# module to set back end preferences + +# module to set backend preferences include("JACCPreferences.jl") + +struct ThreadsBackend end + include("helper.jl") # overloaded array functions include("array.jl") @@ -17,19 +20,22 @@ using .multi include("JACCEXPERIMENTAL.jl") using .experimental +get_backend(::Val{:threads}) = ThreadsBackend() + export Array, @atomic export parallel_for +export parallel_reduce global Array -function parallel_for(N::I, f::F, x...) where {I <: Integer, F <: Function} +function parallel_for(::ThreadsBackend, N::I, f::F, x...) where {I <: Integer, F <: Function} @maybe_threaded for i in 1:N f(i, x...) end end function parallel_for( - (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} + ::ThreadsBackend, (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} @maybe_threaded for j in 1:N for i in 1:M f(i, j, x...) @@ -38,7 +44,7 @@ function parallel_for( end function parallel_for( - (L, M, N)::Tuple{I, I, I}, f::F, x...) where { + ::ThreadsBackend, (L, M, N)::Tuple{I, I, I}, f::F, x...) where { I <: Integer, F <: Function} # only threaded at the first level (no collapse equivalent) @maybe_threaded for k in 1:N @@ -50,7 +56,7 @@ function parallel_for( end end -function parallel_reduce(N::I, f::F, x...) where {I <: Integer, F <: Function} +function parallel_reduce(::ThreadsBackend, N::I, f::F, x...) where {I <: Integer, F <: Function} tmp = zeros(Threads.nthreads()) ret = zeros(1) @maybe_threaded for i in 1:N @@ -63,7 +69,7 @@ function parallel_reduce(N::I, f::F, x...) where {I <: Integer, F <: Function} end function parallel_reduce( - (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} + ::ThreadsBackend, (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} tmp = zeros(Threads.nthreads()) ret = zeros(1) @maybe_threaded for j in 1:N @@ -77,12 +83,43 @@ function parallel_reduce( return ret end +array_type(::ThreadsBackend) = Base.Array{T, N} where {T, N} + function shared(x::Base.Array{T,N}) where {T,N} return x end -function __init__() - const JACC.Array = Base.Array{T, N} where {T, N} +struct Array{T, N} end +(::Type{Array{T, N}})(args...; kwargs...) where {T, N} = + array_type(){T, N}(args...; kwargs...) +(::Type{Array{T}})(args...; kwargs...) where {T} = + array_type(){T}(args...; kwargs...) +(::Type{Array})(args...; kwargs...) = + array_type()(args...; kwargs...) + +default_backend() = get_backend(JACCPreferences._backend_dispatchable) + +array_type() = array_type(default_backend()) + +function parallel_for(N::I, f::F, x...) where {I <: Integer, F <: Function} + return parallel_for(default_backend(), N, f, x...) +end + +function parallel_for( + (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} + return parallel_for(default_backend(), (M, N), f, x...) +end + +function parallel_for((L, M, N)::Tuple{I, I, I}, f::F, x...) where {I <: Integer, F <: Function} + return parallel_for(default_backend(), (L, M, N), f, x...) +end + +function parallel_reduce(N::I, f::F, x...) where {I <: Integer, F <: Function} + return parallel_reduce(default_backend(), N, f, x...) 
+end + +function parallel_reduce((M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} + return parallel_reduce(default_backend(), (M, N), f, x...) end end # module JACC diff --git a/src/JACCMULTI.jl b/src/JACCMULTI.jl index b86be05..8ec15bb 100644 --- a/src/JACCMULTI.jl +++ b/src/JACCMULTI.jl @@ -1,24 +1,49 @@ module multi using JACC +import JACC: ThreadsBackend -function Array(x::Base.Array{T,N}) where {T,N} +function Array(::ThreadsBackend, x::Base.Array{T,N}) where {T,N} return x end +function copy(::ThreadsBackend, x::Vector{Any}, y::Vector{Any}) +end + +function parallel_for(::ThreadsBackend, N::I, f::F, x...) where {I <: Integer, F <: Function} +end + +function parallel_for(::ThreadsBackend, (M, N)::Tuple{I,I}, f::F, x...) where {I <: Integer, F <: Function} +end + +function parallel_reduce(::ThreadsBackend, N::I, f::F, x...) where {I <: Integer, F <: Function} +end + +function parallel_reduce(::ThreadsBackend, (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} +end + +function Array(x::Base.Array{T,N}) where {T,N} + return Array(default_backend(), x) +end + function copy(x::Vector{Any}, y::Vector{Any}) + return copy(default_backend(), x, y) end function parallel_for(N::I, f::F, x...) where {I <: Integer, F <: Function} + return parallel_for(default_backend(), N, f, x...) end -function parallel_for((M, N)::Tuple{I,I}, f::F, x...) where {I <: Integer, F <: Function} +function parallel_for((M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} + return parallel_for(default_backend(), (M, N), f, x...) end function parallel_reduce(N::I, f::F, x...) where {I <: Integer, F <: Function} + return parallel_reduce(default_backend(), N, f, x...) end function parallel_reduce((M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} + return parallel_reduce(default_backend(), (M, N), f, x...) end end # module multi diff --git a/src/JACCPreferences.jl b/src/JACCPreferences.jl index f66ddce..45bfb2c 100644 --- a/src/JACCPreferences.jl +++ b/src/JACCPreferences.jl @@ -16,5 +16,6 @@ function set_backend(new_backend::String) end const backend = @load_preference("backend", "threads") +const _backend_dispatchable = Val{Symbol(backend)}() end # module JACCPreferences diff --git a/src/array.jl b/src/array.jl index 1c1e703..9f7f581 100644 --- a/src/array.jl +++ b/src/array.jl @@ -1,8 +1,12 @@ -function zeros(T, dims...) +function zeros(::ThreadsBackend, T, dims...) return Base.zeros(T, dims...) end -function ones(T, dims...) +function ones(::ThreadsBackend, T, dims...) return Base.ones(T, dims...) end + +zeros(T, dims...) = zeros(default_backend(), T, dims...) + +ones(T, dims...) = ones(default_backend(), T, dims...) From b7ae0d446033d4083732faf9764a00e461a06c48 Mon Sep 17 00:00:00 2001 From: pedrovalerolara Date: Mon, 21 Oct 2024 09:48:53 -0400 Subject: [PATCH 02/14] Added JACC.multi implementation for the AMDGPU backend, non-included the special function for ghost cell management. Tests don't work, we need more than one GPU for this. I tested the test codes in Frontier. It worked well there. Also, the tests do not work on MI100-cousteau, even using the last version of AMDGPU. I changed Project. 
toml to include a more moderm version of AMDGPU.jl which we need for multi GPU support --- Project.toml | 3 +- ext/JACCAMDGPU/JACCAMDGPU.jl | 3 + ext/JACCAMDGPU/JACCMULTI.jl | 508 +++++++++++++++++++++++++++++++++++ ext/JACCCUDA/JACCCUDA.jl | 1 - ext/JACCCUDA/JACCMULTI.jl | 4 + src/JACCMULTI.jl | 3 + test/tests_amdgpu.jl | 41 +++ 7 files changed, 561 insertions(+), 2 deletions(-) create mode 100644 ext/JACCAMDGPU/JACCMULTI.jl diff --git a/Project.toml b/Project.toml index 52629e0..35cdfb6 100644 --- a/Project.toml +++ b/Project.toml @@ -4,6 +4,7 @@ authors = ["pedrovalerolara ", "williamfgc 512 + while ii <= N + tmp += @inbounds red[ii] + ii += 512 + end + elseif (i <= N) + tmp = @inbounds red[i] + end + shared_mem[workitemIdx().x] = tmp + AMDGPU.sync_workgroup() + if (i <= 256) + shared_mem[i] += shared_mem[i + 256] + end + AMDGPU.sync_workgroup() + if (i <= 128) + shared_mem[i] += shared_mem[i + 128] + end + AMDGPU.sync_workgroup() + if (i <= 64) + shared_mem[i] += shared_mem[i + 64] + end + AMDGPU.sync_workgroup() + if (i <= 32) + shared_mem[i] += shared_mem[i + 32] + end + AMDGPU.sync_workgroup() + if (i <= 16) + shared_mem[i] += shared_mem[i + 16] + end + AMDGPU.sync_workgroup() + if (i <= 8) + shared_mem[i] += shared_mem[i + 8] + end + AMDGPU.sync_workgroup() + if (i <= 4) + shared_mem[i] += shared_mem[i + 4] + end + AMDGPU.sync_workgroup() + if (i <= 2) + shared_mem[i] += shared_mem[i + 2] + end + AMDGPU.sync_workgroup() + if (i == 1) + shared_mem[i] += shared_mem[i + 1] + ret[1] = shared_mem[1] + end + AMDGPU.sync_workgroup() + return nothing +end + +function _multi_parallel_reduce_amdgpu_MN((M, N), dev_id, ret, f, x...) + shared_mem = @ROCStaticLocalArray(Float64, 16*16) + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + j = (workgroupIdx().y - 1) * workgroupDim().y + workitemIdx().y + ti = workitemIdx().x + tj = workitemIdx().y + bi = workgroupIdx().x + bj = workgroupIdx().y + + tmp::Float64 = 0.0 + shared_mem[((ti - 1) * 16) + tj] = tmp + + if (i <= M && j <= N) + tmp = @inbounds f(dev_id, i, j, x...) 
+ shared_mem[(ti - 1) * 16 + tj] = tmp + end + AMDGPU.sync_workgroup() + if (ti <= 8 && tj <= 8 && ti + 8 <= M && tj + 8 <= N) + shared_mem[((ti - 1) * 16) + tj] += shared_mem[((ti + 7) * 16) + (tj + 8)] + shared_mem[((ti - 1) * 16) + tj] += shared_mem[((ti - 1) * 16) + (tj + 8)] + shared_mem[((ti - 1) * 16) + tj] += shared_mem[((ti + 7) * 16) + tj] + end + AMDGPU.sync_workgroup() + if (ti <= 4 && tj <= 4 && ti + 4 <= M && tj + 4 <= N) + shared_mem[((ti - 1) * 16) + tj] += shared_mem[((ti + 3) * 16) + (tj + 4)] + shared_mem[((ti - 1) * 16) + tj] += shared_mem[((ti - 1) * 16) + (tj + 4)] + shared_mem[((ti - 1) * 16) + tj] += shared_mem[((ti + 3) * 16) + tj] + end + AMDGPU.sync_workgroup() + if (ti <= 2 && tj <= 2 && ti + 2 <= M && tj + 2 <= N) + shared_mem[((ti - 1) * 16) + tj] += shared_mem[((ti + 1) * 16) + (tj + 2)] + shared_mem[((ti - 1) * 16) + tj] += shared_mem[((ti - 1) * 16) + (tj + 2)] + shared_mem[((ti - 1) * 16) + tj] += shared_mem[((ti + 1) * 16) + tj] + end + AMDGPU.sync_workgroup() + if (ti == 1 && tj == 1 && ti + 1 <= M && tj + 1 <= N) + shared_mem[((ti - 1) * 16) + tj] += shared_mem[ti * 16 + (tj + 1)] + shared_mem[((ti - 1) * 16) + tj] += shared_mem[((ti - 1) * 16) + (tj + 1)] + shared_mem[((ti - 1) * 16) + tj] += shared_mem[ti * 16 + tj] + ret[bi, bj] = shared_mem[((ti - 1) * 16) + tj] + end + return nothing +end + +function _multi_reduce_kernel_amdgpu_MN((M, N), red, ret) + shared_mem = @ROCStaticLocalArray(Float64, 16*16) + i = workitemIdx().x + j = workitemIdx().y + ii = i + jj = j + + tmp::Float64 = 0.0 + shared_mem[(i - 1) * 16 + j] = tmp + + if M > 16 && N > 16 + while ii <= M + jj = workitemIdx().y + while jj <= N + tmp = tmp + @inbounds red[ii, jj] + jj += 16 + end + ii += 16 + end + elseif M > 16 + while ii <= N + tmp = tmp + @inbounds red[ii, jj] + ii += 16 + end + elseif N > 16 + while jj <= N + tmp = tmp + @inbounds red[ii, jj] + jj += 16 + end + elseif M <= 16 && N <= 16 + if i <= M && j <= N + tmp = tmp + @inbounds red[i, j] + end + end + shared_mem[(i - 1) * 16 + j] = tmp + red[i, j] = shared_mem[(i - 1) * 16 + j] + AMDGPU.sync_workgroup() + if (i <= 8 && j <= 8) + if (i + 8 <= M && j + 8 <= N) + shared_mem[((i - 1) * 16) + j] += shared_mem[((i + 7) * 16) + (j + 8)] + end + if (i <= M && j + 8 <= N) + shared_mem[((i - 1) * 16) + j] += shared_mem[((i - 1) * 16) + (j + 8)] + end + if (i + 8 <= M && j <= N) + shared_mem[((i - 1) * 16) + j] += shared_mem[((i + 7) * 16) + j] + end + end + AMDGPU.sync_workgroup() + if (i <= 4 && j <= 4) + if (i + 4 <= M && j + 4 <= N) + shared_mem[((i - 1) * 16) + j] += shared_mem[((i + 3) * 16) + (j + 4)] + end + if (i <= M && j + 4 <= N) + shared_mem[((i - 1) * 16) + j] += shared_mem[((i - 1) * 16) + (j + 4)] + end + if (i + 4 <= M && j <= N) + shared_mem[((i - 1) * 16) + j] += shared_mem[((i + 3) * 16) + j] + end + end + AMDGPU.sync_workgroup() + if (i <= 2 && j <= 2) + if (i + 2 <= M && j + 2 <= N) + shared_mem[((i - 1) * 16) + j] += shared_mem[((i + 1) * 16) + (j + 2)] + end + if (i <= M && j + 2 <= N) + shared_mem[((i - 1) * 16) + j] += shared_mem[((i - 1) * 16) + (j + 2)] + end + if (i + 2 <= M && j <= N) + shared_mem[((i - 1) * 16) + j] += shared_mem[((i + 1) * 16) + j] + end + end + AMDGPU.sync_workgroup() + if (i == 1 && j == 1) + if (i + 1 <= M && j + 1 <= N) + shared_mem[((i - 1) * 16) + j] += shared_mem[i * 16 + (j + 1)] + end + if (i <= M && j + 1 <= N) + shared_mem[((i - 1) * 16) + j] += shared_mem[((i - 1) * 16) + (j + 1)] + end + if (i + 1 <= M && j <= N) + shared_mem[((i - 1) * 16) + j] += shared_mem[i * 16 + j] 
+ end + ret[1] = shared_mem[((i - 1) * 16) + j] + end + return nothing +end + + + + +end # module multi diff --git a/ext/JACCCUDA/JACCCUDA.jl b/ext/JACCCUDA/JACCCUDA.jl index 20aa6a1..4c77d88 100644 --- a/ext/JACCCUDA/JACCCUDA.jl +++ b/ext/JACCCUDA/JACCCUDA.jl @@ -5,7 +5,6 @@ using JACC, CUDA # overloaded array functions include("array.jl") - include("JACCMULTI.jl") using .multi diff --git a/ext/JACCCUDA/JACCMULTI.jl b/ext/JACCCUDA/JACCMULTI.jl index 73e75a2..bdd6ef3 100644 --- a/ext/JACCCUDA/JACCMULTI.jl +++ b/ext/JACCCUDA/JACCMULTI.jl @@ -2,6 +2,10 @@ module multi using JACC, CUDA +function JACC.multi.ndev() + return length(devices()) +end + function JACC.multi.Array(x::Base.Array{T,N}) where {T,N} ndev = length(devices()) diff --git a/src/JACCMULTI.jl b/src/JACCMULTI.jl index b86be05..6dd4c10 100644 --- a/src/JACCMULTI.jl +++ b/src/JACCMULTI.jl @@ -2,6 +2,9 @@ module multi using JACC +function ndev() +end + function Array(x::Base.Array{T,N}) where {T,N} return x end diff --git a/test/tests_amdgpu.jl b/test/tests_amdgpu.jl index 669476e..d2364aa 100644 --- a/test/tests_amdgpu.jl +++ b/test/tests_amdgpu.jl @@ -265,3 +265,44 @@ end C_expected = Float32(2.0) .* ones(Float32, L, M, N) @test Array(C)≈C_expected rtol=1e-5 end + +#@testset "JACC.multi" begin + +# x = ones(1_000) +# y = ones(1_000) +# jx = JACC.multi.Array(x) +# jy = JACC.multi.Array(y) +# jxresult = ones(1_000) +# alpha = 2.0 + +# function seq_axpy(N, alpha, x, y) +# for i in 1:N +# @inbounds x[i] += alpha * y[i] +# end +# end + +# function multi_axpy(dev_id, i, alpha, x, y) +# @inbounds x[dev_id][i] += alpha * y[dev_id][i] +# end + +# function seq_dot(N, x, y) +# r = 0.0 +# for i in 1:N +# @inbounds r += x[i] * y[i] +# end +# return r +# end + +# function multi_dot(dev_id, i, x, y) +# @inbounds x[dev_id][i] * y[dev_id][i] +# end + + #ref_result = seq_axpy(1_000, x, y) + #jresult = JACC.multi.parallel_reduce(1_000, multi_dot, jx[1], jy[1]) +# seq_axpy(1_000, alpha, x, y) +# JACC.multi.parallel_for(1_000, multi_axpy, alpha, jx[1], jy[1]) + + #result = Base.Array(jresult) + + #@test jresult[1]≈ref_result rtol=1e-8 +#end From 1d1c415c26dadc8620da96fb11c0bee3cd824c0b Mon Sep 17 00:00:00 2001 From: pedrovalerolara Date: Mon, 21 Oct 2024 10:15:51 -0400 Subject: [PATCH 03/14] Update JACCMULTI.jl --- ext/JACCAMDGPU/JACCMULTI.jl | 3 --- 1 file changed, 3 deletions(-) diff --git a/ext/JACCAMDGPU/JACCMULTI.jl b/ext/JACCAMDGPU/JACCMULTI.jl index a1cfaf3..e7fb66a 100644 --- a/ext/JACCAMDGPU/JACCMULTI.jl +++ b/ext/JACCAMDGPU/JACCMULTI.jl @@ -502,7 +502,4 @@ function _multi_reduce_kernel_amdgpu_MN((M, N), red, ret) return nothing end - - - end # module multi From 6e04356a4f276e512f530e3cdd7f98ba05c4ecb1 Mon Sep 17 00:00:00 2001 From: William F Godoy Date: Mon, 21 Oct 2024 16:24:59 -0400 Subject: [PATCH 04/14] Update to latest checkout action --- .github/workflows/ci-gpu-AMD.yaml | 8 ++++---- .github/workflows/ci-gpu-NVIDIA.yaml | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ci-gpu-AMD.yaml b/.github/workflows/ci-gpu-AMD.yaml index e9a5c58..49a293b 100644 --- a/.github/workflows/ci-gpu-AMD.yaml +++ b/.github/workflows/ci-gpu-AMD.yaml @@ -36,11 +36,11 @@ jobs: - name: GitHub API Request if: steps.check.outputs.triggered == 'true' id: request - uses: octokit/request-action@v2.1.9 + uses: octokit/request-action@v2.x with: route: ${{github.event.issue.pull_request.url}} env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} # Create a separate PR status pointing at 
GitHub Actions tab URL # just like any other third-party service @@ -67,7 +67,7 @@ jobs: if: steps.check.outputs.triggered == 'true' uses: actions/checkout@v4 with: - token: ${{ secrets.GITHUB_TOKEN }} + token: ${{secrets.GITHUB_TOKEN}} repository: ${{fromJson(steps.request.outputs.data).head.repo.full_name}} ref: ${{steps.pr_data.outputs.branch}} @@ -92,7 +92,7 @@ jobs: if: always() && steps.check.outputs.triggered == 'true' uses: geekdude/github-status-action-v2@v1.1.10 with: - authToken: ${{ secrets.GITHUB_TOKEN }} + authToken: ${{secrets.GITHUB_TOKEN}} context: "ci-GPU-AMD ${{matrix.jobname}}" state: ${{job.status}} sha: ${{fromJson(steps.request.outputs.data).head.sha}} diff --git a/.github/workflows/ci-gpu-NVIDIA.yaml b/.github/workflows/ci-gpu-NVIDIA.yaml index 0b67c93..8ebe935 100644 --- a/.github/workflows/ci-gpu-NVIDIA.yaml +++ b/.github/workflows/ci-gpu-NVIDIA.yaml @@ -36,7 +36,7 @@ jobs: - name: GitHub API Request if: steps.check.outputs.triggered == 'true' id: request - uses: octokit/request-action@v2.1.9 + uses: octokit/request-action@v2.x with: route: ${{github.event.issue.pull_request.url}} env: @@ -49,7 +49,7 @@ jobs: uses: geekdude/github-status-action-v2@v1.1.10 with: authToken: ${{secrets.GITHUB_TOKEN}} - context: "ci-gpu-nvidia-ornl ${{ matrix.jobname }}" + context: "ci-gpu-NVIDIA ${{ matrix.jobname }}" state: "pending" sha: ${{fromJson(steps.request.outputs.data).head.sha}} target_url: https://github.com/${{github.repository}}/actions/runs/${{github.run_id}} @@ -86,7 +86,7 @@ jobs: uses: geekdude/github-status-action-v2@v1.1.10 with: authToken: ${{secrets.GITHUB_TOKEN}} - context: "ci-gpu-nvidia-ornl ${{matrix.jobname}}" + context: "ci-gpu-NVIDIA ${{matrix.jobname}}" state: ${{job.status}} sha: ${{fromJson(steps.request.outputs.data).head.sha}} target_url: https://github.com/${{github.repository}}/actions/runs/${{github.run_id}} From ae4759bddb1e4266ea60568d2dfc1b900ca4ac7e Mon Sep 17 00:00:00 2001 From: Philip Fackler Date: Mon, 28 Oct 2024 09:36:38 -0500 Subject: [PATCH 05/14] Revert AMDGPU to v1.0 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 52629e0..419afe9 100644 --- a/Project.toml +++ b/Project.toml @@ -18,7 +18,7 @@ JACCCUDA = ["CUDA"] JACCONEAPI = ["oneAPI"] [compat] -AMDGPU = "0.8" +AMDGPU = "1.0" Atomix = "0.1" CUDA = "5" Preferences = "1.4.0" From 67bc4989cc0ef2cf28de3489cd1296ff6f480781 Mon Sep 17 00:00:00 2001 From: Philip Fackler Date: Mon, 28 Oct 2024 09:53:53 -0500 Subject: [PATCH 06/14] Remove version from runtests --- test/runtests.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 73df172..b17a5c9 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -11,7 +11,8 @@ const backend = JACC.JACCPreferences.backend include("tests_cuda.jl") elseif backend == "amdgpu" - Pkg.add(; name = "AMDGPU", version = "v0.8.6") + # Pkg.add(; name = "AMDGPU", version = "v0.8.6") + Pkg.add("AMDGPU") println("AMDGPU backend loaded") include("tests_amdgpu.jl") From a58d1ada6d3246eb85e74c13eb03ba86afcaf1f5 Mon Sep 17 00:00:00 2001 From: Philip Fackler Date: Mon, 28 Oct 2024 10:01:36 -0500 Subject: [PATCH 07/14] Allow more recent versions of julia --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 419afe9..599c8f7 100644 --- a/Project.toml +++ b/Project.toml @@ -22,4 +22,4 @@ AMDGPU = "1.0" Atomix = "0.1" CUDA = "5" Preferences = "1.4.0" -julia = "1.9.0" +julia = ">= 1.9.0" 
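A minimal usage sketch of the backend-dispatch layer introduced in PATCH 01, assuming the default threads backend selected through JACCPreferences; the axpy and dot kernels below are illustrative and not part of these patches. The same calls dispatch to the CUDA, AMDGPU, or oneAPI extension methods once the corresponding backend preference is set.

using JACC

# user kernels: the first argument is the (thread- or GPU-) local index
axpy(i, alpha, x, y) = @inbounds x[i] += alpha * y[i]
dotprod(i, x, y) = @inbounds x[i] * y[i]

N = 1_000
x = JACC.ones(Float64, N)    # allocated through array_type(default_backend())
y = JACC.ones(Float64, N)

JACC.parallel_for(N, axpy, 2.0, x, y)         # forwards to parallel_for(default_backend(), ...)
r = JACC.parallel_reduce(N, dotprod, x, y)    # sum reduction; result is the one-element array r
# r[1] == 3.0 * N for this data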
From c327dcaf29737804349f9c212b41ab9584079629 Mon Sep 17 00:00:00 2001 From: pedrovalerolara Date: Tue, 29 Oct 2024 12:10:39 -0400 Subject: [PATCH 08/14] Added stencil-aware funtions for JACC.multi on CUDA back ends. There are still a couple of things to implement, such as the ghost copy functions for 2D arrays as well as capability for 3D Arrays --- ext/JACCCUDA/JACCMULTI.jl | 288 +++++++++++++++++++++++++++++++++++++- src/JACCMULTI.jl | 15 ++ 2 files changed, 300 insertions(+), 3 deletions(-) diff --git a/ext/JACCCUDA/JACCMULTI.jl b/ext/JACCCUDA/JACCMULTI.jl index bdd6ef3..a43800a 100644 --- a/ext/JACCCUDA/JACCMULTI.jl +++ b/ext/JACCCUDA/JACCMULTI.jl @@ -56,24 +56,235 @@ function JACC.multi.Array(x::Base.Array{T,N}) where {T,N} end +function JACC.multi.gArray(x::Base.Array{T,N}) where {T,N} + + ndev = length(devices()) + ret = Vector{Any}(undef, 2) + + if ndims(x) == 1 + + device!(0) + s_array = length(x) + s_arrays = ceil(Int, s_array/ndev) + array_ret = Vector{Any}(undef, ndev) + pointer_ret = Vector{CuDeviceVector{T,CUDA.AS.Global}}(undef, ndev) + + for i in 1:ndev + device!(i-1) + if i == 1 + array_ret[i] = CuArray(x[((i-1)*s_arrays)+1:(i*s_arrays)+1]) + elseif i == ndev + array_ret[i] = CuArray(x[(((i-1)*s_arrays)+1)-1:i*s_arrays]) + else + array_ret[i] = CuArray(x[(((i-1)*s_arrays)+1)-1:(i*s_arrays)+1]) + end + pointer_ret[i] = cudaconvert(array_ret[i]) + end + + device!(0) + cuda_pointer_ret = CuArray(pointer_ret) + ret[1] = cuda_pointer_ret + ret[2] = array_ret + + elseif ndims(x) == 2 + + device!(0) + s_col_array = size(x,2) + s_col_arrays = ceil(Int, s_col_array/ndev) + array_ret = Vector{Any}(undef, ndev) + pointer_ret = Vector{CuDeviceMatrix{T,CUDA.AS.Global}}(undef, ndev) + + for i in 1:ndev + device!(i-1) + array_ret[i] = CuArray(x[:,((i-1)*s_col_arrays)+1:i*s_col_arrays]) + pointer_ret[i] = cudaconvert(array_ret[i]) + end + + device!(0) + + cuda_pointer_ret = CuArray(pointer_ret) + ret[1] = cuda_pointer_ret + ret[2] = array_ret + + end + + return ret + +end + function JACC.multi.copy(x::Vector{Any}, y::Vector{Any}) + + device!(0) + ndev = length(devices()) + + if ndims(x[2][1]) == 1 + + for i in 1:ndev + device!(i-1) + size = length(x[2][i]) + numThreads = 512 + threads = min(size, numThreads) + blocks = ceil(Int, size / threads) + @cuda threads=threads blocks=blocks _multi_copy(i, x[1], y[1]) + end + + for i in 1:ndev + device!(i-1) + synchronize() + end + + elseif ndims(x[2][1]) == 2 + + for i in 1:ndev + device!(i-1) + ssize = size(x[2][i]) + numThreads = 16 + Mthreads = min(ssize[1], numThreads) + Mblocks = ceil(Int, ssize[1] / Mthreads) + Nthreads = min(ssize[2], numThreads) + Nblocks = ceil(Int, ssize[2] / Nthreads) + @cuda threads=(Mthreads,Nthreads) blocks=(Mblocks,Nblocks) _multi_copy_2d(i, x[1], y[1]) + end + + for i in 1:ndev + device!(i-1) + synchronize() + end + + end + + device!(0) + +end + +function JACC.multi.gid(dev_id::I, i::I, ndev::I) where{I <: Integer} + ind = 0 + if dev_id == 1 + ind = i + elseif dev_id == ndev + ind = i + 1 + else + ind = i + 1 + end + return ind +end + + +function JACC.multi.gswap(x::Vector{Any}) + device!(0) ndev = length(devices()) + + if ndims(x[2][1]) == 1 + + #Left to right swapping + for i in 1:ndev-1 + device!(i-1) + tmp = Base.Array(x[2][i]) + size = length(tmp) + ghost_lr = tmp[size-1] + device!(i) + @cuda threads=32 blocks=1 _multi_swap_ghost_lr(i+1, x[1], ndev, size, ghost_lr) + end + + #Right to left swapping + for i in 2:ndev + device!(i-1) + tmp = Base.Array(x[2][i]) + size = length(tmp) + ghost_rl = tmp[2] + 
device!(i-2) + @cuda threads=32 blocks=1 _multi_swap_ghost_rl(i-1, x[1], ndev, size, ghost_rl) + end + + for i in 1:ndev + device!(i-1) + synchronize() + end + + elseif ndims(x[2][1]) == 2 + #Left to right swapping + for i in 1:ndev-1 + device!(i-1) + dim = size(x[2][i]) + tmp = Base.Array(x[2][i][:,dim[2]-1]) + device!(i) + ghost_lr = CuArray(tmp) + numThreads = 512 + threads = min(dim[1], numThreads) + blocks = ceil(Int, dim[1] / threads) + @cuda threads=threads blocks=blocks _multi_swap_2d_ghost_lr(i+1, x[1], ndev, dim[1], ghost_lr) + end + + #Right to left swapping + for i in 2:ndev + device!(i-1) + tmp = Base.Array(x[2][i][:,2]) + device!(i-2) + dim = size(x[2][i-1]) + ghost_rl = CuArray(tmp) + numThreads = 512 + threads = min(dim[1], numThreads) + blocks = ceil(Int, dim[1] / threads) + @cuda threads=threads blocks=blocks _multi_swap_2d_ghost_rl(i-1, x[1], ndev, dim[1], dim[2], ghost_rl) + end + + for i in 1:ndev + device!(i-1) + synchronize() + end + + end + + device!(0) + +end + +function JACC.multi.gcopytoarray(x::Vector{Any}, y::Vector{Any}) + #x is the array and y is the ghost array + device!(0) + ndev = length(devices()) + for i in 1:ndev device!(i-1) - size = length(x[2][i]) + size = length(y[2][i]) numThreads = 512 threads = min(size, numThreads) blocks = ceil(Int, size / threads) - @cuda threads=threads blocks=blocks _multi_copy(i, x[1], y[1]) + @cuda threads=threads blocks=blocks _multi_copy_ghosttoarray(i, x[1], y[1], size, ndev) + #synchronize() end - + for i in 1:ndev device!(i-1) synchronize() end + + device!(0) + +end +function JACC.multi.copytogarray(x::Vector{Any}, y::Vector{Any}) + #x is the ghost array and y is the array + device!(0) + ndev = length(devices()) + + for i in 1:ndev + device!(i-1) + size = length(x[2][i]) + numThreads = 512 + threads = min(size, numThreads) + blocks = ceil(Int, size / threads) + @cuda threads=threads blocks=blocks _multi_copy_arraytoghost(i, x[1], y[1], size, ndev) + #synchronize() + end + + for i in 1:ndev + device!(i-1) + synchronize() + end + device!(0) end @@ -215,6 +426,77 @@ function JACC.multi.parallel_reduce( end +function _multi_copy(dev_id, x, y) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + @inbounds x[dev_id][i] = y[dev_id][i] + return nothing +end + +function _multi_copy_2d(dev_id, x, y) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + @inbounds x[dev_id][i,j] = y[dev_id][i,j] + return nothing +end + +function _multi_copy_ghosttoarray(dev_id, x, y, size, ndev) + #x is the array and y is the ghost array + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if dev_id == 1 && i < size + @inbounds x[dev_id][i] = y[dev_id][i] + elseif dev_id == ndev && i > 1 + @inbounds x[dev_id][i-1] = y[dev_id][i] + elseif i > 1 && i < size + @inbounds x[dev_id][i-1] = y[dev_id][i] + end + return nothing +end + +function _multi_copy_arraytoghost(dev_id, x, y, size, ndev) + #x is the ghost array and y is the array + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if dev_id == 1 && i < size + @inbounds x[dev_id][i] = y[dev_id][i] + elseif dev_id == ndev && i < size + @inbounds x[dev_id][i+1] = y[dev_id][i] + elseif i > 1 && i < size + @inbounds x[dev_id][i] = y[dev_id][i] + end + return nothing +end + +function _multi_swap_ghost_lr(dev_id, x, ndev, size, ghost) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if i == 1 + x[dev_id][i] = ghost + end + return nothing +end + +function _multi_swap_2d_ghost_lr(dev_id, x, ndev, size, ghost) + i = 
(blockIdx().x - 1) * blockDim().x + threadIdx().x + if i < size + 1 + x[dev_id][i, 1] = ghost[i] + end + return nothing +end + +function _multi_swap_ghost_rl(dev_id, x, ndev, size, ghost) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if i == 1 + x[dev_id][size] = ghost + end + return nothing +end + +function _multi_swap_2d_ghost_rl(dev_id, x, ndev, size, col, ghost) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if i < size + 1 + x[dev_id][i, col] = ghost[i] + end + return nothing +end + function _multi_parallel_for_cuda(N, dev_id, f, x...) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x if i <= N diff --git a/src/JACCMULTI.jl b/src/JACCMULTI.jl index 6dd4c10..3457d3c 100644 --- a/src/JACCMULTI.jl +++ b/src/JACCMULTI.jl @@ -9,6 +9,21 @@ function Array(x::Base.Array{T,N}) where {T,N} return x end +function gArray(x::Base.Array{T,N}) where {T,N} +end + +function gid(dev_id::I, i::I, ndev::I) where{I <: Integer} +end + +function gswap(x::Vector{Any}) +end + +function gcopytoarray(x::Vector{Any}, y::Vector{Any}) +end + +function copytogarray(x::Vector{Any}, y::Vector{Any}) +end + function copy(x::Vector{Any}, y::Vector{Any}) end From ffe94481ca2397af194c4d82a5eee3ddf618c013 Mon Sep 17 00:00:00 2001 From: Philip Fackler Date: Wed, 30 Oct 2024 10:45:53 -0500 Subject: [PATCH 09/14] Fix errors and restrict version --- Project.toml | 2 +- ext/JACCAMDGPU/JACCMULTI.jl | 1 + test/runtests.jl | 4 +- test/tests_amdgpu.jl | 104 ++++++++++++++++++------------------ 4 files changed, 56 insertions(+), 55 deletions(-) diff --git a/Project.toml b/Project.toml index 599c8f7..6e04426 100644 --- a/Project.toml +++ b/Project.toml @@ -18,7 +18,7 @@ JACCCUDA = ["CUDA"] JACCONEAPI = ["oneAPI"] [compat] -AMDGPU = "1.0" +AMDGPU = "0.8" Atomix = "0.1" CUDA = "5" Preferences = "1.4.0" diff --git a/ext/JACCAMDGPU/JACCMULTI.jl b/ext/JACCAMDGPU/JACCMULTI.jl index c5aefce..8af4bd3 100644 --- a/ext/JACCAMDGPU/JACCMULTI.jl +++ b/ext/JACCAMDGPU/JACCMULTI.jl @@ -1,6 +1,7 @@ module multi using JACC, AMDGPU +using JACCAMDGPU: AMDGPUBackend function JACC.multi.ndev(::AMDGPUBackend) return length(AMDGPU.devices()) diff --git a/test/runtests.jl b/test/runtests.jl index b17a5c9..e9a04e7 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -11,8 +11,8 @@ const backend = JACC.JACCPreferences.backend include("tests_cuda.jl") elseif backend == "amdgpu" - # Pkg.add(; name = "AMDGPU", version = "v0.8.6") - Pkg.add("AMDGPU") + Pkg.add(; name = "AMDGPU", version = "v0.8.6") + # Pkg.add("AMDGPU") println("AMDGPU backend loaded") include("tests_amdgpu.jl") diff --git a/test/tests_amdgpu.jl b/test/tests_amdgpu.jl index d2364aa..61a6742 100644 --- a/test/tests_amdgpu.jl +++ b/test/tests_amdgpu.jl @@ -147,57 +147,57 @@ end # result = Array(jresult) # @test result[1]≈ref_result rtol=1e-8 - x = ones(1_000) - y = ones(1_000) - y1 = y*2 - jx = JACC.ones(1_000) - jy = JACC.ones(1_000) - jy1 = jy*2 - alpha = 2.0 - - function seq_axpy(N, alpha, x, y) - for i in 1:N - @inbounds x[i] += alpha * y[i] - end - end - - function seq_dot(N, x, y) - r = 0.0 - for i in 1:N - @inbounds r += x[i] * y[i] - end - return r - end - function seq_scal(N, alpha, x) - for i in 1:N - @inbounds x[i] = alpha * x[i] - end - end - - function seq_asum(N, x) - r = 0.0 - for i in 1:N - @inbounds r += abs(x[i]) - end - return r - end - - function seq_nrm2(N, x) - sum_sq = 0.0 - for i in 1:N - @inbounds sum_sq += x[i]*x[i] - end - r = sqrt(sum_sq) - return r - end - - function seq_swap(N, x, y1) - for i in 1:N - @inbounds t = x[i] - @inbounds x[i] = 
y1[i] - @inbounds y1[i] = t - end - end +# x = ones(1_000) +# y = ones(1_000) +# y1 = y*2 +# jx = JACC.ones(1_000) +# jy = JACC.ones(1_000) +# jy1 = jy*2 +# alpha = 2.0 +# +# function seq_axpy(N, alpha, x, y) +# for i in 1:N +# @inbounds x[i] += alpha * y[i] +# end +# end +# +# function seq_dot(N, x, y) +# r = 0.0 +# for i in 1:N +# @inbounds r += x[i] * y[i] +# end +# return r +# end +# function seq_scal(N, alpha, x) +# for i in 1:N +# @inbounds x[i] = alpha * x[i] +# end +# end +# +# function seq_asum(N, x) +# r = 0.0 +# for i in 1:N +# @inbounds r += abs(x[i]) +# end +# return r +# end +# +# function seq_nrm2(N, x) +# sum_sq = 0.0 +# for i in 1:N +# @inbounds sum_sq += x[i]*x[i] +# end +# r = sqrt(sum_sq) +# return r +# end +# +# function seq_swap(N, x, y1) +# for i in 1:N +# @inbounds t = x[i] +# @inbounds x[i] = y1[i] +# @inbounds y1[i] = t +# end +# end # ref_result = seq_axpy(1_000, alpha, x, y) # ref_result = seq_dot(1_000, x, y) @@ -229,7 +229,7 @@ end # @test x == Array(jx) # @test y1 == Array(jy1) -end +# end @testset "Add-2D" begin function add!(i, j, A, B, C) From 7ca63bbb74ff6140a1c1661be3a5e88ffd92efa8 Mon Sep 17 00:00:00 2001 From: William F Godoy Date: Wed, 30 Oct 2024 14:31:45 -0400 Subject: [PATCH 10/14] Bump Julia to 1.10 on AMD GPU CI --- .github/workflows/ci-gpu-AMD.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci-gpu-AMD.yaml b/.github/workflows/ci-gpu-AMD.yaml index 49a293b..815f26e 100644 --- a/.github/workflows/ci-gpu-AMD.yaml +++ b/.github/workflows/ci-gpu-AMD.yaml @@ -19,7 +19,7 @@ jobs: strategy: fail-fast: false matrix: - jobname: [ROCM6-JULIA1_9_1-AMDGPU0_8_6] + jobname: [ROCM6-JULIA1_10_4-AMDGPU0_8_6] steps: # Only trigger CI for certain "actors" (those commenting the PR, not the PR originator) @@ -75,7 +75,7 @@ jobs: if: steps.check.outputs.triggered == 'true' run: | source /etc/profile.d/lmod.sh - module load julia/1.9.1 + module load julia/1.10.4 module load rocm julia --project -e 'using Pkg; Pkg.instantiate()' julia --project -e 'using JACC.JACCPreferences; JACCPreferences.set_backend("AMDGPU")' @@ -84,7 +84,7 @@ jobs: if: steps.check.outputs.triggered == 'true' run: | source /etc/profile.d/lmod.sh - module load julia/1.9.1 + module load julia/1.10.4 module load rocm julia --project -e 'using Pkg; Pkg.test()' From c286f946e15603415dd42c205893905f3a8cf868 Mon Sep 17 00:00:00 2001 From: Philip Fackler Date: Wed, 30 Oct 2024 14:31:17 -0500 Subject: [PATCH 11/14] Added range checks to parallel_for implementations --- ext/JACCAMDGPU/JACCAMDGPU.jl | 18 ++++++++++++------ ext/JACCCUDA/JACCCUDA.jl | 22 +++++++++++++--------- ext/JACCONEAPI/JACCONEAPI.jl | 18 ++++++++++++------ 3 files changed, 37 insertions(+), 21 deletions(-) diff --git a/ext/JACCAMDGPU/JACCAMDGPU.jl b/ext/JACCAMDGPU/JACCAMDGPU.jl index 2b901c6..35f69d0 100644 --- a/ext/JACCAMDGPU/JACCAMDGPU.jl +++ b/ext/JACCAMDGPU/JACCAMDGPU.jl @@ -23,7 +23,7 @@ function JACC.parallel_for(::AMDGPUBackend, N::I, f::F, x...) where {I <: Intege # shmem_size = attribute(device(),CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK) # We must know how to get the max shared memory to be used in AMDGPU as it is done in CUDA shmem_size = 2 * threads * sizeof(Float64) - @roc groupsize = threads gridsize = blocks shmem = shmem_size _parallel_for_amdgpu(f, x...) + @roc groupsize = threads gridsize = blocks shmem = shmem_size _parallel_for_amdgpu(N, f, x...) 
AMDGPU.synchronize() end @@ -37,7 +37,7 @@ function JACC.parallel_for( # shmem_size = attribute(device(),CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK) # We must know how to get the max shared memory to be used in AMDGPU as it is done in CUDA shmem_size = 2 * Mthreads * Nthreads * sizeof(Float64) - @roc groupsize = (Mthreads, Nthreads) gridsize = (Mblocks, Nblocks) shmem = shmem_size _parallel_for_amdgpu_MN(f, x...) + @roc groupsize = (Mthreads, Nthreads) gridsize = (Mblocks, Nblocks) shmem = shmem_size _parallel_for_amdgpu_MN((M,N), f, x...) AMDGPU.synchronize() end @@ -54,7 +54,7 @@ function JACC.parallel_for( # shmem_size = attribute(device(),CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK) # We must know how to get the max shared memory to be used in AMDGPU as it is done in CUDA shmem_size = 2 * Lthreads * Mthreads * Nthreads * sizeof(Float64) - @roc groupsize = (Lthreads, Mthreads, Nthreads) gridsize = (Lblocks, Mblocks, Nblocks) shmem = shmem_size _parallel_for_amdgpu_LMN(f, x...) + @roc groupsize = (Lthreads, Mthreads, Nthreads) gridsize = (Lblocks, Mblocks, Nblocks) shmem = shmem_size _parallel_for_amdgpu_LMN((L,M,N), f, x...) AMDGPU.synchronize() end @@ -92,23 +92,29 @@ function JACC.parallel_reduce( return rret end -function _parallel_for_amdgpu(f, x...) +function _parallel_for_amdgpu(N, f, x...) i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + i > N && return nothing f(i, x...) return nothing end -function _parallel_for_amdgpu_MN(f, x...) +function _parallel_for_amdgpu_MN((M,N), f, x...) i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x j = (workgroupIdx().y - 1) * workgroupDim().y + workitemIdx().y + i > M && return nothing + j > N && return nothing f(i, j, x...) return nothing end -function _parallel_for_amdgpu_LMN(f, x...) +function _parallel_for_amdgpu_LMN((L,M,N), f, x...) i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x j = (workgroupIdx().y - 1) * workgroupDim().y + workitemIdx().y k = (workgroupIdx().z - 1) * workgroupDim().z + workitemIdx().z + i > L && return nothing + j > M && return nothing + k > N && return nothing f(i, j, k, x...) return nothing end diff --git a/ext/JACCCUDA/JACCCUDA.jl b/ext/JACCCUDA/JACCCUDA.jl index 517f3e8..f3fd39c 100644 --- a/ext/JACCCUDA/JACCCUDA.jl +++ b/ext/JACCCUDA/JACCCUDA.jl @@ -39,7 +39,7 @@ function JACC.parallel_for( Mblocks = ceil(Int, M / Mthreads) Nblocks = ceil(Int, N / Nthreads) shmem_size = attribute(device(),CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK) - CUDA.@sync @cuda threads = (Mthreads, Nthreads) blocks = (Mblocks, Nblocks) shmem = shmem_size _parallel_for_cuda_MN(f, x...) + CUDA.@sync @cuda threads = (Mthreads, Nthreads) blocks = (Mblocks, Nblocks) shmem = shmem_size _parallel_for_cuda_MN((M,N), f, x...) end function JACC.parallel_for( @@ -55,7 +55,7 @@ function JACC.parallel_for( Mblocks = ceil(Int, M / Mthreads) Nblocks = ceil(Int, N / Nthreads) shmem_size = attribute(device(),CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK) - CUDA.@sync @cuda threads = (Lthreads, Mthreads, Nthreads) blocks = (Lblocks, Mblocks, Nblocks) shmem = shmem_size _parallel_for_cuda_LMN(f, x...) + CUDA.@sync @cuda threads = (Lthreads, Mthreads, Nthreads) blocks = (Lblocks, Mblocks, Nblocks) shmem = shmem_size _parallel_for_cuda_LMN((L,M,N), f, x...) end function JACC.parallel_reduce( @@ -93,23 +93,27 @@ end function _parallel_for_cuda(N, f, x...) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - if i <= N - f(i, x...) - end + i > N && return nothing + f(i, x...) 
return nothing end -function _parallel_for_cuda_MN(f, x...) - j = (blockIdx().x - 1) * blockDim().x + threadIdx().x - i = (blockIdx().y - 1) * blockDim().y + threadIdx().y +function _parallel_for_cuda_MN((M,N), f, x...) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + i > M && return nothing + j > N && return nothing f(i, j, x...) return nothing end -function _parallel_for_cuda_LMN(f, x...) +function _parallel_for_cuda_LMN((L,M,N), f, x...) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x j = (blockIdx().y - 1) * blockDim().y + threadIdx().y k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + i > L && return nothing + j > M && return nothing + k > N && return nothing f(i, j, k, x...) return nothing end diff --git a/ext/JACCONEAPI/JACCONEAPI.jl b/ext/JACCONEAPI/JACCONEAPI.jl index 929146c..8d9ad14 100644 --- a/ext/JACCONEAPI/JACCONEAPI.jl +++ b/ext/JACCONEAPI/JACCONEAPI.jl @@ -20,7 +20,7 @@ function JACC.parallel_for(::oneAPIBackend, N::I, f::F, x...) where {I <: Intege # We must know how to get the max shared memory to be used in oneAPI as it is done in CUDA #shmem_size = 2 * threads * sizeof(Float64) #oneAPI.@sync @oneapi items = items groups = groups shmem = shmem_size _parallel_for_oneapi(f, x...) - oneAPI.@sync @oneapi items = items groups = groups _parallel_for_oneapi(f, x...) + oneAPI.@sync @oneapi items = items groups = groups _parallel_for_oneapi(N, f, x...) end function JACC.parallel_for( @@ -30,7 +30,7 @@ function JACC.parallel_for( Nitems = min(N, maxPossibleItems) Mgroups = ceil(Int, M / Mitems) Ngroups = ceil(Int, N / Nitems) - oneAPI.@sync @oneapi items=(Mitems, Nitems) groups=(Mgroups, Ngroups) _parallel_for_oneapi_MN( + oneAPI.@sync @oneapi items=(Mitems, Nitems) groups=(Mgroups, Ngroups) _parallel_for_oneapi_MN((M,N), f, x...) end @@ -44,7 +44,7 @@ function JACC.parallel_for( Mgroups = ceil(Int, M / Mitems) Ngroups = ceil(Int, N / Nitems) oneAPI.@sync @oneapi items=(Litems, Mitems, Nitems) groups=( - Lgroups, Mgroups, Ngroups) _parallel_for_oneapi_LMN( + Lgroups, Mgroups, Ngroups) _parallel_for_oneapi_LMN((L,M,N), f, x...) end @@ -77,23 +77,29 @@ function JACC.parallel_reduce( return rret end -function _parallel_for_oneapi(f, x...) +function _parallel_for_oneapi(N, f, x...) i = get_global_id() + i > N && return nothing f(i, x...) return nothing end -function _parallel_for_oneapi_MN(f, x...) +function _parallel_for_oneapi_MN((M,N), f, x...) j = get_global_id(0) i = get_global_id(1) + i > M && return nothing + j > N && return nothing f(i, j, x...) return nothing end -function _parallel_for_oneapi_LMN(f, x...) +function _parallel_for_oneapi_LMN((L,M,N), f, x...) i = get_global_id(0) j = get_global_id(1) k = get_global_id(2) + i > L && return nothing + j > M && return nothing + k > N && return nothing f(i, j, k, x...) return nothing end From 242418e0fe035ccccdc4c6d3f22fc3c51f0435e7 Mon Sep 17 00:00:00 2001 From: pedrovalerolara Date: Thu, 31 Oct 2024 11:30:54 -0400 Subject: [PATCH 12/14] Added ghost cells support for JACC.multi on AMDGPU backend. 
There is still work to do to support bidimensional and tridimensional arrays --- ext/JACCAMDGPU/JACCMULTI.jl | 322 +++++++++++++++++++++++++++++++++++- ext/JACCCUDA/JACCMULTI.jl | 15 ++ 2 files changed, 335 insertions(+), 2 deletions(-) diff --git a/ext/JACCAMDGPU/JACCMULTI.jl b/ext/JACCAMDGPU/JACCMULTI.jl index 8af4bd3..c42cf15 100644 --- a/ext/JACCAMDGPU/JACCMULTI.jl +++ b/ext/JACCAMDGPU/JACCMULTI.jl @@ -69,8 +69,255 @@ function JACC.multi.Array(::AMDGPUBackend, x::Base.Array{T,N}) where {T,N} end +function JACC.multi.gArray(::AMDGPUBackend, x::Base.Array{T,N}) where {T,N} + + ndev = length(AMDGPU.devices()) + ret = Vector{Any}(undef, 2) + + if ndims(x) == 1 + + AMDGPU.device!(AMDGPU.device(1)) + s_array = length(x) + s_arrays = ceil(Int, s_array/ndev) + array_ret = Vector{Any}(undef, ndev) + pointer_ret = Vector{AMDGPU.Device.ROCDeviceVector{T,AMDGPU.AS.Global}}(undef, ndev) + + for i in 1:ndev + AMDGPU.device!(AMDGPU.device(i)) + if i == 1 + array_ret[i] = ROCArray(x[((i-1)*s_arrays)+1:(i*s_arrays)+1]) + elseif i == ndev + array_ret[i] = ROCArray(x[(((i-1)*s_arrays)+1)-1:i*s_arrays]) + else + array_ret[i] = ROCArray(x[(((i-1)*s_arrays)+1)-1:(i*s_arrays)+1]) + end + pointer_ret[i] = AMDGPU.rocconvert(array_ret[i]) + end + + AMDGPU.device!(AMDGPU.device(1)) + #amdgpu_pointer_ret = ROCArray(pointer_ret) + amdgpu_pointer_ret = get_portable_rocarray(pointer_ret) + copyto!(amdgpu_pointer_ret, pointer_ret) + ret[1] = amdgpu_pointer_ret + ret[2] = array_ret + + elseif ndims(x) == 2 + + AMDGPU.device!(AMDGPU.device(1)) + #s_row_array = size(x,1) + s_col_array = size(x,2) + s_col_arrays = ceil(Int, s_col_array/ndev) + #println(s_col_arrays) + array_ret = Vector{Any}(undef, ndev) + pointer_ret = Vector{AMDGPU.Device.ROCDeviceMatrix{T,1}}(undef, ndev) + + i_col_arrays = floor(Int, s_col_array/ndev) + for i in 1:ndev + AMDGPU.device!(AMDGPU.device(i)) + if i == 1 + array_ret[i] = ROCArray(x[:,((i-1)*s_col_arrays)+1:i*s_col_arrays+1]) + elseif i == ndev + array_ret[i] = ROCArray(x[:,(((i-1)*s_col_arrays)+1)-1:i*s_col_arrays]) + else + array_ret[i] = ROCArray(x[:,(((i-1)*s_col_arrays)+1)-1:i*s_col_arrays+1]) + end + pointer_ret[i] = AMDGPU.rocconvert(array_ret[i]) + end + + AMDGPU.device!(AMDGPU.device(1)) + #amdgpu_pointer_ret = ROCArray(pointer_ret) + amdgpu_pointer_ret = get_portable_rocarray(pointer_ret) + copyto!(amdgpu_pointer_ret, pointer_ret) + ret[1] = amdgpu_pointer_ret + ret[2] = array_ret + + end + + return ret + +end + function JACC.multi.copy(::AMDGPUBackend, x::Vector{Any}, y::Vector{Any}) + AMDGPU.device!(AMDGPU.device(1)) + ndev = length(AMDGPU.devices()) + + if ndims(x[2][1]) == 1 + + for i in 1:ndev + AMDGPU.device!(AMDGPU.device(i)) + size = length(x[2][i]) + numThreads = 512 + threads = min(size, numThreads) + blocks = ceil(Int, size / threads) + @roc groupsize=threads gridsize=blocks _multi_copy(i, x[1], y[1]) + end + + for i in 1:ndev + AMDGPU.device!(AMDGPU.device(i)) + AMDGPU.synchronize() + end + + elseif ndims(x[2][1]) == 2 + + for i in 1:ndev + AMDGPU.device!(AMDGPU.device(i)) + ssize = size(x[2][i]) + numThreads = 16 + Mthreads = min(ssize[1], numThreads) + Mblocks = ceil(Int, ssize[1] / Mthreads) + Nthreads = min(ssize[2], numThreads) + Nblocks = ceil(Int, ssize[2] / Nthreads) + @roc groupsize=(Mthreads,Nthreads) gridsize=(Mblocks,Nblocks) _multi_copy_2d(i, x[1], y[1]) + end + + for i in 1:ndev + AMDGPU.device!(AMDGPU.device(i)) + AMDGPU.synchronize() + end + + end + + AMDGPU.device!(AMDGPU.device(1)) + +end + +function JACC.multi.gid(::AMDGPUBackend, dev_id::I, i::I, 
ndev::I) where{I <: Integer} + + ind = 0 + + if dev_id == 1 + ind = i + elseif dev_id == ndev + ind = i + 1 + else + ind = i + 1 + end + + return ind +end + +function JACC.multi.gid(::AMDGPUBackend, dev_id::I, (i,j)::Tuple{I,I}, ndev::I) where{I <: Integer} + + ind = (0, 0) + + if dev_id == 1 + ind = (i, j) + elseif dev_id == ndev + ind = (i, j+1) + else + ind = (i, j+1) + end + + return ind + +end + +function JACC.multi.gswap(::AMDGPUBackend, x::Vector{Any}) + + AMDGPU.device!(AMDGPU.device(1)) + ndev = length(AMDGPU.devices()) + + if ndims(x[2][1]) == 1 + + #Left to right swapping + for i in 1:ndev-1 + AMDGPU.device!(AMDGPU.device(i)) + tmp = Base.Array(x[2][i]) + size = length(tmp) + ghost_lr = tmp[size-1] + AMDGPU.device!(AMDGPU.device(i+1)) + @roc groupsize=32 gridsize=1 _multi_swap_ghost_lr(i+1, x[1], ndev, size, ghost_lr) + end + + #Right to left swapping + for i in 2:ndev + AMDGPU.device!(AMDGPU.device(i)) + tmp = Base.Array(x[2][i]) + if (i - 1) == 1 + size = length(tmp) - 1 + else + size = length(tmp) + end + ghost_rl = tmp[2] + AMDGPU.device!(AMDGPU.device(i-1)) + @roc groupsize=32 gridsize=1 _multi_swap_ghost_rl(i-1, x[1], ndev, size, ghost_rl) + end + + for i in 1:ndev + AMDGPU.device!(AMDGPU.device(i)) + AMDGPU.synchronize() + end + + elseif ndims(x[2][1]) == 2 + + #Left to right swapping + for i in 1:ndev-1 + AMDGPU.device!(AMDGPU.device(i)) + dim = size(x[2][i]) + tmp = Base.Array(x[2][i][:,dim[2]-1]) + AMDGPU.device!(AMDGPU.device(i+1)) + ghost_lr = ROCArray(tmp) + numThreads = 512 + threads = min(dim[1], numThreads) + blocks = ceil(Int, dim[1] / threads) + #x[2][i+1][:,1] = ghost_lr + @roc groupsize=threads gridsize=blocks _multi_swap_2d_ghost_lr(i+1, x[1], ndev, dim[1], ghost_lr) + #AMDGPU.synchronize() + end + + #Right to left swapping + for i in 2:ndev + AMDGPU.device!(AMDGPU.device(i)) + tmp = Base.Array(x[2][i][:,2]) + AMDGPU.device!(AMDGPU.device(i-1)) + dim = size(x[2][i-1]) + ghost_rl = ROCArray(tmp) + numThreads = 512 + threads = min(dim[1], numThreads) + blocks = ceil(Int, dim[1] / threads) + @roc groupsize=threads gridsize=blocks _multi_swap_2d_ghost_rl(i-1, x[1], ndev, dim[1], dim[2], ghost_rl) + #AMDGPU.synchronize() + end + + for i in 1:ndev + AMDGPU.device!(AMDGPU.device(i)) + AMDGPU.synchronize() + end + + end + + AMDGPU.device!(AMDGPU.device(1)) + +end + +function JACC.multi.gcopytoarray(::AMDGPUBackend, x::Vector{Any}, y::Vector{Any}) + + #x is the array and y is the ghost array + AMDGPU.device!(AMDGPU.device(1)) + ndev = length(AMDGPU.devices()) + + for i in 1:ndev + AMDGPU.device!(AMDGPU.device(i)) + size = length(y[2][i]) + numThreads = 512 + threads = min(size, numThreads) + blocks = ceil(Int, size / threads) + @roc groupsize=threads gridsize=blocks _multi_copy_ghosttoarray(i, x[1], y[1], size, ndev) + end + + for i in 1:ndev + AMDGPU.device!(AMDGPU.device(i)) + AMDGPU.synchronize() + end + + AMDGPU.device!(AMDGPU.device(1)) + +end + +function JACC.multi.copytogarray(::AMDGPUBackend, x::Vector{Any}, y::Vector{Any}) + + #x is the ghost array and y is the array AMDGPU.device!(AMDGPU.device(1)) ndev = length(AMDGPU.devices()) @@ -80,8 +327,7 @@ function JACC.multi.copy(::AMDGPUBackend, x::Vector{Any}, y::Vector{Any}) numThreads = 512 threads = min(size, numThreads) blocks = ceil(Int, size / threads) - @roc groupsize=threads gridsize=blocks _multi_copy(i, x[1], y[1]) - #AMDGPU.synchronize() + @roc groupsize=threads gridsize=blocks _multi_copy_arraytoghost(i, x[1], y[1], size, ndev) end for i in 1:ndev @@ -248,6 +494,78 @@ function 
JACC.multi.parallel_reduce(::AMDGPUBackend, end +function _multi_copy(dev_id, x, y) + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + @inbounds x[dev_id][i] = y[dev_id][i] + return nothing +end + +function _multi_copy_2d(dev_id, x, y) + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + j = (workgroupIdx().y - 1) * workgroupDim().y + workitemIdx().y + @inbounds x[dev_id][i,j] = y[dev_id][i,j] + return nothing +end + +function _multi_copy_ghosttoarray(dev_id, x, y, size, ndev) + #x is the array and y is the ghost array + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + if dev_id == 1 && i < size + @inbounds x[dev_id][i] = y[dev_id][i] + elseif dev_id == ndev && i > 1 + @inbounds x[dev_id][i-1] = y[dev_id][i] + elseif i > 1 && i < size + @inbounds x[dev_id][i-1] = y[dev_id][i] + end + return nothing +end + +function _multi_copy_arraytoghost(dev_id, x, y, size, ndev) + #x is the ghost array and y is the array + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + if dev_id == 1 && i < size + @inbounds x[dev_id][i] = y[dev_id][i] + elseif dev_id == ndev && i < size + @inbounds x[dev_id][i+1] = y[dev_id][i] + elseif i > 1 && i < size + @inbounds x[dev_id][i] = y[dev_id][i] + end + return nothing +end + +function _multi_swap_ghost_lr(dev_id, x, ndev, size, ghost) + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + if i == 1 + x[dev_id][i] = ghost + end + return nothing +end + +function _multi_swap_2d_ghost_lr(dev_id, x, ndev, size, ghost) + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + if i < size + 1 + x[dev_id][i, 1] = ghost[i] + end + return nothing +end + +function _multi_swap_ghost_rl(dev_id, x, ndev, size, ghost) + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + if i == 1 + #x[dev_id][120] = ghost + x[dev_id][size] = ghost + end + return nothing +end + +function _multi_swap_2d_ghost_rl(dev_id, x, ndev, size, col, ghost) + i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x + if i < size + 1 + x[dev_id][i, col] = ghost[i] + end + return nothing +end + function _multi_parallel_for_amdgpu(N, dev_id, f, x...) i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x if i <= N diff --git a/ext/JACCCUDA/JACCMULTI.jl b/ext/JACCCUDA/JACCMULTI.jl index 50f855b..85a0375 100644 --- a/ext/JACCCUDA/JACCMULTI.jl +++ b/ext/JACCCUDA/JACCMULTI.jl @@ -169,6 +169,21 @@ function JACC.multi.gid(::CUDABackend, dev_id::I, i::I, ndev::I) where{I <: Inte return ind end +function JACC.multi.gid(::CUDABackend, dev_id::I, (i,j)::Tuple{I,I}, ndev::I) where{I <: Integer} + + ind = (0, 0) + + if dev_id == 1 + ind = (i, j) + elseif dev_id == ndev + ind = (i, j+1) + else + ind = (i, j+1) + end + + return ind + +end function JACC.multi.gswap(::CUDABackend, x::Vector{Any}) From bd447d148cf6a1865c7f263209154239943accd6 Mon Sep 17 00:00:00 2001 From: pedrovalerolara Date: Thu, 31 Oct 2024 16:12:33 -0400 Subject: [PATCH 13/14] Fixed JACCMULTI module for the new precompiling code. 
Note that we need AMDGPU version 1.0 or above for JACC.multi --- Project.toml | 2 +- src/JACCMULTI.jl | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/Project.toml b/Project.toml index 6e04426..599c8f7 100644 --- a/Project.toml +++ b/Project.toml @@ -18,7 +18,7 @@ JACCCUDA = ["CUDA"] JACCONEAPI = ["oneAPI"] [compat] -AMDGPU = "0.8" +AMDGPU = "1.0" Atomix = "0.1" CUDA = "5" Preferences = "1.4.0" diff --git a/src/JACCMULTI.jl b/src/JACCMULTI.jl index 2260d94..fb0a7e1 100644 --- a/src/JACCMULTI.jl +++ b/src/JACCMULTI.jl @@ -41,51 +41,51 @@ function parallel_reduce(::ThreadsBackend, (M, N)::Tuple{I, I}, f::F, x...) wher end function ndev() - return ndev(default_backend()) + return ndev(JACC.default_backend()) end function Array(x::Base.Array{T,N}) where {T,N} - return Array(default_backend(), x) + return Array(JACC.default_backend(), x) end function gArray(x::Base.Array{T,N}) where {T,N} - return gArray(default_backend(), x) + return gArray(JACC.default_backend(), x) end function gid(dev_id::I, i::I, ndev::I) where{I <: Integer} - return gid(default_backend(), dev_id, i, ndev) + return gid(JACC.default_backend(), dev_id, i, ndev) end function gswap(x::Vector{Any}) - return gswap(default_backend(), x) + return gswap(JACC.default_backend(), x) end function gcopytoarray(x::Vector{Any}, y::Vector{Any}) - return gcopytoarray(default_backend(), x, y) + return gcopytoarray(JACC.default_backend(), x, y) end function copytogarray(x::Vector{Any}, y::Vector{Any}) - return copytogarray(default_backend(), x, y) + return copytogarray(JACC.default_backend(), x, y) end function copy(x::Vector{Any}, y::Vector{Any}) - return copy(default_backend(), x, y) + return copy(JACC.default_backend(), x, y) end function parallel_for(N::I, f::F, x...) where {I <: Integer, F <: Function} - return parallel_for(default_backend(), N, f, x...) + return parallel_for(JACC.default_backend(), N, f, x...) end function parallel_for((M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} - return parallel_for(default_backend(), (M, N), f, x...) + return parallel_for(JACC.default_backend(), (M, N), f, x...) end function parallel_reduce(N::I, f::F, x...) where {I <: Integer, F <: Function} - return parallel_reduce(default_backend(), N, f, x...) + return parallel_reduce(JACC.default_backend(), N, f, x...) end function parallel_reduce((M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} - return parallel_reduce(default_backend(), (M, N), f, x...) 
end end # module multi From bdfd909c3666318e32bdfbbee7dc2292132f17f1 Mon Sep 17 00:00:00 2001 From: Philip Fackler Date: Fri, 1 Nov 2024 12:18:29 -0500 Subject: [PATCH 14/14] Moved and refactored most tests as common portable versions --- src/JACC.jl | 4 +- test/runtests.jl | 2 + test/tests_amdgpu.jl | 136 +------------- test/tests_cuda.jl | 307 +----------------------------- test/tests_oneapi.jl | 99 ---------- test/tests_threads.jl | 396 +-------------------------------------- test/unittests.jl | 421 ++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 433 insertions(+), 932 deletions(-) create mode 100644 test/unittests.jl diff --git a/src/JACC.jl b/src/JACC.jl index 78dd67e..b986383 100644 --- a/src/JACC.jl +++ b/src/JACC.jl @@ -5,6 +5,8 @@ import Atomix: @atomic # module to set backend preferences include("JACCPreferences.jl") +default_backend() = get_backend(JACCPreferences._backend_dispatchable) + struct ThreadsBackend end include("helper.jl") @@ -97,8 +99,6 @@ struct Array{T, N} end (::Type{Array})(args...; kwargs...) = array_type()(args...; kwargs...) -default_backend() = get_backend(JACCPreferences._backend_dispatchable) - array_type() = array_type(default_backend()) function parallel_for(N::I, f::F, x...) where {I <: Integer, F <: Function} diff --git a/test/runtests.jl b/test/runtests.jl index e9a04e7..8ae0d30 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -25,3 +25,5 @@ elseif backend == "threads" println("Threads backend loaded") include("tests_threads.jl") end + +include("unittests.jl") diff --git a/test/tests_amdgpu.jl b/test/tests_amdgpu.jl index 61a6742..5707691 100644 --- a/test/tests_amdgpu.jl +++ b/test/tests_amdgpu.jl @@ -6,117 +6,20 @@ using Test @test JACC.JACCPreferences.backend == "amdgpu" end -@testset "VectorAddLambda" begin - function f(i, a) - @inbounds a[i] += 5.0 - end - - N = 10 - dims = (N) - a = round.(rand(Float32, dims) * 100) - - a_device = JACC.Array(a) - JACC.parallel_for(N, f, a_device) - - a_expected = a .+ 5.0 - @test Array(a_device)≈a_expected rtol=1e-5 -end - -@testset "AXPY" begin - function axpy(i, alpha, x, y) - @inbounds x[i] += alpha * y[i] - end - - function seq_axpy(N, alpha, x, y) - @inbounds for i in 1:N - x[i] += alpha * y[i] - end - end - - N = 10 - # Generate random vectors x and y of length N for the interval [0, 100] - x = round.(rand(Float32, N) * 100) - y = round.(rand(Float32, N) * 100) - alpha = 2.5 - - x_device = JACC.Array(x) - y_device = JACC.Array(y) - JACC.parallel_for(N, axpy, alpha, x_device, y_device) - - x_expected = x - seq_axpy(N, alpha, x_expected, y) - - @test Array(x_device)≈x_expected rtol=1e-1 -end - -@testset "AtomicCounter" begin - function axpy_counter!(i, alpha, x, y, counter) - @inbounds x[i] += alpha * y[i] - JACC.@atomic counter[1] += 1 - end - - N = Int32(10) - # Generate random vectors x and y of length N for the interval [0, 100] - alpha = 2.5 - - x = JACC.Array(round.(rand(Float32, N) * 100)) - y = JACC.Array(round.(rand(Float32, N) * 100)) - counter = JACC.Array{Int32}([0]) - JACC.parallel_for(N, axpy_counter!, alpha, x, y, counter) - - @test Array(counter)[1] == N -end - -@testset "zeros" begin +@testset "zeros_type" begin N = 10 x = JACC.zeros(Float64, N) @test typeof(x) == AMDGPU.ROCArray{Float64, 1, AMDGPU.Runtime.Mem.HIPBuffer} @test eltype(x) == Float64 - @test zeros(N)≈Array(x) rtol=1e-5 - - function add_one(i, x) - @inbounds x[i] += 1 - end - - JACC.parallel_for(N, add_one, x) - @test ones(N)≈Array(x) rtol=1e-5 end -@testset "ones" begin +@testset "ones_type" begin N = 10 x = 
JACC.ones(Float64, N) @test typeof(x) == AMDGPU.ROCArray{Float64, 1, AMDGPU.Runtime.Mem.HIPBuffer} @test eltype(x) == Float64 - @test ones(N)≈Array(x) rtol=1e-5 - - function minus_one(i, x) - @inbounds x[i] -= 1 - end - - JACC.parallel_for(N, minus_one, x) - @test zeros(N)≈Array(x) rtol=1e-5 end -@testset "shared" begin - N = 100 - alpha = 2.5 - x = JACC.ones(Float64, N) - x_shared = JACC.ones(Float64, N) - y = JACC.ones(Float64, N) - - function scal(i, x, y, alpha) - @inbounds x[i] = y[i] * alpha - end - - function scal_shared(i, x, y, alpha) - y_shared = JACC.shared(y) - @inbounds x[i] = y_shared[i] * alpha - end - - JACC.parallel_for(N, scal, x, y, alpha) - JACC.parallel_for(N, scal_shared, x_shared, y, alpha) - @test x≈x_shared rtol=1e-8 -end #@testset "JACC.BLAS" begin # function seq_axpy(N, alpha, x, y) @@ -231,41 +134,6 @@ end # end -@testset "Add-2D" begin - function add!(i, j, A, B, C) - @inbounds C[i, j] = A[i, j] + B[i, j] - end - - M = 10 - N = 10 - A = JACC.Array(ones(Float32, M, N)) - B = JACC.Array(ones(Float32, M, N)) - C = JACC.Array(zeros(Float32, M, N)) - - JACC.parallel_for((M, N), add!, A, B, C) - - C_expected = Float32(2.0) .* ones(Float32, M, N) - @test Array(C)≈C_expected rtol=1e-5 -end - -@testset "Add-3D" begin - function add!(i, j, k, A, B, C) - @inbounds C[i, j, k] = A[i, j, k] + B[i, j, k] - end - - L = 10 - M = 10 - N = 10 - A = JACC.Array(ones(Float32, L, M, N)) - B = JACC.Array(ones(Float32, L, M, N)) - C = JACC.Array(zeros(Float32, L, M, N)) - - JACC.parallel_for((L, M, N), add!, A, B, C) - - C_expected = Float32(2.0) .* ones(Float32, L, M, N) - @test Array(C)≈C_expected rtol=1e-5 -end - #@testset "JACC.multi" begin # x = ones(1_000) diff --git a/test/tests_cuda.jl b/test/tests_cuda.jl index 0dab756..2e71272 100644 --- a/test/tests_cuda.jl +++ b/test/tests_cuda.jl @@ -6,160 +6,20 @@ using Test @test JACC.JACCPreferences.backend == "cuda" end -@testset "VectorAddLambda" begin - function f(i, a) - @inbounds a[i] += 5.0 - end - - N = 10 - dims = (N) - a = round.(rand(Float32, dims) * 100) - - a_device = JACC.Array(a) - JACC.parallel_for(N, f, a_device) - - a_expected = a .+ 5.0 - @test Array(a_device)≈a_expected rtol=1e-5 -end - -@testset "AXPY" begin - function axpy(i, alpha, x, y) - @inbounds x[i] += alpha * y[i] - end - - function seq_axpy(N, alpha, x, y) - @inbounds for i in 1:N - x[i] += alpha * y[i] - end - end - - N = 10 - # Generate random vectors x and y of length N for the interval [0, 100] - x = round.(rand(Float32, N) * 100) - y = round.(rand(Float32, N) * 100) - alpha = 2.5 - - x_device = JACC.Array(x) - y_device = JACC.Array(y) - JACC.parallel_for(N, axpy, alpha, x_device, y_device) - - x_expected = x - seq_axpy(N, alpha, x_expected, y) - - @test Array(x_device)≈x_expected rtol=1e-1 -end - -@testset "AtomicCounter" begin - function axpy_counter!(i, alpha, x, y, counter) - @inbounds x[i] += alpha * y[i] - JACC.@atomic counter[1] += 1 - end - - N = Int32(10) - # Generate random vectors x and y of length N for the interval [0, 100] - alpha = 2.5 - - x = JACC.Array(round.(rand(Float32, N) * 100)) - y = JACC.Array(round.(rand(Float32, N) * 100)) - counter = JACC.Array{Int32}([0]) - JACC.parallel_for(N, axpy_counter!, alpha, x, y, counter) - - @test Array(counter)[1] == N -end - -@testset "zeros" begin +@testset "zeros_type" begin N = 10 x = JACC.zeros(Float64, N) - @test typeof(x) == CUDA.CuArray{Float64, 1, CUDA.Mem.DeviceBuffer} + @test typeof(x) == CUDA.CuArray{Float64, 1, CUDA.DeviceMemory} @test eltype(x) == Float64 - @test zeros(N)≈Array(x) 
rtol=1e-5 - - function add_one(i, x) - @inbounds x[i] += 1 - end - - JACC.parallel_for(N, add_one, x) - @test ones(N)≈Array(x) rtol=1e-5 end -@testset "ones" begin +@testset "ones_type" begin N = 10 x = JACC.ones(Float64, N) - @test typeof(x) == CUDA.CuArray{Float64, 1, CUDA.Mem.DeviceBuffer} + @test typeof(x) == CUDA.CuArray{Float64, 1, CUDA.DeviceMemory} @test eltype(x) == Float64 - @test ones(N)≈Array(x) rtol=1e-5 - - function minus_one(i, x) - @inbounds x[i] -= 1 - end - - JACC.parallel_for(N, minus_one, x) - @test zeros(N)≈Array(x) rtol=1e-5 end -# @testset "CG" begin - -# function matvecmul(i, a1, a2, a3, x, y, SIZE) -# if i == 1 -# y[i] = a2[i] * x[i] + a1[i] * x[i+1] -# elseif i == SIZE -# y[i] = a3[i] * x[i-1] + a2[i] * x[i] -# elseif i > 1 && i < SIZE -# y[i] = a3[i] * x[i-1] + a1[i] * +x[i] + a1[i] * +x[i+1] -# end -# end - -# function dot(i, x, y) -# @inbounds return x[i] * y[i] -# end - -# function axpy(i, alpha, x, y) -# @inbounds x[i] += alpha[1, 1] * y[i] -# end - -# SIZE = 10 -# a0 = JACC.ones(Float64, SIZE) -# a1 = JACC.ones(Float64, SIZE) -# a2 = JACC.ones(Float64, SIZE) -# r = JACC.ones(Float64, SIZE) -# p = JACC.ones(Float64, SIZE) -# s = JACC.zeros(Float64, SIZE) -# x = JACC.zeros(Float64, SIZE) -# r_old = JACC.zeros(Float64, SIZE) -# r_aux = JACC.zeros(Float64, SIZE) -# a1 = a1 * 4 -# r = r * 0.5 -# p = p * 0.5 -# global cond = one(Float64) - -# while cond[1, 1] >= 1e-14 - -# r_old = copy(r) - -# JACC.parallel_for(SIZE, matvecmul, a0, a1, a2, p, s, SIZE) - -# alpha0 = JACC.parallel_reduce(SIZE, dot, r, r) -# alpha1 = JACC.parallel_reduce(SIZE, dot, p, s) - -# alpha = alpha0 / alpha1 -# negative_alpha = alpha * (-1.0) - -# JACC.parallel_for(SIZE, axpy, negative_alpha, r, s) -# JACC.parallel_for(SIZE, axpy, alpha, x, p) - -# beta0 = JACC.parallel_reduce(SIZE, dot, r, r) -# beta1 = JACC.parallel_reduce(SIZE, dot, r_old, r_old) -# beta = beta0 / beta1 - -# r_aux = copy(r) - -# JACC.parallel_for(SIZE, axpy, beta, r_aux, p) -# ccond = JACC.parallel_reduce(SIZE, dot, r, r) -# global cond = ccond -# p = copy(r_aux) -# end -# @test cond[1, 1] <= 1e-14 -# end # @testset "LBM" begin @@ -274,26 +134,6 @@ end # end -@testset "shared" begin - N = 100 - alpha = 2.5 - x = JACC.ones(Float64, N) - x_shared = JACC.ones(Float64, N) - y = JACC.ones(Float64, N) - - function scal(i, x, y, alpha) - @inbounds x[i] = y[i] * alpha - end - - function scal_shared(i, x, y, alpha) - y_shared = JACC.shared(y) - @inbounds x[i] = y_shared[i] * alpha - end - - JACC.parallel_for(N, scal, x, y, alpha) - JACC.parallel_for(N, scal_shared, x_shared, y, alpha) - @test x≈x_shared rtol=1e-8 -end # @testset VectorAddLoop begin # N = 1024 @@ -332,145 +172,6 @@ end # end # end -@testset "JACC.BLAS" begin - - x = ones(1_000) - y = ones(1_000) - y1 = y*2 - jx = JACC.ones(1_000) - jy = JACC.ones(1_000) - jy1 = jy*2 - alpha = 2.0 - - function seq_axpy(N, alpha, x, y) - for i in 1:N - @inbounds x[i] += alpha * y[i] - end - end - - - function seq_dot(N, x, y) - r = 0.0 - for i in 1:N - @inbounds r += x[i] * y[i] - end - return r - end - - x = ones(1_000) - y = ones(1_000) - jx = JACC.ones(1_000) - jy = JACC.ones(1_000) - alpha = 2.0 - - seq_axpy(1_000, alpha, x, y) - ref_result = seq_dot(1_000, x, y) - - JACC.BLAS.axpy(1_000, alpha, jx, jy) - jresult = JACC.BLAS.dot(1_000, jx, jy) - result = Array(jresult) - - @test result[1]≈ref_result rtol=1e-8 -end - -@testset "Add-2D" begin - function add!(i, j, A, B, C) - @inbounds C[i, j] = A[i, j] + B[i, j] - end - - M = 10 - N = 10 - A = JACC.Array(ones(Float32, M, N)) - B = 
JACC.Array(ones(Float32, M, N)) - C = JACC.Array(zeros(Float32, M, N)) - - JACC.parallel_for((M, N), add!, A, B, C) - - C_expected = Float32(2.0) .* ones(Float32, M, N) - @test Array(C)≈C_expected rtol=1e-5 -end - -@testset "Add-3D" begin - function add!(i, j, k, A, B, C) - @inbounds C[i, j, k] = A[i, j, k] + B[i, j, k] - end - - L = 10 - M = 10 - N = 10 - A = JACC.Array(ones(Float32, L, M, N)) - B = JACC.Array(ones(Float32, L, M, N)) - C = JACC.Array(zeros(Float32, L, M, N)) - - JACC.parallel_for((L, M, N), add!, A, B, C) - - C_expected = Float32(2.0) .* ones(Float32, L, M, N) - @test Array(C)≈C_expected rtol=1e-5 - - function seq_scal(N, alpha, x) - for i in 1:N - @inbounds x[i] = alpha * x[i] - end - end - - function seq_asum(N, x) - r = 0.0 - for i in 1:N - @inbounds r += abs(x[i]) - end - return r - end - - function seq_nrm2(N, x) - sum_sq = 0.0 - for i in 1:N - @inbounds sum_sq += x[i]*x[i] - end - r = sqrt(sum_sq) - return r - end - - function seq_swap(N, x, y1) - for i in 1:N - @inbounds t = x[i] - @inbounds x[i] = y1[i] - @inbounds y1[i] = t - end - end - - # Comparing JACC.BLAS with regular seuential functions - # seq_axpy(1_000, alpha, x, y) - # ref_result = seq_dot(1_000, x, y) - # JACC.BLAS.axpy(1_000, alpha, jx, jy) - # jresult = JACC.BLAS.dot(1_000, jx, jy) - # result = Array(jresult) - # @test result[1]≈ref_result rtol=1e-8 - - - # seq_scal(1_000, alpha, x) - # JACC.BLAS.scal(1_000, alpha, jx) - # @test x≈Array(jx) atol=1e-8 - - # seq_axpy(1_000, alpha, x, y) - # JACC.BLAS.axpy(1_000, alpha, jx, jy) - # @test x≈Array(jx) atol=1e-8 - - # r1 = seq_dot(1_000, x, y) - # r2 = JACC.BLAS.dot(1_000, jx, jy) - # @test r1≈Array(r2)[1] atol=1e-8 - - # r1 = seq_asum(1_000, x) - # r2 = JACC.BLAS.asum(1_000, jx) - # r1 = seq_nrm2(1_000, x) - # r2 = JACC.BLAS.nrm2(1_000, jx) - # @test r1≈Array(r2)[1] atol=1e-8 - - #seq_swap(1_000, x, y1) - #JACC.BLAS.swap(1_000, jx, jy1) - #@test x == Array(jx) - #@test y1 == Array(jy1) - -end #@testset "JACC.multi" begin diff --git a/test/tests_oneapi.jl b/test/tests_oneapi.jl index 2ea6767..07a2f0d 100644 --- a/test/tests_oneapi.jl +++ b/test/tests_oneapi.jl @@ -5,102 +5,3 @@ using Test @testset "TestBackend" begin @test JACC.JACCPreferences.backend == "oneapi" end - -@testset "VectorAddLambda" begin - function f(i, a) - @inbounds a[i] += 5.0 - end - - N = 10 - dims = (N) - a = round.(rand(Float32, dims) * 100) - - a_device = JACC.Array(a) - JACC.parallel_for(N, f, a_device) - - a_expected = a .+ 5.0 - @test Array(a_device)≈a_expected rtol=1e-5 -end - -@testset "AXPY" begin - function axpy(i, alpha, x, y) - @inbounds x[i] += alpha * y[i] - end - - function seq_axpy(N, alpha, x, y) - @inbounds for i in 1:N - x[i] += alpha * y[i] - end - end - - N = 10 - # Generate random vectors x and y of length N for the interval [0, 100] - x = round.(rand(Float32, N) * 100) - y = round.(rand(Float32, N) * 100) - alpha::Float32 = 2.5 - - x_device = JACC.Array(x) - y_device = JACC.Array(y) - JACC.parallel_for(N, axpy, alpha, x_device, y_device) - - x_expected = x - seq_axpy(N, alpha, x_expected, y) - - @test Array(x_device)≈x_expected rtol=1e-1 -end - -@testset "shared" begin - N::Int32 = 100 - alpha::Float32 = 2.5 - x = JACC.ones(Float32, N) - x_shared = JACC.ones(Float32, N) - y = JACC.ones(Float32, N) - - function scal(i, x, y, alpha) - @inbounds x[i] = y[i] * alpha - end - - function scal_shared(i, x, y, alpha) - y_shared = JACC.shared(y) - @inbounds x[i] = y_shared[i] * alpha - end - - JACC.parallel_for(N, scal, x, y, alpha) - JACC.parallel_for(N, scal_shared, 
x_shared, y, alpha) - @test x≈x_shared rtol=1e-8 -end - -#@testset "JACC.BLAS" begin - -# function seq_axpy(N, alpha, x, y) -# for i in 1:N -# @inbounds x[i] += alpha * y[i] -# end -# end - -# function seq_dot(N, x, y) -# r = 0.0 -# for i in 1:N -# @inbounds r += x[i] * y[i] -# end -# return r -# end - -# SIZE = Int32(1_000) -# x = ones(Float32, SIZE) -# y = ones(Float32, SIZE) -# jx = JACC.ones(Float32, SIZE) -# jy = JACC.ones(Float32, SIZE) -# alpha = Float32(2.0) - -# seq_axpy(SIZE, alpha, x, y) -# ref_result = seq_dot(SIZE, x, y) - -# JACC.BLAS.axpy(SIZE, alpha, jx, jy) -# jresult = JACC.BLAS.dot(SIZE, jx, jy) -# result = Array(jresult) - -# @test result[1]≈ref_result rtol=1e-8 - -#end - diff --git a/test/tests_threads.jl b/test/tests_threads.jl index 99cf13b..2fd35be 100644 --- a/test/tests_threads.jl +++ b/test/tests_threads.jl @@ -5,408 +5,16 @@ using Test @test JACC.JACCPreferences.backend == "threads" end -@testset "VectorAddLambda" begin - function f(x, a) - @inbounds a[x] += 5.0 - end - - dims = (10) - a = round.(rand(Float32, dims) * 100) - a_expected = a .+ 5.0 - - JACC.parallel_for(10, f, a) - - @test a≈a_expected rtol=1e-5 -end - -@testset "AXPY" begin - function seq_axpy(N, alpha, x, y) - Threads.@threads for i in 1:N - @inbounds x[i] += alpha * y[i] - end - end - - function axpy(i, alpha, x, y) - if i <= length(x) - @inbounds x[i] += alpha * y[i] - end - end - - N = 10 - # Generate random vectors x and y of length N for the interval [0, 100] - x = round.(rand(Float32, N) * 100) - y = round.(rand(Float32, N) * 100) - alpha = 2.5 - - x_host_JACC = JACC.Array(x) - y_host_JACC = JACC.Array(y) - JACC.parallel_for(N, axpy, alpha, x_host_JACC, y_host_JACC) - - x_expected = x - seq_axpy(N, alpha, x_expected, y) - - @test x_host_JACC≈x_expected rtol=1e-1 -end - -@testset "AtomicCounter" begin - function axpy_counter!(i, alpha, x, y, counter) - @inbounds x[i] += alpha * y[i] - JACC.@atomic counter[1] += 1 - end - - N = Int32(10) - # Generate random vectors x and y of length N for the interval [0, 100] - alpha = 2.5 - counter = zeros(Int32, 1) - - x_device = JACC.Array(round.(rand(Float32, N) * 100)) - y_device = JACC.Array(round.(rand(Float32, N) * 100)) - counter = JACC.Array{Int32}([0]) - JACC.parallel_for(N, axpy_counter!, alpha, x_device, y_device, counter) - - @test counter[1] == N -end - -@testset "zeros" begin +@testset "zeros_type" begin N = 10 x = JACC.zeros(Float32, N) @test typeof(x) == Vector{Float32} @test eltype(x) == Float32 - @test zeros(N)≈x rtol=1e-5 - - function add_one(i, x) - @inbounds x[i] += 1 - end - - JACC.parallel_for(N, add_one, x) - @test ones(N)≈x rtol=1e-5 end -@testset "ones" begin +@testset "ones_type" begin N = 10 x = JACC.ones(Float64, N) @test typeof(x) == Vector{Float64} @test eltype(x) == Float64 - @test ones(N)≈x rtol=1e-5 - - function minus_one(i, x) - @inbounds x[i] -= 1 - end - - JACC.parallel_for(N, minus_one, x) - @test zeros(N)≈x rtol=1e-5 -end - -@testset "CG" begin - function matvecmul(i, a1, a2, a3, x, y, SIZE) - if i == 1 - y[i] = a2[i] * x[i] + a1[i] * x[i + 1] - elseif i == SIZE - y[i] = a3[i] * x[i - 1] + a2[i] * x[i] - elseif i > 1 && i < SIZE - y[i] = a3[i] * x[i - 1] + a1[i] * +x[i] + a1[i] * +x[i + 1] - end - end - - function dot(i, x, y) - @inbounds return x[i] * y[i] - end - - function axpy(i, alpha, x, y) - @inbounds x[i] += alpha[1, 1] * y[i] - end - - SIZE = 10 - a0 = JACC.ones(Float64, SIZE) - a1 = JACC.ones(Float64, SIZE) - a2 = JACC.ones(Float64, SIZE) - r = JACC.ones(Float64, SIZE) - p = JACC.ones(Float64, SIZE) - s = 
JACC.zeros(Float64, SIZE) - x = JACC.zeros(Float64, SIZE) - r_old = JACC.zeros(Float64, SIZE) - r_aux = JACC.zeros(Float64, SIZE) - a1 = a1 * 4 - r = r * 0.5 - p = p * 0.5 - global cond = one(Float64) - - while cond[1, 1] >= 1e-14 - r_old = copy(r) - - JACC.parallel_for(SIZE, matvecmul, a0, a1, a2, p, s, SIZE) - - alpha0 = JACC.parallel_reduce(SIZE, dot, r, r) - alpha1 = JACC.parallel_reduce(SIZE, dot, p, s) - - alpha = alpha0 / alpha1 - negative_alpha = alpha * (-1.0) - - JACC.parallel_for(SIZE, axpy, negative_alpha, r, s) - JACC.parallel_for(SIZE, axpy, alpha, x, p) - - beta0 = JACC.parallel_reduce(SIZE, dot, r, r) - beta1 = JACC.parallel_reduce(SIZE, dot, r_old, r_old) - beta = beta0 / beta1 - - r_aux = copy(r) - - JACC.parallel_for(SIZE, axpy, beta, r_aux, p) - ccond = JACC.parallel_reduce(SIZE, dot, r, r) - global cond = ccond - p = copy(r_aux) - end - @test cond[1, 1] <= 1e-14 -end - -@testset "LBM" begin - function lbm_kernel(x, y, f, f1, f2, t, w, cx, cy, SIZE) - u = 0.0 - v = 0.0 - p = 0.0 - x_stream = 0 - y_stream = 0 - - if x > 1 && x < SIZE && y > 1 && y < SIZE - for k in 1:9 - @inbounds x_stream = x - cx[k] - @inbounds y_stream = y - cy[k] - ind = (k - 1) * SIZE * SIZE + x * SIZE + y - iind = (k - 1) * SIZE * SIZE + x_stream * SIZE + y_stream - @inbounds f[trunc(Int, ind)] = f1[trunc(Int, iind)] - end - for k in 1:9 - ind = (k - 1) * SIZE * SIZE + x * SIZE + y - @inbounds p = p[1, 1] + f[ind] - @inbounds u = u[1, 1] + f[ind] * cx[k] - @inbounds v = v[1, 1] + f[ind] * cy[k] - end - u = u / p - v = v / p - for k in 1:9 - @inbounds cu = cx[k] * u + cy[k] * v - @inbounds feq = w[k] * p * - (1.0 + 3.0 * cu + cu * cu - - 1.5 * ((u * u) + (v * v))) - ind = (k - 1) * SIZE * SIZE + x * SIZE + y - @inbounds f2[trunc(Int, ind)] = f[trunc(Int, ind)] * - (1.0 - 1.0 / t) + feq * 1 / t - end - end - end - - function lbm_threads(f, f1, f2, t, w, cx, cy, SIZE) - Threads.@sync Threads.@threads for x in 1:SIZE - for y in 1:SIZE - u = 0.0 - v = 0.0 - p = 0.0 - x_stream = 0 - y_stream = 0 - - if x > 1 && x < SIZE && y > 1 && y < SIZE - for k in 1:9 - @inbounds x_stream = x - cx[k] - @inbounds y_stream = y - cy[k] - ind = (k - 1) * SIZE * SIZE + x * SIZE + y - iind = (k - 1) * SIZE * SIZE + x_stream * SIZE + - y_stream - @inbounds f[trunc(Int, ind)] = f1[trunc(Int, iind)] - end - for k in 1:9 - ind = (k - 1) * SIZE * SIZE + x * SIZE + y - @inbounds p = p[1, 1] + f[ind] - @inbounds u = u[1, 1] + f[ind] * cx[k] - @inbounds v = v[1, 1] + f[ind] * cy[k] - end - u = u / p - v = v / p - for k in 1:9 - @inbounds cu = cx[k] * u + cy[k] * v - @inbounds feq = w[k] * p * - (1.0 + 3.0 * cu + cu * cu - - 1.5 * ((u * u) + (v * v))) - ind = (k - 1) * SIZE * SIZE + x * SIZE + y - @inbounds f2[trunc(Int, ind)] = f[trunc(Int, ind)] * - (1.0 - 1.0 / t) + - feq * 1 / t - end - end - end - end - end - - SIZE = 10 - f = ones(SIZE * SIZE * 9) .* 2.0 - f1 = ones(SIZE * SIZE * 9) .* 3.0 - f2 = ones(SIZE * SIZE * 9) .* 4.0 - cx = zeros(9) - cy = zeros(9) - cx[1] = 0 - cy[1] = 0 - cx[2] = 1 - cy[2] = 0 - cx[3] = -1 - cy[3] = 0 - cx[4] = 0 - cy[4] = 1 - cx[5] = 0 - cy[5] = -1 - cx[6] = 1 - cy[6] = 1 - cx[7] = -1 - cy[7] = 1 - cx[8] = -1 - cy[8] = -1 - cx[9] = 1 - cy[9] = -1 - w = ones(9) - t = 1.0 - - df = JACC.Array(f) - df1 = JACC.Array(f1) - df2 = JACC.Array(f2) - dcx = JACC.Array(cx) - dcy = JACC.Array(cy) - dw = JACC.Array(w) - - JACC.parallel_for( - (SIZE, SIZE), lbm_kernel, df, df1, df2, t, dw, dcx, dcy, SIZE) - - lbm_threads(f, f1, f2, t, w, cx, cy, SIZE) - - @test f2≈df2 rtol=1e-1 -end - -@testset "JACC.BLAS" begin - 
x = ones(1_000) - y = ones(1_000) - y1 = y*2 - jx = JACC.ones(1_000) - jy = JACC.ones(1_000) - jy1 = jy*2 - alpha = 2.0 - - function seq_axpy(N, alpha, x, y) - for i in 1:N - @inbounds x[i] += alpha * y[i] - end - end - - function seq_dot(N, x, y) - r = 0.0 - for i in 1:N - @inbounds r += x[i] * y[i] - end - return r - end - - seq_axpy(1_000, alpha, x, y) - ref_result = seq_dot(1_000, x, y) - - JACC.BLAS.axpy(1_000, alpha, jx, jy) - jresult = JACC.BLAS.dot(1_000, jx, jy) - result = jresult[1] - - @test result≈ref_result rtol=1e-8 -end - -# 2D -@testset "Add-2D" begin - function add!(i, j, A, B, C) - @inbounds C[i, j] = A[i, j] + B[i, j] - end - - M = 10 - N = 10 - A = JACC.Array(ones(Float32, M, N)) - B = JACC.Array(ones(Float32, M, N)) - C = JACC.Array(zeros(Float32, M, N)) - - JACC.parallel_for((M, N), add!, A, B, C) - - C_expected = Float32(2.0) .* ones(Float32, M, N) - @test C≈C_expected rtol=1e-5 -end - -@testset "Add-3D" begin - function add!(i, j, k, A, B, C) - @inbounds C[i, j, k] = A[i, j, k] + B[i, j, k] - end - - L = 10 - M = 10 - N = 10 - A = JACC.Array(ones(Float32, L, M, N)) - B = JACC.Array(ones(Float32, L, M, N)) - C = JACC.Array(zeros(Float32, L, M, N)) - - for i in 1:L - for j in 1:M - for k in 1:N - C[i, j, k] = A[i, j, k] + B[i, j, k] - end - end - end - - JACC.parallel_for((L, M, N), add!, A, B, C) - - C_expected = Float32(2.0) .* ones(Float32, L, M, N) - @test C≈C_expected rtol=1e-5 - - function seq_scal(N, alpha, x) - for i in 1:N - @inbounds x[i] = alpha * x[i] - end - end - - function seq_asum(N, x) - r = 0.0 - for i in 1:N - @inbounds r += abs(x[i]) - end - return r - end - - function seq_nrm2(N, x) - sum_sq = 0.0 - for i in 1:N - @inbounds sum_sq += x[i]*x[i] - end - r = sqrt(sum_sq) - return r - end - - function seq_swap(N, x, y1) - for i in 1:N - @inbounds t = x[i] - @inbounds x[i] = y1[i] - @inbounds y1[i] = t - end - end - - # Comparing JACC.BLAS with regular seuential functions - - # seq_scal(1_000, alpha, x) - # JACC.BLAS.scal(1_000, alpha, jx) - # @test x≈jx atol=1e-8 - - # seq_axpy(1_000, alpha, x, y) - # JACC.BLAS.axpy(1_000, alpha, jx, jy) - # @test x≈jx atol=1e-8 - - # r1 = seq_dot(1_000, x, y) - # r2 = JACC.BLAS.dot(1_000, jx, jy) - # @test r1≈r2 atol=1e-8 - - # r1 = seq_asum(1_000, x) - # r2 = JACC.BLAS.asum(1_000, jx) - # r1 = seq_nrm2(1_000, x) - # r2 = JACC.BLAS.nrm2(1_000, jx) - # @test r1≈r2[1] atol=1e-8 - - # seq_swap(1_000, x, y1) - # JACC.BLAS.swap(1_000, jx, jy1) - # @test x == jx - # @test y1 == jy1 end diff --git a/test/unittests.jl b/test/unittests.jl new file mode 100644 index 0000000..02fc51a --- /dev/null +++ b/test/unittests.jl @@ -0,0 +1,421 @@ +import JACC +using Test + +@testset "VectorAddLambda" begin + function f(i, a) + @inbounds a[i] += 5.0 + end + + N = 10 + dims = (N) + a = round.(rand(Float32, dims) * 100) + + a_device = JACC.Array(a) + JACC.parallel_for(N, f, a_device) + + a_expected = a .+ 5.0 + @test Array(a_device)≈a_expected rtol=1e-5 +end + +@testset "AXPY" begin + function axpy(i, alpha, x, y) + @inbounds x[i] += alpha * y[i] + end + + function seq_axpy(N, alpha, x, y) + @inbounds for i in 1:N + x[i] += alpha * y[i] + end + end + + N = 10 + # Generate random vectors x and y of length N for the interval [0, 100] + x = round.(rand(Float32, N) * 100) + y = round.(rand(Float32, N) * 100) + alpha = 2.5 + + x_device = JACC.Array(x) + y_device = JACC.Array(y) + JACC.parallel_for(N, axpy, alpha, x_device, y_device) + + x_expected = x + seq_axpy(N, alpha, x_expected, y) + + @test Array(x_device)≈x_expected rtol=1e-1 +end + 
+@testset "zeros" begin + N = 10 + x = JACC.zeros(Float64, N) + @test eltype(x) == Float64 + @test zeros(N)≈Array(x) rtol=1e-5 + + function add_one(i, x) + @inbounds x[i] += 1 + end + + JACC.parallel_for(N, add_one, x) + @test ones(N) ≈ Core.Array(x) rtol=1e-5 +end + +@testset "ones" begin + N = 10 + x = JACC.ones(Float64, N) + @test eltype(x) == Float64 + @test ones(N)≈Array(x) rtol=1e-5 + + function minus_one(i, x) + @inbounds x[i] -= 1 + end + + JACC.parallel_for(N, minus_one, x) + @test zeros(N) ≈ Array(x) rtol=1e-5 +end + +@testset "AtomicCounter" begin + function axpy_counter!(i, alpha, x, y, counter) + @inbounds x[i] += alpha * y[i] + JACC.@atomic counter[1] += 1 + end + + N = Int32(10) + # Generate random vectors x and y of length N for the interval [0, 100] + alpha = 2.5 + + x = JACC.Array(round.(rand(Float32, N) * 100)) + y = JACC.Array(round.(rand(Float32, N) * 100)) + counter = JACC.Array{Int32}([0]) + JACC.parallel_for(N, axpy_counter!, alpha, x, y, counter) + + @test Array(counter)[1] == N +end + +@testset "shared" begin + N = 100 + alpha = 2.5 + x = JACC.ones(Float64, N) + x_shared = JACC.ones(Float64, N) + y = JACC.ones(Float64, N) + + function scal(i, x, y, alpha) + @inbounds x[i] = y[i] * alpha + end + + function scal_shared(i, x, y, alpha) + y_shared = JACC.shared(y) + @inbounds x[i] = y_shared[i] * alpha + end + + JACC.parallel_for(N, scal, x, y, alpha) + JACC.parallel_for(N, scal_shared, x_shared, y, alpha) + @test x≈x_shared rtol=1e-8 +end + +@testset "JACC.BLAS" begin + x = ones(1_000) + y = ones(1_000) + jx = JACC.ones(1_000) + jy = JACC.ones(1_000) + alpha = 2.0 + + function seq_axpy(N, alpha, x, y) + for i in 1:N + @inbounds x[i] += alpha * y[i] + end + end + + function seq_dot(N, x, y) + r = 0.0 + for i in 1:N + @inbounds r += x[i] * y[i] + end + return r + end + + seq_axpy(1_000, alpha, x, y) + ref_result = seq_dot(1_000, x, y) + + JACC.BLAS.axpy(1_000, alpha, jx, jy) + jresult = JACC.BLAS.dot(1_000, jx, jy) + + @test Array(jresult)[1]≈ref_result rtol=1e-8 +end + +@testset "Add-2D" begin + function add!(i, j, A, B, C) + @inbounds C[i, j] = A[i, j] + B[i, j] + end + + M = 10 + N = 10 + A = JACC.ones(Float32, M, N) + B = JACC.ones(Float32, M, N) + C = JACC.zeros(Float32, M, N) + + JACC.parallel_for((M, N), add!, A, B, C) + + C_expected = Float32(2.0) .* ones(Float32, M, N) + @test Array(C)≈C_expected rtol=1e-5 +end + +@testset "Add-3D" begin + function add!(i, j, k, A, B, C) + @inbounds C[i, j, k] = A[i, j, k] + B[i, j, k] + end + + L = 10 + M = 10 + N = 10 + A = JACC.ones(Float32, L, M, N) + B = JACC.ones(Float32, L, M, N) + C = JACC.zeros(Float32, L, M, N) + + JACC.parallel_for((L, M, N), add!, A, B, C) + + C_expected = Float32(2.0) .* ones(Float32, L, M, N) + @test Array(C)≈C_expected rtol=1e-5 + + function seq_scal(N, alpha, x) + for i in 1:N + @inbounds x[i] = alpha * x[i] + end + end + + function seq_asum(N, x) + r = 0.0 + for i in 1:N + @inbounds r += abs(x[i]) + end + return r + end + + function seq_nrm2(N, x) + sum_sq = 0.0 + for i in 1:N + @inbounds sum_sq += x[i]*x[i] + end + r = sqrt(sum_sq) + return r + end + + function seq_swap(N, x, y1) + for i in 1:N + @inbounds t = x[i] + @inbounds x[i] = y1[i] + @inbounds y1[i] = t + end + end + + # Comparing JACC.BLAS with regular seuential functions + # seq_axpy(1_000, alpha, x, y) + # ref_result = seq_dot(1_000, x, y) + # JACC.BLAS.axpy(1_000, alpha, jx, jy) + # jresult = JACC.BLAS.dot(1_000, jx, jy) + # result = Array(jresult) + # @test result[1]≈ref_result rtol=1e-8 + + + # seq_scal(1_000, alpha, x) + # 
JACC.BLAS.scal(1_000, alpha, jx) + # @test x≈Array(jx) atol=1e-8 + + # seq_axpy(1_000, alpha, x, y) + # JACC.BLAS.axpy(1_000, alpha, jx, jy) + # @test x≈Array(jx) atol=1e-8 + + # r1 = seq_dot(1_000, x, y) + # r2 = JACC.BLAS.dot(1_000, jx, jy) + # @test r1≈Array(r2)[1] atol=1e-8 + + # r1 = seq_asum(1_000, x) + # r2 = JACC.BLAS.asum(1_000, jx) + # r1 = seq_nrm2(1_000, x) + # r2 = JACC.BLAS.nrm2(1_000, jx) + # @test r1≈Array(r2)[1] atol=1e-8 + + #seq_swap(1_000, x, y1) + #JACC.BLAS.swap(1_000, jx, jy1) + #@test x == Array(jx) + #@test y1 == Array(jy1) +end + +@testset "CG" begin + function matvecmul(i, a1, a2, a3, x, y, SIZE) + if i == 1 + y[i] = a2[i] * x[i] + a1[i] * x[i + 1] + elseif i == SIZE + y[i] = a3[i] * x[i - 1] + a2[i] * x[i] + elseif i > 1 && i < SIZE + y[i] = a3[i] * x[i - 1] + a1[i] * +x[i] + a1[i] * +x[i + 1] + end + end + + function dot(i, x, y) + @inbounds return x[i] * y[i] + end + + function axpy(i, alpha, x, y) + @inbounds x[i] += alpha[1, 1] * y[i] + end + + SIZE = 10 + a0 = JACC.ones(Float64, SIZE) + a1 = JACC.ones(Float64, SIZE) + a2 = JACC.ones(Float64, SIZE) + r = JACC.ones(Float64, SIZE) + p = JACC.ones(Float64, SIZE) + s = JACC.zeros(Float64, SIZE) + x = JACC.zeros(Float64, SIZE) + r_old = JACC.zeros(Float64, SIZE) + r_aux = JACC.zeros(Float64, SIZE) + a1 = a1 * 4 + r = r * 0.5 + p = p * 0.5 + global cond = one(Float64) + + while cond[1, 1] >= 1e-14 + r_old = copy(r) + + JACC.parallel_for(SIZE, matvecmul, a0, a1, a2, p, s, SIZE) + + alpha0 = JACC.parallel_reduce(SIZE, dot, r, r) + alpha1 = JACC.parallel_reduce(SIZE, dot, p, s) + + alpha = alpha0 / alpha1 + negative_alpha = alpha * (-1.0) + + JACC.parallel_for(SIZE, axpy, negative_alpha, r, s) + JACC.parallel_for(SIZE, axpy, alpha, x, p) + + beta0 = JACC.parallel_reduce(SIZE, dot, r, r) + beta1 = JACC.parallel_reduce(SIZE, dot, r_old, r_old) + beta = beta0 / beta1 + + r_aux = copy(r) + + JACC.parallel_for(SIZE, axpy, beta, r_aux, p) + ccond = JACC.parallel_reduce(SIZE, dot, r, r) + global cond = Array(ccond) + p = copy(r_aux) + end + @test cond[1, 1] <= 1e-14 +end + +@testset "LBM" begin + function lbm_kernel(x, y, f, f1, f2, t, w, cx, cy, SIZE) + u = 0.0 + v = 0.0 + p = 0.0 + x_stream = 0 + y_stream = 0 + + if x > 1 && x < SIZE && y > 1 && y < SIZE + for k in 1:9 + @inbounds x_stream = x - cx[k] + @inbounds y_stream = y - cy[k] + ind = (k - 1) * SIZE * SIZE + x * SIZE + y + iind = (k - 1) * SIZE * SIZE + x_stream * SIZE + y_stream + @inbounds f[trunc(Int, ind)] = f1[trunc(Int, iind)] + end + for k in 1:9 + ind = (k - 1) * SIZE * SIZE + x * SIZE + y + @inbounds p = p[1, 1] + f[ind] + @inbounds u = u[1, 1] + f[ind] * cx[k] + @inbounds v = v[1, 1] + f[ind] * cy[k] + end + u = u / p + v = v / p + for k in 1:9 + @inbounds cu = cx[k] * u + cy[k] * v + @inbounds feq = w[k] * p * + (1.0 + 3.0 * cu + cu * cu - + 1.5 * ((u * u) + (v * v))) + ind = (k - 1) * SIZE * SIZE + x * SIZE + y + @inbounds f2[trunc(Int, ind)] = f[trunc(Int, ind)] * + (1.0 - 1.0 / t) + feq * 1 / t + end + end + end + + function lbm_threads(f, f1, f2, t, w, cx, cy, SIZE) + Threads.@sync Threads.@threads for x in 1:SIZE + for y in 1:SIZE + u = 0.0 + v = 0.0 + p = 0.0 + x_stream = 0 + y_stream = 0 + + if x > 1 && x < SIZE && y > 1 && y < SIZE + for k in 1:9 + @inbounds x_stream = x - cx[k] + @inbounds y_stream = y - cy[k] + ind = (k - 1) * SIZE * SIZE + x * SIZE + y + iind = (k - 1) * SIZE * SIZE + x_stream * SIZE + + y_stream + @inbounds f[trunc(Int, ind)] = f1[trunc(Int, iind)] + end + for k in 1:9 + ind = (k - 1) * SIZE * SIZE + x * SIZE + y + @inbounds 
p = p[1, 1] + f[ind] + @inbounds u = u[1, 1] + f[ind] * cx[k] + @inbounds v = v[1, 1] + f[ind] * cy[k] + end + u = u / p + v = v / p + for k in 1:9 + @inbounds cu = cx[k] * u + cy[k] * v + @inbounds feq = w[k] * p * + (1.0 + 3.0 * cu + cu * cu - + 1.5 * ((u * u) + (v * v))) + ind = (k - 1) * SIZE * SIZE + x * SIZE + y + @inbounds f2[trunc(Int, ind)] = f[trunc(Int, ind)] * + (1.0 - 1.0 / t) + + feq * 1 / t + end + end + end + end + end + + SIZE = 10 + f = ones(SIZE * SIZE * 9) .* 2.0 + f1 = ones(SIZE * SIZE * 9) .* 3.0 + f2 = ones(SIZE * SIZE * 9) .* 4.0 + cx = zeros(9) + cy = zeros(9) + cx[1] = 0 + cy[1] = 0 + cx[2] = 1 + cy[2] = 0 + cx[3] = -1 + cy[3] = 0 + cx[4] = 0 + cy[4] = 1 + cx[5] = 0 + cy[5] = -1 + cx[6] = 1 + cy[6] = 1 + cx[7] = -1 + cy[7] = 1 + cx[8] = -1 + cy[8] = -1 + cx[9] = 1 + cy[9] = -1 + w = ones(9) + t = 1.0 + + df = JACC.Array(f) + df1 = JACC.Array(f1) + df2 = JACC.Array(f2) + dcx = JACC.Array(cx) + dcy = JACC.Array(cy) + dw = JACC.Array(w) + + JACC.parallel_for( + (SIZE, SIZE), lbm_kernel, df, df1, df2, t, dw, dcx, dcy, SIZE) + + lbm_threads(f, f1, f2, t, w, cx, cy, SIZE) + + @test f2≈Array(df2) rtol=1e-1 +end
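
For reference, here is a minimal host-side sketch of how the multi-GPU ghost-array helpers added in this series (JACC.multi.gArray, gswap, gcopytoarray, copytogarray) might be used together. This is only an illustration of the intended call sequence, not part of the patch: it assumes the "amdgpu" backend is selected and at least two devices are visible, and the names u, u_dist and u_ghost are made up for the example.

    import JACC

    u = ones(Float64, 1_000)               # host data to be partitioned

    u_dist  = JACC.multi.Array(u)          # split across all visible devices
    u_ghost = JACC.multi.gArray(u)         # split with one-element halos between neighbouring devices

    # Exchange the halo cells between neighbouring devices, then copy the
    # ghosted partitions back into the plain partitioned array.
    JACC.multi.gswap(u_ghost)
    JACC.multi.gcopytoarray(u_dist, u_ghost)   # first argument: array, second: ghost array

    # Refill the ghosted copy from the plain array before the next exchange.
    JACC.multi.copytogarray(u_ghost, u_dist)   # first argument: ghost array, second: array

In the 1D case gswap moves just the boundary element of each partition into the neighbour's halo slot (a boundary column in the 2D case), so a stencil-style kernel launched per device via JACC.multi.parallel_for never needs a full copy of its neighbours' data; JACC.multi.gid translates a per-device index into the halo-adjusted index inside each partition.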