Merge remote-tracking branch 'origin/main' into parallel_reduce-op-param
PhilipFackler committed Nov 8, 2024
2 parents d7eeba7 + 9140459 commit 0f5ba43
Showing 21 changed files with 1,878 additions and 1,199 deletions.
14 changes: 7 additions & 7 deletions .github/workflows/ci-gpu-AMD.yaml
@@ -19,7 +19,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- jobname: [ROCM6-JULIA1_9_1-AMDGPU0_8_6]
+ jobname: [ROCM6-JULIA1_10_4-AMDGPU0_8_6]

steps:
# Only trigger CI for certain "actors" (those commenting the PR, not the PR originator)
@@ -36,11 +36,11 @@ jobs:
- name: GitHub API Request
if: steps.check.outputs.triggered == 'true'
id: request
- uses: octokit/request-action@v2.1.9
+ uses: octokit/request-action@v2.x
with:
route: ${{github.event.issue.pull_request.url}}
env:
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}

# Create a separate PR status pointing at GitHub Actions tab URL
# just like any other third-party service
@@ -67,15 +67,15 @@ jobs:
if: steps.check.outputs.triggered == 'true'
uses: actions/checkout@v4
with:
- token: ${{ secrets.GITHUB_TOKEN }}
+ token: ${{secrets.GITHUB_TOKEN}}
repository: ${{fromJson(steps.request.outputs.data).head.repo.full_name}}
ref: ${{steps.pr_data.outputs.branch}}

- name: Instantiate
if: steps.check.outputs.triggered == 'true'
run: |
source /etc/profile.d/lmod.sh
- module load julia/1.9.1
+ module load julia/1.10.4
module load rocm
julia --project -e 'using Pkg; Pkg.instantiate()'
julia --project -e 'using JACC.JACCPreferences; JACCPreferences.set_backend("AMDGPU")'
@@ -84,15 +84,15 @@
if: steps.check.outputs.triggered == 'true'
run: |
source /etc/profile.d/lmod.sh
- module load julia/1.9.1
+ module load julia/1.10.4
module load rocm
julia --project -e 'using Pkg; Pkg.test()'
- name: Report PR status
if: always() && steps.check.outputs.triggered == 'true'
uses: geekdude/github-status-action-v2@v1.1.10
with:
- authToken: ${{ secrets.GITHUB_TOKEN }}
+ authToken: ${{secrets.GITHUB_TOKEN}}
context: "ci-GPU-AMD ${{matrix.jobname}}"
state: ${{job.status}}
sha: ${{fromJson(steps.request.outputs.data).head.sha}}
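
Aside on the Instantiate and test steps above: both load the julia/1.10.4 and rocm modules and then drive Julia through `julia --project -e`. Written out as a single Julia session, the setup amounts to the sketch below; the remark that the backend preference only takes effect in a later Julia process is an assumption based on how preference files are normally consumed, not something the workflow states.

    # The CI "Instantiate" step, expressed as one Julia session.
    using Pkg
    Pkg.instantiate()                      # resolve and install the project's dependencies

    using JACC.JACCPreferences
    JACCPreferences.set_backend("AMDGPU")  # record the AMDGPU backend preference
    # The test step then starts a fresh Julia process, which picks the preference up.
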
6 changes: 3 additions & 3 deletions .github/workflows/ci-gpu-NVIDIA.yaml
@@ -36,7 +36,7 @@ jobs:
- name: GitHub API Request
if: steps.check.outputs.triggered == 'true'
id: request
- uses: octokit/request-action@v2.1.9
+ uses: octokit/request-action@v2.x
with:
route: ${{github.event.issue.pull_request.url}}
env:
@@ -49,7 +49,7 @@
uses: geekdude/github-status-action-v2@v1.1.10
with:
authToken: ${{secrets.GITHUB_TOKEN}}
context: "ci-gpu-nvidia-ornl ${{ matrix.jobname }}"
context: "ci-gpu-NVIDIA ${{ matrix.jobname }}"
state: "pending"
sha: ${{fromJson(steps.request.outputs.data).head.sha}}
target_url: https://github.com/${{github.repository}}/actions/runs/${{github.run_id}}
@@ -86,7 +86,7 @@ jobs:
uses: geekdude/github-status-action-v2@v1.1.10
with:
authToken: ${{secrets.GITHUB_TOKEN}}
context: "ci-gpu-nvidia-ornl ${{matrix.jobname}}"
context: "ci-gpu-NVIDIA ${{matrix.jobname}}"
state: ${{job.status}}
sha: ${{fromJson(steps.request.outputs.data).head.sha}}
target_url: https://github.com/${{github.repository}}/actions/runs/${{github.run_id}}
2 changes: 1 addition & 1 deletion Project.toml
@@ -22,4 +22,4 @@ AMDGPU = "0.8"
Atomix = "0.1"
CUDA = "5"
Preferences = "1.4.0"
julia = "1.9.0"
julia = ">= 1.9.0"
40 changes: 25 additions & 15 deletions ext/JACCAMDGPU/JACCAMDGPU.jl
@@ -2,26 +2,33 @@ module JACCAMDGPU

using JACC, AMDGPU

+ const AMDGPUBackend = ROCBackend

# overloaded array functions
include("array.jl")

include("JACCMULTI.jl")
using .multi

# overloaded experimental functions
include("JACCEXPERIMENTAL.jl")
using .experimental

- function JACC.parallel_for(N::I, f::F, x...) where {I <: Integer, F <: Function}
+ JACC.get_backend(::Val{:amdgpu}) = AMDGPUBackend()
+
+ function JACC.parallel_for(::AMDGPUBackend, N::I, f::F, x...) where {I <: Integer, F <: Function}
numThreads = 512
threads = min(N, numThreads)
blocks = ceil(Int, N / threads)
# shmem_size = attribute(device(),CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)
# We must know how to get the max shared memory to be used in AMDGPU as it is done in CUDA
shmem_size = 2 * threads * sizeof(Float64)
- @roc groupsize = threads gridsize = blocks shmem = shmem_size _parallel_for_amdgpu(f, x...)
+ @roc groupsize = threads gridsize = blocks shmem = shmem_size _parallel_for_amdgpu(N, f, x...)
AMDGPU.synchronize()
end
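
Note on the two additions above: `JACC.get_backend(::Val{:amdgpu})` registers the backend object that the new backend-typed `parallel_for`/`parallel_reduce` methods dispatch on. A minimal sketch of how a front-end wrapper could use that hook follows; the lookup through `JACCPreferences.backend` and the lowercasing of the stored string are assumptions for illustration, not code shown in this diff.

    # Sketch of a front-end entry point living in the JACC module (illustrative only).
    function parallel_for(N::Integer, f::Function, x...)
        key = Symbol(lowercase(JACCPreferences.backend))  # e.g. "AMDGPU" -> :amdgpu (assumed)
        backend = get_backend(Val(key))                   # -> AMDGPUBackend() via the method above
        return parallel_for(backend, N, f, x...)          # dispatch to the backend-typed method
    end
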

function JACC.parallel_for(
- (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function}
+ ::AMDGPUBackend, (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function}
numThreads = 16
Mthreads = min(M, numThreads)
Nthreads = min(N, numThreads)
@@ -30,12 +30,12 @@ function JACC.parallel_for(
# shmem_size = attribute(device(),CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)
# We must know how to get the max shared memory to be used in AMDGPU as it is done in CUDA
shmem_size = 2 * Mthreads * Nthreads * sizeof(Float64)
- @roc groupsize = (Mthreads, Nthreads) gridsize = (Mblocks, Nblocks) shmem = shmem_size _parallel_for_amdgpu_MN(f, x...)
+ @roc groupsize = (Mthreads, Nthreads) gridsize = (Mblocks, Nblocks) shmem = shmem_size _parallel_for_amdgpu_MN((M,N), f, x...)
AMDGPU.synchronize()
end

function JACC.parallel_for(
- (L, M, N)::Tuple{I, I, I}, f::F, x...) where {
+ ::AMDGPUBackend, (L, M, N)::Tuple{I, I, I}, f::F, x...) where {
I <: Integer, F <: Function}
numThreads = 32
Lthreads = min(L, numThreads)
@@ -47,12 +47,12 @@ function JACC.parallel_for(
# shmem_size = attribute(device(),CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)
# We must know how to get the max shared memory to be used in AMDGPU as it is done in CUDA
shmem_size = 2 * Lthreads * Mthreads * Nthreads * sizeof(Float64)
- @roc groupsize = (Lthreads, Mthreads, Nthreads) gridsize = (Lblocks, Mblocks, Nblocks) shmem = shmem_size _parallel_for_amdgpu_LMN(f, x...)
+ @roc groupsize = (Lthreads, Mthreads, Nthreads) gridsize = (Lblocks, Mblocks, Nblocks) shmem = shmem_size _parallel_for_amdgpu_LMN((L,M,N), f, x...)
AMDGPU.synchronize()
end

function JACC.parallel_reduce(
- N::I, f::F, x...) where {I <: Integer, F <: Function}
+ ::AMDGPUBackend, N::I, f::F, x...) where {I <: Integer, F <: Function}
numThreads = 512
threads = min(N, numThreads)
blocks = ceil(Int, N / threads)
@@ -68,7 +75,7 @@ function JACC.parallel_reduce(
end
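
A usage sketch for the reduce method above, in the style of JACC's dot-product example: the kernel returns a per-index value and the reduction combines them (assumed here to be a sum; the collapsed body does not show the operator or whether the result comes back as a scalar or a one-element host array).

    using JACC, AMDGPU

    backend = ROCBackend()                  # aliased as AMDGPUBackend above
    a = ROCArray(ones(Float64, 1_000))
    b = ROCArray(fill(2.0, 1_000))

    # dot(a, b) as a reduction over i = 1:1000
    res = JACC.parallel_reduce(backend, 1_000, (i, a, b) -> a[i] * b[i], a, b)
    @show res                               # expected value: 2000.0
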

function JACC.parallel_reduce(
- (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function}
+ ::AMDGPUBackend, (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function}
numThreads = 16
Mthreads = min(M, numThreads)
Nthreads = min(N, numThreads)
@@ -85,23 +92,29 @@ function JACC.parallel_reduce(
return rret
end

- function _parallel_for_amdgpu(f, x...)
+ function _parallel_for_amdgpu(N, f, x...)
i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
+ i > N && return nothing
f(i, x...)
return nothing
end
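
The `i > N && return nothing` guard added above (and its 2D/3D counterparts below) exists because the launch rounds the grid up with `ceil(Int, N / threads)`, so the last workgroup can contain work items beyond `N`. A worked example of that arithmetic, runnable as plain Julia:

    N, numThreads = 1000, 512
    threads  = min(N, numThreads)        # 512 work items per workgroup
    blocks   = ceil(Int, N / threads)    # 2 workgroups
    launched = blocks * threads          # 1024 work items in total
    excess   = launched - N              # 24 items with i in 1001:1024
    @assert excess == 24                 # exactly the items the guard returns early for
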

- function _parallel_for_amdgpu_MN(f, x...)
+ function _parallel_for_amdgpu_MN((M,N), f, x...)
i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
j = (workgroupIdx().y - 1) * workgroupDim().y + workitemIdx().y
+ i > M && return nothing
+ j > N && return nothing
f(i, j, x...)
return nothing
end

- function _parallel_for_amdgpu_LMN(f, x...)
+ function _parallel_for_amdgpu_LMN((L,M,N), f, x...)
i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
j = (workgroupIdx().y - 1) * workgroupDim().y + workitemIdx().y
k = (workgroupIdx().z - 1) * workgroupDim().z + workitemIdx().z
+ i > L && return nothing
+ j > M && return nothing
+ k > N && return nothing
f(i, j, k, x...)
return nothing
end
@@ -389,9 +402,6 @@ function JACC.shared(x::ROCDeviceArray{T,N}) where {T,N}
return shmem
end


- function __init__()
- const JACC.Array = AMDGPU.ROCArray{T, N} where {T, N}
- end
+ JACC.array_type(::AMDGPUBackend) = AMDGPU.ROCArray{T, N} where {T, N}

end # module JACCAMDGPU
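
With the `__init__`-time rebinding of `JACC.Array` gone, callers ask the backend for its array type through `JACC.array_type` instead. A hedged usage sketch (the `Val(:amdgpu)` lookup mirrors the `get_backend` definition near the top of this file):

    using JACC, AMDGPU

    backend  = JACC.get_backend(Val(:amdgpu))   # AMDGPUBackend() per this extension
    DevArray = JACC.array_type(backend)         # AMDGPU.ROCArray{T, N} where {T, N}

    d_x = DevArray(zeros(Float64, 1_000))       # host -> device copy
    d_y = DevArray{Float32, 2}(undef, 64, 64)   # uninitialized 2D device array
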