Merge remote-tracking branch 'origin/main' into parallel_reduce-op-param
PhilipFackler committed Nov 8, 2024
2 parents d7eeba7 + 9140459 commit 0f5ba43
Showing 21 changed files with 1,878 additions and 1,199 deletions.
14 changes: 7 additions & 7 deletions .github/workflows/ci-gpu-AMD.yaml
@@ -19,7 +19,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- jobname: [ROCM6-JULIA1_9_1-AMDGPU0_8_6]
+ jobname: [ROCM6-JULIA1_10_4-AMDGPU0_8_6]

steps:
# Only trigger CI for certain "actors" (those commenting the PR, not the PR originator)
@@ -36,11 +36,11 @@ jobs:
- name: GitHub API Request
if: steps.check.outputs.triggered == 'true'
id: request
- uses: octokit/request-action@v2.1.9
+ uses: octokit/request-action@v2.x
with:
route: ${{github.event.issue.pull_request.url}}
env:
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}

# Create a separate PR status pointing at GitHub Actions tab URL
# just like any other third-party service
@@ -67,15 +67,15 @@ jobs:
if: steps.check.outputs.triggered == 'true'
uses: actions/checkout@v4
with:
- token: ${{ secrets.GITHUB_TOKEN }}
+ token: ${{secrets.GITHUB_TOKEN}}
repository: ${{fromJson(steps.request.outputs.data).head.repo.full_name}}
ref: ${{steps.pr_data.outputs.branch}}

- name: Instantiate
if: steps.check.outputs.triggered == 'true'
run: |
source /etc/profile.d/lmod.sh
- module load julia/1.9.1
+ module load julia/1.10.4
module load rocm
julia --project -e 'using Pkg; Pkg.instantiate()'
julia --project -e 'using JACC.JACCPreferences; JACCPreferences.set_backend("AMDGPU")'
@@ -84,15 +84,15 @@
if: steps.check.outputs.triggered == 'true'
run: |
source /etc/profile.d/lmod.sh
- module load julia/1.9.1
+ module load julia/1.10.4
module load rocm
julia --project -e 'using Pkg; Pkg.test()'
- name: Report PR status
if: always() && steps.check.outputs.triggered == 'true'
uses: geekdude/github-status-action-v2@v1.1.10
with:
- authToken: ${{ secrets.GITHUB_TOKEN }}
+ authToken: ${{secrets.GITHUB_TOKEN}}
context: "ci-GPU-AMD ${{matrix.jobname}}"
state: ${{job.status}}
sha: ${{fromJson(steps.request.outputs.data).head.sha}}
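
Aside on the Instantiate and test steps above: both load the julia/1.10.4 and rocm modules and then drive Julia through `julia --project -e`. Written out as a single Julia session, the setup amounts to the sketch below; the remark that the backend preference only takes effect in a later Julia process is an assumption based on how preference files are normally consumed, not something the workflow states.

    # The CI "Instantiate" step, expressed as one Julia session.
    using Pkg
    Pkg.instantiate()                      # resolve and install the project's dependencies

    using JACC.JACCPreferences
    JACCPreferences.set_backend("AMDGPU")  # record the AMDGPU backend preference
    # The test step then starts a fresh Julia process, which picks the preference up.
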
6 changes: 3 additions & 3 deletions .github/workflows/ci-gpu-NVIDIA.yaml
@@ -36,7 +36,7 @@ jobs:
- name: GitHub API Request
if: steps.check.outputs.triggered == 'true'
id: request
- uses: octokit/request-action@v2.1.9
+ uses: octokit/request-action@v2.x
with:
route: ${{github.event.issue.pull_request.url}}
env:
@@ -49,7 +49,7 @@
uses: geekdude/github-status-action-v2@v1.1.10
with:
authToken: ${{secrets.GITHUB_TOKEN}}
context: "ci-gpu-nvidia-ornl ${{ matrix.jobname }}"
context: "ci-gpu-NVIDIA ${{ matrix.jobname }}"
state: "pending"
sha: ${{fromJson(steps.request.outputs.data).head.sha}}
target_url: https://github.com/${{github.repository}}/actions/runs/${{github.run_id}}
@@ -86,7 +86,7 @@ jobs:
uses: geekdude/github-status-action-v2@v1.1.10
with:
authToken: ${{secrets.GITHUB_TOKEN}}
context: "ci-gpu-nvidia-ornl ${{matrix.jobname}}"
context: "ci-gpu-NVIDIA ${{matrix.jobname}}"
state: ${{job.status}}
sha: ${{fromJson(steps.request.outputs.data).head.sha}}
target_url: https://github.com/${{github.repository}}/actions/runs/${{github.run_id}}
2 changes: 1 addition & 1 deletion Project.toml
@@ -22,4 +22,4 @@ AMDGPU = "0.8"
Atomix = "0.1"
CUDA = "5"
Preferences = "1.4.0"
julia = "1.9.0"
julia = ">= 1.9.0"
40 changes: 25 additions & 15 deletions ext/JACCAMDGPU/JACCAMDGPU.jl
@@ -2,26 +2,33 @@ module JACCAMDGPU

using JACC, AMDGPU

+ const AMDGPUBackend = ROCBackend

# overloaded array functions
include("array.jl")

include("JACCMULTI.jl")
using .multi

# overloaded experimental functions
include("JACCEXPERIMENTAL.jl")
using .experimental

- function JACC.parallel_for(N::I, f::F, x...) where {I <: Integer, F <: Function}
+ JACC.get_backend(::Val{:amdgpu}) = AMDGPUBackend()
+
+ function JACC.parallel_for(::AMDGPUBackend, N::I, f::F, x...) where {I <: Integer, F <: Function}
numThreads = 512
threads = min(N, numThreads)
blocks = ceil(Int, N / threads)
# shmem_size = attribute(device(),CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)
# We must know how to get the max shared memory to be used in AMDGPU as it is done in CUDA
shmem_size = 2 * threads * sizeof(Float64)
- @roc groupsize = threads gridsize = blocks shmem = shmem_size _parallel_for_amdgpu(f, x...)
+ @roc groupsize = threads gridsize = blocks shmem = shmem_size _parallel_for_amdgpu(N, f, x...)
AMDGPU.synchronize()
end
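
Note on the two additions above: `JACC.get_backend(::Val{:amdgpu})` registers the backend object that the new backend-typed `parallel_for`/`parallel_reduce` methods dispatch on. A minimal sketch of how a front-end wrapper could use that hook follows; the lookup through `JACCPreferences.backend` and the lowercasing of the stored string are assumptions for illustration, not code shown in this diff.

    # Sketch of a front-end entry point living in the JACC module (illustrative only).
    function parallel_for(N::Integer, f::Function, x...)
        key = Symbol(lowercase(JACCPreferences.backend))  # e.g. "AMDGPU" -> :amdgpu (assumed)
        backend = get_backend(Val(key))                   # -> AMDGPUBackend() via the method above
        return parallel_for(backend, N, f, x...)          # dispatch to the backend-typed method
    end
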

function JACC.parallel_for(
- (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function}
+ ::AMDGPUBackend, (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function}
numThreads = 16
Mthreads = min(M, numThreads)
Nthreads = min(N, numThreads)
@@ -30,12 +30,12 @@ function JACC.parallel_for(
# shmem_size = attribute(device(),CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)
# We must know how to get the max shared memory to be used in AMDGPU as it is done in CUDA
shmem_size = 2 * Mthreads * Nthreads * sizeof(Float64)
- @roc groupsize = (Mthreads, Nthreads) gridsize = (Mblocks, Nblocks) shmem = shmem_size _parallel_for_amdgpu_MN(f, x...)
+ @roc groupsize = (Mthreads, Nthreads) gridsize = (Mblocks, Nblocks) shmem = shmem_size _parallel_for_amdgpu_MN((M,N), f, x...)
AMDGPU.synchronize()
end

function JACC.parallel_for(
- (L, M, N)::Tuple{I, I, I}, f::F, x...) where {
+ ::AMDGPUBackend, (L, M, N)::Tuple{I, I, I}, f::F, x...) where {
I <: Integer, F <: Function}
numThreads = 32
Lthreads = min(L, numThreads)
@@ -47,12 +47,12 @@ function JACC.parallel_for(
# shmem_size = attribute(device(),CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)
# We must know how to get the max shared memory to be used in AMDGPU as it is done in CUDA
shmem_size = 2 * Lthreads * Mthreads * Nthreads * sizeof(Float64)
- @roc groupsize = (Lthreads, Mthreads, Nthreads) gridsize = (Lblocks, Mblocks, Nblocks) shmem = shmem_size _parallel_for_amdgpu_LMN(f, x...)
+ @roc groupsize = (Lthreads, Mthreads, Nthreads) gridsize = (Lblocks, Mblocks, Nblocks) shmem = shmem_size _parallel_for_amdgpu_LMN((L,M,N), f, x...)
AMDGPU.synchronize()
end

function JACC.parallel_reduce(
- N::I, f::F, x...) where {I <: Integer, F <: Function}
+ ::AMDGPUBackend, N::I, f::F, x...) where {I <: Integer, F <: Function}
numThreads = 512
threads = min(N, numThreads)
blocks = ceil(Int, N / threads)
@@ -68,7 +75,7 @@ function JACC.parallel_reduce(
end
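
A usage sketch for the reduce method above, in the style of JACC's dot-product example: the kernel returns a per-index value and the reduction combines them (assumed here to be a sum; the collapsed body does not show the operator or whether the result comes back as a scalar or a one-element host array).

    using JACC, AMDGPU

    backend = ROCBackend()                  # aliased as AMDGPUBackend above
    a = ROCArray(ones(Float64, 1_000))
    b = ROCArray(fill(2.0, 1_000))

    # dot(a, b) as a reduction over i = 1:1000
    res = JACC.parallel_reduce(backend, 1_000, (i, a, b) -> a[i] * b[i], a, b)
    @show res                               # expected value: 2000.0
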

function JACC.parallel_reduce(
- (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function}
+ ::AMDGPUBackend, (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function}
numThreads = 16
Mthreads = min(M, numThreads)
Nthreads = min(N, numThreads)
@@ -85,23 +92,29 @@ function JACC.parallel_reduce(
return rret
end

- function _parallel_for_amdgpu(f, x...)
+ function _parallel_for_amdgpu(N, f, x...)
i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
+ i > N && return nothing
f(i, x...)
return nothing
end
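
The `i > N && return nothing` guard added above (and its 2D/3D counterparts below) exists because the launch rounds the grid up with `ceil(Int, N / threads)`, so the last workgroup can contain work items beyond `N`. A worked example of that arithmetic, runnable as plain Julia:

    N, numThreads = 1000, 512
    threads  = min(N, numThreads)        # 512 work items per workgroup
    blocks   = ceil(Int, N / threads)    # 2 workgroups
    launched = blocks * threads          # 1024 work items in total
    excess   = launched - N              # 24 items with i in 1001:1024
    @assert excess == 24                 # exactly the items the guard returns early for
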

- function _parallel_for_amdgpu_MN(f, x...)
+ function _parallel_for_amdgpu_MN((M,N), f, x...)
i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
j = (workgroupIdx().y - 1) * workgroupDim().y + workitemIdx().y
+ i > M && return nothing
+ j > N && return nothing
f(i, j, x...)
return nothing
end

- function _parallel_for_amdgpu_LMN(f, x...)
+ function _parallel_for_amdgpu_LMN((L,M,N), f, x...)
i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
j = (workgroupIdx().y - 1) * workgroupDim().y + workitemIdx().y
k = (workgroupIdx().z - 1) * workgroupDim().z + workitemIdx().z
+ i > L && return nothing
+ j > M && return nothing
+ k > N && return nothing
f(i, j, k, x...)
return nothing
end
@@ -389,9 +402,6 @@ function JACC.shared(x::ROCDeviceArray{T,N}) where {T,N}
return shmem
end


- function __init__()
- const JACC.Array = AMDGPU.ROCArray{T, N} where {T, N}
- end
+ JACC.array_type(::AMDGPUBackend) = AMDGPU.ROCArray{T, N} where {T, N}

end # module JACCAMDGPU
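
With the `__init__`-time rebinding of `JACC.Array` gone, callers ask the backend for its array type through `JACC.array_type` instead. A hedged usage sketch (the `Val(:amdgpu)` lookup mirrors the `get_backend` definition near the top of this file):

    using JACC, AMDGPU

    backend  = JACC.get_backend(Val(:amdgpu))   # AMDGPUBackend() per this extension
    DevArray = JACC.array_type(backend)         # AMDGPU.ROCArray{T, N} where {T, N}

    d_x = DevArray(zeros(Float64, 1_000))       # host -> device copy
    d_y = DevArray{Float32, 2}(undef, 64, 64)   # uninitialized 2D device array
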