Skip to content

Commit

Permalink
Merge pull request #33 from JuliaGPU/jps/fix-multigpu-move
Browse files Browse the repository at this point in the history
Fix multi-GPU data movement
  • Loading branch information
jpsamaroo authored Dec 20, 2023
2 parents 319a71e + 10f3c85 commit 8f9d530
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 28 deletions.
8 changes: 4 additions & 4 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,12 @@ MetalExt = "Metal"
ROCExt = "AMDGPU"

[compat]
AMDGPU = "0.4"
Adapt = "1, 2, 3"
CUDA = "3, 4"
AMDGPU = "0.8.1"
Adapt = "1, 2, 3, 4"
CUDA = "3, 4, 5"
Dagger = "0.17, 0.18"
KernelAbstractions = "0.9"
MemPool = "0.3, 0.4"
Metal = "0.3, 0.4"
Metal = "0.3, 0.4, 0.5"
Requires = "1"
julia = "1.7"
18 changes: 18 additions & 0 deletions ext/CUDAExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,24 @@ function Dagger.move(from::CuArrayDeviceProc, to::CuArrayDeviceProc, x::Dagger.C
end
end

# Move a `CuArray` when the scheduler routes data toward a CUDA device processor.
# If the array already lives on the target device, it is passed through untouched;
# otherwise a fresh buffer is allocated on `to_proc`'s device and the data copied in.
# NOTE(review): `x` is a `CuArray` despite `from_proc::CPUProc` — presumably covers
# data that is already device-resident when ownership transfers; confirm with callers.
function Dagger.move(from_proc::CPUProc, to_proc::CuArrayDeviceProc, x::CuArray)
    # TODO: No extra allocations here
    # `to_proc.device` is zero-based; `CUDA.devices()` iterates one-based after collect.
    target_device = collect(CUDA.devices())[to_proc.device + 1]
    CUDA.device(x) == target_device && return x
    return DaggerGPU.with_device(to_proc) do
        dest = similar(x)
        copyto!(dest, x)
        dest
    end
end

# Move a `CuArray` from a CUDA device processor back to a CPU processor by
# allocating a host `Array` of matching eltype/shape and copying the data down.
function Dagger.move(from_proc::CuArrayDeviceProc, to_proc::CPUProc, x::CuArray{T,N}) where {T,N}
    host = Array{T,N}(undef, size(x))
    copyto!(host, x)
    return host
end

function Dagger.execute!(proc::CuArrayDeviceProc, f, args...; kwargs...)
@nospecialize f args kwargs
tls = Dagger.get_tls()
Expand Down
66 changes: 42 additions & 24 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -79,28 +79,37 @@ end
CuArrayDeviceProc
end
@test DaggerGPU.processor(:CUDA) === cuproc
b = generate_thunks()
c = Dagger.with_options(;scope=Dagger.scope(cuda_gpu=1)) do
@test fetch(Dagger.@spawn isongpu(b))
Dagger.@spawn sum(b)
ndevices = length(collect(CUDA.devices()))

@testset "Arrays (GPU $gpu)" for gpu in 1:min(ndevices, 2)
b = generate_thunks()
c = Dagger.with_options(;scope=Dagger.scope(cuda_gpu=gpu)) do
@test fetch(Dagger.@spawn isongpu(b))
Dagger.@spawn sum(b)
end
@test !fetch(Dagger.@spawn isongpu(b))
@test fetch(Dagger.@spawn identity(c)) == 20
end
@test !fetch(Dagger.@spawn isongpu(b))
@test fetch(Dagger.@spawn identity(c)) == 20

@testset "KernelAbstractions" begin
@testset "KernelAbstractions (GPU $gpu)" for gpu in 1:min(ndevices, 2)
A = rand(Float32, 8)
DA, T = Dagger.with_options(;scope=Dagger.scope(cuda_gpu=1)) do
DA, T = Dagger.with_options(;scope=Dagger.scope(cuda_gpu=gpu)) do
fetch(Dagger.@spawn fill_thunk(A, 2.3f0))
end
@test all(DA .== 2.3f0)
@test T <: CuArray

A = CUDA.rand(128)
B = CUDA.zeros(128)
Dagger.with_options(;scope=Dagger.scope(worker=1,cuda_gpu=1)) do
local A, B
CUDA.device!(gpu-1) do
A = CUDA.rand(128)
B = CUDA.zeros(128)
end
Dagger.with_options(;scope=Dagger.scope(worker=1,cuda_gpu=gpu)) do
fetch(Dagger.@spawn Kernel(copy_kernel)(B, A; ndrange=length(A)))
end
@test all(B .== A)
CUDA.device!(gpu-1) do
@test all(B .== A)
end
end
end
end
Expand All @@ -115,28 +124,37 @@ end
ROCArrayDeviceProc
end
@test DaggerGPU.processor(:ROC) === rocproc
b = generate_thunks()
c = Dagger.with_options(;scope=Dagger.scope(rocm_gpu=1)) do
@test fetch(Dagger.@spawn isongpu(b))
Dagger.@spawn sum(b)
ndevices = length(AMDGPU.devices())

@testset "Arrays (GPU $gpu)" for gpu in 1:min(ndevices, 2)
b = generate_thunks()
c = Dagger.with_options(;scope=Dagger.scope(rocm_gpu=gpu)) do
@test fetch(Dagger.@spawn isongpu(b))
Dagger.@spawn sum(b)
end
@test !fetch(Dagger.@spawn isongpu(b))
@test fetch(Dagger.@spawn identity(c)) == 20
end
@test !fetch(Dagger.@spawn isongpu(b))
@test fetch(Dagger.@spawn identity(c)) == 20

@testset "KernelAbstractions" begin
@testset "KernelAbstractions (GPU $gpu)" for gpu in 1:min(ndevices, 2)
A = rand(Float32, 8)
DA, T = Dagger.with_options(;scope=Dagger.scope(rocm_gpu=1)) do
DA, T = Dagger.with_options(;scope=Dagger.scope(rocm_gpu=gpu)) do
fetch(Dagger.@spawn fill_thunk(A, 2.3f0))
end
@test all(DA .== 2.3f0)
@test T <: ROCArray

A = AMDGPU.rand(128)
B = AMDGPU.zeros(128)
Dagger.with_options(;scope=Dagger.scope(worker=1,rocm_gpu=1)) do
local A, B
AMDGPU.device!(AMDGPU.devices()[gpu]) do
A = AMDGPU.rand(128)
B = AMDGPU.zeros(128)
end
Dagger.with_options(;scope=Dagger.scope(worker=1,rocm_gpu=gpu)) do
fetch(Dagger.@spawn Kernel(copy_kernel)(B, A; ndrange=length(A)))
end
@test all(B .== A)
AMDGPU.device!(AMDGPU.devices()[gpu]) do
@test all(B .== A)
end
end
end
end
Expand Down

0 comments on commit 8f9d530

Please sign in to comment.