GPU support for Base.repeat #126

Closed · wants to merge 10 commits
3 changes: 2 additions & 1 deletion src/GPUArrays.jl
@@ -18,9 +18,10 @@ include("convolution.jl")
include("testsuite/testsuite.jl")
include("jlbackend.jl")
include("random.jl")
include("repeat.jl")

export GPUArray, gpu_call, thread_blocks_heuristic, global_size, synchronize_threads
export linear_index, @linearidx, @cartesianidx, convolution!, device, synchronize
export linear_index, @linearidx, @cartesianidx, convolution!, device, synchronize, gpu_repeat, gpu_repeat_grad
export JLArray

end # module
119 changes: 119 additions & 0 deletions src/repeat.jl
@@ -0,0 +1,119 @@
using Base: tail

# Like `ntuple(f, Val{N})`, but forwards `args...` to every call of `f`, so
# `f` receives them explicitly instead of capturing them in a closure.
@generated function ntuple_args(f, ::Val{N}, args::Vararg{<: Any, Nargs}) where {N, Nargs}
    expr = Expr(:tuple)
    for i = 1:N
        call = Expr(:call, :f, i)
        for j = 1:Nargs
            push!(call.args, :(args[$j]))
        end
        push!(expr.args, call)
    end
    quote
        Base.@_inline_meta
        $expr
    end
end
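
# Illustration (an editor's sketch, not part of the original diff): with
# N = 3 and one forwarded argument, the generated body unrolls to
# (f(1, args[1]), f(2, args[1]), f(3, args[1])), so for example
#   ntuple_args((i, s) -> i * s, Val{3}(), 10) == (10, 20, 30)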

# Shape computation, mirroring the `_rshps` helpers behind `Base.repeat`:
# returns the full output shape and the intermediate shape after inner
# repetition alone.
rep_shapes(A, i, o) = _rshps((), (), size(A), i, o)

_rshps(shp, shp_i, ::Tuple{}, ::Tuple{}, ::Tuple{}) = (shp, shp_i)
@inline _rshps(shp, shp_i, ::Tuple{}, ::Tuple{}, o) =
    _rshps((shp..., o[1]), (shp_i..., 1), (), (), tail(o))
@inline _rshps(shp, shp_i, ::Tuple{}, i, ::Tuple{}) = (n = i[1];
    _rshps((shp..., n), (shp_i..., n), (), tail(i), ()))
@inline _rshps(shp, shp_i, ::Tuple{}, i, o) = (n = i[1];
    _rshps((shp..., n * o[1]), (shp_i..., n), (), tail(i), tail(o)))
@inline _rshps(shp, shp_i, sz, i, o) = (n = sz[1] * i[1];
    _rshps((shp..., n * o[1]), (shp_i..., n), tail(sz), tail(i), tail(o)))
_rshps(shp, shp_i, sz, ::Tuple{}, ::Tuple{}) =
    (n = length(shp); N = n + length(sz); _reperr("inner", n, N))
_rshps(shp, shp_i, sz, ::Tuple{}, o) =
    (n = length(shp); N = n + length(sz); _reperr("inner", n, N))
_rshps(shp, shp_i, sz, i, ::Tuple{}) =
    (n = length(shp); N = n + length(sz); _reperr("outer", n, N))
_reperr(s, n, N) = throw(ArgumentError("number of " * s * " repetitions " *
    "($n) cannot be less than number of dimensions of input ($N)"))

# One thread per *source* element of `A`: each surviving thread writes every
# inner/outer repetition of its element into `out`. Note that the tiling below
# hard-codes dimensions 1 and 2 (n1/n2), so the kernel supports 2-d input only.
function repeat_kernel(state, A::AbstractArray{T}, out::AbstractArray{T}, inner, outer, Asize, outSize, inner_shape) where T
    ilin = linear_index(state)
    idx = GPUArrays.gpu_ind2sub(outSize, ilin)
    # The grid is sized for `out`; only threads whose subscripts also fall
    # inside `A` do any work, so each source element is handled exactly once.
    if idx[1] > Asize[1] || idx[2] > Asize[2]
        return
    end

    for n1 in UInt32(1):UInt32(outer[1])
        for n2 in UInt32(1):UInt32(outer[2])
            # First output subscript covered by this element in outer tile (n1, n2).
            inner_start_indices = ntuple_args(Val{length(inner)}(), inner, idx, n1, n2, inner_shape) do i, inner, idx, n1, n2, inner_shape
                if UInt32(i) == UInt32(1)
                    @inbounds return UInt32(1) + (idx[i] - UInt32(1)) * inner[i] + (n1 - UInt32(1)) * inner_shape[1]
                elseif UInt32(i) == UInt32(2)
                    @inbounds return UInt32(1) + (idx[i] - UInt32(1)) * inner[i] + (n2 - UInt32(1)) * inner_shape[2]
                else
                    @inbounds return UInt32(1) + (idx[i] - UInt32(1)) * inner[i]
                end
            end

            # Last output subscript covered by this element in outer tile (n1, n2).
            inner_end_indices = ntuple_args(Val{length(inner)}(), inner, idx, n1, n2, inner_shape) do i, inner, idx, n1, n2, inner_shape
                if UInt32(i) == UInt32(1)
                    @inbounds return inner[i] + (idx[i] - UInt32(1)) * inner[i] + (n1 - UInt32(1)) * inner_shape[1]
                elseif UInt32(i) == UInt32(2)
                    @inbounds return inner[i] + (idx[i] - UInt32(1)) * inner[i] + (n2 - UInt32(1)) * inner_shape[2]
                else
                    @inbounds return inner[i] + (idx[i] - UInt32(1)) * inner[i]
                end
            end

            # Fill the whole inner block with the source value.
            for i in inner_start_indices[1]:inner_end_indices[1]
                for j in inner_start_indices[2]:inner_end_indices[2]
                    @inbounds out[i, j] = A[idx[1], idx[2]]
                end
            end
        end
    end

    # No barrier is needed: every thread writes a disjoint region of `out`,
    # and a trailing synchronize_threads would be divergent anyway, since
    # early-returning threads never reach it.
    return
end
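
# Index-math example (added for illustration): with Asize = (2, 3),
# inner = (2, 2), outer = (1, 2), the inner-repeated shape is
# inner_shape = (4, 6). The thread owning source element idx = (2, 1)
# writes, in outer tile (n1, n2) = (1, 2), the block rows 3:4, cols 7:8,
# since start_d = 1 + (idx_d - 1) * inner_d + (tile_d - 1) * inner_shape_d.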


# Backward pass for `gpu_repeat`, written as a *gather*: one thread per element
# of `A` sums the contributions of all of that element's inner/outer
# repetitions from `delta`. Gathering keeps each thread writing its own cell of
# `out`, so no barriers or atomics are needed (a scatter over `delta` would
# race on `out`). Like the forward kernel, the tiling hard-codes two dimensions.
function repeat_back_kernel(state, delta::AbstractArray{T}, out::AbstractArray{T}, inner, outer, Asize, inner_shape) where T
    ilin = linear_index(state)
    idx = GPUArrays.gpu_ind2sub(Asize, ilin)
    if idx[1] > Asize[1] || idx[2] > Asize[2]
        return
    end

    acc = zero(T)
    for n1 in UInt32(1):UInt32(outer[1])
        for n2 in UInt32(1):UInt32(outer[2])
            # Block of `delta` written by this source element in outer tile
            # (n1, n2); same index math as the forward kernel.
            start1 = UInt32(1) + (idx[1] - UInt32(1)) * inner[1] + (n1 - UInt32(1)) * inner_shape[1]
            start2 = UInt32(1) + (idx[2] - UInt32(1)) * inner[2] + (n2 - UInt32(1)) * inner_shape[2]
            for i in start1:(start1 + inner[1] - UInt32(1))
                for j in start2:(start2 + inner[2] - UInt32(1))
                    @inbounds acc += delta[i, j]
                end
            end
        end
    end

    @inbounds out[idx[1], idx[2]] = acc
    return
end

function gpu_repeat(A::GPUArray, inner, outer)
    shape, inner_shape = rep_shapes(A, inner, outer)
    R = similar(A, shape)
    if any(iszero, shape)
        return R
    end
    # Dispatch over `R` so one thread is launched per *output* element; the
    # kernel discards the threads that fall outside `A`'s extent.
    gpu_call(repeat_kernel, R, (A, R, inner, outer, UInt32.(size(A)), UInt32.(shape), UInt32.(inner_shape)))
    return R
end
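
# Usage sketch (an editor's illustration, exercised here through the JLArray
# reference backend; any GPUArray subtype would be called the same way):
#   A = JLArray(rand(Float32, 2, 3))
#   B = gpu_repeat(A, (2, 2), (3, 1))   # size(B) == (12, 6)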

function gpu_repeat_grad(A::GPUArray, delta::GPUArray, inner, outer)
    _, inner_shape = rep_shapes(A, inner, outer)
    R = similar(A)
    # Dispatch over `A`: one gathering thread per source element. Every cell
    # of `R` is written unconditionally, so no zero-fill is needed.
    gpu_call(repeat_back_kernel, A, (delta, R, inner, outer, UInt32.(size(A)), UInt32.(inner_shape)))
    return R
end
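
# Gradient sanity check (editor's sketch): repeat is linear, so pulling back
# an all-ones delta should count how often each source element appears,
# i.e. inner[1]*inner[2]*outer[1]*outer[2] times:
#   delta = JLArray(ones(Float32, 12, 6))
#   dA = gpu_repeat_grad(A, delta, (2, 2), (3, 1))
#   # each element of A is repeated 2*2*3*1 = 12 times, so dA == fill(12f0, 2, 3)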