-
Notifications
You must be signed in to change notification settings - Fork 81
/
Copy pathabstractarray.jl
228 lines (182 loc) · 7.72 KB
/
abstractarray.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
# core definition of the AbstractGPUArray type
# input/output
## serialization
using Serialization: AbstractSerializer, serialize_type
# Serialize a GPU array by first downloading its contents to a CPU `Array`.
# The concrete GPU type `T` is written first so deserialization can rebuild it.
function Serialization.serialize(ser::AbstractSerializer, x::T) where {T<:AbstractGPUArray}
    serialize_type(ser, T)
    serialize(ser, Array(x))
    return
end
# Deserialize a GPU array: read back the CPU `Array` payload, then upload it by
# calling the GPU type's `Array`-accepting constructor.
function Serialization.deserialize(ser::AbstractSerializer, ::Type{T}) where {T<:AbstractGPUArray}
    cpu_data = deserialize(ser)
    return T(cpu_data)
end
## showing
# Adapt.jl storage marker: `adapt(ToArray(), x)` walks any wrapper layers and
# converts the underlying AbstractGPUArray storage into a plain CPU `Array`.
struct ToArray end
Adapt.adapt_storage(::ToArray, xs::AbstractGPUArray) = convert(Array, xs)
# display
# Display entry point: download to the CPU first so printing does not perform
# scalar indexing on the GPU, then reuse Base's array printer.
function Base.print_array(io::IO, X::AnyGPUArray)
    cpu = adapt(ToArray(), X)
    return Base.print_array(io, cpu)
end
# show
# `show` for non-empty arrays: delegate to Base on a CPU copy of the data.
function Base._show_nonempty(io::IO, X::AnyGPUArray, prefix::String)
    cpu = adapt(ToArray(), X)
    return Base._show_nonempty(io, cpu, prefix)
end
# `show` for empty arrays: delegate to Base on a CPU copy of the data.
function Base._show_empty(io::IO, X::AnyGPUArray)
    cpu = adapt(ToArray(), X)
    return Base._show_empty(io, cpu)
end
# `show` for vectors: delegate to Base on a CPU copy of the data, forwarding
# any extra arguments (e.g. opening/closing brackets) unchanged.
function Base.show_vector(io::IO, v::AnyGPUArray, args...)
    cpu = adapt(ToArray(), v)
    return Base.show_vector(io, cpu, args...)
end
## collect to CPU (discarding wrapper type)
# Collect any (possibly wrapped) GPU array to a CPU `Array`, discarding the
# wrapper type: first swap the storage to CPU, then let `collect` materialize.
function collect_to_cpu(xs::AbstractArray)
    cpu = adapt(ToArray(), xs)
    return collect(cpu)
end
Base.collect(X::AnyGPUArray) = collect_to_cpu(X)
# memory copying
# In-place copy between GPU vectors. Unlike `copyto!`, `copy!` requires the
# destination and source to have exactly matching axes.
function Base.copy!(dst::AbstractGPUVector, src::AbstractGPUVector)
    if axes(dst) != axes(src)
        throw(ArgumentError(
            "arrays must have the same axes for `copy!`. consider using `copyto!` instead"))
    end
    return copyto!(dst, src)
end
## basic linear copies of identically-typed memory
# expects the GPU array type to have linear `copyto!` methods (i.e. accepting an integer
# offset and length) from and to CPU arrays and between GPU arrays.
# Generate range-based `copyto!` methods for every (destination, source)
# combination of CPU and GPU arrays. All of them eventually lower to the
# linear `copyto!(dest, doffset, src, soffset, len)` form that each GPU
# backend is expected to implement.
for (D, S) in ((AnyGPUArray, Array),
               (Array, AnyGPUArray),
               (AnyGPUArray, AnyGPUArray))
    @eval begin
        # `UnitRange` index ranges: wrap them as 1-D `CartesianIndices` and
        # defer to the Cartesian method below.
        function Base.copyto!(dest::$D{<:Any, N}, rdest::UnitRange,
                              src::$S{<:Any, N}, ssrc::UnitRange) where {N}
            drange = CartesianIndices((rdest,))
            srange = CartesianIndices((ssrc,))
            copyto!(dest, drange, src, srange)
        end
        # 1-D Cartesian ranges: validate that both ranges have the same
        # length, then lower to the backend's linear integer-offset copy.
        function Base.copyto!(dest::$D, d_range::CartesianIndices{1},
                              src::$S, s_range::CartesianIndices{1})
            len = length(d_range)
            if length(s_range) != len
                throw(ArgumentError("Copy range needs same length. Found: dest: $len, src: $(length(s_range))"))
            end
            len == 0 && return dest   # nothing to copy
            # extract the starting linear offsets from the 1-D Cartesian ranges
            d_offset = first(d_range)[1]
            s_offset = first(s_range)[1]
            copyto!(dest, d_offset, src, s_offset, len)
        end
        # Whole-array copy: copy all of `src` starting at index 1 of both.
        Base.copyto!(dest::$D, src::$S) = copyto!(dest, 1, src, 1, length(src))
    end
end
# kernel-based variant for copying between wrapped GPU arrays
# GPU kernel: each invocation copies at most one element, moving `n` elements
# from `src[sstart:...]` into `dest[dstart:...]`.
function linear_copy_kernel!(ctx::AbstractKernelContext, dest, dstart, src, sstart, n)
    offset = linear_index(ctx) - 1   # zero-based offset within the copy
    offset < n || return             # extra threads past the end do nothing
    @inbounds dest[dstart+offset] = src[sstart+offset]
    return
end
# Linear copy between (possibly wrapped) GPU arrays: validate the ranges on
# the host, then launch one kernel thread per element.
function Base.copyto!(dest::AnyGPUArray, dstart::Integer,
                      src::AnyGPUArray, sstart::Integer, n::Integer)
    n == 0 && return dest
    n < 0 && throw(ArgumentError(string("tried to copy n=", n, " elements, but n should be nonnegative")))
    dinds = LinearIndices(dest)
    sinds = LinearIndices(src)
    # both endpoints of each range must be valid linear indices
    if !(checkbounds(Bool, dinds, dstart) && checkbounds(Bool, dinds, dstart+n-1))
        throw(BoundsError(dest, dstart:dstart+n-1))
    end
    if !(checkbounds(Bool, sinds, sstart) && checkbounds(Bool, sinds, sstart+n-1))
        throw(BoundsError(src, sstart:sstart+n-1))
    end
    gpu_call(linear_copy_kernel!,
             dest, dstart, src, sstart, n;
             elements=n)
    return dest
end
# variants that materialize the GPU wrapper before copying from or to the CPU
# Copy from a wrapped GPU array to the CPU: first materialize the wrapped
# source into a contiguous temporary GPU buffer, then download that buffer.
function Base.copyto!(dest::Array, dstart::Integer,
                      src::WrappedGPUArray, sstart::Integer, n::Integer)
    n == 0 && return dest
    staging = similar(parent(src), n)      # contiguous GPU scratch buffer
    copyto!(staging, 1, src, sstart, n)    # GPU→GPU, resolves the wrapper
    copyto!(dest, dstart, staging, 1, n)   # GPU→CPU, plain linear copy
    return dest
end
# Copy from the CPU into a wrapped GPU array: upload into a contiguous
# temporary GPU buffer first, then scatter into the wrapped destination.
function Base.copyto!(dest::WrappedGPUArray, dstart::Integer,
                      src::Array, sstart::Integer, n::Integer)
    n == 0 && return dest
    staging = similar(parent(dest), n)     # contiguous GPU scratch buffer
    copyto!(staging, 1, src, sstart, n)    # CPU→GPU, plain linear copy
    copyto!(dest, dstart, staging, 1, n)   # GPU→GPU, resolves the wrapper
    return dest
end
# variants that convert values on the CPU when there's a type mismatch
#
# we prefer to convert on the CPU where there's typically more memory / less memory pressure
# to quickly perform these very lightweight conversions
# Download with element-type conversion (GPU eltype `U` → CPU eltype `T`):
# transfer the raw `U` values first, then let the CPU-side `copyto!` convert.
function Base.copyto!(dest::Array{T}, dstart::Integer,
                      src::AnyGPUArray{U}, sstart::Integer,
                      n::Integer) where {T,U}
    n == 0 && return dest
    staging = Vector{U}(undef, n)          # CPU buffer in the source eltype
    copyto!(staging, 1, src, sstart, n)    # GPU→CPU, no conversion
    copyto!(dest, dstart, staging, 1, n)   # CPU→CPU, converts U → T
    return dest
end
# Upload with element-type conversion (CPU eltype `U` → GPU eltype `T`):
# convert on the CPU first, then transfer the already-converted `T` values.
function Base.copyto!(dest::AnyGPUArray{T}, dstart::Integer,
                      src::Array{U}, sstart::Integer, n::Integer) where {T,U}
    n == 0 && return dest
    staging = Vector{T}(undef, n)          # CPU buffer in the destination eltype
    copyto!(staging, 1, src, sstart, n)    # CPU→CPU, converts U → T
    copyto!(dest, dstart, staging, 1, n)   # CPU→GPU, no conversion
    return dest
end
## generalized blocks of heterogeneous memory
# GPU kernel: copy one element per invocation between Cartesian sub-blocks.
# The linear thread index is mapped into `shape`, then shifted by each side's
# offset to address the actual source and destination positions.
function cartesian_copy_kernel!(ctx::AbstractKernelContext, dest, dest_offsets, src, src_offsets, shape, nelem)
    tid = linear_index(ctx)
    tid > nelem && return   # extra threads past the block do nothing
    cart = CartesianIndices(shape)[tid]
    @inbounds dest[cart + dest_offsets] = src[cart + src_offsets]
    return
end
# Copy a Cartesian sub-block between GPU arrays of equal dimensionality using
# a one-thread-per-element kernel.
function Base.copyto!(dest::AnyGPUArray{<:Any, N}, destcrange::CartesianIndices{N},
                      src::AnyGPUArray{<:Any, N}, srccrange::CartesianIndices{N}) where {N}
    shape = size(destcrange)
    shape == size(srccrange) ||
        throw(ArgumentError("Ranges don't match their size. Found: $shape, $(size(srccrange))"))
    len = length(destcrange)
    len == 0 && return dest
    # offsets shift the kernel's 1-based block index to each range's origin
    one_idx = oneunit(CartesianIndex{N})
    dest_offsets = first(destcrange) - one_idx
    src_offsets = first(srccrange) - one_idx
    gpu_call(cartesian_copy_kernel!,
             dest, dest_offsets, src, src_offsets, shape, len;
             elements=len)
    return dest
end
# Generate Cartesian-range `copyto!` methods between CPU and GPU arrays (both
# directions). Since each element cannot be touched individually across the
# host/device boundary, the block is decomposed into contiguous linear runs
# that are copied with the backend's linear `copyto!`.
for (dstTyp, srcTyp) in (AbstractGPUArray=>Array, Array=>AbstractGPUArray)
    @eval function Base.copyto!(dst::$dstTyp{T,N}, dstrange::CartesianIndices{N},
                                src::$srcTyp{T,N}, srcrange::CartesianIndices{N}) where {T,N}
        isempty(dstrange) && return dst
        if size(dstrange) != size(srcrange)
            throw(ArgumentError("source and destination must have same size (got $(size(srcrange)) and $(size(dstrange)))"))
        end

        # figure out how many dimensions of the Cartesian ranges map onto contiguous memory
        # in both source and destination. we will copy these one by one as linear ranges.
        contiguous_dims = 1
        for dim in 2:N
            # a slice is broken up if the previous dimension didn't cover the entire range
            if axes(src, dim-1) == axes(srcrange, dim-1) &&
               axes(dst, dim-1) == axes(dstrange, dim-1)
                contiguous_dims = dim
            else
                break
            end
        end

        m = prod(size(dstrange)[1:contiguous_dims])       # inner, contiguous length
        n = prod(size(dstrange)[contiguous_dims+1:end])   # outer non-contiguous length
        @assert m*n == length(srcrange) == length(dstrange)

        # copy linear slices: each slice starts `m` elements after the previous one,
        # and its linear offsets in src/dst are recovered from the Cartesian ranges
        for i in 1:m:m*n
            srcoff = LinearIndices(src)[srcrange[i]]
            dstoff = LinearIndices(dst)[dstrange[i]]
            # TODO: Use asynchronous memory copies
            copyto!(dst, dstoff, src, srcoff, m)
        end

        dst
    end
end
## other
# Shallow copy must be provided by each concrete backend; this stub makes a
# missing implementation fail loudly. `# COV_EXCL_LINE` excludes it from coverage.
Base.copy(x::AbstractGPUArray) = error("Not implemented") # COV_EXCL_LINE

# GPU arrays store flat bits-type data, so the backend's `copy` already yields
# a fully independent object — `deepcopy` need not recurse further.
Base.deepcopy(x::AbstractGPUArray) = copy(x)
# filtering
# TODO: filter!
# revert of JuliaLang/julia#31929
Base.filter(f, As::AbstractGPUArray) = As[map(f, As)::AbstractGPUArray{Bool}]