From 04d7ababff4ba949538af2da697bab897d80e3d3 Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Tue, 6 Apr 2021 14:29:31 -0500 Subject: [PATCH 01/11] Support multiple GPU backends --- src/Flux.jl | 1 + src/functor.jl | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Flux.jl b/src/Flux.jl index 5e6776d601..a36a341c89 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -32,6 +32,7 @@ export Descent, ADAM, Momentum, Nesterov, RMSProp, using CUDA const use_cuda = Ref(false) +const default_gpu_converter = Ref{Function}(identity) include("utils.jl") include("zeros.jl") diff --git a/src/functor.jl b/src/functor.jl index afda1f5b84..c07907635e 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -66,7 +66,7 @@ end cpu(m) = fmap(x -> adapt(Array, x), m) -gpu(x) = use_cuda[] ? fmap(CUDA.cu, x) : x +gpu(x) = fmap(default_gpu_converter[], x) # Precision From 3080aa85cd29edf4e78a0441facee7613a7b8b01 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 29 Apr 2021 12:46:26 +0200 Subject: [PATCH 02/11] Prepare to move CUDA-specific functionality to separate package. --- Manifest.toml | 68 ----------- Project.toml | 6 +- src/Flux.jl | 14 +-- src/cuda/cuda.jl | 12 -- src/cuda/cudnn.jl | 20 --- src/losses/Losses.jl | 2 - src/losses/ctc-gpu.jl | 232 ----------------------------------- src/onehot.jl | 4 - test/ctc-gpu.jl | 27 ++--- test/cuda/cuda.jl | 65 ---------- test/cuda/cudnn.jl | 44 ------- test/cuda/curnn.jl | 58 --------- test/cuda/layers.jl | 261 ---------------------------------------- test/cuda/losses.jl | 38 ------ test/cuda/runtests.jl | 19 --- test/cuda/test_utils.jl | 72 ----------- test/runtests.jl | 12 +- 17 files changed, 18 insertions(+), 936 deletions(-) delete mode 100644 src/cuda/cuda.jl delete mode 100644 src/cuda/cudnn.jl delete mode 100644 src/losses/ctc-gpu.jl delete mode 100644 test/cuda/cuda.jl delete mode 100644 test/cuda/cudnn.jl delete mode 100644 test/cuda/curnn.jl delete mode 100644 test/cuda/layers.jl delete mode 100644 test/cuda/losses.jl delete mode 100644 test/cuda/runtests.jl delete mode 100644 test/cuda/test_utils.jl diff --git a/Manifest.toml b/Manifest.toml index 455ea5bddf..4aad025690 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -23,26 +23,9 @@ uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" [[Artifacts]] uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" -[[BFloat16s]] -deps = ["LinearAlgebra", "Test"] -git-tree-sha1 = "4af69e205efc343068dc8722b8dfec1ade89254a" -uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" -version = "0.1.0" - [[Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" -[[CEnum]] -git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" -uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" -version = "0.4.1" - -[[CUDA]] -deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "DataStructures", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "MacroTools", "Memoize", "Printf", "Random", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "Statistics", "TimerOutputs"] -git-tree-sha1 = "a6ce96dcf22fc4f1bfdfac02d54f0b77ecf2a4cc" -uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" -version = "3.0.3" - [[ChainRules]] deps = ["ChainRulesCore", "Compat", "LinearAlgebra", "Random", "Reexport", "Requires", "Statistics"] git-tree-sha1 = "1f410fba5c04d03ab712f348f1542e6059376547" @@ -128,11 +111,6 @@ uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" deps = ["ArgTools", "LibCURL", "NetworkOptions"] uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" 
-[[ExprTools]] -git-tree-sha1 = "10407a39b87f29d47ebaca8edbc75d7c302ff93e" -uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" -version = "0.1.3" - [[FillArrays]] deps = ["LinearAlgebra", "Random", "SparseArrays"] git-tree-sha1 = "31939159aeb8ffad1d4d8ee44d07f8558273120a" @@ -163,12 +141,6 @@ git-tree-sha1 = "9c95b2fd5c16bc7f97371e9f92f0fef77e0f5957" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" version = "6.2.2" -[[GPUCompiler]] -deps = ["DataStructures", "ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "Serialization", "TimerOutputs", "UUIDs"] -git-tree-sha1 = "6eadd2321dc3ac0fc9d530ab01c2caa7fe5d74c6" -uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" -version = "0.11.4" - [[IRTools]] deps = ["InteractiveUtils", "MacroTools", "Test"] git-tree-sha1 = "c67e7515a11f726f44083e74f218d134396d6510" @@ -191,16 +163,6 @@ git-tree-sha1 = "07cb43290a840908a771552911a6274bc6c072c7" uuid = "e5e0dc1b-0480-54bc-9374-aad01c23163d" version = "0.8.4" -[[LLVM]] -deps = ["CEnum", "Libdl", "Printf", "Unicode"] -git-tree-sha1 = "b616937c31337576360cb9fb872ec7633af7b194" -uuid = "929cbde3-209d-540e-8aea-75f648917ca0" -version = "3.6.0" - -[[LazyArtifacts]] -deps = ["Artifacts", "Pkg"] -uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" - [[LibCURL]] deps = ["LibCURL_jll", "MozillaCACerts_jll"] uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" @@ -247,12 +209,6 @@ git-tree-sha1 = "75a54abd10709c01f1b86b84ec225d26e840ed58" uuid = "e89f7d12-3494-54d1-8411-f7d8b9ae1f27" version = "0.5.0" -[[Memoize]] -deps = ["MacroTools"] -git-tree-sha1 = "2b1dfcba103de714d31c033b5dacc2e4a12c7caa" -uuid = "c03570c3-d221-55d1-a50c-7939bbd78826" -version = "0.4.4" - [[Missings]] deps = ["DataAPI"] git-tree-sha1 = "f8c673ccc215eb50fcadb285f522420e29e69e1c" @@ -271,12 +227,6 @@ git-tree-sha1 = "80b8360670f445d88b3475e88b33bbcc92f7866e" uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" version = "0.7.19" -[[NNlibCUDA]] -deps = ["CUDA", "LinearAlgebra", "NNlib", "Random", "Statistics"] -git-tree-sha1 = "4b368b466bcdd25d448a5b20de4b7e481d68b88e" -uuid = "a00861dc-f156-4864-bf3c-e6376f28a68d" -version = "0.1.0" - [[NaNMath]] git-tree-sha1 = "bfe47e760d60b82b66b61d2d44128b62e3a369fb" uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3" @@ -322,12 +272,6 @@ uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" deps = ["Serialization"] uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -[[RandomNumbers]] -deps = ["Random", "Requires"] -git-tree-sha1 = "441e6fc35597524ada7f85e13df1f4e10137d16f" -uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" -version = "1.4.0" - [[Reexport]] git-tree-sha1 = "57d8440b0c7d98fc4f889e478e80f268d534c9d5" uuid = "189a3867-3050-52da-a836-e630ba90ab69" @@ -342,12 +286,6 @@ version = "1.1.3" [[SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" -[[Scratch]] -deps = ["Dates"] -git-tree-sha1 = "ad4b278adb62d185bbcb6864dc24959ab0627bf6" -uuid = "6c6a2e73-6563-6170-7368-637461726353" -version = "1.0.3" - [[Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" @@ -402,12 +340,6 @@ uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" -[[TimerOutputs]] -deps = ["Printf"] -git-tree-sha1 = "32cdbe6cd2d214c25a0b88f985c9e0092877c236" -uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" -version = "0.5.8" - [[TranscodingStreams]] deps = ["Random", "Test"] git-tree-sha1 = "7c53c35547de1c5b9d46a4797cf6d8253807108c" diff --git a/Project.toml b/Project.toml index 0a106defa5..89ada44fe0 100644 --- a/Project.toml +++ b/Project.toml @@ -5,17 +5,15 
@@ version = "0.12.3" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" Colors = "5ae59095-9a9b-59fe-a467-6f913c188581" DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" +GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" Juno = "e5e0dc1b-0480-54bc-9374-aad01c23163d" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" -NNlibCUDA = "a00861dc-f156-4864-bf3c-e6376f28a68d" -Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" @@ -29,14 +27,12 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] AbstractTrees = "0.3" Adapt = "3.0" -CUDA = "3" CodecZlib = "0.7" Colors = "0.12" Functors = "0.2.1" Juno = "0.8" MacroTools = "0.5" NNlib = "0.7.14" -NNlibCUDA = "0.1" Reexport = "0.2, 1.0" StatsBase = "0.33" ZipFile = "0.9" diff --git a/src/Flux.jl b/src/Flux.jl index a36a341c89..b75f7700d7 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -8,6 +8,7 @@ using Zygote, MacroTools, Juno, Reexport using MacroTools: @forward @reexport using NNlib using Zygote: Params, @adjoint, gradient, pullback, @nograd +using GPUArrays export gradient @@ -30,8 +31,6 @@ export Descent, ADAM, Momentum, Nesterov, RMSProp, WeightDecay, ClipValue, ClipNorm -using CUDA -const use_cuda = Ref(false) const default_gpu_converter = Ref{Function}(identity) include("utils.jl") @@ -55,15 +54,4 @@ using .Losses # TODO: stop importing Losses in Flux's namespace in v0.12 include("deprecations.jl") -include("cuda/cuda.jl") - -function __init__() - use_cuda[] = CUDA.functional() # Can be overridden after load with `Flux.use_cuda[] = false` - if CUDA.functional() - if !CUDA.has_cudnn() - @warn "CUDA.jl found cuda, but did not find libcudnn. Some functionality will not be available." - end - end -end - end # module diff --git a/src/cuda/cuda.jl b/src/cuda/cuda.jl deleted file mode 100644 index 0ef3c65308..0000000000 --- a/src/cuda/cuda.jl +++ /dev/null @@ -1,12 +0,0 @@ -module CUDAint - -using ..CUDA - -import ..Flux: Flux -import Zygote -using Zygote: @adjoint -import NNlib, NNlibCUDA - -include("cudnn.jl") - -end diff --git a/src/cuda/cudnn.jl b/src/cuda/cudnn.jl deleted file mode 100644 index 4a3b2618c8..0000000000 --- a/src/cuda/cudnn.jl +++ /dev/null @@ -1,20 +0,0 @@ -import NNlibCUDA: batchnorm, ∇batchnorm - -function (BN::Flux.BatchNorm)(x::Union{CuArray{T,2},CuArray{T,4},CuArray{T,5}}, - cache=nothing) where T<:Union{Float32, Float64} - - @assert BN.affine "BatchNorm: only affine=true supported on gpu" - @assert BN.track_stats "BatchNorm: only track_stats=true supported on gpu" - @assert length(BN.β) == size(x, ndims(x)-1) "BatchNorm: input has wronng number of channels" - return BN.λ.(batchnorm(BN.γ, BN.β, x, BN.μ, BN.σ², BN.momentum; - cache=cache, alpha=1, beta=0, eps=BN.ϵ, - training=Flux._isactive(BN))) -end - -@adjoint function batchnorm(g, b, x, running_mean, running_var, momentum; kw...) - y = batchnorm(g, b, x, running_mean, running_var, momentum; kw...) 
- function batchnorm_pullback(Δ) - ∇batchnorm(g, b, x, Δ, running_mean, running_var, momentum; kw...)..., nothing, nothing, nothing - end - y, batchnorm_pullback -end diff --git a/src/losses/Losses.jl b/src/losses/Losses.jl index bf944f9231..5736e3201a 100644 --- a/src/losses/Losses.jl +++ b/src/losses/Losses.jl @@ -4,7 +4,6 @@ using Statistics using Zygote using Zygote: @adjoint using ..Flux: ofeltype, epseltype -using CUDA using NNlib: logsoftmax, logσ import Base.Broadcast: broadcasted @@ -24,6 +23,5 @@ export mse, mae, msle, include("utils.jl") include("functions.jl") include("ctc.jl") -if CUDA.functional() include("ctc-gpu.jl") end end #module diff --git a/src/losses/ctc-gpu.jl b/src/losses/ctc-gpu.jl deleted file mode 100644 index b856215b61..0000000000 --- a/src/losses/ctc-gpu.jl +++ /dev/null @@ -1,232 +0,0 @@ -# GPU implementation - -# a port of the GPU kernels from Baidu's C++ warp-ctc package, -# which itself is Copyright 2015-2016 Baidu USA LLC -# and available under the Apache 2.0 license -# -# Apache 2.0 license: https://www.apache.org/licenses/LICENSE-2.0 -# GitHub: https://github.com/baidu-research/warp-ctc/ -# paper: https://arxiv.org/pdf/1512.02595.pdf - -using Flux -using Statistics -using CUDA -using NNlib - -const MAX_THREADS = 256 - -function log_plus_f(p1, p2) - isinf(p1) && return p2 - isinf(p2) && return p1 - if p1 < p2 - p1, p2 = p2, p1 - end - return p1 + log(1+exp(p2 - p1)) -end - -function count_repeats(A) - repeats = 0 - for (i,elem) in enumerate(A) - if i > 1 && A[i] == A[i-1] - repeats += 1 - end - end - return repeats -end - -function compute_alpha_kernel(probs, labelSize, uttLength, repeats, labelsWithoutBlanks, labelsWithBlanks, alpha, blankLabel) - - tid = threadIdx().x - L = labelSize - T = uttLength - S = length(labelsWithBlanks) - - if L + repeats > T - return nothing - end - labels = labelsWithBlanks - - # Corner-case checking - start = (L + repeats <= T) ? 0 : 1 - last = S > 1 ? 2 : 1 - - # Fill in first column (time step) - i = tid - while i <= last - start - alpha[start+i, 1] = probs[labels[start+i], 1] - i += blockDim().x - end - sync_threads() - - # Fill in coefficients for each time step - for t=2:T - # Corner-case checking - if tid == 1 && !(1 < S - 2*(T-t) - 1) - if start == 0 - alpha[1, t] = alpha[1, t-1] + probs[blankLabel, t] - elseif start == 1 - alpha[1, t] = alpha[1, t-1] - end - end - sync_threads() - - # Fill in coefficients for each label class in the target output sequence; - # each thread will process the calculations for one class - idx = tid+1 - while idx <= S - prevSum = log_plus_f(alpha[idx, t-1], alpha[idx-1, t-1]) - if labels[idx] != blankLabel && idx != 2 && labels[idx] != labels[idx-2] - prevSum = log_plus_f(prevSum, alpha[idx-2, t-1]) - end - if idx < S - 2*(T-t) - 1 - alpha[idx, t] = -Inf32 - else - alpha[idx, t] = prevSum + probs[labels[idx], t] - end - idx += blockDim().x - end - sync_threads() - end - return nothing -end - -function compute_beta_and_grad_kernel(probs, labelSize, uttLength, - repeatsInLabel, labelsWithBlanks, - alphas, beta, output, accum, - grad, blankLabel, loss) - - tid = threadIdx().x - L = labelSize - T = uttLength - S = 2*L + 1 - repeats = repeatsInLabel - labels = labelsWithBlanks - - if (L+repeats) > T - return nothing - end - - # Corner-case checking - start = S > 1 ? S-2 : 0 - last = L + repeats < T ? 
S : S-1 - sync_threads() - i = tid - - # Calculate coefficients for last column (time step) - # then determine alpha and beta product - while i <= last - start - beta[i+start, T] = 0 - output[i+start, T] = beta[i+start, T] + alphas[i+start, T] - i += blockDim().x - end - sync_threads() - - # Fill in `accum` for last column (time step) - if tid == 1 - for i=1:S - labelIdx = labels[i] - accum[labelIdx, T] = log_plus_f(accum[labelIdx, T], output[i, T]) - end - end - sync_threads() - - # Fill in `grad` for last column (time step) - idx = tid - while idx <= size(grad, 1) - s = -Inf32 - for i=1:S - s = log_plus_f(s, output[i, T]) - end - - # ∂L/∂a (where a is activation before logsoftmax) - grad[idx, T] = exp(probs[idx, T]) - exp(accum[idx, T] - s) - idx += blockDim().x - end - sync_threads() - - # Fill in the rest of the coefficients - t = T-1 - while t >= 1 - if t < T - idx = tid - while idx <= S - nextSum = probs[labels[idx], t+1] + beta[idx, t+1] - if idx < S - nextSum = log_plus_f(nextSum, - probs[labels[idx+1], t+1] + beta[idx+1, t+1]) - end - if labels[idx] != blankLabel && idx != S-1 && labels[idx] != labels[idx+2] - nextSum = log_plus_f(nextSum, - probs[labels[idx+2], t+1] + beta[idx + 2, t+1]) - end - if idx > 2*t - beta[idx, t] = -Inf32 - else - beta[idx, t] = nextSum - end - idx += blockDim().x - end - sync_threads() - idx = tid - while idx <= S - output[idx, t] = alphas[idx, t] + beta[idx, t] - idx += blockDim().x - end - sync_threads() - end - sync_threads() - - # Calculate accumulated alpha-beta products for each label class for - # each time step; used in calculating gradients - if tid == 1 - for i=1:S - labelIdx = labels[i] - accum[labelIdx, t] = log_plus_f(accum[labelIdx, t], output[i, t]) - end - end - sync_threads() - idx = tid - - # Calculate gradients - while idx <= size(grad, 1) - - # ∂L/∂a (where a is activation before logsoftmax) - grad[idx, t] = exp(probs[idx, t]) - exp(accum[idx, t] + loss) - idx += blockDim().x - end - sync_threads() - t -= 1 - sync_threads() - end - return nothing -end - -function ctc_alpha(ŷ::CuArray, y) - ŷ = logsoftmax(ŷ) - blank = size(ŷ, 1) - z′ = fill(blank, 2 * length(y) + 1) - z′[eachindex(y) .* 2] = y - T = size(ŷ, 2) - U′ = 2*length(y) + 1 - alphas = CUDA.fill(log(zero(ŷ[1])), U′,T) - nRepeats = count_repeats(y) - nThreads = min(U′, MAX_THREADS) - @cuda blocks=1 threads=nThreads compute_alpha_kernel(ŷ, length(y), T, nRepeats, CuArray(y), CuArray(z′), alphas, blank) - return (loss=-1 * logsumexp(alphas[end-1:end]), alpha=alphas, z′=z′, yhat=ŷ, nRepeats=nRepeats) -end - -ctc_loss(ŷ::CuArray, y) = ctc_alpha(ŷ::CuArray, y).loss - -function ∇ctc_loss(ŷ::CuArray, y, out) - loss, alphas, z′, ŷ, nRepeats = out - U′, T = size(alphas) - blank = size(ŷ, 1) - typed_zero = zero(first(ŷ)) - betas = CUDA.fill(log(typed_zero), U′, T) - output = CUDA.fill(log(typed_zero), U′, T) - nThreads = min(U′, MAX_THREADS) - grads = CUDA.fill(log(typed_zero), size(ŷ)) - accum = CUDA.fill(log(typed_zero), size(ŷ)) - @cuda blocks=1 threads=nThreads compute_beta_and_grad_kernel(ŷ, length(y), T, nRepeats, CuArray(z′), alphas, betas, output, accum, grads, blank, loss) - return grads -end diff --git a/src/onehot.jl b/src/onehot.jl index b879e5cda2..edc5fc5f8b 100644 --- a/src/onehot.jl +++ b/src/onehot.jl @@ -1,5 +1,4 @@ import Adapt -import .CUDA struct OneHotArray{T<:Integer, L, N, var"N+1", I<:Union{T, AbstractArray{T, N}}} <: AbstractArray{Bool, var"N+1"} indices::I @@ -62,7 +61,6 @@ Base.getindex(x::OneHotArray{<:Any, <:Any, <:Any, N}, ::Vararg{Colon, N}) where 
Base.getindex(x::OneHotArray, I::CartesianIndex{N}) where N = x[I[1], Tuple(I)[2:N]...] _onehot_bool_type(x::OneHotLike{<:Any, <:Any, <:Any, N, <:Union{Integer, AbstractArray}}) where N = Array{Bool, N} -_onehot_bool_type(x::OneHotLike{<:Any, <:Any, <:Any, N, <:CuArray}) where N = CuArray{Bool, N} function Base.cat(x::OneHotLike{<:Any, L}, xs::OneHotLike{<:Any, L}...; dims::Int) where L if isone(dims) || any(x -> !_isonehot(x), (x, xs...)) @@ -79,8 +77,6 @@ batch(xs::AbstractArray{<:OneHotVector{<:Any, L}}) where L = OneHotArray(_indice Adapt.adapt_structure(T, x::OneHotArray{<:Any, L}) where L = OneHotArray(adapt(T, _indices(x)), L) -Base.BroadcastStyle(::Type{<:OneHotArray{<: Any, <: Any, <: Any, N, <: CuArray}}) where N = CUDA.CuArrayStyle{N}() - Base.argmax(x::OneHotLike; dims = Colon()) = (_isonehot(x) && dims == 1) ? reshape(CartesianIndex.(_indices(x), CartesianIndices(_indices(x))), 1, size(_indices(x))...) : diff --git a/test/ctc-gpu.jl b/test/ctc-gpu.jl index d7ff1bdf9d..37d6ab3134 100644 --- a/test/ctc-gpu.jl +++ b/test/ctc-gpu.jl @@ -3,7 +3,6 @@ using Flux using Flux.Losses: ctc_loss using Zygote: gradient using LinearAlgebra -using CUDA # Custom function to check numerical gradient of ctc loss, # based on `ngradient` in `Tracker.jl` @@ -26,31 +25,31 @@ end @testset "ctc-gpu" begin x = rand(10, 50) y = rand(1:9, 30) - x_cu = CuArray(x) - g1 = gradient(ctc_loss, x_cu, y)[1] + x_gpu = gpu(x) + g1 = gradient(ctc_loss, x_gpu, y)[1] g1 = g1 |> collect g2 = ctc_ngradient(x, y) @test g1 ≈ g2 rtol=1e-5 atol=1e-5 - + # test that GPU loss matches CPU implementation - l1 = ctc_loss(x_cu, y) + l1 = ctc_loss(x_gpu, y) l2 = ctc_loss(x, y) @test l1 ≈ l2 - + # tests using hand-calculated values - x_cu = [1. 2. 3.; 2. 1. 1.; 3. 3. 2.] |> CuArray + x_gpu = [1. 2. 3.; 2. 1. 1.; 3. 3. 2.] |> gpu y = [1, 2] - @test ctc_loss(x_cu, y) ≈ 3.6990738275138035 - + @test ctc_loss(x_gpu, y) ≈ 3.6990738275138035 + g = [-0.317671 -0.427729 0.665241; 0.244728 -0.0196172 -0.829811; 0.0729422 0.447346 0.16457] - ghat = gradient(ctc_loss, x_cu, y)[1] |> collect + ghat = gradient(ctc_loss, x_gpu, y)[1] |> collect @test g ≈ ghat rtol=1e-5 atol=1e-5 - x_cu = [-3. 12. 8. 15.; 4. 20. -2. 20.; 8. -33. 6. 5.] |> CuArray - y = [1, 2] |> CuArray - @test ctc_loss(x_cu, y) ≈ 8.02519869363453 + x_gpu = [-3. 12. 8. 15.; 4. 20. -2. 20.; 8. -33. 6. 5.] 
|> gpu + y = [1, 2] |> gpu + @test ctc_loss(x_gpu, y) ≈ 8.02519869363453 g = [-2.29294774655333e-06 -0.999662657278862 1.75500863563993e-06 0.00669284889063; 0.017985914969696 0.999662657278861 -1.9907078755387e-06 -0.006693150917307; -0.01798362202195 -2.52019580677916e-20 2.35699239251042e-07 3.02026677058789e-07] - ghat = gradient(ctc_loss, x_cu, y)[1] |> collect + ghat = gradient(ctc_loss, x_gpu, y)[1] |> collect @test g ≈ ghat rtol=1e-5 atol=1e-5 end diff --git a/test/cuda/cuda.jl b/test/cuda/cuda.jl deleted file mode 100644 index 92c04404f8..0000000000 --- a/test/cuda/cuda.jl +++ /dev/null @@ -1,65 +0,0 @@ -using Flux, Test -using Flux.CUDA -using Flux: cpu, gpu -using Statistics: mean -using LinearAlgebra: I, cholesky, Cholesky - -@testset "CUDA" begin - x = randn(5, 5) - cx = gpu(x) - @test cx isa CuArray - - @test Flux.onecold(gpu([1.0, 2.0, 3.0])) == 3 - - x = Flux.onehotbatch([1, 2, 3], 1:3) - cx = gpu(x) - @test cx isa Flux.OneHotMatrix && cx.indices isa CuArray - @test (cx .+ 1) isa CuArray - - m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax) - cm = gpu(m) - - @test all(p isa CuArray for p in params(cm)) - @test cm(gpu(rand(10, 10))) isa CuArray{Float32,2} - - xs = rand(5, 5) - ys = Flux.onehotbatch(1:5,1:5) - @test collect(cu(xs) .+ cu(ys)) ≈ collect(xs .+ ys) - - c = gpu(Conv((2,2),3=>4)) - x = gpu(rand(10, 10, 3, 2)) - l = c(gpu(rand(10,10,3,2))) - @test gradient(x -> sum(c(x)), x)[1] isa CuArray - - c = gpu(CrossCor((2,2),3=>4)) - x = gpu(rand(10, 10, 3, 2)) - l = c(gpu(rand(10,10,3,2))) - @test gradient(x -> sum(c(x)), x)[1] isa CuArray - -end - -@testset "onecold gpu" begin - y = Flux.onehotbatch(ones(3), 1:10) |> gpu; - l = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'] - @test Flux.onecold(y) isa CuArray - @test y[3,:] isa CuArray - @test Flux.onecold(y, l) == ['a', 'a', 'a'] -end - -@testset "restructure gpu" begin - dudt = Dense(1,1) |> gpu - p,re = Flux.destructure(dudt) - foo(x) = sum(re(p)(x)) - @test gradient(foo, cu(rand(1)))[1] isa CuArray -end - -@testset "GPU functors" begin - @testset "Cholesky" begin - M = 2.0*I(10) |> collect - Q = cholesky(M) - Q_gpu = Q |> gpu - @test Q_gpu isa Cholesky{<:Any,<:CuArray} - Q_cpu = Q_gpu |> cpu - @test Q_cpu == cholesky(eltype(Q_gpu).(M)) - end -end diff --git a/test/cuda/cudnn.jl b/test/cuda/cudnn.jl deleted file mode 100644 index 5d1727e984..0000000000 --- a/test/cuda/cudnn.jl +++ /dev/null @@ -1,44 +0,0 @@ -using Flux, CUDA, Test -using Flux: pullback - -@testset "CUDNN BatchNorm" begin - @testset "4D Input" begin - x = rand(Float32, 2, 2, 3, 4) - m = BatchNorm(3) - gx = gpu(x) - gm = gpu(m) - - y, back = pullback((m, x) -> m(x), m, x) - gy, gback = pullback((m, x) -> m(x), gm, gx) - - @test cpu(gy) ≈ y - - Δ = randn(Float32, size(y)) - dm, dx = back(Δ) - gdm, gdx = gback(gpu(Δ)) - - @test dm[].γ ≈ cpu(gdm[].γ) - @test dm[].β ≈ cpu(gdm[].β) - @test dx ≈ cpu(gdx) - end - - @testset "2D Input" begin - x = rand(Float32, 3, 4) - m = BatchNorm(3) - gx = gpu(x) - gm = gpu(m) - - y, back = pullback((m, x) -> m(x), m, x) - gy, gback = pullback((m, x) -> m(x), gm, gx) - - @test cpu(gy) ≈ y - - Δ = randn(Float32, size(y)) - dm, dx = back(Δ) - gdm, gdx = gback(gpu(Δ)) - - @test dm[].γ ≈ cpu(gdm[].γ) - @test dm[].β ≈ cpu(gdm[].β) - @test dx ≈ cpu(gdx) - end -end diff --git a/test/cuda/curnn.jl b/test/cuda/curnn.jl deleted file mode 100644 index 63a5f93ada..0000000000 --- a/test/cuda/curnn.jl +++ /dev/null @@ -1,58 +0,0 @@ -using Flux, CUDA, Test - -@testset for R in [RNN, GRU, LSTM] - m = R(10, 5) |> gpu - x = gpu(rand(10)) 
- (m̄,) = gradient(m -> sum(m(x)), m) - Flux.reset!(m) - θ = gradient(() -> sum(m(x)), params(m)) - @test x isa CuArray - @test θ[m.cell.Wi] isa CuArray - @test collect(m̄[].cell.Wi) == collect(θ[m.cell.Wi]) -end - -@testset "RNN" begin - @testset for R in [RNN, GRU, LSTM], batch_size in (1, 5) - rnn = R(10, 5) - curnn = fmap(gpu, rnn) - - Flux.reset!(rnn) - Flux.reset!(curnn) - x = batch_size == 1 ? - rand(Float32, 10) : - rand(Float32, 10, batch_size) - cux = gpu(x) - - y, back = pullback((r, x) -> r(x), rnn, x) - cuy, cuback = pullback((r, x) -> r(x), curnn, cux) - - @test y ≈ collect(cuy) - - ȳ = randn(size(y)) - m̄, x̄ = back(ȳ) - cum̄, cux̄ = cuback(gpu(ȳ)) - - @test x̄ ≈ collect(cux̄) - @test m̄[].cell.Wi ≈ collect(cum̄[].cell.Wi) - @test m̄[].cell.Wh ≈ collect(cum̄[].cell.Wh) - @test m̄[].cell.b ≈ collect(cum̄[].cell.b) - if m̄[].state isa Tuple - for (x, cx) in zip(m̄[].state, cum̄[].state) - @test x ≈ collect(cx) - end - else - @test m̄[].state ≈ collect(cum̄[].state) - end - - Flux.reset!(rnn) - Flux.reset!(curnn) - ohx = batch_size == 1 ? - Flux.onehot(rand(1:10), 1:10) : - Flux.onehotbatch(rand(1:10, batch_size), 1:10) - cuohx = gpu(ohx) - y = (rnn(ohx); rnn(ohx)) - - cuy = (curnn(cuohx); curnn(cuohx)) - @test y ≈ collect(cuy) - end -end diff --git a/test/cuda/layers.jl b/test/cuda/layers.jl deleted file mode 100644 index 0a4034303d..0000000000 --- a/test/cuda/layers.jl +++ /dev/null @@ -1,261 +0,0 @@ -# Test layers and data/model movements on and off the GPU -# Add tests for layers and their gradients on the GPU -# Most of the forward passes should be fine being applied -# to bitstype objects, but this gives higher coverage for our use-cases -# Check that getting the gradients does not throw - -# generic movement tests -@testset "Basic GPU Movement" begin - @test gradient(x -> sum(gpu(x)), rand(3,3)) isa Tuple - @test gradient(x -> sum(cpu(x)), gpu(rand(3,3))) isa Tuple -end - -# TODO: These layers get into scalar indexing -# `AlphaDropout` throws a compilation error on GPUs, -# whereas, the rest are scalar indexing issues. -# The norm layers behave differently on the CPU and -# the GPU too. -const BROKEN_LAYERS = Union{DepthwiseConv, - AlphaDropout} - -const ACTIVATIONS = [identity, relu, tanh, - sigmoid, exp, softplus, - elu, selu] - -function gpu_gradtest(name::String, layers::Vector, x_cpu = nothing, args...; test_cpu = true) - isnothing(x_cpu) && error("Missing input to test the layers against.") - @testset "$name GPU grad tests" begin - for layer in layers - @testset "$layer Layer GPU grad test" begin - - # compute output and grad of parameters - l_cpu = layer(args...) 
- ps_cpu = Flux.params(l_cpu) - y_cpu, back_cpu = pullback(() -> sum(l_cpu(x_cpu)), ps_cpu) - gs_cpu = back_cpu(1f0) - - x_gpu = gpu(x_cpu) - l_gpu = l_cpu |> gpu - ps_gpu = Flux.params(l_gpu) - - if typeof(l_gpu) <: BROKEN_LAYERS - @test_broken gradient(() -> sum(l_gpu(x_gpu)), ps_gpu) isa Flux.Zygote.Grads - else - y_gpu, back_gpu = pullback(() -> sum(l_gpu(x_gpu)), ps_gpu) - gs_gpu = back_gpu(1f0) # TODO many layers error out when backprop int 1, should fix - - # compute grad of input - xg_cpu = gradient(x -> sum(l_cpu(x)), x_cpu)[1] - xg_gpu = gradient(x -> sum(l_gpu(x)), x_gpu)[1] - - # test - if test_cpu - @test y_gpu ≈ y_cpu rtol=1f-3 atol=1f-3 - @test Array(xg_gpu) ≈ xg_cpu rtol=1f-3 atol=1f-3 - end - @test gs_gpu isa Flux.Zygote.Grads - for (p_cpu, p_gpu) in zip(ps_cpu, ps_gpu) - @test gs_gpu[p_gpu] isa Flux.CUDA.CuArray - if test_cpu - @test Array(gs_gpu[p_gpu]) ≈ gs_cpu[p_cpu] rtol=1f-3 atol=1f-3 - end - end - end - end - end - end -end - -# Just to give testset in gpu_gradtest meaningful labels -ConvNoBias(args...) = Conv(args...; bias = false) -ConvTransposeNoBias(args...) = ConvTranspose(args...; bias = false) -CrossCorNoBias(args...) = CrossCor(args...; bias = false) -DepthwiseConvNoBias(args...) = DepthwiseConv(args...; bias = false) - -for act in ACTIVATIONS - r = rand(Float32, 28, 28, 1, 1) - conv_layers = [Conv, ConvNoBias, - ConvTranspose, ConvTransposeNoBias, - CrossCor, CrossCorNoBias, - DepthwiseConv, DepthwiseConvNoBias] - gpu_gradtest("Convolution with $act", conv_layers, r, (2,2), 1=>3, act, test_cpu = false) - - batch_norm = [BatchNorm] - gpu_gradtest("BatchNorm 1 with $act", batch_norm, rand(Float32, 28,28,3,4), 3, act, test_cpu = false) #TODO fix errors - gpu_gradtest("BatchNorm 2 with $act", batch_norm, rand(Float32, 5,4), 5, act, test_cpu = false) - - instancenorm = [InstanceNorm] - gpu_gradtest("InstanceNorm with $act", instancenorm, r, 1, act, test_cpu = false) - - groupnorm = [GroupNorm] - gpu_gradtest("GroupNorm with $act", groupnorm, rand(Float32, 28,28,3,1), 3, 1, act, test_cpu = false) -end - -r = rand(Float32, 28, 28, 1, 1) - -pooling_layers = [MaxPool, MeanPool] -gpu_gradtest("Pooling", pooling_layers, r, (2,2)) - -adaptive_pooling_layers = [AdaptiveMaxPool, AdaptiveMeanPool] -gpu_gradtest("AdaptivePooling", adaptive_pooling_layers, r, (7,7), test_cpu = false) - -dropout_layers = [Dropout, AlphaDropout] -gpu_gradtest("Dropout", dropout_layers, r, 0.5f0; test_cpu = false) # dropout is not deterministic - -layer_norm = [LayerNorm] -gpu_gradtest("LayerNorm 1", layer_norm, rand(Float32, 28,28,3,4), 1, test_cpu = false) #TODO fix errors -gpu_gradtest("LayerNorm 2", layer_norm, rand(Float32, 5,4), 5) - -upsample = [x -> Upsample(scale=x)] -gpu_gradtest("Upsample 2d", upsample, rand(Float32, 3, 4, 2, 3), (2,2)) -gpu_gradtest("Upsample 1d", upsample, rand(Float32, 3, 4, 2, 3), (2,)) - -pixelshuffle = [PixelShuffle] -gpu_gradtest("PixelShuffle 2d", pixelshuffle, rand(Float32, 3, 4, 18, 3), 3) -gpu_gradtest("PixelShuffle 1d", pixelshuffle, rand(Float32, 3, 18, 3), 3) - -@testset "function layers" begin - x = rand(Float32, 3,3) - gpu_autodiff_test(x -> sum(Flux.normalise(x; dims=1)), x) - gpu_autodiff_test(x -> sum(Flux.normalise(x; dims=2)), x) - gpu_autodiff_test(x -> sum(Flux.normalise(x)), x) -end - -@testset "Zeros mapped for $cl" for cl in (Conv, ConvTranspose, CrossCor, DepthwiseConv) - l = cl((2,2), 1=>3, bias = false) |> gpu - ip = zeros(Float32, 28,28,1,1) |> gpu - if typeof(l) <: BROKEN_LAYERS - @test_broken sum(l(ip)) ≈ 0.f0 - @test_broken 
gradient(() -> sum(l(ip)), Flux.params(l)) isa Flux.Zygote.Grads - else - @test sum(l(ip)) ≈ 0.f0 - gs = gradient(() -> sum(l(ip)), Flux.params(l)) - @test l.bias ∉ gs.params - end -end - -@testset "Dense with Zeros bias" begin - l = Dense(ones(Float32, 4,3), Flux.Zeros()) |> gpu - ip = zeros(Float32, 3, 7) |> gpu - - @test sum(l(ip)) ≈ 0.f0 - gs = gradient(() -> sum(l(ip)), Flux.params(l)) - @test l.b ∉ gs.params -end - -@testset "Extended BatchNorm" begin - m_cpu = BatchNorm(2) - m_gpu = m_cpu |> gpu - x_cpu = rand(Float32, 3, 2, 2) - x_gpu = x_cpu |> gpu - - ## In :auto mode, track statistics only in gradient contest - μ_cpu = copy(m_cpu.μ) - m_cpu(x_cpu) - @test m_cpu.μ ≈ μ_cpu - gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) - @test !(m_cpu.μ ≈ μ_cpu) - - μ_gpu = copy(m_gpu.μ) - m_gpu(x_gpu) - @test m_gpu.μ ≈ μ_gpu - gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) - @test !(m_gpu.μ ≈ μ_gpu) - - @test Array(m_gpu.μ) ≈ m_cpu.μ - - ## In testmode, never track statistics - testmode!(m_cpu) - μ_cpu = copy(m_cpu.μ) - m_cpu(x_cpu) - @test m_cpu.μ ≈ μ_cpu - gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) - @test m_cpu.μ ≈ μ_cpu - - testmode!(m_gpu) - μ_gpu = copy(m_gpu.μ) - m_gpu(x_gpu) - @test m_gpu.μ ≈ μ_gpu - gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) - @test m_gpu.μ ≈ μ_gpu - - ## In trainmode, always track statistics - trainmode!(m_cpu) - μ_cpu = copy(m_cpu.μ) - m_cpu(x_cpu) - @test !(m_cpu.μ ≈ μ_cpu) - μ_cpu = copy(m_cpu.μ) - gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) - @test !(m_cpu.μ ≈ μ_cpu) - - trainmode!(m_gpu) - μ_gpu = copy(m_gpu.μ) - m_gpu(x_gpu) - @test !(m_gpu.μ ≈ μ_gpu) - μ_gpu = copy(m_gpu.μ) - gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) - @test !(m_gpu.μ ≈ μ_gpu) - - ## No errors if input type mistmatch - # x_cpu = rand(Float64, 3, 2, 2) - # x_gpu = x_cpu |> gpu - # m_cpu(x_cpu) - # gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) - # m_gpu(x_gpu) - # gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) -end - -@testset "Two-streams Bilinear" begin - x = zeros(Float32,10,9) |> gpu - y = zeros(Float32,2,9) |> gpu - b = Flux.Bilinear(10, 2, 3) |> gpu - @test size(b(x,y)) == (3,9) - @test sum(abs2, b(x,y)) ≈ 0f0 - gs_gpu = gradient(() -> sum(abs2.(b(x, y))), params(b)) - b_cpu, x_cpu, y_cpu = b |> cpu, x |> cpu, y |> cpu - gs_cpu = gradient(() -> sum(abs2.(b_cpu(x_cpu, y_cpu))), params(b_cpu)) - for (pgpu, pcpu) in zip(params(b), params(b_cpu)) - @test gs_cpu[pcpu] ≈ Array(gs_gpu[pgpu]) - end -end - -@testset "Two-streams Bilinear" begin - x = zeros(Float32,10,9) |> gpu - y = zeros(Float32,2,9) |> gpu - b = Flux.Bilinear(10, 2, 3) |> gpu - @test size(b(x,y)) == (3,9) - @test sum(abs2, b(x,y)) ≈ 0f0 - gs_gpu = gradient(() -> sum(abs2.(b(x, y))), params(b)) - b_cpu, x_cpu, y_cpu = b |> cpu, x |> cpu, y |> cpu - gs_cpu = gradient(() -> sum(abs2.(b_cpu(x_cpu, y_cpu))), params(b_cpu)) - for (pgpu, pcpu) in zip(params(b), params(b_cpu)) - @test gs_cpu[pcpu] ≈ Array(gs_gpu[pgpu]) - end -end - -@testset "Parallel" begin - @testset "zero sum" begin - input = randn(10, 10, 10, 10) |> gpu - layer_gpu = Parallel(+, zero, identity) |> gpu - @test layer_gpu(input) == input - @test layer_gpu(input) isa Flux.CUDA.CuArray - end - - @testset "vararg input" begin - inputs = (randn(10), randn(5), randn(4)) .|> gpu - layer = Parallel(+, Dense(10, 2), Dense(5, 2), Dense(4, 2)) |> gpu - @test size(layer(inputs)) == (2,) - end - - @testset "gradient" begin - input_cpu = randn(10, 10, 10, 10) - input_gpu = input_cpu |> gpu - layer_cpu = 
Parallel(+, x -> zero(x), identity) - layer_gpu = layer_cpu |> gpu - gs_cpu = gradient(() -> sum(abs2.(layer_cpu(input_cpu))), params(layer_cpu)) - gs_gpu = gradient(() -> sum(abs2.(layer_gpu(input_gpu))), params(layer_gpu)) - for (pgpu, pcpu) in zip(params(layer_cpu), params(layer_gpu)) - @test gs_cpu[pcpu] ≈ gs_gpu[pgpu] - end - end -end diff --git a/test/cuda/losses.jl b/test/cuda/losses.jl deleted file mode 100644 index a0f7f47d80..0000000000 --- a/test/cuda/losses.jl +++ /dev/null @@ -1,38 +0,0 @@ -using Flux.Losses: crossentropy, binarycrossentropy, logitbinarycrossentropy, binary_focal_loss, focal_loss - - -@testset "Losses" begin - -x = [1.,2.,3.] -cx = gpu(x) -@test crossentropy(x,x) ≈ crossentropy(cx,cx) -@test crossentropy(x,x, agg=identity) ≈ crossentropy(cx,cx, agg=identity) |> cpu -@test crossentropy(x,x, agg=x->mean([1.0;2.0;3.0].*x)) ≈ crossentropy(cx,cx, agg=x->mean(gpu([1.0;2.0;3.0]).*x)) - -x = [-1.1491, 0.8619, 0.3127] -y = [1, 1, 0.] -@test binarycrossentropy(σ.(x), y) ≈ binarycrossentropy(gpu(σ.(x)), gpu(y)) -@test logitbinarycrossentropy(x, y) ≈ logitbinarycrossentropy(gpu(x), gpu(y)) - -x = [0.268941 0.5 0.268941 - 0.731059 0.5 0.731059] -y = [0 1 0 - 1 0 1] -@test binary_focal_loss(x, y) ≈ binary_focal_loss(gpu(x), gpu(y)) - -x = softmax(reshape(-7:7, 3, 5) .* 1f0) -y = [1 0 0 0 1 - 0 1 0 1 0 - 0 0 1 0 0] -@test focal_loss(x, y) ≈ focal_loss(gpu(x), gpu(y)) - -@testset "GPU grad tests" begin - x = rand(Float32, 3,3) - y = rand(Float32, 3,3) - - for loss in ALL_LOSSES - gpu_autodiff_test(loss, x, y) - end -end - -end #testset diff --git a/test/cuda/runtests.jl b/test/cuda/runtests.jl deleted file mode 100644 index 8ed3d66eb4..0000000000 --- a/test/cuda/runtests.jl +++ /dev/null @@ -1,19 +0,0 @@ -using Flux, Test, CUDA -using Zygote -using Zygote: pullback - -@info "Testing GPU Support" -CUDA.allowscalar(false) - -include("test_utils.jl") -include("cuda.jl") -include("losses.jl") -include("layers.jl") - -if CUDA.has_cudnn() - @info "Testing Flux/CUDNN" - include("cudnn.jl") - include("curnn.jl") -else - @warn "CUDNN unavailable, not testing GPU DNN support" -end diff --git a/test/cuda/test_utils.jl b/test/cuda/test_utils.jl deleted file mode 100644 index bc0db37474..0000000000 --- a/test/cuda/test_utils.jl +++ /dev/null @@ -1,72 +0,0 @@ -function check_grad(g_gpu, g_cpu, atol, rtol) - @show g_gpu g_cpu - @test false -end -check_grad(g_gpu::Base.RefValue, g_cpu::Base.RefValue, atol, rtol) = - check_grad(g_gpu[], g_cpu[], atol, rtol) -check_grad(g_gpu::Nothing, g_cpu::Nothing, atol, rtol) = @test true -check_grad(g_gpu::Float32, g_cpu::Float32, atol, rtol) = @test g_cpu ≈ g_gpu rtol=rtol atol=atol -check_grad(g_gpu::CuArray{Float32}, g_cpu::Array{Float32}, atol, rtol) = - @test g_cpu ≈ collect(g_gpu) rtol=rtol atol=atol - -function check_grad(g_gpu::Tuple, g_cpu::Tuple, atol, rtol) - for (v1, v2) in zip(g_gpu, g_cpu) - check_grad(v1, v2, atol, rtol) - end -end - -function check_grad(g_gpu::NamedTuple, g_cpu::NamedTuple, atol, rtol) - for ((k1,v1), (k2,v2)) in zip(pairs(g_gpu), pairs(g_cpu)) - @test k1 == k2 - # @show k2 v2 - check_grad(v1, v2, atol, rtol) - end -end - -function gpu_autodiff_test(f_cpu, xs_cpu::Array{Float32}...; - test_equal=true, rtol=1e-4, atol=1e-4) - - check_type(x) = false - check_type(x::Float32) = true - check_type(x::CuArray{Float32}) = true - check_type(x::Array{Float32}) = true - - ### GRADIENT WITH RESPECT TO INPUT ##### - # y_cpu, back_cpu = pullback((f, x...) -> f(x...), f_cpu, xs_cpu...) - y_cpu, back_cpu = pullback((x...) 
-> f_cpu(x...), xs_cpu...) - @test check_type(y_cpu) - Δ_cpu = size(y_cpu) == () ? randn(Float32) : randn(Float32, size(y_cpu)) - gs_cpu = back_cpu(Δ_cpu) - - f_gpu = f_cpu |> gpu - xs_gpu = gpu.(xs_cpu) - Δ_gpu = Δ_cpu |> gpu - # y_gpu, back_gpu = pullback((f, x...) -> f(x...), f_gpu, xs_gpu...) - y_gpu, back_gpu = pullback((x...) -> f_gpu(x...), xs_gpu...) - @test check_type(y_gpu) - gs_gpu = back_gpu(Δ_gpu) - - if test_equal - @test collect(y_cpu) ≈ collect(y_gpu) rtol=rtol atol=atol - for (g_gpu, g_cpu) in zip(gs_gpu, gs_cpu) - check_grad(g_gpu, g_cpu, atol, rtol) - end - end - - ### GRADIENT WITH RESPECT TO f ##### - ps_cpu = Flux.params(f_cpu) - y_cpu, back_cpu = pullback(() -> f_cpu(xs_cpu...), ps_cpu) - gs_cpu = back_cpu(Δ_cpu) - - ps_gpu = Flux.params(f_gpu) - y_gpu, back_gpu = pullback(() -> f_gpu(xs_gpu...), ps_gpu) - gs_gpu = back_gpu(Δ_gpu) - - if test_equal - @test collect(y_cpu) ≈ collect(y_gpu) rtol=rtol atol=atol - @assert length(ps_gpu) == length(ps_cpu) - for (p_gpu, p_cpu) in zip(ps_gpu, ps_cpu) - check_grad(gs_gpu[p_gpu], gs_cpu[p_cpu], atol, rtol) - end - end -end diff --git a/test/runtests.jl b/test/runtests.jl index a40433d0f1..d385420e3b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -25,7 +25,9 @@ end @testset "Losses" begin include("losses.jl") include("ctc.jl") - if Flux.use_cuda[] include("ctc-gpu.jl") end + if Flux.default_gpu_converter[] !== identity + include("ctc-gpu.jl") + end end @testset "Layers" begin @@ -42,14 +44,6 @@ end include("outputsize.jl") end -@testset "CUDA" begin - if Flux.use_cuda[] - include("cuda/runtests.jl") - else - @warn "CUDA unavailable, not testing GPU support" - end -end - @static if VERSION == v"1.5" using Documenter @testset "Docs" begin From ec1e7195e9b12287867af365dafb3a33dd28856c Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 29 Apr 2021 14:15:08 +0200 Subject: [PATCH 03/11] Update CI. --- .buildkite/pipeline.yml | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 4dd12e2b57..8af3bd6734 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -1,20 +1,20 @@ steps: - - label: "GPU integeration with julia v1.6" # change to "v1" when 1.6 is out + - label: "CUDA support" plugins: - JuliaCI/julia#v1: - version: "1" - - JuliaCI/julia-test#v1: ~ + version: 1.6 + command: | + julia -e 'using Pkg; + + println("--- :julia: Instantiating project"); + Pkg.develop(PackageSpec(path=pwd())); + Pkg.add(PackageSpec(url="https://github.com/FluxML/FluxCUDA.jl", rev="main")); + Pkg.instantiate(); + + println("+++ :julia: Running tests"); + Pkg.test("FluxCUDA"; coverage=true);' agents: queue: "juliagpu" cuda: "*" + if: build.message !~ /\[skip tests\]/ timeout_in_minutes: 60 - - # - label: "GPU nightly" - # plugins: - # - JuliaCI/julia#v1: - # version: "nightly" - # - JuliaCI/julia-test#v1: ~ - # agents: - # queue: "juliagpu" - # cuda: "*" - # timeout_in_minutes: 60 From 32b50659ca26bb7483f34bcf1a38aff57252fa24 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 29 Apr 2021 15:01:36 +0200 Subject: [PATCH 04/11] Try subpackages. 
--- .buildkite/pipeline.yml | 3 +- lib/FluxCUDA/Project.toml | 12 + lib/FluxCUDA/src/FluxCUDA.jl | 13 + lib/FluxCUDA/src/ctc.jl | 232 ++++++++++++++++ lib/FluxCUDA/src/cudnn.jl | 20 ++ lib/FluxCUDA/src/onehot.jl | 5 + lib/FluxCUDA/test/.gitignore | 1 + .../FluxCUDA/test/Manifest.toml | 102 ++++++- lib/FluxCUDA/test/Project.toml | 7 + lib/FluxCUDA/test/cuda.jl | 61 ++++ lib/FluxCUDA/test/cudnn.jl | 44 +++ lib/FluxCUDA/test/curnn.jl | 58 ++++ lib/FluxCUDA/test/layers.jl | 261 ++++++++++++++++++ lib/FluxCUDA/test/losses.jl | 52 ++++ lib/FluxCUDA/test/runtests.jl | 23 ++ lib/FluxCUDA/test/test_utils.jl | 72 +++++ 16 files changed, 950 insertions(+), 16 deletions(-) create mode 100644 lib/FluxCUDA/Project.toml create mode 100644 lib/FluxCUDA/src/FluxCUDA.jl create mode 100644 lib/FluxCUDA/src/ctc.jl create mode 100644 lib/FluxCUDA/src/cudnn.jl create mode 100644 lib/FluxCUDA/src/onehot.jl create mode 100644 lib/FluxCUDA/test/.gitignore rename Manifest.toml => lib/FluxCUDA/test/Manifest.toml (76%) create mode 100644 lib/FluxCUDA/test/Project.toml create mode 100644 lib/FluxCUDA/test/cuda.jl create mode 100644 lib/FluxCUDA/test/cudnn.jl create mode 100644 lib/FluxCUDA/test/curnn.jl create mode 100644 lib/FluxCUDA/test/layers.jl create mode 100644 lib/FluxCUDA/test/losses.jl create mode 100644 lib/FluxCUDA/test/runtests.jl create mode 100644 lib/FluxCUDA/test/test_utils.jl diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 8af3bd6734..a11e983e00 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -8,8 +8,7 @@ steps: println("--- :julia: Instantiating project"); Pkg.develop(PackageSpec(path=pwd())); - Pkg.add(PackageSpec(url="https://github.com/FluxML/FluxCUDA.jl", rev="main")); - Pkg.instantiate(); + Pkg.develop(PackageSpec(path=joinpath(pwd(), "lib/FluxCUDA"))); println("+++ :julia: Running tests"); Pkg.test("FluxCUDA"; coverage=true);' diff --git a/lib/FluxCUDA/Project.toml b/lib/FluxCUDA/Project.toml new file mode 100644 index 0000000000..40ec4b887d --- /dev/null +++ b/lib/FluxCUDA/Project.toml @@ -0,0 +1,12 @@ +name = "FluxCUDA" +uuid = "899ac4cc-ca92-44e3-a03b-1f2759274493" +authors = ["Tim Besard "] +version = "0.1.0" + +[deps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" +NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" +NNlibCUDA = "a00861dc-f156-4864-bf3c-e6376f28a68d" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" diff --git a/lib/FluxCUDA/src/FluxCUDA.jl b/lib/FluxCUDA/src/FluxCUDA.jl new file mode 100644 index 0000000000..75de37eb34 --- /dev/null +++ b/lib/FluxCUDA/src/FluxCUDA.jl @@ -0,0 +1,13 @@ +module FluxCUDA + +using Flux, CUDA +using NNlib, NNlibCUDA +using Zygote +using Zygote: @adjoint + +include("onehot.jl") +include("ctc.jl") +include("cudnn.jl") + + +end # module diff --git a/lib/FluxCUDA/src/ctc.jl b/lib/FluxCUDA/src/ctc.jl new file mode 100644 index 0000000000..4afcea726a --- /dev/null +++ b/lib/FluxCUDA/src/ctc.jl @@ -0,0 +1,232 @@ +# GPU implementation + +# a port of the GPU kernels from Baidu's C++ warp-ctc package, +# which itself is Copyright 2015-2016 Baidu USA LLC +# and available under the Apache 2.0 license +# +# Apache 2.0 license: https://www.apache.org/licenses/LICENSE-2.0 +# GitHub: https://github.com/baidu-research/warp-ctc/ +# paper: https://arxiv.org/pdf/1512.02595.pdf + +using Flux +using Statistics +using CUDA +using NNlib + +const MAX_THREADS = 256 + +function log_plus_f(p1, p2) + isinf(p1) && return p2 + isinf(p2) && 
return p1 + if p1 < p2 + p1, p2 = p2, p1 + end + return p1 + log(1+exp(p2 - p1)) +end + +function count_repeats(A) + repeats = 0 + for (i,elem) in enumerate(A) + if i > 1 && A[i] == A[i-1] + repeats += 1 + end + end + return repeats +end + +function compute_alpha_kernel(probs, labelSize, uttLength, repeats, labelsWithoutBlanks, labelsWithBlanks, alpha, blankLabel) + + tid = threadIdx().x + L = labelSize + T = uttLength + S = length(labelsWithBlanks) + + if L + repeats > T + return nothing + end + labels = labelsWithBlanks + + # Corner-case checking + start = (L + repeats <= T) ? 0 : 1 + last = S > 1 ? 2 : 1 + + # Fill in first column (time step) + i = tid + while i <= last - start + alpha[start+i, 1] = probs[labels[start+i], 1] + i += blockDim().x + end + sync_threads() + + # Fill in coefficients for each time step + for t=2:T + # Corner-case checking + if tid == 1 && !(1 < S - 2*(T-t) - 1) + if start == 0 + alpha[1, t] = alpha[1, t-1] + probs[blankLabel, t] + elseif start == 1 + alpha[1, t] = alpha[1, t-1] + end + end + sync_threads() + + # Fill in coefficients for each label class in the target output sequence; + # each thread will process the calculations for one class + idx = tid+1 + while idx <= S + prevSum = log_plus_f(alpha[idx, t-1], alpha[idx-1, t-1]) + if labels[idx] != blankLabel && idx != 2 && labels[idx] != labels[idx-2] + prevSum = log_plus_f(prevSum, alpha[idx-2, t-1]) + end + if idx < S - 2*(T-t) - 1 + alpha[idx, t] = -Inf32 + else + alpha[idx, t] = prevSum + probs[labels[idx], t] + end + idx += blockDim().x + end + sync_threads() + end + return nothing +end + +function compute_beta_and_grad_kernel(probs, labelSize, uttLength, + repeatsInLabel, labelsWithBlanks, + alphas, beta, output, accum, + grad, blankLabel, loss) + + tid = threadIdx().x + L = labelSize + T = uttLength + S = 2*L + 1 + repeats = repeatsInLabel + labels = labelsWithBlanks + + if (L+repeats) > T + return nothing + end + + # Corner-case checking + start = S > 1 ? S-2 : 0 + last = L + repeats < T ? 
S : S-1 + sync_threads() + i = tid + + # Calculate coefficients for last column (time step) + # then determine alpha and beta product + while i <= last - start + beta[i+start, T] = 0 + output[i+start, T] = beta[i+start, T] + alphas[i+start, T] + i += blockDim().x + end + sync_threads() + + # Fill in `accum` for last column (time step) + if tid == 1 + for i=1:S + labelIdx = labels[i] + accum[labelIdx, T] = log_plus_f(accum[labelIdx, T], output[i, T]) + end + end + sync_threads() + + # Fill in `grad` for last column (time step) + idx = tid + while idx <= size(grad, 1) + s = -Inf32 + for i=1:S + s = log_plus_f(s, output[i, T]) + end + + # ∂L/∂a (where a is activation before logsoftmax) + grad[idx, T] = exp(probs[idx, T]) - exp(accum[idx, T] - s) + idx += blockDim().x + end + sync_threads() + + # Fill in the rest of the coefficients + t = T-1 + while t >= 1 + if t < T + idx = tid + while idx <= S + nextSum = probs[labels[idx], t+1] + beta[idx, t+1] + if idx < S + nextSum = log_plus_f(nextSum, + probs[labels[idx+1], t+1] + beta[idx+1, t+1]) + end + if labels[idx] != blankLabel && idx != S-1 && labels[idx] != labels[idx+2] + nextSum = log_plus_f(nextSum, + probs[labels[idx+2], t+1] + beta[idx + 2, t+1]) + end + if idx > 2*t + beta[idx, t] = -Inf32 + else + beta[idx, t] = nextSum + end + idx += blockDim().x + end + sync_threads() + idx = tid + while idx <= S + output[idx, t] = alphas[idx, t] + beta[idx, t] + idx += blockDim().x + end + sync_threads() + end + sync_threads() + + # Calculate accumulated alpha-beta products for each label class for + # each time step; used in calculating gradients + if tid == 1 + for i=1:S + labelIdx = labels[i] + accum[labelIdx, t] = log_plus_f(accum[labelIdx, t], output[i, t]) + end + end + sync_threads() + idx = tid + + # Calculate gradients + while idx <= size(grad, 1) + + # ∂L/∂a (where a is activation before logsoftmax) + grad[idx, t] = exp(probs[idx, t]) - exp(accum[idx, t] + loss) + idx += blockDim().x + end + sync_threads() + t -= 1 + sync_threads() + end + return nothing +end + +function ctc_alpha(ŷ::CuArray, y) + ŷ = logsoftmax(ŷ) + blank = size(ŷ, 1) + z′ = fill(blank, 2 * length(y) + 1) + z′[eachindex(y) .* 2] = y + T = size(ŷ, 2) + U′ = 2*length(y) + 1 + alphas = CUDA.fill(log(zero(ŷ[1])), U′,T) + nRepeats = count_repeats(y) + nThreads = min(U′, MAX_THREADS) + @cuda blocks=1 threads=nThreads compute_alpha_kernel(ŷ, length(y), T, nRepeats, CuArray(y), CuArray(z′), alphas, blank) + return (loss=-1 * logsumexp(alphas[end-1:end]), alpha=alphas, z′=z′, yhat=ŷ, nRepeats=nRepeats) +end + +ctc_loss(ŷ::CuArray, y) = ctc_alpha(ŷ::CuArray, y).loss + +function ∇ctc_loss(ŷ::CuArray, y, out) + loss, alphas, z′, ŷ, nRepeats = out + U′, T = size(alphas) + blank = size(ŷ, 1) + typed_zero = zero(first(ŷ)) + betas = CUDA.fill(log(typed_zero), U′, T) + output = CUDA.fill(log(typed_zero), U′, T) + nThreads = min(U′, MAX_THREADS) + grads = CUDA.fill(log(typed_zero), size(ŷ)) + accum = CUDA.fill(log(typed_zero), size(ŷ)) + @cuda blocks=1 threads=nThreads compute_beta_and_grad_kernel(ŷ, length(y), T, nRepeats, CuArray(z′), alphas, betas, output, accum, grads, blank, loss) + return grads +end diff --git a/lib/FluxCUDA/src/cudnn.jl b/lib/FluxCUDA/src/cudnn.jl new file mode 100644 index 0000000000..4a3b2618c8 --- /dev/null +++ b/lib/FluxCUDA/src/cudnn.jl @@ -0,0 +1,20 @@ +import NNlibCUDA: batchnorm, ∇batchnorm + +function (BN::Flux.BatchNorm)(x::Union{CuArray{T,2},CuArray{T,4},CuArray{T,5}}, + cache=nothing) where T<:Union{Float32, Float64} + + @assert BN.affine "BatchNorm: only 
affine=true supported on gpu" + @assert BN.track_stats "BatchNorm: only track_stats=true supported on gpu" + @assert length(BN.β) == size(x, ndims(x)-1) "BatchNorm: input has wronng number of channels" + return BN.λ.(batchnorm(BN.γ, BN.β, x, BN.μ, BN.σ², BN.momentum; + cache=cache, alpha=1, beta=0, eps=BN.ϵ, + training=Flux._isactive(BN))) +end + +@adjoint function batchnorm(g, b, x, running_mean, running_var, momentum; kw...) + y = batchnorm(g, b, x, running_mean, running_var, momentum; kw...) + function batchnorm_pullback(Δ) + ∇batchnorm(g, b, x, Δ, running_mean, running_var, momentum; kw...)..., nothing, nothing, nothing + end + y, batchnorm_pullback +end diff --git a/lib/FluxCUDA/src/onehot.jl b/lib/FluxCUDA/src/onehot.jl new file mode 100644 index 0000000000..760df1d27e --- /dev/null +++ b/lib/FluxCUDA/src/onehot.jl @@ -0,0 +1,5 @@ +import Flux: OneHotArray, OneHotLike, _onehot_bool_type + +_onehot_bool_type(x::OneHotLike{<:Any, <:Any, <:Any, N, <:CuArray}) where N = CuArray{Bool, N} + +Base.BroadcastStyle(::Type{<:OneHotArray{<: Any, <: Any, <: Any, N, <: CuArray}}) where N = CUDA.CuArrayStyle{N}() diff --git a/lib/FluxCUDA/test/.gitignore b/lib/FluxCUDA/test/.gitignore new file mode 100644 index 0000000000..c181d1f83f --- /dev/null +++ b/lib/FluxCUDA/test/.gitignore @@ -0,0 +1 @@ +test/Manifest.toml diff --git a/Manifest.toml b/lib/FluxCUDA/test/Manifest.toml similarity index 76% rename from Manifest.toml rename to lib/FluxCUDA/test/Manifest.toml index 4aad025690..51ed72ce90 100644 --- a/Manifest.toml +++ b/lib/FluxCUDA/test/Manifest.toml @@ -23,9 +23,26 @@ uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" [[Artifacts]] uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" +[[BFloat16s]] +deps = ["LinearAlgebra", "Test"] +git-tree-sha1 = "4af69e205efc343068dc8722b8dfec1ade89254a" +uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" +version = "0.1.0" + [[Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" +[[CEnum]] +git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" +uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" +version = "0.4.1" + +[[CUDA]] +deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "DataStructures", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "MacroTools", "Memoize", "Printf", "Random", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "Statistics", "TimerOutputs"] +git-tree-sha1 = "d4fa6486e94c4087f1d081d7be2d501a170bd51d" +uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" +version = "3.1.0" + [[ChainRules]] deps = ["ChainRulesCore", "Compat", "LinearAlgebra", "Random", "Reexport", "Requires", "Statistics"] git-tree-sha1 = "1f410fba5c04d03ab712f348f1542e6059376547" @@ -34,9 +51,9 @@ version = "0.7.61" [[ChainRulesCore]] deps = ["Compat", "LinearAlgebra", "SparseArrays"] -git-tree-sha1 = "42e3c181483fbd2c416087a0a93838803e358358" +git-tree-sha1 = "a66109c73612c63b10923ac446fddb0f0d21a593" uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" -version = "0.9.38" +version = "0.9.40" [[CodecZlib]] deps = ["TranscodingStreams", "Zlib_jll"] @@ -46,15 +63,15 @@ version = "0.7.0" [[ColorTypes]] deps = ["FixedPointNumbers", "Random"] -git-tree-sha1 = "32a2b8af383f11cbb65803883837a149d10dfe8a" +git-tree-sha1 = "024fe24d83e4a5bf5fc80501a314ce0d1aa35597" uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" -version = "0.10.12" +version = "0.11.0" [[Colors]] deps = ["ColorTypes", "FixedPointNumbers", "Reexport"] -git-tree-sha1 = "82f4e6ff9f847eca3e5ebc666ea2cd7b48e8b47e" +git-tree-sha1 = 
"417b0ed7b8b838aa6ca0a87aadf1bb9eb111ce40" uuid = "5ae59095-9a9b-59fe-a467-6f913c188581" -version = "0.12.7" +version = "0.12.8" [[CommonSubexpressions]] deps = ["MacroTools", "Test"] @@ -111,6 +128,11 @@ uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" deps = ["ArgTools", "LibCURL", "NetworkOptions"] uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" +[[ExprTools]] +git-tree-sha1 = "10407a39b87f29d47ebaca8edbc75d7c302ff93e" +uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" +version = "0.1.3" + [[FillArrays]] deps = ["LinearAlgebra", "Random", "SparseArrays"] git-tree-sha1 = "31939159aeb8ffad1d4d8ee44d07f8558273120a" @@ -123,6 +145,12 @@ git-tree-sha1 = "335bfdceacc84c5cdf16aadc768aa5ddfc5383cc" uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93" version = "0.8.4" +[[Flux]] +deps = ["AbstractTrees", "Adapt", "CUDA", "CodecZlib", "Colors", "DelimitedFiles", "Functors", "Juno", "LinearAlgebra", "MacroTools", "NNlib", "NNlibCUDA", "Pkg", "Printf", "Random", "Reexport", "SHA", "Statistics", "StatsBase", "Test", "ZipFile", "Zygote"] +git-tree-sha1 = "5e94fff7b4385fdd059863300b6b25ea0f849dda" +uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c" +version = "0.12.3" + [[ForwardDiff]] deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "LinearAlgebra", "NaNMath", "Printf", "Random", "SpecialFunctions", "StaticArrays"] git-tree-sha1 = "e2af66012e08966366a43251e1fd421522908be6" @@ -141,6 +169,12 @@ git-tree-sha1 = "9c95b2fd5c16bc7f97371e9f92f0fef77e0f5957" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" version = "6.2.2" +[[GPUCompiler]] +deps = ["DataStructures", "ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "Serialization", "TimerOutputs", "UUIDs"] +git-tree-sha1 = "6eadd2321dc3ac0fc9d530ab01c2caa7fe5d74c6" +uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" +version = "0.11.4" + [[IRTools]] deps = ["InteractiveUtils", "MacroTools", "Test"] git-tree-sha1 = "c67e7515a11f726f44083e74f218d134396d6510" @@ -163,6 +197,16 @@ git-tree-sha1 = "07cb43290a840908a771552911a6274bc6c072c7" uuid = "e5e0dc1b-0480-54bc-9374-aad01c23163d" version = "0.8.4" +[[LLVM]] +deps = ["CEnum", "Libdl", "Printf", "Unicode"] +git-tree-sha1 = "b616937c31337576360cb9fb872ec7633af7b194" +uuid = "929cbde3-209d-540e-8aea-75f648917ca0" +version = "3.6.0" + +[[LazyArtifacts]] +deps = ["Artifacts", "Pkg"] +uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" + [[LibCURL]] deps = ["LibCURL_jll", "MozillaCACerts_jll"] uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" @@ -209,11 +253,17 @@ git-tree-sha1 = "75a54abd10709c01f1b86b84ec225d26e840ed58" uuid = "e89f7d12-3494-54d1-8411-f7d8b9ae1f27" version = "0.5.0" +[[Memoize]] +deps = ["MacroTools"] +git-tree-sha1 = "2b1dfcba103de714d31c033b5dacc2e4a12c7caa" +uuid = "c03570c3-d221-55d1-a50c-7939bbd78826" +version = "0.4.4" + [[Missings]] deps = ["DataAPI"] -git-tree-sha1 = "f8c673ccc215eb50fcadb285f522420e29e69e1c" +git-tree-sha1 = "4ea90bd5d3985ae1f9a908bd4500ae88921c5ce7" uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" -version = "0.4.5" +version = "1.0.0" [[Mmap]] uuid = "a63ad114-7e13-5084-954f-fe012c677804" @@ -227,6 +277,12 @@ git-tree-sha1 = "80b8360670f445d88b3475e88b33bbcc92f7866e" uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" version = "0.7.19" +[[NNlibCUDA]] +deps = ["CUDA", "LinearAlgebra", "NNlib", "Random", "Statistics"] +git-tree-sha1 = "4b368b466bcdd25d448a5b20de4b7e481d68b88e" +uuid = "a00861dc-f156-4864-bf3c-e6376f28a68d" +version = "0.1.0" + [[NaNMath]] git-tree-sha1 = "bfe47e760d60b82b66b61d2d44128b62e3a369fb" uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3" @@ -237,9 +293,9 @@ uuid 
= "ca575930-c2e3-43a9-ace4-1e988b2c1908" [[OpenSpecFun_jll]] deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "9db77584158d0ab52307f8c04f8e7c08ca76b5b3" +git-tree-sha1 = "b9b8b8ed236998f91143938a760c2112dceeb2b4" uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" -version = "0.5.3+4" +version = "0.5.4+0" [[OrderedCollections]] git-tree-sha1 = "4fa2ba51070ec13fcc7517db714445b4ab986bdf" @@ -272,6 +328,12 @@ uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" deps = ["Serialization"] uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +[[RandomNumbers]] +deps = ["Random", "Requires"] +git-tree-sha1 = "441e6fc35597524ada7f85e13df1f4e10137d16f" +uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" +version = "1.4.0" + [[Reexport]] git-tree-sha1 = "57d8440b0c7d98fc4f889e478e80f268d534c9d5" uuid = "189a3867-3050-52da-a836-e630ba90ab69" @@ -286,6 +348,12 @@ version = "1.1.3" [[SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" +[[Scratch]] +deps = ["Dates"] +git-tree-sha1 = "ad4b278adb62d185bbcb6864dc24959ab0627bf6" +uuid = "6c6a2e73-6563-6170-7368-637461726353" +version = "1.0.3" + [[Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" @@ -314,9 +382,9 @@ version = "1.3.0" [[StaticArrays]] deps = ["LinearAlgebra", "Random", "Statistics"] -git-tree-sha1 = "e8cd1b100d37f5b4cfd2c83f45becf61c762eaf7" +git-tree-sha1 = "2653e9c769343808781a8bd5010ee7a17c01152e" uuid = "90137ffa-7385-5640-81b9-e52037218182" -version = "1.1.1" +version = "1.1.2" [[Statistics]] deps = ["LinearAlgebra", "SparseArrays"] @@ -324,9 +392,9 @@ uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [[StatsBase]] deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics"] -git-tree-sha1 = "4bc58880426274277a066de306ef19ecc22a6863" +git-tree-sha1 = "4d8ca45223d7a28839e775d73a6f6b6b2ac64fd1" uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" -version = "0.33.5" +version = "0.33.6" [[TOML]] deps = ["Dates"] @@ -340,6 +408,12 @@ uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +[[TimerOutputs]] +deps = ["Printf"] +git-tree-sha1 = "32cdbe6cd2d214c25a0b88f985c9e0092877c236" +uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" +version = "0.5.8" + [[TranscodingStreams]] deps = ["Random", "Test"] git-tree-sha1 = "7c53c35547de1c5b9d46a4797cf6d8253807108c" diff --git a/lib/FluxCUDA/test/Project.toml b/lib/FluxCUDA/test/Project.toml new file mode 100644 index 0000000000..8d9b755315 --- /dev/null +++ b/lib/FluxCUDA/test/Project.toml @@ -0,0 +1,7 @@ +[deps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" diff --git a/lib/FluxCUDA/test/cuda.jl b/lib/FluxCUDA/test/cuda.jl new file mode 100644 index 0000000000..65982836ab --- /dev/null +++ b/lib/FluxCUDA/test/cuda.jl @@ -0,0 +1,61 @@ +using LinearAlgebra + +@testset "CUDA" begin + x = randn(5, 5) + cx = gpu(x) + @test cx isa CuArray + + @test Flux.onecold(gpu([1.0, 2.0, 3.0])) == 3 + + x = Flux.onehotbatch([1, 2, 3], 1:3) + cx = gpu(x) + @test cx isa Flux.OneHotMatrix && cx.indices isa CuArray + @test (cx .+ 1) isa CuArray + + m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax) + cm = gpu(m) + + @test all(p isa CuArray for p in params(cm)) + @test 
cm(gpu(rand(10, 10))) isa CuArray{Float32,2} + + xs = rand(5, 5) + ys = Flux.onehotbatch(1:5,1:5) + @test collect(cu(xs) .+ cu(ys)) ≈ collect(xs .+ ys) + + c = gpu(Conv((2,2),3=>4)) + x = gpu(rand(10, 10, 3, 2)) + l = c(gpu(rand(10,10,3,2))) + @test gradient(x -> sum(c(x)), x)[1] isa CuArray + + c = gpu(CrossCor((2,2),3=>4)) + x = gpu(rand(10, 10, 3, 2)) + l = c(gpu(rand(10,10,3,2))) + @test gradient(x -> sum(c(x)), x)[1] isa CuArray + +end + +@testset "onecold gpu" begin + y = Flux.onehotbatch(ones(3), 1:10) |> gpu; + l = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'] + @test Flux.onecold(y) isa CuArray + @test y[3,:] isa CuArray + @test Flux.onecold(y, l) == ['a', 'a', 'a'] +end + +@testset "restructure gpu" begin + dudt = Dense(1,1) |> gpu + p,re = Flux.destructure(dudt) + foo(x) = sum(re(p)(x)) + @test gradient(foo, cu(rand(1)))[1] isa CuArray +end + +@testset "GPU functors" begin + @testset "Cholesky" begin + M = 2.0*I(10) |> collect + Q = cholesky(M) + Q_gpu = Q |> gpu + @test Q_gpu isa Cholesky{<:Any,<:CuArray} + Q_cpu = Q_gpu |> cpu + @test Q_cpu == cholesky(eltype(Q_gpu).(M)) + end +end diff --git a/lib/FluxCUDA/test/cudnn.jl b/lib/FluxCUDA/test/cudnn.jl new file mode 100644 index 0000000000..5d1727e984 --- /dev/null +++ b/lib/FluxCUDA/test/cudnn.jl @@ -0,0 +1,44 @@ +using Flux, CUDA, Test +using Flux: pullback + +@testset "CUDNN BatchNorm" begin + @testset "4D Input" begin + x = rand(Float32, 2, 2, 3, 4) + m = BatchNorm(3) + gx = gpu(x) + gm = gpu(m) + + y, back = pullback((m, x) -> m(x), m, x) + gy, gback = pullback((m, x) -> m(x), gm, gx) + + @test cpu(gy) ≈ y + + Δ = randn(Float32, size(y)) + dm, dx = back(Δ) + gdm, gdx = gback(gpu(Δ)) + + @test dm[].γ ≈ cpu(gdm[].γ) + @test dm[].β ≈ cpu(gdm[].β) + @test dx ≈ cpu(gdx) + end + + @testset "2D Input" begin + x = rand(Float32, 3, 4) + m = BatchNorm(3) + gx = gpu(x) + gm = gpu(m) + + y, back = pullback((m, x) -> m(x), m, x) + gy, gback = pullback((m, x) -> m(x), gm, gx) + + @test cpu(gy) ≈ y + + Δ = randn(Float32, size(y)) + dm, dx = back(Δ) + gdm, gdx = gback(gpu(Δ)) + + @test dm[].γ ≈ cpu(gdm[].γ) + @test dm[].β ≈ cpu(gdm[].β) + @test dx ≈ cpu(gdx) + end +end diff --git a/lib/FluxCUDA/test/curnn.jl b/lib/FluxCUDA/test/curnn.jl new file mode 100644 index 0000000000..63a5f93ada --- /dev/null +++ b/lib/FluxCUDA/test/curnn.jl @@ -0,0 +1,58 @@ +using Flux, CUDA, Test + +@testset for R in [RNN, GRU, LSTM] + m = R(10, 5) |> gpu + x = gpu(rand(10)) + (m̄,) = gradient(m -> sum(m(x)), m) + Flux.reset!(m) + θ = gradient(() -> sum(m(x)), params(m)) + @test x isa CuArray + @test θ[m.cell.Wi] isa CuArray + @test collect(m̄[].cell.Wi) == collect(θ[m.cell.Wi]) +end + +@testset "RNN" begin + @testset for R in [RNN, GRU, LSTM], batch_size in (1, 5) + rnn = R(10, 5) + curnn = fmap(gpu, rnn) + + Flux.reset!(rnn) + Flux.reset!(curnn) + x = batch_size == 1 ? + rand(Float32, 10) : + rand(Float32, 10, batch_size) + cux = gpu(x) + + y, back = pullback((r, x) -> r(x), rnn, x) + cuy, cuback = pullback((r, x) -> r(x), curnn, cux) + + @test y ≈ collect(cuy) + + ȳ = randn(size(y)) + m̄, x̄ = back(ȳ) + cum̄, cux̄ = cuback(gpu(ȳ)) + + @test x̄ ≈ collect(cux̄) + @test m̄[].cell.Wi ≈ collect(cum̄[].cell.Wi) + @test m̄[].cell.Wh ≈ collect(cum̄[].cell.Wh) + @test m̄[].cell.b ≈ collect(cum̄[].cell.b) + if m̄[].state isa Tuple + for (x, cx) in zip(m̄[].state, cum̄[].state) + @test x ≈ collect(cx) + end + else + @test m̄[].state ≈ collect(cum̄[].state) + end + + Flux.reset!(rnn) + Flux.reset!(curnn) + ohx = batch_size == 1 ? 
+ Flux.onehot(rand(1:10), 1:10) : + Flux.onehotbatch(rand(1:10, batch_size), 1:10) + cuohx = gpu(ohx) + y = (rnn(ohx); rnn(ohx)) + + cuy = (curnn(cuohx); curnn(cuohx)) + @test y ≈ collect(cuy) + end +end diff --git a/lib/FluxCUDA/test/layers.jl b/lib/FluxCUDA/test/layers.jl new file mode 100644 index 0000000000..78dc9aa7d2 --- /dev/null +++ b/lib/FluxCUDA/test/layers.jl @@ -0,0 +1,261 @@ +# Test layers and data/model movements on and off the GPU +# Add tests for layers and their gradients on the GPU +# Most of the forward passes should be fine being applied +# to bitstype objects, but this gives higher coverage for our use-cases +# Check that getting the gradients does not throw + +# generic movement tests +@testset "Basic GPU Movement" begin + @test gradient(x -> sum(gpu(x)), rand(3,3)) isa Tuple + @test gradient(x -> sum(cpu(x)), gpu(rand(3,3))) isa Tuple +end + +# TODO: These layers get into scalar indexing +# `AlphaDropout` throws a compilation error on GPUs, +# whereas, the rest are scalar indexing issues. +# The norm layers behave differently on the CPU and +# the GPU too. +const BROKEN_LAYERS = Union{DepthwiseConv, + AlphaDropout} + +const ACTIVATIONS = [identity, relu, tanh, + sigmoid, exp, softplus, + elu, selu] + +function gpu_gradtest(name::String, layers::Vector, x_cpu = nothing, args...; test_cpu = true) + isnothing(x_cpu) && error("Missing input to test the layers against.") + @testset "$name GPU grad tests" begin + for layer in layers + @testset "$layer Layer GPU grad test" begin + + # compute output and grad of parameters + l_cpu = layer(args...) + ps_cpu = Flux.params(l_cpu) + y_cpu, back_cpu = pullback(() -> sum(l_cpu(x_cpu)), ps_cpu) + gs_cpu = back_cpu(1f0) + + x_gpu = gpu(x_cpu) + l_gpu = l_cpu |> gpu + ps_gpu = Flux.params(l_gpu) + + if typeof(l_gpu) <: BROKEN_LAYERS + @test_broken gradient(() -> sum(l_gpu(x_gpu)), ps_gpu) isa Zygote.Grads + else + y_gpu, back_gpu = pullback(() -> sum(l_gpu(x_gpu)), ps_gpu) + gs_gpu = back_gpu(1f0) # TODO many layers error out when backprop int 1, should fix + + # compute grad of input + xg_cpu = gradient(x -> sum(l_cpu(x)), x_cpu)[1] + xg_gpu = gradient(x -> sum(l_gpu(x)), x_gpu)[1] + + # test + if test_cpu + @test y_gpu ≈ y_cpu rtol=1f-3 atol=1f-3 + @test Array(xg_gpu) ≈ xg_cpu rtol=1f-3 atol=1f-3 + end + @test gs_gpu isa Zygote.Grads + for (p_cpu, p_gpu) in zip(ps_cpu, ps_gpu) + @test gs_gpu[p_gpu] isa CUDA.CuArray + if test_cpu + @test Array(gs_gpu[p_gpu]) ≈ gs_cpu[p_cpu] rtol=1f-3 atol=1f-3 + end + end + end + end + end + end +end + +# Just to give testset in gpu_gradtest meaningful labels +ConvNoBias(args...) = Conv(args...; bias = false) +ConvTransposeNoBias(args...) = ConvTranspose(args...; bias = false) +CrossCorNoBias(args...) = CrossCor(args...; bias = false) +DepthwiseConvNoBias(args...) 
= DepthwiseConv(args...; bias = false) + +for act in ACTIVATIONS + r = rand(Float32, 28, 28, 1, 1) + conv_layers = [Conv, ConvNoBias, + ConvTranspose, ConvTransposeNoBias, + CrossCor, CrossCorNoBias, + DepthwiseConv, DepthwiseConvNoBias] + gpu_gradtest("Convolution with $act", conv_layers, r, (2,2), 1=>3, act, test_cpu = false) + + batch_norm = [BatchNorm] + gpu_gradtest("BatchNorm 1 with $act", batch_norm, rand(Float32, 28,28,3,4), 3, act, test_cpu = false) #TODO fix errors + gpu_gradtest("BatchNorm 2 with $act", batch_norm, rand(Float32, 5,4), 5, act, test_cpu = false) + + instancenorm = [InstanceNorm] + gpu_gradtest("InstanceNorm with $act", instancenorm, r, 1, act, test_cpu = false) + + groupnorm = [GroupNorm] + gpu_gradtest("GroupNorm with $act", groupnorm, rand(Float32, 28,28,3,1), 3, 1, act, test_cpu = false) +end + +r = rand(Float32, 28, 28, 1, 1) + +pooling_layers = [MaxPool, MeanPool] +gpu_gradtest("Pooling", pooling_layers, r, (2,2)) + +adaptive_pooling_layers = [AdaptiveMaxPool, AdaptiveMeanPool] +gpu_gradtest("AdaptivePooling", adaptive_pooling_layers, r, (7,7), test_cpu = false) + +dropout_layers = [Dropout, AlphaDropout] +gpu_gradtest("Dropout", dropout_layers, r, 0.5f0; test_cpu = false) # dropout is not deterministic + +layer_norm = [LayerNorm] +gpu_gradtest("LayerNorm 1", layer_norm, rand(Float32, 28,28,3,4), 1, test_cpu = false) #TODO fix errors +gpu_gradtest("LayerNorm 2", layer_norm, rand(Float32, 5,4), 5) + +upsample = [x -> Upsample(scale=x)] +gpu_gradtest("Upsample 2d", upsample, rand(Float32, 3, 4, 2, 3), (2,2)) +gpu_gradtest("Upsample 1d", upsample, rand(Float32, 3, 4, 2, 3), (2,)) + +pixelshuffle = [PixelShuffle] +gpu_gradtest("PixelShuffle 2d", pixelshuffle, rand(Float32, 3, 4, 18, 3), 3) +gpu_gradtest("PixelShuffle 1d", pixelshuffle, rand(Float32, 3, 18, 3), 3) + +@testset "function layers" begin + x = rand(Float32, 3,3) + gpu_autodiff_test(x -> sum(Flux.normalise(x; dims=1)), x) + gpu_autodiff_test(x -> sum(Flux.normalise(x; dims=2)), x) + gpu_autodiff_test(x -> sum(Flux.normalise(x)), x) +end + +@testset "Zeros mapped for $cl" for cl in (Conv, ConvTranspose, CrossCor, DepthwiseConv) + l = cl((2,2), 1=>3, bias = false) |> gpu + ip = zeros(Float32, 28,28,1,1) |> gpu + if typeof(l) <: BROKEN_LAYERS + @test_broken sum(l(ip)) ≈ 0.f0 + @test_broken gradient(() -> sum(l(ip)), Flux.params(l)) isa Zygote.Grads + else + @test sum(l(ip)) ≈ 0.f0 + gs = gradient(() -> sum(l(ip)), Flux.params(l)) + @test l.bias ∉ gs.params + end +end + +@testset "Dense with Zeros bias" begin + l = Dense(ones(Float32, 4,3), Flux.Zeros()) |> gpu + ip = zeros(Float32, 3, 7) |> gpu + + @test sum(l(ip)) ≈ 0.f0 + gs = gradient(() -> sum(l(ip)), Flux.params(l)) + @test l.b ∉ gs.params +end + +@testset "Extended BatchNorm" begin + m_cpu = BatchNorm(2) + m_gpu = m_cpu |> gpu + x_cpu = rand(Float32, 3, 2, 2) + x_gpu = x_cpu |> gpu + + ## In :auto mode, track statistics only in gradient contest + μ_cpu = copy(m_cpu.μ) + m_cpu(x_cpu) + @test m_cpu.μ ≈ μ_cpu + gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) + @test !(m_cpu.μ ≈ μ_cpu) + + μ_gpu = copy(m_gpu.μ) + m_gpu(x_gpu) + @test m_gpu.μ ≈ μ_gpu + gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) + @test !(m_gpu.μ ≈ μ_gpu) + + @test Array(m_gpu.μ) ≈ m_cpu.μ + + ## In testmode, never track statistics + testmode!(m_cpu) + μ_cpu = copy(m_cpu.μ) + m_cpu(x_cpu) + @test m_cpu.μ ≈ μ_cpu + gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) + @test m_cpu.μ ≈ μ_cpu + + testmode!(m_gpu) + μ_gpu = copy(m_gpu.μ) + m_gpu(x_gpu) + @test m_gpu.μ ≈ μ_gpu 
+ gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) + @test m_gpu.μ ≈ μ_gpu + + ## In trainmode, always track statistics + trainmode!(m_cpu) + μ_cpu = copy(m_cpu.μ) + m_cpu(x_cpu) + @test !(m_cpu.μ ≈ μ_cpu) + μ_cpu = copy(m_cpu.μ) + gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) + @test !(m_cpu.μ ≈ μ_cpu) + + trainmode!(m_gpu) + μ_gpu = copy(m_gpu.μ) + m_gpu(x_gpu) + @test !(m_gpu.μ ≈ μ_gpu) + μ_gpu = copy(m_gpu.μ) + gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) + @test !(m_gpu.μ ≈ μ_gpu) + + ## No errors if input type mistmatch + # x_cpu = rand(Float64, 3, 2, 2) + # x_gpu = x_cpu |> gpu + # m_cpu(x_cpu) + # gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) + # m_gpu(x_gpu) + # gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) +end + +@testset "Two-streams Bilinear" begin + x = zeros(Float32,10,9) |> gpu + y = zeros(Float32,2,9) |> gpu + b = Flux.Bilinear(10, 2, 3) |> gpu + @test size(b(x,y)) == (3,9) + @test sum(abs2, b(x,y)) ≈ 0f0 + gs_gpu = gradient(() -> sum(abs2.(b(x, y))), params(b)) + b_cpu, x_cpu, y_cpu = b |> cpu, x |> cpu, y |> cpu + gs_cpu = gradient(() -> sum(abs2.(b_cpu(x_cpu, y_cpu))), params(b_cpu)) + for (pgpu, pcpu) in zip(params(b), params(b_cpu)) + @test gs_cpu[pcpu] ≈ Array(gs_gpu[pgpu]) + end +end + +@testset "Two-streams Bilinear" begin + x = zeros(Float32,10,9) |> gpu + y = zeros(Float32,2,9) |> gpu + b = Flux.Bilinear(10, 2, 3) |> gpu + @test size(b(x,y)) == (3,9) + @test sum(abs2, b(x,y)) ≈ 0f0 + gs_gpu = gradient(() -> sum(abs2.(b(x, y))), params(b)) + b_cpu, x_cpu, y_cpu = b |> cpu, x |> cpu, y |> cpu + gs_cpu = gradient(() -> sum(abs2.(b_cpu(x_cpu, y_cpu))), params(b_cpu)) + for (pgpu, pcpu) in zip(params(b), params(b_cpu)) + @test gs_cpu[pcpu] ≈ Array(gs_gpu[pgpu]) + end +end + +@testset "Parallel" begin + @testset "zero sum" begin + input = randn(10, 10, 10, 10) |> gpu + layer_gpu = Parallel(+, zero, identity) |> gpu + @test layer_gpu(input) == input + @test layer_gpu(input) isa CUDA.CuArray + end + + @testset "vararg input" begin + inputs = (randn(10), randn(5), randn(4)) .|> gpu + layer = Parallel(+, Dense(10, 2), Dense(5, 2), Dense(4, 2)) |> gpu + @test size(layer(inputs)) == (2,) + end + + @testset "gradient" begin + input_cpu = randn(10, 10, 10, 10) + input_gpu = input_cpu |> gpu + layer_cpu = Parallel(+, x -> zero(x), identity) + layer_gpu = layer_cpu |> gpu + gs_cpu = gradient(() -> sum(abs2.(layer_cpu(input_cpu))), params(layer_cpu)) + gs_gpu = gradient(() -> sum(abs2.(layer_gpu(input_gpu))), params(layer_gpu)) + for (pgpu, pcpu) in zip(params(layer_cpu), params(layer_gpu)) + @test gs_cpu[pcpu] ≈ gs_gpu[pgpu] + end + end +end diff --git a/lib/FluxCUDA/test/losses.jl b/lib/FluxCUDA/test/losses.jl new file mode 100644 index 0000000000..6ed5089f9e --- /dev/null +++ b/lib/FluxCUDA/test/losses.jl @@ -0,0 +1,52 @@ +using Statistics + +using Flux.Losses: crossentropy, binarycrossentropy, logitbinarycrossentropy, binary_focal_loss, focal_loss + +# XXX: duplicated from Flux' tests +const ALL_LOSSES = [Flux.Losses.mse, Flux.Losses.mae, Flux.Losses.msle, + Flux.Losses.crossentropy, Flux.Losses.logitcrossentropy, + Flux.Losses.binarycrossentropy, Flux.Losses.logitbinarycrossentropy, + Flux.Losses.kldivergence, + Flux.Losses.huber_loss, + Flux.Losses.tversky_loss, + Flux.Losses.dice_coeff_loss, + Flux.Losses.poisson_loss, + Flux.Losses.hinge_loss, Flux.Losses.squared_hinge_loss, + Flux.Losses.binary_focal_loss, Flux.Losses.focal_loss] + + +@testset "Losses" begin + +x = [1.,2.,3.] 
+cx = gpu(x) +@test crossentropy(x,x) ≈ crossentropy(cx,cx) +@test crossentropy(x,x, agg=identity) ≈ crossentropy(cx,cx, agg=identity) |> cpu +@test crossentropy(x,x, agg=x->mean([1.0;2.0;3.0].*x)) ≈ crossentropy(cx,cx, agg=x->mean(gpu([1.0;2.0;3.0]).*x)) + +x = [-1.1491, 0.8619, 0.3127] +y = [1, 1, 0.] +@test binarycrossentropy(σ.(x), y) ≈ binarycrossentropy(gpu(σ.(x)), gpu(y)) +@test logitbinarycrossentropy(x, y) ≈ logitbinarycrossentropy(gpu(x), gpu(y)) + +x = [0.268941 0.5 0.268941 + 0.731059 0.5 0.731059] +y = [0 1 0 + 1 0 1] +@test binary_focal_loss(x, y) ≈ binary_focal_loss(gpu(x), gpu(y)) + +x = softmax(reshape(-7:7, 3, 5) .* 1f0) +y = [1 0 0 0 1 + 0 1 0 1 0 + 0 0 1 0 0] +@test focal_loss(x, y) ≈ focal_loss(gpu(x), gpu(y)) + +@testset "GPU grad tests" begin + x = rand(Float32, 3,3) + y = rand(Float32, 3,3) + + for loss in ALL_LOSSES + gpu_autodiff_test(loss, x, y) + end +end + +end #testset diff --git a/lib/FluxCUDA/test/runtests.jl b/lib/FluxCUDA/test/runtests.jl new file mode 100644 index 0000000000..40ba98e028 --- /dev/null +++ b/lib/FluxCUDA/test/runtests.jl @@ -0,0 +1,23 @@ +using Test + +using CUDA +CUDA.allowscalar(false) + +using Flux, FluxCUDA +Flux.default_gpu_converter[] = cu + +using Zygote +using Zygote: pullback + +include("test_utils.jl") +include("cuda.jl") +include("losses.jl") +include("layers.jl") + +if CUDA.has_cudnn() + @info "Testing Flux/CUDNN" + include("cudnn.jl") + include("curnn.jl") +else + @warn "CUDNN unavailable, not testing GPU DNN support" +end diff --git a/lib/FluxCUDA/test/test_utils.jl b/lib/FluxCUDA/test/test_utils.jl new file mode 100644 index 0000000000..bc0db37474 --- /dev/null +++ b/lib/FluxCUDA/test/test_utils.jl @@ -0,0 +1,72 @@ +function check_grad(g_gpu, g_cpu, atol, rtol) + @show g_gpu g_cpu + @test false +end +check_grad(g_gpu::Base.RefValue, g_cpu::Base.RefValue, atol, rtol) = + check_grad(g_gpu[], g_cpu[], atol, rtol) +check_grad(g_gpu::Nothing, g_cpu::Nothing, atol, rtol) = @test true +check_grad(g_gpu::Float32, g_cpu::Float32, atol, rtol) = @test g_cpu ≈ g_gpu rtol=rtol atol=atol +check_grad(g_gpu::CuArray{Float32}, g_cpu::Array{Float32}, atol, rtol) = + @test g_cpu ≈ collect(g_gpu) rtol=rtol atol=atol + +function check_grad(g_gpu::Tuple, g_cpu::Tuple, atol, rtol) + for (v1, v2) in zip(g_gpu, g_cpu) + check_grad(v1, v2, atol, rtol) + end +end + +function check_grad(g_gpu::NamedTuple, g_cpu::NamedTuple, atol, rtol) + for ((k1,v1), (k2,v2)) in zip(pairs(g_gpu), pairs(g_cpu)) + @test k1 == k2 + # @show k2 v2 + check_grad(v1, v2, atol, rtol) + end +end + +function gpu_autodiff_test(f_cpu, xs_cpu::Array{Float32}...; + test_equal=true, rtol=1e-4, atol=1e-4) + + check_type(x) = false + check_type(x::Float32) = true + check_type(x::CuArray{Float32}) = true + check_type(x::Array{Float32}) = true + + ### GRADIENT WITH RESPECT TO INPUT ##### + # y_cpu, back_cpu = pullback((f, x...) -> f(x...), f_cpu, xs_cpu...) + y_cpu, back_cpu = pullback((x...) -> f_cpu(x...), xs_cpu...) + @test check_type(y_cpu) + Δ_cpu = size(y_cpu) == () ? randn(Float32) : randn(Float32, size(y_cpu)) + gs_cpu = back_cpu(Δ_cpu) + + f_gpu = f_cpu |> gpu + xs_gpu = gpu.(xs_cpu) + Δ_gpu = Δ_cpu |> gpu + # y_gpu, back_gpu = pullback((f, x...) -> f(x...), f_gpu, xs_gpu...) + y_gpu, back_gpu = pullback((x...) -> f_gpu(x...), xs_gpu...) 
+ @test check_type(y_gpu) + gs_gpu = back_gpu(Δ_gpu) + + if test_equal + @test collect(y_cpu) ≈ collect(y_gpu) rtol=rtol atol=atol + for (g_gpu, g_cpu) in zip(gs_gpu, gs_cpu) + check_grad(g_gpu, g_cpu, atol, rtol) + end + end + + ### GRADIENT WITH RESPECT TO f ##### + ps_cpu = Flux.params(f_cpu) + y_cpu, back_cpu = pullback(() -> f_cpu(xs_cpu...), ps_cpu) + gs_cpu = back_cpu(Δ_cpu) + + ps_gpu = Flux.params(f_gpu) + y_gpu, back_gpu = pullback(() -> f_gpu(xs_gpu...), ps_gpu) + gs_gpu = back_gpu(Δ_gpu) + + if test_equal + @test collect(y_cpu) ≈ collect(y_gpu) rtol=rtol atol=atol + @assert length(ps_gpu) == length(ps_cpu) + for (p_gpu, p_cpu) in zip(ps_gpu, ps_cpu) + check_grad(gs_gpu[p_gpu], gs_cpu[p_cpu], atol, rtol) + end + end +end From 3e5803f58265bf30053c992a43b7173d2909f7f7 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 29 Apr 2021 15:50:48 +0200 Subject: [PATCH 05/11] Load Flux and CUDA via FluxCUDA. Avoids a needless listing in test/Project.toml, making it possible to pick up development versions. --- lib/FluxCUDA/test/Manifest.toml | 458 -------------------------------- lib/FluxCUDA/test/Project.toml | 2 - lib/FluxCUDA/test/cudnn.jl | 3 - lib/FluxCUDA/test/curnn.jl | 2 - lib/FluxCUDA/test/losses.jl | 2 +- lib/FluxCUDA/test/runtests.jl | 7 +- 6 files changed, 6 insertions(+), 468 deletions(-) delete mode 100644 lib/FluxCUDA/test/Manifest.toml diff --git a/lib/FluxCUDA/test/Manifest.toml b/lib/FluxCUDA/test/Manifest.toml deleted file mode 100644 index 51ed72ce90..0000000000 --- a/lib/FluxCUDA/test/Manifest.toml +++ /dev/null @@ -1,458 +0,0 @@ -# This file is machine-generated - editing it directly is not advised - -[[AbstractFFTs]] -deps = ["LinearAlgebra"] -git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0" -uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" -version = "1.0.1" - -[[AbstractTrees]] -git-tree-sha1 = "03e0550477d86222521d254b741d470ba17ea0b5" -uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" -version = "0.3.4" - -[[Adapt]] -deps = ["LinearAlgebra"] -git-tree-sha1 = "f1b523983a58802c4695851926203b36e28f09db" -uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" -version = "3.3.0" - -[[ArgTools]] -uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" - -[[Artifacts]] -uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" - -[[BFloat16s]] -deps = ["LinearAlgebra", "Test"] -git-tree-sha1 = "4af69e205efc343068dc8722b8dfec1ade89254a" -uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" -version = "0.1.0" - -[[Base64]] -uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" - -[[CEnum]] -git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" -uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" -version = "0.4.1" - -[[CUDA]] -deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "DataStructures", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "MacroTools", "Memoize", "Printf", "Random", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "Statistics", "TimerOutputs"] -git-tree-sha1 = "d4fa6486e94c4087f1d081d7be2d501a170bd51d" -uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" -version = "3.1.0" - -[[ChainRules]] -deps = ["ChainRulesCore", "Compat", "LinearAlgebra", "Random", "Reexport", "Requires", "Statistics"] -git-tree-sha1 = "1f410fba5c04d03ab712f348f1542e6059376547" -uuid = "082447d4-558c-5d27-93f4-14fc19e9eca2" -version = "0.7.61" - -[[ChainRulesCore]] -deps = ["Compat", "LinearAlgebra", "SparseArrays"] -git-tree-sha1 = "a66109c73612c63b10923ac446fddb0f0d21a593" -uuid = 
"d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" -version = "0.9.40" - -[[CodecZlib]] -deps = ["TranscodingStreams", "Zlib_jll"] -git-tree-sha1 = "ded953804d019afa9a3f98981d99b33e3db7b6da" -uuid = "944b1d66-785c-5afd-91f1-9de20f533193" -version = "0.7.0" - -[[ColorTypes]] -deps = ["FixedPointNumbers", "Random"] -git-tree-sha1 = "024fe24d83e4a5bf5fc80501a314ce0d1aa35597" -uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" -version = "0.11.0" - -[[Colors]] -deps = ["ColorTypes", "FixedPointNumbers", "Reexport"] -git-tree-sha1 = "417b0ed7b8b838aa6ca0a87aadf1bb9eb111ce40" -uuid = "5ae59095-9a9b-59fe-a467-6f913c188581" -version = "0.12.8" - -[[CommonSubexpressions]] -deps = ["MacroTools", "Test"] -git-tree-sha1 = "7b8a93dba8af7e3b42fecabf646260105ac373f7" -uuid = "bbf7d656-a473-5ed7-a52c-81e309532950" -version = "0.3.0" - -[[Compat]] -deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] -git-tree-sha1 = "ac4132ad78082518ec2037ae5770b6e796f7f956" -uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "3.27.0" - -[[CompilerSupportLibraries_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" - -[[DataAPI]] -git-tree-sha1 = "dfb3b7e89e395be1e25c2ad6d7690dc29cc53b1d" -uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" -version = "1.6.0" - -[[DataStructures]] -deps = ["Compat", "InteractiveUtils", "OrderedCollections"] -git-tree-sha1 = "4437b64df1e0adccc3e5d1adbc3ac741095e4677" -uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -version = "0.18.9" - -[[Dates]] -deps = ["Printf"] -uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" - -[[DelimitedFiles]] -deps = ["Mmap"] -uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" - -[[DiffResults]] -deps = ["StaticArrays"] -git-tree-sha1 = "c18e98cba888c6c25d1c3b048e4b3380ca956805" -uuid = "163ba53b-c6d8-5494-b064-1a9d43ac40c5" -version = "1.0.3" - -[[DiffRules]] -deps = ["NaNMath", "Random", "SpecialFunctions"] -git-tree-sha1 = "214c3fcac57755cfda163d91c58893a8723f93e9" -uuid = "b552c78f-8df3-52c6-915a-8e097449b14b" -version = "1.0.2" - -[[Distributed]] -deps = ["Random", "Serialization", "Sockets"] -uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" - -[[Downloads]] -deps = ["ArgTools", "LibCURL", "NetworkOptions"] -uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" - -[[ExprTools]] -git-tree-sha1 = "10407a39b87f29d47ebaca8edbc75d7c302ff93e" -uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" -version = "0.1.3" - -[[FillArrays]] -deps = ["LinearAlgebra", "Random", "SparseArrays"] -git-tree-sha1 = "31939159aeb8ffad1d4d8ee44d07f8558273120a" -uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" -version = "0.11.7" - -[[FixedPointNumbers]] -deps = ["Statistics"] -git-tree-sha1 = "335bfdceacc84c5cdf16aadc768aa5ddfc5383cc" -uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93" -version = "0.8.4" - -[[Flux]] -deps = ["AbstractTrees", "Adapt", "CUDA", "CodecZlib", "Colors", "DelimitedFiles", "Functors", "Juno", "LinearAlgebra", "MacroTools", "NNlib", "NNlibCUDA", "Pkg", "Printf", "Random", "Reexport", "SHA", "Statistics", "StatsBase", "Test", "ZipFile", "Zygote"] -git-tree-sha1 = "5e94fff7b4385fdd059863300b6b25ea0f849dda" -uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c" -version = "0.12.3" - -[[ForwardDiff]] -deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "LinearAlgebra", "NaNMath", "Printf", "Random", "SpecialFunctions", "StaticArrays"] -git-tree-sha1 = 
"e2af66012e08966366a43251e1fd421522908be6" -uuid = "f6369f11-7733-5829-9624-2563aa707210" -version = "0.10.18" - -[[Functors]] -deps = ["MacroTools"] -git-tree-sha1 = "a7bb2af991c43dcf5c3455d276dd83976799634f" -uuid = "d9f16b24-f501-4c13-a1f2-28368ffc5196" -version = "0.2.1" - -[[GPUArrays]] -deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization"] -git-tree-sha1 = "9c95b2fd5c16bc7f97371e9f92f0fef77e0f5957" -uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" -version = "6.2.2" - -[[GPUCompiler]] -deps = ["DataStructures", "ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "Serialization", "TimerOutputs", "UUIDs"] -git-tree-sha1 = "6eadd2321dc3ac0fc9d530ab01c2caa7fe5d74c6" -uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" -version = "0.11.4" - -[[IRTools]] -deps = ["InteractiveUtils", "MacroTools", "Test"] -git-tree-sha1 = "c67e7515a11f726f44083e74f218d134396d6510" -uuid = "7869d1d1-7146-5819-86e3-90919afe41df" -version = "0.4.2" - -[[InteractiveUtils]] -deps = ["Markdown"] -uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" - -[[JLLWrappers]] -deps = ["Preferences"] -git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" -uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" -version = "1.3.0" - -[[Juno]] -deps = ["Base64", "Logging", "Media", "Profile"] -git-tree-sha1 = "07cb43290a840908a771552911a6274bc6c072c7" -uuid = "e5e0dc1b-0480-54bc-9374-aad01c23163d" -version = "0.8.4" - -[[LLVM]] -deps = ["CEnum", "Libdl", "Printf", "Unicode"] -git-tree-sha1 = "b616937c31337576360cb9fb872ec7633af7b194" -uuid = "929cbde3-209d-540e-8aea-75f648917ca0" -version = "3.6.0" - -[[LazyArtifacts]] -deps = ["Artifacts", "Pkg"] -uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" - -[[LibCURL]] -deps = ["LibCURL_jll", "MozillaCACerts_jll"] -uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" - -[[LibCURL_jll]] -deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] -uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" - -[[LibGit2]] -deps = ["Base64", "NetworkOptions", "Printf", "SHA"] -uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" - -[[LibSSH2_jll]] -deps = ["Artifacts", "Libdl", "MbedTLS_jll"] -uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" - -[[Libdl]] -uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" - -[[LinearAlgebra]] -deps = ["Libdl"] -uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" - -[[Logging]] -uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" - -[[MacroTools]] -deps = ["Markdown", "Random"] -git-tree-sha1 = "6a8a2a625ab0dea913aba95c11370589e0239ff0" -uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" -version = "0.5.6" - -[[Markdown]] -deps = ["Base64"] -uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" - -[[MbedTLS_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" - -[[Media]] -deps = ["MacroTools", "Test"] -git-tree-sha1 = "75a54abd10709c01f1b86b84ec225d26e840ed58" -uuid = "e89f7d12-3494-54d1-8411-f7d8b9ae1f27" -version = "0.5.0" - -[[Memoize]] -deps = ["MacroTools"] -git-tree-sha1 = "2b1dfcba103de714d31c033b5dacc2e4a12c7caa" -uuid = "c03570c3-d221-55d1-a50c-7939bbd78826" -version = "0.4.4" - -[[Missings]] -deps = ["DataAPI"] -git-tree-sha1 = "4ea90bd5d3985ae1f9a908bd4500ae88921c5ce7" -uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" -version = "1.0.0" - -[[Mmap]] -uuid = "a63ad114-7e13-5084-954f-fe012c677804" - -[[MozillaCACerts_jll]] -uuid = "14a3606d-f60d-562e-9121-12d972cd8159" - -[[NNlib]] -deps = ["Adapt", "ChainRulesCore", "Compat", "LinearAlgebra", "Pkg", "Requires", "Statistics"] -git-tree-sha1 = 
"80b8360670f445d88b3475e88b33bbcc92f7866e" -uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" -version = "0.7.19" - -[[NNlibCUDA]] -deps = ["CUDA", "LinearAlgebra", "NNlib", "Random", "Statistics"] -git-tree-sha1 = "4b368b466bcdd25d448a5b20de4b7e481d68b88e" -uuid = "a00861dc-f156-4864-bf3c-e6376f28a68d" -version = "0.1.0" - -[[NaNMath]] -git-tree-sha1 = "bfe47e760d60b82b66b61d2d44128b62e3a369fb" -uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3" -version = "0.3.5" - -[[NetworkOptions]] -uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" - -[[OpenSpecFun_jll]] -deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "b9b8b8ed236998f91143938a760c2112dceeb2b4" -uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" -version = "0.5.4+0" - -[[OrderedCollections]] -git-tree-sha1 = "4fa2ba51070ec13fcc7517db714445b4ab986bdf" -uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.4.0" - -[[Pkg]] -deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] -uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" - -[[Preferences]] -deps = ["TOML"] -git-tree-sha1 = "ea79e4c9077208cd3bc5d29631a26bc0cff78902" -uuid = "21216c6a-2e73-6563-6e65-726566657250" -version = "1.2.1" - -[[Printf]] -deps = ["Unicode"] -uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" - -[[Profile]] -deps = ["Printf"] -uuid = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79" - -[[REPL]] -deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] -uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" - -[[Random]] -deps = ["Serialization"] -uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" - -[[RandomNumbers]] -deps = ["Random", "Requires"] -git-tree-sha1 = "441e6fc35597524ada7f85e13df1f4e10137d16f" -uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" -version = "1.4.0" - -[[Reexport]] -git-tree-sha1 = "57d8440b0c7d98fc4f889e478e80f268d534c9d5" -uuid = "189a3867-3050-52da-a836-e630ba90ab69" -version = "1.0.0" - -[[Requires]] -deps = ["UUIDs"] -git-tree-sha1 = "4036a3bd08ac7e968e27c203d45f5fff15020621" -uuid = "ae029012-a4dd-5104-9daa-d747884805df" -version = "1.1.3" - -[[SHA]] -uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" - -[[Scratch]] -deps = ["Dates"] -git-tree-sha1 = "ad4b278adb62d185bbcb6864dc24959ab0627bf6" -uuid = "6c6a2e73-6563-6170-7368-637461726353" -version = "1.0.3" - -[[Serialization]] -uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" - -[[SharedArrays]] -deps = ["Distributed", "Mmap", "Random", "Serialization"] -uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" - -[[Sockets]] -uuid = "6462fe0b-24de-5631-8697-dd941f90decc" - -[[SortingAlgorithms]] -deps = ["DataStructures", "Random", "Test"] -git-tree-sha1 = "03f5898c9959f8115e30bc7226ada7d0df554ddd" -uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c" -version = "0.3.1" - -[[SparseArrays]] -deps = ["LinearAlgebra", "Random"] -uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" - -[[SpecialFunctions]] -deps = ["ChainRulesCore", "OpenSpecFun_jll"] -git-tree-sha1 = "5919936c0e92cff40e57d0ddf0ceb667d42e5902" -uuid = "276daf66-3868-5448-9aa4-cd146d93841b" -version = "1.3.0" - -[[StaticArrays]] -deps = ["LinearAlgebra", "Random", "Statistics"] -git-tree-sha1 = "2653e9c769343808781a8bd5010ee7a17c01152e" -uuid = "90137ffa-7385-5640-81b9-e52037218182" -version = "1.1.2" - -[[Statistics]] -deps = ["LinearAlgebra", "SparseArrays"] -uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" - -[[StatsBase]] -deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", 
"SortingAlgorithms", "SparseArrays", "Statistics"] -git-tree-sha1 = "4d8ca45223d7a28839e775d73a6f6b6b2ac64fd1" -uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" -version = "0.33.6" - -[[TOML]] -deps = ["Dates"] -uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" - -[[Tar]] -deps = ["ArgTools", "SHA"] -uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" - -[[Test]] -deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] -uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" - -[[TimerOutputs]] -deps = ["Printf"] -git-tree-sha1 = "32cdbe6cd2d214c25a0b88f985c9e0092877c236" -uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" -version = "0.5.8" - -[[TranscodingStreams]] -deps = ["Random", "Test"] -git-tree-sha1 = "7c53c35547de1c5b9d46a4797cf6d8253807108c" -uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" -version = "0.9.5" - -[[UUIDs]] -deps = ["Random", "SHA"] -uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" - -[[Unicode]] -uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" - -[[ZipFile]] -deps = ["Libdl", "Printf", "Zlib_jll"] -git-tree-sha1 = "c3a5637e27e914a7a445b8d0ad063d701931e9f7" -uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" -version = "0.9.3" - -[[Zlib_jll]] -deps = ["Libdl"] -uuid = "83775a58-1f1d-513f-b197-d71354ab007a" - -[[Zygote]] -deps = ["AbstractFFTs", "ChainRules", "ChainRulesCore", "DiffRules", "Distributed", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"] -git-tree-sha1 = "927209c83efa62256788a9880c191774c07c5b51" -uuid = "e88e6eb3-aa80-5325-afca-941959d7151f" -version = "0.6.10" - -[[ZygoteRules]] -deps = ["MacroTools"] -git-tree-sha1 = "9e7a1e8ca60b742e508a315c17eef5211e7fbfd7" -uuid = "700de1a5-db45-46bc-99cf-38207098b444" -version = "0.2.1" - -[[nghttp2_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" - -[[p7zip_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" diff --git a/lib/FluxCUDA/test/Project.toml b/lib/FluxCUDA/test/Project.toml index 8d9b755315..e5dd177881 100644 --- a/lib/FluxCUDA/test/Project.toml +++ b/lib/FluxCUDA/test/Project.toml @@ -1,6 +1,4 @@ [deps] -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" -Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/lib/FluxCUDA/test/cudnn.jl b/lib/FluxCUDA/test/cudnn.jl index 5d1727e984..bbb9d644b7 100644 --- a/lib/FluxCUDA/test/cudnn.jl +++ b/lib/FluxCUDA/test/cudnn.jl @@ -1,6 +1,3 @@ -using Flux, CUDA, Test -using Flux: pullback - @testset "CUDNN BatchNorm" begin @testset "4D Input" begin x = rand(Float32, 2, 2, 3, 4) diff --git a/lib/FluxCUDA/test/curnn.jl b/lib/FluxCUDA/test/curnn.jl index 63a5f93ada..750acf5b12 100644 --- a/lib/FluxCUDA/test/curnn.jl +++ b/lib/FluxCUDA/test/curnn.jl @@ -1,5 +1,3 @@ -using Flux, CUDA, Test - @testset for R in [RNN, GRU, LSTM] m = R(10, 5) |> gpu x = gpu(rand(10)) diff --git a/lib/FluxCUDA/test/losses.jl b/lib/FluxCUDA/test/losses.jl index 6ed5089f9e..60b4469df2 100644 --- a/lib/FluxCUDA/test/losses.jl +++ b/lib/FluxCUDA/test/losses.jl @@ -1,6 +1,6 @@ using Statistics -using Flux.Losses: crossentropy, binarycrossentropy, logitbinarycrossentropy, binary_focal_loss, focal_loss +using .Flux.Losses: crossentropy, binarycrossentropy, logitbinarycrossentropy, binary_focal_loss, focal_loss # XXX: duplicated from Flux' tests const ALL_LOSSES = [Flux.Losses.mse, 
Flux.Losses.mae, Flux.Losses.msle, diff --git a/lib/FluxCUDA/test/runtests.jl b/lib/FluxCUDA/test/runtests.jl index 40ba98e028..c9deebdff3 100644 --- a/lib/FluxCUDA/test/runtests.jl +++ b/lib/FluxCUDA/test/runtests.jl @@ -1,9 +1,12 @@ using Test -using CUDA +using FluxCUDA +using FluxCUDA: Flux, CUDA + +using .CUDA CUDA.allowscalar(false) -using Flux, FluxCUDA +using .Flux Flux.default_gpu_converter[] = cu using Zygote From f9d7124941501100bbad244157ef72a67243434b Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 29 Apr 2021 16:24:04 +0200 Subject: [PATCH 06/11] Use GPUArrays 6.3's global allowscalar toggle. --- lib/FluxCUDA/test/runtests.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/FluxCUDA/test/runtests.jl b/lib/FluxCUDA/test/runtests.jl index c9deebdff3..f92f8f8522 100644 --- a/lib/FluxCUDA/test/runtests.jl +++ b/lib/FluxCUDA/test/runtests.jl @@ -4,7 +4,9 @@ using FluxCUDA using FluxCUDA: Flux, CUDA using .CUDA -CUDA.allowscalar(false) +# XXX: allowscalar is currently not inherited by child tasks, so set it globally +#CUDA.allowscalar(false) +ENV["JULIA_GPU_ALLOWSCALAR"] = "false" using .Flux Flux.default_gpu_converter[] = cu From ed8e0028fdff7e7d40ed8fe93bdff69e50a5b383 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 4 May 2021 08:12:16 +0200 Subject: [PATCH 07/11] Add FluxAMDGPU.jl. --- .buildkite/pipeline.yml | 21 +++ lib/FluxAMDGPU/Project.toml | 13 ++ lib/FluxAMDGPU/src/FluxAMDGPU.jl | 13 ++ lib/FluxAMDGPU/test/Project.toml | 5 + lib/FluxAMDGPU/test/core.jl | 63 +++++++++ lib/FluxAMDGPU/test/layers.jl | 221 ++++++++++++++++++++++++++++++ lib/FluxAMDGPU/test/losses.jl | 38 +++++ lib/FluxAMDGPU/test/runtests.jl | 20 +++ lib/FluxAMDGPU/test/test_utils.jl | 72 ++++++++++ 9 files changed, 466 insertions(+) create mode 100644 lib/FluxAMDGPU/Project.toml create mode 100644 lib/FluxAMDGPU/src/FluxAMDGPU.jl create mode 100644 lib/FluxAMDGPU/test/Project.toml create mode 100644 lib/FluxAMDGPU/test/core.jl create mode 100644 lib/FluxAMDGPU/test/layers.jl create mode 100644 lib/FluxAMDGPU/test/losses.jl create mode 100644 lib/FluxAMDGPU/test/runtests.jl create mode 100644 lib/FluxAMDGPU/test/test_utils.jl diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index a11e983e00..02751f72b8 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -17,3 +17,24 @@ steps: cuda: "*" if: build.message !~ /\[skip tests\]/ timeout_in_minutes: 60 + + - label: "AMD support" + plugins: + - JuliaCI/julia#v1: + version: 1.6 + command: | + julia -e 'using Pkg; + + println("--- :julia: Instantiating project"); + Pkg.develop(PackageSpec(path=pwd())); + Pkg.develop(PackageSpec(path=joinpath(pwd(), "lib/FluxAMDGPU"))); + + println("+++ :julia: Running tests"); + Pkg.test("FluxAMDGPU"; coverage=true);' + soft_fail: + - exit_status: 1 + agents: + queue: "juliagpu" + rocm: "*" + if: build.message !~ /\[skip tests\]/ + timeout_in_minutes: 60 diff --git a/lib/FluxAMDGPU/Project.toml b/lib/FluxAMDGPU/Project.toml new file mode 100644 index 0000000000..193a8c42ee --- /dev/null +++ b/lib/FluxAMDGPU/Project.toml @@ -0,0 +1,13 @@ +name = "FluxAMDGPU" +uuid = "15448036-796b-45b3-936c-e3e32bc623ba" +authors = ["Julian P Samaroo "] +version = "0.1.0" + +[deps] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" +Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" + +[compat] +AMDGPU = "0.2" +Flux = "0.12" +julia = "1.6" diff --git a/lib/FluxAMDGPU/src/FluxAMDGPU.jl b/lib/FluxAMDGPU/src/FluxAMDGPU.jl new file mode 100644 index 0000000000..4e93d59d46 --- /dev/null +++ 
b/lib/FluxAMDGPU/src/FluxAMDGPU.jl @@ -0,0 +1,13 @@ +module FluxAMDGPU + +using Flux +using AMDGPU + +### onehot + +import Flux: OneHotArray, OneHotLike, _onehot_bool_type + +_onehot_bool_type(x::OneHotLike{<:Any, <:Any, <:Any, N, <:ROCArray}) where N = ROCArray{Bool, N} +Base.BroadcastStyle(::Type{<:OneHotArray{<:Any, <:Any, <:Any, N, <:ROCArray}}) where N = AMDGPU.ROCArrayStyle{N}() + +end # module diff --git a/lib/FluxAMDGPU/test/Project.toml b/lib/FluxAMDGPU/test/Project.toml new file mode 100644 index 0000000000..e5dd177881 --- /dev/null +++ b/lib/FluxAMDGPU/test/Project.toml @@ -0,0 +1,5 @@ +[deps] +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" diff --git a/lib/FluxAMDGPU/test/core.jl b/lib/FluxAMDGPU/test/core.jl new file mode 100644 index 0000000000..18ae76536d --- /dev/null +++ b/lib/FluxAMDGPU/test/core.jl @@ -0,0 +1,63 @@ +import .Flux: cpu, gpu +using Statistics: mean +using LinearAlgebra: I, cholesky, Cholesky + +@testset "Core" begin + x = randn(5, 5) + cx = gpu(x) + @test cx isa ROCArray + + @test Flux.onecold(gpu([1.0, 2.0, 3.0])) == 3 + + x = Flux.onehotbatch([1, 2, 3], 1:3) + cx = gpu(x) + @test cx isa Flux.OneHotMatrix && cx.indices isa ROCArray + @test (cx .+ 1) isa ROCArray + + m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax) + cm = gpu(m) + + @test all(p isa ROCArray for p in params(cm)) + @test cm(gpu(rand(10, 10))) isa ROCArray{Float32,2} + + xs = rand(5, 5) + ys = Flux.onehotbatch(1:5,1:5) + @test collect(roc(xs) .+ roc(ys)) ≈ collect(xs .+ ys) + + c = gpu(Conv((2,2),3=>4)) + x = gpu(rand(10, 10, 3, 2)) + l = c(gpu(rand(10,10,3,2))) + @test gradient(x -> sum(c(x)), x)[1] isa ROCArray + + c = gpu(CrossCor((2,2),3=>4)) + x = gpu(rand(10, 10, 3, 2)) + l = c(gpu(rand(10,10,3,2))) + @test gradient(x -> sum(c(x)), x)[1] isa ROCArray + +end + +@testset "onecold gpu" begin + y = Flux.onehotbatch(ones(3), 1:10) |> gpu; + l = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'] + @test Flux.onecold(y) isa ROCArray + @test y[3,:] isa ROCArray + @test Flux.onecold(y, l) == ['a', 'a', 'a'] +end + +@testset "restructure gpu" begin + dudt = Dense(1,1) |> gpu + p,re = Flux.destructure(dudt) + foo(x) = sum(re(p)(x)) + @test gradient(foo, roc(rand(1)))[1] isa ROCArray +end + +@testset "GPU functors" begin + @testset "Cholesky" begin + M = 2.0*I(10) |> collect + Q = cholesky(M) + Q_gpu = Q |> gpu + @test Q_gpu isa Cholesky{<:Any,<:ROCArray} + Q_cpu = Q_gpu |> cpu + @test Q_cpu == cholesky(eltype(Q_gpu).(M)) + end +end diff --git a/lib/FluxAMDGPU/test/layers.jl b/lib/FluxAMDGPU/test/layers.jl new file mode 100644 index 0000000000..da6e431c09 --- /dev/null +++ b/lib/FluxAMDGPU/test/layers.jl @@ -0,0 +1,221 @@ +# Test layers and data/model movements on and off the GPU +# Add tests for layers and their gradients on the GPU +# Most of the forward passes should be fine being applied +# to bitstype objects, but this gives higher coverage for our use-cases +# Check that getting the gradients does not throw + +# generic movement tests +@testset "Basic GPU Movement" begin + @test gradient(x -> sum(gpu(x)), rand(3,3)) isa Tuple + @test gradient(x -> sum(cpu(x)), gpu(rand(3,3))) isa Tuple +end + +# TODO: These layers get into scalar indexing +# `AlphaDropout` throws a compilation error on GPUs, +# whereas, the rest are scalar indexing issues. 
+const BROKEN_LAYERS = Union{DepthwiseConv, + AlphaDropout} + +function gpu_gradtest(name::String, layers::Vector, x_cpu, args...; + setmode=false, test_cpu=true, rtol=1e-5, atol=1e-5) + @testset "$name GPU grad tests" begin + for layer in layers + @testset "$layer GPU grad test" begin + l_cpu = layer(args...) + if l_cpu isa BROKEN_LAYERS + l_gpu, x_gpu = l_cpu |> gpu, x_cpu |> gpu + @test_broken gradient(() -> sum(l_gpu(x_gpu)), Flux.params(l_gpu)) isa Flux.Zygote.Grads + else + gpu_autodiff_test(l_cpu, x_cpu, + test_equal=test_cpu, rtol=rtol, atol=atol) + if setmode + testmode!(l_cpu) + gpu_autodiff_test(l_cpu, x_cpu, + test_equal=test_cpu, rtol=rtol, atol=atol) + end + end + end + end + end +end + + +# Just to give testset in gradtest meaningful labels +ConvNoBias(args...) = Conv(args...; bias=false) +ConvTransposeNoBias(args...) = ConvTranspose(args...; bias=false) +CrossCorNoBias(args...) = CrossCor(args...; bias=false) +DepthwiseConvNoBias(args...) = DepthwiseConv(args...; bias=false) +r = rand(Float32, 28, 28, 1, 1) +conv_layers = [Conv, ConvNoBias, ConvTranspose, ConvTransposeNoBias, CrossCor, CrossCorNoBias, DepthwiseConv, DepthwiseConvNoBias] +gpu_gradtest("Conv", conv_layers, r, (2,2), 1=>3) + +pooling_layers = [MaxPool, MeanPool] +gpu_gradtest("Pooling", pooling_layers, r, (2,2)) + +adaptive_pooling_layers = [AdaptiveMaxPool, AdaptiveMeanPool] +gpu_gradtest("AdaptivePooling", adaptive_pooling_layers, r, (7,7)) + +dropout_layers = [Dropout, AlphaDropout] +gpu_gradtest("Dropout", dropout_layers, r, 0.5f0; test_cpu=false, setmode=true) # dropout is not deterministic + +layer_norm = [i -> LayerNorm(i; affine=false), i -> LayerNorm(i; affine=true)] +gpu_gradtest("LayerNorm 1", layer_norm, rand(Float32, 8, 8, 3, 4), 8) +gpu_gradtest("LayerNorm 2", layer_norm, rand(Float32, 8, 8, 3, 4), (8,8)) +gpu_gradtest("LayerNorm 3", layer_norm, rand(Float32, 5, 4), 5) + +batch_norm = [BatchNorm] +gpu_gradtest("BatchNorm 3d", batch_norm, rand(Float32, 8, 8, 8, 3, 4), 3, setmode=false) # bug in CUDA.jl with gradient in testmode +gpu_gradtest("BatchNorm 2d", batch_norm, rand(Float32, 8, 8, 3, 4), 3, setmode=false) # bug in CUDA.jl with gradient in testmode +gpu_gradtest("BatchNorm 1d", batch_norm, rand(Float32, 8, 3, 4), 3, setmode=false) # bug in CUDA.jl with gradient in testmode +gpu_gradtest("BatchNorm fullyconn", batch_norm, rand(Float32, 5,4), 5, setmode=false) + +instancenorm = [i -> InstanceNorm(i; affine=false), i -> InstanceNorm(i; affine=true)] +gpu_gradtest("InstanceNorm 3d", instancenorm, rand(Float32, 8, 8, 8, 3, 4), 3, setmode=true) +gpu_gradtest("InstanceNorm 2d", instancenorm, rand(Float32, 8, 8, 3, 4), 3, setmode=true) +gpu_gradtest("InstanceNorm 1d", instancenorm, rand(Float32, 8, 3, 4), 3, setmode=true) + +groupnorm = [(i, j) -> GroupNorm(i, j; affine=false), (i, j) -> GroupNorm(i, j; affine=true)] +gpu_gradtest("GroupNorm 3d", groupnorm, rand(Float32, 8, 8, 8, 12, 4), 12, 3, setmode=true) +gpu_gradtest("GroupNorm 2d", groupnorm, rand(Float32, 8, 8, 12, 4), 12, 3, setmode=true) +gpu_gradtest("GroupNorm 1d", groupnorm, rand(Float32, 8, 3, 12, 4), 12, 3, setmode=true) + +upsample = [x -> Upsample(scale=x)] +gpu_gradtest("Upsample 2d", upsample, rand(Float32, 3, 4, 2, 3), (2,2)) +gpu_gradtest("Upsample 1d", upsample, rand(Float32, 3, 4, 2, 3), (2,)) + +pixelshuffle = [PixelShuffle] +gpu_gradtest("PixelShuffle 2d", pixelshuffle, rand(Float32, 3, 4, 18, 3), 3) +gpu_gradtest("PixelShuffle 1d", pixelshuffle, rand(Float32, 3, 18, 3), 3) + + +@testset "function layers" begin + x = 
rand(Float32, 3,3) + gpu_autodiff_test(x -> sum(Flux.normalise(x; dims=1)), x) + gpu_autodiff_test(x -> sum(Flux.normalise(x; dims=2)), x) + gpu_autodiff_test(x -> sum(Flux.normalise(x)), x) +end + +@testset "BatchNorm mix stuff" begin + m_cpu = BatchNorm(2) + m_gpu = m_cpu |> gpu + x_cpu = rand(Float32, 3, 2, 2) + x_gpu = x_cpu |> gpu + + ## In :auto mode, track statistics only in gradient contest + μ_cpu = copy(m_cpu.μ) + m_cpu(x_cpu) + @test m_cpu.μ ≈ μ_cpu + gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) + @test !(m_cpu.μ ≈ μ_cpu) + + μ_gpu = copy(m_gpu.μ) + m_gpu(x_gpu) + @test m_gpu.μ ≈ μ_gpu + gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) + @test !(m_gpu.μ ≈ μ_gpu) + + @test Array(m_gpu.μ) ≈ m_cpu.μ + + ## In testmode, never track statistics + testmode!(m_cpu) + μ_cpu = copy(m_cpu.μ) + m_cpu(x_cpu) + @test m_cpu.μ ≈ μ_cpu + gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) + @test m_cpu.μ ≈ μ_cpu + + testmode!(m_gpu) + μ_gpu = copy(m_gpu.μ) + m_gpu(x_gpu) + @test m_gpu.μ ≈ μ_gpu + gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) + @test m_gpu.μ ≈ μ_gpu + + ## In trainmode, always track statistics + trainmode!(m_cpu) + μ_cpu = copy(m_cpu.μ) + m_cpu(x_cpu) + @test !(m_cpu.μ ≈ μ_cpu) + μ_cpu = copy(m_cpu.μ) + gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) + @test !(m_cpu.μ ≈ μ_cpu) + + trainmode!(m_gpu) + μ_gpu = copy(m_gpu.μ) + m_gpu(x_gpu) + @test !(m_gpu.μ ≈ μ_gpu) + μ_gpu = copy(m_gpu.μ) + gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) + @test !(m_gpu.μ ≈ μ_gpu) + + ## No errors if input type mistmatch + x_cpu = rand(Float64, 3, 2, 2) + x_gpu = x_cpu |> gpu + m_cpu(x_cpu) + gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) + m_gpu(x_gpu) + gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) +end + +@testset "Zeros mapped for $cl" for cl in (Conv, ConvTranspose, CrossCor, DepthwiseConv) + l = cl((2,2), 1=>3, bias = false) |> gpu + ip = zeros(Float32, 28,28,1,1) |> gpu + if l isa BROKEN_LAYERS + @test_broken sum(l(ip)) ≈ 0.f0 + @test_broken gradient(() -> sum(l(ip)), Flux.params(l)) isa Flux.Zygote.Grads + else + @test sum(l(ip)) ≈ 0.f0 + gs = gradient(() -> sum(l(ip)), Flux.params(l)) + @test l.bias ∉ gs.params + end +end + +@testset "Dense with Zeros bias" begin + l = Dense(ones(Float32, 4,3), Flux.Zeros()) |> gpu + ip = zeros(Float32, 3, 7) |> gpu + + @test sum(l(ip)) ≈ 0.f0 + gs = gradient(() -> sum(l(ip)), Flux.params(l)) + @test l.b ∉ gs.params +end + +@testset "Two-streams Bilinear" begin + x = zeros(Float32,10,9) |> gpu + y = zeros(Float32,2,9) |> gpu + b = Flux.Bilinear(10, 2, 3) |> gpu + @test size(b(x,y)) == (3,9) + @test sum(abs2, b(x,y)) ≈ 0f0 + gs_gpu = gradient(() -> sum(abs2.(b(x, y))), params(b)) + b_cpu, x_cpu, y_cpu = b |> cpu, x |> cpu, y |> cpu + gs_cpu = gradient(() -> sum(abs2.(b_cpu(x_cpu, y_cpu))), params(b_cpu)) + for (pgpu, pcpu) in zip(params(b), params(b_cpu)) + @test gs_cpu[pcpu] ≈ Array(gs_gpu[pgpu]) + end +end + +@testset "Parallel" begin + @testset "zero sum" begin + input = randn(10, 10, 10, 10) |> gpu + layer_gpu = Parallel(+, zero, identity) |> gpu + @test layer_gpu(input) == input + @test layer_gpu(input) isa ROCArray + end + + @testset "vararg input" begin + inputs = (randn(10), randn(5), randn(4)) .|> gpu + layer = Parallel(+, Dense(10, 2), Dense(5, 2), Dense(4, 2)) |> gpu + @test size(layer(inputs)) == (2,) + end + + @testset "gradient" begin + input_cpu = randn(10, 10, 10, 10) + input_gpu = input_cpu |> gpu + layer_cpu = Parallel(+, x -> zero(x), identity) + layer_gpu = layer_cpu |> gpu + gs_cpu = 
gradient(() -> sum(abs2.(layer_cpu(input_cpu))), params(layer_cpu)) + gs_gpu = gradient(() -> sum(abs2.(layer_gpu(input_gpu))), params(layer_gpu)) + for (pgpu, pcpu) in zip(params(layer_cpu), params(layer_gpu)) + @test gs_cpu[pcpu] ≈ gs_gpu[pgpu] + end + end +end diff --git a/lib/FluxAMDGPU/test/losses.jl b/lib/FluxAMDGPU/test/losses.jl new file mode 100644 index 0000000000..981ddd1244 --- /dev/null +++ b/lib/FluxAMDGPU/test/losses.jl @@ -0,0 +1,38 @@ +using .Flux.Losses: crossentropy, binarycrossentropy, logitbinarycrossentropy, binary_focal_loss, focal_loss + + +@testset "Losses" begin + +x = [1.,2.,3.] +cx = gpu(x) +@test crossentropy(x,x) ≈ crossentropy(cx,cx) +@test crossentropy(x,x, agg=identity) ≈ crossentropy(cx,cx, agg=identity) |> cpu +@test crossentropy(x,x, agg=x->mean([1.0;2.0;3.0].*x)) ≈ crossentropy(cx,cx, agg=x->mean(gpu([1.0;2.0;3.0]).*x)) + +x = [-1.1491, 0.8619, 0.3127] +y = [1, 1, 0.] +@test binarycrossentropy(σ.(x), y) ≈ binarycrossentropy(gpu(σ.(x)), gpu(y)) +@test logitbinarycrossentropy(x, y) ≈ logitbinarycrossentropy(gpu(x), gpu(y)) + +x = [0.268941 0.5 0.268941 + 0.731059 0.5 0.731059] +y = [0 1 0 + 1 0 1] +@test binary_focal_loss(x, y) ≈ binary_focal_loss(gpu(x), gpu(y)) + +x = softmax(reshape(-7:7, 3, 5) .* 1f0) +y = [1 0 0 0 1 + 0 1 0 1 0 + 0 0 1 0 0] +@test focal_loss(x, y) ≈ focal_loss(gpu(x), gpu(y)) + +@testset "GPU grad tests" begin + x = rand(Float32, 3,3) + y = rand(Float32, 3,3) + + for loss in ALL_LOSSES + gpu_autodiff_test(loss, x, y) + end +end + +end #testset diff --git a/lib/FluxAMDGPU/test/runtests.jl b/lib/FluxAMDGPU/test/runtests.jl new file mode 100644 index 0000000000..7a030438d6 --- /dev/null +++ b/lib/FluxAMDGPU/test/runtests.jl @@ -0,0 +1,20 @@ +using Test + +using FluxAMDGPU +using FluxAMDGPU: Flux, AMDGPU + +using .AMDGPU +# XXX: allowscalar is currently not inherited by child tasks, so set it globally +#AMDGPU.allowscalar(false) +ENV["JULIA_GPU_ALLOWSCALAR"] = "false" + +using .Flux +Flux.default_gpu_converter[] = AMDGPU.roc + +using Zygote +using Zygote: pullback + +include("test_utils.jl") +include("core.jl") +include("losses.jl") +include("layers.jl") diff --git a/lib/FluxAMDGPU/test/test_utils.jl b/lib/FluxAMDGPU/test/test_utils.jl new file mode 100644 index 0000000000..f1b7040a4a --- /dev/null +++ b/lib/FluxAMDGPU/test/test_utils.jl @@ -0,0 +1,72 @@ +function check_grad(g_gpu, g_cpu, atol, rtol) + @show g_gpu g_cpu + @test false +end +check_grad(g_gpu::Base.RefValue, g_cpu::Base.RefValue, atol, rtol) = + check_grad(g_gpu[], g_cpu[], atol, rtol) +check_grad(g_gpu::Nothing, g_cpu::Nothing, atol, rtol) = @test true +check_grad(g_gpu::Float32, g_cpu::Float32, atol, rtol) = @test g_cpu ≈ g_gpu rtol=rtol atol=atol +check_grad(g_gpu::ROCArray{Float32}, g_cpu::Array{Float32}, atol, rtol) = + @test g_cpu ≈ collect(g_gpu) rtol=rtol atol=atol + +function check_grad(g_gpu::Tuple, g_cpu::Tuple, atol, rtol) + for (v1, v2) in zip(g_gpu, g_cpu) + check_grad(v1, v2, atol, rtol) + end +end + +function check_grad(g_gpu::NamedTuple, g_cpu::NamedTuple, atol, rtol) + for ((k1,v1), (k2,v2)) in zip(pairs(g_gpu), pairs(g_cpu)) + @test k1 == k2 + # @show k2 v2 + check_grad(v1, v2, atol, rtol) + end +end + +function gpu_autodiff_test(f_cpu, xs_cpu::Array{Float32}...; + test_equal=true, rtol=1e-4, atol=1e-4) + + check_type(x) = false + check_type(x::Float32) = true + check_type(x::ROCArray{Float32}) = true + check_type(x::Array{Float32}) = true + + ### GRADIENT WITH RESPECT TO INPUT ##### + # y_cpu, back_cpu = pullback((f, x...) -> f(x...), f_cpu, xs_cpu...) 
+ y_cpu, back_cpu = pullback((x...) -> f_cpu(x...), xs_cpu...) + @test check_type(y_cpu) + Δ_cpu = size(y_cpu) == () ? randn(Float32) : randn(Float32, size(y_cpu)) + gs_cpu = back_cpu(Δ_cpu) + + f_gpu = f_cpu |> gpu + xs_gpu = gpu.(xs_cpu) + Δ_gpu = Δ_cpu |> gpu + # y_gpu, back_gpu = pullback((f, x...) -> f(x...), f_gpu, xs_gpu...) + y_gpu, back_gpu = pullback((x...) -> f_gpu(x...), xs_gpu...) + @test check_type(y_gpu) + gs_gpu = back_gpu(Δ_gpu) + + if test_equal + @test collect(y_cpu) ≈ collect(y_gpu) rtol=rtol atol=atol + for (g_gpu, g_cpu) in zip(gs_gpu, gs_cpu) + check_grad(g_gpu, g_cpu, atol, rtol) + end + end + + ### GRADIENT WITH RESPECT TO f ##### + ps_cpu = Flux.params(f_cpu) + y_cpu, back_cpu = pullback(() -> f_cpu(xs_cpu...), ps_cpu) + gs_cpu = back_cpu(Δ_cpu) + + ps_gpu = Flux.params(f_gpu) + y_gpu, back_gpu = pullback(() -> f_gpu(xs_gpu...), ps_gpu) + gs_gpu = back_gpu(Δ_gpu) + + if test_equal + @test collect(y_cpu) ≈ collect(y_gpu) rtol=rtol atol=atol + @assert length(ps_gpu) == length(ps_cpu) + for (p_gpu, p_cpu) in zip(ps_gpu, ps_cpu) + check_grad(gs_gpu[p_gpu], gs_cpu[p_cpu], atol, rtol) + end + end +end From 2eaffb31bae098d215dcb96a99bbab008bda6f4c Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 4 May 2021 08:48:40 +0200 Subject: [PATCH 08/11] Ignore subproject manifests. --- lib/FluxAMDGPU/.gitignore | 1 + lib/FluxCUDA/.gitignore | 1 + 2 files changed, 2 insertions(+) create mode 100644 lib/FluxAMDGPU/.gitignore create mode 100644 lib/FluxCUDA/.gitignore diff --git a/lib/FluxAMDGPU/.gitignore b/lib/FluxAMDGPU/.gitignore new file mode 100644 index 0000000000..ba39cc531e --- /dev/null +++ b/lib/FluxAMDGPU/.gitignore @@ -0,0 +1 @@ +Manifest.toml diff --git a/lib/FluxCUDA/.gitignore b/lib/FluxCUDA/.gitignore new file mode 100644 index 0000000000..ba39cc531e --- /dev/null +++ b/lib/FluxCUDA/.gitignore @@ -0,0 +1 @@ +Manifest.toml From ee9efbcc1890c92dbdc7b8dbb738c744b23ad522 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 4 May 2021 12:49:15 +0200 Subject: [PATCH 09/11] Remove _onehot_bool_type. 
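With the backend-specific helper gone, plain `Bool.(x)` does the same job: broadcasting picks its output container from the array's BroadcastStyle, so each glue package only needs the one `Base.BroadcastStyle` method it already defines for `OneHotArray`. A minimal CPU-only sketch of the idea (illustrative values, not taken from the test suite):

    using Flux

    oh = Flux.onehotbatch([1, 2, 3], 1:3)  # OneHotMatrix over a plain index vector
    b  = Bool.(oh)                         # broadcast materialises it; on the CPU this is a BitMatrix
    @assert b == Matrix(oh)

    # On a GPU the same `Bool.(oh)` dispatches through the glue package's
    # BroadcastStyle method instead, e.g. giving a CuArray{Bool} once FluxCUDA
    # is loaded, so no per-backend `_onehot_bool_type` lookup is needed.

This is also why the CPU test below now expects `BitMatrix` from `vcat` rather than `Array{Bool}`: a Bool-eltype broadcast on the CPU collects into a `BitArray`.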
--- lib/FluxAMDGPU/src/FluxAMDGPU.jl | 6 +++--- lib/FluxCUDA/src/onehot.jl | 7 +++---- src/onehot.jl | 6 ++---- test/onehot.jl | 2 +- 4 files changed, 9 insertions(+), 12 deletions(-) diff --git a/lib/FluxAMDGPU/src/FluxAMDGPU.jl b/lib/FluxAMDGPU/src/FluxAMDGPU.jl index 4e93d59d46..844ba954af 100644 --- a/lib/FluxAMDGPU/src/FluxAMDGPU.jl +++ b/lib/FluxAMDGPU/src/FluxAMDGPU.jl @@ -5,9 +5,9 @@ using AMDGPU ### onehot -import Flux: OneHotArray, OneHotLike, _onehot_bool_type +using Flux: OneHotArray, OneHotLike -_onehot_bool_type(x::OneHotLike{<:Any, <:Any, <:Any, N, <:ROCArray}) where N = ROCArray{Bool, N} -Base.BroadcastStyle(::Type{<:OneHotArray{<:Any, <:Any, <:Any, N, <:ROCArray}}) where N = AMDGPU.ROCArrayStyle{N}() +Base.BroadcastStyle(::Type{<:OneHotArray{<:Any, <:Any, <:Any, N, <:ROCArray}}) where N = + AMDGPU.ROCArrayStyle{N}() end # module diff --git a/lib/FluxCUDA/src/onehot.jl b/lib/FluxCUDA/src/onehot.jl index 760df1d27e..037991d053 100644 --- a/lib/FluxCUDA/src/onehot.jl +++ b/lib/FluxCUDA/src/onehot.jl @@ -1,5 +1,4 @@ -import Flux: OneHotArray, OneHotLike, _onehot_bool_type +using Flux: OneHotArray, OneHotLike -_onehot_bool_type(x::OneHotLike{<:Any, <:Any, <:Any, N, <:CuArray}) where N = CuArray{Bool, N} - -Base.BroadcastStyle(::Type{<:OneHotArray{<: Any, <: Any, <: Any, N, <: CuArray}}) where N = CUDA.CuArrayStyle{N}() +Base.BroadcastStyle(::Type{<:OneHotArray{<: Any, <: Any, <: Any, N, <: CuArray}}) where N = + CUDA.CuArrayStyle{N}() diff --git a/src/onehot.jl b/src/onehot.jl index edc5fc5f8b..72b28ca4e7 100644 --- a/src/onehot.jl +++ b/src/onehot.jl @@ -60,11 +60,9 @@ Base.getindex(x::OneHotArray{<:Any, L}, ::Colon, I...) where L = OneHotArray(x.i Base.getindex(x::OneHotArray{<:Any, <:Any, <:Any, N}, ::Vararg{Colon, N}) where N = x Base.getindex(x::OneHotArray, I::CartesianIndex{N}) where N = x[I[1], Tuple(I)[2:N]...] -_onehot_bool_type(x::OneHotLike{<:Any, <:Any, <:Any, N, <:Union{Integer, AbstractArray}}) where N = Array{Bool, N} - function Base.cat(x::OneHotLike{<:Any, L}, xs::OneHotLike{<:Any, L}...; dims::Int) where L if isone(dims) || any(x -> !_isonehot(x), (x, xs...)) - return cat(map(x -> convert(_onehot_bool_type(x), x), (x, xs...))...; dims = dims) + return cat(map(x -> Bool.(x), (x, xs...))...; dims = dims) else return OneHotArray(cat(_indices(x), _indices.(xs)...; dims = dims - 1), L) end @@ -164,7 +162,7 @@ function _fast_argmax(x::OneHotLike) if _isonehot(x) return _indices(x) else - return _fast_argmax(convert(_onehot_bool_type(x), x)) + return _fast_argmax(Bool.(x)) end end diff --git a/test/onehot.jl b/test/onehot.jl index ce30534ec9..60522c5e8a 100644 --- a/test/onehot.jl +++ b/test/onehot.jl @@ -99,7 +99,7 @@ end @testset "w/ cat" begin r = reshape(oa, 10, :) @test hcat(r, r) isa OneHotArray - @test vcat(r, r) isa Array{Bool} + @test vcat(r, r) isa BitMatrix end @testset "w/ argmax" begin From 49f94d0bca1eec599e5dac5a206e71bfcebcddcf Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 4 May 2021 12:54:18 +0200 Subject: [PATCH 10/11] Automatically register GPU converters upon loading the glue package. 
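The user-visible effect is that loading a glue package is all it takes for `gpu` to target that back-end: the first package loaded claims `Flux.default_gpu_converter`, and any later one only warns instead of overriding it. A sketch of the intended workflow (assumes a CUDA-capable machine with FluxCUDA dev'ed next to Flux, as in the Buildkite pipeline):

    using Flux, FluxCUDA                      # __init__ registers `cu` as the default converter
    @assert Flux.default_gpu_converter[] !== identity

    m = Chain(Dense(10, 5, tanh), Dense(5, 2)) |> gpu   # lands on the GPU with no manual setup
    x = gpu(rand(Float32, 10, 3))
    y = m(x)                                            # a CuArray, given the assumption above

    # Loading FluxAMDGPU afterwards would leave the CUDA converter in place and
    # emit the "another one has been registered already" warning.

This is also why the test harnesses now `@assert` the converter instead of assigning it.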
--- lib/FluxAMDGPU/src/FluxAMDGPU.jl | 9 +++++++++ lib/FluxAMDGPU/test/runtests.jl | 2 +- lib/FluxCUDA/src/FluxCUDA.jl | 8 ++++++++ lib/FluxCUDA/test/runtests.jl | 2 +- 4 files changed, 19 insertions(+), 2 deletions(-) diff --git a/lib/FluxAMDGPU/src/FluxAMDGPU.jl b/lib/FluxAMDGPU/src/FluxAMDGPU.jl index 844ba954af..acffd3b4c8 100644 --- a/lib/FluxAMDGPU/src/FluxAMDGPU.jl +++ b/lib/FluxAMDGPU/src/FluxAMDGPU.jl @@ -10,4 +10,13 @@ using Flux: OneHotArray, OneHotLike Base.BroadcastStyle(::Type{<:OneHotArray{<:Any, <:Any, <:Any, N, <:ROCArray}}) where N = AMDGPU.ROCArrayStyle{N}() +function __init__() + if Flux.default_gpu_converter[] === identity + @info "Registering AMDGPU.jl as the default GPU converter" + Flux.default_gpu_converter[] = roc + else + @warn "Not registering AMDGPU.jl as the default GPU converter as another one has been registered already." + end +end + end # module diff --git a/lib/FluxAMDGPU/test/runtests.jl b/lib/FluxAMDGPU/test/runtests.jl index 7a030438d6..8b6c3a65f8 100644 --- a/lib/FluxAMDGPU/test/runtests.jl +++ b/lib/FluxAMDGPU/test/runtests.jl @@ -9,7 +9,7 @@ using .AMDGPU ENV["JULIA_GPU_ALLOWSCALAR"] = "false" using .Flux -Flux.default_gpu_converter[] = AMDGPU.roc +@assert Flux.default_gpu_converter[] == roc using Zygote using Zygote: pullback diff --git a/lib/FluxCUDA/src/FluxCUDA.jl b/lib/FluxCUDA/src/FluxCUDA.jl index 75de37eb34..2936118bff 100644 --- a/lib/FluxCUDA/src/FluxCUDA.jl +++ b/lib/FluxCUDA/src/FluxCUDA.jl @@ -9,5 +9,13 @@ include("onehot.jl") include("ctc.jl") include("cudnn.jl") +function __init__() + if Flux.default_gpu_converter[] === identity + @info "Registering CUDA.jl as the default GPU converter" + Flux.default_gpu_converter[] = cu + else + @warn "Not registering CUDA.jl as the default GPU converter as another one has been registered already." + end +end end # module diff --git a/lib/FluxCUDA/test/runtests.jl b/lib/FluxCUDA/test/runtests.jl index f92f8f8522..233ba77df4 100644 --- a/lib/FluxCUDA/test/runtests.jl +++ b/lib/FluxCUDA/test/runtests.jl @@ -9,7 +9,7 @@ using .CUDA ENV["JULIA_GPU_ALLOWSCALAR"] = "false" using .Flux -Flux.default_gpu_converter[] = cu +@assert Flux.default_gpu_converter[] == cu using Zygote using Zygote: pullback From b615dca0979ae8a74b7cbbfb2e7ccbc04df94073 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 25 Jun 2021 08:09:15 +0200 Subject: [PATCH 11/11] Add oneAPI back-end. 
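Note: the oneAPI glue registers a small adapt-based closure rather than a named helper like cu or roc. Roughly what that closure does, as a sketch (togpu is a hypothetical name used only for illustration, and a supported Intel GPU is assumed):

    using Adapt, oneAPI

    togpu(x) = adapt(oneArray, x)      # the same conversion the registered closure performs
    togpu(rand(Float32, 4, 4))         # expected: a 4x4 oneArray{Float32}

As the TODO in the code notes, the Zygote broadcast adjoint below is a candidate for generalization to all GPU arrays in Zygote.jl itself.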
--- lib/FluxOneAPI/Project.toml | 10 +++++++ lib/FluxOneAPI/src/FluxOneAPI.jl | 46 ++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 lib/FluxOneAPI/Project.toml create mode 100644 lib/FluxOneAPI/src/FluxOneAPI.jl diff --git a/lib/FluxOneAPI/Project.toml b/lib/FluxOneAPI/Project.toml new file mode 100644 index 0000000000..d290aee506 --- /dev/null +++ b/lib/FluxOneAPI/Project.toml @@ -0,0 +1,10 @@ +name = "FluxOneAPI" +uuid = "71a558b2-ada5-41b7-bb4b-aca4e46ab144" +authors = ["Tim Besard "] +version = "0.1.0" + +[deps] +Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" +Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" +oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" diff --git a/lib/FluxOneAPI/src/FluxOneAPI.jl b/lib/FluxOneAPI/src/FluxOneAPI.jl new file mode 100644 index 0000000000..5fa50630b2 --- /dev/null +++ b/lib/FluxOneAPI/src/FluxOneAPI.jl @@ -0,0 +1,46 @@ +module FluxOneAPI + +using Flux +using oneAPI +using Adapt +using Zygote +using Zygote: @adjoint + +### onehot + +using Flux: OneHotArray, OneHotLike + +Base.BroadcastStyle(::Type{<:OneHotArray{<:Any, <:Any, <:Any, N, <:oneArray}}) where N = + oneAPI.oneArrayStyle{N}() + +## zygote + +# TODO: generalize to GPUArray in Zygote.jl? + +@eval @adjoint function Base.broadcasted(::oneAPI.oneArrayStyle, f, args...) + y, back = Zygote.broadcast_forward(f, args...) + y, ȳ -> (nothing, nothing, back(ȳ)...) +end + +@adjoint oneArray{N,T}(xs::Array) where {N,T} = + oneArray{N,T}(xs), Δ -> (convert(Array, Δ), ) + +@adjoint function sum(xs::oneArray; dims = :) + placeholder = similar(xs) + sum(xs, dims = dims), Δ -> (placeholder .= Δ,) +end + +@adjoint function Base.convert(::Type{T}, xs::Array) where {T<:oneArray} + Base.convert(T, xs), Δ -> (nothing, Base.convert(Array, Δ),) +end + +function __init__() + if Flux.default_gpu_converter[] === identity + @info "Registering oneAPI.jl as the default GPU converter" + Flux.default_gpu_converter[] = (x)->adapt(oneArray, x) + else + @warn "Not registering oneAPI.jl as the default GPU converter as another one has been registered already." + end +end + +end # module
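With all three glue packages in place, back-end selection is purely a question of which one gets loaded first. A hedged end-to-end sketch (assumes exactly one of the lib/Flux* subprojects is installed with a working GPU stack; model, data and loss are illustrative, not taken from the test suite):

    using Flux
    using FluxOneAPI               # or FluxCUDA / FluxAMDGPU; whichever loads first is used

    model = Chain(Dense(784, 32, relu), Dense(32, 10)) |> gpu
    x = gpu(rand(Float32, 784, 64))
    y = gpu(rand(Float32, 10, 64))

    ps = Flux.params(model)
    gs = gradient(() -> Flux.Losses.mse(model(x), y), ps)   # gradients live on the same device as the parameters

Swapping back-ends requires no change to model code, only to which glue package the project depends on.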