From 04d7ababff4ba949538af2da697bab897d80e3d3 Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Tue, 6 Apr 2021 14:29:31 -0500 Subject: [PATCH 01/11] Support multiple GPU backends --- src/Flux.jl | 1 + src/functor.jl | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Flux.jl b/src/Flux.jl index 5e6776d601..a36a341c89 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -32,6 +32,7 @@ export Descent, ADAM, Momentum, Nesterov, RMSProp, using CUDA const use_cuda = Ref(false) +const default_gpu_converter = Ref{Function}(identity) include("utils.jl") include("zeros.jl") diff --git a/src/functor.jl b/src/functor.jl index afda1f5b84..c07907635e 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -66,7 +66,7 @@ end cpu(m) = fmap(x -> adapt(Array, x), m) -gpu(x) = use_cuda[] ? fmap(CUDA.cu, x) : x +gpu(x) = fmap(default_gpu_converter[], x) # Precision From 3080aa85cd29edf4e78a0441facee7613a7b8b01 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 29 Apr 2021 12:46:26 +0200 Subject: [PATCH 02/11] Prepare to move CUDA-specific functionality to separate package. --- Manifest.toml | 68 ----------- Project.toml | 6 +- src/Flux.jl | 14 +-- src/cuda/cuda.jl | 12 -- src/cuda/cudnn.jl | 20 --- src/losses/Losses.jl | 2 - src/losses/ctc-gpu.jl | 232 ----------------------------------- src/onehot.jl | 4 - test/ctc-gpu.jl | 27 ++--- test/cuda/cuda.jl | 65 ---------- test/cuda/cudnn.jl | 44 ------- test/cuda/curnn.jl | 58 --------- test/cuda/layers.jl | 261 ---------------------------------------- test/cuda/losses.jl | 38 ------ test/cuda/runtests.jl | 19 --- test/cuda/test_utils.jl | 72 ----------- test/runtests.jl | 12 +- 17 files changed, 18 insertions(+), 936 deletions(-) delete mode 100644 src/cuda/cuda.jl delete mode 100644 src/cuda/cudnn.jl delete mode 100644 src/losses/ctc-gpu.jl delete mode 100644 test/cuda/cuda.jl delete mode 100644 test/cuda/cudnn.jl delete mode 100644 test/cuda/curnn.jl delete mode 100644 test/cuda/layers.jl delete mode 100644 test/cuda/losses.jl delete mode 100644 test/cuda/runtests.jl delete mode 100644 test/cuda/test_utils.jl diff --git a/Manifest.toml b/Manifest.toml index 455ea5bddf..4aad025690 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -23,26 +23,9 @@ uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" [[Artifacts]] uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" -[[BFloat16s]] -deps = ["LinearAlgebra", "Test"] -git-tree-sha1 = "4af69e205efc343068dc8722b8dfec1ade89254a" -uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" -version = "0.1.0" - [[Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" -[[CEnum]] -git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" -uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" -version = "0.4.1" - -[[CUDA]] -deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "DataStructures", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "MacroTools", "Memoize", "Printf", "Random", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "Statistics", "TimerOutputs"] -git-tree-sha1 = "a6ce96dcf22fc4f1bfdfac02d54f0b77ecf2a4cc" -uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" -version = "3.0.3" - [[ChainRules]] deps = ["ChainRulesCore", "Compat", "LinearAlgebra", "Random", "Reexport", "Requires", "Statistics"] git-tree-sha1 = "1f410fba5c04d03ab712f348f1542e6059376547" @@ -128,11 +111,6 @@ uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" deps = ["ArgTools", "LibCURL", "NetworkOptions"] uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" 
-[[ExprTools]] -git-tree-sha1 = "10407a39b87f29d47ebaca8edbc75d7c302ff93e" -uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" -version = "0.1.3" - [[FillArrays]] deps = ["LinearAlgebra", "Random", "SparseArrays"] git-tree-sha1 = "31939159aeb8ffad1d4d8ee44d07f8558273120a" @@ -163,12 +141,6 @@ git-tree-sha1 = "9c95b2fd5c16bc7f97371e9f92f0fef77e0f5957" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" version = "6.2.2" -[[GPUCompiler]] -deps = ["DataStructures", "ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "Serialization", "TimerOutputs", "UUIDs"] -git-tree-sha1 = "6eadd2321dc3ac0fc9d530ab01c2caa7fe5d74c6" -uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" -version = "0.11.4" - [[IRTools]] deps = ["InteractiveUtils", "MacroTools", "Test"] git-tree-sha1 = "c67e7515a11f726f44083e74f218d134396d6510" @@ -191,16 +163,6 @@ git-tree-sha1 = "07cb43290a840908a771552911a6274bc6c072c7" uuid = "e5e0dc1b-0480-54bc-9374-aad01c23163d" version = "0.8.4" -[[LLVM]] -deps = ["CEnum", "Libdl", "Printf", "Unicode"] -git-tree-sha1 = "b616937c31337576360cb9fb872ec7633af7b194" -uuid = "929cbde3-209d-540e-8aea-75f648917ca0" -version = "3.6.0" - -[[LazyArtifacts]] -deps = ["Artifacts", "Pkg"] -uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" - [[LibCURL]] deps = ["LibCURL_jll", "MozillaCACerts_jll"] uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" @@ -247,12 +209,6 @@ git-tree-sha1 = "75a54abd10709c01f1b86b84ec225d26e840ed58" uuid = "e89f7d12-3494-54d1-8411-f7d8b9ae1f27" version = "0.5.0" -[[Memoize]] -deps = ["MacroTools"] -git-tree-sha1 = "2b1dfcba103de714d31c033b5dacc2e4a12c7caa" -uuid = "c03570c3-d221-55d1-a50c-7939bbd78826" -version = "0.4.4" - [[Missings]] deps = ["DataAPI"] git-tree-sha1 = "f8c673ccc215eb50fcadb285f522420e29e69e1c" @@ -271,12 +227,6 @@ git-tree-sha1 = "80b8360670f445d88b3475e88b33bbcc92f7866e" uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" version = "0.7.19" -[[NNlibCUDA]] -deps = ["CUDA", "LinearAlgebra", "NNlib", "Random", "Statistics"] -git-tree-sha1 = "4b368b466bcdd25d448a5b20de4b7e481d68b88e" -uuid = "a00861dc-f156-4864-bf3c-e6376f28a68d" -version = "0.1.0" - [[NaNMath]] git-tree-sha1 = "bfe47e760d60b82b66b61d2d44128b62e3a369fb" uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3" @@ -322,12 +272,6 @@ uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" deps = ["Serialization"] uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -[[RandomNumbers]] -deps = ["Random", "Requires"] -git-tree-sha1 = "441e6fc35597524ada7f85e13df1f4e10137d16f" -uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" -version = "1.4.0" - [[Reexport]] git-tree-sha1 = "57d8440b0c7d98fc4f889e478e80f268d534c9d5" uuid = "189a3867-3050-52da-a836-e630ba90ab69" @@ -342,12 +286,6 @@ version = "1.1.3" [[SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" -[[Scratch]] -deps = ["Dates"] -git-tree-sha1 = "ad4b278adb62d185bbcb6864dc24959ab0627bf6" -uuid = "6c6a2e73-6563-6170-7368-637461726353" -version = "1.0.3" - [[Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" @@ -402,12 +340,6 @@ uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" -[[TimerOutputs]] -deps = ["Printf"] -git-tree-sha1 = "32cdbe6cd2d214c25a0b88f985c9e0092877c236" -uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" -version = "0.5.8" - [[TranscodingStreams]] deps = ["Random", "Test"] git-tree-sha1 = "7c53c35547de1c5b9d46a4797cf6d8253807108c" diff --git a/Project.toml b/Project.toml index 0a106defa5..89ada44fe0 100644 --- a/Project.toml +++ b/Project.toml @@ -5,17 +5,15 
@@ version = "0.12.3" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" Colors = "5ae59095-9a9b-59fe-a467-6f913c188581" DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" +GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" Juno = "e5e0dc1b-0480-54bc-9374-aad01c23163d" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" -NNlibCUDA = "a00861dc-f156-4864-bf3c-e6376f28a68d" -Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" @@ -29,14 +27,12 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] AbstractTrees = "0.3" Adapt = "3.0" -CUDA = "3" CodecZlib = "0.7" Colors = "0.12" Functors = "0.2.1" Juno = "0.8" MacroTools = "0.5" NNlib = "0.7.14" -NNlibCUDA = "0.1" Reexport = "0.2, 1.0" StatsBase = "0.33" ZipFile = "0.9" diff --git a/src/Flux.jl b/src/Flux.jl index a36a341c89..b75f7700d7 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -8,6 +8,7 @@ using Zygote, MacroTools, Juno, Reexport using MacroTools: @forward @reexport using NNlib using Zygote: Params, @adjoint, gradient, pullback, @nograd +using GPUArrays export gradient @@ -30,8 +31,6 @@ export Descent, ADAM, Momentum, Nesterov, RMSProp, WeightDecay, ClipValue, ClipNorm -using CUDA -const use_cuda = Ref(false) const default_gpu_converter = Ref{Function}(identity) include("utils.jl") @@ -55,15 +54,4 @@ using .Losses # TODO: stop importing Losses in Flux's namespace in v0.12 include("deprecations.jl") -include("cuda/cuda.jl") - -function __init__() - use_cuda[] = CUDA.functional() # Can be overridden after load with `Flux.use_cuda[] = false` - if CUDA.functional() - if !CUDA.has_cudnn() - @warn "CUDA.jl found cuda, but did not find libcudnn. Some functionality will not be available." - end - end -end - end # module diff --git a/src/cuda/cuda.jl b/src/cuda/cuda.jl deleted file mode 100644 index 0ef3c65308..0000000000 --- a/src/cuda/cuda.jl +++ /dev/null @@ -1,12 +0,0 @@ -module CUDAint - -using ..CUDA - -import ..Flux: Flux -import Zygote -using Zygote: @adjoint -import NNlib, NNlibCUDA - -include("cudnn.jl") - -end diff --git a/src/cuda/cudnn.jl b/src/cuda/cudnn.jl deleted file mode 100644 index 4a3b2618c8..0000000000 --- a/src/cuda/cudnn.jl +++ /dev/null @@ -1,20 +0,0 @@ -import NNlibCUDA: batchnorm, ∇batchnorm - -function (BN::Flux.BatchNorm)(x::Union{CuArray{T,2},CuArray{T,4},CuArray{T,5}}, - cache=nothing) where T<:Union{Float32, Float64} - - @assert BN.affine "BatchNorm: only affine=true supported on gpu" - @assert BN.track_stats "BatchNorm: only track_stats=true supported on gpu" - @assert length(BN.β) == size(x, ndims(x)-1) "BatchNorm: input has wronng number of channels" - return BN.λ.(batchnorm(BN.γ, BN.β, x, BN.μ, BN.σ², BN.momentum; - cache=cache, alpha=1, beta=0, eps=BN.ϵ, - training=Flux._isactive(BN))) -end - -@adjoint function batchnorm(g, b, x, running_mean, running_var, momentum; kw...) - y = batchnorm(g, b, x, running_mean, running_var, momentum; kw...) 
- function batchnorm_pullback(Δ) - ∇batchnorm(g, b, x, Δ, running_mean, running_var, momentum; kw...)..., nothing, nothing, nothing - end - y, batchnorm_pullback -end diff --git a/src/losses/Losses.jl b/src/losses/Losses.jl index bf944f9231..5736e3201a 100644 --- a/src/losses/Losses.jl +++ b/src/losses/Losses.jl @@ -4,7 +4,6 @@ using Statistics using Zygote using Zygote: @adjoint using ..Flux: ofeltype, epseltype -using CUDA using NNlib: logsoftmax, logσ import Base.Broadcast: broadcasted @@ -24,6 +23,5 @@ export mse, mae, msle, include("utils.jl") include("functions.jl") include("ctc.jl") -if CUDA.functional() include("ctc-gpu.jl") end end #module diff --git a/src/losses/ctc-gpu.jl b/src/losses/ctc-gpu.jl deleted file mode 100644 index b856215b61..0000000000 --- a/src/losses/ctc-gpu.jl +++ /dev/null @@ -1,232 +0,0 @@ -# GPU implementation - -# a port of the GPU kernels from Baidu's C++ warp-ctc package, -# which itself is Copyright 2015-2016 Baidu USA LLC -# and available under the Apache 2.0 license -# -# Apache 2.0 license: https://www.apache.org/licenses/LICENSE-2.0 -# GitHub: https://github.com/baidu-research/warp-ctc/ -# paper: https://arxiv.org/pdf/1512.02595.pdf - -using Flux -using Statistics -using CUDA -using NNlib - -const MAX_THREADS = 256 - -function log_plus_f(p1, p2) - isinf(p1) && return p2 - isinf(p2) && return p1 - if p1 < p2 - p1, p2 = p2, p1 - end - return p1 + log(1+exp(p2 - p1)) -end - -function count_repeats(A) - repeats = 0 - for (i,elem) in enumerate(A) - if i > 1 && A[i] == A[i-1] - repeats += 1 - end - end - return repeats -end - -function compute_alpha_kernel(probs, labelSize, uttLength, repeats, labelsWithoutBlanks, labelsWithBlanks, alpha, blankLabel) - - tid = threadIdx().x - L = labelSize - T = uttLength - S = length(labelsWithBlanks) - - if L + repeats > T - return nothing - end - labels = labelsWithBlanks - - # Corner-case checking - start = (L + repeats <= T) ? 0 : 1 - last = S > 1 ? 2 : 1 - - # Fill in first column (time step) - i = tid - while i <= last - start - alpha[start+i, 1] = probs[labels[start+i], 1] - i += blockDim().x - end - sync_threads() - - # Fill in coefficients for each time step - for t=2:T - # Corner-case checking - if tid == 1 && !(1 < S - 2*(T-t) - 1) - if start == 0 - alpha[1, t] = alpha[1, t-1] + probs[blankLabel, t] - elseif start == 1 - alpha[1, t] = alpha[1, t-1] - end - end - sync_threads() - - # Fill in coefficients for each label class in the target output sequence; - # each thread will process the calculations for one class - idx = tid+1 - while idx <= S - prevSum = log_plus_f(alpha[idx, t-1], alpha[idx-1, t-1]) - if labels[idx] != blankLabel && idx != 2 && labels[idx] != labels[idx-2] - prevSum = log_plus_f(prevSum, alpha[idx-2, t-1]) - end - if idx < S - 2*(T-t) - 1 - alpha[idx, t] = -Inf32 - else - alpha[idx, t] = prevSum + probs[labels[idx], t] - end - idx += blockDim().x - end - sync_threads() - end - return nothing -end - -function compute_beta_and_grad_kernel(probs, labelSize, uttLength, - repeatsInLabel, labelsWithBlanks, - alphas, beta, output, accum, - grad, blankLabel, loss) - - tid = threadIdx().x - L = labelSize - T = uttLength - S = 2*L + 1 - repeats = repeatsInLabel - labels = labelsWithBlanks - - if (L+repeats) > T - return nothing - end - - # Corner-case checking - start = S > 1 ? S-2 : 0 - last = L + repeats < T ? 
S : S-1 - sync_threads() - i = tid - - # Calculate coefficients for last column (time step) - # then determine alpha and beta product - while i <= last - start - beta[i+start, T] = 0 - output[i+start, T] = beta[i+start, T] + alphas[i+start, T] - i += blockDim().x - end - sync_threads() - - # Fill in `accum` for last column (time step) - if tid == 1 - for i=1:S - labelIdx = labels[i] - accum[labelIdx, T] = log_plus_f(accum[labelIdx, T], output[i, T]) - end - end - sync_threads() - - # Fill in `grad` for last column (time step) - idx = tid - while idx <= size(grad, 1) - s = -Inf32 - for i=1:S - s = log_plus_f(s, output[i, T]) - end - - # ∂L/∂a (where a is activation before logsoftmax) - grad[idx, T] = exp(probs[idx, T]) - exp(accum[idx, T] - s) - idx += blockDim().x - end - sync_threads() - - # Fill in the rest of the coefficients - t = T-1 - while t >= 1 - if t < T - idx = tid - while idx <= S - nextSum = probs[labels[idx], t+1] + beta[idx, t+1] - if idx < S - nextSum = log_plus_f(nextSum, - probs[labels[idx+1], t+1] + beta[idx+1, t+1]) - end - if labels[idx] != blankLabel && idx != S-1 && labels[idx] != labels[idx+2] - nextSum = log_plus_f(nextSum, - probs[labels[idx+2], t+1] + beta[idx + 2, t+1]) - end - if idx > 2*t - beta[idx, t] = -Inf32 - else - beta[idx, t] = nextSum - end - idx += blockDim().x - end - sync_threads() - idx = tid - while idx <= S - output[idx, t] = alphas[idx, t] + beta[idx, t] - idx += blockDim().x - end - sync_threads() - end - sync_threads() - - # Calculate accumulated alpha-beta products for each label class for - # each time step; used in calculating gradients - if tid == 1 - for i=1:S - labelIdx = labels[i] - accum[labelIdx, t] = log_plus_f(accum[labelIdx, t], output[i, t]) - end - end - sync_threads() - idx = tid - - # Calculate gradients - while idx <= size(grad, 1) - - # ∂L/∂a (where a is activation before logsoftmax) - grad[idx, t] = exp(probs[idx, t]) - exp(accum[idx, t] + loss) - idx += blockDim().x - end - sync_threads() - t -= 1 - sync_threads() - end - return nothing -end - -function ctc_alpha(ŷ::CuArray, y) - ŷ = logsoftmax(ŷ) - blank = size(ŷ, 1) - z′ = fill(blank, 2 * length(y) + 1) - z′[eachindex(y) .* 2] = y - T = size(ŷ, 2) - U′ = 2*length(y) + 1 - alphas = CUDA.fill(log(zero(ŷ[1])), U′,T) - nRepeats = count_repeats(y) - nThreads = min(U′, MAX_THREADS) - @cuda blocks=1 threads=nThreads compute_alpha_kernel(ŷ, length(y), T, nRepeats, CuArray(y), CuArray(z′), alphas, blank) - return (loss=-1 * logsumexp(alphas[end-1:end]), alpha=alphas, z′=z′, yhat=ŷ, nRepeats=nRepeats) -end - -ctc_loss(ŷ::CuArray, y) = ctc_alpha(ŷ::CuArray, y).loss - -function ∇ctc_loss(ŷ::CuArray, y, out) - loss, alphas, z′, ŷ, nRepeats = out - U′, T = size(alphas) - blank = size(ŷ, 1) - typed_zero = zero(first(ŷ)) - betas = CUDA.fill(log(typed_zero), U′, T) - output = CUDA.fill(log(typed_zero), U′, T) - nThreads = min(U′, MAX_THREADS) - grads = CUDA.fill(log(typed_zero), size(ŷ)) - accum = CUDA.fill(log(typed_zero), size(ŷ)) - @cuda blocks=1 threads=nThreads compute_beta_and_grad_kernel(ŷ, length(y), T, nRepeats, CuArray(z′), alphas, betas, output, accum, grads, blank, loss) - return grads -end diff --git a/src/onehot.jl b/src/onehot.jl index b879e5cda2..edc5fc5f8b 100644 --- a/src/onehot.jl +++ b/src/onehot.jl @@ -1,5 +1,4 @@ import Adapt -import .CUDA struct OneHotArray{T<:Integer, L, N, var"N+1", I<:Union{T, AbstractArray{T, N}}} <: AbstractArray{Bool, var"N+1"} indices::I @@ -62,7 +61,6 @@ Base.getindex(x::OneHotArray{<:Any, <:Any, <:Any, N}, ::Vararg{Colon, N}) where 
Base.getindex(x::OneHotArray, I::CartesianIndex{N}) where N = x[I[1], Tuple(I)[2:N]...] _onehot_bool_type(x::OneHotLike{<:Any, <:Any, <:Any, N, <:Union{Integer, AbstractArray}}) where N = Array{Bool, N} -_onehot_bool_type(x::OneHotLike{<:Any, <:Any, <:Any, N, <:CuArray}) where N = CuArray{Bool, N} function Base.cat(x::OneHotLike{<:Any, L}, xs::OneHotLike{<:Any, L}...; dims::Int) where L if isone(dims) || any(x -> !_isonehot(x), (x, xs...)) @@ -79,8 +77,6 @@ batch(xs::AbstractArray{<:OneHotVector{<:Any, L}}) where L = OneHotArray(_indice Adapt.adapt_structure(T, x::OneHotArray{<:Any, L}) where L = OneHotArray(adapt(T, _indices(x)), L) -Base.BroadcastStyle(::Type{<:OneHotArray{<: Any, <: Any, <: Any, N, <: CuArray}}) where N = CUDA.CuArrayStyle{N}() - Base.argmax(x::OneHotLike; dims = Colon()) = (_isonehot(x) && dims == 1) ? reshape(CartesianIndex.(_indices(x), CartesianIndices(_indices(x))), 1, size(_indices(x))...) : diff --git a/test/ctc-gpu.jl b/test/ctc-gpu.jl index d7ff1bdf9d..37d6ab3134 100644 --- a/test/ctc-gpu.jl +++ b/test/ctc-gpu.jl @@ -3,7 +3,6 @@ using Flux using Flux.Losses: ctc_loss using Zygote: gradient using LinearAlgebra -using CUDA # Custom function to check numerical gradient of ctc loss, # based on `ngradient` in `Tracker.jl` @@ -26,31 +25,31 @@ end @testset "ctc-gpu" begin x = rand(10, 50) y = rand(1:9, 30) - x_cu = CuArray(x) - g1 = gradient(ctc_loss, x_cu, y)[1] + x_gpu = gpu(x) + g1 = gradient(ctc_loss, x_gpu, y)[1] g1 = g1 |> collect g2 = ctc_ngradient(x, y) @test g1 ≈ g2 rtol=1e-5 atol=1e-5 - + # test that GPU loss matches CPU implementation - l1 = ctc_loss(x_cu, y) + l1 = ctc_loss(x_gpu, y) l2 = ctc_loss(x, y) @test l1 ≈ l2 - + # tests using hand-calculated values - x_cu = [1. 2. 3.; 2. 1. 1.; 3. 3. 2.] |> CuArray + x_gpu = [1. 2. 3.; 2. 1. 1.; 3. 3. 2.] |> gpu y = [1, 2] - @test ctc_loss(x_cu, y) ≈ 3.6990738275138035 - + @test ctc_loss(x_gpu, y) ≈ 3.6990738275138035 + g = [-0.317671 -0.427729 0.665241; 0.244728 -0.0196172 -0.829811; 0.0729422 0.447346 0.16457] - ghat = gradient(ctc_loss, x_cu, y)[1] |> collect + ghat = gradient(ctc_loss, x_gpu, y)[1] |> collect @test g ≈ ghat rtol=1e-5 atol=1e-5 - x_cu = [-3. 12. 8. 15.; 4. 20. -2. 20.; 8. -33. 6. 5.] |> CuArray - y = [1, 2] |> CuArray - @test ctc_loss(x_cu, y) ≈ 8.02519869363453 + x_gpu = [-3. 12. 8. 15.; 4. 20. -2. 20.; 8. -33. 6. 5.] 
|> gpu + y = [1, 2] |> gpu + @test ctc_loss(x_gpu, y) ≈ 8.02519869363453 g = [-2.29294774655333e-06 -0.999662657278862 1.75500863563993e-06 0.00669284889063; 0.017985914969696 0.999662657278861 -1.9907078755387e-06 -0.006693150917307; -0.01798362202195 -2.52019580677916e-20 2.35699239251042e-07 3.02026677058789e-07] - ghat = gradient(ctc_loss, x_cu, y)[1] |> collect + ghat = gradient(ctc_loss, x_gpu, y)[1] |> collect @test g ≈ ghat rtol=1e-5 atol=1e-5 end diff --git a/test/cuda/cuda.jl b/test/cuda/cuda.jl deleted file mode 100644 index 92c04404f8..0000000000 --- a/test/cuda/cuda.jl +++ /dev/null @@ -1,65 +0,0 @@ -using Flux, Test -using Flux.CUDA -using Flux: cpu, gpu -using Statistics: mean -using LinearAlgebra: I, cholesky, Cholesky - -@testset "CUDA" begin - x = randn(5, 5) - cx = gpu(x) - @test cx isa CuArray - - @test Flux.onecold(gpu([1.0, 2.0, 3.0])) == 3 - - x = Flux.onehotbatch([1, 2, 3], 1:3) - cx = gpu(x) - @test cx isa Flux.OneHotMatrix && cx.indices isa CuArray - @test (cx .+ 1) isa CuArray - - m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax) - cm = gpu(m) - - @test all(p isa CuArray for p in params(cm)) - @test cm(gpu(rand(10, 10))) isa CuArray{Float32,2} - - xs = rand(5, 5) - ys = Flux.onehotbatch(1:5,1:5) - @test collect(cu(xs) .+ cu(ys)) ≈ collect(xs .+ ys) - - c = gpu(Conv((2,2),3=>4)) - x = gpu(rand(10, 10, 3, 2)) - l = c(gpu(rand(10,10,3,2))) - @test gradient(x -> sum(c(x)), x)[1] isa CuArray - - c = gpu(CrossCor((2,2),3=>4)) - x = gpu(rand(10, 10, 3, 2)) - l = c(gpu(rand(10,10,3,2))) - @test gradient(x -> sum(c(x)), x)[1] isa CuArray - -end - -@testset "onecold gpu" begin - y = Flux.onehotbatch(ones(3), 1:10) |> gpu; - l = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'] - @test Flux.onecold(y) isa CuArray - @test y[3,:] isa CuArray - @test Flux.onecold(y, l) == ['a', 'a', 'a'] -end - -@testset "restructure gpu" begin - dudt = Dense(1,1) |> gpu - p,re = Flux.destructure(dudt) - foo(x) = sum(re(p)(x)) - @test gradient(foo, cu(rand(1)))[1] isa CuArray -end - -@testset "GPU functors" begin - @testset "Cholesky" begin - M = 2.0*I(10) |> collect - Q = cholesky(M) - Q_gpu = Q |> gpu - @test Q_gpu isa Cholesky{<:Any,<:CuArray} - Q_cpu = Q_gpu |> cpu - @test Q_cpu == cholesky(eltype(Q_gpu).(M)) - end -end diff --git a/test/cuda/cudnn.jl b/test/cuda/cudnn.jl deleted file mode 100644 index 5d1727e984..0000000000 --- a/test/cuda/cudnn.jl +++ /dev/null @@ -1,44 +0,0 @@ -using Flux, CUDA, Test -using Flux: pullback - -@testset "CUDNN BatchNorm" begin - @testset "4D Input" begin - x = rand(Float32, 2, 2, 3, 4) - m = BatchNorm(3) - gx = gpu(x) - gm = gpu(m) - - y, back = pullback((m, x) -> m(x), m, x) - gy, gback = pullback((m, x) -> m(x), gm, gx) - - @test cpu(gy) ≈ y - - Δ = randn(Float32, size(y)) - dm, dx = back(Δ) - gdm, gdx = gback(gpu(Δ)) - - @test dm[].γ ≈ cpu(gdm[].γ) - @test dm[].β ≈ cpu(gdm[].β) - @test dx ≈ cpu(gdx) - end - - @testset "2D Input" begin - x = rand(Float32, 3, 4) - m = BatchNorm(3) - gx = gpu(x) - gm = gpu(m) - - y, back = pullback((m, x) -> m(x), m, x) - gy, gback = pullback((m, x) -> m(x), gm, gx) - - @test cpu(gy) ≈ y - - Δ = randn(Float32, size(y)) - dm, dx = back(Δ) - gdm, gdx = gback(gpu(Δ)) - - @test dm[].γ ≈ cpu(gdm[].γ) - @test dm[].β ≈ cpu(gdm[].β) - @test dx ≈ cpu(gdx) - end -end diff --git a/test/cuda/curnn.jl b/test/cuda/curnn.jl deleted file mode 100644 index 63a5f93ada..0000000000 --- a/test/cuda/curnn.jl +++ /dev/null @@ -1,58 +0,0 @@ -using Flux, CUDA, Test - -@testset for R in [RNN, GRU, LSTM] - m = R(10, 5) |> gpu - x = gpu(rand(10)) 
- (m̄,) = gradient(m -> sum(m(x)), m) - Flux.reset!(m) - θ = gradient(() -> sum(m(x)), params(m)) - @test x isa CuArray - @test θ[m.cell.Wi] isa CuArray - @test collect(m̄[].cell.Wi) == collect(θ[m.cell.Wi]) -end - -@testset "RNN" begin - @testset for R in [RNN, GRU, LSTM], batch_size in (1, 5) - rnn = R(10, 5) - curnn = fmap(gpu, rnn) - - Flux.reset!(rnn) - Flux.reset!(curnn) - x = batch_size == 1 ? - rand(Float32, 10) : - rand(Float32, 10, batch_size) - cux = gpu(x) - - y, back = pullback((r, x) -> r(x), rnn, x) - cuy, cuback = pullback((r, x) -> r(x), curnn, cux) - - @test y ≈ collect(cuy) - - ȳ = randn(size(y)) - m̄, x̄ = back(ȳ) - cum̄, cux̄ = cuback(gpu(ȳ)) - - @test x̄ ≈ collect(cux̄) - @test m̄[].cell.Wi ≈ collect(cum̄[].cell.Wi) - @test m̄[].cell.Wh ≈ collect(cum̄[].cell.Wh) - @test m̄[].cell.b ≈ collect(cum̄[].cell.b) - if m̄[].state isa Tuple - for (x, cx) in zip(m̄[].state, cum̄[].state) - @test x ≈ collect(cx) - end - else - @test m̄[].state ≈ collect(cum̄[].state) - end - - Flux.reset!(rnn) - Flux.reset!(curnn) - ohx = batch_size == 1 ? - Flux.onehot(rand(1:10), 1:10) : - Flux.onehotbatch(rand(1:10, batch_size), 1:10) - cuohx = gpu(ohx) - y = (rnn(ohx); rnn(ohx)) - - cuy = (curnn(cuohx); curnn(cuohx)) - @test y ≈ collect(cuy) - end -end diff --git a/test/cuda/layers.jl b/test/cuda/layers.jl deleted file mode 100644 index 0a4034303d..0000000000 --- a/test/cuda/layers.jl +++ /dev/null @@ -1,261 +0,0 @@ -# Test layers and data/model movements on and off the GPU -# Add tests for layers and their gradients on the GPU -# Most of the forward passes should be fine being applied -# to bitstype objects, but this gives higher coverage for our use-cases -# Check that getting the gradients does not throw - -# generic movement tests -@testset "Basic GPU Movement" begin - @test gradient(x -> sum(gpu(x)), rand(3,3)) isa Tuple - @test gradient(x -> sum(cpu(x)), gpu(rand(3,3))) isa Tuple -end - -# TODO: These layers get into scalar indexing -# `AlphaDropout` throws a compilation error on GPUs, -# whereas, the rest are scalar indexing issues. -# The norm layers behave differently on the CPU and -# the GPU too. -const BROKEN_LAYERS = Union{DepthwiseConv, - AlphaDropout} - -const ACTIVATIONS = [identity, relu, tanh, - sigmoid, exp, softplus, - elu, selu] - -function gpu_gradtest(name::String, layers::Vector, x_cpu = nothing, args...; test_cpu = true) - isnothing(x_cpu) && error("Missing input to test the layers against.") - @testset "$name GPU grad tests" begin - for layer in layers - @testset "$layer Layer GPU grad test" begin - - # compute output and grad of parameters - l_cpu = layer(args...) 
- ps_cpu = Flux.params(l_cpu) - y_cpu, back_cpu = pullback(() -> sum(l_cpu(x_cpu)), ps_cpu) - gs_cpu = back_cpu(1f0) - - x_gpu = gpu(x_cpu) - l_gpu = l_cpu |> gpu - ps_gpu = Flux.params(l_gpu) - - if typeof(l_gpu) <: BROKEN_LAYERS - @test_broken gradient(() -> sum(l_gpu(x_gpu)), ps_gpu) isa Flux.Zygote.Grads - else - y_gpu, back_gpu = pullback(() -> sum(l_gpu(x_gpu)), ps_gpu) - gs_gpu = back_gpu(1f0) # TODO many layers error out when backprop int 1, should fix - - # compute grad of input - xg_cpu = gradient(x -> sum(l_cpu(x)), x_cpu)[1] - xg_gpu = gradient(x -> sum(l_gpu(x)), x_gpu)[1] - - # test - if test_cpu - @test y_gpu ≈ y_cpu rtol=1f-3 atol=1f-3 - @test Array(xg_gpu) ≈ xg_cpu rtol=1f-3 atol=1f-3 - end - @test gs_gpu isa Flux.Zygote.Grads - for (p_cpu, p_gpu) in zip(ps_cpu, ps_gpu) - @test gs_gpu[p_gpu] isa Flux.CUDA.CuArray - if test_cpu - @test Array(gs_gpu[p_gpu]) ≈ gs_cpu[p_cpu] rtol=1f-3 atol=1f-3 - end - end - end - end - end - end -end - -# Just to give testset in gpu_gradtest meaningful labels -ConvNoBias(args...) = Conv(args...; bias = false) -ConvTransposeNoBias(args...) = ConvTranspose(args...; bias = false) -CrossCorNoBias(args...) = CrossCor(args...; bias = false) -DepthwiseConvNoBias(args...) = DepthwiseConv(args...; bias = false) - -for act in ACTIVATIONS - r = rand(Float32, 28, 28, 1, 1) - conv_layers = [Conv, ConvNoBias, - ConvTranspose, ConvTransposeNoBias, - CrossCor, CrossCorNoBias, - DepthwiseConv, DepthwiseConvNoBias] - gpu_gradtest("Convolution with $act", conv_layers, r, (2,2), 1=>3, act, test_cpu = false) - - batch_norm = [BatchNorm] - gpu_gradtest("BatchNorm 1 with $act", batch_norm, rand(Float32, 28,28,3,4), 3, act, test_cpu = false) #TODO fix errors - gpu_gradtest("BatchNorm 2 with $act", batch_norm, rand(Float32, 5,4), 5, act, test_cpu = false) - - instancenorm = [InstanceNorm] - gpu_gradtest("InstanceNorm with $act", instancenorm, r, 1, act, test_cpu = false) - - groupnorm = [GroupNorm] - gpu_gradtest("GroupNorm with $act", groupnorm, rand(Float32, 28,28,3,1), 3, 1, act, test_cpu = false) -end - -r = rand(Float32, 28, 28, 1, 1) - -pooling_layers = [MaxPool, MeanPool] -gpu_gradtest("Pooling", pooling_layers, r, (2,2)) - -adaptive_pooling_layers = [AdaptiveMaxPool, AdaptiveMeanPool] -gpu_gradtest("AdaptivePooling", adaptive_pooling_layers, r, (7,7), test_cpu = false) - -dropout_layers = [Dropout, AlphaDropout] -gpu_gradtest("Dropout", dropout_layers, r, 0.5f0; test_cpu = false) # dropout is not deterministic - -layer_norm = [LayerNorm] -gpu_gradtest("LayerNorm 1", layer_norm, rand(Float32, 28,28,3,4), 1, test_cpu = false) #TODO fix errors -gpu_gradtest("LayerNorm 2", layer_norm, rand(Float32, 5,4), 5) - -upsample = [x -> Upsample(scale=x)] -gpu_gradtest("Upsample 2d", upsample, rand(Float32, 3, 4, 2, 3), (2,2)) -gpu_gradtest("Upsample 1d", upsample, rand(Float32, 3, 4, 2, 3), (2,)) - -pixelshuffle = [PixelShuffle] -gpu_gradtest("PixelShuffle 2d", pixelshuffle, rand(Float32, 3, 4, 18, 3), 3) -gpu_gradtest("PixelShuffle 1d", pixelshuffle, rand(Float32, 3, 18, 3), 3) - -@testset "function layers" begin - x = rand(Float32, 3,3) - gpu_autodiff_test(x -> sum(Flux.normalise(x; dims=1)), x) - gpu_autodiff_test(x -> sum(Flux.normalise(x; dims=2)), x) - gpu_autodiff_test(x -> sum(Flux.normalise(x)), x) -end - -@testset "Zeros mapped for $cl" for cl in (Conv, ConvTranspose, CrossCor, DepthwiseConv) - l = cl((2,2), 1=>3, bias = false) |> gpu - ip = zeros(Float32, 28,28,1,1) |> gpu - if typeof(l) <: BROKEN_LAYERS - @test_broken sum(l(ip)) ≈ 0.f0 - @test_broken 
gradient(() -> sum(l(ip)), Flux.params(l)) isa Flux.Zygote.Grads - else - @test sum(l(ip)) ≈ 0.f0 - gs = gradient(() -> sum(l(ip)), Flux.params(l)) - @test l.bias ∉ gs.params - end -end - -@testset "Dense with Zeros bias" begin - l = Dense(ones(Float32, 4,3), Flux.Zeros()) |> gpu - ip = zeros(Float32, 3, 7) |> gpu - - @test sum(l(ip)) ≈ 0.f0 - gs = gradient(() -> sum(l(ip)), Flux.params(l)) - @test l.b ∉ gs.params -end - -@testset "Extended BatchNorm" begin - m_cpu = BatchNorm(2) - m_gpu = m_cpu |> gpu - x_cpu = rand(Float32, 3, 2, 2) - x_gpu = x_cpu |> gpu - - ## In :auto mode, track statistics only in gradient contest - μ_cpu = copy(m_cpu.μ) - m_cpu(x_cpu) - @test m_cpu.μ ≈ μ_cpu - gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) - @test !(m_cpu.μ ≈ μ_cpu) - - μ_gpu = copy(m_gpu.μ) - m_gpu(x_gpu) - @test m_gpu.μ ≈ μ_gpu - gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) - @test !(m_gpu.μ ≈ μ_gpu) - - @test Array(m_gpu.μ) ≈ m_cpu.μ - - ## In testmode, never track statistics - testmode!(m_cpu) - μ_cpu = copy(m_cpu.μ) - m_cpu(x_cpu) - @test m_cpu.μ ≈ μ_cpu - gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) - @test m_cpu.μ ≈ μ_cpu - - testmode!(m_gpu) - μ_gpu = copy(m_gpu.μ) - m_gpu(x_gpu) - @test m_gpu.μ ≈ μ_gpu - gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) - @test m_gpu.μ ≈ μ_gpu - - ## In trainmode, always track statistics - trainmode!(m_cpu) - μ_cpu = copy(m_cpu.μ) - m_cpu(x_cpu) - @test !(m_cpu.μ ≈ μ_cpu) - μ_cpu = copy(m_cpu.μ) - gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) - @test !(m_cpu.μ ≈ μ_cpu) - - trainmode!(m_gpu) - μ_gpu = copy(m_gpu.μ) - m_gpu(x_gpu) - @test !(m_gpu.μ ≈ μ_gpu) - μ_gpu = copy(m_gpu.μ) - gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) - @test !(m_gpu.μ ≈ μ_gpu) - - ## No errors if input type mistmatch - # x_cpu = rand(Float64, 3, 2, 2) - # x_gpu = x_cpu |> gpu - # m_cpu(x_cpu) - # gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) - # m_gpu(x_gpu) - # gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) -end - -@testset "Two-streams Bilinear" begin - x = zeros(Float32,10,9) |> gpu - y = zeros(Float32,2,9) |> gpu - b = Flux.Bilinear(10, 2, 3) |> gpu - @test size(b(x,y)) == (3,9) - @test sum(abs2, b(x,y)) ≈ 0f0 - gs_gpu = gradient(() -> sum(abs2.(b(x, y))), params(b)) - b_cpu, x_cpu, y_cpu = b |> cpu, x |> cpu, y |> cpu - gs_cpu = gradient(() -> sum(abs2.(b_cpu(x_cpu, y_cpu))), params(b_cpu)) - for (pgpu, pcpu) in zip(params(b), params(b_cpu)) - @test gs_cpu[pcpu] ≈ Array(gs_gpu[pgpu]) - end -end - -@testset "Two-streams Bilinear" begin - x = zeros(Float32,10,9) |> gpu - y = zeros(Float32,2,9) |> gpu - b = Flux.Bilinear(10, 2, 3) |> gpu - @test size(b(x,y)) == (3,9) - @test sum(abs2, b(x,y)) ≈ 0f0 - gs_gpu = gradient(() -> sum(abs2.(b(x, y))), params(b)) - b_cpu, x_cpu, y_cpu = b |> cpu, x |> cpu, y |> cpu - gs_cpu = gradient(() -> sum(abs2.(b_cpu(x_cpu, y_cpu))), params(b_cpu)) - for (pgpu, pcpu) in zip(params(b), params(b_cpu)) - @test gs_cpu[pcpu] ≈ Array(gs_gpu[pgpu]) - end -end - -@testset "Parallel" begin - @testset "zero sum" begin - input = randn(10, 10, 10, 10) |> gpu - layer_gpu = Parallel(+, zero, identity) |> gpu - @test layer_gpu(input) == input - @test layer_gpu(input) isa Flux.CUDA.CuArray - end - - @testset "vararg input" begin - inputs = (randn(10), randn(5), randn(4)) .|> gpu - layer = Parallel(+, Dense(10, 2), Dense(5, 2), Dense(4, 2)) |> gpu - @test size(layer(inputs)) == (2,) - end - - @testset "gradient" begin - input_cpu = randn(10, 10, 10, 10) - input_gpu = input_cpu |> gpu - layer_cpu = 
Parallel(+, x -> zero(x), identity) - layer_gpu = layer_cpu |> gpu - gs_cpu = gradient(() -> sum(abs2.(layer_cpu(input_cpu))), params(layer_cpu)) - gs_gpu = gradient(() -> sum(abs2.(layer_gpu(input_gpu))), params(layer_gpu)) - for (pgpu, pcpu) in zip(params(layer_cpu), params(layer_gpu)) - @test gs_cpu[pcpu] ≈ gs_gpu[pgpu] - end - end -end diff --git a/test/cuda/losses.jl b/test/cuda/losses.jl deleted file mode 100644 index a0f7f47d80..0000000000 --- a/test/cuda/losses.jl +++ /dev/null @@ -1,38 +0,0 @@ -using Flux.Losses: crossentropy, binarycrossentropy, logitbinarycrossentropy, binary_focal_loss, focal_loss - - -@testset "Losses" begin - -x = [1.,2.,3.] -cx = gpu(x) -@test crossentropy(x,x) ≈ crossentropy(cx,cx) -@test crossentropy(x,x, agg=identity) ≈ crossentropy(cx,cx, agg=identity) |> cpu -@test crossentropy(x,x, agg=x->mean([1.0;2.0;3.0].*x)) ≈ crossentropy(cx,cx, agg=x->mean(gpu([1.0;2.0;3.0]).*x)) - -x = [-1.1491, 0.8619, 0.3127] -y = [1, 1, 0.] -@test binarycrossentropy(σ.(x), y) ≈ binarycrossentropy(gpu(σ.(x)), gpu(y)) -@test logitbinarycrossentropy(x, y) ≈ logitbinarycrossentropy(gpu(x), gpu(y)) - -x = [0.268941 0.5 0.268941 - 0.731059 0.5 0.731059] -y = [0 1 0 - 1 0 1] -@test binary_focal_loss(x, y) ≈ binary_focal_loss(gpu(x), gpu(y)) - -x = softmax(reshape(-7:7, 3, 5) .* 1f0) -y = [1 0 0 0 1 - 0 1 0 1 0 - 0 0 1 0 0] -@test focal_loss(x, y) ≈ focal_loss(gpu(x), gpu(y)) - -@testset "GPU grad tests" begin - x = rand(Float32, 3,3) - y = rand(Float32, 3,3) - - for loss in ALL_LOSSES - gpu_autodiff_test(loss, x, y) - end -end - -end #testset diff --git a/test/cuda/runtests.jl b/test/cuda/runtests.jl deleted file mode 100644 index 8ed3d66eb4..0000000000 --- a/test/cuda/runtests.jl +++ /dev/null @@ -1,19 +0,0 @@ -using Flux, Test, CUDA -using Zygote -using Zygote: pullback - -@info "Testing GPU Support" -CUDA.allowscalar(false) - -include("test_utils.jl") -include("cuda.jl") -include("losses.jl") -include("layers.jl") - -if CUDA.has_cudnn() - @info "Testing Flux/CUDNN" - include("cudnn.jl") - include("curnn.jl") -else - @warn "CUDNN unavailable, not testing GPU DNN support" -end diff --git a/test/cuda/test_utils.jl b/test/cuda/test_utils.jl deleted file mode 100644 index bc0db37474..0000000000 --- a/test/cuda/test_utils.jl +++ /dev/null @@ -1,72 +0,0 @@ -function check_grad(g_gpu, g_cpu, atol, rtol) - @show g_gpu g_cpu - @test false -end -check_grad(g_gpu::Base.RefValue, g_cpu::Base.RefValue, atol, rtol) = - check_grad(g_gpu[], g_cpu[], atol, rtol) -check_grad(g_gpu::Nothing, g_cpu::Nothing, atol, rtol) = @test true -check_grad(g_gpu::Float32, g_cpu::Float32, atol, rtol) = @test g_cpu ≈ g_gpu rtol=rtol atol=atol -check_grad(g_gpu::CuArray{Float32}, g_cpu::Array{Float32}, atol, rtol) = - @test g_cpu ≈ collect(g_gpu) rtol=rtol atol=atol - -function check_grad(g_gpu::Tuple, g_cpu::Tuple, atol, rtol) - for (v1, v2) in zip(g_gpu, g_cpu) - check_grad(v1, v2, atol, rtol) - end -end - -function check_grad(g_gpu::NamedTuple, g_cpu::NamedTuple, atol, rtol) - for ((k1,v1), (k2,v2)) in zip(pairs(g_gpu), pairs(g_cpu)) - @test k1 == k2 - # @show k2 v2 - check_grad(v1, v2, atol, rtol) - end -end - -function gpu_autodiff_test(f_cpu, xs_cpu::Array{Float32}...; - test_equal=true, rtol=1e-4, atol=1e-4) - - check_type(x) = false - check_type(x::Float32) = true - check_type(x::CuArray{Float32}) = true - check_type(x::Array{Float32}) = true - - ### GRADIENT WITH RESPECT TO INPUT ##### - # y_cpu, back_cpu = pullback((f, x...) -> f(x...), f_cpu, xs_cpu...) - y_cpu, back_cpu = pullback((x...) 
-> f_cpu(x...), xs_cpu...) - @test check_type(y_cpu) - Δ_cpu = size(y_cpu) == () ? randn(Float32) : randn(Float32, size(y_cpu)) - gs_cpu = back_cpu(Δ_cpu) - - f_gpu = f_cpu |> gpu - xs_gpu = gpu.(xs_cpu) - Δ_gpu = Δ_cpu |> gpu - # y_gpu, back_gpu = pullback((f, x...) -> f(x...), f_gpu, xs_gpu...) - y_gpu, back_gpu = pullback((x...) -> f_gpu(x...), xs_gpu...) - @test check_type(y_gpu) - gs_gpu = back_gpu(Δ_gpu) - - if test_equal - @test collect(y_cpu) ≈ collect(y_gpu) rtol=rtol atol=atol - for (g_gpu, g_cpu) in zip(gs_gpu, gs_cpu) - check_grad(g_gpu, g_cpu, atol, rtol) - end - end - - ### GRADIENT WITH RESPECT TO f ##### - ps_cpu = Flux.params(f_cpu) - y_cpu, back_cpu = pullback(() -> f_cpu(xs_cpu...), ps_cpu) - gs_cpu = back_cpu(Δ_cpu) - - ps_gpu = Flux.params(f_gpu) - y_gpu, back_gpu = pullback(() -> f_gpu(xs_gpu...), ps_gpu) - gs_gpu = back_gpu(Δ_gpu) - - if test_equal - @test collect(y_cpu) ≈ collect(y_gpu) rtol=rtol atol=atol - @assert length(ps_gpu) == length(ps_cpu) - for (p_gpu, p_cpu) in zip(ps_gpu, ps_cpu) - check_grad(gs_gpu[p_gpu], gs_cpu[p_cpu], atol, rtol) - end - end -end diff --git a/test/runtests.jl b/test/runtests.jl index a40433d0f1..d385420e3b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -25,7 +25,9 @@ end @testset "Losses" begin include("losses.jl") include("ctc.jl") - if Flux.use_cuda[] include("ctc-gpu.jl") end + if Flux.default_gpu_converter[] !== identity + include("ctc-gpu.jl") + end end @testset "Layers" begin @@ -42,14 +44,6 @@ end include("outputsize.jl") end -@testset "CUDA" begin - if Flux.use_cuda[] - include("cuda/runtests.jl") - else - @warn "CUDA unavailable, not testing GPU support" - end -end - @static if VERSION == v"1.5" using Documenter @testset "Docs" begin From ec1e7195e9b12287867af365dafb3a33dd28856c Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 29 Apr 2021 14:15:08 +0200 Subject: [PATCH 03/11] Update CI. --- .buildkite/pipeline.yml | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 4dd12e2b57..8af3bd6734 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -1,20 +1,20 @@ steps: - - label: "GPU integeration with julia v1.6" # change to "v1" when 1.6 is out + - label: "CUDA support" plugins: - JuliaCI/julia#v1: - version: "1" - - JuliaCI/julia-test#v1: ~ + version: 1.6 + command: | + julia -e 'using Pkg; + + println("--- :julia: Instantiating project"); + Pkg.develop(PackageSpec(path=pwd())); + Pkg.add(PackageSpec(url="https://github.com/FluxML/FluxCUDA.jl", rev="main")); + Pkg.instantiate(); + + println("+++ :julia: Running tests"); + Pkg.test("FluxCUDA"; coverage=true);' agents: queue: "juliagpu" cuda: "*" + if: build.message !~ /\[skip tests\]/ timeout_in_minutes: 60 - - # - label: "GPU nightly" - # plugins: - # - JuliaCI/julia#v1: - # version: "nightly" - # - JuliaCI/julia-test#v1: ~ - # agents: - # queue: "juliagpu" - # cuda: "*" - # timeout_in_minutes: 60 From 32b50659ca26bb7483f34bcf1a38aff57252fa24 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 29 Apr 2021 15:01:36 +0200 Subject: [PATCH 04/11] Try subpackages. 
--- .buildkite/pipeline.yml | 3 +- lib/FluxCUDA/Project.toml | 12 + lib/FluxCUDA/src/FluxCUDA.jl | 13 + lib/FluxCUDA/src/ctc.jl | 232 ++++++++++++++++ lib/FluxCUDA/src/cudnn.jl | 20 ++ lib/FluxCUDA/src/onehot.jl | 5 + lib/FluxCUDA/test/.gitignore | 1 + .../FluxCUDA/test/Manifest.toml | 102 ++++++- lib/FluxCUDA/test/Project.toml | 7 + lib/FluxCUDA/test/cuda.jl | 61 ++++ lib/FluxCUDA/test/cudnn.jl | 44 +++ lib/FluxCUDA/test/curnn.jl | 58 ++++ lib/FluxCUDA/test/layers.jl | 261 ++++++++++++++++++ lib/FluxCUDA/test/losses.jl | 52 ++++ lib/FluxCUDA/test/runtests.jl | 23 ++ lib/FluxCUDA/test/test_utils.jl | 72 +++++ 16 files changed, 950 insertions(+), 16 deletions(-) create mode 100644 lib/FluxCUDA/Project.toml create mode 100644 lib/FluxCUDA/src/FluxCUDA.jl create mode 100644 lib/FluxCUDA/src/ctc.jl create mode 100644 lib/FluxCUDA/src/cudnn.jl create mode 100644 lib/FluxCUDA/src/onehot.jl create mode 100644 lib/FluxCUDA/test/.gitignore rename Manifest.toml => lib/FluxCUDA/test/Manifest.toml (76%) create mode 100644 lib/FluxCUDA/test/Project.toml create mode 100644 lib/FluxCUDA/test/cuda.jl create mode 100644 lib/FluxCUDA/test/cudnn.jl create mode 100644 lib/FluxCUDA/test/curnn.jl create mode 100644 lib/FluxCUDA/test/layers.jl create mode 100644 lib/FluxCUDA/test/losses.jl create mode 100644 lib/FluxCUDA/test/runtests.jl create mode 100644 lib/FluxCUDA/test/test_utils.jl diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 8af3bd6734..a11e983e00 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -8,8 +8,7 @@ steps: println("--- :julia: Instantiating project"); Pkg.develop(PackageSpec(path=pwd())); - Pkg.add(PackageSpec(url="https://github.com/FluxML/FluxCUDA.jl", rev="main")); - Pkg.instantiate(); + Pkg.develop(PackageSpec(path=joinpath(pwd(), "lib/FluxCUDA"))); println("+++ :julia: Running tests"); Pkg.test("FluxCUDA"; coverage=true);' diff --git a/lib/FluxCUDA/Project.toml b/lib/FluxCUDA/Project.toml new file mode 100644 index 0000000000..40ec4b887d --- /dev/null +++ b/lib/FluxCUDA/Project.toml @@ -0,0 +1,12 @@ +name = "FluxCUDA" +uuid = "899ac4cc-ca92-44e3-a03b-1f2759274493" +authors = ["Tim Besard "] +version = "0.1.0" + +[deps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" +NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" +NNlibCUDA = "a00861dc-f156-4864-bf3c-e6376f28a68d" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" diff --git a/lib/FluxCUDA/src/FluxCUDA.jl b/lib/FluxCUDA/src/FluxCUDA.jl new file mode 100644 index 0000000000..75de37eb34 --- /dev/null +++ b/lib/FluxCUDA/src/FluxCUDA.jl @@ -0,0 +1,13 @@ +module FluxCUDA + +using Flux, CUDA +using NNlib, NNlibCUDA +using Zygote +using Zygote: @adjoint + +include("onehot.jl") +include("ctc.jl") +include("cudnn.jl") + + +end # module diff --git a/lib/FluxCUDA/src/ctc.jl b/lib/FluxCUDA/src/ctc.jl new file mode 100644 index 0000000000..4afcea726a --- /dev/null +++ b/lib/FluxCUDA/src/ctc.jl @@ -0,0 +1,232 @@ +# GPU implementation + +# a port of the GPU kernels from Baidu's C++ warp-ctc package, +# which itself is Copyright 2015-2016 Baidu USA LLC +# and available under the Apache 2.0 license +# +# Apache 2.0 license: https://www.apache.org/licenses/LICENSE-2.0 +# GitHub: https://github.com/baidu-research/warp-ctc/ +# paper: https://arxiv.org/pdf/1512.02595.pdf + +using Flux +using Statistics +using CUDA +using NNlib + +const MAX_THREADS = 256 + +function log_plus_f(p1, p2) + isinf(p1) && return p2 + isinf(p2) && 
return p1 + if p1 < p2 + p1, p2 = p2, p1 + end + return p1 + log(1+exp(p2 - p1)) +end + +function count_repeats(A) + repeats = 0 + for (i,elem) in enumerate(A) + if i > 1 && A[i] == A[i-1] + repeats += 1 + end + end + return repeats +end + +function compute_alpha_kernel(probs, labelSize, uttLength, repeats, labelsWithoutBlanks, labelsWithBlanks, alpha, blankLabel) + + tid = threadIdx().x + L = labelSize + T = uttLength + S = length(labelsWithBlanks) + + if L + repeats > T + return nothing + end + labels = labelsWithBlanks + + # Corner-case checking + start = (L + repeats <= T) ? 0 : 1 + last = S > 1 ? 2 : 1 + + # Fill in first column (time step) + i = tid + while i <= last - start + alpha[start+i, 1] = probs[labels[start+i], 1] + i += blockDim().x + end + sync_threads() + + # Fill in coefficients for each time step + for t=2:T + # Corner-case checking + if tid == 1 && !(1 < S - 2*(T-t) - 1) + if start == 0 + alpha[1, t] = alpha[1, t-1] + probs[blankLabel, t] + elseif start == 1 + alpha[1, t] = alpha[1, t-1] + end + end + sync_threads() + + # Fill in coefficients for each label class in the target output sequence; + # each thread will process the calculations for one class + idx = tid+1 + while idx <= S + prevSum = log_plus_f(alpha[idx, t-1], alpha[idx-1, t-1]) + if labels[idx] != blankLabel && idx != 2 && labels[idx] != labels[idx-2] + prevSum = log_plus_f(prevSum, alpha[idx-2, t-1]) + end + if idx < S - 2*(T-t) - 1 + alpha[idx, t] = -Inf32 + else + alpha[idx, t] = prevSum + probs[labels[idx], t] + end + idx += blockDim().x + end + sync_threads() + end + return nothing +end + +function compute_beta_and_grad_kernel(probs, labelSize, uttLength, + repeatsInLabel, labelsWithBlanks, + alphas, beta, output, accum, + grad, blankLabel, loss) + + tid = threadIdx().x + L = labelSize + T = uttLength + S = 2*L + 1 + repeats = repeatsInLabel + labels = labelsWithBlanks + + if (L+repeats) > T + return nothing + end + + # Corner-case checking + start = S > 1 ? S-2 : 0 + last = L + repeats < T ? 
S : S-1 + sync_threads() + i = tid + + # Calculate coefficients for last column (time step) + # then determine alpha and beta product + while i <= last - start + beta[i+start, T] = 0 + output[i+start, T] = beta[i+start, T] + alphas[i+start, T] + i += blockDim().x + end + sync_threads() + + # Fill in `accum` for last column (time step) + if tid == 1 + for i=1:S + labelIdx = labels[i] + accum[labelIdx, T] = log_plus_f(accum[labelIdx, T], output[i, T]) + end + end + sync_threads() + + # Fill in `grad` for last column (time step) + idx = tid + while idx <= size(grad, 1) + s = -Inf32 + for i=1:S + s = log_plus_f(s, output[i, T]) + end + + # ∂L/∂a (where a is activation before logsoftmax) + grad[idx, T] = exp(probs[idx, T]) - exp(accum[idx, T] - s) + idx += blockDim().x + end + sync_threads() + + # Fill in the rest of the coefficients + t = T-1 + while t >= 1 + if t < T + idx = tid + while idx <= S + nextSum = probs[labels[idx], t+1] + beta[idx, t+1] + if idx < S + nextSum = log_plus_f(nextSum, + probs[labels[idx+1], t+1] + beta[idx+1, t+1]) + end + if labels[idx] != blankLabel && idx != S-1 && labels[idx] != labels[idx+2] + nextSum = log_plus_f(nextSum, + probs[labels[idx+2], t+1] + beta[idx + 2, t+1]) + end + if idx > 2*t + beta[idx, t] = -Inf32 + else + beta[idx, t] = nextSum + end + idx += blockDim().x + end + sync_threads() + idx = tid + while idx <= S + output[idx, t] = alphas[idx, t] + beta[idx, t] + idx += blockDim().x + end + sync_threads() + end + sync_threads() + + # Calculate accumulated alpha-beta products for each label class for + # each time step; used in calculating gradients + if tid == 1 + for i=1:S + labelIdx = labels[i] + accum[labelIdx, t] = log_plus_f(accum[labelIdx, t], output[i, t]) + end + end + sync_threads() + idx = tid + + # Calculate gradients + while idx <= size(grad, 1) + + # ∂L/∂a (where a is activation before logsoftmax) + grad[idx, t] = exp(probs[idx, t]) - exp(accum[idx, t] + loss) + idx += blockDim().x + end + sync_threads() + t -= 1 + sync_threads() + end + return nothing +end + +function ctc_alpha(ŷ::CuArray, y) + ŷ = logsoftmax(ŷ) + blank = size(ŷ, 1) + z′ = fill(blank, 2 * length(y) + 1) + z′[eachindex(y) .* 2] = y + T = size(ŷ, 2) + U′ = 2*length(y) + 1 + alphas = CUDA.fill(log(zero(ŷ[1])), U′,T) + nRepeats = count_repeats(y) + nThreads = min(U′, MAX_THREADS) + @cuda blocks=1 threads=nThreads compute_alpha_kernel(ŷ, length(y), T, nRepeats, CuArray(y), CuArray(z′), alphas, blank) + return (loss=-1 * logsumexp(alphas[end-1:end]), alpha=alphas, z′=z′, yhat=ŷ, nRepeats=nRepeats) +end + +ctc_loss(ŷ::CuArray, y) = ctc_alpha(ŷ::CuArray, y).loss + +function ∇ctc_loss(ŷ::CuArray, y, out) + loss, alphas, z′, ŷ, nRepeats = out + U′, T = size(alphas) + blank = size(ŷ, 1) + typed_zero = zero(first(ŷ)) + betas = CUDA.fill(log(typed_zero), U′, T) + output = CUDA.fill(log(typed_zero), U′, T) + nThreads = min(U′, MAX_THREADS) + grads = CUDA.fill(log(typed_zero), size(ŷ)) + accum = CUDA.fill(log(typed_zero), size(ŷ)) + @cuda blocks=1 threads=nThreads compute_beta_and_grad_kernel(ŷ, length(y), T, nRepeats, CuArray(z′), alphas, betas, output, accum, grads, blank, loss) + return grads +end diff --git a/lib/FluxCUDA/src/cudnn.jl b/lib/FluxCUDA/src/cudnn.jl new file mode 100644 index 0000000000..4a3b2618c8 --- /dev/null +++ b/lib/FluxCUDA/src/cudnn.jl @@ -0,0 +1,20 @@ +import NNlibCUDA: batchnorm, ∇batchnorm + +function (BN::Flux.BatchNorm)(x::Union{CuArray{T,2},CuArray{T,4},CuArray{T,5}}, + cache=nothing) where T<:Union{Float32, Float64} + + @assert BN.affine "BatchNorm: only 
affine=true supported on gpu" + @assert BN.track_stats "BatchNorm: only track_stats=true supported on gpu" + @assert length(BN.β) == size(x, ndims(x)-1) "BatchNorm: input has wronng number of channels" + return BN.λ.(batchnorm(BN.γ, BN.β, x, BN.μ, BN.σ², BN.momentum; + cache=cache, alpha=1, beta=0, eps=BN.ϵ, + training=Flux._isactive(BN))) +end + +@adjoint function batchnorm(g, b, x, running_mean, running_var, momentum; kw...) + y = batchnorm(g, b, x, running_mean, running_var, momentum; kw...) + function batchnorm_pullback(Δ) + ∇batchnorm(g, b, x, Δ, running_mean, running_var, momentum; kw...)..., nothing, nothing, nothing + end + y, batchnorm_pullback +end diff --git a/lib/FluxCUDA/src/onehot.jl b/lib/FluxCUDA/src/onehot.jl new file mode 100644 index 0000000000..760df1d27e --- /dev/null +++ b/lib/FluxCUDA/src/onehot.jl @@ -0,0 +1,5 @@ +import Flux: OneHotArray, OneHotLike, _onehot_bool_type + +_onehot_bool_type(x::OneHotLike{<:Any, <:Any, <:Any, N, <:CuArray}) where N = CuArray{Bool, N} + +Base.BroadcastStyle(::Type{<:OneHotArray{<: Any, <: Any, <: Any, N, <: CuArray}}) where N = CUDA.CuArrayStyle{N}() diff --git a/lib/FluxCUDA/test/.gitignore b/lib/FluxCUDA/test/.gitignore new file mode 100644 index 0000000000..c181d1f83f --- /dev/null +++ b/lib/FluxCUDA/test/.gitignore @@ -0,0 +1 @@ +test/Manifest.toml diff --git a/Manifest.toml b/lib/FluxCUDA/test/Manifest.toml similarity index 76% rename from Manifest.toml rename to lib/FluxCUDA/test/Manifest.toml index 4aad025690..51ed72ce90 100644 --- a/Manifest.toml +++ b/lib/FluxCUDA/test/Manifest.toml @@ -23,9 +23,26 @@ uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" [[Artifacts]] uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" +[[BFloat16s]] +deps = ["LinearAlgebra", "Test"] +git-tree-sha1 = "4af69e205efc343068dc8722b8dfec1ade89254a" +uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" +version = "0.1.0" + [[Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" +[[CEnum]] +git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" +uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" +version = "0.4.1" + +[[CUDA]] +deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "DataStructures", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "MacroTools", "Memoize", "Printf", "Random", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "Statistics", "TimerOutputs"] +git-tree-sha1 = "d4fa6486e94c4087f1d081d7be2d501a170bd51d" +uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" +version = "3.1.0" + [[ChainRules]] deps = ["ChainRulesCore", "Compat", "LinearAlgebra", "Random", "Reexport", "Requires", "Statistics"] git-tree-sha1 = "1f410fba5c04d03ab712f348f1542e6059376547" @@ -34,9 +51,9 @@ version = "0.7.61" [[ChainRulesCore]] deps = ["Compat", "LinearAlgebra", "SparseArrays"] -git-tree-sha1 = "42e3c181483fbd2c416087a0a93838803e358358" +git-tree-sha1 = "a66109c73612c63b10923ac446fddb0f0d21a593" uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" -version = "0.9.38" +version = "0.9.40" [[CodecZlib]] deps = ["TranscodingStreams", "Zlib_jll"] @@ -46,15 +63,15 @@ version = "0.7.0" [[ColorTypes]] deps = ["FixedPointNumbers", "Random"] -git-tree-sha1 = "32a2b8af383f11cbb65803883837a149d10dfe8a" +git-tree-sha1 = "024fe24d83e4a5bf5fc80501a314ce0d1aa35597" uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" -version = "0.10.12" +version = "0.11.0" [[Colors]] deps = ["ColorTypes", "FixedPointNumbers", "Reexport"] -git-tree-sha1 = "82f4e6ff9f847eca3e5ebc666ea2cd7b48e8b47e" +git-tree-sha1 = 
"417b0ed7b8b838aa6ca0a87aadf1bb9eb111ce40" uuid = "5ae59095-9a9b-59fe-a467-6f913c188581" -version = "0.12.7" +version = "0.12.8" [[CommonSubexpressions]] deps = ["MacroTools", "Test"] @@ -111,6 +128,11 @@ uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" deps = ["ArgTools", "LibCURL", "NetworkOptions"] uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" +[[ExprTools]] +git-tree-sha1 = "10407a39b87f29d47ebaca8edbc75d7c302ff93e" +uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" +version = "0.1.3" + [[FillArrays]] deps = ["LinearAlgebra", "Random", "SparseArrays"] git-tree-sha1 = "31939159aeb8ffad1d4d8ee44d07f8558273120a" @@ -123,6 +145,12 @@ git-tree-sha1 = "335bfdceacc84c5cdf16aadc768aa5ddfc5383cc" uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93" version = "0.8.4" +[[Flux]] +deps = ["AbstractTrees", "Adapt", "CUDA", "CodecZlib", "Colors", "DelimitedFiles", "Functors", "Juno", "LinearAlgebra", "MacroTools", "NNlib", "NNlibCUDA", "Pkg", "Printf", "Random", "Reexport", "SHA", "Statistics", "StatsBase", "Test", "ZipFile", "Zygote"] +git-tree-sha1 = "5e94fff7b4385fdd059863300b6b25ea0f849dda" +uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c" +version = "0.12.3" + [[ForwardDiff]] deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "LinearAlgebra", "NaNMath", "Printf", "Random", "SpecialFunctions", "StaticArrays"] git-tree-sha1 = "e2af66012e08966366a43251e1fd421522908be6" @@ -141,6 +169,12 @@ git-tree-sha1 = "9c95b2fd5c16bc7f97371e9f92f0fef77e0f5957" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" version = "6.2.2" +[[GPUCompiler]] +deps = ["DataStructures", "ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "Serialization", "TimerOutputs", "UUIDs"] +git-tree-sha1 = "6eadd2321dc3ac0fc9d530ab01c2caa7fe5d74c6" +uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" +version = "0.11.4" + [[IRTools]] deps = ["InteractiveUtils", "MacroTools", "Test"] git-tree-sha1 = "c67e7515a11f726f44083e74f218d134396d6510" @@ -163,6 +197,16 @@ git-tree-sha1 = "07cb43290a840908a771552911a6274bc6c072c7" uuid = "e5e0dc1b-0480-54bc-9374-aad01c23163d" version = "0.8.4" +[[LLVM]] +deps = ["CEnum", "Libdl", "Printf", "Unicode"] +git-tree-sha1 = "b616937c31337576360cb9fb872ec7633af7b194" +uuid = "929cbde3-209d-540e-8aea-75f648917ca0" +version = "3.6.0" + +[[LazyArtifacts]] +deps = ["Artifacts", "Pkg"] +uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" + [[LibCURL]] deps = ["LibCURL_jll", "MozillaCACerts_jll"] uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" @@ -209,11 +253,17 @@ git-tree-sha1 = "75a54abd10709c01f1b86b84ec225d26e840ed58" uuid = "e89f7d12-3494-54d1-8411-f7d8b9ae1f27" version = "0.5.0" +[[Memoize]] +deps = ["MacroTools"] +git-tree-sha1 = "2b1dfcba103de714d31c033b5dacc2e4a12c7caa" +uuid = "c03570c3-d221-55d1-a50c-7939bbd78826" +version = "0.4.4" + [[Missings]] deps = ["DataAPI"] -git-tree-sha1 = "f8c673ccc215eb50fcadb285f522420e29e69e1c" +git-tree-sha1 = "4ea90bd5d3985ae1f9a908bd4500ae88921c5ce7" uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" -version = "0.4.5" +version = "1.0.0" [[Mmap]] uuid = "a63ad114-7e13-5084-954f-fe012c677804" @@ -227,6 +277,12 @@ git-tree-sha1 = "80b8360670f445d88b3475e88b33bbcc92f7866e" uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" version = "0.7.19" +[[NNlibCUDA]] +deps = ["CUDA", "LinearAlgebra", "NNlib", "Random", "Statistics"] +git-tree-sha1 = "4b368b466bcdd25d448a5b20de4b7e481d68b88e" +uuid = "a00861dc-f156-4864-bf3c-e6376f28a68d" +version = "0.1.0" + [[NaNMath]] git-tree-sha1 = "bfe47e760d60b82b66b61d2d44128b62e3a369fb" uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3" @@ -237,9 +293,9 @@ uuid 
= "ca575930-c2e3-43a9-ace4-1e988b2c1908" [[OpenSpecFun_jll]] deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "9db77584158d0ab52307f8c04f8e7c08ca76b5b3" +git-tree-sha1 = "b9b8b8ed236998f91143938a760c2112dceeb2b4" uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" -version = "0.5.3+4" +version = "0.5.4+0" [[OrderedCollections]] git-tree-sha1 = "4fa2ba51070ec13fcc7517db714445b4ab986bdf" @@ -272,6 +328,12 @@ uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" deps = ["Serialization"] uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +[[RandomNumbers]] +deps = ["Random", "Requires"] +git-tree-sha1 = "441e6fc35597524ada7f85e13df1f4e10137d16f" +uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" +version = "1.4.0" + [[Reexport]] git-tree-sha1 = "57d8440b0c7d98fc4f889e478e80f268d534c9d5" uuid = "189a3867-3050-52da-a836-e630ba90ab69" @@ -286,6 +348,12 @@ version = "1.1.3" [[SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" +[[Scratch]] +deps = ["Dates"] +git-tree-sha1 = "ad4b278adb62d185bbcb6864dc24959ab0627bf6" +uuid = "6c6a2e73-6563-6170-7368-637461726353" +version = "1.0.3" + [[Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" @@ -314,9 +382,9 @@ version = "1.3.0" [[StaticArrays]] deps = ["LinearAlgebra", "Random", "Statistics"] -git-tree-sha1 = "e8cd1b100d37f5b4cfd2c83f45becf61c762eaf7" +git-tree-sha1 = "2653e9c769343808781a8bd5010ee7a17c01152e" uuid = "90137ffa-7385-5640-81b9-e52037218182" -version = "1.1.1" +version = "1.1.2" [[Statistics]] deps = ["LinearAlgebra", "SparseArrays"] @@ -324,9 +392,9 @@ uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [[StatsBase]] deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics"] -git-tree-sha1 = "4bc58880426274277a066de306ef19ecc22a6863" +git-tree-sha1 = "4d8ca45223d7a28839e775d73a6f6b6b2ac64fd1" uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" -version = "0.33.5" +version = "0.33.6" [[TOML]] deps = ["Dates"] @@ -340,6 +408,12 @@ uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +[[TimerOutputs]] +deps = ["Printf"] +git-tree-sha1 = "32cdbe6cd2d214c25a0b88f985c9e0092877c236" +uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" +version = "0.5.8" + [[TranscodingStreams]] deps = ["Random", "Test"] git-tree-sha1 = "7c53c35547de1c5b9d46a4797cf6d8253807108c" diff --git a/lib/FluxCUDA/test/Project.toml b/lib/FluxCUDA/test/Project.toml new file mode 100644 index 0000000000..8d9b755315 --- /dev/null +++ b/lib/FluxCUDA/test/Project.toml @@ -0,0 +1,7 @@ +[deps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" diff --git a/lib/FluxCUDA/test/cuda.jl b/lib/FluxCUDA/test/cuda.jl new file mode 100644 index 0000000000..65982836ab --- /dev/null +++ b/lib/FluxCUDA/test/cuda.jl @@ -0,0 +1,61 @@ +using LinearAlgebra + +@testset "CUDA" begin + x = randn(5, 5) + cx = gpu(x) + @test cx isa CuArray + + @test Flux.onecold(gpu([1.0, 2.0, 3.0])) == 3 + + x = Flux.onehotbatch([1, 2, 3], 1:3) + cx = gpu(x) + @test cx isa Flux.OneHotMatrix && cx.indices isa CuArray + @test (cx .+ 1) isa CuArray + + m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax) + cm = gpu(m) + + @test all(p isa CuArray for p in params(cm)) + @test 
cm(gpu(rand(10, 10))) isa CuArray{Float32,2} + + xs = rand(5, 5) + ys = Flux.onehotbatch(1:5,1:5) + @test collect(cu(xs) .+ cu(ys)) ≈ collect(xs .+ ys) + + c = gpu(Conv((2,2),3=>4)) + x = gpu(rand(10, 10, 3, 2)) + l = c(gpu(rand(10,10,3,2))) + @test gradient(x -> sum(c(x)), x)[1] isa CuArray + + c = gpu(CrossCor((2,2),3=>4)) + x = gpu(rand(10, 10, 3, 2)) + l = c(gpu(rand(10,10,3,2))) + @test gradient(x -> sum(c(x)), x)[1] isa CuArray + +end + +@testset "onecold gpu" begin + y = Flux.onehotbatch(ones(3), 1:10) |> gpu; + l = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'] + @test Flux.onecold(y) isa CuArray + @test y[3,:] isa CuArray + @test Flux.onecold(y, l) == ['a', 'a', 'a'] +end + +@testset "restructure gpu" begin + dudt = Dense(1,1) |> gpu + p,re = Flux.destructure(dudt) + foo(x) = sum(re(p)(x)) + @test gradient(foo, cu(rand(1)))[1] isa CuArray +end + +@testset "GPU functors" begin + @testset "Cholesky" begin + M = 2.0*I(10) |> collect + Q = cholesky(M) + Q_gpu = Q |> gpu + @test Q_gpu isa Cholesky{<:Any,<:CuArray} + Q_cpu = Q_gpu |> cpu + @test Q_cpu == cholesky(eltype(Q_gpu).(M)) + end +end diff --git a/lib/FluxCUDA/test/cudnn.jl b/lib/FluxCUDA/test/cudnn.jl new file mode 100644 index 0000000000..5d1727e984 --- /dev/null +++ b/lib/FluxCUDA/test/cudnn.jl @@ -0,0 +1,44 @@ +using Flux, CUDA, Test +using Flux: pullback + +@testset "CUDNN BatchNorm" begin + @testset "4D Input" begin + x = rand(Float32, 2, 2, 3, 4) + m = BatchNorm(3) + gx = gpu(x) + gm = gpu(m) + + y, back = pullback((m, x) -> m(x), m, x) + gy, gback = pullback((m, x) -> m(x), gm, gx) + + @test cpu(gy) ≈ y + + Δ = randn(Float32, size(y)) + dm, dx = back(Δ) + gdm, gdx = gback(gpu(Δ)) + + @test dm[].γ ≈ cpu(gdm[].γ) + @test dm[].β ≈ cpu(gdm[].β) + @test dx ≈ cpu(gdx) + end + + @testset "2D Input" begin + x = rand(Float32, 3, 4) + m = BatchNorm(3) + gx = gpu(x) + gm = gpu(m) + + y, back = pullback((m, x) -> m(x), m, x) + gy, gback = pullback((m, x) -> m(x), gm, gx) + + @test cpu(gy) ≈ y + + Δ = randn(Float32, size(y)) + dm, dx = back(Δ) + gdm, gdx = gback(gpu(Δ)) + + @test dm[].γ ≈ cpu(gdm[].γ) + @test dm[].β ≈ cpu(gdm[].β) + @test dx ≈ cpu(gdx) + end +end diff --git a/lib/FluxCUDA/test/curnn.jl b/lib/FluxCUDA/test/curnn.jl new file mode 100644 index 0000000000..63a5f93ada --- /dev/null +++ b/lib/FluxCUDA/test/curnn.jl @@ -0,0 +1,58 @@ +using Flux, CUDA, Test + +@testset for R in [RNN, GRU, LSTM] + m = R(10, 5) |> gpu + x = gpu(rand(10)) + (m̄,) = gradient(m -> sum(m(x)), m) + Flux.reset!(m) + θ = gradient(() -> sum(m(x)), params(m)) + @test x isa CuArray + @test θ[m.cell.Wi] isa CuArray + @test collect(m̄[].cell.Wi) == collect(θ[m.cell.Wi]) +end + +@testset "RNN" begin + @testset for R in [RNN, GRU, LSTM], batch_size in (1, 5) + rnn = R(10, 5) + curnn = fmap(gpu, rnn) + + Flux.reset!(rnn) + Flux.reset!(curnn) + x = batch_size == 1 ? + rand(Float32, 10) : + rand(Float32, 10, batch_size) + cux = gpu(x) + + y, back = pullback((r, x) -> r(x), rnn, x) + cuy, cuback = pullback((r, x) -> r(x), curnn, cux) + + @test y ≈ collect(cuy) + + ȳ = randn(size(y)) + m̄, x̄ = back(ȳ) + cum̄, cux̄ = cuback(gpu(ȳ)) + + @test x̄ ≈ collect(cux̄) + @test m̄[].cell.Wi ≈ collect(cum̄[].cell.Wi) + @test m̄[].cell.Wh ≈ collect(cum̄[].cell.Wh) + @test m̄[].cell.b ≈ collect(cum̄[].cell.b) + if m̄[].state isa Tuple + for (x, cx) in zip(m̄[].state, cum̄[].state) + @test x ≈ collect(cx) + end + else + @test m̄[].state ≈ collect(cum̄[].state) + end + + Flux.reset!(rnn) + Flux.reset!(curnn) + ohx = batch_size == 1 ? 
+ Flux.onehot(rand(1:10), 1:10) : + Flux.onehotbatch(rand(1:10, batch_size), 1:10) + cuohx = gpu(ohx) + y = (rnn(ohx); rnn(ohx)) + + cuy = (curnn(cuohx); curnn(cuohx)) + @test y ≈ collect(cuy) + end +end diff --git a/lib/FluxCUDA/test/layers.jl b/lib/FluxCUDA/test/layers.jl new file mode 100644 index 0000000000..78dc9aa7d2 --- /dev/null +++ b/lib/FluxCUDA/test/layers.jl @@ -0,0 +1,261 @@ +# Test layers and data/model movements on and off the GPU +# Add tests for layers and their gradients on the GPU +# Most of the forward passes should be fine being applied +# to bitstype objects, but this gives higher coverage for our use-cases +# Check that getting the gradients does not throw + +# generic movement tests +@testset "Basic GPU Movement" begin + @test gradient(x -> sum(gpu(x)), rand(3,3)) isa Tuple + @test gradient(x -> sum(cpu(x)), gpu(rand(3,3))) isa Tuple +end + +# TODO: These layers get into scalar indexing +# `AlphaDropout` throws a compilation error on GPUs, +# whereas, the rest are scalar indexing issues. +# The norm layers behave differently on the CPU and +# the GPU too. +const BROKEN_LAYERS = Union{DepthwiseConv, + AlphaDropout} + +const ACTIVATIONS = [identity, relu, tanh, + sigmoid, exp, softplus, + elu, selu] + +function gpu_gradtest(name::String, layers::Vector, x_cpu = nothing, args...; test_cpu = true) + isnothing(x_cpu) && error("Missing input to test the layers against.") + @testset "$name GPU grad tests" begin + for layer in layers + @testset "$layer Layer GPU grad test" begin + + # compute output and grad of parameters + l_cpu = layer(args...) + ps_cpu = Flux.params(l_cpu) + y_cpu, back_cpu = pullback(() -> sum(l_cpu(x_cpu)), ps_cpu) + gs_cpu = back_cpu(1f0) + + x_gpu = gpu(x_cpu) + l_gpu = l_cpu |> gpu + ps_gpu = Flux.params(l_gpu) + + if typeof(l_gpu) <: BROKEN_LAYERS + @test_broken gradient(() -> sum(l_gpu(x_gpu)), ps_gpu) isa Zygote.Grads + else + y_gpu, back_gpu = pullback(() -> sum(l_gpu(x_gpu)), ps_gpu) + gs_gpu = back_gpu(1f0) # TODO many layers error out when backprop int 1, should fix + + # compute grad of input + xg_cpu = gradient(x -> sum(l_cpu(x)), x_cpu)[1] + xg_gpu = gradient(x -> sum(l_gpu(x)), x_gpu)[1] + + # test + if test_cpu + @test y_gpu ≈ y_cpu rtol=1f-3 atol=1f-3 + @test Array(xg_gpu) ≈ xg_cpu rtol=1f-3 atol=1f-3 + end + @test gs_gpu isa Zygote.Grads + for (p_cpu, p_gpu) in zip(ps_cpu, ps_gpu) + @test gs_gpu[p_gpu] isa CUDA.CuArray + if test_cpu + @test Array(gs_gpu[p_gpu]) ≈ gs_cpu[p_cpu] rtol=1f-3 atol=1f-3 + end + end + end + end + end + end +end + +# Just to give testset in gpu_gradtest meaningful labels +ConvNoBias(args...) = Conv(args...; bias = false) +ConvTransposeNoBias(args...) = ConvTranspose(args...; bias = false) +CrossCorNoBias(args...) = CrossCor(args...; bias = false) +DepthwiseConvNoBias(args...) 
= DepthwiseConv(args...; bias = false) + +for act in ACTIVATIONS + r = rand(Float32, 28, 28, 1, 1) + conv_layers = [Conv, ConvNoBias, + ConvTranspose, ConvTransposeNoBias, + CrossCor, CrossCorNoBias, + DepthwiseConv, DepthwiseConvNoBias] + gpu_gradtest("Convolution with $act", conv_layers, r, (2,2), 1=>3, act, test_cpu = false) + + batch_norm = [BatchNorm] + gpu_gradtest("BatchNorm 1 with $act", batch_norm, rand(Float32, 28,28,3,4), 3, act, test_cpu = false) #TODO fix errors + gpu_gradtest("BatchNorm 2 with $act", batch_norm, rand(Float32, 5,4), 5, act, test_cpu = false) + + instancenorm = [InstanceNorm] + gpu_gradtest("InstanceNorm with $act", instancenorm, r, 1, act, test_cpu = false) + + groupnorm = [GroupNorm] + gpu_gradtest("GroupNorm with $act", groupnorm, rand(Float32, 28,28,3,1), 3, 1, act, test_cpu = false) +end + +r = rand(Float32, 28, 28, 1, 1) + +pooling_layers = [MaxPool, MeanPool] +gpu_gradtest("Pooling", pooling_layers, r, (2,2)) + +adaptive_pooling_layers = [AdaptiveMaxPool, AdaptiveMeanPool] +gpu_gradtest("AdaptivePooling", adaptive_pooling_layers, r, (7,7), test_cpu = false) + +dropout_layers = [Dropout, AlphaDropout] +gpu_gradtest("Dropout", dropout_layers, r, 0.5f0; test_cpu = false) # dropout is not deterministic + +layer_norm = [LayerNorm] +gpu_gradtest("LayerNorm 1", layer_norm, rand(Float32, 28,28,3,4), 1, test_cpu = false) #TODO fix errors +gpu_gradtest("LayerNorm 2", layer_norm, rand(Float32, 5,4), 5) + +upsample = [x -> Upsample(scale=x)] +gpu_gradtest("Upsample 2d", upsample, rand(Float32, 3, 4, 2, 3), (2,2)) +gpu_gradtest("Upsample 1d", upsample, rand(Float32, 3, 4, 2, 3), (2,)) + +pixelshuffle = [PixelShuffle] +gpu_gradtest("PixelShuffle 2d", pixelshuffle, rand(Float32, 3, 4, 18, 3), 3) +gpu_gradtest("PixelShuffle 1d", pixelshuffle, rand(Float32, 3, 18, 3), 3) + +@testset "function layers" begin + x = rand(Float32, 3,3) + gpu_autodiff_test(x -> sum(Flux.normalise(x; dims=1)), x) + gpu_autodiff_test(x -> sum(Flux.normalise(x; dims=2)), x) + gpu_autodiff_test(x -> sum(Flux.normalise(x)), x) +end + +@testset "Zeros mapped for $cl" for cl in (Conv, ConvTranspose, CrossCor, DepthwiseConv) + l = cl((2,2), 1=>3, bias = false) |> gpu + ip = zeros(Float32, 28,28,1,1) |> gpu + if typeof(l) <: BROKEN_LAYERS + @test_broken sum(l(ip)) ≈ 0.f0 + @test_broken gradient(() -> sum(l(ip)), Flux.params(l)) isa Zygote.Grads + else + @test sum(l(ip)) ≈ 0.f0 + gs = gradient(() -> sum(l(ip)), Flux.params(l)) + @test l.bias ∉ gs.params + end +end + +@testset "Dense with Zeros bias" begin + l = Dense(ones(Float32, 4,3), Flux.Zeros()) |> gpu + ip = zeros(Float32, 3, 7) |> gpu + + @test sum(l(ip)) ≈ 0.f0 + gs = gradient(() -> sum(l(ip)), Flux.params(l)) + @test l.b ∉ gs.params +end + +@testset "Extended BatchNorm" begin + m_cpu = BatchNorm(2) + m_gpu = m_cpu |> gpu + x_cpu = rand(Float32, 3, 2, 2) + x_gpu = x_cpu |> gpu + + ## In :auto mode, track statistics only in gradient contest + μ_cpu = copy(m_cpu.μ) + m_cpu(x_cpu) + @test m_cpu.μ ≈ μ_cpu + gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) + @test !(m_cpu.μ ≈ μ_cpu) + + μ_gpu = copy(m_gpu.μ) + m_gpu(x_gpu) + @test m_gpu.μ ≈ μ_gpu + gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) + @test !(m_gpu.μ ≈ μ_gpu) + + @test Array(m_gpu.μ) ≈ m_cpu.μ + + ## In testmode, never track statistics + testmode!(m_cpu) + μ_cpu = copy(m_cpu.μ) + m_cpu(x_cpu) + @test m_cpu.μ ≈ μ_cpu + gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) + @test m_cpu.μ ≈ μ_cpu + + testmode!(m_gpu) + μ_gpu = copy(m_gpu.μ) + m_gpu(x_gpu) + @test m_gpu.μ ≈ μ_gpu 
+ gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) + @test m_gpu.μ ≈ μ_gpu + + ## In trainmode, always track statistics + trainmode!(m_cpu) + μ_cpu = copy(m_cpu.μ) + m_cpu(x_cpu) + @test !(m_cpu.μ ≈ μ_cpu) + μ_cpu = copy(m_cpu.μ) + gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) + @test !(m_cpu.μ ≈ μ_cpu) + + trainmode!(m_gpu) + μ_gpu = copy(m_gpu.μ) + m_gpu(x_gpu) + @test !(m_gpu.μ ≈ μ_gpu) + μ_gpu = copy(m_gpu.μ) + gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) + @test !(m_gpu.μ ≈ μ_gpu) + + ## No errors if input type mistmatch + # x_cpu = rand(Float64, 3, 2, 2) + # x_gpu = x_cpu |> gpu + # m_cpu(x_cpu) + # gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) + # m_gpu(x_gpu) + # gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) +end + +@testset "Two-streams Bilinear" begin + x = zeros(Float32,10,9) |> gpu + y = zeros(Float32,2,9) |> gpu + b = Flux.Bilinear(10, 2, 3) |> gpu + @test size(b(x,y)) == (3,9) + @test sum(abs2, b(x,y)) ≈ 0f0 + gs_gpu = gradient(() -> sum(abs2.(b(x, y))), params(b)) + b_cpu, x_cpu, y_cpu = b |> cpu, x |> cpu, y |> cpu + gs_cpu = gradient(() -> sum(abs2.(b_cpu(x_cpu, y_cpu))), params(b_cpu)) + for (pgpu, pcpu) in zip(params(b), params(b_cpu)) + @test gs_cpu[pcpu] ≈ Array(gs_gpu[pgpu]) + end +end + +@testset "Two-streams Bilinear" begin + x = zeros(Float32,10,9) |> gpu + y = zeros(Float32,2,9) |> gpu + b = Flux.Bilinear(10, 2, 3) |> gpu + @test size(b(x,y)) == (3,9) + @test sum(abs2, b(x,y)) ≈ 0f0 + gs_gpu = gradient(() -> sum(abs2.(b(x, y))), params(b)) + b_cpu, x_cpu, y_cpu = b |> cpu, x |> cpu, y |> cpu + gs_cpu = gradient(() -> sum(abs2.(b_cpu(x_cpu, y_cpu))), params(b_cpu)) + for (pgpu, pcpu) in zip(params(b), params(b_cpu)) + @test gs_cpu[pcpu] ≈ Array(gs_gpu[pgpu]) + end +end + +@testset "Parallel" begin + @testset "zero sum" begin + input = randn(10, 10, 10, 10) |> gpu + layer_gpu = Parallel(+, zero, identity) |> gpu + @test layer_gpu(input) == input + @test layer_gpu(input) isa CUDA.CuArray + end + + @testset "vararg input" begin + inputs = (randn(10), randn(5), randn(4)) .|> gpu + layer = Parallel(+, Dense(10, 2), Dense(5, 2), Dense(4, 2)) |> gpu + @test size(layer(inputs)) == (2,) + end + + @testset "gradient" begin + input_cpu = randn(10, 10, 10, 10) + input_gpu = input_cpu |> gpu + layer_cpu = Parallel(+, x -> zero(x), identity) + layer_gpu = layer_cpu |> gpu + gs_cpu = gradient(() -> sum(abs2.(layer_cpu(input_cpu))), params(layer_cpu)) + gs_gpu = gradient(() -> sum(abs2.(layer_gpu(input_gpu))), params(layer_gpu)) + for (pgpu, pcpu) in zip(params(layer_cpu), params(layer_gpu)) + @test gs_cpu[pcpu] ≈ gs_gpu[pgpu] + end + end +end diff --git a/lib/FluxCUDA/test/losses.jl b/lib/FluxCUDA/test/losses.jl new file mode 100644 index 0000000000..6ed5089f9e --- /dev/null +++ b/lib/FluxCUDA/test/losses.jl @@ -0,0 +1,52 @@ +using Statistics + +using Flux.Losses: crossentropy, binarycrossentropy, logitbinarycrossentropy, binary_focal_loss, focal_loss + +# XXX: duplicated from Flux' tests +const ALL_LOSSES = [Flux.Losses.mse, Flux.Losses.mae, Flux.Losses.msle, + Flux.Losses.crossentropy, Flux.Losses.logitcrossentropy, + Flux.Losses.binarycrossentropy, Flux.Losses.logitbinarycrossentropy, + Flux.Losses.kldivergence, + Flux.Losses.huber_loss, + Flux.Losses.tversky_loss, + Flux.Losses.dice_coeff_loss, + Flux.Losses.poisson_loss, + Flux.Losses.hinge_loss, Flux.Losses.squared_hinge_loss, + Flux.Losses.binary_focal_loss, Flux.Losses.focal_loss] + + +@testset "Losses" begin + +x = [1.,2.,3.] 
+cx = gpu(x) +@test crossentropy(x,x) ≈ crossentropy(cx,cx) +@test crossentropy(x,x, agg=identity) ≈ crossentropy(cx,cx, agg=identity) |> cpu +@test crossentropy(x,x, agg=x->mean([1.0;2.0;3.0].*x)) ≈ crossentropy(cx,cx, agg=x->mean(gpu([1.0;2.0;3.0]).*x)) + +x = [-1.1491, 0.8619, 0.3127] +y = [1, 1, 0.] +@test binarycrossentropy(σ.(x), y) ≈ binarycrossentropy(gpu(σ.(x)), gpu(y)) +@test logitbinarycrossentropy(x, y) ≈ logitbinarycrossentropy(gpu(x), gpu(y)) + +x = [0.268941 0.5 0.268941 + 0.731059 0.5 0.731059] +y = [0 1 0 + 1 0 1] +@test binary_focal_loss(x, y) ≈ binary_focal_loss(gpu(x), gpu(y)) + +x = softmax(reshape(-7:7, 3, 5) .* 1f0) +y = [1 0 0 0 1 + 0 1 0 1 0 + 0 0 1 0 0] +@test focal_loss(x, y) ≈ focal_loss(gpu(x), gpu(y)) + +@testset "GPU grad tests" begin + x = rand(Float32, 3,3) + y = rand(Float32, 3,3) + + for loss in ALL_LOSSES + gpu_autodiff_test(loss, x, y) + end +end + +end #testset diff --git a/lib/FluxCUDA/test/runtests.jl b/lib/FluxCUDA/test/runtests.jl new file mode 100644 index 0000000000..40ba98e028 --- /dev/null +++ b/lib/FluxCUDA/test/runtests.jl @@ -0,0 +1,23 @@ +using Test + +using CUDA +CUDA.allowscalar(false) + +using Flux, FluxCUDA +Flux.default_gpu_converter[] = cu + +using Zygote +using Zygote: pullback + +include("test_utils.jl") +include("cuda.jl") +include("losses.jl") +include("layers.jl") + +if CUDA.has_cudnn() + @info "Testing Flux/CUDNN" + include("cudnn.jl") + include("curnn.jl") +else + @warn "CUDNN unavailable, not testing GPU DNN support" +end diff --git a/lib/FluxCUDA/test/test_utils.jl b/lib/FluxCUDA/test/test_utils.jl new file mode 100644 index 0000000000..bc0db37474 --- /dev/null +++ b/lib/FluxCUDA/test/test_utils.jl @@ -0,0 +1,72 @@ +function check_grad(g_gpu, g_cpu, atol, rtol) + @show g_gpu g_cpu + @test false +end +check_grad(g_gpu::Base.RefValue, g_cpu::Base.RefValue, atol, rtol) = + check_grad(g_gpu[], g_cpu[], atol, rtol) +check_grad(g_gpu::Nothing, g_cpu::Nothing, atol, rtol) = @test true +check_grad(g_gpu::Float32, g_cpu::Float32, atol, rtol) = @test g_cpu ≈ g_gpu rtol=rtol atol=atol +check_grad(g_gpu::CuArray{Float32}, g_cpu::Array{Float32}, atol, rtol) = + @test g_cpu ≈ collect(g_gpu) rtol=rtol atol=atol + +function check_grad(g_gpu::Tuple, g_cpu::Tuple, atol, rtol) + for (v1, v2) in zip(g_gpu, g_cpu) + check_grad(v1, v2, atol, rtol) + end +end + +function check_grad(g_gpu::NamedTuple, g_cpu::NamedTuple, atol, rtol) + for ((k1,v1), (k2,v2)) in zip(pairs(g_gpu), pairs(g_cpu)) + @test k1 == k2 + # @show k2 v2 + check_grad(v1, v2, atol, rtol) + end +end + +function gpu_autodiff_test(f_cpu, xs_cpu::Array{Float32}...; + test_equal=true, rtol=1e-4, atol=1e-4) + + check_type(x) = false + check_type(x::Float32) = true + check_type(x::CuArray{Float32}) = true + check_type(x::Array{Float32}) = true + + ### GRADIENT WITH RESPECT TO INPUT ##### + # y_cpu, back_cpu = pullback((f, x...) -> f(x...), f_cpu, xs_cpu...) + y_cpu, back_cpu = pullback((x...) -> f_cpu(x...), xs_cpu...) + @test check_type(y_cpu) + Δ_cpu = size(y_cpu) == () ? randn(Float32) : randn(Float32, size(y_cpu)) + gs_cpu = back_cpu(Δ_cpu) + + f_gpu = f_cpu |> gpu + xs_gpu = gpu.(xs_cpu) + Δ_gpu = Δ_cpu |> gpu + # y_gpu, back_gpu = pullback((f, x...) -> f(x...), f_gpu, xs_gpu...) + y_gpu, back_gpu = pullback((x...) -> f_gpu(x...), xs_gpu...) 
+ @test check_type(y_gpu) + gs_gpu = back_gpu(Δ_gpu) + + if test_equal + @test collect(y_cpu) ≈ collect(y_gpu) rtol=rtol atol=atol + for (g_gpu, g_cpu) in zip(gs_gpu, gs_cpu) + check_grad(g_gpu, g_cpu, atol, rtol) + end + end + + ### GRADIENT WITH RESPECT TO f ##### + ps_cpu = Flux.params(f_cpu) + y_cpu, back_cpu = pullback(() -> f_cpu(xs_cpu...), ps_cpu) + gs_cpu = back_cpu(Δ_cpu) + + ps_gpu = Flux.params(f_gpu) + y_gpu, back_gpu = pullback(() -> f_gpu(xs_gpu...), ps_gpu) + gs_gpu = back_gpu(Δ_gpu) + + if test_equal + @test collect(y_cpu) ≈ collect(y_gpu) rtol=rtol atol=atol + @assert length(ps_gpu) == length(ps_cpu) + for (p_gpu, p_cpu) in zip(ps_gpu, ps_cpu) + check_grad(gs_gpu[p_gpu], gs_cpu[p_cpu], atol, rtol) + end + end +end From 3e5803f58265bf30053c992a43b7173d2909f7f7 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 29 Apr 2021 15:50:48 +0200 Subject: [PATCH 05/11] Load Flux and CUDA via FluxCUDA. Avoids a needless listing in test/Project.toml, making it possible to pick up development versions. --- lib/FluxCUDA/test/Manifest.toml | 458 -------------------------------- lib/FluxCUDA/test/Project.toml | 2 - lib/FluxCUDA/test/cudnn.jl | 3 - lib/FluxCUDA/test/curnn.jl | 2 - lib/FluxCUDA/test/losses.jl | 2 +- lib/FluxCUDA/test/runtests.jl | 7 +- 6 files changed, 6 insertions(+), 468 deletions(-) delete mode 100644 lib/FluxCUDA/test/Manifest.toml diff --git a/lib/FluxCUDA/test/Manifest.toml b/lib/FluxCUDA/test/Manifest.toml deleted file mode 100644 index 51ed72ce90..0000000000 --- a/lib/FluxCUDA/test/Manifest.toml +++ /dev/null @@ -1,458 +0,0 @@ -# This file is machine-generated - editing it directly is not advised - -[[AbstractFFTs]] -deps = ["LinearAlgebra"] -git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0" -uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" -version = "1.0.1" - -[[AbstractTrees]] -git-tree-sha1 = "03e0550477d86222521d254b741d470ba17ea0b5" -uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" -version = "0.3.4" - -[[Adapt]] -deps = ["LinearAlgebra"] -git-tree-sha1 = "f1b523983a58802c4695851926203b36e28f09db" -uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" -version = "3.3.0" - -[[ArgTools]] -uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" - -[[Artifacts]] -uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" - -[[BFloat16s]] -deps = ["LinearAlgebra", "Test"] -git-tree-sha1 = "4af69e205efc343068dc8722b8dfec1ade89254a" -uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" -version = "0.1.0" - -[[Base64]] -uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" - -[[CEnum]] -git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" -uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" -version = "0.4.1" - -[[CUDA]] -deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "DataStructures", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "MacroTools", "Memoize", "Printf", "Random", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "Statistics", "TimerOutputs"] -git-tree-sha1 = "d4fa6486e94c4087f1d081d7be2d501a170bd51d" -uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" -version = "3.1.0" - -[[ChainRules]] -deps = ["ChainRulesCore", "Compat", "LinearAlgebra", "Random", "Reexport", "Requires", "Statistics"] -git-tree-sha1 = "1f410fba5c04d03ab712f348f1542e6059376547" -uuid = "082447d4-558c-5d27-93f4-14fc19e9eca2" -version = "0.7.61" - -[[ChainRulesCore]] -deps = ["Compat", "LinearAlgebra", "SparseArrays"] -git-tree-sha1 = "a66109c73612c63b10923ac446fddb0f0d21a593" -uuid = 
"d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" -version = "0.9.40" - -[[CodecZlib]] -deps = ["TranscodingStreams", "Zlib_jll"] -git-tree-sha1 = "ded953804d019afa9a3f98981d99b33e3db7b6da" -uuid = "944b1d66-785c-5afd-91f1-9de20f533193" -version = "0.7.0" - -[[ColorTypes]] -deps = ["FixedPointNumbers", "Random"] -git-tree-sha1 = "024fe24d83e4a5bf5fc80501a314ce0d1aa35597" -uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" -version = "0.11.0" - -[[Colors]] -deps = ["ColorTypes", "FixedPointNumbers", "Reexport"] -git-tree-sha1 = "417b0ed7b8b838aa6ca0a87aadf1bb9eb111ce40" -uuid = "5ae59095-9a9b-59fe-a467-6f913c188581" -version = "0.12.8" - -[[CommonSubexpressions]] -deps = ["MacroTools", "Test"] -git-tree-sha1 = "7b8a93dba8af7e3b42fecabf646260105ac373f7" -uuid = "bbf7d656-a473-5ed7-a52c-81e309532950" -version = "0.3.0" - -[[Compat]] -deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] -git-tree-sha1 = "ac4132ad78082518ec2037ae5770b6e796f7f956" -uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "3.27.0" - -[[CompilerSupportLibraries_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" - -[[DataAPI]] -git-tree-sha1 = "dfb3b7e89e395be1e25c2ad6d7690dc29cc53b1d" -uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" -version = "1.6.0" - -[[DataStructures]] -deps = ["Compat", "InteractiveUtils", "OrderedCollections"] -git-tree-sha1 = "4437b64df1e0adccc3e5d1adbc3ac741095e4677" -uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -version = "0.18.9" - -[[Dates]] -deps = ["Printf"] -uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" - -[[DelimitedFiles]] -deps = ["Mmap"] -uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" - -[[DiffResults]] -deps = ["StaticArrays"] -git-tree-sha1 = "c18e98cba888c6c25d1c3b048e4b3380ca956805" -uuid = "163ba53b-c6d8-5494-b064-1a9d43ac40c5" -version = "1.0.3" - -[[DiffRules]] -deps = ["NaNMath", "Random", "SpecialFunctions"] -git-tree-sha1 = "214c3fcac57755cfda163d91c58893a8723f93e9" -uuid = "b552c78f-8df3-52c6-915a-8e097449b14b" -version = "1.0.2" - -[[Distributed]] -deps = ["Random", "Serialization", "Sockets"] -uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" - -[[Downloads]] -deps = ["ArgTools", "LibCURL", "NetworkOptions"] -uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" - -[[ExprTools]] -git-tree-sha1 = "10407a39b87f29d47ebaca8edbc75d7c302ff93e" -uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" -version = "0.1.3" - -[[FillArrays]] -deps = ["LinearAlgebra", "Random", "SparseArrays"] -git-tree-sha1 = "31939159aeb8ffad1d4d8ee44d07f8558273120a" -uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" -version = "0.11.7" - -[[FixedPointNumbers]] -deps = ["Statistics"] -git-tree-sha1 = "335bfdceacc84c5cdf16aadc768aa5ddfc5383cc" -uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93" -version = "0.8.4" - -[[Flux]] -deps = ["AbstractTrees", "Adapt", "CUDA", "CodecZlib", "Colors", "DelimitedFiles", "Functors", "Juno", "LinearAlgebra", "MacroTools", "NNlib", "NNlibCUDA", "Pkg", "Printf", "Random", "Reexport", "SHA", "Statistics", "StatsBase", "Test", "ZipFile", "Zygote"] -git-tree-sha1 = "5e94fff7b4385fdd059863300b6b25ea0f849dda" -uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c" -version = "0.12.3" - -[[ForwardDiff]] -deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "LinearAlgebra", "NaNMath", "Printf", "Random", "SpecialFunctions", "StaticArrays"] -git-tree-sha1 = 
"e2af66012e08966366a43251e1fd421522908be6" -uuid = "f6369f11-7733-5829-9624-2563aa707210" -version = "0.10.18" - -[[Functors]] -deps = ["MacroTools"] -git-tree-sha1 = "a7bb2af991c43dcf5c3455d276dd83976799634f" -uuid = "d9f16b24-f501-4c13-a1f2-28368ffc5196" -version = "0.2.1" - -[[GPUArrays]] -deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization"] -git-tree-sha1 = "9c95b2fd5c16bc7f97371e9f92f0fef77e0f5957" -uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" -version = "6.2.2" - -[[GPUCompiler]] -deps = ["DataStructures", "ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "Serialization", "TimerOutputs", "UUIDs"] -git-tree-sha1 = "6eadd2321dc3ac0fc9d530ab01c2caa7fe5d74c6" -uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" -version = "0.11.4" - -[[IRTools]] -deps = ["InteractiveUtils", "MacroTools", "Test"] -git-tree-sha1 = "c67e7515a11f726f44083e74f218d134396d6510" -uuid = "7869d1d1-7146-5819-86e3-90919afe41df" -version = "0.4.2" - -[[InteractiveUtils]] -deps = ["Markdown"] -uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" - -[[JLLWrappers]] -deps = ["Preferences"] -git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" -uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" -version = "1.3.0" - -[[Juno]] -deps = ["Base64", "Logging", "Media", "Profile"] -git-tree-sha1 = "07cb43290a840908a771552911a6274bc6c072c7" -uuid = "e5e0dc1b-0480-54bc-9374-aad01c23163d" -version = "0.8.4" - -[[LLVM]] -deps = ["CEnum", "Libdl", "Printf", "Unicode"] -git-tree-sha1 = "b616937c31337576360cb9fb872ec7633af7b194" -uuid = "929cbde3-209d-540e-8aea-75f648917ca0" -version = "3.6.0" - -[[LazyArtifacts]] -deps = ["Artifacts", "Pkg"] -uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" - -[[LibCURL]] -deps = ["LibCURL_jll", "MozillaCACerts_jll"] -uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" - -[[LibCURL_jll]] -deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] -uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" - -[[LibGit2]] -deps = ["Base64", "NetworkOptions", "Printf", "SHA"] -uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" - -[[LibSSH2_jll]] -deps = ["Artifacts", "Libdl", "MbedTLS_jll"] -uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" - -[[Libdl]] -uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" - -[[LinearAlgebra]] -deps = ["Libdl"] -uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" - -[[Logging]] -uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" - -[[MacroTools]] -deps = ["Markdown", "Random"] -git-tree-sha1 = "6a8a2a625ab0dea913aba95c11370589e0239ff0" -uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" -version = "0.5.6" - -[[Markdown]] -deps = ["Base64"] -uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" - -[[MbedTLS_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" - -[[Media]] -deps = ["MacroTools", "Test"] -git-tree-sha1 = "75a54abd10709c01f1b86b84ec225d26e840ed58" -uuid = "e89f7d12-3494-54d1-8411-f7d8b9ae1f27" -version = "0.5.0" - -[[Memoize]] -deps = ["MacroTools"] -git-tree-sha1 = "2b1dfcba103de714d31c033b5dacc2e4a12c7caa" -uuid = "c03570c3-d221-55d1-a50c-7939bbd78826" -version = "0.4.4" - -[[Missings]] -deps = ["DataAPI"] -git-tree-sha1 = "4ea90bd5d3985ae1f9a908bd4500ae88921c5ce7" -uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" -version = "1.0.0" - -[[Mmap]] -uuid = "a63ad114-7e13-5084-954f-fe012c677804" - -[[MozillaCACerts_jll]] -uuid = "14a3606d-f60d-562e-9121-12d972cd8159" - -[[NNlib]] -deps = ["Adapt", "ChainRulesCore", "Compat", "LinearAlgebra", "Pkg", "Requires", "Statistics"] -git-tree-sha1 = 
"80b8360670f445d88b3475e88b33bbcc92f7866e" -uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" -version = "0.7.19" - -[[NNlibCUDA]] -deps = ["CUDA", "LinearAlgebra", "NNlib", "Random", "Statistics"] -git-tree-sha1 = "4b368b466bcdd25d448a5b20de4b7e481d68b88e" -uuid = "a00861dc-f156-4864-bf3c-e6376f28a68d" -version = "0.1.0" - -[[NaNMath]] -git-tree-sha1 = "bfe47e760d60b82b66b61d2d44128b62e3a369fb" -uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3" -version = "0.3.5" - -[[NetworkOptions]] -uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" - -[[OpenSpecFun_jll]] -deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "b9b8b8ed236998f91143938a760c2112dceeb2b4" -uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" -version = "0.5.4+0" - -[[OrderedCollections]] -git-tree-sha1 = "4fa2ba51070ec13fcc7517db714445b4ab986bdf" -uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.4.0" - -[[Pkg]] -deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] -uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" - -[[Preferences]] -deps = ["TOML"] -git-tree-sha1 = "ea79e4c9077208cd3bc5d29631a26bc0cff78902" -uuid = "21216c6a-2e73-6563-6e65-726566657250" -version = "1.2.1" - -[[Printf]] -deps = ["Unicode"] -uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" - -[[Profile]] -deps = ["Printf"] -uuid = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79" - -[[REPL]] -deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] -uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" - -[[Random]] -deps = ["Serialization"] -uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" - -[[RandomNumbers]] -deps = ["Random", "Requires"] -git-tree-sha1 = "441e6fc35597524ada7f85e13df1f4e10137d16f" -uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" -version = "1.4.0" - -[[Reexport]] -git-tree-sha1 = "57d8440b0c7d98fc4f889e478e80f268d534c9d5" -uuid = "189a3867-3050-52da-a836-e630ba90ab69" -version = "1.0.0" - -[[Requires]] -deps = ["UUIDs"] -git-tree-sha1 = "4036a3bd08ac7e968e27c203d45f5fff15020621" -uuid = "ae029012-a4dd-5104-9daa-d747884805df" -version = "1.1.3" - -[[SHA]] -uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" - -[[Scratch]] -deps = ["Dates"] -git-tree-sha1 = "ad4b278adb62d185bbcb6864dc24959ab0627bf6" -uuid = "6c6a2e73-6563-6170-7368-637461726353" -version = "1.0.3" - -[[Serialization]] -uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" - -[[SharedArrays]] -deps = ["Distributed", "Mmap", "Random", "Serialization"] -uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" - -[[Sockets]] -uuid = "6462fe0b-24de-5631-8697-dd941f90decc" - -[[SortingAlgorithms]] -deps = ["DataStructures", "Random", "Test"] -git-tree-sha1 = "03f5898c9959f8115e30bc7226ada7d0df554ddd" -uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c" -version = "0.3.1" - -[[SparseArrays]] -deps = ["LinearAlgebra", "Random"] -uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" - -[[SpecialFunctions]] -deps = ["ChainRulesCore", "OpenSpecFun_jll"] -git-tree-sha1 = "5919936c0e92cff40e57d0ddf0ceb667d42e5902" -uuid = "276daf66-3868-5448-9aa4-cd146d93841b" -version = "1.3.0" - -[[StaticArrays]] -deps = ["LinearAlgebra", "Random", "Statistics"] -git-tree-sha1 = "2653e9c769343808781a8bd5010ee7a17c01152e" -uuid = "90137ffa-7385-5640-81b9-e52037218182" -version = "1.1.2" - -[[Statistics]] -deps = ["LinearAlgebra", "SparseArrays"] -uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" - -[[StatsBase]] -deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", 
"SortingAlgorithms", "SparseArrays", "Statistics"] -git-tree-sha1 = "4d8ca45223d7a28839e775d73a6f6b6b2ac64fd1" -uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" -version = "0.33.6" - -[[TOML]] -deps = ["Dates"] -uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" - -[[Tar]] -deps = ["ArgTools", "SHA"] -uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" - -[[Test]] -deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] -uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" - -[[TimerOutputs]] -deps = ["Printf"] -git-tree-sha1 = "32cdbe6cd2d214c25a0b88f985c9e0092877c236" -uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" -version = "0.5.8" - -[[TranscodingStreams]] -deps = ["Random", "Test"] -git-tree-sha1 = "7c53c35547de1c5b9d46a4797cf6d8253807108c" -uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" -version = "0.9.5" - -[[UUIDs]] -deps = ["Random", "SHA"] -uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" - -[[Unicode]] -uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" - -[[ZipFile]] -deps = ["Libdl", "Printf", "Zlib_jll"] -git-tree-sha1 = "c3a5637e27e914a7a445b8d0ad063d701931e9f7" -uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" -version = "0.9.3" - -[[Zlib_jll]] -deps = ["Libdl"] -uuid = "83775a58-1f1d-513f-b197-d71354ab007a" - -[[Zygote]] -deps = ["AbstractFFTs", "ChainRules", "ChainRulesCore", "DiffRules", "Distributed", "FillArrays", "ForwardDiff", "IRTools", "InteractiveUtils", "LinearAlgebra", "MacroTools", "NaNMath", "Random", "Requires", "SpecialFunctions", "Statistics", "ZygoteRules"] -git-tree-sha1 = "927209c83efa62256788a9880c191774c07c5b51" -uuid = "e88e6eb3-aa80-5325-afca-941959d7151f" -version = "0.6.10" - -[[ZygoteRules]] -deps = ["MacroTools"] -git-tree-sha1 = "9e7a1e8ca60b742e508a315c17eef5211e7fbfd7" -uuid = "700de1a5-db45-46bc-99cf-38207098b444" -version = "0.2.1" - -[[nghttp2_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" - -[[p7zip_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" diff --git a/lib/FluxCUDA/test/Project.toml b/lib/FluxCUDA/test/Project.toml index 8d9b755315..e5dd177881 100644 --- a/lib/FluxCUDA/test/Project.toml +++ b/lib/FluxCUDA/test/Project.toml @@ -1,6 +1,4 @@ [deps] -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" -Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/lib/FluxCUDA/test/cudnn.jl b/lib/FluxCUDA/test/cudnn.jl index 5d1727e984..bbb9d644b7 100644 --- a/lib/FluxCUDA/test/cudnn.jl +++ b/lib/FluxCUDA/test/cudnn.jl @@ -1,6 +1,3 @@ -using Flux, CUDA, Test -using Flux: pullback - @testset "CUDNN BatchNorm" begin @testset "4D Input" begin x = rand(Float32, 2, 2, 3, 4) diff --git a/lib/FluxCUDA/test/curnn.jl b/lib/FluxCUDA/test/curnn.jl index 63a5f93ada..750acf5b12 100644 --- a/lib/FluxCUDA/test/curnn.jl +++ b/lib/FluxCUDA/test/curnn.jl @@ -1,5 +1,3 @@ -using Flux, CUDA, Test - @testset for R in [RNN, GRU, LSTM] m = R(10, 5) |> gpu x = gpu(rand(10)) diff --git a/lib/FluxCUDA/test/losses.jl b/lib/FluxCUDA/test/losses.jl index 6ed5089f9e..60b4469df2 100644 --- a/lib/FluxCUDA/test/losses.jl +++ b/lib/FluxCUDA/test/losses.jl @@ -1,6 +1,6 @@ using Statistics -using Flux.Losses: crossentropy, binarycrossentropy, logitbinarycrossentropy, binary_focal_loss, focal_loss +using .Flux.Losses: crossentropy, binarycrossentropy, logitbinarycrossentropy, binary_focal_loss, focal_loss # XXX: duplicated from Flux' tests const ALL_LOSSES = [Flux.Losses.mse, 
Flux.Losses.mae, Flux.Losses.msle, diff --git a/lib/FluxCUDA/test/runtests.jl b/lib/FluxCUDA/test/runtests.jl index 40ba98e028..c9deebdff3 100644 --- a/lib/FluxCUDA/test/runtests.jl +++ b/lib/FluxCUDA/test/runtests.jl @@ -1,9 +1,12 @@ using Test -using CUDA +using FluxCUDA +using FluxCUDA: Flux, CUDA + +using .CUDA CUDA.allowscalar(false) -using Flux, FluxCUDA +using .Flux Flux.default_gpu_converter[] = cu using Zygote From f9d7124941501100bbad244157ef72a67243434b Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 29 Apr 2021 16:24:04 +0200 Subject: [PATCH 06/11] Use GPUArrays 6.3's global allowscalar toggle. --- lib/FluxCUDA/test/runtests.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/FluxCUDA/test/runtests.jl b/lib/FluxCUDA/test/runtests.jl index c9deebdff3..f92f8f8522 100644 --- a/lib/FluxCUDA/test/runtests.jl +++ b/lib/FluxCUDA/test/runtests.jl @@ -4,7 +4,9 @@ using FluxCUDA using FluxCUDA: Flux, CUDA using .CUDA -CUDA.allowscalar(false) +# XXX: allowscalar is currently not inherited by child tasks, so set it globally +#CUDA.allowscalar(false) +ENV["JULIA_GPU_ALLOWSCALAR"] = "false" using .Flux Flux.default_gpu_converter[] = cu From ed8e0028fdff7e7d40ed8fe93bdff69e50a5b383 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 4 May 2021 08:12:16 +0200 Subject: [PATCH 07/11] Add FluxAMDGPU.jl. --- .buildkite/pipeline.yml | 21 +++ lib/FluxAMDGPU/Project.toml | 13 ++ lib/FluxAMDGPU/src/FluxAMDGPU.jl | 13 ++ lib/FluxAMDGPU/test/Project.toml | 5 + lib/FluxAMDGPU/test/core.jl | 63 +++++++++ lib/FluxAMDGPU/test/layers.jl | 221 ++++++++++++++++++++++++++++++ lib/FluxAMDGPU/test/losses.jl | 38 +++++ lib/FluxAMDGPU/test/runtests.jl | 20 +++ lib/FluxAMDGPU/test/test_utils.jl | 72 ++++++++++ 9 files changed, 466 insertions(+) create mode 100644 lib/FluxAMDGPU/Project.toml create mode 100644 lib/FluxAMDGPU/src/FluxAMDGPU.jl create mode 100644 lib/FluxAMDGPU/test/Project.toml create mode 100644 lib/FluxAMDGPU/test/core.jl create mode 100644 lib/FluxAMDGPU/test/layers.jl create mode 100644 lib/FluxAMDGPU/test/losses.jl create mode 100644 lib/FluxAMDGPU/test/runtests.jl create mode 100644 lib/FluxAMDGPU/test/test_utils.jl diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index a11e983e00..02751f72b8 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -17,3 +17,24 @@ steps: cuda: "*" if: build.message !~ /\[skip tests\]/ timeout_in_minutes: 60 + + - label: "AMD support" + plugins: + - JuliaCI/julia#v1: + version: 1.6 + command: | + julia -e 'using Pkg; + + println("--- :julia: Instantiating project"); + Pkg.develop(PackageSpec(path=pwd())); + Pkg.develop(PackageSpec(path=joinpath(pwd(), "lib/FluxAMDGPU"))); + + println("+++ :julia: Running tests"); + Pkg.test("FluxAMDGPU"; coverage=true);' + soft_fail: + - exit_status: 1 + agents: + queue: "juliagpu" + rocm: "*" + if: build.message !~ /\[skip tests\]/ + timeout_in_minutes: 60 diff --git a/lib/FluxAMDGPU/Project.toml b/lib/FluxAMDGPU/Project.toml new file mode 100644 index 0000000000..193a8c42ee --- /dev/null +++ b/lib/FluxAMDGPU/Project.toml @@ -0,0 +1,13 @@ +name = "FluxAMDGPU" +uuid = "15448036-796b-45b3-936c-e3e32bc623ba" +authors = ["Julian P Samaroo "] +version = "0.1.0" + +[deps] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" +Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" + +[compat] +AMDGPU = "0.2" +Flux = "0.12" +julia = "1.6" diff --git a/lib/FluxAMDGPU/src/FluxAMDGPU.jl b/lib/FluxAMDGPU/src/FluxAMDGPU.jl new file mode 100644 index 0000000000..4e93d59d46 --- /dev/null +++ 
b/lib/FluxAMDGPU/src/FluxAMDGPU.jl @@ -0,0 +1,13 @@ +module FluxAMDGPU + +using Flux +using AMDGPU + +### onehot + +import Flux: OneHotArray, OneHotLike, _onehot_bool_type + +_onehot_bool_type(x::OneHotLike{<:Any, <:Any, <:Any, N, <:ROCArray}) where N = ROCArray{Bool, N} +Base.BroadcastStyle(::Type{<:OneHotArray{<:Any, <:Any, <:Any, N, <:ROCArray}}) where N = AMDGPU.ROCArrayStyle{N}() + +end # module diff --git a/lib/FluxAMDGPU/test/Project.toml b/lib/FluxAMDGPU/test/Project.toml new file mode 100644 index 0000000000..e5dd177881 --- /dev/null +++ b/lib/FluxAMDGPU/test/Project.toml @@ -0,0 +1,5 @@ +[deps] +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" diff --git a/lib/FluxAMDGPU/test/core.jl b/lib/FluxAMDGPU/test/core.jl new file mode 100644 index 0000000000..18ae76536d --- /dev/null +++ b/lib/FluxAMDGPU/test/core.jl @@ -0,0 +1,63 @@ +import .Flux: cpu, gpu +using Statistics: mean +using LinearAlgebra: I, cholesky, Cholesky + +@testset "Core" begin + x = randn(5, 5) + cx = gpu(x) + @test cx isa ROCArray + + @test Flux.onecold(gpu([1.0, 2.0, 3.0])) == 3 + + x = Flux.onehotbatch([1, 2, 3], 1:3) + cx = gpu(x) + @test cx isa Flux.OneHotMatrix && cx.indices isa ROCArray + @test (cx .+ 1) isa ROCArray + + m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax) + cm = gpu(m) + + @test all(p isa ROCArray for p in params(cm)) + @test cm(gpu(rand(10, 10))) isa ROCArray{Float32,2} + + xs = rand(5, 5) + ys = Flux.onehotbatch(1:5,1:5) + @test collect(roc(xs) .+ roc(ys)) ≈ collect(xs .+ ys) + + c = gpu(Conv((2,2),3=>4)) + x = gpu(rand(10, 10, 3, 2)) + l = c(gpu(rand(10,10,3,2))) + @test gradient(x -> sum(c(x)), x)[1] isa ROCArray + + c = gpu(CrossCor((2,2),3=>4)) + x = gpu(rand(10, 10, 3, 2)) + l = c(gpu(rand(10,10,3,2))) + @test gradient(x -> sum(c(x)), x)[1] isa ROCArray + +end + +@testset "onecold gpu" begin + y = Flux.onehotbatch(ones(3), 1:10) |> gpu; + l = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'] + @test Flux.onecold(y) isa ROCArray + @test y[3,:] isa ROCArray + @test Flux.onecold(y, l) == ['a', 'a', 'a'] +end + +@testset "restructure gpu" begin + dudt = Dense(1,1) |> gpu + p,re = Flux.destructure(dudt) + foo(x) = sum(re(p)(x)) + @test gradient(foo, roc(rand(1)))[1] isa ROCArray +end + +@testset "GPU functors" begin + @testset "Cholesky" begin + M = 2.0*I(10) |> collect + Q = cholesky(M) + Q_gpu = Q |> gpu + @test Q_gpu isa Cholesky{<:Any,<:ROCArray} + Q_cpu = Q_gpu |> cpu + @test Q_cpu == cholesky(eltype(Q_gpu).(M)) + end +end diff --git a/lib/FluxAMDGPU/test/layers.jl b/lib/FluxAMDGPU/test/layers.jl new file mode 100644 index 0000000000..da6e431c09 --- /dev/null +++ b/lib/FluxAMDGPU/test/layers.jl @@ -0,0 +1,221 @@ +# Test layers and data/model movements on and off the GPU +# Add tests for layers and their gradients on the GPU +# Most of the forward passes should be fine being applied +# to bitstype objects, but this gives higher coverage for our use-cases +# Check that getting the gradients does not throw + +# generic movement tests +@testset "Basic GPU Movement" begin + @test gradient(x -> sum(gpu(x)), rand(3,3)) isa Tuple + @test gradient(x -> sum(cpu(x)), gpu(rand(3,3))) isa Tuple +end + +# TODO: These layers get into scalar indexing +# `AlphaDropout` throws a compilation error on GPUs, +# whereas, the rest are scalar indexing issues. 
+const BROKEN_LAYERS = Union{DepthwiseConv, + AlphaDropout} + +function gpu_gradtest(name::String, layers::Vector, x_cpu, args...; + setmode=false, test_cpu=true, rtol=1e-5, atol=1e-5) + @testset "$name GPU grad tests" begin + for layer in layers + @testset "$layer GPU grad test" begin + l_cpu = layer(args...) + if l_cpu isa BROKEN_LAYERS + l_gpu, x_gpu = l_cpu |> gpu, x_cpu |> gpu + @test_broken gradient(() -> sum(l_gpu(x_gpu)), Flux.params(l_gpu)) isa Flux.Zygote.Grads + else + gpu_autodiff_test(l_cpu, x_cpu, + test_equal=test_cpu, rtol=rtol, atol=atol) + if setmode + testmode!(l_cpu) + gpu_autodiff_test(l_cpu, x_cpu, + test_equal=test_cpu, rtol=rtol, atol=atol) + end + end + end + end + end +end + + +# Just to give testset in gradtest meaningful labels +ConvNoBias(args...) = Conv(args...; bias=false) +ConvTransposeNoBias(args...) = ConvTranspose(args...; bias=false) +CrossCorNoBias(args...) = CrossCor(args...; bias=false) +DepthwiseConvNoBias(args...) = DepthwiseConv(args...; bias=false) +r = rand(Float32, 28, 28, 1, 1) +conv_layers = [Conv, ConvNoBias, ConvTranspose, ConvTransposeNoBias, CrossCor, CrossCorNoBias, DepthwiseConv, DepthwiseConvNoBias] +gpu_gradtest("Conv", conv_layers, r, (2,2), 1=>3) + +pooling_layers = [MaxPool, MeanPool] +gpu_gradtest("Pooling", pooling_layers, r, (2,2)) + +adaptive_pooling_layers = [AdaptiveMaxPool, AdaptiveMeanPool] +gpu_gradtest("AdaptivePooling", adaptive_pooling_layers, r, (7,7)) + +dropout_layers = [Dropout, AlphaDropout] +gpu_gradtest("Dropout", dropout_layers, r, 0.5f0; test_cpu=false, setmode=true) # dropout is not deterministic + +layer_norm = [i -> LayerNorm(i; affine=false), i -> LayerNorm(i; affine=true)] +gpu_gradtest("LayerNorm 1", layer_norm, rand(Float32, 8, 8, 3, 4), 8) +gpu_gradtest("LayerNorm 2", layer_norm, rand(Float32, 8, 8, 3, 4), (8,8)) +gpu_gradtest("LayerNorm 3", layer_norm, rand(Float32, 5, 4), 5) + +batch_norm = [BatchNorm] +gpu_gradtest("BatchNorm 3d", batch_norm, rand(Float32, 8, 8, 8, 3, 4), 3, setmode=false) # bug in CUDA.jl with gradient in testmode +gpu_gradtest("BatchNorm 2d", batch_norm, rand(Float32, 8, 8, 3, 4), 3, setmode=false) # bug in CUDA.jl with gradient in testmode +gpu_gradtest("BatchNorm 1d", batch_norm, rand(Float32, 8, 3, 4), 3, setmode=false) # bug in CUDA.jl with gradient in testmode +gpu_gradtest("BatchNorm fullyconn", batch_norm, rand(Float32, 5,4), 5, setmode=false) + +instancenorm = [i -> InstanceNorm(i; affine=false), i -> InstanceNorm(i; affine=true)] +gpu_gradtest("InstanceNorm 3d", instancenorm, rand(Float32, 8, 8, 8, 3, 4), 3, setmode=true) +gpu_gradtest("InstanceNorm 2d", instancenorm, rand(Float32, 8, 8, 3, 4), 3, setmode=true) +gpu_gradtest("InstanceNorm 1d", instancenorm, rand(Float32, 8, 3, 4), 3, setmode=true) + +groupnorm = [(i, j) -> GroupNorm(i, j; affine=false), (i, j) -> GroupNorm(i, j; affine=true)] +gpu_gradtest("GroupNorm 3d", groupnorm, rand(Float32, 8, 8, 8, 12, 4), 12, 3, setmode=true) +gpu_gradtest("GroupNorm 2d", groupnorm, rand(Float32, 8, 8, 12, 4), 12, 3, setmode=true) +gpu_gradtest("GroupNorm 1d", groupnorm, rand(Float32, 8, 3, 12, 4), 12, 3, setmode=true) + +upsample = [x -> Upsample(scale=x)] +gpu_gradtest("Upsample 2d", upsample, rand(Float32, 3, 4, 2, 3), (2,2)) +gpu_gradtest("Upsample 1d", upsample, rand(Float32, 3, 4, 2, 3), (2,)) + +pixelshuffle = [PixelShuffle] +gpu_gradtest("PixelShuffle 2d", pixelshuffle, rand(Float32, 3, 4, 18, 3), 3) +gpu_gradtest("PixelShuffle 1d", pixelshuffle, rand(Float32, 3, 18, 3), 3) + + +@testset "function layers" begin + x = 
rand(Float32, 3,3) + gpu_autodiff_test(x -> sum(Flux.normalise(x; dims=1)), x) + gpu_autodiff_test(x -> sum(Flux.normalise(x; dims=2)), x) + gpu_autodiff_test(x -> sum(Flux.normalise(x)), x) +end + +@testset "BatchNorm mix stuff" begin + m_cpu = BatchNorm(2) + m_gpu = m_cpu |> gpu + x_cpu = rand(Float32, 3, 2, 2) + x_gpu = x_cpu |> gpu + + ## In :auto mode, track statistics only in gradient contest + μ_cpu = copy(m_cpu.μ) + m_cpu(x_cpu) + @test m_cpu.μ ≈ μ_cpu + gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) + @test !(m_cpu.μ ≈ μ_cpu) + + μ_gpu = copy(m_gpu.μ) + m_gpu(x_gpu) + @test m_gpu.μ ≈ μ_gpu + gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) + @test !(m_gpu.μ ≈ μ_gpu) + + @test Array(m_gpu.μ) ≈ m_cpu.μ + + ## In testmode, never track statistics + testmode!(m_cpu) + μ_cpu = copy(m_cpu.μ) + m_cpu(x_cpu) + @test m_cpu.μ ≈ μ_cpu + gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) + @test m_cpu.μ ≈ μ_cpu + + testmode!(m_gpu) + μ_gpu = copy(m_gpu.μ) + m_gpu(x_gpu) + @test m_gpu.μ ≈ μ_gpu + gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) + @test m_gpu.μ ≈ μ_gpu + + ## In trainmode, always track statistics + trainmode!(m_cpu) + μ_cpu = copy(m_cpu.μ) + m_cpu(x_cpu) + @test !(m_cpu.μ ≈ μ_cpu) + μ_cpu = copy(m_cpu.μ) + gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) + @test !(m_cpu.μ ≈ μ_cpu) + + trainmode!(m_gpu) + μ_gpu = copy(m_gpu.μ) + m_gpu(x_gpu) + @test !(m_gpu.μ ≈ μ_gpu) + μ_gpu = copy(m_gpu.μ) + gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) + @test !(m_gpu.μ ≈ μ_gpu) + + ## No errors if input type mistmatch + x_cpu = rand(Float64, 3, 2, 2) + x_gpu = x_cpu |> gpu + m_cpu(x_cpu) + gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) + m_gpu(x_gpu) + gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) +end + +@testset "Zeros mapped for $cl" for cl in (Conv, ConvTranspose, CrossCor, DepthwiseConv) + l = cl((2,2), 1=>3, bias = false) |> gpu + ip = zeros(Float32, 28,28,1,1) |> gpu + if l isa BROKEN_LAYERS + @test_broken sum(l(ip)) ≈ 0.f0 + @test_broken gradient(() -> sum(l(ip)), Flux.params(l)) isa Flux.Zygote.Grads + else + @test sum(l(ip)) ≈ 0.f0 + gs = gradient(() -> sum(l(ip)), Flux.params(l)) + @test l.bias ∉ gs.params + end +end + +@testset "Dense with Zeros bias" begin + l = Dense(ones(Float32, 4,3), Flux.Zeros()) |> gpu + ip = zeros(Float32, 3, 7) |> gpu + + @test sum(l(ip)) ≈ 0.f0 + gs = gradient(() -> sum(l(ip)), Flux.params(l)) + @test l.b ∉ gs.params +end + +@testset "Two-streams Bilinear" begin + x = zeros(Float32,10,9) |> gpu + y = zeros(Float32,2,9) |> gpu + b = Flux.Bilinear(10, 2, 3) |> gpu + @test size(b(x,y)) == (3,9) + @test sum(abs2, b(x,y)) ≈ 0f0 + gs_gpu = gradient(() -> sum(abs2.(b(x, y))), params(b)) + b_cpu, x_cpu, y_cpu = b |> cpu, x |> cpu, y |> cpu + gs_cpu = gradient(() -> sum(abs2.(b_cpu(x_cpu, y_cpu))), params(b_cpu)) + for (pgpu, pcpu) in zip(params(b), params(b_cpu)) + @test gs_cpu[pcpu] ≈ Array(gs_gpu[pgpu]) + end +end + +@testset "Parallel" begin + @testset "zero sum" begin + input = randn(10, 10, 10, 10) |> gpu + layer_gpu = Parallel(+, zero, identity) |> gpu + @test layer_gpu(input) == input + @test layer_gpu(input) isa ROCArray + end + + @testset "vararg input" begin + inputs = (randn(10), randn(5), randn(4)) .|> gpu + layer = Parallel(+, Dense(10, 2), Dense(5, 2), Dense(4, 2)) |> gpu + @test size(layer(inputs)) == (2,) + end + + @testset "gradient" begin + input_cpu = randn(10, 10, 10, 10) + input_gpu = input_cpu |> gpu + layer_cpu = Parallel(+, x -> zero(x), identity) + layer_gpu = layer_cpu |> gpu + gs_cpu = 
gradient(() -> sum(abs2.(layer_cpu(input_cpu))), params(layer_cpu)) + gs_gpu = gradient(() -> sum(abs2.(layer_gpu(input_gpu))), params(layer_gpu)) + for (pgpu, pcpu) in zip(params(layer_cpu), params(layer_gpu)) + @test gs_cpu[pcpu] ≈ gs_gpu[pgpu] + end + end +end diff --git a/lib/FluxAMDGPU/test/losses.jl b/lib/FluxAMDGPU/test/losses.jl new file mode 100644 index 0000000000..981ddd1244 --- /dev/null +++ b/lib/FluxAMDGPU/test/losses.jl @@ -0,0 +1,38 @@ +using .Flux.Losses: crossentropy, binarycrossentropy, logitbinarycrossentropy, binary_focal_loss, focal_loss + + +@testset "Losses" begin + +x = [1.,2.,3.] +cx = gpu(x) +@test crossentropy(x,x) ≈ crossentropy(cx,cx) +@test crossentropy(x,x, agg=identity) ≈ crossentropy(cx,cx, agg=identity) |> cpu +@test crossentropy(x,x, agg=x->mean([1.0;2.0;3.0].*x)) ≈ crossentropy(cx,cx, agg=x->mean(gpu([1.0;2.0;3.0]).*x)) + +x = [-1.1491, 0.8619, 0.3127] +y = [1, 1, 0.] +@test binarycrossentropy(σ.(x), y) ≈ binarycrossentropy(gpu(σ.(x)), gpu(y)) +@test logitbinarycrossentropy(x, y) ≈ logitbinarycrossentropy(gpu(x), gpu(y)) + +x = [0.268941 0.5 0.268941 + 0.731059 0.5 0.731059] +y = [0 1 0 + 1 0 1] +@test binary_focal_loss(x, y) ≈ binary_focal_loss(gpu(x), gpu(y)) + +x = softmax(reshape(-7:7, 3, 5) .* 1f0) +y = [1 0 0 0 1 + 0 1 0 1 0 + 0 0 1 0 0] +@test focal_loss(x, y) ≈ focal_loss(gpu(x), gpu(y)) + +@testset "GPU grad tests" begin + x = rand(Float32, 3,3) + y = rand(Float32, 3,3) + + for loss in ALL_LOSSES + gpu_autodiff_test(loss, x, y) + end +end + +end #testset diff --git a/lib/FluxAMDGPU/test/runtests.jl b/lib/FluxAMDGPU/test/runtests.jl new file mode 100644 index 0000000000..7a030438d6 --- /dev/null +++ b/lib/FluxAMDGPU/test/runtests.jl @@ -0,0 +1,20 @@ +using Test + +using FluxAMDGPU +using FluxAMDGPU: Flux, AMDGPU + +using .AMDGPU +# XXX: allowscalar is currently not inherited by child tasks, so set it globally +#AMDGPU.allowscalar(false) +ENV["JULIA_GPU_ALLOWSCALAR"] = "false" + +using .Flux +Flux.default_gpu_converter[] = AMDGPU.roc + +using Zygote +using Zygote: pullback + +include("test_utils.jl") +include("core.jl") +include("losses.jl") +include("layers.jl") diff --git a/lib/FluxAMDGPU/test/test_utils.jl b/lib/FluxAMDGPU/test/test_utils.jl new file mode 100644 index 0000000000..f1b7040a4a --- /dev/null +++ b/lib/FluxAMDGPU/test/test_utils.jl @@ -0,0 +1,72 @@ +function check_grad(g_gpu, g_cpu, atol, rtol) + @show g_gpu g_cpu + @test false +end +check_grad(g_gpu::Base.RefValue, g_cpu::Base.RefValue, atol, rtol) = + check_grad(g_gpu[], g_cpu[], atol, rtol) +check_grad(g_gpu::Nothing, g_cpu::Nothing, atol, rtol) = @test true +check_grad(g_gpu::Float32, g_cpu::Float32, atol, rtol) = @test g_cpu ≈ g_gpu rtol=rtol atol=atol +check_grad(g_gpu::ROCArray{Float32}, g_cpu::Array{Float32}, atol, rtol) = + @test g_cpu ≈ collect(g_gpu) rtol=rtol atol=atol + +function check_grad(g_gpu::Tuple, g_cpu::Tuple, atol, rtol) + for (v1, v2) in zip(g_gpu, g_cpu) + check_grad(v1, v2, atol, rtol) + end +end + +function check_grad(g_gpu::NamedTuple, g_cpu::NamedTuple, atol, rtol) + for ((k1,v1), (k2,v2)) in zip(pairs(g_gpu), pairs(g_cpu)) + @test k1 == k2 + # @show k2 v2 + check_grad(v1, v2, atol, rtol) + end +end + +function gpu_autodiff_test(f_cpu, xs_cpu::Array{Float32}...; + test_equal=true, rtol=1e-4, atol=1e-4) + + check_type(x) = false + check_type(x::Float32) = true + check_type(x::ROCArray{Float32}) = true + check_type(x::Array{Float32}) = true + + ### GRADIENT WITH RESPECT TO INPUT ##### + # y_cpu, back_cpu = pullback((f, x...) -> f(x...), f_cpu, xs_cpu...) 
+ y_cpu, back_cpu = pullback((x...) -> f_cpu(x...), xs_cpu...) + @test check_type(y_cpu) + Δ_cpu = size(y_cpu) == () ? randn(Float32) : randn(Float32, size(y_cpu)) + gs_cpu = back_cpu(Δ_cpu) + + f_gpu = f_cpu |> gpu + xs_gpu = gpu.(xs_cpu) + Δ_gpu = Δ_cpu |> gpu + # y_gpu, back_gpu = pullback((f, x...) -> f(x...), f_gpu, xs_gpu...) + y_gpu, back_gpu = pullback((x...) -> f_gpu(x...), xs_gpu...) + @test check_type(y_gpu) + gs_gpu = back_gpu(Δ_gpu) + + if test_equal + @test collect(y_cpu) ≈ collect(y_gpu) rtol=rtol atol=atol + for (g_gpu, g_cpu) in zip(gs_gpu, gs_cpu) + check_grad(g_gpu, g_cpu, atol, rtol) + end + end + + ### GRADIENT WITH RESPECT TO f ##### + ps_cpu = Flux.params(f_cpu) + y_cpu, back_cpu = pullback(() -> f_cpu(xs_cpu...), ps_cpu) + gs_cpu = back_cpu(Δ_cpu) + + ps_gpu = Flux.params(f_gpu) + y_gpu, back_gpu = pullback(() -> f_gpu(xs_gpu...), ps_gpu) + gs_gpu = back_gpu(Δ_gpu) + + if test_equal + @test collect(y_cpu) ≈ collect(y_gpu) rtol=rtol atol=atol + @assert length(ps_gpu) == length(ps_cpu) + for (p_gpu, p_cpu) in zip(ps_gpu, ps_cpu) + check_grad(gs_gpu[p_gpu], gs_cpu[p_cpu], atol, rtol) + end + end +end From 2eaffb31bae098d215dcb96a99bbab008bda6f4c Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 4 May 2021 08:48:40 +0200 Subject: [PATCH 08/11] Ignore subproject manifests. --- lib/FluxAMDGPU/.gitignore | 1 + lib/FluxCUDA/.gitignore | 1 + 2 files changed, 2 insertions(+) create mode 100644 lib/FluxAMDGPU/.gitignore create mode 100644 lib/FluxCUDA/.gitignore diff --git a/lib/FluxAMDGPU/.gitignore b/lib/FluxAMDGPU/.gitignore new file mode 100644 index 0000000000..ba39cc531e --- /dev/null +++ b/lib/FluxAMDGPU/.gitignore @@ -0,0 +1 @@ +Manifest.toml diff --git a/lib/FluxCUDA/.gitignore b/lib/FluxCUDA/.gitignore new file mode 100644 index 0000000000..ba39cc531e --- /dev/null +++ b/lib/FluxCUDA/.gitignore @@ -0,0 +1 @@ +Manifest.toml From ee9efbcc1890c92dbdc7b8dbb738c744b23ad522 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 4 May 2021 12:49:15 +0200 Subject: [PATCH 09/11] Remove _onehot_bool_type. 
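With the backend-specific helper gone, plain `Bool.(x)` does the same job: broadcasting picks its output container from the array's BroadcastStyle, so each glue package only needs the one `Base.BroadcastStyle` method it already defines for `OneHotArray`. A minimal CPU-only sketch of the idea (illustrative values, not taken from the test suite):

    using Flux

    oh = Flux.onehotbatch([1, 2, 3], 1:3)  # OneHotMatrix over a plain index vector
    b  = Bool.(oh)                         # broadcast materialises it; on the CPU this is a BitMatrix
    @assert b == Matrix(oh)

    # On a GPU the same `Bool.(oh)` dispatches through the glue package's
    # BroadcastStyle method instead, e.g. giving a CuArray{Bool} once FluxCUDA
    # is loaded, so no per-backend `_onehot_bool_type` lookup is needed.

This is also why the CPU test below now expects `BitMatrix` from `vcat` rather than `Array{Bool}`: a Bool-eltype broadcast on the CPU collects into a `BitArray`.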
--- lib/FluxAMDGPU/src/FluxAMDGPU.jl | 6 +++--- lib/FluxCUDA/src/onehot.jl | 7 +++---- src/onehot.jl | 6 ++---- test/onehot.jl | 2 +- 4 files changed, 9 insertions(+), 12 deletions(-) diff --git a/lib/FluxAMDGPU/src/FluxAMDGPU.jl b/lib/FluxAMDGPU/src/FluxAMDGPU.jl index 4e93d59d46..844ba954af 100644 --- a/lib/FluxAMDGPU/src/FluxAMDGPU.jl +++ b/lib/FluxAMDGPU/src/FluxAMDGPU.jl @@ -5,9 +5,9 @@ using AMDGPU ### onehot -import Flux: OneHotArray, OneHotLike, _onehot_bool_type +using Flux: OneHotArray, OneHotLike -_onehot_bool_type(x::OneHotLike{<:Any, <:Any, <:Any, N, <:ROCArray}) where N = ROCArray{Bool, N} -Base.BroadcastStyle(::Type{<:OneHotArray{<:Any, <:Any, <:Any, N, <:ROCArray}}) where N = AMDGPU.ROCArrayStyle{N}() +Base.BroadcastStyle(::Type{<:OneHotArray{<:Any, <:Any, <:Any, N, <:ROCArray}}) where N = + AMDGPU.ROCArrayStyle{N}() end # module diff --git a/lib/FluxCUDA/src/onehot.jl b/lib/FluxCUDA/src/onehot.jl index 760df1d27e..037991d053 100644 --- a/lib/FluxCUDA/src/onehot.jl +++ b/lib/FluxCUDA/src/onehot.jl @@ -1,5 +1,4 @@ -import Flux: OneHotArray, OneHotLike, _onehot_bool_type +using Flux: OneHotArray, OneHotLike -_onehot_bool_type(x::OneHotLike{<:Any, <:Any, <:Any, N, <:CuArray}) where N = CuArray{Bool, N} - -Base.BroadcastStyle(::Type{<:OneHotArray{<: Any, <: Any, <: Any, N, <: CuArray}}) where N = CUDA.CuArrayStyle{N}() +Base.BroadcastStyle(::Type{<:OneHotArray{<: Any, <: Any, <: Any, N, <: CuArray}}) where N = + CUDA.CuArrayStyle{N}() diff --git a/src/onehot.jl b/src/onehot.jl index edc5fc5f8b..72b28ca4e7 100644 --- a/src/onehot.jl +++ b/src/onehot.jl @@ -60,11 +60,9 @@ Base.getindex(x::OneHotArray{<:Any, L}, ::Colon, I...) where L = OneHotArray(x.i Base.getindex(x::OneHotArray{<:Any, <:Any, <:Any, N}, ::Vararg{Colon, N}) where N = x Base.getindex(x::OneHotArray, I::CartesianIndex{N}) where N = x[I[1], Tuple(I)[2:N]...] -_onehot_bool_type(x::OneHotLike{<:Any, <:Any, <:Any, N, <:Union{Integer, AbstractArray}}) where N = Array{Bool, N} - function Base.cat(x::OneHotLike{<:Any, L}, xs::OneHotLike{<:Any, L}...; dims::Int) where L if isone(dims) || any(x -> !_isonehot(x), (x, xs...)) - return cat(map(x -> convert(_onehot_bool_type(x), x), (x, xs...))...; dims = dims) + return cat(map(x -> Bool.(x), (x, xs...))...; dims = dims) else return OneHotArray(cat(_indices(x), _indices.(xs)...; dims = dims - 1), L) end @@ -164,7 +162,7 @@ function _fast_argmax(x::OneHotLike) if _isonehot(x) return _indices(x) else - return _fast_argmax(convert(_onehot_bool_type(x), x)) + return _fast_argmax(Bool.(x)) end end diff --git a/test/onehot.jl b/test/onehot.jl index ce30534ec9..60522c5e8a 100644 --- a/test/onehot.jl +++ b/test/onehot.jl @@ -99,7 +99,7 @@ end @testset "w/ cat" begin r = reshape(oa, 10, :) @test hcat(r, r) isa OneHotArray - @test vcat(r, r) isa Array{Bool} + @test vcat(r, r) isa BitMatrix end @testset "w/ argmax" begin From 49f94d0bca1eec599e5dac5a206e71bfcebcddcf Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 4 May 2021 12:54:18 +0200 Subject: [PATCH 10/11] Automatically register GPU converters upon loading the glue package. 
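The user-visible effect is that loading a glue package is all it takes for `gpu` to target that back-end: the first package loaded claims `Flux.default_gpu_converter`, and any later one only warns instead of overriding it. A sketch of the intended workflow (assumes a CUDA-capable machine with FluxCUDA dev'ed next to Flux, as in the Buildkite pipeline):

    using Flux, FluxCUDA                      # __init__ registers `cu` as the default converter
    @assert Flux.default_gpu_converter[] !== identity

    m = Chain(Dense(10, 5, tanh), Dense(5, 2)) |> gpu   # lands on the GPU with no manual setup
    x = gpu(rand(Float32, 10, 3))
    y = m(x)                                            # a CuArray, given the assumption above

    # Loading FluxAMDGPU afterwards would leave the CUDA converter in place and
    # emit the "another one has been registered already" warning.

This is also why the test harnesses now `@assert` the converter instead of assigning it.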
--- lib/FluxAMDGPU/src/FluxAMDGPU.jl | 9 +++++++++ lib/FluxAMDGPU/test/runtests.jl | 2 +- lib/FluxCUDA/src/FluxCUDA.jl | 8 ++++++++ lib/FluxCUDA/test/runtests.jl | 2 +- 4 files changed, 19 insertions(+), 2 deletions(-) diff --git a/lib/FluxAMDGPU/src/FluxAMDGPU.jl b/lib/FluxAMDGPU/src/FluxAMDGPU.jl index 844ba954af..acffd3b4c8 100644 --- a/lib/FluxAMDGPU/src/FluxAMDGPU.jl +++ b/lib/FluxAMDGPU/src/FluxAMDGPU.jl @@ -10,4 +10,13 @@ using Flux: OneHotArray, OneHotLike Base.BroadcastStyle(::Type{<:OneHotArray{<:Any, <:Any, <:Any, N, <:ROCArray}}) where N = AMDGPU.ROCArrayStyle{N}() +function __init__() + if Flux.default_gpu_converter[] === identity + @info "Registering AMDGPU.jl as the default GPU converter" + Flux.default_gpu_converter[] = roc + else + @warn "Not registering AMDGPU.jl as the default GPU converter as another one has been registered already." + end +end + end # module diff --git a/lib/FluxAMDGPU/test/runtests.jl b/lib/FluxAMDGPU/test/runtests.jl index 7a030438d6..8b6c3a65f8 100644 --- a/lib/FluxAMDGPU/test/runtests.jl +++ b/lib/FluxAMDGPU/test/runtests.jl @@ -9,7 +9,7 @@ using .AMDGPU ENV["JULIA_GPU_ALLOWSCALAR"] = "false" using .Flux -Flux.default_gpu_converter[] = AMDGPU.roc +@assert Flux.default_gpu_converter[] == roc using Zygote using Zygote: pullback diff --git a/lib/FluxCUDA/src/FluxCUDA.jl b/lib/FluxCUDA/src/FluxCUDA.jl index 75de37eb34..2936118bff 100644 --- a/lib/FluxCUDA/src/FluxCUDA.jl +++ b/lib/FluxCUDA/src/FluxCUDA.jl @@ -9,5 +9,13 @@ include("onehot.jl") include("ctc.jl") include("cudnn.jl") +function __init__() + if Flux.default_gpu_converter[] === identity + @info "Registering CUDA.jl as the default GPU converter" + Flux.default_gpu_converter[] = cu + else + @warn "Not registering CUDA.jl as the default GPU converter as another one has been registered already." + end +end end # module diff --git a/lib/FluxCUDA/test/runtests.jl b/lib/FluxCUDA/test/runtests.jl index f92f8f8522..233ba77df4 100644 --- a/lib/FluxCUDA/test/runtests.jl +++ b/lib/FluxCUDA/test/runtests.jl @@ -9,7 +9,7 @@ using .CUDA ENV["JULIA_GPU_ALLOWSCALAR"] = "false" using .Flux -Flux.default_gpu_converter[] = cu +@assert Flux.default_gpu_converter[] == cu using Zygote using Zygote: pullback From b615dca0979ae8a74b7cbbfb2e7ccbc04df94073 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 25 Jun 2021 08:09:15 +0200 Subject: [PATCH 11/11] Add oneAPI back-end. 
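Note: the oneAPI glue registers a small adapt-based closure rather than a named helper like cu or roc. Roughly what that closure does, as a sketch (togpu is a hypothetical name used only for illustration, and a supported Intel GPU is assumed):

    using Adapt, oneAPI

    togpu(x) = adapt(oneArray, x)      # the same conversion the registered closure performs
    togpu(rand(Float32, 4, 4))         # expected: a 4x4 oneArray{Float32}

As the TODO in the code notes, the Zygote broadcast adjoint below is a candidate for generalization to all GPU arrays in Zygote.jl itself.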
--- lib/FluxOneAPI/Project.toml | 10 +++++++ lib/FluxOneAPI/src/FluxOneAPI.jl | 46 ++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 lib/FluxOneAPI/Project.toml create mode 100644 lib/FluxOneAPI/src/FluxOneAPI.jl diff --git a/lib/FluxOneAPI/Project.toml b/lib/FluxOneAPI/Project.toml new file mode 100644 index 0000000000..d290aee506 --- /dev/null +++ b/lib/FluxOneAPI/Project.toml @@ -0,0 +1,10 @@ +name = "FluxOneAPI" +uuid = "71a558b2-ada5-41b7-bb4b-aca4e46ab144" +authors = ["Tim Besard "] +version = "0.1.0" + +[deps] +Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" +Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" +oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" diff --git a/lib/FluxOneAPI/src/FluxOneAPI.jl b/lib/FluxOneAPI/src/FluxOneAPI.jl new file mode 100644 index 0000000000..5fa50630b2 --- /dev/null +++ b/lib/FluxOneAPI/src/FluxOneAPI.jl @@ -0,0 +1,46 @@ +module FluxOneAPI + +using Flux +using oneAPI +using Adapt +using Zygote +using Zygote: @adjoint + +### onehot + +using Flux: OneHotArray, OneHotLike + +Base.BroadcastStyle(::Type{<:OneHotArray{<:Any, <:Any, <:Any, N, <:oneArray}}) where N = + oneAPI.oneArrayStyle{N}() + +## zygote + +# TODO: generalize to GPUArray in Zygote.jl? + +@eval @adjoint function Base.broadcasted(::oneAPI.oneArrayStyle, f, args...) + y, back = Zygote.broadcast_forward(f, args...) + y, ȳ -> (nothing, nothing, back(ȳ)...) +end + +@adjoint oneArray{N,T}(xs::Array) where {N,T} = + oneArray{N,T}(xs), Δ -> (convert(Array, Δ), ) + +@adjoint function sum(xs::oneArray; dims = :) + placeholder = similar(xs) + sum(xs, dims = dims), Δ -> (placeholder .= Δ,) +end + +@adjoint function Base.convert(::Type{T}, xs::Array) where {T<:oneArray} + Base.convert(T, xs), Δ -> (nothing, Base.convert(Array, Δ),) +end + +function __init__() + if Flux.default_gpu_converter[] === identity + @info "Registering oneAPI.jl as the default GPU converter" + Flux.default_gpu_converter[] = (x)->adapt(oneArray, x) + else + @warn "Not registering oneAPI.jl as the default GPU converter as another one has been registered already." + end +end + +end # module
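With all three glue packages in place, back-end selection is purely a question of which one gets loaded first. A hedged end-to-end sketch (assumes exactly one of the lib/Flux* subprojects is installed with a working GPU stack; model, data and loss are illustrative, not taken from the test suite):

    using Flux
    using FluxOneAPI               # or FluxCUDA / FluxAMDGPU; whichever loads first is used

    model = Chain(Dense(784, 32, relu), Dense(32, 10)) |> gpu
    x = gpu(rand(Float32, 784, 64))
    y = gpu(rand(Float32, 10, 64))

    ps = Flux.params(model)
    gs = gradient(() -> Flux.Losses.mse(model(x), y), ps)   # gradients live on the same device as the parameters

Swapping back-ends requires no change to model code, only to which glue package the project depends on.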