From 3261517b68956fefc391790b8e59e5158dbcb6ce Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacomputing.com>
Date: Thu, 15 Apr 2021 02:58:06 +0530
Subject: [PATCH 01/14] rm NNlibCUDA

---
 lib/NNlibCUDA/.gitignore               |   1 -
 lib/NNlibCUDA/LICENSE.md               |  22 ----
 lib/NNlibCUDA/Project.toml             |  23 ----
 lib/NNlibCUDA/README.md                |   5 -
 lib/NNlibCUDA/src/NNlibCUDA.jl         |  16 ---
 lib/NNlibCUDA/src/batchedmul.jl        |   7 --
 lib/NNlibCUDA/src/cudnn/activations.jl |  39 ------
 lib/NNlibCUDA/src/cudnn/batchnorm.jl   | 124 -------------------
 lib/NNlibCUDA/src/cudnn/conv.jl        | 127 --------------------
 lib/NNlibCUDA/src/cudnn/cudnn.jl       |  13 --
 lib/NNlibCUDA/src/cudnn/pooling.jl     |  56 ---------
 lib/NNlibCUDA/src/cudnn/softmax.jl     | 102 ----------------
 lib/NNlibCUDA/src/upsample.jl          | 157 -------------------------
 lib/NNlibCUDA/test/activations.jl      |  32 -----
 lib/NNlibCUDA/test/batchedmul.jl       |  56 ---------
 lib/NNlibCUDA/test/batchnorm.jl        |   8 --
 lib/NNlibCUDA/test/conv.jl             |  60 ----------
 lib/NNlibCUDA/test/pooling.jl          |  25 ----
 lib/NNlibCUDA/test/runtests.jl         |  21 ----
 lib/NNlibCUDA/test/softmax.jl          |  12 --
 lib/NNlibCUDA/test/test_utils.jl       |  20 ----
 lib/NNlibCUDA/test/upsample.jl         |  27 -----
 22 files changed, 953 deletions(-)
 delete mode 100644 lib/NNlibCUDA/.gitignore
 delete mode 100644 lib/NNlibCUDA/LICENSE.md
 delete mode 100644 lib/NNlibCUDA/Project.toml
 delete mode 100644 lib/NNlibCUDA/README.md
 delete mode 100644 lib/NNlibCUDA/src/NNlibCUDA.jl
 delete mode 100644 lib/NNlibCUDA/src/batchedmul.jl
 delete mode 100644 lib/NNlibCUDA/src/cudnn/activations.jl
 delete mode 100644 lib/NNlibCUDA/src/cudnn/batchnorm.jl
 delete mode 100644 lib/NNlibCUDA/src/cudnn/conv.jl
 delete mode 100644 lib/NNlibCUDA/src/cudnn/cudnn.jl
 delete mode 100644 lib/NNlibCUDA/src/cudnn/pooling.jl
 delete mode 100644 lib/NNlibCUDA/src/cudnn/softmax.jl
 delete mode 100644 lib/NNlibCUDA/src/upsample.jl
 delete mode 100644 lib/NNlibCUDA/test/activations.jl
 delete mode 100644 lib/NNlibCUDA/test/batchedmul.jl
 delete mode 100644 lib/NNlibCUDA/test/batchnorm.jl
 delete mode 100644 lib/NNlibCUDA/test/conv.jl
 delete mode 100644 lib/NNlibCUDA/test/pooling.jl
 delete mode 100644 lib/NNlibCUDA/test/runtests.jl
 delete mode 100644 lib/NNlibCUDA/test/softmax.jl
 delete mode 100644 lib/NNlibCUDA/test/test_utils.jl
 delete mode 100644 lib/NNlibCUDA/test/upsample.jl

diff --git a/lib/NNlibCUDA/.gitignore b/lib/NNlibCUDA/.gitignore
deleted file mode 100644
index c177dd3c6..000000000
--- a/lib/NNlibCUDA/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-#Manifest.toml
diff --git a/lib/NNlibCUDA/LICENSE.md b/lib/NNlibCUDA/LICENSE.md
deleted file mode 100644
index 824b06f8b..000000000
--- a/lib/NNlibCUDA/LICENSE.md
+++ /dev/null
@@ -1,22 +0,0 @@
-The NNlib.jl package is licensed under the MIT "Expat" License:
-
-> Copyright (c) 2017-19: Julia Computing, Inc., Mike J Innes, and Contributors
-> 
-> Permission is hereby granted, free of charge, to any person obtaining a copy
-> of this software and associated documentation files (the "Software"), to deal
-> in the Software without restriction, including without limitation the rights
-> to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-> copies of the Software, and to permit persons to whom the Software is
-> furnished to do so, subject to the following conditions:
-> 
-> The above copyright notice and this permission notice shall be included in all
-> copies or substantial portions of the Software.
-> 
-> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-> IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-> FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-> AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-> LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-> OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-> SOFTWARE.
-> 
diff --git a/lib/NNlibCUDA/Project.toml b/lib/NNlibCUDA/Project.toml
deleted file mode 100644
index a335e9544..000000000
--- a/lib/NNlibCUDA/Project.toml
+++ /dev/null
@@ -1,23 +0,0 @@
-name = "NNlibCUDA"
-uuid = "a00861dc-f156-4864-bf3c-e6376f28a68d"
-version = "0.1.0"
-
-[deps]
-CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
-LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
-Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
-Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
-
-[compat]
-CUDA = "3.0"
-NNlib = "0.7"
-julia = "1.6"
-
-[extras]
-ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
-Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
-Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
-
-[targets]
-test = ["ForwardDiff", "Test", "Zygote"]
diff --git a/lib/NNlibCUDA/README.md b/lib/NNlibCUDA/README.md
deleted file mode 100644
index 17703ac97..000000000
--- a/lib/NNlibCUDA/README.md
+++ /dev/null
@@ -1,5 +0,0 @@
-# NNlibCUDA
-
-CUDA compatibility for NNlib.jl.
-
-Julia gpu kernels are in `src/`, while wrappers around `cudnn` are in `src/cudnn/`.
diff --git a/lib/NNlibCUDA/src/NNlibCUDA.jl b/lib/NNlibCUDA/src/NNlibCUDA.jl
deleted file mode 100644
index 48c11dfa3..000000000
--- a/lib/NNlibCUDA/src/NNlibCUDA.jl
+++ /dev/null
@@ -1,16 +0,0 @@
-module NNlibCUDA
-
-using NNlib
-using CUDA
-using Random, Statistics
-
-include("upsample.jl")
-include("batchedmul.jl")
-include("cudnn/cudnn.jl")
-include("cudnn/conv.jl")
-include("cudnn/pooling.jl")
-include("cudnn/softmax.jl")
-include("cudnn/activations.jl")
-include("cudnn/batchnorm.jl")
-
-end # module
diff --git a/lib/NNlibCUDA/src/batchedmul.jl b/lib/NNlibCUDA/src/batchedmul.jl
deleted file mode 100644
index 471cac00e..000000000
--- a/lib/NNlibCUDA/src/batchedmul.jl
+++ /dev/null
@@ -1,7 +0,0 @@
-# Batched matrix multiplication
-# 1st argument is produced by NNlib.storage_type(A)
-NNlib._batched_gemm!(::Type{<:CuArray}, transA::Char, transB::Char, α::Number, A, B, β::Number, C) =
-     CUBLAS.gemm_strided_batched!(transA, transB, α, A, B, β, C)
-
-Base.unsafe_convert(::Type{CuPtr{T}}, A::NNlib.BatchedAdjOrTrans{T}) where {T} =
-    Base.unsafe_convert(CuPtr{T}, parent(A))
diff --git a/lib/NNlibCUDA/src/cudnn/activations.jl b/lib/NNlibCUDA/src/cudnn/activations.jl
deleted file mode 100644
index d6df03803..000000000
--- a/lib/NNlibCUDA/src/cudnn/activations.jl
+++ /dev/null
@@ -1,39 +0,0 @@
-
-# Activation
-
-using Base.Broadcast
-using CUDA.CUDNN: cudnnActivationForward!, cudnnOpTensor!,
-            CUDNN_ACTIVATION_TANH,CUDNN_ACTIVATION_SIGMOID,CUDNN_ACTIVATION_ELU,
-            CUDNN_ACTIVATION_RELU,CUDNN_ACTIVATION_CLIPPED_RELU,CUDNN_OP_TENSOR_MAX
-
-for (f, op) in [
-    CUDA.tanh       => (src,dst)->cudnnActivationForward!(dst, src, mode=CUDNN_ACTIVATION_TANH),
-    NNlib.σ         => (src,dst)->cudnnActivationForward!(dst, src, mode=CUDNN_ACTIVATION_SIGMOID),
-    NNlib.elu       => (src,dst)->cudnnActivationForward!(dst, src, mode=CUDNN_ACTIVATION_ELU),
-    NNlib.relu      => (src,dst)->cudnnActivationForward!(dst, src, mode=CUDNN_ACTIVATION_RELU),
-    # NNlib.relu6     => (src,dst)->cudnnActivationForward!(dst, src, mode=CUDNN_ACTIVATION_CLIPPED_RELU, coef=6.0),
-    # NNlib.leakyrelu => (src,dst)->cudnnOpTensor!(dst, src, src; op=CUDNN_OP_TENSOR_MAX, alpha1=0.01),
-    ]
-    
-    @eval begin
-        # in-place
-        function Base.materialize!(dst::DenseCuArray{<:CUDNNFloat},
-                                   bc::Broadcast.Broadcasted{<:Any,<:Any,typeof($f),<:Tuple{DenseCuArray}})
-            $op(bc.args[1], dst)
-            return dst
-        end
-
-        # out of place
-        function Base.materialize(bc::Broadcast.Broadcasted{<:Any,<:Any,typeof($f),<:Tuple{DenseCuArray}})
-            ElType = Broadcast.combine_eltypes(bc.f, bc.args)
-            dst = similar(bc, ElType)
-            $op(bc.args[1], dst)
-            return dst
-        end
-    end
-end
-
-# CUDNN_ACTIVATION_IDENTITY does not work with cudnnActivationForward
-# FIXME: put this optimization in GPUArrays' `copyto!` (like Base.Broadcast's `copyto!`)
-Base.broadcasted(::typeof(identity), x::DenseCuArray{T}) where {T<:CUDNNFloat} = x
-
diff --git a/lib/NNlibCUDA/src/cudnn/batchnorm.jl b/lib/NNlibCUDA/src/cudnn/batchnorm.jl
deleted file mode 100644
index eb742c2fb..000000000
--- a/lib/NNlibCUDA/src/cudnn/batchnorm.jl
+++ /dev/null
@@ -1,124 +0,0 @@
-using CUDA.CUDNN: CUDNN_BN_MIN_EPSILON, cudnnBatchNormalizationBackward,
-                  cudnnBatchNormalizationForwardInference, CUDNN_BATCHNORM_SPATIAL,
-                  cudnnBatchNormalizationForwardTraining
-
-
-# TODO: replace with new cudnn normalization interface
-# https://github.com/JuliaGPU/CUDA.jl/blob/master/lib/cudnn/normalization.jl
-
-mutable struct BNCache
-  mean
-  ivar
-end
-
-BNCache() = BNCache(nothing, nothing)
-
-@inline _wsize(y) = (fill(1, ndims(y)-2)..., size(y)[end-1], 1)
-
-# NOTE: CuDNN supports only 4D and 5D Tensors for BatchNorm Operations
-# so reshape a 2D Tensor into 4D
-batchnorm(g::DenseCuArray{T}, b::DenseCuArray{T}, x::DenseCuArray{T,2},
-          running_mean::DenseCuArray{T}, running_var::DenseCuArray{T}, momentum;
-          cache = nothing, alpha = T(1), beta = T(0),
-          eps = T(1e-5), training = true) where T<:Union{Float32, Float64} =
-  dropdims(batchnorm(g, b, reshape(x, 1, 1, size(x, 1), size(x, 2)), running_mean, running_var, momentum,
-            cache = cache, alpha = alpha, beta = beta, eps = eps, training = training), dims = (1, 2))
-
-function batchnorm(g::DenseCuArray{T}, b::DenseCuArray{T}, x::Union{DenseCuArray{T,4},DenseCuArray{T,5}},
-                    running_mean::DenseCuArray{T}, running_var::DenseCuArray{T}, momentum;
-                    cache = nothing, alpha = T(1), beta = T(0),
-                    eps = T(1e-5), training = true) where T<:Union{Float32, Float64}
-  cudnnBNForward!(similar(x), g, b, x, running_mean, running_var, momentum, cache = cache,
-      alpha = alpha, beta = beta, eps = eps, training = training)
-end
-
-function cudnnBNForward!(y::DenseCuArray{T}, g::DenseCuArray{T}, b::DenseCuArray{T}, x::DenseCuArray{T},
-                        running_mean::DenseCuArray{T}, running_var::DenseCuArray{T},
-                        momentum; cache = nothing,
-                        alpha = T(1), beta = T(0),
-                        eps = T(1e-5), training = true) where T<:Union{Float32, Float64}
-  dims = _wsize(x)
-  if eps < CUDNN_BN_MIN_EPSILON
-    # warn("eps ",eps," is too small for CuDNN so eps has been assigned the value ", CUDNN_BN_MIN_EPSILON)
-    eps = CUDNN_BN_MIN_EPSILON
-  end
-  xd = cudnnTensorDescriptor(x)
-  yd = cudnnTensorDescriptor(y)
-  gd = cudnnTensorDescriptor(CUDNN_TENSOR_NCHW, cudnnDataType(T), Cint(length(dims)), dim4(dims,Val(CUDNN_TENSOR_NCHW)))
-
-  if training
-
-    if cache !== nothing
-      mean = zeros(CuArray{T}, dims...)
-      ivar = ones(CuArray{T}, dims...)
-    else
-      mean = CU_NULL
-      ivar = CU_NULL
-    end
-
-    cudnnBatchNormalizationForwardTraining(handle(), CUDNN_BATCHNORM_SPATIAL, scalingParameter(T, alpha), scalingParameter(T, beta), xd, x, yd, y, gd, g, b, momentum, running_mean, running_var, eps, mean, ivar)
-
-    if cache !== nothing
-      cache.mean = mean
-      cache.ivar = ivar
-    end
-  else
-    cudnnBatchNormalizationForwardInference(handle(), CUDNN_BATCHNORM_SPATIAL, scalingParameter(T, alpha), scalingParameter(T, beta), xd, x, yd, y, gd, g, b, running_mean, running_var, eps)
-  end
-  return y
-end
-
-function ∇batchnorm(g::DenseCuArray{T}, b::DenseCuArray{T}, x::DenseCuArray{T, 2}, dy::DenseCuArray{T, 2},
-            running_mean::DenseCuArray{T}, running_var::DenseCuArray{T}, momentum;
-            cache = nothing, eps = T(1e-5), alpha = T(1),
-            beta = T(0), training = true) where T<:Union{Float32, Float64}
-  dg, db, dx = ∇batchnorm(g, b, reshape(x, 1, 1, size(x, 1), size(x, 2)), reshape(dy, 1, 1, size(dy, 1),
-                          size(dy, 2)), running_mean, running_var, momentum, cache = cache, eps = eps,
-                          alpha = alpha, beta = beta, training = training)
-  (dg, db, dropdims(dx, dims = (1, 2)))
-end
-
-function ∇batchnorm(g::DenseCuArray{T}, b::DenseCuArray{T}, x::DenseCuArray{T}, dy::DenseCuArray{T},
-                    running_mean::DenseCuArray{T}, running_var::DenseCuArray{T}, momentum;
-                    cache = nothing, eps = T(1e-5), alpha = T(1),
-                    beta = T(0), training = true) where T<:Union{Float32, Float64}
-  dg = similar(g)
-  db = similar(b)
-  dx = similar(x)
-  cudnnBNBackward!(dg, g, db, dx, x, dy, running_mean, running_var, T(momentum),
-    training = training, cache = cache, eps = eps, alpha = alpha, beta = beta)
-  (dg, db, dx)
-end
-
-function cudnnBNBackward!(dg::DenseCuArray{T}, g::DenseCuArray{T}, db::DenseCuArray{T},
-                          dx::DenseCuArray{T}, x::DenseCuArray{T}, dy::DenseCuArray{T},
-                          running_mean::DenseCuArray{T}, running_var::DenseCuArray{T},
-                          momentum; cache = nothing, eps = T(1e-5),
-                          alpha = T(1), beta = T(0),
-                          dalpha = T(1), dbeta = T(0), training = true) where T<:Union{Float32, Float64}
-  if training
-    xd = cudnnTensorDescriptor(x)
-    dyd = cudnnTensorDescriptor(dy)
-    dxd = cudnnTensorDescriptor(dx)
-    gd = cudnnTensorDescriptor(CUDNN_TENSOR_NCHW, cudnnDataType(T), Cint(length(_wsize(x))), dim4(_wsize(x),Val(CUDNN_TENSOR_NCHW)))
-    if cache !== nothing
-      mean, ivar = cache.mean, cache.ivar
-      info("mean and ivar are fetched from the cache")
-    else
-      mean, ivar = CU_NULL, CU_NULL
-    end
-
-    if eps < CUDNN_BN_MIN_EPSILON
-      eps = CUDNN_BN_MIN_EPSILON
-    end
-
-    cudnnBatchNormalizationBackward(handle(), CUDNN_BATCHNORM_SPATIAL, scalingParameter(T, alpha), scalingParameter(T, beta), scalingParameter(T, dalpha), scalingParameter(T, dbeta), xd, x, dyd, dy, dxd, dx, gd, g, dg, db, eps, mean, ivar)
-  else
-    ivar = 1 ./ sqrt.(reshape(running_var, _wsize(x)) .+ eps)
-    dx .= dy .* reshape(g, _wsize(x)) .* ivar
-    rdims = ((1:ndims(x)-2)..., ndims(x))
-    dg .= vec(sum(dy .* (x .- reshape(running_mean, _wsize(x))) .* ivar, dims=rdims))
-    db .= vec(sum(dy, dims=rdims))
-  end
-end
-  
\ No newline at end of file
diff --git a/lib/NNlibCUDA/src/cudnn/conv.jl b/lib/NNlibCUDA/src/cudnn/conv.jl
deleted file mode 100644
index bbcdde3ac..000000000
--- a/lib/NNlibCUDA/src/cudnn/conv.jl
+++ /dev/null
@@ -1,127 +0,0 @@
-
-# Deprecated methods
-using NNlib: DenseConvDims
-import NNlib: stride, padding, dilation, flipkernel, spatial_dims, kernel_size,
-    conv!, ∇conv_filter!, ∇conv_data!,
-    maxpool!, meanpool!, ∇maxpool!, ∇meanpool!, PoolDims
-
-using CUDA.CUDNN: scalingParameter, CUDNN_CONVOLUTION, convdims, 
-                  cudnnConvolutionDescriptor, cudnnConvolutionBwdDataAlgoPerf,
-                  cudnnConvolutionForward!, cudnnConvolutionBwdFilterAlgoPerf,
-                  cudnnConvolutionBackwardData, cudnnConvolutionBackwardFilter,
-                  cudnnConvolutionBackwardBias
-
-const CUDNNFloat = Union{Float16,Float32,Float64}
-
-# Since CUDNN does not support 1D convolution, Conv in Flux will give a CUDNNError if the size is 1-dimensional.
-fix1d(x) = x
-fix1d(x::DenseCuArray{T, 3}) where T = reshape(x, 1, size(x, 1), size(x, 2), size(x, 3))
-fix1d(cdims::DenseConvDims{1,K,C_in,C_out,S,P,D,F}) where {K,C_in,C_out,S,P,D,F} =
-    DenseConvDims{2,(1,K...),C_in,C_out,(1,S...),(0,0,P...),(1,D...),F}((1,cdims.I...))
-fix1d(pdims::PoolDims{1,K,S,P,D}) where {K,S,P,D,F} =
-    PoolDims{2,(1,K...),(1,S...),(0,0,P...),(1,D...)}((1,pdims.I...), pdims.C_in)
-
-# Convolution
-
-function cudnnConvolutionDescriptor(cdims::DenseConvDims, x::DenseCuArray{T}) where T
-    cdims, x = fix1d(cdims), fix1d(x)
-    mode=(NNlib.flipkernel(cdims) ? CUDNN_CROSS_CORRELATION : CUDNN_CONVOLUTION)
-    cudnnConvolutionDescriptor(convdims(nnlibPadding(cdims),size(x)), convdims(NNlib.stride(cdims),size(x)), convdims(NNlib.dilation(cdims),size(x)), mode, cudnnDataType(T), math_mode(), CUDNN_DEFAULT_REORDER, Cint(1))
-end
-
-function conv!(y::DenseCuArray{T}, x::DenseCuArray{T}, w::DenseCuArray{T}, cdims::DenseConvDims;
-               alpha=1, beta=0, algo=-1) where T<:CUDNNFloat
-    if cudnnversion() < v"6"
-        all(x -> x == 1, dilation(cdims)) || error("Only dilation = 1 is supported in cuDNN version < 6")
-    end
-    if algo != -1
-        @warn "algo option has been deprecated, the fastest algo is computed automatically" maxlog=1
-    end
-    d = cudnnConvolutionDescriptor(cdims, x)
-    cudnnConvolutionForward!(y, w, x, d; alpha, beta, z=y)
-end
-
-function NNlib.conv_bias_act!(y::DenseCuArray{T}, x::DenseCuArray{T}, w::DenseCuArray{T}, 
-                            cdims::DenseConvDims, bias::DenseCuArray{T}, σ=identity;
-                            z::DenseCuArray{T}=y, alpha=1, beta=0, algo=-1) where T<:CUDNNFloat
-    if cudnnversion() < v"6"
-        all(x -> x == 1, dilation(cdims)) || error("Only dilation = 1 is supported in cuDNN version < 6")
-    end
-    if algo != -1
-        @warn "The algo option has been deprecated, the fastest algo is computed automatically" maxlog=1
-    end    
-    d = cudnnConvolutionDescriptor(cdims, x)
-    # only relu and identity are supported by cudnnConvolutionForward!
-    activation = (σ == NNlib.relu ? CUDNN_ACTIVATION_RELU : CUDNN_ACTIVATION_IDENTITY)
-    cudnnConvolutionForward!(y, w, x, d; z, bias, activation, alpha, beta)
-    if activation === CUDNN_ACTIVATION_IDENTITY && σ ∉ (nothing, identity)
-        y = σ.(y)
-    end
-    return y
-end
-
-function ∇conv_data!(dx::DenseCuArray{T}, dy::DenseCuArray{T}, w::DenseCuArray{T},
-                     cdims::DenseConvDims; alpha=1, beta=0, algo=-1) where T<:CUDNNFloat
-    if cudnnversion() < v"6"
-        all(x -> x == 1, dilation(cdims)) || error("Only dilation = 1 is supported in cuDNN version < 6")
-    end
-    if algo != -1
-        @warn "The algo option has been deprecated, the fastest algo is computed automatically" maxlog=1
-    end    
-    alpha, beta = scalingParameter(T,alpha), scalingParameter(T,beta);
-    xDesc, yDesc, wDesc = cudnnTensorDescriptor(dx), cudnnTensorDescriptor(dy), cudnnFilterDescriptor(w)
-    convDesc = cudnnConvolutionDescriptor(cdims, dx)
-    p = cudnnConvolutionBwdDataAlgoPerf(wDesc, w, yDesc, dy, convDesc, xDesc, dx)
-    @workspace size=p.memory workspace->cudnnConvolutionBackwardData(handle(), alpha, wDesc, w, yDesc, dy, convDesc, p.algo, workspace, sizeof(workspace), beta, xDesc, dx)
-    return dx
-end
-
-function ∇conv_filter!(dw::DenseCuArray{T}, x::DenseCuArray{T}, dy::DenseCuArray{T},
-                       cdims::DenseConvDims; alpha=1, beta=0, algo=-1) where T<:CUDNNFloat
-    if cudnnversion() < v"6"
-        all(x -> x == 1, dilation(cdims)) || error("Only dilation = 1 is supported in cuDNN version < 6")
-    end
-    if algo != -1
-        @warn "The algo option has been deprecated, the fastest algo is computed automatically" maxlog=1
-    end    
-    alpha, beta = scalingParameter(T,alpha), scalingParameter(T,beta);
-    xDesc, yDesc, wDesc = cudnnTensorDescriptor(x), cudnnTensorDescriptor(dy), cudnnFilterDescriptor(dw)
-    convDesc = cudnnConvolutionDescriptor(cdims, x)
-    p = cudnnConvolutionBwdFilterAlgoPerf(xDesc, x, yDesc, dy, convDesc, wDesc, dw);
-    @workspace size=p.memory workspace->cudnnConvolutionBackwardFilter(handle(), alpha, xDesc, x, yDesc, dy, convDesc, p.algo, workspace, sizeof(workspace), beta, wDesc, dw);
-    return dw
-end
-
-
-function ∇conv_bias!(db::DenseCuArray{T}, dy::DenseCuArray{T}; alpha=1, beta=0) where T<:CUDNNFloat
-    alpha,beta = scalingParameter(T,alpha), scalingParameter(T,beta)
-    bDesc, yDesc = cudnnTensorDescriptor.((db,dy))
-    cudnnConvolutionBackwardBias(handle(), alpha, yDesc, dy, beta, bDesc, db)
-    return db
-end
-
-# Compatibility shims until users upgrade to new NNlib format
-function conv!(y::DenseCuArray{T}, x::DenseCuArray{T}, w::DenseCuArray{T}; pad=0, stride=1, flipkernel=0, dilation=1, kwargs...) where {T<:CUDNNFloat}
-    cdims = DenseConvDims(x, w; padding=pad, stride=stride, flipkernel=(flipkernel!=0), dilation=dilation)
-    return conv!(y, x, w, cdims; kwargs...)
-end
-
-function ∇conv_filter!(dw::DenseCuArray{T}, dy::DenseCuArray{T}, x::DenseCuArray{T}; pad=0, stride=1, flipkernel=0, dilation=1, kwargs...) where {T<:CUDNNFloat}
-    cdims = DenseConvDims(x, dw; padding=pad, stride=stride, flipkernel=(flipkernel!=0), dilation=dilation)
-    # NOTE!!!  This compat shim re-arranges the argument order!
-    return ∇conv_filter!(dw, x, dy, cdims; kwargs...)
-end
-
-
-function cudnnConvolutionForward(y::DenseCuArray{T,N}, x::DenseCuArray{T,N}, w::DenseCuArray{T,N},
-                                 cdims::DenseConvDims; algo=0, alpha=1, beta=0) where {T,N}
-    # @warn "`cudnnConvolutionForward(y,x,w,c::DenseConvDims)` is deprecated, please use one of the methods in `@doc cudnnConvolutionForward!`." maxlog=1
-    cudnnConvolutionForward!(y, w, x; alpha, beta, padding=nnlibPadding(cdims), stride=NNlib.stride(cdims), dilation=NNlib.dilation(cdims), mode=(NNlib.flipkernel(cdims) ? CUDNN_CROSS_CORRELATION : CUDNN_CONVOLUTION))
-end
-
-function cudnnConvolutionBiasActivationForward(y::DenseCuArray{T,N}, x::DenseCuArray{T,N}, w::DenseCuArray{T,N}, z::DenseCuArray{T,N}, bias::DenseCuArray{T,N},
-                                               cdims::DenseConvDims; algo=0, alpha1=1, alpha2=1,
-                                               activationMode=CUDNN_ACTIVATION_RELU, activationCoeff=0.0, activationReluNanOpt=CUDNN_NOT_PROPAGATE_NAN) where {T,N}
-    # @warn "`cudnnConvolutionBiasActivationForward` is deprecated, please use one of the methods in `@doc cudnnConvolutionForward!`." maxlog=1
-    cudnnConvolutionForward!(y, w, x; bias, activation=activationMode, z, alpha=alpha1, beta=alpha2, padding=nnlibPadding(cdims), stride=NNlib.stride(cdims), dilation=NNlib.dilation(cdims), mode=(NNlib.flipkernel(cdims) ? CUDNN_CROSS_CORRELATION : CUDNN_CONVOLUTION))
-end
diff --git a/lib/NNlibCUDA/src/cudnn/cudnn.jl b/lib/NNlibCUDA/src/cudnn/cudnn.jl
deleted file mode 100644
index 99ee1b6a9..000000000
--- a/lib/NNlibCUDA/src/cudnn/cudnn.jl
+++ /dev/null
@@ -1,13 +0,0 @@
-using CUDA.CUDNN: handle, @workspace, cudnnTensorDescriptor, cudnnFilterDescriptor,
-                  cudnnDataType, math_mode, CUDNN_DEFAULT_REORDER, CUDNN_CROSS_CORRELATION,
-                  CUDNN_NOT_PROPAGATE_NAN, CUDNN_TENSOR_NCHW, dim4
-
-cudnnversion() = CUDA.CUDNN.version()
-
-function nnlibPadding(dims)
-    pd = NNlib.padding(dims)
-    if !all(pd[1:2:end] .== pd[2:2:end])
-        @warn "cuDNN does not support asymmetric padding; defaulting to symmetric choice" maxlog=1
-    end
-    return pd[1:2:end]
-end
\ No newline at end of file
diff --git a/lib/NNlibCUDA/src/cudnn/pooling.jl b/lib/NNlibCUDA/src/cudnn/pooling.jl
deleted file mode 100644
index be9ecc5d2..000000000
--- a/lib/NNlibCUDA/src/cudnn/pooling.jl
+++ /dev/null
@@ -1,56 +0,0 @@
-using CUDA.CUDNN: cudnnPoolingMode_t, CUDNN_POOLING_MAX, 
-                  CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING,
-                  cudnnPoolingForward!, pooldims, cudnnPoolingBackward
-          
-import CUDA.CUDNN: cudnnPoolingDescriptor
-
-function cudnnPoolingDescriptor(pdims::PoolDims, x::DenseCuArray{T}, mode::cudnnPoolingMode_t) where T
-    pdims, x = fix1d(pdims), fix1d(x)
-    window, padding, stride = NNlib.kernel_size(pdims), nnlibPadding(pdims), NNlib.stride(pdims)
-    nanOpt = CUDNN_NOT_PROPAGATE_NAN
-    cudnnPoolingDescriptor(mode, nanOpt, Cint(max(2,ndims(x)-2)), pooldims(window,size(x)), pooldims(padding,size(x)), pooldims(stride,size(x)))
-end
-
-function maxpool!(y::DenseCuArray{T}, x::DenseCuArray{T}, pdims::PoolDims) where T<:CUDNNFloat
-    d = cudnnPoolingDescriptor(pdims, x, CUDNN_POOLING_MAX)
-    cudnnPoolingForward!(y, x, d)
-end
-
-function ∇maxpool!(dx::DenseCuArray{T}, dy::DenseCuArray{T}, y::DenseCuArray{T}, x::DenseCuArray{T}, pdims::PoolDims) where T<:CUDNNFloat
-    xDesc, yDesc = cudnnTensorDescriptor.((x, y))
-    d = cudnnPoolingDescriptor(pdims, x, CUDNN_POOLING_MAX)
-    alpha, beta = scalingParameter(T,1), scalingParameter(T,0)
-    cudnnPoolingBackward(handle(), d, alpha, yDesc, y, yDesc, dy, xDesc, x, beta, xDesc, dx)
-    return dx
-end
-
-function meanpool!(y::DenseCuArray{T}, x::DenseCuArray{T}, pdims::PoolDims) where T<:CUDNNFloat
-    d = cudnnPoolingDescriptor(pdims, x, CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING)
-    cudnnPoolingForward!(y, x, d)
-end
-
-function ∇meanpool!(dx::DenseCuArray{T}, dy::DenseCuArray{T}, y::DenseCuArray{T}, x::DenseCuArray{T}, pdims::PoolDims) where T<:CUDNNFloat
-    xDesc, yDesc = cudnnTensorDescriptor.((x, y))
-    d = cudnnPoolingDescriptor(pdims, x, CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING)
-    alpha, beta = scalingParameter(T,1), scalingParameter(T,0)
-    cudnnPoolingBackward(handle(), d, alpha, yDesc, y, yDesc, dy, xDesc, x, beta, xDesc, dx)
-    return dx
-end
-
-function maxpool!(y::DenseCuArray{T}, x::DenseCuArray{T}, k; pad=map(_->0,k), stride=k) where {T<:CUDNNFloat}
-    pdims = PoolDims(x, k; padding=pad, stride=stride)
-    return maxpool!(y, x, pdims)
-end
-
-function meanpool!(y::DenseCuArray{T}, x::DenseCuArray{T}, k; pad=map(_->0,k), stride=k) where {T<:CUDNNFloat}
-    pdims = PoolDims(x, k; padding=pad, stride=stride)
-    return meanpool!(y, x, pdims)
-end
-
-# Deprecated methods
-function cudnnPoolingForward(y::DenseCuArray{T,N}, x::DenseCuArray{T,N}, pdims::NNlib.PoolDims;
-                             alpha=1, beta=0, mode=CUDNN_POOLING_MAX) where {T,N}
-    # @warn "`cudnnPoolingForward(y,x,d::PoolDims)` is deprecated, please use one of the methods in `@doc cudnnPoolingForward`." maxlog=1
-    cudnnPoolingForward!(y, x; window=NNlib.kernel_size(pdims), padding=nnlibPadding(pdims), stride=NNlib.stride(pdims), mode, alpha, beta)
-end
-
diff --git a/lib/NNlibCUDA/src/cudnn/softmax.jl b/lib/NNlibCUDA/src/cudnn/softmax.jl
deleted file mode 100644
index 06a373132..000000000
--- a/lib/NNlibCUDA/src/cudnn/softmax.jl
+++ /dev/null
@@ -1,102 +0,0 @@
-import NNlib: softmax, softmax!, ∇softmax, ∇softmax!,
-              logsoftmax, logsoftmax!, ∇logsoftmax, ∇logsoftmax!
-
-using CUDA.CUDNN: CUDNN_SOFTMAX_LOG, CUDNN_SOFTMAX_MODE_CHANNEL, 
-                CUDNN_SOFTMAX_FAST, CUDNN_SOFTMAX_ACCURATE, cudnnSoftmaxForward!,
-                cudnnSoftmaxBackward
-
-# Softmax
-
-# @denizyuret: do not do inplace operations with softmax/logsoftmax when (1) cpu version is not, (2) one can use softmax!
-function softmax(x::T; dims=1) where {T<:DenseCuArray}
-    softmax!(similar(x), x; dims)
-end
-
-function ∇softmax(dy::T, x::T, y::T; dims=1) where {T<:DenseCuArray}
-    ∇softmax!(similar(x), dy, x, y; dims)
-end
-
-function logsoftmax(x::T; dims=1) where {T<:DenseCuArray}
-    logsoftmax!(similar(x), x; dims)
-end
-
-function ∇logsoftmax(dy::T, x::T, y::T; dims=1) where {T<:DenseCuArray}
-    ∇logsoftmax!(similar(x), dy, x, y; dims)
-end
-
-# @denizyuret: backup implementations for unsupported/slow size/dims combinations:
-function _softmax!(y::T, x::T; dims) where {T<:DenseCuArray}
-    y .= exp.(x .- maximum(x; dims))
-    y ./= sum(y; dims)
-end
-
-function _∇softmax!(dx::T, dy::T, x::T, y::T; dims) where {T<:DenseCuArray}
-    dx .= y .* (dy .- sum(dy .* y; dims))
-end
-
-function _logsoftmax!(y::T, x::T; dims) where {T<:DenseCuArray}
-    y .= x .- maximum(x; dims)
-    y .-= log.(sum(exp.(y); dims))
-end
-
-function _∇logsoftmax!(dx::T, dy::T, x::T, y::T; dims) where {T<:DenseCuArray}
-    dx .= dy .- sum(dy; dims) .* exp.(y)
-end
-
-# Trick by @norci to use cudnn for softmax dims args that are contiguous: 
-# If dims=(dmin:dmax) then CUDNN_SOFTMAX_MODE_CHANNEL does the trick with reshape 
-#    (1, prod(size(x)[1:dmin-1]), prod(size(x)[dmin:dmax]), :)
-# softmaxdims returns nothing when the backup implementation should be used.
-
-function softmaxdims(x, dims)
-    dims === Colon() && return (1, 1, length(x), 1)
-    mind,maxd = minimum(dims),maximum(dims)
-    all(i in dims for i in mind:maxd) || return nothing # cannot handle if not contiguous
-    stride = dimsize = 1
-    for i in 1:(mind-1); stride *= size(x,i); end # Using size(x,i) assumes trailing dims = 1, robust to maxd > ndims(x)
-    for i in mind:maxd; dimsize *= size(x,i); end
-    batchsize = length(x)÷(stride*dimsize)
-    # Here is a region where cudnn is slower, so we go with the backup:
-    batchsize == 1 && 64 <= stride <= 4096 && 64 <= dimsize <= 4096 && return nothing
-    return (1, stride, dimsize, batchsize)
-end
-
-# Determine softmax algo based on math_mode
-
-softmaxalgo() = (CUDA.math_mode()===CUDA.FAST_MATH ? CUDNN_SOFTMAX_FAST : CUDNN_SOFTMAX_ACCURATE)
-
-# Main implementations:
-
-function softmax!(y::T, x::T = y; dims=1) where {T<:DenseCuArray}
-    s = softmaxdims(x, dims)
-    s === nothing && return _softmax!(y, x; dims)
-    cudnnSoftmaxForward!(reshape(y,s), reshape(x,s); mode = CUDNN_SOFTMAX_MODE_CHANNEL, algo = softmaxalgo())
-    return y
-end
-
-function ∇softmax!(dx::T, dy::T, x::T, y::T; dims=1) where {R,T<:DenseCuArray{R}}
-    s = softmaxdims(x, dims)
-    s === nothing && return _∇softmax!(dx, dy, x, y; dims)
-    xDesc = cudnnTensorDescriptor(reshape(x,s))
-    alpha, beta = scalingParameter(R,1), scalingParameter(R,0)
-    cudnnSoftmaxBackward(handle(), softmaxalgo(), CUDNN_SOFTMAX_MODE_CHANNEL, 
-                         alpha, xDesc, y, xDesc, dy, beta, xDesc, dx)
-    return dx
-end
-
-function logsoftmax!(y::T, x::T = y; dims=1) where {T<:DenseCuArray}
-    s = softmaxdims(x, dims)
-    s === nothing && return _logsoftmax!(y, x; dims)
-    cudnnSoftmaxForward!(reshape(y,s), reshape(x,s); mode = CUDNN_SOFTMAX_MODE_CHANNEL, algo = CUDNN_SOFTMAX_LOG)
-    return y
-end
-
-function ∇logsoftmax!(dx::T, dy::T, x::T, y::T; dims=1) where {R,T<:DenseCuArray{R}}
-    s = softmaxdims(x, dims)
-    s === nothing && return _∇logsoftmax!(dx, dy, x, y; dims)
-    xDesc = cudnnTensorDescriptor(reshape(x,s))
-    alpha, beta = scalingParameter(R,1), scalingParameter(R,0)
-    cudnnSoftmaxBackward(handle(), CUDNN_SOFTMAX_LOG, CUDNN_SOFTMAX_MODE_CHANNEL, 
-                         alpha, xDesc, y, xDesc, dy, beta, xDesc, dx)
-    return dx
-end
diff --git a/lib/NNlibCUDA/src/upsample.jl b/lib/NNlibCUDA/src/upsample.jl
deleted file mode 100644
index 287adbd28..000000000
--- a/lib/NNlibCUDA/src/upsample.jl
+++ /dev/null
@@ -1,157 +0,0 @@
-
-#
-# Upsampling
-#
-
-# GPU based bilinear upsampling including its gradient
-#
-# Based on the Caffe2 implementation at:
-# The code is a translation from the following files:
-# - https://github.com/pytorch/pytorch/blob/v1.8.0-rc1/caffe2/operators/upsample_op.cu
-# - https://github.com/pytorch/pytorch/blob/v1.8.0-rc1/caffe2/core/common_gpu.h
-#
-# Copyright (c) 2016-2021 Facebook Inc.
-# Copyright (c) 2015 Google Inc.
-# Copyright (c) 2015 Yangqing Jia
-# Copyright 2019-2020 Kakao Brain
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without modification, are
-# permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this list of
-#    conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice, this list of
-#    conditions and the following disclaimer in the documentation and/or other materials
-#    provided with the distribution.
-#
-# 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America and
-#    IDIAP Research Institute nor the names of its contributors may be used to endorse or
-#    promote products derived from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
-# TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-# Forward and backward pass have been tested to produce the same output
-# as pytorch with align_corners=True - it works modulo bit noise.
-
-function upsample_bilinear_whcn_kernel!(n_elem, rheight, rwidth, x, y)
-    index = (threadIdx().x - 1) + (blockIdx().x - 1) * blockDim().x
-
-    if index < n_elem
-        in_w, in_h, channels, batchsize = size(x)
-        out_w, out_h, _, _ = size(y)
-
-        ow = index % out_w
-        oh = index ÷ out_w
-
-        real_index = rheight*oh
-        ih0 = Base.floor(Int, real_index)
-        offset = (ih0 < in_h-1) ? 1 : 0
-        ih1 = ih0 + offset + 1
-        h1lambda = real_index - ih0
-        h0lambda = 1 - h1lambda
-        ih0 += 1
-
-        real_index = rwidth*ow
-        iw0 = Base.floor(Int, real_index)
-        offset = (iw0 < in_w-1) ? 1 : 0
-        iw1 = iw0 + offset + 1
-        w1lambda = real_index - iw0
-        w0lambda = 1 - w1lambda
-        iw0 += 1
-
-        @inbounds for n in 1:batchsize
-            for c in 1:channels
-                val = h0lambda * (w0lambda * x[iw0, ih0, c, n]  + # h0 * w0 * i00
-                                  w1lambda * x[iw1, ih0, c, n]) + # h0 * w1 * i01
-                      h1lambda * (w0lambda * x[iw0, ih1, c, n]  + # h1 * w0 * i10
-                                  w1lambda * x[iw1, ih1, c, n])   # h1 * w1 * i11
-                y[ow+1, oh+1, c, n] = val
-            end
-        end
-    end
-    return nothing
-end
-
-# Δ is the gradient backpropagated from downstream layers
-function ∇upsample_bilinear_whcn_kernel!(n_elem, rheight, rwidth, Δ, dx)
-    index = (threadIdx().x - 1) + (blockIdx().x - 1) * blockDim().x
-
-    if index < n_elem
-        in_width, in_height, channels, batchsize = size(Δ)
-        out_width, out_height, _, _ = size(dx)
-
-        iw = index % in_width
-        ih = index ÷ in_width
-
-        # Compute Y axis lambdas
-        real_index_h = rheight*ih
-        oh0 = Base.floor(Int, real_index_h)
-        offset = (oh0 < out_height-1) ? 1 : 0
-        oh1 = oh0 + offset + 1
-        h1lambda = real_index_h - oh0
-        h0lambda = 1 - h1lambda
-        oh0 += 1
-
-        # # Compute X axis lambdas
-        real_index_w = rwidth * iw
-        ow0 = Base.floor(Int, real_index_w)
-        offset = (ow0 < out_width - 1) ? 1 : 0
-        ow1 = ow0 + offset + 1
-        w1lambda = real_index_w - ow0
-        w0lambda = 1 - w1lambda
-        ow0 += 1
-
-        @inbounds for n in 1:batchsize
-            for c in 1:channels
-                val = Δ[iw+1, ih+1, c, n]
-                @atomic dx[ow0, oh0, c, n] += h0lambda * w0lambda * val
-                @atomic dx[ow1, oh0, c, n] += h0lambda * w1lambda * val
-                @atomic dx[ow0, oh1, c, n] += h1lambda * w0lambda * val
-                @atomic dx[ow1, oh1, c, n] += h1lambda * w1lambda * val
-            end
-        end
-    end # if
-    return nothing
-end
-
-function NNlib.upsample_bilinear_whcn!(y::CuArray{T,4}, x::CuArray{T,4}) where T
-    w,h,c,n = size(x)
-    out_w, out_h = (size(y,1), size(y,2))
-
-    out_size = out_h*out_w
-    rheight = T((h-1)/(out_h-1))
-    rwidth  = T((w-1)/(out_w-1))
-
-    kernel = @cuda launch=false upsample_bilinear_whcn_kernel!(out_size, rheight, rwidth, x, y)
-    config = launch_configuration(kernel.fun; max_threads=256)
-    threads = Base.min(out_size, config.threads)
-    blocks = cld(out_size, threads)
-    kernel(out_size, rheight, rwidth, x, y; threads=threads, blocks=blocks)
-    return y
-end
-
-function NNlib.∇upsample_bilinear_whcn!(dx::CuArray{T,4}, Δ::CuArray{T,4}) where T
-    w,h,c,n = Base.size(Δ)
-    out_w, out_h = (size(dx, 1), size(dx, 2))
-    in_size = h*w
-    rheight = T((out_h-1)/(h-1)) # reversed compared to forward pass
-    rwidth  = T((out_w-1)/(w-1))
-
-    kernel = @cuda launch=false ∇upsample_bilinear_whcn_kernel!(in_size, rheight, rwidth, Δ, dx)
-    config = launch_configuration(kernel.fun; max_threads=256)
-    threads = Base.min(in_size, config.threads)
-    blocks = cld(in_size, threads)
-    kernel(in_size, rheight, rwidth, Δ, dx; threads=threads, blocks=blocks)
-    return dx
-end
diff --git a/lib/NNlibCUDA/test/activations.jl b/lib/NNlibCUDA/test/activations.jl
deleted file mode 100644
index 67ca7c1cc..000000000
--- a/lib/NNlibCUDA/test/activations.jl
+++ /dev/null
@@ -1,32 +0,0 @@
-@testset "activation broadcast" begin
-    for f in NNlib.ACTIVATIONS
-        if f ∉ [:rrelu]
-            @eval gputest(x -> $f.(x), rand(Float64, 5))
-        end
-    end
-end
-
-@testset "forward diff" begin
-    f(x) = logσ.(x)
-    ds = Dual.(rand(5),1)
-    @test f(ds) ≈ collect(f(CuArray(ds)))
-end
-
-@testset "softplus" begin 
-  # softplus does not give `Inf` for large arguments
-   x = CuArray([1000.])
-   @test all(softplus.(x) .== x)
-end
-
-@testset "input is preserved" begin
-    x = CUDA.ones(1)
-    @test Array(x) == [1f0]
-    tanh.(x)
-    @test Array(x) == [1f0]
-    y = tanh.(x)
-    @test Array(x) == [1f0]
-    @test Array(y) == [tanh(1f0)]
-    x .= tanh.(y)
-    @test Array(y) == [tanh(1f0)]
-    @test Array(x) == [tanh(tanh(1f0))]
-end
diff --git a/lib/NNlibCUDA/test/batchedmul.jl b/lib/NNlibCUDA/test/batchedmul.jl
deleted file mode 100644
index d6a3b6023..000000000
--- a/lib/NNlibCUDA/test/batchedmul.jl
+++ /dev/null
@@ -1,56 +0,0 @@
-@testset "batched_mul" begin
-    using NNlib: batched_mul, batched_mul!, batched_vec, 
-                 batched_adjoint, batched_transpose
-
-    A = randn(Float32, 3,3,2);
-    B = randn(Float32, 3,3,2);
-
-    C = batched_mul(A, B)
-    @test CuArray(C) ≈ batched_mul(CuArray(A), CuArray(B))
-
-    Ct = batched_mul(batched_transpose(A), B)
-    @test CuArray(Ct) ≈ batched_mul(batched_transpose(CuArray(A)), CuArray(B))
-
-    Ca = batched_mul(A, batched_adjoint(B))
-    @test CuArray(Ca) ≈ batched_mul(CuArray(A), batched_adjoint(CuArray(B)))
-
-    # 5-arg batched_mul!
-    C .= pi
-    batched_mul!(C, A, B, 2f0, 3f0)
-    cuCpi = CuArray(similar(C)) .= pi
-    @test CuArray(C) ≈ batched_mul!(cuCpi, CuArray(A), CuArray(B), 2f0, 3f0)
-
-    # PermutedDimsArray
-    @test CuArray(Ct) ≈ batched_mul(PermutedDimsArray(CuArray(A), (2,1,3)), CuArray(B))
-
-    D = permutedims(B, (1,3,2))
-    Cp = batched_mul(batched_adjoint(A), B)
-    @test CuArray(Cp) ≈ batched_mul(batched_adjoint(CuArray(A)), PermutedDimsArray(CuArray(D), (1,3,2)))
-
-    # Methods which reshape
-    M = randn(Float32, 3,3)
-
-    Cm = batched_mul(A, M)
-    @test CuArray(Cm) ≈ batched_mul(CuArray(A), CuArray(M))
-
-    Cv = batched_vec(permutedims(A,(3,1,2)), M)
-    @test CuArray(Cv) ≈ batched_vec(PermutedDimsArray(CuArray(A),(3,1,2)), CuArray(M))
-end
-
-@testset "NNlib storage_type etc." begin
-    using LinearAlgebra
-    using NNlib: is_strided, are_strided, storage_type
-
-    M = cu(ones(10,10))
-
-    @test is_strided(M)
-    @test is_strided(view(M, 1:2:5,:))
-    @test is_strided(PermutedDimsArray(M, (2,1)))
-
-    @test !is_strided(reshape(view(M, 1:2:10,:), 10,:))
-    @test !is_strided((M .+ im)')
-    @test !is_strided(Diagonal(cu(ones(3))))
-
-    @test storage_type(M) == CuArray{Float32,2}
-    @test storage_type(reshape(view(M, 1:2:10,:), 10,:)) == CuArray{Float32,2}
-end
diff --git a/lib/NNlibCUDA/test/batchnorm.jl b/lib/NNlibCUDA/test/batchnorm.jl
deleted file mode 100644
index 0da8a0e0f..000000000
--- a/lib/NNlibCUDA/test/batchnorm.jl
+++ /dev/null
@@ -1,8 +0,0 @@
-@testset "Batchnorm" begin
-    v = CUDA.rand(Float32, 2)
-    m = CUDA.rand(Float32, 2, 5)
-    for training in (false, true)
-        NNlibCUDA.batchnorm(v, v, m, v, v, 1.0; training=training)
-        NNlibCUDA.∇batchnorm(v, v, m, m, v, v, 1.0; training=training)
-    end
-end
diff --git a/lib/NNlibCUDA/test/conv.jl b/lib/NNlibCUDA/test/conv.jl
deleted file mode 100644
index 6d7e0d6df..000000000
--- a/lib/NNlibCUDA/test/conv.jl
+++ /dev/null
@@ -1,60 +0,0 @@
-using NNlib: DenseConvDims
-
-@testset "convolution" begin
-    a, b, c = rand(Float64, 10, 10, 3, 1), rand(Float64, 2, 2, 3, 4), rand(Float64, 9, 9, 4, 1)
-    da, db, dc = CuArray(a), CuArray(b), CuArray(c)
-    cdims = DenseConvDims(a, b)
-    @test NNlib.conv(a, b, cdims) ≈ collect(NNlib.conv(da, db, cdims))
-    @test ∇conv_data(c, b, cdims) ≈ collect(∇conv_data(dc, db, cdims))
-    @test ∇conv_filter(a, c, cdims) ≈ collect(∇conv_filter(da, dc, cdims))
-
-    # Test for agreement between CPU NNlib and CuDNN versions, across a variety of kwargs
-    for num_spatial_dims in (1, 2, 3)
-        # Initialize data we'll run our tests over
-        C_in = 3
-        C_out = 4
-        batch_size = 1
-        x = rand(Float64, fill(8, num_spatial_dims)..., C_in, batch_size)
-        w = rand(Float64, fill(2, num_spatial_dims)..., C_in, C_out)
-        b = rand(Float64, fill(1, num_spatial_dims)..., C_in, C_out)
-        options = (Dict(), Dict(:dilation => 2), Dict(:flipkernel => true), Dict(:stride => 2), Dict(:padding => 1))
-
-        # @denizyuret: algo option deprecated for nnlib, handling in cudnn
-        # algos = (1, 0, 1, 1,)
-        # for (opts, algo) in zip(options, algos)
-
-        for opts in options  
-            cdims = DenseConvDims(x, w; opts...)
-            y = NNlib.conv(x, w, cdims)
-
-            # Test that basic convolution is equivalent across GPU/CPU
-            gputest((x, w) -> NNlib.conv(x, w, cdims), x, w)
-            gputest((y, w) -> NNlib.∇conv_data(y, w, cdims), y, w)
-            gputest((x, y) -> NNlib.∇conv_filter(x, y, cdims), x, y, checkgrad=false) # TODO fix grad
-
-            # Scaling factors
-            gputest((x, w) -> NNlib.conv(x, w, cdims; alpha=2.0), x, w, checkgrad=false) # TODO
-            gputest((y, w) -> NNlib.∇conv_data(y, w, cdims; alpha=2.0), y, w, checkgrad=false) # TODO
-            gputest((x, y) -> NNlib.∇conv_filter(x, y, cdims; alpha=2.0), x, y, checkgrad=false) # TODO
-            
-            gputest((y, x, w) -> NNlib.conv!(copy(y), x, w, cdims; beta=2.0), y, x, w, checkgrad=false) # TODO
-            # @test_broken gputest((x, y, w) -> NNlib.∇conv_data!(copy(x), y, w, cdims; beta=2.0), x, y, w, checkgrad=false) #TODO
-            gputest((w, x, y) -> NNlib.∇conv_filter!(copy(w), x, y, cdims; beta=2.0), w, x, y, checkgrad=false) # TODO
-
-            # Test the compatibility shims
-            cy,cx,cw = CuArray{Float32}.((y,x,w))
-            opts2 = Dict((k==:padding ? :pad : k)=>v for (k,v) in opts)
-            @test NNlib.conv!(similar(cy),cx,cw; opts2...) ≈ NNlib.conv!(similar(cy),cx,cw,cdims)
-            @test NNlib.∇conv_filter!(similar(cw),cy,cx; opts2...) ≈ NNlib.∇conv_filter!(similar(cw),cx,cy,cdims)
-        end
-
-        # CPU implementation of ∇conv_bias!
-        db = zeros(Float64, 1, 1, 3, 1)
-        dy = randn(Float64, 8, 8, 3, 1)
-        function NNlibCUDA.∇conv_bias!(db, dy)
-            db .= sum(dy, dims=(1:(ndims(dy)-2)))
-            return db
-        end
-        gputest(NNlibCUDA.∇conv_bias!, db, dy, checkgrad=false)
-    end
-end
diff --git a/lib/NNlibCUDA/test/pooling.jl b/lib/NNlibCUDA/test/pooling.jl
deleted file mode 100644
index 9cd3739fe..000000000
--- a/lib/NNlibCUDA/test/pooling.jl
+++ /dev/null
@@ -1,25 +0,0 @@
-@testset "pooling" begin
-
-    # Test for agreement between CPU NNlib and CuDNN versions, across a variety of kwargs
-    for num_spatial_dims in (1, 2, 3)
-        # Initialize data we'll run our tests over
-        C_in = 3
-        batch_size = 1
-        x = rand(Float64, fill(8, num_spatial_dims)..., C_in, batch_size)
-       
-        # Test that pooling is equivalent across GPU/CPU
-        pdims = PoolDims(x, 2)
-        y = maxpool(x, pdims)
-        dy = ones(size(y))
-        gputest(x -> maxpool(x, pdims), x)
-        gputest((dy, y, x) -> ∇maxpool(dy, y, x, pdims), dy, y, x, checkgrad=false)
-        gputest(x -> maxpool(x, pdims), x)
-        gputest((dy, y, x) -> ∇maxpool(dy, y, x, pdims), dy, y, x, checkgrad=false)
-
-        # Test the compatibility shims for pooling
-        cx,cy,cdy = CuArray{Float32}.((x,y,dy))
-        win,pad=2,1
-        maxpool!(similar(cy), cx, win; pad=pad, stride=win) ≈ maxpool!(similar(cy), cx, PoolDims(cx, win; padding=pad, stride=win))
-        meanpool!(similar(cy), cx, win; pad=pad, stride=win) ≈ meanpool!(similar(cy), cx, PoolDims(cx, win; padding=pad, stride=win))
-    end
-end
diff --git a/lib/NNlibCUDA/test/runtests.jl b/lib/NNlibCUDA/test/runtests.jl
deleted file mode 100644
index 75e8ef779..000000000
--- a/lib/NNlibCUDA/test/runtests.jl
+++ /dev/null
@@ -1,21 +0,0 @@
-using Test
-using NNlib
-using Zygote
-using NNlibCUDA
-using ForwardDiff: Dual
-using CUDA
-CUDA.allowscalar(false)
-
-include("test_utils.jl")
-
-if CUDA.functional()
-    include("activations.jl")
-    include("batchedmul.jl")
-    include("upsample.jl")
-    include("conv.jl")
-    include("pooling.jl")
-    include("softmax.jl")
-    include("batchnorm.jl")
-else
-    @warn "needs working CUDA installation to perform tests"
-end
diff --git a/lib/NNlibCUDA/test/softmax.jl b/lib/NNlibCUDA/test/softmax.jl
deleted file mode 100644
index 2611e04f6..000000000
--- a/lib/NNlibCUDA/test/softmax.jl
+++ /dev/null
@@ -1,12 +0,0 @@
-@testset "softmax" begin
-    for dims in [(5,5), (5,)]
-        x = randn(Float64, dims)
-        y = softmax(x)
-        dy = randn(Float64, dims)
-        gputest(softmax, x)
-        gputest(∇softmax, dy, x, y, checkgrad=false)
-        y = logsoftmax(x)
-        gputest(logsoftmax, x)
-        gputest(∇logsoftmax, dy, x, y, checkgrad=false) 
-    end
-end
diff --git a/lib/NNlibCUDA/test/test_utils.jl b/lib/NNlibCUDA/test/test_utils.jl
deleted file mode 100644
index 46d33e0d8..000000000
--- a/lib/NNlibCUDA/test/test_utils.jl
+++ /dev/null
@@ -1,20 +0,0 @@
-function gputest(f, xs...; checkgrad=true, atol=1e-10)
-    cpu_in = xs
-    gpu_in = CuArray.(xs)
-
-    cpu_out = f(cpu_in...)
-    gpu_out = f(gpu_in...)
-    @test collect(cpu_out) ≈ collect(gpu_out)
-    
-    if checkgrad
-        cpu_grad = gradient((x...) -> sum(f(x...)), cpu_in...)
-        gpu_grad = gradient((x...) -> sum(f(x...)), gpu_in...)
-        for (cpu_g, gpu_g) in zip(cpu_grad, gpu_grad)
-            if cpu_g === nothing
-                @test gpu_g === nothing
-            else
-                @test collect(cpu_g) ≈ collect(gpu_g)  atol=atol
-            end
-        end
-    end
-end
diff --git a/lib/NNlibCUDA/test/upsample.jl b/lib/NNlibCUDA/test/upsample.jl
deleted file mode 100644
index 4858eecbe..000000000
--- a/lib/NNlibCUDA/test/upsample.jl
+++ /dev/null
@@ -1,27 +0,0 @@
-@testset "Bilinear upsampling" begin
-    x = Float32[1 2; 3 4][:,:,:,:]
-    x = cat(x,x; dims=3)
-    x = cat(x,x; dims=4)
-    xgpu = cu(x)
-  
-    y_true = Float32[ 1//1  4//3   5//3   2//1;
-            7//5 26//15 31//15 12//5;
-            9//5 32//15 37//15 14//5;
-           11//5 38//15 43//15 16//5;
-           13//5 44//15 49//15 18//5;
-            3//1 10//3  11//3   4//1]
-    y_true = cat(y_true,y_true; dims=3)
-    y_true = cat(y_true,y_true; dims=4)
-    y_true_gpu = cu(y_true)
-  
-    y = upsample_bilinear(xgpu, (3,2))
-    @test size(y) == size(y_true_gpu)
-    @test eltype(y) == Float32
-    @test collect(y) ≈ collect(y_true_gpu)
-
-    o = CUDA.ones(Float32,6,4,2,1)
-    grad_true = 6*CUDA.ones(Float32,2,2,2,1)
-    @test ∇upsample_bilinear(o; size=(2,2)) ≈ grad_true
-
-    gputest(x -> upsample_bilinear(x, (3, 2)), x, atol=1e-5)
-end

From 758118d95aa9526f1080ed6eb317af24366cd996 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacomputing.com>
Date: Thu, 15 Apr 2021 02:58:28 +0530
Subject: [PATCH 02/14] rm .ci

---
 .ci/develop.jl           |  7 -------
 .ci/develop_nnlibcuda.jl | 11 -----------
 .ci/test.jl              |  5 -----
 .ci/test_nnlibcuda.jl    |  5 -----
 4 files changed, 28 deletions(-)
 delete mode 100644 .ci/develop.jl
 delete mode 100644 .ci/develop_nnlibcuda.jl
 delete mode 100644 .ci/test.jl
 delete mode 100644 .ci/test_nnlibcuda.jl

diff --git a/.ci/develop.jl b/.ci/develop.jl
deleted file mode 100644
index ec423b45e..000000000
--- a/.ci/develop.jl
+++ /dev/null
@@ -1,7 +0,0 @@
-import Pkg
-
-root_directory = dirname(@__DIR__)
-
-nnlib = Pkg.PackageSpec(path = root_directory)
-Pkg.develop(nnlib)
-Pkg.precompile()
diff --git a/.ci/develop_nnlibcuda.jl b/.ci/develop_nnlibcuda.jl
deleted file mode 100644
index f309ca01c..000000000
--- a/.ci/develop_nnlibcuda.jl
+++ /dev/null
@@ -1,11 +0,0 @@
-import Pkg
-
-root_directory = dirname(@__DIR__)
-
-nnlib = Pkg.PackageSpec(path = root_directory)
-nnlibcuda = Pkg.PackageSpec(path = joinpath(root_directory, "lib", "NNlibCUDA"))
-
-Pkg.develop(nnlib)
-Pkg.develop(nnlibcuda)
-
-Pkg.precompile()
diff --git a/.ci/test.jl b/.ci/test.jl
deleted file mode 100644
index 51d8e28e8..000000000
--- a/.ci/test.jl
+++ /dev/null
@@ -1,5 +0,0 @@
-import Pkg
-
-pkgs = ["NNlib"]
-
-Pkg.test(pkgs; coverage = true)
diff --git a/.ci/test_nnlibcuda.jl b/.ci/test_nnlibcuda.jl
deleted file mode 100644
index e9bb40071..000000000
--- a/.ci/test_nnlibcuda.jl
+++ /dev/null
@@ -1,5 +0,0 @@
-import Pkg
-
-pkgs = ["NNlibCUDA"]
-
-Pkg.test(pkgs; coverage = true)

From ac405f3248d884d77ac58f4b3b711459da25147d Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacomputing.com>
Date: Thu, 15 Apr 2021 03:09:37 +0530
Subject: [PATCH 03/14] use CUDA test dep

---
 Project.toml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index caf41960b..6008e45f1 100644
--- a/Project.toml
+++ b/Project.toml
@@ -19,6 +19,7 @@ Requires = "0.5, 1.0"
 julia = "1.3"
 
 [extras]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a"
 FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000"
 Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
@@ -28,4 +29,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [targets]
-test = ["ChainRulesTestUtils", "FiniteDifferences", "Logging", "Random", "StableRNGs", "Test", "Zygote"]
+test = ["ChainRulesTestUtils", "CUDA", "FiniteDifferences", "Logging", "Random", "StableRNGs", "Test", "Zygote"]

From 7806bf8478800bf4830536c185bd76b4c4b9909a Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacomputing.com>
Date: Thu, 15 Apr 2021 03:10:34 +0530
Subject: [PATCH 04/14] test NNlibCUDA when possible

---
 test/runtests.jl | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/test/runtests.jl b/test/runtests.jl
index 7b4c17f31..60d9231b7 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -56,3 +56,15 @@ end
     include("utils.jl")
 end
 
+using CUDA
+
+if VERSION >= v"1.6" && CUDA.functional()
+    import Pkg
+    Pkg.develop(url = "https://github.com/FluxML/NNlibCUDA.jl")
+    using NNlibCUDA
+    @testset "CUDA" begin
+        Pkg.test("NNlibCUDA")
+    end
+else
+    @info "Insufficient version or CUDA not found: Skipping CUDA tests"
+end

From 996b98605bf3e775978ff8c53b6dcffd32f907eb Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacomputing.com>
Date: Thu, 15 Apr 2021 03:31:36 +0530
Subject: [PATCH 05/14] restore ci

---
 .buildkite/pipeline.yml  | 16 +++++++++++-----
 .github/workflows/ci.yml |  4 ++--
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 457442d97..624e8429b 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -3,19 +3,25 @@ steps:
     plugins:
       - JuliaCI/julia#v1:
           version: "1.6"
+      - JuliaCI/julia-test#v1: ~
       - JuliaCI/julia-coverage#v1:
           codecov: true
           dirs:
             - src
-            - lib
-    commands:
-      - julia .ci/develop_nnlibcuda.jl
-      - julia .ci/test_nnlibcuda.jl
+#     commands:
+#       - julia --project=test -e """
+#         Pkg.develop(url = \"https://github.com/FluxML/NNlibCUDA.jl\")
+#         Pkg.instantiate()
+#         Pkg.build()
+#         Pkg.status()
+#         Pkg.test()
+#         Pkg.test(\"NNlibCUDA\")
+#         """
     agents:
       queue: "juliagpu"
       cuda: "*"
     timeout_in_minutes: 60
-  
+
   ## Add these when julia 1.7 is out
   # - label: "GPU julia v1"       
   #   plugins:
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 245502576..4ea9f56cc 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -65,8 +65,8 @@ jobs:
             ${{ runner.os }}-test-${{ env.cache-name }}-
             ${{ runner.os }}-test-
             ${{ runner.os }}-
-      - run: julia .ci/develop.jl
-      - run: julia .ci/test.jl
+      - uses: julia-actions/julia-buildpkg@v1
+      - uses: julia-actions/julia-runtest@v1
       - uses: julia-actions/julia-processcoverage@v1
       - uses: codecov/codecov-action@v1
         with:

From 281727de24553f8ef70ae2724206b75d1b13225c Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacomputing.com>
Date: Thu, 15 Apr 2021 04:13:50 +0530
Subject: [PATCH 06/14] set NNLIB_TEST_CUDA on bk

---
 .buildkite/pipeline.yml | 12 ++++--------
 test/runtests.jl        | 14 +++++++++-----
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 624e8429b..c6256be49 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -20,6 +20,8 @@ steps:
     agents:
       queue: "juliagpu"
       cuda: "*"
+    env:
+      NNLIB_TEST_CUDA: true
     timeout_in_minutes: 60
 
   ## Add these when julia 1.7 is out
@@ -27,14 +29,11 @@ steps:
   #   plugins:
   #     - JuliaCI/julia#v1:
   #         version: "1"      
+  #     - JuliaCI/julia-test#v1: ~
   #     - JuliaCI/julia-coverage#v1:
   #         codecov: true
   #         dirs:
   #           - src
-  #           - lib
-  #   commands:
-  #     - julia .ci/develop.jl
-  #     - julia .ci/test.jl
   #   agents:
   #     queue: "juliagpu"
   #     cuda: "*"
@@ -44,14 +43,11 @@ steps:
   #   plugins:
   #     - JuliaCI/julia#v1:
   #         version: "nightly"
+  #     - JuliaCI/julia-test#v1: ~  
   #     - JuliaCI/julia-coverage#v1:
   #         codecov: true
   #         dirs:
   #           - src
-  #           - lib
-  #   commands:
-  #     - julia .ci/develop.jl
-  #     - julia .ci/test.jl
   #   agents:
   #     queue: "juliagpu"
   #     cuda: "*"
diff --git a/test/runtests.jl b/test/runtests.jl
index 60d9231b7..688da6e11 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -59,11 +59,15 @@ end
 using CUDA
 
 if VERSION >= v"1.6" && CUDA.functional()
-    import Pkg
-    Pkg.develop(url = "https://github.com/FluxML/NNlibCUDA.jl")
-    using NNlibCUDA
-    @testset "CUDA" begin
-        Pkg.test("NNlibCUDA")
+    if ENV["NNLIB_TEST_CUDA"] == "true"
+        import Pkg
+        Pkg.develop(url = "https://github.com/FluxML/NNlibCUDA.jl")
+        using NNlibCUDA
+        @testset "CUDA" begin
+            Pkg.test("NNlibCUDA")
+        end
+    else
+        @info "Skipping CUDA tests, set NNLIB_TEST_CUDA=true to run them"
     end
 else
     @info "Insufficient version or CUDA not found: Skipping CUDA tests"

From d5ad6aab50423382f76f40b7752f6e7c242a6473 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacomputing.com>
Date: Thu, 15 Apr 2021 04:48:24 +0530
Subject: [PATCH 07/14] typo

---
 test/runtests.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 688da6e11..87f03b902 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -70,5 +70,5 @@ if VERSION >= v"1.6" && CUDA.functional()
         @info "Skipping CUDA tests, set NNLIB_TEST_CUDA=true to run them"
     end
 else
-    @info "Insufficient version or CUDA not found: Skipping CUDA tests"
+    @info "Insufficient version or CUDA not found; Skipping CUDA tests"
 end

From 11e0da593c591a42d976bc796a95418823fe7a09 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacomputing.com>
Date: Thu, 15 Apr 2021 04:53:18 +0530
Subject: [PATCH 08/14] handle if env variable not set

---
 test/runtests.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 87f03b902..35ce7ede5 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -59,7 +59,7 @@ end
 using CUDA
 
 if VERSION >= v"1.6" && CUDA.functional()
-    if ENV["NNLIB_TEST_CUDA"] == "true"
+    if get(ENV, "NNLIB_TEST_CUDA", "false") == "true"
         import Pkg
         Pkg.develop(url = "https://github.com/FluxML/NNlibCUDA.jl")
         using NNlibCUDA

From 3a99c4028c7829b9bbafff4d16368afe135336d8 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacomputing.com>
Date: Thu, 15 Apr 2021 14:50:32 +0530
Subject: [PATCH 09/14] use has_cuda

---
 test/runtests.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 35ce7ede5..c2acb429f 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -58,7 +58,7 @@ end
 
 using CUDA
 
-if VERSION >= v"1.6" && CUDA.functional()
+if VERSION >= v"1.6" && CUDA.has_cuda()
     if get(ENV, "NNLIB_TEST_CUDA", "false") == "true"
         import Pkg
         Pkg.develop(url = "https://github.com/FluxML/NNlibCUDA.jl")

From 39585c093ffd914d066864bf3e0ed703c064a1a6 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacomputing.com>
Date: Thu, 15 Apr 2021 14:52:00 +0530
Subject: [PATCH 10/14] add Adapt compat

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 6008e45f1..fe0cc4fe3 100644
--- a/Project.toml
+++ b/Project.toml
@@ -12,7 +12,7 @@ Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 
 [compat]
-Adapt = "3.2"
+Adapt = "2, 3.2"
 ChainRulesCore = "0.9"
 Compat = "3.14"
 Requires = "0.5, 1.0"

From 4b7064c0052ac37a1000a2fe60416166e6523526 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacomputing.com>
Date: Thu, 15 Apr 2021 14:53:40 +0530
Subject: [PATCH 11/14] keep using functional

---
 test/runtests.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index c2acb429f..35ce7ede5 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -58,7 +58,7 @@ end
 
 using CUDA
 
-if VERSION >= v"1.6" && CUDA.has_cuda()
+if VERSION >= v"1.6" && CUDA.functional()
     if get(ENV, "NNLIB_TEST_CUDA", "false") == "true"
         import Pkg
         Pkg.develop(url = "https://github.com/FluxML/NNlibCUDA.jl")

From 16d30424ae6f2bbd41bdbcf7ce39645a0980e83e Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacomputing.com>
Date: Thu, 15 Apr 2021 22:15:31 +0530
Subject: [PATCH 12/14] test on 1.5 and up

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 4ea9f56cc..01e75e912 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -23,7 +23,7 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.3' # Replace this with the minimum Julia version that your package supports.
+          - '1.5' # Replace this with the minimum Julia version that your package supports.
           - '1'   # automatically expands to the latest stable 1.x release of Julia
           - 'nightly'
         os:

From 7fe6618d009af1f79b68a3374811e0f2c9ba66cb Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacomputing.com>
Date: Fri, 16 Apr 2021 14:28:55 +0530
Subject: [PATCH 13/14] lower bound nnlib to 1.5

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index fe0cc4fe3..13193df27 100644
--- a/Project.toml
+++ b/Project.toml
@@ -16,7 +16,7 @@ Adapt = "2, 3.2"
 ChainRulesCore = "0.9"
 Compat = "3.14"
 Requires = "0.5, 1.0"
-julia = "1.3"
+julia = "1.5"
 
 [extras]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"

From 53ada557b44c07e590aeb28d00766134bcedbc8e Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacomputing.com>
Date: Fri, 16 Apr 2021 16:53:23 +0530
Subject: [PATCH 14/14] trigger bk and bump version

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 13193df27..f1974c9d9 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "NNlib"
 uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
-version = "0.7.18"
+version = "0.7.19"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"