From c3e25dc0a40b625a9b279d05dfa9f108f671741d Mon Sep 17 00:00:00 2001
From: Tim Besard
Date: Wed, 12 Oct 2022 15:00:20 +0200
Subject: [PATCH] Move CUDNN and CUTENSOR into separate packages (#1624)

---
 .buildkite/pipeline.yml | 719 ++++++++++--------
 .gitignore | 1 +
 deps/bindeps.jl | 213 +-----
 lib/cudnn/Project.toml | 9 +
 lib/cudnn/{ => src}/CUDNN.jl | 16 +-
 lib/cudnn/{ => src}/activation.jl | 0
 lib/cudnn/{ => src}/base.jl | 0
 lib/cudnn/src/bindeps.jl | 55 ++
 lib/cudnn/{ => src}/convolution.jl | 0
 lib/cudnn/{ => src}/descriptors.jl | 0
 lib/cudnn/{ => src}/dropout.jl | 0
 lib/cudnn/{ => src}/error.jl | 0
 lib/cudnn/{ => src}/inplace.jl | 0
 lib/cudnn/{ => src}/libcudnn.jl | 0
 lib/cudnn/{ => src}/libcudnn_common.jl | 0
 lib/cudnn/{ => src}/libcudnn_deprecated.jl | 0
 lib/cudnn/{ => src}/multiheadattn.jl | 0
 lib/cudnn/{ => src}/normalization.jl | 0
 lib/cudnn/{ => src}/optensor.jl | 0
 lib/cudnn/{ => src}/pooling.jl | 0
 lib/cudnn/{ => src}/reduce.jl | 0
 lib/cudnn/{ => src}/rnn.jl | 0
 lib/cudnn/{ => src}/softmax.jl | 0
 lib/cudnn/{ => src}/tensor.jl | 0
 lib/cudnn/{ => src}/util.jl | 0
 lib/cudnn/test/Project.toml | 7 +
 lib/cudnn/test/activation.jl | 61 ++
 lib/cudnn/test/convolution.jl | 191 +++++
 lib/cudnn/test/dropout.jl | 30 +
 lib/cudnn/test/inplace.jl | 28 +
 lib/cudnn/test/multiheadattn.jl | 170 +++++
 lib/cudnn/test/normalization.jl | 116 +++
 lib/cudnn/test/optensor.jl | 61 ++
 lib/cudnn/test/pooling.jl | 91 +++
 lib/cudnn/test/reduce.jl | 84 ++
 lib/cudnn/test/rnn.jl | 142 ++++
 lib/cudnn/test/runtests.jl | 27 +
 lib/cudnn/test/softmax.jl | 45 ++
 lib/cudnn/test/tensor.jl | 31 +
 lib/{CUSTATEVEC => custatevec}/Project.toml | 1 +
 .../src/CUSTATEVEC.jl | 12 +-
 lib/custatevec/src/bindeps.jl | 47 ++
 lib/{CUSTATEVEC => custatevec}/src/error.jl | 0
 .../src/libcustatevec.jl | 0
 .../src/libcustatevec_common.jl | 0
 .../src/statevec.jl | 0
 lib/{CUSTATEVEC => custatevec}/src/types.jl | 0
 .../test/Project.toml | 0
 .../test/runtests.jl | 8 +-
 lib/cutensor/Project.toml | 10 +
 lib/cutensor/{ => src}/CUTENSOR.jl | 11 +-
 lib/cutensor/src/bindeps.jl | 63 ++
 lib/cutensor/{ => src}/error.jl | 0
 lib/cutensor/{ => src}/interfaces.jl | 0
 lib/cutensor/{ => src}/libcutensor.jl | 0
 lib/cutensor/{ => src}/libcutensor_common.jl | 0
 lib/cutensor/{ => src}/tensor.jl | 0
 lib/cutensor/{ => src}/wrappers.jl | 0
 lib/cutensor/test/Project.toml | 6 +
 {test/cutensor => lib/cutensor/test}/base.jl | 7 +-
 .../cutensor/test}/contractions.jl | 11 +-
 .../cutensor/test}/elementwise_binary.jl | 8 +-
 .../cutensor/test}/elementwise_trinary.jl | 8 +-
 .../cutensor/test}/permutations.jl | 5 +-
 .../cutensor/test}/reductions.jl | 6 +-
 lib/cutensor/test/runtests.jl | 24 +
 lib/{CUTENSORNET => cutensornet}/Project.toml | 2 +
 .../src/CUTENSORNET.jl | 22 +-
 lib/cutensornet/src/bindeps.jl | 49 ++
 lib/{CUTENSORNET => cutensornet}/src/error.jl | 0
 .../src/libcutensornet.jl | 0
 .../src/libcutensornet_common.jl | 0
 .../src/tensornet.jl | 0
 lib/{CUTENSORNET => cutensornet}/src/types.jl | 0
 .../test/Project.toml | 0
 .../test/runtests.jl | 9 +-
 perf/.gitignore | 2 +
 src/CUDA.jl | 4 +-
 src/utilities.jl | 2 -
 test/cudnn/activation.jl | 63 --
 test/cudnn/convolution.jl | 193 -----
 test/cudnn/dropout.jl | 32 -
 test/cudnn/inplace.jl | 30 -
 test/cudnn/multiheadattn.jl | 172 -----
 test/cudnn/normalization.jl | 119 ---
 test/cudnn/optensor.jl | 63 --
 test/cudnn/pooling.jl | 93 ---
 test/cudnn/reduce.jl | 86 ---
 test/cudnn/rnn.jl | 144 ----
 test/cudnn/softmax.jl | 47 --
 test/cudnn/tensor.jl | 33 -
 test/runtests.jl | 6 -
 92 files changed, 1816 insertions(+), 1679 deletions(-)
 create mode 100644 lib/cudnn/Project.toml
 rename lib/cudnn/{ => src}/CUDNN.jl (93%)
 rename lib/cudnn/{ => src}/activation.jl (100%)
 rename lib/cudnn/{ => src}/base.jl (100%)
 create mode 100644 lib/cudnn/src/bindeps.jl
 rename lib/cudnn/{ => src}/convolution.jl (100%)
 rename lib/cudnn/{ => src}/descriptors.jl (100%)
 rename lib/cudnn/{ => src}/dropout.jl (100%)
 rename lib/cudnn/{ => src}/error.jl (100%)
 rename lib/cudnn/{ => src}/inplace.jl (100%)
 rename lib/cudnn/{ => src}/libcudnn.jl (100%)
 rename lib/cudnn/{ => src}/libcudnn_common.jl (100%)
 rename lib/cudnn/{ => src}/libcudnn_deprecated.jl (100%)
 rename lib/cudnn/{ => src}/multiheadattn.jl (100%)
 rename lib/cudnn/{ => src}/normalization.jl (100%)
 rename lib/cudnn/{ => src}/optensor.jl (100%)
 rename lib/cudnn/{ => src}/pooling.jl (100%)
 rename lib/cudnn/{ => src}/reduce.jl (100%)
 rename lib/cudnn/{ => src}/rnn.jl (100%)
 rename lib/cudnn/{ => src}/softmax.jl (100%)
 rename lib/cudnn/{ => src}/tensor.jl (100%)
 rename lib/cudnn/{ => src}/util.jl (100%)
 create mode 100644 lib/cudnn/test/Project.toml
 create mode 100644 lib/cudnn/test/activation.jl
 create mode 100644 lib/cudnn/test/convolution.jl
 create mode 100644 lib/cudnn/test/dropout.jl
 create mode 100644 lib/cudnn/test/inplace.jl
 create mode 100644 lib/cudnn/test/multiheadattn.jl
 create mode 100644 lib/cudnn/test/normalization.jl
 create mode 100644 lib/cudnn/test/optensor.jl
 create mode 100644 lib/cudnn/test/pooling.jl
 create mode 100644 lib/cudnn/test/reduce.jl
 create mode 100644 lib/cudnn/test/rnn.jl
 create mode 100644 lib/cudnn/test/runtests.jl
 create mode 100644 lib/cudnn/test/softmax.jl
 create mode 100644 lib/cudnn/test/tensor.jl
 rename lib/{CUSTATEVEC => custatevec}/Project.toml (83%)
 rename lib/{CUSTATEVEC => custatevec}/src/CUSTATEVEC.jl (87%)
 create mode 100644 lib/custatevec/src/bindeps.jl
 rename lib/{CUSTATEVEC => custatevec}/src/error.jl (100%)
 rename lib/{CUSTATEVEC => custatevec}/src/libcustatevec.jl (100%)
 rename lib/{CUSTATEVEC => custatevec}/src/libcustatevec_common.jl (100%)
 rename lib/{CUSTATEVEC => custatevec}/src/statevec.jl (100%)
 rename lib/{CUSTATEVEC => custatevec}/src/types.jl (100%)
 rename lib/{CUSTATEVEC => custatevec}/test/Project.toml (100%)
 rename lib/{CUSTATEVEC => custatevec}/test/runtests.jl (90%)
 create mode 100644 lib/cutensor/Project.toml
 rename lib/cutensor/{ => src}/CUTENSOR.jl (90%)
 create mode 100644 lib/cutensor/src/bindeps.jl
 rename lib/cutensor/{ => src}/error.jl (100%)
 rename lib/cutensor/{ => src}/interfaces.jl (100%)
 rename lib/cutensor/{ => src}/libcutensor.jl (100%)
 rename lib/cutensor/{ => src}/libcutensor_common.jl (100%)
 rename lib/cutensor/{ => src}/tensor.jl (100%)
 rename lib/cutensor/{ => src}/wrappers.jl (100%)
 create mode 100644 lib/cutensor/test/Project.toml
 rename {test/cutensor => lib/cutensor/test}/base.jl (86%)
 rename {test/cutensor => lib/cutensor/test}/contractions.jl (99%)
 rename {test/cutensor => lib/cutensor/test}/elementwise_binary.jl (96%)
 rename {test/cutensor => lib/cutensor/test}/elementwise_trinary.jl (98%)
 rename {test/cutensor => lib/cutensor/test}/permutations.jl (97%)
 rename {test/cutensor => lib/cutensor/test}/reductions.jl (98%)
 create mode 100644 lib/cutensor/test/runtests.jl
 rename lib/{CUTENSORNET => cutensornet}/Project.toml (70%)
 rename lib/{CUTENSORNET => cutensornet}/src/CUTENSORNET.jl (76%)
 create mode 100644 lib/cutensornet/src/bindeps.jl
 rename lib/{CUTENSORNET => cutensornet}/src/error.jl (100%)
 rename lib/{CUTENSORNET => cutensornet}/src/libcutensornet.jl (100%)
 rename lib/{CUTENSORNET =>
cutensornet}/src/libcutensornet_common.jl (100%) rename lib/{CUTENSORNET => cutensornet}/src/tensornet.jl (100%) rename lib/{CUTENSORNET => cutensornet}/src/types.jl (100%) rename lib/{CUTENSORNET => cutensornet}/test/Project.toml (100%) rename lib/{CUTENSORNET => cutensornet}/test/runtests.jl (87%) create mode 100644 perf/.gitignore delete mode 100644 test/cudnn/activation.jl delete mode 100644 test/cudnn/convolution.jl delete mode 100644 test/cudnn/dropout.jl delete mode 100644 test/cudnn/inplace.jl delete mode 100644 test/cudnn/multiheadattn.jl delete mode 100644 test/cudnn/normalization.jl delete mode 100644 test/cudnn/optensor.jl delete mode 100644 test/cudnn/pooling.jl delete mode 100644 test/cudnn/reduce.jl delete mode 100644 test/cudnn/rnn.jl delete mode 100644 test/cudnn/softmax.jl delete mode 100644 test/cudnn/tensor.jl diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 77448ccd4b..1db9a6cb23 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -1,371 +1,418 @@ steps: + # first, test supported Julia versions (using local CUDA as installed on the system) + - group: ":julia: Julia" + key: "julia" + steps: + - label: "Julia 1.6" + plugins: + - JuliaCI/julia#v1: + version: 1.6 + - JuliaCI/julia-test#v1: + test_args: "--quickfail" + - JuliaCI/julia-coverage#v1: + codecov: true + dirs: + - src + - lib + - examples + agents: + queue: "juliagpu" + cuda: "*" + if: build.message !~ /\[skip tests\]/ + timeout_in_minutes: 120 - # Julia versions + - label: "Julia 1.7" + plugins: + - JuliaCI/julia#v1: + version: 1.7 + - JuliaCI/julia-test#v1: ~ + - JuliaCI/julia-coverage#v1: + codecov: true + dirs: + - src + - lib + - examples + agents: + queue: "juliagpu" + cuda: "*" + if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft + timeout_in_minutes: 120 - - label: "Julia 1.6" - plugins: - - JuliaCI/julia#v1: - version: 1.6 - - JuliaCI/julia-test#v1: - test_args: "--quickfail" - - JuliaCI/julia-coverage#v1: - codecov: true - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "*" - if: build.message !~ /\[skip tests\]/ - timeout_in_minutes: 120 + - label: "Julia 1.8" + plugins: + - JuliaCI/julia#v1: + version: 1.8 + - JuliaCI/julia-test#v1: ~ + - JuliaCI/julia-coverage#v1: + codecov: true + dirs: + - src + - lib + - examples + agents: + queue: "juliagpu" + cuda: "*" + if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft + timeout_in_minutes: 120 - - label: "Julia 1.7" - plugins: - - JuliaCI/julia#v1: - version: 1.7 - - JuliaCI/julia-test#v1: ~ - - JuliaCI/julia-coverage#v1: - codecov: true - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "*" - if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft - timeout_in_minutes: 120 + - label: "Julia nightly" + plugins: + - JuliaCI/julia#v1: + version: nightly + - JuliaCI/julia-test#v1: ~ + - JuliaCI/julia-coverage#v1: + codecov: true + dirs: + - src + - lib + - examples + agents: + queue: "juliagpu" + cuda: "*" + if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft + timeout_in_minutes: 120 + soft_fail: + - exit_status: 1 - - label: "Julia 1.8" - plugins: - - JuliaCI/julia#v1: - version: 1.8 - - JuliaCI/julia-test#v1: ~ - - JuliaCI/julia-coverage#v1: - codecov: true - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "*" - if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft - timeout_in_minutes: 120 + # then, test supported CUDA toolkits (installed through the artifact system) + - group: 
"CUDA" + key: "cuda" + depends_on: "julia" + steps: + # NOTE: we support those CUDA versions for which the latest cuDNN is available + # https://developer.nvidia.com/rdp/cudnn-archive - - label: "Julia nightly" - plugins: - - JuliaCI/julia#v1: - version: nightly - - JuliaCI/julia-test#v1: ~ - - JuliaCI/julia-coverage#v1: - codecov: true - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "*" - if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft - timeout_in_minutes: 120 - soft_fail: - - exit_status: 1 + - label: "CUDA {{matrix}}" + plugins: + - JuliaCI/julia#v1: + version: 1.6 + - JuliaCI/julia-test#v1: + test_args: "--thorough" + - JuliaCI/julia-coverage#v1: + codecov: true + dirs: + - src + - lib + - examples + agents: + queue: "juliagpu" + cuda: "11.0" + cap: "sm_80" + env: + JULIA_CUDA_VERSION: "{{matrix}}" + JULIA_CUDA_USE_BINARYBUILDER: 'true' + if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft + timeout_in_minutes: 120 + matrix: + - "11.8" + - "11.7" + - "11.6" + - "11.5" + - "11.4" + - "11.3" + - "11.2" + - "11.1" + - "11.0" + # XXX: we cannot set an agents key (cap: "sm_75") via a build matrix + - label: "CUDA 10.2" + plugins: + - JuliaCI/julia#v1: + version: 1.6 + - JuliaCI/julia-test#v1: + test_args: "--thorough" + - JuliaCI/julia-coverage#v1: + codecov: true + dirs: + - src + - lib + - examples + agents: + queue: "juliagpu" + cuda: "10.2" + cap: "sm_75" + env: + JULIA_CUDA_VERSION: '10.2' + JULIA_CUDA_USE_BINARYBUILDER: 'true' + if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft + timeout_in_minutes: 120 - # CUDA versions (from artifacts) + - group: ":nesting_dolls: Subpackages" + depends_on: "cuda" + steps: + # NOTE: we Pkg.develop all subpackages so that they can depend on each other + - label: "{{matrix}} on CUDA 10.2" + matrix: + - "CUDNN" + - "CUTENSOR" + plugins: + - JuliaCI/julia#v1: + version: 1.6 + - JuliaCI/julia-coverage#v1: + codecov: true + dirs: + - src + - lib + - examples + agents: + queue: "juliagpu" + cuda: "10.2" + cap: "sm_75" + env: + JULIA_CUDA_VERSION: "10.2" + JULIA_CUDA_USE_BINARYBUILDER: 'true' + if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft + timeout_in_minutes: 120 + commands: | + julia -e ' + using Pkg - # NOTE: we support those CUDA versions for which the latest cuDNN is available - # https://developer.nvidia.com/rdp/cudnn-archive + println("--- :julia: Instantiating projects"); + Pkg.develop([PackageSpec(path=pwd()), # CUDA + PackageSpec(path=joinpath(pwd(), "lib", "cudnn")), + PackageSpec(path=joinpath(pwd(), "lib", "cutensor")), + PackageSpec(path=joinpath(pwd(), "lib", "cutensornet")), + PackageSpec(path=joinpath(pwd(), "lib", "custatevec"))]) - - label: "CUDA {{matrix}}" - plugins: - - JuliaCI/julia#v1: - version: 1.6 - - JuliaCI/julia-test#v1: - test_args: "--thorough" - - JuliaCI/julia-coverage#v1: - codecov: true - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "11.0" - cap: "sm_80" - env: - JULIA_CUDA_VERSION: "{{matrix}}" - JULIA_CUDA_USE_BINARYBUILDER: 'true' - if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft - timeout_in_minutes: 120 - matrix: - - "11.8" - - "11.7" - - "11.6" - - "11.5" - - "11.4" - - "11.3" - - "11.2" - - "11.1" - - "11.0" + println("+++ :julia: Running tests"); + Pkg.test("{{matrix}}")' - - label: "CUDA 10.2" - plugins: - - JuliaCI/julia#v1: - version: 1.6 - - JuliaCI/julia-test#v1: - test_args: "--thorough" - - JuliaCI/julia-coverage#v1: - codecov: true - dirs: - - src - - lib - - 
examples - agents: - queue: "juliagpu" - cuda: "10.2" - cap: "sm_75" - env: - JULIA_CUDA_VERSION: '10.2' - JULIA_CUDA_USE_BINARYBUILDER: 'true' - if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft - timeout_in_minutes: 120 + - label: "{{matrix}} on CUDA 11.8" + matrix: + - "CUDNN" + - "CUTENSOR" + - "CUSTATEVEC" + - "CUTENSORNET" + plugins: + - JuliaCI/julia#v1: + version: 1.6 + - JuliaCI/julia-coverage#v1: + codecov: true + dirs: + - src + - lib + - examples + agents: + queue: "juliagpu" + cuda: "11.0" + cap: "sm_80" + env: + JULIA_CUDA_VERSION: "11.8" + JULIA_CUDA_USE_BINARYBUILDER: 'true' + if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft + timeout_in_minutes: 120 + commands: | + julia -e ' + using Pkg + println("--- :julia: Instantiating projects"); + Pkg.develop([PackageSpec(path=pwd()), # CUDA + PackageSpec(path=joinpath(pwd(), "lib", "cudnn")), + PackageSpec(path=joinpath(pwd(), "lib", "cutensor")), + PackageSpec(path=joinpath(pwd(), "lib", "cutensornet")), + PackageSpec(path=joinpath(pwd(), "lib", "custatevec"))]) - # special tests + println("+++ :julia: Running tests"); + Pkg.test("{{matrix}}")' - # - label: "Windows" - # plugins: - # - JuliaCI/julia#v1: - # version: 1.6 - # - JuliaCI/julia-test#v1: ~ - # # XXX: no coverage, as no secrets on Windows - # agents: - # queue: "juliagpu-windows" - # cuda: "*" - # if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft - # timeout_in_minutes: 120 + - group: ":telescope: Downstream" + depends_on: "cuda" + steps: + - label: "NNlibCUDA.jl" + plugins: + - JuliaCI/julia#v1: + version: 1.6 + - JuliaCI/julia-coverage#v1: + codecov: true + dirs: + - src + - lib + - examples + command: | + julia -e ' + using Pkg; - - label: "GPU-less environment" - plugins: - - JuliaCI/julia#v1: - version: 1.6 - - JuliaCI/julia-coverage#v1: - codecov: true - dirs: - - src - - lib - - examples - - JuliaCI/julia-test#v1: - run_tests: false - command: | - julia --project -e 'using CUDA; - @assert !CUDA.functional(); - CUDA.download_artifacts()' - env: - CUDA_VISIBLE_DEVICES: '' - JULIA_CUDA_VERSION: '11.6' - agents: - queue: "juliagpu" - cuda: "*" - if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft - timeout_in_minutes: 60 + println("--- :julia: Instantiating project"); + Pkg.develop(PackageSpec(path=pwd())); + Pkg.add(PackageSpec(name="NNlibCUDA", rev="master")); + Pkg.instantiate(); - - label: "NNlibCUDA.jl" - plugins: - - JuliaCI/julia#v1: - version: 1.6 - - JuliaCI/julia-coverage#v1: - codecov: true - dirs: - - src - - lib - - examples - command: | - julia -e 'using Pkg; + println("+++ :julia: Running tests"); + Pkg.test("NNlibCUDA"; coverage=true);' + agents: + queue: "juliagpu" + cuda: "*" + if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft + timeout_in_minutes: 60 + soft_fail: + - exit_status: 1 - println("--- :julia: Instantiating project"); - Pkg.develop(PackageSpec(path=pwd())); - Pkg.add(PackageSpec(name="NNlibCUDA", rev="master")); - Pkg.instantiate(); + - group: ":eyes: Special" + depends_on: "cuda" + steps: + - label: "GPU-less environment" + plugins: + - JuliaCI/julia#v1: + version: 1.6 + - JuliaCI/julia-coverage#v1: + codecov: true + dirs: + - src + - lib + - examples + - JuliaCI/julia-test#v1: + run_tests: false + command: | + julia --project -e 'using CUDA; + @assert !CUDA.functional(); + CUDA.download_artifacts()' + env: + CUDA_VISIBLE_DEVICES: '' + JULIA_CUDA_VERSION: '11.6' + agents: + queue: "juliagpu" + cuda: "*" + if: build.message !~ /\[skip tests\]/ && 
!build.pull_request.draft + timeout_in_minutes: 60 - println("+++ :julia: Running tests"); - Pkg.test("NNlibCUDA"; coverage=true);' - agents: - queue: "juliagpu" - cuda: "*" - if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft - timeout_in_minutes: 60 + - label: "Documentation" + plugins: + - JuliaCI/julia#v1: + version: 1.6 + command: | + julia --project -e ' + println("--- :julia: Instantiating project") + using Pkg + Pkg.instantiate() + Pkg.activate("docs") + Pkg.instantiate() + push!(LOAD_PATH, @__DIR__) - - label: "Compute Sanitizer" - plugins: - - JuliaCI/julia#v1: - version: 1.6 - - JuliaCI/julia-test#v1: - julia_args: "-g2" - test_args: "--sanitize --quickfail --jobs=1" - - JuliaCI/julia-coverage#v1: - codecov: true - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "11.0" - cap: "sm_80" - env: - JULIA_CUDA_VERSION: '11.6' - JULIA_CUDA_USE_COMPAT: 'false' # NVIDIA bug #3418723: injection tools prevent probing libcuda - JULIA_CUDA_USE_BINARYBUILDER: 'true' - if: build.message !~ /\[skip tests\]/ && - build.branch =~ /^master$$/ - timeout_in_minutes: 240 + println("+++ :julia: Building documentation") + include("docs/make.jl")' + agents: + queue: "juliagpu" + cuda: "*" + if: build.message !~ /\[skip docs\]/ && !build.pull_request.draft + timeout_in_minutes: 30 - - label: "SubPackage -- {{matrix}}" - matrix: - - "CUSTATEVEC" - - "CUTENSORNET" - plugins: - - JuliaCI/julia#v1: - version: 1.6 - - JuliaCI/julia-coverage#v1: - codecov: true - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "11.0" - cap: "sm_80" - env: - JULIA_CUDA_VERSION: '11.6' - JULIA_CUDA_USE_BINARYBUILDER: 'true' - PACKAGE: '{{matrix}}' - if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft - timeout_in_minutes: 120 - commands: | - julia -e 'import Pkg; Pkg.develop(; path = pwd())' # CUDA - julia -e 'import Pkg; Pkg.develop(; path = joinpath(pwd(), "lib", ENV["PACKAGE"]))' - julia -e 'import Pkg; Pkg.precompile()' - julia -e 'import Pkg; Pkg.test(ENV["PACKAGE"])' - - # other tasks + - label: "Compute sanitizer" + plugins: + - JuliaCI/julia#v1: + version: 1.6 + - JuliaCI/julia-test#v1: + julia_args: "-g2" + test_args: "--sanitize --quickfail --jobs=1" + - JuliaCI/julia-coverage#v1: + codecov: true + dirs: + - src + - lib + - examples + agents: + queue: "juliagpu" + cuda: "11.0" + cap: "sm_80" + env: + JULIA_CUDA_VERSION: '11.6' + JULIA_CUDA_USE_COMPAT: 'false' # NVIDIA bug #3418723: injection tools prevent probing libcuda + JULIA_CUDA_USE_BINARYBUILDER: 'true' + if: build.message !~ /\[skip tests\]/ && + build.branch =~ /^master$$/ + timeout_in_minutes: 240 # we want to benchmark every commit on the master branch, even if it failed CI - wait: ~ continue_on_failure: true - # if we will submit results, use the benchmark queue so that we will - # be running on the same system each time - - label: "Benchmarks on 1.6" - plugins: - - JuliaCI/julia#v1: - version: 1.6 - env: - BENCHMARKS: "true" - CODESPEED_PROJECT: "$BUILDKITE_PIPELINE_NAME" - CODESPEED_BRANCH: "$BUILDKITE_BRANCH" - CODESPEED_COMMIT: "$BUILDKITE_COMMIT" - CODESPEED_EXECUTABLE: "Julia 1.6" - command: | - julia --project -e ' - ENV["CODESPEED_ENVIRONMENT"] = ENV["BUILDKITE_AGENT_NAME"] - println("--- :julia: Instantiating project") - using Pkg - Pkg.instantiate() - Pkg.activate("perf") - Pkg.instantiate() - push!(LOAD_PATH, @__DIR__) - println("+++ :julia: Benchmarking") - include("perf/runbenchmarks.jl")' - agents: - queue: "benchmark" - cuda: "*" - if: build.message !~ /\[skip 
benchmarks\]/ && - build.branch =~ /^master$$/ - timeout_in_minutes: 30 - - - label: "Benchmarks on 1.7" - plugins: - - JuliaCI/julia#v1: - version: 1.7 - env: - BENCHMARKS: "true" - CODESPEED_PROJECT: "$BUILDKITE_PIPELINE_NAME" - CODESPEED_BRANCH: "$BUILDKITE_BRANCH" - CODESPEED_COMMIT: "$BUILDKITE_COMMIT" - CODESPEED_EXECUTABLE: "Julia 1.7" - command: | - julia --project -e ' - ENV["CODESPEED_ENVIRONMENT"] = ENV["BUILDKITE_AGENT_NAME"] - println("--- :julia: Instantiating project") - using Pkg - Pkg.instantiate() - Pkg.activate("perf") - Pkg.instantiate() - push!(LOAD_PATH, @__DIR__) - println("+++ :julia: Benchmarking") - include("perf/runbenchmarks.jl")' - agents: - queue: "benchmark" - cuda: "*" - if: build.message !~ /\[skip benchmarks\]/ && - build.branch =~ /^master$$/ - timeout_in_minutes: 30 + - group: ":racehorse: Benchmarks" + steps: + # if we will submit results, use the benchmark queue so that we will + # be running on the same system each time + - label: "Benchmarks on 1.6" + plugins: + - JuliaCI/julia#v1: + version: 1.6 + env: + BENCHMARKS: "true" + CODESPEED_PROJECT: "$BUILDKITE_PIPELINE_NAME" + CODESPEED_BRANCH: "$BUILDKITE_BRANCH" + CODESPEED_COMMIT: "$BUILDKITE_COMMIT" + CODESPEED_EXECUTABLE: "Julia 1.6" + command: | + julia --project -e ' + ENV["CODESPEED_ENVIRONMENT"] = ENV["BUILDKITE_AGENT_NAME"] - - wait + println("--- :julia: Instantiating project") + using Pkg + Pkg.instantiate() + Pkg.activate("perf") + Pkg.instantiate() + push!(LOAD_PATH, @__DIR__) - # benchmarks outside of the master branch don't submit their results, - # so they can run on any system in the juliagpu queue. - - label: "Benchmarks (dry run)" - plugins: - - JuliaCI/julia#v1: - version: 1.6 - command: | - julia --project -e ' - println("--- :julia: Instantiating project") - using Pkg - Pkg.instantiate() - Pkg.activate("perf") - Pkg.instantiate() - push!(LOAD_PATH, @__DIR__) + println("+++ :julia: Benchmarking") + include("perf/runbenchmarks.jl")' + agents: + queue: "benchmark" + cuda: "*" + if: build.message !~ /\[skip benchmarks\]/ && + build.branch =~ /^master$$/ + timeout_in_minutes: 30 - println("+++ :julia: Benchmarking") - include("perf/runbenchmarks.jl")' - agents: - queue: "juliagpu" - cuda: "*" - if: build.message !~ /\[skip benchmarks\]/ && - build.branch !~ /^master$$/ && - !build.pull_request.draft - timeout_in_minutes: 30 + - label: "Benchmarks on 1.7" + plugins: + - JuliaCI/julia#v1: + version: 1.7 + env: + BENCHMARKS: "true" + CODESPEED_PROJECT: "$BUILDKITE_PIPELINE_NAME" + CODESPEED_BRANCH: "$BUILDKITE_BRANCH" + CODESPEED_COMMIT: "$BUILDKITE_COMMIT" + CODESPEED_EXECUTABLE: "Julia 1.7" + command: | + julia --project -e ' + ENV["CODESPEED_ENVIRONMENT"] = ENV["BUILDKITE_AGENT_NAME"] - - label: "Documentation" - plugins: - - JuliaCI/julia#v1: - version: 1.6 - command: | - julia --project -e ' - println("--- :julia: Instantiating project") - using Pkg - Pkg.instantiate() - Pkg.activate("docs") - Pkg.instantiate() - push!(LOAD_PATH, @__DIR__) + println("--- :julia: Instantiating project") + using Pkg + Pkg.instantiate() + Pkg.activate("perf") + Pkg.instantiate() + push!(LOAD_PATH, @__DIR__) - println("+++ :julia: Building documentation") - include("docs/make.jl")' - agents: - queue: "juliagpu" - cuda: "*" - if: build.message !~ /\[skip docs\]/ && !build.pull_request.draft - timeout_in_minutes: 30 + println("+++ :julia: Benchmarking") + include("perf/runbenchmarks.jl")' + agents: + queue: "benchmark" + cuda: "*" + if: build.message !~ /\[skip benchmarks\]/ && + build.branch =~ /^master$$/ 
+ timeout_in_minutes: 30 + # benchmarks outside of the master branch don't submit their results, + # so they can run on any system in the juliagpu queue. + - label: "Benchmarks (dry run)" + plugins: + - JuliaCI/julia#v1: + version: 1.6 + command: | + julia --project -e ' + println("--- :julia: Instantiating project") + using Pkg + Pkg.instantiate() + Pkg.activate("perf") + Pkg.instantiate() + push!(LOAD_PATH, @__DIR__) + println("+++ :julia: Benchmarking") + include("perf/runbenchmarks.jl")' + agents: + queue: "juliagpu" + cuda: "*" + if: build.message !~ /\[skip benchmarks\]/ && + build.branch !~ /^master$$/ && + !build.pull_request.draft + timeout_in_minutes: 30 env: JULIA_PKG_SERVER: "" # we don't want to wait until the PkgServer updates diff --git a/.gitignore b/.gitignore index fa3b511ca8..ea955bb2a0 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ .vscode lcov.info build/ +lib/**/Manifest.toml diff --git a/deps/bindeps.jl b/deps/bindeps.jl index 650c86b402..abab2005e3 100644 --- a/deps/bindeps.jl +++ b/deps/bindeps.jl @@ -534,126 +534,6 @@ function find_libcudadevrt(cuda::LocalToolkit) end -# -# CUDNN -# - -export libcudnn, has_cudnn - -const __libcudnn = Ref{Union{String,Nothing}}() -function libcudnn(; throw_error::Bool=true) - path = @initialize_ref __libcudnn begin - # CUDNN depends on CUBLAS - libcublas() - - find_cudnn(toolkit(), v"8") - end CUDA.CUDNN.__runtime_init__() - if path === nothing && throw_error - error("This functionality is unavailabe as CUDNN is missing.") - end - path -end -has_cudnn() = libcudnn(throw_error=false) !== nothing - -function find_cudnn(cuda::ArtifactToolkit, version) - artifact_dir = cuda_artifact("CUDNN", cuda.release) - if artifact_dir === nothing - return nothing - end - path = artifact_library(artifact_dir, "cudnn", [version]) - - # HACK: eagerly open CUDNN sublibraries to avoid dlopen discoverability issues - for sublibrary in ("ops_infer", "ops_train", - "cnn_infer", "cnn_train", - "adv_infer", "adv_train") - sublibrary_path = artifact_library(artifact_dir, "cudnn_$(sublibrary)", [version]) - Libdl.dlopen(sublibrary_path) - end - - @debug "Using CUDNN from an artifact at $(artifact_dir)" - Libdl.dlopen(path) - return path -end - -function find_cudnn(cuda::LocalToolkit, version) - path = find_library("cudnn", [version]; locations=cuda.dirs) - if path === nothing - return nothing - end - - # with a local CUDNN version, we shouldn't need to eagerly open sublibraries, - # as they are expected to be globally discoverable next to libcudnn.so - - @debug "Using local CUDNN at $(path)" - Libdl.dlopen(path) - return path -end - - -# -# CUTENSOR -# - -export libcutensor, libcutensormg, has_cutensor, has_cutensormg - -const __libcutensor = Ref{Union{String,Nothing}}() -function libcutensor(; throw_error::Bool=true) - path = @initialize_ref __libcutensor begin - # CUTENSOR depends on CUBLAS - libcublas() - - find_cutensor(toolkit(), "cutensor", v"1") - end - if path === nothing && throw_error - error("This functionality is unavailabe as CUTENSOR is missing.") - end - path -end -has_cutensor() = libcutensor(throw_error=false) !== nothing - -const __libcutensormg = Ref{Union{String,Nothing}}() -function libcutensormg(; throw_error::Bool=true) - path = @initialize_ref __libcutensor begin - # CUTENSORMg additionally depends on CUDARt - libcudart() - - if CUTENSOR.version() < v"1.4" - nothing - else - find_cutensor(toolkit(), "cutensorMg", v"1") - end - end - if path === nothing && throw_error - error("This functionality is unavailabe as CUTENSORMg is 
missing.") - end - path -end -has_cutensormg() = libcutensormg(throw_error=false) !== nothing - -function find_cutensor(cuda::ArtifactToolkit, name, version) - artifact_dir = cuda_artifact("CUTENSOR", cuda.release) - if artifact_dir === nothing - return nothing - end - path = artifact_library(artifact_dir, name, [version]) - - @debug "Using CUTENSOR library $name from an artifact at $(artifact_dir)" - Libdl.dlopen(path) - return path -end - -function find_cutensor(cuda::LocalToolkit, name, version) - path = find_library(name, [version]; locations=cuda.dirs) - if path === nothing - return nothing - end - - @debug "Using local CUTENSOR library $name at $(path)" - Libdl.dlopen(path) - return path -end - - # # NCCL # @@ -666,7 +546,7 @@ function libnccl(; throw_error::Bool=true) find_nccl(toolkit(), "nccl", v"1") end if path === nothing && throw_error - error("This functionality is unavailabe as CUTENSOR is missing.") + error("This functionality is unavailabe as NCCL is missing.") end path end @@ -696,97 +576,6 @@ function find_nccl(cuda::LocalToolkit, name, version) end -# -# CUQUANTUM -# - -export libcutensornet, has_cutensornet, libcustatevec, has_custatevec - -const __libcutensornet = Ref{Union{String,Nothing}}() -function libcutensornet(; throw_error::Bool=true) - path = @initialize_ref __libcutensornet begin - # CUTENSORNET depends on CUTENSOR - libcutensor(throw_error=throw_error) - - if CUDA.runtime_version() < v"11" - # XXX: bound this using tags in the Artifact.toml? - nothing - else - find_cutensornet(toolkit(), "cutensornet", v"0.1.0") - end - end - if path === nothing && throw_error - error("This functionality is unavailabe as CUTENSORNET is missing.") - end - return path -end -has_cutensornet() = has_cutensor() && libcutensornet(throw_error=false) !== nothing - -const __libcustatevec = Ref{Union{String,Nothing}}() -function libcustatevec(; throw_error::Bool=true) - path = @initialize_ref __libcustatevec begin - - if CUDA.runtime_version() < v"11" - # XXX: bound this using tags in the Artifact.toml? 
- nothing - else - find_custatevec(toolkit(), "custatevec", v"0.1.0") - end - end - if path === nothing && throw_error - error("This functionality is unavailabe as CUSTATEVEC is missing.") - end - return path -end -has_custatevec() = libcustatevec(throw_error=false) !== nothing - -function find_cutensornet(cuda::ArtifactToolkit, name, version) - artifact_dir = generic_artifact("cuQuantum") - if artifact_dir === nothing - return nothing - end - path = artifact_library(artifact_dir, name, [version]) - - @debug "Using CUTENSORNET library $name from an artifact at $(artifact_dir)" - Libdl.dlopen(path) - return path -end - -function find_cutensornet(cuda::LocalToolkit, name, version) - path = find_library(name, [version]; locations=cuda.dirs) - if path === nothing - return nothing - end - - @debug "Using local CUTENSORNET library $name at $(path)" - Libdl.dlopen(path) - return path -end - -function find_custatevec(cuda::ArtifactToolkit, name, version) - artifact_dir = cuda_artifact("cuQuantum", v"0.1.3") - if artifact_dir === nothing - return nothing - end - path = artifact_library(artifact_dir, name, [version]) - - @debug "Using CUSTATEVEC library $name from an artifact at $(artifact_dir)" - Libdl.dlopen(path) - return path -end - -function find_custatevec(cuda::LocalToolkit, name, version) - path = find_library(name, [version]; locations=cuda.dirs) - if path === nothing - return nothing - end - - @debug "Using local CUSTATEVEC library $name at $(path)" - Libdl.dlopen(path) - return path -end - - # # Utilities # diff --git a/lib/cudnn/Project.toml b/lib/cudnn/Project.toml new file mode 100644 index 0000000000..9ece714b7d --- /dev/null +++ b/lib/cudnn/Project.toml @@ -0,0 +1,9 @@ +name = "CUDNN" +uuid = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" +authors = ["Tim Besard "] +version = "0.1.0" + +[deps] +CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" diff --git a/lib/cudnn/CUDNN.jl b/lib/cudnn/src/CUDNN.jl similarity index 93% rename from lib/cudnn/CUDNN.jl rename to lib/cudnn/src/CUDNN.jl index e48df71667..341de1d9a7 100644 --- a/lib/cudnn/CUDNN.jl +++ b/lib/cudnn/src/CUDNN.jl @@ -1,20 +1,20 @@ """ - CUDA.CUDNN + CUDNN High level interface to cuDNN functions. See -https://github.com/JuliaGPU/CUDA.jl/blob/master/lib/state/README.md -for a design overview. +[README.md](https://github.com/JuliaGPU/CUDA.jl/blob/master/lib/cudnn/README.md) for a +design overview. 
""" module CUDNN -using ..APIUtils - -using ..CUDA -using ..CUDA: CUstream, libraryPropertyType -using ..CUDA: libcudnn, @retry_reclaim, isdebug, initialize_context +using CUDA +using CUDA.APIUtils +using CUDA: CUstream, libraryPropertyType +using CUDA: @retry_reclaim, isdebug, initialize_context using CEnum: @cenum +include("bindeps.jl") # core library include("libcudnn_common.jl") diff --git a/lib/cudnn/activation.jl b/lib/cudnn/src/activation.jl similarity index 100% rename from lib/cudnn/activation.jl rename to lib/cudnn/src/activation.jl diff --git a/lib/cudnn/base.jl b/lib/cudnn/src/base.jl similarity index 100% rename from lib/cudnn/base.jl rename to lib/cudnn/src/base.jl diff --git a/lib/cudnn/src/bindeps.jl b/lib/cudnn/src/bindeps.jl new file mode 100644 index 0000000000..93981ed6a6 --- /dev/null +++ b/lib/cudnn/src/bindeps.jl @@ -0,0 +1,55 @@ +using CUDA.Deps: @initialize_ref, libcublas, cuda_artifact, artifact_library, find_library, + LocalToolkit, ArtifactToolkit, toolkit + +import Libdl + +export libcudnn, has_cudnn + +const __libcudnn = Ref{Union{String,Nothing}}() +function libcudnn(; throw_error::Bool=true) + path = @initialize_ref __libcudnn begin + # CUDNN depends on CUBLAS + libcublas() + + find_cudnn(toolkit(), v"8") + end __runtime_init__() + if path === nothing && throw_error + error("This functionality is unavailabe as CUDNN is missing.") + end + path +end +has_cudnn() = libcudnn(throw_error=false) !== nothing + +function find_cudnn(cuda::ArtifactToolkit, version) + artifact_dir = cuda_artifact("CUDNN", cuda.release) + if artifact_dir === nothing + return nothing + end + path = artifact_library(artifact_dir, "cudnn", [version]) + + # HACK: eagerly open CUDNN sublibraries to avoid dlopen discoverability issues + for sublibrary in ("ops_infer", "ops_train", + "cnn_infer", "cnn_train", + "adv_infer", "adv_train") + sublibrary_path = artifact_library(artifact_dir, "cudnn_$(sublibrary)", [version]) + Libdl.dlopen(sublibrary_path) + end + + @debug "Using CUDNN from an artifact at $(artifact_dir)" + Libdl.dlopen(path) + return path +end + +function find_cudnn(cuda::LocalToolkit, version) + path = find_library("cudnn", [version]; locations=cuda.dirs) + if path === nothing + return nothing + end + + # with a local CUDNN version, we shouldn't need to eagerly open sublibraries, + # as they are expected to be globally discoverable next to libcudnn.so + + @debug "Using local CUDNN at $(path)" + Libdl.dlopen(path) + return path +end diff --git a/lib/cudnn/convolution.jl b/lib/cudnn/src/convolution.jl similarity index 100% rename from lib/cudnn/convolution.jl rename to lib/cudnn/src/convolution.jl diff --git a/lib/cudnn/descriptors.jl b/lib/cudnn/src/descriptors.jl similarity index 100% rename from lib/cudnn/descriptors.jl rename to lib/cudnn/src/descriptors.jl diff --git a/lib/cudnn/dropout.jl b/lib/cudnn/src/dropout.jl similarity index 100% rename from lib/cudnn/dropout.jl rename to lib/cudnn/src/dropout.jl diff --git a/lib/cudnn/error.jl b/lib/cudnn/src/error.jl similarity index 100% rename from lib/cudnn/error.jl rename to lib/cudnn/src/error.jl diff --git a/lib/cudnn/inplace.jl b/lib/cudnn/src/inplace.jl similarity index 100% rename from lib/cudnn/inplace.jl rename to lib/cudnn/src/inplace.jl diff --git a/lib/cudnn/libcudnn.jl b/lib/cudnn/src/libcudnn.jl similarity index 100% rename from lib/cudnn/libcudnn.jl rename to lib/cudnn/src/libcudnn.jl diff --git a/lib/cudnn/libcudnn_common.jl b/lib/cudnn/src/libcudnn_common.jl similarity index 100% rename from 
lib/cudnn/libcudnn_common.jl rename to lib/cudnn/src/libcudnn_common.jl diff --git a/lib/cudnn/libcudnn_deprecated.jl b/lib/cudnn/src/libcudnn_deprecated.jl similarity index 100% rename from lib/cudnn/libcudnn_deprecated.jl rename to lib/cudnn/src/libcudnn_deprecated.jl diff --git a/lib/cudnn/multiheadattn.jl b/lib/cudnn/src/multiheadattn.jl similarity index 100% rename from lib/cudnn/multiheadattn.jl rename to lib/cudnn/src/multiheadattn.jl diff --git a/lib/cudnn/normalization.jl b/lib/cudnn/src/normalization.jl similarity index 100% rename from lib/cudnn/normalization.jl rename to lib/cudnn/src/normalization.jl diff --git a/lib/cudnn/optensor.jl b/lib/cudnn/src/optensor.jl similarity index 100% rename from lib/cudnn/optensor.jl rename to lib/cudnn/src/optensor.jl diff --git a/lib/cudnn/pooling.jl b/lib/cudnn/src/pooling.jl similarity index 100% rename from lib/cudnn/pooling.jl rename to lib/cudnn/src/pooling.jl diff --git a/lib/cudnn/reduce.jl b/lib/cudnn/src/reduce.jl similarity index 100% rename from lib/cudnn/reduce.jl rename to lib/cudnn/src/reduce.jl diff --git a/lib/cudnn/rnn.jl b/lib/cudnn/src/rnn.jl similarity index 100% rename from lib/cudnn/rnn.jl rename to lib/cudnn/src/rnn.jl diff --git a/lib/cudnn/softmax.jl b/lib/cudnn/src/softmax.jl similarity index 100% rename from lib/cudnn/softmax.jl rename to lib/cudnn/src/softmax.jl diff --git a/lib/cudnn/tensor.jl b/lib/cudnn/src/tensor.jl similarity index 100% rename from lib/cudnn/tensor.jl rename to lib/cudnn/src/tensor.jl diff --git a/lib/cudnn/util.jl b/lib/cudnn/src/util.jl similarity index 100% rename from lib/cudnn/util.jl rename to lib/cudnn/src/util.jl diff --git a/lib/cudnn/test/Project.toml b/lib/cudnn/test/Project.toml new file mode 100644 index 0000000000..5eb6e657a0 --- /dev/null +++ b/lib/cudnn/test/Project.toml @@ -0,0 +1,7 @@ +[deps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" +NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/lib/cudnn/test/activation.jl b/lib/cudnn/test/activation.jl new file mode 100644 index 0000000000..189276b8c6 --- /dev/null +++ b/lib/cudnn/test/activation.jl @@ -0,0 +1,61 @@ +using CUDNN: + cudnnActivationForward, + cudnnActivationForward!, + cudnnActivationBackward, + cudnnActivationDescriptor, + cudnnActivationDescriptor_t, + cudnnCreateActivationDescriptor, + cudnnSetActivationDescriptor, + cudnnGetActivationDescriptor, + cudnnDestroyActivationDescriptor, + cudnnActivationMode_t, + CUDNN_ACTIVATION_SIGMOID, # 0 + CUDNN_ACTIVATION_RELU, # 1 + CUDNN_ACTIVATION_TANH, # 2 + CUDNN_ACTIVATION_CLIPPED_RELU, # 3 + CUDNN_ACTIVATION_ELU, # 4 + CUDNN_ACTIVATION_IDENTITY, # 5 + cudnnNanPropagation_t, + CUDNN_NOT_PROPAGATE_NAN, # 0 + CUDNN_PROPAGATE_NAN # 1 + + +@test cudnnActivationDescriptor(C_NULL) isa cudnnActivationDescriptor +@test Base.unsafe_convert(Ptr, cudnnActivationDescriptor(C_NULL)) isa Ptr +@test cudnnActivationDescriptor(CUDNN_ACTIVATION_RELU,CUDNN_NOT_PROPAGATE_NAN,0) isa cudnnActivationDescriptor + +(ax,ay) = randn.((10,10)) +(cx,cy) = CuArray.((ax,ay)) + +function activationtest(; + mode=CUDNN_ACTIVATION_SIGMOID, + nanOpt=CUDNN_NOT_PROPAGATE_NAN, + coef=1, + alpha=1, + beta=0, +) + fx = (mode === CUDNN_ACTIVATION_SIGMOID ? 1 ./ (1 .+ exp.(-ax)) : + mode === CUDNN_ACTIVATION_RELU ? max.(0,ax) : + mode === CUDNN_ACTIVATION_TANH ? 
tanh.(ax) : + mode === CUDNN_ACTIVATION_CLIPPED_RELU ? clamp.(ax,0,coef) : + mode === CUDNN_ACTIVATION_ELU ? (x->(x >= 0 ? x : coef*(exp(x)-1))).(ax) : + error("Unknown activation")) + d = cudnnActivationDescriptor(mode,nanOpt,Cfloat(coef)) + y0 = alpha * fx + y1 = y0 .+ beta * ay + @test y0 ≈ cudnnActivationForward(cx; mode, nanOpt, coef, alpha) |> Array + @test y0 ≈ cudnnActivationForward(cx, d; alpha) |> Array + @test y1 ≈ cudnnActivationForward!(copy(cy), cx; mode, nanOpt, coef, alpha, beta) |> Array + @test y1 ≈ cudnnActivationForward!(copy(cy), cx, d; alpha, beta) |> Array +end + +activationtest(mode=CUDNN_ACTIVATION_SIGMOID) +activationtest(mode=CUDNN_ACTIVATION_RELU) +activationtest(mode=CUDNN_ACTIVATION_TANH) +activationtest(mode=CUDNN_ACTIVATION_CLIPPED_RELU) +activationtest(mode=CUDNN_ACTIVATION_ELU) +activationtest(nanOpt=CUDNN_PROPAGATE_NAN) +activationtest(coef=2,mode=CUDNN_ACTIVATION_CLIPPED_RELU) +activationtest(coef=2,mode=CUDNN_ACTIVATION_ELU) +activationtest(alpha=2) +activationtest(beta=2) diff --git a/lib/cudnn/test/convolution.jl b/lib/cudnn/test/convolution.jl new file mode 100644 index 0000000000..2b4052681b --- /dev/null +++ b/lib/cudnn/test/convolution.jl @@ -0,0 +1,191 @@ +import NNlib +using CUDNN: + cudnnConvolutionForward, + cudnnConvolutionForward!, + cudnnConvolutionBackwardFilter, + cudnnConvolutionBackwardData, + cudnnGetConvolutionNdForwardOutputDim, + cudnnSetConvolutionMathType, + cudnnSetConvolutionReorderType, + cudnnSetConvolutionGroupCount, + cudnnFindConvolutionForwardAlgorithmEx, + cudnnConvolutionFwdAlgoPerf_t, + cudnnFindConvolutionBackwardFilterAlgorithmEx, + cudnnConvolutionBwdFilterAlgoPerf_t, + cudnnFindConvolutionBackwardDataAlgorithmEx, + cudnnConvolutionBwdDataAlgoPerf_t, + cudnnConvolutionDescriptor, + cudnnConvolutionDescriptor_t, + cudnnCreateConvolutionDescriptor, + cudnnSetConvolutionNdDescriptor, + cudnnDestroyConvolutionDescriptor, + cudnnConvolutionMode_t, + CUDNN_CONVOLUTION, # 0 + CUDNN_CROSS_CORRELATION, # 1 + cudnnActivationMode_t, + CUDNN_ACTIVATION_SIGMOID, # 0 + CUDNN_ACTIVATION_RELU, # 1 + CUDNN_ACTIVATION_TANH, # 2 + CUDNN_ACTIVATION_CLIPPED_RELU, # 3 + CUDNN_ACTIVATION_ELU, # 4 + CUDNN_ACTIVATION_IDENTITY, # 5 + cudnnNanPropagation_t, + CUDNN_NOT_PROPAGATE_NAN, # 0 + CUDNN_PROPAGATE_NAN, # 1 + cudnnMathType_t, + CUDNN_DEFAULT_MATH, # 0 + CUDNN_TENSOR_OP_MATH, # 1 + CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION, # 2 + CUDNN_FMA_MATH, # 3 + cudnnReorderType_t, + CUDNN_DEFAULT_REORDER, # 0 + CUDNN_NO_REORDER, # 1 + cudnnConvolutionFwdAlgo_t, + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, # 0 + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, # 1 + CUDNN_CONVOLUTION_FWD_ALGO_GEMM, # 2 + CUDNN_CONVOLUTION_FWD_ALGO_DIRECT, # 3 + CUDNN_CONVOLUTION_FWD_ALGO_FFT, # 4 + CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING, # 5 + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD, # 6 + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED, # 7 + CUDNN_CONVOLUTION_FWD_ALGO_COUNT, # 8 + cudnnConvolutionBwdFilterAlgo_t, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0, # 0, /* non-deterministic */ + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, # 1, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT, # 2, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3, # 3, /* non-deterministic */ + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD, # 4, /* not implemented */ + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED, # 5, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING, # 6, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT, # 7 + cudnnConvolutionBwdDataAlgo_t, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_0, # 0, /* non-deterministic */ + 
CUDNN_CONVOLUTION_BWD_DATA_ALGO_1, # 1, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT, # 2, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING, # 3, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD, # 4, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED, # 5, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT, # 6 + cudnnTensorFormat_t, + CUDNN_TENSOR_NCHW, # 0, /* row major (wStride = 1, hStride = w) */ + CUDNN_TENSOR_NHWC, # 1, /* feature maps interleaved ( cStride = 1 )*/ + CUDNN_TENSOR_NCHW_VECT_C, # 2, /* each image point is vector of element of C, vector length in data type */ + cudnnDataType, + convdims, + math_mode + + +T = Float32 +ax,aw,ab = randn(T,8,8,4,4),randn(T,3,3,4,4),randn(T,1,1,4,1) +cx,cw,cb = CuArray.((ax,aw,ab)) + +function convtest(; + blendz=false, + bias=nothing, + activation = CUDNN_ACTIVATION_IDENTITY, + mode = CUDNN_CONVOLUTION, + padding = 0, + stride = 1, + dilation = 1, + group = 1, + dataType = eltype(cx), + mathType = math_mode(), + reorderType = CUDNN_DEFAULT_REORDER, + alpha = 1, + beta = 0) + if group == 1 + cdims = NNlib.DenseConvDims(ax, aw; stride, padding, dilation, flipkernel = (mode === CUDNN_CROSS_CORRELATION)) + ay = NNlib.conv(ax, aw, cdims) + cw0 = cw + else + # Implement grouped convolution + xchan = size(aw,3)÷group + ychan = size(aw,4)÷group + xdims = (size(ax,1),size(ax,2),xchan,size(ax,4)) + wdims = (size(aw,1),size(aw,2),xchan,ychan) + cdims = NNlib.DenseConvDims(xdims, wdims; stride, padding, dilation, flipkernel = (mode === CUDNN_CROSS_CORRELATION)) + ay = nothing + for g in 1:group + xrange = 1+(g-1)*xchan:g*xchan + yrange = 1+(g-1)*ychan:g*ychan + ay0 = NNlib.conv(ax[:,:,xrange,:], aw[:,:,1:xchan,yrange], cdims) + ay = (ay === nothing ? ay0 : cat(ay, ay0; dims=3)) + end + cw0 = CuArray(aw[:,:,1:xchan,:]) + end + + if alpha != 1; ay = alpha * ay; end + if bias != nothing; ay = ay .+ Array(bias); end + + act = (activation === CUDNN_ACTIVATION_RELU ? NNlib.relu : + activation === CUDNN_ACTIVATION_IDENTITY ? identity : + error("Unsupported activation $activation")) + ay1 = act.(ay) + + az0 = randn(T,size(ay)...) + ay0 = randn(T,size(ay)...) 
+ cy0, cy1 = CuArray.((ay0,ay0)) + if blendz + cz0 = cz1 = CuArray(az0) + ay2 = act.(ay .+ beta * az0) + else + cz0, cz1 = cy0, cy1 + ay2 = act.(ay .+ beta * ay0) + end + + d = cudnnConvolutionDescriptor(convdims(padding,size(ax)), + convdims(stride,size(ax)), + convdims(dilation,size(ax)), mode, + cudnnDataType(dataType), mathType, reorderType, + Cint(group)) + @test ay1 ≈ cudnnConvolutionForward(cw0, cx; bias, activation, mode, padding, + stride, dilation, group, mathType, reorderType, + alpha) |> Array + @test ay1 ≈ cudnnConvolutionForward(cw0, cx, d; bias, activation, alpha) |> Array + @test ay2 ≈ cudnnConvolutionForward!(cy0, cw0, cx; z=cz0, bias, activation, mode, + padding, stride, dilation, group, mathType, + reorderType, alpha, beta) |> Array + @test ay2 ≈ cudnnConvolutionForward!(cy1, cw0, cx, d; z=cz1, bias, activation, + alpha, beta) |> Array +end + +# These call cudnnConvolutionForward +convtest() +convtest(padding=1) +convtest(stride=2) +convtest(dilation=2) +convtest(group=2) # See https://blog.yani.ai/filter-group-tutorial/ +convtest(mathType=CUDNN_DEFAULT_MATH) +convtest(mathType=CUDNN_TENSOR_OP_MATH) +convtest(mathType=CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION) +convtest(reorderType=CUDNN_NO_REORDER) +convtest(alpha=2) +convtest(beta=2) + +# These call cudnnConvolutionBiasActivationForward +convtest(bias=cb) +convtest(blendz=true) +convtest(activation=CUDNN_ACTIVATION_RELU) +convtest(bias=cb,blendz=true) +convtest(bias=cb,activation=CUDNN_ACTIVATION_RELU) +convtest(bias=cb,padding=1) +convtest(bias=cb,stride=2) +convtest(bias=cb,dilation=2) +convtest(bias=cb,group=2) +convtest(bias=cb,mathType=CUDNN_DEFAULT_MATH) +convtest(bias=cb,mathType=CUDNN_TENSOR_OP_MATH) +convtest(bias=cb,mathType=CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION) +convtest(bias=cb,reorderType=CUDNN_NO_REORDER) +convtest(bias=cb,alpha=2) +convtest(bias=cb,beta=2) +convtest(bias=cb,beta=2,blendz=true) + +# Test tensor format +cx2,cw2,cb2 = (x->permutedims(x,(3,1,2,4))).((cx,cw,cb)) +whcn = cudnnConvolutionForward(cw,cx) |> Array +cwhn = cudnnConvolutionForward(cw2,cx2,format=CUDNN_TENSOR_NHWC) |> Array +@test cwhn ≈ permutedims(whcn,(3,1,2,4)) +whcn = cudnnConvolutionForward(cw,cx;bias=cb) |> Array +cwhn = cudnnConvolutionForward(cw2,cx2;bias=cb2,format=CUDNN_TENSOR_NHWC) |> Array +@test cwhn ≈ permutedims(whcn,(3,1,2,4)) diff --git a/lib/cudnn/test/dropout.jl b/lib/cudnn/test/dropout.jl new file mode 100644 index 0000000000..53e027e721 --- /dev/null +++ b/lib/cudnn/test/dropout.jl @@ -0,0 +1,30 @@ +using Statistics +using CUDNN: + cudnnDropoutForward, + cudnnDropoutForward!, + cudnnDropoutBackward, + cudnnDropoutSeed, + cudnnDropoutDescriptor, + cudnnDropoutDescriptor_t, + cudnnCreateDropoutDescriptor, + cudnnSetDropoutDescriptor, + cudnnGetDropoutDescriptor, + cudnnRestoreDropoutDescriptor, + cudnnDestroyDropoutDescriptor, + cudnnDropoutGetStatesSize, + cudnnDropoutGetReserveSpaceSize + +@test cudnnDropoutDescriptor(C_NULL) isa cudnnDropoutDescriptor +@test Base.unsafe_convert(Ptr, cudnnDropoutDescriptor(C_NULL)) isa Ptr +@test cudnnDropoutDescriptor(0.5) isa cudnnDropoutDescriptor + +N,P = 1000, 0.7 +x = CUDA.rand(N) +d = cudnnDropoutDescriptor(P) +cudnnDropoutSeed[] = 1 +y = cudnnDropoutForward(x; dropout = P) |> Array +@test isapprox(mean(y.==0), P; atol = 3/sqrt(N)) +@test y == cudnnDropoutForward(x, d) |> Array +@test y == cudnnDropoutForward!(similar(x), x; dropout = P) |> Array +@test y == cudnnDropoutForward!(similar(x), x, d) |> Array +cudnnDropoutSeed[] = -1 diff --git a/lib/cudnn/test/inplace.jl 
b/lib/cudnn/test/inplace.jl new file mode 100644 index 0000000000..27dcd2cca4 --- /dev/null +++ b/lib/cudnn/test/inplace.jl @@ -0,0 +1,28 @@ +import CUDNN: + cudnnSetTensor!, + cudnnScaleTensor!, + cudnnScaleTensor, + cudnnAddTensor!, + cudnnAddTensor, + CUDNN_TENSOR_NHWC + +x = CUDA.rand(10) +cudnnSetTensor!(x, 7) +@test all(isequal(7), Array(x)) +ax = rand(10) +cx = CuArray(ax) +@test 7*ax ≈ cudnnScaleTensor(cx, 7) |> Array +@test 7*ax ≈ cudnnScaleTensor!(similar(cx), cx, 7) |> Array +ax,ab = rand(5,4,3,2),rand(1,1,3,1) +cx,cb = CuArray.((ax,ab)) +@test ax .+ ab ≈ cudnnAddTensor(cx, cb) |> Array +@test ax .+ 7*ab ≈ cudnnAddTensor(cx, cb, alpha=7) |> Array +@test 7*ax .+ ab ≈ cudnnAddTensor(cx, cb, beta=7) |> Array +@test ax .+ ab ≈ cudnnAddTensor!(similar(cx), cx, cb) |> Array +@test ax .+ 7*ab ≈ cudnnAddTensor!(similar(cx), cx, cb, alpha=7) |> Array +@test 7*ax .+ ab ≈ cudnnAddTensor!(similar(cx), cx, cb, beta=7) |> Array +@test ax .+ ab ≈ cudnnAddTensor!(cx, cx, cb) |> Array +@test ax .+ ab ≈ cx |> Array +ax,ab = rand(3,5,4,2),rand(3,1,1,1) +cx,cb = CuArray.((ax,ab)) +@test ax .+ ab ≈ cudnnAddTensor(cx, cb, format=CUDNN_TENSOR_NHWC) |> Array diff --git a/lib/cudnn/test/multiheadattn.jl b/lib/cudnn/test/multiheadattn.jl new file mode 100644 index 0000000000..1197993030 --- /dev/null +++ b/lib/cudnn/test/multiheadattn.jl @@ -0,0 +1,170 @@ +using CUDNN: + cudnnMultiHeadAttnForward, + cudnnMultiHeadAttnForward!, + cudnnMultiHeadAttnBackwardData, + cudnnMultiHeadAttnBackwardWeights, + cudnnGetMultiHeadAttnBuffers, + cudnnGetMultiHeadAttnWeights, + cudnnAttnDescriptor, + cudnnAttnDescriptor_t, + cudnnCreateAttnDescriptor, + cudnnDestroyAttnDescriptor, + cudnnSetAttnDescriptor, + cudnnGetAttnDescriptor, + cudnnDataType_t, + cudnnDropoutDescriptor_t, + cudnnAttnQueryMap_t, + CUDNN_ATTN_QUERYMAP_ALL_TO_ONE, # 0 /* multiple Q-s map to a single (K,V) set when beam size > 1, beam sizes for (K,V) = 1 */ + CUDNN_ATTN_QUERYMAP_ONE_TO_ONE, # (1U << 0) /* multiple Q-s map to multiple (K,V) sets when beam size > 1, beam sizes for (K,V) = beam size for (Q) */ + CUDNN_ATTN_DISABLE_PROJ_BIASES, # 0 /* no biases in attention input and output projections */ + CUDNN_ATTN_ENABLE_PROJ_BIASES, # (1U << 1) /* use biases in attention input and output projections */ + cudnnMultiHeadAttnWeightKind_t, + CUDNN_MH_ATTN_Q_WEIGHTS, # 0, /* input projection weights for 'queries' */ + CUDNN_MH_ATTN_K_WEIGHTS, # 1, /* input projection weights for 'keys' */ + CUDNN_MH_ATTN_V_WEIGHTS, # 2, /* input projection weights for 'values' */ + CUDNN_MH_ATTN_O_WEIGHTS, # 3, /* output projection weights */ + CUDNN_MH_ATTN_Q_BIASES, # 4, /* input projection bias tensor for 'queries' */ + CUDNN_MH_ATTN_K_BIASES, # 5, /* input projection bias for 'keys' */ + CUDNN_MH_ATTN_V_BIASES, # 6, /* input projection bias for 'values' */ + CUDNN_MH_ATTN_O_BIASES, # 7, /* output projection biases */ + cudnnMathType_t, + CUDNN_DEFAULT_MATH, # 0, + CUDNN_TENSOR_OP_MATH, # 1, + CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION, # 2, + CUDNN_FMA_MATH, # 3, + cudnnWgradMode_t, + CUDNN_WGRAD_MODE_ADD, # 0, + CUDNN_WGRAD_MODE_SET, # 1, + cudnnSeqDataDescriptor, + cudnnSeqDataDescriptor_t, + cudnnCreateSeqDataDescriptor, + cudnnDestroySeqDataDescriptor, + cudnnSetSeqDataDescriptor, + cudnnGetSeqDataDescriptor, + cudnnSeqDataAxis_t, + CUDNN_SEQDATA_TIME_DIM, # 0, /* index in time */ + CUDNN_SEQDATA_BATCH_DIM, # 1, /* index in batch */ + CUDNN_SEQDATA_BEAM_DIM, # 2, /* index in beam */ + CUDNN_SEQDATA_VECT_DIM, # 3 /* index in vector */ + CUDNN_SEQDATA_DIM_COUNT, # 4 + 
cudnnDataType, + cudnnSeqDataDefaultAxes, + math_mode, + sdim + +function mhatest(; + # Input tensor descriptors + axes::Vector{cudnnSeqDataAxis_t} = cudnnSeqDataDefaultAxes, + seqLengthsQO::Vector{<:Integer} = fill(Cint(sdim(queries,axes,CUDNN_SEQDATA_TIME_DIM)), sdim(queries,axes,CUDNN_SEQDATA_BATCH_DIM)*sdim(queries,axes,CUDNN_SEQDATA_BEAM_DIM)), + seqLengthsKV::Vector{<:Integer} = fill(Cint(sdim(keys,axes,CUDNN_SEQDATA_TIME_DIM)), sdim(keys,axes,CUDNN_SEQDATA_BATCH_DIM)*sdim(keys,axes,CUDNN_SEQDATA_BEAM_DIM)), + #devSeqLengthsQO::CuVector{Cint} = convert(CuVector{Cint}, seqLengthsQO), + #devSeqLengthsKV::CuVector{Cint} = convert(CuVector{Cint}, seqLengthsKV), + #qDesc::cudnnSeqDataDescriptor = cudnnSeqDataDescriptor(queries; axes, seqLengthArray=seqLengthsQO), + #kDesc::cudnnSeqDataDescriptor = cudnnSeqDataDescriptor(keys; axes, seqLengthArray=seqLengthsKV), + #vDesc::cudnnSeqDataDescriptor = cudnnSeqDataDescriptor(values; axes, seqLengthArray=seqLengthsKV), + + # attnDesc parameters + attnMode::Unsigned = CUDNN_ATTN_QUERYMAP_ALL_TO_ONE | CUDNN_ATTN_DISABLE_PROJ_BIASES |> Cuint, + nHeads::Integer = Cint(1), + smScaler::Real = Cdouble(1), + # dataType::DataType = eltype(queries), + # computePrec::DataType = eltype(queries), ## No other option according to 8.0.2 + mathType::cudnnMathType_t = math_mode(), + # attnDropout::Real = 0, ## The dropout option is currently not supported by the multi-head attention API + # postDropout::Real = 0, ## The dropout option is currently not supported by the multi-head attention API + qProjSize::Integer = 0, # Use zero to disable the corresponding projection + kProjSize::Integer = 0, + vProjSize::Integer = 0, + oProjSize::Integer = 0, + qoMaxSeqLength::Integer = sdim(queries,axes,CUDNN_SEQDATA_TIME_DIM), + kvMaxSeqLength::Integer = sdim(keys,axes,CUDNN_SEQDATA_TIME_DIM), + maxBatchSize::Integer = sdim(queries,axes,CUDNN_SEQDATA_BATCH_DIM), + maxBeamSize::Integer = sdim(queries,axes,CUDNN_SEQDATA_BEAM_DIM), + + # forw parameters + residuals = nothing, + currIdx::Integer = -1, + loWinIdx::Array{Cint} = fill(Cint(0), qoMaxSeqLength), + hiWinIdx::Array{Cint} = fill(Cint(kvMaxSeqLength), qoMaxSeqLength), + #workspace::Union{CuArray,Nothing} = nothing, + #reserveSpace::Union{CuArray,Nothing} = nothing, +) + attnDesc::cudnnAttnDescriptor = cudnnAttnDescriptor( + Cuint(attnMode), + Cint(nHeads), + Cdouble(smScaler), + cudnnDataType(eltype(queries)), # dataType + cudnnDataType(eltype(queries)), # computePrec + mathType, + C_NULL, # attnDropout + C_NULL, # postDropout + Cint(sdim(queries,axes,CUDNN_SEQDATA_VECT_DIM)), # qSize + Cint(sdim(keys, axes,CUDNN_SEQDATA_VECT_DIM)), # kSize + Cint(sdim(values, axes,CUDNN_SEQDATA_VECT_DIM)), # vSize + Cint(qProjSize), + Cint(kProjSize), + Cint(vProjSize), + Cint(oProjSize), + Cint(qoMaxSeqLength), + Cint(kvMaxSeqLength), + Cint(maxBatchSize), + Cint(maxBeamSize) + ) + y = cudnnMultiHeadAttnForward(weights, queries, keys, values; axes, seqLengthsQO, + seqLengthsKV, attnMode, nHeads, smScaler, mathType, + qProjSize, kProjSize, vProjSize, oProjSize, + qoMaxSeqLength, kvMaxSeqLength, maxBatchSize, + maxBeamSize, residuals, currIdx, loWinIdx, hiWinIdx) + @test Array(y) ≈ cudnnMultiHeadAttnForward!(zero(y), weights, queries, keys, values; axes, + seqLengthsQO, seqLengthsKV, attnMode, nHeads, + smScaler, mathType, qProjSize, kProjSize, + vProjSize, oProjSize, qoMaxSeqLength, + kvMaxSeqLength, maxBatchSize, maxBeamSize, + residuals, currIdx, loWinIdx, hiWinIdx) |> Array + @test Array(y) ≈ cudnnMultiHeadAttnForward(weights, queries, 
keys, values, attnDesc; + axes, seqLengthsQO, seqLengthsKV, residuals, + currIdx, loWinIdx, hiWinIdx) |> Array + @test Array(y) ≈ cudnnMultiHeadAttnForward!(zero(y), weights, queries, keys, values, attnDesc; + axes, seqLengthsQO, seqLengthsKV, residuals, + currIdx, loWinIdx, hiWinIdx) |> Array +end + +Q,K,V,B,T,F = 6,6,5,4,3,Float32 + +weights, queries, keys, values = (CUDA.randn(x...) for x in ((F,100),(F,Q,B,T),(F,K,B,T),(F,V,B,T))) +mhatest() +mhatest(attnMode = CUDNN_ATTN_QUERYMAP_ALL_TO_ONE | CUDNN_ATTN_ENABLE_PROJ_BIASES |> Cuint, vProjSize=7) +mhatest(seqLengthsQO = Cint[1,2,3,1]) +mhatest(seqLengthsKV = Cint[1,2,3,1]) +mhatest(nHeads = 2) +mhatest(smScaler = 2) +mhatest(mathType = CUDNN_DEFAULT_MATH) +mhatest(mathType = CUDNN_TENSOR_OP_MATH) +mhatest(mathType = CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION) +mhatest(mathType = CUDNN_FMA_MATH) +mhatest(kProjSize = 7, qProjSize = 7) # k and q have to match +mhatest(vProjSize = 7) +mhatest(oProjSize = 7) +mhatest(qoMaxSeqLength = 7) +mhatest(kvMaxSeqLength = 7) +mhatest(maxBatchSize = 7) +mhatest(maxBeamSize = 7) +mhatest(loWinIdx = fill(Cint(1),T)) +mhatest(hiWinIdx = fill(Cint(1),T)) +mhatest(currIdx = 0) + +# Test residuals: residuals and output (and thus values unless oProjSize>0) must match queries in vector size +values, residuals = (CUDA.randn(x...) for x in ((F,Q,B,T),(F,Q,B,T))) +mhatest(residuals = residuals) + +# Test nonstandard axes order +weights, queries, keys, values = (CUDA.randn(x...) for x in ((F,100),(F,Q,T,B),(F,K,T,B),(F,V,T,B))) +mhatest(axes = [CUDNN_SEQDATA_VECT_DIM, CUDNN_SEQDATA_TIME_DIM, CUDNN_SEQDATA_BATCH_DIM, CUDNN_SEQDATA_BEAM_DIM]) + +# Test beam handling +weights, queries, keys, values = (CUDA.randn(x...) for x in ((F,100),(F,Q,B,T,2),(F,K,B,T,1),(F,V,B,T,1))) +mhatest() + +# CUDNN_ATTN_QUERYMAP_ONE_TO_ONE does not seem to be supported +#weights, queries, keys, values = (CUDA.randn(x...) for x in ((F,100),(F,Q,B,T,M),(F,K,B,T,M),(F,V,B,T,M))) +#mhatest(attnMode = CUDNN_ATTN_QUERYMAP_ONE_TO_ONE | CUDNN_ATTN_DISABLE_PROJ_BIASES |> Cuint) diff --git a/lib/cudnn/test/normalization.jl b/lib/cudnn/test/normalization.jl new file mode 100644 index 0000000000..6e42f8e864 --- /dev/null +++ b/lib/cudnn/test/normalization.jl @@ -0,0 +1,116 @@ +using Statistics, Random + +using CUDNN: + cudnnNormalizationForward, + cudnnNormalizationForward!, + cudnnNormalizationForwardInference, + cudnnNormalizationForwardTraining, + cudnnNormalizationBackward, + cudnnActivationDescriptor, + cudnnNormMode_t, + CUDNN_NORM_PER_ACTIVATION, # 0, bnScale, bnBias tensor dims are 1xCxHxWx.. 
(one value per CHW...-slice, normalized over N slice) + CUDNN_NORM_PER_CHANNEL, # 1, bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) + cudnnNormOps_t, + CUDNN_NORM_OPS_NORM, # 0, /* do normalization only */ + CUDNN_NORM_OPS_NORM_ACTIVATION, # 1, /* do Norm, then activation */ + CUDNN_NORM_OPS_NORM_ADD_ACTIVATION, # 2, /* do Norm, then elemWiseAdd, then activation */ + cudnnNormAlgo_t, + CUDNN_NORM_ALGO_STANDARD, # 0 + CUDNN_NORM_ALGO_PERSIST, # 1 + cudnnActivationMode_t, + CUDNN_ACTIVATION_SIGMOID, # 0 + CUDNN_ACTIVATION_RELU, # 1 + CUDNN_ACTIVATION_TANH, # 2 + CUDNN_ACTIVATION_CLIPPED_RELU, # 3 + CUDNN_ACTIVATION_ELU, # 4 + CUDNN_ACTIVATION_IDENTITY, # 5 + cudnnNanPropagation_t, + CUDNN_NOT_PROPAGATE_NAN, # 0 + CUDNN_PROPAGATE_NAN, # 1 + cudnnTensorFormat_t, + CUDNN_TENSOR_NCHW, # 0, /* row major (wStride = 1, hStride = w) */ + CUDNN_TENSOR_NHWC, # 1, /* feature maps interleaved ( cStride = 1 )*/ + CUDNN_TENSOR_NCHW_VECT_C # 2, /* each image point is vector of element of C, vector length in data type */ + + +function normtest( + x; + + training = false, + + # Inference parameters: + z = nothing, # for residual addition to the result of the normalization operation, prior to the activation + mode::cudnnNormMode_t = CUDNN_NORM_PER_CHANNEL, # Per-channel layer is based on the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift, S. Ioffe, C. Szegedy, 2015. + normOps::cudnnNormOps_t = CUDNN_NORM_OPS_NORM, # Currently CUDNN_NORM_OPS_NORM_ACTIVATION and CUDNN_NORM_OPS_NORM_ADD_ACTIVATION are only supported in the NHWC layout (training,backward), not supported (inference) + algo::cudnnNormAlgo_t = CUDNN_NORM_ALGO_STANDARD, # trigger the new semi-persistent NHWC kernel when CUDNN_NORM_ALGO_PERSIST + alpha::Real = 1, + beta::Real = 0, + epsilon::Real = 1e-5, # Has to be >= 0. Should be the same in forward and backward functions. + groupCnt::Integer = 1, # Place hold for future work, should be set to 1 now + + # Main argument defaults: + format::cudnnTensorFormat_t = CUDNN_TENSOR_NCHW, # or NHWC + _sdims = (mode == CUDNN_NORM_PER_CHANNEL && format == CUDNN_TENSOR_NCHW ? (1,1,size(x,3),1) : + mode == CUDNN_NORM_PER_CHANNEL && format == CUDNN_TENSOR_NHWC ? (size(x,1),1,1,1) : + mode == CUDNN_NORM_PER_ACTIVATION && format == CUDNN_TENSOR_NCHW ? (size(x)[1:3]...,1) : + mode == CUDNN_NORM_PER_ACTIVATION && format == CUDNN_TENSOR_NHWC ? (size(x)[1:3]...,1) : + error("Unknown mode $mode and format $format")), + scale = fill!(similar(x, _sdims), 1), + bias = fill!(similar(x, _sdims), 0), + xmean = fill!(similar(x, _sdims), 0), + xvar = fill!(similar(x, _sdims), 1), + + # Training-only parameters: + exponentialAverageFactor::Real = 0.1, + savedMean = nothing, # Optionally save intermediate results from the forward pass here - can be reused to speed up backward pass. NULL if unused. + savedInvVariance = nothing, + + # Activation parameters: + activationMode::cudnnActivationMode_t = CUDNN_ACTIVATION_IDENTITY, + activationReluNanOpt::cudnnNanPropagation_t = CUDNN_NOT_PROPAGATE_NAN, + activationCoef::Real = 1, + activationDesc::Union{Nothing,cudnnActivationDescriptor} = (normOps == CUDNN_NORM_OPS_NORM ? 
nothing : cudnnActivationDescriptor(activationMode, activationReluNanOpt, Cdouble(activationCoef))), +) + if training + dims = findall(size(xmean) .== 1) + m = mean(x; dims) + v = var(x; dims, mean=m, corrected=false) + y = bias .+ scale .* (x .- m) ./ sqrt.(epsilon .+ v) + else + y = bias .+ scale .* (x .- xmean) ./ sqrt.(epsilon .+ xvar) + end + y0 = randn!(similar(x)) + y1 = alpha * y + y2 = y1 + beta * y0 + @test Array(y1) ≈ cudnnNormalizationForward(x, xmean, xvar, bias, scale; training, z, mode, + normOps, algo, alpha, epsilon, groupCnt, + format, exponentialAverageFactor, savedMean, + savedInvVariance, activationDesc) |> Array + @test Array(y2) ≈ cudnnNormalizationForward!(copy(y0), x, xmean, xvar, bias, scale; + training, z, mode, normOps, algo, alpha, beta, + epsilon, groupCnt, format, + exponentialAverageFactor, savedMean, + savedInvVariance, activationDesc) |> Array +end + +x, z, s = (CUDA.randn(x...) for x in ((5,4,3,2),(5,4,3,2),(1,1,3,1))) +normtest(x) +normtest(x; training = true) +normtest(x; mode = CUDNN_NORM_PER_ACTIVATION) +normtest(x; algo = CUDNN_NORM_ALGO_PERSIST) +normtest(x; algo = CUDNN_NORM_ALGO_PERSIST, format = CUDNN_TENSOR_NHWC) +normtest(x; alpha = 2) +normtest(x; beta = 2) +normtest(x; epsilon = 0) +normtest(x; format = CUDNN_TENSOR_NHWC) +normtest(x; scale = fill!(s, 2)) +normtest(x; bias = fill!(s, 2)) +normtest(x; xmean = fill!(s, 2)) +normtest(x; xvar = fill!(s, 2)) +normtest(x; exponentialAverageFactor = 0.01) +normtest(x; savedMean = similar(s)) +normtest(x; savedInvVariance = similar(s)) +# cudnn-8.0.5: Currently, CUDNN_NORM_OPS_NORM_ACTIVATION and CUDNN_NORM_OPS_NORM_ADD_ACTIVATION are not supported in inference. +#normtest(x; normOps = CUDNN_NORM_OPS_NORM_ACTIVATION, activationMode = CUDNN_ACTIVATION_RELU, format = CUDNN_TENSOR_NHWC) +#normtest(x; normOps = CUDNN_NORM_OPS_NORM_ADD_ACTIVATION, activationMode = CUDNN_ACTIVATION_RELU, z, format = CUDNN_TENSOR_NHWC) +#normtest(x; groupCnt = 2) # cudnn-8.0.5: Currently only groupCnt=1 is supported diff --git a/lib/cudnn/test/optensor.jl b/lib/cudnn/test/optensor.jl new file mode 100644 index 0000000000..a168a022c0 --- /dev/null +++ b/lib/cudnn/test/optensor.jl @@ -0,0 +1,61 @@ +using CUDNN: + cudnnOpTensor, + cudnnOpTensor!, + cudnnOpTensorDescriptor, + cudnnOpTensorDescriptor_t, + cudnnCreateOpTensorDescriptor, + cudnnSetOpTensorDescriptor, + cudnnGetOpTensorDescriptor, + cudnnDestroyOpTensorDescriptor, + cudnnOpTensorOp_t, + CUDNN_OP_TENSOR_ADD, # 0, + CUDNN_OP_TENSOR_MUL, # 1, + CUDNN_OP_TENSOR_MIN, # 2, + CUDNN_OP_TENSOR_MAX, # 3, + CUDNN_OP_TENSOR_SQRT, # 4, performed only on first arg + CUDNN_OP_TENSOR_NOT, # 5, performed only on first arg + cudnnNanPropagation_t, + CUDNN_NOT_PROPAGATE_NAN, # 0 + CUDNN_PROPAGATE_NAN, # 1 + cudnnDataType + +@test cudnnOpTensorDescriptor(C_NULL) isa cudnnOpTensorDescriptor +@test Base.unsafe_convert(Ptr, cudnnOpTensorDescriptor(C_NULL)) isa Ptr +@test cudnnOpTensorDescriptor(CUDNN_OP_TENSOR_ADD,cudnnDataType(Float32),CUDNN_NOT_PROPAGATE_NAN) isa cudnnOpTensorDescriptor + +(ax1,ax2,ay) = rand.((10,10,10)) +(cx1,cx2,cy) = CuArray.((ax1,ax2,ay)) + +function optensortest(; + op=CUDNN_OP_TENSOR_ADD, + nanOpt=CUDNN_NOT_PROPAGATE_NAN, + compType=(eltype(ax1) <: Float64 ? Float64 : Float32), + alpha1=1, + alpha2=1, + beta=0, +) + f1 = (op === CUDNN_OP_TENSOR_ADD ? alpha1*ax1 .+ alpha2*ax2 : + op === CUDNN_OP_TENSOR_MUL ? (alpha1*ax1) .* (alpha2*ax2) : + op === CUDNN_OP_TENSOR_MIN ? min.(alpha1*ax1, alpha2*ax2) : + op === CUDNN_OP_TENSOR_MAX ? 
max.(alpha1*ax1, alpha2*ax2) : + op === CUDNN_OP_TENSOR_SQRT ? sqrt.(alpha1*ax1) : + op === CUDNN_OP_TENSOR_NOT ? 1 .- ax1 : + error("Unknown optensor")) + f2 = f1 .+ beta * ay + d = cudnnOpTensorDescriptor(op,cudnnDataType(compType),nanOpt) + @test f1 ≈ cudnnOpTensor(cx1, cx2; op, compType, nanOpt, alpha1, alpha2) |> Array + @test f1 ≈ cudnnOpTensor(cx1, cx2, d; alpha1, alpha2) |> Array + @test f2 ≈ cudnnOpTensor!(copy(cy), cx1, cx2; op, compType, nanOpt, alpha1, alpha2, beta) |> Array + @test f2 ≈ cudnnOpTensor!(copy(cy), cx1, cx2, d; alpha1, alpha2, beta) |> Array +end + +optensortest(op = CUDNN_OP_TENSOR_ADD) +optensortest(op = CUDNN_OP_TENSOR_MUL) +optensortest(op = CUDNN_OP_TENSOR_MIN) +optensortest(op = CUDNN_OP_TENSOR_MAX) +optensortest(op = CUDNN_OP_TENSOR_SQRT) +optensortest(op = CUDNN_OP_TENSOR_NOT) +optensortest(nanOpt = CUDNN_PROPAGATE_NAN) +optensortest(alpha1 = 2) +optensortest(alpha2 = 2) +optensortest(beta = 2) diff --git a/lib/cudnn/test/pooling.jl b/lib/cudnn/test/pooling.jl new file mode 100644 index 0000000000..c2e5849a49 --- /dev/null +++ b/lib/cudnn/test/pooling.jl @@ -0,0 +1,91 @@ +using CUDA, Random +import NNlib +using CUDNN: + cudnnPoolingForward, + cudnnPoolingForward!, + cudnnPoolingBackward, + cudnnGetPoolingNdForwardOutputDim, + cudnnPoolingDescriptor, + cudnnPoolingDescriptor_t, + cudnnCreatePoolingDescriptor, + cudnnSetPoolingNdDescriptor, + cudnnDestroyPoolingDescriptor, + cudnnPoolingMode_t, + CUDNN_POOLING_MAX, # 0, + CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING, # 1, /* count for average includes padded values */ + CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING, # 2, /* count for average does not include padded values */ + CUDNN_POOLING_MAX_DETERMINISTIC, # 3 + cudnnNanPropagation_t, + CUDNN_NOT_PROPAGATE_NAN, # 0 + CUDNN_PROPAGATE_NAN, # 1 + cudnnTensorFormat_t, + CUDNN_TENSOR_NCHW, # 0, /* row major (wStride = 1, hStride = w) */ + CUDNN_TENSOR_NHWC, # 1, /* feature maps interleaved ( cStride = 1 )*/ + CUDNN_TENSOR_NCHW_VECT_C, # 2, /* each image point is vector of element of C, vector length in data type */ + pooldims + +function pooltest(; + mode = CUDNN_POOLING_MAX, + nanOpt = CUDNN_NOT_PROPAGATE_NAN, + window = 2, + padding = 0, + stride = window, + format = CUDNN_TENSOR_NCHW, + dataType = Float32, + alpha = 1, + beta = 0) + ax = randn(dataType,12,6,4,2) + N = ndims(ax) + window = expand(Val(N-2), window) + stride = expand(Val(N-2), stride) + padding = expand(Val(N-2), padding) + pdims = NNlib.PoolDims(ax, window; padding = padding, stride = stride) + #= + if mode == CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING + @warn "Pool mode=$mode not yet implemented in NNlib, using INCLUDE instead. See https://github.com/FluxML/NNlib.jl/issues/218" maxlog=1 + end + if mode == CUDNN_POOLING_MAX_DETERMINISTIC + @warn "Pool mode=$mode not yet implemented in NNlib, using MAX instead." maxlog=1 + end + if nanOpt == CUDNN_NOT_PROPAGATE_NAN + @warn "Pool nanOpt=$nanOpt not yet implemented in NNlib, using PROPAGATE instead. See https://github.com/FluxML/NNlib.jl/issues/218" maxlog=1 + end + =# + ay1 = (mode == CUDNN_POOLING_MAX ? NNlib.maxpool(ax, pdims) : + mode == CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING ? NNlib.meanpool(ax, pdims) : + mode == CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING ? NNlib.meanpool(ax, pdims) : + mode == CUDNN_POOLING_MAX_DETERMINISTIC ? 
NNlib.maxpool(ax, pdims) : + error("mode=$mode is not supported.")) + ay1 = alpha * ay1 + ay = randn!(similar(ay1)) + ay2 = ay1 .+ beta * ay + d = cudnnPoolingDescriptor(mode, nanOpt, Cint(max(2,ndims(ax)-2)), pooldims(window,size(ax)), pooldims(padding,size(ax)), pooldims(stride,size(ax))) + nhwc(a) = permutedims(a,(3,1,2,4)) + if format === CUDNN_TENSOR_NCHW + cx, cy = CuArray.((ax, ay)) + else + cx, cy = CuArray.(nhwc.((ax,ay))) + ay1, ay2 = nhwc.((ay1, ay2)) + end + @test ay1 ≈ cudnnPoolingForward(cx; mode, nanOpt, window, padding, stride, format, alpha) |> Array + @test ay1 ≈ cudnnPoolingForward(cx, d; format, alpha) |> Array + @test ay2 ≈ cudnnPoolingForward!(copy(cy), cx; mode, nanOpt, window, padding, stride, format, alpha, beta) |> Array + @test ay2 ≈ cudnnPoolingForward!(copy(cy), cx, d; format, alpha, beta) |> Array +end + +expand(::Val{N}, i::NTuple{N}) where {N} = i +expand(::Val{N}, i::Integer) where {N} = ntuple(_ -> i, N) + + +pooltest() +pooltest(mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING) +pooltest(mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING) +pooltest(mode = CUDNN_POOLING_MAX_DETERMINISTIC) +pooltest(nanOpt = CUDNN_PROPAGATE_NAN) +pooltest(window = 3) +pooltest(padding = 1) +pooltest(stride = 1) +pooltest(format = CUDNN_TENSOR_NHWC) +pooltest(dataType = Float16) +pooltest(alpha = 2) +pooltest(beta = 2) diff --git a/lib/cudnn/test/reduce.jl b/lib/cudnn/test/reduce.jl new file mode 100644 index 0000000000..5be45493ad --- /dev/null +++ b/lib/cudnn/test/reduce.jl @@ -0,0 +1,84 @@ +using Statistics +using CUDNN: + cudnnReduceTensor, + cudnnReduceTensor!, + cudnnGetReductionIndicesSize, + cudnnGetReductionWorkspaceSize, + cudnnReduceTensorDescriptor, + cudnnReduceTensorDescriptor_t, + cudnnCreateReduceTensorDescriptor, + cudnnSetReduceTensorDescriptor, + cudnnGetReduceTensorDescriptor, + cudnnDestroyReduceTensorDescriptor, + cudnnReduceTensorOp_t, + CUDNN_REDUCE_TENSOR_ADD, # 0, + CUDNN_REDUCE_TENSOR_MUL, # 1, + CUDNN_REDUCE_TENSOR_MIN, # 2, + CUDNN_REDUCE_TENSOR_MAX, # 3, + CUDNN_REDUCE_TENSOR_AMAX, # 4, + CUDNN_REDUCE_TENSOR_AVG, # 5, + CUDNN_REDUCE_TENSOR_NORM1, # 6, + CUDNN_REDUCE_TENSOR_NORM2, # 7, + CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS, # 8, + cudnnNanPropagation_t, + CUDNN_NOT_PROPAGATE_NAN, # 0 + CUDNN_PROPAGATE_NAN, # 1 + cudnnReduceTensorIndices, + cudnnReduceTensorIndices_t, + CUDNN_REDUCE_TENSOR_NO_INDICES, # 0, + CUDNN_REDUCE_TENSOR_FLATTENED_INDICES, # 1, + cudnnIndicesType, + cudnnIndicesType_t, + CUDNN_32BIT_INDICES, # 0, + CUDNN_64BIT_INDICES, # 1, + CUDNN_16BIT_INDICES, # 2, + CUDNN_8BIT_INDICES, # 3, + cudnnDataType + +@test cudnnReduceTensorDescriptor(C_NULL) isa cudnnReduceTensorDescriptor +@test Base.unsafe_convert(Ptr, cudnnReduceTensorDescriptor(C_NULL)) isa Ptr +@test cudnnReduceTensorDescriptor(CUDNN_REDUCE_TENSOR_ADD,cudnnDataType(Float32),CUDNN_NOT_PROPAGATE_NAN,CUDNN_REDUCE_TENSOR_NO_INDICES,CUDNN_32BIT_INDICES) isa cudnnReduceTensorDescriptor + +(ax,ay) = randn(Float32,10,10), randn(Float32,10,1) +(cx,cy) = CuArray.((ax,ay)) + +function reducetensortest(; + op::cudnnReduceTensorOp_t = CUDNN_REDUCE_TENSOR_ADD, + compType::DataType = (eltype(ax) <: Float64 ? Float64 : Float32), + nanOpt::cudnnNanPropagation_t = CUDNN_NOT_PROPAGATE_NAN, + indices::Union{Vector{<:Unsigned},Nothing} = nothing, + d::cudnnReduceTensorDescriptor = cudnnReduceTensorDescriptor(op, cudnnDataType(compType), nanOpt, cudnnReduceTensorIndices(op, indices), cudnnIndicesType(indices)), + alpha::Real = 1, + beta::Real = 0, +) + f0 = (op === CUDNN_REDUCE_TENSOR_ADD ? 
sum(ax, dims=2) : + op === CUDNN_REDUCE_TENSOR_MUL ? prod(ax, dims=2) : + op === CUDNN_REDUCE_TENSOR_MIN ? minimum(ax, dims=2) : + op === CUDNN_REDUCE_TENSOR_MAX ? maximum(ax, dims=2) : + op === CUDNN_REDUCE_TENSOR_AMAX ? maximum(abs, ax, dims=2) : + op === CUDNN_REDUCE_TENSOR_AVG ? mean(ax, dims=2) : + op === CUDNN_REDUCE_TENSOR_NORM1 ? sum(abs, ax, dims=2) : + op === CUDNN_REDUCE_TENSOR_NORM2 ? sqrt.(sum(abs2, ax, dims=2)) : + op === CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS ? (ax1=copy(ax);ax1[ax.==0].=1;prod(ax1,dims=2)) : + error("Unknown reducetensor")) + f1 = alpha * f0 + f2 = f1 + beta * ay + dims = size(ay) + @test f1 ≈ cudnnReduceTensor(cx; dims, op, compType, nanOpt, indices, alpha) |> Array + @test f1 ≈ cudnnReduceTensor(cx, d; dims, indices, alpha) |> Array + @test f2 ≈ cudnnReduceTensor!(copy(cy), cx; op, compType, nanOpt, indices, alpha, beta) |> Array + @test f2 ≈ cudnnReduceTensor!(copy(cy), cx, d; indices, alpha, beta) |> Array +end + +reducetensortest() +reducetensortest(op = CUDNN_REDUCE_TENSOR_MUL) +reducetensortest(op = CUDNN_REDUCE_TENSOR_MIN) +reducetensortest(op = CUDNN_REDUCE_TENSOR_MAX) +reducetensortest(op = CUDNN_REDUCE_TENSOR_AMAX) +reducetensortest(op = CUDNN_REDUCE_TENSOR_AVG) +reducetensortest(op = CUDNN_REDUCE_TENSOR_NORM1) +reducetensortest(op = CUDNN_REDUCE_TENSOR_NORM2) +reducetensortest(op = CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS) +reducetensortest(nanOpt = CUDNN_PROPAGATE_NAN) +reducetensortest(alpha = 2) +reducetensortest(beta = 2) diff --git a/lib/cudnn/test/rnn.jl b/lib/cudnn/test/rnn.jl new file mode 100644 index 0000000000..806ad9db95 --- /dev/null +++ b/lib/cudnn/test/rnn.jl @@ -0,0 +1,142 @@ +using CUDNN: + cudnnRNNForward, + cudnnRNNForward!, + cudnnRNNBackwardData_v8, + cudnnRNNBackwardWeights_v8, + cudnnRNNDescriptor, + cudnnRNNDescriptor_t, + cudnnSetRNNDescriptor_v8, + cudnnGetRNNWeightSpaceSize, + cudnnGetRNNTempSpaceSizes, + cudnnRNNAlgo_t, + CUDNN_RNN_ALGO_STANDARD, # 0, robust performance across a wide range of network parameters + CUDNN_RNN_ALGO_PERSIST_STATIC, # 1, fast when the first dimension of the input tensor is small (meaning, a small minibatch), cc>=6.0 + CUDNN_RNN_ALGO_PERSIST_DYNAMIC, # 2, similar to static, optimize using the specific parameters of the network and active GPU, cc>=6.0 + CUDNN_RNN_ALGO_COUNT, # 3 + cudnnRNNMode_t, + CUDNN_RNN_RELU, # 0, /* basic RNN cell type with ReLu activation */ + CUDNN_RNN_TANH, # 1, /* basic RNN cell type with tanh activation */ + CUDNN_LSTM, # 2, /* LSTM with optional recurrent projection and clipping */ + CUDNN_GRU, # 3, /* Using h' = tanh(r * Uh(t-1) + Wx) and h = (1 - z) * h' + z * h(t-1); */ + cudnnRNNBiasMode_t, + CUDNN_RNN_NO_BIAS, # 0, /* rnn cell formulas do not use biases */ + CUDNN_RNN_SINGLE_INP_BIAS, # 1, /* rnn cell formulas use one input bias in input GEMM */ + CUDNN_RNN_DOUBLE_BIAS, # 2, /* default, rnn cell formulas use two bias vectors */ + CUDNN_RNN_SINGLE_REC_BIAS, # 3 /* rnn cell formulas use one recurrent bias in recurrent GEMM */ + cudnnDirectionMode_t, + CUDNN_UNIDIRECTIONAL, # 0, /* single direction network */ + CUDNN_BIDIRECTIONAL, # 1, /* output concatination at each layer */ + cudnnRNNInputMode_t, + CUDNN_LINEAR_INPUT, # 0, /* adjustable weight matrix in first layer input GEMM */ + CUDNN_SKIP_INPUT, # 1, /* fixed identity matrix in the first layer input GEMM */ + cudnnMathType_t, + CUDNN_DEFAULT_MATH, # 0, + CUDNN_TENSOR_OP_MATH, # 1, + CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION, # 2, + CUDNN_FMA_MATH, # 3, + #/* For auxFlags in cudnnSetRNNDescriptor_v8() and 
cudnnSetRNNPaddingMode() */ + CUDNN_RNN_PADDED_IO_DISABLED, # 0 + CUDNN_RNN_PADDED_IO_ENABLED, # (1U << 0) + cudnnForwardMode_t, + CUDNN_FWD_MODE_INFERENCE, # 0 + CUDNN_FWD_MODE_TRAINING, # 1 + cudnnRNNDataDescriptor_t, + cudnnSetRNNDataDescriptor, + cudnnRNNDataLayout_t, + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED, # 0, /* padded, outer stride from one time-step to the next */ + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, # 1, /* sequence length sorted and packed as in basic RNN api */ + CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED, # 2, /* padded, outer stride from one batch to the next */ + cudnnWgradMode_t, + CUDNN_WGRAD_MODE_ADD, # 0, /* add partial gradients to wgrad output buffers */ + CUDNN_WGRAD_MODE_SET, # 1, /* write partial gradients to wgrad output buffers */ + cudnnTensorDescriptor, + cudnnDropoutDescriptor, + cudnnDataType, + math_mode + + X,H,B,T = 8,8,4,2 + w = CUDA.randn(10000) + x = CUDA.randn(X,B,T) + hx1 = CUDA.randn(H,B,1) + cx1 = CUDA.randn(H,B,1) + +function rnntest(; + hx = nothing, + cx = nothing, + hy = nothing, + cy = nothing, + layout::cudnnRNNDataLayout_t = CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED, + seqLengthArray::Union{Nothing,Vector{Cint}} = nothing, + fwdMode::cudnnForwardMode_t = CUDNN_FWD_MODE_INFERENCE, + # descriptor keywords + hiddenSize::Integer = H, + algo::cudnnRNNAlgo_t = CUDNN_RNN_ALGO_STANDARD, + cellMode::cudnnRNNMode_t = CUDNN_LSTM, + biasMode::cudnnRNNBiasMode_t = CUDNN_RNN_DOUBLE_BIAS, + dirMode::cudnnDirectionMode_t = CUDNN_UNIDIRECTIONAL, + inputMode::cudnnRNNInputMode_t = CUDNN_LINEAR_INPUT, + mathPrec::DataType = eltype(x), + mathType::cudnnMathType_t = math_mode(), + inputSize::Integer = size(x,1), + projSize::Integer = hiddenSize, + numLayers::Integer = 1, + dropout::Real = 0, + auxFlags::Integer = CUDNN_RNN_PADDED_IO_ENABLED, +) + d = cudnnRNNDescriptor(algo, cellMode, biasMode, dirMode, inputMode, + cudnnDataType(eltype(x)), cudnnDataType(mathPrec), mathType, + Int32(inputSize), Int32(hiddenSize), Int32(projSize), + Int32(numLayers), cudnnDropoutDescriptor(Cfloat(dropout)), + UInt32(auxFlags)) + y = cudnnRNNForward(w, x; hx, cx, hy, cy, layout, seqLengthArray, fwdMode, + hiddenSize, algo, cellMode, biasMode, dirMode, inputMode, + mathPrec, mathType, inputSize, projSize, numLayers, dropout, + auxFlags) + _y = copy(y) + _hy = (hy === nothing ? hy : copy(hy[])) + _cy = (cy === nothing ? 
cy : copy(cy[])) + @test Array(_y) ≈ cudnnRNNForward!(y, w, x; hx, cx, hy, cy, layout, seqLengthArray, fwdMode, + hiddenSize, algo, cellMode, biasMode, dirMode, inputMode, + mathPrec, mathType, inputSize, projSize, numLayers, dropout, auxFlags) |> Array + (_hy === hy === nothing || @test Array(_hy) ≈ Array(hy[])) + (_cy === cy === nothing || @test Array(_cy) ≈ Array(cy[])) + @test Array(_y) ≈ cudnnRNNForward(w, x, d; hx, cx, hy, cy, layout, seqLengthArray, fwdMode) |> Array + (_hy === hy === nothing || @test Array(_hy) ≈ Array(hy[])) + (_cy === cy === nothing || @test Array(_cy) ≈ Array(cy[])) + @test Array(_y) ≈ cudnnRNNForward!(y, w, x, d; hx, cx, hy, cy, layout, seqLengthArray, fwdMode) |> Array + (_hy === hy === nothing || @test Array(_hy) ≈ Array(hy[])) + (_cy === cy === nothing || @test Array(_cy) ≈ Array(cy[])) +end + +rnntest() +rnntest(hx=hx1) +rnntest(cx=cx1) +rnntest(hy=Ref{Any}()) +rnntest(cy=Ref{Any}()) +rnntest(layout=CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED) +rnntest(layout=CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED) +rnntest(seqLengthArray=Cint[1,2,1,2]) +rnntest(fwdMode=CUDNN_FWD_MODE_TRAINING) +rnntest(hiddenSize=16) +# XXX: it's unclear which devices support this algorithm +if capability(device()) >= v"6.1" + rnntest(algo=CUDNN_RNN_ALGO_PERSIST_STATIC) +end +#rnntest(algo=CUDNN_RNN_ALGO_PERSIST_DYNAMIC) # causes segfault +rnntest(cellMode=CUDNN_RNN_RELU) +rnntest(cellMode=CUDNN_RNN_TANH) +rnntest(cellMode=CUDNN_GRU) +rnntest(biasMode=CUDNN_RNN_NO_BIAS) +rnntest(biasMode=CUDNN_RNN_SINGLE_INP_BIAS) +rnntest(biasMode=CUDNN_RNN_SINGLE_REC_BIAS) +rnntest(dirMode=CUDNN_BIDIRECTIONAL) +rnntest(inputMode=CUDNN_SKIP_INPUT) +rnntest(mathPrec=Float32) # only possible option for F32 input +rnntest(mathType=CUDNN_DEFAULT_MATH) +rnntest(mathType=CUDNN_TENSOR_OP_MATH) +rnntest(mathType=CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION) +rnntest(projSize=4) +rnntest(numLayers=2) +rnntest(dropout=0.5) +rnntest(auxFlags=CUDNN_RNN_PADDED_IO_DISABLED) +rnntest(auxFlags=CUDNN_RNN_PADDED_IO_ENABLED) diff --git a/lib/cudnn/test/runtests.jl b/lib/cudnn/test/runtests.jl new file mode 100644 index 0000000000..46648a4cd7 --- /dev/null +++ b/lib/cudnn/test/runtests.jl @@ -0,0 +1,27 @@ +using CUDNN, CUDA, Test + +@info "CUDA information:\n" * sprint(io->CUDA.versioninfo(io)) + +@test CUDNN.has_cudnn() +@info "CUDNN version: $(CUDNN.version()) (built for CUDA $(CUDNN.cuda_version()))" + +@testset "CUDNN" begin + +# include all tests +for entry in readdir(@__DIR__) + endswith(entry, ".jl") || continue + entry in ["runtests.jl"] && continue + + # XXX: disabled due to sporadic CI issue (JuliaGPU/CUDA.jl#/725) + entry == "convolution.jl" && continue + + # generate a testset + name = splitext(entry)[1] + @eval begin + @testset $name begin + include($entry) + end + end +end + +end diff --git a/lib/cudnn/test/softmax.jl b/lib/cudnn/test/softmax.jl new file mode 100644 index 0000000000..14d0b5c1c3 --- /dev/null +++ b/lib/cudnn/test/softmax.jl @@ -0,0 +1,45 @@ +using CUDNN: + cudnnSoftmaxForward, + cudnnSoftmaxForward!, + cudnnSoftmaxBackward, + cudnnSoftmaxAlgorithm_t, + CUDNN_SOFTMAX_FAST, # 0, /* straightforward implementation */ + CUDNN_SOFTMAX_ACCURATE, # 1, /* subtract max from every point to avoid overflow */ + CUDNN_SOFTMAX_LOG, # 2 + cudnnSoftmaxMode_t, + CUDNN_SOFTMAX_MODE_INSTANCE, # 0, /* compute the softmax over all C, H, W for each N */ + CUDNN_SOFTMAX_MODE_CHANNEL # 1 /* compute the softmax over all C for each H, W, N */ + +ax,ay = randn(Float32,10,10),randn(Float32,10,10) +cx,cy = CuArray.((ax,ay)) + 
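# --- editorial sketch (not part of the patch) ---------------------------------------------
# The forward tests in this new suite (optensor, pooling, reduce, normalization, softmax)
# all verify the same cuDNN output-blending convention: the in-place `...Forward!` variants
# compute  dst = alpha * op(src) .+ beta * dst_old.
# A minimal host-side reference of that blend over plain Julia arrays; the helper name
# `blend_reference` is hypothetical and introduced only for illustration:
blend_reference(result, dst_old; alpha=1, beta=0) = alpha .* result .+ beta .* dst_old
# e.g. `softmaxtest` below builds `y0 = alpha * y` and `y1 = y0 .+ beta * ay`, which is
# blend_reference(y, ay; alpha, beta), and compares it against
# cudnnSoftmaxForward!(copy(cy1), cx1; algo, mode, alpha, beta).
# -------------------------------------------------------------------------------------------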
+function softmaxtest(; + alpha=1, + beta=0, + mode=CUDNN_SOFTMAX_MODE_INSTANCE, + algo=CUDNN_SOFTMAX_FAST +) + d = mode === CUDNN_SOFTMAX_MODE_INSTANCE ? 1 : 2 + x = ax .- maximum(ax, dims=d) + y = x .- log.(sum(exp.(x), dims=d)) + if algo !== CUDNN_SOFTMAX_LOG; y = exp.(y); end + add1(x)=reshape(x, (size(x)..., 1)) + if mode === CUDNN_SOFTMAX_MODE_CHANNEL + y,cx1,cy1 = add1.((y,cx,cy)) + else + cx1,cy1 = cx,cy + end + y0 = alpha * y + y1 = y0 .+ beta * ay + @test y0 ≈ cudnnSoftmaxForward(cx1; algo, mode, alpha) |> Array + @test y1 ≈ cudnnSoftmaxForward!(copy(cy1), cx1; algo, mode, alpha, beta) |> Array +end + +softmaxtest() +softmaxtest(alpha=2) +softmaxtest(beta=2) +softmaxtest(mode=CUDNN_SOFTMAX_MODE_INSTANCE) +softmaxtest(mode=CUDNN_SOFTMAX_MODE_CHANNEL) +softmaxtest(algo=CUDNN_SOFTMAX_FAST) +softmaxtest(algo=CUDNN_SOFTMAX_ACCURATE) +softmaxtest(algo=CUDNN_SOFTMAX_LOG) diff --git a/lib/cudnn/test/tensor.jl b/lib/cudnn/test/tensor.jl new file mode 100644 index 0000000000..ba508b09dc --- /dev/null +++ b/lib/cudnn/test/tensor.jl @@ -0,0 +1,31 @@ +using CUDNN: + cudnnTensorDescriptor, + cudnnCreateTensorDescriptor, + cudnnFilterDescriptor, + cudnnDataType, + cudnnDataType_t, + CUDNN_TENSOR_NCHW, + CUDNN_STATUS_SUCCESS, + @retry_reclaim + +x = CUDA.rand(1,1,1,2) + +TD = cudnnTensorDescriptor +FD = cudnnFilterDescriptor +DT = cudnnDataType + +@test TD(x) isa TD +@test TD(CUDNN_TENSOR_NCHW, DT(eltype(x)), Cint(ndims(x)), Cint[reverse(size(x))...]) isa TD +td = TD(x) +@test TD(td.ptr) isa TD +@test Base.unsafe_convert(Ptr, TD(td.ptr)) isa Ptr + +@test FD(x) isa FD +@test FD(DT(eltype(x)),CUDNN_TENSOR_NCHW,Cint(ndims(x)),Cint[reverse(size(x))...]) isa FD +fd = FD(x) +@test FD(fd.ptr) isa FD +@test Base.unsafe_convert(Ptr, FD(fd.ptr)) isa Ptr + +@test DT(Float32) isa cudnnDataType_t + +@test (@retry_reclaim(x->(x!==CUDNN_STATUS_SUCCESS),cudnnCreateTensorDescriptor(Ref{Ptr{Cvoid}}(C_NULL)))) isa Nothing diff --git a/lib/CUSTATEVEC/Project.toml b/lib/custatevec/Project.toml similarity index 83% rename from lib/CUSTATEVEC/Project.toml rename to lib/custatevec/Project.toml index 7873c6c270..b744874414 100644 --- a/lib/CUSTATEVEC/Project.toml +++ b/lib/custatevec/Project.toml @@ -6,3 +6,4 @@ version = "0.1.0" [deps] CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" diff --git a/lib/CUSTATEVEC/src/CUSTATEVEC.jl b/lib/custatevec/src/CUSTATEVEC.jl similarity index 87% rename from lib/CUSTATEVEC/src/CUSTATEVEC.jl rename to lib/custatevec/src/CUSTATEVEC.jl index ce061fa18a..2b167303fa 100644 --- a/lib/CUSTATEVEC/src/CUSTATEVEC.jl +++ b/lib/custatevec/src/CUSTATEVEC.jl @@ -2,12 +2,14 @@ module CUSTATEVEC using CUDA using CUDA: CUstream, cudaDataType, @checked, HandleCache, with_workspace, libraryPropertyType -using CUDA: libcustatevec, unsafe_free!, @retry_reclaim, initialize_context +using CUDA: unsafe_free!, @retry_reclaim, initialize_context using CEnum: @cenum const cudaDataType_t = cudaDataType +include("bindeps.jl") + # core library include("libcustatevec_common.jl") include("error.jl") @@ -63,4 +65,12 @@ function handle() return state.handle end +function version() + ver = custatevecGetVersion() + major, ver = divrem(ver, 1000) + minor, patch = divrem(ver, 10) + + VersionNumber(major, minor, patch) +end + end diff --git a/lib/custatevec/src/bindeps.jl b/lib/custatevec/src/bindeps.jl new file mode 100644 index 0000000000..40df601b20 --- /dev/null +++ b/lib/custatevec/src/bindeps.jl @@ -0,0 +1,47 @@ +using 
CUDA.Deps: @initialize_ref, generic_artifact, artifact_library, find_library, + LocalToolkit, ArtifactToolkit, toolkit + +using Libdl + +export libcustatevec, has_custatevec + +const __libcustatevec = Ref{Union{String,Nothing}}() +function libcustatevec(; throw_error::Bool=true) + path = @initialize_ref __libcustatevec begin + + if CUDA.runtime_version() < v"11" + # XXX: bound this using tags in the Artifact.toml? + nothing + else + find_custatevec(toolkit(), "custatevec", v"0.1") + end + end + if path === nothing && throw_error + error("This functionality is unavailabe as CUSTATEVEC is missing.") + end + return path +end +has_custatevec() = libcustatevec(throw_error=false) !== nothing + +function find_custatevec(cuda::ArtifactToolkit, name, version) + artifact_dir = generic_artifact("cuQuantum") + if artifact_dir === nothing + return nothing + end + path = artifact_library(artifact_dir, name, [version]) + + @debug "Using CUSTATEVEC library $name from an artifact at $(artifact_dir)" + Libdl.dlopen(path) + return path +end + +function find_custatevec(cuda::LocalToolkit, name, version) + path = find_library(name, [version]; locations=cuda.dirs) + if path === nothing + return nothing + end + + @debug "Using local CUSTATEVEC library $name at $(path)" + Libdl.dlopen(path) + return path +end diff --git a/lib/CUSTATEVEC/src/error.jl b/lib/custatevec/src/error.jl similarity index 100% rename from lib/CUSTATEVEC/src/error.jl rename to lib/custatevec/src/error.jl diff --git a/lib/CUSTATEVEC/src/libcustatevec.jl b/lib/custatevec/src/libcustatevec.jl similarity index 100% rename from lib/CUSTATEVEC/src/libcustatevec.jl rename to lib/custatevec/src/libcustatevec.jl diff --git a/lib/CUSTATEVEC/src/libcustatevec_common.jl b/lib/custatevec/src/libcustatevec_common.jl similarity index 100% rename from lib/CUSTATEVEC/src/libcustatevec_common.jl rename to lib/custatevec/src/libcustatevec_common.jl diff --git a/lib/CUSTATEVEC/src/statevec.jl b/lib/custatevec/src/statevec.jl similarity index 100% rename from lib/CUSTATEVEC/src/statevec.jl rename to lib/custatevec/src/statevec.jl diff --git a/lib/CUSTATEVEC/src/types.jl b/lib/custatevec/src/types.jl similarity index 100% rename from lib/CUSTATEVEC/src/types.jl rename to lib/custatevec/src/types.jl diff --git a/lib/CUSTATEVEC/test/Project.toml b/lib/custatevec/test/Project.toml similarity index 100% rename from lib/CUSTATEVEC/test/Project.toml rename to lib/custatevec/test/Project.toml diff --git a/lib/CUSTATEVEC/test/runtests.jl b/lib/custatevec/test/runtests.jl similarity index 90% rename from lib/CUSTATEVEC/test/runtests.jl rename to lib/custatevec/test/runtests.jl index 634f601605..93afbe4cbf 100644 --- a/lib/CUSTATEVEC/test/runtests.jl +++ b/lib/custatevec/test/runtests.jl @@ -1,7 +1,11 @@ -using Test -using CUSTATEVEC +using Test, CUDA, CUSTATEVEC import CUSTATEVEC: CuStateVec, applyMatrix!, expectation, sample +@info "CUDA information:\n" * sprint(io->CUDA.versioninfo(io)) + +@test CUSTATEVEC.has_custatevec() +@info "CUSTATEVEC version: $(CUSTATEVEC.version())" + @testset "CUSTATEVEC" begin # build a simple state and compute expectations n_q = 2 diff --git a/lib/cutensor/Project.toml b/lib/cutensor/Project.toml new file mode 100644 index 0000000000..b9cc3cf5d2 --- /dev/null +++ b/lib/cutensor/Project.toml @@ -0,0 +1,10 @@ +name = "CUTENSOR" +uuid = "011b41b2-24ef-40a8-b3eb-fa098493e9e1" +authors = ["Tim Besard "] +version = "0.1.0" + +[deps] +CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +Libdl = 
"8f399da3-3557-5675-b5ff-fb832c97cbdb" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" diff --git a/lib/cutensor/CUTENSOR.jl b/lib/cutensor/src/CUTENSOR.jl similarity index 90% rename from lib/cutensor/CUTENSOR.jl rename to lib/cutensor/src/CUTENSOR.jl index 7dc17873cf..93ded97e3c 100644 --- a/lib/cutensor/CUTENSOR.jl +++ b/lib/cutensor/src/CUTENSOR.jl @@ -1,16 +1,17 @@ module CUTENSOR -using ..APIUtils - -using ..CUDA -using ..CUDA: CUstream, cudaDataType -using ..CUDA: libcutensor, @retry_reclaim, initialize_context +using CUDA +using CUDA.APIUtils +using CUDA: CUstream, cudaDataType +using CUDA: @retry_reclaim, initialize_context using CEnum: @cenum const cudaDataType_t = cudaDataType +include("bindeps.jl") + # core library include("libcutensor_common.jl") include("error.jl") diff --git a/lib/cutensor/src/bindeps.jl b/lib/cutensor/src/bindeps.jl new file mode 100644 index 0000000000..723b07b457 --- /dev/null +++ b/lib/cutensor/src/bindeps.jl @@ -0,0 +1,63 @@ +using CUDA.Deps: @initialize_ref, libcublas, cuda_artifact, artifact_library, find_library, + LocalToolkit, ArtifactToolkit, toolkit + +using Libdl + +export libcutensor, libcutensormg, has_cutensor, has_cutensormg + +const __libcutensor = Ref{Union{String,Nothing}}() +function libcutensor(; throw_error::Bool=true) + path = @initialize_ref __libcutensor begin + # CUTENSOR depends on CUBLAS + libcublas() + + find_cutensor(toolkit(), "cutensor", v"1") + end + if path === nothing && throw_error + error("This functionality is unavailabe as CUTENSOR is missing.") + end + path +end +has_cutensor() = libcutensor(throw_error=false) !== nothing + +const __libcutensormg = Ref{Union{String,Nothing}}() +function libcutensormg(; throw_error::Bool=true) + path = @initialize_ref __libcutensor begin + # CUTENSORMg additionally depends on CUDARt + libcudart() + + if CUTENSOR.version() < v"1.4" + nothing + else + find_cutensor(toolkit(), "cutensorMg", v"1") + end + end + if path === nothing && throw_error + error("This functionality is unavailabe as CUTENSORMg is missing.") + end + path +end +has_cutensormg() = libcutensormg(throw_error=false) !== nothing + +function find_cutensor(cuda::ArtifactToolkit, name, version) + artifact_dir = cuda_artifact("CUTENSOR", cuda.release) + if artifact_dir === nothing + return nothing + end + path = artifact_library(artifact_dir, name, [version]) + + @debug "Using CUTENSOR library $name from an artifact at $(artifact_dir)" + Libdl.dlopen(path) + return path +end + +function find_cutensor(cuda::LocalToolkit, name, version) + path = find_library(name, [version]; locations=cuda.dirs) + if path === nothing + return nothing + end + + @debug "Using local CUTENSOR library $name at $(path)" + Libdl.dlopen(path) + return path +end diff --git a/lib/cutensor/error.jl b/lib/cutensor/src/error.jl similarity index 100% rename from lib/cutensor/error.jl rename to lib/cutensor/src/error.jl diff --git a/lib/cutensor/interfaces.jl b/lib/cutensor/src/interfaces.jl similarity index 100% rename from lib/cutensor/interfaces.jl rename to lib/cutensor/src/interfaces.jl diff --git a/lib/cutensor/libcutensor.jl b/lib/cutensor/src/libcutensor.jl similarity index 100% rename from lib/cutensor/libcutensor.jl rename to lib/cutensor/src/libcutensor.jl diff --git a/lib/cutensor/libcutensor_common.jl b/lib/cutensor/src/libcutensor_common.jl similarity index 100% rename from lib/cutensor/libcutensor_common.jl rename to lib/cutensor/src/libcutensor_common.jl diff --git a/lib/cutensor/tensor.jl b/lib/cutensor/src/tensor.jl similarity index 
100% rename from lib/cutensor/tensor.jl rename to lib/cutensor/src/tensor.jl diff --git a/lib/cutensor/wrappers.jl b/lib/cutensor/src/wrappers.jl similarity index 100% rename from lib/cutensor/wrappers.jl rename to lib/cutensor/src/wrappers.jl diff --git a/lib/cutensor/test/Project.toml b/lib/cutensor/test/Project.toml new file mode 100644 index 0000000000..8c2694dba6 --- /dev/null +++ b/lib/cutensor/test/Project.toml @@ -0,0 +1,6 @@ +[deps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/cutensor/base.jl b/lib/cutensor/test/base.jl similarity index 86% rename from test/cutensor/base.jl rename to lib/cutensor/test/base.jl index 93134d8741..ab1d81df68 100644 --- a/test/cutensor/base.jl +++ b/lib/cutensor/test/base.jl @@ -1,11 +1,10 @@ -using CUDA.CUTENSOR -using CUDA -using LinearAlgebra +using CUDA, CUTENSOR +using LinearAlgebra, Random @test has_cutensor() @test CUTENSOR.version() isa VersionNumber -@testset "CuTensor type basics" begin +@testset "type basics" begin N = 2 dmax = 2^div(18,N) dims = rand(2:dmax, N) diff --git a/test/cutensor/contractions.jl b/lib/cutensor/test/contractions.jl similarity index 99% rename from test/cutensor/contractions.jl rename to lib/cutensor/test/contractions.jl index cd90235d22..b6aabff717 100644 --- a/test/cutensor/contractions.jl +++ b/lib/cutensor/test/contractions.jl @@ -1,5 +1,4 @@ -using CUDA.CUTENSOR -using CUDA +using CUDA, CUTENSOR using LinearAlgebra eltypes = ( (Float32, Float32, Float32, Float32), @@ -60,7 +59,7 @@ can_pin = !Sys.iswindows() C = collect(dC) mC = reshape(permutedims(C, ipC), (loA, loB)) @test mC ≈ mA * mB rtol=compute_rtol - + # simple case with plan storage opA = CUTENSOR.CUTENSOR_OP_IDENTITY opB = CUTENSOR.CUTENSOR_OP_IDENTITY @@ -117,7 +116,7 @@ can_pin = !Sys.iswindows() mC = reshape(permutedims(C2, invperm(pC2)), (loA, loB)) @test mC ≈ mA * mB end - + # with conjugation flag for complex arguments if !((NoA, NoB, Nc) in ((1,1,3), (1,2,3), (3,1,2))) # not supported for these specific cases for unknown reason @@ -168,7 +167,7 @@ can_pin = !Sys.iswindows() @test !any(isnan.(B)) @test !any(isnan.(mC)) @test mC ≈ mA * mB rtol=compute_rtol - + # simple case with non-zero α host side α = rand(eltyCompute) C .= zero(eltyC) @@ -177,7 +176,7 @@ can_pin = !Sys.iswindows() mC = reshape(permutedims(collect(C), ipC), (loA, loB)) @test !any(isnan.(mC)) @test mC ≈ α * mA * mB rtol=compute_rtol - + # simple case with plan storage host-side opA = CUTENSOR.CUTENSOR_OP_IDENTITY opB = CUTENSOR.CUTENSOR_OP_IDENTITY diff --git a/test/cutensor/elementwise_binary.jl b/lib/cutensor/test/elementwise_binary.jl similarity index 96% rename from test/cutensor/elementwise_binary.jl rename to lib/cutensor/test/elementwise_binary.jl index b4c55c0aff..1f2b5dea74 100644 --- a/test/cutensor/elementwise_binary.jl +++ b/lib/cutensor/test/elementwise_binary.jl @@ -1,10 +1,8 @@ -using CUDA.CUTENSOR -using CUDA +using CUDA, CUTENSOR using LinearAlgebra -# using host memory with CUTENSOR doesn't work on Windows, -# and occasionally causes failures under compute-sanitizer. 
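# --- editorial sketch (not part of the patch) ---------------------------------------------
# The recurring change in these cutensor test hunks is the switch from the former CUDA.jl
# submodule to the standalone package introduced by this PR. A minimal sketch of how the
# tests (and user code) now load it, assuming the new CUTENSOR package is available in the
# active environment:
using CUDA, CUTENSOR            # previously: using CUDA.CUTENSOR
has_cutensor() && @info "CUTENSOR $(CUTENSOR.version())"
# -------------------------------------------------------------------------------------------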
-can_pin = !Sys.iswindows() && !sanitize +# using host memory with CUTENSOR doesn't work on Windows +can_pin = !Sys.iswindows() eltypes = ((Float16, Float16), #(Float16, Float32), diff --git a/test/cutensor/elementwise_trinary.jl b/lib/cutensor/test/elementwise_trinary.jl similarity index 98% rename from test/cutensor/elementwise_trinary.jl rename to lib/cutensor/test/elementwise_trinary.jl index e942514a88..29fcfd9bc2 100644 --- a/test/cutensor/elementwise_trinary.jl +++ b/lib/cutensor/test/elementwise_trinary.jl @@ -1,10 +1,8 @@ -using CUDA.CUTENSOR -using CUDA +using CUDA, CUTENSOR using LinearAlgebra -# using host memory with CUTENSOR doesn't work on Windows, -# and occasionally causes failures under compute-sanitizer. -can_pin = !Sys.iswindows() && !sanitize +# using host memory with CUTENSOR doesn't work on Windows +can_pin = !Sys.iswindows() eltypes = ((Float16, Float16, Float16), #(Float16, Float32, Float32), diff --git a/test/cutensor/permutations.jl b/lib/cutensor/test/permutations.jl similarity index 97% rename from test/cutensor/permutations.jl rename to lib/cutensor/test/permutations.jl index 30aed153e4..5e83b20215 100644 --- a/test/cutensor/permutations.jl +++ b/lib/cutensor/test/permutations.jl @@ -1,6 +1,5 @@ -using CUDA.CUTENSOR -using CUDA -using LinearAlgebra +using CUDA, CUTENSOR +using LinearAlgebra, Random # using host memory with CUTENSOR doesn't work on Windows can_pin = !Sys.iswindows() diff --git a/test/cutensor/reductions.jl b/lib/cutensor/test/reductions.jl similarity index 98% rename from test/cutensor/reductions.jl rename to lib/cutensor/test/reductions.jl index 065059af67..5f58afeb28 100644 --- a/test/cutensor/reductions.jl +++ b/lib/cutensor/test/reductions.jl @@ -1,6 +1,7 @@ -using CUDA.CUTENSOR -using CUDA +using CUDA, CUTENSOR using LinearAlgebra +using Test +using Random # using host memory with CUTENSOR doesn't work on Windows can_pin = !Sys.iswindows() @@ -11,6 +12,7 @@ eltypes = (#(Float16, Float16), #(Float16, Float32), #(ComplexF16, ComplexF16), (ComplexF16, ComplexF32), (ComplexF32, ComplexF32), #(ComplexF32, ComplexF64), (ComplexF64, ComplexF64)) + @testset for NA=2:5, NC = 1:NA-1 @testset for (eltyA, eltyC) in eltypes # setup diff --git a/lib/cutensor/test/runtests.jl b/lib/cutensor/test/runtests.jl new file mode 100644 index 0000000000..3ad52eb54e --- /dev/null +++ b/lib/cutensor/test/runtests.jl @@ -0,0 +1,24 @@ +using CUTENSOR, CUDA, Test + +@info "CUDA information:\n" * sprint(io->CUDA.versioninfo(io)) + +@test CUTENSOR.has_cutensor() +@info "CUTENSOR version: $(CUTENSOR.version()) (built for CUDA $(CUTENSOR.cuda_version()))" + +@testset "CUTENSOR" begin + +# include all tests +for entry in readdir(@__DIR__) + endswith(entry, ".jl") || continue + entry in ["runtests.jl"] && continue + + # generate a testset + name = splitext(entry)[1] + @eval begin + @testset $name begin + include($entry) + end + end +end + +end diff --git a/lib/CUTENSORNET/Project.toml b/lib/cutensornet/Project.toml similarity index 70% rename from lib/CUTENSORNET/Project.toml rename to lib/cutensornet/Project.toml index 18313e44f7..26a458bb47 100644 --- a/lib/CUTENSORNET/Project.toml +++ b/lib/cutensornet/Project.toml @@ -6,3 +6,5 @@ version = "0.1.0" [deps] CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +CUTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1" +Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" diff --git a/lib/CUTENSORNET/src/CUTENSORNET.jl b/lib/cutensornet/src/CUTENSORNET.jl similarity index 76% rename from 
lib/CUTENSORNET/src/CUTENSORNET.jl rename to lib/cutensornet/src/CUTENSORNET.jl index c4afd02ba2..e8b074c2f6 100644 --- a/lib/CUTENSORNET/src/CUTENSORNET.jl +++ b/lib/cutensornet/src/CUTENSORNET.jl @@ -2,12 +2,16 @@ module CUTENSORNET using CUDA using CUDA: CUstream, cudaDataType, @checked, HandleCache, with_workspace -using CUDA: libcutensornet, @retry_reclaim, initialize_context +using CUDA: @retry_reclaim, initialize_context + +using CUTENSOR using CEnum: @cenum const cudaDataType_t = cudaDataType +include("bindeps.jl") + # core library include("libcutensornet_common.jl") include("error.jl") @@ -53,4 +57,20 @@ function handle() return state.handle end +function version() + ver = cutensornetGetVersion() + major, ver = divrem(ver, 1000) + minor, patch = divrem(ver, 10) + + VersionNumber(major, minor, patch) +end + +function cuda_version() + ver = cutensornetGetCudartVersion() + major, ver = divrem(ver, 1000) + minor, patch = divrem(ver, 10) + + VersionNumber(major, minor, patch) +end + end diff --git a/lib/cutensornet/src/bindeps.jl b/lib/cutensornet/src/bindeps.jl new file mode 100644 index 0000000000..bba58d88d9 --- /dev/null +++ b/lib/cutensornet/src/bindeps.jl @@ -0,0 +1,49 @@ +using CUDA.Deps: @initialize_ref, generic_artifact, artifact_library, find_library, + LocalToolkit, ArtifactToolkit, toolkit + +using Libdl + +export libcutensornet, has_cutensornet + +const __libcutensornet = Ref{Union{String,Nothing}}() +function libcutensornet(; throw_error::Bool=true) + path = @initialize_ref __libcutensornet begin + # CUTENSORNET depends on CUTENSOR + CUTENSOR.libcutensor(throw_error=throw_error) + + if CUDA.runtime_version() < v"11" + # XXX: bound this using tags in the Artifact.toml? + nothing + else + find_cutensornet(toolkit(), "cutensornet", v"0.1") + end + end + if path === nothing && throw_error + error("This functionality is unavailabe as CUTENSORNET is missing.") + end + return path +end +has_cutensornet() = CUTENSOR.has_cutensor() && libcutensornet(throw_error=false) !== nothing + +function find_cutensornet(cuda::ArtifactToolkit, name, version) + artifact_dir = generic_artifact("cuQuantum") + if artifact_dir === nothing + return nothing + end + path = artifact_library(artifact_dir, name, [version]) + + @debug "Using CUTENSORNET library $name from an artifact at $(artifact_dir)" + Libdl.dlopen(path) + return path +end + +function find_cutensornet(cuda::LocalToolkit, name, version) + path = find_library(name, [version]; locations=cuda.dirs) + if path === nothing + return nothing + end + + @debug "Using local CUTENSORNET library $name at $(path)" + Libdl.dlopen(path) + return path +end diff --git a/lib/CUTENSORNET/src/error.jl b/lib/cutensornet/src/error.jl similarity index 100% rename from lib/CUTENSORNET/src/error.jl rename to lib/cutensornet/src/error.jl diff --git a/lib/CUTENSORNET/src/libcutensornet.jl b/lib/cutensornet/src/libcutensornet.jl similarity index 100% rename from lib/CUTENSORNET/src/libcutensornet.jl rename to lib/cutensornet/src/libcutensornet.jl diff --git a/lib/CUTENSORNET/src/libcutensornet_common.jl b/lib/cutensornet/src/libcutensornet_common.jl similarity index 100% rename from lib/CUTENSORNET/src/libcutensornet_common.jl rename to lib/cutensornet/src/libcutensornet_common.jl diff --git a/lib/CUTENSORNET/src/tensornet.jl b/lib/cutensornet/src/tensornet.jl similarity index 100% rename from lib/CUTENSORNET/src/tensornet.jl rename to lib/cutensornet/src/tensornet.jl diff --git a/lib/CUTENSORNET/src/types.jl b/lib/cutensornet/src/types.jl similarity index 100% 
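# --- editorial sketch (not part of the patch) ---------------------------------------------
# The version() and cuda_version() helpers added to CUTENSORNET above (and to CUSTATEVEC
# earlier) decode the integer returned by the C library with two divrem steps. A standalone
# illustration of the same scheme; the name `decode_version` is hypothetical:
function decode_version(ver::Integer)
    major, ver = divrem(ver, 1000)   # thousands encode the major version
    minor, patch = divrem(ver, 10)   # remaining digits split into minor and patch
    VersionNumber(major, minor, patch)
end
# e.g. decode_version(1041) == v"1.4.1"
# -------------------------------------------------------------------------------------------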
rename from lib/CUTENSORNET/src/types.jl rename to lib/cutensornet/src/types.jl diff --git a/lib/CUTENSORNET/test/Project.toml b/lib/cutensornet/test/Project.toml similarity index 100% rename from lib/CUTENSORNET/test/Project.toml rename to lib/cutensornet/test/Project.toml diff --git a/lib/CUTENSORNET/test/runtests.jl b/lib/cutensornet/test/runtests.jl similarity index 87% rename from lib/CUTENSORNET/test/runtests.jl rename to lib/cutensornet/test/runtests.jl index 39cd145ff1..ac39b846cc 100644 --- a/lib/CUTENSORNET/test/runtests.jl +++ b/lib/cutensornet/test/runtests.jl @@ -1,8 +1,11 @@ -using Test -using CUDA -using CUTENSORNET +using Test, CUDA, CUTENSORNET import CUTENSORNET: CuTensorNetwork, rehearse_contraction, perform_contraction!, AutoTune, NoAutoTune +@info "CUDA information:\n" * sprint(io->CUDA.versioninfo(io)) + +@test CUTENSORNET.has_cutensornet() +@info "CUTENSORNET version: $(CUTENSORNET.version()) (built for CUDA $(CUTENSORNET.cuda_version()))" + @testset "CUTENSORNET" begin n = 8 m = 16 diff --git a/perf/.gitignore b/perf/.gitignore new file mode 100644 index 0000000000..124aa781c5 --- /dev/null +++ b/perf/.gitignore @@ -0,0 +1,2 @@ +results.json +reference.json diff --git a/src/CUDA.jl b/src/CUDA.jl index bdc6fd8d62..3b391db254 100644 --- a/src/CUDA.jl +++ b/src/CUDA.jl @@ -84,10 +84,8 @@ include("../lib/cusparse/CUSPARSE.jl") include("../lib/cusolver/CUSOLVER.jl") include("../lib/cufft/CUFFT.jl") include("../lib/curand/CURAND.jl") -include("../lib/cudnn/CUDNN.jl") -include("../lib/cutensor/CUTENSOR.jl") -export CUBLAS, CUSPARSE, CUSOLVER, CUFFT, CURAND, CUDNN, CUTENSOR +export CUBLAS, CUSPARSE, CUSOLVER, CUFFT, CURAND # random depends on CURAND include("random.jl") diff --git a/src/utilities.jl b/src/utilities.jl index 2a4ed98f0d..c454c57ef3 100644 --- a/src/utilities.jl +++ b/src/utilities.jl @@ -46,8 +46,6 @@ function versioninfo(io::IO=stdout) end println(io, "- CUPTI: ", has_cupti() ? CUPTI.version() : "missing") println(io, "- NVML: ", has_nvml() ? NVML.version() : "missing") - println(io, "- CUDNN: ", has_cudnn() ? "$(CUDNN.version())" : "missing") - println(io, "- CUTENSOR: ", has_cutensor() ? "$(CUTENSOR.version())" : "missing") println(io) println(io, "Toolchain:") diff --git a/test/cudnn/activation.jl b/test/cudnn/activation.jl deleted file mode 100644 index 9aa7d7360f..0000000000 --- a/test/cudnn/activation.jl +++ /dev/null @@ -1,63 +0,0 @@ -using CUDA.CUDNN: - cudnnActivationForward, - cudnnActivationForward!, - cudnnActivationBackward, - cudnnActivationDescriptor, - cudnnActivationDescriptor_t, - cudnnCreateActivationDescriptor, - cudnnSetActivationDescriptor, - cudnnGetActivationDescriptor, - cudnnDestroyActivationDescriptor, - cudnnActivationMode_t, - CUDNN_ACTIVATION_SIGMOID, # 0 - CUDNN_ACTIVATION_RELU, # 1 - CUDNN_ACTIVATION_TANH, # 2 - CUDNN_ACTIVATION_CLIPPED_RELU, # 3 - CUDNN_ACTIVATION_ELU, # 4 - CUDNN_ACTIVATION_IDENTITY, # 5 - cudnnNanPropagation_t, - CUDNN_NOT_PROPAGATE_NAN, # 0 - CUDNN_PROPAGATE_NAN # 1 - - -@testset "cudnn/activation" begin - @test cudnnActivationDescriptor(C_NULL) isa cudnnActivationDescriptor - @test Base.unsafe_convert(Ptr, cudnnActivationDescriptor(C_NULL)) isa Ptr - @test cudnnActivationDescriptor(CUDNN_ACTIVATION_RELU,CUDNN_NOT_PROPAGATE_NAN,0) isa cudnnActivationDescriptor - - (ax,ay) = randn.((10,10)) - (cx,cy) = CuArray.((ax,ay)) - - function activationtest(; - mode=CUDNN_ACTIVATION_SIGMOID, - nanOpt=CUDNN_NOT_PROPAGATE_NAN, - coef=1, - alpha=1, - beta=0, - ) - fx = (mode === CUDNN_ACTIVATION_SIGMOID ? 
1 ./ (1 .+ exp.(-ax)) : - mode === CUDNN_ACTIVATION_RELU ? max.(0,ax) : - mode === CUDNN_ACTIVATION_TANH ? tanh.(ax) : - mode === CUDNN_ACTIVATION_CLIPPED_RELU ? clamp.(ax,0,coef) : - mode === CUDNN_ACTIVATION_ELU ? (x->(x >= 0 ? x : coef*(exp(x)-1))).(ax) : - error("Unknown activation")) - d = cudnnActivationDescriptor(mode,nanOpt,Cfloat(coef)) - y0 = alpha * fx - y1 = y0 .+ beta * ay - @test y0 ≈ cudnnActivationForward(cx; mode, nanOpt, coef, alpha) |> Array - @test y0 ≈ cudnnActivationForward(cx, d; alpha) |> Array - @test y1 ≈ cudnnActivationForward!(copy(cy), cx; mode, nanOpt, coef, alpha, beta) |> Array - @test y1 ≈ cudnnActivationForward!(copy(cy), cx, d; alpha, beta) |> Array - end - - activationtest(mode=CUDNN_ACTIVATION_SIGMOID) - activationtest(mode=CUDNN_ACTIVATION_RELU) - activationtest(mode=CUDNN_ACTIVATION_TANH) - activationtest(mode=CUDNN_ACTIVATION_CLIPPED_RELU) - activationtest(mode=CUDNN_ACTIVATION_ELU) - activationtest(nanOpt=CUDNN_PROPAGATE_NAN) - activationtest(coef=2,mode=CUDNN_ACTIVATION_CLIPPED_RELU) - activationtest(coef=2,mode=CUDNN_ACTIVATION_ELU) - activationtest(alpha=2) - activationtest(beta=2) -end diff --git a/test/cudnn/convolution.jl b/test/cudnn/convolution.jl deleted file mode 100644 index 56c8f265ea..0000000000 --- a/test/cudnn/convolution.jl +++ /dev/null @@ -1,193 +0,0 @@ -import NNlib -using CUDA.CUDNN: - cudnnConvolutionForward, - cudnnConvolutionForward!, - cudnnConvolutionBackwardFilter, - cudnnConvolutionBackwardData, - cudnnGetConvolutionNdForwardOutputDim, - cudnnSetConvolutionMathType, - cudnnSetConvolutionReorderType, - cudnnSetConvolutionGroupCount, - cudnnFindConvolutionForwardAlgorithmEx, - cudnnConvolutionFwdAlgoPerf_t, - cudnnFindConvolutionBackwardFilterAlgorithmEx, - cudnnConvolutionBwdFilterAlgoPerf_t, - cudnnFindConvolutionBackwardDataAlgorithmEx, - cudnnConvolutionBwdDataAlgoPerf_t, - cudnnConvolutionDescriptor, - cudnnConvolutionDescriptor_t, - cudnnCreateConvolutionDescriptor, - cudnnSetConvolutionNdDescriptor, - cudnnDestroyConvolutionDescriptor, - cudnnConvolutionMode_t, - CUDNN_CONVOLUTION, # 0 - CUDNN_CROSS_CORRELATION, # 1 - cudnnActivationMode_t, - CUDNN_ACTIVATION_SIGMOID, # 0 - CUDNN_ACTIVATION_RELU, # 1 - CUDNN_ACTIVATION_TANH, # 2 - CUDNN_ACTIVATION_CLIPPED_RELU, # 3 - CUDNN_ACTIVATION_ELU, # 4 - CUDNN_ACTIVATION_IDENTITY, # 5 - cudnnNanPropagation_t, - CUDNN_NOT_PROPAGATE_NAN, # 0 - CUDNN_PROPAGATE_NAN, # 1 - cudnnMathType_t, - CUDNN_DEFAULT_MATH, # 0 - CUDNN_TENSOR_OP_MATH, # 1 - CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION, # 2 - CUDNN_FMA_MATH, # 3 - cudnnReorderType_t, - CUDNN_DEFAULT_REORDER, # 0 - CUDNN_NO_REORDER, # 1 - cudnnConvolutionFwdAlgo_t, - CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, # 0 - CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, # 1 - CUDNN_CONVOLUTION_FWD_ALGO_GEMM, # 2 - CUDNN_CONVOLUTION_FWD_ALGO_DIRECT, # 3 - CUDNN_CONVOLUTION_FWD_ALGO_FFT, # 4 - CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING, # 5 - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD, # 6 - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED, # 7 - CUDNN_CONVOLUTION_FWD_ALGO_COUNT, # 8 - cudnnConvolutionBwdFilterAlgo_t, - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0, # 0, /* non-deterministic */ - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, # 1, - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT, # 2, - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3, # 3, /* non-deterministic */ - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD, # 4, /* not implemented */ - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED, # 5, - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING, # 6, - 
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT, # 7 - cudnnConvolutionBwdDataAlgo_t, - CUDNN_CONVOLUTION_BWD_DATA_ALGO_0, # 0, /* non-deterministic */ - CUDNN_CONVOLUTION_BWD_DATA_ALGO_1, # 1, - CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT, # 2, - CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING, # 3, - CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD, # 4, - CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED, # 5, - CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT, # 6 - cudnnTensorFormat_t, - CUDNN_TENSOR_NCHW, # 0, /* row major (wStride = 1, hStride = w) */ - CUDNN_TENSOR_NHWC, # 1, /* feature maps interleaved ( cStride = 1 )*/ - CUDNN_TENSOR_NCHW_VECT_C, # 2, /* each image point is vector of element of C, vector length in data type */ - cudnnDataType, - convdims, - math_mode - -# XXX: disabled due to sporadic CI issue (JuliaGPU/CUDA.jl#/725) -false && @testset "cudnn/convolution" begin - T = Float32 - ax,aw,ab = randn(T,8,8,4,4),randn(T,3,3,4,4),randn(T,1,1,4,1) - cx,cw,cb = CuArray.((ax,aw,ab)) - - function convtest(; - blendz=false, - bias=nothing, - activation = CUDNN_ACTIVATION_IDENTITY, - mode = CUDNN_CONVOLUTION, - padding = 0, - stride = 1, - dilation = 1, - group = 1, - dataType = eltype(cx), - mathType = math_mode(), - reorderType = CUDNN_DEFAULT_REORDER, - alpha = 1, - beta = 0) - if group == 1 - cdims = NNlib.DenseConvDims(ax, aw; stride, padding, dilation, flipkernel = (mode === CUDNN_CROSS_CORRELATION)) - ay = NNlib.conv(ax, aw, cdims) - cw0 = cw - else - # Implement grouped convolution - xchan = size(aw,3)÷group - ychan = size(aw,4)÷group - xdims = (size(ax,1),size(ax,2),xchan,size(ax,4)) - wdims = (size(aw,1),size(aw,2),xchan,ychan) - cdims = NNlib.DenseConvDims(xdims, wdims; stride, padding, dilation, flipkernel = (mode === CUDNN_CROSS_CORRELATION)) - ay = nothing - for g in 1:group - xrange = 1+(g-1)*xchan:g*xchan - yrange = 1+(g-1)*ychan:g*ychan - ay0 = NNlib.conv(ax[:,:,xrange,:], aw[:,:,1:xchan,yrange], cdims) - ay = (ay === nothing ? ay0 : cat(ay, ay0; dims=3)) - end - cw0 = CuArray(aw[:,:,1:xchan,:]) - end - - if alpha != 1; ay = alpha * ay; end - if bias != nothing; ay = ay .+ Array(bias); end - - act = (activation === CUDNN_ACTIVATION_RELU ? NNlib.relu : - activation === CUDNN_ACTIVATION_IDENTITY ? identity : - error("Unsupported activation $activation")) - ay1 = act.(ay) - - az0 = randn(T,size(ay)...) - ay0 = randn(T,size(ay)...) 
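# --- editorial sketch (not part of the patch) ---------------------------------------------
# The `blendz` branch of this (now removed) convolution test exercises the fused
# bias/activation path; the host-side reference it builds amounts to
#     y = act.(alpha .* conv(x, w) .+ bias .+ beta .* z)
# i.e. beta scales the residual input z rather than the previous output. A compact sketch of
# that reference, assuming `import NNlib` as at the top of this file and an array-valued
# bias; `fused_ref` is a hypothetical name used only for illustration:
fused_ref(x, w, bias, z, cdims; act=identity, alpha=1, beta=0) =
    act.(alpha .* NNlib.conv(x, w, cdims) .+ bias .+ beta .* z)
# -------------------------------------------------------------------------------------------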
- cy0, cy1 = CuArray.((ay0,ay0)) - if blendz - cz0 = cz1 = CuArray(az0) - ay2 = act.(ay .+ beta * az0) - else - cz0, cz1 = cy0, cy1 - ay2 = act.(ay .+ beta * ay0) - end - - d = cudnnConvolutionDescriptor(convdims(padding,size(ax)), - convdims(stride,size(ax)), - convdims(dilation,size(ax)), mode, - cudnnDataType(dataType), mathType, reorderType, - Cint(group)) - @test ay1 ≈ cudnnConvolutionForward(cw0, cx; bias, activation, mode, padding, - stride, dilation, group, mathType, reorderType, - alpha) |> Array - @test ay1 ≈ cudnnConvolutionForward(cw0, cx, d; bias, activation, alpha) |> Array - @test ay2 ≈ cudnnConvolutionForward!(cy0, cw0, cx; z=cz0, bias, activation, mode, - padding, stride, dilation, group, mathType, - reorderType, alpha, beta) |> Array - @test ay2 ≈ cudnnConvolutionForward!(cy1, cw0, cx, d; z=cz1, bias, activation, - alpha, beta) |> Array - end - - # These call cudnnConvolutionForward - convtest() - convtest(padding=1) - convtest(stride=2) - convtest(dilation=2) - convtest(group=2) # See https://blog.yani.ai/filter-group-tutorial/ - convtest(mathType=CUDNN_DEFAULT_MATH) - convtest(mathType=CUDNN_TENSOR_OP_MATH) - convtest(mathType=CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION) - convtest(reorderType=CUDNN_NO_REORDER) - convtest(alpha=2) - convtest(beta=2) - - # These call cudnnConvolutionBiasActivationForward - convtest(bias=cb) - convtest(blendz=true) - convtest(activation=CUDNN_ACTIVATION_RELU) - convtest(bias=cb,blendz=true) - convtest(bias=cb,activation=CUDNN_ACTIVATION_RELU) - convtest(bias=cb,padding=1) - convtest(bias=cb,stride=2) - convtest(bias=cb,dilation=2) - convtest(bias=cb,group=2) - convtest(bias=cb,mathType=CUDNN_DEFAULT_MATH) - convtest(bias=cb,mathType=CUDNN_TENSOR_OP_MATH) - convtest(bias=cb,mathType=CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION) - convtest(bias=cb,reorderType=CUDNN_NO_REORDER) - convtest(bias=cb,alpha=2) - convtest(bias=cb,beta=2) - convtest(bias=cb,beta=2,blendz=true) - - # Test tensor format - cx2,cw2,cb2 = (x->permutedims(x,(3,1,2,4))).((cx,cw,cb)) - whcn = cudnnConvolutionForward(cw,cx) |> Array - cwhn = cudnnConvolutionForward(cw2,cx2,format=CUDNN_TENSOR_NHWC) |> Array - @test cwhn ≈ permutedims(whcn,(3,1,2,4)) - whcn = cudnnConvolutionForward(cw,cx;bias=cb) |> Array - cwhn = cudnnConvolutionForward(cw2,cx2;bias=cb2,format=CUDNN_TENSOR_NHWC) |> Array - @test cwhn ≈ permutedims(whcn,(3,1,2,4)) -end diff --git a/test/cudnn/dropout.jl b/test/cudnn/dropout.jl deleted file mode 100644 index 7faedbae0f..0000000000 --- a/test/cudnn/dropout.jl +++ /dev/null @@ -1,32 +0,0 @@ -using Statistics -using CUDA.CUDNN: - cudnnDropoutForward, - cudnnDropoutForward!, - cudnnDropoutBackward, - cudnnDropoutSeed, - cudnnDropoutDescriptor, - cudnnDropoutDescriptor_t, - cudnnCreateDropoutDescriptor, - cudnnSetDropoutDescriptor, - cudnnGetDropoutDescriptor, - cudnnRestoreDropoutDescriptor, - cudnnDestroyDropoutDescriptor, - cudnnDropoutGetStatesSize, - cudnnDropoutGetReserveSpaceSize - -@testset "cudnn/dropout" begin - @test cudnnDropoutDescriptor(C_NULL) isa cudnnDropoutDescriptor - @test Base.unsafe_convert(Ptr, cudnnDropoutDescriptor(C_NULL)) isa Ptr - @test cudnnDropoutDescriptor(0.5) isa cudnnDropoutDescriptor - - N,P = 1000, 0.7 - x = CUDA.rand(N) - d = cudnnDropoutDescriptor(P) - cudnnDropoutSeed[] = 1 - y = cudnnDropoutForward(x; dropout = P) |> Array - @test isapprox(mean(y.==0), P; atol = 3/sqrt(N)) - @test y == cudnnDropoutForward(x, d) |> Array - @test y == cudnnDropoutForward!(similar(x), x; dropout = P) |> Array - @test y == cudnnDropoutForward!(similar(x), 
x, d) |> Array - cudnnDropoutSeed[] = -1 -end diff --git a/test/cudnn/inplace.jl b/test/cudnn/inplace.jl deleted file mode 100644 index cac88579bb..0000000000 --- a/test/cudnn/inplace.jl +++ /dev/null @@ -1,30 +0,0 @@ -import CUDA.CUDNN: - cudnnSetTensor!, - cudnnScaleTensor!, - cudnnScaleTensor, - cudnnAddTensor!, - cudnnAddTensor, - CUDNN_TENSOR_NHWC - -@testset "cudnn/inplace" begin - x = CUDA.rand(10) - cudnnSetTensor!(x, 7) - @test all(isequal(7), Array(x)) - ax = rand(10) - cx = CuArray(ax) - @test 7*ax ≈ cudnnScaleTensor(cx, 7) |> Array - @test 7*ax ≈ cudnnScaleTensor!(similar(cx), cx, 7) |> Array - ax,ab = rand(5,4,3,2),rand(1,1,3,1) - cx,cb = CuArray.((ax,ab)) - @test ax .+ ab ≈ cudnnAddTensor(cx, cb) |> Array - @test ax .+ 7*ab ≈ cudnnAddTensor(cx, cb, alpha=7) |> Array - @test 7*ax .+ ab ≈ cudnnAddTensor(cx, cb, beta=7) |> Array - @test ax .+ ab ≈ cudnnAddTensor!(similar(cx), cx, cb) |> Array - @test ax .+ 7*ab ≈ cudnnAddTensor!(similar(cx), cx, cb, alpha=7) |> Array - @test 7*ax .+ ab ≈ cudnnAddTensor!(similar(cx), cx, cb, beta=7) |> Array - @test ax .+ ab ≈ cudnnAddTensor!(cx, cx, cb) |> Array - @test ax .+ ab ≈ cx |> Array - ax,ab = rand(3,5,4,2),rand(3,1,1,1) - cx,cb = CuArray.((ax,ab)) - @test ax .+ ab ≈ cudnnAddTensor(cx, cb, format=CUDNN_TENSOR_NHWC) |> Array -end diff --git a/test/cudnn/multiheadattn.jl b/test/cudnn/multiheadattn.jl deleted file mode 100644 index 5235053d97..0000000000 --- a/test/cudnn/multiheadattn.jl +++ /dev/null @@ -1,172 +0,0 @@ -using CUDA.CUDNN: - cudnnMultiHeadAttnForward, - cudnnMultiHeadAttnForward!, - cudnnMultiHeadAttnBackwardData, - cudnnMultiHeadAttnBackwardWeights, - cudnnGetMultiHeadAttnBuffers, - cudnnGetMultiHeadAttnWeights, - cudnnAttnDescriptor, - cudnnAttnDescriptor_t, - cudnnCreateAttnDescriptor, - cudnnDestroyAttnDescriptor, - cudnnSetAttnDescriptor, - cudnnGetAttnDescriptor, - cudnnDataType_t, - cudnnDropoutDescriptor_t, - cudnnAttnQueryMap_t, - CUDNN_ATTN_QUERYMAP_ALL_TO_ONE, # 0 /* multiple Q-s map to a single (K,V) set when beam size > 1, beam sizes for (K,V) = 1 */ - CUDNN_ATTN_QUERYMAP_ONE_TO_ONE, # (1U << 0) /* multiple Q-s map to multiple (K,V) sets when beam size > 1, beam sizes for (K,V) = beam size for (Q) */ - CUDNN_ATTN_DISABLE_PROJ_BIASES, # 0 /* no biases in attention input and output projections */ - CUDNN_ATTN_ENABLE_PROJ_BIASES, # (1U << 1) /* use biases in attention input and output projections */ - cudnnMultiHeadAttnWeightKind_t, - CUDNN_MH_ATTN_Q_WEIGHTS, # 0, /* input projection weights for 'queries' */ - CUDNN_MH_ATTN_K_WEIGHTS, # 1, /* input projection weights for 'keys' */ - CUDNN_MH_ATTN_V_WEIGHTS, # 2, /* input projection weights for 'values' */ - CUDNN_MH_ATTN_O_WEIGHTS, # 3, /* output projection weights */ - CUDNN_MH_ATTN_Q_BIASES, # 4, /* input projection bias tensor for 'queries' */ - CUDNN_MH_ATTN_K_BIASES, # 5, /* input projection bias for 'keys' */ - CUDNN_MH_ATTN_V_BIASES, # 6, /* input projection bias for 'values' */ - CUDNN_MH_ATTN_O_BIASES, # 7, /* output projection biases */ - cudnnMathType_t, - CUDNN_DEFAULT_MATH, # 0, - CUDNN_TENSOR_OP_MATH, # 1, - CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION, # 2, - CUDNN_FMA_MATH, # 3, - cudnnWgradMode_t, - CUDNN_WGRAD_MODE_ADD, # 0, - CUDNN_WGRAD_MODE_SET, # 1, - cudnnSeqDataDescriptor, - cudnnSeqDataDescriptor_t, - cudnnCreateSeqDataDescriptor, - cudnnDestroySeqDataDescriptor, - cudnnSetSeqDataDescriptor, - cudnnGetSeqDataDescriptor, - cudnnSeqDataAxis_t, - CUDNN_SEQDATA_TIME_DIM, # 0, /* index in time */ - CUDNN_SEQDATA_BATCH_DIM, # 1, /* index in batch */ - 
CUDNN_SEQDATA_BEAM_DIM, # 2, /* index in beam */ - CUDNN_SEQDATA_VECT_DIM, # 3 /* index in vector */ - CUDNN_SEQDATA_DIM_COUNT, # 4 - cudnnDataType, - cudnnSeqDataDefaultAxes, - math_mode, - sdim - -@testset "cudnn/multiheadattn" begin - function mhatest(; - # Input tensor descriptors - axes::Vector{cudnnSeqDataAxis_t} = cudnnSeqDataDefaultAxes, - seqLengthsQO::Vector{<:Integer} = fill(Cint(sdim(queries,axes,CUDNN_SEQDATA_TIME_DIM)), sdim(queries,axes,CUDNN_SEQDATA_BATCH_DIM)*sdim(queries,axes,CUDNN_SEQDATA_BEAM_DIM)), - seqLengthsKV::Vector{<:Integer} = fill(Cint(sdim(keys,axes,CUDNN_SEQDATA_TIME_DIM)), sdim(keys,axes,CUDNN_SEQDATA_BATCH_DIM)*sdim(keys,axes,CUDNN_SEQDATA_BEAM_DIM)), - #devSeqLengthsQO::CuVector{Cint} = convert(CuVector{Cint}, seqLengthsQO), - #devSeqLengthsKV::CuVector{Cint} = convert(CuVector{Cint}, seqLengthsKV), - #qDesc::cudnnSeqDataDescriptor = cudnnSeqDataDescriptor(queries; axes, seqLengthArray=seqLengthsQO), - #kDesc::cudnnSeqDataDescriptor = cudnnSeqDataDescriptor(keys; axes, seqLengthArray=seqLengthsKV), - #vDesc::cudnnSeqDataDescriptor = cudnnSeqDataDescriptor(values; axes, seqLengthArray=seqLengthsKV), - - # attnDesc parameters - attnMode::Unsigned = CUDNN_ATTN_QUERYMAP_ALL_TO_ONE | CUDNN_ATTN_DISABLE_PROJ_BIASES |> Cuint, - nHeads::Integer = Cint(1), - smScaler::Real = Cdouble(1), - # dataType::DataType = eltype(queries), - # computePrec::DataType = eltype(queries), ## No other option according to 8.0.2 - mathType::cudnnMathType_t = math_mode(), - # attnDropout::Real = 0, ## The dropout option is currently not supported by the multi-head attention API - # postDropout::Real = 0, ## The dropout option is currently not supported by the multi-head attention API - qProjSize::Integer = 0, # Use zero to disable the corresponding projection - kProjSize::Integer = 0, - vProjSize::Integer = 0, - oProjSize::Integer = 0, - qoMaxSeqLength::Integer = sdim(queries,axes,CUDNN_SEQDATA_TIME_DIM), - kvMaxSeqLength::Integer = sdim(keys,axes,CUDNN_SEQDATA_TIME_DIM), - maxBatchSize::Integer = sdim(queries,axes,CUDNN_SEQDATA_BATCH_DIM), - maxBeamSize::Integer = sdim(queries,axes,CUDNN_SEQDATA_BEAM_DIM), - - # forw parameters - residuals = nothing, - currIdx::Integer = -1, - loWinIdx::Array{Cint} = fill(Cint(0), qoMaxSeqLength), - hiWinIdx::Array{Cint} = fill(Cint(kvMaxSeqLength), qoMaxSeqLength), - #workspace::Union{CuArray,Nothing} = nothing, - #reserveSpace::Union{CuArray,Nothing} = nothing, - ) - attnDesc::cudnnAttnDescriptor = cudnnAttnDescriptor( - Cuint(attnMode), - Cint(nHeads), - Cdouble(smScaler), - cudnnDataType(eltype(queries)), # dataType - cudnnDataType(eltype(queries)), # computePrec - mathType, - C_NULL, # attnDropout - C_NULL, # postDropout - Cint(sdim(queries,axes,CUDNN_SEQDATA_VECT_DIM)), # qSize - Cint(sdim(keys, axes,CUDNN_SEQDATA_VECT_DIM)), # kSize - Cint(sdim(values, axes,CUDNN_SEQDATA_VECT_DIM)), # vSize - Cint(qProjSize), - Cint(kProjSize), - Cint(vProjSize), - Cint(oProjSize), - Cint(qoMaxSeqLength), - Cint(kvMaxSeqLength), - Cint(maxBatchSize), - Cint(maxBeamSize) - ) - y = cudnnMultiHeadAttnForward(weights, queries, keys, values; axes, seqLengthsQO, - seqLengthsKV, attnMode, nHeads, smScaler, mathType, - qProjSize, kProjSize, vProjSize, oProjSize, - qoMaxSeqLength, kvMaxSeqLength, maxBatchSize, - maxBeamSize, residuals, currIdx, loWinIdx, hiWinIdx) - @test Array(y) ≈ cudnnMultiHeadAttnForward!(zero(y), weights, queries, keys, values; axes, - seqLengthsQO, seqLengthsKV, attnMode, nHeads, - smScaler, mathType, qProjSize, kProjSize, - vProjSize, oProjSize, 
qoMaxSeqLength, - kvMaxSeqLength, maxBatchSize, maxBeamSize, - residuals, currIdx, loWinIdx, hiWinIdx) |> Array - @test Array(y) ≈ cudnnMultiHeadAttnForward(weights, queries, keys, values, attnDesc; - axes, seqLengthsQO, seqLengthsKV, residuals, - currIdx, loWinIdx, hiWinIdx) |> Array - @test Array(y) ≈ cudnnMultiHeadAttnForward!(zero(y), weights, queries, keys, values, attnDesc; - axes, seqLengthsQO, seqLengthsKV, residuals, - currIdx, loWinIdx, hiWinIdx) |> Array - end - - Q,K,V,B,T,F = 6,6,5,4,3,Float32 - - weights, queries, keys, values = (CUDA.randn(x...) for x in ((F,100),(F,Q,B,T),(F,K,B,T),(F,V,B,T))) - mhatest() - mhatest(attnMode = CUDNN_ATTN_QUERYMAP_ALL_TO_ONE | CUDNN_ATTN_ENABLE_PROJ_BIASES |> Cuint, vProjSize=7) - mhatest(seqLengthsQO = Cint[1,2,3,1]) - mhatest(seqLengthsKV = Cint[1,2,3,1]) - mhatest(nHeads = 2) - mhatest(smScaler = 2) - mhatest(mathType = CUDNN_DEFAULT_MATH) - mhatest(mathType = CUDNN_TENSOR_OP_MATH) - mhatest(mathType = CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION) - mhatest(mathType = CUDNN_FMA_MATH) - mhatest(kProjSize = 7, qProjSize = 7) # k and q have to match - mhatest(vProjSize = 7) - mhatest(oProjSize = 7) - mhatest(qoMaxSeqLength = 7) - mhatest(kvMaxSeqLength = 7) - mhatest(maxBatchSize = 7) - mhatest(maxBeamSize = 7) - mhatest(loWinIdx = fill(Cint(1),T)) - mhatest(hiWinIdx = fill(Cint(1),T)) - mhatest(currIdx = 0) - - # Test residuals: residuals and output (and thus values unless oProjSize>0) must match queries in vector size - values, residuals = (CUDA.randn(x...) for x in ((F,Q,B,T),(F,Q,B,T))) - mhatest(residuals = residuals) - - # Test nonstandard axes order - weights, queries, keys, values = (CUDA.randn(x...) for x in ((F,100),(F,Q,T,B),(F,K,T,B),(F,V,T,B))) - mhatest(axes = [CUDNN_SEQDATA_VECT_DIM, CUDNN_SEQDATA_TIME_DIM, CUDNN_SEQDATA_BATCH_DIM, CUDNN_SEQDATA_BEAM_DIM]) - - # Test beam handling - weights, queries, keys, values = (CUDA.randn(x...) for x in ((F,100),(F,Q,B,T,2),(F,K,B,T,1),(F,V,B,T,1))) - mhatest() - - # CUDNN_ATTN_QUERYMAP_ONE_TO_ONE does not seem to be supported - #weights, queries, keys, values = (CUDA.randn(x...) for x in ((F,100),(F,Q,B,T,M),(F,K,B,T,M),(F,V,B,T,M))) - #mhatest(attnMode = CUDNN_ATTN_QUERYMAP_ONE_TO_ONE | CUDNN_ATTN_DISABLE_PROJ_BIASES |> Cuint) -end diff --git a/test/cudnn/normalization.jl b/test/cudnn/normalization.jl deleted file mode 100644 index a37f0c301c..0000000000 --- a/test/cudnn/normalization.jl +++ /dev/null @@ -1,119 +0,0 @@ -using Statistics - -using CUDA.CUDNN: - cudnnNormalizationForward, - cudnnNormalizationForward!, - cudnnNormalizationForwardInference, - cudnnNormalizationForwardTraining, - cudnnNormalizationBackward, - cudnnActivationDescriptor, - cudnnNormMode_t, - CUDNN_NORM_PER_ACTIVATION, # 0, bnScale, bnBias tensor dims are 1xCxHxWx.. 
(one value per CHW...-slice, normalized over N slice) - CUDNN_NORM_PER_CHANNEL, # 1, bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) - cudnnNormOps_t, - CUDNN_NORM_OPS_NORM, # 0, /* do normalization only */ - CUDNN_NORM_OPS_NORM_ACTIVATION, # 1, /* do Norm, then activation */ - CUDNN_NORM_OPS_NORM_ADD_ACTIVATION, # 2, /* do Norm, then elemWiseAdd, then activation */ - cudnnNormAlgo_t, - CUDNN_NORM_ALGO_STANDARD, # 0 - CUDNN_NORM_ALGO_PERSIST, # 1 - cudnnActivationMode_t, - CUDNN_ACTIVATION_SIGMOID, # 0 - CUDNN_ACTIVATION_RELU, # 1 - CUDNN_ACTIVATION_TANH, # 2 - CUDNN_ACTIVATION_CLIPPED_RELU, # 3 - CUDNN_ACTIVATION_ELU, # 4 - CUDNN_ACTIVATION_IDENTITY, # 5 - cudnnNanPropagation_t, - CUDNN_NOT_PROPAGATE_NAN, # 0 - CUDNN_PROPAGATE_NAN, # 1 - cudnnTensorFormat_t, - CUDNN_TENSOR_NCHW, # 0, /* row major (wStride = 1, hStride = w) */ - CUDNN_TENSOR_NHWC, # 1, /* feature maps interleaved ( cStride = 1 )*/ - CUDNN_TENSOR_NCHW_VECT_C # 2, /* each image point is vector of element of C, vector length in data type */ - - -@testset "cudnn/normalization" begin - - function normtest( - x; - - training = false, - - # Inference parameters: - z = nothing, # for residual addition to the result of the normalization operation, prior to the activation - mode::cudnnNormMode_t = CUDNN_NORM_PER_CHANNEL, # Per-channel layer is based on the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift, S. Ioffe, C. Szegedy, 2015. - normOps::cudnnNormOps_t = CUDNN_NORM_OPS_NORM, # Currently CUDNN_NORM_OPS_NORM_ACTIVATION and CUDNN_NORM_OPS_NORM_ADD_ACTIVATION are only supported in the NHWC layout (training,backward), not supported (inference) - algo::cudnnNormAlgo_t = CUDNN_NORM_ALGO_STANDARD, # trigger the new semi-persistent NHWC kernel when CUDNN_NORM_ALGO_PERSIST - alpha::Real = 1, - beta::Real = 0, - epsilon::Real = 1e-5, # Has to be >= 0. Should be the same in forward and backward functions. - groupCnt::Integer = 1, # Place hold for future work, should be set to 1 now - - # Main argument defaults: - format::cudnnTensorFormat_t = CUDNN_TENSOR_NCHW, # or NHWC - _sdims = (mode == CUDNN_NORM_PER_CHANNEL && format == CUDNN_TENSOR_NCHW ? (1,1,size(x,3),1) : - mode == CUDNN_NORM_PER_CHANNEL && format == CUDNN_TENSOR_NHWC ? (size(x,1),1,1,1) : - mode == CUDNN_NORM_PER_ACTIVATION && format == CUDNN_TENSOR_NCHW ? (size(x)[1:3]...,1) : - mode == CUDNN_NORM_PER_ACTIVATION && format == CUDNN_TENSOR_NHWC ? (size(x)[1:3]...,1) : - error("Unknown mode $mode and format $format")), - scale = fill!(similar(x, _sdims), 1), - bias = fill!(similar(x, _sdims), 0), - xmean = fill!(similar(x, _sdims), 0), - xvar = fill!(similar(x, _sdims), 1), - - # Training-only parameters: - exponentialAverageFactor::Real = 0.1, - savedMean = nothing, # Optionally save intermediate results from the forward pass here - can be reused to speed up backward pass. NULL if unused. - savedInvVariance = nothing, - - # Activation parameters: - activationMode::cudnnActivationMode_t = CUDNN_ACTIVATION_IDENTITY, - activationReluNanOpt::cudnnNanPropagation_t = CUDNN_NOT_PROPAGATE_NAN, - activationCoef::Real = 1, - activationDesc::Union{Nothing,cudnnActivationDescriptor} = (normOps == CUDNN_NORM_OPS_NORM ? 
nothing : cudnnActivationDescriptor(activationMode, activationReluNanOpt, Cdouble(activationCoef))), - ) - if training - dims = findall(size(xmean) .== 1) - m = mean(x; dims) - v = var(x; dims, mean=m, corrected=false) - y = bias .+ scale .* (x .- m) ./ sqrt.(epsilon .+ v) - else - y = bias .+ scale .* (x .- xmean) ./ sqrt.(epsilon .+ xvar) - end - y0 = randn!(similar(x)) - y1 = alpha * y - y2 = y1 + beta * y0 - @test Array(y1) ≈ cudnnNormalizationForward(x, xmean, xvar, bias, scale; training, z, mode, - normOps, algo, alpha, epsilon, groupCnt, - format, exponentialAverageFactor, savedMean, - savedInvVariance, activationDesc) |> Array - @test Array(y2) ≈ cudnnNormalizationForward!(copy(y0), x, xmean, xvar, bias, scale; - training, z, mode, normOps, algo, alpha, beta, - epsilon, groupCnt, format, - exponentialAverageFactor, savedMean, - savedInvVariance, activationDesc) |> Array - end - - x, z, s = (CUDA.randn(x...) for x in ((5,4,3,2),(5,4,3,2),(1,1,3,1))) - normtest(x) - normtest(x; training = true) - normtest(x; mode = CUDNN_NORM_PER_ACTIVATION) - normtest(x; algo = CUDNN_NORM_ALGO_PERSIST) - normtest(x; algo = CUDNN_NORM_ALGO_PERSIST, format = CUDNN_TENSOR_NHWC) - normtest(x; alpha = 2) - normtest(x; beta = 2) - normtest(x; epsilon = 0) - normtest(x; format = CUDNN_TENSOR_NHWC) - normtest(x; scale = fill!(s, 2)) - normtest(x; bias = fill!(s, 2)) - normtest(x; xmean = fill!(s, 2)) - normtest(x; xvar = fill!(s, 2)) - normtest(x; exponentialAverageFactor = 0.01) - normtest(x; savedMean = similar(s)) - normtest(x; savedInvVariance = similar(s)) - # cudnn-8.0.5: Currently, CUDNN_NORM_OPS_NORM_ACTIVATION and CUDNN_NORM_OPS_NORM_ADD_ACTIVATION are not supported in inference. - #normtest(x; normOps = CUDNN_NORM_OPS_NORM_ACTIVATION, activationMode = CUDNN_ACTIVATION_RELU, format = CUDNN_TENSOR_NHWC) - #normtest(x; normOps = CUDNN_NORM_OPS_NORM_ADD_ACTIVATION, activationMode = CUDNN_ACTIVATION_RELU, z, format = CUDNN_TENSOR_NHWC) - #normtest(x; groupCnt = 2) # cudnn-8.0.5: Currently only groupCnt=1 is supported -end diff --git a/test/cudnn/optensor.jl b/test/cudnn/optensor.jl deleted file mode 100644 index 5177364275..0000000000 --- a/test/cudnn/optensor.jl +++ /dev/null @@ -1,63 +0,0 @@ -using CUDA.CUDNN: - cudnnOpTensor, - cudnnOpTensor!, - cudnnOpTensorDescriptor, - cudnnOpTensorDescriptor_t, - cudnnCreateOpTensorDescriptor, - cudnnSetOpTensorDescriptor, - cudnnGetOpTensorDescriptor, - cudnnDestroyOpTensorDescriptor, - cudnnOpTensorOp_t, - CUDNN_OP_TENSOR_ADD, # 0, - CUDNN_OP_TENSOR_MUL, # 1, - CUDNN_OP_TENSOR_MIN, # 2, - CUDNN_OP_TENSOR_MAX, # 3, - CUDNN_OP_TENSOR_SQRT, # 4, performed only on first arg - CUDNN_OP_TENSOR_NOT, # 5, performed only on first arg - cudnnNanPropagation_t, - CUDNN_NOT_PROPAGATE_NAN, # 0 - CUDNN_PROPAGATE_NAN, # 1 - cudnnDataType - -@testset "cudnn/optensor" begin - @test cudnnOpTensorDescriptor(C_NULL) isa cudnnOpTensorDescriptor - @test Base.unsafe_convert(Ptr, cudnnOpTensorDescriptor(C_NULL)) isa Ptr - @test cudnnOpTensorDescriptor(CUDNN_OP_TENSOR_ADD,cudnnDataType(Float32),CUDNN_NOT_PROPAGATE_NAN) isa cudnnOpTensorDescriptor - - (ax1,ax2,ay) = rand.((10,10,10)) - (cx1,cx2,cy) = CuArray.((ax1,ax2,ay)) - - function optensortest(; - op=CUDNN_OP_TENSOR_ADD, - nanOpt=CUDNN_NOT_PROPAGATE_NAN, - compType=(eltype(ax1) <: Float64 ? Float64 : Float32), - alpha1=1, - alpha2=1, - beta=0, - ) - f1 = (op === CUDNN_OP_TENSOR_ADD ? alpha1*ax1 .+ alpha2*ax2 : - op === CUDNN_OP_TENSOR_MUL ? (alpha1*ax1) .* (alpha2*ax2) : - op === CUDNN_OP_TENSOR_MIN ? 
min.(alpha1*ax1, alpha2*ax2) : - op === CUDNN_OP_TENSOR_MAX ? max.(alpha1*ax1, alpha2*ax2) : - op === CUDNN_OP_TENSOR_SQRT ? sqrt.(alpha1*ax1) : - op === CUDNN_OP_TENSOR_NOT ? 1 .- ax1 : - error("Unknown optensor")) - f2 = f1 .+ beta * ay - d = cudnnOpTensorDescriptor(op,cudnnDataType(compType),nanOpt) - @test f1 ≈ cudnnOpTensor(cx1, cx2; op, compType, nanOpt, alpha1, alpha2) |> Array - @test f1 ≈ cudnnOpTensor(cx1, cx2, d; alpha1, alpha2) |> Array - @test f2 ≈ cudnnOpTensor!(copy(cy), cx1, cx2; op, compType, nanOpt, alpha1, alpha2, beta) |> Array - @test f2 ≈ cudnnOpTensor!(copy(cy), cx1, cx2, d; alpha1, alpha2, beta) |> Array - end - - optensortest(op = CUDNN_OP_TENSOR_ADD) - optensortest(op = CUDNN_OP_TENSOR_MUL) - optensortest(op = CUDNN_OP_TENSOR_MIN) - optensortest(op = CUDNN_OP_TENSOR_MAX) - optensortest(op = CUDNN_OP_TENSOR_SQRT) - optensortest(op = CUDNN_OP_TENSOR_NOT) - optensortest(nanOpt = CUDNN_PROPAGATE_NAN) - optensortest(alpha1 = 2) - optensortest(alpha2 = 2) - optensortest(beta = 2) -end diff --git a/test/cudnn/pooling.jl b/test/cudnn/pooling.jl deleted file mode 100644 index 6f5a9e86b7..0000000000 --- a/test/cudnn/pooling.jl +++ /dev/null @@ -1,93 +0,0 @@ -using Test, CUDA, Random -import NNlib -using CUDA.CUDNN: - cudnnPoolingForward, - cudnnPoolingForward!, - cudnnPoolingBackward, - cudnnGetPoolingNdForwardOutputDim, - cudnnPoolingDescriptor, - cudnnPoolingDescriptor_t, - cudnnCreatePoolingDescriptor, - cudnnSetPoolingNdDescriptor, - cudnnDestroyPoolingDescriptor, - cudnnPoolingMode_t, - CUDNN_POOLING_MAX, # 0, - CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING, # 1, /* count for average includes padded values */ - CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING, # 2, /* count for average does not include padded values */ - CUDNN_POOLING_MAX_DETERMINISTIC, # 3 - cudnnNanPropagation_t, - CUDNN_NOT_PROPAGATE_NAN, # 0 - CUDNN_PROPAGATE_NAN, # 1 - cudnnTensorFormat_t, - CUDNN_TENSOR_NCHW, # 0, /* row major (wStride = 1, hStride = w) */ - CUDNN_TENSOR_NHWC, # 1, /* feature maps interleaved ( cStride = 1 )*/ - CUDNN_TENSOR_NCHW_VECT_C, # 2, /* each image point is vector of element of C, vector length in data type */ - pooldims - -@testset "cudnn/pooling" begin - function pooltest(; - mode = CUDNN_POOLING_MAX, - nanOpt = CUDNN_NOT_PROPAGATE_NAN, - window = 2, - padding = 0, - stride = window, - format = CUDNN_TENSOR_NCHW, - dataType = Float32, - alpha = 1, - beta = 0) - ax = randn(dataType,12,6,4,2) - N = ndims(ax) - window = expand(Val(N-2), window) - stride = expand(Val(N-2), stride) - padding = expand(Val(N-2), padding) - pdims = NNlib.PoolDims(ax, window; padding = padding, stride = stride) - #= - if mode == CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING - @warn "Pool mode=$mode not yet implemented in NNlib, using INCLUDE instead. See https://github.com/FluxML/NNlib.jl/issues/218" maxlog=1 - end - if mode == CUDNN_POOLING_MAX_DETERMINISTIC - @warn "Pool mode=$mode not yet implemented in NNlib, using MAX instead." maxlog=1 - end - if nanOpt == CUDNN_NOT_PROPAGATE_NAN - @warn "Pool nanOpt=$nanOpt not yet implemented in NNlib, using PROPAGATE instead. See https://github.com/FluxML/NNlib.jl/issues/218" maxlog=1 - end - =# - ay1 = (mode == CUDNN_POOLING_MAX ? NNlib.maxpool(ax, pdims) : - mode == CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING ? NNlib.meanpool(ax, pdims) : - mode == CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING ? NNlib.meanpool(ax, pdims) : - mode == CUDNN_POOLING_MAX_DETERMINISTIC ? 
NNlib.maxpool(ax, pdims) : - error("mode=$mode is not supported.")) - ay1 = alpha * ay1 - ay = randn!(similar(ay1)) - ay2 = ay1 .+ beta * ay - d = cudnnPoolingDescriptor(mode, nanOpt, Cint(max(2,ndims(ax)-2)), pooldims(window,size(ax)), pooldims(padding,size(ax)), pooldims(stride,size(ax))) - nhwc(a) = permutedims(a,(3,1,2,4)) - if format === CUDNN_TENSOR_NCHW - cx, cy = CuArray.((ax, ay)) - else - cx, cy = CuArray.(nhwc.((ax,ay))) - ay1, ay2 = nhwc.((ay1, ay2)) - end - @test ay1 ≈ cudnnPoolingForward(cx; mode, nanOpt, window, padding, stride, format, alpha) |> Array - @test ay1 ≈ cudnnPoolingForward(cx, d; format, alpha) |> Array - @test ay2 ≈ cudnnPoolingForward!(copy(cy), cx; mode, nanOpt, window, padding, stride, format, alpha, beta) |> Array - @test ay2 ≈ cudnnPoolingForward!(copy(cy), cx, d; format, alpha, beta) |> Array - end - - expand(::Val{N}, i::NTuple{N}) where {N} = i - expand(::Val{N}, i::Integer) where {N} = ntuple(_ -> i, N) - - - pooltest() - pooltest(mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING) - pooltest(mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING) - pooltest(mode = CUDNN_POOLING_MAX_DETERMINISTIC) - pooltest(nanOpt = CUDNN_PROPAGATE_NAN) - pooltest(window = 3) - pooltest(padding = 1) - pooltest(stride = 1) - pooltest(format = CUDNN_TENSOR_NHWC) - pooltest(dataType = Float16) - pooltest(alpha = 2) - pooltest(beta = 2) -end diff --git a/test/cudnn/reduce.jl b/test/cudnn/reduce.jl deleted file mode 100644 index 2e9c5830c5..0000000000 --- a/test/cudnn/reduce.jl +++ /dev/null @@ -1,86 +0,0 @@ -using Statistics -using CUDA.CUDNN: - cudnnReduceTensor, - cudnnReduceTensor!, - cudnnGetReductionIndicesSize, - cudnnGetReductionWorkspaceSize, - cudnnReduceTensorDescriptor, - cudnnReduceTensorDescriptor_t, - cudnnCreateReduceTensorDescriptor, - cudnnSetReduceTensorDescriptor, - cudnnGetReduceTensorDescriptor, - cudnnDestroyReduceTensorDescriptor, - cudnnReduceTensorOp_t, - CUDNN_REDUCE_TENSOR_ADD, # 0, - CUDNN_REDUCE_TENSOR_MUL, # 1, - CUDNN_REDUCE_TENSOR_MIN, # 2, - CUDNN_REDUCE_TENSOR_MAX, # 3, - CUDNN_REDUCE_TENSOR_AMAX, # 4, - CUDNN_REDUCE_TENSOR_AVG, # 5, - CUDNN_REDUCE_TENSOR_NORM1, # 6, - CUDNN_REDUCE_TENSOR_NORM2, # 7, - CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS, # 8, - cudnnNanPropagation_t, - CUDNN_NOT_PROPAGATE_NAN, # 0 - CUDNN_PROPAGATE_NAN, # 1 - cudnnReduceTensorIndices, - cudnnReduceTensorIndices_t, - CUDNN_REDUCE_TENSOR_NO_INDICES, # 0, - CUDNN_REDUCE_TENSOR_FLATTENED_INDICES, # 1, - cudnnIndicesType, - cudnnIndicesType_t, - CUDNN_32BIT_INDICES, # 0, - CUDNN_64BIT_INDICES, # 1, - CUDNN_16BIT_INDICES, # 2, - CUDNN_8BIT_INDICES, # 3, - cudnnDataType - -@testset "cudnn/reduce" begin - @test cudnnReduceTensorDescriptor(C_NULL) isa cudnnReduceTensorDescriptor - @test Base.unsafe_convert(Ptr, cudnnReduceTensorDescriptor(C_NULL)) isa Ptr - @test cudnnReduceTensorDescriptor(CUDNN_REDUCE_TENSOR_ADD,cudnnDataType(Float32),CUDNN_NOT_PROPAGATE_NAN,CUDNN_REDUCE_TENSOR_NO_INDICES,CUDNN_32BIT_INDICES) isa cudnnReduceTensorDescriptor - - (ax,ay) = randn(Float32,10,10), randn(Float32,10,1) - (cx,cy) = CuArray.((ax,ay)) - - function reducetensortest(; - op::cudnnReduceTensorOp_t = CUDNN_REDUCE_TENSOR_ADD, - compType::DataType = (eltype(ax) <: Float64 ? 
Float64 : Float32), - nanOpt::cudnnNanPropagation_t = CUDNN_NOT_PROPAGATE_NAN, - indices::Union{Vector{<:Unsigned},Nothing} = nothing, - d::cudnnReduceTensorDescriptor = cudnnReduceTensorDescriptor(op, cudnnDataType(compType), nanOpt, cudnnReduceTensorIndices(op, indices), cudnnIndicesType(indices)), - alpha::Real = 1, - beta::Real = 0, - ) - f0 = (op === CUDNN_REDUCE_TENSOR_ADD ? sum(ax, dims=2) : - op === CUDNN_REDUCE_TENSOR_MUL ? prod(ax, dims=2) : - op === CUDNN_REDUCE_TENSOR_MIN ? minimum(ax, dims=2) : - op === CUDNN_REDUCE_TENSOR_MAX ? maximum(ax, dims=2) : - op === CUDNN_REDUCE_TENSOR_AMAX ? maximum(abs, ax, dims=2) : - op === CUDNN_REDUCE_TENSOR_AVG ? mean(ax, dims=2) : - op === CUDNN_REDUCE_TENSOR_NORM1 ? sum(abs, ax, dims=2) : - op === CUDNN_REDUCE_TENSOR_NORM2 ? sqrt.(sum(abs2, ax, dims=2)) : - op === CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS ? (ax1=copy(ax);ax1[ax.==0].=1;prod(ax1,dims=2)) : - error("Unknown reducetensor")) - f1 = alpha * f0 - f2 = f1 + beta * ay - dims = size(ay) - @test f1 ≈ cudnnReduceTensor(cx; dims, op, compType, nanOpt, indices, alpha) |> Array - @test f1 ≈ cudnnReduceTensor(cx, d; dims, indices, alpha) |> Array - @test f2 ≈ cudnnReduceTensor!(copy(cy), cx; op, compType, nanOpt, indices, alpha, beta) |> Array - @test f2 ≈ cudnnReduceTensor!(copy(cy), cx, d; indices, alpha, beta) |> Array - end - - reducetensortest() - reducetensortest(op = CUDNN_REDUCE_TENSOR_MUL) - reducetensortest(op = CUDNN_REDUCE_TENSOR_MIN) - reducetensortest(op = CUDNN_REDUCE_TENSOR_MAX) - reducetensortest(op = CUDNN_REDUCE_TENSOR_AMAX) - reducetensortest(op = CUDNN_REDUCE_TENSOR_AVG) - reducetensortest(op = CUDNN_REDUCE_TENSOR_NORM1) - reducetensortest(op = CUDNN_REDUCE_TENSOR_NORM2) - reducetensortest(op = CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS) - reducetensortest(nanOpt = CUDNN_PROPAGATE_NAN) - reducetensortest(alpha = 2) - reducetensortest(beta = 2) -end diff --git a/test/cudnn/rnn.jl b/test/cudnn/rnn.jl deleted file mode 100644 index 67c26f6650..0000000000 --- a/test/cudnn/rnn.jl +++ /dev/null @@ -1,144 +0,0 @@ -using CUDA.CUDNN: - cudnnRNNForward, - cudnnRNNForward!, - cudnnRNNBackwardData_v8, - cudnnRNNBackwardWeights_v8, - cudnnRNNDescriptor, - cudnnRNNDescriptor_t, - cudnnSetRNNDescriptor_v8, - cudnnGetRNNWeightSpaceSize, - cudnnGetRNNTempSpaceSizes, - cudnnRNNAlgo_t, - CUDNN_RNN_ALGO_STANDARD, # 0, robust performance across a wide range of network parameters - CUDNN_RNN_ALGO_PERSIST_STATIC, # 1, fast when the first dimension of the input tensor is small (meaning, a small minibatch), cc>=6.0 - CUDNN_RNN_ALGO_PERSIST_DYNAMIC, # 2, similar to static, optimize using the specific parameters of the network and active GPU, cc>=6.0 - CUDNN_RNN_ALGO_COUNT, # 3 - cudnnRNNMode_t, - CUDNN_RNN_RELU, # 0, /* basic RNN cell type with ReLu activation */ - CUDNN_RNN_TANH, # 1, /* basic RNN cell type with tanh activation */ - CUDNN_LSTM, # 2, /* LSTM with optional recurrent projection and clipping */ - CUDNN_GRU, # 3, /* Using h' = tanh(r * Uh(t-1) + Wx) and h = (1 - z) * h' + z * h(t-1); */ - cudnnRNNBiasMode_t, - CUDNN_RNN_NO_BIAS, # 0, /* rnn cell formulas do not use biases */ - CUDNN_RNN_SINGLE_INP_BIAS, # 1, /* rnn cell formulas use one input bias in input GEMM */ - CUDNN_RNN_DOUBLE_BIAS, # 2, /* default, rnn cell formulas use two bias vectors */ - CUDNN_RNN_SINGLE_REC_BIAS, # 3 /* rnn cell formulas use one recurrent bias in recurrent GEMM */ - cudnnDirectionMode_t, - CUDNN_UNIDIRECTIONAL, # 0, /* single direction network */ - CUDNN_BIDIRECTIONAL, # 1, /* output concatination at each layer */ - 
cudnnRNNInputMode_t, - CUDNN_LINEAR_INPUT, # 0, /* adjustable weight matrix in first layer input GEMM */ - CUDNN_SKIP_INPUT, # 1, /* fixed identity matrix in the first layer input GEMM */ - cudnnMathType_t, - CUDNN_DEFAULT_MATH, # 0, - CUDNN_TENSOR_OP_MATH, # 1, - CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION, # 2, - CUDNN_FMA_MATH, # 3, - #/* For auxFlags in cudnnSetRNNDescriptor_v8() and cudnnSetRNNPaddingMode() */ - CUDNN_RNN_PADDED_IO_DISABLED, # 0 - CUDNN_RNN_PADDED_IO_ENABLED, # (1U << 0) - cudnnForwardMode_t, - CUDNN_FWD_MODE_INFERENCE, # 0 - CUDNN_FWD_MODE_TRAINING, # 1 - cudnnRNNDataDescriptor_t, - cudnnSetRNNDataDescriptor, - cudnnRNNDataLayout_t, - CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED, # 0, /* padded, outer stride from one time-step to the next */ - CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, # 1, /* sequence length sorted and packed as in basic RNN api */ - CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED, # 2, /* padded, outer stride from one batch to the next */ - cudnnWgradMode_t, - CUDNN_WGRAD_MODE_ADD, # 0, /* add partial gradients to wgrad output buffers */ - CUDNN_WGRAD_MODE_SET, # 1, /* write partial gradients to wgrad output buffers */ - cudnnTensorDescriptor, - cudnnDropoutDescriptor, - cudnnDataType, - math_mode - -@testset "cudnn/rnn" begin - X,H,B,T = 8,8,4,2 - w = CUDA.randn(10000) - x = CUDA.randn(X,B,T) - hx1 = CUDA.randn(H,B,1) - cx1 = CUDA.randn(H,B,1) - - function rnntest(; - hx = nothing, - cx = nothing, - hy = nothing, - cy = nothing, - layout::cudnnRNNDataLayout_t = CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED, - seqLengthArray::Union{Nothing,Vector{Cint}} = nothing, - fwdMode::cudnnForwardMode_t = CUDNN_FWD_MODE_INFERENCE, - # descriptor keywords - hiddenSize::Integer = H, - algo::cudnnRNNAlgo_t = CUDNN_RNN_ALGO_STANDARD, - cellMode::cudnnRNNMode_t = CUDNN_LSTM, - biasMode::cudnnRNNBiasMode_t = CUDNN_RNN_DOUBLE_BIAS, - dirMode::cudnnDirectionMode_t = CUDNN_UNIDIRECTIONAL, - inputMode::cudnnRNNInputMode_t = CUDNN_LINEAR_INPUT, - mathPrec::DataType = eltype(x), - mathType::cudnnMathType_t = math_mode(), - inputSize::Integer = size(x,1), - projSize::Integer = hiddenSize, - numLayers::Integer = 1, - dropout::Real = 0, - auxFlags::Integer = CUDNN_RNN_PADDED_IO_ENABLED, - ) - d = cudnnRNNDescriptor(algo, cellMode, biasMode, dirMode, inputMode, - cudnnDataType(eltype(x)), cudnnDataType(mathPrec), mathType, - Int32(inputSize), Int32(hiddenSize), Int32(projSize), - Int32(numLayers), cudnnDropoutDescriptor(Cfloat(dropout)), - UInt32(auxFlags)) - y = cudnnRNNForward(w, x; hx, cx, hy, cy, layout, seqLengthArray, fwdMode, - hiddenSize, algo, cellMode, biasMode, dirMode, inputMode, - mathPrec, mathType, inputSize, projSize, numLayers, dropout, - auxFlags) - _y = copy(y) - _hy = (hy === nothing ? hy : copy(hy[])) - _cy = (cy === nothing ? 
cy : copy(cy[])) - @test Array(_y) ≈ cudnnRNNForward!(y, w, x; hx, cx, hy, cy, layout, seqLengthArray, fwdMode, - hiddenSize, algo, cellMode, biasMode, dirMode, inputMode, - mathPrec, mathType, inputSize, projSize, numLayers, dropout, auxFlags) |> Array - (_hy === hy === nothing || @test Array(_hy) ≈ Array(hy[])) - (_cy === cy === nothing || @test Array(_cy) ≈ Array(cy[])) - @test Array(_y) ≈ cudnnRNNForward(w, x, d; hx, cx, hy, cy, layout, seqLengthArray, fwdMode) |> Array - (_hy === hy === nothing || @test Array(_hy) ≈ Array(hy[])) - (_cy === cy === nothing || @test Array(_cy) ≈ Array(cy[])) - @test Array(_y) ≈ cudnnRNNForward!(y, w, x, d; hx, cx, hy, cy, layout, seqLengthArray, fwdMode) |> Array - (_hy === hy === nothing || @test Array(_hy) ≈ Array(hy[])) - (_cy === cy === nothing || @test Array(_cy) ≈ Array(cy[])) - end - - rnntest() - rnntest(hx=hx1) - rnntest(cx=cx1) - rnntest(hy=Ref{Any}()) - rnntest(cy=Ref{Any}()) - rnntest(layout=CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED) - rnntest(layout=CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED) - rnntest(seqLengthArray=Cint[1,2,1,2]) - rnntest(fwdMode=CUDNN_FWD_MODE_TRAINING) - rnntest(hiddenSize=16) - # XXX: it's unclear which devices support this algorithm - if capability(device()) >= v"6.1" - rnntest(algo=CUDNN_RNN_ALGO_PERSIST_STATIC) - end - #rnntest(algo=CUDNN_RNN_ALGO_PERSIST_DYNAMIC) # causes segfault - rnntest(cellMode=CUDNN_RNN_RELU) - rnntest(cellMode=CUDNN_RNN_TANH) - rnntest(cellMode=CUDNN_GRU) - rnntest(biasMode=CUDNN_RNN_NO_BIAS) - rnntest(biasMode=CUDNN_RNN_SINGLE_INP_BIAS) - rnntest(biasMode=CUDNN_RNN_SINGLE_REC_BIAS) - rnntest(dirMode=CUDNN_BIDIRECTIONAL) - rnntest(inputMode=CUDNN_SKIP_INPUT) - rnntest(mathPrec=Float32) # only possible option for F32 input - rnntest(mathType=CUDNN_DEFAULT_MATH) - rnntest(mathType=CUDNN_TENSOR_OP_MATH) - rnntest(mathType=CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION) - rnntest(projSize=4) - rnntest(numLayers=2) - rnntest(dropout=0.5) - rnntest(auxFlags=CUDNN_RNN_PADDED_IO_DISABLED) - rnntest(auxFlags=CUDNN_RNN_PADDED_IO_ENABLED) -end diff --git a/test/cudnn/softmax.jl b/test/cudnn/softmax.jl deleted file mode 100644 index 878c300017..0000000000 --- a/test/cudnn/softmax.jl +++ /dev/null @@ -1,47 +0,0 @@ -using CUDA.CUDNN: - cudnnSoftmaxForward, - cudnnSoftmaxForward!, - cudnnSoftmaxBackward, - cudnnSoftmaxAlgorithm_t, - CUDNN_SOFTMAX_FAST, # 0, /* straightforward implementation */ - CUDNN_SOFTMAX_ACCURATE, # 1, /* subtract max from every point to avoid overflow */ - CUDNN_SOFTMAX_LOG, # 2 - cudnnSoftmaxMode_t, - CUDNN_SOFTMAX_MODE_INSTANCE, # 0, /* compute the softmax over all C, H, W for each N */ - CUDNN_SOFTMAX_MODE_CHANNEL # 1 /* compute the softmax over all C for each H, W, N */ - -@testset "cudnn/softmax" begin - ax,ay = randn(Float32,10,10),randn(Float32,10,10) - cx,cy = CuArray.((ax,ay)) - - function softmaxtest(; - alpha=1, - beta=0, - mode=CUDNN_SOFTMAX_MODE_INSTANCE, - algo=CUDNN_SOFTMAX_FAST - ) - d = mode === CUDNN_SOFTMAX_MODE_INSTANCE ? 
1 : 2 - x = ax .- maximum(ax, dims=d) - y = x .- log.(sum(exp.(x), dims=d)) - if algo !== CUDNN_SOFTMAX_LOG; y = exp.(y); end - add1(x)=reshape(x, (size(x)..., 1)) - if mode === CUDNN_SOFTMAX_MODE_CHANNEL - y,cx1,cy1 = add1.((y,cx,cy)) - else - cx1,cy1 = cx,cy - end - y0 = alpha * y - y1 = y0 .+ beta * ay - @test y0 ≈ cudnnSoftmaxForward(cx1; algo, mode, alpha) |> Array - @test y1 ≈ cudnnSoftmaxForward!(copy(cy1), cx1; algo, mode, alpha, beta) |> Array - end - - softmaxtest() - softmaxtest(alpha=2) - softmaxtest(beta=2) - softmaxtest(mode=CUDNN_SOFTMAX_MODE_INSTANCE) - softmaxtest(mode=CUDNN_SOFTMAX_MODE_CHANNEL) - softmaxtest(algo=CUDNN_SOFTMAX_FAST) - softmaxtest(algo=CUDNN_SOFTMAX_ACCURATE) - softmaxtest(algo=CUDNN_SOFTMAX_LOG) -end diff --git a/test/cudnn/tensor.jl b/test/cudnn/tensor.jl deleted file mode 100644 index 22b8fddd62..0000000000 --- a/test/cudnn/tensor.jl +++ /dev/null @@ -1,33 +0,0 @@ -using CUDA.CUDNN: - cudnnTensorDescriptor, - cudnnCreateTensorDescriptor, - cudnnFilterDescriptor, - cudnnDataType, - cudnnDataType_t, - CUDNN_TENSOR_NCHW, - CUDNN_STATUS_SUCCESS, - @retry_reclaim - -@testset "cudnn/tensor" begin - x = CUDA.rand(1,1,1,2) - - TD = cudnnTensorDescriptor - FD = cudnnFilterDescriptor - DT = cudnnDataType - - @test TD(x) isa TD - @test TD(CUDNN_TENSOR_NCHW, DT(eltype(x)), Cint(ndims(x)), Cint[reverse(size(x))...]) isa TD - td = TD(x) - @test TD(td.ptr) isa TD - @test Base.unsafe_convert(Ptr, TD(td.ptr)) isa Ptr - - @test FD(x) isa FD - @test FD(DT(eltype(x)),CUDNN_TENSOR_NCHW,Cint(ndims(x)),Cint[reverse(size(x))...]) isa FD - fd = FD(x) - @test FD(fd.ptr) isa FD - @test Base.unsafe_convert(Ptr, FD(fd.ptr)) isa Ptr - - @test DT(Float32) isa cudnnDataType_t - - @test (@retry_reclaim(x->(x!==CUDNN_STATUS_SUCCESS),cudnnCreateTensorDescriptor(Ref{Ptr{Cvoid}}(C_NULL)))) isa Nothing -end diff --git a/test/runtests.jl b/test/runtests.jl index ef15e90710..0a3a43bc6b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -171,14 +171,8 @@ ENV["CUDA_VISIBLE_DEVICES"] = join(map(pick->"$(pick.mig ? "MIG" : "GPU")-$(pick # determine tests to skip skip_tests = [] -has_cudnn() || push!(skip_tests, "cudnn") has_cusolvermg() || push!(skip_tests, "cusolver/multigpu") has_nvml() || push!(skip_tests, "nvml") -if !has_cutensor() || first(picks).cap < v"6.0" - push!(skip_tests, "cutensor") -end -has_cutensornet() || push!(skip_tests, "cutensornet") -has_custatevec() || push!(skip_tests, "custatevec") if do_sanitize # XXX: some library tests fail under compute-sanitizer append!(skip_tests, ["cutensor"])
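With this patch the cuDNN tests move to lib/cudnn/test and are no longer driven by test/runtests.jl, which is why the skip-list entries above are removed. The following is a minimal sketch of running the relocated suite standalone from a CUDA.jl checkout; it assumes the subpackage defined by lib/cudnn/Project.toml is named CUDNN and that the parent checkout must be dev'ed alongside it (both the package name and that requirement are assumptions, not spelled out in the diff). Only standard Pkg.jl calls are used.

    using Pkg
    Pkg.activate(; temp=true)          # throwaway environment for the test run
    Pkg.develop(path=".")              # the CUDA.jl checkout containing this patch
    Pkg.develop(path="lib/cudnn")      # the new CUDNN subpackage
    Pkg.test("CUDNN")                  # drives lib/cudnn/test/runtests.jl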