Merge pull request #22 from JuliaGNI/static_arrays
Added `CPUStatic` backend and implemented new `initialparameters` interface.
michakraus authored Dec 5, 2024
2 parents 2657df1 + 4ff3219 commit 649e93e
Showing 36 changed files with 326 additions and 114 deletions.
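The PR description above and the documentation added in this diff suggest the following rough usage pattern. This is only a sketch assembled from calls that appear in `docs/src/static_neural_network_parameters.md` below (`CPUStatic`, `NeuralNetwork`, `changebackend`); it is not part of the commit itself.

```julia
# Minimal sketch (not part of this commit): allocate parameters as StaticArrays
# via the new CPUStatic backend, then convert them to plain CPU arrays.
using AbstractNeuralNetworks

backend = AbstractNeuralNetworks.CPUStatic()
model   = Chain(Dense(2, 10, tanh), Dense(10, 1, tanh))
nn      = NeuralNetwork(model, backend)   # parameters are static arrays

nn_cpu  = changebackend(CPU(), nn)        # same network with standard CPU arrays
```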
19 changes: 19 additions & 0 deletions .githooks/pre-push
@@ -0,0 +1,19 @@
# pre-push git hook that runs all tests before pushing

red='\033[0;31m'
green='\033[0;32m'
no_color='\033[0m'

reponame=$(basename `git rev-parse --show-toplevel`)


echo "\nRunning pre-push hook\n"
echo "Testing $reponame"
julia --project=@. -e "using Pkg; Pkg.test(\"AbstractNeuralNetworks\")"

if [[ $? -ne 0 ]]; then
echo "\n${red}ERROR - Tests must pass before push!\n${no_color}"
exit 1
fi

echo "\n${green}Git hook was SUCCESSFUL!${no_color}\n"
4 changes: 4 additions & 0 deletions Project.toml
@@ -4,14 +4,18 @@ authors = ["Michael Kraus"]
version = "0.4.0"

[deps]
GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"

[compat]
GPUArraysCore = "0.2.0"
HDF5 = "0.17.2"
KernelAbstractions = "0.9"
StaticArrays = "1.9.8"
julia = "1.6"

[extras]
8 changes: 8 additions & 0 deletions README.md
@@ -8,3 +8,11 @@

This package implements abstract and general data structures for the construction of neural networks, e.g., layers, chains, and architectures.
It mainly serves as a common base package for [GeometricMachineLearning.jl](https://github.com/JuliaGNI/GeometricMachineLearning.jl) and [SymbolicNetworks.jl](https://github.com/JuliaGNI/SymbolicNetworks.jl).


## Development

We are using git hooks, e.g., to enforce that all tests pass before pushing. In order to activate these hooks, the following command must be executed once:
```
git config core.hooksPath .githooks
```
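As a side note (not part of the README change itself), the standard `git config --get` call can be used to check that the hook path has been activated:

```
git config --get core.hooksPath
```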
2 changes: 2 additions & 0 deletions docs/Project.toml
@@ -1,3 +1,5 @@
[deps]
AbstractNeuralNetworks = "60874f82-5ada-4c70-bd1c-fa6be7711c8a"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
DocumenterCitations = "daee34ce-89f3-4625-b898-19384cb65244"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
19 changes: 16 additions & 3 deletions docs/make.jl
@@ -1,13 +1,24 @@
using AbstractNeuralNetworks
using Documenter
using DocumenterCitations
import Pkg

PROJECT_TOML = Pkg.TOML.parsefile(joinpath(@__DIR__, "..", "Project.toml"))
VERSION = PROJECT_TOML["version"]
NAME = PROJECT_TOML["name"]
AUTHORS = join(PROJECT_TOML["authors"], ", ") * " and contributors"
GITHUB = "https://github.com/JuliaGNI/AbstractNeuralNetworks.jl"

bib = CitationBibliography(joinpath(@__DIR__, "src", "AbstractNeuralNetworks.bib"))

DocMeta.setdocmeta!(AbstractNeuralNetworks, :DocTestSetup, :(using AbstractNeuralNetworks); recursive=true)

makedocs(;
plugins=[bib],
modules=[AbstractNeuralNetworks],
authors="Michael Kraus",
authors=AUTHORS,
repo="https://github.com/JuliaGNI/AbstractNeuralNetworks.jl/blob/{commit}{path}#{line}",
sitename="AbstractNeuralNetworks.jl",
sitename=NAME,
format=Documenter.HTML(;
prettyurls=get(ENV, "CI", "false") == "true",
canonical="https://JuliaGNI.github.io/AbstractNeuralNetworks.jl",
@@ -16,11 +27,13 @@ makedocs(;
),
pages=[
"Home" => "index.md",
"Static Neural Network Parameters" => "static_neural_network_parameters.md",
"References" => "bibliography.md"
],
)

deploydocs(;
repo = "github.com/JuliaGNI/AbstractNeuralNetworks.jl",
repo = GITHUB,
devurl = "latest",
devbranch = "main",
)
8 changes: 8 additions & 0 deletions docs/src/AbstractNeuralNetworks.bib
@@ -0,0 +1,8 @@
@inproceedings{glorot2010understanding,
title={Understanding the difficulty of training deep feedforward neural networks},
author={Glorot, Xavier and Bengio, Yoshua},
booktitle={Proceedings of the thirteenth international conference on artificial intelligence and statistics},
pages={249--256},
year={2010},
organization={JMLR Workshop and Conference Proceedings}
}
5 changes: 5 additions & 0 deletions docs/src/bibliography.md
@@ -0,0 +1,5 @@
# References

```@bibliography
*
```
51 changes: 51 additions & 0 deletions docs/src/static_neural_network_parameters.md
@@ -0,0 +1,51 @@
# Static Neural Network Parameters

We can also allocate neural network parameters using [`StaticArrays`](https://github.com/JuliaArrays/StaticArrays.jl). To do so, we can either set the keyword `static` to `true` in the [`NeuralNetwork`](@ref) constructor or pass the `CPUStatic` backend explicitly, as is done in the example below.

!!! warning
Static neural network parameters are only supported for dense CPU arrays. `AbstractNeuralNetworks` defines a type `CPUStatic`, but does not have equivalent GPU objects.

```@example static_parameters
using AbstractNeuralNetworks
import Random
Random.seed!(123)
backend = AbstractNeuralNetworks.CPUStatic()
input_dim = 2
n_hidden_layers = 100
c = Chain(Dense(input_dim, 10, tanh), Tuple(Dense(10, 10, tanh) for _ in 1:n_hidden_layers)..., Dense(10, 1, tanh))
nn = NeuralNetwork(c, backend)
typeof(nn.params.L1.W)
```

We can compare the evaluation times of the static parameters against the standard CPU parameters:
```@example static_parameters
nn_cpu = changebackend(CPU(), nn)
second_dim = 200
x = rand(input_dim, second_dim)
nn(x); # hide
@time nn(x);
nothing # hide
```

```@example static_parameters
nn_cpu(x); # hide
@time nn_cpu(x);
nothing # hide
```

If we also make the *input* static, we get:

```@example static_parameters
using StaticArrays
x = @SMatrix rand(input_dim, second_dim)
nn(x);
@time nn(x);
nothing # hide
```

```@example static_parameters
nn_cpu(x); # hide
@time nn_cpu(x);
nothing # hide
```
12 changes: 11 additions & 1 deletion src/AbstractNeuralNetworks.jl
@@ -3,11 +3,13 @@ module AbstractNeuralNetworks
using HDF5
using HDF5: H5DataStore
using KernelAbstractions
using GPUArraysCore: AbstractGPUArray
using LinearAlgebra
using StaticArrays
using Random

export CPU, GPU

include("utils/add.jl")
include("utils/zero_vector.jl")

@@ -23,6 +25,11 @@

include("parameters.jl")

include("static_cpu_backend.jl")

export NeuralNetworkBackend, networkbackend

include("neural_network_backend.jl")

export OneInitializer, ZeroInitializer, GlorotUniform

@@ -67,4 +74,7 @@ module AbstractNeuralNetworks
include("pullback.jl")

export AbstractPullback

export changebackend
include("utils/changebackend.jl")
end
4 changes: 3 additions & 1 deletion src/architecture.jl
@@ -1,4 +1,6 @@

"""
Architecture
"""
abstract type Architecture end

struct UnknownArchitecture <: Architecture end
2 changes: 1 addition & 1 deletion src/cells/abstract.jl
@@ -5,7 +5,7 @@ An `AbstractCell` is a map from $\mathbb{R}^{M}×\mathbb{R}^{N} \rightarrow \mat
Concrete cell types should implement the following functions:
- `initialparameters(backend::Backend, ::Type{T}, cell::AbstractCell; init::Initializer = default_initializer(), rng::AbstractRNG = Random.default_rng())`
- `initialparameters(backend::NeuralNetworkBackend, ::Type{T}, cell::AbstractCell; init::Initializer = default_initializer(), rng::AbstractRNG = Random.default_rng())`
- `update!(::AbstractLayer, θ::NamedTuple, dθ::NamedTuple, η::AbstractFloat)`
and the functors
2 changes: 1 addition & 1 deletion src/cells/grid.jl
@@ -31,7 +31,7 @@ Base.eachindex(g::GridCell) = Iterators.product(1:lines(g), 1:rows(g))
return Expr(:block, calls...)
end

function initialparameters(gridcell::GridCell, backend::Backend, ::Type{T}; kwargs...) where {T}
function initialparameters(gridcell::GridCell, backend::NeuralNetworkBackend, ::Type{T}; kwargs...) where {T}
M, N = size(gridcell)
[initialparameters(cell(gridcell, i, j), backend, T; kwargs...) for i in 1:M, j in 1:N]
end
2 changes: 1 addition & 1 deletion src/cells/gru.jl
@@ -17,7 +17,7 @@ function (cell::GRU{M, N, O, P})(x::AbstractArray, st::AbstractArray, ps::NamedT
end


function initialparameters(cell::GRU{M, N, O, P}, backend::Backend, ::Type{T}; init::Initializer = default_initializer(), rng::AbstractRNG = Random.default_rng()) where {M,N,O,P,T}
function initialparameters(cell::GRU{M, N, O, P}, backend::NeuralNetworkBackend, ::Type{T}; init::Initializer = default_initializer(), rng::AbstractRNG = Random.default_rng()) where {M,N,O,P,T}
Wᵣₓ = KernelAbstractions.zeros(backend, T, N, M)
Wᵣₕ = KernelAbstractions.zeros(backend, T, N, N)
Wᵤₓ = KernelAbstractions.zeros(backend, T, N, M)
2 changes: 1 addition & 1 deletion src/cells/identity.jl
@@ -7,7 +7,7 @@ function (cell::IdentityCell{M, N, O, P})(x::AbstractArray, st::AbstractArray, p
return (x, st)
end

function initialparameters(cell::IdentityCell{M, N, O, P}, backend::Backend, ::Type{T}; init::Initializer = default_initializer(), rng::AbstractRNG = Random.default_rng()) where {M,N,O, P, T}
function initialparameters(cell::IdentityCell{M, N, O, P}, backend::NeuralNetworkBackend, ::Type{T}; init::Initializer = default_initializer(), rng::AbstractRNG = Random.default_rng()) where {M,N,O, P, T}
NamedTuple()
end

2 changes: 1 addition & 1 deletion src/cells/lstm.jl
@@ -20,7 +20,7 @@ function (cell::LSTM{M, N, O, P})(x::AbstractArray, st::AbstractArray, ps::Named
end


function initialparameters(cell::LSTM{M, N, O, P}, backend::Backend, ::Type{T}; init::Initializer = default_initializer(), rng::AbstractRNG = Random.default_rng()) where {M,N,O,P,T}
function initialparameters(cell::LSTM{M, N, O, P}, backend::NeuralNetworkBackend, ::Type{T}; init::Initializer = default_initializer(), rng::AbstractRNG = Random.default_rng()) where {M,N,O,P,T}
Wfₓ = KernelAbstractions.zeros(backend, T, O, M)
Wfₕ = KernelAbstractions.zeros(backend, T, O, O)
Wᵢₓ = KernelAbstractions.zeros(backend, T, O, M)
4 changes: 2 additions & 2 deletions src/cells/recurrent.jl
@@ -30,7 +30,7 @@ end

usebias(::Recurrent{M, N, O, P, BIAS}) where {M, N, O, P, BIAS} = BIAS

function initialparameters(cell::Recurrent{M, N, O, P}, backend::Backend, ::Type{T}; init::Initializer = default_initializer(), rng::AbstractRNG = Random.default_rng()) where {M,N,O,P,T}
function initialparameters(cell::Recurrent{M, N, O, P}, backend::NeuralNetworkBackend, ::Type{T}; init::Initializer = default_initializer(), rng::AbstractRNG = Random.default_rng()) where {M,N,O,P,T}
Wₛₛ = KernelAbstractions.zeros(backend, T, P, N)
Wₛₓ = KernelAbstractions.zeros(backend, T, P, M)
Wₒₛ = KernelAbstractions.zeros(backend, T, O, P)
@@ -44,7 +44,7 @@ function initialparameters(cell::Recurrent{M, N, O, P}, backend::Backend, ::Type
(Wₛₛ = Wₛₛ, Wₛₓ = Wₛₓ, Wₒₛ = Wₒₛ, bₛ = bₛ, bₒ = bₒ)
end

function initialparameters(cell::Recurrent{M, N, 0, P}, backend::Backend, ::Type{T}; init::Initializer = default_initializer(), rng::AbstractRNG = Random.default_rng()) where {M,N,P,T}
function initialparameters(cell::Recurrent{M, N, 0, P}, backend::NeuralNetworkBackend, ::Type{T}; init::Initializer = default_initializer(), rng::AbstractRNG = Random.default_rng()) where {M,N,P,T}
Wₛₛ = KernelAbstractions.zeros(backend, T, P, N)
Wₛₓ = KernelAbstractions.zeros(backend, T, P, M)
bₛ = KernelAbstractions.zeros(backend, T, P)
16 changes: 4 additions & 12 deletions src/chain.jl
@@ -9,7 +9,7 @@ Chain(layers...)
```
or a neural network architecture together with a backend and a parameter type:
```
Chain(::Architecture, ::Backend, ::Type; kwargs...)
Chain(::Architecture, ::NeuralNetworkBackend, ::Type; kwargs...)
Chain(::Architecture, ::Type; kwargs...)
```
If the backend is omitted, the default backend `CPU()` is chosen.
@@ -46,20 +46,12 @@ end

@inline applychain(layers::Tuple, x, ps::Union{NamedTuple,NeuralNetworkParameters}) = applychain(layers, x, values(ps))

function initialparameters(model::Chain, backend::Backend, ::Type{T}; kwargs...) where {T <: Number}
function initialparameters(rng::AbstractRNG, initializer::Initializer, model::Chain, backend::NeuralNetworkBackend, ::Type{T}; kwargs...) where T
keys = Tuple(Symbol("L$(i)") for i in eachindex(model))
vals = Tuple(initialparameters(layer, backend, T; kwargs...) for layer in model)
NamedTuple{keys}(vals)
vals = Tuple(initialparameters(rng, initializer, layer, backend, T; kwargs...) for layer in model)
NeuralNetworkParameters{keys}(vals)
end

initialparameters(model::Chain, ::Type{T}; kwargs...) where {T <: Number} = initialparameters(model, CPU(), T; kwargs...)

initialparameters(model::Chain, backend::Backend; kwargs...) = initialparameters(model, backend, Float32; kwargs...)

initialparameters(model::Chain, backend::CPU; kwargs...) = initialparameters(model, backend, Float64; kwargs...)

initialparameters(model::Chain; kwargs...) = initialparameters(model, CPU(); kwargs...)

function update!(chain::Chain, params::Tuple, grad::Tuple, η::AbstractFloat)
for (layer, θ, dθ) in zip(chain, params, grad)
update!(layer, θ, dθ, η)
35 changes: 27 additions & 8 deletions src/initializer.jl
@@ -1,24 +1,43 @@
"""
Initializer
abstract type AbstractInitializer end
Determines how neural network weights are initialized.
"""
abstract type Initializer end

const Initializer = Union{AbstractInitializer, Base.Callable}
"""
ZeroInitializer <: Initializer
"""
struct ZeroInitializer <: Initializer end

struct ZeroInitializer <: AbstractInitializer end
function (::ZeroInitializer)(_, x)
x .= KernelAbstractions.zero(x)

nothing
end

struct OneInitializer <: AbstractInitializer end
"""
OneInitializer <: Initializer
"""
struct OneInitializer <: Initializer end

function (::OneInitializer)(_, x::AbstractArray{T}) where T
backend = get_backend(x)
backend = networkbackend(x)
x .= KernelAbstractions.ones(backend, T, size(x))

nothing
end

default_initializer() = randn!
"""
GlorotUniform <: Initializer
struct GlorotUniform <: AbstractNeuralNetworks.AbstractInitializer end
Glorot uniform was introduced by [glorot2010understanding](@cite).
"""
struct GlorotUniform <: Initializer end

function (::GlorotUniform)(rng, x::AbstractVecOrMat{T}) where T
rand!(rng, x)
x .= sqrt(T(24.0) / sum(size(x))) * (x .- T(0.5))
end
end

const DefaultInitializer = GlorotUniform
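
For context on the initializer rework above, here is a minimal sketch of applying the `GlorotUniform` functor to a weight matrix. It is not part of the diff and simply assumes the functor signature `(::GlorotUniform)(rng, x)` shown in it.

```julia
using AbstractNeuralNetworks
import Random

rng = Random.default_rng()
W = zeros(Float64, 10, 2)   # fan_out = 10, fan_in = 2

GlorotUniform()(rng, W)     # fills W in place

# The scaling sqrt(24 / (fan_in + fan_out)) * (u - 1/2) of uniform samples u ~ U(0, 1)
# is equivalent to drawing from U(-sqrt(6 / (fan_in + fan_out)), sqrt(6 / (fan_in + fan_out))),
# i.e. the Glorot/Xavier uniform distribution of glorot2010understanding.
```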
2 changes: 1 addition & 1 deletion src/layers/abstract.jl
@@ -5,7 +5,7 @@ An `AbstractLayer` is a map from $\mathbb{R}^{M} \rightarrow \mathbb{R}^{N}$.
Concrete layer types should implement the following functions:
- `initialparameters(backend::Backend, ::Type{T}, layer::AbstractLayer; init::Initializer = default_initializer(), rng::AbstractRNG = Random.default_rng())`
- `initialparameters(backend::NeuralNetworkBackend, ::Type{T}, layer::AbstractLayer; init::Initializer = default_initializer(), rng::AbstractRNG = Random.default_rng())`
- `update!(::AbstractLayer, θ::NamedTuple, dθ::NamedTuple, η::AbstractFloat)`
and the functors
4 changes: 2 additions & 2 deletions src/layers/dense.jl
@@ -25,15 +25,15 @@ end

usebias(::Dense{M, N, BIAS}) where {M, N, BIAS} = BIAS

function initialparameters(layer::Dense{M,N,true}, backend::Backend, ::Type{T}; init::Initializer = default_initializer(), rng::AbstractRNG = Random.default_rng()) where {M,N,T}
function initialparameters(rng::AbstractRNG, init::Initializer, ::Dense{M,N,true}, backend::NeuralNetworkBackend, ::Type{T}) where {M,N,T}
W = KernelAbstractions.zeros(backend, T, N, M)
b = KernelAbstractions.zeros(backend, T, N)
init(rng, W)
init(rng, b)
(W = W, b = b)
end

function initialparameters(layer::Dense{M,N,false}, backend::Backend, ::Type{T}; init::Initializer = default_initializer(), rng::AbstractRNG = Random.default_rng()) where {M,N,T}
function initialparameters(rng::AbstractRNG, init::Initializer, ::Dense{M,N,false}, backend::NeuralNetworkBackend, ::Type{T}) where {M,N,T}
W = KernelAbstractions.zeros(backend, T, N, M)
init(rng, W)
(W = W,)
(Diffs for the remaining changed files are not shown.)
