Add EmbeddingBag #2031

Merged
merged 23 commits into from
Apr 18, 2023
1 change: 1 addition & 0 deletions docs/src/models/layers.md
@@ -61,6 +61,7 @@ Parallel
Flux.Bilinear
Flux.Scale
Flux.Embedding
Flux.EmbeddingBag
```

## Normalisation & Regularisation
81 changes: 81 additions & 0 deletions src/layers/basic.jl
@@ -692,3 +692,84 @@ end
function Base.show(io::IO, m::Embedding)
print(io, "Embedding(", size(m.weight, 2), " => ", size(m.weight, 1), ")")
end

"""
EmbeddingBag(in => out, reduction=Statistics.mean; init=randn32)

A lookup table that stores embeddings of dimension `out` for a vocabulary of size
`in`. Similar to [`Embedding`](@ref) but can take multiple inputs in a "bag". The
embeddings of these are then reduced to a single embedding based on `reduction`.
Typically, `reduction` is `Statistics.mean`, `sum`, or `maximum`.

This layer is often used to store word embeddings and retrieve them using indices.
The inputs can take several forms:
- A scalar := single bag with a single item
- A vector := single bag with multiple items
- A matrix := multiple bags with multiple items (each column is a bag)
- A vector of vectors := multiple bags with multiple items (each vector is a bag)
- An input vector and offset vector := explained below

The `input`/`offsets` form is similar to PyTorch's implementation. `input` should be
a vector of class indices and `offsets` should be a vector of offsets into `input`,
one per bag. The first element of `offsets` must be `0`, and `offsets` should be
monotonically increasing, but the second condition is not checked.

For example, the `input`/`offsets` pair `[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]`/`[0, 4, 5, 7]`
is equivalent to the bags `[[1, 2, 3, 4], [5], [6, 7], [8, 9, 10]]`.

# Examples
```jldoctest
julia> vocab_size, embed_size = 1000, 4;

julia> model = Flux.EmbeddingBag(vocab_size => embed_size)
EmbeddingBag(1000 => 4) # 4_000 parameters

julia> bags = [[1, 200, 25, 789], [2, 5, 10, 999]];

julia> bags_mtx = [1 2; 200 5; 25 10; 789 999];

julia> model(bags) |> summary
"4×2 Matrix{Float32}"

julia> model(bags) ≈ model(bags_mtx)
true
```
"""
struct EmbeddingBag{F, W}
weight::W
reduction::F
end

@functor EmbeddingBag

EmbeddingBag((in, out)::Pair{<:Integer, <:Integer}, reduction::Function = Statistics.mean; init = randn32) = EmbeddingBag(init(out, in), reduction)
EmbeddingBag(weight) = EmbeddingBag(weight, Statistics.mean)

function (m::EmbeddingBag)(inputs::AbstractVector, offsets::AbstractVector)
offsets[1] == 0 || throw(ArgumentError("`offsets` must begin with 0."))
out = zeros(eltype(m.weight), size(m.weight, 1), length(offsets))
start = firstindex(inputs)
for i in eachindex(offsets[1:end-1])
out[:, i] = m(inputs[start:offsets[i+1]])
start = offsets[i+1]+1
end
out[:, end] = m(inputs[offsets[end]+1:end])
out
end
(m::EmbeddingBag)(idx::Integer) = m.weight[:, idx]
(m::EmbeddingBag)(bag::AbstractVector) = vec(m.reduction(NNlib.gather(m.weight, bag), dims=2))
(m::EmbeddingBag)(bags::AbstractVector{<:AbstractVector}) = reduce(hcat, m.(bags))
(m::EmbeddingBag)(bags::AbstractMatrix) = reduce(hcat, m.(eachcol(bags)))
Member
After reading the PyTorch docstring, it seems the main advantage of this layer is memory efficiency. So, shouldn't these methods use mapreduce instead of a broadcast to get the same benefit?

Contributor Author
Unfortunately, mapreduce(f, hcat, collection) is not optimized. But yes, I agree. I will add a todo for when specialized mapreduce functions are added. See: https://discourse.julialang.org/t/different-performance-between-reduce-map-and-mapreduce/85149 and JuliaLang/julia#31137.

julia> (m::EmbeddingBag)(bags::AbstractVector{<:AbstractVector}) = reduce(hcat, m.(bags))
julia> (m::EmbeddingBag)(bags::AbstractMatrix) = reduce(hcat, m.(eachcol(bags)))

julia> test(m::EmbeddingBag, bags::AbstractVector{<:AbstractVector})  = mapreduce(m, hcat, bags)
julia> test(m::EmbeddingBag, bags::AbstractMatrix) = mapreduce(m, hcat, eachcol(bags))
julia> e = Flux.EmbeddingBag(100=>64)
julia> bags = [[rand(1:100) for _ in 1:3] for _ in 1:1000]
julia> @btime e(bags);
  709.630 μs (14004 allocations: 2.16 MiB)

julia> @btime test(e, bags);
  14.700 ms (15935 allocations: 124.18 MiB)

Member
> Unfortunately, mapreduce(f, hcat, collection) is not optimized

If this is the hurdle, then stack(f, collection) might be the solution, assuming f returns vectors. Needs using Compat, which is certainly already loaded downstream.
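
For instance, a minimal sketch of that alternative (illustrative only, not part of this diff), assuming `stack(f, xs)` from Julia 1.9 or Compat, which applies `f` and stacks the returned vectors as columns:

```julia
# Hypothetical replacement for reduce(hcat, m.(bags)): stack applies m to each bag
# and concatenates the resulting length-`out` vectors as columns, avoiding the
# repeated hcat chain. Requires Julia 1.9 or `using Compat`.
(m::EmbeddingBag)(bags::AbstractVector{<:AbstractVector}) = stack(m, bags)
(m::EmbeddingBag)(bags::AbstractMatrix) = stack(m, eachcol(bags))
```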

Member
The really big memory cost is going to be the gradient of gather. For every column / vector, ∇gather_src is going to allocate like a copy of the weights.

https://github.com/FluxML/NNlib.jl/blob/6f74fad0a2a24e3594fc5229cc515fa25e80f877/src/gather.jl#L80

One could write a more efficient combined rule for this. Or add some thunks to the one in NNlib & wait for AD to learn to exploit them.
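
For illustration, a rough sketch of what such a combined rule might look like (hypothetical, not part of this diff; names like `bagged_mean` are made up here). The idea is to accumulate the whole pullback into a single weight-sized buffer instead of one `∇gather_src`-style copy per bag:

```julia
using ChainRulesCore, NNlib

# Forward pass: mean-reduced embedding for every bag, one column per bag.
function bagged_mean(weight::AbstractMatrix, bags::AbstractVector{<:AbstractVector})
    out = similar(weight, size(weight, 1), length(bags))
    for (i, bag) in enumerate(bags)
        out[:, i] = vec(sum(NNlib.gather(weight, bag); dims = 2)) ./ length(bag)
    end
    return out
end

function ChainRulesCore.rrule(::typeof(bagged_mean), weight, bags)
    y = bagged_mean(weight, bags)
    function bagged_mean_pullback(ȳ)
        dW = zero(weight)  # single weight-sized buffer shared by all bags
        for (i, bag) in enumerate(bags)
            for j in bag
                @views dW[:, j] .+= ȳ[:, i] ./ length(bag)
            end
        end
        return (NoTangent(), dW, NoTangent())
    end
    return y, bagged_mean_pullback
end
```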

Contributor Author
This can be done after this PR, right?

Member
Yes. I just mean these concerns will dwarf the hcat cost. (Even on the forward pass, the thing you make to call mean on it will also be much larger.)


function (m::EmbeddingBag)(x::OneHotVector{T,L}) where {T,L}
size(m.weight, 2) == L || throw(DimensionMismatch("Matrix column must correspond with OneHot size: $(size(m.weight, 2)) != $L"))
return m(onecold(x))
end
function (m::EmbeddingBag)(x::OneHotMatrix{T,L}) where {T,L}
size(m.weight, 2) == L || throw(DimensionMismatch("Matrix column must correspond with OneHot size: $(size(m.weight, 2)) != $L"))
return m(LinearAlgebra.Transpose(onecold(x)))
end

function Base.show(io::IO, m::EmbeddingBag)
print(io, "EmbeddingBag(", size(m.weight, 2), " => ", size(m.weight, 1), ")")
end
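
For reference, a short usage sketch of the `input`/`offsets` form defined above, using the values from the docstring example (assumes the layer exactly as written in this diff):

```julia
using Flux, Statistics

model = Flux.EmbeddingBag(1000 => 4, Statistics.mean)

inputs  = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
offsets = [0, 4, 5, 7]        # bags: [1,2,3,4], [5], [6,7], [8,9,10]

out = model(inputs, offsets)  # 4×4 Matrix{Float32}, one column per bag
out ≈ model([[1, 2, 3, 4], [5], [6, 7], [8, 9, 10]])  # true
```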
2 changes: 1 addition & 1 deletion src/layers/show.jl
@@ -57,7 +57,7 @@ _show_children(p::Parallel) = (p.connection, p.layers...)
_show_children(f::PairwiseFusion) = (f.connection, f.layers...)

for T in [
:Conv, :ConvTranspose, :CrossCor, :Dense, :Scale, :Bilinear, :Embedding,
:Conv, :ConvTranspose, :CrossCor, :Dense, :Scale, :Bilinear, :Embedding, :EmbeddingBag,
:BatchNorm, :LayerNorm, :InstanceNorm, :GroupNorm,
]
@eval function Base.show(io::IO, m::MIME"text/plain", x::$T)
57 changes: 57 additions & 0 deletions test/layers/basic.jl
@@ -311,6 +311,63 @@ import Flux: activations
@test m(OneHotVector(3, vocab_size)) ≈ m.weight[:,3]
@test_throws DimensionMismatch m(OneHotVector(3, 1000))
end

@testset "EmbeddingBag" begin
for reduction in [sum, Statistics.mean, maximum]
vocab_size, embed_size = 10, 4
emb_bag = Flux.EmbeddingBag(vocab_size => embed_size, reduction)
emb = Flux.Embedding(emb_bag.weight)
@test size(emb_bag.weight) == (embed_size, vocab_size)

# scalar bag
@test emb_bag(2) ≈ emb_bag.weight[:,2]
@test emb_bag(3) ≈ emb(3)

# single bag (input as a vector)
x = rand(1:vocab_size, 3)
y = emb_bag(x)
z = vec(reduction(emb(x), dims=2))
@test y isa Vector{Float32}
@test y ≈ z

# PyTorch style `input`/`offset` bagging
@test emb_bag([1,3,2,4,5,7], [0,2,4]) ≈ emb_bag([[1,3], [2,4], [5,7]])
@test emb_bag([1,3,2,4,5,7], [0,2,4]) ≈ emb_bag([1 2 5; 3 4 7])
@test_throws ArgumentError emb_bag([1,2,3,4,5,6], [2,4])
@test_throws BoundsError emb_bag([1,2,3,4,5,6], [0,12])

# docstring example
@test emb_bag([1,2,3,4,5,6,7,8,9,10], [0,4,5,7]) ≈ emb_bag([[1,2,3,4], [5], [6,7], [8,9,10]])

# multiple bags (input as a vector of vectors)
x = [rand(1:vocab_size, 3) for _ in 1:4]
y = emb_bag(x)
z = reduce(hcat, reduction.(emb.(x), dims=2))
@test y isa Matrix{Float32}
@test y ≈ z

# multiple bags (input as a matrix)
x = rand(1:vocab_size, (3, 5))
xvec = collect(eachcol(x))
y = emb_bag(x)
z = reduce(hcat, reduction.(emb.(xvec), dims=2))
@test y ≈ emb_bag(xvec)
@test y ≈ z

# one hot bags. should be identical to Embedding, since the bags
# are of size 1.
@test emb_bag(Flux.OneHotVector(3, vocab_size)) ≈ emb_bag.weight[:,3]
@test emb_bag(Flux.OneHotVector(4, vocab_size)) ≈ emb(Flux.OneHotVector(4, vocab_size))
@test_throws DimensionMismatch emb_bag(Flux.OneHotVector(3, 1000))

x2 = Flux.OneHotMatrix(rand(1:vocab_size, 3), vocab_size)
y2 = emb_bag(x2)
z2 = emb(x2)
@test y2 isa Matrix{Float32}
@test y2 ≈ z2
@test_throws DimensionMismatch emb_bag(Flux.OneHotMatrix(1:5, 1000))
end
end
end

@testset "second derivatives" begin