JuliaDynamics · kahaaga · Aug 25, 2023 · Aug 18, 2023 · Aug 18, 2023 · Aug 18, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -15,11 +15,16 @@ Further additions to the library in v3:
 - Add the 1976 Lempel-Ziv complexity measure (`LempelZiv76`).
 - New entropy definition: identification entropy (`Identification`).
 - Minor documentation fixes.
+- `GaussianCDFEncoding` can now be used with vector-valued inputs.
 
 ### Bug fixes
 
 - `outcome_space` for `Dispersion` now correctly returns the all possible **sorted** outcomes
  (as promised by the `outcome_space` docstring).
+- `decode` with `GaussianCDFEncoding` now correctly returns only the left edges of the
+ `[0, 1]` subintervals, and always returns the decoded symbol as a `Vector{SVector}`
+ (consistent with `RectangularBinEncoding`), regardless of whether the input is a scalar
+ or a vector.
 
 ### Renaming
 

diff --git a/docs/src/probabilities.md b/docs/src/probabilities.md
@@ -143,4 +143,7 @@ decode
 OrdinalPatternEncoding
 GaussianCDFEncoding
 RectangularBinEncoding
+RelativeMeanEncoding
+RelativeFirstDifferenceEncoding
+CombinationEncoding
 ```
diff --git a/src/core/encodings.jl b/src/core/encodings.jl
@@ -13,6 +13,9 @@ Current available encodings are:
 - [`OrdinalPatternEncoding`](@ref).
 - [`GaussianCDFEncoding`](@ref).
 - [`RectangularBinEncoding`](@ref).
+- [`RelativeMeanEncoding`](@ref).
+- [`RelativeFirstDifferenceEncoding`](@ref).
+- [`CombinationEncoding`](@ref), which can combine any of the above encodings.
 """
 abstract type Encoding end
 

diff --git a/src/encoding_implementations/combination_encoding.jl b/src/encoding_implementations/combination_encoding.jl
@@ -0,0 +1,97 @@
+export CombinationEncoding
+
+"""
+ CombinationEncoding <: Encoding
+ CombinationEncoding(encodings)
+
+A `CombinationEncoding` takes multiple [`Encoding`](@ref)s and creates a combined
+encoding that can be used to encode vectors.
+
+## Encoding/decoding
+
+When used with [`encode`](@ref), the `i`-th [`Encoding`](@ref) in `encodings` returns
+an integer in the set `1, 2, …, nᵢ`, where `nᵢ` is the total number of outcomes
+for that encoding. For `k` different encodings, we can thus construct the
+cartesian coordinate `(c₁, c₂, …, cₖ)` (`cᵢ ∈ 1, 2, …, nᵢ`), which is uniquely
+identified by its linear index. Each unique *combined* encoding is therefore
+represented by a single integer.
+
+When used with [`decode`](@ref), the integer symbol is converted to its corresponding
+cartesian coordinate, which is used to retrieve the decoded symbols for each of
+the encodings.
+
+The total number of outcomes is `prod(total_outcomes(e) for e in encodings)`.
+
+## Examples
+
+```julia
+using ComplexityMeasures
+
+# We want to encode the vector `x`.
+x = [0.9, 0.2, 0.3]
+
+# To do so, we will use a combination of first-difference encoding, amplitude encoding,
+# and ordinal pattern encoding.
+
+encodings = [
+ RelativeFirstDifferenceEncoding(0, 1; n = 2),
+ RelativeMeanEncoding(0, 1; n = 5),
+ OrdinalPatternEncoding(3) # x is a three-element vector
+ ]
+c = CombinationEncoding(encodings)
+
+# Encode `x` as integer
+ω = encode(c, x)
+
+# Decode symbol (into a vector of decodings, one for each encoding `e ∈ encodings`).
+# In this particular case, the first two elements will be left-bin edges, and
+# the last element will be the decoded ordinal pattern (indices that would sort `x`).
+d = decode(c, ω)
+```
+"""
+struct CombinationEncoding{VE, L, C} <: Encoding
+    # The sub-encodings being combined (an iterable of encodings).
+    encodings::VE
+
+    # Internal lookup tables translating between the per-encoding integer symbols
+    # and the single combined integer symbol (and back again).
+    linear_indices::L
+    cartesian_indices::C
+
+    # Inner constructor: stores the fields as given, after rejecting any nested
+    # `CombinationEncoding` among the sub-encodings.
+    function CombinationEncoding(encodings::VE, l::L, c::C) where {VE, L, C}
+        for e in encodings
+            e isa CombinationEncoding || continue
+            msg = "CombinationEncoding doesn't accept a CombinationEncoding as one " *
+                "of its sub-encodings."
+            throw(ArgumentError(msg))
+        end
+        return new{VE, L, C}(encodings, l, c)
+    end
+end
+
+# Build a `CombinationEncoding` from encodings given as separate arguments.
+# Each encoding `e` contributes the axis `1:total_outcomes(e)` to the index tables.
+#
+# The signature uses `Vararg{Encoding, N}` rather than `Vararg{<:Encoding, N}`: the
+# latter wraps `Vararg` directly in a `UnionAll`, which is deprecated in recent Julia
+# versions, while both forms accept exactly the same arguments.
+function CombinationEncoding(encodings::Vararg{Encoding, N}) where N
+    # `encodings` is a tuple, so `map` yields a tuple of ranges directly. This avoids
+    # splatting an intermediate `Vector`, whose length is unknown to the compiler.
+    ranges = map(e -> 1:total_outcomes(e), encodings)
+    linear_indices = LinearIndices(ranges)
+    cartesian_indices = CartesianIndices(ranges)
+    return CombinationEncoding(encodings, linear_indices, cartesian_indices)
+end
+# Convenience method: accept the encodings collected in a `Vector`.
+CombinationEncoding(encodings::Vector{<:Encoding}) = CombinationEncoding(encodings...)
+
+# Only vector-valued input is accepted. Some sub-encodings cannot encode a single
+# number at all (`RelativeFirstDifferenceEncoding` and `OrdinalPatternEncoding` are not
+# defined for scalars), so scalar input is ruled out for the combined encoding.
+function encode(encoding::CombinationEncoding, x::AbstractVector{<:Real})
+    # `length(x) >= 2` is deliberately not enforced: some combinations of encodings
+    # may work on single-element vectors (even though most don't).
+    # Encode `x` with every sub-encoding; the integers form a cartesian coordinate.
+    idxs = Tuple(encode(e, x) for e in encoding.encodings)
+    # The combined symbol is the linear index of that cartesian coordinate.
+    return convert(Int, encoding.linear_indices[idxs...])::Int
+end
+
+# Decode the combined integer symbol `ω` into a vector holding one decoded symbol per
+# sub-encoding, in the order of `encoding.encodings`.
+function decode(encoding::CombinationEncoding, ω::Int)
+    # Recover the per-encoding integer symbols from the combined symbol.
+    subs = Tuple(encoding.cartesian_indices[ω])
+    return [decode(e, subs[k]) for (k, e) in enumerate(encoding.encodings)]
+end
+
+# Number of distinct combined symbols: the product of the sub-encodings' outcome counts.
+total_outcomes(encoding::CombinationEncoding) = prod(total_outcomes, encoding.encodings)
diff --git a/src/encoding_implementations/encoding_implementations.jl b/src/encoding_implementations/encoding_implementations.jl
@@ -2,3 +2,6 @@ include("fasthist.jl")
 include("rectangular_binning.jl")
 include("gaussian_cdf.jl")
 include("ordinal_pattern.jl")
+include("relative_mean_encoding.jl")
+include("relative_first_difference_encoding.jl")
+include("combination_encoding.jl")
diff --git a/src/encoding_implementations/gaussian_cdf.jl b/src/encoding_implementations/gaussian_cdf.jl
@@ -4,19 +4,26 @@
 
 """
  GaussianCDFEncoding <: Encoding
- GaussianCDFEncoding(; μ, σ, c::Int = 3)
+ GaussianCDFEncoding(m::Int = 1; μ, σ, c::Int = 3)
+ GaussianCDFEncoding(x::AbstractVector; μ, σ, c::Int = 3)
 
-An encoding scheme that [`encode`](@ref)s a scalar value into one of the integers
+An encoding scheme that [`encode`](@ref)s a scalar or vector `x` into one of the integers
 `sᵢ ∈ [1, 2, …, c]` based on the normal cumulative distribution function (NCDF),
 and [`decode`](@ref)s the `sᵢ` into subintervals of `[0, 1]` (with some loss of information).
 
+The size of the input to be encoded must be known beforehand, and one must set
+`m = length(x)`, where `x` is the input (`m = 1` for scalars, `m ≥ 2` for vectors).
+Alternatively, provide the vector `x` to the constructor to infer `m` automatically.
+
 Notice that the decoding step does not yield an element of any outcome space of the
 estimators that use `GaussianCDFEncoding` internally, such as [`Dispersion`](@ref).
 That is because these estimators additionally delay embed the encoded data.
 
 ## Description
 
-`GaussianCDFEncoding` first maps an input point ``x`` (scalar) to a new real number
+### Encoding/decoding scalars
+
+`GaussianCDFEncoding` first maps an input scalar ``x`` to a new real number
 ``y_ \\in [0, 1]`` by using the normal cumulative distribution function (CDF) with the
 given mean `μ` and standard deviation `σ`, according to the map
 
@@ -31,6 +38,20 @@
 
 Because of the floor operation, some information is lost, so when used with
 [`decode`](@ref), each decoded `sᵢ` is mapped to a *subinterval* of `[0, 1]`.
+This subinterval is returned as a length-`1` `Vector{SVector}`.
+
+### Encoding/decoding vectors
+
+If `GaussianCDFEncoding` is used with a vector `x`, then each element of `x` is
+encoded separately, resulting in a length-`m` sequence of integers (`m = length(x)`),
+which may be treated as a `CartesianIndex`. The encoded symbol `s ∈ [1, 2, …, c^m]` is
+then just the linear index corresponding to this cartesian index (similar to how
+[`CombinationEncoding`](@ref) works).
+
+When [`decode`](@ref)d, the integer symbol `s` is converted back into its `CartesianIndex`
+representation, which is just a sequence of integers that refer to subdivisions
+of the `[0, 1]` interval. The relevant subintervals are then returned as a length-`m`
+`Vector{SVector}`.
 
 ## Examples
 
@@ -54,32 +75,91 @@
  0.4
  0.6
 ```
+
+One can also encode the entire vector as an integer.
+
+```jldoctest
+julia> using ComplexityMeasures, Statistics
+
+julia> x = [0.1, 0.4, 0.7, -2.1, 8.0];
+
+julia> μ, σ = mean(x), std(x); encoding = GaussianCDFEncoding(x; μ, σ, c = 2)
+GaussianCDFEncoding(m=5; c=2, μ=1.42, σ=3.840182287340016)
+
+julia> symbol = encode(encoding, x)
+17
+
+julia> decode(encoding, symbol)
+5-element Vector{SVector{1, Float64}}:
+ [0.0]
+ [0.0]
+ [0.0]
+ [0.0]
+ [0.5000000000000001]
+```
 """
-struct GaussianCDFEncoding{T} <: Encoding
+struct GaussianCDFEncoding{m, T, L <: LinearIndices, C <: CartesianIndices, R} <: Encoding
+ m::Int
  c::Int
  σ::T
  μ::T
- # We require the input data, because we need σ and μ for encoding single values.
- function GaussianCDFEncoding(; μ::T, σ::T, c::Int = 3) where T
- new{T}(c, σ, μ)
+
+ # Internal fields: `LinearIndices`/`CartesianIndices` over `m` axes `1:c`, used for
+ # encoding/decoding; `binencoder` discretizes the interval [0, 1].
+ linear_indices::L
+ cartesian_indices::C
+ binencoder::R # RectangularBinEncoding
+
+ # `m` fixes the length of the inputs that can be encoded (`m = 1` for scalars).
+ function GaussianCDFEncoding(m::Int = 1; μ::T, σ::T, c::Int = 3) where T
+ m >= 1 || throw(ArgumentError("m must be an integer ≥ 1. Got $m."))
+ ranges = tuple([1:c for i in 1:m]...)
+ cartesian_indices = CartesianIndices(ranges)
+ linear_indices = LinearIndices(ranges)
+ L = typeof(linear_indices)
+ C = typeof(cartesian_indices)
+ binencoder = RectangularBinEncoding(FixedRectangularBinning(0, 1, c + 1))
+ R = typeof(binencoder)
+ new{m, T, L, C, R}(m, c, σ, μ, linear_indices, cartesian_indices, binencoder)
  end
 end
+function GaussianCDFEncoding(x::AbstractVector; kwargs...)
+    # Infer `m` from the length of `x`; all keywords are forwarded unchanged.
+    return GaussianCDFEncoding(length(x); kwargs...)
+end
+
+# One-line display of the form `GaussianCDFEncoding(m=…; c=…, μ=…, σ=…)`.
+function Base.show(io::IO, e::GaussianCDFEncoding{m, T, L, C}) where {m, T, L, C}
+    text = "GaussianCDFEncoding(m=$m; c=$(e.c), μ=$(e.μ), σ=$(e.σ))"
+    print(io, text)
+    return nothing
+end
 
-total_outcomes(encoding::GaussianCDFEncoding) = encoding.c
+# Total number of symbols: each of the `m` encoded elements takes one of `c` values,
+# which gives `c^m` combinations.
+total_outcomes(encoding::GaussianCDFEncoding{m}) where {m} = encoding.c^m
 
 gaussian(x, μ, σ) = exp((-(x - μ)^2)/(2σ^2))
 
 function encode(encoding::GaussianCDFEncoding, x::Real)
- (; c, σ, μ) = encoding
+ σ, μ = encoding.σ, encoding.μ
  # We only need the value of the integral (not the error), so
  # index first element returned from quadgk
  k = 1/(σ*sqrt(2π))
  y = k * first(quadgk(x -> gaussian(x, μ, σ), -Inf, x))
- return floor(Int, y / (1 / c)) + 1
+ # The integral estimate sometimes returns a value slightly above 1.0, so we clamp
+ # it to ensure that all points fall within the FixedRectangularBinning.
+ y_corrected = min(y, 1.0)
+ return encode(encoding.binencoder, y_corrected)
 end
 
-function decode(encoding::GaussianCDFEncoding, i::Int)
- c = encoding.c
- lower_interval_bound = (i - 1)/(c)
- return SVector(lower_interval_bound, prevfloat(lower_interval_bound + 1/c))
+# Encode a length-`m` vector as a single integer: every element is encoded on its own,
+# and the resulting integers are used as a cartesian index into `linear_indices`.
+function encode(encoding::GaussianCDFEncoding{m}, x::AbstractVector) where m
+    n = length(x)
+    n == m || throw(
+        ArgumentError("length(`x`) must equal `m` (got length(x)=$n, m=$m)")
+    )
+    idxs = map(xᵢ -> encode(encoding, xᵢ), x)
+    return convert(Int, encoding.linear_indices[idxs...])::Int
+end
+
+# Decode the integer symbol `ω` into a vector with one `binencoder`-decoded entry per
+# dimension of the corresponding cartesian index.
+function decode(encoding::GaussianCDFEncoding, ω::Int)
+    cidx = encoding.cartesian_indices[ω]
+    return [decode(encoding.binencoder, cidx[k]) for k in 1:length(cidx)]
 end
diff --git a/src/encoding_implementations/ordinal_pattern.jl b/src/encoding_implementations/ordinal_pattern.jl
@@ -60,6 +60,10 @@
  return OrdinalPatternEncoding{m, F}(zero(MVector{m, Int}), lt)
 end
 
+# Print the encoding together with its type parameter `M` and its `lt` field.
+function Base.show(io::IO, e::OrdinalPatternEncoding{M}) where {M}
+    # Interpolate the actual type parameter; a literal `{3}` would misreport every
+    # encoding whose parameter differs from 3.
+    print(io, "OrdinalPatternEncoding{$M}(lt = $(e.lt))")
+end
+
 # So that SymbolicPerm stuff fallback here
 total_outcomes(::OrdinalPatternEncoding{m}) where {m} = factorial(m)
 outcome_space(::OrdinalPatternEncoding{m}) where {m} = permutations(1:m) |> collect