diff --git a/base/deprecated.jl b/base/deprecated.jl index cd37516e303dc..11d2cd9a579d1 100644 --- a/base/deprecated.jl +++ b/base/deprecated.jl @@ -1366,9 +1366,6 @@ export readandwrite @deprecate findmax(A::AbstractArray, dims) findmax(A, dims=dims) @deprecate findmin(A::AbstractArray, dims) findmin(A, dims=dims) -@deprecate mean(A::AbstractArray, dims) mean(A, dims=dims) -@deprecate median(A::AbstractArray, dims; kwargs...) median(A; kwargs..., dims=dims) - @deprecate mapreducedim(f, op, A::AbstractArray, dims) mapreduce(f, op, A, dims=dims) @deprecate mapreducedim(f, op, A::AbstractArray, dims, v0) mapreduce(f, op, v0, A, dims=dims) @deprecate reducedim(op, A::AbstractArray, dims) reduce(op, A, dims=dims) @@ -1692,12 +1689,6 @@ end @deprecate ipermute!(a, p::AbstractVector) invpermute!(a, p) # #27140, #27152 -@deprecate_moved cor "StatsBase" -@deprecate_moved cov "StatsBase" -@deprecate_moved std "StatsBase" -@deprecate_moved stdm "StatsBase" -@deprecate_moved var "StatsBase" -@deprecate_moved varm "StatsBase" @deprecate_moved linreg "StatsBase" # ?? more special functions to SpecialFunctions.jl diff --git a/base/exports.jl b/base/exports.jl index 37da399892e43..d87889eb047df 100644 --- a/base/exports.jl +++ b/base/exports.jl @@ -626,15 +626,6 @@ export get_zero_subnormals, set_zero_subnormals, -# statistics - mean!, - mean, - median!, - median, - middle, - quantile!, - quantile, - # iteration done, next, diff --git a/base/range.jl b/base/range.jl index 1aaa5c64235f8..747148a17eba4 100644 --- a/base/range.jl +++ b/base/range.jl @@ -847,13 +847,6 @@ function sum(r::AbstractRange{<:Real}) : (step(r) * l) * ((l-1)>>1)) end -function mean(r::AbstractRange{<:Real}) - isempty(r) && throw(ArgumentError("mean of an empty range is undefined")) - (first(r) + last(r)) / 2 -end - -median(r::AbstractRange{<:Real}) = mean(r) - function _in_range(x, r::AbstractRange) if step(r) == 0 return !isempty(r) && first(r) == x diff --git a/base/statistics.jl b/base/statistics.jl deleted file mode 100644 index 31f5630635b76..0000000000000 --- a/base/statistics.jl +++ /dev/null @@ -1,298 +0,0 @@ -# This file is a part of Julia. License is MIT: https://julialang.org/license - -##### mean ##### - -""" - mean(f::Function, v) - -Apply the function `f` to each element of `v` and take the mean. - -```jldoctest -julia> mean(√, [1, 2, 3]) -1.3820881233139908 - -julia> mean([√1, √2, √3]) -1.3820881233139908 -``` -""" -function mean(f::Callable, iterable) - y = iterate(iterable) - if y === nothing - throw(ArgumentError("mean of empty collection undefined: $(repr(iterable))")) - end - count = 1 - value, state = y - f_value = f(value) - total = reduce_first(add_sum, f_value) - y = iterate(iterable, state) - while y !== nothing - value, state = y - total += f(value) - count += 1 - y = iterate(iterable, state) - end - return total/count -end -mean(iterable) = mean(identity, iterable) -mean(f::Callable, A::AbstractArray) = sum(f, A) / _length(A) - -""" - mean!(r, v) - -Compute the mean of `v` over the singleton dimensions of `r`, and write results to `r`. - -# Examples -```jldoctest -julia> v = [1 2; 3 4] -2×2 Array{Int64,2}: - 1 2 - 3 4 - -julia> mean!([1., 1.], v) -2-element Array{Float64,1}: - 1.5 - 3.5 - -julia> mean!([1. 1.], v) -1×2 Array{Float64,2}: - 2.0 3.0 -``` -""" -function mean!(R::AbstractArray, A::AbstractArray) - sum!(R, A; init=true) - x = max(1, _length(R)) // _length(A) - R .= R .* x - return R -end - -""" - mean(v; dims) - -Compute the mean of whole array `v`, or optionally along the given dimensions. - -!!! note - Julia does not ignore `NaN` values in the computation. Use the [`missing`](@ref) type - to represent missing values, and the [`skipmissing`](@ref) function to omit them. -""" -mean(A::AbstractArray; dims=:) = _mean(A, dims) - -_mean(A::AbstractArray{T}, region) where {T} = mean!(reducedim_init(t -> t/2, +, A, region), A) -_mean(A::AbstractArray, ::Colon) = sum(A) / _length(A) - -##### median & quantiles ##### - -""" - middle(x) - -Compute the middle of a scalar value, which is equivalent to `x` itself, but of the type of `middle(x, x)` for consistency. -""" -middle(x::Union{Bool,Int8,Int16,Int32,Int64,Int128,UInt8,UInt16,UInt32,UInt64,UInt128}) = Float64(x) -# Specialized functions for real types allow for improved performance -middle(x::AbstractFloat) = x -middle(x::Real) = (x + zero(x)) / 1 - -""" - middle(x, y) - -Compute the middle of two reals `x` and `y`, which is -equivalent in both value and type to computing their mean (`(x + y) / 2`). -""" -middle(x::Real, y::Real) = x/2 + y/2 - -""" - middle(range) - -Compute the middle of a range, which consists of computing the mean of its extrema. -Since a range is sorted, the mean is performed with the first and last element. - -```jldoctest -julia> middle(1:10) -5.5 -``` -""" -middle(a::AbstractRange) = middle(a[1], a[end]) - -""" - middle(a) - -Compute the middle of an array `a`, which consists of finding its -extrema and then computing their mean. - -```jldoctest -julia> a = [1,2,3.6,10.9] -4-element Array{Float64,1}: - 1.0 - 2.0 - 3.6 - 10.9 - -julia> middle(a) -5.95 -``` -""" -middle(a::AbstractArray) = ((v1, v2) = extrema(a); middle(v1, v2)) - -""" - median!(v) - -Like [`median`](@ref), but may overwrite the input vector. -""" -function median!(v::AbstractVector) - isempty(v) && throw(ArgumentError("median of an empty array is undefined, $(repr(v))")) - if eltype(v)<:AbstractFloat - @inbounds for x in v - isnan(x) && return x - end - end - inds = axes(v, 1) - n = _length(inds) - mid = div(first(inds)+last(inds),2) - if isodd(n) - return middle(partialsort!(v,mid)) - else - m = partialsort!(v, mid:mid+1) - return middle(m[1], m[2]) - end -end -median!(v::AbstractArray) = median!(vec(v)) - -""" - median(v; dims) - -Compute the median of an entire array `v`, or, optionally, -along the given dimensions. For an even number of -elements no exact median element exists, so the result is -equivalent to calculating mean of two median elements. - -!!! note - Julia does not ignore `NaN` values in the computation. Use the [`missing`](@ref) type - to represent missing values, and the [`skipmissing`](@ref) function to omit them. -""" -median(v::AbstractArray; dims=:) = _median(v, dims) - -_median(v::AbstractArray, dims) = mapslices(median!, v, dims = dims) - -_median(v::AbstractArray{T}, ::Colon) where {T} = median!(copyto!(Array{T,1}(undef, _length(v)), v)) - -# for now, use the R/S definition of quantile; may want variants later -# see ?quantile in R -- this is type 7 -""" - quantile!([q, ] v, p; sorted=false) - -Compute the quantile(s) of a vector `v` at the probability or probabilities `p`, which -can be given as a single value, a vector, or a tuple. If `p` is a vector, an optional -output array `q` may also be specified. (If not provided, a new output array is created.) -The keyword argument `sorted` indicates whether `v` can be assumed to be sorted; if -`false` (the default), then the elements of `v` may be partially sorted. - -The elements of `p` should be on the interval [0,1], and `v` should not have any `NaN` -values. - -Quantiles are computed via linear interpolation between the points `((k-1)/(n-1), v[k])`, -for `k = 1:n` where `n = length(v)`. This corresponds to Definition 7 of Hyndman and Fan -(1996), and is the same as the R default. - -!!! note - Julia does not ignore `NaN` values in the computation: `quantile!` will - throw an `ArgumentError` in the presence of `NaN` values in the data array. - Use the [`missing`](@ref) type to represent missing values, and the - [`skipmissing`](@ref) function to omit them. - -* Hyndman, R.J and Fan, Y. (1996) "Sample Quantiles in Statistical Packages", - *The American Statistician*, Vol. 50, No. 4, pp. 361-365 -""" -function quantile!(q::AbstractArray, v::AbstractVector, p::AbstractArray; - sorted::Bool=false) - if size(p) != size(q) - throw(DimensionMismatch("size of p, $(size(p)), must equal size of q, $(size(q))")) - end - isempty(q) && return q - - minp, maxp = extrema(p) - _quantilesort!(v, sorted, minp, maxp) - - for (i, j) in zip(eachindex(p), eachindex(q)) - @inbounds q[j] = _quantile(v,p[i]) - end - return q -end - -quantile!(v::AbstractVector, p::AbstractArray; sorted::Bool=false) = - quantile!(similar(p,float(eltype(v))), v, p; sorted=sorted) - -quantile!(v::AbstractVector, p::Real; sorted::Bool=false) = - _quantile(_quantilesort!(v, sorted, p, p), p) - -function quantile!(v::AbstractVector, p::Tuple{Vararg{Real}}; sorted::Bool=false) - isempty(p) && return () - minp, maxp = extrema(p) - _quantilesort!(v, sorted, minp, maxp) - return map(x->_quantile(v, x), p) -end - -# Function to perform partial sort of v for quantiles in given range -function _quantilesort!(v::AbstractArray, sorted::Bool, minp::Real, maxp::Real) - isempty(v) && throw(ArgumentError("empty data vector")) - - if !sorted - lv = length(v) - lo = floor(Int,1+minp*(lv-1)) - hi = ceil(Int,1+maxp*(lv-1)) - - # only need to perform partial sort - sort!(v, 1, lv, Sort.PartialQuickSort(lo:hi), Base.Sort.Forward) - end - isnan(v[end]) && throw(ArgumentError("quantiles are undefined in presence of NaNs")) - return v -end - -# Core quantile lookup function: assumes `v` sorted -@inline function _quantile(v::AbstractVector, p::Real) - 0 <= p <= 1 || throw(ArgumentError("input probability out of [0,1] range")) - - lv = length(v) - f0 = (lv - 1)*p # 0-based interpolated index - t0 = trunc(f0) - h = f0 - t0 - i = trunc(Int,t0) + 1 - - T = promote_type(eltype(v), typeof(v[1]*h)) - - if h == 0 - return convert(T, v[i]) - else - a = v[i] - b = v[i+1] - if isfinite(a) && isfinite(b) - return convert(T, a + h*(b-a)) - else - return convert(T, (1-h)*a + h*b) - end - end -end - - -""" - quantile(v, p; sorted=false) - -Compute the quantile(s) of a vector `v` at a specified probability or vector or tuple of -probabilities `p`. The keyword argument `sorted` indicates whether `v` can be assumed to -be sorted. - -The `p` should be on the interval [0,1], and `v` should not have any `NaN` values. - -Quantiles are computed via linear interpolation between the points `((k-1)/(n-1), v[k])`, -for `k = 1:n` where `n = length(v)`. This corresponds to Definition 7 of Hyndman and Fan -(1996), and is the same as the R default. - -!!! note - Julia does not ignore `NaN` values in the computation: `quantile` will - throw an `ArgumentError` in the presence of `NaN` values in the data array. - Use the [`missing`](@ref) type to represent missing values, and the - [`skipmissing`](@ref) function to omit them. - -- Hyndman, R.J and Fan, Y. (1996) "Sample Quantiles in Statistical Packages", - *The American Statistician*, Vol. 50, No. 4, pp. 361-365 -""" -quantile(v::AbstractVector, p; sorted::Bool=false) = - quantile!(sorted ? v : copymutable(v), p; sorted=sorted) diff --git a/base/sysimg.jl b/base/sysimg.jl index cb82306742dc7..bd5f2ff9d515f 100644 --- a/base/sysimg.jl +++ b/base/sysimg.jl @@ -413,9 +413,6 @@ using .StackTraces include("initdefs.jl") -# statistics -include("statistics.jl") - # worker threads include("threadcall.jl") @@ -525,6 +522,7 @@ let :Pkg, :Test, :REPL, + :Statistics, ] maxlen = maximum(textwidth.(string.(stdlibs))) @@ -895,6 +893,20 @@ end @deprecate_stdlib send Sockets true @deprecate_stdlib TCPSocket Sockets true @deprecate_stdlib UDPSocket Sockets true + + @deprecate_stdlib cor Statistics true + @deprecate_stdlib cov Statistics true + @deprecate_stdlib std Statistics true + @deprecate_stdlib stdm Statistics true + @deprecate_stdlib var Statistics true + @deprecate_stdlib varm Statistics true + @deprecate_stdlib mean! Statistics true + @deprecate_stdlib mean Statistics true + @deprecate_stdlib median! Statistics true + @deprecate_stdlib median Statistics true + @deprecate_stdlib middle Statistics true + @deprecate_stdlib quantile! Statistics true + @deprecate_stdlib quantile Statistics true end end diff --git a/doc/src/base/math.md b/doc/src/base/math.md index 91e8d07fa6501..0fa973df3c828 100644 --- a/doc/src/base/math.md +++ b/doc/src/base/math.md @@ -177,15 +177,3 @@ Base.widemul Base.Math.@evalpoly Base.FastMath.@fastmath ``` - -## Statistics - -```@docs -Base.mean -Base.mean! -Base.middle -Base.median -Base.median! -Base.quantile -Base.quantile! -``` diff --git a/doc/src/manual/interfaces.md b/doc/src/manual/interfaces.md index abdf61186cf3a..618f55f79e583 100644 --- a/doc/src/manual/interfaces.md +++ b/doc/src/manual/interfaces.md @@ -83,18 +83,21 @@ julia> for i in Squares(7) 49 ``` -We can use many of the builtin methods that work with iterables, like [`in`](@ref), -[`sum`](@ref) and [`mean`](@ref): +We can use many of the builtin methods that work with iterables, +like [`in`](@ref), or [`mean`](@ref) and [`std`](@ref) from the +`Statistics` standard library module: ```jldoctest squaretype julia> 25 in Squares(10) true -julia> sum(Squares(100)) -338350 +julia> using Statistics julia> mean(Squares(100)) 3383.5 + +julia> std(Squares(100)) +3024.355854282583 ``` There are a few more methods we can extend to give Julia more information about this iterable @@ -388,8 +391,8 @@ julia> A[SquaresVector(3)] 4.0 9.0 -julia> mean(A) -5.0 +julia> sum(A) +45.0 ``` If you are defining an array type that allows non-traditional indexing (indices that start at diff --git a/doc/src/manual/missing.md b/doc/src/manual/missing.md index 4467f587da162..67b349f57a157 100644 --- a/doc/src/manual/missing.md +++ b/doc/src/manual/missing.md @@ -294,7 +294,7 @@ julia> sum(skipmissing([1, missing])) This convenience function returns an iterator which filters out `missing` values efficiently. It can therefore be used with any function which supports iterators -```jldoctest +```jldoctest; setup = :(using Statistics) julia> maximum(skipmissing([3, missing, 2, 1])) 3 diff --git a/stdlib/Markdown/test/runtests.jl b/stdlib/Markdown/test/runtests.jl index 7cb5aee14d3ee..e1d03fdffaffd 100644 --- a/stdlib/Markdown/test/runtests.jl +++ b/stdlib/Markdown/test/runtests.jl @@ -427,20 +427,20 @@ end ref(x) = Reference(x) -ref(mean) +ref(sum) show(io::IO, m::MIME"text/plain", r::Reference) = print(io, "$(r.ref) (see Julia docs)") -mean_ref = md"Behaves like $(ref(mean))" -@test plain(mean_ref) == "Behaves like mean (see Julia docs)\n" -@test html(mean_ref) == "

Behaves like mean (see Julia docs)

\n" +sum_ref = md"Behaves like $(ref(sum))" +@test plain(sum_ref) == "Behaves like sum (see Julia docs)\n" +@test html(sum_ref) == "

Behaves like sum (see Julia docs)

\n" show(io::IO, m::MIME"text/html", r::Reference) = Markdown.withtag(io, :a, :href=>"test") do Markdown.htmlesc(io, Markdown.plaininline(r)) end -@test html(mean_ref) == "

Behaves like mean (see Julia docs)

\n" +@test html(sum_ref) == "

Behaves like sum (see Julia docs)

\n" @test md""" ````julia diff --git a/stdlib/Pkg/src/GraphType.jl b/stdlib/Pkg/src/GraphType.jl index e2af668c96406..6a879f0fa2676 100644 --- a/stdlib/Pkg/src/GraphType.jl +++ b/stdlib/Pkg/src/GraphType.jl @@ -1422,7 +1422,7 @@ function prune_graph!(graph::Graph) # Done - log_event_global!(graph, "pruned graph — stats (n. of packages, mean connectivity): before = ($np,$(mean(spp))) after = ($new_np,$(mean(new_spp)))") + log_event_global!(graph, "pruned graph — stats (n. of packages, mean connectivity): before = ($np,$(sum(spp)/length(spp))) after = ($new_np,$(sum(new_spp)/length(new_spp)))") # Replace old data with new data.pkgs = new_pkgs diff --git a/stdlib/REPL/src/docview.jl b/stdlib/REPL/src/docview.jl index f11aaef8fb26f..1b0d92691f9be 100644 --- a/stdlib/REPL/src/docview.jl +++ b/stdlib/REPL/src/docview.jl @@ -378,7 +378,7 @@ function fuzzyscore(needle, haystack) score += (acro ? 2 : 1)*length(is) # Matched characters score -= 2(length(needle)-length(is)) # Missing characters !acro && (score -= avgdistance(is)/10) # Contiguous - !isempty(is) && (score -= mean(is)/100) # Closer to beginning + !isempty(is) && (score -= sum(is)/length(is)/100) # Closer to beginning return score end diff --git a/stdlib/SparseArrays/test/sparse.jl b/stdlib/SparseArrays/test/sparse.jl index 6309efc11b8a0..ad17fc1d3edb1 100644 --- a/stdlib/SparseArrays/test/sparse.jl +++ b/stdlib/SparseArrays/test/sparse.jl @@ -601,7 +601,7 @@ end @test length(sprb45) == 20 sprb45nnzs[i] = sum(sprb45)[1] end - @test 4 <= mean(sprb45nnzs) <= 16 + @test 4 <= sum(sprb45nnzs)/length(sprb45nnzs) <= 16 end @testset "issue #5853, sparse diff" begin diff --git a/stdlib/Statistics/Project.toml b/stdlib/Statistics/Project.toml new file mode 100644 index 0000000000000..772693d53e56e --- /dev/null +++ b/stdlib/Statistics/Project.toml @@ -0,0 +1,6 @@ +name = "Statistics" +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[deps] +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" diff --git a/stdlib/Statistics/docs/src/index.md b/stdlib/Statistics/docs/src/index.md new file mode 100644 index 0000000000000..5a684541f015f --- /dev/null +++ b/stdlib/Statistics/docs/src/index.md @@ -0,0 +1,27 @@ +# Statistics + +```@meta +DocTestSetup = :(using Statistics) +``` + +The Statistics module contains basic statistics functionality. + +```@docs +Statistics.std +Statistics.stdm +Statistics.var +Statistics.varm +Statistics.cor +Statistics.cov +Statistics.mean! +Statistics.mean +Statistics.median! +Statistics.median +Statistics.middle +Statistics.quantile! +Statistics.quantile +``` + +```@meta +DocTestSetup = nothing +``` diff --git a/stdlib/Statistics/src/Statistics.jl b/stdlib/Statistics/src/Statistics.jl new file mode 100644 index 0000000000000..be227b7db7359 --- /dev/null +++ b/stdlib/Statistics/src/Statistics.jl @@ -0,0 +1,906 @@ +# This file is a part of Julia. License is MIT: https://julialang.org/license + +""" + Statistics + +Standard library module for basic statistics functionality. +""" +module Statistics + +using LinearAlgebra, SparseArrays + +export cor, cov, std, stdm, var, varm, mean!, mean, + median!, median, middle, quantile!, quantile + +##### mean ##### + +""" + mean(f::Function, v) + +Apply the function `f` to each element of `v` and take the mean. + +```jldoctest +julia> mean(√, [1, 2, 3]) +1.3820881233139908 + +julia> mean([√1, √2, √3]) +1.3820881233139908 +``` +""" +function mean(f::Base.Callable, iterable) + y = iterate(iterable) + if y === nothing + throw(ArgumentError("mean of empty collection undefined: $(repr(iterable))")) + end + count = 1 + value, state = y + f_value = f(value) + total = Base.reduce_first(Base.add_sum, f_value) + y = iterate(iterable, state) + while y !== nothing + value, state = y + total += f(value) + count += 1 + y = iterate(iterable, state) + end + return total/count +end +mean(iterable) = mean(identity, iterable) +mean(f::Base.Callable, A::AbstractArray) = sum(f, A) / Base._length(A) + +""" + mean!(r, v) + +Compute the mean of `v` over the singleton dimensions of `r`, and write results to `r`. + +# Examples +```jldoctest +julia> v = [1 2; 3 4] +2×2 Array{Int64,2}: + 1 2 + 3 4 + +julia> mean!([1., 1.], v) +2-element Array{Float64,1}: + 1.5 + 3.5 + +julia> mean!([1. 1.], v) +1×2 Array{Float64,2}: + 2.0 3.0 +``` +""" +function mean!(R::AbstractArray, A::AbstractArray) + sum!(R, A; init=true) + x = max(1, Base._length(R)) // Base._length(A) + R .= R .* x + return R +end + +""" + mean(v; dims) + +Compute the mean of whole array `v`, or optionally along the given dimensions. + +!!! note + Julia does not ignore `NaN` values in the computation. Use the [`missing`](@ref) type + to represent missing values, and the [`skipmissing`](@ref) function to omit them. +""" +mean(A::AbstractArray; dims=:) = _mean(A, dims) + +_mean(A::AbstractArray{T}, region) where {T} = mean!(Base.reducedim_init(t -> t/2, +, A, region), A) +_mean(A::AbstractArray, ::Colon) = sum(A) / Base._length(A) + +function mean(r::AbstractRange{<:Real}) + isempty(r) && throw(ArgumentError("mean of an empty range is undefined")) + (first(r) + last(r)) / 2 +end + +median(r::AbstractRange{<:Real}) = mean(r) + +##### variances ##### + +# faster computation of real(conj(x)*y) +realXcY(x::Real, y::Real) = x*y +realXcY(x::Complex, y::Complex) = real(x)*real(y) + imag(x)*imag(y) + +var(iterable; corrected::Bool=true, mean=nothing) = _var(iterable, corrected, mean) + +function _var(iterable, corrected::Bool, mean) + y = iterate(iterable) + if y === nothing + throw(ArgumentError("variance of empty collection undefined: $(repr(iterable))")) + end + count = 1 + value, state = y + y = iterate(iterable, state) + if mean === nothing + # Use Welford algorithm as seen in (among other places) + # Knuth's TAOCP, Vol 2, page 232, 3rd edition. + M = value / 1 + S = real(zero(M)) + while y !== nothing + value, state = y + y = iterate(iterable, state) + count += 1 + new_M = M + (value - M) / count + S = S + realXcY(value - M, value - new_M) + M = new_M + end + return S / (count - Int(corrected)) + elseif isa(mean, Number) # mean provided + # Cannot use a compensated version, e.g. the one from + # "Updating Formulae and a Pairwise Algorithm for Computing Sample Variances." + # by Chan, Golub, and LeVeque, Technical Report STAN-CS-79-773, + # Department of Computer Science, Stanford University, + # because user can provide mean value that is different to mean(iterable) + sum2 = abs2(value - mean::Number) + while y !== nothing + value, state = y + y = iterate(iterable, state) + count += 1 + sum2 += abs2(value - mean) + end + return sum2 / (count - Int(corrected)) + else + throw(ArgumentError("invalid value of mean, $(mean)::$(typeof(mean))")) + end +end + +centralizedabs2fun(m) = x -> abs2.(x - m) +centralize_sumabs2(A::AbstractArray, m) = + mapreduce(centralizedabs2fun(m), +, A) +centralize_sumabs2(A::AbstractArray, m, ifirst::Int, ilast::Int) = + mapreduce_impl(centralizedabs2fun(m), +, A, ifirst, ilast) + +function centralize_sumabs2!(R::AbstractArray{S}, A::AbstractArray, means::AbstractArray) where S + # following the implementation of _mapreducedim! at base/reducedim.jl + lsiz = Base.check_reducedims(R,A) + isempty(R) || fill!(R, zero(S)) + isempty(A) && return R + + if Base.has_fast_linear_indexing(A) && lsiz > 16 + nslices = div(Base._length(A), lsiz) + ibase = first(LinearIndices(A))-1 + for i = 1:nslices + @inbounds R[i] = centralize_sumabs2(A, means[i], ibase+1, ibase+lsiz) + ibase += lsiz + end + return R + end + indsAt, indsRt = Base.safe_tail(axes(A)), Base.safe_tail(axes(R)) # handle d=1 manually + keep, Idefault = Broadcast.shapeindexer(indsRt) + if Base.reducedim1(R, A) + i1 = first(Base.indices1(R)) + @inbounds for IA in CartesianIndices(indsAt) + IR = Broadcast.newindex(IA, keep, Idefault) + r = R[i1,IR] + m = means[i1,IR] + @simd for i in axes(A, 1) + r += abs2(A[i,IA] - m) + end + R[i1,IR] = r + end + else + @inbounds for IA in CartesianIndices(indsAt) + IR = Broadcast.newindex(IA, keep, Idefault) + @simd for i in axes(A, 1) + R[i,IR] += abs2(A[i,IA] - means[i,IR]) + end + end + end + return R +end + +function varm!(R::AbstractArray{S}, A::AbstractArray, m::AbstractArray; corrected::Bool=true) where S + if isempty(A) + fill!(R, convert(S, NaN)) + else + rn = div(Base._length(A), Base._length(R)) - Int(corrected) + centralize_sumabs2!(R, A, m) + R .= R .* (1 // rn) + end + return R +end + +""" + varm(v, m; dims, corrected::Bool=true) + +Compute the sample variance of a collection `v` with known mean(s) `m`, +optionally over the given dimensions. `m` may contain means for each dimension of +`v`. If `corrected` is `true`, then the sum is scaled with `n-1`, +whereas the sum is scaled with `n` if `corrected` is `false` where `n = length(x)`. + +!!! note + Julia does not ignore `NaN` values in the computation. Use the [`missing`](@ref) type + to represent missing values, and the [`skipmissing`](@ref) function to omit them. +""" +varm(A::AbstractArray, m::AbstractArray; corrected::Bool=true, dims=:) = _varm(A, m, corrected, dims) + +_varm(A::AbstractArray{T}, m, corrected::Bool, region) where {T} = + varm!(Base.reducedim_init(t -> abs2(t)/2, +, A, region), A, m; corrected=corrected) + +varm(A::AbstractArray, m; corrected::Bool=true) = _varm(A, m, corrected, :) + +function _varm(A::AbstractArray{T}, m, corrected::Bool, ::Colon) where T + n = Base._length(A) + n == 0 && return typeof((abs2(zero(T)) + abs2(zero(T)))/2)(NaN) + return centralize_sumabs2(A, m) / (n - Int(corrected)) +end + + +""" + var(v; dims, corrected::Bool=true, mean=nothing) + +Compute the sample variance of a vector or array `v`, optionally along the given dimensions. +The algorithm will return an estimator of the generative distribution's variance +under the assumption that each entry of `v` is an IID drawn from that generative +distribution. This computation is equivalent to calculating `sum(abs2, v - mean(v)) / +(length(v) - 1)`. If `corrected` is `true`, then the sum is scaled with `n-1`, +whereas the sum is scaled with `n` if `corrected` is `false` where `n = length(x)`. +The mean `mean` over the region may be provided. + +!!! note + Julia does not ignore `NaN` values in the computation. Use the [`missing`](@ref) type + to represent missing values, and the [`skipmissing`](@ref) function to omit them. +""" +var(A::AbstractArray; corrected::Bool=true, mean=nothing, dims=:) = _var(A, corrected, mean, dims) + +_var(A::AbstractArray, corrected::Bool, mean, dims) = + varm(A, something(mean, Statistics.mean(A, dims=dims)); corrected=corrected, dims=dims) + +_var(A::AbstractArray, corrected::Bool, mean, ::Colon) = + real(varm(A, something(mean, Statistics.mean(A)); corrected=corrected)) + +varm(iterable, m; corrected::Bool=true) = _var(iterable, corrected, m) + +## variances over ranges + +varm(v::AbstractRange, m::AbstractArray) = range_varm(v, m) +varm(v::AbstractRange, m) = range_varm(v, m) + +function range_varm(v::AbstractRange, m) + f = first(v) - m + s = step(v) + l = length(v) + vv = f^2 * l / (l - 1) + f * s * l + s^2 * l * (2 * l - 1) / 6 + if l == 0 || l == 1 + return typeof(vv)(NaN) + end + return vv +end + +function var(v::AbstractRange) + s = step(v) + l = length(v) + vv = abs2(s) * (l + 1) * l / 12 + if l == 0 || l == 1 + return typeof(vv)(NaN) + end + return vv +end + + +##### standard deviation ##### + +function sqrt!(A::AbstractArray) + for i in eachindex(A) + @inbounds A[i] = sqrt(A[i]) + end + A +end + +stdm(A::AbstractArray, m; corrected::Bool=true) = + sqrt.(varm(A, m; corrected=corrected)) + +""" + std(v; corrected::Bool=true, mean=nothing, dims) + +Compute the sample standard deviation of a vector or array `v`, optionally along the given +dimensions. The algorithm returns an estimator of the generative distribution's standard +deviation under the assumption that each entry of `v` is an IID drawn from that generative +distribution. This computation is equivalent to calculating `sqrt(sum((v - mean(v)).^2) / +(length(v) - 1))`. A pre-computed `mean` may be provided. If `corrected` is `true`, +then the sum is scaled with `n-1`, whereas the sum is scaled with `n` if `corrected` is +`false` where `n = length(x)`. + +!!! note + Julia does not ignore `NaN` values in the computation. Use the [`missing`](@ref) type + to represent missing values, and the [`skipmissing`](@ref) function to omit them. +""" +std(A::AbstractArray; corrected::Bool=true, mean=nothing, dims=:) = _std(A, corrected, mean, dims) + +_std(A::AbstractArray, corrected::Bool, mean, dims) = + sqrt.(var(A; corrected=corrected, mean=mean, dims=dims)) + +_std(A::AbstractArray, corrected::Bool, mean, ::Colon) = + sqrt.(var(A; corrected=corrected, mean=mean)) + +_std(A::AbstractArray{<:AbstractFloat}, corrected::Bool, mean, dims) = + sqrt!(var(A; corrected=corrected, mean=mean, dims=dims)) + +_std(A::AbstractArray{<:AbstractFloat}, corrected::Bool, mean, ::Colon) = + sqrt.(var(A; corrected=corrected, mean=mean)) + +std(iterable; corrected::Bool=true, mean=nothing) = + sqrt(var(iterable, corrected=corrected, mean=mean)) + +""" + stdm(v, m; corrected::Bool=true) + +Compute the sample standard deviation of a vector `v` +with known mean `m`. If `corrected` is `true`, +then the sum is scaled with `n-1`, whereas the sum is +scaled with `n` if `corrected` is `false` where `n = length(x)`. + +!!! note + Julia does not ignore `NaN` values in the computation. Use the [`missing`](@ref) type + to represent missing values, and the [`skipmissing`](@ref) function to omit them. +""" +stdm(iterable, m; corrected::Bool=true) = + std(iterable, corrected=corrected, mean=m) + + +###### covariance ###### + +# auxiliary functions + +_conj(x::AbstractArray{<:Real}) = x +_conj(x::AbstractArray) = conj(x) + +_getnobs(x::AbstractVector, vardim::Int) = Base._length(x) +_getnobs(x::AbstractMatrix, vardim::Int) = size(x, vardim) + +function _getnobs(x::AbstractVecOrMat, y::AbstractVecOrMat, vardim::Int) + n = _getnobs(x, vardim) + _getnobs(y, vardim) == n || throw(DimensionMismatch("dimensions of x and y mismatch")) + return n +end + +_vmean(x::AbstractVector, vardim::Int) = mean(x) +_vmean(x::AbstractMatrix, vardim::Int) = mean(x, dims=vardim) + +# core functions + +unscaled_covzm(x::AbstractVector{<:Number}) = sum(abs2, x) +unscaled_covzm(x::AbstractVector) = sum(t -> t*t', x) +unscaled_covzm(x::AbstractMatrix, vardim::Int) = (vardim == 1 ? _conj(x'x) : x * x') + +unscaled_covzm(x::AbstractVector, y::AbstractVector) = sum(conj(y[i])*x[i] for i in eachindex(y, x)) +unscaled_covzm(x::AbstractVector, y::AbstractMatrix, vardim::Int) = + (vardim == 1 ? *(transpose(x), _conj(y)) : *(transpose(x), transpose(_conj(y)))) +unscaled_covzm(x::AbstractMatrix, y::AbstractVector, vardim::Int) = + (c = vardim == 1 ? *(transpose(x), _conj(y)) : x * _conj(y); reshape(c, length(c), 1)) +unscaled_covzm(x::AbstractMatrix, y::AbstractMatrix, vardim::Int) = + (vardim == 1 ? *(transpose(x), _conj(y)) : *(x, adjoint(y))) + +# covzm (with centered data) + +covzm(x::AbstractVector; corrected::Bool=true) = unscaled_covzm(x) / (Base._length(x) - Int(corrected)) +function covzm(x::AbstractMatrix, vardim::Int=1; corrected::Bool=true) + C = unscaled_covzm(x, vardim) + T = promote_type(typeof(first(C) / 1), eltype(C)) + A = convert(AbstractMatrix{T}, C) + b = 1//(size(x, vardim) - corrected) + A .= A .* b + return A +end +covzm(x::AbstractVector, y::AbstractVector; corrected::Bool=true) = + unscaled_covzm(x, y) / (Base._length(x) - Int(corrected)) +function covzm(x::AbstractVecOrMat, y::AbstractVecOrMat, vardim::Int=1; corrected::Bool=true) + C = unscaled_covzm(x, y, vardim) + T = promote_type(typeof(first(C) / 1), eltype(C)) + A = convert(AbstractArray{T}, C) + b = 1//(_getnobs(x, y, vardim) - corrected) + A .= A .* b + return A +end + +# covm (with provided mean) +## Use map(t -> t - xmean, x) instead of x .- xmean to allow for Vector{Vector} +## which can't be handled by broadcast +covm(x::AbstractVector, xmean; corrected::Bool=true) = + covzm(map(t -> t - xmean, x); corrected=corrected) +covm(x::AbstractMatrix, xmean, vardim::Int=1; corrected::Bool=true) = + covzm(x .- xmean, vardim; corrected=corrected) +covm(x::AbstractVector, xmean, y::AbstractVector, ymean; corrected::Bool=true) = + covzm(map(t -> t - xmean, x), map(t -> t - ymean, y); corrected=corrected) +covm(x::AbstractVecOrMat, xmean, y::AbstractVecOrMat, ymean, vardim::Int=1; corrected::Bool=true) = + covzm(x .- xmean, y .- ymean, vardim; corrected=corrected) + +# cov (API) +""" + cov(x::AbstractVector; corrected::Bool=true) + +Compute the variance of the vector `x`. If `corrected` is `true` (the default) then the sum +is scaled with `n-1`, whereas the sum is scaled with `n` if `corrected` is `false` where `n = length(x)`. +""" +cov(x::AbstractVector; corrected::Bool=true) = covm(x, mean(x); corrected=corrected) + +""" + cov(X::AbstractMatrix; dims::Int=1, corrected::Bool=true) + +Compute the covariance matrix of the matrix `X` along the dimension `dims`. If `corrected` +is `true` (the default) then the sum is scaled with `n-1`, whereas the sum is scaled with `n` +if `corrected` is `false` where `n = size(X, dims)`. +""" +cov(X::AbstractMatrix; dims::Int=1, corrected::Bool=true) = + covm(X, _vmean(X, dims), dims; corrected=corrected) + +""" + cov(x::AbstractVector, y::AbstractVector; corrected::Bool=true) + +Compute the covariance between the vectors `x` and `y`. If `corrected` is `true` (the +default), computes ``\\frac{1}{n-1}\\sum_{i=1}^n (x_i-\\bar x) (y_i-\\bar y)^*`` where +``*`` denotes the complex conjugate and `n = length(x) = length(y)`. If `corrected` is +`false`, computes ``\\frac{1}{n}\\sum_{i=1}^n (x_i-\\bar x) (y_i-\\bar y)^*``. +""" +cov(x::AbstractVector, y::AbstractVector; corrected::Bool=true) = + covm(x, mean(x), y, mean(y); corrected=corrected) + +""" + cov(X::AbstractVecOrMat, Y::AbstractVecOrMat; dims::Int=1, corrected::Bool=true) + +Compute the covariance between the vectors or matrices `X` and `Y` along the dimension +`dims`. If `corrected` is `true` (the default) then the sum is scaled with `n-1`, whereas +the sum is scaled with `n` if `corrected` is `false` where `n = size(X, dims) = size(Y, dims)`. +""" +cov(X::AbstractVecOrMat, Y::AbstractVecOrMat; dims::Int=1, corrected::Bool=true) = + covm(X, _vmean(X, dims), Y, _vmean(Y, dims), dims; corrected=corrected) + +##### correlation ##### + +""" + clampcor(x) + +Clamp a real correlation to between -1 and 1, leaving complex correlations unchanged +""" +clampcor(x::Real) = clamp(x, -1, 1) +clampcor(x) = x + +# cov2cor! + +function cov2cor!(C::AbstractMatrix{T}, xsd::AbstractArray) where T + nx = length(xsd) + size(C) == (nx, nx) || throw(DimensionMismatch("inconsistent dimensions")) + for j = 1:nx + for i = 1:j-1 + C[i,j] = adjoint(C[j,i]) + end + C[j,j] = oneunit(T) + for i = j+1:nx + C[i,j] = clampcor(C[i,j] / (xsd[i] * xsd[j])) + end + end + return C +end +function cov2cor!(C::AbstractMatrix, xsd, ysd::AbstractArray) + nx, ny = size(C) + length(ysd) == ny || throw(DimensionMismatch("inconsistent dimensions")) + for (j, y) in enumerate(ysd) # fixme (iter): here and in all `cov2cor!` we assume that `C` is efficiently indexed by integers + for i in 1:nx + C[i,j] = clampcor(C[i, j] / (xsd * y)) + end + end + return C +end +function cov2cor!(C::AbstractMatrix, xsd::AbstractArray, ysd) + nx, ny = size(C) + length(xsd) == nx || throw(DimensionMismatch("inconsistent dimensions")) + for j in 1:ny + for (i, x) in enumerate(xsd) + C[i,j] = clampcor(C[i,j] / (x * ysd)) + end + end + return C +end +function cov2cor!(C::AbstractMatrix, xsd::AbstractArray, ysd::AbstractArray) + nx, ny = size(C) + (length(xsd) == nx && length(ysd) == ny) || + throw(DimensionMismatch("inconsistent dimensions")) + for (i, x) in enumerate(xsd) + for (j, y) in enumerate(ysd) + C[i,j] = clampcor(C[i,j] / (x * y)) + end + end + return C +end + +# corzm (non-exported, with centered data) + +corzm(x::AbstractVector{T}) where {T} = one(real(T)) +function corzm(x::AbstractMatrix, vardim::Int=1) + c = unscaled_covzm(x, vardim) + return cov2cor!(c, collect(sqrt(c[i,i]) for i in 1:min(size(c)...))) +end +corzm(x::AbstractVector, y::AbstractMatrix, vardim::Int=1) = + cov2cor!(unscaled_covzm(x, y, vardim), sqrt(sum(abs2, x)), sqrt!(sum(abs2, y, dims=vardim))) +corzm(x::AbstractMatrix, y::AbstractVector, vardim::Int=1) = + cov2cor!(unscaled_covzm(x, y, vardim), sqrt!(sum(abs2, x, dims=vardim)), sqrt(sum(abs2, y))) +corzm(x::AbstractMatrix, y::AbstractMatrix, vardim::Int=1) = + cov2cor!(unscaled_covzm(x, y, vardim), sqrt!(sum(abs2, x, dims=vardim)), sqrt!(sum(abs2, y, dims=vardim))) + +# corm + +corm(x::AbstractVector{T}, xmean) where {T} = one(real(T)) +corm(x::AbstractMatrix, xmean, vardim::Int=1) = corzm(x .- xmean, vardim) +function corm(x::AbstractVector, mx, y::AbstractVector, my) + n = length(x) + length(y) == n || throw(DimensionMismatch("inconsistent lengths")) + n > 0 || throw(ArgumentError("correlation only defined for non-empty vectors")) + + @inbounds begin + # Initialize the accumulators + xx = zero(sqrt(abs2(x[1]))) + yy = zero(sqrt(abs2(y[1]))) + xy = zero(x[1] * y[1]') + + @simd for i in eachindex(x, y) + xi = x[i] - mx + yi = y[i] - my + xx += abs2(xi) + yy += abs2(yi) + xy += xi * yi' + end + end + return clampcor(xy / max(xx, yy) / sqrt(min(xx, yy) / max(xx, yy))) +end + +corm(x::AbstractVecOrMat, xmean, y::AbstractVecOrMat, ymean, vardim::Int=1) = + corzm(x .- xmean, y .- ymean, vardim) + +# cor +""" + cor(x::AbstractVector) + +Return the number one. +""" +cor(x::AbstractVector) = one(real(eltype(x))) + +""" + cor(X::AbstractMatrix; dims::Int=1) + +Compute the Pearson correlation matrix of the matrix `X` along the dimension `dims`. +""" +cor(X::AbstractMatrix; dims::Int=1) = corm(X, _vmean(X, dims), dims) + +""" + cor(x::AbstractVector, y::AbstractVector) + +Compute the Pearson correlation between the vectors `x` and `y`. +""" +cor(x::AbstractVector, y::AbstractVector) = corm(x, mean(x), y, mean(y)) + +""" + cor(X::AbstractVecOrMat, Y::AbstractVecOrMat; dims=1) + +Compute the Pearson correlation between the vectors or matrices `X` and `Y` along the dimension `dims`. +""" +cor(x::AbstractVecOrMat, y::AbstractVecOrMat; dims::Int=1) = + corm(x, _vmean(x, dims), y, _vmean(y, dims), dims) + +##### median & quantiles ##### + +""" + middle(x) + +Compute the middle of a scalar value, which is equivalent to `x` itself, but of the type of `middle(x, x)` for consistency. +""" +middle(x::Union{Bool,Int8,Int16,Int32,Int64,Int128,UInt8,UInt16,UInt32,UInt64,UInt128}) = Float64(x) +# Specialized functions for real types allow for improved performance +middle(x::AbstractFloat) = x +middle(x::Real) = (x + zero(x)) / 1 + +""" + middle(x, y) + +Compute the middle of two reals `x` and `y`, which is +equivalent in both value and type to computing their mean (`(x + y) / 2`). +""" +middle(x::Real, y::Real) = x/2 + y/2 + +""" + middle(range) + +Compute the middle of a range, which consists of computing the mean of its extrema. +Since a range is sorted, the mean is performed with the first and last element. + +```jldoctest +julia> middle(1:10) +5.5 +``` +""" +middle(a::AbstractRange) = middle(a[1], a[end]) + +""" + middle(a) + +Compute the middle of an array `a`, which consists of finding its +extrema and then computing their mean. + +```jldoctest +julia> a = [1,2,3.6,10.9] +4-element Array{Float64,1}: + 1.0 + 2.0 + 3.6 + 10.9 + +julia> middle(a) +5.95 +``` +""" +middle(a::AbstractArray) = ((v1, v2) = extrema(a); middle(v1, v2)) + +""" + median!(v) + +Like [`median`](@ref), but may overwrite the input vector. +""" +function median!(v::AbstractVector) + isempty(v) && throw(ArgumentError("median of an empty array is undefined, $(repr(v))")) + if eltype(v)<:AbstractFloat + @inbounds for x in v + isnan(x) && return x + end + end + inds = axes(v, 1) + n = Base._length(inds) + mid = div(first(inds)+last(inds),2) + if isodd(n) + return middle(partialsort!(v,mid)) + else + m = partialsort!(v, mid:mid+1) + return middle(m[1], m[2]) + end +end +median!(v::AbstractArray) = median!(vec(v)) + +""" + median(v; dims) + +Compute the median of an entire array `v`, or, optionally, +along the given dimensions. For an even number of +elements no exact median element exists, so the result is +equivalent to calculating mean of two median elements. + +!!! note + Julia does not ignore `NaN` values in the computation. Use the [`missing`](@ref) type + to represent missing values, and the [`skipmissing`](@ref) function to omit them. +""" +median(v::AbstractArray; dims=:) = _median(v, dims) + +_median(v::AbstractArray, dims) = mapslices(median!, v, dims = dims) + +_median(v::AbstractArray{T}, ::Colon) where {T} = median!(copyto!(Array{T,1}(undef, Base._length(v)), v)) + +# for now, use the R/S definition of quantile; may want variants later +# see ?quantile in R -- this is type 7 +""" + quantile!([q, ] v, p; sorted=false) + +Compute the quantile(s) of a vector `v` at the probability or probabilities `p`, which +can be given as a single value, a vector, or a tuple. If `p` is a vector, an optional +output array `q` may also be specified. (If not provided, a new output array is created.) +The keyword argument `sorted` indicates whether `v` can be assumed to be sorted; if +`false` (the default), then the elements of `v` may be partially sorted. + +The elements of `p` should be on the interval [0,1], and `v` should not have any `NaN` +values. + +Quantiles are computed via linear interpolation between the points `((k-1)/(n-1), v[k])`, +for `k = 1:n` where `n = length(v)`. This corresponds to Definition 7 of Hyndman and Fan +(1996), and is the same as the R default. + +!!! note + Julia does not ignore `NaN` values in the computation: `quantile!` will + throw an `ArgumentError` in the presence of `NaN` values in the data array. + Use the [`missing`](@ref) type to represent missing values, and the + [`skipmissing`](@ref) function to omit them. + +* Hyndman, R.J and Fan, Y. (1996) "Sample Quantiles in Statistical Packages", + *The American Statistician*, Vol. 50, No. 4, pp. 361-365 +""" +function quantile!(q::AbstractArray, v::AbstractVector, p::AbstractArray; + sorted::Bool=false) + if size(p) != size(q) + throw(DimensionMismatch("size of p, $(size(p)), must equal size of q, $(size(q))")) + end + isempty(q) && return q + + minp, maxp = extrema(p) + _quantilesort!(v, sorted, minp, maxp) + + for (i, j) in zip(eachindex(p), eachindex(q)) + @inbounds q[j] = _quantile(v,p[i]) + end + return q +end + +quantile!(v::AbstractVector, p::AbstractArray; sorted::Bool=false) = + quantile!(similar(p,float(eltype(v))), v, p; sorted=sorted) + +quantile!(v::AbstractVector, p::Real; sorted::Bool=false) = + _quantile(_quantilesort!(v, sorted, p, p), p) + +function quantile!(v::AbstractVector, p::Tuple{Vararg{Real}}; sorted::Bool=false) + isempty(p) && return () + minp, maxp = extrema(p) + _quantilesort!(v, sorted, minp, maxp) + return map(x->_quantile(v, x), p) +end + +# Function to perform partial sort of v for quantiles in given range +function _quantilesort!(v::AbstractArray, sorted::Bool, minp::Real, maxp::Real) + isempty(v) && throw(ArgumentError("empty data vector")) + + if !sorted + lv = length(v) + lo = floor(Int,1+minp*(lv-1)) + hi = ceil(Int,1+maxp*(lv-1)) + + # only need to perform partial sort + sort!(v, 1, lv, Base.Sort.PartialQuickSort(lo:hi), Base.Sort.Forward) + end + isnan(v[end]) && throw(ArgumentError("quantiles are undefined in presence of NaNs")) + return v +end + +# Core quantile lookup function: assumes `v` sorted +@inline function _quantile(v::AbstractVector, p::Real) + 0 <= p <= 1 || throw(ArgumentError("input probability out of [0,1] range")) + + lv = length(v) + f0 = (lv - 1)*p # 0-based interpolated index + t0 = trunc(f0) + h = f0 - t0 + i = trunc(Int,t0) + 1 + + T = promote_type(eltype(v), typeof(v[1]*h)) + + if h == 0 + return convert(T, v[i]) + else + a = v[i] + b = v[i+1] + if isfinite(a) && isfinite(b) + return convert(T, a + h*(b-a)) + else + return convert(T, (1-h)*a + h*b) + end + end +end + + +""" + quantile(v, p; sorted=false) + +Compute the quantile(s) of a vector `v` at a specified probability or vector or tuple of +probabilities `p`. The keyword argument `sorted` indicates whether `v` can be assumed to +be sorted. + +The `p` should be on the interval [0,1], and `v` should not have any `NaN` values. + +Quantiles are computed via linear interpolation between the points `((k-1)/(n-1), v[k])`, +for `k = 1:n` where `n = length(v)`. This corresponds to Definition 7 of Hyndman and Fan +(1996), and is the same as the R default. + +!!! note + Julia does not ignore `NaN` values in the computation: `quantile` will + throw an `ArgumentError` in the presence of `NaN` values in the data array. + Use the [`missing`](@ref) type to represent missing values, and the + [`skipmissing`](@ref) function to omit them. + +- Hyndman, R.J and Fan, Y. (1996) "Sample Quantiles in Statistical Packages", + *The American Statistician*, Vol. 50, No. 4, pp. 361-365 +""" +quantile(v::AbstractVector, p; sorted::Bool=false) = + quantile!(sorted ? v : Base.copymutable(v), p; sorted=sorted) + + +##### SparseArrays optimizations ##### + +function cov(X::SparseMatrixCSC; dims::Int=1, corrected::Bool=true) + vardim = dims + a, b = size(X) + n, p = vardim == 1 ? (a, b) : (b, a) + + # The covariance can be decomposed into two terms + # 1/(n - 1) ∑ (x_i - x̄)*(x_i - x̄)' = 1/(n - 1) (∑ x_i*x_i' - n*x̄*x̄') + # which can be evaluated via a sparse matrix-matrix product + + # Compute ∑ x_i*x_i' = X'X using sparse matrix-matrix product + out = Matrix(unscaled_covzm(X, vardim)) + + # Compute x̄ + x̄ᵀ = mean(X, dims=vardim) + + # Subtract n*x̄*x̄' from X'X + @inbounds for j in 1:p, i in 1:p + out[i,j] -= x̄ᵀ[i] * x̄ᵀ[j]' * n + end + + # scale with the sample size n or the corrected sample size n - 1 + return rmul!(out, inv(n - corrected)) +end + +# This is the function that does the reduction underlying var/std +function centralize_sumabs2!(R::AbstractArray{S}, A::SparseMatrixCSC{Tv,Ti}, means::AbstractArray) where {S,Tv,Ti} + lsiz = Base.check_reducedims(R,A) + size(means) == size(R) || error("size of means must match size of R") + isempty(R) || fill!(R, zero(S)) + isempty(A) && return R + + colptr = A.colptr + rowval = A.rowval + nzval = A.nzval + m = size(A, 1) + n = size(A, 2) + + if size(R, 1) == size(R, 2) == 1 + # Reduction along both columns and rows + R[1, 1] = centralize_sumabs2(A, means[1]) + elseif size(R, 1) == 1 + # Reduction along rows + @inbounds for col = 1:n + mu = means[col] + r = convert(S, (m-colptr[col+1]+colptr[col])*abs2(mu)) + @simd for j = colptr[col]:colptr[col+1]-1 + r += abs2(nzval[j] - mu) + end + R[1, col] = r + end + elseif size(R, 2) == 1 + # Reduction along columns + rownz = fill(convert(Ti, n), m) + @inbounds for col = 1:n + @simd for j = colptr[col]:colptr[col+1]-1 + row = rowval[j] + R[row, 1] += abs2(nzval[j] - means[row]) + rownz[row] -= 1 + end + end + for i = 1:m + R[i, 1] += rownz[i]*abs2(means[i]) + end + else + # Reduction along a dimension > 2 + @inbounds for col = 1:n + lastrow = 0 + @simd for j = colptr[col]:colptr[col+1]-1 + row = rowval[j] + for i = lastrow+1:row-1 + R[i, col] = abs2(means[i, col]) + end + R[row, col] = abs2(nzval[j] - means[row, col]) + lastrow = row + end + for i = lastrow+1:m + R[i, col] = abs2(means[i, col]) + end + end + end + return R +end + + +##### deprecations ##### + +# PR #21709 +@deprecate cov(x::AbstractVector, corrected::Bool) cov(x, corrected=corrected) +@deprecate cov(x::AbstractMatrix, vardim::Int, corrected::Bool) cov(x, dims=vardim, corrected=corrected) +@deprecate cov(X::AbstractVector, Y::AbstractVector, corrected::Bool) cov(X, Y, corrected=corrected) +@deprecate cov(X::AbstractVecOrMat, Y::AbstractVecOrMat, vardim::Int, corrected::Bool) cov(X, Y, dims=vardim, corrected=corrected) + +# issue #25501 +@deprecate mean(A::AbstractArray, dims) mean(A, dims=dims) +@deprecate varm(A::AbstractArray, m::AbstractArray, dims; kwargs...) varm(A, m; kwargs..., dims=dims) +@deprecate var(A::AbstractArray, dims; kwargs...) var(A; kwargs..., dims=dims) +@deprecate std(A::AbstractArray, dims; kwargs...) std(A; kwargs..., dims=dims) +@deprecate cov(X::AbstractMatrix, dim::Int; kwargs...) cov(X; kwargs..., dims=dim) +@deprecate cov(x::AbstractVecOrMat, y::AbstractVecOrMat, dim::Int; kwargs...) cov(x, y; kwargs..., dims=dim) +@deprecate cor(X::AbstractMatrix, dim::Int) cor(X, dims=dim) +@deprecate cor(x::AbstractVecOrMat, y::AbstractVecOrMat, dim::Int) cor(x, y, dims=dim) +@deprecate median(A::AbstractArray, dims; kwargs...) median(A; kwargs..., dims=dims) + +end # module diff --git a/stdlib/Statistics/test/runtests.jl b/stdlib/Statistics/test/runtests.jl new file mode 100644 index 0000000000000..ad0e5076bbe0d --- /dev/null +++ b/stdlib/Statistics/test/runtests.jl @@ -0,0 +1,602 @@ +# This file is a part of Julia. License is MIT: https://julialang.org/license + +using Statistics, Test, Random, LinearAlgebra, SparseArrays + +@testset "middle" begin + @test middle(3) === 3.0 + @test middle(2, 3) === 2.5 + let x = ((realmax(1.0)/4)*3) + @test middle(x, x) === x + end + @test middle(1:8) === 4.5 + @test middle([1:8;]) === 4.5 + + # ensure type-correctness + for T in [Bool,Int8,Int16,Int32,Int64,Int128,UInt8,UInt16,UInt32,UInt64,UInt128,Float16,Float32,Float64] + @test middle(one(T)) === middle(one(T), one(T)) + end +end + +@testset "median" begin + @test median([1.]) === 1. + @test median([1.,3]) === 2. + @test median([1.,3,2]) === 2. + + @test median([1,3,2]) === 2.0 + @test median([1,3,2,4]) === 2.5 + + @test median([0.0,Inf]) == Inf + @test median([0.0,-Inf]) == -Inf + @test median([0.,Inf,-Inf]) == 0.0 + @test median([1.,-1.,Inf,-Inf]) == 0.0 + @test isnan(median([-Inf,Inf])) + + X = [2 3 1 -1; 7 4 5 -4] + @test all(median(X, dims=2) .== [1.5, 4.5]) + @test all(median(X, dims=1) .== [4.5 3.5 3.0 -2.5]) + @test X == [2 3 1 -1; 7 4 5 -4] # issue #17153 + + @test_throws ArgumentError median([]) + @test isnan(median([NaN])) + @test isnan(median([0.0,NaN])) + @test isnan(median([NaN,0.0])) + @test isequal(median([NaN 0.0; 1.2 4.5], dims=2), reshape([NaN; 2.85], 2, 1)) + + @test median!([1 2 3 4]) == 2.5 + @test median!([1 2; 3 4]) == 2.5 + + @test invoke(median, Tuple{AbstractVector}, 1:10) == median(1:10) == 5.5 +end + +@testset "mean" begin + @test_throws ArgumentError mean(()) + @test mean((1,2,3)) === 2. + @test mean([0]) === 0. + @test mean([1.]) === 1. + @test mean([1.,3]) == 2. + @test mean([1,2,3]) == 2. + @test mean([0 1 2; 4 5 6], dims=1) == [2. 3. 4.] + @test mean([1 2 3; 4 5 6], dims=1) == [2.5 3.5 4.5] + @test mean(i->i+1, 0:2) === 2. + @test mean(isodd, [3]) === 1. + @test mean(x->3x, (1,1)) === 3. + + @test isnan(mean([NaN])) + @test isnan(mean([0.0,NaN])) + @test isnan(mean([NaN,0.0])) + + @test isnan(mean([0.,Inf,-Inf])) + @test isnan(mean([1.,-1.,Inf,-Inf])) + @test isnan(mean([-Inf,Inf])) + @test isequal(mean([NaN 0.0; 1.2 4.5], dims=2), reshape([NaN; 2.85], 2, 1)) + + # Check that small types are accumulated using wider type + for T in (Int8, UInt8) + x = [typemax(T) typemax(T)] + g = (v for v in x) + @test mean(x) == mean(g) == typemax(T) + @test mean(identity, x) == mean(identity, g) == typemax(T) + @test mean(x, dims=2) == [typemax(T)]' + end +end + +@testset "mean/median for ranges" begin + for f in (mean, median) + for n = 2:5 + @test f(2:n) == f([2:n;]) + @test f(2:0.1:n) ≈ f([2:0.1:n;]) + end + end +end + +@testset "var & std" begin + # edge case: empty vector + # iterable; this has to throw for type stability + @test_throws ArgumentError var(()) + @test_throws ArgumentError var((); corrected=false) + @test_throws ArgumentError var((); mean=2) + @test_throws ArgumentError var((); mean=2, corrected=false) + # reduction + @test isnan(var(Int[])) + @test isnan(var(Int[]; corrected=false)) + @test isnan(var(Int[]; mean=2)) + @test isnan(var(Int[]; mean=2, corrected=false)) + # reduction across dimensions + @test isequal(var(Int[], dims=1), [NaN]) + @test isequal(var(Int[], dims=1; corrected=false), [NaN]) + @test isequal(var(Int[], dims=1; mean=[2]), [NaN]) + @test isequal(var(Int[], dims=1; mean=[2], corrected=false), [NaN]) + + # edge case: one-element vector + # iterable + @test isnan(@inferred(var((1,)))) + @test var((1,); corrected=false) === 0.0 + @test var((1,); mean=2) === Inf + @test var((1,); mean=2, corrected=false) === 1.0 + # reduction + @test isnan(@inferred(var([1]))) + @test var([1]; corrected=false) === 0.0 + @test var([1]; mean=2) === Inf + @test var([1]; mean=2, corrected=false) === 1.0 + # reduction across dimensions + @test isequal(@inferred(var([1], dims=1)), [NaN]) + @test var([1], dims=1; corrected=false) ≈ [0.0] + @test var([1], dims=1; mean=[2]) ≈ [Inf] + @test var([1], dims=1; mean=[2], corrected=false) ≈ [1.0] + + @test var(1:8) == 6. + @test varm(1:8,1) == varm(Vector(1:8),1) + @test isnan(varm(1:1,1)) + @test isnan(var(1:1)) + @test isnan(var(1:-1)) + + @test @inferred(var(1.0:8.0)) == 6. + @test varm(1.0:8.0,1.0) == varm(Vector(1.0:8.0),1) + @test isnan(varm(1.0:1.0,1.0)) + @test isnan(var(1.0:1.0)) + @test isnan(var(1.0:-1.0)) + + @test @inferred(var(1.0f0:8.0f0)) === 6.f0 + @test varm(1.0f0:8.0f0,1.0f0) == varm(Vector(1.0f0:8.0f0),1) + @test isnan(varm(1.0f0:1.0f0,1.0f0)) + @test isnan(var(1.0f0:1.0f0)) + @test isnan(var(1.0f0:-1.0f0)) + + @test varm([1,2,3], 2) ≈ 1. + @test var([1,2,3]) ≈ 1. + @test var([1,2,3]; corrected=false) ≈ 2.0/3 + @test var([1,2,3]; mean=0) ≈ 7. + @test var([1,2,3]; mean=0, corrected=false) ≈ 14.0/3 + + @test varm((1,2,3), 2) ≈ 1. + @test var((1,2,3)) ≈ 1. + @test var((1,2,3); corrected=false) ≈ 2.0/3 + @test var((1,2,3); mean=0) ≈ 7. + @test var((1,2,3); mean=0, corrected=false) ≈ 14.0/3 + @test_throws ArgumentError var((1,2,3); mean=()) + + @test var([1 2 3 4 5; 6 7 8 9 10], dims=2) ≈ [2.5 2.5]' + @test var([1 2 3 4 5; 6 7 8 9 10], dims=2; corrected=false) ≈ [2.0 2.0]' + + @test stdm([1,2,3], 2) ≈ 1. + @test std([1,2,3]) ≈ 1. + @test std([1,2,3]; corrected=false) ≈ sqrt(2.0/3) + @test std([1,2,3]; mean=0) ≈ sqrt(7.0) + @test std([1,2,3]; mean=0, corrected=false) ≈ sqrt(14.0/3) + + @test stdm([1.0,2,3], 2) ≈ 1. + @test std([1.0,2,3]) ≈ 1. + @test std([1.0,2,3]; corrected=false) ≈ sqrt(2.0/3) + @test std([1.0,2,3]; mean=0) ≈ sqrt(7.0) + @test std([1.0,2,3]; mean=0, corrected=false) ≈ sqrt(14.0/3) + + @test std([1.0,2,3]; dims=1)[] ≈ 1. + @test std([1.0,2,3]; dims=1, corrected=false)[] ≈ sqrt(2.0/3) + @test std([1.0,2,3]; dims=1, mean=[0])[] ≈ sqrt(7.0) + @test std([1.0,2,3]; dims=1, mean=[0], corrected=false)[] ≈ sqrt(14.0/3) + + @test stdm((1,2,3), 2) ≈ 1. + @test std((1,2,3)) ≈ 1. + @test std((1,2,3); corrected=false) ≈ sqrt(2.0/3) + @test std((1,2,3); mean=0) ≈ sqrt(7.0) + @test std((1,2,3); mean=0, corrected=false) ≈ sqrt(14.0/3) + + @test std([1 2 3 4 5; 6 7 8 9 10], dims=2) ≈ sqrt.([2.5 2.5]') + @test std([1 2 3 4 5; 6 7 8 9 10], dims=2; corrected=false) ≈ sqrt.([2.0 2.0]') + + let A = ComplexF64[exp(i*im) for i in 1:10^4] + @test varm(A, 0.) ≈ sum(map(abs2, A)) / (length(A) - 1) + @test varm(A, mean(A)) ≈ var(A) + end + + @test var([1//1, 2//1]) isa Rational{Int} + @test var([1//1, 2//1], dims=1) isa Vector{Rational{Int}} + + @test std([1//1, 2//1]) isa Float64 + @test std([1//1, 2//1], dims=1) isa Vector{Float64} + + @testset "var: empty cases" begin + A = Matrix{Int}(undef, 0,1) + @test var(A) === NaN + + @test isequal(var(A, dims=1), fill(NaN, 1, 1)) + @test isequal(var(A, dims=2), fill(NaN, 0, 1)) + @test isequal(var(A, dims=(1, 2)), fill(NaN, 1, 1)) + @test isequal(var(A, dims=3), fill(NaN, 0, 1)) + end + + # issue #6672 + @test std(AbstractFloat[1,2,3], dims=1) == [1.0] +end + +function safe_cov(x, y, zm::Bool, cr::Bool) + n = length(x) + if !zm + x = x .- mean(x) + y = y .- mean(y) + end + dot(vec(x), vec(y)) / (n - Int(cr)) +end +X = [1.0 5.0; + 2.0 4.0; + 3.0 6.0; + 4.0 2.0; + 5.0 1.0] +Y = [6.0 2.0; + 1.0 7.0; + 5.0 8.0; + 3.0 4.0; + 2.0 3.0] + +@testset "covariance" begin + for vd in [1, 2], zm in [true, false], cr in [true, false] + # println("vd = $vd: zm = $zm, cr = $cr") + if vd == 1 + k = size(X, 2) + Cxx = zeros(k, k) + Cxy = zeros(k, k) + for i = 1:k, j = 1:k + Cxx[i,j] = safe_cov(X[:,i], X[:,j], zm, cr) + Cxy[i,j] = safe_cov(X[:,i], Y[:,j], zm, cr) + end + x1 = vec(X[:,1]) + y1 = vec(Y[:,1]) + else + k = size(X, 1) + Cxx = zeros(k, k) + Cxy = zeros(k, k) + for i = 1:k, j = 1:k + Cxx[i,j] = safe_cov(X[i,:], X[j,:], zm, cr) + Cxy[i,j] = safe_cov(X[i,:], Y[j,:], zm, cr) + end + x1 = vec(X[1,:]) + y1 = vec(Y[1,:]) + end + + c = zm ? Statistics.covm(x1, 0, corrected=cr) : + cov(x1, corrected=cr) + @test isa(c, Float64) + @test c ≈ Cxx[1,1] + @inferred cov(x1, corrected=cr) + + @test cov(X) == Statistics.covm(X, mean(X, dims=1)) + C = zm ? Statistics.covm(X, 0, vd, corrected=cr) : + cov(X, dims=vd, corrected=cr) + @test size(C) == (k, k) + @test C ≈ Cxx + @inferred cov(X, dims=vd, corrected=cr) + + @test cov(x1, y1) == Statistics.covm(x1, mean(x1), y1, mean(y1)) + c = zm ? Statistics.covm(x1, 0, y1, 0, corrected=cr) : + cov(x1, y1, corrected=cr) + @test isa(c, Float64) + @test c ≈ Cxy[1,1] + @inferred cov(x1, y1, corrected=cr) + + if vd == 1 + @test cov(x1, Y) == Statistics.covm(x1, mean(x1), Y, mean(Y, dims=1)) + end + C = zm ? Statistics.covm(x1, 0, Y, 0, vd, corrected=cr) : + cov(x1, Y, dims=vd, corrected=cr) + @test size(C) == (1, k) + @test vec(C) ≈ Cxy[1,:] + @inferred cov(x1, Y, dims=vd, corrected=cr) + + if vd == 1 + @test cov(X, y1) == Statistics.covm(X, mean(X, dims=1), y1, mean(y1)) + end + C = zm ? Statistics.covm(X, 0, y1, 0, vd, corrected=cr) : + cov(X, y1, dims=vd, corrected=cr) + @test size(C) == (k, 1) + @test vec(C) ≈ Cxy[:,1] + @inferred cov(X, y1, dims=vd, corrected=cr) + + @test cov(X, Y) == Statistics.covm(X, mean(X, dims=1), Y, mean(Y, dims=1)) + C = zm ? Statistics.covm(X, 0, Y, 0, vd, corrected=cr) : + cov(X, Y, dims=vd, corrected=cr) + @test size(C) == (k, k) + @test C ≈ Cxy + @inferred cov(X, Y, dims=vd, corrected=cr) + end +end + +function safe_cor(x, y, zm::Bool) + if !zm + x = x .- mean(x) + y = y .- mean(y) + end + x = vec(x) + y = vec(y) + dot(x, y) / (sqrt(dot(x, x)) * sqrt(dot(y, y))) +end +@testset "correlation" begin + for vd in [1, 2], zm in [true, false] + # println("vd = $vd: zm = $zm") + if vd == 1 + k = size(X, 2) + Cxx = zeros(k, k) + Cxy = zeros(k, k) + for i = 1:k, j = 1:k + Cxx[i,j] = safe_cor(X[:,i], X[:,j], zm) + Cxy[i,j] = safe_cor(X[:,i], Y[:,j], zm) + end + x1 = vec(X[:,1]) + y1 = vec(Y[:,1]) + else + k = size(X, 1) + Cxx = zeros(k, k) + Cxy = zeros(k, k) + for i = 1:k, j = 1:k + Cxx[i,j] = safe_cor(X[i,:], X[j,:], zm) + Cxy[i,j] = safe_cor(X[i,:], Y[j,:], zm) + end + x1 = vec(X[1,:]) + y1 = vec(Y[1,:]) + end + + c = zm ? Statistics.corm(x1, 0) : cor(x1) + @test isa(c, Float64) + @test c ≈ Cxx[1,1] + @inferred cor(x1) + + @test cor(X) == Statistics.corm(X, mean(X, dims=1)) + C = zm ? Statistics.corm(X, 0, vd) : cor(X, dims=vd) + @test size(C) == (k, k) + @test C ≈ Cxx + @inferred cor(X, dims=vd) + + @test cor(x1, y1) == Statistics.corm(x1, mean(x1), y1, mean(y1)) + c = zm ? Statistics.corm(x1, 0, y1, 0) : cor(x1, y1) + @test isa(c, Float64) + @test c ≈ Cxy[1,1] + @inferred cor(x1, y1) + + if vd == 1 + @test cor(x1, Y) == Statistics.corm(x1, mean(x1), Y, mean(Y, dims=1)) + end + C = zm ? Statistics.corm(x1, 0, Y, 0, vd) : cor(x1, Y, dims=vd) + @test size(C) == (1, k) + @test vec(C) ≈ Cxy[1,:] + @inferred cor(x1, Y, dims=vd) + + if vd == 1 + @test cor(X, y1) == Statistics.corm(X, mean(X, dims=1), y1, mean(y1)) + end + C = zm ? Statistics.corm(X, 0, y1, 0, vd) : cor(X, y1, dims=vd) + @test size(C) == (k, 1) + @test vec(C) ≈ Cxy[:,1] + @inferred cor(X, y1, dims=vd) + + @test cor(X, Y) == Statistics.corm(X, mean(X, dims=1), Y, mean(Y, dims=1)) + C = zm ? Statistics.corm(X, 0, Y, 0, vd) : cor(X, Y, dims=vd) + @test size(C) == (k, k) + @test C ≈ Cxy + @inferred cor(X, Y, dims=vd) + end + + @test cor(repeat(1:17, 1, 17))[2] <= 1.0 + @test cor(1:17, 1:17) <= 1.0 + @test cor(1:17, 18:34) <= 1.0 + let tmp = range(1, stop=85, length=100) + tmp2 = Vector(tmp) + @test cor(tmp, tmp) <= 1.0 + @test cor(tmp, tmp2) <= 1.0 + end +end + +@testset "quantile" begin + @test quantile([1,2,3,4],0.5) == 2.5 + @test quantile([1,2,3,4],[0.5]) == [2.5] + @test quantile([1., 3],[.25,.5,.75])[2] == median([1., 3]) + @test quantile(100.0:-1.0:0.0, 0.0:0.1:1.0) == 0.0:10.0:100.0 + @test quantile(0.0:100.0, 0.0:0.1:1.0, sorted=true) == 0.0:10.0:100.0 + @test quantile(100f0:-1f0:0.0, 0.0:0.1:1.0) == 0f0:10f0:100f0 + @test quantile([Inf,Inf],0.5) == Inf + @test quantile([-Inf,1],0.5) == -Inf + @test quantile([0,1],1e-18) == 1e-18 + @test quantile([1, 2, 3, 4],[]) == [] + @test quantile([1, 2, 3, 4], (0.5,)) == (2.5,) + @test quantile([4, 9, 1, 5, 7, 8, 2, 3, 5, 17, 11], (0.1, 0.2, 0.4, 0.9)) == (2.0, 3.0, 5.0, 11.0) + @test quantile([1, 2, 3, 4], ()) == () +end + +# StatsBase issue 164 +let y = [0.40003674665581906, 0.4085630862624367, 0.41662034698690303, 0.41662034698690303, 0.42189053966652057, 0.42189053966652057, 0.42553514344518345, 0.43985732442991354] + @test issorted(quantile(y, range(0.01, stop=0.99, length=17))) +end + +@testset "variance of complex arrays (#13309)" begin + z = rand(ComplexF64, 10) + @test var(z) ≈ invoke(var, Tuple{Any}, z) ≈ cov(z) ≈ var(z,dims=1)[1] ≈ sum(abs2, z .- mean(z))/9 + @test isa(var(z), Float64) + @test isa(invoke(var, Tuple{Any}, z), Float64) + @test isa(cov(z), Float64) + @test isa(var(z,dims=1), Vector{Float64}) + @test varm(z, 0.0) ≈ invoke(varm, Tuple{Any,Float64}, z, 0.0) ≈ sum(abs2, z)/9 + @test isa(varm(z, 0.0), Float64) + @test isa(invoke(varm, Tuple{Any,Float64}, z, 0.0), Float64) + @test cor(z) === 1.0 + v = varm([1.0+2.0im], 0; corrected = false) + @test v ≈ 5 + @test isa(v, Float64) +end + +@testset "cov and cor of complex arrays (issue #21093)" begin + x = [2.7 - 3.3im, 0.9 + 5.4im, 0.1 + 0.2im, -1.7 - 5.8im, 1.1 + 1.9im] + y = [-1.7 - 1.6im, -0.2 + 6.5im, 0.8 - 10.0im, 9.1 - 3.4im, 2.7 - 5.5im] + @test cov(x, y) ≈ 4.8365 - 12.119im + @test cov(y, x) ≈ 4.8365 + 12.119im + @test cov(x, reshape(y, :, 1)) ≈ reshape([4.8365 - 12.119im], 1, 1) + @test cov(reshape(x, :, 1), y) ≈ reshape([4.8365 - 12.119im], 1, 1) + @test cov(reshape(x, :, 1), reshape(y, :, 1)) ≈ reshape([4.8365 - 12.119im], 1, 1) + @test cov([x y]) ≈ [21.779 4.8365-12.119im; + 4.8365+12.119im 54.548] + @test cor(x, y) ≈ 0.14032104449218274 - 0.35160772008699703im + @test cor(y, x) ≈ 0.14032104449218274 + 0.35160772008699703im + @test cor(x, reshape(y, :, 1)) ≈ reshape([0.14032104449218274 - 0.35160772008699703im], 1, 1) + @test cor(reshape(x, :, 1), y) ≈ reshape([0.14032104449218274 - 0.35160772008699703im], 1, 1) + @test cor(reshape(x, :, 1), reshape(y, :, 1)) ≈ reshape([0.14032104449218274 - 0.35160772008699703im], 1, 1) + @test cor([x y]) ≈ [1.0 0.14032104449218274-0.35160772008699703im + 0.14032104449218274+0.35160772008699703im 1.0] +end + +@testset "Issue #17153 and PR #17154" begin + a = rand(10,10) + b = copy(a) + x = median(a, dims=1) + @test b == a + x = median(a, dims=2) + @test b == a + x = mean(a, dims=1) + @test b == a + x = mean(a, dims=2) + @test b == a + x = var(a, dims=1) + @test b == a + x = var(a, dims=2) + @test b == a + x = std(a, dims=1) + @test b == a + x = std(a, dims=2) + @test b == a +end + +# dimensional correctness +const BASE_TEST_PATH = joinpath(Sys.BINDIR, "..", "share", "julia", "test") +isdefined(Main, :TestHelpers) || @eval Main include(joinpath($(BASE_TEST_PATH), "TestHelpers.jl")) +using .Main.TestHelpers: Furlong + +Statistics.middle(x::Furlong{p}) where {p} = Furlong{p}(middle(x.val)) +Statistics.middle(x::Furlong{p}, y::Furlong{p}) where {p} = Furlong{p}(middle(x.val, y.val)) + +@testset "Unitful elements" begin + r = Furlong(1):Furlong(1):Furlong(2) + a = Vector(r) + @test sum(r) == sum(a) == Furlong(3) + @test cumsum(r) == Furlong.([1,3]) + @test mean(r) == mean(a) == median(a) == median(r) == Furlong(1.5) + @test var(r) == var(a) == Furlong{2}(0.5) + @test std(r) == std(a) == Furlong{1}(sqrt(0.5)) + + # Issue #21786 + A = [Furlong{1}(rand(-5:5)) for i in 1:2, j in 1:2] + @test mean(mean(A, dims=1), dims=2)[1] === mean(A) + @test var(A, dims=1)[1] === var(A[:, 1]) + @test std(A, dims=1)[1] === std(A[:, 1]) +end + +# Issue #22901 +@testset "var and quantile of Any arrays" begin + x = Any[1, 2, 4, 10] + y = Any[1, 2, 4, 10//1] + @test var(x) === 16.25 + @test var(y) === 65//4 + @test std(x) === sqrt(16.25) + @test quantile(x, 0.5) === 3.0 + @test quantile(x, 1//2) === 3//1 +end + +@testset "Promotion in covzm. Issue #8080" begin + A = [1 -1 -1; -1 1 1; -1 1 -1; 1 -1 -1; 1 -1 1] + @test Statistics.covzm(A) - mean(A, dims=1)'*mean(A, dims=1)*size(A, 1)/(size(A, 1) - 1) ≈ cov(A) + A = [1//1 -1 -1; -1 1 1; -1 1 -1; 1 -1 -1; 1 -1 1] + @test (A'A - size(A, 1)*mean(A, dims=1)'*mean(A, dims=1))/4 == cov(A) +end + +@testset "Mean along dimension of empty array" begin + a0 = zeros(0) + a00 = zeros(0, 0) + a01 = zeros(0, 1) + a10 = zeros(1, 0) + @test isequal(mean(a0, dims=1) , fill(NaN, 1)) + @test isequal(mean(a00, dims=(1, 2)), fill(NaN, 1, 1)) + @test isequal(mean(a01, dims=1) , fill(NaN, 1, 1)) + @test isequal(mean(a10, dims=2) , fill(NaN, 1, 1)) +end + +@testset "cov/var/std of Vector{Vector}" begin + x = [[2,4,6],[4,6,8]] + @test var(x) ≈ vec(var([x[1] x[2]], dims=2)) + @test std(x) ≈ vec(std([x[1] x[2]], dims=2)) + @test cov(x) ≈ cov([x[1] x[2]], dims=2) +end + +@testset "var of sparse array" begin + se33 = SparseMatrixCSC{Float64}(I, 3, 3) + sA = sprandn(3, 7, 0.5) + pA = sparse(rand(3, 7)) + + for arr in (se33, sA, pA) + farr = Array(arr) + @test var(arr) ≈ var(farr) + @test var(arr, dims=1) ≈ var(farr, dims=1) + @test var(arr, dims=2) ≈ var(farr, dims=2) + @test var(arr, dims=(1, 2)) ≈ [var(farr)] + @test isequal(var(arr, dims=3), var(farr, dims=3)) + end + + @testset "empty cases" begin + @test var(sparse(Int[])) === NaN + @test isequal(var(spzeros(0, 1), dims=1), var(Matrix{Int}(I, 0, 1), dims=1)) + @test isequal(var(spzeros(0, 1), dims=2), var(Matrix{Int}(I, 0, 1), dims=2)) + @test isequal(var(spzeros(0, 1), dims=(1, 2)), var(Matrix{Int}(I, 0, 1), dims=(1, 2))) + @test isequal(var(spzeros(0, 1), dims=3), var(Matrix{Int}(I, 0, 1), dims=3)) + end +end + +# Faster covariance function for sparse matrices +# Prevents densifying the input matrix when subtracting the mean +# Test against dense implementation +# PR https://github.com/JuliaLang/julia/pull/22735 +# Part of this test needed to be hacked due to the treatment +# of Inf in sparse matrix algebra +# https://github.com/JuliaLang/julia/issues/22921 +# The issue will be resolved in +# https://github.com/JuliaLang/julia/issues/22733 +@testset "optimizing sparse $elty covariance" for elty in (Float64, Complex{Float64}) + n = 10 + p = 5 + np2 = div(n*p, 2) + nzvals, x_sparse = guardsrand(1) do + if elty <: Real + nzvals = randn(np2) + else + nzvals = complex.(randn(np2), randn(np2)) + end + nzvals, sparse(rand(1:n, np2), rand(1:p, np2), nzvals, n, p) + end + x_dense = convert(Matrix{elty}, x_sparse) + @testset "Test with no Infs and NaNs, vardim=$vardim, corrected=$corrected" for vardim in (1, 2), + corrected in (true, false) + @test cov(x_sparse, dims=vardim, corrected=corrected) ≈ + cov(x_dense , dims=vardim, corrected=corrected) + end + + @testset "Test with $x11, vardim=$vardim, corrected=$corrected" for x11 in (NaN, Inf), + vardim in (1, 2), + corrected in (true, false) + x_sparse[1,1] = x11 + x_dense[1 ,1] = x11 + + cov_sparse = cov(x_sparse, dims=vardim, corrected=corrected) + cov_dense = cov(x_dense , dims=vardim, corrected=corrected) + @test cov_sparse[2:end, 2:end] ≈ cov_dense[2:end, 2:end] + @test isfinite.(cov_sparse) == isfinite.(cov_dense) + @test isfinite.(cov_sparse) == isfinite.(cov_dense) + end + + @testset "Test with NaN and Inf, vardim=$vardim, corrected=$corrected" for vardim in (1, 2), + corrected in (true, false) + x_sparse[1,1] = Inf + x_dense[1 ,1] = Inf + x_sparse[2,1] = NaN + x_dense[2 ,1] = NaN + + cov_sparse = cov(x_sparse, dims=vardim, corrected=corrected) + cov_dense = cov(x_dense , dims=vardim, corrected=corrected) + @test cov_sparse[(1 + vardim):end, (1 + vardim):end] ≈ + cov_dense[ (1 + vardim):end, (1 + vardim):end] + @test isfinite.(cov_sparse) == isfinite.(cov_dense) + @test isfinite.(cov_sparse) == isfinite.(cov_dense) + end +end diff --git a/test/choosetests.jl b/test/choosetests.jl index 2bceb7f35e626..5f2a4880f1926 100644 --- a/test/choosetests.jl +++ b/test/choosetests.jl @@ -41,7 +41,7 @@ function choosetests(choices = []) "intfuncs", "simdloop", "vecelement", "rational", "bitarray", "copy", "math", "fastmath", "functional", "iterators", "operators", "path", "ccall", "parse", "loading", "bigint", - "bigfloat", "sorting", "statistics", "spawn", "backtrace", + "bigfloat", "sorting", "spawn", "backtrace", "file", "read", "version", "namedtuple", "mpfr", "broadcast", "complex", "floatapprox", "stdlib", "reflection", "regex", "float16", diff --git a/test/dimensionful.jl b/test/dimensionful.jl index 21c9747806d54..d02313203103d 100644 --- a/test/dimensionful.jl +++ b/test/dimensionful.jl @@ -38,13 +38,12 @@ sylvester(a::Furlong,b::Furlong,c::Furlong) = -c / (a + b) for f in (:isfinite, :isnan, :isreal) @eval Base.$f(x::Furlong) = $f(x.val) end -for f in (:real,:imag,:complex,:middle,:+,:-) +for f in (:real,:imag,:complex,:+,:-) @eval Base.$f(x::Furlong{p}) where {p} = Furlong{p}($f(x.val)) end -import Base: +, -, ==, !=, <, <=, isless, isequal, *, /, //, div, rem, mod, ^, - middle, hypot -for op in (:+, :-, :middle, :hypot) +import Base: +, -, ==, !=, <, <=, isless, isequal, *, /, //, div, rem, mod, ^, hypot +for op in (:+, :-, :hypot) @eval function $op(x::Furlong{p}, y::Furlong{p}) where {p} v = $op(x.val, y.val) Furlong{p}(v) diff --git a/test/error.jl b/test/error.jl index 64932a8a80422..2d9b1065c036a 100644 --- a/test/error.jl +++ b/test/error.jl @@ -7,7 +7,9 @@ ratio(x) = x[2:end]./x[1:end-1] @test all(x->x ≈ 10.0, ratio(collect(ExponentialBackOff(n=10, max_delay=Inf, factor=10, jitter=0.0)))) guardsrand(12345) do - @test (mean(ratio(collect(ExponentialBackOff(n=100, max_delay=Inf, factor=1, jitter=0.1)))) - 1.0) < 1e-4 + x = ratio(collect(ExponentialBackOff(n=100, max_delay=Inf, factor=1, jitter=0.1))) + xm = sum(x) / length(x) + @test (xm - 1.0) < 1e-4 end end @testset "retrying after errors" begin diff --git a/test/offsetarray.jl b/test/offsetarray.jl index 5d73d4be8a4dc..fc88ec7eb49b1 100644 --- a/test/offsetarray.jl +++ b/test/offsetarray.jl @@ -5,6 +5,7 @@ using .Main.TestHelpers.OAs using DelimitedFiles using Random using LinearAlgebra +using Statistics const OAs_name = join(fullname(OAs), ".") @@ -374,6 +375,9 @@ I = findall(!iszero, z) @test mean(x->2x, A_3_3) == 10 @test mean(A_3_3, dims=1) == median(A_3_3, dims=1) == OffsetArray([2 5 8], (0,A_3_3.offsets[2])) @test mean(A_3_3, dims=2) == median(A_3_3, dims=2) == OffsetArray(reshape([4,5,6],(3,1)), (A_3_3.offsets[1],0)) +@test var(A_3_3) == 7.5 +@test std(A_3_3, dims=1) == OffsetArray([1 1 1], (0,A_3_3.offsets[2])) +@test std(A_3_3, dims=2) == OffsetArray(reshape([3,3,3], (3,1)), (A_3_3.offsets[1],0)) @test sum(OffsetArray(fill(1,3000), -1000)) == 3000 @test norm(v) ≈ norm(parent(v)) diff --git a/test/precompile.jl b/test/precompile.jl index cb3e190dcfa7e..51a5833a31730 100644 --- a/test/precompile.jl +++ b/test/precompile.jl @@ -226,7 +226,8 @@ try [:Base64, :CRC32c, :Dates, :DelimitedFiles, :Distributed, :FileWatching, :Markdown, :Future, :Libdl, :LinearAlgebra, :Logging, :Mmap, :Printf, :Profile, :Random, :Serialization, :SharedArrays, :SparseArrays, :SuiteSparse, :Test, - :Unicode, :REPL, :InteractiveUtils, :OldPkg, :Pkg, :LibGit2, :SHA, :UUIDs, :Sockets])) + :Unicode, :REPL, :InteractiveUtils, :OldPkg, :Pkg, :LibGit2, :SHA, :UUIDs, :Sockets, + :Statistics, ])) @test discard_module.(deps) == deps1 @test current_task()(0x01, 0x4000, 0x30031234) == 2 diff --git a/test/ranges.jl b/test/ranges.jl index aa0d7980de956..dbfbc6609c84b 100644 --- a/test/ranges.jl +++ b/test/ranges.jl @@ -829,14 +829,6 @@ end @test length(map(identity, UInt64(1):UInt64(5))) == 5 @test length(map(identity, UInt128(1):UInt128(5))) == 5 end -@testset "mean/median" begin - for f in (mean, median) - for n = 2:5 - @test f(2:n) == f([2:n;]) - @test f(2:0.1:n) ≈ f([2:0.1:n;]) - end - end -end @testset "issue #8531" begin smallint = (Int === Int64 ? (Int8,UInt8,Int16,UInt16,Int32,UInt32) : diff --git a/test/statistics.jl b/test/statistics.jl deleted file mode 100644 index edc401c4c937d..0000000000000 --- a/test/statistics.jl +++ /dev/null @@ -1,149 +0,0 @@ -# This file is a part of Julia. License is MIT: https://julialang.org/license - -using Test, Random, LinearAlgebra - -@testset "middle" begin - @test middle(3) === 3.0 - @test middle(2, 3) === 2.5 - let x = ((realmax(1.0)/4)*3) - @test middle(x, x) === x - end - @test middle(1:8) === 4.5 - @test middle([1:8;]) === 4.5 - - # ensure type-correctness - for T in [Bool,Int8,Int16,Int32,Int64,Int128,UInt8,UInt16,UInt32,UInt64,UInt128,Float16,Float32,Float64] - @test middle(one(T)) === middle(one(T), one(T)) - end -end - -@testset "median" begin - @test median([1.]) === 1. - @test median([1.,3]) === 2. - @test median([1.,3,2]) === 2. - - @test median([1,3,2]) === 2.0 - @test median([1,3,2,4]) === 2.5 - - @test median([0.0,Inf]) == Inf - @test median([0.0,-Inf]) == -Inf - @test median([0.,Inf,-Inf]) == 0.0 - @test median([1.,-1.,Inf,-Inf]) == 0.0 - @test isnan(median([-Inf,Inf])) - - X = [2 3 1 -1; 7 4 5 -4] - @test all(median(X, dims=2) .== [1.5, 4.5]) - @test all(median(X, dims=1) .== [4.5 3.5 3.0 -2.5]) - @test X == [2 3 1 -1; 7 4 5 -4] # issue #17153 - - @test_throws ArgumentError median([]) - @test isnan(median([NaN])) - @test isnan(median([0.0,NaN])) - @test isnan(median([NaN,0.0])) - @test isequal(median([NaN 0.0; 1.2 4.5], dims=2), reshape([NaN; 2.85], 2, 1)) - - @test median!([1 2 3 4]) == 2.5 - @test median!([1 2; 3 4]) == 2.5 - - @test invoke(median, Tuple{AbstractVector}, 1:10) == median(1:10) == 5.5 -end - -@testset "mean" begin - @test_throws ArgumentError mean(()) - @test mean((1,2,3)) === 2. - @test mean([0]) === 0. - @test mean([1.]) === 1. - @test mean([1.,3]) == 2. - @test mean([1,2,3]) == 2. - @test mean([0 1 2; 4 5 6], dims=1) == [2. 3. 4.] - @test mean([1 2 3; 4 5 6], dims=1) == [2.5 3.5 4.5] - @test mean(i->i+1, 0:2) === 2. - @test mean(isodd, [3]) === 1. - @test mean(x->3x, (1,1)) === 3. - - @test isnan(mean([NaN])) - @test isnan(mean([0.0,NaN])) - @test isnan(mean([NaN,0.0])) - - @test isnan(mean([0.,Inf,-Inf])) - @test isnan(mean([1.,-1.,Inf,-Inf])) - @test isnan(mean([-Inf,Inf])) - @test isequal(mean([NaN 0.0; 1.2 4.5], dims=2), reshape([NaN; 2.85], 2, 1)) - - # Check that small types are accumulated using wider type - for T in (Int8, UInt8) - x = [typemax(T) typemax(T)] - g = (v for v in x) - @test mean(x) == mean(g) == typemax(T) - @test mean(identity, x) == mean(identity, g) == typemax(T) - @test mean(x, dims=2) == [typemax(T)]' - end -end - -@testset "quantile" begin - @test quantile([1,2,3,4],0.5) == 2.5 - @test quantile([1,2,3,4],[0.5]) == [2.5] - @test quantile([1., 3],[.25,.5,.75])[2] == median([1., 3]) - @test quantile(100.0:-1.0:0.0, 0.0:0.1:1.0) == 0.0:10.0:100.0 - @test quantile(0.0:100.0, 0.0:0.1:1.0, sorted=true) == 0.0:10.0:100.0 - @test quantile(100f0:-1f0:0.0, 0.0:0.1:1.0) == 0f0:10f0:100f0 - @test quantile([Inf,Inf],0.5) == Inf - @test quantile([-Inf,1],0.5) == -Inf - @test quantile([0,1],1e-18) == 1e-18 - @test quantile([1, 2, 3, 4],[]) == [] - @test quantile([1, 2, 3, 4], (0.5,)) == (2.5,) - @test quantile([4, 9, 1, 5, 7, 8, 2, 3, 5, 17, 11], (0.1, 0.2, 0.4, 0.9)) == (2.0, 3.0, 5.0, 11.0) - @test quantile([1, 2, 3, 4], ()) == () -end - -# StatsBase issue 164 -let y = [0.40003674665581906, 0.4085630862624367, 0.41662034698690303, 0.41662034698690303, 0.42189053966652057, 0.42189053966652057, 0.42553514344518345, 0.43985732442991354] - @test issorted(quantile(y, range(0.01, stop=0.99, length=17))) -end - -@testset "Issue #17153 and PR #17154" begin - a = rand(10,10) - b = copy(a) - x = median(a, dims=1) - @test b == a - x = median(a, dims=2) - @test b == a - x = mean(a, dims=1) - @test b == a - x = mean(a, dims=2) - @test b == a -end - -# dimensional correctness -isdefined(Main, :TestHelpers) || @eval Main include("TestHelpers.jl") -using .Main.TestHelpers: Furlong -@testset "Unitful elements" begin - r = Furlong(1):Furlong(1):Furlong(2) - a = Vector(r) - @test sum(r) == sum(a) == Furlong(3) - @test cumsum(r) == Furlong.([1,3]) - @test mean(r) == mean(a) == median(a) == median(r) == Furlong(1.5) - - # Issue #21786 - A = [Furlong{1}(rand(-5:5)) for i in 1:2, j in 1:2] - @test mean(mean(A, dims=1), dims=2)[1] === mean(A) -end - -# Issue #22901 -@testset "quantile of Any arrays" begin - x = Any[1, 2, 4, 10] - y = Any[1, 2, 4, 10//1] - @test quantile(x, 0.5) === 3.0 - @test quantile(x, 1//2) === 3//1 -end - -@testset "Mean along dimension of empty array" begin - a0 = zeros(0) - a00 = zeros(0, 0) - a01 = zeros(0, 1) - a10 = zeros(1, 0) - @test isequal(mean(a0, dims=1) , fill(NaN, 1)) - @test isequal(mean(a00, dims=(1, 2)), fill(NaN, 1, 1)) - @test isequal(mean(a01, dims=1) , fill(NaN, 1, 1)) - @test isequal(mean(a10, dims=2) , fill(NaN, 1, 1)) -end