From a5bf6a44fd3ea6ff867ee2effe06f6c94922d640 Mon Sep 17 00:00:00 2001 From: mikmoore Date: Sat, 4 Jun 2022 09:37:58 -0600 Subject: [PATCH 01/11] faster minimum/maximum for IEEEFloat --- base/float.jl | 10 ++--- base/reduce.jl | 104 ++++++++++++++++++++++------------------------ base/reducedim.jl | 2 +- 3 files changed, 55 insertions(+), 61 deletions(-) diff --git a/base/float.jl b/base/float.jl index 60850b7e02f64..edb17e9899700 100644 --- a/base/float.jl +++ b/base/float.jl @@ -415,11 +415,7 @@ end isequal(x::T, y::T) where {T<:IEEEFloat} = fpiseq(x, y) # interpret as sign-magnitude integer -@inline function _fpint(x) - IntT = inttype(typeof(x)) - ix = reinterpret(IntT, x) - return ifelse(ix < zero(IntT), ix ⊻ typemax(IntT), ix) -end +@inline _fpint(x) = flipifneg(reinterpret(Signed, x)) @inline function isless(a::T, b::T) where T<:IEEEFloat (isnan(a) || isnan(b)) && return !isnan(a) @@ -427,6 +423,10 @@ end return _fpint(a) < _fpint(b) end +# if negative, flip all bits except the top bit +# used for permuting the order of IEEEFloat values +flipifneg(x::BitSigned) = xor(x, ifelse(signbit(x), typemax(x), zero(x))) + # Exact Float (Tf) vs Integer (Ti) comparisons # Assumes: # - typemax(Ti) == 2^n-1 diff --git a/base/reduce.jl b/base/reduce.jl index 45284d884a279..201dd6bc2e48c 100644 --- a/base/reduce.jl +++ b/base/reduce.jl @@ -349,6 +349,11 @@ reduce_empty(op::MappingRF, ::Type{T}) where {T} = mapreduce_empty(op.f, op.rf, reduce_empty(op::FilteringRF, ::Type{T}) where {T} = reduce_empty(op.rf, T) reduce_empty(op::FlipArgs, ::Type{T}) where {T} = reduce_empty(op.f, T) +# return a function to transform a value of type `T` to be fast when the operation `op` is applied +_makefast_preproc(op, T) = identity +# reverse the transform used by makefast_preproc, restoring `x` to a value of type `T` +_makefast_postproc(op, T, x) = x + """ Base.mapreduce_empty(f, op, T) @@ -410,7 +415,13 @@ The default is `reduce_first(op, f(x))`. """ mapreduce_first(f, op, x) = reduce_first(op, f(x)) -_mapreduce(f, op, A::AbstractArrayOrBroadcasted) = _mapreduce(f, op, IndexStyle(A), A) +function _mapreduce(f, op, A::AbstractArrayOrBroadcasted) + fT = _return_type(f, Tuple{eltype(A)}) + accelerator = _makefast_preproc(op, fT) + ffast = accelerator ∘ f + rfast = _mapreduce(ffast, op, IndexStyle(A), A) + return _makefast_postproc(op, fT, rfast) +end function _mapreduce(f, op, ::IndexLinear, A::AbstractArrayOrBroadcasted) inds = LinearIndices(A) @@ -604,62 +615,45 @@ julia> prod(1:5; init = 1.0) """ prod(a; kw...) = mapreduce(identity, mul_prod, a; kw...) -## maximum, minimum, & extrema -_fast(::typeof(min),x,y) = min(x,y) -_fast(::typeof(max),x,y) = max(x,y) -function _fast(::typeof(max), x::AbstractFloat, y::AbstractFloat) - ifelse(isnan(x), - x, - ifelse(x > y, x, y)) +# The idea behind the following functions is to transform `IEEEFloat` values to native `Signed` values that obey the same `min`/`max` behavior. +# Namely, the float values must match the semantics of `<` except that `-0.0 < +0.0` and `NaN` is preferred over any other value. +# In the case that both arguments are `NaN`, either value may be returned. +# If we reinterpret(Signed,::IEEEFloat), the signed-integer ordering is: +# -0.0, ... negative values in ascending magnitude ..., -Inf, -NaN, 0.0, ... positive values in ascending magnitude ..., +Inf, +NaN +# The desired ordering for min is: +# NaN (unspecified sign), -Inf, ... negative values in descending magnitude ..., -0.0, +0.0, ... positive values in ascending magnitude ..., +Inf +# The desired ordering for max is: +# -Inf, ... negative values in descending magnitude ..., -0.0, +0.0, ... positive values in ascending magnitude ..., +Inf, NaN (unspecified sign) +# Achieving this ordering requires two steps: +# 1) flip the ordering of all values with a negative sign +# 2) circularly-shift the values so that NaNs are together at the correct end of the spectrum +# The following functions perform this transformation and reverse it. +# Under this scheme, we place a total order on every value (including NaNs), although the ordering of NaNs is an implementation detail. +function _makefast_preproc(op::typeof(min), ::Type{T}) where T<:IEEEFloat + # transform an item of type `T` such that the ordering of `op` is still respected but `op` is faster + topval = flipifneg(reinterpret(Signed, typemax(T))) + offset = typemax(topval) - topval # adding this to a int-interpreted float will place typemax(T) at the top + floatorder_min(x) = flipifneg(reinterpret(Signed, x)) + offset + return floatorder_min end - -function _fast(::typeof(min),x::AbstractFloat, y::AbstractFloat) - ifelse(isnan(x), - x, - ifelse(x < y, x, y)) +function _makefast_postproc(op::typeof(min), ::Type{T}, x::Base.BitSigned) where T<:IEEEFloat + # undo the transformation of _makefast_preproc + topval = flipifneg(reinterpret(Signed, typemax(T))) + offset = typemax(topval) - topval # adding this to a int-interpreted float will place typemax(T) at the top + return reinterpret(T, flipifneg(x - offset)) end - -isbadzero(::typeof(max), x::AbstractFloat) = (x == zero(x)) & signbit(x) -isbadzero(::typeof(min), x::AbstractFloat) = (x == zero(x)) & !signbit(x) -isbadzero(op, x) = false -isgoodzero(::typeof(max), x) = isbadzero(min, x) -isgoodzero(::typeof(min), x) = isbadzero(max, x) - -function mapreduce_impl(f, op::Union{typeof(max), typeof(min)}, - A::AbstractArrayOrBroadcasted, first::Int, last::Int) - a1 = @inbounds A[first] - v1 = mapreduce_first(f, op, a1) - v2 = v3 = v4 = v1 - chunk_len = 256 - start = first + 1 - simdstop = start + chunk_len - 4 - while simdstop <= last - 3 - @inbounds for i in start:4:simdstop - v1 = _fast(op, v1, f(A[i+0])) - v2 = _fast(op, v2, f(A[i+1])) - v3 = _fast(op, v3, f(A[i+2])) - v4 = _fast(op, v4, f(A[i+3])) - end - checkbounds(A, simdstop+3) - start += chunk_len - simdstop += chunk_len - end - v = op(op(v1,v2),op(v3,v4)) - for i in start:last - @inbounds ai = A[i] - v = op(v, f(ai)) - end - - # enforce correct order of 0.0 and -0.0 - # e.g. maximum([0.0, -0.0]) === 0.0 - # should hold - if isbadzero(op, v) - for i in first:last - x = @inbounds A[i] - isgoodzero(op,x) && return x - end - end - return v +function _makefast_preproc(op::typeof(max), ::Type{T}) where T<:IEEEFloat + # transform an item of type `T` such that the ordering of `op` is still respected but `op` is faster + botval = flipifneg(reinterpret(Signed, typemin(T))) + offset = typemin(botval) - botval # adding this to a int-interpreted float will place typemin(T) at the bottom + floatorder_max(x) = flipifneg(reinterpret(Signed, x)) + offset + return floatorder_max +end +function _makefast_postproc(op::typeof(max), ::Type{T}, x::Base.BitSigned) where T<:IEEEFloat + # undo the transformation of _makefast_preproc + botval = flipifneg(reinterpret(Signed, typemin(T))) + offset = typemin(botval) - botval # adding this to a int-interpreted float will place typemin(T) at the bottom + return reinterpret(T, flipifneg(x - offset)) end """ diff --git a/base/reducedim.jl b/base/reducedim.jl index a2dfc28ca99c3..ca0e9984dcb62 100644 --- a/base/reducedim.jl +++ b/base/reducedim.jl @@ -363,7 +363,7 @@ _mapreduce_dim(f, op, nt, A::AbstractArrayOrBroadcasted, ::Colon) = mapfoldl_impl(f, op, nt, A) _mapreduce_dim(f, op, ::_InitialValue, A::AbstractArrayOrBroadcasted, ::Colon) = - _mapreduce(f, op, IndexStyle(A), A) + _mapreduce(f, op, A) _mapreduce_dim(f, op, nt, A::AbstractArrayOrBroadcasted, dims) = mapreducedim!(f, op, reducedim_initarray(A, dims, nt), A) From aea0ec612a09f673818f78b553b403fe30807be8 Mon Sep 17 00:00:00 2001 From: mikmoore Date: Sat, 4 Jun 2022 10:28:45 -0600 Subject: [PATCH 02/11] faster isless --- base/float.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/base/float.jl b/base/float.jl index edb17e9899700..ac895b3570b0c 100644 --- a/base/float.jl +++ b/base/float.jl @@ -414,13 +414,13 @@ end isequal(x::T, y::T) where {T<:IEEEFloat} = fpiseq(x, y) -# interpret as sign-magnitude integer -@inline _fpint(x) = flipifneg(reinterpret(Signed, x)) - @inline function isless(a::T, b::T) where T<:IEEEFloat - (isnan(a) || isnan(b)) && return !isnan(a) - - return _fpint(a) < _fpint(b) + botval = flipifneg(reinterpret(Signed, typemin(T))) + topval = flipifneg(reinterpret(Signed, typemax(T))) + offset = typemin(botval) - botval # adding offset will place typemin(T) at the bottom, wrapping NaN to the top + u = flipifneg(reinterpret(Signed, a)) + offset + v = flipifneg(reinterpret(Signed, b)) + offset + return (u < v) & (u <= topval+offset) # (a < b) & (a <= Inf), with the proper signed zero handling end # if negative, flip all bits except the top bit From a0b5853049f64daec28668baf50bc8ad78ae295b Mon Sep 17 00:00:00 2001 From: mikmoore Date: Sat, 4 Jun 2022 11:12:00 -0600 Subject: [PATCH 03/11] only apply _makefast if type is concrete --- base/reduce.jl | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/base/reduce.jl b/base/reduce.jl index 201dd6bc2e48c..00ff833c19336 100644 --- a/base/reduce.jl +++ b/base/reduce.jl @@ -417,10 +417,14 @@ mapreduce_first(f, op, x) = reduce_first(op, f(x)) function _mapreduce(f, op, A::AbstractArrayOrBroadcasted) fT = _return_type(f, Tuple{eltype(A)}) - accelerator = _makefast_preproc(op, fT) - ffast = accelerator ∘ f - rfast = _mapreduce(ffast, op, IndexStyle(A), A) - return _makefast_postproc(op, fT, rfast) + if isconcretetype(fT) + accelerator = _makefast_preproc(op, fT) + ffast = accelerator ∘ f + rfast = _mapreduce(ffast, op, IndexStyle(A), A) + return _makefast_postproc(op, fT, rfast) + else + return _mapreduce(f, op, IndexStyle(A), A) + end end function _mapreduce(f, op, ::IndexLinear, A::AbstractArrayOrBroadcasted) From 0fc9ce00a8f74edf8a0c2aaa966d77ee78ccdbb4 Mon Sep 17 00:00:00 2001 From: mikmoore Date: Sat, 4 Jun 2022 11:44:40 -0600 Subject: [PATCH 04/11] fast extrema --- base/reduce.jl | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/base/reduce.jl b/base/reduce.jl index 00ff833c19336..ab50b29c4e042 100644 --- a/base/reduce.jl +++ b/base/reduce.jl @@ -634,31 +634,43 @@ prod(a; kw...) = mapreduce(identity, mul_prod, a; kw...) # The following functions perform this transformation and reverse it. # Under this scheme, we place a total order on every value (including NaNs), although the ordering of NaNs is an implementation detail. function _makefast_preproc(op::typeof(min), ::Type{T}) where T<:IEEEFloat - # transform an item of type `T` such that the ordering of `op` is still respected but `op` is faster topval = flipifneg(reinterpret(Signed, typemax(T))) offset = typemax(topval) - topval # adding this to a int-interpreted float will place typemax(T) at the top - floatorder_min(x) = flipifneg(reinterpret(Signed, x)) + offset + floatorder_min(x::T) = flipifneg(reinterpret(Signed, x)) + offset return floatorder_min end function _makefast_postproc(op::typeof(min), ::Type{T}, x::Base.BitSigned) where T<:IEEEFloat - # undo the transformation of _makefast_preproc topval = flipifneg(reinterpret(Signed, typemax(T))) offset = typemax(topval) - topval # adding this to a int-interpreted float will place typemax(T) at the top return reinterpret(T, flipifneg(x - offset)) end function _makefast_preproc(op::typeof(max), ::Type{T}) where T<:IEEEFloat - # transform an item of type `T` such that the ordering of `op` is still respected but `op` is faster botval = flipifneg(reinterpret(Signed, typemin(T))) offset = typemin(botval) - botval # adding this to a int-interpreted float will place typemin(T) at the bottom - floatorder_max(x) = flipifneg(reinterpret(Signed, x)) + offset + floatorder_max(x::T) = flipifneg(reinterpret(Signed, x)) + offset return floatorder_max end function _makefast_postproc(op::typeof(max), ::Type{T}, x::Base.BitSigned) where T<:IEEEFloat - # undo the transformation of _makefast_preproc botval = flipifneg(reinterpret(Signed, typemin(T))) offset = typemin(botval) - botval # adding this to a int-interpreted float will place typemin(T) at the bottom return reinterpret(T, flipifneg(x - offset)) end +function _makefast_preproc(op::typeof(_extrema_rf), ::Type{NTuple{2,T}}) where T<:IEEEFloat + topval = flipifneg(reinterpret(Signed, typemax(T))) + offset_min = typemax(topval) - topval # adding this to a int-interpreted float will place typemax(T) at the top + botval = flipifneg(reinterpret(Signed, typemin(T))) + offset_max = typemin(botval) - botval # adding this to a int-interpreted float will place typemin(T) at the bottom + floatorder_extrema((x,y)::NTuple{2,T}) = (flipifneg(reinterpret(Signed, x))+offset_min, flipifneg(reinterpret(Signed, y))+offset_max) + return floatorder_extrema +end +function _makefast_postproc(op::typeof(_extrema_rf), ::Type{NTuple{2,T}}, (x,y)::NTuple{2,Base.BitSigned}) where T<:IEEEFloat + # undo the transformation of _makefast_preproc + topval = flipifneg(reinterpret(Signed, typemax(T))) + offset_min = typemax(topval) - topval # adding this to a int-interpreted float will place typemax(T) at the top + botval = flipifneg(reinterpret(Signed, typemin(T))) + offset_max = typemin(botval) - botval # adding this to a int-interpreted float will place typemin(T) at the bottom + return (reinterpret(T, flipifneg(x - offset_min)), reinterpret(T, flipifneg(x - offset_max))) +end """ maximum(f, itr; [init]) From 130b0e5828e2232bc9afdeb5cfcfcda263e84d8f Mon Sep 17 00:00:00 2001 From: mikmoore Date: Sun, 5 Jun 2022 09:07:51 -0600 Subject: [PATCH 05/11] less ambiguity in accelerator definitions --- base/reduce.jl | 45 ++++++++++----------------------------------- 1 file changed, 10 insertions(+), 35 deletions(-) diff --git a/base/reduce.jl b/base/reduce.jl index ab50b29c4e042..d980141aa8209 100644 --- a/base/reduce.jl +++ b/base/reduce.jl @@ -633,43 +633,18 @@ prod(a; kw...) = mapreduce(identity, mul_prod, a; kw...) # 2) circularly-shift the values so that NaNs are together at the correct end of the spectrum # The following functions perform this transformation and reverse it. # Under this scheme, we place a total order on every value (including NaNs), although the ordering of NaNs is an implementation detail. -function _makefast_preproc(op::typeof(min), ::Type{T}) where T<:IEEEFloat - topval = flipifneg(reinterpret(Signed, typemax(T))) - offset = typemax(topval) - topval # adding this to a int-interpreted float will place typemax(T) at the top - floatorder_min(x::T) = flipifneg(reinterpret(Signed, x)) + offset - return floatorder_min -end -function _makefast_postproc(op::typeof(min), ::Type{T}, x::Base.BitSigned) where T<:IEEEFloat - topval = flipifneg(reinterpret(Signed, typemax(T))) - offset = typemax(topval) - topval # adding this to a int-interpreted float will place typemax(T) at the top - return reinterpret(T, flipifneg(x - offset)) -end -function _makefast_preproc(op::typeof(max), ::Type{T}) where T<:IEEEFloat - botval = flipifneg(reinterpret(Signed, typemin(T))) - offset = typemin(botval) - botval # adding this to a int-interpreted float will place typemin(T) at the bottom - floatorder_max(x::T) = flipifneg(reinterpret(Signed, x)) + offset - return floatorder_max -end -function _makefast_postproc(op::typeof(max), ::Type{T}, x::Base.BitSigned) where T<:IEEEFloat - botval = flipifneg(reinterpret(Signed, typemin(T))) - offset = typemin(botval) - botval # adding this to a int-interpreted float will place typemin(T) at the bottom - return reinterpret(T, flipifneg(x - offset)) -end -function _makefast_preproc(op::typeof(_extrema_rf), ::Type{NTuple{2,T}}) where T<:IEEEFloat - topval = flipifneg(reinterpret(Signed, typemax(T))) +for (T,S) in ((Float16, Int16), (Float32, Int32), (Float64, Int64)) + topval = flipifneg(reinterpret(S, typemax(T))) + botval = flipifneg(reinterpret(S, typemin(T))) offset_min = typemax(topval) - topval # adding this to a int-interpreted float will place typemax(T) at the top - botval = flipifneg(reinterpret(Signed, typemin(T))) offset_max = typemin(botval) - botval # adding this to a int-interpreted float will place typemin(T) at the bottom - floatorder_extrema((x,y)::NTuple{2,T}) = (flipifneg(reinterpret(Signed, x))+offset_min, flipifneg(reinterpret(Signed, y))+offset_max) - return floatorder_extrema -end -function _makefast_postproc(op::typeof(_extrema_rf), ::Type{NTuple{2,T}}, (x,y)::NTuple{2,Base.BitSigned}) where T<:IEEEFloat - # undo the transformation of _makefast_preproc - topval = flipifneg(reinterpret(Signed, typemax(T))) - offset_min = typemax(topval) - topval # adding this to a int-interpreted float will place typemax(T) at the top - botval = flipifneg(reinterpret(Signed, typemin(T))) - offset_max = typemin(botval) - botval # adding this to a int-interpreted float will place typemin(T) at the bottom - return (reinterpret(T, flipifneg(x - offset_min)), reinterpret(T, flipifneg(x - offset_max))) + + @eval _makefast_preproc(op::typeof(min), ::Type{$T}) = (floatorder_min(x::$T) = flipifneg(reinterpret($S, x)) + $offset_min) + @eval _makefast_preproc(op::typeof(max), ::Type{$T}) = (floatorder_max(x::$T) = flipifneg(reinterpret($S, x)) + $offset_max) + @eval _makefast_preproc(op::typeof(_extrema_rf), ::Type{NTuple{2,$T}}) = (floatorder_extrema((x,y)::NTuple{2,$T}) = (flipifneg(reinterpret($S, x))+$offset_min, flipifneg(reinterpret($S, y))+$offset_max)) + @eval _makefast_postproc(op::typeof(min), ::Type{$T}, x::$S) = reinterpret($T, flipifneg(x - $offset_min)) + @eval _makefast_postproc(op::typeof(max), ::Type{$T}, x::$S) = reinterpret($T, flipifneg(x - $offset_max)) + @eval _makefast_postproc(op::typeof(_extrema_rf), ::Type{NTuple{2,$T}}, (x,y)::NTuple{2,$S}) = (reinterpret($T, flipifneg(x - $offset_min)), reinterpret($T, flipifneg(x - $offset_max))) end """ From 02efa06ee0a4d719eb61d1326eb18566384646e1 Mon Sep 17 00:00:00 2001 From: mikmoore Date: Sun, 5 Jun 2022 10:53:57 -0600 Subject: [PATCH 06/11] only apply accelerator if it exists --- base/reduce.jl | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/base/reduce.jl b/base/reduce.jl index d980141aa8209..fa195752df9ee 100644 --- a/base/reduce.jl +++ b/base/reduce.jl @@ -416,15 +416,11 @@ The default is `reduce_first(op, f(x))`. mapreduce_first(f, op, x) = reduce_first(op, f(x)) function _mapreduce(f, op, A::AbstractArrayOrBroadcasted) - fT = _return_type(f, Tuple{eltype(A)}) - if isconcretetype(fT) - accelerator = _makefast_preproc(op, fT) - ffast = accelerator ∘ f - rfast = _mapreduce(ffast, op, IndexStyle(A), A) - return _makefast_postproc(op, fT, rfast) - else - return _mapreduce(f, op, IndexStyle(A), A) - end + fT = _return_type(f, Tuple{eltype(A)}) # determine type of f(A[i]) + preproc = _makefast_preproc(op, fT) # fetch an accelerator, if available + ffast = preproc == identity ? f : preproc ∘ f # apply accelerator, if available + rfast = _mapreduce(ffast, op, IndexStyle(A), A) + return _makefast_postproc(op, fT, rfast) end function _mapreduce(f, op, ::IndexLinear, A::AbstractArrayOrBroadcasted) From f00ac8c7bfa165424f78f355ba879ed3092029fb Mon Sep 17 00:00:00 2001 From: mikmoore <95002244+mikmoore@users.noreply.github.com> Date: Sun, 5 Jun 2022 18:05:46 -0600 Subject: [PATCH 07/11] Apply suggestions from code review Co-authored-by: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com> --- base/reduce.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/reduce.jl b/base/reduce.jl index fa195752df9ee..7f923c1ec18e4 100644 --- a/base/reduce.jl +++ b/base/reduce.jl @@ -418,7 +418,7 @@ mapreduce_first(f, op, x) = reduce_first(op, f(x)) function _mapreduce(f, op, A::AbstractArrayOrBroadcasted) fT = _return_type(f, Tuple{eltype(A)}) # determine type of f(A[i]) preproc = _makefast_preproc(op, fT) # fetch an accelerator, if available - ffast = preproc == identity ? f : preproc ∘ f # apply accelerator, if available + ffast = preproc === identity ? f : preproc ∘ f # apply accelerator, if available rfast = _mapreduce(ffast, op, IndexStyle(A), A) return _makefast_postproc(op, fT, rfast) end From 81a0ff5c4f37c71a813338c17c7f08fb87a404f2 Mon Sep 17 00:00:00 2001 From: mikmoore Date: Wed, 21 Dec 2022 17:10:25 -0800 Subject: [PATCH 08/11] reorganize --- base/reduce.jl | 126 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 88 insertions(+), 38 deletions(-) diff --git a/base/reduce.jl b/base/reduce.jl index 7f923c1ec18e4..8f2442441d2b0 100644 --- a/base/reduce.jl +++ b/base/reduce.jl @@ -349,11 +349,6 @@ reduce_empty(op::MappingRF, ::Type{T}) where {T} = mapreduce_empty(op.f, op.rf, reduce_empty(op::FilteringRF, ::Type{T}) where {T} = reduce_empty(op.rf, T) reduce_empty(op::FlipArgs, ::Type{T}) where {T} = reduce_empty(op.f, T) -# return a function to transform a value of type `T` to be fast when the operation `op` is applied -_makefast_preproc(op, T) = identity -# reverse the transform used by makefast_preproc, restoring `x` to a value of type `T` -_makefast_postproc(op, T, x) = x - """ Base.mapreduce_empty(f, op, T) @@ -416,11 +411,8 @@ The default is `reduce_first(op, f(x))`. mapreduce_first(f, op, x) = reduce_first(op, f(x)) function _mapreduce(f, op, A::AbstractArrayOrBroadcasted) - fT = _return_type(f, Tuple{eltype(A)}) # determine type of f(A[i]) - preproc = _makefast_preproc(op, fT) # fetch an accelerator, if available - ffast = preproc === identity ? f : preproc ∘ f # apply accelerator, if available - rfast = _mapreduce(ffast, op, IndexStyle(A), A) - return _makefast_postproc(op, fT, rfast) + pre_f, op_fast, post = _makefast_mapreduction(f, op, A) # find an accelerator, if available + return post(_mapreduce(pre_f, op_fast, IndexStyle(A), A)) end function _mapreduce(f, op, ::IndexLinear, A::AbstractArrayOrBroadcasted) @@ -615,34 +607,6 @@ julia> prod(1:5; init = 1.0) """ prod(a; kw...) = mapreduce(identity, mul_prod, a; kw...) -# The idea behind the following functions is to transform `IEEEFloat` values to native `Signed` values that obey the same `min`/`max` behavior. -# Namely, the float values must match the semantics of `<` except that `-0.0 < +0.0` and `NaN` is preferred over any other value. -# In the case that both arguments are `NaN`, either value may be returned. -# If we reinterpret(Signed,::IEEEFloat), the signed-integer ordering is: -# -0.0, ... negative values in ascending magnitude ..., -Inf, -NaN, 0.0, ... positive values in ascending magnitude ..., +Inf, +NaN -# The desired ordering for min is: -# NaN (unspecified sign), -Inf, ... negative values in descending magnitude ..., -0.0, +0.0, ... positive values in ascending magnitude ..., +Inf -# The desired ordering for max is: -# -Inf, ... negative values in descending magnitude ..., -0.0, +0.0, ... positive values in ascending magnitude ..., +Inf, NaN (unspecified sign) -# Achieving this ordering requires two steps: -# 1) flip the ordering of all values with a negative sign -# 2) circularly-shift the values so that NaNs are together at the correct end of the spectrum -# The following functions perform this transformation and reverse it. -# Under this scheme, we place a total order on every value (including NaNs), although the ordering of NaNs is an implementation detail. -for (T,S) in ((Float16, Int16), (Float32, Int32), (Float64, Int64)) - topval = flipifneg(reinterpret(S, typemax(T))) - botval = flipifneg(reinterpret(S, typemin(T))) - offset_min = typemax(topval) - topval # adding this to a int-interpreted float will place typemax(T) at the top - offset_max = typemin(botval) - botval # adding this to a int-interpreted float will place typemin(T) at the bottom - - @eval _makefast_preproc(op::typeof(min), ::Type{$T}) = (floatorder_min(x::$T) = flipifneg(reinterpret($S, x)) + $offset_min) - @eval _makefast_preproc(op::typeof(max), ::Type{$T}) = (floatorder_max(x::$T) = flipifneg(reinterpret($S, x)) + $offset_max) - @eval _makefast_preproc(op::typeof(_extrema_rf), ::Type{NTuple{2,$T}}) = (floatorder_extrema((x,y)::NTuple{2,$T}) = (flipifneg(reinterpret($S, x))+$offset_min, flipifneg(reinterpret($S, y))+$offset_max)) - @eval _makefast_postproc(op::typeof(min), ::Type{$T}, x::$S) = reinterpret($T, flipifneg(x - $offset_min)) - @eval _makefast_postproc(op::typeof(max), ::Type{$T}, x::$S) = reinterpret($T, flipifneg(x - $offset_max)) - @eval _makefast_postproc(op::typeof(_extrema_rf), ::Type{NTuple{2,$T}}, (x,y)::NTuple{2,$S}) = (reinterpret($T, flipifneg(x - $offset_min)), reinterpret($T, flipifneg(x - $offset_max))) -end - """ maximum(f, itr; [init]) @@ -1337,3 +1301,89 @@ function _simple_count(::typeof(identity), x::Array{Bool}, init::T=0) where {T} end return n end + +""" + Base._FastMapReduce(transform, mapfun) + +Essentially a [`ComposedFunction`](@ref) but with limited functionality and +a few specialized methods so that it can initialize collections properly. + +Used with `mapreduce` in some situations. +""" +struct _FastMapReduce{A,B} + trans::A + f::B +end +# (c::_FastMapReduce)(x...; kw...) = c.trans(c.f(x...; kw...)) # behave like trans ∘ f +(c::_FastMapReduce)(x) = c.trans(c.f(x)) # behave like trans ∘ f +mapreduce_empty(c::_FastMapReduce, op, T) = c.trans(mapreduce_empty(c.f, op, T)) +mapreduce_first(c::_FastMapReduce, op, x) = c.trans(mapreduce_first(c.f, op, x)) + +""" + Base._makefast_mapreduction(f, op, A) -> (pre_f, op_fast, post) + +Wrapper to call `Base._makefast_reduction(op, T)`, where `T` is the return type of +`f(::eltype(A))`, and compose the resulting `pre`-function with `f` to make `pre_f`. +""" +function _makefast_mapreduction(f, op, A) + elT = eltype(A) # try this + if elT == Any + elT = @default_eltype(A) # try again + end + fT = _return_type(f, Tuple{elT}) # determine type of f(A[i]), if possible + pre, op_fast, post = _makefast_reduction(op, fT) # find an accelerator, if available + return _FastMapReduce(pre, f), op_fast, post +end + +""" + Base._makefast_reduction(op, T) -> (pre, op_fast, post) + +Define a series of functions that can be used accelerate the computation of `op(x, y)`. +Incorrect results may be produced if it does not hold that `op(x::T, y::T)` and +`post(op_fast(pre(x::T), pre(y::T)))` produce equivalent results (and further, that this +holds for arbitrary-length reductions). This can be useful if `op_fast` is much faster +than `op` after the preprocessing provided by `pre`. + +The fallback return value for this method is `(identity, op, identity)`, corresponding +to no change to the processing. +""" +_makefast_reduction(op, T) = (identity, op, identity) + +# The idea behind the following functions is to transform `IEEEFloat` values to native `Signed` values that obey the same `min`/`max` behavior. +# Namely, the float values must match the semantics of `<` except that `-0.0 < +0.0` and `NaN` is preferred over any other value. +# In the case that both arguments are `NaN`, either value may be returned. +# If we reinterpret(Signed,::IEEEFloat), the signed-integer ordering is: +# -0.0, ... negative values in ascending magnitude ..., -Inf, -NaN, 0.0, ... positive values in ascending magnitude ..., +Inf, +NaN +# The desired ordering for min is: +# NaN (unspecified sign), -Inf, ... negative values in descending magnitude ..., -0.0, +0.0, ... positive values in ascending magnitude ..., +Inf +# The desired ordering for max is: +# -Inf, ... negative values in descending magnitude ..., -0.0, +0.0, ... positive values in ascending magnitude ..., +Inf, NaN (unspecified sign) +# Achieving this ordering requires two steps: +# 1) flip the ordering of all values with a negative sign +# 2) circularly shift the values so that NaNs are together at the correct end of the spectrum +# The following functions perform this transformation and reverse it. +# Under this scheme, we place a total order on every value (including NaNs), although the ordering of NaNs is an implementation detail. +for (T, S) in ((Float16, Int16), (Float32, Int32), (Float64, Int64)) + topval = flipifneg(reinterpret(S, typemax(T))) + botval = flipifneg(reinterpret(S, typemin(T))) + offset_min = typemax(topval) - topval # adding this to a int-interpreted float will place typemax(T) at the top + offset_max = typemin(botval) - botval # adding this to a int-interpreted float will place typemin(T) at the bottom + + for (op, offset) in ((min, offset_min), (max, offset_max)) + @eval function _makefast_reduction(::typeof($op), ::Type{$T}) + preprocess(x::$T) = reinterpret($T, flipifneg(reinterpret($S, x)) + $offset) + reduction(x::$T, y::$T) = reinterpret($T, $op(reinterpret($S, x), reinterpret($S, y))) + postprocess(x::$T) = reinterpret($T, flipifneg(reinterpret($S, x) - $offset)) + return preprocess, reduction, postprocess + end + end + + @eval function _makefast_reduction(::typeof(_extrema_rf), ::Type{NTuple{2,$T}}) # used for extrema + premin, reducemin, postmin = _makefast_reduction(min, $T) + premax, reducemax, postmax = _makefast_reduction(max, $T) + preprocess((x, y)::NTuple{2,$T}) = (premin(x), premax(y)) + reduction((x1, y1)::NTuple{2,$T}, (x2, y2)::NTuple{2,$T}) = (reducemin(x1, x2), reducemax(y1, y2)) + postprocess((x, y)::NTuple{2,$T}) = (postmin(x), postmax(y)) + return preprocess, reduction, postprocess + end +end From 413b7392c62c95e96742f838872b26e4c3cbcc1e Mon Sep 17 00:00:00 2001 From: N5N3 <2642243996@qq.com> Date: Fri, 6 Jan 2023 14:55:13 +0800 Subject: [PATCH 09/11] Move the optimization deeper and spread it. It helps to avoid extra allocation (caused by non-specialization) --- base/reduce.jl | 43 +++++++++------------------------------- base/reducedim.jl | 11 ++++++---- base/reinterpretarray.jl | 10 ++++++---- 3 files changed, 22 insertions(+), 42 deletions(-) diff --git a/base/reduce.jl b/base/reduce.jl index a2abb07de3302..b6175128d9a07 100644 --- a/base/reduce.jl +++ b/base/reduce.jl @@ -252,6 +252,8 @@ foldr(op, itr; kw...) = mapfoldr(identity, op, itr; kw...) # certain `op` (e.g. `min` and `max`) may have their own specialized versions. @noinline function mapreduce_impl(f, op, A::AbstractArrayOrBroadcasted, ifirst::Integer, ilast::Integer, blksize::Int) + Eltype = _mapped_eltype(f, A) + pre, op_fast, post = _makefast_reduction(op, Eltype) if ifirst == ilast @inbounds a1 = A[ifirst] return mapreduce_first(f, op, a1) @@ -259,12 +261,12 @@ foldr(op, itr; kw...) = mapfoldr(identity, op, itr; kw...) # sequential portion @inbounds a1 = A[ifirst] @inbounds a2 = A[ifirst+1] - v = op(f(a1), f(a2)) + v = op_fast(pre(f(a1)), pre(f(a2))) @simd for i = ifirst + 2 : ilast @inbounds ai = A[i] - v = op(v, f(ai)) + v = op_fast(v, pre(f(ai))) end - return v + return post(v) else # pairwise portion imid = ifirst + (ilast - ifirst) >> 1 @@ -423,10 +425,7 @@ The default is `reduce_first(op, f(x))`. """ mapreduce_first(f, op, x) = reduce_first(op, f(x)) -function _mapreduce(f, op, A::AbstractArrayOrBroadcasted) - pre_f, op_fast, post = _makefast_mapreduction(f, op, A) # find an accelerator, if available - return post(_mapreduce(pre_f, op_fast, IndexStyle(A), A)) -end +_mapreduce(f, op, A::AbstractArrayOrBroadcasted) = _mapreduce(f, op, IndexStyle(A), A) function _mapreduce(f, op, ::IndexLinear, A::AbstractArrayOrBroadcasted) inds = LinearIndices(A) @@ -1325,37 +1324,13 @@ function _simple_count(::typeof(identity), x::Array{Bool}, init::T=0) where {T} return n end -""" - Base._FastMapReduce(transform, mapfun) - -Essentially a [`ComposedFunction`](@ref) but with limited functionality and -a few specialized methods so that it can initialize collections properly. - -Used with `mapreduce` in some situations. -""" -struct _FastMapReduce{A,B} - trans::A - f::B -end -# (c::_FastMapReduce)(x...; kw...) = c.trans(c.f(x...; kw...)) # behave like trans ∘ f -(c::_FastMapReduce)(x) = c.trans(c.f(x)) # behave like trans ∘ f -mapreduce_empty(c::_FastMapReduce, op, T) = c.trans(mapreduce_empty(c.f, op, T)) -mapreduce_first(c::_FastMapReduce, op, x) = c.trans(mapreduce_first(c.f, op, x)) - -""" - Base._makefast_mapreduction(f, op, A) -> (pre_f, op_fast, post) - -Wrapper to call `Base._makefast_reduction(op, T)`, where `T` is the return type of -`f(::eltype(A))`, and compose the resulting `pre`-function with `f` to make `pre_f`. -""" -function _makefast_mapreduction(f, op, A) +function _mapped_eltype(f, A) elT = eltype(A) # try this - if elT == Any + if elT === Any elT = @default_eltype(A) # try again end fT = _return_type(f, Tuple{elT}) # determine type of f(A[i]), if possible - pre, op_fast, post = _makefast_reduction(op, fT) # find an accelerator, if available - return _FastMapReduce(pre, f), op_fast, post + return isconcretetype(fT) ? fT : Any # always widen this to `Any` if non-concrete end """ diff --git a/base/reducedim.jl b/base/reducedim.jl index 2479f86fab827..baa7549d05d45 100644 --- a/base/reducedim.jl +++ b/base/reducedim.jl @@ -301,14 +301,17 @@ function _mapreducedim!(f, op, R::AbstractArray, A::AbstractArrayOrBroadcasted) keep, Idefault = Broadcast.shapeindexer(indsRt) if reducedim1(R, A) # keep the accumulator as a local variable when reducing along the first dimension + Eltype = _mapped_eltype(f, A) + Eltype = Eltype === eltype(R) ? Eltype : Any + pre, op_fast, post = _makefast_reduction(op, Eltype) i1 = first(axes1(R)) @inbounds for IA in CartesianIndices(indsAt) IR = Broadcast.newindex(IA, keep, Idefault) - r = R[i1,IR] + r = pre(R[i1,IR]) @simd for i in axes(A, 1) - r = op(r, f(A[i, IA])) + r = op_fast(r, pre(f(A[i, IA]))) end - R[i1,IR] = r + R[i1,IR] = post(r) end else @inbounds for IA in CartesianIndices(indsAt) @@ -363,7 +366,7 @@ _mapreduce_dim(f, op, nt, A::AbstractArrayOrBroadcasted, ::Colon) = mapfoldl_impl(f, op, nt, A) _mapreduce_dim(f, op, ::_InitialValue, A::AbstractArrayOrBroadcasted, ::Colon) = - _mapreduce(f, op, A) + _mapreduce(f, op, IndexStyle(A), A) _mapreduce_dim(f, op, nt, A::AbstractArrayOrBroadcasted, dims) = mapreducedim!(f, op, reducedim_initarray(A, dims, nt), A) diff --git a/base/reinterpretarray.jl b/base/reinterpretarray.jl index 1fe0788a1739a..af8c6eeb4f718 100644 --- a/base/reinterpretarray.jl +++ b/base/reinterpretarray.jl @@ -757,23 +757,25 @@ end @noinline function mapreduce_impl(f::F, op::OP, A::AbstractArrayOrBroadcasted, ifirst::SCI, ilast::SCI, blksize::Int) where {F,OP,SCI<:SCartesianIndex2{K}} where K + Eltype = _mapped_eltype(f, A) + pre, op_fast, post = _makefast_reduction(op, Eltype) if ilast.j - ifirst.j < blksize # sequential portion @inbounds a1 = A[ifirst] @inbounds a2 = A[SCI(2,ifirst.j)] - v = op(f(a1), f(a2)) + v = op_fast(pre(f(a1)), pre(f(a2))) @simd for i = ifirst.i + 2 : K @inbounds ai = A[SCI(i,ifirst.j)] - v = op(v, f(ai)) + v = op_fast(v, pre(f(ai))) end # Remaining columns for j = ifirst.j+1 : ilast.j @simd for i = 1:K @inbounds ai = A[SCI(i,j)] - v = op(v, f(ai)) + v = op_fast(v, pre(f(ai))) end end - return v + return post(v) else # pairwise portion jmid = ifirst.j + (ilast.j - ifirst.j) >> 1 From 068082a2cbb3c4fe8e223a1bdfbdce5eb8c1de88 Mon Sep 17 00:00:00 2001 From: N5N3 <2642243996@qq.com> Date: Fri, 6 Jan 2023 15:09:29 +0800 Subject: [PATCH 10/11] Turn off pairwise reduction for some op. The accuracy won't be better. --- base/reduce.jl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/base/reduce.jl b/base/reduce.jl index b6175128d9a07..12252d2835a9a 100644 --- a/base/reduce.jl +++ b/base/reduce.jl @@ -316,6 +316,9 @@ pairwise_blocksize(f, op) = 1024 # This combination appears to show a benefit from a larger block size pairwise_blocksize(::typeof(abs2), ::typeof(+)) = 4096 +# The following operations prefer non-pairwise reduction. +pairwise_blocksize(_, ::Union{typeof(min),typeof(max)}) = typemax(Int) +pairwise_blocksize(_, ::Union{typeof(|),typeof(&)}) = typemax(Int) # handling empty arrays _empty_reduce_error() = throw(ArgumentError("reducing over an empty collection is not allowed")) @@ -816,6 +819,8 @@ function _extrema_rf(x::NTuple{2,T}, y::NTuple{2,T}) where {T<:IEEEFloat} z1, z2 end +pairwise_blocksize(::ExtremaMap, ::typeof(_extrema_rf)) = typemax(Int) + ## findmax, findmin, argmax & argmin """ From a9bd1e879d007c4a34c09a6544294d4f543a7c0d Mon Sep 17 00:00:00 2001 From: N5N3 <2642243996@qq.com> Date: Sat, 7 Jan 2023 00:26:35 +0800 Subject: [PATCH 11/11] Adopt suggestion. Co-Authored-By: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com> --- base/reduce.jl | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/base/reduce.jl b/base/reduce.jl index 12252d2835a9a..3b5f6e92dd3f2 100644 --- a/base/reduce.jl +++ b/base/reduce.jl @@ -1329,14 +1329,7 @@ function _simple_count(::typeof(identity), x::Array{Bool}, init::T=0) where {T} return n end -function _mapped_eltype(f, A) - elT = eltype(A) # try this - if elT === Any - elT = @default_eltype(A) # try again - end - fT = _return_type(f, Tuple{elT}) # determine type of f(A[i]), if possible - return isconcretetype(fT) ? fT : Any # always widen this to `Any` if non-concrete -end +_mapped_eltype(f, A) = _return_type(f, Tuple{@default_eltype(A)}) """ Base._makefast_reduction(op, T) -> (pre, op_fast, post)