From a5bf6a44fd3ea6ff867ee2effe06f6c94922d640 Mon Sep 17 00:00:00 2001
From: mikmoore <mikmoore@users.noreply.github.com>
Date: Sat, 4 Jun 2022 09:37:58 -0600
Subject: [PATCH 01/11] faster minimum/maximum for IEEEFloat

---
 base/float.jl     |  10 ++---
 base/reduce.jl    | 104 ++++++++++++++++++++++------------------------
 base/reducedim.jl |   2 +-
 3 files changed, 55 insertions(+), 61 deletions(-)

diff --git a/base/float.jl b/base/float.jl
index 60850b7e02f64..edb17e9899700 100644
--- a/base/float.jl
+++ b/base/float.jl
@@ -415,11 +415,7 @@ end
 isequal(x::T, y::T) where {T<:IEEEFloat} = fpiseq(x, y)
 
 # interpret as sign-magnitude integer
-@inline function _fpint(x)
-    IntT = inttype(typeof(x))
-    ix = reinterpret(IntT, x)
-    return ifelse(ix < zero(IntT), ix ⊻ typemax(IntT), ix)
-end
+@inline _fpint(x) = flipifneg(reinterpret(Signed, x))
 
 @inline function isless(a::T, b::T) where T<:IEEEFloat
     (isnan(a) || isnan(b)) && return !isnan(a)
@@ -427,6 +423,10 @@ end
     return _fpint(a) < _fpint(b)
 end
 
+# if negative, flip all bits except the top bit
+# used for permuting the order of IEEEFloat values
+flipifneg(x::BitSigned) = xor(x, ifelse(signbit(x), typemax(x), zero(x)))
+
 # Exact Float (Tf) vs Integer (Ti) comparisons
 # Assumes:
 # - typemax(Ti) == 2^n-1
diff --git a/base/reduce.jl b/base/reduce.jl
index 45284d884a279..201dd6bc2e48c 100644
--- a/base/reduce.jl
+++ b/base/reduce.jl
@@ -349,6 +349,11 @@ reduce_empty(op::MappingRF, ::Type{T}) where {T} = mapreduce_empty(op.f, op.rf,
 reduce_empty(op::FilteringRF, ::Type{T}) where {T} = reduce_empty(op.rf, T)
 reduce_empty(op::FlipArgs, ::Type{T}) where {T} = reduce_empty(op.f, T)
 
+# return a function to transform a value of type `T` to be fast when the operation `op` is applied
+_makefast_preproc(op, T) = identity
+# reverse the transform used by makefast_preproc, restoring `x` to a value of type `T`
+_makefast_postproc(op, T, x) = x
+
 """
     Base.mapreduce_empty(f, op, T)
 
@@ -410,7 +415,13 @@ The default is `reduce_first(op, f(x))`.
 """
 mapreduce_first(f, op, x) = reduce_first(op, f(x))
 
-_mapreduce(f, op, A::AbstractArrayOrBroadcasted) = _mapreduce(f, op, IndexStyle(A), A)
+function _mapreduce(f, op, A::AbstractArrayOrBroadcasted)
+    fT = _return_type(f, Tuple{eltype(A)})
+    accelerator = _makefast_preproc(op, fT)
+    ffast = accelerator ∘ f
+    rfast = _mapreduce(ffast, op, IndexStyle(A), A)
+    return _makefast_postproc(op, fT, rfast)
+end
 
 function _mapreduce(f, op, ::IndexLinear, A::AbstractArrayOrBroadcasted)
     inds = LinearIndices(A)
@@ -604,62 +615,45 @@ julia> prod(1:5; init = 1.0)
 """
 prod(a; kw...) = mapreduce(identity, mul_prod, a; kw...)
 
-## maximum, minimum, & extrema
-_fast(::typeof(min),x,y) = min(x,y)
-_fast(::typeof(max),x,y) = max(x,y)
-function _fast(::typeof(max), x::AbstractFloat, y::AbstractFloat)
-    ifelse(isnan(x),
-        x,
-        ifelse(x > y, x, y))
+# The idea behind the following functions is to transform `IEEEFloat` values to native `Signed` values that obey the same `min`/`max` behavior.
+# Namely, the float values must match the semantics of `<` except that `-0.0 < +0.0` and `NaN` is preferred over any other value.
+# In the case that both arguments are `NaN`, either value may be returned.
+# If we reinterpret(Signed,::IEEEFloat), the signed-integer ordering is:
+#   -0.0, ... negative values in ascending magnitude ..., -Inf, -NaN, 0.0, ... positive values in ascending magnitude ..., +Inf, +NaN
+# The desired ordering for min is:
+#   NaN (unspecified sign), -Inf, ... negative values in descending magnitude ..., -0.0, +0.0, ... positive values in ascending magnitude ..., +Inf
+# The desired ordering for max is:
+#   -Inf, ... negative values in descending magnitude ..., -0.0, +0.0, ... positive values in ascending magnitude ..., +Inf, NaN (unspecified sign)
+# Achieving this ordering requires two steps:
+#   1) flip the ordering of all values with a negative sign
+#   2) circularly-shift the values so that NaNs are together at the correct end of the spectrum
+# The following functions perform this transformation and reverse it.
+# Under this scheme, we place a total order on every value (including NaNs), although the ordering of NaNs is an implementation detail.
+function _makefast_preproc(op::typeof(min), ::Type{T}) where T<:IEEEFloat
+    # transform an item of type `T` such that the ordering of `op` is still respected but `op` is faster
+    topval = flipifneg(reinterpret(Signed, typemax(T)))
+    offset = typemax(topval) - topval # adding this to a int-interpreted float will place typemax(T) at the top
+    floatorder_min(x) = flipifneg(reinterpret(Signed, x)) + offset
+    return floatorder_min
 end
-
-function _fast(::typeof(min),x::AbstractFloat, y::AbstractFloat)
-    ifelse(isnan(x),
-        x,
-        ifelse(x < y, x, y))
+function _makefast_postproc(op::typeof(min), ::Type{T}, x::Base.BitSigned) where T<:IEEEFloat
+    # undo the transformation of _makefast_preproc
+    topval = flipifneg(reinterpret(Signed, typemax(T)))
+    offset = typemax(topval) - topval # adding this to a int-interpreted float will place typemax(T) at the top
+    return reinterpret(T, flipifneg(x - offset))
 end
-
-isbadzero(::typeof(max), x::AbstractFloat) = (x == zero(x)) & signbit(x)
-isbadzero(::typeof(min), x::AbstractFloat) = (x == zero(x)) & !signbit(x)
-isbadzero(op, x) = false
-isgoodzero(::typeof(max), x) = isbadzero(min, x)
-isgoodzero(::typeof(min), x) = isbadzero(max, x)
-
-function mapreduce_impl(f, op::Union{typeof(max), typeof(min)},
-                        A::AbstractArrayOrBroadcasted, first::Int, last::Int)
-    a1 = @inbounds A[first]
-    v1 = mapreduce_first(f, op, a1)
-    v2 = v3 = v4 = v1
-    chunk_len = 256
-    start = first + 1
-    simdstop  = start + chunk_len - 4
-    while simdstop <= last - 3
-        @inbounds for i in start:4:simdstop
-            v1 = _fast(op, v1, f(A[i+0]))
-            v2 = _fast(op, v2, f(A[i+1]))
-            v3 = _fast(op, v3, f(A[i+2]))
-            v4 = _fast(op, v4, f(A[i+3]))
-        end
-        checkbounds(A, simdstop+3)
-        start += chunk_len
-        simdstop += chunk_len
-    end
-    v = op(op(v1,v2),op(v3,v4))
-    for i in start:last
-        @inbounds ai = A[i]
-        v = op(v, f(ai))
-    end
-
-    # enforce correct order of 0.0 and -0.0
-    # e.g. maximum([0.0, -0.0]) === 0.0
-    # should hold
-    if isbadzero(op, v)
-        for i in first:last
-            x = @inbounds A[i]
-            isgoodzero(op,x) && return x
-        end
-    end
-    return v
+function _makefast_preproc(op::typeof(max), ::Type{T}) where T<:IEEEFloat
+    # transform an item of type `T` such that the ordering of `op` is still respected but `op` is faster
+    botval = flipifneg(reinterpret(Signed, typemin(T)))
+    offset = typemin(botval) - botval # adding this to a int-interpreted float will place typemin(T) at the bottom
+    floatorder_max(x) = flipifneg(reinterpret(Signed, x)) + offset
+    return floatorder_max
+end
+function _makefast_postproc(op::typeof(max), ::Type{T}, x::Base.BitSigned) where T<:IEEEFloat
+    # undo the transformation of _makefast_preproc
+    botval = flipifneg(reinterpret(Signed, typemin(T)))
+    offset = typemin(botval) - botval # adding this to a int-interpreted float will place typemin(T) at the bottom
+    return reinterpret(T, flipifneg(x - offset))
 end
 
 """
diff --git a/base/reducedim.jl b/base/reducedim.jl
index a2dfc28ca99c3..ca0e9984dcb62 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -363,7 +363,7 @@ _mapreduce_dim(f, op, nt, A::AbstractArrayOrBroadcasted, ::Colon) =
     mapfoldl_impl(f, op, nt, A)
 
 _mapreduce_dim(f, op, ::_InitialValue, A::AbstractArrayOrBroadcasted, ::Colon) =
-    _mapreduce(f, op, IndexStyle(A), A)
+    _mapreduce(f, op, A)
 
 _mapreduce_dim(f, op, nt, A::AbstractArrayOrBroadcasted, dims) =
     mapreducedim!(f, op, reducedim_initarray(A, dims, nt), A)

From aea0ec612a09f673818f78b553b403fe30807be8 Mon Sep 17 00:00:00 2001
From: mikmoore <mikmoore@users.noreply.github.com>
Date: Sat, 4 Jun 2022 10:28:45 -0600
Subject: [PATCH 02/11] faster isless

---
 base/float.jl | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/base/float.jl b/base/float.jl
index edb17e9899700..ac895b3570b0c 100644
--- a/base/float.jl
+++ b/base/float.jl
@@ -414,13 +414,13 @@ end
 
 isequal(x::T, y::T) where {T<:IEEEFloat} = fpiseq(x, y)
 
-# interpret as sign-magnitude integer
-@inline _fpint(x) = flipifneg(reinterpret(Signed, x))
-
 @inline function isless(a::T, b::T) where T<:IEEEFloat
-    (isnan(a) || isnan(b)) && return !isnan(a)
-
-    return _fpint(a) < _fpint(b)
+    botval = flipifneg(reinterpret(Signed, typemin(T)))
+    topval = flipifneg(reinterpret(Signed, typemax(T)))
+    offset = typemin(botval) - botval # adding offset will place typemin(T) at the bottom, wrapping NaN to the top
+    u = flipifneg(reinterpret(Signed, a)) + offset
+    v = flipifneg(reinterpret(Signed, b)) + offset
+    return (u < v) & (u <= topval+offset) # (a < b) & (a <= Inf), with the proper signed zero handling
 end
 
 # if negative, flip all bits except the top bit

From a0b5853049f64daec28668baf50bc8ad78ae295b Mon Sep 17 00:00:00 2001
From: mikmoore <mikmoore@users.noreply.github.com>
Date: Sat, 4 Jun 2022 11:12:00 -0600
Subject: [PATCH 03/11] only apply _makefast if type is concrete

---
 base/reduce.jl | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/base/reduce.jl b/base/reduce.jl
index 201dd6bc2e48c..00ff833c19336 100644
--- a/base/reduce.jl
+++ b/base/reduce.jl
@@ -417,10 +417,14 @@ mapreduce_first(f, op, x) = reduce_first(op, f(x))
 
 function _mapreduce(f, op, A::AbstractArrayOrBroadcasted)
     fT = _return_type(f, Tuple{eltype(A)})
-    accelerator = _makefast_preproc(op, fT)
-    ffast = accelerator ∘ f
-    rfast = _mapreduce(ffast, op, IndexStyle(A), A)
-    return _makefast_postproc(op, fT, rfast)
+    if isconcretetype(fT)
+        accelerator = _makefast_preproc(op, fT)
+        ffast = accelerator ∘ f
+        rfast = _mapreduce(ffast, op, IndexStyle(A), A)
+        return _makefast_postproc(op, fT, rfast)
+    else
+        return _mapreduce(f, op, IndexStyle(A), A)
+    end
 end
 
 function _mapreduce(f, op, ::IndexLinear, A::AbstractArrayOrBroadcasted)

From 0fc9ce00a8f74edf8a0c2aaa966d77ee78ccdbb4 Mon Sep 17 00:00:00 2001
From: mikmoore <mikmoore@users.noreply.github.com>
Date: Sat, 4 Jun 2022 11:44:40 -0600
Subject: [PATCH 04/11] fast extrema

---
 base/reduce.jl | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/base/reduce.jl b/base/reduce.jl
index 00ff833c19336..ab50b29c4e042 100644
--- a/base/reduce.jl
+++ b/base/reduce.jl
@@ -634,31 +634,43 @@ prod(a; kw...) = mapreduce(identity, mul_prod, a; kw...)
 # The following functions perform this transformation and reverse it.
 # Under this scheme, we place a total order on every value (including NaNs), although the ordering of NaNs is an implementation detail.
 function _makefast_preproc(op::typeof(min), ::Type{T}) where T<:IEEEFloat
-    # transform an item of type `T` such that the ordering of `op` is still respected but `op` is faster
     topval = flipifneg(reinterpret(Signed, typemax(T)))
     offset = typemax(topval) - topval # adding this to a int-interpreted float will place typemax(T) at the top
-    floatorder_min(x) = flipifneg(reinterpret(Signed, x)) + offset
+    floatorder_min(x::T) = flipifneg(reinterpret(Signed, x)) + offset
     return floatorder_min
 end
 function _makefast_postproc(op::typeof(min), ::Type{T}, x::Base.BitSigned) where T<:IEEEFloat
-    # undo the transformation of _makefast_preproc
     topval = flipifneg(reinterpret(Signed, typemax(T)))
     offset = typemax(topval) - topval # adding this to a int-interpreted float will place typemax(T) at the top
     return reinterpret(T, flipifneg(x - offset))
 end
 function _makefast_preproc(op::typeof(max), ::Type{T}) where T<:IEEEFloat
-    # transform an item of type `T` such that the ordering of `op` is still respected but `op` is faster
     botval = flipifneg(reinterpret(Signed, typemin(T)))
     offset = typemin(botval) - botval # adding this to a int-interpreted float will place typemin(T) at the bottom
-    floatorder_max(x) = flipifneg(reinterpret(Signed, x)) + offset
+    floatorder_max(x::T) = flipifneg(reinterpret(Signed, x)) + offset
     return floatorder_max
 end
 function _makefast_postproc(op::typeof(max), ::Type{T}, x::Base.BitSigned) where T<:IEEEFloat
-    # undo the transformation of _makefast_preproc
     botval = flipifneg(reinterpret(Signed, typemin(T)))
     offset = typemin(botval) - botval # adding this to a int-interpreted float will place typemin(T) at the bottom
     return reinterpret(T, flipifneg(x - offset))
 end
+function _makefast_preproc(op::typeof(_extrema_rf), ::Type{NTuple{2,T}}) where T<:IEEEFloat
+    topval = flipifneg(reinterpret(Signed, typemax(T)))
+    offset_min = typemax(topval) - topval # adding this to a int-interpreted float will place typemax(T) at the top
+    botval = flipifneg(reinterpret(Signed, typemin(T)))
+    offset_max = typemin(botval) - botval # adding this to a int-interpreted float will place typemin(T) at the bottom
+    floatorder_extrema((x,y)::NTuple{2,T}) = (flipifneg(reinterpret(Signed, x))+offset_min, flipifneg(reinterpret(Signed, y))+offset_max)
+    return floatorder_extrema
+end
+function _makefast_postproc(op::typeof(_extrema_rf), ::Type{NTuple{2,T}}, (x,y)::NTuple{2,Base.BitSigned}) where T<:IEEEFloat
+    # undo the transformation of _makefast_preproc
+    topval = flipifneg(reinterpret(Signed, typemax(T)))
+    offset_min = typemax(topval) - topval # adding this to a int-interpreted float will place typemax(T) at the top
+    botval = flipifneg(reinterpret(Signed, typemin(T)))
+    offset_max = typemin(botval) - botval # adding this to a int-interpreted float will place typemin(T) at the bottom
+    return (reinterpret(T, flipifneg(x - offset_min)), reinterpret(T, flipifneg(x - offset_max)))
+end
 
 """
     maximum(f, itr; [init])

From 130b0e5828e2232bc9afdeb5cfcfcda263e84d8f Mon Sep 17 00:00:00 2001
From: mikmoore <mikmoore@users.noreply.github.com>
Date: Sun, 5 Jun 2022 09:07:51 -0600
Subject: [PATCH 05/11] less ambiguity in accelerator definitions

---
 base/reduce.jl | 45 ++++++++++-----------------------------------
 1 file changed, 10 insertions(+), 35 deletions(-)

diff --git a/base/reduce.jl b/base/reduce.jl
index ab50b29c4e042..d980141aa8209 100644
--- a/base/reduce.jl
+++ b/base/reduce.jl
@@ -633,43 +633,18 @@ prod(a; kw...) = mapreduce(identity, mul_prod, a; kw...)
 #   2) circularly-shift the values so that NaNs are together at the correct end of the spectrum
 # The following functions perform this transformation and reverse it.
 # Under this scheme, we place a total order on every value (including NaNs), although the ordering of NaNs is an implementation detail.
-function _makefast_preproc(op::typeof(min), ::Type{T}) where T<:IEEEFloat
-    topval = flipifneg(reinterpret(Signed, typemax(T)))
-    offset = typemax(topval) - topval # adding this to a int-interpreted float will place typemax(T) at the top
-    floatorder_min(x::T) = flipifneg(reinterpret(Signed, x)) + offset
-    return floatorder_min
-end
-function _makefast_postproc(op::typeof(min), ::Type{T}, x::Base.BitSigned) where T<:IEEEFloat
-    topval = flipifneg(reinterpret(Signed, typemax(T)))
-    offset = typemax(topval) - topval # adding this to a int-interpreted float will place typemax(T) at the top
-    return reinterpret(T, flipifneg(x - offset))
-end
-function _makefast_preproc(op::typeof(max), ::Type{T}) where T<:IEEEFloat
-    botval = flipifneg(reinterpret(Signed, typemin(T)))
-    offset = typemin(botval) - botval # adding this to a int-interpreted float will place typemin(T) at the bottom
-    floatorder_max(x::T) = flipifneg(reinterpret(Signed, x)) + offset
-    return floatorder_max
-end
-function _makefast_postproc(op::typeof(max), ::Type{T}, x::Base.BitSigned) where T<:IEEEFloat
-    botval = flipifneg(reinterpret(Signed, typemin(T)))
-    offset = typemin(botval) - botval # adding this to a int-interpreted float will place typemin(T) at the bottom
-    return reinterpret(T, flipifneg(x - offset))
-end
-function _makefast_preproc(op::typeof(_extrema_rf), ::Type{NTuple{2,T}}) where T<:IEEEFloat
-    topval = flipifneg(reinterpret(Signed, typemax(T)))
+for (T,S) in ((Float16, Int16), (Float32, Int32), (Float64, Int64))
+    topval = flipifneg(reinterpret(S, typemax(T)))
+    botval = flipifneg(reinterpret(S, typemin(T)))
     offset_min = typemax(topval) - topval # adding this to a int-interpreted float will place typemax(T) at the top
-    botval = flipifneg(reinterpret(Signed, typemin(T)))
     offset_max = typemin(botval) - botval # adding this to a int-interpreted float will place typemin(T) at the bottom
-    floatorder_extrema((x,y)::NTuple{2,T}) = (flipifneg(reinterpret(Signed, x))+offset_min, flipifneg(reinterpret(Signed, y))+offset_max)
-    return floatorder_extrema
-end
-function _makefast_postproc(op::typeof(_extrema_rf), ::Type{NTuple{2,T}}, (x,y)::NTuple{2,Base.BitSigned}) where T<:IEEEFloat
-    # undo the transformation of _makefast_preproc
-    topval = flipifneg(reinterpret(Signed, typemax(T)))
-    offset_min = typemax(topval) - topval # adding this to a int-interpreted float will place typemax(T) at the top
-    botval = flipifneg(reinterpret(Signed, typemin(T)))
-    offset_max = typemin(botval) - botval # adding this to a int-interpreted float will place typemin(T) at the bottom
-    return (reinterpret(T, flipifneg(x - offset_min)), reinterpret(T, flipifneg(x - offset_max)))
+
+    @eval _makefast_preproc(op::typeof(min), ::Type{$T}) = (floatorder_min(x::$T) = flipifneg(reinterpret($S, x)) + $offset_min)
+    @eval _makefast_preproc(op::typeof(max), ::Type{$T}) = (floatorder_max(x::$T) = flipifneg(reinterpret($S, x)) + $offset_max)
+    @eval _makefast_preproc(op::typeof(_extrema_rf), ::Type{NTuple{2,$T}}) = (floatorder_extrema((x,y)::NTuple{2,$T}) = (flipifneg(reinterpret($S, x))+$offset_min, flipifneg(reinterpret($S, y))+$offset_max))
+    @eval _makefast_postproc(op::typeof(min), ::Type{$T}, x::$S) = reinterpret($T, flipifneg(x - $offset_min))
+    @eval _makefast_postproc(op::typeof(max), ::Type{$T}, x::$S) = reinterpret($T, flipifneg(x - $offset_max))
+    @eval _makefast_postproc(op::typeof(_extrema_rf), ::Type{NTuple{2,$T}}, (x,y)::NTuple{2,$S}) = (reinterpret($T, flipifneg(x - $offset_min)), reinterpret($T, flipifneg(x - $offset_max)))
 end
 
 """

From 02efa06ee0a4d719eb61d1326eb18566384646e1 Mon Sep 17 00:00:00 2001
From: mikmoore <mikmoore@users.noreply.github.com>
Date: Sun, 5 Jun 2022 10:53:57 -0600
Subject: [PATCH 06/11] only apply accelerator if it exists

---
 base/reduce.jl | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/base/reduce.jl b/base/reduce.jl
index d980141aa8209..fa195752df9ee 100644
--- a/base/reduce.jl
+++ b/base/reduce.jl
@@ -416,15 +416,11 @@ The default is `reduce_first(op, f(x))`.
 mapreduce_first(f, op, x) = reduce_first(op, f(x))
 
 function _mapreduce(f, op, A::AbstractArrayOrBroadcasted)
-    fT = _return_type(f, Tuple{eltype(A)})
-    if isconcretetype(fT)
-        accelerator = _makefast_preproc(op, fT)
-        ffast = accelerator ∘ f
-        rfast = _mapreduce(ffast, op, IndexStyle(A), A)
-        return _makefast_postproc(op, fT, rfast)
-    else
-        return _mapreduce(f, op, IndexStyle(A), A)
-    end
+    fT = _return_type(f, Tuple{eltype(A)}) # determine type of f(A[i])
+    preproc = _makefast_preproc(op, fT) # fetch an accelerator, if available
+    ffast = preproc == identity ? f : preproc ∘ f # apply accelerator, if available
+    rfast = _mapreduce(ffast, op, IndexStyle(A), A)
+    return _makefast_postproc(op, fT, rfast)
 end
 
 function _mapreduce(f, op, ::IndexLinear, A::AbstractArrayOrBroadcasted)

From f00ac8c7bfa165424f78f355ba879ed3092029fb Mon Sep 17 00:00:00 2001
From: mikmoore <95002244+mikmoore@users.noreply.github.com>
Date: Sun, 5 Jun 2022 18:05:46 -0600
Subject: [PATCH 07/11] Apply suggestions from code review

Co-authored-by: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com>
---
 base/reduce.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/base/reduce.jl b/base/reduce.jl
index fa195752df9ee..7f923c1ec18e4 100644
--- a/base/reduce.jl
+++ b/base/reduce.jl
@@ -418,7 +418,7 @@ mapreduce_first(f, op, x) = reduce_first(op, f(x))
 function _mapreduce(f, op, A::AbstractArrayOrBroadcasted)
     fT = _return_type(f, Tuple{eltype(A)}) # determine type of f(A[i])
     preproc = _makefast_preproc(op, fT) # fetch an accelerator, if available
-    ffast = preproc == identity ? f : preproc ∘ f # apply accelerator, if available
+    ffast = preproc === identity ? f : preproc ∘ f # apply accelerator, if available
     rfast = _mapreduce(ffast, op, IndexStyle(A), A)
     return _makefast_postproc(op, fT, rfast)
 end

From 81a0ff5c4f37c71a813338c17c7f08fb87a404f2 Mon Sep 17 00:00:00 2001
From: mikmoore <mikmoore@users.noreply.github.com>
Date: Wed, 21 Dec 2022 17:10:25 -0800
Subject: [PATCH 08/11] reorganize

---
 base/reduce.jl | 126 ++++++++++++++++++++++++++++++++++---------------
 1 file changed, 88 insertions(+), 38 deletions(-)

diff --git a/base/reduce.jl b/base/reduce.jl
index 7f923c1ec18e4..8f2442441d2b0 100644
--- a/base/reduce.jl
+++ b/base/reduce.jl
@@ -349,11 +349,6 @@ reduce_empty(op::MappingRF, ::Type{T}) where {T} = mapreduce_empty(op.f, op.rf,
 reduce_empty(op::FilteringRF, ::Type{T}) where {T} = reduce_empty(op.rf, T)
 reduce_empty(op::FlipArgs, ::Type{T}) where {T} = reduce_empty(op.f, T)
 
-# return a function to transform a value of type `T` to be fast when the operation `op` is applied
-_makefast_preproc(op, T) = identity
-# reverse the transform used by makefast_preproc, restoring `x` to a value of type `T`
-_makefast_postproc(op, T, x) = x
-
 """
     Base.mapreduce_empty(f, op, T)
 
@@ -416,11 +411,8 @@ The default is `reduce_first(op, f(x))`.
 mapreduce_first(f, op, x) = reduce_first(op, f(x))
 
 function _mapreduce(f, op, A::AbstractArrayOrBroadcasted)
-    fT = _return_type(f, Tuple{eltype(A)}) # determine type of f(A[i])
-    preproc = _makefast_preproc(op, fT) # fetch an accelerator, if available
-    ffast = preproc === identity ? f : preproc ∘ f # apply accelerator, if available
-    rfast = _mapreduce(ffast, op, IndexStyle(A), A)
-    return _makefast_postproc(op, fT, rfast)
+    pre_f, op_fast, post = _makefast_mapreduction(f, op, A) # find an accelerator, if available
+    return post(_mapreduce(pre_f, op_fast, IndexStyle(A), A))
 end
 
 function _mapreduce(f, op, ::IndexLinear, A::AbstractArrayOrBroadcasted)
@@ -615,34 +607,6 @@ julia> prod(1:5; init = 1.0)
 """
 prod(a; kw...) = mapreduce(identity, mul_prod, a; kw...)
 
-# The idea behind the following functions is to transform `IEEEFloat` values to native `Signed` values that obey the same `min`/`max` behavior.
-# Namely, the float values must match the semantics of `<` except that `-0.0 < +0.0` and `NaN` is preferred over any other value.
-# In the case that both arguments are `NaN`, either value may be returned.
-# If we reinterpret(Signed,::IEEEFloat), the signed-integer ordering is:
-#   -0.0, ... negative values in ascending magnitude ..., -Inf, -NaN, 0.0, ... positive values in ascending magnitude ..., +Inf, +NaN
-# The desired ordering for min is:
-#   NaN (unspecified sign), -Inf, ... negative values in descending magnitude ..., -0.0, +0.0, ... positive values in ascending magnitude ..., +Inf
-# The desired ordering for max is:
-#   -Inf, ... negative values in descending magnitude ..., -0.0, +0.0, ... positive values in ascending magnitude ..., +Inf, NaN (unspecified sign)
-# Achieving this ordering requires two steps:
-#   1) flip the ordering of all values with a negative sign
-#   2) circularly-shift the values so that NaNs are together at the correct end of the spectrum
-# The following functions perform this transformation and reverse it.
-# Under this scheme, we place a total order on every value (including NaNs), although the ordering of NaNs is an implementation detail.
-for (T,S) in ((Float16, Int16), (Float32, Int32), (Float64, Int64))
-    topval = flipifneg(reinterpret(S, typemax(T)))
-    botval = flipifneg(reinterpret(S, typemin(T)))
-    offset_min = typemax(topval) - topval # adding this to a int-interpreted float will place typemax(T) at the top
-    offset_max = typemin(botval) - botval # adding this to a int-interpreted float will place typemin(T) at the bottom
-
-    @eval _makefast_preproc(op::typeof(min), ::Type{$T}) = (floatorder_min(x::$T) = flipifneg(reinterpret($S, x)) + $offset_min)
-    @eval _makefast_preproc(op::typeof(max), ::Type{$T}) = (floatorder_max(x::$T) = flipifneg(reinterpret($S, x)) + $offset_max)
-    @eval _makefast_preproc(op::typeof(_extrema_rf), ::Type{NTuple{2,$T}}) = (floatorder_extrema((x,y)::NTuple{2,$T}) = (flipifneg(reinterpret($S, x))+$offset_min, flipifneg(reinterpret($S, y))+$offset_max))
-    @eval _makefast_postproc(op::typeof(min), ::Type{$T}, x::$S) = reinterpret($T, flipifneg(x - $offset_min))
-    @eval _makefast_postproc(op::typeof(max), ::Type{$T}, x::$S) = reinterpret($T, flipifneg(x - $offset_max))
-    @eval _makefast_postproc(op::typeof(_extrema_rf), ::Type{NTuple{2,$T}}, (x,y)::NTuple{2,$S}) = (reinterpret($T, flipifneg(x - $offset_min)), reinterpret($T, flipifneg(x - $offset_max)))
-end
-
 """
     maximum(f, itr; [init])
 
@@ -1337,3 +1301,89 @@ function _simple_count(::typeof(identity), x::Array{Bool}, init::T=0) where {T}
     end
     return n
 end
+
+"""
+    Base._FastMapReduce(transform, mapfun)
+
+Essentially a [`ComposedFunction`](@ref) but with limited functionality and
+a few specialized methods so that it can initialize collections properly.
+
+Used with `mapreduce` in some situations.
+"""
+struct _FastMapReduce{A,B}
+    trans::A
+    f::B
+end
+# (c::_FastMapReduce)(x...; kw...) = c.trans(c.f(x...; kw...)) # behave like trans ∘ f
+(c::_FastMapReduce)(x) = c.trans(c.f(x)) # behave like trans ∘ f
+mapreduce_empty(c::_FastMapReduce, op, T) = c.trans(mapreduce_empty(c.f, op, T))
+mapreduce_first(c::_FastMapReduce, op, x) = c.trans(mapreduce_first(c.f, op, x))
+
+"""
+    Base._makefast_mapreduction(f, op, A) -> (pre_f, op_fast, post)
+
+Wrapper to call `Base._makefast_reduction(op, T)`, where `T` is the return type of
+`f(::eltype(A))`, and compose the resulting `pre`-function with `f` to make `pre_f`.
+"""
+function _makefast_mapreduction(f, op, A)
+    elT = eltype(A) # try this
+    if elT == Any
+        elT = @default_eltype(A) # try again
+    end
+    fT = _return_type(f, Tuple{elT}) # determine type of f(A[i]), if possible
+    pre, op_fast, post = _makefast_reduction(op, fT) # find an accelerator, if available
+    return _FastMapReduce(pre, f), op_fast, post
+end
+
+"""
+    Base._makefast_reduction(op, T) -> (pre, op_fast, post)
+
+Define a series of functions that can be used accelerate the computation of `op(x, y)`.
+Incorrect results may be produced if it does not hold that `op(x::T, y::T)` and
+`post(op_fast(pre(x::T), pre(y::T)))` produce equivalent results (and further, that this
+holds for arbitrary-length reductions). This can be useful if `op_fast` is much faster
+than `op` after the preprocessing provided by `pre`.
+
+The fallback return value for this method is `(identity, op, identity)`, corresponding
+to no change to the processing.
+"""
+_makefast_reduction(op, T) = (identity, op, identity)
+
+# The idea behind the following functions is to transform `IEEEFloat` values to native `Signed` values that obey the same `min`/`max` behavior.
+# Namely, the float values must match the semantics of `<` except that `-0.0 < +0.0` and `NaN` is preferred over any other value.
+# In the case that both arguments are `NaN`, either value may be returned.
+# If we reinterpret(Signed,::IEEEFloat), the signed-integer ordering is:
+#   -0.0, ... negative values in ascending magnitude ..., -Inf, -NaN, 0.0, ... positive values in ascending magnitude ..., +Inf, +NaN
+# The desired ordering for min is:
+#   NaN (unspecified sign), -Inf, ... negative values in descending magnitude ..., -0.0, +0.0, ... positive values in ascending magnitude ..., +Inf
+# The desired ordering for max is:
+#   -Inf, ... negative values in descending magnitude ..., -0.0, +0.0, ... positive values in ascending magnitude ..., +Inf, NaN (unspecified sign)
+# Achieving this ordering requires two steps:
+#   1) flip the ordering of all values with a negative sign
+#   2) circularly shift the values so that NaNs are together at the correct end of the spectrum
+# The following functions perform this transformation and reverse it.
+# Under this scheme, we place a total order on every value (including NaNs), although the ordering of NaNs is an implementation detail.
+for (T, S) in ((Float16, Int16), (Float32, Int32), (Float64, Int64))
+    topval = flipifneg(reinterpret(S, typemax(T)))
+    botval = flipifneg(reinterpret(S, typemin(T)))
+    offset_min = typemax(topval) - topval # adding this to a int-interpreted float will place typemax(T) at the top
+    offset_max = typemin(botval) - botval # adding this to a int-interpreted float will place typemin(T) at the bottom
+
+    for (op, offset) in ((min, offset_min), (max, offset_max))
+        @eval function _makefast_reduction(::typeof($op), ::Type{$T})
+            preprocess(x::$T) = reinterpret($T, flipifneg(reinterpret($S, x)) + $offset)
+            reduction(x::$T, y::$T) = reinterpret($T, $op(reinterpret($S, x), reinterpret($S, y)))
+            postprocess(x::$T) = reinterpret($T, flipifneg(reinterpret($S, x) - $offset))
+            return preprocess, reduction, postprocess
+        end
+    end
+
+    @eval function _makefast_reduction(::typeof(_extrema_rf), ::Type{NTuple{2,$T}}) # used for extrema
+        premin, reducemin, postmin = _makefast_reduction(min, $T)
+        premax, reducemax, postmax = _makefast_reduction(max, $T)
+        preprocess((x, y)::NTuple{2,$T}) = (premin(x), premax(y))
+        reduction((x1, y1)::NTuple{2,$T}, (x2, y2)::NTuple{2,$T}) = (reducemin(x1, x2), reducemax(y1, y2))
+        postprocess((x, y)::NTuple{2,$T}) = (postmin(x), postmax(y))
+        return preprocess, reduction, postprocess
+    end
+end

From 413b7392c62c95e96742f838872b26e4c3cbcc1e Mon Sep 17 00:00:00 2001
From: N5N3 <2642243996@qq.com>
Date: Fri, 6 Jan 2023 14:55:13 +0800
Subject: [PATCH 09/11] Move the optimization deeper and spread it.

It helps to avoid extra allocation (caused by non-specialization)
---
 base/reduce.jl           | 43 +++++++++-------------------------------
 base/reducedim.jl        | 11 ++++++----
 base/reinterpretarray.jl | 10 ++++++----
 3 files changed, 22 insertions(+), 42 deletions(-)

diff --git a/base/reduce.jl b/base/reduce.jl
index a2abb07de3302..b6175128d9a07 100644
--- a/base/reduce.jl
+++ b/base/reduce.jl
@@ -252,6 +252,8 @@ foldr(op, itr; kw...) = mapfoldr(identity, op, itr; kw...)
 # certain `op` (e.g. `min` and `max`) may have their own specialized versions.
 @noinline function mapreduce_impl(f, op, A::AbstractArrayOrBroadcasted,
                                   ifirst::Integer, ilast::Integer, blksize::Int)
+    Eltype = _mapped_eltype(f, A)
+    pre, op_fast, post = _makefast_reduction(op, Eltype)
     if ifirst == ilast
         @inbounds a1 = A[ifirst]
         return mapreduce_first(f, op, a1)
@@ -259,12 +261,12 @@ foldr(op, itr; kw...) = mapfoldr(identity, op, itr; kw...)
         # sequential portion
         @inbounds a1 = A[ifirst]
         @inbounds a2 = A[ifirst+1]
-        v = op(f(a1), f(a2))
+        v = op_fast(pre(f(a1)), pre(f(a2)))
         @simd for i = ifirst + 2 : ilast
             @inbounds ai = A[i]
-            v = op(v, f(ai))
+            v = op_fast(v, pre(f(ai)))
         end
-        return v
+        return post(v)
     else
         # pairwise portion
         imid = ifirst + (ilast - ifirst) >> 1
@@ -423,10 +425,7 @@ The default is `reduce_first(op, f(x))`.
 """
 mapreduce_first(f, op, x) = reduce_first(op, f(x))
 
-function _mapreduce(f, op, A::AbstractArrayOrBroadcasted)
-    pre_f, op_fast, post = _makefast_mapreduction(f, op, A) # find an accelerator, if available
-    return post(_mapreduce(pre_f, op_fast, IndexStyle(A), A))
-end
+_mapreduce(f, op, A::AbstractArrayOrBroadcasted) = _mapreduce(f, op, IndexStyle(A), A)
 
 function _mapreduce(f, op, ::IndexLinear, A::AbstractArrayOrBroadcasted)
     inds = LinearIndices(A)
@@ -1325,37 +1324,13 @@ function _simple_count(::typeof(identity), x::Array{Bool}, init::T=0) where {T}
     return n
 end
 
-"""
-    Base._FastMapReduce(transform, mapfun)
-
-Essentially a [`ComposedFunction`](@ref) but with limited functionality and
-a few specialized methods so that it can initialize collections properly.
-
-Used with `mapreduce` in some situations.
-"""
-struct _FastMapReduce{A,B}
-    trans::A
-    f::B
-end
-# (c::_FastMapReduce)(x...; kw...) = c.trans(c.f(x...; kw...)) # behave like trans ∘ f
-(c::_FastMapReduce)(x) = c.trans(c.f(x)) # behave like trans ∘ f
-mapreduce_empty(c::_FastMapReduce, op, T) = c.trans(mapreduce_empty(c.f, op, T))
-mapreduce_first(c::_FastMapReduce, op, x) = c.trans(mapreduce_first(c.f, op, x))
-
-"""
-    Base._makefast_mapreduction(f, op, A) -> (pre_f, op_fast, post)
-
-Wrapper to call `Base._makefast_reduction(op, T)`, where `T` is the return type of
-`f(::eltype(A))`, and compose the resulting `pre`-function with `f` to make `pre_f`.
-"""
-function _makefast_mapreduction(f, op, A)
+function _mapped_eltype(f, A)
     elT = eltype(A) # try this
-    if elT == Any
+    if elT === Any
         elT = @default_eltype(A) # try again
     end
     fT = _return_type(f, Tuple{elT}) # determine type of f(A[i]), if possible
-    pre, op_fast, post = _makefast_reduction(op, fT) # find an accelerator, if available
-    return _FastMapReduce(pre, f), op_fast, post
+    return isconcretetype(fT) ? fT : Any # always widen this to `Any` if non-concrete
 end
 
 """
diff --git a/base/reducedim.jl b/base/reducedim.jl
index 2479f86fab827..baa7549d05d45 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -301,14 +301,17 @@ function _mapreducedim!(f, op, R::AbstractArray, A::AbstractArrayOrBroadcasted)
     keep, Idefault = Broadcast.shapeindexer(indsRt)
     if reducedim1(R, A)
         # keep the accumulator as a local variable when reducing along the first dimension
+        Eltype = _mapped_eltype(f, A)
+        Eltype = Eltype === eltype(R) ? Eltype : Any
+        pre, op_fast, post = _makefast_reduction(op, Eltype)
         i1 = first(axes1(R))
         @inbounds for IA in CartesianIndices(indsAt)
             IR = Broadcast.newindex(IA, keep, Idefault)
-            r = R[i1,IR]
+            r = pre(R[i1,IR])
             @simd for i in axes(A, 1)
-                r = op(r, f(A[i, IA]))
+                r = op_fast(r, pre(f(A[i, IA])))
             end
-            R[i1,IR] = r
+            R[i1,IR] = post(r)
         end
     else
         @inbounds for IA in CartesianIndices(indsAt)
@@ -363,7 +366,7 @@ _mapreduce_dim(f, op, nt, A::AbstractArrayOrBroadcasted, ::Colon) =
     mapfoldl_impl(f, op, nt, A)
 
 _mapreduce_dim(f, op, ::_InitialValue, A::AbstractArrayOrBroadcasted, ::Colon) =
-    _mapreduce(f, op, A)
+    _mapreduce(f, op, IndexStyle(A), A)
 
 _mapreduce_dim(f, op, nt, A::AbstractArrayOrBroadcasted, dims) =
     mapreducedim!(f, op, reducedim_initarray(A, dims, nt), A)
diff --git a/base/reinterpretarray.jl b/base/reinterpretarray.jl
index 1fe0788a1739a..af8c6eeb4f718 100644
--- a/base/reinterpretarray.jl
+++ b/base/reinterpretarray.jl
@@ -757,23 +757,25 @@ end
 
 @noinline function mapreduce_impl(f::F, op::OP, A::AbstractArrayOrBroadcasted,
                                   ifirst::SCI, ilast::SCI, blksize::Int) where {F,OP,SCI<:SCartesianIndex2{K}} where K
+    Eltype = _mapped_eltype(f, A)
+    pre, op_fast, post = _makefast_reduction(op, Eltype)
     if ilast.j - ifirst.j < blksize
         # sequential portion
         @inbounds a1 = A[ifirst]
         @inbounds a2 = A[SCI(2,ifirst.j)]
-        v = op(f(a1), f(a2))
+        v = op_fast(pre(f(a1)), pre(f(a2)))
         @simd for i = ifirst.i + 2 : K
             @inbounds ai = A[SCI(i,ifirst.j)]
-            v = op(v, f(ai))
+            v = op_fast(v, pre(f(ai)))
         end
         # Remaining columns
         for j = ifirst.j+1 : ilast.j
             @simd for i = 1:K
                 @inbounds ai = A[SCI(i,j)]
-                v = op(v, f(ai))
+                v = op_fast(v, pre(f(ai)))
             end
         end
-        return v
+        return post(v)
     else
         # pairwise portion
         jmid = ifirst.j + (ilast.j - ifirst.j) >> 1

From 068082a2cbb3c4fe8e223a1bdfbdce5eb8c1de88 Mon Sep 17 00:00:00 2001
From: N5N3 <2642243996@qq.com>
Date: Fri, 6 Jan 2023 15:09:29 +0800
Subject: [PATCH 10/11] Turn off pairwise reduction for some op.

The accuracy won't be better.
---
 base/reduce.jl | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/base/reduce.jl b/base/reduce.jl
index b6175128d9a07..12252d2835a9a 100644
--- a/base/reduce.jl
+++ b/base/reduce.jl
@@ -316,6 +316,9 @@ pairwise_blocksize(f, op) = 1024
 # This combination appears to show a benefit from a larger block size
 pairwise_blocksize(::typeof(abs2), ::typeof(+)) = 4096
 
+# The following operations prefer non-pairwise reduction.
+pairwise_blocksize(_, ::Union{typeof(min),typeof(max)}) = typemax(Int)
+pairwise_blocksize(_, ::Union{typeof(|),typeof(&)}) = typemax(Int)
 
 # handling empty arrays
 _empty_reduce_error() = throw(ArgumentError("reducing over an empty collection is not allowed"))
@@ -816,6 +819,8 @@ function _extrema_rf(x::NTuple{2,T}, y::NTuple{2,T}) where {T<:IEEEFloat}
     z1, z2
 end
 
+pairwise_blocksize(::ExtremaMap, ::typeof(_extrema_rf)) = typemax(Int)
+
 ## findmax, findmin, argmax & argmin
 
 """

From a9bd1e879d007c4a34c09a6544294d4f543a7c0d Mon Sep 17 00:00:00 2001
From: N5N3 <2642243996@qq.com>
Date: Sat, 7 Jan 2023 00:26:35 +0800
Subject: [PATCH 11/11] Adopt suggestion.

Co-Authored-By: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com>
---
 base/reduce.jl | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/base/reduce.jl b/base/reduce.jl
index 12252d2835a9a..3b5f6e92dd3f2 100644
--- a/base/reduce.jl
+++ b/base/reduce.jl
@@ -1329,14 +1329,7 @@ function _simple_count(::typeof(identity), x::Array{Bool}, init::T=0) where {T}
     return n
 end
 
-function _mapped_eltype(f, A)
-    elT = eltype(A) # try this
-    if elT === Any
-        elT = @default_eltype(A) # try again
-    end
-    fT = _return_type(f, Tuple{elT}) # determine type of f(A[i]), if possible
-    return isconcretetype(fT) ? fT : Any # always widen this to `Any` if non-concrete
-end
+_mapped_eltype(f, A) = _return_type(f, Tuple{@default_eltype(A)})
 
 """
     Base._makefast_reduction(op, T) -> (pre, op_fast, post)