From b6e894d83ca1f0947a34b9215acd99a995242a1a Mon Sep 17 00:00:00 2001 From: Oscar Smith Date: Thu, 20 Oct 2022 11:52:43 -0400 Subject: [PATCH 1/5] faster `inv` for normal sized `ComplexF64` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ``` julia> xc64 = rand(ComplexF64, 1024); yc64=similar(xc64); #Old julia> @benchmark @. $yc64 = inv($xc64) BenchmarkTools.Trial: 10000 samples with 6 evaluations. Range (min … max): 5.467 μs … 10.603 μs ┊ GC (min … max): 0.00% … 0.00% Time (median): 5.983 μs ┊ GC (median): 0.00% Time (mean ± σ): 5.838 μs ± 368.716 ns ┊ GC (mean ± σ): 0.00% ± 0.00% █▂ ██▁ ▂ ██▄▅▄▄▃▁▅▆▇▇▆▇▆▆▆▇▇▇▇███▇▇▇█▇███▇▆▆▆▅▆▅▇▆▆▇▇▅▄▅▃▅▄▅▃▅▄▁▃▃▁▃ █ 5.47 μs Histogram: log(frequency) by time 6.86 μs < Memory estimate: 0 bytes, allocs estimate: 0. #new julia> @benchmark @. $yc64 = inv($xc64) BenchmarkTools.Trial: 10000 samples with 9 evaluations. Range (min … max): 2.008 μs … 6.809 μs ┊ GC (min … max): 0.00% … 0.00% Time (median): 2.118 μs ┊ GC (median): 0.00% Time (mean ± σ): 2.209 μs ± 301.481 ns ┊ GC (mean ± σ): 0.00% ± 0.00% ▆▆▄█▂▄█▅▂▂▃▁ ▁▂ ▁ ▂ ██████████████████▇███▇▆▆▆▆▆▆▅▆▅▆▆▅▅▆▆▆▅▅▆▅▆▆▅▄▅▅▅▄▅▂▄▅▅▃▂▄ █ 2.01 μs Histogram: log(frequency) by time 3.73 μs < Memory estimate: 0 bytes, allocs estimate: 0. julia> xc64 = 1e200.+rand(ComplexF64, 1024); yc64=similar(xc64); #old julia> @benchmark @. $yc64 = inv($xc64) @b^[[ABenchmarkTools.Trial: 10000 samples with 6 evaluations. Range (min … max): 5.465 μs … 10.222 μs ┊ GC (min … max): 0.00% … 0.00% Time (median): 5.482 μs ┊ GC (median): 0.00% Time (mean ± σ): 5.749 μs ± 360.455 ns ┊ GC (mean ± σ): 0.00% ± 0.00% █ ▇▂ ▁ █▆▃▄▄▅▃▄▄▅▅▅▆▇▅▆▆▆▆▇▆▇▄▅▆▆▆██▇▆▇▇▇▇█▆▇▇▇▆▆▇▇▇▆▆▆▅▆▆▆▆▆▄▅▅▄▄ █ 5.46 μs Histogram: log(frequency) by time 6.57 μs < Memory estimate: 0 bytes, allocs estimate: 0. #new julia> @benchmark @. $yc64 = inv($xc64) BenchmarkTools.Trial: 10000 samples with 6 evaluations. Range (min … max): 5.683 μs … 18.619 μs ┊ GC (min … max): 0.00% … 0.00% Time (median): 6.214 μs ┊ GC (median): 0.00% Time (mean ± σ): 6.033 μs ± 479.948 ns ┊ GC (mean ± σ): 0.00% ± 0.00% █ ▅█▁▁ ▁ ▂ █▅▄▃▄▆▇▇▇█▇▇████████▇▇███▇▅▆▅▅▆▅▄▁▄▃▄▅▃▄▄▃▄▅▄▄▁▁▃▁▁▁▄▁▁▁▁▅▆ █ 5.68 μs Histogram: log(frequency) by time 8.1 μs < Memory estimate: 0 bytes, allocs estimate: 0. ``` So for most values that will appear in practice this is 2x faster (but it's 20% slower for values that risk overflow. --- base/complex.jl | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/base/complex.jl b/base/complex.jl index a9590328a8c56..c1c2061f54489 100644 --- a/base/complex.jl +++ b/base/complex.jl @@ -472,10 +472,13 @@ function inv(z::Complex{T}) where T<:Union{Float16,Float32} end function inv(w::ComplexF64) c, d = reim(w) - (isinf(c) | isinf(d)) && return complex(copysign(0.0, c), flipsign(-0.0, d)) absc, absd = abs(c), abs(d) - cd = ifelse(absc>absd, absc, absd) # cheap `max`: don't need sign- and nan-checks here - + cd = ifelse(absc>absd, absc, absd) + if sqrt(floatmin(Float64)/2) <= cd <= sqrt(floatmax(Float64)/2) + return conj(w) / abs2(w) + end + (isinf(c) | isinf(d)) && return complex(copysign(0.0, c), flipsign(-0.0, d)) + ϵ = eps(Float64) bs = 2/(ϵ*ϵ) @@ -493,7 +496,7 @@ function inv(w::ComplexF64) else q, p = robust_cinv(-d, -c) end - return ComplexF64(p*s, q*s) # undo scaling + return ComplexF64(p*s, q*s) end function robust_cinv(c::Float64, d::Float64) r = d/c From 657b1d053d31b851ae745930e9e86c8f37142762 Mon Sep 17 00:00:00 2001 From: Oscar Smith Date: Thu, 20 Oct 2022 12:12:06 -0400 Subject: [PATCH 2/5] slight speedup, improved vectorization. --- base/complex.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/base/complex.jl b/base/complex.jl index c1c2061f54489..2e13ea074b01f 100644 --- a/base/complex.jl +++ b/base/complex.jl @@ -473,9 +473,10 @@ end function inv(w::ComplexF64) c, d = reim(w) absc, absd = abs(c), abs(d) - cd = ifelse(absc>absd, absc, absd) + cd, dc = ifelse(absc>absd, (absc, absd), (absd, absc)) + # no overflow from abs2 if sqrt(floatmin(Float64)/2) <= cd <= sqrt(floatmax(Float64)/2) - return conj(w) / abs2(w) + return conj(w) / muladd(cd, cd, dc*dc) end (isinf(c) | isinf(d)) && return complex(copysign(0.0, c), flipsign(-0.0, d)) From a6393b94bed0b6cb69e63390dd9ac60c2b2dcc6a Mon Sep 17 00:00:00 2001 From: Oscar Smith Date: Thu, 20 Oct 2022 14:41:17 -0400 Subject: [PATCH 3/5] use 2 divisions rather than 1 div and 2 muls. --- base/complex.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/base/complex.jl b/base/complex.jl index 2e13ea074b01f..7cdc477f38325 100644 --- a/base/complex.jl +++ b/base/complex.jl @@ -501,8 +501,9 @@ function inv(w::ComplexF64) end function robust_cinv(c::Float64, d::Float64) r = d/c - p = inv(muladd(d, r, c)) - q = -r*p + z = muladd(d, r, c) + p = 1.0/z + q = -r/z return p, q end From 48e61d11af86e247ab29bb8d02ce1b6db1663032 Mon Sep 17 00:00:00 2001 From: Oscar Smith Date: Fri, 21 Oct 2022 10:19:02 -0400 Subject: [PATCH 4/5] fix whitespace. --- base/complex.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/complex.jl b/base/complex.jl index 7cdc477f38325..edb323bfa1944 100644 --- a/base/complex.jl +++ b/base/complex.jl @@ -479,7 +479,7 @@ function inv(w::ComplexF64) return conj(w) / muladd(cd, cd, dc*dc) end (isinf(c) | isinf(d)) && return complex(copysign(0.0, c), flipsign(-0.0, d)) - + ϵ = eps(Float64) bs = 2/(ϵ*ϵ) From 710b94d45242b4180ccdb1fce76ef4163a2f99f0 Mon Sep 17 00:00:00 2001 From: Oscar Smith Date: Mon, 7 Nov 2022 12:56:46 -0500 Subject: [PATCH 5/5] fix doctest --- doc/src/manual/complex-and-rational-numbers.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/src/manual/complex-and-rational-numbers.md b/doc/src/manual/complex-and-rational-numbers.md index 6fa0e2b71f822..ca6c241ca0583 100644 --- a/doc/src/manual/complex-and-rational-numbers.md +++ b/doc/src/manual/complex-and-rational-numbers.md @@ -48,7 +48,7 @@ julia> 3(2 - 5im)^2 -63 - 60im julia> 3(2 - 5im)^-1.0 -0.20689655172413796 + 0.5172413793103449im +0.20689655172413793 + 0.5172413793103449im ``` The promotion mechanism ensures that combinations of operands of different types just work: