Optimize pow5_12(::Float32) (#493)

JuliaGraphics · Jun 23, 2021 · bf61512 · bf61512
1 parent 9710272
commit bf61512
Show file tree

Hide file tree

Showing 2 changed files with 18 additions and 3 deletions.
diff --git a/src/conversions.jl b/src/conversions.jl
@@ -81,8 +81,9 @@ correct_gamut(c::CV) where {CV<:TransparentRGB} =
 @inline function srgb_compand(v)
  F = typeof(0.5f0v) === Float32 ? Float32 : promote_type(Float64, typeof(v))
  vf = F(v)
+ vc = @fastmath max(vf, F(0.0031308))
  # `pow5_12` is an optimized function to get `v^(1/2.4)`
- vf > F(0.0031308) ? muladd(F(1.055), F(pow5_12(vf)), F(-0.055)) : F(12.92) * vf
+ vf > F(0.0031308) ? muladd(F(1.055), F(pow5_12(vc)), F(-0.055)) : F(12.92) * vf
 end
 
 function _hsx_to_rgb(im::UInt8, v, n, m)

diff --git a/src/utilities.jl b/src/utilities.jl
@@ -92,8 +92,8 @@ pow3_4(x) = (y = @fastmath(sqrt(x)); y*@fastmath(sqrt(y))) # x^(3/4)
 
 # `pow5_12` is called from `srgb_compand`.
 pow5_12(x) = pow3_4(x) / cbrt(x) # 5/12 == 1/2 + 1/4 - 1/3 == 3/4 - 1/3
-pow5_12(x::Float32) = Float32(pow5_12(Float64(x)))
 @inline function pow5_12(x::Float64)
+ @noinline _cbrt(x) = cbrt01(x)
  p3_4 = pow3_4(x)
  # x^(-1/6)
  if x < 0.02
@@ -106,7 +106,7 @@ pow5_12(x::Float32) = Float32(pow5_12(Float64(x)))
  t0 = @evalpoly(x, 1.7047813285940905, -3.1261253501167308,
  7.498744828350077, -10.100319516746419, 6.820601476522508, -1.7978894213531524)
  else
- return p3_4 / cbrt(x)
+ return p3_4 / _cbrt(x)
  end
  # x^(-1/3)
  t1 = t0 * t0
@@ -117,6 +117,20 @@ pow5_12(x::Float32) = Float32(pow5_12(Float64(x)))
  # x^(3/4) * x^(-1/3)
  muladd(p3_4, t2, p3_4 * t2h)
 end
+@inline function pow5_12(x::Float32)
+ # x^(-1/3)
+ rc = rcbrt(x)
+ rcx = -rc * x
+ rch = muladd(muladd(rc, x, rcx), -rc^2, muladd(rc^2, rcx, 1.0f0)) # 1 - x * rc^3
+ rce = muladd(2/9f0, rch, 1/3f0) * rch * rc
+ # x^(3/4)
+ p3_4_f64 = pow3_4(Float64(x))
+ p3_4r = reinterpret(Float64, reinterpret(UInt64, p3_4_f64) & 0xffffffff_e0000000)
+ p3_4 = Float32(p3_4r)
+ p3_4e = Float32(p3_4_f64 - p3_4r)
+ # x^(3/4) * x^(-1/3)
+ muladd(p3_4, rc, muladd(p3_4, rce, p3_4e * rc))
+end
 
 # `pow12_5` is called from `invert_srgb_compand`.
 pow12_5(x) = pow12_5(Float64(x))