Merge pull request #31 from milankl/mk/intarith

Stochastic round to NaN?
milankl · Nov 4, 2020 · 763d9fe · 763d9fe
2 parents df02585 + d6e0408
commit 763d9fe
Show file tree

Hide file tree

Showing 7 changed files with 85 additions and 175 deletions.
diff --git a/src/StochasticRounding.jl b/src/StochasticRounding.jl
@@ -22,7 +22,7 @@ module StochasticRounding
 
     # faster random number generator
     using RandomNumbers.Xorshifts
-    const Xor128 = Ref{Xoroshiro128Plus}()
+    const Xor128 = Ref{Xoroshiro128Plus}(Xoroshiro128Plus())
 
     """Reseed the PRNG randomly by recalling."""
     function __init__()

diff --git a/src/bfloat16sr.jl b/src/bfloat16sr.jl
@@ -51,15 +51,8 @@ UInt32(x::BFloat16sr) = UInt32(Float32(x))
 UInt16(x::BFloat16sr) = UInt16(Float32(x))
 UInt8(x::BFloat16sr) = UInt8(Float32(x))
 
-const epsBF16 = 0.0078125f0							# machine epsilon of BFloat16 as Float32
-const epsBF16_half = epsBF16/2						# half the machine epsilon
-const eps_quarter = 0x0000_4000						# a quarter of eps as Float32 sig bits
 const F32_one = reinterpret(UInt32,one(Float32))	# Float32 one as UInt32
 
-# The smallest non-subnormal exponent of BFloat16 as Float32 reinterpreted as UInt32
-# floatmin(Float32) = floatmin(BFloat16)
-const min_expBF16 = reinterpret(UInt32,floatmin(Float32))
-
 """Convert to BFloat16sr from Float32 via round-to-nearest
 and tie to even. Identical to BFloat16(::Float32)."""
 function BFloat16sr(x::Float32)
@@ -72,64 +65,23 @@ end
 """Convert to BFloat16sr from Float32 with stochastic rounding.
 Binary arithmetic version."""
 function BFloat16_stochastic_round(x::Float32)
-	iszero(x) && return zero(BFloat16sr)
+	ix = reinterpret(Int32,x)
+	# if deterministically round to 0 return 0
+	# to avoid a stochastic rounding to NaN
+	# push to the left to get rid of sign
+	# push to the right to get rid of the insignificant bits
+	((ix << 1) >> 16) == 0x0000_0000 && return zero(BFloat16sr)
 	# r are random bits for the last 15
 	# >> either introduces 0s for the first 17 bits
-	# or 1s. Interpreted as Int64 this corresponds to [-ulp/2,ulp/2)
+	# or 1s. Interpreted as Int32 this corresponds to [-ulp/2,ulp/2)
 	# which is added with binary arithmetic subsequently
 	# this is the stochastic perturbation.
 	# Then deterministic round to nearest to either round up or round down.
 	r = rand(Xor128[],Int32) >> 16
-	ui = reinterpret(Int32,x) + r
-	return BFloat16sr(reinterpret(Float32,ui))
+	xr = reinterpret(Float32,ix + r)
+	return BFloat16sr(xr)
 end
 
-# """Convert to BFloat16sr from Float32 with stochastic rounding."""
-# function BFloat16_stochastic_round(x::Float32)
-#     isnan(x) && return NaNB16sr
-#
-# 	ui = reinterpret(UInt32, x)
-#
-# 	# e is the base 2 exponent of x (with signficand is set to zero)
-# 	# e.g. e is 2 for pi, e is -2 for -pi, e is 0.25 for 0.3
-# 	# e is at least min_exp for stochastic rounding for subnormals
-# 	e = (ui & sign_mask(Float32)) | max(min_expBF16,ui & exponent_mask(Float32))
-# 	e = reinterpret(Float32,e)
-#
-# 	# sig is the signficand (exponents & sign is masked out)
-# 	sig = ui & significand_mask(Float32)
-#
-# 	# STOCHASTIC ROUNDING
-# 	# In most cases, perturb any x between x0 and x1 with a random number
-# 	# that is in (-ulp/2,ulp/2) where ulp is the distance between x0 and x1.
-# 	# ulp = e*eps, with e the next base 2 exponent to zero from x.
-#
-# 	# However, there is a special case (aka the "quarter-case") for rounding
-# 	# below ulp/4 when x0 is 2^n for any n (i.e. above an exponent bit flip)
-# 	# due to doubling of ulp towards x1.
-# 	quartercase = sig < eps_quarter		# true for special case false otherwise
-#
-# 	# frac is in most cases 0.5 to shift the randum number [0,1) to [-0.5,0.5)
-#
-# 	# However, in the special case frac is (x-x0)/(x1-x0), that means the fraction
-# 	# of the distance where x is in between x0 and x1
-# 	# Then shift the random number [0,1) to be [-frac/2,-frac/2+ulp/2)
-# 	# such that e.g. x = x0 + ulp/8 gets perturbed to be in [x0+ulp/16,x0+ulp/16+ulp/2)
-# 	# and so the chance of a round-up is indeed 1/8
-# 	# Illustration, let x be at 1/8, then perturb such that x can be in (--)
-# 	# 1 -- x --1/4--   --1/2--   --   --   -- 2
-# 	# 1  (-x-----------------)                2
-# 	# i.e. starting from 1/16 up to 1/2+1/16
-# 	frac = quartercase ? reinterpret(Float32,F32_one | (sig << 7)) - 1f0 : 0.5f0
-# 	eps = quartercase ? epsBF16_half : epsBF16	# in this case use eps/2
-#
-# 	# stochastically perturb x before rounding (equiv to stochastic rounding)
-# 	x += e*eps*(rand(Xor128[],Float32) - frac)
-#
-#     # Round to nearest after stochastic perturbation
-#     return BFloat16sr(x)
-# end
-
 """Chance that x::Float32 is round up when converted to BFloat16sr."""
 function BFloat16_chance_roundup(x::Float32)
     isnan(x) && return NaNB16sr

diff --git a/src/float16sr.jl b/src/float16sr.jl
@@ -30,8 +30,8 @@ const NaN16sr = reinterpret(Float16sr, NaN16)
 
 # basic operations
 abs(x::Float16sr) = reinterpret(Float16sr, reinterpret(UInt16, x) & 0x7fff)
-isnan(x::Float16sr) = reinterpret(UInt16,x) & 0x7fff > 0x7c00
-isfinite(x::Float16sr) = reinterpret(UInt16,x) & 0x7c00 != 0x7c00
+isnan(x::Float16sr) = isnan(Float16(x))
+isfinite(x::Float16sr) = isfinite(Float16(x))
 
 nextfloat(x::Float16sr) = Float16sr(nextfloat(Float16(x)))
 prevfloat(x::Float16sr) = Float16sr(prevfloat(Float16(x)))
@@ -49,74 +49,27 @@ Float64(x::Float16sr) = Float64(Float16(x))
 Float16sr(x::Integer) = Float16sr(Float32(x))
 (::Type{T})(x::Float16sr) where {T<:Integer} = T(Float32(x))
 
-# constants needed for stochastic perturbation
-const epsF16 = Float32(eps(Float16))		# machine epsilon of Float16 as Float32
-const epsF16_half = epsF16/2				# machine epsilon half
-
-# The smallest non-subnormal exponent of Float16 as Float32 reinterpreted as UInt32
-const min_expF16 = reinterpret(UInt32,Float32(floatmin(Float16)))
-
 """Convert to Float16sr from Float32 with stochastic rounding.
 Binary arithmetic version."""
 function Float16_stochastic_round(x::Float32)
-	iszero(x) && return zero(Float16sr)
+	ix = reinterpret(Int32,x)
+	# if deterministically round to 0 return 0
+	# to avoid a stochastic rounding to NaN
+	# push to the left to get rid of sign
+	# push to the right to get rid of the insignificant bits
+	((ix << 1) >> 13) == 0x0000_0000 && return zero(BFloat16sr)
+
 	# r are random bits for the last 15
 	# >> either introduces 0s for the first 17 bits
 	# or 1s. Interpreted as Int64 this corresponds to [-ulp/2,ulp/2)
 	# which is added with binary arithmetic subsequently
 	# this is the stochastic perturbation.
 	# Then deterministic round to nearest to either round up or round down.
 	r = rand(Xor128[],Int32) >> 19 # = 16sbits+3expbits difference between f16,f32
-	ui = reinterpret(Int32,x) + r
-	return Float16sr(reinterpret(Float32,ui))
+	xr = reinterpret(Float32,ix + r)
+	return Float16sr(xr)
 end
 
-# """Convert to BFloat16sr from Float32 with stochastic rounding."""
-# function Float16_stochastic_round(x::Float32)
-# 	isnan(x) && return NaN16sr
-#
-# 	ui = reinterpret(UInt32, x)
-#
-# 	# e is the base 2 exponent of x (with signficand is set to zero)
-# 	# e.g. e is 2 for pi, e is -2 for -pi, e is 0.25 for 0.3
-#     # e is at least min_exp for stsochastic rounding for subnormals
-#     e = (ui & sign_mask(Float32)) | max(min_expF16,ui & exponent_mask(Float32))
-#     e = reinterpret(Float32,e)
-#
-# 	# sig is the signficand (exponents & sign is masked out)
-# 	sig = ui & significand_mask(Float32)
-#
-# 	# STOCHASTIC ROUNDING
-# 	# In most cases, perturb any x between x0 and x1 with a random number
-# 	# that is in (-ulp/2,ulp/2) where ulp is the distance between x0 and x1.
-# 	# ulp = e*eps, with e the next base 2 exponent to zero from x.
-#
-# 	# However, there is a special case (aka the "quarter-case") for rounding
-# 	# below ulp/4 when x0 is 2^n for any n (i.e. above an exponent bit flip)
-# 	# due to doubling of ulp towards x1.
-# 	quartercase = sig < eps_quarter		# true for special case false otherwise
-#
-# 	# frac is in most cases 0.5 to shift the randum number [0,1) to [-0.5,0.5)
-#
-# 	# However, in the special case frac is (x-x0)/(x1-x0), that means the fraction
-# 	# of the distance where x is in between x0 and x1
-# 	# Then shift the random number [0,1) to be [-frac/2,-frac/2+ulp/2)
-# 	# such that e.g. x = x0 + ulp/8 gets perturbed to be in [x0+ulp/16,x0+ulp/16+ulp/2)
-# 	# and so the chance of a round-up is indeed 1/8
-# 	# Illustration, let x be at 1/8, then perturb such that x can be in (--)
-# 	# 1 -- x --1/4--   --1/2--   --   --   -- 2
-# 	# 1  (-x-----------------)                2
-# 	# i.e. starting from 1/16 up to 1/2+1/16
-# 	frac = quartercase ? reinterpret(Float32,F32_one | (sig << 10)) - 1f0 : 0.5f0
-# 	eps = quartercase ? epsF16_half : epsF16
-#
-# 	# stochastically perturb x before rounding (equiv to stochastic rounding)
-# 	x += e*eps*(rand(Xor128[],Float32) - frac)
-#
-# 	# Round to nearest after stochastic perturbation
-#     return Float16sr(x)
-# end
-
 """Chance that x::Float32 is round up when converted to Float16sr."""
 function Float16_chance_roundup(x::Float32)
 	isnan(x) && return NaN16sr

diff --git a/src/float32sr.jl b/src/float32sr.jl
@@ -32,9 +32,9 @@ const Inf32sr = reinterpret(Float32sr, Inf32)
 const NaN32sr = reinterpret(Float32sr, NaN32)
 
 # basic operations
-abs(x::Float32sr) = reinterpret(Float32sr, reinterpret(UInt32, x) & 0x7fff_ffff)
-isnan(x::Float32sr) = reinterpret(UInt32,x) & 0x7fff_ffff > 0x7f80_0000
-isfinite(x::Float32sr) = reinterpret(UInt32,x) & 0x7fff_ffff != 0x7f80_0000
+abs(x::Float32sr) = reinterpret(Float32sr, abs(reinterpret(Float32)))
+isnan(x::Float32sr) = isnan(reinterpret(Float32,x))
+isfinite(x::Float32sr) = isfinite(reinterpret(Float32,x))
 
 nextfloat(x::Float32sr) = Float32sr(nextfloat(Float32(x)))
 prevfloat(x::Float32sr) = Float32sr(prevfloat(Float32(x)))
@@ -52,72 +52,25 @@ Float64(x::Float32sr) = Float64(Float32(x))
 Float32sr(x::Integer) = Float32sr(Float32(x))
 (::Type{T})(x::Float32sr) where {T<:Integer} = T(Float32(x))
 
-const epsF32 = Float64(eps(Float32))
-const epsF32_half = epsF32/2
-const eps64_quarter = 0x0000_0000_4000_0000		# a quarter of eps as Float64 sig bits
-const F64_one = reinterpret(UInt64,one(Float64))
-
-# The smallest non-subnormal exponent of Float32 as Float64 reinterpreted as UInt64
-const min_expF32 = reinterpret(UInt64,Float64(floatmin(Float32)))
-
 """Convert to Float32sr from Float64 with stochastic rounding.
 Binary arithmetic version."""
 function Float32_stochastic_round(x::Float64)
-	iszero(x) && return zero(Float32sr)
+	# check whether round to nearest maps to zero(Float32)
+	# to avoid stochastic rounding to NaN
+	iszero(Float32(x)) && return zero(Float32sr)
+
 	# r are random bits for the last 31
 	# >> either introduces 0s for the first 33 bits
 	# or 1s. Interpreted as Int64 this corresponds to [-ulp/2,ulp/2)
 	# which is added with binary arithmetic subsequently
 	# this is the stochastic perturbation.
 	# Then deterministic round to nearest to either round up or round down.
 	r = rand(Xor128[],Int64) >> 35   # = 32sbits+3expbits difference between f32,f64
-	ui = reinterpret(Int64,x) + r
-	return Float32sr(reinterpret(Float64,ui))
+	xr = reinterpret(Float64,reinterpret(Int64,x) + r)
+	return Float32sr(xr)			# round to nearest
 end
 
-# """Convert to Float32sr from Float64 with stochastic rounding."""
-# function Float32_stochastic_round(x::Float64)
-#
-# 	ui = reinterpret(UInt64, x)
-#
-# 	# stochastic rounding
-# 	# e is the base 2 exponent of x (signficand set to zero)
-# 	# e is at least min_exp for stochastic rounding for subnormals
-# 	e = (ui & sign_mask(Float64)) | max(min_expF32,ui & exponent_mask(Float64))
-# 	e = reinterpret(Float64,e)
-#
-# 	# sig is the signficand (exponents & sign is masked out)
-# 	sig = ui & significand_mask(Float64)
-#
-# 	# STOCHASTIC ROUNDING
-# 	# In most cases, perturb any x between x0 and x1 with a random number
-# 	# that is in (-ulp/2,ulp/2) where ulp is the distance between x0 and x1.
-# 	# ulp = e*eps, with e the next base 2 exponent to zero from x.
-#
-# 	# However, there is a special case (aka the "quarter-case") for rounding
-# 	# below ulp/4 when x0 is 2^n for any n (i.e. above an exponent bit flip)
-# 	# due to doubling of ulp towards x1.
-# 	quartercase = sig < eps64_quarter	# true for special case false otherwise
-#
-# 	# frac is in most cases 0.5 to shift the randum number [0,1) to [-0.5,0.5)
-# 	# However, in the special case frac is (x-x0)/(x1-x0), that means the fraction
-# 	# of the distance where x is in between x0 and x1
-# 	# Then shift the random number [0,1) to be [-frac/2,-frac/2+ulp/2)
-# 	# such that e.g. x = x0 + ulp/8 gets perturbed to be in [x0+ulp/16,x0+ulp/16+ulp/2)
-# 	# and so the chance of a round-up is indeed 1/8
-# 	# Illustration, let x be at 1/8, then perturb such that x can be in (--)
-# 	# 1 -- x --1/4--   --1/2--   --   --   -- 2
-# 	# 1  (-x-----------------)                2
-# 	# i.e. starting from 1/16 up to 1/2+1/16
-# 	frac = quartercase ? reinterpret(Float64,F64_one | (sig << 23)) - 1.0 : 0.5
-# 	eps = quartercase ? epsF32_half : epsF32
-#
-# 	# stochastically perturb x before rounding (equiv to stochastic rounding)
-# 	x += e*eps*(rand(Xor128[],Float64) - frac)
-#
-#     # Round to nearest after stochastic perturbation
-#     return Float32sr(x)
-# end
+const F64_one = reinterpret(UInt64,one(Float64))
 
 """Chance that x::Float64 is round up when converted to Float32sr."""
 function Float32_chance_roundup(x::Float64)

diff --git a/test/bfloat16sr.jl b/test/bfloat16sr.jl
@@ -9,6 +9,21 @@ end
     @test 0f == BFloat16sr(0)
 end
 
+@testset "NaN and Inf" begin
+    @test isnan(NaNB16sr)
+    @test ~isfinite(NaNB16sr)
+    @test ~isfinite(InfB16sr)
+end
+
+@testset "No stochastic round to NaN" begin
+    f1 = nextfloat(0f0)
+    f2 = prevfloat(0f0)
+    for i in 1:100
+        @test isfinite(BFloat16_stochastic_round(f1))
+        @test isfinite(BFloat16_stochastic_round(f2))
+    end
+end
+
 @testset "Rounding" begin
     @test 1 == Int(round(BFloat16sr(1.2)))
     @test 1 == Int(floor(BFloat16sr(1.2)))

diff --git a/test/float16sr.jl b/test/float16sr.jl
@@ -3,6 +3,21 @@
     @test zero(Float16sr) == -(zero(Float16sr))
 end
 
+@testset "NaN and Inf" begin
+    @test isnan(NaN16sr)
+    @test ~isfinite(NaN16sr)
+    @test ~isfinite(Inf16sr)
+end
+
+@testset "No stochastic round to NaN" begin
+    f1 = nextfloat(0f0)
+    f2 = prevfloat(0f0)
+    for i in 1:100
+        @test isfinite(Float16_stochastic_round(f1))
+        @test isfinite(Float16_stochastic_round(f2))
+    end
+end
+
 @testset "Integer promotion" begin
     f = Float16sr(1)
     @test 2f == Float16sr(2)
@@ -271,8 +286,6 @@ end
     # for some reason 0x0200 fails...?
     for hex in vcat(0x0001:0x01ff,0x0201:0x03ff)    # test for all subnormals of Float16
 
-        println(hex)
-
         # add ulp/2 to have stochastic rounding that is 50/50 up/down.
         x = Float32(reinterpret(Float16,hex)) + ulp_half
 

diff --git a/test/float32sr.jl b/test/float32sr.jl
@@ -270,3 +270,27 @@ end
         @test p1/N < 0.55
     end
 end
+
+@testset "NaN and Inf" begin
+    @test isnan(NaN32sr)
+    @test ~isfinite(NaN32sr)
+    @test ~isfinite(Inf32sr)
+end
+
+@testset "No stochastic round to NaN" begin
+    f1 = nextfloat(0.0)
+    f2 = prevfloat(0.0)
+    for i in 1:1000
+        @test isfinite(Float32_stochastic_round(f1))
+        @test isfinite(Float32_stochastic_round(f2))
+    end
+end
+
+@testset "No stochastic round to NaN" begin
+    f1 = Float64(floatmax(Float32))   # larger can map to Inf though!
+    f2 = -f1
+    for i in 1:1000
+        @test isfinite(Float32_stochastic_round(f1))
+        @test isfinite(Float32_stochastic_round(f2))
+    end
+end