Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stochastic round to NaN? #31

Merged
merged 4 commits into from
Nov 4, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/StochasticRounding.jl
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ module StochasticRounding

# faster random number generator
using RandomNumbers.Xorshifts
const Xor128 = Ref{Xoroshiro128Plus}()
const Xor128 = Ref{Xoroshiro128Plus}(Xoroshiro128Plus())

"""Reseed the PRNG randomly by recalling."""
function __init__()
Expand Down
66 changes: 9 additions & 57 deletions src/bfloat16sr.jl
Original file line number Diff line number Diff line change
Expand Up @@ -51,15 +51,8 @@ UInt32(x::BFloat16sr) = UInt32(Float32(x))
UInt16(x::BFloat16sr) = UInt16(Float32(x))
UInt8(x::BFloat16sr) = UInt8(Float32(x))

const epsBF16 = 0.0078125f0 # machine epsilon of BFloat16 as Float32
const epsBF16_half = epsBF16/2 # half the machine epsilon
const eps_quarter = 0x0000_4000 # a quarter of eps as Float32 sig bits
const F32_one = reinterpret(UInt32,one(Float32)) # Float32 one as UInt32

# The smallest non-subnormal exponent of BFloat16 as Float32 reinterpreted as UInt32
# floatmin(Float32) = floatmin(BFloat16)
const min_expBF16 = reinterpret(UInt32,floatmin(Float32))

"""Convert to BFloat16sr from Float32 via round-to-nearest
and tie to even. Identical to BFloat16(::Float32)."""
function BFloat16sr(x::Float32)
Expand All @@ -72,64 +65,23 @@ end
"""Convert to BFloat16sr from Float32 with stochastic rounding.
Binary arithmetic version."""
function BFloat16_stochastic_round(x::Float32)
iszero(x) && return zero(BFloat16sr)
ix = reinterpret(Int32,x)
# if deterministically round to 0 return 0
# to avoid a stochastic rounding to NaN
# push to the left to get rid of sign
# push to the right to get rid of the insignificant bits
((ix << 1) >> 16) == 0x0000_0000 && return zero(BFloat16sr)
# r are random bits for the last 15
# >> either introduces 0s for the first 17 bits
# or 1s. Interpreted as Int64 this corresponds to [-ulp/2,ulp/2)
# or 1s. Interpreted as Int32 this corresponds to [-ulp/2,ulp/2)
# which is added with binary arithmetic subsequently
# this is the stochastic perturbation.
# Then deterministic round to nearest to either round up or round down.
r = rand(Xor128[],Int32) >> 16
ui = reinterpret(Int32,x) + r
return BFloat16sr(reinterpret(Float32,ui))
xr = reinterpret(Float32,ix + r)
return BFloat16sr(xr)
end

# """Convert to BFloat16sr from Float32 with stochastic rounding."""
# function BFloat16_stochastic_round(x::Float32)
# isnan(x) && return NaNB16sr
#
# ui = reinterpret(UInt32, x)
#
# # e is the base 2 exponent of x (with signficand is set to zero)
# # e.g. e is 2 for pi, e is -2 for -pi, e is 0.25 for 0.3
# # e is at least min_exp for stochastic rounding for subnormals
# e = (ui & sign_mask(Float32)) | max(min_expBF16,ui & exponent_mask(Float32))
# e = reinterpret(Float32,e)
#
# # sig is the signficand (exponents & sign is masked out)
# sig = ui & significand_mask(Float32)
#
# # STOCHASTIC ROUNDING
# # In most cases, perturb any x between x0 and x1 with a random number
# # that is in (-ulp/2,ulp/2) where ulp is the distance between x0 and x1.
# # ulp = e*eps, with e the next base 2 exponent to zero from x.
#
# # However, there is a special case (aka the "quarter-case") for rounding
# # below ulp/4 when x0 is 2^n for any n (i.e. above an exponent bit flip)
# # due to doubling of ulp towards x1.
# quartercase = sig < eps_quarter # true for special case false otherwise
#
# # frac is in most cases 0.5 to shift the randum number [0,1) to [-0.5,0.5)
#
# # However, in the special case frac is (x-x0)/(x1-x0), that means the fraction
# # of the distance where x is in between x0 and x1
# # Then shift the random number [0,1) to be [-frac/2,-frac/2+ulp/2)
# # such that e.g. x = x0 + ulp/8 gets perturbed to be in [x0+ulp/16,x0+ulp/16+ulp/2)
# # and so the chance of a round-up is indeed 1/8
# # Illustration, let x be at 1/8, then perturb such that x can be in (--)
# # 1 -- x --1/4-- --1/2-- -- -- -- 2
# # 1 (-x-----------------) 2
# # i.e. starting from 1/16 up to 1/2+1/16
# frac = quartercase ? reinterpret(Float32,F32_one | (sig << 7)) - 1f0 : 0.5f0
# eps = quartercase ? epsBF16_half : epsBF16 # in this case use eps/2
#
# # stochastically perturb x before rounding (equiv to stochastic rounding)
# x += e*eps*(rand(Xor128[],Float32) - frac)
#
# # Round to nearest after stochastic perturbation
# return BFloat16sr(x)
# end

"""Chance that x::Float32 is round up when converted to BFloat16sr."""
function BFloat16_chance_roundup(x::Float32)
isnan(x) && return NaNB16sr
Expand Down
69 changes: 11 additions & 58 deletions src/float16sr.jl
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ const NaN16sr = reinterpret(Float16sr, NaN16)

# basic operations
abs(x::Float16sr) = reinterpret(Float16sr, reinterpret(UInt16, x) & 0x7fff)
isnan(x::Float16sr) = reinterpret(UInt16,x) & 0x7fff > 0x7c00
isfinite(x::Float16sr) = reinterpret(UInt16,x) & 0x7c00 != 0x7c00
isnan(x::Float16sr) = isnan(Float16(x))
isfinite(x::Float16sr) = isfinite(Float16(x))

nextfloat(x::Float16sr) = Float16sr(nextfloat(Float16(x)))
prevfloat(x::Float16sr) = Float16sr(prevfloat(Float16(x)))
Expand All @@ -49,74 +49,27 @@ Float64(x::Float16sr) = Float64(Float16(x))
Float16sr(x::Integer) = Float16sr(Float32(x))
(::Type{T})(x::Float16sr) where {T<:Integer} = T(Float32(x))

# constants needed for stochastic perturbation
const epsF16 = Float32(eps(Float16)) # machine epsilon of Float16 as Float32
const epsF16_half = epsF16/2 # machine epsilon half

# The smallest non-subnormal exponent of Float16 as Float32 reinterpreted as UInt32
const min_expF16 = reinterpret(UInt32,Float32(floatmin(Float16)))

"""Convert to Float16sr from Float32 with stochastic rounding.
Binary arithmetic version."""
function Float16_stochastic_round(x::Float32)
iszero(x) && return zero(Float16sr)
ix = reinterpret(Int32,x)
# if deterministically round to 0 return 0
# to avoid a stochastic rounding to NaN
# push to the left to get rid of sign
# push to the right to get rid of the insignificant bits
((ix << 1) >> 13) == 0x0000_0000 && return zero(BFloat16sr)

# r are random bits for the last 15
# >> either introduces 0s for the first 17 bits
# or 1s. Interpreted as Int64 this corresponds to [-ulp/2,ulp/2)
# which is added with binary arithmetic subsequently
# this is the stochastic perturbation.
# Then deterministic round to nearest to either round up or round down.
r = rand(Xor128[],Int32) >> 19 # = 16sbits+3expbits difference between f16,f32
ui = reinterpret(Int32,x) + r
return Float16sr(reinterpret(Float32,ui))
xr = reinterpret(Float32,ix + r)
return Float16sr(xr)
end

# """Convert to BFloat16sr from Float32 with stochastic rounding."""
# function Float16_stochastic_round(x::Float32)
# isnan(x) && return NaN16sr
#
# ui = reinterpret(UInt32, x)
#
# # e is the base 2 exponent of x (with signficand is set to zero)
# # e.g. e is 2 for pi, e is -2 for -pi, e is 0.25 for 0.3
# # e is at least min_exp for stsochastic rounding for subnormals
# e = (ui & sign_mask(Float32)) | max(min_expF16,ui & exponent_mask(Float32))
# e = reinterpret(Float32,e)
#
# # sig is the signficand (exponents & sign is masked out)
# sig = ui & significand_mask(Float32)
#
# # STOCHASTIC ROUNDING
# # In most cases, perturb any x between x0 and x1 with a random number
# # that is in (-ulp/2,ulp/2) where ulp is the distance between x0 and x1.
# # ulp = e*eps, with e the next base 2 exponent to zero from x.
#
# # However, there is a special case (aka the "quarter-case") for rounding
# # below ulp/4 when x0 is 2^n for any n (i.e. above an exponent bit flip)
# # due to doubling of ulp towards x1.
# quartercase = sig < eps_quarter # true for special case false otherwise
#
# # frac is in most cases 0.5 to shift the randum number [0,1) to [-0.5,0.5)
#
# # However, in the special case frac is (x-x0)/(x1-x0), that means the fraction
# # of the distance where x is in between x0 and x1
# # Then shift the random number [0,1) to be [-frac/2,-frac/2+ulp/2)
# # such that e.g. x = x0 + ulp/8 gets perturbed to be in [x0+ulp/16,x0+ulp/16+ulp/2)
# # and so the chance of a round-up is indeed 1/8
# # Illustration, let x be at 1/8, then perturb such that x can be in (--)
# # 1 -- x --1/4-- --1/2-- -- -- -- 2
# # 1 (-x-----------------) 2
# # i.e. starting from 1/16 up to 1/2+1/16
# frac = quartercase ? reinterpret(Float32,F32_one | (sig << 10)) - 1f0 : 0.5f0
# eps = quartercase ? epsF16_half : epsF16
#
# # stochastically perturb x before rounding (equiv to stochastic rounding)
# x += e*eps*(rand(Xor128[],Float32) - frac)
#
# # Round to nearest after stochastic perturbation
# return Float16sr(x)
# end

"""Chance that x::Float32 is round up when converted to Float16sr."""
function Float16_chance_roundup(x::Float32)
isnan(x) && return NaN16sr
Expand Down
67 changes: 10 additions & 57 deletions src/float32sr.jl
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,9 @@ const Inf32sr = reinterpret(Float32sr, Inf32)
const NaN32sr = reinterpret(Float32sr, NaN32)

# basic operations
abs(x::Float32sr) = reinterpret(Float32sr, reinterpret(UInt32, x) & 0x7fff_ffff)
isnan(x::Float32sr) = reinterpret(UInt32,x) & 0x7fff_ffff > 0x7f80_0000
isfinite(x::Float32sr) = reinterpret(UInt32,x) & 0x7fff_ffff != 0x7f80_0000
abs(x::Float32sr) = reinterpret(Float32sr, abs(reinterpret(Float32)))
isnan(x::Float32sr) = isnan(reinterpret(Float32,x))
isfinite(x::Float32sr) = isfinite(reinterpret(Float32,x))

nextfloat(x::Float32sr) = Float32sr(nextfloat(Float32(x)))
prevfloat(x::Float32sr) = Float32sr(prevfloat(Float32(x)))
Expand All @@ -52,72 +52,25 @@ Float64(x::Float32sr) = Float64(Float32(x))
Float32sr(x::Integer) = Float32sr(Float32(x))
(::Type{T})(x::Float32sr) where {T<:Integer} = T(Float32(x))

const epsF32 = Float64(eps(Float32))
const epsF32_half = epsF32/2
const eps64_quarter = 0x0000_0000_4000_0000 # a quarter of eps as Float64 sig bits
const F64_one = reinterpret(UInt64,one(Float64))

# The smallest non-subnormal exponent of Float32 as Float64 reinterpreted as UInt64
const min_expF32 = reinterpret(UInt64,Float64(floatmin(Float32)))

"""Convert to Float32sr from Float64 with stochastic rounding.
Binary arithmetic version."""
function Float32_stochastic_round(x::Float64)
iszero(x) && return zero(Float32sr)
# check whether round to nearest maps to zero(Float32)
# to avoid stochastic rounding to NaN
iszero(Float32(x)) && return zero(Float32sr)

# r are random bits for the last 31
# >> either introduces 0s for the first 33 bits
# or 1s. Interpreted as Int64 this corresponds to [-ulp/2,ulp/2)
# which is added with binary arithmetic subsequently
# this is the stochastic perturbation.
# Then deterministic round to nearest to either round up or round down.
r = rand(Xor128[],Int64) >> 35 # = 32sbits+3expbits difference between f32,f64
ui = reinterpret(Int64,x) + r
return Float32sr(reinterpret(Float64,ui))
xr = reinterpret(Float64,reinterpret(Int64,x) + r)
return Float32sr(xr) # round to nearest
end

# """Convert to Float32sr from Float64 with stochastic rounding."""
# function Float32_stochastic_round(x::Float64)
#
# ui = reinterpret(UInt64, x)
#
# # stochastic rounding
# # e is the base 2 exponent of x (signficand set to zero)
# # e is at least min_exp for stochastic rounding for subnormals
# e = (ui & sign_mask(Float64)) | max(min_expF32,ui & exponent_mask(Float64))
# e = reinterpret(Float64,e)
#
# # sig is the signficand (exponents & sign is masked out)
# sig = ui & significand_mask(Float64)
#
# # STOCHASTIC ROUNDING
# # In most cases, perturb any x between x0 and x1 with a random number
# # that is in (-ulp/2,ulp/2) where ulp is the distance between x0 and x1.
# # ulp = e*eps, with e the next base 2 exponent to zero from x.
#
# # However, there is a special case (aka the "quarter-case") for rounding
# # below ulp/4 when x0 is 2^n for any n (i.e. above an exponent bit flip)
# # due to doubling of ulp towards x1.
# quartercase = sig < eps64_quarter # true for special case false otherwise
#
# # frac is in most cases 0.5 to shift the randum number [0,1) to [-0.5,0.5)
# # However, in the special case frac is (x-x0)/(x1-x0), that means the fraction
# # of the distance where x is in between x0 and x1
# # Then shift the random number [0,1) to be [-frac/2,-frac/2+ulp/2)
# # such that e.g. x = x0 + ulp/8 gets perturbed to be in [x0+ulp/16,x0+ulp/16+ulp/2)
# # and so the chance of a round-up is indeed 1/8
# # Illustration, let x be at 1/8, then perturb such that x can be in (--)
# # 1 -- x --1/4-- --1/2-- -- -- -- 2
# # 1 (-x-----------------) 2
# # i.e. starting from 1/16 up to 1/2+1/16
# frac = quartercase ? reinterpret(Float64,F64_one | (sig << 23)) - 1.0 : 0.5
# eps = quartercase ? epsF32_half : epsF32
#
# # stochastically perturb x before rounding (equiv to stochastic rounding)
# x += e*eps*(rand(Xor128[],Float64) - frac)
#
# # Round to nearest after stochastic perturbation
# return Float32sr(x)
# end
const F64_one = reinterpret(UInt64,one(Float64))

"""Chance that x::Float64 is round up when converted to Float32sr."""
function Float32_chance_roundup(x::Float64)
Expand Down
15 changes: 15 additions & 0 deletions test/bfloat16sr.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,21 @@ end
@test 0f == BFloat16sr(0)
end

@testset "NaN and Inf" begin
@test isnan(NaNB16sr)
@test ~isfinite(NaNB16sr)
@test ~isfinite(InfB16sr)
end

@testset "No stochastic round to NaN" begin
f1 = nextfloat(0f0)
f2 = prevfloat(0f0)
for i in 1:100
@test isfinite(BFloat16_stochastic_round(f1))
@test isfinite(BFloat16_stochastic_round(f2))
end
end

@testset "Rounding" begin
@test 1 == Int(round(BFloat16sr(1.2)))
@test 1 == Int(floor(BFloat16sr(1.2)))
Expand Down
17 changes: 15 additions & 2 deletions test/float16sr.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,21 @@
@test zero(Float16sr) == -(zero(Float16sr))
end

@testset "NaN and Inf" begin
@test isnan(NaN16sr)
@test ~isfinite(NaN16sr)
@test ~isfinite(Inf16sr)
end

@testset "No stochastic round to NaN" begin
f1 = nextfloat(0f0)
f2 = prevfloat(0f0)
for i in 1:100
@test isfinite(Float16_stochastic_round(f1))
@test isfinite(Float16_stochastic_round(f2))
end
end

@testset "Integer promotion" begin
f = Float16sr(1)
@test 2f == Float16sr(2)
Expand Down Expand Up @@ -271,8 +286,6 @@ end
# for some reason 0x0200 fails...?
for hex in vcat(0x0001:0x01ff,0x0201:0x03ff) # test for all subnormals of Float16

println(hex)

# add ulp/2 to have stochastic rounding that is 50/50 up/down.
x = Float32(reinterpret(Float16,hex)) + ulp_half

Expand Down
24 changes: 24 additions & 0 deletions test/float32sr.jl
Original file line number Diff line number Diff line change
Expand Up @@ -270,3 +270,27 @@ end
@test p1/N < 0.55
end
end

@testset "NaN and Inf" begin
@test isnan(NaN32sr)
@test ~isfinite(NaN32sr)
@test ~isfinite(Inf32sr)
end

@testset "No stochastic round to NaN" begin
f1 = nextfloat(0.0)
f2 = prevfloat(0.0)
for i in 1:1000
@test isfinite(Float32_stochastic_round(f1))
@test isfinite(Float32_stochastic_round(f2))
end
end

@testset "No stochastic round to NaN" begin
f1 = Float64(floatmax(Float32)) # larger can map to Inf though!
f2 = -f1
for i in 1:1000
@test isfinite(Float32_stochastic_round(f1))
@test isfinite(Float32_stochastic_round(f2))
end
end