Skip to content

Commit

Permalink
Base: make constructing a float from a rational exact
Browse files Browse the repository at this point in the history
Two example bugs that are now fixed:
```julia-repl
julia> Float16(9//70000)       # incorrect because `Float16(70000)` overflows
Float16(0.0)

julia> Float16(big(9//70000))  # correct because of promotion to `BigFloat`
Float16(0.0001286)

julia> Float32(16777216//16777217) < 1  # `16777217` doesn't fit in a `Float32` mantissa
false
```

Another way to fix this would have been to convert the numerator and
denominator into `BigFloat` exactly and then divide one with the other.
However, the requirement for exactness means that the `BigFloat`
precision would need to be manipulated, something that seems to be
problematic in Julia. So implement the division using integer
arithmetic.

Updates #45213
  • Loading branch information
nsajko committed May 12, 2023
1 parent b21f100 commit c517246
Show file tree
Hide file tree
Showing 3 changed files with 483 additions and 11 deletions.
75 changes: 68 additions & 7 deletions base/mpfr.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@ import
cbrt, typemax, typemin, unsafe_trunc, floatmin, floatmax, rounding,
setrounding, maxintfloat, widen, significand, frexp, tryparse, iszero,
isone, big, _string_n, decompose, minmax,
sinpi, cospi, sincospi, tanpi, sind, cosd, tand, asind, acosd, atand
sinpi, cospi, sincospi, tanpi, sind, cosd, tand, asind, acosd, atand,
to_int8_if_bool, rational_to_float_components, rational_to_float_impl,
rounding_mode_translated_for_abs

import ..Rounding: rounding_raw, setrounding_raw

Expand Down Expand Up @@ -280,14 +282,73 @@ BigFloat(x::Union{UInt8,UInt16,UInt32}, r::MPFRRoundingMode=ROUNDING_MODE[]; pre
BigFloat(x::Union{Float16,Float32}, r::MPFRRoundingMode=ROUNDING_MODE[]; precision::Integer=DEFAULT_PRECISION[]) =
BigFloat(Float64(x), r; precision=precision)

function BigFloat(x::Rational, r::MPFRRoundingMode=ROUNDING_MODE[]; precision::Integer=DEFAULT_PRECISION[])
setprecision(BigFloat, precision) do
setrounding_raw(BigFloat, r) do
BigFloat(numerator(x))::BigFloat / BigFloat(denominator(x))::BigFloat
end
end
function set_2exp!(z::BigFloat, n::BigInt, exp::Int, rm::MPFRRoundingMode)
ccall(
(:mpfr_set_z_2exp, libmpfr),
Int32,
(Ref{BigFloat}, Ref{BigInt}, Int, MPFRRoundingMode),
z, n, exp, rm,
)
nothing
end

rational_to_bigfloat(
::Type{<:Integer},
x::Rational{Bool},
requested_precision::Integer,
::RoundingMode,
) = rational_to_float_impl(
x, Bool, -1,
let prec = requested_precision
y -> BigFloat(y, precision = prec)
end,
)::BigFloat

function rational_to_bigfloat(
::Type{T},
x::Rational,
requested_precision::Integer,
romo::MPFRRoundingMode,
) where {T<:Integer}
s = Int8(sign(numerator(x)))
a = abs(x)

num = to_int8_if_bool(numerator(a))
den = to_int8_if_bool(denominator(a))

# Handle special cases
num_is_zero = iszero(num)
den_is_zero = iszero(den)
if den_is_zero
num_is_zero && return BigFloat(precision = requested_precision)
return BigFloat(s * Inf, precision = requested_precision)
end
num_is_zero && return BigFloat(false, precision = requested_precision)

components = rational_to_float_components(
num,
den,
requested_precision,
T,
rounding_mode_translated_for_abs(convert(RoundingMode, romo), s),
)
ret = BigFloat(precision = requested_precision)

# The rounding mode doesn't matter because there shouldn't be any
# rounding (MPFR doesn't have subnormals).
set_2exp!(ret, s * components.mantissa, Int(components.exponent), MPFRRoundToZero)

ret
end

BigFloat(x::Rational, r::MPFRRoundingMode=ROUNDING_MODE[]; precision::Integer=DEFAULT_PRECISION[]) =
rational_to_bigfloat(
BigInt,
x,
precision,
r,
)::BigFloat

function tryparse(::Type{BigFloat}, s::AbstractString; base::Integer=0, precision::Integer=DEFAULT_PRECISION[], rounding::MPFRRoundingMode=ROUNDING_MODE[])
!isempty(s) && isspace(s[end]) && return tryparse(BigFloat, rstrip(s), base = base)
z = BigFloat(precision=precision)
Expand Down
297 changes: 293 additions & 4 deletions base/rational.jl
Original file line number Diff line number Diff line change
Expand Up @@ -129,12 +129,301 @@ Bool(x::Rational) = x==0 ? false : x==1 ? true :
(::Type{T})(x::Rational) where {T<:Integer} = (isinteger(x) ? convert(T, x.num)::T :
throw(InexactError(nameof(T), T, x)))

AbstractFloat(x::Rational) = (float(x.num)/float(x.den))::AbstractFloat
function (::Type{T})(x::Rational{S}) where T<:AbstractFloat where S
P = promote_type(T,S)
convert(T, convert(P,x.num)/convert(P,x.den))::T
bit_width(n) = ndigits(n, base = UInt8(2), pad = false)

function divrem_pow2(num::Integer, n::Integer)
quot = num >> n
rema = num - (quot << n)
(quot, rema)
end

rounding_mode_translated_for_abs(
rm::Union{
RoundingMode{:ToZero},
RoundingMode{:FromZero},
RoundingMode{:Nearest},
RoundingMode{:NearestTiesAway},
},
::Real,
) =
rm

rounding_mode_translated_for_abs(::RoundingMode{:Down}, sign::Real) =
!signbit(sign) ? RoundToZero : RoundFromZero

rounding_mode_translated_for_abs(::RoundingMode{:Up}, sign::Real) =
!signbit(sign) ? RoundFromZero : RoundToZero

# `num`, `den` are positive integers. `requested_bit_width` is the
# requested floating-point precision. `T` is the integer type that
# we'll be working with mostly, it needs to be wide enough.
function rational_to_float_components_impl(
num::Integer,
den::Integer,
requested_bit_width::Integer,
::Type{T},
romo::RoundingMode,
) where {T<:Integer}
num_bit_width = bit_width(num)
den_bit_width = bit_width(den)

numT = T(num)
denT = T(den)

# Must be positive.
bit_shift_1 = den_bit_width - num_bit_width + requested_bit_width
(false bit_shift_1) || (bit_shift_1 = zero(bit_shift_1))

# `T` must be wide enough to make overflow impossible during the
# left shift here, we must not lose the high bits.
#
# Similarly, `scaled_num` must not be negative.
scaled_num = numT << bit_shift_1

# `quo0` must be at least `requested_bit_width` bits wide.
(quo0, rem0) = divrem(scaled_num, denT, RoundToZero)
quo0_bit_width = bit_width(quo0)

# Now we have a mantissa in `quo0`, but need to round it to the
# required precision.

# Should often be zero, but never negative.
bit_shift_2 = quo0_bit_width - requested_bit_width

# `quo1` is `div(scaled_num, denT << bit_shift_2, RoundToZero)` and should
# be exactly `requested_bit_width` bits wide.
(quo1, rem1) = divrem_pow2(quo0, bit_shift_2)

# `rem(scaled_num, denT << bit_shift_2, RoundToZero)`, but without the extra
# computation.
rem_total = rem1 * den + rem0

result_is_exact = iszero(rem_total)

mantissa = quo1

mantissa_carry = false

romo_is_to = romo == RoundToZero
romo_is_af = romo == RoundFromZero
romo_is_ntte = romo == RoundNearest
romo_is_ntaf = romo == RoundNearestTiesAway

romo_is_to_nearest = romo_is_ntte | romo_is_ntaf

if !result_is_exact & !romo_is_to
# Finish the rounding

(rem_quo, rem_rem) = divrem_pow2(rem_total, bit_shift_2)
to_nearest_compar_hi = rem_quo - (den >> true)
to_nearest_compar_lo = (rem_rem << true) - ((den & true) << bit_shift_2)
to_nearest_compar_hi_iszero = iszero(to_nearest_compar_hi)
to_nearest_compar_lo_iszero = iszero(to_nearest_compar_lo)
to_nearest_is_greater_than_half =
(false < to_nearest_compar_hi) |
(
to_nearest_compar_hi_iszero &
(false < to_nearest_compar_lo)
)
to_nearest_is_tied =
to_nearest_compar_hi_iszero &
to_nearest_compar_lo_iszero

mantissa_is_even = iszero(mantissa & true)

# True iff precision is one.
mantissa_is_one = isone(mantissa)

if (
romo_is_af |
(
romo_is_to_nearest &
(
to_nearest_is_greater_than_half |
(
to_nearest_is_tied &
!mantissa_is_one &
(romo_is_ntaf | (romo_is_ntte & !mantissa_is_even))
)
)
)
)
# Round up

# Assuming `T` is wide enough and there's no overflow.
mantissa += true

# We need to decrease the bit width in case it increased.
mantissa_carry = ispow2(mantissa)
mantissa >>= mantissa_carry
elseif romo_is_to_nearest & to_nearest_is_tied & mantissa_is_one
# Mantissa is one, which means the precision is also
# one. Be consistent with the `BigFloat` behavior, for
# example: `BigFloat(3) == BigFloat(3.0) == 4`.
mantissa_carry = true
end
end

# `mantissa` should now again be exactly `requested_bit_width` bits
# wide.

exp = bit_shift_2 - bit_shift_1 + mantissa_carry

(
mantissa = mantissa,
exponent = exp,
is_exact = result_is_exact,
)
end

# `num`, `den` are positive integers. `requested_bit_width` is the
# requested floating-point precision and must be positive. `T` is the
# integer type that we'll be working with mostly, it needs to be wide
# enough.
function rational_to_float_components(
num::Integer,
den::Integer,
requested_bit_width::Integer,
::Type{T},
romo::RoundingMode,
) where {T<:Integer}
(false < requested_bit_width) || error("nonpositive bit width")

# Factor out powers of two
trailing_zeros_num = trailing_zeros(num)
trailing_zeros_den = trailing_zeros(den)
num >>= trailing_zeros_num
den >>= trailing_zeros_den

c = rational_to_float_components_impl(
num, den, requested_bit_width, T, romo,
)

(
mantissa = c.mantissa,
exponent = c.exponent + trailing_zeros_num - trailing_zeros_den,
is_exact = c.is_exact,
)
end

function rational_to_float_impl(
to_float::C,
::Type{<:Integer},
x::Rational{Bool},
::Integer,
) where {C<:Union{Type,Function}}
n = numerator(x)
if iszero(denominator(x))
if iszero(n)
to_float(NaN) # 0/0
else
to_float(Inf) # 1/0
end
else
# n/1 = n
to_float(n)
end
end

to_int8_if_bool(n::Bool) = Int8(n)
to_int8_if_bool(n::Integer) = n

# Assuming the wanted rounding mode is round to nearest with ties to
# even.
#
# `requested_precision` must be positive.
function rational_to_float_impl(
to_float::C,
::Type{T},
x::Rational,
requested_precision::Integer,
) where {C<:Union{Type,Function},T<:Integer}
s = Int8(sign(numerator(x)))
a = abs(x)

num = to_int8_if_bool(numerator(a))
den = to_int8_if_bool(denominator(a))

# Handle special cases
num_is_zero = iszero(num)
den_is_zero = iszero(den)
if den_is_zero
num_is_zero && return to_float(NaN)
return to_float(s * Inf)
end
num_is_zero && return to_float(false)

components = rational_to_float_components(
num,
den,
requested_precision,
T,
RoundNearest,
)
mantissa = to_float(components.mantissa)

# TODO: `ldexp` could be replaced with a mere bit of bit twiddling
# in the case of `Float16`, `Float32`, `Float64`
ret = ldexp(s * mantissa, components.exponent)

# TODO: faster?
if iszero(ret) | issubnormal(ret)
# "Rounding to odd" to prevent double rounding error, see
# https://hal-ens-lyon.archives-ouvertes.fr/inria-00080427v2
components = rational_to_float_components(
num,
den,
requested_precision,
T,
RoundToZero,
)
mantissa = to_float(components.mantissa | !components.is_exact)

# TODO: `ldexp` could be replaced with a mere bit of bit
# twiddling in the case of `Float16`, `Float32`, `Float64`
ret = ldexp(s * mantissa, components.exponent)
end

ret
end

rational_to_float_promote_type(
::Type{F},
::Type{S},
) where {F<:AbstractFloat,S<:Integer} =
BigInt

rational_to_float_promote_type(
::Type{F},
::Type{S},
) where {F<:AbstractFloat,S<:Unsigned} =
rational_to_float_promote_type(F, signed(S))

# As an optimization, use a narrower type when possible.
rational_to_float_promote_type(::Type{Float16}, ::Type{<:Union{Int8,Int16}}) = Int32
rational_to_float_promote_type(::Type{Float32}, ::Type{<:Union{Int8,Int16}}) = Int64
rational_to_float_promote_type(::Type{Float64}, ::Type{<:Union{Int8,Int16}}) = Int128
rational_to_float_promote_type(::Type{<:Union{Float16,Float32}}, ::Type{Int32}) = Int64
rational_to_float_promote_type(::Type{Float64}, ::Type{Int32}) = Int128
rational_to_float_promote_type(::Type{<:Union{Float16,Float32,Float64}}, ::Type{Int64}) = Int128

(::Type{F})(x::Rational) where {F<:AbstractFloat} =
rational_to_float_impl(
F,
rational_to_float_promote_type(
F,
promote_type(
typeof(numerator(x)),
typeof(denominator(x)),
),
),
x,
precision(F),
)::F

AbstractFloat(x::Q) where {Q<:Rational} =
float(Q)(x)::AbstractFloat

function Rational{T}(x::AbstractFloat) where T<:Integer
r = rationalize(T, x, tol=0)
x == convert(typeof(x), r) || throw(InexactError(:Rational, Rational{T}, x))
Expand Down
Loading

0 comments on commit c517246

Please sign in to comment.