Base: make constructing a float from a rational exact

Two example bugs that are now fixed: ```julia-repl julia> Float16(9//70000) # incorrect because `Float16(70000)` overflows Float16(0.0) julia> Float16(big(9//70000)) # correct because of promotion to `BigFloat` Float16(0.0001286) julia> Float32(16777216//16777217) < 1 # `16777217` doesn't fit in a `Float32` mantissa false ``` Another way to fix this would have been to convert the numerator and denominator into `BigFloat` exactly and then divide one with the other. However, the requirement for exactness means that the `BigFloat` precision would need to be manipulated, something that seems to be problematic in Julia. So implement the division using integer arithmetic. Updates #45213
JuliaLang · May 12, 2023 · c517246 · c517246
1 parent b21f100
commit c517246
Show file tree

Hide file tree

Showing 3 changed files with 483 additions and 11 deletions.
diff --git a/base/mpfr.jl b/base/mpfr.jl
@@ -17,7 +17,9 @@ import
  cbrt, typemax, typemin, unsafe_trunc, floatmin, floatmax, rounding,
  setrounding, maxintfloat, widen, significand, frexp, tryparse, iszero,
  isone, big, _string_n, decompose, minmax,
- sinpi, cospi, sincospi, tanpi, sind, cosd, tand, asind, acosd, atand
+ sinpi, cospi, sincospi, tanpi, sind, cosd, tand, asind, acosd, atand,
+ to_int8_if_bool, rational_to_float_components, rational_to_float_impl,
+ rounding_mode_translated_for_abs
 
 import ..Rounding: rounding_raw, setrounding_raw
 
@@ -280,14 +282,73 @@ BigFloat(x::Union{UInt8,UInt16,UInt32}, r::MPFRRoundingMode=ROUNDING_MODE[]; pre
 BigFloat(x::Union{Float16,Float32}, r::MPFRRoundingMode=ROUNDING_MODE[]; precision::Integer=DEFAULT_PRECISION[]) =
  BigFloat(Float64(x), r; precision=precision)
 
-function BigFloat(x::Rational, r::MPFRRoundingMode=ROUNDING_MODE[]; precision::Integer=DEFAULT_PRECISION[])
- setprecision(BigFloat, precision) do
- setrounding_raw(BigFloat, r) do
- BigFloat(numerator(x))::BigFloat / BigFloat(denominator(x))::BigFloat
- end
- end
+function set_2exp!(z::BigFloat, n::BigInt, exp::Int, rm::MPFRRoundingMode)
+ ccall(
+ (:mpfr_set_z_2exp, libmpfr),
+ Int32,
+ (Ref{BigFloat}, Ref{BigInt}, Int, MPFRRoundingMode),
+ z, n, exp, rm,
+ )
+ nothing
 end
 
+rational_to_bigfloat(
+ ::Type{<:Integer},
+ x::Rational{Bool},
+ requested_precision::Integer,
+ ::RoundingMode,
+) = rational_to_float_impl(
+ x, Bool, -1,
+ let prec = requested_precision
+ y -> BigFloat(y, precision = prec)
+ end,
+)::BigFloat
+
+function rational_to_bigfloat(
+ ::Type{T},
+ x::Rational,
+ requested_precision::Integer,
+ romo::MPFRRoundingMode,
+) where {T<:Integer}
+ s = Int8(sign(numerator(x)))
+ a = abs(x)
+
+ num = to_int8_if_bool(numerator(a))
+ den = to_int8_if_bool(denominator(a))
+
+ # Handle special cases
+ num_is_zero = iszero(num)
+ den_is_zero = iszero(den)
+ if den_is_zero
+ num_is_zero && return BigFloat(precision = requested_precision)
+ return BigFloat(s * Inf, precision = requested_precision)
+ end
+ num_is_zero && return BigFloat(false, precision = requested_precision)
+
+ components = rational_to_float_components(
+ num,
+ den,
+ requested_precision,
+ T,
+ rounding_mode_translated_for_abs(convert(RoundingMode, romo), s),
+ )
+ ret = BigFloat(precision = requested_precision)
+
+ # The rounding mode doesn't matter because there shouldn't be any
+ # rounding (MPFR doesn't have subnormals).
+ set_2exp!(ret, s * components.mantissa, Int(components.exponent), MPFRRoundToZero)
+
+ ret
+end
+
+BigFloat(x::Rational, r::MPFRRoundingMode=ROUNDING_MODE[]; precision::Integer=DEFAULT_PRECISION[]) =
+ rational_to_bigfloat(
+ BigInt,
+ x,
+ precision,
+ r,
+ )::BigFloat
+
 function tryparse(::Type{BigFloat}, s::AbstractString; base::Integer=0, precision::Integer=DEFAULT_PRECISION[], rounding::MPFRRoundingMode=ROUNDING_MODE[])
  !isempty(s) && isspace(s[end]) && return tryparse(BigFloat, rstrip(s), base = base)
  z = BigFloat(precision=precision)

diff --git a/base/rational.jl b/base/rational.jl
@@ -129,12 +129,301 @@ Bool(x::Rational) = x==0 ? false : x==1 ? true :
 (::Type{T})(x::Rational) where {T<:Integer} = (isinteger(x) ? convert(T, x.num)::T :
  throw(InexactError(nameof(T), T, x)))
 
-AbstractFloat(x::Rational) = (float(x.num)/float(x.den))::AbstractFloat
-function (::Type{T})(x::Rational{S}) where T<:AbstractFloat where S
- P = promote_type(T,S)
- convert(T, convert(P,x.num)/convert(P,x.den))::T
+bit_width(n) = ndigits(n, base = UInt8(2), pad = false)
+
+function divrem_pow2(num::Integer, n::Integer)
+ quot = num >> n
+ rema = num - (quot << n)
+ (quot, rema)
+end
+
+rounding_mode_translated_for_abs(
+ rm::Union{
+ RoundingMode{:ToZero},
+ RoundingMode{:FromZero},
+ RoundingMode{:Nearest},
+ RoundingMode{:NearestTiesAway},
+ },
+ ::Real,
+) =
+ rm
+
+rounding_mode_translated_for_abs(::RoundingMode{:Down}, sign::Real) =
+ !signbit(sign) ? RoundToZero : RoundFromZero
+
+rounding_mode_translated_for_abs(::RoundingMode{:Up}, sign::Real) =
+ !signbit(sign) ? RoundFromZero : RoundToZero
+
+# `num`, `den` are positive integers. `requested_bit_width` is the
+# requested floating-point precision. `T` is the integer type that
+# we'll be working with mostly, it needs to be wide enough.
+function rational_to_float_components_impl(
+ num::Integer,
+ den::Integer,
+ requested_bit_width::Integer,
+ ::Type{T},
+ romo::RoundingMode,
+) where {T<:Integer}
+ num_bit_width = bit_width(num)
+ den_bit_width = bit_width(den)
+
+ numT = T(num)
+ denT = T(den)
+
+ # Must be positive.
+ bit_shift_1 = den_bit_width - num_bit_width + requested_bit_width
+ (false ≤ bit_shift_1) || (bit_shift_1 = zero(bit_shift_1))
+
+ # `T` must be wide enough to make overflow impossible during the
+ # left shift here, we must not lose the high bits.
+ #
+ # Similarly, `scaled_num` must not be negative.
+ scaled_num = numT << bit_shift_1
+
+ # `quo0` must be at least `requested_bit_width` bits wide.
+ (quo0, rem0) = divrem(scaled_num, denT, RoundToZero)
+ quo0_bit_width = bit_width(quo0)
+
+ # Now we have a mantissa in `quo0`, but need to round it to the
+ # required precision.
+
+ # Should often be zero, but never negative.
+ bit_shift_2 = quo0_bit_width - requested_bit_width
+
+ # `quo1` is `div(scaled_num, denT << bit_shift_2, RoundToZero)` and should
+ # be exactly `requested_bit_width` bits wide.
+ (quo1, rem1) = divrem_pow2(quo0, bit_shift_2)
+
+ # `rem(scaled_num, denT << bit_shift_2, RoundToZero)`, but without the extra
+ # computation.
+ rem_total = rem1 * den + rem0
+
+ result_is_exact = iszero(rem_total)
+
+ mantissa = quo1
+
+ mantissa_carry = false
+
+ romo_is_to = romo == RoundToZero
+ romo_is_af = romo == RoundFromZero
+ romo_is_ntte = romo == RoundNearest
+ romo_is_ntaf = romo == RoundNearestTiesAway
+
+ romo_is_to_nearest = romo_is_ntte | romo_is_ntaf
+
+ if !result_is_exact & !romo_is_to
+ # Finish the rounding
+
+ (rem_quo, rem_rem) = divrem_pow2(rem_total, bit_shift_2)
+ to_nearest_compar_hi = rem_quo - (den >> true)
+ to_nearest_compar_lo = (rem_rem << true) - ((den & true) << bit_shift_2)
+ to_nearest_compar_hi_iszero = iszero(to_nearest_compar_hi)
+ to_nearest_compar_lo_iszero = iszero(to_nearest_compar_lo)
+ to_nearest_is_greater_than_half =
+ (false < to_nearest_compar_hi) |
+ (
+ to_nearest_compar_hi_iszero &
+ (false < to_nearest_compar_lo)
+ )
+ to_nearest_is_tied =
+ to_nearest_compar_hi_iszero &
+ to_nearest_compar_lo_iszero
+
+ mantissa_is_even = iszero(mantissa & true)
+
+ # True iff precision is one.
+ mantissa_is_one = isone(mantissa)
+
+ if (
+ romo_is_af |
+ (
+ romo_is_to_nearest &
+ (
+ to_nearest_is_greater_than_half |
+ (
+ to_nearest_is_tied &
+ !mantissa_is_one &
+ (romo_is_ntaf | (romo_is_ntte & !mantissa_is_even))
+ )
+ )
+ )
+ )
+ # Round up
+
+ # Assuming `T` is wide enough and there's no overflow.
+ mantissa += true
+
+ # We need to decrease the bit width in case it increased.
+ mantissa_carry = ispow2(mantissa)
+ mantissa >>= mantissa_carry
+ elseif romo_is_to_nearest & to_nearest_is_tied & mantissa_is_one
+ # Mantissa is one, which means the precision is also
+ # one. Be consistent with the `BigFloat` behavior, for
+ # example: `BigFloat(3) == BigFloat(3.0) == 4`.
+ mantissa_carry = true
+ end
+ end
+
+ # `mantissa` should now again be exactly `requested_bit_width` bits
+ # wide.
+
+ exp = bit_shift_2 - bit_shift_1 + mantissa_carry
+
+ (
+ mantissa = mantissa,
+ exponent = exp,
+ is_exact = result_is_exact,
+ )
+end
+
+# `num`, `den` are positive integers. `requested_bit_width` is the
+# requested floating-point precision and must be positive. `T` is the
+# integer type that we'll be working with mostly, it needs to be wide
+# enough.
+function rational_to_float_components(
+ num::Integer,
+ den::Integer,
+ requested_bit_width::Integer,
+ ::Type{T},
+ romo::RoundingMode,
+) where {T<:Integer}
+ (false < requested_bit_width) || error("nonpositive bit width")
+
+ # Factor out powers of two
+ trailing_zeros_num = trailing_zeros(num)
+ trailing_zeros_den = trailing_zeros(den)
+ num >>= trailing_zeros_num
+ den >>= trailing_zeros_den
+
+ c = rational_to_float_components_impl(
+ num, den, requested_bit_width, T, romo,
+ )
+
+ (
+ mantissa = c.mantissa,
+ exponent = c.exponent + trailing_zeros_num - trailing_zeros_den,
+ is_exact = c.is_exact,
+ )
+end
+
+function rational_to_float_impl(
+ to_float::C,
+ ::Type{<:Integer},
+ x::Rational{Bool},
+ ::Integer,
+) where {C<:Union{Type,Function}}
+ n = numerator(x)
+ if iszero(denominator(x))
+ if iszero(n)
+ to_float(NaN) # 0/0
+ else
+ to_float(Inf) # 1/0
+ end
+ else
+ # n/1 = n
+ to_float(n)
+ end
 end
 
+to_int8_if_bool(n::Bool) = Int8(n)
+to_int8_if_bool(n::Integer) = n
+
+# Assuming the wanted rounding mode is round to nearest with ties to
+# even.
+#
+# `requested_precision` must be positive.
+function rational_to_float_impl(
+ to_float::C,
+ ::Type{T},
+ x::Rational,
+ requested_precision::Integer,
+) where {C<:Union{Type,Function},T<:Integer}
+ s = Int8(sign(numerator(x)))
+ a = abs(x)
+
+ num = to_int8_if_bool(numerator(a))
+ den = to_int8_if_bool(denominator(a))
+
+ # Handle special cases
+ num_is_zero = iszero(num)
+ den_is_zero = iszero(den)
+ if den_is_zero
+ num_is_zero && return to_float(NaN)
+ return to_float(s * Inf)
+ end
+ num_is_zero && return to_float(false)
+
+ components = rational_to_float_components(
+ num,
+ den,
+ requested_precision,
+ T,
+ RoundNearest,
+ )
+ mantissa = to_float(components.mantissa)
+
+ # TODO: `ldexp` could be replaced with a mere bit of bit twiddling
+ # in the case of `Float16`, `Float32`, `Float64`
+ ret = ldexp(s * mantissa, components.exponent)
+
+ # TODO: faster?
+ if iszero(ret) | issubnormal(ret)
+ # "Rounding to odd" to prevent double rounding error, see
+ # https://hal-ens-lyon.archives-ouvertes.fr/inria-00080427v2
+ components = rational_to_float_components(
+ num,
+ den,
+ requested_precision,
+ T,
+ RoundToZero,
+ )
+ mantissa = to_float(components.mantissa | !components.is_exact)
+
+ # TODO: `ldexp` could be replaced with a mere bit of bit
+ # twiddling in the case of `Float16`, `Float32`, `Float64`
+ ret = ldexp(s * mantissa, components.exponent)
+ end
+
+ ret
+end
+
+rational_to_float_promote_type(
+ ::Type{F},
+ ::Type{S},
+) where {F<:AbstractFloat,S<:Integer} =
+ BigInt
+
+rational_to_float_promote_type(
+ ::Type{F},
+ ::Type{S},
+) where {F<:AbstractFloat,S<:Unsigned} =
+ rational_to_float_promote_type(F, signed(S))
+
+# As an optimization, use a narrower type when possible.
+rational_to_float_promote_type(::Type{Float16}, ::Type{<:Union{Int8,Int16}}) = Int32
+rational_to_float_promote_type(::Type{Float32}, ::Type{<:Union{Int8,Int16}}) = Int64
+rational_to_float_promote_type(::Type{Float64}, ::Type{<:Union{Int8,Int16}}) = Int128
+rational_to_float_promote_type(::Type{<:Union{Float16,Float32}}, ::Type{Int32}) = Int64
+rational_to_float_promote_type(::Type{Float64}, ::Type{Int32}) = Int128
+rational_to_float_promote_type(::Type{<:Union{Float16,Float32,Float64}}, ::Type{Int64}) = Int128
+
+(::Type{F})(x::Rational) where {F<:AbstractFloat} =
+ rational_to_float_impl(
+ F,
+ rational_to_float_promote_type(
+ F,
+ promote_type(
+ typeof(numerator(x)),
+ typeof(denominator(x)),
+ ),
+ ),
+ x,
+ precision(F),
+ )::F
+
+AbstractFloat(x::Q) where {Q<:Rational} =
+ float(Q)(x)::AbstractFloat
+
 function Rational{T}(x::AbstractFloat) where T<:Integer
  r = rationalize(T, x, tol=0)
  x == convert(typeof(x), r) || throw(InexactError(:Rational, Rational{T}, x))