From 61a08fc6ce996ec403601f30c402ecfa3d588bda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Oudompheng?= Date: Wed, 27 Mar 2019 06:55:52 +0100 Subject: [PATCH] =?UTF-8?q?strconv:=20Implement=20Ry=C5=AB=20algorithm=20f?= =?UTF-8?q?or=20ftoa=20shortest=20mode?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch implements the algorithm from Ulf Adams, "Ryū: Fast Float-to-String Conversion" (doi:10.1145/3192366.3192369) for formatting floating-point numbers with a fixed number of decimal digits. It is not a direct translation of the reference C implementation but still follows the original paper. In particular, it uses full 128-bit powers of 10, which allows for more precision in the other modes (fixed ftoa, atof). name old time/op new time/op delta AppendFloat/Decimal-4 49.6ns ± 3% 59.3ns ± 0% +19.59% (p=0.008 n=5+5) AppendFloat/Float-4 122ns ± 1% 91ns ± 1% -25.92% (p=0.008 n=5+5) AppendFloat/Exp-4 89.3ns ± 1% 100.0ns ± 1% +11.98% (p=0.008 n=5+5) AppendFloat/NegExp-4 88.3ns ± 2% 97.1ns ± 1% +9.87% (p=0.008 n=5+5) AppendFloat/LongExp-4 143ns ± 2% 103ns ± 0% -28.17% (p=0.016 n=5+4) AppendFloat/Big-4 144ns ± 1% 110ns ± 1% -23.26% (p=0.008 n=5+5) AppendFloat/BinaryExp-4 46.2ns ± 2% 46.0ns ± 1% ~ (p=0.603 n=5+5) AppendFloat/32Integer-4 49.1ns ± 1% 58.7ns ± 1% +19.57% (p=0.008 n=5+5) AppendFloat/32ExactFraction-4 95.6ns ± 1% 88.6ns ± 1% -7.30% (p=0.008 n=5+5) AppendFloat/32Point-4 122ns ± 1% 87ns ± 1% -28.63% (p=0.008 n=5+5) AppendFloat/32Exp-4 88.6ns ± 2% 95.0ns ± 1% +7.29% (p=0.008 n=5+5) AppendFloat/32NegExp-4 87.2ns ± 1% 91.3ns ± 1% +4.63% (p=0.008 n=5+5) AppendFloat/32Shortest-4 107ns ± 1% 82ns ± 0% -24.08% (p=0.008 n=5+5) AppendFloat/Slowpath64-4 1.00µs ± 1% 0.10µs ± 0% -89.92% (p=0.016 n=5+4) AppendFloat/SlowpathDenormal64-4 34.1µs ± 3% 0.1µs ± 1% -99.72% (p=0.008 n=5+5) Fixes #15672 Change-Id: Ib90dfa245f62490a6666671896013cf3f9a1fb22 Reviewed-on: https://go-review.googlesource.com/c/go/+/170080 Trust: Emmanuel Odeke Trust: Nigel Tao Run-TryBot: Emmanuel Odeke TryBot-Result: Go Bot Reviewed-by: Nigel Tao --- src/strconv/ftoa.go | 10 +- src/strconv/ftoa_test.go | 32 ++++- src/strconv/ftoaryu.go | 255 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 287 insertions(+), 10 deletions(-) diff --git a/src/strconv/ftoa.go b/src/strconv/ftoa.go index a3bff52bc8333b..eca04b851c3ad0 100644 --- a/src/strconv/ftoa.go +++ b/src/strconv/ftoa.go @@ -113,15 +113,11 @@ func genericFtoa(dst []byte, val float64, fmt byte, prec, bitSize int) []byte { // Negative precision means "only as much as needed to be exact." shortest := prec < 0 if shortest { - // Try Grisu3 algorithm. - f := new(extFloat) - lower, upper := f.AssignComputeBounds(mant, exp, neg, flt) + // Use Ryu algorithm. var buf [32]byte digs.d = buf[:] - ok = f.ShortestDecimal(&digs, &lower, &upper) - if !ok { - return bigFtoa(dst, prec, fmt, neg, mant, exp, flt) - } + ryuFtoaShortest(&digs, mant, exp-int(flt.mantbits), flt) + ok = true // Precision for shortest representation mode. switch fmt { case 'e', 'E': diff --git a/src/strconv/ftoa_test.go b/src/strconv/ftoa_test.go index b1685aeb2000b6..73008b1c621913 100644 --- a/src/strconv/ftoa_test.go +++ b/src/strconv/ftoa_test.go @@ -40,6 +40,7 @@ var ftoatests = []ftoaTest{ {200000, 'x', -1, "0x1.86ap+17"}, {200000, 'X', -1, "0X1.86AP+17"}, {2000000, 'g', -1, "2e+06"}, + {1e10, 'g', -1, "1e+10"}, // g conversion and zero suppression {400, 'g', 2, "4e+02"}, @@ -84,6 +85,7 @@ var ftoatests = []ftoaTest{ {1.2355, 'f', 3, "1.236"}, {1234567890123456.5, 'e', 15, "1.234567890123456e+15"}, {1234567890123457.5, 'e', 15, "1.234567890123458e+15"}, + {108678236358137.625, 'g', -1, "1.0867823635813762e+14"}, {1e23, 'e', 17, "9.99999999999999916e+22"}, {1e23, 'f', 17, "99999999999999991611392.00000000000000000"}, @@ -191,6 +193,25 @@ func TestFtoa(t *testing.T) { } } +func TestFtoaPowersOfTwo(t *testing.T) { + for exp := -2048; exp <= 2048; exp++ { + f := math.Ldexp(1, exp) + if !math.IsInf(f, 0) { + s := FormatFloat(f, 'e', -1, 64) + if x, _ := ParseFloat(s, 64); x != f { + t.Errorf("failed roundtrip %v => %s => %v", f, s, x) + } + } + f32 := float32(f) + if !math.IsInf(float64(f32), 0) { + s := FormatFloat(float64(f32), 'e', -1, 32) + if x, _ := ParseFloat(s, 32); float32(x) != f32 { + t.Errorf("failed roundtrip %v => %s => %v", f32, s, float32(x)) + } + } + } +} + func TestFtoaRandom(t *testing.T) { N := int(1e4) if testing.Short() { @@ -240,6 +261,7 @@ var ftoaBenches = []struct { {"Float", 339.7784, 'g', -1, 64}, {"Exp", -5.09e75, 'g', -1, 64}, {"NegExp", -5.11e-95, 'g', -1, 64}, + {"LongExp", 1.234567890123456e-78, 'g', -1, 64}, {"Big", 123456789123456789123456789, 'g', -1, 64}, {"BinaryExp", -1, 'b', -1, 64}, @@ -249,6 +271,7 @@ var ftoaBenches = []struct { {"32Point", 339.7784, 'g', -1, 32}, {"32Exp", -5.09e25, 'g', -1, 32}, {"32NegExp", -5.11e-25, 'g', -1, 32}, + {"32Shortest", 1.234567e-8, 'g', -1, 32}, {"32Fixed8Hard", math.Ldexp(15961084, -125), 'e', 8, 32}, {"32Fixed9Hard", math.Ldexp(14855922, -83), 'e', 9, 32}, @@ -264,7 +287,14 @@ var ftoaBenches = []struct { {"64Fixed18Hard", math.Ldexp(6994187472632449, 690), 'e', 18, 64}, // Trigger slow path (see issue #15672). - {"Slowpath64", 622666234635.3213e-320, 'e', -1, 64}, + // The shortest is: 8.034137530808823e+43 + {"Slowpath64", 8.03413753080882349e+43, 'e', -1, 64}, + // This denormal is pathological because the lower/upper + // halfways to neighboring floats are: + // 622666234635.321003e-320 ~= 622666234635.321e-320 + // 622666234635.321497e-320 ~= 622666234635.3215e-320 + // making it hard to find the 3rd digit + {"SlowpathDenormal64", 622666234635.3213e-320, 'e', -1, 64}, } func BenchmarkFormatFloat(b *testing.B) { diff --git a/src/strconv/ftoaryu.go b/src/strconv/ftoaryu.go index 44a55b1da95776..e53de756677d92 100644 --- a/src/strconv/ftoaryu.go +++ b/src/strconv/ftoaryu.go @@ -218,6 +218,109 @@ func formatDecimal(d *decimalSlice, m uint64, trunc bool, roundUp bool, prec int d.dp = d.nd + trimmed } +// ryuFtoaShortest formats mant*2^exp with prec decimal digits. +func ryuFtoaShortest(d *decimalSlice, mant uint64, exp int, flt *floatInfo) { + if mant == 0 { + d.nd, d.dp = 0, 0 + return + } + // If input is an exact integer with fewer bits than the mantissa, + // the previous and next integer are not admissible representations. + if exp <= 0 && bits.TrailingZeros64(mant) >= -exp { + mant >>= uint(-exp) + ryuDigits(d, mant, mant, mant, true, false) + return + } + ml, mc, mu, e2 := computeBounds(mant, exp, flt) + if e2 == 0 { + ryuDigits(d, ml, mc, mu, true, false) + return + } + // Find 10^q *larger* than 2^-e2 + q := mulByLog2Log10(-e2) + 1 + + // We are going to multiply by 10^q using 128-bit arithmetic. + // The exponent is the same for all 3 numbers. + var dl, dc, du uint64 + var dl0, dc0, du0 bool + if flt == &float32info { + var dl32, dc32, du32 uint32 + dl32, _, dl0 = mult64bitPow10(uint32(ml), e2, q) + dc32, _, dc0 = mult64bitPow10(uint32(mc), e2, q) + du32, e2, du0 = mult64bitPow10(uint32(mu), e2, q) + dl, dc, du = uint64(dl32), uint64(dc32), uint64(du32) + } else { + dl, _, dl0 = mult128bitPow10(ml, e2, q) + dc, _, dc0 = mult128bitPow10(mc, e2, q) + du, e2, du0 = mult128bitPow10(mu, e2, q) + } + if e2 >= 0 { + panic("not enough significant bits after mult128bitPow10") + } + // Is it an exact computation? + if q > 55 { + // Large positive powers of ten are not exact + dl0, dc0, du0 = false, false, false + } + if q < 0 && q >= -24 { + // Division by a power of ten may be exact. + // (note that 5^25 is a 59-bit number so division by 5^25 is never exact). + if divisibleByPower5(ml, -q) { + dl0 = true + } + if divisibleByPower5(mc, -q) { + dc0 = true + } + if divisibleByPower5(mu, -q) { + du0 = true + } + } + // Express the results (dl, dc, du)*2^e2 as integers. + // Extra bits must be removed and rounding hints computed. + extra := uint(-e2) + extraMask := uint64(1<>extra, dl&extraMask + dc, fracc := dc>>extra, dc&extraMask + du, fracu := du>>extra, du&extraMask + // Is it allowed to use 'du' as a result? + // It is always allowed when it is truncated, but also + // if it is exact and the original binary mantissa is even + // When disallowed, we can substract 1. + uok := !du0 || fracu > 0 + if du0 && fracu == 0 { + uok = mant&1 == 0 + } + if !uok { + du-- + } + // Is 'dc' the correctly rounded base 10 mantissa? + // The correct rounding might be dc+1 + cup := false // don't round up. + if dc0 { + // If we computed an exact product, the half integer + // should round to next (even) integer if 'dc' is odd. + cup = fracc > 1<<(extra-1) || + (fracc == 1<<(extra-1) && dc&1 == 1) + } else { + // otherwise, the result is a lower truncation of the ideal + // result. + cup = fracc>>(extra-1) == 1 + } + // Is 'dl' an allowed representation? + // Only if it is an exact value, and if the original binary mantissa + // was even. + lok := dl0 && fracl == 0 && (mant&1 == 0) + if !lok { + dl++ + } + // We need to remember whether the trimmed digits of 'dc' are zero. + c0 := dc0 && fracc == 0 + // render digits + ryuDigits(d, dl, dc, du, c0, cup) + d.dp -= q +} + // mulByLog2Log10 returns math.Floor(x * log(2)/log(10)) for an integer x in // the range -1600 <= x && x <= +1600. // @@ -238,6 +341,140 @@ func mulByLog10Log2(x int) int { return (x * 108853) >> 15 } +// computeBounds returns a floating-point vector (l, c, u)×2^e2 +// where the mantissas are 55-bit (or 26-bit) integers, describing the interval +// represented by the input float64 or float32. +func computeBounds(mant uint64, exp int, flt *floatInfo) (lower, central, upper uint64, e2 int) { + if mant != 1< 5e8) || (clo == 5e8 && cup) + ryuDigits32(d, lhi, chi, uhi, c0, cup, 8) + d.dp += 9 + } else { + d.nd = 0 + // emit high part + n := uint(9) + for v := chi; v > 0; { + v1, v2 := v/10, v%10 + v = v1 + n-- + d.d[n] = byte(v2 + '0') + } + d.d = d.d[n:] + d.nd = int(9 - n) + // emit low part + ryuDigits32(d, llo, clo, ulo, + c0, cup, d.nd+8) + } + // trim trailing zeros + for d.nd > 0 && d.d[d.nd-1] == '0' { + d.nd-- + } + // trim initial zeros + for d.nd > 0 && d.d[0] == '0' { + d.nd-- + d.dp-- + d.d = d.d[1:] + } +} + +// ryuDigits32 emits decimal digits for a number less than 1e9. +func ryuDigits32(d *decimalSlice, lower, central, upper uint32, + c0, cup bool, endindex int) { + if upper == 0 { + d.dp = endindex + 1 + return + } + trimmed := 0 + // Remember last trimmed digit to check for round-up. + // c0 will be used to remember zeroness of following digits. + cNextDigit := 0 + for upper > 0 { + // Repeatedly compute: + // l = Ceil(lower / 10^k) + // c = Round(central / 10^k) + // u = Floor(upper / 10^k) + // and stop when c goes out of the (l, u) interval. + l := (lower + 9) / 10 + c, cdigit := central/10, central%10 + u := upper / 10 + if l > u { + // don't trim the last digit as it is forbidden to go below l + // other, trim and exit now. + break + } + // Check that we didn't cross the lower boundary. + // The case where l < u but c == l-1 is essentially impossible, + // but may happen if: + // lower = ..11 + // central = ..19 + // upper = ..31 + // and means that 'central' is very close but less than + // an integer ending with many zeros, and usually + // the "round-up" logic hides the problem. + if l == c+1 && c < u { + c++ + cdigit = 0 + cup = false + } + trimmed++ + // Remember trimmed digits of c + c0 = c0 && cNextDigit == 0 + cNextDigit = int(cdigit) + lower, central, upper = l, c, u + } + // should we round up? + if trimmed > 0 { + cup = cNextDigit > 5 || + (cNextDigit == 5 && !c0) || + (cNextDigit == 5 && c0 && central&1 == 1) + } + if central < upper && cup { + central++ + } + // We know where the number ends, fill directly + endindex -= trimmed + v := central + n := endindex + for n > d.nd { + v1, v2 := v/100, v%100 + d.d[n] = smallsString[2*v2+1] + d.d[n-1] = smallsString[2*v2+0] + n -= 2 + v = v1 + } + if n == d.nd { + d.d[n] = byte(v + '0') + } + d.nd = endindex + 1 + d.dp = d.nd + trimmed +} + // mult64bitPow10 takes a floating-point input with a 25-bit // mantissa and multiplies it with 10^q. The resulting mantissa // is m*P >> 57 where P is a 64-bit element of the detailedPowersOfTen tables. @@ -249,7 +486,8 @@ func mulByLog10Log2(x int) int { // exact = ε == 0 func mult64bitPow10(m uint32, e2, q int) (resM uint32, resE int, exact bool) { if q == 0 { - return m << 7, e2 - 7, true + // P == 1<<63 + return m << 6, e2 - 6, true } if q < detailedPowersOfTenMinExp10 || detailedPowersOfTenMaxExp10 < q { // This never happens due to the range of float32/float64 exponent @@ -276,7 +514,8 @@ func mult64bitPow10(m uint32, e2, q int) (resM uint32, resE int, exact bool) { // exact = ε == 0 func mult128bitPow10(m uint64, e2, q int) (resM uint64, resE int, exact bool) { if q == 0 { - return m << 9, e2 - 9, true + // P == 1<<127 + return m << 8, e2 - 8, true } if q < detailedPowersOfTenMinExp10 || detailedPowersOfTenMaxExp10 < q { // This never happens due to the range of float32/float64 exponent @@ -309,3 +548,15 @@ func divisibleByPower5(m uint64, k int) bool { } return true } + +// divmod1e9 computes quotient and remainder of division by 1e9, +// avoiding runtime uint64 division on 32-bit platforms. +func divmod1e9(x uint64) (uint32, uint32) { + if !host32bit { + return uint32(x / 1e9), uint32(x % 1e9) + } + // Use the same sequence of operations as the amd64 compiler. + hi, _ := bits.Mul64(x>>1, 0x89705f4136b4a598) // binary digits of 1e-9 + q := hi >> 28 + return uint32(q), uint32(x - q*1e9) +}