Skip to content

Commit 113eb42

Browse files
taichimaedarsc
authored andcommitted
strconv: replace Ryu ftoa with Dragonbox
Dragonbox is a faster ftoa algorithm that provides the same guarantees as Ryu: round-trip conversion, shortest length, and correct rounding. Dragonbox only supports shortest-precision conversion, so we continue to use Ryu-printf for fixed precision. The new implementation has been fuzz-tested against the current Ryu implementation in addition to the existing test suite. Benchmarks show at least ~15-20% performance improvement. The following shows the relevant output from benchstat. Full benchmark results and plots are available at: https://github.com/taichimaeda/dragonbox-bench/ goos: darwin goarch: arm64 pkg: strconv cpu: Apple M1 │ old.txt │ new.txt │ │ sec/op │ sec/op vs base │ FormatFloat/Decimal-8 32.71n ± 14% 31.89n ± 12% ~ (p=0.165 n=10) FormatFloat/Float-8 45.54n ± 1% 42.48n ± 0% -6.70% (p=0.000 n=10) FormatFloat/Exp-8 50.06n ± 0% 32.27n ± 1% -35.54% (p=0.000 n=10) FormatFloat/NegExp-8 47.15n ± 1% 31.33n ± 0% -33.56% (p=0.000 n=10) FormatFloat/LongExp-8 46.15n ± 1% 43.66n ± 0% -5.38% (p=0.000 n=10) FormatFloat/Big-8 50.02n ± 0% 39.36n ± 0% -21.31% (p=0.000 n=10) FormatFloat/BinaryExp-8 27.89n ± 0% 27.88n ± 1% ~ (p=0.798 n=10) FormatFloat/32Integer-8 31.41n ± 0% 23.00n ± 3% -26.79% (p=0.000 n=10) FormatFloat/32ExactFraction-8 44.93n ± 1% 29.91n ± 0% -33.43% (p=0.000 n=10) FormatFloat/32Point-8 43.22n ± 1% 33.82n ± 0% -21.74% (p=0.000 n=10) FormatFloat/32Exp-8 45.91n ± 0% 25.48n ± 0% -44.50% (p=0.000 n=10) FormatFloat/32NegExp-8 44.66n ± 0% 25.12n ± 0% -43.76% (p=0.000 n=10) FormatFloat/32Shortest-8 37.96n ± 0% 27.83n ± 1% -26.68% (p=0.000 n=10) FormatFloat/Slowpath64-8 47.74n ± 2% 45.85n ± 0% -3.96% (p=0.000 n=10) FormatFloat/SlowpathDenormal64-8 42.78n ± 1% 41.46n ± 0% -3.07% (p=0.000 n=10) FormatFloat/ShorterIntervalCase32-8 25.49n ± 2% FormatFloat/ShorterIntervalCase64-8 27.72n ± 1% geomean 41.95n 31.89n -22.11% Fixes #74886 Co-authored-by: Junekey Jeon <j6jeon@ucsd.edu> Change-Id: I923f7259c9cecd0896b2340a43d1041cc2ed7787 GitHub-Last-Rev: fd735db GitHub-Pull-Request: #75195 Reviewed-on: https://go-review.googlesource.com/c/go/+/700075 Reviewed-by: Russ Cox <rsc@golang.org> Reviewed-by: Alan Donovan <adonovan@google.com> TryBot-Bypass: Russ Cox <rsc@golang.org>
1 parent 6e5cfe9 commit 113eb42

File tree

3 files changed

+357
-2
lines changed

3 files changed

+357
-2
lines changed

src/internal/strconv/ftoa.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ func genericFtoa(dst []byte, val float64, fmt byte, prec, bitSize int) []byte {
8686
neg := bits>>(flt.expbits+flt.mantbits) != 0
8787
exp := int(bits>>flt.mantbits) & (1<<flt.expbits - 1)
8888
mant := bits & (uint64(1)<<flt.mantbits - 1)
89+
denorm := false
8990

9091
switch exp {
9192
case 1<<flt.expbits - 1:
@@ -104,6 +105,7 @@ func genericFtoa(dst []byte, val float64, fmt byte, prec, bitSize int) []byte {
104105
case 0:
105106
// denormalized
106107
exp++
108+
denorm = true
107109

108110
default:
109111
// add implicit top bit
@@ -130,10 +132,10 @@ func genericFtoa(dst []byte, val float64, fmt byte, prec, bitSize int) []byte {
130132
return formatDigits(dst, shortest, neg, digs, prec, fmt)
131133
}
132134
if shortest {
133-
// Use Ryu algorithm.
135+
// Use the Dragonbox algorithm.
134136
var buf [32]byte
135137
digs.d = buf[:]
136-
ryuFtoaShortest(&digs, mant, exp-int(flt.mantbits), flt)
138+
dboxFtoa(&digs, mant, exp-int(flt.mantbits), denorm, bitSize)
137139
// Precision for shortest representation mode.
138140
switch fmt {
139141
case 'e', 'E':

src/internal/strconv/ftoa_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -351,6 +351,10 @@ var ftoaBenches = []struct {
351351
// 622666234635.321497e-320 ~= 622666234635.3215e-320
352352
// making it hard to find the 3rd digit
353353
{"SlowpathDenormal64", 622666234635.3213e-320, 'e', -1, 64},
354+
355+
// Trigger the shorter interval case (3.90625e-3 = 1/256).
356+
{"ShorterIntervalCase32", 3.90625e-3, 'e', -1, 32},
357+
{"ShorterIntervalCase64", 3.90625e-3, 'e', -1, 64},
354358
}
355359

356360
func BenchmarkFormatFloat(b *testing.B) {

src/internal/strconv/ftoadbox.go

Lines changed: 349 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,349 @@
1+
// Copyright 2025 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
package strconv
6+
7+
// Binary to decimal conversion using the Dragonbox algorithm by Junekey Jeon.
8+
//
9+
// Fixed precision format is not supported by the Dragonbox algorithm
10+
// so we continue to use Ryū-printf for this purpose.
11+
// See https://github.com/jk-jeon/dragonbox/issues/38 for more details.
12+
//
13+
// For binary to decimal rounding, uses round to nearest, tie to even.
14+
// For decimal to binary rounding, assumes round to nearest, tie to even.
15+
//
16+
// The original paper by Junekey Jeon can be found at:
17+
// https://github.com/jk-jeon/dragonbox/blob/d5dc40ae6a3f1a4559cda816738df2d6255b4e24/other_files/Dragonbox.pdf
18+
//
19+
// The reference implementation in C++ by Junekey Jeon can be found at:
20+
// https://github.com/jk-jeon/dragonbox/blob/6c7c925b571d54486b9ffae8d9d18a822801cbda/subproject/simple/include/simple_dragonbox.h
21+
22+
// dragonboxFtoa computes the decimal significand and exponent
23+
// from the binary significand and exponent using the Dragonbox algorithm
24+
// and formats the decimal floating point number in d.
25+
func dboxFtoa(d *decimalSlice, mant uint64, exp int, denorm bool, bitSize int) {
26+
if bitSize == 32 {
27+
dboxFtoa32(d, uint32(mant), exp, denorm)
28+
return
29+
}
30+
dboxFtoa64(d, mant, exp, denorm)
31+
}
32+
33+
func dboxFtoa64(d *decimalSlice, mant uint64, exp int, denorm bool) {
34+
if mant == 1<<float64MantBits && !denorm {
35+
// Algorithm 5.6 (page 24).
36+
k0 := -mulLog10_2MinusLog10_4Over3(exp)
37+
φ, β := dboxPow64(k0, exp)
38+
xi, zi := dboxRange64(φ, β)
39+
if exp != 2 && exp != 3 {
40+
xi++
41+
}
42+
q := zi / 10
43+
if xi <= q*10 {
44+
q, zeros := trimZeros(q)
45+
dboxDigits(d, q, -k0+1+zeros)
46+
return
47+
}
48+
yru := dboxRoundUp64(φ, β)
49+
if exp == -77 && yru%2 != 0 {
50+
yru--
51+
} else if yru < xi {
52+
yru++
53+
}
54+
dboxDigits(d, yru, -k0)
55+
return
56+
}
57+
58+
// κ = 2 for float64 (section 5.1.3)
59+
const (
60+
κ = 2
61+
p10κ = 100 // 10**κ
62+
p10κ1 = p10κ * 10 // 10**(κ+1)
63+
)
64+
65+
// Algorithm 5.2 (page 15).
66+
k0 := -mulLog10_2(exp)
67+
φ, β := dboxPow64(κ+k0, exp)
68+
zi, exact := dboxMulPow64(uint64(mant*2+1)<<β, φ)
69+
s, r := zi/p10κ1, uint32(zi%p10κ1)
70+
δi := dboxDelta64(φ, β)
71+
72+
if r < δi {
73+
if r != 0 || !exact || mant%2 == 0 {
74+
s, zeros := trimZeros(s)
75+
dboxDigits(d, s, -k0+1+zeros)
76+
return
77+
}
78+
s--
79+
r = p10κ * 10
80+
} else if r == δi {
81+
parity, exact := dboxParity64(uint64(mant*2-1), φ, β)
82+
if parity || (exact && mant%2 == 0) {
83+
s, zeros := trimZeros(s)
84+
dboxDigits(d, s, -k0+1+zeros)
85+
return
86+
}
87+
}
88+
89+
// Algorithm 5.4 (page 18).
90+
D := r + p10κ/2 - δi/2
91+
t, ρ := D/p10κ, D%p10κ
92+
yru := 10*s + uint64(t)
93+
if ρ == 0 {
94+
parity, exact := dboxParity64(mant*2, φ, β)
95+
if parity != ((D-p10κ/2)%2 != 0) || exact && yru%2 != 0 {
96+
yru--
97+
}
98+
}
99+
dboxDigits(d, yru, -k0)
100+
}
101+
102+
// Almost identical to dragonboxFtoa64.
103+
// This is kept as a separate copy to minimize runtime overhead.
104+
func dboxFtoa32(d *decimalSlice, mant uint32, exp int, denorm bool) {
105+
if mant == 1<<float32MantBits && !denorm {
106+
// Algorithm 5.6 (page 24).
107+
k0 := -mulLog10_2MinusLog10_4Over3(exp)
108+
φ, β := dboxPow32(k0, exp)
109+
xi, zi := dboxRange32(φ, β)
110+
if exp != 2 && exp != 3 {
111+
xi++
112+
}
113+
q := zi / 10
114+
if xi <= q*10 {
115+
q, zeros := trimZeros(uint64(q))
116+
dboxDigits(d, q, -k0+1+zeros)
117+
return
118+
}
119+
yru := dboxRoundUp32(φ, β)
120+
if exp == -77 && yru%2 != 0 {
121+
yru--
122+
} else if yru < xi {
123+
yru++
124+
}
125+
dboxDigits(d, uint64(yru), -k0)
126+
return
127+
}
128+
129+
// κ = 1 for float32 (section 5.1.3)
130+
const (
131+
κ = 1
132+
p10κ = 10
133+
p10κ1 = p10κ * 10
134+
)
135+
136+
// Algorithm 5.2 (page 15).
137+
k0 := -mulLog10_2(exp)
138+
φ, β := dboxPow32(κ+k0, exp)
139+
zi, exact := dboxMulPow32(uint32(mant*2+1)<<β, φ)
140+
s, r := zi/p10κ1, uint32(zi%p10κ1)
141+
δi := dboxDelta32(φ, β)
142+
143+
if r < δi {
144+
if r != 0 || !exact || mant%2 == 0 {
145+
s, zeros := trimZeros(uint64(s))
146+
dboxDigits(d, s, -k0+1+zeros)
147+
return
148+
}
149+
s--
150+
r = p10κ * 10
151+
} else if r == δi {
152+
parity, exact := dboxParity32(uint32(mant*2-1), φ, β)
153+
if parity || (exact && mant%2 == 0) {
154+
s, zeros := trimZeros(uint64(s))
155+
dboxDigits(d, s, -k0+1+zeros)
156+
return
157+
}
158+
}
159+
160+
// Algorithm 5.4 (page 18).
161+
D := r + p10κ/2 - δi/2
162+
t, ρ := D/p10κ, D%p10κ
163+
yru := 10*s + uint32(t)
164+
if ρ == 0 {
165+
parity, exact := dboxParity32(mant*2, φ, β)
166+
if parity != ((D-p10κ/2)%2 != 0) || exact && yru%2 != 0 {
167+
yru--
168+
}
169+
}
170+
dboxDigits(d, uint64(yru), -k0)
171+
}
172+
173+
// dboxDigits emits decimal digits of mant in d for float64
174+
// and adjusts the decimal point based on exp.
175+
func dboxDigits(d *decimalSlice, mant uint64, exp int) {
176+
i := formatBase10(d.d, mant)
177+
d.d = d.d[i:]
178+
d.nd = len(d.d)
179+
d.dp = d.nd + exp
180+
}
181+
182+
// uadd128 returns the full 128 bits of u + n.
183+
func uadd128(u uint128, n uint64) uint128 {
184+
sum := uint64(u.Lo + n)
185+
// Check if lo is wrapped around.
186+
if sum < u.Lo {
187+
u.Hi++
188+
}
189+
u.Lo = sum
190+
return u
191+
}
192+
193+
// umul64 returns the full 64 bits of x * y.
194+
func umul64(x, y uint32) uint64 {
195+
return uint64(x) * uint64(y)
196+
}
197+
198+
// umul96Upper64 returns the upper 64 bits (out of 96 bits) of x * y.
199+
func umul96Upper64(x uint32, y uint64) uint64 {
200+
yh := uint32(y >> 32)
201+
yl := uint32(y)
202+
203+
xyh := umul64(x, yh)
204+
xyl := umul64(x, yl)
205+
206+
return xyh + (xyl >> 32)
207+
}
208+
209+
// umul96Lower64 returns the lower 64 bits (out of 96 bits) of x * y.
210+
func umul96Lower64(x uint32, y uint64) uint64 {
211+
return uint64(uint64(x) * y)
212+
}
213+
214+
// umul128Upper64 returns the upper 64 bits (out of 128 bits) of x * y.
215+
func umul128Upper64(x, y uint64) uint64 {
216+
a := uint32(x >> 32)
217+
b := uint32(x)
218+
c := uint32(y >> 32)
219+
d := uint32(y)
220+
221+
ac := umul64(a, c)
222+
bc := umul64(b, c)
223+
ad := umul64(a, d)
224+
bd := umul64(b, d)
225+
226+
intermediate := (bd >> 32) + uint64(uint32(ad)) + uint64(uint32(bc))
227+
228+
return ac + (intermediate >> 32) + (ad >> 32) + (bc >> 32)
229+
}
230+
231+
// umul192Upper128 returns the upper 128 bits (out of 192 bits) of x * y.
232+
func umul192Upper128(x uint64, y uint128) uint128 {
233+
r := umul128(x, y.Hi)
234+
t := umul128Upper64(x, y.Lo)
235+
return uadd128(r, t)
236+
}
237+
238+
// umul192Lower128 returns the lower 128 bits (out of 192 bits) of x * y.
239+
func umul192Lower128(x uint64, y uint128) uint128 {
240+
high := x * y.Hi
241+
highLow := umul128(x, y.Lo)
242+
return uint128{uint64(high + highLow.Hi), highLow.Lo}
243+
}
244+
245+
// dboxMulPow64 computes x^(i), y^(i), z^(i)
246+
// from the precomputed value of φ̃k for float64
247+
// and also checks if x^(f), y^(f), z^(f) == 0 (section 5.2.1).
248+
func dboxMulPow64(u uint64, phi uint128) (intPart uint64, isInt bool) {
249+
r := umul192Upper128(u, phi)
250+
intPart = r.Hi
251+
isInt = r.Lo == 0
252+
return
253+
}
254+
255+
// dboxMulPow32 computes x^(i), y^(i), z^(i)
256+
// from the precomputed value of φ̃k for float32
257+
// and also checks if x^(f), y^(f), z^(f) == 0 (section 5.2.1).
258+
func dboxMulPow32(u uint32, phi uint64) (intPart uint32, isInt bool) {
259+
r := umul96Upper64(u, phi)
260+
intPart = uint32(r >> 32)
261+
isInt = uint32(r) == 0
262+
return
263+
}
264+
265+
// dboxParity64 computes only the parity of x^(i), y^(i), z^(i)
266+
// from the precomputed value of φ̃k for float64
267+
// and also checks if x^(f), y^(f), z^(f) = 0 (section 5.2.1).
268+
func dboxParity64(mant2 uint64, phi uint128, beta int) (parity bool, isInt bool) {
269+
r := umul192Lower128(mant2, phi)
270+
parity = ((r.Hi >> (64 - beta)) & 1) != 0
271+
isInt = ((uint64(r.Hi << beta)) | (r.Lo >> (64 - beta))) == 0
272+
return
273+
}
274+
275+
// dboxParity32 computes only the parity of x^(i), y^(i), z^(i)
276+
// from the precomputed value of φ̃k for float32
277+
// and also checks if x^(f), y^(f), z^(f) = 0 (section 5.2.1).
278+
func dboxParity32(mant2 uint32, phi uint64, beta int) (parity bool, isInt bool) {
279+
r := umul96Lower64(mant2, phi)
280+
parity = ((r >> (64 - beta)) & 1) != 0
281+
isInt = uint32(r>>(32-beta)) == 0
282+
return
283+
}
284+
285+
// dboxDelta64 returns δ^(i) from the precomputed value of φ̃k for float64.
286+
func dboxDelta64(φ uint128, β int) uint32 {
287+
return uint32(φ.Hi >> (64 - 1 - β))
288+
}
289+
290+
// dboxDelta32 returns δ^(i) from the precomputed value of φ̃k for float32.
291+
func dboxDelta32(φ uint64, β int) uint32 {
292+
return uint32(φ >> (64 - 1 - β))
293+
}
294+
295+
// mulLog10_2MinusLog10_4Over3 computes
296+
// ⌊e*log10(2)-log10(4/3)⌋ = ⌊log10(2^e)-log10(4/3)⌋ (section 6.3).
297+
func mulLog10_2MinusLog10_4Over3(e int) int {
298+
// e should be in the range [-2985, 2936].
299+
return (e*631305 - 261663) >> 21
300+
}
301+
302+
const (
303+
floatMantBits64 = 52 // p = 52 for float64.
304+
floatMantBits32 = 23 // p = 23 for float32.
305+
)
306+
307+
// dboxRange64 returns the left and right float64 endpoints.
308+
func dboxRange64(φ uint128, β int) (left, right uint64) {
309+
left = (φ.Hi - (φ.Hi >> (float64MantBits + 2))) >> (64 - float64MantBits - 1 - β)
310+
right = (φ.Hi + (φ.Hi >> (float64MantBits + 1))) >> (64 - float64MantBits - 1 - β)
311+
return left, right
312+
}
313+
314+
// dboxRange32 returns the left and right float32 endpoints.
315+
func dboxRange32(φ uint64, β int) (left, right uint32) {
316+
left = uint32((φ - (φ >> (floatMantBits32 + 2))) >> (64 - floatMantBits32 - 1 - β))
317+
right = uint32((φ + (φ >> (floatMantBits32 + 1))) >> (64 - floatMantBits32 - 1 - β))
318+
return left, right
319+
}
320+
321+
// dboxRoundUp64 computes the round up of y (i.e., y^(ru)).
322+
func dboxRoundUp64(phi uint128, beta int) uint64 {
323+
return (phi.Hi>>(128/2-floatMantBits64-2-beta) + 1) / 2
324+
}
325+
326+
// dboxRoundUp32 computes the round up of y (i.e., y^(ru)).
327+
func dboxRoundUp32(phi uint64, beta int) uint32 {
328+
return uint32(phi>>(64-floatMantBits32-2-beta)+1) / 2
329+
}
330+
331+
// dboxPow64 gets the precomputed value of φ̃̃k for float64.
332+
func dboxPow64(k, e int) (φ uint128, β int) {
333+
φ, e1, _ := pow10(k)
334+
if k < 0 || k > 55 {
335+
φ.Lo++
336+
}
337+
β = e + e1 - 1
338+
return φ, β
339+
}
340+
341+
// dboxPow32 gets the precomputed value of φ̃̃k for float32.
342+
func dboxPow32(k, e int) (mant uint64, exp int) {
343+
m, e1, _ := pow10(k)
344+
if k < 0 || k > 27 {
345+
m.Hi++
346+
}
347+
exp = e + e1 - 1
348+
return m.Hi, exp
349+
}

0 commit comments

Comments
 (0)