Skip to content

Commit a41178e

Browse files
committed
Auto merge of #27110 - arthurprs:optintstr, r=Gankro
I wrote a reasonably optimized version for both functions. Further optimizations are possible, but I tried to keep the code size small (which I think is important); beyond this point it's a road of diminishing returns. The repository used for testing/benchmarks is https://github.com/arthurprs/rust-optimized-inttostr Benchmarks are run for 3 different distributions; below are the string length histograms for the u32 type: * _h (big numbers skew): [0, 0, 5, 29, 103, 212, 551, 1138, 1887, 3196, 2879] * _m (slight small numbers skew): [0, 2807, 1334, 1057, 905, 821, 772, 707, 627, 605, 365] * _l (small numbers skew): [0, 8004, 567, 351, 248, 212, 170, 126, 136, 112, 74] Tested processors are * x64 laptop (i7-2670QM) * x86 (32-bit) server (Digital Ocean E5-2630L-v2) ### Display It uses a small lookup table (200 bytes) and decodes up to 4 characters at a time. I also took special precautions to reduce 64bit arithmetic on 32bit architectures, and the gains are huge in these cases. Overall, on modern 64bit CPUs it's pretty much the same speed as the stdlib implementation for very small numbers (0..99), but it pulls ahead as the length of the decimal increases. On slightly older CPUs (w/ worse ALUs) or 32bit architectures it's pretty much always faster. x64 benchmarks ``` test bench::display_h_new_u08 ... bench: 71,041 ns/iter (+/- 2,894) test bench::display_h_new_u16 ... bench: 378,255 ns/iter (+/- 36,547) test bench::display_h_new_u32 ... bench: 4,232,483 ns/iter (+/- 509,661) test bench::display_h_new_u64 ... bench: 5,166,740 ns/iter (+/- 421,124) test bench::display_h_stdlib_u08 ... bench: 73,536 ns/iter (+/- 5,287) test bench::display_h_stdlib_u16 ... bench: 451,443 ns/iter (+/- 16,879) test bench::display_h_stdlib_u32 ... bench: 5,551,070 ns/iter (+/- 518,151) test bench::display_h_stdlib_u64 ... bench: 8,624,374 ns/iter (+/- 643,701) test bench::display_l_new_u08 ... bench: 71,547 ns/iter (+/- 504) test bench::display_l_new_u16 ... bench: 399,727 ns/iter (+/- 28,030) test bench::display_l_new_u32 ... 
bench: 4,365,303 ns/iter (+/- 414,414) test bench::display_l_new_u64 ... bench: 5,302,382 ns/iter (+/- 292,324) test bench::display_l_stdlib_u08 ... bench: 75,445 ns/iter (+/- 2,487) test bench::display_l_stdlib_u16 ... bench: 444,313 ns/iter (+/- 16,203) test bench::display_l_stdlib_u32 ... bench: 5,761,801 ns/iter (+/- 387,186) test bench::display_l_stdlib_u64 ... bench: 8,790,365 ns/iter (+/- 614,846) test bench::display_m_new_u08 ... bench: 71,820 ns/iter (+/- 2,956) test bench::display_m_new_u16 ... bench: 399,649 ns/iter (+/- 20,643) test bench::display_m_new_u32 ... bench: 4,355,561 ns/iter (+/- 179,189) test bench::display_m_new_u64 ... bench: 5,070,594 ns/iter (+/- 341,950) test bench::display_m_stdlib_u08 ... bench: 74,900 ns/iter (+/- 1,909) test bench::display_m_stdlib_u16 ... bench: 448,788 ns/iter (+/- 20,791) test bench::display_m_stdlib_u32 ... bench: 5,717,939 ns/iter (+/- 316,824) test bench::display_m_stdlib_u64 ... bench: 8,787,160 ns/iter (+/- 482,864) ``` x86 benchmarks ``` test bench::display_h_new_u08 ... bench: 94,246 ns/iter (+/- 34,872) test bench::display_h_new_u16 ... bench: 533,805 ns/iter (+/- 22,499) test bench::display_h_new_u32 ... bench: 6,127,747 ns/iter (+/- 2,192,789) test bench::display_h_new_u64 ... bench: 14,994,203 ns/iter (+/- 1,609,345) test bench::display_h_stdlib_u08 ... bench: 107,233 ns/iter (+/- 8,571) test bench::display_h_stdlib_u16 ... bench: 631,186 ns/iter (+/- 11,332) test bench::display_h_stdlib_u32 ... bench: 7,696,344 ns/iter (+/- 957,917) test bench::display_h_stdlib_u64 ... bench: 45,677,401 ns/iter (+/- 4,991,344) test bench::display_l_new_u08 ... bench: 95,855 ns/iter (+/- 27,735) test bench::display_l_new_u16 ... bench: 532,084 ns/iter (+/- 40,479) test bench::display_l_new_u32 ... bench: 5,973,953 ns/iter (+/- 211,676) test bench::display_l_new_u64 ... bench: 14,773,064 ns/iter (+/- 1,276,579) test bench::display_l_stdlib_u08 ... bench: 106,350 ns/iter (+/- 63,963) test bench::display_l_stdlib_u16 ... 
bench: 637,746 ns/iter (+/- 101,005) test bench::display_l_stdlib_u32 ... bench: 7,740,640 ns/iter (+/- 848,478) test bench::display_l_stdlib_u64 ... bench: 44,846,932 ns/iter (+/- 4,514,694) test bench::display_m_new_u08 ... bench: 94,549 ns/iter (+/- 13,029) test bench::display_m_new_u16 ... bench: 546,030 ns/iter (+/- 35,804) test bench::display_m_new_u32 ... bench: 5,983,924 ns/iter (+/- 1,180,559) test bench::display_m_new_u64 ... bench: 14,817,873 ns/iter (+/- 2,271,464) test bench::display_m_stdlib_u08 ... bench: 107,806 ns/iter (+/- 8,805) test bench::display_m_stdlib_u16 ... bench: 630,714 ns/iter (+/- 6,586) test bench::display_m_stdlib_u32 ... bench: 7,784,210 ns/iter (+/- 358,601) test bench::display_m_stdlib_u64 ... bench: 46,223,927 ns/iter (+/- 6,553,176) ``` ### from_str_radix (FromStr) All valid digits are ASCII, so I modified the function to operate on the underlying bytes instead and simplified the match to avoid wasting cycles. x64 benchmarks ``` test bench::from_str_h_new_u08 ... bench: 28,153 ns/iter (+/- 624) test bench::from_str_h_new_u16 ... bench: 223,513 ns/iter (+/- 11,554) test bench::from_str_h_new_u32 ... bench: 3,098,935 ns/iter (+/- 231,022) test bench::from_str_h_new_u64 ... bench: 5,009,900 ns/iter (+/- 341,961) test bench::from_str_h_stdlib_u08 ... bench: 34,033 ns/iter (+/- 2,068) test bench::from_str_h_stdlib_u16 ... bench: 248,785 ns/iter (+/- 14,208) test bench::from_str_h_stdlib_u32 ... bench: 4,150,536 ns/iter (+/- 266,070) test bench::from_str_h_stdlib_u64 ... bench: 6,817,997 ns/iter (+/- 449,838) test bench::from_str_l_new_u08 ... bench: 27,552 ns/iter (+/- 1,500) test bench::from_str_l_new_u16 ... bench: 234,360 ns/iter (+/- 13,144) test bench::from_str_l_new_u32 ... bench: 3,140,261 ns/iter (+/- 248,175) test bench::from_str_l_new_u64 ... bench: 5,176,583 ns/iter (+/- 350,416) test bench::from_str_l_stdlib_u08 ... bench: 35,060 ns/iter (+/- 2,154) test bench::from_str_l_stdlib_u16 ... 
bench: 252,135 ns/iter (+/- 23,461) test bench::from_str_l_stdlib_u32 ... bench: 4,154,599 ns/iter (+/- 369,606) test bench::from_str_l_stdlib_u64 ... bench: 6,892,767 ns/iter (+/- 213,030) test bench::from_str_m_new_u08 ... bench: 28,252 ns/iter (+/- 1,384) test bench::from_str_m_new_u16 ... bench: 231,051 ns/iter (+/- 16,540) test bench::from_str_m_new_u32 ... bench: 3,166,504 ns/iter (+/- 134,418) test bench::from_str_m_new_u64 ... bench: 5,103,195 ns/iter (+/- 218,912) test bench::from_str_m_stdlib_u08 ... bench: 35,012 ns/iter (+/- 2,735) test bench::from_str_m_stdlib_u16 ... bench: 250,967 ns/iter (+/- 14,708) test bench::from_str_m_stdlib_u32 ... bench: 4,101,845 ns/iter (+/- 205,802) test bench::from_str_m_stdlib_u64 ... bench: 6,823,001 ns/iter (+/- 267,215) ``` x86 benchmarks ``` test bench::from_str_h_new_u08 ... bench: 23,682 ns/iter (+/- 3,590) test bench::from_str_h_new_u16 ... bench: 190,916 ns/iter (+/- 29,688) test bench::from_str_h_new_u32 ... bench: 2,649,952 ns/iter (+/- 308,576) test bench::from_str_h_new_u64 ... bench: 23,458,434 ns/iter (+/- 2,327,427) test bench::from_str_h_stdlib_u08 ... bench: 45,551 ns/iter (+/- 6,968) test bench::from_str_h_stdlib_u16 ... bench: 313,739 ns/iter (+/- 17,175) test bench::from_str_h_stdlib_u32 ... bench: 4,615,669 ns/iter (+/- 470,766) test bench::from_str_h_stdlib_u64 ... bench: 30,589,482 ns/iter (+/- 2,278,996) test bench::from_str_l_new_u08 ... bench: 23,763 ns/iter (+/- 5,545) test bench::from_str_l_new_u16 ... bench: 185,472 ns/iter (+/- 33,097) test bench::from_str_l_new_u32 ... bench: 2,691,307 ns/iter (+/- 473,886) test bench::from_str_l_new_u64 ... bench: 22,952,593 ns/iter (+/- 1,963,742) test bench::from_str_l_stdlib_u08 ... bench: 45,285 ns/iter (+/- 16,337) test bench::from_str_l_stdlib_u16 ... bench: 313,624 ns/iter (+/- 6,643) test bench::from_str_l_stdlib_u32 ... bench: 4,595,679 ns/iter (+/- 1,876,361) test bench::from_str_l_stdlib_u64 ... 
bench: 30,434,683 ns/iter (+/- 1,901,996) test bench::from_str_m_new_u08 ... bench: 23,812 ns/iter (+/- 1,505) test bench::from_str_m_new_u16 ... bench: 185,553 ns/iter (+/- 19,788) test bench::from_str_m_new_u32 ... bench: 2,614,920 ns/iter (+/- 66,230) test bench::from_str_m_new_u64 ... bench: 23,241,778 ns/iter (+/- 3,474,077) test bench::from_str_m_stdlib_u08 ... bench: 45,634 ns/iter (+/- 1,436) test bench::from_str_m_stdlib_u16 ... bench: 316,479 ns/iter (+/- 21,212) test bench::from_str_m_stdlib_u32 ... bench: 4,609,147 ns/iter (+/- 487,068) test bench::from_str_m_stdlib_u64 ... bench: 30,165,173 ns/iter (+/- 1,601,830) ```
2 parents 58fb9b5 + c073f81 commit a41178e

File tree

3 files changed

+120
-15
lines changed

3 files changed

+120
-15
lines changed

src/libcore/fmt/num.rs

+86-2
Original file line numberDiff line numberDiff line change
@@ -20,18 +20,25 @@ use fmt;
2020
use num::Zero;
2121
use ops::{Div, Rem, Sub};
2222
use str;
23+
use slice;
24+
use ptr;
25+
use mem;
2326

2427
#[doc(hidden)]
2528
trait Int: Zero + PartialEq + PartialOrd + Div<Output=Self> + Rem<Output=Self> +
2629
Sub<Output=Self> + Copy {
2730
fn from_u8(u: u8) -> Self;
2831
fn to_u8(&self) -> u8;
32+
fn to_u32(&self) -> u32;
33+
fn to_u64(&self) -> u64;
2934
}
3035

3136
macro_rules! doit {
3237
($($t:ident)*) => ($(impl Int for $t {
3338
fn from_u8(u: u8) -> $t { u as $t }
3439
fn to_u8(&self) -> u8 { *self as u8 }
40+
fn to_u32(&self) -> u32 { *self as u32 }
41+
fn to_u64(&self) -> u64 { *self as u64 }
3542
})*)
3643
}
3744
doit! { i8 i16 i32 i64 isize u8 u16 u32 u64 usize }
@@ -188,6 +195,7 @@ macro_rules! radix_fmt {
188195
}
189196
}
190197
}
198+
191199
macro_rules! int_base {
192200
($Trait:ident for $T:ident as $U:ident -> $Radix:ident) => {
193201
#[stable(feature = "rust1", since = "1.0.0")]
@@ -209,17 +217,16 @@ macro_rules! debug {
209217
}
210218
}
211219
}
220+
212221
macro_rules! integer {
213222
($Int:ident, $Uint:ident) => {
214-
int_base! { Display for $Int as $Int -> Decimal }
215223
int_base! { Binary for $Int as $Uint -> Binary }
216224
int_base! { Octal for $Int as $Uint -> Octal }
217225
int_base! { LowerHex for $Int as $Uint -> LowerHex }
218226
int_base! { UpperHex for $Int as $Uint -> UpperHex }
219227
radix_fmt! { $Int as $Int, fmt_int }
220228
debug! { $Int }
221229

222-
int_base! { Display for $Uint as $Uint -> Decimal }
223230
int_base! { Binary for $Uint as $Uint -> Binary }
224231
int_base! { Octal for $Uint as $Uint -> Octal }
225232
int_base! { LowerHex for $Uint as $Uint -> LowerHex }
@@ -233,3 +240,80 @@ integer! { i8, u8 }
233240
integer! { i16, u16 }
234241
integer! { i32, u32 }
235242
integer! { i64, u64 }
243+
244+
const DEC_DIGITS_LUT: &'static[u8] =
245+
b"0001020304050607080910111213141516171819\
246+
2021222324252627282930313233343536373839\
247+
4041424344454647484950515253545556575859\
248+
6061626364656667686970717273747576777879\
249+
8081828384858687888990919293949596979899";
250+
251+
macro_rules! impl_Display {
252+
($($t:ident),*: $conv_fn:ident) => ($(
253+
impl fmt::Display for $t {
254+
#[allow(unused_comparisons)]
255+
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
256+
let is_positive = *self >= 0;
257+
let mut n = if is_positive {
258+
self.$conv_fn()
259+
} else {
260+
// negate via two's complement (flip every bit, then add 1) to get the magnitude as an unsigned value
261+
(!self.$conv_fn()).wrapping_add(1)
262+
};
263+
let mut buf: [u8; 20] = unsafe { mem::uninitialized() };
264+
let mut curr = buf.len() as isize;
265+
let buf_ptr = buf.as_mut_ptr();
266+
let lut_ptr = DEC_DIGITS_LUT.as_ptr();
267+
268+
unsafe {
269+
// eagerly decode 4 characters at a time
270+
if <$t>::max_value() as u64 >= 10000 {
271+
while n >= 10000 {
272+
let rem = (n % 10000) as isize;
273+
n /= 10000;
274+
275+
let d1 = (rem / 100) << 1;
276+
let d2 = (rem % 100) << 1;
277+
curr -= 4;
278+
ptr::copy_nonoverlapping(lut_ptr.offset(d1), buf_ptr.offset(curr), 2);
279+
ptr::copy_nonoverlapping(lut_ptr.offset(d2), buf_ptr.offset(curr + 2), 2);
280+
}
281+
}
282+
283+
// if we reach here numbers are <= 9999, so at most 4 chars long
284+
let mut n = n as isize; // possibly reduce 64bit math
285+
286+
// decode 2 more chars, if > 2 chars
287+
if n >= 100 {
288+
let d1 = (n % 100) << 1;
289+
n /= 100;
290+
curr -= 2;
291+
ptr::copy_nonoverlapping(lut_ptr.offset(d1), buf_ptr.offset(curr), 2);
292+
}
293+
294+
// decode last 1 or 2 chars
295+
if n < 10 {
296+
curr -= 1;
297+
*buf_ptr.offset(curr) = (n as u8) + 48;
298+
} else {
299+
let d1 = n << 1;
300+
curr -= 2;
301+
ptr::copy_nonoverlapping(lut_ptr.offset(d1), buf_ptr.offset(curr), 2);
302+
}
303+
}
304+
305+
let buf_slice = unsafe {
306+
str::from_utf8_unchecked(
307+
slice::from_raw_parts(buf_ptr.offset(curr), buf.len() - curr as usize))
308+
};
309+
f.pad_integral(is_positive, "", buf_slice)
310+
}
311+
})*);
312+
}
313+
314+
impl_Display!(i8, u8, i16, u16, i32, u32: to_u32);
315+
impl_Display!(i64, u64: to_u64);
316+
#[cfg(target_pointer_width = "32")]
317+
impl_Display!(isize, usize: to_u32);
318+
#[cfg(target_pointer_width = "64")]
319+
impl_Display!(isize, usize: to_u64);

src/libcore/num/mod.rs

+25-11
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ use mem::size_of;
2424
use option::Option::{self, Some, None};
2525
use result::Result::{self, Ok, Err};
2626
use str::{FromStr, StrExt};
27+
use slice::SliceExt;
2728

2829
/// Provides intentionally-wrapped arithmetic on `T`.
2930
///
@@ -1448,19 +1449,30 @@ fn from_str_radix<T: FromStrRadixHelper>(src: &str, radix: u32)
14481449
-> Result<T, ParseIntError> {
14491450
use self::IntErrorKind::*;
14501451
use self::ParseIntError as PIE;
1452+
14511453
assert!(radix >= 2 && radix <= 36,
14521454
"from_str_radix_int: must lie in the range `[2, 36]` - found {}",
14531455
radix);
14541456

1457+
if src.is_empty() {
1458+
return Err(PIE { kind: Empty });
1459+
}
1460+
14551461
let is_signed_ty = T::from_u32(0) > T::min_value();
14561462

1457-
match src.slice_shift_char() {
1458-
Some(('-', "")) => Err(PIE { kind: Empty }),
1459-
Some(('-', src)) if is_signed_ty => {
1463+
// all valid digits are ascii, so we will just iterate over the utf8 bytes
1464+
// and cast them to chars. .to_digit() will safely return None for anything
1465+
// other than a valid ascii digit for the given radix, including the first-byte
1466+
// of multi-byte sequences
1467+
let src = src.as_bytes();
1468+
1469+
match (src[0], &src[1..]) {
1470+
(b'-', digits) if digits.is_empty() => Err(PIE { kind: Empty }),
1471+
(b'-', digits) if is_signed_ty => {
14601472
// The number is negative
14611473
let mut result = T::from_u32(0);
1462-
for c in src.chars() {
1463-
let x = match c.to_digit(radix) {
1474+
for &c in digits {
1475+
let x = match (c as char).to_digit(radix) {
14641476
Some(x) => x,
14651477
None => return Err(PIE { kind: InvalidDigit }),
14661478
};
@@ -1475,11 +1487,14 @@ fn from_str_radix<T: FromStrRadixHelper>(src: &str, radix: u32)
14751487
}
14761488
Ok(result)
14771489
},
1478-
Some((_, _)) => {
1490+
(c, digits) => {
14791491
// No leading minus sign was consumed: parse as a non-negative number (a stray '-' fails below as an invalid digit)
1480-
let mut result = T::from_u32(0);
1481-
for c in src.chars() {
1482-
let x = match c.to_digit(radix) {
1492+
let mut result = match (c as char).to_digit(radix) {
1493+
Some(x) => T::from_u32(x),
1494+
None => return Err(PIE { kind: InvalidDigit }),
1495+
};
1496+
for &c in digits {
1497+
let x = match (c as char).to_digit(radix) {
14831498
Some(x) => x,
14841499
None => return Err(PIE { kind: InvalidDigit }),
14851500
};
@@ -1493,8 +1508,7 @@ fn from_str_radix<T: FromStrRadixHelper>(src: &str, radix: u32)
14931508
};
14941509
}
14951510
Ok(result)
1496-
},
1497-
None => Err(ParseIntError { kind: Empty }),
1511+
}
14981512
}
14991513
}
15001514

src/libcoretest/num/mod.rs

+9-2
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,14 @@ mod tests {
117117
}
118118

119119
#[test]
120-
fn test_int_from_minus_sign() {
121-
assert_eq!("-".parse::<i32>().ok(), None);
120+
fn test_invalid() {
121+
assert_eq!("--129".parse::<i8>().ok(), None);
122+
assert_eq!("Съешь".parse::<u8>().ok(), None);
123+
}
124+
125+
#[test]
126+
fn test_empty() {
127+
assert_eq!("-".parse::<i8>().ok(), None);
128+
assert_eq!("".parse::<u8>().ok(), None);
122129
}
123130
}

0 commit comments

Comments
 (0)