Skip to content

Commit 658adc6

Browse files
committed
faster fmt::Display of 128-bit integers, without unsafe pointer
1 parent a438345 commit 658adc6

File tree

1 file changed

+132
-155
lines changed

1 file changed

+132
-155
lines changed

library/core/src/fmt/num.rs

+132-155
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,7 @@ macro_rules! impl_Display {
268268
// Format per two digits from the lookup table.
269269
if remain > 9 {
270270
// SAFETY: All of the decimals fit in buf due to MAX_DEC_N
271-
// and the while condition ensures at least 2 more decimals.
271+
// and the if condition ensures at least 2 more decimals.
272272
unsafe { core::hint::assert_unchecked(offset >= 2) }
273273
// SAFETY: The offset counts down from its initial buf.len()
274274
// without underflow due to the previous precondition.
@@ -555,93 +555,6 @@ mod imp {
555555
}
556556
impl_Exp!(i128, u128 as u128 via to_u128 named exp_u128);
557557

558-
/// Helper function for writing a u64 into `buf` going from last to first, with `curr`.
559-
fn parse_u64_into<const N: usize>(mut n: u64, buf: &mut [MaybeUninit<u8>; N], curr: &mut usize) {
560-
let buf_ptr = MaybeUninit::slice_as_mut_ptr(buf);
561-
let lut_ptr = DEC_DIGITS_LUT.as_ptr();
562-
assert!(*curr > 19);
563-
564-
// SAFETY:
565-
// Writes at most 19 characters into the buffer. Guaranteed that any ptr into LUT is at most
566-
// 198, so will never OOB. There is a check above that there are at least 19 characters
567-
// remaining.
568-
unsafe {
569-
if n >= 1e16 as u64 {
570-
let to_parse = n % 1e16 as u64;
571-
n /= 1e16 as u64;
572-
573-
// Some of these are nops but it looks more elegant this way.
574-
let d1 = ((to_parse / 1e14 as u64) % 100) << 1;
575-
let d2 = ((to_parse / 1e12 as u64) % 100) << 1;
576-
let d3 = ((to_parse / 1e10 as u64) % 100) << 1;
577-
let d4 = ((to_parse / 1e8 as u64) % 100) << 1;
578-
let d5 = ((to_parse / 1e6 as u64) % 100) << 1;
579-
let d6 = ((to_parse / 1e4 as u64) % 100) << 1;
580-
let d7 = ((to_parse / 1e2 as u64) % 100) << 1;
581-
let d8 = ((to_parse / 1e0 as u64) % 100) << 1;
582-
583-
*curr -= 16;
584-
585-
ptr::copy_nonoverlapping(lut_ptr.add(d1 as usize), buf_ptr.add(*curr + 0), 2);
586-
ptr::copy_nonoverlapping(lut_ptr.add(d2 as usize), buf_ptr.add(*curr + 2), 2);
587-
ptr::copy_nonoverlapping(lut_ptr.add(d3 as usize), buf_ptr.add(*curr + 4), 2);
588-
ptr::copy_nonoverlapping(lut_ptr.add(d4 as usize), buf_ptr.add(*curr + 6), 2);
589-
ptr::copy_nonoverlapping(lut_ptr.add(d5 as usize), buf_ptr.add(*curr + 8), 2);
590-
ptr::copy_nonoverlapping(lut_ptr.add(d6 as usize), buf_ptr.add(*curr + 10), 2);
591-
ptr::copy_nonoverlapping(lut_ptr.add(d7 as usize), buf_ptr.add(*curr + 12), 2);
592-
ptr::copy_nonoverlapping(lut_ptr.add(d8 as usize), buf_ptr.add(*curr + 14), 2);
593-
}
594-
if n >= 1e8 as u64 {
595-
let to_parse = n % 1e8 as u64;
596-
n /= 1e8 as u64;
597-
598-
// Some of these are nops but it looks more elegant this way.
599-
let d1 = ((to_parse / 1e6 as u64) % 100) << 1;
600-
let d2 = ((to_parse / 1e4 as u64) % 100) << 1;
601-
let d3 = ((to_parse / 1e2 as u64) % 100) << 1;
602-
let d4 = ((to_parse / 1e0 as u64) % 100) << 1;
603-
*curr -= 8;
604-
605-
ptr::copy_nonoverlapping(lut_ptr.add(d1 as usize), buf_ptr.add(*curr + 0), 2);
606-
ptr::copy_nonoverlapping(lut_ptr.add(d2 as usize), buf_ptr.add(*curr + 2), 2);
607-
ptr::copy_nonoverlapping(lut_ptr.add(d3 as usize), buf_ptr.add(*curr + 4), 2);
608-
ptr::copy_nonoverlapping(lut_ptr.add(d4 as usize), buf_ptr.add(*curr + 6), 2);
609-
}
610-
// `n` < 1e8 < (1 << 32)
611-
let mut n = n as u32;
612-
if n >= 1e4 as u32 {
613-
let to_parse = n % 1e4 as u32;
614-
n /= 1e4 as u32;
615-
616-
let d1 = (to_parse / 100) << 1;
617-
let d2 = (to_parse % 100) << 1;
618-
*curr -= 4;
619-
620-
ptr::copy_nonoverlapping(lut_ptr.add(d1 as usize), buf_ptr.add(*curr + 0), 2);
621-
ptr::copy_nonoverlapping(lut_ptr.add(d2 as usize), buf_ptr.add(*curr + 2), 2);
622-
}
623-
624-
// `n` < 1e4 < (1 << 16)
625-
let mut n = n as u16;
626-
if n >= 100 {
627-
let d1 = (n % 100) << 1;
628-
n /= 100;
629-
*curr -= 2;
630-
ptr::copy_nonoverlapping(lut_ptr.add(d1 as usize), buf_ptr.add(*curr), 2);
631-
}
632-
633-
// decode last 1 or 2 chars
634-
if n < 10 {
635-
*curr -= 1;
636-
*buf_ptr.add(*curr) = (n as u8) + b'0';
637-
} else {
638-
let d1 = n << 1;
639-
*curr -= 2;
640-
ptr::copy_nonoverlapping(lut_ptr.add(d1 as usize), buf_ptr.add(*curr), 2);
641-
}
642-
}
643-
}
644-
645558
#[stable(feature = "rust1", since = "1.0.0")]
646559
impl fmt::Display for u128 {
647560
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
@@ -652,96 +565,160 @@ impl fmt::Display for u128 {
652565
#[stable(feature = "rust1", since = "1.0.0")]
653566
impl fmt::Display for i128 {
654567
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
655-
let is_nonnegative = *self >= 0;
656-
let n = if is_nonnegative {
657-
self.to_u128()
658-
} else {
659-
// convert the negative num to positive by summing 1 to its 2s complement
660-
(!self.to_u128()).wrapping_add(1)
661-
};
662-
fmt_u128(n, is_nonnegative, f)
568+
fmt_u128(self.unsigned_abs(), *self >= 0, f)
663569
}
664570
}
665571

666-
/// Specialized optimization for u128. Instead of taking two items at a time, it splits
667-
/// into at most 2 u64s, and then chunks by 10e16, 10e8, 10e4, 10e2, and then 10e1.
668-
/// It also has to handle 1 last item, as 10^40 > 2^128 > 10^39, whereas
669-
/// 10^20 > 2^64 > 10^19.
572+
/// Format optimized for u128. Computation of 128 bits is limited by processing
573+
/// in batches of 16 decimals at a time.
670574
fn fmt_u128(n: u128, is_nonnegative: bool, f: &mut fmt::Formatter<'_>) -> fmt::Result {
671-
// 2^128 is about 3*10^38, so 39 gives an extra byte of space
672-
let mut buf = [MaybeUninit::<u8>::uninit(); 39];
673-
let mut curr = buf.len();
674-
675-
let (n, rem) = udiv_1e19(n);
676-
parse_u64_into(rem, &mut buf, &mut curr);
677-
678-
if n != 0 {
679-
// 0 pad up to point
680-
let target = buf.len() - 19;
681-
// SAFETY: Guaranteed that we wrote at most 19 bytes, and there must be space
682-
// remaining since it has length 39
683-
unsafe {
684-
ptr::write_bytes(
685-
MaybeUninit::slice_as_mut_ptr(&mut buf).add(target),
686-
b'0',
687-
curr - target,
688-
);
689-
}
690-
curr = target;
691-
692-
let (n, rem) = udiv_1e19(n);
693-
parse_u64_into(rem, &mut buf, &mut curr);
694-
// Should this following branch be annotated with unlikely?
695-
if n != 0 {
696-
let target = buf.len() - 38;
697-
// The raw `buf_ptr` pointer is only valid until `buf` is used the next time,
698-
// but `buf` is not used in this scope so we are good.
699-
let buf_ptr = MaybeUninit::slice_as_mut_ptr(&mut buf);
700-
// SAFETY: At this point we wrote at most 38 bytes, pad up to that point,
701-
// There can only be at most 1 digit remaining.
702-
unsafe {
703-
ptr::write_bytes(buf_ptr.add(target), b'0', curr - target);
704-
curr = target - 1;
705-
*buf_ptr.add(curr) = (n as u8) + b'0';
706-
}
575+
// Optimize common-case zero, which would also need special treatment due to
576+
// its "leading" zero.
577+
if n == 0 {
578+
return f.pad_integral(true, "", "0");
579+
}
580+
581+
// u128::MAX has 39 significant decimals.
582+
const MAX_DEC_N: usize = 39;
583+
// Buffer decimals with right alignment.
584+
let mut buf = [MaybeUninit::<u8>::uninit(); MAX_DEC_N];
585+
// Count the number of bytes in buf that are not initialized.
586+
let mut offset = buf.len();
587+
588+
// Take the 16 least-significant decimals.
589+
let (n, mod_1e16) = div_rem_1e16(n);
590+
let mut remain = if n == 0 {
591+
mod_1e16
592+
} else {
593+
// write buf[23..39]
594+
enc_16lsd::<23>(&mut buf, mod_1e16);
595+
offset = 23;
596+
597+
// Take another 16 decimals.
598+
let (n, mod_1e16) = div_rem_1e16(n);
599+
if n == 0 {
600+
mod_1e16
601+
} else {
602+
// write buf[7..23]
603+
enc_16lsd::<7>(&mut buf, mod_1e16);
604+
offset = 7;
605+
606+
debug_assert!(n < 10);
607+
n as u64
707608
}
609+
};
610+
611+
// Format per four digits from the lookup table.
612+
while remain > 999 {
613+
// SAFETY: All of the decimals fit in buf due to MAX_DEC_N
614+
// and the while condition ensures at least 4 more decimals.
615+
unsafe { core::hint::assert_unchecked(offset >= 4) }
616+
// SAFETY: The offset counts down from its initial buf.len()
617+
// without underflow due to the previous precondition.
618+
unsafe { core::hint::assert_unchecked(offset <= buf.len()) }
619+
offset -= 4;
620+
621+
// pull two pairs
622+
let quad = remain % 1_00_00;
623+
remain /= 1_00_00;
624+
let pair1 = (quad / 100) as usize;
625+
let pair2 = (quad % 100) as usize;
626+
buf[offset + 0].write(DEC_DIGITS_LUT[pair1 * 2 + 0]);
627+
buf[offset + 1].write(DEC_DIGITS_LUT[pair1 * 2 + 1]);
628+
buf[offset + 2].write(DEC_DIGITS_LUT[pair2 * 2 + 0]);
629+
buf[offset + 3].write(DEC_DIGITS_LUT[pair2 * 2 + 1]);
630+
}
631+
632+
// Format per two digits from the lookup table.
633+
if remain > 9 {
634+
// SAFETY: All of the decimals fit in buf due to MAX_DEC_N
635+
// and the if condition ensures at least 2 more decimals.
636+
unsafe { core::hint::assert_unchecked(offset >= 2) }
637+
// SAFETY: The offset counts down from its initial buf.len()
638+
// without underflow due to the previous precondition.
639+
unsafe { core::hint::assert_unchecked(offset <= buf.len()) }
640+
offset -= 2;
641+
642+
let pair = (remain % 100) as usize;
643+
remain /= 100;
644+
buf[offset + 0].write(DEC_DIGITS_LUT[pair * 2 + 0]);
645+
buf[offset + 1].write(DEC_DIGITS_LUT[pair * 2 + 1]);
646+
}
647+
648+
// Format the last remaining digit, if any.
649+
if remain != 0 {
650+
// SAFETY: All of the decimals fit in buf due to MAX_DEC_N
651+
// and the if condition ensures (at least) 1 more decimal.
652+
unsafe { core::hint::assert_unchecked(offset >= 1) }
653+
// SAFETY: The offset counts down from its initial buf.len()
654+
// without underflow due to the previous precondition.
655+
unsafe { core::hint::assert_unchecked(offset <= buf.len()) }
656+
offset -= 1;
657+
658+
// Either the compiler sees that remain < 10, or it prevents
659+
// a boundary check up next.
660+
let last = (remain & 15) as usize;
661+
buf[offset].write(DEC_DIGITS_LUT[last * 2 + 1]);
662+
// not used: remain = 0;
708663
}
709664

710-
// SAFETY: `curr` > 0 (since we made `buf` large enough), and all the chars are valid
711-
// UTF-8 since `DEC_DIGITS_LUT` is
712-
let buf_slice = unsafe {
665+
// SAFETY: All buf content from offset onward has been written.
666+
let written = unsafe { buf.get_unchecked(offset..) };
667+
// SAFETY: Writes use ASCII from the lookup table exclusively.
668+
let as_str = unsafe {
713669
str::from_utf8_unchecked(slice::from_raw_parts(
714-
MaybeUninit::slice_as_mut_ptr(&mut buf).add(curr),
715-
buf.len() - curr,
670+
MaybeUninit::slice_as_ptr(written),
671+
written.len(),
716672
))
717673
};
718-
f.pad_integral(is_nonnegative, "", buf_slice)
674+
f.pad_integral(is_nonnegative, "", as_str)
719675
}
720676

721-
/// Partition of `n` into n > 1e19 and rem <= 1e19
677+
/// Encodes the 16 least significant decimals of n into buf.
678+
fn enc_16lsd<const OFFSET: usize>(buf: &mut [MaybeUninit<u8>; 39], n: u64) {
679+
// Consume the least-significant decimals from a working copy.
680+
let mut remain = n;
681+
682+
// Format per four digits from the lookup table.
683+
for quad_index in (0..4).rev() {
684+
// pull two pairs
685+
let quad = remain % 1_00_00;
686+
remain /= 1_00_00;
687+
let pair1 = (quad / 100) as usize;
688+
let pair2 = (quad % 100) as usize;
689+
buf[quad_index * 4 + OFFSET + 0].write(DEC_DIGITS_LUT[pair1 * 2 + 0]);
690+
buf[quad_index * 4 + OFFSET + 1].write(DEC_DIGITS_LUT[pair1 * 2 + 1]);
691+
buf[quad_index * 4 + OFFSET + 2].write(DEC_DIGITS_LUT[pair2 * 2 + 0]);
692+
buf[quad_index * 4 + OFFSET + 3].write(DEC_DIGITS_LUT[pair2 * 2 + 1]);
693+
}
694+
}
695+
696+
/// Euclidean division plus remainder with constant 1E16 basically consumes 16
697+
/// decimals from n.
722698
///
723-
/// Integer division algorithm is based on the following paper:
699+
/// The integer division algorithm is based on the following paper:
724700
///
725701
/// T. Granlund and P. Montgomery, “Division by Invariant Integers Using Multiplication”
726702
/// in Proc. of the SIGPLAN94 Conference on Programming Language Design and
727703
/// Implementation, 1994, pp. 61–72
728704
///
729-
fn udiv_1e19(n: u128) -> (u128, u64) {
730-
const DIV: u64 = 1e19 as u64;
731-
const FACTOR: u128 = 156927543384667019095894735580191660403;
705+
#[inline]
706+
fn div_rem_1e16(n: u128) -> (u128, u64) {
707+
const D: u128 = 1_0000_0000_0000_0000;
708+
if n < D {
709+
return (0, n as u64);
710+
}
732711

733-
let quot = if n < 1 << 83 {
734-
((n >> 19) as u64 / (DIV >> 19)) as u128
735-
} else {
736-
u128_mulhi(n, FACTOR) >> 62
737-
};
712+
// These constant values are computed with the CHOOSE_MULTIPLIER procedure.
713+
const M_HIGH: u128 = 76624777043294442917917351357515459181;
714+
const SH_POST: u8 = 51;
738715

739-
let rem = (n - quot * DIV as u128) as u64;
740-
(quot, rem)
716+
let quot = u128_mulhi(n, M_HIGH) >> SH_POST;
717+
let rem = n - quot * D;
718+
(quot, rem as u64)
741719
}
742720

743721
/// Multiply unsigned 128 bit integers, return upper 128 bits of the result
744-
#[inline]
745722
fn u128_mulhi(x: u128, y: u128) -> u128 {
746723
let x_lo = x as u64;
747724
let x_hi = (x >> 64) as u64;

0 commit comments

Comments
 (0)