- 
                Notifications
    You must be signed in to change notification settings 
- Fork 13.9k
Closed
Labels
A-incr-compArea: Incremental compilationArea: Incremental compilationC-enhancementCategory: An issue proposing an enhancement or a PR with one.Category: An issue proposing an enhancement or a PR with one.I-compiletimeIssue: Problems and improvements with respect to compile times.Issue: Problems and improvements with respect to compile times.T-compilerRelevant to the compiler team, which will review and decide on the PR/issue.Relevant to the compiler team, which will review and decide on the PR/issue.WG-compiler-performanceWorking group: Compiler PerformanceWorking group: Compiler Performance
Description
Here's Cachegrind data for clap-rs from rustc-benchmarks on a "base incremental" "check" build (i.e. the first incremental build):
--------------------------------------------------------------------------------
            Ir
--------------------------------------------------------------------------------
39,792,587,805  PROGRAM TOTALS
6,008,282,218  librustc_data_structures/sip128.rs:rustc_data_structures::sip128::SipHasher128::short_write
1,655,965,059  libcore/num/mod.rs:rustc_data_structures::sip128::SipHasher128::short_write
  319,188,771  libcore/cmp.rs:rustc_data_structures::sip128::SipHasher128::short_write
That's over 20% of the instructions under short_write.
Here are the annotations for the hot pieces of code in librustc_data_structures/sip128.rs.
          .  macro_rules! compress {
          .      ($state:expr) => ({
967,427,173          compress!($state.v0, $state.v1, $state.v2, $state.v3)
          .      });
          .      ($v0:expr, $v1:expr, $v2:expr, $v3:expr) =>
          .      ({
          .          $v0 = $v0.wrapping_add($v1); $v1 = $v1.rotate_left(13); $v1 ^= $v0;
          .          $v0 = $v0.rotate_left(32);
          .          $v2 = $v2.wrapping_add($v3); $v3 = $v3.rotate_left(16); $v3 ^= $v2;
          .          $v0 = $v0.wrapping_add($v3); $v3 = $v3.rotate_left(21); $v3 ^= $v0;
          .          $v2 = $v2.wrapping_add($v1); $v1 = $v1.rotate_left(17); $v1 ^= $v2;
 79,051,764          $v2 = $v2.rotate_left(32);
          .      });
          .  }
          .
          .  /// Load an integer of the desired type from a byte stream, in LE order. Uses
          .  /// `copy_nonoverlapping` to let the compiler generate the most efficient way
          .  /// to load it from a possibly unaligned address.
          .  ///
          .  /// Unsafe because: unchecked indexing at i..i+size_of(int_ty)
          .  macro_rules! load_int_le {
          .      ($buf:expr, $i:expr, $int_ty:ident) =>
          .      ({
          .         debug_assert!($i + mem::size_of::<$int_ty>() <= $buf.len());
          .         let mut data = 0 as $int_ty;
 13,166,995         ptr::copy_nonoverlapping($buf.get_unchecked($i),
          .                                  &mut data as *mut _ as *mut u8,
          .                                  mem::size_of::<$int_ty>());
          .         data.to_le()
          .      });
          .  }
          .
          .  /// Load an u64 using up to 7 bytes of a byte slice.
          .  ///
          .  /// Unsafe because: unchecked indexing at start..start+len
          .  #[inline]
          .  unsafe fn u8to64_le(buf: &[u8], start: usize, len: usize) -> u64 {
          .      debug_assert!(len < 8);
          .      let mut i = 0; // current byte index (from LSB) in the output u64
          .      let mut out = 0;
345,951,546      if i + 3 < len {
 80,328,075          out = load_int_le!(buf, start + i, u32) as u64;
          .          i += 4;
          .      }
747,722,073      if i + 1 < len {
480,400,389          out |= (load_int_le!(buf, start + i, u16) as u64) << (i * 8);
 87,344,805          i += 2
          .      }
345,951,546      if i < len {
211,374,738          out |= (*buf.get_unchecked(start + i) as u64) << (i * 8);
          .          i += 1;
          .      }
          .      debug_assert_eq!(i, len);
          .      out
          .  }
and
          .      #[inline]
          .      fn short_write(&mut self, msg: &[u8]) {
          .          debug_assert!(msg.len() <= 8);
          .          let length = msg.len();
212,792,515          self.length += length;
          .
319,188,774          let needed = 8 - self.ntail;
          .          let fill = cmp::min(length, needed);
212,792,515          if fill == 8 {
 38,954,670              self.tail = unsafe { load_int_le!(msg, 0, u64) };
          .          } else {
560,468,202              self.tail |= unsafe { u8to64_le(msg, 0, fill) } << (8 * self.ntail);
186,822,734              if length < needed {
 55,081,556                  self.ntail += length;
          .                  return;
          .              }
          .          }
 78,855,480          self.state.v3 ^= self.tail;
          .          Sip24Rounds::c_rounds(&mut self.state);
157,710,960          self.state.v0 ^= self.tail;
          .
          .          // Buffered tail is now flushed, process new input.
157,710,958          self.ntail = length - needed;
 78,855,480          self.tail = unsafe { u8to64_le(msg, needed, self.ntail) };
212,792,514      }
And from libcore/num/mod.rs:
            .          #[inline]
            .          pub fn rotate_left(self, n: u32) -> Self {
            .              // Protect against undefined behaviour for over-long bit shifts
            .              let n = n % $BITS;
1,187,178,937              (self << n) | (self >> (($BITS - n) % $BITS))
            .          }
I stared at this for a while but wasn't able to come up with any notable improvements.
Hashing is the hottest part of most incremental check builds. If we can't speed up this code, perhaps we could use a different hasher, or find a way to hash less data.
Metadata
Metadata
Assignees
Labels
A-incr-compArea: Incremental compilationArea: Incremental compilationC-enhancementCategory: An issue proposing an enhancement or a PR with one.Category: An issue proposing an enhancement or a PR with one.I-compiletimeIssue: Problems and improvements with respect to compile times.Issue: Problems and improvements with respect to compile times.T-compilerRelevant to the compiler team, which will review and decide on the PR/issue.Relevant to the compiler team, which will review and decide on the PR/issue.WG-compiler-performanceWorking group: Compiler PerformanceWorking group: Compiler Performance