Auto merge of #37427 - nnethercote:opt-IchHasher, r=michaelwoerister

bors · web-flow · commit 08839965f96e · 2016-11-05T01:10:57.000-07:00
Reduce the number of bytes hashed by IchHasher.

IchHasher uses blake2b hashing, which is expensive, so the fewer bytes hashed
the better. There are two big ways to reduce the number of bytes hashed.
- Filenames in spans account for ~66% of all bytes (for builds with debuginfo).
  The vast majority of spans have the same filename for the start of the span
  and the end of the span, so hashing the filename just once in those cases is
  a big win.
- u32 and u64 and usize values account for ~25%--33% of all bytes (for builds
  with debuginfo). The vast majority of these are small, i.e. fit in a u8, so
  shrinking them down before hashing is also a big win.

This PR implements these two optimizations. I'm certain the first one is safe.
I'm about 90% sure that the second one is safe.

Here are measurements of the number of bytes hashed when doing
debuginfo-enabled builds of stdlib and
rustc-benchmarks/syntex-0.42.2-incr-clean.

```
                    stdlib   syntex-incr
                    ------   -----------
original       156,781,386   255,095,596
half-SawSpan   106,744,403   176,345,419
short-ints      45,890,534   118,014,227
no-SawSpan[*]    6,831,874    45,875,714

[*] don't hash the SawSpan at all. Not part of this PR, just implemented for
    comparison's sake.
```

For debug builds of syntex-0.42.2-incr-clean, the two changes give a 1--2%
speed-up.
diff --git a/src/librustc_incremental/calculate_svh/hasher.rs b/src/librustc_incremental/calculate_svh/hasher.rs
@@ -9,13 +9,16 @@
 // except according to those terms.
 
 use std::mem;
+use std::hash::Hasher;
 use rustc_data_structures::blake2b::Blake2bHasher;
 use rustc::ty::util::ArchIndependentHasher;
 use ich::Fingerprint;
+use rustc_serialize::leb128::write_unsigned_leb128;
 
 #[derive(Debug)]
 pub struct IchHasher {
     state: ArchIndependentHasher<Blake2bHasher>,
+    leb128_helper: Vec<u8>,
     bytes_hashed: u64,
 }
 
@@ -24,6 +27,7 @@ impl IchHasher {
         let hash_size = mem::size_of::<Fingerprint>();
         IchHasher {
             state: ArchIndependentHasher::new(Blake2bHasher::new(hash_size, &[])),
+            leb128_helper: vec![],
             bytes_hashed: 0
         }
     }
@@ -37,9 +41,19 @@ impl IchHasher {
         fingerprint.0.copy_from_slice(self.state.into_inner().finalize());
         fingerprint
     }
+
+    #[inline]
+    fn write_uleb128(&mut self, value: u64) {
+        let len = write_unsigned_leb128(&mut self.leb128_helper, 0, value);
+        self.state.write(&self.leb128_helper[0..len]);
+        self.bytes_hashed += len as u64;
+    }
 }
 
-impl ::std::hash::Hasher for IchHasher {
+// For the non-u8 integer cases we leb128 encode them first. Because small
+// integers dominate, this significantly and cheaply reduces the number of
+// bytes hashed, which is good because blake2b is expensive.
+impl Hasher for IchHasher {
     fn finish(&self) -> u64 {
         bug!("Use other finish() implementation to get the full 128-bit hash.");
     }
@@ -49,4 +63,26 @@ impl ::std::hash::Hasher for IchHasher {
         self.state.write(bytes);
         self.bytes_hashed += bytes.len() as u64;
     }
+
+    // There is no need to leb128-encode u8 values.
+
+    #[inline]
+    fn write_u16(&mut self, i: u16) {
+        self.write_uleb128(i as u64);
+    }
+
+    #[inline]
+    fn write_u32(&mut self, i: u32) {
+        self.write_uleb128(i as u64);
+    }
+
+    #[inline]
+    fn write_u64(&mut self, i: u64) {
+        self.write_uleb128(i);
+    }
+
+    #[inline]
+    fn write_usize(&mut self, i: usize) {
+        self.write_uleb128(i as u64);
+    }
 }
diff --git a/src/librustc_incremental/calculate_svh/svh_visitor.rs b/src/librustc_incremental/calculate_svh/svh_visitor.rs
@@ -88,6 +88,8 @@ impl<'a, 'hash, 'tcx> StrictVersionHashVisitor<'a, 'hash, 'tcx> {
     // within the CodeMap.
     // Also note that we are hashing byte offsets for the column, not unicode
     // codepoint offsets. For the purpose of the hash that's sufficient.
+    // Also, hashing filenames is expensive so we avoid doing it twice when the
+    // span starts and ends in the same file, which is almost always the case.
     fn hash_span(&mut self, span: Span) {
         debug!("hash_span: st={:?}", self.st);
 
@@ -103,21 +105,35 @@ impl<'a, 'hash, 'tcx> StrictVersionHashVisitor<'a, 'hash, 'tcx> {
             span.hi
         };
 
-        let loc1 = self.codemap.byte_pos_to_line_and_col(span.lo);
-        let loc2 = self.codemap.byte_pos_to_line_and_col(span_hi);
-
-        let expansion_kind = match span.expn_id {
+        let expn_kind = match span.expn_id {
             NO_EXPANSION => SawSpanExpnKind::NoExpansion,
             COMMAND_LINE_EXPN => SawSpanExpnKind::CommandLine,
             _ => SawSpanExpnKind::SomeExpansion,
         };
 
-        SawSpan(loc1.as_ref().map(|&(ref fm, line, col)| (&fm.name[..], line, col)),
-                loc2.as_ref().map(|&(ref fm, line, col)| (&fm.name[..], line, col)),
-                expansion_kind)
-            .hash(self.st);
+        let loc1 = self.codemap.byte_pos_to_line_and_col(span.lo);
+        let loc1 = loc1.as_ref()
+                       .map(|&(ref fm, line, col)| (&fm.name[..], line, col))
+                       .unwrap_or(("???", 0, BytePos(0)));
+
+        let loc2 = self.codemap.byte_pos_to_line_and_col(span_hi);
+        let loc2 = loc2.as_ref()
+                       .map(|&(ref fm, line, col)| (&fm.name[..], line, col))
+                       .unwrap_or(("???", 0, BytePos(0)));
+
+        let saw = if loc1.0 == loc2.0 {
+            SawSpan(loc1.0,
+                    loc1.1, loc1.2,
+                    loc2.1, loc2.2,
+                    expn_kind)
+        } else {
+            SawSpanTwoFiles(loc1.0, loc1.1, loc1.2,
+                            loc2.0, loc2.1, loc2.2,
+                            expn_kind)
+        };
+        saw.hash(self.st);
 
-        if expansion_kind == SawSpanExpnKind::SomeExpansion {
+        if expn_kind == SawSpanExpnKind::SomeExpansion {
             let call_site = self.codemap.codemap().source_callsite(span);
             self.hash_span(call_site);
         }
@@ -189,9 +205,13 @@ enum SawAbiComponent<'a> {
     SawAssocTypeBinding,
     SawAttribute(ast::AttrStyle),
     SawMacroDef,
-    SawSpan(Option<(&'a str, usize, BytePos)>,
-            Option<(&'a str, usize, BytePos)>,
+    SawSpan(&'a str,
+            usize, BytePos,
+            usize, BytePos,
             SawSpanExpnKind),
+    SawSpanTwoFiles(&'a str, usize, BytePos,
+                    &'a str, usize, BytePos,
+                    SawSpanExpnKind),
 }
 
 /// SawExprComponent carries all of the information that we want