Auto merge of #96551 - ferrocene:pa-ignore-paths-when-abbreviating, r=Mark-Simulacrum

bors · bors · commit 6609c6734de4 · 2022-06-06T05:21:49.000Z
[compiletest] Ignore known paths when abbreviating output To prevent out of memory conditions, compiletest limits the amount of output a test can generate, abbreviating it if the test emits more than a threshold. While the behavior is desirable, it also causes some issues (like #96229, #94322 and #92211). The latest one happened recently, when the `src/test/ui/numeric/numeric-cast.rs` test started to fail on systems where the path of the rust-lang/rust checkout is too long. This includes my own development machine and [LLVM's CI](#96362 (comment)). Rust's CI uses a pretty short directory name for the checkout, which hides these sort of problems until someone runs the test suite on their own computer. When developing the fix I tried to find the most targeted fix that would prevent this class of failures from happening in the future, deferring the decision on if/how to redesign abbreviation to a later date. The solution I came up with was to ignore known base paths when calculating whether the output exceeds the abbreviation threshold, which removes this kind of nondeterminism. This PR is best reviewed commit-by-commit.
diff --git a/src/tools/compiletest/src/read2.rs b/src/tools/compiletest/src/read2.rs
@@ -1,71 +1,24 @@
 // FIXME: This is a complete copy of `cargo/src/cargo/util/read2.rs`
 // Consider unify the read2() in libstd, cargo and this to prevent further code duplication.
 
+#[cfg(test)]
+mod tests;
+
 pub use self::imp::read2;
-use std::io;
+use std::io::{self, Write};
+use std::mem::replace;
 use std::process::{Child, Output};
 
-pub fn read2_abbreviated(mut child: Child) -> io::Result<Output> {
-    use io::Write;
-    use std::mem::replace;
-
-    const HEAD_LEN: usize = 160 * 1024;
-    const TAIL_LEN: usize = 256 * 1024;
-
-    enum ProcOutput {
-        Full(Vec<u8>),
-        Abbreviated { head: Vec<u8>, skipped: usize, tail: Box<[u8]> },
-    }
-
-    impl ProcOutput {
-        fn extend(&mut self, data: &[u8]) {
-            let new_self = match *self {
-                ProcOutput::Full(ref mut bytes) => {
-                    bytes.extend_from_slice(data);
-                    let new_len = bytes.len();
-                    if new_len <= HEAD_LEN + TAIL_LEN {
-                        return;
-                    }
-                    let tail = bytes.split_off(new_len - TAIL_LEN).into_boxed_slice();
-                    let head = replace(bytes, Vec::new());
-                    let skipped = new_len - HEAD_LEN - TAIL_LEN;
-                    ProcOutput::Abbreviated { head, skipped, tail }
-                }
-                ProcOutput::Abbreviated { ref mut skipped, ref mut tail, .. } => {
-                    *skipped += data.len();
-                    if data.len() <= TAIL_LEN {
-                        tail[..data.len()].copy_from_slice(data);
-                        tail.rotate_left(data.len());
-                    } else {
-                        tail.copy_from_slice(&data[(data.len() - TAIL_LEN)..]);
-                    }
-                    return;
-                }
-            };
-            *self = new_self;
-        }
-
-        fn into_bytes(self) -> Vec<u8> {
-            match self {
-                ProcOutput::Full(bytes) => bytes,
-                ProcOutput::Abbreviated { mut head, skipped, tail } => {
-                    write!(&mut head, "\n\n<<<<<< SKIPPED {} BYTES >>>>>>\n\n", skipped).unwrap();
-                    head.extend_from_slice(&tail);
-                    head
-                }
-            }
-        }
-    }
-
-    let mut stdout = ProcOutput::Full(Vec::new());
-    let mut stderr = ProcOutput::Full(Vec::new());
+pub fn read2_abbreviated(mut child: Child, filter_paths_from_len: &[String]) -> io::Result<Output> {
+    let mut stdout = ProcOutput::new();
+    let mut stderr = ProcOutput::new();
 
     drop(child.stdin.take());
     read2(
         child.stdout.take().unwrap(),
         child.stderr.take().unwrap(),
         &mut |is_stdout, data, _| {
-            if is_stdout { &mut stdout } else { &mut stderr }.extend(data);
+            if is_stdout { &mut stdout } else { &mut stderr }.extend(data, filter_paths_from_len);
             data.clear();
         },
     )?;
@@ -74,6 +27,98 @@ pub fn read2_abbreviated(mut child: Child) -> io::Result<Output> {
     Ok(Output { status, stdout: stdout.into_bytes(), stderr: stderr.into_bytes() })
 }
 
+const HEAD_LEN: usize = 160 * 1024;
+const TAIL_LEN: usize = 256 * 1024;
+
+// Whenever a path is filtered when counting the length of the output, we need to add some
+// placeholder length to ensure a compiler emitting only filtered paths doesn't cause a OOM.
+//
+// 32 was chosen semi-arbitrarily: it was the highest power of two that still allowed the test
+// suite to pass at the moment of implementing path filtering.
+const FILTERED_PATHS_PLACEHOLDER_LEN: usize = 32;
+
+enum ProcOutput {
+    Full { bytes: Vec<u8>, filtered_len: usize },
+    Abbreviated { head: Vec<u8>, skipped: usize, tail: Box<[u8]> },
+}
+
+impl ProcOutput {
+    fn new() -> Self {
+        ProcOutput::Full { bytes: Vec::new(), filtered_len: 0 }
+    }
+
+    fn extend(&mut self, data: &[u8], filter_paths_from_len: &[String]) {
+        let new_self = match *self {
+            ProcOutput::Full { ref mut bytes, ref mut filtered_len } => {
+                let old_len = bytes.len();
+                bytes.extend_from_slice(data);
+                *filtered_len += data.len();
+
+                // We had problems in the past with tests failing only in some environments,
+                // due to the length of the base path pushing the output size over the limit.
+                //
+                // To make those failures deterministic across all environments we ignore known
+                // paths when calculating the string length, while still including the full
+                // path in the output. This could result in some output being larger than the
+                // threshold, but it's better than having nondeterministic failures.
+                //
+                // The compiler emitting only excluded strings is addressed by adding a
+                // placeholder size for each excluded segment, which will eventually reach
+                // the configured threshold.
+                for path in filter_paths_from_len {
+                    let path_bytes = path.as_bytes();
+                    // We start matching `path_bytes - 1` into the previously loaded data,
+                    // to account for the fact a path_bytes might be included across multiple
+                    // `extend` calls. Starting from `- 1` avoids double-counting paths.
+                    let matches = (&bytes[(old_len.saturating_sub(path_bytes.len() - 1))..])
+                        .windows(path_bytes.len())
+                        .filter(|window| window == &path_bytes)
+                        .count();
+                    *filtered_len -= matches * path_bytes.len();
+
+                    // We can't just remove the length of the filtered path from the output lenght,
+                    // otherwise a compiler emitting only filtered paths would OOM compiletest. Add
+                    // a fixed placeholder length for each path to prevent that.
+                    *filtered_len += matches * FILTERED_PATHS_PLACEHOLDER_LEN;
+                }
+
+                let new_len = bytes.len();
+                if *filtered_len <= HEAD_LEN + TAIL_LEN {
+                    return;
+                }
+
+                let mut head = replace(bytes, Vec::new());
+                let mut middle = head.split_off(HEAD_LEN);
+                let tail = middle.split_off(middle.len() - TAIL_LEN).into_boxed_slice();
+                let skipped = new_len - HEAD_LEN - TAIL_LEN;
+                ProcOutput::Abbreviated { head, skipped, tail }
+            }
+            ProcOutput::Abbreviated { ref mut skipped, ref mut tail, .. } => {
+                *skipped += data.len();
+                if data.len() <= TAIL_LEN {
+                    tail[..data.len()].copy_from_slice(data);
+                    tail.rotate_left(data.len());
+                } else {
+                    tail.copy_from_slice(&data[(data.len() - TAIL_LEN)..]);
+                }
+                return;
+            }
+        };
+        *self = new_self;
+    }
+
+    fn into_bytes(self) -> Vec<u8> {
+        match self {
+            ProcOutput::Full { bytes, .. } => bytes,
+            ProcOutput::Abbreviated { mut head, skipped, tail } => {
+                write!(&mut head, "\n\n<<<<<< SKIPPED {} BYTES >>>>>>\n\n", skipped).unwrap();
+                head.extend_from_slice(&tail);
+                head
+            }
+        }
+    }
+}
+
 #[cfg(not(any(unix, windows)))]
 mod imp {
     use std::io::{self, Read};
diff --git a/src/tools/compiletest/src/read2/tests.rs b/src/tools/compiletest/src/read2/tests.rs
@@ -0,0 +1,123 @@
+use crate::read2::{ProcOutput, FILTERED_PATHS_PLACEHOLDER_LEN, HEAD_LEN, TAIL_LEN};
+
+#[test]
+fn test_abbreviate_short_string() {
+    let mut out = ProcOutput::new();
+    out.extend(b"Hello world!", &[]);
+    assert_eq!(b"Hello world!", &*out.into_bytes());
+}
+
+#[test]
+fn test_abbreviate_short_string_multiple_steps() {
+    let mut out = ProcOutput::new();
+
+    out.extend(b"Hello ", &[]);
+    out.extend(b"world!", &[]);
+
+    assert_eq!(b"Hello world!", &*out.into_bytes());
+}
+
+#[test]
+fn test_abbreviate_long_string() {
+    let mut out = ProcOutput::new();
+
+    let data = vec![b'.'; HEAD_LEN + TAIL_LEN + 16];
+    out.extend(&data, &[]);
+
+    let mut expected = vec![b'.'; HEAD_LEN];
+    expected.extend_from_slice(b"\n\n<<<<<< SKIPPED 16 BYTES >>>>>>\n\n");
+    expected.extend_from_slice(&vec![b'.'; TAIL_LEN]);
+
+    // We first check the length to avoid endless terminal output if the length differs, since
+    // `out` is hundreds of KBs in size.
+    let out = out.into_bytes();
+    assert_eq!(expected.len(), out.len());
+    assert_eq!(expected, out);
+}
+
+#[test]
+fn test_abbreviate_long_string_multiple_steps() {
+    let mut out = ProcOutput::new();
+
+    out.extend(&vec![b'.'; HEAD_LEN], &[]);
+    out.extend(&vec![b'.'; TAIL_LEN], &[]);
+    // Also test whether the rotation works
+    out.extend(&vec![b'!'; 16], &[]);
+    out.extend(&vec![b'?'; 16], &[]);
+
+    let mut expected = vec![b'.'; HEAD_LEN];
+    expected.extend_from_slice(b"\n\n<<<<<< SKIPPED 32 BYTES >>>>>>\n\n");
+    expected.extend_from_slice(&vec![b'.'; TAIL_LEN - 32]);
+    expected.extend_from_slice(&vec![b'!'; 16]);
+    expected.extend_from_slice(&vec![b'?'; 16]);
+
+    // We first check the length to avoid endless terminal output if the length differs, since
+    // `out` is hundreds of KBs in size.
+    let out = out.into_bytes();
+    assert_eq!(expected.len(), out.len());
+    assert_eq!(expected, out);
+}
+
+#[test]
+fn test_abbreviate_filterss_are_detected() {
+    let mut out = ProcOutput::new();
+    let filters = &["foo".to_string(), "quux".to_string()];
+
+    out.extend(b"Hello foo", filters);
+    // Check items from a previous extension are not double-counted.
+    out.extend(b"! This is a qu", filters);
+    // Check items are detected across extensions.
+    out.extend(b"ux.", filters);
+
+    match &out {
+        ProcOutput::Full { bytes, filtered_len } => assert_eq!(
+            *filtered_len,
+            bytes.len() + FILTERED_PATHS_PLACEHOLDER_LEN * filters.len()
+                - filters.iter().map(|i| i.len()).sum::<usize>()
+        ),
+        ProcOutput::Abbreviated { .. } => panic!("out should not be abbreviated"),
+    }
+
+    assert_eq!(b"Hello foo! This is a quux.", &*out.into_bytes());
+}
+
+#[test]
+fn test_abbreviate_filters_avoid_abbreviations() {
+    let mut out = ProcOutput::new();
+    let filters = &[std::iter::repeat('a').take(64).collect::<String>()];
+
+    let mut expected = vec![b'.'; HEAD_LEN - FILTERED_PATHS_PLACEHOLDER_LEN as usize];
+    expected.extend_from_slice(filters[0].as_bytes());
+    expected.extend_from_slice(&vec![b'.'; TAIL_LEN]);
+
+    out.extend(&expected, filters);
+
+    // We first check the length to avoid endless terminal output if the length differs, since
+    // `out` is hundreds of KBs in size.
+    let out = out.into_bytes();
+    assert_eq!(expected.len(), out.len());
+    assert_eq!(expected, out);
+}
+
+#[test]
+fn test_abbreviate_filters_can_still_cause_abbreviations() {
+    let mut out = ProcOutput::new();
+    let filters = &[std::iter::repeat('a').take(64).collect::<String>()];
+
+    let mut input = vec![b'.'; HEAD_LEN];
+    input.extend_from_slice(&vec![b'.'; TAIL_LEN]);
+    input.extend_from_slice(filters[0].as_bytes());
+
+    let mut expected = vec![b'.'; HEAD_LEN];
+    expected.extend_from_slice(b"\n\n<<<<<< SKIPPED 64 BYTES >>>>>>\n\n");
+    expected.extend_from_slice(&vec![b'.'; TAIL_LEN - 64]);
+    expected.extend_from_slice(&vec![b'a'; 64]);
+
+    out.extend(&input, filters);
+
+    // We first check the length to avoid endless terminal output if the length differs, since
+    // `out` is hundreds of KBs in size.
+    let out = out.into_bytes();
+    assert_eq!(expected.len(), out.len());
+    assert_eq!(expected, out);
+}
diff --git a/src/tools/compiletest/src/runtest.rs b/src/tools/compiletest/src/runtest.rs
@@ -28,7 +28,7 @@ use std::hash::{Hash, Hasher};
 use std::io::prelude::*;
 use std::io::{self, BufReader};
 use std::path::{Path, PathBuf};
-use std::process::{Command, ExitStatus, Output, Stdio};
+use std::process::{Child, Command, ExitStatus, Output, Stdio};
 use std::str;
 
 use glob::glob;
@@ -1745,6 +1745,28 @@ impl<'test> TestCx<'test> {
         dylib
     }
 
+    fn read2_abbreviated(&self, child: Child) -> Output {
+        let mut filter_paths_from_len = Vec::new();
+        let mut add_path = |path: &Path| {
+            let path = path.display().to_string();
+            let windows = path.replace("\\", "\\\\");
+            if windows != path {
+                filter_paths_from_len.push(windows);
+            }
+            filter_paths_from_len.push(path);
+        };
+
+        // List of paths that will not be measured when determining whether the output is larger
+        // than the output truncation threshold.
+        //
+        // Note: avoid adding a subdirectory of an already filtered directory here, otherwise the
+        // same slice of text will be double counted and the truncation might not happen.
+        add_path(&self.config.src_base);
+        add_path(&self.config.build_base);
+
+        read2_abbreviated(child, &filter_paths_from_len).expect("failed to read output")
+    }
+
     fn compose_and_run(
         &self,
         mut command: Command,
@@ -1779,8 +1801,7 @@ impl<'test> TestCx<'test> {
             child.stdin.as_mut().unwrap().write_all(input.as_bytes()).unwrap();
         }
 
-        let Output { status, stdout, stderr } =
-            read2_abbreviated(child).expect("failed to read output");
+        let Output { status, stdout, stderr } = self.read2_abbreviated(child);
 
         let result = ProcRes {
             status,
@@ -2969,7 +2990,7 @@ impl<'test> TestCx<'test> {
             }
         }
 
-        let output = cmd.spawn().and_then(read2_abbreviated).expect("failed to spawn `make`");
+        let output = self.read2_abbreviated(cmd.spawn().expect("failed to spawn `make`"));
         if !output.status.success() {
             let res = ProcRes {
                 status: output.status,