Skip to content

Commit c7620ca

Browse files
committed
Auto merge of #128200 - estebank:normalize-whitespace, r=<try>
Change output normalization logic to be linear against size of output. I believe the previous code was accidentally quadratic. Let's perf it.
2 parents 48bbe12 + e35d147 commit c7620ca

File tree

3 files changed

+54
-46
lines changed

3 files changed

+54
-46
lines changed

Cargo.lock

+1
Original file line numberDiff line numberDiff line change
@@ -3873,6 +3873,7 @@ version = "0.0.0"
38733873
dependencies = [
38743874
"annotate-snippets 0.10.2",
38753875
"derive_setters",
3876+
"either",
38763877
"rustc_ast",
38773878
"rustc_ast_pretty",
38783879
"rustc_data_structures",

compiler/rustc_errors/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ edition = "2021"
77
# tidy-alphabetical-start
88
annotate-snippets = "0.10"
99
derive_setters = "0.1.6"
10+
either = "1.5.0"
1011
rustc_ast = { path = "../rustc_ast" }
1112
rustc_ast_pretty = { path = "../rustc_ast_pretty" }
1213
rustc_data_structures = { path = "../rustc_data_structures" }

compiler/rustc_errors/src/emitter.rs

+52-46
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ use crate::{
2121
SuggestionStyle, TerminalUrl,
2222
};
2323
use derive_setters::Setters;
24+
use either::Either;
2425
use rustc_data_structures::fx::{FxHashMap, FxIndexMap, FxIndexSet};
2526
use rustc_data_structures::sync::{DynSend, IntoDynSyncSend, Lrc};
2627
use rustc_error_messages::{FluentArgs, SpanLabel};
@@ -2559,60 +2560,65 @@ fn num_decimal_digits(num: usize) -> usize {
25592560

25602561
// We replace some characters so the CLI output is always consistent and underlines aligned.
25612562
// Keep the following list in sync with `rustc_span::char_width`.
2563+
// ATTENTION: keep lexicographically sorted so that the binary search will work
25622564
const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[
2563-
('\t', " "), // We do our own tab replacement
2564-
('\u{200D}', ""), // Replace ZWJ with nothing for consistent terminal output of grapheme clusters.
2565-
('\u{202A}', "�"), // The following unicode text flow control characters are inconsistently
2566-
('\u{202B}', "�"), // supported across CLIs and can cause confusion due to the bytes on disk
2567-
('\u{202D}', "�"), // not corresponding to the visible source code, so we replace them always.
2568-
('\u{202E}', "�"),
2565+
// In terminals without Unicode support the following will be garbled, but in *all* terminals
2566+
// the underlying codepoint will be as well. We could gate this replacement behind a "unicode
2567+
// support" gate.
2568+
('\0', "␀"),
2569+
('\u{1}', "␁"),
2570+
('\u{2}', "␂"),
2571+
('\u{3}', "␃"),
2572+
('\u{4}', "␄"),
2573+
('\u{5}', "␅"),
2574+
('\u{6}', "␆"),
2575+
('\u{7}', "␇"),
2576+
('\u{8}', "␈"),
2577+
('\t', " "), // We do our own tab replacement
2578+
('\u{b}', "␋"),
2579+
('\u{c}', "␌"),
2580+
('\r', "␍"),
2581+
('\u{e}', "␎"),
2582+
('\u{f}', "␏"),
2583+
('\u{10}', "␐"),
2584+
('\u{11}', "␑"),
2585+
('\u{12}', "␒"),
2586+
('\u{13}', "␓"),
2587+
('\u{14}', "␔"),
2588+
('\u{15}', "␕"),
2589+
('\u{16}', "␖"),
2590+
('\u{17}', "␗"),
2591+
('\u{18}', "␘"),
2592+
('\u{19}', "␙"),
2593+
('\u{1a}', "␚"),
2594+
('\u{1b}', "␛"),
2595+
('\u{1c}', "␜"),
2596+
('\u{1d}', "␝"),
2597+
('\u{1e}', "␞"),
2598+
('\u{1f}', "␟"),
2599+
('\u{7f}', "␡"),
2600+
('\u{200d}', ""), // Replace ZWJ for consistent terminal output of grapheme clusters.
2601+
('\u{202a}', "�"), // The following unicode text flow control characters are inconsistently
2602+
('\u{202b}', "�"), // supported across CLIs and can cause confusion due to the bytes on disk
2603+
('\u{202c}', "�"), // not corresponding to the visible source code, so we replace them always.
2604+
('\u{202d}', "�"),
2605+
('\u{202e}', "�"),
25692606
('\u{2066}', "�"),
25702607
('\u{2067}', "�"),
25712608
('\u{2068}', "�"),
2572-
('\u{202C}', "�"),
25732609
('\u{2069}', "�"),
2574-
// In terminals without Unicode support the following will be garbled, but in *all* terminals
2575-
// the underlying codepoint will be as well. We could gate this replacement behind a "unicode
2576-
// support" gate.
2577-
('\u{0000}', "␀"),
2578-
('\u{0001}', "␁"),
2579-
('\u{0002}', "␂"),
2580-
('\u{0003}', "␃"),
2581-
('\u{0004}', "␄"),
2582-
('\u{0005}', "␅"),
2583-
('\u{0006}', "␆"),
2584-
('\u{0007}', "␇"),
2585-
('\u{0008}', "␈"),
2586-
('\u{000B}', "␋"),
2587-
('\u{000C}', "␌"),
2588-
('\u{000D}', "␍"),
2589-
('\u{000E}', "␎"),
2590-
('\u{000F}', "␏"),
2591-
('\u{0010}', "␐"),
2592-
('\u{0011}', "␑"),
2593-
('\u{0012}', "␒"),
2594-
('\u{0013}', "␓"),
2595-
('\u{0014}', "␔"),
2596-
('\u{0015}', "␕"),
2597-
('\u{0016}', "␖"),
2598-
('\u{0017}', "␗"),
2599-
('\u{0018}', "␘"),
2600-
('\u{0019}', "␙"),
2601-
('\u{001A}', "␚"),
2602-
('\u{001B}', "␛"),
2603-
('\u{001C}', "␜"),
2604-
('\u{001D}', "␝"),
2605-
('\u{001E}', "␞"),
2606-
('\u{001F}', "␟"),
2607-
('\u{007F}', "␡"),
26082610
];
26092611

26102612
fn normalize_whitespace(str: &str) -> String {
2611-
let mut s = str.to_string();
2612-
for (c, replacement) in OUTPUT_REPLACEMENTS {
2613-
s = s.replace(*c, replacement);
2614-
}
2615-
s
2613+
// Scan the input string for a character in the ordered table above. If it's present, replace
2614+
// it with its alternative string (it can be more than 1 char!). Otherwise, retain the input
2615+
// char. At the end, allocate all chars into a string in one operation.
2616+
str.chars()
2617+
.flat_map(|c| match OUTPUT_REPLACEMENTS.binary_search_by_key(&c, |(k, _)| *k) {
2618+
Ok(i) => Either::Left(OUTPUT_REPLACEMENTS[i].1.chars()),
2619+
_ => Either::Right([c].into_iter()),
2620+
})
2621+
.collect()
26162622
}
26172623

26182624
fn draw_col_separator(buffer: &mut StyledBuffer, line: usize, col: usize) {

0 commit comments

Comments
 (0)