Skip to content

Commit 1de00dc

Browse files
committed
Auto merge of rust-lang#128463 - GrigorenkoPV:perfect-hash, r=<try>
rustc_errors: use perfect hashing for character replacements The correctness of code in rust-lang#128200 relies on an array being sorted (so that it can be used in binary search later), which is currently enforced with `// tidy-alphabetical` (and characters being written in `\u{XXXX}` form), as well as on the absence of duplicate entries with conflicting keys, which is not currently enforced. A const assert or a test can be added checking that (implemented in rust-lang#128465). But this PR tries to use [perfect hashing](https://en.wikipedia.org/wiki/Perfect_hash_function) instead. The performance implications are unclear. Asymptotically it's faster, but in reality we should just benchmark. Plus, if there are no significant performance wins, this entire thing is probably not even worth the additional dependencies it brings. UPD: funnily enough, there's a PR optimizing the binary search implementation (rust-lang#128254) in the queue right now. So I guess we have to wait until that is merged too before benchmarking this.
2 parents 60d1465 + 4108ac4 commit 1de00dc

File tree

4 files changed

+70
-53
lines changed

4 files changed

+70
-53
lines changed

Cargo.lock

+15
Original file line numberDiff line numberDiff line change
@@ -2712,6 +2712,7 @@ version = "0.11.2"
27122712
source = "registry+https://github.com/rust-lang/crates.io-index"
27132713
checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc"
27142714
dependencies = [
2715+
"phf_macros",
27152716
"phf_shared 0.11.2",
27162717
]
27172718

@@ -2745,6 +2746,19 @@ dependencies = [
27452746
"rand",
27462747
]
27472748

2749+
[[package]]
2750+
name = "phf_macros"
2751+
version = "0.11.2"
2752+
source = "registry+https://github.com/rust-lang/crates.io-index"
2753+
checksum = "3444646e286606587e49f3bcf1679b8cef1dc2c5ecc29ddacaffc305180d464b"
2754+
dependencies = [
2755+
"phf_generator 0.11.2",
2756+
"phf_shared 0.11.2",
2757+
"proc-macro2",
2758+
"quote",
2759+
"syn 2.0.67",
2760+
]
2761+
27482762
[[package]]
27492763
name = "phf_shared"
27502764
version = "0.10.0"
@@ -3653,6 +3667,7 @@ version = "0.0.0"
36533667
dependencies = [
36543668
"annotate-snippets 0.10.2",
36553669
"derive_setters",
3670+
"phf",
36563671
"rustc_ast",
36573672
"rustc_ast_pretty",
36583673
"rustc_data_structures",

compiler/rustc_errors/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ edition = "2021"
77
# tidy-alphabetical-start
88
annotate-snippets = "0.10"
99
derive_setters = "0.1.6"
10+
phf = { version = "0.11.2", features = ["macros"] }
1011
rustc_ast = { path = "../rustc_ast" }
1112
rustc_ast_pretty = { path = "../rustc_ast_pretty" }
1213
rustc_data_structures = { path = "../rustc_data_structures" }

compiler/rustc_errors/src/emitter.rs

+49-53
Original file line numberDiff line numberDiff line change
@@ -2583,66 +2583,62 @@ fn num_decimal_digits(num: usize) -> usize {
25832583
}
25842584

25852585
// We replace some characters so the CLI output is always consistent and underlines aligned.
2586-
// Keep the following list in sync with `rustc_span::char_width`.
2587-
// ATTENTION: keep lexicografically sorted so that the binary search will work
2588-
const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[
2589-
// tidy-alphabetical-start
2586+
const OUTPUT_REPLACEMENTS: phf::Map<char, &'static str> = phf::phf_map![
25902587
// In terminals without Unicode support the following will be garbled, but in *all* terminals
25912588
// the underlying codepoint will be as well. We could gate this replacement behind a "unicode
25922589
// support" gate.
2593-
('\0', "␀"),
2594-
('\u{0001}', "␁"),
2595-
('\u{0002}', "␂"),
2596-
('\u{0003}', "␃"),
2597-
('\u{0004}', "␄"),
2598-
('\u{0005}', "␅"),
2599-
('\u{0006}', "␆"),
2600-
('\u{0007}', "␇"),
2601-
('\u{0008}', "␈"),
2602-
('\u{0009}', " "), // We do our own tab replacement
2603-
('\u{000b}', "␋"),
2604-
('\u{000c}', "␌"),
2605-
('\u{000d}', "␍"),
2606-
('\u{000e}', "␎"),
2607-
('\u{000f}', "␏"),
2608-
('\u{0010}', "␐"),
2609-
('\u{0011}', "␑"),
2610-
('\u{0012}', "␒"),
2611-
('\u{0013}', "␓"),
2612-
('\u{0014}', "␔"),
2613-
('\u{0015}', "␕"),
2614-
('\u{0016}', "␖"),
2615-
('\u{0017}', "␗"),
2616-
('\u{0018}', "␘"),
2617-
('\u{0019}', "␙"),
2618-
('\u{001a}', "␚"),
2619-
('\u{001b}', "␛"),
2620-
('\u{001c}', "␜"),
2621-
('\u{001d}', "␝"),
2622-
('\u{001e}', "␞"),
2623-
('\u{001f}', "␟"),
2624-
('\u{007f}', "␡"),
2625-
('\u{200d}', ""), // Replace ZWJ for consistent terminal output of grapheme clusters.
2626-
('\u{202a}', "�"), // The following unicode text flow control characters are inconsistently
2627-
('\u{202b}', "�"), // supported across CLIs and can cause confusion due to the bytes on disk
2628-
('\u{202c}', "�"), // not corresponding to the visible source code, so we replace them always.
2629-
('\u{202d}', "�"),
2630-
('\u{202e}', "�"),
2631-
('\u{2066}', "�"),
2632-
('\u{2067}', "�"),
2633-
('\u{2068}', "�"),
2634-
('\u{2069}', "�"),
2635-
// tidy-alphabetical-end
2590+
'\0' => "␀",
2591+
'\t' => " ", // We do our own tab replacement
2592+
'\r' => "␍",
2593+
'\u{0001}' => "␁",
2594+
'\u{0002}' => "␂",
2595+
'\u{0003}' => "␃",
2596+
'\u{0004}' => "␄",
2597+
'\u{0005}' => "␅",
2598+
'\u{0006}' => "␆",
2599+
'\u{0007}' => "␇",
2600+
'\u{0008}' => "␈",
2601+
'\u{000b}' => "␋",
2602+
'\u{000c}' => "␌",
2603+
'\u{000e}' => "␎",
2604+
'\u{000f}' => "␏",
2605+
'\u{0010}' => "␐",
2606+
'\u{0011}' => "␑",
2607+
'\u{0012}' => "␒",
2608+
'\u{0013}' => "␓",
2609+
'\u{0014}' => "␔",
2610+
'\u{0015}' => "␕",
2611+
'\u{0016}' => "␖",
2612+
'\u{0017}' => "␗",
2613+
'\u{0018}' => "␘",
2614+
'\u{0019}' => "␙",
2615+
'\u{001a}' => "␚",
2616+
'\u{001b}' => "␛",
2617+
'\u{001c}' => "␜",
2618+
'\u{001d}' => "␝",
2619+
'\u{001e}' => "␞",
2620+
'\u{001f}' => "␟",
2621+
'\u{007f}' => "␡",
2622+
'\u{200d}' => "", // Replace ZWJ for consistent terminal output of grapheme clusters.
2623+
'\u{202a}' => "�", // The following unicode text flow control characters are inconsistently
2624+
'\u{202b}' => "�", // supported across CLIs and can cause confusion due to the bytes on disk
2625+
'\u{202c}' => "�", // not corresponding to the visible source code, so we replace them always.
2626+
'\u{202d}' => "�",
2627+
'\u{202e}' => "�",
2628+
'\u{2066}' => "�",
2629+
'\u{2067}' => "�",
2630+
'\u{2068}' => "�",
2631+
'\u{2069}' => "�",
26362632
];
26372633

26382634
fn normalize_whitespace(s: &str) -> String {
2639-
// Scan the input string for a character in the ordered table above. If it's present, replace
2640-
// it with it's alternative string (it can be more than 1 char!). Otherwise, retain the input
2641-
// char. At the end, allocate all chars into a string in one operation.
2635+
// Scan the input string for a character in the replacement table above.
2636+
// If it's present, replace it with its alternative string (it can be more than 1 char!).
2637+
// Otherwise, retain the input char.
26422638
s.chars().fold(String::with_capacity(s.len()), |mut s, c| {
2643-
match OUTPUT_REPLACEMENTS.binary_search_by_key(&c, |(k, _)| *k) {
2644-
Ok(i) => s.push_str(OUTPUT_REPLACEMENTS[i].1),
2645-
_ => s.push(c),
2639+
match OUTPUT_REPLACEMENTS.get(&c) {
2640+
Some(r) => s.push_str(r),
2641+
None => s.push(c),
26462642
}
26472643
s
26482644
})

src/tools/tidy/src/deps.rs

+5
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,10 @@ const PERMITTED_RUSTC_DEPENDENCIES: &[&str] = &[
345345
"parking_lot_core",
346346
"pathdiff",
347347
"perf-event-open-sys",
348+
"phf",
349+
"phf_generator",
350+
"phf_macros",
351+
"phf_shared",
348352
"pin-project-lite",
349353
"polonius-engine",
350354
"portable-atomic", // dependency for platforms doesn't support `AtomicU64` in std
@@ -386,6 +390,7 @@ const PERMITTED_RUSTC_DEPENDENCIES: &[&str] = &[
386390
"sha2",
387391
"sharded-slab",
388392
"shlex",
393+
"siphasher",
389394
"smallvec",
390395
"snap",
391396
"stable_deref_trait",

0 commit comments

Comments
 (0)