Skip to content

Commit e3343bd

Browse files
authored
Unrolled build for rust-lang#127528
Rollup merge of rust-lang#127528 - estebank:ascii-control-chars, r=oli-obk

Replace ASCII control chars with Unicode Control Pictures

Replace ASCII control chars like `CR` with Unicode Control Pictures like `␍`:

```
error: bare CR not allowed in doc-comment
  --> $DIR/lex-bare-cr-string-literal-doc-comment.rs:3:32
   |
LL | /// doc comment with bare CR: '␍'
   |                                ^
```

Centralize the checking of unicode char width for the purposes of CLI display in one place. Account for the new replacements. Remove unneeded tracking of "zero-width" unicode chars, as we calculate these in the `SourceMap` as needed now.
2 parents e7d66ea + 9bd7680 commit e3343bd

File tree

67 files changed

+216
-308
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

67 files changed

+216
-308
lines changed

Diff for: Cargo.lock

-1
Original file line numberDiff line numberDiff line change
@@ -3882,7 +3882,6 @@ dependencies = [
38823882
"termcolor",
38833883
"termize",
38843884
"tracing",
3885-
"unicode-width",
38863885
"windows",
38873886
]
38883887

Diff for: compiler/rustc_errors/Cargo.toml

-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@ serde_json = "1.0.59"
2626
termcolor = "1.2.0"
2727
termize = "0.1.1"
2828
tracing = "0.1"
29-
unicode-width = "0.1.4"
3029
# tidy-alphabetical-end
3130

3231
[target.'cfg(windows)'.dependencies.windows]

Diff for: compiler/rustc_errors/src/emitter.rs

+51-26
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
//! The output types are defined in `rustc_session::config::ErrorOutputType`.
99
1010
use rustc_span::source_map::SourceMap;
11-
use rustc_span::{FileLines, FileName, SourceFile, Span};
11+
use rustc_span::{char_width, FileLines, FileName, SourceFile, Span};
1212

1313
use crate::snippet::{
1414
Annotation, AnnotationColumn, AnnotationType, Line, MultilineAnnotation, Style, StyledString,
@@ -677,10 +677,7 @@ impl HumanEmitter {
677677
.skip(left)
678678
.take_while(|ch| {
679679
// Make sure that the trimming on the right will fall within the terminal width.
680-
// FIXME: `unicode_width` sometimes disagrees with terminals on how wide a `char`
681-
// is. For now, just accept that sometimes the code line will be longer than
682-
// desired.
683-
let next = unicode_width::UnicodeWidthChar::width(*ch).unwrap_or(1);
680+
let next = char_width(*ch);
684681
if taken + next > right - left {
685682
return false;
686683
}
@@ -742,11 +739,7 @@ impl HumanEmitter {
742739
let left = margin.left(source_string.len());
743740

744741
// Account for unicode characters of width !=0 that were removed.
745-
let left = source_string
746-
.chars()
747-
.take(left)
748-
.map(|ch| unicode_width::UnicodeWidthChar::width(ch).unwrap_or(1))
749-
.sum();
742+
let left = source_string.chars().take(left).map(|ch| char_width(ch)).sum();
750743

751744
self.draw_line(
752745
buffer,
@@ -2039,7 +2032,7 @@ impl HumanEmitter {
20392032
let sub_len: usize =
20402033
if is_whitespace_addition { &part.snippet } else { part.snippet.trim() }
20412034
.chars()
2042-
.map(|ch| unicode_width::UnicodeWidthChar::width(ch).unwrap_or(1))
2035+
.map(|ch| char_width(ch))
20432036
.sum();
20442037

20452038
let offset: isize = offsets
@@ -2076,11 +2069,8 @@ impl HumanEmitter {
20762069
}
20772070

20782071
// length of the code after substitution
2079-
let full_sub_len = part
2080-
.snippet
2081-
.chars()
2082-
.map(|ch| unicode_width::UnicodeWidthChar::width(ch).unwrap_or(1))
2083-
.sum::<usize>() as isize;
2072+
let full_sub_len =
2073+
part.snippet.chars().map(|ch| char_width(ch)).sum::<usize>() as isize;
20842074

20852075
// length of the code to be substituted
20862076
let snippet_len = span_end_pos as isize - span_start_pos as isize;
@@ -2568,18 +2558,53 @@ fn num_decimal_digits(num: usize) -> usize {
25682558
}
25692559

25702560
// We replace some characters so the CLI output is always consistent and underlines aligned.
2561+
// Keep the following list in sync with `rustc_span::char_width`.
25712562
const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[
2572-
('\t', " "), // We do our own tab replacement
2563+
('\t', " "), // We do our own tab replacement
25732564
('\u{200D}', ""), // Replace ZWJ with nothing for consistent terminal output of grapheme clusters.
2574-
('\u{202A}', ""), // The following unicode text flow control characters are inconsistently
2575-
('\u{202B}', ""), // supported across CLIs and can cause confusion due to the bytes on disk
2576-
('\u{202D}', ""), // not corresponding to the visible source code, so we replace them always.
2577-
('\u{202E}', ""),
2578-
('\u{2066}', ""),
2579-
('\u{2067}', ""),
2580-
('\u{2068}', ""),
2581-
('\u{202C}', ""),
2582-
('\u{2069}', ""),
2565+
('\u{202A}', "�"), // The following unicode text flow control characters are inconsistently
2566+
('\u{202B}', "�"), // supported across CLIs and can cause confusion due to the bytes on disk
2567+
('\u{202D}', "�"), // not corresponding to the visible source code, so we replace them always.
2568+
('\u{202E}', "�"),
2569+
('\u{2066}', "�"),
2570+
('\u{2067}', "�"),
2571+
('\u{2068}', "�"),
2572+
('\u{202C}', "�"),
2573+
('\u{2069}', "�"),
2574+
// In terminals without Unicode support the following will be garbled, but in *all* terminals
2575+
// the underlying codepoint will be as well. We could gate this replacement behind a "unicode
2576+
// support" gate.
2577+
('\u{0000}', "␀"),
2578+
('\u{0001}', "␁"),
2579+
('\u{0002}', "␂"),
2580+
('\u{0003}', "␃"),
2581+
('\u{0004}', "␄"),
2582+
('\u{0005}', "␅"),
2583+
('\u{0006}', "␆"),
2584+
('\u{0007}', "␇"),
2585+
('\u{0008}', "␈"),
2586+
('\u{000B}', "␋"),
2587+
('\u{000C}', "␌"),
2588+
('\u{000D}', "␍"),
2589+
('\u{000E}', "␎"),
2590+
('\u{000F}', "␏"),
2591+
('\u{0010}', "␐"),
2592+
('\u{0011}', "␑"),
2593+
('\u{0012}', "␒"),
2594+
('\u{0013}', "␓"),
2595+
('\u{0014}', "␔"),
2596+
('\u{0015}', "␕"),
2597+
('\u{0016}', "␖"),
2598+
('\u{0017}', "␗"),
2599+
('\u{0018}', "␘"),
2600+
('\u{0019}', "␙"),
2601+
('\u{001A}', "␚"),
2602+
('\u{001B}', "␛"),
2603+
('\u{001C}', "␜"),
2604+
('\u{001D}', "␝"),
2605+
('\u{001E}', "␞"),
2606+
('\u{001F}', "␟"),
2607+
('\u{007F}', "␡"),
25832608
];
25842609

25852610
fn normalize_whitespace(str: &str) -> String {

Diff for: compiler/rustc_metadata/src/rmeta/decoder.rs

-2
Original file line numberDiff line numberDiff line change
@@ -1728,7 +1728,6 @@ impl<'a, 'tcx> CrateMetadataRef<'a> {
17281728
source_len,
17291729
lines,
17301730
multibyte_chars,
1731-
non_narrow_chars,
17321731
normalized_pos,
17331732
stable_id,
17341733
..
@@ -1780,7 +1779,6 @@ impl<'a, 'tcx> CrateMetadataRef<'a> {
17801779
self.cnum,
17811780
lines,
17821781
multibyte_chars,
1783-
non_narrow_chars,
17841782
normalized_pos,
17851783
source_file_index,
17861784
);

Diff for: compiler/rustc_query_system/src/ich/impls_syntax.rs

-6
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,6 @@ impl<'a> HashStable<StableHashingContext<'a>> for SourceFile {
7373
source_len: _,
7474
lines: _,
7575
ref multibyte_chars,
76-
ref non_narrow_chars,
7776
ref normalized_pos,
7877
} = *self;
7978

@@ -98,11 +97,6 @@ impl<'a> HashStable<StableHashingContext<'a>> for SourceFile {
9897
char_pos.hash_stable(hcx, hasher);
9998
}
10099

101-
non_narrow_chars.len().hash_stable(hcx, hasher);
102-
for &char_pos in non_narrow_chars.iter() {
103-
char_pos.hash_stable(hcx, hasher);
104-
}
105-
106100
normalized_pos.len().hash_stable(hcx, hasher);
107101
for &char_pos in normalized_pos.iter() {
108102
char_pos.hash_stable(hcx, hasher);

Diff for: compiler/rustc_span/src/analyze_source_file.rs

+6-34
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
use super::*;
2-
use unicode_width::UnicodeWidthChar;
32

43
#[cfg(test)]
54
mod tests;
@@ -9,15 +8,12 @@ mod tests;
98
///
109
/// This function will use an SSE2 enhanced implementation if hardware support
1110
/// is detected at runtime.
12-
pub fn analyze_source_file(
13-
src: &str,
14-
) -> (Vec<RelativeBytePos>, Vec<MultiByteChar>, Vec<NonNarrowChar>) {
11+
pub fn analyze_source_file(src: &str) -> (Vec<RelativeBytePos>, Vec<MultiByteChar>) {
1512
let mut lines = vec![RelativeBytePos::from_u32(0)];
1613
let mut multi_byte_chars = vec![];
17-
let mut non_narrow_chars = vec![];
1814

1915
// Calls the right implementation, depending on hardware support available.
20-
analyze_source_file_dispatch(src, &mut lines, &mut multi_byte_chars, &mut non_narrow_chars);
16+
analyze_source_file_dispatch(src, &mut lines, &mut multi_byte_chars);
2117

2218
// The code above optimistically registers a new line *after* each \n
2319
// it encounters. If that point is already outside the source_file, remove
@@ -30,7 +26,7 @@ pub fn analyze_source_file(
3026
}
3127
}
3228

33-
(lines, multi_byte_chars, non_narrow_chars)
29+
(lines, multi_byte_chars)
3430
}
3531

3632
cfg_match! {
@@ -39,11 +35,10 @@ cfg_match! {
3935
src: &str,
4036
lines: &mut Vec<RelativeBytePos>,
4137
multi_byte_chars: &mut Vec<MultiByteChar>,
42-
non_narrow_chars: &mut Vec<NonNarrowChar>,
4338
) {
4439
if is_x86_feature_detected!("sse2") {
4540
unsafe {
46-
analyze_source_file_sse2(src, lines, multi_byte_chars, non_narrow_chars);
41+
analyze_source_file_sse2(src, lines, multi_byte_chars);
4742
}
4843
} else {
4944
analyze_source_file_generic(
@@ -52,7 +47,6 @@ cfg_match! {
5247
RelativeBytePos::from_u32(0),
5348
lines,
5449
multi_byte_chars,
55-
non_narrow_chars,
5650
);
5751
}
5852
}
@@ -66,7 +60,6 @@ cfg_match! {
6660
src: &str,
6761
lines: &mut Vec<RelativeBytePos>,
6862
multi_byte_chars: &mut Vec<MultiByteChar>,
69-
non_narrow_chars: &mut Vec<NonNarrowChar>,
7063
) {
7164
#[cfg(target_arch = "x86")]
7265
use std::arch::x86::*;
@@ -159,7 +152,6 @@ cfg_match! {
159152
RelativeBytePos::from_usize(scan_start),
160153
lines,
161154
multi_byte_chars,
162-
non_narrow_chars,
163155
);
164156
}
165157

@@ -172,7 +164,6 @@ cfg_match! {
172164
RelativeBytePos::from_usize(tail_start),
173165
lines,
174166
multi_byte_chars,
175-
non_narrow_chars,
176167
);
177168
}
178169
}
@@ -183,15 +174,13 @@ cfg_match! {
183174
src: &str,
184175
lines: &mut Vec<RelativeBytePos>,
185176
multi_byte_chars: &mut Vec<MultiByteChar>,
186-
non_narrow_chars: &mut Vec<NonNarrowChar>,
187177
) {
188178
analyze_source_file_generic(
189179
src,
190180
src.len(),
191181
RelativeBytePos::from_u32(0),
192182
lines,
193183
multi_byte_chars,
194-
non_narrow_chars,
195184
);
196185
}
197186
}
@@ -205,7 +194,6 @@ fn analyze_source_file_generic(
205194
output_offset: RelativeBytePos,
206195
lines: &mut Vec<RelativeBytePos>,
207196
multi_byte_chars: &mut Vec<MultiByteChar>,
208-
non_narrow_chars: &mut Vec<NonNarrowChar>,
209197
) -> usize {
210198
assert!(src.len() >= scan_len);
211199
let mut i = 0;
@@ -227,16 +215,8 @@ fn analyze_source_file_generic(
227215

228216
let pos = RelativeBytePos::from_usize(i) + output_offset;
229217

230-
match byte {
231-
b'\n' => {
232-
lines.push(pos + RelativeBytePos(1));
233-
}
234-
b'\t' => {
235-
non_narrow_chars.push(NonNarrowChar::Tab(pos));
236-
}
237-
_ => {
238-
non_narrow_chars.push(NonNarrowChar::ZeroWidth(pos));
239-
}
218+
if let b'\n' = byte {
219+
lines.push(pos + RelativeBytePos(1));
240220
}
241221
} else if byte >= 127 {
242222
// The slow path:
@@ -252,14 +232,6 @@ fn analyze_source_file_generic(
252232
let mbc = MultiByteChar { pos, bytes: char_len as u8 };
253233
multi_byte_chars.push(mbc);
254234
}
255-
256-
// Assume control characters are zero width.
257-
// FIXME: How can we decide between `width` and `width_cjk`?
258-
let char_width = UnicodeWidthChar::width(c).unwrap_or(0);
259-
260-
if char_width != 1 {
261-
non_narrow_chars.push(NonNarrowChar::new(pos, char_width));
262-
}
263235
}
264236

265237
i += char_len;

0 commit comments

Comments
 (0)