Skip to content

Commit 6679c62

Browse files
committed
Revert "Rollup merge of #127528 - estebank:ascii-control-chars, r=oli-obk"
This reverts commit cce2db0, reversing changes made to cfc5f25.
1 parent 28e684b commit 6679c62

File tree

67 files changed

+308
-216
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

67 files changed

+308
-216
lines changed

Cargo.lock

+1
Original file line numberDiff line numberDiff line change
@@ -3882,6 +3882,7 @@ dependencies = [
38823882
"termcolor",
38833883
"termize",
38843884
"tracing",
3885+
"unicode-width",
38853886
"windows",
38863887
]
38873888

compiler/rustc_errors/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ serde_json = "1.0.59"
2626
termcolor = "1.2.0"
2727
termize = "0.1.1"
2828
tracing = "0.1"
29+
unicode-width = "0.1.4"
2930
# tidy-alphabetical-end
3031

3132
[target.'cfg(windows)'.dependencies.windows]

compiler/rustc_errors/src/emitter.rs

+26-51
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
//! The output types are defined in `rustc_session::config::ErrorOutputType`.
99
1010
use rustc_span::source_map::SourceMap;
11-
use rustc_span::{char_width, FileLines, FileName, SourceFile, Span};
11+
use rustc_span::{FileLines, FileName, SourceFile, Span};
1212

1313
use crate::snippet::{
1414
Annotation, AnnotationColumn, AnnotationType, Line, MultilineAnnotation, Style, StyledString,
@@ -677,7 +677,10 @@ impl HumanEmitter {
677677
.skip(left)
678678
.take_while(|ch| {
679679
// Make sure that the trimming on the right will fall within the terminal width.
680-
let next = char_width(*ch);
680+
// FIXME: `unicode_width` sometimes disagrees with terminals on how wide a `char`
681+
// is. For now, just accept that sometimes the code line will be longer than
682+
// desired.
683+
let next = unicode_width::UnicodeWidthChar::width(*ch).unwrap_or(1);
681684
if taken + next > right - left {
682685
return false;
683686
}
@@ -739,7 +742,11 @@ impl HumanEmitter {
739742
let left = margin.left(source_string.len());
740743

741744
// Account for unicode characters of width !=0 that were removed.
742-
let left = source_string.chars().take(left).map(|ch| char_width(ch)).sum();
745+
let left = source_string
746+
.chars()
747+
.take(left)
748+
.map(|ch| unicode_width::UnicodeWidthChar::width(ch).unwrap_or(1))
749+
.sum();
743750

744751
self.draw_line(
745752
buffer,
@@ -2032,7 +2039,7 @@ impl HumanEmitter {
20322039
let sub_len: usize =
20332040
if is_whitespace_addition { &part.snippet } else { part.snippet.trim() }
20342041
.chars()
2035-
.map(|ch| char_width(ch))
2042+
.map(|ch| unicode_width::UnicodeWidthChar::width(ch).unwrap_or(1))
20362043
.sum();
20372044

20382045
let offset: isize = offsets
@@ -2069,8 +2076,11 @@ impl HumanEmitter {
20692076
}
20702077

20712078
// length of the code after substitution
2072-
let full_sub_len =
2073-
part.snippet.chars().map(|ch| char_width(ch)).sum::<usize>() as isize;
2079+
let full_sub_len = part
2080+
.snippet
2081+
.chars()
2082+
.map(|ch| unicode_width::UnicodeWidthChar::width(ch).unwrap_or(1))
2083+
.sum::<usize>() as isize;
20742084

20752085
// length of the code to be substituted
20762086
let snippet_len = span_end_pos as isize - span_start_pos as isize;
@@ -2558,53 +2568,18 @@ fn num_decimal_digits(num: usize) -> usize {
25582568
}
25592569

25602570
// We replace some characters so the CLI output is always consistent and underlines aligned.
2561-
// Keep the following list in sync with `rustc_span::char_width`.
25622571
const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[
2563-
('\t', "    "), // We do our own tab replacement
2572+
('\t', "    "),   // We do our own tab replacement
25642573
('\u{200D}', ""), // Replace ZWJ with nothing for consistent terminal output of grapheme clusters.
2565-
('\u{202A}', "�"), // The following unicode text flow control characters are inconsistently
2566-
('\u{202B}', "�"), // supported across CLIs and can cause confusion due to the bytes on disk
2567-
('\u{202D}', "�"), // not corresponding to the visible source code, so we replace them always.
2568-
('\u{202E}', "�"),
2569-
('\u{2066}', "�"),
2570-
('\u{2067}', "�"),
2571-
('\u{2068}', "�"),
2572-
('\u{202C}', "�"),
2573-
('\u{2069}', "�"),
2574-
// In terminals without Unicode support the following will be garbled, but in *all* terminals
2575-
// the underlying codepoint will be as well. We could gate this replacement behind a "unicode
2576-
// support" gate.
2577-
('\u{0000}', "␀"),
2578-
('\u{0001}', "␁"),
2579-
('\u{0002}', "␂"),
2580-
('\u{0003}', "␃"),
2581-
('\u{0004}', "␄"),
2582-
('\u{0005}', "␅"),
2583-
('\u{0006}', "␆"),
2584-
('\u{0007}', "␇"),
2585-
('\u{0008}', "␈"),
2586-
('\u{000B}', "␋"),
2587-
('\u{000C}', "␌"),
2588-
('\u{000D}', "␍"),
2589-
('\u{000E}', "␎"),
2590-
('\u{000F}', "␏"),
2591-
('\u{0010}', "␐"),
2592-
('\u{0011}', "␑"),
2593-
('\u{0012}', "␒"),
2594-
('\u{0013}', "␓"),
2595-
('\u{0014}', "␔"),
2596-
('\u{0015}', "␕"),
2597-
('\u{0016}', "␖"),
2598-
('\u{0017}', "␗"),
2599-
('\u{0018}', "␘"),
2600-
('\u{0019}', "␙"),
2601-
('\u{001A}', "␚"),
2602-
('\u{001B}', "␛"),
2603-
('\u{001C}', "␜"),
2604-
('\u{001D}', "␝"),
2605-
('\u{001E}', "␞"),
2606-
('\u{001F}', "␟"),
2607-
('\u{007F}', "␡"),
2574+
('\u{202A}', ""), // The following unicode text flow control characters are inconsistently
2575+
('\u{202B}', ""), // supported across CLIs and can cause confusion due to the bytes on disk
2576+
('\u{202D}', ""), // not corresponding to the visible source code, so we replace them always.
2577+
('\u{202E}', ""),
2578+
('\u{2066}', ""),
2579+
('\u{2067}', ""),
2580+
('\u{2068}', ""),
2581+
('\u{202C}', ""),
2582+
('\u{2069}', ""),
26082583
];
26092584

26102585
fn normalize_whitespace(str: &str) -> String {

compiler/rustc_metadata/src/rmeta/decoder.rs

+2
Original file line numberDiff line numberDiff line change
@@ -1728,6 +1728,7 @@ impl<'a, 'tcx> CrateMetadataRef<'a> {
17281728
source_len,
17291729
lines,
17301730
multibyte_chars,
1731+
non_narrow_chars,
17311732
normalized_pos,
17321733
stable_id,
17331734
..
@@ -1779,6 +1780,7 @@ impl<'a, 'tcx> CrateMetadataRef<'a> {
17791780
self.cnum,
17801781
lines,
17811782
multibyte_chars,
1783+
non_narrow_chars,
17821784
normalized_pos,
17831785
source_file_index,
17841786
);

compiler/rustc_query_system/src/ich/impls_syntax.rs

+6
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ impl<'a> HashStable<StableHashingContext<'a>> for SourceFile {
7373
source_len: _,
7474
lines: _,
7575
ref multibyte_chars,
76+
ref non_narrow_chars,
7677
ref normalized_pos,
7778
} = *self;
7879

@@ -97,6 +98,11 @@ impl<'a> HashStable<StableHashingContext<'a>> for SourceFile {
9798
char_pos.hash_stable(hcx, hasher);
9899
}
99100

101+
non_narrow_chars.len().hash_stable(hcx, hasher);
102+
for &char_pos in non_narrow_chars.iter() {
103+
char_pos.hash_stable(hcx, hasher);
104+
}
105+
100106
normalized_pos.len().hash_stable(hcx, hasher);
101107
for &char_pos in normalized_pos.iter() {
102108
char_pos.hash_stable(hcx, hasher);

compiler/rustc_span/src/analyze_source_file.rs

+34-6
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
use super::*;
2+
use unicode_width::UnicodeWidthChar;
23

34
#[cfg(test)]
45
mod tests;
@@ -8,12 +9,15 @@ mod tests;
89
///
910
/// This function will use an SSE2 enhanced implementation if hardware support
1011
/// is detected at runtime.
11-
pub fn analyze_source_file(src: &str) -> (Vec<RelativeBytePos>, Vec<MultiByteChar>) {
12+
pub fn analyze_source_file(
13+
src: &str,
14+
) -> (Vec<RelativeBytePos>, Vec<MultiByteChar>, Vec<NonNarrowChar>) {
1215
let mut lines = vec![RelativeBytePos::from_u32(0)];
1316
let mut multi_byte_chars = vec![];
17+
let mut non_narrow_chars = vec![];
1418

1519
// Calls the right implementation, depending on hardware support available.
16-
analyze_source_file_dispatch(src, &mut lines, &mut multi_byte_chars);
20+
analyze_source_file_dispatch(src, &mut lines, &mut multi_byte_chars, &mut non_narrow_chars);
1721

1822
// The code above optimistically registers a new line *after* each \n
1923
// it encounters. If that point is already outside the source_file, remove
@@ -26,7 +30,7 @@ pub fn analyze_source_file(src: &str) -> (Vec<RelativeBytePos>, Vec<MultiByteCha
2630
}
2731
}
2832

29-
(lines, multi_byte_chars)
33+
(lines, multi_byte_chars, non_narrow_chars)
3034
}
3135

3236
cfg_match! {
@@ -35,10 +39,11 @@ cfg_match! {
3539
src: &str,
3640
lines: &mut Vec<RelativeBytePos>,
3741
multi_byte_chars: &mut Vec<MultiByteChar>,
42+
non_narrow_chars: &mut Vec<NonNarrowChar>,
3843
) {
3944
if is_x86_feature_detected!("sse2") {
4045
unsafe {
41-
analyze_source_file_sse2(src, lines, multi_byte_chars);
46+
analyze_source_file_sse2(src, lines, multi_byte_chars, non_narrow_chars);
4247
}
4348
} else {
4449
analyze_source_file_generic(
@@ -47,6 +52,7 @@ cfg_match! {
4752
RelativeBytePos::from_u32(0),
4853
lines,
4954
multi_byte_chars,
55+
non_narrow_chars,
5056
);
5157
}
5258
}
@@ -60,6 +66,7 @@ cfg_match! {
6066
src: &str,
6167
lines: &mut Vec<RelativeBytePos>,
6268
multi_byte_chars: &mut Vec<MultiByteChar>,
69+
non_narrow_chars: &mut Vec<NonNarrowChar>,
6370
) {
6471
#[cfg(target_arch = "x86")]
6572
use std::arch::x86::*;
@@ -152,6 +159,7 @@ cfg_match! {
152159
RelativeBytePos::from_usize(scan_start),
153160
lines,
154161
multi_byte_chars,
162+
non_narrow_chars,
155163
);
156164
}
157165

@@ -164,6 +172,7 @@ cfg_match! {
164172
RelativeBytePos::from_usize(tail_start),
165173
lines,
166174
multi_byte_chars,
175+
non_narrow_chars,
167176
);
168177
}
169178
}
@@ -174,13 +183,15 @@ cfg_match! {
174183
src: &str,
175184
lines: &mut Vec<RelativeBytePos>,
176185
multi_byte_chars: &mut Vec<MultiByteChar>,
186+
non_narrow_chars: &mut Vec<NonNarrowChar>,
177187
) {
178188
analyze_source_file_generic(
179189
src,
180190
src.len(),
181191
RelativeBytePos::from_u32(0),
182192
lines,
183193
multi_byte_chars,
194+
non_narrow_chars,
184195
);
185196
}
186197
}
@@ -194,6 +205,7 @@ fn analyze_source_file_generic(
194205
output_offset: RelativeBytePos,
195206
lines: &mut Vec<RelativeBytePos>,
196207
multi_byte_chars: &mut Vec<MultiByteChar>,
208+
non_narrow_chars: &mut Vec<NonNarrowChar>,
197209
) -> usize {
198210
assert!(src.len() >= scan_len);
199211
let mut i = 0;
@@ -215,8 +227,16 @@ fn analyze_source_file_generic(
215227

216228
let pos = RelativeBytePos::from_usize(i) + output_offset;
217229

218-
if let b'\n' = byte {
219-
lines.push(pos + RelativeBytePos(1));
230+
match byte {
231+
b'\n' => {
232+
lines.push(pos + RelativeBytePos(1));
233+
}
234+
b'\t' => {
235+
non_narrow_chars.push(NonNarrowChar::Tab(pos));
236+
}
237+
_ => {
238+
non_narrow_chars.push(NonNarrowChar::ZeroWidth(pos));
239+
}
220240
}
221241
} else if byte >= 127 {
222242
// The slow path:
@@ -232,6 +252,14 @@ fn analyze_source_file_generic(
232252
let mbc = MultiByteChar { pos, bytes: char_len as u8 };
233253
multi_byte_chars.push(mbc);
234254
}
255+
256+
// Assume control characters are zero width.
257+
// FIXME: How can we decide between `width` and `width_cjk`?
258+
let char_width = UnicodeWidthChar::width(c).unwrap_or(0);
259+
260+
if char_width != 1 {
261+
non_narrow_chars.push(NonNarrowChar::new(pos, char_width));
262+
}
235263
}
236264

237265
i += char_len;

0 commit comments

Comments
 (0)