Skip to content

Commit 8e0eecd

Browse files
authored
Rollup merge of #106566 - clubby789:contiguous-weird-unicode, r=cjgillot
Emit a single error for contiguous sequences of unknown tokens. Closes #106101. On encountering a sequence of identical source characters which are unknown tokens, note the number of subsequent characters and advance past them silently. The old behavior was to emit an error and 'help' note for every single one. `@rustbot` label +A-diagnostics +A-parser
2 parents 27db39b + a3d6bc3 commit 8e0eecd

File tree

8 files changed

+49
-9
lines changed

8 files changed

+49
-9
lines changed

compiler/rustc_parse/src/lexer/mod.rs

+24-4
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ impl<'a> StringReader<'a> {
7979
/// preceded by whitespace.
8080
fn next_token(&mut self) -> (Token, bool) {
8181
let mut preceded_by_whitespace = false;
82-
82+
let mut swallow_next_invalid = 0;
8383
// Skip trivial (whitespace & comments) tokens
8484
loop {
8585
let token = self.cursor.advance_token();
@@ -232,19 +232,34 @@ impl<'a> StringReader<'a> {
232232
rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),
233233

234234
rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
235-
let c = self.str_from(start).chars().next().unwrap();
235+
// Don't emit diagnostics for sequences of the same invalid token
236+
if swallow_next_invalid > 0 {
237+
swallow_next_invalid -= 1;
238+
continue;
239+
}
240+
let mut it = self.str_from_to_end(start).chars();
241+
let c = it.next().unwrap();
242+
let repeats = it.take_while(|c1| *c1 == c).count();
236243
let mut err =
237-
self.struct_err_span_char(start, self.pos, "unknown start of token", c);
244+
self.struct_err_span_char(start, self.pos + Pos::from_usize(repeats * c.len_utf8()), "unknown start of token", c);
238245
// FIXME: the lexer could be used to turn the ASCII version of unicode
239246
// homoglyphs, instead of keeping a table in `check_for_substitution`, into the
240247
// token. Ideally, this should be inside `rustc_lexer`. However, we should
241248
// first remove compound tokens like `<<` from `rustc_lexer`, and then add
242249
// fancier error recovery to it, as there will be less overall work to do this
243250
// way.
244-
let token = unicode_chars::check_for_substitution(self, start, c, &mut err);
251+
let token = unicode_chars::check_for_substitution(self, start, c, &mut err, repeats+1);
245252
if c == '\x00' {
246253
err.help("source files must contain UTF-8 encoded text, unexpected null bytes might occur when a different encoding is used");
247254
}
255+
if repeats > 0 {
256+
if repeats == 1 {
257+
err.note(format!("character appears once more"));
258+
} else {
259+
err.note(format!("character appears {repeats} more times"));
260+
}
261+
swallow_next_invalid = repeats;
262+
}
248263
err.emit();
249264
if let Some(token) = token {
250265
token
@@ -486,6 +501,11 @@ impl<'a> StringReader<'a> {
486501
&self.src[self.src_index(start)..self.src_index(end)]
487502
}
488503

504+
/// Slice of the source text spanning from `start` until the end
505+
fn str_from_to_end(&self, start: BytePos) -> &str {
506+
&self.src[self.src_index(start)..]
507+
}
508+
489509
fn report_raw_str_error(&self, start: BytePos, prefix_len: u32) -> ! {
490510
match rustc_lexer::validate_raw_str(self.str_from(start), prefix_len) {
491511
Err(RawStrError::InvalidStarter { bad_char }) => {

compiler/rustc_parse/src/lexer/unicode_chars.rs

+8-2
Original file line numberDiff line numberDiff line change
@@ -337,10 +337,11 @@ pub(super) fn check_for_substitution<'a>(
337337
pos: BytePos,
338338
ch: char,
339339
err: &mut Diagnostic,
340+
count: usize,
340341
) -> Option<token::TokenKind> {
341342
let &(_u_char, u_name, ascii_char) = UNICODE_ARRAY.iter().find(|&&(c, _, _)| c == ch)?;
342343

343-
let span = Span::with_root_ctxt(pos, pos + Pos::from_usize(ch.len_utf8()));
344+
let span = Span::with_root_ctxt(pos, pos + Pos::from_usize(ch.len_utf8() * count));
344345

345346
let Some((_ascii_char, ascii_name, token)) = ASCII_ARRAY.iter().find(|&&(c, _, _)| c == ascii_char) else {
346347
let msg = format!("substitution character not found for '{}'", ch);
@@ -369,7 +370,12 @@ pub(super) fn check_for_substitution<'a>(
369370
"Unicode character '{}' ({}) looks like '{}' ({}), but it is not",
370371
ch, u_name, ascii_char, ascii_name
371372
);
372-
err.span_suggestion(span, &msg, ascii_char, Applicability::MaybeIncorrect);
373+
err.span_suggestion(
374+
span,
375+
&msg,
376+
ascii_char.to_string().repeat(count),
377+
Applicability::MaybeIncorrect,
378+
);
373379
}
374380
token.clone()
375381
}

tests/rustdoc-ui/invalid-syntax.stderr

-2
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,6 @@ LL | /// ```
7777
| ^^^
7878
|
7979
= note: error from rustc: unknown start of token: `
80-
= note: error from rustc: unknown start of token: `
81-
= note: error from rustc: unknown start of token: `
8280

8381
warning: could not parse code block as Rust code
8482
--> $DIR/invalid-syntax.rs:64:5
-4.1 KB
Binary file not shown.
-194 Bytes
Binary file not shown.
40 Bytes
Binary file not shown.

tests/ui/parser/unicode-chars.rs

+4
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,8 @@ fn main() {
22
let y = 0;
33
//~^ ERROR unknown start of token: \u{37e}
44
//~^^ HELP Unicode character ';' (Greek Question Mark) looks like ';' (Semicolon), but it is not
5+
    let x = 0;
6+
//~^ ERROR unknown start of token: \u{a0}
7+
//~^^ NOTE character appears 3 more times
8+
//~^^^ HELP Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not
59
}

tests/ui/parser/unicode-chars.stderr

+13-1
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,17 @@ help: Unicode character ';' (Greek Question Mark) looks like ';' (Semicolon), b
99
LL | let y = 0;
1010
| ~
1111

12-
error: aborting due to previous error
12+
error: unknown start of token: \u{a0}
13+
--> $DIR/unicode-chars.rs:5:5
14+
|
15+
LL |     let x = 0;
16+
| ^^^^
17+
|
18+
= note: character appears 3 more times
19+
help: Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not
20+
|
21+
LL | let x = 0;
22+
| ++++
23+
24+
error: aborting due to 2 previous errors
1325

0 commit comments

Comments
 (0)