Skip to content

Commit a3d6bc3

Browse files
committed
Emit a single error for contiguous sequences of Unicode homoglyphs
1 parent ef4046e commit a3d6bc3

File tree

8 files changed

+49
-9
lines changed

8 files changed

+49
-9
lines changed

compiler/rustc_parse/src/lexer/mod.rs

+24-4
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ impl<'a> StringReader<'a> {
7979
/// preceded by whitespace.
8080
fn next_token(&mut self) -> (Token, bool) {
8181
let mut preceded_by_whitespace = false;
82-
82+
let mut swallow_next_invalid = 0;
8383
// Skip trivial (whitespace & comments) tokens
8484
loop {
8585
let token = self.cursor.advance_token();
@@ -232,19 +232,34 @@ impl<'a> StringReader<'a> {
232232
rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),
233233

234234
rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
235-
let c = self.str_from(start).chars().next().unwrap();
235+
// Don't emit diagnostics for sequences of the same invalid token
236+
if swallow_next_invalid > 0 {
237+
swallow_next_invalid -= 1;
238+
continue;
239+
}
240+
let mut it = self.str_from_to_end(start).chars();
241+
let c = it.next().unwrap();
242+
let repeats = it.take_while(|c1| *c1 == c).count();
236243
let mut err =
237-
self.struct_err_span_char(start, self.pos, "unknown start of token", c);
244+
self.struct_err_span_char(start, self.pos + Pos::from_usize(repeats * c.len_utf8()), "unknown start of token", c);
238245
// FIXME: the lexer could be used to turn the ASCII version of unicode
239246
// homoglyphs, instead of keeping a table in `check_for_substitution` into the
240247
// token. Ideally, this should be inside `rustc_lexer`. However, we should
241248
// first remove compound tokens like `<<` from `rustc_lexer`, and then add
242249
// fancier error recovery to it, as there will be less overall work to do this
243250
// way.
244-
let token = unicode_chars::check_for_substitution(self, start, c, &mut err);
251+
let token = unicode_chars::check_for_substitution(self, start, c, &mut err, repeats+1);
245252
if c == '\x00' {
246253
err.help("source files must contain UTF-8 encoded text, unexpected null bytes might occur when a different encoding is used");
247254
}
255+
if repeats > 0 {
256+
if repeats == 1 {
257+
err.note(format!("character appears once more"));
258+
} else {
259+
err.note(format!("character appears {repeats} more times"));
260+
}
261+
swallow_next_invalid = repeats;
262+
}
248263
err.emit();
249264
if let Some(token) = token {
250265
token
@@ -486,6 +501,11 @@ impl<'a> StringReader<'a> {
486501
&self.src[self.src_index(start)..self.src_index(end)]
487502
}
488503

504+
/// Slice of the source text spanning from `start` until the end of `self.src`.
///
/// `start` is translated to an index into `self.src` via `self.src_index`; the
/// open-ended range means everything from that index onwards is included.
505+
fn str_from_to_end(&self, start: BytePos) -> &str {
506+
&self.src[self.src_index(start)..]
507+
}
508+
489509
fn report_raw_str_error(&self, start: BytePos, prefix_len: u32) -> ! {
490510
match rustc_lexer::validate_raw_str(self.str_from(start), prefix_len) {
491511
Err(RawStrError::InvalidStarter { bad_char }) => {

compiler/rustc_parse/src/lexer/unicode_chars.rs

+8-2
Original file line numberDiff line numberDiff line change
@@ -337,10 +337,11 @@ pub(super) fn check_for_substitution<'a>(
337337
pos: BytePos,
338338
ch: char,
339339
err: &mut Diagnostic,
340+
count: usize,
340341
) -> Option<token::TokenKind> {
341342
let &(_u_char, u_name, ascii_char) = UNICODE_ARRAY.iter().find(|&&(c, _, _)| c == ch)?;
342343

343-
let span = Span::with_root_ctxt(pos, pos + Pos::from_usize(ch.len_utf8()));
344+
let span = Span::with_root_ctxt(pos, pos + Pos::from_usize(ch.len_utf8() * count));
344345

345346
let Some((_ascii_char, ascii_name, token)) = ASCII_ARRAY.iter().find(|&&(c, _, _)| c == ascii_char) else {
346347
let msg = format!("substitution character not found for '{}'", ch);
@@ -369,7 +370,12 @@ pub(super) fn check_for_substitution<'a>(
369370
"Unicode character '{}' ({}) looks like '{}' ({}), but it is not",
370371
ch, u_name, ascii_char, ascii_name
371372
);
372-
err.span_suggestion(span, &msg, ascii_char, Applicability::MaybeIncorrect);
373+
err.span_suggestion(
374+
span,
375+
&msg,
376+
ascii_char.to_string().repeat(count),
377+
Applicability::MaybeIncorrect,
378+
);
373379
}
374380
token.clone()
375381
}

tests/rustdoc-ui/invalid-syntax.stderr

-2
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,6 @@ LL | /// ```
7777
| ^^^
7878
|
7979
= note: error from rustc: unknown start of token: `
80-
= note: error from rustc: unknown start of token: `
81-
= note: error from rustc: unknown start of token: `
8280

8381
warning: could not parse code block as Rust code
8482
--> $DIR/invalid-syntax.rs:64:5
-4.1 KB
Binary file not shown.
-194 Bytes
Binary file not shown.
40 Bytes
Binary file not shown.

tests/ui/parser/unicode-chars.rs

+4
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,8 @@ fn main() {
22
let y = 0;
33
//~^ ERROR unknown start of token: \u{37e}
44
//~^^ HELP Unicode character ';' (Greek Question Mark) looks like ';' (Semicolon), but it is not
5+
    let x = 0;
6+
//~^ ERROR unknown start of token: \u{a0}
7+
//~^^ NOTE character appears 3 more times
8+
//~^^^ HELP Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not
59
}

tests/ui/parser/unicode-chars.stderr

+13-1
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,17 @@ help: Unicode character ';' (Greek Question Mark) looks like ';' (Semicolon), b
99
LL | let y = 0;
1010
| ~
1111

12-
error: aborting due to previous error
12+
error: unknown start of token: \u{a0}
13+
--> $DIR/unicode-chars.rs:5:5
14+
|
15+
LL |     let x = 0;
16+
| ^^^^
17+
|
18+
= note: character appears 3 more times
19+
help: Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not
20+
|
21+
LL | let x = 0;
22+
| ++++
23+
24+
error: aborting due to 2 previous errors
1325

0 commit comments

Comments
 (0)