Skip to content

Commit 7f63c7c

Browse files
committed
Detect confusing unicode characters and show the alternative
1 parent 929ca3c commit 7f63c7c

File tree

4 files changed

+211
-1
lines changed

4 files changed

+211
-1
lines changed

src/libsyntax/diagnostic.rs

+4
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,10 @@ impl SpanHandler {
174174
self.handler.emit(Some((&self.cm, sp)), msg, Bug);
175175
panic!(ExplicitBug);
176176
}
177+
/// Reports `msg` at span `sp` with the `Bug` level and then bumps the
/// handler's error count.
///
/// This method contains no `panic!` of its own, so control returns to the
/// caller after the diagnostic has been emitted.
pub fn span_bug_no_panic(&self, sp: Span, msg: &str) {
178+
self.handler.emit(Some((&self.cm, sp)), msg, Bug);
179+
self.handler.bump_err_count();
180+
}
177181
/// Reports an unimplemented feature at span `sp` by forwarding to `span_bug`
/// with `msg` prefixed by `"unimplemented "`.
///
/// Declared as diverging (`-> !`): it never returns to the caller.
pub fn span_unimpl(&self, sp: Span, msg: &str) -> ! {
178182
self.span_bug(sp, &format!("unimplemented {}", msg));
179183
}

src/libsyntax/parse/lexer/mod.rs

+3-1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ use std::rc::Rc;
2626
pub use ext::tt::transcribe::{TtReader, new_tt_reader, new_tt_reader_with_doc_flag};
2727

2828
pub mod comments;
29+
mod unicode_chars;
2930

3031
pub trait Reader {
3132
fn is_eof(&self) -> bool;
@@ -1224,7 +1225,8 @@ impl<'a> StringReader<'a> {
12241225
c => {
12251226
let last_bpos = self.last_pos;
12261227
let bpos = self.pos;
1227-
panic!(self.fatal_span_char(last_bpos, bpos, "unknown start of token", c));
1228+
unicode_chars::check_for_substitution(&self, c);
1229+
panic!(self.fatal_span_char(last_bpos, bpos, "unknown start of token", c))
12281230
}
12291231
}
12301232
}
src/libsyntax/parse/lexer/unicode_chars.rs

+186
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
2+
// file at the top-level directory of this distribution and at
3+
// http://rust-lang.org/COPYRIGHT.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
// Characters and their corresponding confusables were collected from
12+
// http://www.unicode.org/Public/security/revision-06/confusables.txt
13+
14+
use codemap::mk_sp as make_span;
15+
use super::StringReader;
16+
17+
// Table of confusable characters. Each entry is
// `(confusable character, its Unicode name, the ASCII character it resembles)`.
//
// Entries written as `\u{...}` escapes are code points that Unicode
// normalization or fullwidth folding rewrites into their ASCII (or other
// canonical) look-alike. Spelling them as escapes keeps the table correct even
// if this file is passed through a tool that normalizes text.
const UNICODE_ARRAY: &'static [(char, &'static str, char)] = &[
    ('ߺ', "Nko Lajanyalan", '_'),
    ('﹍', "Dashed Low Line", '_'),
    ('﹎', "Centreline Low Line", '_'),
    ('﹏', "Wavy Low Line", '_'),
    ('‐', "Hyphen", '-'),
    ('‑', "Non-Breaking Hyphen", '-'),
    ('‒', "Figure Dash", '-'),
    ('–', "En Dash", '-'),
    ('﹘', "Small Em Dash", '-'),
    ('⁃', "Hyphen Bullet", '-'),
    ('˗', "Modifier Letter Minus Sign", '-'),
    ('−', "Minus Sign", '-'),
    ('٫', "Arabic Decimal Separator", ','),
    ('‚', "Single Low-9 Quotation Mark", ','),
    ('ꓹ', "Lisu Letter Tone Na Po", ','),
    ('\u{37e}', "Greek Question Mark", ';'),
    ('ः', "Devanagari Sign Visarga", ':'),
    ('ઃ', "Gujarati Sign Visarga", ':'),
    ('\u{ff1a}', "Fullwidth Colon", ':'),
    ('։', "Armenian Full Stop", ':'),
    ('܃', "Syriac Supralinear Colon", ':'),
    ('܄', "Syriac Sublinear Colon", ':'),
    ('︰', "Presentation Form For Vertical Two Dot Leader", ':'),
    ('᠃', "Mongolian Full Stop", ':'),
    ('᠉', "Mongolian Manchu Full Stop", ':'),
    ('⁚', "Two Dot Punctuation", ':'),
    ('׃', "Hebrew Punctuation Sof Pasuq", ':'),
    ('˸', "Modifier Letter Raised Colon", ':'),
    ('꞉', "Modifier Letter Colon", ':'),
    ('∶', "Ratio", ':'),
    ('ː', "Modifier Letter Triangular Colon", ':'),
    ('ꓽ', "Lisu Letter Tone Mya Jeu", ':'),
    ('\u{ff01}', "Fullwidth Exclamation Mark", '!'),
    ('ǃ', "Latin Letter Retroflex Click", '!'),
    ('ʔ', "Latin Letter Glottal Stop", '?'),
    ('ॽ', "Devanagari Letter Glottal Stop", '?'),
    ('Ꭾ', "Cherokee Letter He", '?'),
    ('𝅭', "Musical Symbol Combining Augmentation Dot", '.'),
    ('․', "One Dot Leader", '.'),
    ('۔', "Arabic Full Stop", '.'),
    ('܁', "Syriac Supralinear Full Stop", '.'),
    ('܂', "Syriac Sublinear Full Stop", '.'),
    ('꘎', "Vai Full Stop", '.'),
    ('𐩐', "Kharoshthi Punctuation Dot", '.'),
    ('٠', "Arabic-Indic Digit Zero", '.'),
    ('۰', "Extended Arabic-Indic Digit Zero", '.'),
    ('ꓸ', "Lisu Letter Tone Mya Ti", '.'),
    ('՝', "Armenian Comma", '\''),
    ('\u{ff07}', "Fullwidth Apostrophe", '\''),
    ('‘', "Left Single Quotation Mark", '\''),
    ('’', "Right Single Quotation Mark", '\''),
    ('‛', "Single High-Reversed-9 Quotation Mark", '\''),
    ('′', "Prime", '\''),
    ('‵', "Reversed Prime", '\''),
    ('՚', "Armenian Apostrophe", '\''),
    ('׳', "Hebrew Punctuation Geresh", '\''),
    ('\u{1fef}', "Greek Varia", '\''),
    ('\u{ff40}', "Fullwidth Grave Accent", '\''),
    ('΄', "Greek Tonos", '\''),
    ('\u{1ffd}', "Greek Oxia", '\''),
    ('᾽', "Greek Koronis", '\''),
    ('᾿', "Greek Psili", '\''),
    ('῾', "Greek Dasia", '\''),
    ('ʹ', "Modifier Letter Prime", '\''),
    ('\u{374}', "Greek Numeral Sign", '\''),
    ('ˊ', "Modifier Letter Acute Accent", '\''),
    ('ˋ', "Modifier Letter Grave Accent", '\''),
    ('˴', "Modifier Letter Middle Grave Accent", '\''),
    ('ʻ', "Modifier Letter Turned Comma", '\''),
    ('ʽ', "Modifier Letter Reversed Comma", '\''),
    ('ʼ', "Modifier Letter Apostrophe", '\''),
    ('ʾ', "Modifier Letter Right Half Ring", '\''),
    ('ꞌ', "Latin Small Letter Saltillo", '\''),
    ('י', "Hebrew Letter Yod", '\''),
    ('ߴ', "Nko High Tone Apostrophe", '\''),
    ('ߵ', "Nko Low Tone Apostrophe", '\''),
    ('\u{ff3b}', "Fullwidth Left Square Bracket", '('),
    ('❨', "Medium Left Parenthesis Ornament", '('),
    ('❲', "Light Left Tortoise Shell Bracket Ornament", '('),
    ('〔', "Left Tortoise Shell Bracket", '('),
    ('﴾', "Ornate Left Parenthesis", '('),
    ('\u{ff3d}', "Fullwidth Right Square Bracket", ')'),
    ('❩', "Medium Right Parenthesis Ornament", ')'),
    ('❳', "Light Right Tortoise Shell Bracket Ornament", ')'),
    ('〕', "Right Tortoise Shell Bracket", ')'),
    ('﴿', "Ornate Right Parenthesis", ')'),
    ('❴', "Medium Left Curly Bracket Ornament", '{'),
    ('❵', "Medium Right Curly Bracket Ornament", '}'),
    ('⁎', "Low Asterisk", '*'),
    ('٭', "Arabic Five Pointed Star", '*'),
    ('∗', "Asterisk Operator", '*'),
    ('᜵', "Philippine Single Punctuation", '/'),
    ('⁁', "Caret Insertion Point", '/'),
    ('∕', "Division Slash", '/'),
    ('⁄', "Fraction Slash", '/'),
    ('╱', "Box Drawings Light Diagonal Upper Right To Lower Left", '/'),
    ('⟋', "Mathematical Rising Diagonal", '/'),
    ('⧸', "Big Solidus", '/'),
    ('㇓', "Cjk Stroke Sp", '/'),
    ('〳', "Vertical Kana Repeat Mark Upper Half", '/'),
    ('丿', "Cjk Unified Ideograph-4E3F", '/'),
    ('⼃', "Kangxi Radical Slash", '/'),
    ('\u{ff3c}', "Fullwidth Reverse Solidus", '\\'),
    ('﹨', "Small Reverse Solidus", '\\'),
    ('∖', "Set Minus", '\\'),
    ('⟍', "Mathematical Falling Diagonal", '\\'),
    ('⧵', "Reverse Solidus Operator", '\\'),
    ('⧹', "Big Reverse Solidus", '\\'),
    ('㇔', "Cjk Stroke D", '\\'),
    ('丶', "Cjk Unified Ideograph-4E36", '\\'),
    ('⼂', "Kangxi Radical Dot", '\\'),
    ('ꝸ', "Latin Small Letter Um", '&'),
    ('﬩', "Hebrew Letter Alternative Plus Sign", '+'),
    ('‹', "Single Left-Pointing Angle Quotation Mark", '<'),
    ('❮', "Heavy Left-Pointing Angle Quotation Mark Ornament", '<'),
    ('˂', "Modifier Letter Left Arrowhead", '<'),
    ('꓿', "Lisu Punctuation Full Stop", '='),
    ('›', "Single Right-Pointing Angle Quotation Mark", '>'),
    ('❯', "Heavy Right-Pointing Angle Quotation Mark Ornament", '>'),
    ('˃', "Modifier Letter Right Arrowhead", '>'),
    ('Ⲻ', "Coptic Capital Letter Dialect-P Ni", '-'),
    ('Ɂ', "Latin Capital Letter Glottal Stop", '?'),
    ('Ⳇ', "Coptic Capital Letter Old Coptic Esh", '/'),
];
141+
142+
// Display names for ASCII punctuation. Each entry is
// `(ASCII character, human-readable name)`. `check_for_substitution` looks up
// the replacement character from `UNICODE_ARRAY` here so the help message can
// name it.
const ASCII_ARRAY: &'static [(char, &'static str)] = &[
143+
('_', "Underscore"),
144+
('-', "Minus/Hyphen"),
145+
(',', "Comma"),
146+
(';', "Semicolon"),
147+
(':', "Colon"),
148+
('!', "Exclamation Mark"),
149+
('?', "Question Mark"),
150+
('.', "Period"),
151+
('\'', "Single Quote"),
152+
('(', "Left Parenthesis"),
153+
(')', "Right Parenthesis"),
154+
('{', "Left Curly Brace"),
155+
('}', "Right Curly Brace"),
156+
('*', "Asterisk"),
157+
('/', "Slash"),
158+
('\\', "Backslash"),
159+
('&', "Ampersand"),
160+
('+', "Plus Sign"),
161+
('<', "Less-Than Sign"),
162+
('=', "Equals Sign"),
163+
('>', "Greater-Than Sign"), ];
164+
165+
pub fn check_for_substitution(reader: &StringReader, ch: char) {
166+
UNICODE_ARRAY
167+
.iter()
168+
.find(|&&(c, _, _)| c == ch)
169+
.map(|&(_, u_name, ascii_char)| {
170+
let span = make_span(reader.last_pos, reader.pos);
171+
match ASCII_ARRAY.iter().find(|&&(c, _)| c == ascii_char) {
172+
Some(&(ascii_char, ascii_name)) => {
173+
let msg =
174+
format!("unicode character '{}' ({}) looks much like '{}' ({}), but it's not",
175+
ch, u_name, ascii_char, ascii_name);
176+
reader.help_span(span, &msg);
177+
},
178+
None => {
179+
reader
180+
.span_diagnostic
181+
.span_bug_no_panic(span,
182+
&format!("substitution character not found for '{}'", ch));
183+
}
184+
}
185+
});
186+
}

src/test/parse-fail/unicode-chars.rs

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
2+
// file at the top-level directory of this distribution and at
3+
// http://rust-lang.org/COPYRIGHT.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
// compile-flags: -Z parse-only
12+
// ignore-tidy-linelength
13+
14+
// Parse-fail fixture: the `let` statement below is terminated by U+037E GREEK
// QUESTION MARK, not by an ASCII semicolon, and the HELP annotation quotes that
// same character first. Do not normalize this file or the test stops
// exercising the confusable-character diagnostic.
fn main() {
    let y = 0;
    //~^ ERROR unknown start of token: \u{37e}
    //~^^ HELP unicode character ';' (Greek Question Mark) looks much like ';' (Semicolon), but it's not
}

0 commit comments

Comments
 (0)