Detect confusing unicode characters and show the alternative

wafflespeanut · wafflespeanut · commit 7f63c7cf4c29 · 2015-11-17T12:14:28.000+05:30
diff --git a/src/libsyntax/diagnostic.rs b/src/libsyntax/diagnostic.rs
@@ -174,6 +174,10 @@ impl SpanHandler {
         self.handler.emit(Some((&self.cm, sp)), msg, Bug);
         panic!(ExplicitBug);
     }
+    pub fn span_bug_no_panic(&self, sp: Span, msg: &str) { // `span_bug` minus the panic
+        self.handler.emit(Some((&self.cm, sp)), msg, Bug); // same `Bug`-level emit as `span_bug`
+        self.handler.bump_err_count(); // count it as an error, then return to the caller
+    }
     pub fn span_unimpl(&self, sp: Span, msg: &str) -> ! {
         self.span_bug(sp, &format!("unimplemented {}", msg));
     }
diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs
@@ -26,6 +26,7 @@ use std::rc::Rc;
 pub use ext::tt::transcribe::{TtReader, new_tt_reader, new_tt_reader_with_doc_flag};
 
 pub mod comments;
+mod unicode_chars;
 
 pub trait Reader {
     fn is_eof(&self) -> bool;
@@ -1224,7 +1225,8 @@ impl<'a> StringReader<'a> {
           c => {
               let last_bpos = self.last_pos;
               let bpos = self.pos;
-              panic!(self.fatal_span_char(last_bpos, bpos, "unknown start of token", c));
+              unicode_chars::check_for_substitution(&self, c);
+              panic!(self.fatal_span_char(last_bpos, bpos, "unknown start of token", c))
           }
         }
     }
diff --git a/src/libsyntax/parse/lexer/unicode_chars.rs b/src/libsyntax/parse/lexer/unicode_chars.rs
@@ -0,0 +1,186 @@
+// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+// Characters and their corresponding confusables were collected from
+// http://www.unicode.org/Public/security/revision-06/confusables.txt
+
+use codemap::mk_sp as make_span;
+use super::StringReader;
+
+const UNICODE_ARRAY: &'static [(char, &'static str, char)] = &[ // (look-alike, name, ASCII)
+    ('ߺ', "Nko Lajanyalan", '_'),
+    ('﹍', "Dashed Low Line", '_'),
+    ('﹎', "Centreline Low Line", '_'),
+    ('﹏', "Wavy Low Line", '_'),
+    ('‐', "Hyphen", '-'),
+    ('‑', "Non-Breaking Hyphen", '-'),
+    ('‒', "Figure Dash", '-'),
+    ('–', "En Dash", '-'),
+    ('﹘', "Small Em Dash", '-'),
+    ('⁃', "Hyphen Bullet", '-'),
+    ('˗', "Modifier Letter Minus Sign", '-'),
+    ('−', "Minus Sign", '-'),
+    ('٫', "Arabic Decimal Separator", ','),
+    ('‚', "Single Low-9 Quotation Mark", ','),
+    ('ꓹ', "Lisu Letter Tone Na Po", ','),
+    ('\u{37e}', "Greek Question Mark", ';'), // escaped: NFC folds U+037E into ASCII ';'
+    ('ः', "Devanagari Sign Visarga", ':'),
+    ('ઃ', "Gujarati Sign Visarga", ':'),
+    ('：', "Fullwidth Colon", ':'),
+    ('։', "Armenian Full Stop", ':'),
+    ('܃', "Syriac Supralinear Colon", ':'),
+    ('܄', "Syriac Sublinear Colon", ':'),
+    ('︰', "Presentation Form For Vertical Two Dot Leader", ':'),
+    ('᠃', "Mongolian Full Stop", ':'),
+    ('᠉', "Mongolian Manchu Full Stop", ':'),
+    ('⁚', "Two Dot Punctuation", ':'),
+    ('׃', "Hebrew Punctuation Sof Pasuq", ':'),
+    ('˸', "Modifier Letter Raised Colon", ':'),
+    ('꞉', "Modifier Letter Colon", ':'),
+    ('∶', "Ratio", ':'),
+    ('ː', "Modifier Letter Triangular Colon", ':'),
+    ('ꓽ', "Lisu Letter Tone Mya Jeu", ':'),
+    ('！', "Fullwidth Exclamation Mark", '!'),
+    ('ǃ', "Latin Letter Retroflex Click", '!'),
+    ('ʔ', "Latin Letter Glottal Stop", '?'),
+    ('ॽ', "Devanagari Letter Glottal Stop", '?'),
+    ('Ꭾ', "Cherokee Letter He", '?'),
+    ('𝅭', "Musical Symbol Combining Augmentation Dot", '.'),
+    ('․', "One Dot Leader", '.'),
+    ('۔', "Arabic Full Stop", '.'),
+    ('܁', "Syriac Supralinear Full Stop", '.'),
+    ('܂', "Syriac Sublinear Full Stop", '.'),
+    ('꘎', "Vai Full Stop", '.'),
+    ('𐩐', "Kharoshthi Punctuation Dot", '.'),
+    ('٠', "Arabic-Indic Digit Zero", '.'),
+    ('۰', "Extended Arabic-Indic Digit Zero", '.'),
+    ('ꓸ', "Lisu Letter Tone Mya Ti", '.'),
+    ('՝', "Armenian Comma", '\''),
+    ('＇', "Fullwidth Apostrophe", '\''),
+    ('‘', "Left Single Quotation Mark", '\''),
+    ('’', "Right Single Quotation Mark", '\''),
+    ('‛', "Single High-Reversed-9 Quotation Mark", '\''),
+    ('′', "Prime", '\''),
+    ('‵', "Reversed Prime", '\''),
+    ('՚', "Armenian Apostrophe", '\''),
+    ('׳', "Hebrew Punctuation Geresh", '\''),
+    ('\u{1fef}', "Greek Varia", '\''), // escaped: NFC folds U+1FEF into ASCII '`'
+    ('｀', "Fullwidth Grave Accent", '\''),
+    ('΄', "Greek Tonos", '\''),
+    ('\u{1ffd}', "Greek Oxia", '\''), // escaped: NFC folds U+1FFD into U+00B4
+    ('᾽', "Greek Koronis", '\''),
+    ('᾿', "Greek Psili", '\''),
+    ('῾', "Greek Dasia", '\''),
+    ('\u{2b9}', "Modifier Letter Prime", '\''), // escaped: same glyph as the next entry
+    ('\u{374}', "Greek Numeral Sign", '\''), // escaped: NFC folds U+0374 into U+02B9
+    ('ˊ', "Modifier Letter Acute Accent", '\''),
+    ('ˋ', "Modifier Letter Grave Accent", '\''),
+    ('˴', "Modifier Letter Middle Grave Accent", '\''),
+    ('ʻ', "Modifier Letter Turned Comma", '\''),
+    ('ʽ', "Modifier Letter Reversed Comma", '\''),
+    ('ʼ', "Modifier Letter Apostrophe", '\''),
+    ('ʾ', "Modifier Letter Right Half Ring", '\''),
+    ('ꞌ', "Latin Small Letter Saltillo", '\''),
+    ('י', "Hebrew Letter Yod", '\''),
+    ('ߴ', "Nko High Tone Apostrophe", '\''),
+    ('ߵ', "Nko Low Tone Apostrophe", '\''),
+    ('［', "Fullwidth Left Square Bracket", '('),
+    ('❨', "Medium Left Parenthesis Ornament", '('),
+    ('❲', "Light Left Tortoise Shell Bracket Ornament", '('),
+    ('〔', "Left Tortoise Shell Bracket", '('),
+    ('﴾', "Ornate Left Parenthesis", '('),
+    ('］', "Fullwidth Right Square Bracket", ')'),
+    ('❩', "Medium Right Parenthesis Ornament", ')'),
+    ('❳', "Light Right Tortoise Shell Bracket Ornament", ')'),
+    ('〕', "Right Tortoise Shell Bracket", ')'),
+    ('﴿', "Ornate Right Parenthesis", ')'),
+    ('❴', "Medium Left Curly Bracket Ornament", '{'),
+    ('❵', "Medium Right Curly Bracket Ornament", '}'),
+    ('⁎', "Low Asterisk", '*'),
+    ('٭', "Arabic Five Pointed Star", '*'),
+    ('∗', "Asterisk Operator", '*'),
+    ('᜵', "Philippine Single Punctuation", '/'),
+    ('⁁', "Caret Insertion Point", '/'),
+    ('∕', "Division Slash", '/'),
+    ('⁄', "Fraction Slash", '/'),
+    ('╱', "Box Drawings Light Diagonal Upper Right To Lower Left", '/'),
+    ('⟋', "Mathematical Rising Diagonal", '/'),
+    ('⧸', "Big Solidus", '/'),
+    ('㇓', "Cjk Stroke Sp", '/'),
+    ('〳', "Vertical Kana Repeat Mark Upper Half", '/'),
+    ('丿', "Cjk Unified Ideograph-4E3F", '/'),
+    ('⼃', "Kangxi Radical Slash", '/'),
+    ('＼', "Fullwidth Reverse Solidus", '\\'),
+    ('﹨', "Small Reverse Solidus", '\\'),
+    ('∖', "Set Minus", '\\'),
+    ('⟍', "Mathematical Falling Diagonal", '\\'),
+    ('⧵', "Reverse Solidus Operator", '\\'),
+    ('⧹', "Big Reverse Solidus", '\\'),
+    ('㇔', "Cjk Stroke D", '\\'),
+    ('丶', "Cjk Unified Ideograph-4E36", '\\'),
+    ('⼂', "Kangxi Radical Dot", '\\'),
+    ('ꝸ', "Latin Small Letter Um", '&'),
+    ('﬩', "Hebrew Letter Alternative Plus Sign", '+'),
+    ('‹', "Single Left-Pointing Angle Quotation Mark", '<'),
+    ('❮', "Heavy Left-Pointing Angle Quotation Mark Ornament", '<'),
+    ('˂', "Modifier Letter Left Arrowhead", '<'),
+    ('꓿', "Lisu Punctuation Full Stop", '='),
+    ('›', "Single Right-Pointing Angle Quotation Mark", '>'),
+    ('❯', "Heavy Right-Pointing Angle Quotation Mark Ornament", '>'),
+    ('˃', "Modifier Letter Right Arrowhead", '>'),
+    ('Ⲻ', "Coptic Capital Letter Dialect-P Ni", '-'),
+    ('Ɂ', "Latin Capital Letter Glottal Stop", '?'),
+    ('Ⳇ', "Coptic Capital Letter Old Coptic Esh", '/'), ];
+
+const ASCII_ARRAY: &'static [(char, &'static str)] = &[ // (ASCII char, name used in the help note)
+    ('_', "Underscore"),
+    ('-', "Minus/Hyphen"),
+    (',', "Comma"),
+    (';', "Semicolon"),
+    (':', "Colon"),
+    ('!', "Exclamation Mark"),
+    ('?', "Question Mark"),
+    ('.', "Period"),
+    ('\'', "Single Quote"),
+    ('(', "Left Parenthesis"),
+    (')', "Right Parenthesis"),
+    ('{', "Left Curly Brace"),
+    ('}', "Right Curly Brace"),
+    ('*', "Asterisk"),
+    ('/', "Slash"),
+    ('\\', "Backslash"),
+    ('&', "Ampersand"),
+    ('+', "Plus Sign"),
+    ('<', "Less-Than Sign"),
+    ('=', "Equals Sign"),
+    ('>', "Greater-Than Sign"), ];
+
+/// Emits a help note when `ch` is listed in `UNICODE_ARRAY` as an ASCII look-alike.
+///
+/// The note spans `reader.last_pos..reader.pos` and names both `ch` and the ASCII
+/// character it resembles. Does nothing for characters that are not in the table.
+pub fn check_for_substitution(reader: &StringReader, ch: char) {
+    if let Some(&(_, u_name, ascii_char)) = UNICODE_ARRAY.iter().find(|&&(c, _, _)| c == ch) {
+        let span = make_span(reader.last_pos, reader.pos);
+        match ASCII_ARRAY.iter().find(|&&(c, _)| c == ascii_char) {
+            Some(&(ascii_char, ascii_name)) => {
+                let msg =
+                    format!("unicode character '{}' ({}) looks much like '{}' ({}), but it's not",
+                            ch, u_name, ascii_char, ascii_name);
+                reader.help_span(span, &msg);
+            },
+            // A substitute with no `ASCII_ARRAY` entry is a table gap: report it, don't panic.
+            None => {
+                let msg = format!("substitution character not found for '{}'", ch);
+                reader.span_diagnostic.span_bug_no_panic(span, &msg);
+            }
+        }
+    }
+}
diff --git a/src/test/parse-fail/unicode-chars.rs b/src/test/parse-fail/unicode-chars.rs
@@ -0,0 +1,18 @@
+// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+// compile-flags: -Z parse-only
+// ignore-tidy-linelength
+
+fn main() { // the `let` below must end in U+037E (Greek Question Mark), not ASCII ';'
+    let y = 0;
+    //~^ ERROR unknown start of token: \u{37e}
+    //~^^ HELP unicode character ';' (Greek Question Mark) looks much like ';' (Semicolon), but it's not
+}

Original file line number	Diff line number	Diff line change
`@@ -174,6 +174,10 @@ impl SpanHandler {`
`174`	`174`	`self.handler.emit(Some((&self.cm, sp)), msg, Bug);`
`175`	`175`	`panic!(ExplicitBug);`
`176`	`176`	`}`
	`177`	`+ pub fn span_bug_no_panic(&self, sp: Span, msg: &str) {`
	`178`	`+ self.handler.emit(Some((&self.cm, sp)), msg, Bug);`
	`179`	`+ self.handler.bump_err_count();`
	`180`	`+ }`
`177`	`181`	`pub fn span_unimpl(&self, sp: Span, msg: &str) -> ! {`
`178`	`182`	`self.span_bug(sp, &format!("unimplemented {}", msg));`
`179`	`183`	`}`
Original file line number	Diff line number	Diff line change
`@@ -26,6 +26,7 @@ use std::rc::Rc;`
`26`	`26`	`pub use ext::tt::transcribe::{TtReader, new_tt_reader, new_tt_reader_with_doc_flag};`
`27`	`27`
`28`	`28`	`pub mod comments;`
	`29`	`+mod unicode_chars;`
`29`	`30`
`30`	`31`	`pub trait Reader {`
`31`	`32`	`fn is_eof(&self) -> bool;`
`@@ -1224,7 +1225,8 @@ impl<'a> StringReader<'a> {`
`1224`	`1225`	`c => {`
`1225`	`1226`	`let last_bpos = self.last_pos;`
`1226`	`1227`	`let bpos = self.pos;`
`1227`		`- panic!(self.fatal_span_char(last_bpos, bpos, "unknown start of token", c));`
	`1228`	`+ unicode_chars::check_for_substitution(&self, c);`
	`1229`	`+ panic!(self.fatal_span_char(last_bpos, bpos, "unknown start of token", c))`
`1228`	`1230`	`}`
`1229`	`1231`	`}`
`1230`	`1232`	`}`