Skip to content

Commit

Permalink
Detect confusing unicode characters and show the alternative
Browse files Browse the repository at this point in the history
  • Loading branch information
wafflespeanut committed Nov 17, 2015
1 parent 929ca3c commit 7f63c7c
Show file tree
Hide file tree
Showing 4 changed files with 211 additions and 1 deletion.
4 changes: 4 additions & 0 deletions src/libsyntax/diagnostic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,10 @@ impl SpanHandler {
self.handler.emit(Some((&self.cm, sp)), msg, Bug);
panic!(ExplicitBug);
}
/// Reports `msg` at span `sp` with the `Bug` level and bumps the handler's
/// error count.
///
/// Unlike `span_bug`, this method contains no `panic!` of its own, so
/// control returns to the caller after the diagnostic has been emitted.
pub fn span_bug_no_panic(&self, sp: Span, msg: &str) {
self.handler.emit(Some((&self.cm, sp)), msg, Bug);
// `emit` is followed by an explicit bump so the report is counted as an error.
self.handler.bump_err_count();
}
/// Reports an "unimplemented" bug at span `sp` and never returns.
///
/// `msg` is prefixed with `unimplemented ` and the result is handed to
/// `span_bug`, which performs the actual reporting.
pub fn span_unimpl(&self, sp: Span, msg: &str) -> ! {
    let text = format!("unimplemented {}", msg);
    self.span_bug(sp, &text);
}
Expand Down
4 changes: 3 additions & 1 deletion src/libsyntax/parse/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ use std::rc::Rc;
pub use ext::tt::transcribe::{TtReader, new_tt_reader, new_tt_reader_with_doc_flag};

pub mod comments;
mod unicode_chars;

pub trait Reader {
fn is_eof(&self) -> bool;
Expand Down Expand Up @@ -1224,7 +1225,8 @@ impl<'a> StringReader<'a> {
c => {
let last_bpos = self.last_pos;
let bpos = self.pos;
panic!(self.fatal_span_char(last_bpos, bpos, "unknown start of token", c));
unicode_chars::check_for_substitution(&self, c);
panic!(self.fatal_span_char(last_bpos, bpos, "unknown start of token", c))
}
}
}
Expand Down
186 changes: 186 additions & 0 deletions src/libsyntax/parse/lexer/unicode_chars.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// Characters and their corresponding confusables were collected from
// http://www.unicode.org/Public/security/revision-06/confusables.txt

use codemap::mk_sp as make_span;
use super::StringReader;

/// Table of confusable Unicode characters.
///
/// Each entry is `(confusable character, its Unicode name, ASCII look-alike)`.
/// The third field is the key used to find the look-alike's display name in
/// the companion ASCII table.
///
/// Characters that Unicode normalization or width folding turns into a
/// different code point (the fullwidth forms and the Greek characters with
/// singleton canonical decompositions) are spelled as `\u{...}` escapes.
/// Written literally they are silently replaced by their ASCII or base form
/// whenever the file passes through a normalizing tool, which leaves entries
/// that can never match and, for the apostrophe and backslash, char literals
/// that do not even parse.
const UNICODE_ARRAY: &'static [(char, &'static str, char)] = &[
    ('ߺ', "Nko Lajanyalan", '_'),
    ('﹍', "Dashed Low Line", '_'),
    ('﹎', "Centreline Low Line", '_'),
    ('﹏', "Wavy Low Line", '_'),
    ('‐', "Hyphen", '-'),
    ('‑', "Non-Breaking Hyphen", '-'),
    ('‒', "Figure Dash", '-'),
    ('–', "En Dash", '-'),
    ('﹘', "Small Em Dash", '-'),
    ('⁃', "Hyphen Bullet", '-'),
    ('˗', "Modifier Letter Minus Sign", '-'),
    ('−', "Minus Sign", '-'),
    ('٫', "Arabic Decimal Separator", ','),
    ('‚', "Single Low-9 Quotation Mark", ','),
    ('ꓹ', "Lisu Letter Tone Na Po", ','),
    // U+037E canonically decomposes to ';', so it must stay escaped.
    ('\u{37e}', "Greek Question Mark", ';'),
    ('ः', "Devanagari Sign Visarga", ':'),
    ('ઃ', "Gujarati Sign Visarga", ':'),
    ('\u{ff1a}', "Fullwidth Colon", ':'),
    ('։', "Armenian Full Stop", ':'),
    ('܃', "Syriac Supralinear Colon", ':'),
    ('܄', "Syriac Sublinear Colon", ':'),
    ('︰', "Presentation Form For Vertical Two Dot Leader", ':'),
    ('᠃', "Mongolian Full Stop", ':'),
    ('᠉', "Mongolian Manchu Full Stop", ':'),
    ('⁚', "Two Dot Punctuation", ':'),
    ('׃', "Hebrew Punctuation Sof Pasuq", ':'),
    ('˸', "Modifier Letter Raised Colon", ':'),
    ('꞉', "Modifier Letter Colon", ':'),
    ('∶', "Ratio", ':'),
    ('ː', "Modifier Letter Triangular Colon", ':'),
    ('ꓽ', "Lisu Letter Tone Mya Jeu", ':'),
    ('\u{ff01}', "Fullwidth Exclamation Mark", '!'),
    ('ǃ', "Latin Letter Retroflex Click", '!'),
    ('ʔ', "Latin Letter Glottal Stop", '?'),
    ('ॽ', "Devanagari Letter Glottal Stop", '?'),
    ('Ꭾ', "Cherokee Letter He", '?'),
    ('𝅭', "Musical Symbol Combining Augmentation Dot", '.'),
    ('․', "One Dot Leader", '.'),
    ('۔', "Arabic Full Stop", '.'),
    ('܁', "Syriac Supralinear Full Stop", '.'),
    ('܂', "Syriac Sublinear Full Stop", '.'),
    ('꘎', "Vai Full Stop", '.'),
    ('𐩐', "Kharoshthi Punctuation Dot", '.'),
    ('٠', "Arabic-Indic Digit Zero", '.'),
    ('۰', "Extended Arabic-Indic Digit Zero", '.'),
    ('ꓸ', "Lisu Letter Tone Mya Ti", '.'),
    ('՝', "Armenian Comma", '\''),
    ('\u{ff07}', "Fullwidth Apostrophe", '\''),
    ('‘', "Left Single Quotation Mark", '\''),
    ('’', "Right Single Quotation Mark", '\''),
    ('‛', "Single High-Reversed-9 Quotation Mark", '\''),
    ('′', "Prime", '\''),
    ('‵', "Reversed Prime", '\''),
    ('՚', "Armenian Apostrophe", '\''),
    ('׳', "Hebrew Punctuation Geresh", '\''),
    // U+1FEF canonically decomposes to '`' (U+0060).
    ('\u{1fef}', "Greek Varia", '\''),
    ('\u{ff40}', "Fullwidth Grave Accent", '\''),
    ('΄', "Greek Tonos", '\''),
    // U+1FFD canonically decomposes to U+00B4 (Acute Accent).
    ('\u{1ffd}', "Greek Oxia", '\''),
    ('᾽', "Greek Koronis", '\''),
    ('᾿', "Greek Psili", '\''),
    ('῾', "Greek Dasia", '\''),
    ('ʹ', "Modifier Letter Prime", '\''),
    // U+0374 canonically decomposes to U+02B9 (the entry just above).
    ('\u{374}', "Greek Numeral Sign", '\''),
    ('ˊ', "Modifier Letter Acute Accent", '\''),
    ('ˋ', "Modifier Letter Grave Accent", '\''),
    ('˴', "Modifier Letter Middle Grave Accent", '\''),
    ('ʻ', "Modifier Letter Turned Comma", '\''),
    ('ʽ', "Modifier Letter Reversed Comma", '\''),
    ('ʼ', "Modifier Letter Apostrophe", '\''),
    ('ʾ', "Modifier Letter Right Half Ring", '\''),
    ('ꞌ', "Latin Small Letter Saltillo", '\''),
    ('י', "Hebrew Letter Yod", '\''),
    ('ߴ', "Nko High Tone Apostrophe", '\''),
    ('ߵ', "Nko Low Tone Apostrophe", '\''),
    ('\u{ff3b}', "Fullwidth Left Square Bracket", '('),
    ('❨', "Medium Left Parenthesis Ornament", '('),
    ('❲', "Light Left Tortoise Shell Bracket Ornament", '('),
    ('〔', "Left Tortoise Shell Bracket", '('),
    ('﴾', "Ornate Left Parenthesis", '('),
    ('\u{ff3d}', "Fullwidth Right Square Bracket", ')'),
    ('❩', "Medium Right Parenthesis Ornament", ')'),
    ('❳', "Light Right Tortoise Shell Bracket Ornament", ')'),
    ('〕', "Right Tortoise Shell Bracket", ')'),
    ('﴿', "Ornate Right Parenthesis", ')'),
    ('❴', "Medium Left Curly Bracket Ornament", '{'),
    ('❵', "Medium Right Curly Bracket Ornament", '}'),
    ('⁎', "Low Asterisk", '*'),
    ('٭', "Arabic Five Pointed Star", '*'),
    ('∗', "Asterisk Operator", '*'),
    ('᜵', "Philippine Single Punctuation", '/'),
    ('⁁', "Caret Insertion Point", '/'),
    ('∕', "Division Slash", '/'),
    ('⁄', "Fraction Slash", '/'),
    ('╱', "Box Drawings Light Diagonal Upper Right To Lower Left", '/'),
    ('⟋', "Mathematical Rising Diagonal", '/'),
    ('⧸', "Big Solidus", '/'),
    ('㇓', "Cjk Stroke Sp", '/'),
    ('〳', "Vertical Kana Repeat Mark Upper Half", '/'),
    ('丿', "Cjk Unified Ideograph-4E3F", '/'),
    ('⼃', "Kangxi Radical Slash", '/'),
    ('\u{ff3c}', "Fullwidth Reverse Solidus", '\\'),
    ('﹨', "Small Reverse Solidus", '\\'),
    ('∖', "Set Minus", '\\'),
    ('⟍', "Mathematical Falling Diagonal", '\\'),
    ('⧵', "Reverse Solidus Operator", '\\'),
    ('⧹', "Big Reverse Solidus", '\\'),
    ('㇔', "Cjk Stroke D", '\\'),
    ('丶', "Cjk Unified Ideograph-4E36", '\\'),
    ('⼂', "Kangxi Radical Dot", '\\'),
    ('ꝸ', "Latin Small Letter Um", '&'),
    ('﬩', "Hebrew Letter Alternative Plus Sign", '+'),
    ('‹', "Single Left-Pointing Angle Quotation Mark", '<'),
    ('❮', "Heavy Left-Pointing Angle Quotation Mark Ornament", '<'),
    ('˂', "Modifier Letter Left Arrowhead", '<'),
    ('꓿', "Lisu Punctuation Full Stop", '='),
    ('›', "Single Right-Pointing Angle Quotation Mark", '>'),
    ('❯', "Heavy Right-Pointing Angle Quotation Mark Ornament", '>'),
    ('˃', "Modifier Letter Right Arrowhead", '>'),
    ('Ⲻ', "Coptic Capital Letter Dialect-P Ni", '-'),
    ('Ɂ', "Latin Capital Letter Glottal Stop", '?'),
    ('Ⳇ', "Coptic Capital Letter Old Coptic Esh", '/'),
];

/// Human-readable names for the ASCII characters that appear as look-alike
/// replacements, stored as `(character, display name)` pairs.
const ASCII_ARRAY: &'static [(char, &'static str)] = &[
    // Separators and sentence punctuation.
    ('_', "Underscore"), ('-', "Minus/Hyphen"), (',', "Comma"),
    (';', "Semicolon"), (':', "Colon"),
    ('!', "Exclamation Mark"), ('?', "Question Mark"), ('.', "Period"),
    ('\'', "Single Quote"),
    // Delimiters.
    ('(', "Left Parenthesis"), (')', "Right Parenthesis"),
    ('{', "Left Curly Brace"), ('}', "Right Curly Brace"),
    // Operators.
    ('*', "Asterisk"), ('/', "Slash"), ('\\', "Backslash"),
    ('&', "Ampersand"), ('+', "Plus Sign"),
    ('<', "Less-Than Sign"), ('=', "Equals Sign"), ('>', "Greater-Than Sign"),
];

/// Emits a hint when `ch` is a known Unicode look-alike of an ASCII character.
///
/// `ch` is looked up in `UNICODE_ARRAY`. If it is not listed, nothing is
/// reported. If it is listed, a help message naming both the Unicode
/// character and its ASCII counterpart is attached to the span running from
/// `reader.last_pos` to `reader.pos`. Should the ASCII counterpart have no
/// name in `ASCII_ARRAY`, a non-panicking bug report is emitted on that span
/// instead.
pub fn check_for_substitution(reader: &StringReader, ch: char) {
    // Bail out early for characters that are not in the confusables table.
    let confusable = UNICODE_ARRAY.iter().find(|&&(candidate, _, _)| candidate == ch);
    let (u_name, replacement) = match confusable {
        Some(&(_, name, ascii)) => (name, ascii),
        None => return,
    };

    let span = make_span(reader.last_pos, reader.pos);
    let named = ASCII_ARRAY.iter().find(|&&(candidate, _)| candidate == replacement);
    if let Some(&(ascii_char, ascii_name)) = named {
        let msg =
            format!("unicode character '{}' ({}) looks much like '{}' ({}), but it's not",
                    ch, u_name, ascii_char, ascii_name);
        reader.help_span(span, &msg);
    } else {
        // The two tables are out of sync: report it without aborting the lexer.
        let msg = format!("substitution character not found for '{}'", ch);
        reader.span_diagnostic.span_bug_no_panic(span, &msg);
    }
}
18 changes: 18 additions & 0 deletions src/test/parse-fail/unicode-chars.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// compile-flags: -Z parse-only
// ignore-tidy-linelength

// Parse-fail fixture: the lexer is expected to reject the statement below and
// to suggest the ASCII semicolon as a replacement.
fn main() {
// NOTE(review): the terminator of the next statement must be U+037E (GREEK
// QUESTION MARK), as the `\u{37e}` in the ERROR annotation expects. Verify it
// has not been normalized to an ASCII ';' by an editor or tool.
let y = 0;
//~^ ERROR unknown start of token: \u{37e}
//~^^ HELP unicode character ';' (Greek Question Mark) looks much like ';' (Semicolon), but it's not
}

0 comments on commit 7f63c7c

Please sign in to comment.