Skip to content

Commit

Permalink
Rollup merge of rust-lang#66670 - crlf0710:normalize_ident, r=estebank
Browse files Browse the repository at this point in the history
Normalize ident

Perform Unicode normalization (NFC) on identifiers. This resolves the first bullet point in rust-lang#55467.
  • Loading branch information
Centril authored Dec 19, 2019
2 parents 0de96d3 + 49f3bc9 commit 02206dc
Show file tree
Hide file tree
Showing 6 changed files with 32 additions and 5 deletions.
8 changes: 6 additions & 2 deletions Cargo.lock
Original file line number Diff line number Diff line change
Expand Up @@ -3770,6 +3770,7 @@ dependencies = [
"smallvec 1.0.0",
"syntax",
"syntax_pos",
"unicode-normalization",
]

[[package]]
Expand Down Expand Up @@ -4976,9 +4977,12 @@ dependencies = [

[[package]]
name = "unicode-normalization"
version = "0.1.7"
version = "0.1.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a0180bc61fc5a987082bfa111f4cc95c4caff7f9799f3e46df09163a937aa25"
checksum = "b561e267b2326bb4cebfc0ef9e68355c7abe6c6f522aeac2f5bf95d56c59bdcf"
dependencies = [
"smallvec 1.0.0",
]

[[package]]
name = "unicode-segmentation"
Expand Down
1 change: 1 addition & 0 deletions src/librustc_parse/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,4 @@ rustc_error_codes = { path = "../librustc_error_codes" }
smallvec = { version = "1.0", features = ["union", "may_dangle"] }
syntax_pos = { path = "../libsyntax_pos" }
syntax = { path = "../libsyntax" }
unicode-normalization = "0.1.11"
17 changes: 15 additions & 2 deletions src/librustc_parse/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -219,8 +219,7 @@ impl<'a> StringReader<'a> {
if is_raw_ident {
ident_start = ident_start + BytePos(2);
}
// FIXME: perform NFKC normalization here. (Issue #2253)
let sym = self.symbol_from(ident_start);
let sym = self.nfc_symbol_from(ident_start);
if is_raw_ident {
let span = self.mk_sp(start, self.pos);
if !sym.can_be_raw() {
Expand Down Expand Up @@ -465,6 +464,20 @@ impl<'a> StringReader<'a> {
Symbol::intern(self.str_from_to(start, end))
}

/// Interns the text obtained from `self.str_from(start)` as a `Symbol`, converting it to
/// Unicode Normalization Form C (NFC) first when the quick check cannot confirm that it is
/// already normalized.
fn nfc_symbol_from(&self, start: BytePos) -> Symbol {
    use unicode_normalization::{is_nfc_quick, IsNormalized, UnicodeNormalization};
    debug!("taking an normalized ident from {:?} to {:?}", start, self.pos);
    let raw = self.str_from(start);
    // Fast path: the quick check reports the text as already NFC, so the borrowed slice is
    // interned directly and no intermediate `String` is built.
    if let IsNormalized::Yes = is_nfc_quick(raw.chars()) {
        return Symbol::intern(raw);
    }
    // Any other quick-check answer: run the full NFC conversion and intern the result.
    let normalized = raw.chars().nfc().collect::<String>();
    Symbol::intern(&normalized)
}

/// Slice of the source text spanning from `start` up to but excluding `end`.
fn str_from_to(&self, start: BytePos, end: BytePos) -> &str
{
Expand Down
2 changes: 1 addition & 1 deletion src/test/ui/codemap_tests/unicode_2.stderr
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ LL | let _ = ("아あ", 1i42);
|
= help: valid widths are 8, 16, 32, 64 and 128

error[E0425]: cannot find value `a̐é` in this scope
error[E0425]: cannot find value `a̐é` in this scope
--> $DIR/unicode_2.rs:6:13
|
LL | let _ = a̐é;
Expand Down
8 changes: 8 additions & 0 deletions src/test/ui/rfc-2457/idents-normalized.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
// check-pass
#![feature(non_ascii_idents)]

// Unit struct used as the declaration site for this test; the trailing comment records
// which code point spells each `é` in the identifier on this line.
struct Résumé; // ['LATIN SMALL LETTER E WITH ACUTE']

fn main() {
// Uses the struct's name in the spelling described by the trailing comment (base letter
// followed by a combining accent). The file header marks this test `check-pass`, so this
// reference is expected to compile, i.e. to name the struct declared earlier in the file.
let _ = Résumé; // ['LATIN SMALL LETTER E', 'COMBINING ACUTE ACCENT']
}
1 change: 1 addition & 0 deletions src/tools/tidy/src/deps.rs
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ const WHITELIST: &[Crate<'_>] = &[
Crate("term_size"),
Crate("thread_local"),
Crate("ucd-util"),
Crate("unicode-normalization"),
Crate("unicode-width"),
Crate("unicode-xid"),
Crate("unreachable"),
Expand Down

0 comments on commit 02206dc

Please sign in to comment.