diff --git a/Cargo.lock b/Cargo.lock index 7465a4f9521..20834278e3a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -377,12 +377,9 @@ checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" [[package]] name = "log" -version = "0.4.16" +version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6389c490849ff5bc16be905ae24bc913a9c8892e19b2341dbc175e14c341c2b8" -dependencies = [ - "cfg-if", -] +checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" [[package]] name = "matchers" @@ -551,6 +548,7 @@ dependencies = [ "toml", "tracing", "tracing-subscriber", + "unicode-normalization", "unicode-properties", "unicode-segmentation", "unicode-width", @@ -698,6 +696,21 @@ dependencies = [ "once_cell", ] +[[package]] +name = "tinyvec" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + [[package]] name = "toml" version = "0.7.4" @@ -767,20 +780,20 @@ dependencies = [ [[package]] name = "tracing-log" -version = "0.1.3" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78ddad33d2d10b1ed7eb9d1f518a5674713876e97e5bb9b7345a7984fbb4f922" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" dependencies = [ - "lazy_static", "log", + "once_cell", "tracing-core", ] [[package]] name = "tracing-subscriber" -version = "0.3.17" +version = "0.3.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77" +checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b" dependencies = [ "matchers", "nu-ansi-term", @@ -800,6 +813,15 @@ version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4" +[[package]] +name = "unicode-normalization" +version = "0.1.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" +dependencies = [ + "tinyvec", +] + [[package]] name = "unicode-properties" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index bcd3b420acb..75826ce0c63 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,9 +53,12 @@ thiserror = "1.0.40" toml = "0.7.4" tracing = "0.1.37" tracing-subscriber = { version = "0.3.17", features = ["env-filter"] } +unicode-normalization = "0.1.22" +unicode-properties = { version = "0.1", default-features = false, features = [ + "general-category", +] } unicode-segmentation = "1.9" unicode-width = "0.1" -unicode-properties = { version = "0.1", default-features = false, features = ["general-category"] } rustfmt-config_proc_macro = { version = "0.3", path = "config_proc_macro" } diff --git a/Configurations.md b/Configurations.md index 2d01fb3bb3b..0372a66569e 100644 --- a/Configurations.md +++ b/Configurations.md @@ -1069,6 +1069,14 @@ Number of lines to check for a `@generated` pragma header, starting from the top See also [format_generated_files](#format_generated_files) link here. +## `nfc_normalize_idents` + +Whether to normalize identifiers with Unicode Normalization Form C (NFC). The compiler considers identifiers with identical NFC normalizations to be interchangeable. + +- **Default value**: `false` +- **Possible values**: `true`, `false` +- **Stable**: No + ## `format_macro_matchers` Format the metavariable matching patterns in macros. diff --git a/src/config/mod.rs b/src/config/mod.rs index 9484b2e5829..1098d677c20 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -153,6 +153,8 @@ create_config! { format_generated_files: bool, true, false, "Format generated files"; generated_marker_line_search_limit: usize, 5, false, "Number of lines to check for a \ `@generated` marker when `format_generated_files` is enabled"; + nfc_normalize_idents: bool, false, false, "Whether to normalize identifiers \ + to Unicode Normalization Form C"; // Options that can change the source code beyond whitespace/blocks (somewhat linty things) merge_derives: bool, true, true, "Merge multiple `#[derive(...)]` into a single one"; @@ -683,6 +685,7 @@ version = "One" inline_attribute_width = 0 format_generated_files = true generated_marker_line_search_limit = 5 +nfc_normalize_idents = false merge_derives = true use_try_shorthand = false use_field_init_shorthand = false diff --git a/src/expr.rs b/src/expr.rs index 7808f891336..147124ddde7 100644 --- a/src/expr.rs +++ b/src/expr.rs @@ -32,7 +32,7 @@ use crate::types::{rewrite_path, PathContext}; use crate::utils::{ colon_spaces, contains_skip, count_newlines, filtered_str_fits, first_line_ends_with, inner_attributes, last_line_extendable, last_line_width, mk_sp, outer_attributes, - semicolon_for_expr, unicode_str_width, wrap_str, + rewrite_ident, semicolon_for_expr, unicode_str_width, wrap_str, }; use crate::vertical::rewrite_with_alignment; use crate::visitor::FmtVisitor; @@ -1754,9 +1754,9 @@ pub(crate) fn rewrite_field( if !attrs_str.is_empty() { attrs_str.push_str(&shape.indent.to_string_with_newline(context.config)); }; - let name = context.snippet(field.ident.span); + let name = rewrite_ident(context, field.ident); if field.is_shorthand { - Some(attrs_str + name) + Some(attrs_str + &name) } else { let mut separator = String::from(struct_lit_field_separator(context.config)); for _ in 0..prefix_max_width.saturating_sub(name.len()) { @@ -1770,7 +1770,7 @@ pub(crate) fn rewrite_field( Some(ref e) if !is_lit && e.as_str() == name && context.config.use_field_init_shorthand() => { - Some(attrs_str + name) + Some(attrs_str + &name) } Some(e) => Some(format!("{attrs_str}{name}{separator}{e}")), None => { diff --git a/src/imports.rs b/src/imports.rs index 05195553c08..f2095392720 100644 --- a/src/imports.rs +++ b/src/imports.rs @@ -185,7 +185,7 @@ impl UseSegment { if name.is_empty() || name == "{{root}}" { return None; } - let kind = match name { + let kind = match &*name { "self" => UseSegmentKind::Slf(None), "super" => UseSegmentKind::Super(None), "crate" => UseSegmentKind::Crate(None), @@ -498,7 +498,7 @@ impl UseTree { let name = if a.prefix.segments.len() == 2 && leading_modsep { context.snippet(a.prefix.span).to_owned() } else { - rewrite_ident(context, path_to_imported_ident(&a.prefix)).to_owned() + rewrite_ident(context, path_to_imported_ident(&a.prefix)).into_owned() }; let alias = rename.and_then(|ident| { if ident.name == sym::underscore_imports { @@ -507,7 +507,7 @@ impl UseTree { } else if ident == path_to_imported_ident(&a.prefix) { None } else { - Some(rewrite_ident(context, ident).to_owned()) + Some(rewrite_ident(context, ident).into_owned()) } }); let kind = match name.as_ref() { diff --git a/src/items.rs b/src/items.rs index e7ff5ff818b..01672002633 100644 --- a/src/items.rs +++ b/src/items.rs @@ -679,7 +679,7 @@ impl<'a> FmtVisitor<'a> { self.block_indent, Some(one_line_width), )?, - ast::VariantData::Unit(..) => rewrite_ident(&context, field.ident).to_owned(), + ast::VariantData::Unit(..) => rewrite_ident(&context, field.ident).into_owned(), }; let variant_body = if let Some(ref expr) = field.disr_expr { @@ -1160,8 +1160,12 @@ pub(crate) fn format_trait( let body_lo = context.snippet_provider.span_after(item.span, "{"); let shape = Shape::indented(offset, context.config).offset_left(result.len())?; - let generics_str = - rewrite_generics(context, rewrite_ident(context, item.ident), generics, shape)?; + let generics_str = rewrite_generics( + context, + &rewrite_ident(context, item.ident), + generics, + shape, + )?; result.push_str(&generics_str); // FIXME(#2055): rustfmt fails to format when there are comments between trait bounds. @@ -1356,7 +1360,7 @@ pub(crate) fn format_trait_alias( let alias = rewrite_ident(context, ident); // 6 = "trait ", 2 = " =" let g_shape = shape.offset_left(6)?.sub_width(2)?; - let generics_str = rewrite_generics(context, alias, generics, g_shape)?; + let generics_str = rewrite_generics(context, &alias, generics, g_shape)?; let vis_str = format_visibility(context, vis); let lhs = format!("{vis_str}trait {generics_str} ="); // 1 = ";" @@ -1749,13 +1753,13 @@ fn rewrite_ty( let ident_str = rewrite_ident(context, ident); if generics.params.is_empty() { - result.push_str(ident_str) + result.push_str(&ident_str) } else { // 2 = `= ` let g_shape = Shape::indented(indent, context.config) .offset_left(result.len())? .sub_width(2)?; - let generics_str = rewrite_generics(context, ident_str, generics, g_shape)?; + let generics_str = rewrite_generics(context, &ident_str, generics, g_shape)?; result.push_str(&generics_str); } @@ -2362,7 +2366,7 @@ fn rewrite_fn_base( let fd = fn_sig.decl; let generics_str = rewrite_generics( context, - rewrite_ident(context, ident), + &rewrite_ident(context, ident), &fn_sig.generics, shape, )?; @@ -3188,7 +3192,7 @@ fn format_header( } } - result.push_str(rewrite_ident(context, ident)); + result.push_str(&rewrite_ident(context, ident)); result } @@ -3438,7 +3442,7 @@ pub(crate) fn rewrite_mod( let mut result = String::with_capacity(32); result.push_str(&*format_visibility(context, &item.vis)); result.push_str("mod "); - result.push_str(rewrite_ident(context, item.ident)); + result.push_str(&rewrite_ident(context, item.ident)); result.push(';'); rewrite_attrs(context, item, &result, attrs_shape) } diff --git a/src/lib.rs b/src/lib.rs index a67adb1478f..a2ef4d34ecd 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -17,6 +17,7 @@ extern crate rustc_builtin_macros; extern crate rustc_data_structures; extern crate rustc_errors; extern crate rustc_expand; +extern crate rustc_lexer; extern crate rustc_parse; extern crate rustc_session; extern crate rustc_span; diff --git a/src/macros.rs b/src/macros.rs index 6e114c76f26..092b988f9b7 100644 --- a/src/macros.rs +++ b/src/macros.rs @@ -36,7 +36,7 @@ use crate::shape::{Indent, Shape}; use crate::source_map::SpanUtils; use crate::spanned::Spanned; use crate::utils::{ - filtered_str_fits, format_visibility, indent_next_line, is_empty_line, mk_sp, + filtered_str_fits, format_visibility, indent_next_line, is_empty_line, mk_sp, nfc_normalize, remove_trailing_white_spaces, rewrite_ident, trim_left_preserve_layout, NodeIdExt, }; use crate::visitor::FmtVisitor; @@ -284,7 +284,7 @@ fn rewrite_macro_inner( }, ) .map(|rw| match position { - MacroPosition::Item => format!("{};", rw), + MacroPosition::Item => format!("{rw};"), _ => rw, }) } @@ -425,7 +425,7 @@ pub(crate) fn rewrite_macro_def( }; result += " "; - result += rewrite_ident(context, ident); + result += &rewrite_ident(context, ident); let multi_branch_style = def.macro_rules || parsed_def.branches.len() != 1; @@ -490,6 +490,7 @@ pub(crate) fn rewrite_macro_def( } fn register_metavariable( + context: &RewriteContext<'_>, map: &mut HashMap, result: &mut String, name: &str, @@ -502,6 +503,10 @@ fn register_metavariable( new_name.push_str(name); old_name.push_str(name); + // `$` is `NFC_Inert`, so won't get mangled + let new_name = nfc_normalize(context, &new_name).into_owned(); + let old_name = nfc_normalize(context, &old_name).into_owned(); + result.push_str(&new_name); map.insert(old_name, new_name); } @@ -509,7 +514,10 @@ fn register_metavariable( // Replaces `$foo` with `zfoo`. We must check for name overlap to ensure we // aren't causing problems. // This should also work for escaped `$` variables, where we leave earlier `$`s. -fn replace_names(input: &str) -> Option<(String, HashMap)> { +fn replace_names( + context: &RewriteContext<'_>, + input: &str, +) -> Option<(String, HashMap)> { // Each substitution will require five or six extra bytes. let mut result = String::with_capacity(input.len() + 64); let mut substs = HashMap::new(); @@ -523,9 +531,9 @@ fn replace_names(input: &str) -> Option<(String, HashMap)> { dollar_count += 1; } else if dollar_count == 0 { result.push(c); - } else if !c.is_alphanumeric() && !cur_name.is_empty() { + } else if !rustc_lexer::is_id_continue(c) && !cur_name.is_empty() { // Terminates a name following one or more dollars. - register_metavariable(&mut substs, &mut result, &cur_name, dollar_count); + register_metavariable(context, &mut substs, &mut result, &cur_name, dollar_count); result.push(c); dollar_count = 0; @@ -533,13 +541,13 @@ fn replace_names(input: &str) -> Option<(String, HashMap)> { } else if c == '(' && cur_name.is_empty() { // FIXME: Support macro def with repeat. return None; - } else if c.is_alphanumeric() || c == '_' { + } else if rustc_lexer::is_id_continue(c) { cur_name.push(c); } } if !cur_name.is_empty() { - register_metavariable(&mut substs, &mut result, &cur_name, dollar_count); + register_metavariable(context, &mut substs, &mut result, &cur_name, dollar_count); } debug!("replace_names `{}` {:?}", result, substs); @@ -655,7 +663,9 @@ impl MacroArgKind { }; match *self { - MacroArgKind::MetaVariable(ty, ref name) => Some(format!("${name}:{ty}")), + MacroArgKind::MetaVariable(ty, ref name) => { + Some(format!("${}:{ty}", nfc_normalize(context, name))) + } MacroArgKind::Repeat(delim_tok, ref args, ref another, ref tok) => { let (lhs, inner, rhs) = rewrite_delimited_inner(delim_tok, args)?; let another = another @@ -1273,7 +1283,7 @@ impl MacroBranch { // `$$`). We'll try and format like an AST node, but we'll substitute // variables for new names with the same length first. - let (body_str, substs) = replace_names(old_body)?; + let (body_str, substs) = replace_names(context, old_body)?; let mut config = context.config.clone(); config.set().show_parse_errors(false); diff --git a/src/patterns.rs b/src/patterns.rs index 0fa6edaa5d7..1e086aa01d2 100644 --- a/src/patterns.rs +++ b/src/patterns.rs @@ -162,14 +162,14 @@ impl Rewrite for Pat { let hi = context.snippet_provider.span_before(self.span, "@"); combine_strs_with_missing_comments( context, - id_str, + &id_str, &sub_pat, mk_sp(ident.span.hi(), hi), shape, true, )? } else { - id_str.to_owned() + id_str.into_owned() }; combine_strs_with_missing_comments( diff --git a/src/types.rs b/src/types.rs index cd2582e66be..79fd274c1fe 100644 --- a/src/types.rs +++ b/src/types.rs @@ -180,7 +180,7 @@ impl Rewrite for ast::AssocConstraint { use ast::AssocConstraintKind::{Bound, Equality}; let mut result = String::with_capacity(128); - result.push_str(rewrite_ident(context, self.ident)); + result.push_str(&rewrite_ident(context, self.ident)); if let Some(ref gen_args) = self.gen_args { let budget = shape.width.checked_sub(result.len())?; @@ -236,7 +236,7 @@ fn rewrite_segment( shape: Shape, ) -> Option { let mut result = String::with_capacity(128); - result.push_str(rewrite_ident(context, segment.ident)); + result.push_str(&rewrite_ident(context, segment.ident)); let ident_len = result.len(); let shape = if context.use_block_indent() { @@ -530,7 +530,7 @@ impl Rewrite for ast::AnonConst { impl Rewrite for ast::Lifetime { fn rewrite(&self, context: &RewriteContext<'_>, _: Shape) -> Option { - Some(rewrite_ident(context, self.ident).to_owned()) + Some(rewrite_ident(context, self.ident).into_owned()) } } @@ -581,7 +581,7 @@ impl Rewrite for ast::GenericParam { } = &self.kind { param.push_str("const "); - param.push_str(rewrite_ident(context, self.ident)); + param.push_str(&rewrite_ident(context, self.ident)); param.push_str(": "); param.push_str(&ty.rewrite(context, shape)?); if let Some(default) = default { @@ -596,7 +596,7 @@ impl Rewrite for ast::GenericParam { } kw_span.lo() } else { - param.push_str(rewrite_ident(context, self.ident)); + param.push_str(&rewrite_ident(context, self.ident)); self.ident.span.lo() }; diff --git a/src/utils.rs b/src/utils.rs index 642b6603b1e..2ed90b3623b 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -24,8 +24,26 @@ pub(crate) fn skip_annotation() -> Symbol { Symbol::intern("rustfmt::skip") } -pub(crate) fn rewrite_ident<'a>(context: &'a RewriteContext<'_>, ident: symbol::Ident) -> &'a str { - context.snippet(ident.span) +pub(crate) fn rewrite_ident<'a>( + context: &'a RewriteContext<'a>, + ident: symbol::Ident, +) -> Cow<'a, str> { + nfc_normalize(context, context.snippet(ident.span)) +} + +pub(crate) fn nfc_normalize<'r, 's>(context: &'r RewriteContext<'r>, s: &'s str) -> Cow<'s, str> { + use unicode_normalization::{is_nfc_quick, IsNormalized, UnicodeNormalization}; + + if context.config.nfc_normalize_idents() && is_nfc_quick(s.chars()) != IsNormalized::Yes { + let normalized_str: String = s.chars().nfc().collect(); + if s == normalized_str { + Cow::Borrowed(s) + } else { + Cow::Owned(normalized_str) + } + } else { + Cow::Borrowed(s) + } } // Computes the length of a string's last line, minus offset. diff --git a/src/vertical.rs b/src/vertical.rs index a06bc995aa5..ebbfffb152c 100644 --- a/src/vertical.rs +++ b/src/vertical.rs @@ -92,7 +92,7 @@ impl AlignedItem for ast::ExprField { combine_strs_with_missing_comments( context, &attrs_str, - name, + &name, missing_span, shape, is_attributes_extendable(&attrs_str), diff --git a/src/visitor.rs b/src/visitor.rs index 61e147ed8f5..3e649c7e96f 100644 --- a/src/visitor.rs +++ b/src/visitor.rs @@ -924,8 +924,8 @@ impl<'b, 'a: 'b> FmtVisitor<'a> { self.push_str(&*vis_str); self.push_str(format_unsafety(unsafety)); self.push_str("mod "); - // Calling `to_owned()` to work around borrow checker. - let ident_str = rewrite_ident(&self.get_context(), ident).to_owned(); + // Calling `into_owned()` to work around borrow checker. + let ident_str = rewrite_ident(&self.get_context(), ident).into_owned(); self.push_str(&ident_str); if let ast::ModKind::Loaded(ref items, ast::Inline::Yes, ref spans) = mod_kind { diff --git a/tests/source/configs/nfc_normalize_idents/true.rs b/tests/source/configs/nfc_normalize_idents/true.rs new file mode 100644 index 00000000000..2804d7dcd9e --- /dev/null +++ b/tests/source/configs/nfc_normalize_idents/true.rs @@ -0,0 +1,39 @@ +// rustfmt-nfc_normalize_idents: true +// rustfmt-format_macro_matchers: true +// Normalize identifiers to NFC. + +// Accents in source file are U+0301 COMBINING ACUTE ACCENT, +// in target they are precomposed characters. + +struct Foó { + pub foó: Option>, +} + +const FOÓ: Foó = Foó { foó: None }; + +macro_rules! foó { + (foó $foó:ident) => { + $foó + }; +} + +fn foó>(foó: Foó) -> Foó { + // FIXME: some macro invocations, like this one, don't get normalized + let foó: Foó = foó!(foó foó); + match foó { + Foó { foó: foó } if foó == foó => *foó.unwrap(), + } +} + +mod foó { + use super::Foó; + + trait Foó: Foó + where + Self: Foó, + { + type Foó: Foó; + + const FOÓ: Foó; + } +} diff --git a/tests/target/configs/nfc_normalize_idents/true.rs b/tests/target/configs/nfc_normalize_idents/true.rs new file mode 100644 index 00000000000..23b45455d03 --- /dev/null +++ b/tests/target/configs/nfc_normalize_idents/true.rs @@ -0,0 +1,39 @@ +// rustfmt-nfc_normalize_idents: true +// rustfmt-format_macro_matchers: true +// Normalize identifiers to NFC. + +// Accents in source file are U+0301 COMBINING ACUTE ACCENT, +// in target they are precomposed characters. + +struct Foó { + pub foó: Option>, +} + +const FOÓ: Foó = Foó { foó: None }; + +macro_rules! foó { + (foó $foó:ident) => { + $foó + }; +} + +fn foó>(foó: Foó) -> Foó { + // FIXME: some macro invocations, like this one, don't get normalized + let foó: Foó = foó!(foó foó); + match foó { + Foó { foó: foó } if foó == foó => *foó.unwrap(), + } +} + +mod foó { + use super::Foó; + + trait Foó: Foó + where + Self: Foó, + { + type Foó: Foó; + + const FOÓ: Foó; + } +}