From 2d9f0e2c50ff6131643fd0b2d5a9f65a7006f50c Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Wed, 3 Nov 2021 23:37:23 +0100 Subject: [PATCH 1/5] Optimize bidi character detection. --- compiler/rustc_parse/src/lexer/mod.rs | 45 +++++++++++++++++++++++---- compiler/rustc_parse/src/lib.rs | 1 + 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index 09a3d1b902831..21d0ee60cdade 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -137,12 +137,45 @@ impl<'a> StringReader<'a> { // Opening delimiter of the length 2 is not included into the comment text. let content_start = start + BytePos(2); let content = self.str_from(content_start); - let span = self.mk_sp(start, self.pos); - const UNICODE_TEXT_FLOW_CHARS: &[char] = &[ - '\u{202A}', '\u{202B}', '\u{202D}', '\u{202E}', '\u{2066}', '\u{2067}', '\u{2068}', - '\u{202C}', '\u{2069}', - ]; - if content.contains(UNICODE_TEXT_FLOW_CHARS) { + + // Char - UTF-8 + // U+202A - E2 80 AA + // U+202B - E2 80 AB + // U+202C - E2 80 AC + // U+202D - E2 80 AD + // U+202E - E2 80 AE + // U+2066 - E2 81 A6 + // U+2067 - E2 81 A7 + // U+2068 - E2 81 A8 + // U+2069 - E2 81 A9 + let mut bytes = content.as_bytes(); + let contains_flow_control_chars = loop { + match core::slice::memchr::memchr(0xE2, &bytes) { + Some(idx) => { + // bytes are valid UTF-8 -> E2 must be followed by two bytes + match bytes[idx + 1] { + 0x80 => { + if (0xAA..=0xAE).contains(&bytes[idx + 2]) { + break true; + } + } + 0x81 => { + if (0xA6..=0xA9).contains(&bytes[idx + 2]) { + break true; + } + } + _ => {} + } + bytes = &bytes[idx + 3..]; + } + None => { + break false; + } + } + }; + + if contains_flow_control_chars { + let span = self.mk_sp(start, self.pos); self.sess.buffer_lint_with_diagnostic( &TEXT_DIRECTION_CODEPOINT_IN_COMMENT, span, diff --git a/compiler/rustc_parse/src/lib.rs b/compiler/rustc_parse/src/lib.rs index a40f47f895bbe..063b0183a8fd5 100644 --- a/compiler/rustc_parse/src/lib.rs +++ b/compiler/rustc_parse/src/lib.rs @@ -4,6 +4,7 @@ #![feature(crate_visibility_modifier)] #![feature(if_let_guard)] #![feature(box_patterns)] +#![feature(slice_internals)] #![recursion_limit = "256"] #[macro_use] From a5b25a2cfa1adb52723fa4a5b458dd9d6272117a Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Thu, 4 Nov 2021 17:03:13 +0100 Subject: [PATCH 2/5] Create subslice as that leads to a smaller code size. --- compiler/rustc_parse/src/lexer/mod.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index 21d0ee60cdade..c0f2863d01cd7 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -153,14 +153,15 @@ impl<'a> StringReader<'a> { match core::slice::memchr::memchr(0xE2, &bytes) { Some(idx) => { // bytes are valid UTF-8 -> E2 must be followed by two bytes - match bytes[idx + 1] { + let ch = &bytes[idx..idx + 3]; + match ch[1] { 0x80 => { - if (0xAA..=0xAE).contains(&bytes[idx + 2]) { + if (0xAA..=0xAE).contains(&ch[2]) { break true; } } 0x81 => { - if (0xA6..=0xA9).contains(&bytes[idx + 2]) { + if (0xA6..=0xA9).contains(&ch[2]) { break true; } } From 7885233df01abf51d2947b6b466a17a1843b2a60 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Thu, 4 Nov 2021 23:31:42 +0100 Subject: [PATCH 3/5] Optimize literal, doc comment lint as well, extract function. --- compiler/rustc_ast/src/lib.rs | 2 + compiler/rustc_ast/src/util/unicode.rs | 44 +++++++++++++++++++ compiler/rustc_lint/src/context.rs | 4 +- .../src/hidden_unicode_codepoints.rs | 12 ++--- compiler/rustc_parse/src/lexer/mod.rs | 41 +---------------- 5 files changed, 54 insertions(+), 49 deletions(-) create mode 100644 compiler/rustc_ast/src/util/unicode.rs diff --git a/compiler/rustc_ast/src/lib.rs b/compiler/rustc_ast/src/lib.rs index e3c610585d978..8a8dc44489b36 100644 --- a/compiler/rustc_ast/src/lib.rs +++ b/compiler/rustc_ast/src/lib.rs @@ -16,6 +16,7 @@ #![feature(nll)] #![feature(min_specialization)] #![recursion_limit = "256"] +#![feature(slice_internals)] #[macro_use] extern crate rustc_macros; @@ -25,6 +26,7 @@ pub mod util { pub mod comments; pub mod literal; pub mod parser; + pub mod unicode; } pub mod ast; diff --git a/compiler/rustc_ast/src/util/unicode.rs b/compiler/rustc_ast/src/util/unicode.rs new file mode 100644 index 0000000000000..ad73d6e4fe205 --- /dev/null +++ b/compiler/rustc_ast/src/util/unicode.rs @@ -0,0 +1,44 @@ +pub const TEXT_FLOW_CONTROL_CHARS: &[char] = &[ + '\u{202A}', '\u{202B}', '\u{202D}', '\u{202E}', '\u{2066}', '\u{2067}', '\u{2068}', '\u{202C}', + '\u{2069}', +]; + +#[inline] +pub fn contains_text_flow_control_chars(s: &str) -> bool { + // Char - UTF-8 + // U+202A - E2 80 AA + // U+202B - E2 80 AB + // U+202C - E2 80 AC + // U+202D - E2 80 AD + // U+202E - E2 80 AE + // U+2066 - E2 81 A6 + // U+2067 - E2 81 A7 + // U+2068 - E2 81 A8 + // U+2069 - E2 81 A9 + let mut bytes = s.as_bytes(); + loop { + match core::slice::memchr::memchr(0xE2, &bytes) { + Some(idx) => { + // bytes are valid UTF-8 -> E2 must be followed by two bytes + let ch = &bytes[idx..idx + 3]; + match ch[1] { + 0x80 => { + if (0xAA..=0xAE).contains(&ch[2]) { + break true; + } + } + 0x81 => { + if (0xA6..=0xA9).contains(&ch[2]) { + break true; + } + } + _ => {} + } + bytes = &bytes[idx + 3..]; + } + None => { + break false; + } + } + } +} diff --git a/compiler/rustc_lint/src/context.rs b/compiler/rustc_lint/src/context.rs index 6fd0a5b95f9f6..4c936dec6f2cd 100644 --- a/compiler/rustc_lint/src/context.rs +++ b/compiler/rustc_lint/src/context.rs @@ -16,9 +16,9 @@ use self::TargetLint::*; -use crate::hidden_unicode_codepoints::UNICODE_TEXT_FLOW_CHARS; use crate::levels::{is_known_lint_tool, LintLevelsBuilder}; use crate::passes::{EarlyLintPassObject, LateLintPassObject}; +use ast::util::unicode::TEXT_FLOW_CONTROL_CHARS; use rustc_ast as ast; use rustc_data_structures::fx::FxHashMap; use rustc_data_structures::sync; @@ -602,7 +602,7 @@ pub trait LintContext: Sized { let spans: Vec<_> = content .char_indices() .filter_map(|(i, c)| { - UNICODE_TEXT_FLOW_CHARS.contains(&c).then(|| { + TEXT_FLOW_CONTROL_CHARS.contains(&c).then(|| { let lo = span.lo() + BytePos(2 + i as u32); (c, span.with_lo(lo).with_hi(lo + BytePos(c.len_utf8() as u32))) }) diff --git a/compiler/rustc_lint/src/hidden_unicode_codepoints.rs b/compiler/rustc_lint/src/hidden_unicode_codepoints.rs index 1bcdcb806fc43..fde84be9a7c30 100644 --- a/compiler/rustc_lint/src/hidden_unicode_codepoints.rs +++ b/compiler/rustc_lint/src/hidden_unicode_codepoints.rs @@ -1,4 +1,5 @@ use crate::{EarlyContext, EarlyLintPass, LintContext}; +use ast::util::unicode::{contains_text_flow_control_chars, TEXT_FLOW_CONTROL_CHARS}; use rustc_ast as ast; use rustc_errors::{Applicability, SuggestionStyle}; use rustc_span::{BytePos, Span, Symbol}; @@ -37,11 +38,6 @@ declare_lint! { declare_lint_pass!(HiddenUnicodeCodepoints => [TEXT_DIRECTION_CODEPOINT_IN_LITERAL]); -crate const UNICODE_TEXT_FLOW_CHARS: &[char] = &[ - '\u{202A}', '\u{202B}', '\u{202D}', '\u{202E}', '\u{2066}', '\u{2067}', '\u{2068}', '\u{202C}', - '\u{2069}', -]; - impl HiddenUnicodeCodepoints { fn lint_text_direction_codepoint( &self, @@ -57,7 +53,7 @@ impl HiddenUnicodeCodepoints { .as_str() .char_indices() .filter_map(|(i, c)| { - UNICODE_TEXT_FLOW_CHARS.contains(&c).then(|| { + TEXT_FLOW_CONTROL_CHARS.contains(&c).then(|| { let lo = span.lo() + BytePos(i as u32 + padding); (c, span.with_lo(lo).with_hi(lo + BytePos(c.len_utf8() as u32))) }) @@ -131,7 +127,7 @@ impl HiddenUnicodeCodepoints { impl EarlyLintPass for HiddenUnicodeCodepoints { fn check_attribute(&mut self, cx: &EarlyContext<'_>, attr: &ast::Attribute) { if let ast::AttrKind::DocComment(_, comment) = attr.kind { - if comment.as_str().contains(UNICODE_TEXT_FLOW_CHARS) { + if contains_text_flow_control_chars(&comment.as_str()) { self.lint_text_direction_codepoint(cx, comment, attr.span, 0, false, "doc comment"); } } @@ -142,7 +138,7 @@ impl EarlyLintPass for HiddenUnicodeCodepoints { let (text, span, padding) = match &expr.kind { ast::ExprKind::Lit(ast::Lit { token, kind, span }) => { let text = token.symbol; - if !text.as_str().contains(UNICODE_TEXT_FLOW_CHARS) { + if !contains_text_flow_control_chars(&text.as_str()) { return; } let padding = match kind { diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index c0f2863d01cd7..cf35c3cd53b2f 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -1,6 +1,7 @@ use rustc_ast::ast::{self, AttrStyle}; use rustc_ast::token::{self, CommentKind, Token, TokenKind}; use rustc_ast::tokenstream::{Spacing, TokenStream}; +use rustc_ast::util::unicode::contains_text_flow_control_chars; use rustc_errors::{error_code, Applicability, DiagnosticBuilder, FatalError, PResult}; use rustc_lexer::unescape::{self, Mode}; use rustc_lexer::{Base, DocStyle, RawStrError}; @@ -137,45 +138,7 @@ impl<'a> StringReader<'a> { // Opening delimiter of the length 2 is not included into the comment text. let content_start = start + BytePos(2); let content = self.str_from(content_start); - - // Char - UTF-8 - // U+202A - E2 80 AA - // U+202B - E2 80 AB - // U+202C - E2 80 AC - // U+202D - E2 80 AD - // U+202E - E2 80 AE - // U+2066 - E2 81 A6 - // U+2067 - E2 81 A7 - // U+2068 - E2 81 A8 - // U+2069 - E2 81 A9 - let mut bytes = content.as_bytes(); - let contains_flow_control_chars = loop { - match core::slice::memchr::memchr(0xE2, &bytes) { - Some(idx) => { - // bytes are valid UTF-8 -> E2 must be followed by two bytes - let ch = &bytes[idx..idx + 3]; - match ch[1] { - 0x80 => { - if (0xAA..=0xAE).contains(&ch[2]) { - break true; - } - } - 0x81 => { - if (0xA6..=0xA9).contains(&ch[2]) { - break true; - } - } - _ => {} - } - bytes = &bytes[idx + 3..]; - } - None => { - break false; - } - } - }; - - if contains_flow_control_chars { + if contains_text_flow_control_chars(content) { let span = self.mk_sp(start, self.pos); self.sess.buffer_lint_with_diagnostic( &TEXT_DIRECTION_CODEPOINT_IN_COMMENT, From e339e4789f919d27bf8c13ca41abf5529b4fd056 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Thu, 4 Nov 2021 23:40:59 +0100 Subject: [PATCH 4/5] Remove now unused feature from rustc_parse --- compiler/rustc_parse/src/lib.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/compiler/rustc_parse/src/lib.rs b/compiler/rustc_parse/src/lib.rs index 063b0183a8fd5..a40f47f895bbe 100644 --- a/compiler/rustc_parse/src/lib.rs +++ b/compiler/rustc_parse/src/lib.rs @@ -4,7 +4,6 @@ #![feature(crate_visibility_modifier)] #![feature(if_let_guard)] #![feature(box_patterns)] -#![feature(slice_internals)] #![recursion_limit = "256"] #[macro_use] From 39110beab03ed74d1cb97df77721c0ad2fcb165b Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Fri, 5 Nov 2021 00:39:34 +0100 Subject: [PATCH 5/5] Use one match instead of a staggered match. --- compiler/rustc_ast/src/util/unicode.rs | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/compiler/rustc_ast/src/util/unicode.rs b/compiler/rustc_ast/src/util/unicode.rs index ad73d6e4fe205..f009f7b300ce0 100644 --- a/compiler/rustc_ast/src/util/unicode.rs +++ b/compiler/rustc_ast/src/util/unicode.rs @@ -21,17 +21,8 @@ pub fn contains_text_flow_control_chars(s: &str) -> bool { Some(idx) => { // bytes are valid UTF-8 -> E2 must be followed by two bytes let ch = &bytes[idx..idx + 3]; - match ch[1] { - 0x80 => { - if (0xAA..=0xAE).contains(&ch[2]) { - break true; - } - } - 0x81 => { - if (0xA6..=0xA9).contains(&ch[2]) { - break true; - } - } + match ch { + [_, 0x80, 0xAA..=0xAE] | [_, 0x81, 0xA6..=0xA9] => break true, _ => {} } bytes = &bytes[idx + 3..];