Skip to content

Optimize bidi character detection. #90559

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Nov 6, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions compiler/rustc_ast/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#![feature(nll)]
#![feature(min_specialization)]
#![recursion_limit = "256"]
#![feature(slice_internals)]

#[macro_use]
extern crate rustc_macros;
Expand All @@ -25,6 +26,7 @@ pub mod util {
pub mod comments;
pub mod literal;
pub mod parser;
pub mod unicode;
}

pub mod ast;
Expand Down
35 changes: 35 additions & 0 deletions compiler/rustc_ast/src/util/unicode.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
/// Unicode code points that alter the direction in which surrounding text is
/// laid out (bidirectional embeddings, overrides, isolates and their
/// terminators).
///
/// Every entry lies in one of the ranges U+202A..=U+202E or U+2066..=U+2069.
pub const TEXT_FLOW_CONTROL_CHARS: &[char] = &[
    '\u{202A}', // LEFT-TO-RIGHT EMBEDDING
    '\u{202B}', // RIGHT-TO-LEFT EMBEDDING
    '\u{202D}', // LEFT-TO-RIGHT OVERRIDE
    '\u{202E}', // RIGHT-TO-LEFT OVERRIDE
    '\u{2066}', // LEFT-TO-RIGHT ISOLATE
    '\u{2067}', // RIGHT-TO-LEFT ISOLATE
    '\u{2068}', // FIRST STRONG ISOLATE
    '\u{202C}', // POP DIRECTIONAL FORMATTING
    '\u{2069}', // POP DIRECTIONAL ISOLATE
];

/// Returns `true` if `s` contains at least one of the code points listed in
/// `TEXT_FLOW_CONTROL_CHARS`.
///
/// Instead of decoding `char`s, this scans the raw UTF-8 bytes. Every one of
/// those code points encodes to three bytes beginning with `0xE2`, so it is
/// enough to locate each `0xE2` byte and look at the two bytes that follow:
///
/// | Code points     | UTF-8 encoding          |
/// |-----------------|-------------------------|
/// | U+202A..=U+202E | `E2 80 AA`..=`E2 80 AE` |
/// | U+2066..=U+2069 | `E2 81 A6`..=`E2 81 A9` |
#[inline]
pub fn contains_text_flow_control_chars(s: &str) -> bool {
    let mut rest = s.as_bytes();
    while let Some(lead) = core::slice::memchr::memchr(0xE2, rest) {
        // `s` is valid UTF-8, so a 0xE2 lead byte always starts a three-byte
        // sequence and this slice cannot run past the end of `rest`.
        let seq = &rest[lead..lead + 3];
        if matches!(seq, [_, 0x80, 0xAA..=0xAE] | [_, 0x81, 0xA6..=0xA9]) {
            return true;
        }
        // Not a match: step over the whole sequence and keep searching.
        rest = &rest[lead + 3..];
    }
    false
}
4 changes: 2 additions & 2 deletions compiler/rustc_lint/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@

use self::TargetLint::*;

use crate::hidden_unicode_codepoints::UNICODE_TEXT_FLOW_CHARS;
use crate::levels::{is_known_lint_tool, LintLevelsBuilder};
use crate::passes::{EarlyLintPassObject, LateLintPassObject};
use ast::util::unicode::TEXT_FLOW_CONTROL_CHARS;
use rustc_ast as ast;
use rustc_data_structures::fx::FxHashMap;
use rustc_data_structures::sync;
Expand Down Expand Up @@ -602,7 +602,7 @@ pub trait LintContext: Sized {
let spans: Vec<_> = content
.char_indices()
.filter_map(|(i, c)| {
UNICODE_TEXT_FLOW_CHARS.contains(&c).then(|| {
TEXT_FLOW_CONTROL_CHARS.contains(&c).then(|| {
let lo = span.lo() + BytePos(2 + i as u32);
(c, span.with_lo(lo).with_hi(lo + BytePos(c.len_utf8() as u32)))
})
Expand Down
12 changes: 4 additions & 8 deletions compiler/rustc_lint/src/hidden_unicode_codepoints.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use crate::{EarlyContext, EarlyLintPass, LintContext};
use ast::util::unicode::{contains_text_flow_control_chars, TEXT_FLOW_CONTROL_CHARS};
use rustc_ast as ast;
use rustc_errors::{Applicability, SuggestionStyle};
use rustc_span::{BytePos, Span, Symbol};
Expand Down Expand Up @@ -37,11 +38,6 @@ declare_lint! {

declare_lint_pass!(HiddenUnicodeCodepoints => [TEXT_DIRECTION_CODEPOINT_IN_LITERAL]);

crate const UNICODE_TEXT_FLOW_CHARS: &[char] = &[
'\u{202A}', '\u{202B}', '\u{202D}', '\u{202E}', '\u{2066}', '\u{2067}', '\u{2068}', '\u{202C}',
'\u{2069}',
];

impl HiddenUnicodeCodepoints {
fn lint_text_direction_codepoint(
&self,
Expand All @@ -57,7 +53,7 @@ impl HiddenUnicodeCodepoints {
.as_str()
.char_indices()
.filter_map(|(i, c)| {
UNICODE_TEXT_FLOW_CHARS.contains(&c).then(|| {
TEXT_FLOW_CONTROL_CHARS.contains(&c).then(|| {
let lo = span.lo() + BytePos(i as u32 + padding);
(c, span.with_lo(lo).with_hi(lo + BytePos(c.len_utf8() as u32)))
})
Expand Down Expand Up @@ -131,7 +127,7 @@ impl HiddenUnicodeCodepoints {
impl EarlyLintPass for HiddenUnicodeCodepoints {
fn check_attribute(&mut self, cx: &EarlyContext<'_>, attr: &ast::Attribute) {
if let ast::AttrKind::DocComment(_, comment) = attr.kind {
if comment.as_str().contains(UNICODE_TEXT_FLOW_CHARS) {
if contains_text_flow_control_chars(&comment.as_str()) {
self.lint_text_direction_codepoint(cx, comment, attr.span, 0, false, "doc comment");
}
}
Expand All @@ -142,7 +138,7 @@ impl EarlyLintPass for HiddenUnicodeCodepoints {
let (text, span, padding) = match &expr.kind {
ast::ExprKind::Lit(ast::Lit { token, kind, span }) => {
let text = token.symbol;
if !text.as_str().contains(UNICODE_TEXT_FLOW_CHARS) {
if !contains_text_flow_control_chars(&text.as_str()) {
return;
}
let padding = match kind {
Expand Down
9 changes: 3 additions & 6 deletions compiler/rustc_parse/src/lexer/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use rustc_ast::ast::{self, AttrStyle};
use rustc_ast::token::{self, CommentKind, Token, TokenKind};
use rustc_ast::tokenstream::{Spacing, TokenStream};
use rustc_ast::util::unicode::contains_text_flow_control_chars;
use rustc_errors::{error_code, Applicability, DiagnosticBuilder, FatalError, PResult};
use rustc_lexer::unescape::{self, Mode};
use rustc_lexer::{Base, DocStyle, RawStrError};
Expand Down Expand Up @@ -137,12 +138,8 @@ impl<'a> StringReader<'a> {
// Opening delimiter of the length 2 is not included into the comment text.
let content_start = start + BytePos(2);
let content = self.str_from(content_start);
let span = self.mk_sp(start, self.pos);
const UNICODE_TEXT_FLOW_CHARS: &[char] = &[
'\u{202A}', '\u{202B}', '\u{202D}', '\u{202E}', '\u{2066}', '\u{2067}', '\u{2068}',
'\u{202C}', '\u{2069}',
];
if content.contains(UNICODE_TEXT_FLOW_CHARS) {
if contains_text_flow_control_chars(content) {
let span = self.mk_sp(start, self.pos);
self.sess.buffer_lint_with_diagnostic(
&TEXT_DIRECTION_CODEPOINT_IN_COMMENT,
span,
Expand Down