Skip to content

Commit

Permalink
perf: Use standard identifier rules to avoid doing number checks
Browse files Browse the repository at this point in the history
  • Loading branch information
epage committed Nov 3, 2019
1 parent ed00f3c commit 107308a
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 20 deletions.
43 changes: 40 additions & 3 deletions src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,17 +58,27 @@ pub trait FileSource {
None
}

/// Do not check identifiers that appear to be hexadecimal values
/// Do not check identifiers that appear to be hexadecimal values.
fn ignore_hex(&self) -> Option<bool> {
// Trait default: this source leaves the setting unspecified.
None
}

/// Allow identifiers to include digits, in addition to letters
/// Allow identifiers to start with digits, in addition to letters.
fn identifier_leading_digits(&self) -> Option<bool> {
// Trait default: this source leaves the setting unspecified.
None
}

/// Allow identifiers to start with one of these characters.
fn identifier_leading_chars(&self) -> Option<&str> {
// Trait default: this source leaves the setting unspecified.
None
}

/// Allow identifiers to include digits, in addition to letters.
fn identifier_include_digits(&self) -> Option<bool> {
// Trait default: this source leaves the setting unspecified.
None
}

/// Specify additional characters to be included in identifiers
/// Allow identifiers to include these characters.
fn identifier_include_chars(&self) -> Option<&str> {
// Trait default: this source leaves the setting unspecified.
None
}
Expand Down Expand Up @@ -233,6 +243,8 @@ pub struct FileConfig {
pub check_filename: Option<bool>,
pub check_file: Option<bool>,
pub ignore_hex: Option<bool>,
pub identifier_leading_digits: Option<bool>,
pub identifier_leading_chars: Option<String>,
pub identifier_include_digits: Option<bool>,
pub identifier_include_chars: Option<String>,
}
Expand All @@ -248,6 +260,12 @@ impl FileConfig {
if let Some(source) = source.ignore_hex() {
self.ignore_hex = Some(source);
}
if let Some(source) = source.identifier_leading_digits() {
self.identifier_leading_digits = Some(source);
}
if let Some(source) = source.identifier_leading_chars() {
self.identifier_leading_chars = Some(source.to_owned());
}
if let Some(source) = source.identifier_include_digits() {
self.identifier_include_digits = Some(source);
}
Expand All @@ -268,6 +286,17 @@ impl FileConfig {
self.ignore_hex.unwrap_or(true)
}

/// Returns whether identifiers may start with a digit.
///
/// Falls back to `false` when the option was never set.
pub fn identifier_leading_digits(&self) -> bool {
    let configured = self.identifier_leading_digits;
    configured.unwrap_or(false)
}

/// Returns the characters an identifier may start with.
///
/// Falls back to `"_"` when the option was never set.
pub fn identifier_leading_chars(&self) -> &str {
    match &self.identifier_leading_chars {
        Some(chars) => chars.as_str(),
        None => "_",
    }
}

/// Returns whether identifiers may contain digits.
///
/// Falls back to `true` when the option was never set.
pub fn identifier_include_digits(&self) -> bool {
    let configured = self.identifier_include_digits;
    configured.unwrap_or(true)
}
Expand All @@ -293,6 +322,14 @@ impl FileSource for FileConfig {
self.ignore_hex
}

fn identifier_leading_digits(&self) -> Option<bool> {
// Expose the raw optional field, without applying a fallback value.
self.identifier_leading_digits
}

fn identifier_leading_chars(&self) -> Option<&str> {
    // Borrow the stored `String` as a `&str`; nothing is cloned.
    self.identifier_leading_chars.as_ref().map(String::as_str)
}

fn identifier_include_digits(&self) -> Option<bool> {
// Expose the raw optional field, without applying a fallback value.
self.identifier_include_digits
}
Expand Down
2 changes: 2 additions & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,8 @@ fn run() -> Result<i32, anyhow::Error> {

let parser = typos::tokens::ParserBuilder::new()
.ignore_hex(config.default.ignore_hex())
.leading_digits(config.default.identifier_leading_digits())
.leading_chars(config.default.identifier_leading_chars().to_owned())
.include_digits(config.default.identifier_include_digits())
.include_chars(config.default.identifier_include_chars().to_owned())
.build();
Expand Down
63 changes: 46 additions & 17 deletions typos/src/tokens.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ pub enum Case {
/// Options from which `build` produces a `Parser`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct ParserBuilder {
// Skip identifiers that appear to be hexadecimal values.
ignore_hex: bool,
// Allow an identifier to start with a digit.
leading_digits: bool,
// Additional characters an identifier may start with.
leading_chars: String,
// Allow digits inside an identifier.
include_digits: bool,
// Additional characters allowed inside an identifier.
include_chars: String,
}
Expand All @@ -23,6 +25,16 @@ impl ParserBuilder {
self
}

/// Sets whether identifiers may start with a digit.
///
/// Returns `self` so calls can be chained.
pub fn leading_digits(&mut self, yes: bool) -> &mut Self {
self.leading_digits = yes;
self
}

/// Sets the characters, besides letters, that identifiers may start with.
///
/// Returns `self` so calls can be chained.
pub fn leading_chars(&mut self, chars: String) -> &mut Self {
self.leading_chars = chars;
self
}

pub fn include_digits(&mut self, yes: bool) -> &mut Self {
self.include_digits = yes;
self
Expand All @@ -34,31 +46,44 @@ impl ParserBuilder {
}

/// Compiles the configured identifier rules into a `Parser`.
///
/// # Panics
///
/// Panics if the assembled pattern is rejected by `regex::Regex::new` (both results are
/// unwrapped).
// NOTE(review): this span comes from a rendered diff and appears to interleave the removed and
// added versions of the body (`pattern` is built twice and `ignore_hex` is initialized twice
// below) — confirm against the repository before editing the code itself.
pub fn build(&self) -> Parser {
let mut pattern = r#"\b(\p{Alphabetic}"#.to_owned();
if self.include_digits {
pattern.push_str(r#"|\d"#);
}
for grapheme in
unicode_segmentation::UnicodeSegmentation::graphemes(self.include_chars.as_str(), true)
{
let escaped = regex::escape(&grapheme);
pattern.push_str(&format!("|{}", escaped));
}
pattern.push_str(r#")+\b"#);
// One group for the leading character, then a repeated group for the remaining characters.
let mut pattern = r#"\b("#.to_owned();
Self::push_pattern(&mut pattern, self.leading_digits, &self.leading_chars);
Self::push_pattern(&mut pattern, self.include_digits, &self.include_chars);
pattern.push_str(r#"*)\b"#);
// NOTE(review): `dbg!` prints the pattern on every call and looks like a debugging leftover;
// remove it before release.
let pattern = dbg!(pattern);

// The same pattern is compiled twice: once for `str` input and once for byte input.
let words_str = regex::Regex::new(&pattern).unwrap();
let words_bytes = regex::bytes::Regex::new(&pattern).unwrap();

Parser {
words_str,
words_bytes,
ignore_hex: self.ignore_hex && self.include_digits,
// `leading_digits` lets us bypass the regexes since you can't have a decimal or
// hexadecimal number without a leading digit.
ignore_numbers: self.leading_digits,
ignore_hex: self.ignore_hex && self.leading_digits,
}
}

/// Appends to `pattern` one parenthesized alternation that matches a single identifier
/// character.
///
/// The group always accepts `\p{Alphabetic}`, additionally accepts `\d` when `digits` is
/// set, and accepts every grapheme of `chars`. Each grapheme is passed through
/// `regex::escape` before it is appended.
fn push_pattern(pattern: &mut String, digits: bool, chars: &str) {
    pattern.push_str(r#"(\p{Alphabetic}"#);
    if digits {
        pattern.push_str(r#"|\d"#);
    }
    for grapheme in unicode_segmentation::UnicodeSegmentation::graphemes(chars, true) {
        // Append the separator and the escaped grapheme directly instead of allocating a
        // temporary `String` with `format!` on every iteration.
        pattern.push('|');
        pattern.push_str(&regex::escape(grapheme));
    }
    pattern.push(')');
}
}

impl Default for ParserBuilder {
fn default() -> Self {
Self {
ignore_hex: true,
leading_digits: false,
leading_chars: "_".to_owned(),
include_digits: true,
include_chars: "_'".to_owned(),
}
Expand All @@ -69,6 +94,7 @@ impl Default for ParserBuilder {
pub struct Parser {
// Identifier pattern compiled for `str` input.
words_str: regex::Regex,
// The same identifier pattern compiled for byte-slice input.
words_bytes: regex::bytes::Regex,
// When set, `accept` rejects tokens for which `is_number` returns true.
ignore_numbers: bool,
// When set, `accept` rejects tokens for which `is_hex` returns true.
ignore_hex: bool,
}

Expand All @@ -95,12 +121,12 @@ impl Parser {
}

fn accept(&self, contents: &[u8]) -> bool {
if is_number(contents) {
if self.ignore_numbers && is_number(contents) {
return false;
};
}

if self.ignore_hex {
return !is_hex(contents);
if self.ignore_hex && is_hex(contents) {
return false;
}

true
Expand Down Expand Up @@ -455,7 +481,10 @@ mod test {

#[test]
fn tokenize_ignore_hex_disabled() {
let parser = ParserBuilder::new().ignore_hex(false).build();
let parser = ParserBuilder::new()
.ignore_hex(false)
.leading_digits(true)
.build();

let input = "Hello 0xDEADBEEF World";
let expected: Vec<Identifier> = vec![
Expand Down

0 comments on commit 107308a

Please sign in to comment.