Skip to content

Commit

Permalink
Merge pull request #292 from epage/unicode
Browse files Browse the repository at this point in the history
perf(parser): Auto-detect unicode
  • Loading branch information
epage authored Jun 29, 2021
2 parents 21231bf + ded90f2 commit a46cc76
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 2 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions crates/typos/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,4 @@ simdutf8 = "0.1.1"
itertools = "0.10"
log = "0.4"
unicode-segmentation = "1.7.1"
bstr = "0.2"
6 changes: 4 additions & 2 deletions crates/typos/src/tokens.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
use bstr::ByteSlice;

/// Define rules for tokenizing a buffer.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct TokenizerBuilder {
Expand Down Expand Up @@ -67,7 +69,7 @@ impl Tokenizer {
}

pub fn parse_str<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> {
let iter = if self.unicode {
let iter = if self.unicode && !ByteSlice::is_ascii(content.as_bytes()) {
itertools::Either::Left(unicode_parser::iter_literals(content))
} else {
itertools::Either::Right(ascii_parser::iter_literals(content.as_bytes()))
Expand All @@ -79,7 +81,7 @@ impl Tokenizer {
}

pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> {
let iter = if self.unicode {
let iter = if self.unicode && !ByteSlice::is_ascii(content) {
let iter = Utf8Chunks::new(content).flat_map(move |c| unicode_parser::iter_literals(c));
itertools::Either::Left(iter)
} else {
Expand Down

0 comments on commit a46cc76

Please sign in to comment.