Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Performance optimizations (up to 3518% faster language detection) #177

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 62 additions & 5 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -139,13 +139,15 @@ lingua-yoruba-language-model = { path = "language-models/yo", version = "=1.0.1"
lingua-zulu-language-model = { path = "language-models/zu", version = "=1.0.1", optional = true }

[target.'cfg(not(target_family = "wasm"))'.dependencies]
ahash = "0.8.3"
cld2 = { version = "1.0.2", optional = true }
indoc = { version = "2.0.1", optional = true }
rayon = "1.7.0"
titlecase = { version = "2.2.0", optional = true }
whatlang = { version = "0.16.2", optional = true }

[target.'cfg(target_family = "wasm")'.dependencies]
ahash = { version = "0.8.3", default-features = false, features = ["std", "compile-time-rng"] }
wasm-bindgen = { version = "0.2.84", features = ["serde-serialize"] }

[dev-dependencies]
Expand Down
98 changes: 76 additions & 22 deletions src/alphabet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@
*/

use crate::language::Language;
use ahash::AHashMap as HashMap;
use ahash::AHashSet as HashSet;
use once_cell::sync::Lazy;
use regex::Regex;
use std::collections::HashMap;
use strum::IntoEnumIterator;
use strum_macros::EnumIter;

Expand Down Expand Up @@ -67,6 +67,29 @@ impl Alphabet {
}
}

/// Reports whether `ch` belongs to the character set associated with this alphabet.
///
/// Only the set for the selected variant is dereferenced.
pub fn matches_char(&self, ch: char) -> bool {
    // Resolve the variant to its script set first, then run a single membership test.
    let script_chars = match self {
        Alphabet::Arabic => &*ARABIC,
        Alphabet::Armenian => &*ARMENIAN,
        Alphabet::Bengali => &*BENGALI,
        Alphabet::Cyrillic => &*CYRILLIC,
        Alphabet::Devanagari => &*DEVANAGARI,
        Alphabet::Georgian => &*GEORGIAN,
        Alphabet::Greek => &*GREEK,
        Alphabet::Gujarati => &*GUJARATI,
        Alphabet::Gurmukhi => &*GURMUKHI,
        Alphabet::Han => &*HAN,
        Alphabet::Hangul => &*HANGUL,
        Alphabet::Hebrew => &*HEBREW,
        Alphabet::Hiragana => &*HIRAGANA,
        Alphabet::Katakana => &*KATAKANA,
        Alphabet::Latin => &*LATIN,
        Alphabet::Tamil => &*TAMIL,
        Alphabet::Telugu => &*TELUGU,
        Alphabet::Thai => &*THAI,
    };

    script_chars.is_match_char(ch)
}

pub fn all_supporting_single_language() -> HashMap<Alphabet, Language> {
let mut alphabets = HashMap::new();
for alphabet in Alphabet::iter() {
Expand All @@ -89,25 +112,56 @@ impl Alphabet {
}
}

// One lazily constructed regex per script. Each pattern is produced by
// `create_regex` from the script's name.
static ARABIC: Lazy<Regex> = Lazy::new(|| create_regex("Arabic"));
static ARMENIAN: Lazy<Regex> = Lazy::new(|| create_regex("Armenian"));
static BENGALI: Lazy<Regex> = Lazy::new(|| create_regex("Bengali"));
static CYRILLIC: Lazy<Regex> = Lazy::new(|| create_regex("Cyrillic"));
static DEVANAGARI: Lazy<Regex> = Lazy::new(|| create_regex("Devanagari"));
static GEORGIAN: Lazy<Regex> = Lazy::new(|| create_regex("Georgian"));
static GREEK: Lazy<Regex> = Lazy::new(|| create_regex("Greek"));
static GUJARATI: Lazy<Regex> = Lazy::new(|| create_regex("Gujarati"));
static GURMUKHI: Lazy<Regex> = Lazy::new(|| create_regex("Gurmukhi"));
static HAN: Lazy<Regex> = Lazy::new(|| create_regex("Han"));
static HANGUL: Lazy<Regex> = Lazy::new(|| create_regex("Hangul"));
static HEBREW: Lazy<Regex> = Lazy::new(|| create_regex("Hebrew"));
static HIRAGANA: Lazy<Regex> = Lazy::new(|| create_regex("Hiragana"));
static KATAKANA: Lazy<Regex> = Lazy::new(|| create_regex("Katakana"));
static LATIN: Lazy<Regex> = Lazy::new(|| create_regex("Latin"));
static TAMIL: Lazy<Regex> = Lazy::new(|| create_regex("Tamil"));
static TELUGU: Lazy<Regex> = Lazy::new(|| create_regex("Telugu"));
static THAI: Lazy<Regex> = Lazy::new(|| create_regex("Thai"));
/// A set of characters used for membership tests on single `char`s and whole strings.
///
/// Newtype around `HashSet<char>` (`HashSet` is this file's alias for `ahash::AHashSet`).
pub(crate) struct CharSet(HashSet<char>);

fn create_regex(char_class: &str) -> Regex {
Regex::new(&format!("^\\p{{{char_class}}}+$")).unwrap()
impl CharSet {
    /// Builds a set containing every character of the named script tables.
    ///
    /// Each entry of `char_classes` is looked up by name in `crate::script::BY_NAME`;
    /// every code point of every inclusive `(start, end)` range in the matching table
    /// is inserted into the set.
    ///
    /// # Panics
    ///
    /// Panics if a name has no entry in `crate::script::BY_NAME`, or if a range
    /// contains a value that is not a valid `char`.
    pub fn from_classes(char_classes: &[&str]) -> Self {
        let mut set = HashSet::new();
        for char_class in char_classes {
            let table = crate::script::BY_NAME
                .iter()
                .find(|(name, _)| *name == *char_class)
                .unwrap_or_else(|| panic!("unknown character class: {char_class}"))
                .1;
            for &(start, end) in table {
                // Ranges are inclusive on both ends.
                set.extend((start as u32..=end as u32).map(|codepoint| {
                    char::from_u32(codepoint)
                        .unwrap_or_else(|| panic!("invalid code point: {codepoint:#x}"))
                }));
            }
        }

        CharSet(set)
    }

    /// Builds a set from a single named script table; see [`CharSet::from_classes`].
    pub fn from_class(char_class: &str) -> Self {
        Self::from_classes(&[char_class])
    }

    /// Returns `true` if `text` is non-empty and consists solely of characters in this set.
    pub fn is_match(&self, text: &str) -> bool {
        // `Iterator::all` is vacuously true for an empty iterator. The anchored
        // `^\p{..}+$` regexes this type stands in for require at least one
        // character, so the empty string must not match.
        !text.is_empty() && text.chars().all(|ch| self.0.contains(&ch))
    }

    /// Returns `true` if `ch` is a member of this set.
    pub fn is_match_char(&self, ch: char) -> bool {
        self.0.contains(&ch)
    }
}

// One lazily built `CharSet` per script. Each set is created by
// `CharSet::from_class` from the script's name.
static ARABIC: Lazy<CharSet> = Lazy::new(|| CharSet::from_class("Arabic"));
static ARMENIAN: Lazy<CharSet> = Lazy::new(|| CharSet::from_class("Armenian"));
static BENGALI: Lazy<CharSet> = Lazy::new(|| CharSet::from_class("Bengali"));
static CYRILLIC: Lazy<CharSet> = Lazy::new(|| CharSet::from_class("Cyrillic"));
static DEVANAGARI: Lazy<CharSet> = Lazy::new(|| CharSet::from_class("Devanagari"));
static GEORGIAN: Lazy<CharSet> = Lazy::new(|| CharSet::from_class("Georgian"));
static GREEK: Lazy<CharSet> = Lazy::new(|| CharSet::from_class("Greek"));
static GUJARATI: Lazy<CharSet> = Lazy::new(|| CharSet::from_class("Gujarati"));
static GURMUKHI: Lazy<CharSet> = Lazy::new(|| CharSet::from_class("Gurmukhi"));
static HAN: Lazy<CharSet> = Lazy::new(|| CharSet::from_class("Han"));
static HANGUL: Lazy<CharSet> = Lazy::new(|| CharSet::from_class("Hangul"));
static HEBREW: Lazy<CharSet> = Lazy::new(|| CharSet::from_class("Hebrew"));
static HIRAGANA: Lazy<CharSet> = Lazy::new(|| CharSet::from_class("Hiragana"));
static KATAKANA: Lazy<CharSet> = Lazy::new(|| CharSet::from_class("Katakana"));
static LATIN: Lazy<CharSet> = Lazy::new(|| CharSet::from_class("Latin"));
static TAMIL: Lazy<CharSet> = Lazy::new(|| CharSet::from_class("Tamil"));
static TELUGU: Lazy<CharSet> = Lazy::new(|| CharSet::from_class("Telugu"));
static THAI: Lazy<CharSet> = Lazy::new(|| CharSet::from_class("Thai"));
5 changes: 3 additions & 2 deletions src/constant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,11 @@ use std::str::FromStr;
use once_cell::sync::Lazy;
use regex::Regex;

use crate::alphabet::CharSet;
use crate::language::Language;

pub(crate) static JAPANESE_CHARACTER_SET: Lazy<Regex> =
Lazy::new(|| Regex::new("^[\\p{Hiragana}\\p{Katakana}\\p{Han}]+$").unwrap());
/// Character set built from the "Hiragana", "Katakana" and "Han" classes combined.
pub(crate) static JAPANESE_CHARACTER_SET: Lazy<CharSet> =
Lazy::new(|| CharSet::from_classes(&["Hiragana", "Katakana", "Han"]));
/// Matches a run of one or more whitespace characters (`\s+`).
pub(crate) static MULTIPLE_WHITESPACE: Lazy<Regex> = Lazy::new(|| Regex::new("\\s+").unwrap());
/// Matches a single character in the Unicode `N` (number) general category.
pub(crate) static NUMBERS: Lazy<Regex> = Lazy::new(|| Regex::new("\\p{N}").unwrap());
/// Matches a single character in the Unicode `P` (punctuation) general category.
pub(crate) static PUNCTUATION: Lazy<Regex> = Lazy::new(|| Regex::new("\\p{P}").unwrap());
Expand Down
Loading