Skip to content

Commit

Permalink
Split camelCase in Latin segmenter
Browse files Browse the repository at this point in the history
To improve recall and to be consistent with snake_case and kebab-case
splitting that's already in place, make the Latin Segmenter split words
on camelCase boundaries.

Define a camelCase boundary as the position between a lowercase letter and
the uppercase letter that directly follows it. This handles most cases and
avoids common pitfalls such as ALL_CAPS. What is not handled, though, are
abbreviations within a longer word. Especially in code, it's common to
write e.g. "meiliAPIClient". With this implementation it's split into
["meili", "APIClient"].

Leverage the Unicode General Categories
https://en.wikipedia.org/wiki/Unicode_character_property#General_Category
and their support in the Regex crate for matching lowercase and
uppercase letters.

Put the logic into a separate module and expose an API similar to
UnicodeSegmentation's to keep the call site in latin.rs clean and
concise.

Closes meilisearch#129.
  • Loading branch information
goodhoko committed Feb 1, 2023
1 parent a3eab30 commit 6703a20
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 4 deletions.
1 change: 1 addition & 0 deletions charabia/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ deunicode = "1.1.1"
fst = "0.4"
jieba-rs = { version = "0.6", optional = true }
once_cell = "1.5.2"
regex = "1.7.1"
serde = "1.0"
slice-group-by = "0.3.0"
unicode-segmentation = "1.6.0"
Expand Down
56 changes: 56 additions & 0 deletions charabia/src/segmenter/camel_case.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
use once_cell::sync::Lazy;
use regex::Regex;

pub(crate) trait CamelCaseSegmentation {
    /// Returns an iterator over the pieces of `self` obtained by cutting it at every
    /// camelCase boundary.
    ///
    /// A camelCase boundary is the position between a lowercase letter and the uppercase
    /// letter directly following it, where "lowercase" and "uppercase" are defined by the
    /// corresponding Unicode General Categories.
    /// For example, "camelCase" is split into ["camel", "Case"].
    fn split_camel_case_bounds(&self) -> CamelCaseParts<'_>;
}

/// Iterator over the parts of a string slice separated on camelCase boundaries.
/// Each yielded item is a sub-slice borrowed from the original string (lifetime `'t`).
pub(crate) struct CamelCaseParts<'t> {
/// Iteration progress: either the text still to be split, or a marker that iteration is over.
state: State<'t>,
}

/// Progress of a [`CamelCaseParts`] iterator.
enum State<'t> {
/// There is still text to yield; `remainder` is the part of the input not yet returned.
InProgress { remainder: &'t str },
/// The last part has been yielded; every further call to `next` returns `None`.
Exhausted,
}

impl CamelCaseSegmentation for str {
    /// Builds a [`CamelCaseParts`] iterator whose initial remainder is the whole string.
    /// No splitting is performed here; the work happens as the iterator is advanced.
    fn split_camel_case_bounds(&self) -> CamelCaseParts<'_> {
        let state = State::InProgress { remainder: self };
        CamelCaseParts { state }
    }
}

/// Regex locating a camelCase boundary: a lowercase letter (Unicode General Category `Ll`)
/// directly followed by an uppercase letter (`Lu`).
/// The empty named group "boundary" sits between the two letters, so its start offset
/// is the byte position at which the text has to be cut.
/// Wrapped in `Lazy` so the pattern is compiled once, on first use, instead of on every call.
static CAMEL_CASE_BOUNDARY_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r"\p{Ll}(?P<boundary>)\p{Lu}").unwrap());

impl<'t> Iterator for CamelCaseParts<'t> {
    type Item = &'t str;

    /// Yields the text up to the next camelCase boundary, or the whole remaining text
    /// when no boundary is left. After that final part, every call returns `None`.
    ///
    /// Note that a string without any boundary (including the empty string) is yielded
    /// once, unchanged.
    fn next(&mut self) -> Option<Self::Item> {
        // Nothing left to do once the final part has been handed out.
        let rest = match self.state {
            State::InProgress { remainder } => remainder,
            State::Exhausted => return None,
        };

        let cut = match CAMEL_CASE_BOUNDARY_REGEX.captures(rest) {
            // The "boundary" group is part of every match of the regex,
            // so looking it up is not expected to fail.
            Some(found) => found.name("boundary").unwrap().start(),
            None => {
                // No boundary left: the rest of the text is the last part.
                self.state = State::Exhausted;
                return Some(rest);
            }
        };

        // Hand out everything before the boundary and keep the tail for the next call.
        let (head, tail) = rest.split_at(cut);
        self.state = State::InProgress { remainder: tail };
        Some(head)
    }
}
16 changes: 12 additions & 4 deletions charabia/src/segmenter/latin.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use unicode_segmentation::UnicodeSegmentation;

use super::camel_case::CamelCaseSegmentation;
use super::Segmenter;

/// Latin specialized [`Segmenter`].
Expand All @@ -9,24 +10,31 @@ pub struct LatinSegmenter;

impl Segmenter for LatinSegmenter {
    /// Segments `s` into lemmas borrowed from the input, in three successive passes:
    ///
    /// 1. split on Unicode word boundaries (`split_word_bounds`);
    /// 2. split after every apostrophe, keeping the apostrophe attached to the
    ///    preceding part (e.g. "can't" becomes "can'" and "t");
    /// 3. split on camelCase boundaries (e.g. "camelCase" becomes "camel" and "Case").
    ///
    /// The returned iterator is lazy; no intermediate collection is allocated.
    fn segment_str<'o>(&self, s: &'o str) -> Box<dyn Iterator<Item = &'o str> + 'o> {
        let lemmas = s
            .split_word_bounds()
            .flat_map(|lemma| lemma.split_inclusive('\''))
            .flat_map(|lemma| lemma.split_camel_case_bounds());

        Box::new(lemmas)
    }
}

#[cfg(test)]
mod test {
use crate::segmenter::test::test_segmenter;

const TEXT: &str = "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!";
const TEXT: &str = "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F! camelCase PascalCase IJsland CASE";
const SEGMENTED: &[&str] = &[
"The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can'", "t", " ",
"jump", " ", "32.3", " ", "feet", ",", " ", "right", "?", " ", "Brr", ",", " ", "it'", "s",
" ", "29.3", "°", "F", "!",
" ", "29.3", "°", "F", "!", " ", "camel", "Case", " ", "Pascal", "Case", " ", "IJsland",
" ", "CASE",
];
const TOKENIZED: &[&str] = &[
"the", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can'", "t", " ",
"jump", " ", "32.3", " ", "feet", ",", " ", "right", "?", " ", "brr", ",", " ", "it'", "s",
" ", "29.3", "°", "f", "!",
" ", "29.3", "°", "f", "!", " ", "camel", "case", " ", "pascal", "case", " ", "ijsland",
" ", "case",
];

test_segmenter!(LatinSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Latin, Language::Other);
Expand Down
1 change: 1 addition & 0 deletions charabia/src/segmenter/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ mod latin;
#[cfg(feature = "thai")]
mod thai;
mod utils;
mod camel_case;

/// List of used [`Segmenter`]s linked to their corresponding [`Script`] and [`Language`].
///
Expand Down

0 comments on commit 6703a20

Please sign in to comment.