Split camelCase in Latin segmenter

To improve recall and to be consistent with the snake_case and kebab-case splitting that's already in place, make the Latin segmenter split words on camelCase boundaries. Define a camelCase boundary as a lowercase letter directly followed by an uppercase one (or, to be precise, the position between them). This handles most cases and avoids common pitfalls such as ALL_CAPS. What is not handled, though, is an abbreviation within a longer word. Especially in code, it's common to write e.g. "meiliAPIClient"; with this implementation it's split into ["meili", "APIClient"]. Leverage the Unicode General Categories (https://en.wikipedia.org/wiki/Unicode_character_property#General_Category) and their support in the regex crate for matching lowercase and uppercase letters. Put the logic into a separate module and expose an API similar to UnicodeSegmentation's to keep the call site in latin.rs clean and concise. Closes meilisearch#129.
goodhoko · Feb 1, 2023 · 6703a20 · 6703a20
1 parent a3eab30
commit 6703a20
Show file tree

Hide file tree

Showing 4 changed files with 70 additions and 4 deletions.
diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml
@@ -18,6 +18,7 @@ deunicode = "1.1.1"
 fst = "0.4"
 jieba-rs = { version = "0.6", optional = true }
 once_cell = "1.5.2"
+regex = "1.7.1"
 serde = "1.0"
 slice-group-by = "0.3.0"
 unicode-segmentation = "1.6.0"

diff --git a/charabia/src/segmenter/camel_case.rs b/charabia/src/segmenter/camel_case.rs
@@ -0,0 +1,56 @@
+use once_cell::sync::Lazy;
+use regex::Regex;
+
+/// Adds camelCase-aware splitting to string types.
+pub(crate) trait CamelCaseSegmentation {
+    /// Splits `self` at every camelCase boundary and returns an iterator over the pieces.
+    ///
+    /// A boundary is the position between a lowercase letter and the uppercase letter
+    /// directly following it, where "lowercase" and "uppercase" are determined by the
+    /// corresponding Unicode General Categories.
+    /// For example, "camelCase" yields ["camel", "Case"].
+    fn split_camel_case_bounds(&self) -> CamelCaseParts<'_>;
+}
+
+/// Progress of a [`CamelCaseParts`] iterator.
+enum State<'t> {
+    /// Input is still pending; `remainder` is the part that has not been yielded yet.
+    InProgress { remainder: &'t str },
+    /// The final piece has already been yielded.
+    Exhausted,
+}
+
+/// Iterator over the pieces of a string slice split on camelCase boundaries.
+pub(crate) struct CamelCaseParts<'t> {
+    state: State<'t>,
+}
+
+impl CamelCaseSegmentation for str {
+    /// Starts a lazy split of `self`; no matching happens until the iterator is advanced.
+    fn split_camel_case_bounds(&self) -> CamelCaseParts<'_> {
+        let state = State::InProgress { remainder: self };
+        CamelCaseParts { state }
+    }
+}
+
+/// Locates a camelCase boundary: a lowercase letter (`\p{Ll}`) immediately followed by an
+/// uppercase letter (`\p{Lu}`). The empty group named "boundary" sits between the two
+/// letters, so its start offset is the byte index at which to split.
+static CAMEL_CASE_BOUNDARY_REGEX: Lazy<Regex> = Lazy::new(|| {
+    let pattern = r"\p{Ll}(?P<boundary>)\p{Lu}";
+    Regex::new(pattern).unwrap()
+});
+
+impl<'t> Iterator for CamelCaseParts<'t> {
+    type Item = &'t str;
+
+    /// Yields the text up to the next camelCase boundary, or the whole remainder when no
+    /// boundary is left. An input without any boundary (including the empty string) is
+    /// therefore yielded once, unchanged, before the iterator ends.
+    fn next(&mut self) -> Option<Self::Item> {
+        let remainder = match self.state {
+            State::InProgress { remainder } => remainder,
+            State::Exhausted => return None,
+        };
+
+        let next_boundary = CAMEL_CASE_BOUNDARY_REGEX.captures(remainder).map(|captures| {
+            // The "boundary" group is an unconditional part of the pattern, so it is
+            // present in every match and this never panics.
+            captures.name("boundary").unwrap().start()
+        });
+
+        match next_boundary {
+            Some(boundary) => {
+                // Hand out the text before the boundary; keep the rest for later calls.
+                let (part, rest) = remainder.split_at(boundary);
+                self.state = State::InProgress { remainder: rest };
+                Some(part)
+            }
+            None => {
+                // No boundary left: what remains is the final piece.
+                self.state = State::Exhausted;
+                Some(remainder)
+            }
+        }
+    }
+}
diff --git a/charabia/src/segmenter/latin.rs b/charabia/src/segmenter/latin.rs
@@ -1,5 +1,6 @@
 use unicode_segmentation::UnicodeSegmentation;
 
+use super::camel_case::CamelCaseSegmentation;
 use super::Segmenter;
 
 /// Latin specialized [`Segmenter`].
@@ -9,24 +10,31 @@ pub struct LatinSegmenter;
 
 impl Segmenter for LatinSegmenter {
     fn segment_str<'o>(&self, s: &'o str) -> Box<dyn Iterator<Item = &'o str> + 'o> {
-        Box::new(s.split_word_bounds().flat_map(|lemma| lemma.split_inclusive('\'')))
+        let lemmas = s
+            .split_word_bounds()
+            .flat_map(|lemma| lemma.split_inclusive('\''))
+            .flat_map(|lemma| lemma.split_camel_case_bounds());
+
+        Box::new(lemmas)
     }
 }
 
 #[cfg(test)]
 mod test {
     use crate::segmenter::test::test_segmenter;
 
-    const TEXT: &str = "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!";
+    const TEXT: &str = "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F! camelCase PascalCase IJsland CASE";
     const SEGMENTED: &[&str] = &[
         "The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can'", "t", " ",
         "jump", " ", "32.3", " ", "feet", ",", " ", "right", "?", " ", "Brr", ",", " ", "it'", "s",
-        " ", "29.3", "°", "F", "!",
+        " ", "29.3", "°", "F", "!", " ", "camel", "Case", " ", "Pascal", "Case", " ", "IJsland",
+        " ", "CASE",
     ];
     const TOKENIZED: &[&str] = &[
         "the", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can'", "t", " ",
         "jump", " ", "32.3", " ", "feet", ",", " ", "right", "?", " ", "brr", ",", " ", "it'", "s",
-        " ", "29.3", "°", "f", "!",
+        " ", "29.3", "°", "f", "!", " ", "camel", "case", " ", "pascal", "case", " ", "ijsland",
+        " ", "case",
     ];
 
     test_segmenter!(LatinSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Latin, Language::Other);

diff --git a/charabia/src/segmenter/mod.rs b/charabia/src/segmenter/mod.rs
@@ -30,6 +30,7 @@ mod latin;
 #[cfg(feature = "thai")]
 mod thai;
 mod utils;
+mod camel_case;
 
 /// List of used [`Segmenter`]s linked to their corresponding [`Script`] and [`Language`].
 ///