-
Notifications
You must be signed in to change notification settings - Fork 89
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
287: Add swedish recomposition normalizer and link it to a feature r=Kerollmops a=ManyTheFish # Pull Request - Add a Swedish normalizer, avoiding the diacritic removal from the letter, and preserving the expected Swedish character ordering. - Add a feature flag for it (not enabled by default) - Trigger test in the CI Co-authored-by: ManyTheFish <many@meilisearch.com>
- Loading branch information
Showing
4 changed files
with
172 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,161 @@ | ||
use std::borrow::Cow; | ||
|
||
use aho_corasick::AhoCorasick; | ||
use once_cell::sync::Lazy; | ||
|
||
use super::Normalizer; | ||
use crate::normalizer::NormalizerOption; | ||
use crate::{Script, Token}; | ||
|
||
static MATCHING_STR: Lazy<AhoCorasick> = Lazy::new(|| { | ||
AhoCorasick::new(&["A\u{30a}", "a\u{30a}", "A\u{308}", "a\u{308}", "O\u{308}", "o\u{308}"]) | ||
.unwrap() | ||
}); | ||
|
||
/// Swedish specialized [`Normalizer`].
///
/// This Normalizer recomposes Swedish characters that are written as a base letter
/// followed by a combining diacritic (for example `a` + U+030A becomes `å`).
///
/// Recomposing these letters keeps the diacritic attached to the letter instead of
/// letting it be removed, and preserves the expected Swedish character ordering.
pub struct SwedishRecompositionNormalizer;
|
||
impl Normalizer for SwedishRecompositionNormalizer { | ||
fn normalize<'o>(&self, mut token: Token<'o>, options: &NormalizerOption) -> Token<'o> { | ||
match token.char_map.take() { | ||
Some(mut char_map) => { | ||
// if a char_map already exists,iterate over it to reconstruct sub-strings. | ||
let mut lemma = String::new(); | ||
let mut tail = token.lemma.as_ref(); | ||
let mut normalized = String::new(); | ||
for (_, normalized_len) in char_map.iter_mut() { | ||
let (head, t) = tail.split_at(*normalized_len as usize); | ||
tail = t; | ||
normalized.clear(); | ||
// then normalize each sub-strings recomputing the size in the char_map. | ||
let mut peekable = head.chars().peekable(); | ||
while let Some(c) = peekable.next() { | ||
let (c, peek_consumed) = recompose_swedish(c, peekable.peek()); | ||
if peek_consumed { | ||
peekable.next(); | ||
} | ||
|
||
normalized.push(c); | ||
} | ||
|
||
*normalized_len = normalized.len() as u8; | ||
lemma.push_str(normalized.as_ref()); | ||
} | ||
|
||
token.lemma = Cow::Owned(lemma); | ||
token.char_map = Some(char_map); | ||
} | ||
None => { | ||
// if no char_map exists, iterate over the lemma recomposing characters. | ||
let mut char_map = Vec::new(); | ||
let mut lemma = String::new(); | ||
let mut peekable = token.lemma.chars().peekable(); | ||
while let Some(c) = peekable.next() { | ||
let (normalized, peek_consumed) = recompose_swedish(c, peekable.peek()); | ||
if peek_consumed { | ||
peekable.next(); | ||
} | ||
|
||
if options.create_char_map { | ||
char_map.push((c.len_utf8() as u8, normalized.len_utf8() as u8)); | ||
} | ||
lemma.push(normalized); | ||
} | ||
token.lemma = Cow::Owned(lemma); | ||
if options.create_char_map { | ||
token.char_map = Some(char_map); | ||
} | ||
} | ||
} | ||
|
||
token | ||
} | ||
|
||
// Returns `true` if the Normalizer should be used. | ||
fn should_normalize(&self, token: &Token) -> bool { | ||
token.script == Script::Latin && MATCHING_STR.is_match(token.lemma()) | ||
} | ||
} | ||
|
||
/// Tries to merge `current` with the combining mark that follows it.
///
/// Returns the character to emit and a flag telling whether `next` was merged into it
/// (in which case the caller must skip `next`). Only `A`/`a` followed by U+030A and
/// `A`/`a`/`O`/`o` followed by U+0308 are merged; any other input yields
/// `(current, false)`.
fn recompose_swedish(current: char, next: Option<&char>) -> (char, bool) {
    // Dispatch on the combining mark first, then on the base letter.
    let composed = match next.copied() {
        // COMBINING RING ABOVE
        Some('\u{30a}') => match current {
            'A' => Some('\u{c5}'), // Å
            'a' => Some('\u{e5}'), // å
            _ => None,
        },
        // COMBINING DIAERESIS
        Some('\u{308}') => match current {
            'A' => Some('\u{c4}'), // Ä
            'a' => Some('\u{e4}'), // ä
            'O' => Some('\u{d6}'), // Ö
            'o' => Some('\u{f6}'), // ö
            _ => None,
        },
        _ => None,
    };

    match composed {
        Some(letter) => (letter, true),
        None => (current, false),
    }
}
|
||
// Unit tests of the Swedish recomposition normalizer.
#[cfg(test)]
mod test {
    use std::borrow::Cow::Owned;

    use crate::normalizer::test::test_normalizer;
    use crate::normalizer::Normalizer;
    use crate::token::TokenKind;

    // Builds the Latin-script token shared by the fixtures below: 13 characters
    // spanning 19 bytes, every other field left to its default.
    fn latin_token(lemma: &str) -> Token<'static> {
        Token {
            lemma: Owned(lemma.to_string()),
            char_end: 13,
            byte_end: 19,
            script: Script::Latin,
            ..Default::default()
        }
    }

    // Input tokens handed to the normalizer.
    fn tokens() -> Vec<Token<'static>> {
        vec![latin_token("öpÅscålcäsÄÖs")]
    }

    // Expected output of this Normalizer applied on its own.
    fn normalizer_result() -> Vec<Token<'static>> {
        vec![latin_token("öpÅscålcäsÄÖs")]
    }

    // Expected output of the complete Normalizer pipeline.
    fn normalized_tokens() -> Vec<Token<'static>> {
        // Byte length of each character; original and normalized lengths are equal.
        let byte_lengths: [u8; 13] = [2, 1, 2, 1, 1, 2, 1, 1, 2, 1, 2, 2, 1];
        let char_map: Vec<(u8, u8)> = byte_lengths.iter().map(|&len| (len, len)).collect();

        vec![Token {
            lemma: Owned("öpåscålcäsäös".to_string()),
            char_end: 13,
            byte_end: 19,
            char_map: Some(char_map),
            script: Script::Latin,
            kind: TokenKind::Word,
            ..Default::default()
        }]
    }

    test_normalizer!(
        SwedishRecompositionNormalizer,
        tokens(),
        normalizer_result(),
        normalized_tokens()
    );
}