From aee3e43707f3922779225e57cf799262c9ad8bc5 Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Tue, 13 Feb 2024 16:35:12 +0100
Subject: [PATCH] Add lacking tests and fix bug

---
 charabia/src/normalizer/vietnamese.rs | 164 +++++++++++++++++++++++++-
 1 file changed, 159 insertions(+), 5 deletions(-)

diff --git a/charabia/src/normalizer/vietnamese.rs b/charabia/src/normalizer/vietnamese.rs
index 31e0836f..df6591a8 100644
--- a/charabia/src/normalizer/vietnamese.rs
+++ b/charabia/src/normalizer/vietnamese.rs
@@ -1,14 +1,13 @@
 use super::{CharNormalizer, CharOrStr};
-use crate::Script;
-use crate::Token;
+use crate::{Script, Token};
 
 pub struct VietnameseNormalizer;
 
 impl CharNormalizer for VietnameseNormalizer {
     fn normalize_char(&self, c: char) -> Option<CharOrStr> {
         match c {
-            'Ð' | 'Đ' | 'đ' => Some("d".to_string().into()), // not only Vietnamese, but also many European countries use these letters
-            _ => None,
+            'Ð' | 'Đ' | 'đ' | 'ð' => Some("d".to_string().into()), // not only Vietnamese, but also many European countries use these letters
+            _ => Some(c.into()),
         }
     }
 
@@ -18,5 +17,160 @@ impl CharNormalizer for VietnameseNormalizer {
 }
 
 fn is_should_normalize(c: char) -> bool {
-    matches!(c, 'Ð' | 'Đ' | 'đ')
+    matches!(c, 'Ð' | 'Đ' | 'đ' | 'ð')
+}
+
+#[cfg(test)]
+mod test {
+    use std::borrow::Cow::Owned;
+
+    use crate::normalizer::test::test_normalizer;
+    use crate::normalizer::{Normalizer, NormalizerOption};
+    use crate::token::TokenKind;
+
+    // base tokens to normalize.
+    fn tokens() -> Vec<Token<'static>> {
+        vec![
+            Token {
+                lemma: Owned("Ðại Việt".to_string()),
+                char_end: 8,
+                byte_end: 13,
+                script: Script::Latin,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("Đại Việt".to_string()),
+                char_end: 8,
+                byte_end: 13,
+                script: Script::Latin,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("đại Việt".to_string()),
+                char_end: 8,
+                byte_end: 13,
+                script: Script::Latin,
+                ..Default::default()
+            },
+        ]
+    }
+
+    // expected result of the current Normalizer.
+    fn normalizer_result() -> Vec<Token<'static>> {
+        vec![
+            Token {
+                lemma: Owned("dại Việt".to_string()),
+                char_end: 8,
+                byte_end: 13,
+                char_map: Some(vec![
+                    (2, 1),
+                    (3, 3),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (3, 3),
+                    (1, 1),
+                ]),
+                script: Script::Latin,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("dại Việt".to_string()),
+                char_end: 8,
+                byte_end: 13,
+                char_map: Some(vec![
+                    (2, 1),
+                    (3, 3),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (3, 3),
+                    (1, 1),
+                ]),
+                script: Script::Latin,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("dại Việt".to_string()),
+                char_end: 8,
+                byte_end: 13,
+                char_map: Some(vec![
+                    (2, 1),
+                    (3, 3),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (3, 3),
+                    (1, 1),
+                ]),
+                script: Script::Latin,
+                ..Default::default()
+            },
+        ]
+    }
+
+    // expected result of the complete Normalizer pipeline.
+    fn normalized_tokens() -> Vec<Token<'static>> {
+        vec![
+            Token {
+                kind: TokenKind::Word,
+                lemma: Owned("dai viet".to_string()),
+                char_end: 8,
+                byte_end: 13,
+                char_map: Some(vec![
+                    (2, 1),
+                    (3, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (3, 1),
+                    (1, 1),
+                ]),
+                script: Script::Latin,
+                ..Default::default()
+            },
+            Token {
+                kind: TokenKind::Word,
+                lemma: Owned("dai viet".to_string()),
+                char_end: 8,
+                byte_end: 13,
+                char_map: Some(vec![
+                    (2, 1),
+                    (3, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (3, 1),
+                    (1, 1),
+                ]),
+                script: Script::Latin,
+                ..Default::default()
+            },
+            Token {
+                kind: TokenKind::Word,
+                lemma: Owned("dai viet".to_string()),
+                char_end: 8,
+                byte_end: 13,
+                char_map: Some(vec![
+                    (2, 1),
+                    (3, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (3, 1),
+                    (1, 1),
+                ]),
+                script: Script::Latin,
+                ..Default::default()
+            },
+        ]
+    }
+
+    test_normalizer!(VietnameseNormalizer, tokens(), normalizer_result(), normalized_tokens());
 }