Skip to content

Commit

Permalink
Merge #270
Browse files Browse the repository at this point in the history
270: Vietnamese: Add lacking tests and fix bug r=ManyTheFish a=ManyTheFish

The pre-implemented tests were removed during the implementation of the Vietnamese normalizer, which is why the bug was missed.


Co-authored-by: ManyTheFish <many@meilisearch.com>
Co-authored-by: Many the fish <many@meilisearch.com>
  • Loading branch information
meili-bors[bot] and ManyTheFish authored Feb 13, 2024
2 parents b140e49 + ed34dd6 commit a869338
Showing 1 changed file with 159 additions and 5 deletions.
164 changes: 159 additions & 5 deletions charabia/src/normalizer/vietnamese.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
use super::{CharNormalizer, CharOrStr};
use crate::Script;
use crate::Token;
use crate::{Script, Token};

/// Character-level normalizer for Vietnamese text.
///
/// A stateless unit struct; its behavior lives in its `CharNormalizer` implementation.
pub struct VietnameseNormalizer;

impl CharNormalizer for VietnameseNormalizer {
/// Maps a single character to its normalized form.
///
/// The letters `Ð`, `Đ`, `đ` and `ð` are folded to a plain `"d"`; every other
/// character is returned unchanged.
fn normalize_char(&self, c: char) -> Option<CharOrStr> {
    match c {
        // Not only Vietnamese: many European languages use these letters too.
        'Ð' | 'Đ' | 'đ' | 'ð' => Some("d".to_string().into()),
        // Pass every other character through untouched.
        // NOTE(review): returning `None` here presumably makes the caller drop the
        // character — confirm against the `CharNormalizer` contract.
        _ => Some(c.into()),
    }
}

Expand All @@ -18,5 +17,160 @@ impl CharNormalizer for VietnameseNormalizer {
}

/// Returns `true` when `c` is one of the letters this normalizer rewrites
/// (`Ð`, `Đ`, `đ` or `ð`), and `false` for any other character.
fn is_should_normalize(c: char) -> bool {
    matches!(c, 'Ð' | 'Đ' | 'đ' | 'ð')
}

#[cfg(test)]
mod test {
    use std::borrow::Cow::Owned;

    use crate::normalizer::test::test_normalizer;
    use crate::normalizer::{Normalizer, NormalizerOption};
    use crate::token::TokenKind;

    // Input tokens: the same two words, once per leading-letter variant under test.
    fn tokens() -> Vec<Token<'static>> {
        ["Ðại Việt", "Đại Việt", "đại Việt"]
            .iter()
            .map(|&spelling| Token {
                lemma: Owned(spelling.to_string()),
                char_end: 8,
                byte_end: 13,
                script: Script::Latin,
                ..Default::default()
            })
            .collect()
    }

    // Expected output of this normalizer alone: all three inputs yield the same
    // token, with only the leading letter replaced by "d".
    fn normalizer_result() -> Vec<Token<'static>> {
        (0..3)
            .map(|_| Token {
                lemma: Owned("dại Việt".to_string()),
                char_end: 8,
                byte_end: 13,
                char_map: Some(vec![
                    (2, 1),
                    (3, 3),
                    (1, 1),
                    (1, 1),
                    (1, 1),
                    (1, 1),
                    (3, 3),
                    (1, 1),
                ]),
                script: Script::Latin,
                ..Default::default()
            })
            .collect()
    }

    // Expected output of the complete normalizer pipeline: all three inputs
    // yield the same word token.
    fn normalized_tokens() -> Vec<Token<'static>> {
        (0..3)
            .map(|_| Token {
                kind: TokenKind::Word,
                lemma: Owned("dai viet".to_string()),
                char_end: 8,
                byte_end: 13,
                char_map: Some(vec![
                    (2, 1),
                    (3, 1),
                    (1, 1),
                    (1, 1),
                    (1, 1),
                    (1, 1),
                    (3, 1),
                    (1, 1),
                ]),
                script: Script::Latin,
                ..Default::default()
            })
            .collect()
    }

    test_normalizer!(VietnameseNormalizer, tokens(), normalizer_result(), normalized_tokens());
}

0 comments on commit a869338

Please sign in to comment.