Skip to content

Commit

Permalink
Restore boundary_property when iterator reaches EOF and matches no ru…
Browse files Browse the repository at this point in the history
…le (#3404)

In the test case `one.`, when we reach `e` and `.`, we must look ahead one more
character to determine whether WB6 [1] applies. However, `.` followed by EOF doesn't
match any rule, which causes `e` and `.` to match WB999 [2] (a break) instead. We should
restore `boundary_property` in this scenario.

[1] https://www.unicode.org/reports/tr29/#WB6
[2] https://www.unicode.org/reports/tr29/#WB999
aethanyc authored May 9, 2023
1 parent 8e389a5 commit 1286699
Showing 2 changed files with 39 additions and 0 deletions.
1 change: 1 addition & 0 deletions components/segmenter/src/rule_segmenter.rs
Original file line number Diff line number Diff line change
@@ -151,6 +151,7 @@ impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> Iterator for RuleBreakIterator<'
.get_break_state_from_table(break_state as u8, self.data.eot_property)
== NOT_MATCH_RULE
{
self.boundary_property = previous_left_prop;
self.iter = previous_iter;
self.current_pos_data = previous_pos_data;
return self.get_current_position();
38 changes: 38 additions & 0 deletions components/segmenter/tests/word_rule_status.rs
Original file line number Diff line number Diff line change
@@ -36,6 +36,44 @@ fn rule_status() {
assert!(iter.is_word_like(), "Number is true");
}

#[test]
fn rule_status_letter_eof() {
let segmenter =
WordSegmenter::try_new_auto_unstable(&icu_testdata::unstable()).expect("Data exists");
let mut iter = segmenter.segment_str("one.");

assert_eq!(iter.next(), Some(0), "SOT");
assert_eq!(iter.word_type(), WordType::None, "none");
assert!(!iter.is_word_like(), "SOT is false");

assert_eq!(iter.next(), Some(3), "after one");
assert_eq!(iter.word_type(), WordType::Letter, "letter");
assert!(iter.is_word_like(), "Letter is true");

assert_eq!(iter.next(), Some(4), "after full stop");
assert_eq!(iter.word_type(), WordType::None, "none");
assert!(!iter.is_word_like(), "None is false");
}

#[test]
fn rule_status_numeric_eof() {
let segmenter =
WordSegmenter::try_new_auto_unstable(&icu_testdata::unstable()).expect("Data exists");
let mut iter = segmenter.segment_str("42.");

assert_eq!(iter.next(), Some(0), "SOT");
assert_eq!(iter.word_type(), WordType::None, "none");
assert!(!iter.is_word_like(), "SOT is false");

assert_eq!(iter.next(), Some(2), "after 42");
assert_eq!(iter.word_type(), WordType::Number, "Number");
assert!(iter.is_word_like(), "Number is true");

assert_eq!(iter.next(), Some(3), "after full stop");
assert_eq!(iter.word_type(), WordType::None, "none");
assert!(!iter.is_word_like(), "None is false");
}

#[test]
fn rule_status_th() {
let segmenter =

0 comments on commit 1286699

Please sign in to comment.