unicode-org · robertbastian · May 4, 2023 · Apr 19, 2023 · Apr 19, 2023 · Apr 19, 2023
@@ -155,24 +155,19 @@ impl<'l> LstmSegmenter<'l> {
  return self.dic.len() as u16;
  };
 
- // The maximum UTF-8 size of a grapheme cluster seems to be 41 bytes
- let mut i = 0;
- let mut buf = [0; 41];
-
- #[allow(clippy::unwrap_used)]
- // debug_asserting whether my assumption is correct
- decode_utf16(grapheme_cluster.iter().copied()).for_each(|c| {
- debug_assert!(i < 37);
- i += c
- .unwrap_or(REPLACEMENT_CHARACTER)
- .encode_utf8(&mut buf[i..])
- .len()
- });
-
- #[allow(clippy::unwrap_used)]
- // debug_asserting whether my assumption is correct
  self.dic
- .get_copied(UnvalidatedStr::from_bytes(&buf[..i]))
+ .get_copied_by(|key| {
+ key.as_bytes().iter().copied().cmp(
 utf8_iter = "1.0.3" 
 utf8_iter = "1.0.3" 
+ decode_utf16(grapheme_cluster.iter().copied()).flat_map(|c| {
+ let mut buf = [0; 4];
+ let len = c
+ .unwrap_or(REPLACEMENT_CHARACTER)
+ .encode_utf8(&mut buf)
+ .len();
+ buf.into_iter().take(len)
+ }),
+ )
+ })
  .unwrap_or_else(|| self.dic.len() as u16)
  })
  .collect()

@@ -235,7 +235,6 @@ mod tests {
 
  #[test]
  fn thai_word_break_with_grapheme_model() {
- const TEST_STR: &str = "ภาษาไทยภาษาไทย";
  let provider = crate::DatagenProvider::for_test();
  let raw_data = provider
  .source
@@ -249,12 +248,16 @@ mod tests {
  ),
  provider,
  );
+
  let segmenter = LineSegmenter::try_new_lstm_with_any_provider(&provider).unwrap();
+
+ const TEST_STR: &str = "ภาษาไทยภาษาไทย";
+ let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();
+
  let breaks: Vec<usize> = segmenter.segment_str(TEST_STR).collect();
- assert_eq!(
- breaks,
- [0, 6, 12, 21, 27, 33, TEST_STR.len()],
- "Thai test with grapheme model"
- );
+ assert_eq!(breaks, [0, 6, 12, 21, 27, 33, TEST_STR.len()],);
+
+ let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
+ assert_eq!(breaks, [0, 2, 4, 7, 9, 11, utf16.len()],);
  }
 }
@@ -252,6 +252,12 @@ where
  self.values.get(index)
  }
 
+ /// Binary-searches the keys with `predicate`; for fixed-size `V`, returns a direct copy of the matched `V` instead of `V::ULE`, or `None` when the search finds no key.
+ pub fn get_copied_by(&self, predicate: impl FnMut(&K) -> Ordering) -> Option<V> {
+ let index = self.keys.zvl_binary_search_by(predicate).ok()?; // a failed search (`Err`) becomes an early `None`
+ self.values.get(index) // look up the value stored at the index where the key was found
+ }
+
  /// Similar to [`Self::iter()`] except it returns a direct copy of the values instead of references
  /// to `V::ULE`, in cases when `V` is fixed-size
  pub fn iter_copied_values<'b>(