From 0bee697fbae06edf672012713430ec12f5e83f1c Mon Sep 17 00:00:00 2001 From: Robert Bastian Date: Wed, 3 May 2023 20:10:03 +0200 Subject: [PATCH] cmp iters --- components/segmenter/src/complex/lstm/mod.rs | 29 ++++++++------------ utils/zerovec/src/map/borrowed.rs | 6 ++++ 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/components/segmenter/src/complex/lstm/mod.rs b/components/segmenter/src/complex/lstm/mod.rs index 2ff44e7e15f..c9f72cbd740 100644 --- a/components/segmenter/src/complex/lstm/mod.rs +++ b/components/segmenter/src/complex/lstm/mod.rs @@ -155,24 +155,19 @@ impl<'l> LstmSegmenter<'l> { return self.dic.len() as u16; }; - // The maximum UTF-8 size of a grapheme cluster seems to be 41 bytes - let mut i = 0; - let mut buf = [0; 41]; - - #[allow(clippy::unwrap_used)] - // debug_asserting whether my assumption is correct - decode_utf16(grapheme_cluster.iter().copied()).for_each(|c| { - debug_assert!(i < 37); - i += c - .unwrap_or(REPLACEMENT_CHARACTER) - .encode_utf8(&mut buf[i..]) - .len() - }); - - #[allow(clippy::unwrap_used)] - // debug_asserting whether my assumption is correct self.dic - .get_copied(UnvalidatedStr::from_bytes(&buf[..i])) + .get_copied_by(|key| { + key.as_bytes().iter().copied().cmp( + decode_utf16(grapheme_cluster.iter().copied()).flat_map(|c| { + let mut buf = [0; 4]; + let len = c + .unwrap_or(REPLACEMENT_CHARACTER) + .encode_utf8(&mut buf) + .len(); + buf.into_iter().take(len) + }), + ) + }) .unwrap_or_else(|| self.dic.len() as u16) }) .collect() diff --git a/utils/zerovec/src/map/borrowed.rs b/utils/zerovec/src/map/borrowed.rs index bc93ee49795..b6307990d1f 100644 --- a/utils/zerovec/src/map/borrowed.rs +++ b/utils/zerovec/src/map/borrowed.rs @@ -252,6 +252,12 @@ where self.values.get(index) } + /// For cases when `V` is fixed-size, obtain a direct copy of `V` instead of `V::ULE` + pub fn get_copied_by(&self, predicate: impl FnMut(&K) -> Ordering) -> Option<V> { + let index =
self.keys.zvl_binary_search_by(predicate).ok()?; + self.values.get(index) + } + /// Similar to [`Self::iter()`] except it returns a direct copy of the values instead of references /// to `V::ULE`, in cases when `V` is fixed-size pub fn iter_copied_values<'b>(