Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Addition of WRatio #9

Draft
wants to merge 5 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
157 changes: 157 additions & 0 deletions src/common.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,118 @@
use std::cmp::Ord;
use std::fmt::Debug;
use std::vec::Vec;

use crate::details::splitted_sentence::{is_space, IsSpace, SplittedSentence};
use crate::HashableChar;

/// Result of [`set_decomposition`]: the tokens unique to each input and the
/// tokens shared by both, each wrapped as a `SplittedSentence`.
#[derive(Debug, Clone)]
pub struct DecomposedSet<CharT> {
    /// Tokens present in `a` but not in `b`.
    pub difference_ab: SplittedSentence<CharT>,
    /// Tokens present in `b` but not in `a`.
    pub difference_ba: SplittedSentence<CharT>,
    /// Tokens present in both `a` and `b` (multiset intersection after dedupe).
    pub intersection: SplittedSentence<CharT>,
}

/// Decomposes two splitted sentences into their shared tokens and the tokens
/// unique to each side.
///
/// Mirrors the C++ implementation:
/// - Both `a` and `b` are deduplicated first (consecutive duplicates).
/// - Each token of `a` is then matched against the remaining tokens of `b`;
///   a match moves the token into the intersection, otherwise it belongs to
///   `difference_ab`. Whatever is left of `b` forms `difference_ba`.
///
/// # Parameters
/// - `a`: a `SplittedSentence<CharT>`
/// - `b`: a `SplittedSentence<CharT>`
///
/// # Returns
/// - `DecomposedSet<CharT>` holding `difference_ab`, `difference_ba`, and
///   `intersection`
///
/// # Requirements
/// `CharT` must implement `IsSpace`, `HashableChar`, `Copy`, and `Ord` so the
/// sentences can be deduplicated and their tokens compared.
pub fn set_decomposition<CharT>(
    mut a: SplittedSentence<CharT>,
    mut b: SplittedSentence<CharT>,
) -> DecomposedSet<CharT>
where
    CharT: IsSpace + HashableChar + Copy + Ord,
{
    // Deduplicate both sides before comparing.
    a.dedupe();
    b.dedupe();

    // Tokens of `b` not yet matched by a token of `a`.
    let mut remaining_b = b.words().clone();
    let mut only_in_a = Vec::new();
    let mut shared = Vec::new();

    for token in a.words() {
        match remaining_b.iter().position(|word| word == token) {
            // Common token: consume it from `b`'s pool and record it once.
            Some(idx) => {
                remaining_b.remove(idx);
                shared.push(token.clone());
            }
            // No counterpart in `b`: unique to `a`.
            None => only_in_a.push(token.clone()),
        }
    }

    DecomposedSet {
        difference_ab: SplittedSentence::new(only_in_a),
        difference_ba: SplittedSentence::new(remaining_b),
        intersection: SplittedSentence::new(shared),
    }
}

/// Splits an input iterator into whitespace-separated tokens, sorts them, and
/// returns a `SplittedSentence`.
///
/// # Parameters
/// - `input`: An iterator over the input character sequence.
///
/// # Returns
/// - A `SplittedSentence` containing the tokens in sorted order.
///
/// # Notes
/// - Split points are determined by the `is_space` function, i.e. by the
///   `IsSpace` implementation of `CharT`.
/// - Runs of consecutive whitespace produce no empty tokens.
pub fn sorted_split<Iter, CharT>(input: Iter) -> SplittedSentence<CharT>
where
    Iter: IntoIterator<Item = CharT>,
    CharT: IsSpace + HashableChar + Copy + Ord,
{
    // NOTE: the previous `Iter::IntoIter: Clone + Iterator<Item = CharT>`
    // bound was never used — the iterator is consumed exactly once, and the
    // `Iterator` part is already implied by `IntoIterator`. Dropping it
    // relaxes the interface for callers.
    let mut tokens: Vec<Vec<CharT>> = Vec::new();
    let mut iter = input.into_iter().peekable();

    while let Some(&ch) = iter.peek() {
        // Skip over any whitespace characters.
        if is_space(ch) {
            iter.next();
            continue;
        }

        // Collect one token: everything up to the next whitespace character.
        let mut token = Vec::new();
        while let Some(&ch) = iter.peek() {
            if is_space(ch) {
                break;
            }
            token.push(ch);
            iter.next();
        }

        if !token.is_empty() {
            tokens.push(token);
        }
    }

    // Equal tokens are indistinguishable, so an unstable sort is a free win
    // (no allocation, typically faster) over a stable sort.
    tokens.sort_unstable();

    SplittedSentence::new(tokens)
}

/// Marker type for score computations without a cutoff.
// NOTE(review): presumably passes every raw score through unchanged, in
// contrast to the threshold-carrying cutoff type whose impl is collapsed
// below — confirm against the full impl blocks.
#[derive(Default, Copy, Clone)]
pub struct NoScoreCutoff;
Expand Down Expand Up @@ -84,3 +198,46 @@ where
(raw >= self.0).then_some(raw)
}
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Builds token vectors (one `Vec<char>` per word) from string slices.
    fn tokens(words: &[&str]) -> Vec<Vec<char>> {
        words.iter().map(|word| word.chars().collect()).collect()
    }

    #[test]
    fn test_set_decomposition() {
        let s1 = SplittedSentence::new(tokens(&["fuzzy", "wuzzy", "was"]));
        let s2 = SplittedSentence::new(tokens(&["fuzzy", "fuzzy", "bear"]));

        let result = set_decomposition(s1, s2);

        // After dedupe:
        //   s1 words: fuzzy, wuzzy, was
        //   s2 words: fuzzy, bear
        // so the intersection is {fuzzy}, difference_ab is {wuzzy, was},
        // and difference_ba is {bear}.
        assert_eq!(result.intersection.words(), &tokens(&["fuzzy"]));
        assert_eq!(result.difference_ab.words(), &tokens(&["wuzzy", "was"]));
        assert_eq!(result.difference_ba.words(), &tokens(&["bear"]));
    }
}
1 change: 1 addition & 0 deletions src/details.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ pub mod growing_hashmap;
pub mod intrinsics;
pub mod matrix;
pub mod pattern_match_vector;
pub mod splitted_sentence;
216 changes: 216 additions & 0 deletions src/details/splitted_sentence.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
use crate::HashableChar;

// src/details/splitted_sentence.rs

/// Classifies characters as whitespace and provides a canonical space value.
pub trait IsSpace: Sized + Copy {
    /// Returns `true` if this character counts as whitespace.
    fn is_space(&self) -> bool;

    /// Returns the space character of this type.
    fn space() -> Self;
}

impl IsSpace for char {
    fn is_space(&self) -> bool {
        // Deliberately NOT `char::is_whitespace`: this set additionally
        // includes the separator control codes U+001C..U+001F, mirroring the
        // whitespace definition used by the other token-splitting code here.
        match *self {
            '\u{0009}'..='\u{000D}' // TAB, LF, VT, FF, CR
            | '\u{001C}'..='\u{001F}' // FS, GS, RS, US
            | '\u{0020}' // SPACE
            | '\u{0085}'
            | '\u{00A0}'
            | '\u{1680}'
            | '\u{2000}'..='\u{200A}'
            | '\u{2028}'
            | '\u{2029}'
            | '\u{202F}'
            | '\u{205F}'
            | '\u{3000}' => true,
            _ => false,
        }
    }

    fn space() -> Self {
        ' '
    }
}

impl IsSpace for u8 {
    fn is_space(&self) -> bool {
        // TAB..CR (0x09..0x0D), FS..US (0x1C..0x1F), and SPACE (0x20).
        matches!(*self, 0x09..=0x0D | 0x1C..=0x20)
    }

    fn space() -> Self {
        0x20 // ASCII space
    }
}

/// Returns `true` if `ch` is considered whitespace.
///
/// Free-function convenience wrapper over the [`IsSpace`] trait.
pub fn is_space<CharT: IsSpace>(ch: CharT) -> bool {
    CharT::is_space(&ch)
}

/// A tokenised sentence: an ordered list of whitespace-free tokens.
///
/// Typically produced by `sorted_split`, in which case the tokens are sorted.
#[derive(Debug, Clone)]
pub struct SplittedSentence<CharT> {
    // One inner `Vec<CharT>` per token; whitespace separators are not stored.
    tokens: Vec<Vec<CharT>>,
}

impl<CharT> SplittedSentence<CharT>
where
    CharT: IsSpace + HashableChar + Copy + Ord,
{
    /// Wraps a vector of token vectors in a `SplittedSentence`.
    pub fn new(tokens: Vec<Vec<CharT>>) -> Self {
        Self { tokens }
    }

    /// Removes *consecutive* duplicate tokens (via `Vec::dedup`), which is a
    /// full deduplication whenever the tokens are sorted, e.g. when produced
    /// by `sorted_split`.
    ///
    /// Returns the number of tokens removed.
    pub fn dedupe(&mut self) -> usize {
        let before = self.word_count();
        self.tokens.dedup();
        before - self.word_count()
    }

    /// Total length of the joined sentence: the sum of all token lengths plus
    /// one separating space between each pair of adjacent tokens.
    pub fn size(&self) -> usize {
        match self.tokens.len() {
            0 => 0,
            n => n - 1 + self.tokens.iter().map(Vec::len).sum::<usize>(),
        }
    }

    /// Alias for [`size`](Self::size).
    pub fn length(&self) -> usize {
        self.size()
    }

    /// Returns `true` when the sentence contains no tokens.
    pub fn empty(&self) -> bool {
        self.tokens.is_empty()
    }

    /// Returns the number of tokens.
    pub fn word_count(&self) -> usize {
        self.tokens.len()
    }

    /// Concatenates the tokens into a single character vector, inserting a
    /// single space character between adjacent tokens.
    pub fn join(&self) -> Vec<CharT> {
        let mut joined = Vec::with_capacity(self.size());
        for (index, token) in self.tokens.iter().enumerate() {
            if index > 0 {
                joined.push(CharT::space());
            }
            joined.extend(token.iter().copied());
        }
        joined
    }

    /// Borrows the underlying token list.
    pub fn words(&self) -> &Vec<Vec<CharT>> {
        &self.tokens
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Splits each word of a `&str` slice into a `Vec<char>` token.
    fn char_tokens(words: &[&str]) -> Vec<Vec<char>> {
        words.iter().map(|word| word.chars().collect()).collect()
    }

    #[test]
    fn test_splitted_sentence_char() {
        let mut splitted =
            SplittedSentence::new(char_tokens(&["fuzzy", "wuzzy", "was", "a", "bear"]));

        // "fuzzy wuzzy was a bear": 18 token chars + 4 separating spaces = 22.
        assert_eq!(splitted.size(), 22);

        // No adjacent duplicates, so dedupe removes nothing.
        assert_eq!(splitted.dedupe(), 0);
        assert_eq!(splitted.word_count(), 5);

        let expected: Vec<char> = "fuzzy wuzzy was a bear".chars().collect();
        assert_eq!(splitted.join(), expected);
    }

    #[test]
    fn test_splitted_sentence_u8() {
        let tokens: Vec<Vec<u8>> = ["fuzzy", "wuzzy", "was", "a", "bear"]
            .iter()
            .map(|word| word.bytes().collect())
            .collect();
        let mut splitted = SplittedSentence::new(tokens);

        // Same sentence as the char test, byte-for-byte: 22 characters total.
        assert_eq!(splitted.size(), 22);

        // All tokens are unique, so dedupe removes nothing.
        assert_eq!(splitted.dedupe(), 0);
        assert_eq!(splitted.word_count(), 5);

        assert_eq!(splitted.join(), b"fuzzy wuzzy was a bear".to_vec());
    }
}
Loading