Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Addition of WRatio #9

Draft
wants to merge 5 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
157 changes: 157 additions & 0 deletions src/common.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,118 @@
use std::cmp::Ord;
use std::fmt::Debug;
use std::vec::Vec;

use crate::details::splitted_sentence::{is_space, IsSpace, SplittedSentence};
use crate::HashableChar;

/// Result of [`set_decomposition`]: the tokens unique to each input and the
/// tokens shared by both, each wrapped as a `SplittedSentence`.
#[derive(Debug, Clone)]
pub struct DecomposedSet<CharT> {
    /// Tokens present in `a` but not in `b`.
    pub difference_ab: SplittedSentence<CharT>,
    /// Tokens present in `b` but not in `a`.
    pub difference_ba: SplittedSentence<CharT>,
    /// Tokens present in both `a` and `b` (multiset intersection after dedupe).
    pub intersection: SplittedSentence<CharT>,
}

/// Decomposes two splitted sentences into their shared tokens and the tokens
/// unique to each side.
///
/// Mirrors the C++ implementation:
/// - Both `a` and `b` are deduplicated first (consecutive duplicates).
/// - Each token of `a` is then matched against the remaining tokens of `b`;
///   a match moves the token into the intersection, otherwise it belongs to
///   `difference_ab`. Whatever is left of `b` forms `difference_ba`.
///
/// # Parameters
/// - `a`: a `SplittedSentence<CharT>`
/// - `b`: a `SplittedSentence<CharT>`
///
/// # Returns
/// - `DecomposedSet<CharT>` holding `difference_ab`, `difference_ba`, and
///   `intersection`
///
/// # Requirements
/// `CharT` must implement `IsSpace`, `HashableChar`, `Copy`, and `Ord` so the
/// sentences can be deduplicated and their tokens compared.
pub fn set_decomposition<CharT>(
    mut a: SplittedSentence<CharT>,
    mut b: SplittedSentence<CharT>,
) -> DecomposedSet<CharT>
where
    CharT: IsSpace + HashableChar + Copy + Ord,
{
    // Deduplicate both sides before comparing.
    a.dedupe();
    b.dedupe();

    // Tokens of `b` not yet matched by a token of `a`.
    let mut remaining_b = b.words().clone();
    let mut only_in_a = Vec::new();
    let mut shared = Vec::new();

    for token in a.words() {
        match remaining_b.iter().position(|word| word == token) {
            // Common token: consume it from `b`'s pool and record it once.
            Some(idx) => {
                remaining_b.remove(idx);
                shared.push(token.clone());
            }
            // No counterpart in `b`: unique to `a`.
            None => only_in_a.push(token.clone()),
        }
    }

    DecomposedSet {
        difference_ab: SplittedSentence::new(only_in_a),
        difference_ba: SplittedSentence::new(remaining_b),
        intersection: SplittedSentence::new(shared),
    }
}

/// Splits an input iterator into whitespace-separated tokens, sorts them, and
/// returns a `SplittedSentence`.
///
/// # Parameters
/// - `input`: An iterator over the input character sequence.
///
/// # Returns
/// - A `SplittedSentence` containing the tokens in sorted order.
///
/// # Notes
/// - Split points are determined by the `is_space` function, i.e. by the
///   `IsSpace` implementation of `CharT`.
/// - Runs of consecutive whitespace produce no empty tokens.
pub fn sorted_split<Iter, CharT>(input: Iter) -> SplittedSentence<CharT>
where
    Iter: IntoIterator<Item = CharT>,
    CharT: IsSpace + HashableChar + Copy + Ord,
{
    // NOTE: the previous `Iter::IntoIter: Clone + Iterator<Item = CharT>`
    // bound was never used — the iterator is consumed exactly once, and the
    // `Iterator` part is already implied by `IntoIterator`. Dropping it
    // relaxes the interface for callers.
    let mut tokens: Vec<Vec<CharT>> = Vec::new();
    let mut iter = input.into_iter().peekable();

    while let Some(&ch) = iter.peek() {
        // Skip over any whitespace characters.
        if is_space(ch) {
            iter.next();
            continue;
        }

        // Collect one token: everything up to the next whitespace character.
        let mut token = Vec::new();
        while let Some(&ch) = iter.peek() {
            if is_space(ch) {
                break;
            }
            token.push(ch);
            iter.next();
        }

        if !token.is_empty() {
            tokens.push(token);
        }
    }

    // Equal tokens are indistinguishable, so an unstable sort is a free win
    // (no allocation, typically faster) over a stable sort.
    tokens.sort_unstable();

    SplittedSentence::new(tokens)
}

/// Marker type for score computations without a cutoff.
// NOTE(review): presumably passes every raw score through unchanged, in
// contrast to the threshold-carrying cutoff type whose impl is collapsed
// below — confirm against the full impl blocks.
#[derive(Default, Copy, Clone)]
pub struct NoScoreCutoff;
Expand Down Expand Up @@ -84,3 +198,46 @@ where
(raw >= self.0).then_some(raw)
}
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Builds token vectors (one `Vec<char>` per word) from string slices.
    fn tokens(words: &[&str]) -> Vec<Vec<char>> {
        words.iter().map(|word| word.chars().collect()).collect()
    }

    #[test]
    fn test_set_decomposition() {
        let s1 = SplittedSentence::new(tokens(&["fuzzy", "wuzzy", "was"]));
        let s2 = SplittedSentence::new(tokens(&["fuzzy", "fuzzy", "bear"]));

        let result = set_decomposition(s1, s2);

        // After dedupe:
        //   s1 words: fuzzy, wuzzy, was
        //   s2 words: fuzzy, bear
        // so the intersection is {fuzzy}, difference_ab is {wuzzy, was},
        // and difference_ba is {bear}.
        assert_eq!(result.intersection.words(), &tokens(&["fuzzy"]));
        assert_eq!(result.difference_ab.words(), &tokens(&["wuzzy", "was"]));
        assert_eq!(result.difference_ba.words(), &tokens(&["bear"]));
    }
}
1 change: 1 addition & 0 deletions src/details.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ pub mod growing_hashmap;
pub mod intrinsics;
pub mod matrix;
pub mod pattern_match_vector;
pub mod splitted_sentence;
216 changes: 216 additions & 0 deletions src/details/splitted_sentence.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
use crate::HashableChar;

// src/details/splitted_sentence.rs

/// Classifies characters as whitespace and provides a canonical space value.
pub trait IsSpace: Sized + Copy {
    /// Returns `true` if this character counts as whitespace.
    fn is_space(&self) -> bool;

    /// Returns the space character of this type.
    fn space() -> Self;
}

impl IsSpace for char {
    fn is_space(&self) -> bool {
        // Deliberately NOT `char::is_whitespace`: this set additionally
        // includes the separator control codes U+001C..U+001F, mirroring the
        // whitespace definition used by the other token-splitting code here.
        match *self {
            '\u{0009}'..='\u{000D}' // TAB, LF, VT, FF, CR
            | '\u{001C}'..='\u{001F}' // FS, GS, RS, US
            | '\u{0020}' // SPACE
            | '\u{0085}'
            | '\u{00A0}'
            | '\u{1680}'
            | '\u{2000}'..='\u{200A}'
            | '\u{2028}'
            | '\u{2029}'
            | '\u{202F}'
            | '\u{205F}'
            | '\u{3000}' => true,
            _ => false,
        }
    }

    fn space() -> Self {
        ' '
    }
}

impl IsSpace for u8 {
    fn is_space(&self) -> bool {
        // TAB..CR (0x09..0x0D), FS..US (0x1C..0x1F), and SPACE (0x20).
        matches!(*self, 0x09..=0x0D | 0x1C..=0x20)
    }

    fn space() -> Self {
        0x20 // ASCII space
    }
}

/// Returns `true` if `ch` is considered whitespace.
///
/// Free-function convenience wrapper over the [`IsSpace`] trait.
pub fn is_space<CharT: IsSpace>(ch: CharT) -> bool {
    CharT::is_space(&ch)
}

/// A tokenised sentence: an ordered list of whitespace-free tokens.
///
/// Typically produced by `sorted_split`, in which case the tokens are sorted.
#[derive(Debug, Clone)]
pub struct SplittedSentence<CharT> {
    // One inner `Vec<CharT>` per token; whitespace separators are not stored.
    tokens: Vec<Vec<CharT>>,
}

impl<CharT> SplittedSentence<CharT>
where
    CharT: IsSpace + HashableChar + Copy + Ord,
{
    /// Wraps a vector of token vectors in a `SplittedSentence`.
    pub fn new(tokens: Vec<Vec<CharT>>) -> Self {
        Self { tokens }
    }

    /// Removes *consecutive* duplicate tokens (via `Vec::dedup`), which is a
    /// full deduplication whenever the tokens are sorted, e.g. when produced
    /// by `sorted_split`.
    ///
    /// Returns the number of tokens removed.
    pub fn dedupe(&mut self) -> usize {
        let before = self.word_count();
        self.tokens.dedup();
        before - self.word_count()
    }

    /// Total length of the joined sentence: the sum of all token lengths plus
    /// one separating space between each pair of adjacent tokens.
    pub fn size(&self) -> usize {
        match self.tokens.len() {
            0 => 0,
            n => n - 1 + self.tokens.iter().map(Vec::len).sum::<usize>(),
        }
    }

    /// Alias for [`size`](Self::size).
    pub fn length(&self) -> usize {
        self.size()
    }

    /// Returns `true` when the sentence contains no tokens.
    pub fn empty(&self) -> bool {
        self.tokens.is_empty()
    }

    /// Returns the number of tokens.
    pub fn word_count(&self) -> usize {
        self.tokens.len()
    }

    /// Concatenates the tokens into a single character vector, inserting a
    /// single space character between adjacent tokens.
    pub fn join(&self) -> Vec<CharT> {
        let mut joined = Vec::with_capacity(self.size());
        for (index, token) in self.tokens.iter().enumerate() {
            if index > 0 {
                joined.push(CharT::space());
            }
            joined.extend(token.iter().copied());
        }
        joined
    }

    /// Borrows the underlying token list.
    pub fn words(&self) -> &Vec<Vec<CharT>> {
        &self.tokens
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Splits each word of a `&str` slice into a `Vec<char>` token.
    fn char_tokens(words: &[&str]) -> Vec<Vec<char>> {
        words.iter().map(|word| word.chars().collect()).collect()
    }

    #[test]
    fn test_splitted_sentence_char() {
        let mut splitted =
            SplittedSentence::new(char_tokens(&["fuzzy", "wuzzy", "was", "a", "bear"]));

        // "fuzzy wuzzy was a bear": 18 token chars + 4 separating spaces = 22.
        assert_eq!(splitted.size(), 22);

        // No adjacent duplicates, so dedupe removes nothing.
        assert_eq!(splitted.dedupe(), 0);
        assert_eq!(splitted.word_count(), 5);

        let expected: Vec<char> = "fuzzy wuzzy was a bear".chars().collect();
        assert_eq!(splitted.join(), expected);
    }

    #[test]
    fn test_splitted_sentence_u8() {
        let tokens: Vec<Vec<u8>> = ["fuzzy", "wuzzy", "was", "a", "bear"]
            .iter()
            .map(|word| word.bytes().collect())
            .collect();
        let mut splitted = SplittedSentence::new(tokens);

        // Same sentence as the char test, byte-for-byte: 22 characters total.
        assert_eq!(splitted.size(), 22);

        // All tokens are unique, so dedupe removes nothing.
        assert_eq!(splitted.dedupe(), 0);
        assert_eq!(splitted.word_count(), 5);

        assert_eq!(splitted.join(), b"fuzzy wuzzy was a bear".to_vec());
    }
}
Loading