Add StopWordFilter #78

Open · wants to merge 6 commits into base: main
1 change: 1 addition & 0 deletions src/lib.rs
@@ -43,6 +43,7 @@ let tokens = tok.tokenize("Flights can't depart after 2:00 pm.");
pub mod errors;
mod math;
pub mod metrics;
pub mod token_processor;
pub mod tokenize;
pub mod tokenize_sentence;
pub mod vectorize;
98 changes: 98 additions & 0 deletions src/token_processor/mod.rs
@@ -0,0 +1,98 @@
// Copyright 2019 vtext developers
//
// Licensed under the Apache License, Version 2.0,
// <http://apache.org/licenses/LICENSE-2.0>. This file may not be copied,
// modified, or distributed except according to those terms.

/*!
# Token processor module

This module includes estimators that operate on tokens, for instance stop word filtering,
n-gram construction, or stemming.

## Stop word filtering

```rust
use vtext::token_processor::*;

let tokens = vec!["this", "is", "a", "long", "sentence"];
let stop_words = vec!["a", "this"];

let filter = StopWordFilterParams::default()
.stop_words(stop_words)
.build()
.unwrap();

let tokens_out: Vec<&str> = filter.transform(tokens.iter().cloned()).collect();
assert_eq!(tokens_out, vec!["is", "long", "sentence"]);
```
*/

use crate::errors::EstimatorErr;
#[cfg(feature = "python")]
use dict_derive::{FromPyObject, IntoPyObject};
use serde::{Deserialize, Serialize};
use std::collections::HashSet;
use std::fmt;

#[cfg(test)]
mod tests;

pub trait TokenProcessor: fmt::Debug {
fn transform<'a, T>(&'a self, tokens: T) -> Box<dyn Iterator<Item = &'a str> + 'a>
where
T: Iterator<Item = &'a str> + 'a;
}

/// Stop words filter
///
#[derive(Clone, Debug)]
pub struct StopWordFilter {
pub params: StopWordFilterParams,
}

/// Builder for the stop words filter
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "python", derive(FromPyObject, IntoPyObject))]
pub struct StopWordFilterParams {
stop_words: HashSet<String>,
}

impl StopWordFilterParams {
pub fn stop_words(&mut self, value: Vec<&str>) -> StopWordFilterParams {
self.stop_words = value.iter().map(|el| el.to_string()).collect();
self.clone()
}
pub fn build(&mut self) -> Result<StopWordFilter, EstimatorErr> {
Ok(StopWordFilter {
params: self.clone(),
})
}
}

impl Default for StopWordFilterParams {
/// Create a new instance
fn default() -> StopWordFilterParams {
StopWordFilterParams {
stop_words: vec!["and", "or", "this"]
.iter()
.map(|el| el.to_string())
.collect(),
}
}
}

impl Default for StopWordFilter {
/// Create a new instance
fn default() -> StopWordFilter {
StopWordFilterParams::default().build().unwrap()
}
}

impl TokenProcessor for StopWordFilter {
fn transform<'a, T>(&'a self, tokens: T) -> Box<dyn Iterator<Item = &'a str> + 'a>
where
T: Iterator<Item = &'a str> + 'a,
{
Box::new(tokens.filter(move |tok| !self.params.stop_words.contains(*tok)))
}
}
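The `TokenProcessor` trait above is the extension point this PR introduces for token-level estimators; the module docs name n-gram construction and stemming as other candidates. As a hedged sketch only (not part of this diff; `MinLengthFilter` and `min_len` are made-up names), another processor could plug into the same trait like this:

```rust
// Hypothetical sketch: a second processor implementing the same trait,
// dropping tokens shorter than `min_len` characters. Assumes this PR's
// trait definition is available from vtext::token_processor.
use vtext::token_processor::TokenProcessor;

#[derive(Debug)]
pub struct MinLengthFilter {
    pub min_len: usize,
}

impl TokenProcessor for MinLengthFilter {
    fn transform<'a, T>(&'a self, tokens: T) -> Box<dyn Iterator<Item = &'a str> + 'a>
    where
        T: Iterator<Item = &'a str> + 'a,
    {
        // Keep only tokens with at least `min_len` Unicode scalar values.
        Box::new(tokens.filter(move |tok| tok.chars().count() >= self.min_len))
    }
}
```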
20 changes: 20 additions & 0 deletions src/token_processor/tests.rs
@@ -0,0 +1,20 @@
// Copyright 2019 vtext developers
//
// Licensed under the Apache License, Version 2.0,
// <http://apache.org/licenses/LICENSE-2.0>. This file may not be copied,
// modified, or distributed except according to those terms.
use crate::token_processor::*;

#[test]
fn test_stop_word_filter() {
let stop_words = vec!["and", "or"];
let tokens = vec!["Today", "and", "tomorrow"];

let filter = StopWordFilterParams::default()
.stop_words(stop_words)
.build()
.unwrap();

let tokens_out: Vec<&str> = filter.transform(tokens.iter().cloned()).collect();
assert_eq!(tokens_out, vec!["Today", "tomorrow"]);
}
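For context, the filter is meant to be chained after a tokenizer. A hedged usage sketch follows, assuming the `Tokenizer` trait and `VTextTokenizerParams` builder from the crate's existing `tokenize` module (not part of this diff):

```rust
// Hedged usage sketch: tokenize a string, then drop stop words from the
// resulting token stream. The tokenizer API is an assumption based on the
// existing vtext::tokenize module, not something added by this PR.
use vtext::token_processor::{StopWordFilterParams, TokenProcessor};
use vtext::tokenize::{Tokenizer, VTextTokenizerParams};

let tokenizer = VTextTokenizerParams::default().lang("en").build().unwrap();
let filter = StopWordFilterParams::default()
    .stop_words(vec!["and", "or"])
    .build()
    .unwrap();

let tokens = tokenizer.tokenize("Today and tomorrow");
let filtered: Vec<&str> = filter.transform(tokens).collect();
assert_eq!(filtered, vec!["Today", "tomorrow"]);
```

Because `transform` accepts any `Iterator<Item = &str>`, the filter composes directly with the iterator returned by the tokenizer without an intermediate allocation.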
3 changes: 0 additions & 3 deletions src/tokenize/mod.rs
@@ -47,9 +47,6 @@ let tokens: Vec<&str> = tokenizer.tokenize(s).collect();
assert_eq!(tokens, &["The", "“", "brown", "”", "fox", "ca", "n't", "jump", "32.3", "feet", ",", "right", "?"]);

*/
extern crate regex;
extern crate unicode_segmentation;

use crate::errors::EstimatorErr;
#[cfg(feature = "python")]
use dict_derive::{FromPyObject, IntoPyObject};