Skip to content

Commit

Permalink
Merge pull request #25 from snipsco/fix/multiple_beginning_stop_words
Browse files Browse the repository at this point in the history
Fix condition on expanding match with previous stop words
  • Loading branch information
adrienball authored Oct 15, 2018
2 parents 367d9dc + bf0ff2d commit b6b1529
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 7 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
# Changelog
All notable changes to this project will be documented in this file.

## [0.5.1] - 2018-10-15
### Changed
- Fix bug affecting the backward expansion of possible matches starting with stop words

[0.5.1]: https://github.com/snipsco/gazetteer-entity-parser/compare/0.5.0...0.5.1

## [0.5.0] - 2018-10-01
### Changed
- Clearer `ParserBuilder`'s API
Expand Down
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "gazetteer-entity-parser"
version = "0.5.0"
version = "0.5.1"
authors = ["Alaa Saade <alaa.saade@snips.ai>"]

[profile.bench]
Expand Down
4 changes: 2 additions & 2 deletions src/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use std::result::Result;
use serde::{Deserialize, Deserializer, Serialize, Serializer};

/// Struct representing the value of an entity to be added to the parser
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq, Hash)]
pub struct EntityValue {
pub resolved_value: String,
pub raw_value: String,
Expand All @@ -12,7 +12,7 @@ pub struct EntityValue {
/// Struct holding a gazetteer, i.e. an ordered list of `EntityValue` to be added to the parser.
/// The values should be added in order of popularity or probability, with the most popular value
/// added first (see Parser).
#[derive(Debug, Clone, PartialEq, Default)]
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct Gazetteer {
/// Entity values in the order they are to be added to the parser; the most popular
/// (or most probable) value is expected to come first.
pub data: Vec<EntityValue>,
}
Expand Down
25 changes: 21 additions & 4 deletions src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ pub struct PossibleMatch {
raw_value_length: u32,
n_consumed_tokens: u32,
last_token_in_input: usize,
first_token_in_resolution: usize,
last_token_in_resolution: usize,
rank: u32,
}
Expand Down Expand Up @@ -643,6 +644,7 @@ impl Parser {
tokens_range: token_idx..(token_idx + 1),
raw_value_length: otokens.len() as u32,
last_token_in_input: token_idx,
first_token_in_resolution: last_token_in_resolution,
last_token_in_resolution,
n_consumed_tokens: 1,
rank: *rank,
Expand Down Expand Up @@ -675,23 +677,24 @@ impl Parser {
range: range_start..range_end,
tokens_range: token_idx..(token_idx + 1),
last_token_in_input: token_idx,
first_token_in_resolution: last_token_in_resolution,
last_token_in_resolution,
n_consumed_tokens: 1,
raw_value_length: otokens.len() as u32,
rank: *rank,
};
let mut n_skips = last_token_in_resolution as u32;

// Bactrack to check if we left out from skipped words at the beginning
// Backtrack to check if we left out from skipped words at the beginning
'outer: for btok_idx in (0..token_idx).rev() {
if skipped_tokens.contains_key(&btok_idx) {
let (skip_range, skip_tok) = skipped_tokens.get(&btok_idx).unwrap();
match otokens.iter().position(|e| *e == *skip_tok) {
Some(idx) => {
if idx < possible_match.last_token_in_resolution {
if idx < possible_match.first_token_in_resolution {
possible_match.range.start = skip_range.start;
possible_match.tokens_range.start = btok_idx;
possible_match.n_consumed_tokens += 1;
possible_match.first_token_in_resolution -= 1;
n_skips -= 1;
} else {
break 'outer;
Expand All @@ -710,7 +713,7 @@ impl Parser {
if possible_match.raw_value_length < n_skips {
return Err(FindPossibleMatchRootError::PossibleMatchRootError(
PossibleMatchRootError::PossibleMatchSkippedError {
possible_match: possible_match,
possible_match
},
));
}
Expand Down Expand Up @@ -773,6 +776,7 @@ impl Parser {
n_consumed_tokens: 1,
last_token_in_input: 0, // we are not going to need this one
last_token_in_resolution: 0, // we are not going to need this one
first_token_in_resolution: 0, // we are not going to need this one
rank: possible_match.rank,
})
}
Expand Down Expand Up @@ -1153,6 +1157,19 @@ mod tests {
range: 26..44,
}]
);

// Multiple stop words at the beginning of a value
let parsed = parser
.run("hello I want to listen to the the rolling stones")
.unwrap();
assert_eq!(
parsed,
vec![ParsedValue {
raw_value: "the rolling stones".to_string(),
resolved_value: "The Rolling Stones".to_string(),
range: 30..48,
}]
);
}

#[test]
Expand Down

0 comments on commit b6b1529

Please sign in to comment.