diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..a6fd370
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,64 @@
+name: CI
+
+on:
+  pull_request:
+  push:
+    branches:
+      - 'main'
+  schedule:
+    - cron: '0 9 * * *'
+
+jobs:
+  fmt:
+    name: Rust fmt
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install Rust
+        uses: actions-rust-lang/setup-rust-toolchain@v1
+        with:
+          toolchain: 1.74.0
+          components: rustfmt
+      - name: Cargo fmt check
+        run: cargo fmt --all -- --check
+
+  clippy:
+    name: Clippy
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install Rust
+        uses: actions-rust-lang/setup-rust-toolchain@v1
+        with:
+          toolchain: 1.74.0
+          components: clippy
+      - name: Cargo Clippy
+        run: cargo clippy -- -D warnings
+
+  build:
+    name: Build
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install Rust
+        uses: actions-rust-lang/setup-rust-toolchain@v1
+        with:
+          toolchain: 1.74.0
+      - name: Cargo build
+        run: cargo build
+
+  test:
+    name: Test
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install Rust
+        uses: actions-rust-lang/setup-rust-toolchain@v1
+        with:
+          toolchain: 1.74.0
+      - name: Cargo test
+        run: cargo test
+
+
+
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index cdb8af4..0000000
--- a/.travis.yml
+++ /dev/null
@@ -1,14 +0,0 @@
-language: rust
-
-rust:
-- stable
-- beta
-- nightly
-
-matrix:
-  allow_failures:
-    - rust: nightly
-
-script:
-  - cargo check --all --tests --benches --examples
-  - cargo test --all
diff --git a/Cargo.toml b/Cargo.toml
index b51aec4..4c2a149 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -11,24 +11,10 @@ debug = true
 
 [dependencies]
 failure = "0.1"
-serde_json = "1.0"
-serde = { version = "1.0", features = ["derive"] }
-rmp-serde = "0.13"
-fnv = "1.0"
+serde_json = "1"
+serde = { version = "1", features = ["derive"] }
+rmp-serde = "1"
+fnv = "1"
 
 [dev-dependencies]
-criterion = "0.2"
-dinghy-test = "0.4"
-rand = "0.7"
 tempfile = "3"
-clap = "2"
-
-[[bench]]
-name = "bench_parser"
-harness = false
-
-[[example]]
-name = "interactive_parsing_cli"
-
-[[example]]
-name = "entity_parsing_from_scratch"
diff --git a/README.rst b/README.rst
index f235310..d4e74f6 100644
--- a/README.rst
+++ b/README.rst
@@ -1,8 +1,7 @@
 Gazetteer Entity Parser
 =======================
 
-.. image:: https://travis-ci.org/snipsco/gazetteer-entity-parser.svg?branch=master
-    :target: https://travis-ci.org/snipsco/gazetteer-entity-parser
+**THIS LIBRARY IS NOT ACTIVELY MAINTAINED ANYMORE**
 
 This Rust library allows to parse and resolve entity values based on a gazetteer, in the context of an `Information Extraction `_ task.
 
diff --git a/benches/bench_parser.rs b/benches/bench_parser.rs
deleted file mode 100644
index 9cd7835..0000000
--- a/benches/bench_parser.rs
+++ /dev/null
@@ -1,157 +0,0 @@
-#[macro_use]
-extern crate criterion;
-extern crate dinghy_test;
-extern crate gazetteer_entity_parser;
-extern crate rand;
-extern crate serde_json;
-
-use criterion::Criterion;
-use gazetteer_entity_parser::*;
-use rand::distributions::Alphanumeric;
-use rand::rngs::ThreadRng;
-use rand::seq::IteratorRandom;
-use rand::thread_rng;
-use rand::Rng;
-use std::collections::HashSet;
-
-pub fn test_data_path() -> ::std::path::PathBuf {
-    ::dinghy_test::try_test_file_path("data").unwrap_or_else(|| "data".into())
-}
-
-/// Function generating a random string representing a single word of various length
-fn generate_random_string(rng: &mut ThreadRng) -> String {
-    let n_char = rng.gen_range(3, 8);
-    rng.sample_iter(&Alphanumeric).take(n_char).collect()
-}
-
-/// Random string generator with tunable redundancy to make it harder for the parser
-#[derive(Clone)]
-struct RandomStringGenerator {
-    vocabulary: Vec<String>,
-    max_words: usize,
-    rng: ThreadRng,
-    already_generated: HashSet<String>,
-}
-
-impl RandomStringGenerator {
-    fn new(vocab_size: usize, max_words: usize) -> RandomStringGenerator {
-        let mut rng = thread_rng();
-        let unique_strings = (0..vocab_size)
-            .map(|_| generate_random_string(&mut rng))
-            .collect();
-        RandomStringGenerator {
-            vocabulary: unique_strings,
-            max_words,
-            rng,
-            already_generated: HashSet::new(),
-        }
-    }
-}
-
-impl Iterator for RandomStringGenerator {
-    type Item = String;
-
-    fn next(&mut self) -> Option<String> {
-        loop {
-            let n_words = self.rng.gen_range(1, self.max_words);
-            let generated_value = self
-                .vocabulary
-                .iter()
-                .choose_multiple(&mut self.rng, n_words)
-                .iter()
-                .map(|sample_string| sample_string.to_string())
-                .collect::<Vec<String>>()
-                .join(" ");
-            if !self.already_generated.contains(&generated_value) {
-                self.already_generated.insert(generated_value.clone());
-                break Some(generated_value);
-            }
-        }
-    }
-}
-
-fn generate_random_gazetteer(
-    vocab_size: usize,
-    nb_entity_values: usize,
-    max_words: usize,
-) -> (Gazetteer, RandomStringGenerator) {
-    let rsg = RandomStringGenerator::new(vocab_size, max_words);
-    let entity_values = rsg
-        .clone()
-        .take(nb_entity_values)
-        .map(|string| EntityValue {
-            resolved_value: string.to_lowercase(),
-            raw_value: string,
-        })
-        .collect();
-    let gazetteer = Gazetteer {
-        data: entity_values,
-    };
-    (gazetteer, rsg)
-}
-
-fn generate_random_parser(
-    vocab_size: usize,
-    nb_entity_values: usize,
-    max_words: usize,
-    minimum_tokens_ratio: f32,
-    n_stop_words: usize,
-) -> (Parser, RandomStringGenerator) {
-    let (gazetteer, rsg) = generate_random_gazetteer(vocab_size, nb_entity_values, max_words);
-    let parser = ParserBuilder::default()
-        .gazetteer(gazetteer)
-        .minimum_tokens_ratio(minimum_tokens_ratio)
-        .n_stop_words(n_stop_words)
-        .build()
-        .unwrap();
-    (parser, rsg)
-}
-
-fn get_low_redundancy_parser() -> (Parser, RandomStringGenerator) {
-    generate_random_parser(10000, 100000, 10, 0.5, 50)
-}
-
-fn get_high_redundancy_parser() -> (Parser, RandomStringGenerator) {
-    generate_random_parser(100, 100000, 5, 0.5, 50)
-}
-
-fn parsing_low_redundancy(c: &mut Criterion) {
-    let (parser, mut rsg) = get_low_redundancy_parser();
-    c.bench_function("Parse random value - low redundancy", move |b| {
-        b.iter(|| parser.run(&rsg.next().unwrap(), 10))
-    });
-}
-
-fn parsing_high_redundancy(c: &mut Criterion) {
-    let (parser, mut rsg) = get_high_redundancy_parser();
c.bench_function("Parse random value - high redundancy", move |b| { - b.iter(|| parser.run(&rsg.next().unwrap(), 10)) - }); -} - -fn loading(c: &mut Criterion) { - let (gazetteer, _) = generate_random_gazetteer(100, 1000, 5); - let parser_directory = test_data_path().join("benches").join("parser"); - if !parser_directory.exists() { - let parser = ParserBuilder::default() - .gazetteer(gazetteer) - .minimum_tokens_ratio(0.5) - .n_stop_words(50) - .build() - .unwrap(); - - parser.dump(&parser_directory).unwrap(); - } - c.bench_function( - "Loading random gazetteer parser with low redundancy", - move |b| b.iter(|| Parser::from_folder(parser_directory.clone()).unwrap()), - ); -} - -criterion_group!( - benches, - parsing_low_redundancy, - parsing_high_redundancy, - loading -); -criterion_main!(benches); diff --git a/data/benches/.gitignore b/data/benches/.gitignore deleted file mode 100644 index 763d456..0000000 --- a/data/benches/.gitignore +++ /dev/null @@ -1 +0,0 @@ -parser diff --git a/examples/entity_parsing_from_scratch.rs b/examples/entity_parsing_from_scratch.rs deleted file mode 100644 index 23d8df0..0000000 --- a/examples/entity_parsing_from_scratch.rs +++ /dev/null @@ -1,47 +0,0 @@ -extern crate gazetteer_entity_parser; - -use gazetteer_entity_parser::*; - -fn main() { - let gazetteer = gazetteer!( - ("king of pop", "Michael Jackson"), - ("the rolling stones", "The Rolling Stones"), - ("the crying stones", "The Crying Stones"), - ("the fab four", "The Beatles"), - ("queen of soul", "Aretha Franklin"), - ); - let parser = ParserBuilder::default() - .gazetteer(gazetteer) - .minimum_tokens_ratio(2. / 3.) - .build() - .unwrap(); - - let sentence = "My favourite artists are the stones and fab four"; - let extracted_entities = parser.run(sentence, 5); - assert_eq!( - extracted_entities, - vec![ - ParsedValue { - matched_value: "the stones".to_string(), - resolved_value: ResolvedValue { - resolved: "The Rolling Stones".to_string(), - raw_value: "the rolling stones".to_string(), - }, - alternatives: vec![ResolvedValue { - resolved: "The Crying Stones".to_string(), - raw_value: "the crying stones".to_string(), - }], - range: 25..35, - }, - ParsedValue { - matched_value: "fab four".to_string(), - resolved_value: ResolvedValue { - resolved: "The Beatles".to_string(), - raw_value: "the fab four".to_string(), - }, - alternatives: vec![], - range: 40..48, - } - ] - ); -} diff --git a/examples/interactive_parsing_cli.rs b/examples/interactive_parsing_cli.rs deleted file mode 100644 index 075b48d..0000000 --- a/examples/interactive_parsing_cli.rs +++ /dev/null @@ -1,104 +0,0 @@ -extern crate clap; -extern crate gazetteer_entity_parser; -extern crate serde_json; - -use std::io::Write; -use std::{fs, io}; - -use clap::{App, Arg}; - -use gazetteer_entity_parser::{Gazetteer, Parser, ParserBuilder}; - -fn main() { - let mut app = App::new("gazetteer-entity-parser-demo") - .about("Interactive CLI for parsing gazetteer entities") - .arg( - Arg::with_name("parser") - .short("p") - .long("--parser") - .takes_value(true) - .help("path to the parser directory"), - ) - .arg( - Arg::with_name("gazetteer") - .short("g") - .long("--gazetteer") - .takes_value(true) - .help("path to the json gazetteer file"), - ) - .arg( - Arg::with_name("opt_nb_stop_words") - .short("n") - .long("--nb-stop-words") - .takes_value(true) - .help("number of stop words to use"), - ) - .arg( - Arg::with_name("opt_tokens_ratio") - .short("r") - .long("--ratio") - .takes_value(true) - .help("minimum tokens ratio for the parser"), - ) - 
-        .arg(
-            Arg::with_name("opt_max_alternatives")
-                .short("a")
-                .long("--alternatives")
-                .takes_value(true)
-                .help("maximum number of alternative resolved values"),
-        );
-    let matches = app.clone().get_matches();
-
-    let opt_nb_stop_words = matches
-        .value_of("opt_nb_stop_words")
-        .map(|nb_str| nb_str.to_string().parse::<usize>().unwrap());
-
-    let opt_tokens_ratio = matches
-        .value_of("opt_tokens_ratio")
-        .map(|ratio_str| ratio_str.to_string().parse::<f32>().unwrap());
-    let max_alternatives = matches
-        .value_of("opt_max_alternatives")
-        .map(|max_str| max_str.to_string().parse::<usize>().unwrap())
-        .unwrap_or(5);
-
-    if let Some(parser) = matches
-        .value_of("parser")
-        .map(|parser_dir| {
-            println!("\nLoading the parser...");
-            let mut parser = Parser::from_folder(parser_dir).unwrap();
-            if let Some(ratio) = opt_tokens_ratio {
-                parser.set_threshold(ratio);
-            };
-            if let Some(nb_stop_words) = opt_nb_stop_words {
-                parser.set_stop_words(nb_stop_words, None);
-            };
-            parser
-        })
-        .or_else(|| {
-            matches.value_of("gazetteer").map(|gazetteer_path| {
-                println!("\nLoading the gazetteer...");
-                let gazetteer_file = fs::File::open(&gazetteer_path).unwrap();
-                let gazetteer: Gazetteer = serde_json::from_reader(gazetteer_file).unwrap();
-
-                println!("\nBuilding the parser...");
-                ParserBuilder::default()
-                    .gazetteer(gazetteer)
-                    .n_stop_words(opt_nb_stop_words.unwrap_or(0))
-                    .minimum_tokens_ratio(opt_tokens_ratio.unwrap_or(1.0))
-                    .build()
-                    .unwrap()
-            })
-        })
-    {
-        loop {
-            print!("> ");
-            io::stdout().flush().unwrap();
-            let mut query = String::new();
-            io::stdin().read_line(&mut query).unwrap();
-            let result = parser.run(query.trim(), max_alternatives);
-            println!("{:?}", result);
-        }
-    } else {
-        app.print_long_help().unwrap();
-    }
-}
diff --git a/src/data.rs b/src/data.rs
index e9570ab..9e88837 100644
--- a/src/data.rs
+++ b/src/data.rs
@@ -15,8 +15,7 @@ impl EntityValue {
     pub fn into_tokenized(self) -> TokenizedEntityValue {
         TokenizedEntityValue {
             resolved_value: self.resolved_value,
-            tokens: whitespace_tokenizer(&*self.raw_value)
-                .into_iter()
+            tokens: whitespace_tokenizer(&self.raw_value)
                 .map(|(_, token)| token)
                 .collect(),
         }
@@ -128,7 +127,7 @@ impl Gazetteer {
 
     /// Extend the Gazetteer with the values of another Gazetteer
    pub fn extend(&mut self, gazetteer: Self) {
-        self.data.extend(gazetteer.data.into_iter())
+        self.data.extend(gazetteer.data)
     }
 }
 
@@ -163,6 +162,7 @@ impl Ord for ParsedValue {
 }
 
 impl PartialOrd for ParsedValue {
+    #[allow(clippy::non_canonical_partial_ord_impl)]
     fn partial_cmp(&self, other: &ParsedValue) -> Option<Ordering> {
         if self.range.end <= other.range.start {
             Some(Ordering::Less)
diff --git a/src/parser.rs b/src/parser.rs
index 65a8a78..af67013 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -1,14 +1,3 @@
-use crate::constants::*;
-use crate::data::EntityValue;
-use crate::errors::*;
-use crate::parser_registry::ParserRegistry;
-use crate::utils::{check_threshold, whitespace_tokenizer};
-use crate::ParsedValue;
-use failure::{format_err, ResultExt};
-use fnv::{FnvHashMap, FnvHashSet};
-use rmp_serde::{from_read, Serializer};
-use serde::{Deserialize, Serialize};
-use serde_json;
 use std::cmp::Ordering;
 use std::collections::hash_map::Entry;
 use std::collections::{BinaryHeap, HashSet};
@@ -16,6 +5,18 @@ use std::fs;
 use std::ops::Range;
 use std::path::Path;
 
+use failure::{format_err, ResultExt};
+use fnv::{FnvHashMap, FnvHashSet};
+use rmp_serde::{from_read, Serializer};
+use serde::{Deserialize, Serialize};
+
+use crate::constants::*;
+use crate::data::EntityValue;
+use crate::errors::*;
+use crate::parser_registry::ParserRegistry;
+use crate::utils::{check_threshold, whitespace_tokenizer};
+use crate::ParsedValue;
+
 /// Struct representing the parser. The Parser will match the longest possible contiguous
 /// substrings of a query that match partial entity values. The order in which the values are
 /// added to the parser matters: In case of ambiguity between two parsings, the Parser will output
@@ -73,19 +74,18 @@ impl PossibleMatch {
 }
 
 impl Ord for PossibleMatch {
+    #[allow(clippy::if_same_then_else)]
     fn cmp(&self, other: &PossibleMatch) -> Ordering {
         if self.n_consumed_tokens < other.n_consumed_tokens {
             Ordering::Less
         } else if self.n_consumed_tokens > other.n_consumed_tokens {
             Ordering::Greater
+        } else if self.raw_value_length < other.raw_value_length {
+            Ordering::Greater
+        } else if self.raw_value_length > other.raw_value_length {
+            Ordering::Less
         } else {
-            if self.raw_value_length < other.raw_value_length {
-                Ordering::Greater
-            } else if self.raw_value_length > other.raw_value_length {
-                Ordering::Less
-            } else {
-                other.rank.cmp(&self.rank)
-            }
+            other.rank.cmp(&self.rank)
         }
     }
 }
@@ -183,7 +183,7 @@
             .with_context(|_| format_err!("Error when serializing the parser's metadata"))?;
 
         let parser_path = folder_name.as_ref().join(config.parser_filename);
-        let mut writer = fs::File::create(&parser_path)
+        let mut writer = fs::File::create(parser_path)
             .with_context(|_| format_err!("Error when creating the parser file"))?;
 
         self.serialize(&mut Serializer::new(&mut writer))
@@ -201,14 +201,14 @@
     /// Load a parser from a folder
     pub fn from_folder<P: AsRef<Path>>(folder_name: P) -> Result<Parser> {
         let metadata_path = folder_name.as_ref().join(METADATA_FILENAME);
-        let metadata_file = fs::File::open(&metadata_path)
+        let metadata_file = fs::File::open(metadata_path)
             .with_context(|_| format_err!("Error when opening the metadata file"))?;
 
         let config: ParserConfig = serde_json::from_reader(metadata_file)
             .with_context(|_| format_err!("Error when deserializing the metadata"))?;
 
         let parser_path = folder_name.as_ref().join(config.parser_filename);
-        let reader = fs::File::open(&parser_path)
+        let reader = fs::File::open(parser_path)
             .with_context(|_| format_err!("Error when opening the parser file"))?;
 
         Ok(from_read(reader)
@@ -270,14 +270,14 @@
                 // Iterate over current possible matches containing the stop word and
                 // try to grow them (but do not initiate a new possible match)
-                for (res_val, mut possible_match) in &mut partial_matches {
+                for (res_val, possible_match) in &mut partial_matches {
                     if !res_vals_from_token.contains(res_val)
                         || self.registry.is_edge_case(*res_val)
                     {
                         continue;
                     }
 
                     self.update_previous_match(
-                        &mut possible_match,
+                        possible_match,
                         token_idx,
                         *value,
                         range.clone(),
@@ -308,6 +308,7 @@
         group_matches(final_matches, max_alternatives)
     }
 
+    #[allow(clippy::too_many_arguments)]
     fn update_or_insert_possible_match(
         &self,
         value: u32,
@@ -315,7 +316,7 @@
         token_idx: usize,
         range: Range<usize>,
         partial_matches: &mut FnvHashMap<u32, PossibleMatch>,
-        mut final_matches: &mut Vec<PossibleMatch>,
+        final_matches: &mut Vec<PossibleMatch>,
         skipped_tokens: &mut FnvHashMap<usize, (Range<usize>, u32)>,
         threshold: f32,
     ) {
@@ -327,21 +328,20 @@
                     value,
                     range,
                     threshold,
-                    &mut final_matches,
+                    final_matches,
                 );
             }
             Entry::Vacant(entry) => {
-                self.insert_new_possible_match(
+                if let Some(new_possible_match) = self.insert_new_possible_match(
                     res_val,
                     value,
                     range,
                     token_idx,
                     threshold,
-                    &skipped_tokens,
-                )
-                .map(|new_possible_match| {
+                    skipped_tokens,
+                ) {
                     entry.insert(new_possible_match);
-                });
+                }
             }
         }
     }
@@ -353,7 +353,7 @@
         value: u32,
         range: Range<usize>,
         threshold: f32,
-        ref mut final_matches: &mut Vec<PossibleMatch>,
+        final_matches: &mut Vec<PossibleMatch>,
     ) {
         let (rank, otokens) = self.registry.get_tokens(possible_match.resolved_value);
 
@@ -361,9 +361,12 @@
             // Grow the last Possible Match
             // Find the next token in the resolved value that matches the
             // input token
-            for otoken_idx in (possible_match.last_token_in_resolution + 1)..otokens.len() {
-                let otok = otokens[otoken_idx];
-                if value == otok {
+            for (otoken_idx, otoken) in otokens
+                .iter()
+                .enumerate()
+                .skip(possible_match.last_token_in_resolution + 1)
+            {
+                if value == *otoken {
                     possible_match.range.end = range.end;
                     possible_match.n_consumed_tokens += 1;
                     possible_match.last_token_in_input = token_idx;
@@ -382,11 +385,10 @@
             final_matches.push(possible_match.clone());
         }
         // Then we initialize a new PossibleMatch with the same res val
-        let last_token_in_resolution = otokens.iter().position(|e| *e == value).expect(&*format!(
-            "Missing token {} from list {:?}",
-            value,
-            otokens.clone()
-        ));
+        let last_token_in_resolution = otokens
+            .iter()
+            .position(|e| *e == value)
+            .unwrap_or_else(|| panic!("Missing token {} from list {:?}", value, otokens.clone()));
 
         *possible_match = PossibleMatch {
             resolved_value: possible_match.resolved_value,
@@ -414,11 +416,10 @@
         skipped_tokens: &FnvHashMap<usize, (Range<usize>, u32)>,
     ) -> Option<PossibleMatch> {
         let (rank, otokens) = self.registry.get_tokens(res_val);
-        let last_token_in_resolution = otokens.iter().position(|e| *e == value).expect(&*format!(
-            "Missing token {} from list {:?}",
-            value,
-            otokens.clone()
-        ));
+        let last_token_in_resolution = otokens
+            .iter()
+            .position(|e| *e == value)
+            .unwrap_or_else(|| panic!("Missing token {} from list {:?}", value, otokens.clone()));
         let mut possible_match = PossibleMatch {
             resolved_value: res_val,
             range,
@@ -523,7 +524,7 @@
                 possible_match.tokens_range.start <= **idx
                     && possible_match.tokens_range.end > **idx
             })
-            .map(|idx| *idx)
+            .copied()
            .collect();
 
         if !overlapping_tokens.is_empty() {
@@ -601,8 +602,8 @@
                 grouped_matches
             },
         )
-        .into_iter()
-        .map(|(_, mut matches)| {
+        .into_values()
+        .map(|mut matches| {
             let mut best_match = matches.pop().unwrap().clone();
             while !matches.is_empty()
                 && best_match.alternative_resolved_values.len() < max_alternatives
@@ -623,12 +624,14 @@
 
 #[cfg(test)]
 mod tests {
-    use super::*;
+    use failure::ResultExt;
+    use tempfile::tempdir;
+
     use crate::data::*;
     use crate::gazetteer;
     use crate::parser_builder::ParserBuilder;
-    use failure::ResultExt;
-    use tempfile::tempdir;
+
+    use super::*;
 
     fn get_license_info() -> LicenseInfo {
         let license_content = "Some content here".to_string();
diff --git a/src/parser_registry.rs b/src/parser_registry.rs
index 68392d4..1f5e61c 100644
--- a/src/parser_registry.rs
+++ b/src/parser_registry.rs
@@ -1,7 +1,9 @@
+use std::collections::{BTreeSet, HashSet};
+
+use serde::{Deserialize, Serialize};
+
 use crate::data::{RegisteredEntityValue, ResolvedValue, TokenizedEntityValue};
 use crate::symbol_table::{ResolvedSymbolTable, TokenSymbolTable};
-use serde::{Deserialize, Serialize};
-use std::collections::{BTreeSet, HashSet};
 
 type Rank = u32;
 
@@ -59,7 +61,7 @@
                 .push(token_idx);
             }
         }
-        return Some(res_value_idx);
+        Some(res_value_idx)
     }
 
     /// Prepends a list of entity values to the parser and update the ranks accordingly.
@@ -131,7 +133,7 @@ impl ParserRegistry {
                     })
                     .collect()
             })
-            .unwrap_or_else(|| vec![]);
+            .unwrap_or_else(Vec::new);
 
         self.set_top_stop_words(n_stop_words);
     }
@@ -151,7 +153,7 @@
             .into_iter()
             .take(nb_stop_words)
             .map(|(idx, _)| idx)
-            .chain(self.additional_stop_words.clone().into_iter())
+            .chain(self.additional_stop_words.clone())
             .collect();
 
         // Update the set of edge_cases. i.e. resolved values that only contain stop words
diff --git a/src/symbol_table.rs b/src/symbol_table.rs
index 2056b15..54ed851 100644
--- a/src/symbol_table.rs
+++ b/src/symbol_table.rs
@@ -1,6 +1,7 @@
-use serde::{Deserialize, Serialize};
 use std::collections::BTreeMap;
 
+use serde::{Deserialize, Serialize};
+
 /// Implementation of a symbol table that
 /// - always maps a given index to a single string
 /// - allows mapping a string to several indices
@@ -16,7 +17,7 @@
     pub fn add_symbol(&mut self, symbol: String) -> u32 {
         self.string_to_index
             .get(&symbol)
-            .map(|idx| *idx)
+            .copied()
             .unwrap_or_else(|| {
                 let symbol_index = self.available_index;
                 self.available_index += 1;
diff --git a/src/utils.rs b/src/utils.rs
index 5efde2e..ff8eeca 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -79,10 +79,7 @@ mod tests {
 
         let mut tokenizer = whitespace_tokenizer("дра \t नमस्ते");
         assert_eq!(tokenizer.next(), Some((0..3, "дра".to_string())));
-        assert_eq!(
-            tokenizer.next(),
-            Some((6..12, "नमस्ते".to_string()))
-        );
+        assert_eq!(tokenizer.next(), Some((6..12, "नमस्ते".to_string())));
 
         let mut tokenizer = whitespace_tokenizer("je veux écouter les rolling stones");
         assert_eq!(tokenizer.next(), Some((0..2, "je".to_string())));