From c0f3c81f021b7c7bc96bc01302af54422d69c193 Mon Sep 17 00:00:00 2001 From: Michael Kohler Date: Tue, 27 Oct 2020 22:33:34 +0100 Subject: [PATCH] Add simple file loader --- README.md | 10 ++++++++++ src/app.rs | 17 ++++++++++++++++- src/loaders/file.rs | 40 ++++++++++++++++++++++++++++++++++++++++ src/loaders/mod.rs | 2 ++ 4 files changed, 68 insertions(+), 1 deletion(-) create mode 100644 src/loaders/file.rs diff --git a/README.md b/README.md index a594600f..dc72e959 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,16 @@ cargo run -- extract -l en -d ../wikiextractor/text/ >> wiki.en.txt *Tip: You don't need this last process to finish to start observing the output, wiki.en.txt should get a few thousands sentences in just a few minutes, and you can use that as a way to estimate the quality of the output early on and stop the process if you are not happy.* +### Extract from line break separated files + +If you have one or multiple files with one sentence per line, you can use this extractor to extract sentences from these files by applying the defined language rules. This can be useful if you have a large list of sentences and you only want to keep the sentences that match the rules. + +By default you can extract 10000 sentences per file. + +``` +cargo run -- extract-file -l en -d ../texts/ >> file.en.txt +``` + ## Using language rules The following rules can be configured per language. Add a `.toml` file in the `rules` directory to enable a new locale. 
diff --git a/src/app.rs b/src/app.rs index 1604e7ae..60b3b3c2 100644 --- a/src/app.rs +++ b/src/app.rs @@ -2,7 +2,7 @@ use clap::{App, Arg, ArgMatches, SubCommand}; use std::ffi::OsString; use crate::extractor::extract; -use crate::loaders::{Wikipedia}; +use crate::loaders::{File, Wikipedia}; const VERSION: &str = env!("CARGO_PKG_VERSION"); @@ -41,6 +41,12 @@ where .arg(&language_argument) .arg(&directory_argument) ) + .subcommand( + SubCommand::with_name("extract-file") + .about("Extract sentences from files which have one sentence per line") + .arg(&language_argument) + .arg(&directory_argument) + ) .get_matches_from(itr) } @@ -65,6 +71,15 @@ fn start(all_matches: ArgMatches) -> Result<(), String> { return extract(wikipedia_loader, no_check); } + // File + if let Some(matches) = all_matches.subcommand_matches("extract-file") { + let language = String::from(matches.value_of("language").unwrap_or("en")); + let directory = String::from(matches.value_of("dir").unwrap_or_default()); + + let file_loader = File::new(language, directory); + return extract(file_loader, no_check); + } + println!("{}", all_matches.usage()); Err(String::from("Did you forget to add a subcommand?")) } diff --git a/src/loaders/file.rs b/src/loaders/file.rs new file mode 100644 index 00000000..b2696bbc --- /dev/null +++ b/src/loaders/file.rs @@ -0,0 +1,40 @@ +use std::fs::File; +use std::io::Read; +use std::path::PathBuf; + +use super::definition::Loader; +use crate::config::Config; + +pub struct FileLoader { + pub config: Config, +} + +impl FileLoader { + pub fn new(language: String, directory: String) -> Self { + let config = Config { + language, + directory, + max_sentences_per_text: std::usize::MAX, + file_prefix: String::from(""), + }; + + Self { config } + } +} + +impl Loader for FileLoader { + fn get_config(&self) -> &Config { + &self.config + } + + fn load(&self, file_name: &PathBuf) -> Result<Vec<String>, String> { + let mut file = File::open(file_name).map_err(|e| format!("{}", e))?; + let mut 
all_sentences = String::new(); + file.read_to_string(&mut all_sentences) + .map_err(|e| format!("{}", e))?; + Ok(all_sentences + .lines() + .map(|sentence| String::from(sentence)) + .collect()) + } +} diff --git a/src/loaders/mod.rs b/src/loaders/mod.rs index b86e783a..e57c873e 100644 --- a/src/loaders/mod.rs +++ b/src/loaders/mod.rs @@ -1,5 +1,7 @@ pub use wikipedia::Wikipedia; +pub use file::FileLoader as File; pub use definition::Loader; pub mod wikipedia; +pub mod file; mod definition; \ No newline at end of file