diff --git a/.envrc b/.envrc index 3550a30..e8221e9 100644 --- a/.envrc +++ b/.envrc @@ -1 +1,3 @@ use flake + +PATH_add "$(git rev-parse --show-toplevel)/result/bin" diff --git a/src/adapters.rs b/src/adapters.rs index f1f36f9..307ed85 100644 --- a/src/adapters.rs +++ b/src/adapters.rs @@ -12,6 +12,7 @@ use crate::{adapted_iter::AdaptedFilesIterBox, config::RgaConfig, matching::*}; use anyhow::{format_err, Context, Result}; use async_trait::async_trait; use custom::CustomAdapterConfig; +use custom::CustomIdentifiers; use custom::BUILTIN_SPAWNING_ADAPTERS; use log::*; use tokio::io::AsyncRead; @@ -35,7 +36,7 @@ pub struct AdapterMeta { /// indicates whether this adapter can descend (=call rga_preproc again). if true, the cache key needs to include the list of active adapters pub recurses: bool, /// list of matchers (interpreted as a OR b OR ...) - pub fast_matchers: Vec, + pub fast_matchers: Option>, /// list of matchers when we have mime type detection active (interpreted as ORed) /// warning: this *overrides* the fast matchers pub slow_matchers: Option>, @@ -48,39 +49,65 @@ pub struct AdapterMeta { } impl AdapterMeta { // todo: this is pretty ugly - pub fn get_matchers<'a>( - &'a self, - slow: bool, - ) -> Box> + 'a> { + pub fn get_matchers(&self, slow: bool) -> Box> + '_> { match ( slow, self.keep_fast_matchers_if_accurate, &self.slow_matchers, + &self.fast_matchers, ) { - (true, false, Some(ref sm)) => Box::new(sm.iter().map(Cow::Borrowed)), - (true, true, Some(ref sm)) => Box::new( + (true, false, Some(ref sm), _) => Box::new(sm.iter().map(Cow::Borrowed)), + (true, true, Some(ref sm), Some(ref fm)) => Box::new( sm.iter().map(Cow::Borrowed).chain( - self.fast_matchers - .iter() - .map(|e| Cow::Owned(FileMatcher::Fast(e.clone()))), + fm.iter() + .map(|e| Cow::Owned(FileMatcher::Fast(e.clone()))) + .collect::>(), ), ), // don't have slow matchers or slow matching disabled - (true, _, None) | (false, _, _) => Box::new( - self.fast_matchers - .iter() - .map(|e| Cow::Owned(FileMatcher::Fast(e.clone()))), - ), + (true, _, None, Some(ref fm)) | (false, _, _, Some(ref fm)) => { + Box::new(fm.iter().map(|e| Cow::Owned(FileMatcher::Fast(e.clone())))) + } + _ => Box::new(::std::iter::empty()), } } } -pub trait GetMetadata { - fn metadata(&self) -> &AdapterMeta; +pub trait Adapter { + fn name(&self) -> String; + fn version(&self) -> i32; + fn description(&self) -> String; + fn recurses(&self) -> bool; + fn disabled_by_default(&self) -> bool; + fn keep_fast_matchers_if_accurate(&self) -> bool; + fn extensions(&self) -> Option>; + fn mimetypes(&self) -> Option>; + + fn metadata(&self) -> AdapterMeta { + return AdapterMeta { + name: self.name(), + version: self.version(), + description: self.description(), + recurses: true, + fast_matchers: self.extensions().map(|exts| { + exts.iter() + .map(|s| FastFileMatcher::FileExtension(s.to_string())) + .collect() + }), + slow_matchers: self.mimetypes().map(|mimetypes| { + mimetypes + .iter() + .map(|mimetype| FileMatcher::MimeType(mimetype.to_string())) + .collect() + }), + disabled_by_default: true, + keep_fast_matchers_if_accurate: true, + }; + } } #[async_trait] -pub trait FileAdapter: GetMetadata + Send + Sync { +pub trait FileAdapter: Adapter + Send + Sync { /// adapt a file. /// /// detection_reason is the Matcher that was used to identify this file. Unless --rga-accurate was given, it is always a FastMatcher @@ -109,7 +136,10 @@ pub struct AdaptInfo { /// (enabledAdapters, disabledAdapters) type AdaptersTuple = (Vec>, Vec>); -pub fn get_all_adapters(custom_adapters: Option>) -> AdaptersTuple { +pub fn get_all_adapters( + custom_identifiers: Option, + custom_adapters: Option>, +) -> AdaptersTuple { // order in descending priority let mut adapters: Vec> = vec![]; if let Some(custom_adapters) = custom_adapters { @@ -118,12 +148,137 @@ pub fn get_all_adapters(custom_adapters: Option>) -> Ad } } + let custom_identifiers = custom_identifiers.unwrap_or_default(); let internal_adapters: Vec> = vec![ Arc::new(PostprocPageBreaks::default()), - Arc::new(ffmpeg::FFmpegAdapter::new()), - Arc::new(zip::ZipAdapter::new()), - Arc::new(decompress::DecompressAdapter::new()), - Arc::new(mbox::MboxAdapter::new()), + Arc::new(ffmpeg::FFmpegAdapter { + extensions: custom_identifiers + .ffmpeg + .extensions + .clone() + .unwrap_or_else(|| { + ffmpeg::EXTENSIONS + .iter() + .map(|&s| s.to_string()) + .collect::>() + }), + mimetypes: custom_identifiers + .ffmpeg + .mimetypes + .clone() + .unwrap_or_else(|| { + ffmpeg::MIMETYPES + .iter() + .map(|&s| s.to_string()) + .collect::>() + }), + }), + Arc::new(zip::ZipAdapter { + extensions: custom_identifiers + .zip + .extensions + .clone() + .unwrap_or_else(|| { + zip::EXTENSIONS + .iter() + .map(|&s| s.to_string()) + .collect::>() + }), + mimetypes: custom_identifiers.zip.mimetypes.clone().unwrap_or_else(|| { + zip::MIMETYPES + .iter() + .map(|&s| s.to_string()) + .collect::>() + }), + }), + Arc::new(decompress::DecompressAdapter { + extensions_gz: custom_identifiers.gz.extensions.clone().unwrap_or_else(|| { + decompress::EXTENSIONS_GZ + .iter() + .map(|&s| s.to_string()) + .collect::>() + }), + extensions_bz2: custom_identifiers + .bz2 + .extensions + .clone() + .unwrap_or_else(|| { + decompress::EXTENSIONS_BZ2 + .iter() + .map(|&s| s.to_string()) + .collect::>() + }), + extensions_xz: custom_identifiers.xz.extensions.clone().unwrap_or_else(|| { + decompress::EXTENSIONS_XZ + .iter() + .map(|&s| s.to_string()) + .collect::>() + }), + extensions_zst: custom_identifiers + .zst + .extensions + .clone() + .unwrap_or_else(|| { + decompress::EXTENSIONS_ZST + .iter() + .map(|&s| s.to_string()) + .collect::>() + }), + mimetypes_gz: custom_identifiers.gz.extensions.clone().unwrap_or_else(|| { + decompress::MIMETYPES_GZ + .iter() + .map(|&s| s.to_string()) + .collect::>() + }), + mimetypes_bz2: custom_identifiers + .bz2 + .extensions + .clone() + .unwrap_or_else(|| { + decompress::MIMETYPES_BZ2 + .iter() + .map(|&s| s.to_string()) + .collect::>() + }), + mimetypes_xz: custom_identifiers.xz.extensions.clone().unwrap_or_else(|| { + decompress::MIMETYPES_XZ + .iter() + .map(|&s| s.to_string()) + .collect::>() + }), + mimetypes_zst: custom_identifiers + .zst + .extensions + .clone() + .unwrap_or_else(|| { + decompress::MIMETYPES_ZST + .iter() + .map(|&s| s.to_string()) + .collect::>() + }), + }), + Arc::new(mbox::MboxAdapter { + extensions: custom_identifiers + .mbox + .extensions + .clone() + .unwrap_or_else(|| { + mbox::EXTENSIONS + .iter() + .map(|&s| s.to_string()) + .collect::>() + }), + mimetypes: custom_identifiers + .mbox + .mimetypes + .clone() + .unwrap_or_else(|| { + mbox::MIMETYPES + .iter() + .map(|&s| s.to_string()) + .collect::>() + }), + }), Arc::new(tar::TarAdapter::new()), Arc::new(sqlite::SqliteAdapter::new()), ]; @@ -148,10 +303,12 @@ pub fn get_all_adapters(custom_adapters: Option>) -> Ad * - "+a,b" means use default list but also a and b (a,b will be prepended to the list so given higher priority) */ pub fn get_adapters_filtered>( + custom_identifiers: Option, custom_adapters: Option>, adapter_names: &[T], ) -> Result>> { - let (def_enabled_adapters, def_disabled_adapters) = get_all_adapters(custom_adapters); + let (def_enabled_adapters, def_disabled_adapters) = + get_all_adapters(custom_identifiers, custom_adapters); let adapters = if !adapter_names.is_empty() { let adapters_map: HashMap<_, _> = def_enabled_adapters .iter() diff --git a/src/adapters/custom.rs b/src/adapters/custom.rs index 3ae0e34..9d15557 100644 --- a/src/adapters/custom.rs +++ b/src/adapters/custom.rs @@ -1,12 +1,8 @@ use super::*; -use super::{AdaptInfo, AdapterMeta, FileAdapter, GetMetadata}; +use super::{AdaptInfo, Adapter, FileAdapter}; use crate::adapted_iter::one_file; -use crate::{ - adapted_iter::AdaptedFilesIterBox, - expand::expand_str_ez, - matching::{FastFileMatcher, FileMatcher}, -}; +use crate::{adapted_iter::AdaptedFilesIterBox, expand::expand_str_ez, matching::FileMatcher}; use crate::{join_handle_to_stream, to_io_err}; use anyhow::Result; use async_stream::stream; @@ -23,6 +19,139 @@ use tokio::process::Command; use tokio_util::io::StreamReader; // mostly the same as AdapterMeta + SpawningFileAdapter + +#[derive(Debug, Deserialize, Serialize, JsonSchema, Default, PartialEq, Clone)] +pub struct CustomIdentifier { + /// the file extensions this adapter supports. For example ["gz", "tgz"] + pub extensions: Option>, + /// if not null and --rga-accurate is enabled, mime type matching is used instead of file name matching + pub mimetypes: Option>, +} + +#[derive(Debug, Deserialize, Serialize, JsonSchema, PartialEq, Clone)] +pub struct CustomIdentifiers { + /// The identifiers to process as bz2 archives + pub bz2: CustomIdentifier, + /// The identifiers to process via ffmpeg + pub ffmpeg: CustomIdentifier, + /// The identifiers to process as gz archives + pub gz: CustomIdentifier, + /// The identifiers to process as xz archives + pub xz: CustomIdentifier, + /// The identifiers to process as zip archives + pub zip: CustomIdentifier, + /// The identifiers to process as zst archives + pub zst: CustomIdentifier, + /// The identifiers to process as mbox files + pub mbox: CustomIdentifier, +} + +impl Default for CustomIdentifiers { + fn default() -> CustomIdentifiers { + CustomIdentifiers { + ffmpeg: CustomIdentifier { + extensions: Some( + ffmpeg::EXTENSIONS + .iter() + .map(|&s| s.to_string()) + .collect::>(), + ), + mimetypes: Some( + ffmpeg::MIMETYPES + .to_vec() + .iter() + .map(|&s| s.to_string()) + .collect::>(), + ), + }, + gz: CustomIdentifier { + extensions: Some( + decompress::EXTENSIONS_GZ + .iter() + .map(|&s| s.to_string()) + .collect::>(), + ), + mimetypes: Some( + decompress::MIMETYPES_GZ + .iter() + .map(|&s| s.to_string()) + .collect::>(), + ), + }, + bz2: CustomIdentifier { + extensions: Some( + decompress::EXTENSIONS_BZ2 + .iter() + .map(|&s| s.to_string()) + .collect::>(), + ), + mimetypes: Some( + decompress::MIMETYPES_BZ2 + .iter() + .map(|&s| s.to_string()) + .collect::>(), + ), + }, + xz: CustomIdentifier { + extensions: Some( + decompress::EXTENSIONS_XZ + .iter() + .map(|&s| s.to_string()) + .collect::>(), + ), + mimetypes: Some( + decompress::MIMETYPES_XZ + .iter() + .map(|&s| s.to_string()) + .collect::>(), + ), + }, + zst: CustomIdentifier { + extensions: Some( + decompress::EXTENSIONS_ZST + .iter() + .map(|&s| s.to_string()) + .collect::>(), + ), + mimetypes: Some( + decompress::MIMETYPES_ZST + .iter() + .map(|&s| s.to_string()) + .collect::>(), + ), + }, + zip: CustomIdentifier { + extensions: Some( + zip::EXTENSIONS + .iter() + .map(|&s| s.to_string()) + .collect::>(), + ), + mimetypes: Some( + zip::MIMETYPES + .iter() + .map(|&s| s.to_string()) + .collect::>(), + ), + }, + mbox: CustomIdentifier { + extensions: Some( + mbox::EXTENSIONS + .iter() + .map(|&s| s.to_string()) + .collect::>(), + ), + mimetypes: Some( + mbox::MIMETYPES + .iter() + .map(|&s| s.to_string()) + .collect::>(), + ), + }, + } + } +} + #[derive(Debug, Deserialize, Serialize, JsonSchema, Default, PartialEq, Clone)] pub struct CustomAdapterConfig { /// the unique identifier and name of this adapter. Must only include a-z, 0-9, _ @@ -183,16 +312,46 @@ pub fn pipe_output( } pub struct CustomSpawningFileAdapter { + name: String, + version: i32, + description: String, + recurses: bool, + disabled_by_default: bool, + keep_fast_matchers_if_accurate: bool, + extensions: Option>, + mimetypes: Option>, binary: String, args: Vec, - meta: AdapterMeta, output_path_hint: Option, } -impl GetMetadata for CustomSpawningFileAdapter { - fn metadata(&self) -> &AdapterMeta { - &self.meta + +impl Adapter for CustomSpawningFileAdapter { + fn name(&self) -> String { + self.name.clone() + } + fn version(&self) -> i32 { + self.version + } + fn description(&self) -> String { + self.description.clone() + } + fn recurses(&self) -> bool { + self.recurses + } + fn disabled_by_default(&self) -> bool { + self.disabled_by_default + } + fn keep_fast_matchers_if_accurate(&self) -> bool { + self.keep_fast_matchers_if_accurate + } + fn extensions(&self) -> Option> { + self.extensions.clone() + } + fn mimetypes(&self) -> Option> { + self.mimetypes.clone() } } + fn arg_replacer(arg: &str, filepath_hint: &Path) -> Result { expand_str_ez(arg, |s| match s { "input_virtual_path" => Ok(filepath_hint.to_string_lossy()), @@ -265,33 +424,26 @@ impl FileAdapter for CustomSpawningFileAdapter { impl CustomAdapterConfig { pub fn to_adapter(&self) -> CustomSpawningFileAdapter { CustomSpawningFileAdapter { + name: self.name.clone(), + version: self.version, + description: format!( + "{}\nRuns: {} {}", + self.description, + self.binary, + self.args.join(" ") + ), + recurses: false, + disabled_by_default: self.disabled_by_default.unwrap_or(false), + keep_fast_matchers_if_accurate: !self.match_only_by_mime.unwrap_or(false), + extensions: if self.extensions.is_empty() { + None + } else { + Some(self.extensions.clone()) + }, + mimetypes: self.mimetypes.clone(), binary: self.binary.clone(), args: self.args.clone(), output_path_hint: self.output_path_hint.clone(), - meta: AdapterMeta { - name: self.name.clone(), - version: self.version, - description: format!( - "{}\nRuns: {} {}", - self.description, - self.binary, - self.args.join(" ") - ), - recurses: true, - fast_matchers: self - .extensions - .iter() - .map(|s| FastFileMatcher::FileExtension(s.to_string())) - .collect(), - slow_matchers: self.mimetypes.as_ref().map(|mimetypes| { - mimetypes - .iter() - .map(|s| FileMatcher::MimeType(s.to_string())) - .collect() - }), - keep_fast_matchers_if_accurate: !self.match_only_by_mime.unwrap_or(false), - disabled_by_default: self.disabled_by_default.unwrap_or(false), - }, } } } diff --git a/src/adapters/decompress.rs b/src/adapters/decompress.rs index f4b96a7..07e7e1c 100644 --- a/src/adapters/decompress.rs +++ b/src/adapters/decompress.rs @@ -3,51 +3,164 @@ use crate::adapted_iter::one_file; use super::*; use anyhow::Result; -use lazy_static::lazy_static; +use std::path::{Path, PathBuf}; +use std::str::FromStr; use tokio::io::BufReader; -use std::path::{Path, PathBuf}; +pub const EXTENSIONS_GZ: &[&str] = &["als", "gz", "tgz"]; +pub const EXTENSIONS_BZ2: &[&str] = &["bz2", "tbz", "tbz2"]; +pub const EXTENSIONS_XZ: &[&str] = &["xz"]; +pub const EXTENSIONS_ZST: &[&str] = &["zst"]; +// pub static EXTENSIONS: Vec = +// [EXTENSIONS_GZ, EXTENSIONS_BZ2, EXTENSIONS_XZ, EXTENSIONS_ZST].concat(); -static EXTENSIONS: &[&str] = &["als", "bz2", "gz", "tbz", "tbz2", "tgz", "xz", "zst"]; -static MIME_TYPES: &[&str] = &[ - "application/gzip", - "application/x-bzip", - "application/x-xz", - "application/zstd", -]; -lazy_static! { - static ref METADATA: AdapterMeta = AdapterMeta { - name: "decompress".to_owned(), - version: 1, - description: - "Reads compressed file as a stream and runs a different extractor on the contents." - .to_owned(), - recurses: true, - fast_matchers: EXTENSIONS - .iter() - .map(|s| FastFileMatcher::FileExtension(s.to_string())) - .collect(), - slow_matchers: Some( - MIME_TYPES - .iter() - .map(|s| FileMatcher::MimeType(s.to_string())) - .collect() - ), - disabled_by_default: false, - keep_fast_matchers_if_accurate: true - }; +#[derive(Debug, PartialEq, Eq)] +struct DecompressError; + +#[derive(Debug, PartialEq)] +enum Extension { + Gz, + Bz2, + Xz, + Zst, } -#[derive(Default)] -pub struct DecompressAdapter; +impl FromStr for Extension { + type Err = DecompressError; + + fn from_str(ext: &str) -> Result { + if EXTENSIONS_GZ.contains(&ext) { + Ok(Extension::Gz) + } else if EXTENSIONS_BZ2.contains(&ext) { + Ok(Extension::Bz2) + } else if EXTENSIONS_XZ.contains(&ext) { + Ok(Extension::Xz) + } else if EXTENSIONS_ZST.contains(&ext) { + Ok(Extension::Zst) + } else { + Err(DecompressError) + } + } +} + +pub const MIMETYPES_GZ: &[&str] = &["application/gzip"]; +pub const MIMETYPES_BZ2: &[&str] = &["application/x-bzip"]; +pub const MIMETYPES_XZ: &[&str] = &["application/x-xz"]; +pub const MIMETYPES_ZST: &[&str] = &["application/zstd"]; +// pub const MIMES: &[&str] = [MIMES_GZ, MIMES_BZ2, MIMES_XZ, MIMES_ZST].concat(); + +#[derive(Debug, PartialEq)] +enum Mime { + Gz, + Bz2, + Xz, + Zst, +} +impl FromStr for Mime { + type Err = DecompressError; -impl DecompressAdapter { - pub fn new() -> DecompressAdapter { - DecompressAdapter + fn from_str(ext: &str) -> Result { + if MIMETYPES_GZ.contains(&ext) { + Ok(Mime::Gz) + } else if MIMETYPES_BZ2.contains(&ext) { + Ok(Mime::Bz2) + } else if MIMETYPES_XZ.contains(&ext) { + Ok(Mime::Xz) + } else if MIMETYPES_ZST.contains(&ext) { + Ok(Mime::Zst) + } else { + Err(DecompressError) + } } } -impl GetMetadata for DecompressAdapter { - fn metadata(&self) -> &AdapterMeta { - &METADATA + +// static EXTENSIONS_GZ: HashSet<&str> = ["als", "gz", "tgz"].into(); +// static EXTENSIONS_XZ: HashSet<&str> = ["xz"].into(); +// static EXTENSIONS_ZST: HashSet<&str> = ["zst"].into(); +// pub static EXTENSIONS: HashSet<&str> = +// [EXTENSIONS_GZ, EXTENSIONS_BZ2, EXTENSIONS_XZ, EXTENSIONS_ZST] +// .iter() +// .cloned() +// .reduce(|acc, e| acc.union(&e).cloned().collect()) +// .unwrap_or_default(); + +// static MIMETYPES_GZ: HashSet<&str> = ["application/gzip"].into(); +// static MIMETYPES_BZ2: HashSet<&str> = ["application/x-bzip"].into(); +// static MIMETYPES_XZ: HashSet<&str> = ["application/x-xz"].into(); +// static MIMTEYPTES_ZST: HashSet<&str> = ["application/zstd"].into(); +// pub static MIMETYPES: HashSet<&str> = [MIMETYPES_GZ, MIMETYPES_BZ2, MIMETYPES_XZ, MIMTEYPTES_ZST] +// .iter() +// .cloned() +// .reduce(|acc, e| acc.union(&e).cloned().collect()) +// .unwrap_or_default(); + +#[derive(Default)] +pub struct DecompressAdapter { + pub extensions_gz: Vec, + pub extensions_bz2: Vec, + pub extensions_xz: Vec, + pub extensions_zst: Vec, + pub mimetypes_gz: Vec, + pub mimetypes_bz2: Vec, + pub mimetypes_xz: Vec, + pub mimetypes_zst: Vec, +} + +impl Adapter for DecompressAdapter { + fn name(&self) -> String { + String::from("decompress") + } + fn version(&self) -> i32 { + 1 + } + fn description(&self) -> String { + String::from( + "Reads compressed file as a stream and runs a different extractor on the contents.", + ) + } + fn recurses(&self) -> bool { + true + } + fn disabled_by_default(&self) -> bool { + false + } + fn keep_fast_matchers_if_accurate(&self) -> bool { + true + } + fn extensions(&self) -> Option> { + let mut extensions: Vec = Vec::new(); + for exts in [ + &self.extensions_gz, + &self.extensions_bz2, + &self.extensions_xz, + &self.extensions_zst, + ] { + for ext in exts { + extensions.push(ext.to_string()) + } + } + if extensions.is_empty() { + None + } else { + Some(extensions) + } + } + fn mimetypes(&self) -> Option> { + let mut mimetypes: Vec = Vec::new(); + for mimes in [ + &self.mimetypes_gz, + &self.mimetypes_bz2, + &self.mimetypes_xz, + &self.mimetypes_zst, + ] { + for mime in mimes { + mimetypes.push(mime.to_string()) + } + } + if mimetypes.is_empty() { + None + } else { + Some(mimetypes) + } } } @@ -61,19 +174,19 @@ fn decompress_any(reason: &FileMatcher, inp: ReadBox) -> Result { let zst = |inp: ReadBox| Box::pin(bufread::ZstdDecoder::new(BufReader::new(inp))); Ok(match reason { - Fast(FileExtension(ext)) => match ext.as_ref() { - "als" | "gz" | "tgz" => gz(inp), - "bz2" | "tbz" | "tbz2" => bz2(inp), - "zst" => zst(inp), - "xz" => xz(inp), - ext => Err(format_err!("don't know how to decompress {}", ext))?, + Fast(FileExtension(ext)) => match Extension::from_str(ext) { + Ok(Extension::Gz) => gz(inp), + Ok(Extension::Bz2) => gz(inp), + Ok(Extension::Zst) => gz(inp), + Ok(Extension::Xz) => gz(inp), + Err(_) => Err(format_err!("don't know how to decompress {}", ext))?, }, - MimeType(mime) => match mime.as_ref() { - "application/gzip" => gz(inp), - "application/x-bzip" => bz2(inp), - "application/x-xz" => xz(inp), - "application/zstd" => zst(inp), - mime => Err(format_err!("don't know how to decompress mime {}", mime))?, + MimeType(mime) => match Mime::from_str(mime) { + Ok(Mime::Gz) => gz(inp), + Ok(Mime::Bz2) => bz2(inp), + Ok(Mime::Xz) => xz(inp), + Ok(Mime::Zst) => zst(inp), + Err(_) => Err(format_err!("don't know how to decompress mime {}", mime))?, }, }) } @@ -137,7 +250,7 @@ mod tests { #[tokio::test] async fn gz() -> Result<()> { - let adapter = DecompressAdapter; + let adapter = DecompressAdapter::default(); let filepath = test_data_dir().join("hello.gz"); @@ -150,7 +263,7 @@ mod tests { #[tokio::test] async fn pdf_gz() -> Result<()> { - let adapter = DecompressAdapter; + let adapter = DecompressAdapter::default(); let filepath = test_data_dir().join("short.pdf.gz"); diff --git a/src/adapters/ffmpeg.rs b/src/adapters/ffmpeg.rs index 32298fe..950ba37 100644 --- a/src/adapters/ffmpeg.rs +++ b/src/adapters/ffmpeg.rs @@ -2,7 +2,6 @@ use super::*; use super::{custom::map_exe_error, writing::async_writeln}; use anyhow::*; use async_trait::async_trait; -use lazy_static::lazy_static; use regex::Regex; use serde::{Deserialize, Serialize}; use std::process::Stdio; @@ -10,41 +9,53 @@ use tokio::io::AsyncWrite; use tokio::io::{AsyncBufReadExt, BufReader}; use tokio::process::Command; use writing::WritingFileAdapter; -// todo: + // maybe todo: read list of extensions from // ffmpeg -demuxers | tail -n+5 | awk '{print $2}' | while read demuxer; do echo MUX=$demuxer; ffmpeg -h demuxer=$demuxer | grep 'Common extensions'; done 2>/dev/null // but really, the probability of getting useful information from a .flv is low -static EXTENSIONS: &[&str] = &["mkv", "mp4", "avi", "mp3", "ogg", "flac", "webm"]; +pub const EXTENSIONS: &[&str] = &["mkv", "mp4", "avi", "mp3", "ogg", "flac", "webm"]; +pub const MIMETYPES: &[&str] = &[]; -lazy_static! { - static ref METADATA: AdapterMeta = AdapterMeta { - name: "ffmpeg".to_owned(), - version: 1, - description: - "Uses ffmpeg to extract video metadata/chapters, subtitles, lyrics, and other metadata" - .to_owned(), - recurses: false, - fast_matchers: EXTENSIONS - .iter() - .map(|s| FastFileMatcher::FileExtension(s.to_string())) - .collect(), - slow_matchers: None, - disabled_by_default: false, - keep_fast_matchers_if_accurate: true - }; +#[derive(Clone)] +pub struct FFmpegAdapter { + pub extensions: Vec, + pub mimetypes: Vec, } -#[derive(Default, Clone)] -pub struct FFmpegAdapter; - -impl FFmpegAdapter { - pub fn new() -> FFmpegAdapter { - FFmpegAdapter +impl Adapter for FFmpegAdapter { + fn name(&self) -> String { + String::from("ffmpeg") } -} -impl GetMetadata for FFmpegAdapter { - fn metadata(&self) -> &AdapterMeta { - &METADATA + fn version(&self) -> i32 { + 1 + } + fn description(&self) -> String { + String::from( + "Uses ffmpeg to extract video metadata/chapters, subtitles, lyrics, and other metadata.", + ) + } + fn recurses(&self) -> bool { + false + } + fn disabled_by_default(&self) -> bool { + false + } + fn keep_fast_matchers_if_accurate(&self) -> bool { + true + } + fn extensions(&self) -> Option> { + if self.extensions.is_empty() { + None + } else { + Some(self.extensions.clone()) + } + } + fn mimetypes(&self) -> Option> { + if self.mimetypes.is_empty() { + None + } else { + Some(self.mimetypes.clone()) + } } } diff --git a/src/adapters/mbox.rs b/src/adapters/mbox.rs index ee39d0d..2593558 100644 --- a/src/adapters/mbox.rs +++ b/src/adapters/mbox.rs @@ -9,42 +9,53 @@ use tokio::io::AsyncReadExt; use std::{collections::VecDeque, io::Cursor}; -static EXTENSIONS: &[&str] = &["mbox", "mbx", "eml"]; -static MIME_TYPES: &[&str] = &["application/mbox", "message/rfc822"]; +pub const EXTENSIONS: &[&str] = &["mbox", "mbx", "eml"]; +pub const MIMETYPES: &[&str] = &["application/mbox", "message/rfc822"]; + lazy_static! { - static ref METADATA: AdapterMeta = AdapterMeta { - name: "mail".to_owned(), - version: 1, - description: - "Reads mailbox/mail files and runs extractors on the contents and attachments." - .to_owned(), - recurses: true, - fast_matchers: EXTENSIONS - .iter() - .map(|s| FastFileMatcher::FileExtension(s.to_string())) - .collect(), - slow_matchers: Some( - MIME_TYPES - .iter() - .map(|s| FileMatcher::MimeType(s.to_string())) - .collect() - ), - disabled_by_default: true, - keep_fast_matchers_if_accurate: true - }; static ref FROM_REGEX: Regex = Regex::new("\r?\nFrom [^\n]+\n").unwrap(); } + #[derive(Default)] -pub struct MboxAdapter; +pub struct MboxAdapter { + pub extensions: Vec, + pub mimetypes: Vec, +} -impl MboxAdapter { - pub fn new() -> MboxAdapter { - MboxAdapter +impl Adapter for MboxAdapter { + fn name(&self) -> String { + String::from("mail") } -} -impl GetMetadata for MboxAdapter { - fn metadata(&self) -> &AdapterMeta { - &METADATA + fn version(&self) -> i32 { + 1 + } + fn description(&self) -> String { + String::from( + "Reads mailbox/mail files and runs extractors on the contents and attachments.", + ) + } + fn recurses(&self) -> bool { + true + } + fn disabled_by_default(&self) -> bool { + false + } + fn keep_fast_matchers_if_accurate(&self) -> bool { + true + } + fn extensions(&self) -> Option> { + if self.extensions.is_empty() { + None + } else { + Some(self.extensions.clone()) + } + } + fn mimetypes(&self) -> Option> { + if self.mimetypes.is_empty() { + None + } else { + Some(self.mimetypes.clone()) + } } } @@ -138,7 +149,7 @@ mod tests { #[tokio::test] async fn mail_simple() -> Result<()> { - let adapter = MboxAdapter; + let adapter = MboxAdapter::default(); let filepath = test_data_dir().join("github_email.eml"); @@ -171,7 +182,7 @@ mod tests { #[tokio::test] async fn mbox_simple() -> Result<()> { - let adapter = MboxAdapter; + let adapter = MboxAdapter::default(); let filepath = test_data_dir().join("test.mbx"); @@ -197,7 +208,7 @@ mod tests { async fn mbox_attachment() -> Result<()> { init_logging(); - let adapter = MboxAdapter; + let adapter = MboxAdapter::default(); let filepath = test_data_dir().join("mail_with_attachment.mbox"); diff --git a/src/adapters/postproc.rs b/src/adapters/postproc.rs index 45ec2a7..5bf3109 100644 --- a/src/adapters/postproc.rs +++ b/src/adapters/postproc.rs @@ -1,4 +1,4 @@ -//trait RunFnAdapter: GetMetadata {} +//trait RunFnAdapter: Adapter {} //impl FileAdapter for T where T: RunFnAdapter {} @@ -19,30 +19,38 @@ use tokio_util::io::StreamReader; use crate::adapted_iter::one_file; use crate::adapted_iter::AdaptedFilesIterBox; -use crate::matching::FastFileMatcher; -use super::{AdaptInfo, AdapterMeta, FileAdapter, GetMetadata}; +use super::{AdaptInfo, Adapter, FileAdapter}; fn add_newline(ar: impl AsyncRead + Send) -> impl AsyncRead + Send { ar.chain(Cursor::new(&[b'\n'])) } pub struct PostprocPrefix {} -impl GetMetadata for PostprocPrefix { - fn metadata(&self) -> &super::AdapterMeta { - lazy_static::lazy_static! { - static ref METADATA: AdapterMeta = AdapterMeta { - name: "postprocprefix".to_owned(), - version: 1, - description: "Adds the line prefix to each line (e.g. the filename within a zip)".to_owned(), - recurses: false, - fast_matchers: vec![], - slow_matchers: None, - keep_fast_matchers_if_accurate: false, - disabled_by_default: false - }; - } - &METADATA +impl Adapter for PostprocPrefix { + fn name(&self) -> String { + String::from("postprocprefix") + } + fn version(&self) -> i32 { + 1 + } + fn description(&self) -> String { + String::from("Adds the line prefix to each line (e.g. the filename within a zip)") + } + fn recurses(&self) -> bool { + false + } + fn mimetypes(&self) -> Option> { + None + } + fn extensions(&self) -> Option> { + None + } + fn keep_fast_matchers_if_accurate(&self) -> bool { + false + } + fn disabled_by_default(&self) -> bool { + false } } #[async_trait] @@ -155,21 +163,30 @@ pub fn postproc_prefix(line_prefix: &str, inp: impl AsyncRead + Send) -> impl As #[derive(Default)] pub struct PostprocPageBreaks {} -impl GetMetadata for PostprocPageBreaks { - fn metadata(&self) -> &super::AdapterMeta { - lazy_static::lazy_static! { - static ref METADATA: AdapterMeta = AdapterMeta { - name: "postprocpagebreaks".to_owned(), - version: 1, - description: "Adds the page number to each line for an input file that specifies page breaks as ascii page break character.\nMainly to be used internally by the poppler adapter.".to_owned(), - recurses: false, - fast_matchers: vec![FastFileMatcher::FileExtension("asciipagebreaks".to_string())], - slow_matchers: None, - keep_fast_matchers_if_accurate: false, - disabled_by_default: false - }; - } - &METADATA +impl Adapter for PostprocPageBreaks { + fn name(&self) -> String { + String::from("postprocpagebreaks") + } + fn version(&self) -> i32 { + 1 + } + fn description(&self) -> String { + String::from("Adds the page number to each line for an input file that specifies page breaks as ascii page break character.\nMainly to be used internally by the poppler adapter.") + } + fn recurses(&self) -> bool { + false + } + fn extensions(&self) -> Option> { + Some(vec![String::from("asciipagebreaks")]) + } + fn mimetypes(&self) -> Option> { + None + } + fn disabled_by_default(&self) -> bool { + false + } + fn keep_fast_matchers_if_accurate(&self) -> bool { + true } } #[async_trait] diff --git a/src/adapters/sqlite.rs b/src/adapters/sqlite.rs index 0e8c1b9..2c10cfa 100644 --- a/src/adapters/sqlite.rs +++ b/src/adapters/sqlite.rs @@ -1,7 +1,6 @@ use super::{writing::WritingFileAdapter, *}; use anyhow::Result; use async_trait::async_trait; -use lazy_static::lazy_static; use log::*; use rusqlite::types::ValueRef; use rusqlite::*; @@ -10,27 +9,8 @@ use tokio::io::AsyncWrite; use tokio_util::io::SyncIoBridge; -static EXTENSIONS: &[&str] = &["db", "db3", "sqlite", "sqlite3"]; - -lazy_static! { - static ref METADATA: AdapterMeta = AdapterMeta { - name: "sqlite".to_owned(), - version: 1, - description: - "Uses sqlite bindings to convert sqlite databases into a simple plain text format" - .to_owned(), - recurses: false, // set to true if we decide to make sqlite blobs searchable (gz blob in db is kinda common I think) - fast_matchers: EXTENSIONS - .iter() - .map(|s| FastFileMatcher::FileExtension(s.to_string())) - .collect(), - slow_matchers: Some(vec![FileMatcher::MimeType( - "application/x-sqlite3".to_owned() - )]), - keep_fast_matchers_if_accurate: false, - disabled_by_default: false - }; -} +const EXTENSIONS: &[&str] = &["db", "db3", "sqlite", "sqlite3"]; +const MIMES: &[&str] = &["application/x-sqlite3"]; #[derive(Default, Clone)] pub struct SqliteAdapter; @@ -40,9 +20,42 @@ impl SqliteAdapter { SqliteAdapter } } -impl GetMetadata for SqliteAdapter { - fn metadata(&self) -> &AdapterMeta { - &METADATA +impl Adapter for SqliteAdapter { + fn name(&self) -> String { + String::from("sqlite") + } + fn version(&self) -> i32 { + 1 + } + fn description(&self) -> String { + String::from( + "Uses sqlite bindings to convert sqlite databases into a simple plain text format", + ) + } + fn recurses(&self) -> bool { + false + } + fn disabled_by_default(&self) -> bool { + false + } + fn keep_fast_matchers_if_accurate(&self) -> bool { + false + } + fn extensions(&self) -> Option> { + Some( + EXTENSIONS + .iter() + .map(|&s| s.to_string()) + .collect::>(), + ) + } + fn mimetypes(&self) -> Option> { + Some( + MIMES + .iter() + .map(|&s| s.to_string()) + .collect::>(), + ) } } diff --git a/src/adapters/tar.rs b/src/adapters/tar.rs index 144bd20..c1eba13 100644 --- a/src/adapters/tar.rs +++ b/src/adapters/tar.rs @@ -1,37 +1,16 @@ -use crate::{ - adapted_iter::AdaptedFilesIterBox, - adapters::AdapterMeta, - matching::{FastFileMatcher, FileMatcher}, - print_bytes, -}; +use crate::{adapted_iter::AdaptedFilesIterBox, matching::FileMatcher, print_bytes}; use anyhow::*; use async_stream::stream; use async_trait::async_trait; -use lazy_static::lazy_static; use log::*; use std::path::PathBuf; use tokio_stream::StreamExt; -use super::{AdaptInfo, FileAdapter, GetMetadata}; +use super::{AdaptInfo, Adapter, FileAdapter}; -static EXTENSIONS: &[&str] = &["tar"]; +const EXTENSIONS: &[&str] = &["tar"]; -lazy_static! { - static ref METADATA: AdapterMeta = AdapterMeta { - name: "tar".to_owned(), - version: 1, - description: "Reads a tar file as a stream and recurses down into its contents".to_owned(), - recurses: true, - fast_matchers: EXTENSIONS - .iter() - .map(|s| FastFileMatcher::FileExtension(s.to_string())) - .collect(), - slow_matchers: None, - keep_fast_matchers_if_accurate: true, - disabled_by_default: false - }; -} #[derive(Default, Clone)] pub struct TarAdapter; @@ -40,9 +19,35 @@ impl TarAdapter { TarAdapter } } -impl GetMetadata for TarAdapter { - fn metadata(&self) -> &AdapterMeta { - &METADATA +impl Adapter for TarAdapter { + fn name(&self) -> String { + String::from("tar") + } + fn version(&self) -> i32 { + 1 + } + fn description(&self) -> String { + String::from("Reads a tar file as a stream and recurses down into its contents") + } + fn recurses(&self) -> bool { + true + } + fn disabled_by_default(&self) -> bool { + false + } + fn keep_fast_matchers_if_accurate(&self) -> bool { + true + } + fn extensions(&self) -> Option> { + Some( + EXTENSIONS + .iter() + .map(|&s| s.to_string()) + .collect::>(), + ) + } + fn mimetypes(&self) -> Option> { + None } } diff --git a/src/adapters/writing.rs b/src/adapters/writing.rs index b17152a..1ed13d0 100644 --- a/src/adapters/writing.rs +++ b/src/adapters/writing.rs @@ -2,13 +2,13 @@ use std::pin::Pin; use crate::{adapted_iter::one_file, join_handle_to_stream, to_io_err}; -use super::{AdaptInfo, FileAdapter, GetMetadata}; +use super::{AdaptInfo, Adapter, FileAdapter}; use anyhow::{Context, Result}; use async_trait::async_trait; use tokio::io::{AsyncReadExt, AsyncWrite}; #[async_trait] -pub trait WritingFileAdapter: GetMetadata + Send + Sync + Clone { +pub trait WritingFileAdapter: Adapter + Send + Sync + Clone { async fn adapt_write( a: super::AdaptInfo, detection_reason: &crate::matching::FileMatcher, diff --git a/src/adapters/zip.rs b/src/adapters/zip.rs index 8c30407..213444b 100644 --- a/src/adapters/zip.rs +++ b/src/adapters/zip.rs @@ -2,39 +2,58 @@ use super::*; use crate::print_bytes; use anyhow::*; use async_stream::stream; -use lazy_static::lazy_static; use log::*; -// TODO: allow users to configure file extensions instead of hard coding the list -// https://github.com/phiresky/ripgrep-all/pull/208#issuecomment-2173241243 -static EXTENSIONS: &[&str] = &["zip", "jar", "xpi", "kra", "snagx"]; +pub const EXTENSIONS: &[&str] = &["zip", "jar", "xpi", "kra", "snagx"]; +pub const MIMETYPES: &[&str] = &["application/zip"]; -lazy_static! { - static ref METADATA: AdapterMeta = AdapterMeta { - name: "zip".to_owned(), - version: 1, - description: "Reads a zip file as a stream and recurses down into its contents".to_owned(), - recurses: true, - fast_matchers: EXTENSIONS - .iter() - .map(|s| FastFileMatcher::FileExtension(s.to_string())) - .collect(), - slow_matchers: Some(vec![FileMatcher::MimeType("application/zip".to_owned())]), - keep_fast_matchers_if_accurate: false, - disabled_by_default: false - }; +#[derive(Debug, Clone)] +pub struct ZipAdapter { + pub extensions: Vec, + pub mimetypes: Vec, } -#[derive(Default, Clone)] -pub struct ZipAdapter; -impl ZipAdapter { - pub fn new() -> ZipAdapter { - ZipAdapter +impl Default for ZipAdapter { + fn default() -> ZipAdapter { + ZipAdapter { + extensions: EXTENSIONS.iter().map(|&s| s.to_string()).collect(), + mimetypes: MIMETYPES.iter().map(|&s| s.to_string()).collect(), + } } } -impl GetMetadata for ZipAdapter { - fn metadata(&self) -> &AdapterMeta { - &METADATA + +impl Adapter for ZipAdapter { + fn name(&self) -> String { + String::from("zip") + } + fn version(&self) -> i32 { + 1 + } + fn description(&self) -> String { + String::from("Reads a zip file as a stream and recurses down into its contents") + } + fn recurses(&self) -> bool { + true + } + fn disabled_by_default(&self) -> bool { + false + } + fn keep_fast_matchers_if_accurate(&self) -> bool { + false + } + fn extensions(&self) -> Option> { + if self.extensions.is_empty() { + None + } else { + Some(self.extensions.clone()) + } + } + fn mimetypes(&self) -> Option> { + if self.mimetypes.is_empty() { + None + } else { + Some(self.mimetypes.clone()) + } } } @@ -225,7 +244,7 @@ mod test { async fn only_seek_zip_fs() -> Result<()> { let zip = test_data_dir().join("only-seek-zip.zip"); let (a, d) = simple_fs_adapt_info(&zip).await?; - let _v = adapted_to_vec(loop_adapt(&ZipAdapter::new(), d, a).await?).await?; + let _v = adapted_to_vec(loop_adapt(&ZipAdapter::default(), d, a).await?).await?; // assert_eq!(String::from_utf8(v)?, ""); Ok(()) @@ -242,7 +261,7 @@ mod test { #[tokio::test] async fn recurse() -> Result<()> { let zipfile = create_zip("outer.txt", "outer text file", true).await?; - let adapter = ZipAdapter::new(); + let adapter = ZipAdapter::default(); let (a, d) = simple_adapt_info( &PathBuf::from("outer.zip"), diff --git a/src/bin/rga.rs b/src/bin/rga.rs index b1ebb82..e698ff3 100644 --- a/src/bin/rga.rs +++ b/src/bin/rga.rs @@ -12,13 +12,15 @@ use std::process::Command; use std::time::Instant; fn list_adapters(args: RgaConfig) -> Result<()> { - let (enabled_adapters, disabled_adapters) = get_all_adapters(args.custom_adapters); + let (enabled_adapters, disabled_adapters) = + get_all_adapters(args.custom_identifiers, args.custom_adapters); println!("Adapters:\n"); let print = |adapter: std::sync::Arc| { let meta = adapter.metadata(); let matchers = meta .fast_matchers + .unwrap_or_default() .iter() .map(|m| match m { FastFileMatcher::FileExtension(ext) => format!(".{ext}"), @@ -92,14 +94,18 @@ fn main() -> anyhow::Result<()> { return Ok(()); } - let adapters = get_adapters_filtered(config.custom_adapters.clone(), &config.adapters)?; + let adapters = get_adapters_filtered( + config.custom_identifiers.clone(), + config.custom_adapters.clone(), + &config.adapters, + )?; let pre_glob = if !config.accurate { let extensions = adapters .iter() - .flat_map(|a| &a.metadata().fast_matchers) - .flat_map(|m| match m { - FastFileMatcher::FileExtension(ext) => vec![ext.clone(), ext.to_ascii_uppercase()], + .flat_map(|a| a.metadata().fast_matchers.unwrap_or_default()) + .map(|matcher| match matcher { + FastFileMatcher::FileExtension(_) => matcher.to_string(), }) .collect::>() .join(","); diff --git a/src/config.rs b/src/config.rs index 768ce12..e589f4a 100644 --- a/src/config.rs +++ b/src/config.rs @@ -1,4 +1,7 @@ -use crate::{adapters::custom::CustomAdapterConfig, project_dirs}; +use crate::{ + adapters::custom::{CustomAdapterConfig, CustomIdentifiers}, + project_dirs, +}; use anyhow::{Context, Result}; use derive_more::FromStr; use log::*; @@ -171,6 +174,12 @@ pub struct RgaConfig { #[structopt(skip)] pub custom_adapters: Option>, ////////////////////////////////////////// + //////////////////////////// Config file only + ////////////////////////////////////////// + #[serde(default, skip_serializing_if = "is_default")] + #[structopt(skip)] + pub custom_identifiers: Option, + ////////////////////////////////////////// //////////////////////////// CMD line only ////////////////////////////////////////// #[serde(skip)] diff --git a/src/matching.rs b/src/matching.rs index d5a4be9..81f6998 100644 --- a/src/matching.rs +++ b/src/matching.rs @@ -7,8 +7,8 @@ use anyhow::*; use regex::{Regex, RegexSet}; +use std::fmt; use std::iter::Iterator; - use std::sync::Arc; // match only based on file path @@ -24,6 +24,20 @@ pub enum FastFileMatcher { // todo: maybe allow matching a directory (e.g. /var/lib/postgres) } +impl std::fmt::Display for FastFileMatcher { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + FastFileMatcher::FileExtension(val) => { + // Write strictly the first element into the supplied output + // stream: `f`. Returns `fmt::Result` which indicates whether the + // operation succeeded or failed. Note that `write!` uses syntax which + // is very similar to `println!`. + write!(f, "{}", val) + } + } + } +} + #[derive(Clone, Debug, Eq, PartialEq, Hash)] pub enum FileMatcher { /// any type of fast matcher @@ -40,12 +54,12 @@ impl From for FileMatcher { } } -pub struct FileMeta { +pub struct FileMeta<'a> { // filename is not actually a utf8 string, but since we can't do regex on OsStr and can't get a &[u8] from OsStr either, // and since we probably only want to do only matching on ascii stuff anyways, this is the filename as a string with non-valid bytes removed pub lossy_filename: String, // only given when slow matching is enabled - pub mimetype: Option<&'static str>, + pub mimetype: Option<&'a str>, } pub fn extension_to_regex(extension: &str) -> Regex { diff --git a/src/preproc.rs b/src/preproc.rs index 32f3fa8..089ad03 100644 --- a/src/preproc.rs +++ b/src/preproc.rs @@ -32,7 +32,11 @@ async fn choose_adapter( archive_recursion_depth: i32, inp: &mut (impl AsyncBufRead + Unpin), ) -> Result, FileMatcher, ActiveAdapters)>> { - let active_adapters = get_adapters_filtered(config.custom_adapters.clone(), &config.adapters)?; + let active_adapters = get_adapters_filtered( + config.custom_identifiers.clone(), + config.custom_adapters.clone(), + &config.adapters, + )?; let adapters = adapter_matcher(&active_adapters, config.accurate)?; let filename = filepath_hint .file_name() @@ -255,7 +259,7 @@ pub async fn loop_adapt_inner( ai.filepath_hint.to_string_lossy(), &adapter.metadata().name ); - for await ifile in loop_adapt(adapter.as_ref(), detection_reason, ai).await? { + for await ifile in loop_adapt(adapter.clone().as_ref(), detection_reason, ai).await? { yield ifile; } }