From 4756038d89ecb8c6f2135bc0c6a20246dd35fbb8 Mon Sep 17 00:00:00 2001 From: veeshi <31014797+veeshi@users.noreply.github.com> Date: Fri, 4 Feb 2022 11:41:56 +0000 Subject: [PATCH 1/4] Formatted with rustfmt and fixed nearly all clippy hints --- src/cache.rs | 571 +++++++++++++++++++++++++------------------------ src/lazy.rs | 357 ++++++++++++++++--------------- src/lib.rs | 8 +- src/options.rs | 62 +++--- 4 files changed, 505 insertions(+), 493 deletions(-) diff --git a/src/cache.rs b/src/cache.rs index 0517744..2570a3e 100644 --- a/src/cache.rs +++ b/src/cache.rs @@ -18,338 +18,343 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -use std::ops::{Deref, DerefMut}; -use std::sync::{Mutex, Arc}; use std::borrow::Cow; use std::fmt; +use std::ops::{Deref, DerefMut}; use std::str; +use std::sync::{Arc, Mutex}; -use regex::{Regex, RegexBuilder, Error}; -use regex::{Match, Captures, Replacer}; -use crate::syntax; -use crate::options::Options; use crate::lru::LruCache; +use crate::options::Options; +use crate::syntax; +use regex::{Captures, Match, Replacer}; +use regex::{Error, Regex, RegexBuilder}; /// An LRU cache for regular expressions. #[derive(Clone, Debug)] pub struct RegexCache(LruCache); impl RegexCache { - /// Create a new LRU cache with the given size limit. - pub fn new(capacity: usize) -> RegexCache { - RegexCache(LruCache::new(capacity)) - } - - /// Save the given regular expression in the cache. - /// - /// # Example - /// - /// ``` - /// # use regex_cache::{Regex, RegexCache}; - /// let mut cache = RegexCache::new(100); - /// let re = Regex::new(r"^\d+$").unwrap(); - /// - /// // By saving the previously created regular expression further calls to - /// // `compile` won't actually compile the regular expression. 
- /// cache.save(re); - /// - /// assert!(cache.compile(r"^\d+$").unwrap().is_match("1234")); - /// assert!(!cache.compile(r"^\d+$").unwrap().is_match("abcd")); - /// ``` - pub fn save(&mut self, re: Regex) -> &Regex { - let source = re.as_str().to_owned(); - - if !self.0.contains_key(re.as_str()) { - self.insert(source.clone(), re); - } - - self.0.get_mut(&source).unwrap() - } - - /// Create a new regular expression in the cache. - /// - /// # Example - /// - /// ``` - /// # use regex_cache::RegexCache; - /// let mut cache = RegexCache::new(100); - /// - /// assert!(cache.compile(r"^\d+$").unwrap().is_match("1234")); - /// assert!(!cache.compile(r"^\d+$").unwrap().is_match("abcd")); - /// ``` - pub fn compile(&mut self, source: &str) -> Result<&Regex, Error> { - if !self.0.contains_key(source) { - self.0.insert(source.into(), Regex::new(source)?); - } - - Ok(self.0.get_mut(source).unwrap()) - } - - /// Configure a new regular expression. - /// - /// # Example - /// - /// ``` - /// # use regex_cache::RegexCache; - /// let mut cache = RegexCache::new(100); - /// - /// assert!(cache.configure(r"abc", |b| b.case_insensitive(true)).unwrap() - /// .is_match("ABC")); - /// - /// assert!(!cache.configure(r"abc", |b| b.case_insensitive(true)).unwrap() - /// .is_match("123")); - /// ``` - pub fn configure(&mut self, source: &str, f: F) -> Result<&Regex, Error> - where F: FnOnce(&mut RegexBuilder) -> &mut RegexBuilder - { - if !self.0.contains_key(source) { - self.0.insert(source.into(), f(&mut RegexBuilder::new(source)).build()?); - } - - Ok(self.0.get_mut(source).unwrap()) - } + /// Create a new LRU cache with the given size limit. + pub fn new(capacity: usize) -> RegexCache { + RegexCache(LruCache::new(capacity)) + } + + /// Save the given regular expression in the cache. 
+ /// + /// # Example + /// + /// ``` + /// # use regex_cache::{Regex, RegexCache}; + /// let mut cache = RegexCache::new(100); + /// let re = Regex::new(r"^\d+$").unwrap(); + /// + /// // By saving the previously created regular expression further calls to + /// // `compile` won't actually compile the regular expression. + /// cache.save(re); + /// + /// assert!(cache.compile(r"^\d+$").unwrap().is_match("1234")); + /// assert!(!cache.compile(r"^\d+$").unwrap().is_match("abcd")); + /// ``` + pub fn save(&mut self, re: Regex) -> &Regex { + let source = re.as_str().to_owned(); + + if !self.0.contains_key(re.as_str()) { + self.insert(source.clone(), re); + } + + self.0.get_mut(&source).unwrap() + } + + /// Create a new regular expression in the cache. + /// + /// # Example + /// + /// ``` + /// # use regex_cache::RegexCache; + /// let mut cache = RegexCache::new(100); + /// + /// assert!(cache.compile(r"^\d+$").unwrap().is_match("1234")); + /// assert!(!cache.compile(r"^\d+$").unwrap().is_match("abcd")); + /// ``` + pub fn compile(&mut self, source: &str) -> Result<&Regex, Error> { + if !self.0.contains_key(source) { + self.0.insert(source.into(), Regex::new(source)?); + } + + Ok(self.0.get_mut(source).unwrap()) + } + + /// Configure a new regular expression. 
+ /// + /// # Example + /// + /// ``` + /// # use regex_cache::RegexCache; + /// let mut cache = RegexCache::new(100); + /// + /// assert!(cache.configure(r"abc", |b| b.case_insensitive(true)).unwrap() + /// .is_match("ABC")); + /// + /// assert!(!cache.configure(r"abc", |b| b.case_insensitive(true)).unwrap() + /// .is_match("123")); + /// ``` + pub fn configure(&mut self, source: &str, f: F) -> Result<&Regex, Error> + where + F: FnOnce(&mut RegexBuilder) -> &mut RegexBuilder, + { + if !self.0.contains_key(source) { + self.0 + .insert(source.into(), f(&mut RegexBuilder::new(source)).build()?); + } + + Ok(self.0.get_mut(source).unwrap()) + } } impl Deref for RegexCache { - type Target = LruCache; + type Target = LruCache; - fn deref(&self) -> &Self::Target { - &self.0 - } + fn deref(&self) -> &Self::Target { + &self.0 + } } impl DerefMut for RegexCache { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.0 - } + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } } #[derive(Clone)] pub struct CachedRegex { - builder: CachedRegexBuilder, + builder: CachedRegexBuilder, } macro_rules! regex { - ($self:ident) => ( - $self.builder.cache.lock().unwrap().configure(&$self.builder.source, |b| - $self.builder.options.define(b)).unwrap() - ) + ($self:ident) => { + $self + .builder + .cache + .lock() + .unwrap() + .configure(&$self.builder.source, |b| $self.builder.options.define(b)) + .unwrap() + }; } impl CachedRegex { - /// Create a new cached `Regex` for the given source, checking the syntax is - /// valid. - pub fn new(cache: Arc>, source: &str) -> Result { - if let Err(err) = syntax::Parser::new().parse(source) { - return Err(Error::Syntax(err.to_string())); - } - - Ok(CachedRegex::new_unchecked(cache, source)) - } - - /// Create a new cached `Regex` for the given source, without checking if the - /// syntax is valid. - /// - /// Only use this if you know that the syntax is valid or you are ready to - /// handle potential syntax errors later on. 
- pub fn new_unchecked(cache: Arc>, source: &str) -> CachedRegex { - CachedRegex::from(CachedRegexBuilder::new(cache, source)) - } - - fn from(builder: CachedRegexBuilder) -> Self { - CachedRegex { - builder: builder, - } - } - - /// Refer to `Regex::is_match`. - pub fn is_match(&self, text: &str) -> bool { - regex!(self).is_match(text) - } - - /// Refer to `Regex::find`. - pub fn find<'t>(&self, text: &'t str) -> Option> { - regex!(self).find(text) - } - - /// Refer to `Regex::captures`. - pub fn captures<'t>(&self, text: &'t str) -> Option> { - regex!(self).captures(text) - } - - /// Refer to `Regex::replace`. - pub fn replace<'t, R: Replacer>(&self, text: &'t str, rep: R) -> Cow<'t, str> { - regex!(self).replace(text, rep) - } - - /// Refer to `Regex::replace_all`. - pub fn replace_all<'t, R: Replacer>(&self, text: &'t str, rep: R) -> Cow<'t, str> { - regex!(self).replace_all(text, rep) - } - - /// Refer to `Regex::shortest_match`. - pub fn shortest_match(&self, text: &str) -> Option { - regex!(self).shortest_match(text) - } - - pub fn captures_len(&self) -> usize { - regex!(self).captures_len() - } - - pub fn as_str(&self) -> &str { - &self.builder.source - } + /// Create a new cached `Regex` for the given source, checking the syntax is + /// valid. + pub fn new(cache: Arc>, source: &str) -> Result { + if let Err(err) = syntax::Parser::new().parse(source) { + return Err(Error::Syntax(err.to_string())); + } + + Ok(CachedRegex::new_unchecked(cache, source)) + } + + /// Create a new cached `Regex` for the given source, without checking if the + /// syntax is valid. + /// + /// Only use this if you know that the syntax is valid or you are ready to + /// handle potential syntax errors later on. + pub fn new_unchecked(cache: Arc>, source: &str) -> CachedRegex { + CachedRegex::from(CachedRegexBuilder::new(cache, source)) + } + + fn from(builder: CachedRegexBuilder) -> Self { + CachedRegex { builder } + } + + /// Refer to `Regex::is_match`. 
+ pub fn is_match(&self, text: &str) -> bool { + regex!(self).is_match(text) + } + + /// Refer to `Regex::find`. + pub fn find<'t>(&self, text: &'t str) -> Option> { + regex!(self).find(text) + } + + /// Refer to `Regex::captures`. + pub fn captures<'t>(&self, text: &'t str) -> Option> { + regex!(self).captures(text) + } + + /// Refer to `Regex::replace`. + pub fn replace<'t, R: Replacer>(&self, text: &'t str, rep: R) -> Cow<'t, str> { + regex!(self).replace(text, rep) + } + + /// Refer to `Regex::replace_all`. + pub fn replace_all<'t, R: Replacer>(&self, text: &'t str, rep: R) -> Cow<'t, str> { + regex!(self).replace_all(text, rep) + } + + /// Refer to `Regex::shortest_match`. + pub fn shortest_match(&self, text: &str) -> Option { + regex!(self).shortest_match(text) + } + + pub fn captures_len(&self) -> usize { + regex!(self).captures_len() + } + + pub fn as_str(&self) -> &str { + &self.builder.source + } } impl fmt::Debug for CachedRegex { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - fmt::Debug::fmt(regex!(self), f) - } + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Debug::fmt(regex!(self), f) + } } impl fmt::Display for CachedRegex { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - fmt::Display::fmt(regex!(self), f) - } + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Display::fmt(regex!(self), f) + } } /// A configurable builder for a cached `Regex`. #[derive(Clone, Debug)] pub struct CachedRegexBuilder { - cache: Arc>, - source: String, - options: Options, + cache: Arc>, + source: String, + options: Options, } impl CachedRegexBuilder { - /// Create a new regular expression builder with the given pattern. - /// - /// If the pattern is invalid, then an error will be returned when - /// `compile` is called. 
- pub fn new(cache: Arc>, source: &str) -> CachedRegexBuilder { - CachedRegexBuilder { - cache: cache, - source: source.to_owned(), - options: Default::default(), - } - } - - /// Consume the builder and compile the regular expression. - /// - /// Note that calling `as_str` on the resulting `Regex` will produce the - /// pattern given to `new` verbatim. Notably, it will not incorporate any - /// of the flags set on this builder. - pub fn build(&self) -> Result { - if let Err(err) = syntax::Parser::new().parse(&self.source) { - return Err(Error::Syntax(err.to_string())); - } - - Ok(CachedRegex::from(self.clone())) - } - - /// Consume the builder and compile the regular expression without checking - /// if the syntax is valid. - /// - /// Only use this if you know that the syntax is valid or you are ready to - /// handle potential syntax errors later on. - /// - /// Note that calling `as_str` on the resulting `Regex` will produce the - /// pattern given to `new` verbatim. Notably, it will not incorporate any - /// of the flags set on this builder. - pub fn build_unchecked(&self) -> CachedRegex { - CachedRegex::from(self.clone()) - } - - /// Set the value for the case insensitive (`i`) flag. - pub fn case_insensitive(&mut self, yes: bool) -> &mut CachedRegexBuilder { - self.options.case_insensitive = yes; - self - } - - /// Set the value for the multi-line matching (`m`) flag. - pub fn multi_line(&mut self, yes: bool) -> &mut CachedRegexBuilder { - self.options.multi_line = yes; - self - } - - /// Set the value for the any character (`s`) flag, where in `.` matches - /// anything when `s` is set and matches anything except for new line when - /// it is not set (the default). - /// - /// N.B. "matches anything" means "any byte" for `regex::bytes::Regex` - /// expressions and means "any Unicode scalar value" for `regex::Regex` - /// expressions. 
- pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut CachedRegexBuilder { - self.options.dot_matches_new_line = yes; - self - } - - /// Set the value for the greedy swap (`U`) flag. - pub fn swap_greed(&mut self, yes: bool) -> &mut CachedRegexBuilder { - self.options.swap_greed = yes; - self - } - - /// Set the value for the ignore whitespace (`x`) flag. - pub fn ignore_whitespace(&mut self, yes: bool) -> &mut CachedRegexBuilder { - self.options.ignore_whitespace = yes; - self - } - - /// Set the value for the Unicode (`u`) flag. - pub fn unicode(&mut self, yes: bool) -> &mut CachedRegexBuilder { - self.options.unicode = yes; - self - } - - /// Set the approximate size limit of the compiled regular expression. - /// - /// This roughly corresponds to the number of bytes occupied by a single - /// compiled program. If the program exceeds this number, then a - /// compilation error is returned. - pub fn size_limit(&mut self, limit: usize) -> &mut CachedRegexBuilder { - self.options.size_limit = limit; - self - } - - /// Set the approximate size of the cache used by the DFA. - /// - /// This roughly corresponds to the number of bytes that the DFA will - /// use while searching. - /// - /// Note that this is a *per thread* limit. There is no way to set a global - /// limit. In particular, if a regex is used from multiple threads - /// simulanteously, then each thread may use up to the number of bytes - /// specified here. - pub fn dfa_size_limit(&mut self, limit: usize) -> &mut CachedRegexBuilder { - self.options.dfa_size_limit = limit; - self - } + /// Create a new regular expression builder with the given pattern. + /// + /// If the pattern is invalid, then an error will be returned when + /// `compile` is called. + pub fn new(cache: Arc>, source: &str) -> CachedRegexBuilder { + CachedRegexBuilder { + cache, + source: source.to_owned(), + options: Default::default(), + } + } + + /// Consume the builder and compile the regular expression. 
+ /// + /// Note that calling `as_str` on the resulting `Regex` will produce the + /// pattern given to `new` verbatim. Notably, it will not incorporate any + /// of the flags set on this builder. + pub fn build(&self) -> Result { + if let Err(err) = syntax::Parser::new().parse(&self.source) { + return Err(Error::Syntax(err.to_string())); + } + + Ok(CachedRegex::from(self.clone())) + } + + /// Consume the builder and compile the regular expression without checking + /// if the syntax is valid. + /// + /// Only use this if you know that the syntax is valid or you are ready to + /// handle potential syntax errors later on. + /// + /// Note that calling `as_str` on the resulting `Regex` will produce the + /// pattern given to `new` verbatim. Notably, it will not incorporate any + /// of the flags set on this builder. + pub fn build_unchecked(&self) -> CachedRegex { + CachedRegex::from(self.clone()) + } + + /// Set the value for the case insensitive (`i`) flag. + pub fn case_insensitive(&mut self, yes: bool) -> &mut CachedRegexBuilder { + self.options.case_insensitive = yes; + self + } + + /// Set the value for the multi-line matching (`m`) flag. + pub fn multi_line(&mut self, yes: bool) -> &mut CachedRegexBuilder { + self.options.multi_line = yes; + self + } + + /// Set the value for the any character (`s`) flag, where in `.` matches + /// anything when `s` is set and matches anything except for new line when + /// it is not set (the default). + /// + /// N.B. "matches anything" means "any byte" for `regex::bytes::Regex` + /// expressions and means "any Unicode scalar value" for `regex::Regex` + /// expressions. + pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut CachedRegexBuilder { + self.options.dot_matches_new_line = yes; + self + } + + /// Set the value for the greedy swap (`U`) flag. 
+ pub fn swap_greed(&mut self, yes: bool) -> &mut CachedRegexBuilder { + self.options.swap_greed = yes; + self + } + + /// Set the value for the ignore whitespace (`x`) flag. + pub fn ignore_whitespace(&mut self, yes: bool) -> &mut CachedRegexBuilder { + self.options.ignore_whitespace = yes; + self + } + + /// Set the value for the Unicode (`u`) flag. + pub fn unicode(&mut self, yes: bool) -> &mut CachedRegexBuilder { + self.options.unicode = yes; + self + } + + /// Set the approximate size limit of the compiled regular expression. + /// + /// This roughly corresponds to the number of bytes occupied by a single + /// compiled program. If the program exceeds this number, then a + /// compilation error is returned. + pub fn size_limit(&mut self, limit: usize) -> &mut CachedRegexBuilder { + self.options.size_limit = limit; + self + } + + /// Set the approximate size of the cache used by the DFA. + /// + /// This roughly corresponds to the number of bytes that the DFA will + /// use while searching. + /// + /// Note that this is a *per thread* limit. There is no way to set a global + /// limit. In particular, if a regex is used from multiple threads + /// simulanteously, then each thread may use up to the number of bytes + /// specified here. 
+ pub fn dfa_size_limit(&mut self, limit: usize) -> &mut CachedRegexBuilder { + self.options.dfa_size_limit = limit; + self + } } #[cfg(test)] mod test { - use std::sync::{Arc, Mutex}; - use crate::cache::{RegexCache, CachedRegex}; + use crate::cache::{CachedRegex, RegexCache}; + use std::sync::{Arc, Mutex}; - #[test] - fn respects_limit() { - let mut cache = RegexCache::new(2); + #[test] + fn respects_limit() { + let mut cache = RegexCache::new(2); - cache.compile("[01]2").unwrap(); - cache.compile("[21]0").unwrap(); + cache.compile("[01]2").unwrap(); + cache.compile("[21]0").unwrap(); - assert_eq!(cache.len(), 2); - cache.compile("[21]3").unwrap(); - assert_eq!(cache.len(), 2); - } + assert_eq!(cache.len(), 2); + cache.compile("[21]3").unwrap(); + assert_eq!(cache.len(), 2); + } - #[test] - fn cached_regex() { - let cache = Arc::new(Mutex::new(RegexCache::new(100))); - let re = CachedRegex::new(cache.clone(), r"^\d+$").unwrap(); + #[test] + fn cached_regex() { + let cache = Arc::new(Mutex::new(RegexCache::new(100))); + let re = CachedRegex::new(cache.clone(), r"^\d+$").unwrap(); - assert!(re.is_match("123")); - assert!(!re.is_match("abc")); - } + assert!(re.is_match("123")); + assert!(!re.is_match("abc")); + } } diff --git a/src/lazy.rs b/src/lazy.rs index efdbcc8..d43e367 100644 --- a/src/lazy.rs +++ b/src/lazy.rs @@ -18,16 +18,16 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -use std::ops::Deref; use std::fmt; +use std::ops::Deref; use std::str; -use std::sync::Arc; use oncemutex::OnceMutex; +use std::sync::Arc; -use regex::{Regex, RegexBuilder, Error}; -use crate::syntax; use crate::options::Options; +use crate::syntax; +use regex::{Error, Regex, RegexBuilder}; /// A lazily created `Regex`. 
/// @@ -46,211 +46,218 @@ use crate::options::Options; /// ``` #[derive(Clone)] pub struct LazyRegex { - builder: LazyRegexBuilder, - regex: Arc>> + builder: LazyRegexBuilder, + regex: Arc>>, } impl LazyRegex { - /// Create a new lazy `Regex` for the given source, checking the syntax is - /// valid. - pub fn new(source: &str) -> Result { - if let Err(err) = syntax::Parser::new().parse(source) { - return Err(Error::Syntax(err.to_string())); - } - - Ok(LazyRegex::from(LazyRegexBuilder::new(source))) - } - - fn from(builder: LazyRegexBuilder) -> Self { - LazyRegex { - builder: builder, - regex: Arc::new(OnceMutex::new(None)), - } - } - - fn create(builder: &LazyRegexBuilder) -> Regex { - builder.options.define(&mut RegexBuilder::new(&builder.source)) - .build().unwrap() - } + /// Create a new lazy `Regex` for the given source, checking the syntax is + /// valid. + pub fn new(source: &str) -> Result { + if let Err(err) = syntax::Parser::new().parse(source) { + return Err(Error::Syntax(err.to_string())); + } + + Ok(LazyRegex::from(LazyRegexBuilder::new(source))) + } + + fn from(builder: LazyRegexBuilder) -> Self { + LazyRegex { + builder, + regex: Arc::new(OnceMutex::new(None)), + } + } + + fn create(builder: &LazyRegexBuilder) -> Regex { + builder + .options + .define(&mut RegexBuilder::new(&builder.source)) + .build() + .unwrap() + } } impl Deref for LazyRegex { - type Target = Regex; + type Target = Regex; - fn deref(&self) -> &Regex { - self.as_ref() - } + fn deref(&self) -> &Regex { + self.as_ref() + } } impl AsRef for LazyRegex { - fn as_ref(&self) -> &Regex { - if let Some(mut guard) = self.regex.lock() { - *guard = Some(LazyRegex::create(&self.builder)); - } + fn as_ref(&self) -> &Regex { + if let Some(mut guard) = self.regex.lock() { + *guard = Some(LazyRegex::create(&self.builder)); + } - (*self.regex).as_ref().unwrap() - } + (*self.regex).as_ref().unwrap() + } } impl Into for LazyRegex { - fn into(self) -> Regex { - let (regex, builder) = (self.regex, 
self.builder); - - Arc::try_unwrap(regex).ok().and_then(|m| m.into_inner()).unwrap_or_else(|| - LazyRegex::create(&builder)) - } + fn into(self) -> Regex { + let (regex, builder) = (self.regex, self.builder); + + Arc::try_unwrap(regex) + .ok() + .and_then(|m| m.into_inner()) + .unwrap_or_else(|| LazyRegex::create(&builder)) + } } impl fmt::Debug for LazyRegex { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - fmt::Debug::fmt(&**self, f) - } + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Debug::fmt(&**self, f) + } } impl fmt::Display for LazyRegex { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - fmt::Display::fmt(&**self, f) - } + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Display::fmt(&**self, f) + } } impl str::FromStr for LazyRegex { - type Err = Error; + type Err = Error; - fn from_str(s: &str) -> Result { - LazyRegex::new(s) - } + fn from_str(s: &str) -> Result { + LazyRegex::new(s) + } } /// A configurable builder for a lazy `Regex`. #[derive(Clone, Eq, PartialEq, Debug)] pub struct LazyRegexBuilder { - source: String, - options: Options, + source: String, + options: Options, } impl LazyRegexBuilder { - /// Create a new regular expression builder with the given pattern. - /// - /// If the pattern is invalid, then an error will be returned when - /// `compile` is called. - pub fn new(source: &str) -> LazyRegexBuilder { - LazyRegexBuilder { - source: source.to_owned(), - options: Default::default(), - } - } - - /// Consume the builder and compile the regular expression. - /// - /// Note that calling `as_str` on the resulting `Regex` will produce the - /// pattern given to `new` verbatim. Notably, it will not incorporate any - /// of the flags set on this builder. 
- pub fn build(&self) -> Result { - if let Err(err) = syntax::Parser::new().parse(&self.source) { - return Err(Error::Syntax(err.to_string())); - } - - Ok(LazyRegex::from(self.clone())) - } - - /// Set the value for the case insensitive (`i`) flag. - pub fn case_insensitive(&mut self, yes: bool) -> &mut LazyRegexBuilder { - self.options.case_insensitive = yes; - self - } - - /// Set the value for the multi-line matching (`m`) flag. - pub fn multi_line(&mut self, yes: bool) -> &mut LazyRegexBuilder { - self.options.multi_line = yes; - self - } - - /// Set the value for the any character (`s`) flag, where in `.` matches - /// anything when `s` is set and matches anything except for new line when - /// it is not set (the default). - /// - /// N.B. "matches anything" means "any byte" for `regex::bytes::Regex` - /// expressions and means "any Unicode scalar value" for `regex::Regex` - /// expressions. - pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut LazyRegexBuilder { - self.options.dot_matches_new_line = yes; - self - } - - /// Set the value for the greedy swap (`U`) flag. - pub fn swap_greed(&mut self, yes: bool) -> &mut LazyRegexBuilder { - self.options.swap_greed = yes; - self - } - - /// Set the value for the ignore whitespace (`x`) flag. - pub fn ignore_whitespace(&mut self, yes: bool) -> &mut LazyRegexBuilder { - self.options.ignore_whitespace = yes; - self - } - - /// Set the value for the Unicode (`u`) flag. - pub fn unicode(&mut self, yes: bool) -> &mut LazyRegexBuilder { - self.options.unicode = yes; - self - } - - /// Set the approximate size limit of the compiled regular expression. - /// - /// This roughly corresponds to the number of bytes occupied by a single - /// compiled program. If the program exceeds this number, then a - /// compilation error is returned. - pub fn size_limit(&mut self, limit: usize) -> &mut LazyRegexBuilder { - self.options.size_limit = limit; - self - } - - /// Set the approximate size of the cache used by the DFA. 
- /// - /// This roughly corresponds to the number of bytes that the DFA will - /// use while searching. - /// - /// Note that this is a *per thread* limit. There is no way to set a global - /// limit. In particular, if a regex is used from multiple threads - /// simulanteously, then each thread may use up to the number of bytes - /// specified here. - pub fn dfa_size_limit(&mut self, limit: usize) -> &mut LazyRegexBuilder { - self.options.dfa_size_limit = limit; - self - } + /// Create a new regular expression builder with the given pattern. + /// + /// If the pattern is invalid, then an error will be returned when + /// `compile` is called. + pub fn new(source: &str) -> LazyRegexBuilder { + LazyRegexBuilder { + source: source.to_owned(), + options: Default::default(), + } + } + + /// Consume the builder and compile the regular expression. + /// + /// Note that calling `as_str` on the resulting `Regex` will produce the + /// pattern given to `new` verbatim. Notably, it will not incorporate any + /// of the flags set on this builder. + pub fn build(&self) -> Result { + if let Err(err) = syntax::Parser::new().parse(&self.source) { + return Err(Error::Syntax(err.to_string())); + } + + Ok(LazyRegex::from(self.clone())) + } + + /// Set the value for the case insensitive (`i`) flag. + pub fn case_insensitive(&mut self, yes: bool) -> &mut LazyRegexBuilder { + self.options.case_insensitive = yes; + self + } + + /// Set the value for the multi-line matching (`m`) flag. + pub fn multi_line(&mut self, yes: bool) -> &mut LazyRegexBuilder { + self.options.multi_line = yes; + self + } + + /// Set the value for the any character (`s`) flag, where in `.` matches + /// anything when `s` is set and matches anything except for new line when + /// it is not set (the default). + /// + /// N.B. "matches anything" means "any byte" for `regex::bytes::Regex` + /// expressions and means "any Unicode scalar value" for `regex::Regex` + /// expressions. 
+ pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut LazyRegexBuilder { + self.options.dot_matches_new_line = yes; + self + } + + /// Set the value for the greedy swap (`U`) flag. + pub fn swap_greed(&mut self, yes: bool) -> &mut LazyRegexBuilder { + self.options.swap_greed = yes; + self + } + + /// Set the value for the ignore whitespace (`x`) flag. + pub fn ignore_whitespace(&mut self, yes: bool) -> &mut LazyRegexBuilder { + self.options.ignore_whitespace = yes; + self + } + + /// Set the value for the Unicode (`u`) flag. + pub fn unicode(&mut self, yes: bool) -> &mut LazyRegexBuilder { + self.options.unicode = yes; + self + } + + /// Set the approximate size limit of the compiled regular expression. + /// + /// This roughly corresponds to the number of bytes occupied by a single + /// compiled program. If the program exceeds this number, then a + /// compilation error is returned. + pub fn size_limit(&mut self, limit: usize) -> &mut LazyRegexBuilder { + self.options.size_limit = limit; + self + } + + /// Set the approximate size of the cache used by the DFA. + /// + /// This roughly corresponds to the number of bytes that the DFA will + /// use while searching. + /// + /// Note that this is a *per thread* limit. There is no way to set a global + /// limit. In particular, if a regex is used from multiple threads + /// simulanteously, then each thread may use up to the number of bytes + /// specified here. 
+ pub fn dfa_size_limit(&mut self, limit: usize) -> &mut LazyRegexBuilder { + self.options.dfa_size_limit = limit; + self + } } #[cfg(test)] mod test { - use crate::{LazyRegex, LazyRegexBuilder}; - - #[test] - fn new() { - assert!(LazyRegex::new(r"^\d+$").unwrap() - .is_match("2345")); - - assert!(!LazyRegex::new(r"^[a-z]+$").unwrap() - .is_match("2345")); - } - - #[test] - fn build() { - assert!(LazyRegexBuilder::new(r"^abc$") - .case_insensitive(true).build().unwrap() - .is_match("ABC")); - - assert!(!LazyRegexBuilder::new(r"^abc$") - .case_insensitive(false).build().unwrap() - .is_match("ABC")); - } - - #[test] - fn same() { - let re = LazyRegex::new(r"^\d+$").unwrap(); - - assert!(re.is_match("1234")); - assert!(re.is_match("1234")); - assert!(re.is_match("1234")); - assert!(re.is_match("1234")); - } + use crate::{LazyRegex, LazyRegexBuilder}; + + #[test] + fn new() { + assert!(LazyRegex::new(r"^\d+$").unwrap().is_match("2345")); + + assert!(!LazyRegex::new(r"^[a-z]+$").unwrap().is_match("2345")); + } + + #[test] + fn build() { + assert!(LazyRegexBuilder::new(r"^abc$") + .case_insensitive(true) + .build() + .unwrap() + .is_match("ABC")); + + assert!(!LazyRegexBuilder::new(r"^abc$") + .case_insensitive(false) + .build() + .unwrap() + .is_match("ABC")); + } + + #[test] + fn same() { + let re = LazyRegex::new(r"^\d+$").unwrap(); + + assert!(re.is_match("1234")); + assert!(re.is_match("1234")); + assert!(re.is_match("1234")); + assert!(re.is_match("1234")); + } } diff --git a/src/lib.rs b/src/lib.rs index 9347327..f1122c7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -21,17 +21,17 @@ //! This crate provides a library for caching or lazily creating regular //! expressions. 
-extern crate regex; -extern crate regex_syntax as syntax; extern crate lru_cache as lru; extern crate oncemutex; +extern crate regex; +extern crate regex_syntax as syntax; -pub use regex::{Regex, RegexBuilder, Error}; +pub use regex::{Error, Regex, RegexBuilder}; mod options; mod cache; -pub use crate::cache::{RegexCache, CachedRegex, CachedRegexBuilder}; +pub use crate::cache::{CachedRegex, CachedRegexBuilder, RegexCache}; mod lazy; pub use crate::lazy::{LazyRegex, LazyRegexBuilder}; diff --git a/src/options.rs b/src/options.rs index bcf4cf0..55e60af 100644 --- a/src/options.rs +++ b/src/options.rs @@ -22,41 +22,41 @@ use regex::RegexBuilder; #[derive(Copy, Clone, Eq, PartialEq, Debug)] pub struct Options { - pub case_insensitive: bool, - pub multi_line: bool, - pub dot_matches_new_line: bool, - pub swap_greed: bool, - pub ignore_whitespace: bool, - pub unicode: bool, - pub size_limit: usize, - pub dfa_size_limit: usize, + pub case_insensitive: bool, + pub multi_line: bool, + pub dot_matches_new_line: bool, + pub swap_greed: bool, + pub ignore_whitespace: bool, + pub unicode: bool, + pub size_limit: usize, + pub dfa_size_limit: usize, } impl Default for Options { - fn default() -> Self { - Options { - case_insensitive: false, - multi_line: false, - dot_matches_new_line: false, - swap_greed: false, - ignore_whitespace: false, - unicode: true, - size_limit: 10 * (1 << 20), - dfa_size_limit: 2 * (1 << 20), - } - } + fn default() -> Self { + Options { + case_insensitive: false, + multi_line: false, + dot_matches_new_line: false, + swap_greed: false, + ignore_whitespace: false, + unicode: true, + size_limit: 10 * (1 << 20), + dfa_size_limit: 2 * (1 << 20), + } + } } impl Options { - pub fn define<'b>(&self, builder: &'b mut RegexBuilder) -> &'b mut RegexBuilder { - builder - .case_insensitive(self.case_insensitive) - .multi_line(self.multi_line) - .dot_matches_new_line(self.dot_matches_new_line) - .swap_greed(self.swap_greed) - 
.ignore_whitespace(self.ignore_whitespace) - .unicode(self.unicode) - .size_limit(self.size_limit) - .dfa_size_limit(self.dfa_size_limit) - } + pub fn define<'b>(&self, builder: &'b mut RegexBuilder) -> &'b mut RegexBuilder { + builder + .case_insensitive(self.case_insensitive) + .multi_line(self.multi_line) + .dot_matches_new_line(self.dot_matches_new_line) + .swap_greed(self.swap_greed) + .ignore_whitespace(self.ignore_whitespace) + .unicode(self.unicode) + .size_limit(self.size_limit) + .dfa_size_limit(self.dfa_size_limit) + } } From 89f24bb8e5cde1f1c43f0e2c7fe22c2008328470 Mon Sep 17 00:00:00 2001 From: veeshi <31014797+veeshi@users.noreply.github.com> Date: Fri, 4 Feb 2022 12:06:49 +0000 Subject: [PATCH 2/4] Added support for `regex::bytes` --- src/bytes.rs | 9 ++ src/bytes/cache.rs | 360 +++++++++++++++++++++++++++++++++++++++++++ src/bytes/lazy.rs | 268 ++++++++++++++++++++++++++++++++ src/bytes/options.rs | 62 ++++++++ src/lib.rs | 2 + 5 files changed, 701 insertions(+) create mode 100644 src/bytes.rs create mode 100644 src/bytes/cache.rs create mode 100644 src/bytes/lazy.rs create mode 100644 src/bytes/options.rs diff --git a/src/bytes.rs b/src/bytes.rs new file mode 100644 index 0000000..a6915ac --- /dev/null +++ b/src/bytes.rs @@ -0,0 +1,9 @@ +pub use regex::bytes::{Regex, RegexBuilder}; + +mod cache; +pub use cache::{CachedRegex, CachedRegexBuilder, RegexCache}; + +mod lazy; +pub use lazy::{LazyRegex, LazyRegexBuilder}; + +mod options; diff --git a/src/bytes/cache.rs b/src/bytes/cache.rs new file mode 100644 index 0000000..d17f194 --- /dev/null +++ b/src/bytes/cache.rs @@ -0,0 +1,360 @@ +// Copyright 2017 1aim GmbH +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of +// this software and associated documentation files (the "Software"), to deal in +// the Software without restriction, including without limitation the rights to +// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +// of the 
Software, and to permit persons to whom the Software is furnished to do +// so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +use std::borrow::Cow; +use std::fmt; +use std::ops::{Deref, DerefMut}; +use std::str; +use std::sync::{Arc, Mutex}; + +use crate::bytes::options::Options; +use crate::lru::LruCache; +use crate::syntax; +use regex::bytes::{Captures, Match, Regex, RegexBuilder, Replacer}; +use regex::Error; + +/// An LRU cache for regular expressions. +#[derive(Clone, Debug)] +pub struct RegexCache(LruCache); + +impl RegexCache { + /// Create a new LRU cache with the given size limit. + pub fn new(capacity: usize) -> RegexCache { + RegexCache(LruCache::new(capacity)) + } + + /// Save the given regular expression in the cache. + /// + /// # Example + /// + /// ``` + /// # use regex_cache::bytes::{Regex, RegexCache}; + /// let mut cache = RegexCache::new(100); + /// let re = Regex::new(r"^\d+$").unwrap(); + /// + /// // By saving the previously created regular expression further calls to + /// // `compile` won't actually compile the regular expression. 
+ /// cache.save(re); + /// + /// assert!(cache.compile(r"^\d+$").unwrap().is_match("1234".as_bytes())); + /// assert!(!cache.compile(r"^\d+$").unwrap().is_match("abcd".as_bytes())); + /// ``` + pub fn save(&mut self, re: Regex) -> &Regex { + let source = re.as_str().to_owned(); + + if !self.0.contains_key(re.as_str()) { + self.insert(source.clone(), re); + } + + self.0.get_mut(&source).unwrap() + } + + /// Create a new regular expression in the cache. + /// + /// # Example + /// + /// ``` + /// # use regex_cache::bytes::RegexCache; + /// let mut cache = RegexCache::new(100); + /// + /// assert!(cache.compile(r"^\d+$").unwrap().is_match("1234".as_bytes())); + /// assert!(!cache.compile(r"^\d+$").unwrap().is_match("abcd".as_bytes())); + /// ``` + pub fn compile(&mut self, source: &str) -> Result<&Regex, Error> { + if !self.0.contains_key(source) { + self.0.insert(source.into(), Regex::new(source)?); + } + + Ok(self.0.get_mut(source).unwrap()) + } + + /// Configure a new regular expression. + /// + /// # Example + /// + /// ``` + /// # use regex_cache::bytes::RegexCache; + /// let mut cache = RegexCache::new(100); + /// + /// assert!(cache.configure(r"abc", |b| b.case_insensitive(true)).unwrap() + /// .is_match("ABC".as_bytes())); + /// + /// assert!(!cache.configure(r"abc", |b| b.case_insensitive(true)).unwrap() + /// .is_match("123".as_bytes())); + /// ``` + pub fn configure(&mut self, source: &str, f: F) -> Result<&Regex, Error> + where + F: FnOnce(&mut RegexBuilder) -> &mut RegexBuilder, + { + if !self.0.contains_key(source) { + self.0 + .insert(source.into(), f(&mut RegexBuilder::new(source)).build()?); + } + + Ok(self.0.get_mut(source).unwrap()) + } +} + +impl Deref for RegexCache { + type Target = LruCache; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for RegexCache { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +#[derive(Clone)] +pub struct CachedRegex { + builder: CachedRegexBuilder, +} + +macro_rules! 
regex { + ($self:ident) => { + $self + .builder + .cache + .lock() + .unwrap() + .configure(&$self.builder.source, |b| $self.builder.options.define(b)) + .unwrap() + }; +} + +impl CachedRegex { + /// Create a new cached `Regex` for the given source, checking the syntax is + /// valid. + pub fn new(cache: Arc>, source: &str) -> Result { + if let Err(err) = syntax::Parser::new().parse(source) { + return Err(Error::Syntax(err.to_string())); + } + + Ok(CachedRegex::new_unchecked(cache, source)) + } + + /// Create a new cached `Regex` for the given source, without checking if the + /// syntax is valid. + /// + /// Only use this if you know that the syntax is valid or you are ready to + /// handle potential syntax errors later on. + pub fn new_unchecked(cache: Arc>, source: &str) -> CachedRegex { + CachedRegex::from(CachedRegexBuilder::new(cache, source)) + } + + fn from(builder: CachedRegexBuilder) -> Self { + CachedRegex { builder } + } + + /// Refer to `Regex::bytes::is_match`. + pub fn is_match(&self, text: &[u8]) -> bool { + regex!(self).is_match(text) + } + + /// Refer to `Regex::find`. + pub fn find<'t>(&self, text: &'t [u8]) -> Option> { + regex!(self).find(text) + } + + /// Refer to `Regex::bytes::captures`. + pub fn captures<'t>(&self, text: &'t [u8]) -> Option> { + regex!(self).captures(text) + } + + /// Refer to `Regex::replace`. + pub fn replace<'t, R: Replacer>(&self, text: &'t [u8], rep: R) -> Cow<'t, [u8]> { + regex!(self).replace(text, rep) + } + + /// Refer to `Regex::bytes::replace_all`. + pub fn replace_all<'t, R: Replacer>(&self, text: &'t [u8], rep: R) -> Cow<'t, [u8]> { + regex!(self).replace_all(text, rep) + } + + /// Refer to `Regex::shortest_match`. 
+ pub fn shortest_match(&self, text: &[u8]) -> Option { + regex!(self).shortest_match(text) + } + + pub fn captures_len(&self) -> usize { + regex!(self).captures_len() + } + + pub fn as_str(&self) -> &str { + &self.builder.source + } +} + +impl fmt::Debug for CachedRegex { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Debug::fmt(regex!(self), f) + } +} + +impl fmt::Display for CachedRegex { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Display::fmt(regex!(self), f) + } +} + +/// A configurable builder for a cached `Regex`. +#[derive(Clone, Debug)] +pub struct CachedRegexBuilder { + cache: Arc>, + source: String, + options: Options, +} + +impl CachedRegexBuilder { + /// Create a new regular expression builder with the given pattern. + /// + /// If the pattern is invalid, then an error will be returned when + /// `build` is called. + pub fn new(cache: Arc>, source: &str) -> CachedRegexBuilder { + CachedRegexBuilder { + cache, + source: source.to_owned(), + options: Default::default(), + } + } + + /// Consume the builder and compile the regular expression. + /// + /// Note that calling `as_str` on the resulting `Regex` will produce the + /// pattern given to `new` verbatim. Notably, it will not incorporate any + /// of the flags set on this builder. + pub fn build(&self) -> Result { + if let Err(err) = syntax::Parser::new().parse(&self.source) { + return Err(Error::Syntax(err.to_string())); + } + + Ok(CachedRegex::from(self.clone())) + } + + /// Consume the builder and compile the regular expression without checking + /// if the syntax is valid. + /// + /// Only use this if you know that the syntax is valid or you are ready to + /// handle potential syntax errors later on. + /// + /// Note that calling `as_str` on the resulting `Regex` will produce the + /// pattern given to `new` verbatim. Notably, it will not incorporate any + /// of the flags set on this builder. 
+ pub fn build_unchecked(&self) -> CachedRegex { + CachedRegex::from(self.clone()) + } + + /// Set the value for the case insensitive (`i`) flag. + pub fn case_insensitive(&mut self, yes: bool) -> &mut CachedRegexBuilder { + self.options.case_insensitive = yes; + self + } + + /// Set the value for the multi-line matching (`m`) flag. + pub fn multi_line(&mut self, yes: bool) -> &mut CachedRegexBuilder { + self.options.multi_line = yes; + self + } + + /// Set the value for the any character (`s`) flag, where in `.` matches + /// anything when `s` is set and matches anything except for new line when + /// it is not set (the default). + /// + /// N.B. "matches anything" means "any byte" for `regex::bytes::Regex` + /// expressions and means "any Unicode scalar value" for `regex::Regex` + /// expressions. + pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut CachedRegexBuilder { + self.options.dot_matches_new_line = yes; + self + } + + /// Set the value for the greedy swap (`U`) flag. + pub fn swap_greed(&mut self, yes: bool) -> &mut CachedRegexBuilder { + self.options.swap_greed = yes; + self + } + + /// Set the value for the ignore whitespace (`x`) flag. + pub fn ignore_whitespace(&mut self, yes: bool) -> &mut CachedRegexBuilder { + self.options.ignore_whitespace = yes; + self + } + + /// Set the value for the Unicode (`u`) flag. + pub fn unicode(&mut self, yes: bool) -> &mut CachedRegexBuilder { + self.options.unicode = yes; + self + } + + /// Set the approximate size limit of the compiled regular expression. + /// + /// This roughly corresponds to the number of bytes occupied by a single + /// compiled program. If the program exceeds this number, then a + /// compilation error is returned. + pub fn size_limit(&mut self, limit: usize) -> &mut CachedRegexBuilder { + self.options.size_limit = limit; + self + } + + /// Set the approximate size of the cache used by the DFA. 
+ /// + /// This roughly corresponds to the number of bytes that the DFA will + /// use while searching. + /// + /// Note that this is a *per thread* limit. There is no way to set a global + /// limit. In particular, if a regex is used from multiple threads + /// simultaneously, then each thread may use up to the number of bytes + /// specified here. + pub fn dfa_size_limit(&mut self, limit: usize) -> &mut CachedRegexBuilder { + self.options.dfa_size_limit = limit; + self + } +} + +#[cfg(test)] +mod test { + use super::{CachedRegex, RegexCache}; + use std::sync::{Arc, Mutex}; + + #[test] + fn respects_limit() { + let mut cache = RegexCache::new(2); + + cache.compile("[01]2").unwrap(); + cache.compile("[21]0").unwrap(); + + assert_eq!(cache.len(), 2); + cache.compile("[21]3").unwrap(); + assert_eq!(cache.len(), 2); + } + + #[test] + fn cached_regex() { + let cache = Arc::new(Mutex::new(RegexCache::new(100))); + let re = CachedRegex::new(cache.clone(), r"^\d+$").unwrap(); + + assert!(re.is_match("123".as_bytes())); + assert!(!re.is_match("abc".as_bytes())); + } +} diff --git a/src/bytes/lazy.rs b/src/bytes/lazy.rs new file mode 100644 index 0000000..ca22e20 --- /dev/null +++ b/src/bytes/lazy.rs @@ -0,0 +1,268 @@ +// Copyright 2017 1aim GmbH +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of +// this software and associated documentation files (the "Software"), to deal in +// the Software without restriction, including without limitation the rights to +// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +// of the Software, and to permit persons to whom the Software is furnished to do +// so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +use std::fmt; +use std::ops::Deref; +use std::str; + +use oncemutex::OnceMutex; +use std::sync::Arc; + +use crate::bytes::options::Options; +use crate::syntax; +use regex::bytes::{Regex, RegexBuilder}; +use regex::Error; + +/// A lazily created `Regex`. +/// +/// At the first `Deref` the given source will be compiled and saved in the +/// Local Thread Storage, thus avoiding locking. +/// +/// # Example +/// +/// Find the location of a US phone number: +/// +/// ``` +/// # use regex_cache::bytes::LazyRegex; +/// let re = LazyRegex::new("[0-9]{3}-[0-9]{3}-[0-9]{4}").unwrap(); +/// let m = re.find("phone: 111-222-3333".as_bytes()).unwrap(); +/// assert_eq!((m.start(), m.end()), (7, 19)); +/// ``` +#[derive(Clone)] +pub struct LazyRegex { + builder: LazyRegexBuilder, + regex: Arc>>, +} + +impl LazyRegex { + /// Create a new lazy `Regex` for the given source, checking the syntax is + /// valid. 
+ pub fn new(source: &str) -> Result { + if let Err(err) = syntax::Parser::new().parse(source) { + return Err(Error::Syntax(err.to_string())); + } + + Ok(LazyRegex::from(LazyRegexBuilder::new(source))) + } + + fn from(builder: LazyRegexBuilder) -> Self { + LazyRegex { + builder, + regex: Arc::new(OnceMutex::new(None)), + } + } + + fn create(builder: &LazyRegexBuilder) -> Regex { + builder + .options + .define(&mut RegexBuilder::new(&builder.source)) + .build() + .unwrap() + } +} + +impl Deref for LazyRegex { + type Target = Regex; + + fn deref(&self) -> &Regex { + self.as_ref() + } +} + +impl AsRef for LazyRegex { + fn as_ref(&self) -> &Regex { + if let Some(mut guard) = self.regex.lock() { + *guard = Some(LazyRegex::create(&self.builder)); + } + + (*self.regex).as_ref().unwrap() + } +} + +impl Into for LazyRegex { + fn into(self) -> Regex { + let (regex, builder) = (self.regex, self.builder); + + Arc::try_unwrap(regex) + .ok() + .and_then(|m| m.into_inner()) + .unwrap_or_else(|| LazyRegex::create(&builder)) + } +} + +impl fmt::Debug for LazyRegex { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Debug::fmt(&**self, f) + } +} + +impl fmt::Display for LazyRegex { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Display::fmt(&**self, f) + } +} + +impl str::FromStr for LazyRegex { + type Err = Error; + + fn from_str(s: &str) -> Result { + LazyRegex::new(s) + } +} + +/// A configurable builder for a lazy `Regex`. +#[derive(Clone, Eq, PartialEq, Debug)] +pub struct LazyRegexBuilder { + source: String, + options: Options, +} + +impl LazyRegexBuilder { + /// Create a new regular expression builder with the given pattern. + /// + /// If the pattern is invalid, then an error will be returned when + /// `build` is called. + pub fn new(source: &str) -> LazyRegexBuilder { + LazyRegexBuilder { + source: source.to_owned(), + options: Default::default(), + } + } + + /// Consume the builder and compile the regular expression. 
+ /// + /// Note that calling `as_str` on the resulting `Regex` will produce the + /// pattern given to `new` verbatim. Notably, it will not incorporate any + /// of the flags set on this builder. + pub fn build(&self) -> Result { + if let Err(err) = syntax::Parser::new().parse(&self.source) { + return Err(Error::Syntax(err.to_string())); + } + + Ok(LazyRegex::from(self.clone())) + } + + /// Set the value for the case insensitive (`i`) flag. + pub fn case_insensitive(&mut self, yes: bool) -> &mut LazyRegexBuilder { + self.options.case_insensitive = yes; + self + } + + /// Set the value for the multi-line matching (`m`) flag. + pub fn multi_line(&mut self, yes: bool) -> &mut LazyRegexBuilder { + self.options.multi_line = yes; + self + } + + /// Set the value for the any character (`s`) flag, where in `.` matches + /// anything when `s` is set and matches anything except for new line when + /// it is not set (the default). + /// + /// N.B. "matches anything" means "any byte" for `regex::bytes::Regex` + /// expressions and means "any Unicode scalar value" for `regex::Regex` + /// expressions. + pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut LazyRegexBuilder { + self.options.dot_matches_new_line = yes; + self + } + + /// Set the value for the greedy swap (`U`) flag. + pub fn swap_greed(&mut self, yes: bool) -> &mut LazyRegexBuilder { + self.options.swap_greed = yes; + self + } + + /// Set the value for the ignore whitespace (`x`) flag. + pub fn ignore_whitespace(&mut self, yes: bool) -> &mut LazyRegexBuilder { + self.options.ignore_whitespace = yes; + self + } + + /// Set the value for the Unicode (`u`) flag. + pub fn unicode(&mut self, yes: bool) -> &mut LazyRegexBuilder { + self.options.unicode = yes; + self + } + + /// Set the approximate size limit of the compiled regular expression. + /// + /// This roughly corresponds to the number of bytes occupied by a single + /// compiled program. 
If the program exceeds this number, then a + /// compilation error is returned. + pub fn size_limit(&mut self, limit: usize) -> &mut LazyRegexBuilder { + self.options.size_limit = limit; + self + } + + /// Set the approximate size of the cache used by the DFA. + /// + /// This roughly corresponds to the number of bytes that the DFA will + /// use while searching. + /// + /// Note that this is a *per thread* limit. There is no way to set a global + /// limit. In particular, if a regex is used from multiple threads + /// simultaneously, then each thread may use up to the number of bytes + /// specified here. + pub fn dfa_size_limit(&mut self, limit: usize) -> &mut LazyRegexBuilder { + self.options.dfa_size_limit = limit; + self + } +} + +#[cfg(test)] +mod test { + use super::{LazyRegex, LazyRegexBuilder}; + + #[test] + fn new() { + assert!(LazyRegex::new(r"^\d+$") + .unwrap() + .is_match("2345".as_bytes())); + + assert!(!LazyRegex::new(r"^[a-z]+$") + .unwrap() + .is_match("2345".as_bytes())); + } + + #[test] + fn build() { + assert!(LazyRegexBuilder::new(r"^abc$") + .case_insensitive(true) + .build() + .unwrap() + .is_match("ABC".as_bytes())); + + assert!(!LazyRegexBuilder::new(r"^abc$") + .case_insensitive(false) + .build() + .unwrap() + .is_match("ABC".as_bytes())); + } + + #[test] + fn same() { + let re = LazyRegex::new(r"^\d+$").unwrap(); + + assert!(re.is_match("1234".as_bytes())); + assert!(re.is_match("1234".as_bytes())); + assert!(re.is_match("1234".as_bytes())); + assert!(re.is_match("1234".as_bytes())); + } +} diff --git a/src/bytes/options.rs b/src/bytes/options.rs new file mode 100644 index 0000000..bed6d5f --- /dev/null +++ b/src/bytes/options.rs @@ -0,0 +1,62 @@ +// Copyright 2017 1aim GmbH +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of +// this software and associated documentation files (the "Software"), to deal in +// the Software without restriction, including without limitation the rights to +// use, copy, 
modify, merge, publish, distribute, sublicense, and/or sell copies +// of the Software, and to permit persons to whom the Software is furnished to do +// so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +use regex::bytes::RegexBuilder; + +#[derive(Copy, Clone, Eq, PartialEq, Debug)] +pub struct Options { + pub case_insensitive: bool, + pub multi_line: bool, + pub dot_matches_new_line: bool, + pub swap_greed: bool, + pub ignore_whitespace: bool, + pub unicode: bool, + pub size_limit: usize, + pub dfa_size_limit: usize, +} + +impl Default for Options { + fn default() -> Self { + Options { + case_insensitive: false, + multi_line: false, + dot_matches_new_line: false, + swap_greed: false, + ignore_whitespace: false, + unicode: true, + size_limit: 10 * (1 << 20), + dfa_size_limit: 2 * (1 << 20), + } + } +} + +impl Options { + pub fn define<'b>(&self, builder: &'b mut RegexBuilder) -> &'b mut RegexBuilder { + builder + .case_insensitive(self.case_insensitive) + .multi_line(self.multi_line) + .dot_matches_new_line(self.dot_matches_new_line) + .swap_greed(self.swap_greed) + .ignore_whitespace(self.ignore_whitespace) + .unicode(self.unicode) + .size_limit(self.size_limit) + .dfa_size_limit(self.dfa_size_limit) + } +} diff --git a/src/lib.rs b/src/lib.rs index f1122c7..8e5dc37 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -35,3 +35,5 @@ pub use 
crate::cache::{CachedRegex, CachedRegexBuilder, RegexCache}; mod lazy; pub use crate::lazy::{LazyRegex, LazyRegexBuilder}; + +pub mod bytes; From 1e3de010c9612d5fff3e566780260585537acf52 Mon Sep 17 00:00:00 2001 From: veeshi <31014797+veeshi@users.noreply.github.com> Date: Fri, 4 Feb 2022 12:37:40 +0000 Subject: [PATCH 3/4] Dependency upgrade --- Cargo.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 2da72f5..e78dff8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,7 @@ keywords = ["regex", "lazy", "cache"] readme = "README.md" [dependencies] -regex = "1.4" -regex-syntax = "0.6" -lru-cache = "0.1" -oncemutex = "0.1" +regex = "1.5.4" +regex-syntax = "0.6.25" +lru-cache = "0.1.2" +oncemutex = "0.1.1" From c518663ac4d1cb90a1b8aa7544c074b66c656f8e Mon Sep 17 00:00:00 2001 From: veeshi <31014797+veeshi@users.noreply.github.com> Date: Tue, 7 May 2024 23:54:51 +0100 Subject: [PATCH 4/4] bump deps and clean up clippy lints --- Cargo.toml | 6 +++--- src/bytes/cache.rs | 11 ++++++----- src/bytes/lazy.rs | 20 +++++++++++--------- src/cache.rs | 11 +++++------ src/lazy.rs | 15 +++++++-------- src/lib.rs | 13 +++---------- 6 files changed, 35 insertions(+), 41 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index e78dff8..4b197e9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "regex-cache" version = "0.2.1" -edition = '2018' +edition = '2021' authors = ["meh. 
"] license = "MIT" @@ -12,7 +12,7 @@ keywords = ["regex", "lazy", "cache"] readme = "README.md" [dependencies] -regex = "1.5.4" -regex-syntax = "0.6.25" +regex = "1.10" +regex-syntax = "0.8" lru-cache = "0.1.2" oncemutex = "0.1.1" diff --git a/src/bytes/cache.rs b/src/bytes/cache.rs index d17f194..d7ba327 100644 --- a/src/bytes/cache.rs +++ b/src/bytes/cache.rs @@ -24,12 +24,13 @@ use std::ops::{Deref, DerefMut}; use std::str; use std::sync::{Arc, Mutex}; -use crate::bytes::options::Options; -use crate::lru::LruCache; -use crate::syntax; +use lru_cache::LruCache; + use regex::bytes::{Captures, Match, Regex, RegexBuilder, Replacer}; use regex::Error; +use crate::bytes::options::Options; + /// An LRU cache for regular expressions. #[derive(Clone, Debug)] pub struct RegexCache(LruCache); @@ -147,7 +148,7 @@ impl CachedRegex { /// Create a new cached `Regex` for the given source, checking the syntax is /// valid. pub fn new(cache: Arc>, source: &str) -> Result { - if let Err(err) = syntax::Parser::new().parse(source) { + if let Err(err) = regex_syntax::Parser::new().parse(source) { return Err(Error::Syntax(err.to_string())); } @@ -245,7 +246,7 @@ impl CachedRegexBuilder { /// pattern given to `new` verbatim. Notably, it will not incorporate any /// of the flags set on this builder. pub fn build(&self) -> Result { - if let Err(err) = syntax::Parser::new().parse(&self.source) { + if let Err(err) = regex_syntax::Parser::new().parse(&self.source) { return Err(Error::Syntax(err.to_string())); } diff --git a/src/bytes/lazy.rs b/src/bytes/lazy.rs index ca22e20..7c5191a 100644 --- a/src/bytes/lazy.rs +++ b/src/bytes/lazy.rs @@ -21,14 +21,16 @@ use std::fmt; use std::ops::Deref; use std::str; +use std::sync::Arc; use oncemutex::OnceMutex; -use std::sync::Arc; + +use regex::{ + bytes::{Regex, RegexBuilder}, + Error, +}; use crate::bytes::options::Options; -use crate::syntax; -use regex::bytes::{Regex, RegexBuilder}; -use regex::Error; /// A lazily created `Regex`. 
/// @@ -55,7 +57,7 @@ impl LazyRegex { /// Create a new lazy `Regex` for the given source, checking the syntax is /// valid. pub fn new(source: &str) -> Result { - if let Err(err) = syntax::Parser::new().parse(source) { + if let Err(err) = regex_syntax::Parser::new().parse(source) { return Err(Error::Syntax(err.to_string())); } @@ -96,9 +98,9 @@ impl AsRef for LazyRegex { } } -impl Into for LazyRegex { - fn into(self) -> Regex { - let (regex, builder) = (self.regex, self.builder); +impl From for Regex { + fn from(val: LazyRegex) -> Self { + let (regex, builder) = (val.regex, val.builder); Arc::try_unwrap(regex) .ok() @@ -152,7 +154,7 @@ impl LazyRegexBuilder { /// pattern given to `new` verbatim. Notably, it will not incorporate any /// of the flags set on this builder. pub fn build(&self) -> Result { - if let Err(err) = syntax::Parser::new().parse(&self.source) { + if let Err(err) = regex_syntax::Parser::new().parse(&self.source) { return Err(Error::Syntax(err.to_string())); } diff --git a/src/cache.rs b/src/cache.rs index 2570a3e..d3e19a2 100644 --- a/src/cache.rs +++ b/src/cache.rs @@ -24,11 +24,10 @@ use std::ops::{Deref, DerefMut}; use std::str; use std::sync::{Arc, Mutex}; -use crate::lru::LruCache; +use lru_cache::LruCache; +use regex::{Captures, Error, Match, Regex, RegexBuilder, Replacer}; + use crate::options::Options; -use crate::syntax; -use regex::{Captures, Match, Replacer}; -use regex::{Error, Regex, RegexBuilder}; /// An LRU cache for regular expressions. #[derive(Clone, Debug)] @@ -147,7 +146,7 @@ impl CachedRegex { /// Create a new cached `Regex` for the given source, checking the syntax is /// valid. pub fn new(cache: Arc>, source: &str) -> Result { - if let Err(err) = syntax::Parser::new().parse(source) { + if let Err(err) = regex_syntax::Parser::new().parse(source) { return Err(Error::Syntax(err.to_string())); } @@ -245,7 +244,7 @@ impl CachedRegexBuilder { /// pattern given to `new` verbatim. 
Notably, it will not incorporate any /// of the flags set on this builder. pub fn build(&self) -> Result { - if let Err(err) = syntax::Parser::new().parse(&self.source) { + if let Err(err) = regex_syntax::Parser::new().parse(&self.source) { return Err(Error::Syntax(err.to_string())); } diff --git a/src/lazy.rs b/src/lazy.rs index d43e367..cba4d67 100644 --- a/src/lazy.rs +++ b/src/lazy.rs @@ -21,13 +21,12 @@ use std::fmt; use std::ops::Deref; use std::str; +use std::sync::Arc; use oncemutex::OnceMutex; -use std::sync::Arc; +use regex::{Error, Regex, RegexBuilder}; use crate::options::Options; -use crate::syntax; -use regex::{Error, Regex, RegexBuilder}; /// A lazily created `Regex`. /// @@ -54,7 +53,7 @@ impl LazyRegex { /// Create a new lazy `Regex` for the given source, checking the syntax is /// valid. pub fn new(source: &str) -> Result { - if let Err(err) = syntax::Parser::new().parse(source) { + if let Err(err) = regex_syntax::Parser::new().parse(source) { return Err(Error::Syntax(err.to_string())); } @@ -95,9 +94,9 @@ impl AsRef for LazyRegex { } } -impl Into for LazyRegex { - fn into(self) -> Regex { - let (regex, builder) = (self.regex, self.builder); +impl From for Regex { + fn from(val: LazyRegex) -> Self { + let (regex, builder) = (val.regex, val.builder); Arc::try_unwrap(regex) .ok() @@ -151,7 +150,7 @@ impl LazyRegexBuilder { /// pattern given to `new` verbatim. Notably, it will not incorporate any /// of the flags set on this builder. pub fn build(&self) -> Result { - if let Err(err) = syntax::Parser::new().parse(&self.source) { + if let Err(err) = regex_syntax::Parser::new().parse(&self.source) { return Err(Error::Syntax(err.to_string())); } diff --git a/src/lib.rs b/src/lib.rs index 8e5dc37..3f55069 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -21,19 +21,12 @@ //! This crate provides a library for caching or lazily creating regular //! expressions. 
-extern crate lru_cache as lru; -extern crate oncemutex; -extern crate regex; -extern crate regex_syntax as syntax; - pub use regex::{Error, Regex, RegexBuilder}; +pub mod bytes; +mod cache; +mod lazy; mod options; -mod cache; pub use crate::cache::{CachedRegex, CachedRegexBuilder, RegexCache}; - -mod lazy; pub use crate::lazy::{LazyRegex, LazyRegexBuilder}; - -pub mod bytes;