From 1f5a95030a1dedbd400ed79c85df98dffd00d54d Mon Sep 17 00:00:00 2001 From: 35V LG84 <35vlg84-x4e6b92@e257.fi> Date: Sun, 22 Dec 2024 15:28:58 +0200 Subject: [PATCH] tackler-rs: full haystack regex matcher GH-31 Signed-off-by: 35V LG84 <35vlg84-x4e6b92@e257.fi> --- tackler-rs/CRATES.md | 24 +++ tackler-rs/src/lib.rs | 3 + tackler-rs/src/regex.rs | 202 ++++++++++++++++++ tackler-rs/src/regex/serde.rs | 19 ++ .../src/regex/serde/full_haystack_matcher.rs | 116 ++++++++++ 5 files changed, 364 insertions(+) diff --git a/tackler-rs/CRATES.md b/tackler-rs/CRATES.md index 5e09ad8..e41df6d 100644 --- a/tackler-rs/CRATES.md +++ b/tackler-rs/CRATES.md @@ -9,6 +9,30 @@ The Rusty Services are assorted bits and pieces which are needed for Tackler, but won't fit into the domain of plain text accounting. +## Full haystack regex matchers + +By default, Rust's `regex::Regex::is_match` will test if there is a match for the regex [anywhere in the haystack](https://docs.rs/regex/latest/regex/struct.Regex.html#method.is_match) given. + +These constructors create a regex which will try to match against the full haystack by default. This logic is similar to [java.util.regex.Matcher.matches()](https://docs.oracle.com/en/java/javase/21/docs/api/java.base/java/util/regex/Matcher.html#matches()). + +```rust +tackler_rs::regex::{ + new_full_haystack_regex, + new_full_haystack_regex_set, + peeled_pattern, + peeled_patterns +} +``` + +### Serializers and Deserializers for full haystack matchers + +This is a serializer and deserializer implementation of the full haystack matcher for Serde. 
+ +```rust +tackler_rs::regex::serde::full_haystack_matcher +``` + + ## Tackler components on Crates.io * Tackler CLI application: [tackler](https://crates.io/crates/tackler) diff --git a/tackler-rs/src/lib.rs b/tackler-rs/src/lib.rs index 27e79b0..7297aae 100644 --- a/tackler-rs/src/lib.rs +++ b/tackler-rs/src/lib.rs @@ -30,6 +30,9 @@ use std::io::BufWriter; use std::path::{Path, PathBuf}; use walkdir::{DirEntry, WalkDir}; +/// Regex helpers providing full haystack matchers (similar to JDK `Matcher.matches()`) +pub mod regex; + /// /// Get full path based on /// directory, filename prefix, filename and extension diff --git a/tackler-rs/src/regex.rs b/tackler-rs/src/regex.rs index abcbd2a..d5d6914 100644 --- a/tackler-rs/src/regex.rs +++ b/tackler-rs/src/regex.rs @@ -48,3 +48,205 @@ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE * OR OTHER DEALINGS IN THE SOFTWARE. */ + +/// Serialization and Deserialization for full haystack regex matchers +pub mod serde; + +use regex::{Regex, RegexSet}; + +fn into_full_haystack_pattern(re: S) -> String +where + S: AsRef, +{ + format!("^(?:{})$", re.as_ref()) +} + +fn peel_full_haystack_pattern(re: &str) -> &str { + match re.strip_prefix("^(?:") { + Some(prefix_clean) => prefix_clean.strip_suffix(r")$").unwrap_or(re), + None => re, + } +} + +/// Compiles a full haystack regular expression +/// +/// This will augment (anchor) the given re so that it will match against +/// the full haystack. +/// +/// See `regex::Regex::new` for actual documentation of this method. 
+/// +/// See `peeled_pattern` for how to get back the original string +/// +/// # Examples +/// ```rust +/// # use std::error::Error; +/// use tackler_rs::regex::new_full_haystack_regex; +/// +/// let re_foo = new_full_haystack_regex("foo")?; +/// let re_bar = new_full_haystack_regex("bar")?; +/// +/// assert!(re_foo.is_match("foo")); +/// assert!(re_bar.is_match("bar")); +/// +/// assert!(!re_foo.is_match("foobar")); +/// assert!(!re_bar.is_match("foobar")); +/// # Ok::<(), Box>(()) +/// ``` +pub fn new_full_haystack_regex(re: &str) -> Result { + Regex::new(into_full_haystack_pattern(re).as_str()) +} + +/// Returns the original string of this regex. +/// # Examples +/// ```rust +/// # use std::error::Error; +/// use tackler_rs::regex::new_full_haystack_regex; +/// use tackler_rs::regex::peeled_pattern; +/// +/// let re_foo = new_full_haystack_regex(r"foo.*")?; +/// +/// assert_eq!(peeled_pattern(&re_foo), r"foo.*"); +/// # Ok::<(), Box>(()) +/// ``` +pub fn peeled_pattern(regex: &Regex) -> &str { + peel_full_haystack_pattern(regex.as_str()) +} + +/// Compiles a set of full haystack regular expressions +/// +/// This will augment (anchor) the given expressions so +/// that each of those will match against the full haystack. +/// +/// See `regex::RegexSet::new` for actual documentation of this method. 
+/// +/// See `peeled_patterns` for how to get back the original strings +/// +/// # Examples +/// ```rust +/// # use std::error::Error; +/// use tackler_rs::regex::new_full_haystack_regex_set; +/// +/// let re_set = new_full_haystack_regex_set(["foo", "bar"])?; +/// +/// assert!(re_set.is_match("foo")); +/// assert!(re_set.is_match("bar")); +/// +/// assert!(!re_set.is_match("foobar")); +/// assert!(!re_set.is_match("foobar")); +/// # Ok::<(), Box>(()) +/// ``` +pub fn new_full_haystack_regex_set(exprs: I) -> Result +where + S: AsRef, + I: IntoIterator, +{ + RegexSet::new(exprs.into_iter().map(|re| into_full_haystack_pattern(re))) +} + +/// Returns the peeled regex patterns that this regex set was constructed from. +/// +/// # Examples +/// ```rust +/// # use std::error::Error; +/// use tackler_rs::regex::new_full_haystack_regex_set; +/// use tackler_rs::regex::peeled_patterns; +/// +/// let re_set = new_full_haystack_regex_set(["foo", "bar"])?; +/// +/// assert_eq!(peeled_patterns(&re_set), vec!["foo", "bar"]); +/// # Ok::<(), Box>(()) +/// ``` +pub fn peeled_patterns(regex_set: &RegexSet) -> Vec { + regex_set + .patterns() + .iter() + .map(|re| peel_full_haystack_pattern(re).to_string()) + .collect::>() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_peel_full_haystack_pattern() { + assert_eq!(peel_full_haystack_pattern("abc"), "abc"); + assert_eq!(peel_full_haystack_pattern(".*"), ".*"); + assert_eq!(peel_full_haystack_pattern("(.*)"), "(.*)"); + assert_eq!(peel_full_haystack_pattern("^(?:.*)"), "^(?:.*)"); + assert_eq!(peel_full_haystack_pattern("(.*)$"), "(.*)$"); + assert_eq!(peel_full_haystack_pattern("^(?:.*)$"), ".*"); + } + + #[test] + fn test_full_haystack_pattern() { + let re = new_full_haystack_regex(r"o.a").unwrap(/*:test:*/); + assert_eq!(re.as_str(), r"^(?:o.a)$"); + + assert!(!re.is_match("foobar")); + assert!(!re.is_match("ooba")); + assert!(!re.is_match("obar")); + assert!(re.is_match("oba")); + } + + #[test] + fn 
test_full_haystack_pattern_anchored() { + let re = new_full_haystack_regex(r"^o.a$").unwrap(/*:test:*/); + assert_eq!(re.as_str(), r"^(?:^o.a$)$"); + + assert!(!re.is_match("foobar")); + assert!(!re.is_match("ooba")); + assert!(!re.is_match("obar")); + assert!(re.is_match("oba")); + } + + #[test] + fn test_full_haystack_pattern_peeled() { + let re_str = r"^(?:o.a)$"; + let re = new_full_haystack_regex(re_str).unwrap(/*:test:*/); + assert_eq!(re.as_str(), r"^(?:^(?:o.a)$)$"); + + assert!(!re.is_match("foobar")); + assert!(!re.is_match("ooba")); + assert!(!re.is_match("obar")); + assert!(re.is_match("oba")); + + assert_eq!(peeled_pattern(&re), re_str); + } + + #[test] + fn test_full_haystack_patterns() { + let re_set = new_full_haystack_regex_set([r".*foo", r"bar.*"]).unwrap(/*:test:*/); + assert_eq!(re_set.patterns(), [r"^(?:.*foo)$", r"^(?:bar.*)$"]); + + assert!(!re_set.is_match("foobar")); + assert!(re_set.is_match("foo")); + assert!(re_set.is_match("bar")); + } + + #[test] + fn test_full_haystack_patterns_anchored() { + let re_set = new_full_haystack_regex_set([r"^.*foo$", r"^bar.*$"]).unwrap(/*:test:*/); + assert_eq!(re_set.patterns(), [r"^(?:^.*foo$)$", r"^(?:^bar.*$)$"]); + + assert!(!re_set.is_match("foobar")); + assert!(re_set.is_match("foo")); + assert!(re_set.is_match("bar")); + } + + #[test] + fn test_full_haystack_patterns_peeled() { + let re_set_str = [r"^(?:.*foo)$", r"^(?:bar.*)$"]; + let re_set = new_full_haystack_regex_set(re_set_str).unwrap(/*:test:*/); + assert_eq!( + re_set.patterns(), + [r"^(?:^(?:.*foo)$)$", r"^(?:^(?:bar.*)$)$"] + ); + + assert!(!re_set.is_match("foobar")); + assert!(re_set.is_match("foo")); + assert!(re_set.is_match("bar")); + + assert_eq!(peeled_patterns(&re_set), re_set_str); + } +} diff --git a/tackler-rs/src/regex/serde.rs b/tackler-rs/src/regex/serde.rs index fe1e386..107776f 100644 --- a/tackler-rs/src/regex/serde.rs +++ b/tackler-rs/src/regex/serde.rs @@ -49,3 +49,22 @@ * OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +/// Full Haystack matcher serializer and deserializer +/// +/// # Example +/// +/// ```rust +/// use regex::Regex; +/// use serde::{Deserialize, Serialize}; +/// use tackler_rs::regex::serde::full_haystack_matcher; +/// +/// #[derive(Serialize, Deserialize)] +/// struct Account { +/// #[serde(with = "full_haystack_matcher")] +/// regex: Regex, +/// } +/// +/// # +/// # fn main() {} +/// ``` +pub mod full_haystack_matcher; diff --git a/tackler-rs/src/regex/serde/full_haystack_matcher.rs b/tackler-rs/src/regex/serde/full_haystack_matcher.rs index fe1e386..6161af6 100644 --- a/tackler-rs/src/regex/serde/full_haystack_matcher.rs +++ b/tackler-rs/src/regex/serde/full_haystack_matcher.rs @@ -49,3 +49,119 @@ * OR OTHER DEALINGS IN THE SOFTWARE. */ +// +// This code is based on: https://github.com/tailhook/serde-regex, +// which is licensed as Apache-2.0 OR MIT +// + +use regex::Regex; +use std::{ + borrow::Cow, + hash::Hash, + ops::{Deref, DerefMut}, +}; + +use crate::regex::{new_full_haystack_regex, peeled_pattern}; +use serde::{de::Error, Deserialize, Deserializer, Serialize, Serializer}; + +/// A wrapper type which implements `Serialize` and `Deserialize` for +/// types involving `Regex` +#[derive(Debug, Clone, Eq, Hash, PartialEq)] +pub struct Serde(pub T); + +impl<'de> Deserialize<'de> for Serde { + fn deserialize(d: D) -> Result, D::Error> + where + D: Deserializer<'de>, + { + let s = >::deserialize(d)?; + + match new_full_haystack_regex(s.as_ref()) { + Ok(regex) => Ok(Serde(regex)), + Err(err) => Err(D::Error::custom(err)), + } + } +} + +/// Deserialize function, see crate docs to see how to use it +pub fn deserialize<'de, T, D>(deserializer: D) -> Result +where + D: Deserializer<'de>, + Serde: Deserialize<'de>, +{ + Serde::deserialize(deserializer).map(|x| x.0) +} + +/// Serialize function, see crate docs to see how to use it +pub fn serialize(value: &T, serializer: S) -> Result +where + S: Serializer, + for<'a> Serde<&'a T>: Serialize, +{ + 
Serde(value).serialize(serializer) +} + +impl Deref for Serde { + type Target = T; + + fn deref(&self) -> &T { + &self.0 + } +} + +impl DerefMut for Serde { + fn deref_mut(&mut self) -> &mut T { + &mut self.0 + } +} + +impl Serde { + /// Consumes the `Serde`, returning the inner value. + pub fn into_inner(self) -> T { + self.0 + } +} + +impl From for Serde { + fn from(val: T) -> Serde { + Serde(val) + } +} + +impl Serialize for Serde<&Regex> { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + peeled_pattern(self.0).serialize(serializer) + } +} + +impl Serialize for Serde { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + peeled_pattern(&self.0).serialize(serializer) + } +} + +#[cfg(test)] +mod test { + use super::*; + + use crate::regex::into_full_haystack_pattern; + use regex::Regex; + use serde_json::{from_str, to_string}; + + const SAMPLE: &str = r#"[a-z"\]]+\d{1,10}""#; + const SAMPLE_JSON: &str = r#""[a-z\"\\]]+\\d{1,10}\"""#; + + #[test] + fn test_regex() { + let re: Serde = from_str(SAMPLE_JSON).unwrap(); + + assert_eq!(re.as_str(), into_full_haystack_pattern(SAMPLE)); + assert_eq!(to_string(&re).unwrap(), SAMPLE_JSON); + } +}