diff --git a/Cargo.lock b/Cargo.lock index 52a2c32516dd0a..2a31d199f0d42c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2048,12 +2048,12 @@ dependencies = [ "bytes 1.1.0", "chrono", "chrono-tz", + "criterion", "itertools", "lalrpop", "lalrpop-util", "lookup", "nom 7.1.1", - "nom-regex", "once_cell", "onig", "ordered-float", @@ -4852,16 +4852,6 @@ dependencies = [ "minimal-lexical", ] -[[package]] -name = "nom-regex" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72e5c7731c4c1370b61604ed52a2475e861aac9e08dec9f23903d4ddfdc91c18" -dependencies = [ - "nom 7.1.1", - "regex", -] - [[package]] name = "nonzero_ext" version = "0.3.0" diff --git a/lib/datadog/grok/Cargo.toml b/lib/datadog/grok/Cargo.toml index f42bcc4ad18216..7b976e5f964ad0 100644 --- a/lib/datadog/grok/Cargo.toml +++ b/lib/datadog/grok/Cargo.toml @@ -11,8 +11,7 @@ chrono = { version = "0.4.19", default-features = false } chrono-tz = { version = "0.6.1", default-features = false } itertools = { version = "0.10.3", default-features = false } lalrpop-util = { version = "0.19", default-features = false } -nom = { version = "7.1.1", default-features = false } -nom-regex = { version = "0.2.0", default-features = false } +nom = { version = "7.1.1", default-features = false, features = ["std"] } once_cell = { version = "1.10", default-features = false, features = ["std", "parking_lot"] } onig = { version = "6.3", default-features = false } ordered-float = { version = "2", default-features = false } @@ -32,6 +31,12 @@ vrl-compiler = { path = "../../vrl/compiler" } [dev-dependencies] vrl-compiler = { path = "../../vrl/compiler", features = ["test"] } +criterion = { version = "0.3.5" } [build-dependencies] lalrpop = { version = "0.19.7", default-features = false } + +[[bench]] +name = "filters" +path = "benches/filters/main.rs" +harness = false diff --git a/lib/datadog/grok/benches/filters/keyvalue.rs b/lib/datadog/grok/benches/filters/keyvalue.rs new file mode 100644 index 00000000000000..f1f7389347810a --- /dev/null +++ b/lib/datadog/grok/benches/filters/keyvalue.rs @@ -0,0 +1,77 @@ +use std::time::Duration; + +use bytes::Bytes; +use criterion::{ + criterion_group, measurement::WallTime, BatchSize, BenchmarkGroup, Criterion, SamplingMode, +}; +use datadog_grok::filters::keyvalue::{apply_filter, KeyValueFilter}; +use regex::Regex; +use value::Value; + +fn apply_filter_bench(c: &mut Criterion) { + let mut group: BenchmarkGroup = + c.benchmark_group("datadog_grok::filters::keyvalue::apply_filter"); + group.sampling_mode(SamplingMode::Auto); + + group.bench_function("apply_filter key=valueStr", move |b| { + b.iter_batched( + || { + let value = Value::Bytes(Bytes::from("key=valueStr")); + let filter = KeyValueFilter { + key_value_delimiter: "=".into(), + value_re: Regex::new(r"^[\w.\-_@]+").unwrap(), + quotes: vec![('"', '"'), ('\'', '\''), ('<', '>')], + field_delimiters: [" ", ",", ";"] + .iter() + .map(|x| String::from(*x)) + .collect::>(), + }; + (value, filter) + }, + |(value, filter): (Value, KeyValueFilter)| { + let _ = apply_filter(&value, &filter); + }, + BatchSize::SmallInput, + ) + }); + + group.bench_function("apply_filter key1=value1|key2=value2", move |b| { + b.iter_batched( + || { + let value = Value::Bytes(Bytes::from("key1=value1|key2=value2")); + let filter = KeyValueFilter { + key_value_delimiter: "=".into(), + value_re: Regex::new(r"^[\w.\-_@]+").unwrap(), + quotes: vec![('"', '"'), ('\'', '\''), ('<', '>')], + field_delimiters: ["|"] + .iter() + .map(|x| String::from(*x)) + .collect::>(), + }; + (value, filter) + }, + |(value, filter): (Value, KeyValueFilter)| { + let _ = apply_filter(&value, &filter); + }, + BatchSize::SmallInput, + ) + }); +} + +criterion_group!( + name = benches; + config = Criterion::default() + .warm_up_time(Duration::from_secs(5)) + .measurement_time(Duration::from_secs(120)) + // degree of noise to ignore in measurements, here 1% + .noise_threshold(0.01) + // likelihood of noise registering as difference, here 5% + .significance_level(0.05) + // likelihood of capturing the true runtime, here 95% + .confidence_level(0.95) + // total number of bootstrap resamples, higher is less noisy but slower + .nresamples(100_000) + // total samples to collect within the set measurement time + .sample_size(150); + targets = apply_filter_bench +); diff --git a/lib/datadog/grok/benches/filters/main.rs b/lib/datadog/grok/benches/filters/main.rs new file mode 100644 index 00000000000000..5d747eb8f16d25 --- /dev/null +++ b/lib/datadog/grok/benches/filters/main.rs @@ -0,0 +1,5 @@ +use criterion::criterion_main; + +mod keyvalue; + +criterion_main!(keyvalue::benches); diff --git a/lib/datadog/grok/src/filters/keyvalue.rs b/lib/datadog/grok/src/filters/keyvalue.rs index e06e0e1c91f2a2..453793a7fb714f 100644 --- a/lib/datadog/grok/src/filters/keyvalue.rs +++ b/lib/datadog/grok/src/filters/keyvalue.rs @@ -1,6 +1,3 @@ -use std::collections::BTreeMap; -use std::fmt::Formatter; - use crate::{ ast::{Function, FunctionArgument}, grok_filter::GrokFilter, @@ -18,14 +15,18 @@ use nom::{ multi::{many_m_n, separated_list1}, number::complete::double, sequence::{delimited, preceded, terminated, tuple}, - IResult, + IResult, Slice, }; -use nom_regex::str::re_find; +use once_cell::sync::Lazy; use ordered_float::NotNan; use regex::Regex; +use std::collections::BTreeMap; +use std::fmt::Formatter; use tracing::warn; use vrl_compiler::{Target, Value}; +static DEFAULT_FILTER_RE: Lazy = Lazy::new(|| Regex::new(r"^[\w.\-_@]*").unwrap()); + pub fn filter_from_function(f: &Function) -> Result { { let args_len = f.args.as_ref().map_or(0, |args| args.len()); @@ -55,7 +56,7 @@ pub fn filter_from_function(f: &Function) -> Result } } else { // default allowed unescaped symbols - Regex::new(r"^[\w.\-_@]*").unwrap() + DEFAULT_FILTER_RE.clone() }; let quotes = if args_len > 2 { @@ -161,6 +162,7 @@ pub fn apply_filter(value: &Value, filter: &KeyValueFilter) -> Result = IResult<&'a str, O, (&'a str, nom::error::ErrorKind)>; +#[inline] fn parse<'a>( input: &'a str, key_value_delimiter: &'a str, @@ -185,6 +187,7 @@ fn parse<'a>( } /// Parse the line as a separated list of key value pairs. +#[inline] fn parse_line<'a>( input: &'a str, key_value_delimiter: &'a str, @@ -259,6 +262,7 @@ fn parse_key_value<'a>( } /// Parses quoted strings. +#[inline] fn parse_quoted<'a>( quotes: &'a (char, char), field_terminator: &'a str, @@ -281,10 +285,12 @@ fn parse_quoted<'a>( } /// A delimited value is all the text until our field_delimiter, or the rest of the string if it is the last value in the line, +#[inline] fn parse_delimited<'a>(field_delimiter: &'a str) -> impl Fn(&'a str) -> SResult<&'a str> { move |input| map(alt((take_until(field_delimiter), rest)), |s: &str| s.trim())(input) } +#[inline] fn quoted<'a>( quotes: &'a [(char, char)], delimiter: &'a str, @@ -301,13 +307,30 @@ fn quoted<'a>( } } +fn re_find<'a, E>(re: &'a Regex) -> impl Fn(&'a str) -> IResult<&'a str, &'a str, E> +where + E: nom::error::ParseError<&'a str>, +{ + move |i| { + if let Some(m) = re.find(i) { + Ok((i.slice(m.end()..), i.slice(m.start()..m.end()))) + } else { + Err(nom::Err::Error(E::from_error_kind( + i, + nom::error::ErrorKind::RegexpFind, + ))) + } + } +} + /// Parses an input while it matches a given regex, otherwise skips an input until the next field delimiter +#[inline] fn match_re_or_empty<'a>( value_re: &'a Regex, field_delimiter: &'a str, ) -> impl Fn(&'a str) -> SResult<&'a str> { move |input| { - re_find::<'a, (&'a str, nom::error::ErrorKind)>(value_re.clone())(input) + re_find::<'a, (&'a str, nom::error::ErrorKind)>(value_re)(input) .or_else(|_| parse_delimited(field_delimiter)(input).map(|(rest, _v)| (rest, ""))) } } @@ -317,6 +340,7 @@ fn match_re_or_empty<'a>( /// /// 1. The value is quoted - parse until the end quote /// 2. Otherwise we parse until regex matches +#[inline] fn parse_value<'a>( field_delimiter: &'a str, quotes: &'a [(char, char)], @@ -382,7 +406,7 @@ fn parse_key<'a>( quotes: &'a [(char, char)], re: &'a Regex, ) -> impl Fn(&'a str) -> SResult<&'a str> { - move |input| alt((quoted(quotes, key_value_delimiter), re_find(re.to_owned())))(input) + move |input| alt((quoted(quotes, key_value_delimiter), re_find(re)))(input) } #[cfg(test)] diff --git a/lib/datadog/grok/src/lib.rs b/lib/datadog/grok/src/lib.rs index 34909a483965a7..004a7bd7eeaf43 100644 --- a/lib/datadog/grok/src/lib.rs +++ b/lib/datadog/grok/src/lib.rs @@ -5,7 +5,8 @@ #![deny(unused_comparisons)] mod ast; -mod filters; +#[doc(hidden)] +pub mod filters; // TODO Must be exposed for criterion. Perhaps we should pass a feature? Yuck. mod grok; mod grok_filter; mod lexer; diff --git a/lib/datadog/grok/src/parse_grok.rs b/lib/datadog/grok/src/parse_grok.rs index fe6dd0c4a92b47..845214c58f1c80 100644 --- a/lib/datadog/grok/src/parse_grok.rs +++ b/lib/datadog/grok/src/parse_grok.rs @@ -6,8 +6,8 @@ use itertools::{ FoldWhile::{Continue, Done}, Itertools, }; +use std::collections::BTreeMap; use tracing::warn; -use vector_common::btreemap; use vrl_compiler::{Target, Value}; #[derive(thiserror::Error, Debug, PartialEq)] @@ -43,7 +43,7 @@ pub fn parse_grok( /// - FailedToApplyFilter - matches the rule, but there was a runtime error while applying on of the filters /// - NoMatch - this rule does not match a given string fn apply_grok_rule(source: &str, grok_rule: &GrokRule, remove_empty: bool) -> Result { - let mut parsed = Value::from(btreemap! {}); + let mut parsed = Value::Object(BTreeMap::new()); if let Some(ref matches) = grok_rule.pattern.match_against(source) { for (name, value) in matches.iter() { @@ -115,6 +115,7 @@ mod tests { use super::*; use crate::parse_grok_rules::parse_grok_rules; + use vector_common::btreemap; #[test] fn parses_simple_grok() {