Add microbenchmarks to lib/datadog/grok

With reference to #10144, and in light of #11849, we now understand that http -> pipelines -> blackhole is significantly bottlenecked in datadog-grok. Unfortunately, most of our data indicates that regex is the prime pain point. This commit does two things: it introduces micro-benchmarks for `datadog_grok::filters::keyvalue::apply_filter` -- unfortunately exposing `datadog_grok::filters` from the crate so that we can benchmark it -- and it improves the performance of said function by 40% in the micro-benchmark when there is a field delimiter in place. Specifically, we remove the need for nom-regex and avoid cloning a `regex::Regex` instance for each key and each value in a field. Signed-off-by: Brian L. Troutwine <brian@troutwine.us>
vectordotdev · Apr 12, 2022 · 885c598 · 885c598
1 parent a7191c8
commit 885c598
Show file tree

Hide file tree

Showing 7 changed files with 127 additions and 24 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/lib/datadog/grok/Cargo.toml b/lib/datadog/grok/Cargo.toml
@@ -11,8 +11,7 @@ chrono = { version = "0.4.19", default-features = false }
 chrono-tz = { version = "0.6.1", default-features = false }
 itertools = { version = "0.10.3", default-features = false }
 lalrpop-util = { version = "0.19", default-features = false }
-nom = { version = "7.1.1", default-features = false }
-nom-regex = { version = "0.2.0", default-features = false }
+nom = { version = "7.1.1", default-features = false, features = ["std"] }
 once_cell = { version = "1.10", default-features = false, features = ["std", "parking_lot"] }
 onig = { version = "6.3", default-features = false }
 ordered-float = { version = "2", default-features = false }
@@ -32,6 +31,12 @@ vrl-compiler = { path = "../../vrl/compiler" }
 
 [dev-dependencies]
 vrl-compiler = { path = "../../vrl/compiler", features = ["test"] }
+criterion = { version = "0.3.5" }
 
 [build-dependencies]
 lalrpop = { version = "0.19.7", default-features = false }
+
+[[bench]]
+name = "filters"
+path = "benches/filters/main.rs"
+harness = false
diff --git a/lib/datadog/grok/benches/filters/keyvalue.rs b/lib/datadog/grok/benches/filters/keyvalue.rs
@@ -0,0 +1,77 @@
+use std::time::Duration;
+
+use bytes::Bytes;
+use criterion::{
+    criterion_group, measurement::WallTime, BatchSize, BenchmarkGroup, Criterion, SamplingMode,
+};
+use datadog_grok::filters::keyvalue::{apply_filter, KeyValueFilter};
+use regex::Regex;
+use value::Value;
+
+fn apply_filter_bench(c: &mut Criterion) {
+    let mut group: BenchmarkGroup<WallTime> =
+        c.benchmark_group("datadog_grok::filters::keyvalue::apply_filter");
+    group.sampling_mode(SamplingMode::Auto);
+
+    group.bench_function("apply_filter key=valueStr", move |b| {
+        b.iter_batched(
+            || {
+                let value = Value::Bytes(Bytes::from("key=valueStr"));
+                let filter = KeyValueFilter {
+                    key_value_delimiter: "=".into(),
+                    value_re: Regex::new(r"^[\w.\-_@]+").unwrap(),
+                    quotes: vec![('"', '"'), ('\'', '\''), ('<', '>')],
+                    field_delimiters: [" ", ",", ";"]
+                        .iter()
+                        .map(|x| String::from(*x))
+                        .collect::<Vec<String>>(),
+                };
+                (value, filter)
+            },
+            |(value, filter): (Value, KeyValueFilter)| {
+                let _ = apply_filter(&value, &filter);
+            },
+            BatchSize::SmallInput,
+        )
+    });
+
+    group.bench_function("apply_filter key1=value1|key2=value2", move |b| {
+        b.iter_batched(
+            || {
+                let value = Value::Bytes(Bytes::from("key1=value1|key2=value2"));
+                let filter = KeyValueFilter {
+                    key_value_delimiter: "=".into(),
+                    value_re: Regex::new(r"^[\w.\-_@]+").unwrap(),
+                    quotes: vec![('"', '"'), ('\'', '\''), ('<', '>')],
+                    field_delimiters: ["|"]
+                        .iter()
+                        .map(|x| String::from(*x))
+                        .collect::<Vec<String>>(),
+                };
+                (value, filter)
+            },
+            |(value, filter): (Value, KeyValueFilter)| {
+                let _ = apply_filter(&value, &filter);
+            },
+            BatchSize::SmallInput,
+        )
+    });
+}
+
+criterion_group!(
+    name = benches;
+    config = Criterion::default()
+        .warm_up_time(Duration::from_secs(5))
+        .measurement_time(Duration::from_secs(120))
+        // degree of noise to ignore in measurements, here 1%
+        .noise_threshold(0.01)
+        // likelihood of noise registering as difference, here 5%
+        .significance_level(0.05)
+        // likelihood of capturing the true runtime, here 95%
+        .confidence_level(0.95)
+        // total number of bootstrap resamples, higher is less noisy but slower
+        .nresamples(100_000)
+        // total samples to collect within the set measurement time
+        .sample_size(150);
+    targets = apply_filter_bench
+);
diff --git a/lib/datadog/grok/benches/filters/main.rs b/lib/datadog/grok/benches/filters/main.rs
@@ -0,0 +1,5 @@
+use criterion::criterion_main;
+
+mod keyvalue;
+
+criterion_main!(keyvalue::benches);
diff --git a/lib/datadog/grok/src/filters/keyvalue.rs b/lib/datadog/grok/src/filters/keyvalue.rs
@@ -1,6 +1,3 @@
-use std::collections::BTreeMap;
-use std::fmt::Formatter;
-
 use crate::{
     ast::{Function, FunctionArgument},
     grok_filter::GrokFilter,
@@ -18,14 +15,18 @@ use nom::{
     multi::{many_m_n, separated_list1},
     number::complete::double,
     sequence::{delimited, preceded, terminated, tuple},
-    IResult,
+    IResult, Slice,
 };
-use nom_regex::str::re_find;
+use once_cell::sync::Lazy;
 use ordered_float::NotNan;
 use regex::Regex;
+use std::collections::BTreeMap;
+use std::fmt::Formatter;
 use tracing::warn;
 use vrl_compiler::{Target, Value};
 
+static DEFAULT_FILTER_RE: Lazy<regex::Regex> = Lazy::new(|| Regex::new(r"^[\w.\-_@]*").unwrap());
+
 pub fn filter_from_function(f: &Function) -> Result<GrokFilter, GrokStaticError> {
     {
         let args_len = f.args.as_ref().map_or(0, |args| args.len());
@@ -55,7 +56,7 @@ pub fn filter_from_function(f: &Function) -> Result<GrokFilter, GrokStaticError>
             }
         } else {
             // default allowed unescaped symbols
-            Regex::new(r"^[\w.\-_@]*").unwrap()
+            DEFAULT_FILTER_RE.clone()
         };
 
         let quotes = if args_len > 2 {
@@ -161,6 +162,7 @@ pub fn apply_filter(value: &Value, filter: &KeyValueFilter) -> Result<Value, Gro
 
 type SResult<'a, O> = IResult<&'a str, O, (&'a str, nom::error::ErrorKind)>;
 
+#[inline]
 fn parse<'a>(
     input: &'a str,
     key_value_delimiter: &'a str,
@@ -185,6 +187,7 @@ fn parse<'a>(
 }
 
 /// Parse the line as a separated list of key value pairs.
+#[inline]
 fn parse_line<'a>(
     input: &'a str,
     key_value_delimiter: &'a str,
@@ -259,6 +262,7 @@ fn parse_key_value<'a>(
 }
 
 /// Parses quoted strings.
+#[inline]
 fn parse_quoted<'a>(
     quotes: &'a (char, char),
     field_terminator: &'a str,
@@ -281,10 +285,12 @@ fn parse_quoted<'a>(
 }
 
 /// A delimited value is all the text until our field_delimiter, or the rest of the string if it is the last value in the line,
+#[inline]
 fn parse_delimited<'a>(field_delimiter: &'a str) -> impl Fn(&'a str) -> SResult<&'a str> {
     move |input| map(alt((take_until(field_delimiter), rest)), |s: &str| s.trim())(input)
 }
 
+#[inline]
 fn quoted<'a>(
     quotes: &'a [(char, char)],
     delimiter: &'a str,
@@ -301,13 +307,30 @@ fn quoted<'a>(
     }
 }
 
+fn re_find<'a, E>(re: &'a Regex) -> impl Fn(&'a str) -> IResult<&'a str, &'a str, E>
+where
+    E: nom::error::ParseError<&'a str>,
+{
+    move |i| {
+        if let Some(m) = re.find(i) {
+            Ok((i.slice(m.end()..), i.slice(m.start()..m.end())))
+        } else {
+            Err(nom::Err::Error(E::from_error_kind(
+                i,
+                nom::error::ErrorKind::RegexpFind,
+            )))
+        }
+    }
+}
+
 /// Parses an input while it matches a given regex, otherwise skips an input until the next field delimiter
+#[inline]
 fn match_re_or_empty<'a>(
     value_re: &'a Regex,
     field_delimiter: &'a str,
 ) -> impl Fn(&'a str) -> SResult<&'a str> {
     move |input| {
-        re_find::<'a, (&'a str, nom::error::ErrorKind)>(value_re.clone())(input)
+        re_find::<'a, (&'a str, nom::error::ErrorKind)>(value_re)(input)
             .or_else(|_| parse_delimited(field_delimiter)(input).map(|(rest, _v)| (rest, "")))
     }
 }
@@ -317,6 +340,7 @@ fn match_re_or_empty<'a>(
 ///
 /// 1. The value is quoted - parse until the end quote
 /// 2. Otherwise we parse until regex matches
+#[inline]
 fn parse_value<'a>(
     field_delimiter: &'a str,
     quotes: &'a [(char, char)],
@@ -382,7 +406,7 @@ fn parse_key<'a>(
     quotes: &'a [(char, char)],
     re: &'a Regex,
 ) -> impl Fn(&'a str) -> SResult<&'a str> {
-    move |input| alt((quoted(quotes, key_value_delimiter), re_find(re.to_owned())))(input)
+    move |input| alt((quoted(quotes, key_value_delimiter), re_find(re)))(input)
 }
 
 #[cfg(test)]

diff --git a/lib/datadog/grok/src/lib.rs b/lib/datadog/grok/src/lib.rs
@@ -5,7 +5,8 @@
 #![deny(unused_comparisons)]
 
 mod ast;
-mod filters;
+#[doc(hidden)]
+pub mod filters; // TODO Must be exposed for criterion. Perhaps we should pass a feature? Yuck.
 mod grok;
 mod grok_filter;
 mod lexer;

diff --git a/lib/datadog/grok/src/parse_grok.rs b/lib/datadog/grok/src/parse_grok.rs
@@ -6,8 +6,8 @@ use itertools::{
     FoldWhile::{Continue, Done},
     Itertools,
 };
+use std::collections::BTreeMap;
 use tracing::warn;
-use vector_common::btreemap;
 use vrl_compiler::{Target, Value};
 
 #[derive(thiserror::Error, Debug, PartialEq)]
@@ -43,7 +43,7 @@ pub fn parse_grok(
 /// - FailedToApplyFilter - matches the rule, but there was a runtime error while applying on of the filters
 /// - NoMatch - this rule does not match a given string
 fn apply_grok_rule(source: &str, grok_rule: &GrokRule, remove_empty: bool) -> Result<Value, Error> {
-    let mut parsed = Value::from(btreemap! {});
+    let mut parsed = Value::Object(BTreeMap::new());
 
     if let Some(ref matches) = grok_rule.pattern.match_against(source) {
         for (name, value) in matches.iter() {
@@ -115,6 +115,7 @@ mod tests {
 
     use super::*;
     use crate::parse_grok_rules::parse_grok_rules;
+    use vector_common::btreemap;
 
     #[test]
     fn parses_simple_grok() {