Add microbenchmarks to lib/datadog/grok
With reference to #10144 and in light of #11849 we now have an understanding
that http -> pipelines -> blackhole is significantly bottlenecked in
datadog-grok. Unfortunately, most of our data indicates that regex is the prime
pain point. This commit does two things: it introduces micro-benchmarks for
`datadog_grok::filters::keyvalue::apply_filter` -- unfortunately exposing
`datadog_grok::filters` from the crate so we can benchmark it -- and improves
the performance of said function by +40% in the micro-benchmark when a field
delimiter is in place. Specifically, we remove the need for nom-regex and avoid
cloning a `regex::Regex` instance for each key and each value in a field.

Signed-off-by: Brian L. Troutwine <brian@troutwine.us>
blt committed Apr 12, 2022
1 parent 4ad156f commit 4a09351
Showing 7 changed files with 127 additions and 24 deletions.
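
As the commit message describes, the win comes from dropping nom-regex -- whose `re_find` takes a `regex::Regex` by value, forcing a clone for every key and every value parsed -- in favour of a small combinator that merely borrows a pre-compiled regex. The snippet below is a minimal, self-contained sketch of that shape against nom 7 and regex (an illustration, not the crate's exact code; the real change is in the `keyvalue.rs` diff further down):

use nom::IResult;
use regex::Regex;

/// Succeed with the leading text matched by `re`; error without consuming input otherwise.
fn re_find<'a>(re: &'a Regex) -> impl Fn(&'a str) -> IResult<&'a str, &'a str> {
    move |input| match re.find(input) {
        Some(m) => Ok((&input[m.end()..], &input[m.start()..m.end()])),
        None => Err(nom::Err::Error(nom::error::Error::new(
            input,
            nom::error::ErrorKind::RegexpFind,
        ))),
    }
}

fn main() {
    // Compile once (the crate caches its default pattern behind `once_cell::sync::Lazy`);
    // the parser closure only holds a `&Regex`, so there is no per-token clone.
    let value_re = Regex::new(r"^[\w.\-_@]+").unwrap();
    let parse_value = re_find(&value_re);
    assert_eq!(parse_value("value1|rest"), Ok(("|rest", "value1")));
}
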
12 changes: 1 addition & 11 deletions Cargo.lock

Some generated files are not rendered by default.

9 changes: 7 additions & 2 deletions lib/datadog/grok/Cargo.toml
@@ -11,8 +11,7 @@ chrono = { version = "0.4.19", default-features = false }
chrono-tz = { version = "0.6.1", default-features = false }
itertools = { version = "0.10.3", default-features = false }
lalrpop-util = { version = "0.19", default-features = false }
nom = { version = "7.1.1", default-features = false }
nom-regex = { version = "0.2.0", default-features = false }
nom = { version = "7.1.1", default-features = false, features = ["std"] }
once_cell = { version = "1.10", default-features = false, features = ["std", "parking_lot"] }
onig = { version = "6.3", default-features = false }
ordered-float = { version = "2", default-features = false }
@@ -32,6 +31,12 @@ vrl-compiler = { path = "../../vrl/compiler" }

[dev-dependencies]
vrl-compiler = { path = "../../vrl/compiler", features = ["test"] }
criterion = { version = "0.3.5" }

[build-dependencies]
lalrpop = { version = "0.19.7", default-features = false }

[[bench]]
name = "filters"
path = "benches/filters/main.rs"
harness = false
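
With `harness = false` Cargo skips the default libtest harness and lets Criterion supply `main` (wired up through `criterion_main!` in `benches/filters/main.rs` below); the suite would then typically be run with something like `cargo bench --bench filters` from `lib/datadog/grok` -- the invocation is illustrative, not part of this commit.
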
77 changes: 77 additions & 0 deletions lib/datadog/grok/benches/filters/keyvalue.rs
@@ -0,0 +1,77 @@
use std::time::Duration;

use bytes::Bytes;
use criterion::{
criterion_group, measurement::WallTime, BatchSize, BenchmarkGroup, Criterion, SamplingMode,
};
use datadog_grok::filters::keyvalue::{apply_filter, KeyValueFilter};
use regex::Regex;
use value::Value;

fn apply_filter_bench(c: &mut Criterion) {
let mut group: BenchmarkGroup<WallTime> =
c.benchmark_group("datadog_grok::filters::keyvalue::apply_filter");
group.sampling_mode(SamplingMode::Auto);

group.bench_function("apply_filter key=valueStr", move |b| {
b.iter_batched(
|| {
let value = Value::Bytes(Bytes::from("key=valueStr"));
let filter = KeyValueFilter {
key_value_delimiter: "=".into(),
value_re: Regex::new(r"^[\w.\-_@]+").unwrap(),
quotes: vec![('"', '"'), ('\'', '\''), ('<', '>')],
field_delimiters: [" ", ",", ";"]
.iter()
.map(|x| String::from(*x))
.collect::<Vec<String>>(),
};
(value, filter)
},
|(value, filter): (Value, KeyValueFilter)| {
let _ = apply_filter(&value, &filter);
},
BatchSize::SmallInput,
)
});

group.bench_function("apply_filter key1=value1|key2=value2", move |b| {
b.iter_batched(
|| {
let value = Value::Bytes(Bytes::from("key1=value1|key2=value2"));
let filter = KeyValueFilter {
key_value_delimiter: "=".into(),
value_re: Regex::new(r"^[\w.\-_@]+").unwrap(),
quotes: vec![('"', '"'), ('\'', '\''), ('<', '>')],
field_delimiters: ["|"]
.iter()
.map(|x| String::from(*x))
.collect::<Vec<String>>(),
};
(value, filter)
},
|(value, filter): (Value, KeyValueFilter)| {
let _ = apply_filter(&value, &filter);
},
BatchSize::SmallInput,
)
});
}

criterion_group!(
name = benches;
config = Criterion::default()
.warm_up_time(Duration::from_secs(5))
.measurement_time(Duration::from_secs(120))
// degree of noise to ignore in measurements, here 1%
.noise_threshold(0.01)
// likelihood of noise registering as difference, here 5%
.significance_level(0.05)
// likelihood of capturing the true runtime, here 95%
.confidence_level(0.95)
// total number of bootstrap resamples, higher is less noisy but slower
.nresamples(100_000)
// total samples to collect within the set measurement time
.sample_size(150);
targets = apply_filter_bench
);
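
Note the use of `iter_batched` with `BatchSize::SmallInput` above: the setup closure (constructing the `Value` and compiling the `Regex`) runs outside the timed region, so only `apply_filter` itself is measured. The long 120-second measurement window, 1% noise threshold, and 150-sample count are presumably tuned to make comparisons such as the +40% figure robust against run-to-run variance.
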
5 changes: 5 additions & 0 deletions lib/datadog/grok/benches/filters/main.rs
@@ -0,0 +1,5 @@
use criterion::criterion_main;

mod keyvalue;

criterion_main!(keyvalue::benches);
40 changes: 32 additions & 8 deletions lib/datadog/grok/src/filters/keyvalue.rs
@@ -1,6 +1,3 @@
use std::collections::BTreeMap;
use std::fmt::Formatter;

use crate::{
ast::{Function, FunctionArgument},
grok_filter::GrokFilter,
@@ -18,14 +15,18 @@ use nom::{
multi::{many_m_n, separated_list1},
number::complete::double,
sequence::{delimited, preceded, terminated, tuple},
IResult,
IResult, Slice,
};
use nom_regex::str::re_find;
use once_cell::sync::Lazy;
use ordered_float::NotNan;
use regex::Regex;
use std::collections::BTreeMap;
use std::fmt::Formatter;
use tracing::warn;
use vrl_compiler::{Target, Value};

static DEFAULT_FILTER_RE: Lazy<regex::Regex> = Lazy::new(|| Regex::new(r"^[\w.\-_@]*").unwrap());

pub fn filter_from_function(f: &Function) -> Result<GrokFilter, GrokStaticError> {
{
let args_len = f.args.as_ref().map_or(0, |args| args.len());
@@ -55,7 +56,7 @@ pub fn filter_from_function(f: &Function) -> Result<GrokFilter, GrokStaticError>
}
} else {
// default allowed unescaped symbols
Regex::new(r"^[\w.\-_@]*").unwrap()
DEFAULT_FILTER_RE.clone()
};

let quotes = if args_len > 2 {
@@ -161,6 +162,7 @@ pub fn apply_filter(value: &Value, filter: &KeyValueFilter) -> Result<Value, Gro

type SResult<'a, O> = IResult<&'a str, O, (&'a str, nom::error::ErrorKind)>;

#[inline]
fn parse<'a>(
input: &'a str,
key_value_delimiter: &'a str,
@@ -185,6 +187,7 @@
}

/// Parse the line as a separated list of key value pairs.
#[inline]
fn parse_line<'a>(
input: &'a str,
key_value_delimiter: &'a str,
@@ -259,6 +262,7 @@ fn parse_key_value<'a>(
}

/// Parses quoted strings.
#[inline]
fn parse_quoted<'a>(
quotes: &'a (char, char),
field_terminator: &'a str,
@@ -281,10 +285,12 @@
}

/// A delimited value is all the text until our field_delimiter, or the rest of the string if it is the last value in the line.
#[inline]
fn parse_delimited<'a>(field_delimiter: &'a str) -> impl Fn(&'a str) -> SResult<&'a str> {
move |input| map(alt((take_until(field_delimiter), rest)), |s: &str| s.trim())(input)
}

#[inline]
fn quoted<'a>(
quotes: &'a [(char, char)],
delimiter: &'a str,
@@ -301,13 +307,30 @@
}
}

fn re_find<'a, E>(re: &'a Regex) -> impl Fn(&'a str) -> IResult<&'a str, &'a str, E>
where
E: nom::error::ParseError<&'a str>,
{
move |i| {
if let Some(m) = re.find(i) {
Ok((i.slice(m.end()..), i.slice(m.start()..m.end())))
} else {
Err(nom::Err::Error(E::from_error_kind(
i,
nom::error::ErrorKind::RegexpFind,
)))
}
}
}

/// Parses an input while it matches a given regex, otherwise skips an input until the next field delimiter
#[inline]
fn match_re_or_empty<'a>(
value_re: &'a Regex,
field_delimiter: &'a str,
) -> impl Fn(&'a str) -> SResult<&'a str> {
move |input| {
re_find::<'a, (&'a str, nom::error::ErrorKind)>(value_re.clone())(input)
re_find::<'a, (&'a str, nom::error::ErrorKind)>(value_re)(input)
.or_else(|_| parse_delimited(field_delimiter)(input).map(|(rest, _v)| (rest, "")))
}
}
@@ -317,6 +340,7 @@ fn match_re_or_empty<'a>(
///
/// 1. The value is quoted - parse until the end quote
/// 2. Otherwise we parse until regex matches
#[inline]
fn parse_value<'a>(
field_delimiter: &'a str,
quotes: &'a [(char, char)],
@@ -382,7 +406,7 @@ fn parse_key<'a>(
quotes: &'a [(char, char)],
re: &'a Regex,
) -> impl Fn(&'a str) -> SResult<&'a str> {
move |input| alt((quoted(quotes, key_value_delimiter), re_find(re.to_owned())))(input)
move |input| alt((quoted(quotes, key_value_delimiter), re_find(re)))(input)
}

#[cfg(test)]
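
The net effect in the parsers above: `match_re_or_empty` and `parse_key` previously went through nom-regex's `re_find`, which takes the `Regex` by value (`value_re.clone()` / `re.to_owned()`), paying a clone on every key and value; the hand-rolled `re_find` borrows the regex instead, and the default filter pattern is now compiled once behind `once_cell::sync::Lazy` rather than on every call to `filter_from_function`.
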
3 changes: 2 additions & 1 deletion lib/datadog/grok/src/lib.rs
@@ -5,7 +5,8 @@
#![deny(unused_comparisons)]

mod ast;
mod filters;
#[doc(hidden)]
pub mod filters; // TODO Must be exposed for criterion. Perhaps we should pass a feature? Yuck.
mod grok;
mod grok_filter;
mod lexer;
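
The TODO above flags the trade-off: an alternative would be to gate the re-export behind a dedicated cargo feature that only the benchmarks enable; `#[doc(hidden)]` at least keeps the module out of the crate's public documentation.
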
5 changes: 3 additions & 2 deletions lib/datadog/grok/src/parse_grok.rs
@@ -6,8 +6,8 @@ use itertools::{
FoldWhile::{Continue, Done},
Itertools,
};
use std::collections::BTreeMap;
use tracing::warn;
use vector_common::btreemap;
use vrl_compiler::{Target, Value};

#[derive(thiserror::Error, Debug, PartialEq)]
Expand Down Expand Up @@ -43,7 +43,7 @@ pub fn parse_grok(
/// - FailedToApplyFilter - matches the rule, but there was a runtime error while applying one of the filters
/// - NoMatch - this rule does not match a given string
fn apply_grok_rule(source: &str, grok_rule: &GrokRule, remove_empty: bool) -> Result<Value, Error> {
let mut parsed = Value::from(btreemap! {});
let mut parsed = Value::Object(BTreeMap::new());

if let Some(ref matches) = grok_rule.pattern.match_against(source) {
for (name, value) in matches.iter() {
@@ -115,6 +115,7 @@ mod tests {

use super::*;
use crate::parse_grok_rules::parse_grok_rules;
use vector_common::btreemap;

#[test]
fn parses_simple_grok() {
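
The `parse_grok.rs` change is housekeeping for the same path: the empty result object is built directly with `Value::Object(BTreeMap::new())`, and the `vector_common::btreemap` macro import moves into the test module where it is still used.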
