From 82142fbc662aa1035124f1eb53c636a5f1d3119f Mon Sep 17 00:00:00 2001 From: Markus Unterwaditzer Date: Thu, 10 Sep 2020 19:35:58 +0200 Subject: [PATCH 1/3] fix(perf): Do not clone regex during processing We clone the regex field for newtypes as we inherit the parent processing state. That is very expensive. We could just use Arc here, but we could also just make FieldAttrs Copy again like Armin had it from the start, and build up a map of regexes someplace else. I think I didn't know how to write Rust when I introduced the newtypes behavior. --- relay-general/derive/src/lib.rs | 7 ++--- relay-general/derive/src/process.rs | 11 +++---- relay-general/src/processor/attrs.rs | 5 ++- relay-general/src/store/schema.rs | 46 +++++++++++++++++++++++++--- 4 files changed, 50 insertions(+), 19 deletions(-) diff --git a/relay-general/derive/src/lib.rs b/relay-general/derive/src/lib.rs index 1c9306fd65..641bac42eb 100644 --- a/relay-general/derive/src/lib.rs +++ b/relay-general/derive/src/lib.rs @@ -728,12 +728,9 @@ impl FieldAttrs { }; let match_regex = if let Some(ref match_regex) = self.match_regex { - quote!(Some( - #[allow(clippy::trivial_regex)] - ::regex::Regex::new(#match_regex).unwrap() - )) + quote!(Some(#match_regex)) } else if let Some(ref parent_attrs) = inherit_from_field_attrs { - quote!(#parent_attrs.match_regex.clone()) + quote!(#parent_attrs.match_regex) } else { quote!(None) }; diff --git a/relay-general/derive/src/process.rs b/relay-general/derive/src/process.rs index adb912e23a..0a4b65d7e6 100644 --- a/relay-general/derive/src/process.rs +++ b/relay-general/derive/src/process.rs @@ -81,10 +81,7 @@ pub fn derive_process_value(mut s: synstructure::Structure<'_>) -> TokenStream { let field_attrs_tokens = field_attrs.as_tokens(None); (quote! { - ::lazy_static::lazy_static! { - static ref #field_attrs_name: crate::processor::FieldAttrs = - #field_attrs_tokens; - } + static #field_attrs_name: crate::processor::FieldAttrs = #field_attrs_tokens; }) .to_tokens(&mut body); @@ -94,13 +91,13 @@ pub fn derive_process_value(mut s: synstructure::Structure<'_>) -> TokenStream { } quote! { - __state.enter_nothing(Some(::std::borrow::Cow::Borrowed(&*#field_attrs_name))) + __state.enter_nothing(Some(::std::borrow::Cow::Borrowed(&#field_attrs_name))) } } else if is_tuple_struct { quote! { __state.enter_index( #index, - Some(::std::borrow::Cow::Borrowed(&*#field_attrs_name)), + Some(::std::borrow::Cow::Borrowed(&#field_attrs_name)), crate::processor::ValueType::for_field(#ident), ) } @@ -108,7 +105,7 @@ pub fn derive_process_value(mut s: synstructure::Structure<'_>) -> TokenStream { quote! { __state.enter_static( #field_name, - Some(::std::borrow::Cow::Borrowed(&*#field_attrs_name)), + Some(::std::borrow::Cow::Borrowed(&#field_attrs_name)), crate::processor::ValueType::for_field(#ident), ) } diff --git a/relay-general/src/processor/attrs.rs b/relay-general/src/processor/attrs.rs index 2f21539f5c..51450ff144 100644 --- a/relay-general/src/processor/attrs.rs +++ b/relay-general/src/processor/attrs.rs @@ -3,7 +3,6 @@ use std::fmt; use std::str::FromStr; use failure::Fail; -use regex::Regex; use smallvec::SmallVec; use crate::processor::{ProcessValue, SelectorPathItem, SelectorSpec}; @@ -232,7 +231,7 @@ pub enum Pii { } /// Meta information about a field. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Copy)] pub struct FieldAttrs { /// Optionally the name of the field. pub name: Option<&'static str>, @@ -243,7 +242,7 @@ pub struct FieldAttrs { /// Whether to trim whitespace from this string. pub trim_whitespace: bool, /// A regex to validate the (string) value against. - pub match_regex: Option, + pub match_regex: Option<&'static str>, /// The maximum char length of this field. pub max_chars: Option, /// The maximum bag size of this field. diff --git a/relay-general/src/store/schema.rs b/relay-general/src/store/schema.rs index 9869a58b17..d0baa21855 100644 --- a/relay-general/src/store/schema.rs +++ b/relay-general/src/store/schema.rs @@ -1,3 +1,7 @@ +use std::collections::BTreeMap; + +use regex::Regex; + use crate::processor::{ProcessValue, ProcessingState, Processor}; use crate::types::{ Array, Empty, Error, ErrorKind, Meta, Object, ProcessingAction, ProcessingResult, @@ -5,6 +9,32 @@ use crate::types::{ pub struct SchemaProcessor; +macro_rules! declare_used_field_regexes { + ($($regex:expr),* $(,)*) => { + lazy_static::lazy_static! { + static ref USED_FIELD_REGEXES: BTreeMap<&'static str, Regex> = { + let mut rv = BTreeMap::new(); + $( + if rv.insert($regex, Regex::new($regex).unwrap()).is_some() { + panic!("Regex {} declared twice.", $regex); + } + )* + rv + }; + } + } +} + +// Pre-built list of regexes for max performance. +declare_used_field_regexes![ + r"^[^\r\n\f\t/]*\z", + r"^[^\r\n\x0C/]+$", + r"^[^\r\n]*\z", + r"^[a-zA-Z0-9_\.:-]+\z", + r"^\s*[a-zA-Z0-9_.-]*\s*$", + r"^[^\n]+\z", +]; + impl Processor for SchemaProcessor { fn process_string( &mut self, @@ -95,10 +125,18 @@ fn verify_value_pattern( meta: &mut Meta, state: &ProcessingState<'_>, ) -> ProcessingResult { - if let Some(ref regex) = state.attrs().match_regex { - if !regex.is_match(value) { - meta.add_error(Error::invalid("invalid characters in string")); - return Err(ProcessingAction::DeleteValueSoft); + if let Some(ref regex_string) = state.attrs().match_regex { + match USED_FIELD_REGEXES.get(regex_string) { + Some(regex) => { + if !regex.is_match(value) { + meta.add_error(Error::invalid("invalid characters in string")); + return Err(ProcessingAction::DeleteValueSoft); + } + } + None => panic!( + "Regex {} is not registered in USED_FIELD_REGEXES. Please add it there.", + regex_string + ), } } From 8d2f473e610c2cc085e1091d5ebe56e6ae994cb1 Mon Sep 17 00:00:00 2001 From: Markus Unterwaditzer Date: Thu, 10 Sep 2020 22:45:14 +0200 Subject: [PATCH 2/3] remove hashmap --- relay-general/src/store/schema.rs | 48 +++++++++++++------------------ 1 file changed, 20 insertions(+), 28 deletions(-) diff --git a/relay-general/src/store/schema.rs b/relay-general/src/store/schema.rs index d0baa21855..674bde4437 100644 --- a/relay-general/src/store/schema.rs +++ b/relay-general/src/store/schema.rs @@ -1,5 +1,3 @@ -use std::collections::BTreeMap; - use regex::Regex; use crate::processor::{ProcessValue, ProcessingState, Processor}; @@ -10,29 +8,31 @@ use crate::types::{ pub struct SchemaProcessor; macro_rules! declare_used_field_regexes { - ($($regex:expr),* $(,)*) => { - lazy_static::lazy_static! { - static ref USED_FIELD_REGEXES: BTreeMap<&'static str, Regex> = { - let mut rv = BTreeMap::new(); + ($($ident:ident: $regex:expr),* $(,)*) => { + fn get_regex(name: &'static str) -> &'static Regex { + lazy_static::lazy_static! { $( - if rv.insert($regex, Regex::new($regex).unwrap()).is_some() { - panic!("Regex {} declared twice.", $regex); - } + static ref $ident: Regex = Regex::new($regex).unwrap(); )* - rv }; + + match name { + $($regex => &$ident, )* + _ => panic!("Please declare Regex {} using declare_used_field_regexes.", name), + } } } } -// Pre-built list of regexes for max performance. +// Pre-built list of regexes for max performance. The identifier in front is arbitrary, but needs +// to be unique. declare_used_field_regexes![ - r"^[^\r\n\f\t/]*\z", - r"^[^\r\n\x0C/]+$", - r"^[^\r\n]*\z", - r"^[a-zA-Z0-9_\.:-]+\z", - r"^\s*[a-zA-Z0-9_.-]*\s*$", - r"^[^\n]+\z", + A: r"^[^\r\n\f\t/]*\z", + B: r"^[^\r\n\x0C/]+$", + C: r"^[^\r\n]*\z", + D: r"^[a-zA-Z0-9_\.:-]+\z", + E: r"^\s*[a-zA-Z0-9_.-]*\s*$", + F: r"^[^\n]+\z", ]; impl Processor for SchemaProcessor { @@ -126,17 +126,9 @@ fn verify_value_pattern( state: &ProcessingState<'_>, ) -> ProcessingResult { if let Some(ref regex_string) = state.attrs().match_regex { - match USED_FIELD_REGEXES.get(regex_string) { - Some(regex) => { - if !regex.is_match(value) { - meta.add_error(Error::invalid("invalid characters in string")); - return Err(ProcessingAction::DeleteValueSoft); - } - } - None => panic!( - "Regex {} is not registered in USED_FIELD_REGEXES. Please add it there.", - regex_string - ), + if !get_regex(regex_string).is_match(value) { + meta.add_error(Error::invalid("invalid characters in string")); + return Err(ProcessingAction::DeleteValueSoft); } } From 22e0c8a4fa4cf61bc8fd4c6546716b87ba004d1d Mon Sep 17 00:00:00 2001 From: Markus Unterwaditzer Date: Fri, 11 Sep 2020 10:39:11 +0200 Subject: [PATCH 3/3] add changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ab03962fa4..469bad2b83 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ - Add the client SDK to session kafka payloads. ([#751](https://github.com/getsentry/relay/pull/751)) - Add a standalone tool to document metrics in JSON or YAML. ([#752](https://github.com/getsentry/relay/pull/752)) - Emit `processing.event.produced` for user report and session Kafka messages. ([#757](https://github.com/getsentry/relay/pull/757)) +- Improve performance of event processing by avoiding regex clone. ([#767](https://github.com/getsentry/relay/pull/767)) ## 20.8.0