Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(normalization): Scrub span description URLs #2095

Merged
merged 10 commits into from
May 8, 2023
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
- Add support for old 'violated-directive' CSP format. ([#2048](https://github.com/getsentry/relay/pull/2048))
- Add document_uri to csp filter. ([#2059](https://github.com/getsentry/relay/pull/2059))
- Store `geo.subdivision` of the end user location. ([#2058](https://github.com/getsentry/relay/pull/2058))
- Scrub URLs in span descriptions. ([#2095](https://github.com/getsentry/relay/pull/2095))

**Internal**:

Expand Down
1 change: 1 addition & 0 deletions py/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
## Unreleased

- Store `geo.subdivision` of the end user location. ([#2058](https://github.com/getsentry/relay/pull/2058))
- Scrub URLs in span descriptions. ([#2095](https://github.com/getsentry/relay/pull/2095))
- Add new FFI function for running dynamic sampling. ([#2091](https://github.com/getsentry/relay/pull/2091))

## 0.8.21
Expand Down
1 change: 1 addition & 0 deletions relay-cabi/src/processing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ pub unsafe extern "C" fn relay_store_normalizer_normalize_event(
transaction_name_config: Default::default(), // only supported in relay
is_renormalize: config.is_renormalize.unwrap_or(false),
device_class_synthesis_config: false, // only supported in relay
scrub_span_descriptions: false,
};
light_normalize_event(&mut event, light_normalization_config)?;
process_value(&mut event, &mut *processor, ProcessingState::root())?;
Expand Down
7 changes: 5 additions & 2 deletions relay-general/src/store/normalize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -699,6 +699,7 @@ pub struct LightNormalizationConfig<'a> {
pub transaction_name_config: TransactionNameConfig<'a>,
pub is_renormalize: bool,
pub device_class_synthesis_config: bool,
pub scrub_span_descriptions: bool,
}

pub fn light_normalize_event(
Expand All @@ -714,8 +715,10 @@ pub fn light_normalize_event(
// (internally noops for non-transaction events).
// TODO: Parts of this processor should probably be a filter so we
// can revert some changes to ProcessingAction)
let mut transactions_processor =
transactions::TransactionsProcessor::new(config.transaction_name_config);
let mut transactions_processor = transactions::TransactionsProcessor::new(
config.transaction_name_config,
config.scrub_span_descriptions,
);
transactions_processor.process_event(event, meta, ProcessingState::root())?;

// Check for required and non-empty values
Expand Down
219 changes: 182 additions & 37 deletions relay-general/src/store/transactions/processor.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use std::borrow::Cow;
use std::collections::BTreeMap;

use relay_common::SpanStatus;

Expand All @@ -8,7 +9,9 @@ use crate::protocol::{
Context, ContextInner, Event, EventType, Span, Timestamp, TransactionInfo, TransactionSource,
};
use crate::store::regexes::TRANSACTION_NAME_NORMALIZER_REGEX;
use crate::types::{Annotated, Meta, ProcessingAction, ProcessingResult, Remark, RemarkType};
use crate::types::{
Annotated, Meta, ProcessingAction, ProcessingResult, Remark, RemarkType, Value,
};

/// Configuration around removing high-cardinality parts of URL transactions.
#[derive(Clone, Debug, Default)]
Expand All @@ -21,11 +24,15 @@ pub struct TransactionNameConfig<'r> {
#[derive(Default)]
pub struct TransactionsProcessor<'r> {
name_config: TransactionNameConfig<'r>,
scrub_span_descriptions: bool,
}

impl<'r> TransactionsProcessor<'r> {
pub fn new(name_config: TransactionNameConfig<'r>) -> Self {
Self { name_config }
pub fn new(name_config: TransactionNameConfig<'r>, scrub_span_descriptions: bool) -> Self {
Self {
name_config,
scrub_span_descriptions,
}
}

/// Applies the rule if any found to the transaction name.
Expand Down Expand Up @@ -264,18 +271,18 @@ fn set_default_transaction_source(event: &mut Event) {
}
}

/// Normalize the transaction name.
/// Normalize the given string.
///
/// Replaces UUIDs, SHAs and numerical IDs in transaction names by placeholders.
/// Returns `Ok(true)` if the name was changed.
fn scrub_identifiers(transaction: &mut Annotated<String>) -> Result<bool, ProcessingAction> {
fn scrub_identifiers(string: &mut Annotated<String>) -> Result<bool, ProcessingAction> {
let capture_names = TRANSACTION_NAME_NORMALIZER_REGEX
.capture_names()
.flatten()
.collect::<Vec<_>>();

let mut did_change = false;
transaction.apply(|trans, meta| {
string.apply(|trans, meta| {
let mut caps = Vec::new();
// Collect all the remarks if anything matches.
for captures in TRANSACTION_NAME_NORMALIZER_REGEX.captures_iter(trans) {
Expand Down Expand Up @@ -438,12 +445,42 @@ impl Processor for TransactionsProcessor<'_> {

span.op.get_or_insert_with(|| "default".to_owned());

if self.scrub_span_descriptions {
scrub_span_description(span)?;
}

span.process_child_values(self, state)?;

Ok(())
}
}

fn scrub_span_description(span: &mut Span) -> Result<(), ProcessingAction> {
if span.description.value().is_none() {
return Ok(());
}

// let mut scrubbed: Annotated<Value::String> = span.description.clone();
iker-barriocanal marked this conversation as resolved.
Show resolved Hide resolved
let mut scrubbed = span.description.clone();

// Scrub URLs
if scrub_identifiers(&mut scrubbed)? {
iker-barriocanal marked this conversation as resolved.
Show resolved Hide resolved
let Some(new_desc) = scrubbed.value() else {
return Ok(());
};
span.data
.get_or_insert_with(BTreeMap::new)
// We don't care what the cause of scrubbing was, since we assume
// that after scrubbing the value is sanitized.
.insert(
"description.scrubbed".to_owned(),
Annotated::new(Value::String(new_desc.to_owned())),
);
}

Ok(())
}

#[cfg(test)]
mod tests {

Expand Down Expand Up @@ -1445,7 +1482,7 @@ mod tests {

process_value(
&mut event,
&mut TransactionsProcessor::new(TransactionNameConfig::default()),
&mut TransactionsProcessor::new(TransactionNameConfig::default(), false),
ProcessingState::root(),
)
.unwrap();
Expand Down Expand Up @@ -1529,7 +1566,7 @@ mod tests {

process_value(
&mut event,
&mut TransactionsProcessor::new(TransactionNameConfig::default()),
&mut TransactionsProcessor::new(TransactionNameConfig::default(), false),
ProcessingState::root(),
)
.unwrap();
Expand Down Expand Up @@ -1563,7 +1600,7 @@ mod tests {

process_value(
&mut event,
&mut TransactionsProcessor::new(TransactionNameConfig::default()),
&mut TransactionsProcessor::new(TransactionNameConfig::default(), false),
ProcessingState::root(),
)
.unwrap();
Expand Down Expand Up @@ -1662,9 +1699,12 @@ mod tests {

process_value(
&mut event,
&mut TransactionsProcessor::new(TransactionNameConfig {
rules: rules.as_ref(),
}),
&mut TransactionsProcessor::new(
TransactionNameConfig {
rules: rules.as_ref(),
},
false,
),
ProcessingState::root(),
)
.unwrap();
Expand Down Expand Up @@ -1723,9 +1763,12 @@ mod tests {

process_value(
&mut event,
&mut TransactionsProcessor::new(TransactionNameConfig {
rules: rules.as_ref(),
}),
&mut TransactionsProcessor::new(
TransactionNameConfig {
rules: rules.as_ref(),
},
false,
),
ProcessingState::root(),
)
.unwrap();
Expand Down Expand Up @@ -1818,9 +1861,12 @@ mod tests {
// This must not normalize transaction name, since it's disabled.
process_value(
&mut event,
&mut TransactionsProcessor::new(TransactionNameConfig {
rules: rules.as_ref(),
}),
&mut TransactionsProcessor::new(
TransactionNameConfig {
rules: rules.as_ref(),
},
false,
),
ProcessingState::root(),
)
.unwrap();
Expand Down Expand Up @@ -1884,7 +1930,7 @@ mod tests {

process_value(
&mut event,
&mut TransactionsProcessor::new(TransactionNameConfig { rules: &[rule] }),
&mut TransactionsProcessor::new(TransactionNameConfig { rules: &[rule] }, false),
ProcessingState::root(),
)
.unwrap();
Expand Down Expand Up @@ -1978,7 +2024,7 @@ mod tests {

process_value(
&mut event,
&mut TransactionsProcessor::new(TransactionNameConfig::default()),
&mut TransactionsProcessor::new(TransactionNameConfig::default(), false),
ProcessingState::root(),
)
.unwrap();
Expand Down Expand Up @@ -2103,14 +2149,17 @@ mod tests {

process_value(
&mut event,
&mut TransactionsProcessor::new(TransactionNameConfig {
rules: &[TransactionNameRule {
pattern: LazyGlob::new("/remains/*/1234567890/".to_owned()),
expiry: Utc.with_ymd_and_hms(3000, 1, 1, 1, 1, 1).unwrap(),
scope: RuleScope::default(),
redaction: RedactionRule::default(),
}],
}),
&mut TransactionsProcessor::new(
TransactionNameConfig {
rules: &[TransactionNameRule {
pattern: LazyGlob::new("/remains/*/1234567890/".to_owned()),
expiry: Utc.with_ymd_and_hms(3000, 1, 1, 1, 1, 1).unwrap(),
scope: RuleScope::default(),
redaction: RedactionRule::default(),
}],
},
false,
),
ProcessingState::root(),
)
.unwrap();
Expand Down Expand Up @@ -2146,14 +2195,17 @@ mod tests {

process_value(
&mut event,
&mut TransactionsProcessor::new(TransactionNameConfig {
rules: &[TransactionNameRule {
pattern: LazyGlob::new("/remains/*/**".to_owned()),
expiry: Utc.with_ymd_and_hms(3000, 1, 1, 1, 1, 1).unwrap(),
scope: RuleScope::default(),
redaction: RedactionRule::default(),
}],
}),
&mut TransactionsProcessor::new(
TransactionNameConfig {
rules: &[TransactionNameRule {
pattern: LazyGlob::new("/remains/*/**".to_owned()),
expiry: Utc.with_ymd_and_hms(3000, 1, 1, 1, 1, 1).unwrap(),
scope: RuleScope::default(),
redaction: RedactionRule::default(),
}],
},
false,
),
ProcessingState::root(),
)
.unwrap();
Expand Down Expand Up @@ -2184,11 +2236,104 @@ mod tests {

process_value(
&mut event,
&mut TransactionsProcessor::new(TransactionNameConfig::default()),
&mut TransactionsProcessor::new(TransactionNameConfig::default(), false),
ProcessingState::root(),
)
.unwrap();

assert_annotated_snapshot!(event);
}

macro_rules! span_description_test {
($name:ident, $input:literal, $output:literal) => {
#[test]
fn $name() {
let json = format!(
r#"
{{
"description": "{}",
"span_id": "bd2eb23da2beb459",
"start_timestamp": 1597976393.4619668,
"timestamp": 1597976393.4718769,
"trace_id": "ff62a8b040f340bda5d830223def1d81"
}}
"#,
$input
);

let mut span = Annotated::<Span>::from_json(&json).unwrap();

process_value(
&mut span,
&mut TransactionsProcessor::new(TransactionNameConfig::default(), true),
ProcessingState::root(),
)
.unwrap();

assert_eq!($input, span.value().unwrap().description.value().unwrap());
if $output == "" {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally, this is Option::None, I'll address this in a future PR.

assert!(span
.value()
.and_then(|span| span.data.value())
.and_then(|data| data.get("description.scrubbed"))
.is_none());
} else {
assert_eq!(
$output,
span.value()
.and_then(|span| span.data.value())
.and_then(|data| data.get("description.scrubbed"))
.and_then(|an_value| an_value.as_str())
.unwrap()
);
}
}
};
}

span_description_test!(span_description_scrub_empty, "", "");

span_description_test!(
span_description_scrub_only_domain,
"GET http://service.io",
""
);

span_description_test!(
span_description_scrub_path_ids_end,
"GET https://www.service.io/resources/01234",
"GET https://www.service.io/resources/*"
);

span_description_test!(
span_description_scrub_path_ids_middle,
"GET https://www.service.io/resources/01234/details",
"GET https://www.service.io/resources/*/details"
);

span_description_test!(
span_description_scrub_path_multiple_ids,
"GET https://www.service.io/users/01234-qwerty/settings/98765-adfghj",
"GET https://www.service.io/users/*/settings/*"
);

span_description_test!(
span_description_scrub_path_md5_hashes,
"GET /clients/563712f9722fb0996ac8f3905b40786f/project/01234",
"GET /clients/*/project/*"
);

span_description_test!(
span_description_scrub_path_sha_hashes,
"GET /clients/403926033d001b5279df37cbbe5287b7c7c267fa/project/01234",
"GET /clients/*/project/*"
);

span_description_test!(
span_description_scrub_path_uuids,
"GET /clients/8ff81d74-606d-4c75-ac5e-cee65cbbc866/project/01234",
"GET /clients/*/project/*"
);

// TODO(iker): Add span description test for URLs with paths
}
Loading