diff --git a/CHANGELOG.md b/CHANGELOG.md index 3e4a6232ed..45441edcc7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ - Use correct field to pick SQL parser for span normalization. ([#2536](https://github.com/getsentry/relay/pull/2536)) - Prevent stack overflow on SQL serialization. ([#2538](https://github.com/getsentry/relay/pull/2538)) - Bind exclusively to the port for the HTTP server. ([#2582](https://github.com/getsentry/relay/pull/2582)) +- Scrub resource spans even when there's no domain or extension or when the description is an image. ([#2591](https://github.com/getsentry/relay/pull/2591)) **Internal**: diff --git a/relay-dynamic-config/src/defaults.rs b/relay-dynamic-config/src/defaults.rs index c100af7c5f..3f0bc493c3 100644 --- a/relay-dynamic-config/src/defaults.rs +++ b/relay-dynamic-config/src/defaults.rs @@ -12,6 +12,9 @@ const DISABLED_DATABASES: &[&str] = &["*clickhouse*", "*mongodb*", "*redis*", "* /// A list of span.op` patterns we want to enable for mobile. const MOBILE_OPS: &[&str] = &["app.*", "ui.load*"]; +/// A list of patterns found in MongoDB queries +const MONGODB_QUERIES: &[&str] = &["*\"$*", "{*", "*({*", "*[{*"]; + /// Adds configuration for extracting metrics from spans. /// /// This configuration is temporarily hard-coded here. It will later be provided by the upstream. @@ -30,7 +33,7 @@ pub fn add_span_metrics(project_config: &mut ProjectConfig) { return; } - let resource_condition = RuleCondition::glob("span.op", "resource.*"); + let resource_condition = RuleCondition::glob("span.op", "resource*"); // Add conditions to filter spans if a specific module is enabled. // By default, this will extract all spans. @@ -40,14 +43,13 @@ pub fn add_span_metrics(project_config: &mut ProjectConfig) { { RuleCondition::all() } else { + let is_disabled = RuleCondition::glob("span.op", DISABLED_DATABASES); let is_mongo = RuleCondition::eq("span.system", "mongodb") - | RuleCondition::glob("span.description", "*\"$*"); + | RuleCondition::glob("span.description", MONGODB_QUERIES); let mut conditions = RuleCondition::eq("span.op", "http.client") - | (RuleCondition::glob("span.op", "db*") - & !RuleCondition::glob("span.op", DISABLED_DATABASES) - & !(RuleCondition::eq("span.op", "db.sql.query") & is_mongo)) - | RuleCondition::glob("span.op", MOBILE_OPS); + | RuleCondition::glob("span.op", MOBILE_OPS) + | (RuleCondition::glob("span.op", "db*") & !is_disabled & !is_mongo); if project_config .features diff --git a/relay-event-normalization/src/normalize/span/description/mod.rs b/relay-event-normalization/src/normalize/span/description/mod.rs index 053dd5022c..fc801b1be5 100644 --- a/relay-event-normalization/src/normalize/span/description/mod.rs +++ b/relay-event-normalization/src/normalize/span/description/mod.rs @@ -12,7 +12,9 @@ use relay_event_schema::protocol::Span; use relay_protocol::{Annotated, Remark, RemarkType, Value}; use url::Url; -use crate::regexes::{REDIS_COMMAND_REGEX, RESOURCE_NORMALIZER_REGEX}; +use crate::regexes::{ + DB_SQL_TRANSACTION_CORE_DATA_REGEX, REDIS_COMMAND_REGEX, RESOURCE_NORMALIZER_REGEX, +}; use crate::span::tag_extraction::HTTP_METHOD_EXTRACTOR_REGEX; use crate::transactions::SpanDescriptionRule; @@ -30,6 +32,7 @@ pub(crate) fn scrub_span_description(span: &mut Span, rules: &Vec) -> bool { - sub_op == "sql.query" && (description.contains("\"$") || db_system == Some("mongodb")) +fn is_sql_mongodb(description: &str, db_system: Option<&str>) -> bool { + description.contains("\"$") + || description.contains("({") + || description.contains("[{") + || description.starts_with('{') + || db_system == Some("mongodb") } /// We are unable to parse active record when we do not know which database is being used. @@ -93,6 +103,13 @@ fn is_legacy_activerecord(sub_op: &str, db_system: Option<&str>) -> bool { db_system.is_none() && (sub_op.contains("active_record") || sub_op.contains("activerecord")) } +fn scrub_core_data(string: &str) -> Option { + match DB_SQL_TRANSACTION_CORE_DATA_REGEX.replace_all(string, "*") { + Cow::Owned(scrubbed) => Some(scrubbed), + Cow::Borrowed(_) => None, + } +} + fn scrub_http(string: &str) -> Option { let (method, url) = string.split_once(' ')?; if !HTTP_METHOD_EXTRACTOR_REGEX.is_match(method) { @@ -184,7 +201,12 @@ fn scrub_redis_keys(string: &str) -> Option { fn scrub_resource_identifiers(mut string: &str) -> Option { // Remove query parameters or the fragment. - if let Some(pos) = string.find('?') { + if string.starts_with("data:") { + if let Some(pos) = string.find(';') { + return Some(string[..pos].into()); + } + return Some("data:*/*".into()); + } else if let Some(pos) = string.find('?') { string = &string[..pos]; } else if let Some(pos) = string.find('#') { string = &string[..pos]; @@ -193,10 +215,16 @@ fn scrub_resource_identifiers(mut string: &str) -> Option { Cow::Owned(scrubbed) => Some(scrubbed), Cow::Borrowed(string) => { // No IDs scrubbed, but we still want to set something. - let url = Url::parse(string).ok()?; - let extension = url.path().rsplit_once('.')?.1; - let domain = normalize_domain(&url.host()?.to_string(), url.port())?; - Some(format!("{domain}/*.{extension}")) + // If we managed to parse the URL, we'll try to get an extension. + if let Ok(url) = Url::parse(string) { + let domain = normalize_domain(&url.host()?.to_string(), url.port())?; + // If there is an extension, we add it to the domain. + if let Some(extension) = url.path().rsplit_once('.') { + return Some(format!("{domain}/*.{}", extension.1)); + } + return Some(format!("{domain}/*")); + } + Some(string.into()) } } } @@ -547,7 +575,7 @@ mod tests { ); span_description_test!( - span_description_fil_no_extension, + span_description_file_with_no_extension, "somefilenamewithnoextension", "file.read", "*" @@ -560,6 +588,43 @@ mod tests { "https://data.domain.com/data/guide*.gif" ); + span_description_test!( + resource_script_with_no_extension, + "https://www.domain.com/page?id=1234567890", + "resource.script", + "*.domain.com/*" + ); + + span_description_test!( + resource_script_with_no_domain, + "/page.js?action=name", + "resource.script", + "/page.js" + ); + + span_description_test!( + resource_script_with_no_domain_no_extension, + "/page?action=name", + "resource.script", + "/page" + ); + + span_description_test!( + resource_img_embedded, + "", + "resource.img", + "data:image/svg+xml" + ); + + span_description_test!( + db_category_with_mongodb_query, + "find({some_id:1234567890},{limit:100})", + "db", + "" + ); + + span_description_test!(db_category_with_not_sql, "{someField:someValue}", "db", ""); + #[test] fn informed_sql_parser() { let json = r#" @@ -620,4 +685,39 @@ mod tests { // Can be scrubbed with db system. assert_eq!(scrubbed.as_str(), Some("SELECT a FROM b")); } + + #[test] + fn core_data() { + let json = r#"{ + "description": "INSERTED 1 'UAEventData'", + "op": "db.sql.transaction", + "origin": "auto.db.core_data" + }"#; + + let mut span = Annotated::::from_json(json).unwrap(); + + scrub_span_description(span.value_mut().as_mut().unwrap(), &vec![]); + let scrubbed = get_value!(span.data["description.scrubbed"]!); + + assert_eq!(scrubbed.as_str(), Some("INSERTED * 'UAEventData'")); + } + + #[test] + fn multiple_core_data() { + let json = r#"{ + "description": "UPDATED 1 'QueuedRequest', DELETED 1 'QueuedRequest'", + "op": "db.sql.transaction", + "origin": "auto.db.core_data" + }"#; + + let mut span = Annotated::::from_json(json).unwrap(); + + scrub_span_description(span.value_mut().as_mut().unwrap(), &vec![]); + let scrubbed = get_value!(span.data["description.scrubbed"]!); + + assert_eq!( + scrubbed.as_str(), + Some("UPDATED * 'QueuedRequest', DELETED * 'QueuedRequest'") + ); + } } diff --git a/relay-event-normalization/src/regexes.rs b/relay-event-normalization/src/regexes.rs index 4e535aa5e7..76d5439f76 100644 --- a/relay-event-normalization/src/regexes.rs +++ b/relay-event-normalization/src/regexes.rs @@ -78,3 +78,6 @@ pub static RESOURCE_NORMALIZER_REGEX: Lazy = Lazy::new(|| { ) .unwrap() }); + +pub static DB_SQL_TRANSACTION_CORE_DATA_REGEX: Lazy = + Lazy::new(|| Regex::new(r"(?P\d+)").unwrap());