Skip to content

Commit

Permalink
Normalize directly in the same String instance
Browse files Browse the repository at this point in the history
  • Loading branch information
bantonsson committed Aug 7, 2024
1 parent 748ea8d commit 63cd1ac
Showing 1 changed file with 15 additions and 14 deletions.
29 changes: 15 additions & 14 deletions trace-normalization/src/normalize_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ pub fn normalize_parent_id(parent_id: &mut u64, trace_id: u64, span_id: u64) {
}

pub fn normalize_tag(tag: &mut String) {
let mut bytes = std::mem::take(tag).into_bytes();
let bytes = unsafe { tag.as_mut_vec() };
if bytes.is_empty() {
return;
}
Expand Down Expand Up @@ -165,21 +165,23 @@ pub fn normalize_tag(tag: &mut String) {
// returns an actual utf8 codepoint
std::char::from_u32(crate::utf8_helpers::next_code_point(&mut it).unwrap()).unwrap()
};
read_cursor += c.len_utf8();
let mut len_utf8 = c.len_utf8();
read_cursor += len_utf8;

if c.is_lowercase() {
c.encode_utf8(&mut bytes[write_cursor..write_cursor + c.len_utf8()]);
c.encode_utf8(&mut bytes[write_cursor..write_cursor + len_utf8]);
is_in_illegal_span = false;
write_cursor += c.len_utf8();
write_cursor += len_utf8;
codepoints_written += 1;
continue;
}
if c.is_uppercase() {
// Take only first codepoint of the lowercase conversion
// Lowercase the current character only if its lowercase form is no wider (in UTF-8 bytes) than the original
if let Some(lower) = c.to_lowercase().next() {
if lower.len_utf8() <= c.len_utf8() {
if lower.len_utf8() <= len_utf8 {
c = lower;
len_utf8 = c.len_utf8();
}
}
}
Expand All @@ -189,15 +191,15 @@ pub fn normalize_tag(tag: &mut String) {
// unicode character classes https://www.unicode.org/reports/tr44/#Alphabetic , but
// close enough
if c.is_alphabetic() {
c.encode_utf8(&mut bytes[write_cursor..write_cursor + c.len_utf8()]);
c.encode_utf8(&mut bytes[write_cursor..write_cursor + len_utf8]);
is_in_illegal_span = false;
write_cursor += c.len_utf8();
write_cursor += len_utf8;
codepoints_written += 1;
} else if c.is_numeric() {
if write_cursor != 0 {
c.encode_utf8(&mut bytes[write_cursor..write_cursor + c.len_utf8()]);
c.encode_utf8(&mut bytes[write_cursor..write_cursor + len_utf8]);
is_in_illegal_span = false;
write_cursor += c.len_utf8();
write_cursor += len_utf8;
codepoints_written += 1;
}
} else if !is_in_illegal_span {
Expand All @@ -212,11 +214,13 @@ pub fn normalize_tag(tag: &mut String) {
write_cursor -= 1;
}
bytes.truncate(write_cursor);
*tag = String::from_utf8(bytes).unwrap();
}

fn normalize_metric_name(name: &mut String) {
let mut bytes = std::mem::take(name).into_bytes();
let bytes = unsafe { name.as_mut_vec() };
if bytes.is_empty() {
return;
}

// Find first alpha character, if none is found the metric name is empty
let Some((mut read_cursor, _)) = bytes
Expand Down Expand Up @@ -265,9 +269,6 @@ fn normalize_metric_name(name: &mut String) {
write_cursor -= 1;
}
bytes.truncate(write_cursor);

// We only wrote ascii chars, so bytes is guaranteed to be valid utf8
*name = String::from_utf8(bytes).unwrap();
}

// truncate_utf8 truncates the given string to make sure it uses less than limit bytes.
Expand Down

0 comments on commit 63cd1ac

Please sign in to comment.