Skip to content

Commit

Permalink
fix(parser): Better short base64 detection
Browse files Browse the repository at this point in the history
Previously, we bailed out of treating a string as base64 if it was too
short (<90 bytes) and contained no non-alphanumeric base64 bytes.  What
we overlooked were the padding bytes.

We now key off padding bytes to detect that a string is in fact base64
encoded.  Like the other cases, there can be false positives, but those
strings should show up elsewhere or the compiler will fail.

This was called out in crate-ci#485
  • Loading branch information
epage committed May 10, 2022
1 parent bd5048d commit fd53983
Showing 1 changed file with 25 additions and 7 deletions.
32 changes: 25 additions & 7 deletions crates/typos/src/tokens.rs
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,16 @@ mod parser {
<T as nom::InputIter>::Item: AsChar + Copy,
{
let (padding, captured) = take_while1(is_base64_digit)(input.clone())?;

const CHUNK: usize = 4;
let padding_offset = input.offset(&padding);
let mut padding_len = CHUNK - padding_offset % CHUNK;
if padding_len == CHUNK {
padding_len = 0;
}

if captured.input_len() < 90
&& padding_len == 0
&& captured
.iter_elements()
.all(|c| !['/', '+'].contains(&c.as_char()))
Expand All @@ -418,14 +427,8 @@ mod parser {
)));
}

const CHUNK: usize = 4;
let padding_offset = input.offset(&padding);
let mut padding_len = CHUNK - padding_offset % CHUNK;
if padding_len == CHUNK {
padding_len = 0;
}

let (after, _) = take_while_m_n(padding_len, padding_len, is_base64_padding)(padding)?;

let after_offset = input.offset(&after);
Ok(input.take_split(after_offset))
}
Expand Down Expand Up @@ -1207,6 +1210,21 @@ mod test {
assert_eq!(expected, actual);
}

#[test]
fn tokenize_ignore_base64_case_3() {
    // Regression test: a base64 blob shorter than 90 bytes with no '/' or '+'
    // must still be skipped when trailing '=' padding marks it as base64.
    let tokenizer = TokenizerBuilder::new().build();

    let source = r#" "integrity": "sha512-hCmlUAIlUiav8Xdqw3Io4LcpA1DOt7h3LSTAC4G6JGHFFaWzI6qvFt9oilvl8BmkbBRX1IhM90ZAmpk68zccQA==","#;
    // Only the JSON key and the hash-algorithm prefix should survive; the
    // encoded digest itself must not be tokenized.
    let want: Vec<Identifier> = vec![
        Identifier::new_unchecked("integrity", Case::None, 8),
        Identifier::new_unchecked("sha512", Case::None, 21),
    ];

    let got_bytes: Vec<_> = tokenizer.parse_bytes(source.as_bytes()).collect();
    assert_eq!(want, got_bytes);

    let got_str: Vec<_> = tokenizer.parse_str(source).collect();
    assert_eq!(want, got_str);
}

#[test]
fn tokenize_ignore_email() {
let parser = TokenizerBuilder::new().build();
Expand Down

0 comments on commit fd53983

Please sign in to comment.