Skip to content

Commit

Permalink
feat(parser): Ignore UUIDs
Browse files Browse the repository at this point in the history
We might be able to make this bail out earlier and not accidentally
detect the wrong thing by checking if the hex values are lowercase.  RFC
4122 says that UUIDs must be generated lowercase, while input accepts
any case.  The main issues are risk on the "input" part and the extra
annoyance of writing a custom `is_hex_digit` function.
  • Loading branch information
epage committed Jun 29, 2021
1 parent 32f5e6c commit 85082cd
Showing 1 changed file with 49 additions and 4 deletions.
53 changes: 49 additions & 4 deletions crates/typos/src/tokens.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ mod parser {
use nom::branch::*;
use nom::bytes::complete::*;
use nom::character::complete::*;
use nom::combinator::*;
use nom::sequence::*;
use nom::{AsChar, IResult};

Expand All @@ -140,6 +141,7 @@ mod parser {
+ nom::InputIter
+ nom::InputLength
+ nom::Slice<std::ops::RangeFrom<usize>>
+ nom::Slice<std::ops::RangeTo<usize>>
+ nom::Offset
+ Clone
+ PartialEq
Expand Down Expand Up @@ -169,6 +171,7 @@ mod parser {
+ nom::InputIter
+ nom::InputLength
+ nom::Slice<std::ops::RangeFrom<usize>>
+ nom::Slice<std::ops::RangeTo<usize>>
+ nom::Offset
+ Clone
+ PartialEq
Expand All @@ -178,6 +181,7 @@ mod parser {
{
take_many0(alt((
sep1,
terminated(uuid_literal, sep1),
terminated(hex_literal, sep1),
terminated(dec_literal, sep1),
)))(input)
Expand All @@ -196,7 +200,7 @@ mod parser {
T: nom::InputTakeAtPosition,
<T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
{
take_while1(is_dec_digit)(input)
take_while1(is_dec_digit_with_sep)(input)
}

fn hex_literal<T>(input: T) -> IResult<T, T>
Expand All @@ -212,10 +216,36 @@ mod parser {
{
preceded(
pair(char('0'), alt((char('x'), char('X')))),
take_while1(is_hex_digit),
take_while1(is_hex_digit_with_sep),
)(input)
}

fn uuid_literal<T>(input: T) -> IResult<T, T>
where
T: nom::InputTakeAtPosition
+ nom::InputTake
+ nom::InputIter
+ nom::InputLength
+ nom::Offset
+ nom::Slice<std::ops::RangeTo<usize>>
+ nom::Slice<std::ops::RangeFrom<usize>>
+ Clone,
<T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
<T as nom::InputIter>::Item: AsChar + Copy,
{
recognize(tuple((
take_while_m_n(8, 8, AsChar::is_hex_digit),
char('-'),
take_while_m_n(4, 4, AsChar::is_hex_digit),
char('-'),
take_while_m_n(4, 4, AsChar::is_hex_digit),
char('-'),
take_while_m_n(4, 4, AsChar::is_hex_digit),
char('-'),
take_while_m_n(12, 12, AsChar::is_hex_digit),
)))(input)
}

fn take_many0<I, E, F>(mut f: F) -> impl FnMut(I) -> IResult<I, I, E>
where
I: nom::Offset + nom::InputTake + Clone + PartialEq + std::fmt::Debug,
Expand Down Expand Up @@ -249,11 +279,11 @@ mod parser {
}
}

fn is_dec_digit(i: impl AsChar + Copy) -> bool {
fn is_dec_digit_with_sep(i: impl AsChar + Copy) -> bool {
i.is_dec_digit() || is_digit_sep(i.as_char())
}

fn is_hex_digit(i: impl AsChar + Copy) -> bool {
fn is_hex_digit_with_sep(i: impl AsChar + Copy) -> bool {
i.is_hex_digit() || is_digit_sep(i.as_char())
}

Expand Down Expand Up @@ -646,6 +676,21 @@ mod test {
assert_eq!(expected, actual);
}

#[test]
fn tokenize_ignore_uuid() {
    // A hyphenated UUID between two words must not yield any identifiers;
    // only the surrounding words are expected to be reported.
    let tokenizer = TokenizerBuilder::new().build();
    let text = "Hello 123e4567-e89b-12d3-a456-426652340000 World";

    // "World" starts at byte 43: "Hello " (6) + UUID (36) + " " (1).
    let wanted: Vec<Identifier> = vec![
        Identifier::new_unchecked("Hello", Case::None, 0),
        Identifier::new_unchecked("World", Case::None, 43),
    ];

    // The byte-oriented and str-oriented entry points must agree.
    let from_bytes: Vec<_> = tokenizer.parse_bytes(text.as_bytes()).collect();
    assert_eq!(wanted, from_bytes);

    let from_str: Vec<_> = tokenizer.parse_str(text).collect();
    assert_eq!(wanted, from_str);
}

#[test]
fn tokenize_leading_digits() {
let parser = TokenizerBuilder::new().build();
Expand Down

0 comments on commit 85082cd

Please sign in to comment.