Skip to content

Commit

Permalink
fix(parser/html): fix whitespace being lexed as html literal (#3908)
Browse files Browse the repository at this point in the history
  • Loading branch information
dyc3 authored Sep 16, 2024
1 parent 0e22abe commit 4968fa5
Show file tree
Hide file tree
Showing 12 changed files with 181 additions and 180 deletions.
64 changes: 40 additions & 24 deletions crates/biome_html_parser/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,8 @@ impl<'src> HtmlLexer<'src> {
self.consume_byte(T![>])
}
b'/' => self.consume_byte(T![/]),
b'!' => self.consume_byte(T![!]),
b'=' => self.consume_byte(T![=]),
b'!' => self.consume_byte(T![!]),
b'\'' | b'"' => self.consume_string_literal(current),
// TODO: differentiate between attribute names and identifiers
_ if is_identifier_byte(current) || is_attribute_name_byte(current) => {
Expand All @@ -80,32 +80,14 @@ impl<'src> HtmlLexer<'src> {
}
}

fn consume_element_list_token(&mut self, current: u8) -> HtmlSyntaxKind {
debug_assert!(!self.is_eof());
fn consume_token_outside_tag(&mut self, current: u8) -> HtmlSyntaxKind {
match current {
b'<' => self.consume_byte(T![<]),
_ => {
while let Some(chr) = self.current_byte() {
match chr {
b'<' => break,
chr => {
if chr.is_ascii() {
self.advance(1);
} else {
self.advance_char_unchecked();
}
}
}
}

HTML_LITERAL
}
b'\n' | b'\r' | b'\t' | b' ' => self.consume_newline_or_whitespaces(),
b'<' => self.consume_l_angle(),
_ => self.consume_html_text(),
}
}

#[allow(unused)]
fn consume_element_token(&mut self, current: u8) {}

/// Bumps the current byte and creates a lexed token of the passed in kind.
#[inline]
fn consume_byte(&mut self, tok: HtmlSyntaxKind) -> HtmlSyntaxKind {
Expand Down Expand Up @@ -337,6 +319,40 @@ impl<'src> HtmlLexer<'src> {

Ok(())
}

/// Consume HTML text literals outside of tags.
///
/// This includes text and single spaces between words. If a newline or a second
/// consecutive space is found, this stops consuming to allow the lexer to
/// switch to whitespace/newline lexing (see `consume_newline_or_whitespaces`).
///
/// See: https://html.spec.whatwg.org/#space-separated-tokens
/// See: https://infra.spec.whatwg.org/#strip-leading-and-trailing-ascii-whitespace
fn consume_html_text(&mut self) -> HtmlSyntaxKind {
    // Tracks whether the previously consumed byte was a space, so that a run
    // of two or more spaces ends the literal (the run is then lexed as
    // whitespace trivia instead of being folded into the text token).
    let mut saw_space = false;
    while let Some(current) = self.current_byte() {
        match current {
            // `<` starts a tag and is never part of a text literal.
            b'<' => break,
            b'\n' | b'\r' => {
                // Stop before the line break; flag it so the following
                // newline/whitespace run is lexed as trivia.
                self.after_newline = true;
                break;
            }
            b' ' => {
                if saw_space {
                    // Second consecutive space: end the literal here.
                    break;
                }
                self.advance(1);
                saw_space = true;
            }
            _ => {
                // NOTE(review): advances one byte at a time, so multi-byte
                // UTF-8 sequences are consumed byte-wise (continuation bytes
                // also fall into this arm). Presumably `advance` tolerates
                // non-char-boundary positions — confirm, since the previous
                // implementation used `advance_char_unchecked` for non-ASCII.
                self.advance(1);
                saw_space = false;
            }
        }
    }

    HTML_LITERAL
}
}

impl<'src> Lexer<'src> for HtmlLexer<'src> {
Expand Down Expand Up @@ -368,7 +384,7 @@ impl<'src> Lexer<'src> for HtmlLexer<'src> {
match self.current_byte() {
Some(current) => match context {
HtmlLexContext::Regular => self.consume_token(current),
HtmlLexContext::ElementList => self.consume_element_list_token(current),
HtmlLexContext::OutsideTag => self.consume_token_outside_tag(current),
},
None => EOF,
}
Expand Down
41 changes: 30 additions & 11 deletions crates/biome_html_parser/src/lexer/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,15 +48,15 @@ fn losslessness(string: String) -> bool {
// Assert the result of lexing a piece of source code,
// and make sure the tokens yielded are fully lossless and the source can be reconstructed from only the tokens
macro_rules! assert_lex {
($src:expr, $($kind:ident:$len:expr $(,)?)*) => {{
($context:expr, $src:expr, $($kind:ident:$len:expr $(,)?)*) => {{
let mut lexer = HtmlLexer::from_str($src);
let mut idx = 0;
let mut tok_idx = TextSize::default();

let mut new_str = String::with_capacity($src.len());
let mut tokens = vec![];

while lexer.next_token(HtmlLexContext::default()) != EOF {
while lexer.next_token($context) != EOF {
tokens.push((lexer.current(), lexer.current_range()));
}

Expand Down Expand Up @@ -97,6 +97,9 @@ macro_rules! assert_lex {

assert_eq!($src, new_str, "Failed to reconstruct input");
}};
($src:expr, $($kind:ident:$len:expr $(,)?)*) => {
assert_lex!(HtmlLexContext::default(), $src, $($kind:$len,)*);
};
}

#[test]
Expand Down Expand Up @@ -150,17 +153,11 @@ fn element() {
}

#[test]
fn element_with_text() {
fn html_text() {
assert_lex! {
"<div>abcdefghijklmnopqrstuvwxyz!@_-:;</div>",
L_ANGLE: 1,
HTML_LITERAL: 3,
R_ANGLE: 1,
HtmlLexContext::OutsideTag,
"abcdefghijklmnopqrstuvwxyz!@_-:;",
HTML_LITERAL: 32,
L_ANGLE: 1,
SLASH: 1,
HTML_LITERAL: 3,
R_ANGLE: 1,
}
}

Expand Down Expand Up @@ -231,3 +228,25 @@ fn html_element() {
R_ANGLE: 1,
}
}

#[test]
fn html_text_spaces() {
    // Single spaces between words must stay part of the HTML text literal:
    // the whole 40-byte sentence lexes as one HTML_LITERAL token rather than
    // being split on every space (regression test for #3908).
    assert_lex! {
        HtmlLexContext::OutsideTag,
        "Lorem ipsum dolor sit amet, consectetur.",
        HTML_LITERAL: 40,
    }
}

#[test]
fn html_text_spaces_with_lines() {
    // A newline ends the text literal: the first line lexes as one
    // HTML_LITERAL ("Lorem ipsum dolor sit" = 21 bytes), the line break as
    // NEWLINE trivia, the continuation line's indentation as WHITESPACE
    // (8 bytes — the string literal's second line carries that indent), and
    // the remainder ("amet, consectetur." = 18 bytes) as a second literal.
    assert_lex! {
        HtmlLexContext::OutsideTag,
        "Lorem ipsum dolor sit
        amet, consectetur.",
        HTML_LITERAL: 21,
        NEWLINE: 1,
        WHITESPACE: 8,
        HTML_LITERAL: 18,
    }
}
8 changes: 4 additions & 4 deletions crates/biome_html_parser/src/syntax/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,17 +68,17 @@ fn parse_element(p: &mut HtmlParser) -> ParsedSyntax {

if p.at(T![/]) {
p.bump(T![/]);
p.expect(T![>]);
p.expect_with_context(T![>], HtmlLexContext::OutsideTag);
Present(m.complete(p, HTML_SELF_CLOSING_ELEMENT))
} else {
if should_be_self_closing {
if p.at(T![/]) {
p.bump(T![/]);
}
p.expect(T![>]);
p.expect_with_context(T![>], HtmlLexContext::OutsideTag);
return Present(m.complete(p, HTML_SELF_CLOSING_ELEMENT));
}
p.expect_with_context(T![>], HtmlLexContext::ElementList);
p.expect_with_context(T![>], HtmlLexContext::OutsideTag);
let opening = m.complete(p, HTML_OPENING_ELEMENT);
loop {
ElementList.parse_list(p);
Expand Down Expand Up @@ -128,7 +128,7 @@ impl ParseNodeList for ElementList {
T![<] => parse_element(p),
HTML_LITERAL => {
let m = p.start();
p.bump(HTML_LITERAL);
p.bump_with_context(HTML_LITERAL, HtmlLexContext::OutsideTag);
Present(m.complete(p, HTML_CONTENT))
}
_ => Absent,
Expand Down
9 changes: 5 additions & 4 deletions crates/biome_html_parser/src/token_source.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,13 @@ pub(crate) struct HtmlTokenSource<'source> {

#[derive(Copy, Clone, Debug, Default)]
pub(crate) enum HtmlLexContext {
/// The default state
/// The default state. This state is used for a majority of the lexing, which is inside html tags.
#[default]
Regular,
#[allow(unused)]
/// When the lexer is inside a element list, newlines, spaces and quotes are part of the text
ElementList,
/// When the lexer is outside of a tag, special characters are lexed as text.
///
/// The exceptions being `<` which indicates the start of a tag, and `>` which is invalid syntax if not preceded by a `<`.
OutsideTag,
}

impl LexContext for HtmlLexContext {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,19 +40,7 @@ HtmlRoot {
r_angle_token: R_ANGLE@12..13 ">" [] [],
},
HtmlContent {
value_token: HTML_LITERAL@13..18 "This" [] [Whitespace(" ")],
},
HtmlContent {
value_token: HTML_LITERAL@18..23 "text" [] [Whitespace(" ")],
},
HtmlContent {
value_token: HTML_LITERAL@23..26 "is" [] [Whitespace(" ")],
},
HtmlContent {
value_token: HTML_LITERAL@26..33 "inside" [] [Whitespace(" ")],
},
HtmlContent {
value_token: HTML_LITERAL@33..36 "br." [] [],
value_token: HTML_LITERAL@13..36 "This text is inside br." [] [],
},
],
HtmlBogusElement {
Expand Down Expand Up @@ -107,16 +95,8 @@ HtmlRoot {
2: HTML_ATTRIBUTE_LIST@12..12
3: (empty)
4: R_ANGLE@12..13 ">" [] []
2: HTML_CONTENT@13..18
0: HTML_LITERAL@13..18 "This" [] [Whitespace(" ")]
3: HTML_CONTENT@18..23
0: HTML_LITERAL@18..23 "text" [] [Whitespace(" ")]
4: HTML_CONTENT@23..26
0: HTML_LITERAL@23..26 "is" [] [Whitespace(" ")]
5: HTML_CONTENT@26..33
0: HTML_LITERAL@26..33 "inside" [] [Whitespace(" ")]
6: HTML_CONTENT@33..36
0: HTML_LITERAL@33..36 "br." [] []
2: HTML_CONTENT@13..36
0: HTML_LITERAL@13..36 "This text is inside br." [] []
2: HTML_BOGUS_ELEMENT@36..41
0: L_ANGLE@36..37 "<" [] []
1: SLASH@37..38 "/" [] []
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,12 @@ HtmlRoot {
},
children: HtmlElementList [
HtmlContent {
value_token: HTML_LITERAL@5..9 "foo\n" [] [],
value_token: HTML_LITERAL@5..8 "foo" [] [],
},
],
closing_element: missing (required),
},
eof_token: EOF@9..9 "" [] [],
eof_token: EOF@8..9 "" [Newline("\n")] [],
}
```
Expand All @@ -42,18 +42,18 @@ HtmlRoot {
0: HTML_ROOT@0..9
0: (empty)
1: (empty)
2: HTML_ELEMENT@0..9
2: HTML_ELEMENT@0..8
0: HTML_OPENING_ELEMENT@0..5
0: L_ANGLE@0..1 "<" [] []
1: HTML_NAME@1..4
0: HTML_LITERAL@1..4 "div" [] []
2: HTML_ATTRIBUTE_LIST@4..4
3: R_ANGLE@4..5 ">" [] []
1: HTML_ELEMENT_LIST@5..9
0: HTML_CONTENT@5..9
0: HTML_LITERAL@5..9 "foo\n" [] []
1: HTML_ELEMENT_LIST@5..8
0: HTML_CONTENT@5..8
0: HTML_LITERAL@5..8 "foo" [] []
2: (empty)
3: EOF@9..9 "" [] []
3: EOF@8..9 "" [Newline("\n")] []
```
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,11 @@ HtmlRoot {
},
children: HtmlElementList [
HtmlContent {
value_token: HTML_LITERAL@34..40 "\n\tfoo\n" [] [],
value_token: HTML_LITERAL@34..39 "foo" [Newline("\n"), Whitespace("\t")] [],
},
],
closing_element: HtmlClosingElement {
l_angle_token: L_ANGLE@40..41 "<" [] [],
l_angle_token: L_ANGLE@39..41 "<" [Newline("\n")] [],
slash_token: SLASH@41..42 "/" [] [],
name: HtmlName {
value_token: HTML_LITERAL@42..45 "div" [] [],
Expand Down Expand Up @@ -98,11 +98,11 @@ HtmlRoot {
1: HTML_STRING@24..32
0: HTML_STRING_LITERAL@24..32 "\"button\"" [] []
3: R_ANGLE@32..34 ">" [Newline("\n")] []
1: HTML_ELEMENT_LIST@34..40
0: HTML_CONTENT@34..40
0: HTML_LITERAL@34..40 "\n\tfoo\n" [] []
2: HTML_CLOSING_ELEMENT@40..46
0: L_ANGLE@40..41 "<" [] []
1: HTML_ELEMENT_LIST@34..39
0: HTML_CONTENT@34..39
0: HTML_LITERAL@34..39 "foo" [Newline("\n"), Whitespace("\t")] []
2: HTML_CLOSING_ELEMENT@39..46
0: L_ANGLE@39..41 "<" [Newline("\n")] []
1: SLASH@41..42 "/" [] []
2: HTML_NAME@42..45
0: HTML_LITERAL@42..45 "div" [] []
Expand Down
Loading

0 comments on commit 4968fa5

Please sign in to comment.