fix(parser/html): fix whitespace being lexed as html literal

biomejs · Sep 15, 2024 · 272e65f · 272e65f
1 parent 140d766
commit 272e65f
Show file tree

Hide file tree

Showing 12 changed files with 179 additions and 181 deletions.
diff --git a/crates/biome_html_parser/src/lexer/mod.rs b/crates/biome_html_parser/src/lexer/mod.rs
@@ -61,8 +61,8 @@ impl<'src> HtmlLexer<'src> {
                 self.consume_byte(T![>])
             }
             b'/' => self.consume_byte(T![/]),
-            b'!' => self.consume_byte(T![!]),
             b'=' => self.consume_byte(T![=]),
+            b'!' => self.consume_byte(T![!]),
             b'\'' | b'"' => self.consume_string_literal(current),
             // TODO: differentiate between attribute names and identifiers
             _ if is_identifier_byte(current) || is_attribute_name_byte(current) => {
@@ -75,37 +75,19 @@ impl<'src> HtmlLexer<'src> {
                         return bom;
                     }
                 }
-                self.consume_unexpected_character()
+                self.consume_html_text()
             }
         }
     }
 
-    fn consume_element_list_token(&mut self, current: u8) -> HtmlSyntaxKind {
-        debug_assert!(!self.is_eof());
+    fn consume_token_outside_tag(&mut self, current: u8) -> HtmlSyntaxKind {
         match current {
-            b'<' => self.consume_byte(T![<]),
-            _ => {
-                while let Some(chr) = self.current_byte() {
-                    match chr {
-                        b'<' => break,
-                        chr => {
-                            if chr.is_ascii() {
-                                self.advance(1);
-                            } else {
-                                self.advance_char_unchecked();
-                            }
-                        }
-                    }
-                }
-
-                HTML_LITERAL
-            }
+            b'\n' | b'\r' | b'\t' | b' ' => self.consume_newline_or_whitespaces(),
+            b'<' => self.consume_l_angle(),
+            _ => self.consume_html_text(),
         }
     }
 
-    #[allow(unused)]
-    fn consume_element_token(&mut self, current: u8) {}
-
     /// Bumps the current byte and creates a lexed token of the passed in kind.
     #[inline]
     fn consume_byte(&mut self, tok: HtmlSyntaxKind) -> HtmlSyntaxKind {
@@ -337,6 +319,37 @@ impl<'src> HtmlLexer<'src> {
 
         Ok(())
     }
+
+    /// Consume HTML text literals outside of tags.
+    ///
+    /// This includes text and single spaces between words. Consumption stops at a `<`,
+    /// a newline, or a second consecutive space, which allows the lexer to switch to
+    /// `consume_newline_or_whitespaces` for the whitespace.
+    fn consume_html_text(&mut self) -> HtmlSyntaxKind {
+        let mut saw_space = false;
+        while let Some(current) = self.current_byte() {
+            match current {
+                b'<' => break,
+                b'\n' | b'\r' => {
+                    self.after_newline = true;
+                    break;
+                }
+                b' ' => {
+                    if saw_space {
+                        break;
+                    }
+                    self.advance(1);
+                    saw_space = true;
+                }
+                _ => {
+                    self.advance(1);
+                    saw_space = false;
+                }
+            }
+        }
+
+        HTML_LITERAL
+    }
 }
 
 impl<'src> Lexer<'src> for HtmlLexer<'src> {
@@ -368,7 +381,7 @@ impl<'src> Lexer<'src> for HtmlLexer<'src> {
             match self.current_byte() {
                 Some(current) => match context {
                     HtmlLexContext::Regular => self.consume_token(current),
-                    HtmlLexContext::ElementList => self.consume_element_list_token(current),
+                    HtmlLexContext::OutsideTag => self.consume_token_outside_tag(current),
                 },
                 None => EOF,
             }

diff --git a/crates/biome_html_parser/src/lexer/tests.rs b/crates/biome_html_parser/src/lexer/tests.rs
@@ -48,15 +48,15 @@ fn losslessness(string: String) -> bool {
 // Assert the result of lexing a piece of source code,
 // and make sure the tokens yielded are fully lossless and the source can be reconstructed from only the tokens
 macro_rules! assert_lex {
-    ($src:expr, $($kind:ident:$len:expr $(,)?)*) => {{
+    ($context:expr, $src:expr, $($kind:ident:$len:expr $(,)?)*) => {{
         let mut lexer = HtmlLexer::from_str($src);
         let mut idx = 0;
         let mut tok_idx = TextSize::default();
 
         let mut new_str = String::with_capacity($src.len());
         let mut tokens = vec![];
 
-        while lexer.next_token(HtmlLexContext::default()) != EOF {
+        while lexer.next_token($context) != EOF {
             tokens.push((lexer.current(), lexer.current_range()));
         }
 
@@ -97,6 +97,9 @@ macro_rules! assert_lex {
 
         assert_eq!($src, new_str, "Failed to reconstruct input");
     }};
+    ($src:expr, $($kind:ident:$len:expr $(,)?)*) => {
+        assert_lex!(HtmlLexContext::default(), $src, $($kind:$len,)*);
+    };
 }
 
 #[test]
@@ -150,17 +153,11 @@ fn element() {
 }
 
 #[test]
-fn element_with_text() {
+fn html_text() {
     assert_lex! {
-        "<div>abcdefghijklmnopqrstuvwxyz!@_-:;</div>",
-        L_ANGLE: 1,
-        HTML_LITERAL: 3,
-        R_ANGLE: 1,
+        HtmlLexContext::OutsideTag,
+        "abcdefghijklmnopqrstuvwxyz!@_-:;",
         HTML_LITERAL: 32,
-        L_ANGLE: 1,
-        SLASH: 1,
-        HTML_LITERAL: 3,
-        R_ANGLE: 1,
     }
 }
 
@@ -231,3 +228,25 @@ fn html_element() {
         R_ANGLE: 1,
     }
 }
+
+#[test]
+fn html_text_spaces() {
+    assert_lex! {
+        HtmlLexContext::OutsideTag,
+        "Lorem ipsum dolor sit amet, consectetur.",
+        HTML_LITERAL: 40,
+    }
+}
+
+#[test]
+fn html_text_spaces_with_lines() {
+    assert_lex! {
+        HtmlLexContext::OutsideTag,
+        "Lorem ipsum dolor sit
+        amet, consectetur.",
+        HTML_LITERAL: 21,
+        NEWLINE: 1,
+        WHITESPACE: 8,
+        HTML_LITERAL: 18,
+    }
+}
diff --git a/crates/biome_html_parser/src/syntax/mod.rs b/crates/biome_html_parser/src/syntax/mod.rs
@@ -68,17 +68,17 @@ fn parse_element(p: &mut HtmlParser) -> ParsedSyntax {
 
     if p.at(T![/]) {
         p.bump(T![/]);
-        p.expect(T![>]);
+        p.expect_with_context(T![>], HtmlLexContext::OutsideTag);
         Present(m.complete(p, HTML_SELF_CLOSING_ELEMENT))
     } else {
         if should_be_self_closing {
             if p.at(T![/]) {
                 p.bump(T![/]);
             }
-            p.expect(T![>]);
+            p.expect_with_context(T![>], HtmlLexContext::OutsideTag);
             return Present(m.complete(p, HTML_SELF_CLOSING_ELEMENT));
         }
-        p.expect_with_context(T![>], HtmlLexContext::ElementList);
+        p.expect_with_context(T![>], HtmlLexContext::OutsideTag);
         let opening = m.complete(p, HTML_OPENING_ELEMENT);
         loop {
             ElementList.parse_list(p);
@@ -128,7 +128,7 @@ impl ParseNodeList for ElementList {
             T![<] => parse_element(p),
             HTML_LITERAL => {
                 let m = p.start();
-                p.bump(HTML_LITERAL);
+                p.bump_with_context(HTML_LITERAL, HtmlLexContext::OutsideTag);
                 Present(m.complete(p, HTML_CONTENT))
             }
             _ => Absent,

diff --git a/crates/biome_html_parser/src/token_source.rs b/crates/biome_html_parser/src/token_source.rs
@@ -16,12 +16,13 @@ pub(crate) struct HtmlTokenSource<'source> {
 
 #[derive(Copy, Clone, Debug, Default)]
 pub(crate) enum HtmlLexContext {
-    /// The default state
+    /// The default state. This state is used for a majority of the lexing, which is inside html tags.
     #[default]
     Regular,
-    #[allow(unused)]
-    /// When the lexer is inside a element list, newlines, spaces and quotes are part of the text
-    ElementList,
+    /// When the lexer is outside of a tag, special characters are lexed as text.
+    ///
+    /// The exceptions are `<`, which indicates the start of a tag, and `>`, which is invalid syntax if not preceded by a `<`.
+    OutsideTag,
 }
 
 impl LexContext for HtmlLexContext {

diff --git a/crates/biome_html_parser/tests/html_specs/error/element/br-with-end.html.snap b/crates/biome_html_parser/tests/html_specs/error/element/br-with-end.html.snap
@@ -40,19 +40,7 @@ HtmlRoot {
                     r_angle_token: R_ANGLE@12..13 ">" [] [],
                 },
                 HtmlContent {
-                    value_token: HTML_LITERAL@13..18 "This" [] [Whitespace(" ")],
-                },
-                HtmlContent {
-                    value_token: HTML_LITERAL@18..23 "text" [] [Whitespace(" ")],
-                },
-                HtmlContent {
-                    value_token: HTML_LITERAL@23..26 "is" [] [Whitespace(" ")],
-                },
-                HtmlContent {
-                    value_token: HTML_LITERAL@26..33 "inside" [] [Whitespace(" ")],
-                },
-                HtmlContent {
-                    value_token: HTML_LITERAL@33..36 "br." [] [],
+                    value_token: HTML_LITERAL@13..36 "This text is inside br." [] [],
                 },
             ],
             HtmlBogusElement {
@@ -107,16 +95,8 @@ HtmlRoot {
         2: HTML_ATTRIBUTE_LIST@12..12
         3: (empty)
         4: R_ANGLE@12..13 ">" [] []
-      2: HTML_CONTENT@13..18
-        0: HTML_LITERAL@13..18 "This" [] [Whitespace(" ")]
-      3: HTML_CONTENT@18..23
-        0: HTML_LITERAL@18..23 "text" [] [Whitespace(" ")]
-      4: HTML_CONTENT@23..26
-        0: HTML_LITERAL@23..26 "is" [] [Whitespace(" ")]
-      5: HTML_CONTENT@26..33
-        0: HTML_LITERAL@26..33 "inside" [] [Whitespace(" ")]
-      6: HTML_CONTENT@33..36
-        0: HTML_LITERAL@33..36 "br." [] []
+      2: HTML_CONTENT@13..36
+        0: HTML_LITERAL@13..36 "This text is inside br." [] []
     2: HTML_BOGUS_ELEMENT@36..41
       0: L_ANGLE@36..37 "<" [] []
       1: SLASH@37..38 "/" [] []

diff --git a/crates/biome_html_parser/tests/html_specs/error/element/missing-close-tag.html.snap b/crates/biome_html_parser/tests/html_specs/error/element/missing-close-tag.html.snap
@@ -27,12 +27,12 @@ HtmlRoot {
         },
         children: HtmlElementList [
             HtmlContent {
-                value_token: HTML_LITERAL@5..9 "foo\n" [] [],
+                value_token: HTML_LITERAL@5..8 "foo" [] [],
             },
         ],
         closing_element: missing (required),
     },
-    eof_token: EOF@9..9 "" [] [],
+    eof_token: EOF@8..9 "" [Newline("\n")] [],
 }
 ```
 
@@ -42,18 +42,18 @@ HtmlRoot {
 0: HTML_ROOT@0..9
   0: (empty)
   1: (empty)
-  2: HTML_ELEMENT@0..9
+  2: HTML_ELEMENT@0..8
     0: HTML_OPENING_ELEMENT@0..5
       0: L_ANGLE@0..1 "<" [] []
       1: HTML_NAME@1..4
         0: HTML_LITERAL@1..4 "div" [] []
       2: HTML_ATTRIBUTE_LIST@4..4
       3: R_ANGLE@4..5 ">" [] []
-    1: HTML_ELEMENT_LIST@5..9
-      0: HTML_CONTENT@5..9
-        0: HTML_LITERAL@5..9 "foo\n" [] []
+    1: HTML_ELEMENT_LIST@5..8
+      0: HTML_CONTENT@5..8
+        0: HTML_LITERAL@5..8 "foo" [] []
     2: (empty)
-  3: EOF@9..9 "" [] []
+  3: EOF@8..9 "" [Newline("\n")] []
 
 ```
 

diff --git a/crates/biome_html_parser/tests/html_specs/ok/attributes/multiline-attributes.html.snap b/crates/biome_html_parser/tests/html_specs/ok/attributes/multiline-attributes.html.snap
@@ -55,11 +55,11 @@ HtmlRoot {
         },
         children: HtmlElementList [
             HtmlContent {
-                value_token: HTML_LITERAL@34..40 "\n\tfoo\n" [] [],
+                value_token: HTML_LITERAL@34..39 "foo" [Newline("\n"), Whitespace("\t")] [],
             },
         ],
         closing_element: HtmlClosingElement {
-            l_angle_token: L_ANGLE@40..41 "<" [] [],
+            l_angle_token: L_ANGLE@39..41 "<" [Newline("\n")] [],
             slash_token: SLASH@41..42 "/" [] [],
             name: HtmlName {
                 value_token: HTML_LITERAL@42..45 "div" [] [],
@@ -98,11 +98,11 @@ HtmlRoot {
             1: HTML_STRING@24..32
               0: HTML_STRING_LITERAL@24..32 "\"button\"" [] []
       3: R_ANGLE@32..34 ">" [Newline("\n")] []
-    1: HTML_ELEMENT_LIST@34..40
-      0: HTML_CONTENT@34..40
-        0: HTML_LITERAL@34..40 "\n\tfoo\n" [] []
-    2: HTML_CLOSING_ELEMENT@40..46
-      0: L_ANGLE@40..41 "<" [] []
+    1: HTML_ELEMENT_LIST@34..39
+      0: HTML_CONTENT@34..39
+        0: HTML_LITERAL@34..39 "foo" [Newline("\n"), Whitespace("\t")] []
+    2: HTML_CLOSING_ELEMENT@39..46
+      0: L_ANGLE@39..41 "<" [Newline("\n")] []
       1: SLASH@41..42 "/" [] []
       2: HTML_NAME@42..45
         0: HTML_LITERAL@42..45 "div" [] []