Skip to content

Commit

Permalink
feat(parser/html): lex and parse unquoted attribute values
Browse files Browse the repository at this point in the history
  • Loading branch information
dyc3 committed Sep 17, 2024
1 parent 7e6de58 commit 5e73499
Show file tree
Hide file tree
Showing 10 changed files with 1,082 additions and 3 deletions.
46 changes: 46 additions & 0 deletions crates/biome_html_parser/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,16 @@ impl<'src> HtmlLexer<'src> {
}
}

/// Lexes a single token while the lexer is in the attribute value context.
///
/// Dispatch is driven purely by the byte at the current position:
/// - a single or double quote starts a quoted string literal,
/// - `<` and `>` are emitted as their own one-byte tokens,
/// - space, tab, carriage return and line feed go to the whitespace/newline lexer,
/// - every other byte is handed to the unquoted attribute value lexer.
fn consume_token_attribute_value(&mut self, current: u8) -> HtmlSyntaxKind {
    match current {
        // The opening quote is forwarded so the string lexer knows which quote it saw.
        quote @ (b'"' | b'\'') => self.consume_string_literal(quote),
        b'>' => self.consume_byte(T![>]),
        b'<' => self.consume_byte(T![<]),
        b' ' | b'\t' | b'\r' | b'\n' => self.consume_newline_or_whitespaces(),
        _ => self.consume_unquoted_string_literal(),
    }
}

/// Bumps the current byte and creates a lexed token of the passed in kind.
#[inline]
fn consume_byte(&mut self, tok: HtmlSyntaxKind) -> HtmlSyntaxKind {
Expand Down Expand Up @@ -233,6 +243,41 @@ impl<'src> HtmlLexer<'src> {
}
}

/// Lexes an attribute value that is not wrapped in quotes.
///
/// The scan advances one byte at a time over ASCII bytes and stops at the first of:
/// - a terminator (`\n`, `\r`, `\t`, space or `>`),
/// - a forbidden byte (`?`, `'`, `"`, `=`, `<` or a backtick),
/// - a non-ASCII byte,
/// - the end of the input.
///
/// Returns `HTML_STRING_LITERAL` when at least one byte was consumed and the scan did not stop
/// on a forbidden byte. Otherwise a diagnostic covering the character at the current position is
/// pushed and the result of `consume_unexpected_character` is returned.
///
/// See: https://html.spec.whatwg.org/#attributes-2 under "Unquoted attribute value syntax"
fn consume_unquoted_string_literal(&mut self) -> HtmlSyntaxKind {
    let mut consumed_any = false;
    let mut hit_forbidden = false;

    while let Some(byte) = self.current_byte() {
        // These bytes are never allowed inside an unquoted attribute value.
        if matches!(byte, b'?' | b'\'' | b'"' | b'=' | b'<' | b'`') {
            hit_forbidden = true;
            break;
        }

        // Whitespace and `>` end the value cleanly. A non-ASCII byte also stops the scan
        // without being flagged here.
        // NOTE(review): this means a value stops in front of the first non-ASCII character;
        // confirm whether that is intended.
        let is_terminator = matches!(byte, b'\n' | b'\r' | b'\t' | b' ' | b'>');
        if is_terminator || !byte.is_ascii() {
            break;
        }

        self.advance(1);
        consumed_any = true;
    }

    if hit_forbidden || !consumed_any {
        // NOTE(review): assumes a character is available at the current position; the visible
        // dispatcher only reaches this lexer when a current byte exists.
        let offending = self.current_char_unchecked();
        let span = self.text_position()..self.text_position() + offending.text_len();
        self.push_diagnostic(ParseDiagnostic::new(
            "Unexpected character in unquoted attribute value",
            span,
        ));
        return self.consume_unexpected_character();
    }

    HTML_STRING_LITERAL
}

fn consume_l_angle(&mut self) -> HtmlSyntaxKind {
self.assert_byte(b'<');

Expand Down Expand Up @@ -385,6 +430,7 @@ impl<'src> Lexer<'src> for HtmlLexer<'src> {
Some(current) => match context {
HtmlLexContext::Regular => self.consume_token(current),
HtmlLexContext::OutsideTag => self.consume_token_outside_tag(current),
HtmlLexContext::AttributeValue => self.consume_token_attribute_value(current),
},
None => EOF,
}
Expand Down
35 changes: 35 additions & 0 deletions crates/biome_html_parser/src/lexer/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -250,3 +250,38 @@ fn html_text_spaces_with_lines() {
HTML_LITERAL: 18,
}
}

/// A bare run of ASCII letters lexed in the attribute value context is expected to come out
/// as one `HTML_STRING_LITERAL` token spanning all five bytes.
#[test]
fn unquoted_attribute_value_1() {
assert_lex! {
HtmlLexContext::AttributeValue,
"value",
HTML_STRING_LITERAL: 5,
}
}

/// Unquoted values are expected to end at whitespace: the space and the tab each yield a
/// one-byte `WHITESPACE` token between the literals, and the trailing line feed yields a
/// one-byte `NEWLINE` token.
#[test]
fn unquoted_attribute_value_2() {
assert_lex! {
HtmlLexContext::AttributeValue,
"value value\tvalue\n",
HTML_STRING_LITERAL: 5,
WHITESPACE: 1,
HTML_STRING_LITERAL: 5,
WHITESPACE: 1,
HTML_STRING_LITERAL: 5,
NEWLINE: 1,
}
}

/// Characters that may not appear in an unquoted value are expected to produce error tokens.
///
/// Expected token stream for the input: `?` is a one-byte `ERROR_TOKEN`, `<` is still lexed as
/// `L_ANGLE`, `=` is a one-byte `ERROR_TOKEN`, and the remaining three bytes (single quote,
/// double quote, backtick) form a single three-byte `ERROR_TOKEN`.
// NOTE(review): the final three-byte token presumably comes from the quoted string lexer
// treating the single quote as an opening quote that is never closed; confirm.
#[test]
fn unquoted_attribute_value_invalid_chars() {
assert_lex! {
HtmlLexContext::AttributeValue,
"?<='\"`",
ERROR_TOKEN: 1,
L_ANGLE: 1,
ERROR_TOKEN: 1,
ERROR_TOKEN: 3,
}
}
6 changes: 3 additions & 3 deletions crates/biome_html_parser/src/syntax/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ fn parse_literal(p: &mut HtmlParser) -> ParsedSyntax {
Present(m.complete(p, HTML_NAME))
}

fn parse_string_literal(p: &mut HtmlParser) -> ParsedSyntax {
fn parse_attribute_string_literal(p: &mut HtmlParser) -> ParsedSyntax {
if !p.at(HTML_STRING_LITERAL) {
return Absent;
}
Expand All @@ -226,7 +226,7 @@ fn parse_attribute_initializer(p: &mut HtmlParser) -> ParsedSyntax {
return Absent;
}
let m = p.start();
p.bump(T![=]);
parse_string_literal(p).or_add_diagnostic(p, expected_initializer);
p.bump_with_context(T![=], HtmlLexContext::AttributeValue);
parse_attribute_string_literal(p).or_add_diagnostic(p, expected_initializer);
Present(m.complete(p, HTML_ATTRIBUTE_INITIALIZER_CLAUSE))
}
4 changes: 4 additions & 0 deletions crates/biome_html_parser/src/token_source.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ pub(crate) enum HtmlLexContext {
///
/// The exceptions being `<` which indicates the start of a tag, and `>` which is invalid syntax if not preceded by a `<`.
OutsideTag,
/// When the parser encounters an `=` token (the beginning of the attribute initializer clause), it switches to this context.
///
/// This is because attribute values can start and end with a `"` or `'` character, or be unquoted, and the lexer needs to know to start lexing a string literal.
AttributeValue,
}

impl LexContext for HtmlLexContext {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
<div>
<div class== >foo</div>
<div class=? >foo</div>
</div>
Original file line number Diff line number Diff line change
@@ -0,0 +1,251 @@
---
source: crates/biome_html_parser/tests/spec_test.rs
expression: snapshot
---
## Input

```html
<div>
<div class== >foo</div>
<div class=? >foo</div>
</div>
```


## AST

```
HtmlRoot {
bom_token: missing (optional),
directive: missing (optional),
html: HtmlElement {
opening_element: HtmlOpeningElement {
l_angle_token: L_ANGLE@0..1 "<" [] [],
name: HtmlName {
value_token: HTML_LITERAL@1..4 "div" [] [],
},
attributes: HtmlAttributeList [],
r_angle_token: R_ANGLE@4..5 ">" [] [],
},
children: HtmlElementList [
HtmlBogusElement {
items: [
HtmlBogus {
items: [
L_ANGLE@5..8 "<" [Newline("\n"), Whitespace("\t")] [],
HtmlName {
value_token: HTML_LITERAL@8..12 "div" [] [Whitespace(" ")],
},
HtmlBogus {
items: [
HtmlAttribute {
name: HtmlName {
value_token: HTML_LITERAL@12..17 "class" [] [],
},
initializer: HtmlAttributeInitializerClause {
eq_token: EQ@17..18 "=" [] [],
value: missing (required),
},
},
HtmlBogusElement {
items: [
ERROR_TOKEN@18..20 "=" [] [Whitespace(" ")],
],
},
],
},
R_ANGLE@20..21 ">" [] [],
],
},
HtmlElementList [
HtmlContent {
value_token: HTML_LITERAL@21..24 "foo" [] [],
},
],
HtmlClosingElement {
l_angle_token: L_ANGLE@24..25 "<" [] [],
slash_token: SLASH@25..26 "/" [] [],
name: HtmlName {
value_token: HTML_LITERAL@26..29 "div" [] [],
},
r_angle_token: R_ANGLE@29..30 ">" [] [],
},
],
},
HtmlBogusElement {
items: [
HtmlBogus {
items: [
L_ANGLE@30..33 "<" [Newline("\n"), Whitespace("\t")] [],
HtmlName {
value_token: HTML_LITERAL@33..37 "div" [] [Whitespace(" ")],
},
HtmlBogus {
items: [
HtmlAttribute {
name: HtmlName {
value_token: HTML_LITERAL@37..42 "class" [] [],
},
initializer: HtmlAttributeInitializerClause {
eq_token: EQ@42..43 "=" [] [],
value: missing (required),
},
},
HtmlBogusElement {
items: [
ERROR_TOKEN@43..45 "?" [] [Whitespace(" ")],
],
},
],
},
R_ANGLE@45..46 ">" [] [],
],
},
HtmlElementList [
HtmlContent {
value_token: HTML_LITERAL@46..49 "foo" [] [],
},
],
HtmlClosingElement {
l_angle_token: L_ANGLE@49..50 "<" [] [],
slash_token: SLASH@50..51 "/" [] [],
name: HtmlName {
value_token: HTML_LITERAL@51..54 "div" [] [],
},
r_angle_token: R_ANGLE@54..55 ">" [] [],
},
],
},
],
closing_element: HtmlClosingElement {
l_angle_token: L_ANGLE@55..57 "<" [Newline("\n")] [],
slash_token: SLASH@57..58 "/" [] [],
name: HtmlName {
value_token: HTML_LITERAL@58..61 "div" [] [],
},
r_angle_token: R_ANGLE@61..62 ">" [] [],
},
},
eof_token: EOF@62..63 "" [Newline("\n")] [],
}
```

## CST

```
0: HTML_ROOT@0..63
0: (empty)
1: (empty)
2: HTML_ELEMENT@0..62
0: HTML_OPENING_ELEMENT@0..5
0: L_ANGLE@0..1 "<" [] []
1: HTML_NAME@1..4
0: HTML_LITERAL@1..4 "div" [] []
2: HTML_ATTRIBUTE_LIST@4..4
3: R_ANGLE@4..5 ">" [] []
1: HTML_ELEMENT_LIST@5..55
0: HTML_BOGUS_ELEMENT@5..30
0: HTML_BOGUS@5..21
0: L_ANGLE@5..8 "<" [Newline("\n"), Whitespace("\t")] []
1: HTML_NAME@8..12
0: HTML_LITERAL@8..12 "div" [] [Whitespace(" ")]
2: HTML_BOGUS@12..20
0: HTML_ATTRIBUTE@12..18
0: HTML_NAME@12..17
0: HTML_LITERAL@12..17 "class" [] []
1: HTML_ATTRIBUTE_INITIALIZER_CLAUSE@17..18
0: EQ@17..18 "=" [] []
1: (empty)
1: HTML_BOGUS_ELEMENT@18..20
0: ERROR_TOKEN@18..20 "=" [] [Whitespace(" ")]
3: R_ANGLE@20..21 ">" [] []
1: HTML_ELEMENT_LIST@21..24
0: HTML_CONTENT@21..24
0: HTML_LITERAL@21..24 "foo" [] []
2: HTML_CLOSING_ELEMENT@24..30
0: L_ANGLE@24..25 "<" [] []
1: SLASH@25..26 "/" [] []
2: HTML_NAME@26..29
0: HTML_LITERAL@26..29 "div" [] []
3: R_ANGLE@29..30 ">" [] []
1: HTML_BOGUS_ELEMENT@30..55
0: HTML_BOGUS@30..46
0: L_ANGLE@30..33 "<" [Newline("\n"), Whitespace("\t")] []
1: HTML_NAME@33..37
0: HTML_LITERAL@33..37 "div" [] [Whitespace(" ")]
2: HTML_BOGUS@37..45
0: HTML_ATTRIBUTE@37..43
0: HTML_NAME@37..42
0: HTML_LITERAL@37..42 "class" [] []
1: HTML_ATTRIBUTE_INITIALIZER_CLAUSE@42..43
0: EQ@42..43 "=" [] []
1: (empty)
1: HTML_BOGUS_ELEMENT@43..45
0: ERROR_TOKEN@43..45 "?" [] [Whitespace(" ")]
3: R_ANGLE@45..46 ">" [] []
1: HTML_ELEMENT_LIST@46..49
0: HTML_CONTENT@46..49
0: HTML_LITERAL@46..49 "foo" [] []
2: HTML_CLOSING_ELEMENT@49..55
0: L_ANGLE@49..50 "<" [] []
1: SLASH@50..51 "/" [] []
2: HTML_NAME@51..54
0: HTML_LITERAL@51..54 "div" [] []
3: R_ANGLE@54..55 ">" [] []
2: HTML_CLOSING_ELEMENT@55..62
0: L_ANGLE@55..57 "<" [Newline("\n")] []
1: SLASH@57..58 "/" [] []
2: HTML_NAME@58..61
0: HTML_LITERAL@58..61 "div" [] []
3: R_ANGLE@61..62 ">" [] []
3: EOF@62..63 "" [Newline("\n")] []
```

## Diagnostics

```
invalid-unqouted-value1.html:2:13 parse ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
× Unexpected character in unquoted attribute value
1 │ <div>
> 2 │ <div class== >foo</div>
│ ^
3 │ <div class=? >foo</div>
4 │ </div>
invalid-unqouted-value1.html:2:13 parse ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
× Unexpected character `=`
1 │ <div>
> 2 │ <div class== >foo</div>
│ ^
3 │ <div class=? >foo</div>
4 │ </div>
invalid-unqouted-value1.html:3:13 parse ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
× Unexpected character in unquoted attribute value
1 │ <div>
2 │ <div class== >foo</div>
> 3 │ <div class=? >foo</div>
│ ^
4 │ </div>
5 │
invalid-unqouted-value1.html:3:13 parse ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
× Unexpected character `?`
1 │ <div>
2 │ <div class== >foo</div>
> 3 │ <div class=? >foo</div>
│ ^
4 │ </div>
5 │
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
<div>
<div class=foo"bar >foo</div>
<div class=foo'bar >foo</div>
</div>
Loading

0 comments on commit 5e73499

Please sign in to comment.