Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(html): Handle whitespace in tags #19

Merged
merged 1 commit into from
Jul 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions src/html/parse/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -487,6 +487,44 @@ mod tests {
}
}

/// A start tag whose name is followed by a newline (instead of a space) and
/// whose attributes sit on separate lines must still parse as a `div` with
/// both attributes.
#[test]
fn parse_should_trim_newlines_out_of_start_tag_names() {
    // arrange: tag name and each attribute are separated by line breaks
    let html = r#"
<div
id="hi"
class="bye">
</div>
"#;

    // act
    let document = parse(html).unwrap();

    // assert: the root node is expected to be the <div> with its two attributes
    let expected_attributes = HashMap::from([("id", "hi"), ("class", "bye")]);
    assert_tag(
        &document,
        document.root_node,
        "div",
        Some(expected_attributes),
    );
}

/// An end tag whose closing `>` sits on the following line must still be
/// recognised, leaving a root `div` with no attributes to check.
#[test]
fn parse_should_trim_newlines_out_of_end_tag_names() {
    // arrange: the end tag name is followed by a newline before `>`
    let html = r#"
<div>
</div
>
"#;

    // act
    let document = parse(html).unwrap();

    // assert: the root node is expected to be the <div>; no attributes are compared
    assert_tag(&document, document.root_node, "div", None);
}

#[test]
fn parse_should_handle_attributes_without_value() {
// arrange
Expand Down
63 changes: 44 additions & 19 deletions src/html/tokenizer/helpers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use crate::vecpointer::VecPointerRef;

use super::Token;

/// Checks if the [TextPointer](TextPointer) is currently pointing to a StartTag [Symbol](Symbol).
/// Checks if the [VecPointerRef](VecPointerRef) is currently pointing to a StartTag [Token](Token).
/// If true it will move the text pointer to the next symbol, otherwise it will not change the pointer.
///
/// StartTag is defined as `<{{String}}`
Expand All @@ -16,6 +16,7 @@ pub fn is_start_tag(pointer: &mut VecPointerRef<char>) -> Option<Token> {
loop {
match pointer.next() {
Some(' ') | Some('>') | Some('/') => break,
Some(c) if c.is_whitespace() => break,
Some(c) => {
name.push(*c);
}
Expand All @@ -32,7 +33,7 @@ pub fn is_start_tag(pointer: &mut VecPointerRef<char>) -> Option<Token> {
None
}

/// Checks if the [TextPointer](TextPointer) is currently pointing to an EndTag [Symbol](Symbol).
/// Checks if the [VecPointerRef](VecPointerRef) is currently pointing to an EndTag [Token](Token).
/// If true it will move the text pointer to the next symbol, otherwise it will not change the pointer.
///
/// EndTag is defined as `</{{String}}`
Expand All @@ -44,6 +45,7 @@ pub fn is_end_tag(pointer: &mut VecPointerRef<char>) -> Option<Token> {
loop {
match pointer.next() {
Some(' ') | Some('>') => break,
Some(c) if c.is_whitespace() => break,
Some(c) => {
name.push(*c);
}
Expand All @@ -57,7 +59,7 @@ pub fn is_end_tag(pointer: &mut VecPointerRef<char>) -> Option<Token> {
None
}

/// Checks if the [TextPointer](TextPointer) is currently pointing to a Comment [Symbol](Symbol).
/// Checks if the [VecPointerRef](VecPointerRef) is currently pointing to a Comment [Token](Token).
/// If true it will move the text pointer to the next symbol, otherwise it will not change the pointer.
///
/// Comment is defined as `<!--{{String}}-->`
Expand All @@ -83,7 +85,7 @@ pub fn is_comment(pointer: &mut VecPointerRef<char>) -> Option<Token> {
None
}

/// Checks if the [TextPointer](TextPointer) is currently pointing to the end of a Comment [Symbol](Symbol).
/// Checks if the [VecPointerRef](VecPointerRef) is currently pointing to the end of a Comment [Token](Token).
/// If true it will move the text pointer to the next symbol, otherwise it will not change the pointer.
///
/// This is a helper method not used directly in the lexer.
Expand All @@ -100,7 +102,7 @@ pub fn is_end_comment(pointer: &mut VecPointerRef<char>) -> bool {
false
}

/// Checks if the [TextPointer](TextPointer) is currently pointing to a TagClose [Symbol](Symbol).
/// Checks if the [VecPointerRef](VecPointerRef) is currently pointing to a TagClose [Token](Token).
/// If true it will move the text pointer to the next symbol, otherwise it will not change the pointer.
///
/// TagClose is defined as `>`
Expand All @@ -112,7 +114,7 @@ pub fn is_tag_close(pointer: &mut VecPointerRef<char>) -> Option<Token> {
None
}

/// Checks if the [TextPointer](TextPointer) is currently pointing to a TagCloseAndEnd [Symbol](Symbol).
/// Checks if the [VecPointerRef](VecPointerRef) is currently pointing to a TagCloseAndEnd [Token](Token).
/// If true it will move the text pointer to the next symbol, otherwise it will not change the pointer.
///
/// TagCloseAndEnd is defined as `/>`
Expand All @@ -124,7 +126,7 @@ pub fn is_tag_close_and_end(pointer: &mut VecPointerRef<char>) -> Option<Token>
None
}

/// Checks if the [TextPointer](TextPointer) is currently pointing to a AssignmentSign [Symbol](Symbol).
/// Checks if the [VecPointerRef](VecPointerRef) is currently pointing to an AssignmentSign [Token](Token).
/// If true it will move the text pointer to the next symbol, otherwise it will not change the pointer.
///
/// AssignmentSign is defined as `=`
Expand All @@ -136,7 +138,7 @@ pub fn is_assignment_sign(pointer: &mut VecPointerRef<char>) -> Option<Token> {
None
}

/// Checks if the [TextPointer](TextPointer) is currently pointing to a Literal [Symbol](Symbol).
/// Checks if the [VecPointerRef](VecPointerRef) is currently pointing to a Literal [Token](Token).
/// If true it will move the text pointer to the next symbol, otherwise it will not change the pointer.
///
/// Literal is defined as `"{{String}}"` inside a tag definition.
Expand Down Expand Up @@ -180,25 +182,29 @@ pub fn is_literal(pointer: &mut VecPointerRef<char>, has_open_tag: bool) -> Opti
}

lazy_static! {
/// List of characters that end an Identifier [Symbol](Symbol).
static ref INAVLID_ID_CHARS: Vec<char> = vec![' ', '<', '>', '/', '=', '"'];
/// List of characters that end an Identifier [Token](Token).
static ref INAVLID_ID_CHARS: Vec<char> = vec!['<', '>', '/', '=', '"'];
}

/// Checks if the [TextPointer](TextPointer) is currently pointing to a Identifier [Symbol](Symbol).
/// Checks if the [VecPointerRef](VecPointerRef) is currently pointing to an Identifier [Token](Token).
/// If true it will move the text pointer to the next symbol, otherwise it will not change the pointer.
///
/// Identifier is defined as any text inside a tag definition.
pub fn is_identifier(pointer: &mut VecPointerRef<char>, has_open_tag: bool) -> Option<Token> {
fn valid_char(c: &char) -> bool {
!c.is_whitespace() && !INAVLID_ID_CHARS.contains(c)
}

if !has_open_tag {
return None;
}

if let Some(c) = pointer.current() {
if !INAVLID_ID_CHARS.contains(c) {
if valid_char(c) {
let mut text: Vec<char> = vec![*c];
loop {
match pointer.next() {
Some(c) if INAVLID_ID_CHARS.contains(c) => break,
Some(c) if !valid_char(c) => break,
Some(c) => {
text.push(*c);
}
Expand All @@ -214,11 +220,15 @@ pub fn is_identifier(pointer: &mut VecPointerRef<char>, has_open_tag: bool) -> O
None
}

/// Checks if the [TextPointer](TextPointer) is currently pointing to a Text [Symbol](Symbol).
/// Checks if the [VecPointerRef](VecPointerRef) is currently pointing to a Text [Token](Token).
/// If true it will move the text pointer to the next symbol, otherwise it will not change the pointer.
///
/// Text is defined as any text outside a tag definition.
pub fn is_text(pointer: &mut VecPointerRef<char>, has_open_tag: bool, in_script_tag: bool) -> Option<Token> {
pub fn is_text(
pointer: &mut VecPointerRef<char>,
has_open_tag: bool,
in_script_tag: bool,
) -> Option<Token> {
if has_open_tag {
return None;
}
Expand All @@ -238,7 +248,6 @@ pub fn is_text(pointer: &mut VecPointerRef<char>, has_open_tag: bool, in_script_

// In a script tag the *only* thing that can end a text is an end script tag.
if in_script_tag {

if let Some(end_tag) = is_end_tag(pointer) {
match end_tag {
Token::EndTag(end_tag) => {
Expand All @@ -247,8 +256,11 @@ pub fn is_text(pointer: &mut VecPointerRef<char>, has_open_tag: bool, in_script_
pointer.index = pointer_index;
break;
}
},
token => panic!("is_end_tag returned {:?} instead of Token::EndTag", token)
}
token => panic!(
"is_end_tag returned {:?} instead of Token::EndTag",
token
),
}
}
} else {
Expand All @@ -264,7 +276,7 @@ pub fn is_text(pointer: &mut VecPointerRef<char>, has_open_tag: bool, in_script_
// If the loop hasn't been broken at this point, add the '<' and move on.
pointer.index = pointer_index;
buffer.push('<');
},
}
Some(c) => {
if !c.is_whitespace() {
has_non_whitespace = true;
Expand Down Expand Up @@ -581,6 +593,19 @@ mod tests {
assert_eq!(0, pointer.index);
}

/// A lone newline inside an open tag must not be reported as an identifier.
#[test]
fn is_identifier_should_not_match_newline() {
    // arrange: the input consists of a single newline character
    let input: Vec<char> = vec!['\n'];
    let mut cursor = VecPointerRef::new(&input);

    // act: `true` signals that we are inside an open tag
    let outcome = is_identifier(&mut cursor, true);

    // assert
    assert!(outcome.is_none());
}

#[test]
fn is_text_works() {
// arrange
Expand Down
37 changes: 32 additions & 5 deletions src/html/tokenizer/mod.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
mod tokens;
mod helpers;
mod tokens;

use crate::vecpointer::VecPointerRef;
pub use tokens::Token;
use log::error;
use thiserror::Error;
pub use tokens::Token;

#[derive(Error, Debug)]
pub enum LexError {}
Expand All @@ -30,8 +31,11 @@ pub fn lex(text: &str) -> Result<Vec<Token>, LexError> {
if start_tag == "script" {
in_script_tag = true;
}
},
token => panic!("is_start_tag returned {:?} instead of Token::StartTag", token)
}
token => panic!(
"is_start_tag returned {:?} instead of Token::StartTag",
token
),
}

symbols.push(s);
Expand All @@ -58,7 +62,7 @@ pub fn lex(text: &str) -> Result<Vec<Token>, LexError> {
if let Some(c) = pointer.current() {
if !c.is_whitespace() {
// Unknown symbol, move on ¯\_(ツ)_/¯
eprintln!("Unknown HTML symbol {}", c);
error!("Unknown HTML symbol {}", c);
}
}
pointer.next();
Expand Down Expand Up @@ -133,6 +137,29 @@ mod tests {
assert_eq!(expected, result);
}

/// Lexing an end tag whose `>` is on the next line must still yield the
/// `EndTag` followed by `TagClose`, with no stray tokens in between.
#[test]
fn lex_should_handle_end_tag_with_whitespace() {
    // arrange: `</node` and its closing `>` are split across two lines
    let html = r#"
<node>1</node
>
"#;

    // act
    let tokens = lex(html).unwrap();

    // assert: the whole token stream is compared, in order
    let expected = vec![
        Token::StartTag("node".to_string()),
        Token::TagClose,
        Token::Text("1".to_string()),
        Token::EndTag("node".to_string()),
        Token::TagClose,
    ];
    assert_eq!(expected, tokens);
}

#[test]
fn lex_works() {
// arrange
Expand Down