From b806551ade204a11928a60b9d624174c084c0682 Mon Sep 17 00:00:00 2001
From: Alexander Akait <4567934+alexander-akait@users.noreply.github.com>
Date: Mon, 1 Nov 2021 10:53:26 +0300
Subject: [PATCH] refactor(css/lexer): Refactor lexer to follow spec (#2593)

---
 css/parser/src/lexer/mod.rs                   | 1307 ++++++++++-------
 css/parser/tests/errors/escaped/eof/input.css |    2 +
 .../tests/errors/escaped/eof/output.stderr    |    8 +
 .../tests/errors/url/parenthesis/input.css    |    3 +
 .../errors/url/parenthesis/output.stderr      |   10 +
 .../tests/recovery/value/quotes/output.json   |   24 +-
 .../recovery/value/quotes/output.swc-stderr   |    6 +-
 7 files changed, 850 insertions(+), 510 deletions(-)
 create mode 100644 css/parser/tests/errors/escaped/eof/input.css
 create mode 100644 css/parser/tests/errors/escaped/eof/output.stderr
 create mode 100644 css/parser/tests/errors/url/parenthesis/input.css
 create mode 100644 css/parser/tests/errors/url/parenthesis/output.stderr

diff --git a/css/parser/src/lexer/mod.rs b/css/parser/src/lexer/mod.rs
index 052452c98926..198f648c9d34 100644
--- a/css/parser/src/lexer/mod.rs
+++ b/css/parser/src/lexer/mod.rs
@@ -87,21 +87,25 @@ impl<I> Lexer<I>
 where
     I: Input,
 {
-    fn read_token(&mut self) -> LexResult<Token> {
-        if self.input.cur().is_none() {
-            return Err(ErrorKind::Eof);
-        }
+    // #[inline]
+    // fn current_input_code_point(&mut self) -> Option<char> {
+    //     self.input.clone().nth(-1).map(|i| i.1)
+    // }
 
-        if self.input.is_byte(b'/') && self.input.peek() == Some('*') {
-            self.skip_block_comment()?;
-            self.skip_ws()?;
-            self.start_pos = self.input.cur_pos();
-
-            return self.read_token();
-        }
+    fn read_token(&mut self) -> LexResult<Token> {
+        // Consume comments.
+        // If the next two input code point are U+002F SOLIDUS (/) followed by a U+002A
+        // ASTERISK (*), consume them and all following code points up to and including
+        // the first U+002A ASTERISK (*) followed by a U+002F SOLIDUS (/), or up to an
+        // EOF code point. Return to the start of this step.
+        if self.input.cur() == Some('/') {
+            if self.input.peek() == Some('*') {
+                self.skip_block_comment()?;
+                self.skip_ws()?;
+                self.start_pos = self.input.cur_pos();
 
-        if self.config.allow_wrong_line_comments {
-            if self.input.is_byte(b'/') && self.input.peek() == Some('/') {
+                return self.read_token();
+            } else if self.config.allow_wrong_line_comments && self.input.peek() == Some('/') {
                 self.skip_line_comment()?;
                 self.start_pos = self.input.cur_pos();
@@ -109,413 +113,382 @@ where
             }
         }
 
-        macro_rules! try_delim {
-            ($b:tt,$tok:tt) => {{
-                if self.input.eat_byte($b) {
-                    return Ok(tok!($tok));
-                }
-            }};
-        }
-
-        // TODO: Consume the next input code point. https://www.w3.org/TR/css-syntax-3/#consume-token. We should use `self.input.bump()` and reconsume according spec
-
-        if let Some(c) = self.input.cur() {
-            if is_whitespace(c) {
-                let value = self.read_ws()?;
-
-                return Ok(Token::WhiteSpace {
-                    value: value.into(),
-                });
-            }
-        }
+        let start = self.input.cur_pos();
+        let next = self.input.cur();
 
-        if self.input.is_byte(b'"') {
-            return self.read_str(None);
-        }
+        // Consume the next input code point.
+        match next {
+            // whitespace
+            // Consume as much whitespace as possible. Return a <whitespace-token>.
+            Some(c) if is_whitespace(c) => {
+                let mut value = String::new();
-        if self.input.is_byte(b'#') {
-            let c = self.input.cur();
+                loop {
+                    let c = self.input.cur();
-            self.input.bump();
+                    match c {
+                        Some(c) if is_whitespace(c) => {
+                            self.input.bump();
-            let first = self.input.cur();
-            let second = self.input.peek();
+                            value.push(c);
+                        }
+                        _ => {
+                            break;
+                        }
+                    }
+                }
-            if is_name_continue(first.unwrap()) || self.is_valid_escape(first, second)? {
-                let third = self.input.peek_ahead();
-                let is_id = self.would_start_ident(first, second, third)?;
-                let name = self.read_name()?;
+                if self.config.allow_wrong_line_comments {
+                    if self.input.is_byte(b'/') && self.input.peek() == Some('/') {
+                        self.skip_line_comment()?;
+                        self.start_pos = self.input.cur_pos();
+                    }
+                }
-                return Ok(Token::Hash {
-                    is_id,
-                    value: name.0,
-                    raw: name.1,
+                return Ok(Token::WhiteSpace {
+                    value: value.into(),
                 });
             }
-
-            return Ok(Token::Delim { value: c.unwrap() });
-        }
-
-        if self.input.is_byte(b'\'') {
-            return self.read_str(None);
-        }
-
-        try_delim!(b'(', "(");
-
-        try_delim!(b')', ")");
-
-        if self.input.is_byte(b'+') {
-            let start = self.input.cur_pos();
-            let c = self.input.cur();
-
-            self.input.bump();
-
-            if self.would_start_number(None, None, None)? {
-                self.input.reset_to(start);
-
-                return self.read_numeric();
+            // U+0022 QUOTATION MARK (")
+            // Consume a string token and return it.
+            Some(c) if c == '"' => {
+                return self.read_str(None);
             }
+            // U+0023 NUMBER SIGN (#)
+            // If the next input code point is a name code point or the next two input code points
+            // are a valid escape, then:
+            Some(c) if c == '#' => {
+                // TODO: If the next input code point is a name code point or the next two input
+                // code points are a valid escape, then:
+                self.input.bump();
-            return Ok(Token::Delim { value: c.unwrap() });
-        }
-
-        try_delim!(b',', ",");
-
-        if self.input.is_byte(b'-') {
-            let start = self.input.cur_pos();
-            let c = self.input.cur();
+                let first = self.input.cur();
+                let second = self.input.peek();
-            self.input.bump();
+                if is_name(first.unwrap()) || self.is_valid_escape(first, second)? {
+                    let third = self.input.peek_ahead();
+                    let is_id = self.would_start_ident(first, second, third)?;
+                    let name = self.read_name()?;
-            if self.would_start_number(None, None, None)? {
-                self.input.reset_to(start);
+                    return Ok(Token::Hash {
+                        is_id,
+                        value: name.0,
+                        raw: name.1,
+                    });
+                }
-                return self.read_numeric();
-            } else if self.input.cur() == Some('-') && self.input.peek() == Some('>') {
-                self.input.bump();
+                return Ok(Token::Delim { value: c });
+            }
+            // U+0027 APOSTROPHE (')
+            // Consume a string token and return it.
+            Some(c) if c == '\'' => {
+                return self.read_str(None);
+            }
+            // U+0028 LEFT PARENTHESIS (()
+            // Return a <(-token>.
+            Some(c) if c == '(' => {
                 self.input.bump();
-                return Ok(Token::CDC);
-            } else if self.would_start_ident(None, None, None)? {
-                self.input.reset_to(start);
-
-                return self
-                    .read_name()
-                    .map(|(value, raw)| Token::Ident { value, raw });
+                return Ok(tok!("("));
             }
+            // U+0029 RIGHT PARENTHESIS ())
+            // Return a <)-token>.
+            Some(c) if c == ')' => {
+                self.input.bump();
-            return Ok(Token::Delim { value: c.unwrap() });
-        }
-
-        if self.input.is_byte(b'.') {
-            let start = self.input.cur_pos();
-            let c = self.input.cur();
-
-            self.input.bump();
-
-            if self.would_start_number(None, None, None)? {
-                self.input.reset_to(start);
-
-                return self.read_numeric();
+                return Ok(tok!(")"));
             }
+            // U+002B PLUS SIGN (+)
+            Some(c) if c == '+' => {
+                self.input.bump();
-            return Ok(Token::Delim { value: c.unwrap() });
-        }
-
-        try_delim!(b':', ":");
-
-        try_delim!(b';', ";");
-
-        if self.input.is_byte(b'<') {
-            let c = self.input.cur();
-
-            self.input.bump();
+                // If the input stream starts with a number, reconsume the current input code
+                // point, consume a numeric token and return it.
+                if self.would_start_number(None, None, None)? {
+                    self.input.reset_to(start);

[...]

+  --> $DIR/tests/errors/escaped/eof/input.css:2:12
+   |
+2 |   color: \
+   |          ^
+
diff --git a/css/parser/tests/errors/url/parenthesis/input.css b/css/parser/tests/errors/url/parenthesis/input.css
new file mode 100644
index 000000000000..f1645ab6c19c
--- /dev/null
+++ b/css/parser/tests/errors/url/parenthesis/input.css
@@ -0,0 +1,3 @@
+a {
+  background: url(test\);
+}
\ No newline at end of file
diff --git a/css/parser/tests/errors/url/parenthesis/output.stderr b/css/parser/tests/errors/url/parenthesis/output.stderr
new file mode 100644
index 000000000000..34950537def8
--- /dev/null
+++ b/css/parser/tests/errors/url/parenthesis/output.stderr
@@ -0,0 +1,10 @@
+error: Expected "}"
+
+error: Expected Declaration value
+  --> $DIR/tests/errors/url/parenthesis/input.css:2:17
+   |
+2 |   background: url(test\);
+   |  _________________^
+3 | | }
+   | |_^
+
diff --git a/css/parser/tests/recovery/value/quotes/output.json b/css/parser/tests/recovery/value/quotes/output.json
index 04f2f61ef7da..a8fc60abfcaa 100644
--- a/css/parser/tests/recovery/value/quotes/output.json
+++ b/css/parser/tests/recovery/value/quotes/output.json
@@ -136,25 +136,25 @@
     {
       "span": {
         "start": 25,
-        "end": 30,
+        "end": 29,
         "ctxt": 0
       },
       "token": {
         "BadStr": {
           "value": "tes",
-          "raw": "\"tes\n"
+          "raw": "\"tes"
         }
       }
     },
     {
       "span": {
-        "start": 30,
+        "start": 29,
         "end": 34,
         "ctxt": 0
       },
       "token": {
         "WhiteSpace": {
-          "value": " "
+          "value": "\n "
         }
       }
     },
@@ -174,13 +174,25 @@
     {
       "span": {
         "start": 35,
-        "end": 38,
+        "end": 37,
         "ctxt": 0
       },
       "token": {
         "BadStr": {
           "value": ";",
-          "raw": "\";\n"
+          "raw": "\";"
+        }
+      }
+    },
+    {
+      "span": {
+        "start": 37,
+        "end": 38,
+        "ctxt": 0
+      },
+      "token": {
+        "WhiteSpace": {
+          "value": "\n"
         }
       }
     }
diff --git a/css/parser/tests/recovery/value/quotes/output.swc-stderr b/css/parser/tests/recovery/value/quotes/output.swc-stderr
index 14584f269ff6..24b904093891 100644
--- a/css/parser/tests/recovery/value/quotes/output.swc-stderr
+++ b/css/parser/tests/recovery/value/quotes/output.swc-stderr
@@ -1,8 +1,6 @@
 error: Expected Declaration value
   --> $DIR/tests/recovery/value/quotes/input.css:2:14
   |
-2 |   content: "tes
-  |  ______________^
-3 | | t";
-  | |_
+2 |   content: "tes
+  |            ^^^^