Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Merged by Bors] - Implement HTML comments and gate behind the annex-b feature #2817

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion boa_engine/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ trace = []
console = []

# Enable Boa's additional ECMAScript features for web browsers.
annex-b = []
annex-b = ["boa_parser/annex-b"]

[dependencies]
boa_interner.workspace = true
Expand Down
5 changes: 2 additions & 3 deletions boa_engine/src/builtins/function/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -614,10 +614,9 @@ impl BuiltInFunctionObject {
} else {
let mut parameters = Vec::with_capacity(args.len());
for arg in args {
parameters.push(arg.to_string(context)?.as_slice().to_owned());
parameters.push(arg.to_string(context)?);
}
let mut parameters = parameters.join(utf16!(","));
parameters.push(u16::from(b')'));
let parameters = parameters.join(utf16!(","));

// TODO: make parser generic to u32 iterators
let parameters =
Expand Down
3 changes: 3 additions & 0 deletions boa_parser/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,6 @@ num-traits = "0.2.15"
bitflags = "2.1.0"
num-bigint = "0.4.3"
regress = "0.5.0"

[features]
annex-b = []
2 changes: 1 addition & 1 deletion boa_parser/src/lexer/comment.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ impl<R> Tokenizer<R> for MultiLineComment {
}
}

///Lexes a first line Hashbang comment
/// Lexes a first line Hashbang comment
///
/// More information:
/// - [ECMAScript reference][spec]
Expand Down
28 changes: 21 additions & 7 deletions boa_parser/src/lexer/cursor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ use std::io::{self, Bytes, Error, ErrorKind, Read};
pub(super) struct Cursor<R> {
iter: InnerIter<R>,
pos: Position,
strict_mode: bool,
module: bool,
strict: bool,
}

impl<R> Cursor<R> {
Expand All @@ -31,13 +32,24 @@ impl<R> Cursor<R> {
}

/// Returns if strict mode is currently active.
pub(super) const fn strict_mode(&self) -> bool {
self.strict_mode
pub(super) const fn strict(&self) -> bool {
self.strict
}

/// Sets the current strict mode.
pub(super) fn set_strict_mode(&mut self, strict_mode: bool) {
self.strict_mode = strict_mode;
pub(super) fn set_strict(&mut self, strict: bool) {
self.strict = strict;
}

/// Returns if the module mode is currently active.
pub(super) const fn module(&self) -> bool {
self.module
}

/// Sets the current goal symbol to module.
pub(super) fn set_module(&mut self, module: bool) {
self.module = module;
self.strict = module;
}
}

Expand All @@ -50,7 +62,8 @@ where
Self {
iter: InnerIter::new(inner.bytes()),
pos: Position::new(1, 1),
strict_mode: false,
strict: false,
module: false,
}
}

Expand All @@ -59,7 +72,8 @@ where
Self {
iter: InnerIter::new(inner.bytes()),
pos,
strict_mode: false,
strict: false,
module: false,
}
}

Expand Down
112 changes: 88 additions & 24 deletions boa_parser/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,23 +71,6 @@ pub struct Lexer<R> {
}

impl<R> Lexer<R> {
/// Checks if a character is whitespace as per ECMAScript standards.
///
/// The Rust `char::is_whitespace` function and the ECMAScript standard use different sets of
/// characters as whitespaces:
/// * Rust uses `\p{White_Space}`,
/// * ECMAScript standard uses `\{Space_Separator}` + `\u{0009}`, `\u{000B}`, `\u{000C}`, `\u{FEFF}`
///
/// [More information](https://tc39.es/ecma262/#table-32)
const fn is_whitespace(ch: u32) -> bool {
matches!(
ch,
0x0020 | 0x0009 | 0x000B | 0x000C | 0x00A0 | 0xFEFF |
// Unicode Space_Seperator category (minus \u{0020} and \u{00A0} which are allready stated above)
0x1680 | 0x2000..=0x200A | 0x202F | 0x205F | 0x3000
)
}

/// Sets the goal symbol for the lexer.
pub(crate) fn set_goal(&mut self, elm: InputElement) {
self.goal_symbol = elm;
Expand All @@ -99,13 +82,23 @@ impl<R> Lexer<R> {
}

/// Returns if strict mode is currently active.
pub(super) const fn strict_mode(&self) -> bool {
self.cursor.strict_mode()
pub(super) const fn strict(&self) -> bool {
self.cursor.strict()
}

/// Sets the current strict mode.
pub(super) fn set_strict_mode(&mut self, strict_mode: bool) {
self.cursor.set_strict_mode(strict_mode);
pub(super) fn set_strict(&mut self, strict: bool) {
self.cursor.set_strict(strict);
}

/// Returns if module mode is currently active.
pub(super) const fn module(&self) -> bool {
self.cursor.module()
}

/// Signals that the goal symbol is a module
pub(super) fn set_module(&mut self, module: bool) {
self.cursor.set_module(module);
}

/// Creates a new lexer.
Expand Down Expand Up @@ -180,14 +173,38 @@ impl<R> Lexer<R> {
}
}

/// Skips an HTML close comment (`-->`) if the `annex-b` feature is enabled.
///
/// Annex B HTML-like comments only apply when the goal symbol is a script,
/// so this is a no-op when the `annex-b` feature is off or when lexing a
/// module. Leading ECMAScript whitespace before the `-->` marker is
/// consumed, and when the marker is found the rest of the line is lexed
/// (and discarded) as a single-line comment.
///
/// # Errors
///
/// Returns an `Err` if reading from the underlying cursor fails.
pub(crate) fn skip_html_close(&mut self, interner: &mut Interner) -> Result<(), Error>
where
R: Read,
{
// HTML close comments are a Script-only Annex B extension; bail out when
// the feature is disabled or the goal symbol is a module.
if !cfg!(feature = "annex-b") || self.module() {
return Ok(());
}

// Consume any ECMAScript whitespace that may precede the `-->` marker.
while self.cursor.peek_char()?.map_or(false, is_whitespace) {
let _next = self.cursor.next_char();
}

// Peek (non-consuming) for the literal `-->` byte sequence.
if self.cursor.peek_n(3)? == [b'-', b'-', b'>'] {
// Consume the three marker bytes: `-`, `-`, `>`.
let _next = self.cursor.next_byte();
let _next = self.cursor.next_byte();
let _next = self.cursor.next_byte();

// Treat the remainder of the line as comment text; lexing it as a
// single-line comment keeps the cursor position consistent.
let start = self.cursor.pos();
SingleLineComment.lex(&mut self.cursor, start, interner)?;
}

Ok(())
}

/// Retrieves the next token from the lexer.
///
/// # Errors
///
/// Will return `Err` on invalid tokens and invalid reads of the bytes being lexed.
// We intentionally don't implement Iterator trait as Result<Option> is cleaner to handle.
#[allow(clippy::should_implement_trait)]
pub fn next(&mut self, interner: &mut Interner) -> Result<Option<Token>, Error>
pub(crate) fn next_no_skip(&mut self, interner: &mut Interner) -> Result<Option<Token>, Error>
where
R: Read,
{
Expand All @@ -197,7 +214,7 @@ impl<R> Lexer<R> {
let start = self.cursor.pos();
if let Some(next_ch) = self.cursor.next_char()? {
// Ignore whitespace
if !Self::is_whitespace(next_ch) {
if !is_whitespace(next_ch) {
break (start, next_ch);
}
} else {
Expand Down Expand Up @@ -269,6 +286,14 @@ impl<R> Lexer<R> {
)),
'#' => PrivateIdentifier::new().lex(&mut self.cursor, start, interner),
'/' => self.lex_slash_token(start, interner),
#[cfg(feature = "annex-b")]
'<' if !self.module() && self.cursor.peek_n(3)? == [b'!', b'-', b'-'] => {
let _next = self.cursor.next_byte();
let _next = self.cursor.next_byte();
let _next = self.cursor.next_byte();
let start = self.cursor.pos();
SingleLineComment.lex(&mut self.cursor, start, interner)
}
#[allow(clippy::cast_possible_truncation)]
'=' | '*' | '+' | '-' | '%' | '|' | '&' | '^' | '<' | '>' | '!' | '~' | '?' => {
Operator::new(next_ch as u8).lex(&mut self.cursor, start, interner)
Expand Down Expand Up @@ -311,6 +336,28 @@ impl<R> Lexer<R> {
}
}

/// Retrieves the next token from the lexer, skipping comments.
///
/// Comment tokens produced by the underlying lexing step are silently
/// discarded; the first non-comment token (or the end of input) is returned.
///
/// # Errors
///
/// Will return `Err` on invalid tokens and invalid reads of the bytes being lexed.
// We intentionally don't implement Iterator trait as Result<Option> is cleaner to handle.
#[allow(clippy::should_implement_trait)]
pub fn next(&mut self, interner: &mut Interner) -> Result<Option<Token>, Error>
where
R: Read,
{
loop {
match self.next_no_skip(interner)? {
// Comments carry no syntactic meaning to callers; fetch another token.
Some(token) if token.kind() == &TokenKind::Comment => {}
// Either a real token or the end of the input stream.
other => return Ok(other),
}
}
}

/// Performs the lexing of a template literal.
pub(crate) fn lex_template(
&mut self,
Expand Down Expand Up @@ -339,3 +386,20 @@ impl Default for InputElement {
Self::RegExp
}
}

/// Checks if a character is whitespace as per ECMAScript standards.
///
/// The Rust `char::is_whitespace` function and the ECMAScript standard use different sets of
/// characters as whitespaces:
/// * Rust uses `\p{White_Space}`,
/// * ECMAScript standard uses `\{Space_Separator}` + `\u{0009}`, `\u{000B}`, `\u{000C}`, `\u{FEFF}`
///
/// [More information](https://tc39.es/ecma262/#table-32)
const fn is_whitespace(ch: u32) -> bool {
    matches!(
        ch,
        0x0020 | 0x0009 | 0x000B | 0x000C | 0x00A0 | 0xFEFF |
        // Unicode Space_Separator category (minus \u{0020} and \u{00A0} which are already stated above)
        0x1680 | 0x2000..=0x200A | 0x202F | 0x205F | 0x3000
    )
}
4 changes: 2 additions & 2 deletions boa_parser/src/lexer/number.rs
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ impl<R> Tokenizer<R> for NumberLiteral {
let ch = char::from(byte);
if ch.is_digit(8) {
// LegacyOctalIntegerLiteral, or a number with leading 0s.
if cursor.strict_mode() {
if cursor.strict() {
// LegacyOctalIntegerLiteral is forbidden with strict mode true.
return Err(Error::syntax(
"implicit octal literals are not allowed in strict mode",
Expand All @@ -278,7 +278,7 @@ impl<R> Tokenizer<R> for NumberLiteral {
// Indicates a numerical digit comes after then 0 but it isn't an octal digit
// so therefore this must be a number with an unneeded leading 0. This is
// forbidden in strict mode.
if cursor.strict_mode() {
if cursor.strict() {
return Err(Error::syntax(
"leading 0's are not allowed in strict mode",
start_pos,
Expand Down
12 changes: 6 additions & 6 deletions boa_parser/src/lexer/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ impl<R> Tokenizer<R> for StringLiteral {
let _timer = Profiler::global().start_event("StringLiteral", "Lexing");

let (lit, span, escape_sequence) =
Self::take_string_characters(cursor, start_pos, self.terminator, cursor.strict_mode())?;
Self::take_string_characters(cursor, start_pos, self.terminator, cursor.strict())?;

Ok(Token::new(
TokenKind::string_literal(interner.get_or_intern(&lit[..]), escape_sequence),
Expand All @@ -116,7 +116,7 @@ impl StringLiteral {
cursor: &mut Cursor<R>,
start_pos: Position,
terminator: StringTerminator,
is_strict_mode: bool,
strict: bool,
) -> Result<(Vec<u16>, Span, Option<EscapeSequence>), Error>
where
R: Read,
Expand All @@ -139,7 +139,7 @@ impl StringLiteral {
Self::take_escape_sequence_or_line_continuation(
cursor,
ch_start_pos,
is_strict_mode,
strict,
false,
)?
{
Expand Down Expand Up @@ -167,7 +167,7 @@ impl StringLiteral {
pub(super) fn take_escape_sequence_or_line_continuation<R>(
cursor: &mut Cursor<R>,
start_pos: Position,
is_strict_mode: bool,
strict: bool,
is_template_literal: bool,
) -> Result<Option<(u32, Option<EscapeSequence>)>, Error>
where
Expand Down Expand Up @@ -208,7 +208,7 @@ impl StringLiteral {
"\\8 and \\9 are not allowed in template literal",
start_pos,
));
} else if is_strict_mode {
} else if strict {
return Err(Error::syntax(
"\\8 and \\9 are not allowed in strict mode",
start_pos,
Expand All @@ -224,7 +224,7 @@ impl StringLiteral {
));
}

if is_strict_mode {
if strict {
return Err(Error::syntax(
"octal escape sequences are not allowed in strict mode",
start_pos,
Expand Down
4 changes: 2 additions & 2 deletions boa_parser/src/lexer/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1048,7 +1048,7 @@ fn string_legacy_octal_escape() {
for (s, _) in &test_cases {
let mut lexer = Lexer::new(s.as_bytes());
let interner = &mut Interner::default();
lexer.set_strict_mode(true);
lexer.set_strict(true);

if let Error::Syntax(_, pos) = lexer
.next(interner)
Expand Down Expand Up @@ -1096,7 +1096,7 @@ fn string_non_octal_decimal_escape() {
for (s, _) in &test_cases {
let mut lexer = Lexer::new(s.as_bytes());
let interner = &mut Interner::default();
lexer.set_strict_mode(true);
lexer.set_strict(true);

if let Error::Syntax(_, pos) = lexer
.next(interner)
Expand Down
25 changes: 18 additions & 7 deletions boa_parser/src/parser/cursor/buffered_lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,12 +100,20 @@ where
.map_err(Error::from)
}

pub(super) const fn strict_mode(&self) -> bool {
self.lexer.strict_mode()
pub(super) const fn strict(&self) -> bool {
self.lexer.strict()
}

pub(super) fn set_strict_mode(&mut self, strict_mode: bool) {
self.lexer.set_strict_mode(strict_mode);
pub(super) fn set_strict(&mut self, strict: bool) {
self.lexer.set_strict(strict);
}

pub(super) const fn module(&self) -> bool {
self.lexer.module()
}

pub(super) fn set_module(&mut self, module: bool) {
self.lexer.set_module(module);
}

/// Fills the peeking buffer with the next token.
Expand All @@ -124,10 +132,13 @@ where
// We don't want to have multiple contiguous line terminators in the buffer, since
// they have no meaning.
let next = loop {
let next = self.lexer.next(interner)?;
self.lexer.skip_html_close(interner)?;
let next = self.lexer.next_no_skip(interner)?;
if let Some(ref token) = next {
if token.kind() != &TokenKind::LineTerminator {
break next;
match token.kind() {
TokenKind::LineTerminator => { /* skip */ }
TokenKind::Comment => self.lexer.skip_html_close(interner)?,
_ => break next,
}
} else {
break None;
Expand Down
Loading