Skip to content

Commit

Permalink
Add early errors for escaped identifiers (#2546)
Browse files Browse the repository at this point in the history
This Pull Request changes the following:

- Add early errors for escaped characters in object and class setters and getters.
- Add early errors for escaped characters in class `static`.
- Add early errors for escaped characters in `new.target`.
- Add early errors for legacy octal/decimal escapes that are used in string literals before a `"use strict"` directive.
  • Loading branch information
raskad committed Jan 28, 2023
1 parent aa8e0c5 commit bd0652f
Show file tree
Hide file tree
Showing 13 changed files with 231 additions and 90 deletions.
9 changes: 7 additions & 2 deletions boa_parser/src/lexer/identifier.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
//! This module implements lexing for identifiers (foo, myvar, etc.) used in ECMAScript.

use crate::lexer::{Cursor, Error, StringLiteral, Token, TokenKind, Tokenizer};
use crate::lexer::{
token::ContainsEscapeSequence, Cursor, Error, StringLiteral, Token, TokenKind, Tokenizer,
};
use boa_ast::{Keyword, Position, Span};
use boa_interner::Interner;
use boa_profiler::Profiler;
Expand Down Expand Up @@ -71,7 +73,10 @@ impl<R> Tokenizer<R> for Identifier {
Ok(Keyword::False) => TokenKind::BooleanLiteral(false),
Ok(Keyword::Null) => TokenKind::NullLiteral,
Ok(keyword) => TokenKind::Keyword((keyword, contains_escaped_chars)),
_ => TokenKind::identifier(interner.get_or_intern(identifier_name.as_str())),
_ => TokenKind::Identifier((
interner.get_or_intern(identifier_name.as_str()),
ContainsEscapeSequence(contains_escaped_chars),
)),
};

Ok(Token::new(token_kind, Span::new(start_pos, cursor.pos())))
Expand Down
61 changes: 33 additions & 28 deletions boa_parser/src/lexer/string.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
//! Boa's lexing for ECMAScript string literals.

use crate::lexer::{Cursor, Error, Token, TokenKind, Tokenizer};
use crate::lexer::{token::EscapeSequence, Cursor, Error, Token, TokenKind, Tokenizer};
use boa_ast::{Position, Span};
use boa_interner::Interner;
use boa_profiler::Profiler;
Expand Down Expand Up @@ -88,11 +88,11 @@ impl<R> Tokenizer<R> for StringLiteral {
{
let _timer = Profiler::global().start_event("StringLiteral", "Lexing");

let (lit, span) =
let (lit, span, escape_sequence) =
Self::take_string_characters(cursor, start_pos, self.terminator, cursor.strict_mode())?;

Ok(Token::new(
TokenKind::string_literal(interner.get_or_intern(&lit[..])),
TokenKind::string_literal(interner.get_or_intern(&lit[..]), escape_sequence),
span,
))
}
Expand All @@ -117,11 +117,13 @@ impl StringLiteral {
start_pos: Position,
terminator: StringTerminator,
is_strict_mode: bool,
) -> Result<(Vec<u16>, Span), Error>
) -> Result<(Vec<u16>, Span, Option<EscapeSequence>), Error>
where
R: Read,
{
let mut buf = Vec::new();
let mut escape_sequence = None;

loop {
let ch_start_pos = cursor.pos();
let ch = cursor.next_char()?;
Expand All @@ -133,12 +135,15 @@ impl StringLiteral {
let _timer =
Profiler::global().start_event("StringLiteral - escape sequence", "Lexing");

if let Some(escape_value) = Self::take_escape_sequence_or_line_continuation(
cursor,
ch_start_pos,
is_strict_mode,
false,
)? {
if let Some((escape_value, escape)) =
Self::take_escape_sequence_or_line_continuation(
cursor,
ch_start_pos,
is_strict_mode,
false,
)?
{
escape_sequence = escape_sequence.or(escape);
buf.push_code_point(escape_value);
}
}
Expand All @@ -156,15 +161,15 @@ impl StringLiteral {
}
}

Ok((buf, Span::new(start_pos, cursor.pos())))
Ok((buf, Span::new(start_pos, cursor.pos()), escape_sequence))
}

pub(super) fn take_escape_sequence_or_line_continuation<R>(
cursor: &mut Cursor<R>,
start_pos: Position,
is_strict_mode: bool,
is_template_literal: bool,
) -> Result<Option<u32>, Error>
) -> Result<Option<(u32, Option<EscapeSequence>)>, Error>
where
R: Read,
{
Expand All @@ -176,25 +181,25 @@ impl StringLiteral {
})?;

let escape_value = match escape_ch {
0x0062 /* b */ => Some(0x0008 /* <BS> */),
0x0074 /* t */ => Some(0x0009 /* <HT> */),
0x006E /* n */ => Some(0x000A /* <LF> */),
0x0076 /* v */ => Some(0x000B /* <VT> */),
0x0066 /* f */ => Some(0x000C /* <FF> */),
0x0072 /* r */ => Some(0x000D /* <CR> */),
0x0022 /* " */ => Some(0x0022 /* " */),
0x0027 /* ' */ => Some(0x0027 /* ' */),
0x005C /* \ */ => Some(0x005C /* \ */),
0x0062 /* b */ => Some((0x0008 /* <BS> */, None)),
0x0074 /* t */ => Some((0x0009 /* <HT> */, None)),
0x006E /* n */ => Some((0x000A /* <LF> */, None)),
0x0076 /* v */ => Some((0x000B /* <VT> */, None)),
0x0066 /* f */ => Some((0x000C /* <FF> */, None)),
0x0072 /* r */ => Some((0x000D /* <CR> */, None)),
0x0022 /* " */ => Some((0x0022 /* " */, None)),
0x0027 /* ' */ => Some((0x0027 /* ' */, None)),
0x005C /* \ */ => Some((0x005C /* \ */, None)),
0x0030 /* 0 */ if cursor
.peek()?
.filter(|next_byte| (b'0'..=b'9').contains(next_byte))
.is_none() =>
Some(0x0000 /* NULL */),
Some((0x0000 /* NULL */, None)),
0x0078 /* x */ => {
Some(Self::take_hex_escape_sequence(cursor, start_pos)?)
Some((Self::take_hex_escape_sequence(cursor, start_pos)?, None))
}
0x0075 /* u */ => {
Some(Self::take_unicode_escape_sequence(cursor, start_pos)?)
Some((Self::take_unicode_escape_sequence(cursor, start_pos)?, None))
}
0x0038 /* 8 */ | 0x0039 /* 9 */ => {
// Grammar: NonOctalDecimalEscapeSequence
Expand All @@ -209,7 +214,7 @@ impl StringLiteral {
start_pos,
));
}
Some(escape_ch)
Some((escape_ch, Some(EscapeSequence::NonOctalDecimal)))
}
_ if (0x0030..=0x0037 /* '0'..='7' */).contains(&escape_ch) => {
if is_template_literal {
Expand All @@ -226,10 +231,10 @@ impl StringLiteral {
));
}

Some(Self::take_legacy_octal_escape_sequence(
Some((Self::take_legacy_octal_escape_sequence(
cursor,
escape_ch.try_into().expect("an ascii char must not fail to convert"),
)?)
)?, Some(EscapeSequence::LegacyOctal)))
}
_ if Self::is_line_terminator(escape_ch) => {
// Grammar: LineContinuation
Expand All @@ -238,7 +243,7 @@ impl StringLiteral {
None
}
_ => {
Some(escape_ch)
Some((escape_ch, None))
}
};

Expand Down
2 changes: 1 addition & 1 deletion boa_parser/src/lexer/template.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ impl TemplateString {
true,
)?;

if let Some(escape_value) = escape_value {
if let Some((escape_value, _)) = escape_value {
buf.push_code_point(escape_value);
}
}
Expand Down
48 changes: 32 additions & 16 deletions boa_parser/src/lexer/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
#![allow(clippy::indexing_slicing)]

use crate::lexer::{
template::TemplateString, token::Numeric, Cursor, Error, Interner, Lexer, Position, Punctuator,
Read, Span, TokenKind,
template::TemplateString,
token::{ContainsEscapeSequence, EscapeSequence, Numeric},
Cursor, Error, Interner, Lexer, Position, Punctuator, Read, Span, TokenKind,
};
use boa_ast::Keyword;
use boa_interner::Sym;
Expand Down Expand Up @@ -94,9 +95,18 @@ fn check_identifier() {
TokenKind::identifier(
interner.get_or_intern_static("x\u{200C}\u{200D}", utf16!("x\u{200C}\u{200D}")),
),
TokenKind::identifier(interner.get_or_intern_static("x", utf16!("x"))),
TokenKind::identifier(interner.get_or_intern_static("xx", utf16!("xx"))),
TokenKind::identifier(interner.get_or_intern_static("xxx", utf16!("xxx"))),
TokenKind::Identifier((
interner.get_or_intern_static("x", utf16!("x")),
ContainsEscapeSequence(true),
)),
TokenKind::Identifier((
interner.get_or_intern_static("xx", utf16!("xx")),
ContainsEscapeSequence(true),
)),
TokenKind::Identifier((
interner.get_or_intern_static("xxx", utf16!("xxx")),
ContainsEscapeSequence(true),
)),
];

expect_tokens(&mut lexer, &expected, interner);
Expand Down Expand Up @@ -141,8 +151,8 @@ fn check_string() {
let a_sym = interner.get_or_intern_static("aaa", utf16!("aaa"));
let b_sym = interner.get_or_intern_static("bbb", utf16!("bbb"));
let expected = [
TokenKind::string_literal(a_sym),
TokenKind::string_literal(b_sym),
TokenKind::string_literal(a_sym, None),
TokenKind::string_literal(b_sym, None),
];

expect_tokens(&mut lexer, &expected, interner);
Expand Down Expand Up @@ -305,7 +315,7 @@ fn check_variable_definition_tokens() {
TokenKind::Keyword((Keyword::Let, false)),
TokenKind::identifier(a_sym),
TokenKind::Punctuator(Punctuator::Assign),
TokenKind::string_literal(hello_sym),
TokenKind::string_literal(hello_sym, None),
TokenKind::Punctuator(Punctuator::Semicolon),
];

Expand Down Expand Up @@ -943,7 +953,7 @@ fn string_unicode() {

let sym = interner.get_or_intern_static("中文", utf16!("中文"));
let expected = [
TokenKind::StringLiteral(sym),
TokenKind::StringLiteral((sym, None)),
TokenKind::Punctuator(Punctuator::Semicolon),
];

Expand All @@ -957,7 +967,7 @@ fn string_unicode_escape_with_braces() {

let sym =
interner.get_or_intern_static("{\u{20ac}\u{a0}\u{a0}}", utf16!("{\u{20ac}\u{a0}\u{a0}}"));
let expected = [TokenKind::StringLiteral(sym)];
let expected = [TokenKind::StringLiteral((sym, None))];

expect_tokens(&mut lexer, &expected, interner);

Expand Down Expand Up @@ -992,7 +1002,7 @@ fn string_unicode_escape_with_braces_2() {
let interner = &mut Interner::default();

let sym = interner.get_or_intern_static("\u{20ac}\u{a0}\u{a0}", utf16!("\u{20ac}\u{a0}\u{a0}"));
let expected = [TokenKind::StringLiteral(sym)];
let expected = [TokenKind::StringLiteral((sym, None))];

expect_tokens(&mut lexer, &expected, interner);
}
Expand All @@ -1005,7 +1015,7 @@ fn string_with_single_escape() {
let interner = &mut Interner::default();

let sym = interner.get_or_intern_static("Б", utf16!("Б"));
let expected = [TokenKind::StringLiteral(sym)];
let expected = [TokenKind::StringLiteral((sym, None))];

expect_tokens(&mut lexer, &expected, interner);
}
Expand All @@ -1027,7 +1037,10 @@ fn string_legacy_octal_escape() {
let interner = &mut Interner::default();

let sym = interner.get_or_intern(expected.encode_utf16().collect::<Vec<_>>().as_slice());
let expected_tokens = [TokenKind::StringLiteral(sym)];
let expected_tokens = [TokenKind::StringLiteral((
sym,
Some(EscapeSequence::LegacyOctal),
))];

expect_tokens(&mut lexer, &expected_tokens, interner);
}
Expand Down Expand Up @@ -1057,7 +1070,7 @@ fn string_zero_escape() {
let interner = &mut Interner::default();

let sym = interner.get_or_intern(expected.encode_utf16().collect::<Vec<_>>().as_slice());
let expected_tokens = [TokenKind::StringLiteral(sym)];
let expected_tokens = [TokenKind::StringLiteral((sym, None))];

expect_tokens(&mut lexer, &expected_tokens, interner);
}
Expand All @@ -1072,7 +1085,10 @@ fn string_non_octal_decimal_escape() {
let interner = &mut Interner::default();

let sym = interner.get_or_intern(expected.encode_utf16().collect::<Vec<_>>().as_slice());
let expected_tokens = [TokenKind::StringLiteral(sym)];
let expected_tokens = [TokenKind::StringLiteral((
sym,
Some(EscapeSequence::NonOctalDecimal),
))];

expect_tokens(&mut lexer, &expected_tokens, interner);
}
Expand Down Expand Up @@ -1101,7 +1117,7 @@ fn string_line_continuation() {
let interner = &mut Interner::default();

let sym = interner.get_or_intern_static("hello world", utf16!("hello world"));
let expected_tokens = [TokenKind::StringLiteral(sym)];
let expected_tokens = [TokenKind::StringLiteral((sym, None))];

expect_tokens(&mut lexer, &expected_tokens, interner);
}
Expand Down
40 changes: 33 additions & 7 deletions boa_parser/src/lexer/token.rs
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ pub enum TokenKind {
EOF,

/// An identifier.
Identifier(Sym),
Identifier((Sym, ContainsEscapeSequence)),

/// A private identifier.
PrivateIdentifier(Sym),
Expand All @@ -117,7 +117,7 @@ pub enum TokenKind {
Punctuator(Punctuator),

/// A string literal.
StringLiteral(Sym),
StringLiteral((Sym, Option<EscapeSequence>)),

/// A part of a template literal without substitution.
TemplateNoSubstitution(TemplateString),
Expand Down Expand Up @@ -175,7 +175,7 @@ impl TokenKind {
/// Creates an `Identifier` token type.
#[must_use]
pub const fn identifier(ident: Sym) -> Self {
Self::Identifier(ident)
Self::Identifier((ident, ContainsEscapeSequence(false)))
}

/// Creates a `NumericLiteral` token kind.
Expand All @@ -194,8 +194,8 @@ impl TokenKind {

/// Creates a `StringLiteral` token type.
#[must_use]
pub const fn string_literal(lit: Sym) -> Self {
Self::StringLiteral(lit)
pub const fn string_literal(lit: Sym, escape_sequence: Option<EscapeSequence>) -> Self {
Self::StringLiteral((lit, escape_sequence))
}

/// Creates a `TemplateMiddle` token type.
Expand Down Expand Up @@ -234,15 +234,15 @@ impl TokenKind {
match *self {
Self::BooleanLiteral(val) => val.to_string(),
Self::EOF => "end of file".to_owned(),
Self::Identifier(ident) => interner.resolve_expect(ident).to_string(),
Self::Identifier((ident, _)) => interner.resolve_expect(ident).to_string(),
Self::PrivateIdentifier(ident) => format!("#{}", interner.resolve_expect(ident)),
Self::Keyword((word, _)) => word.to_string(),
Self::NullLiteral => "null".to_owned(),
Self::NumericLiteral(Numeric::Rational(num)) => num.to_string(),
Self::NumericLiteral(Numeric::Integer(num)) => num.to_string(),
Self::NumericLiteral(Numeric::BigInt(ref num)) => format!("{num}n"),
Self::Punctuator(punc) => punc.to_string(),
Self::StringLiteral(lit) => interner.resolve_expect(lit).to_string(),
Self::StringLiteral((lit, _)) => interner.resolve_expect(lit).to_string(),
Self::TemplateNoSubstitution(ts) | Self::TemplateMiddle(ts) => {
interner.resolve_expect(ts.as_raw()).to_string()
}
Expand All @@ -258,3 +258,29 @@ impl TokenKind {
}
}
}

/// Indicates the type of a legacy escape sequence encountered while lexing a string literal.
///
/// The lexer stores this as an `Option<EscapeSequence>` on `TokenKind::StringLiteral`;
/// `None` means no such escape sequence was seen in the literal.
#[cfg_attr(feature = "deser", derive(serde::Serialize, serde::Deserialize))]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum EscapeSequence {
/// A legacy octal escape sequence, starting with a digit `0` - `7`.
///
/// A `\0` that is not followed by a decimal digit is a plain NULL escape
/// and is not reported as this variant.
///
/// More information:
/// - [ECMAScript reference][spec]
///
/// [spec]: https://tc39.es/ecma262/#prod-LegacyOctalEscapeSequence
LegacyOctal,

/// A non-octal decimal escape sequence, i.e. `\8` or `\9`.
///
/// More information:
/// - [ECMAScript reference][spec]
///
/// [spec]: https://tc39.es/ecma262/#prod-NonOctalDecimalEscapeSequence
NonOctalDecimal,
}

/// Indicates if an identifier contains an escape sequence.
///
/// The wrapped `bool` is `true` when the identifier's source text contained at
/// least one escaped character, and `false` otherwise. It is carried alongside
/// the interned symbol in `TokenKind::Identifier`.
#[cfg_attr(feature = "deser", derive(serde::Serialize, serde::Deserialize))]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct ContainsEscapeSequence(pub bool);
Loading

0 comments on commit bd0652f

Please sign in to comment.