Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Merged by Bors] - Add early errors for escaped identifiers #2546

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions boa_parser/src/lexer/identifier.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
//! This module implements lexing for identifiers (foo, myvar, etc.) used in ECMAScript.

use crate::lexer::{Cursor, Error, StringLiteral, Token, TokenKind, Tokenizer};
use crate::lexer::{
token::ContainsEscapeSequence, Cursor, Error, StringLiteral, Token, TokenKind, Tokenizer,
};
use boa_ast::{Keyword, Position, Span};
use boa_interner::Interner;
use boa_profiler::Profiler;
Expand Down Expand Up @@ -71,7 +73,10 @@ impl<R> Tokenizer<R> for Identifier {
Ok(Keyword::False) => TokenKind::BooleanLiteral(false),
Ok(Keyword::Null) => TokenKind::NullLiteral,
Ok(keyword) => TokenKind::Keyword((keyword, contains_escaped_chars)),
_ => TokenKind::identifier(interner.get_or_intern(identifier_name.as_str())),
_ => TokenKind::Identifier((
interner.get_or_intern(identifier_name.as_str()),
ContainsEscapeSequence(contains_escaped_chars),
)),
};

Ok(Token::new(token_kind, Span::new(start_pos, cursor.pos())))
Expand Down
61 changes: 33 additions & 28 deletions boa_parser/src/lexer/string.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
//! Boa's lexing for ECMAScript string literals.

use crate::lexer::{Cursor, Error, Token, TokenKind, Tokenizer};
use crate::lexer::{token::EscapeSequence, Cursor, Error, Token, TokenKind, Tokenizer};
use boa_ast::{Position, Span};
use boa_interner::Interner;
use boa_profiler::Profiler;
Expand Down Expand Up @@ -88,11 +88,11 @@ impl<R> Tokenizer<R> for StringLiteral {
{
let _timer = Profiler::global().start_event("StringLiteral", "Lexing");

let (lit, span) =
let (lit, span, escape_sequence) =
Self::take_string_characters(cursor, start_pos, self.terminator, cursor.strict_mode())?;

Ok(Token::new(
TokenKind::string_literal(interner.get_or_intern(&lit[..])),
TokenKind::string_literal(interner.get_or_intern(&lit[..]), escape_sequence),
span,
))
}
Expand All @@ -117,11 +117,13 @@ impl StringLiteral {
start_pos: Position,
terminator: StringTerminator,
is_strict_mode: bool,
) -> Result<(Vec<u16>, Span), Error>
) -> Result<(Vec<u16>, Span, Option<EscapeSequence>), Error>
where
R: Read,
{
let mut buf = Vec::new();
let mut escape_sequence = None;

loop {
let ch_start_pos = cursor.pos();
let ch = cursor.next_char()?;
Expand All @@ -133,12 +135,15 @@ impl StringLiteral {
let _timer =
Profiler::global().start_event("StringLiteral - escape sequence", "Lexing");

if let Some(escape_value) = Self::take_escape_sequence_or_line_continuation(
cursor,
ch_start_pos,
is_strict_mode,
false,
)? {
if let Some((escape_value, escape)) =
Self::take_escape_sequence_or_line_continuation(
cursor,
ch_start_pos,
is_strict_mode,
false,
)?
{
escape_sequence = escape_sequence.or(escape);
buf.push_code_point(escape_value);
}
}
Expand All @@ -156,15 +161,15 @@ impl StringLiteral {
}
}

Ok((buf, Span::new(start_pos, cursor.pos())))
Ok((buf, Span::new(start_pos, cursor.pos()), escape_sequence))
}

pub(super) fn take_escape_sequence_or_line_continuation<R>(
cursor: &mut Cursor<R>,
start_pos: Position,
is_strict_mode: bool,
is_template_literal: bool,
) -> Result<Option<u32>, Error>
) -> Result<Option<(u32, Option<EscapeSequence>)>, Error>
where
R: Read,
{
Expand All @@ -176,25 +181,25 @@ impl StringLiteral {
})?;

let escape_value = match escape_ch {
0x0062 /* b */ => Some(0x0008 /* <BS> */),
0x0074 /* t */ => Some(0x0009 /* <HT> */),
0x006E /* n */ => Some(0x000A /* <LF> */),
0x0076 /* v */ => Some(0x000B /* <VT> */),
0x0066 /* f */ => Some(0x000C /* <FF> */),
0x0072 /* r */ => Some(0x000D /* <CR> */),
0x0022 /* " */ => Some(0x0022 /* " */),
0x0027 /* ' */ => Some(0x0027 /* ' */),
0x005C /* \ */ => Some(0x005C /* \ */),
0x0062 /* b */ => Some((0x0008 /* <BS> */, None)),
0x0074 /* t */ => Some((0x0009 /* <HT> */, None)),
0x006E /* n */ => Some((0x000A /* <LF> */, None)),
0x0076 /* v */ => Some((0x000B /* <VT> */, None)),
0x0066 /* f */ => Some((0x000C /* <FF> */, None)),
0x0072 /* r */ => Some((0x000D /* <CR> */, None)),
0x0022 /* " */ => Some((0x0022 /* " */, None)),
0x0027 /* ' */ => Some((0x0027 /* ' */, None)),
0x005C /* \ */ => Some((0x005C /* \ */, None)),
0x0030 /* 0 */ if cursor
.peek()?
.filter(|next_byte| (b'0'..=b'9').contains(next_byte))
.is_none() =>
Some(0x0000 /* NULL */),
Some((0x0000 /* NULL */, None)),
0x0078 /* x */ => {
Some(Self::take_hex_escape_sequence(cursor, start_pos)?)
Some((Self::take_hex_escape_sequence(cursor, start_pos)?, None))
}
0x0075 /* u */ => {
Some(Self::take_unicode_escape_sequence(cursor, start_pos)?)
Some((Self::take_unicode_escape_sequence(cursor, start_pos)?, None))
}
0x0038 /* 8 */ | 0x0039 /* 9 */ => {
// Grammar: NonOctalDecimalEscapeSequence
Expand All @@ -209,7 +214,7 @@ impl StringLiteral {
start_pos,
));
}
Some(escape_ch)
Some((escape_ch, Some(EscapeSequence::NonOctalDecimal)))
}
_ if (0x0030..=0x0037 /* '0'..='7' */).contains(&escape_ch) => {
if is_template_literal {
Expand All @@ -226,10 +231,10 @@ impl StringLiteral {
));
}

Some(Self::take_legacy_octal_escape_sequence(
Some((Self::take_legacy_octal_escape_sequence(
cursor,
escape_ch.try_into().expect("an ascii char must not fail to convert"),
)?)
)?, Some(EscapeSequence::LegacyOctal)))
}
_ if Self::is_line_terminator(escape_ch) => {
// Grammar: LineContinuation
Expand All @@ -238,7 +243,7 @@ impl StringLiteral {
None
}
_ => {
Some(escape_ch)
Some((escape_ch, None))
}
};

Expand Down
2 changes: 1 addition & 1 deletion boa_parser/src/lexer/template.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ impl TemplateString {
true,
)?;

if let Some(escape_value) = escape_value {
if let Some((escape_value, _)) = escape_value {
buf.push_code_point(escape_value);
}
}
Expand Down
48 changes: 32 additions & 16 deletions boa_parser/src/lexer/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
#![allow(clippy::indexing_slicing)]

use crate::lexer::{
template::TemplateString, token::Numeric, Cursor, Error, Interner, Lexer, Position, Punctuator,
Read, Span, TokenKind,
template::TemplateString,
token::{ContainsEscapeSequence, EscapeSequence, Numeric},
Cursor, Error, Interner, Lexer, Position, Punctuator, Read, Span, TokenKind,
};
use boa_ast::Keyword;
use boa_interner::Sym;
Expand Down Expand Up @@ -94,9 +95,18 @@ fn check_identifier() {
TokenKind::identifier(
interner.get_or_intern_static("x\u{200C}\u{200D}", utf16!("x\u{200C}\u{200D}")),
),
TokenKind::identifier(interner.get_or_intern_static("x", utf16!("x"))),
TokenKind::identifier(interner.get_or_intern_static("xx", utf16!("xx"))),
TokenKind::identifier(interner.get_or_intern_static("xxx", utf16!("xxx"))),
TokenKind::Identifier((
interner.get_or_intern_static("x", utf16!("x")),
ContainsEscapeSequence(true),
)),
TokenKind::Identifier((
interner.get_or_intern_static("xx", utf16!("xx")),
ContainsEscapeSequence(true),
)),
TokenKind::Identifier((
interner.get_or_intern_static("xxx", utf16!("xxx")),
ContainsEscapeSequence(true),
)),
];

expect_tokens(&mut lexer, &expected, interner);
Expand Down Expand Up @@ -141,8 +151,8 @@ fn check_string() {
let a_sym = interner.get_or_intern_static("aaa", utf16!("aaa"));
let b_sym = interner.get_or_intern_static("bbb", utf16!("bbb"));
let expected = [
TokenKind::string_literal(a_sym),
TokenKind::string_literal(b_sym),
TokenKind::string_literal(a_sym, None),
TokenKind::string_literal(b_sym, None),
];

expect_tokens(&mut lexer, &expected, interner);
Expand Down Expand Up @@ -305,7 +315,7 @@ fn check_variable_definition_tokens() {
TokenKind::Keyword((Keyword::Let, false)),
TokenKind::identifier(a_sym),
TokenKind::Punctuator(Punctuator::Assign),
TokenKind::string_literal(hello_sym),
TokenKind::string_literal(hello_sym, None),
TokenKind::Punctuator(Punctuator::Semicolon),
];

Expand Down Expand Up @@ -943,7 +953,7 @@ fn string_unicode() {

let sym = interner.get_or_intern_static("中文", utf16!("中文"));
let expected = [
TokenKind::StringLiteral(sym),
TokenKind::StringLiteral((sym, None)),
TokenKind::Punctuator(Punctuator::Semicolon),
];

Expand All @@ -957,7 +967,7 @@ fn string_unicode_escape_with_braces() {

let sym =
interner.get_or_intern_static("{\u{20ac}\u{a0}\u{a0}}", utf16!("{\u{20ac}\u{a0}\u{a0}}"));
let expected = [TokenKind::StringLiteral(sym)];
let expected = [TokenKind::StringLiteral((sym, None))];

expect_tokens(&mut lexer, &expected, interner);

Expand Down Expand Up @@ -992,7 +1002,7 @@ fn string_unicode_escape_with_braces_2() {
let interner = &mut Interner::default();

let sym = interner.get_or_intern_static("\u{20ac}\u{a0}\u{a0}", utf16!("\u{20ac}\u{a0}\u{a0}"));
let expected = [TokenKind::StringLiteral(sym)];
let expected = [TokenKind::StringLiteral((sym, None))];

expect_tokens(&mut lexer, &expected, interner);
}
Expand All @@ -1005,7 +1015,7 @@ fn string_with_single_escape() {
let interner = &mut Interner::default();

let sym = interner.get_or_intern_static("Б", utf16!("Б"));
let expected = [TokenKind::StringLiteral(sym)];
let expected = [TokenKind::StringLiteral((sym, None))];

expect_tokens(&mut lexer, &expected, interner);
}
Expand All @@ -1027,7 +1037,10 @@ fn string_legacy_octal_escape() {
let interner = &mut Interner::default();

let sym = interner.get_or_intern(expected.encode_utf16().collect::<Vec<_>>().as_slice());
let expected_tokens = [TokenKind::StringLiteral(sym)];
let expected_tokens = [TokenKind::StringLiteral((
sym,
Some(EscapeSequence::LegacyOctal),
))];

expect_tokens(&mut lexer, &expected_tokens, interner);
}
Expand Down Expand Up @@ -1057,7 +1070,7 @@ fn string_zero_escape() {
let interner = &mut Interner::default();

let sym = interner.get_or_intern(expected.encode_utf16().collect::<Vec<_>>().as_slice());
let expected_tokens = [TokenKind::StringLiteral(sym)];
let expected_tokens = [TokenKind::StringLiteral((sym, None))];

expect_tokens(&mut lexer, &expected_tokens, interner);
}
Expand All @@ -1072,7 +1085,10 @@ fn string_non_octal_decimal_escape() {
let interner = &mut Interner::default();

let sym = interner.get_or_intern(expected.encode_utf16().collect::<Vec<_>>().as_slice());
let expected_tokens = [TokenKind::StringLiteral(sym)];
let expected_tokens = [TokenKind::StringLiteral((
sym,
Some(EscapeSequence::NonOctalDecimal),
))];

expect_tokens(&mut lexer, &expected_tokens, interner);
}
Expand Down Expand Up @@ -1101,7 +1117,7 @@ fn string_line_continuation() {
let interner = &mut Interner::default();

let sym = interner.get_or_intern_static("hello world", utf16!("hello world"));
let expected_tokens = [TokenKind::StringLiteral(sym)];
let expected_tokens = [TokenKind::StringLiteral((sym, None))];

expect_tokens(&mut lexer, &expected_tokens, interner);
}
Expand Down
40 changes: 33 additions & 7 deletions boa_parser/src/lexer/token.rs
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ pub enum TokenKind {
EOF,

/// An identifier.
Identifier(Sym),
Identifier((Sym, ContainsEscapeSequence)),

/// A private identifier.
PrivateIdentifier(Sym),
Expand All @@ -117,7 +117,7 @@ pub enum TokenKind {
Punctuator(Punctuator),

/// A string literal.
StringLiteral(Sym),
StringLiteral((Sym, Option<EscapeSequence>)),

/// A part of a template literal without substitution.
TemplateNoSubstitution(TemplateString),
Expand Down Expand Up @@ -175,7 +175,7 @@ impl TokenKind {
/// Creates an `Identifier` token type.
#[must_use]
pub const fn identifier(ident: Sym) -> Self {
Self::Identifier(ident)
Self::Identifier((ident, ContainsEscapeSequence(false)))
}

/// Creates a `NumericLiteral` token kind.
Expand All @@ -194,8 +194,8 @@ impl TokenKind {

/// Creates a `StringLiteral` token type.
#[must_use]
pub const fn string_literal(lit: Sym) -> Self {
Self::StringLiteral(lit)
pub const fn string_literal(lit: Sym, escape_sequence: Option<EscapeSequence>) -> Self {
Self::StringLiteral((lit, escape_sequence))
}

/// Creates a `TemplateMiddle` token type.
Expand Down Expand Up @@ -234,15 +234,15 @@ impl TokenKind {
match *self {
Self::BooleanLiteral(val) => val.to_string(),
Self::EOF => "end of file".to_owned(),
Self::Identifier(ident) => interner.resolve_expect(ident).to_string(),
Self::Identifier((ident, _)) => interner.resolve_expect(ident).to_string(),
Self::PrivateIdentifier(ident) => format!("#{}", interner.resolve_expect(ident)),
Self::Keyword((word, _)) => word.to_string(),
Self::NullLiteral => "null".to_owned(),
Self::NumericLiteral(Numeric::Rational(num)) => num.to_string(),
Self::NumericLiteral(Numeric::Integer(num)) => num.to_string(),
Self::NumericLiteral(Numeric::BigInt(ref num)) => format!("{num}n"),
Self::Punctuator(punc) => punc.to_string(),
Self::StringLiteral(lit) => interner.resolve_expect(lit).to_string(),
Self::StringLiteral((lit, _)) => interner.resolve_expect(lit).to_string(),
Self::TemplateNoSubstitution(ts) | Self::TemplateMiddle(ts) => {
interner.resolve_expect(ts.as_raw()).to_string()
}
Expand All @@ -258,3 +258,29 @@ impl TokenKind {
}
}
}

/// Indicates the type of an escape sequence.
///
/// Only the escape forms listed below are distinguished. The string lexer reports
/// every other escape (`\n`, `\x..`, `\u....`, a `\0` that is not followed by a
/// decimal digit, ...) without an `EscapeSequence` value.
#[cfg_attr(feature = "deser", derive(serde::Serialize, serde::Deserialize))]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum EscapeSequence {
/// A legacy octal escape sequence: a backslash followed by a digit `0` - `7`.
///
/// More information:
/// - [ECMAScript reference][spec]
///
/// [spec]: https://tc39.es/ecma262/#prod-LegacyOctalEscapeSequence
LegacyOctal,

/// A non-octal decimal escape sequence: `\8` or `\9`.
///
/// More information:
/// - [ECMAScript reference][spec]
///
/// [spec]: https://tc39.es/ecma262/#prod-NonOctalDecimalEscapeSequence
NonOctalDecimal,
}

/// Indicates if an identifier contains an escape sequence.
///
/// This flag is carried alongside the interned symbol in `TokenKind::Identifier`.
/// The identifier lexer fills it from its `contains_escaped_chars` flag, while the
/// `TokenKind::identifier` convenience constructor always stores `false`.
#[cfg_attr(feature = "deser", derive(serde::Serialize, serde::Deserialize))]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct ContainsEscapeSequence(pub bool);
Loading