Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support Unicode escape in identifier names #1102

Merged
merged 1 commit into from
May 22, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions boa/src/syntax/ast/keyword.rs
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,16 @@ pub enum Keyword {
/// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Classes/extends
Extends,

/// The `false` keyword.
///
/// More information:
/// - [ECMAScript reference][spec]
/// - [MDN documentation][mdn]
///
/// [spec]: https://tc39.es/ecma262/#prod-BooleanLiteral
/// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Boolean
False,

/// The `finally` keyword.
///
/// More information:
Expand Down Expand Up @@ -301,6 +311,16 @@ pub enum Keyword {
/// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/new
New,

/// The `null` keyword.
///
/// More information:
/// - [ECMAScript reference][spec]
/// - [MDN documentation][mdn]
///
/// [spec]: https://tc39.es/ecma262/#prod-NullLiteral
/// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/null
Null,

/// The `of` keyword.
///
/// More information:
Expand Down Expand Up @@ -369,6 +389,16 @@ pub enum Keyword {
/// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Functions/Arrow_functions
Throw,

/// The `true` keyword.
///
/// More information:
/// - [ECMAScript reference][spec]
/// - [MDN documentation][mdn]
///
/// [spec]: https://tc39.es/ecma262/#prod-BooleanLiteral
/// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Boolean
True,

/// The `try` keyword.
///
/// More information:
Expand Down Expand Up @@ -479,6 +509,7 @@ impl Keyword {
Self::Enum => "enum",
Self::Extends => "extends",
Self::Export => "export",
Self::False => "false",
Self::Finally => "finally",
Self::For => "for",
Self::Function => "function",
Expand All @@ -488,12 +519,14 @@ impl Keyword {
Self::Import => "import",
Self::Let => "let",
Self::New => "new",
Self::Null => "null",
Self::Of => "of",
Self::Return => "return",
Self::Super => "super",
Self::Switch => "switch",
Self::This => "this",
Self::Throw => "throw",
Self::True => "true",
Self::Try => "try",
Self::TypeOf => "typeof",
Self::Var => "var",
Expand Down Expand Up @@ -552,6 +585,7 @@ impl FromStr for Keyword {
"enum" => Ok(Self::Enum),
"extends" => Ok(Self::Extends),
"export" => Ok(Self::Export),
"false" => Ok(Self::False),
"finally" => Ok(Self::Finally),
"for" => Ok(Self::For),
"function" => Ok(Self::Function),
Expand All @@ -561,12 +595,14 @@ impl FromStr for Keyword {
"import" => Ok(Self::Import),
"let" => Ok(Self::Let),
"new" => Ok(Self::New),
"null" => Ok(Self::Null),
"of" => Ok(Self::Of),
"return" => Ok(Self::Return),
"super" => Ok(Self::Super),
"switch" => Ok(Self::Switch),
"this" => Ok(Self::This),
"throw" => Ok(Self::Throw),
"true" => Ok(Self::True),
"try" => Ok(Self::Try),
"typeof" => Ok(Self::TypeOf),
"var" => Ok(Self::Var),
Expand Down
2 changes: 2 additions & 0 deletions boa/src/syntax/lexer/cursor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ where
/// predicate on the ascii char
///
/// The buffer is not incremented.
#[allow(dead_code)]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we allow dead code here? If it's not being used, it should be removed, right?

Same applies to line 195.

#[inline]
pub(super) fn next_is_char_pred<F>(&mut self, pred: &F) -> io::Result<bool>
where
Expand Down Expand Up @@ -191,6 +192,7 @@ where
/// It also stops when there is no next character.
///
/// Note that all characters up until the stop character are added to the buffer, including the character right before.
#[allow(dead_code)]
pub(super) fn take_while_char_pred<F>(&mut self, buf: &mut Vec<u8>, pred: &F) -> io::Result<()>
where
F: Fn(u32) -> bool,
Expand Down
129 changes: 93 additions & 36 deletions boa/src/syntax/lexer/identifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use crate::{
profiler::BoaProfiler,
syntax::{
ast::{Keyword, Position, Span},
lexer::{Token, TokenKind},
lexer::{StringLiteral, Token, TokenKind},
},
};
use boa_unicode::UnicodeProperties;
Expand Down Expand Up @@ -86,43 +86,100 @@ impl<R> Tokenizer<R> for Identifier {
{
let _timer = BoaProfiler::global().start_event("Identifier", "Lexing");

let mut init_buf = [0u8; 4];
let mut buf = Vec::new();
self.init.encode_utf8(&mut init_buf);
buf.extend(init_buf.iter().take(self.init.len_utf8()));

cursor.take_while_char_pred(&mut buf, &Self::is_identifier_part)?;

let token_str = unsafe { str::from_utf8_unchecked(buf.as_slice()) };
let tk = match token_str {
"true" => TokenKind::BooleanLiteral(true),
"false" => TokenKind::BooleanLiteral(false),
"null" => TokenKind::NullLiteral,
slice => {
if let Ok(keyword) = slice.parse() {
if cursor.strict_mode() && keyword == Keyword::With {
return Err(Error::Syntax(
"using 'with' statement not allowed in strict mode".into(),
start_pos,
));
}
TokenKind::Keyword(keyword)
} else {
if cursor.strict_mode() && STRICT_FORBIDDEN_IDENTIFIERS.contains(&slice) {
return Err(Error::Syntax(
format!(
"using future reserved keyword '{}' not allowed in strict mode",
slice
)
.into(),
start_pos,
));
}
TokenKind::identifier(slice)
}
let (identifier_name, contains_escaped_chars) =
Self::take_identifier_name(cursor, start_pos, self.init)?;

let token_kind = if let Ok(keyword) = identifier_name.parse() {
if contains_escaped_chars {
return Err(Error::Syntax(
"unicode escaped characters are not allowed in keyword".into(),
start_pos,
));
}

if cursor.strict_mode() && keyword == Keyword::With {
return Err(Error::Syntax(
"using 'with' statement not allowed in strict mode".into(),
start_pos,
));
}

match keyword {
Keyword::True => TokenKind::BooleanLiteral(true),
Keyword::False => TokenKind::BooleanLiteral(false),
Keyword::Null => TokenKind::NullLiteral,
_ => TokenKind::Keyword(keyword),
}
} else {
if cursor.strict_mode()
&& STRICT_FORBIDDEN_IDENTIFIERS.contains(&identifier_name.as_str())
{
return Err(Error::Syntax(
format!(
"using future reserved keyword '{}' not allowed in strict mode",
identifier_name
)
.into(),
start_pos,
));
}
TokenKind::identifier(identifier_name.into_boxed_str())
};

Ok(Token::new(tk, Span::new(start_pos, cursor.pos())))
Ok(Token::new(token_kind, Span::new(start_pos, cursor.pos())))
}
}

impl Identifier {
#[inline]
fn take_identifier_name<R>(
cursor: &mut Cursor<R>,
start_pos: Position,
init: char,
) -> Result<(String, bool), Error>
where
R: Read,
{
let mut contains_escaped_chars = false;
let mut identifier_name = if init == '\\' && cursor.next_is(b'u')? {
let ch = StringLiteral::take_unicode_escape_sequence(cursor, start_pos)?;

if Self::is_identifier_start(ch) {
contains_escaped_chars = true;
String::from(char::try_from(ch).unwrap())
} else {
return Err(Error::Syntax("invalid identifier start".into(), start_pos));
}
} else {
// The caller guarantees that `init` is a valid identifier start
String::from(init)
};

loop {
let ch = match cursor.peek_char()? {
Some(0x005C /* \ */) if cursor.peek_n(2)? >> 8 == 0x0075 /* u */ => {
let pos = cursor.pos();
let _ = cursor.next_byte();
let _ = cursor.next_byte();
let ch = StringLiteral::take_unicode_escape_sequence(cursor, pos)?;

if Self::is_identifier_part(ch) {
contains_escaped_chars = true;
ch
} else {
return Err(Error::Syntax("invalid identifier part".into(), pos));
}
}
Some(ch) if Self::is_identifier_part(ch) => {
let _ = cursor.next_char()?;
ch
},
_ => break,
};

identifier_name.push(char::try_from(ch).unwrap());
}

Ok((identifier_name, contains_escaped_chars))
}
}
7 changes: 5 additions & 2 deletions boa/src/syntax/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -246,12 +246,15 @@ impl<R> Lexer<R> {
'=' | '*' | '+' | '-' | '%' | '|' | '&' | '^' | '<' | '>' | '!' | '~' | '?' => {
Operator::new(next_ch as u8).lex(&mut self.cursor, start)
}
_ if c.is_digit(10) => {
NumberLiteral::new(next_ch as u8).lex(&mut self.cursor, start)
'\\' if self.cursor.peek()? == Some(b'u') => {
Identifier::new(c).lex(&mut self.cursor, start)
}
_ if Identifier::is_identifier_start(c as u32) => {
Identifier::new(c).lex(&mut self.cursor, start)
}
_ if c.is_digit(10) => {
NumberLiteral::new(next_ch as u8).lex(&mut self.cursor, start)
}
_ => {
let details = format!(
"unexpected '{}' at line {}, column {}",
Expand Down
5 changes: 4 additions & 1 deletion boa/src/syntax/lexer/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ fn check_multi_line_comment() {

#[test]
fn check_identifier() {
let s = "x x1 _x $x __ $$ Ѐ ЀЀ x\u{200C}\u{200D}";
let s = "x x1 _x $x __ $$ Ѐ ЀЀ x\u{200C}\u{200D} \\u0078 \\u0078\\u0078 \\u{0078}x\\u{0078}";
let mut lexer = Lexer::new(s.as_bytes());

let expected = [
Expand All @@ -85,6 +85,9 @@ fn check_identifier() {
TokenKind::identifier("Ѐ"),
TokenKind::identifier("ЀЀ"),
TokenKind::identifier("x\u{200C}\u{200D}"),
TokenKind::identifier("x"),
TokenKind::identifier("xx"),
TokenKind::identifier("xxx"),
];

expect_tokens(&mut lexer, &expected);
Expand Down