Skip to content

Commit

Permalink
Implement generic encoding input
Browse files Browse the repository at this point in the history
  • Loading branch information
raskad committed Dec 26, 2023
1 parent 55752f2 commit d02b972
Show file tree
Hide file tree
Showing 79 changed files with 549 additions and 397 deletions.
4 changes: 2 additions & 2 deletions core/engine/src/builtins/eval/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ impl Eval {

// 2. If Type(x) is not String, return x.
// TODO: rework parser to take an iterator of `u32` unicode codepoints
let Some(x) = x.as_string().map(JsString::to_std_string_escaped) else {
let Some(x) = x.as_string() else {
return Ok(x.clone());
};

Expand All @@ -118,7 +118,7 @@ impl Eval {
// b. If script is a List of errors, throw a SyntaxError exception.
// c. If script Contains ScriptBody is false, return undefined.
// d. Let body be the ScriptBody of script.
let mut parser = Parser::new(Source::from_bytes(&x));
let mut parser = Parser::new(Source::from_utf16(x));
parser.set_identifier(context.next_parser_identifier());
if strict {
parser.set_strict();
Expand Down
5 changes: 3 additions & 2 deletions core/engine/src/context/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ mod hooks;
pub(crate) mod icu;
pub mod intrinsics;

use boa_parser::source::ReadChar;
pub use hooks::{DefaultHooks, HostHooks};

#[cfg(feature = "intl")]
Expand All @@ -14,7 +15,7 @@ use intrinsics::Intrinsics;

#[cfg(not(feature = "intl"))]
pub use std::marker::PhantomData;
use std::{cell::Cell, io::Read, path::Path, rc::Rc};
use std::{cell::Cell, path::Path, rc::Rc};

use crate::{
builtins,
Expand Down Expand Up @@ -185,7 +186,7 @@ impl Context {
/// Note that this won't run any scheduled promise jobs; you need to call [`Context::run_jobs`]
/// on the context or [`JobQueue::run_jobs`] on the provided queue to run them.
#[allow(clippy::unit_arg, dropping_copy_types)]
pub fn eval<R: Read>(&mut self, src: Source<'_, R>) -> JsResult<JsValue> {
pub fn eval<R: ReadChar>(&mut self, src: Source<'_, R>) -> JsResult<JsValue> {
let main_timer = Profiler::global().start_event("Script evaluation", "Main");

let result = Script::parse(src, None, self)?.evaluate(self);
Expand Down
4 changes: 2 additions & 2 deletions core/engine/src/module/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ mod loader;
mod namespace;
mod source;
mod synthetic;
use boa_parser::source::ReadChar;
pub use loader::*;
pub use namespace::ModuleNamespace;
use source::SourceTextModule;
Expand All @@ -33,7 +34,6 @@ pub use synthetic::{SyntheticModule, SyntheticModuleInitializer};
use std::cell::{Cell, RefCell};
use std::collections::HashSet;
use std::hash::Hash;
use std::io::Read;
use std::rc::Rc;

use rustc_hash::FxHashSet;
Expand Down Expand Up @@ -141,7 +141,7 @@ impl Module {
/// Parses the provided `src` as an ECMAScript module, returning an error if parsing fails.
///
/// [spec]: https://tc39.es/ecma262/#sec-parsemodule
pub fn parse<R: Read>(
pub fn parse<R: ReadChar>(
src: Source<'_, R>,
realm: Option<Realm>,
context: &mut Context,
Expand Down
6 changes: 2 additions & 4 deletions core/engine/src/script.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,8 @@
//! [spec]: https://tc39.es/ecma262/#sec-scripts
//! [script]: https://tc39.es/ecma262/#sec-script-records

use std::io::Read;

use boa_gc::{Finalize, Gc, GcRefCell, Trace};
use boa_parser::{Parser, Source};
use boa_parser::{source::ReadChar, Parser, Source};
use boa_profiler::Profiler;
use rustc_hash::FxHashMap;

Expand Down Expand Up @@ -76,7 +74,7 @@ impl Script {
/// Parses the provided `src` as an ECMAScript script, returning an error if parsing fails.
///
/// [spec]: https://tc39.es/ecma262/#sec-parse-script
pub fn parse<R: Read>(
pub fn parse<R: ReadChar>(
src: Source<'_, R>,
realm: Option<Realm>,
context: &mut Context,
Expand Down
4 changes: 2 additions & 2 deletions core/engine/src/tests/operators.rs
Original file line number Diff line number Diff line change
Expand Up @@ -362,7 +362,7 @@ fn multicharacter_assignment_to_non_assignable_ctd() {
TestAction::assert_native_error(
src,
JsNativeErrorKind::Syntax,
"Invalid left-hand side in assignment at line 1, col 13",
"Invalid left-hand side in assignment at line 1, col 12",
)
}),
);
Expand Down Expand Up @@ -397,7 +397,7 @@ fn multicharacter_bitwise_assignment_to_non_assignable_ctd() {
TestAction::assert_native_error(
src,
JsNativeErrorKind::Syntax,
"Invalid left-hand side in assignment at line 1, col 13",
"Invalid left-hand side in assignment at line 1, col 12",
)
}),
);
Expand Down
8 changes: 4 additions & 4 deletions core/parser/src/lexer/comment.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
//! Boa's lexing for ECMAScript comments.

use crate::lexer::{Cursor, Error, Token, TokenKind, Tokenizer};
use crate::source::ReadChar;
use boa_ast::{Position, Span};
use boa_interner::Interner;
use boa_profiler::Profiler;
use std::io::Read;

/// Lexes a single line comment.
///
Expand All @@ -26,7 +26,7 @@ impl<R> Tokenizer<R> for SingleLineComment {
_interner: &mut Interner,
) -> Result<Token, Error>
where
R: Read,
R: ReadChar,
{
let _timer = Profiler::global().start_event("SingleLineComment", "Lexing");

Expand Down Expand Up @@ -66,7 +66,7 @@ impl<R> Tokenizer<R> for MultiLineComment {
_interner: &mut Interner,
) -> Result<Token, Error>
where
R: Read,
R: ReadChar,
{
let _timer = Profiler::global().start_event("MultiLineComment", "Lexing");

Expand Down Expand Up @@ -115,7 +115,7 @@ impl<R> Tokenizer<R> for HashbangComment {
_interner: &mut Interner,
) -> Result<Token, Error>
where
R: Read,
R: ReadChar,
{
let _timer = Profiler::global().start_event("Hashbang", "Lexing");

Expand Down
89 changes: 10 additions & 79 deletions core/parser/src/lexer/cursor.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
//! Boa's lexer cursor that manages the input byte stream.

use crate::source::{ReadChar, UTF8Input};
use boa_ast::Position;
use boa_profiler::Profiler;
use std::io::{self, Bytes, Error, ErrorKind, Read};
use std::io::{self, Error, ErrorKind};

/// Cursor over the source code.
#[derive(Debug)]
pub(super) struct Cursor<R> {
iter: InnerIter<R>,
iter: R,
pos: Position,
module: bool,
strict: bool,
Expand Down Expand Up @@ -54,14 +56,11 @@ impl<R> Cursor<R> {
}
}

impl<R> Cursor<R>
where
R: Read,
{
impl<R: ReadChar> Cursor<R> {
/// Creates a new Lexer cursor.
pub(super) fn new(inner: R) -> Self {
Self {
iter: InnerIter::new(inner.bytes()),
iter: inner,
pos: Position::new(1, 1),
strict: false,
module: false,
Expand All @@ -72,7 +71,7 @@ where
/// Creates a new Lexer cursor with an initial position.
pub(super) fn with_position(inner: R, pos: Position) -> Self {
Self {
iter: InnerIter::new(inner.bytes()),
iter: inner,
pos,
strict: false,
module: false,
Expand Down Expand Up @@ -216,76 +215,8 @@ where
}
}

/// Inner iterator for a cursor.
#[derive(Debug)]
struct InnerIter<R> {
iter: Bytes<R>,
}

impl<R> InnerIter<R> {
/// Creates a new inner iterator.
const fn new(iter: Bytes<R>) -> Self {
Self { iter }
/// Convenience conversion: builds a `Cursor` straight from a raw byte
/// slice by wrapping it in a [`UTF8Input`] decoder.
impl<'a> From<&'a [u8]> for Cursor<UTF8Input<&'a [u8]>> {
    fn from(input: &'a [u8]) -> Self {
        let reader = UTF8Input::new(input);
        Self::new(reader)
    }
}

impl<R> InnerIter<R>
where
    R: Read,
{
    /// Retrieves the next byte from the underlying reader, transposing the
    /// iterator's `Option<io::Result<u8>>` into `io::Result<Option<u8>>` so
    /// callers can use `?`. `Ok(None)` signals end of input.
    fn next_byte(&mut self) -> io::Result<Option<u8>> {
        self.iter.next().transpose()
    }

    /// Decodes the next UTF-8 sequence from the byte stream, returning it as
    /// a `u32` code point, or `Ok(None)` at end of input.
    ///
    /// Decoding is *unchecked*: malformed sequences are not rejected, and a
    /// missing continuation byte is substituted with `0` (via `unwrap_or_0`),
    /// so the result is not guaranteed to be a valid Unicode scalar value.
    fn next_char(&mut self) -> io::Result<Option<u32>> {
        // Decode UTF-8
        let x = match self.next_byte()? {
            // ASCII fast path: a byte below 0x80 is a complete code point.
            Some(b) if b < 128 => return Ok(Some(u32::from(b))),
            Some(b) => b,
            None => return Ok(None),
        };

        // Multibyte case follows
        // Decode from a byte combination out of: [[[x y] z] w]
        // NOTE: Performance is sensitive to the exact formulation here
        // Start by assuming a 2-byte sequence: keep the leading byte's value
        // bits and fold in the first continuation byte.
        let init = utf8_first_byte(x, 2);
        let y = unwrap_or_0(self.next_byte()?);
        let mut ch = utf8_acc_cont_byte(init, y);
        if x >= 0xE0 {
            // [[x y z] w] case
            // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
            let z = unwrap_or_0(self.next_byte()?);
            let y_z = utf8_acc_cont_byte(u32::from(y & CONT_MASK), z);
            ch = init << 12 | y_z;
            if x >= 0xF0 {
                // [x y z w] case
                // use only the lower 3 bits of `init`
                let w = unwrap_or_0(self.next_byte()?);
                ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
            }
        };

        Ok(Some(ch))
    }
}

/// Mask selecting the six value bits of a UTF-8 continuation byte.
const CONT_MASK: u8 = 0b0011_1111;

/// Returns the initial codepoint accumulator for the first byte of a
/// multibyte sequence.
///
/// The first byte is special: only the bottom 5 bits are wanted for a
/// 2-byte sequence (`width == 2`), 4 bits for width 3, and 3 bits for
/// width 4 — `0x7F >> width` yields exactly that mask.
fn utf8_first_byte(byte: u8, width: u32) -> u32 {
    let mask = 0x7F >> width;
    u32::from(byte & mask)
}

/// Folds continuation byte `byte` into the accumulator `ch`: the
/// accumulated bits shift left by six and the byte's six value bits are
/// appended at the bottom.
fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
    let value_bits = u32::from(byte & CONT_MASK);
    (ch << 6) | value_bits
}

/// Maps an exhausted stream (`None`) to a `0` byte so that decoding of a
/// truncated sequence can still proceed.
fn unwrap_or_0(opt: Option<u8>) -> u8 {
    opt.unwrap_or_default()
}
6 changes: 3 additions & 3 deletions core/parser/src/lexer/identifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
use crate::lexer::{
token::ContainsEscapeSequence, Cursor, Error, StringLiteral, Token, TokenKind, Tokenizer,
};
use crate::source::ReadChar;
use boa_ast::{Keyword, Position, Span};
use boa_interner::Interner;
use boa_profiler::Profiler;
use std::io::Read;

/// Identifier lexing.
///
Expand Down Expand Up @@ -60,7 +60,7 @@ impl<R> Tokenizer<R> for Identifier {
interner: &mut Interner,
) -> Result<Token, Error>
where
R: Read,
R: ReadChar,
{
let _timer = Profiler::global().start_event("Identifier", "Lexing");

Expand Down Expand Up @@ -95,7 +95,7 @@ impl Identifier {
init: char,
) -> Result<(String, bool), Error>
where
R: Read,
R: ReadChar,
{
let _timer = Profiler::global().start_event("Identifier::take_identifier_name", "Lexing");

Expand Down
22 changes: 14 additions & 8 deletions core/parser/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,10 @@ use self::{
string::StringLiteral,
template::TemplateLiteral,
};
use crate::source::{ReadChar, UTF8Input};
use boa_ast::{Position, Punctuator, Span};
use boa_interner::Interner;
use boa_profiler::Profiler;
use std::io::Read;

pub use self::{
error::Error,
Expand All @@ -60,7 +60,7 @@ trait Tokenizer<R> {
interner: &mut Interner,
) -> Result<Token, Error>
where
R: Read;
R: ReadChar;
}

/// Lexer or tokenizer for the Boa JavaScript Engine.
Expand Down Expand Up @@ -104,7 +104,7 @@ impl<R> Lexer<R> {
/// Creates a new lexer.
pub fn new(reader: R) -> Self
where
R: Read,
R: ReadChar,
{
Self {
cursor: Cursor::new(reader),
Expand All @@ -125,7 +125,7 @@ impl<R> Lexer<R> {
interner: &mut Interner,
) -> Result<Token, Error>
where
R: Read,
R: ReadChar,
{
let _timer = Profiler::global().start_event("lex_slash_token", "Lexing");

Expand Down Expand Up @@ -179,7 +179,7 @@ impl<R> Lexer<R> {
/// Skips an HTML close comment (`-->`) if the `annex-b` feature is enabled.
pub(crate) fn skip_html_close(&mut self, interner: &mut Interner) -> Result<(), Error>
where
R: Read,
R: ReadChar,
{
if cfg!(not(feature = "annex-b")) || self.module() {
return Ok(());
Expand Down Expand Up @@ -210,7 +210,7 @@ impl<R> Lexer<R> {
// We intentionally don't implement Iterator trait as Result<Option> is cleaner to handle.
pub(crate) fn next_no_skip(&mut self, interner: &mut Interner) -> Result<Option<Token>, Error>
where
R: Read,
R: ReadChar,
{
let _timer = Profiler::global().start_event("next()", "Lexing");

Expand Down Expand Up @@ -352,7 +352,7 @@ impl<R> Lexer<R> {
#[allow(clippy::should_implement_trait)]
pub fn next(&mut self, interner: &mut Interner) -> Result<Option<Token>, Error>
where
R: Read,
R: ReadChar,
{
loop {
let Some(next) = self.next_no_skip(interner)? else {
Expand All @@ -372,12 +372,18 @@ impl<R> Lexer<R> {
interner: &mut Interner,
) -> Result<Token, Error>
where
R: Read,
R: ReadChar,
{
TemplateLiteral.lex(&mut self.cursor, start, interner)
}
}

/// Convenience conversion: builds a `Lexer` straight from a raw byte
/// slice by wrapping it in a [`UTF8Input`] decoder.
impl<'a> From<&'a [u8]> for Lexer<UTF8Input<&'a [u8]>> {
    fn from(input: &'a [u8]) -> Self {
        let reader = UTF8Input::new(input);
        Self::new(reader)
    }
}

/// ECMAScript goal symbols.
///
/// <https://tc39.es/ecma262/#sec-ecmascript-language-lexical-grammar>
Expand Down
Loading

0 comments on commit d02b972

Please sign in to comment.