diff --git a/Cargo.lock b/Cargo.lock
index c4e0491db83..a4dbcbbed7b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -10,6 +10,7 @@ dependencies = [
  "boa_interner",
  "boa_unicode",
  "chrono",
+ "const-utf16",
  "criterion",
  "dyn-clone",
  "fast-float",
@@ -252,6 +253,12 @@ dependencies = [
  "winapi",
 ]
 
+[[package]]
+name = "const-utf16"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "90feefab165fe011746e3be2f0708b7b180fcbd9f5054ff81a454d7bd93d8285"
+
 [[package]]
 name = "criterion"
 version = "0.3.5"
diff --git a/boa/Cargo.toml b/boa/Cargo.toml
index 362ff7e64b8..4ebea65f073 100644
--- a/boa/Cargo.toml
+++ b/boa/Cargo.toml
@@ -38,6 +38,7 @@ fast-float = "0.2.0"
 unicode-normalization = "0.1.19"
 dyn-clone = "1.0.4"
 once_cell = "1.9.0"
+const-utf16 = "0.2.1"
 
 # Optional Dependencies
 measureme = { version = "10.0.0", optional = true }
diff --git a/boa/src/jsstring.rs b/boa/src/jsstring.rs
new file mode 100644
index 00000000000..a2b2d36abac
--- /dev/null
+++ b/boa/src/jsstring.rs
@@ -0,0 +1,747 @@
+//! A reference-counted, immutable UTF-16 string used to represent JavaScript
+//! string values.
+//!
+//! Every string is a single heap allocation: an `Inner` header immediately
+//! followed by the UTF-16 code units. The strings listed in `CONSTANTS_ARRAY`
+//! are interned once per thread and shared instead of being re-allocated.
+
+#![warn(unsafe_op_in_unsafe_fn)]
+
+use core::hash::Hash;
+use std::{
+    alloc::{dealloc, handle_alloc_error, Layout},
+    borrow::Borrow,
+    cell::Cell,
+    fmt,
+    marker::PhantomData,
+    ops::{Deref, Index},
+    ptr::{self, NonNull},
+    slice::SliceIndex,
+};
+
+use crate::{
+    builtins::string::is_trimmable_whitespace,
+    gc::{empty_trace, Finalize, Trace},
+};
+use const_utf16::encode as utf16;
+use rustc_hash::FxHashSet;
+
+/// Strings that are interned: conversions and concatenations that produce one
+/// of these values return a clone of the shared per-thread instance.
+const CONSTANTS_ARRAY: [&[u16]; 120] = [
+    // Empty string
+    utf16!(""),
+    // Misc
+    utf16!(","),
+    utf16!(":"),
+    // Generic use
+    utf16!("name"),
+    utf16!("length"),
+    utf16!("arguments"),
+    utf16!("prototype"),
+    utf16!("constructor"),
+    // typeof
+    utf16!("null"),
+    utf16!("undefined"),
+    utf16!("number"),
+    utf16!("string"),
+    utf16!("symbol"),
+    utf16!("bigint"),
+    utf16!("object"),
+    utf16!("function"),
+    // Property descriptor
+    utf16!("value"),
+    utf16!("get"),
+    utf16!("set"),
+    utf16!("writable"),
+    utf16!("enumerable"),
+    utf16!("configurable"),
+    // Object object
+    utf16!("Object"),
+    utf16!("assign"),
+    utf16!("create"),
+    utf16!("toString"),
+    utf16!("valueOf"),
+    utf16!("is"),
+    utf16!("seal"),
+    utf16!("isSealed"),
+    utf16!("freeze"),
+    utf16!("isFrozen"),
+    utf16!("keys"),
+    utf16!("values"),
+    utf16!("entries"),
+    // Function object
+    utf16!("Function"),
+    utf16!("apply"),
+    utf16!("bind"),
+    utf16!("call"),
+    // Array object
+    utf16!("Array"),
+    utf16!("from"),
+    utf16!("isArray"),
+    utf16!("of"),
+    utf16!("get [Symbol.species]"),
+    utf16!("copyWithin"),
+    utf16!("every"),
+    utf16!("fill"),
+    utf16!("filter"),
+    utf16!("find"),
+    utf16!("findIndex"),
+    utf16!("flat"),
+    utf16!("flatMap"),
+    utf16!("forEach"),
+    utf16!("includes"),
+    utf16!("indexOf"),
+    utf16!("join"),
+    utf16!("map"),
+    utf16!("reduce"),
+    utf16!("reduceRight"),
+    utf16!("reverse"),
+    utf16!("shift"),
+    utf16!("slice"),
+    utf16!("some"),
+    utf16!("sort"),
+    utf16!("unshift"),
+    utf16!("push"),
+    utf16!("pop"),
+    // String object
+    utf16!("String"),
+    utf16!("charAt"),
+    utf16!("charCodeAt"),
+    utf16!("concat"),
+    utf16!("endsWith"),
+    utf16!("lastIndexOf"),
+    utf16!("match"),
+    utf16!("matchAll"),
+    utf16!("normalize"),
+    utf16!("padEnd"),
+    utf16!("padStart"),
+    utf16!("repeat"),
+    utf16!("replace"),
+    utf16!("replaceAll"),
+    utf16!("search"),
+    utf16!("split"),
+    utf16!("startsWith"),
+    utf16!("substring"),
+    utf16!("toLowerCase"),
+    utf16!("toUpperCase"),
+    utf16!("trim"),
+    utf16!("trimEnd"),
+    utf16!("trimStart"),
+    // Number object
+    utf16!("Number"),
+    // Boolean object
+    utf16!("Boolean"),
+    // RegExp object
+    utf16!("RegExp"),
+    utf16!("exec"),
+    utf16!("test"),
+    utf16!("flags"),
+    utf16!("index"),
+    utf16!("lastIndex"),
+    // Symbol object
+    utf16!("Symbol"),
+    utf16!("for"),
+    utf16!("keyFor"),
+    utf16!("description"),
+    utf16!("[Symbol.toPrimitive]"),
+    // Map object
+    utf16!("Map"),
+    utf16!("clear"),
+    utf16!("delete"),
+    utf16!("has"),
+    utf16!("size"),
+    // Set object
+    utf16!("Set"),
+    // Reflect object
+    utf16!("Reflect"),
+    // Error objects
+    utf16!("Error"),
+    utf16!("TypeError"),
+    utf16!("RangeError"),
+    utf16!("SyntaxError"),
+    utf16!("ReferenceError"),
+    utf16!("EvalError"),
+    utf16!("URIError"),
+    utf16!("message"),
+    // Date object
+    utf16!("Date"),
+    utf16!("toJSON"),
+];
+
+/// Length, in code units, of the longest entry of `CONSTANTS_ARRAY`.
+///
+/// Longer strings cannot be interned, so they skip the constants lookup.
+const MAX_CONSTANT_STRING_LENGTH: usize = {
+    let mut max = 0;
+    let mut i = 0;
+    while i < CONSTANTS_ARRAY.len() {
+        let len = CONSTANTS_ARRAY[i].len();
+        if len > max {
+            max = len;
+        }
+        i += 1;
+    }
+    max
+};
+
+thread_local! {
+    /// Per-thread set of the interned constant strings.
+    static CONSTANTS: FxHashSet<JsString> = {
+        let mut constants = FxHashSet::default();
+
+        for s in CONSTANTS_ARRAY.iter() {
+            let s = JsString::from_slice_skip_interning(s);
+            constants.insert(s);
+        }
+
+        constants
+    };
+}
+
+/// The inner representation of a [`JsString`].
+#[repr(C)]
+struct Inner {
+    /// The utf16 length.
+    len: usize,
+
+    /// The number of references to the string.
+    ///
+    /// When this reaches `0` the string is deallocated.
+    refcount: Cell<usize>,
+
+    /// An empty array which is used to get the offset of string data.
+    data: [u16; 0],
+}
+
+/// A reference-counted, immutable UTF-16 string.
+///
+/// Cloning only increments a (non-atomic) reference count; the allocation is
+/// freed when the last clone is dropped.
+#[derive(Finalize)]
+pub struct JsString {
+    ptr: NonNull<Inner>,
+    phantom: PhantomData<Inner>,
+}
+
+impl JsString {
+    /// Returns a shared reference to the header of the allocation.
+    fn inner(&self) -> &Inner {
+        // Safety: `ptr` always points to a live, initialized `Inner` for as
+        // long as any `JsString` referring to it exists.
+        unsafe { self.ptr.as_ref() }
+    }
+
+    /// Wraps an already initialized allocation without touching its refcount.
+    fn from_inner(ptr: NonNull<Inner>) -> Self {
+        Self {
+            ptr,
+            phantom: PhantomData,
+        }
+    }
+
+    /// Wraps a raw pointer returned by `allocate`.
+    ///
+    /// # Safety
+    ///
+    /// `ptr` must be non-null and point to an initialized `Inner`.
+    unsafe fn from_ptr(ptr: *mut Inner) -> Self {
+        Self::from_inner(unsafe { NonNull::new_unchecked(ptr) })
+    }
+
+    /// Computes the layout of an allocation holding `len` code units, along
+    /// with the byte offset of the code units from the start of the allocation.
+    ///
+    /// Both `allocate` and `Drop` use this so that the layout given to
+    /// `dealloc` is always the one the memory was allocated with.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the total size overflows the limits of `Layout`.
+    fn layout(len: usize) -> (Layout, usize) {
+        let (layout, offset) = Layout::array::<u16>(len)
+            .and_then(|arr| Layout::new::<Inner>().extend(arr))
+            .expect("string length exceeds the maximum allocation size");
+
+        (layout.pad_to_align(), offset)
+    }
+
+    /// Allocates room for `len` code units and initializes the header with a
+    /// reference count of `1`. The code units themselves are left
+    /// uninitialized.
+    ///
+    /// # Safety
+    ///
+    /// The caller must initialize all `len` code units before the string is
+    /// read, and must hand the pointer to `from_ptr` so it is eventually freed.
+    unsafe fn allocate(len: usize) -> *mut Inner {
+        let (layout, offset) = Self::layout(len);
+
+        unsafe {
+            let inner = std::alloc::alloc(layout) as *mut Inner;
+
+            // `alloc` signals failure with a null pointer; writing through it
+            // would be undefined behavior.
+            if inner.is_null() {
+                handle_alloc_error(layout);
+            }
+
+            // Write the first part, the Inner.
+            inner.write(Inner {
+                len,
+                refcount: Cell::new(1),
+                data: [0; 0],
+            });
+
+            // Get the string data pointer without creating a reference, so
+            // that it stays valid for the whole allocation.
+            let data = ptr::addr_of_mut!((*inner).data).cast::<u16>();
+
+            debug_assert!(ptr::eq(inner.cast::<u8>().add(offset).cast(), data));
+
+            inner
+        }
+    }
+
+    /// Creates a new string from `data` without consulting the constants
+    /// table.
+    fn from_slice_skip_interning(data: &[u16]) -> Self {
+        // Safety: `allocate` returns a valid allocation with room for
+        // `data.len()` code units, which cannot overlap with `data` since it
+        // was just allocated.
+        unsafe {
+            let ptr = Self::allocate(data.len());
+            ptr::copy_nonoverlapping(
+                data.as_ptr(),
+                ptr::addr_of_mut!((*ptr).data).cast::<u16>(),
+                data.len(),
+            );
+            Self::from_ptr(ptr)
+        }
+    }
+
+    /// Returns the UTF-16 code units of the string.
+    pub fn as_slice(&self) -> &[u16] {
+        self
+    }
+
+    /// Concatenates two strings.
+    pub fn concat<T, U>(x: T, y: U) -> JsString
+    where
+        T: AsRef<[u16]>,
+        U: AsRef<[u16]>,
+    {
+        Self::concat_array(&[x.as_ref(), y.as_ref()])
+    }
+
+    /// Concatenates an array of strings.
+    ///
+    /// If the result is equal to an interned constant, the shared constant is
+    /// returned instead of the newly built string.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the combined length overflows `usize`.
+    pub fn concat_array(strings: &[&[u16]]) -> JsString {
+        let len = strings
+            .iter()
+            .try_fold(0usize, |len, s| len.checked_add(s.len()))
+            .expect("combined string length overflows usize");
+
+        // Safety: the allocation has room for `len` code units and every
+        // source slice is copied to its own disjoint range within it.
+        let string = unsafe {
+            let ptr = Self::allocate(len);
+            let data = ptr::addr_of_mut!((*ptr).data).cast::<u16>();
+            let mut offset = 0;
+            for string in strings {
+                ptr::copy_nonoverlapping(string.as_ptr(), data.add(offset), string.len());
+                offset += string.len();
+            }
+            Self::from_ptr(ptr)
+        };
+
+        if string.len() <= MAX_CONSTANT_STRING_LENGTH {
+            if let Some(constant) = CONSTANTS.with(|c| c.get(&string).cloned()) {
+                return constant;
+            }
+        }
+
+        string
+    }
+
+    /// `6.1.4.1 StringIndexOf ( string, searchValue, fromIndex )`
+    ///
+    /// Note: Instead of returning an isize with `-1` as the "not found" value,
+    /// We make use of the type system and return Option<usize> with None as the "not found" value.
+    ///
+    /// More information:
+    ///  - [ECMAScript reference][spec]
+    ///
+    /// [spec]: https://tc39.es/ecma262/#sec-stringindexof
+    pub(crate) fn index_of(&self, search_value: &Self, from_index: usize) -> Option<usize> {
+        // 1. Assert: Type(string) is String.
+        // 2. Assert: Type(searchValue) is String.
+        // 3. Assert: fromIndex is a non-negative integer.
+
+        // 4. Let len be the length of string.
+        let len = self.len();
+
+        // 5. If searchValue is the empty String and fromIndex ≤ len, return fromIndex.
+        if search_value.is_empty() && from_index <= len {
+            return Some(from_index);
+        }
+
+        // 6. Let searchLen be the length of searchValue.
+        let search_len = search_value.len();
+
+        let range = len.checked_sub(search_len)?;
+
+        // 7. For each integer i starting with fromIndex such that i ≤ len - searchLen, in ascending order, do
+        for i in from_index..=range {
+            // a. Let candidate be the substring of string from i to i + searchLen.
+            let candidate = &self[i..i + search_len];
+
+            // b. If candidate is the same sequence of code units as searchValue, return i.
+            if candidate == search_value {
+                return Some(i);
+            }
+        }
+
+        // 8. Return -1.
+        None
+    }
+
+    /// Converts the string to a number, yielding `NaN` when the trimmed text
+    /// is not accepted as a numeric literal.
+    pub(crate) fn string_to_number(&self) -> f64 {
+        // TODO: to optimize this we would need to create our own version of `trim_matches` but for utf16
+        let string = String::from_utf16_lossy(self);
+        let string = string.trim_matches(is_trimmable_whitespace);
+
+        // TODO: write our own lexer to match syntax StrDecimalLiteral
+        match string {
+            "" => 0.0,
+            "Infinity" | "+Infinity" => f64::INFINITY,
+            "-Infinity" => f64::NEG_INFINITY,
+            // Prevent fast_float from parsing any other spelling of infinity
+            // or NaN ("inf", "infinity", "+INF", "nan", ...) as a number: only
+            // the exact spellings above are valid.
+            _ if Self::starts_with_inf_or_nan(string) => f64::NAN,
+            _ => fast_float::parse(string).unwrap_or(f64::NAN),
+        }
+    }
+
+    /// Returns `true` if `string`, after an optional sign, starts with `inf`
+    /// or `nan` in any ASCII case.
+    fn starts_with_inf_or_nan(string: &str) -> bool {
+        let unsigned = string
+            .strip_prefix(|c: char| c == '+' || c == '-')
+            .unwrap_or(string);
+
+        unsigned.as_bytes().get(..3).map_or(false, |prefix| {
+            prefix.eq_ignore_ascii_case(b"inf") || prefix.eq_ignore_ascii_case(b"nan")
+        })
+    }
+}
+
+impl AsRef<[u16]> for JsString {
+    fn as_ref(&self) -> &[u16] {
+        self
+    }
+}
+
+impl Borrow<[u16]> for JsString {
+    fn borrow(&self) -> &[u16] {
+        self
+    }
+}
+
+// Safety: [`JsString`] does not contain any objects which require trace,
+// so this is safe.
+unsafe impl Trace for JsString {
+    empty_trace!();
+}
+
+impl Clone for JsString {
+    #[inline]
+    fn clone(&self) -> Self {
+        // A wrapped-around count would let the allocation be freed while
+        // clones are still alive, so overflow must never go unnoticed.
+        let refcount = self
+            .inner()
+            .refcount
+            .get()
+            .checked_add(1)
+            .expect("JsString reference count overflow");
+        self.inner().refcount.set(refcount);
+
+        JsString {
+            ptr: self.ptr,
+            phantom: PhantomData,
+        }
+    }
+}
+
+impl Default for JsString {
+    fn default() -> Self {
+        Self::from(utf16!(""))
+    }
+}
+
+impl Drop for JsString {
+    #[inline]
+    fn drop(&mut self) {
+        let refcount = self.inner().refcount.get() - 1;
+        self.inner().refcount.set(refcount);
+
+        if refcount == 0 {
+            // The allocation holds the header *and* the code units, so the
+            // layout has to be recomputed from the length; `Inner` alone only
+            // describes the header.
+            let (layout, _) = Self::layout(self.inner().len);
+
+            // Safety: If refcount is 0 and we call drop, that means this is the last
+            // JsString which points to this memory allocation, so deallocating it is safe.
+            // `layout` is the same layout `allocate` used for this length.
+            unsafe {
+                dealloc(self.ptr.as_ptr().cast(), layout);
+            }
+        }
+    }
+}
+
+impl Deref for JsString {
+    type Target = [u16];
+
+    fn deref(&self) -> &Self::Target {
+        // Safety: the allocation holds `len` initialized code units right
+        // after the header. The data pointer is derived from the raw
+        // allocation pointer rather than from a `&Inner`, so it is valid for
+        // the whole string and not just for the header.
+        unsafe {
+            let inner = self.ptr.as_ptr();
+            std::slice::from_raw_parts(ptr::addr_of!((*inner).data).cast::<u16>(), (*inner).len)
+        }
+    }
+}
+
+impl PartialEq for JsString {
+    fn eq(&self, other: &Self) -> bool {
+        self.ptr == other.ptr || self[..] == other[..]
+    }
+}
+
+impl Eq for JsString {}
+
+impl PartialOrd for JsString {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        self[..].partial_cmp(other)
+    }
+}
+
+impl Ord for JsString {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        self[..].cmp(other)
+    }
+}
+
+impl PartialEq<[u16]> for JsString {
+    fn eq(&self, other: &[u16]) -> bool {
+        &**self == other
+    }
+}
+
+impl PartialEq<JsString> for [u16] {
+    fn eq(&self, other: &JsString) -> bool {
+        self == &**other
+    }
+}
+
+impl<const N: usize> PartialEq<[u16; N]> for JsString {
+    fn eq(&self, other: &[u16; N]) -> bool {
+        *self == other[..]
+    }
+}
+
+impl<const N: usize> PartialEq<JsString> for [u16; N] {
+    fn eq(&self, other: &JsString) -> bool {
+        self[..] == *other
+    }
+}
+
+impl Hash for JsString {
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        self[..].hash(state);
+    }
+}
+
+impl From<&[u16]> for JsString {
+    fn from(s: &[u16]) -> Self {
+        if s.len() <= MAX_CONSTANT_STRING_LENGTH {
+            if let Some(constant) = CONSTANTS.with(|c| c.get(s).cloned()) {
+                return constant;
+            }
+        }
+        Self::from_slice_skip_interning(s)
+    }
+}
+
+impl<const N: usize> From<&[u16; N]> for JsString {
+    #[inline]
+    fn from(s: &[u16; N]) -> Self {
+        JsString::from(&s[..])
+    }
+}
+
+impl<I: SliceIndex<[u16]>> Index<I> for JsString {
+    type Output = I::Output;
+
+    #[inline]
+    fn index(&self, index: I) -> &Self::Output {
+        Index::index(&**self, index)
+    }
+}
+
+impl fmt::Debug for JsString {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // Unpaired surrogates cannot be represented in a `String`, so they are
+        // rendered as `<0xXXXX>` instead.
+        std::char::decode_utf16(self.as_slice().iter().copied())
+            .map(|r| {
+                r.map_or_else(
+                    |err| format!("<0x{:04x}>", err.unpaired_surrogate()),
+                    String::from,
+                )
+            })
+            .collect::<String>()
+            .fmt(f)
+    }
+}
+
+impl fmt::Display for JsString {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        String::from_utf16_lossy(self.as_slice()).fmt(f)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::JsString;
+    use const_utf16::encode as utf16;
+    use std::mem::size_of;
+
+    impl JsString {
+        /// Gets the number of `JsString`s which point to this allocation.
+        #[inline]
+        fn refcount(&self) -> usize {
+            self.inner().refcount.get()
+        }
+    }
+
+    #[test]
+    fn empty() {
+        let s = JsString::from(utf16!(""));
+        assert_eq!(*s, "".encode_utf16().collect::<Vec<u16>>())
+    }
+
+    #[test]
+    fn pointer_size() {
+        assert_eq!(size_of::<JsString>(), size_of::<*const u8>());
+        assert_eq!(size_of::<Option<JsString>>(), size_of::<*const u8>());
+    }
+
+    #[test]
+    fn refcount() {
+        let x = JsString::from(utf16!("Hello world"));
+        assert_eq!(JsString::refcount(&x), 1);
+
+        {
+            let y = x.clone();
+            assert_eq!(JsString::refcount(&x), 2);
+            assert_eq!(JsString::refcount(&y), 2);
+
+            {
+                let z = y.clone();
+                assert_eq!(JsString::refcount(&x), 3);
+                assert_eq!(JsString::refcount(&y), 3);
+                assert_eq!(JsString::refcount(&z), 3);
+            }
+
+            assert_eq!(JsString::refcount(&x), 2);
+            assert_eq!(JsString::refcount(&y), 2);
+        }
+
+        assert_eq!(JsString::refcount(&x), 1);
+    }
+
+    #[test]
+    fn ptr_eq() {
+        let x = JsString::from(utf16!("Hello"));
+        let y = x.clone();
+
+        assert_eq!(x.ptr, y.ptr);
+
+        let z = JsString::from(utf16!("Hello"));
+        assert_ne!(x.ptr, z.ptr);
+        assert_ne!(y.ptr, z.ptr);
+    }
+
+    #[test]
+    fn as_str() {
+        const HELLO: &str = "Hello";
+        let x = JsString::from(utf16!(HELLO));
+
+        assert_eq!(*x, HELLO.encode_utf16().collect::<Vec<u16>>());
+    }
+
+    #[test]
+    fn hash() {
+        use std::collections::hash_map::DefaultHasher;
+        use std::hash::{Hash, Hasher};
+
+        const HELLOWORLD: &[u16] = utf16!("Hello World!");
+        let x = JsString::from(HELLOWORLD);
+
+        assert_eq!(&*x, HELLOWORLD);
+
+        let mut hasher = DefaultHasher::new();
+        HELLOWORLD.hash(&mut hasher);
+        let s_hash = hasher.finish();
+
+        let mut hasher = DefaultHasher::new();
+        x.hash(&mut hasher);
+        let x_hash = hasher.finish();
+
+        assert_eq!(s_hash, x_hash);
+    }
+
+    #[test]
+    fn concat() {
+        let x = JsString::from(utf16!("hello"));
+        const Y: &[u16] = utf16!(", ");
+        let z = JsString::from(utf16!("world"));
+        const W: &[u16] = utf16!("!");
+
+        let xy = JsString::concat(x, Y);
+        assert_eq!(xy, *utf16!("hello, "));
+        assert_eq!(JsString::refcount(&xy), 1);
+
+        let xyz = JsString::concat(xy, z);
+        assert_eq!(xyz, *utf16!("hello, world"));
+        assert_eq!(JsString::refcount(&xyz), 1);
+
+        let xyzw = JsString::concat(xyz, W);
+        assert_eq!(xyzw, *utf16!("hello, world!"));
+        assert_eq!(JsString::refcount(&xyzw), 1);
+    }
+
+    #[test]
+    fn string_to_number_infinity() {
+        assert_eq!(JsString::from(utf16!("Infinity")).string_to_number(), f64::INFINITY);
+        assert!(JsString::from(utf16!("infinity")).string_to_number().is_nan());
+        assert!(JsString::from(utf16!("INFINITY")).string_to_number().is_nan());
+        assert!(JsString::from(utf16!("-inf")).string_to_number().is_nan());
+        assert_eq!(JsString::from(utf16!("42")).string_to_number(), 42.0);
+    }
+}
diff --git a/boa/src/lib.rs b/boa/src/lib.rs
index b0612d26955..733ed942f10 100644
--- a/boa/src/lib.rs
+++ b/boa/src/lib.rs
@@ -73,6 +73,7 @@ pub mod class;
 pub mod context;
 pub mod environment;
 pub mod gc;
+pub mod jsstring;
 pub mod object;
 pub mod profiler;
 pub mod property;