From 1dd09426ae6eb646aec7c7335c165a0acf02ab7d Mon Sep 17 00:00:00 2001 From: Devin Alexander Torres Date: Tue, 17 Nov 2020 20:06:29 -0500 Subject: [PATCH] Implement String.prototype.codePointAt Closes #751. --- boa/src/builtins/string/mod.rs | 83 +++++++++++++++++++++++--------- boa/src/builtins/string/tests.rs | 46 ++++++++++++++++++ 2 files changed, 106 insertions(+), 23 deletions(-) diff --git a/boa/src/builtins/string/mod.rs b/boa/src/builtins/string/mod.rs index c0aebe5c78c..46bdf6910e1 100644 --- a/boa/src/builtins/string/mod.rs +++ b/boa/src/builtins/string/mod.rs @@ -22,7 +22,7 @@ use crate::{ }; use regress::Regex; use std::{ - char::decode_utf16, + char::{decode_utf16, from_u32}, cmp::{max, min}, f64::NAN, string::String as StdString, @@ -50,11 +50,11 @@ pub(crate) fn code_point_at(string: RcString, position: i32) -> Option<(u32, u8, } fn is_leading_surrogate(value: u16) -> bool { - value >= 0xD800 && value <= 0xDBFF + (0xD800..=0xDBFF).contains(&value) } fn is_trailing_surrogate(value: u16) -> bool { - value >= 0xDC00 && value <= 0xDFFF + (0xDC00..=0xDFFF).contains(&value) } /// JavaScript `String` implementation. @@ -84,6 +84,7 @@ impl BuiltIn for String { .property("length", 0, attribute) .method(Self::char_at, "charAt", 1) .method(Self::char_code_at, "charCodeAt", 1) + .method(Self::code_point_at, "codePointAt", 1) .method(Self::to_string, "toString", 0) .method(Self::concat, "concat", 1) .method(Self::repeat, "repeat", 1) @@ -197,23 +198,60 @@ impl String { .unwrap_or_else(Value::undefined) .to_integer(context)? as i32; + // Fast path returning empty string when pos is obviously out of range + if pos < 0 || pos >= primitive_val.len() as i32 { + return Ok("".into()); + } + // Calling .len() on a string would give the wrong result, as they are bytes not the number of // unicode code points // Note that this is an O(N) operation (because UTF-8 is complex) while getting the number of // bytes is an O(1) operation. - let length = primitive_val.chars().count(); + if let Some(utf16_val) = primitive_val.encode_utf16().nth(pos as usize) { + Ok(Value::from(from_u32(utf16_val as u32).unwrap())) + } else { + Ok("".into()) + } + } - // We should return an empty string is pos is out of range - if pos >= length as i32 || pos < 0 { - return Ok("".into()); + /// `String.prototype.codePointAt( index )` + /// + /// The `codePointAt()` method returns an integer between `0` to `1114111` (`0x10FFFF`) representing the UTF-16 code unit at the given index. + /// + /// If no UTF-16 surrogate pair begins at the index, the code point at the index is returned. + /// + /// `codePointAt()` returns `undefined` if the given index is less than `0`, or if it is equal to or greater than the `length` of the string. + /// + /// More information: + /// - [ECMAScript reference][spec] + /// - [MDN documentation][mdn] + /// + /// [spec]: https://tc39.es/ecma262/#sec-string.prototype.codepointat + /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/codePointAt + pub(crate) fn code_point_at( + this: &Value, + args: &[Value], + context: &mut Context, + ) -> Result { + // First we get it the actual string a private field stored on the object only the context has access to. + // Then we convert it into a Rust String by wrapping it in from_value + let primitive_val = this.to_string(context)?; + let pos = args + .get(0) + .cloned() + .unwrap_or_else(Value::undefined) + .to_integer(context)? as i32; + + // Fast path returning undefined when pos is obviously out of range + if pos < 0 || pos >= primitive_val.len() as i32 { + return Ok(Value::undefined()); } - Ok(Value::from( - primitive_val - .chars() - .nth(pos as usize) - .expect("failed to get value"), - )) + if let Some((code_point, _, _)) = code_point_at(primitive_val, pos) { + Ok(Value::from(code_point)) + } else { + Ok(Value::undefined()) + } } /// `String.prototype.charCodeAt( index )` @@ -238,26 +276,25 @@ impl String { // First we get it the actual string a private field stored on the object only the context has access to. // Then we convert it into a Rust String by wrapping it in from_value let primitive_val = this.to_string(context)?; - - // Calling .len() on a string would give the wrong result, as they are bytes not the number of unicode code points - // Note that this is an O(N) operation (because UTF-8 is complex) while getting the number of bytes is an O(1) operation. - let length = primitive_val.chars().count(); let pos = args .get(0) .cloned() .unwrap_or_else(Value::undefined) .to_integer(context)? as i32; - if pos >= length as i32 || pos < 0 { + // Fast path returning NaN when pos is obviously out of range + if pos < 0 || pos >= primitive_val.len() as i32 { return Ok(Value::from(NAN)); } - let utf16_val = primitive_val - .encode_utf16() - .nth(pos as usize) - .expect("failed to get utf16 value"); + // Calling .len() on a string would give the wrong result, as they are bytes not the number of unicode code points + // Note that this is an O(N) operation (because UTF-8 is complex) while getting the number of bytes is an O(1) operation. // If there is no element at that index, the result is NaN - Ok(Value::from(f64::from(utf16_val))) + if let Some(utf16_val) = primitive_val.encode_utf16().nth(pos as usize) { + Ok(Value::from(f64::from(utf16_val))) + } else { + Ok(Value::from(NAN)) + } } /// `String.prototype.concat( str1[, ...strN] )` diff --git a/boa/src/builtins/string/tests.rs b/boa/src/builtins/string/tests.rs index 3ff349381c7..611d0ecac42 100644 --- a/boa/src/builtins/string/tests.rs +++ b/boa/src/builtins/string/tests.rs @@ -775,19 +775,65 @@ fn last_index_non_integer_position_argument() { #[test] fn char_at() { let mut context = Context::new(); + assert_eq!(forward(&mut context, "'abc'.charAt(-1)"), "\"\""); assert_eq!(forward(&mut context, "'abc'.charAt(1)"), "\"b\""); assert_eq!(forward(&mut context, "'abc'.charAt(9)"), "\"\""); assert_eq!(forward(&mut context, "'abc'.charAt()"), "\"a\""); assert_eq!(forward(&mut context, "'abc'.charAt(null)"), "\"a\""); + assert_eq!(forward(&mut context, "'\\uDBFF'.charAt(0)"), "\"\u{FFFD}\""); } #[test] fn char_code_at() { let mut context = Context::new(); + assert_eq!(forward(&mut context, "'abc'.charCodeAt(-1)"), "NaN"); assert_eq!(forward(&mut context, "'abc'.charCodeAt(1)"), "98"); assert_eq!(forward(&mut context, "'abc'.charCodeAt(9)"), "NaN"); assert_eq!(forward(&mut context, "'abc'.charCodeAt()"), "97"); assert_eq!(forward(&mut context, "'abc'.charCodeAt(null)"), "97"); + assert_eq!(forward(&mut context, "'\\uFFFF'.charCodeAt(0)"), "65535"); +} + +#[test] +fn code_point_at() { + let mut context = Context::new(); + assert_eq!(forward(&mut context, "'abc'.codePointAt(-1)"), "undefined"); + assert_eq!(forward(&mut context, "'abc'.codePointAt(1)"), "98"); + assert_eq!(forward(&mut context, "'abc'.codePointAt(9)"), "undefined"); + assert_eq!(forward(&mut context, "'abc'.codePointAt()"), "97"); + assert_eq!(forward(&mut context, "'abc'.codePointAt(null)"), "97"); + assert_eq!( + forward(&mut context, "'\\uD800\\uDC00'.codePointAt(0)"), + "65536" + ); + assert_eq!( + forward(&mut context, "'\\uD800\\uDFFF'.codePointAt(0)"), + "66559" + ); + assert_eq!( + forward(&mut context, "'\\uDBFF\\uDC00'.codePointAt(0)"), + "1113088" + ); + assert_eq!( + forward(&mut context, "'\\uDBFF\\uDFFF'.codePointAt(0)"), + "1114111" + ); + assert_eq!( + forward(&mut context, "'\\uD800\\uDC00'.codePointAt(1)"), + "56320" + ); + assert_eq!( + forward(&mut context, "'\\uD800\\uDFFF'.codePointAt(1)"), + "57343" + ); + assert_eq!( + forward(&mut context, "'\\uDBFF\\uDC00'.codePointAt(1)"), + "56320" + ); + assert_eq!( + forward(&mut context, "'\\uDBFF\\uDFFF'.codePointAt(1)"), + "57343" + ); } #[test]