Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement String.prototype.codePointAt #935

Merged
merged 1 commit into from
Nov 20, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 60 additions & 23 deletions boa/src/builtins/string/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ use crate::{
};
use regress::Regex;
use std::{
char::decode_utf16,
char::{decode_utf16, from_u32},
cmp::{max, min},
f64::NAN,
string::String as StdString,
Expand Down Expand Up @@ -50,11 +50,11 @@ pub(crate) fn code_point_at(string: RcString, position: i32) -> Option<(u32, u8,
}

fn is_leading_surrogate(value: u16) -> bool {
value >= 0xD800 && value <= 0xDBFF
(0xD800..=0xDBFF).contains(&value)
}

fn is_trailing_surrogate(value: u16) -> bool {
value >= 0xDC00 && value <= 0xDFFF
(0xDC00..=0xDFFF).contains(&value)
}

/// JavaScript `String` implementation.
Expand Down Expand Up @@ -84,6 +84,7 @@ impl BuiltIn for String {
.property("length", 0, attribute)
.method(Self::char_at, "charAt", 1)
.method(Self::char_code_at, "charCodeAt", 1)
.method(Self::code_point_at, "codePointAt", 1)
.method(Self::to_string, "toString", 0)
.method(Self::concat, "concat", 1)
.method(Self::repeat, "repeat", 1)
Expand Down Expand Up @@ -197,23 +198,60 @@ impl String {
.unwrap_or_else(Value::undefined)
.to_integer(context)? as i32;

// Fast path returning empty string when pos is obviously out of range
if pos < 0 || pos >= primitive_val.len() as i32 {
return Ok("".into());
}

// Calling .len() on a string would give the wrong result, as they are bytes not the number of
// unicode code points
// Note that this is an O(N) operation (because UTF-8 is complex) while getting the number of
// bytes is an O(1) operation.
let length = primitive_val.chars().count();
if let Some(utf16_val) = primitive_val.encode_utf16().nth(pos as usize) {
Ok(Value::from(from_u32(utf16_val as u32).unwrap()))
} else {
Ok("".into())
}
}

// We should return an empty string if pos is out of range
if pos >= length as i32 || pos < 0 {
return Ok("".into());
/// `String.prototype.codePointAt( index )`
///
/// The `codePointAt()` method returns an integer between `0` and `1114111` (`0x10FFFF`) representing the Unicode code point that starts at the given index.
///
/// If no UTF-16 surrogate pair begins at the index, the value of the single code unit at the index is returned.
///
/// `codePointAt()` returns `undefined` if the given index is less than `0`, or if it is equal to or greater than the `length` of the string.
///
/// More information:
/// - [ECMAScript reference][spec]
/// - [MDN documentation][mdn]
///
/// [spec]: https://tc39.es/ecma262/#sec-string.prototype.codepointat
/// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/codePointAt
pub(crate) fn code_point_at(
this: &Value,
args: &[Value],
context: &mut Context,
) -> Result<Value> {
// First we get the actual string, a private field stored on the object that only the context has access to.
// Then we convert it into a Rust String by wrapping it in from_value
let primitive_val = this.to_string(context)?;
let pos = args
.get(0)
.cloned()
.unwrap_or_else(Value::undefined)
.to_integer(context)? as i32;

// Fast path returning undefined when pos is obviously out of range
if pos < 0 || pos >= primitive_val.len() as i32 {
return Ok(Value::undefined());
}

Ok(Value::from(
primitive_val
.chars()
.nth(pos as usize)
.expect("failed to get value"),
))
if let Some((code_point, _, _)) = code_point_at(primitive_val, pos) {
Ok(Value::from(code_point))
} else {
Ok(Value::undefined())
}
}

/// `String.prototype.charCodeAt( index )`
Expand All @@ -238,26 +276,25 @@ impl String {
// First we get the actual string, a private field stored on the object that only the context has access to.
// Then we convert it into a Rust String by wrapping it in from_value
let primitive_val = this.to_string(context)?;

// Calling .len() on a string would give the wrong result, as they are bytes not the number of unicode code points
// Note that this is an O(N) operation (because UTF-8 is complex) while getting the number of bytes is an O(1) operation.
let length = primitive_val.chars().count();
let pos = args
.get(0)
.cloned()
.unwrap_or_else(Value::undefined)
.to_integer(context)? as i32;

if pos >= length as i32 || pos < 0 {
// Fast path returning NaN when pos is obviously out of range
if pos < 0 || pos >= primitive_val.len() as i32 {
return Ok(Value::from(NAN));
}

let utf16_val = primitive_val
.encode_utf16()
.nth(pos as usize)
.expect("failed to get utf16 value");
// Calling .len() on a string would give the wrong result, as it counts bytes, not the number of Unicode code points
// Note that this is an O(N) operation (because UTF-8 is complex) while getting the number of bytes is an O(1) operation.
// If there is no element at that index, the result is NaN
Ok(Value::from(f64::from(utf16_val)))
if let Some(utf16_val) = primitive_val.encode_utf16().nth(pos as usize) {
Ok(Value::from(f64::from(utf16_val)))
} else {
Ok(Value::from(NAN))
}
}

/// `String.prototype.concat( str1[, ...strN] )`
Expand Down
46 changes: 46 additions & 0 deletions boa/src/builtins/string/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -775,19 +775,65 @@ fn last_index_non_integer_position_argument() {
#[test]
fn char_at() {
    // Each entry pairs a script with the displayed result it must produce.
    // The scripts run in order against a single shared context.
    const CASES: [(&str, &str); 6] = [
        // Out-of-range positions (negative or past the end) give an empty string.
        ("'abc'.charAt(-1)", "\"\""),
        ("'abc'.charAt(1)", "\"b\""),
        ("'abc'.charAt(9)", "\"\""),
        // A missing or `null` position selects the first character.
        ("'abc'.charAt()", "\"a\""),
        ("'abc'.charAt(null)", "\"a\""),
        // A lone leading surrogate is expected to display as U+FFFD.
        ("'\\uDBFF'.charAt(0)", "\"\u{FFFD}\""),
    ];

    let mut context = Context::new();
    for &(script, expected) in CASES.iter() {
        assert_eq!(forward(&mut context, script), expected);
    }
}

#[test]
fn char_code_at() {
    // Each entry pairs a script with the displayed result it must produce.
    // The scripts run in order against a single shared context.
    const CASES: [(&str, &str); 6] = [
        // Out-of-range positions (negative or past the end) give NaN.
        ("'abc'.charCodeAt(-1)", "NaN"),
        ("'abc'.charCodeAt(1)", "98"),
        ("'abc'.charCodeAt(9)", "NaN"),
        // A missing or `null` position selects the first code unit.
        ("'abc'.charCodeAt()", "97"),
        ("'abc'.charCodeAt(null)", "97"),
        // The largest BMP code unit is reported unchanged.
        ("'\\uFFFF'.charCodeAt(0)", "65535"),
    ];

    let mut context = Context::new();
    for &(script, expected) in CASES.iter() {
        assert_eq!(forward(&mut context, script), expected);
    }
}

#[test]
fn code_point_at() {
    // Each entry pairs a script with the displayed result it must produce.
    // The scripts run in order against a single shared context.
    const CASES: [(&str, &str); 13] = [
        // Out-of-range positions (negative or past the end) give `undefined`.
        ("'abc'.codePointAt(-1)", "undefined"),
        ("'abc'.codePointAt(1)", "98"),
        ("'abc'.codePointAt(9)", "undefined"),
        // A missing or `null` position selects the first code point.
        ("'abc'.codePointAt()", "97"),
        ("'abc'.codePointAt(null)", "97"),
        // Index 0 of a surrogate pair: the combined code point is returned,
        // covering the four corners of the surrogate ranges.
        ("'\\uD800\\uDC00'.codePointAt(0)", "65536"),
        ("'\\uD800\\uDFFF'.codePointAt(0)", "66559"),
        ("'\\uDBFF\\uDC00'.codePointAt(0)", "1113088"),
        ("'\\uDBFF\\uDFFF'.codePointAt(0)", "1114111"),
        // Index 1 of a surrogate pair: only the trailing surrogate's own
        // code unit value is returned.
        ("'\\uD800\\uDC00'.codePointAt(1)", "56320"),
        ("'\\uD800\\uDFFF'.codePointAt(1)", "57343"),
        ("'\\uDBFF\\uDC00'.codePointAt(1)", "56320"),
        ("'\\uDBFF\\uDFFF'.codePointAt(1)", "57343"),
    ];

    let mut context = Context::new();
    for &(script, expected) in CASES.iter() {
        assert_eq!(forward(&mut context, script), expected);
    }
}

#[test]
Expand Down