Skip to content

Commit

Permalink
Implement RegExp named capture groups (#1390)
Browse files Browse the repository at this point in the history
  • Loading branch information
raskad authored Jul 27, 2021
1 parent f93145c commit a7ebfc8
Show file tree
Hide file tree
Showing 2 changed files with 106 additions and 41 deletions.
102 changes: 65 additions & 37 deletions boa/src/builtins/regexp/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
pub mod regexp_string_iterator;

use crate::{
builtins::{array::Array, BuiltIn},
builtins::{array::Array, string, BuiltIn},
gc::{empty_trace, Finalize, Trace},
object::{ConstructorBuilder, FunctionBuilder, GcObject, ObjectData, PROTOTYPE},
property::Attribute,
Expand Down Expand Up @@ -712,8 +712,7 @@ impl RegExp {
// 2. Assert: Type(S) is String.

// 3. Let length be the number of code units in S.
// Regress only works with utf8. According to the spec we would use the utf16 encoded count.
let length = input.chars().count();
let length = input.encode_utf16().count();

// 4. Let lastIndex be ℝ(? ToLength(? Get(R, "lastIndex"))).
let mut last_index = this.get_field("lastIndex", context)?.to_length(context)?;
Expand Down Expand Up @@ -755,10 +754,16 @@ impl RegExp {

// b. Let r be matcher(S, lastIndex).
// last_index counts UTF-16 code units, but the matcher is given a UTF-8 byte offset into input; make sure it maps to a valid one.
if input.get(last_index..).is_none() {
return Ok(Value::null());
}
let r = matcher.find_from(&input, last_index).next();
let last_byte_index = match String::from_utf16(
&input.encode_utf16().take(last_index).collect::<Vec<u16>>(),
) {
Ok(s) => s.len(),
Err(_) => {
return context
.throw_type_error("Failed to get byte index from utf16 encoded string")
}
};
let r = matcher.find_from(&input, last_byte_index).next();

match r {
// c. If r is failure, then
Expand Down Expand Up @@ -809,8 +814,7 @@ impl RegExp {
// Let eUTF be the smallest index into S that corresponds to the character at element e of Input.
// If e is greater than or equal to the number of elements in Input, then eUTF is the number of code units in S.
// b. Set e to eUTF.
// Regress only works with utf8. According to the spec we would use the utf16 encoded count.
e = input.split_at(e).0.chars().count() + 1;
e = input.split_at(e).0.encode_utf16().count();
}

// 15. If global is true or sticky is true, then
Expand Down Expand Up @@ -847,12 +851,36 @@ impl RegExp {
a.create_data_property_or_throw(0, matched_substr, context)
.unwrap();

// TODO: named capture groups
// 24. If R contains any GroupName, then
// a. Let groups be ! OrdinaryObjectCreate(null).
// 25. Else,
// a. Let groups be undefined.
let groups = Value::undefined();
let named_groups = match_value.named_groups();
let groups = if named_groups.clone().count() > 0 {
// a. Let groups be ! OrdinaryObjectCreate(null).
let groups = Value::new_object(context);

// Perform 27.f here
// f. If the ith capture of R was defined with a GroupName, then
// i. Let s be the CapturingGroupName of the corresponding RegExpIdentifierName.
// ii. Perform ! CreateDataPropertyOrThrow(groups, s, capturedValue).
for (name, range) in named_groups {
if let Some(range) = range {
let value = if let Some(s) = input.get(range.clone()) {
s
} else {
""
};

groups
.to_object(context)?
.create_data_property_or_throw(name, value, context)
.unwrap();
}
}
groups
} else {
// a. Let groups be undefined.
Value::undefined()
};

// 26. Perform ! CreateDataPropertyOrThrow(A, "groups", groups).
a.create_data_property_or_throw("groups", groups, context)
Expand Down Expand Up @@ -880,11 +908,6 @@ impl RegExp {
// e. Perform ! CreateDataPropertyOrThrow(A, ! ToString(𝔽(i)), capturedValue).
a.create_data_property_or_throw(i, captured_value, context)
.unwrap();

// TODO: named capture groups
// f. If the ith capture of R was defined with a GroupName, then
// i. Let s be the CapturingGroupName of the corresponding RegExpIdentifierName.
// ii. Perform ! CreateDataPropertyOrThrow(groups, s, capturedValue).
}

// 28. Return A.
Expand Down Expand Up @@ -931,7 +954,7 @@ impl RegExp {
let unicode = this.get_field("unicode", context)?.to_boolean();

// c. Perform ? Set(rx, "lastIndex", +0𝔽, true).
this.set_field("lastIndex", Value::from(0), true, context)?;
this.set_field("lastIndex", 0, true, context)?;

// d. Let A be ! ArrayCreate(0).
let a = Array::array_create(0, None, context).unwrap();
Expand Down Expand Up @@ -1099,8 +1122,7 @@ impl RegExp {
.to_string(context)?;

// 4. Let lengthS be the number of code unit elements in S.
// Regress only works with utf8. According to the spec we would use the utf16 encoded count.
let length_arg_str = arg_str.chars().count();
let length_arg_str = arg_str.encode_utf16().count();

// 5. Let functionalReplace be IsCallable(replaceValue).
let replace_value = args.get(1).cloned().unwrap_or_default();
Expand Down Expand Up @@ -1179,8 +1201,7 @@ impl RegExp {
let matched = result.get_field("0", context)?.to_string(context)?;

// d. Let matchLength be the number of code units in matched.
// Regress only works with utf8. According to the spec we would use the utf16 encoded count.
let match_length = matched.chars().count();
let match_length = matched.encode_utf16().count();

// e. Let position be ? ToIntegerOrInfinity(? Get(result, "index")).
let position = result
Expand Down Expand Up @@ -1260,13 +1281,14 @@ impl RegExp {
}

// ii. Let replacement be ? GetSubstitution(matched, S, position, captures, namedCaptures, replaceValue).
replacement = crate::builtins::string::get_substitution(
replacement = string::get_substitution(
matched.to_string(),
arg_str.to_string(),
position,
captures,
named_captures,
replace_value.to_string(context)?.to_string(),
context,
)?;
}

Expand Down Expand Up @@ -1433,7 +1455,7 @@ impl RegExp {
}

// 15. Let size be the length of S.
let size = arg_str.chars().count();
let size = arg_str.encode_utf16().count();

// 16. If size is 0, then
if size == 0 {
Expand Down Expand Up @@ -1485,10 +1507,13 @@ impl RegExp {
q = advance_string_index(arg_str.clone(), q, unicode);
} else {
// 1. Let T be the substring of S from p to q.
//let arg_str_substring = arg_str
// .get(p..q)
// .expect("invalid index into string to split");
let arg_str_substring: String = arg_str.chars().skip(p).take(q - p).collect();
let arg_str_substring = String::from_utf16_lossy(
&arg_str
.encode_utf16()
.skip(p)
.take(q - p)
.collect::<Vec<u16>>(),
);

// 2. Perform ! CreateDataPropertyOrThrow(A, ! ToString(𝔽(lengthA)), T).
a.create_data_property_or_throw(length_a, arg_str_substring, context)
Expand Down Expand Up @@ -1542,7 +1567,13 @@ impl RegExp {
}

// 20. Let T be the substring of S from p to size.
let arg_str_substring: String = arg_str.chars().skip(p).take(size - p).collect();
let arg_str_substring = String::from_utf16_lossy(
&arg_str
.encode_utf16()
.skip(p)
.take(size - p)
.collect::<Vec<u16>>(),
);

// 21. Perform ! CreateDataPropertyOrThrow(A, ! ToString(𝔽(lengthA)), T).
a.create_data_property_or_throw(length_a, arg_str_substring, context)
Expand Down Expand Up @@ -1570,19 +1601,16 @@ fn advance_string_index(s: JsString, index: usize, unicode: bool) -> usize {
}

// 3. Let length be the number of code units in S.
let length = s.chars().count();
let length = s.encode_utf16().count();

// 4. If index + 1 ≥ length, return index + 1.
if index + 1 > length {
return index + 1;
}

// 5. Let cp be ! CodePointAt(S, index).
let offset = if let Some(c) = s.chars().nth(index) {
c.len_utf8()
} else {
1
};
let (_, offset, _) =
crate::builtins::string::code_point_at(s, index as i32).expect("Failed to get code point");

index + offset
index + offset as usize
}
45 changes: 41 additions & 4 deletions boa/src/builtins/string/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -583,7 +583,7 @@ impl String {
length
} else {
args.get(1)
.expect("Could not get argumetn")
.expect("Could not get argument")
.to_integer(context)? as i32
};

Expand Down Expand Up @@ -746,6 +746,7 @@ impl String {
captures,
Value::undefined(),
replace_value.to_string(context)?.to_string(),
context,
)?
};

Expand Down Expand Up @@ -1511,8 +1512,9 @@ pub(crate) fn get_substitution(
str: StdString,
position: usize,
captures: Vec<Value>,
_named_captures: Value,
named_captures: Value,
replacement: StdString,
context: &mut Context,
) -> Result<JsString> {
// 1. Assert: Type(matched) is String.

Expand Down Expand Up @@ -1626,9 +1628,44 @@ pub(crate) fn get_substitution(
}
// $<
(Some('<'), _) => {
// TODO: named capture groups
// 1. If namedCaptures is undefined, the replacement text is the String "$<".
result.push_str("$<");
// 2. Else,
if named_captures.is_undefined() {
result.push_str("$<")
} else {
// a. Assert: Type(namedCaptures) is Object.

// b. Scan until the next > U+003E (GREATER-THAN SIGN).
let mut group_name = StdString::new();
let mut found = false;
loop {
match chars.next() {
Some('>') => {
found = true;
break;
}
Some(c) => group_name.push(c),
None => break,
}
}

// c. If none is found, the replacement text is the String "$<".
// d. Else,
if !found {
result.push_str("$<");
result.push_str(&group_name);
} else {
// i. Let groupName be the enclosed substring.
// ii. Let capture be ? Get(namedCaptures, groupName).
let capture = named_captures.get_field(group_name, context)?;

// iii. If capture is undefined, replace the text through > with the empty String.
// iv. Otherwise, replace the text through > with ? ToString(capture).
if !capture.is_undefined() {
result.push_str(capture.to_string(context)?.as_str());
}
}
}
}
// $?, ? is none of the above
_ => {
Expand Down

0 comments on commit a7ebfc8

Please sign in to comment.