Skip to content

Commit

Permalink
[regexp] Replace whitespace and digit comparison instructions with ge…
Browse files Browse the repository at this point in the history
…neric set comparisons (#105)

## Summary

In preparation for fixes to case insensitive mode we are moving some
special comparison instructions to generic set comparisons. This PR
refactors whitespace and digit comparison instructions to use the
generic set comparison path, meaning individual comparisons are emitted
for each range in the set.

Also fixes generating a set for the unimplemented "Assigned" binary
property.

## Tests

Verify that bytecode snapshot tests for digit and whitespace character
classes are updated with the new comparison instructions.
  • Loading branch information
Hans-Halverson authored Jan 20, 2025
1 parent 1bafa68 commit 72b163a
Show file tree
Hide file tree
Showing 11 changed files with 89 additions and 184 deletions.
12 changes: 10 additions & 2 deletions src/js/common/unicode_property.rs
Original file line number Diff line number Diff line change
Expand Up @@ -156,9 +156,17 @@ impl BinaryUnicodeProperty {
add_binary_property_to_set(set_builder, &ICU.properties.ascii_hex_digit)
}
Self::Alphabetic => add_binary_property_to_set(set_builder, &ICU.properties.alphabetic),
Self::Any => set_builder.add_range32(0x00..=MAX_CODE_POINT),
Self::Any => set_builder.add_range32(0..=MAX_CODE_POINT),
Self::Assigned => {
todo!()
// Assigned is the complement of the Unassigned general category
let mut assigned_set = CodePointInversionListBuilder::new();
add_general_category_group_to_set(
&mut assigned_set,
GeneralCategoryGroup::Unassigned,
);
assigned_set.complement();

set_builder.add_set(&assigned_set.build());
}
Self::BidiControl => {
add_binary_property_to_set(set_builder, &ICU.properties.bidi_control)
Expand Down
79 changes: 51 additions & 28 deletions src/js/runtime/regexp/compiler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@ use std::sync::LazyLock;
use icu_collections::codepointinvlist::{CodePointInversionList, CodePointInversionListBuilder};

use crate::js::{
common::{unicode::CodePoint, wtf_8::Wtf8String},
common::{
unicode::{CodePoint, MAX_CODE_POINT},
wtf_8::Wtf8String,
},
parser::regexp::{
Alternative, AnonymousGroup, Assertion, CaptureGroup, CaptureGroupIndex, CharacterClass,
ClassRange, Disjunction, Lookaround, Quantifier, RegExp, RegExpFlags, Term,
Expand All @@ -22,12 +25,10 @@ use super::{
AssertNotWordBoundaryInstruction, AssertStartInstruction, AssertStartOrNewlineInstruction,
AssertWordBoundaryInstruction, BackreferenceInstruction, BranchInstruction,
ClearCaptureInstruction, CompareBetweenInstruction, CompareEqualsInstruction,
CompareIsDigitInstruction, CompareIsNotDigitInstruction, CompareIsNotWhitespaceInstruction,
CompareIsWhitespaceInstruction, ConsumeIfFalseInstruction, ConsumeIfTrueInstruction,
FailInstruction, InstructionIteratorMut, JumpInstruction, LiteralInstruction,
LookaroundInstruction, LoopInstruction, MarkCapturePointInstruction, OpCode,
ProgressInstruction, WildcardInstruction, WildcardNoNewlineInstruction,
WordBoundaryMoveToPreviousInstruction,
ConsumeIfFalseInstruction, ConsumeIfTrueInstruction, FailInstruction,
InstructionIteratorMut, JumpInstruction, LiteralInstruction, LookaroundInstruction,
LoopInstruction, MarkCapturePointInstruction, OpCode, ProgressInstruction,
WildcardInstruction, WildcardNoNewlineInstruction, WordBoundaryMoveToPreviousInstruction,
},
matcher::canonicalize,
};
Expand Down Expand Up @@ -242,22 +243,6 @@ impl CompiledRegExpBuilder {
CompareBetweenInstruction::write(self.current_block_buf(), start, end)
}

fn emit_compare_is_digit_instruction(&mut self) {
CompareIsDigitInstruction::write(self.current_block_buf())
}

fn emit_compare_is_not_digit_instruction(&mut self) {
CompareIsNotDigitInstruction::write(self.current_block_buf())
}

fn emit_compare_is_whitespace_instruction(&mut self) {
CompareIsWhitespaceInstruction::write(self.current_block_buf())
}

fn emit_compare_is_not_whitespace_instruction(&mut self) {
CompareIsNotWhitespaceInstruction::write(self.current_block_buf())
}

fn emit_lookaround_instruction(&mut self, is_ahead: bool, is_positive: bool, body_branch: u32) {
LookaroundInstruction::write(self.current_block_buf(), is_ahead, is_positive, body_branch)
}
Expand Down Expand Up @@ -957,13 +942,19 @@ impl CompiledRegExpBuilder {
set_builder.add_range32(*start..=*end);
}
}
// Shorthand char ranges have own instructions
ClassRange::Digit => self.emit_compare_is_digit_instruction(),
ClassRange::NotDigit => self.emit_compare_is_not_digit_instruction(),
// Use the precomputed word sets for word and whitespace character classes
ClassRange::Word => set_builder.add_set(&WORD_SET),
ClassRange::NotWord => set_builder.add_set(&NOT_WORD_SET),
ClassRange::Whitespace => self.emit_compare_is_whitespace_instruction(),
ClassRange::NotWhitespace => self.emit_compare_is_not_whitespace_instruction(),
ClassRange::Whitespace => set_builder.add_set(&WHITESPACE_SET),
ClassRange::NotWhitespace => set_builder.add_set(&NOT_WHITESPACE_SET),
// Decimal ranges are simple so they are hardcoded
ClassRange::Digit => {
set_builder.add_range('0'..='9');
}
ClassRange::NotDigit => {
set_builder.add_range32(0..('0' as u32));
set_builder.add_range32(('9' as u32 + 1)..=MAX_CODE_POINT);
}
ClassRange::UnicodeProperty(property) => {
property.add_to_set(&mut set_builder);
}
Expand Down Expand Up @@ -1107,12 +1098,44 @@ static NOT_WORD_SET: LazyLock<CodePointInversionList> = LazyLock::new(|| {
set_builder.build()
});

/// Set of whitespace characters to be used for whitespace character classes.
static WHITESPACE_SET: LazyLock<CodePointInversionList> =
LazyLock::new(|| create_whitespace_set_builder().build());

/// Set of non-whitespace characters to be used for non-whitespace character classes.
static NOT_WHITESPACE_SET: LazyLock<CodePointInversionList> = LazyLock::new(|| {
let mut set_builder = create_whitespace_set_builder();
set_builder.complement();
set_builder.build()
});

fn create_word_set_builder() -> CodePointInversionListBuilder {
let mut set_builder = CodePointInversionListBuilder::new();

set_builder.add_range('a'..='z');
set_builder.add_range('A'..='Z');
set_builder.add_range('0'..='9');
set_builder.add_char('_');

set_builder
}

fn create_whitespace_set_builder() -> CodePointInversionListBuilder {
// All code points on the right hand side of WhiteSpace or LineTerminator productions in the
// spec.
let mut set_builder = CodePointInversionListBuilder::new();

set_builder.add_range('\u{0009}'..='\u{000D}');
set_builder.add_char('\u{0020}');
set_builder.add_char('\u{00A0}');
set_builder.add_char('\u{1680}');
set_builder.add_range('\u{2000}'..='\u{200A}');
set_builder.add_range('\u{2028}'..='\u{2029}');
set_builder.add_char('\u{202F}');
set_builder.add_char('\u{205F}');
set_builder.add_char('\u{3000}');
set_builder.add_char('\u{FEFF}');

set_builder
}

Expand Down
39 changes: 0 additions & 39 deletions src/js/runtime/regexp/instruction.rs
Original file line number Diff line number Diff line change
Expand Up @@ -153,26 +153,6 @@ pub enum OpCode {
/// Layout: [[opcode: u8] [start_code_point: u24] [end_code_point: u32]]
CompareBetween,

/// Set the compare register to true if the current code point is a digit (\d)
///
/// Layout: [[opcode: u8] [padding: u24]]
CompareIsDigit,

/// Set the compare register to true if the current code point is not a digit (\d)
///
/// Layout: [[opcode: u8] [padding: u24]]
CompareIsNotDigit,

/// Set the compare register to true if the current code point is a whitespace (\S)
///
/// Layout: [[opcode: u8] [padding: u24]]
CompareIsWhitespace,

/// Set the compare register to true if the current code point is not a whitespace (\S)
///
/// Layout: [[opcode: u8] [padding: u24]]
CompareIsNotWhitespace,

/// Start a lookahead with operands `is_ahead`, `is_positive`, and `body_branch`
/// which is the instruction that starts the lookaround body.
///
Expand Down Expand Up @@ -209,10 +189,6 @@ impl OpCode {
OpCode::ConsumeIfFalse => ConsumeIfFalseInstruction::SIZE,
OpCode::CompareEquals => CompareEqualsInstruction::SIZE,
OpCode::CompareBetween => CompareBetweenInstruction::SIZE,
OpCode::CompareIsDigit => CompareIsDigitInstruction::SIZE,
OpCode::CompareIsNotDigit => CompareIsNotDigitInstruction::SIZE,
OpCode::CompareIsWhitespace => CompareIsWhitespaceInstruction::SIZE,
OpCode::CompareIsNotWhitespace => CompareIsNotWhitespaceInstruction::SIZE,
OpCode::Lookaround => LookaroundInstruction::SIZE,
}
}
Expand Down Expand Up @@ -279,14 +255,6 @@ impl Instruction {
OpCode::ConsumeIfFalse => self.cast::<ConsumeIfFalseInstruction>().debug_print(),
OpCode::CompareEquals => self.cast::<CompareEqualsInstruction>().debug_print(),
OpCode::CompareBetween => self.cast::<CompareBetweenInstruction>().debug_print(),
OpCode::CompareIsDigit => self.cast::<CompareIsDigitInstruction>().debug_print(),
OpCode::CompareIsNotDigit => self.cast::<CompareIsNotDigitInstruction>().debug_print(),
OpCode::CompareIsWhitespace => {
self.cast::<CompareIsWhitespaceInstruction>().debug_print()
}
OpCode::CompareIsNotWhitespace => self
.cast::<CompareIsNotWhitespaceInstruction>()
.debug_print(),
OpCode::Lookaround => self.cast::<LookaroundInstruction>().debug_print(),
}
}
Expand Down Expand Up @@ -378,13 +346,6 @@ nullary_regexp_bytcode_instruction!(
);
nullary_regexp_bytcode_instruction!(ConsumeIfTrueInstruction, OpCode::ConsumeIfTrue);
nullary_regexp_bytcode_instruction!(ConsumeIfFalseInstruction, OpCode::ConsumeIfFalse);
nullary_regexp_bytcode_instruction!(CompareIsDigitInstruction, OpCode::CompareIsDigit);
nullary_regexp_bytcode_instruction!(CompareIsNotDigitInstruction, OpCode::CompareIsNotDigit);
nullary_regexp_bytcode_instruction!(CompareIsWhitespaceInstruction, OpCode::CompareIsWhitespace);
nullary_regexp_bytcode_instruction!(
CompareIsNotWhitespaceInstruction,
OpCode::CompareIsNotWhitespace
);

regexp_bytecode_instruction!(
LiteralInstruction,
Expand Down
45 changes: 6 additions & 39 deletions src/js/runtime/regexp/matcher.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
use crate::js::{
common::{
icu::ICU,
unicode::{
is_decimal_digit, is_newline, is_whitespace, try_encode_surrogate_pair, CodePoint,
},
unicode::{is_newline, try_encode_surrogate_pair, CodePoint},
},
parser::lexer_stream::{
HeapOneByteLexerStream, HeapTwoByteCodePointLexerStream, HeapTwoByteCodeUnitLexerStream,
Expand All @@ -22,12 +20,11 @@ use super::{
AssertEndInstruction, AssertEndOrNewlineInstruction, AssertNotWordBoundaryInstruction,
AssertStartInstruction, AssertStartOrNewlineInstruction, AssertWordBoundaryInstruction,
BackreferenceInstruction, BranchInstruction, ClearCaptureInstruction,
CompareBetweenInstruction, CompareEqualsInstruction, CompareIsDigitInstruction,
CompareIsNotDigitInstruction, CompareIsNotWhitespaceInstruction,
CompareIsWhitespaceInstruction, ConsumeIfFalseInstruction, ConsumeIfTrueInstruction,
Instruction, JumpInstruction, LiteralInstruction, LookaroundInstruction, LoopInstruction,
MarkCapturePointInstruction, ProgressInstruction, TInstruction, WildcardInstruction,
WildcardNoNewlineInstruction, WordBoundaryMoveToPreviousInstruction,
CompareBetweenInstruction, CompareEqualsInstruction, ConsumeIfFalseInstruction,
ConsumeIfTrueInstruction, Instruction, JumpInstruction, LiteralInstruction,
LookaroundInstruction, LoopInstruction, MarkCapturePointInstruction, ProgressInstruction,
TInstruction, WildcardInstruction, WildcardNoNewlineInstruction,
WordBoundaryMoveToPreviousInstruction,
},
};

Expand Down Expand Up @@ -439,36 +436,6 @@ impl<T: LexerStream> MatchEngine<T> {

self.advance_instruction::<CompareBetweenInstruction>();
}
OpCode::CompareIsDigit => {
if is_decimal_digit(self.string_lexer.current()) {
self.compare_register = true;
}

self.advance_instruction::<CompareIsDigitInstruction>();
}
OpCode::CompareIsNotDigit => {
if !is_decimal_digit(self.string_lexer.current()) {
self.compare_register = true;
}

self.advance_instruction::<CompareIsNotDigitInstruction>();
}
OpCode::CompareIsWhitespace => {
let current = self.string_lexer.current();
if is_whitespace(current) || is_newline(current) {
self.compare_register = true;
}

self.advance_instruction::<CompareIsWhitespaceInstruction>();
}
OpCode::CompareIsNotWhitespace => {
let current = self.string_lexer.current();
if !is_whitespace(current) && !is_newline(current) {
self.compare_register = true;
}

self.advance_instruction::<CompareIsNotWhitespaceInstruction>();
}
OpCode::Lookaround => {
let instr = instr.cast::<LookaroundInstruction>();
let is_ahead = instr.is_ahead();
Expand Down
Binary file added tests/js_regexp_bytecode/class/digit_class.exp
Binary file not shown.
11 changes: 11 additions & 0 deletions tests/js_regexp_bytecode/class/digit_class.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
/\d\D/;

// Shorthand as single element of class
/[\d][\D]/;

// Shorthand mixed in class
/[\da][\D4]/;
/[\da-z][\D4-6]/;

// Union is the entire unicode range
/[\d\D]/;
71 changes: 0 additions & 71 deletions tests/js_regexp_bytecode/class/shorthand.exp

This file was deleted.

Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
// Shorthand digit and whitespace classes have own instructions
/\d\D\s\S/;
/\s\S/;

// Shorthand as single element of class
/[\d][\D][\s][\S]/;
/[\s][\S]/;

// Shorthand mixed in class
/[\da][\Da][\sa][\Sa]/;
/[\da-z][\Da-z][\sa-z][\Sa-z]/
/[\sa][\S\x0B]/;
/[\sa-z][\S\x0A-\x0C]/;

// Union is the entire unicode range
/[\s\S]/;
Binary file modified tests/js_regexp_bytecode/class/word.exp
Binary file not shown.
5 changes: 4 additions & 1 deletion tests/js_regexp_bytecode/class/word.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,7 @@

// Shorthand mixed in class
/[\wa][\Wa]/;
/[\wa-z][\Wa-z]/;
/[\wa-z][\Wa-z]/;

// Union is the entire unicode range
/[\w\W]/;

0 comments on commit 72b163a

Please sign in to comment.