From 4117a434a215749f7119e6a284666b0f2473c709 Mon Sep 17 00:00:00 2001 From: Ron Buckton Date: Wed, 15 Nov 2023 16:19:36 -0500 Subject: [PATCH] Align proposal spec to latest ecma262 spec text --- spec.emu | 978 ++++++++++++++----------------------------------------- 1 file changed, 237 insertions(+), 741 deletions(-) diff --git a/spec.emu b/spec.emu index 9e49cbf..dae0133 100644 --- a/spec.emu +++ b/spec.emu @@ -30,31 +30,31 @@ contributors: Ron Buckton, Ecma International

The RegExp constructor applies the following grammar to the input pattern String. An error occurs if the grammar cannot interpret the String as an expansion of |Pattern|.

Syntax

- Pattern[UnicodeMode, N] :: - Disjunction[?UnicodeMode, ?N] + Pattern[UnicodeMode, UnicodeSetsMode, NamedCaptureGroups] :: + Disjunction[?UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] - Disjunction[UnicodeMode, N] :: - Alternative[?UnicodeMode, ?N] - Alternative[?UnicodeMode, ?N] `|` Disjunction[?UnicodeMode, ?N] + Disjunction[UnicodeMode, UnicodeSetsMode, NamedCaptureGroups] :: + Alternative[?UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] + Alternative[?UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] `|` Disjunction[?UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] - Alternative[UnicodeMode, N] :: + Alternative[UnicodeMode, UnicodeSetsMode, NamedCaptureGroups] :: [empty] - Alternative[?UnicodeMode, ?N] Term[?UnicodeMode, ?N] + Alternative[?UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] Term[?UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] - Term[UnicodeMode, N] :: - Assertion[?UnicodeMode, ?N] - Atom[?UnicodeMode, ?N] - Atom[?UnicodeMode, ?N] Quantifier + Term[UnicodeMode, UnicodeSetsMode, NamedCaptureGroups] :: + Assertion[?UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] + Atom[?UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] + Atom[?UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] Quantifier - Assertion[UnicodeMode, N] :: + Assertion[UnicodeMode, UnicodeSetsMode, NamedCaptureGroups] :: `^` `$` `\` `b` `\` `B` - `(` `?` `=` Disjunction[?UnicodeMode, ?N] `)` - `(` `?` `!` Disjunction[?UnicodeMode, ?N] `)` - `(` `?` `<=` Disjunction[?UnicodeMode, ?N] `)` - `(` `?` `<!` Disjunction[?UnicodeMode, ?N] `)` + `(` `?` `=` Disjunction[?UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] `)` + `(` `?` `!` Disjunction[?UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] `)` + `(` `?` `<=` Disjunction[?UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] `)` + `(` `?` `<!` Disjunction[?UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] `)` Quantifier :: QuantifierPrefix @@ -68,15 +68,15 @@ contributors: Ron Buckton, Ecma International 
`{` DecimalDigits[~Sep] `,` `}` `{` DecimalDigits[~Sep] `,` DecimalDigits[~Sep] `}` - Atom[UnicodeMode, N] :: + Atom[UnicodeMode, UnicodeSetsMode, NamedCaptureGroups] :: PatternCharacter `.` - `\` AtomEscape[?UnicodeMode, ?N] - CharacterClass[?UnicodeMode] - `(` GroupSpecifier[?UnicodeMode] Disjunction[?UnicodeMode, ?N] `)` - `(` `?` `:` Disjunction[?UnicodeMode, ?N] `)` - `(` `?` RegularExpressionFlags `:` Disjunction[?UnicodeMode, ?N] `)` - `(` `?` RegularExpressionFlags `-` RegularExpressionFlags `:` Disjunction[?UnicodeMode, ?N] `)` + `\` AtomEscape[?UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] + CharacterClass[?UnicodeMode, ?UnicodeSetsMode] + `(` GroupSpecifier[?UnicodeMode]? Disjunction[?UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] `)` + `(` `?` `:` Disjunction[?UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] `)` + `(` `?` RegularExpressionFlags `:` Disjunction[?UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] `)` + `(` `?` RegularExpressionFlags `-` RegularExpressionFlags `:` Disjunction[?UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] `)` SyntaxCharacter :: one of `^` `$` `\` `.` `*` `+` `?` `(` `)` `[` `]` `{` `}` `|` @@ -84,15 +84,15 @@ contributors: Ron Buckton, Ecma International PatternCharacter :: SourceCharacter but not SyntaxCharacter - AtomEscape[UnicodeMode, N] :: + AtomEscape[UnicodeMode, NamedCaptureGroups] :: DecimalEscape CharacterClassEscape[?UnicodeMode] CharacterEscape[?UnicodeMode] - [+N] `k` GroupName[?UnicodeMode] + [+NamedCaptureGroups] `k` GroupName[?UnicodeMode] CharacterEscape[UnicodeMode] :: ControlEscape - `c` ControlLetter + `c` AsciiLetter `0` [lookahead ∉ DecimalDigit] HexEscapeSequence RegExpUnicodeEscapeSequence[?UnicodeMode] @@ -101,12 +101,7 @@ contributors: Ron Buckton, Ecma International ControlEscape :: one of `f` `n` `r` `t` `v` - ControlLetter :: one of - `a` `b` `c` `d` `e` `f` `g` `h` `i` `j` `k` `l` `m` `n` `o` `p` `q` `r` `s` `t` `u` `v` `w` `x` `y` `z` - `A` `B` `C` `D` `E` `F` `G` `H` `I` 
`J` `K` `L` `M` `N` `O` `P` `Q` `R` `S` `T` `U` `V` `W` `X` `Y` `Z` - GroupSpecifier[UnicodeMode] :: - [empty] `?` GroupName[?UnicodeMode] GroupName[UnicodeMode] :: @@ -193,26 +188,27 @@ contributors: Ron Buckton, Ecma International DecimalDigit UnicodePropertyNameCharacter :: - ControlLetter + AsciiLetter `_` - CharacterClass[UnicodeMode] :: - `[` [lookahead != `^`] ClassRanges[?UnicodeMode] `]` - `[` `^` ClassRanges[?UnicodeMode] `]` + CharacterClass[UnicodeMode, UnicodeSetsMode] :: + `[` [lookahead != `^`] ClassContents[?UnicodeMode, ?UnicodeSetsMode] `]` + `[` `^` ClassContents[?UnicodeMode, ?UnicodeSetsMode] `]` - ClassRanges[UnicodeMode] :: + ClassContents[UnicodeMode, UnicodeSetsMode] :: [empty] - NonemptyClassRanges[?UnicodeMode] + [~UnicodeSetsMode] NonemptyClassRanges[?UnicodeMode] + [+UnicodeSetsMode] ClassSetExpression NonemptyClassRanges[UnicodeMode] :: ClassAtom[?UnicodeMode] ClassAtom[?UnicodeMode] NonemptyClassRangesNoDash[?UnicodeMode] - ClassAtom[?UnicodeMode] `-` ClassAtom[?UnicodeMode] ClassRanges[?UnicodeMode] + ClassAtom[?UnicodeMode] `-` ClassAtom[?UnicodeMode] ClassContents[?UnicodeMode, ~UnicodeSetsMode] NonemptyClassRangesNoDash[UnicodeMode] :: ClassAtom[?UnicodeMode] ClassAtomNoDash[?UnicodeMode] NonemptyClassRangesNoDash[?UnicodeMode] - ClassAtomNoDash[?UnicodeMode] `-` ClassAtom[?UnicodeMode] ClassRanges[?UnicodeMode] + ClassAtomNoDash[?UnicodeMode] `-` ClassAtom[?UnicodeMode] ClassContents[?UnicodeMode, ~UnicodeSetsMode] ClassAtom[UnicodeMode] :: `-` @@ -227,8 +223,72 @@ contributors: Ron Buckton, Ecma International [+UnicodeMode] `-` CharacterClassEscape[?UnicodeMode] CharacterEscape[?UnicodeMode] + + ClassSetExpression :: + ClassUnion + ClassIntersection + ClassSubtraction + + ClassUnion :: + ClassSetRange ClassUnion? + ClassSetOperand ClassUnion? 
+ + ClassIntersection :: + ClassSetOperand `&&` [lookahead ≠ `&`] ClassSetOperand + ClassIntersection `&&` [lookahead ≠ `&`] ClassSetOperand + + ClassSubtraction :: + ClassSetOperand `--` ClassSetOperand + ClassSubtraction `--` ClassSetOperand + + ClassSetRange :: + ClassSetCharacter `-` ClassSetCharacter + + ClassSetOperand :: + NestedClass + ClassStringDisjunction + ClassSetCharacter + + NestedClass :: + `[` [lookahead ≠ `^`] ClassContents[+UnicodeMode, +UnicodeSetsMode] `]` + `[^` ClassContents[+UnicodeMode, +UnicodeSetsMode] `]` + `\` CharacterClassEscape[+UnicodeMode] + + The first two lines here are equivalent to |CharacterClass|. + + + ClassStringDisjunction :: + `\q{` ClassStringDisjunctionContents `}` + ClassStringDisjunctionContents :: + ClassString + ClassString `|` ClassStringDisjunctionContents + + ClassString :: + [empty] + NonEmptyClassString + + NonEmptyClassString :: + ClassSetCharacter NonEmptyClassString? + + ClassSetCharacter :: + [lookahead ∉ ClassSetReservedDoublePunctuator] SourceCharacter but not ClassSetSyntaxCharacter + `\` CharacterEscape[+UnicodeMode] + `\` ClassSetReservedPunctuator + `\b` + + ClassSetReservedDoublePunctuator :: one of + `&&` `!!` `##` `$$` `%%` `**` `++` `,,` `..` `::` `;;` `<<` `==` `>>` `??` `@@` `^^` ```` `~~` + + ClassSetSyntaxCharacter :: one of + `(` `)` `[` `]` `{` `}` `/` `-` `\` `|` + + ClassSetReservedPunctuator :: one of + `&` `-` `!` `#` `%` `,` + `:` `;` `<` `=` `>` `@` + ``` `~` +

A number of productions in this section are given alternative definitions in section B.1.2.

@@ -236,54 +296,6 @@ contributors: Ron Buckton, Ecma International

Pattern Semantics

- -

Notation

-

The descriptions below use the following aliases:

-
    -
  • - _Input_ is a List whose elements are the characters of the String being matched by the regular expression pattern. Each character is either a code unit or a code point, depending upon the kind of pattern involved. The notation _Input_[_n_] means the _n_th character of _Input_, where _n_ can range between 0 (inclusive) and _InputLength_ (exclusive). -
  • -
  • - _InputLength_ is the number of characters in _Input_. -
  • -
  • - _NcapturingParens_ is the total number of left-capturing parentheses (i.e. the total number of Atom :: `(` GroupSpecifier Disjunction `)` Parse Nodes) in the pattern. A left-capturing parenthesis is any `(` pattern character that is matched by the `(` terminal of the Atom :: `(` GroupSpecifier Disjunction `)` production. -
  • -
  • - _DotAll_ is *true* if the RegExp object's [[OriginalFlags]] internal slot contains *"s"* and otherwise is *false*. -
  • -
  • - _IgnoreCase_ is *true* if the RegExp object's [[OriginalFlags]] internal slot contains *"i"* and otherwise is *false*. -
  • -
  • - _Multiline_ is *true* if the RegExp object's [[OriginalFlags]] internal slot contains *"m"* and otherwise is *false*. -
  • -
  • - _Unicode_ is *true* if the RegExp object's [[OriginalFlags]] internal slot contains *"u"* and otherwise is *false*. -
  • -
  • - _WordCharacters_ is the mathematical set that is the union of all sixty-three characters in *"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_"* (letters, numbers, and U+005F (LOW LINE) in the Unicode Basic Latin block) and all characters _c_ for which _c_ is not in that set but Canonicalize(_c_) is. _WordCharacters_ cannot contain more than sixty-three characters unless _Unicode_ and _IgnoreCase_ are both *true*. -
  • -
-

Furthermore, the descriptions below use the following internal data structures:

-
    -
  • - A CharSet is a mathematical set of characters. When the _Unicode_ flag is *true*, “all characters” means the CharSet containing all code point values; otherwise “all characters” means the CharSet containing all code unit values. -
  • -
  • - A State is an ordered pair (_endIndex_, _captures_) where _endIndex_ is an integer and _captures_ is a List of _NcapturingParens_ values. States are used to represent partial match states in the regular expression matching algorithms. The _endIndex_ is one plus the index of the last input character matched so far by the pattern, while _captures_ holds the results of capturing parentheses. The _n_th element of _captures_ is either a List of characters that represents the value obtained by the _n_th set of capturing parentheses or *undefined* if the _n_th set of capturing parentheses hasn't been reached yet. Due to backtracking, many States may be in use at any time during the matching process. -
  • -
  • - A MatchResult is either a State or the special token ~failure~ that indicates that the match failed. -
  • -
  • - A Continuation is an Abstract Closure that takes one State argument and returns a MatchResult result. The Continuation attempts to match the remaining portion (specified by the closure's captured values) of the pattern against _Input_, starting at the intermediate state given by its State argument. If the match succeeds, the Continuation returns the final State that it reached; if the match fails, the Continuation returns ~failure~. -
  • -
  • - A Matcher is an Abstract Closure that takes two arguments—a State and a Continuation—and returns a MatchResult result. A Matcher attempts to match a middle subpattern (specified by the closure's captured values) of the pattern against _Input_, starting at the intermediate state given by its State argument. The Continuation argument should be a closure that matches the rest of the pattern. After matching the subpattern of a pattern to obtain a new State, the Matcher then calls Continuation on that new State to test if the rest of the pattern can match as well. If it can, the Matcher returns the State returned by Continuation; if not, the Matcher may try different choices at its choice points, repeatedly calling Continuation until it either succeeds or all possibilities have been exhausted. -
  • -
-
@@ -300,310 +312,19 @@ contributors: Ron Buckton, Ecma International
  • It is a Syntax Error if any code point in the source text matched by the first |RegularExpressionFlags| is also contained in the source text matched by the second |RegularExpressionFlags|. - - -

    Modifiers Records

    -

    A Modifiers Record is a Record value used to encapsulate information about the regular expression flags that apply to a subpattern.

    -

    Modifiers Records have the fields listed in the table below.

    - - - - - - - - - - - - - - - - - - - - - - -
    Field Name | Value | Meaning
    [[DotAll]] | a Boolean | Indicates whether the *"s"* flag is currently enabled.
    [[IgnoreCase]] | a Boolean | Indicates whether the *"i"* flag is currently enabled.
    [[Multiline]] | a Boolean | Indicates whether the *"m"* flag is currently enabled.
    -
    -
    - -

    Runtime Semantics: CompilePattern

    -
    -
    description
    -
    It returns an Abstract Closure that takes a String and a non-negative integer and returns a MatchResult.
    -
    - Pattern :: Disjunction - - 1. Let _modifiers_ be the Modifiers Record { [[DotAll]]: _DotAll_, [[IgnoreCase]]: _IgnoreCase_, [[Multiline]]: _Multiline_ }. - 1. Let _m_ be CompileSubpattern of |Disjunction| with arguments ~forward~ and _modifiers_. - 1. Return a new Abstract Closure with parameters (_str_, _index_) that captures _m_ and performs the following steps when called: - 1. Assert: Type(_str_) is String. - 1. Assert: _index_ is a non-negative integer which is ≤ the length of _str_. - 1. If _Unicode_ is *true*, let _Input_ be StringToCodePoints(_str_). Otherwise, let _Input_ be a List whose elements are the code units that are the elements of _str_. _Input_ will be used throughout the algorithms in . Each element of _Input_ is considered to be a character. - 1. Let _InputLength_ be the number of characters contained in _Input_. This alias will be used throughout the algorithms in . - 1. Let _listIndex_ be the index into _Input_ of the character that was obtained from element _index_ of _str_. - 1. Let _c_ be a new Continuation with parameters (_y_) that captures nothing and performs the following steps when called: - 1. Assert: _y_ is a State. - 1. Return _y_. - 1. Let _cap_ be a List of _NcapturingParens_ *undefined* values, indexed 1 through _NcapturingParens_. - 1. Let _x_ be the State (_listIndex_, _cap_). - 1. Return _m_(_x_, _c_). - - -

    A Pattern compiles to an Abstract Closure value. RegExpBuiltinExec can then apply this procedure to a String and an offset within the String to determine whether the pattern would match starting at exactly that offset within the String, and, if it does match, what the values of the capturing parentheses would be. The algorithms in the Pattern Semantics section are designed so that compiling a pattern may throw a *SyntaxError* exception; on the other hand, once the pattern is successfully compiled, applying the resulting Abstract Closure to find a match in a String cannot throw an exception (except for any implementation-defined exceptions that can occur anywhere such as out-of-memory).

    -
    -
    - - -

    - Runtime Semantics: CompileSubpattern ( - _direction_: ~forward~ or ~backward~, - _modifiers_: a Modifiers Record, - ): a Matcher -

    -
    -
    - -

    This section is amended in B.1.2.4.

    -
    - - - Disjunction :: Alternative `|` Disjunction - - 1. Let _m1_ be CompileSubpattern of |Alternative| with arguments _direction_ and _modifiers_. - 1. Let _m2_ be CompileSubpattern of |Disjunction| with arguments _direction_ and _modifiers_. - 1. Return a new Matcher with parameters (_x_, _c_) that captures _m1_ and _m2_ and performs the following steps when called: - 1. Assert: _x_ is a State. - 1. Assert: _c_ is a Continuation. - 1. Let _r_ be _m1_(_x_, _c_). - 1. If _r_ is not ~failure~, return _r_. - 1. Return _m2_(_x_, _c_). - - -

    The `|` regular expression operator separates two alternatives. The pattern first tries to match the left |Alternative| (followed by the sequel of the regular expression); if it fails, it tries to match the right |Disjunction| (followed by the sequel of the regular expression). If the left |Alternative|, the right |Disjunction|, and the sequel all have choice points, all choices in the sequel are tried before moving on to the next choice in the left |Alternative|. If choices in the left |Alternative| are exhausted, the right |Disjunction| is tried instead of the left |Alternative|. Any capturing parentheses inside a portion of the pattern skipped by `|` produce *undefined* values instead of Strings. Thus, for example,

    -
    /a|ab/.exec("abc")
    -

    returns the result *"a"* and not *"ab"*. Moreover,

    -
    /((a)|(ab))((c)|(bc))/.exec("abc")
    -

    returns the array

    -
    ["abc", "a", "a", undefined, "bc", undefined, "bc"]
    -

    and not

    -
    ["abc", "ab", undefined, "ab", "c", "c", undefined]
    -

    The order in which the two alternatives are tried is independent of the value of _direction_.

    -
    - - - Alternative :: [empty] - - 1. Return a new Matcher with parameters (_x_, _c_) that captures nothing and performs the following steps when called: - 1. Assert: _x_ is a State. - 1. Assert: _c_ is a Continuation. - 1. Return _c_(_x_). - - Alternative :: Alternative Term - - 1. Let _m1_ be CompileSubpattern of |Alternative| with arguments _direction_ and _modifiers_. - 1. Let _m2_ be CompileSubpattern of |Term| with arguments _direction_ and _modifiers_. - 1. If _direction_ is ~forward~, then - 1. Let _m_ be a new Matcher with parameters (_x_, _c_) that captures _m1_ and _m2_ and performs the following steps when called: - 1. Assert: _x_ is a State. - 1. Assert: _c_ is a Continuation. - 1. Let _d_ be a new Continuation with parameters (_y_) that captures _c_ and _m2_ and performs the following steps when called: - 1. Assert: _y_ is a State. - 1. Return _m2_(_y_, _c_). - 1. Return _m1_(_x_, _d_). - 1. Else, - 1. Assert: _direction_ is ~backward~. - 1. Let _m_ be a new Matcher with parameters (_x_, _c_) that captures _m1_ and _m2_ and performs the following steps when called: - 1. Assert: _x_ is a State. - 1. Assert: _c_ is a Continuation. - 1. Let _d_ be a new Continuation with parameters (_y_) that captures _c_ and _m1_ and performs the following steps when called: - 1. Assert: _y_ is a State. - 1. Return _m1_(_y_, _c_). - 1. Return _m2_(_x_, _d_). - - -

    Consecutive |Term|s try to simultaneously match consecutive portions of _Input_. When _direction_ is ~forward~, if the left |Alternative|, the right |Term|, and the sequel of the regular expression all have choice points, all choices in the sequel are tried before moving on to the next choice in the right |Term|, and all choices in the right |Term| are tried before moving on to the next choice in the left |Alternative|. When _direction_ is ~backward~, the evaluation order of |Alternative| and |Term| are reversed.

    -
    - - - Term :: Assertion - - 1. Return CompileAssertion of |Assertion| with argument _modifiers_. - - -

    The resulting Matcher is independent of _direction_.

    -
    - Term :: Atom - - 1. Return CompileAtom of |Atom| with arguments _direction_ and _modifiers_. - - Term :: Atom Quantifier - - 1. Let _m_ be CompileAtom of |Atom| with arguments _direction_ and _modifiers_. - 1. Let _q_ be CompileQuantifier of |Quantifier|. - 1. Assert: _q_.[[Min]] ≤ _q_.[[Max]]. - 1. Let _parenIndex_ be the number of left-capturing parentheses in the entire regular expression that occur to the left of this |Term|. This is the total number of Atom :: `(` GroupSpecifier Disjunction `)` Parse Nodes prior to or enclosing this |Term|. - 1. Let _parenCount_ be the number of left-capturing parentheses in |Atom|. This is the total number of Atom :: `(` GroupSpecifier Disjunction `)` Parse Nodes enclosed by |Atom|. - 1. Return a new Matcher with parameters (_x_, _c_) that captures _m_, _q_, _parenIndex_, and _parenCount_ and performs the following steps when called: - 1. Assert: _x_ is a State. - 1. Assert: _c_ is a Continuation. - 1. Return RepeatMatcher(_m_, _q_.[[Min]], _q_.[[Max]], _q_.[[Greedy]], _x_, _c_, _parenIndex_, _parenCount_). - -
    - - -

    - Runtime Semantics: CompileAssertion ( - _modifiers_: a Modifiers Record, - ): a Matcher -

    -
    -
    - -

    This section is amended in B.1.2.5.

    -
    - Assertion :: `^` - - 1. Return a new Matcher with parameters (_x_, _c_) that captures nothing and performs the following steps when called: - 1. Assert: _x_ is a State. - 1. Assert: _c_ is a Continuation. - 1. Let _e_ be _x_'s _endIndex_. - 1. If _e_ = 0, or if _Multiline__modifiers_.[[Multiline]] is *true* and the character _Input_[_e_ - 1] is one of |LineTerminator|, then - 1. Return _c_(_x_). - 1. Return ~failure~. - - -

    Even when the `y` flag is used with a pattern, `^` always matches only at the beginning of _Input_, or (if _modifiers_.[[Multiline]] is *true*) at the beginning of a line.

    -
    - Assertion :: `$` - - 1. Return a new Matcher with parameters (_x_, _c_) that captures nothing and performs the following steps when called: - 1. Assert: _x_ is a State. - 1. Assert: _c_ is a Continuation. - 1. Let _e_ be _x_'s _endIndex_. - 1. If _e_ = _InputLength_, or if _Multiline__modifiers_.[[Multiline]] is *true* and the character _Input_[_e_] is one of |LineTerminator|, then - 1. Return _c_(_x_). - 1. Return ~failure~. - - Assertion :: `\` `b` - - 1. Return a new Matcher with parameters (_x_, _c_) that captures nothing and performs the following steps when called: - 1. Assert: _x_ is a State. - 1. Assert: _c_ is a Continuation. - 1. Let _e_ be _x_'s _endIndex_. - 1. Let _a_ be IsWordChar(_e_ - 1, _modifiers_). - 1. Let _b_ be IsWordChar(_e_, _modifiers_). - 1. If _a_ is *true* and _b_ is *false*, or if _a_ is *false* and _b_ is *true*, return _c_(_x_). - 1. Return ~failure~. - - Assertion :: `\` `B` - - 1. Return a new Matcher with parameters (_x_, _c_) that captures nothing and performs the following steps when called: - 1. Assert: _x_ is a State. - 1. Assert: _c_ is a Continuation. - 1. Let _e_ be _x_'s _endIndex_. - 1. Let _a_ be IsWordChar(_e_ - 1, _modifiers_). - 1. Let _b_ be IsWordChar(_e_, _modifiers_). - 1. If _a_ is *true* and _b_ is *true*, or if _a_ is *false* and _b_ is *false*, return _c_(_x_). - 1. Return ~failure~. - - Assertion :: `(` `?` `=` Disjunction `)` - - 1. Let _m_ be CompileSubpattern of |Disjunction| with arguments ~forward~ and _modifiers_. - 1. Return a new Matcher with parameters (_x_, _c_) that captures _m_ and performs the following steps when called: - 1. Assert: _x_ is a State. - 1. Assert: _c_ is a Continuation. - 1. Let _d_ be a new Continuation with parameters (_y_) that captures nothing and performs the following steps when called: - 1. Assert: _y_ is a State. - 1. Return _y_. - 1. Let _r_ be _m_(_x_, _d_). - 1. If _r_ is ~failure~, return ~failure~. - 1. Let _y_ be _r_'s State. - 1. 
Let _cap_ be _y_'s _captures_ List. - 1. Let _xe_ be _x_'s _endIndex_. - 1. Let _z_ be the State (_xe_, _cap_). - 1. Return _c_(_z_). - - Assertion :: `(` `?` `!` Disjunction `)` - - 1. Let _m_ be CompileSubpattern of |Disjunction| with arguments ~forward~ and _modifiers_. - 1. Return a new Matcher with parameters (_x_, _c_) that captures _m_ and performs the following steps when called: - 1. Assert: _x_ is a State. - 1. Assert: _c_ is a Continuation. - 1. Let _d_ be a new Continuation with parameters (_y_) that captures nothing and performs the following steps when called: - 1. Assert: _y_ is a State. - 1. Return _y_. - 1. Let _r_ be _m_(_x_, _d_). - 1. If _r_ is not ~failure~, return ~failure~. - 1. Return _c_(_x_). - - Assertion :: `(` `?` `<=` Disjunction `)` - - 1. Let _m_ be CompileSubpattern of |Disjunction| with arguments ~backward~ and _modifiers_. - 1. Return a new Matcher with parameters (_x_, _c_) that captures _m_ and performs the following steps when called: - 1. Assert: _x_ is a State. - 1. Assert: _c_ is a Continuation. - 1. Let _d_ be a new Continuation with parameters (_y_) that captures nothing and performs the following steps when called: - 1. Assert: _y_ is a State. - 1. Return _y_. - 1. Let _r_ be _m_(_x_, _d_). - 1. If _r_ is ~failure~, return ~failure~. - 1. Let _y_ be _r_'s State. - 1. Let _cap_ be _y_'s _captures_ List. - 1. Let _xe_ be _x_'s _endIndex_. - 1. Let _z_ be the State (_xe_, _cap_). - 1. Return _c_(_z_). - - Assertion :: `(` `?` `<!` Disjunction `)` - - 1. Let _m_ be CompileSubpattern of |Disjunction| with arguments ~backward~ and _modifiers_. - 1. Return a new Matcher with parameters (_x_, _c_) that captures _m_ and performs the following steps when called: - 1. Assert: _x_ is a State. - 1. Assert: _c_ is a Continuation. - 1. Let _d_ be a new Continuation with parameters (_y_) that captures nothing and performs the following steps when called: - 1. Assert: _y_ is a State. - 1. Return _y_. - 1. Let _r_ be _m_(_x_, _d_). - 1. 
If _r_ is not ~failure~, return ~failure~. - 1. Return _c_(_x_). - - - -

    - IsWordChar ( - _e_: an integer, - _modifiers_: a Modifiers Record, - ) -

    -
    -
    - - 1. If _e_ = -1 or _e_ is _InputLength_, return *false*. - 1. Let _c_ be the character _Input_[_e_]. - 1. Let _wordCharacters_ be GetWordCharacters(_modifiers_). - 1. If _c_ is in _wordCharacters_, return *true*. - 1. Return *false*. -
    -
    -

    Runtime Semantics: CompileAtom ( + _rer_: a RegExp Record, _direction_: ~forward~ or ~backward~, - _modifiers_: a Modifiers Record, ): a Matcher

    -

    This section is amended in B.1.2.6.

    +

    This section is amended in Annex B.1.2.

    @@ -611,49 +332,71 @@ contributors: Ron Buckton, Ecma International 1. Let _ch_ be the character matched by |PatternCharacter|. 1. Let _A_ be a one-element CharSet containing the character _ch_. - 1. Return CharacterSetMatcher(_A_, *false*, _direction_, _modifiers_). + 1. Return CharacterSetMatcher(_rer_, _A_, *false*, _direction_). Atom :: `.` - 1. Let _A_ be the CharSet of all characters. - 1. If _DotAll__modifiers_.[[DotAll]] is not *true*, then + 1. Let _A_ be AllCharacters(_rer_). + 1. If _rer_.[[DotAll]] is not *true*, then 1. Remove from _A_ all characters corresponding to a code point on the right-hand side of the |LineTerminator| production. - 1. Return CharacterSetMatcher(_A_, *false*, _direction_, _modifiers_). + 1. Return CharacterSetMatcher(_rer_, _A_, *false*, _direction_). Atom :: CharacterClass - 1. Let _cc_ be CompileCharacterClass of |CharacterClass|. - 1. Return CharacterSetMatcher(_cc_.[[CharSet]], _cc_.[[Invert]], _direction_, _modifiers_). - - Atom :: `(` GroupSpecifier Disjunction `)` - - 1. Let _m_ be CompileSubpattern of |Disjunction| with arguments _direction_ and _modifiers_. - 1. Let _parenIndex_ be the number of left-capturing parentheses in the entire regular expression that occur to the left of this |Atom|. This is the total number of Atom :: `(` GroupSpecifier Disjunction `)` Parse Nodes prior to or enclosing this |Atom|. + 1. Let _cc_ be CompileCharacterClass of |CharacterClass| with argument _rer_. + 1. Let _cs_ be _cc_.[[CharSet]]. + 1. If _rer_.[[UnicodeSets]] is *false*, or if every CharSetElement of _cs_ consists of a single character (including if _cs_ is empty), return CharacterSetMatcher(_rer_, _cs_, _cc_.[[Invert]], _direction_). + 1. Assert: _cc_.[[Invert]] is *false*. + 1. Let _lm_ be an empty List of Matchers. + 1. For each CharSetElement _s_ in _cs_ containing more than 1 character, iterating in descending order of length, do + 1. Let _cs2_ be a one-element CharSet containing the last code point of _s_. + 1. 
Let _m2_ be CharacterSetMatcher(_rer_, _cs2_, *false*, _direction_). + 1. For each code point _c1_ in _s_, iterating backwards from its second-to-last code point, do + 1. Let _cs1_ be a one-element CharSet containing _c1_. + 1. Let _m1_ be CharacterSetMatcher(_rer_, _cs1_, *false*, _direction_). + 1. Set _m2_ to MatchSequence(_m1_, _m2_, _direction_). + 1. Append _m2_ to _lm_. + 1. Let _singles_ be the CharSet containing every CharSetElement of _cs_ that consists of a single character. + 1. Append CharacterSetMatcher(_rer_, _singles_, *false*, _direction_) to _lm_. + 1. If _cs_ contains the empty sequence of characters, append EmptyMatcher() to _lm_. + 1. Let _m2_ be the last Matcher in _lm_. + 1. For each Matcher _m1_ of _lm_, iterating backwards from its second-to-last element, do + 1. Set _m2_ to MatchTwoAlternatives(_m1_, _m2_). + 1. Return _m2_. + + Atom :: `(` GroupSpecifier? Disjunction `)` + + 1. Let _m_ be CompileSubpattern of |Disjunction| with arguments _rer_ and _direction_. + 1. Let _parenIndex_ be CountLeftCapturingParensBefore(|Atom|). 1. Return a new Matcher with parameters (_x_, _c_) that captures _direction_, _m_, and _parenIndex_ and performs the following steps when called: - 1. Assert: _x_ is a State. - 1. Assert: _c_ is a Continuation. - 1. Let _d_ be a new Continuation with parameters (_y_) that captures _x_, _c_, _direction_, and _parenIndex_ and performs the following steps when called: - 1. Assert: _y_ is a State. - 1. Let _cap_ be a copy of _y_'s _captures_ List. - 1. Let _xe_ be _x_'s _endIndex_. - 1. Let _ye_ be _y_'s _endIndex_. + 1. Assert: _x_ is a MatchState. + 1. Assert: _c_ is a MatcherContinuation. + 1. Let _d_ be a new MatcherContinuation with parameters (_y_) that captures _x_, _c_, _direction_, and _parenIndex_ and performs the following steps when called: + 1. Assert: _y_ is a MatchState. + 1. Let _cap_ be a copy of _y_.[[Captures]]. + 1. Let _Input_ be _x_.[[Input]]. + 1. Let _xe_ be _x_.[[EndIndex]]. + 1. 
Let _ye_ be _y_.[[EndIndex]]. 1. If _direction_ is ~forward~, then - 1. Assert: _xe_ ≤ _ye_. - 1. Let _s_ be a List whose elements are the characters of _Input_ at indices _xe_ (inclusive) through _ye_ (exclusive). + 1. Assert: _xe_ ≤ _ye_. + 1. Let _r_ be the CaptureRange { [[StartIndex]]: _xe_, [[EndIndex]]: _ye_ }. 1. Else, 1. Assert: _direction_ is ~backward~. - 1. Assert: _ye_ ≤ _xe_. - 1. Let _s_ be a List whose elements are the characters of _Input_ at indices _ye_ (inclusive) through _xe_ (exclusive). - 1. Set _cap_[_parenIndex_ + 1] to _s_. - 1. Let _z_ be the State (_ye_, _cap_). + 1. Assert: _ye_ ≤ _xe_. + 1. Let _r_ be the CaptureRange { [[StartIndex]]: _ye_, [[EndIndex]]: _xe_ }. + 1. Set _cap_[_parenIndex_ + 1] to _r_. + 1. Let _z_ be the MatchState { [[Input]]: _Input_, [[EndIndex]]: _ye_, [[Captures]]: _cap_ }. 1. Return _c_(_z_). 1. Return _m_(_x_, _d_). + +

    Parentheses of the form `(` |Disjunction| `)` serve both to group the components of the |Disjunction| pattern together and to save the result of the match. The result can be used either in a backreference (`\\` followed by a non-zero decimal number), referenced in a replace String, or returned as part of an array from the regular expression matching Abstract Closure. To inhibit the capturing behaviour of parentheses, use the form `(?:` |Disjunction| `)` instead.

    +
    - Atom :: `(` `?` `:` Disjunction `)` + Atom :: `(?:` Disjunction `)` - 1. Return CompileSubpattern of |Disjunction| with arguments _direction_ and _modifiers_. + 1. Return CompileSubpattern of |Disjunction| with arguments _rer_ and _direction_. @@ -662,15 +405,15 @@ contributors: Ron Buckton, Ecma International 1. Let _addModifiers_ be the source text matched by |RegularExpressionFlags|. 1. Let _removeModifiers_ be the empty String. - 1. Let _newModifiers_ be UpdateModifiers(_modifiers_, CodePointsToString(_addModifiers_), _removeModifiers_). - 1. Return CompileSubpattern of |Disjunction| with arguments _direction_ and _newModifiers_. + 1. Let _modifiedRer_ be UpdateModifiers(_rer_, CodePointsToString(_addModifiers_), _removeModifiers_). + 1. Return CompileSubpattern of |Disjunction| with arguments _modifiedRer_ and _direction_. Atom :: `(` `?` RegularExpressionFlags `-` RegularExpressionFlags `:` Disjunction `)` 1. Let _addModifiers_ be the source text matched by the first |RegularExpressionFlags|. 1. Let _removeModifiers_ be the source text matched by the second |RegularExpressionFlags|. - 1. Let _newModifiers_ be UpdateModifiers(_modifiers_, CodePointsToString(_addModifiers_), CodePointsToString(_removeModifiers_)). - 1. Return CompileSubpattern of |Disjunction| with arguments _direction_ and _newModifiers_. + 1. Let _modifiedRer_ be UpdateModifiers(_rer_, CodePointsToString(_addModifiers_), CodePointsToString(_removeModifiers_)). + 1. Return CompileSubpattern of |Disjunction| with arguments _modifiedRer_ and _direction_. @@ -678,324 +421,77 @@ contributors: Ron Buckton, Ecma International AtomEscape :: DecimalEscape 1. Let _n_ be the CapturingGroupNumber of |DecimalEscape|. - 1. Assert: _n_ ≤ _NcapturingParens_. - 1. Return BackreferenceMatcher(_n_, _direction_, _modifiers_). + 1. Assert: _n_ ≤ _rer_.[[CapturingGroupsCount]]. + 1. Return BackreferenceMatcher(_rer_, _n_, _direction_). -

    An escape sequence of the form `\\` followed by a non-zero decimal number _n_ matches the result of the _n_th set of capturing parentheses (<emu-xref href="#sec-pattern-notation"></emu-xref>). It is an error if the regular expression has fewer than _n_ capturing parentheses. If the regular expression has _n_ or more capturing parentheses but the _n_th one is *undefined* because it has not captured anything, then the backreference always succeeds.

    +

    An escape sequence of the form `\\` followed by a non-zero decimal number _n_ matches the result of the _n_th set of capturing parentheses (<emu-xref href="#sec-pattern-notation"></emu-xref>). It is an error if the regular expression has fewer than _n_ capturing parentheses. If the regular expression has _n_ or more capturing parentheses but the _n_th one is *undefined* because it has not captured anything, then the backreference always succeeds.

    AtomEscape :: CharacterEscape 1. Let _cv_ be the CharacterValue of |CharacterEscape|. 1. Let _ch_ be the character whose character value is _cv_. 1. Let _A_ be a one-element CharSet containing the character _ch_. - 1. Return CharacterSetMatcher(_A_, *false*, _direction_, _modifiers_). + 1. Return CharacterSetMatcher(_rer_, _A_, *false*, _direction_). AtomEscape :: CharacterClassEscape - 1. Let _A_ be CompileToCharSet of |CharacterClassEscape|. - 1. Return CharacterSetMatcher(_A_, *false*, _direction_, _modifiers_). + 1. Let _cs_ be CompileToCharSet of |CharacterClassEscape| with argument _rer_. + 1. If _rer_.[[UnicodeSets]] is *false*, or if every CharSetElement of _cs_ consists of a single character (including if _cs_ is empty), return CharacterSetMatcher(_rer_, _cs_, *false*, _direction_). + 1. Let _lm_ be an empty List of Matchers. + 1. For each CharSetElement _s_ in _cs_ containing more than 1 character, iterating in descending order of length, do + 1. Let _cs2_ be a one-element CharSet containing the last code point of _s_. + 1. Let _m2_ be CharacterSetMatcher(_rer_, _cs2_, *false*, _direction_). + 1. For each code point _c1_ in _s_, iterating backwards from its second-to-last code point, do + 1. Let _cs1_ be a one-element CharSet containing _c1_. + 1. Let _m1_ be CharacterSetMatcher(_rer_, _cs1_, *false*, _direction_). + 1. Set _m2_ to MatchSequence(_m1_, _m2_, _direction_). + 1. Append _m2_ to _lm_. + 1. Let _singles_ be the CharSet containing every CharSetElement of _cs_ that consists of a single character. + 1. Append CharacterSetMatcher(_rer_, _singles_, *false*, _direction_) to _lm_. + 1. If _cs_ contains the empty sequence of characters, append EmptyMatcher() to _lm_. + 1. Let _m2_ be the last Matcher in _lm_. + 1. For each Matcher _m1_ of _lm_, iterating backwards from its second-to-last element, do + 1. Set _m2_ to MatchTwoAlternatives(_m1_, _m2_). + 1. Return _m2_. AtomEscape :: `k` GroupName - 1. 
Search the enclosing |Pattern| for an instance of a |GroupSpecifier| containing a |RegExpIdentifierName| which has a CapturingGroupName equal to the CapturingGroupName of the |RegExpIdentifierName| contained in |GroupName|. - 1. Assert: A unique such |GroupSpecifier| is found. - 1. Let _parenIndex_ be the number of left-capturing parentheses in the entire regular expression that occur to the left of the located |GroupSpecifier|. This is the total number of Atom :: `(` GroupSpecifier Disjunction `)` Parse Nodes prior to or enclosing the located |GroupSpecifier|, including its immediately enclosing |Atom|. - 1. Return BackreferenceMatcher(_parenIndex_, _direction_, _modifiers_). - - - -

    - CharacterSetMatcher ( - _A_: a CharSet, - _invert_: a Boolean, - _direction_: ~forward~ or ~backward~, - _modifiers_: a Modifiers Record, - ): a Matcher -

    -
    -
    - - 1. Return a new Matcher with parameters (_x_, _c_) that captures _A_, _invert_, and _direction_ and performs the following steps when called: - 1. Assert: _x_ is a State. - 1. Assert: _c_ is a Continuation. - 1. Let _e_ be _x_'s _endIndex_. - 1. If _direction_ is ~forward~, let _f_ be _e_ + 1. - 1. Else, let _f_ be _e_ - 1. - 1. If _f_ < 0 or _f_ > _InputLength_, return ~failure~. - 1. Let _index_ be min(_e_, _f_). - 1. Let _ch_ be the character _Input_[_index_]. - 1. Let _cc_ be Canonicalize(_ch_, _modifiers_). - 1. If there exists a member _a_ of _A_ such that Canonicalize(_a_, _modifiers_) is _cc_, let _found_ be *true*. Otherwise, let _found_ be *false*. - 1. If _invert_ is *false* and _found_ is *false*, return ~failure~. - 1. If _invert_ is *true* and _found_ is *true*, return ~failure~. - 1. Let _cap_ be _x_'s _captures_ List. - 1. Let _y_ be the State (_f_, _cap_). - 1. Return _c_(_y_). - -
    - - -

    - BackreferenceMatcher ( - _n_: a positive integer, - _direction_: ~forward~ or ~backward~, - _modifiers_: a Modifiers Record, - ): a Matcher -

    -
    -
    - - 1. Assert: _n_ ≥ 1. - 1. Return a new Matcher with parameters (_x_, _c_) that captures _n_ and _direction_ and performs the following steps when called: - 1. Assert: _x_ is a State. - 1. Assert: _c_ is a Continuation. - 1. Let _cap_ be _x_'s _captures_ List. - 1. Let _s_ be _cap_[_n_]. - 1. If _s_ is *undefined*, return _c_(_x_). - 1. Let _e_ be _x_'s _endIndex_. - 1. Let _len_ be the number of elements in _s_. - 1. If _direction_ is ~forward~, let _f_ be _e_ + _len_. - 1. Else, let _f_ be _e_ - _len_. - 1. If _f_ < 0 or _f_ > _InputLength_, return ~failure~. - 1. Let _g_ be min(_e_, _f_). - 1. If there exists an integer _i_ between 0 (inclusive) and _len_ (exclusive) such that Canonicalize(_s_[_i_], _modifiers_) is not the same character value as Canonicalize(_Input_[_g_ + _i_], _modifiers_), return ~failure~. - 1. Let _y_ be the State (_f_, _cap_). - 1. Return _c_(_y_). - -
    - - -

    - Canonicalize ( - _ch_: a character, - _modifiers_: a Modifiers Record, - ): a Matcher -

    -
    -
    - - 1. If _Unicode_ is *true* and _IgnoreCase__modifiers_.[[IgnoreCase]] is *true*, then - 1. If the file CaseFolding.txt of the Unicode Character Database provides a simple or common case folding mapping for _ch_, return the result of applying that mapping to _ch_. - 1. Return _ch_. - 1. If _IgnoreCase__modifiers_.[[IgnoreCase]] is *false*, return _ch_. - 1. Assert: _ch_ is a UTF-16 code unit. - 1. Let _cp_ be the code point whose numeric value is that of _ch_. - 1. Let _u_ be the result of toUppercase(« _cp_ »), according to the Unicode Default Case Conversion algorithm. - 1. Let _uStr_ be CodePointsToString(_u_). - 1. If _uStr_ does not consist of a single code unit, return _ch_. - 1. Let _cu_ be _uStr_'s single code unit element. - 1. If the numeric value of _ch_ ≥ 128 and the numeric value of _cu_ < 128, return _ch_. - 1. Return _cu_. - - -

    Parentheses of the form `(` |Disjunction| `)` serve both to group the components of the |Disjunction| pattern together and to save the result of the match. The result can be used either in a backreference (`\\` followed by a non-zero decimal number), referenced in a replace String, or returned as part of an array from the regular expression matching Abstract Closure. To inhibit the capturing behaviour of parentheses, use the form `(?:` |Disjunction| `)` instead.

    -
    - -

    The form `(?=` |Disjunction| `)` specifies a zero-width positive lookahead. In order for it to succeed, the pattern inside |Disjunction| must match at the current position, but the current position is not advanced before matching the sequel. If |Disjunction| can match at the current position in several ways, only the first one is tried. Unlike other regular expression operators, there is no backtracking into a `(?=` form (this unusual behaviour is inherited from Perl). This only matters when the |Disjunction| contains capturing parentheses and the sequel of the pattern contains backreferences to those captures.

    -

    For example,

    -
    /(?=(a+))/.exec("baaabac")
    -

    matches the empty String immediately after the first `b` and therefore returns the array:

    -
    ["", "aaa"]
    -

    To illustrate the lack of backtracking into the lookahead, consider:

    -
    /(?=(a+))a*b\1/.exec("baaabac")
    -

    This expression returns

    -
    ["aba", "a"]
    -

    and not:

    -
    ["aaaba", "a"]
    -
    - -

    The form `(?!` |Disjunction| `)` specifies a zero-width negative lookahead. In order for it to succeed, the pattern inside |Disjunction| must fail to match at the current position. The current position is not advanced before matching the sequel. |Disjunction| can contain capturing parentheses, but backreferences to them only make sense from within |Disjunction| itself. Backreferences to these capturing parentheses from elsewhere in the pattern always return *undefined* because the negative lookahead must fail for the pattern to succeed. For example,

    -
    /(.*?)a(?!(a+)b\2c)\2(.*)/.exec("baaabaac")
    -

    looks for an `a` not immediately followed by some positive number _n_ of `a`'s, a `b`, another _n_ `a`'s (specified by the first `\\2`) and a `c`. The second `\\2` is outside the negative lookahead, so it matches against *undefined* and therefore always succeeds. The whole expression returns the array:

    -
    ["baaabaac", "ba", undefined, "abaac"]
    -
    - -

    In case-insignificant matches when _Unicode_ is *true*, all characters are implicitly case-folded using the simple mapping provided by the Unicode standard immediately before they are compared. The simple mapping always maps to a single code point, so it does not map, for example, `ß` (U+00DF) to `SS`. It may however map a code point outside the Basic Latin range to a character within, for example, `ſ` (U+017F) to `s`. Such characters are not mapped if _Unicode_ is *false*. This prevents Unicode code points such as U+017F and U+212A from matching regular expressions such as `/[a-z]/i`, but they will match `/[a-z]/ui`.

    -
    -
    -
    - - -

    Runtime Semantics: CompileToCharSet ( ): a CharSet

    -
    -
    - -

    This section is amended in <emu-xref href="#sec-compiletocharset-annexb"></emu-xref>.

    -
    - - - ClassRanges :: [empty] - - 1. Return the empty CharSet. - - - - NonemptyClassRanges :: ClassAtom NonemptyClassRangesNoDash - - 1. Let _A_ be CompileToCharSet of |ClassAtom|. - 1. Let _B_ be CompileToCharSet of |NonemptyClassRangesNoDash|. - 1. Return the union of CharSets _A_ and _B_. - - NonemptyClassRanges :: ClassAtom `-` ClassAtom ClassRanges - - 1. Let _A_ be CompileToCharSet of the first |ClassAtom|. - 1. Let _B_ be CompileToCharSet of the second |ClassAtom|. - 1. Let _C_ be CompileToCharSet of |ClassRanges|. - 1. Let _D_ be CharacterRange(_A_, _B_). - 1. Return the union of _D_ and _C_. - - - - NonemptyClassRangesNoDash :: ClassAtomNoDash NonemptyClassRangesNoDash - - 1. Let _A_ be CompileToCharSet of |ClassAtomNoDash|. - 1. Let _B_ be CompileToCharSet of |NonemptyClassRangesNoDash|. - 1. Return the union of CharSets _A_ and _B_. - - NonemptyClassRangesNoDash :: ClassAtomNoDash `-` ClassAtom ClassRanges - - 1. Let _A_ be CompileToCharSet of |ClassAtomNoDash|. - 1. Let _B_ be CompileToCharSet of |ClassAtom|. - 1. Let _C_ be CompileToCharSet of |ClassRanges|. - 1. Let _D_ be CharacterRange(_A_, _B_). - 1. Return the union of _D_ and _C_. - - -

    |ClassRanges| can expand into a single |ClassAtom| and/or ranges of two |ClassAtom| separated by dashes. In the latter case the |ClassRanges| includes all characters between the first |ClassAtom| and the second |ClassAtom|, inclusive; an error occurs if either |ClassAtom| does not represent a single character (for example, if one is `\\w`) or if the first |ClassAtom|'s character value is greater than the second |ClassAtom|'s character value.

    -
    - -

    Even if the pattern ignores case, the case of the two ends of a range is significant in determining which characters belong to the range. Thus, for example, the pattern `/[E-F]/i` matches only the letters `E`, `F`, `e`, and `f`, while the pattern `/[E-f]/i` matches all uppercase and lowercase letters in the Unicode Basic Latin block as well as the symbols `[`, `\\`, `]`, `^`, `_`, and U+0060 (GRAVE ACCENT).

    -
    - -

    A `-` character can be treated literally or it can denote a range. It is treated literally if it is the first or last character of |ClassRanges|, the beginning or end limit of a range specification, or immediately follows a range specification.

    -
    - - - ClassAtom :: `-` - - 1. Return the CharSet containing the single character `-` U+002D (HYPHEN-MINUS). - - - - ClassAtomNoDash :: SourceCharacter but not one of `\` or `]` or `-` - - 1. Return the CharSet containing the character matched by |SourceCharacter|. - - - - - ClassEscape :: `b` - - ClassEscape :: `-` - - ClassEscape :: CharacterEscape - - - 1. Let _cv_ be the CharacterValue of this |ClassEscape|. - 1. Let _c_ be the character whose character value is _cv_. - 1. Return the CharSet containing the single character _c_. - - -

    A |ClassAtom| can use any of the escape sequences that are allowed in the rest of the regular expression except for `\\b`, `\\B`, and backreferences. Inside a |CharacterClass|, `\\b` means the backspace character, while `\\B` and backreferences raise errors. Using a backreference inside a |ClassAtom| causes an error.

    -
    - - - CharacterClassEscape :: `d` - - 1. Return the ten-element CharSet containing the characters `0` through `9` inclusive. - - CharacterClassEscape :: `D` - - 1. Return the CharSet containing all characters not in the CharSet returned by CharacterClassEscape :: `d` . - - CharacterClassEscape :: `s` - - 1. Return the CharSet containing all characters corresponding to a code point on the right-hand side of the |WhiteSpace| or |LineTerminator| productions. - - CharacterClassEscape :: `S` - - 1. Return the CharSet containing all characters not in the CharSet returned by CharacterClassEscape :: `s` . - - CharacterClassEscape :: `w` - - 1. Return _WordCharacters_GetWordCharacters(_modifiers_). - - CharacterClassEscape :: `W` - - 1. Return the CharSet containing all characters not in the CharSet returned by CharacterClassEscape :: `w` . - - CharacterClassEscape :: `p{` UnicodePropertyValueExpression `}` - - 1. Return the CharSet containing all Unicode code points included in CompileToCharSet of |UnicodePropertyValueExpression|. - - CharacterClassEscape :: `P{` UnicodePropertyValueExpression `}` - - 1. Return the CharSet containing all Unicode code points not included in CompileToCharSet of |UnicodePropertyValueExpression|. - - UnicodePropertyValueExpression :: UnicodePropertyName `=` UnicodePropertyValue - - 1. Let _ps_ be SourceText of |UnicodePropertyName|. - 1. Let _p_ be UnicodeMatchProperty(_ps_). - 1. Assert: _p_ is a Unicode property name or property alias listed in the “Property name and aliases” column of . - 1. Let _vs_ be SourceText of |UnicodePropertyValue|. - 1. Let _v_ be UnicodeMatchPropertyValue(_p_, _vs_). - 1. Return the CharSet containing all Unicode code points whose character database definition includes the property _p_ with value _v_. - - UnicodePropertyValueExpression :: LoneUnicodePropertyNameOrValue - - 1. Let _s_ be SourceText of |LoneUnicodePropertyNameOrValue|. - 1. 
If UnicodeMatchPropertyValue(`General_Category`, _s_) is identical to a List of Unicode code points that is the name of a Unicode general category or general category alias listed in the “Property value and aliases” column of , then - 1. Return the CharSet containing all Unicode code points whose character database definition includes the property “General_Category” with value _s_. - 1. Let _p_ be UnicodeMatchProperty(_s_). - 1. Assert: _p_ is a binary Unicode property or binary property alias listed in the “Property name and aliases” column of . - 1. Return the CharSet containing all Unicode code points whose character database definition includes the property _p_ with value “True”. + 1. Let _matchingGroupSpecifiers_ be GroupSpecifiersThatMatch(|GroupName|). + 1. Assert: _matchingGroupSpecifiers_ contains a single |GroupSpecifier|. + 1. Let _groupSpecifier_ be the sole element of _matchingGroupSpecifiers_. + 1. Let _parenIndex_ be CountLeftCapturingParensBefore(_groupSpecifier_). + 1. Return BackreferenceMatcher(_rer_, _parenIndex_, _direction_).
    - -

    - - GetWordCharacters ( - _modifiers_: a Modifiers Record, - ): a CharSet - -

    -
    -
    - - 1. Let _wordCharacters_ be the mathematical set that is the union of all sixty-three characters in *"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_"* (letters, numbers, and U+005F (LOW LINE) in the Unicode Basic Latin block) and all characters _c_ for which _c_ is not in that set but Canonicalize(_c_, _modifiers_) is. - 1. Return _wordCharacters_. - - - _wordCharacters_ cannot contain more than sixty-three characters unless _Unicode_ and _modifiers_.[[IgnoreCase]] are both *true*. - -
    -

    UpdateModifiers ( - _modifiers_: a Modifiers Record, + _rer_: a RegExp Record, _add_: a String, _remove_: a String, - ): a Modifiers + ): a RegExp Record

    - 1. Let _dotAll_ be _modifiers_.[[DotAll]]. - 1. Let _ignoreCase_ be _modifiers_.[[IgnoreCase]]. - 1. Let _multiline_ be _modifiers_.[[Multiline]]. - 1. If _add_ contains *"s"*, set _dotAll_ to *true*. + 1. Let _ignoreCase_ be _rer_.[[IgnoreCase]]. + 1. Let _multiline_ be _rer_.[[Multiline]]. + 1. Let _dotAll_ be _rer_.[[DotAll]]. + 1. Let _unicode_ be _rer_.[[Unicode]]. + 1. Let _unicodeSets_ be _rer_.[[UnicodeSets]]. + 1. Let _capturingGroupsCount_ be _rer_.[[CapturingGroupsCount]]. 1. If _add_ contains *"i"*, set _ignoreCase_ to *true*. 1. If _add_ contains *"m"*, set _multiline_ to *true*. - 1. If _remove_ contains *"s"*, set _dotAll_ to *false*. + 1. If _add_ contains *"s"*, set _dotAll_ to *true*. 1. If _remove_ contains *"i"*, set _ignoreCase_ to *false*. 1. If _remove_ contains *"m"*, set _multiline_ to *false*. - 1. Return the Modifiers Record { [[DotAll]]: _dotAll_, [[IgnoreCase]]: _ignoreCase_, [[Multiline]]: _multiline_ }. + 1. If _remove_ contains *"s"*, set _dotAll_ to *false*. + 1. Return the RegExp Record { [[IgnoreCase]]: _ignoreCase_, [[Multiline]]: _multiline_, [[DotAll]]: _dotAll_, [[Unicode]]: _unicode_, [[UnicodeSets]]: _unicodeSets_, [[CapturingGroupsCount]]: _capturingGroupsCount_ }.
    @@ -1015,86 +511,86 @@ contributors: Ron Buckton, Ecma International

    This alternative pattern grammar and semantics only changes the syntax and semantics of BMP patterns. The following grammar extensions include productions parameterized with the [UnicodeMode] parameter. However, none of these extensions change the syntax of Unicode patterns recognized when parsing with the [UnicodeMode] parameter present on the goal symbol.

    Syntax

    - Term[UnicodeMode, N] :: - [+UnicodeMode] Assertion[+UnicodeMode, ?N] - [+UnicodeMode] Atom[+UnicodeMode, ?N] Quantifier - [+UnicodeMode] Atom[+UnicodeMode, ?N] - [~UnicodeMode] QuantifiableAssertion[?N] Quantifier - [~UnicodeMode] Assertion[~UnicodeMode, ?N] - [~UnicodeMode] ExtendedAtom[?N] Quantifier - [~UnicodeMode] ExtendedAtom[?N] - - Assertion[UnicodeMode, N] :: + Term[UnicodeMode, UnicodeSetsMode, NamedCaptureGroups] :: + [+UnicodeMode] Assertion[+UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] + [+UnicodeMode] Atom[+UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] Quantifier + [+UnicodeMode] Atom[+UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] + [~UnicodeMode] QuantifiableAssertion[?NamedCaptureGroups] Quantifier + [~UnicodeMode] Assertion[~UnicodeMode, ~UnicodeSetsMode, ?NamedCaptureGroups] + [~UnicodeMode] ExtendedAtom[?NamedCaptureGroups] Quantifier + [~UnicodeMode] ExtendedAtom[?NamedCaptureGroups] + + Assertion[UnicodeMode, UnicodeSetsMode, NamedCaptureGroups] :: `^` `$` - `\` `b` - `\` `B` - [+UnicodeMode] `(` `?` `=` Disjunction[+UnicodeMode, ?N] `)` - [+UnicodeMode] `(` `?` `!` Disjunction[+UnicodeMode, ?N] `)` - [~UnicodeMode] QuantifiableAssertion[?N] - `(` `?` `<=` Disjunction[?UnicodeMode, ?N] `)` - `(` `?` `<!` Disjunction[?UnicodeMode, ?N] `)` - - QuantifiableAssertion[N] :: - `(` `?` `=` Disjunction[~UnicodeMode, ?N] `)` - `(` `?` `!` Disjunction[~UnicodeMode, ?N] `)` - - ExtendedAtom[N] :: + `\b` + `\B` + [+UnicodeMode] `(?=` Disjunction[+UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] `)` + [+UnicodeMode] `(?!` Disjunction[+UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] `)` + [~UnicodeMode] QuantifiableAssertion[?NamedCaptureGroups] + `(?<=` Disjunction[?UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] `)` + `(?<!` Disjunction[?UnicodeMode, ?UnicodeSetsMode, ?NamedCaptureGroups] `)` + + QuantifiableAssertion[NamedCaptureGroups] :: + `(?=` Disjunction[~UnicodeMode, ~UnicodeSetsMode, ?NamedCaptureGroups] `)` + 
`(?!` Disjunction[~UnicodeMode, ~UnicodeSetsMode, ?NamedCaptureGroups] `)` + + ExtendedAtom[NamedCaptureGroups] :: `.` - `\` AtomEscape[~UnicodeMode, ?N] + `\` AtomEscape[~UnicodeMode, ?NamedCaptureGroups] `\` [lookahead == `c`] - CharacterClass[~UnicodeMode] - `(` Disjunction[~UnicodeMode, ?N] `)` - `(` `?` `:` Disjunction[~UnicodeMode, ?N] `)` - `(` `?` RegularExpressionFlags `:` Disjunction[?UnicodeMode, ?N] `)` - `(` `?` RegularExpressionFlags `-` RegularExpressionFlags `:` Disjunction[?UnicodeMode, ?N] `)` + CharacterClass[~UnicodeMode, ~UnicodeSetsMode] + `(` GroupSpecifier[~UnicodeMode]? Disjunction[~UnicodeMode, ~UnicodeSetsMode, ?NamedCaptureGroups] `)` + `(` `?` `:` Disjunction[~UnicodeMode, ~UnicodeSetsMode, ?NamedCaptureGroups] `)` + `(` `?` RegularExpressionFlags `:` Disjunction[?UnicodeMode, ~UnicodeSetsMode, ?NamedCaptureGroups] `)` + `(` `?` RegularExpressionFlags `-` RegularExpressionFlags `:` Disjunction[?UnicodeMode, ~UnicodeSetsMode, ?NamedCaptureGroups] `)` InvalidBracedQuantifier ExtendedPatternCharacter InvalidBracedQuantifier :: `{` DecimalDigits[~Sep] `}` - `{` DecimalDigits[~Sep] `,` `}` + `{` DecimalDigits[~Sep] `,}` `{` DecimalDigits[~Sep] `,` DecimalDigits[~Sep] `}` ExtendedPatternCharacter :: SourceCharacter but not one of `^` `$` `\` `.` `*` `+` `?` `(` `)` `[` `|` - AtomEscape[UnicodeMode, N] :: + AtomEscape[UnicodeMode, NamedCaptureGroups] :: [+UnicodeMode] DecimalEscape - [~UnicodeMode] DecimalEscape [> but only if the CapturingGroupNumber of |DecimalEscape| is ≤ _NcapturingParens_] + [~UnicodeMode] DecimalEscape [> but only if the CapturingGroupNumber of |DecimalEscape| is ≤ CountLeftCapturingParensWithin(the |Pattern| containing |DecimalEscape|)] CharacterClassEscape[?UnicodeMode] - CharacterEscape[?UnicodeMode, ?N] - [+N] `k` GroupName[?UnicodeMode] + CharacterEscape[?UnicodeMode, ?NamedCaptureGroups] + [+NamedCaptureGroups] `k` GroupName[?UnicodeMode] - CharacterEscape[UnicodeMode, N] :: + CharacterEscape[UnicodeMode, 
NamedCaptureGroups] :: ControlEscape - `c` ControlLetter + `c` AsciiLetter `0` [lookahead ∉ DecimalDigit] HexEscapeSequence RegExpUnicodeEscapeSequence[?UnicodeMode] [~UnicodeMode] LegacyOctalEscapeSequence - IdentityEscape[?UnicodeMode, ?N] + IdentityEscape[?UnicodeMode, ?NamedCaptureGroups] - IdentityEscape[UnicodeMode, N] :: + IdentityEscape[UnicodeMode, NamedCaptureGroups] :: [+UnicodeMode] SyntaxCharacter [+UnicodeMode] `/` - [~UnicodeMode] SourceCharacterIdentityEscape[?N] + [~UnicodeMode] SourceCharacterIdentityEscape[?NamedCaptureGroups] - SourceCharacterIdentityEscape[N] :: - [~N] SourceCharacter but not `c` - [+N] SourceCharacter but not one of `c` or `k` + SourceCharacterIdentityEscape[NamedCaptureGroups] :: + [~NamedCaptureGroups] SourceCharacter but not `c` + [+NamedCaptureGroups] SourceCharacter but not one of `c` or `k` - ClassAtomNoDash[UnicodeMode, N] :: + ClassAtomNoDash[UnicodeMode, NamedCaptureGroups] :: SourceCharacter but not one of `\` or `]` or `-` - `\` ClassEscape[?UnicodeMode, ?N] + `\` ClassEscape[?UnicodeMode, ?NamedCaptureGroups] `\` [lookahead == `c`] - ClassEscape[UnicodeMode, N] :: + ClassEscape[UnicodeMode, NamedCaptureGroups] :: `b` [+UnicodeMode] `-` [~UnicodeMode] `c` ClassControlLetter CharacterClassEscape[?UnicodeMode] - CharacterEscape[?UnicodeMode, ?N] + CharacterEscape[?UnicodeMode, ?NamedCaptureGroups] ClassControlLetter :: DecimalDigit