@@ -119,6 +119,7 @@ namespace regex_constants {
119119 grep = 0x10,
120120 egrep = 0x20,
121121 _Gmask = 0x3F,
122+ _Any_posix = basic | extended | grep | egrep | awk,
122123
123124 icase = 0x0100,
124125 nosubs = 0x0200,
@@ -3213,10 +3214,14 @@ void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_rep2(int _Min, int _Max, bool _Gre
32133214 _Node_base* _Pos = _Current;
32143215 if (_Pos->_Kind == _N_end_group || _Pos->_Kind == _N_end_capture) {
32153216 _Pos = static_cast<_Node_end_group*>(_Pos)->_Back;
3216- }
3217-
3218- if (_Min == 0 && _Max == 1) { // rewrite zero-or-one quantifiers as alternations to make the
3219- // "simple loop" optimization more likely to engage
3217+ } else if (_Min == 0 && _Max == 1) {
3218+ // Rewrite zero-or-one quantifiers as alternations to make the
3219+ // "simple loop" optimization more likely to engage.
3220+ //
3221+ // GH-5490: This rewrite becomes observably incorrect
3222+ // if the subexpression contains capture groups,
3223+ // so we don't apply it if the subexpression is surrounded
3224+ // by a capturing or non-capturing group.
32203225 _Node_endif* _End = new _Node_endif;
32213226 _Node_if* _If_expr = new _Node_if(_End);
32223227 _Node_if* _If_empty_str = new _Node_if(_End);
@@ -3240,13 +3245,14 @@ void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_rep2(int _Min, int _Max, bool _Gre
32403245 swap(_If_expr->_Next->_Prev, _If_empty_str->_Next->_Prev); // intentional ADL
32413246 swap(_If_expr->_Next, _If_empty_str->_Next); // intentional ADL
32423247 }
3243- } else {
3244- _Node_end_rep* _Node0 = new _Node_end_rep();
3245- _Node_rep* _Nx = new _Node_rep(_Greedy, _Min, _Max, _Node0, _Root->_Loops++);
3246- _Node0->_Begin_rep = _Nx;
3247- _Link_node(_Node0);
3248- _Insert_node(_Pos, _Nx);
3248+ return;
32493249 }
3250+
3251+ _Node_end_rep* _Node0 = new _Node_end_rep();
3252+ _Node_rep* _Nx = new _Node_rep(_Greedy, _Min, _Max, _Node0, _Root->_Loops++);
3253+ _Node0->_Begin_rep = _Nx;
3254+ _Link_node(_Node0);
3255+ _Insert_node(_Pos, _Nx);
32503256}
32513257
32523258template <class _FwdIt, class _Elem, class _RxTraits>
@@ -3325,26 +3331,30 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep0(_Node_rep* _Node
33253331 int _Ix = 0;
33263332 _Tgt_state_t<_It> _St = _Tgt_state;
33273333
3328- for (; _Ix < _Node->_Min; ++_Ix) { // do minimum number of reps
3329- // GH-5365: We have to reset the capture groups from the second iteration on.
3330- // We can avoid the reset for the first iteration
3331- // because we know that a simple repetition was not encountered before.
3332- if (_Ix > 0) {
3333- _Tgt_state._Grp_valid = _St._Grp_valid;
3334- }
3335-
3336- _It _Cur = _Tgt_state._Cur;
3334+ if (0 < _Node->_Min) {
3335+ // GH-5365: We can avoid resetting capture groups for the first iteration
3336+ // because we know that a simple repetition of this loop was not encountered before.
33373337 if (!_Match_pat(_Node->_Next)) { // didn't match minimum number of reps, fail
3338- _Tgt_state = _St;
33393338 return false;
3340- } else if (_Cur == _Tgt_state._Cur) {
3341- _Ix = _Node->_Min - 1; // skip matches that don't change state
3339+ } else if (_Tgt_state._Cur == _St._Cur) { // matches empty string
3340+ // loop is branchless, so it will only ever match empty strings
3341+ // -> skip all other matches as they don't change state and immediately try tail
3342+ return _Match_pat(_Node->_End_rep->_Next);
3343+ } else { // loop never matches the empty string
3344+ for (_Ix = 1; _Ix < _Node->_Min; ++_Ix) { // do minimum number of reps
3345+ // GH-5365: We have to reset the capture groups from the second iteration on.
3346+ _Tgt_state._Grp_valid = _St._Grp_valid;
3347+ if (!_Match_pat(_Node->_Next)) { // didn't match minimum number of reps, fail
3348+ return false;
3349+ }
3350+ }
33423351 }
33433352 }
33443353
3345- _Tgt_state_t<_It> _Final = _Tgt_state;
3346- bool _Matched0 = false;
3347- _It _Saved_pos = _Tgt_state._Cur;
3354+ _Tgt_state_t<_It> _Final;
3355+ bool _Matched0 = false;
3356+ _It _Saved_pos = _Tgt_state._Cur;
3357+ bool _Done = false;
33483358
33493359 if (_Match_pat(_Node->_End_rep->_Next)) {
33503360 if (!_Greedy) {
@@ -3356,32 +3366,58 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep0(_Node_rep* _Node
33563366 _Matched0 = true;
33573367 }
33583368
3359- while (_Node->_Max == -1 || _Ix++ < _Node->_Max) { // try another rep/tail match
3369+ if (_Ix == 0 && _Node->_Max != 0) {
33603370 _Tgt_state._Cur = _Saved_pos;
33613371 _Tgt_state._Grp_valid = _St._Grp_valid;
3362- if (!_Match_pat(_Node->_Next)) {
3363- break; // rep match failed, quit loop
3364- }
33653372
3366- _It _Mid = _Tgt_state._Cur;
3367- if (_Match_pat(_Node->_End_rep->_Next)) {
3368- if (!_Greedy) {
3373+ if (!_Match_pat(_Node->_Next)) { // rep match failed, we are done
3374+ _Done = true;
3375+ } else if (_Saved_pos == _Tgt_state._Cur) { // match empty, try no more repetitions
3376+ _Done = true;
3377+ // only POSIX grammars may accept an empty repetition, so try the tail only for them
3378+ if ((_Sflags & regex_constants::_Any_posix) && _Match_pat(_Node->_End_rep->_Next)) {
33693379 return true; // go with current match
33703380 }
3381+ } else {
3382+ _Saved_pos = _Tgt_state._Cur;
3383+ if (_Match_pat(_Node->_End_rep->_Next)) {
3384+ if (!_Greedy) {
3385+ return true; // go with current match
3386+ }
33713387
3372- // record match and continue
3373- _Final = _Tgt_state;
3374- _Matched0 = true;
3388+ // record match and continue
3389+ _Final = _Tgt_state;
3390+ _Matched0 = true;
3391+ }
33753392 }
3393+ _Ix = 1;
3394+ }
33763395
3377- if (_Saved_pos == _Mid) {
3378- break; // rep match ate no additional elements, quit loop
3379- }
3396+ if (!_Done) {
3397+ while (_Node->_Max == -1 || _Ix++ < _Node->_Max) { // try another rep/tail match
3398+ _Tgt_state._Cur = _Saved_pos;
3399+ _Tgt_state._Grp_valid = _St._Grp_valid;
3400+ if (!_Match_pat(_Node->_Next) || _Tgt_state._Cur == _Saved_pos) {
3401+ break; // rep match failed, quit loop
3402+ }
33803403
3381- _Saved_pos = _Mid;
3404+ // since loop is branchless, empty rep match is not possible at this point
3405+ _Saved_pos = _Tgt_state._Cur;
3406+ if (_Match_pat(_Node->_End_rep->_Next)) {
3407+ if (!_Greedy) {
3408+ return true; // go with current match
3409+ }
3410+
3411+ // record match and continue
3412+ _Final = _Tgt_state;
3413+ _Matched0 = true;
3414+ }
3415+ }
33823416 }
33833417
3384- _Tgt_state = _Matched0 ? _Final : _St;
3418+ if (_Matched0) { // record final match
3419+ _Tgt_state = _Final;
3420+ }
33853421 return _Matched0;
33863422}
33873423
@@ -3395,61 +3431,56 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep(_Node_rep* _Node,
33953431 _It* _Loop_iter_sav = static_cast<_It*>(_Psav->_Loop_iter);
33963432 bool _Progress = _Init_idx == 0 || *_Loop_iter_sav != _St._Cur;
33973433
3398- if (0 <= _Node->_Max && _Node->_Max <= _Init_idx) {
3399- _Matched0 = _Match_pat(_Node->_End_rep->_Next); // reps done, try tail
3400- } else if (_Init_idx < _Node->_Min) { // try a required rep
3401- if (!_Progress) {
3402- _Matched0 = _Match_pat(_Node->_End_rep->_Next); // empty, try tail
3403- } else { // try another required match
3404- _Psav->_Loop_idx = _Init_idx + 1;
3405- _Psav->_Loop_iter = _STD addressof(_St._Cur);
3406- _STD fill(_Tgt_state._Grp_valid.begin() + static_cast<ptrdiff_t>(_Psav->_Group_first),
3407- _Tgt_state._Grp_valid.end(), false);
3408- _Matched0 = _Match_pat(_Node->_Next);
3409- }
3410- } else if (_Longest) { // longest, try any number of repetitions
3411-
3412- // match with no further repetition
3413- _Matched0 = _Match_pat(_Node->_End_rep->_Next);
3414- // match with at least one more repetition if last repetition made progress
3415- if (_Progress) {
3434+ if (_Init_idx < _Node->_Min) { // try another required match
3435+ _Psav->_Loop_iter = _STD addressof(_St._Cur);
3436+ _Psav->_Loop_idx = _Progress ? _Init_idx + 1 : _Node->_Min; // try only one more match after an empty match
3437+ _STD fill(_Tgt_state._Grp_valid.begin() + static_cast<ptrdiff_t>(_Psav->_Group_first),
3438+ _Tgt_state._Grp_valid.end(), false);
3439+ _Matched0 = _Match_pat(_Node->_Next);
3440+ } else if (_Init_idx == _Node->_Min || _Progress) {
3441+ if (0 <= _Node->_Max && _Node->_Max <= _Init_idx) {
3442+ _Matched0 = _Match_pat(_Node->_End_rep->_Next); // reps done, try tail
3443+ } else if (_Longest) { // longest, try any number of repetitions
3444+
3445+ // match with no further repetition
3446+ _Matched0 = _Match_pat(_Node->_End_rep->_Next);
3447+
3448+ // try to match with one more repetition
34163449 _Tgt_state = _St;
34173450 _Psav->_Loop_idx = _Init_idx + 1;
34183451 _Psav->_Loop_iter = _STD addressof(_St._Cur);
3419-
34203452 if (_Match_pat(_Node->_Next)) { // always call _Match_pat, even when _Matched0 is already true
34213453 _Matched0 = true;
34223454 }
3423- }
3424- } else if (!_Greedy) { // not greedy, favor minimum number of reps
3425- _Matched0 = _Match_pat(_Node->_End_rep->_Next);
3426- if (!_Matched0 && _Progress) { // tail failed, try another rep
3427- _Tgt_state = _St;
3428- _Psav->_Loop_idx = _Init_idx + 1;
3429- _Psav->_Loop_iter = _STD addressof(_St._Cur);
3430- _STD fill(_Tgt_state._Grp_valid.begin() + static_cast<ptrdiff_t>(_Psav->_Group_first),
3431- _Tgt_state._Grp_valid.end(), false);
3432- _Matched0 = _Match_pat(_Node->_Next);
3433- }
3434- } else { // greedy, favor maximum number of reps
3435- if (_Progress) { // try another rep
3455+ } else if (!_Greedy) { // not greedy, favor minimum number of reps
3456+ _Matched0 = _Match_pat(_Node->_End_rep->_Next);
3457+ if (!_Matched0) { // tail failed, try another rep
3458+ _Tgt_state = _St;
3459+ _Psav->_Loop_idx = _Init_idx + 1;
3460+ _Psav->_Loop_iter = _STD addressof(_St._Cur);
3461+ _STD fill(_Tgt_state._Grp_valid.begin() + static_cast<ptrdiff_t>(_Psav->_Group_first),
3462+ _Tgt_state._Grp_valid.end(), false);
3463+ _Matched0 = _Match_pat(_Node->_Next);
3464+ }
3465+ } else { // greedy, favor maximum number of reps,
3466+ // so try another rep
34363467 _Psav->_Loop_idx = _Init_idx + 1;
34373468 _Psav->_Loop_iter = _STD addressof(_St._Cur);
34383469 _STD fill(_Tgt_state._Grp_valid.begin() + static_cast<ptrdiff_t>(_Psav->_Group_first),
34393470 _Tgt_state._Grp_valid.end(), false);
34403471 _Matched0 = _Match_pat(_Node->_Next);
3441- }
34423472
3443- if ((_Progress || 1 >= _Init_idx) && !_Matched0) { // rep failed, try tail
3444- _Psav->_Loop_idx = _Loop_idx_sav;
3445- _Psav->_Loop_iter = _Loop_iter_sav;
3446- _Tgt_state = _St;
3447- _Matched0 = _Match_pat(_Node->_End_rep->_Next);
3473+ if (!_Matched0) { // rep failed, try tail
3474+ _Psav->_Loop_idx = _Loop_idx_sav;
3475+ _Psav->_Loop_iter = _Loop_iter_sav;
3476+ _Tgt_state = _St;
3477+ _Matched0 = _Match_pat(_Node->_End_rep->_Next);
3478+ }
34483479 }
3449- }
3450-
3451- if (!_Matched0) {
3452- _Tgt_state = _St ;
3480+ } else if (_Init_idx == 1 && (_Sflags & regex_constants::_Any_posix)) {
3481+ // POSIX allows an empty repetition if the subexpression is matched only once,
3482+ // so try tail
3483+ _Matched0 = _Match_pat(_Node->_End_rep->_Next) ;
34533484 }
34543485
34553486 _Psav->_Loop_idx = _Loop_idx_sav;
@@ -3470,9 +3501,7 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep_first(_Node_rep*
34703501 // No capture group reset is performed for POSIX regexes,
34713502 // so we prevent any reset by setting the first capture group to the number of capture groups _Ncap.
34723503 if (_Psav->_Group_first == 0) {
3473- constexpr auto _Any_posix = regex_constants::basic | regex_constants::extended | regex_constants::grep
3474- | regex_constants::egrep | regex_constants::awk;
3475- if ((_Sflags & _Any_posix) || !_Find_first_inner_capture_group(_Node->_Next, _Psav)) {
3504+ if ((_Sflags & regex_constants::_Any_posix) || !_Find_first_inner_capture_group(_Node->_Next, _Psav)) {
34763505 _Psav->_Group_first = _Ncap;
34773506 }
34783507 }
@@ -3844,10 +3873,8 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
38443873 if (_Tgt_state._Cur == _End) {
38453874 _Failed = true;
38463875 } else {
3847- constexpr auto _Any_posix = regex_constants::basic | regex_constants::extended | regex_constants::grep
3848- | regex_constants::egrep | regex_constants::awk;
38493876 const _Elem _Ch = *_Tgt_state._Cur;
3850- if (_Sflags & _Any_posix) {
3877+ if (_Sflags & regex_constants:: _Any_posix) {
38513878 if (_Ch == _Elem()) {
38523879 _Failed = true;
38533880 }
@@ -4893,10 +4920,18 @@ void _Parser<_FwdIt, _Elem, _RxTraits>::_Calculate_loop_simplicity(
48934920 _Calculate_loop_simplicity(static_cast<_Node_assert*>(_Nx)->_Child, nullptr, nullptr);
48944921 break;
48954922 case _N_rep:
4896- // _Node_rep inside another _Node_rep makes both not simple
4923+ // _Node_rep inside another _Node_rep makes both not simple if _Outer_rep can be repeated more than once
4924+ // because _Matcher2::_Do_rep0() does not reset capture group boundaries when control is returned to it.
4925+ // If _Outer_rep can repeat at most once, we have to analyze the structure of the inner loop.
48974926 if (_Outer_rep) {
4898- _Outer_rep->_Simple_loop = 0;
4899- static_cast<_Node_rep*>(_Nx)->_Simple_loop = 0;
4927+ _Outer_rep->_Simple_loop = 0;
4928+ auto _Inner_rep = static_cast<_Node_rep*>(_Nx);
4929+ if (_Outer_rep->_Max >= 0 && _Outer_rep->_Max <= 1) {
4930+ _Calculate_loop_simplicity(_Inner_rep->_Next, _Inner_rep->_End_rep->_Next, _Inner_rep);
4931+ _Nx = _Inner_rep->_End_rep;
4932+ } else {
4933+ _Inner_rep->_Simple_loop = 0;
4934+ }
49004935 } else {
49014936 _Outer_rep = static_cast<_Node_rep*>(_Nx);
49024937 }
0 commit comments