Skip to content

Commit 0992807

Browse files
<regex>: Reject empty repetitions when required by regex grammars (#5494)
Co-authored-by: Stephan T. Lavavej <stl@microsoft.com>
1 parent e9912d8 commit 0992807

File tree

3 files changed: 320 additions and 92 deletions.

stl/inc/regex

Lines changed: 126 additions & 91 deletions
Original file line number | Diff line number | Diff line change
@@ -119,6 +119,7 @@ namespace regex_constants {
119119
grep = 0x10,
120120
egrep = 0x20,
121121
_Gmask = 0x3F,
122+
_Any_posix = basic | extended | grep | egrep | awk,
122123

123124
icase = 0x0100,
124125
nosubs = 0x0200,
@@ -3213,10 +3214,14 @@ void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_rep2(int _Min, int _Max, bool _Gre
32133214
_Node_base* _Pos = _Current;
32143215
if (_Pos->_Kind == _N_end_group || _Pos->_Kind == _N_end_capture) {
32153216
_Pos = static_cast<_Node_end_group*>(_Pos)->_Back;
3216-
}
3217-
3218-
if (_Min == 0 && _Max == 1) { // rewrite zero-or-one quantifiers as alternations to make the
3219-
// "simple loop" optimization more likely to engage
3217+
} else if (_Min == 0 && _Max == 1) {
3218+
// Rewrite zero-or-one quantifiers as alternations to make the
3219+
// "simple loop" optimization more likely to engage.
3220+
//
3221+
// GH-5490: This rewrite becomes observably incorrect
3222+
// if the subexpression contains capture groups,
3223+
// so we don't apply it if the subexpression is surrounded
3224+
// by a capturing or non-capturing group.
32203225
_Node_endif* _End = new _Node_endif;
32213226
_Node_if* _If_expr = new _Node_if(_End);
32223227
_Node_if* _If_empty_str = new _Node_if(_End);
@@ -3240,13 +3245,14 @@ void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_rep2(int _Min, int _Max, bool _Gre
32403245
swap(_If_expr->_Next->_Prev, _If_empty_str->_Next->_Prev); // intentional ADL
32413246
swap(_If_expr->_Next, _If_empty_str->_Next); // intentional ADL
32423247
}
3243-
} else {
3244-
_Node_end_rep* _Node0 = new _Node_end_rep();
3245-
_Node_rep* _Nx = new _Node_rep(_Greedy, _Min, _Max, _Node0, _Root->_Loops++);
3246-
_Node0->_Begin_rep = _Nx;
3247-
_Link_node(_Node0);
3248-
_Insert_node(_Pos, _Nx);
3248+
return;
32493249
}
3250+
3251+
_Node_end_rep* _Node0 = new _Node_end_rep();
3252+
_Node_rep* _Nx = new _Node_rep(_Greedy, _Min, _Max, _Node0, _Root->_Loops++);
3253+
_Node0->_Begin_rep = _Nx;
3254+
_Link_node(_Node0);
3255+
_Insert_node(_Pos, _Nx);
32503256
}
32513257

32523258
template <class _FwdIt, class _Elem, class _RxTraits>
@@ -3325,26 +3331,30 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep0(_Node_rep* _Node
33253331
int _Ix = 0;
33263332
_Tgt_state_t<_It> _St = _Tgt_state;
33273333

3328-
for (; _Ix < _Node->_Min; ++_Ix) { // do minimum number of reps
3329-
// GH-5365: We have to reset the capture groups from the second iteration on.
3330-
// We can avoid the reset for the first iteration
3331-
// because we know that a simple repetition was not encountered before.
3332-
if (_Ix > 0) {
3333-
_Tgt_state._Grp_valid = _St._Grp_valid;
3334-
}
3335-
3336-
_It _Cur = _Tgt_state._Cur;
3334+
if (0 < _Node->_Min) {
3335+
// GH-5365: We can avoid resetting capture groups for the first iteration
3336+
// because we know that a simple repetition of this loop was not encountered before.
33373337
if (!_Match_pat(_Node->_Next)) { // didn't match minimum number of reps, fail
3338-
_Tgt_state = _St;
33393338
return false;
3340-
} else if (_Cur == _Tgt_state._Cur) {
3341-
_Ix = _Node->_Min - 1; // skip matches that don't change state
3339+
} else if (_Tgt_state._Cur == _St._Cur) { // matches empty string
3340+
// loop is branchless, so it will only ever match empty strings
3341+
// -> skip all other matches as they don't change state and immediately try tail
3342+
return _Match_pat(_Node->_End_rep->_Next);
3343+
} else { // loop never matches the empty string
3344+
for (_Ix = 1; _Ix < _Node->_Min; ++_Ix) { // do minimum number of reps
3345+
// GH-5365: We have to reset the capture groups from the second iteration on.
3346+
_Tgt_state._Grp_valid = _St._Grp_valid;
3347+
if (!_Match_pat(_Node->_Next)) { // didn't match minimum number of reps, fail
3348+
return false;
3349+
}
3350+
}
33423351
}
33433352
}
33443353

3345-
_Tgt_state_t<_It> _Final = _Tgt_state;
3346-
bool _Matched0 = false;
3347-
_It _Saved_pos = _Tgt_state._Cur;
3354+
_Tgt_state_t<_It> _Final;
3355+
bool _Matched0 = false;
3356+
_It _Saved_pos = _Tgt_state._Cur;
3357+
bool _Done = false;
33483358

33493359
if (_Match_pat(_Node->_End_rep->_Next)) {
33503360
if (!_Greedy) {
@@ -3356,32 +3366,58 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep0(_Node_rep* _Node
33563366
_Matched0 = true;
33573367
}
33583368

3359-
while (_Node->_Max == -1 || _Ix++ < _Node->_Max) { // try another rep/tail match
3369+
if (_Ix == 0 && _Node->_Max != 0) {
33603370
_Tgt_state._Cur = _Saved_pos;
33613371
_Tgt_state._Grp_valid = _St._Grp_valid;
3362-
if (!_Match_pat(_Node->_Next)) {
3363-
break; // rep match failed, quit loop
3364-
}
33653372

3366-
_It _Mid = _Tgt_state._Cur;
3367-
if (_Match_pat(_Node->_End_rep->_Next)) {
3368-
if (!_Greedy) {
3373+
if (!_Match_pat(_Node->_Next)) { // rep match failed, we are done
3374+
_Done = true;
3375+
} else if (_Saved_pos == _Tgt_state._Cur) { // match empty, try no more repetitions
3376+
_Done = true;
3377+
// we only potentially accept/try tail for POSIX
3378+
if ((_Sflags & regex_constants::_Any_posix) && _Match_pat(_Node->_End_rep->_Next)) {
33693379
return true; // go with current match
33703380
}
3381+
} else {
3382+
_Saved_pos = _Tgt_state._Cur;
3383+
if (_Match_pat(_Node->_End_rep->_Next)) {
3384+
if (!_Greedy) {
3385+
return true; // go with current match
3386+
}
33713387

3372-
// record match and continue
3373-
_Final = _Tgt_state;
3374-
_Matched0 = true;
3388+
// record match and continue
3389+
_Final = _Tgt_state;
3390+
_Matched0 = true;
3391+
}
33753392
}
3393+
_Ix = 1;
3394+
}
33763395

3377-
if (_Saved_pos == _Mid) {
3378-
break; // rep match ate no additional elements, quit loop
3379-
}
3396+
if (!_Done) {
3397+
while (_Node->_Max == -1 || _Ix++ < _Node->_Max) { // try another rep/tail match
3398+
_Tgt_state._Cur = _Saved_pos;
3399+
_Tgt_state._Grp_valid = _St._Grp_valid;
3400+
if (!_Match_pat(_Node->_Next) || _Tgt_state._Cur == _Saved_pos) {
3401+
break; // rep match failed, quit loop
3402+
}
33803403

3381-
_Saved_pos = _Mid;
3404+
// since loop is branchless, empty rep match is not possible at this point
3405+
_Saved_pos = _Tgt_state._Cur;
3406+
if (_Match_pat(_Node->_End_rep->_Next)) {
3407+
if (!_Greedy) {
3408+
return true; // go with current match
3409+
}
3410+
3411+
// record match and continue
3412+
_Final = _Tgt_state;
3413+
_Matched0 = true;
3414+
}
3415+
}
33823416
}
33833417

3384-
_Tgt_state = _Matched0 ? _Final : _St;
3418+
if (_Matched0) { // record final match
3419+
_Tgt_state = _Final;
3420+
}
33853421
return _Matched0;
33863422
}
33873423

@@ -3395,61 +3431,56 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep(_Node_rep* _Node,
33953431
_It* _Loop_iter_sav = static_cast<_It*>(_Psav->_Loop_iter);
33963432
bool _Progress = _Init_idx == 0 || *_Loop_iter_sav != _St._Cur;
33973433

3398-
if (0 <= _Node->_Max && _Node->_Max <= _Init_idx) {
3399-
_Matched0 = _Match_pat(_Node->_End_rep->_Next); // reps done, try tail
3400-
} else if (_Init_idx < _Node->_Min) { // try a required rep
3401-
if (!_Progress) {
3402-
_Matched0 = _Match_pat(_Node->_End_rep->_Next); // empty, try tail
3403-
} else { // try another required match
3404-
_Psav->_Loop_idx = _Init_idx + 1;
3405-
_Psav->_Loop_iter = _STD addressof(_St._Cur);
3406-
_STD fill(_Tgt_state._Grp_valid.begin() + static_cast<ptrdiff_t>(_Psav->_Group_first),
3407-
_Tgt_state._Grp_valid.end(), false);
3408-
_Matched0 = _Match_pat(_Node->_Next);
3409-
}
3410-
} else if (_Longest) { // longest, try any number of repetitions
3411-
3412-
// match with no further repetition
3413-
_Matched0 = _Match_pat(_Node->_End_rep->_Next);
3414-
// match with at least one more repetition if last repetition made progress
3415-
if (_Progress) {
3434+
if (_Init_idx < _Node->_Min) { // try another required match
3435+
_Psav->_Loop_iter = _STD addressof(_St._Cur);
3436+
_Psav->_Loop_idx = _Progress ? _Init_idx + 1 : _Node->_Min; // try only one more match after an empty match
3437+
_STD fill(_Tgt_state._Grp_valid.begin() + static_cast<ptrdiff_t>(_Psav->_Group_first),
3438+
_Tgt_state._Grp_valid.end(), false);
3439+
_Matched0 = _Match_pat(_Node->_Next);
3440+
} else if (_Init_idx == _Node->_Min || _Progress) {
3441+
if (0 <= _Node->_Max && _Node->_Max <= _Init_idx) {
3442+
_Matched0 = _Match_pat(_Node->_End_rep->_Next); // reps done, try tail
3443+
} else if (_Longest) { // longest, try any number of repetitions
3444+
3445+
// match with no further repetition
3446+
_Matched0 = _Match_pat(_Node->_End_rep->_Next);
3447+
3448+
// try to match with one more repetition
34163449
_Tgt_state = _St;
34173450
_Psav->_Loop_idx = _Init_idx + 1;
34183451
_Psav->_Loop_iter = _STD addressof(_St._Cur);
3419-
34203452
if (_Match_pat(_Node->_Next)) { // always call _Match_pat, even when _Matched0 is already true
34213453
_Matched0 = true;
34223454
}
3423-
}
3424-
} else if (!_Greedy) { // not greedy, favor minimum number of reps
3425-
_Matched0 = _Match_pat(_Node->_End_rep->_Next);
3426-
if (!_Matched0 && _Progress) { // tail failed, try another rep
3427-
_Tgt_state = _St;
3428-
_Psav->_Loop_idx = _Init_idx + 1;
3429-
_Psav->_Loop_iter = _STD addressof(_St._Cur);
3430-
_STD fill(_Tgt_state._Grp_valid.begin() + static_cast<ptrdiff_t>(_Psav->_Group_first),
3431-
_Tgt_state._Grp_valid.end(), false);
3432-
_Matched0 = _Match_pat(_Node->_Next);
3433-
}
3434-
} else { // greedy, favor maximum number of reps
3435-
if (_Progress) { // try another rep
3455+
} else if (!_Greedy) { // not greedy, favor minimum number of reps
3456+
_Matched0 = _Match_pat(_Node->_End_rep->_Next);
3457+
if (!_Matched0) { // tail failed, try another rep
3458+
_Tgt_state = _St;
3459+
_Psav->_Loop_idx = _Init_idx + 1;
3460+
_Psav->_Loop_iter = _STD addressof(_St._Cur);
3461+
_STD fill(_Tgt_state._Grp_valid.begin() + static_cast<ptrdiff_t>(_Psav->_Group_first),
3462+
_Tgt_state._Grp_valid.end(), false);
3463+
_Matched0 = _Match_pat(_Node->_Next);
3464+
}
3465+
} else { // greedy, favor maximum number of reps,
3466+
// so try another rep
34363467
_Psav->_Loop_idx = _Init_idx + 1;
34373468
_Psav->_Loop_iter = _STD addressof(_St._Cur);
34383469
_STD fill(_Tgt_state._Grp_valid.begin() + static_cast<ptrdiff_t>(_Psav->_Group_first),
34393470
_Tgt_state._Grp_valid.end(), false);
34403471
_Matched0 = _Match_pat(_Node->_Next);
3441-
}
34423472

3443-
if ((_Progress || 1 >= _Init_idx) && !_Matched0) { // rep failed, try tail
3444-
_Psav->_Loop_idx = _Loop_idx_sav;
3445-
_Psav->_Loop_iter = _Loop_iter_sav;
3446-
_Tgt_state = _St;
3447-
_Matched0 = _Match_pat(_Node->_End_rep->_Next);
3473+
if (!_Matched0) { // rep failed, try tail
3474+
_Psav->_Loop_idx = _Loop_idx_sav;
3475+
_Psav->_Loop_iter = _Loop_iter_sav;
3476+
_Tgt_state = _St;
3477+
_Matched0 = _Match_pat(_Node->_End_rep->_Next);
3478+
}
34483479
}
3449-
}
3450-
3451-
if (!_Matched0) {
3452-
_Tgt_state = _St;
3480+
} else if (_Init_idx == 1 && (_Sflags & regex_constants::_Any_posix)) {
3481+
// POSIX allows an empty repetition if the subexpression is matched only once,
3482+
// so try tail
3483+
_Matched0 = _Match_pat(_Node->_End_rep->_Next);
34533484
}
34543485

34553486
_Psav->_Loop_idx = _Loop_idx_sav;
@@ -3470,9 +3501,7 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep_first(_Node_rep*
34703501
// No capture group reset is performed for POSIX regexes,
34713502
// so we prevent any reset by setting the first capture group to the number of capture groups _Ncap.
34723503
if (_Psav->_Group_first == 0) {
3473-
constexpr auto _Any_posix = regex_constants::basic | regex_constants::extended | regex_constants::grep
3474-
| regex_constants::egrep | regex_constants::awk;
3475-
if ((_Sflags & _Any_posix) || !_Find_first_inner_capture_group(_Node->_Next, _Psav)) {
3504+
if ((_Sflags & regex_constants::_Any_posix) || !_Find_first_inner_capture_group(_Node->_Next, _Psav)) {
34763505
_Psav->_Group_first = _Ncap;
34773506
}
34783507
}
@@ -3844,10 +3873,8 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
38443873
if (_Tgt_state._Cur == _End) {
38453874
_Failed = true;
38463875
} else {
3847-
constexpr auto _Any_posix = regex_constants::basic | regex_constants::extended | regex_constants::grep
3848-
| regex_constants::egrep | regex_constants::awk;
38493876
const _Elem _Ch = *_Tgt_state._Cur;
3850-
if (_Sflags & _Any_posix) {
3877+
if (_Sflags & regex_constants::_Any_posix) {
38513878
if (_Ch == _Elem()) {
38523879
_Failed = true;
38533880
}
@@ -4893,10 +4920,18 @@ void _Parser<_FwdIt, _Elem, _RxTraits>::_Calculate_loop_simplicity(
48934920
_Calculate_loop_simplicity(static_cast<_Node_assert*>(_Nx)->_Child, nullptr, nullptr);
48944921
break;
48954922
case _N_rep:
4896-
// _Node_rep inside another _Node_rep makes both not simple
4923+
// _Node_rep inside another _Node_rep makes both not simple if _Outer_rep can be repeated more than once
4924+
// because _Matcher2::_Do_rep0() does not reset capture group boundaries when control is returned to it.
4925+
// If _Outer_rep can repeat at most once, we have to analyze the structure of the inner loop.
48974926
if (_Outer_rep) {
4898-
_Outer_rep->_Simple_loop = 0;
4899-
static_cast<_Node_rep*>(_Nx)->_Simple_loop = 0;
4927+
_Outer_rep->_Simple_loop = 0;
4928+
auto _Inner_rep = static_cast<_Node_rep*>(_Nx);
4929+
if (_Outer_rep->_Max >= 0 && _Outer_rep->_Max <= 1) {
4930+
_Calculate_loop_simplicity(_Inner_rep->_Next, _Inner_rep->_End_rep->_Next, _Inner_rep);
4931+
_Nx = _Inner_rep->_End_rep;
4932+
} else {
4933+
_Inner_rep->_Simple_loop = 0;
4934+
}
49004935
} else {
49014936
_Outer_rep = static_cast<_Node_rep*>(_Nx);
49024937
}

0 commit comments

Comments
 (0)