diff --git a/stl/inc/regex b/stl/inc/regex
index e280502115..4c3d8f503a 100644
--- a/stl/inc/regex
+++ b/stl/inc/regex
@@ -1509,7 +1509,6 @@ public:
     using _Difft = typename iterator_traits<_FwdIt>::difference_type;
 
     _Builder(const _RxTraits& _Tr, regex_constants::syntax_option_type);
-    bool _Beg_expr() const;
     void _Setlong();
     // _Discard_pattern is an ABI zombie name
     void _Tidy() noexcept;
@@ -1545,7 +1544,6 @@ private:
     static void _Insert_node(_Node_base*, _Node_base*);
     _Node_base* _New_node(_Node_type _Kind);
     void _Add_str_node();
-    bool _Beg_expr(_Node_base*) const;
     void _Add_char_to_bitmap(_Elem _Ch);
     void _Add_char_to_array(_Elem _Ch);
     void _Add_elts(_Node_class<_Elem, _RxTraits>*, typename _RxTraits::char_class_type, bool);
@@ -2777,17 +2775,6 @@ _Node_base* _Builder<_FwdIt, _Elem, _RxTraits>::_Getmark() const {
     return _Current;
 }
 
-template <class _FwdIt, class _Elem, class _RxTraits>
-bool _Builder<_FwdIt, _Elem, _RxTraits>::_Beg_expr(_Node_base* _Nx) const {
-    // test for beginning of expression or subexpression
-    return _Nx->_Kind == _N_begin || _Nx->_Kind == _N_group || _Nx->_Kind == _N_capture;
-}
-
-template <class _FwdIt, class _Elem, class _RxTraits>
-bool _Builder<_FwdIt, _Elem, _RxTraits>::_Beg_expr() const { // test for beginning of expression or subexpression
-    return _Beg_expr(_Current) || (_Current->_Kind == _N_bol && _Beg_expr(_Current->_Prev));
-}
-
 template <class _FwdIt, class _Elem, class _RxTraits>
 _Node_base* _Builder<_FwdIt, _Elem, _RxTraits>::_Link_node(_Node_base* _Nx) { // insert _Nx at current location
     _Nx->_Prev = _Current;
@@ -3899,17 +3886,16 @@ void _Parser<_FwdIt, _Elem, _RxTraits>::_Trans() { // map character to meta-char
         break;
 
     case _Meta_star:
-        if ((_L_flags & _L_star_beg) && _Nfa._Beg_expr()) {
-            _Mchar = _Meta_chr;
-        }
-
+        // A star can always act as a quantifier outside bracket expressions,
+        // but _L_star_beg (used by basic/grep) allows its use as an ordinary character
+        // at the beginning of a (sub-)expression (potentially after an optional caret anchor).
+        // We'll handle that when we are parsing alternatives in disjunctions.
         break;
 
     case _Meta_caret:
-        if ((_L_flags & _L_anch_rstr) && !_Nfa._Beg_expr()) {
-            _Mchar = _Meta_chr;
-        }
-
+        // A caret can always negate a bracket expression,
+        // but _L_anch_rstr (used by basic/grep) restricts caret anchors to the beginning.
+        // We'll handle that restriction when we're about to add a bol node.
         break;
 
     case _Meta_dlr:
@@ -4475,15 +4461,21 @@ bool _Parser<_FwdIt, _Elem, _RxTraits>::_Alternative() { // check for valid alte
             _Next();
             _Quant = _Wrapped_disjunction();
             _Expect(_Meta_rpar, regex_constants::error_paren);
-        } else if (_Mchar == _Meta_caret) { // add bol node
+        } else if (_Mchar == _Meta_caret && (!(_L_flags & _L_anch_rstr) || !_Found)) { // add bol node
             _Nfa._Add_bol();
             _Next();
-            _Quant = false;
+            if ((_L_flags & _L_star_beg) && _Mchar == _Meta_star && !_Found) {
+                _Nfa._Add_char(_Char);
+                _Next();
+            } else {
+                _Quant = false;
+            }
         } else if (_Mchar == _Meta_dlr) { // add eol node
             _Nfa._Add_eol();
             _Next();
             _Quant = false;
-        } else if (_Mchar == _Meta_star || _Mchar == _Meta_plus || _Mchar == _Meta_query || _Mchar == _Meta_lbr) {
+        } else if ((_Mchar == _Meta_star && (!(_L_flags & _L_star_beg) || _Found)) || _Mchar == _Meta_plus
+                   || _Mchar == _Meta_query || _Mchar == _Meta_lbr) {
             _Error(regex_constants::error_badrepeat);
         } else if (_Mchar == _Meta_rbr && !(_L_flags & _L_paren_bal)) {
             _Error(regex_constants::error_brace);
diff --git a/tests/std/tests/VSO_0000000_regex_use/test.cpp b/tests/std/tests/VSO_0000000_regex_use/test.cpp
index 0efb1979e4..198c65d489 100644
--- a/tests/std/tests/VSO_0000000_regex_use/test.cpp
+++ b/tests/std/tests/VSO_0000000_regex_use/test.cpp
@@ -680,6 +680,338 @@ void test_gh_5160() {
     neg_regex.should_search_fail(L"xxxYxx\x2009xxxZxxx"); // U+2009 THIN SPACE
 }
 
+// Checks shared by the basic and grep grammars; invoked once per syntax option by test_gh_5165().
+void test_gh_5165_syntax_option(const syntax_option_type basic_or_grep) {
+    g_regexTester.should_not_match("yx", "y[^x]", basic_or_grep);
+    g_regexTester.should_match("yz", "y[^x]", basic_or_grep);
+    g_regexTester.should_match("y^", "y[^x]", basic_or_grep);
+
+    g_regexTester.should_match("yx", "y[x^]", basic_or_grep);
+    g_regexTester.should_not_match("yz", "y[x^]", basic_or_grep);
+    g_regexTester.should_match("y^", "y[x^]", basic_or_grep);
+
+    g_regexTester.should_not_match("yx", "y[^x^]", basic_or_grep);
+    g_regexTester.should_match("yz", "y[^x^]", basic_or_grep);
+    g_regexTester.should_not_match("y^", "y[^x^]", basic_or_grep);
+
+    {
+        const test_regex no_anchor(&g_regexTester, "meo[wW]", basic_or_grep);
+        no_anchor.should_search_match("meow_machine", "meow");
+        no_anchor.should_search_match("homeowner", "meow");
+    }
+    {
+        const test_regex beginning_anchor(&g_regexTester, "^meo[wW]", basic_or_grep);
+        beginning_anchor.should_search_match("meow_machine", "meow");
+        beginning_anchor.should_search_fail("homeowner");
+    }
+    {
+        const test_regex middle_anchor(&g_regexTester, "me^o[wW]", basic_or_grep);
+        middle_anchor.should_search_fail("meow_machine");
+        middle_anchor.should_search_fail("homeowner");
+        middle_anchor.should_search_match("home^owner", "me^ow");
+    }
+    {
+        const test_regex double_carets(&g_regexTester, "^^meo[wW]", basic_or_grep);
+        double_carets.should_search_fail("meow_machine");
+        double_carets.should_search_fail("homeowner");
+        double_carets.should_search_match("^meow_machine", "^meow");
+        double_carets.should_search_fail("^^meow_machine");
+        double_carets.should_search_fail("ho^meowner");
+        double_carets.should_search_fail("ho^^meowner");
+    }
+
+    g_regexTester.should_not_match("me^ow", R"-(me\(^o[wW]\))-", basic_or_grep);
+    g_regexTester.should_not_match("meow", R"-(me\(^o[wW]\))-", basic_or_grep);
+
+
+    {
+        const test_regex firstgroup_anchor(&g_regexTester, R"-(\(^meo[wW]\))-", basic_or_grep);
+        firstgroup_anchor.should_search_match("meow_machine", "meow");
+        firstgroup_anchor.should_search_fail("^meow_machine");
+        firstgroup_anchor.should_search_fail("homeowner");
+        firstgroup_anchor.should_search_fail("ho^meowner");
+    }
+
+    {
+        const test_regex prefixedgroup_anchor(&g_regexTester, R"-(.*\(^meo[wW]\))-", basic_or_grep);
+        prefixedgroup_anchor.should_search_match("meow_machine", "meow");
+        prefixedgroup_anchor.should_search_fail("^meow_machine");
+        prefixedgroup_anchor.should_search_fail("homeowner");
+        prefixedgroup_anchor.should_search_fail("ho^meowner");
+    }
+
+    {
+        const test_regex secondgroup_anchor(&g_regexTester, R"-(\(.*\)\(^meo[wW]\))-", basic_or_grep);
+        secondgroup_anchor.should_search_match("meow_machine", "meow");
+        secondgroup_anchor.should_search_fail("^meow_machine");
+        secondgroup_anchor.should_search_fail("homeowner");
+        secondgroup_anchor.should_search_fail("ho^meowner");
+    }
+
+    {
+        const test_regex nested_anchor(&g_regexTester, R"-(.*\(^\(^meo[wW]\)\))-", basic_or_grep);
+        nested_anchor.should_search_match("meow_machine", "meow");
+        nested_anchor.should_search_fail("^meow_machine");
+        nested_anchor.should_search_fail("^^meow_machine");
+        nested_anchor.should_search_fail("homeowner");
+        nested_anchor.should_search_fail("ho^meowner");
+        nested_anchor.should_search_fail("ho^^meowner");
+    }
+
+    {
+        const test_regex double_carets(&g_regexTester, R"-(.*\(^^meo[wW]\))-", basic_or_grep);
+        double_carets.should_search_fail("meow_machine");
+        double_carets.should_search_match("^meow_machine", "^meow");
+        double_carets.should_search_fail("^^meow_machine");
+        double_carets.should_search_fail("homeowner");
+        double_carets.should_search_fail("ho^meowner");
+        double_carets.should_search_fail("ho^^meowner");
+    }
+
+    // Validate correct handling of star at the
+    // beginning of an expression (with or without optional caret).
+    g_regexTester.should_match("*", "*", basic_or_grep);
+    g_regexTester.should_not_match("**", "*", basic_or_grep);
+    g_regexTester.should_match("****", "**", basic_or_grep);
+    g_regexTester.should_throw("***", error_badrepeat, basic_or_grep);
+
+    g_regexTester.should_match("*", "^*", basic_or_grep);
+    g_regexTester.should_not_match("**", "^*", basic_or_grep);
+    g_regexTester.should_not_match("^*", "^*", basic_or_grep);
+    g_regexTester.should_match("****", "^**", basic_or_grep);
+    g_regexTester.should_throw("^***", error_badrepeat, basic_or_grep);
+
+    g_regexTester.should_match("*aa", "*a*", basic_or_grep);
+    g_regexTester.should_match("*a", "*a*", basic_or_grep);
+    g_regexTester.should_not_match("aa", "*a*", basic_or_grep);
+    g_regexTester.should_not_match("*a*", "*a*", basic_or_grep);
+
+    g_regexTester.should_match("*aa", "^*a*", basic_or_grep);
+    g_regexTester.should_not_match("aa", "^*a*", basic_or_grep);
+    g_regexTester.should_not_match("*a*", "^*a*", basic_or_grep);
+    g_regexTester.should_not_match("^*a", "^*a*", basic_or_grep);
+    g_regexTester.should_not_match("^*aa", "^*a*", basic_or_grep);
+    g_regexTester.should_not_match("^*a*", "^*a*", basic_or_grep);
+
+    g_regexTester.should_match("*", R"-(\(*\))-", basic_or_grep);
+    g_regexTester.should_not_match("**", R"-(\(*\))-", basic_or_grep);
+    g_regexTester.should_match("****", R"-(\(**\))-", basic_or_grep);
+    g_regexTester.should_throw(R"-(\(***\))-", error_badrepeat, basic_or_grep);
+
+    g_regexTester.should_match("*", R"-(\(^*\))-", basic_or_grep);
+    g_regexTester.should_not_match("**", R"-(\(^*\))-", basic_or_grep);
+    g_regexTester.should_not_match("^*", R"-(\(^*\))-", basic_or_grep);
+    g_regexTester.should_match("***", R"-(\(^**\))-", basic_or_grep);
+    g_regexTester.should_throw(R"-(\(^***\))-", error_badrepeat, basic_or_grep);
+
+    g_regexTester.should_match("*aa", R"-(\(*a*\))-", basic_or_grep);
+    g_regexTester.should_match("*a", R"-(\(*a*\))-", basic_or_grep);
+    g_regexTester.should_not_match("aa", R"-(\(*a*\))-", basic_or_grep);
+    g_regexTester.should_not_match("*a*", R"-(\(*a*\))-", basic_or_grep);
+
+    g_regexTester.should_match("*aa", R"-(\(^*a*\))-", basic_or_grep);
+    g_regexTester.should_not_match("aa", R"-(\(^*a*\))-", basic_or_grep);
+    g_regexTester.should_not_match("*a*", R"-(\(^*a*\))-", basic_or_grep);
+    g_regexTester.should_not_match("^*a", R"-(\(^*a*\))-", basic_or_grep);
+    g_regexTester.should_not_match("^*aa", R"-(\(^*a*\))-", basic_or_grep);
+    g_regexTester.should_not_match("^*a*", R"-(\(^*a*\))-", basic_or_grep);
+
+    g_regexTester.should_match("*", R"-(.*\(^*\))-", basic_or_grep);
+    g_regexTester.should_not_match("**", R"-(.*\(^*\))-", basic_or_grep);
+    g_regexTester.should_not_match("^*", R"-(.*\(^*\))-", basic_or_grep);
+    g_regexTester.should_match("***", R"-(.*\(^**\))-", basic_or_grep);
+    g_regexTester.should_throw(R"-(.*\(^***\))-", error_badrepeat, basic_or_grep);
+
+    g_regexTester.should_match("*aa", R"-(.*\(^*a*\))-", basic_or_grep);
+    g_regexTester.should_not_match("aa", R"-(.*\(^*a*\))-", basic_or_grep);
+    g_regexTester.should_not_match("*a*", R"-(.*\(^*a*\))-", basic_or_grep);
+    g_regexTester.should_not_match("^*a", R"-(.*\(^*a*\))-", basic_or_grep);
+    g_regexTester.should_not_match("^*aa", R"-(.*\(^*a*\))-", basic_or_grep);
+    g_regexTester.should_not_match("^*a*", R"-(.*\(^*a*\))-", basic_or_grep);
+
+    // Validate that there is no special behavior near bars,
+    // as they are alternation operators in regex modes other than basic or grep.
+    {
+        const test_regex middle_bar(&g_regexTester, "^a|a", basic_or_grep);
+        middle_bar.should_search_match("a|a", "a|a");
+        middle_bar.should_search_fail("^a|a");
+        middle_bar.should_search_fail("ba|a");
+        middle_bar.should_search_fail("a");
+    }
+
+    {
+        const test_regex group_middle_bar(&g_regexTester, "^\\(a|a\\)", basic_or_grep);
+        group_middle_bar.should_search_match("a|a", "a|a");
+        group_middle_bar.should_search_fail("^a|a");
+        group_middle_bar.should_search_fail("ba|a");
+        group_middle_bar.should_search_fail("a");
+    }
+
+    {
+        const test_regex middle_bar_with_caret(&g_regexTester, "^a|^b", basic_or_grep);
+        middle_bar_with_caret.should_search_match("a|^b", "a|^b");
+        middle_bar_with_caret.should_search_fail("a|b");
+        middle_bar_with_caret.should_search_fail("^a|^b");
+        middle_bar_with_caret.should_search_fail("ca|^b");
+        middle_bar_with_caret.should_search_fail("a");
+        middle_bar_with_caret.should_search_fail("b");
+    }
+
+    {
+        const test_regex group_middle_bar_with_caret(&g_regexTester, "^\\(a|^b\\)", basic_or_grep);
+        group_middle_bar_with_caret.should_search_match("a|^b", "a|^b");
+        group_middle_bar_with_caret.should_search_fail("a|b");
+        group_middle_bar_with_caret.should_search_fail("^a|^b");
+        group_middle_bar_with_caret.should_search_fail("ca|^b");
+        group_middle_bar_with_caret.should_search_fail("a");
+        group_middle_bar_with_caret.should_search_fail("b");
+    }
+
+    g_regexTester.should_match("ab", "a|*b", basic_or_grep);
+    g_regexTester.should_match("a||b", "a|*b", basic_or_grep);
+    g_regexTester.should_not_match("a|*b", "a|*b", basic_or_grep);
+    g_regexTester.should_throw("a|**b", error_badrepeat, basic_or_grep);
+
+    g_regexTester.should_match("ab", "^a|*b", basic_or_grep);
+    g_regexTester.should_match("a||b", "^a|*b", basic_or_grep);
+    g_regexTester.should_not_match("a|*b", "^a|*b", basic_or_grep);
+    g_regexTester.should_throw("^a|**b", error_badrepeat, basic_or_grep);
+
+    g_regexTester.should_match("a|b", "^a|^*b", basic_or_grep);
+    g_regexTester.should_match("a|^^b", "^a|^*b", basic_or_grep);
+    g_regexTester.should_not_match("a|*b", "^a|^*b", basic_or_grep);
+    g_regexTester.should_not_match("a|^*b", "^a|^*b", basic_or_grep);
+    g_regexTester.should_throw("^a|^**b", error_badrepeat, basic_or_grep);
+}
+
+void test_gh_5165() {
+    // GH-5165: Revise caret parsing in basic and grep mode
+    test_gh_5165_syntax_option(basic);
+    test_gh_5165_syntax_option(grep);
+
+    // test cases specific to basic regular expressions
+    {
+        const test_regex middle_nl(&g_regexTester, "^a\na", basic);
+        middle_nl.should_search_match("a\na", "a\na");
+        middle_nl.should_search_fail("^a\na");
+        middle_nl.should_search_fail("ba\na");
+        middle_nl.should_search_fail("a");
+    }
+
+    {
+        const test_regex group_middle_nl(&g_regexTester, "^\\(a\na\\)", basic);
+        group_middle_nl.should_search_match("a\na", "a\na");
+        group_middle_nl.should_search_fail("^a\na");
+        group_middle_nl.should_search_fail("ba\na");
+        group_middle_nl.should_search_fail("a");
+    }
+
+    {
+        const test_regex middle_nl_with_caret(&g_regexTester, "^a\n^b", basic);
+        middle_nl_with_caret.should_search_match("a\n^b", "a\n^b");
+        middle_nl_with_caret.should_search_fail("a\nb");
+        middle_nl_with_caret.should_search_fail("^a\n^b");
+        middle_nl_with_caret.should_search_fail("ca\n^b");
+        middle_nl_with_caret.should_search_fail("a");
+        middle_nl_with_caret.should_search_fail("b");
+    }
+
+    {
+        const test_regex group_middle_nl_with_caret(&g_regexTester, "^\\(a\n^b\\)", basic);
+        group_middle_nl_with_caret.should_search_match("a\n^b", "a\n^b");
+        group_middle_nl_with_caret.should_search_fail("a\nb");
+        group_middle_nl_with_caret.should_search_fail("^a\n^b");
+        group_middle_nl_with_caret.should_search_fail("ca\n^b");
+        group_middle_nl_with_caret.should_search_fail("a");
+        group_middle_nl_with_caret.should_search_fail("b");
+    }
+
+    g_regexTester.should_match("ab", "a\n*b", basic);
+    g_regexTester.should_match("a\n\nb", "a\n*b", basic);
+    g_regexTester.should_not_match("a\n*b", "a\n*b", basic);
+    g_regexTester.should_match("a\n\nb", "^a\n*b", basic);
+    g_regexTester.should_throw("^a\n**b", error_badrepeat, basic);
+
+    g_regexTester.should_match("a\nb", "^a\n^*b", basic);
+    g_regexTester.should_match("a\n^^b", "^a\n^*b", basic);
+    g_regexTester.should_not_match("a\n*b", "^a\n^*b", basic);
+    g_regexTester.should_not_match("a\n^*b", "^a\n^*b", basic);
+    g_regexTester.should_throw("^a\n^**b", error_badrepeat, basic);
+
+    // test cases specific to grep mode
+    {
+        const test_regex middle_nl(&g_regexTester, "^a\na", grep);
+        middle_nl.should_search_match("a\na", "a");
+        middle_nl.should_search_match("^a\na", "a");
+        middle_nl.should_search_match("ba\na", "a");
+        middle_nl.should_search_match("a", "a");
+        middle_nl.should_search_fail("b");
+    }
+
+    {
+        // This regular expression is not accepted by POSIX grep, but currently the regex parser does not reject it.
+        // If the parser is changed to reject it, adjust this test case.
+        const test_regex group_middle_nl(&g_regexTester, "^\\(a\na\\)", grep);
+        group_middle_nl.should_search_match("a\na", "a\na");
+        group_middle_nl.should_search_fail("^a\na");
+        group_middle_nl.should_search_fail("ba\na");
+        group_middle_nl.should_search_fail("a");
+    }
+
+    {
+        const test_regex middle_nl_with_caret(&g_regexTester, "^a\n^b", grep);
+        middle_nl_with_caret.should_search_match("a\n^b", "a");
+        middle_nl_with_caret.should_search_match("a\nb", "a");
+        middle_nl_with_caret.should_search_match("ab", "a");
+        middle_nl_with_caret.should_search_match("a", "a");
+        middle_nl_with_caret.should_search_match("b", "b");
+        middle_nl_with_caret.should_search_match("ba", "b");
+        middle_nl_with_caret.should_search_fail("^a");
+        middle_nl_with_caret.should_search_fail("ca");
+        middle_nl_with_caret.should_search_fail("^b");
+        middle_nl_with_caret.should_search_fail("cb");
+    }
+
+    {
+        // This regular expression is not accepted by POSIX grep, but currently the regex parser does not reject it.
+        // If the parser is changed to reject it, adjust this test case.
+        const test_regex group_middle_nl_with_caret(&g_regexTester, "^\\(a\n^b\\)", grep);
+        group_middle_nl_with_caret.should_search_match("a\n^b", "a\n^b");
+        group_middle_nl_with_caret.should_search_fail("a\nb");
+        group_middle_nl_with_caret.should_search_fail("^a\n^b");
+        group_middle_nl_with_caret.should_search_fail("ca\n^b");
+        group_middle_nl_with_caret.should_search_fail("a");
+        group_middle_nl_with_caret.should_search_fail("b");
+    }
+
+    g_regexTester.should_not_match("ab", "a\n*b", grep);
+    g_regexTester.should_not_match("a\n\nb", "a\n*b", grep);
+    g_regexTester.should_not_match("a\n*b", "a\n*b", grep);
+    g_regexTester.should_match("a", "a\n*b", grep);
+    g_regexTester.should_match("*b", "a\n*b", grep);
+    g_regexTester.should_match("a", "a\n**b", grep);
+    g_regexTester.should_match("***b", "a\n**b", grep);
+
+    g_regexTester.should_not_match("ab", "^a\n*b", grep);
+    g_regexTester.should_not_match("a\n\nb", "^a\n*b", grep);
+    g_regexTester.should_not_match("a\n*b", "^a\n*b", grep);
+    g_regexTester.should_match("a", "^a\n*b", grep);
+    g_regexTester.should_match("*b", "^a\n*b", grep);
+    g_regexTester.should_match("a", "^a\n**b", grep);
+    g_regexTester.should_match("****b", "^a\n**b", grep);
+
+    g_regexTester.should_not_match("a\nb", "^a\n^*b", grep);
+    g_regexTester.should_not_match("a\n^^b", "^a\n^*b", grep);
+    g_regexTester.should_not_match("a\n*b", "^a\n^*b", grep);
+    g_regexTester.should_not_match("a\n^*b", "^a\n^*b", grep);
+    g_regexTester.should_not_match("^*b", "^a\n^*b", grep);
+    g_regexTester.should_match("a", "^a\n^*b", grep);
+    g_regexTester.should_match("*b", "^a\n^*b", grep);
+    g_regexTester.should_not_match("**b", "^a\n^*b", grep);
+    g_regexTester.should_match("a", "^a\n^**b", grep);
+    g_regexTester.should_match("****b", "^a\n^**b", grep);
+}
+
 void test_gh_5167() {
     // GH-5167: Limit backreference parsing to single digit for basic regular expressions
     g_regexTester.should_match("abab0", R"(\(ab*\)\10)", basic);
@@ -779,6 +1111,7 @@ int main() {
     test_gh_4995();
     test_gh_5058();
     test_gh_5160();
+    test_gh_5165();
     test_gh_5167();
     test_gh_5192();
     test_gh_5214();