Skip to content

Commit 2391e5e

Browse files
<regex>: Improve search performance for regexes with initial + quantifiers (#5509)
Co-authored-by: Stephan T. Lavavej <stl@microsoft.com>
1 parent 6de3f1c commit 2391e5e

File tree

5 files changed

+104
-6
lines changed

5 files changed

+104
-6
lines changed

benchmarks/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,7 @@ add_benchmark(nth_element src/nth_element.cpp)
116116
add_benchmark(path_lexically_normal src/path_lexically_normal.cpp)
117117
add_benchmark(priority_queue_push_range src/priority_queue_push_range.cpp)
118118
add_benchmark(random_integer_generation src/random_integer_generation.cpp)
119+
add_benchmark(regex_search src/regex_search.cpp)
119120
add_benchmark(remove src/remove.cpp)
120121
add_benchmark(replace src/replace.cpp)
121122
add_benchmark(reverse src/reverse.cpp)

benchmarks/src/regex_search.cpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
// Copyright (c) Microsoft Corporation.
2+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
3+
4+
#include <benchmark/benchmark.h>
5+
#include <regex>
6+
#include <string>
7+
8+
#include "lorem.hpp"
9+
10+
using namespace std;
11+
12+
void bm_lorem_search(benchmark::State& state, const char* pattern) {
13+
string repeated_lorem{lorem_ipsum};
14+
for (long long i = 0; i < state.range(); ++i) {
15+
repeated_lorem += repeated_lorem;
16+
}
17+
regex re{pattern};
18+
19+
for (auto _ : state) {
20+
benchmark::DoNotOptimize(repeated_lorem);
21+
const char* pos = repeated_lorem.data();
22+
const char* end = repeated_lorem.data() + repeated_lorem.size();
23+
cmatch match;
24+
for (; regex_search(pos, end, match, re); ++pos) {
25+
benchmark::DoNotOptimize(match);
26+
pos = match[0].second;
27+
if (pos == end) {
28+
break;
29+
}
30+
}
31+
}
32+
}
33+
34+
BENCHMARK_CAPTURE(bm_lorem_search, "bibe", "bibe")->Arg(2)->Arg(3)->Arg(4);
35+
BENCHMARK_CAPTURE(bm_lorem_search, "(bibe)+", "(bibe)+")->Arg(2)->Arg(3)->Arg(4);
36+
BENCHMARK_CAPTURE(bm_lorem_search, "(?:bibe)+", "(?:bibe)+")->Arg(2)->Arg(3)->Arg(4);
37+
38+
BENCHMARK_MAIN();

stl/inc/regex

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4128,6 +4128,15 @@ _BidIt _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Skip(_BidIt _First_arg
41284128
break;
41294129
}
41304130

4131+
case _N_rep:
4132+
{
4133+
_Node_rep* _Node = static_cast<_Node_rep*>(_Nx);
4134+
if (_Node->_Min == 0) {
4135+
return _First_arg;
4136+
}
4137+
break;
4138+
}
4139+
41314140
case _N_begin:
41324141
break;
41334142

@@ -4139,7 +4148,6 @@ _BidIt _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Skip(_BidIt _First_arg
41394148
case _N_neg_assert:
41404149
case _N_back:
41414150
case _N_endif:
4142-
case _N_rep:
41434151
case _N_end_rep:
41444152
default:
41454153
return _First_arg;

tests/std/include/test_regex_support.hpp

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -282,8 +282,9 @@ class test_regex {
282282
submatches_success = false;
283283
break;
284284
}
285-
} else if (!actual_capture.matched || actual_capture.first != (mr[0].first + expected_capture.first)
286-
|| actual_capture.second != (mr[0].first + expected_capture.second)) {
285+
} else if (!actual_capture.matched
286+
|| actual_capture.first != (subject.begin() + expected_capture.first)
287+
|| actual_capture.second != (subject.begin() + expected_capture.second)) {
287288
submatches_success = false;
288289
break;
289290
}
@@ -297,7 +298,8 @@ class test_regex {
297298
for (const auto& expected_capture : capture_groups) {
298299
std::string capture = "(unmatched)";
299300
if (expected_capture.first != -1) {
300-
capture.assign(mr[0].first + expected_capture.first, mr[0].first + expected_capture.second);
301+
capture.assign(
302+
subject.begin() + expected_capture.first, subject.begin() + expected_capture.second);
301303
}
302304
printf(R"(%s"%s" [%td %td])", initial ? "" : ", ", capture.c_str(), expected_capture.first,
303305
expected_capture.second);
@@ -313,8 +315,8 @@ class test_regex {
313315
std::ptrdiff_t last = -1;
314316
if (actual_capture.matched) {
315317
capture = actual_capture.str();
316-
first = actual_capture.first - mr[0].first;
317-
last = actual_capture.second - mr[0].first;
318+
first = actual_capture.first - subject.begin();
319+
last = actual_capture.second - subject.begin();
318320
}
319321
printf(R"(%s"%s" [%td %td])", initial ? "" : ", ", capture.c_str(), first, last);
320322
initial = false;

tests/std/tests/VSO_0000000_regex_use/test.cpp

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1889,6 +1889,54 @@ void test_gh_5490() {
18891889
}
18901890
}
18911891

1892+
void test_gh_5509() {
1893+
// GH-5509 extended the matcher's skip optimization
1894+
// to regexes starting with a loop with at least one repetition,
1895+
// speeding up searches for such regexes
1896+
1897+
{
1898+
test_regex char_plus_regex(&g_regexTester, "(a+)");
1899+
char_plus_regex.should_search_match_capture_groups("blwerofaaweraf", "aa", match_default, {{7, 9}});
1900+
char_plus_regex.should_search_fail("blwerofwerf");
1901+
}
1902+
1903+
{
1904+
test_regex charclass_plus_regex(&g_regexTester, "([fa]+)");
1905+
charclass_plus_regex.should_search_match_capture_groups("blwerofaaweraf", "faa", match_default, {{6, 9}});
1906+
charclass_plus_regex.should_search_fail("blwerower");
1907+
}
1908+
1909+
{
1910+
test_regex string_plus_regex(&g_regexTester, "((?:aw)+)");
1911+
string_plus_regex.should_search_match_capture_groups("blwerofaawaweraf", "awaw", match_default, {{8, 12}});
1912+
string_plus_regex.should_search_fail("blwerofaerwaf");
1913+
}
1914+
1915+
{
1916+
test_regex anchored_string_plus_regex(&g_regexTester, "((?:^aw)+)");
1917+
anchored_string_plus_regex.should_search_match_capture_groups(
1918+
"blwerofa\nawaweraf", "aw", match_default, {{9, 11}});
1919+
anchored_string_plus_regex.should_search_fail("blwerof\naerwaf");
1920+
}
1921+
1922+
{
1923+
test_regex anchored_string_plus_regex(&g_regexTester, "((?:$\naw)+)");
1924+
anchored_string_plus_regex.should_search_match_capture_groups(
1925+
"blwerofa\nawaweraf", "\naw", match_default, {{8, 11}});
1926+
anchored_string_plus_regex.should_search_fail("blwerof\naerwaf");
1927+
}
1928+
1929+
{
1930+
test_regex string_star_string_regex(&g_regexTester, "((?:aw)*fa)");
1931+
string_star_string_regex.should_search_match_capture_groups(
1932+
"blwerofaawawfaeraf", "fa", match_default, {{6, 8}});
1933+
string_star_string_regex.should_search_match_capture_groups(
1934+
"blweroawawfaeraf", "awawfa", match_default, {{6, 12}});
1935+
string_star_string_regex.should_search_match("blwerofaerwaf", "fa");
1936+
string_star_string_regex.should_search_fail("blweroerwaf");
1937+
}
1938+
}
1939+
18921940
int main() {
18931941
test_dev10_449367_case_insensitivity_should_work();
18941942
test_dev11_462743_regex_collate_should_not_disable_regex_icase();
@@ -1936,6 +1984,7 @@ int main() {
19361984
test_gh_5374();
19371985
test_gh_5377();
19381986
test_gh_5490();
1987+
test_gh_5509();
19391988

19401989
return g_regexTester.result();
19411990
}

0 commit comments

Comments
 (0)