Skip to content

Commit 5b1b787

Browse files
[3.12] pythongh-67877: Fix memory leaks in terminated RE matching (pythonGH-126840)
If the SRE(match) function terminates abruptly, either because of a signal or because memory allocation fails, allocated SRE_REPEAT blocks might never be released. (cherry picked from commit 7538e7f) Co-authored-by: Serhiy Storchaka <storchaka@gmail.com> Co-authored-by: <wjssz@users.noreply.github.com>
1 parent 9d986d9 commit 5b1b787

File tree

6 files changed

+251
-14
lines changed

6 files changed

+251
-14
lines changed

Lib/test/test_re.py

+44
Original file line numberDiff line numberDiff line change
@@ -2621,6 +2621,50 @@ def test_regression_gh94675(self):
26212621
p.terminate()
26222622
p.join()
26232623

2624+
def test_fail(self):
2625+
self.assertEqual(re.search(r'12(?!)|3', '123')[0], '3')
2626+
2627+
def test_character_set_any(self):
2628+
# The union of complementary character sets matches any character
2629+
# and is equivalent to "(?s:.)".
2630+
s = '1x\n'
2631+
for p in r'[\s\S]', r'[\d\D]', r'[\w\W]', r'[\S\s]', r'\s|\S':
2632+
with self.subTest(pattern=p):
2633+
self.assertEqual(re.findall(p, s), list(s))
2634+
self.assertEqual(re.fullmatch('(?:' + p + ')+', s).group(), s)
2635+
2636+
def test_character_set_none(self):
2637+
# Negation of the union of complementary character sets does not match
2638+
# any character.
2639+
s = '1x\n'
2640+
for p in r'[^\s\S]', r'[^\d\D]', r'[^\w\W]', r'[^\S\s]':
2641+
with self.subTest(pattern=p):
2642+
self.assertIsNone(re.search(p, s))
2643+
self.assertIsNone(re.search('(?s:.)' + p, s))
2644+
2645+
def check_interrupt(self, pattern, string, maxcount):
    """Match *string* against *pattern* with an injected failure.

    For n = 0, 1, ..., maxcount - 1 the compiled pattern is armed with
    ``_fail_after(n, Interrupt)`` and ``match(string)`` is attempted.
    The first n for which the match completes without raising Interrupt
    is returned; if that never happens, the loop runs out and the method
    returns None implicitly.

    NOTE(review): ``_fail_after`` is presumably only present in debug
    builds and makes matching raise the given exception after n check
    points -- confirm against the _sre implementation.
    """
    class Interrupt(Exception):
        pass

    compiled = re.compile(pattern)
    for budget in range(maxcount):
        try:
            compiled._fail_after(budget, Interrupt)
            compiled.match(string)
        except Interrupt:
            # Interrupted as requested: retry with a larger budget.
            continue
        else:
            return budget
        finally:
            # Always disarm the pattern again, whatever happened above.
            compiled._fail_after(-1, None)
2658+
2659+
@unittest.skipUnless(hasattr(re.Pattern, '_fail_after'), 'requires debug build')
2660+
def test_memory_leaks(self):
2661+
self.check_interrupt(r'(.)*:', 'abc:', 100)
2662+
self.check_interrupt(r'([^:])*?:', 'abc:', 100)
2663+
self.check_interrupt(r'([^:])*+:', 'abc:', 100)
2664+
self.check_interrupt(r'(.){2,4}:', 'abc:', 100)
2665+
self.check_interrupt(r'([^:]){2,4}?:', 'abc:', 100)
2666+
self.check_interrupt(r'([^:]){2,4}+:', 'abc:', 100)
2667+
26242668

26252669
def get_debug_out(pat):
26262670
with captured_stdout() as out:
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Fix memory leaks when :mod:`regular expression <re>` matching terminates
2+
abruptly, either because of a signal or because memory allocation fails.

Modules/_sre/clinic/sre.c.h

+43-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Modules/_sre/sre.c

+127-5
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,85 @@ data_stack_grow(SRE_STATE* state, Py_ssize_t size)
218218
return 0;
219219
}
220220

221+
/* memory pool functions for SRE_REPEAT, this can avoid memory
222+
leak when SRE(match) function terminates abruptly.
223+
state->repeat_pool_used is a doubly-linked list, so that we
224+
can remove a SRE_REPEAT node from it.
225+
state->repeat_pool_unused is a singly-linked list, we put/get
226+
node at the head. */
227+
static SRE_REPEAT *
228+
repeat_pool_malloc(SRE_STATE *state)
229+
{
230+
SRE_REPEAT *repeat;
231+
232+
if (state->repeat_pool_unused) {
233+
/* remove from unused pool (singly-linked list) */
234+
repeat = state->repeat_pool_unused;
235+
state->repeat_pool_unused = repeat->pool_next;
236+
}
237+
else {
238+
repeat = PyObject_Malloc(sizeof(SRE_REPEAT));
239+
if (!repeat) {
240+
return NULL;
241+
}
242+
}
243+
244+
/* add to used pool (doubly-linked list) */
245+
SRE_REPEAT *temp = state->repeat_pool_used;
246+
if (temp) {
247+
temp->pool_prev = repeat;
248+
}
249+
repeat->pool_prev = NULL;
250+
repeat->pool_next = temp;
251+
state->repeat_pool_used = repeat;
252+
253+
return repeat;
254+
}
255+
256+
static void
257+
repeat_pool_free(SRE_STATE *state, SRE_REPEAT *repeat)
258+
{
259+
SRE_REPEAT *prev = repeat->pool_prev;
260+
SRE_REPEAT *next = repeat->pool_next;
261+
262+
/* remove from used pool (doubly-linked list) */
263+
if (prev) {
264+
prev->pool_next = next;
265+
}
266+
else {
267+
state->repeat_pool_used = next;
268+
}
269+
if (next) {
270+
next->pool_prev = prev;
271+
}
272+
273+
/* add to unused pool (singly-linked list) */
274+
repeat->pool_next = state->repeat_pool_unused;
275+
state->repeat_pool_unused = repeat;
276+
}
277+
278+
static void
279+
repeat_pool_clear(SRE_STATE *state)
280+
{
281+
/* clear used pool */
282+
SRE_REPEAT *next = state->repeat_pool_used;
283+
state->repeat_pool_used = NULL;
284+
while (next) {
285+
SRE_REPEAT *temp = next;
286+
next = temp->pool_next;
287+
PyObject_Free(temp);
288+
}
289+
290+
/* clear unused pool */
291+
next = state->repeat_pool_unused;
292+
state->repeat_pool_unused = NULL;
293+
while (next) {
294+
SRE_REPEAT *temp = next;
295+
next = temp->pool_next;
296+
PyObject_Free(temp);
297+
}
298+
}
299+
221300
/* generate 8-bit version */
222301

223302
#define SRE_CHAR Py_UCS1
@@ -463,6 +542,11 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
463542
state->pos = start;
464543
state->endpos = end;
465544

545+
#ifdef Py_DEBUG
546+
state->fail_after_count = pattern->fail_after_count;
547+
state->fail_after_exc = pattern->fail_after_exc; // borrowed ref
548+
#endif
549+
466550
return string;
467551
err:
468552
/* We add an explicit cast here because MSVC has a bug when
@@ -485,6 +569,8 @@ state_fini(SRE_STATE* state)
485569
/* See above PyMem_Del for why we explicitly cast here. */
486570
PyMem_Free((void*) state->mark);
487571
state->mark = NULL;
572+
/* SRE_REPEAT pool */
573+
repeat_pool_clear(state);
488574
}
489575

490576
/* calculate offset from start of string */
@@ -571,6 +657,9 @@ pattern_traverse(PatternObject *self, visitproc visit, void *arg)
571657
Py_VISIT(self->groupindex);
572658
Py_VISIT(self->indexgroup);
573659
Py_VISIT(self->pattern);
660+
#ifdef Py_DEBUG
661+
Py_VISIT(self->fail_after_exc);
662+
#endif
574663
return 0;
575664
}
576665

@@ -580,6 +669,9 @@ pattern_clear(PatternObject *self)
580669
Py_CLEAR(self->groupindex);
581670
Py_CLEAR(self->indexgroup);
582671
Py_CLEAR(self->pattern);
672+
#ifdef Py_DEBUG
673+
Py_CLEAR(self->fail_after_exc);
674+
#endif
583675
return 0;
584676
}
585677

@@ -642,7 +734,7 @@ _sre_SRE_Pattern_match_impl(PatternObject *self, PyTypeObject *cls,
642734
Py_ssize_t status;
643735
PyObject *match;
644736

645-
if (!state_init(&state, (PatternObject *)self, string, pos, endpos))
737+
if (!state_init(&state, self, string, pos, endpos))
646738
return NULL;
647739

648740
state.ptr = state.start;
@@ -1330,6 +1422,29 @@ _sre_SRE_Pattern___deepcopy__(PatternObject *self, PyObject *memo)
13301422
return Py_NewRef(self);
13311423
}
13321424

1425+
#ifdef Py_DEBUG
1426+
/*[clinic input]
1427+
_sre.SRE_Pattern._fail_after
1428+
1429+
count: int
1430+
exception: object
1431+
/
1432+
1433+
For debugging.
1434+
[clinic start generated code]*/
1435+
1436+
static PyObject *
1437+
_sre_SRE_Pattern__fail_after_impl(PatternObject *self, int count,
1438+
PyObject *exception)
1439+
/*[clinic end generated code: output=9a6bf12135ac50c2 input=ef80a45c66c5499d]*/
1440+
{
1441+
self->fail_after_count = count;
1442+
Py_INCREF(exception);
1443+
Py_XSETREF(self->fail_after_exc, exception);
1444+
Py_RETURN_NONE;
1445+
}
1446+
#endif /* Py_DEBUG */
1447+
13331448
static PyObject *
13341449
pattern_repr(PatternObject *obj)
13351450
{
@@ -1456,6 +1571,10 @@ _sre_compile_impl(PyObject *module, PyObject *pattern, int flags,
14561571
self->pattern = NULL;
14571572
self->groupindex = NULL;
14581573
self->indexgroup = NULL;
1574+
#ifdef Py_DEBUG
1575+
self->fail_after_count = -1;
1576+
self->fail_after_exc = NULL;
1577+
#endif
14591578

14601579
self->codesize = n;
14611580

@@ -2552,7 +2671,8 @@ pattern_new_match(_sremodulestate* module_state,
25522671
if (!match)
25532672
return NULL;
25542673

2555-
match->pattern = (PatternObject*)Py_NewRef(pattern);
2674+
Py_INCREF(pattern);
2675+
match->pattern = pattern;
25562676

25572677
match->string = Py_NewRef(state->string);
25582678

@@ -2688,7 +2808,7 @@ _sre_SRE_Scanner_match_impl(ScannerObject *self, PyTypeObject *cls)
26882808
return NULL;
26892809
}
26902810

2691-
match = pattern_new_match(module_state, (PatternObject*) self->pattern,
2811+
match = pattern_new_match(module_state, self->pattern,
26922812
state, status);
26932813

26942814
if (status == 0)
@@ -2738,7 +2858,7 @@ _sre_SRE_Scanner_search_impl(ScannerObject *self, PyTypeObject *cls)
27382858
return NULL;
27392859
}
27402860

2741-
match = pattern_new_match(module_state, (PatternObject*) self->pattern,
2861+
match = pattern_new_match(module_state, self->pattern,
27422862
state, status);
27432863

27442864
if (status == 0)
@@ -2774,7 +2894,8 @@ pattern_scanner(_sremodulestate *module_state,
27742894
return NULL;
27752895
}
27762896

2777-
scanner->pattern = Py_NewRef(self);
2897+
Py_INCREF(self);
2898+
scanner->pattern = self;
27782899

27792900
PyObject_GC_Track(scanner);
27802901
return (PyObject*) scanner;
@@ -2968,6 +3089,7 @@ static PyMethodDef pattern_methods[] = {
29683089
_SRE_SRE_PATTERN_SCANNER_METHODDEF
29693090
_SRE_SRE_PATTERN___COPY___METHODDEF
29703091
_SRE_SRE_PATTERN___DEEPCOPY___METHODDEF
3092+
_SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF
29713093
{"__class_getitem__", Py_GenericAlias, METH_O|METH_CLASS,
29723094
PyDoc_STR("See PEP 585")},
29733095
{NULL, NULL}

Modules/_sre/sre.h

+16-1
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,11 @@ typedef struct {
3434
int flags; /* flags used when compiling pattern source */
3535
PyObject *weakreflist; /* List of weak references */
3636
int isbytes; /* pattern type (1 - bytes, 0 - string, -1 - None) */
37+
#ifdef Py_DEBUG
38+
/* for simulation of user interruption */
39+
int fail_after_count;
40+
PyObject *fail_after_exc;
41+
#endif
3742
/* pattern code */
3843
Py_ssize_t codesize;
3944
SRE_CODE code[1];
@@ -68,6 +73,9 @@ typedef struct SRE_REPEAT_T {
6873
const SRE_CODE* pattern; /* points to REPEAT operator arguments */
6974
const void* last_ptr; /* helper to check for infinite loops */
7075
struct SRE_REPEAT_T *prev; /* points to previous repeat context */
76+
/* for SRE_REPEAT pool */
77+
struct SRE_REPEAT_T *pool_prev;
78+
struct SRE_REPEAT_T *pool_next;
7179
} SRE_REPEAT;
7280

7381
typedef struct {
@@ -94,12 +102,19 @@ typedef struct {
94102
size_t data_stack_base;
95103
/* current repeat context */
96104
SRE_REPEAT *repeat;
105+
/* SRE_REPEAT pool */
106+
SRE_REPEAT *repeat_pool_used;
107+
SRE_REPEAT *repeat_pool_unused;
97108
unsigned int sigcount;
109+
#ifdef Py_DEBUG
110+
int fail_after_count;
111+
PyObject *fail_after_exc;
112+
#endif
98113
} SRE_STATE;
99114

100115
typedef struct {
101116
PyObject_HEAD
102-
PyObject* pattern;
117+
PatternObject* pattern;
103118
SRE_STATE state;
104119
int executing;
105120
} ScannerObject;

0 commit comments

Comments
 (0)