From 2b46b2151978d5e5a4807aaeb82f5233414c6318 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 14 Nov 2024 15:30:44 +0200 Subject: [PATCH] gh-67877: Fix memory leaks in terminated RE matching If SRE(match) function terminates abruptly, either because of a signal or because memory allocation fails, allocated SRE_REPEAT blocks might be never released. --- Lib/test/test_re.py | 21 ++++ ...4-11-14-22-25-49.gh-issue-67877.G9hw0w.rst | 2 + Modules/_sre/clinic/sre.c.h | 44 ++++++- Modules/_sre/sre.c | 107 ++++++++++++++---- Modules/_sre/sre.h | 13 ++- Modules/_sre/sre_lib.h | 39 ++++++- 6 files changed, 198 insertions(+), 28 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-11-14-22-25-49.gh-issue-67877.G9hw0w.rst diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 7bc702ec89a4a7..af36806bffeaf3 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -2681,6 +2681,27 @@ def test_character_set_none(self): self.assertIsNone(re.search(p, s)) self.assertIsNone(re.search('(?s:.)' + p, s)) + def check_interrupt(self, pattern, string, maxcount): + class Interrupt(Exception): + pass + p = re.compile(pattern) + for n in range(maxcount): + p._fail_after(n, Interrupt) + try: + p.match(string) + return n + except Interrupt: + pass + + @unittest.skipUnless(hasattr(re.Pattern, '_fail_after'), 'requires debug build') + def test_memory_leaks(self): + self.check_interrupt(r'(.)*:', 'abc:', 100) + self.check_interrupt(r'([^:])*?:', 'abc:', 100) + self.check_interrupt(r'([^:])*+:', 'abc:', 100) + self.check_interrupt(r'(.){2,4}:', 'abc:', 100) + self.check_interrupt(r'([^:]){2,4}?:', 'abc:', 100) + self.check_interrupt(r'([^:]){2,4}+:', 'abc:', 100) + def get_debug_out(pat): with captured_stdout() as out: diff --git a/Misc/NEWS.d/next/Library/2024-11-14-22-25-49.gh-issue-67877.G9hw0w.rst b/Misc/NEWS.d/next/Library/2024-11-14-22-25-49.gh-issue-67877.G9hw0w.rst new file mode 100644 index 00000000000000..021b4ae2e100bc --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-11-14-22-25-49.gh-issue-67877.G9hw0w.rst @@ -0,0 +1,2 @@ +Fix memory leaks when :mod:`regular expression ` matching terminates +abruptly, either because of a signal or because memory allocation fails. diff --git a/Modules/_sre/clinic/sre.c.h b/Modules/_sre/clinic/sre.c.h index e287f3d5ad3991..87e4785a428468 100644 --- a/Modules/_sre/clinic/sre.c.h +++ b/Modules/_sre/clinic/sre.c.h @@ -985,6 +985,44 @@ PyDoc_STRVAR(_sre_SRE_Pattern___deepcopy____doc__, #define _SRE_SRE_PATTERN___DEEPCOPY___METHODDEF \ {"__deepcopy__", (PyCFunction)_sre_SRE_Pattern___deepcopy__, METH_O, _sre_SRE_Pattern___deepcopy____doc__}, +#if defined(Py_DEBUG) + +PyDoc_STRVAR(_sre_SRE_Pattern__fail_after__doc__, +"_fail_after($self, count, exception, /)\n" +"--\n" +"\n" +"For debugging."); + +#define _SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF \ + {"_fail_after", _PyCFunction_CAST(_sre_SRE_Pattern__fail_after), METH_FASTCALL, _sre_SRE_Pattern__fail_after__doc__}, + +static PyObject * +_sre_SRE_Pattern__fail_after_impl(PatternObject *self, int count, + PyObject *exception); + +static PyObject * +_sre_SRE_Pattern__fail_after(PatternObject *self, PyObject *const *args, Py_ssize_t nargs) +{ + PyObject *return_value = NULL; + int count; + PyObject *exception; + + if (!_PyArg_CheckPositional("_fail_after", nargs, 2, 2)) { + goto exit; + } + count = PyLong_AsInt(args[0]); + if (count == -1 && PyErr_Occurred()) { + goto exit; + } + exception = args[1]; + return_value = _sre_SRE_Pattern__fail_after_impl(self, count, exception); + +exit: + return return_value; +} + +#endif /* defined(Py_DEBUG) */ + PyDoc_STRVAR(_sre_compile__doc__, "compile($module, /, pattern, flags, code, groups, groupindex,\n" " indexgroup)\n" @@ -1474,4 +1512,8 @@ _sre_SRE_Scanner_search(ScannerObject *self, PyTypeObject *cls, PyObject *const } return _sre_SRE_Scanner_search_impl(self, cls); } -/*[clinic end generated code: output=afaa301d55957cb0 input=a9049054013a1b77]*/ + +#ifndef _SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF + #define _SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF +#endif /* !defined(_SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF) */ +/*[clinic end generated code: output=f8cb77f2261f0b2e input=a9049054013a1b77]*/ diff --git a/Modules/_sre/sre.c b/Modules/_sre/sre.c index 2c86f8869d8e58..d411a6ef5eca49 100644 --- a/Modules/_sre/sre.c +++ b/Modules/_sre/sre.c @@ -397,6 +397,25 @@ _sre_unicode_tolower_impl(PyObject *module, int character) return sre_lower_unicode(character); } +LOCAL(void) +state_clean_repeat_data(SRE_STATE* state) +{ + SRE_REPEAT *rep = state->repeat; + state->repeat = NULL; + while (rep) { + SRE_REPEAT *prev = rep->prev; + PyMem_Free(rep); + rep = prev; + } + rep = state->repstack; + state->repstack = NULL; + while (rep) { + SRE_REPEAT *next = rep->next; + PyMem_Free(rep); + rep = next; + } +} + LOCAL(void) state_reset(SRE_STATE* state) { @@ -406,8 +425,7 @@ state_reset(SRE_STATE* state) state->lastmark = -1; state->lastindex = -1; - state->repeat = NULL; - + state_clean_repeat_data(state); data_stack_dealloc(state); } @@ -511,6 +529,11 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string, state->pos = start; state->endpos = end; +#ifdef Py_DEBUG + state->fail_after_count = pattern->fail_after_count; + state->fail_after_exc = pattern->fail_after_exc; // borrowed ref +#endif + return string; err: /* We add an explicit cast here because MSVC has a bug when @@ -524,15 +547,21 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string, } LOCAL(void) -state_fini(SRE_STATE* state) +state_fini(SRE_STATE* state, PatternObject *pattern) { if (state->buffer.buf) PyBuffer_Release(&state->buffer); Py_XDECREF(state->string); + state_clean_repeat_data(state); data_stack_dealloc(state); /* See above PyMem_Free() for why we explicitly cast here. */ PyMem_Free((void*) state->mark); state->mark = NULL; +#ifdef Py_DEBUG + if (pattern) { + pattern->fail_after_count = -1; + } +#endif } /* calculate offset from start of string */ @@ -619,6 +648,9 @@ pattern_traverse(PatternObject *self, visitproc visit, void *arg) Py_VISIT(self->groupindex); Py_VISIT(self->indexgroup); Py_VISIT(self->pattern); +#ifdef Py_DEBUG + Py_VISIT(self->fail_after_exc); +#endif return 0; } @@ -628,6 +660,9 @@ pattern_clear(PatternObject *self) Py_CLEAR(self->groupindex); Py_CLEAR(self->indexgroup); Py_CLEAR(self->pattern); +#ifdef Py_DEBUG + Py_CLEAR(self->fail_after_exc); +#endif return 0; } @@ -690,7 +725,7 @@ _sre_SRE_Pattern_match_impl(PatternObject *self, PyTypeObject *cls, Py_ssize_t status; PyObject *match; - if (!state_init(&state, (PatternObject *)self, string, pos, endpos)) + if (!state_init(&state, self, string, pos, endpos)) return NULL; INIT_TRACE(&state); @@ -702,12 +737,12 @@ _sre_SRE_Pattern_match_impl(PatternObject *self, PyTypeObject *cls, TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr)); if (PyErr_Occurred()) { - state_fini(&state); + state_fini(&state, self); return NULL; } match = pattern_new_match(module_state, self, &state, status); - state_fini(&state); + state_fini(&state, self); return match; } @@ -747,12 +782,12 @@ _sre_SRE_Pattern_fullmatch_impl(PatternObject *self, PyTypeObject *cls, TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr)); if (PyErr_Occurred()) { - state_fini(&state); + state_fini(&state, self); return NULL; } match = pattern_new_match(module_state, self, &state, status); - state_fini(&state); + state_fini(&state, self); return match; } @@ -792,12 +827,12 @@ _sre_SRE_Pattern_search_impl(PatternObject *self, PyTypeObject *cls, TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr)); if (PyErr_Occurred()) { - state_fini(&state); + state_fini(&state, self); return NULL; } match = pattern_new_match(module_state, self, &state, status); - state_fini(&state); + state_fini(&state, self); return match; } @@ -826,7 +861,7 @@ _sre_SRE_Pattern_findall_impl(PatternObject *self, PyObject *string, list = PyList_New(0); if (!list) { - state_fini(&state); + state_fini(&state, self); return NULL; } @@ -888,12 +923,12 @@ _sre_SRE_Pattern_findall_impl(PatternObject *self, PyObject *string, state.start = state.ptr; } - state_fini(&state); + state_fini(&state, self); return list; error: Py_DECREF(list); - state_fini(&state); + state_fini(&state, self); return NULL; } @@ -989,7 +1024,7 @@ _sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string, list = PyList_New(0); if (!list) { - state_fini(&state); + state_fini(&state, self); return NULL; } @@ -1053,12 +1088,12 @@ _sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string, if (status < 0) goto error; - state_fini(&state); + state_fini(&state, self); return list; error: Py_DECREF(list); - state_fini(&state); + state_fini(&state, self); return NULL; } @@ -1185,7 +1220,7 @@ pattern_subx(_sremodulestate* module_state, list = PyList_New(0); if (!list) { Py_DECREF(filter); - state_fini(&state); + state_fini(&state, self); return NULL; } @@ -1271,7 +1306,7 @@ pattern_subx(_sremodulestate* module_state, goto error; } - state_fini(&state); + state_fini(&state, self); Py_DECREF(filter); @@ -1303,7 +1338,7 @@ pattern_subx(_sremodulestate* module_state, error: Py_DECREF(list); - state_fini(&state); + state_fini(&state, self); Py_DECREF(filter); return NULL; @@ -1381,6 +1416,29 @@ _sre_SRE_Pattern___deepcopy__(PatternObject *self, PyObject *memo) return Py_NewRef(self); } +#ifdef Py_DEBUG +/*[clinic input] +_sre.SRE_Pattern._fail_after + + count: int + exception: object + / + +For debugging. +[clinic start generated code]*/ + +static PyObject * +_sre_SRE_Pattern__fail_after_impl(PatternObject *self, int count, + PyObject *exception) +/*[clinic end generated code: output=9a6bf12135ac50c2 input=ef80a45c66c5499d]*/ +{ + self->fail_after_count = count; + Py_INCREF(exception); + Py_XSETREF(self->fail_after_exc, exception); + Py_RETURN_NONE; +} +#endif /* Py_DEBUG */ + static PyObject * pattern_repr(PatternObject *obj) { @@ -1506,6 +1564,11 @@ _sre_compile_impl(PyObject *module, PyObject *pattern, int flags, self->pattern = NULL; self->groupindex = NULL; self->indexgroup = NULL; +#ifdef Py_DEBUG + self->fail_after_count = -1; + self->fail_after_exc = NULL; + self->fail_after_exc = Py_NewRef(PyExc_RuntimeError); +#endif self->codesize = n; @@ -2680,7 +2743,7 @@ scanner_dealloc(ScannerObject* self) PyTypeObject *tp = Py_TYPE(self); PyObject_GC_UnTrack(self); - state_fini(&self->state); + state_fini(&self->state, self->pattern); (void)scanner_clear(self); tp->tp_free(self); Py_DECREF(tp); @@ -2826,7 +2889,8 @@ pattern_scanner(_sremodulestate *module_state, return NULL; } - scanner->pattern = Py_NewRef(self); + Py_INCREF(self); + scanner->pattern = self; PyObject_GC_Track(scanner); return (PyObject*) scanner; @@ -3020,6 +3084,7 @@ static PyMethodDef pattern_methods[] = { _SRE_SRE_PATTERN_SCANNER_METHODDEF _SRE_SRE_PATTERN___COPY___METHODDEF _SRE_SRE_PATTERN___DEEPCOPY___METHODDEF + _SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF {"__class_getitem__", Py_GenericAlias, METH_O|METH_CLASS, PyDoc_STR("See PEP 585")}, {NULL, NULL} diff --git a/Modules/_sre/sre.h b/Modules/_sre/sre.h index 83d89d57b11199..149778c3416cd3 100644 --- a/Modules/_sre/sre.h +++ b/Modules/_sre/sre.h @@ -34,6 +34,11 @@ typedef struct { int flags; /* flags used when compiling pattern source */ PyObject *weakreflist; /* List of weak references */ int isbytes; /* pattern type (1 - bytes, 0 - string, -1 - None) */ +#ifdef Py_DEBUG + /* for simulation of user interruption */ + int fail_after_count; + PyObject *fail_after_exc; +#endif /* pattern code */ Py_ssize_t codesize; SRE_CODE code[1]; @@ -68,6 +73,7 @@ typedef struct SRE_REPEAT_T { const SRE_CODE* pattern; /* points to REPEAT operator arguments */ const void* last_ptr; /* helper to check for infinite loops */ struct SRE_REPEAT_T *prev; /* points to previous repeat context */ + struct SRE_REPEAT_T *next; /* links repeat contexts in state->repstack */ } SRE_REPEAT; typedef struct { @@ -95,12 +101,17 @@ typedef struct { size_t data_stack_base; /* current repeat context */ SRE_REPEAT *repeat; + SRE_REPEAT *repstack; unsigned int sigcount; +#ifdef Py_DEBUG + int fail_after_count; + PyObject *fail_after_exc; +#endif } SRE_STATE; typedef struct { PyObject_HEAD - PyObject* pattern; + PatternObject* pattern; SRE_STATE state; int executing; } ScannerObject; diff --git a/Modules/_sre/sre_lib.h b/Modules/_sre/sre_lib.h index 97fbb0a75e54b6..d39a7fc6da88e8 100644 --- a/Modules/_sre/sre_lib.h +++ b/Modules/_sre/sre_lib.h @@ -560,13 +560,29 @@ typedef struct { Py_ssize_t last_ctx_pos; } SRE(match_context); -#define MAYBE_CHECK_SIGNALS \ +#define _MAYBE_CHECK_SIGNALS \ do { \ if ((0 == (++sigcount & 0xfff)) && PyErr_CheckSignals()) { \ RETURN_ERROR(SRE_ERROR_INTERRUPTED); \ } \ } while (0) +#ifdef Py_DEBUG +# define MAYBE_CHECK_SIGNALS \ + do { \ + _MAYBE_CHECK_SIGNALS; \ + if (state->fail_after_count == 0) { \ + PyErr_SetNone(state->fail_after_exc); \ + RETURN_ERROR(SRE_ERROR_INTERRUPTED); \ + } \ + if (state->fail_after_count > 0) { \ + state->fail_after_count--; \ + } \ + } while (0) +#else +# define MAYBE_CHECK_SIGNALS _MAYBE_CHECK_SIGNALS +#endif /* Py_DEBUG */ + #ifdef HAVE_COMPUTED_GOTOS #ifndef USE_COMPUTED_GOTOS #define USE_COMPUTED_GOTOS 1 @@ -1120,21 +1136,20 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) pattern[1], pattern[2])); /* install new repeat context */ - /* TODO(https://github.com/python/cpython/issues/67877): Fix this - * potential memory leak. */ ctx->u.rep = (SRE_REPEAT*) PyMem_Malloc(sizeof(*ctx->u.rep)); if (!ctx->u.rep) { - PyErr_NoMemory(); - RETURN_FAILURE; + RETURN_ERROR(SRE_ERROR_MEMORY); } ctx->u.rep->count = -1; ctx->u.rep->pattern = pattern; ctx->u.rep->prev = state->repeat; + ctx->u.rep->next = NULL; ctx->u.rep->last_ptr = NULL; state->repeat = ctx->u.rep; state->ptr = ptr; DO_JUMP(JUMP_REPEAT, jump_repeat, pattern+pattern[0]); + assert(state->repeat == ctx->u.rep); state->repeat = ctx->u.rep->prev; PyMem_Free(ctx->u.rep); @@ -1203,9 +1218,16 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) /* cannot match more repeated items here. make sure the tail matches */ + assert(state->repeat == ctx->u.rep); + assert(state->repeat->next == NULL); + state->repeat->next = state->repstack; + state->repstack = state->repeat; state->repeat = ctx->u.rep->prev; DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, pattern); + assert(state->repstack == ctx->u.rep); state->repeat = ctx->u.rep; // restore repeat before return + state->repstack = state->repeat->next; + state->repeat->next = NULL; RETURN_ON_SUCCESS(ret); state->ptr = ptr; @@ -1241,6 +1263,10 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) } /* see if the tail matches */ + assert(state->repeat == ctx->u.rep); + assert(state->repeat->next == NULL); + state->repeat->next = state->repstack; + state->repstack = state->repeat; state->repeat = ctx->u.rep->prev; LASTMARK_SAVE(); @@ -1249,7 +1275,10 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, pattern); SRE_REPEAT *repeat_of_tail = state->repeat; + assert(state->repstack == ctx->u.rep); state->repeat = ctx->u.rep; // restore repeat before return + state->repstack = state->repeat->next; + state->repeat->next = NULL; if (ret) { if (repeat_of_tail)