Skip to content

gh-91760: More strict rules for numerical group references and group names in RE #91792

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
May 8, 2022
19 changes: 11 additions & 8 deletions Doc/library/re.rst
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,8 @@ The special characters are:
``(?P<name>...)``
Similar to regular parentheses, but the substring matched by the group is
accessible via the symbolic group name *name*. Group names must be valid
Python identifiers, and each group name must be defined only once within a
Python identifiers, and in bytes patterns they must contain only characters
in the ASCII range. Each group name must be defined only once within a
regular expression. A symbolic group is also a numbered group, just as if
the group were not named.

Expand All @@ -417,8 +418,9 @@ The special characters are:
| | * ``\1`` |
+---------------------------------------+----------------------------------+

.. deprecated:: 3.11
Group names containing non-ASCII characters in bytes patterns.
.. versionchanged:: 3.12
In bytes patterns group names must contain only characters in
the ASCII range.

.. index:: single: (?P=; in regular expressions

Expand Down Expand Up @@ -489,8 +491,8 @@ The special characters are:
will match with ``'<user@host.com>'`` as well as ``'user@host.com'``, but
not with ``'<user@host.com'`` nor ``'user@host.com>'``.

.. deprecated:: 3.11
Group *id* containing anything except ASCII digits.
.. versionchanged:: 3.12
Group *id* can only contain ASCII digits.


The special sequences consist of ``'\'`` and a character from the list below.
Expand Down Expand Up @@ -1001,9 +1003,10 @@ form.
Empty matches for the pattern are replaced when adjacent to a previous
non-empty match.

.. deprecated:: 3.11
Group *id* containing anything except ASCII digits.
Group names containing non-ASCII characters in bytes replacement strings.
.. versionchanged:: 3.12
Group *id* can only contain ASCII digits.
In bytes replacement strings group names must contain only characters
in the ASCII range.


.. function:: subn(pattern, repl, string, count=0, flags=0)
Expand Down
10 changes: 10 additions & 0 deletions Doc/whatsnew/3.12.rst
Original file line number Diff line number Diff line change
Expand Up @@ -114,3 +114,13 @@ Porting to Python 3.12

This section lists previously described changes and other bugfixes
that may require changes to your code.

Changes in the Python API
-------------------------

* More strict rules are now applied for numerical group references and
group names in regular expressions.
Only a sequence of ASCII digits is now accepted as a numerical reference.
The group name in bytes patterns and replacement strings can now only
contain ASCII letters, digits and underscores.
(Contributed by Serhiy Storchaka in :gh:`91760`.)
40 changes: 12 additions & 28 deletions Lib/re/_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,17 +291,13 @@ def error(self, msg, offset=0):
msg = msg.encode('ascii', 'backslashreplace').decode('ascii')
return error(msg, self.string, self.tell() - offset)

def checkgroupname(self, name, offset, nested):
def checkgroupname(self, name, offset):
if not (self.istext or name.isascii()):
msg = "bad character in group name %a" % name
raise self.error(msg, len(name) + offset)
if not name.isidentifier():
msg = "bad character in group name %r" % name
raise self.error(msg, len(name) + offset)
if not (self.istext or name.isascii()):
import warnings
warnings.warn(
"bad character in group name %a at position %d" %
(name, self.tell() - len(name) - offset),
DeprecationWarning, stacklevel=nested + 7
)

def _class_escape(source, escape):
# handle escape code inside character class
Expand Down Expand Up @@ -717,11 +713,11 @@ def _parse(source, state, verbose, nested, first=False):
if sourcematch("<"):
# named group: skip forward to end of name
name = source.getuntil(">", "group name")
source.checkgroupname(name, 1, nested)
source.checkgroupname(name, 1)
elif sourcematch("="):
# named backreference
name = source.getuntil(")", "group name")
source.checkgroupname(name, 1, nested)
source.checkgroupname(name, 1)
gid = state.groupdict.get(name)
if gid is None:
msg = "unknown group name %r" % name
Expand Down Expand Up @@ -782,20 +778,14 @@ def _parse(source, state, verbose, nested, first=False):
elif char == "(":
# conditional backreference group
condname = source.getuntil(")", "group name")
if condname.isidentifier():
source.checkgroupname(condname, 1, nested)
if not (condname.isdecimal() and condname.isascii()):
source.checkgroupname(condname, 1)
condgroup = state.groupdict.get(condname)
if condgroup is None:
msg = "unknown group name %r" % condname
raise source.error(msg, len(condname) + 1)
else:
try:
condgroup = int(condname)
if condgroup < 0:
raise ValueError
except ValueError:
msg = "bad character in group name %r" % condname
raise source.error(msg, len(condname) + 1) from None
condgroup = int(condname)
if not condgroup:
raise source.error("bad group number",
len(condname) + 1)
Expand Down Expand Up @@ -1022,20 +1012,14 @@ def addgroup(index, pos):
if not s.match("<"):
raise s.error("missing <")
name = s.getuntil(">", "group name")
if name.isidentifier():
s.checkgroupname(name, 1, -1)
if not (name.isdecimal() and name.isascii()):
s.checkgroupname(name, 1)
try:
index = groupindex[name]
except KeyError:
raise IndexError("unknown group name %r" % name) from None
else:
try:
index = int(name)
if index < 0:
raise ValueError
except ValueError:
raise s.error("bad character in group name %r" % name,
len(name) + 1) from None
index = int(name)
if index >= MAXGROUPS:
raise s.error("invalid group reference %d" % index,
len(name) + 1)
Expand Down
79 changes: 24 additions & 55 deletions Lib/test/test_re.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,21 +275,12 @@ def test_symbolic_groups_errors(self):
self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4)
self.checkPatternError('(?P=©)', "bad character in group name '©'", 4)
self.checkPatternError('(?(©)y)', "bad character in group name '©'", 3)
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name '\\xc2\\xb5' "
r"at position 4") as w:
re.compile(b'(?P<\xc2\xb5>x)')
self.assertEqual(w.filename, __file__)
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name '\\xc2\\xb5' "
r"at position 4"):
self.checkPatternError(b'(?P=\xc2\xb5)',
r"unknown group name '\xc2\xb5'", 4)
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name '\\xc2\\xb5' "
r"at position 3"):
self.checkPatternError(b'(?(\xc2\xb5)y)',
r"unknown group name '\xc2\xb5'", 3)
self.checkPatternError(b'(?P<\xc2\xb5>x)',
r"bad character in group name '\xc2\xb5'", 4)
self.checkPatternError(b'(?P=\xc2\xb5)',
r"bad character in group name '\xc2\xb5'", 4)
self.checkPatternError(b'(?(\xc2\xb5)y)',
r"bad character in group name '\xc2\xb5'", 3)

def test_symbolic_refs(self):
self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
Expand Down Expand Up @@ -322,35 +313,22 @@ def test_symbolic_refs_errors(self):
re.sub('(?P<a>x)', r'\g<ab>', 'xx')
self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx',
"bad character in group name '-1'", 3)
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name '\+1' "
r"at position 3") as w:
re.sub('(?P<a>x)', r'\g<+1>', 'xx')
self.assertEqual(w.filename, __file__)
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name '1_0' "
r"at position 3"):
re.sub('()'*10, r'\g<1_0>', 'xx')
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name ' 1 ' "
r"at position 3"):
re.sub('(?P<a>x)', r'\g< 1 >', 'xx')
self.checkTemplateError('(?P<a>x)', r'\g<+1>', 'xx',
"bad character in group name '+1'", 3)
self.checkTemplateError('()'*10, r'\g<1_0>', 'xx',
"bad character in group name '1_0'", 3)
self.checkTemplateError('(?P<a>x)', r'\g< 1 >', 'xx',
"bad character in group name ' 1 '", 3)
self.checkTemplateError('(?P<a>x)', r'\g<©>', 'xx',
"bad character in group name '©'", 3)
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name '\\xc2\\xb5' "
r"at position 3") as w:
with self.assertRaisesRegex(IndexError, "unknown group name '\xc2\xb5'"):
re.sub(b'(?P<a>x)', b'\\g<\xc2\xb5>', b'xx')
self.assertEqual(w.filename, __file__)
self.checkTemplateError(b'(?P<a>x)', b'\\g<\xc2\xb5>', b'xx',
r"bad character in group name '\xc2\xb5'", 3)
self.checkTemplateError('(?P<a>x)', r'\g<㊀>', 'xx',
"bad character in group name '㊀'", 3)
self.checkTemplateError('(?P<a>x)', r'\g<¹>', 'xx',
"bad character in group name '¹'", 3)
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name '१' "
r"at position 3"):
re.sub('(?P<a>x)', r'\g<१>', 'xx')
self.checkTemplateError('(?P<a>x)', r'\g<१>', 'xx',
"bad character in group name '१'", 3)

def test_re_subn(self):
self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
Expand Down Expand Up @@ -616,27 +594,18 @@ def test_re_groupref_exists_errors(self):
self.checkPatternError(r'(?P<a>)(?(0)a|b)', 'bad group number', 10)
self.checkPatternError(r'()(?(-1)a|b)',
"bad character in group name '-1'", 5)
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name '\+1' "
r"at position 5") as w:
re.compile(r'()(?(+1)a|b)')
self.assertEqual(w.filename, __file__)
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name '1_0' "
r"at position 23"):
re.compile(r'()'*10 + r'(?(1_0)a|b)')
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name ' 1 ' "
r"at position 5"):
re.compile(r'()(?( 1 )a|b)')
self.checkPatternError(r'()(?(+1)a|b)',
"bad character in group name '+1'", 5)
self.checkPatternError(r'()'*10 + r'(?(1_0)a|b)',
"bad character in group name '1_0'", 23)
self.checkPatternError(r'()(?( 1 )a|b)',
"bad character in group name ' 1 '", 5)
self.checkPatternError(r'()(?(㊀)a|b)',
"bad character in group name '㊀'", 5)
self.checkPatternError(r'()(?(¹)a|b)',
"bad character in group name '¹'", 5)
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name '१' "
r"at position 5"):
re.compile(r'()(?(१)a|b)')
self.checkPatternError(r'()(?(१)a|b)',
"bad character in group name '१'", 5)
self.checkPatternError(r'()(?(1',
"missing ), unterminated name", 5)
self.checkPatternError(r'()(?(1)a',
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Apply stricter rules for numerical group references and group names in
regular expressions. Only a sequence of ASCII digits is now accepted as
a numerical reference. The group name in
bytes patterns and replacement strings can now only contain ASCII letters,
digits and underscores.