From 663910de0256a4e7e46536ef9d8277781f24db9a Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Fri, 20 Oct 2023 15:26:41 +0200 Subject: [PATCH 01/14] gh-102988: email parseaddr() now rejects malformed address Detect email address parsing errors and return empty tuple to indicate the parsing error (old API). Add an optional 'strict' parameter to getaddresses() and parseaddr() functions. Patch by Thomas Dwyer. Co-Authored-By: Thomas Dwyer --- Doc/library/email.utils.rst | 17 ++- Doc/whatsnew/3.13.rst | 11 ++ Lib/email/utils.py | 121 ++++++++++++++-- Lib/test/test_email/test_email.py | 132 ++++++++++++++---- ...-10-20-15-28-08.gh-issue-102988.dStNO7.rst | 6 + 5 files changed, 251 insertions(+), 36 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2023-10-20-15-28-08.gh-issue-102988.dStNO7.rst diff --git a/Doc/library/email.utils.rst b/Doc/library/email.utils.rst index 345b64001c1ace..5d03c08e7914d9 100644 --- a/Doc/library/email.utils.rst +++ b/Doc/library/email.utils.rst @@ -65,6 +65,11 @@ of the new API. *email address* parts. Returns a tuple of that information, unless the parse fails, in which case a 2-tuple of ``('', '')`` is returned. + If *strict* is true, use a strict parser which rejects malformed inputs. + + .. versionchanged:: 3.13 + Add *strict* optional parameter and reject malformed inputs by default. + .. function:: formataddr(pair, charset='utf-8') @@ -82,12 +87,15 @@ of the new API. Added the *charset* option. -.. function:: getaddresses(fieldvalues) +.. function:: getaddresses(fieldvalues, *, strict=True) This method returns a list of 2-tuples of the form returned by ``parseaddr()``. *fieldvalues* is a sequence of header field values as might be returned by - :meth:`Message.get_all `. Here's a simple - example that gets all the recipients of a message:: + :meth:`Message.get_all `. + + If *strict* is true, use a strict parser which rejects malformed inputs. + + Here's a simple example that gets all the recipients of a message:: from email.utils import getaddresses @@ -97,6 +105,9 @@ of the new API. resent_ccs = msg.get_all('resent-cc', []) all_recipients = getaddresses(tos + ccs + resent_tos + resent_ccs) + .. versionchanged:: 3.13 + Add *strict* optional parameter and reject malformed inputs by default. + .. function:: parsedate(date) diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index 534813f3659c9d..b4451c83c55753 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -192,6 +192,17 @@ doctest :attr:`doctest.TestResults.skipped` attributes. (Contributed by Victor Stinner in :gh:`108794`.) +email +----- + +* :func:`email.utils.getaddresses` and :func:`email.utils.parseaddr` now return + ``('', '')`` 2-tuples in more situations where invalid email addresses are + encountered instead of potentially inaccurate values. Add optional *strict* + parameter to these two functions: use ``strict=False`` to get the old + behavior, accept malformed inputs. + (Contributed by Thomas Dwyer for :gh:`102988` to improve the CVE-2023-27043 + fix.) + glob ---- diff --git a/Lib/email/utils.py b/Lib/email/utils.py index a49a8fa986ce0c..1da8f7ad6dd9cb 100644 --- a/Lib/email/utils.py +++ b/Lib/email/utils.py @@ -43,6 +43,7 @@ specialsre = re.compile(r'[][\\()<>@,:;".]') escapesre = re.compile(r'[\\"]') + def _has_surrogates(s): """Return True if s contains surrogate-escaped binary data.""" # This check is based on the fact that unless there are surrogates, utf8 @@ -103,12 +104,97 @@ def formataddr(pair, charset='utf-8'): return address +def _strip_quoted_realname(addr): + """Remove real name between quotes.""" + pos = 0 + start = None + remove = [] + while pos < len(addr): + if addr[pos] == '"': + if start is None: + start = pos + else: + remove.append((start, pos)) + start = None + pos += 1 + + if not remove: + return addr + + result = [] + pos = 0 + for start, stop in remove: + result.append(addr[pos:start]) + pos = stop + 1 + result.append(addr[pos:]) + + return ''.join(result) -def getaddresses(fieldvalues): - """Return a list of (REALNAME, EMAIL) for each fieldvalue.""" - all = COMMASPACE.join(str(v) for v in fieldvalues) - a = _AddressList(all) - return a.addresslist + +def getaddresses(fieldvalues, *, strict=True): + """Return a list of (REALNAME, EMAIL) or ('','') for each fieldvalue. + + When parsing fails for a fieldvalue, a 2-tuple of ('', '') is returned in + its place. + + If strict is True, use a strict parser which rejects malformed inputs. + In this case, if the resulting list of parsed addresses is greater than + the number of fieldvalues in the input list, a parsing error has occurred + and consequently a list containing a single empty 2-tuple [('', '')] is + returned in its place. This is done to avoid invalid output. + + Malformed input: getaddresses(['alice@example.com ']) + Invalid output: [('', 'alice@example.com'), ('', 'bob@example.com')] + Safe output: [('', '')] + + """ + if not strict: + all = COMMASPACE.join(str(v) for v in fieldvalues) + a = _AddressList(all) + return a.addresslist + + fieldvalues = [str(v) for v in fieldvalues] + fieldvalues = _pre_parse_validation(fieldvalues) + addr = COMMASPACE.join(v for v in fieldvalues) + a = _AddressList(addr) + result = _post_parse_validation(a.addresslist) + + # Treat output as invalid if the number of addresses is not equal to the + # expected number of addresses. + n = 0 + for v in fieldvalues: + # When a comma is used in the Real Name part it is not a deliminator. + # So strip those out before counting the commas + v = _strip_quoted_realname(v) + # Expected number of addresses: 1 + number of commas + n += 1 + v.count(',') + if len(result) != n: + return [('', '')] + + return result + + +def _pre_parse_validation(email_header_fields): + accepted_values = [] + for v in email_header_fields: + s = v.replace('\\(', '').replace('\\)', '') + if s.count('(') != s.count(')'): + v = "('', '')" + accepted_values.append(v) + + return accepted_values + + +def _post_parse_validation(parsed_email_header_tuples): + accepted_values = [] + # The parser would have parsed a correctly formatted domain-literal + # The existence of an [ after parsing indicates a parsing failure + for v in parsed_email_header_tuples: + if '[' in v[1]: + v = ('', '') + accepted_values.append(v) + + return accepted_values def _format_timetuple_and_zone(timetuple, zone): @@ -207,16 +293,33 @@ def parsedate_to_datetime(data): tzinfo=datetime.timezone(datetime.timedelta(seconds=tz))) -def parseaddr(addr): +def parseaddr(addr, *, strict=True): """ Parse addr into its constituent realname and email address parts. Return a tuple of realname and email address, unless the parse fails, in which case return a 2-tuple of ('', ''). + + If strict is True, use a strict parser which rejects malformed inputs. """ - addrs = _AddressList(addr).addresslist - if not addrs: - return '', '' + if not strict: + addrs = _AddressList(addr).addresslist + if not addrs: + return ('', '') + return addrs[0] + + if isinstance(addr, list): + addr = addr[0] + + if not isinstance(addr, str): + return ('', '') + + addr = _pre_parse_validation([addr])[0] + addrs = _post_parse_validation(_AddressList(addr).addresslist) + + if not addrs or len(addrs) > 1: + return ('', '') + return addrs[0] diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py index 512464f87162cd..db469f78ce5586 100644 --- a/Lib/test/test_email/test_email.py +++ b/Lib/test/test_email/test_email.py @@ -3320,32 +3320,116 @@ def test_getaddresses(self): [('Al Person', 'aperson@dom.ain'), ('Bud Person', 'bperson@dom.ain')]) - def test_getaddresses_comma_in_name(self): - """GH-106669 regression test.""" - self.assertEqual( - utils.getaddresses( - [ - '"Bud, Person" ', - 'aperson@dom.ain (Al Person)', - '"Mariusz Felisiak" ', - ] - ), - [ - ('Bud, Person', 'bperson@dom.ain'), - ('Al Person', 'aperson@dom.ain'), - ('Mariusz Felisiak', 'to@example.com'), - ], - ) + def test_parsing_errors(self): + """Test for parsing errors from CVE-2023-27043 and CVE-2019-16056""" + alice = 'alice@example.org' + bob = 'bob@example.com' + empty = ('', '') + + # Test utils.getaddresses() and utils.parseaddr() on malformed email + # addresses: default behavior (strict=True) rejects malformed address, + # and strict=True which tolerates malformed address. + for invalid_separator, expected_non_strict in ( + ('(', [(f'<{bob}>', alice)]), + (')', [('', alice), empty, ('', bob)]), + ('<', [('', alice), empty, ('', bob), empty]), + ('>', [('', alice), empty, ('', bob)]), + ('[', [('', f'{alice}[<{bob}>]')]), + (']', [('', alice), empty, ('', bob)]), + ('@', [empty, empty, ('', bob)]), + (';', [('', alice), empty, ('', bob)]), + (':', [('', alice), ('', bob)]), + ('.', [('', alice + '.'), ('', bob)]), + ('"', [('', alice), ('', f'<{bob}>')]), + ): + address = f'{alice}{invalid_separator}<{bob}>' + with self.subTest(address=address): + self.assertEqual(utils.getaddresses([address]), + [empty]) + self.assertEqual(utils.getaddresses([address], strict=False), + expected_non_strict) + + self.assertEqual(utils.parseaddr([address]), + empty) + self.assertEqual(utils.parseaddr([address], strict=False), + ('', address)) + + # Comma (',') is treated differently depending on strict parameter. + # Comma without quotes. + address = f'{alice},<{bob}>' + self.assertEqual(utils.getaddresses([address]), + [('', alice), ('', bob)]) + self.assertEqual(utils.getaddresses([address], strict=False), + [('', alice), ('', bob)]) + self.assertEqual(utils.parseaddr([address]), + empty) + self.assertEqual(utils.parseaddr([address], strict=False), + ('', address)) + + # Comma with quotes. + address = '"Alice, alice@example.org" ' + expected_strict = ('Alice, alice@example.org', 'bob@example.com') + self.assertEqual(utils.getaddresses([address]), [expected_strict]) + self.assertEqual(utils.getaddresses([address], strict=False), [expected_strict]) + self.assertEqual(utils.parseaddr([address]), expected_strict) + self.assertEqual(utils.parseaddr([address], strict=False), + ('', address)) + + # Two addresses with quotes separated by comma. + address = '"Jane Doe" , "John Doe" ' + self.assertEqual(utils.getaddresses([address]), + [('Jane Doe', 'jane@example.net'), + ('John Doe', 'john@example.net')]) + self.assertEqual(utils.getaddresses([address], strict=False), + [('Jane Doe', 'jane@example.net'), + ('John Doe', 'john@example.net')]) + self.assertEqual(utils.parseaddr([address]), empty) + self.assertEqual(utils.parseaddr([address], strict=False), + ('', address)) def test_getaddresses_nasty(self): - eq = self.assertEqual - eq(utils.getaddresses(['foo: ;']), [('', '')]) - eq(utils.getaddresses( - ['[]*-- =~$']), - [('', ''), ('', ''), ('', '*--')]) - eq(utils.getaddresses( - ['foo: ;', '"Jason R. Mastaler" ']), - [('', ''), ('Jason R. Mastaler', 'jason@dom.ain')]) + for addresses, expected in ( + (['"Sürname, Firstname" '], + [('Sürname, Firstname', 'to@example.com')]), + + (['foo: ;'], + [('', '')]), + + (['foo: ;', '"Jason R. Mastaler" '], + [('', ''), ('Jason R. Mastaler', 'jason@dom.ain')]), + + ([r'Pete(A nice \) chap) '], + [('Pete (A nice ) chap his account his host)', 'pete@silly.test')]), + + (['(Empty list)(start)Undisclosed recipients :(nobody(I know))'], + [('', '')]), + + (['Mary <@machine.tld:mary@example.net>, , jdoe@test . example'], + [('Mary', 'mary@example.net'), ('', ''), ('', 'jdoe@test.example')]), + + (['John Doe '], + [('John Doe (comment)', 'jdoe@machine.example')]), + + (['"Mary Smith: Personal Account" '], + [('Mary Smith: Personal Account', 'smith@home.example')]), + + (['Undisclosed recipients:;'], + [('', '')]), + + ([r', "Giant; \"Big\" Box" '], + [('', 'boss@nil.test'), ('Giant; "Big" Box', 'bob@example.net')]), + ): + with self.subTest(addresses=addresses): + self.assertEqual(utils.getaddresses(addresses), + expected) + self.assertEqual(utils.getaddresses(addresses, strict=False), + expected) + + addresses = ['[]*-- =~$'] + self.assertEqual(utils.getaddresses(addresses), + [('', '')]) + self.assertEqual(utils.getaddresses(addresses, strict=False), + [('', ''), ('', ''), ('', '*--')]) def test_getaddresses_embedded_comment(self): """Test proper handling of a nested comment""" diff --git a/Misc/NEWS.d/next/Library/2023-10-20-15-28-08.gh-issue-102988.dStNO7.rst b/Misc/NEWS.d/next/Library/2023-10-20-15-28-08.gh-issue-102988.dStNO7.rst new file mode 100644 index 00000000000000..d4703588cf12a8 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-10-20-15-28-08.gh-issue-102988.dStNO7.rst @@ -0,0 +1,6 @@ +:func:`email.utils.getaddresses` and :func:`email.utils.parseaddr` now +return ``('', '')`` 2-tuples in more situations where invalid email +addresses are encountered instead of potentially inaccurate values. Add +optional *strict* parameter to these two functions: use ``strict=False`` to +get the old behavior, accept malformed inputs. Patch by Thomas Dwyer to +improve the CVE-2023-27043 fix. From 730500c850f4b5c941f63ec7ab2e9a4f764aca72 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 30 Nov 2023 14:07:47 +0100 Subject: [PATCH 02/14] Fast path --- Lib/email/utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Lib/email/utils.py b/Lib/email/utils.py index 1da8f7ad6dd9cb..b877ff8c2bd537 100644 --- a/Lib/email/utils.py +++ b/Lib/email/utils.py @@ -106,6 +106,10 @@ def formataddr(pair, charset='utf-8'): def _strip_quoted_realname(addr): """Remove real name between quotes.""" + if '"' not in addr: + # Fast path + return addr + pos = 0 start = None remove = [] @@ -118,9 +122,6 @@ def _strip_quoted_realname(addr): start = None pos += 1 - if not remove: - return addr - result = [] pos = 0 for start, stop in remove: From 1a377ecb99d5a497cd57355a331c5aaf6a135f05 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 30 Nov 2023 14:35:48 +0100 Subject: [PATCH 03/14] Handle escaped quotes --- Lib/email/utils.py | 15 ++++++++++----- Lib/test/test_email/test_email.py | 31 ++++++++++++++++++++++++++++++- 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/Lib/email/utils.py b/Lib/email/utils.py index b877ff8c2bd537..ede7a608951a0b 100644 --- a/Lib/email/utils.py +++ b/Lib/email/utils.py @@ -104,8 +104,8 @@ def formataddr(pair, charset='utf-8'): return address -def _strip_quoted_realname(addr): - """Remove real name between quotes.""" +def _strip_quoted_realnames(addr): + """Strip real names between quotes.""" if '"' not in addr: # Fast path return addr @@ -113,13 +113,18 @@ def _strip_quoted_realname(addr): pos = 0 start = None remove = [] + previous = None while pos < len(addr): - if addr[pos] == '"': + ch = addr[pos] + if ch == '"' and previous != '\\': if start is None: start = pos else: remove.append((start, pos)) start = None + elif ch == '\\' and previous == '\\': + previous = None + previous = ch pos += 1 result = [] @@ -165,8 +170,8 @@ def getaddresses(fieldvalues, *, strict=True): n = 0 for v in fieldvalues: # When a comma is used in the Real Name part it is not a deliminator. - # So strip those out before counting the commas - v = _strip_quoted_realname(v) + # So strip those out before counting the commas. + v = _strip_quoted_realnames(v) # Expected number of addresses: 1 + number of commas n += 1 + v.count(',') if len(result) != n: diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py index db469f78ce5586..b030ef7fe6a45c 100644 --- a/Lib/test/test_email/test_email.py +++ b/Lib/test/test_email/test_email.py @@ -3366,7 +3366,7 @@ def test_parsing_errors(self): self.assertEqual(utils.parseaddr([address], strict=False), ('', address)) - # Comma with quotes. + # Real name between quotes containing comma. address = '"Alice, alice@example.org" ' expected_strict = ('Alice, alice@example.org', 'bob@example.com') self.assertEqual(utils.getaddresses([address]), [expected_strict]) @@ -3375,6 +3375,24 @@ def test_parsing_errors(self): self.assertEqual(utils.parseaddr([address], strict=False), ('', address)) + # Valid parenthesis in comments. + address = 'alice@example.org (Alice)' + expected_strict = ('Alice', 'alice@example.org') + self.assertEqual(utils.getaddresses([address]), [expected_strict]) + self.assertEqual(utils.getaddresses([address], strict=False), [expected_strict]) + self.assertEqual(utils.parseaddr([address]), expected_strict) + self.assertEqual(utils.parseaddr([address], strict=False), + ('', address)) + + # Invalid parenthesis in comments. + address = 'alice@example.org )Alice(' + self.assertEqual(utils.getaddresses([address]), [empty]) + self.assertEqual(utils.getaddresses([address], strict=False), + [('', 'alice@example.org'), ('', ''), ('', 'Alice')]) + self.assertEqual(utils.parseaddr([address]), empty) + self.assertEqual(utils.parseaddr([address], strict=False), + ('', address)) + # Two addresses with quotes separated by comma. address = '"Jane Doe" , "John Doe" ' self.assertEqual(utils.getaddresses([address]), @@ -3620,6 +3638,17 @@ def test_mime_classes_policy_argument(self): m = cls(*constructor, policy=email.policy.default) self.assertIs(m.policy, email.policy.default) + def test_strip_quoted_realnames(self): + def check(addr, expected): + self.assertEqual(utils._strip_quoted_realnames(addr), expected) + + check('Jane Doe , John Doe ', + 'Jane Doe , John Doe ') + check('"Jane Doe" , "John Doe" ', + ' , ') + check(r'"Jane \"Doe"." ', + ' ') + # Test the iterator/generators class TestIterators(TestEmailBase): From da25ed3b8623eaedce731c95a2ae953df187013a Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 30 Nov 2023 14:54:07 +0100 Subject: [PATCH 04/14] Check parenthesis --- Lib/email/utils.py | 45 +++++++++++++++++++++++-------- Lib/test/test_email/test_email.py | 25 ++++++++++++++++- 2 files changed, 58 insertions(+), 12 deletions(-) diff --git a/Lib/email/utils.py b/Lib/email/utils.py index ede7a608951a0b..3802520bc79bd9 100644 --- a/Lib/email/utils.py +++ b/Lib/email/utils.py @@ -104,28 +104,37 @@ def formataddr(pair, charset='utf-8'): return address +def _iter_escaped_chars(addr): + pos = 0 + escape = False + for pos, ch in enumerate(addr): + if ch == '\\' and not escape: + escape = True + else: + if escape: + yield (pos, '\\' + ch) + else: + yield (pos, ch) + escape = False + if escape: + yield (pos, '\\') + + def _strip_quoted_realnames(addr): """Strip real names between quotes.""" if '"' not in addr: # Fast path return addr - pos = 0 start = None remove = [] - previous = None - while pos < len(addr): - ch = addr[pos] - if ch == '"' and previous != '\\': + for pos, ch in _iter_escaped_chars(addr): + if ch == '"': if start is None: start = pos else: remove.append((start, pos)) start = None - elif ch == '\\' and previous == '\\': - previous = None - previous = ch - pos += 1 result = [] pos = 0 @@ -180,11 +189,25 @@ def getaddresses(fieldvalues, *, strict=True): return result +def _check_parenthesis(addr): + # Ignore parenthesis in quoted real names. + addr = _strip_quoted_realnames(addr) + + opens = 0 + for pos, ch in _iter_escaped_chars(addr): + if ch == '(': + opens += 1 + elif ch == ')': + opens -= 1 + if opens < 0: + return False + return (opens == 0) + + def _pre_parse_validation(email_header_fields): accepted_values = [] for v in email_header_fields: - s = v.replace('\\(', '').replace('\\)', '') - if s.count('(') != s.count(')'): + if not _check_parenthesis(v): v = "('', '')" accepted_values.append(v) diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py index b030ef7fe6a45c..3e5f3c57fa4286 100644 --- a/Lib/test/test_email/test_email.py +++ b/Lib/test/test_email/test_email.py @@ -3638,6 +3638,19 @@ def test_mime_classes_policy_argument(self): m = cls(*constructor, policy=email.policy.default) self.assertIs(m.policy, email.policy.default) + def test_iter_escaped_chars(self): + self.assertEqual(list(utils._iter_escaped_chars(r'a\\b\"c\\"d')), + [(0, 'a'), + (2, '\\\\'), + (3, 'b'), + (5, '\\"'), + (6, 'c'), + (8, '\\\\'), + (9, '"'), + (10, 'd')]) + self.assertEqual(list(utils._iter_escaped_chars('a\\')), + [(0, 'a'), (1, '\\')]) + def test_strip_quoted_realnames(self): def check(addr, expected): self.assertEqual(utils._strip_quoted_realnames(addr), expected) @@ -3646,9 +3659,19 @@ def check(addr, expected): 'Jane Doe , John Doe ') check('"Jane Doe" , "John Doe" ', ' , ') - check(r'"Jane \"Doe"." ', + check(r'"Jane \"Doe\"." ', ' ') + def test_check_parenthesis(self): + addr = 'alice@example.net' + self.assertTrue(utils._check_parenthesis(f'{addr} (Alice)')) + self.assertFalse(utils._check_parenthesis(f'{addr} )Alice(')) + self.assertFalse(utils._check_parenthesis(f'{addr} (Alice))')) + self.assertFalse(utils._check_parenthesis(f'{addr} ((Alice)')) + + # Ignore real name between quotes + self.assertTrue(utils._check_parenthesis(f'")Alice((" {addr}')) + # Test the iterator/generators class TestIterators(TestEmailBase): From 2b199bb653fa5f9e2c2b5c24834bf58b32793787 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 30 Nov 2023 15:25:49 +0100 Subject: [PATCH 05/14] Credit myself in the Changelog entry --- Doc/whatsnew/3.13.rst | 4 ++-- .../Library/2023-10-20-15-28-08.gh-issue-102988.dStNO7.rst | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index b4451c83c55753..c50cd5178a07c8 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -200,8 +200,8 @@ email encountered instead of potentially inaccurate values. Add optional *strict* parameter to these two functions: use ``strict=False`` to get the old behavior, accept malformed inputs. - (Contributed by Thomas Dwyer for :gh:`102988` to improve the CVE-2023-27043 - fix.) + (Contributed by Thomas Dwyer and Victor Stinner for :gh:`102988` to improve + the CVE-2023-27043 fix.) glob ---- diff --git a/Misc/NEWS.d/next/Library/2023-10-20-15-28-08.gh-issue-102988.dStNO7.rst b/Misc/NEWS.d/next/Library/2023-10-20-15-28-08.gh-issue-102988.dStNO7.rst index d4703588cf12a8..72d3c75f77ade0 100644 --- a/Misc/NEWS.d/next/Library/2023-10-20-15-28-08.gh-issue-102988.dStNO7.rst +++ b/Misc/NEWS.d/next/Library/2023-10-20-15-28-08.gh-issue-102988.dStNO7.rst @@ -2,5 +2,5 @@ return ``('', '')`` 2-tuples in more situations where invalid email addresses are encountered instead of potentially inaccurate values. Add optional *strict* parameter to these two functions: use ``strict=False`` to -get the old behavior, accept malformed inputs. Patch by Thomas Dwyer to -improve the CVE-2023-27043 fix. +get the old behavior, accept malformed inputs. Patch by Thomas Dwyer and Victor +Stinner to improve the CVE-2023-27043 fix. From 9b95376836bbbc0511e3e6f06f71d6c343146441 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 5 Dec 2023 22:41:21 +0100 Subject: [PATCH 06/14] Add email.utils.supports_strict_parsing attribute --- Doc/whatsnew/3.13.rst | 2 ++ Lib/email/utils.py | 2 ++ Lib/test/test_email/test_email.py | 4 ++++ .../Library/2023-10-20-15-28-08.gh-issue-102988.dStNO7.rst | 4 +++- 4 files changed, 11 insertions(+), 1 deletion(-) diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index c50cd5178a07c8..d1063461063161 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -200,6 +200,8 @@ email encountered instead of potentially inaccurate values. Add optional *strict* parameter to these two functions: use ``strict=False`` to get the old behavior, accept malformed inputs. + ``getattr(email.utils, 'supports_strict_parsing', False)`` can be use to + check if the *strict* paramater is available. (Contributed by Thomas Dwyer and Victor Stinner for :gh:`102988` to improve the CVE-2023-27043 fix.) diff --git a/Lib/email/utils.py b/Lib/email/utils.py index 3802520bc79bd9..754d86b177e7bc 100644 --- a/Lib/email/utils.py +++ b/Lib/email/utils.py @@ -146,6 +146,8 @@ def _strip_quoted_realnames(addr): return ''.join(result) +supports_strict_parsing = True + def getaddresses(fieldvalues, *, strict=True): """Return a list of (REALNAME, EMAIL) or ('','') for each fieldvalue. diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py index 3e5f3c57fa4286..49134cc3f48ee8 100644 --- a/Lib/test/test_email/test_email.py +++ b/Lib/test/test_email/test_email.py @@ -16,6 +16,7 @@ import email import email.policy +import email.utils from email.charset import Charset from email.generator import Generator, DecodedGenerator, BytesGenerator @@ -3405,6 +3406,9 @@ def test_parsing_errors(self): self.assertEqual(utils.parseaddr([address], strict=False), ('', address)) + # Test email.utils.supports_strict_parsing attribute + self.assertEqual(email.utils.supports_strict_parsing, True) + def test_getaddresses_nasty(self): for addresses, expected in ( (['"Sürname, Firstname" '], diff --git a/Misc/NEWS.d/next/Library/2023-10-20-15-28-08.gh-issue-102988.dStNO7.rst b/Misc/NEWS.d/next/Library/2023-10-20-15-28-08.gh-issue-102988.dStNO7.rst index 72d3c75f77ade0..3d0e9e4078c934 100644 --- a/Misc/NEWS.d/next/Library/2023-10-20-15-28-08.gh-issue-102988.dStNO7.rst +++ b/Misc/NEWS.d/next/Library/2023-10-20-15-28-08.gh-issue-102988.dStNO7.rst @@ -2,5 +2,7 @@ return ``('', '')`` 2-tuples in more situations where invalid email addresses are encountered instead of potentially inaccurate values. Add optional *strict* parameter to these two functions: use ``strict=False`` to -get the old behavior, accept malformed inputs. Patch by Thomas Dwyer and Victor +get the old behavior, accept malformed inputs. +``getattr(email.utils, 'supports_strict_parsing', False)`` can be use to check +if the *strict* paramater is available. Patch by Thomas Dwyer and Victor Stinner to improve the CVE-2023-27043 fix. From cf504821bbee5503f1105d88801a0482312cca40 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 5 Dec 2023 22:47:03 +0100 Subject: [PATCH 07/14] Simplify getaddresses() --- Lib/email/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/email/utils.py b/Lib/email/utils.py index 754d86b177e7bc..e90d7be4d01ac3 100644 --- a/Lib/email/utils.py +++ b/Lib/email/utils.py @@ -172,7 +172,7 @@ def getaddresses(fieldvalues, *, strict=True): fieldvalues = [str(v) for v in fieldvalues] fieldvalues = _pre_parse_validation(fieldvalues) - addr = COMMASPACE.join(v for v in fieldvalues) + addr = COMMASPACE.join(fieldvalues) a = _AddressList(addr) result = _post_parse_validation(a.addresslist) From 0dd57458b861eb54c7ee78aa2cab714e5d06aef3 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 5 Dec 2023 23:12:54 +0100 Subject: [PATCH 08/14] Update parseaddr() prototype in the doc --- Doc/library/email.utils.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/library/email.utils.rst b/Doc/library/email.utils.rst index 5d03c08e7914d9..d693a9bc3933b5 100644 --- a/Doc/library/email.utils.rst +++ b/Doc/library/email.utils.rst @@ -58,7 +58,7 @@ of the new API. begins with angle brackets, they are stripped off. -.. function:: parseaddr(address) +.. function:: parseaddr(address, *, strict=True) Parse address -- which should be the value of some address-containing field such as :mailheader:`To` or :mailheader:`Cc` -- into its constituent *realname* and From 637bdb8476866d7f97dd173cee68a93138c8b9f4 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 5 Dec 2023 23:15:07 +0100 Subject: [PATCH 09/14] Apply theta682's suggestion --- Lib/email/utils.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/Lib/email/utils.py b/Lib/email/utils.py index e90d7be4d01ac3..f8d464c4d12311 100644 --- a/Lib/email/utils.py +++ b/Lib/email/utils.py @@ -108,14 +108,13 @@ def _iter_escaped_chars(addr): pos = 0 escape = False for pos, ch in enumerate(addr): - if ch == '\\' and not escape: + if escape: + yield (pos, '\\' + ch) + escape = False + elif ch == '\\': escape = True else: - if escape: - yield (pos, '\\' + ch) - else: - yield (pos, ch) - escape = False + yield (pos, ch) if escape: yield (pos, '\\') From 8df7013f62b12f317971676572262239f90d3b88 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 12 Dec 2023 16:01:16 +0100 Subject: [PATCH 10/14] Update Lib/test/test_email/test_email.py Co-authored-by: Petr Viktorin --- Lib/test/test_email/test_email.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py index 49134cc3f48ee8..64e4dd570ed458 100644 --- a/Lib/test/test_email/test_email.py +++ b/Lib/test/test_email/test_email.py @@ -3329,7 +3329,7 @@ def test_parsing_errors(self): # Test utils.getaddresses() and utils.parseaddr() on malformed email # addresses: default behavior (strict=True) rejects malformed address, - # and strict=True which tolerates malformed address. + # and strict=False which tolerates malformed address. for invalid_separator, expected_non_strict in ( ('(', [(f'<{bob}>', alice)]), (')', [('', alice), empty, ('', bob)]), From 59da8f12a8b73d79fb19eca312013cd361c74d45 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 12 Dec 2023 16:19:19 +0100 Subject: [PATCH 11/14] Restore removed test_getaddresses_comma_in_name() --- Lib/test/test_email/test_email.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py index 64e4dd570ed458..09279128c6b6af 100644 --- a/Lib/test/test_email/test_email.py +++ b/Lib/test/test_email/test_email.py @@ -3321,6 +3321,23 @@ def test_getaddresses(self): [('Al Person', 'aperson@dom.ain'), ('Bud Person', 'bperson@dom.ain')]) + def test_getaddresses_comma_in_name(self): + """GH-106669 regression test.""" + self.assertEqual( + utils.getaddresses( + [ + '"Bud, Person" ', + 'aperson@dom.ain (Al Person)', + '"Mariusz Felisiak" ', + ] + ), + [ + ('Bud, Person', 'bperson@dom.ain'), + ('Al Person', 'aperson@dom.ain'), + ('Mariusz Felisiak', 'to@example.com'), + ], + ) + def test_parsing_errors(self): """Test for parsing errors from CVE-2023-27043 and CVE-2019-16056""" alice = 'alice@example.org' From 361b517d7183535e5a47f9f72fb11cb1dd90d4a0 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 12 Dec 2023 16:24:47 +0100 Subject: [PATCH 12/14] Move implementation details to comment --- Lib/email/utils.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/Lib/email/utils.py b/Lib/email/utils.py index f8d464c4d12311..5bc936207d230b 100644 --- a/Lib/email/utils.py +++ b/Lib/email/utils.py @@ -153,17 +153,18 @@ def getaddresses(fieldvalues, *, strict=True): When parsing fails for a fieldvalue, a 2-tuple of ('', '') is returned in its place. - If strict is True, use a strict parser which rejects malformed inputs. - In this case, if the resulting list of parsed addresses is greater than - the number of fieldvalues in the input list, a parsing error has occurred - and consequently a list containing a single empty 2-tuple [('', '')] is - returned in its place. This is done to avoid invalid output. + If strict is true, use a strict parser which rejects malformed inputs. + """ - Malformed input: getaddresses(['alice@example.com ']) - Invalid output: [('', 'alice@example.com'), ('', 'bob@example.com')] - Safe output: [('', '')] + # If strict is true, if the resulting list of parsed addresses is greater + # than the number of fieldvalues in the input list, a parsing error has + # occurred and consequently a list containing a single empty 2-tuple [('', + # '')] is returned in its place. This is done to avoid invalid output. + # + # Malformed input: getaddresses(['alice@example.com ']) + # Invalid output: [('', 'alice@example.com'), ('', 'bob@example.com')] + # Safe output: [('', '')] - """ if not strict: all = COMMASPACE.join(str(v) for v in fieldvalues) a = _AddressList(all) From 3105a75498f265d0b479ec785d5aad35f982d903 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 12 Dec 2023 16:51:00 +0100 Subject: [PATCH 13/14] Simplify _strip_quoted_realnames() --- Lib/email/utils.py | 23 +++++++++++------------ Lib/test/test_email/test_email.py | 15 +++++++++++++-- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/Lib/email/utils.py b/Lib/email/utils.py index 5bc936207d230b..c485dc5f64a864 100644 --- a/Lib/email/utils.py +++ b/Lib/email/utils.py @@ -125,22 +125,21 @@ def _strip_quoted_realnames(addr): # Fast path return addr - start = None - remove = [] + start = 0 + open_pos = None + result = [] for pos, ch in _iter_escaped_chars(addr): if ch == '"': - if start is None: - start = pos + if open_pos is None: + open_pos = pos else: - remove.append((start, pos)) - start = None + if start != open_pos: + result.append(addr[start:open_pos]) + start = pos + 1 + open_pos = None - result = [] - pos = 0 - for start, stop in remove: - result.append(addr[pos:start]) - pos = stop + 1 - result.append(addr[pos:]) + if start < (len(addr) - 1): + result.append(addr[start:]) return ''.join(result) diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py index 09279128c6b6af..8371203d3506ea 100644 --- a/Lib/test/test_email/test_email.py +++ b/Lib/test/test_email/test_email.py @@ -3676,13 +3676,24 @@ def test_strip_quoted_realnames(self): def check(addr, expected): self.assertEqual(utils._strip_quoted_realnames(addr), expected) - check('Jane Doe , John Doe ', - 'Jane Doe , John Doe ') check('"Jane Doe" , "John Doe" ', ' , ') check(r'"Jane \"Doe\"." ', ' ') + # special cases + check(r'before"name"after', 'beforeafter') + check(r'before"name"', 'before') + check(r'"name"after', 'after') + + # no change + for addr in ( + 'Jane Doe , John Doe ', + 'lone " quote', + ): + self.assertEqual(utils._strip_quoted_realnames(addr), addr) + + def test_check_parenthesis(self): addr = 'alice@example.net' self.assertTrue(utils._check_parenthesis(f'{addr} (Alice)')) From fc6e86b53801fe43956e335b6add3617ebbaecd1 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Wed, 13 Dec 2023 14:11:53 +0100 Subject: [PATCH 14/14] Fix _strip_quoted_realnames() --- Lib/email/utils.py | 2 +- Lib/test/test_email/test_email.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/Lib/email/utils.py b/Lib/email/utils.py index c485dc5f64a864..96b0172088987a 100644 --- a/Lib/email/utils.py +++ b/Lib/email/utils.py @@ -138,7 +138,7 @@ def _strip_quoted_realnames(addr): start = pos + 1 open_pos = None - if start < (len(addr) - 1): + if start < len(addr): result.append(addr[start:]) return ''.join(result) diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py index 8371203d3506ea..39d4ace8d4a1d8 100644 --- a/Lib/test/test_email/test_email.py +++ b/Lib/test/test_email/test_email.py @@ -3684,7 +3684,10 @@ def check(addr, expected): # special cases check(r'before"name"after', 'beforeafter') check(r'before"name"', 'before') + check(r'b"name"', 'b') # single char check(r'"name"after', 'after') + check(r'"name"a', 'a') # single char + check(r'"name"', '') # no change for addr in (