From 70a56f64bfc355b9b9217bdf390bf0e9090be867 Mon Sep 17 00:00:00 2001 From: Riccardo Schirone <562321+ret2libc@users.noreply.github.com> Date: Mon, 21 Nov 2022 00:11:20 +0100 Subject: [PATCH] Fix unicode errors (#7044) Fix few errors related to Unicode decoding. * multipart forms with invalid utf-8 characters as data can cause a UnicodeDecodeError to be raised. This should be raised as ValueError, like what is done in other parts of the code base. * HTTP request parser (pure-python) tries to decode the header name with utf-8/xmlcharrefreplace, which cannot decode bytes such as `\xd9`. I don't think so. - [x] I think the code is well written - [x] Unit tests for the changes exist - [ ] ~Documentation reflects the changes~ - [ ] If you provide code modification, please add yourself to `CONTRIBUTORS.txt` * The format is <Name> <Surname>. * Please keep alphabetical order, the file is sorted by names. - [x] ~Add a new news fragment into the `CHANGES` folder~ * name it `.` for example (588.bugfix) * if you don't have an `issue_id` change it to the pr id after creating the pr * ensure type is one of the following: * `.feature`: Signifying a new feature. * `.bugfix`: Signifying a bug fix. * `.doc`: Signifying a documentation improvement. * `.removal`: Signifying a deprecation or removal of public API. * `.misc`: A ticket has been closed, but it is not of interest to users. * Make sure to use full sentences with correct case and punctuation, for example: "Fix issue with non-ascii contents in doctest text files." (cherry picked from commit bce6e3c7324f53342bcd225103771c509fc4a67f) --- CHANGES/7044.bugfix | 1 + aiohttp/http_parser.py | 6 +++--- aiohttp/multipart.py | 7 ++++++- tests/test_http_parser.py | 8 ++++++++ tests/test_multipart.py | 14 ++++++++++++++ 5 files changed, 32 insertions(+), 4 deletions(-) create mode 100644 CHANGES/7044.bugfix diff --git a/CHANGES/7044.bugfix b/CHANGES/7044.bugfix new file mode 100644 index 00000000000..4f5d60d7bab --- /dev/null +++ b/CHANGES/7044.bugfix @@ -0,0 +1 @@ +Avoid raising UnicodeDecodeError in multipart and in HTTP headers parsing. diff --git a/aiohttp/http_parser.py b/aiohttp/http_parser.py index 5a66ce4b9ee..9749de25514 100644 --- a/aiohttp/http_parser.py +++ b/aiohttp/http_parser.py @@ -155,7 +155,7 @@ def parse_headers( if len(bname) > self.max_field_size: raise LineTooLong( "request header name {}".format( - bname.decode("utf8", "xmlcharrefreplace") + bname.decode("utf8", "backslashreplace") ), str(self.max_field_size), str(len(bname)), @@ -177,7 +177,7 @@ def parse_headers( if header_length > self.max_field_size: raise LineTooLong( "request header field {}".format( - bname.decode("utf8", "xmlcharrefreplace") + bname.decode("utf8", "backslashreplace") ), str(self.max_field_size), str(header_length), @@ -198,7 +198,7 @@ def parse_headers( if header_length > self.max_field_size: raise LineTooLong( "request header field {}".format( - bname.decode("utf8", "xmlcharrefreplace") + bname.decode("utf8", "backslashreplace") ), str(self.max_field_size), str(header_length), diff --git a/aiohttp/multipart.py b/aiohttp/multipart.py index fdbc17eafa1..e6308de461d 100644 --- a/aiohttp/multipart.py +++ b/aiohttp/multipart.py @@ -425,8 +425,13 @@ async def form(self, *, encoding: Optional[str] = None) -> List[Tuple[str, str]] real_encoding = encoding else: real_encoding = self.get_charset(default="utf-8") + try: + decoded_data = data.rstrip().decode(real_encoding) + except UnicodeDecodeError: + raise ValueError("data cannot be decoded with %s encoding" % real_encoding) + return parse_qsl( - data.rstrip().decode(real_encoding), + decoded_data, keep_blank_values=True, encoding=real_encoding, ) diff --git a/tests/test_http_parser.py b/tests/test_http_parser.py index cc30629dd0d..ad8ff3ad284 100644 --- a/tests/test_http_parser.py +++ b/tests/test_http_parser.py @@ -118,6 +118,14 @@ def test_parse_headers(parser: Any) -> None: assert not msg.upgrade +def test_parse_headers_longline(parser: Any) -> None: + invalid_unicode_byte = b"\xd9" + header_name = b"Test" + invalid_unicode_byte + b"Header" + b"A" * 8192 + text = b"GET /test HTTP/1.1\r\n" + header_name + b": test\r\n" + b"\r\n" + b"\r\n" + with pytest.raises((http_exceptions.LineTooLong, http_exceptions.BadHttpMessage)): + parser.feed_data(text) + + def test_parse(parser) -> None: text = b"GET /test HTTP/1.1\r\n\r\n" messages, upgrade, tail = parser.feed_data(text) diff --git a/tests/test_multipart.py b/tests/test_multipart.py index cc3f5ff7c23..d3cb3667012 100644 --- a/tests/test_multipart.py +++ b/tests/test_multipart.py @@ -546,6 +546,20 @@ async def test_read_form(self) -> None: result = await obj.form() assert [("foo", "bar"), ("foo", "baz"), ("boo", "")] == result + async def test_read_form_invalid_utf8(self) -> None: + invalid_unicode_byte = b"\xff" + data = invalid_unicode_byte + b"%s--:--" % newline + with Stream(data) as stream: + obj = aiohttp.BodyPartReader( + BOUNDARY, + {CONTENT_TYPE: "application/x-www-form-urlencoded"}, + stream, + ) + with pytest.raises( + ValueError, match="data cannot be decoded with utf-8 encoding" + ): + await obj.form() + async def test_read_form_encoding(self) -> None: data = b"foo=bar&foo=baz&boo=%s--:--" % newline with Stream(data) as stream: