diff --git a/CHANGES/7044.bugfix b/CHANGES/7044.bugfix new file mode 100644 index 00000000000..4f5d60d7bab --- /dev/null +++ b/CHANGES/7044.bugfix @@ -0,0 +1 @@ +Avoid raising UnicodeDecodeError in multipart and in HTTP headers parsing. diff --git a/aiohttp/http_parser.py b/aiohttp/http_parser.py index 5a66ce4b9ee..9749de25514 100644 --- a/aiohttp/http_parser.py +++ b/aiohttp/http_parser.py @@ -155,7 +155,7 @@ def parse_headers( if len(bname) > self.max_field_size: raise LineTooLong( "request header name {}".format( - bname.decode("utf8", "xmlcharrefreplace") + bname.decode("utf8", "backslashreplace") ), str(self.max_field_size), str(len(bname)), @@ -177,7 +177,7 @@ def parse_headers( if header_length > self.max_field_size: raise LineTooLong( "request header field {}".format( - bname.decode("utf8", "xmlcharrefreplace") + bname.decode("utf8", "backslashreplace") ), str(self.max_field_size), str(header_length), @@ -198,7 +198,7 @@ def parse_headers( if header_length > self.max_field_size: raise LineTooLong( "request header field {}".format( - bname.decode("utf8", "xmlcharrefreplace") + bname.decode("utf8", "backslashreplace") ), str(self.max_field_size), str(header_length), diff --git a/aiohttp/multipart.py b/aiohttp/multipart.py index fdbc17eafa1..e6308de461d 100644 --- a/aiohttp/multipart.py +++ b/aiohttp/multipart.py @@ -425,8 +425,13 @@ async def form(self, *, encoding: Optional[str] = None) -> List[Tuple[str, str]] real_encoding = encoding else: real_encoding = self.get_charset(default="utf-8") + try: + decoded_data = data.rstrip().decode(real_encoding) + except UnicodeDecodeError: + raise ValueError("data cannot be decoded with %s encoding" % real_encoding) + return parse_qsl( - data.rstrip().decode(real_encoding), + decoded_data, keep_blank_values=True, encoding=real_encoding, ) diff --git a/tests/test_http_parser.py b/tests/test_http_parser.py index cc30629dd0d..ad8ff3ad284 100644 --- a/tests/test_http_parser.py +++ b/tests/test_http_parser.py @@ -118,6 +118,14 @@ def test_parse_headers(parser: Any) -> None: assert not msg.upgrade +def test_parse_headers_longline(parser: Any) -> None: + invalid_unicode_byte = b"\xd9" + header_name = b"Test" + invalid_unicode_byte + b"Header" + b"A" * 8192 + text = b"GET /test HTTP/1.1\r\n" + header_name + b": test\r\n" + b"\r\n" + b"\r\n" + with pytest.raises((http_exceptions.LineTooLong, http_exceptions.BadHttpMessage)): + parser.feed_data(text) + + def test_parse(parser) -> None: text = b"GET /test HTTP/1.1\r\n\r\n" messages, upgrade, tail = parser.feed_data(text) diff --git a/tests/test_multipart.py b/tests/test_multipart.py index cc3f5ff7c23..d3cb3667012 100644 --- a/tests/test_multipart.py +++ b/tests/test_multipart.py @@ -546,6 +546,20 @@ async def test_read_form(self) -> None: result = await obj.form() assert [("foo", "bar"), ("foo", "baz"), ("boo", "")] == result + async def test_read_form_invalid_utf8(self) -> None: + invalid_unicode_byte = b"\xff" + data = invalid_unicode_byte + b"%s--:--" % newline + with Stream(data) as stream: + obj = aiohttp.BodyPartReader( + BOUNDARY, + {CONTENT_TYPE: "application/x-www-form-urlencoded"}, + stream, + ) + with pytest.raises( + ValueError, match="data cannot be decoded with utf-8 encoding" + ): + await obj.form() + async def test_read_form_encoding(self) -> None: data = b"foo=bar&foo=baz&boo=%s--:--" % newline with Stream(data) as stream: