From 70a56f64bfc355b9b9217bdf390bf0e9090be867 Mon Sep 17 00:00:00 2001
From: Riccardo Schirone <562321+ret2libc@users.noreply.github.com>
Date: Mon, 21 Nov 2022 00:11:20 +0100
Subject: [PATCH] Fix unicode errors (#7044)

Fix few errors related to Unicode decoding.
* multipart forms with invalid utf-8 characters as data can cause a
UnicodeDecodeError to be raised. This should be raised as ValueError,
like what is done in other parts of the code base.
* HTTP request parser (pure-python) tries to decode the header name with
utf-8/xmlcharrefreplace, which cannot decode bytes such as `\xd9`.

I don't think so.

<!-- Are there any issues opened that will be resolved by merging this
change? -->

- [x] I think the code is well written
- [x] Unit tests for the changes exist
- [ ] ~Documentation reflects the changes~
- [ ] If you provide code modification, please add yourself to
`CONTRIBUTORS.txt`
  * The format is &lt;Name&gt; &lt;Surname&gt;.
  * Please keep alphabetical order, the file is sorted by names.
- [x] ~Add a new news fragment into the `CHANGES` folder~
  * name it `<issue_id>.<type>` for example (588.bugfix)
* if you don't have an `issue_id` change it to the pr id after creating
the pr
  * ensure type is one of the following:
    * `.feature`: Signifying a new feature.
    * `.bugfix`: Signifying a bug fix.
    * `.doc`: Signifying a documentation improvement.
    * `.removal`: Signifying a deprecation or removal of public API.
* `.misc`: A ticket has been closed, but it is not of interest to users.
* Make sure to use full sentences with correct case and punctuation, for
example: "Fix issue with non-ascii contents in doctest text files."

(cherry picked from commit bce6e3c7324f53342bcd225103771c509fc4a67f)
---
 CHANGES/7044.bugfix       |  1 +
 aiohttp/http_parser.py    |  6 +++---
 aiohttp/multipart.py      |  7 ++++++-
 tests/test_http_parser.py |  8 ++++++++
 tests/test_multipart.py   | 14 ++++++++++++++
 5 files changed, 32 insertions(+), 4 deletions(-)
 create mode 100644 CHANGES/7044.bugfix
diff --git a/CHANGES/7044.bugfix b/CHANGES/7044.bugfix
new file mode 100644
index 00000000000..4f5d60d7bab
--- /dev/null
+++ b/CHANGES/7044.bugfix
@@ -0,0 +1 @@
+Avoid raising UnicodeDecodeError in multipart and in HTTP headers parsing.
diff --git a/aiohttp/http_parser.py b/aiohttp/http_parser.py
index 5a66ce4b9ee..9749de25514 100644
--- a/aiohttp/http_parser.py
+++ b/aiohttp/http_parser.py
@@ -155,7 +155,7 @@ def parse_headers(
             if len(bname) > self.max_field_size:
                 raise LineTooLong(
                     "request header name {}".format(
-                        bname.decode("utf8", "xmlcharrefreplace")
+                        bname.decode("utf8", "backslashreplace")
                     ),
                     str(self.max_field_size),
                     str(len(bname)),
@@ -177,7 +177,7 @@ def parse_headers(
                     if header_length > self.max_field_size:
                         raise LineTooLong(
                             "request header field {}".format(
-                                bname.decode("utf8", "xmlcharrefreplace")
+                                bname.decode("utf8", "backslashreplace")
                             ),
                             str(self.max_field_size),
                             str(header_length),
@@ -198,7 +198,7 @@ def parse_headers(
                 if header_length > self.max_field_size:
                     raise LineTooLong(
                         "request header field {}".format(
-                            bname.decode("utf8", "xmlcharrefreplace")
+                            bname.decode("utf8", "backslashreplace")
                         ),
                         str(self.max_field_size),
                         str(header_length),
diff --git a/aiohttp/multipart.py b/aiohttp/multipart.py
index fdbc17eafa1..e6308de461d 100644
--- a/aiohttp/multipart.py
+++ b/aiohttp/multipart.py
@@ -425,8 +425,13 @@ async def form(self, *, encoding: Optional[str] = None) -> List[Tuple[str, str]]
             real_encoding = encoding
         else:
             real_encoding = self.get_charset(default="utf-8")
+        try:
+            decoded_data = data.rstrip().decode(real_encoding)
+        except UnicodeDecodeError:
+            raise ValueError("data cannot be decoded with %s encoding" % real_encoding)
+
         return parse_qsl(
-            data.rstrip().decode(real_encoding),
+            decoded_data,
             keep_blank_values=True,
             encoding=real_encoding,
         )
diff --git a/tests/test_http_parser.py b/tests/test_http_parser.py
index cc30629dd0d..ad8ff3ad284 100644
--- a/tests/test_http_parser.py
+++ b/tests/test_http_parser.py
@@ -118,6 +118,14 @@ def test_parse_headers(parser: Any) -> None:
     assert not msg.upgrade
 
 
+def test_parse_headers_longline(parser: Any) -> None:
+    invalid_unicode_byte = b"\xd9"
+    header_name = b"Test" + invalid_unicode_byte + b"Header" + b"A" * 8192
+    text = b"GET /test HTTP/1.1\r\n" + header_name + b": test\r\n" + b"\r\n" + b"\r\n"
+    with pytest.raises((http_exceptions.LineTooLong, http_exceptions.BadHttpMessage)):
+        parser.feed_data(text)
+
+
 def test_parse(parser) -> None:
     text = b"GET /test HTTP/1.1\r\n\r\n"
     messages, upgrade, tail = parser.feed_data(text)
diff --git a/tests/test_multipart.py b/tests/test_multipart.py
index cc3f5ff7c23..d3cb3667012 100644
--- a/tests/test_multipart.py
+++ b/tests/test_multipart.py
@@ -546,6 +546,20 @@ async def test_read_form(self) -> None:
             result = await obj.form()
         assert [("foo", "bar"), ("foo", "baz"), ("boo", "")] == result
 
+    async def test_read_form_invalid_utf8(self) -> None:
+        invalid_unicode_byte = b"\xff"
+        data = invalid_unicode_byte + b"%s--:--" % newline
+        with Stream(data) as stream:
+            obj = aiohttp.BodyPartReader(
+                BOUNDARY,
+                {CONTENT_TYPE: "application/x-www-form-urlencoded"},
+                stream,
+            )
+            with pytest.raises(
+                ValueError, match="data cannot be decoded with utf-8 encoding"
+            ):
+                await obj.form()
+
     async def test_read_form_encoding(self) -> None:
         data = b"foo=bar&foo=baz&boo=%s--:--" % newline
         with Stream(data) as stream: