From 0daa85f8f97cd37f7bafb5e6d42c62a16c3962e6 Mon Sep 17 00:00:00 2001
From: Tom Christie <tom@tomchristie.com>
Date: Tue, 21 Jul 2020 13:36:30 +0100
Subject: [PATCH] Prefer defaulting to utf-8, rather than using chardet
 autodetect

---
 README.md                      |  1 -
 docs/index.md                  |  1 -
 httpx/_decoders.py             | 62 ++++++----------------------------
 httpx/_models.py               | 12 +------
 setup.cfg                      |  2 +-
 setup.py                       |  1 -
 tests/models/test_responses.py | 31 ++++++-----------
 tests/test_decoders.py         | 10 ------
 8 files changed, 23 insertions(+), 97 deletions(-)

diff --git a/README.md b/README.md
index d262045f34..82ff0fffd7 100644
--- a/README.md
+++ b/README.md
@@ -112,7 +112,6 @@ The HTTPX project relies on these excellent libraries:
   * `h11` - HTTP/1.1 support.
   * `h2` - HTTP/2 support.
 * `certifi` - SSL certificates.
-* `chardet` - Fallback auto-detection for response encoding.
 * `hstspreload` - determines whether IDNA-encoded host should be only accessed via HTTPS.
 * `idna` - Internationalized domain name support.
 * `rfc3986` - URL parsing & normalization.
diff --git a/docs/index.md b/docs/index.md
index 7f7ecee6c5..4205e3672e 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -110,7 +110,6 @@ The HTTPX project relies on these excellent libraries:
   * `h11` - HTTP/1.1 support.
   * `h2` - HTTP/2 support.
 * `certifi` - SSL certificates.
-* `chardet` - Fallback auto-detection for response encoding.
 * `hstspreload` - determines whether IDNA-encoded host should be only accessed via HTTPS.
 * `idna` - Internationalized domain name support.
 * `rfc3986` - URL parsing & normalization.
diff --git a/httpx/_decoders.py b/httpx/_decoders.py
index 1ea47b0004..4848ef99b6 100644
--- a/httpx/_decoders.py
+++ b/httpx/_decoders.py
@@ -7,8 +7,6 @@
 import typing
 import zlib
 
-import chardet
-
 from ._exceptions import DecodingError
 
 try:
@@ -161,62 +159,22 @@ class TextDecoder:
     """
 
     def __init__(self, encoding: typing.Optional[str] = None):
-        self.decoder: typing.Optional[codecs.IncrementalDecoder] = (
-            None if encoding is None else codecs.getincrementaldecoder(encoding)()
-        )
-        self.detector = chardet.universaldetector.UniversalDetector()
-
-        # This buffer is only needed if 'decoder' is 'None'
-        # we want to trigger errors if data is getting added to
-        # our internal buffer for some silly reason while
-        # a decoder is discovered.
-        self.buffer: typing.Optional[bytearray] = None if self.decoder else bytearray()
+        use_encoding = "utf-8" if encoding is None else encoding
+        self.decoder = codecs.getincrementaldecoder(use_encoding)()
 
     def decode(self, data: bytes) -> str:
         try:
-            if self.decoder is not None:
-                text = self.decoder.decode(data)
-            else:
-                assert self.buffer is not None
-                text = ""
-                self.detector.feed(data)
-                self.buffer += data
-
-                # Should be more than enough data to process, we don't
-                # want to buffer too long as chardet will wait until
-                # detector.close() is used to give back common
-                # encodings like 'utf-8'.
-                if len(self.buffer) >= 4096:
-                    self.decoder = codecs.getincrementaldecoder(
-                        self._detector_result()
-                    )()
-                    text = self.decoder.decode(bytes(self.buffer), False)
-                    self.buffer = None
-
-            return text
-        except UnicodeDecodeError:  # pragma: nocover
-            raise DecodingError() from None
+            return self.decoder.decode(data)
+        except UnicodeDecodeError as exc:  # pragma: nocover
+            message = str(exc)
+            raise DecodingError(message) from None
 
     def flush(self) -> str:
         try:
-            if self.decoder is None:
-                # Empty string case as chardet is guaranteed to not have a guess.
-                assert self.buffer is not None
-                if len(self.buffer) == 0:
-                    return ""
-                return bytes(self.buffer).decode(self._detector_result())
-
-            return self.decoder.decode(b"", True)
-        except UnicodeDecodeError:  # pragma: nocover
-            raise DecodingError() from None
-
-    def _detector_result(self) -> str:
-        self.detector.close()
-        result = self.detector.result["encoding"]
-        if not result:  # pragma: nocover
-            raise DecodingError("Unable to determine encoding of content")
-
-        return result
+            return self.decoder.decode(b"", final=True)
+        except UnicodeDecodeError as exc:  # pragma: nocover
+            message = str(exc)
+            raise DecodingError(message) from None
 
 
 class LineDecoder:
diff --git a/httpx/_models.py b/httpx/_models.py
index 892a959d65..44dd33bbcf 100644
--- a/httpx/_models.py
+++ b/httpx/_models.py
@@ -8,7 +8,6 @@
 from http.cookiejar import Cookie, CookieJar
 from urllib.parse import parse_qsl, urlencode
 
-import chardet
 import rfc3986
 
 from .__version__ import __version__
@@ -750,9 +749,7 @@ def encoding(self) -> str:
         if not hasattr(self, "_encoding"):
             encoding = self.charset_encoding
             if encoding is None or not is_known_encoding(encoding):
-                encoding = self.apparent_encoding
-                if encoding is None or not is_known_encoding(encoding):
-                    encoding = "utf-8"
+                encoding = "utf-8"
             self._encoding = encoding
         return self._encoding
 
@@ -782,13 +779,6 @@ def charset_encoding(self) -> typing.Optional[str]:
 
         return None
 
-    @property
-    def apparent_encoding(self) -> typing.Optional[str]:
-        """
-        Return the encoding, as it appears to autodetection.
-        """
-        return chardet.detect(self.content)["encoding"]
-
     @property
     def decoder(self) -> Decoder:
         """
diff --git a/setup.cfg b/setup.cfg
index 6732488f63..e421820507 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -14,7 +14,7 @@ check_untyped_defs = True
 profile = black
 combine_as_imports = True
 known_first_party = httpx,tests
-known_third_party = brotli,certifi,chardet,cryptography,hstspreload,httpcore,pytest,rfc3986,setuptools,sniffio,trio,trustme,uvicorn
+known_third_party = brotli,certifi,cryptography,hstspreload,httpcore,pytest,rfc3986,setuptools,sniffio,trio,trustme,uvicorn
 
 [tool:pytest]
 addopts = --cov=httpx --cov=tests -rxXs
diff --git a/setup.py b/setup.py
index cc6216992c..4503e0ceba 100644
--- a/setup.py
+++ b/setup.py
@@ -58,7 +58,6 @@ def get_packages(package):
         "certifi",
         "hstspreload",
         "sniffio",
-        "chardet==3.*",
         "idna==2.*",
         "rfc3986>=1.3,<2",
         "httpcore==0.9.*",
diff --git a/tests/models/test_responses.py b/tests/models/test_responses.py
index 1935aba3e5..6787f840d7 100644
--- a/tests/models/test_responses.py
+++ b/tests/models/test_responses.py
@@ -48,25 +48,15 @@ def test_response_content_type_encoding():
     assert response.encoding == "latin-1"
 
 
-def test_response_autodetect_encoding():
+def test_response_fallback_to_utf8():
     """
-    Autodetect encoding if there is no charset info in a Content-Type header.
-    """
-    content = "おはようございます。".encode("EUC-JP")
-    response = httpx.Response(200, content=content, request=REQUEST)
-    assert response.text == "おはようございます。"
-    assert response.encoding == "EUC-JP"
-
-
-def test_response_fallback_to_autodetect():
-    """
-    Fallback to autodetection if we get an invalid charset in the Content-Type header.
+    Fall back to utf-8 if we get an invalid charset in the Content-Type header.
     """
     headers = {"Content-Type": "text-plain; charset=invalid-codec-name"}
-    content = "おはようございます。".encode("EUC-JP")
+    content = "おはようございます。".encode("utf-8")
     response = httpx.Response(200, content=content, headers=headers, request=REQUEST)
     assert response.text == "おはようございます。"
-    assert response.encoding == "EUC-JP"
+    assert response.encoding == "utf-8"
 
 
 def test_response_default_text_encoding():
@@ -84,10 +74,11 @@ def test_response_default_text_encoding():
 
 def test_response_default_encoding():
     """
-    Default to utf-8 if all else fails.
+    Default to utf-8 for decoding.
     """
-    response = httpx.Response(200, content=b"", request=REQUEST)
-    assert response.text == ""
+    content = "おはようございます。".encode("utf-8")
+    response = httpx.Response(200, content=content, request=REQUEST)
+    assert response.text == "おはようございます。"
     assert response.encoding == "utf-8"
 
 
@@ -98,7 +89,7 @@ def test_response_non_text_encoding():
     headers = {"Content-Type": "image/png"}
     response = httpx.Response(200, content=b"xyz", headers=headers, request=REQUEST)
     assert response.text == "xyz"
-    assert response.encoding == "ascii"
+    assert response.encoding == "utf-8"
 
 
 def test_response_set_explicit_encoding():
@@ -129,7 +120,7 @@ def test_read():
 
     assert response.status_code == 200
     assert response.text == "Hello, world!"
-    assert response.encoding == "ascii"
+    assert response.encoding == "utf-8"
     assert response.is_closed
 
     content = response.read()
@@ -145,7 +136,7 @@ async def test_aread():
 
     assert response.status_code == 200
     assert response.text == "Hello, world!"
-    assert response.encoding == "ascii"
+    assert response.encoding == "utf-8"
     assert response.is_closed
 
     content = await response.aread()
diff --git a/tests/test_decoders.py b/tests/test_decoders.py
index 89c545b5a4..a4c2e132aa 100644
--- a/tests/test_decoders.py
+++ b/tests/test_decoders.py
@@ -154,16 +154,6 @@ def test_decoding_errors(header_value):
     [
         ((b"Hello,", b" world!"), "ascii"),
         ((b"\xe3\x83", b"\x88\xe3\x83\xa9", b"\xe3", b"\x83\x99\xe3\x83\xab"), "utf-8"),
-        ((b"\x83g\x83\x89\x83x\x83\x8b",) * 64, "shift-jis"),
-        ((b"\x83g\x83\x89\x83x\x83\x8b",) * 600, "shift-jis"),
-        (
-            (b"\xcb\xee\xf0\xe5\xec \xe8\xef\xf1\xf3\xec \xe4\xee\xeb\xee\xf0",) * 64,
-            "MacCyrillic",
-        ),
-        (
-            (b"\xa5\xa6\xa5\xa7\xa5\xd6\xa4\xce\xb9\xf1\xba\xdd\xb2\xbd",) * 512,
-            "euc-jp",
-        ),
     ],
 )
 @pytest.mark.asyncio