From 0daa85f8f97cd37f7bafb5e6d42c62a16c3962e6 Mon Sep 17 00:00:00 2001 From: Tom Christie Date: Tue, 21 Jul 2020 13:36:30 +0100 Subject: [PATCH] Prefer defaulting to utf-8, rather than using chardet autodetect --- README.md | 1 - docs/index.md | 1 - httpx/_decoders.py | 62 ++++++---------------------------- httpx/_models.py | 12 +------ setup.cfg | 2 +- setup.py | 1 - tests/models/test_responses.py | 31 ++++++----------- tests/test_decoders.py | 10 ------ 8 files changed, 23 insertions(+), 97 deletions(-) diff --git a/README.md b/README.md index d262045f34..82ff0fffd7 100644 --- a/README.md +++ b/README.md @@ -112,7 +112,6 @@ The HTTPX project relies on these excellent libraries: * `h11` - HTTP/1.1 support. * `h2` - HTTP/2 support. * `certifi` - SSL certificates. -* `chardet` - Fallback auto-detection for response encoding. * `hstspreload` - determines whether IDNA-encoded host should be only accessed via HTTPS. * `idna` - Internationalized domain name support. * `rfc3986` - URL parsing & normalization. diff --git a/docs/index.md b/docs/index.md index 7f7ecee6c5..4205e3672e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -110,7 +110,6 @@ The HTTPX project relies on these excellent libraries: * `h11` - HTTP/1.1 support. * `h2` - HTTP/2 support. * `certifi` - SSL certificates. -* `chardet` - Fallback auto-detection for response encoding. * `hstspreload` - determines whether IDNA-encoded host should be only accessed via HTTPS. * `idna` - Internationalized domain name support. * `rfc3986` - URL parsing & normalization. diff --git a/httpx/_decoders.py b/httpx/_decoders.py index 1ea47b0004..4848ef99b6 100644 --- a/httpx/_decoders.py +++ b/httpx/_decoders.py @@ -7,8 +7,6 @@ import typing import zlib -import chardet - from ._exceptions import DecodingError try: @@ -161,62 +159,22 @@ class TextDecoder: """ def __init__(self, encoding: typing.Optional[str] = None): - self.decoder: typing.Optional[codecs.IncrementalDecoder] = ( - None if encoding is None else codecs.getincrementaldecoder(encoding)() - ) - self.detector = chardet.universaldetector.UniversalDetector() - - # This buffer is only needed if 'decoder' is 'None' - # we want to trigger errors if data is getting added to - # our internal buffer for some silly reason while - # a decoder is discovered. - self.buffer: typing.Optional[bytearray] = None if self.decoder else bytearray() + use_encoding = "utf-8" if encoding is None else encoding + self.decoder = codecs.getincrementaldecoder(use_encoding)() def decode(self, data: bytes) -> str: try: - if self.decoder is not None: - text = self.decoder.decode(data) - else: - assert self.buffer is not None - text = "" - self.detector.feed(data) - self.buffer += data - - # Should be more than enough data to process, we don't - # want to buffer too long as chardet will wait until - # detector.close() is used to give back common - # encodings like 'utf-8'. - if len(self.buffer) >= 4096: - self.decoder = codecs.getincrementaldecoder( - self._detector_result() - )() - text = self.decoder.decode(bytes(self.buffer), False) - self.buffer = None - - return text - except UnicodeDecodeError: # pragma: nocover - raise DecodingError() from None + return self.decoder.decode(data) + except UnicodeDecodeError as exc: # pragma: nocover + message = str(exc) + raise DecodingError(message) from None def flush(self) -> str: try: - if self.decoder is None: - # Empty string case as chardet is guaranteed to not have a guess. - assert self.buffer is not None - if len(self.buffer) == 0: - return "" - return bytes(self.buffer).decode(self._detector_result()) - - return self.decoder.decode(b"", True) - except UnicodeDecodeError: # pragma: nocover - raise DecodingError() from None - - def _detector_result(self) -> str: - self.detector.close() - result = self.detector.result["encoding"] - if not result: # pragma: nocover - raise DecodingError("Unable to determine encoding of content") - - return result + return self.decoder.decode(b"", final=True) + except UnicodeDecodeError as exc: # pragma: nocover + message = str(exc) + raise DecodingError(message) from None class LineDecoder: diff --git a/httpx/_models.py b/httpx/_models.py index 892a959d65..44dd33bbcf 100644 --- a/httpx/_models.py +++ b/httpx/_models.py @@ -8,7 +8,6 @@ from http.cookiejar import Cookie, CookieJar from urllib.parse import parse_qsl, urlencode -import chardet import rfc3986 from .__version__ import __version__ @@ -750,9 +749,7 @@ def encoding(self) -> str: if not hasattr(self, "_encoding"): encoding = self.charset_encoding if encoding is None or not is_known_encoding(encoding): - encoding = self.apparent_encoding - if encoding is None or not is_known_encoding(encoding): - encoding = "utf-8" + encoding = "utf-8" self._encoding = encoding return self._encoding @@ -782,13 +779,6 @@ def charset_encoding(self) -> typing.Optional[str]: return None - @property - def apparent_encoding(self) -> typing.Optional[str]: - """ - Return the encoding, as it appears to autodetection. - """ - return chardet.detect(self.content)["encoding"] - @property def decoder(self) -> Decoder: """ diff --git a/setup.cfg b/setup.cfg index 6732488f63..e421820507 100644 --- a/setup.cfg +++ b/setup.cfg @@ -14,7 +14,7 @@ check_untyped_defs = True profile = black combine_as_imports = True known_first_party = httpx,tests -known_third_party = brotli,certifi,chardet,cryptography,hstspreload,httpcore,pytest,rfc3986,setuptools,sniffio,trio,trustme,uvicorn +known_third_party = brotli,certifi,cryptography,hstspreload,httpcore,pytest,rfc3986,setuptools,sniffio,trio,trustme,uvicorn [tool:pytest] addopts = --cov=httpx --cov=tests -rxXs diff --git a/setup.py b/setup.py index cc6216992c..4503e0ceba 100644 --- a/setup.py +++ b/setup.py @@ -58,7 +58,6 @@ def get_packages(package): "certifi", "hstspreload", "sniffio", - "chardet==3.*", "idna==2.*", "rfc3986>=1.3,<2", "httpcore==0.9.*", diff --git a/tests/models/test_responses.py b/tests/models/test_responses.py index 1935aba3e5..6787f840d7 100644 --- a/tests/models/test_responses.py +++ b/tests/models/test_responses.py @@ -48,25 +48,15 @@ def test_response_content_type_encoding(): assert response.encoding == "latin-1" -def test_response_autodetect_encoding(): +def test_response_fallback_to_utf8(): """ - Autodetect encoding if there is no charset info in a Content-Type header. - """ - content = "おはようございます。".encode("EUC-JP") - response = httpx.Response(200, content=content, request=REQUEST) - assert response.text == "おはようございます。" - assert response.encoding == "EUC-JP" - - -def test_response_fallback_to_autodetect(): - """ - Fallback to autodetection if we get an invalid charset in the Content-Type header. + Fallback to utf-8 if we get an invalid charset in the Content-Type header. """ headers = {"Content-Type": "text-plain; charset=invalid-codec-name"} - content = "おはようございます。".encode("EUC-JP") + content = "おはようございます。".encode("utf-8") response = httpx.Response(200, content=content, headers=headers, request=REQUEST) assert response.text == "おはようございます。" - assert response.encoding == "EUC-JP" + assert response.encoding == "utf-8" def test_response_default_text_encoding(): @@ -84,10 +74,11 @@ def test_response_default_text_encoding(): def test_response_default_encoding(): """ - Default to utf-8 if all else fails. + Default to utf-8 for decoding. """ - response = httpx.Response(200, content=b"", request=REQUEST) - assert response.text == "" + content = "おはようございます。".encode("utf-8") + response = httpx.Response(200, content=content, request=REQUEST) + assert response.text == "おはようございます。" assert response.encoding == "utf-8" @@ -98,7 +89,7 @@ def test_response_non_text_encoding(): headers = {"Content-Type": "image/png"} response = httpx.Response(200, content=b"xyz", headers=headers, request=REQUEST) assert response.text == "xyz" - assert response.encoding == "ascii" + assert response.encoding == "utf-8" def test_response_set_explicit_encoding(): @@ -129,7 +120,7 @@ def test_read(): assert response.status_code == 200 assert response.text == "Hello, world!" - assert response.encoding == "ascii" + assert response.encoding == "utf-8" assert response.is_closed content = response.read() @@ -145,7 +136,7 @@ async def test_aread(): assert response.status_code == 200 assert response.text == "Hello, world!" - assert response.encoding == "ascii" + assert response.encoding == "utf-8" assert response.is_closed content = await response.aread() diff --git a/tests/test_decoders.py b/tests/test_decoders.py index 89c545b5a4..a4c2e132aa 100644 --- a/tests/test_decoders.py +++ b/tests/test_decoders.py @@ -154,16 +154,6 @@ def test_decoding_errors(header_value): [ ((b"Hello,", b" world!"), "ascii"), ((b"\xe3\x83", b"\x88\xe3\x83\xa9", b"\xe3", b"\x83\x99\xe3\x83\xab"), "utf-8"), - ((b"\x83g\x83\x89\x83x\x83\x8b",) * 64, "shift-jis"), - ((b"\x83g\x83\x89\x83x\x83\x8b",) * 600, "shift-jis"), - ( - (b"\xcb\xee\xf0\xe5\xec \xe8\xef\xf1\xf3\xec \xe4\xee\xeb\xee\xf0",) * 64, - "MacCyrillic", - ), - ( - (b"\xa5\xa6\xa5\xa7\xa5\xd6\xa4\xce\xb9\xf1\xba\xdd\xb2\xbd",) * 512, - "euc-jp", - ), ], ) @pytest.mark.asyncio