Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Prefer defaulting to utf-8, rather than using chardet autodetect #1078

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,6 @@ The HTTPX project relies on these excellent libraries:
* `h11` - HTTP/1.1 support.
* `h2` - HTTP/2 support.
* `certifi` - SSL certificates.
* `chardet` - Fallback auto-detection for response encoding.
* `hstspreload` - determines whether an IDNA-encoded host should only be accessed via HTTPS.
* `idna` - Internationalized domain name support.
* `rfc3986` - URL parsing & normalization.
Expand Down
1 change: 0 additions & 1 deletion docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,6 @@ The HTTPX project relies on these excellent libraries:
* `h11` - HTTP/1.1 support.
* `h2` - HTTP/2 support.
* `certifi` - SSL certificates.
* `chardet` - Fallback auto-detection for response encoding.
* `hstspreload` - determines whether an IDNA-encoded host should only be accessed via HTTPS.
* `idna` - Internationalized domain name support.
* `rfc3986` - URL parsing & normalization.
Expand Down
62 changes: 10 additions & 52 deletions httpx/_decoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@
import typing
import zlib

import chardet

from ._exceptions import DecodingError

try:
Expand Down Expand Up @@ -161,62 +159,22 @@ class TextDecoder:
"""

def __init__(self, encoding: typing.Optional[str] = None):
self.decoder: typing.Optional[codecs.IncrementalDecoder] = (
None if encoding is None else codecs.getincrementaldecoder(encoding)()
)
self.detector = chardet.universaldetector.UniversalDetector()

# This buffer is only needed if 'decoder' is 'None'
# we want to trigger errors if data is getting added to
# our internal buffer for some silly reason while
# a decoder is discovered.
self.buffer: typing.Optional[bytearray] = None if self.decoder else bytearray()
use_encoding = "utf-8" if encoding is None else encoding
self.decoder = codecs.getincrementaldecoder(use_encoding)()

def decode(self, data: bytes) -> str:
try:
if self.decoder is not None:
text = self.decoder.decode(data)
else:
assert self.buffer is not None
text = ""
self.detector.feed(data)
self.buffer += data

# Should be more than enough data to process, we don't
# want to buffer too long as chardet will wait until
# detector.close() is used to give back common
# encodings like 'utf-8'.
if len(self.buffer) >= 4096:
self.decoder = codecs.getincrementaldecoder(
self._detector_result()
)()
text = self.decoder.decode(bytes(self.buffer), False)
self.buffer = None

return text
except UnicodeDecodeError: # pragma: nocover
raise DecodingError() from None
return self.decoder.decode(data)
except UnicodeDecodeError as exc: # pragma: nocover
message = str(exc)
raise DecodingError(message) from None

def flush(self) -> str:
try:
if self.decoder is None:
# Empty string case as chardet is guaranteed to not have a guess.
assert self.buffer is not None
if len(self.buffer) == 0:
return ""
return bytes(self.buffer).decode(self._detector_result())

return self.decoder.decode(b"", True)
except UnicodeDecodeError: # pragma: nocover
raise DecodingError() from None

def _detector_result(self) -> str:
self.detector.close()
result = self.detector.result["encoding"]
if not result: # pragma: nocover
raise DecodingError("Unable to determine encoding of content")

return result
return self.decoder.decode(b"", final=True)
except UnicodeDecodeError as exc: # pragma: nocover
message = str(exc)
raise DecodingError(message) from None


class LineDecoder:
Expand Down
12 changes: 1 addition & 11 deletions httpx/_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from http.cookiejar import Cookie, CookieJar
from urllib.parse import parse_qsl, urlencode

import chardet
import rfc3986

from .__version__ import __version__
Expand Down Expand Up @@ -693,9 +692,7 @@ def encoding(self) -> str:
if not hasattr(self, "_encoding"):
encoding = self.charset_encoding
if encoding is None or not is_known_encoding(encoding):
encoding = self.apparent_encoding
if encoding is None or not is_known_encoding(encoding):
encoding = "utf-8"
encoding = "utf-8"
self._encoding = encoding
return self._encoding

Expand Down Expand Up @@ -725,13 +722,6 @@ def charset_encoding(self) -> typing.Optional[str]:

return None

@property
def apparent_encoding(self) -> typing.Optional[str]:
"""
Return the encoding, as it appears to autodetection.
"""
return chardet.detect(self.content)["encoding"]

@property
def decoder(self) -> Decoder:
"""
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ check_untyped_defs = True
profile = black
combine_as_imports = True
known_first_party = httpx,tests
known_third_party = brotli,certifi,chardet,cryptography,hstspreload,httpcore,pytest,rfc3986,setuptools,sniffio,trio,trustme,uvicorn
known_third_party = brotli,certifi,cryptography,hstspreload,httpcore,pytest,rfc3986,setuptools,sniffio,trio,trustme,uvicorn

[tool:pytest]
addopts = --cov=httpx --cov=tests -rxXs
Expand Down
1 change: 0 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ def get_packages(package):
"certifi",
"hstspreload",
"sniffio",
"chardet==3.*",
"idna==2.*",
"rfc3986>=1.3,<2",
"httpcore==0.9.*",
Expand Down
31 changes: 11 additions & 20 deletions tests/models/test_responses.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,25 +48,15 @@ def test_response_content_type_encoding():
assert response.encoding == "latin-1"


def test_response_autodetect_encoding():
def test_response_fallback_to_utf8():
"""
Autodetect encoding if there is no charset info in a Content-Type header.
"""
content = "おはようございます。".encode("EUC-JP")
response = httpx.Response(200, content=content, request=REQUEST)
assert response.text == "おはようございます。"
assert response.encoding == "EUC-JP"


def test_response_fallback_to_autodetect():
"""
Fallback to autodetection if we get an invalid charset in the Content-Type header.
Fallback to utf-8 if we get an invalid charset in the Content-Type header.
"""
headers = {"Content-Type": "text-plain; charset=invalid-codec-name"}
content = "おはようございます。".encode("EUC-JP")
content = "おはようございます。".encode("utf-8")
response = httpx.Response(200, content=content, headers=headers, request=REQUEST)
assert response.text == "おはようございます。"
assert response.encoding == "EUC-JP"
assert response.encoding == "utf-8"
Comment on lines +56 to +59
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, what if we do get a "おはようございます。".encode("EUC-JP"), and the server fails to include ; charset=EUC-JP?

I assume we'll try to .decode("utf-8") — and that it will fail?

>>> text = "おはようございます。".encode("EUC-JP")
>>> text.decode("utf-8")
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa4 in position 0: invalid start byte

In that case I assume we want to allow the user to provide some kind of manual encoding (eg if they know it's some exotic encoding).

Most likely it would just be a recommendation to use something like this...

text = response.content.decode("EUC-JP")

This is mostly fine I suppose.

But then, we probably also want to provide more helpful error messaging than the UnicodeDecodeError above, correct? For example, a DecodingError with an error message saying "consider decoding .content manually" (we may have this already, but not sure?).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rather than...

text = response.content.decode("EUC-JP")
print(text)

You'd use..

response.encoding = "EUC-JP"
print(response.text)



def test_response_default_text_encoding():
Expand All @@ -84,10 +74,11 @@ def test_response_default_text_encoding():

def test_response_default_encoding():
"""
Default to utf-8 if all else fails.
Default to utf-8 for decoding.
"""
response = httpx.Response(200, content=b"", request=REQUEST)
assert response.text == ""
content = "おはようございます。".encode("utf-8")
response = httpx.Response(200, content=content, request=REQUEST)
assert response.text == "おはようございます。"
assert response.encoding == "utf-8"


Expand All @@ -98,7 +89,7 @@ def test_response_non_text_encoding():
headers = {"Content-Type": "image/png"}
response = httpx.Response(200, content=b"xyz", headers=headers, request=REQUEST)
assert response.text == "xyz"
assert response.encoding == "ascii"
assert response.encoding == "utf-8"


def test_response_set_explicit_encoding():
Expand Down Expand Up @@ -129,7 +120,7 @@ def test_read():

assert response.status_code == 200
assert response.text == "Hello, world!"
assert response.encoding == "ascii"
assert response.encoding == "utf-8"
assert response.is_closed

content = response.read()
Expand All @@ -145,7 +136,7 @@ async def test_aread():

assert response.status_code == 200
assert response.text == "Hello, world!"
assert response.encoding == "ascii"
assert response.encoding == "utf-8"
assert response.is_closed

content = await response.aread()
Expand Down
10 changes: 0 additions & 10 deletions tests/test_decoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,16 +154,6 @@ def test_decoding_errors(header_value):
[
((b"Hello,", b" world!"), "ascii"),
((b"\xe3\x83", b"\x88\xe3\x83\xa9", b"\xe3", b"\x83\x99\xe3\x83\xab"), "utf-8"),
((b"\x83g\x83\x89\x83x\x83\x8b",) * 64, "shift-jis"),
((b"\x83g\x83\x89\x83x\x83\x8b",) * 600, "shift-jis"),
(
(b"\xcb\xee\xf0\xe5\xec \xe8\xef\xf1\xf3\xec \xe4\xee\xeb\xee\xf0",) * 64,
"MacCyrillic",
),
(
(b"\xa5\xa6\xa5\xa7\xa5\xd6\xa4\xce\xb9\xf1\xba\xdd\xb2\xbd",) * 512,
"euc-jp",
),
],
)
@pytest.mark.asyncio
Expand Down