Skip to content

Commit

Permalink
[web, url] Move header-based encoding detection to web.get
Browse files Browse the repository at this point in the history
url.py did this really good thing where it would look for the encoding in the
HTTP headers and use it to .decode() the string returned from web.get.

web.get() gained (not too long ago) the ability to decode text itself, but it
was hardcoding UTF-8 as the encoding by default.

So instead of hardcoding UTF-8 in web.get, I moved that detection functionality
from url to web, so now all modules using web.get will benefit from it.
  • Loading branch information
Elad Alfassa committed Aug 6, 2014
1 parent 5f5bc0f commit 9604bfc
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 18 deletions.
20 changes: 4 additions & 16 deletions willie/modules/url.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,22 +207,10 @@ def check_callbacks(bot, trigger, url, run=True):

def find_title(url):
"""Return the title for the given URL."""
content, headers = web.get(url, return_headers=True, limit_bytes=max_bytes,
dont_decode=True)
content_type = headers.get('Content-Type') or ''
encoding_match = re.match('.*?charset *= *(\S+)', content_type)
# If they gave us something else instead, try that
if encoding_match:
try:
content = content.decode(encoding_match.group(1))
except:
encoding_match = None
# They didn't tell us what they gave us, so go with UTF-8 or fail silently.
if not encoding_match:
try:
content = content.decode('utf-8')
except:
return
try:
content, headers = web.get(url, return_headers=True, limit_bytes=max_bytes)
except UnicodeDecodeError:
return # Fail silently when data can't be decoded

# Some cleanup that I don't really grok, but was in the original, so
# we'll keep it (with the compiled regexes made global) for now.
Expand Down
14 changes: 12 additions & 2 deletions willie/web.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,22 @@ def get(uri, timeout=20, headers=None, return_headers=False,
u = get_urllib_object(uri, timeout, headers, verify_ssl)
bytes = u.read(limit_bytes)
u.close()
headers = dict(u.info())
if not dont_decode:
bytes = bytes.decode('utf-8')
# Detect encoding automatically from HTTP headers
content_type = headers.get('Content-Type') or ''
encoding_match = re.match('.*?charset *= *(\S+)', content_type, re.IGNORECASE)
if encoding_match:
try:
bytes = bytes.decode(encoding_match.group(1))
except:
# attempt unicode on failure
encoding_match = None
if not encoding_match:
bytes = bytes.decode('utf-8')
if not return_headers:
return bytes
else:
headers = dict(u.info())
headers['_http_status'] = u.code
return (bytes, headers)

Expand Down

0 comments on commit 9604bfc

Please sign in to comment.