diff --git a/willie/modules/url.py b/willie/modules/url.py index 3800e5fdde..9482336a3c 100644 --- a/willie/modules/url.py +++ b/willie/modules/url.py @@ -207,22 +207,10 @@ def check_callbacks(bot, trigger, url, run=True): def find_title(url): """Return the title for the given URL.""" - content, headers = web.get(url, return_headers=True, limit_bytes=max_bytes, - dont_decode=True) - content_type = headers.get('Content-Type') or '' - encoding_match = re.match('.*?charset *= *(\S+)', content_type) - # If they gave us something else instead, try that - if encoding_match: - try: - content = content.decode(encoding_match.group(1)) - except: - encoding_match = None - # They didn't tell us what they gave us, so go with UTF-8 or fail silently. - if not encoding_match: - try: - content = content.decode('utf-8') - except: - return + try: + content, headers = web.get(url, return_headers=True, limit_bytes=max_bytes) + except UnicodeDecodeError: + return # Fail silently when data can't be decoded # Some cleanup that I don't really grok, but was in the original, so # we'll keep it (with the compiled regexes made global) for now. diff --git a/willie/web.py b/willie/web.py index 44b18c2d1e..0c16d4f638 100644 --- a/willie/web.py +++ b/willie/web.py @@ -58,12 +58,22 @@ def get(uri, timeout=20, headers=None, return_headers=False, u = get_urllib_object(uri, timeout, headers, verify_ssl) bytes = u.read(limit_bytes) u.close() + headers = dict(u.info()) if not dont_decode: - bytes = bytes.decode('utf-8') + # Detect encoding automatically from HTTP headers + content_type = headers.get('Content-Type') or '' + encoding_match = re.match('.*?charset *= *(\S+)', content_type, re.IGNORECASE) + if encoding_match: + try: + bytes = bytes.decode(encoding_match.group(1)) + except: + # attempt unicode on failure + encoding_match = None + if not encoding_match: + bytes = bytes.decode('utf-8') if not return_headers: return bytes else: - headers = dict(u.info()) headers['_http_status'] = u.code return (bytes, headers)