From b46649f8cdff8dc7833d6e67deb210186d6390ae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Antoine=20Beaupr=C3=A9?=
Date: Tue, 5 Jan 2016 18:04:03 -0500
Subject: [PATCH 1/2] SNI support using Python requests for .url

without this, SNI-enabled sites, which are becoming more and more
popular, are not displayed by the URL plugin

a good site to test with is: https://sni.velox.ch/

the requests API is similar enough to the `web.get` API to replace it,
but that is left to another pull request, as other plugins may not
require SNI support because they probably don't encounter the same
variety of sites as `.url`
---
 requirements.txt     |  1 +
 sopel/modules/url.py | 37 +++++++++++--------------------------
 2 files changed, 12 insertions(+), 26 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index b3856aba06..d0f3a10bf3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,4 @@ pytz
 praw
 pyenchant
 pygeoip
+requests
diff --git a/sopel/modules/url.py b/sopel/modules/url.py
index e18f84cb6a..58342b7df1 100644
--- a/sopel/modules/url.py
+++ b/sopel/modules/url.py
@@ -8,10 +8,12 @@
 from __future__ import unicode_literals, absolute_import, print_function, division

 import re
+from contextlib import closing

 from sopel import web, tools
 from sopel.module import commands, rule, example
 from sopel.config.types import ValidatedAttribute, StaticSection
+import requests

 url_finder = None
 # These are used to clean up the title tag before actually parsing it. Not the
@@ -150,14 +152,6 @@ def process_urls(bot, trigger, urls):
                 pass
             # First, check that the URL we got doesn't match
             matched = check_callbacks(bot, trigger, url, False)
-            if matched:
-                continue
-            # Then see if it redirects anywhere
-            new_url = follow_redirects(url)
-            if not new_url:
-                continue
-            # Then see if the final URL matches anything
-            matched = check_callbacks(bot, trigger, new_url, new_url != url)
             if matched:
                 continue
             # Finally, actually show the URL
@@ -167,20 +161,6 @@ def process_urls(bot, trigger, urls):
     return results


-def follow_redirects(url):
-    """
-    Follow HTTP 3xx redirects, and return the actual URL. Return None if
-    there's a problem.
-    """
-    try:
-        connection = web.get_urllib_object(url, 60)
-        url = connection.geturl() or url
-        connection.close()
-    except:
-        return None
-    return url
-
-
 def check_callbacks(bot, trigger, url, run=True):
     """
     Check the given URL against the callbacks list. If it matches, and ``run``
@@ -201,10 +181,15 @@ def check_callbacks(bot, trigger, url, run=True):

 def find_title(url):
     """Return the title for the given URL."""
-    try:
-        content, headers = web.get(url, return_headers=True, limit_bytes=max_bytes)
-    except UnicodeDecodeError:
-        return  # Fail silently when data can't be decoded
+    with closing(requests.get(url, stream=True)) as response:
+        try:
+            content = ''
+            for line in response.iter_lines(decode_unicode=True):
+                content += line
+                if '</title>' in content or len(content) > max_bytes:
+                    break
+        except UnicodeDecodeError:
+            return  # Fail silently when data can't be decoded

     # Some cleanup that I don't really grok, but was in the original, so
     # we'll keep it (with the compiled regexes made global) for now.

From a4ee91f46c5f4850d974f5be5193e05e1675683b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Antoine=20Beaupr=C3=A9?=
Date: Tue, 5 Jan 2016 18:27:35 -0500
Subject: [PATCH 2/2] properly close partially read request

this is because the closing() wrapper doesn't seem to be supported in
all cases. at least in requests 2.8, the response.close() call actually
works, so we'll use that.

note that it fails in requests 2.4.3 (Debian jessie/stable)
---
 sopel/modules/url.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/sopel/modules/url.py b/sopel/modules/url.py
index 58342b7df1..0d7aac5de1 100644
--- a/sopel/modules/url.py
+++ b/sopel/modules/url.py
@@ -181,15 +181,18 @@ def check_callbacks(bot, trigger, url, run=True):

 def find_title(url):
     """Return the title for the given URL."""
-    with closing(requests.get(url, stream=True)) as response:
-        try:
-            content = ''
-            for line in response.iter_lines(decode_unicode=True):
-                content += line
-                if '</title>' in content or len(content) > max_bytes:
-                    break
-        except UnicodeDecodeError:
-            return  # Fail silently when data can't be decoded
+    response = requests.get(url, stream=True)
+    try:
+        content = ''
+        for line in response.iter_lines(decode_unicode=True):
+            content += line
+            if '</title>' in content or len(content) > max_bytes:
+                break
+    except UnicodeDecodeError:
+        return  # Fail silently when data can't be decoded
+    finally:
+        # Need to close the connection because we have not read all the data
+        response.close()

     # Some cleanup that I don't really grok, but was in the original, so
     # we'll keep it (with the compiled regexes made global) for now.
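
For reference, here is a minimal self-contained sketch of the find_title()
flow the two patches converge on: a streaming requests fetch that stops at
</title> or after max_bytes, and always closes the partially read response.
The regex-based title extraction, the max_bytes value, and the __main__
harness are illustrative stand-ins (sopel's actual cleanup logic lives in
url.py); they are not part of the patches.

    # -*- coding: utf-8 -*-
    import re

    import requests

    max_bytes = 655360  # illustrative cap; sopel configures its own limit


    def find_title(url):
        """Return the title for the given URL, or None on failure."""
        response = requests.get(url, stream=True)
        try:
            content = ''
            for line in response.iter_lines(decode_unicode=True):
                content += line
                # Stop as soon as the title is complete, or give up once
                # we've buffered more than max_bytes of the page.
                if '</title>' in content or len(content) > max_bytes:
                    break
        except UnicodeDecodeError:
            return  # Fail silently when data can't be decoded
        finally:
            # Close the connection explicitly: the body was only
            # partially read, so the socket would otherwise linger.
            response.close()
        match = re.search(r'<title[^>]*>(.*?)</title>', content,
                          re.IGNORECASE | re.DOTALL)
        if match:
            return match.group(1).strip()


    if __name__ == '__main__':
        # SNI-only test site mentioned in the first commit message
        print(find_title('https://sni.velox.ch/'))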