Skip to content

Commit

Permalink
Fix #323: urllib3 transport fails with UnicodeError on some invalid URLs
Browse files Browse the repository at this point in the history
  • Loading branch information
lorien committed May 1, 2018
1 parent 387ee61 commit d02ec23
Show file tree
Hide file tree
Showing 5 changed files with 70 additions and 18 deletions.
9 changes: 7 additions & 2 deletions grab/spider/network_service/threaded.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@

from six.moves.queue import Empty

from grab.error import GrabNetworkError, GrabTooManyRedirectsError
from grab.error import (
GrabNetworkError, GrabTooManyRedirectsError
)
from grab.util.misc import camel_case_to_underscore
from grab.spider.base_service import BaseService

Expand Down Expand Up @@ -83,9 +85,12 @@ def worker_callback(self, worker):
orig_exc_name = (
ex.original_exc.__class__.__name__
)
# UnicodeError: see #323
if (
is_redir_err or
orig_exc_name == 'error'):
orig_exc_name == 'error' or
orig_exc_name ==
'UnicodeError'):
ex_cls = ex
else:
ex_cls = ex.original_exc
Expand Down
15 changes: 9 additions & 6 deletions grab/transport/urllib3.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,12 +279,15 @@ def request(self):
req_url = make_str(req.url)
req_method = req.method
req.op_started = time.time()
res = pool.urlopen(req_method,
req_url,
body=req.data, timeout=timeout,
retries=retry,
headers=req.headers,
preload_content=False)
try:
res = pool.urlopen(req_method,
req_url,
body=req.data, timeout=timeout,
retries=retry,
headers=req.headers,
preload_content=False)
except UnicodeError as ex:
raise error.GrabConnectionError('GrabInvalidUrl', ex)
except exceptions.ReadTimeoutError as ex:
raise error.GrabTimeoutError('ReadTimeoutError', ex)
except exceptions.ConnectTimeoutError as ex:
Expand Down
19 changes: 17 additions & 2 deletions tests/grab_url_processing.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
# coding: utf-8
from six.moves.urllib.parse import quote

from tests.util import build_grab
from tests.util import BaseGrabTestCase
from tests.util import (
build_grab, BaseGrabTestCase, only_grab_transport
)
from grab.error import GrabConnectionError


class GrabUrlProcessingTestCase(BaseGrabTestCase):
Expand Down Expand Up @@ -54,3 +56,16 @@ def test_null_byte_url(self):
grab.go(self.server.get_url())
self.assertEqual(b'y', grab.doc.body)
self.assertEqual(grab.doc.url, quote(redirect_url, safe=':./?&'))

@only_grab_transport('urllib3')
def test_urllib3_idna_error(self):
    """A malformed URL (two URLs fused together) must surface as
    GrabConnectionError rather than an unhandled UnicodeError (#323)."""
    # Deliberately broken address: a second "http://..." is glued onto
    # the query string of the first, which breaks IDNA host encoding.
    bad_url = (
        'http://13354&altProductId=6423589&productId=6423589'
        '&altProductStoreId=13713&catalogId=10001'
        '&categoryId=28678&productStoreId=13713'
        'http://www.textbooksnow.com/webapp/wcs/stores'
        '/servlet/ProductDisplay?langId=-1&storeId='
    )
    grab = build_grab(transport='urllib3')
    # Callable form of assertRaises: equivalent to the context-manager form.
    self.assertRaises(GrabConnectionError, grab.go, bad_url)
27 changes: 27 additions & 0 deletions tests/lib_urllib3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# coding: utf-8
import six

from tests.util import BaseGrabTestCase, only_grab_transport


class GrabApiTestCase(BaseGrabTestCase):
    """Exercises raw urllib3 behaviour that the Grab transport must wrap."""

    def setUp(self):
        # Fresh test-server state before every test.
        self.server.reset()

    @only_grab_transport('urllib3')
    def test_urllib3_idna_error(self):
        """Raw urllib3 raises on a URL whose host fails IDNA encoding.

        On Python 3 the failure surfaces as UnicodeError; on Python 2
        urllib3 wraps it into NewConnectionError instead (#323).
        """
        # Imported lazily so the module loads even without urllib3 installed.
        from urllib3 import PoolManager
        from urllib3.exceptions import NewConnectionError

        # Deliberately malformed: a second URL fused onto the first one.
        bad_url = (
            'http://13354&altProductId=6423589&productId=6423589'
            '&altProductStoreId=13713&catalogId=10001'
            '&categoryId=28678&productStoreId=13713'
            'http://www.textbooksnow.com/webapp/wcs/stores'
            '/servlet/ProductDisplay?langId=-1&storeId='
        )
        expected_exc = UnicodeError if six.PY3 else NewConnectionError
        manager = PoolManager()
        with self.assertRaises(expected_exc):
            # retries=False so the bad request fails fast, once.
            manager.request('GET', bad_url, retries=False)
18 changes: 10 additions & 8 deletions tests/spider_error.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,19 @@

from grab import GrabTimeoutError, Grab
from grab.spider import Spider, Task
from tests.util import BaseGrabTestCase, build_spider, run_test_if, GLOBAL
from tests.util import (
BaseGrabTestCase, build_spider, run_test_if, GLOBAL,
)

# This URL breaks Grab's URL normalization process
# with the error "label empty or too long"
INVALID_URL = 'http://13354&altProductId=6423589&productId=6423589'\
'&altProductStoreId=13713&catalogId=10001'\
'&categoryId=28678&productStoreId=13713'\
'http://www.textbooksnow.com/webapp/wcs/stores'\
'/servlet/ProductDisplay?langId=-1&storeId='
# NOTE(review): intentionally malformed — two URLs concatenated with no
# separator, so the "host" part cannot be IDNA-encoded ("label empty or
# too long"); used to reproduce issue #323.
INVALID_URL = (
    'http://13354&altProductId=6423589&productId=6423589'
    '&altProductStoreId=13713&catalogId=10001'
    '&categoryId=28678&productStoreId=13713'
    'http://www.textbooksnow.com/webapp/wcs/stores'
    '/servlet/ProductDisplay?langId=-1&storeId='
)


class SpiderErrorTestCase(BaseGrabTestCase):
Expand Down Expand Up @@ -93,7 +97,6 @@ def task_initial(self, grab, task):
yield Task('more', url=server.get_url())

def task_more(self, grab, task):
#print(grab.doc.url)
grab.doc('//div').text()

bot = build_spider(SimpleSpider)
Expand Down Expand Up @@ -164,7 +167,6 @@ def task_page(self, grab, unused_task):

bot = build_spider(SimpleSpider)
bot.run()
print(bot.stat.counters)
self.assertTrue('error:grab-timeout-error' in bot.stat.counters)

@run_test_if(lambda: (GLOBAL['network_service'] == 'threaded'
Expand Down

0 comments on commit d02ec23

Please sign in to comment.