Skip to content

Commit

Permalink
Fix #323: urllib3 transport fails with UnicodeError on some invalid URLs
Browse files Browse the repository at this point in the history
  • Loading branch information
lorien committed May 1, 2018
1 parent 387ee61 commit d02ec23
Show file tree
Hide file tree
Showing 5 changed files with 70 additions and 18 deletions.
9 changes: 7 additions & 2 deletions grab/spider/network_service/threaded.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@

from six.moves.queue import Empty

from grab.error import GrabNetworkError, GrabTooManyRedirectsError
from grab.error import (
GrabNetworkError, GrabTooManyRedirectsError
)
from grab.util.misc import camel_case_to_underscore
from grab.spider.base_service import BaseService

Expand Down Expand Up @@ -83,9 +85,12 @@ def worker_callback(self, worker):
orig_exc_name = (
ex.original_exc.__class__.__name__
)
# UnicodeError: see #323
if (
is_redir_err or
orig_exc_name == 'error'):
orig_exc_name == 'error' or
orig_exc_name ==
'UnicodeError'):
ex_cls = ex
else:
ex_cls = ex.original_exc
Expand Down
15 changes: 9 additions & 6 deletions grab/transport/urllib3.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,12 +279,15 @@ def request(self):
req_url = make_str(req.url)
req_method = req.method
req.op_started = time.time()
res = pool.urlopen(req_method,
req_url,
body=req.data, timeout=timeout,
retries=retry,
headers=req.headers,
preload_content=False)
try:
res = pool.urlopen(req_method,
req_url,
body=req.data, timeout=timeout,
retries=retry,
headers=req.headers,
preload_content=False)
except UnicodeError as ex:
raise error.GrabConnectionError('GrabInvalidUrl', ex)
except exceptions.ReadTimeoutError as ex:
raise error.GrabTimeoutError('ReadTimeoutError', ex)
except exceptions.ConnectTimeoutError as ex:
Expand Down
19 changes: 17 additions & 2 deletions tests/grab_url_processing.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
# coding: utf-8
from six.moves.urllib.parse import quote

from tests.util import build_grab
from tests.util import BaseGrabTestCase
from tests.util import (
build_grab, BaseGrabTestCase, only_grab_transport
)
from grab.error import GrabConnectionError


class GrabUrlProcessingTestCase(BaseGrabTestCase):
Expand Down Expand Up @@ -54,3 +56,16 @@ def test_null_byte_url(self):
grab.go(self.server.get_url())
self.assertEqual(b'y', grab.doc.body)
self.assertEqual(grab.doc.url, quote(redirect_url, safe=':./?&'))

@only_grab_transport('urllib3')
def test_urllib3_idna_error(self):
    """A malformed URL (two URLs fused together) must surface as
    GrabConnectionError rather than an unhandled UnicodeError (#323)."""
    # Deliberately broken address: a second "http://..." is glued onto
    # the query string of the first, which breaks IDNA host encoding.
    bad_url = (
        'http://13354&altProductId=6423589&productId=6423589'
        '&altProductStoreId=13713&catalogId=10001'
        '&categoryId=28678&productStoreId=13713'
        'http://www.textbooksnow.com/webapp/wcs/stores'
        '/servlet/ProductDisplay?langId=-1&storeId='
    )
    grab = build_grab(transport='urllib3')
    # Callable form of assertRaises: equivalent to the context-manager form.
    self.assertRaises(GrabConnectionError, grab.go, bad_url)
27 changes: 27 additions & 0 deletions tests/lib_urllib3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# coding: utf-8
import six

from tests.util import BaseGrabTestCase, only_grab_transport


class GrabApiTestCase(BaseGrabTestCase):
    """Exercises raw urllib3 behaviour that the Grab transport must wrap."""

    def setUp(self):
        # Fresh test-server state before every test.
        self.server.reset()

    @only_grab_transport('urllib3')
    def test_urllib3_idna_error(self):
        """Raw urllib3 raises on a URL whose host fails IDNA encoding.

        On Python 3 the failure surfaces as UnicodeError; on Python 2
        urllib3 wraps it into NewConnectionError instead (#323).
        """
        # Imported lazily so the module loads even without urllib3 installed.
        from urllib3 import PoolManager
        from urllib3.exceptions import NewConnectionError

        # Deliberately malformed: a second URL fused onto the first one.
        bad_url = (
            'http://13354&altProductId=6423589&productId=6423589'
            '&altProductStoreId=13713&catalogId=10001'
            '&categoryId=28678&productStoreId=13713'
            'http://www.textbooksnow.com/webapp/wcs/stores'
            '/servlet/ProductDisplay?langId=-1&storeId='
        )
        expected_exc = UnicodeError if six.PY3 else NewConnectionError
        manager = PoolManager()
        with self.assertRaises(expected_exc):
            # retries=False so the bad request fails fast, once.
            manager.request('GET', bad_url, retries=False)
18 changes: 10 additions & 8 deletions tests/spider_error.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,19 @@

from grab import GrabTimeoutError, Grab
from grab.spider import Spider, Task
from tests.util import BaseGrabTestCase, build_spider, run_test_if, GLOBAL
from tests.util import (
BaseGrabTestCase, build_spider, run_test_if, GLOBAL,
)

# This URL breaks Grab's URL normalization process
# with the error "label empty or too long"
INVALID_URL = 'http://13354&altProductId=6423589&productId=6423589'\
'&altProductStoreId=13713&catalogId=10001'\
'&categoryId=28678&productStoreId=13713'\
'http://www.textbooksnow.com/webapp/wcs/stores'\
'/servlet/ProductDisplay?langId=-1&storeId='
# NOTE(review): intentionally malformed — two URLs concatenated with no
# separator, so the "host" part cannot be IDNA-encoded ("label empty or
# too long"); used to reproduce issue #323.
INVALID_URL = (
    'http://13354&altProductId=6423589&productId=6423589'
    '&altProductStoreId=13713&catalogId=10001'
    '&categoryId=28678&productStoreId=13713'
    'http://www.textbooksnow.com/webapp/wcs/stores'
    '/servlet/ProductDisplay?langId=-1&storeId='
)


class SpiderErrorTestCase(BaseGrabTestCase):
Expand Down Expand Up @@ -93,7 +97,6 @@ def task_initial(self, grab, task):
yield Task('more', url=server.get_url())

def task_more(self, grab, task):
#print(grab.doc.url)
grab.doc('//div').text()

bot = build_spider(SimpleSpider)
Expand Down Expand Up @@ -164,7 +167,6 @@ def task_page(self, grab, unused_task):

bot = build_spider(SimpleSpider)
bot.run()
print(bot.stat.counters)
self.assertTrue('error:grab-timeout-error' in bot.stat.counters)

@run_test_if(lambda: (GLOBAL['network_service'] == 'threaded'
Expand Down

0 comments on commit d02ec23

Please sign in to comment.