Skip to content

Commit

Permalink
Merge pull request #310 from lorien/issue_267_threaded_redirects
Browse files Browse the repository at this point in the history
Fix #267: normalize handling of too many redirect error
  • Loading branch information
lorien authored Apr 18, 2018
2 parents 777ac71 + f907b53 commit 5918759
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 14 deletions.
6 changes: 3 additions & 3 deletions grab/spider/network_service/multicurl.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from grab.spider.base_service import BaseService


ERROR_TOO_MANY_REFRESH_REDIRECTS = -2
ERROR_TOO_MANY_REDIRECTS = -2
# Source: https://curl.haxx.se/libcurl/c/libcurl-errors.html
ERRNUM_PYCURL_TAG = {
0: 'E_OK',
Expand Down Expand Up @@ -102,7 +102,7 @@
93: 'E_RECURSIVE_API_CALL',
}
ERRNUM_TAG = {
ERROR_TOO_MANY_REFRESH_REDIRECTS: 'too-many-refresh-redirects',
ERROR_TOO_MANY_REDIRECTS: 'too-many-redirects',
}
for code, tag in ERRNUM_PYCURL_TAG.items():
assert tag.startswith('E_')
Expand Down Expand Up @@ -299,7 +299,7 @@ def iterate_results(self):
self.network_op_lock.acquire()
grab.process_request_result()
except GrabTooManyRedirectsError:
ecode = ERROR_TOO_MANY_REFRESH_REDIRECTS
ecode = ERROR_TOO_MANY_REDIRECTS
emsg = 'Too many meta refresh redirects'
is_ok = False
finally:
Expand Down
30 changes: 19 additions & 11 deletions grab/spider/network_service/threaded.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,10 @@

from six.moves.queue import Empty

from grab.error import GrabNetworkError
from grab.error import GrabNetworkError, GrabTooManyRedirectsError
from grab.util.misc import camel_case_to_underscore
from grab.spider.base_service import BaseService

ERROR_TOO_MANY_REFRESH_REDIRECTS = -2
ERROR_ABBR = {
ERROR_TOO_MANY_REFRESH_REDIRECTS: 'too-many-refresh-redirects',
}


def make_class_abbr(name):
val = camel_case_to_underscore(name)
Expand Down Expand Up @@ -79,17 +74,30 @@ def worker_callback(self, worker):
}
try:
grab.request()
except GrabNetworkError as ex:
if (ex.original_exc.__class__.__name__
== 'error'):
except (
GrabNetworkError,
GrabTooManyRedirectsError) as ex:
is_redir_err = isinstance(
ex, GrabTooManyRedirectsError
)
orig_exc_name = (
ex.original_exc.__class__.__name__
)
if (
is_redir_err or
orig_exc_name == 'error'):
ex_cls = ex
else:
ex_cls = ex.original_exc
result.update({
'ok': False,
'exc': ex,
'error_abbr': make_class_abbr(
ex_cls.__class__.__name__
'error_abbr': (
'too-many-redirects'
if is_redir_err
else make_class_abbr(
ex_cls.__class__.__name__
)
),
})
(self.spider.task_dispatcher
Expand Down
30 changes: 30 additions & 0 deletions tests/spider_redirect.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from grab.spider import Spider, Task

from tests.util import BaseGrabTestCase, build_spider


class BasicSpiderTestCase(BaseGrabTestCase):
    """Spider behaviour when the server answers with an endless redirect."""

    def setUp(self):
        # Start every test from a clean test-server state.
        self.server.reset()

    def test_too_many_redirects(self):
        """A self-redirecting URL yields one 'too-many-redirects' error."""
        class TestSpider(Spider):
            def task_page(self, unused_grab, unused_task):
                pass

        bot = build_spider(TestSpider)
        bot.setup_queue()
        bot.add_task(Task('page', url=self.server.get_url()))

        # Every response is a 302 whose Location points back at the same
        # URL, so following redirects can never terminate.
        self.server.response['headers'] = [
            ('Location', self.server.get_url()),
        ]
        self.server.response['code'] = 302
        bot.run()

        # The collections are attached to the failure message instead of
        # being printed unconditionally, so passing runs stay quiet.
        self.assertEqual(
            1, len(bot.stat.collections['network-count-rejected']),
            msg='collections: %r' % (bot.stat.collections,),
        )
        # assertIn reports the missing key and the container on failure,
        # unlike assertTrue(x in y), which only says "False is not true".
        self.assertIn('error:too-many-redirects', bot.stat.counters)

0 comments on commit 5918759

Please sign in to comment.