Skip to content

Commit

Permalink
Merge pull request #229 from scrapinghub/batched-create-requests
Browse files Browse the repository at this point in the history
batch oriented request creation in crawling strategy
  • Loading branch information
sibiryakov authored Nov 29, 2016
2 parents 9952f0f + 79b20cc commit 9e94fc9
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 6 deletions.
13 changes: 11 additions & 2 deletions frontera/worker/strategies/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,9 @@ def schedule(self, request, score=1.0, dont_queue=False):

def create_request(self, url, method=b'GET', headers=None, cookies=None, meta=None, body=b''):
"""
Creates request with specified fields, with state fetched from backend.
Creates request with specified fields. This method only creates the request; it does not fetch
its state from storage. Use self.refresh_states on a batch of requests to get their states
from storage.
:param url: str
:param method: str
Expand All @@ -110,5 +112,12 @@ def create_request(self, url, method=b'GET', headers=None, cookies=None, meta=No
"""
r = Request(url, method=method, headers=headers, cookies=cookies, meta=meta, body=body)
self.url_mw._add_fingerprint(r)
self._states_context.refresh_and_keep(r)
return r

def refresh_states(self, requests):
    """
    Refresh the states of a batch of requests.

    The whole batch is handed to the states context's ``refresh_and_keep``
    in a single call; nothing is returned.

    :param requests: list(:class:`Request <frontera.core.models.Request>`)
    """
    states_context = self._states_context
    states_context.refresh_and_keep(requests)
9 changes: 5 additions & 4 deletions frontera/worker/strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,11 @@ def fetch(self):
self._states.fetch(self._fingerprints)
self._fingerprints.clear()

def refresh_and_keep(self, request):
    """
    Refresh the state of a single request and remember the request.

    Fetches by the request's ``b'fingerprint'`` meta value, applies the
    states to the request, then appends it to ``self._requests``.

    :param request: object exposing a ``meta`` mapping with a ``b'fingerprint'`` key
    """
    fingerprint = request.meta[b'fingerprint']
    states = self._states
    states.fetch([fingerprint])
    states.set_states(request)
    self._requests.append(request)
def refresh_and_keep(self, requests):
    """
    Refresh the states of a batch of requests and remember the requests.

    Steps, in order: register the batch via ``to_fetch``, run ``fetch``,
    apply the states to the batch via ``set_states``, and finally extend
    ``self._requests`` with the batch.

    :param requests: batch of requests (iterated by ``extend``)
    """
    # Queue the whole batch first so the fetch below happens once per batch.
    self.to_fetch(requests)
    self.fetch()
    states = self._states
    states.set_states(requests)
    kept = self._requests
    kept.extend(requests)

def release(self):
self._states.update_cache(self._requests)
Expand Down
58 changes: 58 additions & 0 deletions tests/test_strategy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# -*- coding: utf-8 -*-
from frontera.worker.strategies import BaseCrawlingStrategy
from frontera.worker.strategy import StatesContext
from frontera.settings import Settings
from tests.mocks.frontier_manager import FakeFrontierManager

from frontera.contrib.backends.memory import MemoryStates
from frontera.core.components import States


class TestingCrawlingStrategy(BaseCrawlingStrategy):
    """Minimal concrete strategy used by the tests; every callback is a no-op."""

    # The class name starts with "Test", which matches pytest's default class
    # discovery pattern. This is a helper, not a test case, so opt out of
    # collection explicitly.
    __test__ = False

    def add_seeds(self, seeds):
        """Ignore the seeds."""
        pass

    def page_crawled(self, response):
        """Ignore the crawled response."""
        pass

    def page_error(self, request, error):
        """Ignore the failed request and its error."""
        pass

    def links_extracted(self, request, links):
        """Ignore the extracted links."""
        pass


class MessageBusStream(object):
    """No-op stand-in for a message bus stream: accepts calls and discards them."""

    def send(self, request, score=1.0, dont_queue=False):
        """Accept a request with its score and queueing flag; do nothing."""
        return None

    def flush(self):
        """Do nothing; there is nothing buffered."""
        return None


class TestCrawlingStrategy(object):
    """Tests for request creation and batched state refresh on a crawling strategy."""

    def strategy(self):
        """Build a TestingCrawlingStrategy from a fake manager, a stub stream and memory states."""
        manager = FakeFrontierManager(Settings())
        stream = MessageBusStream()
        context = StatesContext(MemoryStates(10))
        return TestingCrawlingStrategy.from_worker(manager, stream, context)

    def test_create_request(self):
        """A created request carries the expected fingerprint in its meta."""
        strategy = self.strategy()
        request = strategy.create_request("http://test.com/someurl")
        expected = b'955ac04f1b1a96de60a5139ad90c80be87822159'
        assert request.meta[b'fingerprint'] == expected

    def test_states_refresh(self):
        """refresh_states gives a new request the state cached for the same URL."""
        strategy = self.strategy()
        storage = strategy._states_context._states
        url = "http://test.com/someurl"

        # Put a CRAWLED state into the states cache for this URL.
        crawled = strategy.create_request(url)
        crawled.meta[b'state'] = States.CRAWLED
        storage.update_cache(crawled)

        # A second request for the same URL should get that state on refresh.
        fresh = strategy.create_request(url)
        strategy.refresh_states([fresh])
        assert fresh.meta[b'state'] == crawled.meta[b'state']
        assert fresh.meta[b'state'] == States.CRAWLED

0 comments on commit 9e94fc9

Please sign in to comment.