optimize batch fetch method to boost throughput
The previous start URL fetching method only pulled new requests from Redis
when the spider went idle, which prevented full concurrency. This patch
optimizes it by using the request_left_downloader signal to refill the
scheduler whenever a download slot is freed.

Signed-off-by: Tianyue Ren <rentianyue-jk@360shuke.com>
NiuBlibing committed Feb 8, 2023
1 parent 40c5e28 commit af59239
Showing 1 changed file with 13 additions and 1 deletion.
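
In Scrapy 2.0+, the request_left_downloader signal fires each time a request
leaves the downloader, i.e. whenever a concurrency slot is freed. The following
minimal sketch (not part of this patch) shows a handler attached to that
signal; the spider name, redis_key, and handler name are hypothetical:

from scrapy import signals
from scrapy_redis.spiders import RedisSpider


class DemoSpider(RedisSpider):
    name = "demo"
    redis_key = "demo:start_urls"  # Redis key the spider pops start URLs from

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        # With this patch, setup_redis() connects fill_requests_queue to the
        # same signal; this extra handler only logs it for illustration.
        crawler.signals.connect(
            spider.request_left, signal=signals.request_left_downloader
        )
        return spider

    def request_left(self, request, spider):
        # Called every time a request leaves the downloader.
        self.logger.debug("Downloader slot freed by %s", request.url)

    def parse(self, response):
        yield {"url": response.url}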
src/scrapy_redis/spiders.py

@@ -87,6 +87,7 @@ def setup_redis(self, crawler=None):
         # The idle signal is called when the spider has no requests left,
         # that's when we will schedule new requests from redis queue
         crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
+        crawler.signals.connect(self.fill_requests_queue, signal=signals.request_left_downloader)
 
     def pop_list_queue(self, redis_key, batch_size):
         with self.server.pipeline() as pipe:
@@ -102,11 +103,22 @@ def pop_priority_queue(self, redis_key, batch_size):
             datas, _ = pipe.execute()
         return datas
 
+    def fill_requests_queue(self):
+        need_size = self.crawler.engine.downloader.total_concurrency - \
+            len(self.crawler.engine.downloader.active) - len(self.crawler.engine.slot.scheduler.queue)
+        if need_size > 0:
+            self.logger.debug("Need to fill %i request(s)", need_size)
+            for req in self.__next_requests(need_size):
+                self.crawler.engine.crawl(req, spider=self)
+
     def next_requests(self):
+        return self.__next_requests(self.redis_batch_size)
+
+    def __next_requests(self, redis_batch_size):
         """Returns a request to be scheduled or none."""
         # XXX: Do we need to use a timeout here?
         found = 0
-        datas = self.fetch_data(self.redis_key, self.redis_batch_size)
+        datas = self.fetch_data(self.redis_key, redis_batch_size)
         for data in datas:
             reqs = self.make_request_from_data(data)
             if isinstance(reqs, Iterable):
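
A worked example of the backfill arithmetic in fill_requests_queue, with
hypothetical numbers:

# Hypothetical engine snapshot: CONCURRENT_REQUESTS = 16, so
# downloader.total_concurrency == 16; 10 requests are in flight and
# 2 are already waiting in the scheduler queue.
total_concurrency = 16
active_downloads = 10
scheduler_queue = 2
need_size = total_concurrency - active_downloads - scheduler_queue
print(need_size)  # 4 -> __next_requests(4) pops four more URLs from Redis

Because the top-up happens on every request_left_downloader signal rather than
only on spider_idle, the downloader can stay saturated instead of draining
completely before the next batch is fetched from Redis.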
