Skip to content

Commit

Permalink
Merge pull request #476 from scrapy/197-spiderqueue
Browse files Browse the repository at this point in the history
feat: Add spiderqueue configuration option
  • Loading branch information
jpmckinney authored Mar 10, 2023
2 parents d7605e5 + 040cc67 commit 538357c
Show file tree
Hide file tree
Showing 7 changed files with 45 additions and 18 deletions.
10 changes: 10 additions & 0 deletions docs/config.rst
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,16 @@ and services.

For more info see `Twisted Application Framework`_

.. _spiderqueue:

spiderqueue
-----------

The scheduler enqueues crawls in per-project spider queues, from which the poller picks the next crawl to run.
You can define a custom spider queue class that implements the ISpiderQueue interface.

Defaults to ``scrapyd.spiderqueue.SqliteSpiderQueue``.

.. _webroot:

webroot
Expand Down
15 changes: 15 additions & 0 deletions docs/news.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,21 @@
Release notes
=============

Unreleased
----------

Added
~~~~~

- Add ``spiderqueue`` configuration option for custom spider queue.

Changed
~~~~~~~

- ``scrapyd.spiderqueue.SqliteSpiderQueue`` is initialized with a ``scrapyd.config.Config`` object and a project name, rather than a SQLite connection string (i.e. database file path).
- If ``dbs_dir`` is set to ``:memory:`` or to a URL, it is passed through to ``scrapyd.jobstorage.SqliteJobStorage`` and ``scrapyd.spiderqueue.SqliteSpiderQueue`` without modification and without creating a directory.
- ``scrapyd.utils.get_spider_queues`` defers the creation of the ``dbs_dir`` directory to the spider queue implementation.

1.4.1 (2023-02-10)
------------------

Expand Down
1 change: 1 addition & 0 deletions scrapyd/default_scrapyd.conf
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ runner = scrapyd.runner
jobstorage = scrapyd.jobstorage.MemoryJobStorage
application = scrapyd.app.application
launcher = scrapyd.launcher.Launcher
spiderqueue = scrapyd.spiderqueue.SqliteSpiderQueue
webroot = scrapyd.website.Root
eggstorage = scrapyd.eggstorage.FilesystemEggStorage

Expand Down
8 changes: 2 additions & 6 deletions scrapyd/jobstorage.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import os
from datetime import datetime

from zope.interface import implementer

from scrapyd.interfaces import IJobStorage
from scrapyd.sqlite import SqliteFinishedJobs
from scrapyd.utils import sqlite_connection_string


def job_log_url(job):
Expand Down Expand Up @@ -50,11 +50,7 @@ def __iter__(self):
class SqliteJobStorage(object):

def __init__(self, config):
    """Initialize finished-job storage from a scrapyd Config object.

    The backing SQLite database lives at ``<dbs_dir>/jobs.db`` (or is the
    configured connection string, resolved by ``sqlite_connection_string``).
    """
    # NOTE: directory creation is delegated to sqlite_connection_string,
    # so no os.makedirs is needed here.
    self.jstorage = SqliteFinishedJobs(sqlite_connection_string(config, 'jobs'), "finished_jobs")
    # How many finished jobs to retain before trimming (``finished_to_keep`` option).
    self.finished_to_keep = config.getint('finished_to_keep', 100)

def add(self, job):
Expand Down
5 changes: 3 additions & 2 deletions scrapyd/spiderqueue.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,14 @@

from scrapyd.interfaces import ISpiderQueue
from scrapyd.sqlite import JsonSqlitePriorityQueue
from scrapyd.utils import sqlite_connection_string


@implementer(ISpiderQueue)
class SqliteSpiderQueue(object):

def __init__(self, config, project, table='spider_queue'):
    """Initialize a per-project SQLite-backed spider queue.

    :param config: a ``scrapyd.config.Config`` object, used to resolve the
        database location via ``sqlite_connection_string``.
    :param project: the project name; determines the database file name.
    :param table: the table in which queued messages are stored.
    """
    self.q = JsonSqlitePriorityQueue(sqlite_connection_string(config, project), table)

def add(self, name, priority=0.0, **spider_args):
d = spider_args.copy()
Expand Down
3 changes: 2 additions & 1 deletion scrapyd/tests/test_spiderqueue.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from zope.interface.verify import verifyObject

from scrapyd import spiderqueue
from scrapyd.config import Config
from scrapyd.interfaces import ISpiderQueue


Expand All @@ -11,7 +12,7 @@ class SpiderQueueTest(unittest.TestCase):
"""

def setUp(self):
self.q = spiderqueue.SqliteSpiderQueue(':memory:')
self.q = spiderqueue.SqliteSpiderQueue(Config(values={'dbs_dir': ':memory:'}), 'quotesbot')
self.name = 'spider1'
self.priority = 5
self.args = {
Expand Down
21 changes: 12 additions & 9 deletions scrapyd/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
import os
import sys
from subprocess import PIPE, Popen
from urllib.parse import urlsplit

from packaging.version import InvalidVersion, Version
from scrapy.utils.misc import load_object
from twisted.web import resource

from scrapyd.config import Config
from scrapyd.spiderqueue import SqliteSpiderQueue
from scrapyd.sqlite import JsonSqliteDict


Expand Down Expand Up @@ -55,14 +55,17 @@ def __setitem__(self, key, value):

def get_spider_queues(config):
    """Return a dict of Spider Queues keyed by project name.

    The queue class is configurable via the ``spiderqueue`` option and must
    implement the ``ISpiderQueue`` interface; each instance is constructed
    with the config object and the project name. Creation of any on-disk
    storage (e.g. ``dbs_dir``) is deferred to the queue implementation.
    """
    spiderqueue = load_object(config.get('spiderqueue', 'scrapyd.spiderqueue.SqliteSpiderQueue'))
    return {project: spiderqueue(config, project) for project in get_project_list(config)}


def sqlite_connection_string(config, database):
    """Return the SQLite connection string for *database* under ``dbs_dir``.

    If ``dbs_dir`` is ``:memory:`` or a URL (i.e. it has a scheme), it is
    returned unchanged. Otherwise ``dbs_dir`` is created if missing and the
    path ``<dbs_dir>/<database>.db`` is returned.
    """
    dbs_dir = config.get('dbs_dir', 'dbs')
    # A Windows path like ``C:\dbs`` parses with a one-letter URL scheme
    # ("c"), so exclude anything carrying a drive prefix from the
    # pass-through branch.
    if dbs_dir == ':memory:' or (urlsplit(dbs_dir).scheme and not os.path.splitdrive(dbs_dir)[0]):
        return dbs_dir
    if not os.path.exists(dbs_dir):
        os.makedirs(dbs_dir)
    return os.path.join(dbs_dir, f'{database}.db')


def get_project_list(config):
Expand Down

0 comments on commit 538357c

Please sign in to comment.