Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add spiderqueue configuration option #476

Merged
merged 3 commits into from
Mar 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions docs/config.rst
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,16 @@ and services.

For more info see `Twisted Application Framework`_

.. _spiderqueue:

spiderqueue
-----------

The scheduler enqueues crawls in per-project spider queues, from which the poller picks them up.
You can define a custom spider queue class that implements the ISpiderQueue interface.

Defaults to ``scrapyd.spiderqueue.SqliteSpiderQueue``.

.. _webroot:

webroot
Expand Down
15 changes: 15 additions & 0 deletions docs/news.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,21 @@
Release notes
=============

Unreleased
----------

Added
~~~~~

- Add ``spiderqueue`` configuration option for custom spider queue.

Changed
~~~~~~~

- ``scrapyd.spiderqueue.SqliteSpiderQueue`` is initialized with a ``scrapyd.config.Config`` object and a project name, rather than a SQLite connection string (i.e. database file path).
- If ``dbs_dir`` is set to ``:memory:`` or to a URL, it is passed through — without modification and without creating a directory — to ``scrapyd.jobstorage.SqliteJobStorage`` and ``scrapyd.spiderqueue.SqliteSpiderQueue``.
- ``scrapyd.utils.get_spider_queues`` defers the creation of the ``dbs_dir`` directory to the spider queue implementation.

1.4.1 (2023-02-10)
------------------

Expand Down
1 change: 1 addition & 0 deletions scrapyd/default_scrapyd.conf
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ runner = scrapyd.runner
jobstorage = scrapyd.jobstorage.MemoryJobStorage
application = scrapyd.app.application
launcher = scrapyd.launcher.Launcher
spiderqueue = scrapyd.spiderqueue.SqliteSpiderQueue
webroot = scrapyd.website.Root
eggstorage = scrapyd.eggstorage.FilesystemEggStorage

Expand Down
8 changes: 2 additions & 6 deletions scrapyd/jobstorage.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import os
from datetime import datetime

from zope.interface import implementer

from scrapyd.interfaces import IJobStorage
from scrapyd.sqlite import SqliteFinishedJobs
from scrapyd.utils import sqlite_connection_string


def job_log_url(job):
Expand Down Expand Up @@ -50,11 +50,7 @@ def __iter__(self):
class SqliteJobStorage(object):

def __init__(self, config):
    """Open the finished-jobs storage backed by SQLite.

    :param config: a ``scrapyd.config.Config`` object; supplies ``dbs_dir``
        (via ``sqlite_connection_string``) and ``finished_to_keep``.
    """
    # 'jobs' names the jobs.db file under dbs_dir; for :memory: or URL
    # values of dbs_dir the connection string is passed through unchanged.
    self.jstorage = SqliteFinishedJobs(sqlite_connection_string(config, 'jobs'), "finished_jobs")
    # Maximum number of finished jobs to retain before pruning.
    self.finished_to_keep = config.getint('finished_to_keep', 100)

def add(self, job):
Expand Down
5 changes: 3 additions & 2 deletions scrapyd/spiderqueue.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,14 @@

from scrapyd.interfaces import ISpiderQueue
from scrapyd.sqlite import JsonSqlitePriorityQueue
from scrapyd.utils import sqlite_connection_string


@implementer(ISpiderQueue)
class SqliteSpiderQueue(object):

def __init__(self, config, project, table='spider_queue'):
    """Create a priority queue backed by the project's SQLite database.

    :param config: a ``scrapyd.config.Config`` object (supplies ``dbs_dir``)
    :param project: project name; names the per-project database file
    :param table: name of the SQLite table holding queued messages
    """
    self.q = JsonSqlitePriorityQueue(sqlite_connection_string(config, project), table)

def add(self, name, priority=0.0, **spider_args):
d = spider_args.copy()
Expand Down
3 changes: 2 additions & 1 deletion scrapyd/tests/test_spiderqueue.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from zope.interface.verify import verifyObject

from scrapyd import spiderqueue
from scrapyd.config import Config
from scrapyd.interfaces import ISpiderQueue


Expand All @@ -11,7 +12,7 @@ class SpiderQueueTest(unittest.TestCase):
"""

def setUp(self):
    # An in-memory database keeps the test isolated from the filesystem;
    # the project name is arbitrary when dbs_dir is ':memory:'.
    self.q = spiderqueue.SqliteSpiderQueue(Config(values={'dbs_dir': ':memory:'}), 'quotesbot')
self.name = 'spider1'
self.priority = 5
self.args = {
Expand Down
21 changes: 12 additions & 9 deletions scrapyd/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
import os
import sys
from subprocess import PIPE, Popen
from urllib.parse import urlsplit

from packaging.version import InvalidVersion, Version
from scrapy.utils.misc import load_object
from twisted.web import resource

from scrapyd.config import Config
from scrapyd.spiderqueue import SqliteSpiderQueue
from scrapyd.sqlite import JsonSqliteDict


Expand Down Expand Up @@ -55,14 +55,17 @@ def __setitem__(self, key, value):

def get_spider_queues(config):
    """Return a dict of spider queues keyed by project name.

    The queue class is loaded from the ``spiderqueue`` setting (default
    ``scrapyd.spiderqueue.SqliteSpiderQueue``). Each queue is constructed
    with the config and its project name, so the implementation decides
    where and how to store the queue (e.g. creating ``dbs_dir`` lazily).
    """
    spiderqueue = load_object(config.get('spiderqueue', 'scrapyd.spiderqueue.SqliteSpiderQueue'))
    return {project: spiderqueue(config, project) for project in get_project_list(config)}


def sqlite_connection_string(config, database):
    """Return the SQLite connection string for *database*.

    ``:memory:`` and URL-style values of the ``dbs_dir`` setting are
    returned as-is; otherwise ``dbs_dir`` is created on demand and the
    path of a ``<database>.db`` file inside it is returned.
    """
    dbs_dir = config.get('dbs_dir', 'dbs')
    # A value with a URL scheme (e.g. "sqlite:///...") or the special
    # ":memory:" marker is passed through untouched - nothing to create.
    # NOTE(review): a Windows drive path like "C:\dbs" also yields a
    # urlsplit scheme ("c") and would be passed through - confirm intended.
    passthrough = dbs_dir == ':memory:' or bool(urlsplit(dbs_dir).scheme)
    if passthrough:
        return dbs_dir
    if not os.path.exists(dbs_dir):
        os.makedirs(dbs_dir)
    return os.path.join(dbs_dir, '%s.db' % database)


def get_project_list(config):
Expand Down