Skip to content

Commit

Permalink
Merge pull request #17 from xaqq/check-spider
Browse files Browse the repository at this point in the history
check if a spider exists before scheduling it (with sqlite cache)
  • Loading branch information
jayzeng committed Jul 10, 2014
2 parents 5988429 + 288afef commit b9a38f6
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 2 deletions.
33 changes: 32 additions & 1 deletion scrapyd/utils.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,35 @@
import sys
import os
from .sqlite import JsonSqliteDict
from subprocess import Popen, PIPE
from ConfigParser import NoSectionError

from scrapyd.spiderqueue import SqliteSpiderQueue
from scrapy.utils.python import stringify_dict, unicode_to_str
from scrapyd.config import Config

class UtilsCache:
    """Lazy, sqlite-backed cache with deferred per-project invalidation.

    Projects are queued for invalidation via :meth:`invalid_cache`; stale
    entries are actually purged on the next ``__getitem__`` call.
    """

    # Names of projects whose cached entries must be invalidated before
    # the next lookup. Class-level, so it is shared by every instance.
    invalid_cached_projects = []

    def __init__(self):
        # Persistent key/value store (JSON values in a sqlite table).
        self.cache_manager = JsonSqliteDict(table="utils_cache_manager")

    @staticmethod
    def invalid_cache(project):
        """Mark the cached entry of *project* (by name) as stale.

        The entry is removed lazily, on the next ``__getitem__`` call on
        any instance.
        """
        UtilsCache.invalid_cached_projects.append(project)

    def __getitem__(self, key):
        """Return the cached value for *key*, purging stale entries first.

        Raises:
            KeyError: if *key* is not cached (or was just invalidated).
        """
        for project in UtilsCache.invalid_cached_projects:
            # EAFP: a project may be queued more than once, or may have
            # never been cached at all — both make the delete a no-op.
            try:
                del self.cache_manager[project]
            except KeyError:
                pass
        del UtilsCache.invalid_cached_projects[:]
        return self.cache_manager[key]

    def __setitem__(self, key, value):
        # Cache *value* under *key*; overwrites any existing entry.
        self.cache_manager[key] = value

def get_spider_queues(config):
"""Return a dict of Spider Quees keyed by project name"""
dbsdir = config.get('dbs_dir', 'dbs')
Expand Down Expand Up @@ -51,6 +74,12 @@ def get_crawl_args(message):

def get_spider_list(project, runner=None, pythonpath=None):
"""Return the spider list from the given project, using the given runner"""
if "cache" not in get_spider_list.__dict__:
get_spider_list.cache = UtilsCache()
try:
return get_spider_list.cache[project]
except KeyError:
pass
if runner is None:
runner = Config().get('runner')
env = os.environ.copy()
Expand All @@ -63,5 +92,7 @@ def get_spider_list(project, runner=None, pythonpath=None):
if proc.returncode:
msg = err or out or 'unknown error'
raise RuntimeError(msg.splitlines()[-1])
return out.splitlines()
tmp = out.splitlines();
get_spider_list.cache[project] = tmp
return tmp

6 changes: 5 additions & 1 deletion scrapyd/webservice.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from twisted.python import log

from scrapy.utils.txweb import JsonResource
from .utils import get_spider_list
from .utils import get_spider_list, UtilsCache

class WsResource(JsonResource):

Expand All @@ -31,6 +31,9 @@ def render_POST(self, txrequest):
args = dict((k, v[0]) for k, v in txrequest.args.items())
project = args.pop('project')
spider = args.pop('spider')
spiders = get_spider_list(project)
if not spider in spiders:
return {"status": "error", "message": "spider '%s' not found" % spider}
args['settings'] = settings
jobid = uuid.uuid1().hex
args['_job'] = jobid
Expand Down Expand Up @@ -65,6 +68,7 @@ def render_POST(self, txrequest):
self.root.eggstorage.put(eggf, project, version)
spiders = get_spider_list(project)
self.root.update_projects()
UtilsCache.invalid_cache(project)
return {"status": "ok", "project": project, "version": version, \
"spiders": len(spiders)}

Expand Down

0 comments on commit b9a38f6

Please sign in to comment.