Skip to content

Commit

Permalink
Some simplifications to Scrapyd architecture and internals:
Browse files Browse the repository at this point in the history
- launcher no longer knows about egg storage
- removed get_spider_list_from_eggfile() function and replaced it with the
  simpler get_spider_list(), which doesn't receive an egg file as argument
- changed "egg runner" name to just "runner" to reflect the fact that it
  doesn't necessarily run eggs (though it does in the default case)

--HG--
rename : scrapyd/eggrunner.py => scrapyd/runner.py
  • Loading branch information
pablohoffman committed Dec 27, 2010
1 parent 1d85764 commit 9c06c26
Show file tree
Hide file tree
Showing 13 changed files with 111 additions and 108 deletions.
2 changes: 1 addition & 1 deletion scrapyd/default_scrapyd.conf
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ max_proc = 0
max_proc_per_cpu = 4
http_port = 6800
debug = off
egg_runner = scrapyd.eggrunner
runner = scrapyd.runner
application = scrapyd.app.application
9 changes: 0 additions & 9 deletions scrapyd/eggrunner.py

This file was deleted.

22 changes: 1 addition & 21 deletions scrapyd/eggutils.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,4 @@
from __future__ import with_statement

import os, sys, shutil, pkg_resources
from subprocess import Popen, PIPE
from tempfile import NamedTemporaryFile

def get_spider_list_from_eggfile(eggfile, project, eggrunner='scrapyd.eggrunner'):
    """Return the list of spiders contained in the given egg file.

    The egg is dumped to a temporary file and the ``list`` subcommand of
    *eggrunner* is run in a subprocess, with SCRAPY_PROJECT and
    SCRAPY_EGGFILE in its environment pointing it at the project and the
    temporary egg. The caller's *eggfile* is rewound to position 0 so it
    can be read again afterwards.

    Raises RuntimeError (carrying the last line of the subprocess output)
    if the subprocess exits with a non-zero status.
    """
    with NamedTemporaryFile(suffix='.egg') as tmp:
        shutil.copyfileobj(eggfile, tmp)
        tmp.flush()
        # leave the caller's file object rewound, ready for reuse
        eggfile.seek(0)
        subenv = os.environ.copy()
        subenv['SCRAPY_PROJECT'] = project
        subenv['SCRAPY_EGGFILE'] = tmp.name
        cmd = [sys.executable, '-m', eggrunner, 'list']
        proc = Popen(cmd, stdout=PIPE, stderr=PIPE, env=subenv)
        stdout, stderr = proc.communicate()
        if proc.returncode != 0:
            failure = stderr or stdout or 'unknown error'
            raise RuntimeError(failure.splitlines()[-1])
        return stdout.splitlines()
import os, pkg_resources

def activate_egg(eggpath):
"""Activate a Scrapy egg file. This is meant to be used from egg runners
Expand Down
6 changes: 2 additions & 4 deletions scrapyd/environ.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,14 @@ def __init__(self, config, initenv=os.environ):
self.settings = {}
self.initenv = initenv

def get_environment(self, message, slot, eggpath):
def get_environment(self, message, slot):
project = message['_project']
env = self.initenv.copy()
env['SCRAPY_SLOT'] = str(slot)
env['SCRAPY_PROJECT'] = project
env['SCRAPY_SPIDER'] = message['_spider']
env['SCRAPY_JOB'] = message['_job']
if eggpath:
env['SCRAPY_EGGFILE'] = eggpath
elif project in self.settings:
if project in self.settings:
env['SCRAPY_SETTINGS_MODULE'] = self.settings[project]
dbpath = os.path.join(self.dbs_dir, '%s.db' % project)
env['SCRAPY_SQLITE_DB'] = dbpath
Expand Down
6 changes: 1 addition & 5 deletions scrapyd/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,9 @@ def update_projects():
class IEnvironment(Interface):
"""A component to generate the environment of crawler processes"""

def get_environment(message, slot, eggpath):
def get_environment(message, slot):
"""Return the environment variables to use for running the process.
`message` is the message received from the IPoller.next() method
`slot` is the Launcher slot where the process will be running.
`eggpath` is the path to an eggfile that contains the project code. The
`eggpath` may be `None` if no egg was found for the project, in
which case the project must be on the python path and its settings
defined in scrapyd.conf [settings] section
"""
42 changes: 12 additions & 30 deletions scrapyd/launcher.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
import sys, os
from shutil import copyfileobj
from tempfile import mkstemp
from datetime import datetime

from twisted.internet import reactor, defer, protocol, error
Expand All @@ -10,7 +8,7 @@
from scrapy.utils.py26 import cpu_count
from scrapy.utils.python import stringify_dict
from scrapyd.utils import get_crawl_args
from .interfaces import IPoller, IEggStorage, IEnvironment
from .interfaces import IPoller, IEnvironment

class Launcher(Service):

Expand All @@ -21,57 +19,41 @@ def __init__(self, config, app):
self.max_proc = config.getint('max_proc', 0)
if not self.max_proc:
self.max_proc = cpu_count() * config.getint('max_proc_per_cpu', 4)
self.egg_runner = config.get('egg_runner', 'scrapyd.eggrunner')
self.runner = config.get('runner', 'scrapyd.runner')
self.app = app

def startService(self):
for slot in range(self.max_proc):
self._wait_for_project(slot)
log.msg("%s started: max_proc=%r, egg_runner=%r" % (self.parent.name, \
self.max_proc, self.egg_runner), system="Launcher")
log.msg("%s started: max_proc=%r, runner=%r" % (self.parent.name, \
self.max_proc, self.runner), system="Launcher")

def _wait_for_project(self, slot):
poller = self.app.getComponent(IPoller)
poller.next().addCallback(self._spawn_process, slot)

def _get_eggpath(self, project):
eggstorage = self.app.getComponent(IEggStorage)
version, eggf = eggstorage.get(project)
if eggf is None:
return
prefix = '%s-%s-' % (project, version)
fd, eggpath = mkstemp(prefix=prefix, suffix='.egg')
lf = os.fdopen(fd, 'wb')
copyfileobj(eggf, lf)
lf.close()
return eggpath

def _spawn_process(self, message, slot):
msg = stringify_dict(message, keys_only=False)
project = msg['_project']
eggpath = self._get_eggpath(project)
args = [sys.executable, '-m', self.egg_runner, 'crawl']
args = [sys.executable, '-m', self.runner, 'crawl']
args += get_crawl_args(msg)
e = self.app.getComponent(IEnvironment)
env = e.get_environment(msg, slot, eggpath)
env = e.get_environment(msg, slot)
env = stringify_dict(env, keys_only=False)
pp = ScrapyProcessProtocol(eggpath, slot, project, msg['_spider'], \
pp = ScrapyProcessProtocol(slot, project, msg['_spider'], \
msg['_job'], env)
pp.deferred.addBoth(self._process_finished, eggpath, slot)
pp.deferred.addBoth(self._process_finished, slot)
reactor.spawnProcess(pp, sys.executable, args=args, env=env)
self.processes[slot] = pp

def _process_finished(self, _, eggpath, slot):
if eggpath:
os.remove(eggpath)
def _process_finished(self, _, slot):
self.processes.pop(slot)
self._wait_for_project(slot)


class ScrapyProcessProtocol(protocol.ProcessProtocol):

def __init__(self, eggfile, slot, project, spider, job, env):
self.eggfile = eggfile
def __init__(self, slot, project, spider, job, env):
self.slot = slot
self.pid = None
self.project = project
Expand Down Expand Up @@ -100,6 +82,6 @@ def processEnded(self, status):
self.deferred.callback(self)

def log(self, msg):
msg += "project=%r spider=%r job=%r pid=%r egg=%r log=%r" % (self.project, \
self.spider, self.job, self.pid, self.eggfile, self.logfile)
msg += "project=%r spider=%r job=%r pid=%r log=%r" % (self.project, \
self.spider, self.job, self.pid, self.logfile)
log.msg(msg, system="Launcher")
37 changes: 37 additions & 0 deletions scrapyd/runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import os
import shutil
import tempfile
from contextlib import contextmanager

from scrapyd import get_application
from scrapyd.interfaces import IEggStorage
from scrapyd.eggutils import activate_egg

@contextmanager
def project_environment(project):
    """Context manager that activates the egg (if any) stored for *project*.

    The latest egg is fetched from the egg storage, copied to a temporary
    file, and activated via activate_egg(). The temporary file is removed
    on exit -- including when the copy, the activation, or the managed
    block itself raises (the original version leaked the temp file if
    copyfileobj() or activate_egg() failed).

    If no egg is stored for the project, this is a no-op; the project is
    then expected to be importable from the python path.
    """
    app = get_application()
    eggstorage = app.getComponent(IEggStorage)
    version, eggfile = eggstorage.get(project)
    eggpath = None
    try:
        if eggfile:
            prefix = '%s-%s-' % (project, version)
            fd, eggpath = tempfile.mkstemp(prefix=prefix, suffix='.egg')
            lf = os.fdopen(fd, 'wb')
            try:
                shutil.copyfileobj(eggfile, lf)
            finally:
                # always release the descriptor, even if the copy fails
                lf.close()
            activate_egg(eggpath)
        yield
    finally:
        if eggpath:
            os.remove(eggpath)

def main():
    """Entry point of the runner: execute a Scrapy command for a project.

    The project name is read from the SCRAPY_PROJECT environment variable
    (set by the launcher / get_spider_list). scrapy.cmdline is imported
    lazily, inside the project environment, so the project's egg (if any)
    is already activated when Scrapy loads its settings.
    """
    with project_environment(os.environ['SCRAPY_PROJECT']):
        from scrapy.cmdline import execute
        execute()

if __name__ == '__main__':
    main()
14 changes: 0 additions & 14 deletions scrapyd/tests/test_eggutils.py

This file was deleted.

18 changes: 2 additions & 16 deletions scrapyd/tests/test_environ.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from scrapyd.config import Config
from scrapyd.environ import Environment

class EggStorageTest(unittest.TestCase):
class EnvironmentTest(unittest.TestCase):

def setUp(self):
d = self.mktemp()
Expand All @@ -24,25 +24,11 @@ def test_interface(self):
def test_get_environment_with_eggfile(self):
msg = {'_project': 'mybot', '_spider': 'myspider', '_job': 'ID'}
slot = 3
env = self.environ.get_environment(msg, slot, '/path/to/file.egg')
env = self.environ.get_environment(msg, slot)
self.assertEqual(env['SCRAPY_PROJECT'], 'mybot')
self.assertEqual(env['SCRAPY_SLOT'], '3')
self.assertEqual(env['SCRAPY_SPIDER'], 'myspider')
self.assertEqual(env['SCRAPY_JOB'], 'ID')
self.assert_(env['SCRAPY_SQLITE_DB'].endswith('mybot.db'))
self.assert_(env['SCRAPY_LOG_FILE'].endswith('/mybot/myspider/ID.log'))
self.assert_(env['SCRAPY_EGGFILE'].endswith('/path/to/file.egg'))
self.failIf('SCRAPY_SETTINGS_MODULE' in env)

def test_get_environment_without_eggfile(self):
msg = {'_project': 'newbot', '_spider': 'myspider', '_job': 'ID'}
slot = 3
env = self.environ.get_environment(msg, slot, None)
self.assertEqual(env['SCRAPY_PROJECT'], 'newbot')
self.assertEqual(env['SCRAPY_SLOT'], '3')
self.assertEqual(env['SCRAPY_SPIDER'], 'myspider')
self.assertEqual(env['SCRAPY_JOB'], 'ID')
self.assert_(env['SCRAPY_SQLITE_DB'].endswith('newbot.db'))
self.assert_(env['SCRAPY_LOG_FILE'].endswith('/newbot/myspider/ID.log'))
self.assertEqual(env['SCRAPY_SETTINGS_MODULE'], 'newbot.settings')
self.failIf('SCRAPY_EGGFILE' in env)
38 changes: 36 additions & 2 deletions scrapyd/tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
import unittest
from __future__ import with_statement

from scrapyd.utils import get_crawl_args
import os
from cStringIO import StringIO

from twisted.trial import unittest

from scrapy.utils.py26 import get_data
from scrapyd.interfaces import IEggStorage
from scrapyd.utils import get_crawl_args, get_spider_list
from scrapyd import get_application

__package__ = 'scrapyd.tests' # required for compatibility with python 2.5

class UtilsTest(unittest.TestCase):

Expand All @@ -11,3 +21,27 @@ def test_get_crawl_args(self):
cargs = get_crawl_args(msg)
self.assertEqual(cargs, ['lala', '-a', 'arg1=val1'])
assert all(isinstance(x, str) for x in cargs), cargs

class GetSpiderListTest(unittest.TestCase):
    """Integration test for get_spider_list(): stores a fixture egg in the
    egg storage and asks the runner subprocess for its spiders."""

    def test_get_spider_list(self):
        # Build a throwaway scrapyd working directory (eggs/dbs/logs).
        path = self.mktemp()
        j = os.path.join
        eggs_dir = j(path, 'eggs')
        os.makedirs(eggs_dir)
        dbs_dir = j(path, 'dbs')
        os.makedirs(dbs_dir)
        logs_dir = j(path, 'logs')
        os.makedirs(logs_dir)
        # chdir BEFORE get_application(): presumably so the scrapyd Config
        # finds the scrapyd.conf written below in the current directory --
        # TODO confirm. NOTE(review): the cwd is not restored afterwards.
        os.chdir(path)
        with open('scrapyd.conf', 'w') as f:
            f.write("[scrapyd]\n")
            f.write("eggs_dir = %s\n" % eggs_dir)
            f.write("dbs_dir = %s\n" % dbs_dir)
            f.write("logs_dir = %s\n" % logs_dir)
        # Store the fixture egg (mybot.egg, shipped with the test package)
        # as version r1 of project "mybot", then list its spiders through
        # the runner subprocess spawned by get_spider_list().
        app = get_application()
        eggstorage = app.getComponent(IEggStorage)
        eggfile = StringIO(get_data(__package__, 'mybot.egg'))
        eggstorage.put(eggfile, 'mybot', 'r1')
        self.assertEqual(sorted(get_spider_list('mybot')), ['spider1', 'spider2'])

15 changes: 15 additions & 0 deletions scrapyd/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import sys
import os
from subprocess import Popen, PIPE
from ConfigParser import NoSectionError

from scrapy.spiderqueue import SqliteSpiderQueue
Expand Down Expand Up @@ -41,3 +43,16 @@ def get_crawl_args(message):
args += ['-a']
args += ['%s=%s' % (k, v)]
return args

def get_spider_list(project, runner='scrapyd.runner'):
    """Return the spider list from the given project, using the given runner.

    Spawns ``python -m <runner> list`` in a subprocess with SCRAPY_PROJECT
    set to *project*, and returns the lines the subprocess printed to
    stdout. Raises RuntimeError (carrying the last line of the subprocess
    output) if the subprocess exits with a non-zero status.
    """
    cmd = [sys.executable, '-m', runner, 'list']
    subenv = os.environ.copy()
    subenv['SCRAPY_PROJECT'] = project
    process = Popen(cmd, stdout=PIPE, stderr=PIPE, env=subenv)
    stdout, stderr = process.communicate()
    if process.returncode != 0:
        failure = stderr or stdout or 'unknown error'
        raise RuntimeError(failure.splitlines()[-1])
    return stdout.splitlines()

8 changes: 3 additions & 5 deletions scrapyd/webservice.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from cStringIO import StringIO

from scrapy.utils.txweb import JsonResource
from .eggutils import get_spider_list_from_eggfile
from .utils import get_spider_list

class WsResource(JsonResource):

Expand Down Expand Up @@ -42,8 +42,8 @@ def render_POST(self, txrequest):
project = d['project'][0]
version = d['version'][0]
eggf = StringIO(d['egg'][0])
spiders = get_spider_list_from_eggfile(eggf, project)
self.root.eggstorage.put(eggf, project, version)
spiders = get_spider_list(project)
self.root.update_projects()
return {"status": "ok", "project": project, "version": version, \
"spiders": len(spiders)}
Expand All @@ -65,9 +65,7 @@ class ListSpiders(WsResource):

def render_GET(self, txrequest):
project = txrequest.args['project'][0]
_, eggf = self.root.eggstorage.get(project)
spiders = get_spider_list_from_eggfile(eggf, project, \
eggrunner=self.root.egg_runner)
spiders = get_spider_list(project, runner=self.root.runner)
return {"status": "ok", "spiders": spiders}

class DeleteProject(WsResource):
Expand Down
2 changes: 1 addition & 1 deletion scrapyd/website.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ class Root(resource.Resource):
def __init__(self, config, app):
resource.Resource.__init__(self)
self.debug = config.getboolean('debug', False)
self.eggrunner = config.get('egg_runner')
self.runner = config.get('runner')
logsdir = config.get('logs_dir')
self.app = app
self.putChild('', Home(self))
Expand Down

0 comments on commit 9c06c26

Please sign in to comment.