Skip to content
This repository has been archived by the owner on Feb 8, 2018. It is now read-only.

Rebuild npm syncing #4438

Merged
merged 11 commits into from
May 5, 2017
Merged
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ language: python
git:
depth: 5
addons:
postgresql: 9.3
postgresql: 9.5
firefox: latest-esr
before_install:
- git branch -vv | grep '^*'
Expand Down
16 changes: 8 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ Quick Start
Local
-----

Given Python 2.7, Postgres 9.3, and a C/make toolchain:
Given Python 2.7, Postgres 9.6, and a C/make toolchain:

```shell
git clone https://github.com/gratipay/gratipay.com.git
Expand Down Expand Up @@ -116,7 +116,7 @@ On Debian or Ubuntu you will need the following packages:

```shell
sudo apt-get install \
postgresql-9.3 \
postgresql-9.6 \
postgresql-contrib \
libpq-dev \
python-dev \
Expand Down Expand Up @@ -386,7 +386,7 @@ Modifying the Database
======================

We write SQL, specifically the [PostgreSQL
variant](https://www.postgresql.org/docs/9.3/static/). We keep our database
variant](https://www.postgresql.org/docs/9.6/static/). We keep our database
schema in
[`schema.sql`](https://github.com/gratipay/gratipay.com/blob/master/sql/schema.sql),
and we write schema changes for each PR branch in a `sql/branch.sql` file, which
Expand Down Expand Up @@ -436,11 +436,11 @@ database configured in your testing environment.
Local Database Setup
--------------------

For the best development experience, you need a local
installation of [Postgres](https://www.postgresql.org/download/). The best
version of Postgres to use is 9.3.5, because that's what we're using in
production at Heroku. You need at least 9.2, because we depend on being able to
specify a URI to `psql`, and that was added in 9.2.
For the best development experience, you need a local installation of
[Postgres](https://www.postgresql.org/download/). The best version of Postgres
to use is 9.6.2, because that's what we're using in production at Heroku. You
need at least 9.5 to support the features we depend on, along with the
`pg_stat_statements` and `pg_trgm` extensions.

+ Mac: use Homebrew: `brew install postgres`
+ Ubuntu: use Apt: `apt-get install postgresql postgresql-contrib libpq-dev`
Expand Down
1 change: 1 addition & 0 deletions defaults.env
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ EMAIL_QUEUE_ALLOW_UP_TO=3

UPDATE_CTA_EVERY=300
CHECK_DB_EVERY=600
CHECK_NPM_SYNC_EVERY=0
OPTIMIZELY_ID=
INCLUDE_PIWIK=no
SENTRY_DSN=
Expand Down
3 changes: 2 additions & 1 deletion gratipay/application.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import psycopg2.extras

from . import email, utils
from . import email, sync_npm, utils
from .cron import Cron
from .models import GratipayDB
from .payday_runner import PaydayRunner
Expand Down Expand Up @@ -53,6 +53,7 @@ def install_periodic_jobs(self, website, env, db):
cron(env.update_cta_every, lambda: utils.update_cta(website))
cron(env.check_db_every, db.self_check, True)
cron(env.email_queue_flush_every, self.email_queue.flush, True)
cron(env.check_npm_sync_every, lambda: sync_npm.check(db))


def add_event(self, c, type, payload):
Expand Down
43 changes: 19 additions & 24 deletions gratipay/cli/sync_npm.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,40 +3,35 @@
"""
from __future__ import absolute_import, division, print_function, unicode_literals

import sys
import argparse

from gratipay import wireup
from gratipay.sync_npm import serialize, upsert


def parse_args(argv):
    """Parse *argv* (a list of strings, excluding the program name) and
    return an ``argparse.Namespace`` with ``command`` and ``path`` attributes.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('command', choices=['serialize', 'upsert'])
    parser.add_argument('path', help='the path to the input file', nargs='?', default='/dev/stdin')
    return parser.parse_args(argv)
import time

from aspen import log

subcommands = { 'serialize': serialize.main
, 'upsert': upsert.main
}
from gratipay import wireup
from gratipay.sync_npm import consume_change_stream, get_last_seq, production_change_stream
from gratipay.utils import sentry


def main(argv=sys.argv):
def main():
    """This function is installed via an entrypoint in ``setup.py`` as
    ``sync-npm``.

    Usage::

        sync-npm

    """
    env = wireup.env()
    db = wireup.db(env)

    # Run forever: follow the npm registry change stream, and on any error
    # back off for a minute before resuming from the last recorded sequence
    # number. Ctrl-C (KeyboardInterrupt) is the only way to exit cleanly.
    while 1:
        # NOTE(review): sentry.teller presumably reports exceptions to Sentry
        # and swallows them so the loop can continue — confirm in gratipay.utils.sentry.
        with sentry.teller(env):
            consume_change_stream(production_change_stream, db)
        try:
            # We only reach here after consume_change_stream returns or errors;
            # re-read the checkpoint so the log line shows where we'll resume.
            last_seq = get_last_seq(db)
            sleep_for = 60
            log( 'Encountered an error, will pick up with %s in %s seconds (Ctrl-C to exit) ...'
               % (last_seq, sleep_for)
                )
            time.sleep(sleep_for)  # avoid a busy loop if thrashing
        except KeyboardInterrupt:
            return
84 changes: 84 additions & 0 deletions gratipay/sync_npm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals

import requests
from aspen import log
from couchdb import Database


REGISTRY_URL = 'https://replicate.npmjs.com/'


def get_last_seq(db):
    """Return the npm registry sequence number we last synced, as recorded
    in the ``worker_coordination`` table.
    """
    checkpoint_query = 'SELECT npm_last_seq FROM worker_coordination'
    return db.one(checkpoint_query)


def production_change_stream(seq):
    """Given a sequence number in the npm registry change stream, start
    streaming from there!

    Returns the iterator produced by couchdb's ``Database.changes`` — a
    continuous feed (long-lived HTTP connection) yielding one change dict
    per registry update, with full documents included (``include_docs``).
    """
    return Database(REGISTRY_URL).changes(feed='continuous', include_docs=True, since=seq)


def process_doc(doc):
    """Return a smoothed-out doc, or None if it's not a package doc, meaning
    there's no name key and it's probably a design doc, per:

        https://github.com/npm/registry/blob/aef8a275/docs/follower.md#clean-up

    The returned dict has exactly three keys: ``name``, ``description``
    (defaulting to the empty string), and ``emails`` — a sorted, de-duplicated
    list of maintainer email addresses.
    """
    if 'name' not in doc:
        return None
    name = doc['name']
    description = doc.get('description', '')
    # A maintainer entry may lack an 'email' key entirely, in which case
    # m.get('email') is None; guard with `e and` so we don't call
    # None.strip() (AttributeError in the original), and drop blank values.
    emails = [e for e in [m.get('email') for m in doc.get('maintainers', [])] if e and e.strip()]
    return {'name': name, 'description': description, 'emails': sorted(set(emails))}


def consume_change_stream(change_stream, db):
    """Given a function similar to :py:func:`production_change_stream` and a
    :py:class:`~GratipayDB`, read from the stream and write to the db.

    The npm registry is a CouchDB app, which means we get a change stream from
    it that allows us to follow registry updates in near-realtime. Our strategy
    here is to maintain open connections to both the registry and our own
    database, and write as we read.

    Each change is applied and the ``npm_last_seq`` checkpoint is advanced in
    the same commit, so a crash resumes from the last fully-processed change.
    """
    last_seq = get_last_seq(db)
    log("Picking up with npm sync at {}.".format(last_seq))
    with db.get_connection() as connection:
        for change in change_stream(last_seq):
            if change.get('deleted'):
                # Hack to work around conflation of design docs and packages in updates:
                # deletions don't carry a doc, so synthesize one from the change id.
                op, doc = delete, {'name': change['id']}
            else:
                op, doc = upsert, change['doc']
            processed = process_doc(doc)
            if not processed:
                continue  # design doc (no 'name' key) — nothing to store
            cursor = connection.cursor()
            op(cursor, processed)
            # Record progress atomically with the package write (same commit).
            cursor.run('UPDATE worker_coordination SET npm_last_seq=%(seq)s', change)
            connection.commit()


def delete(cursor, processed):
    """Given a cursor and a processed doc (a dict with at least a ``name``
    key), delete the corresponding npm package row, if any.
    """
    cursor.run("DELETE FROM packages WHERE package_manager='npm' AND name=%(name)s", processed)


def upsert(cursor, processed):
    """Given a cursor and a processed doc (a dict with ``name``,
    ``description``, and ``emails`` keys), insert the npm package row or,
    if it already exists, update its description and emails in place.
    """
    cursor.run('''
    INSERT INTO packages
                (package_manager, name, description, emails)
         VALUES ('npm', %(name)s, %(description)s, %(emails)s)

    ON CONFLICT (package_manager, name) DO UPDATE
            SET description=%(description)s, emails=%(emails)s
    ''', processed)


def check(db, _print=print):
    """Emit a ``count#npm-sync-lag`` metric line: how far our recorded
    sequence number trails the npm registry's current ``update_seq``.

    *_print* is injectable for testing; it defaults to the builtin print.
    """
    local_seq = db.one('SELECT npm_last_seq FROM worker_coordination')
    registry_info = requests.get(REGISTRY_URL).json()
    remote_seq = int(registry_info['update_seq'])
    _print("count#npm-sync-lag={}".format(remote_seq - local_seq))
43 changes: 0 additions & 43 deletions gratipay/sync_npm/__init__.py

This file was deleted.

107 changes: 0 additions & 107 deletions gratipay/sync_npm/serialize.py

This file was deleted.

Loading