Skip to content
This repository has been archived by the owner on Feb 8, 2018. It is now read-only.

Rebuild npm syncing #4438

Merged
merged 11 commits into from
May 5, 2017
Merged
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ language: python
git:
depth: 5
addons:
postgresql: 9.3
postgresql: 9.5
firefox: latest-esr
before_install:
- git branch -vv | grep '^*'
Expand Down
16 changes: 8 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ Quick Start
Local
-----

Given Python 2.7, Postgres 9.3, and a C/make toolchain:
Given Python 2.7, Postgres 9.6, and a C/make toolchain:

```shell
git clone https://github.com/gratipay/gratipay.com.git
Expand Down Expand Up @@ -116,7 +116,7 @@ On Debian or Ubuntu you will need the following packages:

```shell
sudo apt-get install \
postgresql-9.3 \
postgresql-9.6 \
postgresql-contrib \
libpq-dev \
python-dev \
Expand Down Expand Up @@ -386,7 +386,7 @@ Modifying the Database
======================

We write SQL, specifically the [PostgreSQL
variant](https://www.postgresql.org/docs/9.3/static/). We keep our database
variant](https://www.postgresql.org/docs/9.6/static/). We keep our database
schema in
[`schema.sql`](https://github.com/gratipay/gratipay.com/blob/master/sql/schema.sql),
and we write schema changes for each PR branch in a `sql/branch.sql` file, which
Expand Down Expand Up @@ -436,11 +436,11 @@ database configured in your testing environment.
Local Database Setup
--------------------

For the best development experience, you need a local
installation of [Postgres](https://www.postgresql.org/download/). The best
version of Postgres to use is 9.3.5, because that's what we're using in
production at Heroku. You need at least 9.2, because we depend on being able to
specify a URI to `psql`, and that was added in 9.2.
For the best development experience, you need a local installation of
[Postgres](https://www.postgresql.org/download/). The best version of Postgres
to use is 9.6.2, because that's what we're using in production at Heroku. You
need at least 9.5 to support the features we depend on, along with the
`pg_stat_statements` and `pg_trgm` extensions.

+ Mac: use Homebrew: `brew install postgres`
+ Ubuntu: use Apt: `apt-get install postgresql postgresql-contrib libpq-dev`
Expand Down
1 change: 1 addition & 0 deletions defaults.env
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ EMAIL_QUEUE_ALLOW_UP_TO=3

UPDATE_CTA_EVERY=300
CHECK_DB_EVERY=600
CHECK_NPM_SYNC_EVERY=0
OPTIMIZELY_ID=
INCLUDE_PIWIK=no
SENTRY_DSN=
Expand Down
3 changes: 2 additions & 1 deletion gratipay/application.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import psycopg2.extras

from . import email, utils
from . import email, sync_npm, utils
from .cron import Cron
from .models import GratipayDB
from .payday_runner import PaydayRunner
Expand Down Expand Up @@ -53,6 +53,7 @@ def install_periodic_jobs(self, website, env, db):
cron(env.update_cta_every, lambda: utils.update_cta(website))
cron(env.check_db_every, db.self_check, True)
cron(env.email_queue_flush_every, self.email_queue.flush, True)
cron(env.check_npm_sync_every, lambda: sync_npm.check(db))


def add_event(self, c, type, payload):
Expand Down
43 changes: 19 additions & 24 deletions gratipay/cli/sync_npm.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,40 +3,35 @@
"""
from __future__ import absolute_import, division, print_function, unicode_literals

import sys
import argparse

from gratipay import wireup
from gratipay.sync_npm import serialize, upsert


def parse_args(argv):
    """Parse *argv* (a list of strings, excluding the program name) and
    return an ``argparse.Namespace`` with ``command`` and ``path`` attributes.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('command', choices=['serialize', 'upsert'])
    parser.add_argument('path', help='the path to the input file', nargs='?', default='/dev/stdin')
    return parser.parse_args(argv)
import time

from aspen import log

subcommands = { 'serialize': serialize.main
, 'upsert': upsert.main
}
from gratipay import wireup
from gratipay.sync_npm import consume_change_stream, get_last_seq, production_change_stream
from gratipay.utils import sentry


def main(argv=sys.argv):
def main():
    """This function is installed via an entrypoint in ``setup.py`` as
    ``sync-npm``.

    Usage::

        sync-npm

    """
    env = wireup.env()
    db = wireup.db(env)

    # Run forever: follow the npm registry change stream, and on any error
    # back off for a minute before resuming from the last recorded sequence
    # number. Ctrl-C (KeyboardInterrupt) is the only way to exit cleanly.
    while 1:
        # NOTE(review): sentry.teller presumably reports exceptions to Sentry
        # and swallows them so the loop can continue — confirm in gratipay.utils.sentry.
        with sentry.teller(env):
            consume_change_stream(production_change_stream, db)
        try:
            # We only reach here after consume_change_stream returns or errors;
            # re-read the checkpoint so the log line shows where we'll resume.
            last_seq = get_last_seq(db)
            sleep_for = 60
            log( 'Encountered an error, will pick up with %s in %s seconds (Ctrl-C to exit) ...'
               % (last_seq, sleep_for)
                )
            time.sleep(sleep_for)  # avoid a busy loop if thrashing
        except KeyboardInterrupt:
            return
84 changes: 84 additions & 0 deletions gratipay/sync_npm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals

import requests
from aspen import log
from couchdb import Database


REGISTRY_URL = 'https://replicate.npmjs.com/'


def get_last_seq(db):
    """Return the npm registry sequence number we last synced, as recorded
    in the ``worker_coordination`` table.
    """
    checkpoint_query = 'SELECT npm_last_seq FROM worker_coordination'
    return db.one(checkpoint_query)


def production_change_stream(seq):
    """Given a sequence number in the npm registry change stream, start
    streaming from there!

    Returns the iterator produced by couchdb's ``Database.changes`` — a
    continuous feed (long-lived HTTP connection) yielding one change dict
    per registry update, with full documents included (``include_docs``).
    """
    return Database(REGISTRY_URL).changes(feed='continuous', include_docs=True, since=seq)


def process_doc(doc):
    """Return a smoothed-out doc, or None if it's not a package doc, meaning
    there's no name key and it's probably a design doc, per:

        https://github.com/npm/registry/blob/aef8a275/docs/follower.md#clean-up

    The returned dict has exactly three keys: ``name``, ``description``
    (defaulting to the empty string), and ``emails`` — a sorted, de-duplicated
    list of maintainer email addresses.
    """
    if 'name' not in doc:
        return None
    name = doc['name']
    description = doc.get('description', '')
    # A maintainer entry may lack an 'email' key entirely, in which case
    # m.get('email') is None; guard with `e and` so we don't call
    # None.strip() (AttributeError in the original), and drop blank values.
    emails = [e for e in [m.get('email') for m in doc.get('maintainers', [])] if e and e.strip()]
    return {'name': name, 'description': description, 'emails': sorted(set(emails))}


def consume_change_stream(change_stream, db):
    """Given a function similar to :py:func:`production_change_stream` and a
    :py:class:`~GratipayDB`, read from the stream and write to the db.

    The npm registry is a CouchDB app, which means we get a change stream from
    it that allows us to follow registry updates in near-realtime. Our strategy
    here is to maintain open connections to both the registry and our own
    database, and write as we read.

    Each change is applied and the ``npm_last_seq`` checkpoint is advanced in
    the same commit, so a crash resumes from the last fully-processed change.
    """
    last_seq = get_last_seq(db)
    log("Picking up with npm sync at {}.".format(last_seq))
    with db.get_connection() as connection:
        for change in change_stream(last_seq):
            if change.get('deleted'):
                # Hack to work around conflation of design docs and packages in updates:
                # deletions don't carry a doc, so synthesize one from the change id.
                op, doc = delete, {'name': change['id']}
            else:
                op, doc = upsert, change['doc']
            processed = process_doc(doc)
            if not processed:
                continue  # design doc (no 'name' key) — nothing to store
            cursor = connection.cursor()
            op(cursor, processed)
            # Record progress atomically with the package write (same commit).
            cursor.run('UPDATE worker_coordination SET npm_last_seq=%(seq)s', change)
            connection.commit()


def delete(cursor, processed):
    """Given a cursor and a processed doc (a dict with at least a ``name``
    key), delete the corresponding npm package row, if any.
    """
    cursor.run("DELETE FROM packages WHERE package_manager='npm' AND name=%(name)s", processed)


def upsert(cursor, processed):
    """Given a cursor and a processed doc (a dict with ``name``,
    ``description``, and ``emails`` keys), insert the npm package row or,
    if it already exists, update its description and emails in place.
    """
    cursor.run('''
    INSERT INTO packages
                (package_manager, name, description, emails)
         VALUES ('npm', %(name)s, %(description)s, %(emails)s)

    ON CONFLICT (package_manager, name) DO UPDATE
            SET description=%(description)s, emails=%(emails)s
    ''', processed)


def check(db, _print=print):
    """Emit a ``count#npm-sync-lag`` metric line: how far our recorded
    sequence number trails the npm registry's current ``update_seq``.

    *_print* is injectable for testing; it defaults to the builtin print.
    """
    local_seq = db.one('SELECT npm_last_seq FROM worker_coordination')
    registry_info = requests.get(REGISTRY_URL).json()
    remote_seq = int(registry_info['update_seq'])
    _print("count#npm-sync-lag={}".format(remote_seq - local_seq))
43 changes: 0 additions & 43 deletions gratipay/sync_npm/__init__.py

This file was deleted.

107 changes: 0 additions & 107 deletions gratipay/sync_npm/serialize.py

This file was deleted.

Loading