#!/usr/bin/env python

import mr_tools

from hashlib import md5
import random
import tempfile
import sys

def obsc(d, secret):
    # salted md5 of a single field, used to anonymize the vote dumps
    m = md5(str(d))
    m.update(secret)
    return m.hexdigest()

def obscure(secret=str(random.random())):
    """Turn identifiable components of vote dumps into salted hashes"""
    # md5.update() needs a string, so the salt is the string form of a
    # random float generated once per run
    def o(d):
        return obsc(d, secret)

    @mr_tools.dataspec_m('account_id',
                         'link_id',
                         'sr_id',
                         'dir')
    def process(aff):
        yield o(aff.account_id), o(aff.link_id), o(aff.sr_id), aff.dir

    mr_tools.mr_map(process)

def affinities_m():
    """Take the vote dump generated by srrecs.pig on stdin and prepare
       for reducing the votes into affinities by keying them on
       (account_id, sr_id)"""
    @mr_tools.dataspec_m('account_id',
                         'link_id',
                         'sr_id',
                         'dir')
    def process(aff):
        yield ('%s_%s' % (aff.account_id, aff.sr_id),
               aff.account_id, aff.link_id, aff.sr_id, aff.dir)

    mr_tools.mr_map(process)
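
# Hypothetical mapper record, to illustrate the keying above (field values are
# made up, not from the repo): an input vote ('a1', 'l9', 's7', '1') is
# re-emitted as
#   ('a1_s7', 'a1', 'l9', 's7', '1')
# so the intermediate sort groups all of one account's votes within one sr_id.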

def affinities_r():
    """The reduction step of turning lists of votes on sr_ids into
       affinities"""
    @mr_tools.dataspec_r('account_id',
                         'link_id',
                         'sr_id',
                         'dir')
    def process(account_srid, affs):
        # we can assume that all of the account_ids and sr_ids are
        # equal
        count = 0.0
        ups = 0.0
        account_id = sr_id = None

        for aff in affs:
            if account_id is None:
                account_id = aff.account_id
                sr_id = aff.sr_id

            count += 1
            if aff.dir == '1':
                ups += 1

        # you must vote at least three times to ride
        if count >= 3:
            yield account_id, sr_id, ups / count

    mr_tools.mr_reduce(process)
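
# Worked example (hypothetical numbers): an account that cast four votes in
# one subreddit, three of them upvotes, yields
#   account_id, sr_id, 0.75
# while an account with fewer than three votes in a subreddit yields nothing.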

def write_matrix(out_cm, out_clabel, out_rlabel):
    """Reformat the affinities coming out of the functions above to
       the format wanted by skmeans (which is the format used by
       CLUTO, documented at
       <http://glaros.dtc.umn.edu/gkhome/fetch/sw/cluto/manual.pdf> in
       section 3.3.1)"""
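    # Sketch of the sparse-matrix layout written below (numbers made up):
    #   <num_rows> <num_cols> <num_entries>
    #   3 0.5 7 -1.0
    #   1 0.2
    # i.e. one line per account of "column value" pairs, with 1-based sr
    # columns; out_clabel gets one account_id per matrix row and out_rlabel
    # gets one sr_id per matrix column.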

    class Stats(object):
        __slots__ = ['num_srs', 'num_rows', 'sr_map', 'total_entries']

        def __init__(self):
            self.num_srs = self.num_rows = self.total_entries = 0
            # we can safely keep the whole sr_map around in memory
            # like this because we have fewer than 100k of them
            self.sr_map = {}

    stats = Stats()

    f_cm = tempfile.TemporaryFile()
    f_cl = tempfile.TemporaryFile()

    @mr_tools.dataspec_r('sr_id',
                         ('affinity', float))
    def _reduce(account_id, affs):
        affs = list(affs)

        for aff in affs:
            # the affinities we get are from 0..1, but skmeans wants
            # -1..1
            aff.affinity = aff.affinity * 2 - 1

        # skmeans really doesn't like rows consisting entirely of
        # zeroes
        affs = filter(lambda aff: not (-0.001 < aff.affinity < 0.001),
                      affs)
        if not affs:
            return []

        stats.num_rows += 1
        for aff in affs:
            if aff.sr_id not in stats.sr_map:
                stats.num_srs += 1
                # CLUTO's matrices are 1-based
                stats.sr_map[aff.sr_id] = stats.num_srs
        stats.total_entries += len(affs)

        f_cl.write('%s\n' % (account_id,))
        f_cm.write(' '.join('%s %s' % (stats.sr_map[aff.sr_id], aff.affinity)
                            for aff in affs))
        f_cm.write('\n')

        return []

    mr_tools.mr_reduce(_reduce)

    def cp_fds(infd, outfd, buffsize=1024 * 1024):
        # copy a fully-written temp file back out to a real output file
        infd.flush()
        infd.seek(0)
        while True:
            chunk = infd.read(buffsize)
            if chunk:
                outfd.write(chunk)
            else:
                break

    with open(out_cm, 'w') as outfd:
        outfd.write('%d %d %d\n'
                    % (stats.num_rows, len(stats.sr_map), stats.total_entries))
        cp_fds(f_cm, outfd)

    with open(out_clabel, 'w') as outfd:
        cp_fds(f_cl, outfd)

    with open(out_rlabel, 'w') as outfd:
        for sr_id, sr_mapped in sorted(stats.sr_map.items(),
                                       key=lambda x: x[1]):
            outfd.write('%s\n' % (sr_id,))
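
# The script dispatches by eval()'ing its first argument, so each phase is
# invoked by name from the shell. A hypothetical streaming-style pipeline
# (file names and the sort steps are illustrative, not defined in this file)
# might look like:
#   cat vote_dump | ./srrecs.py "affinities_m()" | sort | \
#       ./srrecs.py "affinities_r()" > affinities
#   sort affinities | ./srrecs.py "write_matrix('out.cm', 'out.clabel', 'out.rlabel')"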

if __name__ == '__main__':
    eval(sys.argv[1])