vw-varinfo2

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# vim: ts=4 sw=4 expandtab
"""
    vw-varinfo2: vw dataset summary & variable importance

    This is new & simpler implementation of the original vw-varinfo

    It is designed to be simpler and faster. There's less dependence
    on command line options so it is much more robust against future
    changes and new options in vowpal-wabbit.

    This implementation is in python (original was in perl)

    TODO: multi-class support is not implemented!

    Author: Ariel Faigon (2016)
"""
import sys
import os
import subprocess
import re
import itertools
import tempfile
import traceback

from operator import itemgetter

Verbose = False
ARGV0 = os.path.basename(sys.argv[0])

# Default vw executable program to call
VW = 'vw'
# Additional VW args which should be reused from 1st to 2nd pass
VWARGS = []

# Hash mappings for per-feature (min, max, hash-value, weight)
F_MIN = {}
F_MAX = {}
F_HASH = {}
F_WEIGHT = {}

# We need to have a model at the end to load all weights
# If it is not supplied on the command line, we add it ourselves
ModelName = ''
CleanupModel = False

# A global switch and list of all seen labels to support MultiClass
MultiClass = False
MCLabels = None

def v(msg):
    """print message to stderr"""
    sys.stderr.write("%s\n" % msg)
    sys.stderr.flush()

def d(msg):
    """Verbose/debugging message, activated with '-v' option."""
    if not Verbose:
        return
    v(msg)

def fatal(msg):
    """fatal (can't continue) situation error message"""
    v("== FATAL: %s" % msg)
    sys.exit(1)

def usage(msg):
    """Print usage message and exit"""
    if msg:
        v(msg)
    v("Usage: %s [-v] [<vw>] [<vw_options>] <vw_args>..." % ARGV0)
    v("    Notes:\n"
      "\t- You may omit the <vw> argument (default is 'vw')\n"
      "\t- You may use a different <vw> executable as the 1st arg\n"
      "\t- <vw_args> are all the vw arguments, as you would call vw directly\n"
      "\t- If <vw_args> is just a dataset-file - vw defaults will be used\n"
      "\t- To lose the constant (intercept), use vw's '--noconstant' option\n"
      "\t  However the constant may be useful to show if there's a bias")
    sys.exit(1)


def which(program):
    """
    Find a program in $PATH
    If found, return its full path, otherwise return None
    """
    def is_exe(fpath):
        """Return True if fpath is executable, False otherwise"""
        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

    fpath, _ = os.path.split(program)
    if fpath:
        if is_exe(program):
            return program
    else:
        for path in os.environ["PATH"].split(os.pathsep):
            path = path.strip('"')
            exe_file = os.path.join(path, program)
            if is_exe(exe_file):
                return exe_file

    return None


def all_features_dicts():
    """
    Returns two dict of all features in a structured way:
    1st dict is individual features: scalar keys with a value of 1
    2nd dict is for features within name-spaces, key is the name-space
    first dict is:
    {
        # individual (not in a name-space) features:
        "f1": 1,
        "f2": 1,
        "fN": 1
    }
    second dict is:
    {
        # features in name-spaces:
        "namespace1": { "f1":1, "f2":1, ... },
        "namespace2": {"f1":1, "f2":1, ... },
    }
    """
    d1 = {}
    d2 = {}
    for k in F_HASH:
        if '^' in k:
            ns, fname = k.split('^', 1)
            if ns not in d2:
                d2[ns] = {}
            d2[ns][fname] = 1
        else:
            # Constant feature should never be added as a regular
            # feature. vw adds it by itself as needed.
            # TODO: multiclass uses separate Constant_<N> per class
            if k != 'Constant':
                d1[k] = 1

    return d1, d2


def all_features_example():
    """Return a equal-weight vw line with all features present"""
    # TODO: implement multi-class: needs per-class internal data-structs
    d1, d2 = all_features_dicts()
    individual_features = ' | ' + ' '.join(d1.keys())
    ns_features = []
    for ns in d2:
        fnames = d2[ns].keys()
        one_ns_features = " |%s %s" % (ns, ' '.join(fnames))
        ns_features.append(one_ns_features)

    example = '1' + individual_features + ' '.join(ns_features) + '\n'
    d("all_features_example: %s" % example)
    return example


def process_audit_line(line):
    """
    Process an audit line coming from 'vw'
    Track min/max/hash-value/weight for each feature
    """
    features = line.split("\t")
    features.pop(0)
    for f in features:
        fields = f.split(':')
        fname = fields[0]

        if fname == '':
            # don't process 'empty' features
            continue

        fhash = int(fields[1])
        fval = float(fields[2])
        fweight = float(fields[-1].split('@')[0])

        F_WEIGHT[fname] = fweight
        F_HASH[fname] = fhash

        if fname not in F_MIN:
            # feature seen for 1st time
            F_MIN[fname] = fval
            F_MAX[fname] = fval

        if fval < F_MIN[fname]:
            F_MIN[fname] = fval
        if fval > F_MAX[fname]:
            F_MAX[fname] = fval


def vw_audit(vw_cmd, our_input=None):
    """
    Generator for vw audit-lines
    (Each example is mapped to its audit-line)

    vw_cmd is a list of args to run vw with (vw command line)

    There are two modes of running:
        1) Normal: input provided directly to vw from command line
        2) 2nd pass: input provided by vw-varinfo as a string
           This mode is activated when our_input="some string..."
    """
    if our_input:
        # Input comes from our_input (string)
        # which is sent via stdin to the vw subprocess
        vw_proc = subprocess.Popen(
            vw_cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            stdin=subprocess.PIPE,
            bufsize=1048576
        )
        # python3 expects a bytes-like object
        # Hence encoding the string our_input
        vw_proc.stdin.write(our_input.encode())
        vw_proc.stdin.close()
    else:
        # By default, vw reads from a training-set
        # which is provided on the command line
        vw_proc = subprocess.Popen(
            vw_cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            close_fds=False,
            bufsize=1048576
        )

    example_no = 0

    while True:
        # Since encoded string was written
        # therefore it needs to be decoded
        vw_line = vw_proc.stdout.readline().decode()
        if not vw_line:
            # End of input
            vw_proc.stdout.close()
            vw_proc.wait()
            if vw_proc.returncode:
                # non-zero exit code, print the full command that
                # failed to help user reproduce/understand it
                fatal("vw subprocess failed (status=%s):\n\t%s\n"
                      "(Run the command above to reproduce the failure)" %
                      (vw_proc.returncode, ' '.join(vw_cmd)))
            else:
                # everything looks cool, support debugging anyway
                d("%s: %s examples, exit status: %s" %
                  (vw_cmd, example_no, vw_proc.returncode))

            return

        # An audit line is recognized by a leading-tab
        if vw_line[0] == '\t':
            # An audit line (what we're looking for)
            example_no += 1
            d(vw_line)
            yield vw_line
            continue

        # print("vw_line=[%s] len=%d not(vw_line)=%s" % (vw_line, len(vw_line), (not vw_line)))
        # time.sleep(0.0001)

        # Q: anything we want to do with other lines?
        # A: for now no, we just read the next line from vw

    return

def run_vw(vw_cmd, our_input=None):
    """Track all variables and their weights via vw --audit lines"""
    for line in vw_audit(vw_cmd, our_input):
        process_audit_line(line)


def is_vw_arg(arg):
    """
    Return True iff the arg looks like a 'vw' argument
    Side effect: modifies the VW global variable iff user uses
    a different vw
    """
    global VW
    if arg == VW:
        return True
    if re.search(r'(?:^|[\\/])vw[-_.0-9]*(\.exe)?$', arg):
        VW = arg
        return True
    return False


def already_has_audit(args):
    """Return True iff args already include --audit (or -a)"""
    if '-a' in args or '--audit' in args:
        return True
    return False


def is_multiclass(args):
    """
    Check args for any hint of a multiclass problem
    (Check is option dependent and may be incomplete)
    """
    # Not sure if --wap, --ect multi-class are actually right
    for mc_opt in ('--oaa', '--csoaa', '--ect', '--wap', '--sequence'):
        if mc_opt in args:
            return True
    return False


def model_arg(args):
    """Return the model arg if any"""
    f_idx = None
    try:
        f_idx = args.index('-f')
    except:
        # not there
        return None

    try:
        f_idx += 1
        model = args[f_idx]
    except:
        fatal("Oops! -f withot an arg - can't continue")

    return model


def get_vw_cmd(args):
    """
    Return the vw command we want to run
    This means stripping our own (vw-varinfo) name from the list
    and making sure:
        1) That we have 'vw' at the beginning
        2) That -a is added for auditing
    """
    global ModelName, CleanupModel, Verbose

    if len(args) <= 1:
        usage('')

    # -- move ourselves (vw-varinfo arg) out of the way
    args.pop(0)

    # 1st arg can be '-v' for debugging this script
    if len(args) > 0 and args[0] == '-v':
        Verbose = True
        args.pop(0)

    vw_args = []

    if len(args) < 1:
        usage('Too few args: %s' % args)

    if not is_vw_arg(args[0]):
        if os.name == 'nt':
            args.insert(0, 'vw.exe')
        else:
            args.insert(0, 'vw')

    if not already_has_audit(args):
        args.insert(1, '--audit')

    if '--noconstant' in args:
        VWARGS.append('--noconstant')

    model = model_arg(args)
    if model:
        ModelName = model
    else:
        ModelName = tempfile.mktemp(suffix='.vwmodel')
        args.insert(1, ModelName)
        args.insert(1, '-f')
        CleanupModel = True

    # TODO: skip leading options that are intended for vw-varinfo itself
    for arg in args:
        vw_args.append(arg)

    d("vw_cmd is: %s" % vw_args)
    vw_exe = vw_args[0]
    if which(vw_exe) is None:
        fatal("Sorry: can't find %s (vowpal wabbit executable) in $PATH\n"
              "PATH=%s" % (vw_exe, os.environ["PATH"]))

    return vw_args


def minmax(data):
    """
    Return a pair (min, max) of list arg
    """
    lo = 0
    hi = 0
    for i in data:
        if i > hi:
            hi = i
        if i < lo:
            lo = i

    return lo, hi


def summarize():
    """Output summary of variables"""
    wmin, wmax = minmax(F_WEIGHT.values())
    w_absmax = max(abs(wmin), abs(wmax))

    # Print a header
    print(("%-16s\t%10s\t%s\t%s\t%s\t%s" %
          ('FeatureName', 'HashVal', 'MinVal', 'MaxVal', 'Weight', 'RelScore')))

    # TODO: implement multi-class
    # multi-class needs per-class internal data-structs

    # To reverse-order add: 'reverse=True' arg to 'sorted'
    # itemgetter: (0): sort-by-key, (1): sort by value
    sorted_tuples = sorted(F_WEIGHT.items(), key=itemgetter(1))
    for fname, _ in sorted_tuples:
        fmin = float(F_MIN[fname])
        fmax = float(F_MAX[fname])
        fweight = float(F_WEIGHT[fname])
        fhash = F_HASH[fname]
        relscore = 100.0 * (fweight/w_absmax if w_absmax > 0 else 0.0)
        print(("%-16s\t%10s\t%.2f\t%.2f\t%.2f\t%7.2f" %
              (fname, fhash, fmin, fmax, fweight, relscore)))

def pass1(vw_cmd):
    """In pass1 we run 'vw' as in the original command"""
    d("Starting PASS 1 ...")
    run_vw(vw_cmd, None)

def pass2():
    """
    Run a 2nd pass with all features and stored model
    To get the final weights for all features
    """
    vw_cmd = [VW, '--quiet', '-t', '-a', '-i', ModelName]
    if len(VWARGS) > 0:
        vw_cmd += VWARGS
    all_features = all_features_example()
    d("Starting PASS 2 ...")
    run_vw(vw_cmd, all_features)


#
# -- main
#
def main():
    """Main func for vw-varinfo2: dataset feature information summary"""

    global MultiClass, MCLabels

    try:
        vw_cmd = get_vw_cmd(sys.argv)

        if is_multiclass(vw_cmd):
            # multi-class needs per-class internal data-structs
            MultiClass = True
            MCLabels = []

        # Run 1st pass to collect data on all features:
        pass1(vw_cmd)

        # Run second pass:
        #   with -i ModelName and single example w/ all-features present
        pass2()

        summarize()

        if CleanupModel:
            d("removing tempfile: %s" % ModelName)
            os.remove(ModelName)

    except Exception as estr:
        # catch-all to cover all unhandled exceptions
        fatal("%s\n%s" % (estr, traceback.format_exc()))

    return 0

if __name__ == "__main__":
    sys.exit(main())