"""Create a "pos-neg-file" for the FastSinkSource pipeline.

Reads a single-column file of positive examples, optionally restricts them to
a protein universe, optionally samples negative examples from the rest of that
universe, and writes a tab-separated table with one protein per row and
1/0/-1 (positive/unknown/negative) as the value.
"""

import argparse
from collections import defaultdict
import os
import sys
#from tqdm import tqdm
#import pandas as pd
import numpy as np


# Output path used when the caller does not supply one
# (kept identical to the --out-file command line default).
DEFAULT_OUT_FILE = "pos-neg-file.tsv"


def parse_args():
    """Parse the command line arguments and return the argparse namespace."""
    parser = argparse.ArgumentParser(
        description='Script to take known human-virus PPIs, (TODO) generate negative examples, ' +
        'and create a "pos-neg-file" which matches the format expected by the FastSinkSource pipeline. ' +
        'Writes a tab-separated table with proteins on the rows, the name of the input file on the column, ' +
        'and 1/0/-1 for pos/unk/neg example as the values.')

    # general parameters
    parser.add_argument('--pos-examples-file', type=str, required=True,
                        help="Single-column file containing the positive examples")
    parser.add_argument('--name', type=str,
                        help="Name to give this dataset. Default is the file name")
    parser.add_argument('--prot-universe-file', type=str,
                        help="Single-column file containing universe of proteins from which to sample negative examples")
    parser.add_argument('--sample-neg-examples-factor', type=float,
                        help="If specified, sample negative examples randomly without replacement "
                             "from the protein universe equal to <factor> * # positives")
    # The seed is an int because numpy's RandomState rejects float seeds.
    parser.add_argument('--seed', type=int,
                        help="Seed of the random number generator to use when sampling negative examples.")
    parser.add_argument('--out-file', type=str, default=DEFAULT_OUT_FILE,
                        help="path/to/file.tsv for which to write output.")

    # evaluation parameters
    #group = parser.add_argument_group('Evaluation options')
    #group.add_argument('--only-eval', action="store_true", default=False,
    #        help="Perform evaluation only (i.e., skip prediction mode)")
    args = parser.parse_args()
    return args


def _read_single_column(path):
    """Return the entries of a single-column text file as a set of str."""
    # ndmin=1 keeps a one-line file from collapsing into a 0-d array,
    # which cannot be iterated over.
    return set(np.loadtxt(path, dtype=str, ndmin=1).tolist())


def main(pos_examples_file, prot_universe_file=None, sample_neg_examples_factor=None,
         out_file=None, **kwargs):
    """Write the pos-neg table for the given positive examples.

    Args:
        pos_examples_file: single-column file with the positive examples.
        prot_universe_file: optional single-column file with the protein
            universe. When given, positives outside the universe are dropped
            and every universe protein gets a row. When omitted, only the
            positives are written.
        sample_neg_examples_factor: optional factor; if given,
            int(factor * # positives) negatives are sampled without
            replacement from the non-positive part of the universe.
        out_file: path of the table to write. Defaults to "pos-neg-file.tsv".
        **kwargs: optional 'name' (column header; defaults to the base name of
            out_file) and 'seed' (seed for the negative sampling).

    Raises:
        SystemExit: if negatives are requested without a universe, if the
            factor is negative, or if more negatives are requested than there
            are non-positive proteins.
    """
    if out_file is None:
        out_file = DEFAULT_OUT_FILE
    if sample_neg_examples_factor is not None and prot_universe_file is None:
        sys.exit("ERROR: Must specify the universe from which to sample negative examples "
                 "(--prot-universe-file) if --sample-neg-examples-factor specified.")

    print("Reading %s" % (pos_examples_file))
    pos_examples = _read_single_column(pos_examples_file)
    print("\t%s positive examples" % (len(pos_examples)))

    if prot_universe_file is not None:
        print("Reading %s" % (prot_universe_file))
        prot_universe = _read_single_column(prot_universe_file)
        print("\t%s proteins" % (len(prot_universe)))

        pos_examples = pos_examples & prot_universe
        print("\t%s positive examples after limitting to those in the specified universe" % (len(pos_examples)))
    else:
        # Without an explicit universe the positives are the only known proteins.
        prot_universe = set(pos_examples)

    neg_examples = set()
    if sample_neg_examples_factor is not None:
        neg_sample = int(sample_neg_examples_factor * len(pos_examples))
        # Sort the candidates: set iteration order of strings changes between
        # interpreter runs, which would defeat a fixed seed.
        non_pos_universe = sorted(prot_universe - pos_examples)
        print("Sampling %s (%s*%s) negative examples from the universe of %s non-pos prots" % (
            neg_sample, sample_neg_examples_factor, len(pos_examples), len(non_pos_universe)))
        if neg_sample < 0:
            sys.exit("ERROR: --sample-neg-examples-factor must not be negative")
        if neg_sample > len(non_pos_universe):
            sys.exit("ERROR: cannot sample more negative examples than specified by non_pos_universe")

        if neg_sample > 0:
            seed = kwargs.get('seed')
            # A private generator keeps seeding from touching numpy's global state.
            rng = np.random if seed is None else np.random.RandomState(int(seed))
            # sample without replacement
            sampled = rng.choice(np.asarray(non_pos_universe, dtype=str),
                                 size=neg_sample, replace=False)
            neg_examples = set(sampled.tolist())

    # now write the output file
    print("\nWriting %s" % (out_file))
    # make sure the output directory exists; a bare file name has no directory part
    out_dir = os.path.dirname(out_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    name = kwargs.get('name')
    if name is None:
        name = os.path.basename(out_file)

    lines = ["prots\t%s\n" % (name)]
    for p in sorted(prot_universe):
        if p in pos_examples:
            val = 1
        elif p in neg_examples:
            val = -1
        else:
            val = 0
        lines.append("%s\t%s\n" % (p, val))

    with open(out_file, 'w') as out:
        out.writelines(lines)


if __name__ == "__main__":
    args = parse_args()

    kwargs = vars(args)
    main(**kwargs)