From 910cd36644c9a03a28f62954afafb9b4c0c2243e Mon Sep 17 00:00:00 2001
From: Christopher Tomkins-Tinch
Date: Wed, 29 May 2019 16:05:31 +0100
Subject: [PATCH 1/2] add aggregate_spike_count to reports.py

add a new function, aggregate_spike_count, to reports.py to produce a
matrix of spike-in counts per-sample from an array of per-sample
spike-in count files
---
 reports.py | 46 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 45 insertions(+), 1 deletion(-)

diff --git a/reports.py b/reports.py
index 4e9d60f96..87022790c 100755
--- a/reports.py
+++ b/reports.py
@@ -10,7 +10,7 @@
 import glob
 import os
 import time
-from collections import OrderedDict
+from collections import OrderedDict, defaultdict
 import csv
 import math
 import shutil
@@ -453,6 +453,50 @@ def parser_consolidate_spike_count(parser=argparse.ArgumentParser()):
 __commands__.append(('consolidate_spike_count', parser_consolidate_spike_count))
 
 
+def aggregate_spike_count(inDir, outFile):
+    '''aggregate multiple spike count reports into one.'''
+    spike_in_sample_counts = defaultdict(dict)  # for a given spike-in ID, map sample name to corresponding count
+    samples_seen = []
+    with open(outFile, 'wt') as outf:
+        for fn in glob.glob(inDir + "*.spike_count.txt"):  # os.listdir():
+            #fn = os.path.join(inDir, fn)
+            s = os.path.basename(fn)
+            if not s.endswith('.spike_count.txt'):
+                raise Exception()
+            if s.find("ERCC"):
+                s = s[:s.find("ERCC")-1]
+            else:
+                s = s[:-len('.spike_count.txt')]
+            if s not in samples_seen:
+                samples_seen.append(s)
+            with open(fn, 'rt') as inf:
+                for line in inf:
+                    if not line.startswith('Input bam') and not line.startswith('*'):
+                        spike, count = [line.strip().split('\t')[i] for i in [0, 2]]
+                        spike_in_sample_counts[spike][s] = count
+                        #outf.write('\t'.join([s, spike, count]) + '\n')
+        outf.write("\t".join(["spike-in"] + samples_seen) + "\n")
+        for spike in spike_in_sample_counts.keys():
+            row = []
+            row.append(spike)
+            for s in samples_seen:
+                if s in spike_in_sample_counts[spike]:
+                    row.append(spike_in_sample_counts[spike][s])
+                else:
+                    row.append("0")
+            outf.write("\t".join(row) + "\n")
+
+
+def parser_aggregate_spike_count(parser=argparse.ArgumentParser()):
+    parser.add_argument('inDir', help='Input spike count directory.')
+    parser.add_argument('outFile', help='Output report file.')
+    util.cmd.attach_main(parser, aggregate_spike_count, split_args=True)
+    return parser
+
+
+__commands__.append(('aggregate_spike_count', parser_aggregate_spike_count))
+
+
 # =========================
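For illustration (not part of the patch series; file names, sample names, and
counts below are hypothetical): assuming the util.cmd wrapper exposes the new
function as a subcommand, the same way consolidate_spike_count is exposed, a
run might look like

    reports.py aggregate_spike_count spike_counts/ aggregated.txt

For every *.spike_count.txt file in the directory, the function reads fields
0 and 2 of each tab-delimited line (skipping the 'Input bam' header and '*'
lines), derives a sample name from the file name (truncated just before
"ERCC" when the name contains it), and writes a single tab-delimited matrix
with one row per spike-in and one column per sample, filling in "0" for
samples that lack a count for that spike-in:

    spike-in      sampleA    sampleB
    ERCC-00002    1423       897
    ERCC-00003    0          12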
From 167785c9b060cc8871286f431617530acd40b1a3 Mon Sep 17 00:00:00 2001
From: Christopher Tomkins-Tinch
Date: Tue, 4 Jun 2019 13:24:24 -0400
Subject: [PATCH 2/2] sort spike-ins listed in aggregated report

sort spike-ins listed in the aggregated report; also add metavar to the
positional args to avoid camel case in the signature of the called
function.

The path used as the base for the glob is now passed through realpath
to standardize it a bit more and to allow paths specified with or
without trailing slashes (realpath removes them if present).
---
 reports.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/reports.py b/reports.py
index 87022790c..a6c81461d 100755
--- a/reports.py
+++ b/reports.py
@@ -427,11 +427,11 @@ def get_earliest_date(inDir):
     return time.strftime("%Y-%m-%d", time.localtime(earliest))
 
 
-def consolidate_spike_count(inDir, outFile):
+def consolidate_spike_count(in_dir, out_file):
     '''Consolidate multiple spike count reports into one.'''
-    with open(outFile, 'wt') as outf:
-        for fn in os.listdir(inDir):
-            fn = os.path.join(inDir, fn)
+    with open(out_file, 'wt') as outf:
+        for fn in os.listdir(in_dir):
+            fn = os.path.join(in_dir, fn)
             s = os.path.basename(fn)
             if not s.endswith('.spike_count.txt'):
                 raise Exception()
@@ -444,8 +444,8 @@ def parser_consolidate_spike_count(parser=argparse.ArgumentParser()):
-    parser.add_argument('inDir', help='Input spike count directory.')
-    parser.add_argument('outFile', help='Output report file.')
+    parser.add_argument('in_dir', metavar="inDir", help='Input spike count directory.')
+    parser.add_argument('out_file', metavar="outFile", help='Output report file.')
     util.cmd.attach_main(parser, consolidate_spike_count, split_args=True)
     return parser
 
 
@@ -453,13 +453,13 @@
 __commands__.append(('consolidate_spike_count', parser_consolidate_spike_count))
 
 
-def aggregate_spike_count(inDir, outFile):
+def aggregate_spike_count(in_dir, out_file):
     '''aggregate multiple spike count reports into one.'''
     spike_in_sample_counts = defaultdict(dict)  # for a given spike-in ID, map sample name to corresponding count
     samples_seen = []
-    with open(outFile, 'wt') as outf:
-        for fn in glob.glob(inDir + "*.spike_count.txt"):  # os.listdir():
-            #fn = os.path.join(inDir, fn)
+    with open(out_file, 'wt') as outf:
+        for fn in glob.glob(os.path.realpath(in_dir) + "/*.spike_count.txt"):  # os.listdir():
+            #fn = os.path.join(in_dir, fn)
             s = os.path.basename(fn)
             if not s.endswith('.spike_count.txt'):
                 raise Exception()
@@ -476,7 +476,7 @@
                         spike_in_sample_counts[spike][s] = count
                         #outf.write('\t'.join([s, spike, count]) + '\n')
         outf.write("\t".join(["spike-in"] + samples_seen) + "\n")
-        for spike in spike_in_sample_counts.keys():
+        for spike in sorted(spike_in_sample_counts.keys()):
             row = []
             row.append(spike)
             for s in samples_seen:
@@ -488,8 +488,8 @@ def parser_aggregate_spike_count(parser=argparse.ArgumentParser()):
-    parser.add_argument('inDir', help='Input spike count directory.')
-    parser.add_argument('outFile', help='Output report file.')
+    parser.add_argument('in_dir', metavar="inDir", help='Input spike count directory.')
+    parser.add_argument('out_file', metavar="outFile", help='Output report file.')
     util.cmd.attach_main(parser, aggregate_spike_count, split_args=True)
     return parser
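For illustration (not part of the patch series; paths are hypothetical): a
minimal Python sketch of the realpath normalization the second commit relies
on.

    import glob
    import os.path

    # os.path.realpath canonicalizes a path and drops any trailing slash,
    # so both spellings below produce the same glob pattern:
    base = os.path.realpath("spike_counts/")           # e.g. "/data/spike_counts"
    assert base == os.path.realpath("spike_counts")    # same result without the slash
    files = glob.glob(base + "/*.spike_count.txt")

    # Before this change, the pattern was built by bare concatenation,
    # glob.glob(in_dir + "*.spike_count.txt"); called without a trailing
    # slash, that becomes "spike_counts*.spike_count.txt", which matches
    # sibling entries of the directory rather than the files inside it.

The sorted() added in the same commit makes the row order of the aggregated
matrix deterministic across runs, independent of the order in which glob
returns the input files.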