From 910cd36644c9a03a28f62954afafb9b4c0c2243e Mon Sep 17 00:00:00 2001
From: Christopher Tomkins-Tinch
Date: Wed, 29 May 2019 16:05:31 +0100
Subject: [PATCH 1/2] add aggregate_spike_count to reports.py

add a new function, aggregate_spike_count, to reports.py to produce a
matrix of spike-in counts per-sample from an array of per-sample
spike-in count files
---
 reports.py | 46 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 45 insertions(+), 1 deletion(-)

diff --git a/reports.py b/reports.py
index 4e9d60f96..87022790c 100755
--- a/reports.py
+++ b/reports.py
@@ -10,7 +10,7 @@
 import glob
 import os
 import time
-from collections import OrderedDict
+from collections import OrderedDict, defaultdict
 import csv
 import math
 import shutil
@@ -453,6 +453,50 @@ def parser_consolidate_spike_count(parser=argparse.ArgumentParser()):
 __commands__.append(('consolidate_spike_count', parser_consolidate_spike_count))
 
 
+def aggregate_spike_count(inDir, outFile):
+    '''aggregate multiple spike count reports into one.'''
+    spike_in_sample_counts = defaultdict(dict)  # for a given spike-in ID, map sample name to corresponding count
+    samples_seen = []
+    with open(outFile, 'wt') as outf:
+        for fn in glob.glob(inDir + "*.spike_count.txt"):  # os.listdir():
+            #fn = os.path.join(inDir, fn)
+            s = os.path.basename(fn)
+            if not s.endswith('.spike_count.txt'):
+                raise Exception()
+            if s.find("ERCC"):
+                s = s[:s.find("ERCC")-1]
+            else:
+                s = s[:-len('.spike_count.txt')]
+            if s not in samples_seen:
+                samples_seen.append(s)
+            with open(fn, 'rt') as inf:
+                for line in inf:
+                    if not line.startswith('Input bam') and not line.startswith('*'):
+                        spike, count = [line.strip().split('\t')[i] for i in [0, 2]]
+                        spike_in_sample_counts[spike][s] = count
+                        #outf.write('\t'.join([s, spike, count]) + '\n')
+        outf.write("\t".join(["spike-in"] + samples_seen) + "\n")
+        for spike in spike_in_sample_counts.keys():
+            row = []
+            row.append(spike)
+            for s in samples_seen:
+                if s in spike_in_sample_counts[spike]:
+                    row.append(spike_in_sample_counts[spike][s])
+                else:
+                    row.append("0")
+            outf.write("\t".join(row) + "\n")
+
+
+def parser_aggregate_spike_count(parser=argparse.ArgumentParser()):
+    parser.add_argument('inDir', help='Input spike count directory.')
+    parser.add_argument('outFile', help='Output report file.')
+    util.cmd.attach_main(parser, aggregate_spike_count, split_args=True)
+    return parser
+
+
+__commands__.append(('aggregate_spike_count', parser_aggregate_spike_count))
+
+
 # =========================
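For illustration (not part of the patch series; file names, sample names, and
counts below are hypothetical): assuming the util.cmd wrapper exposes the new
function as a subcommand, the same way consolidate_spike_count is exposed, a
run might look like

    reports.py aggregate_spike_count spike_counts/ aggregated.txt

For every *.spike_count.txt file in the directory, the function reads fields
0 and 2 of each tab-delimited line (skipping the 'Input bam' header and '*'
lines), derives a sample name from the file name (truncated just before
"ERCC" when the name contains it), and writes a single tab-delimited matrix
with one row per spike-in and one column per sample, filling in "0" for
samples that lack a count for that spike-in:

    spike-in      sampleA    sampleB
    ERCC-00002    1423       897
    ERCC-00003    0          12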
From 167785c9b060cc8871286f431617530acd40b1a3 Mon Sep 17 00:00:00 2001
From: Christopher Tomkins-Tinch
Date: Tue, 4 Jun 2019 13:24:24 -0400
Subject: [PATCH 2/2] sort spike-ins listed in aggregated report

sort spike-ins listed in the aggregated report; also add metavar to the
positional args to avoid camel case in the signature of the called
function.

The path used as the base for the glob is now passed through realpath
to standardize it a bit more and to allow paths specified with or
without trailing slashes (realpath removes them if present).
---
 reports.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/reports.py b/reports.py
index 87022790c..a6c81461d 100755
--- a/reports.py
+++ b/reports.py
@@ -427,11 +427,11 @@ def get_earliest_date(inDir):
     return time.strftime("%Y-%m-%d", time.localtime(earliest))
 
 
-def consolidate_spike_count(inDir, outFile):
+def consolidate_spike_count(in_dir, out_file):
     '''Consolidate multiple spike count reports into one.'''
-    with open(outFile, 'wt') as outf:
-        for fn in os.listdir(inDir):
-            fn = os.path.join(inDir, fn)
+    with open(out_file, 'wt') as outf:
+        for fn in os.listdir(in_dir):
+            fn = os.path.join(in_dir, fn)
             s = os.path.basename(fn)
             if not s.endswith('.spike_count.txt'):
                 raise Exception()
@@ -444,8 +444,8 @@ def parser_consolidate_spike_count(parser=argparse.ArgumentParser()):
-    parser.add_argument('inDir', help='Input spike count directory.')
-    parser.add_argument('outFile', help='Output report file.')
+    parser.add_argument('in_dir', metavar="inDir", help='Input spike count directory.')
+    parser.add_argument('out_file', metavar="outFile", help='Output report file.')
     util.cmd.attach_main(parser, consolidate_spike_count, split_args=True)
     return parser
 
 
@@ -453,13 +453,13 @@
 __commands__.append(('consolidate_spike_count', parser_consolidate_spike_count))
 
 
-def aggregate_spike_count(inDir, outFile):
+def aggregate_spike_count(in_dir, out_file):
     '''aggregate multiple spike count reports into one.'''
     spike_in_sample_counts = defaultdict(dict)  # for a given spike-in ID, map sample name to corresponding count
     samples_seen = []
-    with open(outFile, 'wt') as outf:
-        for fn in glob.glob(inDir + "*.spike_count.txt"):  # os.listdir():
-            #fn = os.path.join(inDir, fn)
+    with open(out_file, 'wt') as outf:
+        for fn in glob.glob(os.path.realpath(in_dir) + "/*.spike_count.txt"):  # os.listdir():
+            #fn = os.path.join(in_dir, fn)
             s = os.path.basename(fn)
             if not s.endswith('.spike_count.txt'):
                 raise Exception()
@@ -476,7 +476,7 @@
                         spike_in_sample_counts[spike][s] = count
                         #outf.write('\t'.join([s, spike, count]) + '\n')
         outf.write("\t".join(["spike-in"] + samples_seen) + "\n")
-        for spike in spike_in_sample_counts.keys():
+        for spike in sorted(spike_in_sample_counts.keys()):
             row = []
             row.append(spike)
             for s in samples_seen:
@@ -488,8 +488,8 @@ def parser_aggregate_spike_count(parser=argparse.ArgumentParser()):
-    parser.add_argument('inDir', help='Input spike count directory.')
-    parser.add_argument('outFile', help='Output report file.')
+    parser.add_argument('in_dir', metavar="inDir", help='Input spike count directory.')
+    parser.add_argument('out_file', metavar="outFile", help='Output report file.')
     util.cmd.attach_main(parser, aggregate_spike_count, split_args=True)
     return parser
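For illustration (not part of the patch series; paths are hypothetical): a
minimal Python sketch of the realpath normalization the second commit relies
on.

    import glob
    import os.path

    # os.path.realpath canonicalizes a path and drops any trailing slash,
    # so both spellings below produce the same glob pattern:
    base = os.path.realpath("spike_counts/")           # e.g. "/data/spike_counts"
    assert base == os.path.realpath("spike_counts")    # same result without the slash
    files = glob.glob(base + "/*.spike_count.txt")

    # Before this change, the pattern was built by bare concatenation,
    # glob.glob(in_dir + "*.spike_count.txt"); called without a trailing
    # slash, that becomes "spike_counts*.spike_count.txt", which matches
    # sibling entries of the directory rather than the files inside it.

The sorted() added in the same commit makes the row order of the aggregated
matrix deterministic across runs, independent of the order in which glob
returns the input files.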