-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathsummarise_topGO_output.py
executable file
·52 lines (40 loc) · 1.9 KB
/
summarise_topGO_output.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#!/usr/bin/env python3
"""
> summarise_topGO_output.py <
Script checks the current working directory for output files from the topGO
pipeline (bp_*.txt, cc_*.txt and mf_*.txt), then summarises each bp/cc/mf trio
into a matching summary_*.txt file.
This script used to exist as a shell script that parsed the same files using
awk, but I discovered that awk might treat floats in scientific notation
as strings, therefore incorrectly concluding that '2.3e-09' is not smaller than
0.05. This weird behaviour was first diagnosed for the awk distributed in
Debian on WSL--it didn't happen on Cygwin/Ubuntu on WSL.
Regardless, as awk is much less predictable than Python, this script was
written to give me peace of mind.
"""
import csv
import glob
# Filtering thresholds applied to every data row of the topGO tables.
MIN_UNIVERSE_TERMS = 5   # row[3] must be at least this
P_VALUE_CUTOFF = 0.05    # row[6] must be strictly below this


def row_passes(row):
    """Return True if a topGO result row belongs in the summary.

    Two criteria must both hold:
      1. At least MIN_UNIVERSE_TERMS terms in the universe (row[3]).
      2. P value (row[6]) is below P_VALUE_CUTOFF, OR it starts with the
         character '<' (because R outputs stuff like '< 1e-30').

    Raises IndexError if the row has fewer than seven fields, and
    ValueError if row[3] is not an integer or row[6] is neither a
    '<'-prefixed string nor something float() can parse.
    """
    if int(row[3]) < MIN_UNIVERSE_TERMS:
        return False

    p_value = row[6].strip()
    # startswith() is safe on an empty string, unlike indexing with [0]
    if p_value.startswith('<'):
        return True

    # float() parses scientific notation such as '2.3e-09' numerically,
    # which is the whole reason this script exists
    return float(p_value) < P_VALUE_CUTOFF


def summarise_section(path, opf):
    """Write the filtered contents of one topGO output file to opf.

    The section consists of a '-- <path> --' title line, the header line,
    every data row accepted by row_passes(), and a terminating blank line.
    An empty input file produces just the title and the blank line.
    """
    # the with-block guarantees the input handle is closed; newline='' is
    # the mode the csv module documentation asks for
    with open(path, newline='') as ipf:
        tsv_reader = csv.reader(ipf, delimiter='\t')

        print('-- {} --'.format(path), file=opf)

        header = next(tsv_reader, None)
        if header is not None:
            # the header is written with a leading empty field, i.e. shifted
            # one column to the right -- presumably because data rows carry
            # one more leading column than the header (TODO confirm against
            # the topGO pipeline output)
            print('', *header, sep='\t', file=opf)

        for row in tsv_reader:
            if row and row_passes(row):
                print('\t'.join(row), file=opf)

    # blank line separating this section from the next
    print(file=opf)


def summarise_trio(bp_file):
    """Summarise bp_file and its cc_/mf_ siblings into one summary_ file.

    Sibling and output names are derived by swapping the 'bp' in bp_file;
    the output file is overwritten if it already exists.
    """
    # the '1' is to replace bp with the replacement string only once
    cc_file = bp_file.replace('bp', 'cc', 1)
    mf_file = bp_file.replace('bp', 'mf', 1)
    output_file = bp_file.replace('bp', 'summary', 1)

    with open(output_file, 'w') as opf:
        for f in [bp_file, cc_file, mf_file]:
            summarise_section(f, opf)


def main():
    """Summarise every bp_*.txt trio found in the current directory."""
    for bp_file in sorted(glob.glob('bp_*.txt')):
        summarise_trio(bp_file)


if __name__ == '__main__':
    main()