forked from mpertea/stringtie2-initial-release
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathprepDE.py
executable file
·274 lines (233 loc) · 10.8 KB
/
prepDE.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
#!/usr/bin/env python2
import re, csv, sys, os, glob, warnings, itertools
from math import ceil
from optparse import OptionParser
from operator import itemgetter
MIN_PYTHON = (2, 7)
if sys.version_info < MIN_PYTHON:
sys.exit("Python %s.%s or later is required.\n" % MIN_PYTHON)
parser=OptionParser(description='Generates two CSV files containing the count matrices for genes and transcripts, using the coverage values found in the output of `stringtie -e`')
parser.add_option('-i', '--input', '--in', default='ballgown', help="the parent directory of the sample sub-directories or a textfile listing the paths to GTF files [default: %default]")
parser.add_option('-g', default='gene_count_matrix.csv', help="where to output the gene count matrix [default: %default")
parser.add_option('-t', default='transcript_count_matrix.csv', help="where to output the transcript count matrix [default: %default]")
parser.add_option('-l', '--length', default=75, type='int', help="the average read length [default: %default]")
parser.add_option('-p', '--pattern', default=".", help="a regular expression that selects the sample subdirectories")
parser.add_option('-c', '--cluster', action="store_true", help="whether to cluster genes that overlap with different gene IDs, ignoring ones with geneID pattern (see below)")
parser.add_option('-s', '--string', default="MSTRG", help="if a different prefix is used for geneIDs assigned by StringTie [default: %default]")
parser.add_option('-k', '--key', default="prepG", help="if clustering, what prefix to use for geneIDs assigned by this script [default: %default]")
parser.add_option('--legend', default="legend.csv", help="if clustering, where to output the legend file mapping transcripts to assigned geneIDs [default: %default]")
(opts, args)=parser.parse_args()
samples = [] # List of tuples. If sample list, (first column, path). Else, (subdirectory name, path to gtf file in subdirectory)
if (os.path.isfile(opts.input)):
# gtfList = True
try:
fin = open(opts.input, 'r')
for line in fin:
if line[0] != '#':
lineLst = tuple(line.strip().split())
if (len(lineLst) != 2):
print "Error: Text file with sample ID and path invalid (%s)" % (line.strip())
exit(1)
if lineLst[0] in samples:
print "Error: Sample ID duplicated (%s)" % (lineLst[0])
exit(1)
if not os.path.isfile(lineLst[1]):
print "Error: GTF file not found (%s)" % (lineLst[1])
exit(1)
samples.append(lineLst)
except IOError:
print "Error: List of .gtf files, %s, doesn't exist" % (opts.input)
exit(1)
else:
# gtfList = False
## Check that opts.input directory exists
if not os.path.isdir(opts.input):
parser.print_help()
print " "
print "Error: sub-directory '%s' not found!" % (opts.input)
sys.exit(1)
#####
## Collect all samples file paths and if empty print help message and quit
#####
samples = [(i,glob.iglob(os.path.join(opts.input,i,"*.gtf")).next()) for i in next(os.walk(opts.input))[1] if re.search(opts.pattern,i)]
if len(samples) == 0:
parser.print_help()
print " "
print "Error: no GTF files found under ./%s !" % (opts.input)
sys.exit(1)
RE_GENE_ID=re.compile('gene_id "([^"]+)"')
RE_GENE_NAME=re.compile('gene_name "([^"]+)"')
RE_TRANSCRIPT_ID=re.compile('transcript_id "([^"]+)"')
RE_COVERAGE=re.compile('cov "([\-\+\d\.]+)"')
RE_STRING=re.compile(re.escape(opts.string))
#####
## Sort the sample names by the sample ID
#####
samples.sort()
#####
## Checks whether a given row is a transcript
## other options: ex. exon, transcript, mRNA, 5'UTR
#####
def is_transcript(x):
return len(x)>2 and x[2]=="transcript"
def getGeneID(s, ctg, tid):
r=RE_GENE_ID.search(s)
if r: return r.group(1)
r=RE_GENE_NAME.search(s)
if r: return ctg+'|'+r.group(1)
return tid
def getCov(s):
r=RE_COVERAGE.search(s)
if r:
v=float(r.group(1))
if v<0.0: v=0.0
return v
return 0.0
def is_overlap(x,y): #NEEDS TO BE INTS!
return x[0]<=y[1] and y[0]<=x[1]
def t_overlap(t1, t2): #from badGenes: chromosome, strand, cluster, start, end, (e1start, e1end)...
if t1[0] != t2[0] or t1[1] != t2[1] or t1[5]<t2[4]: return False
for i in range(6, len(t1)):
for j in range(6, len(t2)):
if is_overlap(t1[i], t2[j]): return True
return False
## Average Readlength
read_len=opts.length
## Variables/Matrices to store t/g_counts
t_count_matrix, g_count_matrix=[],[]
##Get ready for clustering, stuff is once for all samples##
geneIDs={} #key=transcript, value=cluster/gene_id
## For each of the sorted sample paths
for s in samples:
badGenes=[] #list of bad genes (just ones that aren't MSTRG)
try:
## opts.input = parent directory of sample subdirectories
## s = sample currently iterating through
## os.path.join(opts.input,s,"*.gtf") path to current sample's GTF
## split = list of lists: [[chromosome, ...],...]
#with open(glob.iglob(os.path.join(opts.input,s,"*.gtf")).next()) as f:
# split=[l.split('\t') for l in f.readlines()]
# if not gtfList:
# f = open(glob.iglob(os.path.join(opts.input,s[1],"*.gtf")).next())
# else:
# f = open(s[1])
with open(s[1]) as f:
split=[l.split('\t') for l in f.readlines()]
## i = numLine; v = corresponding i-th GTF row
for i,v in enumerate(split):
if is_transcript(v):
t_id=RE_TRANSCRIPT_ID.search(v[len(v)-1]).group(1)
try:
g_id=getGeneID(v[len(v)-1], v[0], t_id)
except:
print "Problem at line:\n:%s\n" % (v)
print "i='%s', len(v)=%s" % (i, len(v));
sys.exit(1)
geneIDs.setdefault(t_id, g_id)
if not RE_STRING.match(g_id):
badGenes.append([v[0],v[6], t_id, g_id, min(int(v[3]),int(v[4])), max(int(v[3]),int(v[4]))]) #chromosome, strand, cluster/transcript id, start, end
j=i+1
while j<len(split) and split[j][2]=="exon":
badGenes[len(badGenes)-1].append((min(int(split[j][3]), int(split[j][4])), max(int(split[j][3]), int(split[j][4]))))
j+=1
except StopIteration:
warnings.warn("Didn't get a GTF in that directory. Looking in another...")
else: #we found the "bad" genes!
break
##THE CLUSTERING BEGINS!##
if opts.cluster and len(badGenes)>0:
clusters=[] #lists of lists (could be sets) or something of transcripts
badGenes.sort(key=itemgetter(3)) #sort by start coord...?
i=0
while i<len(badGenes): #rather un-pythonic
temp_cluster=[badGenes[i]]
k=0
while k<len(temp_cluster):
j=i+1
while j<len(badGenes):
if t_overlap(temp_cluster[k], badGenes[j]):
temp_cluster.append(badGenes[j])
del badGenes[j]
else:
j+=1
k+=1
if len(temp_cluster)>1:
clusters.append([t[2] for t in temp_cluster])
i+=1
print len(clusters)
for c in clusters:
c.sort()
clusters.sort(key=itemgetter(0))
legend=[]
for u,c in enumerate(clusters):
my_ID=opts.key+str((u+1))
legend.append(list(itertools.chain.from_iterable([[my_ID],c]))) #my_ID, clustered transcript IDs
for t in c:
geneIDs[t]=my_ID
## geneIDs[t]="|".join(c) #duct-tape transcript IDs together, disregarding ref_gene_names and things like that
with open(opts.legend, 'w') as l_file:
my_writer=csv.writer(l_file)
my_writer.writerows(legend)
geneDict={} #key=gene/cluster, value=dictionary with key=sample, value=summed counts
t_dict={}
for q, s in enumerate(samples):
print q, s[0]
try:
#with open(glob.iglob(os.path.join(opts.input,s,"*.gtf")).next()) as f: #grabs first .gtf file it finds inside the sample subdirectory
# if not gtfList:
# f = open(glob.iglob(os.path.join(opts.input,s[1],"*.gtf")).next())
# else:
f = open(s[1])
# s = s.split('/')[len(s.split('/')) - 1].split('.gtf')[0].split('_')[0]
# s = sample_IDs[q]
## split=[t[:len(t)-1]+t[len(t)-1].split(";") for t in split]
## split=[t[:len(t)-1] for t in split] #eliminate '\n' at end
## split=[[e.lstrip() for e in t] for t in split]
#should consider making stuff into dictionaries, maybe each split line
## transcriptList=[]
transcript_len=0
for l in f:
if l.startswith("#"):
continue
v=l.split('\t')
if v[2]=="transcript":
if transcript_len>0:
## transcriptList.append((g_id, t_id, int(ceil(coverage*transcript_len/read_len))))
t_dict.setdefault(t_id, {})
t_dict[t_id].setdefault(s[0], int(ceil(coverage*transcript_len/read_len)))
t_id=RE_TRANSCRIPT_ID.search(v[len(v)-1]).group(1)
#g_id=RE_GENE_ID.search(v[len(v)-1]).group(1)
g_id=getGeneID(v[len(v)-1], v[0], t_id)
#coverage=float(RE_COVERAGE.search(v[len(v)-1]).group(1))
coverage=getCov(v[len(v)-1])
transcript_len=0
if v[2]=="exon":
transcript_len+=int(v[4])-int(v[3])+1 #because end coordinates are inclusive in GTF
## transcriptList.append((g_id, t_id, int(ceil(coverage*transcript_len/read_len))))
t_dict.setdefault(t_id, {})
t_dict[t_id].setdefault(s[0], int(ceil(coverage*transcript_len/read_len)))
except StopIteration:
# if not gtfList:
# warnings.warn("No GTF file found in " + os.path.join(opts.input,s[1]))
# else:
warnings.warn("No GTF file found in " + s[1])
## transcriptList.sort(key=lambda bla: bla[1]) #gene_id
for i,v in t_dict.iteritems():
## print i,v
geneDict.setdefault(geneIDs[i],{}) #gene_id
geneDict[geneIDs[i]].setdefault(s[0],0)
geneDict[geneIDs[i]][s[0]]+=v[s[0]]
with open(opts.t, 'w') as csvfile:
my_writer = csv.DictWriter(csvfile, fieldnames = ["transcript_id"] + [x for x,y in samples])
my_writer.writerow(dict((fn,fn) for fn in my_writer.fieldnames))
for i in t_dict:
t_dict[i]["transcript_id"] = i
my_writer.writerow(t_dict[i])
with open(opts.g, 'w') as csvfile:
my_writer = csv.DictWriter(csvfile, fieldnames = ["gene_id"] + [x for x,y in samples])
## my_writer.writerow([""]+samples)
## my_writer.writerows(geneDict)
my_writer.writerow(dict((fn,fn) for fn in my_writer.fieldnames))
for i in geneDict:
geneDict[i]["gene_id"] = i #add gene_id to row
my_writer.writerow(geneDict[i])