This repository has been archived by the owner on Jan 8, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 3
/
getGene.py
611 lines (483 loc) · 21.9 KB
/
getGene.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
import os
import re
import string
from tt_log import logger
import Annotations as anno
import Best as best
import Cluster as cl
import pandas as pd
from sklearn.cluster import KMeans
import numpy as np
MIN_REGION_SIZE = 50
FASTA_WRAP = 60 # bases per fasta line
REGEX_NAME = re.compile('(c\d+)') # cluster ID in cluster name
REGEX_LEN = re.compile('\/(\d+)$') # cluster length in cluster name
COMPLTAB = string.maketrans('ACGTacgt', 'TGCAtgca') # for reverse-complementing reads
def getAnnotations(opt):
# hold the annotList in the RAM, which contains information of
# reference transcripts
if opt.format == 'pickle':
annotList = anno.AnnotationList.fromPickle(opt.gtf)
elif opt.format == 'alt':
annotList = anno.AnnotationList(opt.gtf, altFormat=True)
else: # standard format
annotList = anno.AnnotationList(opt.gtf)
return annotList
def getGeneFromAnnotation(opt, tranList, exonList):
# Add to lists of transcripts and exons: annotations for gene of interest.
if opt.gtf is None:
return tranList, exonList
if opt.annotations:
annotList = opt.annotations
else:
if opt.format == 'pickle':
annotList = anno.AnnotationList.fromPickle(opt.gtf)
elif opt.format == 'alt':
annotList = anno.AnnotationList(opt.gtf, altFormat=True)
else: # standard format
annotList = anno.AnnotationList(opt.gtf)
allGenes = annotList.getGeneDict()
allGenes.update({k.upper(): v for k, v in allGenes.iteritems()})
if opt.gene not in allGenes:
raise RuntimeError('gene %s is not in the annotation file' % opt.gene)
geneList = allGenes[opt.gene] # a list of Annotation objects
if len(geneList) > 1:
logger.warning('gene %s appears %d times in annotations, first occurrence plotted'
% (opt.gene, len(geneList)))
myGene = geneList[0]
for tran in myGene.getChildren(): # tran is an Annotation object
myTran = Transcript(tran.name, start=tran.start, end=tran.end,
annot=True, ID=tran.ID, source=(0, opt.gtf))
if hasattr(tran, 'startcodon'):
myTran.startcodon = tran.startcodon
if hasattr(tran, 'stopcodon'):
myTran.stopcodon = tran.stopcodon
for exon in tran.getChildren(): # exon is an Annotation object
myExon = Exon(myTran, exon.name, exon.start,
exon.end, exon.strand) # no Q score
if hasattr(exon, 'polyAs'):
myExon.polyAs = exon.polyAs
exonList.append(myExon)
myTran.exons.append(myExon)
tranList.append(myTran)
return tranList, exonList
def getMatchedIsoforms(opt):
clusterDict = dict()
for matchFile in opt.matches:
clusterDict[matchFile] = cl.ClusterDict.fromPickle(matchFile)
return clusterDict
def getGeneFromMatches(opt, tranList, exonList):
'''Add to lists of transcripts and exons: clusters which
matched gene of interest.'''
if opt.matches is None:
return tranList, exonList
localList = list() # temporary list of clusters
totClusters = 0
for ix, matchFile in enumerate(opt.matches): # --matches may have been specified more thn once
if opt.clusterDict:
clusterDict = opt.clusterDict[matchFile]
else:
clusterDict = cl.ClusterDict.fromPickle(matchFile) # pickle file produced by matchAnnot.py
for cluster in getClustersForGene(clusterDict, opt.gene): # cluster is Cluster object
cluster.source = (ix + 1, matchFile)
totClusters += 1
full, partial = cluster.getFP()
sortKey = 'f%06dp%06d' % (full, partial) # single key includes full and partial counts
matchLen = re.search(REGEX_LEN, cluster.name) # filter by cluster length, if requested
if matchLen is None:
raise RuntimeError('no length in name: %s' % cluster.name)
else:
localList.append([cluster, sortKey])
localList.sort(key=lambda x: x[1], reverse=True) # sort by full/partial counts
totFull = 0
totPartial = 0
for ent in localList:
cluster = ent[0]
myTran = Transcript(cluster.name, score=cluster.bestScore,
source=cluster.source)
myTran.chr = cluster.chr
full, partial = cluster.getFP()
totFull += full
totPartial += partial
myTran.full = full
myTran.partial = partial
end = 0
start = float('inf')
leading, trailing = cluster.cigar.softclips()
for exonNum, exon in enumerate(cluster.cigar.exons()): # exon is a cs.Exon object
exonName = '%s/%d' % (myTran.name, exonNum) # exons don't have names: make one up
if end < exon.end:
end = exon.end
if start > exon.start:
start = exon.start
if cluster.cigar.MD is not None: # if MD string was supplied
myExon = Exon(myTran, exonName, exon.start, exon.end, cluster.strand, QScore=exon.QScore())
else:
myExon = Exon(myTran, exonName, exon.start, exon.end, cluster.strand)
if exonNum == 0:
myExon.leading = leading # add leading softclips to first exon
exonList.append(myExon)
myTran.exons.append(myExon)
myExon.trailing = trailing # add trailing softclips to last exon
myTran.start = start
myTran.end = end
tranList.append(myTran)
if opt.fasta is not None:
writeFasta(opt, cluster)
logger.debug('kept %d of %d clusters for gene %s' % (len(localList), totClusters, opt.gene))
logger.debug('kept clusters include %d full + %d partial reads' % (totFull, totPartial))
return tranList, exonList
def assignBlocks(opt, exonList):
# assign exons into blocks
'''
Assign exons to blocks, separated by sequence which is intronic in
all transcripts. exonList is assumed to be sorted by ascending
start position.
'''
adjust = 0
blockNo = 0
exonIx = 0
blocks = list()
while exonIx < len(exonList): # can't use enumerate here, it's a double loop
blockStart = exonList[exonIx].start # block start = start of first exon in block
blockEnd = exonList[exonIx].end # initial value, updated in the loop below
while exonIx < len(exonList) and exonList[exonIx].start <= blockEnd:
myExon = exonList[exonIx]
if myExon.end > blockEnd:
blockEnd = myExon.end
myExon.block = blockNo
myExon.tran.blocks.add(blockNo) # transcript has an exon in this block
myExon.adjStart = myExon.start - blockStart + adjust
exonIx += 1
adjust += blockEnd - blockStart + 1
blocks.append(Block(blockStart, blockEnd, adjust))
blockNo += 1
return blocks
def assignBlocksReverse(opt, exonList):
# Like assignblocks, but for the reverse strand, ordering blocks
# from the 5' end of the transcript. exonList is assumed to be
# sorted by decreasing exon end position.
# I did this as a separate mirror image of assignBlocks, rather
# than clutter the scenery with lots of forward/reverse checks.
adjust = 0
blockNo = 0
exonIx = 0
blocks = list()
while exonIx < len(exonList): # can't use enumerate here, it's a double loop
blockStart = exonList[exonIx].end # block start = end of last exon in block
blockEnd = exonList[exonIx].start # initial value, updated in the loop below
while exonIx < len(exonList) and exonList[exonIx].end >= blockEnd:
myExon = exonList[exonIx]
if myExon.start < blockEnd:
blockEnd = myExon.start
myExon.block = blockNo
myExon.tran.blocks.add(blockNo) # transcript has an exon in this block
myExon.adjStart = blockStart - myExon.end + adjust
exonIx += 1
adjust += blockStart - blockEnd + 1
blocks.append(Block(blockStart, blockEnd, adjust))
blockNo += 1
return blocks
def annotationBlocks(exonList):
# Assign exons to blocks, separated by sequence which is intronic in
# all transcripts. exonList is assumed to be sorted by ascending
# start position.
annotExons = list()
for e in exonList:
if e.tran.annot is True:
annotExons.append(e)
adjust = annotExons[0].adjStart
blockNo = 0
exonIx = 0
blocks = list()
while exonIx < len(annotExons): # can't use enumerate here, it's a double loop
blockStart = annotExons[exonIx].start # block start = start of first exon in block
blockEnd = annotExons[exonIx].end # initial value, updated in the loop below
while exonIx < len(annotExons) and annotExons[exonIx].start <= blockEnd:
myExon = annotExons[exonIx]
if myExon.end > blockEnd:
blockEnd = myExon.end
exonIx += 1
adjust += blockEnd - blockStart + 1
blocks.append(Block(blockStart, blockEnd, adjust))
blockNo += 1
return blocks
def annotationBlocksReverse(exonList):
annotExons = list()
for e in exonList:
if e.tran.annot is True:
annotExons.append(e)
adjust = annotExons[0].adjStart
blockNo = 0
exonIx = 0
blocks = list()
while exonIx < len(annotExons): # can't use enumerate here, it's a double loop
blockStart = annotExons[exonIx].end # block start = start of first exon in block
blockEnd = annotExons[exonIx].start # initial value, updated in the loop below
while exonIx < len(annotExons) and annotExons[exonIx].end >= blockEnd:
myExon = annotExons[exonIx]
if myExon.start < blockEnd:
blockEnd = myExon.start
exonIx += 1
adjust += blockStart - blockEnd + 1
blocks.append(Block(blockStart, blockEnd, adjust))
blockNo += 1
return blocks
def findRegions(tranList):
# Find breakpoints where coverage by exons changes.
# Why are we doing this? See the note in the Transcript class
# definition below.
breaks = list()
for tranIx, tran in enumerate(tranList):
for exon in tran.exons:
breaks.append([exon.start, 0, tranIx, tran.name, exon.name])
breaks.append([exon.end, 1, tranIx, tran.name, exon.name])
breaks.sort(key=lambda x: x[0])
curPos = breaks[0][0]
curTranSet = set()
region = 0
for ix in xrange(len(breaks)):
posit, flag, tranIx, tranName, exonName = breaks[ix]
if posit > curPos + MIN_REGION_SIZE: # this is a new region
if len(curTranSet) > 0:
for ix in curTranSet:
tranList[ix].regions.add(region) # update set of regions hit by this transcript
region += 1
curPos = posit
if flag == 0: # exon start
curTranSet.add(tranIx)
else: # exon end
curTranSet.remove(tranIx)
logger.debug('found %d regions' % region)
return
def orderTranscripts(tranList):
'''
Order the transcripts (i,e., assign each a Y coordinate) so similar
transcripts are close to each other.
'''
# The measure of similarity used here is block occupancy: The
# distance between two transcripts is the number of blocks where
# one transcript has exons, and the other doesn't. How many exons
# there are, or how similar they are in length, is not looked at.
# The ordering is done using a greedy nearest-neighbor
# heuristic. To do it optimally turns it into a Traveling Salesman
# problem.
tranNames = list()
curTran = tranList[0] # arbitrarily start with the first transcript
tranIx = 0
while True: # loop until break below
if curTran.annot is False:
tranNames.append(curTran.name) # needed for yticks call
else:
tranNames.append(curTran.ID)
curTran.tranIx = tranIx
bestTran = best.Best(reverse=True)
for myTran in tranList: # find the next closest transcript
if myTran.tranIx is None: # if transcript hasn't been indexed yet
diff = len(curTran.regions.symmetric_difference(myTran.regions))
bestTran.update(diff, myTran)
if bestTran.which is None: # every transcript has its index: we're done
break
curTran = bestTran.which
tranIx += 1
return tranNames
def writeFasta(opt, cluster):
'''Write a fasta file for a cluster.'''
# It's fairly common to want to see the sequence for interesting
# clusters in fasta format. It's convenient to build that into
# clusterView, since the logic for picking the most populous (or
# otherwise interesting) clusters is already here.
if not os.path.exists(opt.fasta):
os.makedirs(opt.fasta)
elif not os.path.isdir(opt.fasta):
raise RuntimeError('%s exists but is not a directory' % opt.fasta)
match = re.search(REGEX_NAME, cluster.name)
if match is None:
raise RuntimeError('cannot find cluster ID in %s' % cluster.name)
if cluster.strand == '+': # Cluster object includes bases in forward strand sense
bases = cluster.bases
else:
bases = cluster.bases[::-1].translate(COMPLTAB) # fasta file wants them in read sense
filename = '%s/%s.fasta' % (opt.fasta, match.group(1))
handle = open(filename, 'w')
handle.write('>%s\n' % cluster.name)
for ix in xrange(0, len(bases), FASTA_WRAP):
handle.write(bases[ix:ix + FASTA_WRAP] + '\n')
handle.close()
def groupTran(tranList, exonList, cluster_num):
"""
Group transcripts by exon/intron similarities.
"""
# minVal is the minimum starting point of all the transcripts,
# maxVal stands for maximum
maxVal = 0
minVal = float('inf')
matchTran = list()
for tran in tranList:
if tran.annot is False:
if maxVal < tran.end:
maxVal = tran.end
if minVal > tran.start:
minVal = tran.start
matchTran.append(tran)
if len(matchTran) == 0:
return None
df = pd.DataFrame(data=matchTran, columns=['tran'])
df['min'] = minVal
df['max'] = maxVal
df['name'] = df.apply(getName, axis=1)
df['exons'] = df.apply(getExon, axis=1)
# Build a matrix contains only true and false
#
# Transcript1: ----- ---- -- -------
# Transcript2: ---- ------ ------- ---
# Superimpose: ---------------------------
# booleanTran: FFFFFFFFFFFFFFFFFFFFFFFFFFF
# Overlap1: FFTTTTTFFFFTTTTFFTTFTTTTTTT
# booleanTran has the same length with superimpose, which is a list of
# False. Change the overlap region between the booleanTran and each
# transcript to True.
df['boolean'] = df.apply(toBoolean, axis=1)
# Create a distance table that can be used in K-Means.
#
# c225/f26p50/6117 c483/f8p23/6083 c20615/f3p27/6185
# c225/f26p50/6117 0.000000 0.029911 0.012941
# c483/f8p23/6083 0.029911 0.000000 0.019231
# c20615/f3p27/6185 0.012941 0.019231 0.000000
# The number can be interpreted as the similarity between each two
# transcript. 0 means they are exactly same while 1 means they have
# no overlap region.
# Create the distance matrix efficiently by noting that it is symmetrical and performing the following steps:
# (1) Initialize matrix with all zeros, which covers the diagonals;
# (2) calculate distances for only entries to the bottom-left of the diagonal;
# (3) add the matrix and its transpose to fill in the entries to the top-right of the diagonal.
# NOTE: if distance function changes, then need to revisit this code.
length = len(df)
matrix = np.full((length, length), 0.0)
for cur_index in range(1, length):
matrix[cur_index][0:cur_index] = [calcDis(df, cur_index, i) for i in range(cur_index)]
matrix = matrix + matrix.T
# Create dataframe distance table.
index = df['name']
distanceTable = pd.DataFrame()
distanceTable = pd.DataFrame(matrix)
distanceTable.columns = index
distanceTable.index = index
# Group transcripts, n_clusters set how many groups should be assigned
colorDF = pd.DataFrame()
colorDF['name'] = df['name']
if len(colorDF) < cluster_num:
cluster_num = len(colorDF)
for i in range(cluster_num):
group = KMeans(n_clusters=i + 1).fit_predict(distanceTable)
groupName = 'group%s' % str(i + 1)
colorDF[groupName] = group
return colorDF
# functions that are applied by pandas dataframe
def getExon(row):
startEnd = list()
exons = row['tran'].exons
for exon in exons:
# the adjusted start, end of exons, which starts from 0
startEnd.append((exon.start - row['min'], exon.end - row['min']))
return startEnd
def getName(row):
return row.tran.name
def toBoolean(row):
booleanTran = [False for x in range(row['max'] - row['min'])]
exons = row['exons']
for exon in exons:
booleanTran[exon[0]:exon[1] + 1] = [True for x in range(exon[1] + 1 - exon[0])]
return booleanTran
def calcDis(df, i, j):
# calculate the distance between two transcripts
# transcript1: length L1
# transcript2: length L2
# overlap: length L3
# distance: (L1+L2-2L3)/(L1+L2-L3)
tran1 = df.ix[i]
tran2 = df.ix[j]
sum1 = float(sum(tran1['boolean']))
sum2 = float(sum(tran2['boolean']))
overlapLength = sum([a and b for a, b in zip(tran1['boolean'], tran2['boolean'])])
distance = (sum1 + sum2 - 2 * overlapLength) / (sum1 + sum2 - overlapLength)
return distance
# cut the transcript name if it is too long
def reduceNameLength(tranNames):
newTranNames = list()
for name in tranNames:
if len(name) >= 30:
try:
splitList = name.split('|')
newName = "|".join([splitList[0], splitList[2]])
newTranNames.append(newName)
except:
newTranNames.append(name[:25])
else:
newTranNames.append(name)
return newTranNames
def getClustersForGene(clusterDict, gene):
'''Generator function to return clusters for specified gene.'''
gd = clusterDict.getGeneDict()
gd.update({k.upper(): v for k, v in gd.iteritems()})
if gene not in gd:
return
for cluster in gd[gene]:
yield cluster
return
class Transcript (object):
'''Just a struct actually, containing data about a transcript.'''
def __init__(self, name, start=None, end=None, score=None, full=None,
partial=None, annot=False, ID=None, chr=None, source=None):
self.name = name
self.score = score
self.annot = annot # transcript comes from annotations?
self.tranIx = None # y-axis coordinate of transcript
self.full = full # number of full-length reads
self.partial = partial # number of partial-length reads
self.start = start
self.end = end
self.ID = ID
self.exons = list() # Exon objects for this transcript
self.blocks = set() # blocks where this transcript has exon(s)
self.regions = set() # regions where this transcript has exon(s)
self.chr = chr
self.source = source
# What's the difference between a block and a region? Every
# exon boundary defines a new region. A new block occurs only
# when exon coverage transitions from 0 to >0. The example
# below comprises 5 regions, but only one block.
# ==============
# ==============
# ==================
# | | | | | |
# I originally used block occupancy to group similar
# transcripts in orderTranscripts. But that turned out not to
# work very well, so I invented regions as a finer-grain
# version of that idea.
class Exon (object):
'''Struct containing data about an exon.'''
def __init__(self, tran, name, start, end, strand, QScore=None):
self.tran = tran # Transcript object containing this exon
self.name = name
self.start = start
self.end = end
self.strand = strand
self.QScore = QScore
self.block = None # block number where this exon resides
self.adjStart = None # start of exon in phony x-axis coordinates
self.leading = 0 # number of leading softclipped bases
self.trailing = 0 # number of trailing softclipped bases
class Block (object):
'''Struct for plot block.'''
# A plot block is a vertical span representing a range contiguous
# bases. Plot blocks are separated by vertical lines representing
# regions of the reference, of unspecified length, which contain
# no exons.
# one implication of that scheme is that the x axis of the plot is
# meaningless: it represents neither genomic nor RNA sequence range.
def __init__(self, start, end, boundary):
self.start = start # actual genomic start coord
self.end = end # actual genomic end coord
self.boundary = boundary # right-hand boundary x-coord in phony space
self.annot = False # block contains annotation exons?