Skip to content

Commit 0de3f99

Browse files
author
Bill Majoros
committed
update
1 parent 7fbc5f0 commit 0de3f99

10 files changed

+125
-23
lines changed

CodonIterator.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ def __init__(self,transcript,axisSequenceRef,stopCodons):
5151
exon.containsCoordinate(startCodon-1)): break
5252
if(not exon.containsCoordinate(startCodon) and
5353
not exon.containsCoordinate(startCodon-1)):
54-
raise Exception("start codon not found: "+str(startCodon))
54+
raise Exception(transcript.getID()+"start codon not found: "+str(startCodon))
5555
self.exon=exon
5656
self.relative=(startCodon-exon.begin) if strand=="+" else \
5757
exon.end-startCodon
@@ -99,7 +99,7 @@ def nextCodon(self):
9999
triplet=exonSeq[relative:relative+thisExonContrib]
100100
transcript=self.transcript
101101
thisExonOrder=exon.order
102-
if(not thisExonOrder): raise Exception("exon has no order")
102+
if(thisExonOrder is None): raise Exception("exon has no order")
103103
exons=transcript.exons
104104
numExons=len(exons)
105105
nextExonOrder=thisExonOrder+1
@@ -113,7 +113,7 @@ def nextCodon(self):
113113
exonLen1=exon.getLength()
114114
realSeqLen2=len(nextExon.sequence)
115115
exonLen2=nextExon.getLength()
116-
raise Exception("Error in transcript "+transcriptId+": nextContrib="+nextExonContrib+" triplet=\""+triplet+"\" exonLen1="+exonLen1+" realLen1="+realSeqLen1+" exonLen2="+exonLen2+" realLen2="+realSeqLen2)
116+
raise Exception("Error in transcript "+transcriptId+": nextContrib="+str(nextExonContrib)+" triplet=\""+triplet+"\" exonLen1="+str(exonLen1)+" realLen1="+str(realSeqLen1)+" exonLen2="+str(exonLen2)+" realLen2="+str(realSeqLen2))
117117
self.relative=nextExonContrib
118118
self.absolute=nextExon.begin+nextExonContrib if strand=="+" \
119119
else nextExon.end-nextExonContrib

Exon.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,7 @@ def reverseComplement(self,seqLen):
203203
end=self.getEnd()
204204
self.begin=seqLen-end
205205
self.end=seqLen-begin
206-
self.strand=compStrand(self.strand)
206+
self.strand=self.compStrand(self.strand)
207207

208208
def copy(self):
209209
new=Exon(self.begin,self.end,self.transcript)

FastaReader.py

+15-3
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
# Instance Methods:
1818
# reader=FastaReader(filename)
1919
# reader=readerFromFileHandle(fileHandle);
20-
# [defline,sequence]=reader.nextSequence()
20+
# (defline,sequence)=reader.nextSequence()
2121
# reader.close()
2222
# reader.dontUppercase()
2323
# reader.doUppercase()
@@ -26,8 +26,9 @@
2626
# num=FastaReader.countEntries(filename)
2727
# FastaReader.readAll(filename) # returns hash : id->sequence
2828
# FastaReader.readAllAndKeepDefs(filename) # returns hash : id->[def,seq]
29+
# FastaReader.readAllIntoArray(filename) # [def,seq]
2930
# (defline,seq)=FastaReader.firstSequence(filename)
30-
# [id,attribute_hash]=FastaReader.parseDefline(defline)
31+
# (id,attribute_hash)=FastaReader.parseDefline(defline)
3132
#=========================================================================
3233
class FastaReader:
3334
"""FastaReader"""
@@ -110,7 +111,7 @@ def readAll(cls,filename):
110111
hash={}
111112
reader=FastaReader(filename)
112113
while(True):
113-
[defline,seq]=reader.nextSequence()
114+
(defline,seq)=reader.nextSequence()
114115
if(not defline): break
115116
match=re.search("^\s*>(\S+)",defline)
116117
if(not match): raise Exception("can't parse defline: "+defline)
@@ -119,6 +120,17 @@ def readAll(cls,filename):
119120
reader.close()
120121
return hash
121122

123+
@classmethod
124+
def readAllIntoArray(cls,filename):
125+
array=[]
126+
reader=FastaReader(filename)
127+
while(True):
128+
(defline,seq)=reader.nextSequence()
129+
if(not defline): break
130+
array.append([defline,seq])
131+
reader.close()
132+
return array
133+
122134
@classmethod
123135
def readAllAndKeepDefs(cls,filename):
124136
hash={}

GFF3Parser.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -239,8 +239,8 @@ def parseRecord(self,fields):
239239
rec={"substrate":substrate,
240240
"source":source,
241241
"type":type,
242-
"begin":begin,
243-
"end":end,
242+
"begin":int(begin)-1,
243+
"end":int(end),
244244
"score":score,
245245
"strand":strand,
246246
"frame":frame,

GffTranscriptReader.py

+29-7
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,14 @@
3030
#
3131
# Attributes:
3232
# shouldSortTranscripts
33+
# exonsAreCDS : interpret "exon" features as "CDS" when reading GFF
3334
# Methods:
3435
# reader=GffTranscriptReader()
3536
# reader.setStopCodons({"TAG":1,"TAA":1,"TGA":1})
3637
# transcriptArray=reader.loadGFF(filename)
3738
# geneList=reader.loadGenes(filename)
3839
# hashTable=reader.hashBySubstrate(filename)
40+
# reader.hashBySubstrateInto(filename,hash)
3941
# hashTable=reader.hashGenesBySubstrate(filename)
4042
# hashTable=reader.loadTranscriptIdHash(filename)
4143
# hashTable=reader.loadGeneIdHash(filename)
@@ -45,6 +47,7 @@
4547
class GffTranscriptReader:
4648
def __init__(self):
4749
self.shouldSortTranscripts=True
50+
self.exonsAreCDS=False
4851
self.stopCodons={"TAG":1,"TAA":1,"TGA":1}
4952

5053
def loadGenes(self,filename):
@@ -53,7 +56,8 @@ def loadGenes(self,filename):
5356
for transcript in transcripts:
5457
gene=transcript.getGene()
5558
if(not gene):
56-
raise Exception("transcript "+transcript.getID()+" has no gene")
59+
raise Exception("transcript "+transcript.getID()+
60+
" has no gene")
5761
genes.add(gene)
5862
genes=list(genes)
5963
genes.sort(key=lambda gene: gene.getBegin())
@@ -81,14 +85,17 @@ def loadGeneIdHash(self,filename):
8185
return hash
8286

8387
def hashBySubstrate(self,filename):
84-
transcriptArray=self.loadGFF(filename)
8588
hash={}
89+
self.hashBySubstrateInto(filename,hash)
90+
return hash
91+
92+
def hashBySubstrateInto(self,filename,hash):
93+
transcriptArray=self.loadGFF(filename)
8694
for transcript in transcriptArray:
8795
id=transcript.getSubstrate()
8896
array=hash.get(id,None)
8997
if(array is None): array=hash[id]=[]
9098
array.append(transcript)
91-
return hash
9299

93100
def hashGenesBySubstrate(self,filename):
94101
geneArray=self.loadGenes(filename)
@@ -124,6 +131,8 @@ def adjustStartCodons_fw(self,transcript,totalIntronSize):
124131
exons=transcript.exons
125132
exons.sort(key=lambda exon: exon.begin)
126133
numExons=len(exons)
134+
#if(transcript.getID()=="ENST00000361390.2"):
135+
# print("adjustStartCodons_fw",numExons,"exons")
127136
if(numExons==0): return None
128137
if(transcript.begin is None) :
129138
transcript.begin=exons[0].begin
@@ -260,10 +269,13 @@ def loadGFF_UTR(self,fields,line,transcriptBeginEnd,GFF,
260269
readOrder+=1
261270
transcript.substrate=fields[0]
262271
transcript.source=fields[1]
263-
if(transcriptBeginEnd.find(transcriptId,None) is not None):
272+
if(transcriptBeginEnd.get(transcriptId,None) is not None):
264273
(begin,end)=transcriptBeginEnd[transcriptId]
265274
transcript.setBegin(begin)
266275
transcript.setEnd(end)
276+
else:
277+
transcript.setBegin(exonBegin)
278+
transcript.setEnd(exonEnd)
267279
transcript.geneId=geneId
268280
gene=genes.get(geneId,None)
269281
if(gene is None):
@@ -319,6 +331,9 @@ def loadGFF_exon(self,fields,line,transcriptBeginEnd,GFF,
319331
(begin,end)=transcriptBeginEnd[transcriptId]
320332
transcript.setBegin(begin)
321333
transcript.setEnd(end)
334+
else:
335+
transcript.setBegin(exonBegin)
336+
transcript.setEnd(exonEnd)
322337
transcript.geneId=geneId
323338
gene=genes.get(geneId,None)
324339
if(gene is None):
@@ -370,6 +385,9 @@ def loadGFF_CDS(self,fields,line,transcriptBeginEnd,GFF,
370385
(begin,end)=transcriptBeginEnd[transcriptId]
371386
transcript.setBegin(begin)
372387
transcript.setEnd(end)
388+
else:
389+
transcript.setBegin(exonBegin)
390+
transcript.setEnd(exonEnd)
373391
transcript.geneId=geneId
374392
gene=genes.get(geneId,None)
375393
if(gene is None):
@@ -397,15 +415,19 @@ def loadGFF(self,gffFilename):
397415
if(re.search("^\s*\#",line)): continue
398416
fields=line.split("\t") ### \t added 3/24/2017
399417
if(len(fields)<8): raise Exception("can't parse GTF:"+line)
400-
if(fields[2]=="transcript"):
418+
if(fields[2]=="transcript" or fields[2]=="mRNA"):
401419
self.loadGFF_transcript(fields,line,transcriptBeginEnd,GFF,
402420
transcripts,readOrder,genes)
403421
elif("UTR" in fields[2] or "utr" in fields[2]):
404422
self.loadGFF_UTR(fields,line,transcriptBeginEnd,GFF,
405423
transcripts,readOrder,genes)
406424
elif(fields[2]=="exon"):
407-
self.loadGFF_exon(fields,line,transcriptBeginEnd,GFF,
408-
transcripts,readOrder,genes)
425+
if(self.exonsAreCDS):
426+
self.loadGFF_CDS(fields,line,transcriptBeginEnd,GFF,
427+
transcripts,readOrder,genes)
428+
else:
429+
self.loadGFF_exon(fields,line,transcriptBeginEnd,GFF,
430+
transcripts,readOrder,genes)
409431
elif("CDS" in fields[2] or "-exon" in fields[2]):
410432
self.loadGFF_CDS(fields,line,transcriptBeginEnd,GFF,
411433
transcripts,readOrder,genes)

Interval.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
# begin=interval.getBegin()
2828
# end=interval.getEnd()
2929
# length=interval.length()
30-
# bool=interval.equals($other)
30+
# bool=interval.equals(other)
3131
# other=interval.clone()
3232
# bool=interval.isEmpty()
3333
# d=interval.relativeDistanceFromBegin(pos)

Shuffler.py

+14-2
Original file line numberDiff line numberDiff line change
@@ -18,16 +18,28 @@
1818
#
1919
#=========================================================================
2020
class Shuffler:
21-
"""Shuffler shuffles arrays"""
21+
"""Shuffler shuffles arrays and strings"""
2222
def __init__(self):
2323
pass
2424

2525
@classmethod
26-
def shuffle(cls,array):
26+
def shuffleArray(cls,array):
2727
L=len(array)
2828
for i in range(L):
2929
j=random.randint(0,L-1)
3030
temp=array[i]
3131
array[i]=array[j]
3232
array[j]=temp
3333

34+
@classmethod
35+
def shuffleString(cls,string):
36+
L=len(string)
37+
ret=string
38+
for i in range(L):
39+
j=random.randint(0,L-1)
40+
if(j==i): continue
41+
if(j>i):
42+
ret=ret[0:i]+ret[j]+ret[i+1:j]+ret[i]+ret[j+1:L]
43+
else:
44+
ret=ret[0:j]+ret[i]+ret[j+1:i]+ret[j]+ret[i+1:L]
45+
return ret

SlurmWriter.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -52,16 +52,17 @@ def setQueue(self,value):
5252

5353
def writeArrayScript(self,slurmDir,jobName,maxParallel,moreSBATCH=""):
5454
if(moreSBATCH is None): moreSBATCH=""
55-
if(maxParallel<1): raise Exception("specify maxParallel parameter")
55+
if(int(maxParallel)<1): raise Exception("specify maxParallel parameter")
5656
moreSBATCH=moreSBATCH.rstrip()
57+
if(len(moreSBATCH)>0):
58+
moreSBATCH=moreSBATCH.rstrip()+"\n"
59+
#moreSBATCH=moreSBATCH+"\n"
5760
if(self.niceValue>0) :
5861
moreSBATCH+="#SBATCH --nice="+str(self.niceValue)+"\n"
5962
if(self.memValue>0):
6063
moreSBATCH+="#SBATCH --mem="+str(self.memValue)+"\n"
6164
if(self.threadsValue>0):
6265
moreSBATCH+="#SBATCH --cpus-per-task="+str(self.threadsValue)+"\n"
63-
if(len(moreSBATCH)>0):
64-
moreSBATCH=moreSBATCH.rstrip()+"\n"
6566
queue=""
6667
if(len(self.queue)>0):
6768
queue="#SBATCH -p "+self.queue+"\n"

Transcript.py

+34-1
Original file line numberDiff line numberDiff line change
@@ -132,9 +132,12 @@ def __init__(self,id,strand=None):
132132
self.rawExons=None
133133
self.stopCodons={"TAG":1,"TGA":1,"TAA":1}
134134
self.startCodon=None
135+
self.startCodonAbsolute=None
135136
self.extraFields=None
136137
self.structureChange=None
137138
self.fate=None
139+
self.begin=None
140+
self.end=None
138141
else: # EssexNode
139142
essex=id
140143
self.transcriptId=essex.getAttribute("ID")
@@ -634,8 +637,12 @@ def getIntrons(self):
634637
for exon in exons:
635638
if(lastExonEnd):
636639
if(strand=="+"):
640+
if(lastExonEnd>exon.getBegin()): exit("XXX "+str(lastExonEnd)+" "+str(exon.getBegin())+" "+strand)
637641
introns.append(Interval(lastExonEnd,exon.getBegin()))
638642
else:
643+
if(lastExonEnd<exon.getEnd()):
644+
for exon in exons: print("ZZZ",exon.toGff())
645+
exit("YYY "+str(exon.getEnd())+" "+str(lastExonEnd)+" "+strand+" "+self.toGff())
639646
introns.append(Interval(exon.getEnd(),lastExonEnd))
640647
lastExonEnd=exon.getEnd() if strand=="+" else exon.getBegin()
641648
return introns
@@ -734,7 +741,8 @@ def getRawExons(self):
734741
if(i+1<n):
735742
nextExon=rawExons[i+1]
736743
if(exon.getEnd()==nextExon.getBegin() or
737-
exon.getEnd()==nextExon.getBegin()-1):
744+
exon.getEnd()==nextExon.getBegin()-1 or
745+
exon.overlaps(nextExon)):
738746
exon.setEnd(nextExon.getEnd())
739747
nextExon=None
740748
rawExons.pop(i+1)
@@ -794,6 +802,7 @@ def parseRawExons(self):
794802
if(CDS is None or len(CDS)==0): self.UTR=rawExons
795803
return
796804
UTR=[]
805+
seen=set()
797806
if(strand=="+"):
798807
for exon in rawExons:
799808
begin=exon.getBegin()
@@ -802,21 +811,33 @@ def parseRawExons(self):
802811
if(end<=cdsBegin):
803812
newExon=exon.copy()
804813
newExon.setType("five_prime_UTR")
814+
key=str(newExon.begin)+"-"+str(newExon.end)
815+
if(key in seen): continue
816+
seen.add(key)
805817
UTR.append(newExon)
806818
else:
807819
newExon=exon.copy()
808820
newExon.setEnd(cdsBegin)
809821
newExon.setType("five_prime_UTR")
822+
key=str(newExon.begin)+"-"+str(newExon.end)
823+
if(key in seen): continue
824+
seen.add(key)
810825
UTR.append(newExon)
811826
if(end>cdsEnd):
812827
if(begin>=cdsEnd):
813828
newExon=exon.copy()
814829
newExon.setType("three_prime_UTR")
830+
key=str(newExon.begin)+"-"+str(newExon.end)
831+
if(key in seen): continue
832+
seen.add(key)
815833
UTR.append(newExon)
816834
else:
817835
newExon=exon.copy()
818836
newExon.setBegin(cdsEnd)
819837
newExon.setType("three_prime_UTR")
838+
key=str(newExon.begin)+"-"+str(newExon.end)
839+
if(key in seen): continue
840+
seen.add(key)
820841
UTR.append(newExon)
821842
else: # strand=="-"
822843
for exon in rawExons:
@@ -826,21 +847,33 @@ def parseRawExons(self):
826847
if(end<=cdsBegin):
827848
newExon=exon.copy()
828849
newExon.setType("three_prime_UTR")
850+
key=str(newExon.begin)+"-"+str(newExon.end)
851+
if(key in seen): continue
852+
seen.add(key)
829853
UTR.append(newExon)
830854
else:
831855
newExon=exon.copy()
832856
newExon.setEnd(cdsBegin)
833857
newExon.setType("three_prime_UTR")
858+
key=str(newExon.begin)+"-"+str(newExon.end)
859+
if(key in seen): continue
860+
seen.add(key)
834861
UTR.append(newExon)
835862
if(end>cdsEnd):
836863
if(begin>=cdsEnd):
837864
newExon=exon.copy()
838865
newExon.setType("five_prime_UTR")
866+
key=str(newExon.begin)+"-"+str(newExon.end)
867+
if(key in seen): continue
868+
seen.add(key)
839869
UTR.append(newExon)
840870
else:
841871
newExon=exon.copy()
842872
newExon.setBegin(cdsEnd)
843873
newExon.setType("five_prime_UTR")
874+
key=str(newExon.begin)+"-"+str(newExon.end)
875+
if(key in seen): continue
876+
seen.add(key)
844877
UTR.append(newExon)
845878
self.UTR=UTR
846879

0 commit comments

Comments
 (0)