update

Bill Majoros · Bill Majoros · commit 0de3f99deb72 · 2017-12-09T07:43:53.000-05:00
diff --git a/CodonIterator.py b/CodonIterator.py
@@ -51,7 +51,7 @@ def __init__(self,transcript,axisSequenceRef,stopCodons):
                    exon.containsCoordinate(startCodon-1)): break
             if(not exon.containsCoordinate(startCodon) and
                not exon.containsCoordinate(startCodon-1)):
-                raise Exception("start codon not found: "+str(startCodon))
+                raise Exception(transcript.getID()+"start codon not found: "+str(startCodon))
             self.exon=exon
             self.relative=(startCodon-exon.begin) if strand=="+" else \
                 exon.end-startCodon
@@ -99,7 +99,7 @@ def nextCodon(self):
             triplet=exonSeq[relative:relative+thisExonContrib]
             transcript=self.transcript
             thisExonOrder=exon.order
-            if(not thisExonOrder): raise Exception("exon has no order")
+            if(thisExonOrder is None): raise Exception("exon has no order")
             exons=transcript.exons
             numExons=len(exons)
             nextExonOrder=thisExonOrder+1
@@ -113,7 +113,7 @@ def nextCodon(self):
                     exonLen1=exon.getLength()
                     realSeqLen2=len(nextExon.sequence)
                     exonLen2=nextExon.getLength()
-                    raise Exception("Error in transcript "+transcriptId+": nextContrib="+nextExonContrib+" triplet=\""+triplet+"\" exonLen1="+exonLen1+" realLen1="+realSeqLen1+" exonLen2="+exonLen2+" realLen2="+realSeqLen2)
+                    raise Exception("Error in transcript "+transcriptId+": nextContrib="+str(nextExonContrib)+" triplet=\""+triplet+"\" exonLen1="+str(exonLen1)+" realLen1="+str(realSeqLen1)+" exonLen2="+str(exonLen2)+" realLen2="+str(realSeqLen2))
                 self.relative=nextExonContrib
                 self.absolute=nextExon.begin+nextExonContrib if strand=="+" \
                                else nextExon.end-nextExonContrib
diff --git a/Exon.py b/Exon.py
@@ -203,7 +203,7 @@ def reverseComplement(self,seqLen):
         end=self.getEnd()
         self.begin=seqLen-end
         self.end=seqLen-begin
-        self.strand=compStrand(self.strand)
+        self.strand=self.compStrand(self.strand)
 
     def copy(self):
         new=Exon(self.begin,self.end,self.transcript)
diff --git a/FastaReader.py b/FastaReader.py
@@ -17,7 +17,7 @@
 # Instance Methods:
 #   reader=FastaReader(filename)
 #   reader=readerFromFileHandle(fileHandle);
-#   [defline,sequence]=reader.nextSequence()
+#   (defline,sequence)=reader.nextSequence()
 #   reader.close()
 #   reader.dontUppercase()
 #   reader.doUppercase()
@@ -26,8 +26,9 @@
 #   num=FastaReader.countEntries(filename)
 #   FastaReader.readAll(filename) # returns hash : id->sequence
 #   FastaReader.readAllAndKeepDefs(filename) # returns hash : id->[def,seq]
+#   FastaReader.readAllIntoArray(filename) # returns array of [def,seq]
 #   (defline,seq)=FastaReader.firstSequence(filename)
-#   [id,attribute_hash]=FastaReader.parseDefline(defline)
+#   (id,attribute_hash)=FastaReader.parseDefline(defline)
 #=========================================================================
 class FastaReader:
     """FastaReader"""
@@ -110,7 +111,7 @@ def readAll(cls,filename):
         hash={}
         reader=FastaReader(filename)
         while(True):
-            [defline,seq]=reader.nextSequence()
+            (defline,seq)=reader.nextSequence()
             if(not defline): break
             match=re.search("^\s*>(\S+)",defline)
             if(not match): raise Exception("can't parse defline: "+defline)
@@ -119,6 +120,17 @@ def readAll(cls,filename):
         reader.close()
         return hash
 
+    @classmethod
+    def readAllIntoArray(cls,filename): # returns a list of [defline,seq]
+        array=[] # entries are kept in the order nextSequence() yields them
+        reader=FastaReader(filename)
+        while(True):
+            (defline,seq)=reader.nextSequence()
+            if(not defline): break # false defline: presumably end of file
+            array.append([defline,seq])
+        reader.close()
+        return array
+
     @classmethod
     def readAllAndKeepDefs(cls,filename):
         hash={}
diff --git a/GFF3Parser.py b/GFF3Parser.py
@@ -239,8 +239,8 @@ def parseRecord(self,fields):
         rec={"substrate":substrate,
              "source":source,
              "type":type,
-             "begin":begin,
-             "end":end,
+             "begin":int(begin)-1,
+             "end":int(end),
              "score":score,
              "strand":strand,
              "frame":frame,
diff --git a/GffTranscriptReader.py b/GffTranscriptReader.py
@@ -30,12 +30,14 @@
 #
 # Attributes:
 #   shouldSortTranscripts
+#   exonsAreCDS : interpret "exon" features as "CDS" when reading GFF
 # Methods:
 #   reader=GffTranscriptReader()
 #   reader.setStopCodons({"TAG":1,"TAA":1,"TGA":1})
 #   transcriptArray=reader.loadGFF(filename)
 #   geneList=reader.loadGenes(filename)
 #   hashTable=reader.hashBySubstrate(filename)
+#   reader.hashBySubstrateInto(filename,hash)
 #   hashTable=reader.hashGenesBySubstrate(filename)
 #   hashTable=reader.loadTranscriptIdHash(filename)
 #   hashTable=reader.loadGeneIdHash(filename)
@@ -45,6 +47,7 @@
 class GffTranscriptReader:
     def __init__(self):
         self.shouldSortTranscripts=True
+        self.exonsAreCDS=False # True: loadGFF sends "exon" to loadGFF_CDS
         self.stopCodons={"TAG":1,"TAA":1,"TGA":1}
 
     def loadGenes(self,filename):
@@ -53,7 +56,8 @@ def loadGenes(self,filename):
         for transcript in transcripts:
             gene=transcript.getGene()
             if(not gene):
-                raise Exception("transcript "+transcript.getID()+" has no gene")
+                raise Exception("transcript "+transcript.getID()+
+                                " has no gene")
             genes.add(gene)
         genes=list(genes)
         genes.sort(key=lambda gene: gene.getBegin())
@@ -81,14 +85,17 @@ def loadGeneIdHash(self,filename):
         return hash
 
     def hashBySubstrate(self,filename):
-        transcriptArray=self.loadGFF(filename)
         hash={}
+        self.hashBySubstrateInto(filename,hash) # fills hash in place
+        return hash
+
+    def hashBySubstrateInto(self,filename,hash): # adds to hash; no return
+        transcriptArray=self.loadGFF(filename)
         for transcript in transcriptArray:
             id=transcript.getSubstrate()
             array=hash.get(id,None)
             if(array is None): array=hash[id]=[]
             array.append(transcript)
-        return hash
 
     def hashGenesBySubstrate(self,filename):
         geneArray=self.loadGenes(filename)
@@ -124,6 +131,8 @@ def adjustStartCodons_fw(self,transcript,totalIntronSize):
         exons=transcript.exons
         exons.sort(key=lambda exon: exon.begin)
         numExons=len(exons)
+        #if(transcript.getID()=="ENST00000361390.2"):
+        #    print("adjustStartCodons_fw",numExons,"exons")
         if(numExons==0): return None
         if(transcript.begin is None) :
             transcript.begin=exons[0].begin
@@ -260,10 +269,13 @@ def loadGFF_UTR(self,fields,line,transcriptBeginEnd,GFF,
             readOrder+=1
             transcript.substrate=fields[0]
             transcript.source=fields[1]
-            if(transcriptBeginEnd.find(transcriptId,None) is not None):
+            if(transcriptBeginEnd.get(transcriptId,None) is not None):
                 (begin,end)=transcriptBeginEnd[transcriptId]
                 transcript.setBegin(begin)
                 transcript.setEnd(end)
+            else:
+                transcript.setBegin(exonBegin)
+                transcript.setEnd(exonEnd)
         transcript.geneId=geneId
         gene=genes.get(geneId,None)
         if(gene is None):
@@ -319,6 +331,9 @@ def loadGFF_exon(self,fields,line,transcriptBeginEnd,GFF,
                 (begin,end)=transcriptBeginEnd[transcriptId]
                 transcript.setBegin(begin)
                 transcript.setEnd(end)
+            else:
+                transcript.setBegin(exonBegin)
+                transcript.setEnd(exonEnd)
         transcript.geneId=geneId
         gene=genes.get(geneId,None)
         if(gene is None):
@@ -370,6 +385,9 @@ def loadGFF_CDS(self,fields,line,transcriptBeginEnd,GFF,
                 (begin,end)=transcriptBeginEnd[transcriptId]
                 transcript.setBegin(begin)
                 transcript.setEnd(end)
+            else:
+                transcript.setBegin(exonBegin)
+                transcript.setEnd(exonEnd)
         transcript.geneId=geneId
         gene=genes.get(geneId,None)
         if(gene is None):
@@ -397,15 +415,19 @@ def loadGFF(self,gffFilename):
             if(re.search("^\s*\#",line)): continue
             fields=line.split("\t") ### \t added 3/24/2017
             if(len(fields)<8): raise Exception("can't parse GTF:"+line)
-            if(fields[2]=="transcript"):
+            if(fields[2]=="transcript" or fields[2]=="mRNA"):
                 self.loadGFF_transcript(fields,line,transcriptBeginEnd,GFF,
                                    transcripts,readOrder,genes)
             elif("UTR" in fields[2] or "utr" in fields[2]):
                 self.loadGFF_UTR(fields,line,transcriptBeginEnd,GFF,
                             transcripts,readOrder,genes)
             elif(fields[2]=="exon"):
-                self.loadGFF_exon(fields,line,transcriptBeginEnd,GFF,
-                             transcripts,readOrder,genes)
+                if(self.exonsAreCDS):
+                    self.loadGFF_CDS(fields,line,transcriptBeginEnd,GFF,
+                                     transcripts,readOrder,genes)
+                else:
+                    self.loadGFF_exon(fields,line,transcriptBeginEnd,GFF,
+                                      transcripts,readOrder,genes)
             elif("CDS" in fields[2] or "-exon" in fields[2]):
                 self.loadGFF_CDS(fields,line,transcriptBeginEnd,GFF,
                             transcripts,readOrder,genes)
diff --git a/Interval.py b/Interval.py
@@ -27,7 +27,7 @@
 #   begin=interval.getBegin()
 #   end=interval.getEnd()
 #   length=interval.length()
-#   bool=interval.equals($other)
+#   bool=interval.equals(other)
 #   other=interval.clone()
 #   bool=interval.isEmpty()
 #   d=interval.relativeDistanceFromBegin(pos)
diff --git a/Shuffler.py b/Shuffler.py
@@ -18,16 +18,28 @@
 #   
 #=========================================================================
 class Shuffler:
-    """Shuffler shuffles arrays"""
+    """Shuffler shuffles arrays (in place) and strings (returns a copy)"""
     def __init__(self):
         pass
 
     @classmethod
-    def shuffle(cls,array):
+    def shuffleArray(cls,array): # shuffles array in place; returns None
         L=len(array)
         for i in range(L):
             j=random.randint(0,L-1)
             temp=array[i]
             array[i]=array[j]
             array[j]=temp
 
+    @classmethod
+    def shuffleString(cls,string):
+        """Return a shuffled copy of string; the argument is unchanged"""
+        # Strings are immutable: swap within a list of characters and join
+        # once at the end, instead of re-slicing the whole string per swap
+        chars=list(string)
+        L=len(chars)
+        for i in range(L):
+            # same scheme as shuffleArray: swap i with a random j in [0,L-1]
+            j=random.randint(0,L-1)
+            chars[i],chars[j]=chars[j],chars[i]
+        return "".join(chars)
diff --git a/SlurmWriter.py b/SlurmWriter.py
@@ -52,16 +52,17 @@ def setQueue(self,value):
 
     def writeArrayScript(self,slurmDir,jobName,maxParallel,moreSBATCH=""):
         if(moreSBATCH is None): moreSBATCH=""
-        if(maxParallel<1): raise Exception("specify maxParallel parameter")
+        if(int(maxParallel)<1): raise Exception("specify maxParallel parameter")
         moreSBATCH=moreSBATCH.rstrip()
+        if(len(moreSBATCH)>0):
+            moreSBATCH=moreSBATCH.rstrip()+"\n"
+        #moreSBATCH=moreSBATCH+"\n"
         if(self.niceValue>0) :
             moreSBATCH+="#SBATCH --nice="+str(self.niceValue)+"\n"
         if(self.memValue>0):
             moreSBATCH+="#SBATCH --mem="+str(self.memValue)+"\n"
         if(self.threadsValue>0):
             moreSBATCH+="#SBATCH --cpus-per-task="+str(self.threadsValue)+"\n"
-        if(len(moreSBATCH)>0):
-            moreSBATCH=moreSBATCH.rstrip()+"\n"
         queue=""
         if(len(self.queue)>0):
             queue="#SBATCH -p "+self.queue+"\n"
diff --git a/Transcript.py b/Transcript.py
@@ -132,9 +132,12 @@ def __init__(self,id,strand=None):
             self.rawExons=None
             self.stopCodons={"TAG":1,"TGA":1,"TAA":1}
             self.startCodon=None
+            self.startCodonAbsolute=None
             self.extraFields=None
             self.structureChange=None
             self.fate=None
+            self.begin=None
+            self.end=None
         else: # EssexNode
             essex=id
             self.transcriptId=essex.getAttribute("ID")
@@ -634,8 +637,12 @@ def getIntrons(self):
         for exon in exons:
             if(lastExonEnd):
                 if(strand=="+"):
+                    if(lastExonEnd>exon.getBegin()): exit("XXX "+str(lastExonEnd)+" "+str(exon.getBegin())+" "+strand)
                     introns.append(Interval(lastExonEnd,exon.getBegin()))
                 else:
+                    if(lastExonEnd<exon.getEnd()): 
+                        for exon in exons: print("ZZZ",exon.toGff())
+                        exit("YYY "+str(exon.getEnd())+" "+str(lastExonEnd)+" "+strand+" "+self.toGff())
                     introns.append(Interval(exon.getEnd(),lastExonEnd))
             lastExonEnd=exon.getEnd() if strand=="+" else exon.getBegin()
         return introns
@@ -734,7 +741,8 @@ def getRawExons(self):
             if(i+1<n):
                 nextExon=rawExons[i+1]
                 if(exon.getEnd()==nextExon.getBegin() or
-                   exon.getEnd()==nextExon.getBegin()-1):
+                   exon.getEnd()==nextExon.getBegin()-1 or
+                   exon.overlaps(nextExon)):
                     exon.setEnd(nextExon.getEnd())
                     nextExon=None
                     rawExons.pop(i+1)
@@ -794,6 +802,7 @@ def parseRawExons(self):
             if(CDS is None or len(CDS)==0): self.UTR=rawExons
             return
         UTR=[]
+        seen=set()
         if(strand=="+"):
             for exon in rawExons:
                 begin=exon.getBegin()
@@ -802,21 +811,33 @@ def parseRawExons(self):
                     if(end<=cdsBegin):
                         newExon=exon.copy()
                         newExon.setType("five_prime_UTR")
+                        key=str(newExon.begin)+"-"+str(newExon.end)
+                        if(key in seen): continue
+                        seen.add(key)
                         UTR.append(newExon)
                     else:
                         newExon=exon.copy()
                         newExon.setEnd(cdsBegin)
                         newExon.setType("five_prime_UTR")
+                        key=str(newExon.begin)+"-"+str(newExon.end)
+                        if(key in seen): continue
+                        seen.add(key)
                         UTR.append(newExon)
                 if(end>cdsEnd):
                     if(begin>=cdsEnd):
                         newExon=exon.copy()
                         newExon.setType("three_prime_UTR")
+                        key=str(newExon.begin)+"-"+str(newExon.end)
+                        if(key in seen): continue
+                        seen.add(key)
                         UTR.append(newExon)
                     else:
                         newExon=exon.copy()
                         newExon.setBegin(cdsEnd)
                         newExon.setType("three_prime_UTR")
+                        key=str(newExon.begin)+"-"+str(newExon.end)
+                        if(key in seen): continue
+                        seen.add(key)
                         UTR.append(newExon)
         else: # strand=="-"
             for exon in rawExons:
@@ -826,21 +847,33 @@ def parseRawExons(self):
                     if(end<=cdsBegin):
                         newExon=exon.copy()
                         newExon.setType("three_prime_UTR")
+                        key=str(newExon.begin)+"-"+str(newExon.end)
+                        if(key in seen): continue
+                        seen.add(key)
                         UTR.append(newExon)
                     else:
                         newExon=exon.copy()
                         newExon.setEnd(cdsBegin)
                         newExon.setType("three_prime_UTR")
+                        key=str(newExon.begin)+"-"+str(newExon.end)
+                        if(key in seen): continue
+                        seen.add(key)
                         UTR.append(newExon)
                 if(end>cdsEnd):
                     if(begin>=cdsEnd):
                         newExon=exon.copy()
                         newExon.setType("five_prime_UTR")
+                        key=str(newExon.begin)+"-"+str(newExon.end)
+                        if(key in seen): continue
+                        seen.add(key)
                         UTR.append(newExon)
                     else:
                         newExon=exon.copy()
                         newExon.setBegin(cdsEnd)
                         newExon.setType("five_prime_UTR")
+                        key=str(newExon.begin)+"-"+str(newExon.end)
+                        if(key in seen): continue
+                        seen.add(key)
                         UTR.append(newExon)
         self.UTR=UTR
 
diff --git a/Translation.py b/Translation.py