update

Bill Majoros · Bill Majoros · commit 766447939423 · 2019-07-19T13:37:11.000-04:00
diff --git a/CigarString.py b/CigarString.py
@@ -22,6 +22,10 @@
 #   cigarOp=cigar[i] # returns a CigarOp object
 #   str=cigar.toString()
 #   cigar.computeIntervals(refPos)
+#   op=cigar.longestMatch() # returns a CigarOp object (or None)
+#   L=cigar.longestMatchLen() # returns integer
+#   (numMatches,numMismatches)=cigar.longestMatchStats(seq1,seq2) # None if no match
+#                              ^ must call computeIntervals() first!
 #=========================================================================
 class CigarString:
     """CigarString parses CIGAR strings (alignments)"""
@@ -52,6 +56,33 @@ def completeMatch(self):
         ops=self.ops
         return len(ops)==1 and ops[0].op=="M"
 
+    def longestMatchStats(self,query,ref):
+        """Tally agreeing and disagreeing positions in the longest match.
+
+        query is sliced by the op's interval1 and ref by its interval2;
+        the two slices are then compared index by index over the length
+        of the query slice.
+
+        Returns a tuple (numMatches,numMismatches), or None when
+        longestMatch() finds no M operation.
+        """
+        best=self.longestMatch()
+        if(best is None): return None
+        span=best.interval1
+        querySeg=query[span.getBegin():span.getEnd()]
+        span=best.interval2
+        refSeg=ref[span.getBegin():span.getEnd()]
+        n=len(querySeg)
+        same=sum(1 for i in range(n) if querySeg[i]==refSeg[i])
+        return (same,n-same)
+
+    def longestMatchLen(self):
+        """Return the length of the longest M operation, or 0 if none."""
+        best=self.longestMatch()
+        return 0 if best is None else best.getLength()
+
+    def longestMatch(self):
+        """Return the M operation with the greatest length.
+
+        On ties the earliest such op in self.ops wins.  Returns None if
+        no M op has a length greater than zero.
+        """
+        best=None
+        bestLen=0
+        for candidate in self.ops:
+            if(candidate.getOp()!="M"): continue
+            n=candidate.getLength()
+            if(n>bestLen): best,bestLen=candidate,n
+        return best
+
     def toString(self):
         ops=self.ops
         s=""
diff --git a/DataFrame.py b/DataFrame.py
@@ -24,6 +24,7 @@
 #   n=df.nrow()
 #   n=df.ncol()
 #   row=df[index]
+#   rows=df.getRows()
 #   elem=df[i][j]
 #   df.toInt()
 #   df.toFloat()
@@ -36,6 +37,7 @@
 #   col=df.getColumn(columnName) # call hashColNames() first!
 #   bool=df.rowExists(rowName) # call hashRowNames() first!
 #   bool=df.columnExists(colName) # call hashColNames() first!
+#   index=df.getColumnIndex(colName) # call hashColNames() first!
 #   newDataFrame=df.subsetColumns(colIndices)
 #   idx=df.addColumn(colName,defaultValue) # returns index of new column
 #   df.print(handle)
@@ -54,6 +56,9 @@ def __init__(self):
    def addRow(self,row):
       self.matrix.append(row)
 
+   def getRows(self):
+      """Return the list of rows held in self.matrix (the list itself, not a copy)."""
+      return self.matrix
+
    def toDataArray(self):
       array=[]
       for row in self.matrix:
@@ -88,6 +93,9 @@ def rowExists(self,rowName):
       if(self.rowHash is None): raise Exception("call hashRowNames() first")
       return self.rowHash.get(rowName,None) is not None
 
+   def getColumnIndex(self,colName):
+      """Return the index of the column named colName, or None if absent.
+
+      Raises Exception if the column-name hash has not been built yet
+      (self.colHash is None), matching rowExists() and columnExists().
+      """
+      if(self.colHash is None): raise Exception("call hashColNames() first")
+      return self.colHash.get(colName)
+
    def columnExists(self,colName):
       if(self.colHash is None): raise Exception("call hashColNames() first")
       return self.colHash.get(colName,None) is not None
@@ -104,7 +112,7 @@ def getColumnNames(self):
    def getRowI(self,rowIndex):
       return self.matrix[rowIndex]
 
-   def getColumnI(self,colIndex):
+   def getColI(self,colIndex):
       column=DataFrameRow()
       for row in self.matrix:
          column.values.append(row[colIndex])
@@ -123,7 +131,7 @@ def getColumn(self,colName):
       column=DataFrameRow()
       column.label=colName
       for row in self.matrix:
-         colum.values.append(row[colIndex])
+         column.values.append(row[colIndex])
       return column
 
    def hashRowNames(self):
@@ -137,7 +145,7 @@ def hashColNames(self):
       h=self.colHash={}
       numCols=self.ncol()
       for i in range(numCols):
-         h[header[i]]=i
+         h[self.header[i]]=i
 
    def getHeader(self):
       return self.header
diff --git a/FastqReader.py b/FastqReader.py
@@ -16,7 +16,7 @@
 #   fh : file handle
 # Instance Methods:
 #   reader=FastqReader(filename)
-#   [ID,seq,qual]=reader.nextSequence() # returns None at EOF
+#   (ID,seq,qual,pair)=reader.nextSequence() # returns None at EOF
 #   reader.close()
 # Class Methods:
 #=========================================================================
@@ -44,7 +44,7 @@ def nextSequence(self):
         seq=fh.readline().rstrip()
         junk=fh.readline()
         qual=fh.readline().rstrip()
-        return [ID,seq,qual,pair]
+        return (ID,seq,qual,pair)
         
 
 
diff --git a/Interval.py b/Interval.py
@@ -8,6 +8,8 @@
 from builtins import (bytes, dict, int, list, object, range, str, ascii,
    chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
 import sys
+from Rex import Rex
+rex=Rex()
 
 #=========================================================================
 # Attributes:
@@ -21,6 +23,7 @@
 #   bool=interval.contains(position)
 #   bool=interval.containsInterval(other)
 #   distance=interval.distance(other)
+#   distance=interval.distanceFromPoint(x)
 #   intersection=interval.intersect(other)
 #   union=interval.union(other) # returns an array of intervals
 #   diff=interval.minus(other)  # returns an array of intervals
@@ -37,6 +40,9 @@
 #   center=interval.floatCenter()
 #   center=interval.intCenter()
 #   center=interval.center() # same as floatCenter()
+# Class methods:
+#   interval=Interval.parseInt("(10,15)")
+#   interval=Interval.parseFloat("(3.5,7.2)")
 #=========================================================================
 
 class Interval:
@@ -47,6 +53,22 @@ def __init__(self,begin=0,end=0):
       self.begin=begin
       self.end=end
 
+   @classmethod
+   def parseInt(cls,interval):
+      """Build an interval with integer endpoints from a string.
+
+      Three notations are tried in order: "(begin,end)", "begin:end"
+      and "begin-end".  Returns None if none of them is found; int()
+      raises ValueError if a captured endpoint is not an integer.
+      """
+      # Raw strings keep the regex backslashes from being read as
+      # (invalid) string escape sequences
+      patterns=(r"\(([^,]+),([^\)]+)\)",
+                r"([^,]+):([^\)]+)",
+                r"([^,]+)-([^\)]+)")
+      for pattern in patterns:
+         if(rex.find(pattern,interval)):
+            # cls rather than Interval so subclasses get their own type
+            return cls(int(rex[1]),int(rex[2]))
+      return None
+
+   @classmethod
+   def parseFloat(cls,interval):
+      """Build an interval with float endpoints from a "(begin,end)" string.
+
+      Returns None if the parenthesized form is not found; float()
+      raises ValueError if a captured endpoint is not a number.
+      """
+      # Raw string keeps the regex backslashes from being read as
+      # (invalid) string escape sequences
+      if(rex.find(r"\(([^,]+),([^\)]+)\)",interval)):
+         # cls rather than Interval so subclasses get their own type
+         return cls(float(rex[1]),float(rex[2]))
+      return None
+
    def print(self,file=sys.stdout):
       print("(",self.begin,",",self.end,")",sep="",end="",file=file)
 
@@ -56,6 +78,12 @@ def toString(self):
    def overlaps(self,other):
       return self.begin<other.end and other.begin<self.end
 
+   def distanceFromPoint(self,x):
+      """Return the distance from point x to this interval.
+
+      The result is 0 when self.contains(x) is true; otherwise it is the
+      smaller of |begin-x| and |end-x|.
+      """
+      if(self.contains(x)): return 0
+      toBegin=abs(self.begin-x)
+      toEnd=abs(self.end-x)
+      return min(toEnd,toBegin)
+
    def distance(self,other):
       if(self.overlaps(other)): return 0
       d=self.begin-other.end
diff --git a/SamReader.py b/SamReader.py
@@ -16,15 +16,18 @@
 #=========================================================================
 # Attributes:
 #   fh : file handle
+#   headerLines : array of header lines
 # Instance Methods:
 #   reader=SamReader(filename)
 #   samRecord=reader.nextSequence() # returns None at EOF
+#   (record,line)=reader.nextSeqAndText() # returns None at EOF
 #   reader.close()
 # Class Methods:
 #=========================================================================
 class SamReader:
     """SamReader"""
     def __init__(self,filename):
+        self.headerLines=[]
         if(filename is not None):
             if(rex.find("\.gz$",filename)): self.fh=gzip.open(filename,"rt")
             else: self.fh=open(filename,"r")
@@ -33,10 +36,17 @@ def close(self):
         self.fh.close()
 
     def nextSequence(self):
+        pair=self.nextSeqAndText()
+        if(pair is None): return None
+        (rec,line)=pair
+        return rec
+
+    def nextSeqAndText(self):
         fh=self.fh
         line=fh.readline()
         if(line is None): return None
         while(line is not None and len(line)>0 and line[0]=="@"):
+            if(line[0]=="@"): self.headerLines.append(line)
             line=fh.readline()
         if(line is None or len(line)==0): return None
         fields=line.rstrip().split()
@@ -46,8 +56,9 @@ def nextSequence(self):
         refPos=int(refPos)-1 # convert 1-based to 0-based
         flags=int(flags)
         CIGAR=CigarString(cigar)
-        rec=SamRecord(ID,refName,refPos,CIGAR,seq,flags)
-        return rec
+        tags=fields[11:]
+        rec=SamRecord(ID,refName,refPos,CIGAR,seq,flags,tags)
+        return (rec,line)
 
 # M03884:303:000000000-C4RM6:1:1101:1776:15706    99      chrX:31786371-31797409  6687    44      150M    =       6813    271     ATACTATTGCTGCGGTAATAACTGTAACTGCAGTTACTATTTAGTGATTTGTATGTAGATGTAGATGTAGTCTATGTCAGACACTATGCTGAGCATTTTATGGTTGCTATGTACTGATACATACAGAAACAAGAGGTACGTTCTTTTACA  BBBBFFFFFFFGGGGGEFGGFGHFHFFFHHHFFHHHFHFHHHGFHEDGGHFHBGFHGBDHFHFFFHHHHFHHHHHGHGFFBGGGHFHFFHHFFFFHHHHGHGFHHGFHGHHHGFHFFHHFHHFFGFFFFGGEHFFEHHFGHHHGHHHHFB  AS:i:300        XN:i:0  
 
diff --git a/SamRecord.py b/SamRecord.py
@@ -7,6 +7,8 @@
    unicode_literals, generators, nested_scopes, with_statement)
 from builtins import (bytes, dict, int, list, object, range, str, ascii,
    chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
+from Rex import Rex
+rex=Rex()
 
 #=========================================================================
 # Attributes:
@@ -16,13 +18,17 @@
 #   CIGAR = CigarString
 #   seq = read sequence
 #   flags = bitfield
+#   tags = array of tags at end of record (MD:Z:122G25, NM:i:1, etc.)
 # Instance Methods:
-#   rec=SamReader(ID,refName,refPos,cigar,seq,flags)
+#   rec=SamRecord(ID,refName,refPos,cigar,seq,flags,tags)
 #   ID=rec.getID()
-#   cigar=rec.getCigar()
+#   cigar=rec.getCigar() # returns CigarString object
 #   seq=rec.getSequence()
 #   refName=rec.getRefName()
 #   refPos=rec.getRefPos()
+#   tags=rec.getTags()
+#   fields=rec.parseMDtag()
+#   tag=rec.getTag("MD") # returns the third field, e.g. "122G25" in MD:Z:122G25
 #   bool=rec.flag_hasMultipleSegments()
 #   bool=rec.flag_properlyAligned()
 #   bool=rec.flag_unmapped()
@@ -39,13 +45,41 @@
 #=========================================================================
 class SamRecord:
     """SamRecord"""
-    def __init__(self,ID,refName,refPos,CIGAR,seq,flags):
+    def __init__(self,ID,refName,refPos,CIGAR,seq,flags,tags):
         self.ID=ID
         self.refName=refName
         self.refPos=refPos
         self.CIGAR=CIGAR
         self.seq=seq
         self.flags=flags
+        self.tags=tags
+
+    def getTags(self):
+        """Return self.tags, the tags object stored on this record."""
+        return self.tags
+
+    def getTag(self,label):
+        """Return the value of the first tag whose name equals label.
+
+        Each tag is expected to look like NAME:TYPE:VALUE (e.g. the value
+        of MD:Z:122G25 is "122G25").  Returns None if no tag has that
+        name.  Raises Exception on a tag that does not fit the pattern,
+        even if that tag precedes the requested one.
+        """
+        for tag in self.tags:
+            # Raw string: \S is a regex class, not a string escape
+            if(not rex.find(r"^([^:]+):[^:]+:(\S+)",tag)):
+                raise Exception("Can't parse SAM tag: "+tag)
+            if(rex[1]==label): return rex[2]
+        return None
+
+    def parseMDtag(self):
+        """Split the MD tag into its elements, in order.
+
+        Elements are runs of digits, single letters, and "^" followed by
+        one or more letters; all are returned as strings.
+
+        Returns None if the record has no MD tag.  Raises Exception if
+        the remaining text cannot be tokenized.
+        """
+        md=self.getTag("MD")
+        if(md is None): return None # no MD tag on this record
+        fields=[]
+        while(len(md)>0):
+            # [A-Z] rather than [ACGT]: the SAM spec allows any
+            # uppercase letter in an MD tag (e.g. N)
+            if(rex.find(r"^(\d+)(.*)",md) or
+               rex.find(r"^([A-Z])(.*)",md) or
+               rex.find(r"^(\^[A-Z]+)(.*)",md)):
+                fields.append(rex[1])
+                md=rex[2]
+            else:
+                raise Exception("Can't parse MD tag: "+md)
+        return fields
 
     def getRefName(self):
         return self.refName
diff --git a/SmithWaterman.py b/SmithWaterman.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python
+#=========================================================================
+# This is OPEN SOURCE SOFTWARE governed by the Gnu General Public
+# License (GPL) version 3, as described at www.opensource.org.
+# Author: William H. Majoros (bmajoros@alumni.duke.edu)
+#=========================================================================
+from __future__ import (absolute_import, division, print_function, 
+   unicode_literals, generators, nested_scopes, with_statement)
+from builtins import (bytes, dict, int, list, object, range, str, ascii,
+   chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
+# The above imports should allow this program to run in both Python 2 and
+# Python 3.  You might need to update your version of module "future".
+import os
+from Pipe import Pipe
+from Rex import Rex
+rex=Rex()
+import TempFilename
+from CigarString import CigarString
+from FastaWriter import FastaWriter
+
+#=========================================================================
+# Attributes:
+#   
+# Instance Methods:
+#   aligner=SmithWaterman(alignerDir,matrixFile,openPenalty,extendPenalty)
+#   cigarString=aligner.align(seq1,seq2)
+#=========================================================================
+
+class SmithWaterman:
+    """Runs an external smith-waterman program on a pair of sequences."""
+    def __init__(self,alignerDir,matrixFile,gapOpenPenalty,gapExtendPenalty):
+        """Store the aligner directory, matrix file and gap penalties."""
+        self.alignerDir=alignerDir
+        self.matrixFile=matrixFile
+        self.gapOpen=gapOpenPenalty
+        self.gapExtend=gapExtendPenalty
+        self.fastaWriter=FastaWriter()
+
+    def writeFile(self,defline,seq):
+        """Write seq to a newly generated temp FASTA file; return its name."""
+        filename=TempFilename.generate("fasta")
+        self.fastaWriter.writeFasta(defline,seq,filename)
+        return filename
+
+    def swapInsDel(self,cigar):
+        """Return cigar with every "I" replaced by "D" and vice versa."""
+        # This is done because my aligner defines insertions and deletions
+        # opposite to how they're defined in the SAM specification
+        swap={"I":"D","D":"I"}
+        return "".join(swap.get(x,x) for x in cigar)
+
+    def align(self,seq1,seq2):
+        """Align seq1 (query) to seq2 (reference) with the external aligner.
+
+        Returns a CigarString built from the CIGAR= field of the aligner's
+        output, with I and D swapped.  Raises Exception if the output has
+        no CIGAR= field.  The temporary FASTA files are removed even when
+        writing or running the aligner fails.
+        """
+        file1=self.writeFile("query",seq1)
+        try:
+            file2=self.writeFile("reference",seq2)
+            try:
+                cmd=self.alignerDir+"/smith-waterman -q "+self.matrixFile+\
+                    " "+str(self.gapOpen)+" "+str(self.gapExtend)+" "+\
+                    file1+" "+file2+" DNA"
+                output=Pipe.run(cmd)
+            finally:
+                os.remove(file2)
+        finally:
+            os.remove(file1)
+        # Raw string: \S is a regex class, not a string escape
+        if(not rex.find(r"CIGAR=(\S+)",output)):
+            raise Exception("Can't parse aligner output: "+output)
+        cigar=rex[1]
+        cigar=self.swapInsDel(cigar) # because I define cigars differently
+        return CigarString(cigar)
+
+
+
diff --git a/StanParser.py b/StanParser.py
diff --git a/parse-vcf.py b/parse-vcf.py
diff --git a/reverse.py b/reverse.py