update

Bill Majoros · Bill Majoros · commit 04b179db4e9d · 2020-03-17T15:15:16.000-04:00
diff --git a/CigarOp.py b/CigarOp.py
@@ -17,7 +17,7 @@
 #   length : integer
 #   interval1 : Interval (in sequence 1 = query)
 #   interval2 : Interval (in sequence 2 = reference)
-#   op : M/I/D/S:
+#   op : M(or =/X)/I/D/S:
 #                                                              consumes
 #                                                              query ref
 #     M 0 alignment match (can be a sequence match or mismatch) yes yes
@@ -35,12 +35,21 @@
 #   bool=op.advanceInRef() # matches, deletions, etc.
 #   op=op.getOp()
 #   L=op.getLength()
+#   interval=op.getQueryInterval() # sequence 1
+#   interval=op.getRefInterval() # sequence 2
 #=========================================================================
 class CigarOp:
     def __init__(self,op,L):
         self.op=op
         self.length=L
-        self.interval=None
+        self.interval1=None
+        self.interval2=None
+
+    def getQueryInterval(self):
+        return self.interval1 # interval in sequence 1 (query); None until assigned
+
+    def getRefInterval(self):
+        return self.interval2 # interval in sequence 2 (reference); None until assigned
 
     def getOp(self): 
         return self.op
diff --git a/CigarString.py b/CigarString.py
@@ -22,16 +22,24 @@
 #   cigarOp=cigar[i] # returns a CigarOp object
 #   str=cigar.toString()
 #   cigar.computeIntervals(refPos)
+#   ops=cigar.matchesByLength() # sorted by decreasing length
 #   op=cigar.longestMatch() # returns a CigarOp object (or None)
 #   L=cigar.longestMatchLen() # returns integer
-#   (numMatches,numMismatches)=cigar.longestMatchStats(seq1,seq2) # None if no match
-#                              ^ must call computeIntervals() first!
+#   (numMatches,numMismatches)=cigar.longestMatchStats(seq1,seq2) 
+#       # ^ Returns None if no match; must call computeIntervals() first!
 #=========================================================================
 class CigarString:
     """CigarString parses CIGAR strings (alignments)"""
     def __init__(self,cigar):
         self.ops=self.parse(cigar)
 
+    def matchesByLength(self):
+        """Return the M/=/X ops of this CIGAR, longest first."""
+        matchCodes=("M","=","X")
+        found=[op for op in self.ops if op.getOp() in matchCodes]
+        # negated key gives descending length; ties keep their CIGAR order
+        return sorted(found,key=lambda op: -op.getLength())
+
     def computeIntervals(self,refPos):
         ops=self.ops
         n=len(ops)
@@ -54,7 +62,7 @@ def __getitem__(self,i):
         
     def completeMatch(self):
         ops=self.ops
-        return len(ops)==1 and ops[0].op=="M"
+        return len(ops)==1 and ops[0].op in ("M","=","X")
 
     def longestMatchStats(self,query,ref):
         m=self.longestMatch()
@@ -76,7 +84,7 @@ def longestMatch(self):
         longest=None
         longestLength=0
         for op in self.ops:
-            if(op.getOp()=="M"):
+            if(op.getOp()in ("M","=","X")):
                 L=op.getLength()
                 if(L>longestLength):
                     longest=op
diff --git a/DataFrame.py b/DataFrame.py
@@ -18,6 +18,7 @@
 #   colHash : dictionary mapping column names to column indices
 # Methods:
 #   df=DataFrame()
+#   df.save(filename)
 #   rowNames=df.getRowNames()
 #   colNames=df.getColumnNames()
 #   df.addRow(DataFrameRow)
@@ -40,9 +41,11 @@
 #   bool=df.columnExists(colName) # call hashColNames() first!
 #   index=df.getColumnIndex(colName) # call hashColNames() first!
 #   newDataFrame=df.subsetColumns(colIndices)
+#   newDataFrame=df.subsetRows(rowIndices)
 #   idx=df.addColumn(colName,defaultValue) # returns index of new column
 #   df.print(handle)
 #   array=df.toDataArray()
+#   df.appendDF(otherDF) # does NOT do a deep copy!
 # Class methods:
 #   df=DataFrame.readTable(filename,header=False,rowNames=False)
 #=========================================================================
@@ -54,6 +57,13 @@ def __init__(self):
       self.rowHash=None
       self.colHash=None
 
+   def save(self,filename):
+      with open(filename,"wt") as OUT: # creates/overwrites filename as text
+         self.print(OUT) # file receives exactly what print(handle) emits
+
+   def appendDF(self,other):
+      self.matrix.extend(other.matrix) # rows are shared with other, not copied; header, rowHash and colHash are not touched
+
    def addRow(self,row):
       self.matrix.append(row)
 
@@ -90,6 +100,13 @@ def subsetColumns(self,colIndices):
          newDF.matrix.append(newRow)
       return newDF
 
+   def subsetRows(self,rowIndices):
+      """Return a new DataFrame holding clones of the rows at rowIndices."""
+      subset=DataFrame()
+      subset.header=self.header # the header object is shared, not copied
+      for idx in rowIndices: subset.addRow(self[idx].clone())
+      return subset
+
    def rowExists(self,rowName):
       if(self.rowHash is None): raise Exception("call hashRowNames() first")
       return self.rowHash.get(rowName,None) is not None
@@ -175,9 +192,9 @@ def readTable(cls,filename,header=False,rowNames=False):
       with open(filename,"rt") as IN:
          if(header):
             df.header=IN.readline()
-            df.header=df.header.rstrip().split("\t")
+            df.header=df.header.rstrip().split() #("\t")
          for line in IN:
-            fields=line.rstrip().split("\t")
+            fields=line.rstrip().split() #("\t")
             if(len(fields)<1): continue
             label=""
             if(rowNames):
diff --git a/DataFrameRow.py b/DataFrameRow.py
@@ -17,27 +17,40 @@
 #   row=DataFrameRow()
 #   elem=row[i] # first element is at 0 (the label is not counted)
 #   label=row.getLabel()
+#   raw=row.getRaw()
 #   row.rename(label)
 #   n=row.length()
 #   row.toInt()
 #   row.toFloat()
 #   row.append(value)
 #   row.print(handle)
+#   newRow=row.clone()
 #=========================================================================
 
 class DataFrameRow:
    def __init__(self):
       self.label=""
       self.values=[]
 
+   def getRaw(self):
+      return self.values # the row's own list (not a copy); the label is not included
+
+   def clone(self):
+      """Return a new DataFrameRow with the same label and a copied value list."""
+      duplicate=DataFrameRow()
+      duplicate.label=self.label
+      duplicate.values.extend(self.values) # new list, same element objects
+      return duplicate
+
    def __getitem__(self,i):
       return self.values[i]
 
    def __setitem__(self,i,value):
       self.values[i]=value
 
    def print(self,handle):
-      print(self.label+"\t","\t".join([str(x) for x in self.values]),sep="")
+      if(self.label!=""): print(self.label+"\t",end="",file=handle)
+      print("\t".join([str(x) for x in self.values]),sep="",file=handle)
 
    def append(self,value):
       self.values.append(value)
diff --git a/EssexNode.py b/EssexNode.py
@@ -32,6 +32,7 @@
 #   node.setIthElem(i,dataOrNode)
 #   elem=node.findChild(tag)
 #   array=node.findChildren(tag)
+#   node.dropChild(i)
 #   array=node.findDescendents(tag) # always returns an array
 #   elem=node.findDescendent(tag) # returns node or undef
 #   bool=node.hasDescendentOrDatum(tagOrDatum)
@@ -70,6 +71,9 @@ def __init__(self,parms):
             self.tag=""
             self.elements=[]
 
+    def dropChild(self,i):
+        del self.elements[i] # removes the i-th element in place; later elements shift down
+
     def addElem(self,elem):
         self.elements.append(elem)
 
diff --git a/EssexParser.py b/EssexParser.py
@@ -28,6 +28,8 @@
 #   parser.close()
 #   tree=parser.nextElem()   # returns root of the tree
 #   forest=parser.parseAll() # returns an array of trees
+# Class methods:
+#   forest=EssexParser.loadFile(filename) # returns array of trees
 ######################################################################
 
 class EssexParser:
@@ -57,6 +59,11 @@ def parseAll(self):
             forest.append(tree)
         return forest
 
+    @classmethod
+    def loadFile(cls,filename):
+        # Instantiate cls rather than EssexParser so a subclass parses as itself.
+        return cls(filename).parseAll() # array of trees; the parser is not closed here
+
     def nextElem(self):
         if(not self.isOpen): raise Exception("file is not open")
         scanner=self.scanner
diff --git a/FastaReader.py b/FastaReader.py
@@ -17,7 +17,7 @@
 # Instance Methods:
 #   reader=FastaReader(filename)
 #   reader=readerFromFileHandle(fileHandle);
-#   (defline,sequence)=reader.nextSequence()
+#   (defline,sequence)=reader.nextSequence() # returns None at eof
 #   reader.close()
 #   reader.dontUppercase()
 #   reader.doUppercase()
@@ -63,7 +63,7 @@ def nextSequence(self):
             else:
                 line=fh.readline()
                 if(line): line=line.rstrip()
-            if(not line): return [None,None]
+            if(not line): return None # [None,None]
             if(re.search("^\s*>",line)):
                 defline=line
                 while(True):
@@ -111,7 +111,9 @@ def readAll(cls,filename):
         hash={}
         reader=FastaReader(filename)
         while(True):
-            (defline,seq)=reader.nextSequence()
+            rec=reader.nextSequence()
+            if(rec is None): break
+            (defline,seq)=rec
             if(not defline): break
             match=re.search("^\s*>(\S+)",defline)
             if(not match): raise Exception("can't parse defline: "+defline)
diff --git a/FastqReader.py b/FastqReader.py
@@ -15,8 +15,11 @@
 # Attributes:
 #   fh : file handle
 # Instance Methods:
-#   reader=FastqReader(filename)
-#   (ID,seq,qual,pair)=reader.nextSequence() # returns None at EOF
+#   reader=FastqReader(filename) # can be gzipped!
+#   (ID,seq,qual,qualSeq,pair)=reader.nextSequence() # returns None at EOF
+#        * pair indicates which read of the pair: 1 or 2
+#        * qual is an array of integer quality values (ord(char)-33 per char)
+#        * qualSeq is the raw quality string
 #   reader.close()
 # Class Methods:
 #=========================================================================
@@ -43,8 +46,9 @@ def nextSequence(self):
         if(rex.find("\s+(\d)",line)): pair=int(rex[1])
         seq=fh.readline().rstrip()
         junk=fh.readline()
-        qual=fh.readline().rstrip()
-        return (ID,seq,qual,pair)
+        qualSeq=fh.readline().rstrip()
+        qual=[ord(x)-33 for x in qualSeq]
+        return (ID,seq,qual,qualSeq,pair)
         
 
 
diff --git a/Rex.py b/Rex.py
@@ -28,6 +28,13 @@ def find(self,pattern,line):
         self.match=re.search(pattern,line)
         return self.match is not None
 
+    def split(self,pattern,line):
+        """Split line on the regex pattern, discarding empty fields.
+
+        Returns a list of the non-empty pieces produced by re.split().
+        """
+        return [piece for piece in re.split(pattern,line) if piece!=""]
+
     def findOrDie(self,pattern,line):
         if(not self.find(pattern,line)): raise Exception("can't parse: "+line)
 
diff --git a/Stan.py b/Stan.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python
+#=========================================================================
+# This is OPEN SOURCE SOFTWARE governed by the Gnu General Public
+# License (GPL) version 3, as described at www.opensource.org.
+# Copyright (C)2016 William H. Majoros (martiandna@gmail.com).
+#=========================================================================
+from __future__ import (absolute_import, division, print_function, 
+   unicode_literals, generators, nested_scopes, with_statement)
+from builtins import (bytes, dict, int, list, object, range, str, ascii,
+   chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
+# The above imports should allow this program to run in both Python 2 and
+# Python 3.  You might need to update your version of module "future".
+import os
+
+######################################################################
+# Attributes:
+#    model : string placed at the start of the shell command (the model executable)
+# Methods:
+#    stan=Stan(model)
+#    stan.run(numWarmup,numSamples,inputFile,outputFile,stderrFile,initFile=None)
+#    stan.writeOneDimArray(name,array,dim,OUT)
+#    stan.writeTwoDimArray(name,array,firstDim,secondDim,OUT)
+#    stan.writeThreeDimArray(name,array,firstDim,secondDim,thirdDim,OUT)
+#    array=stan.initArray2D(dim1,dim2,value)
+#    array=stan.initArray3D(dim1,dim2,dim3,value)
+#    cmd=stan.getCmd(numWarmup,numSamples,inputFile,outputFile,stderrFile,initFile=None)
+######################################################################
+
+class Stan:
+    """Wrapper for writing data for, and running, a compiled Stan model.
+
+    model is the command used to launch the model; it is pasted verbatim
+    (unquoted) into a shell command line by getCmd().
+    """
+    def __init__(self,model):
+        self.model=model
+
+    def _writeStructure(self,name,cells,dims,OUT):
+        # Shared emitter: cells must already be in first-index-fastest order.
+        values=",".join(str(x) for x in cells)
+        shape=",".join(str(d) for d in dims)
+        print(name+" <- structure(c("+values+"), .Dim=c("+shape+"))",file=OUT)
+
+    def writeOneDimArray(self,name,array,dim,OUT):
+        """Write array[0..dim-1] to OUT as 'name <- c(v0,v1,...)'."""
+        values=",".join(str(array[i]) for i in range(dim))
+        print(name+" <- c("+values+")",file=OUT)
+
+    def writeTwoDimArray(self,name,array,firstDim,secondDim,OUT):
+        """Write array[i][j] to OUT as a structure(), first index fastest."""
+        cells=(array[i][j] for j in range(secondDim) for i in range(firstDim))
+        self._writeStructure(name,cells,(firstDim,secondDim),OUT)
+
+    def writeThreeDimArray(self,name,array,firstDim,secondDim,thirdDim,OUT):
+        """Write array[i][j][k] to OUT as a structure(), first index fastest."""
+        cells=(array[i][j][k] for k in range(thirdDim)
+               for j in range(secondDim) for i in range(firstDim))
+        self._writeStructure(name,cells,(firstDim,secondDim,thirdDim),OUT)
+
+    def initArray2D(self,dim1,dim2,value):
+        """Return a dim1 x dim2 list of lists with every cell set to value.
+
+        Rows are distinct lists; value itself is not copied.
+        """
+        return [[value for j in range(dim2)] for i in range(dim1)]
+
+    def initArray3D(self,dim1,dim2,dim3,value):
+        """Return a dim1 x dim2 x dim3 nested list filled with value.
+
+        Every sub-list is distinct; value itself is not copied.
+        """
+        return [[[value for k in range(dim3)] for j in range(dim2)]
+                for i in range(dim1)]
+
+    def run(self,numWarmup,numSamples,inputFile,outputFile,stderrFile,initFile=None):
+        """Run the model through the shell and return os.system()'s status.
+
+        The status used to be discarded, so a failed run was undetectable;
+        a nonzero value means the command did not exit cleanly.
+        """
+        cmd=self.getCmd(numWarmup,numSamples,inputFile,outputFile,stderrFile,initFile)
+        return os.system(cmd)
+
+    def getCmd(self,numWarmup,numSamples,inputFile,outputFile,stderrFile,initFile=None):
+        """Return the shell command line that run() executes.
+
+        NOTE(review): arguments are not shell-quoted, and '>' sends the
+        model's standard output (not standard error) to stderrFile.
+        """
+        init=" init="+initFile if initFile is not None else ""
+        return (self.model+" sample thin=1 num_samples="+str(numSamples)+
+                " num_warmup="+str(numWarmup)+" data file="+inputFile+init+
+                " output file="+outputFile+" refresh=0 > "+stderrFile)
+