update

Bill Majoros · Bill Majoros · commit 574df0fa68d7 · 2019-01-04T14:31:21.000-05:00
diff --git a/DataFrame.py b/DataFrame.py
@@ -0,0 +1,131 @@
+#=========================================================================
+# This is OPEN SOURCE SOFTWARE governed by the Gnu General Public
+# License (GPL) version 3, as described at www.opensource.org.
+# 2018 William H. Majoros (bmajoros@alumni.duke.edu)
+#=========================================================================
+from __future__ import (absolute_import, division, print_function, 
+   unicode_literals, generators, nested_scopes, with_statement)
+from builtins import (bytes, dict, int, list, object, range, str, ascii,
+   chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
+import sys
+from DataFrameRow import DataFrameRow
+
+#=========================================================================
+# Attributes:
+#   header : array of column names
+#   matrix : array of rows, each of which is an array of data values
+#   rowHash : dictionary mapping row names to row indices
+#   colHash : dictionary mapping column names to column indices
+# Methods:
+#   df=DataFrame()
+#   rowNames=df.getRowNames()
+#   colNames=df.getColumnNames()
+#   n=df.nrow()
+#   n=df.ncol()
+#   row=df[index]
+#   elem=df[i][j]
+#   df.toInt()
+#   df.toFloat()
+#   header=df.getHeader()
+#   df.hashRowNames()
+#   df.hashColNames()
+#   row=df.getRow(rowName) # call hashRowNames() first!
+#   col=df.getColumn(columnName) # call hashColNames() first!
+#   bool=df.rowExists(rowName) # call hashRowNames() first!
+#   bool=df.columnExists(colName) # call hashColNames() first!
+# Class methods:
+#   df=DataFrame.readTable(filename,hasHeader=True,hasRowNames=True)
+#=========================================================================
+
+class DataFrame:
+   """Table of labeled rows read from a whitespace-delimited text file.
+
+   Attributes:
+      header  : list of column names
+      matrix  : list of DataFrameRow objects, one per data line
+      rowHash : dict mapping row label -> row index
+                (None until hashRowNames() is called)
+      colHash : dict mapping column name -> column index
+                (None until hashColNames() is called)
+   """
+   def __init__(self):
+      self.header=[]
+      self.matrix=[]
+      self.rowHash=None
+      self.colHash=None
+
+   def rowExists(self,rowName):
+      """Return True if rowName was indexed by hashRowNames().
+
+      Raises Exception if hashRowNames() has not been called.
+      """
+      if(self.rowHash is None): raise Exception("call hashRowNames() first")
+      return self.rowHash.get(rowName,None) is not None
+
+   def columnExists(self,colName):
+      """Return True if colName was indexed by hashColNames().
+
+      Raises Exception if hashColNames() has not been called.
+      """
+      if(self.colHash is None): raise Exception("call hashColNames() first")
+      return self.colHash.get(colName,None) is not None
+
+   def getRowNames(self):
+      """Return a new list holding the label of every row, in order."""
+      return [row.label for row in self.matrix]
+
+   def getColumnNames(self):
+      """Return the list of column names (the same object as getHeader())."""
+      # Was "return header": an undefined global, so every call raised
+      # NameError.
+      return self.header
+
+   def getRow(self,rowName):
+      """Return the row whose label is rowName.
+
+      Raises Exception if hashRowNames() has not been called or if no
+      row carries that label.
+      """
+      if(self.rowHash is None): raise Exception("call hashRowNames() first")
+      rowIndex=self.rowHash.get(rowName,None)
+      if(rowIndex is None): raise Exception("row not found: "+rowName)
+      return self.matrix[rowIndex]
+
+   def getColumn(self,colName):
+      """Return a new DataFrameRow labeled colName holding that column's
+      value from every row, in row order.
+
+      Raises Exception if hashColNames() has not been called or if no
+      column carries that name.
+      """
+      if(self.colHash is None): raise Exception("call hashColNames() first")
+      colIndex=self.colHash.get(colName,None)
+      if(colIndex is None): raise Exception("column not found: "+colName)
+      column=DataFrameRow()
+      column.label=colName
+      for row in self.matrix:
+         # Was "colum.values.append(...)": a misspelled name that raised
+         # NameError on the first row.
+         column.values.append(row[colIndex])
+      # The original fell off the end and returned None.
+      return column
+
+   def hashRowNames(self):
+      """(Re)build rowHash; if labels repeat, the last row wins."""
+      h=self.rowHash={}
+      for i,row in enumerate(self.matrix):
+         h[row.label]=i
+
+   def hashColNames(self):
+      """(Re)build colHash; if names repeat, the last column wins."""
+      h=self.colHash={}
+      # Was "h[header[i]]=i" with an undefined global "header".
+      for i,name in enumerate(self.header):
+         h[name]=i
+
+   def getHeader(self):
+      """Return the list of column names."""
+      return self.header
+
+   def nrow(self):
+      """Return the number of rows."""
+      return len(self.matrix)
+
+   def ncol(self):
+      """Return the number of columns, as given by the header length."""
+      return len(self.header)
+
+   def __getitem__(self,i):
+      """Return row i, so that df[i][j] addresses a single element."""
+      return self.matrix[i]
+
+   def toInt(self):
+      """Convert every value in every row to int, in place."""
+      for row in self.matrix: row.toInt()
+
+   def toFloat(self):
+      """Convert every value in every row to float, in place."""
+      for row in self.matrix: row.toFloat()
+
+   @classmethod
+   def readTable(cls,filename,hasHeader=True,hasRowNames=True):
+      """Parse a whitespace-delimited text file into a new data frame.
+
+      filename    : path of the file to read
+      hasHeader   : if True, the first line supplies the column names
+      hasRowNames : if True, the first field of each data line becomes
+                    the row label and is removed from the values
+
+      Lines with no fields are skipped.  Values are kept as strings;
+      call toInt() or toFloat() to convert them.
+      """
+      # cls() rather than DataFrame() so a subclass gets its own type.
+      df=cls()
+      with open(filename,"rt") as IN:
+         if(hasHeader):
+            df.header=IN.readline().rstrip().split()
+         for line in IN:
+            fields=line.rstrip().split()
+            if(len(fields)<1): continue
+            label=""
+            if(hasRowNames):
+               label=fields[0]
+               fields=fields[1:]
+            row=DataFrameRow()
+            row.label=label
+            row.values=fields
+            df.matrix.append(row)
+      # If the first row holds fewer values than the header has names,
+      # drop the first header entry (presumably the name heading the
+      # row-label column -- confirm against real input files).
+      if(len(df.matrix)>0 and df.matrix[0].length()<len(df.header)):
+         df.header=df.header[1:]
+      return df
+
diff --git a/DataFrameRow.py b/DataFrameRow.py
@@ -0,0 +1,48 @@
+#=========================================================================
+# This is OPEN SOURCE SOFTWARE governed by the Gnu General Public
+# License (GPL) version 3, as described at www.opensource.org.
+# 2018 William H. Majoros (bmajoros@alumni.duke.edu)
+#=========================================================================
+from __future__ import (absolute_import, division, print_function, 
+   unicode_literals, generators, nested_scopes, with_statement)
+from builtins import (bytes, dict, int, list, object, range, str, ascii,
+   chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
+import sys
+
+#=========================================================================
+# Attributes:
+#   label : string
+#   values : array of values
+# Methods:
+#   row=DataFrameRow()
+#   elem=row[i] # first element is at 0 (the label is not counted)
+#   label=row.getLabel()
+#   row.rename(label)
+#   n=row.length()
+#   row.toInt()
+#   row.toFloat()
+#=========================================================================
+
+class DataFrameRow:
+   """One row of a data frame: a string label plus a list of values."""
+   def __init__(self):
+      self.values=[]
+      self.label=""
+
+   def rename(self,x):
+      """Replace this row's label with x."""
+      self.label=x
+
+   def getLabel(self):
+      """Return this row's label."""
+      return self.label
+
+   def length(self):
+      """Return the number of values (the label is not counted)."""
+      return len(self.values)
+
+   def __getitem__(self,i):
+      """Return value i; index 0 is the first value, not the label."""
+      return self.values[i]
+
+   def toInt(self):
+      """Replace the values with their int() conversions."""
+      converted=[]
+      for value in self.values:
+         converted.append(int(value))
+      # Assigned only after every conversion succeeded, so a failed
+      # conversion leaves the row untouched.
+      self.values=converted
+
+   def toFloat(self):
+      """Replace the values with their float() conversions."""
+      converted=[]
+      for value in self.values:
+         converted.append(float(value))
+      self.values=converted
+
diff --git a/FastqReader.py b/FastqReader.py
@@ -0,0 +1,50 @@
+#=========================================================================
+# This is OPEN SOURCE SOFTWARE governed by the Gnu General Public
+# License (GPL) version 3, as described at www.opensource.org.
+# 2018 William H. Majoros (bmajoros@alumni.duke.edu)
+#=========================================================================
+from __future__ import (absolute_import, division, print_function,
+   unicode_literals, generators, nested_scopes, with_statement)
+from builtins import (bytes, dict, int, list, object, range, str, ascii,
+   chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
+from Rex import Rex
+rex=Rex()
+import gzip
+
+#=========================================================================
+# Attributes:
+#   fh : file handle
+# Instance Methods:
+#   reader=FastqReader(filename)
+#   [ID,seq,qual,pair]=reader.nextSequence() # returns None at EOF
+#   reader.close()
+# Class Methods:
+#=========================================================================
+class FastqReader:
+    """Sequential reader for FASTQ files, plain or gzip-compressed.
+
+    Attributes:
+        fh : open text-mode file handle, or None when the reader was
+             constructed with filename=None
+    """
+    def __init__(self,filename):
+        """Open filename; a name ending in ".gz" is opened through gzip.
+
+        Passing None creates a reader with no open file.
+        """
+        # Always define fh: previously a None filename left the attribute
+        # unset, so close() raised AttributeError.
+        self.fh=None
+        if(filename is not None):
+            # Raw strings: "\." in a plain literal is an invalid escape
+            # sequence that newer Pythons warn about.
+            if(rex.find(r"\.gz$",filename)): self.fh=gzip.open(filename,"rt")
+            else: self.fh=open(filename,"r")
+
+    def close(self):
+        """Close the underlying file, if one was opened."""
+        if(self.fh is not None): self.fh.close()
+
+    def nextSequence(self):
+        """Read the next four-line record.
+
+        Returns [ID,seq,qual,pair], or None at end of file or when the
+        header line does not start with a non-whitespace token.  ID is
+        that leading token; seq and qual are the second and fourth lines
+        with trailing whitespace removed.  pair defaults to 1 and is
+        replaced by the first digit that follows whitespace on the
+        header line, if any (presumably the mate number -- confirm).
+        """
+        fh=self.fh
+        line=fh.readline()
+        # readline() yields "" (never None) at EOF.
+        if(not line): return None
+        if(not rex.find(r"^(\S+)",line)):
+            return None
+        ID=rex[1]
+        pair=1
+        if(rex.find(r"\s+(\d)",line)): pair=int(rex[1])
+        seq=fh.readline().rstrip()
+        fh.readline() # third line of the record is read and discarded
+        qual=fh.readline().rstrip()
+        return [ID,seq,qual,pair]
+        
+
+
diff --git a/SamReader.py b/SamReader.py
@@ -0,0 +1,50 @@
+#=========================================================================
+# This is OPEN SOURCE SOFTWARE governed by the Gnu General Public
+# License (GPL) version 3, as described at www.opensource.org.
+# 2018 William H. Majoros (bmajoros@alumni.duke.edu)
+#=========================================================================
+from __future__ import (absolute_import, division, print_function,
+   unicode_literals, generators, nested_scopes, with_statement)
+from builtins import (bytes, dict, int, list, object, range, str, ascii,
+   chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
+from Rex import Rex
+rex=Rex()
+import gzip
+from SamRecord import SamRecord
+
+#=========================================================================
+# Attributes:
+#   fh : file handle
+# Instance Methods:
+#   reader=SamReader(filename)
+#   rec=reader.nextSequence() # returns a SamRecord, or None at EOF
+#   reader.close()
+# Class Methods:
+#=========================================================================
+class SamReader:
+    """Sequential reader for SAM files, plain or gzip-compressed.
+
+    Attributes:
+        fh : open text-mode file handle, or None when the reader was
+             constructed with filename=None
+    """
+    def __init__(self,filename):
+        """Open filename; a name ending in ".gz" is opened through gzip.
+
+        Passing None creates a reader with no open file.
+        """
+        # Always define fh: previously a None filename left the attribute
+        # unset, so close() raised AttributeError.
+        self.fh=None
+        if(filename is not None):
+            # Raw string: "\." in a plain literal is an invalid escape
+            # sequence that newer Pythons warn about.
+            if(rex.find(r"\.gz$",filename)): self.fh=gzip.open(filename,"rt")
+            else: self.fh=open(filename,"r")
+
+    def close(self):
+        """Close the underlying file, if one was opened."""
+        if(self.fh is not None): self.fh.close()
+
+    def nextSequence(self):
+        """Return the next alignment as a SamRecord, or None at EOF.
+
+        Lines beginning with "@" are skipped.  Raises Exception when a
+        line splits into fewer than 11 whitespace-separated fields, and
+        ValueError when the fourth field is not an integer.
+        """
+        fh=self.fh
+        line=fh.readline()
+        # readline() yields "" (never None) at EOF, which also ends
+        # this loop.
+        while(line.startswith("@")):
+            line=fh.readline()
+        if(not line): return None
+        fields=line.rstrip().split()
+        if(len(fields)<11): raise Exception("can't parse sam line: "+line)
+        # Only these columns feed the record: 1=ID, 3=reference name,
+        # 4=reference position, 6=CIGAR, 10=read sequence.
+        ID=fields[0]
+        refName=fields[2]
+        refPos=int(fields[3])
+        cigar=fields[5]
+        seq=fields[9]
+        return SamRecord(ID,refName,refPos,cigar,seq)
+
+# M03884:303:000000000-C4RM6:1:1101:1776:15706    99      chrX:31786371-31797409  6687    44      150M    =       6813    271     ATACTATTGCTGCGGTAATAACTGTAACTGCAGTTACTATTTAGTGATTTGTATGTAGATGTAGATGTAGTCTATGTCAGACACTATGCTGAGCATTTTATGGTTGCTATGTACTGATACATACAGAAACAAGAGGTACGTTCTTTTACA  BBBBFFFFFFFGGGGGEFGGFGHFHFFFHHHFFHHHFHFHHHGFHEDGGHFHBGFHGBDHFHFFFHHHHFHHHHHGHGFFBGGGHFHFFHHFFFFHHHHGHGFHHGFHGHHHGFHFFHHFHHFFGFFFFGGEHFFEHHFGHHHGHHHHFB  AS:i:300        XN:i:0  
+
diff --git a/SamRecord.py b/SamRecord.py
@@ -0,0 +1,32 @@
+#=========================================================================
+# This is OPEN SOURCE SOFTWARE governed by the Gnu General Public
+# License (GPL) version 3, as described at www.opensource.org.
+# 2018 William H. Majoros (bmajoros@alumni.duke.edu)
+#=========================================================================
+from __future__ import (absolute_import, division, print_function,
+   unicode_literals, generators, nested_scopes, with_statement)
+from builtins import (bytes, dict, int, list, object, range, str, ascii,
+   chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
+
+#=========================================================================
+# Attributes:
+#   ID = read identifier
+#   refName = name of reference sequence the read aligns to
+#   refPos = position in reference where alignment begins
+#   cigar = alignment
+#   seq = read sequence
+# Instance Methods:
+#   rec=SamRecord(ID,refName,refPos,cigar,seq)
+# Class Methods:
+#=========================================================================
+class SamRecord:
+    """Plain record holding the fields kept from one SAM alignment."""
+    def __init__(self,ID,refName,refPos,cigar,seq):
+        """Store the five constructor arguments as same-named attributes."""
+        (self.ID,self.refName,self.refPos,
+         self.cigar,self.seq)=(ID,refName,refPos,cigar,seq)
+
+# M03884:303:000000000-C4RM6:1:1101:1776:15706    99      chrX:31786371-31797409  6687    44      150M    =       6813    271     ATACTATTGCTGCGGTAATAACTGTAACTGCAGTTACTATTTAGTGATTTGTATGTAGATGTAGATGTAGTCTATGTCAGACACTATGCTGAGCATTTTATGGTTGCTATGTACTGATACATACAGAAACAAGAGGTACGTTCTTTTACA  BBBBFFFFFFFGGGGGEFGGFGHFHFFFHHHFFHHHFHFHHHGFHEDGGHFHBGFHGBDHFHFFFHHHHFHHHHHGHGFFBGGGHFHFFHHFFFFHHHHGHGFHHGFHGHHHGFHFFHHFHHFFGFFFFGGEHFFEHHFGHHHGHHHHFB  AS:i:300        XN:i:0  
+
diff --git a/Shuffler.py b/Shuffler.py
@@ -13,9 +13,10 @@
 # Attributes:
 #   
 # Instance Methods:
-#   Shuffler()
+#   shuffler=Shuffler()
 # Class Methods:
-#   
+#   Shuffler.shuffleArray(array)
+#   s=Shuffler.shuffleString(s)
 #=========================================================================
 class Shuffler:
     """Shuffler shuffles arrays and strings"""
diff --git a/SlurmWriter.py b/SlurmWriter.py
@@ -18,12 +18,12 @@
 #   threadsValue : number of CPUs requested
 # Instance Methods:
 #   SlurmWriter()
-#   writer.addCommand(cmd)
-#   writer.nice() # turns on "nice" (sets it to 100 by default)
-#   writer.mem(1500)
-#   writer.threads(16)
-#   writer.setQueue("new,all")
-#   writer.writeArrayScript(slurmDir,jobName,maxParallel,
+#   slurm.addCommand(cmd)
+#   slurm.nice() # turns on "nice" (sets it to 100 by default)
+#   slurm.mem(1500)
+#   slurm.threads(16)
+#   slurm.setQueue("new,all")
+#   slurm.writeArrayScript(slurmDir,jobName,maxParallel,
 #                           additional_SBATCH_lines)
 #=========================================================================
 class SlurmWriter:
@@ -95,6 +95,32 @@ def writeArrayScript(self,slurmDir,jobName,maxParallel,moreSBATCH=""):
                      queue+moreSBATCH+"#",
                      slurmDir+"/command${SLURM_ARRAY_TASK_ID}.sh\n"
                      ]))
+    def writeScript(self,slurmFile,outFile,jobName,command,moreSBATCH=""):
+        """Write a single-job SLURM batch script to slurmFile.
+
+        slurmFile  : path of the script to create (overwritten)
+        outFile    : path used for both the -o and -e directives
+        jobName    : value used for both the -J and -A directives
+        command    : text written as the final line of the script
+        moreSBATCH : extra "#SBATCH ..." line(s) to include; None is
+                     treated as empty
+
+        Directives for nice, mem, cpus-per-task and the partition are
+        added only when the matching instance setting is positive or
+        non-empty.
+        """
+        if(moreSBATCH is None): moreSBATCH=""
+        extra=moreSBATCH.rstrip()
+        # Caller-supplied directives get exactly one trailing newline.
+        if(len(extra)>0): extra+="\n"
+        settings=(("--nice=",self.niceValue),
+                  ("--mem=",self.memValue),
+                  ("--cpus-per-task=",self.threadsValue))
+        for (flag,value) in settings:
+            if(value>0): extra+="#SBATCH "+flag+str(value)+"\n"
+        partition=""
+        if(len(self.queue)>0): partition="#SBATCH -p "+self.queue+"\n"
+        lines=["#!/bin/sh",
+               "#",
+               "#SBATCH --get-user-env",
+               "#SBATCH -J "+jobName,
+               "#SBATCH -A "+jobName,
+               "#SBATCH -o "+outFile,
+               "#SBATCH -e "+outFile,
+               partition+extra+"#",
+               command]
+        # No newline is written after the command line.
+        with open(slurmFile,"w") as OUT:
+            OUT.write("\n".join(lines))
 
 
 
diff --git a/essex-pretty-print.py b/essex-pretty-print.py
diff --git a/template.py b/template.py

-Original file line number
+Diff line change
 +#!/usr/bin/env python
 +#=========================================================================
 +# This is OPEN SOURCE SOFTWARE governed by the Gnu General Public
 +# License (GPL) version 3, as described at www.opensource.org.
 +# Author: William H. Majoros (bmajoros@alumni.duke.edu)
 +#=========================================================================
 +from __future__ import (absolute_import, division, print_function,
 +   unicode_literals, generators, nested_scopes, with_statement)
 +from builtins import (bytes, dict, int, list, object, range, str, ascii,
 +   chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
 +# The above imports should allow this program to run in both Python 2 and
 +# Python 3.  You might need to update your version of module "future".
 +import sys
 +import ProgramName
 +from EssexParser import EssexParser
++
 +#=========================================================================
 +# main()
 +#=========================================================================
 +# Expect exactly one argument: the essex file to pretty-print.
 +if(len(sys.argv)!=2):
 +    # sys.exit rather than the bare exit(): the latter is installed by
 +    # the site module for interactive use and is not always available.
 +    # A string argument is printed to stderr and the exit status is 1.
 +    sys.exit(ProgramName.get()+" <in.essex>\n")
 +(infile,)=sys.argv[1:]
++
 +# Print each element returned by the parser until it returns None.
 +parser=EssexParser(infile)
 +while(True):
 +    tree=parser.nextElem()
 +    if(tree is None): break
 +    tree.print(sys.stdout)
++
++
++
++