Skip to content

Commit 04b179d

Browse files
author
Bill Majoros
committed
update
1 parent afcf79a commit 04b179d

10 files changed

+181
-16
lines changed

CigarOp.py

+11-2
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
# length : integer
1818
# interval1 : Interval (in sequence 1 = query)
1919
# interval2 : Interval (in sequence 2 = reference)
20-
# op : M/I/D/S:
20+
# op : M(or =/X)/I/D/S:
2121
# consumes
2222
# query ref
2323
# M 0 alignment match (can be a sequence match or mismatch) yes yes
@@ -35,12 +35,21 @@
3535
# bool=op.advanceInRef() # matches, deletions, etc.
3636
# op=op.getOp()
3737
# L=op.getLength()
38+
# interval=op.getQueryInterval() # sequence 1
39+
# interval=op.getRefInterval() # sequence 2
3840
#=========================================================================
3941
class CigarOp:
4042
def __init__(self,op,L):
4143
self.op=op
4244
self.length=L
43-
self.interval=None
45+
self.interval1=None
46+
self.interval2=None
47+
48+
def getQueryInterval(self):
49+
return self.interval1
50+
51+
def getRefInterval(self):
52+
return self.interval2
4453

4554
def getOp(self):
4655
return self.op

CigarString.py

+12-4
Original file line numberDiff line numberDiff line change
@@ -22,16 +22,24 @@
2222
# cigarOp=cigar[i] # returns a CigarOp object
2323
# str=cigar.toString()
2424
# cigar.computeIntervals(refPos)
25+
# ops=cigar.matchesByLength() # sorted by decreasing length
2526
# op=cigar.longestMatch() # returns a CigarOp object (or None)
2627
# L=cigar.longestMatchLen() # returns integer
27-
# (numMatches,numMismatches)=cigar.longestMatchStats(seq1,seq2) # None if no match
28-
# ^ must call computeIntervals() first!
28+
# (numMatches,numMismatches)=cigar.longestMatchStats(seq1,seq2)
29+
# # ^ Returns none if no match; must call computeIntervals() first!
2930
#=========================================================================
3031
class CigarString:
3132
"""CigarString parses CIGAR strings (alignments)"""
3233
def __init__(self,cigar):
3334
self.ops=self.parse(cigar)
3435

36+
def matchesByLength(self):
37+
matches=[]
38+
for op in self.ops:
39+
if(op.getOp() in ("M","=","X")): matches.append(op)
40+
matches.sort(key=lambda x: -x.getLength())
41+
return matches
42+
3543
def computeIntervals(self,refPos):
3644
ops=self.ops
3745
n=len(ops)
@@ -54,7 +62,7 @@ def __getitem__(self,i):
5462

5563
def completeMatch(self):
5664
ops=self.ops
57-
return len(ops)==1 and ops[0].op=="M"
65+
return len(ops)==1 and ops[0].op in ("M","=","X")
5866

5967
def longestMatchStats(self,query,ref):
6068
m=self.longestMatch()
@@ -76,7 +84,7 @@ def longestMatch(self):
7684
longest=None
7785
longestLength=0
7886
for op in self.ops:
79-
if(op.getOp()=="M"):
87+
if(op.getOp()in ("M","=","X")):
8088
L=op.getLength()
8189
if(L>longestLength):
8290
longest=op

DataFrame.py

+19-2
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
# colHash : dictionary mapping column names to column indices
1919
# Methods:
2020
# df=DataFrame()
21+
# df.save(filename)
2122
# rowNames=df.getRowNames()
2223
# colNames=df.getColumnNames()
2324
# df.addRow(DataFrameRow)
@@ -40,9 +41,11 @@
4041
# bool=df.columnExists(colName) # call hashColNames() first!
4142
# index=df.getColumnIndex(colName) # call hashColNames() first!
4243
# newDataFrame=df.subsetColumns(colIndices)
44+
# newDataFrame=df.subsetRows(rowIndices)
4345
# idx=df.addColumn(colName,defaultValue) # returns index of new column
4446
# df.print(handle)
4547
# array=df.toDataArray()
48+
# df.appendDF(otherDF) # does NOT do a deep copy!
4649
# Class methods:
4750
# df=DataFrame.readTable(filename,header=False,rowNames=False)
4851
#=========================================================================
@@ -54,6 +57,13 @@ def __init__(self):
5457
self.rowHash=None
5558
self.colHash=None
5659

60+
def save(self,filename):
61+
with open(filename,"wt") as OUT:
62+
self.print(OUT)
63+
64+
def appendDF(self,other):
65+
self.matrix.extend(other.matrix)
66+
5767
def addRow(self,row):
5868
self.matrix.append(row)
5969

@@ -90,6 +100,13 @@ def subsetColumns(self,colIndices):
90100
newDF.matrix.append(newRow)
91101
return newDF
92102

103+
def subsetRows(self,rowIndices):
104+
newDF=DataFrame()
105+
newDF.header=self.header
106+
for i in rowIndices:
107+
newDF.addRow(self[i].clone())
108+
return newDF
109+
93110
def rowExists(self,rowName):
94111
if(self.rowHash is None): raise Exception("call hashRowNames() first")
95112
return self.rowHash.get(rowName,None) is not None
@@ -175,9 +192,9 @@ def readTable(cls,filename,header=False,rowNames=False):
175192
with open(filename,"rt") as IN:
176193
if(header):
177194
df.header=IN.readline()
178-
df.header=df.header.rstrip().split("\t")
195+
df.header=df.header.rstrip().split() #("\t")
179196
for line in IN:
180-
fields=line.rstrip().split("\t")
197+
fields=line.rstrip().split() #("\t")
181198
if(len(fields)<1): continue
182199
label=""
183200
if(rowNames):

DataFrameRow.py

+14-1
Original file line numberDiff line numberDiff line change
@@ -17,27 +17,40 @@
1717
# row=DataFrameRow()
1818
# elem=row[i] # first element is at 0 (the label is not counted)
1919
# label=row.getLabel()
20+
# raw=raw.getRaw()
2021
# row.rename(label)
2122
# n=row.length()
2223
# row.toInt()
2324
# row.toFloat()
2425
# row.append(value)
2526
# row.print(handle)
27+
# newRow=row.clone()
2628
#=========================================================================
2729

2830
class DataFrameRow:
2931
def __init__(self):
3032
self.label=""
3133
self.values=[]
3234

35+
def getRaw(self):
36+
return self.values
37+
38+
def clone(self):
39+
r=DataFrameRow()
40+
r.label=self.label
41+
for x in self.values:
42+
r.values.append(x)
43+
return r
44+
3345
def __getitem__(self,i):
3446
return self.values[i]
3547

3648
def __setitem__(self,i,value):
3749
self.values[i]=value
3850

3951
def print(self,handle):
40-
print(self.label+"\t","\t".join([str(x) for x in self.values]),sep="")
52+
if(self.label!=""): print(self.label+"\t",end="",file=handle)
53+
print("\t".join([str(x) for x in self.values]),sep="",file=handle)
4154

4255
def append(self,value):
4356
self.values.append(value)

EssexNode.py

+4
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
# node.setIthElem(i,dataOrNode)
3333
# elem=node.findChild(tag)
3434
# array=node.findChildren(tag)
35+
# node.dropChild(i)
3536
# array=node.findDescendents(tag) # always returns an array
3637
# elem=node.findDescendent(tag) # returns node or undef
3738
# bool=node.hasDescendentOrDatum(tagOrDatum)
@@ -70,6 +71,9 @@ def __init__(self,parms):
7071
self.tag=""
7172
self.elements=[]
7273

74+
def dropChild(self,i):
75+
del self.elements[i]
76+
7377
def addElem(self,elem):
7478
self.elements.append(elem)
7579

EssexParser.py

+7
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
# parser.close()
2929
# tree=parser.nextElem() # returns root of the tree
3030
# forest=parser.parseAll() # returns an array of trees
31+
# Class methods:
32+
# forest=EssexParser.loadFile(filename) # returns array of trees
3133
######################################################################
3234

3335
class EssexParser:
@@ -57,6 +59,11 @@ def parseAll(self):
5759
forest.append(tree)
5860
return forest
5961

62+
@classmethod
63+
def loadFile(cls,filename):
64+
parser=EssexParser(filename)
65+
return parser.parseAll()
66+
6067
def nextElem(self):
6168
if(not self.isOpen): raise Exception("file is not open")
6269
scanner=self.scanner

FastaReader.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
# Instance Methods:
1818
# reader=FastaReader(filename)
1919
# reader=readerFromFileHandle(fileHandle);
20-
# (defline,sequence)=reader.nextSequence()
20+
# (defline,sequence)=reader.nextSequence() # returns None at eof
2121
# reader.close()
2222
# reader.dontUppercase()
2323
# reader.doUppercase()
@@ -63,7 +63,7 @@ def nextSequence(self):
6363
else:
6464
line=fh.readline()
6565
if(line): line=line.rstrip()
66-
if(not line): return [None,None]
66+
if(not line): return None # [None,None]
6767
if(re.search("^\s*>",line)):
6868
defline=line
6969
while(True):
@@ -111,7 +111,9 @@ def readAll(cls,filename):
111111
hash={}
112112
reader=FastaReader(filename)
113113
while(True):
114-
(defline,seq)=reader.nextSequence()
114+
rec=reader.nextSequence()
115+
if(rec is None): break
116+
(defline,seq)=rec
115117
if(not defline): break
116118
match=re.search("^\s*>(\S+)",defline)
117119
if(not match): raise Exception("can't parse defline: "+defline)

FastqReader.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,11 @@
1515
# Attributes:
1616
# fh : file handle
1717
# Instance Methods:
18-
# reader=FastqReader(filename)
19-
# (ID,seq,qual,pair)=reader.nextSequence() # returns None at EOF
18+
# reader=FastqReader(filename) # can be gzipped!
19+
# (ID,seq,qual,qualSeq,pair)=reader.nextSequence() # returns None at EOF
20+
# * pair indicates which read of the pair: 1 or 2
21+
# * qual is an array of integer quality values
22+
# * qualSeq is the raw quality string
2023
# reader.close()
2124
# Class Methods:
2225
#=========================================================================
@@ -43,8 +46,9 @@ def nextSequence(self):
4346
if(rex.find("\s+(\d)",line)): pair=int(rex[1])
4447
seq=fh.readline().rstrip()
4548
junk=fh.readline()
46-
qual=fh.readline().rstrip()
47-
return (ID,seq,qual,pair)
49+
qualSeq=fh.readline().rstrip()
50+
qual=[ord(x)-33 for x in qualSeq]
51+
return (ID,seq,qual,qualSeq,pair)
4852

4953

5054

Rex.py

+7
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,13 @@ def find(self,pattern,line):
2828
self.match=re.search(pattern,line)
2929
return self.match is not None
3030

31+
def split(self,pattern,line):
32+
fields=re.split(pattern,line)
33+
nonEmpty=[]
34+
for x in fields:
35+
if(x!=""): nonEmpty.append(x)
36+
return nonEmpty
37+
3138
def findOrDie(self,pattern,line):
3239
if(not self.find(pattern,line)): raise Exception("can't parse: "+line)
3340

Stan.py

+94
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
#!/usr/bin/env python
2+
#=========================================================================
3+
# This is OPEN SOURCE SOFTWARE governed by the Gnu General Public
4+
# License (GPL) version 3, as described at www.opensource.org.
5+
# Copyright (C)2016 William H. Majoros (martiandna@gmail.com).
6+
#=========================================================================
7+
from __future__ import (absolute_import, division, print_function,
8+
unicode_literals, generators, nested_scopes, with_statement)
9+
from builtins import (bytes, dict, int, list, object, range, str, ascii,
10+
chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
11+
# The above imports should allow this program to run in both Python 2 and
12+
# Python 3. You might need to update your version of module "future".
13+
import os
14+
15+
######################################################################
16+
# Attributes:
17+
#
18+
# Methods:
19+
# stan=Stan(model)
20+
# stan.run(numWarmup,numSamples,inputFile,outputFile,stderrFile,initFile=None):
21+
# stan.writeOneDimArray(name,array,dim,OUT):
22+
# stan.writeTwoDimArray(name,array,firstDim,secondDim,OUT):
23+
# stan.writeThreeDimArray(name,array,firstDim,secondDim,thirdDim,OUT):
24+
# stan.initArray2D(dim1,dim2,value):
25+
# stan.initArray3D(dim1,dim2,dim3,value):
26+
27+
######################################################################
28+
29+
class Stan:
30+
def __init__(self,model):
31+
self.model=model
32+
33+
def writeOneDimArray(self,name,array,dim,OUT):
34+
print(name+" <- c(",end="",file=OUT)
35+
for i in range(0,dim):
36+
print(array[i],end="",file=OUT)
37+
if(i+1<dim): print(",",end="",file=OUT)
38+
print(")",file=OUT)
39+
40+
def writeTwoDimArray(self,name,array,firstDim,secondDim,OUT):
41+
print(name+" <- structure(c(",end="",file=OUT)
42+
for j in range(secondDim): # second dim
43+
for i in range(firstDim): # first dim
44+
print(array[i][j],end="",file=OUT)
45+
if(i+1<firstDim): print(",",end="",file=OUT)
46+
if(j+1<secondDim): print(",",end="",file=OUT)
47+
print("), .Dim=c(",firstDim,",",secondDim,"))",sep="",file=OUT)
48+
49+
def writeThreeDimArray(self,name,array,firstDim,secondDim,thirdDim,OUT):
50+
print(name+" <- structure(c(",end="",file=OUT)
51+
for k in range(thirdDim): # third dim
52+
for j in range(secondDim): # second dim
53+
for i in range(firstDim): # first dim
54+
print(array[i][j][k],end="",file=OUT)
55+
if(i+1<firstDim): print(",",end="",file=OUT)
56+
if(j+1<secondDim): print(",",end="",file=OUT)
57+
if(k+1<thirdDim): print(",",end="",file=OUT)
58+
print("), .Dim=c(",firstDim,",",secondDim,",",thirdDim,"))",sep="",file=OUT)
59+
60+
def initArray2D(self,dim1,dim2,value):
61+
array=[]
62+
for i in range(dim1):
63+
row=[]
64+
for j in range(dim2):
65+
row.append(value)
66+
array.append(row)
67+
return array
68+
69+
def initArray3D(self,dim1,dim2,dim3,value):
70+
array=[]
71+
for i in range(dim1):
72+
row=[]
73+
for j in range(dim2):
74+
row2=[]
75+
for k in range(dim3):
76+
row2.append(value)
77+
row.append(row2)
78+
array.append(row)
79+
return array
80+
81+
def run(self,numWarmup,numSamples,inputFile,outputFile,stderrFile,initFile=None):
82+
cmd=self.getCmd(numWarmup,numSamples,inputFile,outputFile,stderrFile,initFile)
83+
os.system(cmd)
84+
85+
def getCmd(self,numWarmup,numSamples,inputFile,outputFile,stderrFile,initFile=None):
86+
init=" init="+initFile if initFile is not None else ""
87+
cmd=self.model+" sample thin=1"+\
88+
" num_samples="+str(numSamples)+\
89+
" num_warmup="+str(numWarmup)+\
90+
" data file="+inputFile+\
91+
init+\
92+
" output file="+outputFile+" refresh=0 > "+stderrFile
93+
return cmd
94+

0 commit comments

Comments
 (0)