Skip to content

Commit b5aecf6

Browse files
author
Bill Majoros
committed
update
1 parent 9a9f0e5 commit b5aecf6

7 files changed

+160
-11
lines changed

CigarOp.py

+35-1
Original file line numberDiff line numberDiff line change
@@ -7,16 +7,50 @@
77
unicode_literals, generators, nested_scopes, with_statement)
88
from builtins import (bytes, dict, int, list, object, range, str, ascii,
99
chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
10+
from Interval import Interval
11+
12+
# CIGAR operation codes that advance the position in the query (read)
ADVANCE_QUERY={"M","I","S","=","X"}
# CIGAR operation codes that advance the position in the reference
ADVANCE_REF={"M","D","N","=","X"}
1014

1115
#=========================================================================
1216
# Attributes:
13-
# op : M/I/D/S
1417
# length : integer
18+
# interval1 : Interval (in sequence 1 = query)
19+
# interval2 : Interval (in sequence 2 = reference)
20+
# op : one of M/I/D/N/S/H/P/=/X:
21+
# consumes
22+
# query ref
23+
# M 0 alignment match (can be a sequence match or mismatch) yes yes
24+
# I 1 insertion to the reference yes no
25+
# D 2 deletion from the reference no yes
26+
# N 3 skipped region from the reference no yes
27+
# S 4 soft clipping (clipped sequences present in SEQ) yes no
28+
# H 5 hard clipping (clipped sequences NOT present in SEQ) no no
29+
# P 6 padding (silent deletion from padded reference) no no
30+
# = 7 sequence match yes yes
31+
# X 8 sequence mismatch yes yes
1532
# Instance Methods:
1633
# op=CigarOp("M",135)
34+
# bool=op.advanceInQuery() # matches, insertions, etc.
35+
# bool=op.advanceInRef() # matches, deletions, etc.
36+
# op=op.getOp()
37+
# L=op.getLength()
1738
#=========================================================================
1839
class CigarOp:
    """A single CIGAR operation: an operation code plus its length."""

    def __init__(self,op,L):
        """Store the operation code `op` and its length `L`.

        interval1 (query) and interval2 (reference) start out as None;
        CigarString.computeIntervals() assigns them later.
        """
        self.op=op
        self.length=L
        # Initialise the documented attributes so reading them before
        # computeIntervals() yields None instead of raising AttributeError
        self.interval1=None
        self.interval2=None
        # Legacy attribute from the earlier constructor; nothing visible
        # assigns it, kept only so existing readers do not break
        self.interval=None

    def getOp(self):
        """Return the operation code."""
        return self.op

    def getLength(self):
        """Return the length of the operation."""
        return self.length

    def advanceInQuery(self):
        """Return True if the operation code is in ADVANCE_QUERY."""
        return self.op in ADVANCE_QUERY

    def advanceInRef(self):
        """Return True if the operation code is in ADVANCE_REF."""
        return self.op in ADVANCE_REF
2256

CigarString.py

+17
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from Rex import Rex
1111
rex=Rex()
1212
from CigarOp import CigarOp
13+
from Interval import Interval
1314

1415
#=========================================================================
1516
# Attributes:
@@ -19,12 +20,28 @@
1920
# bool=cigar.completeMatch()
2021
# numOps=cigar.length()
2122
# cigarOp=cigar[i] # returns a CigarOp object
23+
# str=cigar.toString()
24+
# cigar.computeIntervals(refPos)
2225
#=========================================================================
2326
class CigarString:
2427
"""CigarString parses CIGAR strings (alignments)"""
2528
def __init__(self,cigar):
2629
self.ops=self.parse(cigar)
2730

31+
def computeIntervals(self,refPos):
    """Assign interval1 (query) and interval2 (reference) to every op.

    Query coordinates start at 0 and reference coordinates at refPos.
    Each op's interval begins where the previous op's interval ended and
    is extended by the op's length only in the sequences that op
    advances in; otherwise the interval is empty (begin==end).
    """
    queryBegin=0
    refBegin=refPos
    for cigarOp in self.ops:
        span=cigarOp.getLength()
        queryEnd=queryBegin+span if cigarOp.advanceInQuery() else queryBegin
        refEnd=refBegin+span if cigarOp.advanceInRef() else refBegin
        cigarOp.interval1=Interval(queryBegin,queryEnd)
        cigarOp.interval2=Interval(refBegin,refEnd)
        # the next op starts where this one stopped
        queryBegin,refBegin=queryEnd,refEnd
44+
2845
def length(self):
    """Return the number of operations held in self.ops."""
    opList=self.ops
    return len(opList)
3047

DataFrame.py

+57-7
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
# df=DataFrame()
2121
# rowNames=df.getRowNames()
2222
# colNames=df.getColumnNames()
23+
# df.addRow(DataFrameRow)
2324
# n=df.nrow()
2425
# n=df.ncol()
2526
# row=df[index]
@@ -29,12 +30,18 @@
2930
# header=df.getHeader()
3031
# df.hashRowNames()
3132
# df.hashColNames()
33+
# row=df.getRowI(i)
34+
# col=df.getColumnI(i)
3235
# row=df.getRow(rowName) # call hashRowNames() first!
3336
# col=df.getColumn(columnName) # call hashColNames() first!
3437
# bool=df.rowExists(rowName) # call hashRowNames() first!
3538
# bool=df.columnExists(colName) # call hashColNames() first!
39+
# newDataFrame=df.subsetColumns(colIndices)
40+
# idx=df.addColumn(colName,defaultValue) # returns index of new column
41+
# df.print(handle)
42+
# array=df.toDataArray()
3643
# Class methods:
37-
# df=DataFrame.readTable(filename,hasHeader=True,hasRowNames=True)
44+
# df=DataFrame.readTable(filename,header=False,rowNames=False)
3845
#=========================================================================
3946

4047
class DataFrame:
@@ -44,6 +51,39 @@ def __init__(self):
4451
self.rowHash=None
4552
self.colHash=None
4653

54+
def addRow(self,row):
    """Append `row` to the end of the frame's matrix."""
    rows=self.matrix
    rows.append(row)
56+
57+
def toDataArray(self):
    """Return a new list holding each row's `values` object, in row order.

    The inner value lists are not copied; they are the same objects the
    rows hold.
    """
    return [row.values for row in self.matrix]
62+
63+
def print(self,handle):
    """Write the tab-joined header line, then every row, to `handle`.

    Each row is written by calling its own print(handle) method.
    """
    headerLine="\t".join(self.header)
    print(headerLine,file=handle)
    for dataRow in self.matrix:
        dataRow.print(handle)
66+
67+
def addColumn(self,name,defaultValue):
    """Append a column called `name` and return its index.

    Every existing row gets `defaultValue` appended (the same object is
    appended to each row, not a copy).
    """
    newIndex=len(self.header) # index the new column will occupy
    self.header.append(name)
    for dataRow in self.matrix:
        dataRow.append(defaultValue)
    return newIndex
73+
74+
def subsetColumns(self,colIndices):
    """Return a new DataFrame holding only the columns at `colIndices`.

    Columns appear in the order given by colIndices.  Each new row
    takes its label from the corresponding source row's getLabel().
    colIndices is iterated once for the header and once per row.
    """
    subset=DataFrame()
    subset.header.extend(self.header[i] for i in colIndices)
    for rowIndex in range(self.nrow()):
        sourceRow=self[rowIndex]
        pickedRow=DataFrameRow()
        pickedRow.rename(sourceRow.getLabel())
        pickedRow.values.extend(sourceRow[j] for j in colIndices)
        subset.matrix.append(pickedRow)
    return subset
86+
4787
def rowExists(self,rowName):
    """Return True if `rowName` has a non-None entry in the row hash.

    Raises Exception when the row hash has not been built yet
    (self.rowHash is None).
    """
    lookup=self.rowHash
    if lookup is None:
        raise Exception("call hashRowNames() first")
    entry=lookup.get(rowName)
    return entry is not None
@@ -59,7 +99,16 @@ def getRowNames(self):
5999
return names
60100

61101
def getColumnNames(self):
    """Return the frame's header list (the list itself, not a copy)."""
    # Must go through self: a bare `header` is an undefined name here
    return self.header
103+
104+
def getRowI(self,rowIndex):
    """Return the row stored at position `rowIndex` of the matrix."""
    rows=self.matrix
    return rows[rowIndex]
106+
107+
def getColumnI(self,colIndex):
    """Return column `colIndex` as a new, unlabeled DataFrameRow.

    The result's values hold element colIndex of every row, in row order.
    """
    column=DataFrameRow()
    column.values.extend(row[colIndex] for row in self.matrix)
    return column
63112

64113
def getRow(self,rowName):
65114
if(self.rowHash is None): raise Exception("call hashRowNames() first")
@@ -75,6 +124,7 @@ def getColumn(self,colName):
75124
column.label=colName
76125
for row in self.matrix:
77126
colum.values.append(row[colIndex])
127+
return column
78128

79129
def hashRowNames(self):
80130
h=self.rowHash={}
@@ -108,17 +158,17 @@ def toFloat(self):
108158
for row in self.matrix: row.toFloat()
109159

110160
@classmethod
111-
def readTable(cls,filename,hasHeader=True,hasRowNames=True):
161+
def readTable(cls,filename,header=False,rowNames=False):
112162
df=DataFrame()
113163
with open(filename,"rt") as IN:
114-
if(hasHeader):
164+
if(header):
115165
df.header=IN.readline()
116-
df.header=df.header.rstrip().split()
166+
df.header=df.header.rstrip().split("\t")
117167
for line in IN:
118-
fields=line.rstrip().split()
168+
fields=line.rstrip().split("\t")
119169
if(len(fields)<1): continue
120170
label=""
121-
if(hasRowNames):
171+
if(rowNames):
122172
label=fields[0]
123173
fields=fields[1:]
124174
row=DataFrameRow()

DataFrameRow.py

+11
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
# n=row.length()
2222
# row.toInt()
2323
# row.toFloat()
24+
# row.append(value)
25+
# row.print(handle)
2426
#=========================================================================
2527

2628
class DataFrameRow:
@@ -31,6 +33,15 @@ def __init__(self):
3133
def __getitem__(self,i):
3234
return self.values[i]
3335

36+
def __setitem__(self,i,value):
37+
self.values[i]=value
38+
39+
def print(self,handle):
    """Write this row to `handle` as one tab-separated line.

    The line is the label, a tab, the values (each converted with str())
    joined by tabs, and a newline.

    handle : object with a write() method (e.g. an open text file)
    """
    fields="\t".join([str(x) for x in self.values])
    # Send the line to the caller's handle; previously the handle was
    # ignored and the row always went to standard output
    handle.write(self.label+"\t"+fields+"\n")
41+
42+
def append(self,value):
    """Add `value` to the end of this row's values."""
    cells=self.values
    cells.append(value)
44+
3445
def length(self):
    """Return how many values this row holds."""
    cells=self.values
    return len(cells)
3647

Interval.py

+4
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
# end
1616
# Methods:
1717
# i=Interval(begin,end)
18+
# s=interval.toString()
1819
# print(file=STDOUT)
1920
# bool=interval.overlaps(other)
2021
# bool=interval.contains(position)
@@ -49,6 +50,9 @@ def __init__(self,begin=0,end=0):
4950
def print(self,file=None):
    """Write this interval as "(begin,end)" to `file`, with no newline.

    file : object with a write() method; when None (the default) the
           current sys.stdout is used.
    """
    # Resolve stdout at call time: a `file=sys.stdout` default is bound
    # once at definition time and would ignore later redirection
    if file is None: file=sys.stdout
    file.write("("+str(self.begin)+","+str(self.end)+")")
5152

53+
def toString(self):
    """Return this interval formatted as "(begin,end)"."""
    pieces=["(",str(self.begin),",",str(self.end),")"]
    return "".join(pieces)
55+
5256
def overlaps(self,other):
    """Return True if this interval and `other` overlap.

    Both strict comparisons must hold: self.begin<other.end and
    other.begin<self.end, so intervals that merely touch do not overlap.
    """
    if not self.begin<other.end:
        return False
    return other.begin<self.end
5458

SamReader.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
# fh : file handle
1919
# Instance Methods:
2020
# reader=SamReader(filename)
21-
# [ID,seq,qual]=reader.nextSequence() # returns None at EOF
21+
# samRecord=reader.nextSequence() # returns None at EOF
2222
# reader.close()
2323
# Class Methods:
2424
#=========================================================================
@@ -43,7 +43,7 @@ def nextSequence(self):
4343
if(len(fields)<11): raise Exception("can't parse sam line: "+line)
4444
(ID,flags,refName,refPos,mapQual,cigar,rnext,pnext,templateLen,
4545
seq,qual)=fields[:11]
46-
refPos=int(refPos)
46+
refPos=int(refPos)-1 # convert 1-based to 0-based
4747
flags=int(flags)
4848
CIGAR=CigarString(cigar)
4949
rec=SamRecord(ID,refName,refPos,CIGAR,seq,flags)

SamRecord.py

+34-1
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,26 @@
1515
# refPos = position in reference where alignment begins
1616
# CIGAR = CigarString
1717
# seq = read sequence
18+
# flags = bitfield
1819
# Instance Methods:
19-
# rec=SamReader(ID,refName,refPos,cigar,seq)
20+
# rec=SamRecord(ID,refName,refPos,cigar,seq,flags)
21+
# ID=rec.getID()
22+
# cigar=rec.getCigar()
23+
# seq=rec.getSequence()
24+
# refName=rec.getRefName()
25+
# refPos=rec.getRefPos()
26+
# bool=rec.flag_hasMultipleSegments()
27+
# bool=rec.flag_properlyAligned()
28+
# bool=rec.flag_unmapped()
29+
# bool=rec.flag_nextSegmentUnmapped()
30+
# bool=rec.flag_revComp()
31+
# bool=rec.flag_nextSegmentRevComp()
32+
# bool=rec.flag_firstOfPair()
33+
# bool=rec.flag_secondOfPair()
34+
# bool=rec.flag_secondaryAlignment()
35+
# bool=rec.flag_failedFilters()
36+
# bool=rec.flag_PCRduplicate()
37+
# bool=rec.flag_supplAlignment()
2038
# Class Methods:
2139
#=========================================================================
2240
class SamRecord:
@@ -29,6 +47,21 @@ def __init__(self,ID,refName,refPos,CIGAR,seq,flags):
2947
self.seq=seq
3048
self.flags=flags
3149

50+
def getRefName(self):
    """Return the reference name stored in this record."""
    name=self.refName
    return name
52+
53+
def getRefPos(self):
    """Return the reference position stored in this record."""
    position=self.refPos
    return position
55+
56+
def getCigar(self):
    """Return the CIGAR object stored in this record (self.CIGAR)."""
    cigar=self.CIGAR
    return cigar
58+
59+
def getID(self):
    """Return the identifier stored in this record."""
    identifier=self.ID
    return identifier
61+
62+
def getSequence(self):
    """Return the sequence stored in this record (self.seq)."""
    sequence=self.seq
    return sequence
64+
3265
def flag_hasMultipleSegments(self):
    """Return True if bit 0x1 of the flags bitfield is set."""
    maskedBit=self.flags & 0x1
    return maskedBit!=0
3467

0 commit comments

Comments
 (0)