Skip to content

Commit 7664479

Browse files
author
Bill Majoros
committed
update
1 parent b5aecf6 commit 7664479

10 files changed

+280
-36
lines changed

CigarString.py

+31
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,10 @@
2222
# cigarOp=cigar[i] # returns a CigarOp object
2323
# str=cigar.toString()
2424
# cigar.computeIntervals(refPos)
25+
# op=cigar.longestMatch() # returns a CigarOp object (or None)
26+
# L=cigar.longestMatchLen() # returns integer
27+
# (numMatches,numMismatches)=cigar.longestMatchStats(seq1,seq2) # None if no match
28+
# ^ must call computeIntervals() first!
2529
#=========================================================================
2630
class CigarString:
2731
"""CigarString parses CIGAR strings (alignments)"""
@@ -52,6 +56,33 @@ def completeMatch(self):
5256
ops=self.ops
5357
return len(ops)==1 and ops[0].op=="M"
5458

59+
def longestMatchStats(self,query,ref):
60+
m=self.longestMatch()
61+
if(m is None): return None
62+
sub1=query[m.interval1.getBegin():m.interval1.getEnd()]
63+
sub2=ref[m.interval2.getBegin():m.interval2.getEnd()]
64+
matches=0; mismatches=0
65+
for i in range(len(sub1)):
66+
if(sub1[i]==sub2[i]): matches+=1
67+
else: mismatches+=1
68+
return (matches,mismatches)
69+
70+
def longestMatchLen(self):
71+
m=self.longestMatch()
72+
if(m is None): return 0
73+
return m.getLength()
74+
75+
def longestMatch(self):
76+
longest=None
77+
longestLength=0
78+
for op in self.ops:
79+
if(op.getOp()=="M"):
80+
L=op.getLength()
81+
if(L>longestLength):
82+
longest=op
83+
longestLength=L
84+
return longest
85+
5586
def toString(self):
5687
ops=self.ops
5788
s=""

DataFrame.py

+11-3
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
# n=df.nrow()
2525
# n=df.ncol()
2626
# row=df[index]
27+
# rows=df.getRows()
2728
# elem=df[i][j]
2829
# df.toInt()
2930
# df.toFloat()
@@ -36,6 +37,7 @@
3637
# col=df.getColumn(columnName) # call hashColNames() first!
3738
# bool=df.rowExists(rowName) # call hashRowNames() first!
3839
# bool=df.columnExists(colName) # call hashColNames() first!
40+
# index=df.getColumnIndex(colName) # call hashColNames() first!
3941
# newDataFrame=df.subsetColumns(colIndices)
4042
# idx=df.addColumn(colName,defaultValue) # returns index of new column
4143
# df.print(handle)
@@ -54,6 +56,9 @@ def __init__(self):
5456
def addRow(self,row):
5557
self.matrix.append(row)
5658

59+
def getRows(self):
60+
return self.matrix
61+
5762
def toDataArray(self):
5863
array=[]
5964
for row in self.matrix:
@@ -88,6 +93,9 @@ def rowExists(self,rowName):
8893
if(self.rowHash is None): raise Exception("call hashRowNames() first")
8994
return self.rowHash.get(rowName,None) is not None
9095

96+
def getColumnIndex(self,colName):
97+
return self.colHash.get(colName)
98+
9199
def columnExists(self,colName):
92100
if(self.colHash is None): raise Exception("call hashColNames() first")
93101
return self.colHash.get(colName,None) is not None
@@ -104,7 +112,7 @@ def getColumnNames(self):
104112
def getRowI(self,rowIndex):
105113
return self.matrix[rowIndex]
106114

107-
def getColumnI(self,colIndex):
115+
def getColI(self,colIndex):
108116
column=DataFrameRow()
109117
for row in self.matrix:
110118
column.values.append(row[colIndex])
@@ -123,7 +131,7 @@ def getColumn(self,colName):
123131
column=DataFrameRow()
124132
column.label=colName
125133
for row in self.matrix:
126-
colum.values.append(row[colIndex])
134+
column.values.append(row[colIndex])
127135
return column
128136

129137
def hashRowNames(self):
@@ -137,7 +145,7 @@ def hashColNames(self):
137145
h=self.colHash={}
138146
numCols=self.ncol()
139147
for i in range(numCols):
140-
h[header[i]]=i
148+
h[self.header[i]]=i
141149

142150
def getHeader(self):
143151
return self.header

FastqReader.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
# fh : file handle
1717
# Instance Methods:
1818
# reader=FastqReader(filename)
19-
# [ID,seq,qual]=reader.nextSequence() # returns None at EOF
19+
# (ID,seq,qual,pair)=reader.nextSequence() # returns None at EOF
2020
# reader.close()
2121
# Class Methods:
2222
#=========================================================================
@@ -44,7 +44,7 @@ def nextSequence(self):
4444
seq=fh.readline().rstrip()
4545
junk=fh.readline()
4646
qual=fh.readline().rstrip()
47-
return [ID,seq,qual,pair]
47+
return (ID,seq,qual,pair)
4848

4949

5050

Interval.py

+28
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
from builtins import (bytes, dict, int, list, object, range, str, ascii,
99
chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
1010
import sys
11+
from Rex import Rex
12+
rex=Rex()
1113

1214
#=========================================================================
1315
# Attributes:
@@ -21,6 +23,7 @@
2123
# bool=interval.contains(position)
2224
# bool=interval.containsInterval(other)
2325
# distance=interval.distance(other)
26+
# distance=interval.distanceFromPoint(x)
2427
# intersection=interval.intersect(other)
2528
# union=interval.union(other) # returns an array of intervals
2629
# diff=interval.minus(other) # returns an array of intervals
@@ -37,6 +40,9 @@
3740
# center=interval.floatCenter()
3841
# center=interval.intCenter()
3942
# center=interval.center() # same as floatCenter()
43+
# Class methods:
44+
# interval=Interval.parseInt("(10,15)")
45+
# interval=Interval.parseFloat("(3.5,7.2)")
4046
#=========================================================================
4147

4248
class Interval:
@@ -47,6 +53,22 @@ def __init__(self,begin=0,end=0):
4753
self.begin=begin
4854
self.end=end
4955

56+
@classmethod
57+
def parseInt(cls,interval):
58+
if(rex.find("\(([^,]+),([^\)]+)\)",interval)):
59+
return Interval(int(rex[1]),int(rex[2]))
60+
if(rex.find("([^,]+):([^\)]+)",interval)):
61+
return Interval(int(rex[1]),int(rex[2]))
62+
if(rex.find("([^,]+)-([^\)]+)",interval)):
63+
return Interval(int(rex[1]),int(rex[2]))
64+
return None
65+
66+
@classmethod
67+
def parseFloat(cls,interval):
68+
if(rex.find("\(([^,]+),([^\)]+)\)",interval)):
69+
return Interval(float(rex[1]),float(rex[2]))
70+
return None
71+
5072
def print(self,file=sys.stdout):
5173
print("(",self.begin,",",self.end,")",sep="",end="",file=file)
5274

@@ -56,6 +78,12 @@ def toString(self):
5678
def overlaps(self,other):
5779
return self.begin<other.end and other.begin<self.end
5880

81+
def distanceFromPoint(self,x):
82+
if(self.contains(x)): return 0
83+
d1=abs(self.begin-x)
84+
d2=abs(self.end-x)
85+
return d1 if d1<d2 else d2
86+
5987
def distance(self,other):
6088
if(self.overlaps(other)): return 0
6189
d=self.begin-other.end

SamReader.py

+13-2
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,18 @@
1616
#=========================================================================
1717
# Attributes:
1818
# fh : file handle
19+
# headerLines : array of header lines
1920
# Instance Methods:
2021
# reader=SamReader(filename)
2122
# samRecord=reader.nextSequence() # returns None at EOF
23+
# (record,line)=reader.nextSeqAndText() # returns None at EOF
2224
# reader.close()
2325
# Class Methods:
2426
#=========================================================================
2527
class SamReader:
2628
"""SamReader"""
2729
def __init__(self,filename):
30+
self.headerLines=[]
2831
if(filename is not None):
2932
if(rex.find("\.gz$",filename)): self.fh=gzip.open(filename,"rt")
3033
else: self.fh=open(filename,"r")
@@ -33,10 +36,17 @@ def close(self):
3336
self.fh.close()
3437

3538
def nextSequence(self):
39+
pair=self.nextSeqAndText()
40+
if(pair is None): return None
41+
(rec,line)=pair
42+
return rec
43+
44+
def nextSeqAndText(self):
3645
fh=self.fh
3746
line=fh.readline()
3847
if(line is None): return None
3948
while(line is not None and len(line)>0 and line[0]=="@"):
49+
if(line[0]=="@"): self.headerLines.append(line)
4050
line=fh.readline()
4151
if(line is None or len(line)==0): return None
4252
fields=line.rstrip().split()
@@ -46,8 +56,9 @@ def nextSequence(self):
4656
refPos=int(refPos)-1 # convert 1-based to 0-based
4757
flags=int(flags)
4858
CIGAR=CigarString(cigar)
49-
rec=SamRecord(ID,refName,refPos,CIGAR,seq,flags)
50-
return rec
59+
tags=fields[11:]
60+
rec=SamRecord(ID,refName,refPos,CIGAR,seq,flags,tags)
61+
return (rec,line)
5162

5263
# M03884:303:000000000-C4RM6:1:1101:1776:15706 99 chrX:31786371-31797409 6687 44 150M = 6813 271 ATACTATTGCTGCGGTAATAACTGTAACTGCAGTTACTATTTAGTGATTTGTATGTAGATGTAGATGTAGTCTATGTCAGACACTATGCTGAGCATTTTATGGTTGCTATGTACTGATACATACAGAAACAAGAGGTACGTTCTTTTACA BBBBFFFFFFFGGGGGEFGGFGHFHFFFHHHFFHHHFHFHHHGFHEDGGHFHBGFHGBDHFHFFFHHHHFHHHHHGHGFFBGGGHFHFFHHFFFFHHHHGHGFHHGFHGHHHGFHFFHHFHHFFGFFFFGGEHFFEHHFGHHHGHHHHFB AS:i:300 XN:i:0
5364

SamRecord.py

+37-3
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
unicode_literals, generators, nested_scopes, with_statement)
88
from builtins import (bytes, dict, int, list, object, range, str, ascii,
99
chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
10+
from Rex import Rex
11+
rex=Rex()
1012

1113
#=========================================================================
1214
# Attributes:
@@ -16,13 +18,17 @@
1618
# CIGAR = CigarString
1719
# seq = read sequence
1820
# flags = bitfield
21+
# tags = array of tags at end of record (MD:Z:122G25, NM:i:1, etc.)
1922
# Instance Methods:
20-
# rec=SamReader(ID,refName,refPos,cigar,seq,flags)
23+
# rec=SamRecord(ID,refName,refPos,cigar,seq,flags,tags)
2124
# ID=rec.getID()
22-
# cigar=rec.getCigar()
25+
# cigar=rec.getCigar() # returns CigarString object
2326
# seq=rec.getSequence()
2427
# refName=rec.getRefName()
2528
# refPos=rec.getRefPos()
29+
# tags=rec.getTags()
30+
# fields=rec.parseMDtag()
31+
# tag=getTag("MD") # returns the third field, e.g. "122G25" in MD:Z:122G25
2632
# bool=rec.flag_hasMultipleSegments()
2733
# bool=rec.flag_properlyAligned()
2834
# bool=rec.flag_unmapped()
@@ -39,13 +45,41 @@
3945
#=========================================================================
4046
class SamRecord:
4147
"""SamRecord"""
42-
def __init__(self,ID,refName,refPos,CIGAR,seq,flags):
48+
def __init__(self,ID,refName,refPos,CIGAR,seq,flags,tags):
4349
self.ID=ID
4450
self.refName=refName
4551
self.refPos=refPos
4652
self.CIGAR=CIGAR
4753
self.seq=seq
4854
self.flags=flags
55+
self.tags=tags
56+
57+
def getTags(self):
58+
return self.tags
59+
60+
def getTag(self,label):
61+
for tag in self.tags:
62+
if(not rex.find("^([^:]+):[^:]+:(\S+)",tag)):
63+
raise Exception("Can't parse SAM tag: "+tag)
64+
if(rex[1]==label): return rex[2]
65+
return None
66+
67+
def parseMDtag(self):
68+
md=self.getTag("MD")
69+
fields=[]
70+
while(len(md)>0):
71+
if(rex.find("^(\d+)(.*)",md)):
72+
fields.append(rex[1])
73+
md=rex[2]
74+
elif(rex.find("^([ACGT])(.*)",md)):
75+
fields.append(rex[1])
76+
md=rex[2]
77+
elif(rex.find("^(\^[ACGT]+)(.*)",md)):
78+
fields.append(rex[1])
79+
md=rex[2]
80+
else:
81+
raise Exception("Can't parse MD tag: "+md)
82+
return fields
4983

5084
def getRefName(self):
5185
return self.refName

SmithWaterman.py

+67
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
#!/usr/bin/env python
2+
#=========================================================================
3+
# This is OPEN SOURCE SOFTWARE governed by the Gnu General Public
4+
# License (GPL) version 3, as described at www.opensource.org.
5+
# Author: William H. Majoros (bmajoros@alumni.duke.edu)
6+
#=========================================================================
7+
from __future__ import (absolute_import, division, print_function,
8+
unicode_literals, generators, nested_scopes, with_statement)
9+
from builtins import (bytes, dict, int, list, object, range, str, ascii,
10+
chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
11+
# The above imports should allow this program to run in both Python 2 and
12+
# Python 3. You might need to update your version of module "future".
13+
import os
14+
from Pipe import Pipe
15+
from Rex import Rex
16+
rex=Rex()
17+
import TempFilename
18+
from CigarString import CigarString
19+
from FastaWriter import FastaWriter
20+
21+
#=========================================================================
22+
# Attributes:
23+
#
24+
# Instance Methods:
25+
# aligner=SmithWaterman(alignerDir,matrixFile,openPenalty,extrendPenalty)
26+
# cigarString=aligner.align(seq1,seq2)
27+
#=========================================================================
28+
29+
class SmithWaterman:
30+
def __init__(self,alignerDir,matrixFile,gapOpenPenalty,gapExtendPenalty):
31+
self.alignerDir=alignerDir
32+
self.matrixFile=matrixFile
33+
self.gapOpen=gapOpenPenalty
34+
self.gapExtend=gapExtendPenalty
35+
self.fastaWriter=FastaWriter()
36+
37+
def writeFile(self,defline,seq):
38+
filename=TempFilename.generate("fasta")
39+
self.fastaWriter.writeFasta(defline,seq,filename)
40+
return filename
41+
42+
def swapInsDel(self,cigar):
43+
# This is done because my aligner defines insertions and deletions
44+
# opposite to how they're defined in the SAM specification
45+
newCigar=""
46+
for x in cigar:
47+
if(x=="I"): x="D"
48+
elif(x=="D"): x="I"
49+
newCigar+=x
50+
return newCigar
51+
52+
def align(self,seq1,seq2):
53+
file1=self.writeFile("query",seq1)
54+
file2=self.writeFile("reference",seq2)
55+
cmd=self.alignerDir+"/smith-waterman -q "+self.matrixFile+" "+\
56+
str(self.gapOpen)+" "+str(self.gapExtend)+" "+file1+" "+file2+" DNA"
57+
output=Pipe.run(cmd)
58+
os.remove(file1)
59+
os.remove(file2)
60+
if(not rex.find("CIGAR=(\S+)",output)):
61+
raise Exception("Can't parse aligner output: "+output)
62+
cigar=rex[1]
63+
cigar=self.swapInsDel(cigar) # because I define cigars differently
64+
return CigarString(cigar)
65+
66+
67+

0 commit comments

Comments
 (0)