Skip to content

Commit 9f49643

Browse files
author
Bill Majoros
committedSep 14, 2020
update
1 parent e96761b commit 9f49643

6 files changed

+235
-6
lines changed
 

‎CigarString.py

+11
Original file line numberDiff line numberDiff line change
@@ -28,19 +28,30 @@
2828
# (numMatches,numMismatches)=cigar.longestMatchStats(seq1,seq2)
2929
# # ^ Returns none if no match; must call computeIntervals() first!
3030
# L=cigar.totalAlignmentLength()
31+
# cigar.setOps(ops)
3132
#=========================================================================
3233
class CigarString:
3334
"""CigarString parses CIGAR strings (alignments)"""
3435
def __init__(self,cigar):
3536
self.ops=self.parse(cigar)
3637

38+
def setOps(self,ops):
39+
self.ops=ops
40+
3741
def matchesByLength(self):
3842
matches=[]
3943
for op in self.ops:
4044
if(op.getOp() in ("M","=","X")): matches.append(op)
4145
matches.sort(key=lambda x: -x.getLength())
4246
return matches
4347

48+
def countIndelBases(self):
49+
N=0
50+
for op in self.ops:
51+
c=op.getOp()
52+
if(c=="I" or c=="D"): N+=1
53+
return N
54+
4455
def totalAlignmentLength(self):
4556
L=0
4657
for op in self.ops:

‎SamHSP.py

+85
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
#=========================================================================
2+
# This is OPEN SOURCE SOFTWARE governed by the Gnu General Public
3+
# License (GPL) version 3, as described at www.opensource.org.
4+
# Copyright (C)2016 William H. Majoros (martiandna@gmail.com).
5+
#=========================================================================
6+
from __future__ import (absolute_import, division, print_function,
7+
unicode_literals, generators, nested_scopes, with_statement)
8+
from builtins import (bytes, dict, int, list, object, range, str, ascii,
9+
chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
10+
from CigarString import CigarString
11+
from Interval import Interval
12+
13+
#=========================================================================
14+
# Attributes:
15+
# cigar : CigarString
16+
# refName : string
17+
# readInterval : Interval
18+
# refInterval : Interval
19+
# score : float
20+
# rec : the SamRecord this HSP came from
21+
# Instance Methods:
22+
# hsp=SamHSP(rec) # rec is a SamRecord
23+
# cigar=hsp.getCigar() # returns CigarString object
24+
# refName=hsp.getRefName()
25+
# boolean=hsp.overlapsOnRead(otherHSP)
26+
# boolean=hsp.overlapsOnRef(otherHSP)
27+
# interval=hsp.getReadInterval() # returns Interval object
28+
# interval=hsp.getRefInterval() # returns Interval object
29+
# hsp.computeScore()
30+
# score=hsp.getScore()
31+
# Private Methods:
32+
# self.computeIntervals()
33+
# Class Methods:
34+
# none
35+
#=========================================================================
36+
class SamHSP:
37+
"""SamHSP"""
38+
def __init__(self,rec):
39+
self.cigar=rec.getCigar()
40+
self.refName=rec.getRefName()
41+
self.rec=rec
42+
self.computeIntervals()
43+
self.score=None
44+
45+
def getScore(self):
46+
return self.score
47+
48+
def computeScore(self):
49+
mismatches=self.rec.countMismatches()
50+
matches=self.cigar.totalAlignmentLength()-mismatches
51+
indelBases=self.cigar.countIndelBases()
52+
numerator=matches
53+
denominator=1+mismatches+indelBases
54+
self.score=float(numerator)/float(denominator)
55+
56+
def overlapsOnRead(self,other):
57+
return self.readInterval.overlaps(other.readInterval)
58+
59+
def overlapsOnRef(self,other):
60+
return self.refInterval.overlaps(other.refInterval)
61+
62+
def getCigar(self):
63+
return self.cigar
64+
65+
def getRefName(self):
66+
return self.refName
67+
68+
def getReadInterval(self):
69+
return self.readInterval
70+
71+
def getRefInterval(self):
72+
return self.refInterval
73+
74+
def computeIntervals(self):
75+
cigar=self.cigar
76+
n=cigar.length()
77+
firstOp=cigar[0]
78+
lastOp=cigar[n-1]
79+
self.readInterval=Interval(firstOp.getQueryInterval().getBegin(),
80+
lastOp.getQueryInterval().getEnd())
81+
self.refInterval=Interval(firstOp.getRefInterval().getBegin(),
82+
lastOp.getRefInterval().getEnd())
83+
84+
85+

‎SamHspClusterer.py

+41
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
#=========================================================================
2+
# This is OPEN SOURCE SOFTWARE governed by the Gnu General Public
3+
# License (GPL) version 3, as described at www.opensource.org.
4+
# Copyright (C)2016 William H. Majoros (martiandna@gmail.com).
5+
#=========================================================================
6+
from __future__ import (absolute_import, division, print_function,
7+
unicode_literals, generators, nested_scopes, with_statement)
8+
from builtins import (bytes, dict, int, list, object, range, str, ascii,
9+
chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
10+
11+
#=========================================================================
12+
# Attributes:
13+
#
14+
# Instance Methods:
15+
# clusterer=SamHspClusterer()
16+
# Class Methods:
17+
# clustered=SamHspClusterer.cluster(HSPs)
18+
#=========================================================================
19+
class SamHspClusterer:
20+
"""SamHspClusterer"""
21+
def __init__(self):
22+
pass
23+
24+
@classmethod
25+
def cluster(cls,raw):
26+
HSPs=[x for x in raw]
27+
HSPs.sort(key=lambda hsp: hsp.getReadInterval().getBegin())
28+
for hsp in HSPs: hsp.computeScore()
29+
n=len(HSPs)
30+
i=0
31+
while(i<n-1):
32+
while(HSPs[i].overlapsOnRead(HSPs[i+1])):
33+
if(HSPs[i].getScore()<HSPs[i+1].getScore()):
34+
del HSPs[i]
35+
else:
36+
del HSPs[i+1]
37+
n-=1
38+
if(i>=n-1): break
39+
i+=1
40+
return HSPs
41+

‎SamHspFactory.py

+52
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
#=========================================================================
2+
# This is OPEN SOURCE SOFTWARE governed by the Gnu General Public
3+
# License (GPL) version 3, as described at www.opensource.org.
4+
# Copyright (C)2016 William H. Majoros (martiandna@gmail.com).
5+
#=========================================================================
6+
from __future__ import (absolute_import, division, print_function,
7+
unicode_literals, generators, nested_scopes, with_statement)
8+
from builtins import (bytes, dict, int, list, object, range, str, ascii,
9+
chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
10+
from SamHSP import SamHSP
11+
from SamRecord import SamRecord
12+
from CigarString import CigarString
13+
14+
#=========================================================================
15+
# Attributes:
16+
# keepOps : set of string
17+
# Instance Methods:
18+
# factory=SamHspFactory()
19+
# HSPs=factory.makeHSPs(reads)
20+
# Private Methods:
21+
# cigar=self.processCigar(cigar)
22+
# Class Methods:
23+
# none
24+
#=========================================================================
25+
class SamHspFactory:
26+
"""SamHspFactory"""
27+
def __init__(self):
28+
self.keepOps=set(["M","I","D","=","X"])
29+
30+
def makeHSPs(self,reads):
31+
HSPs=[]
32+
for read in reads:
33+
cigar=read.getCigar()
34+
cigar.computeIntervals(read.getRefPos())
35+
cigar=self.processCigar(cigar)
36+
hsp=SamHSP(read)
37+
HSPs.append(hsp)
38+
return HSPs
39+
40+
def processCigar(self,cigar):
41+
keepOps=self.keepOps
42+
keep=[]
43+
n=cigar.length()
44+
for i in range(n):
45+
op=cigar[i]
46+
#if(op.getOp()=="S" and i>0 and i<n-1):
47+
# raise Exception("Nested soft mask detected in cigar string")
48+
if(op.getOp() in keepOps):
49+
keep.append(op)
50+
new=CigarString("")
51+
new.setOps(keep)
52+
return new

‎SamPairedRead.py

+15
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,13 @@ def totalAlignedLength(self):
3535
L2=self.read2.getCigar().totalAlignmentLength()
3636
return L1+L2
3737

38+
def countIndelBases(self):
39+
cigar1=self.read1.getCigar()
40+
cigar2=self.read2.getCigar()
41+
N1=cigar1.countIndelBases()
42+
N2=cigar2.countIndelBases()
43+
return N1+N2
44+
3845
def numMismatches(self):
3946
mis1=self.read1.countMismatches()
4047
mis2=self.read2.countMismatches()
@@ -46,6 +53,14 @@ def matchProportion(self):
4653
score=float(L-mis)/float(L)
4754
return score
4855

56+
def computeScore(self):
57+
mismatches=self.numMismatches()
58+
matches=self.totalAlignedLength()-mismatches
59+
numerator=matches
60+
indelBases=self.countIndelBases()
61+
denominator=1+mismatches+indelBases
62+
score=float(numerator)/float(denominator)
63+
return score
4964

5065

5166

‎SamPairedReadStream.py

+31-6
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,12 @@
1414
# Attributes:
1515
# reader : SamReader
1616
# dedup : boolean
17-
# buffer : SamRecord
17+
# bufferedRec : SamRecord
18+
# bufferedPair : SamPairedRead
1819
# Instance Methods:
1920
# stream=SamPairedReadStream(filename,dedup=True)
2021
# pair=stream.nextPair() # returns SamPairedRead
22+
# readGroup=stream.nextGroup() # returns array of SamPairedRead
2123
# Class Methods:
2224
# none
2325
#=========================================================================
@@ -26,14 +28,37 @@ class SamPairedReadStream:
2628
def __init__(self,filename,dedup=True):
2729
self.reader=SamReader(filename)
2830
self.dedup=dedup
29-
self.buffer=None
31+
self.bufferedRec=None
32+
self.bufferedPair=None
33+
34+
def nextGroup(self):
35+
group=[]
36+
readID=None
37+
while(True):
38+
pair=None
39+
if(self.bufferedPair is not None):
40+
pair=self.bufferedPair
41+
self.bufferedPair=None
42+
else: pair=self.nextPair()
43+
if(pair is None): return None
44+
if(readID is None):
45+
group.append(pair)
46+
readID=pair.getID()
47+
continue
48+
thisID=pair.getID()
49+
if(thisID==readID):
50+
group.append(pair)
51+
continue
52+
self.bufferedPair=pair
53+
break
54+
return group
3055

3156
def nextPair(self):
3257
while(True):
3358
rec=None
34-
if(self.buffer is not None):
35-
rec=self.buffer
36-
self.buffer=None
59+
if(self.bufferedRec is not None):
60+
rec=self.bufferedRec
61+
self.bufferedRec=None
3762
else: rec=self.reader.nextSequence()
3863
if(rec is None): return None
3964
if(rec.flag_unmapped()): continue
@@ -45,7 +70,7 @@ def nextPair(self):
4570
if(rec.flag_unmapped()): continue
4671
if(self.dedup and rec.flag_PCRduplicate()): continue
4772
if(rec.flag_firstOfPair()):
48-
self.buffer=rec
73+
self.bufferedRec=rec
4974
continue
5075
read2=rec
5176
return SamPairedRead(read1,read2)

0 commit comments

Comments
 (0)
Please sign in to comment.