Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve mpileup overlap removal #1751

Merged
merged 2 commits into from
Feb 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 45 additions & 7 deletions sam.c
Original file line number Diff line number Diff line change
Expand Up @@ -5706,8 +5706,7 @@ static int tweak_overlap_quality(bam1_t *a, bam1_t *b)
// Loop over the overlapping region nulling qualities in either
// seq a or b.
int err = 0;
while ( 1 )
{
while ( 1 ) {
// Step to next matching reference position in a and b
while ( a_ret >= 0 && a_iref>=0 && a_iref < iref - a->core.pos )
a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max,
Expand All @@ -5716,8 +5715,6 @@ static int tweak_overlap_quality(bam1_t *a, bam1_t *b)
err = a_ret<-1?-1:0;
break;
}
if ( iref < a_iref + a->core.pos )
iref = a_iref + a->core.pos;

while ( b_ret >= 0 && b_iref>=0 && b_iref < iref - b->core.pos )
b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig,
Expand All @@ -5726,14 +5723,55 @@ static int tweak_overlap_quality(bam1_t *a, bam1_t *b)
err = b_ret<-1?-1:0;
break;
}

if ( iref < a_iref + a->core.pos )
iref = a_iref + a->core.pos;

if ( iref < b_iref + b->core.pos )
iref = b_iref + b->core.pos;

iref++;

if ( a_iref+a->core.pos != b_iref+b->core.pos )
// only CMATCH positions, don't know what to do with indels
continue;
// If A or B has a deletion then we catch up the other to this point.
// We also amend quality values using the same rules for mismatch.
if (a_iref+a->core.pos != b_iref+b->core.pos) {
if (a_iref+a->core.pos < b_iref+b->core.pos
&& b_cigar > bam_get_cigar(b)
&& bam_cigar_op(b_cigar[-1]) == BAM_CDEL) {
// Del in B means it's moved on further than A
do {
a_qual[a_iseq] = amul
? a_qual[a_iseq]*0.8
: 0;
a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max,
&a_icig, &a_iseq, &a_iref);
if (a_ret < 0)
return -(a_ret<-1); // 0 or -1
} while (a_iref + a->core.pos < b_iref+b->core.pos);
} else if (a_cigar > bam_get_cigar(a)
&& bam_cigar_op(a_cigar[-1]) == BAM_CDEL) {
// Del in A means it's moved on further than B
do {
b_qual[b_iseq] = bmul
? b_qual[b_iseq]*0.8
: 0;
b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max,
&b_icig, &b_iseq, &b_iref);
if (b_ret < 0)
return -(b_ret<-1); // 0 or -1
} while (b_iref + b->core.pos < a_iref+a->core.pos);
} else {
// Anything else, eg ref-skip, we don't support here
continue;
}
}

// fprintf(stderr, "a_cig=%ld,%ld b_cig=%ld,%ld iref=%ld "
// "a_iref=%ld b_iref=%ld a_iseq=%ld b_iseq=%ld\n",
// a_cigar-bam_get_cigar(a), a_icig,
// b_cigar-bam_get_cigar(b), b_icig,
// iref, a_iref+a->core.pos+1, b_iref+b->core.pos+1,
// a_iseq, b_iseq);

if (a_iseq > a->core.l_qseq || b_iseq > b->core.l_qseq)
// Fell off end of sequence, bad CIGAR?
Expand Down
20 changes: 10 additions & 10 deletions test/mpileup/c1#pad1.out
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
c1 1 9 ^!A^!A^!A^!A^!A^!A^!A^!A^!A
c1 2 9 AAAAAAAAA-3()
c1 3 9 CCCCCCCC*
c1 4 9 CCCCCCCC-1()*
c1 5 9 GGGG+6(GTTAAC)G+6(*TTAA*)G+6(GTT***)G+6(***AAC)*+6(**TA**)-1()*+6(GTTAAC)-3()
c1 6 9 CCCCCCC**
c1 7 9 GGGGGGGG*
c1 8 9 GGGGGGGG*
c1 9 9 TTTTTTTTT
c1 10 9 T$T$T$T$T$T$T$T$T$
c1 1 9 ^!A^!A^!A^!A^!A^!A^!A^!A^!A ~~~~~~~~~
c1 2 9 AAAAAAAAA-3() ~~~~~~~~~
c1 3 9 CCCCCCCC* ~~~~~~~~~
c1 4 9 CCCCCCCC-1()* ~~~~~~~~~
c1 5 9 GGGG+6(GTTAAC)G+6(*TTAA*)G+6(GTT***)G+6(***AAC)*+6(**TA**)-1()*+6(GTTAAC)-3() ~~~~~~~~~
c1 6 9 CCCCCCC** ~~~~~~~~~
c1 7 9 GGGGGGGG* ~~~~~~~~~
c1 8 9 GGGGGGGG* ~~~~~~~~~
c1 9 9 TTTTTTTTT ~~~~~~~~~
c1 10 9 T$T$T$T$T$T$T$T$T$ ~~~~~~~~~
20 changes: 10 additions & 10 deletions test/mpileup/c1#pad2.out
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
c1 1 12 ^!A^!A^!A^!A^!A^!A^!A^!A^!A^!A^!A^!*
c1 2 12 AAAAAAAAAA-3()A*
c1 3 12 CCCCCCCCC*C*
c1 4 12 CCCCCCCCC-1()*C-2()*
c1 5 13 GGGGG+6(GTTAAC)G+6(*TTAA*)G+6(GTT***)G+6(***AAC)*+6(**TA**)-1()*+6(GTTAAC)-3()**+6(**TA**)-5()^!G+6(**TA**)$
c1 6 12 CCCCCCCC****
c1 7 12 GGGGGGGGG*G*
c1 8 12 GGGGGGGGG*G*
c1 9 12 TTTTTTTTTTT*
c1 10 12 T$T$T$T$T$T$T$T$T$T$T$*$
c1 1 12 ^!A^!A^!A^!A^!A^!A^!A^!A^!A^!A^!A^!* ~~~~~~~~~~~~
c1 2 12 AAAAAAAAAA-3()A* ~~~~~~~~~~~~
c1 3 12 CCCCCCCCC*C* ~~~~~~~~~~~~
c1 4 12 CCCCCCCCC-1()*C-2()* ~~~~~~~~~~~~
c1 5 13 GGGGG+6(GTTAAC)G+6(*TTAA*)G+6(GTT***)G+6(***AAC)*+6(**TA**)-1()*+6(GTTAAC)-3()**+6(**TA**)-5()^!G+6(**TA**)$ ~~~~~~~~~~~~~
c1 6 12 CCCCCCCC**** ~~~~~~~~~~~~
c1 7 12 GGGGGGGGG*G* ~~~~~~~~~~~~
c1 8 12 GGGGGGGGG*G* ~~~~~~~~~~~~
c1 9 12 TTTTTTTTTTT* ~~~~~~~~~~~~
c1 10 12 T$T$T$T$T$T$T$T$T$T$T$*$ ~~~~~~~~~~~~
10 changes: 5 additions & 5 deletions test/mpileup/c1#pad3.out
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
c1 6 11 ^!C^!C^!C^!C^!C^!C^!C^!C^!*^!*^!*
c1 7 11 GGGGGGGGG*G
c1 8 11 GGGGGGGGG*G
c1 9 11 TTTTTTTTTTT
c1 10 11 T$T$T$T$T$T$T$T$T$T$T$
c1 6 11 ^!C^!C^!C^!C^!C^!C^!C^!C^!*^!*^!* ~~~~~~~~~~~
c1 7 11 GGGGGGGGG*G ~~~~~~~~~~~
c1 8 11 GGGGGGGGG*G ~~~~~~~~~~~
c1 9 11 TTTTTTTTTTT ~~~~~~~~~~~
c1 10 11 T$T$T$T$T$T$T$T$T$T$T$ ~~~~~~~~~~~
22 changes: 11 additions & 11 deletions test/mpileup/mp_D.out
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
z 2 3 ^!A^!A^!*
z 3 3 GG*
z 4 3 CCC
z 5 3 TT-3()T
z 6 3 T*T
z 7 3 A*A
z 8 3 G*G
z 9 3 CCC
z 10 3 AAA-2()
z 11 3 GG*
z 12 3 G$G$*$
z 2 3 ^!A^!A^!* 002
z 3 3 GG* 112
z 4 3 CCC 222
z 5 3 TT-3()T 333
z 6 3 T*T 474
z 7 3 A*A 575
z 8 3 G*G 676
z 9 3 CCC 777
z 10 3 AAA-2() 888
z 11 3 GG* 99~
z 12 3 G$G$*$ 00~
24 changes: 12 additions & 12 deletions test/mpileup/mp_DI.out
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
z 2 5 ^!A^!A^!A^!*^!*
z 3 5 GGG*+2(AA)*+2(*A)
z 4 5 CCCCC
z 5 5 TTTTT
z 6 5 TTTTT
z 7 5 AAAAA
z 8 5 G-2()G-2()G-2()GG
z 9 5 ***CC
z 10 5 *+2(TT)*+2(TT)$*+2(*T)$AA
z 11 3 GGG
z 12 3 GG$G$
z 13 1 C$
z 2 5 ^!A^!A^!A^!*^!* 000AB
z 3 5 GGG*+2(AA)*+2(*A) 111AB
z 4 5 CCCCC 22222
z 5 5 TTTTT 33333
z 6 5 TTTTT 44444
z 7 5 AAAAA 55555
z 8 5 G-2()G-2()G-2()GG 66666
z 9 5 ***CC AAB77
z 10 5 *+2(TT)*+2(TT)$*+2(*T)$AA AAB88
z 11 3 GGG 999
z 12 3 GG$G$ 000
z 13 1 C$ 1
22 changes: 11 additions & 11 deletions test/mpileup/mp_I.out
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
z 2 3 ^!A^!A^!A
z 3 3 GGG
z 4 3 CCC
z 5 3 TT+3(CCC)T
z 6 3 TTT
z 7 3 AAA
z 8 3 GGG
z 9 3 CCC
z 10 3 AAA
z 11 3 GGG
z 12 3 G$G$G+2(=A)$
z 2 3 ^!A^!A^!A 000
z 3 3 GGG 111
z 4 3 CCC 222
z 5 3 TT+3(CCC)T 333
z 6 3 TTT 444
z 7 3 AAA 555
z 8 3 GGG 666
z 9 3 CCC 777
z 10 3 AAA 888
z 11 3 GGG 999
z 12 3 G$G$G+2(=A)$ 000
24 changes: 12 additions & 12 deletions test/mpileup/mp_ID.out
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
z 2 3 ^!A^!A^!A
z 3 3 GGG
z 4 5 CCC^!*^!*
z 5 5 TTT**
z 6 5 TTTTT
z 7 5 AAAAA
z 8 5 G+2(TT)-2()G+2(TT)-2()G+2(T*)-2()GG
z 9 5 ***CC
z 10 5 **$*$AA
z 11 3 GGG
z 12 3 GG$G$
z 13 1 C$
z 2 3 ^!A^!A^!A 000
z 3 3 GGG 111
z 4 5 CCC^!*^!* 22244
z 5 5 TTT** 33344
z 6 5 TTTTT 44444
z 7 5 AAAAA 55555
z 8 5 G+2(TT)-2()G+2(TT)-2()G+2(T*)-2()GG 66666
z 9 5 ***CC 9~~77
z 10 5 **$*$AA 9~~88
z 11 3 GGG 999
z 12 3 GG$G$ 000
z 13 1 C$ 1
22 changes: 11 additions & 11 deletions test/mpileup/mp_N.out
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
z 2 3 ^!A^!A^!>
z 3 3 GG>
z 4 3 CCC
z 5 3 TTT
z 6 3 T>T
z 7 3 A>A
z 8 3 G>G
z 9 3 CCC
z 10 3 AAA
z 11 3 GG>
z 12 3 G$G$>$
z 2 3 ^!A^!A^!> 002
z 3 3 GG> 112
z 4 3 CCC 222
z 5 3 TTT 333
z 6 3 T>T 474
z 7 3 A>A 575
z 8 3 G>G 676
z 9 3 CCC 777
z 10 3 AAA 888
z 11 3 GG> 99~
z 12 3 G$G$>$ 00~
26 changes: 13 additions & 13 deletions test/mpileup/mp_N2.out
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
z 1 6 ^!T^!T^!T^!T^!T^!T
z 2 6 AAAAAA
z 3 6 GGGGGG
z 4 6 C+2(AA)-5()C+2(A*)-5()C+2(*A)-5()C+2(AA)C+2(A*)C+2(*A)
z 5 6 ***>>>
z 6 6 ***>>>
z 7 6 ***>>>
z 8 6 ***>>>
z 9 6 *+2(TT)*+2(*T)*+2(T*)>+2(TT)>+2(*T)>+2(T*)
z 10 6 AAAAAA
z 11 6 GGGGGG
z 12 6 GGGGGG
z 13 6 T$T$T$T$T$T$
z 1 6 ^!T^!T^!T^!T^!T^!T AAAAAA
z 2 6 AAAAAA BBBBBB
z 3 6 GGGGGG CCCCCC
z 4 6 C+2(AA)-5()C+2(A*)-5()C+2(*A)-5()C+2(AA)C+2(A*)C+2(*A) DDDDDD
z 5 6 ***>>> GHGGHG
z 6 6 ***>>> GHGGHG
z 7 6 ***>>> GHGGHG
z 8 6 ***>>> GHGGHG
z 9 6 *+2(TT)*+2(*T)*+2(T*)>+2(TT)>+2(*T)>+2(T*) GHGGHG
z 10 6 AAAAAA IIIIII
z 11 6 GGGGGG JJJJJJ
z 12 6 GGGGGG KKKKKK
z 13 6 T$T$T$T$T$T$ LLLLLL
20 changes: 10 additions & 10 deletions test/mpileup/mp_P.out
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
z 2 5 ^!A^!A^!A^!A^!A
z 3 5 GGGGG
z 4 5 CCCCC
z 5 5 TTTTT
z 6 5 TT+4(GGCC)T+4(GG**)T+4(*GC*)T+4(**CC)
z 7 5 AAAAA
z 8 5 GGGGG
z 9 5 CCCCC
z 10 5 AAAAA
z 11 5 G$G$G$G$G$
z 2 5 ^!A^!A^!A^!A^!A 00000
z 3 5 GGGGG 11111
z 4 5 CCCCC 22222
z 5 5 TTTTT 33333
z 6 5 TT+4(GGCC)T+4(GG**)T+4(*GC*)T+4(**CC) 44444
z 7 5 AAAAA 55555
z 8 5 GGGGG 66666
z 9 5 CCCCC 77777
z 10 5 AAAAA 88888
z 11 5 G$G$G$G$G$ 99999
12 changes: 12 additions & 0 deletions test/mpileup/mp_overlap1.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
1 100003 2 ^St^+T {!
1 100004 2 aA-5() {!
1 100005 2 a* N!
1 100006 2 g* N!
1 100007 2 c* N!
1 100008 2 a* N!
1 100009 2 c* N!
1 100010 2 aA {!
1 100011 2 cC {!
1 100012 2 aA {!
1 100013 2 gG {!
1 100014 2 a$A$ {!
4 changes: 4 additions & 0 deletions test/mpileup/mp_overlap1.sam
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
@HD VN:1.5 SO:coordinate
@SQ SN:1 LN:249250621
r1 147 1 100003 50 12M * 0 0 TAAGCACACAGA ZZZZZZZZZZZZ
r1 99 1 100003 10 2M5D5M * 0 0 TAACAGA BBBBBBB
12 changes: 12 additions & 0 deletions test/mpileup/mp_overlap2.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
1 100003 2 ^+T^St {!
1 100004 2 A-5()a {!
1 100005 2 *a {!
1 100006 2 *g {!
1 100007 2 *c {!
1 100008 2 *a {!
1 100009 2 *c {!
1 100010 2 Aa {!
1 100011 2 Cc {!
1 100012 2 Aa {!
1 100013 2 Gg {!
1 100014 2 A$a$ {!
4 changes: 4 additions & 0 deletions test/mpileup/mp_overlap2.sam
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
@HD VN:1.5 SO:coordinate
@SQ SN:1 LN:249250621
r1 99 1 100003 10 2M5D5M * 0 0 TAACAGA BBBBBBB
r1 147 1 100003 50 12M * 0 0 TAAGCACACAGA ZZZZZZZZZZZZ
5 changes: 5 additions & 0 deletions test/mpileup/mpileup.tst
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,8 @@ P c1#pad3.out $pileup -m c1#pad3.sam

# Issue #852. Problem caused by alignments with entirely S/I ops in CIGAR.
P small.out $pileup -m small.bam

# Overlap removal and the effect on quality values
P mp_overlap1.out $pileup -m mp_overlap1.sam
P mp_overlap2.out $pileup -m mp_overlap2.sam

Loading