Skip to content

Commit

Permalink
Improve MM/MN tag validity checking to cope with SEQ "*"
Browse files Browse the repository at this point in the history
Added MM overflow detection for +ve strand data.  It was already
detected on -ve strand as this is done in the initial parse loop.
We only detect +ve strand overflow once we hit the end of the
iterator, but it's still sufficient for validation.

Also improve MM parsing when faced with empty lists, such as
"C+m;C+h,0".  These parsed correctly before, but left
state->MM[0] pointing at the "C" rather than ";" which makes overflow
detection harder.
  • Loading branch information
jkbonfield committed Feb 14, 2024
1 parent e6eb2f4 commit 172da4d
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 16 deletions.
38 changes: 23 additions & 15 deletions sam_mods.c
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ int bam_parse_basemod2(const bam1_t *b, hts_base_mod_state *state,
}

uint8_t *mi = bam_aux_get(b, "MN");
if (mi && bam_aux2i(mi) != b->core.l_qseq) {
if (mi && bam_aux2i(mi) != b->core.l_qseq && b->core.l_qseq) {
// bam_aux2i with set errno = EINVAL and return 0 if the tag
// isn't integer, but 0 will be a seq-length mismatch anyway so
// triggers an error here too.
Expand Down Expand Up @@ -359,7 +359,7 @@ int bam_parse_basemod2(const bam1_t *b, hts_base_mod_state *state,
if (!cp_end) {
// empty list
delta = INT_MAX;
cp_end = cp+1;
cp_end = cp;
}
}
// Now delta is first in list or computed remainder,
Expand Down Expand Up @@ -426,6 +426,10 @@ int bam_parse_basemod2(const bam1_t *b, hts_base_mod_state *state,
}
}
}
if (ml && ml != ml_end) {
hts_log_error("%s: Too many entries in ML tag", bam_get_qname(b));
return -1;
}

state->nmods = mod_num;

Expand Down Expand Up @@ -544,8 +548,24 @@ int bam_mods_at_next_pos(const bam1_t *b, hts_base_mod_state *state,
*/
int bam_next_basemod(const bam1_t *b, hts_base_mod_state *state,
hts_base_mod *mods, int n_mods, int *pos) {
if (state->seq_pos >= b->core.l_qseq)
if (state->seq_pos >= b->core.l_qseq) {
// Spots +ve orientation run-overs.
// The -ve orientation is spotted in bam_parse_basemod2
int i;
for (i = 0; i < state->nmods; i++) {
// Check if any remaining items in MM after hitting the end
// of the sequence.
if (!b->core.l_qseq)
continue;

if (state->MMcount[i] < 0x7f000000 ||
(*state->MM[i]!=0 && *state->MM[i]!=';')) {
hts_log_warning("MM tag refers to bases beyond sequence length");
return -1;
}
}
return 0;
}

// Look through state->MMcount arrays to see when the next lowest is
// per base type;
Expand Down Expand Up @@ -579,18 +599,6 @@ int bam_next_basemod(const bam1_t *b, hts_base_mod_state *state,
}
*pos = state->seq_pos = i;

if (i >= b->core.l_qseq) {
// Check for more MM elements than bases present.
for (i = 0; i < state->nmods; i++) {
if (!(b->core.flag & BAM_FREVERSE) &&
state->MMcount[i] < 0x7f000000) {
hts_log_warning("MM tag refers to bases beyond sequence length");
return -1;
}
}
return 0;
}

if (b->core.flag & BAM_FREVERSE) {
for (i = 0; i < state->nmods; i++)
state->MMcount[i] -= freq[seqi_rc[state->canonical[i]]];
Expand Down
2 changes: 1 addition & 1 deletion test/base_mods/MM-chebi.sam
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
@CO Separate m, h and N modifications
* 0 * 0 0 * * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA * Mm:Z:C+m,2,2,1,4,1;C+76792,6,7;N+n,15; Ml:B:C,102,128,153,179,204,161,33,212,169
* 0 * 0 0 * * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA * Mm:Z:C+m,2,2,1,4,1;C+76792,6,7;N+n,15; Ml:B:C,102,128,153,179,204,161,33,212

0 comments on commit 172da4d

Please sign in to comment.