Skip to content

Commit

Permalink
added utility files, removed duplicates
Browse files Browse the repository at this point in the history
  • Loading branch information
OliverVoogd committed Jun 29, 2023
1 parent 6695380 commit 6882b28
Show file tree
Hide file tree
Showing 10 changed files with 8,827 additions and 89 deletions.
170 changes: 85 additions & 85 deletions src/bam.c → src/utility/bam.c
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -44,15 +44,15 @@ char *bam_format1(const bam_header_t *header, const bam1_t *b)
return str.s;
}

//int bam_view1(const bam_header_t *header, const bam1_t *b)
//{
// char *s = bam_format1(header, b);
// int ret = -1;
// if (!s) return -1;
// if (puts(s) != EOF) ret = 0;
// free(s);
// return ret;
//}
int bam_view1(const bam_header_t *header, const bam1_t *b)
{
char *s = bam_format1(header, b);
int ret = -1;
if (!s) return -1;
if (puts(s) != EOF) ret = 0;
free(s);
return ret;
}

int bam_validate1(const bam_header_t *header, const bam1_t *b)
{
Expand Down Expand Up @@ -162,86 +162,86 @@ int bam_remove_B(bam1_t *b)
cigar = bam1_cigar(b);
for (k = 0; k < b->core.n_cigar; ++k)
if (bam_cigar_op(cigar[k]) == BAM_CBACK) break;
if (k == b->core.n_cigar) return 0; // no 'B'
if (bam_cigar_op(cigar[0]) == BAM_CBACK) goto rmB_err; // cannot be removed
// allocate memory for the new CIGAR
if (b->data_len + (b->core.n_cigar + 1) * 4 > b->m_data) { // not enough memory
b->m_data = b->data_len + b->core.n_cigar * 4;
kroundup32(b->m_data);
b->data = (uint8_t*)realloc(b->data, b->m_data);
cigar = bam1_cigar(b); // after realloc, cigar may be changed
}
new_cigar = (uint32_t*)(b->data + (b->m_data - b->core.n_cigar * 4)); // from the end of b->data
// the core loop
seq = bam1_seq(b); qual = bam1_qual(b);
no_qual = (qual[0] == 0xff); // test whether base quality is available
i = j = 0; end_j = -1;
for (k = l = 0; k < b->core.n_cigar; ++k) {
int op = bam_cigar_op(cigar[k]);
int len = bam_cigar_oplen(cigar[k]);
if (op == BAM_CBACK) { // the backward operation
int t, u;
if (k == b->core.n_cigar - 1) break; // ignore 'B' at the end of CIGAR
if (len > j) goto rmB_err; // an excessively long backward
for (t = l - 1, u = 0; t >= 0; --t) { // look back
int op1 = bam_cigar_op(new_cigar[t]);
int len1 = bam_cigar_oplen(new_cigar[t]);
if (bam_cigar_type(op1)&1) { // consume the query
if (u + len1 >= len) { // stop
new_cigar[t] -= (len - u) << BAM_CIGAR_SHIFT;
break;
} else u += len1;
}
if (k == b->core.n_cigar) return 0; // no 'B'
if (bam_cigar_op(cigar[0]) == BAM_CBACK) goto rmB_err; // cannot be removed
// allocate memory for the new CIGAR
if (b->data_len + (b->core.n_cigar + 1) * 4 > b->m_data) { // not enough memory
b->m_data = b->data_len + b->core.n_cigar * 4;
kroundup32(b->m_data);
b->data = (uint8_t*)realloc(b->data, b->m_data);
cigar = bam1_cigar(b); // after realloc, cigar may be changed
}
new_cigar = (uint32_t*)(b->data + (b->m_data - b->core.n_cigar * 4)); // from the end of b->data
// the core loop
seq = bam1_seq(b); qual = bam1_qual(b);
no_qual = (qual[0] == 0xff); // test whether base quality is available
i = j = 0; end_j = -1;
for (k = l = 0; k < b->core.n_cigar; ++k) {
int op = bam_cigar_op(cigar[k]);
int len = bam_cigar_oplen(cigar[k]);
if (op == BAM_CBACK) { // the backward operation
int t, u;
if (k == b->core.n_cigar - 1) break; // ignore 'B' at the end of CIGAR
if (len > j) goto rmB_err; // an excessively long backward
for (t = l - 1, u = 0; t >= 0; --t) { // look back
int op1 = bam_cigar_op(new_cigar[t]);
int len1 = bam_cigar_oplen(new_cigar[t]);
if (bam_cigar_type(op1)&1) { // consume the query
if (u + len1 >= len) { // stop
new_cigar[t] -= (len - u) << BAM_CIGAR_SHIFT;
break;
} else u += len1;
}
if (bam_cigar_oplen(new_cigar[t]) == 0) --t; // squeeze out the zero-length operation
l = t + 1;
end_j = j; j -= len;
} else { // other CIGAR operations
new_cigar[l++] = cigar[k];
if (bam_cigar_type(op)&1) { // consume the query
if (i != j) { // no need to copy if i == j
int u, c, c0;
for (u = 0; u < len; ++u) { // construct the consensus
c = bam1_seqi(seq, i+u);
if (j + u < end_j) { // in an overlap
c0 = bam1_seqi(seq, j+u);
if (c != c0) { // a mismatch; choose the better base
if (qual[j+u] < qual[i+u]) { // the base in the 2nd segment is better
bam1_seq_seti(seq, j+u, c);
qual[j+u] = qual[i+u] - qual[j+u];
} else qual[j+u] -= qual[i+u]; // the 1st is better; reduce base quality
} else qual[j+u] = qual[j+u] > qual[i+u]? qual[j+u] : qual[i+u];
} else { // not in an overlap; copy over
bam1_seq_seti(seq, j+u, c);
qual[j+u] = qual[i+u];
}
}
if (bam_cigar_oplen(new_cigar[t]) == 0) --t; // squeeze out the zero-length operation
l = t + 1;
end_j = j; j -= len;
} else { // other CIGAR operations
new_cigar[l++] = cigar[k];
if (bam_cigar_type(op)&1) { // consume the query
if (i != j) { // no need to copy if i == j
int u, c, c0;
for (u = 0; u < len; ++u) { // construct the consensus
c = bam1_seqi(seq, i+u);
if (j + u < end_j) { // in an overlap
c0 = bam1_seqi(seq, j+u);
if (c != c0) { // a mismatch; choose the better base
if (qual[j+u] < qual[i+u]) { // the base in the 2nd segment is better
bam1_seq_seti(seq, j+u, c);
qual[j+u] = qual[i+u] - qual[j+u];
} else qual[j+u] -= qual[i+u]; // the 1st is better; reduce base quality
} else qual[j+u] = qual[j+u] > qual[i+u]? qual[j+u] : qual[i+u];
} else { // not in an overlap; copy over
bam1_seq_seti(seq, j+u, c);
qual[j+u] = qual[i+u];
}
}
i += len, j += len;
}
i += len, j += len;
}
}
if (no_qual) qual[0] = 0xff; // in very rare cases, this may be modified
// merge adjacent operations if possible
for (k = 1; k < l; ++k)
if (bam_cigar_op(new_cigar[k]) == bam_cigar_op(new_cigar[k-1]))
new_cigar[k] += new_cigar[k-1] >> BAM_CIGAR_SHIFT << BAM_CIGAR_SHIFT, new_cigar[k-1] &= 0xf;
// kill zero length operations
for (k = i = 0; k < l; ++k)
if (new_cigar[k] >> BAM_CIGAR_SHIFT)
new_cigar[i++] = new_cigar[k];
l = i;
// update b
memcpy(cigar, new_cigar, l * 4); // set CIGAR
p = b->data + b->core.l_qname + l * 4;
memmove(p, seq, (j+1)>>1); p += (j+1)>>1; // set SEQ
memmove(p, qual, j); p += j; // set QUAL
memmove(p, bam1_aux(b), bam_get_l_aux(b)); p += bam_get_l_aux(b); // set optional fields
b->core.n_cigar = l, b->core.l_qseq = j; // update CIGAR length and query length
b->data_len = p - b->data; // update record length
return 0;

rmB_err:
b->core.flag |= BAM_FUNMAP;
return -1;
}
if (no_qual) qual[0] = 0xff; // in very rare cases, this may be modified
// merge adjacent operations if possible
for (k = 1; k < l; ++k)
if (bam_cigar_op(new_cigar[k]) == bam_cigar_op(new_cigar[k-1]))
new_cigar[k] += new_cigar[k-1] >> BAM_CIGAR_SHIFT << BAM_CIGAR_SHIFT, new_cigar[k-1] &= 0xf;
// kill zero length operations
for (k = i = 0; k < l; ++k)
if (new_cigar[k] >> BAM_CIGAR_SHIFT)
new_cigar[i++] = new_cigar[k];
l = i;
// update b
memcpy(cigar, new_cigar, l * 4); // set CIGAR
p = b->data + b->core.l_qname + l * 4;
memmove(p, seq, (j+1)>>1); p += (j+1)>>1; // set SEQ
memmove(p, qual, j); p += j; // set QUAL
memmove(p, bam1_aux(b), bam_get_l_aux(b)); p += bam_get_l_aux(b); // set optional fields
b->core.n_cigar = l, b->core.l_qseq = j; // update CIGAR length and query length
b->data_len = p - b->data; // update record length
return 0;

rmB_err:
b->core.flag |= BAM_FUNMAP;
return -1;
}
31 changes: 27 additions & 4 deletions src/bam.h → src/utility/bam.h
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,29 @@ Used with samtools
#ifndef BAM_UTIL_FUNCS_H
#define BAM_UTIL_FUNCS_H

#include <htslib/sam.h>

// Retrive the Cigar operation code
// using the base function in htslib/sam.h
// returns an int.
// the lower 4 bits of the uint8_t used to represent each cigar operation
#define bam1_cigar_op(c) bam_cigar_op((c))

// Retrive the Cigar operation length
// using htslib/sam.h
// returns an int
// of the upper 28 bits of the uint8_t used to represent each cigar operation
#define bam1_cigar_oplen(c) bam_cigar_oplen((c))

/*! @function
Get the number of cigar operations performed
@param b const bam1_t *
@return uint32)t number of cigar operations
Useful for iterating over cigar array acquired through
bam1_cigar
*/
#define bam1_cigar_len(b) (b->core.n_cigar)

// seqment->data is qname-cigar-seq-qual-aux
// qname is at bam1_qname(seqment) (which is (char*)seqment->data)

Expand All @@ -20,7 +43,7 @@ Get the 0-based leftmost coordinate of an alignment
@param b bam1_t pointer to an alignment
@return int32_t pos
*/
#define bam_alignment_start(b) (b->core.pos)
#define bam_reference_start(b) (b->core.pos)

/*
Convert the uint8_t aux value to a string
Expand All @@ -31,7 +54,7 @@ Now, valid input can be checked by the returning string's length
@param s uint8_t * pointer to tag data returned by bam_aux_get()
@return std::string of tag data, or empty string if tag type is not Z
*/
#define bam_aux2string(s) ( (s != NULL) ? std::string((*s++ == 'Z') ? (char *)s : "") : "")
#define bam_aux2string(s) ( (s != NULL) ? std::string((*s+1 == 'Z') ? (char *)s : "") : "")

#endif

Expand Down Expand Up @@ -161,7 +184,7 @@ typedef hts_itr_t *bam_iter_t;
/*! @function
@abstract Get the CIGAR array
@param b pointer to an alignment
@return pointer to the CIGAR array
@return pointer to the CIGAR array (uint32_t *)
@discussion In the CIGAR array, each element is a 32-bit integer. The
lower 4 bits gives a CIGAR operation and the higher 28 bits keep the
Expand Down Expand Up @@ -363,7 +386,7 @@ extern "C" {
@abstract Formats a BAM record and writes it and \n to stdout
@return 0 if successful, -1 on error
*/
//int bam_view1(const bam_header_t *header, const bam1_t *b);
int bam_view1(const bam_header_t *header, const bam1_t *b);

/*!
@abstract Check whether a BAM record is plausibly valid
Expand Down
112 changes: 112 additions & 0 deletions src/utility/cigars.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
#include "cigars.h"

#include <string>
#include <vector>
#include <sstream>

#include "bam.h"

/* take a bam entry,
populate a vector with all of its CIGAR operations
this is to mimic the output of bamnostic's default cigar from bamfile.fetch
*/
std::vector<CigarPair>
generate_cigar_pairs(const bam1_t *b)
{
std::vector<CigarPair>
cigar_pairs;

// iterate over the cigar
const uint32_t *cigar = bam1_cigar(b);
for (int k = 0; k < (int)bam1_cigar_len(b); k++) {
cigar_pairs.push_back((CigarPair){
(int)bam_cigar_op(cigar[k]),
(int)bam_cigar_oplen(cigar[k])
});
}
return cigar_pairs;
}

/*
cigar specification:
(lifted from the original sc_longread.py)
M BAM_CMATCH 0
I BAM_CINS 1
D BAM_CDEL 2
N BAM_CREF_SKIP 3
S BAM_CSOFT_CLIP 4
H BAM_CHARD_CLIP 5
P BAM_CPAD 6
= BAM_CEQUAL 7
X BAM_CDIFF 8
B BAM_CBACK 9
*/

std::string
generate_cigar (const std::vector<CigarPair> &cigar)
{
/*
takes a cigar as a vector of pairs of ints,
converts it to CIGAR string format
*/

const char CIGAR_CODE[] = "MIDNSHP=XB";
std::string result = "";

for (const auto & pair : cigar) {
result.append(std::to_string(pair.len));
result.push_back(CIGAR_CODE[pair.op]);
}
return result;
}

/*
takes a cigar as a vector of int pairs,
smooths it down,
returns the smooth cigar as a vector of int pairs
*/
std::vector<CigarPair>
smooth_cigar (const std::vector<CigarPair> &cigar, int threshold)
{
std::vector<CigarPair> new_cigar = {cigar[0]};

for (int i = 1; i < (int)cigar.size(); i++) {
if (new_cigar.back().op != 0) {
new_cigar.push_back(cigar[i]);
continue;
}

switch(cigar[i].op) {
case 0:
new_cigar.back().len += cigar[i].len;
break;
case 1:
if (cigar[i].len > threshold) {
new_cigar.push_back(cigar[i]);
}
break;
case 2:
if (cigar[i].len <= threshold) {
new_cigar.back().len += cigar[i].len;
} else {
new_cigar.push_back(cigar[i]);
}
break;
default:
new_cigar.push_back(cigar[i]);
}
}

return new_cigar;
}

std::string printCigarPairs(const std::vector<CigarPair> &cigar) {
std::stringstream ss;
ss << cigar[0].len << "-" << cigar[0].op;
for (int i =1; i < cigar.size(); i++) {
ss << "," << cigar[i].len << "-" << cigar[i].op;
}
return ss.str();
}
Loading

0 comments on commit 6882b28

Please sign in to comment.