From cbfd3e82780e379fdf3f84c63d4bac547a208ff4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 3 Nov 2017 14:31:02 -0400 Subject: [PATCH] Support BAMs with >65535 CIGAR operations Due to a design flaw, the original BAM format is unable to store an alignment with >65535 CIGAR operations. The SAM/BAM specification maintainers have decided to move the actual CIGAR to a CG optional tag and write a fake CIGAR `SN` at the original CIGAR place. This PR recognizes the CG tag and seamlessly moves the real CIGAR back to its right place and update the `bin` field accordingly. Library users need not take any actions. The convert and sort commands of command-line bamtools have been tested on BAMs containing the CG tag. --- src/api/internal/bam/BamReader_p.cpp | 91 +++++++++++++++++++++++++++- src/api/internal/bam/BamReader_p.h | 1 + src/api/internal/bam/BamWriter_p.cpp | 90 +++++++++++++++++++++++---- 3 files changed, 168 insertions(+), 14 deletions(-) diff --git a/src/api/internal/bam/BamReader_p.cpp b/src/api/internal/bam/BamReader_p.cpp index 25ca1844..de430cfa 100644 --- a/src/api/internal/bam/BamReader_p.cpp +++ b/src/api/internal/bam/BamReader_p.cpp @@ -246,6 +246,91 @@ void BamReaderPrivate::LoadHeaderData() m_header.Load(&m_stream); } +static inline int bam_aux_type2size(int x) +{ + if (x == 'C' || x == 'c' || x == 'A') return 1; + else if (x == 'S' || x == 's') return 2; + else if (x == 'I' || x == 'i' || x == 'f') return 4; + else return 0; +} + +static unsigned char *bam_aux_get(int aux_data_len, const unsigned char *aux_start, const char *tag) +{ + const unsigned char *p = aux_start; + while (p < aux_start + aux_data_len) { + if (p[0] == tag[0] && p[1] == tag[1]) return (unsigned char*)(p + 2); + p += 2; // skip tag + int type = *p++; // read type + if (type == 'B') { + int size = bam_aux_type2size(*p++); // read array type + unsigned len = (unsigned)p[0] | (unsigned)p[1]<<8 | (unsigned)p[2]<<16 | (unsigned)p[3]<<24; + p += 4; // skip the size field + p += len * size; // skip array + } else if (type == 'Z' || type == 'H') { + while (*p++ != 0) {} // skip NULL terminated string + } else { + p += bam_aux_type2size(type); // skip value + } + } + return NULL; +} + +static inline int hts_reg2bin(int64_t beg, int64_t end, int min_shift, int n_lvls) +{ + int l, s = min_shift, t = ((1<<((n_lvls<<1) + n_lvls)) - 1) / 7; + for (--end, l = n_lvls; l > 0; --l, s += 3, t -= 1<<((l<<1)+l)) + if (beg>>s == end>>s) return t + (beg>>s); + return 0; +} + +bool BamReaderPrivate::Tag2Cigar(BamAlignment &a, RaiiBuffer &buf) +{ + if (a.RefID < 0 || a.Position < 0 || a.SupportData.NumCigarOperations == 0) return false; + + const unsigned char *data = (const unsigned char*)buf.Buffer; + const unsigned data_len = a.SupportData.BlockLength - Constants::BAM_CORE_SIZE; + const unsigned char *p = data + a.SupportData.QueryNameLength; // the original CIGAR + unsigned cigar1 = (unsigned)p[0] | (unsigned)p[1]<<8 | (unsigned)p[2]<<16 | (unsigned)p[3]<<24; + if ((cigar1&0xf) != 4 || cigar1>>4 != a.SupportData.QuerySequenceLength) return false; + + const int seq_offset = a.SupportData.QueryNameLength + a.SupportData.NumCigarOperations * 4; + const int aux_offset = seq_offset + (a.SupportData.QuerySequenceLength + 1) / 2 + a.SupportData.QuerySequenceLength; + unsigned char *CG = bam_aux_get(data_len - aux_offset, data + aux_offset, "CG"); + if (CG == NULL || CG[0] != 'B' || CG[1] != 'I') return false; + + const unsigned tag_cigar_len = (unsigned)CG[2] | (unsigned)CG[3]<<8 | (unsigned)CG[4]<<16 | (unsigned)CG[5]<<24; + if (tag_cigar_len == 0) return false; + + // recalculate bin, as it may be incorrect if it was calculated by a tool unaware of the real CIGAR in tag + const unsigned tag_cigar_offset = CG - data + 6; + unsigned alignment_end = a.Position; + p = data + tag_cigar_offset; + for (unsigned i = 0; i < tag_cigar_len * 4; i += 4, p += 4) { + unsigned cigar1 = (unsigned)p[0] | (unsigned)p[1]<<8 | (unsigned)p[2]<<16 | (unsigned)p[3]<<24; + int op = cigar1 & 0xf; + if (op == 0 || op == 2 || op == 3 || op == 7 || op == 8) + alignment_end += cigar1 >> 4; + } + a.Bin = hts_reg2bin(a.Position, alignment_end, 14, 5); + + // populate new AllCharData + int fake_bytes = a.SupportData.NumCigarOperations * 4; + std::string new_data; + new_data.reserve(data_len - 8 - fake_bytes + 1); + new_data.append((char*)data, a.SupportData.QueryNameLength); // query name + new_data.append((char*)data + tag_cigar_offset, tag_cigar_len * 4); // real CIGAR + new_data.append((char*)data + seq_offset, tag_cigar_offset - 8 - seq_offset); // seq, qual and tags before CG + const unsigned tag_cigar_end_offset = tag_cigar_offset + tag_cigar_len * 4; + if (tag_cigar_end_offset < data_len) // tags after CG, if there is any + new_data.append((char*)data + tag_cigar_end_offset, data_len - tag_cigar_end_offset); + + // update member variables + a.SupportData.NumCigarOperations = tag_cigar_len; + a.SupportData.BlockLength -= 8 + fake_bytes; + memcpy(buf.Buffer, new_data.c_str(), buf.NumBytes - 8 - fake_bytes); + return true; +} + // populates BamAlignment with alignment data under file pointer, returns success/fail bool BamReaderPrivate::LoadNextAlignment(BamAlignment& alignment) { @@ -291,11 +376,15 @@ bool BamReaderPrivate::LoadNextAlignment(BamAlignment& alignment) // read in character data - make sure proper data size was read bool readCharDataOK = false; - const unsigned int dataLength = alignment.SupportData.BlockLength - Constants::BAM_CORE_SIZE; + unsigned int dataLength = alignment.SupportData.BlockLength - Constants::BAM_CORE_SIZE; RaiiBuffer allCharData(dataLength); if (m_stream.Read(allCharData.Buffer, dataLength) == dataLength) { + int OldNumCigarOperations = alignment.SupportData.NumCigarOperations; + if (Tag2Cigar(alignment, allCharData)) + dataLength -= 8 + OldNumCigarOperations * 4; + // store 'allCharData' in supportData structure alignment.SupportData.AllCharData.assign((const char*)allCharData.Buffer, dataLength); diff --git a/src/api/internal/bam/BamReader_p.h b/src/api/internal/bam/BamReader_p.h index fdcd8f8c..655228be 100644 --- a/src/api/internal/bam/BamReader_p.h +++ b/src/api/internal/bam/BamReader_p.h @@ -53,6 +53,7 @@ class BamReaderPrivate // access alignment data bool GetNextAlignment(BamAlignment& alignment); bool GetNextAlignmentCore(BamAlignment& alignment); + bool Tag2Cigar(BamAlignment &alignment, RaiiBuffer &buf); // access auxiliary data std::string GetHeaderText() const; diff --git a/src/api/internal/bam/BamWriter_p.cpp b/src/api/internal/bam/BamWriter_p.cpp index f9327ce4..978bde73 100644 --- a/src/api/internal/bam/BamWriter_p.cpp +++ b/src/api/internal/bam/BamWriter_p.cpp @@ -295,6 +295,8 @@ void BamWriterPrivate::WriteAlignment(const BamAlignment& al) queryLength + // here referring to quality length tagDataLength; unsigned int blockSize = Constants::BAM_CORE_SIZE + dataBlockSize; + if (numCigarOperations >= 65536) + blockSize += 16; if (m_isBigEndian) BamTools::SwapEndian_32(blockSize); m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT); @@ -303,7 +305,7 @@ void BamWriterPrivate::WriteAlignment(const BamAlignment& al) buffer[0] = al.RefID; buffer[1] = al.Position; buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength; - buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations; + buffer[3] = (al.AlignmentFlag << 16) | (numCigarOperations < 65536? numCigarOperations : 2); buffer[4] = queryLength; buffer[5] = al.MateRefID; buffer[6] = al.MatePosition; @@ -322,17 +324,29 @@ void BamWriterPrivate::WriteAlignment(const BamAlignment& al) m_stream.Write(al.Name.c_str(), nameLength); // write the packed cigar - if (m_isBigEndian) { - char* cigarData = new char[packedCigarLength](); - memcpy(cigarData, packedCigar.data(), packedCigarLength); + if (numCigarOperations < 65536) { if (m_isBigEndian) { - for (std::size_t i = 0; i < packedCigarLength; ++i) - BamTools::SwapEndian_32p(&cigarData[i]); + char* cigarData = new char[packedCigarLength](); + memcpy(cigarData, packedCigar.data(), packedCigarLength); + if (m_isBigEndian) { + for (size_t i = 0; i < packedCigarLength; ++i) + BamTools::SwapEndian_32p(&cigarData[i]); + } + m_stream.Write(cigarData, packedCigarLength); + delete[] cigarData; // TODO: cleanup on Write exception thrown? } - m_stream.Write(cigarData, packedCigarLength); - delete[] cigarData; // TODO: cleanup on Write exception thrown? - } else - m_stream.Write(packedCigar.data(), packedCigarLength); + else + m_stream.Write(packedCigar.data(), packedCigarLength); + } else { + unsigned int cigar[2]; + cigar[0] = queryLength << 4 | 4; + cigar[1] = (al.GetEndPosition() - al.Position) << 4 | 3; + if (m_isBigEndian) { + BamTools::SwapEndian_32(cigar[0]); + BamTools::SwapEndian_32(cigar[1]); + } + m_stream.Write((char*)cigar, 8); + } if (queryLength > 0) { @@ -450,6 +464,28 @@ void BamWriterPrivate::WriteAlignment(const BamAlignment& al) delete[] tagData; // TODO: cleanup on Write exception thrown? } else m_stream.Write(al.TagData.data(), tagDataLength); + + if (numCigarOperations >= 65536) { + m_stream.Write("CGBI", 4); + if ( m_isBigEndian ) { + unsigned int cigar_len_buf = numCigarOperations; + BamTools::SwapEndian_32(cigar_len_buf); + m_stream.Write((char*)&cigar_len_buf, 4); + + char* cigarData = new char[packedCigarLength](); + memcpy(cigarData, packedCigar.data(), packedCigarLength); + if ( m_isBigEndian ) { + for ( size_t i = 0; i < packedCigarLength; ++i ) // FIXME: similarly, this should be "i += 4", not "++i" + BamTools::SwapEndian_32p(&cigarData[i]); + } + m_stream.Write(cigarData, packedCigarLength); + delete[] cigarData; // TODO: cleanup on Write exception thrown? + } + else { + m_stream.Write((char*)&numCigarOperations, 4); + m_stream.Write(packedCigar.data(), packedCigarLength); + } + } } void BamWriterPrivate::WriteCoreAlignment(const BamAlignment& al) @@ -457,6 +493,8 @@ void BamWriterPrivate::WriteCoreAlignment(const BamAlignment& al) // write the block size unsigned int blockSize = al.SupportData.BlockLength; + if (al.SupportData.NumCigarOperations >= 65536) + blockSize += 16; if (m_isBigEndian) BamTools::SwapEndian_32(blockSize); m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT); @@ -468,7 +506,7 @@ void BamWriterPrivate::WriteCoreAlignment(const BamAlignment& al) buffer[0] = al.RefID; buffer[1] = al.Position; buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength; - buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations; + buffer[3] = (al.AlignmentFlag << 16) | (al.SupportData.NumCigarOperations < 65536? al.SupportData.NumCigarOperations : 2); buffer[4] = al.SupportData.QuerySequenceLength; buffer[5] = al.MateRefID; buffer[6] = al.MatePosition; @@ -484,8 +522,34 @@ void BamWriterPrivate::WriteCoreAlignment(const BamAlignment& al) m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE); // write the raw char data - m_stream.Write((char*)al.SupportData.AllCharData.data(), - al.SupportData.BlockLength - Constants::BAM_CORE_SIZE); + if (al.SupportData.NumCigarOperations < 65536) { + m_stream.Write((char*)al.SupportData.AllCharData.data(), + al.SupportData.BlockLength - Constants::BAM_CORE_SIZE); + } else { + const char *data = al.SupportData.AllCharData.c_str(); + const unsigned data_len = al.SupportData.BlockLength - Constants::BAM_CORE_SIZE; + const unsigned cigar_offset = al.SupportData.QueryNameLength; + const unsigned seq_offset = cigar_offset + al.SupportData.NumCigarOperations * 4; + unsigned fake_cigar[2]; + fake_cigar[0] = al.SupportData.QuerySequenceLength << 4 | 4; + fake_cigar[1] = (al.GetEndPosition() - al.Position) << 4 | 3; + m_stream.Write(data, al.SupportData.QueryNameLength); + if (m_isBigEndian) { + BamTools::SwapEndian_32(fake_cigar[0]); + BamTools::SwapEndian_32(fake_cigar[1]); + } + m_stream.Write((char*)&fake_cigar, 8); + m_stream.Write(data + seq_offset, data_len - seq_offset); + m_stream.Write("CGBI", 4); + if (m_isBigEndian) { + unsigned cigar_len_buf = al.SupportData.NumCigarOperations; + BamTools::SwapEndian_32(cigar_len_buf); + m_stream.Write((char*)&cigar_len_buf, 4); + } else { + m_stream.Write((char*)&al.SupportData.NumCigarOperations, 4); + } + m_stream.Write(data + cigar_offset, al.SupportData.NumCigarOperations * 4); + } } void BamWriterPrivate::WriteMagicNumber()