From bb1ac780335a6eab647a25601715f79a833976c5 Mon Sep 17 00:00:00 2001 From: dpryan79 Date: Mon, 27 Aug 2018 14:15:57 +0200 Subject: [PATCH 1/2] Increase the maximum number of contigs and libraries supported by markdup --- sambamba/markdup.d | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/sambamba/markdup.d b/sambamba/markdup.d index af91651a..381b366e 100644 --- a/sambamba/markdup.d +++ b/sambamba/markdup.d @@ -558,14 +558,14 @@ auto readPairsAndFragments(alias hashFunc=simpleHash, R) (reads, table_size_log2, overflow_list_size, tmp_dir, task_pool); } -///////////////////////////////////////////////////////////////////////////////// -/// no more than 32767 libraries and 16383 reference sequences are supported //// -///////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////// +/// no more than 536 million libraries and 2 billion reference sequences are supported //// +/////////////////////////////////////////////////////////////////////////////////////////// // 8 bytes struct SingleEndBasicInfo { - mixin(bitfields!(short, "library_id", 16, - ushort, "ref_id", 14, + mixin(bitfields!(short, "library_id", 30, + ushort, "ref_id", 32, ubyte, "reversed", 1, ubyte, "paired", 1)); int coord; @@ -575,7 +575,7 @@ struct SingleEndBasicInfo { reversed == other.reversed && library_id == other.library_id; } } -static assert(SingleEndBasicInfo.sizeof == 8); +static assert(SingleEndBasicInfo.sizeof == 16); struct SingleEndInfo { SingleEndBasicInfo basic_info; @@ -585,8 +585,8 @@ struct SingleEndInfo { } struct PairedEndsInfo { - mixin(bitfields!(short, "library_id", 16, - ushort, "ref_id1", 14, + mixin(bitfields!(short, "library_id", 30, + ushort, "ref_id1", 32, ubyte, "reversed1", 1, ubyte, "reversed2", 1)); int coord1; @@ -675,7 +675,7 @@ class ReadGroupIndex { } ++i; } - enforce(n_libs <= 32767, "More than 32767 libraries are unsupported"); + enforce(n_libs <= 536870911, "More than 536870911 libraries are unsupported"); } /// -1 if read group with such name is not found in the header @@ -1232,7 +1232,6 @@ int markdup_main(string[] args) { // Set up the BAM reader and pass in the thread pool auto bam = new MultiBamReader(args[1 .. $-1], taskPool); auto n_refs = bam.reference_sequences.length; - enforce(n_refs < 16384, "More than 16383 reference sequences are unsupported"); auto rg_index = new ReadGroupIndex(bam.header); From eefaf903f9cdd781eee98518b9fe6a5fc7dad4ae Mon Sep 17 00:00:00 2001 From: dpryan79 Date: Tue, 28 Aug 2018 14:32:02 +0200 Subject: [PATCH 2/2] Allow the full number of contigs supported by a BAM file. --- sambamba/markdup.d | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/sambamba/markdup.d b/sambamba/markdup.d index 381b366e..3e9a3bb3 100644 --- a/sambamba/markdup.d +++ b/sambamba/markdup.d @@ -564,8 +564,8 @@ auto readPairsAndFragments(alias hashFunc=simpleHash, R) // 8 bytes struct SingleEndBasicInfo { - mixin(bitfields!(short, "library_id", 30, - ushort, "ref_id", 32, + mixin(bitfields!(int, "library_id", 30, + uint, "ref_id", 32, ubyte, "reversed", 1, ubyte, "paired", 1)); int coord; @@ -585,13 +585,13 @@ struct SingleEndInfo { } struct PairedEndsInfo { - mixin(bitfields!(short, "library_id", 30, - ushort, "ref_id1", 32, + mixin(bitfields!(long, "library_id", 30, + uint, "ref_id1", 32, ubyte, "reversed1", 1, ubyte, "reversed2", 1)); int coord1; int coord2; - ushort ref_id2; + uint ref_id2; uint score; // sum of base qualities that are >= 15 ulong idx1, idx2; @@ -601,7 +601,7 @@ struct PairedEndsInfo { // HACK! HACK! HACK! use the fact that the structures are almost // the same except the one bit meaning 'paired' instead of 'reversed2'. auto p = cast(ubyte*)(&result); - p[0 .. 8] = (cast(ubyte*)(&this))[0 .. 8]; + p[0 .. 16] = (cast(ubyte*)(&this))[0 .. 16]; result.paired = true; return result; } @@ -762,6 +762,7 @@ SingleEndBasicInfo basicInfo(E)(auto ref E e) { bool samePosition(E1, E2)(auto ref E1 e1, auto ref E2 e2) { static if (is(E1 == PairedEndsInfo) && is(E2 == PairedEndsInfo)) { return *cast(ulong*)(&e1) == *cast(ulong*)(&e2) && + e1.coord1 == e2.coord1 && e1.ref_id2 == e2.ref_id2 && e1.coord2 == e2.coord2; } else { return basicInfo(e1).samePosition(basicInfo(e2));