diff --git a/CMakeLists.txt b/CMakeLists.txt index 055f2d7..1e6ae44 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -213,7 +213,7 @@ add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/googletest-src ${CMAKE_CURRENT_BINARY_DIR}/googletest-build EXCLUDE_FROM_ALL) - +# leviosam_test add_executable(leviosam_test src/leviosam_test.cpp) target_link_libraries(leviosam_test lvsam) target_link_libraries(leviosam_test ${HTS_LIB}) @@ -224,7 +224,7 @@ add_test(NAME leviosam_test COMMAND leviosam_test WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/testdata ) - +# chain_test add_executable(chain_test src/chain_test.cpp) target_link_libraries(chain_test lvsam) target_link_libraries(chain_test ${HTS_LIB}) @@ -235,6 +235,28 @@ add_test(NAME chain_test COMMAND chain_test WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/testdata ) +# bed_test +add_executable(bed_test src/bed_test.cpp) +target_link_libraries(bed_test lvsam) +target_link_libraries(bed_test ${HTS_LIB}) +target_link_libraries(bed_test ${SDSL_LIB}) +target_link_libraries(bed_test gtest gtest_main) + +add_test(NAME bed_test COMMAND bed_test + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/testdata +) + +# reconcile_test +add_executable(reconcile_test src/reconcile_test.cpp) +target_link_libraries(reconcile_test lvsam) +target_link_libraries(reconcile_test ${HTS_LIB}) +target_link_libraries(reconcile_test ${SDSL_LIB}) +target_link_libraries(reconcile_test gtest gtest_main) + +add_test(NAME reconcile_test COMMAND reconcile_test + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/testdata +) + INSTALL(TARGETS leviosam2 DESTINATION bin) INSTALL(FILES src/leviosam.hpp DESTINATION include) diff --git a/src/IITree.h b/src/IITree.h index 3ae722f..59e07c5 100644 --- a/src/IITree.h +++ b/src/IITree.h @@ -89,15 +89,22 @@ class IITree { size_t i, i0 = z.x >> z.k << z.k, i1 = i0 + (1LL<<(z.k+1)) - 1; if (i1 >= a.size()) i1 = a.size(); for (i = i0; i < i1 && a[i].st < en; ++i) - if (st < a[i].en) // if overlap, append to out[] - out.push_back(i); + if (st < a[i].en) { // if overlap, append to out[] + int _en = std::min(en, a[i].en); + int _st = std::max(st, a[i].st); + out.push_back(std::max(1, _en - _st)); + } } else if (z.w == 0) { // if left child not processed size_t y = z.x - (1LL<<(z.k-1)); // the left child of z.x; NB: y may be out of range (i.e. y>=a.size()) stack[t++] = StackCell(z.k, z.x, 1); // re-add node z.x, but mark the left child having been processed if (y >= a.size() || a[y].max > st) // push the left child if y is out of range or may overlap with the query stack[t++] = StackCell(z.k - 1, y, 0); } else if (z.x < a.size() && a[z.x].st < en) { // need to push the right child - if (st < a[z.x].en) out.push_back(z.x); // test if z.x overlaps the query; if yes, append to out[] + if (st < a[z.x].en) { + int _en = std::min(en, a[z.x].en); + int _st = std::max(st, a[z.x].st); + out.push_back(std::max(1, _en - _st)); // test if z.x overlaps the query; if yes, append to out[] + } stack[t++] = StackCell(z.k - 1, z.x + (1LL<<(z.k-1)), 0); // push the right child } } diff --git a/src/bed.cpp b/src/bed.cpp index 5619982..0b4ec0f 100644 --- a/src/bed.cpp +++ b/src/bed.cpp @@ -33,7 +33,6 @@ void Bed::init(const std::string &fn) { std::string line; while (std::getline(bed_f, line)) { cnt += 1; - // std::cerr << line << "\n"; if (add_interval(line) == false) { std::cerr << "[E::Bed] Failed to update BED record " << line << "\n"; @@ -46,9 +45,10 @@ void Bed::init(const std::string &fn) { << " contigs) from " << fn << "\n"; } +/* Index the interval trees for each contig and returns the number of contigs + */ int Bed::index() { int contig_cnt = 0; - // Index the interval trees for each contig for (auto &itr : intervals) { itr.second.index(); contig_cnt += 1; @@ -76,14 +76,36 @@ bool Bed::add_interval(const std::string &line) { return true; } -// Interval intersect query +/* Interval intersect query + * Args: + * isec_threshold: + * - <0: return true if overlap >= 1bp + * - 1>= `isec_threshold` > 0: return true if overlap fraction >= `isec_threshold` + * - `isec_threshold` > 1: return true if overlap size >= `isec_threshold` (bp) +*/ bool Bed::intersect(const std::string &contig, const size_t &pos1, - const size_t &pos2) { + const size_t &pos2, const float &isec_threshold) { if (!is_valid) return false; + if (pos2 <= pos1) return false; if (intervals.find(contig) == intervals.end()) return false; std::vector isec; - intervals[contig].overlap(pos1, pos2, isec); - return (isec.size() > 0); + bool ovlp_status = intervals[contig].overlap(pos1, pos2, isec); + if (!ovlp_status) return false; + // <=0 means any overlap is good + if (isec_threshold <= 0) return (isec.size() > 0); + + int max_overlap = 0; + for (auto& _o: isec) { + if (_o > max_overlap) max_overlap = _o; + } + if (isec_threshold > 0 && isec_threshold <= 1) + return (max_overlap >= (pos2 - pos1) * isec_threshold); + + if (isec_threshold > 1) + return (max_overlap >= isec_threshold); + + return false; + // return (isec.size() > 0); } // Point intersect query diff --git a/src/bed.hpp b/src/bed.hpp index 62ad54f..9cf4f44 100644 --- a/src/bed.hpp +++ b/src/bed.hpp @@ -32,7 +32,7 @@ class Bed { int index(); bool add_interval(const std::string &line); bool intersect(const std::string &contig, const size_t &pos1, - const size_t &pos2); + const size_t &pos2, const float &isec_threshold = 0); bool intersect(const std::string &contig, const size_t &pos); BedMap get_intervals(); std::string get_fn(); diff --git a/src/bed_test.cpp b/src/bed_test.cpp new file mode 100644 index 0000000..bd469de --- /dev/null +++ b/src/bed_test.cpp @@ -0,0 +1,89 @@ +/* + * bed_test.cpp + * + * Test bed related functions for levioSAM2 + * + * Author: Nae-Chyun Chen + * Dept. of Computer Science, Johns Hopkins University + * + * Distributed under the MIT license + * https://github.com/milkschen/leviosam2 + */ +#include "bed.hpp" + +#include + +#include + +#include "gtest/gtest.h" +#include "leviosam_utils.hpp" + +/* Bed tests */ +TEST(BedTest, Simple) { + std::string bed_fn("test_small.bed"); + BedUtils::Bed bed(bed_fn); + EXPECT_EQ(bed.get_fn(), bed_fn); + EXPECT_EQ(bed.index(), 3); + EXPECT_EQ(bed.intersect("chr1", 450, 499), false); + EXPECT_EQ(bed.intersect("chr1", 450, 501), true); +} + +TEST(BedTest, AddInterval) { + std::string bed_fn("test_small.bed"); + BedUtils::Bed bed(bed_fn); + EXPECT_EQ(bed.intersect("chr1", 55), false); + EXPECT_EQ(bed.index(), 3); + EXPECT_EQ(bed.add_interval("chr1 50 60"), true); + // chr1 already exists, so cnt is still 3 + EXPECT_EQ(bed.index(), 3); + EXPECT_EQ(bed.intersect("chr1", 55), true); + EXPECT_EQ(bed.intersect("chr1", 65), false); + EXPECT_EQ(bed.intersect("chr1", 55, 58), true); + EXPECT_EQ(bed.intersect("chr1", 45, 80), true); + EXPECT_EQ(bed.intersect("chr1", 45, 55), true); +} + +TEST(BedTest, IntersectWithFrac) { + std::string bed_fn("test_small.bed"); + BedUtils::Bed bed(bed_fn); + EXPECT_EQ(bed.intersect("chr1", 450, 650, 1.0), false); + EXPECT_EQ(bed.intersect("chr1", 550, 580, 1.0), true); + EXPECT_EQ(bed.intersect("chr1", 550, 580, 0.99), true); + EXPECT_EQ(bed.intersect("chr1", 550, 700, 0.3), true); + EXPECT_EQ(bed.intersect("chr1", 550, 700, 0.35), false); + EXPECT_EQ(bed.intersect("chr1", 550, 700, 30), true); + EXPECT_EQ(bed.intersect("chr1", 550, 580, 0), true); + EXPECT_EQ(bed.intersect("chr1", 550, 580, -1), true); +} + +TEST(BedTest, AddBedRecord) { + auto b = BedUtils::Bed(); + b.add_interval("chr1 972 1052"); + b.add_interval("chr1 1053 1085"); + b.add_interval("chr1 1150 1198"); + b.add_interval("chr1 1199 1262"); + b.add_interval("chr2 340 498"); + b.add_interval("chr2 599 1000"); + int idx_contig_cnt = b.index(); + int contig_cnt = 0; + // Index the interval trees for each contig + for (auto& itr: b.get_intervals()) { + contig_cnt += 1; + } + EXPECT_EQ(contig_cnt, 2); + EXPECT_EQ(contig_cnt, idx_contig_cnt); + EXPECT_EQ(b.intersect("chr1", 1000), 1); + EXPECT_EQ(b.intersect("chr1", 1052), 0); + EXPECT_EQ(b.intersect("chr2", 339), 0); + EXPECT_EQ(b.intersect("chr2", 340), 1); + EXPECT_EQ(b.intersect("chr2", 497), 1); + EXPECT_EQ(b.intersect("chr2", 498), 0); + EXPECT_EQ(b.intersect("chr2", 340), 1); + EXPECT_EQ(b.intersect("chr2", 340), 1); + EXPECT_EQ(b.intersect("chr2", 598), 0); + EXPECT_EQ(b.intersect("chr2", 599), 1); + EXPECT_EQ(b.intersect("chr2", 600), 1); + EXPECT_EQ(b.intersect("chr1", 1500), 0); + EXPECT_EQ(b.intersect("chr15", 1000), 0); +} + diff --git a/src/leviosam.cpp b/src/leviosam.cpp index 5ce88f7..1e74128 100644 --- a/src/leviosam.cpp +++ b/src/leviosam.cpp @@ -202,43 +202,42 @@ void read_and_lift(T *lift_map, std::mutex *mutex_fread, // Skip re-alignment if `-x` is not set if (args.realign_yaml == "") continue; uint8_t *nm_ptr = bam_aux_get(aln_vec[i], "NM"); - if (nm_ptr != NULL && !(aln_vec[i]->core.flag & BAM_FUNMAP)) { - // Skip if edit distance not greater than the threshold - if (bam_aux2i(nm_ptr) <= args.aln_opts.nm_threshold) continue; - std::string ref_seq = - ref.substr(aln_vec[i]->core.pos, - bam_cigar2rlen(aln_vec[i]->core.n_cigar, - bam_get_cigar(aln_vec[i]))); - std::string q_seq = LevioSamUtils::get_read_as_is(aln_vec[i]); - std::vector new_cigar; - // We perform global alignment for local sequences and - // thus an alignment with a high number of clipped bases - // might not re-align well - int new_score = 0; - if (Aln::align_ksw2(ref_seq.data(), q_seq.data(), args.aln_opts, - new_cigar, new_score) > 0) { - LevioSamUtils::update_cigar(aln_vec[i], new_cigar); - // Redo fillmd after re-align - bam_fillmd1(aln_vec[i], ref.data(), args.md_flag, 1); - bam_aux_append(aln_vec[i], "LR", 'i', 4, - (uint8_t *)&new_score); - } else { - // If re-alignment fails, set this read to unmapped - bool first_seg = (aln_vec[i]->core.flag & BAM_FPAIRED) - ? (aln_vec[i]->core.flag & BAM_FREAD1) - ? true - : false - : true; - LevioSamUtils::update_flag_unmap(aln_vec[i], first_seg); - if (args.verbose >= VERBOSE_INFO) { - std::cerr - << "[I::read_and_lift] Zero-length new cigar for " - << bam_get_qname(aln_vec[i]) - << "; flag = " << aln_vec[i]->core.flag - << "; NM:i = " << bam_aux2i(nm_ptr) - << "; l_qseq = " << aln_vec[i]->core.l_qseq - << ". Set to unmapped.\n"; - } + // Skip is the NM tag is not found + if (nm_ptr == NULL) continue; + // Skip if unmapped + if (aln_vec[i]->core.flag & BAM_FUNMAP) continue; + // Skip if edit distance not greater than the threshold + if (bam_aux2i(nm_ptr) <= args.aln_opts.nm_threshold) continue; + std::string ref_seq = + ref.substr(aln_vec[i]->core.pos, + bam_cigar2rlen(aln_vec[i]->core.n_cigar, + bam_get_cigar(aln_vec[i]))); + std::string q_seq = LevioSamUtils::get_read_as_is(aln_vec[i]); + std::vector new_cigar; + // We perform global alignment for local sequences and + // thus an alignment with a high number of clipped bases + // might not re-align well + int new_score = 0; + if (Aln::align_ksw2(ref_seq.data(), q_seq.data(), args.aln_opts, + new_cigar, new_score) > 0) { + LevioSamUtils::update_cigar(aln_vec[i], new_cigar); + // Redo fillmd after re-align + bam_fillmd1(aln_vec[i], ref.data(), args.md_flag, 1); + bam_aux_append(aln_vec[i], "LR", 'i', 4, (uint8_t *)&new_score); + } else { + // If re-alignment fails, set this read to unmapped + bool first_seg = + (aln_vec[i]->core.flag & BAM_FPAIRED) + ? (aln_vec[i]->core.flag & BAM_FREAD1) ? true : false + : true; + LevioSamUtils::update_flag_unmap(aln_vec[i], first_seg); + if (args.verbose >= VERBOSE_INFO) { + std::cerr << "[I::read_and_lift] Zero-length new cigar for " + << bam_get_qname(aln_vec[i]) + << "; flag = " << aln_vec[i]->core.flag + << "; NM:i = " << bam_aux2i(nm_ptr) + << "; l_qseq = " << aln_vec[i]->core.l_qseq + << ". Set to unmapped.\n"; } } } @@ -255,7 +254,7 @@ void read_and_lift(T *lift_map, std::mutex *mutex_fread, } } - // If force-commit, only write the original alignment to + // If suppress, only write the original alignment to // `unliftable` If defer, write to both `unliftable` and `defer` if (wd->commit_aln_dest(aln_vec[i]) == false) { std::lock_guard g_deferred(wd->mutex_fwrite); @@ -307,9 +306,8 @@ void lift_run(lift_opts args) { } else if (args.chain_fname != "") { std::cerr << "[I::lift_run] Building levioSAM index..."; if (args.length_map.size() == 0) { - std::cerr - << "[E::lift_run] No length map is found. Please set -F " - "properly.\n"; + std::cerr << "[E::lift_run] No length map is found. Please " + "set -F properly.\n"; print_serialize_help_msg(); exit(1); } @@ -389,7 +387,8 @@ void lift_run(lift_opts args) { if (args.split_rules.size() != 0) { wd.init(args.outpre, args.split_rules, args.out_format, hdr_orig, hdr, args.bed_defer_source, args.bed_defer_dest, - args.bed_commit_source, args.bed_commit_dest); + args.bed_commit_source, args.bed_commit_dest, + args.bed_isec_threshold); } std::vector threads; @@ -435,8 +434,8 @@ lift::LiftMap lift::lift_from_vcf(std::string fname, std::string sample, void print_serialize_help_msg() { std::cerr << "\n"; std::cerr << "Build a levioSAM2 index of a chain file.\n"; - std::cerr - << "Usage: leviosam2 index -c -p -F \n"; + std::cerr << "Usage: leviosam2 index -c -p " + "-F \n"; std::cerr << "\n"; std::cerr << "Inputs: -c path Path to the chain file to index.\n"; std::cerr << " -F path Path to the FAI (FASTA index) file of the " @@ -464,9 +463,8 @@ void print_lift_help_msg() { std::cerr << " -G INT Number of allowed CIGAR changes for one " "alingment. [0]\n"; std::cerr << " -T INT Chunk size for each thread. [256] \n"; - std::cerr - << " Each thread queries <-T> reads, lifts, and " - "writes.\n"; + std::cerr << " Each thread queries <-T> reads, lifts, " + "and writes.\n"; std::cerr << " Setting a higher <-T> uses slightly more " "memory but might benefit thread scaling.\n"; std::cerr << "\n"; @@ -480,9 +478,8 @@ void print_lift_help_msg() { "commit (pre-liftover). [30]\n"; std::cerr << " * aln_score INT Min AS:i " "(alignment score) to commit (pre-liftover). [100]\n"; - std::cerr - << " * isize INT Max TLEN/isize to " - "commit (post-liftover). [1000]\n"; + std::cerr << " * isize INT Max TLEN/isize " + "to commit (post-liftover). [1000]\n"; std::cerr << " * hdist INT Max NM:i " "(Hamming dist.) to commit (post-liftover). " "`-m` and `-f` must be set. [5]\n"; @@ -490,18 +487,24 @@ void print_lift_help_msg() { "clipped to commit (post-liftover). [0.95]\n"; std::cerr << " Example: `-S mapq:20 -S aln_score:20` commits " "MQ>=20 and AS>=20 alignments.\n"; - std::cerr - << " -r string Path to a BED file (source coordinates). " - "Reads overlap with the regions are always committed. [none]\n"; + std::cerr << " -r string Path to a BED file (source " + "coordinates). Reads overlap with the regions are always " + "committed. [none]\n"; std::cerr << " -D string Path to a BED file (dest coordinates). " "Reads overlap with the regions are always deferred. [none]\n"; + std::cerr << " -B float Threshold for BED record intersection. " + "[0]\n" + " If <= 0: consider any overlap (>0 bp)\n" + " If > 1: consider >`-B`-bp overlap\n" + " If 1>=`-B`>0: consider overlap with a " + "fraction of >`-B` of the alignment.\n"; std::cerr << "\n"; } void print_main_help_msg() { std::cerr << "\n"; - std::cerr - << "Program: leviosam2 (lift over alignments using a chain file)\n"; + std::cerr << "Program: leviosam2 (lift over alignments using a " + "chain file)\n"; std::cerr << "Version: " << VERSION << "\n"; std::cerr << "Usage: leviosam2 [options]\n\n"; std::cerr << "Commands: index Build a levioSAM2 index of " @@ -509,8 +512,8 @@ void print_main_help_msg() { std::cerr << " lift Lift alignments in SAM/BAM/CRAM " "formats.\n"; std::cerr << " bed Lift intervals in BED format.\n"; - std::cerr - << " collate Collate lifted paired-end alignments.\n"; + std::cerr << " collate Collate lifted paired-end " + "alignments.\n"; std::cerr << " reconcile Reconcile alignments.\n"; std::cerr << "Options: -h Print detailed usage.\n"; std::cerr << " -V Verbose level [0].\n"; @@ -539,17 +542,18 @@ int main(int argc, char **argv) { lift_opts args; args.cmd = LevioSamUtils::make_cmd(argc, argv); static struct option long_options[] { - {"md", no_argument, 0, 'm'}, {"sam", required_argument, 0, 'a'}, - {"bed_defer_source", required_argument, 0, 'd'}, - {"bed_defer_dest", required_argument, 0, 'D'}, + {"sam", required_argument, 0, 'a'}, + {"bed_isec_threshold", required_argument, 0, 'B'}, {"chain", required_argument, 0, 'c'}, {"chainmap", required_argument, 0, 'C'}, + {"bed_defer_source", required_argument, 0, 'd'}, + {"bed_defer_dest", required_argument, 0, 'D'}, {"reference", required_argument, 0, 'f'}, {"dest_fai", required_argument, 0, 'F'}, {"haplotype", required_argument, 0, 'g'}, {"allowed_cigar_changes", required_argument, 0, 'G'}, {"leviosam", required_argument, 0, 'l'}, - {"namemap", required_argument, 0, 'n'}, + {"md", no_argument, 0, 'm'}, {"namemap", required_argument, 0, 'n'}, {"out_format", required_argument, 0, 'O'}, {"prefix", required_argument, 0, 'p'}, {"bed_commit_source", required_argument, 0, 'r'}, @@ -564,7 +568,7 @@ int main(int argc, char **argv) { }; int long_index = 0; while ((c = getopt_long(argc, argv, - "hma:c:C:d:D:f:F:g:G:l:n:O:p:r:R:s:S:t:T:v:V:x:", + "hma:B:c:C:d:D:f:F:g:G:l:n:O:p:r:R:s:S:t:T:v:V:x:", long_options, &long_index)) != -1) { switch (c) { case 'h': @@ -579,6 +583,8 @@ int main(int argc, char **argv) { case 'a': args.sam_fname = optarg; break; + case 'B': + args.bed_isec_threshold = std::stof(optarg); case 'c': args.chain_fname = optarg; break; diff --git a/src/leviosam.hpp b/src/leviosam.hpp index 13f88e7..9787be6 100644 --- a/src/leviosam.hpp +++ b/src/leviosam.hpp @@ -75,6 +75,8 @@ struct lift_opts { BedUtils::Bed bed_defer_dest; BedUtils::Bed bed_commit_source; BedUtils::Bed bed_commit_dest; + // 0: take any overlap, >1: base pairs, 1>=value>0: fraction + float bed_isec_threshold = 0; }; #define LIFT_R_L 0 // unpaired, liftable diff --git a/src/leviosam_test.cpp b/src/leviosam_test.cpp index 1d8fc11..9bf8ca7 100644 --- a/src/leviosam_test.cpp +++ b/src/leviosam_test.cpp @@ -307,38 +307,6 @@ TEST(HtslibTest, CigarOpType) { } -TEST(BedTest, AddBedRecord) { - auto b = BedUtils::Bed(); - b.add_interval("chr1 972 1052"); - b.add_interval("chr1 1053 1085"); - b.add_interval("chr1 1150 1198"); - b.add_interval("chr1 1199 1262"); - b.add_interval("chr2 340 498"); - b.add_interval("chr2 599 1000"); - int idx_contig_cnt = b.index(); - int contig_cnt = 0; - // Index the interval trees for each contig - for (auto& itr: b.get_intervals()) { - contig_cnt += 1; - } - EXPECT_EQ(contig_cnt, 2); - EXPECT_EQ(contig_cnt, idx_contig_cnt); - EXPECT_EQ(b.intersect("chr1", 1000), 1); - EXPECT_EQ(b.intersect("chr1", 1052), 0); - EXPECT_EQ(b.intersect("chr2", 339), 0); - EXPECT_EQ(b.intersect("chr2", 340), 1); - EXPECT_EQ(b.intersect("chr2", 497), 1); - EXPECT_EQ(b.intersect("chr2", 498), 0); - EXPECT_EQ(b.intersect("chr2", 340), 1); - EXPECT_EQ(b.intersect("chr2", 340), 1); - EXPECT_EQ(b.intersect("chr2", 598), 0); - EXPECT_EQ(b.intersect("chr2", 599), 1); - EXPECT_EQ(b.intersect("chr2", 600), 1); - EXPECT_EQ(b.intersect("chr1", 1500), 0); - EXPECT_EQ(b.intersect("chr15", 1000), 0); -} - - TEST(UtilsTest, StrToSet) { std::set s = LevioSamUtils::str_to_set("a,b", ","); EXPECT_EQ(s.size(), 2); diff --git a/src/leviosam_utils.cpp b/src/leviosam_utils.cpp index 3de0ef1..d882911 100644 --- a/src/leviosam_utils.cpp +++ b/src/leviosam_utils.cpp @@ -21,7 +21,8 @@ void WriteDeferred::init( const std::vector>& split_rules, const std::string of, sam_hdr_t* ihdr, sam_hdr_t* ohdr, const BedUtils::Bed& b_defer_source, const BedUtils::Bed& b_defer_dest, - const BedUtils::Bed& b_commit_source, const BedUtils::Bed& b_commit_dest) { + const BedUtils::Bed& b_commit_source, const BedUtils::Bed& b_commit_dest, + const float& b_isec_th) { write_deferred = true; std::string rules_str(""); for (auto& r : split_rules) { @@ -44,6 +45,7 @@ void WriteDeferred::init( bed_defer_dest = b_defer_dest; bed_commit_source = b_commit_source; bed_commit_dest = b_commit_dest; + bed_isec_threshold = b_isec_th; std::string out_mode = (of == "sam") ? "w" : "wb"; std::string out_fn = outpre + "-deferred." + of; @@ -124,10 +126,12 @@ bool WriteDeferred::commit_aln_source(const bam1_t* const aln) { size_t pos_end = c->pos + rlen; // The commit regions need to be considered ahead of other rules, // i.e. a low MAPQ read in a commit region should be committed - if (bed_commit_source.intersect(rname, c->pos, pos_end)) { + if (bed_commit_source.intersect(rname, c->pos, pos_end, + bed_isec_threshold)) { return true; } - if (bed_defer_source.intersect(rname, c->pos, pos_end)) { + if (bed_defer_source.intersect(rname, c->pos, pos_end, + bed_isec_threshold)) { return false; } return false; @@ -160,10 +164,10 @@ bool WriteDeferred::commit_aln_dest(const bam1_t* const aln) { std::string rname = hdr->target_name[c->tid]; auto rlen = bam_cigar2rlen(c->n_cigar, bam_get_cigar(aln)); size_t pos_end = c->pos + rlen; - if (bed_defer_dest.intersect(rname, c->pos, pos_end)) { + if (bed_defer_dest.intersect(rname, c->pos, pos_end, bed_isec_threshold)) { return false; } - if (bed_commit_dest.intersect(rname, c->pos, pos_end)) { + if (bed_commit_dest.intersect(rname, c->pos, pos_end, bed_isec_threshold)) { return true; } if (split_modes.find("clipped_frac") != split_modes.end()) { diff --git a/src/leviosam_utils.hpp b/src/leviosam_utils.hpp index 1be31bc..bf000ef 100644 --- a/src/leviosam_utils.hpp +++ b/src/leviosam_utils.hpp @@ -29,6 +29,7 @@ #define SPLIT_MAX_CLIPPED_FRAC 0.95 #define SPLIT_MIN_AS 100 #define SPLIT_MAX_NM 5 +#define BED_ISEC_TH 0 const int8_t seq_comp_table[16] = {0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15}; @@ -87,7 +88,8 @@ class WriteDeferred { const BedUtils::Bed& b_defer_source, const BedUtils::Bed& b_defer_dest, const BedUtils::Bed& b_commit_source, - const BedUtils::Bed& b_commit_dest); + const BedUtils::Bed& b_commit_dest, + const float& b_isec_th); void print_info(); void write_deferred_bam(bam1_t* aln, sam_hdr_t* hdr); @@ -111,6 +113,7 @@ class WriteDeferred { float max_clipped_frac = SPLIT_MAX_CLIPPED_FRAC; BedUtils::Bed bed_defer_source, bed_defer_dest, bed_commit_source, bed_commit_dest; + float bed_isec_threshold = BED_ISEC_TH; }; void update_cigar(bam1_t* aln, std::vector& new_cigar); diff --git a/src/reconcile.cpp b/src/reconcile.cpp index 7572ed2..d901e69 100644 --- a/src/reconcile.cpp +++ b/src/reconcile.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include "leviosam_utils.hpp" @@ -60,8 +61,11 @@ void unzip(const std::vector>& zipped, std::vector& a, } } -/* Rank a set of alignments. - * Order by: is_proper_pair > score > MAPQ +/* Returns the best alignment in a set, by the order of: + * "is_proper_pair > score > MAPQ". + * + * Update an int reference `num_tied_best` to reflect the number of tied best + * records. */ int select_best_aln(const std::vector& pair_indicators, const std::vector& scores, @@ -94,7 +98,8 @@ int select_best_aln(const std::vector& pair_indicators, for (int i = 0; i < vec_size; i++) { ranks.push_back(std::get<3>(zipped[i])); } - std::random_shuffle(ranks.begin(), ranks.begin() + num_tied_best, myrandom); + std::shuffle(ranks.begin(), ranks.begin() + num_tied_best, + std::default_random_engine(0)); return ranks[0]; } diff --git a/src/reconcile.hpp b/src/reconcile.hpp index fb953a3..021f7c8 100644 --- a/src/reconcile.hpp +++ b/src/reconcile.hpp @@ -14,6 +14,7 @@ #define RECONCILE_H__ #include +#include #include #include diff --git a/src/reconcile_test.cpp b/src/reconcile_test.cpp new file mode 100644 index 0000000..d20f0c0 --- /dev/null +++ b/src/reconcile_test.cpp @@ -0,0 +1,58 @@ +/* + * reconcile_test.cpp + * + * Test the reconcile module of levioSAM2 + * + * Author: Nae-Chyun Chen + * Dept. of Computer Science, Johns Hopkins University + * + * Distributed under the MIT license + * https://github.com/milkschen/leviosam2 + */ +#include "reconcile.hpp" + +#include + +#include +#include + +#include "gtest/gtest.h" +#include "leviosam_utils.hpp" + +/* Reconcile tests */ +TEST(ReconcileTest, SelectBestAlnOneBestAtFirst) { + std::vector pi = {true, true, false}; + std::vector sc = {0, -3, -20}; + std::vector mq = {40, 30, 2}; + int ntb = 0; + + int rank = select_best_aln(pi, sc, mq, ntb); + + EXPECT_EQ(rank, 0); + EXPECT_EQ(ntb, 1); +} + +TEST(ReconcileTest, SelectBestAlnOneBestAtSecond) { + std::vector pi = {true, true, false}; + std::vector sc = {0, 3, -20}; + std::vector mq = {40, 30, 2}; + int ntb = 0; + + int rank = select_best_aln(pi, sc, mq, ntb); + + EXPECT_EQ(rank, 1); + EXPECT_EQ(ntb, 1); +} + +TEST(ReconcileTest, SelectBestAlnFourBest) { + std::vector pi = {true, true, true, true, false}; + std::vector sc = {0, 0, 0, 0, -20}; + std::vector mq = {40, 40, 40, 40, 2}; + int ntb = 0; + + int rank = select_best_aln(pi, sc, mq, ntb); + + EXPECT_EQ(rank, 2); + EXPECT_EQ(ntb, 4); +} + diff --git a/testdata/test_small.bed b/testdata/test_small.bed new file mode 100644 index 0000000..4cbd020 --- /dev/null +++ b/testdata/test_small.bed @@ -0,0 +1,3 @@ +chr1 500 600 +chr2 700 701 +chr20 900 999 diff --git a/workflow/leviosam2.sh b/workflow/leviosam2.sh index 4617a59..7a44eb9 100644 --- a/workflow/leviosam2.sh +++ b/workflow/leviosam2.sh @@ -79,6 +79,7 @@ function usage { echo " -m INT ISIZE cutoff for the defer rule []" echo " -p FLOAT Fraction-of-clipped-bases cutoff for the defer rule []" echo " -D path Path to the force-defer annotation []" + echo " -B float Bed intersect threshold. See 'leviosam2 lift -h' for details. [0]" echo " Aligner:" echo " -a string Aligner to use (bowtie2|bwamem|bwamem2|minimap2|winnowmap2) [bowtie2]" echo " -b string Prefix to the aligner index" @@ -89,7 +90,7 @@ function usage { exit 0 } -while getopts hKMSa:A:b:C:D:f:H:g:i:l:L:m:o:p:q:r:R:t:T:x: flag +while getopts hKMSa:A:b:B:C:D:f:H:g:i:l:L:m:o:p:q:r:R:t:T:x: flag do case "${flag}" in h) usage;; @@ -99,6 +100,7 @@ do a) ALN=${OPTARG};; A) ALN_SCORE=" -S aln_score:${OPTARG}";; b) ALN_IDX=${OPTARG};; + B) BED_ISEC_TH=" -B ${OPTARG}";; C) CLFT=${OPTARG};; D) DEFER_DEST_BED=" -D ${OPTARG}";; f) REF=${OPTARG};; @@ -153,7 +155,7 @@ set -xp # Lifting over using leviosam2 if [ ! -s ${PFX}-committed.bam ]; then ${MT} ${LEVIOSAM} lift -C ${CLFT} -a ${INPUT} -t ${THR} -p ${PFX} -O bam \ - ${MAPQ} ${ISIZE} ${ALN_SCORE} ${FRAC_CLIPPED} ${HDIST} \ + ${MAPQ} ${ISIZE} ${ALN_SCORE} ${FRAC_CLIPPED} ${HDIST} ${BED_ISEC_TH}\ -G ${ALLOWED_GAPS} \ ${REALN_CONFIG} \ -m -f ${REF} ${DEFER_DEST_BED} ${COMMIT_SOURCE_BED}