Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bed processing improvements #26

Merged
merged 31 commits into from
Nov 7, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
88d351a
v0.3.0
milkschen Nov 3, 2022
8f47e29
minor formatting
milkschen Nov 3, 2022
2b8758f
formatting
milkschen Nov 3, 2022
97ee6f9
realign all reads instead of only primary
milkschen Nov 3, 2022
300f0d0
replace first and last CIGAR op with SCLIP if INS
milkschen Nov 3, 2022
d4bc5a8
formatting and docstrings
milkschen Nov 3, 2022
d2508ea
Merge branch 'main' into dev
milkschen Nov 3, 2022
e302942
vscode formatting
milkschen Nov 4, 2022
664ee42
resolve conflicts
milkschen Nov 4, 2022
fa6c2b9
indent =4
milkschen Nov 4, 2022
ecf13fc
fix a warning
milkschen Nov 4, 2022
a51a6ab
formatting
milkschen Nov 4, 2022
ee39887
v0.3.0
milkschen Nov 4, 2022
1ff7313
formatting
milkschen Nov 4, 2022
864ee21
Merge branch 'dev' of github.com:milkschen/leviosam2 into dev
milkschen Nov 4, 2022
11961c5
test file for bed
milkschen Nov 4, 2022
adb22e2
Merge branch 'main' into dev
milkschen Nov 4, 2022
9336acd
move to bed_test
milkschen Nov 4, 2022
b62dabe
test
milkschen Nov 4, 2022
f60b6e5
bed_test
milkschen Nov 4, 2022
3d52c0d
consider fraction/size of an interval overlap
milkschen Nov 4, 2022
0e8232a
docstring
milkschen Nov 4, 2022
6302b74
stop using deprecated std::random_shuffle(); add tests
milkschen Nov 7, 2022
18bc5fa
reconcile update
milkschen Nov 7, 2022
dec8759
minor docstring improvements
milkschen Nov 7, 2022
09e0c1f
rename isec_frac -> isec_threshold
milkschen Nov 7, 2022
b0b8b05
allow setting `bed_isec_threshold`
milkschen Nov 7, 2022
9708406
help message
milkschen Nov 7, 2022
f488702
add -B
milkschen Nov 7, 2022
6f4a6df
update intersect()
milkschen Nov 7, 2022
77bdaf1
minor formating
milkschen Nov 7, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 24 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/googletest-src
${CMAKE_CURRENT_BINARY_DIR}/googletest-build
EXCLUDE_FROM_ALL)


# leviosam_test
add_executable(leviosam_test src/leviosam_test.cpp)
target_link_libraries(leviosam_test lvsam)
target_link_libraries(leviosam_test ${HTS_LIB})
Expand All @@ -224,7 +224,7 @@ add_test(NAME leviosam_test COMMAND leviosam_test
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/testdata
)


# chain_test
add_executable(chain_test src/chain_test.cpp)
target_link_libraries(chain_test lvsam)
target_link_libraries(chain_test ${HTS_LIB})
Expand All @@ -235,6 +235,28 @@ add_test(NAME chain_test COMMAND chain_test
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/testdata
)

# bed_test
add_executable(bed_test src/bed_test.cpp)
target_link_libraries(bed_test lvsam)
target_link_libraries(bed_test ${HTS_LIB})
target_link_libraries(bed_test ${SDSL_LIB})
target_link_libraries(bed_test gtest gtest_main)

add_test(NAME bed_test COMMAND bed_test
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/testdata
)

# reconcile_test
add_executable(reconcile_test src/reconcile_test.cpp)
target_link_libraries(reconcile_test lvsam)
target_link_libraries(reconcile_test ${HTS_LIB})
target_link_libraries(reconcile_test ${SDSL_LIB})
target_link_libraries(reconcile_test gtest gtest_main)

add_test(NAME reconcile_test COMMAND reconcile_test
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/testdata
)

INSTALL(TARGETS leviosam2 DESTINATION bin)
INSTALL(FILES src/leviosam.hpp DESTINATION include)

13 changes: 10 additions & 3 deletions src/IITree.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,15 +89,22 @@ class IITree {
size_t i, i0 = z.x >> z.k << z.k, i1 = i0 + (1LL<<(z.k+1)) - 1;
if (i1 >= a.size()) i1 = a.size();
for (i = i0; i < i1 && a[i].st < en; ++i)
if (st < a[i].en) // if overlap, append to out[]
out.push_back(i);
if (st < a[i].en) { // if overlap, append to out[]
int _en = std::min(en, a[i].en);
int _st = std::max(st, a[i].st);
out.push_back(std::max(1, _en - _st));
}
} else if (z.w == 0) { // if left child not processed
size_t y = z.x - (1LL<<(z.k-1)); // the left child of z.x; NB: y may be out of range (i.e. y>=a.size())
stack[t++] = StackCell(z.k, z.x, 1); // re-add node z.x, but mark the left child having been processed
if (y >= a.size() || a[y].max > st) // push the left child if y is out of range or may overlap with the query
stack[t++] = StackCell(z.k - 1, y, 0);
} else if (z.x < a.size() && a[z.x].st < en) { // need to push the right child
if (st < a[z.x].en) out.push_back(z.x); // test if z.x overlaps the query; if yes, append to out[]
if (st < a[z.x].en) {
int _en = std::min(en, a[z.x].en);
int _st = std::max(st, a[z.x].st);
out.push_back(std::max(1, _en - _st)); // test if z.x overlaps the query; if yes, append to out[]
}
stack[t++] = StackCell(z.k - 1, z.x + (1LL<<(z.k-1)), 0); // push the right child
}
}
Expand Down
34 changes: 28 additions & 6 deletions src/bed.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ void Bed::init(const std::string &fn) {
std::string line;
while (std::getline(bed_f, line)) {
cnt += 1;
// std::cerr << line << "\n";
if (add_interval(line) == false) {
std::cerr << "[E::Bed] Failed to update BED record " << line
<< "\n";
Expand All @@ -46,9 +45,10 @@ void Bed::init(const std::string &fn) {
<< " contigs) from " << fn << "\n";
}

/* Index the interval trees for each contig and returns the number of contigs
*/
int Bed::index() {
int contig_cnt = 0;
// Index the interval trees for each contig
for (auto &itr : intervals) {
itr.second.index();
contig_cnt += 1;
Expand Down Expand Up @@ -76,14 +76,36 @@ bool Bed::add_interval(const std::string &line) {
return true;
}

// Interval intersect query
/* Interval intersect query
* Args:
* isec_threshold:
* - <0: return true if overlap >= 1bp
* - 1>= `isec_threshold` > 0: return true if overlap fraction >= `isec_threshold`
* - `isec_threshold` > 1: return true if overlap size >= `isec_threshold` (bp)
*/
bool Bed::intersect(const std::string &contig, const size_t &pos1,
const size_t &pos2) {
const size_t &pos2, const float &isec_threshold) {
if (!is_valid) return false;
if (pos2 <= pos1) return false;
if (intervals.find(contig) == intervals.end()) return false;
std::vector<size_t> isec;
intervals[contig].overlap(pos1, pos2, isec);
return (isec.size() > 0);
bool ovlp_status = intervals[contig].overlap(pos1, pos2, isec);
if (!ovlp_status) return false;
// <=0 means any overlap is good
if (isec_threshold <= 0) return (isec.size() > 0);

int max_overlap = 0;
for (auto& _o: isec) {
if (_o > max_overlap) max_overlap = _o;
}
if (isec_threshold > 0 && isec_threshold <= 1)
return (max_overlap >= (pos2 - pos1) * isec_threshold);

if (isec_threshold > 1)
return (max_overlap >= isec_threshold);

return false;
// return (isec.size() > 0);
}

// Point intersect query
Expand Down
2 changes: 1 addition & 1 deletion src/bed.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ class Bed {
int index();
bool add_interval(const std::string &line);
bool intersect(const std::string &contig, const size_t &pos1,
const size_t &pos2);
const size_t &pos2, const float &isec_threshold = 0);
bool intersect(const std::string &contig, const size_t &pos);
BedMap get_intervals();
std::string get_fn();
Expand Down
89 changes: 89 additions & 0 deletions src/bed_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
/*
* bed_test.cpp
*
* Test bed related functions for levioSAM2
*
* Author: Nae-Chyun Chen
* Dept. of Computer Science, Johns Hopkins University
*
* Distributed under the MIT license
* https://github.com/milkschen/leviosam2
*/
#include "bed.hpp"

#include <unistd.h>

#include <iostream>

#include "gtest/gtest.h"
#include "leviosam_utils.hpp"

/* Bed tests */
TEST(BedTest, Simple) {
std::string bed_fn("test_small.bed");
BedUtils::Bed bed(bed_fn);
EXPECT_EQ(bed.get_fn(), bed_fn);
EXPECT_EQ(bed.index(), 3);
EXPECT_EQ(bed.intersect("chr1", 450, 499), false);
EXPECT_EQ(bed.intersect("chr1", 450, 501), true);
}

TEST(BedTest, AddInterval) {
std::string bed_fn("test_small.bed");
BedUtils::Bed bed(bed_fn);
EXPECT_EQ(bed.intersect("chr1", 55), false);
EXPECT_EQ(bed.index(), 3);
EXPECT_EQ(bed.add_interval("chr1 50 60"), true);
// chr1 already exists, so cnt is still 3
EXPECT_EQ(bed.index(), 3);
EXPECT_EQ(bed.intersect("chr1", 55), true);
EXPECT_EQ(bed.intersect("chr1", 65), false);
EXPECT_EQ(bed.intersect("chr1", 55, 58), true);
EXPECT_EQ(bed.intersect("chr1", 45, 80), true);
EXPECT_EQ(bed.intersect("chr1", 45, 55), true);
}

TEST(BedTest, IntersectWithFrac) {
std::string bed_fn("test_small.bed");
BedUtils::Bed bed(bed_fn);
EXPECT_EQ(bed.intersect("chr1", 450, 650, 1.0), false);
EXPECT_EQ(bed.intersect("chr1", 550, 580, 1.0), true);
EXPECT_EQ(bed.intersect("chr1", 550, 580, 0.99), true);
EXPECT_EQ(bed.intersect("chr1", 550, 700, 0.3), true);
EXPECT_EQ(bed.intersect("chr1", 550, 700, 0.35), false);
EXPECT_EQ(bed.intersect("chr1", 550, 700, 30), true);
EXPECT_EQ(bed.intersect("chr1", 550, 580, 0), true);
EXPECT_EQ(bed.intersect("chr1", 550, 580, -1), true);
}

TEST(BedTest, AddBedRecord) {
auto b = BedUtils::Bed();
b.add_interval("chr1 972 1052");
b.add_interval("chr1 1053 1085");
b.add_interval("chr1 1150 1198");
b.add_interval("chr1 1199 1262");
b.add_interval("chr2 340 498");
b.add_interval("chr2 599 1000");
int idx_contig_cnt = b.index();
int contig_cnt = 0;
// Index the interval trees for each contig
for (auto& itr: b.get_intervals()) {
contig_cnt += 1;
}
EXPECT_EQ(contig_cnt, 2);
EXPECT_EQ(contig_cnt, idx_contig_cnt);
EXPECT_EQ(b.intersect("chr1", 1000), 1);
EXPECT_EQ(b.intersect("chr1", 1052), 0);
EXPECT_EQ(b.intersect("chr2", 339), 0);
EXPECT_EQ(b.intersect("chr2", 340), 1);
EXPECT_EQ(b.intersect("chr2", 497), 1);
EXPECT_EQ(b.intersect("chr2", 498), 0);
EXPECT_EQ(b.intersect("chr2", 340), 1);
EXPECT_EQ(b.intersect("chr2", 340), 1);
EXPECT_EQ(b.intersect("chr2", 598), 0);
EXPECT_EQ(b.intersect("chr2", 599), 1);
EXPECT_EQ(b.intersect("chr2", 600), 1);
EXPECT_EQ(b.intersect("chr1", 1500), 0);
EXPECT_EQ(b.intersect("chr15", 1000), 0);
}

Loading