From 2a2a36d69ecb9e5d778ab245682ca02b05d2ff38 Mon Sep 17 00:00:00 2001 From: Luca Denti Date: Fri, 28 Oct 2022 10:17:47 +0200 Subject: [PATCH 01/13] Filters alignments using a fixed treshold --- smoother.cpp | 111 +++++++++++++++++++-------------------------------- 1 file changed, 42 insertions(+), 69 deletions(-) diff --git a/smoother.cpp b/smoother.cpp index a39a7b0..50cd12f 100644 --- a/smoother.cpp +++ b/smoother.cpp @@ -27,7 +27,6 @@ using namespace std; // } bam1_core_t; int do_realloc_bam_data(bam1_t *b, size_t desired) { - // cout << "Extending BAM entry." << endl ; uint32_t new_m_data; uint8_t *new_data; new_m_data = desired; @@ -115,7 +114,7 @@ void Smoother::smooth_read(bam1_t *alignment, char *read_seq, string chrom, uint8_t *qual = bam_get_qual(alignment); uint8_t *new_qual = new_read_quals[_i][_j][_k]; int pos = alignment->core.pos + - 1; // this is 0-based, variant cpoordinates are 1-based + 1; // this is 0-based, variant coordinates are 1-based // Modify current bam1_t* struct auto &core = alignment->core; vector> new_cigar; @@ -195,7 +194,6 @@ void Smoother::smooth_read(bam1_t *alignment, char *read_seq, string chrom, soft_clip_offset += cigar_offsets[m].first; new_cigar.push_back(cigar_offsets[m]); } else { - cout << "Illegal Cigar OP" << endl; break; // if (cigar_offsets[m].second == BAM_CPAD || cigar_offsets[m].second == // BAM_CHARD_CLIP || cigar_offsets[m].second == BAM_CBACK) { @@ -204,43 +202,37 @@ void Smoother::smooth_read(bam1_t *alignment, char *read_seq, string chrom, } new_seq[n] = '\0'; new_qual[n] = '\0'; - // int r = 0 ; - // for (auto op: new_cigar) { - // if (op.second != BAM_CDEL) { - // n -= op.first ; - // } - // r += op.first ; - // } - // assert(n == 0) ; + char *qname = bam_get_qname(alignment); - // cout << n << " " << strlen(new_seq) << " " << r << endl ; - // cout << bam_get_qname(alignment) << endl ; - // only do this on first processing thread - if (omp_get_thread_num() == 2) { - global_num_bases += num_match; - global_num_mismatch += num_mismatch; - } - // how many errors and SNPs do we expect? 1/1000 each, so say if we see more - // than twice that then don't correct - if (config->selective) { - if (num_mismatch / num_match > 3 * expected_mismatch_rate) { - if (omp_get_thread_num() == 3) { - num_ignored_reads += 1; - } - return; - } - // if we have so many deletions and insertions, then abort - if (ins_offset + del_offset > 0.7 * strlen(read_seq)) { + // char op = 'M'; + // cerr << qname << " " << strlen(new_seq) << " " << n << endl; + // for (const auto p : new_cigar) { + // if (p.second == BAM_CMATCH || p.second == BAM_CEQUAL || + // p.second == BAM_CDIFF) + // op = 'M'; + // else if (p.second == BAM_CINS) + // op = 'I'; + // else if (p.second == BAM_CDEL) + // op = 'D'; + // else if (p.second == BAM_CSOFT_CLIP) + // op = 'S'; + // cerr << p.first << op; + // } + // cerr << endl; + if (num_mismatch / num_match >= 0.02 || should_ignore) { // FIXME: HARDCODED + // read is too dirty or is not interesting, just skip it + ignored_reads[omp_get_thread_num() - 2].push_back(qname); + } else { + if (strlen(new_seq) != n) { + // FIXME: why does this happen? + cerr << "|" + << endl; // qname << " " << strlen(new_seq) << " " << n << endl; + ignored_reads[omp_get_thread_num() - 2].push_back(qname); return; } + smoothed_reads[omp_get_thread_num() - 2].push_back(qname); + rebuild_bam_entry(alignment, new_seq, new_qual, new_cigar); } - if (should_ignore) { - // check mismatch rate - ignored_reads[omp_get_thread_num() - 2].push_back(qname); - return; - } - smoothed_reads[omp_get_thread_num() - 2].push_back(qname); - rebuild_bam_entry(alignment, new_seq, new_qual, new_cigar); } void Smoother::process_batch(vector bam_entries, int p, int i) { @@ -256,7 +248,6 @@ void Smoother::process_batch(vector bam_entries, int p, int i) { continue; } if (alignment->core.l_qseq < 2) { - // cerr << "Read too short, ignoring.." << endl ; continue; } if (alignment->core.tid < 0) { @@ -323,6 +314,7 @@ void Smoother::run() { } } } + int b = 0; lprint({"Loading first batch.."}); load_batch_bam(config->threads, batch_size, 1); @@ -336,9 +328,7 @@ void Smoother::run() { bool should_terminate = false; bool loaded_last_batch = false; int reads_written = 0; - // lprint({"Starting main loop.."}) ; while (should_process) { - // lprint({"Beginning batch", to_string(b + 1)}); if (!should_load) { should_process = false; } @@ -351,11 +341,6 @@ void Smoother::run() { if (should_load) { loaded_last_batch = !load_batch_bam(config->threads, batch_size, (p + 1) % modulo); - // if (loaded_last_batch) { - // lprint({"Last input batch loaded."}); - // } else { - // lprint({"Loaded."}); - // } } } else if (i == 1) { if (b >= 1) { @@ -363,7 +348,18 @@ void Smoother::run() { for (int k = 0; k < batch_size / config->threads; k++) { for (int j = 0; j < config->threads; j++) { if (bam_entries[(p + 2) % modulo][j][k] != nullptr) { - auto alignment = bam_entries[(p + 2) % modulo][j][k]; + // TODO: just store smoothed reads in the output .bam + // if (find(smoothed_reads[j].begin(), smoothed_reads[j].end(), + // bam_get_qname(bam_entries[(p + 2) % modulo][j][k])) + // != + // smoothed_reads[j].end()) { + if (bam_entries[(p + 2) % modulo][j][k]->core.flag & + BAM_FUNMAP || + bam_entries[(p + 2) % modulo][j][k]->core.flag & + BAM_FSUPPLEMENTARY || + bam_entries[(p + 2) % modulo][j][k]->core.flag & + BAM_FSECONDARY) + continue; ret = sam_write1(out_bam_file, bam_header, bam_entries[(p + 2) % modulo][j][k]); reads_written += 1; @@ -372,6 +368,7 @@ void Smoother::run() { should_terminate = true; break; } + // } } else { break; } @@ -388,13 +385,6 @@ void Smoother::run() { lprint({"Something went wrong, aborting.."}, 2); return; } - // if (!should_load) { - // lprint({"Processed last batch of inputs."}); - // } - // if (!should_process) { - // lprint({"Exiting accelerator loop."}); - // break ; - // } p += 1; p %= modulo; b += 1; @@ -403,20 +393,7 @@ void Smoother::run() { if (s - t == 0) { s += 1; } - cerr << "[I] Processed batch " << b << ". Reads so far " << reads_processed - << ". Reads per second: " << reads_processed / (s - t) - << ". Time: " << s - t << "\n"; - cerr << "[I] Processed bases: " << uint64_t(global_num_bases) - << ", num mismatch: " << uint64_t(global_num_mismatch) - << ", mismatch rate: " << global_num_mismatch / global_num_bases - << ", ignored reads: " << num_ignored_reads << "\n"; - expected_mismatch_rate = global_num_mismatch / global_num_bases; - cerr << "\x1b[A"; - cerr << "\x1b[A"; } - cerr << endl; - cerr << endl; - lprint({"Done."}); sam_close(bam_file); sam_close(out_bam_file); dump_smoothed_read_ids(); @@ -487,22 +464,18 @@ bool Smoother::load_batch_bam(int threads, int batch_size, int p) { break; } } - // cout << m << " reallocations.." << endl ; // last batch was incomplete if (n != batch_size) { for (int j = n % threads; j < threads; j++) { - // cout << "Terminus at " << j << " " << i << endl ; for (int _ = i; _ < batch_size / threads; _++) { bam_entries[p][j][_] = nullptr; } } for (int j = 0; j < n % threads; j++) { - // cout << "Terminus at " << j << " " << i + 1 << endl ; for (int _ = i + 1; _ < batch_size / threads; _++) { bam_entries[p][j][_] = nullptr; } } } - // lprint({"Loaded", to_string(n), "BAM reads.."}); return n != 0 ? true : false; } From aafc5b9fd0fd236939ac0b909537274798706d66 Mon Sep 17 00:00:00 2001 From: Luca Denti Date: Fri, 28 Oct 2022 10:22:20 +0200 Subject: [PATCH 02/13] Useless commit to clean(?) code --- cluster.cpp | 13 ++++++++++--- extender.cpp | 40 +++++++++++----------------------------- vcf.cpp | 2 +- 3 files changed, 22 insertions(+), 33 deletions(-) diff --git a/cluster.cpp b/cluster.cpp index 04c3105..9ad1e25 100644 --- a/cluster.cpp +++ b/cluster.cpp @@ -27,13 +27,20 @@ string Cluster::poa() { abpt->out_msa = 0; abpt->out_cons = 1; abpt->out_gfa = 0; - // abpt->w = 6, abpt->k = 9; - // abpt->min_w = 10; // minimizer-based seeding and partition // abpt->is_diploid = 1; - // abpt->min_freq = 0.3; abpt->progressive_poa = 0; abpoa_post_set_para(abpt); + // abpt->match = 2; // match score + // abpt->mismatch = 4; // mismatch penalty + // abpt->gap_mode = ABPOA_CONVEX_GAP; // gap penalty mode + // abpt->gap_open1 = 4; // gap open penalty #1 + // abpt->gap_ext1 = 2; // gap extension penalty #1 + // abpt->gap_open2 = 24; // gap open penalty #2 + // abpt->gap_ext2 = 1; // gap extension penalty #2 + // gap_penalty = min{gap_open1 + gap_len * gap_ext1, gap_open2 + gap_len * + // gap_ext2} + int *seq_lens = (int *)malloc(sizeof(int) * n_seqs); uint8_t **bseqs = (uint8_t **)malloc(sizeof(uint8_t *) * n_seqs); for (uint i = 0; i < n_seqs; ++i) { diff --git a/extender.cpp b/extender.cpp index 9740f4e..825d537 100644 --- a/extender.cpp +++ b/extender.cpp @@ -42,9 +42,9 @@ void Extender::run(int _threads) { make_pair(cluster.first, ExtCluster(cluster.second))); } } - for (const auto &cluster : _ext_clusters) { + for (const auto &cluster : _ext_clusters) ext_clusters.push_back(cluster.second); - } + // process each cluster separately _p_clusters.resize(threads); extract_sfs_sequences(); @@ -53,6 +53,7 @@ void Extender::run(int _threads) { clusters.insert(clusters.begin(), _p_clusters[i].begin(), _p_clusters[i].end()); } + // merge POA alignments _p_svs.resize(threads); _p_alignments.resize(threads); @@ -166,11 +167,6 @@ void Extender::extend_parallel() { // load next batch of entries if (should_load) { loaded_last_batch = !load_batch_bam(threads, batch_size, (p + 1) % 2); - if (loaded_last_batch) { - // lprint({"Last input batch loaded."}); - } else { - // lprint({"Loaded."}); - } } } else { process_batch(bam_entries[p][t - 1], t - 1); @@ -178,9 +174,6 @@ void Extender::extend_parallel() { } #pragma omp single { - // if (loaded_last_batch) { - // break ; - // } p += 1; p %= 2; b += 1; @@ -235,16 +228,11 @@ bool Extender::load_batch_bam(int threads, int batch_size, int p) { } } if (n != 0 && n != batch_size) { - for (int j = n % threads; j < threads; j++) { - // cout << "Terminus at " << j << " " << i << endl ; + for (int j = n % threads; j < threads; j++) bam_entries[p][j][i] = nullptr; - } - for (int j = 0; j < n % threads; j++) { - if (i + 1 < bam_entries[p][j].size()) { - // cout << "Terminus at " << j << " " << i + 1 << endl ; + for (int j = 0; j < n % threads; j++) + if (i + 1 < bam_entries[p][j].size()) bam_entries[p][j][i + 1] = nullptr; - } - } } // lprint({"Loaded", to_string(n), "BAM reads.."}); return n != 0 ? true : false; @@ -255,7 +243,6 @@ void Extender::process_batch(vector bam_entries, int index) { for (int b = 0; b < bam_entries.size(); b++) { aln = bam_entries[b]; if (aln == nullptr) { - // cout << "Terminus " << b << " " << index << " " << endl ; break; } extend_alignment(aln, index); @@ -302,7 +289,6 @@ void Extender::extend_alignment(bam1_t *aln, int index) { break; } } - // cout << refs << "-" << refe << endl ; // We extract the local alignment of the region of interest if (refs == -1 && refe == -1) { // we couldn't place the first and the last base, so we skip this - @@ -426,9 +412,7 @@ bool overlap(int s1, int e1, const ExtSFS &sfs) { } void Extender::cluster_no_interval_tree() { - lprint({"Sorting ", to_string(extended_sfs.size()), " extended SFS.."}); - std::sort(extended_sfs.begin(), extended_sfs.end()); - lprint({"Done."}); + sort(extended_sfs.begin(), extended_sfs.end()); auto r = std::max_element(extended_sfs.begin(), extended_sfs.end(), [](const ExtSFS &lhs, const ExtSFS &rhs) { return lhs.e - lhs.s < rhs.e - rhs.s; @@ -565,8 +549,8 @@ void Extender::extract_sfs_sequences() { aln->core.flag & BAM_FSECONDARY) { continue; } - ++cov; // FIXME: this cov takes into account also reads starting or ending - // inside the cluster (maybe we should skip those?) + ++cov; // FIXME: this cov takes into account also reads starting or + // ending inside the cluster (maybe we should skip those?) char *qname = bam_get_qname(aln); if (reads.find(qname) == reads.end()) { @@ -735,6 +719,7 @@ void Extender::call() { vector _svs; // svs on current cluster string ref = string(chromosome_seqs[chrom] + c.s, c.e - c.s + 1); string consensus = c.poa(); + parasail_result_t *result = NULL; result = parasail_nw_trace_striped_16(consensus.c_str(), consensus.size(), ref.c_str(), ref.size(), 10, 1, @@ -801,7 +786,7 @@ void Extender::call() { // if (del.s <= sv.e) { // sv.e = del.e ; // int overlap = sv.e - del.s + 1 ; - // sv.refall = sv.altall + del..substr(overlap, del.l - overlap) ; + // sv.refall = sv.altall + del..substr(overlap, del.l - overlap); // } //} // -- Combine svs with same length (maybe useless now - only if diploid @@ -882,9 +867,6 @@ void Extender::filter_sv_chains() { } } } - if (prev.s == 205603627) { - break; - } _svs.push_back(prev); prev = sv; } diff --git a/vcf.cpp b/vcf.cpp index 8979e8d..95775d6 100644 --- a/vcf.cpp +++ b/vcf.cpp @@ -87,4 +87,4 @@ void print_vcf_header(const unordered_map &ref, ofstream &o, o << "##FORMAT=" << endl; o << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" << sample << endl; -} \ No newline at end of file +} From 21ca02d0fd3e8f4b82407023f15a2a1e59ff48b7 Mon Sep 17 00:00:00 2001 From: Luca Denti Date: Fri, 28 Oct 2022 10:23:39 +0200 Subject: [PATCH 03/13] Adds consensus CIGAR to variant INFO field --- caller.cpp | 3 +++ extender.cpp | 4 ++-- sv.cpp | 6 ++++-- sv.hpp | 6 ++++-- vcf.cpp | 3 +++ 5 files changed, 16 insertions(+), 6 deletions(-) diff --git a/caller.cpp b/caller.cpp index 95eb25f..e27a949 100644 --- a/caller.cpp +++ b/caller.cpp @@ -164,6 +164,9 @@ void Caller::print_vcf_header() { ovcf << "##INFO=" << endl; + ovcf << "##INFO=" + << endl; ovcf << "##FORMAT=" << endl; ovcf << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tDEFAULT" diff --git a/extender.cpp b/extender.cpp index 825d537..a2e5c81 100644 --- a/extender.cpp +++ b/extender.cpp @@ -749,7 +749,7 @@ void Extender::call() { SV sv = SV("INS", c.chrom, rpos, string(chromosome_seqs[chrom] + rpos - 1, 1), consensus.substr(cpos, l), c.size(), c.cov, nv, score, - false, l); + false, l, cigar_str); _svs.push_back(sv); nv++; } @@ -759,7 +759,7 @@ void Extender::call() { SV sv = SV("DEL", c.chrom, rpos, string(chromosome_seqs[chrom] + rpos - 1, l), string(chromosome_seqs[chrom] + rpos - 1, 1), c.size(), - c.cov, nv, score, false, l); + c.cov, nv, score, false, l, cigar_str); _svs.push_back(sv); nv++; } diff --git a/sv.cpp b/sv.cpp index 45d3edf..26b5c67 100644 --- a/sv.cpp +++ b/sv.cpp @@ -6,7 +6,7 @@ SV::SV() { l = 0; } SV::SV(const string type_, const string &chrom_, uint s_, const string &refall_, const string &altall_, const uint w_, const uint cov_, const int ngaps_, - const int score_, bool imprecise_, uint l_) { + const int score_, bool imprecise_, uint l_, string cigar_) { type = type_; chrom = chrom_; s = s_; @@ -19,6 +19,7 @@ SV::SV(const string type_, const string &chrom_, uint s_, const string &refall_, ngaps = ngaps_; score = score_; imprecise = imprecise_; + cigar = cigar_; idx = type + "_" + chrom + ":" + to_string(s) + "-" + to_string(e); idx += "_" + to_string(abs(l)); gt = "./."; @@ -54,7 +55,8 @@ ostream &operator<<(ostream &os, const SV &sv) { << "WEIGHT=" << sv.w << ";" << "COV=" << sv.cov << ";" << "AS=" << sv.score << ";" - << "NV=" << sv.ngaps + << "NV=" << sv.ngaps << ";" + << "CIGAR=" << sv.cigar << (sv.imprecise ? ";IMPRECISE\t" : "\t") // - << "GT" diff --git a/sv.hpp b/sv.hpp index 626b4c4..94430cc 100644 --- a/sv.hpp +++ b/sv.hpp @@ -6,8 +6,9 @@ #include #include -class SV { +using namespace std; +class SV { public: std::string type; std::string chrom; @@ -23,12 +24,13 @@ class SV { int score; std::string gt; bool imprecise; + string cigar; SV(); SV(const std::string type_, const std::string &chrom_, uint s_, const std::string &refall_, const std::string &altall_, const uint w_, const uint cov_, const int ngaps_, const int score_, - bool imprecise_ = false, uint l_ = 0); + bool imprecise_ = false, uint l_ = 0, string cigar_ = "."); void genotype(); bool operator<(const SV &c) const { diff --git a/vcf.cpp b/vcf.cpp index 95775d6..33d35af 100644 --- a/vcf.cpp +++ b/vcf.cpp @@ -84,6 +84,9 @@ void print_vcf_header(const unordered_map &ref, ofstream &o, o << "##INFO=" << endl; + o << "##INFO=" + << endl; o << "##FORMAT=" << endl; o << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" << sample << endl; From f1db0647c26211868e029c21c5d0e66d7b40857d Mon Sep 17 00:00:00 2001 From: Luca Denti Date: Fri, 28 Oct 2022 10:25:16 +0200 Subject: [PATCH 04/13] Adds verbose mode --- config.cpp | 4 +++- config.hpp | 1 + extender.cpp | 31 ++++++++++++++++++++++++++++++- 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/config.cpp b/config.cpp index 9384524..dadeb0e 100644 --- a/config.cpp +++ b/config.cpp @@ -37,7 +37,8 @@ Configuration::Configuration() "b,binary", "", cxxopts::value()->default_value("false"))( "aggregate", "", cxxopts::value()->default_value("false"))( "selective", "", cxxopts::value()->default_value("true"))( - "version", "Print version information.")("help", "Print this help."); + "version", "Print version information.")("help", "Print this help.")( + "verbose", "", cxxopts::value()->default_value("false")); } void Configuration::parse(int argc, char **argv) { @@ -114,5 +115,6 @@ void Configuration::parse(int argc, char **argv) { aggregate = results["aggregate"].as(); selective = results["selective"].as(); version = results["version"].as(); + verbose = results["verbose"].as(); help = results["help"].as(); } diff --git a/config.hpp b/config.hpp index fc6f94c..e3c0d77 100644 --- a/config.hpp +++ b/config.hpp @@ -37,6 +37,7 @@ class Configuration { bool aggregate = false; bool selective = true; bool version = false; + bool verbose = false; bool help = false; std::string bed; diff --git a/extender.cpp b/extender.cpp index a2e5c81..1c51a39 100644 --- a/extender.cpp +++ b/extender.cpp @@ -689,7 +689,30 @@ void Extender::call() { continue; } const auto &clusters_by_len = cluster_by_length(cluster); - // --- Sorting clusters by #sequences to get first 2 most weighted clusters + if (config->verbose) { + cout << "1 " << cluster.chrom << "\t" << cluster.s << "\t" << cluster.e + << "\t" << cluster.cov << "\t" << cluster.seqs.size() << "\t"; + for (const auto s : cluster.seqs) + cout << s.size() << ","; + cout << "\t"; + for (const auto s : cluster.seqs) + cout << s << ","; + cout << endl; + if (clusters_by_len.size() > 1) { + for (const auto c : clusters_by_len) { + cout << "- " << c.chrom << "\t" << c.s << "\t" << c.e << "\t" << c.cov + << "\t" << c.seqs.size() << "\t"; + for (const auto s : c.seqs) + cout << s.size() << ","; + cout << "\t"; + for (const auto s : c.seqs) + cout << s << ","; + cout << endl; + } + } + } + // --- Sorting clusters by #sequences to get first 2 most weighted + // clusters int i_max1 = -1; int i_max2 = -1; uint v_max1 = 0; @@ -728,6 +751,9 @@ void Extender::call() { parasail_result_get_cigar(result, consensus.c_str(), consensus.size(), ref.c_str(), ref.size(), NULL); string cigar_str = parasail_cigar_decode(cigar); + if (config->verbose) + cout << ref << "," << consensus << "," << cigar_str << "\n" + << "#" << endl; int score = result->score; parasail_cigar_free(cigar); parasail_result_free(result); @@ -825,6 +851,9 @@ void Extender::call() { void Extender::filter_sv_chains() { std::sort(svs.begin(), svs.end()); + if (config->verbose) + for (const auto sv : svs) + cout << sv << endl; if (svs.size() < 2) { return; } From 81bf58f98108c08d19214ca09a7390abf39d09b8 Mon Sep 17 00:00:00 2001 From: Luca Denti Date: Fri, 28 Oct 2022 10:28:45 +0200 Subject: [PATCH 05/13] Fixes bug that made us miss a SFS per cluster --- extender.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/extender.cpp b/extender.cpp index 1c51a39..5d7a172 100644 --- a/extender.cpp +++ b/extender.cpp @@ -467,7 +467,9 @@ void Extender::cluster_no_interval_tree() { low = min(low, sfs.s); high = max(high, sfs.e); } else { - for (int k = last_j; k < j; k++) { + for (int k = last_j; k < j; + k++) { // CHECKME: < or <=? + // NOTE: <= makes the code waaaay slower _p_sfs_clusters[t][make_pair(low, high)].push_back(extended_sfs[k]); } low = sfs.s; @@ -475,7 +477,9 @@ void Extender::cluster_no_interval_tree() { last_j = j; } } - for (int k = last_j; k < intervals[i].second; k++) { + for (int k = last_j; k <= intervals[i].second; + k++) { // CHECKME: it was < but in that way + // we were losing an sfs per cluster _p_sfs_clusters[t][make_pair(low, high)].push_back(extended_sfs[k]); } if (t == 0) { From 117d949bce2be8a953535e734d29323c4b328cf9 Mon Sep 17 00:00:00 2001 From: Luca Denti Date: Fri, 28 Oct 2022 10:31:33 +0200 Subject: [PATCH 06/13] Uses ratio-based (on lenghts) clustering for clusters --- config.cpp | 4 ++++ config.hpp | 1 + extender.cpp | 7 +++++-- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/config.cpp b/config.cpp index dadeb0e..5ad12bf 100644 --- a/config.cpp +++ b/config.cpp @@ -38,6 +38,7 @@ Configuration::Configuration() "aggregate", "", cxxopts::value()->default_value("false"))( "selective", "", cxxopts::value()->default_value("true"))( "version", "Print version information.")("help", "Print this help.")( + "l", "", cxxopts::value())( "verbose", "", cxxopts::value()->default_value("false")); } @@ -108,6 +109,9 @@ void Configuration::parse(int argc, char **argv) { if (results.count("min-cluster-weight")) { min_cluster_weight = results["min-cluster-weight"].as(); } + if (results.count("l")) { + min_ratio = results["l"].as(); + } binary = results["binary"].as(); clipped = results["clipped"].as(); assemble = results["assemble"].as(); diff --git a/config.hpp b/config.hpp index e3c0d77..3216153 100644 --- a/config.hpp +++ b/config.hpp @@ -29,6 +29,7 @@ class Configuration { int min_indel_length = 20; int aggregate_batches = 5; int min_cluster_weight = 2; + float min_ratio = 0.97; // FIXME: change name bool binary = false; bool clipped = false; diff --git a/extender.cpp b/extender.cpp index 5d7a172..81159a5 100644 --- a/extender.cpp +++ b/extender.cpp @@ -648,9 +648,12 @@ vector Extender::cluster_by_length(const Cluster &cluster) { for (const string &seq : cluster.get_seqs()) { int i; for (i = 0; i < clusters_by_len.size(); i++) { - if (abs((int)clusters_by_len[i].get_len() - (int)seq.size()) <= min_d) { + float cl = clusters_by_len[i].get_len(); + float sl = seq.size(); + if (min(cl, sl) / max(cl, sl) >= config->min_ratio) + // if (abs((int)clusters_by_len[i].get_len() - (int)seq.size()) <= + // min_d) { break; - } } if (i == clusters_by_len.size()) { clusters_by_len.push_back( From 9a91453745cfe28dd7b66ed0a35b944ec1434328 Mon Sep 17 00:00:00 2001 From: Luca Denti Date: Fri, 28 Oct 2022 10:32:43 +0200 Subject: [PATCH 07/13] Improves chains-filtering heuristic --- extender.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/extender.cpp b/extender.cpp index 81159a5..edde907 100644 --- a/extender.cpp +++ b/extender.cpp @@ -877,22 +877,22 @@ void Extender::filter_sv_chains() { auto &sv = svs[i]; if (sv.chrom == prev.chrom && sv.s - prev.e < 2 * sv.l && prev.type == sv.type) { - // cout << sv.chrom << " " << sv.s << " " << sv.type << " ---- " << prev.s - // << endl ; // check for sequence similarity double w_r = - max((double)sv.w, (double)prev.w) / min((double)sv.w, (double)prev.w); + min((double)sv.w, (double)prev.w) / max((double)sv.w, (double)prev.w); double l_r = min((double)sv.l, (double)prev.l) / max((double)sv.l, (double)prev.l); - if (w_r >= 2 && l_r >= 0.9) { - double d; + int d = sv.s - prev.s; + if (d < 100 && w_r >= 0.9 && + l_r >= config->min_ratio) { // FIXME: hardcoded + use different ratio + // here. min_ratio was for clusters + double sim; if (sv.type == "DEL") { - d = rapidfuzz::fuzz::ratio(sv.refall, prev.refall); + sim = rapidfuzz::fuzz::ratio(sv.refall, prev.refall); } else { - d = rapidfuzz::fuzz::ratio(sv.altall, prev.altall); + sim = rapidfuzz::fuzz::ratio(sv.altall, prev.altall); } - // cout << w_r << " " << l_r << " " << d << endl ; - if (d > 70) { + if (sim > 70) { if (sv.w > prev.w) { _svs.push_back(sv); } else { From d93332eeb414206939677b613a133cb9faa23570 Mon Sep 17 00:00:00 2001 From: Luca Denti Date: Fri, 28 Oct 2022 11:23:12 +0200 Subject: [PATCH 08/13] Replaces alignment matrix with nuc44 --- extender.cpp | 2 +- extender.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/extender.cpp b/extender.cpp index edde907..d86532d 100644 --- a/extender.cpp +++ b/extender.cpp @@ -753,7 +753,7 @@ void Extender::call() { parasail_result_t *result = NULL; result = parasail_nw_trace_striped_16(consensus.c_str(), consensus.size(), ref.c_str(), ref.size(), 10, 1, - ¶sail_blosum62); + ¶sail_nuc44); parasail_cigar_t *cigar = parasail_result_get_cigar(result, consensus.c_str(), consensus.size(), ref.c_str(), ref.size(), NULL); diff --git a/extender.hpp b/extender.hpp index 00b5741..c1ff590 100644 --- a/extender.hpp +++ b/extender.hpp @@ -12,7 +12,7 @@ #include "htslib/sam.h" #include "interval_tree.hpp" #include "parasail.h" -#include "parasail/matrices/blosum62.h" +#include "parasail/matrices/nuc44.h" #include "rapidfuzz/fuzz.hpp" #include "rapidfuzz/utils.hpp" From 87aedf4858759aed39f175449d36b3911d9a34c2 Mon Sep 17 00:00:00 2001 From: Luca Denti Date: Thu, 5 Jan 2023 11:48:04 +0100 Subject: [PATCH 09/13] Prints name and info of missmoothed reads --- smoother.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/smoother.cpp b/smoother.cpp index 50cd12f..35ebf22 100644 --- a/smoother.cpp +++ b/smoother.cpp @@ -225,8 +225,7 @@ void Smoother::smooth_read(bam1_t *alignment, char *read_seq, string chrom, } else { if (strlen(new_seq) != n) { // FIXME: why does this happen? - cerr << "|" - << endl; // qname << " " << strlen(new_seq) << " " << n << endl; + cerr << "|" << qname << " " << strlen(new_seq) << " " << n << endl; ignored_reads[omp_get_thread_num() - 2].push_back(qname); return; } From 7ba9678fd227c8fcd5119776dd193d5a3148661d Mon Sep 17 00:00:00 2001 From: Luca Denti Date: Fri, 6 Jan 2023 12:20:59 +0100 Subject: [PATCH 10/13] Fix bug with reallocation Reallocation of read_seqs must be done only before loading them. Before smoothing we need to reallocate only the new seqs. Lenghts of seqs and new_seqs may differ - hence we store them both. --- smoother.cpp | 28 ++++++++++++++++------------ smoother.hpp | 1 + 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/smoother.cpp b/smoother.cpp index 35ebf22..7d7556a 100644 --- a/smoother.cpp +++ b/smoother.cpp @@ -56,8 +56,8 @@ void rebuild_bam_entry(bam1_t *alignment, char *seq, uint8_t *qual, memcpy(aux, alignment->data + alignment->l_data - l_aux, l_aux); // update core alignment->core.n_cigar = cigar.size(); - alignment->core.l_qseq = strlen(seq); int l = strlen(seq); + alignment->core.l_qseq = l; // rebuild data int l_data = alignment->core.l_qname + (4 * alignment->core.n_cigar) + ((l + 1) >> 1) + l + l_aux; @@ -92,15 +92,13 @@ void Smoother::smooth_read(bam1_t *alignment, char *read_seq, string chrom, for (auto op : cigar_offsets) { l += op.first; } - if (read_seq_max_lengths[_i][_j][_k] < l) { - free(read_seqs[_i][_j][_k]); + if (new_read_seq_max_lengths[_i][_j][_k] < l) { free(new_read_seqs[_i][_j][_k]); free(new_read_quals[_i][_j][_k]); // - read_seqs[_i][_j][_k] = (char *)malloc(sizeof(char) * (l + 1)); new_read_seqs[_i][_j][_k] = (char *)malloc(sizeof(char) * (l + 1)); new_read_quals[_i][_j][_k] = (uint8_t *)malloc(sizeof(char) * (l + 1)); - read_seq_max_lengths[_i][_j][_k] = l; + new_read_seq_max_lengths[_i][_j][_k] = l; } // int n = 0; @@ -224,8 +222,9 @@ void Smoother::smooth_read(bam1_t *alignment, char *read_seq, string chrom, ignored_reads[omp_get_thread_num() - 2].push_back(qname); } else { if (strlen(new_seq) != n) { - // FIXME: why does this happen? - cerr << "|" << qname << " " << strlen(new_seq) << " " << n << endl; + // CHECKME: this now should be fixed (read_seqs must not be freed above - only the new ones) + // cerr << "[W] " << qname << " " << strlen(new_seq) << "/" << strlen((char*)new_qual) << " " << n << endl; + cerr << "[W] this shouldn't happen anymore. If you see this warning, please open an issue at https://github.com/Parsoa/SVDSS/issues" << endl; ignored_reads[omp_get_thread_num() - 2].push_back(qname); return; } @@ -283,16 +282,25 @@ void Smoother::run() { int modulo = 3; int batch_size = (10000 / config->threads) * config->threads; for (int i = 0; i < modulo; i++) { + // original read sequences read_seqs.push_back( vector>(config->threads)); // current and next output + // smoothed read sequences new_read_seqs.push_back( vector>(config->threads)); // current and next output + // smoothed read qualities new_read_quals.push_back( vector>(config->threads)); // current and next output + // CHECKME: why do we need to store two lengths (read_seq_lengths and read_seq_max_lengths)? When we have to reallocate for a too long read, we change both of them. So they should always contain the same value + // original read lengths read_seq_lengths.push_back( vector>(config->threads)); // current and next output + // original read max lengths read_seq_max_lengths.push_back( vector>(config->threads)); // current and next output + // smoothed read max lengths + new_read_seq_max_lengths.push_back( + vector>(config->threads)); // current and next output for (int j = 0; j < config->threads; j++) { for (int k = 0; k < batch_size / config->threads; k++) { read_seqs[i][j].push_back((char *)malloc(sizeof(char) * (30001))); @@ -302,6 +310,7 @@ void Smoother::run() { // read_seq_lengths[i][j].push_back(30000); read_seq_max_lengths[i][j].push_back(30000); + new_read_seq_max_lengths[i][j].push_back(30000); } } } @@ -439,13 +448,8 @@ bool Smoother::load_batch_bam(int threads, int batch_size, int p) { uint32_t l = alignment->core.l_qseq; // length of the read if (read_seq_max_lengths[p][n % threads][i] < l) { free(read_seqs[p][n % threads][i]); - free(new_read_seqs[p][n % threads][i]); - free(new_read_quals[p][n % threads][i]); // read_seqs[p][n % threads][i] = (char *)malloc(sizeof(char) * (l + 1)); - new_read_seqs[p][n % threads][i] = (char *)malloc(sizeof(char) * (l + 1)); - new_read_quals[p][n % threads][i] = - (uint8_t *)malloc(sizeof(char) * (l + 1)); read_seq_max_lengths[p][n % threads][i] = l; m += 1; } diff --git a/smoother.hpp b/smoother.hpp index 1647638..e6384f3 100644 --- a/smoother.hpp +++ b/smoother.hpp @@ -48,6 +48,7 @@ class Smoother { std::vector>> read_seqs; std::vector>> new_read_seqs; std::vector>> new_read_quals; + std::vector>> new_read_seq_max_lengths; void run(); bool load_batch_bam(int threads, int batch_size, int p); From 9e124313a322c9785bc39b083b11e2102110abc4 Mon Sep 17 00:00:00 2001 From: Luca Denti Date: Thu, 12 Jan 2023 09:44:29 +0100 Subject: [PATCH 11/13] Update citation --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3ce8f9c..e1fc742 100644 --- a/README.md +++ b/README.md @@ -210,7 +210,7 @@ For inquiries on this software please open an [issue](https://github.com/Parsoa/ ### Citation -SVDSS is currently pending peer review. A pre-print is available on [BioRxiv](https://www.biorxiv.org/content/10.1101/2022.02.12.480198v1). +SVDSS is now published in [Nature Methods](https://doi.org/10.1038/s41592-022-01674-1). ##### Experiments Instructions on how to reproduce the experiments described in the manuscript can be found [here](https://github.com/ldenti/SVDSS-experiments) (also provided as submodule of this repository). From 43ad7fbe4034e50a633aba911d0272c4f4ce7eed Mon Sep 17 00:00:00 2001 From: Luca Denti Date: Thu, 12 Jan 2023 10:30:48 +0100 Subject: [PATCH 12/13] Update dependencies --- CMakeLists.txt | 37 ++++++++++++++----------------------- extender.hpp | 1 - 2 files changed, 14 insertions(+), 24 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5829612..b2d3f04 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,7 @@ cmake_minimum_required(VERSION 3.14) include(ExternalProject) +include(FetchContent) set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_STANDARD_REQUIRED True) @@ -43,13 +44,8 @@ if(NOT DEFINED CONDAPREFIX) message(STATUS "libdeflate will be built from source") ExternalProject_Add(deflate GIT_REPOSITORY https://github.com/ebiggers/libdeflate.git - GIT_TAG 047aa84e01b38d82f3612810e357bd40f14a3d39 + GIT_TAG 020133854ff73b8506fe59f92a9b5b622d360716 # v1.15 BUILD_IN_SOURCE 1 - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND make PREFIX="." - "CC=${CMAKE_C_COMPILER}" - "CFLAGS=-fPIC -O3" libdeflate.a INSTALL_COMMAND "") ExternalProject_Get_Property(deflate SOURCE_DIR) SET(DEFLATE_SOURCE_DIR ${SOURCE_DIR}) @@ -59,15 +55,15 @@ if(NOT DEFINED CONDAPREFIX) # HTSLIB ######### - SET(CONF_CMD autoheader && autoreconf && ./configure --disable-libcurl --disable-gcs --with-libdeflate "CC=${CMAKE_C_COMPILER}" "CFLAGS=-O3 -I${DEFLATE_SOURCE_DIR}" "LDFLAGS=-L${DEFLATE_SOURCE_DIR}") + SET(CONF_CMD autoheader && autoreconf -i && ./configure --disable-libcurl --disable-gcs --with-libdeflate "CC=${CMAKE_C_COMPILER}" "CFLAGS=-O3 -I${DEFLATE_SOURCE_DIR}" "LDFLAGS=-L${DEFLATE_SOURCE_DIR}") if (HOLYBUILD) # we have to include local libraries (libbz2 and liblzma) - SET(CONF_CMD autoheader && autoreconf && ./configure --disable-libcurl --disable-gcs --with-libdeflate "CC=${CMAKE_C_COMPILER}" "CFLAGS=-O3 -I${DEFLATE_SOURCE_DIR} -I${BZIP_LIB_DIR}" "LDFLAGS=-L${DEFLATE_SOURCE_DIR} -L${BZIP_LIB_DIR} -L${LZMA_LIB_DIR}") + SET(CONF_CMD autoheader && autoreconf -i && ./configure --disable-libcurl --disable-gcs --with-libdeflate "CC=${CMAKE_C_COMPILER}" "CFLAGS=-O3 -I${DEFLATE_SOURCE_DIR} -I${BZIP_LIB_DIR}" "LDFLAGS=-L${DEFLATE_SOURCE_DIR} -L${BZIP_LIB_DIR} -L${LZMA_LIB_DIR}") endif () message(STATUS "htslib will be built from source") ExternalProject_Add(htslib GIT_REPOSITORY https://github.com/samtools/htslib.git - GIT_TAG 2cd99cf333938a9eb70393bc59379e1d6ea66b37 + GIT_TAG 4e61c128238f3e7cbb3b1f4e9c0fdb4880aa9a10 # v1.16 UPDATE_COMMAND "" BUILD_IN_SOURCE 1 CONFIGURE_COMMAND ${CONF_CMD} @@ -98,7 +94,7 @@ else () endif () ExternalProject_Add(abpoa GIT_REPOSITORY https://github.com/yangao07/abPOA.git - GIT_TAG 16c64e5163cbd0502792ca20f1069fe40c929b0d + GIT_TAG 16c64e5163cbd0502792ca20f1069fe40c929b0d # v1.41 PATCH_COMMAND ${PATCH_CMD} INSTALL_COMMAND "" ) @@ -114,7 +110,7 @@ SET(ABPOA_INCLUDE_DIR ${ABPOA_SOURCE_DIR}/include) message(STATUS "parasail will be built from source") ExternalProject_Add(parasail GIT_REPOSITORY https://github.com/jeffdaily/parasail.git - GIT_TAG e236a6feec7bc84fb81923d783acb345e538a281 + GIT_TAG d23aed253fb5ecc7aeab0f81b6d64fcc26448089 # v2.6 UPDATE_COMMAND "" BUILD_IN_SOURCE 1 CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF @@ -128,22 +124,16 @@ set_target_properties(PARASAIL PROPERTIES IMPORTED_LOCATION ${PARASAIL_SOURCE_DI # rapidfuzz-cpp ################ -ExternalProject_Add(rapidfuzz - GIT_REPOSITORY https://github.com/maxbachmann/rapidfuzz-cpp.git - GIT_TAG d1e82379395cafc6d439c1c1e2cbe7512eaf2518 - BUILD_IN_SOURCE 1 - UPDATE_COMMAND "" - INSTALL_COMMAND "" -) -ExternalProject_Get_Property(rapidfuzz SOURCE_DIR) -SET(RAPIDFUZZ_SOURCE_DIR ${SOURCE_DIR}) -SET(RAPIDFUZZ_INCLUDE_DIR ${RAPIDFUZZ_SOURCE_DIR}) +FetchContent_Declare(rapidfuzz + GIT_REPOSITORY https://github.com/maxbachmann/rapidfuzz-cpp.git + GIT_TAG 5412d5d877518e7754394bdbec76e45c8187c631) # v1.10.4 +FetchContent_MakeAvailable(rapidfuzz) # interval-tree ################ ExternalProject_Add(intervaltree GIT_REPOSITORY https://github.com/5cript/interval-tree.git - GIT_TAG 4d7c66ce4fb542a29a7d74039f8e7c92fae3d6f8 + GIT_TAG 309b9c725191d4bb1d134f28a8a32ad2f68a8ffa UPDATE_COMMAND "" CONFIGURE_COMMAND "" BUILD_COMMAND "" @@ -151,7 +141,7 @@ ExternalProject_Add(intervaltree ) ExternalProject_Get_Property(intervaltree SOURCE_DIR) SET(INTERVALTREE_SOURCE_DIR ${SOURCE_DIR}) -SET(INTERVALTREE_INCLUDE_DIR ${INTERVALTREE_SOURCE_DIR}) +SET(INTERVALTREE_INCLUDE_DIR ${INTERVALTREE_SOURCE_DIR}/include/interval-tree) # ropebwt2 ########## @@ -213,6 +203,7 @@ else() endif() target_link_libraries(SVDSS + PUBLIC rapidfuzz::rapidfuzz PUBLIC ${BINARY_DIR}/lib/libabpoa.a PUBLIC PARASAIL PUBLIC ROPEBWT diff --git a/extender.hpp b/extender.hpp index c1ff590..ac9e499 100644 --- a/extender.hpp +++ b/extender.hpp @@ -14,7 +14,6 @@ #include "parasail.h" #include "parasail/matrices/nuc44.h" #include "rapidfuzz/fuzz.hpp" -#include "rapidfuzz/utils.hpp" #include "bam.hpp" #include "chromosomes.hpp" From aee62f461437e647accb5015ef15e804849caa2a Mon Sep 17 00:00:00 2001 From: Luca Denti Date: Fri, 13 Jan 2023 10:47:48 +0100 Subject: [PATCH 13/13] Add smoothing treshold to cli --- config.cpp | 4 ++++ config.hpp | 1 + smoother.cpp | 3 ++- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/config.cpp b/config.cpp index 5ad12bf..42f5c03 100644 --- a/config.cpp +++ b/config.cpp @@ -39,6 +39,7 @@ Configuration::Configuration() "selective", "", cxxopts::value()->default_value("true"))( "version", "Print version information.")("help", "Print this help.")( "l", "", cxxopts::value())( + "acc", "", cxxopts::value())( "verbose", "", cxxopts::value()->default_value("false")); } @@ -112,6 +113,9 @@ void Configuration::parse(int argc, char **argv) { if (results.count("l")) { min_ratio = results["l"].as(); } + if (results.count("acc")) { + al_accuracy = results["acc"].as(); + } binary = results["binary"].as(); clipped = results["clipped"].as(); assemble = results["assemble"].as(); diff --git a/config.hpp b/config.hpp index 3216153..4a69573 100644 --- a/config.hpp +++ b/config.hpp @@ -30,6 +30,7 @@ class Configuration { int aggregate_batches = 5; int min_cluster_weight = 2; float min_ratio = 0.97; // FIXME: change name + float al_accuracy = 0.02; bool binary = false; bool clipped = false; diff --git a/smoother.cpp b/smoother.cpp index 7d7556a..f2ab8d9 100644 --- a/smoother.cpp +++ b/smoother.cpp @@ -217,7 +217,8 @@ void Smoother::smooth_read(bam1_t *alignment, char *read_seq, string chrom, // cerr << p.first << op; // } // cerr << endl; - if (num_mismatch / num_match >= 0.02 || should_ignore) { // FIXME: HARDCODED + + if (num_mismatch / num_match >= config->al_accuracy || should_ignore) { // read is too dirty or is not interesting, just skip it ignored_reads[omp_get_thread_num() - 2].push_back(qname); } else {