From 2a2a36d69ecb9e5d778ab245682ca02b05d2ff38 Mon Sep 17 00:00:00 2001
From: Luca Denti <ldenti92@gmail.com>
Date: Fri, 28 Oct 2022 10:17:47 +0200
Subject: [PATCH 01/13] Filters alignments using a fixed treshold

---
 smoother.cpp | 111 +++++++++++++++++++--------------------------------
 1 file changed, 42 insertions(+), 69 deletions(-)

diff --git a/smoother.cpp b/smoother.cpp
index a39a7b0..50cd12f 100644
--- a/smoother.cpp
+++ b/smoother.cpp
@@ -27,7 +27,6 @@ using namespace std;
 // } bam1_core_t;
 
 int do_realloc_bam_data(bam1_t *b, size_t desired) {
-  // cout << "Extending BAM entry." << endl ;
   uint32_t new_m_data;
   uint8_t *new_data;
   new_m_data = desired;
@@ -115,7 +114,7 @@ void Smoother::smooth_read(bam1_t *alignment, char *read_seq, string chrom,
   uint8_t *qual = bam_get_qual(alignment);
   uint8_t *new_qual = new_read_quals[_i][_j][_k];
   int pos = alignment->core.pos +
-            1; // this is 0-based, variant cpoordinates are 1-based
+            1; // this is 0-based, variant coordinates are 1-based
   // Modify current bam1_t* struct
   auto &core = alignment->core;
   vector<pair<uint32_t, uint32_t>> new_cigar;
@@ -195,7 +194,6 @@ void Smoother::smooth_read(bam1_t *alignment, char *read_seq, string chrom,
       soft_clip_offset += cigar_offsets[m].first;
       new_cigar.push_back(cigar_offsets[m]);
     } else {
-      cout << "Illegal Cigar OP" << endl;
       break;
       // if (cigar_offsets[m].second == BAM_CPAD || cigar_offsets[m].second ==
       // BAM_CHARD_CLIP || cigar_offsets[m].second == BAM_CBACK) {
@@ -204,43 +202,37 @@ void Smoother::smooth_read(bam1_t *alignment, char *read_seq, string chrom,
   }
   new_seq[n] = '\0';
   new_qual[n] = '\0';
-  // int r = 0 ;
-  // for (auto op: new_cigar) {
-  //     if (op.second != BAM_CDEL) {
-  //         n -= op.first ;
-  //     }
-  //     r += op.first ;
-  // }
-  // assert(n == 0) ;
+
   char *qname = bam_get_qname(alignment);
-  // cout << n << " " << strlen(new_seq) << " " << r << endl ;
-  // cout << bam_get_qname(alignment) << endl ;
-  //  only do this on first processing thread
-  if (omp_get_thread_num() == 2) {
-    global_num_bases += num_match;
-    global_num_mismatch += num_mismatch;
-  }
-  // how many errors and SNPs do we expect? 1/1000 each, so say if we see more
-  // than twice that then don't correct
-  if (config->selective) {
-    if (num_mismatch / num_match > 3 * expected_mismatch_rate) {
-      if (omp_get_thread_num() == 3) {
-        num_ignored_reads += 1;
-      }
-      return;
-    }
-    // if we have so many deletions and insertions, then abort
-    if (ins_offset + del_offset > 0.7 * strlen(read_seq)) {
+  // char op = 'M';
+  // cerr << qname << " " << strlen(new_seq) << " " << n << endl;
+  // for (const auto p : new_cigar) {
+  //   if (p.second == BAM_CMATCH || p.second == BAM_CEQUAL ||
+  //       p.second == BAM_CDIFF)
+  //     op = 'M';
+  //   else if (p.second == BAM_CINS)
+  //     op = 'I';
+  //   else if (p.second == BAM_CDEL)
+  //     op = 'D';
+  //   else if (p.second == BAM_CSOFT_CLIP)
+  //     op = 'S';
+  //   cerr << p.first << op;
+  // }
+  // cerr << endl;
+  if (num_mismatch / num_match >= 0.02 || should_ignore) { // FIXME: HARDCODED
+    // read is too dirty or is not interesting, just skip it
+    ignored_reads[omp_get_thread_num() - 2].push_back(qname);
+  } else {
+    if (strlen(new_seq) != n) {
+      // FIXME: why does this happen?
+      cerr << "|"
+           << endl; // qname << " " << strlen(new_seq) << " " << n << endl;
+      ignored_reads[omp_get_thread_num() - 2].push_back(qname);
       return;
     }
+    smoothed_reads[omp_get_thread_num() - 2].push_back(qname);
+    rebuild_bam_entry(alignment, new_seq, new_qual, new_cigar);
   }
-  if (should_ignore) {
-    // check mismatch rate
-    ignored_reads[omp_get_thread_num() - 2].push_back(qname);
-    return;
-  }
-  smoothed_reads[omp_get_thread_num() - 2].push_back(qname);
-  rebuild_bam_entry(alignment, new_seq, new_qual, new_cigar);
 }
 
 void Smoother::process_batch(vector<bam1_t *> bam_entries, int p, int i) {
@@ -256,7 +248,6 @@ void Smoother::process_batch(vector<bam1_t *> bam_entries, int p, int i) {
       continue;
     }
     if (alignment->core.l_qseq < 2) {
-      // cerr << "Read too short, ignoring.." << endl ;
       continue;
     }
     if (alignment->core.tid < 0) {
@@ -323,6 +314,7 @@ void Smoother::run() {
       }
     }
   }
+
   int b = 0;
   lprint({"Loading first batch.."});
   load_batch_bam(config->threads, batch_size, 1);
@@ -336,9 +328,7 @@ void Smoother::run() {
   bool should_terminate = false;
   bool loaded_last_batch = false;
   int reads_written = 0;
-  // lprint({"Starting main loop.."}) ;
   while (should_process) {
-    // lprint({"Beginning batch", to_string(b + 1)});
     if (!should_load) {
       should_process = false;
     }
@@ -351,11 +341,6 @@ void Smoother::run() {
         if (should_load) {
           loaded_last_batch =
               !load_batch_bam(config->threads, batch_size, (p + 1) % modulo);
-          // if (loaded_last_batch) {
-          //     lprint({"Last input batch loaded."});
-          // } else {
-          //     lprint({"Loaded."});
-          // }
         }
       } else if (i == 1) {
         if (b >= 1) {
@@ -363,7 +348,18 @@ void Smoother::run() {
           for (int k = 0; k < batch_size / config->threads; k++) {
             for (int j = 0; j < config->threads; j++) {
               if (bam_entries[(p + 2) % modulo][j][k] != nullptr) {
-                auto alignment = bam_entries[(p + 2) % modulo][j][k];
+                // TODO: just store smoothed reads in the output .bam
+                // if (find(smoothed_reads[j].begin(), smoothed_reads[j].end(),
+                //          bam_get_qname(bam_entries[(p + 2) % modulo][j][k]))
+                //          !=
+                //     smoothed_reads[j].end()) {
+                if (bam_entries[(p + 2) % modulo][j][k]->core.flag &
+                        BAM_FUNMAP ||
+                    bam_entries[(p + 2) % modulo][j][k]->core.flag &
+                        BAM_FSUPPLEMENTARY ||
+                    bam_entries[(p + 2) % modulo][j][k]->core.flag &
+                        BAM_FSECONDARY)
+                  continue;
                 ret = sam_write1(out_bam_file, bam_header,
                                  bam_entries[(p + 2) % modulo][j][k]);
                 reads_written += 1;
@@ -372,6 +368,7 @@ void Smoother::run() {
                   should_terminate = true;
                   break;
                 }
+                // }
               } else {
                 break;
               }
@@ -388,13 +385,6 @@ void Smoother::run() {
       lprint({"Something went wrong, aborting.."}, 2);
       return;
     }
-    // if (!should_load) {
-    //     lprint({"Processed last batch of inputs."});
-    // }
-    // if (!should_process) {
-    //     lprint({"Exiting accelerator loop."});
-    //     break ;
-    // }
     p += 1;
     p %= modulo;
     b += 1;
@@ -403,20 +393,7 @@ void Smoother::run() {
     if (s - t == 0) {
       s += 1;
     }
-    cerr << "[I] Processed batch " << b << ". Reads so far " << reads_processed
-         << ". Reads per second: " << reads_processed / (s - t)
-         << ". Time: " << s - t << "\n";
-    cerr << "[I] Processed bases: " << uint64_t(global_num_bases)
-         << ", num mismatch: " << uint64_t(global_num_mismatch)
-         << ", mismatch rate: " << global_num_mismatch / global_num_bases
-         << ", ignored reads: " << num_ignored_reads << "\n";
-    expected_mismatch_rate = global_num_mismatch / global_num_bases;
-    cerr << "\x1b[A";
-    cerr << "\x1b[A";
   }
-  cerr << endl;
-  cerr << endl;
-  lprint({"Done."});
   sam_close(bam_file);
   sam_close(out_bam_file);
   dump_smoothed_read_ids();
@@ -487,22 +464,18 @@ bool Smoother::load_batch_bam(int threads, int batch_size, int p) {
       break;
     }
   }
-  // cout << m << " reallocations.." << endl ;
   //  last batch was incomplete
   if (n != batch_size) {
     for (int j = n % threads; j < threads; j++) {
-      // cout << "Terminus at " << j << " " << i << endl ;
       for (int _ = i; _ < batch_size / threads; _++) {
         bam_entries[p][j][_] = nullptr;
       }
     }
     for (int j = 0; j < n % threads; j++) {
-      // cout << "Terminus at " << j << " " << i + 1 << endl ;
       for (int _ = i + 1; _ < batch_size / threads; _++) {
         bam_entries[p][j][_] = nullptr;
       }
     }
   }
-  // lprint({"Loaded", to_string(n), "BAM reads.."});
   return n != 0 ? true : false;
 }

From aafc5b9fd0fd236939ac0b909537274798706d66 Mon Sep 17 00:00:00 2001
From: Luca Denti <ldenti92@gmail.com>
Date: Fri, 28 Oct 2022 10:22:20 +0200
Subject: [PATCH 02/13] Useless commit to clean(?) code

---
 cluster.cpp  | 13 ++++++++++---
 extender.cpp | 40 +++++++++++-----------------------------
 vcf.cpp      |  2 +-
 3 files changed, 22 insertions(+), 33 deletions(-)

diff --git a/cluster.cpp b/cluster.cpp
index 04c3105..9ad1e25 100644
--- a/cluster.cpp
+++ b/cluster.cpp
@@ -27,13 +27,20 @@ string Cluster::poa() {
   abpt->out_msa = 0;
   abpt->out_cons = 1;
   abpt->out_gfa = 0;
-  // abpt->w = 6, abpt->k = 9;
-  // abpt->min_w = 10; // minimizer-based seeding and partition
   // abpt->is_diploid = 1;
-  // abpt->min_freq = 0.3;
   abpt->progressive_poa = 0;
   abpoa_post_set_para(abpt);
 
+  // abpt->match = 2;      // match score
+  // abpt->mismatch = 4;   // mismatch penalty
+  // abpt->gap_mode = ABPOA_CONVEX_GAP; // gap penalty mode
+  // abpt->gap_open1 = 4;  // gap open penalty #1
+  // abpt->gap_ext1 = 2;   // gap extension penalty #1
+  // abpt->gap_open2 = 24; // gap open penalty #2
+  // abpt->gap_ext2 = 1;   // gap extension penalty #2
+  // gap_penalty = min{gap_open1 + gap_len * gap_ext1, gap_open2 + gap_len *
+  // gap_ext2}
+
   int *seq_lens = (int *)malloc(sizeof(int) * n_seqs);
   uint8_t **bseqs = (uint8_t **)malloc(sizeof(uint8_t *) * n_seqs);
   for (uint i = 0; i < n_seqs; ++i) {
diff --git a/extender.cpp b/extender.cpp
index 9740f4e..825d537 100644
--- a/extender.cpp
+++ b/extender.cpp
@@ -42,9 +42,9 @@ void Extender::run(int _threads) {
           make_pair(cluster.first, ExtCluster(cluster.second)));
     }
   }
-  for (const auto &cluster : _ext_clusters) {
+  for (const auto &cluster : _ext_clusters)
     ext_clusters.push_back(cluster.second);
-  }
+
   // process each cluster separately
   _p_clusters.resize(threads);
   extract_sfs_sequences();
@@ -53,6 +53,7 @@ void Extender::run(int _threads) {
     clusters.insert(clusters.begin(), _p_clusters[i].begin(),
                     _p_clusters[i].end());
   }
+
   // merge POA alignments
   _p_svs.resize(threads);
   _p_alignments.resize(threads);
@@ -166,11 +167,6 @@ void Extender::extend_parallel() {
         // load next batch of entries
         if (should_load) {
           loaded_last_batch = !load_batch_bam(threads, batch_size, (p + 1) % 2);
-          if (loaded_last_batch) {
-            // lprint({"Last input batch loaded."});
-          } else {
-            // lprint({"Loaded."});
-          }
         }
       } else {
         process_batch(bam_entries[p][t - 1], t - 1);
@@ -178,9 +174,6 @@ void Extender::extend_parallel() {
     }
 #pragma omp single
     {
-      // if (loaded_last_batch) {
-      //     break ;
-      // }
       p += 1;
       p %= 2;
       b += 1;
@@ -235,16 +228,11 @@ bool Extender::load_batch_bam(int threads, int batch_size, int p) {
     }
   }
   if (n != 0 && n != batch_size) {
-    for (int j = n % threads; j < threads; j++) {
-      // cout << "Terminus at " << j << " " << i << endl ;
+    for (int j = n % threads; j < threads; j++)
       bam_entries[p][j][i] = nullptr;
-    }
-    for (int j = 0; j < n % threads; j++) {
-      if (i + 1 < bam_entries[p][j].size()) {
-        // cout << "Terminus at " << j << " " << i + 1 << endl ;
+    for (int j = 0; j < n % threads; j++)
+      if (i + 1 < bam_entries[p][j].size())
         bam_entries[p][j][i + 1] = nullptr;
-      }
-    }
   }
   // lprint({"Loaded", to_string(n), "BAM reads.."});
   return n != 0 ? true : false;
@@ -255,7 +243,6 @@ void Extender::process_batch(vector<bam1_t *> bam_entries, int index) {
   for (int b = 0; b < bam_entries.size(); b++) {
     aln = bam_entries[b];
     if (aln == nullptr) {
-      // cout << "Terminus " << b << " " << index << " " << endl ;
       break;
     }
     extend_alignment(aln, index);
@@ -302,7 +289,6 @@ void Extender::extend_alignment(bam1_t *aln, int index) {
         break;
       }
     }
-    // cout << refs << "-" << refe << endl ;
     // We extract the local alignment of the region of interest
     if (refs == -1 && refe == -1) {
       // we couldn't place the first and the last base, so we skip this -
@@ -426,9 +412,7 @@ bool overlap(int s1, int e1, const ExtSFS &sfs) {
 }
 
 void Extender::cluster_no_interval_tree() {
-  lprint({"Sorting ", to_string(extended_sfs.size()), " extended SFS.."});
-  std::sort(extended_sfs.begin(), extended_sfs.end());
-  lprint({"Done."});
+  sort(extended_sfs.begin(), extended_sfs.end());
   auto r = std::max_element(extended_sfs.begin(), extended_sfs.end(),
                             [](const ExtSFS &lhs, const ExtSFS &rhs) {
                               return lhs.e - lhs.s < rhs.e - rhs.s;
@@ -565,8 +549,8 @@ void Extender::extract_sfs_sequences() {
           aln->core.flag & BAM_FSECONDARY) {
         continue;
       }
-      ++cov; // FIXME: this cov takes into account also reads starting or ending
-             // inside the cluster (maybe we should skip those?)
+      ++cov; // FIXME: this cov takes into account also reads starting or
+             // ending inside the cluster (maybe we should skip those?)
 
       char *qname = bam_get_qname(aln);
       if (reads.find(qname) == reads.end()) {
@@ -735,6 +719,7 @@ void Extender::call() {
       vector<SV> _svs; // svs on current cluster
       string ref = string(chromosome_seqs[chrom] + c.s, c.e - c.s + 1);
       string consensus = c.poa();
+
       parasail_result_t *result = NULL;
       result = parasail_nw_trace_striped_16(consensus.c_str(), consensus.size(),
                                             ref.c_str(), ref.size(), 10, 1,
@@ -801,7 +786,7 @@ void Extender::call() {
       //    if (del.s <= sv.e) {
       //        sv.e = del.e ;
       //        int overlap = sv.e - del.s + 1 ;
-      //        sv.refall = sv.altall + del..substr(overlap, del.l - overlap) ;
+      //        sv.refall = sv.altall + del..substr(overlap, del.l - overlap);
       //    }
       //}
       // -- Combine svs with same length (maybe useless now - only if diploid
@@ -882,9 +867,6 @@ void Extender::filter_sv_chains() {
         }
       }
     }
-    if (prev.s == 205603627) {
-      break;
-    }
     _svs.push_back(prev);
     prev = sv;
   }
diff --git a/vcf.cpp b/vcf.cpp
index 8979e8d..95775d6 100644
--- a/vcf.cpp
+++ b/vcf.cpp
@@ -87,4 +87,4 @@ void print_vcf_header(const unordered_map<string, char *> &ref, ofstream &o,
   o << "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">" << endl;
   o << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" << sample
     << endl;
-}
\ No newline at end of file
+}

From 21ca02d0fd3e8f4b82407023f15a2a1e59ff48b7 Mon Sep 17 00:00:00 2001
From: Luca Denti <ldenti92@gmail.com>
Date: Fri, 28 Oct 2022 10:23:39 +0200
Subject: [PATCH 03/13] Adds consensus CIGAR to variant INFO field

---
 caller.cpp   | 3 +++
 extender.cpp | 4 ++--
 sv.cpp       | 6 ++++--
 sv.hpp       | 6 ++++--
 vcf.cpp      | 3 +++
 5 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/caller.cpp b/caller.cpp
index 95eb25f..e27a949 100644
--- a/caller.cpp
+++ b/caller.cpp
@@ -164,6 +164,9 @@ void Caller::print_vcf_header() {
   ovcf << "##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description=\"Imprecise "
           "structural variation\">"
        << endl;
+  ovcf << "##INFO=<ID=CIGAR,Number=A,Type=String,Description=\"CIGAR of "
+          "consensus\">"
+       << endl;
   ovcf << "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">"
        << endl;
   ovcf << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tDEFAULT"
diff --git a/extender.cpp b/extender.cpp
index 825d537..a2e5c81 100644
--- a/extender.cpp
+++ b/extender.cpp
@@ -749,7 +749,7 @@ void Extender::call() {
             SV sv = SV("INS", c.chrom, rpos,
                        string(chromosome_seqs[chrom] + rpos - 1, 1),
                        consensus.substr(cpos, l), c.size(), c.cov, nv, score,
-                       false, l);
+                       false, l, cigar_str);
             _svs.push_back(sv);
             nv++;
           }
@@ -759,7 +759,7 @@ void Extender::call() {
             SV sv = SV("DEL", c.chrom, rpos,
                        string(chromosome_seqs[chrom] + rpos - 1, l),
                        string(chromosome_seqs[chrom] + rpos - 1, 1), c.size(),
-                       c.cov, nv, score, false, l);
+                       c.cov, nv, score, false, l, cigar_str);
             _svs.push_back(sv);
             nv++;
           }
diff --git a/sv.cpp b/sv.cpp
index 45d3edf..26b5c67 100644
--- a/sv.cpp
+++ b/sv.cpp
@@ -6,7 +6,7 @@ SV::SV() { l = 0; }
 
 SV::SV(const string type_, const string &chrom_, uint s_, const string &refall_,
        const string &altall_, const uint w_, const uint cov_, const int ngaps_,
-       const int score_, bool imprecise_, uint l_) {
+       const int score_, bool imprecise_, uint l_, string cigar_) {
   type = type_;
   chrom = chrom_;
   s = s_;
@@ -19,6 +19,7 @@ SV::SV(const string type_, const string &chrom_, uint s_, const string &refall_,
   ngaps = ngaps_;
   score = score_;
   imprecise = imprecise_;
+  cigar = cigar_;
   idx = type + "_" + chrom + ":" + to_string(s) + "-" + to_string(e);
   idx += "_" + to_string(abs(l));
   gt = "./.";
@@ -54,7 +55,8 @@ ostream &operator<<(ostream &os, const SV &sv) {
      << "WEIGHT=" << sv.w << ";"
      << "COV=" << sv.cov << ";"
      << "AS=" << sv.score << ";"
-     << "NV=" << sv.ngaps
+     << "NV=" << sv.ngaps << ";"
+     << "CIGAR=" << sv.cigar
      << (sv.imprecise ? ";IMPRECISE\t" : "\t")
      // -
      << "GT"
diff --git a/sv.hpp b/sv.hpp
index 626b4c4..94430cc 100644
--- a/sv.hpp
+++ b/sv.hpp
@@ -6,8 +6,9 @@
 #include <string>
 #include <unordered_map>
 
-class SV {
+using namespace std;
 
+class SV {
 public:
   std::string type;
   std::string chrom;
@@ -23,12 +24,13 @@ class SV {
   int score;
   std::string gt;
   bool imprecise;
+  string cigar;
 
   SV();
   SV(const std::string type_, const std::string &chrom_, uint s_,
      const std::string &refall_, const std::string &altall_, const uint w_,
      const uint cov_, const int ngaps_, const int score_,
-     bool imprecise_ = false, uint l_ = 0);
+     bool imprecise_ = false, uint l_ = 0, string cigar_ = ".");
   void genotype();
 
   bool operator<(const SV &c) const {
diff --git a/vcf.cpp b/vcf.cpp
index 95775d6..33d35af 100644
--- a/vcf.cpp
+++ b/vcf.cpp
@@ -84,6 +84,9 @@ void print_vcf_header(const unordered_map<string, char *> &ref, ofstream &o,
   o << "##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description=\"Imprecise "
        "structural variation\">"
     << endl;
+  o << "##INFO=<ID=CIGAR,Number=A,Type=String,Description=\"CIGAR of "
+       "consensus\">"
+    << endl;
   o << "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">" << endl;
   o << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" << sample
     << endl;

From f1db0647c26211868e029c21c5d0e66d7b40857d Mon Sep 17 00:00:00 2001
From: Luca Denti <ldenti92@gmail.com>
Date: Fri, 28 Oct 2022 10:25:16 +0200
Subject: [PATCH 04/13] Adds verbose mode

---
 config.cpp   |  4 +++-
 config.hpp   |  1 +
 extender.cpp | 31 ++++++++++++++++++++++++++++++-
 3 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/config.cpp b/config.cpp
index 9384524..dadeb0e 100644
--- a/config.cpp
+++ b/config.cpp
@@ -37,7 +37,8 @@ Configuration::Configuration()
       "b,binary", "", cxxopts::value<bool>()->default_value("false"))(
       "aggregate", "", cxxopts::value<bool>()->default_value("false"))(
       "selective", "", cxxopts::value<bool>()->default_value("true"))(
-      "version", "Print version information.")("help", "Print this help.");
+      "version", "Print version information.")("help", "Print this help.")(
+      "verbose", "", cxxopts::value<bool>()->default_value("false"));
 }
 
 void Configuration::parse(int argc, char **argv) {
@@ -114,5 +115,6 @@ void Configuration::parse(int argc, char **argv) {
   aggregate = results["aggregate"].as<bool>();
   selective = results["selective"].as<bool>();
   version = results["version"].as<bool>();
+  verbose = results["verbose"].as<bool>();
   help = results["help"].as<bool>();
 }
diff --git a/config.hpp b/config.hpp
index fc6f94c..e3c0d77 100644
--- a/config.hpp
+++ b/config.hpp
@@ -37,6 +37,7 @@ class Configuration {
   bool aggregate = false;
   bool selective = true;
   bool version = false;
+  bool verbose = false;
   bool help = false;
 
   std::string bed;
diff --git a/extender.cpp b/extender.cpp
index a2e5c81..1c51a39 100644
--- a/extender.cpp
+++ b/extender.cpp
@@ -689,7 +689,30 @@ void Extender::call() {
       continue;
     }
     const auto &clusters_by_len = cluster_by_length(cluster);
-    // --- Sorting clusters by #sequences to get first 2 most weighted clusters
+    if (config->verbose) {
+      cout << "1 " << cluster.chrom << "\t" << cluster.s << "\t" << cluster.e
+           << "\t" << cluster.cov << "\t" << cluster.seqs.size() << "\t";
+      for (const auto s : cluster.seqs)
+        cout << s.size() << ",";
+      cout << "\t";
+      for (const auto s : cluster.seqs)
+        cout << s << ",";
+      cout << endl;
+      if (clusters_by_len.size() > 1) {
+        for (const auto c : clusters_by_len) {
+          cout << "- " << c.chrom << "\t" << c.s << "\t" << c.e << "\t" << c.cov
+               << "\t" << c.seqs.size() << "\t";
+          for (const auto s : c.seqs)
+            cout << s.size() << ",";
+          cout << "\t";
+          for (const auto s : c.seqs)
+            cout << s << ",";
+          cout << endl;
+        }
+      }
+    }
+    // --- Sorting clusters by #sequences to get first 2 most weighted
+    // clusters
     int i_max1 = -1;
     int i_max2 = -1;
     uint v_max1 = 0;
@@ -728,6 +751,9 @@ void Extender::call() {
           parasail_result_get_cigar(result, consensus.c_str(), consensus.size(),
                                     ref.c_str(), ref.size(), NULL);
       string cigar_str = parasail_cigar_decode(cigar);
+      if (config->verbose)
+        cout << ref << "," << consensus << "," << cigar_str << "\n"
+             << "#" << endl;
       int score = result->score;
       parasail_cigar_free(cigar);
       parasail_result_free(result);
@@ -825,6 +851,9 @@ void Extender::call() {
 
 void Extender::filter_sv_chains() {
   std::sort(svs.begin(), svs.end());
+  if (config->verbose)
+    for (const auto sv : svs)
+      cout << sv << endl;
   if (svs.size() < 2) {
     return;
   }

From 81bf58f98108c08d19214ca09a7390abf39d09b8 Mon Sep 17 00:00:00 2001
From: Luca Denti <ldenti92@gmail.com>
Date: Fri, 28 Oct 2022 10:28:45 +0200
Subject: [PATCH 05/13] Fixes bug that made us miss a SFS per cluster

---
 extender.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/extender.cpp b/extender.cpp
index 1c51a39..5d7a172 100644
--- a/extender.cpp
+++ b/extender.cpp
@@ -467,7 +467,9 @@ void Extender::cluster_no_interval_tree() {
         low = min(low, sfs.s);
         high = max(high, sfs.e);
       } else {
-        for (int k = last_j; k < j; k++) {
+        for (int k = last_j; k < j;
+             k++) { // CHECKME: < or <=?
+                    // NOTE: <= makes the code waaaay slower
           _p_sfs_clusters[t][make_pair(low, high)].push_back(extended_sfs[k]);
         }
         low = sfs.s;
@@ -475,7 +477,9 @@ void Extender::cluster_no_interval_tree() {
         last_j = j;
       }
     }
-    for (int k = last_j; k < intervals[i].second; k++) {
+    for (int k = last_j; k <= intervals[i].second;
+         k++) { // CHECKME: it was < but in that way
+                // we were losing an sfs per cluster
       _p_sfs_clusters[t][make_pair(low, high)].push_back(extended_sfs[k]);
     }
     if (t == 0) {

From 117d949bce2be8a953535e734d29323c4b328cf9 Mon Sep 17 00:00:00 2001
From: Luca Denti <ldenti92@gmail.com>
Date: Fri, 28 Oct 2022 10:31:33 +0200
Subject: [PATCH 06/13] Uses ratio-based (on lenghts) clustering for clusters

---
 config.cpp   | 4 ++++
 config.hpp   | 1 +
 extender.cpp | 7 +++++--
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/config.cpp b/config.cpp
index dadeb0e..5ad12bf 100644
--- a/config.cpp
+++ b/config.cpp
@@ -38,6 +38,7 @@ Configuration::Configuration()
       "aggregate", "", cxxopts::value<bool>()->default_value("false"))(
       "selective", "", cxxopts::value<bool>()->default_value("true"))(
       "version", "Print version information.")("help", "Print this help.")(
+      "l", "", cxxopts::value<float>())(
       "verbose", "", cxxopts::value<bool>()->default_value("false"));
 }
 
@@ -108,6 +109,9 @@ void Configuration::parse(int argc, char **argv) {
   if (results.count("min-cluster-weight")) {
     min_cluster_weight = results["min-cluster-weight"].as<int>();
   }
+  if (results.count("l")) {
+    min_ratio = results["l"].as<float>();
+  }
   binary = results["binary"].as<bool>();
   clipped = results["clipped"].as<bool>();
   assemble = results["assemble"].as<bool>();
diff --git a/config.hpp b/config.hpp
index e3c0d77..3216153 100644
--- a/config.hpp
+++ b/config.hpp
@@ -29,6 +29,7 @@ class Configuration {
   int min_indel_length = 20;
   int aggregate_batches = 5;
   int min_cluster_weight = 2;
+  float min_ratio = 0.97; // FIXME: change name
 
   bool binary = false;
   bool clipped = false;
diff --git a/extender.cpp b/extender.cpp
index 5d7a172..81159a5 100644
--- a/extender.cpp
+++ b/extender.cpp
@@ -648,9 +648,12 @@ vector<Cluster> Extender::cluster_by_length(const Cluster &cluster) {
   for (const string &seq : cluster.get_seqs()) {
     int i;
     for (i = 0; i < clusters_by_len.size(); i++) {
-      if (abs((int)clusters_by_len[i].get_len() - (int)seq.size()) <= min_d) {
+      float cl = clusters_by_len[i].get_len();
+      float sl = seq.size();
+      if (min(cl, sl) / max(cl, sl) >= config->min_ratio)
+        // if (abs((int)clusters_by_len[i].get_len() - (int)seq.size()) <=
+        // min_d) {
         break;
-      }
     }
     if (i == clusters_by_len.size()) {
       clusters_by_len.push_back(

From 9a91453745cfe28dd7b66ed0a35b944ec1434328 Mon Sep 17 00:00:00 2001
From: Luca Denti <ldenti92@gmail.com>
Date: Fri, 28 Oct 2022 10:32:43 +0200
Subject: [PATCH 07/13] Improves chains-filtering heuristic

---
 extender.cpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/extender.cpp b/extender.cpp
index 81159a5..edde907 100644
--- a/extender.cpp
+++ b/extender.cpp
@@ -877,22 +877,22 @@ void Extender::filter_sv_chains() {
     auto &sv = svs[i];
     if (sv.chrom == prev.chrom && sv.s - prev.e < 2 * sv.l &&
         prev.type == sv.type) {
-      // cout << sv.chrom << " " << sv.s << " " << sv.type << " ---- " << prev.s
-      // << endl ;
       //  check for sequence similarity
       double w_r =
-          max((double)sv.w, (double)prev.w) / min((double)sv.w, (double)prev.w);
+          min((double)sv.w, (double)prev.w) / max((double)sv.w, (double)prev.w);
       double l_r =
           min((double)sv.l, (double)prev.l) / max((double)sv.l, (double)prev.l);
-      if (w_r >= 2 && l_r >= 0.9) {
-        double d;
+      int d = sv.s - prev.s;
+      if (d < 100 && w_r >= 0.9 &&
+          l_r >= config->min_ratio) { // FIXME: hardcoded + use different ratio
+                                      // here. min_ratio was for clusters
+        double sim;
         if (sv.type == "DEL") {
-          d = rapidfuzz::fuzz::ratio(sv.refall, prev.refall);
+          sim = rapidfuzz::fuzz::ratio(sv.refall, prev.refall);
         } else {
-          d = rapidfuzz::fuzz::ratio(sv.altall, prev.altall);
+          sim = rapidfuzz::fuzz::ratio(sv.altall, prev.altall);
         }
-        // cout << w_r << " " << l_r << " " << d << endl ;
-        if (d > 70) {
+        if (sim > 70) {
           if (sv.w > prev.w) {
             _svs.push_back(sv);
           } else {

From d93332eeb414206939677b613a133cb9faa23570 Mon Sep 17 00:00:00 2001
From: Luca Denti <ldenti92@gmail.com>
Date: Fri, 28 Oct 2022 11:23:12 +0200
Subject: [PATCH 08/13] Replaces alignment matrix with nuc44

---
 extender.cpp | 2 +-
 extender.hpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/extender.cpp b/extender.cpp
index edde907..d86532d 100644
--- a/extender.cpp
+++ b/extender.cpp
@@ -753,7 +753,7 @@ void Extender::call() {
       parasail_result_t *result = NULL;
       result = parasail_nw_trace_striped_16(consensus.c_str(), consensus.size(),
                                             ref.c_str(), ref.size(), 10, 1,
-                                            &parasail_blosum62);
+                                            &parasail_nuc44);
       parasail_cigar_t *cigar =
           parasail_result_get_cigar(result, consensus.c_str(), consensus.size(),
                                     ref.c_str(), ref.size(), NULL);
diff --git a/extender.hpp b/extender.hpp
index 00b5741..c1ff590 100644
--- a/extender.hpp
+++ b/extender.hpp
@@ -12,7 +12,7 @@
 #include "htslib/sam.h"
 #include "interval_tree.hpp"
 #include "parasail.h"
-#include "parasail/matrices/blosum62.h"
+#include "parasail/matrices/nuc44.h"
 #include "rapidfuzz/fuzz.hpp"
 #include "rapidfuzz/utils.hpp"
 

From 87aedf4858759aed39f175449d36b3911d9a34c2 Mon Sep 17 00:00:00 2001
From: Luca Denti <ldenti92@gmail.com>
Date: Thu, 5 Jan 2023 11:48:04 +0100
Subject: [PATCH 09/13] Prints name and info of missmoothed reads

---
 smoother.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/smoother.cpp b/smoother.cpp
index 50cd12f..35ebf22 100644
--- a/smoother.cpp
+++ b/smoother.cpp
@@ -225,8 +225,7 @@ void Smoother::smooth_read(bam1_t *alignment, char *read_seq, string chrom,
   } else {
     if (strlen(new_seq) != n) {
       // FIXME: why does this happen?
-      cerr << "|"
-           << endl; // qname << " " << strlen(new_seq) << " " << n << endl;
+      cerr << "|" << qname << " " << strlen(new_seq) << " " << n << endl;
       ignored_reads[omp_get_thread_num() - 2].push_back(qname);
       return;
     }

From 7ba9678fd227c8fcd5119776dd193d5a3148661d Mon Sep 17 00:00:00 2001
From: Luca Denti <ldenti92@gmail.com>
Date: Fri, 6 Jan 2023 12:20:59 +0100
Subject: [PATCH 10/13] Fix bug with reallocation

Reallocation of read_seqs must be done only before loading them. Before smoothing we need to reallocate only the new seqs. Lenghts of seqs and new_seqs may differ - hence we store them both.
---
 smoother.cpp | 28 ++++++++++++++++------------
 smoother.hpp |  1 +
 2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/smoother.cpp b/smoother.cpp
index 35ebf22..7d7556a 100644
--- a/smoother.cpp
+++ b/smoother.cpp
@@ -56,8 +56,8 @@ void rebuild_bam_entry(bam1_t *alignment, char *seq, uint8_t *qual,
   memcpy(aux, alignment->data + alignment->l_data - l_aux, l_aux);
   // update core
   alignment->core.n_cigar = cigar.size();
-  alignment->core.l_qseq = strlen(seq);
   int l = strlen(seq);
+  alignment->core.l_qseq = l;
   // rebuild data
   int l_data = alignment->core.l_qname + (4 * alignment->core.n_cigar) +
                ((l + 1) >> 1) + l + l_aux;
@@ -92,15 +92,13 @@ void Smoother::smooth_read(bam1_t *alignment, char *read_seq, string chrom,
   for (auto op : cigar_offsets) {
     l += op.first;
   }
-  if (read_seq_max_lengths[_i][_j][_k] < l) {
-    free(read_seqs[_i][_j][_k]);
+  if (new_read_seq_max_lengths[_i][_j][_k] < l) {
     free(new_read_seqs[_i][_j][_k]);
     free(new_read_quals[_i][_j][_k]);
     //
-    read_seqs[_i][_j][_k] = (char *)malloc(sizeof(char) * (l + 1));
     new_read_seqs[_i][_j][_k] = (char *)malloc(sizeof(char) * (l + 1));
     new_read_quals[_i][_j][_k] = (uint8_t *)malloc(sizeof(char) * (l + 1));
-    read_seq_max_lengths[_i][_j][_k] = l;
+    new_read_seq_max_lengths[_i][_j][_k] = l;
   }
   //
   int n = 0;
@@ -224,8 +222,9 @@ void Smoother::smooth_read(bam1_t *alignment, char *read_seq, string chrom,
     ignored_reads[omp_get_thread_num() - 2].push_back(qname);
   } else {
     if (strlen(new_seq) != n) {
-      // FIXME: why does this happen?
-      cerr << "|" << qname << " " << strlen(new_seq) << " " << n << endl;
+      // CHECKME: this now should be fixed (read_seqs must not be freed above - only the new ones)
+      // cerr << "[W] " << qname << " " << strlen(new_seq) << "/" << strlen((char*)new_qual) << " " << n << endl;
+      cerr << "[W] this shouldn't happen anymore. If you see this warning, please open an issue at https://github.com/Parsoa/SVDSS/issues" << endl;
       ignored_reads[omp_get_thread_num() - 2].push_back(qname);
       return;
     }
@@ -283,16 +282,25 @@ void Smoother::run() {
   int modulo = 3;
   int batch_size = (10000 / config->threads) * config->threads;
   for (int i = 0; i < modulo; i++) {
+    // original read sequences
     read_seqs.push_back(
         vector<vector<char *>>(config->threads)); // current and next output
+    // smoothed read sequences
     new_read_seqs.push_back(
         vector<vector<char *>>(config->threads)); // current and next output
+    // smoothed read qualities
     new_read_quals.push_back(
         vector<vector<uint8_t *>>(config->threads)); // current and next output
+    // CHECKME: why do we need to store two lengths (read_seq_lengths and read_seq_max_lengths)? When we have to reallocate for a too long read, we change both of them. So they should always contain the same value
+    // original read lengths
     read_seq_lengths.push_back(
         vector<vector<int>>(config->threads)); // current and next output
+    // original read max lengths
     read_seq_max_lengths.push_back(
         vector<vector<int>>(config->threads)); // current and next output
+    // smoothed read max lengths
+    new_read_seq_max_lengths.push_back(
+        vector<vector<int>>(config->threads)); // current and next output
     for (int j = 0; j < config->threads; j++) {
       for (int k = 0; k < batch_size / config->threads; k++) {
         read_seqs[i][j].push_back((char *)malloc(sizeof(char) * (30001)));
@@ -302,6 +310,7 @@ void Smoother::run() {
         //
         read_seq_lengths[i][j].push_back(30000);
         read_seq_max_lengths[i][j].push_back(30000);
+        new_read_seq_max_lengths[i][j].push_back(30000);
       }
     }
   }
@@ -439,13 +448,8 @@ bool Smoother::load_batch_bam(int threads, int batch_size, int p) {
     uint32_t l = alignment->core.l_qseq; // length of the read
     if (read_seq_max_lengths[p][n % threads][i] < l) {
       free(read_seqs[p][n % threads][i]);
-      free(new_read_seqs[p][n % threads][i]);
-      free(new_read_quals[p][n % threads][i]);
       //
       read_seqs[p][n % threads][i] = (char *)malloc(sizeof(char) * (l + 1));
-      new_read_seqs[p][n % threads][i] = (char *)malloc(sizeof(char) * (l + 1));
-      new_read_quals[p][n % threads][i] =
-          (uint8_t *)malloc(sizeof(char) * (l + 1));
       read_seq_max_lengths[p][n % threads][i] = l;
       m += 1;
     }
diff --git a/smoother.hpp b/smoother.hpp
index 1647638..e6384f3 100644
--- a/smoother.hpp
+++ b/smoother.hpp
@@ -48,6 +48,7 @@ class Smoother {
   std::vector<std::vector<std::vector<char *>>> read_seqs;
   std::vector<std::vector<std::vector<char *>>> new_read_seqs;
   std::vector<std::vector<std::vector<uint8_t *>>> new_read_quals;
+  std::vector<std::vector<std::vector<int>>> new_read_seq_max_lengths;
 
   void run();
   bool load_batch_bam(int threads, int batch_size, int p);

From 9e124313a322c9785bc39b083b11e2102110abc4 Mon Sep 17 00:00:00 2001
From: Luca Denti <ldenti92@gmail.com>
Date: Thu, 12 Jan 2023 09:44:29 +0100
Subject: [PATCH 11/13] Update citation

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 3ce8f9c..e1fc742 100644
--- a/README.md
+++ b/README.md
@@ -210,7 +210,7 @@ For inquiries on this software please open an [issue](https://github.com/Parsoa/
 
 ### Citation
 
-SVDSS is currently pending peer review. A pre-print is available on [BioRxiv](https://www.biorxiv.org/content/10.1101/2022.02.12.480198v1).
+SVDSS is now published in [Nature Methods](https://doi.org/10.1038/s41592-022-01674-1).
 
 ##### Experiments
 Instructions on how to reproduce the experiments described in the manuscript can be found [here](https://github.com/ldenti/SVDSS-experiments) (also provided as submodule of this repository).

From 43ad7fbe4034e50a633aba911d0272c4f4ce7eed Mon Sep 17 00:00:00 2001
From: Luca Denti <ldenti92@gmail.com>
Date: Thu, 12 Jan 2023 10:30:48 +0100
Subject: [PATCH 12/13] Update dependencies

---
 CMakeLists.txt | 37 ++++++++++++++-----------------------
 extender.hpp   |  1 -
 2 files changed, 14 insertions(+), 24 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5829612..b2d3f04 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,7 @@
 cmake_minimum_required(VERSION 3.14)
 
 include(ExternalProject)
+include(FetchContent)
 
 set(CMAKE_CXX_STANDARD 14)
 set(CMAKE_CXX_STANDARD_REQUIRED True)
@@ -43,13 +44,8 @@ if(NOT DEFINED CONDAPREFIX)
   message(STATUS "libdeflate will be built from source")
   ExternalProject_Add(deflate
     GIT_REPOSITORY https://github.com/ebiggers/libdeflate.git
-    GIT_TAG 047aa84e01b38d82f3612810e357bd40f14a3d39
+    GIT_TAG 020133854ff73b8506fe59f92a9b5b622d360716 # v1.15
     BUILD_IN_SOURCE 1
-    UPDATE_COMMAND ""
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND make PREFIX="."
-      "CC=${CMAKE_C_COMPILER}"
-      "CFLAGS=-fPIC -O3" libdeflate.a
     INSTALL_COMMAND "")
   ExternalProject_Get_Property(deflate SOURCE_DIR)
   SET(DEFLATE_SOURCE_DIR ${SOURCE_DIR})
@@ -59,15 +55,15 @@ if(NOT DEFINED CONDAPREFIX)
 
   # HTSLIB
   #########
-  SET(CONF_CMD autoheader && autoreconf && ./configure --disable-libcurl --disable-gcs --with-libdeflate "CC=${CMAKE_C_COMPILER}" "CFLAGS=-O3 -I${DEFLATE_SOURCE_DIR}" "LDFLAGS=-L${DEFLATE_SOURCE_DIR}")
+  SET(CONF_CMD autoheader && autoreconf -i && ./configure --disable-libcurl --disable-gcs --with-libdeflate "CC=${CMAKE_C_COMPILER}" "CFLAGS=-O3 -I${DEFLATE_SOURCE_DIR}" "LDFLAGS=-L${DEFLATE_SOURCE_DIR}")
   if (HOLYBUILD)
     # we have to include local libraries (libbz2 and liblzma)
-    SET(CONF_CMD autoheader && autoreconf && ./configure --disable-libcurl --disable-gcs --with-libdeflate "CC=${CMAKE_C_COMPILER}" "CFLAGS=-O3 -I${DEFLATE_SOURCE_DIR} -I${BZIP_LIB_DIR}" "LDFLAGS=-L${DEFLATE_SOURCE_DIR} -L${BZIP_LIB_DIR} -L${LZMA_LIB_DIR}")
+    SET(CONF_CMD autoheader && autoreconf -i && ./configure --disable-libcurl --disable-gcs --with-libdeflate "CC=${CMAKE_C_COMPILER}" "CFLAGS=-O3 -I${DEFLATE_SOURCE_DIR} -I${BZIP_LIB_DIR}" "LDFLAGS=-L${DEFLATE_SOURCE_DIR} -L${BZIP_LIB_DIR} -L${LZMA_LIB_DIR}")
   endif ()
   message(STATUS "htslib will be built from source")
   ExternalProject_Add(htslib
     GIT_REPOSITORY https://github.com/samtools/htslib.git
-    GIT_TAG 2cd99cf333938a9eb70393bc59379e1d6ea66b37
+    GIT_TAG 4e61c128238f3e7cbb3b1f4e9c0fdb4880aa9a10 # v1.16
     UPDATE_COMMAND ""
     BUILD_IN_SOURCE 1
     CONFIGURE_COMMAND ${CONF_CMD}
@@ -98,7 +94,7 @@ else ()
 endif ()
 ExternalProject_Add(abpoa
   GIT_REPOSITORY https://github.com/yangao07/abPOA.git
-  GIT_TAG 16c64e5163cbd0502792ca20f1069fe40c929b0d
+  GIT_TAG 16c64e5163cbd0502792ca20f1069fe40c929b0d # v1.41
   PATCH_COMMAND ${PATCH_CMD}
   INSTALL_COMMAND ""
 )
@@ -114,7 +110,7 @@ SET(ABPOA_INCLUDE_DIR ${ABPOA_SOURCE_DIR}/include)
 message(STATUS "parasail will be built from source")
 ExternalProject_Add(parasail
   GIT_REPOSITORY https://github.com/jeffdaily/parasail.git
-  GIT_TAG e236a6feec7bc84fb81923d783acb345e538a281
+  GIT_TAG d23aed253fb5ecc7aeab0f81b6d64fcc26448089 # v2.6
   UPDATE_COMMAND ""
   BUILD_IN_SOURCE 1
   CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF
@@ -128,22 +124,16 @@ set_target_properties(PARASAIL PROPERTIES IMPORTED_LOCATION ${PARASAIL_SOURCE_DI
 
 # rapidfuzz-cpp
 ################
-ExternalProject_Add(rapidfuzz
-    GIT_REPOSITORY https://github.com/maxbachmann/rapidfuzz-cpp.git
-    GIT_TAG d1e82379395cafc6d439c1c1e2cbe7512eaf2518
-    BUILD_IN_SOURCE 1
-    UPDATE_COMMAND ""
-    INSTALL_COMMAND ""
-)
-ExternalProject_Get_Property(rapidfuzz SOURCE_DIR)
-SET(RAPIDFUZZ_SOURCE_DIR ${SOURCE_DIR})
-SET(RAPIDFUZZ_INCLUDE_DIR ${RAPIDFUZZ_SOURCE_DIR})
+FetchContent_Declare(rapidfuzz
+  GIT_REPOSITORY https://github.com/maxbachmann/rapidfuzz-cpp.git
+  GIT_TAG 5412d5d877518e7754394bdbec76e45c8187c631) # v1.10.4
+FetchContent_MakeAvailable(rapidfuzz)
 
 # interval-tree
 ################
 ExternalProject_Add(intervaltree
     GIT_REPOSITORY https://github.com/5cript/interval-tree.git
-    GIT_TAG 4d7c66ce4fb542a29a7d74039f8e7c92fae3d6f8
+    GIT_TAG 309b9c725191d4bb1d134f28a8a32ad2f68a8ffa
     UPDATE_COMMAND ""
     CONFIGURE_COMMAND ""
     BUILD_COMMAND ""
@@ -151,7 +141,7 @@ ExternalProject_Add(intervaltree
 )
 ExternalProject_Get_Property(intervaltree SOURCE_DIR)
 SET(INTERVALTREE_SOURCE_DIR ${SOURCE_DIR})
-SET(INTERVALTREE_INCLUDE_DIR ${INTERVALTREE_SOURCE_DIR})
+SET(INTERVALTREE_INCLUDE_DIR ${INTERVALTREE_SOURCE_DIR}/include/interval-tree)
 
 # ropebwt2
 ##########
@@ -213,6 +203,7 @@ else()
 endif()
 
 target_link_libraries(SVDSS
+  PUBLIC rapidfuzz::rapidfuzz
   PUBLIC ${BINARY_DIR}/lib/libabpoa.a
   PUBLIC PARASAIL
   PUBLIC ROPEBWT
diff --git a/extender.hpp b/extender.hpp
index c1ff590..ac9e499 100644
--- a/extender.hpp
+++ b/extender.hpp
@@ -14,7 +14,6 @@
 #include "parasail.h"
 #include "parasail/matrices/nuc44.h"
 #include "rapidfuzz/fuzz.hpp"
-#include "rapidfuzz/utils.hpp"
 
 #include "bam.hpp"
 #include "chromosomes.hpp"

From aee62f461437e647accb5015ef15e804849caa2a Mon Sep 17 00:00:00 2001
From: Luca Denti <ldenti92@gmail.com>
Date: Fri, 13 Jan 2023 10:47:48 +0100
Subject: [PATCH 13/13] Add smoothing treshold to cli

---
 config.cpp   | 4 ++++
 config.hpp   | 1 +
 smoother.cpp | 3 ++-
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/config.cpp b/config.cpp
index 5ad12bf..42f5c03 100644
--- a/config.cpp
+++ b/config.cpp
@@ -39,6 +39,7 @@ Configuration::Configuration()
       "selective", "", cxxopts::value<bool>()->default_value("true"))(
       "version", "Print version information.")("help", "Print this help.")(
       "l", "", cxxopts::value<float>())(
+      "acc", "", cxxopts::value<float>())(
       "verbose", "", cxxopts::value<bool>()->default_value("false"));
 }
 
@@ -112,6 +113,9 @@ void Configuration::parse(int argc, char **argv) {
   if (results.count("l")) {
     min_ratio = results["l"].as<float>();
   }
+  if (results.count("acc")) {
+    al_accuracy = results["acc"].as<float>();
+  }
   binary = results["binary"].as<bool>();
   clipped = results["clipped"].as<bool>();
   assemble = results["assemble"].as<bool>();
diff --git a/config.hpp b/config.hpp
index 3216153..4a69573 100644
--- a/config.hpp
+++ b/config.hpp
@@ -30,6 +30,7 @@ class Configuration {
   int aggregate_batches = 5;
   int min_cluster_weight = 2;
   float min_ratio = 0.97; // FIXME: change name
+  float al_accuracy = 0.02;
 
   bool binary = false;
   bool clipped = false;
diff --git a/smoother.cpp b/smoother.cpp
index 7d7556a..f2ab8d9 100644
--- a/smoother.cpp
+++ b/smoother.cpp
@@ -217,7 +217,8 @@ void Smoother::smooth_read(bam1_t *alignment, char *read_seq, string chrom,
   //   cerr << p.first << op;
   // }
   // cerr << endl;
-  if (num_mismatch / num_match >= 0.02 || should_ignore) { // FIXME: HARDCODED
+
+  if (num_mismatch / num_match >= config->al_accuracy || should_ignore) {
     // read is too dirty or is not interesting, just skip it
     ignored_reads[omp_get_thread_num() - 2].push_back(qname);
   } else {