From 6828f9e18eb0a8774b0e1dda19a727c34103071b Mon Sep 17 00:00:00 2001 From: Jean Monlong Date: Fri, 24 Mar 2023 21:19:08 -0700 Subject: [PATCH 01/22] implement simple chunk-merge sorting for GAFs next is avoiding to open too many files when merging, better progress log, gzipping the temporary GAFs, finding out how to not convert back and forth string-nodeid, better arg to adjust memory --- src/subcommand/gamsort_main.cpp | 217 ++++++++++++++++++++++++++++---- 1 file changed, 193 insertions(+), 24 deletions(-) diff --git a/src/subcommand/gamsort_main.cpp b/src/subcommand/gamsort_main.cpp index 2275cbd41ea..48b68550231 100644 --- a/src/subcommand/gamsort_main.cpp +++ b/src/subcommand/gamsort_main.cpp @@ -3,6 +3,9 @@ #include "../stream_index.hpp" #include #include "subcommand.hpp" +#include "vg/io/gafkluge.hpp" +#include "alignment.hpp" + /** * GAM sort main @@ -13,21 +16,38 @@ using namespace vg; using namespace vg::subcommand; void help_gamsort(char **argv) { - cerr << "gamsort: sort a GAM file, or index a sorted GAM file" << endl + cerr << "gamsort: sort a GAM/GAF file, or index a sorted GAM file" << endl << "Usage: " << argv[1] << " [Options] gamfile" << endl << "Options:" << endl << " -i / --index FILE produce an index of the sorted GAM file" << endl << " -d / --dumb-sort use naive sorting algorithm (no tmp files, faster for small GAMs)" << endl << " -p / --progress Show progress." << endl + << " -G / --gaf-input Input is a GAF file." << endl + << " -c / --chunk-size Number of reads per chunk when sorting GAFs." << endl << " -t / --threads Use the specified number of threads." << endl << endl; } +// defines how to compare two GAF records +// first using 'rk1' tag (here, minimum node ID). If tied, use 'rk2' tag (here, maximum node ID) +struct compare_gaf { + bool operator()(const gafkluge::GafRecord& gaf1, const gafkluge::GafRecord& gaf2) { + // TODO find a way to not have to convert the node ids to string before and then back to int here? + int rk11 = std::stoi(gaf1.opt_fields.find("rk1")->second.second); + int rk12 = std::stoi(gaf2.opt_fields.find("rk1")->second.second); + int rk21 = std::stoi(gaf1.opt_fields.find("rk2")->second.second); + int rk22 = std::stoi(gaf2.opt_fields.find("rk2")->second.second); + return rk11 < rk12 || (rk11 == rk12 && rk21 < rk22); + } +}; + int main_gamsort(int argc, char **argv) { string index_filename; bool easy_sort = false; bool show_progress = false; + string input_format = "GAM"; + int chunk_size = 1000000; // maximum number reads held in memory // We limit the max threads, and only allow thread count to be lowered, to // prevent tcmalloc from giving each thread a very large heap for many // threads. @@ -43,10 +63,12 @@ int main_gamsort(int argc, char **argv) {"dumb-sort", no_argument, 0, 'd'}, {"rocks", required_argument, 0, 'r'}, {"progress", no_argument, 0, 'p'}, + {"gaf-input", no_argument, 0, 'g'}, + {"chunk-size", required_argument, 0, 'c'}, {"threads", required_argument, 0, 't'}, {0, 0, 0, 0}}; int option_index = 0; - c = getopt_long(argc, argv, "i:dhpt:", + c = getopt_long(argc, argv, "i:dhpGt:c:", long_options, &option_index); // Detect the end of the options. @@ -64,6 +86,12 @@ int main_gamsort(int argc, char **argv) case 'p': show_progress = true; break; + case 'G': + input_format = "GAF"; + break; + case 'c': + chunk_size = parse(optarg); + break; case 't': num_threads = min(parse(optarg), num_threads); break; @@ -75,7 +103,6 @@ int main_gamsort(int argc, char **argv) } } - if (argc < 3){ help_gamsort(argv); exit(1); @@ -83,34 +110,176 @@ int main_gamsort(int argc, char **argv) omp_set_num_threads(num_threads); - get_input_file(optind, argc, argv, [&](istream& gam_in) { + if (input_format == "GAM") { + get_input_file(optind, argc, argv, [&](istream& gam_in) { - GAMSorter gs(show_progress); + GAMSorter gs(show_progress); - // Do a normal GAMSorter sort - unique_ptr index; + // Do a normal GAMSorter sort + unique_ptr index; - if (!index_filename.empty()) { - // Make an index - index = unique_ptr(new GAMIndex()); - } + if (!index_filename.empty()) { + // Make an index + index = unique_ptr(new GAMIndex()); + } - if (easy_sort) { - // Sort in a single pass in memory - gs.easy_sort(gam_in, cout, index.get()); - } else { - // Sort using fan-in-limited temp file merging - gs.stream_sort(gam_in, cout, index.get()); - } + if (easy_sort) { + // Sort in a single pass in memory + gs.easy_sort(gam_in, cout, index.get()); + } else { + // Sort using fan-in-limited temp file merging + gs.stream_sort(gam_in, cout, index.get()); + } + + if (index.get() != nullptr) { + // Save the index + ofstream index_out(index_filename); + index->save(index_out); + } + }); + } else if (input_format == "GAF") { + std::string input_gaf_filename = get_input_file_name(optind, argc, argv); + + // where to store the chunk of GAF records that will be sorted, then written to disk, + // (then later merged with the other sorted chunks) + std::vector current_gaf_chunk; + int count = 0; // read count + int chunk_id = 0; // ID of the current chunk + std::vector chunk_files; // names of the chunk files - if (index.get() != nullptr) { - // Save the index - ofstream index_out(index_filename); - index->save(index_out); + // read input GAF file + htsFile* in = hts_open(input_gaf_filename.c_str(), "r"); + if (in == NULL) { + cerr << "[vg::alignment.cpp] couldn't open " << input_gaf_filename << endl; exit(1); + } + kstring_t s_buffer = KS_INITIALIZE; + gafkluge::GafRecord gaf; + + while (vg::io::get_next_record_from_gaf(nullptr, nullptr, in, s_buffer, gaf) == true) { + // find the minimum and maximum node IDs + nid_t min_node = std::numeric_limits::max(); + nid_t max_node = 0; + for (size_t i = 0; i < gaf.path.size(); ++i) { + const auto& gaf_step = gaf.path[i]; + assert(gaf_step.is_stable == false); + assert(gaf_step.is_interval == false); + nid_t nodeid = std::stol(gaf_step.name); + if (min_node > nodeid){ + min_node = nodeid; + } + if (max_node < nodeid){ + max_node = nodeid; + } + } + // write them as new GAF tags 'rk1' and 'rk2' + // they'll get written in the temporary chunks to avoid having + // to find them again when mergin them + gaf.opt_fields["rk1"] = make_pair('i', std::to_string(min_node)); + gaf.opt_fields["rk2"] = make_pair('i', std::to_string(max_node)); + current_gaf_chunk.push_back(gaf); + count++; + + // if we've read enough reads, sort them and write to disk + if(count == chunk_size){ + // sort by minimum node id + std::stable_sort(current_gaf_chunk.begin(), current_gaf_chunk.end(), compare_gaf()); + // write to temp_gafsort_.gaf + std::string chunk_outf = "temp_gafsort_" + std::to_string(chunk_id); + std::ofstream out_file(chunk_outf); + for (int ii=0; ii 0){ + // sort by minimum node id + std::stable_sort(current_gaf_chunk.begin(), current_gaf_chunk.end(), compare_gaf()); + + // write to temp_gafsort_.gaf + std::string chunk_outf = "temp_gafsort_" + std::to_string(chunk_id) + ".gaf"; + std::ofstream out_file(chunk_outf); + for (int ii=0; ii opened_files; + std::vector more_in_file; + std::vector opened_file_buffers; + std::vector opened_records; + + std::string line; + + // open the temp GAF files and read the first record + for(int ii=0; ii < chunk_files.size(); ii++){ + htsFile* in = hts_open(chunk_files[ii].c_str(), "r"); + if (in == NULL) { + cerr << "[vg::alignment.cpp] couldn't open " << input_gaf_filename << endl; exit(1); + } + opened_file_buffers.push_back(KS_INITIALIZE); + opened_files.push_back(in); + more_in_file.push_back(vg::io::get_next_record_from_gaf(nullptr, nullptr, opened_files.back(), opened_file_buffers.back(), gaf)); + opened_records.push_back(gaf); } - }); + // merge one by one until no more to read for all files + bool more_to_merge = true; + while(more_to_merge){ + // which file will have the smallest record (i.e. to output first) + int smallest_file = -1; + // compare current gaf records + for(int ii = 0; ii Date: Tue, 28 Mar 2023 22:32:48 -0700 Subject: [PATCH 02/22] fix overflow nodeid and add some progress logging --- src/subcommand/gamsort_main.cpp | 35 +++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/src/subcommand/gamsort_main.cpp b/src/subcommand/gamsort_main.cpp index 48b68550231..42e12d6b9d8 100644 --- a/src/subcommand/gamsort_main.cpp +++ b/src/subcommand/gamsort_main.cpp @@ -33,10 +33,10 @@ void help_gamsort(char **argv) struct compare_gaf { bool operator()(const gafkluge::GafRecord& gaf1, const gafkluge::GafRecord& gaf2) { // TODO find a way to not have to convert the node ids to string before and then back to int here? - int rk11 = std::stoi(gaf1.opt_fields.find("rk1")->second.second); - int rk12 = std::stoi(gaf2.opt_fields.find("rk1")->second.second); - int rk21 = std::stoi(gaf1.opt_fields.find("rk2")->second.second); - int rk22 = std::stoi(gaf2.opt_fields.find("rk2")->second.second); + long long rk11 = std::stoll(gaf1.opt_fields.find("rk1")->second.second); + long long rk12 = std::stoll(gaf2.opt_fields.find("rk1")->second.second); + long long rk21 = std::stoll(gaf1.opt_fields.find("rk2")->second.second); + long long rk22 = std::stoll(gaf2.opt_fields.find("rk2")->second.second); return rk11 < rk12 || (rk11 == rk12 && rk21 < rk22); } }; @@ -155,6 +155,11 @@ int main_gamsort(int argc, char **argv) kstring_t s_buffer = KS_INITIALIZE; gafkluge::GafRecord gaf; + std::string chunk_outf = "temp_gafsort_" + std::to_string(chunk_id) + ".gaf"; + if(show_progress){ + cerr << "Preparing temporary chunk " << chunk_outf << "..." << endl; + } + while (vg::io::get_next_record_from_gaf(nullptr, nullptr, in, s_buffer, gaf) == true) { // find the minimum and maximum node IDs nid_t min_node = std::numeric_limits::max(); @@ -182,22 +187,28 @@ int main_gamsort(int argc, char **argv) // if we've read enough reads, sort them and write to disk if(count == chunk_size){ // sort by minimum node id + if(show_progress){ + cerr << " Sorting chunk..." << endl; + } std::stable_sort(current_gaf_chunk.begin(), current_gaf_chunk.end(), compare_gaf()); // write to temp_gafsort_.gaf - std::string chunk_outf = "temp_gafsort_" + std::to_string(chunk_id); + if(show_progress){ + cerr << " Writing chunk..." << endl; + } std::ofstream out_file(chunk_outf); for (int ii=0; ii 0){ // sort by minimum node id + if(show_progress){ + cerr << " Sorting chunk..." << endl; + } std::stable_sort(current_gaf_chunk.begin(), current_gaf_chunk.end(), compare_gaf()); - // write to temp_gafsort_.gaf - std::string chunk_outf = "temp_gafsort_" + std::to_string(chunk_id) + ".gaf"; + if(show_progress){ + cerr << " Writing chunk..." << endl; + } std::ofstream out_file(chunk_outf); for (int ii=0; ii Date: Tue, 4 Jul 2023 14:11:02 +0200 Subject: [PATCH 03/22] fix comment typo --- src/constructor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/constructor.cpp b/src/constructor.cpp index f0571319487..6cf81e34367 100644 --- a/src/constructor.cpp +++ b/src/constructor.cpp @@ -2480,7 +2480,7 @@ namespace vg { // Make sure each VCF file exists. Otherwise Tabix++ may exit with a non- // helpful message. - // We can't invoke stat woithout a place for it to write. But all we + // We can't invoke stat without a place for it to write. But all we // really want is its return value. struct stat temp; if(stat(vcf_filename.c_str(), &temp)) { From 0c5060ba7029dbf6fadbd1274657e685f6a75c56 Mon Sep 17 00:00:00 2001 From: Jean Monlong Date: Tue, 4 Jul 2023 14:11:40 +0200 Subject: [PATCH 04/22] add sorted gaf input option to vg find --- deps/htslib | 2 +- src/subcommand/find_main.cpp | 80 ++++++++++++++++++++++++++++++------ 2 files changed, 68 insertions(+), 14 deletions(-) diff --git a/deps/htslib b/deps/htslib index bd133acf514..bdf5717ead5 160000 --- a/deps/htslib +++ b/deps/htslib @@ -1 +1 @@ -Subproject commit bd133acf51498431a5c7dfd8aa06ce17ec6d3b96 +Subproject commit bdf5717ead58641cffad2d7fc1fbf3921c14a9e7 diff --git a/src/subcommand/find_main.cpp b/src/subcommand/find_main.cpp index 406eb2a13a2..19ac5a19d11 100644 --- a/src/subcommand/find_main.cpp +++ b/src/subcommand/find_main.cpp @@ -15,6 +15,7 @@ #include "../algorithms/extract_connecting_graph.hpp" #include "../algorithms/walk.hpp" #include +#include #include #include @@ -52,6 +53,7 @@ void help_find(char** argv) { << " -H, --gbwt FILE when enumerating kmers from subgraphs, determine their frequencies in this GBWT haplotype index" << endl << "alignments:" << endl << " -l, --sorted-gam FILE use this sorted, indexed GAM file" << endl + << " -F, --sorted-gaf FILE use this sorted, indexed GAF file" << endl << " -o, --alns-on N:M write alignments which align to any of the nodes between N and M (inclusive)" << endl << " -A, --to-graph VG get alignments to the provided subgraph" << endl << "sequences:" << endl @@ -92,6 +94,7 @@ int main_find(int argc, char** argv) { int mem_reseed_length = 0; bool use_fast_reseed = true; string sorted_gam_name; + string sorted_gaf_name; bool get_mappings = false; string aln_on_id_range; vg::id_t start_id = 0; @@ -146,6 +149,7 @@ int main_find(int argc, char** argv) { {"position-in", required_argument, 0, 'P'}, {"node-range", required_argument, 0, 'r'}, {"sorted-gam", required_argument, 0, 'l'}, + {"sorted-gaf", required_argument, 0, 'F'}, {"mappings", no_argument, 0, 'm'}, {"alns-on", required_argument, 0, 'o'}, {"distance", no_argument, 0, 'D'}, @@ -164,7 +168,7 @@ int main_find(int argc, char** argv) { }; int option_index = 0; - c = getopt_long (argc, argv, "x:n:e:s:o:hc:LS:p:P:r:l:mg:M:B:fDG:N:A:Y:Z:IQ:ER:W:K:H:", + c = getopt_long (argc, argv, "x:n:e:s:o:hc:LS:p:P:r:l:F:mg:M:B:fDG:N:A:Y:Z:IQ:ER:W:K:H:", long_options, &option_index); // Detect the end of the options. @@ -264,6 +268,10 @@ int main_find(int argc, char** argv) { sorted_gam_name = optarg; break; + case 'F': + sorted_gaf_name = optarg; + break; + case 'I': list_path_names = true; break; @@ -328,8 +336,8 @@ int main_find(int argc, char** argv) { return 1; } - if (gcsa_in.empty() && xg_name.empty() && sorted_gam_name.empty()) { - cerr << "[vg find] find requires -g, -x, or -l to know where to find its database" << endl; + if (gcsa_in.empty() && xg_name.empty() && sorted_gam_name.empty() && sorted_gaf_name.empty()) { + cerr << "[vg find] find requires -g, -x, -l, or -F to know where to find its database" << endl; return 1; } @@ -429,6 +437,24 @@ int main_find(int argc, char** argv) { }); } + // load GAF index + tbx_t *gaf_tbx = NULL; + htsFile *fp = NULL; + if (!sorted_gaf_name.empty()){ + gaf_tbx = tbx_index_load3(sorted_gaf_name.c_str(), NULL, 0); + if ( !gaf_tbx ){ + cerr << "Could not load .tbi/.csi index of " << sorted_gaf_name << endl; + exit(1); + } + int nseq; + fp = hts_open(sorted_gaf_name.c_str(),"r"); + if ( !fp ) { + cerr << "Could not open " << sorted_gaf_name << endl; + exit(1); + } + + } + if (!aln_on_id_range.empty()) { // Parse the range vector parts = split_delims(aln_on_id_range, ":"); @@ -450,9 +476,20 @@ int main_find(int argc, char** argv) { // Find the alignments and dump them to cout gam_index->find(cursor, start_id, end_id, vg::io::emit_to(cout)); }); - - } else { - cerr << "error [vg find]: Cannot find alignments on range without a sorted GAM" << endl; + } else if (!sorted_gaf_name.empty()) { + // read GAF slice in region 'reg' + string reg = "{node}:" + convert(start_id) + "-" + convert(end_id); + cout << reg << endl; + hts_itr_t *itr = tbx_itr_querys(gaf_tbx, reg.c_str()); + kstring_t str = {0,0,0}; + if ( itr ) { + while (tbx_itr_next(fp, gaf_tbx, itr, &str) >= 0) { + puts(str.s); + } + tbx_itr_destroy(itr); + } + } else { + cerr << "error [vg find]: Cannot find alignments on range without a sorted GAM or GAF" << endl; exit(1); } } @@ -462,23 +499,40 @@ int main_find(int argc, char** argv) { // Load up the graph auto graph = vg::io::VPKG::load_one(to_graph_file); - if (gam_index.get() != nullptr) { - // Find in sorted GAM - + + if (gam_index.get() != nullptr | !sorted_gaf_name.empty()) { // Get the ID ranges from the graph auto ranges = vg::algorithms::sorted_id_ranges(graph.get()); // Throw out the graph graph.reset(); - get_input_file(sorted_gam_name, [&](istream& in) { + if (gam_index.get() != nullptr) { + // Find in sorted GAM + get_input_file(sorted_gam_name, [&](istream& in) { // Make a cursor for input // TODO: Refactor so we can put everything with the GAM index inside one get_input_file call to deduplicate code vg::io::ProtobufIterator cursor(in); - + // Find the alignments and send them to cout gam_index->find(cursor, ranges, vg::io::emit_to(cout)); - }); - + }); + + } else if (!sorted_gaf_name.empty()) { + // Find in sorted GAF + // loop over ranges and print GAF records + for (auto range : ranges) { + string reg = "{node}:" + convert(range.first) + "-" + convert(range.second); + cout << reg << endl; + hts_itr_t *itr = tbx_itr_querys(gaf_tbx, reg.c_str()); + kstring_t str = {0,0,0}; + if ( itr ) { + while (tbx_itr_next(fp, gaf_tbx, itr, &str) >= 0) { + puts(str.s); + } + tbx_itr_destroy(itr); + } + } + } } else { cerr << "error [vg find]: Cannot find alignments on graph without a sorted GAM" << endl; exit(1); From 14939361777b0543c6af810310cfd5c45eb23cdb Mon Sep 17 00:00:00 2001 From: Jean Monlong Date: Tue, 4 Jul 2023 14:24:54 +0200 Subject: [PATCH 05/22] use more explicit var names for the gaf part --- src/subcommand/find_main.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/subcommand/find_main.cpp b/src/subcommand/find_main.cpp index 19ac5a19d11..e894ebbb7ab 100644 --- a/src/subcommand/find_main.cpp +++ b/src/subcommand/find_main.cpp @@ -439,7 +439,7 @@ int main_find(int argc, char** argv) { // load GAF index tbx_t *gaf_tbx = NULL; - htsFile *fp = NULL; + htsFile *gaf_fp = NULL; if (!sorted_gaf_name.empty()){ gaf_tbx = tbx_index_load3(sorted_gaf_name.c_str(), NULL, 0); if ( !gaf_tbx ){ @@ -447,12 +447,11 @@ int main_find(int argc, char** argv) { exit(1); } int nseq; - fp = hts_open(sorted_gaf_name.c_str(),"r"); - if ( !fp ) { + gaf_fp = hts_open(sorted_gaf_name.c_str(),"r"); + if ( !gaf_fp ) { cerr << "Could not open " << sorted_gaf_name << endl; exit(1); } - } if (!aln_on_id_range.empty()) { @@ -483,7 +482,7 @@ int main_find(int argc, char** argv) { hts_itr_t *itr = tbx_itr_querys(gaf_tbx, reg.c_str()); kstring_t str = {0,0,0}; if ( itr ) { - while (tbx_itr_next(fp, gaf_tbx, itr, &str) >= 0) { + while (tbx_itr_next(gaf_fp, gaf_tbx, itr, &str) >= 0) { puts(str.s); } tbx_itr_destroy(itr); @@ -526,7 +525,7 @@ int main_find(int argc, char** argv) { hts_itr_t *itr = tbx_itr_querys(gaf_tbx, reg.c_str()); kstring_t str = {0,0,0}; if ( itr ) { - while (tbx_itr_next(fp, gaf_tbx, itr, &str) >= 0) { + while (tbx_itr_next(gaf_fp, gaf_tbx, itr, &str) >= 0) { puts(str.s); } tbx_itr_destroy(itr); From 423632abc3fd0e9c45746d5aca471c32559b063f Mon Sep 17 00:00:00 2001 From: Jean Monlong Date: Tue, 4 Jul 2023 14:47:11 +0200 Subject: [PATCH 06/22] use consistent indentation... --- src/subcommand/find_main.cpp | 88 ++++++++++++++++++------------------ 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/src/subcommand/find_main.cpp b/src/subcommand/find_main.cpp index e894ebbb7ab..f6e73ebf3e8 100644 --- a/src/subcommand/find_main.cpp +++ b/src/subcommand/find_main.cpp @@ -441,17 +441,17 @@ int main_find(int argc, char** argv) { tbx_t *gaf_tbx = NULL; htsFile *gaf_fp = NULL; if (!sorted_gaf_name.empty()){ - gaf_tbx = tbx_index_load3(sorted_gaf_name.c_str(), NULL, 0); - if ( !gaf_tbx ){ - cerr << "Could not load .tbi/.csi index of " << sorted_gaf_name << endl; - exit(1); - } - int nseq; - gaf_fp = hts_open(sorted_gaf_name.c_str(),"r"); - if ( !gaf_fp ) { - cerr << "Could not open " << sorted_gaf_name << endl; - exit(1); - } + gaf_tbx = tbx_index_load3(sorted_gaf_name.c_str(), NULL, 0); + if ( !gaf_tbx ){ + cerr << "Could not load .tbi/.csi index of " << sorted_gaf_name << endl; + exit(1); + } + int nseq; + gaf_fp = hts_open(sorted_gaf_name.c_str(),"r"); + if ( !gaf_fp ) { + cerr << "Could not open " << sorted_gaf_name << endl; + exit(1); + } } if (!aln_on_id_range.empty()) { @@ -476,23 +476,23 @@ int main_find(int argc, char** argv) { gam_index->find(cursor, start_id, end_id, vg::io::emit_to(cout)); }); } else if (!sorted_gaf_name.empty()) { - // read GAF slice in region 'reg' - string reg = "{node}:" + convert(start_id) + "-" + convert(end_id); - cout << reg << endl; - hts_itr_t *itr = tbx_itr_querys(gaf_tbx, reg.c_str()); - kstring_t str = {0,0,0}; - if ( itr ) { - while (tbx_itr_next(gaf_fp, gaf_tbx, itr, &str) >= 0) { - puts(str.s); + // read GAF slice in region 'reg' + string reg = "{node}:" + convert(start_id) + "-" + convert(end_id); + cout << reg << endl; + hts_itr_t *itr = tbx_itr_querys(gaf_tbx, reg.c_str()); + kstring_t str = {0,0,0}; + if ( itr ) { + while (tbx_itr_next(gaf_fp, gaf_tbx, itr, &str) >= 0) { + puts(str.s); + } + tbx_itr_destroy(itr); } - tbx_itr_destroy(itr); - } - } else { + } else { cerr << "error [vg find]: Cannot find alignments on range without a sorted GAM or GAF" << endl; exit(1); } } - + if (!to_graph_file.empty()) { // Find alignments touching a graph @@ -506,31 +506,31 @@ int main_find(int argc, char** argv) { graph.reset(); if (gam_index.get() != nullptr) { - // Find in sorted GAM - get_input_file(sorted_gam_name, [&](istream& in) { - // Make a cursor for input - // TODO: Refactor so we can put everything with the GAM index inside one get_input_file call to deduplicate code - vg::io::ProtobufIterator cursor(in); + // Find in sorted GAM + get_input_file(sorted_gam_name, [&](istream& in) { + // Make a cursor for input + // TODO: Refactor so we can put everything with the GAM index inside one get_input_file call to deduplicate code + vg::io::ProtobufIterator cursor(in); - // Find the alignments and send them to cout - gam_index->find(cursor, ranges, vg::io::emit_to(cout)); - }); + // Find the alignments and send them to cout + gam_index->find(cursor, ranges, vg::io::emit_to(cout)); + }); } else if (!sorted_gaf_name.empty()) { - // Find in sorted GAF - // loop over ranges and print GAF records - for (auto range : ranges) { - string reg = "{node}:" + convert(range.first) + "-" + convert(range.second); - cout << reg << endl; - hts_itr_t *itr = tbx_itr_querys(gaf_tbx, reg.c_str()); - kstring_t str = {0,0,0}; - if ( itr ) { - while (tbx_itr_next(gaf_fp, gaf_tbx, itr, &str) >= 0) { - puts(str.s); - } - tbx_itr_destroy(itr); + // Find in sorted GAF + // loop over ranges and print GAF records + for (auto range : ranges) { + string reg = "{node}:" + convert(range.first) + "-" + convert(range.second); + cout << reg << endl; + hts_itr_t *itr = tbx_itr_querys(gaf_tbx, reg.c_str()); + kstring_t str = {0,0,0}; + if ( itr ) { + while (tbx_itr_next(gaf_fp, gaf_tbx, itr, &str) >= 0) { + puts(str.s); + } + tbx_itr_destroy(itr); + } } - } } } else { cerr << "error [vg find]: Cannot find alignments on graph without a sorted GAM" << endl; From e731f65d3acf65a72791714746385025cd85a7cd Mon Sep 17 00:00:00 2001 From: Jean Monlong Date: Tue, 4 Jul 2023 15:46:26 +0200 Subject: [PATCH 07/22] remove debug log --- src/subcommand/find_main.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/subcommand/find_main.cpp b/src/subcommand/find_main.cpp index f6e73ebf3e8..48ebda09a6a 100644 --- a/src/subcommand/find_main.cpp +++ b/src/subcommand/find_main.cpp @@ -478,7 +478,6 @@ int main_find(int argc, char** argv) { } else if (!sorted_gaf_name.empty()) { // read GAF slice in region 'reg' string reg = "{node}:" + convert(start_id) + "-" + convert(end_id); - cout << reg << endl; hts_itr_t *itr = tbx_itr_querys(gaf_tbx, reg.c_str()); kstring_t str = {0,0,0}; if ( itr ) { @@ -521,7 +520,6 @@ int main_find(int argc, char** argv) { // loop over ranges and print GAF records for (auto range : ranges) { string reg = "{node}:" + convert(range.first) + "-" + convert(range.second); - cout << reg << endl; hts_itr_t *itr = tbx_itr_querys(gaf_tbx, reg.c_str()); kstring_t str = {0,0,0}; if ( itr ) { From a8e7fac255f084e97c0bdd90714c9f5c998831fc Mon Sep 17 00:00:00 2001 From: Jean Monlong Date: Tue, 4 Jul 2023 15:48:58 +0200 Subject: [PATCH 08/22] add gaf input to vg chunk (only when specifying a region or node ids) --- src/subcommand/chunk_main.cpp | 140 ++++++++++++++++++++++++++-------- 1 file changed, 108 insertions(+), 32 deletions(-) diff --git a/src/subcommand/chunk_main.cpp b/src/subcommand/chunk_main.cpp index 68488851667..d20164e02dd 100644 --- a/src/subcommand/chunk_main.cpp +++ b/src/subcommand/chunk_main.cpp @@ -48,6 +48,7 @@ void help_chunk(char** argv) { << " -G, --gbwt-name FILE use this GBWT haplotype index for haplotype extraction (for -T)" << endl << " -a, --gam-name FILE chunk this gam file instead of the graph (multiple allowed)" << endl << " -g, --gam-and-graph when used in combination with -a, both gam and graph will be chunked" << endl + << " -F, --in-gaf input alignment is a sorted bgzipped GAF" << endl << "path chunking:" << endl << " -p, --path TARGET write the chunk in the specified (0-based inclusive, multiple allowed)\n" << " path range TARGET=path[:pos1[-pos2]] to standard output" << endl @@ -92,6 +93,7 @@ int main_chunk(int argc, char** argv) { string gbwt_file; vector gam_files; bool gam_and_graph = false; + bool gam_is_gaf = false; vector region_strings; string path_list_file; int chunk_size = 0; @@ -126,6 +128,7 @@ int main_chunk(int argc, char** argv) { {"gbwt-name", required_argument, 0, 'G'}, {"gam-name", required_argument, 0, 'a'}, {"gam-and-graph", no_argument, 0, 'g'}, + {"in-gaf", no_argument, 0, 'F'}, {"path", required_argument, 0, 'p'}, {"path-names", required_argument, 0, 'P'}, {"chunk-size", required_argument, 0, 's'}, @@ -151,7 +154,7 @@ int main_chunk(int argc, char** argv) { }; int option_index = 0; - c = getopt_long (argc, argv, "hx:G:a:gp:P:s:o:e:S:E:b:c:r:R:Tft:n:l:m:CMO:", + c = getopt_long (argc, argv, "hx:G:a:gFp:P:s:o:e:S:E:b:c:r:R:Tft:n:l:m:CMO:", long_options, &option_index); @@ -178,6 +181,10 @@ int main_chunk(int argc, char** argv) { gam_and_graph = true; break; + case 'F': + gam_is_gaf = true; + break; + case 'p': region_strings.push_back(optarg); break; @@ -294,7 +301,7 @@ int main_chunk(int argc, char** argv) { } // need -a if using -f if ((gam_split_size != 0 || fully_contained) && gam_files.empty()) { - cerr << "error:[vg chunk] gam file must be specified with -a when using -f or -m" << endl; + cerr << "error:[vg chunk] read alignment file (gam or gaf) must be specified with -a when using -f or -m" << endl; return 1; } if (components == true && context_steps >= 0) { @@ -341,7 +348,7 @@ int main_chunk(int argc, char** argv) { unique_ptr path_handle_graph; bdsg::ReferencePathOverlayHelper overlay_helper; - if (chunk_graph || trace || context_steps > 0 || context_length > 0 || (!id_range && gam_split_size == 0) || components) { + if (chunk_graph || trace || context_steps > 0 || context_length > 0 || (!id_range && gam_split_size == 0) || (id_range && chunk_gam) || components) { if (xg_file.empty()) { cerr << "error:[vg chunk] graph or xg index (-x) required" << endl; return 1; @@ -391,20 +398,50 @@ int main_chunk(int argc, char** argv) { // We need an index on the GAM to chunk it (if we're not doing components) vector> gam_indexes; + vector> gaf_tbxs; + vector> gaf_fps; if (chunk_gam && !components) { - for (auto gam_file : gam_files) { - try { - get_input_file(gam_file + ".gai", [&](istream& index_stream) { + if (gam_is_gaf){ + for (auto gaf_file : gam_files) { + try { + tbx_t *gaf_tbx = NULL; + htsFile *gaf_fp = NULL; + if (!gaf_file.empty()){ + gaf_tbx = tbx_index_load3(gaf_file.c_str(), NULL, 0); + if ( !gaf_tbx ){ + cerr << "Could not load .tbi/.csi index of " << gaf_file << endl; + exit(1); + } + int nseq; + gaf_fp = hts_open(gaf_file.c_str(),"r"); + if ( !gaf_fp ) { + cerr << "Could not open " << gaf_file << endl; + exit(1); + } + gaf_fps.push_back(unique_ptr(gaf_fp)); + gaf_tbxs.push_back(unique_ptr(gaf_tbx)); + } + } catch (...) { + cerr << "error:[vg chunk] unable to load GAF index file: " << gaf_file << "" << endl; + exit(1); + } + } + } else { + for (auto gam_file : gam_files) { + try { + get_input_file(gam_file + ".gai", [&](istream& index_stream) { gam_indexes.push_back(unique_ptr(new GAMIndex())); gam_indexes.back()->load(index_stream); }); - } catch (...) { - cerr << "error:[vg chunk] unable to load GAM index file: " << gam_file << ".gai" << endl - << " note: a GAM index is required when *not* chunking by components with -C or -M" << endl; - exit(1); + } catch (...) { + cerr << "error:[vg chunk] unable to load GAM index file: " << gam_file << ".gai" << endl + << " note: a GAM index is required when *not* chunking by components with -C or -M" << endl; + exit(1); + } } } } + // If we're chunking on components with a GAM, this map will be used // (instead of an index) unordered_map node_to_component; @@ -593,6 +630,10 @@ int main_chunk(int argc, char** argv) { // now ready to get our chunk on if (gam_split_size != 0) { + if(!gam_is_gaf){ + cerr << "error[vg chunk]: GAF file input toggled with -F but, currently, only GAM files can by split." << endl; + return 1; + } for (size_t gi = 0; gi < gam_files.size(); ++gi) { ifstream gam_stream; string& gam_file = gam_files[gi]; @@ -629,7 +670,7 @@ int main_chunk(int argc, char** argv) { vector> gam_streams_vec(gam_files.size()); vector> cursors_vec(gam_files.size()); - if (chunk_gam) { + if (chunk_gam & !gam_is_gaf) { for (size_t gam_i = 0; gam_i < gam_streams_vec.size(); ++gam_i) { auto& gam_file = gam_files[gam_i]; auto& gam_streams = gam_streams_vec[gam_i]; @@ -761,30 +802,61 @@ int main_chunk(int argc, char** argv) { // optional gam chunking if (chunk_gam) { if (!components) { - // old way: use the gam index - for (size_t gi = 0; gi < gam_indexes.size(); ++gi) { - auto& gam_index = gam_indexes[gi]; - assert(gam_index.get() != nullptr); - GAMIndex::cursor_t& cursor = cursors_vec[gi][tid]; - - string gam_name = chunk_name(out_chunk_prefix, i, output_regions[i], ".gam", gi, components); - ofstream out_gam_file(gam_name); - if (!out_gam_file) { - cerr << "error[vg chunk]: can't open output gam file " << gam_name << endl; - exit(1); + // Work out the ID ranges to look up + vector> region_id_ranges; + if (subgraph) { + // Use the regions from the graph + region_id_ranges = vg::algorithms::sorted_id_ranges(subgraph.get()); + } else { + // Use the region we were asked for + region_id_ranges = {{region.start, region.end}}; + } + + if(gam_is_gaf){ + // use the indexed bgzipped GAFs + for (size_t gi = 0; gi < gaf_fps.size(); ++gi) { + auto& gaf_fp = gaf_fps[gi]; + assert(gaf_fp.get() != nullptr); + auto& gaf_tbx = gaf_tbxs[gi]; + assert(gaf_tbx.get() != nullptr); + + string gaf_name = chunk_name(out_chunk_prefix, i, output_regions[i], ".gaf", gi, components); + ofstream out_gaf_file(gaf_name); + if (!out_gaf_file) { + cerr << "error[vg chunk]: can't open output gaf file " << gaf_name << endl; + exit(1); + } + + // loop over ranges and print GAF records + for (auto range : region_id_ranges) { + string reg = "{node}:" + convert(range.first) + "-" + convert(range.second); + hts_itr_t *itr = tbx_itr_querys(gaf_tbx.get(), reg.c_str()); + kstring_t str = {0,0,0}; + if ( itr ) { + while (tbx_itr_next(gaf_fp.get(), gaf_tbx.get(), itr, &str) >= 0) { + out_gaf_file << str.s << endl; + } + tbx_itr_destroy(itr); + } + } + out_gaf_file.close(); } + } else { + // old way: use the gam index + for (size_t gi = 0; gi < gam_indexes.size(); ++gi) { + auto& gam_index = gam_indexes[gi]; + assert(gam_index.get() != nullptr); + GAMIndex::cursor_t& cursor = cursors_vec[gi][tid]; - // Work out the ID ranges to look up - vector> region_id_ranges; - if (subgraph) { - // Use the regions from the graph - region_id_ranges = vg::algorithms::sorted_id_ranges(subgraph.get()); - } else { - // Use the region we were asked for - region_id_ranges = {{region.start, region.end}}; + string gam_name = chunk_name(out_chunk_prefix, i, output_regions[i], ".gam", gi, components); + ofstream out_gam_file(gam_name); + if (!out_gam_file) { + cerr << "error[vg chunk]: can't open output gam file " << gam_name << endl; + exit(1); + } + + gam_index->find(cursor, region_id_ranges, vg::io::emit_to(out_gam_file), fully_contained); } - - gam_index->find(cursor, region_id_ranges, vg::io::emit_to(out_gam_file), fully_contained); } } else { #pragma omp critical (node_to_component) @@ -834,6 +906,10 @@ int main_chunk(int argc, char** argv) { // write out component gams if (chunk_gam && components) { + if(!gam_is_gaf){ + cerr << "error[vg chunk]: GAF file input toggled with -F but, currently, only GAM files can by chunked by component." << endl; + return 1; + } // buffer size of each component, total across threads static const size_t output_buffer_total_size = 100000; From ddc9a9adfc03f9e003e2f3ad0afd7ca845aa7f25 Mon Sep 17 00:00:00 2001 From: Jean Monlong Date: Mon, 31 Jul 2023 17:48:27 +0200 Subject: [PATCH 09/22] fix error triggering for gaf splitting/components --- src/subcommand/chunk_main.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/subcommand/chunk_main.cpp b/src/subcommand/chunk_main.cpp index d20164e02dd..3d0f9981fa5 100644 --- a/src/subcommand/chunk_main.cpp +++ b/src/subcommand/chunk_main.cpp @@ -630,7 +630,7 @@ int main_chunk(int argc, char** argv) { // now ready to get our chunk on if (gam_split_size != 0) { - if(!gam_is_gaf){ + if(gam_is_gaf){ cerr << "error[vg chunk]: GAF file input toggled with -F but, currently, only GAM files can by split." << endl; return 1; } @@ -698,9 +698,11 @@ int main_chunk(int argc, char** argv) { unique_ptr subgraph; map trace_thread_frequencies; if (!component_ids.empty()) { + cout << "here?" << endl; subgraph = vg::io::new_output_graph(output_format); chunker.extract_component(component_ids[i], *subgraph, false); output_regions[i] = region; + cout << "nope" << endl; } else if (id_range == false) { subgraph = vg::io::new_output_graph(output_format); @@ -906,8 +908,8 @@ int main_chunk(int argc, char** argv) { // write out component gams if (chunk_gam && components) { - if(!gam_is_gaf){ - cerr << "error[vg chunk]: GAF file input toggled with -F but, currently, only GAM files can by chunked by component." << endl; + if(gam_is_gaf){ + cerr << "error[vg chunk]: GAF file input toggled with -F but, currently, only GAM files can by chunked by component. A workaround is to query one chromosome-component as the reference path and all contained snarls using '-p PATHNAME -S SNARLFILE'." << endl; return 1; } From 39a365214cfe89ee329cdcafd3a6bb07186f3b51 Mon Sep 17 00:00:00 2001 From: Jean Monlong Date: Mon, 31 Jul 2023 23:37:59 +0200 Subject: [PATCH 10/22] minor comments/msg updates --- src/subcommand/chunk_main.cpp | 2 +- src/subcommand/gamsort_main.cpp | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/subcommand/chunk_main.cpp b/src/subcommand/chunk_main.cpp index 3d0f9981fa5..2308615034e 100644 --- a/src/subcommand/chunk_main.cpp +++ b/src/subcommand/chunk_main.cpp @@ -631,7 +631,7 @@ int main_chunk(int argc, char** argv) { // now ready to get our chunk on if (gam_split_size != 0) { if(gam_is_gaf){ - cerr << "error[vg chunk]: GAF file input toggled with -F but, currently, only GAM files can by split." << endl; + cerr << "error[vg chunk]: GAF file input toggled with -F but, currently, only GAM files can by split. A workaround would be to split the GAF file using split -l/-n which can split text files into chunks." << endl; return 1; } for (size_t gi = 0; gi < gam_files.size(); ++gi) { diff --git a/src/subcommand/gamsort_main.cpp b/src/subcommand/gamsort_main.cpp index 42e12d6b9d8..7c3d10e332f 100644 --- a/src/subcommand/gamsort_main.cpp +++ b/src/subcommand/gamsort_main.cpp @@ -155,6 +155,7 @@ int main_gamsort(int argc, char **argv) kstring_t s_buffer = KS_INITIALIZE; gafkluge::GafRecord gaf; + // TODO manage temporary files better std::string chunk_outf = "temp_gafsort_" + std::to_string(chunk_id) + ".gaf"; if(show_progress){ cerr << "Preparing temporary chunk " << chunk_outf << "..." << endl; @@ -178,7 +179,7 @@ int main_gamsort(int argc, char **argv) } // write them as new GAF tags 'rk1' and 'rk2' // they'll get written in the temporary chunks to avoid having - // to find them again when mergin them + // to find them again when merging them gaf.opt_fields["rk1"] = make_pair('i', std::to_string(min_node)); gaf.opt_fields["rk2"] = make_pair('i', std::to_string(max_node)); current_gaf_chunk.push_back(gaf); From f306e4a717aaf2454456dfb8fbc824ecb5c341f1 Mon Sep 17 00:00:00 2001 From: Jean Monlong Date: Wed, 2 Aug 2023 23:29:21 +0200 Subject: [PATCH 11/22] handle bed regions on paths that are chopped in subpaths --- src/alignment.cpp | 152 +++++++++++++++++++++++++++---- src/subcommand/annotate_main.cpp | 2 +- 2 files changed, 137 insertions(+), 17 deletions(-) diff --git a/src/alignment.cpp b/src/alignment.cpp index 6b464fe38b1..c7da73f4484 100644 --- a/src/alignment.cpp +++ b/src/alignment.cpp @@ -1865,22 +1865,16 @@ void parse_bed_regions(istream& bedstream, size_t score = 0; string strand; + // in case we need to look for subpaths, keep the info store for reuse + unordered_map> base_path_to_subpaths; + for (int line = 1; getline(bedstream, row); ++line) { if (row.size() < 2 || row[0] == '#') { continue; } istringstream ss(row); + ss >> seq; - - if (!graph->has_path(seq)) { - // This path doesn't exist, and we'll get a segfault or worse if - // we go look for positions in it. - cerr << "warning: path \"" << seq << "\" not found in index, skipping" << endl; - continue; - } - - path_handle_t path_handle = graph->get_path_handle(seq); - ss >> sbuf; ss >> ebuf; @@ -1890,6 +1884,73 @@ void parse_bed_regions(istream& bedstream, continue; } + if (!graph->has_path(seq)) { + // This path doesn't exist, and we'll get a segfault or worse if + // we go look for positions in it. + // but maybe it's chopped in subranges? + // first look in our cached subpaths + bool subpath_found = base_path_to_subpaths.count(seq) > 0; + // if not there, look in the graph + if(!subpath_found){ + PathSense sense; + string sample; + string locus; + size_t haplotype; + size_t phase_block; + subrange_t subrange; + PathMetadata::parse_path_name(seq, + sense, + sample, + locus, + haplotype, + phase_block, + subrange); + if (subrange == PathMetadata::NO_SUBRANGE) { + // the path name souldn't describe a subpath + graph->for_each_path_matching({sense}, {sample}, {locus}, [&](const path_handle_t& match) { + if (graph->get_haplotype(match) != haplotype) { + // Skip this haplotype + return true; + } + if (graph->get_phase_block(match) != phase_block) { + // Skip this phase block + return true; + } + // we've found a subpath for that base name. save the handle + base_path_to_subpaths[seq].push_back(match); + subpath_found = true; + return true; + }); + } + } + if(!subpath_found){ + // we've looked for subpaths and couldn't found anything + cerr << "warning: path \"" << seq << "\" not found in index, skipping" << endl; + continue; + } else { + // update the path name to the subpaths and adjust sbuf/ebuf based on its offset + // look for the subpath containing the range [sbuf-ebuf] + for (auto& path : base_path_to_subpaths[seq]) { + subrange_t subrange = graph->get_subrange(path); + // TODO: one of these should be <=/>=, but which one... + if(sbuf > subrange.first && sbuf < subrange.second){ + if(ebuf > subrange.first && ebuf < subrange.second){ + // we found the matching subpath, update handle and positions + seq = graph->get_path_name(path); + sbuf = sbuf - subrange.first; + ebuf = ebuf - subrange.first; + } else { + cerr << "warning: path " << seq << " is chopped and the queried range " + << ebuf << "-" << sbuf << "overlaps two different subpaths " + << " in bed line " << line << ", skipping: " << row << endl; + } + } + } + } + } + + path_handle_t path_handle = graph->get_path_handle(seq); + if (sbuf >= ebuf && !graph->get_is_circular(path_handle)) { // The start of the region can be after the end of the region only if the underlying path is circular. // That's not the case, so complain and skip the region. @@ -1897,12 +1958,70 @@ void parse_bed_regions(istream& bedstream, << line << ": " << row << endl; continue; } - + if (ebuf > graph->get_path_length(path_handle)) { - // Skip ends that are too late - cerr << "warning: out of range path end " << ebuf << " > " << graph->get_path_length(path_handle) - << " in bed line " << line << ", skipping: " << row << endl; - continue; + // Could be that the path is chopped but the first subpath is named like the base/full path + // if so, it was found in the graph but is shorter than the actual full path + // in case that happened, check for other subpaths that might have that base name + // if we find one and the range match the queried range, use that + bool subpath_found = false; + if(!base_path_to_subpaths.count(seq)){ + // not in our subpath cache, so let's look for subpaths + PathSense sense; + string sample; + string locus; + size_t haplotype; + size_t phase_block; + subrange_t subrange; + PathMetadata::parse_path_name(seq, + sense, + sample, + locus, + haplotype, + phase_block, + subrange); + + if (subrange == PathMetadata::NO_SUBRANGE) { + // the path name souldn't describe a subpath + graph->for_each_path_matching({sense}, {sample}, {locus}, [&](const path_handle_t& match) { + if (graph->get_haplotype(match) != haplotype) { + // Skip this haplotype + return true; + } + if (graph->get_phase_block(match) != phase_block) { + // Skip this phase block + return true; + } + // we've found a subrange for this base path + base_path_to_subpaths[seq].push_back(match); + return true; + }); + } + } + if(base_path_to_subpaths.count(seq)){ + // there are subpaths, let's look for the one containing [sbuf-ebuf] + for (auto& path : base_path_to_subpaths[seq]) { + subrange_t subrange = graph->get_subrange(path); + // TODO: one of these should be <=/>=, but which one... + if(sbuf > subrange.first && sbuf < subrange.second){ + if(ebuf > subrange.first && ebuf < subrange.second){ + // we found the matching subpath, update handle and positions + subpath_found = true; + path_handle = path; + sbuf = sbuf - subrange.first; + ebuf = ebuf - subrange.first; + } else { + cerr << "warning: path " << seq << " is chopped and the queried range " << ebuf << "-" << sbuf << "overlaps two different subpaths " << " in bed line " << line << ", skipping: " << row << endl; + } + } + } + } + if(!subpath_found){ + // Skip ends that are too late + cerr << "warning: out of range path end " << ebuf << " > " << graph->get_path_length(path_handle) + << " in bed line " << line << ", skipping: " << row << endl; + continue; + } } if (sbuf >= graph->get_path_length(path_handle)) { @@ -1995,6 +2114,7 @@ void parse_gff_regions(istream& gffstream, is_reverse = true; } + // TODO HANDLE SUBPATH HERE TOO if (!graph->has_path(seq)) { // This path doesn't exist, and we'll get a segfault or worse if // we go look for positions in it. @@ -2139,7 +2259,7 @@ Alignment target_alignment(const PathPositionHandleGraph* graph, const path_hand graph->get_path_name(path)); } - // Split the proivided Mapping of edits at the path end/start junction + // Split the provided Mapping of edits at the path end/start junction auto part_mappings = cut_mapping_offset(cigar_mapping, path_len - pos1); // We extract from pos1 to the end diff --git a/src/subcommand/annotate_main.cpp b/src/subcommand/annotate_main.cpp index 418767dceaf..2dd738e50bb 100644 --- a/src/subcommand/annotate_main.cpp +++ b/src/subcommand/annotate_main.cpp @@ -219,7 +219,7 @@ int main_annotate(int argc, char** argv) { // We are computing a novelty table. // TODO: refactor this into novelty annotation and annotation-to-table conversion. if (add_positions || !bed_names.empty()) { - // We can't amke the TSV and also annotate the reads + // We can't make the TSV and also annotate the reads cerr << "error [vg annotate]: Cannot annotate reads while computing novelty table" << endl; return 1; } From 383622f5535277d9b9b52a5e926e5f8b0e1dc138 Mon Sep 17 00:00:00 2001 From: Jean Monlong Date: Thu, 3 Aug 2023 11:43:01 +0200 Subject: [PATCH 12/22] gff annotation also handle paths chopped into subpaths --- src/alignment.cpp | 108 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 84 insertions(+), 24 deletions(-) diff --git a/src/alignment.cpp b/src/alignment.cpp index c7da73f4484..0fb737b4d90 100644 --- a/src/alignment.cpp +++ b/src/alignment.cpp @@ -1898,13 +1898,7 @@ void parse_bed_regions(istream& bedstream, size_t haplotype; size_t phase_block; subrange_t subrange; - PathMetadata::parse_path_name(seq, - sense, - sample, - locus, - haplotype, - phase_block, - subrange); + PathMetadata::parse_path_name(seq, sense, sample, locus, haplotype, phase_block, subrange); if (subrange == PathMetadata::NO_SUBRANGE) { // the path name souldn't describe a subpath graph->for_each_path_matching({sense}, {sample}, {locus}, [&](const path_handle_t& match) { @@ -1930,6 +1924,7 @@ void parse_bed_regions(istream& bedstream, } else { // update the path name to the subpaths and adjust sbuf/ebuf based on its offset // look for the subpath containing the range [sbuf-ebuf] + bool cover_two_subpaths = false; for (auto& path : base_path_to_subpaths[seq]) { subrange_t subrange = graph->get_subrange(path); // TODO: one of these should be <=/>=, but which one... @@ -1940,12 +1935,16 @@ void parse_bed_regions(istream& bedstream, sbuf = sbuf - subrange.first; ebuf = ebuf - subrange.first; } else { - cerr << "warning: path " << seq << " is chopped and the queried range " - << ebuf << "-" << sbuf << "overlaps two different subpaths " - << " in bed line " << line << ", skipping: " << row << endl; + cover_two_subpaths = true; } } } + if(cover_two_subpaths){ + cerr << "warning: path " << seq << " is chopped and the queried range " + << sbuf << "-" << ebuf << " overlaps two different subpaths in bed line " + << line << ", skipping: " << row << endl; + continue; + } } } @@ -1973,13 +1972,7 @@ void parse_bed_regions(istream& bedstream, size_t haplotype; size_t phase_block; subrange_t subrange; - PathMetadata::parse_path_name(seq, - sense, - sample, - locus, - haplotype, - phase_block, - subrange); + PathMetadata::parse_path_name(seq, sense, sample, locus, haplotype, phase_block, subrange); if (subrange == PathMetadata::NO_SUBRANGE) { // the path name souldn't describe a subpath @@ -2000,6 +1993,7 @@ void parse_bed_regions(istream& bedstream, } if(base_path_to_subpaths.count(seq)){ // there are subpaths, let's look for the one containing [sbuf-ebuf] + bool cover_two_subpaths = false; for (auto& path : base_path_to_subpaths[seq]) { subrange_t subrange = graph->get_subrange(path); // TODO: one of these should be <=/>=, but which one... @@ -2011,10 +2005,16 @@ void parse_bed_regions(istream& bedstream, sbuf = sbuf - subrange.first; ebuf = ebuf - subrange.first; } else { - cerr << "warning: path " << seq << " is chopped and the queried range " << ebuf << "-" << sbuf << "overlaps two different subpaths " << " in bed line " << line << ", skipping: " << row << endl; + cover_two_subpaths = true; } } } + if(cover_two_subpaths){ + cerr << "warning: path " << seq << " is chopped and the queried range " + << sbuf << "-" << ebuf << " overlaps two different subpaths in bed line " + << line << ", skipping: " << row << endl; + continue; + } } if(!subpath_found){ // Skip ends that are too late @@ -2070,6 +2070,9 @@ void parse_gff_regions(istream& gffstream, string num; string annotations; + // in case we need to look for subpaths, keep the info store for reuse + unordered_map> base_path_to_subpaths; + for (int line = 1; getline(gffstream, row); ++line) { if (row.size() < 2 || row[0] == '#') { continue; @@ -2114,16 +2117,73 @@ void parse_gff_regions(istream& gffstream, is_reverse = true; } - // TODO HANDLE SUBPATH HERE TOO if (!graph->has_path(seq)) { // This path doesn't exist, and we'll get a segfault or worse if // we go look for positions in it. - cerr << "warning: path \"" << seq << "\" not found in index, skipping" << endl; - } else { - Alignment alignment = target_alignment(graph, graph->get_path_handle(seq), sbuf, ebuf, name, is_reverse); - - out_alignments->push_back(alignment); + // but maybe it's chopped in subranges? + // first look in our cached subpaths + bool subpath_found = base_path_to_subpaths.count(seq) > 0; + // if not there, look in the graph + if(!subpath_found){ + PathSense sense; + string sample; + string locus; + size_t haplotype; + size_t phase_block; + subrange_t subrange; + PathMetadata::parse_path_name(seq, sense, sample, locus, haplotype, phase_block, subrange); + if (subrange == PathMetadata::NO_SUBRANGE) { + // the path name souldn't describe a subpath + graph->for_each_path_matching({sense}, {sample}, {locus}, [&](const path_handle_t& match) { + if (graph->get_haplotype(match) != haplotype) { + // Skip this haplotype + return true; + } + if (graph->get_phase_block(match) != phase_block) { + // Skip this phase block + return true; + } + // we've found a subpath for that base name. save the handle + base_path_to_subpaths[seq].push_back(match); + subpath_found = true; + return true; + }); + } + } + if(!subpath_found){ + // we've looked for subpaths and couldn't found anything + cerr << "warning: path \"" << seq << "\" not found in index, skipping" << endl; + continue; + } else { + // update the path name to the subpath's and adjust sbuf/ebuf based on its offset + // look for the subpath containing the range [sbuf-ebuf] + bool cover_two_subpaths = false; + for (auto& path : base_path_to_subpaths[seq]) { + subrange_t subrange = graph->get_subrange(path); + // TODO: one of these should be <=/>=, but which one... + if(sbuf > subrange.first && sbuf < subrange.second){ + if(ebuf > subrange.first && ebuf < subrange.second){ + // we found the matching subpath, update handle and positions + seq = graph->get_path_name(path); + sbuf = sbuf - subrange.first; + ebuf = ebuf - subrange.first; + } else { + cover_two_subpaths = true; + } + } + } + if(cover_two_subpaths){ + cerr << "warning: path " << seq << " is chopped and the queried range " + << sbuf << "-" << ebuf << " overlaps two different subpaths in bed line " + << line << ", skipping: " << row << endl; + continue; + } + } } + + Alignment alignment = target_alignment(graph, graph->get_path_handle(seq), sbuf, ebuf, name, is_reverse); + + out_alignments->push_back(alignment); } } } From bb5c0591c6e2751b4f288bd2549c6f68ce2d7f92 Mon Sep 17 00:00:00 2001 From: Jean Monlong Date: Tue, 8 Aug 2023 22:53:56 +0200 Subject: [PATCH 13/22] handle subrange with no end position, plus some dev tests --- src/alignment.cpp | 50 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 9 deletions(-) diff --git a/src/alignment.cpp b/src/alignment.cpp index c7da73f4484..0d34e77a72d 100644 --- a/src/alignment.cpp +++ b/src/alignment.cpp @@ -3,6 +3,7 @@ #include "annotation.hpp" #include +#include using namespace vg::io; @@ -1867,13 +1868,19 @@ void parse_bed_regions(istream& bedstream, // in case we need to look for subpaths, keep the info store for reuse unordered_map> base_path_to_subpaths; - + // to warn about slow annotation // TODO remove after testing? + std::chrono::time_point t_start, t_end; + std::chrono::duration elapsed_seconds; + for (int line = 1; getline(bedstream, row); ++line) { if (row.size() < 2 || row[0] == '#') { continue; } istringstream ss(row); + // to warn about slow annotation // TODO remove after testing? + t_start = std::chrono::system_clock::now(); + ss >> seq; ss >> sbuf; ss >> ebuf; @@ -1888,10 +1895,12 @@ void parse_bed_regions(istream& bedstream, // This path doesn't exist, and we'll get a segfault or worse if // we go look for positions in it. // but maybe it's chopped in subranges? + bool subpath_found = false; // first look in our cached subpaths - bool subpath_found = base_path_to_subpaths.count(seq) > 0; // if not there, look in the graph - if(!subpath_found){ + if(!base_path_to_subpaths.count(seq)){ + // TODO remove if it doesn't significantly help with memory + base_path_to_subpaths.clear(); PathSense sense; string sample; string locus; @@ -1918,12 +1927,11 @@ void parse_bed_regions(istream& bedstream, } // we've found a subpath for that base name. save the handle base_path_to_subpaths[seq].push_back(match); - subpath_found = true; return true; }); } } - if(!subpath_found){ + if(!base_path_to_subpaths.count(seq)){ // we've looked for subpaths and couldn't found anything cerr << "warning: path \"" << seq << "\" not found in index, skipping" << endl; continue; @@ -1932,6 +1940,9 @@ void parse_bed_regions(istream& bedstream, // look for the subpath containing the range [sbuf-ebuf] for (auto& path : base_path_to_subpaths[seq]) { subrange_t subrange = graph->get_subrange(path); + if (subrange.second == PathMetadata::NO_END_POSITION){ + subrange.second = subrange.first + graph->get_path_length(path); + } // TODO: one of these should be <=/>=, but which one... if(sbuf > subrange.first && sbuf < subrange.second){ if(ebuf > subrange.first && ebuf < subrange.second){ @@ -1939,13 +1950,19 @@ void parse_bed_regions(istream& bedstream, seq = graph->get_path_name(path); sbuf = sbuf - subrange.first; ebuf = ebuf - subrange.first; + subpath_found = true; + break; } else { cerr << "warning: path " << seq << " is chopped and the queried range " - << ebuf << "-" << sbuf << "overlaps two different subpaths " + << sbuf << "-" << ebuf << " overlaps two different subpaths " << " in bed line " << line << ", skipping: " << row << endl; + break; } } } + if (!subpath_found){ + continue; + } } } @@ -1966,6 +1983,8 @@ void parse_bed_regions(istream& bedstream, // if we find one and the range match the queried range, use that bool subpath_found = false; if(!base_path_to_subpaths.count(seq)){ + // TODO remove if it doesn't significantly help with memory + base_path_to_subpaths.clear(); // not in our subpath cache, so let's look for subpaths PathSense sense; string sample; @@ -2002,6 +2021,9 @@ void parse_bed_regions(istream& bedstream, // there are subpaths, let's look for the one containing [sbuf-ebuf] for (auto& path : base_path_to_subpaths[seq]) { subrange_t subrange = graph->get_subrange(path); + if (subrange.second == PathMetadata::NO_END_POSITION){ + subrange.second = subrange.first + graph->get_path_length(path); + } // TODO: one of these should be <=/>=, but which one... if(sbuf > subrange.first && sbuf < subrange.second){ if(ebuf > subrange.first && ebuf < subrange.second){ @@ -2010,16 +2032,19 @@ void parse_bed_regions(istream& bedstream, path_handle = path; sbuf = sbuf - subrange.first; ebuf = ebuf - subrange.first; + break; } else { - cerr << "warning: path " << seq << " is chopped and the queried range " << ebuf << "-" << sbuf << "overlaps two different subpaths " << " in bed line " << line << ", skipping: " << row << endl; + cerr << "warning: path " << seq << " is chopped and the queried range " << sbuf << "-" << ebuf << " overlaps two different subpaths " << " in bed line " << line << ", skipping: " << row << endl; + break; } } } - } - if(!subpath_found){ + } else { // Skip ends that are too late cerr << "warning: out of range path end " << ebuf << " > " << graph->get_path_length(path_handle) << " in bed line " << line << ", skipping: " << row << endl; + } + if(!subpath_found){ continue; } } @@ -2046,6 +2071,13 @@ void parse_bed_regions(istream& bedstream, alignment.set_score(score); out_alignments->push_back(alignment); + + // to warn about slow annotation // TODO remove after testing? + t_end = std::chrono::system_clock::now(); + elapsed_seconds = t_end - t_start; + if(elapsed_seconds.count() > 120){ + cerr << "warning: bed line " << line << " took " << elapsed_seconds.count() << " seconds: " << row << endl; + } } } From 260d837241ab2dc8fd3aa99abf1c39a10f6ed332 Mon Sep 17 00:00:00 2001 From: Jean Monlong Date: Wed, 9 Aug 2023 16:36:08 +0200 Subject: [PATCH 14/22] handle 2 subpath formats and regions spanning subpaths (naively) --- src/alignment.cpp | 163 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 123 insertions(+), 40 deletions(-) diff --git a/src/alignment.cpp b/src/alignment.cpp index 0d34e77a72d..9c88a803a76 100644 --- a/src/alignment.cpp +++ b/src/alignment.cpp @@ -1862,6 +1862,13 @@ void parse_bed_regions(istream& bedstream, size_t sbuf; // Record end position size_t ebuf; + // if region must be chopped, save the remaining piece to annotate + // (empty seq means no trimmed region to process) + string seq_trim = ""; + size_t sbuf_trim; + size_t ebuf_trim; + string other_trim = ""; + // region information string name; size_t score = 0; string strand; @@ -1871,9 +1878,16 @@ void parse_bed_regions(istream& bedstream, // to warn about slow annotation // TODO remove after testing? std::chrono::time_point t_start, t_end; std::chrono::duration elapsed_seconds; - - for (int line = 1; getline(bedstream, row); ++line) { + + int line = 1; + bool more_regions = !bedstream.eof(); + getline(bedstream, row); + + while (more_regions) { if (row.size() < 2 || row[0] == '#') { + more_regions = !bedstream.eof(); + getline(bedstream, row); + line++; continue; } istringstream ss(row); @@ -1888,6 +1902,9 @@ void parse_bed_regions(istream& bedstream, if (ss.fail()) { // Skip lines that can't be parsed cerr << "warning: Error parsing bed line " << line << ", skipping: " << row << endl; + more_regions = !bedstream.eof(); + getline(bedstream, row); + line++; continue; } @@ -1899,8 +1916,6 @@ void parse_bed_regions(istream& bedstream, // first look in our cached subpaths // if not there, look in the graph if(!base_path_to_subpaths.count(seq)){ - // TODO remove if it doesn't significantly help with memory - base_path_to_subpaths.clear(); PathSense sense; string sample; string locus; @@ -1934,33 +1949,52 @@ void parse_bed_regions(istream& bedstream, if(!base_path_to_subpaths.count(seq)){ // we've looked for subpaths and couldn't found anything cerr << "warning: path \"" << seq << "\" not found in index, skipping" << endl; + more_regions = !bedstream.eof(); + getline(bedstream, row); + line++; continue; } else { // update the path name to the subpaths and adjust sbuf/ebuf based on its offset // look for the subpath containing the range [sbuf-ebuf] for (auto& path : base_path_to_subpaths[seq]) { subrange_t subrange = graph->get_subrange(path); + // the two subrange formats are using different indexing + // in path[start-end] start/end are 0-based and the "end" is not included (like the input sbuf/ebuf from BED) + // in path[offset] the offset is "1-based" + // so to make path[offset] into path[start-end] we need to do path[(offset-1)-(offset-1+path_length)] if (subrange.second == PathMetadata::NO_END_POSITION){ + if (subrange.first == PathMetadata::NO_END_POSITION){ + subrange.first = 0; + } else { + subrange.first--; + } subrange.second = subrange.first + graph->get_path_length(path); } - // TODO: one of these should be <=/>=, but which one... - if(sbuf > subrange.first && sbuf < subrange.second){ - if(ebuf > subrange.first && ebuf < subrange.second){ - // we found the matching subpath, update handle and positions - seq = graph->get_path_name(path); - sbuf = sbuf - subrange.first; - ebuf = ebuf - subrange.first; - subpath_found = true; - break; - } else { - cerr << "warning: path " << seq << " is chopped and the queried range " - << sbuf << "-" << ebuf << " overlaps two different subpaths " - << " in bed line " << line << ", skipping: " << row << endl; - break; + if(sbuf >= subrange.first && sbuf < subrange.second){ + if(ebuf > subrange.second){ + // region span multiple subranges so we annonate the beginning now + // and the trimmed region that remains later + seq_trim = seq; + sbuf_trim = subrange.second; + ebuf_trim = ebuf; + ebuf = subrange.second; } + // we found the matching subpath, update handle and positions + seq = graph->get_path_name(path); + sbuf = sbuf - subrange.first; + ebuf = ebuf - subrange.first; + subpath_found = true; + break; } } if (!subpath_found){ + // we've looked for overlapping subpaths and couldn't found one + cerr << "warning: no overlap found between range " << + sbuf << "-" << ebuf << " and subpaths of \"" << + seq << "\", input line " << line << ", skipping" << endl; + more_regions = !bedstream.eof(); + getline(bedstream, row); + line++; continue; } } @@ -1973,6 +2007,9 @@ void parse_bed_regions(istream& bedstream, // That's not the case, so complain and skip the region. cerr << "warning: path \"" << seq << "\" is not circular, skipping end-spanning region on line " << line << ": " << row << endl; + more_regions = !bedstream.eof(); + getline(bedstream, row); + line++; continue; } @@ -1983,8 +2020,6 @@ void parse_bed_regions(istream& bedstream, // if we find one and the range match the queried range, use that bool subpath_found = false; if(!base_path_to_subpaths.count(seq)){ - // TODO remove if it doesn't significantly help with memory - base_path_to_subpaths.clear(); // not in our subpath cache, so let's look for subpaths PathSense sense; string sample; @@ -2017,35 +2052,57 @@ void parse_bed_regions(istream& bedstream, }); } } - if(base_path_to_subpaths.count(seq)){ + if(!base_path_to_subpaths.count(seq)){ + // Skip ends that are too late + cerr << "warning: out of range path end " << ebuf << " > " << graph->get_path_length(path_handle) + << " in bed line " << line << ", skipping: " << row << endl; + more_regions = !bedstream.eof(); + getline(bedstream, row); + line++; + continue; + } else { // there are subpaths, let's look for the one containing [sbuf-ebuf] for (auto& path : base_path_to_subpaths[seq]) { subrange_t subrange = graph->get_subrange(path); + // the two subrange formats are using different indexing + // in path[start-end] start/end are 0-based and the "end" is not included (like the input sbuf/ebuf from BED) + // in path[offset] the offset is "1-based" + // so to make path[offset] into path[start-end] we need to do path[(offset-1)-(offset-1+path_length)] if (subrange.second == PathMetadata::NO_END_POSITION){ + if (subrange.first == PathMetadata::NO_END_POSITION){ + subrange.first = 0; + } else { + subrange.first--; + } subrange.second = subrange.first + graph->get_path_length(path); } - // TODO: one of these should be <=/>=, but which one... - if(sbuf > subrange.first && sbuf < subrange.second){ - if(ebuf > subrange.first && ebuf < subrange.second){ - // we found the matching subpath, update handle and positions - subpath_found = true; - path_handle = path; - sbuf = sbuf - subrange.first; - ebuf = ebuf - subrange.first; - break; - } else { - cerr << "warning: path " << seq << " is chopped and the queried range " << sbuf << "-" << ebuf << " overlaps two different subpaths " << " in bed line " << line << ", skipping: " << row << endl; - break; + if(sbuf >= subrange.first && sbuf < subrange.second){ + if(ebuf > subrange.second){ + // region span multiple subranges so we annonate the beginning now + // and the trimmed region that remains later + seq_trim = seq; + sbuf_trim = subrange.second; + ebuf_trim = ebuf; + ebuf = subrange.second; } + // we found the matching subpath, update handle and positions + subpath_found = true; + path_handle = path; + sbuf = sbuf - subrange.first; + ebuf = ebuf - subrange.first; + break; } } - } else { - // Skip ends that are too late - cerr << "warning: out of range path end " << ebuf << " > " << graph->get_path_length(path_handle) - << " in bed line " << line << ", skipping: " << row << endl; - } - if(!subpath_found){ - continue; + if(!subpath_found){ + // we've looked for overlapping subpaths and couldn't found one + cerr << "warning: no overlap found between range " << + sbuf << "-" << ebuf << " and subpaths of \"" << + seq << "\", input line " << line << ", skipping" << endl; + more_regions = !bedstream.eof(); + getline(bedstream, row); + line++; + continue; + } } } @@ -2053,13 +2110,27 @@ void parse_bed_regions(istream& bedstream, // Skip starts that are too late cerr << "warning: out of range path start " << sbuf << " >= " << graph->get_path_length(path_handle) << " in bed line " << line << ", skipping: " << row << endl; + more_regions = !bedstream.eof(); + getline(bedstream, row); + line++; continue; } // Try parsing the optional fields. If they fail, ignore the problem, because they're optional. ss >> name; + if(!ss.fail()){ + other_trim += ("\t" + name); + } + ss >> score; + if(!ss.fail()){ + other_trim += ("\t" + score); + } + ss >> strand; + if(!ss.fail()){ + other_trim += ("\t" + strand); + } bool is_reverse = false; if(!ss.fail() && strand.compare("-") == 0) { @@ -2078,6 +2149,18 @@ void parse_bed_regions(istream& bedstream, if(elapsed_seconds.count() > 120){ cerr << "warning: bed line " << line << " took " << elapsed_seconds.count() << " seconds: " << row << endl; } + + // prepare next row + if (seq_trim.size() > 0){ + more_regions = true; + row = seq_trim + "\t" + std::to_string(sbuf_trim) + "\t" + std::to_string(ebuf_trim) + other_trim; + seq_trim = ""; + other_trim = ""; + } else { + more_regions = !bedstream.eof(); + getline(bedstream, row); + line++; + } } } From e0640689d33d248e8f4ac8509cdaf18114312c96 Mon Sep 17 00:00:00 2001 From: Jean Monlong Date: Wed, 9 Aug 2023 22:27:28 +0200 Subject: [PATCH 15/22] save absent path to avoid querying too much --- src/alignment.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/alignment.cpp b/src/alignment.cpp index 9c88a803a76..75980179e50 100644 --- a/src/alignment.cpp +++ b/src/alignment.cpp @@ -1875,6 +1875,8 @@ void parse_bed_regions(istream& bedstream, // in case we need to look for subpaths, keep the info store for reuse unordered_map> base_path_to_subpaths; + // to remember which path we've looked for and didn't find (and avoid relooking over and over again) + unordered_map absent_paths; // to warn about slow annotation // TODO remove after testing? std::chrono::time_point t_start, t_end; std::chrono::duration elapsed_seconds; @@ -1915,7 +1917,7 @@ void parse_bed_regions(istream& bedstream, bool subpath_found = false; // first look in our cached subpaths // if not there, look in the graph - if(!base_path_to_subpaths.count(seq)){ + if(!base_path_to_subpaths.count(seq) && !absent_paths.count(seq)){ PathSense sense; string sample; string locus; @@ -1949,6 +1951,8 @@ void parse_bed_regions(istream& bedstream, if(!base_path_to_subpaths.count(seq)){ // we've looked for subpaths and couldn't found anything cerr << "warning: path \"" << seq << "\" not found in index, skipping" << endl; + // remember that this path is not in the graph (no need to look it up again) + absent_paths[seq] = true; more_regions = !bedstream.eof(); getline(bedstream, row); line++; From 9ee63a20df4ec8da4a50642c2f879863ce30894c Mon Sep 17 00:00:00 2001 From: Jean Monlong Date: Thu, 10 Aug 2023 00:03:07 +0200 Subject: [PATCH 16/22] better handle regions overlapping subpaths --- src/alignment.cpp | 122 ++++++++++++++++++++++------------------------ 1 file changed, 59 insertions(+), 63 deletions(-) diff --git a/src/alignment.cpp b/src/alignment.cpp index 858c0bfc312..e8594518a87 100644 --- a/src/alignment.cpp +++ b/src/alignment.cpp @@ -1862,12 +1862,6 @@ void parse_bed_regions(istream& bedstream, size_t sbuf; // Record end position size_t ebuf; - // if region must be chopped, save the remaining piece to annotate - // (empty seq means no trimmed region to process) - string seq_trim = ""; - size_t sbuf_trim; - size_t ebuf_trim; - string other_trim = ""; // region information string name; size_t score = 0; @@ -1877,9 +1871,12 @@ void parse_bed_regions(istream& bedstream, unordered_map> base_path_to_subpaths; // to remember which path we've looked for and didn't find (and avoid relooking over and over again) unordered_map absent_paths; - // to warn about slow annotation // TODO remove after testing? - std::chrono::time_point t_start, t_end; - std::chrono::duration elapsed_seconds; + // if the region must be chopped to multiple subranges, use these vector to + // remember the other path names, start and end positions + vector other_starts; + vector other_ends; + vector other_seqs; + string other_info = ""; int line = 1; bool more_regions = !bedstream.eof(); @@ -1893,9 +1890,6 @@ void parse_bed_regions(istream& bedstream, continue; } istringstream ss(row); - - // to warn about slow annotation // TODO remove after testing? - t_start = std::chrono::system_clock::now(); ss >> seq; ss >> sbuf; @@ -1954,7 +1948,6 @@ void parse_bed_regions(istream& bedstream, } else { // update the path name to the subpaths and adjust sbuf/ebuf based on its offset // look for the subpath containing the range [sbuf-ebuf] - bool cover_two_subpaths = false; for (auto& path : base_path_to_subpaths[seq]) { subrange_t subrange = graph->get_subrange(path); // the two subrange formats are using different indexing @@ -1969,24 +1962,29 @@ void parse_bed_regions(istream& bedstream, } subrange.second = subrange.first + graph->get_path_length(path); } - if(sbuf >= subrange.first && sbuf < subrange.second){ - if(ebuf > subrange.second){ - // region span multiple subranges so we annonate the beginning now - // and the trimmed region that remains later - seq_trim = seq; - sbuf_trim = subrange.second; - ebuf_trim = ebuf; - ebuf = subrange.second; + // if the subpath overlap with the queried range, save it + if(ebuf >= subrange.first && sbuf < subrange.second ){ + if(sbuf < subrange.first){ + other_starts.push_back(0); + } else { + other_starts.push_back(sbuf - subrange.first); + } + if(ebuf >= subrange.second){ + other_ends.push_back(subrange.second - subrange.first); + } else { + other_ends.push_back(ebuf - subrange.first); } - // we found the matching subpath, update handle and positions - seq = graph->get_path_name(path); - sbuf = sbuf - subrange.first; - ebuf = ebuf - subrange.first; - subpath_found = true; - break; + other_seqs.push_back(graph->get_path_name(path)); } } - if (!subpath_found){ + if (!other_seqs.empty()){ + seq = other_seqs.back(); + other_seqs.pop_back(); + sbuf = other_starts.back(); + other_starts.pop_back(); + ebuf = other_ends.back(); + other_ends.pop_back(); + } else { // we've looked for overlapping subpaths and couldn't found one cerr << "warning: no overlap found between range " << sbuf << "-" << ebuf << " and subpaths of \"" << @@ -1996,12 +1994,6 @@ void parse_bed_regions(istream& bedstream, line++; continue; } - if(cover_two_subpaths){ - cerr << "warning: path " << seq << " is chopped and the queried range " - << sbuf << "-" << ebuf << " overlaps two different subpaths in bed line " - << line << ", skipping: " << row << endl; - continue; - } } } @@ -2075,24 +2067,30 @@ void parse_bed_regions(istream& bedstream, } subrange.second = subrange.first + graph->get_path_length(path); } - if(sbuf >= subrange.first && sbuf < subrange.second){ - if(ebuf > subrange.second){ - // region span multiple subranges so we annonate the beginning now - // and the trimmed region that remains later - seq_trim = seq; - sbuf_trim = subrange.second; - ebuf_trim = ebuf; - ebuf = subrange.second; + // if the subpath overlap with the queried range, save it + if(ebuf >= subrange.first && sbuf < subrange.second ){ + if(sbuf < subrange.first){ + other_starts.push_back(0); + } else { + other_starts.push_back(sbuf - subrange.first); + } + if(ebuf >= subrange.second){ + other_ends.push_back(subrange.second - subrange.first); + } else { + other_ends.push_back(ebuf - subrange.first); } - // we found the matching subpath, update handle and positions - subpath_found = true; - path_handle = path; - sbuf = sbuf - subrange.first; - ebuf = ebuf - subrange.first; - break; + other_seqs.push_back(graph->get_path_name(path)); } } - if(!subpath_found){ + if (!other_seqs.empty()){ + seq = other_seqs.back(); + other_seqs.pop_back(); + path_handle = graph->get_path_handle(seq); + sbuf = other_starts.back(); + other_starts.pop_back(); + ebuf = other_ends.back(); + other_ends.pop_back(); + } else { // we've looked for overlapping subpaths and couldn't found one cerr << "warning: no overlap found between range " << sbuf << "-" << ebuf << " and subpaths of \"" << @@ -2118,17 +2116,17 @@ void parse_bed_regions(istream& bedstream, // Try parsing the optional fields. If they fail, ignore the problem, because they're optional. ss >> name; if(!ss.fail()){ - other_trim += ("\t" + name); + other_info += ("\t" + name); } ss >> score; if(!ss.fail()){ - other_trim += ("\t" + score); + other_info += ("\t" + score); } ss >> strand; if(!ss.fail()){ - other_trim += ("\t" + strand); + other_info += ("\t" + strand); } bool is_reverse = false; @@ -2142,19 +2140,17 @@ void parse_bed_regions(istream& bedstream, out_alignments->push_back(alignment); - // to warn about slow annotation // TODO remove after testing? - t_end = std::chrono::system_clock::now(); - elapsed_seconds = t_end - t_start; - if(elapsed_seconds.count() > 120){ - cerr << "warning: bed line " << line << " took " << elapsed_seconds.count() << " seconds: " << row << endl; - } - // prepare next row - if (seq_trim.size() > 0){ + if (!other_seqs.empty()){ + seq = other_seqs.back(); + other_seqs.pop_back(); + sbuf = other_starts.back(); + other_starts.pop_back(); + ebuf = other_ends.back(); + other_ends.pop_back(); more_regions = true; - row = seq_trim + "\t" + std::to_string(sbuf_trim) + "\t" + std::to_string(ebuf_trim) + other_trim; - seq_trim = ""; - other_trim = ""; + row = seq + "\t" + std::to_string(sbuf) + "\t" + std::to_string(ebuf) + other_info; + other_info = ""; } else { more_regions = !bedstream.eof(); getline(bedstream, row); From ab9b79cb7065784eca47250e0dedd95e2a8052ab Mon Sep 17 00:00:00 2001 From: Jean Monlong Date: Thu, 10 Aug 2023 23:16:34 +0200 Subject: [PATCH 17/22] simplify how chopped regions are handled and propagate to gff parser --- src/alignment.cpp | 356 +++++++++++++++++++++++++++++----------------- 1 file changed, 228 insertions(+), 128 deletions(-) diff --git a/src/alignment.cpp b/src/alignment.cpp index e8594518a87..e1a41c37e90 100644 --- a/src/alignment.cpp +++ b/src/alignment.cpp @@ -1876,17 +1876,9 @@ void parse_bed_regions(istream& bedstream, vector other_starts; vector other_ends; vector other_seqs; - string other_info = ""; - int line = 1; - bool more_regions = !bedstream.eof(); - getline(bedstream, row); - - while (more_regions) { + for (int line = 1; getline(bedstream, row); ++line) { if (row.size() < 2 || row[0] == '#') { - more_regions = !bedstream.eof(); - getline(bedstream, row); - line++; continue; } istringstream ss(row); @@ -1898,9 +1890,6 @@ void parse_bed_regions(istream& bedstream, if (ss.fail()) { // Skip lines that can't be parsed cerr << "warning: Error parsing bed line " << line << ", skipping: " << row << endl; - more_regions = !bedstream.eof(); - getline(bedstream, row); - line++; continue; } @@ -1941,9 +1930,6 @@ void parse_bed_regions(istream& bedstream, cerr << "warning: path \"" << seq << "\" not found in index, skipping" << endl; // remember that this path is not in the graph (no need to look it up again) absent_paths[seq] = true; - more_regions = !bedstream.eof(); - getline(bedstream, row); - line++; continue; } else { // update the path name to the subpaths and adjust sbuf/ebuf based on its offset @@ -1989,9 +1975,6 @@ void parse_bed_regions(istream& bedstream, cerr << "warning: no overlap found between range " << sbuf << "-" << ebuf << " and subpaths of \"" << seq << "\", input line " << line << ", skipping" << endl; - more_regions = !bedstream.eof(); - getline(bedstream, row); - line++; continue; } } @@ -2004,9 +1987,6 @@ void parse_bed_regions(istream& bedstream, // That's not the case, so complain and skip the region. cerr << "warning: path \"" << seq << "\" is not circular, skipping end-spanning region on line " << line << ": " << row << endl; - more_regions = !bedstream.eof(); - getline(bedstream, row); - line++; continue; } @@ -2047,9 +2027,6 @@ void parse_bed_regions(istream& bedstream, // Skip ends that are too late cerr << "warning: out of range path end " << ebuf << " > " << graph->get_path_length(path_handle) << " in bed line " << line << ", skipping: " << row << endl; - more_regions = !bedstream.eof(); - getline(bedstream, row); - line++; continue; } else { // there are subpaths, let's look for the one containing [sbuf-ebuf] @@ -2095,9 +2072,6 @@ void parse_bed_regions(istream& bedstream, cerr << "warning: no overlap found between range " << sbuf << "-" << ebuf << " and subpaths of \"" << seq << "\", input line " << line << ", skipping" << endl; - more_regions = !bedstream.eof(); - getline(bedstream, row); - line++; continue; } } @@ -2107,27 +2081,13 @@ void parse_bed_regions(istream& bedstream, // Skip starts that are too late cerr << "warning: out of range path start " << sbuf << " >= " << graph->get_path_length(path_handle) << " in bed line " << line << ", skipping: " << row << endl; - more_regions = !bedstream.eof(); - getline(bedstream, row); - line++; continue; } // Try parsing the optional fields. If they fail, ignore the problem, because they're optional. ss >> name; - if(!ss.fail()){ - other_info += ("\t" + name); - } - ss >> score; - if(!ss.fail()){ - other_info += ("\t" + score); - } - ss >> strand; - if(!ss.fail()){ - other_info += ("\t" + strand); - } bool is_reverse = false; if(!ss.fail() && strand.compare("-") == 0) { @@ -2140,21 +2100,20 @@ void parse_bed_regions(istream& bedstream, out_alignments->push_back(alignment); - // prepare next row + // if more subpaths need to be written, write them now if (!other_seqs.empty()){ + // extract subpath information seq = other_seqs.back(); other_seqs.pop_back(); sbuf = other_starts.back(); other_starts.pop_back(); ebuf = other_ends.back(); other_ends.pop_back(); - more_regions = true; - row = seq + "\t" + std::to_string(sbuf) + "\t" + std::to_string(ebuf) + other_info; - other_info = ""; - } else { - more_regions = !bedstream.eof(); - getline(bedstream, row); - line++; + // get path handle and corresponding alignment + path_handle = graph->get_path_handle(seq); + alignment = target_alignment(graph, path_handle, sbuf, ebuf, name, is_reverse); + alignment.set_score(score); + out_alignments->push_back(alignment); } } } @@ -2182,6 +2141,13 @@ void parse_gff_regions(istream& gffstream, // in case we need to look for subpaths, keep the info store for reuse unordered_map> base_path_to_subpaths; + // to remember which path we've looked for and didn't find (and avoid relooking over and over again) + unordered_map absent_paths; + // if the region must be chopped to multiple subranges, use these vector to + // remember the other path names, start and end positions + vector other_starts; + vector other_ends; + vector other_seqs; for (int line = 1; getline(gffstream, row); ++line) { if (row.size() < 2 || row[0] == '#') { @@ -2200,99 +2166,233 @@ void parse_gff_regions(istream& gffstream, if (ss.fail() || !(sbuf < ebuf)) { cerr << "Error parsing gtf/gff line " << line << ": " << row << endl; - } else { - getline(ss, score, '\t'); - getline(ss, strand, '\t'); - getline(ss, num, '\t'); - getline(ss, annotations, '\t'); - vector vals = split(annotations, ";"); - - string name = ""; - - for (auto& s : vals) { - if (s.find("Name=") == 0) { - name = s.substr(5); - } + continue; + } + + getline(ss, score, '\t'); + getline(ss, strand, '\t'); + getline(ss, num, '\t'); + getline(ss, annotations, '\t'); + + // look for the "Name" info + vector vals = split(annotations, ";"); + string name = ""; + for (auto& s : vals) { + if (s.find("Name=") == 0) { + name = s.substr(5); } + } - // Skips annotations where the name can not be parsed. Empty names can - // results in undefinable behavior downstream. - if (name.empty()) { - cerr << "warning: could not parse annotation name (Name=), skipping line " << line << endl; - continue; - } + // Skips annotations where the name can not be parsed. Empty names can + // results in undefinable behavior downstream. + if (name.empty()) { + cerr << "warning: could not parse annotation name (Name=), skipping line " << line << endl; + continue; + } - bool is_reverse = false; - if(!ss.fail() && strand.compare("-") == 0) { - is_reverse = true; - } + bool is_reverse = false; + if(!ss.fail() && strand.compare("-") == 0) { + is_reverse = true; + } - if (!graph->has_path(seq)) { - // This path doesn't exist, and we'll get a segfault or worse if - // we go look for positions in it. - // but maybe it's chopped in subranges? - // first look in our cached subpaths - bool subpath_found = base_path_to_subpaths.count(seq) > 0; - // if not there, look in the graph - if(!subpath_found){ - PathSense sense; - string sample; - string locus; - size_t haplotype; - size_t phase_block; - subrange_t subrange; - PathMetadata::parse_path_name(seq, sense, sample, locus, haplotype, phase_block, subrange); - if (subrange == PathMetadata::NO_SUBRANGE) { - // the path name souldn't describe a subpath - graph->for_each_path_matching({sense}, {sample}, {locus}, [&](const path_handle_t& match) { - if (graph->get_haplotype(match) != haplotype) { - // Skip this haplotype - return true; - } - if (graph->get_phase_block(match) != phase_block) { - // Skip this phase block - return true; - } - // we've found a subpath for that base name. save the handle - base_path_to_subpaths[seq].push_back(match); - subpath_found = true; + if (!graph->has_path(seq)) { + // This path doesn't exist, and we'll get a segfault or worse if + // we go look for positions in it. + // but maybe it's chopped in subranges? + bool subpath_found = false; + // first look in our cached subpaths + // if not there, look in the graph + if(!base_path_to_subpaths.count(seq) && !absent_paths.count(seq)){ + PathSense sense; + string sample; + string locus; + size_t haplotype; + size_t phase_block; + subrange_t subrange; + PathMetadata::parse_path_name(seq, sense, sample, locus, haplotype, phase_block, subrange); + if (subrange == PathMetadata::NO_SUBRANGE) { + // the path name souldn't describe a subpath + graph->for_each_path_matching({sense}, {sample}, {locus}, [&](const path_handle_t& match) { + if (graph->get_haplotype(match) != haplotype) { + // Skip this haplotype + return true; + } + if (graph->get_phase_block(match) != phase_block) { + // Skip this phase block return true; - }); + } + // we've found a subpath for that base name. save the handle + base_path_to_subpaths[seq].push_back(match); + return true; + }); + } + } + if(!base_path_to_subpaths.count(seq)){ + // we've looked for subpaths and couldn't found anything + cerr << "warning: path \"" << seq << "\" not found in index, skipping" << endl; + // remember that this path is not in the graph (no need to look it up again) + absent_paths[seq] = true; + continue; + } else { + // update the path name to the subpaths and adjust sbuf/ebuf based on its offset + // look for the subpath containing the range [sbuf-ebuf] + for (auto& path : base_path_to_subpaths[seq]) { + subrange_t subrange = graph->get_subrange(path); + // the two subrange formats are using different indexing + // in path[start-end] start/end are 0-based and the "end" is not included (like the input sbuf/ebuf from BED) + // in path[offset] the offset is "1-based" + // so to make path[offset] into path[start-end] we need to do path[(offset-1)-(offset-1+path_length)] + if (subrange.second == PathMetadata::NO_END_POSITION){ + if (subrange.first == PathMetadata::NO_END_POSITION){ + subrange.first = 0; + } else { + subrange.first--; + } + subrange.second = subrange.first + graph->get_path_length(path); + } + // if the subpath overlap with the queried range, save it + if(ebuf >= subrange.first && sbuf < subrange.second ){ + if(sbuf < subrange.first){ + other_starts.push_back(0); + } else { + other_starts.push_back(sbuf - subrange.first); + } + if(ebuf >= subrange.second){ + other_ends.push_back(subrange.second - subrange.first); + } else { + other_ends.push_back(ebuf - subrange.first); + } + other_seqs.push_back(graph->get_path_name(path)); } } - if(!subpath_found){ - // we've looked for subpaths and couldn't found anything - cerr << "warning: path \"" << seq << "\" not found in index, skipping" << endl; - continue; + if (!other_seqs.empty()){ + seq = other_seqs.back(); + other_seqs.pop_back(); + sbuf = other_starts.back(); + other_starts.pop_back(); + ebuf = other_ends.back(); + other_ends.pop_back(); } else { - // update the path name to the subpath's and adjust sbuf/ebuf based on its offset - // look for the subpath containing the range [sbuf-ebuf] - bool cover_two_subpaths = false; - for (auto& path : base_path_to_subpaths[seq]) { - subrange_t subrange = graph->get_subrange(path); - // TODO: one of these should be <=/>=, but which one... - if(sbuf > subrange.first && sbuf < subrange.second){ - if(ebuf > subrange.first && ebuf < subrange.second){ - // we found the matching subpath, update handle and positions - seq = graph->get_path_name(path); - sbuf = sbuf - subrange.first; - ebuf = ebuf - subrange.first; - } else { - cover_two_subpaths = true; - } + // we've looked for overlapping subpaths and couldn't found one + cerr << "warning: no overlap found between range " << + sbuf << "-" << ebuf << " and subpaths of \"" << + seq << "\", input line " << line << ", skipping" << endl; + continue; + } + } + } + + path_handle_t path_handle = graph->get_path_handle(seq); + + if (ebuf > graph->get_path_length(path_handle)) { + // Could be that the path is chopped but the first subpath is named like the base/full path + // if so, it was found in the graph but is shorter than the actual full path + // in case that happened, check for other subpaths that might have that base name + // if we find one and the range match the queried range, use that + bool subpath_found = false; + if(!base_path_to_subpaths.count(seq)){ + // not in our subpath cache, so let's look for subpaths + PathSense sense; + string sample; + string locus; + size_t haplotype; + size_t phase_block; + subrange_t subrange; + PathMetadata::parse_path_name(seq, sense, sample, locus, haplotype, phase_block, subrange); + + if (subrange == PathMetadata::NO_SUBRANGE) { + // the path name souldn't describe a subpath + graph->for_each_path_matching({sense}, {sample}, {locus}, [&](const path_handle_t& match) { + if (graph->get_haplotype(match) != haplotype) { + // Skip this haplotype + return true; + } + if (graph->get_phase_block(match) != phase_block) { + // Skip this phase block + return true; } + // we've found a subrange for this base path + base_path_to_subpaths[seq].push_back(match); + return true; + }); + } + } + if(!base_path_to_subpaths.count(seq)){ + // Skip ends that are too late + cerr << "warning: out of range path end " << ebuf << " > " << graph->get_path_length(path_handle) + << " in bed line " << line << ", skipping: " << row << endl; + continue; + } else { + // there are subpaths, let's look for the one containing [sbuf-ebuf] + for (auto& path : base_path_to_subpaths[seq]) { + subrange_t subrange = graph->get_subrange(path); + // the two subrange formats are using different indexing + // in path[start-end] start/end are 0-based and the "end" is not included (like the input sbuf/ebuf from BED) + // in path[offset] the offset is "1-based" + // so to make path[offset] into path[start-end] we need to do path[(offset-1)-(offset-1+path_length)] + if (subrange.second == PathMetadata::NO_END_POSITION){ + if (subrange.first == PathMetadata::NO_END_POSITION){ + subrange.first = 0; + } else { + subrange.first--; + } + subrange.second = subrange.first + graph->get_path_length(path); } - if(cover_two_subpaths){ - cerr << "warning: path " << seq << " is chopped and the queried range " - << sbuf << "-" << ebuf << " overlaps two different subpaths in bed line " - << line << ", skipping: " << row << endl; - continue; + // if the subpath overlap with the queried range, save it + if(ebuf >= subrange.first && sbuf < subrange.second ){ + if(sbuf < subrange.first){ + other_starts.push_back(0); + } else { + other_starts.push_back(sbuf - subrange.first); + } + if(ebuf >= subrange.second){ + other_ends.push_back(subrange.second - subrange.first); + } else { + other_ends.push_back(ebuf - subrange.first); + } + other_seqs.push_back(graph->get_path_name(path)); } } + if (!other_seqs.empty()){ + seq = other_seqs.back(); + other_seqs.pop_back(); + path_handle = graph->get_path_handle(seq); + sbuf = other_starts.back(); + other_starts.pop_back(); + ebuf = other_ends.back(); + other_ends.pop_back(); + } else { + // we've looked for overlapping subpaths and couldn't found one + cerr << "warning: no overlap found between range " << + sbuf << "-" << ebuf << " and subpaths of \"" << + seq << "\", input line " << line << ", skipping" << endl; + continue; + } } - - Alignment alignment = target_alignment(graph, graph->get_path_handle(seq), sbuf, ebuf, name, is_reverse); + } + + if (sbuf >= graph->get_path_length(path_handle)) { + // Skip starts that are too late + cerr << "warning: out of range path start " << sbuf << " >= " << graph->get_path_length(path_handle) + << " in bed line " << line << ", skipping: " << row << endl; + continue; + } + + Alignment alignment = target_alignment(graph, graph->get_path_handle(seq), sbuf, ebuf, name, is_reverse); + out_alignments->push_back(alignment); + + // if more subpaths need to be written, write them now + if (!other_seqs.empty()){ + // extract subpath information + seq = other_seqs.back(); + other_seqs.pop_back(); + sbuf = other_starts.back(); + other_starts.pop_back(); + ebuf = other_ends.back(); + other_ends.pop_back(); + // get alignment + alignment = target_alignment(graph, graph->get_path_handle(seq), sbuf, ebuf, name, is_reverse); out_alignments->push_back(alignment); } } From 9d5df77565272d2fe84ef98b35cf8c96ddc72fcc Mon Sep 17 00:00:00 2001 From: Jean Monlong Date: Wed, 16 Aug 2023 21:55:33 +0200 Subject: [PATCH 18/22] better handle temporary files --- src/subcommand/gamsort_main.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/subcommand/gamsort_main.cpp b/src/subcommand/gamsort_main.cpp index 7c3d10e332f..59f527d2a0b 100644 --- a/src/subcommand/gamsort_main.cpp +++ b/src/subcommand/gamsort_main.cpp @@ -155,8 +155,7 @@ int main_gamsort(int argc, char **argv) kstring_t s_buffer = KS_INITIALIZE; gafkluge::GafRecord gaf; - // TODO manage temporary files better - std::string chunk_outf = "temp_gafsort_" + std::to_string(chunk_id) + ".gaf"; + string chunk_outf = temp_file::create(); if(show_progress){ cerr << "Preparing temporary chunk " << chunk_outf << "..." << endl; } @@ -192,7 +191,7 @@ int main_gamsort(int argc, char **argv) cerr << " Sorting chunk..." << endl; } std::stable_sort(current_gaf_chunk.begin(), current_gaf_chunk.end(), compare_gaf()); - // write to temp_gafsort_.gaf + // write to temporary file if(show_progress){ cerr << " Writing chunk..." << endl; } @@ -206,7 +205,7 @@ int main_gamsort(int argc, char **argv) current_gaf_chunk.clear(); count = 0; chunk_id++; - chunk_outf = "temp_gafsort_" + std::to_string(chunk_id) + ".gaf"; + chunk_outf = temp_file::create(); if(show_progress){ cerr << "Preparing temporary chunk " << chunk_outf << "..." << endl; } @@ -221,7 +220,7 @@ int main_gamsort(int argc, char **argv) cerr << " Sorting chunk..." << endl; } std::stable_sort(current_gaf_chunk.begin(), current_gaf_chunk.end(), compare_gaf()); - // write to temp_gafsort_.gaf + // write to temporary file if(show_progress){ cerr << " Writing chunk..." << endl; } From ae44fca61eeb17200c8768301c6c5c84ae3c9bd9 Mon Sep 17 00:00:00 2001 From: Jean Monlong Date: Sun, 20 Aug 2023 23:57:18 +0200 Subject: [PATCH 19/22] write alignment buffers once in a while --- src/alignment.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/alignment.cpp b/src/alignment.cpp index e1a41c37e90..ea944f51366 100644 --- a/src/alignment.cpp +++ b/src/alignment.cpp @@ -1,6 +1,7 @@ #include "alignment.hpp" #include "vg/io/gafkluge.hpp" #include "annotation.hpp" +#include #include #include @@ -2101,7 +2102,7 @@ void parse_bed_regions(istream& bedstream, out_alignments->push_back(alignment); // if more subpaths need to be written, write them now - if (!other_seqs.empty()){ + while (!other_seqs.empty()){ // extract subpath information seq = other_seqs.back(); other_seqs.pop_back(); @@ -2115,6 +2116,8 @@ void parse_bed_regions(istream& bedstream, alignment.set_score(score); out_alignments->push_back(alignment); } + + vg::io::write_buffered(cout, *out_alignments, 1000); } } @@ -2383,7 +2386,7 @@ void parse_gff_regions(istream& gffstream, out_alignments->push_back(alignment); // if more subpaths need to be written, write them now - if (!other_seqs.empty()){ + while (!other_seqs.empty()){ // extract subpath information seq = other_seqs.back(); other_seqs.pop_back(); @@ -2395,6 +2398,8 @@ void parse_gff_regions(istream& gffstream, alignment = target_alignment(graph, graph->get_path_handle(seq), sbuf, ebuf, name, is_reverse); out_alignments->push_back(alignment); } + + vg::io::write_buffered(cout, *out_alignments, 1000); } } From 73593a5780ba9b4262b3c2a92adc2590a43a23b7 Mon Sep 17 00:00:00 2001 From: Jean Monlong Date: Thu, 24 Aug 2023 22:31:13 +0200 Subject: [PATCH 20/22] use a min-heap when mergin sorted GAF and simplify to speed up --- src/subcommand/gamsort_main.cpp | 68 ++++++++++++++++----------------- 1 file changed, 32 insertions(+), 36 deletions(-) diff --git a/src/subcommand/gamsort_main.cpp b/src/subcommand/gamsort_main.cpp index 59f527d2a0b..4fd2f47fe98 100644 --- a/src/subcommand/gamsort_main.cpp +++ b/src/subcommand/gamsort_main.cpp @@ -40,6 +40,22 @@ struct compare_gaf { return rk11 < rk12 || (rk11 == rk12 && rk21 < rk22); } }; +// defines a pair of a GAF record and the ID of the file it came from (used when merging sorted GAF files) +struct GafFile { + gafkluge::GafRecord gaf; + int file_i; +}; +// comparator used by the min-heap when merging sorted GAF files +struct greater_gaffile { + bool operator()(const GafFile& gf1, const GafFile& gf2) { + // TODO find a way to not have to convert the node ids to string before and then back to int here? + long long rk11 = std::stoll(gf1.gaf.opt_fields.find("rk1")->second.second); + long long rk12 = std::stoll(gf2.gaf.opt_fields.find("rk1")->second.second); + long long rk21 = std::stoll(gf1.gaf.opt_fields.find("rk2")->second.second); + long long rk22 = std::stoll(gf2.gaf.opt_fields.find("rk2")->second.second); + return rk11 > rk12 || (rk11 == rk12 && rk21 > rk22); + } +}; int main_gamsort(int argc, char **argv) { @@ -237,15 +253,17 @@ int main_gamsort(int argc, char **argv) if(show_progress){ cerr << "Merging " << chunk_files.size() << " files..." << endl; } - + std::vector opened_files; std::vector more_in_file; std::vector opened_file_buffers; - std::vector opened_records; + // heap with the current GAF record of each file + std::priority_queue, greater_gaffile > opened_records; std::string line; // open the temp GAF files and read the first record + GafFile gf; for(int ii=0; ii < chunk_files.size(); ii++){ htsFile* in = hts_open(chunk_files[ii].c_str(), "r"); if (in == NULL) { @@ -253,46 +271,24 @@ int main_gamsort(int argc, char **argv) } opened_file_buffers.push_back(KS_INITIALIZE); opened_files.push_back(in); - more_in_file.push_back(vg::io::get_next_record_from_gaf(nullptr, nullptr, opened_files.back(), opened_file_buffers.back(), gaf)); - opened_records.push_back(gaf); + if(vg::io::get_next_record_from_gaf(nullptr, nullptr, opened_files.back(), opened_file_buffers.back(), gaf)){ + gf.gaf = gaf; + gf.file_i = ii; + opened_records.push(gf); + } } - // merge one by one until no more to read for all files - bool more_to_merge = true; - while(more_to_merge){ + while(opened_records.size() > 0){ // which file will have the smallest record (i.e. to output first) - int smallest_file = -1; - // compare current gaf records - for(int ii = 0; ii Date: Wed, 20 Sep 2023 16:08:27 +0200 Subject: [PATCH 21/22] update htslib dep --- deps/htslib | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/htslib b/deps/htslib index bdf5717ead5..5426f5cd1c6 160000 --- a/deps/htslib +++ b/deps/htslib @@ -1 +1 @@ -Subproject commit bdf5717ead58641cffad2d7fc1fbf3921c14a9e7 +Subproject commit 5426f5cd1c6c7bb377429fe335a64e1522203d82 From fc59bec26c54f8ee4e20ad2135994163f71dc893 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 26 Mar 2024 17:03:51 -0400 Subject: [PATCH 22/22] Update htslib to the version with GAF indexing merged --- deps/htslib | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/htslib b/deps/htslib index 5426f5cd1c6..3cfe87690d0 160000 --- a/deps/htslib +++ b/deps/htslib @@ -1 +1 @@ -Subproject commit 5426f5cd1c6c7bb377429fe335a64e1522203d82 +Subproject commit 3cfe87690d047c06ec0d29a859c930b635a42e96