Skip to content

Commit

Permalink
Merge pull request #329 from jmcbroome/master
Browse files Browse the repository at this point in the history
--haplotype rework and metadata loading flag
  • Loading branch information
yatisht authored Feb 10, 2023
2 parents 2cdfeef + 5d56255 commit d90bc9f
Show file tree
Hide file tree
Showing 5 changed files with 55 additions and 22 deletions.
20 changes: 16 additions & 4 deletions src/matUtils/extract.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,8 @@ po::variables_map parse_extract_command(po::parsed_options parsed) {
"Set to write all final stored metadata to a tsv.")
("whitelist,L", po::value<std::string>()->default_value(""),
"Pass a list of samples, one per line, to always retain regardless of any other parameters.")
("load-all-metadata", po::bool_switch(),
"Use to load all input metadata from -M regardless of sample selection. Significantly increases memory usage.")
("threads,T", po::value<uint32_t>()->default_value(num_cores), num_threads_message.c_str())
("help,h", "Print help messages");
// Collect all the unrecognized options from the first pass. This will include the
Expand Down Expand Up @@ -175,6 +177,7 @@ void extract_main (po::parsed_options parsed) {
bool break_ties = vm["break-ties"].as<bool>();
bool include_nt = vm["include-nt"].as<bool>();
size_t distance_threshold = vm["distance-threshold"].as<size_t>();
bool load_all = vm["load-all-metadata"].as<bool>();

boost::filesystem::path path(dir_prefix);
if (!boost::filesystem::exists(path)) {
Expand Down Expand Up @@ -630,12 +633,21 @@ usher_single_subtree_size == 0 && usher_minimum_subtrees_size == 0) {
metav.push_back(m);
}
assert (metav.size() > 0);
std::set<std::string> samples_included(samples.begin(), samples.end());
if (output_tax_filename == dir_prefix) {
// Don't do this in the case of taxodium pb output
for (auto mv: metav) {
auto scm = read_metafile(mv, samples_included);
catmeta.emplace_back(scm);
if (load_all) {
//if we're loading all sample data anyways, no need to convert to a set.
std::set<std::string> empty;
for (auto mv: metav) {
auto scm = read_metafile(mv, empty, true);
catmeta.emplace_back(scm);
}
} else {
std::set<std::string> samples_included(samples.begin(), samples.end());
for (auto mv: metav) {
auto scm = read_metafile(mv, samples_included);
catmeta.emplace_back(scm);
}
}
}
}
Expand Down
4 changes: 2 additions & 2 deletions src/matUtils/select.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -465,7 +465,7 @@ std::unordered_set<std::string> filter_mut_density(Mutation_Annotated_Tree::Tree
return samples_to_keep;
}

std::unordered_map<std::string,std::unordered_map<std::string,std::string>> read_metafile(std::string metainf, std::set<std::string> samples_to_use) {
std::unordered_map<std::string,std::unordered_map<std::string,std::string>> read_metafile(std::string metainf, std::set<std::string> samples_to_use, bool load_all) {
std::ifstream infile(metainf);
if (!infile) {
fprintf(stderr, "ERROR: Could not open the file: %s!\n", metainf.c_str());
Expand Down Expand Up @@ -493,7 +493,7 @@ std::unordered_map<std::string,std::unordered_map<std::string,std::string>> read
first = false;
} else {
for (size_t i=1; i < words.size(); i++) {
if (samples_to_use.find(words[0]) != samples_to_use.end()) {
if ((samples_to_use.find(words[0]) != samples_to_use.end()) || (load_all)) {
metamap[keys[i]][words[0]] = words[i];
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/matUtils/select.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ std::vector<std::string> sample_intersect (std::unordered_set<std::string> sampl
std::vector<std::string> get_nearby (MAT::Tree* T, std::string sample_id, int number_to_get);
std::vector<std::string> get_short_steppers(MAT::Tree* T, std::vector<std::string> samples_to_check, int max_mutations);
std::vector<std::string> get_short_paths(MAT::Tree* T, std::vector<std::string> samples_to_check, int max_path);
std::unordered_map<std::string,std::unordered_map<std::string,std::string>> read_metafile(std::string metainf, std::set<std::string> samples_to_use);
std::unordered_map<std::string,std::unordered_map<std::string,std::string>> read_metafile(std::string metainf, std::set<std::string> samples_to_use, bool load_all = false);
std::vector<std::string> get_sample_match(MAT::Tree* T, std::vector<std::string> samples_to_check, std::string substring);
std::vector<std::string> fill_random_samples(MAT::Tree* T, std::vector<std::string> current_samples, size_t target_size, bool lca_limit = false);
std::vector<std::string> get_mrca_samples(MAT::Tree* T, std::vector<std::string> current_samples);
Expand Down
49 changes: 35 additions & 14 deletions src/matUtils/summary.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -178,27 +178,48 @@ void write_translate_table(MAT::Tree* T, std::string output_filename, std::strin
fprintf(stderr, "Completed in %ld msec \n\n", timer.Stop());
}

std::map<std::set<std::string>,size_t> count_haplotypes(MAT::Tree* T) {
std::map<std::set<std::string>,size_t> hapmap;
//naive method. for each sample, rsearch along the history to collect each of its mutations
//and add those to the set. At the end, add that set to the hapcount keys if its not there, and increment.
for (auto s: T->get_leaves()) {
std::set<std::string> mset;
for (auto a: T->rsearch(s->identifier, true)) {
for (auto m: a->mutations) {
mset.insert(m.get_string());
std::map<std::map<int,int8_t>,size_t> count_haplotypes(MAT::Tree* T) {
//define a special type- coordinate-state map.
typedef std::map<int,int8_t> hapset;
//count and dynamic implementation dictionaries.
std::map<hapset,size_t> hapcount;
std::map<std::string,hapset> hapmap;
for (auto s: T->depth_first_expansion()) {
hapset mset;
if (!s->is_root()) {
mset = hapmap[s->parent->identifier];
}
//update it for this node.
for (auto m: s->mutations) {
if (m.mut_nuc == m.ref_nuc) {
//reversion to reference, get rid of any change at this position.
auto iter = mset.find(m.position);
if (iter != mset.end()) {
mset.erase(iter);
}
} else {
//update the haplotype.
mset[m.position] = m.mut_nuc;
}
}
if (hapmap.find(mset) == hapmap.end()) {
hapmap[mset] = 0;
//if this node is a leaf, increment the haplotype's counter.
if (s->is_leaf()) {
if (hapcount.find(mset) == hapcount.end()) {
hapcount[mset] = 0;
}
hapcount[mset]++;
} else {
//otherwise, store the current mset
hapmap[s->identifier] = mset;
}
hapmap[mset]++;
}
return hapmap;
return hapcount;
}

static uint8_t one_hot_to_two_bit(uint8_t arg) {
return 31-__builtin_clz((unsigned int)arg);
}

void print_mut_stats(MAT::Tree* T){
std::array<int, 16> mut_frequency;
for ( auto& temp : mut_frequency) {
Expand Down Expand Up @@ -229,7 +250,7 @@ void write_haplotype_table(MAT::Tree* T, std::string filename) {
for (auto const &hapc : hapmap) {
std::ostringstream msetstr;
for (auto m: hapc.first) {
msetstr << m << ",";
msetstr << m.first << MAT::get_nuc(m.second) << ",";
}
std::string final_str = msetstr.str();
final_str.pop_back();
Expand Down
2 changes: 1 addition & 1 deletion src/matUtils/summary.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ void write_clade_table(MAT::Tree* T, std::string filename);
void write_mutation_table(MAT::Tree* T, std::string filename);
void write_translate_table(MAT::Tree* T, std::string output_filename, std::string gtf_filename, std::string fasta_filename);
void write_aberrant_table(MAT::Tree* T, std::string filename);
std::map<std::set<std::string>,size_t> count_haplotypes(MAT::Tree* T);
std::map<std::map<int,int8_t>,size_t> count_haplotypes(MAT::Tree* T);
void write_haplotype_table(MAT::Tree* T, std::string filename);
void write_sample_clades_table (MAT::Tree* T, std::string sample_clades);
void translate_main(MAT::Tree *T, std::string output_filename, std::string gff_filename, std::string fasta_filename );
Expand Down

0 comments on commit d90bc9f

Please sign in to comment.