Skip to content

Commit

Permalink
Add --priority-taxids option for database building.
Browse files Browse the repository at this point in the history
All kmers associated with any priority taxids will override other taxids, meaning that the hash
value for the kmer will not be LCA'd but set at priority taxid. This is useful for artificial
vector/plasmid sequences overlapping bacteria so that their kmers don't get uninformatively LCA'd
to a root node.

The default value includes 32630 and 81077. Specifying any taxids will clear these two taxids.
Multiple taxids can be separated by colon (:), comma (,) or repeated args.
  • Loading branch information
yesimon committed Jul 1, 2020
1 parent 9c702c8 commit b40f075
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 4 deletions.
2 changes: 1 addition & 1 deletion scripts/build_kraken2_db.sh
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ else
list_sequence_files | xargs -0 cat | \
build_db -k $KRAKEN2_KMER_LEN -l $KRAKEN2_MINIMIZER_LEN -S $KRAKEN2_SEED_TEMPLATE $KRAKEN2XFLAG \
-H hash.k2d.tmp -t taxo.k2d.tmp -o opts.k2d.tmp -n taxonomy/ -m $seqid2taxid_map_file \
-c $required_capacity -p $KRAKEN2_THREAD_CT $max_db_flag
-c $required_capacity $KRAKEN2_PRIORITY_TAXIDS -p $KRAKEN2_THREAD_CT $max_db_flag
finalize_file taxo.k2d
finalize_file opts.k2d
finalize_file hash.k2d
Expand Down
11 changes: 11 additions & 0 deletions scripts/kraken2-build
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ my (
$minimizer_len,
$kmer_len,
$minimizer_spaces,
@priority_taxids,
$is_protein,
$masking,
$max_db_size,
Expand All @@ -67,6 +68,7 @@ $load_factor = $ENV{"KRAKEN2_LOAD_FACTOR"} || $DEF_LOAD_FACTOR;
$kmer_len = $ENV{"KRAKEN2_KMER_LEN"};
$minimizer_len = $ENV{"KRAKEN2_MINIMIZER_LEN"};
$minimizer_spaces = $ENV{"KRAKEN2_MINIMIZER_SPACES"};
@priority_taxids = $ENV{"KRAKEN2_PRIORITY_TAXIDS"} || ();
$is_protein = $ENV{"KRAKEN2_PROTEIN_DB"} || 0;
$use_ftp = $ENV{"KRAKEN2_USE_FTP"} || 0;
$skip_maps = $ENV{"KRAKEN2_SKIP_MAPS"} || 0;
Expand All @@ -92,6 +94,7 @@ GetOptions(
"minimizer-len=i" => \$minimizer_len,
"kmer-len=i" => \$kmer_len,
"minimizer-spaces=i", \$minimizer_spaces,
"priority-taxids=s" => \@priority_taxids,
"protein" => \$is_protein,
"masking!" => \$masking,
"max-db-size=i" => \$max_db_size,
Expand Down Expand Up @@ -161,6 +164,10 @@ $ENV{"KRAKEN2_MINIMIZER_LEN"} = $minimizer_len;
$ENV{"KRAKEN2_KMER_LEN"} = $kmer_len;
$ENV{"KRAKEN2_MINIMIZER_SPACES"} = $minimizer_spaces;
$ENV{"KRAKEN2_SEED_TEMPLATE"} = construct_seed_template();
$ENV{"KRAKEN2_PRIORITY_TAXIDS"} = "";
if (@priority_taxids) {
$ENV{"KRAKEN2_PRIORITY_TAXIDS"} = "-P " . join(':', split(/,|:/, join(',', @priority_taxids)));
}
$ENV{"KRAKEN2_PROTEIN_DB"} = $is_protein ? 1 : "";
$ENV{"KRAKEN2_MASK_LC"} = $masking ? 1 : "";
$ENV{"KRAKEN2_MAX_DB_SIZE"} = defined($max_db_size) ? $max_db_size : "";
Expand Down Expand Up @@ -230,6 +237,10 @@ Options:
ignored in comparisons (build task only;
def: $DEF_NT_MINIMIZER_SPACES nt, $DEF_AA_MINIMIZER_SPACES aa)
--protein Build a protein database for translated search
--priority-taxids NUM Taxids whose k-mers override other taxids. By default includes
32630: synthetic construct and 81077: artifical sequences. Specifying
any taxids will clear these two default taxids. Multiple taxids can be
separated by colon (:), comma (,) or repeated args.
--no-masking Used with --standard/--download-library/
--add-to-library to avoid masking low-complexity
sequences prior to building; masking requires
Expand Down
54 changes: 51 additions & 3 deletions src/build_db.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@ using namespace kraken2;

#define DEFAULT_BLOCK_SIZE (10 * 1024 * 1024) // 10 MB

// k-mers appearing in contaminant sequences will keep the contaminant
// sequence taxid, even if they also appear in a genome
const uint64_t TID_CONTAMINANT1 = 32630; // 'synthetic construct'
const uint64_t TID_CONTAMINANT2 = 81077; // 'artificial sequences'

struct Options {
string ID_to_taxon_map_filename;
string ncbi_taxonomy_directory;
Expand All @@ -36,6 +41,7 @@ struct Options {
size_t block_size;
int num_threads;
bool input_is_protein;
vector<uint64_t> priority_taxids;
ssize_t k, l;
size_t capacity;
size_t maximum_capacity;
Expand Down Expand Up @@ -65,6 +71,8 @@ int main(int argc, char **argv) {
opts.spaced_seed_mask = DEFAULT_SPACED_SEED_MASK;
opts.toggle_mask = DEFAULT_TOGGLE_MASK;
opts.input_is_protein = false;
opts.priority_taxids.push_back(TID_CONTAMINANT1);
opts.priority_taxids.push_back(TID_CONTAMINANT2);
opts.num_threads = 1;
opts.block_size = DEFAULT_BLOCK_SIZE;
opts.min_clear_hash_value = 0;
Expand All @@ -82,6 +90,9 @@ int main(int argc, char **argv) {

Taxonomy taxonomy(opts.taxonomy_filename.c_str());
taxonomy.GenerateExternalToInternalIDMap();
for (auto taxid : opts.priority_taxids) {
taxonomy.priority_taxids.insert(taxonomy.GetInternalID(taxid));
}
size_t bits_needed_for_value = 1;
while ((1 << bits_needed_for_value) < (ssize_t) taxonomy.node_count())
bits_needed_for_value++;
Expand Down Expand Up @@ -176,6 +187,23 @@ void ProcessSequences(Options &opts, map<string, uint64_t> &ID_to_taxon_map,
std::cerr << "Completed processing of " << processed_seq_ct << " sequences, " << processed_ch_ct << " " << (opts.input_is_protein ? "aa" : "bp") << std::endl;
}

// Split string by any of delimiters, convert to long, and insert to iterator.
template <typename OutIt>
OutIt SplitInsert(char* input, string delimiters, OutIt out) {
std::string line(input);
std::size_t prev = 0, pos;
while ((pos = line.find_first_of(delimiters, prev)) != std::string::npos) {
if (pos > prev) {
*out++ = stol(line.substr(prev, pos-prev));
}
prev = pos+1;
}
if (prev < line.length()) {
*out++ = stol(line.substr(prev, std::string::npos));
}
return out;
}

// This function exists to deal with NCBI's use of \x01 characters to denote
// the start of a new FASTA header in the same line (for non-redundant DBs).
// We return all sequence IDs in a header line, not just the first.
Expand Down Expand Up @@ -213,8 +241,9 @@ vector<string> ExtractNCBISequenceIDs(const string &header) {
void ParseCommandLine(int argc, char **argv, Options &opts) {
int opt;
long long sig;
bool priority_taxids_cleared = false;

while ((opt = getopt(argc, argv, "?hB:c:H:m:n:o:t:k:l:s:S:T:p:M:X")) != -1) {
while ((opt = getopt(argc, argv, "?hB:c:H:m:n:o:t:k:l:s:S:T:p:P:M:X")) != -1) {
switch (opt) {
case 'h' : case '?' :
usage(0);
Expand Down Expand Up @@ -279,6 +308,13 @@ void ParseCommandLine(int argc, char **argv, Options &opts) {
errx(EX_USAGE, "max capacity must be positive integer");
opts.maximum_capacity = sig;
break;
case 'P' :
if (!priority_taxids_cleared) {
opts.priority_taxids.clear();
priority_taxids_cleared = true;
}
SplitInsert(optarg, ",:", back_inserter(opts.priority_taxids));
break;
case 'X' :
opts.input_is_protein = true;
break;
Expand Down Expand Up @@ -326,6 +362,7 @@ void usage(int exit_code) {
<< " -M INT Set maximum capacity of hash table (MiniKraken)\n"
<< " -S BITSTRING Spaced seed mask\n"
<< " -T BITSTRING Minimizer toggle mask\n"
<< " -A INT(:INT) Priority taxids that don't get LCA'd\n"
<< " -X Input seqs. are proteins\n"
<< " -p INT Number of threads\n"
<< " -B INT Read block size" << endl;
Expand All @@ -336,6 +373,7 @@ void ProcessSequence(string &seq, uint64_t taxid,
CompactHashTable &hash, Taxonomy &tax, MinimizerScanner &scanner,
uint64_t min_clear_hash_value)
{
bool seq_is_priority = tax.priority_taxids.count(taxid);
scanner.LoadSequence(seq);
uint64_t *minimizer_ptr;
bool ambig_flag;
Expand All @@ -346,8 +384,18 @@ void ProcessSequence(string &seq, uint64_t taxid,
continue;
hvalue_t existing_taxid = 0;
hvalue_t new_taxid = taxid;
while (! hash.CompareAndSet(*minimizer_ptr, new_taxid, &existing_taxid)) {
new_taxid = tax.LowestCommonAncestor(new_taxid, existing_taxid);
if (seq_is_priority) {
hash.CompareAndSet(*minimizer_ptr, new_taxid, &existing_taxid) ||
hash.CompareAndSet(*minimizer_ptr, new_taxid, &existing_taxid);
}
else {
while (! hash.CompareAndSet(*minimizer_ptr, new_taxid, &existing_taxid)) {
if (! tax.priority_taxids.count(existing_taxid)) {
new_taxid = tax.LowestCommonAncestor(new_taxid, existing_taxid);
} else {
new_taxid = existing_taxid;
}
}
}
}
}
Expand Down
1 change: 1 addition & 0 deletions src/taxonomy.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ class Taxonomy {
return 0;
return external_to_internal_id_map_[external_id];
}
std::set<uint64_t> priority_taxids;

private:
void Init(const char *filename, bool memory_mapping);
Expand Down

0 comments on commit b40f075

Please sign in to comment.