diff --git a/scripts/build_kraken2_db.sh b/scripts/build_kraken2_db.sh index a5dfa1e..5fb57ad 100755 --- a/scripts/build_kraken2_db.sh +++ b/scripts/build_kraken2_db.sh @@ -64,14 +64,14 @@ then KRAKEN2XFLAG="-X" fi -echo "Creating sequence ID to taxonomy ID map (step 1)..." -if [ -d "library/added" ]; then - find library/added/ -name 'prelim_map_*.txt' | xargs cat > library/added/prelim_map.txt -fi seqid2taxid_map_file=seqid2taxid.map if [ -e "$seqid2taxid_map_file" ]; then - echo "Sequence ID to taxonomy ID map already present, skipping map creation." + echo "Sequence ID to taxonomy ID map already present, skipping map creation (step 1)." else + echo "Creating sequence ID to taxonomy ID map (step 1)..." + if [ -d "library/added" ]; then + find library/added/ -name 'prelim_map_*.txt' | xargs cat > library/added/prelim_map.txt + fi step_time=$(get_current_time) find library/ -maxdepth 2 -name prelim_map.txt | xargs cat > taxonomy/prelim_map.txt if [ ! -s "taxonomy/prelim_map.txt" ]; then @@ -95,28 +95,35 @@ else echo "Sequence ID to taxonomy ID map complete. [$(report_time_elapsed $step_time)]" fi -echo "Estimating required capacity (step 2)..." - -step_time=$(get_current_time) -estimate=$(list_sequence_files | xargs -0 cat | estimate_capacity -k $KRAKEN2_KMER_LEN -l $KRAKEN2_MINIMIZER_LEN -S $KRAKEN2_SEED_TEMPLATE -p $KRAKEN2_THREAD_CT $KRAKEN2XFLAG ) -# Slight upward adjustment of distinct minimizer estimate to protect -# against crash w/ small reference sets -estimate=$(( estimate + 8192 )) -required_capacity=$(perl -le 'print int(shift() / shift())' $estimate $KRAKEN2_LOAD_FACTOR); - -echo "Estimated hash table requirement: $(( required_capacity * 4 )) bytes" - +required_capacity_file="required_capacity.txt" max_db_flag="" -if [ -n "$KRAKEN2_MAX_DB_SIZE" ] -then - if (( KRAKEN2_MAX_DB_SIZE < (required_capacity * 4) )) +if [ -e "$required_capacity_file" ]; then + echo "Required capacity has already been estimated, skipping estimation (step 2)." + required_capacity=$( < $required_capacity_file ) +else + echo "Estimating required capacity (step 2)..." + + step_time=$(get_current_time) + estimate=$(list_sequence_files | xargs -0 cat | estimate_capacity -k $KRAKEN2_KMER_LEN -l $KRAKEN2_MINIMIZER_LEN -S $KRAKEN2_SEED_TEMPLATE -p $KRAKEN2_THREAD_CT $KRAKEN2XFLAG ) + # Slight upward adjustment of distinct minimizer estimate to protect + # against crash w/ small reference sets + estimate=$(( estimate + 8192 )) + required_capacity=$(perl -le 'print int(shift() / shift())' $estimate $KRAKEN2_LOAD_FACTOR); + echo $required_capacity > $required_capacity_file + + if [ -n "$KRAKEN2_MAX_DB_SIZE" ] then - max_db_flag="-M $(perl -le 'print int(shift() / 4)' $KRAKEN2_MAX_DB_SIZE)" - echo "Specifying lower maximum hash table size of $KRAKEN2_MAX_DB_SIZE bytes" + if (( KRAKEN2_MAX_DB_SIZE < (required_capacity * 4) )) + then + max_db_flag="-M $(perl -le 'print int(shift() / 4)' $KRAKEN2_MAX_DB_SIZE)" + echo "Specifying lower maximum hash table size of $KRAKEN2_MAX_DB_SIZE bytes" + fi fi + + echo "Capacity estimation complete. [$(report_time_elapsed $step_time)]" fi +echo "Estimated hash table requirement: $(( required_capacity * 4 )) bytes" -echo "Capacity estimation complete. [$(report_time_elapsed $step_time)]" echo "Building database files (step 3)..."