From 7d075576aa0bd32a6ae319c301fed695dfbc7daf Mon Sep 17 00:00:00 2001 From: Lee Katz Date: Fri, 17 May 2024 15:21:54 -0400 Subject: [PATCH 1/9] Add genomes (#45) (#46) * Corynebacterium diphtheriae * added Bifidobacterium adolescentis * replaced S. enterica IIIa; Added hops (Humulus lupulus) * added a Citrobacter species * m * replaced repressed genome accession for B. faecium --- src/chromosomes.tsv | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/chromosomes.tsv b/src/chromosomes.tsv index 97b57c6..7743c6e 100644 --- a/src/chromosomes.tsv +++ b/src/chromosomes.tsv @@ -30,12 +30,13 @@ Bacteroides fragilis NC_006347 817 816 Bacteroides thetaiotaomicron NC_004663 818 816 Bartonella bacilliformis CP000524 774 773 Betacoronavirus coronavirus MT233526 2697049 694009 +Bifidobacterium adolescentis CP028341 1680 1678 Bifidobacterium bifidum NC_014638 1681 1678 Bifidobacterium longum NC_004307 216816 1678 Bordetella bronchiseptica NC_019382 518 517 Borreliella burgdorferi NC_001318 139 64895 Bos taurus KC153975 9913 9903 -Brachybacterium faecium NC_013172 43669 43668 +Brachybacterium faecium CP001643 43669 43668 Bradyrhizobium diazoefficiens NC_004463 1355477 374 Brassica oleracea NC_016118 3712 3705 Buchnera aphidicola NC_002528 9 32199 @@ -67,7 +68,8 @@ Chlamydia trachomatis NC_000117 813 810 Chlamydomonas reinhardtii AF008237 3055 3052 Chlorobaculum tepidum NC_002932 1097 256319 Chloroflexus aurantiacus NC_010175 1108 1107 -Citrobacter freundii CP007557 1333848 546 +Citrobacter freundii CP007557 546 544 +Citrobacter amalonaticus CP014070 35703 544 Clavibacter michiganensis_sepedonicus AM849034 31964 28447 Clostridioides difficile NC_009089 1496 1870884 Clostridium acetobutylicum NC_003030 1488 1485 @@ -77,6 +79,7 @@ Clostridium botulinum groupI NC_010723 9000005 1491 Clostridium botulinum groupII NC_010516 9000004 1491 Clostridium butyricum CP013239 1492 1485 Clostridium perfringens NC_008262 1502 1485 +Corynebacterium diphtheriae 
CP091095 1717 1716 Corynebacterium glutamicum NC_003450 1718 1716 Corynebacterium urealyticum AM942444 43771 1716 Coxiella burnetii NC_002971 777 776 @@ -137,6 +140,7 @@ Helianthus annuus MG770607 4232 4231 Helicobacter pylori AE000511 210 209 Heliobacterium modesticaldum CP000930 35701 2697 Homo sapiens NC_012920 9606 9605 +Humulus lupulus NC_086845 3486 3484 Ketogulonicigenium vulgare NC_017384 92945 92944 Klebsiella aerogenes NC_015663 548 570 Klebsiella pneumoniae NC_016845 573 570 @@ -202,7 +206,7 @@ Salinibacter ruber NC_007677 146919 146918 Salmonella bongori FR877557 54736 590 Salmonella enterica IIa CP053411 9000010 28901 Salmonella enterica IIb LR134141 9000011 28901 -Salmonella enterica IIIa UGXG01000002 9000014 28901 +Salmonella enterica IIIa CP000880 9000014 28901 Salmonella enterica IIIb CP053583 9000015 28901 Salmonella enterica I AE006468 59201 28901 Salmonella enterica IV CP053579 59205 28901 From 1ea49ffe7cc6ce96e840b6b2c3acb24cd64b99f3 Mon Sep 17 00:00:00 2001 From: Lee Katz - Aspen Date: Sat, 25 May 2024 21:28:17 -0400 Subject: [PATCH 2/9] remove random single quotes --- src/plasmids.tsv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/plasmids.tsv b/src/plasmids.tsv index e9656a5..c2dafa6 100644 --- a/src/plasmids.tsv +++ b/src/plasmids.tsv @@ -2964,7 +2964,7 @@ Rickettsia CP015014 780 33988 Rickettsia CP010970 780 33988 Onion yellows phytoplasma AB480166 100379 85620 Onion yellows phytoplasma AB479509 100379 85620 -'Brassica napus' phytoplasma HQ637382 469009 85620 +Brassica napus phytoplasma HQ637382 469009 85620 Candidatus Phytoplasma FJ905104 33926 2146 Candidatus Phytoplasma KF801472 33926 2146 Onion yellows phytoplasma AB479513 100379 2146 @@ -2986,7 +2986,7 @@ Paulownia witches'-broom phytoplasma EF426472 39647 85620 Paulownia witches'-broom phytoplasma EF426473 39647 85620 Periwinkle little leaf phytoplasma JN835187 137854 85635 Rice orange leaf phytoplasma KY086101 146897 85635 -'Catharanthus roseus' aster 
yellows phytoplasma CP035950 1193712 85620 +Catharanthus roseus aster yellows phytoplasma CP035950 1193712 85620 Bacillus thuringiensis CP016196 1428 85620 Bacillus sp. BS98 CP043831 2608254 185979 Bacillus CP009595 1386 185979 From 07782988817e5d02565f14f6f00ea3cfacf4590d Mon Sep 17 00:00:00 2001 From: Lee Katz - Aspen Date: Sun, 26 May 2024 21:20:21 -0400 Subject: [PATCH 3/9] bump version --- bin/downloadKalamari.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/downloadKalamari.pl b/bin/downloadKalamari.pl index 314a3de..7c2d915 100755 --- a/bin/downloadKalamari.pl +++ b/bin/downloadKalamari.pl @@ -11,7 +11,7 @@ use IO::Compress::Gzip; use version 0.77; -our $VERSION = version->parse("5.6.0"); +our $VERSION = version->parse("5.6.2"); use threads; From b7a1d819aeebe7e57b43cabca6ded27da97f6742 Mon Sep 17 00:00:00 2001 From: Lee Katz - Aspen Date: Fri, 31 May 2024 11:09:42 -0400 Subject: [PATCH 4/9] helpful log messages --- bin/buildKraken1.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bin/buildKraken1.sh b/bin/buildKraken1.sh index d51f638..0e058c9 100755 --- a/bin/buildKraken1.sh +++ b/bin/buildKraken1.sh @@ -27,6 +27,9 @@ find $SRC -name '*.fasta.gz' | \ for i in "$@"; do gzip -cd $i done > $tmpfile + echo -ne "ADDING to library:\n " + zgrep "^>" $tmpfile | sed "s/^>//" | tr '\n' ' ' + echo "^^ contents of $tmpfile ^^" kraken-build --db $DB --add-to-library $tmpfile ' From 18033f778490e11fff99d1533687ee9f560d4167 Mon Sep 17 00:00:00 2001 From: Lee Katz - Aspen Date: Fri, 31 May 2024 11:09:56 -0400 Subject: [PATCH 5/9] v5.6.3 --- bin/downloadKalamari.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/downloadKalamari.pl b/bin/downloadKalamari.pl index 7c2d915..3a30c2b 100755 --- a/bin/downloadKalamari.pl +++ b/bin/downloadKalamari.pl @@ -11,7 +11,7 @@ use IO::Compress::Gzip; use version 0.77; -our $VERSION = version->parse("5.6.2"); +our $VERSION = version->parse("5.6.3"); use threads; From 
884d741ebe9ad06de7908daeb355e44e7a6b0521 Mon Sep 17 00:00:00 2001 From: Lee Katz - Aspen Date: Wed, 5 Jun 2024 13:56:28 -0400 Subject: [PATCH 6/9] make symlink to avoid naming mistakes --- bin/buildKraken1.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bin/buildKraken1.sh b/bin/buildKraken1.sh index 0e058c9..4688153 100755 --- a/bin/buildKraken1.sh +++ b/bin/buildKraken1.sh @@ -38,3 +38,7 @@ kraken-build --db $DB --build --threads 1 # Reduce the size of the database kraken-build --db $DB --clean + +if [ ! -e "$sharedir/kalamari-kraken1" ]; then + ln -sv kalamari-kraken "$sharedir/kalamari-kraken1" +fi From a444077032633c95dc1a72d3923ef7a7d839bc90 Mon Sep 17 00:00:00 2001 From: Lee Katz - Aspen Date: Wed, 5 Jun 2024 13:56:53 -0400 Subject: [PATCH 7/9] check whether taxonkit is loaded --- bin/filterTaxonomy.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bin/filterTaxonomy.sh b/bin/filterTaxonomy.sh index 48fbb3f..1af4912 100755 --- a/bin/filterTaxonomy.sh +++ b/bin/filterTaxonomy.sh @@ -2,6 +2,9 @@ set -eu +# Check for dependencies +which taxonkit + thisdir=$(dirname $0) thisfile=$(basename $0) KALAMARI_VER=$(downloadKalamari.pl --version) From 4495e0c29b7f8007771953ad878b4df9e6cd064c Mon Sep 17 00:00:00 2001 From: Lee Katz - Aspen Date: Wed, 5 Jun 2024 13:57:05 -0400 Subject: [PATCH 8/9] use efetch -input --- bin/downloadKalamari.pl | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/bin/downloadKalamari.pl b/bin/downloadKalamari.pl index 3a30c2b..34526a5 100755 --- a/bin/downloadKalamari.pl +++ b/bin/downloadKalamari.pl @@ -11,7 +11,7 @@ use IO::Compress::Gzip; use version 0.77; -our $VERSION = version->parse("5.6.3"); +our $VERSION = version->parse("5.7.0"); use threads; @@ -167,27 +167,25 @@ sub downloadEntries{ my $numEntries = scalar(@$entries); my @acc = map{$$_{nuccoreAcc}} @$entries; logmsg "Downloading ".scalar(@acc)." 
accessions"; - my $queryArg = join("[accession] OR ", sort(@acc))."[accession]"; my $dir = tempdir("download.XXXXXX", DIR=>$$settings{tempdir}); + # Make the input file for efetch + my $inputAcc = "$dir/input.acc"; + open(my $fh, ">", $inputAcc) or die "ERROR: could not write to $inputAcc: $!"; + print $fh join("\n", @acc)."\n"; + close $fh; + # Accessions that had errors my @err; - # Get the esearch xml in place for at least one downstream query - my $esearchXml = "$dir/esearch.xml"; - my $esearchCmd = "esearch -db nuccore -query '$queryArg' > $esearchXml"; - command($esearchCmd); + # Get started on the comprehensive assembly file + my $outfile = "$dir/all.fasta"; + logmsg "Downloading all accessions to $outfile using input accessions in $inputAcc"; + command("efetch -db nuccore -input $inputAcc -format fasta > $dir/all.fasta"); if($?){ - die "ERROR running: $esearchCmd: $!"; + die "ERROR: could not download all accessions"; } - # Get started on the assembly file - my $outfile = "$dir/all.fasta"; - - # Main query: efetch - my $efetchCmd = "cat $esearchXml | efetch -format fasta > $outfile"; - system($efetchCmd); - my $seqsWithVersion = readSeqs($outfile); my $seqs = {}; while(my($acc, $seq) = each(%$seqsWithVersion)){ From 81b3bd2dccd48b0aabd296f550478d0a2d01cb9f Mon Sep 17 00:00:00 2001 From: Lee Katz - Aspen Date: Wed, 5 Jun 2024 14:05:47 -0400 Subject: [PATCH 9/9] fix tr bug --- bin/buildKraken1.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/buildKraken1.sh b/bin/buildKraken1.sh index 4688153..851ff75 100755 --- a/bin/buildKraken1.sh +++ b/bin/buildKraken1.sh @@ -22,13 +22,14 @@ cp -rv $TAXDIR $DB/taxonomy # Make --add-to-library more efficient with # concatenated fasta files +export nl=$'\n' find $SRC -name '*.fasta.gz' | \ xargs -n 100 -P 1 bash -c ' for i in "$@"; do gzip -cd $i done > $tmpfile echo -ne "ADDING to library:\n " - zgrep "^>" $tmpfile | sed "s/^>//" | tr '\n' ' ' + zgrep "^>" $tmpfile | sed "s/^>//" | tr "$nl" " " 
echo "^^ contents of $tmpfile ^^" kraken-build --db $DB --add-to-library $tmpfile '