diff --git a/.github/workflows/unit-testing.Listeria.Kraken1.yml b/.github/workflows/unit-testing.Listeria.Kraken1.yml index e06f425..9d5c442 100644 --- a/.github/workflows/unit-testing.Listeria.Kraken1.yml +++ b/.github/workflows/unit-testing.Listeria.Kraken1.yml @@ -1,7 +1,7 @@ # This is a subsampling unit test to get early results on: push: - branches: [master, dev, validate-taxonomy] + branches: [master, dev] name: Listeria-with-Kraken1 env: @@ -41,18 +41,10 @@ jobs: tree $(realpath .) - name: install-edirect run: | - sudo apt-get install ncbi-entrez-direct - echo "installed edirect the apt way" - exit - cd $HOME - perl -MNet::FTP -e '$ftp = new Net::FTP("ftp.ncbi.nlm.nih.gov", Passive => 1); $ftp->login; $ftp->binary; $ftp->get("/entrez/entrezdirect/edirect.tar.gz");' - gunzip -cv edirect.tar.gz | tar xf - - rm -v edirect.tar.gz - echo $GITHUB_WORKSPACE/edirect >> $GITHUB_PATH + sh -c "$(curl -fsSL https://ftp.ncbi.nlm.nih.gov/entrez/entrezdirect/install-edirect.sh)" + echo $HOME/edirect >> $GITHUB_PATH echo $GITHUB_WORKSPACE/Kalamari/bin >> $GITHUB_PATH - #export PATH=${PATH}:$HOME/edirect >& /dev/null || setenv PATH "${PATH}:$HOME/edirect" - yes Y | ./edirect/setup.sh - tree edirect + tree $HOME/edirect - name: check-env run: echo "$PATH" - name: select for only Listeria diff --git a/.github/workflows/unit-testing.Yersinia.Kraken2.yml b/.github/workflows/unit-testing.Yersinia.Kraken2.yml index 0f7efde..4b34a7a 100644 --- a/.github/workflows/unit-testing.Yersinia.Kraken2.yml +++ b/.github/workflows/unit-testing.Yersinia.Kraken2.yml @@ -1,7 +1,7 @@ # This is a subsampling unit test to get early results on: push: - branches: [master, dev, validate-taxonomy] + branches: [master, dev] name: Genera-with-Kraken2 env: @@ -34,6 +34,14 @@ jobs: - name: env check run: | echo $PATH | tr ':' '\n' | sort + + - name: install-edirect + run: | + sh -c "$(curl -fsSL https://ftp.ncbi.nlm.nih.gov/entrez/entrezdirect/install-edirect.sh)" + echo $HOME/edirect >> $GITHUB_PATH + echo $GITHUB_WORKSPACE/Kalamari/bin >> $GITHUB_PATH + tree $HOME/edirect + - name: apt-get install run: sudo apt-get install ca-certificates tree jellyfish ncbi-entrez-direct - name: select for only for this genus diff --git a/.github/workflows/unit-testing.yml b/.github/workflows/unit-testing.yml index 2cb5dd3..b29d7bb 100644 --- a/.github/workflows/unit-testing.yml +++ b/.github/workflows/unit-testing.yml @@ -1,6 +1,6 @@ on: push: - branches: [master, dev, validate-taxonomy] + branches: [master, dev] name: Pull-down-all-accessions jobs: @@ -25,18 +25,13 @@ jobs: - name: apt-get install run: sudo apt-get install ca-certificates tree + - name: install-edirect run: | - sudo apt-get install ncbi-entrez-direct - echo "installed edirect the apt way" - exit - cd $HOME - perl -MNet::FTP -e '$ftp = new Net::FTP("ftp.ncbi.nlm.nih.gov", Passive => 1); $ftp->login; $ftp->binary; $ftp->get("/entrez/entrezdirect/edirect.tar.gz");' - gunzip -cv edirect.tar.gz | tar xf - - rm -v edirect.tar.gz - export PATH=${PATH}:$HOME/edirect >& /dev/null || setenv PATH "${PATH}:$HOME/edirect" - yes Y | ./edirect/setup.sh - tree edirect + sh -c "$(curl -fsSL https://ftp.ncbi.nlm.nih.gov/entrez/entrezdirect/install-edirect.sh)" + echo $HOME/edirect >> $GITHUB_PATH + echo $GITHUB_WORKSPACE/Kalamari/bin >> $GITHUB_PATH + tree $HOME/edirect - name: check-env run: echo "$PATH" - name: download diff --git a/.github/workflows/validateTaxonomy.yml b/.github/workflows/validateTaxonomy.yml index 5147f40..1ab159c 100644 --- a/.github/workflows/validateTaxonomy.yml +++ b/.github/workflows/validateTaxonomy.yml @@ -1,6 +1,6 @@ on: push: - branches: [master, dev, validate-taxonomy] + branches: [master, dev, esearch-input] name: Validate taxonomy jobs: @@ -27,11 +27,18 @@ jobs: echo $PATH echo "" cat $GITHUB_PATH + - name: install taxonkit + run: | + wget https://github.com/shenwei356/taxonkit/releases/download/v0.16.0/taxonkit_linux_amd64.tar.gz + tar -xvf taxonkit_linux_amd64.tar.gz + rm -v taxonkit_linux_amd64.tar.gz + chmod +x taxonkit + echo $(realpath .) >> $GITHUB_PATH - name: build taxonomy run: | echo $PATH - bash Kalamari/bin/buildTaxonomy.sh - bash Kalamari/bin/filterTaxonomy.sh + bash -x Kalamari/bin/buildTaxonomy.sh + bash -x Kalamari/bin/filterTaxonomy.sh ls -lhR Kalamari/share/kalamari-*/taxonomy - name: validate taxonomy run: | diff --git a/bin/buildKraken1.sh b/bin/buildKraken1.sh index 0e058c9..f1816b0 100755 --- a/bin/buildKraken1.sh +++ b/bin/buildKraken1.sh @@ -22,13 +22,15 @@ cp -rv $TAXDIR $DB/taxonomy # Make --add-to-library more efficient with # concatenated fasta files +export nl=$'\n' find $SRC -name '*.fasta.gz' | \ xargs -n 100 -P 1 bash -c ' for i in "$@"; do gzip -cd $i done > $tmpfile echo -ne "ADDING to library:\n " - zgrep "^>" $tmpfile | sed "s/^>//" | tr '\n' ' ' + zgrep "^>" $tmpfile | sed "s/^>//" | tr "$nl" " " + echo echo "^^ contents of $tmpfile ^^" kraken-build --db $DB --add-to-library $tmpfile ' @@ -38,3 +40,7 @@ kraken-build --db $DB --build --threads 1 # Reduce the size of the database kraken-build --db $DB --clean + +if [ ! -e "$sharedir/kalamari-kraken1" ]; then + ln -sv kalamari-kraken "$sharedir/kalamari-kraken1" +fi diff --git a/bin/downloadKalamari.pl b/bin/downloadKalamari.pl index 3a30c2b..34526a5 100755 --- a/bin/downloadKalamari.pl +++ b/bin/downloadKalamari.pl @@ -11,7 +11,7 @@ use IO::Compress::Gzip; use version 0.77; -our $VERSION = version->parse("5.6.3"); +our $VERSION = version->parse("5.7.0"); use threads; @@ -167,27 +167,25 @@ sub downloadEntries{ my $numEntries = scalar(@$entries); my @acc = map{$$_{nuccoreAcc}} @$entries; logmsg "Downloading ".scalar(@acc)." accessions"; - my $queryArg = join("[accession] OR ", sort(@acc))."[accession]"; my $dir = tempdir("download.XXXXXX", DIR=>$$settings{tempdir}); + # Make the input file for efetch + my $inputAcc = "$dir/input.acc"; + open(my $fh, ">", $inputAcc) or die "ERROR: could not write to $inputAcc: $!"; + print $fh join("\n", @acc)."\n"; + close $fh; + # Accessions that had errors my @err; - # Get the esearch xml in place for at least one downstream query - my $esearchXml = "$dir/esearch.xml"; - my $esearchCmd = "esearch -db nuccore -query '$queryArg' > $esearchXml"; - command($esearchCmd); + # Get started on the comprehensive assembly file + my $outfile = "$dir/all.fasta"; + logmsg "Downloading all accessions to $outfile using input accessions in $inputAcc"; + command("efetch -db nuccore -input $inputAcc -format fasta > $dir/all.fasta"); if($?){ - die "ERROR running: $esearchCmd: $!"; + die "ERROR: could not download all accessions"; } - # Get started on the assembly file - my $outfile = "$dir/all.fasta"; - - # Main query: efetch - my $efetchCmd = "cat $esearchXml | efetch -format fasta > $outfile"; - system($efetchCmd); - my $seqsWithVersion = readSeqs($outfile); my $seqs = {}; while(my($acc, $seq) = each(%$seqsWithVersion)){ diff --git a/bin/filterTaxonomy.sh b/bin/filterTaxonomy.sh index 48fbb3f..38d1bb8 100755 --- a/bin/filterTaxonomy.sh +++ b/bin/filterTaxonomy.sh @@ -2,6 +2,11 @@ set -eu +# Check for dependencies +echo "Check for dependencies" +which taxonkit +echo + thisdir=$(dirname $0) thisfile=$(basename $0) KALAMARI_VER=$(downloadKalamari.pl --version)