From 4f8ee13b61f928bb60a7bc26dc673075e68b8e3c Mon Sep 17 00:00:00 2001 From: Scot Federman Date: Wed, 23 Apr 2014 17:08:15 -0700 Subject: [PATCH] parameter modifications --- SURPI.sh | 38 +++++++++++++++++++++++++++++-------- cutadapt_quality.csh | 7 ++++--- preprocess.sh | 9 +++++---- preprocess_ncores.sh | 7 ++++--- taxonomy_lookup.pl | 7 +++---- taxonomy_lookup_embedded.pl | 22 ++++++++++++++------- 6 files changed, 61 insertions(+), 29 deletions(-) diff --git a/SURPI.sh b/SURPI.sh index 07c9812..d1e19d6 100755 --- a/SURPI.sh +++ b/SURPI.sh @@ -13,7 +13,7 @@ # Please see license file for details. # Last revised 1/26/2014 -version="1.0.5" #SURPI version +version="1.0.6" #SURPI version optspec=":a:c:d:f:hi:l:m:n:p:q:r:s:vx:z:" bold=$(tput bold) @@ -218,6 +218,13 @@ SURPI_db_directory_FAST="/reference/FAST_SNAP" #This folder should contain both files created by RAPSearch - indexed database + .info file RAPSearch_db_directory="/reference/RAPSearch" +#Taxonomy Reference data directory +#This folder should contain the 3 SQLite files created by the script "create_taxonomy_db.sh" +#gi_taxid_nucl.db - nucleotide db of gi/taxonid +#gi_taxid_prot.db - protein db of gi/taxonid +#names_nodes_scientific.db - db of taxonid/taxonomy +taxonomy_db_directory="/reference/taxonomy" + #RAPSearch viral database name: indexed protein dataset (all of Viruses) located in RAPSearch_db_directory #make sure that directory also includes the .info file RAPSearchDB_Vir="rapsearch_viral_aa_130628_db_v2.12" @@ -228,6 +235,11 @@ RAPSearchDB_NR="rapsearch_nr_130624_db_v2.12" #e value for BLASTn used in coverage map generation eBLASTn="1e-15" + +#specify a location for storage of temporary files. +#Space needed may be up to 10x the size of the input file. +#This folder will not be created by SURPI, so be sure it already exists with proper permissions. +temporary_files_directory="/tmp/" EOF ) > $configprefix.config #------------------------------------------------------------------------------------------------ @@ -376,8 +388,11 @@ fi nopathf=${FASTQ_file##*/} # remove the path to file basef=${nopathf%.fastq} +#This is the point to verify all input before the SURPI run initiates +# add verification for all databases and access to all dependencies + #verify taxonomy is functioning properly -result=$( taxonomy_lookup_embedded.pl -d nucl 149408158 ) +result=$( taxonomy_lookup_embedded.pl -d nucl -q $taxonomy_db_directory 149408158 ) if [ $result = "149408158" ] then echo "taxonomy is functioning properly." @@ -422,6 +437,7 @@ echo "Command Line Usage: $scriptname $@" echo "SURPI version: $version" echo "config_file: $config_file" echo "Server: $host" +echo "Working directory: $( pwd )" echo "run_mode: $run_mode" echo "inputfile: $inputfile" echo "inputtype: $inputtype" @@ -429,6 +445,9 @@ echo "FASTQ_file: $FASTQ_file" echo "cores used: $cores" echo "Raw Read quality: $quality" echo "Read length_cutoff for preprocessing under which reads are thrown away: $length_cutoff" + +echo "temporary files directory used: $temporary_files_directory" + echo "SURPI_db_directory housing the reference databases for Comprehensive Mode: $SURPI_db_directory_COMP" echo "SURPI_db_directory housing the reference databases for Fast Mode: $SURPI_db_directory_FAST" @@ -440,6 +459,7 @@ echo "RAPSearch indexed viral db used: $RAPSearchDB_Vir" echo "RAPSearch indexed NR db used: $RAPSearchDB_NR" echo "rapsearch_database: $rapsearch_database" +echo "taxonomy database directory: $taxonomy_db_directory" echo "adapter_set: $adapter_set" echo "Raw Read length: $length" @@ -449,6 +469,8 @@ echo "abysskmer length: $abysskmer" echo "cache_reset: $cache_reset" echo "start_nt: $start_nt" echo "crop_length: $crop_length" + +echo "e value for BLASTn used in coverage map generation: $eBLASTn" echo "-----------------------------------------------------------------------------------------" curdate=$(date) @@ -466,8 +488,8 @@ then echo -n "Starting: preprocessing using $cores cores " date START2=$(date +%s) - echo "Parameters: preprocess_ncores.sh $basef.fastq $quality N $length_cutoff $cores Y N $adapter_set $start_nt $crop_length>& $basef.preprocess.log" - preprocess_ncores.sh $basef.fastq $quality N $length_cutoff $cores Y N $adapter_set $start_nt $crop_length>& $basef.preprocess.log + echo "Parameters: preprocess_ncores.sh $basef.fastq $quality N $length_cutoff $cores Y N $adapter_set $start_nt $crop_length>& $basef.preprocess.log $temporary_files_directory" + preprocess_ncores.sh $basef.fastq $quality N $length_cutoff $cores Y N $adapter_set $start_nt $crop_length $temporary_files_directory >& $basef.preprocess.log echo -n "Done: preprocessing " date END2=$(date +%s) @@ -571,7 +593,7 @@ then ###retrieve taxonomy matched to NT ### echo -n "taxonomy retrieval for $basef.NT.snap.matched.fulllength.sam" date - taxonomy_lookup.pl "$basef.NT.snap.matched.fulllength.sam" sam nucl $cores >& "$basef.taxonomy.SNAPNT.log" + taxonomy_lookup.pl "$basef.NT.snap.matched.fulllength.sam" sam nucl $cores $taxonomy_db_directory >& "$basef.taxonomy.SNAPNT.log" sed 's/NM:i:\([0-9]\)/0\1/g' "$basef.NT.snap.matched.fulllength.all.annotated" | sort -k 14,14 > "$basef.NT.snap.matched.fulllength.all.annotated.sorted" rm -f "$basef.NT.snap.matched.fulllength.gi" "$basef.NT.snap.matched.fullength.gi.taxonomy" fi @@ -664,7 +686,7 @@ then sed '/>/d' $basef.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.m8.fasta > $basef.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.m8.fasta.seq paste $basef.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.m8 $basef.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.m8.fasta.seq > $basef.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.addseq.m8 - taxonomy_lookup.pl $basef.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.addseq.m8 blast prot $cores + taxonomy_lookup.pl $basef.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.addseq.m8 blast prot $cores $taxonomy_db_directory mv $basef.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.addseq.all.annotated $basef.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.annotated table_generator.sh $basef.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.annotated RAP Y Y Y Y @@ -702,7 +724,7 @@ then paste $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e$ecutoff_Vir.NR.e${ecutoff_NR}.m8 $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.m8.fasta.seq > $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.addseq.m8 echo "made addseq file $(date)" echo "############# RAPSearch Taxonomy $(date)" - taxonomy_lookup.pl $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.addseq.m8 blast prot $cores + taxonomy_lookup.pl $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.addseq.m8 blast prot $cores $taxonomy_db_directory cp $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.addseq.all.annotated $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.annotated echo "retrieved taxonomy" grep "Viruses" $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.annotated > $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated @@ -768,7 +790,7 @@ then paste $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e${ecutoff_NR}.m8 $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e${ecutoff_NR}.m8.fasta.seq > $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e${ecutoff_NR}.addseq.m8 echo "made addseq file $(date)" echo "############# RAPSearch Taxonomy $(date)" - taxonomy_lookup.pl $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e${ecutoff_NR}.addseq.m8 blast prot $cores + taxonomy_lookup.pl $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e${ecutoff_NR}.addseq.m8 blast prot $cores $taxonomy_db_directory cp $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e${ecutoff_NR}.addseq.all.annotated $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e${ecutoff_NR}.annotated echo "retrieved taxonomy" grep "Viruses" $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e${ecutoff_NR}.annotated > $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e${ecutoff_NR}.Viruses.annotated diff --git a/cutadapt_quality.csh b/cutadapt_quality.csh index 8277c1b..61bb303 100755 --- a/cutadapt_quality.csh +++ b/cutadapt_quality.csh @@ -21,10 +21,10 @@ # Please see license file for details. # Last revised 1/26/2014 -set TEMPDIR = "/tmp/" +# set TEMPDIR = "/tmp/" -if ($#argv != 5) then - echo "Usage: cutadapt_quality.csh " +if ($#argv != 6) then + echo "Usage: cutadapt_quality.csh " exit(1) endif @@ -34,6 +34,7 @@ set quality = $argv[2] set length_cutoff = $argv[3] set keep_short_reads = $argv[4] set adapter_set = $argv[5] +set TEMPDIR = $argv[6] ### set numreads_start = `egrep -c "@HWI|@M00|@SRR" $inputfile` diff --git a/preprocess.sh b/preprocess.sh index 9d322d0..72e2035 100755 --- a/preprocess.sh +++ b/preprocess.sh @@ -17,8 +17,8 @@ # Please see license file for details. # Last revised 1/26/2014 -if [ $# != 8 ]; then - echo "Usage: preprocess.sh " +if [ $# != 9 ]; then + echo "Usage: preprocess.sh " exit fi @@ -31,6 +31,7 @@ keep_short_reads=$5 adapter_set=$6 start_nt=$7 crop_length=$8 +temporary_files_directory=$9 ### if [ ! -f $inputfile ]; then @@ -65,12 +66,12 @@ then # modified to take into account anything in there [N or Y] echo `date` START1=$(date +%s) - cutadapt_quality.csh $basef.modheader.fastq $quality $length_cutoff $keep_short_reads $adapter_set + cutadapt_quality.csh $basef.modheader.fastq $quality $length_cutoff $keep_short_reads $adapter_set $temporary_files_directory mv $basef.modheader.cutadapt.fastq $basef.cutadapt.fastq else echo `date` START1=$(date +%s) - cutadapt_quality.csh $inputfile $quality $length_cutoff $keep_short_reads $adapter_set + cutadapt_quality.csh $inputfile $quality $length_cutoff $keep_short_reads $adapter_set $temporary_files_directory fi END1=$(date +%s) diff --git a/preprocess_ncores.sh b/preprocess_ncores.sh index 8391ecc..392635c 100755 --- a/preprocess_ncores.sh +++ b/preprocess_ncores.sh @@ -14,8 +14,8 @@ # Please see license file for details. # Last revised 1/26/2014 -if [ $# -lt 8 ]; then - echo "Usage: preprocess_ncores.sh <# of cores> " +if [ $# != 11 ]; then + echo "Usage: preprocess_ncores.sh <# of cores> " exit fi @@ -30,6 +30,7 @@ keep_short_reads=$7 adapter_set=$8 start_nt=$9 crop_length=${10} +temporary_files_directory=${11} ### if [ ! -f $inputfile ]; @@ -78,7 +79,7 @@ do mv $f $f.fastq echo "preprocessing $f.fastq..." echo "preprocess.sh $f.fastq $quality N $length_cutoff $keep_short_reads $adapter_set $start_nt $crop_length >& $f.preprocess.log &" - preprocess.sh $f.fastq $quality N $length_cutoff $keep_short_reads $adapter_set $start_nt $crop_length >& $f.preprocess.log & + preprocess.sh $f.fastq $quality N $length_cutoff $keep_short_reads $adapter_set $start_nt $crop_length $temporary_files_directory >& $f.preprocess.log & done for job in `jobs -p` diff --git a/taxonomy_lookup.pl b/taxonomy_lookup.pl index 7dee8b6..0bae5d7 100755 --- a/taxonomy_lookup.pl +++ b/taxonomy_lookup.pl @@ -18,14 +18,13 @@ use Time::HiRes qw[gettimeofday tv_interval]; use DBI; -if ( @ARGV != 4 ) { - print "USAGE: \n"; +if ( @ARGV != 5 ) { + print "USAGE: taxonomy_lookup.pl \n"; exit; } -my ($inputfile, $filetype, $seq_type, $cores) = @ARGV; +my ($inputfile, $filetype, $seq_type, $cores, $database_directory) = @ARGV; -my $database_directory = "/reference/taxonomy"; my $sql_taxdb_loc_nucl = "$database_directory/gi_taxid_nucl.db"; my $sql_taxdb_loc_prot = "$database_directory/gi_taxid_prot.db"; my $names_nodes = "$database_directory/names_nodes_scientific.db"; diff --git a/taxonomy_lookup_embedded.pl b/taxonomy_lookup_embedded.pl index 4e575ea..c2b652e 100755 --- a/taxonomy_lookup_embedded.pl +++ b/taxonomy_lookup_embedded.pl @@ -24,19 +24,16 @@ my $sql_taxdb_loc; my $taxid; -my $database_directory = "/reference/taxonomy"; -my $sql_taxdb_loc_nucl = "$database_directory/gi_taxid_nucl.db"; -my $sql_taxdb_loc_prot = "$database_directory/gi_taxid_prot.db"; +# my $database_directory = "/reference/taxonomy"; my $lineage=""; my $name; -my $names_nodes = "$database_directory/names_nodes_scientific.db"; my $gi; my $gi_count = 0; sub trim($); my %rank_to_print; -our ($opt_d, $opt_k, $opt_p, $opt_c, $opt_o, $opt_f, $opt_g, $opt_s, $opt_l, $opt_h, $opt_x); +our ($opt_q, $opt_d, $opt_k, $opt_p, $opt_c, $opt_o, $opt_f, $opt_g, $opt_s, $opt_l, $opt_h, $opt_x); -getopts('d:kpcofgslhtxn:m:'); +getopts('q:d:kpcofgslhx'); if ($opt_h) { print <