Skip to content

Commit

Permalink
parameter modifications
Browse files Browse the repository at this point in the history
  • Loading branch information
Scot Federman committed Apr 24, 2014
1 parent cd8160c commit 4f8ee13
Show file tree
Hide file tree
Showing 6 changed files with 61 additions and 29 deletions.
38 changes: 30 additions & 8 deletions SURPI.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# Please see license file for details.
# Last revised 1/26/2014

version="1.0.5" #SURPI version
version="1.0.6" #SURPI version

optspec=":a:c:d:f:hi:l:m:n:p:q:r:s:vx:z:"
bold=$(tput bold)
Expand Down Expand Up @@ -218,6 +218,13 @@ SURPI_db_directory_FAST="/reference/FAST_SNAP"
#This folder should contain both files created by RAPSearch - indexed database + .info file
RAPSearch_db_directory="/reference/RAPSearch"
#Taxonomy Reference data directory
#This folder should contain the 3 SQLite files created by the script "create_taxonomy_db.sh"
#gi_taxid_nucl.db - nucleotide db of gi/taxonid
#gi_taxid_prot.db - protein db of gi/taxonid
#names_nodes_scientific.db - db of taxonid/taxonomy
taxonomy_db_directory="/reference/taxonomy"
#RAPSearch viral database name: indexed protein dataset (all of Viruses) located in RAPSearch_db_directory
#make sure that directory also includes the .info file
RAPSearchDB_Vir="rapsearch_viral_aa_130628_db_v2.12"
Expand All @@ -228,6 +235,11 @@ RAPSearchDB_NR="rapsearch_nr_130624_db_v2.12"
#e value for BLASTn used in coverage map generation
eBLASTn="1e-15"
#specify a location for storage of temporary files.
#Space needed may be up to 10x the size of the input file.
#This folder will not be created by SURPI, so be sure it already exists with proper permissions.
temporary_files_directory="/tmp/"
EOF
) > $configprefix.config
#------------------------------------------------------------------------------------------------
Expand Down Expand Up @@ -376,8 +388,11 @@ fi
nopathf=${FASTQ_file##*/} # remove the path to file
basef=${nopathf%.fastq}

#This is the point to verify all input before the SURPI run initiates
# add verification for all databases and access to all dependencies

#verify taxonomy is functioning properly
result=$( taxonomy_lookup_embedded.pl -d nucl 149408158 )
result=$( taxonomy_lookup_embedded.pl -d nucl -q $taxonomy_db_directory 149408158 )
if [ $result = "149408158" ]
then
echo "taxonomy is functioning properly."
Expand Down Expand Up @@ -422,13 +437,17 @@ echo "Command Line Usage: $scriptname $@"
echo "SURPI version: $version"
echo "config_file: $config_file"
echo "Server: $host"
echo "Working directory: $( pwd )"
echo "run_mode: $run_mode"
echo "inputfile: $inputfile"
echo "inputtype: $inputtype"
echo "FASTQ_file: $FASTQ_file"
echo "cores used: $cores"
echo "Raw Read quality: $quality"
echo "Read length_cutoff for preprocessing under which reads are thrown away: $length_cutoff"

echo "temporary files directory used: $temporary_files_directory"

echo "SURPI_db_directory housing the reference databases for Comprehensive Mode: $SURPI_db_directory_COMP"
echo "SURPI_db_directory housing the reference databases for Fast Mode: $SURPI_db_directory_FAST"

Expand All @@ -440,6 +459,7 @@ echo "RAPSearch indexed viral db used: $RAPSearchDB_Vir"
echo "RAPSearch indexed NR db used: $RAPSearchDB_NR"
echo "rapsearch_database: $rapsearch_database"

echo "taxonomy database directory: $taxonomy_db_directory"
echo "adapter_set: $adapter_set"

echo "Raw Read length: $length"
Expand All @@ -449,6 +469,8 @@ echo "abysskmer length: $abysskmer"
echo "cache_reset: $cache_reset"
echo "start_nt: $start_nt"
echo "crop_length: $crop_length"

echo "e value for BLASTn used in coverage map generation: $eBLASTn"
echo "-----------------------------------------------------------------------------------------"

curdate=$(date)
Expand All @@ -466,8 +488,8 @@ then
echo -n "Starting: preprocessing using $cores cores "
date
START2=$(date +%s)
echo "Parameters: preprocess_ncores.sh $basef.fastq $quality N $length_cutoff $cores Y N $adapter_set $start_nt $crop_length>& $basef.preprocess.log"
preprocess_ncores.sh $basef.fastq $quality N $length_cutoff $cores Y N $adapter_set $start_nt $crop_length>& $basef.preprocess.log
echo "Parameters: preprocess_ncores.sh $basef.fastq $quality N $length_cutoff $cores Y N $adapter_set $start_nt $crop_length>& $basef.preprocess.log $temporary_files_directory"
preprocess_ncores.sh $basef.fastq $quality N $length_cutoff $cores Y N $adapter_set $start_nt $crop_length $temporary_files_directory >& $basef.preprocess.log
echo -n "Done: preprocessing "
date
END2=$(date +%s)
Expand Down Expand Up @@ -571,7 +593,7 @@ then
###retrieve taxonomy matched to NT ###
echo -n "taxonomy retrieval for $basef.NT.snap.matched.fulllength.sam"
date
taxonomy_lookup.pl "$basef.NT.snap.matched.fulllength.sam" sam nucl $cores >& "$basef.taxonomy.SNAPNT.log"
taxonomy_lookup.pl "$basef.NT.snap.matched.fulllength.sam" sam nucl $cores $taxonomy_db_directory >& "$basef.taxonomy.SNAPNT.log"
sed 's/NM:i:\([0-9]\)/0\1/g' "$basef.NT.snap.matched.fulllength.all.annotated" | sort -k 14,14 > "$basef.NT.snap.matched.fulllength.all.annotated.sorted"
rm -f "$basef.NT.snap.matched.fulllength.gi" "$basef.NT.snap.matched.fullength.gi.taxonomy"
fi
Expand Down Expand Up @@ -664,7 +686,7 @@ then

sed '/>/d' $basef.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.m8.fasta > $basef.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.m8.fasta.seq
paste $basef.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.m8 $basef.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.m8.fasta.seq > $basef.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.addseq.m8
taxonomy_lookup.pl $basef.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.addseq.m8 blast prot $cores
taxonomy_lookup.pl $basef.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.addseq.m8 blast prot $cores $taxonomy_db_directory
mv $basef.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.addseq.all.annotated $basef.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.annotated

table_generator.sh $basef.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.annotated RAP Y Y Y Y
Expand Down Expand Up @@ -702,7 +724,7 @@ then
paste $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e$ecutoff_Vir.NR.e${ecutoff_NR}.m8 $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.m8.fasta.seq > $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.addseq.m8
echo "made addseq file $(date)"
echo "############# RAPSearch Taxonomy $(date)"
taxonomy_lookup.pl $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.addseq.m8 blast prot $cores
taxonomy_lookup.pl $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.addseq.m8 blast prot $cores $taxonomy_db_directory
cp $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.addseq.all.annotated $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.annotated
echo "retrieved taxonomy"
grep "Viruses" $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.annotated > $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated
Expand Down Expand Up @@ -768,7 +790,7 @@ then
paste $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e${ecutoff_NR}.m8 $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e${ecutoff_NR}.m8.fasta.seq > $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e${ecutoff_NR}.addseq.m8
echo "made addseq file $(date)"
echo "############# RAPSearch Taxonomy $(date)"
taxonomy_lookup.pl $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e${ecutoff_NR}.addseq.m8 blast prot $cores
taxonomy_lookup.pl $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e${ecutoff_NR}.addseq.m8 blast prot $cores $taxonomy_db_directory
cp $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e${ecutoff_NR}.addseq.all.annotated $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e${ecutoff_NR}.annotated
echo "retrieved taxonomy"
grep "Viruses" $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e${ecutoff_NR}.annotated > $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPsearch.e${ecutoff_NR}.Viruses.annotated
Expand Down
7 changes: 4 additions & 3 deletions cutadapt_quality.csh
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@
# Please see license file for details.
# Last revised 1/26/2014

set TEMPDIR = "/tmp/"
# set TEMPDIR = "/tmp/"

if ($#argv != 5) then
echo "Usage: cutadapt_quality.csh <input FASTQ file> <quality S/I> <length cutoff> <keep short reads Y/N> <adapter_set>"
if ($#argv != 6) then
echo "Usage: cutadapt_quality.csh <input FASTQ file> <quality S/I> <length cutoff> <keep short reads Y/N> <adapter_set> <temporary_files_directory>"
exit(1)
endif

Expand All @@ -34,6 +34,7 @@ set quality = $argv[2]
set length_cutoff = $argv[3]
set keep_short_reads = $argv[4]
set adapter_set = $argv[5]
set TEMPDIR = $argv[6]
###

set numreads_start = `egrep -c "@HWI|@M00|@SRR" $inputfile`
Expand Down
9 changes: 5 additions & 4 deletions preprocess.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
# Please see license file for details.
# Last revised 1/26/2014

if [ $# != 8 ]; then
echo "Usage: preprocess.sh <R1 FASTQ file> <S/I quality> <Y/N uniq> <length_cutoff; 0 for no length_cutoff> <Y/N keep short reads> <adapter_set> <start_nt> <crop_length>"
if [ $# != 9 ]; then
echo "Usage: preprocess.sh <R1 FASTQ file> <S/I quality> <Y/N uniq> <length_cutoff; 0 for no length_cutoff> <Y/N keep short reads> <adapter_set> <start_nt> <crop_length> <temporary_files_directory>"
exit
fi

Expand All @@ -31,6 +31,7 @@ keep_short_reads=$5
adapter_set=$6
start_nt=$7
crop_length=$8
temporary_files_directory=$9
###
if [ ! -f $inputfile ];
then
Expand Down Expand Up @@ -65,12 +66,12 @@ then
# modified to take into account anything in there [N or Y]
echo `date`
START1=$(date +%s)
cutadapt_quality.csh $basef.modheader.fastq $quality $length_cutoff $keep_short_reads $adapter_set
cutadapt_quality.csh $basef.modheader.fastq $quality $length_cutoff $keep_short_reads $adapter_set $temporary_files_directory
mv $basef.modheader.cutadapt.fastq $basef.cutadapt.fastq
else
echo `date`
START1=$(date +%s)
cutadapt_quality.csh $inputfile $quality $length_cutoff $keep_short_reads $adapter_set
cutadapt_quality.csh $inputfile $quality $length_cutoff $keep_short_reads $adapter_set $temporary_files_directory
fi

END1=$(date +%s)
Expand Down
7 changes: 4 additions & 3 deletions preprocess_ncores.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
# Please see license file for details.
# Last revised 1/26/2014

if [ $# -lt 8 ]; then
echo "Usage: preprocess_ncores.sh <R1 FASTQ file> <S/I quality> <Y/N uniq> <length cutoff; 0 for no length cutoff> <# of cores> <Y/N clear_cache> <Y/N keep short reads> <adapter_set> <start_nt> <crop_length>"
if [ $# != 11 ]; then
echo "Usage: preprocess_ncores.sh <R1 FASTQ file> <S/I quality> <Y/N uniq> <length cutoff; 0 for no length cutoff> <# of cores> <Y/N clear_cache> <Y/N keep short reads> <adapter_set> <start_nt> <crop_length> <temporary_files_directory>"
exit
fi

Expand All @@ -30,6 +30,7 @@ keep_short_reads=$7
adapter_set=$8
start_nt=$9
crop_length=${10}
temporary_files_directory=${11}
###

if [ ! -f $inputfile ];
Expand Down Expand Up @@ -78,7 +79,7 @@ do
mv $f $f.fastq
echo "preprocessing $f.fastq..."
echo "preprocess.sh $f.fastq $quality N $length_cutoff $keep_short_reads $adapter_set $start_nt $crop_length >& $f.preprocess.log &"
preprocess.sh $f.fastq $quality N $length_cutoff $keep_short_reads $adapter_set $start_nt $crop_length >& $f.preprocess.log &
preprocess.sh $f.fastq $quality N $length_cutoff $keep_short_reads $adapter_set $start_nt $crop_length $temporary_files_directory >& $f.preprocess.log &
done

for job in `jobs -p`
Expand Down
7 changes: 3 additions & 4 deletions taxonomy_lookup.pl
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,13 @@
use Time::HiRes qw[gettimeofday tv_interval];
use DBI;

if ( @ARGV != 4 ) {
print "USAGE: <taxonomy_lookup.pl> <blast_file/sam_file> <file_type:blast/sam> <nucl/prot> <cores>\n";
if ( @ARGV != 5 ) {
print "USAGE: taxonomy_lookup.pl <blast_file/sam_file> <file_type:blast/sam> <nucl/prot> <cores> <taxonomy_reference_directory>\n";
exit;
}

my ($inputfile, $filetype, $seq_type, $cores) = @ARGV;
my ($inputfile, $filetype, $seq_type, $cores, $database_directory) = @ARGV;

my $database_directory = "/reference/taxonomy";
my $sql_taxdb_loc_nucl = "$database_directory/gi_taxid_nucl.db";
my $sql_taxdb_loc_prot = "$database_directory/gi_taxid_prot.db";
my $names_nodes = "$database_directory/names_nodes_scientific.db";
Expand Down
22 changes: 15 additions & 7 deletions taxonomy_lookup_embedded.pl
Original file line number Diff line number Diff line change
Expand Up @@ -24,19 +24,16 @@
my $sql_taxdb_loc;
my $taxid;

my $database_directory = "/reference/taxonomy";
my $sql_taxdb_loc_nucl = "$database_directory/gi_taxid_nucl.db";
my $sql_taxdb_loc_prot = "$database_directory/gi_taxid_prot.db";
# my $database_directory = "/reference/taxonomy";
my $lineage="";
my $name;
my $names_nodes = "$database_directory/names_nodes_scientific.db";
my $gi;
my $gi_count = 0;
sub trim($);
my %rank_to_print;
our ($opt_d, $opt_k, $opt_p, $opt_c, $opt_o, $opt_f, $opt_g, $opt_s, $opt_l, $opt_h, $opt_x);
our ($opt_q, $opt_d, $opt_k, $opt_p, $opt_c, $opt_o, $opt_f, $opt_g, $opt_s, $opt_l, $opt_h, $opt_x);

getopts('d:kpcofgslhtxn:m:');
getopts('q:d:kpcofgslhx');

if ($opt_h) {
print <<USAGE;
Expand All @@ -51,11 +48,17 @@
Usage:
To look up taxonomy for a file containing gi
taxonomy_lookup_embedded.pl -kpcofgsx -d nucl 149408158
taxonomy_lookup_embedded.pl -kpcofgsx -d nucl -q "/reference/taxonomy" 149408158
Command Line Switches:
-h Show help & ignore all other switches
-q folder containing taxonomy databases
This folder should contain the 3 SQLite files created by the script "create_taxonomy_db.sh"
gi_taxid_nucl.db - nucleotide db of gi/taxonid
gi_taxid_prot.db - protein db of gi/taxonid
names_nodes_scientific.db - db of taxonid/taxonomy
-d nucl/prot
This specifies whether the gi list are nucleotides or protein. It is a required parameter.
Expand All @@ -74,6 +77,11 @@
exit;
}

my $database_directory = $opt_q;
my $sql_taxdb_loc_nucl = "$database_directory/gi_taxid_nucl.db";
my $sql_taxdb_loc_prot = "$database_directory/gi_taxid_prot.db";
my $names_nodes = "$database_directory/names_nodes_scientific.db";

if ($opt_k) {$rank_to_print{kingdom} = "1";}
if ($opt_p) {$rank_to_print{phylum} = "1";}
if ($opt_c) {$rank_to_print{class} = "1";}
Expand Down

0 comments on commit 4f8ee13

Please sign in to comment.