# This is the parameter file for the pathogen discovery pipeline.
# Every command (or "module", if you like) has its own settings. The settings for a particular module must follow its "command module" line. Otherwise, the order doesn't matter.
# How to use this parameter file:
# for boolean options, use "yes" or "1" for assent OR "no" or "0" or "-" for dissent (or simply comment out or omit the line).
# the default settings are generally "no" unless otherwise specified
# note: make sure nt and nr are up-to-date!
# -------------------
command step1
seq_platform illumina # choices are: illumina or 454
# explanation: must be run first, hence the name step1. this module maps the fastq IDs to simple numerical IDs and, if the platform is 454, processes the .sff file
# -------------------
command quality_filter
cutadapt_options_R1 -a GATCGGAAGAGCACACGTCTGAACTCCAGTCAC -g GCCGGAGCTCTGCAGATATC -a GATATCTGCAGAGCTCCGGC -m 50 --match-read-wildcards
cutadapt_options_R2 -a AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT -g GCCGGAGCTCTGCAGATATC -a GATATCTGCAGAGCTCCGGC -m 50 --match-read-wildcards
# cutadapt_options -g GCCGGAGCTCTGCAGATATC -m 20 --match-read-wildcards # this should have every flag except the input and output ones
# cutadapt_options2 -g CGCCGTTTCCCAGTAGGTCTC -m 20 --match-read-wildcards # this should have every flag except the input and output ones
# prinseq_options -log -verbose -min_len 50 -ns_max_p 10 -derep 12345 # this should have every flag except the input and output ones (i.e., don't specify the "-out_good" "-out_bad" or "fastq" flags)
prinseq_options -min_len 50 -derep 14 -lc_method dust -lc_threshold 3 -trim_ns_left 1 -trim_ns_right 1 -trim_qual_right 15 # this should have every flag except the input and output ones (i.e., don't specify the "-out_good" "-out_bad" or "fastq" flags)
# explanation: run up to 2 iterations of cutadapt, if specified, and prinseq, if specified
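# for reference, a sketch of how these settings plug into the underlying tools. file names here are
# illustrative placeholders, and the input/output flags shown are the ones the pipeline supplies itself:
# cutadapt -a GATCGGAAGAGCACACGTCTGAACTCCAGTCAC -g GCCGGAGCTCTGCAGATATC -a GATATCTGCAGAGCTCCGGC -m 50 --match-read-wildcards -o R1.trimmed.fastq R1.fastq
# prinseq-lite.pl -fastq R1.trimmed.fastq -out_good R1.clean -out_bad null -min_len 50 -derep 14 -lc_method dust -lc_threshold 3 -trim_ns_left 1 -trim_ns_right 1 -trim_qual_right 15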
# -------------------
command host_map
# for the "_list" settings, use a comma-delimited list with no spaces
mapper_program_list bowtie2,bowtie2 # choices are: bwa, bowtie2
mapper_db_list /media/VD_Research/databases/humandna/hg38,/media/VD_Research/databases/humanrna/hg38_mrna
mapper_name_list bowtie2_genome_local,bowtie2_transcript_local # names that will appear on a graph
mapper_options_list --local -p 10,--local -p 10 # flags for the aligner (-p specifies how many threads to use)
# explanation: perform a chain of alignments.
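# for reference, the first list entry above would translate into roughly the following bowtie2 call
# (read and output file names are illustrative placeholders):
# bowtie2 --local -p 10 -x /media/VD_Research/databases/humandna/hg38 -U filtered_reads.fastq -S host_genome.sam
# the reads left unmapped after each step are presumably what gets passed to the next mapper in the chain.
# -------------------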
command iterative_blast_phylo
# for the "_list" settings, use a comma-delimited list with no spaces
blast_db_list /media/VD_Research/databases/ncbi/blast/nt/nt,/media/VD_Research/databases/ncbi/blast/nt/nt # blast db prefix (a third entry, e.g. /media/VD_Research/databases/diamond/diamondnr, would add a diamond step)
blast_task_list megablast,dc-megablast # options are: megablast, dc-megablast, blastn, diamond
# blast_options_list -evalue 1e-4 -word_size 28,-evalue 1e-4 -word_size 12,-evalue 1e-4 -word_size 7 # blast options (except for: -task -query -db -out -outfmt -num_descriptions; these are hardwired)
blast_options_list -evalue 1e-4 -word_size 28,-evalue 1e-4 -word_size 12 # blast options (except for: -task -query -db -out -outfmt -num_descriptions; these are hardwired). the matching diamond entry would be: --compress 0 -p 0 -v -k 10 -w 28 --id 0.7 -c 4 -t ~/tmp
ninst_list 10,10 # the input file is broken into chunks and blasted in parallel; this sets how many BLAST instances run at once
taxonomy_names /media/VD_Research/databases/ncbi/taxonomy/names.dmp # NCBI taxonomy names dump file
taxonomy_nodes /media/VD_Research/databases/ncbi/taxonomy/nodes.dmp # NCBI taxonomy nodes dump file
blast_pro_db /media/VD_Research/databases/ncbi/blast/nr/nr # BLAST protein database used for annotation
# explanation: perform a chain of blasts. each input is split into chunks that are blasted in parallel to speed up the process. note: if you use diamond, keep the chunks small; otherwise it takes a long time.
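# for reference, each chunk of the first iteration would be blasted with roughly the following command
# (chunk and output names are illustrative placeholders; -task, -query, -db, -out and the other hardwired flags are filled in by the pipeline):
# blastn -task megablast -query chunk_01.fasta -db /media/VD_Research/databases/ncbi/blast/nt/nt -evalue 1e-4 -word_size 28 -out chunk_01.blast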
# -------------------
command iterative_blast_phylo_2
# this is the same as iterative_blast_phylo; it just lets you run the module an n-th time
# for the "_list" settings, use a comma-delimited list with no spaces
blast_db_list /media/VD_Research/databases/ncbi/blast/nt/nt,/media/VD_Research/databases/ncbi/blast/nt/nt # blast db prefix
blast_task_list megablast,dc-megablast # options are: megablast, dc-megablast, blastn, diamond
# blast_options_list -evalue 1e-4 -word_size 28,-evalue 1e-4 -word_size 12,-evalue 1e-4 -word_size 7 # blast options (except for: -task -query -db -out -outfmt -num_descriptions; these are hardwired)
blast_options_list -evalue 1e-4 -word_size 28,-evalue 1e-4 -word_size 12 # blast options (except for: -task -query -db -out -outfmt -num_descriptions; these are hardwired)
ninst_list 10,10 # the input file is broken into chunks and blasted in parallel; this sets how many BLAST instances run at once
taxonomy_names /media/VD_Research/databases/ncbi/taxonomy/names.dmp # NCBI taxonomy names dump file
taxonomy_nodes /media/VD_Research/databases/ncbi/taxonomy/nodes.dmp # NCBI taxonomy nodes dump file
# explanation: run another instance of the iterative_blast_phylo module. you can run more instances by adding entries in this parameter file for "iterative_blast_phylo_3", "iterative_blast_phylo_4", etc.
# -------------------
command iterative_blast_phylo_3
# run diamond on the getorf output
blast_db_list /media/VD_Research/databases/diamond/diamondnr # blast db prefix
blast_task_list diamond # options are: megablast, dc-megablast, blastn, diamond
# blast_options_list -evalue 1e-4 -word_size 28,-evalue 1e-4 -word_size 12,-evalue 1e-4 -word_size 7 # blast options (except for: -task -query -db -out -outfmt -num_descriptions; these are hardwired)
blast_options_list --compress 0 -p 0 -v -k 10 -w 28 --id 0.7 -c 4 -t ~/tmp # diamond options
ninst_list 10 # number of instances to run in parallel
taxonomy_names /media/VD_Research/databases/ncbi/taxonomy/names.dmp # NCBI taxonomy names dump file
taxonomy_nodes /media/VD_Research/databases/ncbi/taxonomy/nodes.dmp # NCBI taxonomy nodes dump file
blast_pro_db /media/VD_Research/databases/ncbi/blast/nr/nr # BLAST protein database used for annotation
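# for reference, a sketch of the diamond call this module would build (blastp is assumed here because
# getorf emits protein sequences; file names are illustrative placeholders and the exact output flags
# depend on the diamond version installed):
# diamond blastp -d /media/VD_Research/databases/diamond/diamondnr -q orf_chunk_01.fasta --compress 0 -p 0 -v -k 10 -w 28 --id 0.7 -c 4 -t ~/tmp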
# -------------------
command ray2_assembly
kmer 25 # assembler k-mer
ninst 10 # number of instances for mpiexec
cap 1 # use cap after ray
# cap_options # cap options
map2contigs yes # if "yes" or "1", map reads back onto assembly
bowtie2_options --local -p 10 # only needed if map2contigs is set. options for the mapper (-p specifies how many threads to use)
# explanation: perform an assembly using Ray
parse_contigs yes # if "yes" or "1", parse contigs into reads
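# for reference, with the settings above the assembly step would look roughly like the following
# (read file names are illustrative placeholders):
# mpiexec -n 10 Ray -k 25 -s unmapped_reads.fasta -o ray_output
# and, since map2contigs is on, the reads would then be mapped back with something like:
# bowtie2-build ray_output/Contigs.fasta contigs_index
# bowtie2 --local -p 10 -x contigs_index -U unmapped_reads.fastq -S reads_vs_contigs.sam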
# -------------------
command orf_filter
# filter fasta input by orf
getorf_options -minsize 60 -find 0 # any options other than "-sequence" and "-outseq"
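# for reference, these options would be appended to an EMBOSS getorf call roughly like the following
# (file names are illustrative placeholders):
# getorf -sequence contigs.fasta -outseq orfs.fasta -minsize 60 -find 0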
# -------------------