analysis_directory/parameters.tsv

############################################################################################
################################   PEMA 's PARAMETERS   ####################################
############################################################################################
#
#    for version 2.1.4
#
# In this file there are all the parameters that need to be assigned every time PEMA is 
# about to run! The parameters we have here, are not the only parameters of the tools 
# invoked by PEMA. Hence, we encourage you the most to study the manual of each tool and 
# make them as good as possible for your specific experiment.
#
# ATTENTION!
# From each variable you have to leave EXACTLY ONE (1) TAB and then fill in the parameter. 
#
#
############################################################################################
########################## The parameter setting starts frome here!  #######################
############################################################################################
#
# Give in your each uniq experiment a NAME, so a single output file will be created for each of them 
#
outputFolderName	test_Swarm_16S
#
#
# PEMA needs the names of the samples to be as in ENA format (e.g "ERR1021912") but 
# they are more like as the Illumina sequencer returned them (e.g "FdCor_1.fast.gz") 
# In order to change the names of your samples as they should be.
# Otherwise, you need to set the "EnaData" variable as 'No'.  
EnaData	Yes
#
# Each sequencer has a special combination of letters with which all its reads start with
# For example MiSeq has "@M0". Please provide this pattern to PEMA to convert your data to the ENA 
# format. If you do not know that, you can either google it or gunzip a fastq.gz files of yours and provide
# the letatters between the "@" and the first ":" characters.
sequencerPrefix	M0
#
#############################################################
#################   fastqc (v0.11.8)    #####################
#############################################################
#
# No parameters here!
#
#
#############################################################
################ trimmomatic (v.0.38)   #####################	// http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/TrimmomaticManual_V0.32.pdf
#############################################################
#
# Performs an adaptive quality trim, balancing the benefits of retaining longer reads against the costs of retaining bases with errors.
# It needs to be set either as 'Yes' or 'No'.
#
maxInfo	Yes
#
#
############ for MAXINFO ####################
# Specifies the read length which is likely to allow the location of the read within the target sequence to be determined.
# It needs an integer to be set as a value.
#
targetLength	100
#
#
# This value, specifies the balance between preserving as much read length as possible vs. removal of incorrect bases.
# It can take values between 0 and 1.
# A low value of this parameter (<0.2) favours longer reads, while a high value (>0.8) favours read correctness.
#
strictness	0.8
#
#
############ for ILLUMINACLIP ##################
# Specifies the path to a fasta file containing all the adapters, PCR sequences etc.
# The naming of the various sequences within this file determines how they are used.
#
adapters	TruSeq2-PE.fa
#
#
# This parameter specifies the maximum mismatch count which will still allow a full match to be performed.
# It needs an integer to be set as a value.
#
seedMismatches	0
#
#
# This parameter specifies how accurate the match between the two 'adapter ligated' reads must be for PE palindrome read alignment.
# It needs an integer to be set as a value.
#
palindromeClipThreshold	20
#
#
# It specifies how accurate the match between any adapter etc. sequence must be against a read.
# It needs an integer to be set as a value.
#
simpleClipThreshold	30
#
#
############ for LEADING  ##########################
# The LEADING modules, removes low quality bases from the beginning.
# As long as a base has a value below this threshold (value of the 'leading' parameter) the base is removed and the next base will be investigated.
# It needs an integer to be set as a value.
#
leading	20
#
#
############ for TRAILING  #############################
# This module of Trimmomatic removes low quality bases from the end.
# As long as a base has a value below this threshold (value of the 'trailing' parameter), the base is removed
# and the next base (which as trimmomatic is starting from the 3' prime end, would be base preceding the just removed base) will be investigated.
# It needs an integer to be set as a value.
#
trailing	2
#
#
############  for MINLEN  ################################
# This module removes reads that fall below the specified minimal length. If required, it should normally be after all other processing steps.
# Reads removed by this step will be counted and included in the "dropped reads" count presented in the trimmomatic summary.
# It needs an integer to be set as a value.
#
minlen	100
#
#
# Finally, you need to set how many threads you want Trimmomatic to run into. 
#
threadsTrimmomatic	8
#
#
###############################################################
#########   BayesHammer (from SPAdes: v3.13.0)  ###############	
###############################################################
#
# This algorithm is part of the SPAdes tool package. You can find its manual here: http://cab.spbu.ru/files/release3.10.1/manual.html#sec1
#
# no parameters here!
#
#
################################################################
#################    PANDAseq (v. 2.11)      ###################	// https://storage.googleapis.com/pandaseq/pandaseq.html
################################################################
#
# PANDAseq is the algorithm that PEMA uses in order to merge the paired-end reads.
# PANDAseq has more than one merging algorithms.
#
# Here, we set the algorithm used for assembly. The most common of them are:
# pear --> uses the formula described in the PEAR paper (Zhang 2013), optionally with the probability of a random base (q) provide
# simple_bayesian --> uses the formula described in the original paper (Masella 2012), optionally with an error estimation (ε) provided.
# other options are stich, flash and more that you can fing in the above link.
#
pandaseqAlgorithm	simple_bayesian
#
#
# PANDAseq is a I/O bound algorithm. That means that it needs severous time in order to handle the ipnut and output files
# while the process is quite fast. However, it does support multithreading and here you can set the number of threads it is going to use.
#
pandaseqThreads	8
#
#
# The 'minlen' parameter sets the minimum length for a sequence, after primers are removed. 
# By default, all sequences are kept. With this option, sequences shorter than desired can be discarded.
# In case you need to use this parameter, be sure you leave a tab after 'minlen' and set it like this: '-l 80'
# If you do not want to use this parameter, please remove everything after the 'minlen' 
#
pandaseqMinlen	
#
#
# The 'minoverlap' parameter sets the minimum overlap between forward and reverse reads. 
# By default, this is at least one nucleotide of overlap. 
# Raising this number does not generally increase the quality of the output as alignments with small overlaps tend to score poorly and are discarded anyway.
#
minoverlap	1
#
#
# The 'threshold' parameter sets the score, between zero and one, that a sequence must meet to be kept in the output. 
# Any alignments lower than this will be discarded as low quality. 
# Increasing this number will not necessarily prevent uncalled bases (Ns) from appearing in the final sequence. 
# It is also used as the threshold to match primers, if primers are supplied. The default value is 0.6.
#
threshold	0.6
#
#
# The '-N' parameter eliminates all sequences with uncalled nucleotides in the output. 
# Otherwise, during assembly, uncalled bases (Ns) from unpaired regions may be emitted.
# If you need -N to be on your analysis, please add '-N' after 'elimination'. Please make sure you leave a tab. 
# If you do not want the parameter to be on, please make sure there is nothing after the 'elimination' parameter.
#
elimination	
#
#
## PEMA performs the PANDAseq algorithm, with the -a and the -B parameters also on.
## That it for striping the primers after assembly, rather than before and allowing input sequences to lack a barcode/tag correspondingly.
#
#
###############################################################
###########  obiuniq  (from OBITools: v1.2.12)  ###############
###############################################################
#
# This algorithm is part of the OBITools package. You can find more about them here: https://pythonhosted.org/OBITools/
#
# no parameters here!
#
#
###############################################################
################    VSEARCH  (v2.9.1)    ######################
###############################################################
#
# VSEARCH is the main algorithm used for a lot of steps in the case of the 16S marker gene.
# Set how many threads do you want PANDAseq to use.
# It needs an integer to be set as a value.
#
vsearchThreads	8
#
#
# Here you need to set a score about the clustering step of the VSEARCH algorithm. 
# Do not add a read into a certain cluster if the pairwise identity with its centroid, is lower than the value of the 'vsearchId' parameter.
# The pairwise identity is defined as the number of (matching columns) / (alignment length - terminal gaps).
# It needs a real number to be set as a value, ranging from 0.0 to 1.0 .
#
vsearchId	0.95
#
#
################################################################################################
#//////////////////////////////////////////////////////////////////////////////////////////////
############################ GENE -  dependent parameters #####################################
#//////////////////////////////////////////////////////////////////////////////////////////////
###############################################################################################
#
# The marker gene you have is really important for both the clustering & chimera removal procedure and the
# taxonomy assignment. By default, the pipeline runs for 16S. Substitute with 'COI' if COI is your marker gene
# write it down after the underscore (_) - do not erase the udenrscore!!
#
gene	gene_16S
#
#
##########################################################################################################
#  Here are some parameters needed when the the metabarcoding analysis is about the 16S/18S marker genes #
##########################################################################################################
#
#
# If your marker gene is 16S, you can choose between 2 different approaches of taxonomy assignment (alignment & phylogenetic based)
# An alignment based taxonomy assignment - set as 'alignment' -  which is based on SILVA and CREST (version 3.0).
# However, you can also get a phylogenetic based assignment, by putting 'phylogeny' in this parameter. In that case, a reference tree we created is being used as well as the RAxML 
#
taxonomyAssignmentMethod	alignment
#
#
# I you choose phylogeny based taxonomy assignment, then you ll need to rum PaPaRa.
# Pleas fill in how many cores PaPaRa is able to use.
#
numberOfCoresForPapara	7
#
#
# When you use the alignment-based taxonomy assignment, then the LCAClassifier from the CREST algorithm, uses a Silva version for the assignment. 
# PEMA allows you to choose between the two last version of Silva. Hence, set the "silvaVersion" parameter either as 'silva_128' or as 'silva_132'
# depending on the version of your choice. 
# In case you are running 18S rRNA data, you may also use the PR2 database, by setting the referenceDb parametera as 'pr2'.
#
referenceDb	silva_132
#
#
# As you may need a series of taxonomy assignment when you use the alignment-based method, please give another name in your 
# taxonomy output folder of the CREST algorithm, each time you are about to use it. 
# CREST creates an output folder every time and if a folder with the same already exists, it is going to abort the task!
# You need to set the value of this parameter in case of 16S/18S rRNA and ITS marker genes. 
#
taxonomyFolderName	my_taxon_assign
#
#
##########################################################################################################
#######################     For the case of the ITS marker gene     ######################################
##########################################################################################################
#
# For the case of ITS there is an extra problem with respect to the primes used.
# Please complete the next two variables with the primers you used
#
forwardITSPrimer	GATGAAGAACGYAGYRAA
reverseITSPrimer	CTBTTVCCKCTTCACTCG
#
#
##########################################################################################################
########     Here are some parameters needed with respect to clustering algorithms     ###############
##########################################################################################################
#
#
# For the case of the 16S and 18S rRNA marker genes, you can either get an OTU-table using the VSEARCH algorithm
# or you can get an ASV-table by taking advantage of the SWARM algorithm. 
# If your marker gene is  COI or ITS, you can choose between 2 different approaches of clustering. 
# Depending on which of them you choose you get either a robust output in a short time (Swarm) or a non-robust output (CROP) that requires quite much more time.
# Please fill in accroding to your marker gene and needs (write "Swarm" or "vsearch" or "CROP" after algo_).
#
clusteringAlgo	algo_Swarm
#
#
# In case of SWARM, the user needs to speeecify the value of "d" parameter,
# maximum number of differences allowed between two amplicons, meaning that two amplicons
# will be grouped if they hav e integer (or less) differences. This is swarm's most important
# parameter
#
d	15
#
#
# when using the option --fastidious (-f), define the minimum mass of a large ASV. 
# By default, an ASV with a mass of 3 or more is considered large. 
# Conversely, an ASV is small if it has a mass of less than 3, meaning that it is composed of either one amplicon of abundance 2, or two amplicons of abundance 1. 
# Any positive value greater than 1 can be specified. Using higher boundary values will speed up the second pass, but also reduce the taxonomical resolution of swarm
# results. Default mass of a large OTU is 3.
#
boundary	3
#
#
# You also need to set the number of threads that Swarm is able to use
#
swarmThreads	20
#
#
## SWARM tends to create a great numebr of ASVs, especially when d takes a low value. 
## Would you like to remove the singletons (ASVs that appear only once with abundance equal to 1) ?
#
removeSingletons	Yes
#
#
# CROP is able to run on multiple CPUs. As PEMA needs to update an environmental variable to do thi$
#
omp_num_threads	20
#
#
# PEMA invokes the UCHIME_DENOVO3 algorithm for the chimera removal in the case of the COI marker gene.
# This is a quite common algorithm for that step, but you still need to choose the minimum abundance skew value.
# By default this value is equal to 16. However, the only difference between UCHIME_DENOVO2 and 3 is that in the second version,
# the abskew value used to be equal to 2. So, be my guest and pick your value as I really have no clue about that.
# Probably, for environmental studies a low abskew is better, while in more specific studies a larger one would fit most.
#
abskew	2
#
#
#########################################################################################################
# 
#                     For the taxonomy assignment of the COI marker gene
#
#########################################################################################################
#
# Since PEMA v.2.0 both Midori and Midori2 are supported. The second version of Midori includes more than
# 180.000 unique species and expands beyond Metazoa. However, the computational time needed increases to a
# a great extent when Midori2 is the one selected. Choose wisely, especially in case you have a lot of query sequencies
# If you want Midori2 for your analysis, replace "1" with "2" after the "midori_" prefix.
#
midori_version	midori_1
#
#
#########################################################################################################
#
###########################################################################################################
#
#               In - house (local) refererence database for the taxonoomy assignment step
#
###########################################################################################################
# In case you would prefer to use a reference database of yours for the taxonomy step, PEMA will ask you for the 
# files necessary to train each of the two classifiers supported. You need to provide those in the directory called
# `local_ref_db` and depending on which classifier will train, the files have to be in a different format.
# More specifically, if you are working with COI, then PEMA uses thes RDPClassifier and you need to provide a taxonomy
# and a sequence file, as described here: https://john-quensen.com/tutorials/training-the-rdp-classifier/
# If you are working with 16S, 18S or ITS then PEMA uses the CREST classifier for the taxonomy step
# and you need to provide the corresponding two files as described at: https://github.com/lanzen/CREST
# (see after the Output section).
# As containers are lost when exit from one, you will have to train the classifier, every time you run a new PEMA container
# If you are about to use a custom ref db, set the following parameter as 'Yes'. Otherwise, it must be set as 'No'.
# The `name_of_custom_db` may be empty or no depending on whether you will use a custom db or not.
#
custom_ref_db	No
name_of_custom_db	partialCustomdb
#
#######################################################################################################
#
# In case you would like to link the OTU/ASV assignment to its closest NCBI Taxonomy Id
# set the following parameter as `Yes`
#
getNCBITaxId	No
#
######################################################################################################
#
# The following parameters is only for the case that the Phyloseq R package is about to run
#
######################################################################################################
#
# If wish to use Phyloseq in order to analyse your returned data then set the following  parameter 'phyloseq' with 'Yes'.
# In order to do that, PEMA needs an MSA that it is returned by the MAFFT (v7.427) aligner and a phylogeny tree of the OTUs found which is built by the RAxML-ng algorithm.
# Please remember that in order to use phyloseq a "metadata.tsv" file is necessary to be part of your anaylis folder. 
#
phyloseq	Yes
#
#
# The phyloseq object can handle phylogenetic trees as well. PEMA uses RAxML-ng in order to build such trees.
# Do you want to create such a tree with your OTUs? In case you build this once, you can use it as many times as you want.
#
tree	Yes
#
# In case you are about to use the phyloseq option, then a phylogeny tree has to be built. Hence, PEMA invokes the RAxML-ng algorithm
# which is able to run in more than one threads. Please set the number of threads RAxML is able to use.
#
raxmlThreads	5
#
#
# You can also set the number of the parsimony-based starting trees for the RAxML-ng 
#
parsTrees	1
#
#
# And finally, the number of the bootstrap trees
#
bootstrapTrees	1
#
###########################################################################################################
#
# Finally, do you want your raw data to be removed in another file and empty the "rawData" file and all the checkpoints of PEMA to be also in an extra folder
# Swich 'Yes' to 'No' if you wish so. Be very careful when you do that, as you need to remember that if you want to analyze another dataset through P.E.M.A
# you will have to remove the first one manually.
#
emptyRawDataFile	Yes
emptyCheckpoints	Yes
#
#
# Classifier to use, parameter values ("CREST","phylogeny","RDPClassifier")
# If you are about to use a version of Silva db, or Unite db or a custom db that you have prepared
# the required files accordingly 
# (see https://hariszaf.github.io/pema_documentation/training_crest_classifier/), 
# you may set this parameter as "CREST".
# In case you are running 16S data and you would like a phylogeny based taxonomy assignment, 
# you may set this parametera as "phylogeny"
# In case you are about to use a version of Midori or a custom db for which you have prepared the 
# required files (see https://hariszaf.github.io/pema_documentation/training_rdpclassifier/) you 
# may set this parameter as "RDPClassifier"
classifierAlgo	CREST
#
# 
#