_run_melseq

#!/bin/sh

# Locations of the prism tool suites and of the sequencing data root.
# They are exported so that the programs launched from this script see them.
SEQ_PRISMS_BIN=/dataset/gseq_processing/active/bin/melseq_prism/seq_prisms
GBS_PRISM_BIN=/dataset/gseq_processing/active/bin/gbs_prism
MELSEQ_PRISM_BIN=/dataset/gseq_processing/active/bin/melseq_prism
HISEQ_ROOT=/dataset/gseq_processing/scratch/illumina/hiseq
export SEQ_PRISMS_BIN GBS_PRISM_BIN MELSEQ_PRISM_BIN HISEQ_ROOT

# number of parallel jobs handed to make (-j)
NUM_THREADS=8

# Read one line of user input from stdin into the global variable "answer".
# If nothing was typed (just Enter, or stdin is at end-of-file), "answer" is
# set to the default instead.
# Arguments: $@ - the default answer; several words are joined with spaces
#                 (with the default IFS). May be empty.
# Sets:      answer
function read_answer_with_default() {
   # -r keeps backslashes in the typed text literal; a plain "read" treats
   # them as escape characters and silently drops them
   read -r answer
   if [ -z "$answer" ]; then
      answer="$*"
   fi
}

# Parse the command line.
# Options:
#   -h        print the help text and exit with status 0
#   -b        batch (non-interactive) mode : sets BATCH_MODE=yes
#   -a NAME   analysis to run              : stored in ANALYSIS
#   -O DIR    output root folder           : stored in OUT_ROOT
# Remaining arguments:
#   batch mode  - all of them, space separated, are stored in DATASET
#   otherwise   - the first one is stored in melseq_version
# Also sets INTERACTIVE and INTERACTIVE_OPT to their initial values.
# Exits with status 1 on an unknown option or a missing option argument.
function get_opts() {
   # OPTIND is local so the scan always starts at the first argument of this
   # call, and no getopts state leaks out to (or in from) other callers
   local opt OPTIND=1

   INTERACTIVE=no
   INTERACTIVE_OPT=""
   BATCH_MODE="no"
   help_text="
This script is called by run_melseq (or non-interactively by the gbs pipeline)
usage :\n
"
   while getopts ":hba:O:" opt; do
   case "$opt" in
       h)
         # quoted so that the line breaks in the help text are preserved
         echo -e "$help_text"
         exit 0
         ;;
       b)
         BATCH_MODE=yes
         ;;
       a)
         ANALYSIS=$OPTARG
         ;;
       O)
         OUT_ROOT=$OPTARG
         ;;
       \?)
         echo "Invalid option: -$OPTARG" >&2
         exit 1
         ;;
       :)
         echo "Option -$OPTARG requires an argument." >&2
         exit 1
         ;;
     esac
   done

   # drop the options that were just parsed, leaving the positional arguments
   shift $((OPTIND-1))

   if [ "$BATCH_MODE" == "yes" ]; then
      DATASET="$*"
   else
      melseq_version=$1
   fi
}


# Count the lines of a file and store the count in the global "line_count".
# Arguments: $1 - path of the file to count
# Sets:      line_count (left empty if the file cannot be read)
function get_line_count() {
   # the file is fed to wc on stdin and the path is quoted, so names containing
   # whitespace work; awk strips any padding wc puts around the number
   line_count=$(wc -l < "$1" | awk '{print $1}')
}

# Interactively settle which files the current step will process and, when
# demultiplexing has not been done yet, which sample info (keyfile) to use.
#
# Globals read : OUT_ROOT, DATASET, ANALYSIS, start_with_merged, COHORT,
#                GBS_PRISM_BIN
# Files used   : $OUT_ROOT/input_file_list.txt       re-used if present, otherwise
#                                                    built by listing DATASET
#                $OUT_ROOT/blast_input_file_list.txt written when starting from
#                                                    merged fasta
#                $OUT_ROOT/sample_info.txt           re-used if present, otherwise
#                                                    extracted with listDBKeyfile.sh
# Relies on read_answer_with_default (sets "answer") and get_line_count (sets
# "line_count").
# Exits with status 1 when the user declines a listing, or when input files or
# sample info cannot be obtained.
function get_input_files() {
   local keyfile_status

   # this gets input files to process, including the sample info
   # (the loop body runs at most once - every path through it ends in break or exit)
   while [ 1 ]; do
      if [ -f "$OUT_ROOT/input_file_list.txt" ]; then

         echo "looks like you might be restarting a previous run - $OUT_ROOT/input_file_list.txt exists"
         echo "(this is OK to restart from the current step - see below to restart from a previous step)"
         echo "the input for the current step would be . . . (press Enter/Return for a list - then press space bar to page through listing)"
         echo "(answer n to the next OK ? prompt you get, if you want to backtrack, or customise the list)"
         read answer
         more "$OUT_ROOT/input_file_list.txt"
         echo "OK ? (default = y)"
         read_answer_with_default y
         if [ "$answer" == "y" ]; then
            echo ""
         else
            echo "
ok quitting - here are some suggestions for modifying the list of files to be processed, before re-trying:

* you could edit $OUT_ROOT/input_file_list.txt
* you could delete $OUT_ROOT/input_file_list.txt 
* you could replace $OUT_ROOT/input_file_list.txt with one of the following input lists, if starting from that step : 
"
            ls -l "$OUT_ROOT"/*input*.txt
            echo "
(i.e. cp _one_of_the_inputs  $OUT_ROOT/input_file_list.txt)

note that as well as setting up $OUT_ROOT/input_file_list.txt , you should also remove the job-completed 
landmark files for the job you want to run - for example if you want to re-start from the format step,
then as well as copying the format step input file list , to $OUT_ROOT/input_file_list.txt, you should 
rm $OUT_ROOT/*format
            "
            exit 1
         fi
      fi

      # we only need to get the sample info file ( needed for demultiplexing) if we are not starting with merged fasta (as in that case demultiplex has already been done)
      if [ "$start_with_merged" != "yes" ]; then

          if [[ ( -f $OUT_ROOT/sample_info.txt ) && ( ( $ANALYSIS == "demultiplex" ) || ( $ANALYSIS == "all" ) || ( $ANALYSIS == "merge_lanes" ) || ( $ANALYSIS == "trim" ) || ( $ANALYSIS == "format" ) )  ]]; then
             echo "will use the following sample info . . . (press Enter/Return for a list - then press space bar to page through listing)"
             read answer
             more "$OUT_ROOT/sample_info.txt"
             echo "OK ? (default = y)"
             read_answer_with_default y
             if [ "$answer" == "y" ]; then
                if [ -f "$OUT_ROOT/input_file_list.txt" ]; then
                   break   # we have everything we need
                fi
             else
                echo "
    ok quitting - you can edit the file $OUT_ROOT/sample_info.txt in order to customise the
    sample info , or remove it so that it is re-extracted, and try again"
                exit 1
             fi
          fi
      fi

      if [ ! -f "$OUT_ROOT/input_file_list.txt" ]; then
         echo "
finding files to process...(if this fails, you can make a list yourself and save to $OUT_ROOT/input_file_list.txt)"
         # DATASET is deliberately unquoted : it may hold several space separated paths
         if [ "$start_with_merged" != "yes" ]; then
            ls $DATASET > "$OUT_ROOT/input_file_list.txt"
         else
            ls $DATASET/*.non-redundant.fasta > "$OUT_ROOT/input_file_list.txt"
            ls $DATASET/*.non-redundant.fasta > "$OUT_ROOT/blast_input_file_list.txt"
         fi
         get_line_count "$OUT_ROOT/input_file_list.txt"
         if [ "$line_count" == 0 ]; then
            echo "sorry could not find any data in $DATASET , for $ANALYSIS"
            exit 1
         fi

         echo "will process the following files: . . . (press Enter/Return for a list - then press space bar to page through listing)"
         read answer
         more "$OUT_ROOT/input_file_list.txt"
         echo "OK ? (default = y)"
         read_answer_with_default y
         if [ "$answer" != "y" ]; then
            echo "
ok quitting - you can edit the file $OUT_ROOT/input_file_list.txt in order to customise the
files to process, or remove it so that the data folder is re-scanned for files to process, and try again"
            exit 1
         fi
      fi

      # we have files to process
      # try getting the keyfile if not there and we need it
      if [[ ( ! -f $OUT_ROOT/sample_info.txt ) && ( $start_with_merged != "yes" ) && ( ( $ANALYSIS == "demultiplex" ) || ( $ANALYSIS == "all" ) || ( $ANALYSIS == "merge_lanes" ) || ( $ANALYSIS == "trim" ) || ( $ANALYSIS == "format" ) ) ]]; then
         echo "trying to get keyfile... "

         set -x
         # try to get SQ number : the part of the first file's name before the first underscore
         for file in `ls $DATASET`; do
            base=`basename $file`
            sq=`echo $base | awk -F_ '{print $1}' -`
            break
         done
         # if we could not get something like an SQ number , prompt for it
         if ! printf '%s\n' "$sq" | grep -Eq "^SQ" > /dev/null 2>&1; then
            echo "unable to guess SQ number from sequence filename - please enter SQ number if applicable"
            read_answer_with_default none
            if [ "$answer" == "none" ]; then
               echo "OK, please create $OUT_ROOT/sample_info.txt, containing sample barcode info, and try again"
               exit 1
            else
               sq=$answer
            fi
         fi

         # extract the keyfile, dropping its first line. The exit status of
         # listDBKeyfile.sh itself (first stage of the pipe) is captured straight
         # away : testing $? later would only see the status of awk or of "set +x"
         if [ -z "$COHORT" ]; then
            $GBS_PRISM_BIN/listDBKeyfile.sh -s "$sq" -t gbsx  | awk '{if(NR>1) print}' - > "$OUT_ROOT/sample_info.txt"
            keyfile_status=${PIPESTATUS[0]}
         else
            $GBS_PRISM_BIN/listDBKeyfile.sh -s "$sq" -g "$COHORT" -t gbsx  | awk '{if(NR>1) print}' - > "$OUT_ROOT/sample_info.txt"
            keyfile_status=${PIPESTATUS[0]}
         fi
         set +x
         if [ "$keyfile_status" != 0 ]; then
            # do not leave an empty or partial keyfile behind after a failed extract
            rm -f "$OUT_ROOT/sample_info.txt" >/dev/null 2>&1
         fi
         if [ ! -f "$OUT_ROOT/sample_info.txt" ]; then
            echo "was unable to get keyfile for this dataset - please create $OUT_ROOT/sample_info.txt, containing sample barcode info, and try again"
            exit 1
         fi
         get_line_count "$OUT_ROOT/sample_info.txt"
         # NOTE(review): a keyfile of exactly one line is treated as "nothing found" -
         # presumably a residual heading line ; an empty file is not caught here - confirm
         if [ "$line_count" == 1 ]; then
            echo "was unable to get keyfile for this run - please create $OUT_ROOT/sample_info.txt, containing sample barcode info, and try again"
            exit 1
         else
            echo "will use the following sample info . . . (press Enter/Return for a list - then press space bar to page through listing)"
            read answer
            more "$OUT_ROOT/sample_info.txt"
            echo "OK ? (default = y)"
            read_answer_with_default y
            if [ "$answer" != "y" ]; then
               echo "
ok quitting - you can edit the file $OUT_ROOT/sample_info.txt in order to customise the
sample info , or remove it so that it is re-extracted, and try again"
               exit 1
            else
               break
            fi
         fi
      fi

      break
   done
}


# Succeed when $1 parses as a (floating point) number.
# The value is passed to python as an argument rather than pasted into the
# program text, and no print statement is involved, so the check gives the same
# result under python 2 and python 3 and cannot be broken by quotes in the value.
function _is_number() {
   python -c 'import sys; float(sys.argv[1])' "$1" >/dev/null 2>&1
}

# Interactive dialogue that collects every setting needed for a run.
#
# Relies on read_answer_with_default (sets "answer") and get_input_files.
# Globals read : melseq_version, COHORT, MELSEQ_PRISM_BIN
# Globals set  : DATASET, DATASET_BASE, start_with_merged, OUT_ROOT, COHORT,
#                HPC_TYPE, NUM_THREADS, ANALYSIS, BLAST_DATABASE, TAXONOMISER,
#                SIMILARITY, BLAST_EXTRA, SEQLENGTH_MIN, SEQQUAL_MIN, WORDSIZE,
#                BLAST_TASK, DRY_RUN (plus the initialisations at the top)
# When restarting from un-merged data the function returns early, with
# ANALYSIS=all, straight after the output folder has been chosen.
# Exits with status 1 if the user declines to continue over existing results
# (get_input_files may also exit).
function get_run_opts() {

   DRY_RUN=no
   DEBUG=no
   HPC_TYPE=slurm
   FILES=""
   OUT_ROOT=""
   BLAST_DATABASE=""
   TAX_DATA=""
   SIMILARITY=""
   SEQLENGTH_MIN=""
   FILE_TYPE=""
   ENZYME_INFO=""
   SAMPLE_INFO=""
   SEQQUAL_MIN=""

   PROCESSING_ROOT=/dataset/hiseq/scratch/postprocessing/melseq


   echo "*** melseq analysis version $melseq_version ***
* note that you can paste into your terminal window by clicking your right mouse button
* you can select text in the terminal window using your left mouse button
* at any stage you can press CTRL-C to exit the dialogs
"

   ####### get and check DATASET
   while [ 1 ] ; do
      echo "
please give either the name of the file(s) or folder(s) you would like to process (or just press enter if you are restarting) 
e.g. 

/dataset/gseq_processing/scratch/illumina/hiseq/190524_D00390_0462_ACDN6VANXX/SampleSheet/bcl2fastq/Research/SQ0990_S2311_L008_R1_001.fastq.gz

/dataset/gseq_processing/active/bin/melseq_prism/test/SQ0990_S2311_L008_R1_sample.fastq.gz

/dataset/gseq_processing/scratch/illumina/hiseq/190614_D00390_0467_ACDR6TANXX/SampleSheet/bcl2fastq/Undetermined_S0_L004_R1_001.fastq.gz

/dataset/hiseq/active/fastq-link-farm/SQ1642_HCH3GDRXY_s_1_fastq.txt.gz

/dataset/sequencing_facility_replication/scratch/microbiome_fasta/qc/SQ1917_HGT5JDRX2_s_1_fastq.txt.gz/merged_fasta   # i.e. starting with merged_fasta from qc pipeline

"
      read_answer_with_default restarting
      if [ "$answer" != "restarting"  ]; then
         # check what was just typed (DATASET has not been assigned yet).
         # unquoted on purpose : several space separated paths may be given
         if ls $answer > /dev/null 2>&1; then
            DATASET="$answer"
            break
         else
            echo "$answer not found"
         fi
      else
         DATASET="$answer"
         break
      fi
   done

   # DATASET_BASE is the base name of the first thing listed for DATASET
   if [ "$DATASET" != "restarting" ]; then
      for file in `ls $DATASET`; do
         DATASET_BASE=`basename $file`
         break
      done
   else
      DATASET_BASE=restarting
   fi

   # test if DATASET looks like a folder containing merged fasta - if so we will start with merged data
   start_with_merged=yes
   TEST=`echo $DATASET | awk '{print $1}' `
   if [ -d "$TEST" ]; then
      if ! ls $DATASET/*.non-redundant.fasta > /dev/null 2>&1; then
         start_with_merged=no
      fi
   else
      start_with_merged=no
   fi
   echo "are you starting with demultiplexed-trimmed-merged-non-redundant fasta (as generated by qc pipeline) ? (press Enter/Return to use default , \"$start_with_merged\")"
   read_answer_with_default $start_with_merged
   start_with_merged=$answer

   # set up output folder
   # NOTE(review): COHORT is only prompted for further down, so at this point it
   # is empty unless it was already set in the calling environment - confirm intent
   if [ "$DATASET_BASE" != "restarting" ]; then
      if [ "$start_with_merged" != yes ]; then
         if [ -z "$COHORT" ]; then
            default=$PROCESSING_ROOT/$DATASET_BASE
         else
            default=$PROCESSING_ROOT/$DATASET_BASE/$COHORT
         fi
      else
         # currently, DATASET will look like /dataset/sequencing_facility_replication/scratch/microbiome_fasta/qc/SQ1917_HGT5JDRX2_s_1_fastq.txt.gz/merged_fasta
         #, from this we would want an output folder of /dataset/hiseq/scratch/postprocessing/melseq/SQ1917_HGT5JDRX2_s_1_fastq.txt.gz
         default=`dirname $DATASET`
         default=`basename $default`
         default=$PROCESSING_ROOT/$default
      fi
   else
      default=""
   fi

   while [ 1 ]; do
      if [ "$DATASET_BASE" == "restarting" ]; then
         echo "please specify the original output base folder"
         read_answer_with_default $default
      else
         echo "please specify output base folder (or just press Enter/Return to use default , $default )"
         read_answer_with_default $default
      fi
      NEW_ROOT=$answer
      # quoted, so that an empty answer is not mistaken for an existing folder
      if [ -d "$NEW_ROOT" ]; then
         echo "warning - $NEW_ROOT already exists, use anyway ? (y/n, default=y)"
         read_answer_with_default y
         if [[ ( $answer == "y" ) || ( -z $answer ) ]]; then
            OUT_ROOT=$NEW_ROOT
            break
         fi
      else
         mkdir -p "$NEW_ROOT"
         if [ -d "$NEW_ROOT" ]; then
            OUT_ROOT=$NEW_ROOT
            break
         fi
      fi
   done

   echo "will use output root folder $OUT_ROOT

   "

   if [ "$DATASET_BASE" == "restarting" ]; then
      if [ "$start_with_merged" != "yes" ]; then
         ANALYSIS=all
         return
      else
         ANALYSIS=format   # start from here at the earliest
      fi
   fi


   ####### get and check cohort
   echo "
if the library contains multiple cohorts, please enter the name of the cohort you wish to process (e.g. sheep)  (default = not applicable, just press enter)
"
   read_answer_with_default ""
   COHORT=$answer


   ####### get and check whether to run locally or on the cluster
   echo "

should this run be queued on the compute cluster ? (y/n, default=y. If n, will be run locally)
"
   read_answer_with_default y
   if [ "$answer" != "n" ]; then
      HPC_TYPE=slurm
   else
      HPC_TYPE=local
      NUM_THREADS=2  # this ensures the number of threads launched locally is not *too* large
   fi

   ####### if they chose local , suggest they don't
   if [ "$HPC_TYPE" == "local" ]; then
      echo "
are you really really really sure you want to run this all on the local machine !? It will take longer, and will launch large numbers of processes  ? (y/n, default=n. If n, will change to SLURM )
"
      read_answer_with_default n
      if [ "$answer" == "n" ]; then
         HPC_TYPE=slurm
         NUM_THREADS=8
         echo "OK will use the compute cluster"
      fi
   fi

   ####### get and check the analysis type
   # NOTE(review): the prompts below offer "clean", but the checks that follow
   # do not accept it - confirm which of the two is intended
   while [ 1 ] ; do
      if [ "$start_with_merged" != "yes" ]; then
         echo "
please give which analysis you want to build (all, demultiplex, trim, format, merge_lanes, blast, summarise, kmer_analysis, html, clean) (or just press enter to do them all)  
"
      else
         echo "
please give which analysis you want to build (blast, summarise, kmer_analysis, html, clean) (or just press enter to do them all)  
"
      fi
      read_answer_with_default all
      ANALYSIS=$answer


      if [ "$start_with_merged" != "yes" ]; then
         if [[ ( $ANALYSIS != "all" ) && ( $ANALYSIS != "demultiplex" ) && ( $ANALYSIS != "trim" ) && ( $ANALYSIS != "format" ) && ( $ANALYSIS != "merge_lanes" ) && ( $ANALYSIS != "blast" ) && ( $ANALYSIS != "kmer_analysis" ) && ( $ANALYSIS != "summarise" ) && ( $ANALYSIS != "html" )  ]] ; then
            echo "analysis must be one of demultiplex, trim , format, merge_lanes, blast, summarise , kmer_analysis, html, clean) "
         else
            break
         fi
      else
         if [[ ( $ANALYSIS != "all" ) && ( $ANALYSIS != "blast" ) && ( $ANALYSIS != "kmer_analysis" ) && ( $ANALYSIS != "summarise" ) && ( $ANALYSIS != "html" )  ]] ; then
            echo "analysis must be one of blast, summarise , kmer_analysis, html, clean) "
         else
            break
         fi
      fi
   done

   echo "will build $ANALYSIS
   "

   # get input files (depends on what the ANALYSIS is )
   get_input_files

   # check if any existing results (one confirmation is enough, hence the break)
   for result_type in demultiplex trim format blast merge_lanes summarise kmer_analysis html; do
      if ls "$OUT_ROOT"/*.${result_type} > /dev/null 2>&1; then
         echo "found some existing results ( e.g. ${result_type} ) under $OUT_ROOT - are you sure you want to continue (e.g. complete an interrupted run) ? (y/n)"
         read_answer_with_default n
         if [[ ( $answer != "y" )  && ( $answer != "Y" ) ]]; then
            echo "ok quitting - please either use a different output folder or clean up $OUT_ROOT"
            exit 1
         fi
         break
      fi
   done

   ####### get and check the blast data to use
   while [ 1 ] ; do
      echo "
please give the full path to the blast database (or just press enter to use default, /dataset/GBS_Rumen_Metagenomes/scratch/blast_analysis/GenusPlusQuinella

(to use the GTDB database , enter /dataset/gseq_processing/scratch/melseq/gtdb/GTDB1 ) 
"
      read_answer_with_default /dataset/GBS_Rumen_Metagenomes/scratch/blast_analysis/GenusPlusQuinella
      BLAST_DATABASE=$answer

      # the database is accepted if either index file exists alongside it
      if [[ -f ${BLAST_DATABASE}.nin || -f ${BLAST_DATABASE}.nal ]]; then
         break
      else
         echo "could not find blast index file ${BLAST_DATABASE}.nin or ${BLAST_DATABASE}.nal"
      fi
   done

   echo "will use blast database $BLAST_DATABASE"


   ####### get and check the taxonomiser to use
   # the default taxonomiser depends on which database was chosen
   TAXONOMISER=$MELSEQ_PRISM_BIN/summarizeR_counts.code
   blast_base=`basename $BLAST_DATABASE`
   if [ "$blast_base" == "GTDB1" ]; then
      TAXONOMISER=$MELSEQ_PRISM_BIN/gtdb/summarizeR_counts.code
   fi
   while [ 1 ] ; do
      echo "
please give the full path to the taxonomiser script  (or just press enter to use default, $TAXONOMISER )

"
      read_answer_with_default $TAXONOMISER
      TAXONOMISER=$answer

      if [ ! -f "$TAXONOMISER" ]; then
         echo "could not find taxonomiser script $TAXONOMISER"
      else
         break
      fi
   done

   echo "will use taxonomiser $TAXONOMISER"


   ####### get and check the similarity to use
   while [ 1 ] ; do
      echo "
please specify the similarity (as an evalue) to use for the taxonomy blast runs (or just press enter to use default, 0.02) 
"
      read_answer_with_default 0.02
      SIMILARITY=$answer

      if _is_number "$SIMILARITY"; then
         break
      fi
      echo "looks like similarity requested ( $SIMILARITY ) is not a number"
   done

   echo "will use similarity $SIMILARITY"


   ####### get and check additional blast options for GTDB
   BLAST_EXTRA=""
   if [ "$blast_base" == "GTDB1" ]; then
      BLAST_EXTRA="-max_hsps 1  -best_hit_overhang .15 -best_hit_score_edge .1"
   fi
   echo "
please enter any custom additional blast parameters for this database (or just press enter to use default, $BLAST_EXTRA  )
"
   read_answer_with_default "$BLAST_EXTRA"
   BLAST_EXTRA="$answer"

   echo "will use additional blast parameters $BLAST_EXTRA"


   ####### get and check the minimum length to use
   while [ 1 ] ; do
      echo "
please specify the minimum seq length (or just press enter to use default, 40) 
"
      read_answer_with_default 40
      SEQLENGTH_MIN=$answer

      if _is_number "$SEQLENGTH_MIN"; then
         break
      fi
      echo "looks like minimum seqlength requested ( $SEQLENGTH_MIN ) is not a number"
   done

   echo "will use minimum seqlength $SEQLENGTH_MIN"


   ####### get and check the minimum quality to use
   while [ 1 ] ; do
      echo "
please specify the minimum seq quality (or just press enter to use default, 20)
"
      read_answer_with_default 20
      SEQQUAL_MIN=$answer

      if _is_number "$SEQQUAL_MIN"; then
         break
      fi
      echo "looks like minimum seq qual requested ( $SEQQUAL_MIN ) is not a number"
   done

   echo "will use minimum seq qual $SEQQUAL_MIN"

   ####### get and check the blast wordsize  to use
   while [ 1 ] ; do
      echo "
please specify the blast wordsize to use (or just press enter to use default, 16)
"
      read_answer_with_default 16
      WORDSIZE=$answer

      if _is_number "$WORDSIZE"; then
         break
      fi
      echo "looks like wordsize requested ( $WORDSIZE ) is not a number"
   done

   echo "will use blast wordsize $WORDSIZE"

   ####### get and check the blast task to use
   while [ 1 ] ; do
      echo "
please specify the blast task to use (either blastn or megablast - just press enter to use default, blastn)
"
      read_answer_with_default blastn
      BLAST_TASK=$answer

      if [[ ( $BLAST_TASK != blastn ) && ( $BLAST_TASK != megablast ) ]]; then
         echo "blast task must be blastn or megablast"
      else
         break
      fi
   done

   echo "will use blast task $BLAST_TASK"

   ####### check whether want a dry run
   # anything other than "no" is treated as a request for a dry run by the callers
   echo "

do you want this to be a dry run ? (will generate wrapper scripts etc. but not run them (just press Enter for default no, or enter y)
"
   read_answer_with_default no
   DRY_RUN=$answer

} # get_run_opts

# Placeholder for validation of the collected options : nothing is checked
# yet, the function only emits an empty line.
function check_opts() {
   printf '\n'
}

# Placeholder for reporting the collected options : nothing is reported yet,
# the function only emits an empty line.
function echo_opts() {
   printf '%s\n' ""
}


#
# edit this method to set required environment (or set up
# before running this script)
#
# Prepares the output folder for a run :
#  * changes the working directory to OUT_ROOT
#  * copies run_melseq.mk and etc/larger_mem_slurm_array_job from MELSEQ_PRISM_BIN
#  * writes tardis_demultiplex.toml and installs it as tardis.toml, first
#    saving any existing tardis.toml as tardis.toml.orig
# Globals read : OUT_ROOT, MELSEQ_PRISM_BIN
# Exits with status 1 if OUT_ROOT is empty or cannot be entered.
function configure_env() {
   # an empty OUT_ROOT would make a bare "cd" go to the home folder, and every
   # file below would then be written relative to the wrong place
   if [ -z "$OUT_ROOT" ]; then
      echo "configure_env : no output folder has been set" >&2
      exit 1
   fi
   if ! cd "$OUT_ROOT"; then
      echo "configure_env : unable to change to output folder $OUT_ROOT" >&2
      exit 1
   fi

   cp "$MELSEQ_PRISM_BIN/run_melseq.mk" "$OUT_ROOT"
   echo "
max_tasks=50
jobtemplatefile = \"$OUT_ROOT/larger_mem_slurm_array_job\"
" > "$OUT_ROOT/tardis_demultiplex.toml"
   if [ -f "$OUT_ROOT/tardis.toml" ]; then
      cp "$OUT_ROOT/tardis.toml" "$OUT_ROOT/tardis.toml.orig"
   fi
   cp "$OUT_ROOT/tardis_demultiplex.toml" "$OUT_ROOT/tardis.toml"
   cp "$MELSEQ_PRISM_BIN/etc/larger_mem_slurm_array_job" "$OUT_ROOT"
}


# Placeholder for environment checks : nothing is checked yet, the function
# only emits an empty line.
function check_env() {
   printf '\n'
}


# Write, under OUT_ROOT, the make target lists and one wrapper script per
# pipeline stage (the wrappers are what make is expected to call).
#
# For every stage name a file run_<stage>_targets.txt is written, holding the
# target name $OUT_ROOT/<moniker>.run_<stage> , where <moniker> is the base name
# of the first input file ("_etc" is appended when there is more than one)
# followed by ".all" or ".<COHORT>".
# Wrapper scripts are written for demultiplex, trim, format, merge_lanes,
# blast, summarise, kmer_analysis and html. Each one runs melseq_prism.sh for
# its stage, logging to $OUT_ROOT/run_<stage>.log (run_html_summary.log for
# html) ; on failure it reports the exit code of melseq_prism.sh and exits 1 ;
# on success most of them write the input file list of the following stage.
# When start_with_merged is "yes" the landmark files of demultiplex, trim,
# format and merge_lanes are created here, so those targets count as done.
#
# Globals read : OUT_ROOT, DATASET, COHORT, FORCE, DRY_RUN, start_with_merged,
#                HPC_TYPE, SEQ_PRISMS_BIN, MELSEQ_PRISM_BIN, GBS_PRISM_BIN,
#                TAXONOMISER, SEQLENGTH_MIN, SEQQUAL_MIN, SIMILARITY,
#                BLAST_DATABASE, WORDSIZE, BLAST_TASK, BLAST_EXTRA
function get_targets() {
   # make target monikers  and write associated
   # wrapper, which will be called by make

   # stop here if OUT_ROOT is empty - the glob below would otherwise be applied
   # to the filesystem root
   : "${OUT_ROOT:?get_targets : no output folder has been set}"
   rm -f "$OUT_ROOT"/*_targets.txt

   # DATASET is deliberately unquoted : it may hold several space separated paths
   project_moniker=""
   for file in `ls $DATASET`; do
      if [ -n "$project_moniker" ]; then
         project_moniker="${project_moniker}_etc"
         break
      else
         project_moniker=`basename $file`
      fi
   done

   if [ -z "$COHORT" ]; then
      project_moniker=${project_moniker}.all
   else
      project_moniker=${project_moniker}.$COHORT
   fi


   for analysis_type in all demultiplex trim format merge_lanes blast summarise kmer_analysis html clean; do
      echo $OUT_ROOT/$project_moniker.run_${analysis_type}  >> "$OUT_ROOT/run_${analysis_type}_targets.txt"
      script=$OUT_ROOT/${project_moniker}.run_${analysis_type}.sh
      # NOTE(review): this check has no effect on what gets written - the wrapper
      # scripts below are always regenerated, FORCE is not set anywhere in this
      # file and there is no -f option. Left untouched pending a decision on
      # whether re-use of existing scripts is wanted.
      if [ -f $script ]; then
         if [ ! $FORCE == yes ]; then
            echo "found existing script $script  - will re-use (use -f to force rebuild of scripts) "
            continue
         fi
      fi
   done


   dry_run_phrase=""
   if [ "$DRY_RUN" != "no" ]; then
      dry_run_phrase="-n"
   fi

   # In every wrapper below the exit status of melseq_prism.sh is saved in rc
   # immediately : inside the "if" body \$? would already be the status of the
   # test itself (always 0) , not that of the failed command.

   ############# demultiplex ##################
   echo "
export SEQ_PRISMS_BIN=$SEQ_PRISMS_BIN 
export MELSEQ_PRISM_BIN=$MELSEQ_PRISM_BIN
export GBS_PRISM_BIN=$GBS_PRISM_BIN
cp $OUT_ROOT/input_file_list.txt $OUT_ROOT/demultiplex_input_file_list.txt
$MELSEQ_PRISM_BIN/melseq_prism.sh $dry_run_phrase -C $HPC_TYPE -a demultiplex -t $TAXONOMISER -l $OUT_ROOT/sample_info.txt -O $OUT_ROOT \`cat $OUT_ROOT/demultiplex_input_file_list.txt\` > $OUT_ROOT/run_demultiplex.log 2>&1
rc=\$?
if [ \$rc != 0 ]; then
   echo \"demultiplex returned an error code ( \$rc )\"
   exit 1
fi
   ls $OUT_ROOT/demultiplex/*.demultiplexed/*.fastq.gz | grep -v undetermined > $OUT_ROOT/trim_input_file_list.txt 
" > "$OUT_ROOT/${project_moniker}.run_demultiplex.sh"
   chmod +x "$OUT_ROOT/${project_moniker}.run_demultiplex.sh"

   # if starting with merged , satisfy this target
   if [ "$start_with_merged" == "yes" ]; then
      date > "$OUT_ROOT/${project_moniker}.run_demultiplex"
   fi


   ############# trim  ##################
   echo "
export SEQ_PRISMS_BIN=$SEQ_PRISMS_BIN
export MELSEQ_PRISM_BIN=$MELSEQ_PRISM_BIN
export GBS_PRISM_BIN=$GBS_PRISM_BIN

$MELSEQ_PRISM_BIN/melseq_prism.sh $dry_run_phrase -C $HPC_TYPE -a trim -t $TAXONOMISER -m $SEQLENGTH_MIN -q $SEQQUAL_MIN -O $OUT_ROOT \`cat $OUT_ROOT/trim_input_file_list.txt\` > $OUT_ROOT/run_trim.log 2>&1
rc=\$?
if [ \$rc != 0 ]; then
   echo \"trim returned an error code ( \$rc )\"
   exit 1
fi
   ls $OUT_ROOT/trimming/*.fastq > $OUT_ROOT/format_input_file_list.txt 
" > "$OUT_ROOT/${project_moniker}.run_trim.sh"
   chmod +x "$OUT_ROOT/${project_moniker}.run_trim.sh"

   # if starting with merged , satisfy this target
   if [ "$start_with_merged" == "yes" ]; then
      date > "$OUT_ROOT/${project_moniker}.run_trim"
   fi

   ############# format   ##################
   echo "
export SEQ_PRISMS_BIN=$SEQ_PRISMS_BIN
export MELSEQ_PRISM_BIN=$MELSEQ_PRISM_BIN
export GBS_PRISM_BIN=$GBS_PRISM_BIN

$MELSEQ_PRISM_BIN/melseq_prism.sh $dry_run_phrase -C $HPC_TYPE -a format -t $TAXONOMISER -O $OUT_ROOT \`cat $OUT_ROOT/format_input_file_list.txt\` > $OUT_ROOT/run_format.log 2>&1
rc=\$?
if [ \$rc != 0 ]; then
   echo \"format returned an error code ( \$rc )\"
   exit 1
fi
   ls $OUT_ROOT/fasta/*.non-redundant.fasta > $OUT_ROOT/blast_input_file_list.txt
" > "$OUT_ROOT/${project_moniker}.run_format.sh"
   chmod +x "$OUT_ROOT/${project_moniker}.run_format.sh"

   # if starting with merged , satisfy this target
   if [ "$start_with_merged" == "yes" ]; then
      date > "$OUT_ROOT/${project_moniker}.run_format"
   fi

   ############# merge lanes ##################
   #
   # ***************************************
   # ********* this target deprecated ******
   # ***************************************
   echo "
export SEQ_PRISMS_BIN=$SEQ_PRISMS_BIN
export MELSEQ_PRISM_BIN=$MELSEQ_PRISM_BIN
export GBS_PRISM_BIN=$GBS_PRISM_BIN

$MELSEQ_PRISM_BIN/melseq_prism.sh $dry_run_phrase -C $HPC_TYPE -a merge_lanes -t $TAXONOMISER -O $OUT_ROOT \`cat $OUT_ROOT/merge_lanes_input_file_list.txt\` > $OUT_ROOT/run_merge_lanes.log 2>&1
rc=\$?
if [ \$rc != 0 ]; then
   echo \"merge_lanes returned an error code ( \$rc )\"
   exit 1
fi
   ls $OUT_ROOT/merged_fasta/*.non-redundant.fasta  > $OUT_ROOT/blast_input_file_list.txt
   ls $OUT_ROOT/merged_fasta/*.non-redundant.fasta  > $OUT_ROOT/kmer_input_file_list.txt
" > "$OUT_ROOT/${project_moniker}.run_merge_lanes.sh"
   chmod +x "$OUT_ROOT/${project_moniker}.run_merge_lanes.sh"

   # if starting with merged , satisfy this target
   if [ "$start_with_merged" == "yes" ]; then
      date > "$OUT_ROOT/${project_moniker}.run_merge_lanes"
   fi

   ############# blast   ##################
   echo "
export SEQ_PRISMS_BIN=$SEQ_PRISMS_BIN
export MELSEQ_PRISM_BIN=$MELSEQ_PRISM_BIN
export GBS_PRISM_BIN=$GBS_PRISM_BIN

$MELSEQ_PRISM_BIN/melseq_prism.sh $dry_run_phrase -C $HPC_TYPE -a blast -t $TAXONOMISER  -s $SIMILARITY -b $BLAST_DATABASE -w $WORDSIZE -T $BLAST_TASK -O $OUT_ROOT -x \"$BLAST_EXTRA\" \`cat $OUT_ROOT/blast_input_file_list.txt\` > $OUT_ROOT/run_blast.log 2>&1
rc=\$?
if [ \$rc != 0 ]; then
   echo \"blast returned an error code ( \$rc )\"
   exit 1
fi
   ls $OUT_ROOT/blast/*.results.gz > $OUT_ROOT/summarise_input_file_list.txt
" > "$OUT_ROOT/${project_moniker}.run_blast.sh"
   chmod +x "$OUT_ROOT/${project_moniker}.run_blast.sh"


   ############# summary   ##################
   echo "
export SEQ_PRISMS_BIN=$SEQ_PRISMS_BIN
export MELSEQ_PRISM_BIN=$MELSEQ_PRISM_BIN
export GBS_PRISM_BIN=$GBS_PRISM_BIN

$MELSEQ_PRISM_BIN/melseq_prism.sh $dry_run_phrase -C $HPC_TYPE -a summarise -t $TAXONOMISER -O $OUT_ROOT \`cat $OUT_ROOT/summarise_input_file_list.txt\` > $OUT_ROOT/run_summarise.log 2>&1
rc=\$?
if [ \$rc != 0 ]; then
   echo \"summarise returned an error code ( \$rc )\"
   exit 1
fi
   ls $OUT_ROOT/summary/*.summary > $OUT_ROOT/html_input_file_list.txt
" > "$OUT_ROOT/${project_moniker}.run_summarise.sh"
   chmod +x "$OUT_ROOT/${project_moniker}.run_summarise.sh"


   ############# kmer analysis   ##################
   echo "
export SEQ_PRISMS_BIN=$SEQ_PRISMS_BIN
export MELSEQ_PRISM_BIN=$MELSEQ_PRISM_BIN
export GBS_PRISM_BIN=$GBS_PRISM_BIN

$MELSEQ_PRISM_BIN/melseq_prism.sh $dry_run_phrase -C $HPC_TYPE -a kmer_analysis -t $TAXONOMISER  -O $OUT_ROOT \`cat $OUT_ROOT/kmer_input_file_list.txt\` > $OUT_ROOT/run_kmer_analysis.log 2>&1
rc=\$?
if [ \$rc != 0 ]; then
   echo \"kmer_analysis returned an error code ( \$rc )\"
   exit 1
fi
" > "$OUT_ROOT/${project_moniker}.run_kmer_analysis.sh"
   chmod +x "$OUT_ROOT/${project_moniker}.run_kmer_analysis.sh"

   ############# html   ##################
   echo "
export SEQ_PRISMS_BIN=$SEQ_PRISMS_BIN
export MELSEQ_PRISM_BIN=$MELSEQ_PRISM_BIN
export GBS_PRISM_BIN=$GBS_PRISM_BIN

$MELSEQ_PRISM_BIN/melseq_prism.sh $dry_run_phrase -C $HPC_TYPE -a html -t $TAXONOMISER  -O $OUT_ROOT \`cat $OUT_ROOT/html_input_file_list.txt\` > $OUT_ROOT/run_html_summary.log 2>&1
rc=\$?
if [ \$rc != 0 ]; then
   echo \"html_summary returned an error code ( \$rc )\"
   exit 1
fi
" > "$OUT_ROOT/${project_moniker}.run_html.sh"
   chmod +x "$OUT_ROOT/${project_moniker}.run_html.sh"
}


# Dry run : records the make command in $OUT_ROOT/restart.src , then runs make
# with -n (print the commands, do not execute them) for the targets listed in
# run_${ANALYSIS}_targets.txt , logging to $OUT_ROOT/dryrun_${ANALYSIS}.log.
# Always finishes by exiting the shell with status 0.
# Globals read : OUT_ROOT, ANALYSIS, NUM_THREADS
function fake_prism() {
   printf '%s\n' "dry run ! " "" "(logging to $OUT_ROOT/run_${ANALYSIS}.log" "" "   "

   # snippet that can be sourced later to repeat the same dry run
   cat > $OUT_ROOT/restart.src <<EOF

export SEQ_PRISMS_BIN=/dataset/gseq_processing/active/bin/melseq_prism/seq_prisms
export GBS_PRISM_BIN=/dataset/gseq_processing/active/bin/gbs_prism
export MELSEQ_PRISM_BIN=/dataset/gseq_processing/active/bin/melseq_prism
export HISEQ_ROOT=/dataset/gseq_processing/scratch/illumina/hiseq
cd $OUT_ROOT
make -f run_melseq.mk -n -d -k  --no-builtin-rules -j $NUM_THREADS \`cat $OUT_ROOT/run_${ANALYSIS}_targets.txt\` > $OUT_ROOT/dryrun_${ANALYSIS}.log 2>&1

EOF

   make -f run_melseq.mk -n -d -k  --no-builtin-rules -j $NUM_THREADS $(cat $OUT_ROOT/run_${ANALYSIS}_targets.txt) > $OUT_ROOT/dryrun_${ANALYSIS}.log 2>&1

   printf '%s\n' "" "** done **" "" "(review at $OUT_ROOT )" ""

   exit 0
}

# Real run : from within OUT_ROOT, records a restart command in
# $OUT_ROOT/restart.src , then runs (and times) make for the targets listed in
# run_${ANALYSIS}_targets.txt , logging to $OUT_ROOT/run_melseq.log.
# The function's status is that of the timed make command.
# Globals read : OUT_ROOT, ANALYSIS, NUM_THREADS
function run_prism() {
   cd $OUT_ROOT

   # snippet that can be sourced to restart the run (it logs to run_melseq_restart.log)
   cat > $OUT_ROOT/restart.src <<EOF

export SEQ_PRISMS_BIN=/dataset/gseq_processing/active/bin/melseq_prism/seq_prisms
export GBS_PRISM_BIN=/dataset/gseq_processing/active/bin/gbs_prism
export MELSEQ_PRISM_BIN=/dataset/gseq_processing/active/bin/melseq_prism
export HISEQ_ROOT=/dataset/gseq_processing/scratch/illumina/hiseq
cd $OUT_ROOT
time make -f run_melseq.mk -d -k  --no-builtin-rules -j $NUM_THREADS \`cat $OUT_ROOT/run_${ANALYSIS}_targets.txt\` > $OUT_ROOT/run_melseq_restart.log 2>&1

EOF

   printf '%s\n' \
      "" \
      "starting run , logging to $OUT_ROOT/run_melseq.log " \
      "" \
      "(if you need to kill or restart this run, you can usually restart using " \
      "" \
      "source $OUT_ROOT/restart.src" \
      "" \
      ")" \
      ""

   time make -f run_melseq.mk -d -k  --no-builtin-rules -j $NUM_THREADS $(cat $OUT_ROOT/run_${ANALYSIS}_targets.txt) > $OUT_ROOT/run_melseq.log 2>&1
}

# Make sure the html folder exists under OUT_ROOT ; nothing else is done yet.
function html_prism() {
   local html_dir=$OUT_ROOT/html
   mkdir -p $html_dir
}

# Placeholder for the client report step : no report is produced yet, the
# function only emits an empty line.
function clientreport_prism() {
   printf '\n'
}

# Clean-up hook. The actual cleaning is done by the called scripts, so this
# only emits an empty line.
function clean() {
   printf '%s\n' ""
}


# Entry point.
#
# Batch mode (-b) : only "-a format" is supported. Fixed settings are applied,
# the file names given on the command line are written to
# $OUT_ROOT/input_file_list.txt , and the run is configured and started.
# Interactive mode : the settings are collected by get_run_opts ; the wrapper
# scripts are then (re)generated unless restarting, and either a dry run or a
# real run follows, the latter with a check_processing.sh audit on success.
#
# Arguments : the script's command line (see get_opts)
# Exits with status 1 on an unsupported batch analysis, a missing batch output
# folder, or a failed run.
function main() {
   get_opts "$@"

   if [ "$BATCH_MODE" == "yes" ]; then
      if [ "$ANALYSIS" != "format" ]; then
         echo "in batch mode, only the fasta format option has been tested so far"
         exit 1
      fi

      # the input list is written into OUT_ROOT straight away, so the folder has
      # to be known and present ( an empty OUT_ROOT would point at the root folder)
      if [ -z "$OUT_ROOT" ] || [ ! -d "$OUT_ROOT" ]; then
         echo "in batch mode, an existing output folder must be given using -O" >&2
         exit 1
      fi

      # set some options and initialise some variables (some irrelevant to this task but need to satisfy later option checking)
      start_with_merged=no
      # all names go on one line ; the wrapper scripts read the list with cat, which splits on any whitespace
      echo $DATASET > "$OUT_ROOT/input_file_list.txt"
      COHORT=""
      HPC_TYPE=slurm
      NUM_THREADS=8
      BLAST_DATABASE=/dataset/GBS_Rumen_Metagenomes/scratch/blast_analysis/GenusPlusQuinella
      TAXONOMISER=/dataset/gseq_processing/active/bin/melseq_prism/summarizeR_counts.code
      SIMILARITY=0.02
      SEQLENGTH_MIN=40
      SEQQUAL_MIN=20
      WORDSIZE=16
      BLAST_TASK=blastn
      DRY_RUN=no

      check_env
      configure_env
      get_targets
      if run_prism; then
         clean
         echo "* done clean *"  # mainly to yield zero exit code
      else
         # run_prism logs make to run_melseq.log ; the stage wrappers log to run_<stage>.log
         echo "error state from  run - skipping html page generation and clean-up
         (check $OUT_ROOT/run_melseq.log and $OUT_ROOT/run_${ANALYSIS}.log )"
         exit 1
      fi
   else
      get_run_opts
      check_opts
      echo_opts
      check_env
      configure_env

      if [ "$ANALYSIS" == "clientreport" ]; then
         clientreport_prism
      else
         # when restarting, the wrapper scripts and target lists already in OUT_ROOT are used as they are
         if [ "$DATASET" != "restarting" ]; then
            get_targets
         fi
         if [ "$DRY_RUN" != "no" ]; then
            fake_prism
         else
            if run_prism; then
               echo "counting up inputs and outputs at each stage for audit..."
               $MELSEQ_PRISM_BIN/check_processing.sh "$OUT_ROOT" > "$OUT_ROOT/check_processing.log" 2>&1
               clean
               echo "* done clean *"  # mainly to yield zero exit code
            else
               # run_prism logs make to run_melseq.log ; the stage wrappers log to run_<stage>.log
               echo "error state from  run - skipping html page generation and clean-up
               (check $OUT_ROOT/run_melseq.log and $OUT_ROOT/run_${ANALYSIS}.log )"
               exit 1
            fi
         fi
      fi
   fi
}

# script entry point : pass every command-line argument to main unchanged
main "$@"