POGENOM_poly.sh

#!/bin/bash -l
wkd=$(pwd)
err_report() {
  echo "Error on line $1 - script POGENOM_poly.sh"
  if [ "$1" >= 236 ]; then echo "TIP - Look at the file $outdir/log_pogenom/pogenom_*$name.log"; fi
  if test -f "tempo"; then rm tempo; fi
  exit 1
}
trap 'err_report $LINENO' ERR

# --- default parameters
NS=1 # Minimum number of samples
extra_params="--min_count 10 --subsample 10"  # Extra pogenom parameters
VCF_text="Required"
sizedir=$wkd/Input_POGENOM/Genome_sizes # Directory with Genome size files
gffdir="none" # Directory with gff files
outdir=POGENOM_OUTPUT # pogenom's Output directory
RI=no # option for forcing re-execution
dirfasta=none # Direcotry with genome's fasta files, e.g., dirfasta=$wkd/Input_POGENOM/RAW_DATA/Genomes/$dataset
ccode=$wkd/standard_genetic_code.txt # path to standard_genetic_code.txt file
rloci=none # Minimum ratio to include a loci in the analysis: number of samples with the loci/total number of samples in vcf file. Float number (upto 1) or 'none'
configFile=$wkd/Input_POGENOM/config_files/Input_POGENOM_config.json # path to Input_POGENOM config file

#Converting json to bash configuration and setting default parameters according to the Input_POGENOM parameter used
if [ -s $configFile ]; then
cat $configFile | sed s/"[{|}]"//g | sed s/":"/"="/g | sed s/",$"//g | sed s/'"'//g | sed s/" "//g > tempo
. tempo

extra_params="--min_count $min_coverage --subsample $min_coverage"
gffdir=$wkd/Input_POGENOM/GFF_files/$dataset
params="params_cov_"$min_coverage"_bdth_"$min_breadth"_mpq_"$mapqual"_bq_"$min_bsq_for_cov_median_calculation
  if [ "$mode" == "prefilt" ]; then dts=$dataset"_prefilt"; else dts=$dataset; fi
vcfdir=$wkd/Input_POGENOM/06_VCF/$dts/$params # path to directory containing VCF files genearted with Input_POGENOM
VCF_text="Default = $vcfdir"
fi

#---------------Arguments
Usage="\nUsage: bash POGENOM_poly.sh [options]\n -d=<absolute path to directory containing pogenom.pl, i.e. working directory, default=$wkd >\n\
 -o=<Output directory name, default=$outdir >\n\
 -v=<absolute path to directory containing VCF files, $VCF_text >\n\
 -s=<absolute path to directory containing Genome sizes or 'none', default=$sizedir >\n\
 -f=<absolute path to directory containing Genome sequences - fasta files or 'none', default=$dirfasta >\n\
 -g=<absolute path to directory containing GFF files or 'none', default=$gffdir >\n\
 -c=<absolute path to file containing standard_genetic_code or 'none', default=$ccode >\n\
 -e=<space-separated and single-quoted string of extra pogenome.pl commands. If not required, set an empty string, i.e., -e=''. Default='$extra_params' >.\n\
 -n=<Minimum number of samples in vcf file. Default=$NS  >\n\
 -r=<Force the re-execution, yes or no. Default=$RI >\n\
 -l=<Minimum ratio to include a loci in the analysis: number of samples with the loci/total number of samples in vcf file. Float number (upto 1) or 'none'. Defaul=$rloci >\n\
  if this flag is used, pogenom's --min_found flag will be set to the closest integer of (ratio*Number of samples in vcf file)\n"

for i in "$@"
do
case $i in
   -v=*|--vcf_directory=*)
    if [ -z "${i#*=}" ];  then echo "value to argument -v No supplied"; exit 0; else vcfdir="${i#*=}"; fi
    shift # past argument
    ;;

   -s=*|--size_directory=*)
    if [ -z "${i#*=}" ];  then echo "value to argument -s No supplied"; exit 0; else sizedir="${i#*=}"; fi
    shift # past argument
    ;;

   -g=*|--gff_directory=*)
    if [ -z "${i#*=}" ];  then echo "value to argument -g No supplied"; exit 0; else gffdir="${i#*=}"; fi
    shift # past argument
    ;;

   -f=*|--fasta_directory=*)
    if [ -z "${i#*=}" ];  then echo "value to argument -f No supplied"; exit 0; else dirfasta="${i#*=}"; fi
    shift # past argument
    ;;

   -d=*|--wk_directory=*)
    if [ -z "${i#*=}" ];  then echo "value to argument -d No supplied"; exit 0; else wkd="${i#*=}"; fi
    shift # past argument
    ;;

   -c=*|--standard_genetic_code=*)
    if [ -z "${i#*=}" ];  then echo "value to argument -c No supplied"; exit 0; else ccode="${i#*=}"; fi
    shift # past argument
    ;;

   -o=*|--out_dir_name=*)
    if [ -z "${i#*=}" ];  then cho "value to argument -o No supplied"; exit 0; else outdir="${i#*=}"; fi
    shift # past argument
    ;;

    -r=*|--re_run=*)
    RI="${i#*=}"
    shift # past argument
    ;;

    -e=*|--extr_prs=*)
    extra_params="${i#*=}"
    shift # past argument
    ;;

    -l=*|--loci_ratio=*)
    rloci="${i#*=}"
    shift # past argument
    ;;

    -n=*|--num_samples=*)
    NS="${i#*=}"
    shift # past argument
    ;;

    *)
    echo -e "${Usage}"
    echo -e "Description:\nThis program runs POGENOM on several Genomes, using files generated by Input_POGENOM"
    if test -f "tempo"; then rm tempo; fi
    exit 0
    ;;

esac
done

# --checking paths -----
if [[ "$vcfdir" != /* ]]; then
    echo "Please provide an absolute path -v=/absolute/path/to/VCF_files"
      if test -f "tempo"; then rm tempo; fi
    exit 1
fi

if [[ "$sizedir" != /* ]] && [[ "$sizedir" != none ]]; then
    echo "Please provide an absolute path -s=/absolute/path/to/Genome_sizes"
      if test -f "tempo"; then rm tempo; fi
    exit 1
fi

if [[ "$gffdir" != /* ]] && [[ "$gffdir" != none ]]; then
    echo "Please provide an absolute path -g=/absolute/path/to/GFF_sizes"
      if test -f "tempo"; then rm tempo; fi
    exit 1
fi

if [[ "$dirfasta" != /* ]] && [[ "$dirfasta" != none ]]; then
    echo "Please provide an absolute path -f=/absolute/path/to/GFF_sizes"
      if test -f "tempo"; then rm tempo; fi
    exit 1
fi

if [[ "$wkd" != /* ]]; then
    echo "Please provide an absolute path -d=/absolute/path/to/directory containing pogenom.pl, i.e., working directory"
      if test -f "tempo"; then rm tempo; fi
    exit 1
fi
# Cheking Minimum size requirement
if [ ! -d "$vcfdir" ] || [ -z "$(ls $vcfdir/*.vcf)" ]; then # If directory doesn-t exists, or it is empty
    echo "Please provide a valid directory with Genome vcf files"
if test -f "tempo"; then rm tempo; fi
exit 1
fi

if [[ "$sizedir" != none ]] && ([ ! -d "$sizedir" ] || [ -z "$(ls $sizedir/*.size)" ]); then # If directory doesn-t exists, or it is empty
    echo "Please provide a valid directory with Genome size files"
if test -f "tempo"; then rm tempo; fi
exit 1
fi

if [[ "$dirfasta" != none ]] && ([ ! -d "$dirfasta" ] || [ -z "$(ls $dirfasta/*.fa)" ]); then # If directory doesn-t exists, or it is empty
    echo "Please provide a valid directory with Genome fasta files"
if test -f "tempo"; then rm tempo; fi
exit 1
fi

if [[ "$sizedir" == none ]] && [[ "$dirfasta" == none ]]; then
  echo -e "\nPlease provide argument for flag -s or -f"
  echo -e "$Usage"
    if test -f "tempo"; then rm tempo; fi
  exit 1
fi

if [[ "$gffdir" != none ]] && ([ ! -d "$gffdir" ] || [ -z "$(ls $gffdir/*.gff)" ]); then # If directory doesn-t exists, or it is empty
    echo "Please provide a valid directory with Genome gff files"
if test -f "tempo"; then rm tempo; fi
exit 1
fi

# ----------pogenom extra parameters
if [ -n "$extra_params" ]; then extra=$(echo "-e='$extra_params'") ; fi
# -----------------------------
# ****** MAIN ******
if [[ "$sizedir" != none ]] && [[ "$dirfasta" != none ]]; then
  echo -e "INFO: Genome size will be calculated from the Genome fasta files and flag -s will be ginored, -s=none"
  sizedir="none"
fi

echo -e "INFO: POGENOM_poly.sh -o=$outdir -v=$vcfdir -s=$sizedir -g=$gffdir -d=$wkd -r=$RI -n=$NS -c=$ccode -f=$dirfasta -l=$rloci $extra"

if [ "$RI" == yes ]; then echo "Option - Force the re-execution - activated"; fi

cd $vcfdir
Lgs=($(ls *.vcf)) # list of VCF files
cd $wkd
mkdir -p $wkd/$outdir/log_pogenom
for f in "${Lgs[@]}" # for each VCF file
  do
    name=$( echo "${f%.vcf}" ) # name of VCF file
     if [ ! -d "$wkd/$outdir/$name" ] || [  -z "$(ls -A $wkd/$outdir/$name)" ] || [ "$RI" == yes ]; then # If directory doesn-t exists, or it is empty or Re-execution option is activated

         if [ "$dirfasta" != none ] && [ ! -s "$dirfasta/$name$genomes_ext" ]; then #if fasta file is provided when required
            echo -e "INFO: POGENOM on Genome: $name cannot be run, the file $dirfasta/$name$genomes_ext doesn't exit\n      please provide a fast (flag -f) or size (flag -s) file"
            continue
         fi
         if [ "$dirfasta" == none ] && [ ! -s "$sizedir/$name.size" ]; then #if size file is provided when required
           echo -e "INFO: POGENOM on Genome: $name cannot be run, the file $sizedir/$name.size doesn't exit\n      please provide a fast (flag -f) or size (flag -s) file"
           continue
         fi

         if [ "$dirfasta" == none ]; then # If size will be calculated from size file and not from fast file
         sizefile=$(echo "$sizedir/$name.size")
         size=$( cat $sizefile | cut -d ":" -f2 | tr -d " ")
         fi

        inputfile=$vcfdir/$f
        n_samples=$( echo "$(grep "^#" $inputfile | grep -v "^##" | wc -w) - 9" | bc ) # number of samples
        if [[ "$n_samples" -lt "0" ]]; then n_samples="0" ; fi
        if [[ "$n_samples" -ge "$NS" ]]; then # if the number of samples in the vcf files is higher than the user-defined Minimum
          echo "INFO: Running POGENOM on Genome: $name"
          echo "      Number of samples: $n_samples"
          if [ "$rloci" != none ]; then
             minfound=$( echo "$rloci*$n_samples" | bc | awk '{print ($0-int($0)<0.499)?int($0):int($0)+1}')
             if [ "$minfound" == 0 ]; then minfound=1; fi # --min_found 1 default. It is the lowest value for this flag in pogenom.pl
             if [ "$minfound" -eq $n_samples ]; then minfound=0; fi # --min_found 0 means all samples in pogenom.pl
              new_params=$(echo $extra_params | sed s/" --min_found [0-9]*"//) # removing --min_found flag if the user has provided it at the same time with flag -l ( the aim is to avoid flag conflicts)
              extra_params="$new_params --min_found $minfound" # Setting pogenom.pl extra-parameters
              echo "      --min_found is set to $minfound"
          fi
        #Pogenome according to user specifications
            if [ -s "$gffdir/$name.gff" ] && [[ "$gffdir" != none ]]; then # if gff file exists when the user wants to use
              if [[ "$ccode" != none ]] && [[ -s "$ccode" ]]; then # if standard_genetic_code file exits, it is not empty and the user wants to use
                 if [[ "$dirfasta" != none ]]; then # When size will be calculated from fast file
                   echo "perl $wkd/pogenom.pl --vcf_file $inputfile --out Pogenom_$name --fasta_file $dirfasta/$name$genomes_ext --gff_file $gffdir/$name.gff --genetic_code_file $ccode $extra_params"
                   perl $wkd/pogenom.pl --vcf_file $inputfile --out Pogenom_$name --fasta_file $dirfasta/$name$genomes_ext --gff_file $gffdir/$name.gff --genetic_code_file $ccode $extra_params > $outdir/log_pogenom/pogenom_gff_fasta_$name.log
                   mkdir -p $wkd/$outdir/$name
                   mv Pogenom_$name.* $wkd/$outdir/$name/.
                 else # When genome size will be taken from size file
                   contigs_seq_in_gff=$(echo $(grep -c "##FASTA" $gffdir/$name.gff | bc )) # checking right header (and by consequence, the presence of contigs sequences in gff file)
                   if [ "$contigs_seq_in_gff" -ne 1 ]; then
                     echo -e "Error: Pogenome cannot be run for $name with the current parameters.\n      The $gffdir/$name.gff file must contain contigs sequences and the line before the first conting should be ##FASTA.\n      You should use the flag -f instead of -s"
                     continue
                   fi
                   echo "perl $wkd/pogenom.pl --vcf_file $inputfile --out Pogenom_$name --genome_size $size --gff_file $gffdir/$name.gff --genetic_code_file $ccode $extra_params"
                   perl $wkd/pogenom.pl --vcf_file $inputfile --out Pogenom_$name --genome_size $size --gff_file $gffdir/$name.gff --genetic_code_file $ccode $extra_params > $outdir/log_pogenom/pogenom_gff_geneticode_$name.log
                   mkdir -p $wkd/$outdir/$name
                   mv Pogenom_$name.* $wkd/$outdir/$name/.
                 fi
              else
                  if [[ "$dirfasta" != none ]]; then # When size will be calculated from fast file
                    echo "perl $wkd/pogenom.pl --vcf_file $inputfile --out Pogenom_$name --fasta_file $dirfasta/$name$genomes_ext --gff_file $gffdir/$name.gff $extra_params"
                    perl $wkd/pogenom.pl --vcf_file $inputfile --out Pogenom_$name --fasta_file $dirfasta/$name$genomes_ext --gff_file $gffdir/$name.gff $extra_params > $outdir/log_pogenom/pogenom_gff_$name.log
                    mkdir -p $wkd/$outdir/$name
                    mv Pogenom_$name.* $wkd/$outdir/$name/.
                  else # When genome size will be taken from size file
                    echo "perl $wkd/pogenom.pl --vcf_file $inputfile --out Pogenom_$name --genome_size $size --gff_file $gffdir/$name.gff $extra_params"
                    perl $wkd/pogenom.pl --vcf_file $inputfile --out Pogenom_$name --genome_size $size --gff_file $gffdir/$name.gff $extra_params > $outdir/log_pogenom/pogenom_gff_$name.log
                    mkdir -p $wkd/$outdir/$name
                    mv Pogenom_$name.* $wkd/$outdir/$name/.
                  fi
              fi
            else
              if [[ "$dirfasta" != none ]]; then # When size will be calculated from fast file
                echo "perl $wkd/pogenom.pl --vcf_file $inputfile --out Pogenom_$name --fasta_file $dirfasta/$name$genomes_ext $extra_params"
                perl $wkd/pogenom.pl --vcf_file $inputfile --out Pogenom_$name --fasta_file $dirfasta/$name$genomes_ext $extra_params > $outdir/log_pogenom/pogenom_size_$name.log
                mkdir -p $wkd/$outdir/$name
                mv Pogenom_$name.* $wkd/$outdir/$name/.
              else # When genome size will be taken from size file
                echo "perl $wkd/pogenom.pl --vcf_file $inputfile --out Pogenom_$name --genome_size $size $extra_params"
                perl $wkd/pogenom.pl --vcf_file $inputfile --out Pogenom_$name --genome_size $size $extra_params > $outdir/log_pogenom/pogenom_size_$name.log
                mkdir -p $wkd/$outdir/$name
                mv Pogenom_$name.* $wkd/$outdir/$name/.
              fi
            fi
        else
            echo "INFO: $f has lower number of samples ( $n_samples ) than cutoff ( $NS )"
        fi
     else
       echo "Directory $outdir/$name already exists"
     fi
  done
echo "INFO: *** Please find the results in the directory $wkd/$outdir ***"

if test -f "tempo"; then rm tempo; fi