build_sp

#!/bin/bash
#############################################
############################################
# build species database
############################################
############################################

cd $WDR
GDR=$(readlink -f $GEN_DIR)
echo "$package option activated"
echo "$WDR is present directory" 
echo "$GDR is the genomes directory"
echo "$ODR is the output directory"

cd $GDR
exec 3>&2
exec 2> /dev/null
if [ "$LIST" == "false" ]; then
ls {*.fna,*.fa,*.fasta} > $ODR/genomes.txt #suppress error to std output
else
cat $LIS > $ODR/genomes.txt
fi
exec 2>&3

echo "####### PRE PROCESSING GENOMES #####
"

####################
exec 3>&2
exec 2> /dev/null
for f in `cat $ODR/genomes.txt`
do
awk '{if (substr($0,1) ~ "plasmid" ) censor=1; else if (substr($0,1,1) == ">") censor=0; if (censor==0) print $0}' $f > $ODR/$f.noplasmid

cd $ODR
awk '{if (substr($0,1) ~ "phage" ) censor=1; else if (substr($0,1,1) == ">") censor=0; if (censor==0) print $0}' $f.noplasmid > $f.noplasmid1
awk '{if (substr($0,1) ~ "extrachomosomal" ) censor=1; else if (substr($0,1,1) == ">") censor=0; if (censor==0) print $0}' $f.noplasmid1 > $f.noplasmid
rename $f.noplasmid $f $f.noplasmid
rm $f.noplasmid1
num_of_contigs=$(grep -c ">" $f)
echo -e "$f\t$num_of_contigs" >> num_of_contigs.txt
cd $GDR
done
exec 2>&3
###################

cd $ODR

if [[ ! -s num_of_contigs.txt ]]; then 
echo "ERROR: Appropriate fasta files not found in Genomes directory"
exit 1
fi

awk '$2 == 1' num_of_contigs.txt | cut -f1 > reference_genomes.txt
awk 'FNR==NR { a[$NF]; next } !($NF in a)' reference_genomes.txt <(cut -f1 num_of_contigs.txt ) >  draft_genomes.txt 

rm num_of_contigs.txt
rm genomes.txt
mkdir reordered_contigs
#################################################

if [ -s reference_genomes.txt ]; then 
echo "Complete reference genome(s) detected" 
else
echo "ERROR: At least, a complete reference genome is required to reorder contigs of draft genomes"
rm reference_genomes.txt draft_genomes.txt
exit 1
fi


for f in `cat reference_genomes.txt`
do
cd $GDR
grep -v ">" $f | tr -d '[:space:]' | fold -w 60 | sed "1 i\>$f" | sed -e '$a\ ' > $ODR/reordered_contigs/$f.fna
cd $ODR
done

######### get position of dnaA ######
cd reordered_contigs
ls *.fna > reordered_ref_genomes
cat $(cat reordered_ref_genomes) > $ODR/ref.fa
rm reordered_ref_genomes
cd $ODR

blastn -query ref.fa -db $SMEG_DIR/dnaA_database/dnaa.fasta -evalue 0.05 -num_threads $NUM_THREAD -max_target_seqs 1 -out dnaa_id -word_size 11 -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen" || exit 1


cut -f1 dnaa_id | sort | uniq > temp.txt
for i in `cat temp.txt`
do
grep -w "$i" dnaa_id | sort -nr -k12,12 | head -1 | cut -f1,7,13 >> temp.dnaA_output.txt
done

###### User defined Representative genome
if [ "x$REPGEN" != "x" ]; then
 grep -P '(^|\s)\K'$REPGEN'(?=\s|$)' temp.dnaA_output.txt | awk  '{printf "%s\t%4.3f\n", $1 , $2/$3 }'  > dnaA_output.txt 
else
awk  '{printf "%s\t%4.3f\n", $1 , $2/$3 }' temp.dnaA_output.txt | sort -n -k2,2 | head -1 > dnaA_output.txt
fi

rm temp* dnaa_id ref.fa


if [[ ! -s dnaA_output.txt ]]; then
repGenome=$(head -1 reference_genomes.txt)
dnaA_pos=0.5
else
repGenome=$(cut -f1 dnaA_output.txt)
dnaA_pos=$(cut -f2 dnaA_output.txt)
fi

rm dnaA_output.txt
#########################

no_of_comp_genomes=$(grep -c "." reference_genomes.txt)
no_of_draft_genomes=$(grep -c "." draft_genomes.txt)

echo "Number of complete genomes = $no_of_comp_genomes" >> log.txt
echo "Number of draft genomes = $no_of_draft_genomes" >> log.txt
echo "Selected representative genome is $repGenome" >> log.txt
echo "dnaA position relative to ori is $dnaA_pos " >> log.txt
echo " " >> log.txt
 
if [ -s draft_genomes.txt ]; then
############## Reorder contigs using Mauve ###########################
echo "####### RE-ORDERING CONTIGS OF DRAFT GENOME(S) (MAUVE)#####
"
for i in `cat draft_genomes.txt`
do
mkdir $i.dir
prefix=$(type -P Mauve |  rev | cut -d'/' -f3- | rev)
mauvepath=$(find $prefix -name "Mauve.jar")
echo "java -cp $mauvepath -Djava.awt.headless=true org.gel.mauve.contigs.ContigOrderer -output $ODR/$i.dir -ref $ODR/reordered_contigs/$repGenome.fna -draft $ODR/$i" >> parallel_commands
done
cat parallel_commands | parallel '{}'  >> Mauve.error.log.txt 2>&1

rm parallel_commands

for i in `cat draft_genomes.txt`
do
cd $i.dir
aa=$(ls alignment*/$i.fas | sort | tail -1) || exit 1
grep -v ">" $aa | tr -d '[:space:]' | fold -w 60 | sed "1 i\>$i" | sed -e '$a\ ' > $ODR/reordered_contigs/$i.fna || exit 1
cd ../
done
rm Mauve.error.log.txt
else
echo "Only complete genomes identified"
fi


exec 3>&2
exec 2> /dev/null
rm reference_genomes.txt
rm draft_genomes.txt
rm -rf *.dir
rm temp.txt
rm {*.fna,*.fa,*.fasta}
exec 2>&3

########### Run prokka ################
cd $ODR/reordered_contigs
ls *.fna > ../genomes.txt
cd $ODR
mkdir gff
for f in `cat genomes.txt`
do
aa=$( echo "$f" | rev | cut -d'.' -f2- | rev)
cat $ODR/reordered_contigs/$f | sed 's/>.*/>temp/' > $ODR/$f
echo "prokka --quiet --kingdom Bacteria --outdir $aa.dir --locustag $aa --prefix $aa $ODR/$f" >> parallel_commands
done

echo "####### ANNOTATING GENOMES (PROKKA) #####
"


cat parallel_commands | parallel '{}'  >> prokka.error.log.txt 2>&1
for f in *.dir
do
cp $f/*.gff gff/. || exit 1
rm -rf $f
done

exec 3>&2
exec 2> /dev/null
rm {*.fna,*.fa,*.fasta}
exec 2>&3

rm prokka.error.log.txt
rm parallel_commands
rm genomes.txt
############ Run Roary ####################
cd $ODR
echo "####### CORE-GENOME ANALYSIS (ROARY) #####
"
roary -p $NUM_THREAD -f ./Roary -e -n -s $ODR/gff/*.gff || exit 1
rm -rf gff/


################ Generate phylogenetic tree #########

FastTree -nt -quiet -gtr $ODR/Roary/core_gene_alignment.aln > tree.newick || exit 1


######## If -e flag is chosen ######
if [ "$REF_ONLY" == "true" ]; then
Rscript $SMEG_DIR/create_clusters.R -i clusterOutput.txt -c 0
cp $ODR/Roary/core_alignment_header.embl . 
cp $ODR/Roary/core_gene_alignment.aln .
if [ "$KEEP" == "false" ]; then
rm -rf $ODR/Roary
fi
 exit 0
fi


############ Now build SMEG database ########
cp reordered_contigs/$repGenome.fna .
 
echo "####### BUILDING SMEG DATABASE(S) #####
"

cat <<EOT >> parallel.sh
#!/bin/bash
if [ "$IGNORE_ITER" == "true" ]; then
count=2
else
count=0
fi

touch breakCluster.txt
until [ ! \$count -lt 3 ]
do
Rscript $SMEG_DIR/create_clusters.R -i clusterOutput.txt -c 0

if ! [[ -s clusterOutput.txt ]]; then echo "ERROR: clusterOutput.txt empty. Unable to generate clusters"; exit 1 ; fi
$SMEG_DIR/uniqueClusterSNP $ODR/Roary/core_gene_alignment.aln clusterOutput.txt $NUM_THREAD $SAT ClusterSNPs.txt
cut -f3 ClusterSNPs.txt | grep -v "cluster" | sort | uniq -c | sed -r 's/^ *([0-9]+)/\1\t/' | awk '\$2 != 0 && \$1 < '$CNT'' | cut -f2 > breakCluster.txt || exit 1
num_of_cluS_rerun=\$(grep -c "." breakCluster.txt)
count=\`expr \$count + 1\`
echo "iteration count \$count"

if [ \$num_of_cluS_rerun -lt 1 ]
then
count=\`expr \$count + 100000\`
fi
done

rm breakCluster.txt
awk '\$3 != 0' ClusterSNPs.txt > ClusterSNPs_final.txt
rm ClusterSNPs.txt

samtools faidx $ODR/Roary/core_gene_alignment.aln $repGenome > $repGenome.aln
Rscript $SMEG_DIR/getPositionWithoutGaps.R -i ClusterSNPs_final.txt -x $repGenome.aln -m 0

grep "label=" $ODR/Roary/core_alignment_header.embl | cut -f2 -d'=' > core_genes.txt
grep "feature" $ODR/Roary/core_alignment_header.embl | rev | cut -d' ' -f1 | rev | sed 's/\../ /g' | awk '{print (\$1 - 1) "\t" \$2}' | sed "s/^/$repGenome	/" > core_gene_coordinate_in_align

paste -d'\t' core_gene_coordinate_in_align core_genes.txt > bedfile
rm core_gene_coordinate_in_align core_genes.txt

samtools faidx $repGenome.aln
bedtools getfasta -fi $repGenome.aln -bed bedfile -name | sed 's/-//g'  | fold -w 60  >> core_genes2.fa
awk 'BEGIN {RS = ">" ; FS = "\n" ; ORS = ""} \$2 {print ">"\$0}' core_genes2.fa > core_genes.fa
rm core_genes2.fa
rm bedfile

awk '/^>/ {if (seqlen){print seqlen}; print ;seqlen=0;next; } { seqlen += length(\$0)}END{print seqlen}' core_genes.fa | grep -v ">" > geneLengths
grep ">" core_genes.fa | sed 's/>//g' > geneNames
paste <(cat geneLengths | awk '{total += \$0; \$0 = total - \$0}1') <(cat geneLengths | awk '{total += \$0; \$0 = total}1') <(cat geneNames) --delimiters '\t' > geneCoordinates.txt
rm geneLengths geneNames
########
makeblastdb -in $repGenome.fna -parse_seqids -dbtype nucl
blastn -query core_genes.fa -db $repGenome.fna -evalue 5 -num_threads 1 -max_target_seqs 1 -outfmt 6 -out $repGenome.blast.txt -word_size 11

cut -f1 $repGenome.blast.txt | sort | uniq > tmp.txt

for i in \`cat tmp.txt\`
do
grep -w "\$i" $repGenome.blast.txt | sort -nr -k12,12 | head -1 | cut -f2,9,10 > temp1
grep -w "\$i" $repGenome.blast.txt | sort -nr -k12,12 | head -1 | cut -f1 > temp2
paste -d'\t' temp1 temp2 >> $repGenome.core.geneCood.txt
done

rm temp* tmp.txt
rm $repGenome.blast.txt core_genes.fa

########################
sed '1d' modified_uniq_cluster_SNPs.txt | cut -f1 | sort | uniq > clusters.txt || exit 1
###########
Rscript $SMEG_DIR/getPositioninRef.R -i modified_uniq_cluster_SNPs.txt -x $repGenome.core.geneCood.txt -y geneCoordinates.txt
##########

for strains in \`cat clusters.txt\`
do
grep -P '(^|\s)\K'\$strains'(?=\s|$)' newcoordinates.txt  > \$strains.Input.txt || exit 1
done

rm newcoordinates.txt modified_uniq_cluster_SNPs.txt ClusterSNPs_final.txt tree.newick
rm $repGenome.aln $repGenome.aln.fai
rm $repGenome.fna.n*

mkdir Index
bowtie2-build $repGenome.fna Index/$repGenome.fna -q || exit 1
samtools faidx $repGenome.fna

cp $ODR/Roary/core_alignment_header.embl .
cp $ODR/Roary/core_gene_alignment.aln .

awk -F'\t' 'NR==FNR{c[\$1]++;next};c[\$2] > 0' <(grep -w -v -f <(cat clusters.txt | rev | cut -d'.' -f3 | rev | cut -c8-) <(cut -f2 clusterOutput.txt)) clusterOutput.txt > clusters_with_no_unique_SNP.txt
printf '1\ni\nStrain\tclusterID\n.\nw\n' | ed -s clusters_with_no_unique_SNP.txt
EOT

if [ "$AUTO" == "false" ]; then
echo "bash parallel.sh" >> parallel_commands
else
thresholds=$(echo "0.4 0.5 0.6 0.7 0.8 0.9")
for z in `echo $thresholds` 
do 
sed "s/clusterOutput.txt $NUM_THREAD $SAT/clusterOutput.txt $NUM_THREAD $z/g" parallel.sh | sed "s/\"$IGNORE_ITER\" ==/\"false\" ==/g" | sed "2i mkdir F.$z" | sed "3i cd F.$z" | sed "4i cp $ODR/tree.newick ." | sed "5i cp $ODR/$repGenome.fna ." > parallel.$z.F.sh
sed "s/clusterOutput.txt $NUM_THREAD $SAT/clusterOutput.txt $NUM_THREAD $z/g" parallel.sh | sed "s/\"$IGNORE_ITER\" ==/\"true\" ==/g" | sed "2i mkdir T.$z" | sed "3i cd T.$z" | sed "4i cp $ODR/tree.newick ." | sed "5i cp $ODR/$repGenome.fna ." > parallel.$z.T.sh
echo "bash parallel.$z.F.sh" >> parallel_commands
echo "bash parallel.$z.T.sh" >> parallel_commands
done
fi

cat parallel_commands | parallel '{}' | grep -v "sites screened" >> smeg.error.log.txt 2>&1 || exit 1
rm parallel_commands parallel.*

if [ "$AUTO" == "false" ]; then
unclustered_strains_count=$(sed '1d' clusters_with_no_unique_SNP.txt | wc -l)
clusters_wo_uniq=$(sed '1d' clusters_with_no_unique_SNP.txt | cut -f2 | sort | uniq | wc -l)
total_cluster_count=$(grep -c "." clusters.txt)

echo "Total number of clusters = $total_cluster_count" >> log.txt
echo "Could not generate unique SNPs for $clusters_wo_uniq clusters containing a total of $unclustered_strains_count strains" >> log.txt 
echo "See clusters_with_no_unique_SNP.txt for more details" >> log.txt 
echo " "
else
iterative_yes=$(echo "F.0.4 F.0.5 F.0.6 F.0.7 F.0.8 F.0.9")
iterative_no=$(echo "T.0.4 T.0.5 T.0.6 T.0.7 T.0.8 T.0.9")
for f in `echo $iterative_yes`
do
unclustered_strains_count=$(sed '1d' $f/clusters_with_no_unique_SNP.txt | wc -l)
clusters_wo_uniq=$(sed '1d' $f/clusters_with_no_unique_SNP.txt | cut -f2 | sort | uniq | wc -l)
total_cluster_count=$(grep -c "." $f/clusters.txt)
assign_thres=$(cut -f2- -d'.' <<< $f)
median_cluster_SNPs=$(wc -l $f/*.Input.txt | grep -v "total" | sed -r 's/^ *([0-9]+)/\1\t/' | cut -f1 | sort -n | awk '{arr[NR]=$1}END { if (NR%2==1) print arr[(NR+1)/2]; else print (arr[NR/2]+arr[NR/2+1])/2}')
head -4 log.txt > $f/misc.txt

echo "### SNP assignment threshold $assign_thres with iterative clustering output ######" >> log.txt
echo "Total number of clusters = $total_cluster_count" >> log.txt
echo "Median unique SNPs in clusters = $median_cluster_SNPs" >> log.txt
echo "Could not generate unique SNPs for $clusters_wo_uniq clusters containing a total of $unclustered_strains_count strains" >> log.txt
echo "See $ODR/$f/clusters_with_no_unique_SNP.txt for more details" >> log.txt
echo "Database created with above parameters located in $ODR/$f " >> log.txt
echo "################################################################################## 

" >> log.txt
done

for f in `echo $iterative_no`
do
unclustered_strains_count=$(sed '1d' $f/clusters_with_no_unique_SNP.txt | wc -l)
clusters_wo_uniq=$(sed '1d' $f/clusters_with_no_unique_SNP.txt | cut -f2 | sort | uniq | wc -l)
total_cluster_count=$(grep -c "." $f/clusters.txt)
assign_thres=$(cut -f2- -d'.' <<< $f)
median_cluster_SNPs=$(wc -l $f/*.Input.txt | grep -v "total" | sed -r 's/^ *([0-9]+)/\1\t/' | cut -f1 | sort -n | awk '{arr[NR]=$1}END { if (NR%2==1) print arr[(NR+1)/2]; else print (arr[NR/2]+arr[NR/2+1])/2}')
head -4 log.txt > $f/misc.txt

echo "### SNP assignment threshold $assign_thres without iterative clustering output ######" >> log.txt
echo "Total number of clusters = $total_cluster_count" >> log.txt
echo "Median unique SNPs in clusters = $median_cluster_SNPs" >> log.txt
echo "Could not generate unique SNPs for $clusters_wo_uniq clusters containing a total of $unclustered_strains_count strains" >> log.txt
echo "See $f/clusters_with_no_unique_SNP.txt for more details" >> log.txt
echo "Database created with above parameters located in $ODR/$f " >> log.txt
echo "################################################################################## 

" >> log.txt
done
fi
if [ "$KEEP" == "false" ]; then
rm -rf $ODR/Roary
fi

rm smeg.error.log.txt
echo "####### DONE #####"