Add files via upload

ohlab · Dec 9, 2019 · 93ac17a · 93ac17a
1 parent 63419af
commit 93ac17a
Show file tree

Hide file tree

Showing 3 changed files with 1,139 additions and 0 deletions.
diff --git a/build_sp b/build_sp
@@ -0,0 +1,387 @@
+#!/bin/bash
+#############################################
+############################################
+# build species database
+############################################
+############################################
+
+# Enter the working directory and resolve the genomes directory to an absolute
+# path. WDR, GEN_DIR, ODR, package, LIST and LIS are not set in this script and
+# must be provided by the caller. A missing or unusable directory aborts the
+# run instead of letting every later step operate in the wrong place.
+if [ -z "$WDR" ] || ! cd "$WDR"; then
+echo "ERROR: cannot change to working directory '$WDR'" >&2
+exit 1
+fi
+GDR=$(readlink -f "$GEN_DIR")
+echo "$package option activated"
+echo "$WDR is present directory"
+echo "$GDR is the genomes directory"
+echo "$ODR is the output directory"
+
+if [ -z "$GDR" ] || ! cd "$GDR"; then
+echo "ERROR: cannot change to genomes directory '$GEN_DIR'" >&2
+exit 1
+fi
+# Write the genome file names to $ODR/genomes.txt: every *.fna/*.fa/*.fasta
+# file in the genomes directory when LIST is "false", otherwise the contents of
+# the file named by LIS. stderr is discarded for this group only, because ls
+# reports an error for each extension that has no matching file.
+{
+if [ "$LIST" == "false" ]; then
+ls {*.fna,*.fa,*.fasta} > "$ODR/genomes.txt"
+else
+cat "$LIS" > "$ODR/genomes.txt"
+fi
+} 2> /dev/null
+
+echo "####### PRE PROCESSING GENOMES #####
+"
+
+####################
+# For every listed genome, write a copy into $ODR with plasmid, phage and
+# extrachromosomal records removed, and record the number of remaining ">"
+# headers in $ODR/num_of_contigs.txt as "name<TAB>count".
+#
+# The awk filter stops printing at any line that matches `pat` and resumes at
+# the next ">" header line that does not match it.
+drop_records='{ if ($0 ~ pat) censor=1; else if (substr($0,1,1) == ">") censor=0; if (censor==0) print $0 }'
+exec 3>&2
+exec 2> /dev/null
+# Start from an empty table: the loop appends, so a file left behind by an
+# aborted run would otherwise contribute stale entries.
+rm -f "$ODR/num_of_contigs.txt"
+for f in `cat "$ODR/genomes.txt"`
+do
+awk -v pat="plasmid" "$drop_records" "$f" |
+awk -v pat="phage" "$drop_records" |
+awk -v pat="extrachromosomal" "$drop_records" > "$ODR/$f.noplasmid"
+# Plain mv instead of `rename`: the util-linux and Perl implementations of
+# rename take different arguments, and with stderr discarded a failure here
+# went unnoticed.
+mv "$ODR/$f.noplasmid" "$ODR/$f"
+num_of_contigs=$(grep -c ">" "$ODR/$f")
+echo -e "$f\t$num_of_contigs" >> "$ODR/num_of_contigs.txt"
+done
+exec 2>&3
+###################
+
+# Continue inside the output directory.
+cd $ODR
+
+# A missing or empty contig table means no genome was processed.
+if ! [[ -s num_of_contigs.txt ]]; then
+echo "ERROR: Appropriate fasta files not found in Genomes directory"
+exit 1
+fi
+
+# Names whose contig count is exactly 1 go to reference_genomes.txt; the names
+# not present in that list go to draft_genomes.txt.
+awk '$2 == 1' num_of_contigs.txt | cut -f1 > reference_genomes.txt
+cut -f1 num_of_contigs.txt | awk 'FNR==NR { a[$NF]; next } !($NF in a)' reference_genomes.txt - > draft_genomes.txt
+
+rm num_of_contigs.txt genomes.txt
+mkdir reordered_contigs
+#################################################
+
+# Without at least one complete genome there is nothing to order drafts
+# against, so clean up the two lists and stop.
+if [ ! -s reference_genomes.txt ]; then
+echo "ERROR: At least, a complete reference genome is required to reorder contigs of draft genomes"
+rm reference_genomes.txt draft_genomes.txt
+exit 1
+fi
+echo "Complete reference genome(s) detected"
+
+
+# Rewrite each complete genome from the genomes directory as one record in
+# reordered_contigs/<name>.fna: ">" lines dropped, all whitespace removed, the
+# sequence re-wrapped at 60 columns and ">name" inserted as the first line.
+# The trailing sed expression is kept exactly as in the original pipeline.
+for f in $(cat reference_genomes.txt)
+do
+grep -v ">" $GDR/$f | tr -d '[:space:]' | fold -w 60 | sed "1 i\>$f" | sed -e '$a\ ' > $ODR/reordered_contigs/$f.fna
+done
+
+######### get position of dnaA ######
+# Concatenate every rewritten reference into the single BLAST query ref.fa.
+cat $ODR/reordered_contigs/*.fna > $ODR/ref.fa
+cd $ODR
+
+blastn -query ref.fa -db $SMEG_DIR/dnaA_database/dnaa.fasta -evalue 0.05 -num_threads $NUM_THREAD -max_target_seqs 1 -out dnaa_id -word_size 11 -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen" || exit 1
+
+
+cut -f1 dnaa_id | sort | uniq > temp.txt
+for i in `cat temp.txt`
+do
+grep -w "$i" dnaa_id | sort -nr -k12,12 | head -1 | cut -f1,7,13 >> temp.dnaA_output.txt
+done
+
+###### User defined Representative genome
+if [ "x$REPGEN" != "x" ]; then
+ grep -P '(^|\s)\K'$REPGEN'(?=\s|$)' temp.dnaA_output.txt | awk  '{printf "%s\t%4.3f\n", $1 , $2/$3 }'  > dnaA_output.txt 
+else
+awk  '{printf "%s\t%4.3f\n", $1 , $2/$3 }' temp.dnaA_output.txt | sort -n -k2,2 | head -1 > dnaA_output.txt
+fi
+
+rm temp* dnaa_id ref.fa
+
+
+if [[ ! -s dnaA_output.txt ]]; then
+repGenome=$(head -1 reference_genomes.txt)
+dnaA_pos=0.5
+else
+repGenome=$(cut -f1 dnaA_output.txt)
+dnaA_pos=$(cut -f2 dnaA_output.txt)
+fi
+
+rm dnaA_output.txt
+#########################
+
+# Count the non-empty lines of both genome lists and append the counts, the
+# representative genome and its dnaA position to log.txt.
+no_of_comp_genomes=$(grep -c "." reference_genomes.txt)
+no_of_draft_genomes=$(grep -c "." draft_genomes.txt)
+
+{
+echo "Number of complete genomes = $no_of_comp_genomes"
+echo "Number of draft genomes = $no_of_draft_genomes"
+echo "Selected representative genome is $repGenome"
+echo "dnaA position relative to ori is $dnaA_pos "
+echo " "
+} >> log.txt
+
+if [ -s draft_genomes.txt ]; then
+############## Reorder contigs using Mauve ###########################
+echo "####### RE-ORDERING CONTIGS OF DRAFT GENOME(S) (MAUVE)#####
+"
+for i in `cat draft_genomes.txt`
+do
+mkdir $i.dir
+prefix=$(type -P Mauve |  rev | cut -d'/' -f3- | rev)
+mauvepath=$(find $prefix -name "Mauve.jar")
+echo "java -cp $mauvepath -Djava.awt.headless=true org.gel.mauve.contigs.ContigOrderer -output $ODR/$i.dir -ref $ODR/reordered_contigs/$repGenome.fna -draft $ODR/$i" >> parallel_commands
+done
+cat parallel_commands | parallel '{}'  >> Mauve.error.log.txt 2>&1
+
+rm parallel_commands
+
+for i in `cat draft_genomes.txt`
+do
+cd $i.dir
+aa=$(ls alignment*/$i.fas | sort | tail -1) || exit 1
+grep -v ">" $aa | tr -d '[:space:]' | fold -w 60 | sed "1 i\>$i" | sed -e '$a\ ' > $ODR/reordered_contigs/$i.fna || exit 1
+cd ../
+done
+rm Mauve.error.log.txt
+else
+echo "Only complete genomes identified"
+fi
+
+
+# Best-effort removal of the genome lists, the per-draft Mauve directories and
+# the pre-processed fasta copies in the current directory. stderr is discarded
+# for the whole group so that files which do not exist cause no messages.
+{
+rm reference_genomes.txt draft_genomes.txt temp.txt
+rm -rf *.dir
+rm {*.fna,*.fa,*.fasta}
+} 2> /dev/null
+
+########### Run prokka ################
+cd $ODR/reordered_contigs
+ls *.fna > ../genomes.txt
+cd $ODR
+mkdir gff
+for f in `cat genomes.txt`
+do
+aa=$( echo "$f" | rev | cut -d'.' -f2- | rev)
+cat $ODR/reordered_contigs/$f | sed 's/>.*/>temp/' > $ODR/$f
+echo "prokka --quiet --kingdom Bacteria --outdir $aa.dir --locustag $aa --prefix $aa $ODR/$f" >> parallel_commands
+done
+
+echo "####### ANNOTATING GENOMES (PROKKA) #####
+"
+
+
+cat parallel_commands | parallel '{}'  >> prokka.error.log.txt 2>&1
+for f in *.dir
+do
+cp $f/*.gff gff/. || exit 1
+rm -rf $f
+done
+
+exec 3>&2
+exec 2> /dev/null
+rm {*.fna,*.fa,*.fasta}
+exec 2>&3
+
+rm prokka.error.log.txt
+rm parallel_commands
+rm genomes.txt
+############ Run Roary ####################
+# Core-genome analysis of all GFF files; output goes to ./Roary. A failing
+# roary run ends the script with status 1.
+cd $ODR
+echo "####### CORE-GENOME ANALYSIS (ROARY) #####
+"
+if ! roary -p $NUM_THREAD -f ./Roary -e -n -s $ODR/gff/*.gff; then
+exit 1
+fi
+rm -rf gff/
+
+
+################ Generate phylogenetic tree #########
+
+# Build tree.newick from the core gene alignment; stop if FastTree fails.
+if ! FastTree -nt -quiet -gtr $ODR/Roary/core_gene_alignment.aln > tree.newick; then
+exit 1
+fi
+
+
+######## If -e flag is chosen ######
+# With REF_ONLY set to "true": run the clustering script, copy the two Roary
+# alignment files here, remove Roary's directory unless KEEP says otherwise,
+# and finish successfully without building a database.
+if [ "$REF_ONLY" == "true" ]; then
+Rscript $SMEG_DIR/create_clusters.R -i clusterOutput.txt -c 0
+cp $ODR/Roary/core_alignment_header.embl $ODR/Roary/core_gene_alignment.aln .
+[ "$KEEP" == "false" ] && rm -rf $ODR/Roary
+exit 0
+fi
+
+
+############ Now build SMEG database ########
+cp reordered_contigs/$repGenome.fna .
+
+echo "####### BUILDING SMEG DATABASE(S) #####
+"
+
+cat <<EOT >> parallel.sh
+#!/bin/bash
+if [ "$IGNORE_ITER" == "true" ]; then
+count=2
+else
+count=0
+fi
+
+touch breakCluster.txt
+until [ ! \$count -lt 3 ]
+do
+Rscript $SMEG_DIR/create_clusters.R -i clusterOutput.txt -c 0
+
+if ! [[ -s clusterOutput.txt ]]; then echo "ERROR: clusterOutput.txt empty. Unable to generate clusters"; exit 1 ; fi
+$SMEG_DIR/uniqueClusterSNP $ODR/Roary/core_gene_alignment.aln clusterOutput.txt $NUM_THREAD $SAT ClusterSNPs.txt
+cut -f3 ClusterSNPs.txt | grep -v "cluster" | sort | uniq -c | sed -r 's/^ *([0-9]+)/\1\t/' | awk '\$2 != 0 && \$1 < '$CNT'' | cut -f2 > breakCluster.txt || exit 1
+num_of_cluS_rerun=\$(grep -c "." breakCluster.txt)
+count=\`expr \$count + 1\`
+echo "iteration count \$count"
+
+if [ \$num_of_cluS_rerun -lt 1 ]
+then
+count=\`expr \$count + 100000\`
+fi
+done
+
+rm breakCluster.txt
+awk '\$3 != 0' ClusterSNPs.txt > ClusterSNPs_final.txt
+rm ClusterSNPs.txt
+
+samtools faidx $ODR/Roary/core_gene_alignment.aln $repGenome > $repGenome.aln
+Rscript $SMEG_DIR/getPositionWithoutGaps.R -i ClusterSNPs_final.txt -x $repGenome.aln -m 0
+
+grep "label=" $ODR/Roary/core_alignment_header.embl | cut -f2 -d'=' > core_genes.txt
+grep "feature" $ODR/Roary/core_alignment_header.embl | rev | cut -d' ' -f1 | rev | sed 's/\../ /g' | awk '{print (\$1 - 1) "\t" \$2}' | sed "s/^/$repGenome	/" > core_gene_coordinate_in_align
+
+paste -d'\t' core_gene_coordinate_in_align core_genes.txt > bedfile
+rm core_gene_coordinate_in_align core_genes.txt
+
+samtools faidx $repGenome.aln
+bedtools getfasta -fi $repGenome.aln -bed bedfile -name | sed 's/-//g'  | fold -w 60  >> core_genes2.fa
+awk 'BEGIN {RS = ">" ; FS = "\n" ; ORS = ""} \$2 {print ">"\$0}' core_genes2.fa > core_genes.fa
+rm core_genes2.fa
+rm bedfile
+
+awk '/^>/ {if (seqlen){print seqlen}; print ;seqlen=0;next; } { seqlen += length(\$0)}END{print seqlen}' core_genes.fa | grep -v ">" > geneLengths
+grep ">" core_genes.fa | sed 's/>//g' > geneNames
+paste <(cat geneLengths | awk '{total += \$0; \$0 = total - \$0}1') <(cat geneLengths | awk '{total += \$0; \$0 = total}1') <(cat geneNames) --delimiters '\t' > geneCoordinates.txt
+rm geneLengths geneNames
+########
+makeblastdb -in $repGenome.fna -parse_seqids -dbtype nucl
+blastn -query core_genes.fa -db $repGenome.fna -evalue 5 -num_threads 1 -max_target_seqs 1 -outfmt 6 -out $repGenome.blast.txt -word_size 11
+
+cut -f1 $repGenome.blast.txt | sort | uniq > tmp.txt
+
+for i in \`cat tmp.txt\`
+do
+grep -w "\$i" $repGenome.blast.txt | sort -nr -k12,12 | head -1 | cut -f2,9,10 > temp1
+grep -w "\$i" $repGenome.blast.txt | sort -nr -k12,12 | head -1 | cut -f1 > temp2
+paste -d'\t' temp1 temp2 >> $repGenome.core.geneCood.txt
+done
+
+rm temp* tmp.txt
+rm $repGenome.blast.txt core_genes.fa
+
+########################
+sed '1d' modified_uniq_cluster_SNPs.txt | cut -f1 | sort | uniq > clusters.txt || exit 1
+###########
+Rscript $SMEG_DIR/getPositioninRef.R -i modified_uniq_cluster_SNPs.txt -x $repGenome.core.geneCood.txt -y geneCoordinates.txt
+##########
+
+for strains in \`cat clusters.txt\`
+do
+grep -P '(^|\s)\K'\$strains'(?=\s|$)' newcoordinates.txt  > \$strains.Input.txt || exit 1
+done
+
+rm newcoordinates.txt modified_uniq_cluster_SNPs.txt ClusterSNPs_final.txt tree.newick
+rm $repGenome.aln $repGenome.aln.fai
+rm $repGenome.fna.n*
+
+mkdir Index
+bowtie2-build $repGenome.fna Index/$repGenome.fna -q || exit 1
+samtools faidx $repGenome.fna
+
+cp $ODR/Roary/core_alignment_header.embl .
+cp $ODR/Roary/core_gene_alignment.aln .
+
+awk -F'\t' 'NR==FNR{c[\$1]++;next};c[\$2] > 0' <(grep -w -v -f <(cat clusters.txt | rev | cut -d'.' -f3 | rev | cut -c8-) <(cut -f2 clusterOutput.txt)) clusterOutput.txt > clusters_with_no_unique_SNP.txt
+printf '1\ni\nStrain\tclusterID\n.\nw\n' | ed -s clusters_with_no_unique_SNP.txt
+EOT
+
+if [ "$AUTO" == "false" ]; then
+echo "bash parallel.sh" >> parallel_commands
+else
+thresholds=$(echo "0.4 0.5 0.6 0.7 0.8 0.9")
+for z in `echo $thresholds` 
+do 
+sed "s/clusterOutput.txt $NUM_THREAD $SAT/clusterOutput.txt $NUM_THREAD $z/g" parallel.sh | sed "s/\"$IGNORE_ITER\" ==/\"false\" ==/g" | sed "2i mkdir F.$z" | sed "3i cd F.$z" | sed "4i cp $ODR/tree.newick ." | sed "5i cp $ODR/$repGenome.fna ." > parallel.$z.F.sh
+sed "s/clusterOutput.txt $NUM_THREAD $SAT/clusterOutput.txt $NUM_THREAD $z/g" parallel.sh | sed "s/\"$IGNORE_ITER\" ==/\"true\" ==/g" | sed "2i mkdir T.$z" | sed "3i cd T.$z" | sed "4i cp $ODR/tree.newick ." | sed "5i cp $ODR/$repGenome.fna ." > parallel.$z.T.sh
+echo "bash parallel.$z.F.sh" >> parallel_commands
+echo "bash parallel.$z.T.sh" >> parallel_commands
+done
+fi
+
+cat parallel_commands | parallel '{}' | grep -v "sites screened" >> smeg.error.log.txt 2>&1 || exit 1
+rm parallel_commands parallel.*
+
+if [ "$AUTO" == "false" ]; then
+unclustered_strains_count=$(sed '1d' clusters_with_no_unique_SNP.txt | wc -l)
+clusters_wo_uniq=$(sed '1d' clusters_with_no_unique_SNP.txt | cut -f2 | sort | uniq | wc -l)
+total_cluster_count=$(grep -c "." clusters.txt)
+
+echo "Total number of clusters = $total_cluster_count" >> log.txt
+echo "Could not generate unique SNPs for $clusters_wo_uniq clusters containing a total of $unclustered_strains_count strains" >> log.txt 
+echo "See clusters_with_no_unique_SNP.txt for more details" >> log.txt 
+echo " "
+else
+iterative_yes=$(echo "F.0.4 F.0.5 F.0.6 F.0.7 F.0.8 F.0.9")
+iterative_no=$(echo "T.0.4 T.0.5 T.0.6 T.0.7 T.0.8 T.0.9")
+for f in `echo $iterative_yes`
+do
+unclustered_strains_count=$(sed '1d' $f/clusters_with_no_unique_SNP.txt | wc -l)
+clusters_wo_uniq=$(sed '1d' $f/clusters_with_no_unique_SNP.txt | cut -f2 | sort | uniq | wc -l)
+total_cluster_count=$(grep -c "." $f/clusters.txt)
+assign_thres=$(cut -f2- -d'.' <<< $f)
+median_cluster_SNPs=$(wc -l $f/*.Input.txt | grep -v "total" | sed -r 's/^ *([0-9]+)/\1\t/' | cut -f1 | sort -n | awk '{arr[NR]=$1}END { if (NR%2==1) print arr[(NR+1)/2]; else print (arr[NR/2]+arr[NR/2+1])/2}')
+head -4 log.txt > $f/misc.txt
+
+echo "### SNP assignment threshold $assign_thres with iterative clustering output ######" >> log.txt
+echo "Total number of clusters = $total_cluster_count" >> log.txt
+echo "Median unique SNPs in clusters = $median_cluster_SNPs" >> log.txt
+echo "Could not generate unique SNPs for $clusters_wo_uniq clusters containing a total of $unclustered_strains_count strains" >> log.txt
+echo "See $ODR/$f/clusters_with_no_unique_SNP.txt for more details" >> log.txt
+echo "Database created with above parameters located in $ODR/$f " >> log.txt
+echo "################################################################################## 
+
+" >> log.txt
+done
+
+for f in `echo $iterative_no`
+do
+unclustered_strains_count=$(sed '1d' $f/clusters_with_no_unique_SNP.txt | wc -l)
+clusters_wo_uniq=$(sed '1d' $f/clusters_with_no_unique_SNP.txt | cut -f2 | sort | uniq | wc -l)
+total_cluster_count=$(grep -c "." $f/clusters.txt)
+assign_thres=$(cut -f2- -d'.' <<< $f)
+median_cluster_SNPs=$(wc -l $f/*.Input.txt | grep -v "total" | sed -r 's/^ *([0-9]+)/\1\t/' | cut -f1 | sort -n | awk '{arr[NR]=$1}END { if (NR%2==1) print arr[(NR+1)/2]; else print (arr[NR/2]+arr[NR/2+1])/2}')
+head -4 log.txt > $f/misc.txt
+
+echo "### SNP assignment threshold $assign_thres without iterative clustering output ######" >> log.txt
+echo "Total number of clusters = $total_cluster_count" >> log.txt
+echo "Median unique SNPs in clusters = $median_cluster_SNPs" >> log.txt
+echo "Could not generate unique SNPs for $clusters_wo_uniq clusters containing a total of $unclustered_strains_count strains" >> log.txt
+echo "See $f/clusters_with_no_unique_SNP.txt for more details" >> log.txt
+echo "Database created with above parameters located in $ODR/$f " >> log.txt
+echo "################################################################################## 
+
+" >> log.txt
+done
+fi
+if [ "$KEEP" == "false" ]; then
+rm -rf $ODR/Roary
+fi
+
+rm smeg.error.log.txt
+echo "####### DONE #####"