-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
ohlab
authored
Dec 9, 2019
1 parent
63419af
commit 93ac17a
Showing
3 changed files
with
1,139 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,387 @@ | ||
#!/bin/bash | ||
############################################# | ||
############################################ | ||
# build species database | ||
############################################ | ||
############################################ | ||
|
||
cd $WDR | ||
GDR=$(readlink -f $GEN_DIR) | ||
echo "$package option activated" | ||
echo "$WDR is present directory" | ||
echo "$GDR is the genomes directory" | ||
echo "$ODR is the output directory" | ||
|
||
cd $GDR | ||
exec 3>&2 | ||
exec 2> /dev/null | ||
if [ "$LIST" == "false" ]; then | ||
ls {*.fna,*.fa,*.fasta} > $ODR/genomes.txt #suppress error to std output | ||
else | ||
cat $LIS > $ODR/genomes.txt | ||
fi | ||
exec 2>&3 | ||
|
||
echo "####### PRE PROCESSING GENOMES ##### | ||
" | ||
|
||
#################### | ||
exec 3>&2 | ||
exec 2> /dev/null | ||
for f in `cat $ODR/genomes.txt` | ||
do | ||
awk '{if (substr($0,1) ~ "plasmid" ) censor=1; else if (substr($0,1,1) == ">") censor=0; if (censor==0) print $0}' $f > $ODR/$f.noplasmid | ||
|
||
cd $ODR | ||
awk '{if (substr($0,1) ~ "phage" ) censor=1; else if (substr($0,1,1) == ">") censor=0; if (censor==0) print $0}' $f.noplasmid > $f.noplasmid1 | ||
awk '{if (substr($0,1) ~ "extrachomosomal" ) censor=1; else if (substr($0,1,1) == ">") censor=0; if (censor==0) print $0}' $f.noplasmid1 > $f.noplasmid | ||
rename $f.noplasmid $f $f.noplasmid | ||
rm $f.noplasmid1 | ||
num_of_contigs=$(grep -c ">" $f) | ||
echo -e "$f\t$num_of_contigs" >> num_of_contigs.txt | ||
cd $GDR | ||
done | ||
exec 2>&3 | ||
################### | ||
|
||
cd $ODR | ||
|
||
if [[ ! -s num_of_contigs.txt ]]; then | ||
echo "ERROR: Appropriate fasta files not found in Genomes directory" | ||
exit 1 | ||
fi | ||
|
||
awk '$2 == 1' num_of_contigs.txt | cut -f1 > reference_genomes.txt | ||
awk 'FNR==NR { a[$NF]; next } !($NF in a)' reference_genomes.txt <(cut -f1 num_of_contigs.txt ) > draft_genomes.txt | ||
|
||
rm num_of_contigs.txt | ||
rm genomes.txt | ||
mkdir reordered_contigs | ||
################################################# | ||
|
||
if [ -s reference_genomes.txt ]; then | ||
echo "Complete reference genome(s) detected" | ||
else | ||
echo "ERROR: At least, a complete reference genome is required to reorder contigs of draft genomes" | ||
rm reference_genomes.txt draft_genomes.txt | ||
exit 1 | ||
fi | ||
|
||
|
||
for f in `cat reference_genomes.txt` | ||
do | ||
cd $GDR | ||
grep -v ">" $f | tr -d '[:space:]' | fold -w 60 | sed "1 i\>$f" | sed -e '$a\ ' > $ODR/reordered_contigs/$f.fna | ||
cd $ODR | ||
done | ||
|
||
######### get position of dnaA ###### | ||
cd reordered_contigs | ||
ls *.fna > reordered_ref_genomes | ||
cat $(cat reordered_ref_genomes) > $ODR/ref.fa | ||
rm reordered_ref_genomes | ||
cd $ODR | ||
|
||
blastn -query ref.fa -db $SMEG_DIR/dnaA_database/dnaa.fasta -evalue 0.05 -num_threads $NUM_THREAD -max_target_seqs 1 -out dnaa_id -word_size 11 -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen" || exit 1 | ||
|
||
|
||
cut -f1 dnaa_id | sort | uniq > temp.txt | ||
for i in `cat temp.txt` | ||
do | ||
grep -w "$i" dnaa_id | sort -nr -k12,12 | head -1 | cut -f1,7,13 >> temp.dnaA_output.txt | ||
done | ||
|
||
###### User defined Representative genome | ||
if [ "x$REPGEN" != "x" ]; then | ||
grep -P '(^|\s)\K'$REPGEN'(?=\s|$)' temp.dnaA_output.txt | awk '{printf "%s\t%4.3f\n", $1 , $2/$3 }' > dnaA_output.txt | ||
else | ||
awk '{printf "%s\t%4.3f\n", $1 , $2/$3 }' temp.dnaA_output.txt | sort -n -k2,2 | head -1 > dnaA_output.txt | ||
fi | ||
|
||
rm temp* dnaa_id ref.fa | ||
|
||
|
||
if [[ ! -s dnaA_output.txt ]]; then | ||
repGenome=$(head -1 reference_genomes.txt) | ||
dnaA_pos=0.5 | ||
else | ||
repGenome=$(cut -f1 dnaA_output.txt) | ||
dnaA_pos=$(cut -f2 dnaA_output.txt) | ||
fi | ||
|
||
rm dnaA_output.txt | ||
######################### | ||
|
||
no_of_comp_genomes=$(grep -c "." reference_genomes.txt) | ||
no_of_draft_genomes=$(grep -c "." draft_genomes.txt) | ||
|
||
echo "Number of complete genomes = $no_of_comp_genomes" >> log.txt | ||
echo "Number of draft genomes = $no_of_draft_genomes" >> log.txt | ||
echo "Selected representative genome is $repGenome" >> log.txt | ||
echo "dnaA position relative to ori is $dnaA_pos " >> log.txt | ||
echo " " >> log.txt | ||
|
||
if [ -s draft_genomes.txt ]; then | ||
############## Reorder contigs using Mauve ########################### | ||
echo "####### RE-ORDERING CONTIGS OF DRAFT GENOME(S) (MAUVE)##### | ||
" | ||
for i in `cat draft_genomes.txt` | ||
do | ||
mkdir $i.dir | ||
prefix=$(type -P Mauve | rev | cut -d'/' -f3- | rev) | ||
mauvepath=$(find $prefix -name "Mauve.jar") | ||
echo "java -cp $mauvepath -Djava.awt.headless=true org.gel.mauve.contigs.ContigOrderer -output $ODR/$i.dir -ref $ODR/reordered_contigs/$repGenome.fna -draft $ODR/$i" >> parallel_commands | ||
done | ||
cat parallel_commands | parallel '{}' >> Mauve.error.log.txt 2>&1 | ||
|
||
rm parallel_commands | ||
|
||
for i in `cat draft_genomes.txt` | ||
do | ||
cd $i.dir | ||
aa=$(ls alignment*/$i.fas | sort | tail -1) || exit 1 | ||
grep -v ">" $aa | tr -d '[:space:]' | fold -w 60 | sed "1 i\>$i" | sed -e '$a\ ' > $ODR/reordered_contigs/$i.fna || exit 1 | ||
cd ../ | ||
done | ||
rm Mauve.error.log.txt | ||
else | ||
echo "Only complete genomes identified" | ||
fi | ||
|
||
|
||
exec 3>&2 | ||
exec 2> /dev/null | ||
rm reference_genomes.txt | ||
rm draft_genomes.txt | ||
rm -rf *.dir | ||
rm temp.txt | ||
rm {*.fna,*.fa,*.fasta} | ||
exec 2>&3 | ||
|
||
########### Run prokka ################ | ||
cd $ODR/reordered_contigs | ||
ls *.fna > ../genomes.txt | ||
cd $ODR | ||
mkdir gff | ||
for f in `cat genomes.txt` | ||
do | ||
aa=$( echo "$f" | rev | cut -d'.' -f2- | rev) | ||
cat $ODR/reordered_contigs/$f | sed 's/>.*/>temp/' > $ODR/$f | ||
echo "prokka --quiet --kingdom Bacteria --outdir $aa.dir --locustag $aa --prefix $aa $ODR/$f" >> parallel_commands | ||
done | ||
|
||
echo "####### ANNOTATING GENOMES (PROKKA) ##### | ||
" | ||
|
||
|
||
cat parallel_commands | parallel '{}' >> prokka.error.log.txt 2>&1 | ||
for f in *.dir | ||
do | ||
cp $f/*.gff gff/. || exit 1 | ||
rm -rf $f | ||
done | ||
|
||
exec 3>&2 | ||
exec 2> /dev/null | ||
rm {*.fna,*.fa,*.fasta} | ||
exec 2>&3 | ||
|
||
rm prokka.error.log.txt | ||
rm parallel_commands | ||
rm genomes.txt | ||
############ Run Roary #################### | ||
cd $ODR | ||
echo "####### CORE-GENOME ANALYSIS (ROARY) ##### | ||
" | ||
roary -p $NUM_THREAD -f ./Roary -e -n -s $ODR/gff/*.gff || exit 1 | ||
rm -rf gff/ | ||
|
||
|
||
################ Generate phylogenetic tree ######### | ||
|
||
FastTree -nt -quiet -gtr $ODR/Roary/core_gene_alignment.aln > tree.newick || exit 1 | ||
|
||
|
||
######## If -e flag is chosen ###### | ||
if [ "$REF_ONLY" == "true" ]; then | ||
Rscript $SMEG_DIR/create_clusters.R -i clusterOutput.txt -c 0 | ||
cp $ODR/Roary/core_alignment_header.embl . | ||
cp $ODR/Roary/core_gene_alignment.aln . | ||
if [ "$KEEP" == "false" ]; then | ||
rm -rf $ODR/Roary | ||
fi | ||
exit 0 | ||
fi | ||
|
||
|
||
############ Now build SMEG database ######## | ||
cp reordered_contigs/$repGenome.fna . | ||
|
||
echo "####### BUILDING SMEG DATABASE(S) ##### | ||
" | ||
|
||
cat <<EOT >> parallel.sh | ||
#!/bin/bash | ||
if [ "$IGNORE_ITER" == "true" ]; then | ||
count=2 | ||
else | ||
count=0 | ||
fi | ||
touch breakCluster.txt | ||
until [ ! \$count -lt 3 ] | ||
do | ||
Rscript $SMEG_DIR/create_clusters.R -i clusterOutput.txt -c 0 | ||
if ! [[ -s clusterOutput.txt ]]; then echo "ERROR: clusterOutput.txt empty. Unable to generate clusters"; exit 1 ; fi | ||
$SMEG_DIR/uniqueClusterSNP $ODR/Roary/core_gene_alignment.aln clusterOutput.txt $NUM_THREAD $SAT ClusterSNPs.txt | ||
cut -f3 ClusterSNPs.txt | grep -v "cluster" | sort | uniq -c | sed -r 's/^ *([0-9]+)/\1\t/' | awk '\$2 != 0 && \$1 < '$CNT'' | cut -f2 > breakCluster.txt || exit 1 | ||
num_of_cluS_rerun=\$(grep -c "." breakCluster.txt) | ||
count=\`expr \$count + 1\` | ||
echo "iteration count \$count" | ||
if [ \$num_of_cluS_rerun -lt 1 ] | ||
then | ||
count=\`expr \$count + 100000\` | ||
fi | ||
done | ||
rm breakCluster.txt | ||
awk '\$3 != 0' ClusterSNPs.txt > ClusterSNPs_final.txt | ||
rm ClusterSNPs.txt | ||
samtools faidx $ODR/Roary/core_gene_alignment.aln $repGenome > $repGenome.aln | ||
Rscript $SMEG_DIR/getPositionWithoutGaps.R -i ClusterSNPs_final.txt -x $repGenome.aln -m 0 | ||
grep "label=" $ODR/Roary/core_alignment_header.embl | cut -f2 -d'=' > core_genes.txt | ||
grep "feature" $ODR/Roary/core_alignment_header.embl | rev | cut -d' ' -f1 | rev | sed 's/\../ /g' | awk '{print (\$1 - 1) "\t" \$2}' | sed "s/^/$repGenome /" > core_gene_coordinate_in_align | ||
paste -d'\t' core_gene_coordinate_in_align core_genes.txt > bedfile | ||
rm core_gene_coordinate_in_align core_genes.txt | ||
samtools faidx $repGenome.aln | ||
bedtools getfasta -fi $repGenome.aln -bed bedfile -name | sed 's/-//g' | fold -w 60 >> core_genes2.fa | ||
awk 'BEGIN {RS = ">" ; FS = "\n" ; ORS = ""} \$2 {print ">"\$0}' core_genes2.fa > core_genes.fa | ||
rm core_genes2.fa | ||
rm bedfile | ||
awk '/^>/ {if (seqlen){print seqlen}; print ;seqlen=0;next; } { seqlen += length(\$0)}END{print seqlen}' core_genes.fa | grep -v ">" > geneLengths | ||
grep ">" core_genes.fa | sed 's/>//g' > geneNames | ||
paste <(cat geneLengths | awk '{total += \$0; \$0 = total - \$0}1') <(cat geneLengths | awk '{total += \$0; \$0 = total}1') <(cat geneNames) --delimiters '\t' > geneCoordinates.txt | ||
rm geneLengths geneNames | ||
######## | ||
makeblastdb -in $repGenome.fna -parse_seqids -dbtype nucl | ||
blastn -query core_genes.fa -db $repGenome.fna -evalue 5 -num_threads 1 -max_target_seqs 1 -outfmt 6 -out $repGenome.blast.txt -word_size 11 | ||
cut -f1 $repGenome.blast.txt | sort | uniq > tmp.txt | ||
for i in \`cat tmp.txt\` | ||
do | ||
grep -w "\$i" $repGenome.blast.txt | sort -nr -k12,12 | head -1 | cut -f2,9,10 > temp1 | ||
grep -w "\$i" $repGenome.blast.txt | sort -nr -k12,12 | head -1 | cut -f1 > temp2 | ||
paste -d'\t' temp1 temp2 >> $repGenome.core.geneCood.txt | ||
done | ||
rm temp* tmp.txt | ||
rm $repGenome.blast.txt core_genes.fa | ||
######################## | ||
sed '1d' modified_uniq_cluster_SNPs.txt | cut -f1 | sort | uniq > clusters.txt || exit 1 | ||
########### | ||
Rscript $SMEG_DIR/getPositioninRef.R -i modified_uniq_cluster_SNPs.txt -x $repGenome.core.geneCood.txt -y geneCoordinates.txt | ||
########## | ||
for strains in \`cat clusters.txt\` | ||
do | ||
grep -P '(^|\s)\K'\$strains'(?=\s|$)' newcoordinates.txt > \$strains.Input.txt || exit 1 | ||
done | ||
rm newcoordinates.txt modified_uniq_cluster_SNPs.txt ClusterSNPs_final.txt tree.newick | ||
rm $repGenome.aln $repGenome.aln.fai | ||
rm $repGenome.fna.n* | ||
mkdir Index | ||
bowtie2-build $repGenome.fna Index/$repGenome.fna -q || exit 1 | ||
samtools faidx $repGenome.fna | ||
cp $ODR/Roary/core_alignment_header.embl . | ||
cp $ODR/Roary/core_gene_alignment.aln . | ||
awk -F'\t' 'NR==FNR{c[\$1]++;next};c[\$2] > 0' <(grep -w -v -f <(cat clusters.txt | rev | cut -d'.' -f3 | rev | cut -c8-) <(cut -f2 clusterOutput.txt)) clusterOutput.txt > clusters_with_no_unique_SNP.txt | ||
printf '1\ni\nStrain\tclusterID\n.\nw\n' | ed -s clusters_with_no_unique_SNP.txt | ||
EOT | ||
|
||
if [ "$AUTO" == "false" ]; then | ||
echo "bash parallel.sh" >> parallel_commands | ||
else | ||
thresholds=$(echo "0.4 0.5 0.6 0.7 0.8 0.9") | ||
for z in `echo $thresholds` | ||
do | ||
sed "s/clusterOutput.txt $NUM_THREAD $SAT/clusterOutput.txt $NUM_THREAD $z/g" parallel.sh | sed "s/\"$IGNORE_ITER\" ==/\"false\" ==/g" | sed "2i mkdir F.$z" | sed "3i cd F.$z" | sed "4i cp $ODR/tree.newick ." | sed "5i cp $ODR/$repGenome.fna ." > parallel.$z.F.sh | ||
sed "s/clusterOutput.txt $NUM_THREAD $SAT/clusterOutput.txt $NUM_THREAD $z/g" parallel.sh | sed "s/\"$IGNORE_ITER\" ==/\"true\" ==/g" | sed "2i mkdir T.$z" | sed "3i cd T.$z" | sed "4i cp $ODR/tree.newick ." | sed "5i cp $ODR/$repGenome.fna ." > parallel.$z.T.sh | ||
echo "bash parallel.$z.F.sh" >> parallel_commands | ||
echo "bash parallel.$z.T.sh" >> parallel_commands | ||
done | ||
fi | ||
|
||
cat parallel_commands | parallel '{}' | grep -v "sites screened" >> smeg.error.log.txt 2>&1 || exit 1 | ||
rm parallel_commands parallel.* | ||
|
||
if [ "$AUTO" == "false" ]; then | ||
unclustered_strains_count=$(sed '1d' clusters_with_no_unique_SNP.txt | wc -l) | ||
clusters_wo_uniq=$(sed '1d' clusters_with_no_unique_SNP.txt | cut -f2 | sort | uniq | wc -l) | ||
total_cluster_count=$(grep -c "." clusters.txt) | ||
|
||
echo "Total number of clusters = $total_cluster_count" >> log.txt | ||
echo "Could not generate unique SNPs for $clusters_wo_uniq clusters containing a total of $unclustered_strains_count strains" >> log.txt | ||
echo "See clusters_with_no_unique_SNP.txt for more details" >> log.txt | ||
echo " " | ||
else | ||
iterative_yes=$(echo "F.0.4 F.0.5 F.0.6 F.0.7 F.0.8 F.0.9") | ||
iterative_no=$(echo "T.0.4 T.0.5 T.0.6 T.0.7 T.0.8 T.0.9") | ||
for f in `echo $iterative_yes` | ||
do | ||
unclustered_strains_count=$(sed '1d' $f/clusters_with_no_unique_SNP.txt | wc -l) | ||
clusters_wo_uniq=$(sed '1d' $f/clusters_with_no_unique_SNP.txt | cut -f2 | sort | uniq | wc -l) | ||
total_cluster_count=$(grep -c "." $f/clusters.txt) | ||
assign_thres=$(cut -f2- -d'.' <<< $f) | ||
median_cluster_SNPs=$(wc -l $f/*.Input.txt | grep -v "total" | sed -r 's/^ *([0-9]+)/\1\t/' | cut -f1 | sort -n | awk '{arr[NR]=$1}END { if (NR%2==1) print arr[(NR+1)/2]; else print (arr[NR/2]+arr[NR/2+1])/2}') | ||
head -4 log.txt > $f/misc.txt | ||
|
||
echo "### SNP assignment threshold $assign_thres with iterative clustering output ######" >> log.txt | ||
echo "Total number of clusters = $total_cluster_count" >> log.txt | ||
echo "Median unique SNPs in clusters = $median_cluster_SNPs" >> log.txt | ||
echo "Could not generate unique SNPs for $clusters_wo_uniq clusters containing a total of $unclustered_strains_count strains" >> log.txt | ||
echo "See $ODR/$f/clusters_with_no_unique_SNP.txt for more details" >> log.txt | ||
echo "Database created with above parameters located in $ODR/$f " >> log.txt | ||
echo "################################################################################## | ||
" >> log.txt | ||
done | ||
|
||
for f in `echo $iterative_no` | ||
do | ||
unclustered_strains_count=$(sed '1d' $f/clusters_with_no_unique_SNP.txt | wc -l) | ||
clusters_wo_uniq=$(sed '1d' $f/clusters_with_no_unique_SNP.txt | cut -f2 | sort | uniq | wc -l) | ||
total_cluster_count=$(grep -c "." $f/clusters.txt) | ||
assign_thres=$(cut -f2- -d'.' <<< $f) | ||
median_cluster_SNPs=$(wc -l $f/*.Input.txt | grep -v "total" | sed -r 's/^ *([0-9]+)/\1\t/' | cut -f1 | sort -n | awk '{arr[NR]=$1}END { if (NR%2==1) print arr[(NR+1)/2]; else print (arr[NR/2]+arr[NR/2+1])/2}') | ||
head -4 log.txt > $f/misc.txt | ||
|
||
echo "### SNP assignment threshold $assign_thres without iterative clustering output ######" >> log.txt | ||
echo "Total number of clusters = $total_cluster_count" >> log.txt | ||
echo "Median unique SNPs in clusters = $median_cluster_SNPs" >> log.txt | ||
echo "Could not generate unique SNPs for $clusters_wo_uniq clusters containing a total of $unclustered_strains_count strains" >> log.txt | ||
echo "See $f/clusters_with_no_unique_SNP.txt for more details" >> log.txt | ||
echo "Database created with above parameters located in $ODR/$f " >> log.txt | ||
echo "################################################################################## | ||
" >> log.txt | ||
done | ||
fi | ||
if [ "$KEEP" == "false" ]; then | ||
rm -rf $ODR/Roary | ||
fi | ||
|
||
rm smeg.error.log.txt | ||
echo "####### DONE #####" |
Oops, something went wrong.