diff --git a/build_sp b/build_sp new file mode 100644 index 0000000..9ba31e5 --- /dev/null +++ b/build_sp @@ -0,0 +1,387 @@ +#!/bin/bash +############################################# +############################################ +# build species database +############################################ +############################################ + +cd $WDR +GDR=$(readlink -f $GEN_DIR) +echo "$package option activated" +echo "$WDR is present directory" +echo "$GDR is the genomes directory" +echo "$ODR is the output directory" + +cd $GDR +exec 3>&2 +exec 2> /dev/null +if [ "$LIST" == "false" ]; then +ls {*.fna,*.fa,*.fasta} > $ODR/genomes.txt #suppress error to std output +else +cat $LIS > $ODR/genomes.txt +fi +exec 2>&3 + +echo "####### PRE PROCESSING GENOMES ##### +" + +#################### +exec 3>&2 +exec 2> /dev/null +for f in `cat $ODR/genomes.txt` +do +awk '{if (substr($0,1) ~ "plasmid" ) censor=1; else if (substr($0,1,1) == ">") censor=0; if (censor==0) print $0}' $f > $ODR/$f.noplasmid + +cd $ODR +awk '{if (substr($0,1) ~ "phage" ) censor=1; else if (substr($0,1,1) == ">") censor=0; if (censor==0) print $0}' $f.noplasmid > $f.noplasmid1 +awk '{if (substr($0,1) ~ "extrachomosomal" ) censor=1; else if (substr($0,1,1) == ">") censor=0; if (censor==0) print $0}' $f.noplasmid1 > $f.noplasmid +rename $f.noplasmid $f $f.noplasmid +rm $f.noplasmid1 +num_of_contigs=$(grep -c ">" $f) +echo -e "$f\t$num_of_contigs" >> num_of_contigs.txt +cd $GDR +done +exec 2>&3 +################### + +cd $ODR + +if [[ ! -s num_of_contigs.txt ]]; then +echo "ERROR: Appropriate fasta files not found in Genomes directory" +exit 1 +fi + +awk '$2 == 1' num_of_contigs.txt | cut -f1 > reference_genomes.txt +awk 'FNR==NR { a[$NF]; next } !($NF in a)' reference_genomes.txt <(cut -f1 num_of_contigs.txt ) > draft_genomes.txt + +rm num_of_contigs.txt +rm genomes.txt +mkdir reordered_contigs +################################################# + +if [ -s reference_genomes.txt ]; then +echo "Complete reference genome(s) detected" +else +echo "ERROR: At least, a complete reference genome is required to reorder contigs of draft genomes" +rm reference_genomes.txt draft_genomes.txt +exit 1 +fi + + +for f in `cat reference_genomes.txt` +do +cd $GDR +grep -v ">" $f | tr -d '[:space:]' | fold -w 60 | sed "1 i\>$f" | sed -e '$a\ ' > $ODR/reordered_contigs/$f.fna +cd $ODR +done + +######### get position of dnaA ###### +cd reordered_contigs +ls *.fna > reordered_ref_genomes +cat $(cat reordered_ref_genomes) > $ODR/ref.fa +rm reordered_ref_genomes +cd $ODR + +blastn -query ref.fa -db $SMEG_DIR/dnaA_database/dnaa.fasta -evalue 0.05 -num_threads $NUM_THREAD -max_target_seqs 1 -out dnaa_id -word_size 11 -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen" || exit 1 + + +cut -f1 dnaa_id | sort | uniq > temp.txt +for i in `cat temp.txt` +do +grep -w "$i" dnaa_id | sort -nr -k12,12 | head -1 | cut -f1,7,13 >> temp.dnaA_output.txt +done + +###### User defined Representative genome +if [ "x$REPGEN" != "x" ]; then + grep -P '(^|\s)\K'$REPGEN'(?=\s|$)' temp.dnaA_output.txt | awk '{printf "%s\t%4.3f\n", $1 , $2/$3 }' > dnaA_output.txt +else +awk '{printf "%s\t%4.3f\n", $1 , $2/$3 }' temp.dnaA_output.txt | sort -n -k2,2 | head -1 > dnaA_output.txt +fi + +rm temp* dnaa_id ref.fa + + +if [[ ! -s dnaA_output.txt ]]; then +repGenome=$(head -1 reference_genomes.txt) +dnaA_pos=0.5 +else +repGenome=$(cut -f1 dnaA_output.txt) +dnaA_pos=$(cut -f2 dnaA_output.txt) +fi + +rm dnaA_output.txt +######################### + +no_of_comp_genomes=$(grep -c "." reference_genomes.txt) +no_of_draft_genomes=$(grep -c "." draft_genomes.txt) + +echo "Number of complete genomes = $no_of_comp_genomes" >> log.txt +echo "Number of draft genomes = $no_of_draft_genomes" >> log.txt +echo "Selected representative genome is $repGenome" >> log.txt +echo "dnaA position relative to ori is $dnaA_pos " >> log.txt +echo " " >> log.txt + +if [ -s draft_genomes.txt ]; then +############## Reorder contigs using Mauve ########################### +echo "####### RE-ORDERING CONTIGS OF DRAFT GENOME(S) (MAUVE)##### +" +for i in `cat draft_genomes.txt` +do +mkdir $i.dir +prefix=$(type -P Mauve | rev | cut -d'/' -f3- | rev) +mauvepath=$(find $prefix -name "Mauve.jar") +echo "java -cp $mauvepath -Djava.awt.headless=true org.gel.mauve.contigs.ContigOrderer -output $ODR/$i.dir -ref $ODR/reordered_contigs/$repGenome.fna -draft $ODR/$i" >> parallel_commands +done +cat parallel_commands | parallel '{}' >> Mauve.error.log.txt 2>&1 + +rm parallel_commands + +for i in `cat draft_genomes.txt` +do +cd $i.dir +aa=$(ls alignment*/$i.fas | sort | tail -1) || exit 1 +grep -v ">" $aa | tr -d '[:space:]' | fold -w 60 | sed "1 i\>$i" | sed -e '$a\ ' > $ODR/reordered_contigs/$i.fna || exit 1 +cd ../ +done +rm Mauve.error.log.txt +else +echo "Only complete genomes identified" +fi + + +exec 3>&2 +exec 2> /dev/null +rm reference_genomes.txt +rm draft_genomes.txt +rm -rf *.dir +rm temp.txt +rm {*.fna,*.fa,*.fasta} +exec 2>&3 + +########### Run prokka ################ +cd $ODR/reordered_contigs +ls *.fna > ../genomes.txt +cd $ODR +mkdir gff +for f in `cat genomes.txt` +do +aa=$( echo "$f" | rev | cut -d'.' -f2- | rev) +cat $ODR/reordered_contigs/$f | sed 's/>.*/>temp/' > $ODR/$f +echo "prokka --quiet --kingdom Bacteria --outdir $aa.dir --locustag $aa --prefix $aa $ODR/$f" >> parallel_commands +done + +echo "####### ANNOTATING GENOMES (PROKKA) ##### +" + + +cat parallel_commands | parallel '{}' >> prokka.error.log.txt 2>&1 +for f in *.dir +do +cp $f/*.gff gff/. || exit 1 +rm -rf $f +done + +exec 3>&2 +exec 2> /dev/null +rm {*.fna,*.fa,*.fasta} +exec 2>&3 + +rm prokka.error.log.txt +rm parallel_commands +rm genomes.txt +############ Run Roary #################### +cd $ODR +echo "####### CORE-GENOME ANALYSIS (ROARY) ##### +" +roary -p $NUM_THREAD -f ./Roary -e -n -s $ODR/gff/*.gff || exit 1 +rm -rf gff/ + + +################ Generate phylogenetic tree ######### + +FastTree -nt -quiet -gtr $ODR/Roary/core_gene_alignment.aln > tree.newick || exit 1 + + +######## If -e flag is chosen ###### +if [ "$REF_ONLY" == "true" ]; then +Rscript $SMEG_DIR/create_clusters.R -i clusterOutput.txt -c 0 +cp $ODR/Roary/core_alignment_header.embl . +cp $ODR/Roary/core_gene_alignment.aln . +if [ "$KEEP" == "false" ]; then +rm -rf $ODR/Roary +fi + exit 0 +fi + + +############ Now build SMEG database ######## +cp reordered_contigs/$repGenome.fna . + +echo "####### BUILDING SMEG DATABASE(S) ##### +" + +cat <> parallel.sh +#!/bin/bash +if [ "$IGNORE_ITER" == "true" ]; then +count=2 +else +count=0 +fi + +touch breakCluster.txt +until [ ! \$count -lt 3 ] +do +Rscript $SMEG_DIR/create_clusters.R -i clusterOutput.txt -c 0 + +if ! [[ -s clusterOutput.txt ]]; then echo "ERROR: clusterOutput.txt empty. Unable to generate clusters"; exit 1 ; fi +$SMEG_DIR/uniqueClusterSNP $ODR/Roary/core_gene_alignment.aln clusterOutput.txt $NUM_THREAD $SAT ClusterSNPs.txt +cut -f3 ClusterSNPs.txt | grep -v "cluster" | sort | uniq -c | sed -r 's/^ *([0-9]+)/\1\t/' | awk '\$2 != 0 && \$1 < '$CNT'' | cut -f2 > breakCluster.txt || exit 1 +num_of_cluS_rerun=\$(grep -c "." breakCluster.txt) +count=\`expr \$count + 1\` +echo "iteration count \$count" + +if [ \$num_of_cluS_rerun -lt 1 ] +then +count=\`expr \$count + 100000\` +fi +done + +rm breakCluster.txt +awk '\$3 != 0' ClusterSNPs.txt > ClusterSNPs_final.txt +rm ClusterSNPs.txt + +samtools faidx $ODR/Roary/core_gene_alignment.aln $repGenome > $repGenome.aln +Rscript $SMEG_DIR/getPositionWithoutGaps.R -i ClusterSNPs_final.txt -x $repGenome.aln -m 0 + +grep "label=" $ODR/Roary/core_alignment_header.embl | cut -f2 -d'=' > core_genes.txt +grep "feature" $ODR/Roary/core_alignment_header.embl | rev | cut -d' ' -f1 | rev | sed 's/\../ /g' | awk '{print (\$1 - 1) "\t" \$2}' | sed "s/^/$repGenome /" > core_gene_coordinate_in_align + +paste -d'\t' core_gene_coordinate_in_align core_genes.txt > bedfile +rm core_gene_coordinate_in_align core_genes.txt + +samtools faidx $repGenome.aln +bedtools getfasta -fi $repGenome.aln -bed bedfile -name | sed 's/-//g' | fold -w 60 >> core_genes2.fa +awk 'BEGIN {RS = ">" ; FS = "\n" ; ORS = ""} \$2 {print ">"\$0}' core_genes2.fa > core_genes.fa +rm core_genes2.fa +rm bedfile + +awk '/^>/ {if (seqlen){print seqlen}; print ;seqlen=0;next; } { seqlen += length(\$0)}END{print seqlen}' core_genes.fa | grep -v ">" > geneLengths +grep ">" core_genes.fa | sed 's/>//g' > geneNames +paste <(cat geneLengths | awk '{total += \$0; \$0 = total - \$0}1') <(cat geneLengths | awk '{total += \$0; \$0 = total}1') <(cat geneNames) --delimiters '\t' > geneCoordinates.txt +rm geneLengths geneNames +######## +makeblastdb -in $repGenome.fna -parse_seqids -dbtype nucl +blastn -query core_genes.fa -db $repGenome.fna -evalue 5 -num_threads 1 -max_target_seqs 1 -outfmt 6 -out $repGenome.blast.txt -word_size 11 + +cut -f1 $repGenome.blast.txt | sort | uniq > tmp.txt + +for i in \`cat tmp.txt\` +do +grep -w "\$i" $repGenome.blast.txt | sort -nr -k12,12 | head -1 | cut -f2,9,10 > temp1 +grep -w "\$i" $repGenome.blast.txt | sort -nr -k12,12 | head -1 | cut -f1 > temp2 +paste -d'\t' temp1 temp2 >> $repGenome.core.geneCood.txt +done + +rm temp* tmp.txt +rm $repGenome.blast.txt core_genes.fa + +######################## +sed '1d' modified_uniq_cluster_SNPs.txt | cut -f1 | sort | uniq > clusters.txt || exit 1 +########### +Rscript $SMEG_DIR/getPositioninRef.R -i modified_uniq_cluster_SNPs.txt -x $repGenome.core.geneCood.txt -y geneCoordinates.txt +########## + +for strains in \`cat clusters.txt\` +do +grep -P '(^|\s)\K'\$strains'(?=\s|$)' newcoordinates.txt > \$strains.Input.txt || exit 1 +done + +rm newcoordinates.txt modified_uniq_cluster_SNPs.txt ClusterSNPs_final.txt tree.newick +rm $repGenome.aln $repGenome.aln.fai +rm $repGenome.fna.n* + +mkdir Index +bowtie2-build $repGenome.fna Index/$repGenome.fna -q || exit 1 +samtools faidx $repGenome.fna + +cp $ODR/Roary/core_alignment_header.embl . +cp $ODR/Roary/core_gene_alignment.aln . + +awk -F'\t' 'NR==FNR{c[\$1]++;next};c[\$2] > 0' <(grep -w -v -f <(cat clusters.txt | rev | cut -d'.' -f3 | rev | cut -c8-) <(cut -f2 clusterOutput.txt)) clusterOutput.txt > clusters_with_no_unique_SNP.txt +printf '1\ni\nStrain\tclusterID\n.\nw\n' | ed -s clusters_with_no_unique_SNP.txt +EOT + +if [ "$AUTO" == "false" ]; then +echo "bash parallel.sh" >> parallel_commands +else +thresholds=$(echo "0.4 0.5 0.6 0.7 0.8 0.9") +for z in `echo $thresholds` +do +sed "s/clusterOutput.txt $NUM_THREAD $SAT/clusterOutput.txt $NUM_THREAD $z/g" parallel.sh | sed "s/\"$IGNORE_ITER\" ==/\"false\" ==/g" | sed "2i mkdir F.$z" | sed "3i cd F.$z" | sed "4i cp $ODR/tree.newick ." | sed "5i cp $ODR/$repGenome.fna ." > parallel.$z.F.sh +sed "s/clusterOutput.txt $NUM_THREAD $SAT/clusterOutput.txt $NUM_THREAD $z/g" parallel.sh | sed "s/\"$IGNORE_ITER\" ==/\"true\" ==/g" | sed "2i mkdir T.$z" | sed "3i cd T.$z" | sed "4i cp $ODR/tree.newick ." | sed "5i cp $ODR/$repGenome.fna ." > parallel.$z.T.sh +echo "bash parallel.$z.F.sh" >> parallel_commands +echo "bash parallel.$z.T.sh" >> parallel_commands +done +fi + +cat parallel_commands | parallel '{}' | grep -v "sites screened" >> smeg.error.log.txt 2>&1 || exit 1 +rm parallel_commands parallel.* + +if [ "$AUTO" == "false" ]; then +unclustered_strains_count=$(sed '1d' clusters_with_no_unique_SNP.txt | wc -l) +clusters_wo_uniq=$(sed '1d' clusters_with_no_unique_SNP.txt | cut -f2 | sort | uniq | wc -l) +total_cluster_count=$(grep -c "." clusters.txt) + +echo "Total number of clusters = $total_cluster_count" >> log.txt +echo "Could not generate unique SNPs for $clusters_wo_uniq clusters containing a total of $unclustered_strains_count strains" >> log.txt +echo "See clusters_with_no_unique_SNP.txt for more details" >> log.txt +echo " " +else +iterative_yes=$(echo "F.0.4 F.0.5 F.0.6 F.0.7 F.0.8 F.0.9") +iterative_no=$(echo "T.0.4 T.0.5 T.0.6 T.0.7 T.0.8 T.0.9") +for f in `echo $iterative_yes` +do +unclustered_strains_count=$(sed '1d' $f/clusters_with_no_unique_SNP.txt | wc -l) +clusters_wo_uniq=$(sed '1d' $f/clusters_with_no_unique_SNP.txt | cut -f2 | sort | uniq | wc -l) +total_cluster_count=$(grep -c "." $f/clusters.txt) +assign_thres=$(cut -f2- -d'.' <<< $f) +median_cluster_SNPs=$(wc -l $f/*.Input.txt | grep -v "total" | sed -r 's/^ *([0-9]+)/\1\t/' | cut -f1 | sort -n | awk '{arr[NR]=$1}END { if (NR%2==1) print arr[(NR+1)/2]; else print (arr[NR/2]+arr[NR/2+1])/2}') +head -4 log.txt > $f/misc.txt + +echo "### SNP assignment threshold $assign_thres with iterative clustering output ######" >> log.txt +echo "Total number of clusters = $total_cluster_count" >> log.txt +echo "Median unique SNPs in clusters = $median_cluster_SNPs" >> log.txt +echo "Could not generate unique SNPs for $clusters_wo_uniq clusters containing a total of $unclustered_strains_count strains" >> log.txt +echo "See $ODR/$f/clusters_with_no_unique_SNP.txt for more details" >> log.txt +echo "Database created with above parameters located in $ODR/$f " >> log.txt +echo "################################################################################## + +" >> log.txt +done + +for f in `echo $iterative_no` +do +unclustered_strains_count=$(sed '1d' $f/clusters_with_no_unique_SNP.txt | wc -l) +clusters_wo_uniq=$(sed '1d' $f/clusters_with_no_unique_SNP.txt | cut -f2 | sort | uniq | wc -l) +total_cluster_count=$(grep -c "." $f/clusters.txt) +assign_thres=$(cut -f2- -d'.' <<< $f) +median_cluster_SNPs=$(wc -l $f/*.Input.txt | grep -v "total" | sed -r 's/^ *([0-9]+)/\1\t/' | cut -f1 | sort -n | awk '{arr[NR]=$1}END { if (NR%2==1) print arr[(NR+1)/2]; else print (arr[NR/2]+arr[NR/2+1])/2}') +head -4 log.txt > $f/misc.txt + +echo "### SNP assignment threshold $assign_thres without iterative clustering output ######" >> log.txt +echo "Total number of clusters = $total_cluster_count" >> log.txt +echo "Median unique SNPs in clusters = $median_cluster_SNPs" >> log.txt +echo "Could not generate unique SNPs for $clusters_wo_uniq clusters containing a total of $unclustered_strains_count strains" >> log.txt +echo "See $f/clusters_with_no_unique_SNP.txt for more details" >> log.txt +echo "Database created with above parameters located in $ODR/$f " >> log.txt +echo "################################################################################## + +" >> log.txt +done +fi +if [ "$KEEP" == "false" ]; then +rm -rf $ODR/Roary +fi + +rm smeg.error.log.txt +echo "####### DONE #####" diff --git a/growth_est_ref b/growth_est_ref new file mode 100644 index 0000000..6febdf7 --- /dev/null +++ b/growth_est_ref @@ -0,0 +1,247 @@ +#!/bin/bash +############################################# +############################################ +# growth estimation reference-based +############################################ +############################################ + +cd $WDR +RDR=$(readlink -f $READS_DIR) +if [ $DESMAN == "false" ]; then SPEC=$(readlink -f $SPECIES_DIR) ; fi +echo "$package option activated" +echo "$WDR is present directory" +echo "$RDR is the reads directory" +echo "$ODR is the output directory" +echo "Reference-based method activated" + +################ + +cd $ODR +######################## +# USE REFERENCE GENOME TO GET POSITION OF CORE GENES +########################### +if [ $DESMAN == "false" ]; then +if ! [[ -s $GEN_LIS ]]; then echo "ERROR: Input file is empty"; exit 1 ; fi +samtools faidx $SPEC/core_gene_alignment.aln $(cat $GEN_LIS) > strains.aln || exit 1 +$SMEG_DIR/uniqueSNPmultithreading strains.aln 2 2 || exit 1 + +if ! [[ -s uniqueSNPs ]]; then echo "ERROR: The generated uniqueSNPs file is empty. Check your input file or GCC version"; exit 1 ; fi + +############################### +############################### +for ref in `cat $GEN_LIS` +do +reference=$(echo "$ref" | rev | cut -d'/' -f1 | rev) || exit 1 + +if [[ -e "$SPEC/misc.txt" ]]; then +cp $SPEC/../reordered_contigs/$reference.fna . +else +cp $SPEC/reordered_contigs/$reference.fna . || exit 1 +fi + +samtools faidx $SPEC/core_gene_alignment.aln $reference > $reference.aln || exit 1 +Rscript $SMEG_DIR/getPositionWithoutGaps.R -i uniqueSNPs -x $reference.aln -m 1 || exit 1 + +grep "label=" $SPEC/core_alignment_header.embl | cut -f2 -d'=' > core_genes.txt || exit 1 +grep "feature" $SPEC/core_alignment_header.embl | rev | cut -d' ' -f1 | rev | sed 's/\../ /g' | awk '{print ($1 - 1) "\t" $2}' | sed "s/^/$reference /" > core_gene_coordinate_in_align || exit 1 + +paste -d'\t' core_gene_coordinate_in_align core_genes.txt > bedfile +rm core_gene_coordinate_in_align core_genes.txt + +bedtools getfasta -fi $reference.aln -bed bedfile -name | sed 's/-//g' | fold -w 60 >> core_genes2.fa || exit 1 +awk 'BEGIN {RS = ">" ; FS = "\n" ; ORS = ""} $2 {print ">"$0}' core_genes2.fa > core_genes.fa || exit 1 +rm core_genes2.fa +rm bedfile + +awk '/^>/ {if (seqlen){print seqlen}; print ;seqlen=0;next; } { seqlen += length($0)}END{print seqlen}' core_genes.fa | grep -v ">" > geneLengths +grep ">" core_genes.fa | sed 's/>//g' > geneNames +paste <(cat geneLengths | awk '{total += $0; $0 = total - $0}1') <(cat geneLengths | awk '{total += $0; $0 = total}1') <(cat geneNames) --delimiters '\t' > geneCoordinates.txt +rm geneLengths geneNames +######## +makeblastdb -in $reference.fna -parse_seqids -dbtype nucl || exit 1 +blastn -query core_genes.fa -db $reference.fna -evalue 5 -num_threads $NUM_THREAD -max_target_seqs 1 -outfmt 6 -out $reference.blast.txt -word_size 11 || exit 1 + +cut -f1 $reference.blast.txt | sort | uniq > tmp.txt + +for i in `cat tmp.txt` +do +grep -w "$i" $reference.blast.txt | sort -nr -k12,12 | head -1 | cut -f2,9,10 > temp1 || exit 1 +grep -w "$i" $reference.blast.txt | sort -nr -k12,12 | head -1 | cut -f1 > temp2 || exit 1 +paste -d'\t' temp1 temp2 >> $reference.core.geneCood.txt +done + +rm temp* tmp.txt +rm $reference.blast.txt core_genes.fa + +########### +# EXTRACT UNIQUE SNPS FROM INPUT STRAINS +########## + +Rscript $SMEG_DIR/getPositioninRef.R -i modified_uniq_cluster_SNPs.txt -x $reference.core.geneCood.txt -y geneCoordinates.txt +########### + +grep -P '(^|\s)\K'$reference'(?=\s|$)' newcoordinates.txt | sort | uniq > $reference.Input.txt || exit 1 + +rm newcoordinates.txt +rm $reference.fna.nsq +rm $reference.fna.nsi +rm $reference.fna.nsd +rm $reference.fna.nog +rm $reference.fna.nin +rm $reference.fna.nhr +rm $reference.core.geneCood.txt geneCoordinates.txt +bowtie2-build $reference.fna $reference.fna -q || exit 1 +echo "$reference" >> strain_list.txt +done +rm strains.aln uniqueSNPs modified_uniq_cluster_SNPs.txt +################ +##### +# DESMAN strains +else +################ +################ +if ! [[ -s $DESM ]]; then echo "ERROR: Input file is empty"; exit 1 ; fi +for f in `cat $DESM` +do +cp $f . +reference=$(echo "$f" | rev | cut -d'/' -f1 | rev) || exit 1 +grep -v ">" $reference | tr -d '[:space:]' | fold -w 60 | sed "1 i\>$reference" | sed -e '$a\ ' >> allStrains.fna +done + +samtools faidx allStrains.fna || exit 1 +$SMEG_DIR/uniqueSNPmultithreading allStrains.fna 2 2 || exit 1 +if ! [[ -s uniqueSNPs ]]; then echo "ERROR: The generated uniqueSNPs file is empty. Check your input file or GCC version"; exit 1 ; fi +sed '1d' uniqueSNPs | cut -f1 | sort | uniq > passed_strains.txt + +for strains in `cat passed_strains.txt` +do +grep -P '(^|\s)\K'$strains'(?=\s|$)' uniqueSNPs > $strains.Input.txt || exit 1 +echo "$strains" >> strain_list.txt +cat $strains | grep -v ">" | tr -d '[:space:]' | fold -w 60 | sed "1 i\>$strains" | sed -e '$a\ ' > $strains.fna +bowtie2-build $strains.fna $strains.fna -q || exit 1 +rm $strains +done + +############## + +rm uniqueSNPs allStrains.fna.fai +rm passed_strains.txt + +fi +####################################### +####################################### +#### Estimate growth rate +#### +####################################### +####################################### +########## +cd $RDR +if [ "$LIST" == "false" ]; then +ls *.$READS_EXT | sed "s/\.$READS_EXT$//" > $ODR/samples.txt || exit 1 +else +cat $LIS | sed "s/\.$READS_EXT$//" > $ODR/samples.txt || exit 1 +fi + +cd $ODR +for i in `cat samples.txt` +do +mkdir $i +cd $i + +for reference in `cat $ODR/strain_list.txt` +do +bowtie2 -x $ODR/$reference.fna -U $RDR/$i.$READS_EXT -S $i.sam -p $NUM_THREAD --quiet || exit 1 + +samtools view -@ $NUM_THREAD -bS $i.sam > $i.bam || exit 1 +rm $i.sam + +if [[ $MISMATCH == 9999 ]]; then +samtools view -@ $NUM_THREAD -b -F 4 $i.bam | samtools sort -@ $NUM_THREAD -o $i.sorted.bam || exit 1 +rm $i.bam +else +bamtools filter -tag "XM:<=$MISMATCH" -in $i.bam | bamtools filter -tag "XO:0" | bamtools filter -tag "XG:0" -out $i.2.bam || exit 1 +rm $i.bam +samtools view -@ $NUM_THREAD -b -F 4 $i.2.bam | samtools sort -@ $NUM_THREAD -o $i.sorted.bam || exit 1 +rm $i.2.bam +fi + +samtools faidx $ODR/$reference.fna || exit 1 +samtools mpileup -a -a -A -B -f $ODR/$reference.fna $i.sorted.bam > $i.pileup || exit 1 +$SMEG_DIR/pileupParser $i.pileup + +rm $i.pileup +rm $i.sorted.bam +################### +sed '1d' polymorphic.site.coverage > polymorphic.site.coverage2 +rm polymorphic.site.coverage + +Rscript $SMEG_DIR/SMEG_SNP.R -i $ODR/$reference.Input.txt -p polymorphic.site.coverage2 -o $reference.coord.txt + +awk '$8 ~ "a" || $8 ~ "A" {print $2"\011"$1"\011"$3}' $reference.coord.txt >> $reference.final.temp || exit 1 +awk '$8 ~ "t" || $8 ~ "T" {print $2"\011"$1"\011"$4}' $reference.coord.txt >> $reference.final.temp || exit 1 +awk '$8 ~ "g" || $8 ~ "G" {print $2"\011"$1"\011"$5}' $reference.coord.txt >> $reference.final.temp || exit 1 +awk '$8 ~ "c" || $8 ~ "C" {print $2"\011"$1"\011"$6}' $reference.coord.txt >> $reference.final.temp || exit 1 + +sort -n -k2,2 $reference.final.temp > $i.$reference.cov.txt + +rm $reference.final.temp +rm $reference.coord.txt +rm polymorphic.site.coverage2 + +done +Rscript $SMEG_DIR/SNP_method_ref.R -i $i.temp1.txt -c $COV_CUTOFF -d 0.5 -s $MIN_SNP +awk -F'\t' '{ $2 = ($2 > 10 ? 1 : $2) } 1' OFS='\t' $i.temp1.txt | awk '$3 > '$COV_CUTOFF'' | sed "s/$i.//g" > $i.SMEG.txt +rm $i.temp1.txt +printf '1\ni\nStrain\tSMEG\tCoverage\tNo of SNPs\tSMEG range\n.\nw\n' | ed -s $i.SMEG.txt + +rm $i.*.cov.txt +cd $ODR +done +cd $ODR +exec 3>&2 +exec 2> /dev/null +rm *.txt +rm *.bt2 +rm *.aln +rm *.fai +rm *.fna +exec 2>&3 + +cp */*.SMEG.txt $ODR/. || exit 1 +rm -rf */ +################# +# MERGE OPTION +################ +cd $ODR +if [ "$MERGE" == "true" ]; then +cat *.SMEG.txt | grep -v "SMEG" | cut -f1 | sort | uniq > SMEG.genomes +ls *.txt | sed 's/\.txt//g' > SMEG_list.txt + +for i in `cat SMEG_list.txt` +do +for f in `cat SMEG.genomes` +do +paste <(awk '$1 == "'$f'" {print $1}' SMEG.genomes ) <(awk '$1 == "'$f'" {print $2}' $i.txt ) >> $i.temp.merge.txt +done +printf '1\ni\nStrain\t'$i'\n.\nw\n' | ed -s $i.temp.merge.txt +done + +for f in *.temp.merge.txt +do +cut -f2 $f > $f.tmp +rm $f +done + +printf '1\ni\nStrain\n.\nw\n' | ed -s SMEG.genomes +paste -d'\t' SMEG.genomes *.temp.merge.txt.tmp > merged_table.txt +rm *.temp.merge.txt.tmp +rm SMEG_list.txt +rm SMEG.genomes +exec 3>&2 +exec 2> /dev/null +Rscript $SMEG_DIR/heatmap.R +exec 2>&3 +echo "run complete" +else +echo "run complete" +fi diff --git a/smeg b/smeg new file mode 100644 index 0000000..d8d041a --- /dev/null +++ b/smeg @@ -0,0 +1,505 @@ +#!/bin/bash + +package="" # Default to empty package +SMEG_DIR="$( cd "$( dirname "$(readlink -f ${BASH_SOURCE[0]})" )" && pwd )" +WDR=$(pwd) + +while getopts ":hv" opt; do + case ${opt} in + h ) + echo "Usage:" + echo " smeg build_species Build species database" + echo " smeg growth_est Estimate strain-specific growth rate" + echo " smeg -v Version" + echo " smeg -h Display this help message" + exit 0 + ;; + v ) + echo "smeg v1.1" + exit 0 + ;; + \? ) + echo "Invalid Option: -$OPTARG" 1>&2 + exit 1 + ;; + esac +done +########################################################################### +if [ $# -eq 0 ]; +then + echo "Usage:" + echo " smeg build_species Build species database" + echo " smeg growth_est Estimate strain-specific growth rate" + echo " smeg -v Version" + echo " smeg -h Display this help message" + exit 1 +fi +########################################################################## + +shift $((OPTIND -1)) + +subcommand=$1; +case "$subcommand" in + # Parse options to the first sub command + build_species) + package=$1; shift # Remove 'build_species' from the argument list + LIST=false + NUM_THREAD=4 + SAT=0.6 + CNT=50 + IGNORE_ITER=false + AUTO=false + KEEP=false + REF_ONLY=false + INT='^[0-9]+$' + FLOAT='^[0-9]+([.][0-9]+)?$' + # Process package options + while getopts ":g:o:l:p:r:s:t:hiake" opt; do + case ${opt} in + g ) + GEN_DIR=$OPTARG + ;; + o ) + OUTPUT_DIR=$OPTARG + ;; + l ) + LIST=$OPTARG + ;; + p ) + NUM_THREAD=$OPTARG + ;; + s ) + SAT=$OPTARG + ;; + t ) + CNT=$OPTARG + ;; + i ) + IGNORE_ITER=true + ;; + a ) + AUTO=true + ;; + r ) + REPGEN=$OPTARG + ;; + k ) + KEEP=true + ;; + e ) + REF_ONLY=true + ;; + h ) + echo "Usage:" + echo " smeg build_species " + echo " " + echo " -g Genomes directory" + echo " -o Output directory" + echo " -l File listing a subset of genomes for database building" + echo " [default = use all genomes in 'Genomes directory']" + echo " -p INT Number of threads [default 4]" + echo " -s FLOAT SNP assignment threshold (range 0.1 - 1) [default 0.6]" + echo " -t INT Cluster SNPs threshold for iterative clustering [default 50]" + echo " -i Ignore iterative clustering" + echo " -a Activate auto-mode" + echo " -r Representative genome [default = auto select Rep genome]" + echo " -k Keep Roary output [default = false]" + echo " -e Create database ONLY applicable with Reference-based SMEG method" + echo " [default = generate database suitable for both de novo and ref-based methods]" + echo " -h Display this message" + exit 0 + ;; + \? ) + echo "Invalid Option: -$OPTARG" 1>&2 + exit 1 + ;; + : ) + echo "Invalid Option: -$OPTARG requires an argument" 1>&2 + exit 1 + ;; + * ) + echo "Unimplemented option: -$OPTARG" >&2 + exit 1 + ;; +esac +done + +if [ $# -eq 0 ]; +then + echo "Usage:" + echo " smeg build_species " + echo " " + echo " -g Genomes directory" + echo " -o Output directory" + echo " -l File listing a subset of genomes for database building" + echo " [default = use all genomes in 'Genomes directory']" + echo " -p INT Number of threads [default 4]" + echo " -s FLOAT SNP assignment threshold (range 0.1 - 1) [default 0.6]" + echo " -t INT Cluster SNPs threshold for iterative clustering [default 50]" + echo " -i Ignore iterative clustering" + echo " -a Activate auto-mode" + echo " -r Representative genome [default = auto select Rep genome]" + echo " -k Keep Roary output [default = false]" + echo " -e Create database ONLY applicable with Reference-based SMEG method" + echo " [default = generate database suitable for both de novo and ref-based methods]" + echo " -h Display this message" + exit 1 +fi + +if [ "x" == "x$GEN_DIR" ]; then + echo "-g [option] is required" + exit 1 +fi +if ! [[ $NUM_THREAD =~ $INT ]] && [[ $NUM_THREAD != 1 ]] ; then + echo "-p [option] requires an integer" + exit 1 +fi +if [ "$LIST" != "false" ]; then +LIS=$(readlink -f $LIST) +fi +if ! [[ $SAT =~ $FLOAT ]] || [[ $SAT > 1 ]] || [[ $SAT < 0.1 ]]; then + echo "-s [option] requires a numeric value between 0.1 and 1" + exit 1 +fi +if ! [[ $CNT =~ $INT ]] && [[ $CNT != 50 ]]; then + echo "-t [option] requires an integer" + exit 1 +fi +if [[ $CNT == 0 ]]; then + echo "-t [option] requires a non-zero value" + exit 1 +fi +if [ "$AUTO" == "true" ] ; then + REF_ONLY=false +fi +if [ "x" == "x$OUTPUT_DIR" ]; then + directory=$(echo "species_database_$(date +%s)") + mkdir $WDR/$directory + OUTPUT_DIR=$WDR/$directory +fi + +shift $((OPTIND -1)) + ;; +############### + growth_est) + package=$1; shift # Remove 'growth_est' from the argument list + METHOD=0 + LIST=false + GEN_LIST=false + DESMAN=false + COV_CUTOFF=0.5 + READS_EXT="fastq" + NUM_THREAD=4 + CLUS_DET=0.2 + SAT=0.6 + INT='^[0-9]+$' + FLOAT='^[0-9]+([.][0-9]+)?$' + MERGE=false + MIN_SNP=100 + MISMATCH=9999 + # Process package options + while getopts ":r:o:s:u:d:n:l:m:x:t:c:a:g:p:eh" opt; do + case ${opt} in + r ) + READS_DIR=$OPTARG + ;; + x ) + READS_EXT=$OPTARG + ;; + o ) + OUTPUT_DIR=$OPTARG + ;; + s ) + SPECIES_DIR=$OPTARG + ;; + d ) + CLUS_DET=$OPTARG + ;; + c ) + COV_CUTOFF=$OPTARG + ;; + p ) + NUM_THREAD=$OPTARG + ;; + t ) + SAT=$OPTARG + ;; + m ) + METHOD=$OPTARG + ;; + e ) + MERGE=true + ;; + l ) + LIST=$OPTARG + ;; + g ) + GEN_LIST=$OPTARG + ;; + n ) + MISMATCH=$OPTARG + ;; + u ) + MIN_SNP=$OPTARG + ;; + a ) + DESMAN=$OPTARG + ;; + h ) + echo "Usage:" + echo " smeg growth_est " + echo " " + echo "" + echo " ## MAIN OPTIONS ## " + echo " -r Reads directory (single-end reads)" + echo " -x Sample filename extension (fq, fastq, fastq.gz) [default fastq]" + echo " -o Output directory" + echo " -s Species database directory" + echo " -m INT SMEG method (0 = de novo-based method, 1 = reference-based method) [default = 0]" + echo " -c FLOAT Coverage cutoff (>= 0.5) [default 0.5]" + echo " -u INT Minimum number of SNPs to estimate growth rate [default = 100]" + echo " -l Path to file listing a subset of reads for analysis" + echo " [default = analyze all samples in Reads directory]" + echo "" + echo " ## DE-NOVO BASED APPROACH OPTIONS ## " + echo " -d FLOAT Cluster detection threshold (range 0.1 - 1) [default = 0.2]" + echo " -t FLOAT Sample-specific SNP assignment threshold (range 0.1 - 1) [default = 0.6] +" + echo " ## REFERENCE BASED APPROACH OPTIONS ##" + echo " -g File listing reference genomes for growth rate estimation" + echo " -a File listing FULL PATH to DESMAN-resolved strains in fasta format (core-genes)" + echo " -n INT Max number of mismatch [default = use default bowtie2 threshold] +" + echo " ## OTHER OPTIONS ##" + echo " -e merge output tables into a single matrix file and generate heatmap" + echo " -p INT Number of threads [default 4]" + echo " -h Display this message" + exit 0 + ;; + \? ) + echo "Invalid Option: -$OPTARG" 1>&2 + exit 1 + ;; + : ) + echo "Invalid Option: -$OPTARG requires an argument" 1>&2 + exit 1 + ;; + esac + done + +if [ $# -eq 0 ]; +then + echo "Usage:" + echo " smeg growth_est " + echo " " + echo "" + echo " ## MAIN OPTIONS ## " + echo " -r Reads directory (single-end reads)" + echo " -x Sample filename extension (fq, fastq, fastq.gz) [default fastq]" + echo " -o Output directory" + echo " -s Species database directory" + echo " -m INT SMEG method (0 = de novo-based method, 1 = reference-based method) [default = 0]" + echo " -c FLOAT Coverage cutoff (>= 0.5) [default 0.5]" + echo " -u INT Minimum number of SNPs to estimate growth rate [default = 100]" + echo " -l Path to file listing a subset of reads for analysis" + echo " [default = analyze all samples in Reads directory]" + echo "" + echo " ## DE-NOVO BASED APPROACH OPTIONS ## " + echo " -d FLOAT Cluster detection threshold (range 0.1 - 1) [default = 0.2]" + echo " -t FLOAT Sample-specific SNP assignment threshold (range 0.1 - 1) [default = 0.6] +" + echo " ## REFERENCE BASED APPROACH OPTIONS ##" + echo " -g File listing reference genomes for growth rate estimation" + echo " -a FIle listing FULL PATH to DESMAN-resolved strains in fasta format (core-genes)" + echo " -n INT Max number of mismatch [default = use default bowtie2 threshold] +" + echo " ## OTHER OPTIONS ##" + echo " -e merge output tables into a single matrix file and generate heatmap" + echo " -p INT Number of threads [default 4]" + echo " -h Display this message" + exit 1 +fi + +if [ "x" == "x$READS_DIR" ]; then + echo "-r [option] is required" + exit 1 +fi +if [ "x" == "x$SPECIES_DIR" ] && [ "$DESMAN" == "false" ]; then + echo "-s [option] is required" + exit 1 +fi +if [ "$READS_EXT" != "fq" ] && [ "$READS_EXT" != "fastq" ] && [ "$READS_EXT" != "fastq.gz" ] ; then + echo "Invalid filename extension. Recognized extensions are fq, fastq, fastq.gz" + exit 1 +fi +if ! [[ $COV_CUTOFF =~ $FLOAT ]] && [[ $COV_CUTOFF != 1 ]]; then + echo "-c [option] requires a numeric value" + exit 1 +fi +if (( $(bc <<< "$COV_CUTOFF < 0.5") )); then + echo "required minimum coverage cutoff is 0.5" + exit 1 +fi +if [[ $METHOD != 1 ]] && [[ $METHOD != 0 ]]; then + echo "-m [option] requires an integer value of 0 or 1" + echo "de novo-based method = 0, reference-based method = 1" + exit 1 +fi + +if [ "$GEN_LIST" != "false" ] && [ "$DESMAN" != "false" ]; then +echo "-g [option] and -a [option] cannot be specified simultaneously" +exit 1 +fi +if [ "$GEN_LIST" != "false" ] || [ "$DESMAN" != "false" ]; then +METHOD=1 +if [ "$GEN_LIST" != "false" ]; then +GEN_LIS=$(readlink -f $GEN_LIST) +fi +if [ "$DESMAN" != "false" ]; then +DESM=$(readlink -f $DESMAN) +fi + +if ! [[ $MISMATCH =~ $INT ]] ; then + echo "-n [option] requires an integer" + exit 1 +fi +else +if ! [[ $SAT =~ $FLOAT ]] || [[ $SAT > 1 ]] || [[ $SAT < 0.1 ]]; then + echo "-t [option] requires a numeric value between 0.1 and 1" + exit 1 +fi +if ! [[ $CLUS_DET =~ $FLOAT ]] || [[ $CLUS_DET > 1 ]] || [[ $CLUS_DET < 0.1 ]]; then + echo "-d [option] requires a numeric value between 0.1 and 1" + exit 1 +fi +fi +if ! [[ $MIN_SNP =~ $INT ]] ; then + echo "-u [option] requires an integer " + exit 1 +fi +if ! [[ $NUM_THREAD =~ $INT ]] && [[ $NUM_THREAD != 1 ]] ; then + echo "-p [option] requires an integer" + exit 1 +fi + +if [[ $METHOD == 1 ]]; then +if [ "$GEN_LIST" == "false" ] && [ "$DESMAN" == "false" ]; then + echo "Reference-based method (-m = 1) requires input from -g or -a flags" + exit 1 +fi +fi +if [ "x" == "x$OUTPUT_DIR" ]; then + directory=$(echo "SMEG_growth_$(date +%s)") + mkdir $WDR/$directory + OUTPUT_DIR=$WDR/$directory +fi +if [ "$LIST" != "false" ]; then +LIS=$(readlink -f $LIST) +fi + + shift $((OPTIND -1)) + ;; +esac +################################### +if [ "$package" != "build_species" ] && [ "$package" != "growth_est" ]; then + echo "Invalid option" + exit 1 +fi +############################################# + + +echo " ################ Checking for dependencies ########" +requirements=$(echo "gcc parallel R Mauve roary prokka bowtie2 samtools bamtools bedtools blastn") +for f in `echo $requirements` +do +toolCheck=$(type -P $f) +if [ -z $toolCheck ]; then +echo "ERROR: $f missing" +echo "Check https://github.com/ohlab/SMEG for required packages" +exit 1 +else +echo "$f found" +fi +done +echo "All required packages found" +echo " ################ Checking for required R libraries ########" +###################3 +cd $WDR +ODR=$(readlink -f $OUTPUT_DIR) +mkdir -p $ODR + +if [ -z "$(ls -A $ODR)" ]; then + echo "Output directory ok" +else + echo "ERROR: Output directory not Empty" + exit 1 +fi +#################### +cd $ODR +Rscript $SMEG_DIR/check_R_libraries.R +if [ -s .checkedR ]; then +echo "ERROR: The following R libraries are missing" +cat .checkedR +echo "Check https://github.com/ohlab/SMEG for required packages" +rm .checkedR +exit 1 +else +echo "R libraries ok" +rm .checkedR +fi + +cd $SMEG_DIR +############################################## +########################################### +if [ "$package" == "build_species" ] + then +export GEN_DIR +export ODR +export NUM_THREAD +export SAT +export KEEP +export CNT +export IGNORE_ITER +export AUTO +export SMEG_DIR +export WDR +export package +export LIST +export REF_ONLY +export REPGEN +if [ "$LIST" != "false" ]; then +export LIS +fi +if [ "x$REPGEN" != "x" ]; then + export REPGEN +fi + +./build_sp +else +export SMEG_DIR +export WDR +export package +export SPECIES_DIR +export NUM_THREAD +export READS_DIR +export ODR +export METHOD +export LIST +export LIS +export MIN_SNP +export GEN_LIS +export GEN_LIST +export DESM +export DESMAN +export COV_CUTOFF +export READS_EXT +export CLUS_DET +export SAT +export MERGE +export MISMATCH +if [[ $METHOD == 0 ]]; then +./growth_est_denovo +else +./growth_est_ref +fi +fi