-
Notifications
You must be signed in to change notification settings - Fork 0
/
fclassifier.sh
272 lines (218 loc) · 10.6 KB
/
fclassifier.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
#!/bin/bash
#"FAST FUNGAL GENOME CLASSIFIER": A TOOL FOR QUICKLY AND ACCURATELY CLASSIFYING FUNGAL GENOMES
####################################################################################################################
####################################################################################################################
# #
#COPYRIGHT="Copyright (C) 2022 Ayixon Sánchez Reyes" #
# #
# This program is free software: you can redistribute it and/or modify it under the terms of the GNU #
# General Public License as published by the Free Software Foundation, either version 3 of the License, or #
# (at your option) any later version.This program is distributed WITHOUT ANY WARRANTY. #
# #
####################################################################################################################
####################################################################################################################
# DEPENDENCIES: JolyTree, apcalc, ncbi-entrez-direct, orthoani, Blast, Biopython, bPTP,mPTP #
# #
##########Before you begin, install the following: #
#
########## sudo apt install apcalc
########## sudo apt install ncbi-entrez-direct
########## sudo apt install ncbi-blast+
########## conda install -c bioconda fastani
########## conda install -c bioconda jolytree
########## https://github.com/Pas-Kapli/mptp
########## conda install -c bfurneaux bptp
####################################################################################################################
# #
####################################################################################################################
# Usage: ./fclassifier.sh -i query_genome -d database.msh - m <model> mptp or bptp (bPTP by deafult) #
# The working directory must contain the mash database (.msh) and the query genome in fasta format #
####################################################################################################################
# Rational: Compare a query_genome vs a curated MASH database, select the nearest phylogenetic neighbors; #
# Estimate the ANI of the query vs the references, #
# JolyTree estimate a rapid and robust genome based phylogeny
# The tree is subjected to speciation hypothesis testing under Poisson Tree Processes Model #
#
# THIS GENOME CLASSIFIER deals with the "Phylophenetic Species Concept"
# AND WITH "Molecular Species Delimitation" by testing three working hypotheses: #
# The Genomic Coherence measured through the genomic distance of Mash and the ANI #
# The Phylogenetic Hypothesis of monophyly
# The speciation or coalescence hypothesis under the Poisson tree processes model #
# This makes it possible to evaluate phenetic, genomic and evolutionary-molecular
# elements in a corpus of speciation #
#
# Ayixon Sánchez-Reyes ayixon@gmail.com #
# Computational Microbiology #
# Microbiological Observatory #
# Institute of Biotechnology, UNAM, Cuernavaca, MEXICO #
#############################################################################################################
# #
# ============ #
# = VERSIONS = VERSION=1.0. Written by Ayixon Sánchez Reyes #
# VERSION=1.0 #
# ============ #
#############################################################################################################
#############################################################################################################
# Use getopts to parse the input options
start=`date +%s.%N`
function display_usage {
echo "Usage: $0 -i <input_file> -d <database_file> -m <model>"
echo
echo "Options:"
echo " -i <input_file> Input fasta, fna, or fa archive"
echo " -d <database_file> Database file in .msh format"
echo " -m <model> Select between two models: mptp or bptp"
echo " -h Display this help message"
}
input_file=""
database_file=""
model=""
while getopts "i:d:m:h" opt; do
case $opt in
i)
input_file="$OPTARG"
;;
d)
database_file="$OPTARG"
;;
m)
model="$OPTARG"
;;
h)
display_usage
exit 0
;;
\?)
echo "Invalid option: -$OPTARG" >&2
exit 1
;;
:)
echo "Option -$OPTARG requires an argument." >&2
exit 1
;;
esac
done
if [ -z "$model" ]; then
model="bptp"
fi
if [ "$model" == "mptp" ]; then
# Execute mptp command
echo "Executing mptp command"
else
echo "bptp selected by default"
fi
# Estimate Mash distance with the input files
mash dist $input_file $database_file -p 10| sort -n -k3 > output.mash.txt;
echo ""
echo -e " \e[0;32mMASH analysis finished. These are the top hits \e[0m "
echo " Query Reference D p_value shared hashed"
head output.mash.txt
cut -f3 output.mash.txt |head -n 500 > distancias
uniq distancias > dist.uniq
echo ""
echo -e " \e[0;32mPrinting the distance table (Mash D) \e[0m "
echo ""
echo -e " \e[0;32mThese are the closest genomes to your query and their approximate ANI \e[0m "
# Approximate ANI estimation as 1-Distance
echo ""
for i in $(fmt dist.uniq)
do sort -n -k3 output.mash.txt| grep -m1 -F "$i"
echo ""
var1=$(calc 1-"$i")
echo -e " \e[0;32mComputing approximate ANI \e[0m "
echo $var1
echo "---------"
if ((`bc <<< "$var1>=0.95"`)) ; then
echo -e '\e[0;33mGenomic Coherence Detected \e[0m '
echo ""
echo "--------------------------"
fi
done
echo "--------"
echo ""
#Selection and download of the neighboring genomes with the smallest genomic distance
echo -e " \e[0;32mDownloading the reference genomes from NCBI \e[0m "
echo "#####################################"
for z in $(fmt dist.uniq)
do grep -F -m 5 "$z" output.mash.txt| awk '{print $2}'| sed 's/_[A-Z]/ / gI' | awk '{print $1}' >> Genome_accnumber.txt
done
echo ""
cat Genome_accnumber.txt | while read -r acc ; do
esearch -db assembly -query $acc </dev/null \
| esummary \
| xtract -pattern DocumentSummary -element FtpPath_GenBank \
| while read -r url ; do
fname=$(echo $url | grep -o 'GCA_.*' | sed 's/$/_genomic.fna.gz/') ;
wget "$url/$fname" ;
done ;
done
echo ""
echo -e " \e[0;32mDownload successful \e[0m "
echo ""
gzip -d *.gz
#Genomes are copied to a separate directory for phylogenetic analysis
#working directory cleanup operations
echo -e " \e[0;32mStarting Phylogenetic Analysis with JolyTree \e[0m "
echo "##############################################"
mkdir Mash_out
cp dist.uniq distancias Mash_out/
mkdir JolyTree_in
cp *.fna JolyTree_in/
cp *.fasta JolyTree_in/
cp *.fa JolyTree_in/
JolyTree.sh -i JolyTree_in/ -b out_tree -t 10
mkdir JolyTree_out
cp out_tree* JolyTree_out/
echo ""
echo -e " \e[0;32m#Phylogenetic analysis finished, results are in the directory JolyTree_out \e[0m"
echo ""
echo -e " \e[0;32m#Computing fastANI \e[0m "
echo "--------------------"
echo Genome_Query: $input_file
rm *.fna
rm out_tree*
cp Genome_accnumber.txt output.mash.txt Mash_out/
rm dist.uniq distancias output.mash.txt Genome_accnumber.txt
cd JolyTree_in/
for ani in *.fna
do fastANI -q $input_file -r $ani -t 4 --fragLen 1000 --minFraction 0.1 -o $ani.txt
done
echo "--------------------"
echo -e " \e[0;32m#fastANI done \e[0m "
echo "--------------------"
awk '{print $1,$2,$3}' *.txt
#awk '{print substr(FILENAME, 3)" \""$0"\" "}' ./*.txt|sort -t\" -nrk2 | sed 's/_[A-Z]/ / gI'| awk '{print $1,"ANI=", $NF}'
cd ..
mkdir fastANI_out
cp JolyTree_in/*.txt fastANI_out/
echo ""
rm JolyTree_in/*.txt
rm *.gz
echo ""
echo -e " \e[0;32m#Computing Species Delimitation under Markov Chain Monte Carlo \e[0m "
mkdir sp_Results; \
cd JolyTree_out/; \
#------------------------------------------------------------------------------------------------
if [[ $model == mptp ]]; then
mptp --mcmc 500000 --single --mcmc_sample 10000 --mcmc_burnin 100000 \
--tree_file out_tree.nwk --output_file PTP_sp --tree_show ; \
else
echo -e " \e[0;32m#mptp was not selected, it is ok; lets try bptp algorithm \e[0m "
fi
if [[ $model == bptp ]]; then
bPTP.py -t out_tree.nwk -o PTP_sp -s 1234 -r -i 1000000
fi
#------------------------------------------------------------------------------------------------
cd .. ; \
cp JolyTree_out/PTP* sp_Results/ ; \
echo -e " \e[0;32mSpeciation test done, look at the sp_Results directory \e[0m "
echo ""
echo -e " \e[0;32m Fast Fungal Genome Classifier pipeline done \e[0m "
echo ""
echo -e " \e[0;33mWe recommend that you test the phylophenetic hypothesis about *Diagnostic Characters* \e[0m "
echo ""
echo "Thank you for your support, enjoy it"
echo ""
end=`date +%s.%N`
runtime=$( echo "$end - $start" | bc )
echo Time $runtime seconds