-
Notifications
You must be signed in to change notification settings - Fork 0
/
POGENOM_poly.sh
285 lines (251 loc) · 14.3 KB
/
POGENOM_poly.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
#!/bin/bash -l
wkd=$(pwd)
err_report() {
echo "Error on line $1 - script POGENOM_poly.sh"
if [ "$1" >= 236 ]; then echo "TIP - Look at the file $outdir/log_pogenom/pogenom_*$name.log"; fi
if test -f "tempo"; then rm tempo; fi
exit 1
}
trap 'err_report $LINENO' ERR
# --- default parameters
NS=1 # Minimum number of samples
extra_params="--min_count 10 --subsample 10" # Extra pogenom parameters
VCF_text="Required"
sizedir=$wkd/Input_POGENOM/Genome_sizes # Directory with Genome size files
gffdir="none" # Directory with gff files
outdir=POGENOM_OUTPUT # pogenom's Output directory
RI=no # option for forcing re-execution
dirfasta=none # Direcotry with genome's fasta files, e.g., dirfasta=$wkd/Input_POGENOM/RAW_DATA/Genomes/$dataset
ccode=$wkd/standard_genetic_code.txt # path to standard_genetic_code.txt file
rloci=none # Minimum ratio to include a loci in the analysis: number of samples with the loci/total number of samples in vcf file. Float number (upto 1) or 'none'
configFile=$wkd/Input_POGENOM/config_files/Input_POGENOM_config.json # path to Input_POGENOM config file
#Converting json to bash configuration and setting default parameters according to the Input_POGENOM parameter used
if [ -s $configFile ]; then
cat $configFile | sed s/"[{|}]"//g | sed s/":"/"="/g | sed s/",$"//g | sed s/'"'//g | sed s/" "//g > tempo
. tempo
extra_params="--min_count $min_coverage --subsample $min_coverage"
gffdir=$wkd/Input_POGENOM/GFF_files/$dataset
params="params_cov_"$min_coverage"_bdth_"$min_breadth"_mpq_"$mapqual"_bq_"$min_bsq_for_cov_median_calculation
if [ "$mode" == "prefilt" ]; then dts=$dataset"_prefilt"; else dts=$dataset; fi
vcfdir=$wkd/Input_POGENOM/06_VCF/$dts/$params # path to directory containing VCF files genearted with Input_POGENOM
VCF_text="Default = $vcfdir"
fi
#---------------Arguments
Usage="\nUsage: bash POGENOM_poly.sh [options]\n -d=<absolute path to directory containing pogenom.pl, i.e. working directory, default=$wkd >\n\
-o=<Output directory name, default=$outdir >\n\
-v=<absolute path to directory containing VCF files, $VCF_text >\n\
-s=<absolute path to directory containing Genome sizes or 'none', default=$sizedir >\n\
-f=<absolute path to directory containing Genome sequences - fasta files or 'none', default=$dirfasta >\n\
-g=<absolute path to directory containing GFF files or 'none', default=$gffdir >\n\
-c=<absolute path to file containing standard_genetic_code or 'none', default=$ccode >\n\
-e=<space-separated and single-quoted string of extra pogenome.pl commands. If not required, set an empty string, i.e., -e=''. Default='$extra_params' >.\n\
-n=<Minimum number of samples in vcf file. Default=$NS >\n\
-r=<Force the re-execution, yes or no. Default=$RI >\n\
-l=<Minimum ratio to include a loci in the analysis: number of samples with the loci/total number of samples in vcf file. Float number (upto 1) or 'none'. Defaul=$rloci >\n\
if this flag is used, pogenom's --min_found flag will be set to the closest integer of (ratio*Number of samples in vcf file)\n"
for i in "$@"
do
case $i in
-v=*|--vcf_directory=*)
if [ -z "${i#*=}" ]; then echo "value to argument -v No supplied"; exit 0; else vcfdir="${i#*=}"; fi
shift # past argument
;;
-s=*|--size_directory=*)
if [ -z "${i#*=}" ]; then echo "value to argument -s No supplied"; exit 0; else sizedir="${i#*=}"; fi
shift # past argument
;;
-g=*|--gff_directory=*)
if [ -z "${i#*=}" ]; then echo "value to argument -g No supplied"; exit 0; else gffdir="${i#*=}"; fi
shift # past argument
;;
-f=*|--fasta_directory=*)
if [ -z "${i#*=}" ]; then echo "value to argument -f No supplied"; exit 0; else dirfasta="${i#*=}"; fi
shift # past argument
;;
-d=*|--wk_directory=*)
if [ -z "${i#*=}" ]; then echo "value to argument -d No supplied"; exit 0; else wkd="${i#*=}"; fi
shift # past argument
;;
-c=*|--standard_genetic_code=*)
if [ -z "${i#*=}" ]; then echo "value to argument -c No supplied"; exit 0; else ccode="${i#*=}"; fi
shift # past argument
;;
-o=*|--out_dir_name=*)
if [ -z "${i#*=}" ]; then cho "value to argument -o No supplied"; exit 0; else outdir="${i#*=}"; fi
shift # past argument
;;
-r=*|--re_run=*)
RI="${i#*=}"
shift # past argument
;;
-e=*|--extr_prs=*)
extra_params="${i#*=}"
shift # past argument
;;
-l=*|--loci_ratio=*)
rloci="${i#*=}"
shift # past argument
;;
-n=*|--num_samples=*)
NS="${i#*=}"
shift # past argument
;;
*)
echo -e "${Usage}"
echo -e "Description:\nThis program runs POGENOM on several Genomes, using files generated by Input_POGENOM"
if test -f "tempo"; then rm tempo; fi
exit 0
;;
esac
done
# --checking paths -----
if [[ "$vcfdir" != /* ]]; then
echo "Please provide an absolute path -v=/absolute/path/to/VCF_files"
if test -f "tempo"; then rm tempo; fi
exit 1
fi
if [[ "$sizedir" != /* ]] && [[ "$sizedir" != none ]]; then
echo "Please provide an absolute path -s=/absolute/path/to/Genome_sizes"
if test -f "tempo"; then rm tempo; fi
exit 1
fi
if [[ "$gffdir" != /* ]] && [[ "$gffdir" != none ]]; then
echo "Please provide an absolute path -g=/absolute/path/to/GFF_sizes"
if test -f "tempo"; then rm tempo; fi
exit 1
fi
if [[ "$dirfasta" != /* ]] && [[ "$dirfasta" != none ]]; then
echo "Please provide an absolute path -f=/absolute/path/to/GFF_sizes"
if test -f "tempo"; then rm tempo; fi
exit 1
fi
if [[ "$wkd" != /* ]]; then
echo "Please provide an absolute path -d=/absolute/path/to/directory containing pogenom.pl, i.e., working directory"
if test -f "tempo"; then rm tempo; fi
exit 1
fi
# Cheking Minimum size requirement
if [ ! -d "$vcfdir" ] || [ -z "$(ls $vcfdir/*.vcf)" ]; then # If directory doesn-t exists, or it is empty
echo "Please provide a valid directory with Genome vcf files"
if test -f "tempo"; then rm tempo; fi
exit 1
fi
if [[ "$sizedir" != none ]] && ([ ! -d "$sizedir" ] || [ -z "$(ls $sizedir/*.size)" ]); then # If directory doesn-t exists, or it is empty
echo "Please provide a valid directory with Genome size files"
if test -f "tempo"; then rm tempo; fi
exit 1
fi
if [[ "$dirfasta" != none ]] && ([ ! -d "$dirfasta" ] || [ -z "$(ls $dirfasta/*.fa)" ]); then # If directory doesn-t exists, or it is empty
echo "Please provide a valid directory with Genome fasta files"
if test -f "tempo"; then rm tempo; fi
exit 1
fi
if [[ "$sizedir" == none ]] && [[ "$dirfasta" == none ]]; then
echo -e "\nPlease provide argument for flag -s or -f"
echo -e "$Usage"
if test -f "tempo"; then rm tempo; fi
exit 1
fi
if [[ "$gffdir" != none ]] && ([ ! -d "$gffdir" ] || [ -z "$(ls $gffdir/*.gff)" ]); then # If directory doesn-t exists, or it is empty
echo "Please provide a valid directory with Genome gff files"
if test -f "tempo"; then rm tempo; fi
exit 1
fi
# ----------pogenom extra parameters
if [ -n "$extra_params" ]; then extra=$(echo "-e='$extra_params'") ; fi
# -----------------------------
# ****** MAIN ******
if [[ "$sizedir" != none ]] && [[ "$dirfasta" != none ]]; then
echo -e "INFO: Genome size will be calculated from the Genome fasta files and flag -s will be ginored, -s=none"
sizedir="none"
fi
echo -e "INFO: POGENOM_poly.sh -o=$outdir -v=$vcfdir -s=$sizedir -g=$gffdir -d=$wkd -r=$RI -n=$NS -c=$ccode -f=$dirfasta -l=$rloci $extra"
if [ "$RI" == yes ]; then echo "Option - Force the re-execution - activated"; fi
cd $vcfdir
Lgs=($(ls *.vcf)) # list of VCF files
cd $wkd
mkdir -p $wkd/$outdir/log_pogenom
for f in "${Lgs[@]}" # for each VCF file
do
name=$( echo "${f%.vcf}" ) # name of VCF file
if [ ! -d "$wkd/$outdir/$name" ] || [ -z "$(ls -A $wkd/$outdir/$name)" ] || [ "$RI" == yes ]; then # If directory doesn-t exists, or it is empty or Re-execution option is activated
if [ "$dirfasta" != none ] && [ ! -s "$dirfasta/$name$genomes_ext" ]; then #if fasta file is provided when required
echo -e "INFO: POGENOM on Genome: $name cannot be run, the file $dirfasta/$name$genomes_ext doesn't exit\n please provide a fast (flag -f) or size (flag -s) file"
continue
fi
if [ "$dirfasta" == none ] && [ ! -s "$sizedir/$name.size" ]; then #if size file is provided when required
echo -e "INFO: POGENOM on Genome: $name cannot be run, the file $sizedir/$name.size doesn't exit\n please provide a fast (flag -f) or size (flag -s) file"
continue
fi
if [ "$dirfasta" == none ]; then # If size will be calculated from size file and not from fast file
sizefile=$(echo "$sizedir/$name.size")
size=$( cat $sizefile | cut -d ":" -f2 | tr -d " ")
fi
inputfile=$vcfdir/$f
n_samples=$( echo "$(grep "^#" $inputfile | grep -v "^##" | wc -w) - 9" | bc ) # number of samples
if [[ "$n_samples" -lt "0" ]]; then n_samples="0" ; fi
if [[ "$n_samples" -ge "$NS" ]]; then # if the number of samples in the vcf files is higher than the user-defined Minimum
echo "INFO: Running POGENOM on Genome: $name"
echo " Number of samples: $n_samples"
if [ "$rloci" != none ]; then
minfound=$( echo "$rloci*$n_samples" | bc | awk '{print ($0-int($0)<0.499)?int($0):int($0)+1}')
if [ "$minfound" == 0 ]; then minfound=1; fi # --min_found 1 default. It is the lowest value for this flag in pogenom.pl
if [ "$minfound" -eq $n_samples ]; then minfound=0; fi # --min_found 0 means all samples in pogenom.pl
new_params=$(echo $extra_params | sed s/" --min_found [0-9]*"//) # removing --min_found flag if the user has provided it at the same time with flag -l ( the aim is to avoid flag conflicts)
extra_params="$new_params --min_found $minfound" # Setting pogenom.pl extra-parameters
echo " --min_found is set to $minfound"
fi
#Pogenome according to user specifications
if [ -s "$gffdir/$name.gff" ] && [[ "$gffdir" != none ]]; then # if gff file exists when the user wants to use
if [[ "$ccode" != none ]] && [[ -s "$ccode" ]]; then # if standard_genetic_code file exits, it is not empty and the user wants to use
if [[ "$dirfasta" != none ]]; then # When size will be calculated from fast file
echo "perl $wkd/pogenom.pl --vcf_file $inputfile --out Pogenom_$name --fasta_file $dirfasta/$name$genomes_ext --gff_file $gffdir/$name.gff --genetic_code_file $ccode $extra_params"
perl $wkd/pogenom.pl --vcf_file $inputfile --out Pogenom_$name --fasta_file $dirfasta/$name$genomes_ext --gff_file $gffdir/$name.gff --genetic_code_file $ccode $extra_params > $outdir/log_pogenom/pogenom_gff_fasta_$name.log
mkdir -p $wkd/$outdir/$name
mv Pogenom_$name.* $wkd/$outdir/$name/.
else # When genome size will be taken from size file
contigs_seq_in_gff=$(echo $(grep -c "##FASTA" $gffdir/$name.gff | bc )) # checking right header (and by consequence, the presence of contigs sequences in gff file)
if [ "$contigs_seq_in_gff" -ne 1 ]; then
echo -e "Error: Pogenome cannot be run for $name with the current parameters.\n The $gffdir/$name.gff file must contain contigs sequences and the line before the first conting should be ##FASTA.\n You should use the flag -f instead of -s"
continue
fi
echo "perl $wkd/pogenom.pl --vcf_file $inputfile --out Pogenom_$name --genome_size $size --gff_file $gffdir/$name.gff --genetic_code_file $ccode $extra_params"
perl $wkd/pogenom.pl --vcf_file $inputfile --out Pogenom_$name --genome_size $size --gff_file $gffdir/$name.gff --genetic_code_file $ccode $extra_params > $outdir/log_pogenom/pogenom_gff_geneticode_$name.log
mkdir -p $wkd/$outdir/$name
mv Pogenom_$name.* $wkd/$outdir/$name/.
fi
else
if [[ "$dirfasta" != none ]]; then # When size will be calculated from fast file
echo "perl $wkd/pogenom.pl --vcf_file $inputfile --out Pogenom_$name --fasta_file $dirfasta/$name$genomes_ext --gff_file $gffdir/$name.gff $extra_params"
perl $wkd/pogenom.pl --vcf_file $inputfile --out Pogenom_$name --fasta_file $dirfasta/$name$genomes_ext --gff_file $gffdir/$name.gff $extra_params > $outdir/log_pogenom/pogenom_gff_$name.log
mkdir -p $wkd/$outdir/$name
mv Pogenom_$name.* $wkd/$outdir/$name/.
else # When genome size will be taken from size file
echo "perl $wkd/pogenom.pl --vcf_file $inputfile --out Pogenom_$name --genome_size $size --gff_file $gffdir/$name.gff $extra_params"
perl $wkd/pogenom.pl --vcf_file $inputfile --out Pogenom_$name --genome_size $size --gff_file $gffdir/$name.gff $extra_params > $outdir/log_pogenom/pogenom_gff_$name.log
mkdir -p $wkd/$outdir/$name
mv Pogenom_$name.* $wkd/$outdir/$name/.
fi
fi
else
if [[ "$dirfasta" != none ]]; then # When size will be calculated from fast file
echo "perl $wkd/pogenom.pl --vcf_file $inputfile --out Pogenom_$name --fasta_file $dirfasta/$name$genomes_ext $extra_params"
perl $wkd/pogenom.pl --vcf_file $inputfile --out Pogenom_$name --fasta_file $dirfasta/$name$genomes_ext $extra_params > $outdir/log_pogenom/pogenom_size_$name.log
mkdir -p $wkd/$outdir/$name
mv Pogenom_$name.* $wkd/$outdir/$name/.
else # When genome size will be taken from size file
echo "perl $wkd/pogenom.pl --vcf_file $inputfile --out Pogenom_$name --genome_size $size $extra_params"
perl $wkd/pogenom.pl --vcf_file $inputfile --out Pogenom_$name --genome_size $size $extra_params > $outdir/log_pogenom/pogenom_size_$name.log
mkdir -p $wkd/$outdir/$name
mv Pogenom_$name.* $wkd/$outdir/$name/.
fi
fi
else
echo "INFO: $f has lower number of samples ( $n_samples ) than cutoff ( $NS )"
fi
else
echo "Directory $outdir/$name already exists"
fi
done
echo "INFO: *** Please find the results in the directory $wkd/$outdir ***"
if test -f "tempo"; then rm tempo; fi