-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathgrowth_est_ref
247 lines (208 loc) · 8.12 KB
/
growth_est_ref
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
#!/bin/bash
#############################################
############################################
# growth estimation reference-based
############################################
############################################
cd $WDR
RDR=$(readlink -f $READS_DIR)
if [ $DESMAN == "false" ]; then SPEC=$(readlink -f $SPECIES_DIR) ; fi
echo "$package option activated"
echo "$WDR is present directory"
echo "$RDR is the reads directory"
echo "$ODR is the output directory"
echo "Reference-based method activated"
################
cd $ODR
########################
# USE REFERENCE GENOME TO GET POSITION OF CORE GENES
###########################
if [ $DESMAN == "false" ]; then
if ! [[ -s $GEN_LIS ]]; then echo "ERROR: Input file is empty"; exit 1 ; fi
samtools faidx $SPEC/core_gene_alignment.aln $(cat $GEN_LIS) > strains.aln || exit 1
$SMEG_DIR/uniqueSNPmultithreading strains.aln 2 2 || exit 1
if ! [[ -s uniqueSNPs ]]; then echo "ERROR: The generated uniqueSNPs file is empty. Check your input file or GCC version"; exit 1 ; fi
###############################
###############################
for ref in `cat $GEN_LIS`
do
reference=$(echo "$ref" | rev | cut -d'/' -f1 | rev) || exit 1
if [[ -e "$SPEC/misc.txt" ]]; then
cp $SPEC/../reordered_contigs/$reference.fna .
else
cp $SPEC/reordered_contigs/$reference.fna . || exit 1
fi
samtools faidx $SPEC/core_gene_alignment.aln $reference > $reference.aln || exit 1
Rscript $SMEG_DIR/getPositionWithoutGaps.R -i uniqueSNPs -x $reference.aln -m 1 || exit 1
grep "label=" $SPEC/core_alignment_header.embl | cut -f2 -d'=' > core_genes.txt || exit 1
grep "feature" $SPEC/core_alignment_header.embl | rev | cut -d' ' -f1 | rev | sed 's/\../ /g' | awk '{print ($1 - 1) "\t" $2}' | sed "s/^/$reference /" > core_gene_coordinate_in_align || exit 1
paste -d'\t' core_gene_coordinate_in_align core_genes.txt > bedfile
rm core_gene_coordinate_in_align core_genes.txt
bedtools getfasta -fi $reference.aln -bed bedfile -name | sed 's/-//g' | fold -w 60 >> core_genes2.fa || exit 1
awk 'BEGIN {RS = ">" ; FS = "\n" ; ORS = ""} $2 {print ">"$0}' core_genes2.fa > core_genes.fa || exit 1
rm core_genes2.fa
rm bedfile
awk '/^>/ {if (seqlen){print seqlen}; print ;seqlen=0;next; } { seqlen += length($0)}END{print seqlen}' core_genes.fa | grep -v ">" > geneLengths
grep ">" core_genes.fa | sed 's/>//g' > geneNames
paste <(cat geneLengths | awk '{total += $0; $0 = total - $0}1') <(cat geneLengths | awk '{total += $0; $0 = total}1') <(cat geneNames) --delimiters '\t' > geneCoordinates.txt
rm geneLengths geneNames
########
makeblastdb -in $reference.fna -parse_seqids -dbtype nucl || exit 1
blastn -query core_genes.fa -db $reference.fna -evalue 5 -num_threads $NUM_THREAD -max_target_seqs 1 -outfmt 6 -out $reference.blast.txt -word_size 11 || exit 1
cut -f1 $reference.blast.txt | sort | uniq > tmp.txt
for i in `cat tmp.txt`
do
grep -w "$i" $reference.blast.txt | sort -nr -k12,12 | head -1 | cut -f2,9,10 > temp1 || exit 1
grep -w "$i" $reference.blast.txt | sort -nr -k12,12 | head -1 | cut -f1 > temp2 || exit 1
paste -d'\t' temp1 temp2 >> $reference.core.geneCood.txt
done
rm temp* tmp.txt
rm $reference.blast.txt core_genes.fa
###########
# EXTRACT UNIQUE SNPS FROM INPUT STRAINS
##########
Rscript $SMEG_DIR/getPositioninRef.R -i modified_uniq_cluster_SNPs.txt -x $reference.core.geneCood.txt -y geneCoordinates.txt
###########
grep -P '(^|\s)\K'$reference'(?=\s|$)' newcoordinates.txt | sort | uniq > $reference.Input.txt || exit 1
rm newcoordinates.txt
rm $reference.fna.nsq
rm $reference.fna.nsi
rm $reference.fna.nsd
rm $reference.fna.nog
rm $reference.fna.nin
rm $reference.fna.nhr
rm $reference.core.geneCood.txt geneCoordinates.txt
bowtie2-build $reference.fna $reference.fna -q || exit 1
echo "$reference" >> strain_list.txt
done
rm strains.aln uniqueSNPs modified_uniq_cluster_SNPs.txt
################
#####
# DESMAN strains
else
################
################
if ! [[ -s $DESM ]]; then echo "ERROR: Input file is empty"; exit 1 ; fi
for f in `cat $DESM`
do
cp $f .
reference=$(echo "$f" | rev | cut -d'/' -f1 | rev) || exit 1
grep -v ">" $reference | tr -d '[:space:]' | fold -w 60 | sed "1 i\>$reference" | sed -e '$a\ ' >> allStrains.fna
done
samtools faidx allStrains.fna || exit 1
$SMEG_DIR/uniqueSNPmultithreading allStrains.fna 2 2 || exit 1
if ! [[ -s uniqueSNPs ]]; then echo "ERROR: The generated uniqueSNPs file is empty. Check your input file or GCC version"; exit 1 ; fi
sed '1d' uniqueSNPs | cut -f1 | sort | uniq > passed_strains.txt
for strains in `cat passed_strains.txt`
do
grep -P '(^|\s)\K'$strains'(?=\s|$)' uniqueSNPs > $strains.Input.txt || exit 1
echo "$strains" >> strain_list.txt
cat $strains | grep -v ">" | tr -d '[:space:]' | fold -w 60 | sed "1 i\>$strains" | sed -e '$a\ ' > $strains.fna
bowtie2-build $strains.fna $strains.fna -q || exit 1
rm $strains
done
##############
rm uniqueSNPs allStrains.fna.fai
rm passed_strains.txt
fi
#######################################
#######################################
#### Estimate growth rate
####
#######################################
#######################################
##########
cd $RDR
if [ "$LIST" == "false" ]; then
ls *.$READS_EXT | sed "s/\.$READS_EXT$//" > $ODR/samples.txt || exit 1
else
cat $LIS | sed "s/\.$READS_EXT$//" > $ODR/samples.txt || exit 1
fi
cd $ODR
for i in `cat samples.txt`
do
mkdir $i
cd $i
for reference in `cat $ODR/strain_list.txt`
do
bowtie2 -x $ODR/$reference.fna -U $RDR/$i.$READS_EXT -S $i.sam -p $NUM_THREAD --quiet || exit 1
samtools view -@ $NUM_THREAD -bS $i.sam > $i.bam || exit 1
rm $i.sam
if [[ $MISMATCH == 9999 ]]; then
samtools view -@ $NUM_THREAD -b -F 4 $i.bam | samtools sort -@ $NUM_THREAD -o $i.sorted.bam || exit 1
rm $i.bam
else
bamtools filter -tag "XM:<=$MISMATCH" -in $i.bam | bamtools filter -tag "XO:0" | bamtools filter -tag "XG:0" -out $i.2.bam || exit 1
rm $i.bam
samtools view -@ $NUM_THREAD -b -F 4 $i.2.bam | samtools sort -@ $NUM_THREAD -o $i.sorted.bam || exit 1
rm $i.2.bam
fi
samtools faidx $ODR/$reference.fna || exit 1
samtools mpileup -a -a -A -B -f $ODR/$reference.fna $i.sorted.bam > $i.pileup || exit 1
$SMEG_DIR/pileupParser $i.pileup
rm $i.pileup
rm $i.sorted.bam
###################
sed '1d' polymorphic.site.coverage > polymorphic.site.coverage2
rm polymorphic.site.coverage
Rscript $SMEG_DIR/SMEG_SNP.R -i $ODR/$reference.Input.txt -p polymorphic.site.coverage2 -o $reference.coord.txt
awk '$8 ~ "a" || $8 ~ "A" {print $2"\011"$1"\011"$3}' $reference.coord.txt >> $reference.final.temp || exit 1
awk '$8 ~ "t" || $8 ~ "T" {print $2"\011"$1"\011"$4}' $reference.coord.txt >> $reference.final.temp || exit 1
awk '$8 ~ "g" || $8 ~ "G" {print $2"\011"$1"\011"$5}' $reference.coord.txt >> $reference.final.temp || exit 1
awk '$8 ~ "c" || $8 ~ "C" {print $2"\011"$1"\011"$6}' $reference.coord.txt >> $reference.final.temp || exit 1
sort -n -k2,2 $reference.final.temp > $i.$reference.cov.txt
rm $reference.final.temp
rm $reference.coord.txt
rm polymorphic.site.coverage2
done
Rscript $SMEG_DIR/SNP_method_ref.R -i $i.temp1.txt -c $COV_CUTOFF -d 0.5 -s $MIN_SNP
awk -F'\t' '{ $2 = ($2 > 10 ? 1 : $2) } 1' OFS='\t' $i.temp1.txt | awk '$3 > '$COV_CUTOFF'' | sed "s/$i.//g" > $i.SMEG.txt
rm $i.temp1.txt
printf '1\ni\nStrain\tSMEG\tCoverage\tNo of SNPs\tSMEG range\n.\nw\n' | ed -s $i.SMEG.txt
rm $i.*.cov.txt
cd $ODR
done
cd $ODR
exec 3>&2
exec 2> /dev/null
rm *.txt
rm *.bt2
rm *.aln
rm *.fai
rm *.fna
exec 2>&3
cp */*.SMEG.txt $ODR/. || exit 1
rm -rf */
#################
# MERGE OPTION
################
cd $ODR
if [ "$MERGE" == "true" ]; then
cat *.SMEG.txt | grep -v "SMEG" | cut -f1 | sort | uniq > SMEG.genomes
ls *.txt | sed 's/\.txt//g' > SMEG_list.txt
for i in `cat SMEG_list.txt`
do
for f in `cat SMEG.genomes`
do
paste <(awk '$1 == "'$f'" {print $1}' SMEG.genomes ) <(awk '$1 == "'$f'" {print $2}' $i.txt ) >> $i.temp.merge.txt
done
printf '1\ni\nStrain\t'$i'\n.\nw\n' | ed -s $i.temp.merge.txt
done
for f in *.temp.merge.txt
do
cut -f2 $f > $f.tmp
rm $f
done
printf '1\ni\nStrain\n.\nw\n' | ed -s SMEG.genomes
paste -d'\t' SMEG.genomes *.temp.merge.txt.tmp > merged_table.txt
rm *.temp.merge.txt.tmp
rm SMEG_list.txt
rm SMEG.genomes
exec 3>&2
exec 2> /dev/null
Rscript $SMEG_DIR/heatmap.R
exec 2>&3
echo "run complete"
else
echo "run complete"
fi