-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_polypolish_legacy.sh
145 lines (125 loc) · 4.71 KB
/
run_polypolish_legacy.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/bin/bash
# This script was called run_polypolish.sh and was only compatible with Polypolish v0.5.0 and
# earlier versions.
# Reference: https://github.com/rrwick/Polypolish/wiki/How-to-run-Polypolish
# Copyright (C) 2022-2023 Yu Wan <wanyuac@gmail.com>
# Licensed under the GNU General Public Licence version 3 (GPLv3) <https://www.gnu.org/licenses/>.
# First version: 1 May 2022; latest update: 17 November 2024
SCRIPT_VERSION=1.0.0
# Function definitions ###############
# Run './run_polypolish_legacy.sh' to display the information below.
display_parameters() {
echo "
run_polypolish.sh v$SCRIPT_VERSION
This script polishes an input genome assembly with Polypolish and paired-end short reads.
Prerequisites: please ensure bwa aligner is accessible in PATH.
Parameters (six in total):
-a=*: path and filename of the input assembly in FASTA format (mandatory)
-r=*: path to the directory of paired-end short reads (without the end forward
slash), where read files' names follow format [isolate name]_[1,2].fastq.gz (mandatory)
-i=*: isolate name (default: isolate)
-n=*: prefix of output filenames (default: isolate)
-o=*: path to output directory (default: polypolish_output)
-t=*: number of threads (default: 2)
Example command:
/usr/local/bin/Assembly_toolkit/run_polypolish_legacy.sh -a=1_isolate1_medaka.fasta -r=reads/illumina \\
-i=isolate1 -n="2_isolate1" -o="\$PWD" -t=8
Outputs:
1. a polished assembly [o]/[n]_polypolish.fna in FASTA format
2. a log of messages from Polypolish's scripts [o]/[n]_polypolish.log
"
}
MYDIR="$(dirname "$(readlink -f "$0")")" # Directory of this script
source $MYDIR/modules.sh # Function print_failure_message
# Print parameter information ###############
if [ -z "$1" ]
then
display_parameters
exit
fi
# Set default values ###############
i=isolate
n="$i"
outdir=polypolish_output
t=2
# Read parameters ###############
for arg in "$@"
do
case $arg in
-a=*)
fasta_in="${arg#*=}"
;;
-r=*)
read_dir="${arg#*=}"
;;
-i=*)
i="${arg#*=}"
;;
-n=*)
n="${arg#*=}"
;;
-o=*)
outdir="${arg#*=}"
;;
-t=*)
t="${arg#*=}"
;;
*)
;;
esac
done
# Run Polypolish for the input genome assembly ###############
if [ -z "$(which bwa)" ]
then
echo "Error: bwa was not accessible"
print_failure_message "$i"
exit
fi
if [ -d "$read_dir" ]
then
r1="${read_dir}/${i}_1.fastq.gz"
r2="${read_dir}/${i}_2.fastq.gz"
if [ ! -e "$r1" ] || [ ! -e "$r2" ] # $r1 and $r2 can be files or symbolic links
then
echo "Error: read file $r1 and/or $r2 were not found."
print_failure_message "$i"
exit
fi
else
echo "Error: read directory $read_dir was not found."
print_failure_message "$i"
fi
if [ -f "$fasta_in" ]
then
output_prefix="${outdir}/${n}_polypolish"
echo "run_polypolish.sh v$SCRIPT_VERSION" > "${output_prefix}.txt"
# Set up output directories and filenames
if [ ! -d "$outdir" ]
then
echo "Create output directory $outdir"
mkdir "$outdir"
fi
echo "Start to polish $fasta_in of isolate $i with reads from $read_dir and save outputs in ${outdir}/"
# Filtering read alignments to exclude those of unusually large insert sizes ###############
echo "$(date): Creating SAM files for isolate $i" >> "${output_prefix}.txt"
tm="sams_$i"
mkdir $tm
bwa index $fasta_in
bwa mem -a -t $t $fasta_in $r1 > ${tm}/unfiltered_1.sam
bwa mem -a -t $t $fasta_in $r2 > ${tm}/unfiltered_2.sam
polypolish_insert_filter.py --in1 ${tm}/unfiltered_1.sam --in2 ${tm}/unfiltered_2.sam --out1 ${tm}/filtered_1.sam --out2 ${tm}/filtered_2.sam 1>> "${output_prefix}.txt" 2>&1
# Polish the input assembly ###############
echo "$(date): Polishing assembly $fasta_in with Polypolish" >> "${output_prefix}.txt"
polypolish $fasta_in ${tm}/filtered_1.sam ${tm}/filtered_2.sam 1>"${output_prefix}.fna" 2>>"${output_prefix}.txt"
echo "$(date): Finished polishing $fasta_in" >> "${output_prefix}.txt"
rm -rf $tm
# Remove '_polypolish' from sequence headers to preserve the original ones ###############
echo "$(date): processing the polished assembly and log files" >>"${output_prefix}.txt"
sed -i 's/_polypolish//g' ${output_prefix}.fna
# Remove non-character content from log files ###############
perl -p -e 's/\x1b\[[0-9;]*[mG]//g' ${output_prefix}.txt > ${output_prefix}.log
rm ${output_prefix}.txt
# Clean up ###############
rm "$fasta_in".*
echo "This Polypolish job successfully finished. Results have been saved in ${outdir}."
fi