-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvcf_simplify.sh
executable file
·83 lines (72 loc) · 2.1 KB
/
vcf_simplify.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/bin/bash
# Usage info
usage() {
cat <<EOF
Usage: ${0##*/} [options] <in.vcf>
Simplify in.vcf by decomposing multiallelic sites, normalizing variants,
decomposing biallelic block substitutions, and dropping duplicates
-h display this help and exit
-n do not fail when REF is inconsistent with reference sequence for non SNPs
-r ref.fasta reference sequence fasta file (required)
-s sort before simplifying
EOF
}
# Process options
nflag=""
while getopts ":hnr:s" opt; do
case "$opt" in
h ) usage; exit 0;;
n ) nflag="-n";;
r ) ref=$OPTARG;;
s ) sflag="-s";;
\? ) echo "Unknow option: -$OPTARG" >&2
usage >&2; exit 1;;
: ) echo "Missing option argumnet for -$OPTARG" >&2
usage >&2; exit 1;;
esac
done
shift "$((OPTIND-1))"
in_vcf=$1
if [ -z $ref ]; then
echo "-r must be set." >&2
usage >&2; exit 1
elif [ ! -e $ref ]; then
echo "$ref file doesn't exist." >&2; exit 1
elif [ -z $in_vcf ]; then
echo "Input vcf file must be set." >&2
usage >&2; exit 1
elif [ ! -e $in_vcf ]; then
echo "$in_vcf file doesn't exist." >&2; exit 1
fi
# Check input file.
if [[ $in_vcf =~ .vcf$ ]]; then
CATCMD=cat
out_name=${in_vcf/%.vcf/}
elif [[ $in_vcf =~ .vcf.gz$ ]]; then
CATCMD=zcat
out_name=${in_vcf/%.vcf.gz/}
elif [[ $in_vcf =~ .vcf.bz2$ ]]; then
CATCMD=bzcat
out_name=${in_vcf/%.vcf.bz2/}
else
echo "$in_vcf file isn't one of vcf, vcf.gz, and vcf.bz2." >&2; exit 1
fi
# Main
out_vcf=$out_name.decomp.norm.uniq.vcf.gz
out_log=$out_name.decomp.norm.uniq.log
config=$(readlink -f ${BASH_SOURCE[0]}|xargs dirname)/job.config
eval $(grep ^VT $config)
if [ ! -z $sflag ]; then
$CATCMD $in_vcf \
|$VT sort - \
|$VT decompose - 2> $out_log.tmp1 \
|$VT normalize -r $ref $nflag - 2> $out_log.tmp2 \
|$VT uniq -o $out_vcf - 2> $out_log.tmp3
else
$CATCMD $in_vcf \
|$VT decompose - 2> $out_log.tmp1 \
|$VT normalize -r $ref $nflag - 2> $out_log.tmp2 \
|$VT uniq -o $out_vcf - 2> $out_log.tmp3
fi
cat $out_log.tmp{1,2,3} |tee $out_log
rm $out_log.tmp{1,2,3}