-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathcreate_pseudohaploid.sh
executable file
·84 lines (68 loc) · 2.7 KB
/
create_pseudohaploid.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/bin/bash
set -o pipefail
BIN="$( cd "$(dirname "$0")" ; pwd -P )"
## Minimum alignment Identity
#MIN_IDENTITY=95
MIN_IDENTITY=90
## Minimum alignment length to consider
#MIN_LENGTH=10000
MIN_LENGTH=1000
## Minimum containment percentage from overlap chains to filter contig
#MIN_CONTAIN=95
MIN_CONTAIN=93
## Maximum distance in bp allowed between alignments on the same alignment chain
MAX_CHAIN_GAP=20000
if [ $# -lt 2 ]
then
echo "create_pseudohaploid.sh assembly.fa outprefix"
exit
fi
GENOME=$1
PREFIX=$2
echo "Generating pseudohaploid genome sequence"
echo "----------------------------------------"
echo "GENOME: $GENOME"
echo "OUTPREFIX: $PREFIX"
echo "MIN_IDENTITY: $MIN_IDENTITY"
echo "MIN_LENGTH: $MIN_LENGTH"
echo "MIN_CONTAIN: $MIN_CONTAIN"
echo "MAX_CHAIN_GAP: $MAX_CHAIN_GAP"
echo
## You may want to replace this with sge_mummer for large genomes
## See: https://github.com/fritzsedlazeck/sge_mummer
if [ ! -r $PREFIX.delta ]
then
echo "1. Aligning $GENOME to itself with nucmer"
(nucmer --maxmatch -c 100 -l 500 $GENOME $GENOME -p $PREFIX) >& nucmer.log
numorig=`grep -c '^>' $GENOME`
echo "Original assembly has $numorig contigs"
echo
fi
## Pre-filter for just longer high identity alignments
echo "2. Filter for alignments longer than $MIN_LENGTH bp and below $MIN_IDENTITY identity"
delta-filter -l $MIN_LENGTH -i $MIN_IDENTITY $PREFIX.delta > $PREFIX.filter.delta
echo
## Create the coord file
echo "3. Generating coords file"
show-coords -rclH $PREFIX.filter.delta > $PREFIX.filter.coords
echo
## Find and analyze the alignment chains
## Note you can rerun this step multiple times from the same coords file
echo "4. Identifying alignment chains: min_id: $MIN_IDENTITY min_contain: $MIN_CONTAIN max_gap: $MAX_CHAIN_GAP"
($BIN/pseudohaploid.chains.pl $PREFIX.filter.coords \
$MIN_IDENTITY $MIN_CONTAIN $MAX_CHAIN_GAP > $PREFIX.chains) >& $PREFIX.chains.log
cat $PREFIX.chains.log
echo
## Generate a list of contained contigs
## This can also be rerun from the same chain file but using different containment thresholds
echo "5. Generating a list of redundant contig ids using min_contain: $MIN_CONTAIN"
grep '^#' $PREFIX.chains | \
awk -v cut=$MIN_CONTAIN '{if ($4 >= cut){print ">"$2}}' > $PREFIX.contained.ids
numcontained=`wc -l $PREFIX.contained.ids | awk '{print $1}'`
echo "Identified $numcontained redundant contig to remove in $PREFIX.contained.ids"
echo
## Finally filter the original assembly to remove the contained contigs
echo "6. Creating final pseudohaploid assembly in $PREFIX.pseudohap.fa"
($BIN/filter_seq -v $PREFIX.contained.ids $GENOME > $PREFIX.pseudohap.fa) >& $PREFIX.pseudohap.fa.log
numfinal=`grep -c '^>' $PREFIX.pseudohap.fa`
echo "Pseudohaploid assembly has $numfinal contigs"