forked from seqan/iGenVar
-
Notifications
You must be signed in to change notification settings - Fork 0
/
mason_simulation.sh
172 lines (151 loc) · 10.7 KB
/
mason_simulation.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#!/usr/bin/env sh
cd data
mkdir -p simulation_GRCh38 && cd simulation_GRCh38
conda env create -f Repos/iGenVar/test/benchmark/envs/simulation.yaml
conda activate simulation
# With a 47644184 bp of chr21 and 10000000 simulated reads, which are 200 bp long, we have a coverage of:
# (10000000*200)/47644184 = 41.9778414087
# Cut chromosome 21 from reference
sed -n '/^>chr21/,$p' ../reference/GRCh38/hg38.fa > hg38_chr21_.fa
sed '/>chr22/,$d' hg38_chr21_.fa > hg38_chr21.fa
bwa index hg38_chr21.fa
# Mason
# Tools for Biological Sequence Simulation
# ========================================
# https://github.com/seqan/seqan/blob/master/apps/mason2/README
git clone --single-branch --depth 1 --branch develop https://github.com/seqan/seqan
cd ../.. && mkdir seqan && cd seqan
cmake -DCMAKE_BUILD_TYPE=Release ../../Repos/seqan
make -j16 -k mason_simulator mason_variator
# mason_variator - Variation Simulation
# =====================================
# mason_variator [OPTIONS] -ir IN.fa -ov OUT.vcf [-of OUT.fa]
# Variation Simulation:
# --snp-rate DOUBLE
# Per-base SNP rate. In range [0.0..1.0]. Default: 0.0001.
# --small-indel-rate DOUBLE
# Small indel rate. In range [0.0..1.0]. Default: 0.000001.
# --min-small-indel-size INTEGER
# Minimal small indel size to simulate. In range [0..inf]. Default: 1.
# --max-small-indel-size INTEGER
# Maximal small indel size to simulate. In range [0..inf]. Default: 6.
# --sv-indel-rate DOUBLE
# Per-base SNP rate. In range [0.0..1.0]. Default: 0.0000001.
# --sv-inversion-rate DOUBLE
# Per-base SNP rate. In range [0.0..1.0]. Default: 0.0000001.
# --sv-translocation-rate DOUBLE
# Per-base SNP rate. In range [0.0..1.0]. Default: 0.0000001.
# --sv-duplication-rate DOUBLE
# Per-base SNP rate. In range [0.0..1.0]. Default: 0.0000001.
# --min-sv-size INTEGER
# Minimal SV size to simulate. In range [0..inf]. Default: 50.
# --max-sv-size INTEGER
# Maximal SV size to simulate. In range [0..inf]. Default: 1000.
SMALL_INDEL_SIZE=19
MIN_SV_SIZE=20
MAX_SV_SIZE=20000
SV_RATE=0.00001
./../../build/seqan/bin/mason_variator --in-reference hg38_chr21.fa --out-vcf hg38_SV_simulation_default.vcf --seed 0 \
> hg38_SV_simulation_default.log
./../../build/seqan/bin/mason_variator --in-reference hg38_chr21.fa --out-vcf hg38_SV_simulation_InDel.vcf --seed 1 \
--snp-rate 0 --small-indel-rate ${SV_RATE} --max-small-indel-size ${SMALL_INDEL_SIZE} \
--sv-indel-rate ${SV_RATE} --min-sv-size ${MIN_SV_SIZE} --max-sv-size ${MAX_SV_SIZE} \
--sv-inversion-rate 0 --sv-translocation-rate 0 --sv-duplication-rate 0 \
> hg38_SV_simulation_InDel.log
./../../build/seqan/bin/mason_variator --in-reference hg38_chr21.fa --out-vcf hg38_SV_simulation_noSNP.vcf --seed 2 \
--snp-rate 0 --small-indel-rate ${SV_RATE} --max-small-indel-size ${SMALL_INDEL_SIZE} \
--sv-indel-rate ${SV_RATE} --min-sv-size ${MIN_SV_SIZE} --max-sv-size ${MAX_SV_SIZE} \
--sv-inversion-rate ${SV_RATE} --sv-translocation-rate ${SV_RATE} --sv-duplication-rate ${SV_RATE} \
> hg38_SV_simulation_noSNP.log
./../../build/seqan/bin/mason_variator --in-reference hg38_chr21.fa --out-vcf hg38_SV_simulation_SNPandSV.vcf --seed 3 \
--snp-rate ${SV_RATE} --small-indel-rate ${SV_RATE} --max-small-indel-size ${SMALL_INDEL_SIZE} \
--sv-indel-rate ${SV_RATE} --min-sv-size ${MIN_SV_SIZE} --max-sv-size ${MAX_SV_SIZE} \
--sv-inversion-rate ${SV_RATE} --sv-translocation-rate ${SV_RATE} --sv-duplication-rate ${SV_RATE} \
> hg38_SV_simulation_SNPandSV.log
# sort the vcf truth
picard SortVcf -I hg38_SV_simulation_default.vcf \
-O hg38_SV_simulation_default_sorted.vcf -Xms1g -Xmx100g --TMP_DIR tmp/picard/ \
>> hg38_SV_simulation_default.log
picard SortVcf -I hg38_SV_simulation_InDel.vcf \
-O hg38_SV_simulation_InDel_sorted.vcf -Xms1g -Xmx100g --TMP_DIR tmp/picard/ \
>> hg38_SV_simulation_InDel.log
picard SortVcf -I hg38_SV_simulation_noSNP.vcf \
-O hg38_SV_simulation_noSNP_sorted.vcf -Xms1g -Xmx100g --TMP_DIR tmp/picard/ \
>> hg38_SV_simulation_noSNP.log
picard SortVcf -I hg38_SV_simulation_SNPandSV.vcf \
-O hg38_SV_simulation_SNPandSV_sorted.vcf -Xms1g -Xmx100g --TMP_DIR tmp/picard/ \
>> hg38_SV_simulation_InDel.log
# Repair vcf for truvari:
../../build/iGenVar/bin/MasonVcfConverter -i hg38_SV_simulation_default_sorted.vcf -o hg38_SV_simulation_default_sorted_fixed.vcf
../../build/iGenVar/bin/MasonVcfConverter -i hg38_SV_simulation_InDel_sorted.vcf -o hg38_SV_simulation_InDel_sorted_fixed.vcf
../../build/iGenVar/bin/MasonVcfConverter -i hg38_SV_simulation_noSNP_sorted.vcf -o hg38_SV_simulation_noSNP_sorted_fixed.vcf
../../build/iGenVar/bin/MasonVcfConverter -i hg38_SV_simulation_SNPandSV_sorted.vcf -o hg38_SV_simulation_SNPandSV_sorted_fixed.vcf
bgzip -c hg38_SV_simulation_default_sorted_fixed.vcf > hg38_SV_simulation_default_sorted_fixed.vcf.gz
bgzip -c hg38_SV_simulation_InDel_sorted_fixed.vcf > hg38_SV_simulation_InDel_sorted_fixed.vcf.gz
bgzip -c hg38_SV_simulation_noSNP_sorted_fixed.vcf > hg38_SV_simulation_noSNP_sorted_fixed.vcf.gz
bgzip -c hg38_SV_simulation_SNPandSV_sorted_fixed.vcf > hg38_SV_simulation_SNPandSV_sorted_fixed.vcf.gz
# create index
tabix -p vcf hg38_SV_simulation_default_sorted_fixed.vcf.gz
tabix -p vcf hg38_SV_simulation_InDel_sorted_fixed.vcf.gz
tabix -p vcf hg38_SV_simulation_noSNP_sorted_fixed.vcf.gz
tabix -p vcf hg38_SV_simulation_SNPandSV_sorted_fixed.vcf.gz
# mason_simulator - Read Simulation
# =================================
# mason_simulator [OPTIONS] -ir IN.fa --num-fragments NUM [-iv IN.vcf] --out LEFT.fq [--out-right RIGHT.fq]
# Simulate NUM reads/pairs from the reference sequence IN.fa, potentially with variants from IN.vcf. In case that
# both -o and -or are given, write out paired-end data, if only -io is given, only single-end reads are simulated.
# -oa, --out-alignment OUTPUT_FILE
# SAM/BAM file with alignments. Valid filetypes are: .sam[.*] and .bam, where * is any of the following
# extensions: gz and bgzf for transparent (de)compression.
# -ir, --input-reference INPUT_FILE
# Path to FASTA file to read the reference from. Valid filetypes are: .sam[.*], .raw[.*], .gbk[.*], .frn[.*],
# .fq[.*], .fna[.*], .ffn[.*], .fastq[.*], .fasta[.*], .faa[.*], .fa[.*], .embl[.*], and .bam, where * is any
# of the following extensions: gz and bgzf for transparent (de)compression.
# -iv, --input-vcf INPUT_FILE
# Path to the VCF file with variants to apply. Valid filetype is: .vcf[.*], where * is any of the following
# extensions: gz and bgzf for transparent (de)compression.
# ! needs indexed fasta file !
./../../build/seqan/bin/mason_simulator --input-reference hg38_chr21.fa --input-vcf hg38_SV_simulation_default.vcf \
--out hg38_simulation_10000000_Illumina_reads_default.fastq \
--num-fragments 10000000 --seq-technology illumina --illumina-read-length 200 \
>> hg38_SV_simulation_default.log
./../../build/seqan/bin/mason_simulator --input-reference hg38_chr21.fa --input-vcf hg38_SV_simulation_InDel.vcf \
--out hg38_simulation_10000000_Illumina_reads_InDel.fastq \
--num-fragments 10000000 --seq-technology illumina --illumina-read-length 200 \
>> hg38_SV_simulation_InDel.log
./../../build/seqan/bin/mason_simulator --input-reference hg38_chr21.fa --input-vcf hg38_SV_simulation_noSNP.vcf \
--out hg38_simulation_10000000_Illumina_reads_noSNP.fastq \
--num-fragments 10000000 --seq-technology illumina --illumina-read-length 200 \
>> hg38_SV_simulation_noSNP.log
./../../build/seqan/bin/mason_simulator --input-reference hg38_chr21.fa --input-vcf hg38_SV_simulation_SNPandSV.vcf \
--out hg38_simulation_10000000_Illumina_reads_SNPandSV.fastq \
--num-fragments 10000000 --seq-technology illumina --illumina-read-length 200 \
>> hg38_SV_simulation_InDel.log
bwa mem hg38_chr21.fa hg38_simulation_10000000_Illumina_reads_default.fastq \
> hg38_simulation_10000000_Illumina_reads_default.sam
bwa mem hg38_chr21.fa hg38_simulation_10000000_Illumina_reads_InDel.fastq \
> hg38_simulation_10000000_Illumina_reads_InDel.sam
bwa mem hg38_chr21.fa hg38_simulation_10000000_Illumina_reads_noSNP.fastq \
> hg38_simulation_10000000_Illumina_reads_noSNP.sam
bwa mem hg38_chr21.fa hg38_simulation_10000000_Illumina_reads_SNPandSV.fastq \
> hg38_simulation_10000000_Illumina_reads_SNPandSV.sam
# You can use the --mate-orientation to set the relative orientation when doing paired-end sequencing. The valid
# values are given in the following.
# FR Reads are inward-facing, the same as Illumina paired-end reads: R1 --> <-- R2.
# RF Reads are outward-facing, the same as Illumina mate-pair reads: R1 <-- --> R2.
# FF Reads are on the same strand: R1 --> --> R2.
# FF2 Reads are on the same strand but the "right" reads are sequenced to the left of the "left" reads, same as
# 454 paired: R2 --> --> R1.
samtools sort -O BAM hg38_simulation_10000000_Illumina_reads_default.sam \
> hg38_simulation_10000000_Illumina_reads_default_sorted.bam
samtools sort -O BAM hg38_simulation_10000000_Illumina_reads_InDel.sam \
> hg38_simulation_10000000_Illumina_reads_InDel_sorted.bam
samtools sort -O BAM hg38_simulation_10000000_Illumina_reads_noSNP.sam \
> hg38_simulation_10000000_Illumina_reads_noSNP_sorted.bam
samtools sort -O BAM hg38_simulation_10000000_Illumina_reads_SNPandSV.sam \
> hg38_simulation_10000000_Illumina_reads_SNPandSV_sorted.bam
samtools index hg38_simulation_10000000_Illumina_reads_default_sorted.bam
samtools index hg38_simulation_10000000_Illumina_reads_InDel_sorted.bam
samtools index hg38_simulation_10000000_Illumina_reads_noSNP_sorted.bam
samtools index hg38_simulation_10000000_Illumina_reads_SNPandSV_sorted.bam
cd ../..