############################################################################
# Workflow: sourmash classification of the ICTV Taxonomy Challenge Dataset #
# This workflow uses sourmash to classify viral sequences against
# the ICTV VMR MSL39 (v4) database. It downloads the ICTV Taxonomy
# Challenge dataset, sketches the sequences, then uses `sourmash gather`
# to find the closest reference genome. Finally, it uses `sourmash tax`
# to assign taxonomy to the sequences based on the reference genome
# and then converts the results to the challenge output format.
# To run:
# First, make sure you've cloned the ictv-challenge-sourmash repo
# and created/activated the conda environment as specified in the README.
# Then run:
#
# snakemake -j 1
#
# You can change the number of cores with the -j parameter, but additional
# cores do not significantly speed up this workflow. Larger datasets would
# benefit more from parallelization.
#
# Requirements:
# The challenge dataset is ~1.5 GB and the database is <0.5 GB; classification
# requires ~1 GB of RAM. To be safe, we recommend running on a machine with
# at least 5 GB of disk space and 5 GB of RAM. The full workflow (including
# data download) takes 5-15 minutes to run.
############################################################################
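
# For reference, the classification core below reduces to roughly these
# sourmash invocations (a sketch only; exact flags and output paths live
# in the rules that follow):
#
#   sourmash scripts manysketch dataset-challenge.fromfile.csv \
#       -p skipm2n3,k=24,scaled=50,abund -o ictv-challenge.zip
#   sourmash scripts fastmultigather ictv-challenge.zip \
#       vmr_MSL39_v4.skipm2n3-k24-sc50.rocksdb \
#       -m skipm2n3 -k 24 --scaled 50 --threshold-bp 200 -o fmg.csv
#   sourmash tax genome -t vmr_MSL39_v4.lineages.csv.gz -g fmg.csv \
#       --ictv -F csv_summary --ani-threshold 0.75
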
out_dir = "output.ictv-challenge"
logs_dir = f"{out_dir}/logs"
THRESHOLD_BP = 200
TAX_THRESHOLD = 0.75
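
# Final targets: the challenge-format classification CSV, a resource-usage
# summary, and a diff against the saved results shipped in the repo.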
rule all:
input:
f"{out_dir}/ictv-challenge.sourmash.csv",
f"{logs_dir}/summary.csv",
f"{out_dir}/results-diff.csv",
rule download_database:
output:
rocksdb_tar = "vmr_MSL39_v4.skipm2n3-k24-sc50.rocksdb.tar.gz"
params:
download_link = "https://osf.io/download/u3ftq/",
benchmark: f"{logs_dir}/download_database.benchmark"
shell:
"""
curl -JLO {params.download_link}
"""
rule untar_database:
input:
rocksdb_tar = ancient("vmr_MSL39_v4.skipm2n3-k24-sc50.rocksdb.tar.gz"),
output:
rocksdb_current = "vmr_MSL39_v4.skipm2n3-k24-sc50.rocksdb/CURRENT"
benchmark: f"{logs_dir}/untar_database.benchmark"
shell:
"""
tar -xzf {input.rocksdb_tar}
"""
rule download_ictv_challenge:
output:
challenge_fromfile="dataset-challenge.fromfile.csv",
params:
challenge_link = "https://github.com/ICTV-VBEG/ICTV-TaxonomyChallenge/raw/refs/heads/main/dataset/dataset_challenge.tar.gz?download=",
challenge_file = "dataset_challenge.tar.gz",
challenge_dir = "dataset_challenge",
benchmark: f"{logs_dir}/download_challenge_dataset.benchmark"
shell:
"""
curl -JL {params.challenge_link} -o {params.challenge_file}
tar -xzf {params.challenge_file}
python scripts/challengedir-to-csvinput.py {params.challenge_dir} {output.challenge_fromfile}
"""
rule sketch_challenge_dataset:
input:
ancient("dataset-challenge.fromfile.csv")
output:
challenge_zip=f"{out_dir}/ictv-challenge.zip"
params:
param_str = "-p skipm2n3,k=24,scaled=50,abund",
log: f"{logs_dir}/manysketch.log"
benchmark: f"{logs_dir}/manysketch.benchmark"
shell:
"""
sourmash scripts manysketch {input} {params.param_str} -o {output} 2> {log}
"""
rule sourmash_fastmultigather:
input:
challenge_zip=f"{out_dir}/ictv-challenge.zip",
vmr_rdb_current = ancient("vmr_MSL39_v4.skipm2n3-k24-sc50.rocksdb/CURRENT")
output:
fmg= f"{out_dir}/ictv-challenge.fmg.csv"
log: f"{logs_dir}/fmg.log"
benchmark: f"{logs_dir}/fmg.benchmark"
params:
        db_dir = "vmr_MSL39_v4.skipm2n3-k24-sc50.rocksdb",
threshold_bp = THRESHOLD_BP,
shell:
"""
sourmash scripts fastmultigather {input.challenge_zip} {params.db_dir} \
-m skipm2n3 -k 24 --scaled 50 \
--threshold-bp {params.threshold_bp} \
-o {output.fmg} 2> {log}
"""
rule sourmash_tax_genome:
input:
fmg= f"{out_dir}/ictv-challenge.fmg.csv",
vmr_lineages = "vmr_MSL39_v4.lineages.csv.gz",
output:
tax=f"{out_dir}/ictv-challenge.classifications.csv"
log: f"{logs_dir}/tax-genome.log"
benchmark: f"{logs_dir}/tax-genome.benchmark"
params:
        out_base = "ictv-challenge",
out_dir = out_dir,
tax_threshold = TAX_THRESHOLD,
shell:
"""
sourmash tax genome -t {input.vmr_lineages} -g {input.fmg} --ictv -F csv_summary \
--ani-threshold {params.tax_threshold} --output-base {params.out_base} \
--output-dir {params.out_dir} 2> {log}
"""
rule sourmash_tg_to_challenge_format:
input:
tax=f"{out_dir}/ictv-challenge.classifications.csv",
dataset_csv=ancient("dataset-challenge.fromfile.csv"),
output:
ch= f"{out_dir}/ictv-challenge.sourmash.csv"
log: f"{logs_dir}/convert.log"
benchmark: f"{logs_dir}/convert.benchmark"
shell:
"""
python scripts/tg-to-challengeformat.py {input.tax} {output.ch} 2> {log}
"""
rule summarize_resource_utilization:
input:
benches = expand(f"{logs_dir}/{{log}}.benchmark", log=["download_database", "download_challenge_dataset", "manysketch", "fmg", "tax-genome", "convert"]),
output:
benchmarks=f"{logs_dir}/benchmarks.csv",
summary=f"{logs_dir}/summary.csv"
shell:
"""
python scripts/benchmarks.py {input.benches} --benchmarks-csv {output.benchmarks} --summary-csv {output.summary}
"""
rule compare_vs_saved_results:
input:
saved_results="results/ictv-challenge.sourmash.csv",
workflow_results="output.ictv-challenge/ictv-challenge.sourmash.csv",
output:
diff=f"{out_dir}/results-diff.csv"
shell:
"""
python scripts/compare-results.py --f1 {input.saved_results} \
--f1-name "saved" \
--f2 {input.workflow_results} \
--f2-name "workflow" \
--output {output}
"""