-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathsweep_simulate.snake
900 lines (792 loc) · 36.6 KB
/
sweep_simulate.snake
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
"""
Snakefile for running sweeps benchmark on stdpopsim.
"""
import os
import numpy as np
import pandas as pd
import tskit
import msprime
import stdpopsim
import math
import subprocess
from scipy.optimize import root_scalar
configfile: "workflows/config/snakemake/sweep_config.yaml"
#configfile: "workflows/config/snakemake/small_sweep.yaml"
rng = np.random.default_rng(seed=config["seed"])
rng2 = np.random.default_rng(seed=config["seed"]+1)
# ###############################################################################
# GLOBALS
# ###############################################################################
# ----------------- Setup parameter grid -----------------
# The number of replicates of each analysis you would like to run
replicates = config["replicates"]
# Where you would like all output files from analysis to live
output_dir = os.path.abspath(config["output_dir"])
# The analysis species
species = stdpopsim.get_species(config["species"])
# The name of the chromosome to simulate
chrom = config["chrom"]
# The genetic map to use
genetic_map_id = config["genetic_map"]
# Getting starting position for sweep windows
rel_win_start = np.linspace(0, 1.0, config["num_windows"], endpoint=False)
# Shift/scale windows from unit coordinates to
# [start of recombination map, chromosome length]
# where "start of recombination map" means the first interval with data.
# NB: this will change simulation coordinates depending on genetic map!
recmap = species.get_contig(chrom, genetic_map=genetic_map_id).recombination_map
min_coord = np.min(recmap.position[:-1][~np.isnan(recmap.rate)]) # getting the position after nans
max_coord = recmap.left[np.where(recmap.rate == 0)[0][-1]-1] # getting the leftmost position before 0 map
win_start = min_coord + (max_coord - min_coord - config["focal_size"]) * rel_win_start
win_end = win_start + config["focal_size"]
windows = np.column_stack((win_start,win_end))
#windows = windows[~np.any(windows>max_coord, axis=1),:]
windows = np.floor(windows)
# Grid of selection coeffs
coeffs_dict = config["coeffs"]
sel_coeffs = np.linspace(*coeffs_dict["bounds"], coeffs_dict["grid_size"])
time_mults = config["time_multipliers"]
# Random seeds for main simulations, checks on boundary size
seed_array = rng.integers(1,2**31,replicates)
seed2_array = rng2.integers(1,2**31,config["boundary_reps"])
# The specific demographic model you would like to run
demo_model = config["demo_model"]
pop_list = []
for popname, n in demo_model["samples"].items():
if n > 0:
pop_list.append(popname)
# Simulate across multiple DFEs/annotations
dfe_list = config["dfe_list"]
annotation_list = config["annotation_list"]
# Region size
region_size = config["region_size"]
# -------------- Helper functions for making buffer around focal window ---------------
def genetic_distance(ratemap, x, x0=0):
"""
Function that returns the genetic distance (in cM) between positions x and
x0 based on a msprime.RateMap with recombination rates.
"""
return 100*(ratemap.get_cumulative_mass(x) - ratemap.get_cumulative_mass(x0))
def write_rec_rates(fname, ratemap, windows):
"""
Function that writes a .tsv file with the recombination rates within the
`windows`from a msprime.RateMap.
"""
last = 0
fh = open(fname, 'w')
for start, end in windows:
wsize = end - start
start_center = start + (wsize*0.4)
end_center = start + (wsize*0.6)
cM_center = ratemap.get_cumulative_mass(end_center) - ratemap.get_cumulative_mass(start_center)
cM = ratemap.get_cumulative_mass(end) - ratemap.get_cumulative_mass(start)
print(f'{start}\t{end}\t{cM}\t{cM_center}', file=fh)
fh.close()
def write_annot_density(fname, chrom, annot, windows):
annot_intervals = species.get_annotations(annot).get_chromosome_annotations(chrom)
ex_density = np.zeros(windows.shape[0])
for i, (start, end) in enumerate(windows):
for es, ee in annot_intervals:
if es > end:
break
# case where exon starts before window
if (es < start and ee >= start) :
# dealing with case where exon spans more than the entire window
tee = ee if ee < end else end
ex_density[i] += tee - start
elif es > start and es < end:
tee = ee if ee < end else end
ex_density[i] += tee - es
ex_density = ex_density / (windows[:,1] - windows[:,0])
fh = open(fname, 'w')
print('chr\tsim_left\tsim_right\toverlap\tannot', file=fh)
for ed, (start, end) in zip (ex_density, windows):
print(f'{chrom}\t{start}\t{end}\t{ed}\t{annot}', file=fh)
fh.close()
def make_buffer(gmap, cM, left, right):
"""
This function adds a buffer zone to the interval [left, right] of fixed genetic distance cM
and returns the physical positions of the buffered interval.
The buffered intervals are bounded by the size of the genetic map gmap.
"""
assert right > left
assert left >= 0
assert right <= gmap.sequence_length
# this is gonna fail in the beginning and ends of chroms bc of nans and 0s in ratemap
try:
left = root_scalar(
lambda y: genetic_distance(ratemap=gmap, x=left, x0=y) - cM,
method='bisect',
bracket=[0, gmap.sequence_length],
)
left = left.root
except:
left = 0
try:
right = root_scalar(
lambda y: genetic_distance(ratemap=gmap, x=y, x0=right) - cM,
method='bisect',
bracket=[0, gmap.sequence_length],
)
right = right.root
except:
right = gmap.sequence_length
return left, right
def add_window_metadata(ts, window_left, window_right, window_bleft, window_bright, extra=None):
"""
Add information on window and buffer to the tree sequence metadata.
"bleft" and "bright" are for buffer, "left" and "right" are for focal window.
Positions are measured relative to "bleft".a
"""
if extra is None:
extra = ""
tables = ts.dump_tables()
metadata = tables.metadata
new_metadata = {
"windowing":
{"window_left": window_left,
"window_bleft": window_bleft,
"window_right": window_right,
"window_bright": window_bright,
},
"extra":extra
}
metadata.update(new_metadata)
tables.metadata = metadata
return tables.tree_sequence()
# ----------- Helper functions for sweepfinder -----------
def compute_clr(ts_path, recmap, model, samples, sample_sets):
"""
Dump allele frequencies, recombination rates into sweepfinder input formats;
calculate expected site frequency spectrum under neutrality; run sweepfinder
across a grid of test sites in the focal window
"""
out = []
for pop, ss in enumerate(sample_sets):
base_path = f"{ts_path}.{pop}.sf2"
spec_path = f"{base_path}.spec"
grid_path = f"{base_path}.grid"
freq_path = f"{base_path}.freq"
rate_path = f"{base_path}.rate"
clr_path = f"{base_path}.clr"
ts = tskit.load(ts_path)
dump_sf2_spectrum_file(spec_path, model, samples, ss)
dump_sf2_allele_frequency_file(freq_path, ts, ss)
dump_sf2_recombination_rate_file(rate_path, recmap, ts)
grid = dump_sf2_grid_file(grid_path, ts)
args = [
"SweepFinder2", "-lru", f"{grid_path}", f"{freq_path}",
f"{spec_path}", f"{rate_path}", f"{clr_path}",
]
subprocess.run(args, stdout=subprocess.DEVNULL)
clr = np.loadtxt(clr_path, skiprows=1)
out.append(clr[:,1])
# windows for the CLR -- note that we are going from 1-based positions to [,) intervals
windows = np.array([grid, grid+1]).T
return np.array(out).T, windows
def dump_sf2_spectrum_file(output_path, model, samples, sample_set, replicates=10000, seed=1024):
"""
Simulate neutral spectrum for sweepfinder, given a stdpopsim demographic model
and a population-sample dict. Uses marginal trees to cut computation time,
fix seed so as to use the same spectrum. Dump into sweepfinder2's input format,
which is two columns per SFS bin (including monomorphic bins), that are
frequency and proportion of sites. The proportions should be 0 for the monomorphic
bins.
This is quick enough that there's no need to cache
"""
sim_gen = msprime.sim_ancestry(
samples=samples,
demography=model.model,
num_replicates=replicates,
sequence_length=1,
recombination_rate=0.0,
random_seed=seed,
)
out = None
for ts in sim_gen:
sfs = ts.allele_frequency_spectrum(
sample_sets=[sample_set], mode='branch', polarised=True
)
sfs[0] = 0.0 #monomorphic ancestral
sfs[-1] = 0.0 #monomorphic derived
sfs /= np.sum(sfs)
if out is None:
out = sfs
else:
out += sfs
out /= np.sum(out)
handle = open(output_path, "w")
for i, prop in enumerate(out[:-1]):
handle.write(f"{i}\t{'%.6e' % prop}\n")
handle.close()
def dump_sf2_allele_frequency_file(output_path, ts, sample_set):
"""
Dump allele frequencies from tree sequence into sweepfinder2's input
format, which is four columns per SNP; that are position, allele frequency,
number of haploids, and not folded/folded (0/1). Positions are 1-based.
Not including fixed derived sites.
TODO: Might want to restrict this to just SNPs in focal window
"""
handle = open(output_path, "w")
offset = ts.metadata['windowing']['window_bleft']
focal_left = ts.metadata['windowing']['window_left'] - offset
focal_right = ts.metadata['windowing']['window_right'] - offset
handle.write("position\tx\tn\tfolded\n")
for var in ts.variants():
#if var.position >= focal_right:
# break
#elif var.position >= focal_left and len(var.alleles) == 2:
if len(var.alleles) == 2:
pos = int(var.position)
geno = var.genotypes[sample_set]
geno = geno[geno != tskit.MISSING_DATA]
x = np.sum(geno)
n = geno.size
assert n >= x >= 0
if x > 0 and x < n:
handle.write(f"{pos+1}\t{x}\t{n}\t{0}\n")
handle.close()
def dump_sf2_recombination_rate_file(output_path, recmap, ts):
"""
Dump recombination rates into sweepfinder2's input format, which is two
columns per SNP, that are position and recombination rate in cM from the
last SNP (e.g. cumulative recombination rate). Positions are 1-based.
"""
offset = ts.metadata['windowing']['window_bleft']
focal_left = ts.metadata['windowing']['window_left'] - offset
focal_right = ts.metadata['windowing']['window_right'] - offset
recmap_slice = recmap.slice(
left=offset, trim=True
)
handle = open(output_path, "w")
handle.write("position\trate\n")
last_pos = None
for var in ts.variants():
#if var.position >= focal_right:
# break
#elif var.position >= focal_left and len(var.alleles) == 2:
if len(var.alleles) == 2:
pos = int(var.position)
if last_pos is None:
dist = 0
else:
dist = genetic_distance(
recmap_slice, pos, last_pos
)
last_pos = pos
handle.write(f"{pos+1}\t{'%0.6e' % dist}\n")
handle.close()
def dump_sf2_grid_file(output_path, ts, grid_size=21):
"""
Dump test grid into sweepfinder2's input format, which is one position on
each line. Positions are 1-based.
For debugging/visualisation use grid. Probably only need the
centermost point (where sweep is located).
"""
offset = ts.metadata['windowing']['window_bleft']
focal_left = ts.metadata['windowing']['window_left'] - offset
focal_right = ts.metadata['windowing']['window_right'] - offset
grid = np.linspace(focal_left, focal_right-1, grid_size) # -1 bc the right is not inclusive
handle = open(output_path, "w")
for pos in grid:
handle.write(f"{int(pos)}\n")
handle.close()
return grid+offset
# ----------- Helper functions for summary statistics -----------
def compute_stats(ts, sample_sets, num_subwins=1):
"""
Calculate summary statistics for a window
TODO: offload to scikit-allel, add more stats?
TODO: it'd be useful to have a grid across the focal window,
for visualization. Then, the merge_stats rule could just
extract the middle point of the grid (value where the sweep is).
easiest to do this w/ scikit-allel, using overlapping windows
"""
offset = ts.metadata['windowing']['window_bleft']
bleft = ts.metadata['windowing']['window_bleft']-offset
left = ts.metadata['windowing']['window_left']-offset
right = ts.metadata['windowing']['window_right']-offset
bright = ts.metadata['windowing']['window_bright']-offset
in_between_wins = np.linspace(left, right, num_subwins+1)
windows_bpoints = np.concatenate(([bleft],in_between_wins,[bright]))
# removing duplicates if bleft/left or bright/right are the same
windows_bpoints = np.unique(windows_bpoints)
# something wonky happening with float conversion
windows_bpoints[-1] = ts.sequence_length
div = ts.diversity(sample_sets=sample_sets, windows=windows_bpoints, mode="site")
windows = np.column_stack((windows_bpoints[:-1], windows_bpoints[1:]))
assert windows.shape[0] == div.shape[0]
# removing the buffer window
if bleft != left:
div = div[1:,]
windows = windows[1:,]
if bright != right:
div = div[:-1,]
windows = windows[:-1,]
windows = windows + offset # returning windows to original scale
# always include the last bpoints from the windows array
return div, windows
# ------------ Wrapper to aggregate results --------------
def _print_stat_line(statname, stats, windows, target_pops, meta_str, fh):
assert stats.shape == (windows.shape[0],len(target_pops))
for i in range(windows.shape[0]):
for j in range(len(target_pops)):
print(f'{meta_str}\t{target_pops[j]}\t{windows[i,0]:.0f}\t{windows[i,1]:.0f}\t{statname}\t{stats[i,j]:.4e}', file=fh, flush=True)
def dump_results(input, output, params_dict, target_pops, num_subwins=1):
"""
Dump sweep various test statistics to file
"""
model = species.get_demographic_model(demo_model["id"])
samples = demo_model["samples"]
ts = tskit.load(input[0])
all_pops_in_ts = [pop.metadata['name'] for pop in ts.populations()]
if target_pops is None:
target_pops = pop_list
# Printing the header
fh = open(output[0], 'w')
meta = ["model", "demo", "seed", "chrom", "sim_left", "sim_right", "annot", "dfe", "coeff", "tmult"]
header = "\t".join(meta + ['pop','stat_left', 'stat_right', 'stat_name', 'stat_val'])
meta_str = "\t".join([params_dict[m] for m in meta])
print(f'{header}', file=fh, flush=True)
# Outputting VCF
fh_vcf = open(output[1], 'w')
sample_sets = [ts.samples(population=all_pops_in_ts.index(pop)) for pop in target_pops]
tss = ts.simplify(np.array(sample_sets).flatten())
# Removing the buffer region
w_dict = ts.metadata["windowing"]
del_intervals = []
if w_dict["window_bleft"] != w_dict["window_left"]:
del_intervals.append([0,w_dict["window_left"]-w_dict["window_bleft"]])
if w_dict["window_right"] != w_dict["window_bright"]:
del_intervals.append([w_dict["window_right"]-w_dict["window_bleft"], ts.sequence_length])
if len(del_intervals) > 0:
tss = tss.delete_intervals(del_intervals)
tss = tss.trim()
# because we are shifting from 0-based to 1-based, I need to remove any sites that may have happened at the last position
sites_at_last = np.where(np.round(tss.sites_position)==config["focal_size"])[0]
assert sites_at_last.shape[0] < 4 # realistically we shouldn't get more than two or three hits there
tss = tss.delete_sites(sites_at_last)
tss.write_vcf(fh_vcf, position_transform = lambda x: 1 + np.round(x))
fh_vcf.close()
# write seqlen of shortened ts
with open(output[2], 'w') as f:
f.write(str(int(tss.sequence_length)))
# write anc file
seq = list("A" * int(tss.sequence_length))
with open(output[3], "w") as f:
f.write(">1\n")
for v in tss.variants():
seq[int(v.site.position)] = v.alleles[0]
f.write(''.join(seq) + "\n")
# outputting diploshic samples to pop
write_diploshic_sampleToPopFile(ts, target_pops, output[4])
# Computing tsstats diversity
divs, windows = compute_stats(ts, sample_sets, num_subwins=num_subwins)
assert divs.shape == (num_subwins,len(target_pops))
assert windows.shape == (num_subwins,2)
_print_stat_line("div", divs, windows, target_pops, meta_str, fh)
clrs, windows = compute_clr(input[0], recmap, model, samples, sample_sets)
_print_stat_line("clr", clrs, windows, target_pops, meta_str, fh)
fh.close()
#######################
# Import diploshic rules
#
##########################
module diploshic_workflow:
snakefile:
"diploshic.snake"
config:
"diploshic_params.yaml"
use rule * from diploshic_workflow as diploshic_*
#### diploshic helpers ##########
def write_diploshic_sampleToPopFile(ts, target_pops, filename):
all_pops_in_ts = [pop.metadata['name'] for pop in ts.populations()]
samp_dict = {pop: ts.samples(population=all_pops_in_ts.index(pop)) for pop in target_pops}
with open(filename, "w") as f:
count = 0
for pop, samps in samp_dict.items():
ind_samps = ts.nodes_individual[samps]
assert np.all(np.diff(ind_samps)[::2]==0)
for samp in ind_samps[::2]:
# if there is only one target pop, then the other samples will be simplified out, so we need to start from 0
if len(target_pops) == 1:
samp = count
f.write(f"tsk_{samp}\t{pop}\n")
count += 1
f.close()
# ###############################################################################
# GENERAL RULES
# ###############################################################################
boundary_outs = [output_dir +f"/simulated_data/sweeps/boundary_sims/sim_{seed}_{config['region_size']}.trees"
for seed in seed2_array
]
neutral_outs_prefix = [output_dir + f"/simulated_data/sweeps/neutral/{demo_model['id']}/{seed}/sim_{chrom}_{left:.0f}_{right:.0f}"
for left, right in windows
for seed in seed_array
]
bgs_outs_prefix = [output_dir + f"/simulated_data/sweeps/bgs/{demo_model['id']}/{annot}/{dfe}/{seed}/sim_{chrom}_{left:.0f}_{right:.0f}"
for left, right in windows
for seed in seed_array
for dfe in dfe_list
for annot in annotation_list
]
sw_outs_prefix = [output_dir + f"/simulated_data/sweeps/sweep/{demo_model['id']}/{popu}/{annot}/{dfe}/{coeff}/{tmult}/{seed}/sim_{chrom}_{left:.0f}_{right:.0f}"
for left, right in windows
for seed in seed_array
for dfe in dfe_list
for annot in annotation_list
for coeff in sel_coeffs
for popu in pop_list
for tmult in time_mults
] + [output_dir + f"/simulated_data/sweeps/sweep/{demo_model['id']}/{popu}/NA/NA/{coeff}/{tmult}/{seed}/sim_{chrom}_{left:.0f}_{right:.0f}"
for left, right in windows
for seed in seed_array
for coeff in sel_coeffs
for popu in pop_list
for tmult in time_mults
]
sw_outs_prefix_pop = [output_dir + f"/simulated_data/sweeps/sweep/{demo_model['id']}/{popu}/{annot}/{dfe}/{coeff}/{tmult}/{seed}/sim_{chrom}_{left:.0f}_{right:.0f}_{popu}"
for left, right in windows
for seed in seed_array
for dfe in dfe_list
for annot in annotation_list
for coeff in sel_coeffs
for popu in pop_list
for tmult in time_mults
] + [output_dir + f"/simulated_data/sweeps/sweep/{demo_model['id']}/{popu}/NA/NA/{coeff}/{tmult}/{seed}/sim_{chrom}_{left:.0f}_{right:.0f}_{popu}"
for left, right in windows
for seed in seed_array
for coeff in sel_coeffs
for popu in pop_list
for tmult in time_mults
]
trees_outs = [file_prefix+".trees" for file_prefix in neutral_outs_prefix + bgs_outs_prefix + sw_outs_prefix]
stats_outs = [file_prefix+".stats.tsv" for file_prefix in neutral_outs_prefix + bgs_outs_prefix + sw_outs_prefix]
vcf_outs = [file_prefix+".vcf" for file_prefix in neutral_outs_prefix + bgs_outs_prefix + sw_outs_prefix]
annot_outs = [output_dir + f'/simulated_data/sweeps/annot_overlap_{annot}_{chrom}_{config["num_windows"]}.tsv' for annot in annotation_list]
anc_outs = [file_prefix+".diploshic.ancFile" for file_prefix in neutral_outs_prefix + bgs_outs_prefix + sw_outs_prefix]
fv_outs = [file_prefix+f"_{popu}.diploshic.fv" for file_prefix in neutral_outs_prefix + bgs_outs_prefix for popu in pop_list] + [file_prefix+".diploshic.fv" for file_prefix in sw_outs_prefix_pop]
pred_outs = [file_prefix+f"_{popu}.diploshic.preds" for file_prefix in neutral_outs_prefix + bgs_outs_prefix for popu in pop_list] + [file_prefix+".diploshic.preds" for file_prefix in sw_outs_prefix_pop]
shic_outs1 = [file_prefix+f"_{popu}.stats.tsv.shic" for file_prefix in neutral_outs_prefix for popu in pop_list]
shic_outs2 = [file_prefix+f"_{popu}.stats.tsv.shic" for file_prefix in bgs_outs_prefix for popu in pop_list]
shic_outs3 = [file_prefix+".stats.tsv.shic" for file_prefix in sw_outs_prefix_pop]
rule all:
input:
rules.diploshic_all.input,
boundary_outs + trees_outs + stats_outs + vcf_outs + [output_dir + f'/simulated_data/sweeps/all_sims.stats.tsv', output_dir+f'/simulated_data/sweeps/rec_map_{chrom}_{config["num_windows"]}.tsv'] + annot_outs +anc_outs + fv_outs + pred_outs + shic_outs1 + shic_outs2 + shic_outs3
default_target: True
rule write_rec:
input:
output: output_dir + f'/simulated_data/sweeps/rec_map_{chrom}_{config["num_windows"]}.tsv'
run:
write_rec_rates(output[0], recmap, windows)
rule write_annot:
input:
output: output_dir + f'/simulated_data/sweeps/annot_overlap_{{annot}}_{{chrom}}_{config["num_windows"]}.tsv'
run:
print(windows)
write_annot_density(output[0], wildcards.chrom, wildcards.annot, windows)
rule boundary_sims:
input:
output:
output_dir + "/simulated_data/sweeps/boundary_sims/sim_{seed}_{region_size}.trees"
resources: time=60, mem_mb=6000
run:
model = species.get_demographic_model(demo_model["id"])
mut_rate = model.mutation_rate
# Use new sample specification, give selected pop and sample size as config arguments
samples = demo_model["samples"]
dfe = species.get_dfe(dfe_list[0])
# reducing number of del muts bc it's too much!
dfe.proportions = [1-(0.7*0.25), 0.7*0.25]
clen = int(wildcards.region_size)
contig = species.get_contig(length=clen)
contig.add_dfe(intervals=[[0, clen]], DFE=dfe)# applying DFE over some equally spaced
contig.mutation_rate = mut_rate
engine = stdpopsim.get_engine("slim")
ts = engine.simulate(
model,
contig,
samples,
slim_scaling_factor=config["slim_scaling_factor"],
slim_burn_in=config["slim_burn_in"],
seed=int(wildcards.seed),
)
ts.dump(output[0])
rule neutral:
input:
output:
output_dir + f"/simulated_data/sweeps/neutral/{demo_model['id']}/{{seed}}/sim_{chrom}_{{left}}_{{right}}.trees",
resources: time=30, mem_mb=7000
run:
model = species.get_demographic_model(demo_model["id"])
mutation_rate = model.mutation_rate
extended_events = None
# Use new sample specification, give selected pop and sample size as config arguments
samples = demo_model["samples"]
# Extract contig spanning window of chromosome
window_left = int(wildcards.left)
window_right = int(wildcards.right)
contig = species.get_contig(chrom, genetic_map=genetic_map_id, left=window_left, right=window_right)
contig.mutation_rate = mutation_rate
engine = stdpopsim.get_engine("slim")
ts = engine.simulate(
model,
contig,
samples,
extended_events=extended_events,
slim_scaling_factor=config["slim_scaling_factor"],
slim_burn_in=config["slim_burn_in"],
seed=int(wildcards.seed),
)
ts = add_window_metadata(ts, window_left, window_right, window_left, window_right)
ts.dump(output[0])
rule bgs:
input:
output:
output_dir + f"/simulated_data/sweeps/bgs/{demo_model['id']}/{{annot}}/{{dfe}}/{{seed}}/sim_{chrom}_{{left}}_{{right}}.trees",
resources: time=30, mem_mb=8000
run:
model = species.get_demographic_model(demo_model["id"])
mutation_rate = model.mutation_rate
extended_events = None
# Use new sample specification, give selected pop and sample size as config arguments
samples = demo_model["samples"]
# Extract contig spanning window of chromosome
window_left = int(wildcards.left)
window_right = int(wildcards.right)
b_window_left, b_window_right = make_buffer(recmap, config["buffer_cM"], window_left, window_right)
contig = species.get_contig(chrom, genetic_map=genetic_map_id, left=b_window_left, right=b_window_right)
dfe = species.get_dfe(wildcards.dfe)
## Adding annotation only seletion on exon region
annot = species.get_annotations(wildcards.annot)
annot_intervals = annot.get_chromosome_annotations(chrom)
contig.add_dfe(intervals=annot_intervals, DFE=dfe)
contig.mutation_rate = mutation_rate
engine = stdpopsim.get_engine("slim")
ts = engine.simulate(
model,
contig,
samples,
extended_events=extended_events,
slim_scaling_factor=config["slim_scaling_factor"],
slim_burn_in=config["slim_burn_in"],
seed=int(wildcards.seed),
)
ts = add_window_metadata(ts, window_left, window_right, b_window_left, b_window_right)
ts.dump(output[0])
rule sweep:
input:
output:
output_dir + f"/simulated_data/sweeps/sweep/{demo_model['id']}/{{popu}}/{{annot}}/{{dfe}}/{{coeff}}/{{tmult}}/{{seed}}/sim_{{chrom}}_{{left}}_{{right}}.trees",
resources: time=30, mem_mb=12000
run:
model = species.get_demographic_model(demo_model["id"])
mutation_rate = model.mutation_rate
extended_events = None
# Use new sample specification, give selected pop and sample size as config arguments
samples = demo_model["samples"]
# Extract contig spanning window of chromosome
window_left = int(wildcards.left)
window_right = int(wildcards.right)
b_window_left, b_window_right = make_buffer(recmap, config["buffer_cM"], window_left, window_right)
contig = species.get_contig(chrom, genetic_map=genetic_map_id, left=b_window_left, right=b_window_right)
if wildcards.dfe != "NA":
dfe = species.get_dfe(wildcards.dfe)
## Adding annotation only seletion on exon region
annot = species.get_annotations(wildcards.annot)
annot_intervals = annot.get_chromosome_annotations(chrom)
contig.add_dfe(intervals=annot_intervals, DFE=dfe)
# sweep parameters
coord = ((window_left+window_right)//2) # the API does the right thing even if it is chunked
coeff = float(wildcards.coeff)
# Defining how long ago beneficial mutations arise
# First getting 2N from the model and pop
debug = model.model.debug()
coal_T = debug.mean_coalescence_time({wildcards.popu: 2})
gamma = coal_T* coeff #2Ns
# TODO: check this is not off by a factor of two
T_f = 4 * (np.log(gamma) + 0.5772 - (1/gamma)) / coeff # this is from Charlesworth 2020/Pennings and Hermisson 2007
time = np.floor(float(wildcards.tmult)*T_f) # I dilated the time a bit because of variance (don't want to lose too many sims due to this -- note we are conditioning on sweep_min_freq)
print(coal_T, gamma, T_f, time)
contig.add_single_site("sweep", coordinate=coord)
extended_events = stdpopsim.ext.selective_sweep(
"sweep",
population=wildcards.popu, #selected pop
selection_coeff=coeff,
mutation_generation_ago=time,
min_freq_at_end=config["sweep_min_freq"]
)
contig.mutation_rate = mutation_rate
engine = stdpopsim.get_engine("slim")
ts = engine.simulate(
model,
contig,
samples,
extended_events=extended_events,
slim_scaling_factor=config["slim_scaling_factor"],
slim_burn_in=config["slim_burn_in"],
seed=int(wildcards.seed),
)
ts = add_window_metadata(ts, window_left, window_right, b_window_left, b_window_right, extra=f"sweep_time={time};")
ts.dump(output[0])
def _get_params_dict_from_wildcards(wildcards):
target_pops = None
params_list = wildcards.middle.split("/")
model = params_list[0]
demo = params_list[1]
seed = params_list[-1]
params_dict = {
"model":model,
"demo":demo,
"seed":seed,
"annot":"NA",
"dfe":"NA",
"coeff":"NA",
"tmult":"NA",
"chrom": wildcards.chrom,
"sim_left": wildcards.left,
"sim_right": wildcards.right
}
assert model in ["neutral", "bgs", "sweep"]
if model == "bgs":
params_dict["annot"] = params_list[2]
params_dict["dfe"] = params_list[3]
if model == "sweep":
target_pops = [params_list[2]]
params_dict["annot"] = params_list[3]
params_dict["dfe"] = params_list[4]
params_dict["coeff"] = params_list[5]
params_dict["tmult"] = params_list[6]
return params_dict, target_pops
rule get_stats:
input: output_dir + '/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.trees'
output:
output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.stats.tsv",
output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.vcf",
output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.seqlen",
output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.diploshic.ancFile",
output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.diploshic.samples"
resources: time=30, mem_mb=4000
run:
params_dict, target_pops = _get_params_dict_from_wildcards(wildcards)
dump_results(input, output, params_dict, target_pops, config["num_subwins"])
rule diploshic_fvs:
input:
output_dir + '/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.seqlen',
output_dir + '/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.vcf',
output_dir + '/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.diploshic.ancFile',
output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.diploshic.samples"
output:
output_dir + '/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}_{popu}.diploshic.fv'
resources: time=40, mem_mb=5000
run:
with open(input[0],'r') as f:
seq_len = f.read().strip()
cmd = f"diploSHIC fvecVcf haploid {input[1]} 1 {int(seq_len)} {output[0]} --targetPop {wildcards.popu} --sampleToPopFileName {input[3]} --winSize 1100000 --ancFileName {input[2]}"
shell(cmd)
rule diploshic_pred:
input:
rules.diploshic_fvs.output,
rules.diploshic_train_classifier.output
output:
output_dir + '/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}_{popu}.diploshic.preds'
resources: time=30, mem_mb=3000
run:
cmd = f"export CUDA_VISIBLE_DEVICES=\"\" && diploSHIC predict trained_model.json trained_model.weights.h5 {input[0]} {output[0]}"
shell(cmd)
rule add_diploshic_preds_to_stats:
input: output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.trees",
output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}_{popu}.diploshic.preds"
output: output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}_{popu}.stats.tsv.shic"
resources: time=30, mem_mb=6000
run:
params_dict, target_pops = _get_params_dict_from_wildcards(wildcards)
target_pops = [wildcards.popu]
meta = ["model", "demo", "seed", "chrom", "sim_left", "sim_right", "annot", "dfe", "coeff", "tmult"]
meta_str = "\t".join([params_dict[m] for m in meta])
ts = tskit.load(input[0])
df = pd.read_csv(input[1], sep="\t")
df.classifiedWinStart = df.classifiedWinStart - 1 + ts.metadata['windowing']['window_left']
df.classifiedWinEnd = df.classifiedWinEnd + ts.metadata['windowing']['window_left']
assert np.all(df.classifiedWinStart <= ts.metadata['windowing']['window_right'])
assert np.all(df.classifiedWinEnd <= ts.metadata['windowing']['window_right'])
assert np.all(df.classifiedWinStart > ts.metadata['windowing']['window_left']+1)
windows = df.iloc[:,[1,2]].to_numpy().astype(np.int32)
stats = (df.iloc[:,8].to_numpy() + df.iloc[:,9].to_numpy()).astype(np.float32) # prob soft+hard
stats = np.expand_dims(stats, axis = 1) # adding another dim going from (n_wins,) to (n_wins,1)
fh = open(output[0], 'w')
_print_stat_line("diploshic", stats, windows, target_pops, meta_str, fh)
fh.close()
rule merge_stats:
input: stats_outs
output:
output_dir + f'/simulated_data/sweeps/all_sims.tmp.stats.tsv'
resources: time=1500, mem_mb=350000, disk_mb=350000
run:
#print(input, flush=True)
#import pdb; pdb.set_trace()
df = pd.concat(map(lambda x: pd.read_csv(x, sep="\t"), input))
df.to_csv(output[0], sep="\t", index=False)
rule merge_stats_shic1:
input: shic_outs1
output:
output_dir + f'/simulated_data/sweeps/all_sims1.shic.stats.tsv'
resources: time=3000, mem_mb=350000, disk_mb=350000
run:
#print(input, flush=True)
#import pdb; pdb.set_trace()
df = pd.concat(map(lambda x: pd.read_csv(x, sep="\t", header=None), input))
df.to_csv(output[0], sep="\t", index=False, header=None)
rule merge_stats_shic2:
input: shic_outs2
output:
output_dir + f'/simulated_data/sweeps/all_sims2.shic.stats.tsv'
resources: time=3000, mem_mb=350000, disk_mb=350000
run:
#print(input, flush=True)
#import pdb; pdb.set_trace()
df = pd.concat(map(lambda x: pd.read_csv(x, sep="\t", header=None), input))
df.to_csv(output[0], sep="\t", index=False, header=None)
rule merge_stats_shic3:
input: shic_outs3
output:
output_dir + f'/simulated_data/sweeps/all_sims3.shic.stats.tsv'
resources: time=3000, mem_mb=350000, disk_mb=350000
run:
#print(input, flush=True)
#import pdb; pdb.set_trace()
df = pd.concat(map(lambda x: pd.read_csv(x, sep="\t", header=None), input))
df.to_csv(output[0], sep="\t", index=False, header=None)
rule merge_all_stats:
input: [output_dir + f'/simulated_data/sweeps/all_sims.tmp.stats.tsv', output_dir + f'/simulated_data/sweeps/all_sims3.shic.stats.tsv', output_dir + f'/simulated_data/sweeps/all_sims2.shic.stats.tsv', output_dir + f'/simulated_data/sweeps/all_sims1.shic.stats.tsv']
output: output_dir + f'/simulated_data/sweeps/all_sims.stats.tsv'
resources: time=3000, mem_mb=350000, disk_mb=350000
shell:
"cat {input} > {output}"
"""
rule get_neut_stats:
input:
output_dir + f"/simulated_data/sweeps/neutral/{{seed}}/sim_{chrom}_{{left}}_{{right}}.trees",
output:
output_dir + f"/simulated_data/sweeps/neutral/{{seed}}/sim_{chrom}_{{left}}_{{right}}.stats.tsv",
resources: time=3000, mem_mb=2000
run:
dump_results(input, output, wildcards, model="neutral")
rule get_bgs_stats:
input:
output_dir + f"/simulated_data/sweeps/bgs/{{annot}}/{{dfe}}/{{seed}}/sim_{chrom}_{{left}}_{{right}}.trees",
output:
output_dir + f"/simulated_data/sweeps/bgs/{{annot}}/{{dfe}}/{{seed}}/sim_{chrom}_{{left}}_{{right}}.stats.tsv",
resources: time=3000, mem_mb=2000
run:
dump_results(input, output)
rule get_sw_stats:
input:
output_dir + f"/simulated_data/sweeps/sweep/{{annot}}/{{dfe}}/{{coeff}}/{{time}}/{{seed}}/sim_{chrom}_{{left}}_{{right}}.trees",
output:
output_dir + f"/simulated_data/sweeps/sweep/{{annot}}/{{dfe}}/{{coeff}}/{{time}}/{{seed}}/sim_{chrom}_{{left}}_{{right}}.stats.tsv",
resources: time=3000, mem_mb=2000
run:
dump_results(input, output)
"""