workflows/sweep_simulate.snake

"""
Snakefile for running sweeps benchmark on stdpopsim.
"""
import os
import numpy as np
import pandas as pd
import tskit
import msprime
import stdpopsim
import math
import subprocess
from scipy.optimize import root_scalar

configfile: "workflows/config/snakemake/sweep_config.yaml"
#configfile: "workflows/config/snakemake/small_sweep.yaml"

rng = np.random.default_rng(seed=config["seed"])
rng2 = np.random.default_rng(seed=config["seed"]+1)

# ###############################################################################
# GLOBALS
# ###############################################################################

# ----------------- Setup parameter grid ----------------- 

# The number of replicates of each analysis you would like to run
replicates = config["replicates"]

# Where you would like all output files from analysis to live
output_dir = os.path.abspath(config["output_dir"])

# The analysis species
species = stdpopsim.get_species(config["species"])

# The name of the chromosome to simulate
chrom = config["chrom"]

# The genetic map to use
genetic_map_id = config["genetic_map"]

# Getting starting position for sweep windows
rel_win_start = np.linspace(0, 1.0, config["num_windows"], endpoint=False)

# Shift/scale windows from unit coordinates to
# [start of recombination map, chromosome length]
# where "start of recombination map" means the first interval with data.
# NB: this will change simulation coordinates depending on genetic map!
recmap = species.get_contig(chrom, genetic_map=genetic_map_id).recombination_map
min_coord = np.min(recmap.position[:-1][~np.isnan(recmap.rate)]) # getting the position after nans
max_coord = recmap.left[np.where(recmap.rate == 0)[0][-1]-1] # getting the leftmost position before 0 map
win_start = min_coord + (max_coord - min_coord - config["focal_size"]) * rel_win_start
win_end = win_start + config["focal_size"] 
windows = np.column_stack((win_start,win_end)) 
#windows = windows[~np.any(windows>max_coord, axis=1),:]
windows = np.floor(windows)

# Grid of selection coeffs
coeffs_dict = config["coeffs"]
sel_coeffs = np.linspace(*coeffs_dict["bounds"], coeffs_dict["grid_size"])
time_mults = config["time_multipliers"]

# Random seeds for main simulations, checks on boundary size
seed_array = rng.integers(1,2**31,replicates)
seed2_array = rng2.integers(1,2**31,config["boundary_reps"])

# The specific demographic model you would like to run
demo_model = config["demo_model"]
pop_list = []
for popname, n in demo_model["samples"].items():
    if n > 0:
        pop_list.append(popname)

# Simulate across multiple DFEs/annotations
dfe_list = config["dfe_list"]
annotation_list = config["annotation_list"]

# Region size 
region_size = config["region_size"]

# -------------- Helper functions for making buffer around focal window ---------------


def genetic_distance(ratemap, x, x0=0):
    """
    Function that returns the genetic distance (in cM) between positions x and 
    x0 based on a msprime.RateMap with recombination rates.
    """
    return 100*(ratemap.get_cumulative_mass(x) - ratemap.get_cumulative_mass(x0))


def write_rec_rates(fname, ratemap, windows):
    """
    Function that writes a .tsv file with the recombination rates within the 
    `windows`from a msprime.RateMap.
    """
    last = 0
    fh = open(fname, 'w')
    for start, end in windows:
        wsize = end - start
        start_center = start + (wsize*0.4)
        end_center = start + (wsize*0.6)
        cM_center = ratemap.get_cumulative_mass(end_center) - ratemap.get_cumulative_mass(start_center)
        cM = ratemap.get_cumulative_mass(end) - ratemap.get_cumulative_mass(start)
        print(f'{start}\t{end}\t{cM}\t{cM_center}', file=fh)
    fh.close()

def write_annot_density(fname, chrom, annot, windows):
    annot_intervals = species.get_annotations(annot).get_chromosome_annotations(chrom)
    ex_density = np.zeros(windows.shape[0])
    for i, (start, end) in enumerate(windows):
        for es, ee in annot_intervals:
            if es > end:
                break
            # case where exon starts before window
            if (es < start and ee >= start) :
                # dealing with case where exon spans more than the entire window
                tee = ee if ee < end else end
                ex_density[i] += tee - start
            elif es > start and es < end:
                tee = ee if ee < end else end
                ex_density[i] += tee - es
    ex_density = ex_density / (windows[:,1] - windows[:,0])
    fh = open(fname, 'w')
    print('chr\tsim_left\tsim_right\toverlap\tannot', file=fh)
    for ed, (start, end) in zip (ex_density, windows):
        print(f'{chrom}\t{start}\t{end}\t{ed}\t{annot}', file=fh)
    fh.close()


def make_buffer(gmap, cM, left, right):
    """
    This function adds a buffer zone to the interval [left, right] of fixed genetic distance cM
    and returns the physical positions of the buffered interval.
    The buffered intervals are bounded by the size of the genetic map gmap.
    """
    assert right > left
    assert left >= 0
    assert right <= gmap.sequence_length
    # this is gonna fail in the beginning and ends of chroms bc of nans and 0s in ratemap
    try:
        left = root_scalar(
            lambda y: genetic_distance(ratemap=gmap, x=left, x0=y) - cM,
            method='bisect',
            bracket=[0, gmap.sequence_length],
        )
        left = left.root
    except:
        left = 0
    try:
        right = root_scalar(
            lambda y: genetic_distance(ratemap=gmap, x=y, x0=right) - cM,
            method='bisect',
            bracket=[0, gmap.sequence_length],
        )
        right = right.root
    except:
        right = gmap.sequence_length
    return left, right


def add_window_metadata(ts, window_left, window_right, window_bleft, window_bright, extra=None):
    """
    Add information on window and buffer to the tree sequence metadata. 
    "bleft" and "bright" are for buffer, "left" and "right" are for focal window.
    Positions are measured relative to "bleft".a
    """
    if extra is None:
        extra = ""
    tables = ts.dump_tables()
    metadata = tables.metadata
    new_metadata = {
        "windowing":
            {"window_left": window_left,
                "window_bleft": window_bleft,
                "window_right": window_right,
                "window_bright": window_bright,
            },
        "extra":extra
    }
    metadata.update(new_metadata)
    tables.metadata = metadata
    return tables.tree_sequence()


# ----------- Helper functions for sweepfinder -----------

def compute_clr(ts_path, recmap, model, samples, sample_sets):
    """
    Dump allele frequencies, recombination rates into sweepfinder input formats;
    calculate expected site frequency spectrum under neutrality; run sweepfinder
    across a grid of test sites in the focal window
    """
    out = []
    for pop, ss in enumerate(sample_sets):
        base_path = f"{ts_path}.{pop}.sf2"
        spec_path = f"{base_path}.spec"
        grid_path = f"{base_path}.grid"
        freq_path = f"{base_path}.freq"
        rate_path = f"{base_path}.rate"
        clr_path = f"{base_path}.clr"
        ts = tskit.load(ts_path)
        dump_sf2_spectrum_file(spec_path, model, samples, ss)
        dump_sf2_allele_frequency_file(freq_path, ts, ss)
        dump_sf2_recombination_rate_file(rate_path, recmap, ts)
        grid = dump_sf2_grid_file(grid_path, ts)
        args = [
            "SweepFinder2", "-lru", f"{grid_path}", f"{freq_path}", 
            f"{spec_path}", f"{rate_path}", f"{clr_path}",
        ]
        subprocess.run(args, stdout=subprocess.DEVNULL)
        clr = np.loadtxt(clr_path, skiprows=1)
        out.append(clr[:,1])
    # windows for the CLR -- note that we are going from 1-based positions to [,) intervals
    windows = np.array([grid, grid+1]).T
    return np.array(out).T, windows


def dump_sf2_spectrum_file(output_path, model, samples, sample_set, replicates=10000, seed=1024):
    """
    Simulate neutral spectrum for sweepfinder, given a stdpopsim demographic model
    and a population-sample dict. Uses marginal trees to cut computation time,
    fix seed so as to use the same spectrum. Dump into sweepfinder2's input format,
    which is two columns per SFS bin (including monomorphic bins), that are
    frequency and proportion of sites. The proportions should be 0 for the monomorphic
    bins.

    This is quick enough that there's no need to cache
    """
    sim_gen = msprime.sim_ancestry(
        samples=samples,
        demography=model.model,
        num_replicates=replicates,
        sequence_length=1,
        recombination_rate=0.0,
        random_seed=seed,
    )
    out = None
    for ts in sim_gen:
        sfs = ts.allele_frequency_spectrum(
            sample_sets=[sample_set], mode='branch', polarised=True
        )
        sfs[0] = 0.0 #monomorphic ancestral
        sfs[-1] = 0.0 #monomorphic derived
        sfs /= np.sum(sfs)
        if out is None:
            out = sfs
        else:
            out += sfs
    out /= np.sum(out)
    handle = open(output_path, "w")
    for i, prop in enumerate(out[:-1]):
        handle.write(f"{i}\t{'%.6e' % prop}\n")
    handle.close()


def dump_sf2_allele_frequency_file(output_path, ts, sample_set):
    """
    Dump allele frequencies from tree sequence into sweepfinder2's input
    format, which is four columns per SNP; that are position, allele frequency,
    number of haploids, and not folded/folded (0/1). Positions are 1-based.

    Not including fixed derived sites.
    
    TODO: Might want to restrict this to just SNPs in focal window
    """
    handle = open(output_path, "w")
    offset = ts.metadata['windowing']['window_bleft']
    focal_left = ts.metadata['windowing']['window_left'] - offset
    focal_right = ts.metadata['windowing']['window_right'] - offset
    handle.write("position\tx\tn\tfolded\n")
    for var in ts.variants():
        #if var.position >= focal_right:
        #    break
        #elif var.position >= focal_left and len(var.alleles) == 2:
        if len(var.alleles) == 2:
            pos = int(var.position)
            geno = var.genotypes[sample_set]
            geno = geno[geno != tskit.MISSING_DATA]
            x = np.sum(geno)
            n = geno.size
            assert n >= x >= 0
            if x > 0 and x < n:
                handle.write(f"{pos+1}\t{x}\t{n}\t{0}\n")
    handle.close()


def dump_sf2_recombination_rate_file(output_path, recmap, ts):
    """
    Dump recombination rates into sweepfinder2's input format, which is two
    columns per SNP, that are position and recombination rate in cM from the
    last SNP (e.g. cumulative recombination rate). Positions are 1-based.
    """
    offset = ts.metadata['windowing']['window_bleft']
    focal_left = ts.metadata['windowing']['window_left'] - offset
    focal_right = ts.metadata['windowing']['window_right'] - offset
    recmap_slice = recmap.slice(
      left=offset, trim=True
    )
    handle = open(output_path, "w")
    handle.write("position\trate\n")
    last_pos = None
    for var in ts.variants():
        #if var.position >= focal_right:
        #    break
        #elif var.position >= focal_left and len(var.alleles) == 2:
        if len(var.alleles) == 2:
            pos = int(var.position)
            if last_pos is None:
                dist = 0
            else:
                dist = genetic_distance(
                  recmap_slice, pos, last_pos
                )
            last_pos = pos
            handle.write(f"{pos+1}\t{'%0.6e' % dist}\n")
    handle.close()


def dump_sf2_grid_file(output_path, ts, grid_size=21):
    """
    Dump test grid into sweepfinder2's input format, which is one position on
    each line. Positions are 1-based.

    For debugging/visualisation use grid. Probably only need the
    centermost point (where sweep is located).
    """
    offset = ts.metadata['windowing']['window_bleft']
    focal_left = ts.metadata['windowing']['window_left'] - offset
    focal_right = ts.metadata['windowing']['window_right'] - offset
    grid = np.linspace(focal_left, focal_right-1, grid_size) # -1 bc the right is not inclusive
    handle = open(output_path, "w")
    for pos in grid:
        handle.write(f"{int(pos)}\n")
    handle.close()
    return grid+offset


# ----------- Helper functions for summary statistics -----------

def compute_stats(ts, sample_sets, num_subwins=1):
    """
    Calculate summary statistics for a window

    TODO: offload to scikit-allel, add more stats?
    TODO: it'd be useful to have a grid across the focal window,
      for visualization. Then, the merge_stats rule could just
      extract the middle point of the grid (value where the sweep is).
      easiest to do this w/ scikit-allel, using overlapping windows
    """
    offset = ts.metadata['windowing']['window_bleft']
    bleft = ts.metadata['windowing']['window_bleft']-offset
    left = ts.metadata['windowing']['window_left']-offset
    right = ts.metadata['windowing']['window_right']-offset
    bright = ts.metadata['windowing']['window_bright']-offset
    in_between_wins = np.linspace(left, right, num_subwins+1)
    windows_bpoints = np.concatenate(([bleft],in_between_wins,[bright]))
    # removing duplicates if bleft/left or bright/right are the same
    windows_bpoints = np.unique(windows_bpoints)
    # something wonky happening with float conversion
    windows_bpoints[-1] = ts.sequence_length
    div = ts.diversity(sample_sets=sample_sets, windows=windows_bpoints, mode="site")
    windows = np.column_stack((windows_bpoints[:-1], windows_bpoints[1:]))
    assert windows.shape[0] == div.shape[0]
    # removing the buffer window
    if bleft != left:
        div = div[1:,]
        windows = windows[1:,]
    if bright != right:
        div = div[:-1,]
        windows = windows[:-1,]
    windows = windows + offset # returning windows to original scale
    # always include the last bpoints from the windows array
    return div, windows


# ------------ Wrapper to aggregate results --------------
def _print_stat_line(statname, stats, windows, target_pops, meta_str, fh):
    assert stats.shape == (windows.shape[0],len(target_pops))
    for i in range(windows.shape[0]):
        for j in range(len(target_pops)):
            print(f'{meta_str}\t{target_pops[j]}\t{windows[i,0]:.0f}\t{windows[i,1]:.0f}\t{statname}\t{stats[i,j]:.4e}', file=fh, flush=True)

def dump_results(input, output, params_dict, target_pops, num_subwins=1):
    """
    Dump sweep various test statistics to file
    """
    model = species.get_demographic_model(demo_model["id"])
    samples = demo_model["samples"]

    ts = tskit.load(input[0])
    all_pops_in_ts = [pop.metadata['name'] for pop in ts.populations()]
    if target_pops is None:
        target_pops = pop_list
    
    # Printing the header
    fh = open(output[0], 'w')
    meta = ["model", "demo", "seed", "chrom", "sim_left", "sim_right", "annot", "dfe", "coeff", "tmult"]
    header = "\t".join(meta + ['pop','stat_left', 'stat_right', 'stat_name', 'stat_val'])
    meta_str = "\t".join([params_dict[m] for m in meta])
    print(f'{header}', file=fh, flush=True)
    
    # Outputting VCF
    fh_vcf = open(output[1], 'w')
    sample_sets = [ts.samples(population=all_pops_in_ts.index(pop)) for pop in target_pops]
    tss = ts.simplify(np.array(sample_sets).flatten())
    # Removing the buffer region
    w_dict = ts.metadata["windowing"]
    del_intervals = []
    if w_dict["window_bleft"] != w_dict["window_left"]:
        del_intervals.append([0,w_dict["window_left"]-w_dict["window_bleft"]])
    if w_dict["window_right"] != w_dict["window_bright"]:
        del_intervals.append([w_dict["window_right"]-w_dict["window_bleft"], ts.sequence_length])
    if len(del_intervals) > 0:
        tss = tss.delete_intervals(del_intervals)
        tss = tss.trim()
    # because we are shifting from 0-based to 1-based, I need to remove any sites that may have happened at the last position
    sites_at_last = np.where(np.round(tss.sites_position)==config["focal_size"])[0]
    assert sites_at_last.shape[0] < 4 # realistically we shouldn't get more than two or three hits there
    tss = tss.delete_sites(sites_at_last)
    tss.write_vcf(fh_vcf, position_transform = lambda x: 1 + np.round(x))
    fh_vcf.close()
    # write seqlen of shortened ts
    with open(output[2], 'w') as f:
        f.write(str(int(tss.sequence_length)))
    # write anc file
    seq = list("A" * int(tss.sequence_length))
    with open(output[3], "w") as f:
        f.write(">1\n")
        for v in tss.variants():
            seq[int(v.site.position)] = v.alleles[0]
        f.write(''.join(seq) + "\n")
    # outputting diploshic samples to pop
    write_diploshic_sampleToPopFile(ts, target_pops, output[4])
    # Computing tsstats diversity
    divs, windows = compute_stats(ts, sample_sets, num_subwins=num_subwins)
    assert divs.shape == (num_subwins,len(target_pops))
    assert windows.shape == (num_subwins,2)
    _print_stat_line("div", divs, windows, target_pops, meta_str, fh)

    clrs, windows = compute_clr(input[0], recmap, model, samples, sample_sets)
    _print_stat_line("clr", clrs, windows, target_pops, meta_str, fh)

    fh.close()


#######################
# Import diploshic rules
#
##########################

module diploshic_workflow:
    snakefile:
        "diploshic.snake"
    config:
        "diploshic_params.yaml"


use rule * from diploshic_workflow as diploshic_*


#### diploshic helpers ##########

def write_diploshic_sampleToPopFile(ts, target_pops, filename):
    all_pops_in_ts = [pop.metadata['name'] for pop in ts.populations()]
    samp_dict = {pop: ts.samples(population=all_pops_in_ts.index(pop)) for pop in target_pops}
    with open(filename, "w") as f:
        count = 0
        for pop, samps in samp_dict.items():
            ind_samps = ts.nodes_individual[samps]
            assert np.all(np.diff(ind_samps)[::2]==0)
            for samp in ind_samps[::2]:
                # if there is only one target pop, then the other samples will be simplified out, so we need to start from 0
                if len(target_pops) == 1:
                    samp = count
                f.write(f"tsk_{samp}\t{pop}\n")
                count += 1
    f.close()


# ###############################################################################
# GENERAL RULES
# ###############################################################################

boundary_outs = [output_dir +f"/simulated_data/sweeps/boundary_sims/sim_{seed}_{config['region_size']}.trees"
    for seed in seed2_array
]

neutral_outs_prefix = [output_dir + f"/simulated_data/sweeps/neutral/{demo_model['id']}/{seed}/sim_{chrom}_{left:.0f}_{right:.0f}"
    for left, right in windows
    for seed in seed_array
]

bgs_outs_prefix = [output_dir + f"/simulated_data/sweeps/bgs/{demo_model['id']}/{annot}/{dfe}/{seed}/sim_{chrom}_{left:.0f}_{right:.0f}"
    for left, right in windows
    for seed in seed_array
    for dfe in dfe_list
    for annot in annotation_list
]

sw_outs_prefix = [output_dir +  f"/simulated_data/sweeps/sweep/{demo_model['id']}/{popu}/{annot}/{dfe}/{coeff}/{tmult}/{seed}/sim_{chrom}_{left:.0f}_{right:.0f}"
    for left, right in windows
    for seed in seed_array
    for dfe in dfe_list
    for annot in annotation_list
    for coeff in sel_coeffs
    for popu in pop_list
    for tmult in time_mults
] + [output_dir +  f"/simulated_data/sweeps/sweep/{demo_model['id']}/{popu}/NA/NA/{coeff}/{tmult}/{seed}/sim_{chrom}_{left:.0f}_{right:.0f}"
    for left, right in windows
    for seed in seed_array
    for coeff in sel_coeffs
    for popu in pop_list
    for tmult in time_mults
]
sw_outs_prefix_pop = [output_dir +  f"/simulated_data/sweeps/sweep/{demo_model['id']}/{popu}/{annot}/{dfe}/{coeff}/{tmult}/{seed}/sim_{chrom}_{left:.0f}_{right:.0f}_{popu}"
    for left, right in windows
    for seed in seed_array
    for dfe in dfe_list
    for annot in annotation_list
    for coeff in sel_coeffs
    for popu in pop_list
    for tmult in time_mults
] + [output_dir +  f"/simulated_data/sweeps/sweep/{demo_model['id']}/{popu}/NA/NA/{coeff}/{tmult}/{seed}/sim_{chrom}_{left:.0f}_{right:.0f}_{popu}"
    for left, right in windows
    for seed in seed_array
    for coeff in sel_coeffs
    for popu in pop_list
    for tmult in time_mults
]

trees_outs = [file_prefix+".trees" for file_prefix in neutral_outs_prefix + bgs_outs_prefix + sw_outs_prefix]
stats_outs = [file_prefix+".stats.tsv" for file_prefix in neutral_outs_prefix + bgs_outs_prefix + sw_outs_prefix]
vcf_outs = [file_prefix+".vcf" for file_prefix in neutral_outs_prefix + bgs_outs_prefix + sw_outs_prefix]
annot_outs = [output_dir + f'/simulated_data/sweeps/annot_overlap_{annot}_{chrom}_{config["num_windows"]}.tsv' for annot in annotation_list]
anc_outs = [file_prefix+".diploshic.ancFile" for file_prefix in neutral_outs_prefix + bgs_outs_prefix + sw_outs_prefix]
fv_outs = [file_prefix+f"_{popu}.diploshic.fv" for file_prefix in neutral_outs_prefix + bgs_outs_prefix for popu in pop_list] + [file_prefix+".diploshic.fv" for file_prefix in sw_outs_prefix_pop]
pred_outs = [file_prefix+f"_{popu}.diploshic.preds" for file_prefix in neutral_outs_prefix + bgs_outs_prefix for popu in pop_list] + [file_prefix+".diploshic.preds" for file_prefix in sw_outs_prefix_pop]
shic_outs1 = [file_prefix+f"_{popu}.stats.tsv.shic" for file_prefix in neutral_outs_prefix for popu in pop_list]
shic_outs2 = [file_prefix+f"_{popu}.stats.tsv.shic" for file_prefix in bgs_outs_prefix for popu in pop_list]
shic_outs3 = [file_prefix+".stats.tsv.shic" for file_prefix in sw_outs_prefix_pop]

rule all:
    input:
        rules.diploshic_all.input, 
        boundary_outs + trees_outs + stats_outs + vcf_outs + [output_dir + f'/simulated_data/sweeps/all_sims.stats.tsv', output_dir+f'/simulated_data/sweeps/rec_map_{chrom}_{config["num_windows"]}.tsv'] + annot_outs +anc_outs + fv_outs + pred_outs + shic_outs1 + shic_outs2 + shic_outs3
    default_target: True

rule write_rec:
    input:
    output: output_dir + f'/simulated_data/sweeps/rec_map_{chrom}_{config["num_windows"]}.tsv'
    run:
        write_rec_rates(output[0], recmap, windows)

rule write_annot:
    input:
    output: output_dir + f'/simulated_data/sweeps/annot_overlap_{{annot}}_{{chrom}}_{config["num_windows"]}.tsv'
    run:
        print(windows)
        write_annot_density(output[0], wildcards.chrom, wildcards.annot, windows)

rule boundary_sims:
    input:
    output:
        output_dir + "/simulated_data/sweeps/boundary_sims/sim_{seed}_{region_size}.trees"
    resources: time=60, mem_mb=6000
    run:
        model = species.get_demographic_model(demo_model["id"])
        mut_rate = model.mutation_rate 
        # Use new sample specification, give selected pop and sample size as config arguments
        samples = demo_model["samples"]
        dfe = species.get_dfe(dfe_list[0])
        # reducing number of del muts bc it's too much!
        dfe.proportions = [1-(0.7*0.25), 0.7*0.25]
        clen = int(wildcards.region_size)
        contig = species.get_contig(length=clen)
        contig.add_dfe(intervals=[[0, clen]], DFE=dfe)# applying DFE over some equally spaced 
        contig.mutation_rate = mut_rate
        engine = stdpopsim.get_engine("slim")
        ts = engine.simulate(
            model,
            contig,
            samples,
            slim_scaling_factor=config["slim_scaling_factor"],
            slim_burn_in=config["slim_burn_in"],
            seed=int(wildcards.seed),
        )
        ts.dump(output[0])


rule neutral:
    input:
    output:
        output_dir + f"/simulated_data/sweeps/neutral/{demo_model['id']}/{{seed}}/sim_{chrom}_{{left}}_{{right}}.trees", 
    resources: time=30, mem_mb=7000
    run:
        model = species.get_demographic_model(demo_model["id"])
        mutation_rate = model.mutation_rate
        extended_events = None
        # Use new sample specification, give selected pop and sample size as config arguments
        samples = demo_model["samples"]

        # Extract contig spanning window of chromosome
        window_left = int(wildcards.left)
        window_right = int(wildcards.right)
        contig = species.get_contig(chrom, genetic_map=genetic_map_id, left=window_left, right=window_right)

        contig.mutation_rate = mutation_rate
        engine = stdpopsim.get_engine("slim")
        ts = engine.simulate(
            model,
            contig,
            samples,
            extended_events=extended_events,
            slim_scaling_factor=config["slim_scaling_factor"],
            slim_burn_in=config["slim_burn_in"],
            seed=int(wildcards.seed),
        )
        ts = add_window_metadata(ts, window_left, window_right, window_left, window_right)
        ts.dump(output[0])


rule bgs:
    input:
    output:
        output_dir + f"/simulated_data/sweeps/bgs/{demo_model['id']}/{{annot}}/{{dfe}}/{{seed}}/sim_{chrom}_{{left}}_{{right}}.trees", 
    resources: time=30, mem_mb=8000
    run:
        model = species.get_demographic_model(demo_model["id"])
        mutation_rate = model.mutation_rate
        extended_events = None
        # Use new sample specification, give selected pop and sample size as config arguments
        samples = demo_model["samples"]

        # Extract contig spanning window of chromosome
        window_left = int(wildcards.left)
        window_right = int(wildcards.right)
        b_window_left, b_window_right = make_buffer(recmap, config["buffer_cM"], window_left, window_right)
        contig = species.get_contig(chrom, genetic_map=genetic_map_id, left=b_window_left, right=b_window_right)

        dfe = species.get_dfe(wildcards.dfe)

        ## Adding annotation only seletion on exon region
        annot = species.get_annotations(wildcards.annot)
        annot_intervals = annot.get_chromosome_annotations(chrom)
        contig.add_dfe(intervals=annot_intervals, DFE=dfe)
        contig.mutation_rate = mutation_rate
        engine = stdpopsim.get_engine("slim")
        ts = engine.simulate(
            model,
            contig,
            samples,
            extended_events=extended_events,
            slim_scaling_factor=config["slim_scaling_factor"],
            slim_burn_in=config["slim_burn_in"],
            seed=int(wildcards.seed),
        )
        ts = add_window_metadata(ts, window_left, window_right, b_window_left, b_window_right)
        ts.dump(output[0])


rule sweep:
    input:
    output:
        output_dir + f"/simulated_data/sweeps/sweep/{demo_model['id']}/{{popu}}/{{annot}}/{{dfe}}/{{coeff}}/{{tmult}}/{{seed}}/sim_{{chrom}}_{{left}}_{{right}}.trees", 
    resources: time=30, mem_mb=12000
    run:
        model = species.get_demographic_model(demo_model["id"])
        mutation_rate = model.mutation_rate
        extended_events = None
        # Use new sample specification, give selected pop and sample size as config arguments
        samples = demo_model["samples"]

        # Extract contig spanning window of chromosome
        window_left = int(wildcards.left)
        window_right = int(wildcards.right)
        b_window_left, b_window_right = make_buffer(recmap, config["buffer_cM"], window_left, window_right)
        contig = species.get_contig(chrom, genetic_map=genetic_map_id, left=b_window_left, right=b_window_right)
        if wildcards.dfe != "NA":
            dfe = species.get_dfe(wildcards.dfe)

            ## Adding annotation only seletion on exon region
            annot = species.get_annotations(wildcards.annot)
            annot_intervals = annot.get_chromosome_annotations(chrom)
            contig.add_dfe(intervals=annot_intervals, DFE=dfe)

        # sweep parameters
        coord = ((window_left+window_right)//2) # the API does the right thing even if it is chunked
        coeff = float(wildcards.coeff)
        # Defining how long ago beneficial mutations arise
        # First getting 2N from the model and pop
        debug = model.model.debug()
        coal_T = debug.mean_coalescence_time({wildcards.popu: 2})
        gamma = coal_T* coeff #2Ns
        # TODO: check this is not off by a factor of two
        T_f = 4 * (np.log(gamma) + 0.5772 - (1/gamma)) / coeff # this is from Charlesworth 2020/Pennings and Hermisson 2007
        time = np.floor(float(wildcards.tmult)*T_f) # I dilated the time a bit because of variance (don't want to lose too many sims due to this -- note we are conditioning on sweep_min_freq)
        print(coal_T, gamma, T_f, time)
        contig.add_single_site("sweep", coordinate=coord)
        extended_events = stdpopsim.ext.selective_sweep(
            "sweep",
            population=wildcards.popu, #selected pop
            selection_coeff=coeff,
            mutation_generation_ago=time,
            min_freq_at_end=config["sweep_min_freq"]
        )
        
        contig.mutation_rate = mutation_rate
        engine = stdpopsim.get_engine("slim")
        ts = engine.simulate(
            model,
            contig,
            samples,
            extended_events=extended_events,
            slim_scaling_factor=config["slim_scaling_factor"],
            slim_burn_in=config["slim_burn_in"],
            seed=int(wildcards.seed),
        )
        ts = add_window_metadata(ts, window_left, window_right, b_window_left, b_window_right, extra=f"sweep_time={time};")
        ts.dump(output[0])

def _get_params_dict_from_wildcards(wildcards):
    target_pops = None
    params_list = wildcards.middle.split("/")
    model = params_list[0]
    demo = params_list[1]
    seed = params_list[-1]
    params_dict = {
                    "model":model,
                    "demo":demo,
                    "seed":seed,
                    "annot":"NA",
                    "dfe":"NA",
                    "coeff":"NA",
                    "tmult":"NA",
                    "chrom": wildcards.chrom,
                    "sim_left": wildcards.left,
                    "sim_right": wildcards.right
                }
    assert model in ["neutral", "bgs", "sweep"]
    if model == "bgs":
        params_dict["annot"] = params_list[2]
        params_dict["dfe"] = params_list[3]
    if model == "sweep":
        target_pops = [params_list[2]]
        params_dict["annot"] = params_list[3]
        params_dict["dfe"] = params_list[4]
        params_dict["coeff"] = params_list[5]
        params_dict["tmult"] = params_list[6]
    return params_dict, target_pops

rule get_stats:
    input: output_dir + '/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.trees'
    output:
        output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.stats.tsv", 
        output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.vcf",
        output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.seqlen",
        output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.diploshic.ancFile",
        output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.diploshic.samples"

    resources: time=30, mem_mb=4000
    run:
        params_dict, target_pops = _get_params_dict_from_wildcards(wildcards)
        dump_results(input, output, params_dict, target_pops, config["num_subwins"])


rule diploshic_fvs:
    input: 
        output_dir + '/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.seqlen',
        output_dir + '/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.vcf',
        output_dir + '/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.diploshic.ancFile',
        output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.diploshic.samples"

    output:
        output_dir + '/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}_{popu}.diploshic.fv'
    resources: time=40, mem_mb=5000
    run:
        with open(input[0],'r') as f:
            seq_len = f.read().strip()
        cmd = f"diploSHIC fvecVcf haploid {input[1]} 1 {int(seq_len)} {output[0]} --targetPop {wildcards.popu} --sampleToPopFileName {input[3]} --winSize 1100000 --ancFileName {input[2]}"
        shell(cmd)    


rule diploshic_pred:
    input:
        rules.diploshic_fvs.output,
        rules.diploshic_train_classifier.output
    output:
        output_dir + '/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}_{popu}.diploshic.preds'
    resources: time=30, mem_mb=3000
    run:
        cmd = f"export CUDA_VISIBLE_DEVICES=\"\" && diploSHIC predict trained_model.json trained_model.weights.h5 {input[0]} {output[0]}" 
        shell(cmd)   


rule add_diploshic_preds_to_stats:
    input: output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.trees",
            output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}_{popu}.diploshic.preds"
    output: output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}_{popu}.stats.tsv.shic"
    resources: time=30, mem_mb=6000
    run:
        params_dict, target_pops = _get_params_dict_from_wildcards(wildcards)
        target_pops = [wildcards.popu]
        meta = ["model", "demo", "seed", "chrom", "sim_left", "sim_right", "annot", "dfe", "coeff", "tmult"]
        meta_str = "\t".join([params_dict[m] for m in meta])
        
        ts = tskit.load(input[0])

        df = pd.read_csv(input[1], sep="\t")
        df.classifiedWinStart = df.classifiedWinStart - 1 + ts.metadata['windowing']['window_left']
        df.classifiedWinEnd = df.classifiedWinEnd + ts.metadata['windowing']['window_left']
        assert np.all(df.classifiedWinStart <= ts.metadata['windowing']['window_right'])
        assert np.all(df.classifiedWinEnd <= ts.metadata['windowing']['window_right'])
        assert np.all(df.classifiedWinStart > ts.metadata['windowing']['window_left']+1)
        windows = df.iloc[:,[1,2]].to_numpy().astype(np.int32)
        stats = (df.iloc[:,8].to_numpy() + df.iloc[:,9].to_numpy()).astype(np.float32) # prob soft+hard
        stats = np.expand_dims(stats, axis = 1) # adding another dim going from (n_wins,) to (n_wins,1)
        fh = open(output[0], 'w')
        _print_stat_line("diploshic", stats, windows, target_pops, meta_str, fh)
        fh.close()


rule merge_stats:
    input: stats_outs
    output:
        output_dir + f'/simulated_data/sweeps/all_sims.tmp.stats.tsv'
    resources: time=1500, mem_mb=350000, disk_mb=350000
    run:
        #print(input, flush=True)
        #import pdb; pdb.set_trace()
        df = pd.concat(map(lambda x: pd.read_csv(x, sep="\t"), input))
        df.to_csv(output[0], sep="\t", index=False)
rule merge_stats_shic1:
    input: shic_outs1
    output:
        output_dir + f'/simulated_data/sweeps/all_sims1.shic.stats.tsv'
    resources: time=3000, mem_mb=350000, disk_mb=350000
    run:
        #print(input, flush=True)
        #import pdb; pdb.set_trace()
        df = pd.concat(map(lambda x: pd.read_csv(x, sep="\t", header=None), input))
        df.to_csv(output[0], sep="\t", index=False, header=None)
rule merge_stats_shic2:
    input: shic_outs2
    output:
        output_dir + f'/simulated_data/sweeps/all_sims2.shic.stats.tsv'
    resources: time=3000, mem_mb=350000, disk_mb=350000
    run:
        #print(input, flush=True)
        #import pdb; pdb.set_trace()
        df = pd.concat(map(lambda x: pd.read_csv(x, sep="\t", header=None), input))
        df.to_csv(output[0], sep="\t", index=False, header=None)
rule merge_stats_shic3:
    input: shic_outs3
    output:
        output_dir + f'/simulated_data/sweeps/all_sims3.shic.stats.tsv'
    resources: time=3000, mem_mb=350000, disk_mb=350000
    run:
        #print(input, flush=True)
        #import pdb; pdb.set_trace()
        df = pd.concat(map(lambda x: pd.read_csv(x, sep="\t", header=None), input))
        df.to_csv(output[0], sep="\t", index=False, header=None)

rule merge_all_stats:
    input: [output_dir + f'/simulated_data/sweeps/all_sims.tmp.stats.tsv', output_dir + f'/simulated_data/sweeps/all_sims3.shic.stats.tsv', output_dir + f'/simulated_data/sweeps/all_sims2.shic.stats.tsv', output_dir + f'/simulated_data/sweeps/all_sims1.shic.stats.tsv']
    output: output_dir + f'/simulated_data/sweeps/all_sims.stats.tsv'
    resources: time=3000, mem_mb=350000, disk_mb=350000
    shell:
       "cat {input} > {output}"

"""
rule get_neut_stats:
    input:
        output_dir + f"/simulated_data/sweeps/neutral/{{seed}}/sim_{chrom}_{{left}}_{{right}}.trees", 
    output:
        output_dir + f"/simulated_data/sweeps/neutral/{{seed}}/sim_{chrom}_{{left}}_{{right}}.stats.tsv", 
    resources: time=3000, mem_mb=2000
    run:
        dump_results(input, output, wildcards, model="neutral")
rule get_bgs_stats:
    input:
        output_dir + f"/simulated_data/sweeps/bgs/{{annot}}/{{dfe}}/{{seed}}/sim_{chrom}_{{left}}_{{right}}.trees", 
    output:
        output_dir + f"/simulated_data/sweeps/bgs/{{annot}}/{{dfe}}/{{seed}}/sim_{chrom}_{{left}}_{{right}}.stats.tsv", 
    resources: time=3000, mem_mb=2000
    run:
        dump_results(input, output)
rule get_sw_stats:
    input:
        output_dir + f"/simulated_data/sweeps/sweep/{{annot}}/{{dfe}}/{{coeff}}/{{time}}/{{seed}}/sim_{chrom}_{{left}}_{{right}}.trees", 
    output:
        output_dir + f"/simulated_data/sweeps/sweep/{{annot}}/{{dfe}}/{{coeff}}/{{time}}/{{seed}}/sim_{chrom}_{{left}}_{{right}}.stats.tsv", 
    resources: time=3000, mem_mb=2000
    run:
        dump_results(input, output)
"""