workflows/n_t.snake

"""
Snakefile for running N_t analysis on stdpopsim.

Simply running `snakemake` will run all analysis 
defined by the arguments above.

Currently, for each rep,
This will run stairway plot, smc++, and msmc
on the data resulting from simulations
on all chromosomes included in chrm_list
for estimates of N_t (Ne through time).
"""

import pathlib
import sys
import os
import numpy as np
import stdpopsim
import stairway
import smc
import msmc
import gone
import plots
import masks

# TODO: add support for running some but not all inference methods
# NOTE: see other todo specific to rule below
# ###############################################################################
# KNOBS - 
# ###############################################################################

# A seed to replicate results

configfile: "workflows/config/snakemake/tiny_config.yaml"

np.random.seed(config["seed"])
output_dir = os.path.abspath(config["output_dir"])
# The analysis species
species = stdpopsim.get_species(config["species"])

# This is the number of samples to simulate for within each population
# for each replicate

# Here is a list of sample sizes to run msmc on. 
# Each element counts as its own analysis
# so there will be "replicates" runs for each size
num_sampled_genomes_msmc =  config["num_sampled_genomes_msmc"]

# The number of msmc Baumwelch(?) iterations to run,
# typically 20
num_msmc_iterations = config["num_msmc_iterations"]


# The number of replicates of each analysis you would like to run
# For now leaving it a 1 just to get results quickly
replicates = config["replicates"]


# The genetic map you would like to use.
# if value None is given default_recombination_rates are
# used with a flat map
genetic_map_id = config["genetic_map"]

# The DFE id used for selection analyses
dfe_id = config["dfe_list"][0] # need to generalize to more than one...


# The names of all chromosomes to simulate, separated by commas
# Use "all" to simulate all chromosomes for the genome
chrm_list = [chrom.id for chrom in species.genome.chromosomes]
if "chrY" in chrm_list:
    chrm_list.remove("chrY")
if(config["chrm_list"] != "all"):
    chrm_list = [chr for chr in config["chrm_list"].split(",")]

# The specific demographic model you would like to run
demo_model_array = config["demo_models"]
demo_model_ids = [x["id"] for x in demo_model_array]
demo_sample_size_dict = {}
for x in demo_model_array:
    if x["id"] == "Constant":
        model = stdpopsim.PiecewiseConstantSize(species.population_size)
    else:
        model = species.get_demographic_model(x["id"])
    demo_sample_size_dict[x["id"]] = {f"{model.populations[i].name}": m for i, m in enumerate(x["num_samples_per_population"])}

# Select DFE model from catalog  
dfe_list = config["dfe_list"]   
annotation_list = config["annotation_list"]

methods = config["methods"]
def pop_expand(output_dir, method, filename="temp.txt"):
    infiles = []
    for demog in demo_model_ids:
        for dfe, annot in zip(dfe_list, annotation_list):
            for seeds in seed_array:
                for chrms in chrm_list:
                    for pops in demo_sample_size_dict[demog].keys():
                        infiles.append(output_dir + f"/inference/{demog}/{method}/{dfe}/{annot}/{seeds}/{pops}/{filename}")
    return infiles

# ###############################################################################
# GENERAL RULES & GLOBALS
# ###############################################################################

slim_scaling_factor = config["slim_scaling_factor"]
seed_array = np.random.random_integers(1,2**31,replicates)
genetic_map_downloaded_flag= ".genetic_map_downloaded"
msmc_exec = config["msmc_exec"]
stairwayplot_code = config["stairwayplot_code"]
gone_code = config["gone_code"]
try:
    mask_file = config["mask_file"]
except KeyError:
    mask_file = None


rule all:
    input:
        #rules.simulation_all.output,
        expand(output_dir + "/plots/{demog}/estimated_Ne_t_final.csv", demog=demo_model_ids),
        expand(output_dir + "/plots/{demog}/estimated_Ne_t_final.pdf", demog=demo_model_ids),
        expand(output_dir + "/plots/{demog}/coal_estimated_Ne_t.csv", demog=demo_model_ids),
        expand(output_dir + "/plots/{demog}/{method}/{method}_estimated_Ne_t.csv",
            demog=demo_model_ids, method=methods),
        expand(output_dir + "/plots/{demog}/{method}/{method}_estimated_Ne_t.pdf",
            demog=demo_model_ids, method=methods),


rule download_genetic_map:
    input:
    output:
        genetic_map_downloaded_flag,
    message:
        "Downloading default genetic map"
    run:
        # We need to have this here to avoid several threads trying to download the
        # the genetic map into the cache at the same time.
        if genetic_map_id is not None:
            genetic_map = species.get_genetic_map(genetic_map_id)
            if not genetic_map.is_cached():
                genetic_map.download()
            with open(output[0], "w") as f:
                print("File to indicate genetic map has been downloaded", file=f)


################################################################################
# UTILS HELPERS ETC
###############################################################################

def generation_time_helper(demog, species):
    if demog == "Constant":
        generation_time = species.generation_time
    else:
        generation_time = species.get_demographic_model(demog).generation_time
    return generation_time

def mutation_rate_helper(demog, species):
    if demog == "Constant":
        mutation_rate = species.genome.mean_mutation_rate
    else:
        mutation_rate = species.get_demographic_model(demog).mutation_rate
        if mutation_rate is None:
                mutation_rate = species.genome.mean_mutation_rate
    return mutation_rate

rule write_bdd:
    output:
        output_dir + "/plots/{demog}/coal_estimated_Ne_t.csv",
    run:
        steps = None
        if wildcards.demog == "Constant":
            max_time = species.population_size
            # NOTE: this keep throwing an error and I cant find it in the docs
            #max_time = species.GenericConstantSize().default_population_size
            max_time *= 2  # 4?
            steps = np.linspace(1, max_time, max_time + 1)
            model = stdpopsim.PiecewiseConstantSize(species.population_size)
        else:
            model = species.get_demographic_model(wildcards.demog)
        generation_time = generation_time_helper(wildcards.demog, species)
        plots.gather_coal_rate(
            output,
            model,
            demo_sample_size_dict[wildcards.demog],
            generation_time,
            steps,
        )


# ###############################################################################
# STAIRWAYPLOT
# ###############################################################################
stairwayplot_code = config["stairwayplot_code"]
sp_mask = config["stairway_annot_mask"]


rule sp_download:
    output:
        directory("ext/stairwayplot"),
    message:
        "downloading stairwayplot"
    threads: 1
    shell:
        """
        cd ext/
        wget http://sesame.uoregon.edu/~adkern/stdpopsim/stairwayplot.tar.gz
        tar zxf stairwayplot.tar.gz
        rm -f stairwayplot.tar.gz
        cd ../
        """


rule run_stairwayplot:
    input:
        rules.sp_download.output,
        expand(output_dir + "/simulated_data/{{demog}}/{{dfes}}/{{annots}}/{{seeds}}/sim_{chrms}.trees",
            chrms=chrm_list)
    output:
        output_dir + "/inference/{demog}/stairwayplot/{dfes}/{annots}/{seeds}/{pops}/stairwayplot_estimated_Ne.txt"
    threads: 20
    resources:
        time = 3_000,
        mem_mb=120_000,
    run:
        inputs = expand(
            output_dir
            + "/simulated_data/{demog}/{dfes}/{annots}/{seeds}/sim_{chrms}.trees",
            demog=wildcards.demog,
            dfes=wildcards.dfes,
            annots=wildcards.annots,
            seeds=wildcards.seeds,
            chrms=chrm_list,
        )

        runner = stairway.StairwayPlotRunner(
            workdir=output_dir
            + f"/inference/{wildcards.demog}/stairwayplot/{wildcards.dfes}/{wildcards.annots}/{wildcards.seeds}/{wildcards.pops}/",
            stairway_dir=pathlib.Path.cwd() / "ext/stairwayplot",
        )

        chromIDs = [Path(file).stem.split('_')[1] for file in inputs]
        if wildcards.annots == "none" or sp_mask == "none":
            mask_intervals = masks.get_combined_masks(
                species.id,
                mask_file,
                chromIDs,
            )
        else:
            mask_intervals = masks.get_combined_masks(
                species.id,
                mask_file,
                chromIDs,
                chrom_annotation=wildcards.annots,
            )

        runner.ts_to_stairway(inputs, wildcards.pops, mask_intervals=mask_intervals, num_bootstraps=200)
        runner.run_theta_estimation(max_workers=threads, show_progress=True)
        runner.run_summary(
            output,
            mutation_rate=mutation_rate_helper(wildcards.demog, species),
            generation_time=generation_time_helper(wildcards.demog, species),
        )


rule compound_stairwayplot:
    input:
        pop_expand(output_dir, "stairwayplot", "stairwayplot_estimated_Ne.txt")
    output:
        output_dir + "/plots/{demog}/stairwayplot/stairwayplot_estimated_Ne_t.csv",
    run:
        gen_time = generation_time_helper(wildcards.demog, species)
        plots.gather_inference_results(output_dir, wildcards.demog, output[0], "stairwayplot",
                                       mask_file, sp_mask, demo_sample_size_dict[wildcards.demog],
                                       slim_scaling_factor, gen_time)


rule plot_compound_stairway:
    input:
        rules.compound_stairwayplot.output,
    output:
        output_dir + "/plots/{demog}/stairwayplot/stairwayplot_estimated_Ne_t.pdf",
    run:
        plots.plot_compound_Ne_t(input[0], output[0])


# ###############################################################################
# MSMC2 https://github.com/stschiff/msmc2
# ###############################################################################
# TODO: ln 379, get wildcards.samps to work so that num_sampled_genomes_msmc are run in parallel
# sample size
num_sampled_genomes_msmc =  config["num_sampled_genomes_msmc"]
# The number of msmc Baumwelch iterations to run
num_msmc_iterations = config["num_msmc_iterations"]
msmc_exec = config["msmc_exec"]
msmc_mask = config["msmc_annot_mask"]


rule download_msmc:
    output:
        directory("ext/msmc2")
    message:
        "downloading msmc"
    threads: 1
    shell:
        """
        cd ext
	    git clone https://github.com/stschiff/msmc2.git
	    cat msmc2_makefile_stdpopsim_patch > msmc2/Makefile
	    cd msmc2
	    make
        cd ../../
        """

rule ts_to_multihep:
    input:
        output_dir + "/simulated_data/{demog}/{dfes}/{annots}/{seeds}/sim_{chrms}.trees"
    output:
        output_dir + "/inference/{demog}/msmc/{dfes}/{annots}/{seeds}/{pops}/{chrms}.trees.multihep.txt"
    resources:
        time=2_000,
    run:
        print(input[0], num_sampled_genomes_msmc, mask_file)
        if wildcards.annots == "none" or msmc_mask == "none":
            mask_intervals = masks.get_combined_masks(
                                            species.id,
                                            mask_file,
                                            wildcards.chrms,
                                            )
        else:
            mask_intervals = masks.get_combined_masks(
                                            species.id,
                                            mask_file,
                                            wildcards.chrms,
                                            chrom_annotation=wildcards.annots,
                                            )
        msmc.write_msmc_file(input[0], output[0], wildcards.pops, mask_intervals)


rule run_msmc:
    input:
        expand(output_dir + "/inference/{{demog}}/msmc/{{dfes}}/{{annots}}/{{seeds}}/{{pops}}/{chrms}.trees.multihep.txt",
                chrms=chrm_list),
        rules.download_msmc.output,
    output:
        #expand(output_dir + "/inference/{{demog}}/msmc/{{dfes}}/{{annots}}/{{seeds}}/{{pops}}/{samps}.trees.multihep.txt.final.txt",
        #            samps=num_sampled_genomes_msmc),
        output_dir + "/inference/{demog}/msmc/{dfes}/{annots}/{seeds}/{pops}/{samps}.trees.multihep.txt.final.txt"
    threads: 8
    resources:
        time = lambda wildcards, attempt: attempt * 1_000,
        mem_mb = lambda wildcards, attempt: attempt * 200_000
        
    run:
        inputs = expand(output_dir + "/inference/{demog}/msmc/{dfes}/{annots}/{seeds}/{pops}/{chrms}.trees.multihep.txt",
                    demog=wildcards.demog,
                    dfes=wildcards.dfes,
                    annots=wildcards.annots,
                    seeds=wildcards.seeds,
                    pops=wildcards.pops,
                    chrms=chrm_list,
                        )
        total_samples = demo_sample_size_dict[wildcards.demog][wildcards.pops]
        input_file_string = " ".join(inputs)
        output_file_string = output_dir + f"/inference/{wildcards.demog}/msmc/{wildcards.dfes}/{wildcards.annots}/{wildcards.seeds}/{wildcards.pops}/"
        msmc.run_msmc_estimate(input_file_string, output_file_string, wildcards.samps, msmc_exec, total_samples,
            iterations=num_msmc_iterations, ncores=threads)


rule convert_msmc:
    input:
        #rules.run_msmc.output,
        #output_dir + "/inference/{demog}/msmc/{dfes}/{annots}/{seeds}/{pops}/{samps}.trees.multihep.txt.final.txt"
        expand(output_dir + "/inference/{{demog}}/msmc/{{dfes}}/{{annots}}/{{seeds}}/{{pops}}/{samps}.trees.multihep.txt.final.txt",
                    samps=num_sampled_genomes_msmc),
    output:
        output_dir + "/inference/{demog}/msmc/{dfes}/{annots}/{seeds}/{pops}/msmc_estimated_Ne.txt"
    run:
        msmc.convert_msmc_output(input[0], output[0],
            mutation_rate=mutation_rate_helper(wildcards.demog, species),
            generation_time=generation_time_helper(wildcards.demog, species)
        )


rule compound_msmc:
    input:
        pop_expand(output_dir, "msmc", "msmc_estimated_Ne.txt")
    output:
        output_dir + "/plots/{demog}/msmc/msmc_estimated_Ne_t.csv"
    run:
        gen_time = generation_time_helper(wildcards.demog, species)
        plots.gather_inference_results(output_dir, wildcards.demog, output[0], "msmc",
                                       mask_file, msmc_mask, demo_sample_size_dict[wildcards.demog],
                                       slim_scaling_factor, gen_time)


rule plot_compound_msmc:
    input:
        rules.compound_msmc.output
    output:
        output_dir + "/plots/{demog}/msmc/msmc_estimated_Ne_t.pdf"
    run:
        plots.plot_compound_Ne_t(input[0], output[0])


# ###############################################################################
# GONe
# ###############################################################################
gone_code = config["gone_code"]
gone_mask = config["gone_annot_mask"]


rule gone_clone:
    output:
        directory("ext/GONE")
    message:
        "cloning GONE repo"
    threads: 1
    shell:
        """
        cd ext/
        git clone https://github.com/esrud/GONE.git
        cd ..
        """


rule gone_params:
    input:
        rules.gone_clone.output,
    output:
        ".params_edited"
    message:
        "specifying GONE params"
    threads: 1
    run:
        prms = {"gone_phase":config["gone_phase"],
                "gone_max_snps":config["gone_max_snps"],
                "gone_num_gens":config["gone_num_gens"],
                "gone_num_bins":config["gone_num_bins"]}
        gone.params(gone_code, prms)


rule gone_copy:
    input:
        rules.gone_params.output,
        rules.gone_clone.output,
    output:
        output_dir + "/inference/{demog}/gone/{dfes}/{annots}/{seeds}/{pops}/.scripts_copied"
    message:
        "copying GONE scripts into individual working directories"
    threads: 1
    run:
        print(output[0])
        outpath = "/".join(output[0].split("/")[:-1])
        gone.copy(gone_code, outpath, wildcards.seeds, threads)


rule gone_prep_inputs:
    input:
        rules.gone_copy.output,
        expand(output_dir + "/simulated_data/{{demog}}/{{dfes}}/{{annots}}/{{seeds}}/sim_{chrms}.trees",
            chrms=chrm_list)
    output:
        output_dir + "/inference/{demog}/gone/{dfes}/{annots}/{seeds}/{pops}/gone.ped",
        output_dir + "/inference/{demog}/gone/{dfes}/{annots}/{seeds}/{pops}/gone.map",
    threads: 1
    resources:
        mem_mb=36000
    run:
        inputs = expand(
            output_dir
            + "/simulated_data/{demog}/{dfes}/{annots}/{seeds}/sim_{chrms}.trees",
            demog=wildcards.demog,
            dfes=wildcards.dfes,
            annots=wildcards.annots,
            seeds=wildcards.seeds,
            chrms=chrm_list,
        )
        # handle no annotation case
        chromIDs = [Path(file).stem.split('_')[1] for file in inputs]
        if wildcards.annots == "none" or gone_mask == "none":
            mask_intervals = masks.get_combined_masks(
                                            species.id,
                                            mask_file,
                                            chromIDs,
                                            )
        else:
            mask_intervals = masks.get_combined_masks(
                                            species.id,
                                            mask_file,
                                            chromIDs,
                                            chrom_annotation=wildcards.annots,
                                            )
        genetic_map = None
        if genetic_map_id is not None:
            genetic_map = species.get_genetic_map(genetic_map_id)
            if not genetic_map.is_cached():
                genetic_map.download()

        gone.ts2plink(inputs, output[0], output[1], species, wildcards.pops, genetic_map, chromIDs, mask_intervals=mask_intervals)


rule gone_run:
    input:
        rules.gone_copy.output,
        rules.gone_prep_inputs.output,
    output:
        output_dir + "/inference/{demog}/gone/{dfes}/{annots}/{seeds}/{pops}/gone_estimated_Ne.txt",
    threads: 8
    resources: time=360
    shell:
        """
        cwd=$PWD
        cd {output_dir}/inference/{wildcards.demog}/gone/{wildcards.dfes}/{wildcards.annots}/{wildcards.seeds}/{wildcards.pops}
        bash script_GONE.sh gone
        cd $cwd
        """


rule compound_gone:
    input:
        pop_expand(output_dir, "gone", "gone_estimated_Ne.txt")
    output:
        output_dir + "/plots/{demog}/gone/gone_estimated_Ne_t.csv"
    run:
        gen_time = generation_time_helper(wildcards.demog, species)
        plots.gather_inference_results(output_dir, wildcards.demog, output[0], "gone",
                                       mask_file, gone_mask, demo_sample_size_dict[wildcards.demog],
                                       slim_scaling_factor, gen_time)


rule plot_compound_gone:
    input:
        rules.compound_gone.output
    output:
        output_dir + "/plots/{demog}/gone/gone_estimated_Ne_t.pdf"
    run:
        plots.plot_compound_Ne_t(input[0], output[0])


# ###############################################################################
# SMC++
# ###############################################################################
# TODO: how to label/record when using > 2 haps in composite lkhood calc ...
smcpp_mask = config["smcpp_annot_mask"]


rule clone_smcpp:
    output:
        "ext/smcpp/pyproject.toml"
    message: "Cloning SMC++"
    threads: 1
    shell:
        """
        cd ext/
        git clone https://github.com/popgenmethods/smcpp.git
        cat smc_setup_stdpopsim_patch > smcpp/setup.py
        cat smc_pyproject_stdpopsim_patch > smcpp/pyproject.toml
        cd smcpp/
        pip install .
        cd ..
        """


rule ts_to_smc:
    input:
        output_dir + "/simulated_data/{demog}/{dfes}/{annots}/{seeds}/sim_{chrms}.trees",
    output:
        output_dir + "/inference/{demog}/smcpp/{dfes}/{annots}/{seeds}/{pops}/sim_{chrms}.trees.smc.gz"
    run:
        # handle no annotation case
        if wildcards.annots == "none" or smcpp_mask == "none":
            mask_intervals = masks.get_combined_masks(
                                            species.id,
                                            mask_file,
                                            wildcards.chrms,
                                            )
        else:
            mask_intervals = masks.get_combined_masks(
                                            species.id,
                                            mask_file,
                                            wildcards.chrms,
                                            chrom_annotation=wildcards.annots,
                                            )
        smc.write_smcpp_file(input[0], output[0], wildcards.pops, mask_intervals=mask_intervals)


rule run_smcpp:
    input:
        expand(output_dir+ "/inference/{{demog}}/smcpp/{{dfes}}/{{annots}}/{{seeds}}/{{pops}}/sim_{chrms}.trees.smc.gz",
            chrms=chrm_list),
    output:
        output_dir + "/inference/{demog}/smcpp/{dfes}/{annots}/{seeds}/{pops}/model.final.json"
    threads: 20
    run:
        # need to cd into subdir because smc++ crashes otherwise
        cur = os.getcwd()
        os.chdir(f"{output_dir}/inference/{wildcards.demog}/smcpp/{wildcards.dfes}/{wildcards.annots}/{wildcards.seeds}/{wildcards.pops}")
        base = f"trees.smc.gz"
        mutation_rate=mutation_rate_helper(wildcards.demog, species)
        smc.run_smcpp_estimate(base, mutation_rate=mutation_rate, ncores=threads)
        # need to cd out of subdir for snakemake sanity
        os.chdir(cur)


rule smcpp_plot:
    input:
        rules.run_smcpp.output
    output:
        output_dir + "/inference/{demog}/smcpp/{dfes}/{annots}/{seeds}/{pops}/smcpp_estimated_Ne.csv"
    run:
        gen_time = generation_time_helper(wildcards.demog, species)
        smc.run_smcpp_plot(input[0], output[0], generation_time=gen_time)


rule compound_smcpp:
    input:
        #rules.smcpp_plot.output,
        pop_expand(output_dir, "smcpp", "smcpp_estimated_Ne.csv")
    output:
        output_dir + "/plots/{demog}/smcpp/smcpp_estimated_Ne_t.csv"
    run:
        gen_time = generation_time_helper(wildcards.demog, species)
        plots.gather_inference_results(output_dir, wildcards.demog, output[0], "smcpp",
                                       mask_file, smcpp_mask, demo_sample_size_dict[wildcards.demog],
                                       slim_scaling_factor, gen_time)


rule plot_compound_smcpp:
    input:
        rules.compound_smcpp.output
    output:
        output_dir + "/plots/{demog}/smcpp/smcpp_estimated_Ne_t.pdf"
    run:
        plots.plot_compound_Ne_t(input[0], output[0])


# ###############################################################################
#  Plotting results
# ###############################################################################


rule gather_inference:
    input:
        rules.compound_stairwayplot.output,
        rules.compound_msmc.output,
        rules.compound_smcpp.output,
        rules.compound_gone.output
    output:
        output_dir + "/plots/{demog}/estimated_Ne_t_final.csv",
    run:
        shell("echo 'method,population,nsamp,DFE,annotations,year,Ne,seed,chrm_mask,annot_mask,slim_scaling_factor' > {output[0]}.temp")
        for infile in input:
            shell("sed 1d {infile} >> {output[0]}.temp")
        shell("cut -d',' -f1-11 {output[0]}.temp > {output[0]}")
        shell("rm -f {output[0]}.temp")


rule all_plot:
    input:
        rules.gather_inference.output,
    output:
        output_dir + "/plots/{demog}/estimated_Ne_t_final.pdf",
    run:
        plots.plot_all_ne_estimates(input[0], output[0])


rule clean_temp:
    message:
        "removing temp inference files"
    run:
        stairway_temp = pop_expand(output_dir, "stairwayplot", "sfs*")
        stairway_temp = " ".join(stairway_temp)
        shell("rm -rf {stairway_temp}")
        gone_temp = pop_expand(output_dir, "gone", "PROGRAMMES") + pop_expand(output_dir, "gone", "TEMPORARY_FILES")
        gone_temp = " ".join(gone_temp)
        shell("rm -rf {gone_temp}")