Skip to content

Commit

Permalink
Merge pull request #46 from gymrek-lab/feat/ukb
Browse files Browse the repository at this point in the history
feat: finish running `platelet_count` pipeline on UKB
  • Loading branch information
aryarm authored Dec 5, 2024
2 parents 2bf6d5f + 3090781 commit 2bad3c3
Show file tree
Hide file tree
Showing 56 changed files with 3,172 additions and 704 deletions.
5 changes: 1 addition & 4 deletions .devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,7 @@
// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
"image": "mcr.microsoft.com/devcontainers/base:jammy",
"features": {
"ghcr.io/rocker-org/devcontainer-features/miniforge:1": {
"version": "latest",
"variant": "Mambaforge"
}
"ghcr.io/rocker-org/devcontainer-features/miniforge:2": {}
},

// Features to add to the dev container. More info: https://containers.dev/features.
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/conventional-prs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,6 @@ jobs:
statuses: write # for amannn/action-semantic-pull-request to mark status of analyzed PR
runs-on: ubuntu-latest
steps:
- uses: amannn/action-semantic-pull-request@v5.0.2
- uses: amannn/action-semantic-pull-request@v5
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
13 changes: 6 additions & 7 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,15 @@ jobs:
fail-fast: false
matrix:
include:
- { python: "3.8", os: "ubuntu-latest", session: "lint" }
- { python: "3.8", os: "ubuntu-latest", session: "tests" }
- { python: "3.9", os: "ubuntu-latest", session: "lint" }
- { python: "3.9", os: "ubuntu-latest", session: "tests" }
- { python: "3.10", os: "ubuntu-latest", session: "tests" }
- { python: "3.11", os: "ubuntu-latest", session: "tests" }
- { python: "3.12", os: "ubuntu-latest", session: "tests" }
- { python: "3.13", os: "ubuntu-latest", session: "tests" }
# - { python: "3.11", os: "windows-latest", session: "tests" }
# - { python: "3.9", os: "macos-latest", session: "tests" }
- { python: "3.8", os: "ubuntu-latest", session: "size" }
- { python: "3.9", os: "ubuntu-latest", session: "size" }

env:
NOXSESSION: ${{ matrix.session }}
Expand All @@ -29,11 +29,10 @@ jobs:
- name: Check out the repository
uses: actions/checkout@v4

- name: Setup Mambaforge
- name: Setup Miniforge
uses: conda-incubator/setup-miniconda@v3
with:
activate-environment: happler
miniforge-variant: Mambaforge
auto-activate-base: false
miniforge-version: latest
use-mamba: true
Expand All @@ -44,7 +43,7 @@ jobs:
shell: bash

- name: Cache Conda env
uses: actions/cache@v3
uses: actions/cache@v4
with:
path: ${{ env.CONDA }}/envs
key:
Expand Down Expand Up @@ -94,7 +93,7 @@ jobs:
uses: actions/checkout@v4

- name: Check for large files
uses: actionsdesk/lfs-warning@v3.2
uses: ppremk/lfs-warning@v3.3
with:
token: ${{ secrets.GITHUB_TOKEN }} # Optional
filesizelimit: 500000b
Expand Down
2 changes: 1 addition & 1 deletion .readthedocs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ version: 2
build:
os: "ubuntu-22.04"
tools:
python: "3.8"
python: "3.9"
jobs:
post_create_environment:
# Install poetry
Expand Down
4 changes: 4 additions & 0 deletions analysis/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
midway_manhattans-indepterminator
!midway_manhattans-indepterminator/LINKS
outs
HAPFILES
21 changes: 3 additions & 18 deletions analysis/config/config-geuvadis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,24 +42,6 @@ snp_panel: "data/geuvadis/geuvadis_ensemble_phasing.pgen"
# locus: 19:45401409-46401409 # center: 45901409 (APOe4)
locus: data/geuvadis/geuvadis_eqtl_genes.full.liftover.bed
modes:
str:
pos: 19:45903857 # STR_691361
snp:
pos: 19:45910672 # rs1046282
hap:
alleles: [rs36046716:G, rs1046282:G] # 45892145, 45910672
beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]
ld_range:
reps: 1
min_ld: 0
max_ld: 1
step: 0.1
min_af: 0.25
max_af: 0.75
# beta: [0.35]
beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]
alpha: [0.05]
random: false # whether to also produce random haplotypes
run:
pheno: data/geuvadis/phenos/{trait}.pheno
SVs: data/geuvadis/pangenie_hprc_hg38_all-samples_bi_SVs-missing_removed.pgen
Expand All @@ -79,6 +61,9 @@ min_maf: 0.1
# sample_size: [500, 1000, 1500, 2000, 2500]
# sample_size: 777

# A threshold on the pvalues of the haplotypes
out_thresh: 5e-6

# Whether to include the causal variant in the set of genotypes provided to the
# finemapping methods. Set this to true if you're interested in seeing how the
# methods perform when the causal variant is absent from the data.
Expand Down
92 changes: 92 additions & 0 deletions analysis/config/config-simulations.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# This is the Snakemake configuration file that specifies paths and
# and options for the pipeline. Anybody wishing to use
# the provided snakemake pipeline should first fill out this file with paths to
# their own data, as the Snakefile requires it.
# Every config option has reasonable defaults unless it is labeled as "required."
# All paths are relative to the directory that Snakemake is executed in.
# Note: this file is written in the YAML syntax (https://learnxinyminutes.com/docs/yaml/)


# Paths to a SNP-STR haplotype reference panel
# You can download this from http://gymreklab.com/2018/03/05/snpstr_imputation.html
# If the VCFs are per-chromosome, replace the contig name in the file name with "{chr}"
# The VCF(s) must be sorted and indexed (with a .tbi file in the same directory)
# required!
# snp_panel: "data/simulations/1000G/19_45401409-46401409.pgen"
# str_panel: "/tscc/projects/ps-gymreklab/jmargoli/ukbiobank/str_imputed/runs/first_pass/vcfs/annotated_strs/chr{chr}.vcf.gz"
snp_panel: "data/simulations/1000G.SNP-STR/19_45401409-46401409.SNPs.pgen"
str_panel: "data/simulations/1000G.SNP-STR/19_45401409-46401409.STRs.pgen"


# Path to a list of samples to exclude from the analysis
# There should be one sample ID per line
# exclude_samples: data/ukb_random_samples_exclude.tsv

# If SNPs are unphased, provide the path to a SHAPEIT4 map file like these:
# https://github.com/odelaneau/shapeit4/tree/master/maps
# The map file should use the same reference genome as the reference panel VCFs
# phase_map: data/genetic_maps/chr{chr}.b37.gmap.gz

# A "locus" is a string with a contig name, a colon, the start position, a dash, and
# the end position or a BED file with a ".bed" file ending
# There are different simulation modes that you can use:
# 1. "str" - a tandem repeat is a string with a contig name, a colon, and the start position
# 2. "snp" - a SNP follows the same format as "str"
# 3. "hap" - a haplotype
# 4. "ld_range" - creates random two-SNP haplotypes with a range of LD values between the alleles of each haplotype
# 5. "run" - execute happler on a locus without simulating anything
# The STR and SNP positions should be contained within the locus.
# The positions should be provided in the same coordinate system as the reference
# genome of the reference panel VCFs
# The contig should correspond with the contig name from the {chr} wildcard in the VCF
# required! and unless otherwise noted, all attributes of each mode are required
locus: 19:45401409-46401409 # center: 45901409 (APOe4)
modes:
str:
pos: 19:45903859
id: 19:45903859
reps: 10
beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]
snp:
pos: 19:45910672 # rs1046282
beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]
hap:
alleles: [rs36046716:G, rs1046282:G] # 45892145, 45910672
beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]
ld_range:
reps: 10
min_ld: 0
max_ld: 1
step: 0.1
min_af: 0.25
max_af: 0.75
# beta: [0.35]
beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]
alpha: [0.05]
random: false # whether to also produce random haplotypes
num_haps: [1, 2, 3, 4]
run:
pheno: data/geuvadis/phenos/{trait}.pheno
SVs: data/geuvadis/pangenie_hprc_hg38_all-samples_bi_SVs-missing_removed.pgen
# pheno_matrix: data/geuvadis/EUR_converted_expr_hg38.csv # optional
mode: str

# Covariates to use if they're needed
# Otherwise, they're assumed to be regressed out of the phenotypes
# Note: the finemapping methods won't be able to handle these covariates
# covar: data/geuvadis/5PCs_sex.covar

# Discard rare variants with a MAF below this number
# Defaults to 0 if not specified
min_maf: 0.05

# Sample sizes to use
# sample_size: [777, 800, 1600, 2400, 3200]
sample_size: 3200

# Whether to include the causal variant in the set of genotypes provided to the
# finemapping methods. Set this to true if you're interested in seeing how the
# methods perform when the causal variant is absent from the data.
# Defaults to false if not specified
# You can also provide a list of booleans, if you want to test both values
exclude_causal: [true, false]
2 changes: 1 addition & 1 deletion analysis/config/config-ukb.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ mode: run

# Discard rare variants with a MAF below this number
# Defaults to 0 if not specified
min_maf: 0.00005
min_maf: 0.005

# Whether to include the causal variant in the set of genotypes provided to the
# finemapping methods. Set this to true if you're interested in seeing how the
Expand Down
2 changes: 1 addition & 1 deletion analysis/config/config.yml
3 changes: 3 additions & 0 deletions analysis/run.bash
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
#SBATCH --time 0:30:00
#SBATCH --mem 2G
#SBATCH --output /dev/null
#SBATCH --signal=B:SIGUSR1@15

trap "slack job terminating early" SIGUSR1

# An example bash script demonstrating how to run the entire snakemake pipeline
# This script creates two separate log files in the output dir:
Expand Down
65 changes: 33 additions & 32 deletions analysis/workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@ module genotypes:
config: config

use rule * from genotypes as genotypes_*
config["gts_str_panel"] = rules.genotypes_subset_str.output.vcf
if mode == "str":
config["gts_str_panel"] = config["str_panel"]
if check_config("sample_size"):
sampsize = config["sample_size"]
config["gts_snp_panel"] = rules.genotypes_subset.output.pgen
Expand All @@ -59,6 +60,10 @@ else:
config["gts_snp_panel"] = rules.genotypes_subset.input.pgen
config["random"] = False

if mode == "ld_range" and check_config("num_haps", place=config["modes"][mode]):
if isinstance(config["modes"][mode]["num_haps"], int):
config["modes"][mode]["num_haps"] = [config["modes"][mode]["num_haps"],]

if mode != "run":
module simulate:
snakefile: "rules/simulate.smk"
Expand Down Expand Up @@ -90,23 +95,32 @@ if check_config("covar"):
if "pheno_matrix" in covar_config:
covar_file = rules.covariates_merge.output.covar

if mode in ("snp", "hap", "ld_range"):
if mode in ("str", "snp", "hap", "ld_range"):
happler_config = {
"pheno": rules.simulate_simphenotype.output.pheno,
"hap_file": rules.simulate_transform.input.hap,
"snp_panel": config["gts_snp_panel"],
"out": config["out"] + "/happler/"+mode+"/{beta}",
"out": config["out"] + "/happler/"+mode+"/beta_{beta}",
"random": None,
"covar": covar_file,
"min_maf": check_config("min_maf", 0),
"out_thresh": 1, # artificially set to 1 to avoid filtering
}
if mode in ("hap", "ld_range"):
happler_config["snps"] = []
happler_config["causal_gt"] = rules.simulate_transform.output
if mode == "hap":
happler_config["snps"] = config["modes"][mode]["alleles"]
happler_config["causal_gt"] = rules.simulate_transform.output
if mode == "ld_range":
happler_config["out"] = config["out"] + "/happler/"+mode+"/ld_{ld}/beta_{beta}/alpha_{alpha}"
elif mode == "ld_range":
happler_config["out"] = config["out"] + "/happler/"+mode+"/{num_haps}_haps/ld_{ld}/beta_{beta}/alpha_{alpha}"
elif mode in ("str", "snp"):
# also provide the SNP and STR IDs so that they can be used in the manhattan plot
happler_config["snps"] = (config["modes"][mode]["id"],)
# TODO: consider uncommenting this to compute LD with causal STR
# if mode == "str":
# happler_config["causal_gt"] = config["gts_str_panel"]
if check_config("reps", place=config["modes"][mode]):
happler_config["out"] = happler_config["out"] + "/rep_{rep}"
elif mode == "run":
happler_config = {
"pheno": config["modes"][mode]["pheno"],
Expand All @@ -116,11 +130,10 @@ elif mode == "run":
"random": None,
"covar": covar_file,
"min_maf": check_config("min_maf", 0),
"out_thresh": check_config("out_thresh", 5e-8),
}
if check_config("SVs", place=config["modes"][mode]):
happler_config["SVs"] = config["modes"][mode]["SVs"]
elif "str" == mode:
pass
else:
raise ValueError("Unsupported operating 'mode' in config")

Expand All @@ -133,31 +146,12 @@ merged_happler = rules.happler_merge.output.pgen
if mode == "ld_range" and config["modes"][mode]["random"]:
happler_config["random"] = rules.simulate_random_transform.output
happler_config["random_hap"] = rules.simulate_random_transform.input.hap
happler_config["out"] = config["out"] + "/happler/"+mode+"/random/ld_{ld}/beta_{beta}/alpha_{alpha}"
happler_config["out"] = config["out"] + "/happler/"+mode+"/random/{num_haps}_haps/ld_{ld}/beta_{beta}/alpha_{alpha}/rep_{rep}"
module happler_random:
snakefile: "rules/happler.smk"
config: happler_config
use rule * from happler_random as happler_random_*

# if mode in ("hap", "str", "ld_range"):
# finemappers_config = {
# "pheno": rules.simulate_simphenotype.output.pheno,
# "out": config["out"] + "/finemappers/"+mode+"/{beta}",
# "snp_panel": config["gts_snp_panel"],
# }
# if mode in ("hap", "ld_range"):
# finemappers_config["causal_gt"] = rules.simulate_transform.output.pgen
# if mode == "ld_range":
# finemappers_config["out"] = config["out"] + "/finemappers/"+mode+"/ld_{ld}/beta_{beta}/alpha_{alpha}"
# else:
# raise ValueError("Not yet implemented operating mode 'str' in config")
# module finemappers:
# snakefile: "rules/finemappers.smk"
# config: finemappers_config
# use rule * from finemappers as finemappers_*
# else:
# raise ValueError(f"Unsupported operating mode '{mode}' in config")

plots_config = {
"out": config["out"] + "/plots",
"mode": mode,
Expand Down Expand Up @@ -190,7 +184,7 @@ elif mode == "run":
output:
hps = orig_out + "/multiline.tsv",
resources:
runtime=3,
runtime=10,
log:
orig_out + "/logs/multiline",
benchmark:
Expand All @@ -205,10 +199,11 @@ elif mode == "run":

def FINAL_OUTPUT(wildcards):
files = [
rules.happler_pips.output.tsv,
rules.happler_tree.output.png,
rules.happler_linreg.output.png,
rules.happler_heatmap.output.png,
rules.happler_manhattan.output.png,
# rules.happler_manhattan.output.png,
rules.happler_cond_linreg.output.png,
rules.happler_results.output.susie_pdf,
]
Expand All @@ -230,16 +225,22 @@ elif mode == "run":
else:
FINAL_OUTPUT = expand(
[
rules.happler_cond_linreg.output.png,
rules.happler_tree.output.png,
rules.happler_manhattan.output.png,
# rules.finemappers_results.output.susie_pdf,
rules.happler_results.output.susie_pdf,
rules.plots_params.output.png,
# rules.plots_params.output.png,
],
ex=(("in",) if mode == "hap" else ("ex", "in")),
beta=config["modes"]["hap"]["beta"],
allow_missing=True,
)
if check_config("reps", place=config["modes"][mode]):
FINAL_OUTPUT = expand(
FINAL_OUTPUT,
rep=range(config["modes"][mode]["reps"]),
allow_missing=True,
)

# If '--config debug=true' is specified, we won't try to build the entire DAG
# This can be helpful when you want to play around with a specific target output
Expand Down
2 changes: 2 additions & 0 deletions analysis/workflow/envs/default.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ dependencies:
- conda-forge::sed==4.8
- conda-forge::bash==5.0.018
- conda-forge::gzip==1.10
- conda-forge::click==8.1.7
- bioconda::plink2==2.00a5.10
- conda-forge::graphviz==2.50.0
- conda-forge::scikit-learn==1.5.1
- bioconda::igv==2.17.4
- conda-forge::ipython # for debugging
Loading

0 comments on commit 2bad3c3

Please sign in to comment.