Merge pull request #46 from gymrek-lab/feat/ukb

feat: finish running `platelet_count` pipeline on UKB
gymrek-lab · Dec 5, 2024 · 2bad3c3 · 2bad3c3
2 parents 2bf6d5f + 3090781
commit 2bad3c3
Show file tree

Hide file tree

Showing 56 changed files with 3,172 additions and 704 deletions.
diff --git a/.devcontainer.json b/.devcontainer.json
@@ -5,10 +5,7 @@
 	// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
 	"image": "mcr.microsoft.com/devcontainers/base:jammy",
 	"features": {
-		"ghcr.io/rocker-org/devcontainer-features/miniforge:1": {
-			"version": "latest",
-			"variant": "Mambaforge"
-		}
+		"ghcr.io/rocker-org/devcontainer-features/miniforge:2": {}
 	},
 
 	// Features to add to the dev container. More info: https://containers.dev/features.

diff --git a/.github/workflows/conventional-prs.yml b/.github/workflows/conventional-prs.yml
@@ -17,6 +17,6 @@ jobs:
       statuses: write  # for amannn/action-semantic-pull-request to mark status of analyzed PR
     runs-on: ubuntu-latest
     steps:
-      - uses: amannn/action-semantic-pull-request@v5.0.2
+      - uses: amannn/action-semantic-pull-request@v5
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -10,15 +10,15 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - { python: "3.8", os: "ubuntu-latest", session: "lint" }
-          - { python: "3.8", os: "ubuntu-latest", session: "tests" }
+          - { python: "3.9", os: "ubuntu-latest", session: "lint" }
           - { python: "3.9", os: "ubuntu-latest", session: "tests" }
           - { python: "3.10", os: "ubuntu-latest", session: "tests" }
           - { python: "3.11", os: "ubuntu-latest", session: "tests" }
           - { python: "3.12", os: "ubuntu-latest", session: "tests" }
+          - { python: "3.13", os: "ubuntu-latest", session: "tests" }
           # - { python: "3.11", os: "windows-latest", session: "tests" }
           # - { python: "3.9", os: "macos-latest", session: "tests" }
-          - { python: "3.8", os: "ubuntu-latest", session: "size" }
+          - { python: "3.9", os: "ubuntu-latest", session: "size" }
 
     env:
       NOXSESSION: ${{ matrix.session }}
@@ -29,11 +29,10 @@ jobs:
       - name: Check out the repository
         uses: actions/checkout@v4
 
-      - name: Setup Mambaforge
+      - name: Setup Miniforge
         uses: conda-incubator/setup-miniconda@v3
         with:
           activate-environment: happler
-          miniforge-variant: Mambaforge
           auto-activate-base: false
           miniforge-version: latest
           use-mamba: true
@@ -44,7 +43,7 @@ jobs:
         shell: bash
 
       - name: Cache Conda env
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: ${{ env.CONDA }}/envs
           key:
@@ -94,7 +93,7 @@ jobs:
         uses: actions/checkout@v4
 
       - name: Check for large files
-        uses: actionsdesk/lfs-warning@v3.2
+        uses: ppremk/lfs-warning@v3.3
         with:
           token: ${{ secrets.GITHUB_TOKEN }} # Optional
           filesizelimit: 500000b

diff --git a/.readthedocs.yaml b/.readthedocs.yaml
@@ -9,7 +9,7 @@ version: 2
 build:
   os: "ubuntu-22.04"
   tools:
-    python: "3.8"
+    python: "3.9"
   jobs:
     post_create_environment:
       # Install poetry

diff --git a/analysis/.gitignore b/analysis/.gitignore
@@ -0,0 +1,4 @@
+midway_manhattans-indepterminator
+!midway_manhattans-indepterminator/LINKS
+outs
+HAPFILES
diff --git a/analysis/config/config-geuvadis.yml b/analysis/config/config-geuvadis.yml
@@ -42,24 +42,6 @@ snp_panel: "data/geuvadis/geuvadis_ensemble_phasing.pgen"
 # locus: 19:45401409-46401409 # center: 45901409 (APOe4)
 locus: data/geuvadis/geuvadis_eqtl_genes.full.liftover.bed
 modes:
- str:
-  pos: 19:45903857 # STR_691361
- snp:
-  pos: 19:45910672 # rs1046282
- hap:
-  alleles: [rs36046716:G, rs1046282:G] # 45892145, 45910672
-  beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]
- ld_range:
-  reps: 1
-  min_ld: 0
-  max_ld: 1
-  step: 0.1
-  min_af: 0.25
-  max_af: 0.75
-  # beta: [0.35]
-  beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]
-  alpha: [0.05]
-  random: false # whether to also produce random haplotypes
  run:
   pheno: data/geuvadis/phenos/{trait}.pheno
   SVs: data/geuvadis/pangenie_hprc_hg38_all-samples_bi_SVs-missing_removed.pgen
@@ -79,6 +61,9 @@ min_maf: 0.1
 # sample_size: [500, 1000, 1500, 2000, 2500]
 # sample_size: 777
 
+# A threshold on the pvalues of the haplotypes
+out_thresh: 5e-6
+
 # Whether to include the causal variant in the set of genotypes provided to the
 # finemapping methods. Set this to true if you're interested in seeing how the
 # methods perform when the causal variant is absent from the data.

diff --git a/analysis/config/config-simulations.yml b/analysis/config/config-simulations.yml
@@ -0,0 +1,92 @@
+# This is the Snakemake configuration file that specifies paths and
+# and options for the pipeline. Anybody wishing to use
+# the provided snakemake pipeline should first fill out this file with paths to
+# their own data, as the Snakefile requires it.
+# Every config option has reasonable defaults unless it is labeled as "required."
+# All paths are relative to the directory that Snakemake is executed in.
+# Note: this file is written in the YAML syntax (https://learnxinyminutes.com/docs/yaml/)
+
+
+# Paths to a SNP-STR haplotype reference panel
+# You can download this from http://gymreklab.com/2018/03/05/snpstr_imputation.html
+# If the VCFs are per-chromosome, replace the contig name in the file name with "{chr}"
+# The VCF(s) must be sorted and indexed (with a .tbi file in the same directory)
+# required!
+# snp_panel: "data/simulations/1000G/19_45401409-46401409.pgen"
+# str_panel: "/tscc/projects/ps-gymreklab/jmargoli/ukbiobank/str_imputed/runs/first_pass/vcfs/annotated_strs/chr{chr}.vcf.gz"
+snp_panel: "data/simulations/1000G.SNP-STR/19_45401409-46401409.SNPs.pgen"
+str_panel: "data/simulations/1000G.SNP-STR/19_45401409-46401409.STRs.pgen"
+
+
+# Path to a list of samples to exclude from the analysis
+# There should be one sample ID per line
+# exclude_samples: data/ukb_random_samples_exclude.tsv
+
+# If SNPs are unphased, provide the path to a SHAPEIT4 map file like these:
+# https://github.com/odelaneau/shapeit4/tree/master/maps
+# The map file should use the same reference genome as the reference panel VCFs
+# phase_map: data/genetic_maps/chr{chr}.b37.gmap.gz
+
+# A "locus" is a string with a contig name, a colon, the start position, a dash, and
+# the end position or a BED file with a ".bed" file ending
+# There are different simulation modes that you can use:
+# 1. "str" - a tandem repeat is a string with a contig name, a colon, and the start position
+# 2. "snp" - a SNP follows the same format as "str"
+# 3. "hap" - a haplotype
+# 4. "ld_range" - creates random two-SNP haplotypes with a range of LD values between the alleles of each haplotype
+# 5. "run" - execute happler on a locus without simulating anything
+# The STR and SNP positions should be contained within the locus.
+# The positions should be provided in the same coordinate system as the reference
+# genome of the reference panel VCFs
+# The contig should correspond with the contig name from the {chr} wildcard in the VCF
+# required! and unless otherwise noted, all attributes of each mode are required
+locus: 19:45401409-46401409 # center: 45901409 (APOe4)
+modes:
+ str:
+  pos: 19:45903859
+  id: 19:45903859
+  reps: 10
+  beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]
+ snp:
+  pos: 19:45910672 # rs1046282
+  beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]
+ hap:
+  alleles: [rs36046716:G, rs1046282:G] # 45892145, 45910672
+  beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]
+ ld_range:
+  reps: 10
+  min_ld: 0
+  max_ld: 1
+  step: 0.1
+  min_af: 0.25
+  max_af: 0.75
+  # beta: [0.35]
+  beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]
+  alpha: [0.05]
+  random: false # whether to also produce random haplotypes
+  num_haps: [1, 2, 3, 4]
+ run:
+  pheno: data/geuvadis/phenos/{trait}.pheno
+  SVs: data/geuvadis/pangenie_hprc_hg38_all-samples_bi_SVs-missing_removed.pgen
+  # pheno_matrix: data/geuvadis/EUR_converted_expr_hg38.csv # optional
+mode: str
+
+# Covariates to use if they're needed
+# Otherwise, they're assumed to be regressed out of the phenotypes
+# Note: the finemapping methods won't be able to handle these covariates
+# covar: data/geuvadis/5PCs_sex.covar
+
+# Discard rare variants with a MAF below this number
+# Defaults to 0 if not specified
+min_maf: 0.05
+
+# Sample sizes to use
+# sample_size: [777, 800, 1600, 2400, 3200]
+sample_size: 3200
+
+# Whether to include the causal variant in the set of genotypes provided to the
+# finemapping methods. Set this to true if you're interested in seeing how the
+# methods perform when the causal variant is absent from the data.
+# Defaults to false if not specified
+# You can also provide a list of booleans, if you want to test both values
+exclude_causal: [true, false]
diff --git a/analysis/config/config-ukb.yml b/analysis/config/config-ukb.yml
@@ -44,7 +44,7 @@ mode: run
 
 # Discard rare variants with a MAF below this number
 # Defaults to 0 if not specified
-min_maf: 0.00005
+min_maf: 0.005
 
 # Whether to include the causal variant in the set of genotypes provided to the
 # finemapping methods. Set this to true if you're interested in seeing how the

diff --git a/analysis/config/config.yml b/analysis/config/config.yml
@@ -1 +1 @@
-config-ukb.yml
+config-simulations.yml
diff --git a/analysis/run.bash b/analysis/run.bash
@@ -10,6 +10,9 @@
 #SBATCH --time 0:30:00
 #SBATCH --mem 2G
 #SBATCH --output /dev/null
+#SBATCH --signal=B:SIGUSR1@15
+
+trap "slack job terminating early" SIGUSR1
 
 # An example bash script demonstrating how to run the entire snakemake pipeline
 # This script creates two separate log files in the output dir:

diff --git a/analysis/workflow/Snakefile b/analysis/workflow/Snakefile
@@ -50,7 +50,8 @@ module genotypes:
     config: config
 
 use rule * from genotypes as genotypes_*
-config["gts_str_panel"] = rules.genotypes_subset_str.output.vcf
+if mode == "str":
+    config["gts_str_panel"] = config["str_panel"]
 if check_config("sample_size"):
     sampsize = config["sample_size"]
     config["gts_snp_panel"] = rules.genotypes_subset.output.pgen
@@ -59,6 +60,10 @@ else:
     config["gts_snp_panel"] = rules.genotypes_subset.input.pgen
 config["random"] = False
 
+if mode == "ld_range" and check_config("num_haps", place=config["modes"][mode]):
+    if isinstance(config["modes"][mode]["num_haps"], int):
+        config["modes"][mode]["num_haps"] = [config["modes"][mode]["num_haps"],]
+
 if mode != "run":
     module simulate:
         snakefile: "rules/simulate.smk"
@@ -90,23 +95,32 @@ if check_config("covar"):
     if "pheno_matrix" in covar_config:
         covar_file = rules.covariates_merge.output.covar
 
-if mode in ("snp", "hap", "ld_range"):
+if mode in ("str", "snp", "hap", "ld_range"):
     happler_config = {
         "pheno": rules.simulate_simphenotype.output.pheno,
         "hap_file": rules.simulate_transform.input.hap,
         "snp_panel": config["gts_snp_panel"],
-        "out": config["out"] + "/happler/"+mode+"/{beta}",
+        "out": config["out"] + "/happler/"+mode+"/beta_{beta}",
         "random": None,
         "covar": covar_file,
         "min_maf": check_config("min_maf", 0),
+        "out_thresh": 1, # artificially set to 1 to avoid filtering
     }
     if mode in ("hap", "ld_range"):
         happler_config["snps"] = []
+        happler_config["causal_gt"] = rules.simulate_transform.output
         if mode == "hap":
             happler_config["snps"] = config["modes"][mode]["alleles"]
-        happler_config["causal_gt"] = rules.simulate_transform.output
-    if mode == "ld_range":
-        happler_config["out"] = config["out"] + "/happler/"+mode+"/ld_{ld}/beta_{beta}/alpha_{alpha}"
+        elif mode == "ld_range":
+            happler_config["out"] = config["out"] + "/happler/"+mode+"/{num_haps}_haps/ld_{ld}/beta_{beta}/alpha_{alpha}"
+    elif mode in ("str", "snp"):
+        # also provide the SNP and STR IDs so that they can be used in the manhattan plot
+        happler_config["snps"] = (config["modes"][mode]["id"],)
+        # TODO: consider uncommenting this to compute LD with causal STR
+        # if mode == "str":
+        #     happler_config["causal_gt"] = config["gts_str_panel"]
+    if check_config("reps", place=config["modes"][mode]):
+        happler_config["out"] = happler_config["out"] + "/rep_{rep}"
 elif mode == "run":
     happler_config = {
         "pheno": config["modes"][mode]["pheno"],
@@ -116,11 +130,10 @@ elif mode == "run":
         "random": None,
         "covar": covar_file,
         "min_maf": check_config("min_maf", 0),
+        "out_thresh": check_config("out_thresh", 5e-8),
     }
     if check_config("SVs", place=config["modes"][mode]):
         happler_config["SVs"] = config["modes"][mode]["SVs"]
-elif "str" == mode:
-    pass
 else:
     raise ValueError("Unsupported operating 'mode' in config")
 
@@ -133,31 +146,12 @@ merged_happler = rules.happler_merge.output.pgen
 if mode == "ld_range" and config["modes"][mode]["random"]:
     happler_config["random"] = rules.simulate_random_transform.output
     happler_config["random_hap"] = rules.simulate_random_transform.input.hap
-    happler_config["out"] = config["out"] + "/happler/"+mode+"/random/ld_{ld}/beta_{beta}/alpha_{alpha}"
+    happler_config["out"] = config["out"] + "/happler/"+mode+"/random/{num_haps}_haps/ld_{ld}/beta_{beta}/alpha_{alpha}/rep_{rep}"
     module happler_random:
         snakefile: "rules/happler.smk"
         config: happler_config
     use rule * from happler_random as happler_random_*
 
-# if mode in ("hap", "str", "ld_range"):
-#     finemappers_config = {
-#         "pheno": rules.simulate_simphenotype.output.pheno,
-#         "out": config["out"] + "/finemappers/"+mode+"/{beta}",
-#         "snp_panel": config["gts_snp_panel"],
-#     }
-#     if mode in ("hap", "ld_range"):
-#         finemappers_config["causal_gt"] = rules.simulate_transform.output.pgen
-#         if mode == "ld_range":
-#             finemappers_config["out"] = config["out"] + "/finemappers/"+mode+"/ld_{ld}/beta_{beta}/alpha_{alpha}"
-#     else:
-#         raise ValueError("Not yet implemented operating mode 'str' in config")
-#     module finemappers:
-#         snakefile: "rules/finemappers.smk"
-#         config: finemappers_config
-#     use rule * from finemappers as finemappers_*
-# else:
-#     raise ValueError(f"Unsupported operating mode '{mode}' in config")
-
 plots_config = {
     "out": config["out"] + "/plots",
     "mode": mode,
@@ -190,7 +184,7 @@ elif mode == "run":
         output:
             hps = orig_out + "/multiline.tsv",
         resources:
-            runtime=3,
+            runtime=10,
         log:
             orig_out + "/logs/multiline",
         benchmark:
@@ -205,10 +199,11 @@ elif mode == "run":
 
     def FINAL_OUTPUT(wildcards):
         files = [
+            rules.happler_pips.output.tsv,
             rules.happler_tree.output.png,
             rules.happler_linreg.output.png,
             rules.happler_heatmap.output.png,
-            rules.happler_manhattan.output.png,
+            # rules.happler_manhattan.output.png,
             rules.happler_cond_linreg.output.png,
             rules.happler_results.output.susie_pdf,
         ]
@@ -230,16 +225,22 @@ elif mode == "run":
 else:
     FINAL_OUTPUT = expand(
         [
+            rules.happler_cond_linreg.output.png,
             rules.happler_tree.output.png,
-            rules.happler_manhattan.output.png,
             # rules.finemappers_results.output.susie_pdf,
             rules.happler_results.output.susie_pdf,
-            rules.plots_params.output.png,
+            # rules.plots_params.output.png,
         ],
         ex=(("in",) if mode == "hap" else ("ex", "in")),
         beta=config["modes"]["hap"]["beta"],
         allow_missing=True,
     )
+    if check_config("reps", place=config["modes"][mode]):
+        FINAL_OUTPUT = expand(
+            FINAL_OUTPUT,
+            rep=range(config["modes"][mode]["reps"]),
+            allow_missing=True,
+        )
 
 # If '--config debug=true' is specified, we won't try to build the entire DAG
 # This can be helpful when you want to play around with a specific target output

diff --git a/analysis/workflow/envs/default.yml b/analysis/workflow/envs/default.yml
@@ -11,7 +11,9 @@ dependencies:
     - conda-forge::sed==4.8
     - conda-forge::bash==5.0.018
     - conda-forge::gzip==1.10
+    - conda-forge::click==8.1.7
     - bioconda::plink2==2.00a5.10
     - conda-forge::graphviz==2.50.0
+    - conda-forge::scikit-learn==1.5.1
     - bioconda::igv==2.17.4
     - conda-forge::ipython # for debugging