Merge branch 'feat/ukb' into feat/inverted-hap

gymrek-lab · Jul 26, 2024 · 1b5b37e · 1b5b37e
2 parents 6331382 + 41a4f84
commit 1b5b37e
Show file tree

Hide file tree

Showing 6 changed files with 388 additions and 11 deletions.
diff --git a/analysis/workflow/rules/happler.smk b/analysis/workflow/rules/happler.smk
@@ -487,7 +487,7 @@ rule gwas:
         maf = check_config("min_maf", 0),
         in_prefix = lambda w, input: Path(input.pgen).with_suffix(""),
         out_prefix = lambda w, output: Path(output.log).with_suffix(""),
-        covar = lambda wildcards, input: ("'no-x-sex' --covar 'iid-only' " + input["covar"]) if check_config("covar") else "allow-no-covars",
+        covar = lambda wildcards, input: ("no-x-sex hide-covar --covar 'iid-only' " + input["covar"]) if check_config("covar") else "allow-no-covars",
         start=lambda wildcards: parse_locus(wildcards.locus)[1],
         end=lambda wildcards: parse_locus(wildcards.locus)[2],
         chrom=lambda wildcards: parse_locus(wildcards.locus)[0],

diff --git a/analysis/workflow/scripts/README.md b/analysis/workflow/scripts/README.md
@@ -33,6 +33,9 @@ Create a manhattan plot to visualize the results of a GWAS.
 ## [merge_plink.py](merge_plink.py)
 Merge variants from two PGEN files that have the same set of samples.
 
+## [midway_manhattan.bash](midway_manhattan.bash)
+Create a "midway" manhattan plot depicting the p-values on a branch of a tree mid-way through happler's tree-building process.
+
 ## [parameter_plot.py](parameter_plot.py)
 Plot the LD between a causal haplotype and a set of observed haplotypes across a range of hyper-parameters.
 

diff --git a/analysis/workflow/scripts/linear2ttest.py b/analysis/workflow/scripts/linear2ttest.py
@@ -0,0 +1,139 @@
+#!/usr/bin/env python
+from typing import Tuple
+from pathlib import Path
+from logging import Logger
+
+import click
+import numpy as np
+import pandas as pd
+from haptools import data
+from haptools.logging import getLogger
+
+from happler.tree.assoc_test import NodeResults
+from happler.tree.terminator import TTestTerminator
+from happler.tree.assoc_test import AssocResults, AssocTestSimple
+
+
+PLINK_COLS = {
+    "#CHROM": "chromosome",
+    "POS": "pos",
+    "ID": "id",
+    "OBS_CT": "samples",
+    "BETA": "beta",
+    "SE": "stderr",
+    "P": "pval",
+}
+
+def load_linear_file(linear_fname: Path):
+    keep_cols = list(PLINK_COLS.keys())
+    df = pd.read_csv(
+        linear_fname,
+        sep="\t",
+        header=0,
+        usecols=keep_cols,
+    ).rename(columns=PLINK_COLS)
+    df = df.sort_values("pos")
+    df["pval"] = df["pval"].fillna(np.inf)
+    return df
+
+
+@click.command()
+@click.argument("linear", type=click.Path(exists=True, path_type=Path))
+@click.argument("parent", type=click.Path(exists=True, path_type=Path))
+@click.argument("linear_gts", type=click.Path(exists=True, path_type=Path))
+@click.argument("parent_gts", type=click.Path(exists=True, path_type=Path))
+@click.option(
+    "-i",
+    "--id",
+    "hap_id",
+    type=str,
+    show_default="first",
+    help=(
+        "The ID of the variant to choose as the parent haplotype"
+    ),
+)
+@click.option(
+    "-o",
+    "--output",
+    type=click.Path(path_type=Path),
+    default=Path("/dev/stdout"),
+    show_default="stdout",
+    help="A transformed genotypes file",
+)
+@click.option(
+    "-v",
+    "--verbosity",
+    type=click.Choice(["CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"]),
+    default="INFO",
+    show_default=True,
+    help="The level of verbosity desired",
+)
+def main(
+    linear: Path,
+    parent: Path,
+    linear_gts: Path,
+    parent_gts: Path,
+    hap_id: str = None,
+    output: Path = Path("/dev/stdout"),
+    verbosity: str = "DEBUG",
+):
+    """
+    Convert p-values from plink2 --glm into t-test p-values using the given parent
+
+    The parent file should come from running plink2 --glm on haptools transform
+    """
+    log = getLogger("linear2ttest", verbosity)
+
+    log.info("Loading linear files")
+    df = load_linear_file(linear)
+    parent_df = load_linear_file(parent)
+    if hap_id is not None:
+        parent_df = parent_df[parent_df.id == hap_id]
+    parent_df = parent_df.iloc[0]
+
+    log.info("Loading genotypes")
+    linear_gts = data.GenotypesPLINK.load(linear_gts)
+    linear_gts.subset(variants=tuple(df["id"]), inplace=True)
+    linear_stds = linear_gts.data.sum(axis=2).std(axis=0)
+    parent_gts = data.GenotypesPLINK.load(parent_gts)
+    parent_gts.subset(variants=(parent_df["id"],), inplace=True)
+    parent_stds = parent_gts.data.sum(axis=2).std(axis=0)
+
+    log.info("Adjusting betas and stderrs")
+    df["beta"] = df["beta"] * linear_stds
+    df["stderr"] = df["stderr"] * linear_stds
+    parent_df["beta"] = parent_df["beta"] * parent_stds
+    parent_df["stderr"] = parent_df["stderr"] * parent_stds
+
+    log.info("Computing t-test p-values")
+    num_tests = 1
+    num_samps = int(parent_df.samples)
+    parent_res = NodeResults.from_np(parent_df.loc[["beta", "pval", "stderr"]])
+    results = AssocResults(
+        np.array(
+            list(df[["beta", "pval", "stderr"]].itertuples(index=False)),
+            dtype=AssocTestSimple().return_dtype,
+        )
+    )
+    terminator = TTestTerminator(corrector=None)
+    vals = [
+        terminator.compute_val(
+            parent_res,
+            NodeResults.from_np(val[["beta", "pval", "stderr"]]),
+            results,
+            idx,
+            num_samps,
+            num_tests,
+            short_circuit=False,
+        ) for idx, val in df.iterrows()
+    ]
+    df["pval"] = np.array([(val[0] if val != True else 1) for val in vals])
+
+    log.info("Outputting t-test p-values")
+    # first, reset column names
+    df.rename(columns={v: k for k, v in PLINK_COLS.items()}, inplace=True)
+    df.to_csv(output, sep="\t", index=False)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/analysis/workflow/scripts/midway_manhattan.bash b/analysis/workflow/scripts/midway_manhattan.bash
@@ -0,0 +1,144 @@
+#!/usr/bin/env bash
+# Creates a manhattan plot of SNP p-values midway through happler's tree branching process
+# arg1: PGEN file from which to obtain SNP genotypes
+# arg2: pheno file from which to obtain the phenotypes
+# arg3: happler hap file from which to obtain the haplotypes
+# arg4: output prefix
+# arg5: ID of the target haplotype in the hap file
+# arg6: ID of a "child" SNP in the target haplotype. The parent will include all SNPs of the haplotype up to this one.
+# arg7: 0 or 1 indicating whether to include the parent and child SNPs in the regression as covariates, or 2 if the regular p-values should be converted to t-test p-values. 1 requires that the child SNP be the second node from the root of the tree. (optional - defaults to 0)
+# ex: workflow/scripts/midway_manhattan.bash out/19_55363180-55833573/genotypes/snps.pgen 19_55363180-55833573.indep.pheno 19_55363180-55833573.indep.hap 19_55363180-55833573.indep H0 rs61734259 0.005 1
+
+pgen_file="$1"
+pheno_file="$2"
+hap_file="$3"
+out_prefix="$4"
+hap_id="$5"
+snp_id="$6"
+maf="$7"
+condition="${8:-0}"
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+############################################## CHECKS ##############################################
+
+# first, check that the child SNP is in the hap file
+grep -m1 -P '^V\t'"$hap_id"'\t.*\t'"$snp_id"'\t' "$hap_file" >/dev/null || {
+    echo "Failed to find $hap_id:$snp_id in $hap_file" 1>&2
+    exit
+}
+
+# now, retrieve the contents of the hap file that denote the haplotype up until (but not including!) the child SNP
+hap="$(grep -P '\t'"$hap_id"'\t' "$hap_file" | sed '/'"$snp_id"'/Q')"
+
+# next, check that the haplotype has at least one SNP preceding the child SNP
+[ "$(echo "$hap" | wc -l)" -gt 1 ] || {
+    echo "Failed to find at least one parent SNP for $hap_id:$snp_id in $hap_file" 1>&2
+    exit
+}
+
+# if we should include the parent as a covariate, then there should ONLY be one SNP preceding the child SNP
+if [ "$condition" -eq 1 ]; then
+    [ "$(echo "$hap" | wc -l)" -eq 2 ] || {
+        echo "There can only be one SNP before $snp_id for $hap_id in $hap_file" 1>&2
+        exit
+    }
+fi
+parent_snp_id="$(echo "$hap" | tail -n1 | cut -f5)"
+
+# finally, check that the child SNP appears in the PVAR file
+grep -m1 -P "\t"$snp_id"\t" "${pgen_file%.pgen}.pvar" >/dev/null || {
+    echo "Failed to find at least one parent SNP for $hap_id:$snp_id in $hap_file" 1>&2
+    exit
+}
+
+# now, determine whether the child SNP appears in the haplotype by its REF (0) or ALT (1) allele
+allele=$(grep "$(grep -m1 -P '^V\t'"$hap_id"'\t.*\t'"$snp_id"'\t' "$hap_file" | cut -f5,6)" "${pgen_file%.pgen}.pvar" | wc -l)
+allele=$(expr 1 - $allele)
+
+
+############################################## MAIN PROGRAM ########################################
+
+# step 1: use happler transform to retrieve the genotypes of the parent haplotype extended by each SNP in the PGEN file
+happler transform \
+--maf $maf \
+-a $allele \
+--verbosity DEBUG \
+-o "$out_prefix".pgen \
+-S <(cut -f1 "$pheno_file" | tail -n+2) \
+"$pgen_file" <(
+    grep -E '^#' "$hap_file"
+    echo "$hap"
+)
+
+# step 2: use plink2 to obtain p-values for a manhattan plot
+if [ "$condition" -eq 1 ]; then
+    # output the parent and child nodes to a covariate file
+    plink2 \
+    --maf $maf \
+    --out "$out_prefix" \
+    --export A ref-first \
+    --pfile "${pgen_file%.pgen}" \
+    --snps $parent_snp_id $snp_id
+    { echo -n "#"; cut -f 2,7- "$out_prefix".raw; } > "$out_prefix".covar
+    # incorporate the parent and child nodes as covariates in the model
+    plink2 \
+    --maf $maf \
+    --glm no-x-sex hide-covar \
+    --out "$out_prefix" \
+    --pfile "$out_prefix" \
+    --variance-standardize \
+    --pheno iid-only "$pheno_file" \
+    --covar 'iid-only' "$out_prefix".covar
+
+    last_arg="-a 0.05"
+else
+    plink2 \
+    --maf $maf \
+    --out "$out_prefix" \
+    --pfile "$out_prefix" \
+    --variance-standardize \
+    --pheno iid-only "$pheno_file" \
+    --glm no-x-sex allow-no-covars
+
+    last_arg="-a $(grep -P '^V\t.*\t'"$parent_snp_id"'\t' "$hap_file" | cut -f7)"
+fi
+
+linear_file="$(ls -1 "$out_prefix".*.glm.linear | head -n1)"
+
+if [ "$condition" -eq 2 ]; then
+    # first, get the parent haplotype as a PGEN file
+    haptools transform \
+    --verbosity DEBUG \
+    -o "$out_prefix".parent.pgen \
+    -S <(cut -f1 "$pheno_file" | tail -n+2) \
+    "$pgen_file" <(
+        grep -E '^#' "$hap_file"
+        echo "$hap"
+    )
+    # get summary statistics for the parent haplotype
+    plink2 \
+    --maf $maf \
+    --out "$out_prefix".parent \
+    --pfile "$out_prefix".parent \
+    --variance-standardize \
+    --pheno iid-only "$pheno_file" \
+    --glm no-x-sex allow-no-covars
+    # now, convert the plink2 --glm results into t-test p-values
+    "$SCRIPT_DIR"/linear2ttest.py \
+    -i "$hap_id" \
+    --verbosity DEBUG \
+    -o "$out_prefix".ttest.linear \
+    "$linear_file" "$out_prefix".parent.*.glm.linear \
+    "$out_prefix".pgen "$out_prefix".parent.pgen
+    # rename the linear file
+    linear_file="$out_prefix".ttest.linear
+    last_arg="-a 0.05"
+fi
+
+# step 3: use manhattan.py to actually create the manhattan plot
+"$SCRIPT_DIR"/manhattan.py \
+-i "$snp_id" \
+-o "$out_prefix".png \
+-l "$linear_file" \
+$last_arg
diff --git a/happler/__main__.py b/happler/__main__.py
@@ -461,10 +461,10 @@ def transform(
         )
     if samples_file:
         with samples_file as samps_file:
-            samples = samps_file.read().splitlines()
+            samples = set(samps_file.read().splitlines())
     elif samples:
-        # needs to be converted from tuple to list
-        samples = list(samples)
+        # needs to be converted from tuple to set
+        samples = set(samples)
     else:
         samples = None
     # load data
@@ -516,6 +516,13 @@ def transform(
     hp_gt.samples = gt.samples
     hp_gt.data = hp.transform(gt, allele)
 
+    # remove variants that no longer pass the MAF threshold
+    num_variants = len(hp_gt.variants)
+    hp_gt.check_maf(threshold=maf, discard_also=True)
+    removed = num_variants - len(hp_gt.variants)
+    if maf is not None:
+        log.info(f"Ignoring {removed} variants with MAF < {maf}")
+
     hp_gt.write()