Skip to content

Commit

Permalink
Merge branch 'feat/ukb' into feat/inverted-hap
Browse files Browse the repository at this point in the history
  • Loading branch information
aryarm authored Jul 26, 2024
2 parents 6331382 + 41a4f84 commit 1b5b37e
Show file tree
Hide file tree
Showing 6 changed files with 388 additions and 11 deletions.
2 changes: 1 addition & 1 deletion analysis/workflow/rules/happler.smk
Original file line number Diff line number Diff line change
Expand Up @@ -487,7 +487,7 @@ rule gwas:
maf = check_config("min_maf", 0),
in_prefix = lambda w, input: Path(input.pgen).with_suffix(""),
out_prefix = lambda w, output: Path(output.log).with_suffix(""),
covar = lambda wildcards, input: ("'no-x-sex' --covar 'iid-only' " + input["covar"]) if check_config("covar") else "allow-no-covars",
covar = lambda wildcards, input: ("no-x-sex hide-covar --covar 'iid-only' " + input["covar"]) if check_config("covar") else "allow-no-covars",
start=lambda wildcards: parse_locus(wildcards.locus)[1],
end=lambda wildcards: parse_locus(wildcards.locus)[2],
chrom=lambda wildcards: parse_locus(wildcards.locus)[0],
Expand Down
3 changes: 3 additions & 0 deletions analysis/workflow/scripts/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ Create a manhattan plot to visualize the results of a GWAS.
## [merge_plink.py](merge_plink.py)
Merge variants from two PGEN files that have the same set of samples.

## [midway_manhattan.bash](midway_manhattan.bash)
Create a "midway" manhattan plot depicting the p-values on a branch of a tree mid-way through happler's tree-building process.

## [parameter_plot.py](parameter_plot.py)
Plot the LD between a causal haplotype and a set of observed haplotypes across a range of hyper-parameters.

Expand Down
139 changes: 139 additions & 0 deletions analysis/workflow/scripts/linear2ttest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
#!/usr/bin/env python
from typing import Tuple
from pathlib import Path
from logging import Logger

import click
import numpy as np
import pandas as pd
from haptools import data
from haptools.logging import getLogger

from happler.tree.assoc_test import NodeResults
from happler.tree.terminator import TTestTerminator
from happler.tree.assoc_test import AssocResults, AssocTestSimple


PLINK_COLS = {
"#CHROM": "chromosome",
"POS": "pos",
"ID": "id",
"OBS_CT": "samples",
"BETA": "beta",
"SE": "stderr",
"P": "pval",
}

def load_linear_file(linear_fname: Path):
keep_cols = list(PLINK_COLS.keys())
df = pd.read_csv(
linear_fname,
sep="\t",
header=0,
usecols=keep_cols,
).rename(columns=PLINK_COLS)
df = df.sort_values("pos")
df["pval"] = df["pval"].fillna(np.inf)
return df


@click.command()
@click.argument("linear", type=click.Path(exists=True, path_type=Path))
@click.argument("parent", type=click.Path(exists=True, path_type=Path))
@click.argument("linear_gts", type=click.Path(exists=True, path_type=Path))
@click.argument("parent_gts", type=click.Path(exists=True, path_type=Path))
@click.option(
"-i",
"--id",
"hap_id",
type=str,
show_default="first",
help=(
"The ID of the variant to choose as the parent haplotype"
),
)
@click.option(
"-o",
"--output",
type=click.Path(path_type=Path),
default=Path("/dev/stdout"),
show_default="stdout",
help="A transformed genotypes file",
)
@click.option(
"-v",
"--verbosity",
type=click.Choice(["CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"]),
default="INFO",
show_default=True,
help="The level of verbosity desired",
)
def main(
linear: Path,
parent: Path,
linear_gts: Path,
parent_gts: Path,
hap_id: str = None,
output: Path = Path("/dev/stdout"),
verbosity: str = "DEBUG",
):
"""
Convert p-values from plink2 --glm into t-test p-values using the given parent
The parent file should come from running plink2 --glm on haptools transform
"""
log = getLogger("linear2ttest", verbosity)

log.info("Loading linear files")
df = load_linear_file(linear)
parent_df = load_linear_file(parent)
if hap_id is not None:
parent_df = parent_df[parent_df.id == hap_id]
parent_df = parent_df.iloc[0]

log.info("Loading genotypes")
linear_gts = data.GenotypesPLINK.load(linear_gts)
linear_gts.subset(variants=tuple(df["id"]), inplace=True)
linear_stds = linear_gts.data.sum(axis=2).std(axis=0)
parent_gts = data.GenotypesPLINK.load(parent_gts)
parent_gts.subset(variants=(parent_df["id"],), inplace=True)
parent_stds = parent_gts.data.sum(axis=2).std(axis=0)

log.info("Adjusting betas and stderrs")
df["beta"] = df["beta"] * linear_stds
df["stderr"] = df["stderr"] * linear_stds
parent_df["beta"] = parent_df["beta"] * parent_stds
parent_df["stderr"] = parent_df["stderr"] * parent_stds

log.info("Computing t-test p-values")
num_tests = 1
num_samps = int(parent_df.samples)
parent_res = NodeResults.from_np(parent_df.loc[["beta", "pval", "stderr"]])
results = AssocResults(
np.array(
list(df[["beta", "pval", "stderr"]].itertuples(index=False)),
dtype=AssocTestSimple().return_dtype,
)
)
terminator = TTestTerminator(corrector=None)
vals = [
terminator.compute_val(
parent_res,
NodeResults.from_np(val[["beta", "pval", "stderr"]]),
results,
idx,
num_samps,
num_tests,
short_circuit=False,
) for idx, val in df.iterrows()
]
df["pval"] = np.array([(val[0] if val != True else 1) for val in vals])

log.info("Outputting t-test p-values")
# first, reset column names
df.rename(columns={v: k for k, v in PLINK_COLS.items()}, inplace=True)
df.to_csv(output, sep="\t", index=False)


if __name__ == "__main__":
main()
144 changes: 144 additions & 0 deletions analysis/workflow/scripts/midway_manhattan.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
#!/usr/bin/env bash
# Creates a manhattan plot of SNP p-values midway through happler's tree branching process
# arg1: PGEN file from which to obtain SNP genotypes
# arg2: pheno file from which to obtain the phenotypes
# arg3: happler hap file from which to obtain the haplotypes
# arg4: output prefix
# arg5: ID of the target haplotype in the hap file
# arg6: ID of a "child" SNP in the target haplotype. The parent will include all SNPs of the haplotype up to this one.
# arg7: 0 or 1 indicating whether to include the parent and child SNPs in the regression as covariates, or 2 if the regular p-values should be converted to t-test p-values. 1 requires that the child SNP be the second node from the root of the tree. (optional - defaults to 0)
# ex: workflow/scripts/midway_manhattan.bash out/19_55363180-55833573/genotypes/snps.pgen 19_55363180-55833573.indep.pheno 19_55363180-55833573.indep.hap 19_55363180-55833573.indep H0 rs61734259 0.005 1

pgen_file="$1"
pheno_file="$2"
hap_file="$3"
out_prefix="$4"
hap_id="$5"
snp_id="$6"
maf="$7"
condition="${8:-0}"

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

############################################## CHECKS ##############################################

# first, check that the child SNP is in the hap file
grep -m1 -P '^V\t'"$hap_id"'\t.*\t'"$snp_id"'\t' "$hap_file" >/dev/null || {
echo "Failed to find $hap_id:$snp_id in $hap_file" 1>&2
exit
}

# now, retrieve the contents of the hap file that denote the haplotype up until (but not including!) the child SNP
hap="$(grep -P '\t'"$hap_id"'\t' "$hap_file" | sed '/'"$snp_id"'/Q')"

# next, check that the haplotype has at least one SNP preceding the child SNP
[ "$(echo "$hap" | wc -l)" -gt 1 ] || {
echo "Failed to find at least one parent SNP for $hap_id:$snp_id in $hap_file" 1>&2
exit
}

# if we should include the parent as a covariate, then there should ONLY be one SNP preceding the child SNP
if [ "$condition" -eq 1 ]; then
[ "$(echo "$hap" | wc -l)" -eq 2 ] || {
echo "There can only be one SNP before $snp_id for $hap_id in $hap_file" 1>&2
exit
}
fi
parent_snp_id="$(echo "$hap" | tail -n1 | cut -f5)"

# finally, check that the child SNP appears in the PVAR file
grep -m1 -P "\t"$snp_id"\t" "${pgen_file%.pgen}.pvar" >/dev/null || {
echo "Failed to find at least one parent SNP for $hap_id:$snp_id in $hap_file" 1>&2
exit
}

# now, determine whether the child SNP appears in the haplotype by its REF (0) or ALT (1) allele
allele=$(grep "$(grep -m1 -P '^V\t'"$hap_id"'\t.*\t'"$snp_id"'\t' "$hap_file" | cut -f5,6)" "${pgen_file%.pgen}.pvar" | wc -l)
allele=$(expr 1 - $allele)


############################################## MAIN PROGRAM ########################################

# step 1: use happler transform to retrieve the genotypes of the parent haplotype extended by each SNP in the PGEN file
happler transform \
--maf $maf \
-a $allele \
--verbosity DEBUG \
-o "$out_prefix".pgen \
-S <(cut -f1 "$pheno_file" | tail -n+2) \
"$pgen_file" <(
grep -E '^#' "$hap_file"
echo "$hap"
)

# step 2: use plink2 to obtain p-values for a manhattan plot
if [ "$condition" -eq 1 ]; then
# output the parent and child nodes to a covariate file
plink2 \
--maf $maf \
--out "$out_prefix" \
--export A ref-first \
--pfile "${pgen_file%.pgen}" \
--snps $parent_snp_id $snp_id
{ echo -n "#"; cut -f 2,7- "$out_prefix".raw; } > "$out_prefix".covar
# incorporate the parent and child nodes as covariates in the model
plink2 \
--maf $maf \
--glm no-x-sex hide-covar \
--out "$out_prefix" \
--pfile "$out_prefix" \
--variance-standardize \
--pheno iid-only "$pheno_file" \
--covar 'iid-only' "$out_prefix".covar

last_arg="-a 0.05"
else
plink2 \
--maf $maf \
--out "$out_prefix" \
--pfile "$out_prefix" \
--variance-standardize \
--pheno iid-only "$pheno_file" \
--glm no-x-sex allow-no-covars

last_arg="-a $(grep -P '^V\t.*\t'"$parent_snp_id"'\t' "$hap_file" | cut -f7)"
fi

linear_file="$(ls -1 "$out_prefix".*.glm.linear | head -n1)"

if [ "$condition" -eq 2 ]; then
# first, get the parent haplotype as a PGEN file
haptools transform \
--verbosity DEBUG \
-o "$out_prefix".parent.pgen \
-S <(cut -f1 "$pheno_file" | tail -n+2) \
"$pgen_file" <(
grep -E '^#' "$hap_file"
echo "$hap"
)
# get summary statistics for the parent haplotype
plink2 \
--maf $maf \
--out "$out_prefix".parent \
--pfile "$out_prefix".parent \
--variance-standardize \
--pheno iid-only "$pheno_file" \
--glm no-x-sex allow-no-covars
# now, convert the plink2 --glm results into t-test p-values
"$SCRIPT_DIR"/linear2ttest.py \
-i "$hap_id" \
--verbosity DEBUG \
-o "$out_prefix".ttest.linear \
"$linear_file" "$out_prefix".parent.*.glm.linear \
"$out_prefix".pgen "$out_prefix".parent.pgen
# rename the linear file
linear_file="$out_prefix".ttest.linear
last_arg="-a 0.05"
fi

# step 3: use manhattan.py to actually create the manhattan plot
"$SCRIPT_DIR"/manhattan.py \
-i "$snp_id" \
-o "$out_prefix".png \
-l "$linear_file" \
$last_arg
13 changes: 10 additions & 3 deletions happler/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -461,10 +461,10 @@ def transform(
)
if samples_file:
with samples_file as samps_file:
samples = samps_file.read().splitlines()
samples = set(samps_file.read().splitlines())
elif samples:
# needs to be converted from tuple to list
samples = list(samples)
# needs to be converted from tuple to set
samples = set(samples)
else:
samples = None
# load data
Expand Down Expand Up @@ -516,6 +516,13 @@ def transform(
hp_gt.samples = gt.samples
hp_gt.data = hp.transform(gt, allele)

# remove variants that no longer pass the MAF threshold
num_variants = len(hp_gt.variants)
hp_gt.check_maf(threshold=maf, discard_also=True)
removed = num_variants - len(hp_gt.variants)
if maf is not None:
log.info(f"Ignoring {removed} variants with MAF < {maf}")

hp_gt.write()


Expand Down
Loading

0 comments on commit 1b5b37e

Please sign in to comment.