Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add methylation and motif purity to allele db #31

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions rules/outlier_expansions.smk
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ rule generate_allele_db:
family = config["run"]["project"]
log: "logs/repeat_outliers/{family}_allele_db.log"
resources:
mem_mb = 10000
mem_mb = 80000
conda:
"../envs/str_sv.yaml"
shell:
Expand All @@ -20,6 +20,8 @@ rule sort_allele_db:
input_file = "repeat_outliers/allele_db/{family}.alleles.db.gz"
output: "repeat_outliers/allele_db/{family}.alleles.sorted.db.gz"
log: "logs/repeat_outliers/{family}.allele.sorted.db.log"
resources:
mem_mb = 80000
shell:
"""
(zcat < {input.input_file} | sort -T . -k 1,1 | gzip > {output}) > {log} 2>&1
Expand All @@ -35,7 +37,7 @@ rule find_repeat_outliers:
crg2_pacbio = config["tools"]["crg2_pacbio"]
log: "logs/repeat_outliers/{family}.repeat.outliers.log"
resources:
mem_mb = 20000
mem_mb = 40000
conda:
"../envs/str_sv.yaml"
shell:
Expand Down
150 changes: 150 additions & 0 deletions scripts/allele_distribution_db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
import argparse
import os
import pandas as pd
import numpy as np
import sys


def resample_quantiles(counts: pd.Series, num_resamples: int) -> list:
"""Based on https://github.com/Illumina/ExpansionHunterDenovo/blob/master/scripts/core/common.py"""
resamples = np.random.choice(counts, len(counts) * num_resamples)
resamples = np.split(resamples, num_resamples)

resampled_quantiles = []
for resample in resamples:
quantile = np.quantile(resample, 0.95)
resampled_quantiles.append(quantile)

return resampled_quantiles


def get_counts_with_finite_std(counts: pd.Series) -> pd.Series:
if len(set(counts)) == 1:
return counts[:-1] + [counts[-1] + 0.1]
return counts


def get_cutoff(quantiles: list) -> str:
mean = np.mean(quantiles)
std = max(1, np.std(quantiles))
cutoff = mean + std
return cutoff


def split_alleles(alleles: pd.DataFrame) -> tuple:
"""
Split alleles by comma and return lengths of shortest and longest alleles.
Note that there may be more than two alleles called by TRGT at a locus in an individual (somaticism?)
"""
allele_lens = [len(a) for a in alleles.split(",")]
short_allele_len = min(allele_lens)
long_allele_len = max(allele_lens)

return short_allele_len, long_allele_len


def get_quantiles_cutoffs(trid: str, allele_type: str, alleles: pd.DataFrame) -> tuple:
"""
Calculate the a length cutoff for a particular tandem repeat locus as defined by the mean plus std of 95% quantile after resampling
"""
alleles_trid = alleles[alleles["trid"] == trid]
min = alleles_trid[allele_type].min()
max = alleles_trid[allele_type].max()
quantiles = resample_quantiles(alleles_trid[allele_type], 100)
quantiles = get_counts_with_finite_std(quantiles)
cutoff_short = get_cutoff(quantiles)

return cutoff_short, [min, max]


print("Loading allele db")
alleles = sys.argv[1]
dirname = os.path.dirname(alleles)
outfile = os.path.basename(alleles).replace(".db", ".distribution.tsv.gz")
outpath = os.path.join(dirname, outfile)
alleles = pd.read_csv(
alleles,
header=None,
names=["trid", "sample", "motif_purity", "avg_methylation", "alleles"],
sep=" ",
)

# get length of shortest and longest alleles
alleles["short_allele_len"], alleles["long_allele_len"] = zip(
*alleles["alleles"].map(lambda x: split_alleles(x))
)

try:
# split motif purity column
alleles[["MP1", "MP2"]] = alleles["motif_purity"].str.split(
",",
expand=True,
)
# split average methylation column
alleles[["AM1", "AM2"]] = alleles["avg_methylation"].str.split(
",",
expand=True,
)
except:
# on Y chromosome, MP is a single float value
# for simplicity, just encode MP1/MP2 and AM1/AM2 as the same value
alleles["MP1"] = alleles["motif_purity"]
alleles["MP2"] = alleles["motif_purity"]
alleles["AM1"] = alleles["avg_methylation"]
alleles["AM2"] = alleles["avg_methylation"]

alleles = alleles.replace(".", np.nan)
alleles = alleles.astype({"MP1": float, "MP2": float})
alleles = alleles.astype({"AM1": float, "AM2": float})

# calculate mean motify purity and average methylation at locus
# should we separate these out in the future (i.e. for short and long alleles)?
alleles["MP"] = alleles[["MP1", "MP2"]].mean(axis=1)
alleles["AM"] = alleles[["AM1", "AM2"]].mean(axis=1)

# group alleles by tandem repeat ID and extract statistics
print("Grouping alleles")
grouped_alleles = alleles.groupby("trid").agg(
{
"short_allele_len": ["mean", "std"],
"long_allele_len": ["mean", "std"],
"MP": ["mean", "std"],
"AM": ["mean", "std"],
}
)
grouped_alleles = grouped_alleles.reset_index()

# calculate upper threshold for downstream outlier processing
print("Computing quantiles")
grouped_alleles["cutoff_short"], grouped_alleles["range_short"] = zip(
*grouped_alleles["trid"].map(
lambda x: get_quantiles_cutoffs(x, "short_allele_len", alleles)
)
)
grouped_alleles["cutoff_long"], grouped_alleles["range_long"] = zip(
*grouped_alleles["trid"].map(
lambda x: get_quantiles_cutoffs(x, "long_allele_len", alleles)
)
)

grouped_alleles = grouped_alleles.round(2)

columns = [
"trid",
"short_allele_len_mean",
"short_allele_len_std",
"long_allele_len_mean",
"long_allele_len_std",
"MP_mean",
"MP_std",
"AM_mean",
"AM_std",
"cutoff_short",
"range_short",
"cutoff_long",
"range_long",
]

grouped_alleles.to_csv(
outpath, sep="\t", compression="gzip", header=columns, index=False
)
25 changes: 15 additions & 10 deletions scripts/generate_allele_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import gzip

# https://github.com/egor-dolzhenko/discover-expansions/blob/main/discover-expansions.ipynb
# MC: added motif purity and methylation

def skip_header(file_handle: gzip.GzipFile, prefix: bytes = b'#') -> None:
last_pos = file_handle.tell()
Expand All @@ -14,12 +15,12 @@ def skip_header(file_handle: gzip.GzipFile, prefix: bytes = b'#') -> None:

def get_alleles(path: PosixPath):
gt_actions = {
"0/0": lambda: (trid, [ref, ref]),
"0/1": lambda: (trid, [ref, alt]),
"1/2": lambda: (trid, alt.split(",")),
"1/1": lambda: (trid, [alt, alt]),
"1": lambda: (trid, [alt]),
"0": lambda: (trid, [ref])
"0/0": lambda: (trid, motif_purity, avg_methylation, [ref, ref]),
"0/1": lambda: (trid, motif_purity, avg_methylation, [ref, alt]),
"1/2": lambda: (trid, motif_purity, avg_methylation, alt.split(",")),
"1/1": lambda: (trid, motif_purity, avg_methylation, [alt, alt]),
"1": lambda: (trid, motif_purity, avg_methylation, [alt]),
"0": lambda: (trid, motif_purity, avg_methylation, [ref])
}

with gzip.open(path, 'r') as f_in:
Expand All @@ -37,8 +38,12 @@ def get_alleles(path: PosixPath):

ref, alt = sl[3], sl[4]
trid = sl[-3].split(";")[0].lstrip("TRID=")
motif = sl[-3].split(";")[2].lstrip("MOTIFS=")
trid = trid
motif_purity = sl[-1].split(":")[-2]
avg_methylation = sl[-1].split(":")[-1]

yield gt_actions[gt]()
yield gt_actions[gt]()

def main(vcf_path, out_filepath):
if not os.path.exists("repeat_outliers"):
Expand All @@ -49,9 +54,9 @@ def main(vcf_path, out_filepath):
with gzip.open(out_filepath, "w", compresslevel=6) as f_out:
for index, path in enumerate(vcf_path.glob("*.vcf.gz")):
sample = path.name.rstrip(".vcf.gz")
for (trid, alleles) in get_alleles(path):
for (trid, motif_purity, avg_methylation, alleles) in get_alleles(path):
alleles = ",".join(alleles)
f_out.write(f"{trid} {sample} {alleles}\n".encode())
f_out.write(f"{trid} {sample} {motif_purity} {avg_methylation} {alleles}\n".encode())


if __name__ == "__main__":
Expand All @@ -62,7 +67,7 @@ def main(vcf_path, out_filepath):
"--vcf_path",
type=str,
required=True,
help="Path to TRGT VCFs for cases and controls",
help="Path to TRGT VCFs",
)
parser.add_argument(
"--output_file",
Expand Down