ccmbioinfo · Madelinehazel · May 9, 2024 · May 23, 2024 · May 23, 2024 · Jun 19, 2024
diff --git a/rules/outlier_expansions.smk b/rules/outlier_expansions.smk
@@ -6,7 +6,7 @@ rule generate_allele_db:
         family = config["run"]["project"]
     log: "logs/repeat_outliers/{family}_allele_db.log"
     resources:
-        mem_mb = 10000
+        mem_mb = 80000
     conda:
         "../envs/str_sv.yaml"
     shell: 
@@ -20,6 +20,8 @@ rule sort_allele_db:
         input_file = "repeat_outliers/allele_db/{family}.alleles.db.gz"
     output: "repeat_outliers/allele_db/{family}.alleles.sorted.db.gz"
     log: "logs/repeat_outliers/{family}.allele.sorted.db.log"
+    resources:
+        mem_mb = 80000
     shell:
         """
         (zcat < {input.input_file} | sort -T . -k 1,1 | gzip > {output}) > {log} 2>&1
@@ -35,7 +37,7 @@ rule find_repeat_outliers:
         crg2_pacbio = config["tools"]["crg2_pacbio"]
     log:  "logs/repeat_outliers/{family}.repeat.outliers.log"
     resources:
-        mem_mb = 20000
+        mem_mb = 40000
     conda: 
         "../envs/str_sv.yaml"
     shell: 

diff --git a/scripts/allele_distribution_db.py b/scripts/allele_distribution_db.py
@@ -0,0 +1,150 @@
+import argparse
+import os
+import pandas as pd
+import numpy as np
+import sys
+
+
+def resample_quantiles(counts: pd.Series, num_resamples: int) -> list:
+    """Based on https://github.com/Illumina/ExpansionHunterDenovo/blob/master/scripts/core/common.py"""
+    resamples = np.random.choice(counts, len(counts) * num_resamples)
+    resamples = np.split(resamples, num_resamples)
+
+    resampled_quantiles = []
+    for resample in resamples:
+        quantile = np.quantile(resample, 0.95)
+        resampled_quantiles.append(quantile)
+
+    return resampled_quantiles
+
+
+def get_counts_with_finite_std(counts: pd.Series) -> pd.Series:
+    if len(set(counts)) == 1:
+        return counts[:-1] + [counts[-1] + 0.1]
+    return counts
+
+
+def get_cutoff(quantiles: list) -> str:
+    mean = np.mean(quantiles)
+    std = max(1, np.std(quantiles))
+    cutoff = mean + std
+    return cutoff
+
+
+def split_alleles(alleles: pd.DataFrame) -> tuple:
+    """
+    Split alleles by comma and return lengths of shortest and longest alleles.
+    Note that there may be more than two alleles called by TRGT at a locus in an individual (somaticism?)
+    """
+    allele_lens = [len(a) for a in alleles.split(",")]
+    short_allele_len = min(allele_lens)
+    long_allele_len = max(allele_lens)
+
+    return short_allele_len, long_allele_len
+
+
+def get_quantiles_cutoffs(trid: str, allele_type: str, alleles: pd.DataFrame) -> tuple:
+    """
+    Calculate the a length cutoff for a particular tandem repeat locus as defined by the mean plus std of 95% quantile after resampling
+    """
+    alleles_trid = alleles[alleles["trid"] == trid]
+    min = alleles_trid[allele_type].min()
+    max = alleles_trid[allele_type].max()
+    quantiles = resample_quantiles(alleles_trid[allele_type], 100)
+    quantiles = get_counts_with_finite_std(quantiles)
+    cutoff_short = get_cutoff(quantiles)
+
+    return cutoff_short, [min, max]
+
+
+print("Loading allele db")
+alleles = sys.argv[1]
+dirname = os.path.dirname(alleles)
+outfile = os.path.basename(alleles).replace(".db", ".distribution.tsv.gz")
+outpath = os.path.join(dirname, outfile)
+alleles = pd.read_csv(
+    alleles,
+    header=None,
+    names=["trid", "sample", "motif_purity", "avg_methylation", "alleles"],
+    sep=" ",
+)
+
+# get length of shortest and longest alleles
+alleles["short_allele_len"], alleles["long_allele_len"] = zip(
+    *alleles["alleles"].map(lambda x: split_alleles(x))
+)
+
+try:
+    # split motif purity column
+    alleles[["MP1", "MP2"]] = alleles["motif_purity"].str.split(
+        ",",
+        expand=True,
+    )
+    # split average methylation column
+    alleles[["AM1", "AM2"]] = alleles["avg_methylation"].str.split(
+        ",",
+        expand=True,
+    )
+except:
+    # on Y chromosome, MP is a single float value
+    # for simplicity, just encode MP1/MP2 and AM1/AM2 as the same value
+    alleles["MP1"] = alleles["motif_purity"]
+    alleles["MP2"] = alleles["motif_purity"]
+    alleles["AM1"] = alleles["avg_methylation"]
+    alleles["AM2"] = alleles["avg_methylation"]
+
+alleles = alleles.replace(".", np.nan)
+alleles = alleles.astype({"MP1": float, "MP2": float})
+alleles = alleles.astype({"AM1": float, "AM2": float})
+
+# calculate mean motify purity and average methylation at locus
+# should we separate these out in the future (i.e. for short and long alleles)?
+alleles["MP"] = alleles[["MP1", "MP2"]].mean(axis=1)
+alleles["AM"] = alleles[["AM1", "AM2"]].mean(axis=1)
+
+# group alleles by tandem repeat ID and extract statistics
+print("Grouping alleles")
+grouped_alleles = alleles.groupby("trid").agg(
+    {
+        "short_allele_len": ["mean", "std"],
+        "long_allele_len": ["mean", "std"],
+        "MP": ["mean", "std"],
+        "AM": ["mean", "std"],
+    }
+)
+grouped_alleles = grouped_alleles.reset_index()
+
+# calculate upper threshold for downstream outlier processing
+print("Computing quantiles")
+grouped_alleles["cutoff_short"], grouped_alleles["range_short"] = zip(
+    *grouped_alleles["trid"].map(
+        lambda x: get_quantiles_cutoffs(x, "short_allele_len", alleles)
+    )
+)
+grouped_alleles["cutoff_long"], grouped_alleles["range_long"] = zip(
+    *grouped_alleles["trid"].map(
+        lambda x: get_quantiles_cutoffs(x, "long_allele_len", alleles)
+    )
+)
+
+grouped_alleles = grouped_alleles.round(2)
+
+columns = [
+    "trid",
+    "short_allele_len_mean",
+    "short_allele_len_std",
+    "long_allele_len_mean",
+    "long_allele_len_std",
+    "MP_mean",
+    "MP_std",
+    "AM_mean",
+    "AM_std",
+    "cutoff_short",
+    "range_short",
+    "cutoff_long",
+    "range_long",
+]
+
+grouped_alleles.to_csv(
+    outpath, sep="\t", compression="gzip", header=columns, index=False
+)
diff --git a/scripts/generate_allele_db.py b/scripts/generate_allele_db.py
@@ -5,6 +5,7 @@
 import gzip
 
 # https://github.com/egor-dolzhenko/discover-expansions/blob/main/discover-expansions.ipynb
+# MC: added motif purity and methylation
 
 def skip_header(file_handle: gzip.GzipFile, prefix: bytes = b'#') -> None:
     last_pos = file_handle.tell()
@@ -14,12 +15,12 @@ def skip_header(file_handle: gzip.GzipFile, prefix: bytes = b'#') -> None:
 
 def get_alleles(path: PosixPath):    
     gt_actions = {
-        "0/0": lambda: (trid, [ref, ref]),
-        "0/1": lambda: (trid, [ref, alt]),
-        "1/2": lambda: (trid, alt.split(",")),
-        "1/1": lambda: (trid, [alt, alt]),
-        "1": lambda: (trid, [alt]),
-        "0": lambda: (trid, [ref])
+        "0/0": lambda: (trid, motif_purity, avg_methylation, [ref, ref]),
+        "0/1": lambda: (trid, motif_purity, avg_methylation, [ref, alt]),
+        "1/2": lambda: (trid, motif_purity, avg_methylation, alt.split(",")),
+        "1/1": lambda: (trid, motif_purity, avg_methylation, [alt, alt]),
+        "1": lambda: (trid, motif_purity, avg_methylation, [alt]),
+        "0": lambda: (trid, motif_purity, avg_methylation, [ref])
     }
 
     with gzip.open(path, 'r') as f_in:
@@ -37,8 +38,12 @@ def get_alleles(path: PosixPath):
 
             ref, alt = sl[3], sl[4]               
             trid = sl[-3].split(";")[0].lstrip("TRID=")
+            motif = sl[-3].split(";")[2].lstrip("MOTIFS=")
+            trid = trid 
+            motif_purity = sl[-1].split(":")[-2]
+            avg_methylation = sl[-1].split(":")[-1]
 
-            yield gt_actions[gt]()        
+            yield gt_actions[gt]()          
 
 def main(vcf_path, out_filepath):     
     if not os.path.exists("repeat_outliers"):
@@ -49,9 +54,9 @@ def main(vcf_path, out_filepath):
     with gzip.open(out_filepath, "w", compresslevel=6) as f_out:
         for index, path in enumerate(vcf_path.glob("*.vcf.gz")):    
             sample = path.name.rstrip(".vcf.gz")
-            for (trid, alleles) in get_alleles(path):        
+            for (trid, motif_purity, avg_methylation, alleles) in get_alleles(path):        
                 alleles = ",".join(alleles)
-                f_out.write(f"{trid} {sample} {alleles}\n".encode())  
+                f_out.write(f"{trid} {sample} {motif_purity} {avg_methylation} {alleles}\n".encode())  
 
 
 if __name__ == "__main__":
@@ -62,7 +67,7 @@ def main(vcf_path, out_filepath):
         "--vcf_path",
         type=str,
         required=True,
-        help="Path to TRGT VCFs for cases and controls",
+        help="Path to TRGT VCFs",
     )
     parser.add_argument(
         "--output_file",