Refactored test dir. Added length subcommand.

logsdon-lab · May 15, 2024 · 4e6f882 · 4e6f882
1 parent ed68170
commit 4e6f882
Show file tree

Hide file tree

Showing 37 changed files with 375 additions and 76 deletions.
diff --git a/.gitattributes b/.gitattributes
@@ -1,14 +1,15 @@
-test/input/chr9_cens_partials.fa.out filter=lfs diff=lfs merge=lfs -text
-test/input/chm13_chm1_cens_v21.trimmed.fa.noheader.out filter=lfs diff=lfs merge=lfs -text
-test/input/chr21_cens.fa.out filter=lfs diff=lfs merge=lfs -text
-test/input/chr21_chr13_cens_mismap.fa.out filter=lfs diff=lfs merge=lfs -text
-test/input/chr22_cens.fa.out filter=lfs diff=lfs merge=lfs -text
-test/input/chr4_cens_partials.fa.out filter=lfs diff=lfs merge=lfs -text
-test/input/chr4_cens_partials.png filter=lfs diff=lfs merge=lfs -text
-test/input/chr9_cens_partials.png filter=lfs diff=lfs merge=lfs -text
-test/input/chm13_chm1_cens_v21.trimmed.fa.noheader.out.png filter=lfs diff=lfs merge=lfs -text
-test/input/chr21_cens_false_neg_mismap.png filter=lfs diff=lfs merge=lfs -text
-test/input/chr21_cens_false_neg_mismap.fa.out filter=lfs diff=lfs merge=lfs -text
-test/input/chr21_cens.png filter=lfs diff=lfs merge=lfs -text
-test/input/chr21_chr13_cens_mismap.png filter=lfs diff=lfs merge=lfs -text
-test/input/chr22_cens.png filter=lfs diff=lfs merge=lfs -text
+test/length/input/AS-HOR-vs-chm13_cens_v18.stv_row.all.bed filter=lfs diff=lfs merge=lfs -text
+test/status/input/chr9_cens_partials.png filter=lfs diff=lfs merge=lfs -text
+test/status/input/chr21_cens_false_neg_mismap.fa.out filter=lfs diff=lfs merge=lfs -text
+test/status/input/chr21_chr13_cens_mismap.fa.out filter=lfs diff=lfs merge=lfs -text
+test/status/input/chr22_cens.png filter=lfs diff=lfs merge=lfs -text
+test/status/input/chr4_cens_partials.fa.out filter=lfs diff=lfs merge=lfs -text
+test/status/input/chr4_cens_partials.png filter=lfs diff=lfs merge=lfs -text
+test/status/input/chr21_cens_false_neg_mismap.png filter=lfs diff=lfs merge=lfs -text
+test/status/input/chr21_cens.fa.out filter=lfs diff=lfs merge=lfs -text
+test/status/input/chr22_cens.fa.out filter=lfs diff=lfs merge=lfs -text
+test/status/input/chr21_chr13_cens_mismap.png filter=lfs diff=lfs merge=lfs -text
+test/status/input/chr9_cens_partials.fa.out filter=lfs diff=lfs merge=lfs -text
+test/status/input/chm13_chm1_cens_v21.trimmed.fa.noheader.out filter=lfs diff=lfs merge=lfs -text
+test/status/input/chm13_chm1_cens_v21.trimmed.fa.noheader.out.png filter=lfs diff=lfs merge=lfs -text
+test/status/input/chr21_cens.png filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
@@ -6,6 +6,8 @@ Centromere statistics toolkit.
 
 * `status`
     * Determine the status of centromeric contigs based on [`RepeatMasker`](https://www.repeatmasker.org/) annotations.
+* `length`
+    * Estimate HOR array length from [`stv`](https://github.com/fedorrik/stv) bed file and [`HumAS-HMMER`](https://github.com/fedorrik/HumAS-HMMER_for_AnVIL) output.
 
 ### Setup
 ```bash
@@ -14,15 +16,15 @@ pip install censtats
 
 ### Usage
 ```bash
-usage: censtats [-h] {status} ...
+usage: censtats [-h] {status,length} ...
 
-Centromere statistics tool kit.
+Centromere statistics toolkit.
 
 positional arguments:
-  {status}
+  {status,length}
 
 options:
-  -h, --help  show this help message and exit
+  -h, --help       show this help message and exit
 ```
 
 Read the docs [here](https://github.com/logsdon-lab/CenStats/wiki/Usage).

diff --git a/censtats/length/__init__.py b/censtats/length/__init__.py
diff --git a/censtats/length/cli.py b/censtats/length/cli.py
@@ -0,0 +1,213 @@
+import re
+import sys
+import argparse
+import polars as pl
+
+from typing import TYPE_CHECKING, Any
+
+from .constants import (
+    RGX_CHR,
+    DEF_BP_JUMP_LEN_THR,
+    DEF_ARR_LEN_THR,
+    HOR_BP_LEN,
+    DEF_EXP_STV_ROW_BED_COLS,
+    DEF_OUTPUT_BED_COLS,
+)
+
+
+if TYPE_CHECKING:
+    SubArgumentParser = argparse._SubParsersAction[argparse.ArgumentParser]
+else:
+    SubArgumentParser = Any
+
+
+def add_hor_length_cli(parser: SubArgumentParser) -> None:
+    ap = parser.add_parser(
+        "length",
+        description="Estimate HOR array length from stv bed file / HumAS-HMMER output.",
+    )
+    ap.add_argument(
+        "-i",
+        "--input",
+        help=f"Input stv row bed file produced by HumAS-HMMER and stv. Expects columns: {DEF_EXP_STV_ROW_BED_COLS}",
+        type=str,
+    )
+    ap.add_argument(
+        "-o",
+        "--output",
+        help=f"Output bed file with columns: {DEF_OUTPUT_BED_COLS}",
+        default=sys.stdout,
+        type=argparse.FileType("wt"),
+    )
+    ap.add_argument(
+        "--bp_jump_thr",
+        help="Base pair jump threshold to group by",
+        type=int,
+        default=DEF_BP_JUMP_LEN_THR,
+    )
+    ap.add_argument(
+        "--arr_len_thr",
+        help="Length threshold to filter out.",
+        type=int,
+        default=DEF_ARR_LEN_THR,
+    )
+    return None
+
+
+def calculate_hor_length(
+    infile: str, bp_jump_thr: int, arr_len_thr: int, output: str
+) -> int:
+    """
+    Calculate HOR array length from HumAS-HMMER structural variation row output.
+
+    ### Parameters
+    `infile`
+        Input bed file made from HumAS-HMMER output.
+        Expects the following columns: `{chr, start, stop, hor, 0, strand, ...}`.
+    `bp_jump_thr`
+        Base pair jump threshold to group by.
+    `arr_len_thr`
+        Length threshold of HOR array to filter out.
+    `output`
+        Output bed file with HOR array lengths.
+        Columns: `{chr_name, start_pos, stop_pos, len}`.
+
+    ### Returns
+    0 if successful.
+    """
+    df = pl.read_csv(
+        infile,
+        separator="\t",
+        columns=[0, 1, 2, 3, 4, 5],
+        new_columns=DEF_EXP_STV_ROW_BED_COLS,
+        has_header=False,
+    )
+
+    dfs = []
+    for ctg_name, df_chr in df.group_by(["chr"], maintain_order=True):
+        df_chr = df_chr.with_columns(len=pl.col("stop") - pl.col("start")).with_columns(
+            mer=(pl.col("len") / HOR_BP_LEN).round()
+        )
+        ctg_name = ctg_name[0]
+        mtch_chr_name = re.search(RGX_CHR, ctg_name)
+        if mtch_chr_name is None:
+            continue
+
+        chr_name = mtch_chr_name.group(0)
+        df_live_hor = df_chr.filter(pl.col("hor").str.contains("L"))
+
+        # Specific edge case for chr8.
+        if chr_name == "chr8" or chr_name == "chr10" or chr_name == "chr16":
+            bp_jump_thr = 10_000
+        elif chr_name == "chrY":
+            bp_jump_thr = 2_000
+        else:
+            bp_jump_thr = bp_jump_thr
+
+        df_bp_jumps = df_live_hor.with_columns(
+            diff=pl.col("start") - pl.col("stop").shift(1)
+        ).filter(pl.col("diff") > bp_jump_thr)
+
+        if df_bp_jumps.is_empty():
+            adj_start = df_live_hor.get_column("start").min()
+            adj_stop = df_live_hor.get_column("stop").max()
+            adj_len = adj_stop - adj_start
+
+            if adj_len < arr_len_thr:
+                continue
+
+            dfs.append(
+                pl.DataFrame(
+                    {
+                        "chr_name": ctg_name,
+                        "start_pos": adj_start,
+                        "stop_pos": adj_stop,
+                        "len": adj_len,
+                    }
+                )
+            )
+            continue
+
+        starts, stops = [], []
+        for i, row in enumerate(df_bp_jumps.iter_rows()):
+            prev_row = pl.DataFrame() if i == 0 else df_bp_jumps.slice(i - 1)
+            next_row = df_bp_jumps.slice(i + 1)
+
+            if prev_row.is_empty():
+                starts.append(df_chr.get_column("start").min())
+                stops.append(
+                    df_chr.filter(pl.col("start") < row[1]).row(-1, named=True)["stop"]
+                )
+
+            if next_row.is_empty():
+                starts.append(row[1])
+                stops.append(df_chr.get_column("stop").max())
+            else:
+                starts.append(row[1])
+                stops.append(
+                    df_chr.filter(
+                        pl.col("start") < next_row.get_column("start")[0]
+                    ).row(-1, named=True)["stop"]
+                )
+
+        lens = []
+        chr_mer_filter = None
+        if chr_name == "chr10" or chr_name == "chr20":
+            chr_mer_filter = pl.col("mer") >= 5
+        elif chr_name == "chrY":
+            chr_mer_filter = pl.col("mer") >= 30
+        elif chr_name == "chr17":
+            chr_mer_filter = pl.col("mer") >= 4
+
+        for start, stop in zip(starts, stops):
+            df_slice = (
+                df_chr.filter(pl.col("start") >= start, pl.col("stop") <= stop)
+                .with_columns(bp_jump=pl.col("start") - pl.col("stop").shift(1))
+                .fill_null(0)
+            )
+            # Filter out mers based on chr.
+            if chr_mer_filter is not None:
+                df_slice = df_slice.filter(chr_mer_filter)
+
+            if df_slice.is_empty():
+                lens.append(0)
+                continue
+            df_slice_dst = (
+                # df_slice.with_columns(len=pl.col("stop") - pl.col("start")).get_column("len").sum()
+                df_slice.get_column("stop").max() - df_slice.get_column("start").min()
+            )
+            lens.append(df_slice_dst)
+
+        lf = pl.LazyFrame(
+            {
+                "chr_name": ctg_name,
+                "start_pos": starts,
+                "stop_pos": stops,
+                "len": lens,
+            }
+        )
+        if (
+            chr_name == "chr8"
+            or chr_name == "chr10"
+            or chr_name == "chr17"
+            or chr_name == "chrY"
+        ):
+            arr_len_thr = 100_000
+        else:
+            arr_len_thr = arr_len_thr
+
+        dfs.append(lf.filter(pl.col("len") > arr_len_thr).collect())
+
+    df_all_dsts: pl.DataFrame = pl.concat(dfs)
+    (
+        df_all_dsts.with_columns(
+            sort_idx=pl.col("chr_name")
+            .str.extract("chr([0-9XY]+)")
+            .replace({"X": "23", "Y": "24"})
+            .cast(pl.Int32)
+        )
+        .sort(by="sort_idx")
+        .select(DEF_OUTPUT_BED_COLS)
+        .write_csv(output, include_header=False, separator="\t")
+    )
+    return 0
diff --git a/censtats/length/constants.py b/censtats/length/constants.py
@@ -0,0 +1,9 @@
+import re
+
+
+RGX_CHR = re.compile(r"chr[0-9XY]{1,2}")
+HOR_BP_LEN = 170
+DEF_ARR_LEN_THR = 30_000
+DEF_BP_JUMP_LEN_THR = 100_000
+DEF_EXP_STV_ROW_BED_COLS = ["chr", "start", "stop", "hor", "other", "strand"]
+DEF_OUTPUT_BED_COLS = ["chr_name", "start_pos", "stop_pos", "len"]
diff --git a/censtats/main.py b/censtats/main.py
@@ -2,6 +2,7 @@
 from typing import Any, TYPE_CHECKING
 
 from .status.cli import add_status_cli, check_cens_status
+from .length.cli import add_hor_length_cli, calculate_hor_length
 
 if TYPE_CHECKING:
     SubArgumentParser = argparse._SubParsersAction[argparse.ArgumentParser]
@@ -13,6 +14,7 @@ def main() -> int:
     ap = argparse.ArgumentParser(description="Centromere statistics toolkit.")
     sub_ap = ap.add_subparsers(dest="cmd")
     add_status_cli(sub_ap)
+    add_hor_length_cli(sub_ap)
 
     args = ap.parse_args()
 
@@ -30,7 +32,9 @@ def main() -> int:
             restrict_14_22=args.restrict_14_22,
         )
     elif args.cmd == "length":
-        raise NotImplementedError("Length command not implemented.")
+        return calculate_hor_length(
+            args.input, args.bp_jump_thr, args.arr_len_thr, args.output
+        )
     else:
         raise ValueError(f"Unknown command: {args.cmd}")
 

diff --git a/test/helpers/__init__.py b/test/helpers/__init__.py
diff --git a/test/helpers/integration.py b/test/helpers/integration.py
@@ -0,0 +1,17 @@
+import subprocess
+
+
+def run_integration_test(*args: str, expected_output: str) -> None:
+    process = subprocess.run(
+        [*args],
+        capture_output=True,
+        check=True,
+    )
+    res = sorted(
+        line.strip().split("\t") for line in process.stdout.decode().split("\n") if line
+    )
+    with open(expected_output, "rt") as exp_res_fh:
+        exp_res = sorted(
+            line.strip().split("\t") for line in exp_res_fh.readlines() if line
+        )
+        assert res == exp_res
diff --git a/test/length/__init__.py b/test/length/__init__.py
diff --git a/test/length/expected/AS-HOR_chm13_lengths.bed b/test/length/expected/AS-HOR_chm13_lengths.bed
@@ -0,0 +1,26 @@
+chr1:121100001-127300000	121796049	126300488	4504439
+chr2:91800001-95600000	92316506	94672686	2356180
+chr3:89850001-97000000	91735802	92595822	860020
+chr3:89850001-97000000	95863533	96414943	551410
+chr4:49200001-55800000	49705154	50433573	728419
+chr4:49200001-55800000	52115487	54870511	2755024
+chr4:49200001-55800000	54980290	55199795	219505
+chr5:45650001-51600000	47077202	49596627	2519425
+chr6:57750001-63100000	58286707	61058391	2771684
+chr7:57650001-64700000	60414373	63714498	3300125
+chr8:43350001-47450000	44243546	46325081	2081535
+chr9:44200001-48100000	44952797	47585318	2632521
+chr10:38500001-42550000	39631089	41664590	2030117
+chr11:48300001-55700000	51023872	54413487	3389615
+chr12:33800001-38500000	34620840	37203843	2583003
+chr13:10650001-18100000	15547595	17509783	1962188
+chr14:5600001-13300000	10089210	12708411	2619201
+chr15:13500001-18250000	16677773	17694129	1016356
+chr16:32400001-38950000	35848276	37829524	1981248
+chr17:22850001-28650000	23892422	27488300	3595878
+chr18:15050001-21650000	15966042	20933551	4967509
+chr19:23850001-30750000	25808855	29768170	3959315
+chr20:25800001-32500000	26925855	29099656	2173801
+chr21:7700001-11850000	10962852	11306205	343353
+chr22:8000001-17400000	12786986	15711065	2924079
+chrX:56950001-61750000	57818585	60927025	3108440
diff --git a/test/length/input/AS-HOR-vs-chm13_cens_v18.stv_row.all.bed b/test/length/input/AS-HOR-vs-chm13_cens_v18.stv_row.all.bed
diff --git a/test/length/test_integration.py b/test/length/test_integration.py
@@ -0,0 +1,27 @@
+import pytest
+
+from test.helpers.integration import run_integration_test
+
+
+@pytest.mark.parametrize(
+    ["input_stv_row_bed", "expected_hor_len_bed"],
+    [
+        (
+            "test/length/input/AS-HOR-vs-chm13_cens_v18.stv_row.all.bed",
+            "test/length/expected/AS-HOR_chm13_lengths.bed",
+        ),
+    ],
+)
+def test_check_cens_status(
+    input_stv_row_bed: str,
+    expected_hor_len_bed: str,
+):
+    run_integration_test(
+        "python",
+        "-m",
+        "censtats.main",
+        "length",
+        "-i",
+        input_stv_row_bed,
+        expected_output=expected_hor_len_bed,
+    )
diff --git a/test/status/__init__.py b/test/status/__init__.py
diff --git a/test/expected/correct_chr21_cens.tsv → test/status/expected/correct_chr21_cens.tsv b/test/expected/correct_chr21_cens.tsv → test/status/expected/correct_chr21_cens.tsv
diff --git a/...d/correct_chr21_cens_false_neg_mismap.tsv → ...d/correct_chr21_cens_false_neg_mismap.tsv b/...d/correct_chr21_cens_false_neg_mismap.tsv → ...d/correct_chr21_cens_false_neg_mismap.tsv
diff --git a/...ected/correct_chr21_chr13_cens_mismap.tsv → ...ected/correct_chr21_chr13_cens_mismap.tsv b/...ected/correct_chr21_chr13_cens_mismap.tsv → ...ected/correct_chr21_chr13_cens_mismap.tsv
diff --git a/test/expected/correct_chr22_cens.tsv → test/status/expected/correct_chr22_cens.tsv b/test/expected/correct_chr22_cens.tsv → test/status/expected/correct_chr22_cens.tsv
diff --git a/test/expected/correct_chr4_cens_partials.tsv → ...s/expected/correct_chr4_cens_partials.tsv b/test/expected/correct_chr4_cens_partials.tsv → ...s/expected/correct_chr4_cens_partials.tsv
diff --git a/test/expected/correct_chr9_cens_partials.tsv → ...s/expected/correct_chr9_cens_partials.tsv b/test/expected/correct_chr9_cens_partials.tsv → ...s/expected/correct_chr9_cens_partials.tsv
diff --git a/...m13_chm1_cens_v21.trimmed.fa.noheader.out → ...m13_chm1_cens_v21.trimmed.fa.noheader.out b/...m13_chm1_cens_v21.trimmed.fa.noheader.out → ...m13_chm1_cens_v21.trimmed.fa.noheader.out
diff --git a/...chm1_cens_v21.trimmed.fa.noheader.out.png → ...chm1_cens_v21.trimmed.fa.noheader.out.png b/...chm1_cens_v21.trimmed.fa.noheader.out.png → ...chm1_cens_v21.trimmed.fa.noheader.out.png
diff --git a/test/input/chr21_cens.fa.out → test/status/input/chr21_cens.fa.out b/test/input/chr21_cens.fa.out → test/status/input/chr21_cens.fa.out
diff --git a/test/input/chr21_cens.png → test/status/input/chr21_cens.png b/test/input/chr21_cens.png → test/status/input/chr21_cens.png
diff --git a/.../input/chr21_cens_false_neg_mismap.fa.out → .../input/chr21_cens_false_neg_mismap.fa.out b/.../input/chr21_cens_false_neg_mismap.fa.out → .../input/chr21_cens_false_neg_mismap.fa.out
diff --git a/test/input/chr21_cens_false_neg_mismap.png → ...tus/input/chr21_cens_false_neg_mismap.png b/test/input/chr21_cens_false_neg_mismap.png → ...tus/input/chr21_cens_false_neg_mismap.png
diff --git a/test/input/chr21_chr13_cens_mismap.fa.out → ...atus/input/chr21_chr13_cens_mismap.fa.out b/test/input/chr21_chr13_cens_mismap.fa.out → ...atus/input/chr21_chr13_cens_mismap.fa.out
diff --git a/test/input/chr21_chr13_cens_mismap.png → .../status/input/chr21_chr13_cens_mismap.png b/test/input/chr21_chr13_cens_mismap.png → .../status/input/chr21_chr13_cens_mismap.png
diff --git a/test/input/chr22_cens.fa.out → test/status/input/chr22_cens.fa.out b/test/input/chr22_cens.fa.out → test/status/input/chr22_cens.fa.out
diff --git a/test/input/chr22_cens.png → test/status/input/chr22_cens.png b/test/input/chr22_cens.png → test/status/input/chr22_cens.png
diff --git a/test/input/chr4_cens_partials.fa.out → test/status/input/chr4_cens_partials.fa.out b/test/input/chr4_cens_partials.fa.out → test/status/input/chr4_cens_partials.fa.out
diff --git a/test/input/chr4_cens_partials.png → test/status/input/chr4_cens_partials.png b/test/input/chr4_cens_partials.png → test/status/input/chr4_cens_partials.png
diff --git a/test/input/chr9_cens_partials.fa.out → test/status/input/chr9_cens_partials.fa.out b/test/input/chr9_cens_partials.fa.out → test/status/input/chr9_cens_partials.fa.out
diff --git a/test/input/chr9_cens_partials.png → test/status/input/chr9_cens_partials.png b/test/input/chr9_cens_partials.png → test/status/input/chr9_cens_partials.png
diff --git a/test/plot_inputs.sh → test/status/plot_inputs.sh b/test/plot_inputs.sh → test/status/plot_inputs.sh
diff --git a/test/plot_repeatStructure_onlyRM.R → test/status/plot_repeatStructure_onlyRM.R b/test/plot_repeatStructure_onlyRM.R → test/status/plot_repeatStructure_onlyRM.R