Skip to content

Commit

Permalink
Refactored test dir. Added length subcommand.
Browse files Browse the repository at this point in the history
  • Loading branch information
Keisuke Oshima committed May 15, 2024
1 parent ed68170 commit 4e6f882
Show file tree
Hide file tree
Showing 37 changed files with 375 additions and 76 deletions.
29 changes: 15 additions & 14 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
test/input/chr9_cens_partials.fa.out filter=lfs diff=lfs merge=lfs -text
test/input/chm13_chm1_cens_v21.trimmed.fa.noheader.out filter=lfs diff=lfs merge=lfs -text
test/input/chr21_cens.fa.out filter=lfs diff=lfs merge=lfs -text
test/input/chr21_chr13_cens_mismap.fa.out filter=lfs diff=lfs merge=lfs -text
test/input/chr22_cens.fa.out filter=lfs diff=lfs merge=lfs -text
test/input/chr4_cens_partials.fa.out filter=lfs diff=lfs merge=lfs -text
test/input/chr4_cens_partials.png filter=lfs diff=lfs merge=lfs -text
test/input/chr9_cens_partials.png filter=lfs diff=lfs merge=lfs -text
test/input/chm13_chm1_cens_v21.trimmed.fa.noheader.out.png filter=lfs diff=lfs merge=lfs -text
test/input/chr21_cens_false_neg_mismap.png filter=lfs diff=lfs merge=lfs -text
test/input/chr21_cens_false_neg_mismap.fa.out filter=lfs diff=lfs merge=lfs -text
test/input/chr21_cens.png filter=lfs diff=lfs merge=lfs -text
test/input/chr21_chr13_cens_mismap.png filter=lfs diff=lfs merge=lfs -text
test/input/chr22_cens.png filter=lfs diff=lfs merge=lfs -text
test/length/input/AS-HOR-vs-chm13_cens_v18.stv_row.all.bed filter=lfs diff=lfs merge=lfs -text
test/status/input/chr9_cens_partials.png filter=lfs diff=lfs merge=lfs -text
test/status/input/chr21_cens_false_neg_mismap.fa.out filter=lfs diff=lfs merge=lfs -text
test/status/input/chr21_chr13_cens_mismap.fa.out filter=lfs diff=lfs merge=lfs -text
test/status/input/chr22_cens.png filter=lfs diff=lfs merge=lfs -text
test/status/input/chr4_cens_partials.fa.out filter=lfs diff=lfs merge=lfs -text
test/status/input/chr4_cens_partials.png filter=lfs diff=lfs merge=lfs -text
test/status/input/chr21_cens_false_neg_mismap.png filter=lfs diff=lfs merge=lfs -text
test/status/input/chr21_cens.fa.out filter=lfs diff=lfs merge=lfs -text
test/status/input/chr22_cens.fa.out filter=lfs diff=lfs merge=lfs -text
test/status/input/chr21_chr13_cens_mismap.png filter=lfs diff=lfs merge=lfs -text
test/status/input/chr9_cens_partials.fa.out filter=lfs diff=lfs merge=lfs -text
test/status/input/chm13_chm1_cens_v21.trimmed.fa.noheader.out filter=lfs diff=lfs merge=lfs -text
test/status/input/chm13_chm1_cens_v21.trimmed.fa.noheader.out.png filter=lfs diff=lfs merge=lfs -text
test/status/input/chr21_cens.png filter=lfs diff=lfs merge=lfs -text
10 changes: 6 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ Centromere statistics toolkit.

* `status`
* Determine the status of centromeric contigs based on [`RepeatMasker`](https://www.repeatmasker.org/) annotations.
* `length`
* Estimate HOR array length from [`stv`](https://github.com/fedorrik/stv) bed file and [`HumAS-HMMER`](https://github.com/fedorrik/HumAS-HMMER_for_AnVIL) output.

### Setup
```bash
Expand All @@ -14,15 +16,15 @@ pip install censtats

### Usage
```bash
usage: censtats [-h] {status} ...
usage: censtats [-h] {status,length} ...

Centromere statistics tool kit.
Centromere statistics toolkit.

positional arguments:
{status}
{status,length}

options:
-h, --help show this help message and exit
-h, --help show this help message and exit
```

Read the docs [here](https://github.com/logsdon-lab/CenStats/wiki/Usage).
Expand Down
Empty file added censtats/length/__init__.py
Empty file.
213 changes: 213 additions & 0 deletions censtats/length/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
import re
import sys
import argparse
import polars as pl

from typing import TYPE_CHECKING, Any

from .constants import (
RGX_CHR,
DEF_BP_JUMP_LEN_THR,
DEF_ARR_LEN_THR,
HOR_BP_LEN,
DEF_EXP_STV_ROW_BED_COLS,
DEF_OUTPUT_BED_COLS,
)


if TYPE_CHECKING:
SubArgumentParser = argparse._SubParsersAction[argparse.ArgumentParser]
else:
SubArgumentParser = Any


def add_hor_length_cli(parser: SubArgumentParser) -> None:
ap = parser.add_parser(
"length",
description="Estimate HOR array length from stv bed file / HumAS-HMMER output.",
)
ap.add_argument(
"-i",
"--input",
help=f"Input stv row bed file produced by HumAS-HMMER and stv. Expects columns: {DEF_EXP_STV_ROW_BED_COLS}",
type=str,
)
ap.add_argument(
"-o",
"--output",
help=f"Output bed file with columns: {DEF_OUTPUT_BED_COLS}",
default=sys.stdout,
type=argparse.FileType("wt"),
)
ap.add_argument(
"--bp_jump_thr",
help="Base pair jump threshold to group by",
type=int,
default=DEF_BP_JUMP_LEN_THR,
)
ap.add_argument(
"--arr_len_thr",
help="Length threshold to filter out.",
type=int,
default=DEF_ARR_LEN_THR,
)
return None


def calculate_hor_length(
infile: str, bp_jump_thr: int, arr_len_thr: int, output: str
) -> int:
"""
Calculate HOR array length from HumAS-HMMER structural variation row output.
### Parameters
`infile`
Input bed file made from HumAS-HMMER output.
Expects the following columns: `{chr, start, stop, hor, 0, strand, ...}`.
`bp_jump_thr`
Base pair jump threshold to group by.
`arr_len_thr`
Length threshold of HOR array to filter out.
`output`
Output bed file with HOR array lengths.
Columns: `{chr_name, start_pos, stop_pos, len}`.
### Returns
0 if successful.
"""
df = pl.read_csv(
infile,
separator="\t",
columns=[0, 1, 2, 3, 4, 5],
new_columns=DEF_EXP_STV_ROW_BED_COLS,
has_header=False,
)

dfs = []
for ctg_name, df_chr in df.group_by(["chr"], maintain_order=True):
df_chr = df_chr.with_columns(len=pl.col("stop") - pl.col("start")).with_columns(
mer=(pl.col("len") / HOR_BP_LEN).round()
)
ctg_name = ctg_name[0]
mtch_chr_name = re.search(RGX_CHR, ctg_name)
if mtch_chr_name is None:
continue

chr_name = mtch_chr_name.group(0)
df_live_hor = df_chr.filter(pl.col("hor").str.contains("L"))

# Specific edge case for chr8.
if chr_name == "chr8" or chr_name == "chr10" or chr_name == "chr16":
bp_jump_thr = 10_000
elif chr_name == "chrY":
bp_jump_thr = 2_000
else:
bp_jump_thr = bp_jump_thr

df_bp_jumps = df_live_hor.with_columns(
diff=pl.col("start") - pl.col("stop").shift(1)
).filter(pl.col("diff") > bp_jump_thr)

if df_bp_jumps.is_empty():
adj_start = df_live_hor.get_column("start").min()
adj_stop = df_live_hor.get_column("stop").max()
adj_len = adj_stop - adj_start

if adj_len < arr_len_thr:
continue

dfs.append(
pl.DataFrame(
{
"chr_name": ctg_name,
"start_pos": adj_start,
"stop_pos": adj_stop,
"len": adj_len,
}
)
)
continue

starts, stops = [], []
for i, row in enumerate(df_bp_jumps.iter_rows()):
prev_row = pl.DataFrame() if i == 0 else df_bp_jumps.slice(i - 1)
next_row = df_bp_jumps.slice(i + 1)

if prev_row.is_empty():
starts.append(df_chr.get_column("start").min())
stops.append(
df_chr.filter(pl.col("start") < row[1]).row(-1, named=True)["stop"]
)

if next_row.is_empty():
starts.append(row[1])
stops.append(df_chr.get_column("stop").max())
else:
starts.append(row[1])
stops.append(
df_chr.filter(
pl.col("start") < next_row.get_column("start")[0]
).row(-1, named=True)["stop"]
)

lens = []
chr_mer_filter = None
if chr_name == "chr10" or chr_name == "chr20":
chr_mer_filter = pl.col("mer") >= 5
elif chr_name == "chrY":
chr_mer_filter = pl.col("mer") >= 30
elif chr_name == "chr17":
chr_mer_filter = pl.col("mer") >= 4

for start, stop in zip(starts, stops):
df_slice = (
df_chr.filter(pl.col("start") >= start, pl.col("stop") <= stop)
.with_columns(bp_jump=pl.col("start") - pl.col("stop").shift(1))
.fill_null(0)
)
# Filter out mers based on chr.
if chr_mer_filter is not None:
df_slice = df_slice.filter(chr_mer_filter)

if df_slice.is_empty():
lens.append(0)
continue
df_slice_dst = (
# df_slice.with_columns(len=pl.col("stop") - pl.col("start")).get_column("len").sum()
df_slice.get_column("stop").max() - df_slice.get_column("start").min()
)
lens.append(df_slice_dst)

lf = pl.LazyFrame(
{
"chr_name": ctg_name,
"start_pos": starts,
"stop_pos": stops,
"len": lens,
}
)
if (
chr_name == "chr8"
or chr_name == "chr10"
or chr_name == "chr17"
or chr_name == "chrY"
):
arr_len_thr = 100_000
else:
arr_len_thr = arr_len_thr

dfs.append(lf.filter(pl.col("len") > arr_len_thr).collect())

df_all_dsts: pl.DataFrame = pl.concat(dfs)
(
df_all_dsts.with_columns(
sort_idx=pl.col("chr_name")
.str.extract("chr([0-9XY]+)")
.replace({"X": "23", "Y": "24"})
.cast(pl.Int32)
)
.sort(by="sort_idx")
.select(DEF_OUTPUT_BED_COLS)
.write_csv(output, include_header=False, separator="\t")
)
return 0
9 changes: 9 additions & 0 deletions censtats/length/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import re


RGX_CHR = re.compile(r"chr[0-9XY]{1,2}")
HOR_BP_LEN = 170
DEF_ARR_LEN_THR = 30_000
DEF_BP_JUMP_LEN_THR = 100_000
DEF_EXP_STV_ROW_BED_COLS = ["chr", "start", "stop", "hor", "other", "strand"]
DEF_OUTPUT_BED_COLS = ["chr_name", "start_pos", "stop_pos", "len"]
6 changes: 5 additions & 1 deletion censtats/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from typing import Any, TYPE_CHECKING

from .status.cli import add_status_cli, check_cens_status
from .length.cli import add_hor_length_cli, calculate_hor_length

if TYPE_CHECKING:
SubArgumentParser = argparse._SubParsersAction[argparse.ArgumentParser]
Expand All @@ -13,6 +14,7 @@ def main() -> int:
ap = argparse.ArgumentParser(description="Centromere statistics toolkit.")
sub_ap = ap.add_subparsers(dest="cmd")
add_status_cli(sub_ap)
add_hor_length_cli(sub_ap)

args = ap.parse_args()

Expand All @@ -30,7 +32,9 @@ def main() -> int:
restrict_14_22=args.restrict_14_22,
)
elif args.cmd == "length":
raise NotImplementedError("Length command not implemented.")
return calculate_hor_length(
args.input, args.bp_jump_thr, args.arr_len_thr, args.output
)
else:
raise ValueError(f"Unknown command: {args.cmd}")

Expand Down
Empty file added test/helpers/__init__.py
Empty file.
17 changes: 17 additions & 0 deletions test/helpers/integration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import subprocess


def run_integration_test(*args: str, expected_output: str) -> None:
process = subprocess.run(
[*args],
capture_output=True,
check=True,
)
res = sorted(
line.strip().split("\t") for line in process.stdout.decode().split("\n") if line
)
with open(expected_output, "rt") as exp_res_fh:
exp_res = sorted(
line.strip().split("\t") for line in exp_res_fh.readlines() if line
)
assert res == exp_res
Empty file added test/length/__init__.py
Empty file.
26 changes: 26 additions & 0 deletions test/length/expected/AS-HOR_chm13_lengths.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
chr1:121100001-127300000 121796049 126300488 4504439
chr2:91800001-95600000 92316506 94672686 2356180
chr3:89850001-97000000 91735802 92595822 860020
chr3:89850001-97000000 95863533 96414943 551410
chr4:49200001-55800000 49705154 50433573 728419
chr4:49200001-55800000 52115487 54870511 2755024
chr4:49200001-55800000 54980290 55199795 219505
chr5:45650001-51600000 47077202 49596627 2519425
chr6:57750001-63100000 58286707 61058391 2771684
chr7:57650001-64700000 60414373 63714498 3300125
chr8:43350001-47450000 44243546 46325081 2081535
chr9:44200001-48100000 44952797 47585318 2632521
chr10:38500001-42550000 39631089 41664590 2030117
chr11:48300001-55700000 51023872 54413487 3389615
chr12:33800001-38500000 34620840 37203843 2583003
chr13:10650001-18100000 15547595 17509783 1962188
chr14:5600001-13300000 10089210 12708411 2619201
chr15:13500001-18250000 16677773 17694129 1016356
chr16:32400001-38950000 35848276 37829524 1981248
chr17:22850001-28650000 23892422 27488300 3595878
chr18:15050001-21650000 15966042 20933551 4967509
chr19:23850001-30750000 25808855 29768170 3959315
chr20:25800001-32500000 26925855 29099656 2173801
chr21:7700001-11850000 10962852 11306205 343353
chr22:8000001-17400000 12786986 15711065 2924079
chrX:56950001-61750000 57818585 60927025 3108440
3 changes: 3 additions & 0 deletions test/length/input/AS-HOR-vs-chm13_cens_v18.stv_row.all.bed
Git LFS file not shown
27 changes: 27 additions & 0 deletions test/length/test_integration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import pytest

from test.helpers.integration import run_integration_test


@pytest.mark.parametrize(
["input_stv_row_bed", "expected_hor_len_bed"],
[
(
"test/length/input/AS-HOR-vs-chm13_cens_v18.stv_row.all.bed",
"test/length/expected/AS-HOR_chm13_lengths.bed",
),
],
)
def test_check_cens_status(
input_stv_row_bed: str,
expected_hor_len_bed: str,
):
run_integration_test(
"python",
"-m",
"censtats.main",
"length",
"-i",
input_stv_row_bed,
expected_output=expected_hor_len_bed,
)
Empty file added test/status/__init__.py
Empty file.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes
File renamed without changes
File renamed without changes.
File renamed without changes
File renamed without changes.
File renamed without changes
File renamed without changes.
File renamed without changes
File renamed without changes.
File renamed without changes
File renamed without changes.
File renamed without changes.
Loading

0 comments on commit 4e6f882

Please sign in to comment.