-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Refactored test dir. Added length subcommand.
- Loading branch information
Keisuke Oshima
committed
May 15, 2024
1 parent
ed68170
commit 4e6f882
Showing
37 changed files
with
375 additions
and
76 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,15 @@ | ||
test/input/chr9_cens_partials.fa.out filter=lfs diff=lfs merge=lfs -text | ||
test/input/chm13_chm1_cens_v21.trimmed.fa.noheader.out filter=lfs diff=lfs merge=lfs -text | ||
test/input/chr21_cens.fa.out filter=lfs diff=lfs merge=lfs -text | ||
test/input/chr21_chr13_cens_mismap.fa.out filter=lfs diff=lfs merge=lfs -text | ||
test/input/chr22_cens.fa.out filter=lfs diff=lfs merge=lfs -text | ||
test/input/chr4_cens_partials.fa.out filter=lfs diff=lfs merge=lfs -text | ||
test/input/chr4_cens_partials.png filter=lfs diff=lfs merge=lfs -text | ||
test/input/chr9_cens_partials.png filter=lfs diff=lfs merge=lfs -text | ||
test/input/chm13_chm1_cens_v21.trimmed.fa.noheader.out.png filter=lfs diff=lfs merge=lfs -text | ||
test/input/chr21_cens_false_neg_mismap.png filter=lfs diff=lfs merge=lfs -text | ||
test/input/chr21_cens_false_neg_mismap.fa.out filter=lfs diff=lfs merge=lfs -text | ||
test/input/chr21_cens.png filter=lfs diff=lfs merge=lfs -text | ||
test/input/chr21_chr13_cens_mismap.png filter=lfs diff=lfs merge=lfs -text | ||
test/input/chr22_cens.png filter=lfs diff=lfs merge=lfs -text | ||
test/length/input/AS-HOR-vs-chm13_cens_v18.stv_row.all.bed filter=lfs diff=lfs merge=lfs -text | ||
test/status/input/chr9_cens_partials.png filter=lfs diff=lfs merge=lfs -text | ||
test/status/input/chr21_cens_false_neg_mismap.fa.out filter=lfs diff=lfs merge=lfs -text | ||
test/status/input/chr21_chr13_cens_mismap.fa.out filter=lfs diff=lfs merge=lfs -text | ||
test/status/input/chr22_cens.png filter=lfs diff=lfs merge=lfs -text | ||
test/status/input/chr4_cens_partials.fa.out filter=lfs diff=lfs merge=lfs -text | ||
test/status/input/chr4_cens_partials.png filter=lfs diff=lfs merge=lfs -text | ||
test/status/input/chr21_cens_false_neg_mismap.png filter=lfs diff=lfs merge=lfs -text | ||
test/status/input/chr21_cens.fa.out filter=lfs diff=lfs merge=lfs -text | ||
test/status/input/chr22_cens.fa.out filter=lfs diff=lfs merge=lfs -text | ||
test/status/input/chr21_chr13_cens_mismap.png filter=lfs diff=lfs merge=lfs -text | ||
test/status/input/chr9_cens_partials.fa.out filter=lfs diff=lfs merge=lfs -text | ||
test/status/input/chm13_chm1_cens_v21.trimmed.fa.noheader.out filter=lfs diff=lfs merge=lfs -text | ||
test/status/input/chm13_chm1_cens_v21.trimmed.fa.noheader.out.png filter=lfs diff=lfs merge=lfs -text | ||
test/status/input/chr21_cens.png filter=lfs diff=lfs merge=lfs -text |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,213 @@ | ||
import re | ||
import sys | ||
import argparse | ||
import polars as pl | ||
|
||
from typing import TYPE_CHECKING, Any | ||
|
||
from .constants import ( | ||
RGX_CHR, | ||
DEF_BP_JUMP_LEN_THR, | ||
DEF_ARR_LEN_THR, | ||
HOR_BP_LEN, | ||
DEF_EXP_STV_ROW_BED_COLS, | ||
DEF_OUTPUT_BED_COLS, | ||
) | ||
|
||
|
||
if TYPE_CHECKING: | ||
SubArgumentParser = argparse._SubParsersAction[argparse.ArgumentParser] | ||
else: | ||
SubArgumentParser = Any | ||
|
||
|
||
def add_hor_length_cli(parser: SubArgumentParser) -> None: | ||
ap = parser.add_parser( | ||
"length", | ||
description="Estimate HOR array length from stv bed file / HumAS-HMMER output.", | ||
) | ||
ap.add_argument( | ||
"-i", | ||
"--input", | ||
help=f"Input stv row bed file produced by HumAS-HMMER and stv. Expects columns: {DEF_EXP_STV_ROW_BED_COLS}", | ||
type=str, | ||
) | ||
ap.add_argument( | ||
"-o", | ||
"--output", | ||
help=f"Output bed file with columns: {DEF_OUTPUT_BED_COLS}", | ||
default=sys.stdout, | ||
type=argparse.FileType("wt"), | ||
) | ||
ap.add_argument( | ||
"--bp_jump_thr", | ||
help="Base pair jump threshold to group by", | ||
type=int, | ||
default=DEF_BP_JUMP_LEN_THR, | ||
) | ||
ap.add_argument( | ||
"--arr_len_thr", | ||
help="Length threshold to filter out.", | ||
type=int, | ||
default=DEF_ARR_LEN_THR, | ||
) | ||
return None | ||
|
||
|
||
def calculate_hor_length( | ||
infile: str, bp_jump_thr: int, arr_len_thr: int, output: str | ||
) -> int: | ||
""" | ||
Calculate HOR array length from HumAS-HMMER structural variation row output. | ||
### Parameters | ||
`infile` | ||
Input bed file made from HumAS-HMMER output. | ||
Expects the following columns: `{chr, start, stop, hor, 0, strand, ...}`. | ||
`bp_jump_thr` | ||
Base pair jump threshold to group by. | ||
`arr_len_thr` | ||
Length threshold of HOR array to filter out. | ||
`output` | ||
Output bed file with HOR array lengths. | ||
Columns: `{chr_name, start_pos, stop_pos, len}`. | ||
### Returns | ||
0 if successful. | ||
""" | ||
df = pl.read_csv( | ||
infile, | ||
separator="\t", | ||
columns=[0, 1, 2, 3, 4, 5], | ||
new_columns=DEF_EXP_STV_ROW_BED_COLS, | ||
has_header=False, | ||
) | ||
|
||
dfs = [] | ||
for ctg_name, df_chr in df.group_by(["chr"], maintain_order=True): | ||
df_chr = df_chr.with_columns(len=pl.col("stop") - pl.col("start")).with_columns( | ||
mer=(pl.col("len") / HOR_BP_LEN).round() | ||
) | ||
ctg_name = ctg_name[0] | ||
mtch_chr_name = re.search(RGX_CHR, ctg_name) | ||
if mtch_chr_name is None: | ||
continue | ||
|
||
chr_name = mtch_chr_name.group(0) | ||
df_live_hor = df_chr.filter(pl.col("hor").str.contains("L")) | ||
|
||
# Specific edge case for chr8. | ||
if chr_name == "chr8" or chr_name == "chr10" or chr_name == "chr16": | ||
bp_jump_thr = 10_000 | ||
elif chr_name == "chrY": | ||
bp_jump_thr = 2_000 | ||
else: | ||
bp_jump_thr = bp_jump_thr | ||
|
||
df_bp_jumps = df_live_hor.with_columns( | ||
diff=pl.col("start") - pl.col("stop").shift(1) | ||
).filter(pl.col("diff") > bp_jump_thr) | ||
|
||
if df_bp_jumps.is_empty(): | ||
adj_start = df_live_hor.get_column("start").min() | ||
adj_stop = df_live_hor.get_column("stop").max() | ||
adj_len = adj_stop - adj_start | ||
|
||
if adj_len < arr_len_thr: | ||
continue | ||
|
||
dfs.append( | ||
pl.DataFrame( | ||
{ | ||
"chr_name": ctg_name, | ||
"start_pos": adj_start, | ||
"stop_pos": adj_stop, | ||
"len": adj_len, | ||
} | ||
) | ||
) | ||
continue | ||
|
||
starts, stops = [], [] | ||
for i, row in enumerate(df_bp_jumps.iter_rows()): | ||
prev_row = pl.DataFrame() if i == 0 else df_bp_jumps.slice(i - 1) | ||
next_row = df_bp_jumps.slice(i + 1) | ||
|
||
if prev_row.is_empty(): | ||
starts.append(df_chr.get_column("start").min()) | ||
stops.append( | ||
df_chr.filter(pl.col("start") < row[1]).row(-1, named=True)["stop"] | ||
) | ||
|
||
if next_row.is_empty(): | ||
starts.append(row[1]) | ||
stops.append(df_chr.get_column("stop").max()) | ||
else: | ||
starts.append(row[1]) | ||
stops.append( | ||
df_chr.filter( | ||
pl.col("start") < next_row.get_column("start")[0] | ||
).row(-1, named=True)["stop"] | ||
) | ||
|
||
lens = [] | ||
chr_mer_filter = None | ||
if chr_name == "chr10" or chr_name == "chr20": | ||
chr_mer_filter = pl.col("mer") >= 5 | ||
elif chr_name == "chrY": | ||
chr_mer_filter = pl.col("mer") >= 30 | ||
elif chr_name == "chr17": | ||
chr_mer_filter = pl.col("mer") >= 4 | ||
|
||
for start, stop in zip(starts, stops): | ||
df_slice = ( | ||
df_chr.filter(pl.col("start") >= start, pl.col("stop") <= stop) | ||
.with_columns(bp_jump=pl.col("start") - pl.col("stop").shift(1)) | ||
.fill_null(0) | ||
) | ||
# Filter out mers based on chr. | ||
if chr_mer_filter is not None: | ||
df_slice = df_slice.filter(chr_mer_filter) | ||
|
||
if df_slice.is_empty(): | ||
lens.append(0) | ||
continue | ||
df_slice_dst = ( | ||
# df_slice.with_columns(len=pl.col("stop") - pl.col("start")).get_column("len").sum() | ||
df_slice.get_column("stop").max() - df_slice.get_column("start").min() | ||
) | ||
lens.append(df_slice_dst) | ||
|
||
lf = pl.LazyFrame( | ||
{ | ||
"chr_name": ctg_name, | ||
"start_pos": starts, | ||
"stop_pos": stops, | ||
"len": lens, | ||
} | ||
) | ||
if ( | ||
chr_name == "chr8" | ||
or chr_name == "chr10" | ||
or chr_name == "chr17" | ||
or chr_name == "chrY" | ||
): | ||
arr_len_thr = 100_000 | ||
else: | ||
arr_len_thr = arr_len_thr | ||
|
||
dfs.append(lf.filter(pl.col("len") > arr_len_thr).collect()) | ||
|
||
df_all_dsts: pl.DataFrame = pl.concat(dfs) | ||
( | ||
df_all_dsts.with_columns( | ||
sort_idx=pl.col("chr_name") | ||
.str.extract("chr([0-9XY]+)") | ||
.replace({"X": "23", "Y": "24"}) | ||
.cast(pl.Int32) | ||
) | ||
.sort(by="sort_idx") | ||
.select(DEF_OUTPUT_BED_COLS) | ||
.write_csv(output, include_header=False, separator="\t") | ||
) | ||
return 0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
import re | ||
|
||
|
||
RGX_CHR = re.compile(r"chr[0-9XY]{1,2}") | ||
HOR_BP_LEN = 170 | ||
DEF_ARR_LEN_THR = 30_000 | ||
DEF_BP_JUMP_LEN_THR = 100_000 | ||
DEF_EXP_STV_ROW_BED_COLS = ["chr", "start", "stop", "hor", "other", "strand"] | ||
DEF_OUTPUT_BED_COLS = ["chr_name", "start_pos", "stop_pos", "len"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
import subprocess | ||
|
||
|
||
def run_integration_test(*args: str, expected_output: str) -> None: | ||
process = subprocess.run( | ||
[*args], | ||
capture_output=True, | ||
check=True, | ||
) | ||
res = sorted( | ||
line.strip().split("\t") for line in process.stdout.decode().split("\n") if line | ||
) | ||
with open(expected_output, "rt") as exp_res_fh: | ||
exp_res = sorted( | ||
line.strip().split("\t") for line in exp_res_fh.readlines() if line | ||
) | ||
assert res == exp_res |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
chr1:121100001-127300000 121796049 126300488 4504439 | ||
chr2:91800001-95600000 92316506 94672686 2356180 | ||
chr3:89850001-97000000 91735802 92595822 860020 | ||
chr3:89850001-97000000 95863533 96414943 551410 | ||
chr4:49200001-55800000 49705154 50433573 728419 | ||
chr4:49200001-55800000 52115487 54870511 2755024 | ||
chr4:49200001-55800000 54980290 55199795 219505 | ||
chr5:45650001-51600000 47077202 49596627 2519425 | ||
chr6:57750001-63100000 58286707 61058391 2771684 | ||
chr7:57650001-64700000 60414373 63714498 3300125 | ||
chr8:43350001-47450000 44243546 46325081 2081535 | ||
chr9:44200001-48100000 44952797 47585318 2632521 | ||
chr10:38500001-42550000 39631089 41664590 2030117 | ||
chr11:48300001-55700000 51023872 54413487 3389615 | ||
chr12:33800001-38500000 34620840 37203843 2583003 | ||
chr13:10650001-18100000 15547595 17509783 1962188 | ||
chr14:5600001-13300000 10089210 12708411 2619201 | ||
chr15:13500001-18250000 16677773 17694129 1016356 | ||
chr16:32400001-38950000 35848276 37829524 1981248 | ||
chr17:22850001-28650000 23892422 27488300 3595878 | ||
chr18:15050001-21650000 15966042 20933551 4967509 | ||
chr19:23850001-30750000 25808855 29768170 3959315 | ||
chr20:25800001-32500000 26925855 29099656 2173801 | ||
chr21:7700001-11850000 10962852 11306205 343353 | ||
chr22:8000001-17400000 12786986 15711065 2924079 | ||
chrX:56950001-61750000 57818585 60927025 3108440 |
Git LFS file not shown
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
import pytest | ||
|
||
from test.helpers.integration import run_integration_test | ||
|
||
|
||
@pytest.mark.parametrize( | ||
["input_stv_row_bed", "expected_hor_len_bed"], | ||
[ | ||
( | ||
"test/length/input/AS-HOR-vs-chm13_cens_v18.stv_row.all.bed", | ||
"test/length/expected/AS-HOR_chm13_lengths.bed", | ||
), | ||
], | ||
) | ||
def test_check_cens_status( | ||
input_stv_row_bed: str, | ||
expected_hor_len_bed: str, | ||
): | ||
run_integration_test( | ||
"python", | ||
"-m", | ||
"censtats.main", | ||
"length", | ||
"-i", | ||
input_stv_row_bed, | ||
expected_output=expected_hor_len_bed, | ||
) |
Empty file.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes
File renamed without changes.
File renamed without changes
File renamed without changes.
File renamed without changes
File renamed without changes.
File renamed without changes
File renamed without changes.
File renamed without changes
File renamed without changes.
File renamed without changes
File renamed without changes.
File renamed without changes
File renamed without changes.
File renamed without changes.
Oops, something went wrong.