Skip to content

Commit

Permalink
feat: Add Interval constructor from UCSC-formatted string (#29)
Browse files Browse the repository at this point in the history
* feat: add Interval constructor from UCSC formatted string

* feat: permit strand

* fix: don't make assumptions about contig names

* refactor: rename value and update strand extraction

* refactor: string splitting

* test: refactor tests

* refactor: cleanup

* test: add doc
  • Loading branch information
msto authored Jul 15, 2024
1 parent 3a4b23a commit 592fb8e
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 0 deletions.
52 changes: 52 additions & 0 deletions pybedlite/overlap_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,58 @@ def from_bedrecord(cls: Type["Interval"], record: BedRecord) -> "Interval":
name=record.name,
)

@classmethod
def from_ucsc(
    cls: Type["Interval"],
    string: str,
    name: Optional[str] = None,
) -> "Interval":
    """
    Construct an `Interval` from a UCSC "position"-formatted string.

    The "Position" format (referring to the "1-start, fully-closed" system as coordinates are
    "positioned" in the browser)

    * Written as: chr1:127140001-127140001
    * The location may optionally be followed by a parenthetically enclosed strand, e.g.
      chr1:127140001-127140001(+).
    * No spaces.
    * Includes punctuation: a colon after the chromosome, and a dash between the start and
      end coordinates.
    * When in this format, the assumption is that the coordinate is **1-start,
      fully-closed.**

    https://genome-blog.gi.ucsc.edu/blog/2016/12/12/the-ucsc-genome-browser-coordinate-counting-systems/  # noqa: E501

    Note that when the string does not have a specified strand, the `Interval`'s negative
    attribute is set to `False`. This mimics the behavior of `OverlapDetector.from_bed()` when
    reading a record that does not have a specified strand.

    When a strand is given it must be exactly one of `+`, `-` or `.`. The `.` value (the
    BED-style placeholder for an unspecified strand) is handled like a missing strand, i.e.
    the negative attribute is set to `False`. Any other parenthesised suffix is rejected.

    Args:
        string: The UCSC "position"-formatted string.
        name: An optional name for the interval.

    Returns:
        An `Interval` corresponding to the same region specified in the string.
        Note that the `Interval` is **zero-based open-ended**.

    Raises:
        ValueError: If the string is not a valid UCSC position-formatted string.
    """
    try:
        if string.endswith(")"):
            # Remove exactly one closing parenthesis. Stripping every trailing ")" would
            # let malformed input such as "chr1:1-2(-))" through.
            interval, strand = string[:-1].rsplit("(", 1)
            if strand not in ("+", "-", "."):
                raise ValueError(f"Unrecognized strand: {strand!r}")
        else:
            interval, strand = string, "+"

        # Split on the *last* colon: contig names may themselves contain colons
        # (e.g. HLA contigs such as "HLA-DRB1*15:01:01:02").
        contig, span = interval.rsplit(":", 1)
        start, end = span.split("-")

        # UCSC positions are 1-based and fully closed; subtracting one from the start gives
        # the 0-based, half-open coordinates. Build through `cls` so that subclasses get an
        # instance of their own type.
        return cls(contig, int(start) - 1, int(end), negative=(strand == "-"), name=name)

    except Exception as exception:
        # Deliberately broad: every failure while parsing the string or constructing the
        # interval is reported uniformly as a ValueError, with the original error chained
        # as the cause.
        raise ValueError(
            f"Not a valid UCSC position-formatted string: {string}"
        ) from exception


class OverlapDetector(Iterable[Interval]):
"""Detects and returns overlaps between a set of genomic regions and another genomic region.
Expand Down
31 changes: 31 additions & 0 deletions pybedlite/tests/test_overlap_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

from typing import List

import pytest

from pybedlite.bed_record import BedRecord
from pybedlite.bed_record import BedStrand
from pybedlite.overlap_detector import Interval
Expand Down Expand Up @@ -188,3 +190,32 @@ def test_construction_from_interval(bed_records: List[BedRecord]) -> None:
assert new_record.strand is BedStrand.Positive
else:
assert new_record.strand is record.strand


def test_construction_from_ucsc() -> None:
    """
    A UCSC position-formatted string is one-based and fully-closed; `Interval.from_ucsc()`
    should turn it into the equivalent zero-based, half-open `Interval`.
    """
    parsed = Interval.from_ucsc("chr1:101-200")
    expected = Interval("chr1", 100, 200)

    assert parsed == expected


@pytest.mark.parametrize("strand", ["+", "-"])
def test_construction_from_ucsc_with_strand(strand: str) -> None:
    """
    A strand given in parentheses after the coordinates should be reflected in the `negative`
    attribute of the `Interval` returned by `Interval.from_ucsc()`.
    """
    parsed = Interval.from_ucsc(f"chr1:101-200({strand})")
    is_negative = strand == "-"

    assert parsed == Interval("chr1", 100, 200, negative=is_negative)


@pytest.mark.parametrize(
    "contig", ["chrUn_JTFH01001499v1_decoy", "HLA-DRB1*15:01:01:02", "chr10_GL383545v1_alt"]
)
def test_construction_from_ucsc_other_contigs(contig: str) -> None:
    """
    `Interval.from_ucsc()` should accommodate contig names beyond the primary human
    chromosomes (decoy, alternate, HLA, and other custom names), including names that
    contain colons, dashes, or underscores.
    """
    position = f"{contig}:101-200"
    expected = Interval(contig, 100, 200)

    assert Interval.from_ucsc(position) == expected

0 comments on commit 592fb8e

Please sign in to comment.