Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev #18

Merged
merged 9 commits into from
Nov 18, 2020
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ repos:
rev: stable
hooks:
- id: black
language_version: python3.6
language_version: python3.8
8 changes: 8 additions & 0 deletions make_prg/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
# ___Constants/Aliases___ #
from Bio.AlignIO import MultipleSeqAlignment

MSA = MultipleSeqAlignment
NESTING_LVL = 5
MIN_MATCH_LEN = 7

# ___Version___ #
from pkg_resources import get_distribution

try:
Expand Down
78 changes: 13 additions & 65 deletions make_prg/__main__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
import argparse
import logging

import make_prg

NESTING_LVL = 5
MIN_MATCH_LEN = 7
from make_prg import __version__
from make_prg.subcommands import prg_from_msa


def main():
Expand All @@ -13,80 +12,29 @@ def main():
description="script to run make_prg subcommands",
)

parser.add_argument("--version", action="version", version=make_prg.__version__)
parser.add_argument("--version", action="version", version=__version__)
subparsers = parser.add_subparsers(
title="Available subcommands", help="", metavar=""
)

# _____________________________ prg_from_msa ______________________________#
subparser_prg_from_msa = subparsers.add_parser(
"prg_from_msa",
usage="make_prg prg_from_msa [options] <MSA input file>",
help="Make PRG from multiple sequence alignment",
)

subparser_prg_from_msa.add_argument(
"MSA",
action="store",
type=str,
help=(
"Input file: a multiple sequence alignment in supported alignment_format. "
"If not in aligned fasta alignment_format, use -f to input the "
"alignment_format type"
),
)
subparser_prg_from_msa.add_argument(
"-f",
"--alignment_format",
dest="alignment_format",
action="store",
default="fasta",
help=(
"Alignment format of MSA, must be a biopython AlignIO input "
"alignment_format. See http://biopython.org/wiki/AlignIO. Default: fasta"
),
)
subparser_prg_from_msa.add_argument(
"--max_nesting",
dest="max_nesting",
action="store",
type=int,
default=NESTING_LVL,
help="Maximum number of levels to use for nesting. Default: {}".format(
NESTING_LVL
),
)
subparser_prg_from_msa.add_argument(
"--min_match_length",
dest="min_match_length",
action="store",
type=int,
default=MIN_MATCH_LEN,
help=(
"Minimum number of consecutive characters which must be identical for a "
"match. Default: {}".format(MIN_MATCH_LEN)
),
)
subparser_prg_from_msa.add_argument(
"-p", "--prefix", dest="output_prefix", action="store", help="Output prefix"
)
subparser_prg_from_msa.add_argument(
"--no_overwrite",
dest="no_overwrite",
action="store_true",
help="Do not overwrite pre-existing prg file with same name",
)
subparser_prg_from_msa.add_argument(
parser.add_argument(
"-v",
"--verbose",
dest="verbose",
action="store_true",
help="Run with high verbosity " "(debug level logging)",
)
subparser_prg_from_msa.set_defaults(func=make_prg.subcommands.prg_from_msa.run)

prg_from_msa.register_parser(subparsers)

args = parser.parse_args()

if args.verbose:
log_level = logging.DEBUG
else:
log_level = logging.INFO
logging.basicConfig(level=log_level, handlers=[])

if hasattr(args, "func"):
args.func(args)
else:
Expand Down
2 changes: 0 additions & 2 deletions make_prg/exceptions.py

This file was deleted.

240 changes: 240 additions & 0 deletions make_prg/interval_partition.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,240 @@
"""
Code responsible for converting a consensus string into a set of disjoint
match/non_match intervals.
"""
from enum import Enum, auto
from typing import List, Tuple, Optional

from make_prg import MSA

from make_prg.seq_utils import get_interval_seqs, is_non_match, has_empty_sequence


class PartitioningError(Exception):
pass


class IntervalType(Enum):
Match = auto()
NonMatch = auto()

@classmethod
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

def from_char(cls, letter: str) -> "IntervalType":
if letter == "*":
return IntervalType.NonMatch
else:
return IntervalType.Match


def is_type(letter: str, interval_type: IntervalType) -> bool:
if IntervalType.from_char(letter) is interval_type:
return True
else:
return False


class Interval:
"""Stores a closed interval [a,b]"""

def __init__(self, it_type: IntervalType, start: int, stop: int = None):
self.type = it_type
self.start = start
if stop is not None:
assert stop >= start
self.stop = stop if stop is not None else start

def modify_by(self, left_delta: int, right_delta: int):
self.start += left_delta
self.stop += right_delta

def contains(self, position: int):
return self.start <= position <= self.stop
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might be nice to add a docstring mentioning what sort of intervals this class stores. i.e. half-open, open, closed. It seems like this interval is a closed interval?
Also, just mentioning this incase you aren't aware of it: https://github.com/chaimleib/intervaltree
I use this for all interval work and it is super nice. I'm not suggesting you replace the class here or anything. Just putting it on your radar if it isn't already in case there is need for it down the line.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks, i added a docstring


def __len__(self) -> int:
return self.stop - self.start + 1

def __lt__(self, other: "Interval") -> bool:
return self.start < other.start

def __eq__(self, other: "Interval") -> bool:
return (
self.start == other.start
and self.stop == other.stop
and self.type is other.type
)

def __repr__(self):
return f"[{self.start}, {self.stop}]"


Intervals = List[Interval]


class IntervalPartitioner:
"""Produces a list of intervals in which we have
consensus sequence longer than min_match_length, and
a list of the non-match intervals left."""

def __init__(self, consensus_string: str, min_match_length: int, alignment: MSA):
self._match_intervals: Intervals = list()
self._non_match_intervals: Intervals = list()
self.mml = min_match_length

if len(consensus_string) < self.mml:
# In this case, a match of less than the min_match_length gets counted
# as a match (usually, it counts as a non_match)
it_type = IntervalType.Match
if any(map(is_non_match, consensus_string)):
it_type = IntervalType.NonMatch
self._append(Interval(it_type, 0, len(consensus_string) - 1))

else:
cur_interval = self._new_interval(consensus_string[0], 0)

for i, letter in enumerate(consensus_string[1:], start=1):
if is_type(letter, cur_interval.type):
cur_interval.modify_by(0, 1) # simple interval extension
else:
new_interval = self._add_interval(cur_interval, alignment)
if new_interval is None:
cur_interval = self._new_interval(letter, i)
else:
cur_interval = new_interval
self._add_interval(cur_interval, alignment, end=True)

self.enforce_multisequence_nonmatch_intervals(
self._match_intervals, self._non_match_intervals, alignment
)
self.enforce_alignment_interval_bijection(
self._match_intervals,
self._non_match_intervals,
alignment.get_alignment_length(),
)

def get_intervals(self) -> Tuple[Intervals, Intervals, Intervals]:
return (
sorted(self._match_intervals),
sorted(self._non_match_intervals),
sorted(self._match_intervals + self._non_match_intervals),
)

def _new_interval(self, letter: str, start_pos: int) -> Interval:
return Interval(IntervalType.from_char(letter), start_pos)

def _append(self, interval: Interval):
if interval.type is IntervalType.Match:
self._match_intervals.append(interval)
else:
self._non_match_intervals.append(interval)

def _pop(self, it_type: IntervalType) -> Interval:
if it_type is IntervalType.Match:
return self._match_intervals.pop()
else:
return self._non_match_intervals.pop()

def _add_interval(
self, interval: Interval, alignment: MSA, end: bool = False
) -> Optional[Interval]:
"""
i)If we are given a match interval < min_match_length, we return an extended non_match interval
ii)If we are given a non_match interval containing 1+ empty sequence, we pad it with
previous match_interval, if any, to avoid empty alleles in resulting prg.
"""
if interval.type is IntervalType.Match:
# The +1 is because we also extend the non_match interval
if len(interval) < self.mml:
try:
last_non_match = self._pop(IntervalType.NonMatch)
last_non_match.modify_by(0, len(interval) + 1)
except IndexError:
last_non_match = Interval(
IntervalType.NonMatch, interval.start, interval.stop + 1
)
if end: # If this is final call, go to append the interval
last_non_match.modify_by(0, -1)
self._append(last_non_match)
return last_non_match
else:
if len(self._match_intervals) > 0 and has_empty_sequence(
alignment, (interval.start, interval.stop)
):
# Pad interval with sequence to avoid empty alleles
len_match = len(self._match_intervals[-1])
if len_match - 1 < self.mml:
# Case: match is now too small, converted to non_match
self._match_intervals.pop()
interval.modify_by(-1 * len_match, 0)
if len(self._non_match_intervals) > 0:
# Case: merge previous non_match with this non_match
self._non_match_intervals[-1].modify_by(0, len(interval))
return None
else:
self._match_intervals[-1].modify_by(0, -1)
interval.modify_by(-1, 0)

self._append(interval)
return None

@classmethod
def enforce_multisequence_nonmatch_intervals(
cls, match_intervals: Intervals, non_match_intervals: Intervals, alignment: MSA
) -> None:
"""
Goes through non-match intervals and makes sure there is more than one sequence there, else makes it a match
interval.
Modifies the intervals in-place.
Example reasons for such a conversion to occur:
- 'N' in a sequence causes it to be filtered out, and left with a single useable sequence
- '-' in sequences causes them to appear different, but they are the same
"""
if len(alignment) == 0: # For testing convenience
return
for i in reversed(range(len(non_match_intervals))):
interval = non_match_intervals[i]
interval_alignment = alignment[:, interval.start : interval.stop + 1]
interval_seqs = get_interval_seqs(interval_alignment)
if len(interval_seqs) < 2:
changed_interval = non_match_intervals[i]
match_intervals.append(
Interval(
IntervalType.Match,
changed_interval.start,
changed_interval.stop,
)
)
non_match_intervals.pop(i)

@classmethod
def enforce_alignment_interval_bijection(
cls,
match_intervals: Intervals,
non_match_intervals: Intervals,
alignment_length: int,
):
"""
Check each position in an alignment is in one, and one only, (match or non_match) interval
"""
for i in range(alignment_length):
count_match = 0
for interval in match_intervals:
if interval.contains(i):
count_match += 1
count_non_match = 0
for interval in non_match_intervals:
if interval.contains(i):
count_non_match += 1

if count_match > 1 or count_non_match > 1:
raise PartitioningError(
f"Failed interval partitioning: position {i}"
" appears in more than one interval"
)
if (
not count_match ^ count_non_match
): # test fails if they are the same integer
msg = ["neither", "nor"] if count_match == 0 else ["both", "and"]
raise PartitioningError(
"Failed interval partitioning: alignment position %d"
"classified as %s match %s non-match " % (i, msg[0], msg[1])
)
5 changes: 2 additions & 3 deletions make_prg/io_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,11 @@

from Bio import AlignIO

from make_prg import MSA
from make_prg.prg_encoder import PrgEncoder, PRG_Ints


def load_alignment_file(
msa_file: str, alignment_format: str
) -> AlignIO.MultipleSeqAlignment:
def load_alignment_file(msa_file: str, alignment_format: str) -> MSA:
logging.info("Read from MSA file %s", msa_file)
if ".gz" in msa_file:
logging.debug("MSA is gzipped")
Expand Down
Loading