iqbal-lab-org · bricoletc · Nov 18, 2020 · Aug 11, 2020 · Aug 11, 2020 · Aug 11, 2020
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -3,4 +3,4 @@ repos:
     rev: stable
     hooks:
       - id: black
-        language_version: python3.6
+        language_version: python3.8
diff --git a/make_prg/__init__.py b/make_prg/__init__.py
@@ -1,3 +1,11 @@
+# ___Constants/Aliases___ #
+from Bio.AlignIO import MultipleSeqAlignment
+
+MSA = MultipleSeqAlignment
+NESTING_LVL = 5
+MIN_MATCH_LEN = 7
+
+# ___Version___ #
 from pkg_resources import get_distribution
 
 try:

diff --git a/make_prg/__main__.py b/make_prg/__main__.py
@@ -1,9 +1,8 @@
 import argparse
+import logging
 
-import make_prg
-
-NESTING_LVL = 5
-MIN_MATCH_LEN = 7
+from make_prg import __version__
+from make_prg.subcommands import prg_from_msa
 
 
 def main():
@@ -13,80 +12,29 @@ def main():
         description="script to run make_prg subcommands",
     )
 
-    parser.add_argument("--version", action="version", version=make_prg.__version__)
+    parser.add_argument("--version", action="version", version=__version__)
     subparsers = parser.add_subparsers(
         title="Available subcommands", help="", metavar=""
     )
 
-    # _____________________________ prg_from_msa ______________________________#
-    subparser_prg_from_msa = subparsers.add_parser(
-        "prg_from_msa",
-        usage="make_prg prg_from_msa [options] <MSA input file>",
-        help="Make PRG from multiple sequence alignment",
-    )
-
-    subparser_prg_from_msa.add_argument(
-        "MSA",
-        action="store",
-        type=str,
-        help=(
-            "Input file: a multiple sequence alignment in supported alignment_format. "
-            "If not in aligned fasta alignment_format, use -f to input the "
-            "alignment_format type"
-        ),
-    )
-    subparser_prg_from_msa.add_argument(
-        "-f",
-        "--alignment_format",
-        dest="alignment_format",
-        action="store",
-        default="fasta",
-        help=(
-            "Alignment format of MSA, must be a biopython AlignIO input "
-            "alignment_format. See http://biopython.org/wiki/AlignIO. Default: fasta"
-        ),
-    )
-    subparser_prg_from_msa.add_argument(
-        "--max_nesting",
-        dest="max_nesting",
-        action="store",
-        type=int,
-        default=NESTING_LVL,
-        help="Maximum number of levels to use for nesting. Default: {}".format(
-            NESTING_LVL
-        ),
-    )
-    subparser_prg_from_msa.add_argument(
-        "--min_match_length",
-        dest="min_match_length",
-        action="store",
-        type=int,
-        default=MIN_MATCH_LEN,
-        help=(
-            "Minimum number of consecutive characters which must be identical for a "
-            "match. Default: {}".format(MIN_MATCH_LEN)
-        ),
-    )
-    subparser_prg_from_msa.add_argument(
-        "-p", "--prefix", dest="output_prefix", action="store", help="Output prefix"
-    )
-    subparser_prg_from_msa.add_argument(
-        "--no_overwrite",
-        dest="no_overwrite",
-        action="store_true",
-        help="Do not overwrite pre-existing prg file with same name",
-    )
-    subparser_prg_from_msa.add_argument(
+    parser.add_argument(
         "-v",
         "--verbose",
         dest="verbose",
         action="store_true",
         help="Run with high verbosity " "(debug level logging)",
     )
-    subparser_prg_from_msa.set_defaults(func=make_prg.subcommands.prg_from_msa.run)
+
+    prg_from_msa.register_parser(subparsers)
 
     args = parser.parse_args()
 
+    if args.verbose:
+        log_level = logging.DEBUG
+    else:
+        log_level = logging.INFO
+    logging.basicConfig(level=log_level, handlers=[])
+
     if hasattr(args, "func"):
         args.func(args)
     else:

diff --git a/make_prg/exceptions.py b/make_prg/exceptions.py
diff --git a/make_prg/interval_partition.py b/make_prg/interval_partition.py
@@ -0,0 +1,240 @@
+"""
+Code responsible for converting a consensus string into a set of disjoint
+match/non_match intervals.
+"""
+from enum import Enum, auto
+from typing import List, Tuple, Optional
+
+from make_prg import MSA
+
+from make_prg.seq_utils import get_interval_seqs, is_non_match, has_empty_sequence
+
+
+class PartitioningError(Exception):
+    pass
+
+
+class IntervalType(Enum):
+    Match = auto()
+    NonMatch = auto()
+
+    @classmethod
+    def from_char(cls, letter: str) -> "IntervalType":
+        if letter == "*":
+            return IntervalType.NonMatch
+        else:
+            return IntervalType.Match
+
+
+def is_type(letter: str, interval_type: IntervalType) -> bool:
+    if IntervalType.from_char(letter) is interval_type:
+        return True
+    else:
+        return False
+
+
+class Interval:
+    """Stores a closed interval [a,b]"""
+
+    def __init__(self, it_type: IntervalType, start: int, stop: int = None):
+        self.type = it_type
+        self.start = start
+        if stop is not None:
+            assert stop >= start
+        self.stop = stop if stop is not None else start
+
+    def modify_by(self, left_delta: int, right_delta: int):
+        self.start += left_delta
+        self.stop += right_delta
+
+    def contains(self, position: int):
+        return self.start <= position <= self.stop
+
+    def __len__(self) -> int:
+        return self.stop - self.start + 1
+
+    def __lt__(self, other: "Interval") -> bool:
+        return self.start < other.start
+
+    def __eq__(self, other: "Interval") -> bool:
+        return (
+            self.start == other.start
+            and self.stop == other.stop
+            and self.type is other.type
+        )
+
+    def __repr__(self):
+        return f"[{self.start}, {self.stop}]"
+
+
+Intervals = List[Interval]
+
+
+class IntervalPartitioner:
+    """Produces a list of intervals in which we have
+    consensus sequence longer than min_match_length, and
+    a list of the non-match intervals left."""
+
+    def __init__(self, consensus_string: str, min_match_length: int, alignment: MSA):
+        self._match_intervals: Intervals = list()
+        self._non_match_intervals: Intervals = list()
+        self.mml = min_match_length
+
+        if len(consensus_string) < self.mml:
+            # In this case, a match of less than the min_match_length gets counted
+            # as a match (usually, it counts as a non_match)
+            it_type = IntervalType.Match
+            if any(map(is_non_match, consensus_string)):
+                it_type = IntervalType.NonMatch
+            self._append(Interval(it_type, 0, len(consensus_string) - 1))
+
+        else:
+            cur_interval = self._new_interval(consensus_string[0], 0)
+
+            for i, letter in enumerate(consensus_string[1:], start=1):
+                if is_type(letter, cur_interval.type):
+                    cur_interval.modify_by(0, 1)  # simple interval extension
+                else:
+                    new_interval = self._add_interval(cur_interval, alignment)
+                    if new_interval is None:
+                        cur_interval = self._new_interval(letter, i)
+                    else:
+                        cur_interval = new_interval
+            self._add_interval(cur_interval, alignment, end=True)
+
+        self.enforce_multisequence_nonmatch_intervals(
+            self._match_intervals, self._non_match_intervals, alignment
+        )
+        self.enforce_alignment_interval_bijection(
+            self._match_intervals,
+            self._non_match_intervals,
+            alignment.get_alignment_length(),
+        )
+
+    def get_intervals(self) -> Tuple[Intervals, Intervals, Intervals]:
+        return (
+            sorted(self._match_intervals),
+            sorted(self._non_match_intervals),
+            sorted(self._match_intervals + self._non_match_intervals),
+        )
+
+    def _new_interval(self, letter: str, start_pos: int) -> Interval:
+        return Interval(IntervalType.from_char(letter), start_pos)
+
+    def _append(self, interval: Interval):
+        if interval.type is IntervalType.Match:
+            self._match_intervals.append(interval)
+        else:
+            self._non_match_intervals.append(interval)
+
+    def _pop(self, it_type: IntervalType) -> Interval:
+        if it_type is IntervalType.Match:
+            return self._match_intervals.pop()
+        else:
+            return self._non_match_intervals.pop()
+
+    def _add_interval(
+        self, interval: Interval, alignment: MSA, end: bool = False
+    ) -> Optional[Interval]:
+        """
+        i)If we are given a match interval < min_match_length, we return an extended non_match interval
+        ii)If we are given a non_match interval containing 1+ empty sequence, we pad it with
+           previous match_interval, if any, to avoid empty alleles in resulting prg.
+        """
+        if interval.type is IntervalType.Match:
+            # The +1 is because we also extend the non_match interval
+            if len(interval) < self.mml:
+                try:
+                    last_non_match = self._pop(IntervalType.NonMatch)
+                    last_non_match.modify_by(0, len(interval) + 1)
+                except IndexError:
+                    last_non_match = Interval(
+                        IntervalType.NonMatch, interval.start, interval.stop + 1
+                    )
+                if end:  # If this is final call, go to append the interval
+                    last_non_match.modify_by(0, -1)
+                    self._append(last_non_match)
+                return last_non_match
+        else:
+            if len(self._match_intervals) > 0 and has_empty_sequence(
+                alignment, (interval.start, interval.stop)
+            ):
+                # Pad interval with sequence to avoid empty alleles
+                len_match = len(self._match_intervals[-1])
+                if len_match - 1 < self.mml:
+                    # Case: match is now too small, converted to non_match
+                    self._match_intervals.pop()
+                    interval.modify_by(-1 * len_match, 0)
+                    if len(self._non_match_intervals) > 0:
+                        # Case: merge previous non_match with this non_match
+                        self._non_match_intervals[-1].modify_by(0, len(interval))
+                        return None
+                else:
+                    self._match_intervals[-1].modify_by(0, -1)
+                    interval.modify_by(-1, 0)
+
+        self._append(interval)
+        return None
+
+    @classmethod
+    def enforce_multisequence_nonmatch_intervals(
+        cls, match_intervals: Intervals, non_match_intervals: Intervals, alignment: MSA
+    ) -> None:
+        """
+        Goes through non-match intervals and makes sure there is more than one sequence there, else makes it a match
+        interval.
+        Modifies the intervals in-place.
+        Example reasons for such a conversion to occur:
+            - 'N' in a sequence causes it to be filtered out, and left with a single useable sequence
+            - '-' in sequences causes them to appear different, but they are the same
+        """
+        if len(alignment) == 0:  # For testing convenience
+            return
+        for i in reversed(range(len(non_match_intervals))):
+            interval = non_match_intervals[i]
+            interval_alignment = alignment[:, interval.start : interval.stop + 1]
+            interval_seqs = get_interval_seqs(interval_alignment)
+            if len(interval_seqs) < 2:
+                changed_interval = non_match_intervals[i]
+                match_intervals.append(
+                    Interval(
+                        IntervalType.Match,
+                        changed_interval.start,
+                        changed_interval.stop,
+                    )
+                )
+                non_match_intervals.pop(i)
+
+    @classmethod
+    def enforce_alignment_interval_bijection(
+        cls,
+        match_intervals: Intervals,
+        non_match_intervals: Intervals,
+        alignment_length: int,
+    ):
+        """
+        Check each position in an alignment is in one, and one only, (match or non_match) interval
+        """
+        for i in range(alignment_length):
+            count_match = 0
+            for interval in match_intervals:
+                if interval.contains(i):
+                    count_match += 1
+            count_non_match = 0
+            for interval in non_match_intervals:
+                if interval.contains(i):
+                    count_non_match += 1
+
+            if count_match > 1 or count_non_match > 1:
+                raise PartitioningError(
+                    f"Failed interval partitioning: position {i}"
+                    " appears in more than one interval"
+                )
+            if (
+                not count_match ^ count_non_match
+            ):  # test fails if they are the same integer
+                msg = ["neither", "nor"] if count_match == 0 else ["both", "and"]
+                raise PartitioningError(
+                    "Failed interval partitioning: alignment position %d"
+                    "classified as %s match %s non-match " % (i, msg[0], msg[1])
+                )
diff --git a/make_prg/io_utils.py b/make_prg/io_utils.py
@@ -5,12 +5,11 @@
 
 from Bio import AlignIO
 
+from make_prg import MSA
 from make_prg.prg_encoder import PrgEncoder, PRG_Ints
 
 
-def load_alignment_file(
-    msa_file: str, alignment_format: str
-) -> AlignIO.MultipleSeqAlignment:
+def load_alignment_file(msa_file: str, alignment_format: str) -> MSA:
     logging.info("Read from MSA file %s", msa_file)
     if ".gz" in msa_file:
         logging.debug("MSA is gzipped")