Merge pull request #104 from openvax/more-rna-filtering-options

Major rewrite to enable use of newer pysam, fix several bugs, and add filtering options
openvax · Jun 12, 2019 · e6a870e · e6a870e
2 parents b39b684 + fbc01de
commit e6a870e
Show file tree

Hide file tree

Showing 95 changed files with 6,090 additions and 3,393 deletions.
diff --git a/.gitignore b/.gitignore
@@ -60,3 +60,7 @@ target/
 
 #Ipython Notebook
 .ipynb_checkpoints
+
+# PyCharm
+.idea/
+vcs.xml
diff --git a/.travis.yml b/.travis.yml
@@ -58,8 +58,10 @@ before_script:
     --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCm38.87/
 script:
   - >
-    nosetests test --with-coverage --cover-package=isovar &&
-    nosetests openvax-integration-tests/test
+    nosetests test --with-coverage --cover-package=isovar
+  # removing integration tests until Vaxrank is updated to use new API
+  # &&
+  # nosetests openvax-integration-tests/test
 after_success:
   coveralls
 deploy:

diff --git a/README.md b/README.md
diff --git a/isovar/__init__.py b/isovar/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016-2018. Mount Sinai School of Medicine
+# Copyright (c) 2016-2019. Mount Sinai School of Medicine
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,4 +14,34 @@
 
 from __future__ import print_function, division, absolute_import
 
-__version__ = "0.9.0"
+__version__ = "1.0.0"
+
+
+from .allele_read import AlleleRead
+from .dataframe_helpers import isovar_results_to_dataframe
+from .isovar_result import IsovarResult
+from .locus_read import LocusRead
+from .main import run_isovar
+from .protein_sequence import ProteinSequence
+from .protein_sequence_creator import ProteinSequenceCreator
+from .read_collector import ReadCollector
+from .read_evidence import ReadEvidence
+from .variant_orf import VariantORF
+from .variant_sequence import VariantSequence
+from .variant_sequence_creator import VariantSequenceCreator
+
+
+__all__ = [
+    "run_isovar",
+    "isovar_results_to_dataframe",
+    "AlleleRead",
+    "IsovarResult",
+    "LocusRead",
+    "ProteinSequence",
+    "ProteinSequenceCreator",
+    "ReadCollector",
+    "ReadEvidence",
+    "VariantORF",
+    "VariantSequence",
+    "VariantSequenceCreator",
+]
diff --git a/isovar/alignment_score.py b/isovar/alignment_score.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2019. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+Sequence alignment helpers
+"""
+
+from __future__ import print_function, division, absolute_import
+
+
+def alignment_score(a, b, min_subsequence_length=1):
+    """
+    Number of mismatches between all two input sequences, allows
+    for trimming of ends of sequences but not insertions or deletions
+    within the sequences. Number of trimmed amino acids from each sequence
+    count toward the mismatch total.
+
+    Parameters
+    ----------
+    a : str
+
+    b : str
+
+    min_subsequence_length : int
+        Only consider subsequences which are at least this long.
+    Returns int
+    """
+
+    n_a = len(a)
+    n_b = len(b)
+
+    # swap a and b if a is longer since the loops below expect the first
+    # sequence to be shorter. This happens because any subsequence of `a`
+    # is expected to be of a length that can be found within `b`
+    if n_a > n_b:
+        a, b = b, a
+        n_a, n_b = n_b, n_a
+
+    # compare all subsequences of a and b and count the number of mismatches
+    # between
+    best_score = n_a + n_b
+
+    # if we need to make sequence of at least length `min_subsequence_length`
+    # then we can't start the sequence more than that number of characters
+    # from the end of the string.
+    # For example, if n_a = 7 and min_subsequence_length = 6 then
+    # this loop should only consider start_a = {0, 1} but not any value
+    # higher than that.
+    for start_a in range(n_a - min_subsequence_length + 1):
+        # Similarly, only consider end indices for the subsequence of `a`
+        # which span the minimum number of characters required.
+        # For example, if n_a = 7, min_subsequence_length = 6, start_a = 1
+        # then this loop should only consider end_a = 7.
+        for end_a in range(start_a + min_subsequence_length, n_a + 1):
+            subseq_a = a[start_a:end_a]
+            n_subseq_a = len(subseq_a)
+            n_trimmed_a = n_a - n_subseq_a
+            # consider all subsequences of the second string of the same length
+            # as the subsequence extracted from the first string
+            for start_b in range(n_b - n_subseq_a + 1):
+                subseq_b = b[start_b:start_b + n_subseq_a]
+                n_subseq_b = len(subseq_b)
+                assert n_subseq_a == n_subseq_b
+                n_trimmed_b = n_b - n_subseq_b
+                # now that we have two subsequences of the same length,
+                # count up the number of mismatching characters between them
+                n_mismatches = sum([
+                    a_char != b_char
+                    for (a_char, b_char)
+                    in zip(subseq_a, subseq_b)])
+                score = n_trimmed_a + n_trimmed_b + n_mismatches
+                if score < best_score:
+                    best_score = score
+    return best_score
diff --git a/isovar/allele_counts.py b/isovar/allele_counts.py
diff --git a/isovar/allele_read.py b/isovar/allele_read.py
@@ -0,0 +1,108 @@
+# Copyright (c) 2016-2019. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Reads overlapping a locus of interest split into prefix,
+allele (ref, alt, or otherwise), and suffix portions
+"""
+
+from __future__ import print_function, division, absolute_import
+import logging
+
+from .string_helpers import convert_from_bytes_if_necessary, trim_N_nucleotides
+from .value_object import ValueObject
+
+logger = logging.getLogger(__name__)
+
+
+class AlleleRead(ValueObject):
+    """
+    Extremely simplified representation of a read at a locus: just the allele
+    at the locus and sequence before/after. We're ignoring the base qualities
+    and any additional information about splicing, clipping or alignment.
+    """
+    __slots__ = ["prefix", "allele", "suffix", "name", "sequence"]
+
+    def __init__(self, prefix, allele, suffix, name):
+        self.prefix = prefix
+        self.allele = allele
+        self.suffix = suffix
+        self.name = name
+        self.sequence = prefix + allele + suffix
+
+    def __len__(self):
+        return len(self.sequence)
+
+    @classmethod
+    def from_locus_read(cls, locus_read):
+        """
+        Given a single LocusRead object, return either an AlleleRead or None
+
+        Parameters
+        ----------
+        locus_read : LocusRead
+            Read which overlaps a variant locus but doesn't necessarily contain the
+            alternate nucleotides
+        """
+        sequence = locus_read.sequence
+        read_name = locus_read.name
+
+        reference_base0_start_inclusive = locus_read.reference_base0_start_inclusive
+        reference_base0_end_exclusive = locus_read.reference_base0_end_exclusive
+
+        read_base0_start_inclusive = locus_read.read_base0_start_inclusive
+        read_base0_end_exclusive = locus_read.read_base0_end_exclusive
+
+        if read_base0_start_inclusive is None or read_base0_end_exclusive is None:
+            logger.debug(
+                "Skipping read '%s' because required bases in reference interval %s:%s aren't mapped",
+                read_name,
+                reference_base0_start_inclusive,
+                reference_base0_end_exclusive)
+            return None
+
+        reference_positions = locus_read.reference_positions
+
+        n_ref_bases = reference_base0_end_exclusive - reference_base0_start_inclusive
+
+        insertion = (n_ref_bases == 0)
+
+        if insertion:
+            # insertions require a sequence of non-aligned bases
+            # followed by the subsequence reference position
+            for read_index in range(read_base0_start_inclusive, read_base0_end_exclusive):
+                # all the inserted nucleotides should *not* align to the reference
+                if reference_positions[read_index] is not None:
+                    logger.debug(
+                        "Skipping read '%s', inserted nucleotides shouldn't map to reference",
+                        read_name)
+                    return None
+
+        nucleotides_at_variant_locus = convert_from_bytes_if_necessary(
+            sequence[read_base0_start_inclusive:read_base0_end_exclusive])
+
+        if "N" in nucleotides_at_variant_locus:
+            logger.debug(
+                "Skipping read '%s', found N nucleotides at variant locus",
+                read_name)
+        prefix = convert_from_bytes_if_necessary(sequence[:read_base0_start_inclusive])
+        suffix = convert_from_bytes_if_necessary(sequence[read_base0_end_exclusive:])
+
+        prefix, suffix = trim_N_nucleotides(prefix, suffix)
+
+        return AlleleRead(
+            prefix,
+            nucleotides_at_variant_locus,
+            suffix,
+            name=read_name)