Skip to content

Commit

Permalink
Merge pull request #104 from openvax/more-rna-filtering-options
Browse files Browse the repository at this point in the history
Major rewrite to enable use of newer pysam, fix several bugs, and add filtering options
  • Loading branch information
iskandr authored Jun 12, 2019
2 parents b39b684 + fbc01de commit e6a870e
Show file tree
Hide file tree
Showing 95 changed files with 6,090 additions and 3,393 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,7 @@ target/

#Ipython Notebook
.ipynb_checkpoints

# PyCharm
.idea/
vcs.xml
6 changes: 4 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,10 @@ before_script:
--custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCm38.87/
script:
- >
nosetests test --with-coverage --cover-package=isovar &&
nosetests openvax-integration-tests/test
nosetests test --with-coverage --cover-package=isovar
# removing integration tests until Vaxrank is updated to use new API
# &&
# nosetests openvax-integration-tests/test
after_success:
coveralls
deploy:
Expand Down
395 changes: 362 additions & 33 deletions README.md

Large diffs are not rendered by default.

34 changes: 32 additions & 2 deletions isovar/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2016-2018. Mount Sinai School of Medicine
# Copyright (c) 2016-2019. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -14,4 +14,34 @@

from __future__ import print_function, division, absolute_import

__version__ = "0.9.0"
__version__ = "1.0.0"


from .allele_read import AlleleRead
from .dataframe_helpers import isovar_results_to_dataframe
from .isovar_result import IsovarResult
from .locus_read import LocusRead
from .main import run_isovar
from .protein_sequence import ProteinSequence
from .protein_sequence_creator import ProteinSequenceCreator
from .read_collector import ReadCollector
from .read_evidence import ReadEvidence
from .variant_orf import VariantORF
from .variant_sequence import VariantSequence
from .variant_sequence_creator import VariantSequenceCreator


__all__ = [
"run_isovar",
"isovar_results_to_dataframe",
"AlleleRead",
"IsovarResult",
"LocusRead",
"ProteinSequence",
"ProteinSequenceCreator",
"ReadCollector",
"ReadEvidence",
"VariantORF",
"VariantSequence",
"VariantSequenceCreator",
]
86 changes: 86 additions & 0 deletions isovar/alignment_score.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# Copyright (c) 2019. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


"""
Sequence alignment helpers
"""

from __future__ import print_function, division, absolute_import


def alignment_score(a, b, min_subsequence_length=1):
"""
Number of mismatches between all two input sequences, allows
for trimming of ends of sequences but not insertions or deletions
within the sequences. Number of trimmed amino acids from each sequence
count toward the mismatch total.
Parameters
----------
a : str
b : str
min_subsequence_length : int
Only consider subsequences which are at least this long.
Returns int
"""

n_a = len(a)
n_b = len(b)

# swap a and b if a is longer since the loops below expect the first
# sequence to be shorter. This happens because any subsequence of `a`
# is expected to be of a length that can be found within `b`
if n_a > n_b:
a, b = b, a
n_a, n_b = n_b, n_a

# compare all subsequences of a and b and count the number of mismatches
# between
best_score = n_a + n_b

# if we need to make sequence of at least length `min_subsequence_length`
# then we can't start the sequence more than that number of characters
# from the end of the string.
# For example, if n_a = 7 and min_subsequence_length = 6 then
# this loop should only consider start_a = {0, 1} but not any value
# higher than that.
for start_a in range(n_a - min_subsequence_length + 1):
# Similarly, only consider end indices for the subsequence of `a`
# which span the minimum number of characters required.
# For example, if n_a = 7, min_subsequence_length = 6, start_a = 1
# then this loop should only consider end_a = 7.
for end_a in range(start_a + min_subsequence_length, n_a + 1):
subseq_a = a[start_a:end_a]
n_subseq_a = len(subseq_a)
n_trimmed_a = n_a - n_subseq_a
# consider all subsequences of the second string of the same length
# as the subsequence extracted from the first string
for start_b in range(n_b - n_subseq_a + 1):
subseq_b = b[start_b:start_b + n_subseq_a]
n_subseq_b = len(subseq_b)
assert n_subseq_a == n_subseq_b
n_trimmed_b = n_b - n_subseq_b
# now that we have two subsequences of the same length,
# count up the number of mismatching characters between them
n_mismatches = sum([
a_char != b_char
for (a_char, b_char)
in zip(subseq_a, subseq_b)])
score = n_trimmed_a + n_trimmed_b + n_mismatches
if score < best_score:
best_score = score
return best_score
59 changes: 0 additions & 59 deletions isovar/allele_counts.py

This file was deleted.

108 changes: 108 additions & 0 deletions isovar/allele_read.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# Copyright (c) 2016-2019. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Reads overlapping a locus of interest split into prefix,
allele (ref, alt, or otherwise), and suffix portions
"""

from __future__ import print_function, division, absolute_import
import logging

from .string_helpers import convert_from_bytes_if_necessary, trim_N_nucleotides
from .value_object import ValueObject

logger = logging.getLogger(__name__)


class AlleleRead(ValueObject):
"""
Extremely simplified representation of a read at a locus: just the allele
at the locus and sequence before/after. We're ignoring the base qualities
and any additional information about splicing, clipping or alignment.
"""
__slots__ = ["prefix", "allele", "suffix", "name", "sequence"]

def __init__(self, prefix, allele, suffix, name):
self.prefix = prefix
self.allele = allele
self.suffix = suffix
self.name = name
self.sequence = prefix + allele + suffix

def __len__(self):
return len(self.sequence)

@classmethod
def from_locus_read(cls, locus_read):
"""
Given a single LocusRead object, return either an AlleleRead or None
Parameters
----------
locus_read : LocusRead
Read which overlaps a variant locus but doesn't necessarily contain the
alternate nucleotides
"""
sequence = locus_read.sequence
read_name = locus_read.name

reference_base0_start_inclusive = locus_read.reference_base0_start_inclusive
reference_base0_end_exclusive = locus_read.reference_base0_end_exclusive

read_base0_start_inclusive = locus_read.read_base0_start_inclusive
read_base0_end_exclusive = locus_read.read_base0_end_exclusive

if read_base0_start_inclusive is None or read_base0_end_exclusive is None:
logger.debug(
"Skipping read '%s' because required bases in reference interval %s:%s aren't mapped",
read_name,
reference_base0_start_inclusive,
reference_base0_end_exclusive)
return None

reference_positions = locus_read.reference_positions

n_ref_bases = reference_base0_end_exclusive - reference_base0_start_inclusive

insertion = (n_ref_bases == 0)

if insertion:
# insertions require a sequence of non-aligned bases
# followed by the subsequence reference position
for read_index in range(read_base0_start_inclusive, read_base0_end_exclusive):
# all the inserted nucleotides should *not* align to the reference
if reference_positions[read_index] is not None:
logger.debug(
"Skipping read '%s', inserted nucleotides shouldn't map to reference",
read_name)
return None

nucleotides_at_variant_locus = convert_from_bytes_if_necessary(
sequence[read_base0_start_inclusive:read_base0_end_exclusive])

if "N" in nucleotides_at_variant_locus:
logger.debug(
"Skipping read '%s', found N nucleotides at variant locus",
read_name)
prefix = convert_from_bytes_if_necessary(sequence[:read_base0_start_inclusive])
suffix = convert_from_bytes_if_necessary(sequence[read_base0_end_exclusive:])

prefix, suffix = trim_N_nucleotides(prefix, suffix)

return AlleleRead(
prefix,
nucleotides_at_variant_locus,
suffix,
name=read_name)
Loading

0 comments on commit e6a870e

Please sign in to comment.