-
Notifications
You must be signed in to change notification settings - Fork 13
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #104 from openvax/more-rna-filtering-options
Major rewrite to enable use of newer pysam, fix several bugs, and add filtering options
- Loading branch information
Showing
95 changed files
with
6,090 additions
and
3,393 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -60,3 +60,7 @@ target/ | |
|
||
#Ipython Notebook | ||
.ipynb_checkpoints | ||
|
||
# PyCharm | ||
.idea/ | ||
vcs.xml |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
# Copyright (c) 2019. Mount Sinai School of Medicine | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
|
||
""" | ||
Sequence alignment helpers | ||
""" | ||
|
||
from __future__ import print_function, division, absolute_import | ||
|
||
|
||
def alignment_score(a, b, min_subsequence_length=1): | ||
""" | ||
Number of mismatches between all two input sequences, allows | ||
for trimming of ends of sequences but not insertions or deletions | ||
within the sequences. Number of trimmed amino acids from each sequence | ||
count toward the mismatch total. | ||
Parameters | ||
---------- | ||
a : str | ||
b : str | ||
min_subsequence_length : int | ||
Only consider subsequences which are at least this long. | ||
Returns int | ||
""" | ||
|
||
n_a = len(a) | ||
n_b = len(b) | ||
|
||
# swap a and b if a is longer since the loops below expect the first | ||
# sequence to be shorter. This happens because any subsequence of `a` | ||
# is expected to be of a length that can be found within `b` | ||
if n_a > n_b: | ||
a, b = b, a | ||
n_a, n_b = n_b, n_a | ||
|
||
# compare all subsequences of a and b and count the number of mismatches | ||
# between | ||
best_score = n_a + n_b | ||
|
||
# if we need to make sequence of at least length `min_subsequence_length` | ||
# then we can't start the sequence more than that number of characters | ||
# from the end of the string. | ||
# For example, if n_a = 7 and min_subsequence_length = 6 then | ||
# this loop should only consider start_a = {0, 1} but not any value | ||
# higher than that. | ||
for start_a in range(n_a - min_subsequence_length + 1): | ||
# Similarly, only consider end indices for the subsequence of `a` | ||
# which span the minimum number of characters required. | ||
# For example, if n_a = 7, min_subsequence_length = 6, start_a = 1 | ||
# then this loop should only consider end_a = 7. | ||
for end_a in range(start_a + min_subsequence_length, n_a + 1): | ||
subseq_a = a[start_a:end_a] | ||
n_subseq_a = len(subseq_a) | ||
n_trimmed_a = n_a - n_subseq_a | ||
# consider all subsequences of the second string of the same length | ||
# as the subsequence extracted from the first string | ||
for start_b in range(n_b - n_subseq_a + 1): | ||
subseq_b = b[start_b:start_b + n_subseq_a] | ||
n_subseq_b = len(subseq_b) | ||
assert n_subseq_a == n_subseq_b | ||
n_trimmed_b = n_b - n_subseq_b | ||
# now that we have two subsequences of the same length, | ||
# count up the number of mismatching characters between them | ||
n_mismatches = sum([ | ||
a_char != b_char | ||
for (a_char, b_char) | ||
in zip(subseq_a, subseq_b)]) | ||
score = n_trimmed_a + n_trimmed_b + n_mismatches | ||
if score < best_score: | ||
best_score = score | ||
return best_score |
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
# Copyright (c) 2016-2019. Mount Sinai School of Medicine | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
""" | ||
Reads overlapping a locus of interest split into prefix, | ||
allele (ref, alt, or otherwise), and suffix portions | ||
""" | ||
|
||
from __future__ import print_function, division, absolute_import | ||
import logging | ||
|
||
from .string_helpers import convert_from_bytes_if_necessary, trim_N_nucleotides | ||
from .value_object import ValueObject | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class AlleleRead(ValueObject): | ||
""" | ||
Extremely simplified representation of a read at a locus: just the allele | ||
at the locus and sequence before/after. We're ignoring the base qualities | ||
and any additional information about splicing, clipping or alignment. | ||
""" | ||
__slots__ = ["prefix", "allele", "suffix", "name", "sequence"] | ||
|
||
def __init__(self, prefix, allele, suffix, name): | ||
self.prefix = prefix | ||
self.allele = allele | ||
self.suffix = suffix | ||
self.name = name | ||
self.sequence = prefix + allele + suffix | ||
|
||
def __len__(self): | ||
return len(self.sequence) | ||
|
||
@classmethod | ||
def from_locus_read(cls, locus_read): | ||
""" | ||
Given a single LocusRead object, return either an AlleleRead or None | ||
Parameters | ||
---------- | ||
locus_read : LocusRead | ||
Read which overlaps a variant locus but doesn't necessarily contain the | ||
alternate nucleotides | ||
""" | ||
sequence = locus_read.sequence | ||
read_name = locus_read.name | ||
|
||
reference_base0_start_inclusive = locus_read.reference_base0_start_inclusive | ||
reference_base0_end_exclusive = locus_read.reference_base0_end_exclusive | ||
|
||
read_base0_start_inclusive = locus_read.read_base0_start_inclusive | ||
read_base0_end_exclusive = locus_read.read_base0_end_exclusive | ||
|
||
if read_base0_start_inclusive is None or read_base0_end_exclusive is None: | ||
logger.debug( | ||
"Skipping read '%s' because required bases in reference interval %s:%s aren't mapped", | ||
read_name, | ||
reference_base0_start_inclusive, | ||
reference_base0_end_exclusive) | ||
return None | ||
|
||
reference_positions = locus_read.reference_positions | ||
|
||
n_ref_bases = reference_base0_end_exclusive - reference_base0_start_inclusive | ||
|
||
insertion = (n_ref_bases == 0) | ||
|
||
if insertion: | ||
# insertions require a sequence of non-aligned bases | ||
# followed by the subsequence reference position | ||
for read_index in range(read_base0_start_inclusive, read_base0_end_exclusive): | ||
# all the inserted nucleotides should *not* align to the reference | ||
if reference_positions[read_index] is not None: | ||
logger.debug( | ||
"Skipping read '%s', inserted nucleotides shouldn't map to reference", | ||
read_name) | ||
return None | ||
|
||
nucleotides_at_variant_locus = convert_from_bytes_if_necessary( | ||
sequence[read_base0_start_inclusive:read_base0_end_exclusive]) | ||
|
||
if "N" in nucleotides_at_variant_locus: | ||
logger.debug( | ||
"Skipping read '%s', found N nucleotides at variant locus", | ||
read_name) | ||
prefix = convert_from_bytes_if_necessary(sequence[:read_base0_start_inclusive]) | ||
suffix = convert_from_bytes_if_necessary(sequence[read_base0_end_exclusive:]) | ||
|
||
prefix, suffix = trim_N_nucleotides(prefix, suffix) | ||
|
||
return AlleleRead( | ||
prefix, | ||
nucleotides_at_variant_locus, | ||
suffix, | ||
name=read_name) |
Oops, something went wrong.