SQANTI filters (MatthiasLienhard#8)

* Added types, variable renames, refactored functions. They improve code readability and auto-completion in IDEs. I doubt that they are complete yet; some are specifically marked as "Any" because I wasn't certain. Renamed abbreviated variables, which increases code readability. Follow-up to the previous commits. Refactored `has_overlap` and `get_intersects` in _utils.py; especially the latter one was unnecessarily complex. * version number * Types, error handling, variable names, splice_bubbles tweaked; moved from numeric type ids in _find_splice_bubbles_at_position to readable strings. TSS and PAS are broken for this one I think, left two TODO comments * SQANTI support import, new filters, filtered coordination test, more types * Changed TSS/PAS events from 1 vs All to 1 vs 1 * Fix gene track without annotation * Fix swapped 3' and 5' fragment * Typos, variable renames and removing unused code * Quote type
IceFreez3r · Oct 4, 2024 · 1c6cd0a · 1c6cd0a
1 parent 584e282
commit 1c6cd0a
Show file tree

Hide file tree

Showing 11 changed files with 550 additions and 358 deletions.
diff --git a/VERSION.txt b/VERSION.txt
@@ -1 +1 @@
-0.3.5_rc10
+0.3.5_rc11
diff --git a/src/isotools/_gene_plots.py b/src/isotools/_gene_plots.py
@@ -462,7 +462,7 @@ def gene_track(self, ax=None, title=None, reference=True, select_transcripts=Non
             blocked[i] = transcript_end
 
         # use SQANTI color palette if colorbySqanti is True
-        if colorbySqanti:
+        if colorbySqanti and 'annotation' in transcript:
             color = sqanti_palette[transcript['annotation'][0]]['color']
 
         # line from TSS to PAS at 0.25

diff --git a/src/isotools/_transcriptome_filter.py b/src/isotools/_transcriptome_filter.py
@@ -3,6 +3,11 @@
 import logging
 import re
 from ._utils import _filter_function, DEFAULT_KOZAK_PWM
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from .transcriptome import Transcriptome
+    from .gene import Gene
 
 logger = logging.getLogger('isotools')
 BOOL_OP = {'and', 'or', 'not', 'is'}
@@ -30,6 +35,10 @@
     'PERMISSIVE': 'gene.coverage.sum(0)[transcript_id] >= 2 and (FSM or not (RTTS or INTERNAL_PRIMING or FRAGMENT))',
     'BALANCED': 'gene.coverage.sum(0)[transcript_id] >= 2 and (FSM or (HIGH_COVER and not (RTTS or FRAGMENT or INTERNAL_PRIMING)))',
     'STRICT': 'gene.coverage.sum(0)[transcript_id] >= 7 and SUBSTANTIAL and (FSM or not (RTTS or FRAGMENT or INTERNAL_PRIMING))',
+    'CAGE_SUPPORT': 'sqanti_classification is not None and sqanti_classification["within_CAGE_peak"]',
+    'TSS_RATIO': 'sqanti_classification is not None and sqanti_classification["ratio_TSS"] > 1.5',
+    'POLYA_MOTIF': 'sqanti_classification is not None and sqanti_classification["polyA_motif_found"]',
+    'POLYA_SITE': 'sqanti_classification is not None and sqanti_classification["within_polyA_site"]',
 }
 
 SPLICE_CATEGORY = ['FSM', 'ISM', 'NIC', 'NNC', 'NOVEL']
@@ -92,7 +101,7 @@ def add_orf_prediction(self, genome_fn, progress_bar=True, filter_transcripts={}
                                   get_fickett=fickett_score, kozak_matrix=kozak_matrix, coding_hexamers=coding, noncoding_hexamers=noncoding)
 
 
-def add_qc_metrics(self, genome_fn, progress_bar=True, downstream_a_len=30, direct_repeat_wd=15, direct_repeat_wobble=2, direct_repeat_mm=2,
+def add_qc_metrics(self: 'Transcriptome', genome_fn: str, progress_bar=True, downstream_a_len=30, direct_repeat_wd=15, direct_repeat_wobble=2, direct_repeat_mm=2,
                    unify_ends=True):
     ''' Retrieves QC metrics for the transcripts.
 
@@ -186,7 +195,7 @@ def add_filter(self, tag, expression, context='transcript', update=False):
     self.filter[context][tag] = expression
 
 
-def iter_genes(self, region=None, query=None, min_coverage=None, max_coverage=None, gois=None, progress_bar=False):
+def iter_genes(self: 'Transcriptome', region=None, query=None, min_coverage=None, max_coverage=None, gois=None, progress_bar=False):
     '''Iterates over the genes of a region, optionally applying filters.
 
     :param region: The region to be considered. Either a string "chr:start-end", or a tuple (chr, start, end). Start and end is optional.
@@ -249,7 +258,7 @@ def iter_genes(self, region=None, query=None, min_coverage=None, max_coverage=No
             yield gene
 
 
-def iter_transcripts(self, region=None, query=None, min_coverage=None, max_coverage=None, genewise=False, gois=None, progress_bar=False):
+def iter_transcripts(self: 'Transcriptome', region=None, query=None, min_coverage=None, max_coverage=None, genewise=False, gois=None, progress_bar=False):
     '''Iterates over the transcripts of a region, optionally applying filters.
 
     By default, each iteration returns a 3 Tuple with the gene object, the transcript number and the transcript dictionary.
@@ -297,7 +306,7 @@ def iter_transcripts(self, region=None, query=None, min_coverage=None, max_cover
                 yield gene, i, transcript
 
 
-def iter_ref_transcripts(self, region=None, query=None, genewise=False, gois=None, progress_bar=False):
+def iter_ref_transcripts(self: 'Transcriptome', region=None, query=None, genewise=False, gois=None, progress_bar=False):
     '''Iterates over the reference transcripts of a region, optionally applying filters.
 
     :param region: The region to be considered. Either a string "chr:start-end", or a tuple (chr,start,end). Start and end is optional.
@@ -352,7 +361,7 @@ def _eval_filter_fun(fun, name, **args):
         # return False   #or continue
 
 
-def _filter_transcripts(gene, transcripts, query_fun, filter_fun, g_filter_eval, mincoverage=None, maxcoverage=None):
+def _filter_transcripts(gene: 'Gene', transcripts, query_fun, filter_fun, g_filter_eval, mincoverage=None, maxcoverage=None):
     ''' Iterator over the transcripts of the gene.
 
     Transcripts are specified by lists of flags submitted to the parameters.