From 3727d7258e44074d13cfd49d065ebdb0a880d986 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Tue, 26 Dec 2023 13:32:25 +0300 Subject: [PATCH 01/10] move threshold --- isoquant.py | 1 + src/dataset_processor.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/isoquant.py b/isoquant.py index 0d89a940..f876cb79 100755 --- a/isoquant.py +++ b/isoquant.py @@ -673,6 +673,7 @@ def set_additional_params(args): args.needs_reference = False args.simple_models_mapq_cutoff = 30 + args.polya_percentage_threshold = 0.7 def run_pipeline(args): diff --git a/src/dataset_processor.py b/src/dataset_processor.py index 2e5099e5..9388f29b 100644 --- a/src/dataset_processor.py +++ b/src/dataset_processor.py @@ -406,7 +406,7 @@ def process_sample(self, sample): polya_fraction = polya_found / total_alignments if total_alignments > 0 else 0.0 logger.info("Total alignments processed: %d, polyA tail detected in %d (%.1f%%)" % (total_alignments, polya_found, polya_fraction * 100.0)) - self.args.needs_polya_for_construction = polya_fraction >= 0.7 + self.args.needs_polya_for_construction = polya_fraction >= self.args.polya_percentage_threshold self.process_assigned_reads(sample, saves_file) if not self.args.read_assignments and not self.args.keep_tmp: From a1bac778f569c7c37044abff9bc2e2c57edf9c4f Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Tue, 26 Dec 2023 13:51:16 +0300 Subject: [PATCH 02/10] add condition for reporting novel monointronic transcripts --- isoquant.py | 22 ++++++++++++---------- src/dataset_processor.py | 13 ++++++++++++- src/graph_based_model_construction.py | 8 ++++---- 3 files changed, 28 insertions(+), 15 deletions(-) diff --git a/isoquant.py b/isoquant.py index f876cb79..77353912 100755 --- a/isoquant.py +++ b/isoquant.py @@ -501,7 +501,7 @@ def set_data_dependent_options(args): args.splice_correction_strategy = splice_correction_strategies[args.data_type] args.resolve_ambiguous = 'monoexon_and_fsm' if args.fl_data else 'default' - args.needs_polya_for_construction = False + args.requires_polya_for_construction = False if args.read_group is None and args.input_data.has_replicas(): args.read_group = "file_name" args.use_technical_replicas = args.read_group == "file_name" @@ -591,16 +591,16 @@ def set_model_construction_options(args): 'min_known_count', 'min_nonfl_count', 'min_novel_count', 'min_novel_count_rel', 'min_mono_count_rel', 'singleton_adjacent_cov', - 'fl_only', 'novel_monoexonic')) + 'fl_only', 'novel_monoexonic', 'require_monointronic_polya')) strategies = { - 'reliable': ModelConstructionStrategy(2, 0.5, 20, 5, 0.05, 1, 0.1, 0.1, 2, 4, 8, 0.05, 0.05, 50, True, False), - 'default_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 0.05, 1, 2, 2, 0.02, 0.005, 100, False, True), - 'sensitive_pacbio':ModelConstructionStrategy(1, 0.5, 5, 2, 0.005, 1, 0.01, 0.02, 1, 2, 2, 0.005, 0.001, 100, False, True), - 'default_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.02, 1, 0.05, 0.05, 1, 3, 3, 0.02, 0.02, 10, False, False), - 'sensitive_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.005, 1, 0.01, 0.02, 1, 2, 3, 0.005, 0.005, 10, False, True), - 'fl_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 0.01, 1, 2, 3, 0.02, 0.005, 100, True, True), - 'all': ModelConstructionStrategy(0, 0.3, 5, 1, 0.002, 1, 0.01, 0.01, 1, 1, 1, 0.002, 0.001, 500, False, True), - 'assembly': ModelConstructionStrategy(0, 0.3, 5, 1, 0.05, 1, 0.01, 0.02, 1, 1, 1, 0.05, 0.01, 50, False, True) + 'reliable': ModelConstructionStrategy(2, 0.5, 20, 5, 0.05, 1, 
0.1, 0.1, 2, 4, 8, 0.05, 0.05, 50, True, False, True), + 'default_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 0.05, 1, 2, 2, 0.02, 0.005, 100, False, True, False), + 'sensitive_pacbio':ModelConstructionStrategy(1, 0.5, 5, 2, 0.005, 1, 0.01, 0.02, 1, 2, 2, 0.005, 0.001, 100, False, True, False), + 'default_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.02, 1, 0.05, 0.05, 1, 3, 3, 0.02, 0.02, 10, False, False, True), + 'sensitive_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.005, 1, 0.01, 0.02, 1, 2, 3, 0.005, 0.005, 10, False, True, False), + 'fl_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 0.01, 1, 2, 3, 0.02, 0.005, 100, True, True, True), + 'all': ModelConstructionStrategy(0, 0.3, 5, 1, 0.002, 1, 0.01, 0.01, 1, 1, 1, 0.002, 0.001, 500, False, True, False), + 'assembly': ModelConstructionStrategy(0, 0.3, 5, 1, 0.05, 1, 0.01, 0.02, 1, 1, 1, 0.05, 0.01, 50, False, True, False) } strategy = strategies[args.model_construction_strategy] @@ -633,6 +633,8 @@ def set_model_construction_options(args): logger.info("Novel unspliced transcripts will not be reported, " "set --report_novel_unspliced true to discover them") + args.require_monointronic_polya = strategy.require_monointronic_polya + def set_configs_directory(args): config_dir = os.path.join(os.environ['HOME'], '.config', 'IsoQuant') diff --git a/src/dataset_processor.py b/src/dataset_processor.py index 9388f29b..b8d17fe7 100644 --- a/src/dataset_processor.py +++ b/src/dataset_processor.py @@ -406,7 +406,9 @@ def process_sample(self, sample): polya_fraction = polya_found / total_alignments if total_alignments > 0 else 0.0 logger.info("Total alignments processed: %d, polyA tail detected in %d (%.1f%%)" % (total_alignments, polya_found, polya_fraction * 100.0)) - self.args.needs_polya_for_construction = polya_fraction >= self.args.polya_percentage_threshold + self.args.requires_polya_for_construction = polya_fraction >= self.args.polya_percentage_threshold + # do not require polyA tails for mono-intronic only if the data is reliable and polyA percentage is low + self.args.require_monointronic_polya = self.args.require_monointronic_polya or self.args.requires_polya_for_construction self.process_assigned_reads(sample, saves_file) if not self.args.read_assignments and not self.args.keep_tmp: @@ -510,6 +512,15 @@ def process_assigned_reads(self, sample, dump_filename): reverse=True ) logger.info("Processing assigned reads " + sample.prefix) + logger.info("Transcript construction options:") + logger.info(" Novel monoexonic transcripts will be reported: %s" % + "yes" if self.args.report_novel_unspliced else "no") + logger.info(" Presence of polyA tail is required for a multi-exon transcript to be reported: %s" + % "yes" if self.args.requires_polya_for_construction else "no") + logger.info(" Presence of polyA tail is required for a 2-exon transcript to be reported: %s" + % "yes" if self.args.require_monointronic_polya else "no") + logger.info(" Report transcript for which strand cannot be detected using canonical splice sites: %s" + % "yes" if self.args.report_unstranded else "no") # set up aggregators and outputs aggregator = ReadAssignmentAggregator(self.args, sample, self.all_read_groups) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index 8b7b3820..7bbee789 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -376,7 +376,7 @@ def construct_fl_isoforms(self): if count < novel_isoform_cutoff: # logger.debug("uuu 
Novel isoform %s has low coverage: %d\t%d" % (new_transcript_id, count, novel_isoform_cutoff))
             pass
-        elif len(novel_exons) == 2 and (not polya_site or transcript_ss_strand == '.'):
+        elif len(novel_exons) == 2 and self.params.require_monointronic_polya and not polya_site:
             # logger.debug("uuu Avoiding single intron %s isoform: %d\t%s" % (new_transcript_id, count, str(path)))
             pass
         elif transcript_strand == '.' and not self.params.report_unstranded:
@@ -468,13 +468,13 @@ def construct_assignment_based_isoforms(self, read_assignment_storage):
                 # (read_assignment.read_id, refrenence_isoform_id))
                 spliced_isoform_reads[refrenence_isoform_id].append(read_assignment)

-                if self.params.needs_polya_for_construction and self.gene_info.isoform_strands[refrenence_isoform_id] == '-':
+                if self.params.requires_polya_for_construction and self.gene_info.isoform_strands[refrenence_isoform_id] == '-':
                     if any(x.event_type == MatchEventSubtype.correct_polya_site_left for x in events):
                         isoform_left_support[refrenence_isoform_id] += 1
                     elif abs(self.gene_info.all_isoforms_exons[refrenence_isoform_id][0][0] - read_assignment.corrected_exons[0][0]) <= self.params.apa_delta:
                         isoform_left_support[refrenence_isoform_id] += 1
-                if self.params.needs_polya_for_construction and self.gene_info.isoform_strands[refrenence_isoform_id] == '+':
+                if self.params.requires_polya_for_construction and self.gene_info.isoform_strands[refrenence_isoform_id] == '+':
                     if any(x.event_type == MatchEventSubtype.correct_polya_site_right for x in events):
                         isoform_right_support[refrenence_isoform_id] += 1
                     elif abs(self.gene_info.all_isoforms_exons[refrenence_isoform_id][-1][1] - read_assignment.corrected_exons[-1][1]) <= self.params.apa_delta:
@@ -761,7 +761,7 @@ def fill(self, read_assignments):
                 path_tuple = tuple(intron_path)
                 self.paths[path_tuple] += 1
                 if terminal_vertex and starting_vertex:
-                    if not self.params.needs_polya_for_construction or\
+                    if not self.params.requires_polya_for_construction or\
                             (terminal_vertex[0] == VERTEX_polya or starting_vertex[0] == VERTEX_polyt):
                         self.fl_paths.add(path_tuple)
                         self.paths_to_reads[path_tuple].append(a)

From 8088fa2eeb0d82ffd9a29042d0176cf99f62bdb9 Mon Sep 17 00:00:00 2001
From: Andrey Prjibelski
Date: Tue, 26 Dec 2023 15:50:39 +0300
Subject: [PATCH 03/10] add polya strategy option

---
 README.md                | 11 ++++++++
 isoquant.py              | 19 ++++++++------
 src/dataset_processor.py | 54 ++++++++++++++++++++++++++++++----------
 3 files changed, 64 insertions(+), 20 deletions(-)

diff --git a/README.md b/README.md
index 5793368e..075ce0f8 100644
--- a/README.md
+++ b/README.md
@@ -105,6 +105,9 @@ IsoQuant support all kinds of long RNA data:
 Reads must be provided in FASTQ or FASTA format (can be gzipped).
 If you have already aligned your reads to the reference genome, simply provide sorted and indexed BAM files.
+IsoQuant expects reads to contain polyA tails. For more reliable transcript model construction, do not trim polyA tails.
+
+
 ## Supported reference data
@@ -519,6 +522,14 @@ The main explanation that some aligner report a lot of false unspliced alignment
for ONT reads.
+`--polya_requirement` Strategy for using polyA tails during transcript model construction, should be one of: + +* `default` - polyA tails are required if at least 70% of the reads have polyA tail; +polyA tails are always required for 1/2-exon transcripts when using ONT data (this is caused by elevated number of false 1/2-exonic alignments reported by minimap2); +* `never` - polyA tails are never required; use this option **at your own risk** as it may noticeably increase false discovery rate, especially for ONT data; +* `always` - reported transcripts are always required to have polyA support in the reads. + + ### Hidden options diff --git a/isoquant.py b/isoquant.py index 77353912..22ce3ca8 100755 --- a/isoquant.py +++ b/isoquant.py @@ -29,7 +29,7 @@ NANOPORE_DATA, DataSetReadMapper ) -from src.dataset_processor import DatasetProcessor +from src.dataset_processor import DatasetProcessor, PolyAUsageStrategies from src.long_read_assigner import AmbiguityResolvingMethod from src.long_read_counter import COUNTING_STRATEGIES from src.input_data_storage import InputDataStorage @@ -151,6 +151,10 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help add_additional_option("--report_novel_unspliced", "-u", type=bool_str, help="report novel monoexonic transcripts (true/false), " "default: False for ONT, True for other data types") + add_additional_option("--polya_requirement", type=str, choices=[e.name for e in PolyAUsageStrategies], + help="require polyA tails to be present when reporting transcripts (default/never/always), " + "default: require polyA only when polyA percentage is >= 70%", + default=PolyAUsageStrategies.default.name) # OUTPUT PROPERTIES pipeline_args_group.add_argument("--threads", "-t", help="number of threads to use", type=int, default="16") @@ -271,11 +275,12 @@ def check_and_load_args(args, parser): logger.warning("Output folder already contains a previous run, will be overwritten.") else: logger.warning("Output folder already contains a previous run, some files may be overwritten. " - "Use --resume to resume a failed run. Use --force to avoid this message. " - "Press Ctrl+C to interrupt the run now.") + "Use --resume to resume a failed run. 
Use --force to avoid this message.") + logger.warning("Press Ctrl+C to interrupt the run now.") delay = 9 for i in range(delay): - sys.stdout.write("Resuming the run in %d seconds\r" % (delay - i)) + countdown = delay - i + sys.stdout.write("Resuming the run in %d second%s\r" % (countdown, "s" if countdown > 1 else "")) time.sleep(1) logger.info("Overwriting the previous run") time.sleep(1) @@ -598,7 +603,7 @@ def set_model_construction_options(args): 'sensitive_pacbio':ModelConstructionStrategy(1, 0.5, 5, 2, 0.005, 1, 0.01, 0.02, 1, 2, 2, 0.005, 0.001, 100, False, True, False), 'default_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.02, 1, 0.05, 0.05, 1, 3, 3, 0.02, 0.02, 10, False, False, True), 'sensitive_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.005, 1, 0.01, 0.02, 1, 2, 3, 0.005, 0.005, 10, False, True, False), - 'fl_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 0.01, 1, 2, 3, 0.02, 0.005, 100, True, True, True), + 'fl_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 0.01, 1, 2, 3, 0.02, 0.005, 100, True, True, False), 'all': ModelConstructionStrategy(0, 0.3, 5, 1, 0.002, 1, 0.01, 0.01, 1, 1, 1, 0.002, 0.001, 500, False, True, False), 'assembly': ModelConstructionStrategy(0, 0.3, 5, 1, 0.05, 1, 0.01, 0.02, 1, 1, 1, 0.05, 0.01, 50, False, True, False) } @@ -634,6 +639,7 @@ def set_model_construction_options(args): "set --report_novel_unspliced true to discover them") args.require_monointronic_polya = strategy.require_monointronic_polya + args.polya_requirement_strategy = PolyAUsageStrategies[args.polya_requirement] def set_configs_directory(args): @@ -658,8 +664,6 @@ def set_additional_params(args): set_splice_correction_options(args) args.print_additional_info = True - args.no_polya = False - args.indel_near_splice_site_dist = 10 args.upstream_region_len = 20 @@ -676,6 +680,7 @@ def set_additional_params(args): args.simple_models_mapq_cutoff = 30 args.polya_percentage_threshold = 0.7 + args.low_polya_percentage_threshold = 0.1 def run_pipeline(args): diff --git a/src/dataset_processor.py b/src/dataset_processor.py index b8d17fe7..fa9b3472 100644 --- a/src/dataset_processor.py +++ b/src/dataset_processor.py @@ -10,7 +10,7 @@ import logging import os import shutil -import time +from enum import Enum, unique from collections import defaultdict from concurrent.futures import ProcessPoolExecutor @@ -72,6 +72,22 @@ def clean_locks(chr_ids, base_name, fname_function): os.remove(fname) +@unique +class PolyAUsageStrategies(Enum): + default = 1 + never = 2 + always = 3 + + +def set_polya_requirement_strategy(flag, polya_requirement_strategy): + if polya_requirement_strategy == PolyAUsageStrategies.default: + return flag + elif polya_requirement_strategy == PolyAUsageStrategies.never: + return False + else: + return True + + def collect_reads_in_parallel(sample, chr_id, args): current_chr_record = Fasta(args.reference)[chr_id] if not args.low_memory: @@ -406,9 +422,18 @@ def process_sample(self, sample): polya_fraction = polya_found / total_alignments if total_alignments > 0 else 0.0 logger.info("Total alignments processed: %d, polyA tail detected in %d (%.1f%%)" % (total_alignments, polya_found, polya_fraction * 100.0)) - self.args.requires_polya_for_construction = polya_fraction >= self.args.polya_percentage_threshold - # do not require polyA tails for mono-intronic only if the data is reliable and polyA percentage is low - self.args.require_monointronic_polya = self.args.require_monointronic_polya or self.args.requires_polya_for_construction + if 
polya_fraction < self.args.low_polya_percentage_threshold: + logger.warning("PolyA percentage is suspiciously low. IsoQuant expects non-polya-trimmed reads. " + "If you aim to construct transcript models, consider using --polya_requirement option.") + + self.args.requires_polya_for_construction = set_polya_requirement_strategy( + polya_fraction >= self.args.polya_percentage_threshold, + self.args.polya_requirement_strategy) + self.args.require_monointronic_polya = set_polya_requirement_strategy( + # do not require polyA tails for mono-intronic only if the data is reliable and polyA percentage is low + self.args.require_monointronic_polya or self.args.requires_polya_for_construction, + self.args.polya_requirement_strategy) + self.process_assigned_reads(sample, saves_file) if not self.args.read_assignments and not self.args.keep_tmp: @@ -512,21 +537,24 @@ def process_assigned_reads(self, sample, dump_filename): reverse=True ) logger.info("Processing assigned reads " + sample.prefix) - logger.info("Transcript construction options:") - logger.info(" Novel monoexonic transcripts will be reported: %s" % - "yes" if self.args.report_novel_unspliced else "no") - logger.info(" Presence of polyA tail is required for a multi-exon transcript to be reported: %s" - % "yes" if self.args.requires_polya_for_construction else "no") - logger.info(" Presence of polyA tail is required for a 2-exon transcript to be reported: %s" - % "yes" if self.args.require_monointronic_polya else "no") - logger.info(" Report transcript for which strand cannot be detected using canonical splice sites: %s" - % "yes" if self.args.report_unstranded else "no") + logger.info(" Transcript models construction is turned %s" % + "off" if self.args.no_model_construction else "on") # set up aggregators and outputs aggregator = ReadAssignmentAggregator(self.args, sample, self.all_read_groups) transcript_stat_counter = EnumStats() if not self.args.no_model_construction: + logger.info("Transcript construction options:") + logger.info(" Novel monoexonic transcripts will be reported: %s" % + "yes" if self.args.report_novel_unspliced else "no") + logger.info(" Presence of polyA tail is required for a multi-exon transcript to be reported: %s" + % "yes" if self.args.requires_polya_for_construction else "no") + logger.info(" Presence of polyA tail is required for a 2-exon transcript to be reported: %s" + % "yes" if self.args.require_monointronic_polya else "no") + logger.info(" Report transcript for which strand cannot be detected using canonical splice sites: %s" + % "yes" if self.args.report_unstranded else "no") + gff_printer = GFFPrinter( sample.out_dir, sample.prefix, self.io_support, header=self.common_header ) From bee9888267a73499a9551781a269eac58aa9dfa3 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Tue, 26 Dec 2023 16:38:37 +0300 Subject: [PATCH 04/10] monoexonic polya strategy --- README.md | 2 ++ isoquant.py | 28 ++++++++++++++++++--------- src/dataset_processor.py | 26 +++++++++++++++---------- src/graph_based_model_construction.py | 4 +++- 4 files changed, 40 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 075ce0f8..dce014ad 100644 --- a/README.md +++ b/README.md @@ -529,6 +529,8 @@ polyA tails are always required for 1/2-exon transcripts when using ONT data (th * `never` - polyA tails are never required; use this option **at your own risk** as it may noticeably increase false discovery rate, especially for ONT data; * `always` - reported transcripts are always required to have polyA support in the 
reads. +Note, that polyA tails are always required for reporting novel unspliced isoforms. + ### Hidden options diff --git a/isoquant.py b/isoquant.py index 22ce3ca8..328eaa3f 100755 --- a/isoquant.py +++ b/isoquant.py @@ -596,16 +596,25 @@ def set_model_construction_options(args): 'min_known_count', 'min_nonfl_count', 'min_novel_count', 'min_novel_count_rel', 'min_mono_count_rel', 'singleton_adjacent_cov', - 'fl_only', 'novel_monoexonic', 'require_monointronic_polya')) + 'fl_only', 'novel_monoexonic', + 'require_monointronic_polya', 'require_monoexonic_polya')) strategies = { - 'reliable': ModelConstructionStrategy(2, 0.5, 20, 5, 0.05, 1, 0.1, 0.1, 2, 4, 8, 0.05, 0.05, 50, True, False, True), - 'default_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 0.05, 1, 2, 2, 0.02, 0.005, 100, False, True, False), - 'sensitive_pacbio':ModelConstructionStrategy(1, 0.5, 5, 2, 0.005, 1, 0.01, 0.02, 1, 2, 2, 0.005, 0.001, 100, False, True, False), - 'default_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.02, 1, 0.05, 0.05, 1, 3, 3, 0.02, 0.02, 10, False, False, True), - 'sensitive_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.005, 1, 0.01, 0.02, 1, 2, 3, 0.005, 0.005, 10, False, True, False), - 'fl_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 0.01, 1, 2, 3, 0.02, 0.005, 100, True, True, False), - 'all': ModelConstructionStrategy(0, 0.3, 5, 1, 0.002, 1, 0.01, 0.01, 1, 1, 1, 0.002, 0.001, 500, False, True, False), - 'assembly': ModelConstructionStrategy(0, 0.3, 5, 1, 0.05, 1, 0.01, 0.02, 1, 1, 1, 0.05, 0.01, 50, False, True, False) + 'reliable': ModelConstructionStrategy(2, 0.5, 20, 5, 0.05, 1, 0.1, 0.1, 2, 4, 8, 0.05, 0.05, 50, + True, False, True, True), + 'default_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 0.05, 1, 2, 2, 0.02, 0.005, 100, + False, True, False, True), + 'sensitive_pacbio':ModelConstructionStrategy(1, 0.5, 5, 2, 0.005, 1, 0.01, 0.02, 1, 2, 2, 0.005, 0.001, 100, + False, True, False, False), + 'default_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.02, 1, 0.05, 0.05, 1, 3, 3, 0.02, 0.02, 10, + False, False, True, True), + 'sensitive_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.005, 1, 0.01, 0.02, 1, 2, 3, 0.005, 0.005, 10, + False, True, False, False), + 'fl_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 0.01, 1, 2, 3, 0.02, 0.005, 100, + True, True, False, False), + 'all': ModelConstructionStrategy(0, 0.3, 5, 1, 0.002, 1, 0.01, 0.01, 1, 1, 1, 0.002, 0.001, 500, + False, True, False, False), + 'assembly': ModelConstructionStrategy(0, 0.3, 5, 1, 0.05, 1, 0.01, 0.02, 1, 1, 1, 0.05, 0.01, 50, + False, True, False, False) } strategy = strategies[args.model_construction_strategy] @@ -639,6 +648,7 @@ def set_model_construction_options(args): "set --report_novel_unspliced true to discover them") args.require_monointronic_polya = strategy.require_monointronic_polya + args.require_monoexonic_polya = strategy.require_monoexonic_polya args.polya_requirement_strategy = PolyAUsageStrategies[args.polya_requirement] diff --git a/src/dataset_processor.py b/src/dataset_processor.py index fa9b3472..5a28062f 100644 --- a/src/dataset_processor.py +++ b/src/dataset_processor.py @@ -433,7 +433,10 @@ def process_sample(self, sample): # do not require polyA tails for mono-intronic only if the data is reliable and polyA percentage is low self.args.require_monointronic_polya or self.args.requires_polya_for_construction, self.args.polya_requirement_strategy) - + self.args.require_monoexonic_polya = 
set_polya_requirement_strategy( + # do not require polyA tails for mono-intronic only if the data is reliable and polyA percentage is low + self.args.require_monoexonic_polya or self.args.requires_polya_for_construction, + self.args.polya_requirement_strategy) self.process_assigned_reads(sample, saves_file) if not self.args.read_assignments and not self.args.keep_tmp: @@ -538,7 +541,7 @@ def process_assigned_reads(self, sample, dump_filename): ) logger.info("Processing assigned reads " + sample.prefix) logger.info(" Transcript models construction is turned %s" % - "off" if self.args.no_model_construction else "on") + ("off" if self.args.no_model_construction else "on")) # set up aggregators and outputs aggregator = ReadAssignmentAggregator(self.args, sample, self.all_read_groups) @@ -546,14 +549,17 @@ def process_assigned_reads(self, sample, dump_filename): if not self.args.no_model_construction: logger.info("Transcript construction options:") - logger.info(" Novel monoexonic transcripts will be reported: %s" % - "yes" if self.args.report_novel_unspliced else "no") - logger.info(" Presence of polyA tail is required for a multi-exon transcript to be reported: %s" - % "yes" if self.args.requires_polya_for_construction else "no") - logger.info(" Presence of polyA tail is required for a 2-exon transcript to be reported: %s" - % "yes" if self.args.require_monointronic_polya else "no") - logger.info(" Report transcript for which strand cannot be detected using canonical splice sites: %s" - % "yes" if self.args.report_unstranded else "no") + logger.info(" Novel monoexonic transcripts will be reported: %s" + % ("yes" if self.args.report_novel_unspliced else "no")) + logger.info(" PolyA tails are required for multi-exon transcripts to be reported: %s" + % ("yes" if self.args.requires_polya_for_construction else "no")) + logger.info(" PolyA tails are required for 2-exon transcripts to be reported: %s" + % ("yes" if self.args.require_monointronic_polya else "no")) + logger.info(" PolyA tails are required for known monoexon transcripts to be reported: %s" + % ("yes" if self.args.require_monoexonic_polya else "no")) + logger.info(" PolyA tails are required for novel monoexon transcripts to be reported: %s" % "yes") + logger.info(" Report transcript for which the strand cannot be detected using canonical splice sites: %s" + % ("yes" if self.args.report_unstranded else "no")) gff_printer = GFFPrinter( sample.out_dir, sample.prefix, self.io_support, header=self.common_header diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index 7bbee789..3c1cc8c0 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -128,6 +128,7 @@ def process(self, read_assignment_storage): self.construct_fl_isoforms() self.construct_assignment_based_isoforms(read_assignment_storage) + # FIXME: assign reads AFTER filtering self.assign_reads_to_models(read_assignment_storage) self.filter_transcripts() @@ -596,7 +597,8 @@ def construct_monoexon_isoforms(self, mono_exon_isoform_reads, mono_exon_isoform polya_support = polya_sites[isoform_id] # logger.debug(">> Monoexon transcript %s: %d\t%d\t%.4f\t%d" % (isoform_id, self.intron_graph.max_coverage, count, coverage, polya_support)) - if count < self.params.min_known_count or coverage < self.params.min_mono_exon_coverage or polya_support == 0: + if (count < self.params.min_known_count or coverage < self.params.min_mono_exon_coverage or + (self.params.require_monoexonic_polya and polya_support == 0)): pass # 
logger.debug(">> Will NOT be added, abs cutoff=%d" % (self.params.min_known_count)) elif isoform_id not in GraphBasedModelConstructor.detected_known_isoforms: new_model = self.transcript_from_reference(isoform_id) From 47db3b434001223e488eb527fc280f839a82dfb1 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Tue, 26 Dec 2023 17:03:11 +0300 Subject: [PATCH 05/10] fix help message --- isoquant.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/isoquant.py b/isoquant.py index 328eaa3f..0dff1955 100755 --- a/isoquant.py +++ b/isoquant.py @@ -153,7 +153,7 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help "default: False for ONT, True for other data types") add_additional_option("--polya_requirement", type=str, choices=[e.name for e in PolyAUsageStrategies], help="require polyA tails to be present when reporting transcripts (default/never/always), " - "default: require polyA only when polyA percentage is >= 70%", + "default: require polyA only when polyA percentage is >= 70%%", default=PolyAUsageStrategies.default.name) # OUTPUT PROPERTIES pipeline_args_group.add_argument("--threads", "-t", help="number of threads to use", type=int, From d9315b50c17014d1f5f1562fd7303b44085a9824 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Wed, 27 Dec 2023 02:08:19 +0300 Subject: [PATCH 06/10] fix SS strand filtering --- src/dataset_processor.py | 2 +- src/graph_based_model_construction.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/dataset_processor.py b/src/dataset_processor.py index 5a28062f..8afe13dd 100644 --- a/src/dataset_processor.py +++ b/src/dataset_processor.py @@ -540,7 +540,7 @@ def process_assigned_reads(self, sample, dump_filename): reverse=True ) logger.info("Processing assigned reads " + sample.prefix) - logger.info(" Transcript models construction is turned %s" % + logger.info("Transcript models construction is turned %s" % ("off" if self.args.no_model_construction else "on")) # set up aggregators and outputs diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index 3c1cc8c0..6520418e 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -377,7 +377,8 @@ def construct_fl_isoforms(self): if count < novel_isoform_cutoff: # logger.debug("uuu Novel isoform %s has low coverage: %d\t%d" % (new_transcript_id, count, novel_isoform_cutoff)) pass - elif len(novel_exons) == 2 and self.params.require_monointronic_polya and not polya_site: + elif (len(novel_exons) == 2 and + ((self.params.require_monointronic_polya and not polya_site) or transcript_ss_strand == '.')): # logger.debug("uuu Avoiding single intron %s isoform: %d\t%s" % (new_transcript_id, count, str(path))) pass elif transcript_strand == '.' and not self.params.report_unstranded: From 8adb85d8dde33f0b4682cff7032607197153d955 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Wed, 27 Dec 2023 02:44:35 +0300 Subject: [PATCH 07/10] allow user to set splice site filtration strategy --- README.md | 12 +++++++++--- isoquant.py | 16 ++++++++++------ src/gene_info.py | 15 ++++++++++++++- src/graph_based_model_construction.py | 19 +++++++++++++++---- 4 files changed, 48 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index dce014ad..4eb69179 100644 --- a/README.md +++ b/README.md @@ -522,6 +522,15 @@ The main explanation that some aligner report a lot of false unspliced alignment for ONT reads. 
+`--report_canonical` + Strategy for reporting novel transcripts based on canonical splice sites, should be one of: + +* `only_canonical` - report novel transcripts, which contain only canonical splice sites; +* `only_stranded` - report novel transcripts, for which the strand can be unambiguously derived using splice sites and +presence of a polyA tail, allowing some splice sites to be non-canonical (default); +* `all` -- report all transcript model regardless of their splice sites. + + `--polya_requirement` Strategy for using polyA tails during transcript model construction, should be one of: * `default` - polyA tails are required if at least 70% of the reads have polyA tail; @@ -541,9 +550,6 @@ We recommend _not_ to modify these options unless you are clearly aware of their `--no_secondary` Ignore secondary alignments. -`--report_unstranded` - Report transcripts for which the strand cannot be detected using canonical splice sites. - `--aligner` Force to use this alignment method, can be `starlong` or `minimap2`; `minimap2` is currently used as default. Make sure the specified aligner is in the `$PATH` variable. diff --git a/isoquant.py b/isoquant.py index 0dff1955..8e473f65 100755 --- a/isoquant.py +++ b/isoquant.py @@ -30,6 +30,7 @@ DataSetReadMapper ) from src.dataset_processor import DatasetProcessor, PolyAUsageStrategies +from src.graph_based_model_construction import StrandnessReportingLevel from src.long_read_assigner import AmbiguityResolvingMethod from src.long_read_counter import COUNTING_STRATEGIES from src.input_data_storage import InputDataStorage @@ -150,9 +151,15 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help help="gene quantification strategy", type=str, default="with_inconsistent") add_additional_option("--report_novel_unspliced", "-u", type=bool_str, help="report novel monoexonic transcripts (true/false), " - "default: False for ONT, True for other data types") + "default: false for ONT, true for other data types") + add_additional_option("--report_canonical", type=str, choices=[e.name for e in StrandnessReportingLevel], + help="reporting level for novel transcripts based on canonical splice sites " + + "/".join([e.name for e in StrandnessReportingLevel]) + + "default: " + StrandnessReportingLevel.only_canonical.name, + default=StrandnessReportingLevel.only_canonical.name) add_additional_option("--polya_requirement", type=str, choices=[e.name for e in PolyAUsageStrategies], - help="require polyA tails to be present when reporting transcripts (default/never/always), " + help="require polyA tails to be present when reporting transcripts " + + "/".join([e.name for e in StrandnessReportingLevel]) + "default: require polyA only when polyA percentage is >= 70%%", default=PolyAUsageStrategies.default.name) # OUTPUT PROPERTIES @@ -160,10 +167,6 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help default="16") pipeline_args_group.add_argument('--check_canonical', action='store_true', default=False, help="report whether splice junctions are canonical") - add_additional_option_to_group(pipeline_args_group, "--report_unstranded", - help="report transcripts for which the strand cannot be detected " - "using canonical splice sites", - action='store_true', default=False) pipeline_args_group.add_argument("--sqanti_output", help="produce SQANTI-like TSV output", action='store_true', default=False) pipeline_args_group.add_argument("--count_exons", help="perform exon and intron counting", @@ -650,6 +653,7 @@ def 
set_model_construction_options(args): args.require_monointronic_polya = strategy.require_monointronic_polya args.require_monoexonic_polya = strategy.require_monoexonic_polya args.polya_requirement_strategy = PolyAUsageStrategies[args.polya_requirement] + args.report_canonical_strategy = StrandnessReportingLevel[args.report_canonical] def set_configs_directory(args): diff --git a/src/gene_info.py b/src/gene_info.py index 88ce30f2..2a0e318d 100644 --- a/src/gene_info.py +++ b/src/gene_info.py @@ -635,7 +635,7 @@ def set_strand(self, intron, strand=None): elif self.chr_record: self.strand_dict[intron] = get_intron_strand(intron, self.chr_record) - def get_strand(self, introns, has_polya=False, has_polyt=False): + def count_canonical_sites(self, introns): count_fwd = 0 count_rev = 0 for intron in introns: @@ -648,6 +648,19 @@ def get_strand(self, introns, has_polya=False, has_polyt=False): count_fwd += 1 elif strand == '-': count_rev += 1 + return count_fwd, count_rev + + # all splice sites must be canonical from the same strand, not just the majority + def get_clean_strand(self, introns): + count_fwd, count_rev = self.count_canonical_sites(introns) + if count_fwd == 0 and count_rev > 0: + return '-' + elif count_fwd > 0 and count_rev == 0: + return '+' + return '.' + + def get_strand(self, introns, has_polya=False, has_polyt=False): + count_fwd, count_rev = self.count_canonical_sites(introns) if count_fwd == count_rev: if has_polya and not has_polyt: return '+' diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index 6520418e..fb135e45 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -7,6 +7,7 @@ import logging from collections import defaultdict from functools import cmp_to_key +from enum import unique, Enum from .common import ( AtomicCounter, @@ -30,6 +31,13 @@ logger = logging.getLogger('IsoQuant') +@unique +class StrandnessReportingLevel(Enum): + only_canonical = 1 + only_stranded = 2 + all = 3 + + class GraphBasedModelConstructor: transcript_id_counter = AtomicCounter() transcript_prefix = "transcript" @@ -370,7 +378,7 @@ def construct_fl_isoforms(self): has_polya = path[-1][0] == VERTEX_polya polya_site = has_polya or has_polyt transcript_strand = self.strand_detector.get_strand(intron_path, has_polya, has_polyt) - transcript_ss_strand = self.strand_detector.get_strand(intron_path) + transcript_clean_strand = self.strand_detector.get_clean_strand(intron_path) #logger.debug("uuu Novel isoform %s has coverage: %d cutoff = %d, component cov = %d, max_coverage = %d" # % (new_transcript_id, count, novel_isoform_cutoff, component_coverage, self.intron_graph.max_coverage)) @@ -378,11 +386,14 @@ def construct_fl_isoforms(self): # logger.debug("uuu Novel isoform %s has low coverage: %d\t%d" % (new_transcript_id, count, novel_isoform_cutoff)) pass elif (len(novel_exons) == 2 and - ((self.params.require_monointronic_polya and not polya_site) or transcript_ss_strand == '.')): + ((self.params.require_monointronic_polya and not polya_site) or transcript_clean_strand == '.')): # logger.debug("uuu Avoiding single intron %s isoform: %d\t%s" % (new_transcript_id, count, str(path))) pass - elif transcript_strand == '.' 
and not self.params.report_unstranded: - logger.info("Avoiding unreliable transcript with %d exons" % len(novel_exons)) + elif ((self.params.report_canonical_strategy == StrandnessReportingLevel.only_canonical + and transcript_clean_strand == '.') + or (self.params.report_canonical_strategy == StrandnessReportingLevel.only_stranded + and transcript_strand == '.')): + logger.debug("Avoiding unreliable transcript with %d exons" % len(novel_exons)) pass else: if self.params.use_technical_replicas and \ From fc4366d4dc276d0a6d75079e46775617c97bd376 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Wed, 27 Dec 2023 03:19:25 +0300 Subject: [PATCH 08/10] introduce automatic filtering selection --- README.md | 5 ++-- isoquant.py | 43 ++++++++++++++------------- src/dataset_processor.py | 7 ++--- src/graph_based_model_construction.py | 1 + 4 files changed, 29 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index 4eb69179..713f615f 100644 --- a/README.md +++ b/README.md @@ -525,15 +525,16 @@ for ONT reads. `--report_canonical` Strategy for reporting novel transcripts based on canonical splice sites, should be one of: +* `auto` - automatic selection based on the data type and model construction strategy (default); * `only_canonical` - report novel transcripts, which contain only canonical splice sites; * `only_stranded` - report novel transcripts, for which the strand can be unambiguously derived using splice sites and -presence of a polyA tail, allowing some splice sites to be non-canonical (default); +presence of a polyA tail, allowing some splice sites to be non-canonical * `all` -- report all transcript model regardless of their splice sites. `--polya_requirement` Strategy for using polyA tails during transcript model construction, should be one of: -* `default` - polyA tails are required if at least 70% of the reads have polyA tail; +* `auto` - default behaviour: polyA tails are required if at least 70% of the reads have polyA tail; polyA tails are always required for 1/2-exon transcripts when using ONT data (this is caused by elevated number of false 1/2-exonic alignments reported by minimap2); * `never` - polyA tails are never required; use this option **at your own risk** as it may noticeably increase false discovery rate, especially for ONT data; * `always` - reported transcripts are always required to have polyA support in the reads. 
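For illustration, a stricter run combining the two options documented above might be launched as follows. This is only a sketch: the input file names are placeholders and every other option keeps its default value.

```bash
# Hypothetical invocation: report only novel transcripts built from canonical
# splice sites and require polyA support for every reported model.
isoquant.py --reference genome.fasta --genedb annotation.gtf \
    --fastq reads.fastq.gz --data_type nanopore -o isoquant_output \
    --report_canonical only_canonical --polya_requirement always
```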
diff --git a/isoquant.py b/isoquant.py index 8e473f65..be1e39ca 100755 --- a/isoquant.py +++ b/isoquant.py @@ -49,10 +49,10 @@ def bool_str(s): def parse_args(cmd_args=None, namespace=None): parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter) - ref_args_group = parser.add_argument_group('Reference data:') - input_args_group = parser.add_argument_group('Input data:') - output_args_group = parser.add_argument_group('Output naming:') - pipeline_args_group = parser.add_argument_group('Pipeline options:') + ref_args_group = parser.add_argument_group('Reference data') + input_args_group = parser.add_argument_group('Input data') + output_args_group = parser.add_argument_group('Output naming') + pipeline_args_group = parser.add_argument_group('Pipeline options') other_options = parser.add_argument_group("Additional options:") show_full_help = '--full_help' in cmd_args @@ -153,15 +153,13 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help help="report novel monoexonic transcripts (true/false), " "default: false for ONT, true for other data types") add_additional_option("--report_canonical", type=str, choices=[e.name for e in StrandnessReportingLevel], - help="reporting level for novel transcripts based on canonical splice sites " + - "/".join([e.name for e in StrandnessReportingLevel]) + - "default: " + StrandnessReportingLevel.only_canonical.name, - default=StrandnessReportingLevel.only_canonical.name) + help="reporting level for novel transcripts based on canonical splice sites;" + " default: " + StrandnessReportingLevel.auto.name, + default=StrandnessReportingLevel.only_stranded.name) add_additional_option("--polya_requirement", type=str, choices=[e.name for e in PolyAUsageStrategies], - help="require polyA tails to be present when reporting transcripts " + - "/".join([e.name for e in StrandnessReportingLevel]) + - "default: require polyA only when polyA percentage is >= 70%%", - default=PolyAUsageStrategies.default.name) + help="require polyA tails to be present when reporting transcripts; " + "default: auto (requires polyA only when polyA percentage is >= 70%%)", + default=PolyAUsageStrategies.auto.name) # OUTPUT PROPERTIES pipeline_args_group.add_argument("--threads", "-t", help="number of threads to use", type=int, default="16") @@ -600,24 +598,25 @@ def set_model_construction_options(args): 'min_novel_count', 'min_novel_count_rel', 'min_mono_count_rel', 'singleton_adjacent_cov', 'fl_only', 'novel_monoexonic', - 'require_monointronic_polya', 'require_monoexonic_polya')) + 'require_monointronic_polya', 'require_monoexonic_polya', + 'report_canonical')) strategies = { 'reliable': ModelConstructionStrategy(2, 0.5, 20, 5, 0.05, 1, 0.1, 0.1, 2, 4, 8, 0.05, 0.05, 50, - True, False, True, True), + True, False, True, True, StrandnessReportingLevel.only_canonical), 'default_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 0.05, 1, 2, 2, 0.02, 0.005, 100, - False, True, False, True), + False, True, False, True, StrandnessReportingLevel.only_stranded), 'sensitive_pacbio':ModelConstructionStrategy(1, 0.5, 5, 2, 0.005, 1, 0.01, 0.02, 1, 2, 2, 0.005, 0.001, 100, - False, True, False, False), + False, True, False, False, StrandnessReportingLevel.only_stranded), 'default_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.02, 1, 0.05, 0.05, 1, 3, 3, 0.02, 0.02, 10, - False, False, True, True), + False, False, True, True, StrandnessReportingLevel.only_stranded), 'sensitive_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.005, 1, 
0.01, 0.02, 1, 2, 3, 0.005, 0.005, 10, - False, True, False, False), + False, True, False, False, StrandnessReportingLevel.only_stranded), 'fl_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 0.01, 1, 2, 3, 0.02, 0.005, 100, - True, True, False, False), + True, True, False, False, StrandnessReportingLevel.only_stranded), 'all': ModelConstructionStrategy(0, 0.3, 5, 1, 0.002, 1, 0.01, 0.01, 1, 1, 1, 0.002, 0.001, 500, - False, True, False, False), + False, True, False, False, StrandnessReportingLevel.all), 'assembly': ModelConstructionStrategy(0, 0.3, 5, 1, 0.05, 1, 0.01, 0.02, 1, 1, 1, 0.05, 0.01, 50, - False, True, False, False) + False, True, False, False, StrandnessReportingLevel.only_stranded) } strategy = strategies[args.model_construction_strategy] @@ -654,6 +653,8 @@ def set_model_construction_options(args): args.require_monoexonic_polya = strategy.require_monoexonic_polya args.polya_requirement_strategy = PolyAUsageStrategies[args.polya_requirement] args.report_canonical_strategy = StrandnessReportingLevel[args.report_canonical] + if args.report_canonical_strategy == StrandnessReportingLevel.auto: + args.report_canonical_strategy = strategy.report_canonical def set_configs_directory(args): diff --git a/src/dataset_processor.py b/src/dataset_processor.py index 8afe13dd..9c16f82a 100644 --- a/src/dataset_processor.py +++ b/src/dataset_processor.py @@ -74,13 +74,13 @@ def clean_locks(chr_ids, base_name, fname_function): @unique class PolyAUsageStrategies(Enum): - default = 1 + auto = 1 never = 2 always = 3 def set_polya_requirement_strategy(flag, polya_requirement_strategy): - if polya_requirement_strategy == PolyAUsageStrategies.default: + if polya_requirement_strategy == PolyAUsageStrategies.auto: return flag elif polya_requirement_strategy == PolyAUsageStrategies.never: return False @@ -558,8 +558,7 @@ def process_assigned_reads(self, sample, dump_filename): logger.info(" PolyA tails are required for known monoexon transcripts to be reported: %s" % ("yes" if self.args.require_monoexonic_polya else "no")) logger.info(" PolyA tails are required for novel monoexon transcripts to be reported: %s" % "yes") - logger.info(" Report transcript for which the strand cannot be detected using canonical splice sites: %s" - % ("yes" if self.args.report_unstranded else "no")) + logger.info(" Splice site reporting level: %s" % self.args.report_canonical_strategy.name) gff_printer = GFFPrinter( sample.out_dir, sample.prefix, self.io_support, header=self.common_header diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index fb135e45..526deff8 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -36,6 +36,7 @@ class StrandnessReportingLevel(Enum): only_canonical = 1 only_stranded = 2 all = 3 + auto = 10 class GraphBasedModelConstructor: From 864e150be04512786d0760473a258a22d338577c Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Wed, 27 Dec 2023 17:03:30 +0300 Subject: [PATCH 09/10] set default strategy to only_canonical --- isoquant.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/isoquant.py b/isoquant.py index be1e39ca..a4753cb4 100755 --- a/isoquant.py +++ b/isoquant.py @@ -604,15 +604,15 @@ def set_model_construction_options(args): 'reliable': ModelConstructionStrategy(2, 0.5, 20, 5, 0.05, 1, 0.1, 0.1, 2, 4, 8, 0.05, 0.05, 50, True, False, True, True, StrandnessReportingLevel.only_canonical), 'default_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 
0.05, 1, 2, 2, 0.02, 0.005, 100, - False, True, False, True, StrandnessReportingLevel.only_stranded), + False, True, False, True, StrandnessReportingLevel.only_canonical), 'sensitive_pacbio':ModelConstructionStrategy(1, 0.5, 5, 2, 0.005, 1, 0.01, 0.02, 1, 2, 2, 0.005, 0.001, 100, False, True, False, False, StrandnessReportingLevel.only_stranded), 'default_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.02, 1, 0.05, 0.05, 1, 3, 3, 0.02, 0.02, 10, - False, False, True, True, StrandnessReportingLevel.only_stranded), + False, False, True, True, StrandnessReportingLevel.only_canonical), 'sensitive_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.005, 1, 0.01, 0.02, 1, 2, 3, 0.005, 0.005, 10, False, True, False, False, StrandnessReportingLevel.only_stranded), 'fl_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 0.01, 1, 2, 3, 0.02, 0.005, 100, - True, True, False, False, StrandnessReportingLevel.only_stranded), + True, True, False, False, StrandnessReportingLevel.only_canonical), 'all': ModelConstructionStrategy(0, 0.3, 5, 1, 0.002, 1, 0.01, 0.01, 1, 1, 1, 0.002, 0.001, 500, False, True, False, False, StrandnessReportingLevel.all), 'assembly': ModelConstructionStrategy(0, 0.3, 5, 1, 0.05, 1, 0.01, 0.02, 1, 1, 1, 0.05, 0.01, 50, From af910aa788b61aeaeb61ed2402d9fac65ff15792 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Wed, 27 Dec 2023 17:26:07 +0300 Subject: [PATCH 10/10] remove low_memory --- README.md | 6 ++---- isoquant.py | 13 ++++++------- src/alignment_processor.py | 4 ++-- src/dataset_processor.py | 2 +- 4 files changed, 11 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 713f615f..b4eb3100 100644 --- a/README.md +++ b/README.md @@ -568,11 +568,9 @@ We recommend _not_ to modify these options unless you are clearly aware of their for storing the annotation database, because SQLite database cannot be created on a shared disks. The folder will be created automatically. -`--low_memory` - Deprecated, default behaviour since 3.2. - `--high_memory` - Cache read alignments instead for making several passes over a BAM file, noticeably increases RAM usage. + Cache read alignments instead for making several passes over a BAM file, noticeably increases RAM usage, +but may improve running time when disk I/O is relatively slow. `--min_mapq` Filers out all alignments with MAPQ less than this value (will also filter all secondary alignments, as they typically have MAPQ = 0). 
diff --git a/isoquant.py b/isoquant.py index a4753cb4..dd8bbd27 100755 --- a/isoquant.py +++ b/isoquant.py @@ -185,8 +185,6 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help help="align reads to reference without running further analysis") # ADDITIONAL - add_additional_option("--low_memory", help="decrease RAM consumption (deprecated, set by default)", - action='store_true', default=True) add_additional_option("--high_memory", help="increase RAM consumption (store alignment and the genome in RAM)", action='store_true', default=False) add_additional_option("--no_junc_bed", action="store_true", default=False, @@ -239,8 +237,12 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help type=str, required=True) resume_parser.add_argument('--debug', action='store_true', default=argparse.SUPPRESS, help='Debug log output.') - resume_parser.add_argument("--threads", "-t", help="number of threads to use", type=int, default=argparse.SUPPRESS) - resume_parser.add_argument("--low_memory", help="decrease RAM consumption (deprecated, set by default)", + resume_parser.add_argument("--threads", "-t", help="number of threads to use", + type=int, default=argparse.SUPPRESS) + resume_parser.add_argument("--high_memory", + help="increase RAM consumption (store alignment and the genome in RAM)", + action='store_true', default=False) + resume_parser.add_argument("--keep_tmp", help="do not remove temporary files in the end", action='store_true', default=argparse.SUPPRESS) resume_parser.add_argument("--keep_tmp", help="do not remove temporary files in the end", action='store_true', default=argparse.SUPPRESS) @@ -294,9 +296,6 @@ def check_and_load_args(args, parser): elif not os.path.exists(args.genedb_output): os.makedirs(args.genedb_output) - if args.high_memory: - args.low_memory = False - if not check_input_params(args): parser.print_usage() exit(-1) diff --git a/src/alignment_processor.py b/src/alignment_processor.py index d916ff94..67df2262 100644 --- a/src/alignment_processor.py +++ b/src/alignment_processor.py @@ -229,7 +229,7 @@ def __init__(self, chr_id, bam_pairs, params, illumina_bam, genedb=None, chr_rec self.bam_merger = BAMOnlineMerger(self.bam_pairs, self.chr_id, 0, self.bam_pairs[0][0].get_reference_length(self.chr_id), - multiple_iterators=self.params.low_memory) + multiple_iterators=not self.params.high_memory) self.strand_detector = StrandDetector(self.chr_record) self.read_groupper = read_groupper self.polya_finder = PolyAFinder(self.params.polya_window, self.params.polya_fraction) @@ -238,7 +238,7 @@ def __init__(self, chr_id, bam_pairs, params, illumina_bam, genedb=None, chr_rec def process(self): current_region = None - alignment_storage = BAMAlignmentStorage(self.bam_merger) if self.params.low_memory else InMemoryAlignmentStorage() + alignment_storage = BAMAlignmentStorage(self.bam_merger) if not self.params.high_memory else InMemoryAlignmentStorage() for bam_index, alignment in self.bam_merger.get(): if alignment_storage.alignment_is_not_adjacent(alignment): diff --git a/src/dataset_processor.py b/src/dataset_processor.py index 9c16f82a..8559097a 100644 --- a/src/dataset_processor.py +++ b/src/dataset_processor.py @@ -90,7 +90,7 @@ def set_polya_requirement_strategy(flag, polya_requirement_strategy): def collect_reads_in_parallel(sample, chr_id, args): current_chr_record = Fasta(args.reference)[chr_id] - if not args.low_memory: + if args.high_memory: current_chr_record = str(current_chr_record) read_grouper = create_read_grouper(args, 
sample, chr_id) lock_file = reads_collected_lock_file_name(sample.out_raw_file, chr_id)