From e4736fbd4761a65dae612b262914f4d3a0e447ac Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Fri, 12 Apr 2024 17:34:14 +0300 Subject: [PATCH 01/21] add gene assignment type --- src/isoform_assignment.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/isoform_assignment.py b/src/isoform_assignment.py index 5e4c4883..b2356e77 100644 --- a/src/isoform_assignment.py +++ b/src/isoform_assignment.py @@ -496,6 +496,11 @@ def __init__(self, read_id, assignment_type, match=None): self.isoform_matches = match else: self.isoform_matches = [match] + if self.assignment_type != ReadAssignmentType.ambiguous: + self.gene_assignment_type = self.assignment_type + else: + assigned_genes = set([m.assigned_gene for m in self.isoform_matches]) + self.gene_assignment_type = ReadAssignmentType.ambiguous if len(assigned_genes) > 1 else ReadAssignmentType.unique self.additional_info = {} self.additional_attributes = {} self.introns_match = False @@ -522,6 +527,7 @@ def deserialize(cls, infile, gene_info): read_assignment.chr_id = read_string(infile) read_assignment.mapping_quality = read_short_int(infile) read_assignment.assignment_type = ReadAssignmentType(read_short_int(infile)) + read_assignment.gene_assignment_type = ReadAssignmentType(read_short_int(infile)) read_assignment.isoform_matches = read_list(infile, IsoformMatch.deserialize) read_assignment.additional_info = read_dict(infile) read_assignment.additional_attributes = read_dict(infile) @@ -546,6 +552,7 @@ def serialize(self, outfile): write_string(self.chr_id, outfile) write_short_int(self.mapping_quality, outfile) write_short_int(self.assignment_type.value, outfile) + write_short_int(self.gene_assignment_type.value, outfile) write_list(self.isoform_matches, outfile, IsoformMatch.serialize) write_dict(self.additional_info, outfile) write_dict(self.additional_attributes, outfile) From 92994253f03d8b803d0fa0f6e9dc686197beb7b4 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Tue, 16 Apr 2024 02:10:09 
Subject: [PATCH 02/21] introduce various ambiguity assignment types
suspended = 255 + @staticmethod + def is_inconsistent(assignment_type): + return assignment_type in [ReadAssignmentType.inconsistent, + ReadAssignmentType.inconsistent_amb, + ReadAssignmentType.inconsistent_non_intronic] + + +ReadAssignmentTypeNaming = {ReadAssignmentType.unique: ReadAssignmentType.unique.name, + ReadAssignmentType.noninformative: ReadAssignmentType.noninformative.name, + ReadAssignmentType.intergenic: ReadAssignmentType.intergenic.name, + ReadAssignmentType.ambiguous: ReadAssignmentType.ambiguous.name, + ReadAssignmentType.unique_minor_difference: ReadAssignmentType.unique_minor_difference.name, + ReadAssignmentType.inconsistent: ReadAssignmentType.inconsistent.name, + ReadAssignmentType.inconsistent_non_intronic: ReadAssignmentType.inconsistent.name, + ReadAssignmentType.inconsistent_amb: ReadAssignmentType.inconsistent.name, + ReadAssignmentType.suspended: ReadAssignmentType.suspended.name} # SQANTI-like @unique @@ -206,7 +224,11 @@ def is_minor_elongation(match_event_subtype): @staticmethod def is_major_inconsistency(match_event_subtype): - return match_event_subtype in nnic_event_types or match_event_subtype in nic_event_types + return match_event_subtype in all_major_events + + @staticmethod + def is_intronic_inconsistency(match_event_subtype): + return match_event_subtype in intronic_major_events event_subtype_cost = { @@ -332,9 +354,12 @@ def elongation_cost(params, elongation_len): MatchEventSubtype.internal_polya_right, MatchEventSubtype.internal_polya_left, MatchEventSubtype.major_exon_elongation_left, MatchEventSubtype.major_exon_elongation_right, MatchEventSubtype.exon_elongation_left, MatchEventSubtype.exon_elongation_right, - MatchEventSubtype.antisense } +all_major_events = nic_event_types.union(nnic_event_types) + +intronic_major_events = all_major_events.difference(nonintronic_events) + # (side, is_known) -> alternation type alternative_sites = {("left", True): MatchEventSubtype.alt_left_site_known, @@ -496,11 +521,17 @@ def 
__init__(self, read_id, assignment_type, match=None): self.isoform_matches = match else: self.isoform_matches = [match] - if self.assignment_type != ReadAssignmentType.ambiguous: - self.gene_assignment_type = self.assignment_type - else: + if self.assignment_type == ReadAssignmentType.ambiguous: + assigned_genes = set([m.assigned_gene for m in self.isoform_matches]) + self.gene_assignment_type = ReadAssignmentType.ambiguous if len( + assigned_genes) > 1 else ReadAssignmentType.unique + elif self.assignment_type == ReadAssignmentType.inconsistent_amb: assigned_genes = set([m.assigned_gene for m in self.isoform_matches]) - self.gene_assignment_type = ReadAssignmentType.ambiguous if len(assigned_genes) > 1 else ReadAssignmentType.unique + self.gene_assignment_type = ReadAssignmentType.inconsistent_amb if len( + assigned_genes) > 1 else ReadAssignmentType.inconsistent + else: + self.gene_assignment_type = self.assignment_type + self.additional_info = {} self.additional_attributes = {} self.introns_match = False diff --git a/src/long_read_assigner.py b/src/long_read_assigner.py index ab294336..174d3e6a 100644 --- a/src/long_read_assigner.py +++ b/src/long_read_assigner.py @@ -322,7 +322,8 @@ def check_read_ends(self, read_split_exon_profile, assignment): if any(MatchEventSubtype.is_major_elongation(e.event_type) for e in exon_elongation_types): # serious exon elongation - assignment.set_assignment_type(ReadAssignmentType.inconsistent) + if not ReadAssignmentType.is_inconsistent(assignment.assignment_type): + assignment.set_assignment_type(ReadAssignmentType.inconsistent_non_intronic) elif any(MatchEventSubtype.is_minor_elongation(e.event_type) for e in exon_elongation_types): # minor exon elongation if assignment.assignment_type == ReadAssignmentType.unique: @@ -513,7 +514,7 @@ def match_consistent(self, read_id, combined_read_profile): self.verify_read_ends_for_assignment(combined_read_profile, assignment) # if apa detected isoform should be treated as inconsistent and 
rechecked - if assignment.assignment_type == ReadAssignmentType.inconsistent: + if ReadAssignmentType.is_inconsistent(assignment.assignment_type): return None return assignment @@ -602,7 +603,7 @@ def match_inconsistent(self, read_id, combined_read_profile): assignment_type = self.classify_assignment(best_isoforms, read_matches) if not combined_read_profile.read_intron_profile.read_profile: isoform_matches = self.create_monoexon_matches(read_matches, best_isoforms) - elif assignment_type == ReadAssignmentType.inconsistent: + elif ReadAssignmentType.is_inconsistent(assignment_type): isoform_matches = self.create_inconsistent_matches(read_matches, best_isoforms, best_score) else: isoform_matches = self.create_consistent_matches(read_matches, best_isoforms, combined_read_profile) @@ -621,7 +622,11 @@ def classify_assignment(self, best_isoforms, read_matches): assignment_type = ReadAssignmentType.ambiguous if is_abmiguous else ReadAssignmentType.unique elif any(MatchEventSubtype.is_major_inconsistency(e) for e in all_event_types): #logger.debug("* * Assignment is inconsistent") - assignment_type = ReadAssignmentType.inconsistent + if any(MatchEventSubtype.is_intronic_inconsistency(e) for e in all_event_types): + inconsistency_type = ReadAssignmentType.inconsistent + else: + inconsistency_type = ReadAssignmentType.inconsistent_non_intronic + assignment_type = ReadAssignmentType.inconsistent_amb if is_abmiguous else inconsistency_type elif any(MatchEventSubtype.is_minor_error(e) for e in all_event_types): #logger.debug("* * Assignment has minor errors") assignment_type = ReadAssignmentType.ambiguous if is_abmiguous else ReadAssignmentType.unique_minor_difference From 9271b0abc45d8557132b4698ee918e55a84fbd6b Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Thu, 18 Apr 2024 01:56:15 +0300 Subject: [PATCH 03/21] conventional naming --- src/assignment_io.py | 2 +- src/isoform_assignment.py | 10 ---------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git 
a/src/assignment_io.py b/src/assignment_io.py index 2d5cbc91..4c430db1 100644 --- a/src/assignment_io.py +++ b/src/assignment_io.py @@ -243,7 +243,7 @@ def add_read_info(self, read_assignment): strand = read_assignment.strand line = (read_assignment.read_id + "\t" + read_assignment.chr_id + "\t" + strand + "\t" + m.assigned_transcript + "\t" + m.assigned_gene + "\t" + - ReadAssignmentTypeNaming[read_assignment.assignment_type] + "\t" + event_string + "\t" + + read_assignment.assignment_type.name + "\t" + event_string + "\t" + range_list_to_str(read_exons)) additional_info = [] diff --git a/src/isoform_assignment.py b/src/isoform_assignment.py index 04150433..441e8b8d 100644 --- a/src/isoform_assignment.py +++ b/src/isoform_assignment.py @@ -32,16 +32,6 @@ def is_inconsistent(assignment_type): ReadAssignmentType.inconsistent_non_intronic] -ReadAssignmentTypeNaming = {ReadAssignmentType.unique: ReadAssignmentType.unique.name, - ReadAssignmentType.noninformative: ReadAssignmentType.noninformative.name, - ReadAssignmentType.intergenic: ReadAssignmentType.intergenic.name, - ReadAssignmentType.ambiguous: ReadAssignmentType.ambiguous.name, - ReadAssignmentType.unique_minor_difference: ReadAssignmentType.unique_minor_difference.name, - ReadAssignmentType.inconsistent: ReadAssignmentType.inconsistent.name, - ReadAssignmentType.inconsistent_non_intronic: ReadAssignmentType.inconsistent.name, - ReadAssignmentType.inconsistent_amb: ReadAssignmentType.inconsistent.name, - ReadAssignmentType.suspended: ReadAssignmentType.suspended.name} - # SQANTI-like @unique class MatchClassification(Enum): From 30f3388a397d9371c5df84b2ae888a798ebd3d7b Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Thu, 18 Apr 2024 15:22:45 +0300 Subject: [PATCH 04/21] refactor multimap resolver, print gene type assignment --- src/assignment_io.py | 7 ++- src/dataset_processor.py | 9 ++-- src/isoform_assignment.py | 41 ++++++++++++---- src/long_read_assigner.py | 30 ++++++------ 
src/multimap_resolver.py | 99 ++++++++++++++++++++++++--------------- 5 files changed, 118 insertions(+), 68 deletions(-) diff --git a/src/assignment_io.py b/src/assignment_io.py index 4c430db1..682a808c 100644 --- a/src/assignment_io.py +++ b/src/assignment_io.py @@ -21,8 +21,10 @@ read_short_int, SHORT_TERMINATION_INT ) -from .isoform_assignment import (match_subtype_to_str_with_additional_info, ReadAssignment, MatchClassification, - ReadAssignmentType, ReadAssignmentTypeNaming) +from .isoform_assignment import (match_subtype_to_str_with_additional_info, + ReadAssignment, + MatchClassification, + ReadAssignmentType) from .gene_info import GeneInfo @@ -247,6 +249,7 @@ def add_read_info(self, read_assignment): range_list_to_str(read_exons)) additional_info = [] + additional_info.append("gene_assignment=" + str(read_assignment.gene_assignment_type.name) + ";") additional_info.append("PolyA=" + str(read_assignment.polyA_found) + ";") if self.params.cage is not None: additional_info.append("CAGE=" + str(read_assignment.cage_found) + ";") diff --git a/src/dataset_processor.py b/src/dataset_processor.py index ea48a87e..884930cf 100644 --- a/src/dataset_processor.py +++ b/src/dataset_processor.py @@ -180,6 +180,7 @@ def get_next(self): continue else: read_assignment.assignment_type = resolved_assignment.assignment_type + read_assignment.gene_assignment_type = resolved_assignment.gene_assignment_type read_assignment.multimapper = resolved_assignment.multimapper assignment_storage.append(read_assignment) @@ -506,14 +507,14 @@ def collect_reads(self, sample): results = proc.map(*read_gen, chunksize=1) for storage, read_groups, alignment_stats in results: - for read_assignment in storage: - multimapped_reads[read_assignment.read_id].append(read_assignment) + for basic_read_assignment in storage: + multimapped_reads[basic_read_assignment.read_id].append(basic_read_assignment) all_read_groups.update(read_groups) self.alignment_stat_counter.merge(alignment_stats) else: for 
storage, read_groups, alignment_stats in map(*read_gen): - for read_assignment in storage: - multimapped_reads[read_assignment.read_id].append(read_assignment) + for basic_read_assignment in storage: + multimapped_reads[basic_read_assignment.read_id].append(basic_read_assignment) all_read_groups.update(read_groups) self.alignment_stat_counter.merge(alignment_stats) diff --git a/src/isoform_assignment.py b/src/isoform_assignment.py index 441e8b8d..0de604c3 100644 --- a/src/isoform_assignment.py +++ b/src/isoform_assignment.py @@ -31,6 +31,11 @@ def is_inconsistent(assignment_type): ReadAssignmentType.inconsistent_amb, ReadAssignmentType.inconsistent_non_intronic] + @staticmethod + def is_consistent(assignment_type): + return assignment_type in [ReadAssignmentType.unique, + ReadAssignmentType.unique_minor_difference, + ReadAssignmentType.ambiguous] # SQANTI-like @unique @@ -403,12 +408,12 @@ def __repr__(self): class IsoformMatch: def __init__(self, match_classification, assigned_gene=None, assigned_transcript=None, - match_subclassification = None, transcript_strand='.', score=0): + match_subclassification = None, transcript_strand='.', penalty_score=0): self.assigned_gene = assigned_gene self.assigned_transcript = assigned_transcript self.transcript_strand = transcript_strand self.match_classification = match_classification - self.score = score + self.penalty_score = penalty_score if match_subclassification is None: self.match_subclassifications = [] elif isinstance(match_subclassification, list): @@ -424,7 +429,7 @@ def deserialize(cls, infile): match.assigned_transcript = read_string_or_none(infile) match.transcript_strand = read_string(infile) match.match_classification = MatchClassification(read_short_int(infile)) - match.score = float(read_int(infile)) / float(SHORT_FLOAT_MULTIPLIER) + match.penalty_score = float(read_int(infile)) / float(SHORT_FLOAT_MULTIPLIER) match.match_subclassifications = read_list(infile, MatchEvent.deserialize) return match @@ -433,7 
+438,7 @@ def serialize(self, outfile): write_string_or_none(self.assigned_transcript, outfile) write_string(self.transcript_strand, outfile) write_short_int(self.match_classification.value, outfile) - write_int(int(self.score * SHORT_FLOAT_MULTIPLIER), outfile) + write_int(int(self.penalty_score * SHORT_FLOAT_MULTIPLIER), outfile) write_list(self.match_subclassifications, outfile, MatchEvent.serialize) def add_subclassification(self, match_subclassification): @@ -459,10 +464,22 @@ def __init__(self, read_assignment): self.multimapper = read_assignment.multimapper self.polyA_found = read_assignment.polyA_found self.assignment_type = read_assignment.assignment_type + self.gene_assignment_type = read_assignment.gene_assignment_type + self.penalty_score = 0.0 + self.isoforms = [] + self.genes = [] + if read_assignment.isoform_matches: - self.score = read_assignment.isoform_matches[0].score - else: - self.score = 0.0 + gene_set = set() + isoform_set = set() + for m in read_assignment.isoform_matches: + self.penalty_score = min(self.penalty_score, read_assignment.isoform_matches[0].penalty_score) + if m.assigned_gene: + gene_set.add(m.assigned_gene) + if m.assigned_transcript: + isoform_set.add(m.assigned_transcript) + self.genes = list(gene_set) + self.isoforms = list(isoform_set) @classmethod def deserialize(cls, infile): @@ -474,7 +491,10 @@ def deserialize(cls, infile): read_assignment.multimapper = bool_arr[0] read_assignment.polyA_found = bool_arr[1] read_assignment.assignment_type = ReadAssignmentType(read_short_int(infile)) - read_assignment.score = float(read_int(infile)) / float(SHORT_FLOAT_MULTIPLIER) + read_assignment.gene_assignment_type = ReadAssignmentType(read_short_int(infile)) + read_assignment.penalty_score = float(read_int(infile)) / float(SHORT_FLOAT_MULTIPLIER) + read_assignment.genes = read_list(infile, read_string) + read_assignment.isoforms = read_list(infile, read_string) return read_assignment def serialize(self, outfile): @@ -483,7 +503,10 @@ 
def serialize(self, outfile): write_string(self.chr_id, outfile) write_bool_array([self.multimapper, self.polyA_found], outfile) write_short_int(self.assignment_type.value, outfile) - write_int(int(self.score * SHORT_FLOAT_MULTIPLIER), outfile) + write_short_int(self.gene_assignment_type.value, outfile) + write_int(int(self.penalty_score * SHORT_FLOAT_MULTIPLIER), outfile) + write_list(self.genes, outfile, write_string) + write_list(self.isoforms, outfile, write_string) class ReadAssignment: diff --git a/src/long_read_assigner.py b/src/long_read_assigner.py index 174d3e6a..d50aeea7 100644 --- a/src/long_read_assigner.py +++ b/src/long_read_assigner.py @@ -594,7 +594,7 @@ def match_inconsistent(self, read_id, combined_read_profile): # logger.debug("* Inconsistencies detected: " + str(read_matches)) # select ones with least number of inconsistent events - best_isoforms, best_score = self.select_best_among_inconsistent(combined_read_profile, read_matches) + best_isoforms, penalty_score = self.select_best_among_inconsistent(combined_read_profile, read_matches) if not best_isoforms: return ReadAssignment(read_id, ReadAssignmentType.noninformative) # logger.debug("* Selected isoforms: " + str(best_isoforms)) @@ -604,7 +604,7 @@ def match_inconsistent(self, read_id, combined_read_profile): if not combined_read_profile.read_intron_profile.read_profile: isoform_matches = self.create_monoexon_matches(read_matches, best_isoforms) elif ReadAssignmentType.is_inconsistent(assignment_type): - isoform_matches = self.create_inconsistent_matches(read_matches, best_isoforms, best_score) + isoform_matches = self.create_inconsistent_matches(read_matches, best_isoforms, penalty_score) else: isoform_matches = self.create_consistent_matches(read_matches, best_isoforms, combined_read_profile) return ReadAssignment(read_id, assignment_type, isoform_matches) @@ -656,14 +656,14 @@ def create_consistent_matches(self, read_matches, selected_isoforms, combined_re matches.append(isoform_match) 
return matches - def create_inconsistent_matches(self, read_matches, selected_isoforms, score=0.0): + def create_inconsistent_matches(self, read_matches, selected_isoforms, penalty_score=0.0): matches = [] for isoform_id in selected_isoforms: read_match = read_matches[isoform_id] match_classification = MatchClassification.get_inconsistency_classification(read_match) isoform_match = IsoformMatch(match_classification, self.get_gene_id(isoform_id), isoform_id, read_match, self.gene_info.isoform_strands[isoform_id], - score=score) + penalty_score=penalty_score) matches.append(isoform_match) return matches @@ -720,7 +720,7 @@ def select_best_among_inconsistent(self, combined_read_profile, read_matches): isoform_scores = [] for isoform_id, match_events in read_matches.items(): # logger.debug("* * Scoring inconsistencies for " + isoform_id + ": " + str(match_events)) - score = 0.0 + penalty_score = 0.0 for e in match_events: event_count = 1 if e.isoform_region != SupplementaryMatchConstants.undefined_region and \ @@ -751,25 +751,25 @@ def select_best_among_inconsistent(self, combined_read_profile, read_matches): MatchEventSubtype.exon_elongation_left}: event_cost = elongation_cost(self.params, e.event_info) - score += event_cost * event_count + penalty_score += event_cost * event_count # logger.debug("* * * Event " + str(e.event_type) + ", introns affected " + str(event_count) + # ", event cost " + str(event_cost) + - # ". Updated score: " + str(score)) - # logger.debug("* * Final score for isoform " + isoform_id + ": " + str(score)) - isoform_scores.append((isoform_id, score)) + # ". 
Updated penalty_score: " + str(penalty_score)) + # logger.debug("* * Final penalty_score for isoform " + isoform_id + ": " + str(penalty_score)) + isoform_scores.append((isoform_id, penalty_score)) - best_score = min(isoform_scores, key=lambda x:x[1])[1] - # logger.debug("* * Best score " + str(best_score)) - best_isoforms = [x[0] for x in filter(lambda x:x[1] == best_score, isoform_scores)] + min_penalty_score = min(isoform_scores, key=lambda x:x[1])[1] + # logger.debug("* * Best penalty_score " + str(min_penalty_score)) + best_isoforms = [x[0] for x in filter(lambda x:x[1] == min_penalty_score, isoform_scores)] # logger.debug("* * Best isoforms " + str(best_isoforms)) - # if several isoforms are tied select the best according to nucl score + # if several isoforms are tied select the best according to nucl penalty_score if len(best_isoforms) > 1: best_isoforms = self.resolve_by_nucleotide_score(combined_read_profile, best_isoforms, similarity_function=self.coverage_based_nucleotide_score) - # logger.debug("* * Resolved by nucl score " + str(best_isoforms)) + # logger.debug("* * Resolved by nucl penalty_score " + str(best_isoforms)) - return best_isoforms, best_score + return best_isoforms, min_penalty_score # ==== POLYA STUFF ==== def verify_read_ends_for_assignment(self, combined_read_profile, assignment): diff --git a/src/multimap_resolver.py b/src/multimap_resolver.py index c36d03d5..26fc1abc 100644 --- a/src/multimap_resolver.py +++ b/src/multimap_resolver.py @@ -34,16 +34,11 @@ def resolve(self, assignment_list): return assignment_list elif self.strategy == MultimapResolvingStrategy.merge: - informative_assignments = set() - for i in range(len(assignment_list)): - if assignment_list[i].assignment_type != ReadAssignmentType.noninformative: - informative_assignments.add(i) - if len(informative_assignments) == 1: - return self.suspend_assignments(assignment_list, informative_assignments) - return self.suspend_assignments(assignment_list, informative_assignments, 
ReadAssignmentType.ambiguous) + return self.merge_assignments(assignment_list) elif self.strategy == MultimapResolvingStrategy.take_best: return self.select_best_assignment(assignment_list) + else: raise ValueError("Unsupported multimap strategy") @@ -53,53 +48,81 @@ def select_best_assignment(self, assignment_list): consistent_assignments = set() inconsistent_assignments = set() primary_inconsistent = set() + for i, a in enumerate(assignment_list): - if a.assignment_type in [ReadAssignmentType.unique, ReadAssignmentType.unique_minor_difference] and \ - not a.multimapper: - primary_unique.add(i) - elif a.assignment_type == ReadAssignmentType.inconsistent and not a.multimapper: - primary_inconsistent.add(i) - if a.assignment_type in [ReadAssignmentType.unique, - ReadAssignmentType.unique_minor_difference, - ReadAssignmentType.ambiguous]: - consistent_assignments.add(i) - elif a.assignment_type in [ReadAssignmentType.inconsistent]: + if ReadAssignmentType.is_inconsistent(a.assignment_type): + if not a.multimapper: + primary_inconsistent.add(i) inconsistent_assignments.add(i) + elif ReadAssignmentType.is_consistent(a.assignment_type): + consistent_assignments.add(i) + if not a.multimapper and not a.assignment_type == ReadAssignmentType.ambiguous: + primary_unique.add(i) if primary_unique: - if len(primary_unique) > 1: - return self.suspend_assignments(assignment_list, primary_unique, ReadAssignmentType.ambiguous) - # primary unique is found, rest is noninformative - return self.suspend_assignments(assignment_list, primary_unique) + return self.filter_assignments(assignment_list, primary_unique) if consistent_assignments: - return self.suspend_assignments(assignment_list, consistent_assignments, ReadAssignmentType.ambiguous) + return self.filter_assignments(assignment_list, consistent_assignments) if primary_inconsistent: return self.select_best_inconsistent(assignment_list, primary_inconsistent) if inconsistent_assignments: return 
self.select_best_inconsistent(assignment_list, inconsistent_assignments) + return assignment_list - def select_best_inconsistent(self, assignment_list, inconsistent_assignments): - if len(inconsistent_assignments) > 1: - assignment_scores = [] - for i in inconsistent_assignments: - assignment_scores.append((assignment_list[i].score, i)) - best_score = min(assignment_scores, key=lambda x: x[0])[0] - best_isoforms = [x[1] for x in filter(lambda x: x[0] == best_score, assignment_scores)] - return self.suspend_assignments(assignment_list, best_isoforms, - ReadAssignmentType.inconsistent if len(best_isoforms) > 1 else None) + @staticmethod + def select_best_inconsistent(assignment_list, inconsistent_assignments): + if len(inconsistent_assignments) <= 1: + return MultimapResolver.filter_assignments(assignment_list, inconsistent_assignments) + + assignment_scores = [] + for i in inconsistent_assignments: + assignment_scores.append((assignment_list[i].penalty_score, i)) + best_score = min(assignment_scores, key=lambda x: x[0])[0] + best_assignments = [x[1] for x in filter(lambda x: x[0] == best_score, assignment_scores)] + return MultimapResolver.filter_assignments(assignment_list, best_assignments) + + @staticmethod + def merge_assignments(assignment_list): + informative_assignments = set() + for i in range(len(assignment_list)): + if assignment_list[i].assignment_type != ReadAssignmentType.noninformative: + informative_assignments.add(i) + + return MultimapResolver.filter_assignments(assignment_list, informative_assignments) - return self.suspend_assignments(assignment_list, inconsistent_assignments) + # select all given assignments from the list, makr rest as suspended + # if selected assignments feature more than 1 gene/isoform - mark as ambiguous accordingly + @staticmethod + def filter_assignments(assignment_list, assignments_to_keep, mark_as_ambiguous=True): + all_genes = set() + all_isoforms = set() + for i in assignments_to_keep: + assignment = assignment_list[i] 
+ all_genes.update(assignment.genes) + all_isoforms.update(assignment.isoforms) + + change_transcript_assignment_type = mark_as_ambiguous and len(all_isoforms) > 1 + change_gene_assignment_type = mark_as_ambiguous and len(all_genes) > 1 - def suspend_assignments(self, assignment_list, assignments_to_keep, new_type=None): for i in range(len(assignment_list)): + assignment = assignment_list[i] if i in assignments_to_keep: - if new_type is not None: - assignment_list[i].assignment_type = new_type - assignment_list[i].multimapper = True - continue - assignment_list[i].assignment_type = ReadAssignmentType.suspended + if ReadAssignmentType.is_inconsistent(assignment.assignment_type): + ambiguity_assignment_type = ReadAssignmentType.inconsistent_amb + else: + ambiguity_assignment_type = ReadAssignmentType.ambiguous + + if change_transcript_assignment_type: + assignment.assignment_type = ambiguity_assignment_type + if change_gene_assignment_type: + assignment.gene_assignment_type = ambiguity_assignment_type + assignment.multimapper = True + else: + assignment.assignment_type = ReadAssignmentType.suspended + assignment.gene_assignment_type = ReadAssignmentType.suspended + return assignment_list From 025a54766a850c0d07f47c0b8a5d93d92fa8906b Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Thu, 18 Apr 2024 15:57:25 +0300 Subject: [PATCH 05/21] fix GTF checker and loader --- isoquant.py | 2 +- src/gtf2db.py | 27 +++++++++++++++++---------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/isoquant.py b/isoquant.py index 07d03145..363ef061 100755 --- a/isoquant.py +++ b/isoquant.py @@ -22,7 +22,7 @@ import gffutils import pyfaidx -from src.gtf2db import convert_gtf_to_db, check_gtf_duplicates +from src.gtf2db import convert_gtf_to_db from src.read_mapper import ( DATA_TYPE_ALIASES, SUPPORTED_STRANDEDNESS, diff --git a/src/gtf2db.py b/src/gtf2db.py index 5dd43976..473e8773 100755 --- a/src/gtf2db.py +++ b/src/gtf2db.py @@ -117,7 +117,7 @@ def gtf2db(gtf, db, 
complete_db=False): elif has_meta_features == 0: logger.warning("You set --complete_genedb option but it looks like the provided annotation " "does not contain all necessary gene or transcript records.") - logger.warning("The results of this run might not be meaningful.") + logger.warning("The results of this run might not be correct.") logger.warning("Remove the output folder and restart IsoQuant without --complete_genedb.") logger.info("Converting gene annotation file to .db format (takes a while)...") @@ -188,7 +188,10 @@ def check_gtf_duplicates(gtf): gtf_correct = False continue - gene_id = attrs[gene_id_pos + 1][1:-2] + gene_str = attrs[gene_id_pos + 1] + start_pos = gene_str.find('"') + end_pos = gene_str.rfind('"') + gene_id = gene_str[start_pos+1:end_pos] if feature_type == "gene": if gene_id in gene_ids: logger.warning("Duplicated gene id %s on line %d" % (gene_id, line_count)) @@ -210,7 +213,10 @@ def check_gtf_duplicates(gtf): gtf_correct = False continue - transcript_id = attrs[transcript_id_pos + 1][1:-2] + transcript_str = attrs[transcript_id_pos + 1] + start_pos = transcript_str.find('"') + end_pos = transcript_str.rfind('"') + transcript_id = transcript_str[start_pos+1:end_pos] if feature_type in ["transcript", "mRNA"]: if transcript_id in transcript_ids: logger.warning("Duplicated transcript id %s on line %d" % (transcript_id, line_count)) @@ -251,7 +257,7 @@ def check_gtf_duplicates(gtf): return gtf_correct, corrected_gtf, gtf_name + ".corrected" + inner_ext.lower(), complete_genedb -def find_coverted_db(converted_gtfs, gtf_filename): +def find_converted_db(converted_gtfs, gtf_filename): gtf_mtime = converted_gtfs.get(gtf_filename, {}).get('gtf_mtime') db_mtime = converted_gtfs.get(gtf_filename, {}).get('db_mtime') db_file = converted_gtfs.get(gtf_filename, {}).get('genedb') @@ -261,12 +267,13 @@ def find_coverted_db(converted_gtfs, gtf_filename): return None -def compare_stored_gtf(converted_gtfs, gtf_filename, genedb_filename): +def 
compare_stored_gtf(converted_gtfs, gtf_filename, genedb_filename, complete_genedb): gtf_mtime = converted_gtfs.get(gtf_filename, {}).get('gtf_mtime') db_mtime = converted_gtfs.get(gtf_filename, {}).get('db_mtime') - if os.path.exists(gtf_filename) and os.path.getmtime(gtf_filename) == gtf_mtime: - if os.path.exists(genedb_filename) and os.path.getmtime(genedb_filename) == db_mtime: - return True + is_complete = converted_gtfs.get(gtf_filename, {}).get('complete_db') + return (os.path.exists(gtf_filename) and os.path.getmtime(gtf_filename) == gtf_mtime and + os.path.exists(genedb_filename) and os.path.getmtime(genedb_filename) == db_mtime + and complete_genedb == is_complete) def convert_db(gtf_filename, genedb_filename, convert_fn, args): @@ -277,13 +284,13 @@ def convert_db(gtf_filename, genedb_filename, convert_fn, args): if not args.clean_start: if convert_fn == gtf2db: - converted_db = find_coverted_db(converted_gtfs, gtf_filename) + converted_db = find_converted_db(converted_gtfs, gtf_filename) if converted_db is not None: logger.info("Gene annotation file found. Using " + converted_db) return gtf_filename, converted_db else: for converted_gtf in converted_gtfs: - if compare_stored_gtf(converted_gtfs, converted_gtf, genedb_filename): + if compare_stored_gtf(converted_gtfs, converted_gtf, genedb_filename, args.complete_genedb): logger.info("Gene annotation file found. 
Using " + converted_gtf) return converted_gtf, genedb_filename From 90b4f9c5fd70609e6363fd097f4f9d278b12ff02 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Thu, 18 Apr 2024 16:04:33 +0300 Subject: [PATCH 06/21] add --no_gtf_check option --- README.md | 3 +++ isoquant.py | 2 ++ src/gtf2db.py | 11 ++++++++--- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index f74c72bc..c0afb9cc 100644 --- a/README.md +++ b/README.md @@ -552,6 +552,9 @@ We recommend _not_ to modify these options unless you are clearly aware of their `--no_gzip` Do not compress large output files. +`--no_gtf_check` + Do not perform input GTF checks. + `--no_secondary` Ignore secondary alignments. diff --git a/isoquant.py b/isoquant.py index 363ef061..aaefde88 100755 --- a/isoquant.py +++ b/isoquant.py @@ -193,6 +193,8 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help # ADDITIONAL add_additional_option("--no_gzip", help="do not gzip large output files", dest="gzipped", action='store_false', default=True) + add_additional_option("--no_gtf_check", help="do not perform GTF checks", dest="gtf_check", + action='store_false', default=True) add_additional_option("--high_memory", help="increase RAM consumption (store alignment and the genome in RAM)", action='store_true', default=False) add_additional_option("--no_junc_bed", action="store_true", default=False, diff --git a/src/gtf2db.py b/src/gtf2db.py index 473e8773..ffcd7d31 100755 --- a/src/gtf2db.py +++ b/src/gtf2db.py @@ -91,7 +91,7 @@ def get_color(transcript_kind): logger.info("Gene database BED written to " + bed) -def gtf2db(gtf, db, complete_db=False): +def check_input_gtf(gtf, db, complete_db): logger.info("Checking input gene annotation") gtf_is_correct, corrected_gtf, out_fname, has_meta_features = check_gtf_duplicates(gtf) if not gtf_is_correct: @@ -120,6 +120,11 @@ def gtf2db(gtf, db, complete_db=False): logger.warning("The results of this run might not be correct.")
logger.warning("Remove the output folder and restart IsoQuant without --complete_genedb.") + +def gtf2db(gtf, db, complete_db=False, check_gtf=True): + if check_gtf: + check_input_gtf(gtf, db, complete_db) + logger.info("Converting gene annotation file to .db format (takes a while)...") gffutils.create_db(gtf, db, force=True, keep_order=True, merge_strategy='error', sort_attribute_values=True, disable_infer_transcripts=complete_db, @@ -128,7 +133,7 @@ def gtf2db(gtf, db, complete_db=False): logger.info("Provide this database next time to avoid excessive conversion") -def convert_gtf_to_db(args, output_is_dir=True): +def convert_gtf_to_db(args, output_is_dir=True, check_input_gtf=True): gtf_filename = args.genedb gtf_filename = os.path.abspath(gtf_filename) output_path = args.output if args.genedb_output is None else args.genedb_output @@ -295,7 +300,7 @@ def convert_db(gtf_filename, genedb_filename, convert_fn, args): return converted_gtf, genedb_filename if convert_fn == gtf2db: - convert_fn(gtf_filename, genedb_filename, args.complete_genedb) + convert_fn(gtf_filename, genedb_filename, args.complete_genedb, args.gtf_check) else: convert_fn(genedb_filename, gtf_filename) converted_gtfs[gtf_filename] = { From 11ed7299377ffa5ec455c8f00c9766f6599834a0 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Thu, 18 Apr 2024 16:12:35 +0300 Subject: [PATCH 07/21] check for converted DB correctness --- src/gtf2db.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/gtf2db.py b/src/gtf2db.py index ffcd7d31..c8220014 100755 --- a/src/gtf2db.py +++ b/src/gtf2db.py @@ -262,23 +262,22 @@ def check_gtf_duplicates(gtf): return gtf_correct, corrected_gtf, gtf_name + ".corrected" + inner_ext.lower(), complete_genedb -def find_converted_db(converted_gtfs, gtf_filename): +def find_converted_db(converted_gtfs, gtf_filename, complete_genedb): gtf_mtime = converted_gtfs.get(gtf_filename, {}).get('gtf_mtime') db_mtime = 
converted_gtfs.get(gtf_filename, {}).get('db_mtime') db_file = converted_gtfs.get(gtf_filename, {}).get('genedb') - if os.path.exists(gtf_filename) and os.path.getmtime(gtf_filename) == gtf_mtime: - if os.path.exists(db_file) and os.path.getmtime(db_file) == db_mtime: - return db_file + is_complete = converted_gtfs.get(gtf_filename, {}).get('complete_db') + if (os.path.exists(gtf_filename) and os.path.getmtime(gtf_filename) == gtf_mtime and + os.path.exists(db_file) and os.path.getmtime(db_file) == db_mtime and complete_genedb == is_complete): + return db_file return None -def compare_stored_gtf(converted_gtfs, gtf_filename, genedb_filename, complete_genedb): +def compare_stored_gtf(converted_gtfs, gtf_filename, genedb_filename): gtf_mtime = converted_gtfs.get(gtf_filename, {}).get('gtf_mtime') db_mtime = converted_gtfs.get(gtf_filename, {}).get('db_mtime') - is_complete = converted_gtfs.get(gtf_filename, {}).get('complete_db') return (os.path.exists(gtf_filename) and os.path.getmtime(gtf_filename) == gtf_mtime and - os.path.exists(genedb_filename) and os.path.getmtime(genedb_filename) == db_mtime - and complete_genedb == is_complete) + os.path.exists(genedb_filename) and os.path.getmtime(genedb_filename) == db_mtime) def convert_db(gtf_filename, genedb_filename, convert_fn, args): @@ -289,13 +288,13 @@ def convert_db(gtf_filename, genedb_filename, convert_fn, args): if not args.clean_start: if convert_fn == gtf2db: - converted_db = find_converted_db(converted_gtfs, gtf_filename) + converted_db = find_converted_db(converted_gtfs, gtf_filename, args.complete_genedb) if converted_db is not None: logger.info("Gene annotation file found. Using " + converted_db) return gtf_filename, converted_db else: for converted_gtf in converted_gtfs: - if compare_stored_gtf(converted_gtfs, converted_gtf, genedb_filename, args.complete_genedb): + if compare_stored_gtf(converted_gtfs, converted_gtf, genedb_filename): logger.info("Gene annotation file found. 
Using " + converted_gtf) return converted_gtf, genedb_filename From daaef96f1ff1ea314c09c5606b8d84b573aa4ef4 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Fri, 19 Apr 2024 01:48:59 +0300 Subject: [PATCH 08/21] simplify assignment type logic, fix isoform discovery --- src/graph_based_model_construction.py | 16 ++++++------- src/isoform_assignment.py | 34 ++++++++++++++++----------- src/long_read_assigner.py | 8 +++---- src/long_read_counter.py | 8 +++---- src/multimap_resolver.py | 8 +++---- 5 files changed, 39 insertions(+), 35 deletions(-) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index 0c377623..7395e55a 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -197,7 +197,7 @@ def compare_models_with_known(self): assignment.introns_match = all(e == 1 for e in combined_profile.read_intron_profile.read_profile) assignment.gene_info = self.gene_info - if assignment.assignment_type in [ReadAssignmentType.intergenic, ReadAssignmentType.noninformative] or \ + if assignment.assignment_type.is_unassigned() or \ not assignment.isoform_matches: # create intergenic assignment.assignment_type = ReadAssignmentType.intergenic @@ -481,13 +481,15 @@ def construct_assignment_based_isoforms(self, read_assignment_storage): if not read_assignment: continue - if len(read_assignment.corrected_exons) == 1 and read_assignment.polyA_found and not read_assignment.multimapper and \ - read_assignment.assignment_type in {ReadAssignmentType.noninformative, ReadAssignmentType.inconsistent, ReadAssignmentType.intergenic}: + if (len(read_assignment.corrected_exons) == 1 and + read_assignment.polyA_found and not read_assignment.multimapper and + (read_assignment.assignment_type.is_unassigned() or + read_assignment.assignment_type.is_inconsistent())): novel_mono_exon_reads.append(read_assignment) - if read_assignment.assignment_type not in {ReadAssignmentType.unique, -
ReadAssignmentType.unique_minor_difference}: + if not read_assignment.assignment_type.is_unique(): continue + refrenence_isoform_id = read_assignment.isoform_matches[0].assigned_transcript if refrenence_isoform_id in GraphBasedModelConstructor.detected_known_isoforms: continue @@ -715,9 +717,7 @@ def assign_reads_to_models(self, read_assignments): model_assignment = assigner.assign_to_isoform(assignment.read_id, model_combined_profile) model_assignment.read_group = assignment.read_group # check that no serious contradiction occurs - if model_assignment.assignment_type in [ReadAssignmentType.unique, - ReadAssignmentType.unique_minor_difference, - ReadAssignmentType.ambiguous]: + if model_assignment.assignment_type.is_consistent(): matched_isoforms = [m.assigned_transcript for m in model_assignment.isoform_matches] if len(matched_isoforms) == 1: diff --git a/src/isoform_assignment.py b/src/isoform_assignment.py index 0de604c3..916b59f4 100644 --- a/src/isoform_assignment.py +++ b/src/isoform_assignment.py @@ -22,20 +22,26 @@ class ReadAssignmentType(Enum): unique_minor_difference = 2 inconsistent = 30 inconsistent_non_intronic = 31 - inconsistent_amb = 32 + inconsistent_ambiguous = 32 suspended = 255 - @staticmethod - def is_inconsistent(assignment_type): - return assignment_type in [ReadAssignmentType.inconsistent, - ReadAssignmentType.inconsistent_amb, - ReadAssignmentType.inconsistent_non_intronic] + def is_inconsistent(self): + return self in [ReadAssignmentType.inconsistent, + ReadAssignmentType.inconsistent_ambiguous, + ReadAssignmentType.inconsistent_non_intronic] - @staticmethod - def is_consistent(assignment_type): - return assignment_type in [ReadAssignmentType.unique, - ReadAssignmentType.unique_minor_difference, - ReadAssignmentType.ambiguous] + def is_consistent(self): + return self in [ReadAssignmentType.unique, + ReadAssignmentType.unique_minor_difference, + ReadAssignmentType.ambiguous] + + def is_unassigned(self): + return self in 
[ReadAssignmentType.noninformative, + ReadAssignmentType.intergenic] + + def is_unique(self): + return self in [ReadAssignmentType.unique_minor_difference, + ReadAssignmentType.unique] # SQANTI-like @unique @@ -538,9 +544,9 @@ def __init__(self, read_id, assignment_type, match=None): assigned_genes = set([m.assigned_gene for m in self.isoform_matches]) self.gene_assignment_type = ReadAssignmentType.ambiguous if len( assigned_genes) > 1 else ReadAssignmentType.unique - elif self.assignment_type == ReadAssignmentType.inconsistent_amb: + elif self.assignment_type == ReadAssignmentType.inconsistent_ambiguous: assigned_genes = set([m.assigned_gene for m in self.isoform_matches]) - self.gene_assignment_type = ReadAssignmentType.inconsistent_amb if len( + self.gene_assignment_type = ReadAssignmentType.inconsistent_ambiguous if len( assigned_genes) > 1 else ReadAssignmentType.inconsistent else: self.gene_assignment_type = self.assignment_type @@ -728,7 +734,7 @@ def match_subtype_to_str_with_additional_info(event, strand, read_introns, isofo def is_matching_assignment(isoform_assignment): if isoform_assignment.assignment_type == ReadAssignmentType.unique: return True - elif isoform_assignment.assignment_type in [ReadAssignmentType.unique, ReadAssignmentType.unique_minor_difference]: + elif isoform_assignment.assignment_type.is_unique(): allowed_set = {MatchEventSubtype.none, MatchEventSubtype.fsm, MatchEventSubtype.exon_misalignment, diff --git a/src/long_read_assigner.py b/src/long_read_assigner.py index d50aeea7..8041e920 100644 --- a/src/long_read_assigner.py +++ b/src/long_read_assigner.py @@ -322,7 +322,7 @@ def check_read_ends(self, read_split_exon_profile, assignment): if any(MatchEventSubtype.is_major_elongation(e.event_type) for e in exon_elongation_types): # serious exon elongation - if not ReadAssignmentType.is_inconsistent(assignment.assignment_type): + if not assignment.assignment_type.is_inconsistent(): 
assignment.set_assignment_type(ReadAssignmentType.inconsistent_non_intronic) elif any(MatchEventSubtype.is_minor_elongation(e.event_type) for e in exon_elongation_types): # minor exon elongation @@ -514,7 +514,7 @@ def match_consistent(self, read_id, combined_read_profile): self.verify_read_ends_for_assignment(combined_read_profile, assignment) # if apa detected isoform should be treated as inconsistent and rechecked - if ReadAssignmentType.is_inconsistent(assignment.assignment_type): + if assignment.assignment_type.is_inconsistent(): return None return assignment @@ -603,7 +603,7 @@ def match_inconsistent(self, read_id, combined_read_profile): assignment_type = self.classify_assignment(best_isoforms, read_matches) if not combined_read_profile.read_intron_profile.read_profile: isoform_matches = self.create_monoexon_matches(read_matches, best_isoforms) - elif ReadAssignmentType.is_inconsistent(assignment_type): + elif assignment_type.is_inconsistent(): isoform_matches = self.create_inconsistent_matches(read_matches, best_isoforms, penalty_score) else: isoform_matches = self.create_consistent_matches(read_matches, best_isoforms, combined_read_profile) @@ -626,7 +626,7 @@ def classify_assignment(self, best_isoforms, read_matches): inconsistency_type = ReadAssignmentType.inconsistent else: inconsistency_type = ReadAssignmentType.inconsistent_non_intronic - assignment_type = ReadAssignmentType.inconsistent_amb if is_abmiguous else inconsistency_type + assignment_type = ReadAssignmentType.inconsistent_ambiguous if is_abmiguous else inconsistency_type elif any(MatchEventSubtype.is_minor_error(e) for e in all_event_types): #logger.debug("* * Assignment has minor errors") assignment_type = ReadAssignmentType.ambiguous if is_abmiguous else ReadAssignmentType.unique_minor_difference diff --git a/src/long_read_counter.py b/src/long_read_counter.py index f7ffa22b..43657def 100644 --- a/src/long_read_counter.py +++ b/src/long_read_counter.py @@ -164,8 +164,7 @@ def 
add_read_info(self, read_assignment=None): # TODO: add __alignment_not_unique / __too_low_aQual ? if not read_assignment: self.not_aligned_reads += 1 - if read_assignment.assignment_type in [ReadAssignmentType.noninformative, ReadAssignmentType.intergenic] or \ - not read_assignment.isoform_matches: + if read_assignment.assignment_type.is_unassigned() or not read_assignment.isoform_matches: self.not_assigned_reads += 1 elif read_assignment.isoform_matches and read_assignment.isoform_matches[0].assigned_transcript is None: self.not_assigned_reads += 1 @@ -186,7 +185,7 @@ def add_read_info(self, read_assignment=None): if count_value > 0: self.all_features.add(feature_id) self.ambiguous_reads += 1 - elif read_assignment.assignment_type == ReadAssignmentType.inconsistent: + elif read_assignment.assignment_type.is_inconsistent(): feature_ids = set([self.get_feature_id(m) for m in read_assignment.isoform_matches]) group_id = AbstractReadGrouper.default_group_id if self.ignore_read_groups else read_assignment.read_group count_value = self.read_counter.process_inconsistent(read_assignment.isoform_matches[0], len(feature_ids)) @@ -196,8 +195,7 @@ def add_read_info(self, read_assignment=None): self.feature_counter[group_id][feature_id] += count_value self.all_features.add(feature_id) self.confirmed_features.add((group_id, feature_id)) - elif read_assignment.assignment_type == ReadAssignmentType.unique or\ - read_assignment.assignment_type == ReadAssignmentType.unique_minor_difference: + elif read_assignment.assignment_type.is_unique(): feature_id = self.get_feature_id(read_assignment.isoform_matches[0]) group_id = AbstractReadGrouper.default_group_id if self.ignore_read_groups else read_assignment.read_group self.feature_counter[group_id][feature_id] += 1.0 diff --git a/src/multimap_resolver.py b/src/multimap_resolver.py index 26fc1abc..1567e455 100644 --- a/src/multimap_resolver.py +++ b/src/multimap_resolver.py @@ -50,11 +50,11 @@ def select_best_assignment(self, 
assignment_list): primary_inconsistent = set() for i, a in enumerate(assignment_list): - if ReadAssignmentType.is_inconsistent(a.assignment_type): + if a.assignment_type.is_inconsistent(): if not a.multimapper: primary_inconsistent.add(i) inconsistent_assignments.add(i) - elif ReadAssignmentType.is_consistent(a.assignment_type): + elif a.assignment_type.is_consistent(): consistent_assignments.add(i) if not a.multimapper and not a.assignment_type == ReadAssignmentType.ambiguous: primary_unique.add(i) @@ -111,8 +111,8 @@ def filter_assignments(assignment_list, assignments_to_keep, mark_as_ambiguous=T for i in range(len(assignment_list)): assignment = assignment_list[i] if i in assignments_to_keep: - if ReadAssignmentType.is_inconsistent(assignment.assignment_type): - ambiguity_assignment_type = ReadAssignmentType.inconsistent_amb + if assignment.assignment_type.is_inconsistent(): + ambiguity_assignment_type = ReadAssignmentType.inconsistent_ambiguous else: ambiguity_assignment_type = ReadAssignmentType.ambiguous From 9bfcb8a409d6929fb5b2506a9bafebee68f16e2c Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Fri, 19 Apr 2024 18:15:20 +0300 Subject: [PATCH 09/21] fix multimapping --- src/multimap_resolver.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/multimap_resolver.py b/src/multimap_resolver.py index 1567e455..0583b06f 100644 --- a/src/multimap_resolver.py +++ b/src/multimap_resolver.py @@ -118,6 +118,7 @@ def filter_assignments(assignment_list, assignments_to_keep, mark_as_ambiguous=T if change_transcript_assignment_type: assignment.assignment_type = ambiguity_assignment_type + assignment.multimapper = True if change_gene_assignment_type: assignment.gene_assignment_type = ambiguity_assignment_type assignment.multimapper = True From b5a246b99a75b508080b813929cf85dcb8345571 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Mon, 22 Apr 2024 22:58:37 +0300 Subject: [PATCH 10/21] proper duplicates detection and removal --- src/alignment_processor.py 
| 6 ++-- src/dataset_processor.py | 2 +- src/isoform_assignment.py | 26 ++++++++++++++ src/multimap_resolver.py | 73 +++++++++++++++++++++++++++++++++----- 4 files changed, 95 insertions(+), 12 deletions(-) diff --git a/src/alignment_processor.py b/src/alignment_processor.py index 2022ee3f..24f038e5 100644 --- a/src/alignment_processor.py +++ b/src/alignment_processor.py @@ -284,7 +284,7 @@ def process_alignments_in_region(self, current_region, alignment_storage): if gene_info.empty(): assignment_storage = self.process_intergenic(alignment_storage, current_region) else: - assignment_storage = self.process_genic(alignment_storage, gene_info) + assignment_storage = self.process_genic(alignment_storage, gene_info, current_region) return gene_info, assignment_storage @@ -332,6 +332,7 @@ def process_intergenic(self, alignment_storage, region): alignment_info.polya_info.internal_polyt_pos != -1) read_assignment.polya_info = alignment_info.polya_info read_assignment.cage_found = len(alignment_info.cage_hits) > 0 + read_assignment.genomic_region = region read_assignment.exons = alignment_info.read_exons read_assignment.corrected_exons = corrector.correct_read(alignment_info) read_assignment.corrected_introns = junctions_from_blocks(read_assignment.corrected_exons) @@ -346,7 +347,7 @@ def process_intergenic(self, alignment_storage, region): assignment_storage.append(read_assignment) return assignment_storage - def process_genic(self, alignment_storage, gene_info): + def process_genic(self, alignment_storage, gene_info, region): assigner = LongReadAssigner(gene_info, self.params) profile_constructor = CombinedProfileConstructor(gene_info, self.params) exon_corrector = ExonCorrector(gene_info, self.params, self.chr_record) @@ -388,6 +389,7 @@ def process_genic(self, alignment_storage, gene_info): alignment_info.polya_info.internal_polyt_pos != -1) read_assignment.polya_info = alignment_info.polya_info read_assignment.cage_found = len(alignment_info.cage_hits) > 0 + 
read_assignment.genomic_region = region read_assignment.exons = alignment_info.read_exons read_assignment.corrected_exons = exon_corrector.correct_assigned_read(alignment_info, read_assignment) diff --git a/src/dataset_processor.py b/src/dataset_processor.py index 884930cf..8ddb4a6c 100644 --- a/src/dataset_processor.py +++ b/src/dataset_processor.py @@ -353,7 +353,7 @@ def finalize_aggregators(self, sample): logger.info("Gene counts are stored in " + self.gene_counter.output_counts_file_name) logger.info("Transcript counts are stored in " + self.transcript_counter.output_counts_file_name) logger.info("Read assignments are stored in " + self.basic_printer.output_file_name + - ".gz" if self.args.gzipped else "") + (".gz" if self.args.gzipped else "")) self.read_stat_counter.print_start("Read assignment statistics") diff --git a/src/isoform_assignment.py b/src/isoform_assignment.py index 916b59f4..41c1b525 100644 --- a/src/isoform_assignment.py +++ b/src/isoform_assignment.py @@ -467,6 +467,12 @@ def __init__(self, read_assignment): self.assignment_id = read_assignment.assignment_id self.read_id = read_assignment.read_id self.chr_id = read_assignment.chr_id + self.start = 0 + self.end = 0 + if read_assignment.exons: + self.start = read_assignment.exons[0][0] + self.end = read_assignment.exons[-1][1] + self.genomic_region = read_assignment.genomic_region self.multimapper = read_assignment.multimapper self.polyA_found = read_assignment.polyA_found self.assignment_type = read_assignment.assignment_type @@ -487,12 +493,24 @@ def __init__(self, read_assignment): self.genes = list(gene_set) self.isoforms = list(isoform_set) + def __eq__(self, other): + if isinstance(other, BasicReadAssignment): + return (self.read_id == other.read_id and + self.chr_id == other.chr_id and + self.start == other.start and + self.end == other.end and + self.isoforms == other.isoforms) + return False + @classmethod def deserialize(cls, infile): read_assignment = cls.__new__(cls) 
read_assignment.assignment_id = read_int(infile) read_assignment.read_id = read_string(infile) read_assignment.chr_id = read_string(infile) + read_assignment.start = read_int(infile) + read_assignment.end = read_int(infile) + read_assignment.genomic_region = (read_int(infile), read_int(infile)) bool_arr = read_bool_array(infile, 2) read_assignment.multimapper = bool_arr[0] read_assignment.polyA_found = bool_arr[1] @@ -507,6 +525,10 @@ def serialize(self, outfile): write_int(self.assignment_id, outfile) write_string(self.read_id, outfile) write_string(self.chr_id, outfile) + write_int(self.start, outfile) + write_int(self.end, outfile) + write_int(self.genomic_region[0], outfile) + write_int(self.genomic_region[1], outfile) write_bool_array([self.multimapper, self.polyA_found], outfile) write_short_int(self.assignment_type.value, outfile) write_short_int(self.gene_assignment_type.value, outfile) @@ -520,6 +542,7 @@ class ReadAssignment: def __init__(self, read_id, assignment_type, match=None): self.assignment_id = ReadAssignment.assignment_id_generator.increment() self.read_id = read_id + self.genomic_region = (0, 0) self.exons = None self.corrected_exons = None self.corrected_introns = None @@ -562,6 +585,7 @@ def deserialize(cls, infile, gene_info): read_assignment = cls.__new__(cls) read_assignment.assignment_id = read_int(infile) read_assignment.read_id = read_string(infile) + read_assignment.genomic_region = (read_int(infile), read_int(infile)) read_assignment.exons = read_list_of_pairs(infile, read_int) read_assignment.corrected_exons = read_list_of_pairs(infile, read_int) read_assignment.corrected_introns = junctions_from_blocks(read_assignment.corrected_exons) @@ -589,6 +613,8 @@ def deserialize(cls, infile, gene_info): def serialize(self, outfile): write_int(self.assignment_id, outfile) write_string(self.read_id, outfile) + write_int(self.genomic_region[0], outfile) + write_int(self.genomic_region[1], outfile) write_list_of_pairs(self.exons, outfile, 
write_int) write_list_of_pairs(self.corrected_exons, outfile, write_int) write_bool_array([self.multimapper, self.polyA_found, self.cage_found], outfile) diff --git a/src/multimap_resolver.py b/src/multimap_resolver.py index 0583b06f..07a70ef2 100644 --- a/src/multimap_resolver.py +++ b/src/multimap_resolver.py @@ -6,6 +6,7 @@ import logging from enum import Enum +from collections import defaultdict from .isoform_assignment import ReadAssignmentType @@ -19,6 +20,7 @@ class MultimapResolvingStrategy(Enum): class MultimapResolver: + duplicate_counter = 0 def __init__(self, strategy): self.strategy = strategy @@ -44,20 +46,20 @@ def resolve(self, assignment_list): def select_best_assignment(self, assignment_list): logger.debug("Resolving read %s" % assignment_list[0].read_id) - primary_unique = set() - consistent_assignments = set() - inconsistent_assignments = set() - primary_inconsistent = set() + primary_unique = [] + consistent_assignments = [] + inconsistent_assignments = [] + primary_inconsistent = [] for i, a in enumerate(assignment_list): if a.assignment_type.is_inconsistent(): if not a.multimapper: - primary_inconsistent.add(i) - inconsistent_assignments.add(i) + primary_inconsistent.append(i) + inconsistent_assignments.append(i) elif a.assignment_type.is_consistent(): - consistent_assignments.add(i) + consistent_assignments.append(i) if not a.multimapper and not a.assignment_type == ReadAssignmentType.ambiguous: - primary_unique.add(i) + primary_unique.append(i) if primary_unique: return self.filter_assignments(assignment_list, primary_unique) @@ -98,6 +100,8 @@ def merge_assignments(assignment_list): # if selected assignments feature more than 1 gene/isoform - mark as ambiguous accordingly @staticmethod def filter_assignments(assignment_list, assignments_to_keep, mark_as_ambiguous=True): + assignments_to_keep = MultimapResolver.find_duplicates(assignment_list, assignments_to_keep) + all_genes = set() all_isoforms = set() for i in assignments_to_keep: @@ 
-108,9 +112,10 @@ def filter_assignments(assignment_list, assignments_to_keep, mark_as_ambiguous=T change_transcript_assignment_type = mark_as_ambiguous and len(all_isoforms) > 1 change_gene_assignment_type = mark_as_ambiguous and len(all_genes) > 1 + assignments_to_keep_set = set(assignments_to_keep) for i in range(len(assignment_list)): assignment = assignment_list[i] - if i in assignments_to_keep: + if i in assignments_to_keep_set: if assignment.assignment_type.is_inconsistent(): ambiguity_assignment_type = ReadAssignmentType.inconsistent_ambiguous else: @@ -127,3 +132,53 @@ def filter_assignments(assignment_list, assignments_to_keep, mark_as_ambiguous=T assignment.gene_assignment_type = ReadAssignmentType.suspended return assignment_list + + # finds groups of duplicated assignments + @staticmethod + def find_duplicates(assignment_list, assignment_indices): + if len(assignment_indices) <= 1: + return assignment_indices + + selected_assignments = [] + discarded_duplicates = set() + for i in range(len(assignment_indices)): + index1 = assignment_indices[i] + if index1 in discarded_duplicates: + # this assignment was already discarded - no need to compare it + continue + # if it was not discarded earlier - keep anyway; next assignments will either duplicate it or won't + selected_assignments.append(index1) + + for j in range(i + 1, len(assignment_indices)): + # find duplicating assignments among the next onex + index2 = assignment_indices[j] + if index2 in discarded_duplicates: + # this assignment was already discarded - no need to compare it + continue + + if assignment_list[index1] == assignment_list[index2]: + # discard duplicating assignment with larger index + discarded_duplicates.add(index2) + + # check if duplicates come from the same region, + # if they are - it means we have identical records in the BAM files, print a warning + if assignment_list[index1].genomic_region == assignment_list[index2].genomic_region: + MultimapResolver.duplicate_counter += 1 + 
example_assignment = assignment_list[index1] + if MultimapResolver.duplicate_counter == 5: + logger.warning( + "More possible duplicates were detected but will not be printed to limit the log size. " + "All duplicated entries will be ignored. " + "Use --debug to see all of them in the isoquant.log.") + if MultimapResolver.duplicate_counter >= 5: + logger.debug( + "Read %s seems to have duplicated alignment entries at %s: %d and will be ignored. " + "Please, check you input." % + (example_assignment.read_id, example_assignment.chr_id, example_assignment.start)) + else: + logger.warning( + "Read %s seems to have duplicated alignment entries at %s: %d and will be ignored. " + "Please, check you input." + % (example_assignment.read_id, example_assignment.chr_id, example_assignment.start)) + + return selected_assignments From 875d8357d0f60c0940fa2e311c3c53fd85809713 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Mon, 22 Apr 2024 23:45:23 +0300 Subject: [PATCH 11/21] fix printing order --- src/isoform_assignment.py | 1 + src/long_read_assigner.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/isoform_assignment.py b/src/isoform_assignment.py index 41c1b525..8697444b 100644 --- a/src/isoform_assignment.py +++ b/src/isoform_assignment.py @@ -539,6 +539,7 @@ def serialize(self, outfile): class ReadAssignment: assignment_id_generator = AtomicCounter() + def __init__(self, read_id, assignment_type, match=None): self.assignment_id = ReadAssignment.assignment_id_generator.increment() self.read_id = read_id diff --git a/src/long_read_assigner.py b/src/long_read_assigner.py index 8041e920..7e605cfb 100644 --- a/src/long_read_assigner.py +++ b/src/long_read_assigner.py @@ -381,7 +381,7 @@ def categorize_correct_splice_match(self, combined_read_profile, isoform_id): # make proper match subtype def categorize_multiple_splice_matches(self, combined_read_profile, isoform_ids): isoform_matches = [] - for isoform_id in isoform_ids: + for isoform_id 
in sorted(isoform_ids): isoform_matches.append(self.categorize_correct_splice_match(combined_read_profile, isoform_id)) return isoform_matches @@ -400,7 +400,7 @@ def categorize_correct_unspliced_match(self, combined_read_profile, isoform_id): def categorize_multiple_unspliced_matches(self, combined_read_profile, isoform_ids): isoform_matches = [] - for isoform_id in isoform_ids: + for isoform_id in sorted(isoform_ids): isoform_matches.append(self.categorize_correct_unspliced_match(combined_read_profile, isoform_id)) return isoform_matches @@ -593,7 +593,7 @@ def match_inconsistent(self, read_id, combined_read_profile): return ReadAssignment(read_id, ReadAssignmentType.noninformative) # logger.debug("* Inconsistencies detected: " + str(read_matches)) - # select ones with least number of inconsistent events + # select ones with the least number of inconsistent events best_isoforms, penalty_score = self.select_best_among_inconsistent(combined_read_profile, read_matches) if not best_isoforms: return ReadAssignment(read_id, ReadAssignmentType.noninformative) From e1457461d50a22980551b4a466b34f0c39b20963 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Mon, 22 Apr 2024 23:59:13 +0300 Subject: [PATCH 12/21] remove unused flag --- src/multimap_resolver.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/multimap_resolver.py b/src/multimap_resolver.py index 07a70ef2..f02c97b0 100644 --- a/src/multimap_resolver.py +++ b/src/multimap_resolver.py @@ -99,7 +99,7 @@ def merge_assignments(assignment_list): # select all given assignments from the list, makr rest as suspended # if selected assignments feature more than 1 gene/isoform - mark as ambiguous accordingly @staticmethod - def filter_assignments(assignment_list, assignments_to_keep, mark_as_ambiguous=True): + def filter_assignments(assignment_list, assignments_to_keep): assignments_to_keep = MultimapResolver.find_duplicates(assignment_list, assignments_to_keep) all_genes = set() @@ -109,8 
+109,8 @@ def filter_assignments(assignment_list, assignments_to_keep, mark_as_ambiguous=T all_genes.update(assignment.genes) all_isoforms.update(assignment.isoforms) - change_transcript_assignment_type = mark_as_ambiguous and len(all_isoforms) > 1 - change_gene_assignment_type = mark_as_ambiguous and len(all_genes) > 1 + change_transcript_assignment_type = len(all_isoforms) > 1 + change_gene_assignment_type = len(all_genes) > 1 assignments_to_keep_set = set(assignments_to_keep) for i in range(len(assignment_list)): From 757f3a3e01ba54286d4c43c016af7e7ce1bf9421 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Tue, 23 Apr 2024 22:44:57 +0300 Subject: [PATCH 13/21] refactor and improve long read counter --- src/isoform_assignment.py | 12 +-- src/long_read_counter.py | 183 ++++++++++++++++++++++++-------------- 2 files changed, 119 insertions(+), 76 deletions(-) diff --git a/src/isoform_assignment.py b/src/isoform_assignment.py index 8697444b..212ae02b 100644 --- a/src/isoform_assignment.py +++ b/src/isoform_assignment.py @@ -43,6 +43,10 @@ def is_unique(self): return self in [ReadAssignmentType.unique_minor_difference, ReadAssignmentType.unique] + def is_ambiguous(self): + return self in [ReadAssignmentType.ambiguous, + ReadAssignmentType.inconsistent_ambiguous] + # SQANTI-like @unique class MatchClassification(Enum): @@ -667,14 +671,6 @@ def add_match_attribute(self, match_event): m.add_subclassification(match_event) -def get_assigned_transcript_id(match): - return match.assigned_transcript - - -def get_assigned_gene_id(match): - return match.assigned_gene - - match_subtype_printable_names = \ {MatchEventSubtype.ism_left : ('ism_5', 'ism_3', 'ism'), MatchEventSubtype.ism_right : ('ism_3', 'ism_5', 'ism'), diff --git a/src/long_read_counter.py b/src/long_read_counter.py index 43657def..cdaaf2ba 100644 --- a/src/long_read_counter.py +++ b/src/long_read_counter.py @@ -11,8 +11,6 @@ from .isoform_assignment import ( ReadAssignmentType, nonintronic_events, - 
get_assigned_gene_id, - get_assigned_transcript_id ) from .gene_info import FeatureInfo from .read_groups import AbstractReadGrouper @@ -21,54 +19,106 @@ @unique -class CountingStrategies(Enum): +class CountingStrategy(Enum): unique_only = 1 with_ambiguous = 2 - inconsistent_all = 10 - inconsistent_minor = 11 + with_inconsistent = 10 + with_inconsistent_minor = 11 all = 20 all_minor = 21 + def no_inconsistent(self): + return self in [CountingStrategy.unique_only, CountingStrategy.with_ambiguous] -COUNTING_STRATEGIES = ["unique_only", "with_ambiguous", "with_inconsistent", "all"] + def ambiguous(self): + return self in [CountingStrategy.all, CountingStrategy.all_minor, CountingStrategy.with_ambiguous] + def inconsistent_minor(self): + return self in [CountingStrategy.with_inconsistent_minor, CountingStrategy.all_minor] + + def inconsistent(self): + return self in [CountingStrategy.with_inconsistent_minor, CountingStrategy.all_minor, + CountingStrategy.with_inconsistent, CountingStrategy.all] + + +COUNTING_STRATEGIES = [CountingStrategy.unique_only.name, + CountingStrategy.with_ambiguous.name, + CountingStrategy.with_inconsistent.name, + CountingStrategy.all.name] -class ReadWeightCounter: - def __init__(self, strategy, gene_counting=True): - if strategy == "unique_only": - self.strategy = CountingStrategies.unique_only - elif strategy == "with_ambiguous": - self.strategy = CountingStrategies.with_ambiguous - elif strategy == "all": - if gene_counting: - self.strategy = CountingStrategies.all - else: - self.strategy = CountingStrategies.all_minor - elif strategy == "with_inconsistent": - if gene_counting: - self.strategy = CountingStrategies.inconsistent_all - else: - self.strategy = CountingStrategies.inconsistent_minor - else: - raise KeyError("Unknown quantification strategy: " + strategy) - def process_inconsistent(self, isoform_match, feature_count): - if self.strategy == CountingStrategies.unique_only or self.strategy == CountingStrategies.with_ambiguous \ - or 
feature_count > 1: +class CountingStrategyFlags: + def __init__(self, counting_strategy): + self.use_ambiguous = counting_strategy.ambiguous() + self.use_inconsistent_minor = counting_strategy.inconsistent_minor() + self.use_inconsistent = counting_strategy.inconsistent() + + +class GeneAssignmentExtractor: + @staticmethod + def get_features(read_assignment): + gene_set = set() + for m in read_assignment.isoform_matches: + if m.assigned_gene: + gene_set.add(m.assigned_gene) + return gene_set + + @staticmethod + def get_assignment_type(read_assignment): + return read_assignment.gene_assignment_type + + @staticmethod + def confirms_feature(read_assignment): + return read_assignment.gene_assignment_type.is_unique() + + +class TranscriptAssignmentExtractor: + @staticmethod + def get_features(read_assignment): + transcript_set = set() + for m in read_assignment.isoform_matches: + if m.assigned_transcript: + transcript_set.add(m.assigned_transcript) + return transcript_set + + @staticmethod + def get_assignment_type(read_assignment): + return read_assignment.assignment_type + + @staticmethod + def confirms_feature(read_assignment): + transcript_id = read_assignment.isoform_matches[0].assigned_transcript + return (read_assignment.assignment_type.is_unique() and + (len(read_assignment.gene_info.all_isoforms_introns[transcript_id]) == 0 or + len(read_assignment.corrected_exons) > 1)) + + +class ReadWeightCounter: + def __init__(self, strategy_str, relax_strategy=True): + self.strategy = CountingStrategy[strategy_str] + if relax_strategy: + if self.strategy == CountingStrategy.with_inconsistent: + self.strategy = CountingStrategy.with_inconsistent_minor + elif self.strategy == CountingStrategy.all: + self.strategy = CountingStrategy.all_minor + self.strategy_flags = CountingStrategyFlags(self.strategy) + + def process_inconsistent(self, assignment_type, feature_count): + # use only for inconsistent assignments + if assignment_type == ReadAssignmentType.inconsistent_ambiguous 
or feature_count > 1: return 0.0 - elif self.strategy == CountingStrategies.all or self.strategy == CountingStrategies.inconsistent_all: + if self.strategy_flags.use_inconsistent: return 1.0 - elif self.strategy == CountingStrategies.all_minor or self.strategy == CountingStrategies.inconsistent_minor: - for m in isoform_match.match_subclassifications: - if m.event_type not in nonintronic_events: - return 0.0 + if self.strategy_flags.use_inconsistent_minor and assignment_type == ReadAssignmentType.inconsistent_non_intronic: return 1.0 return 0.0 def process_ambiguous(self, feature_count): - if self.strategy in {CountingStrategies.with_ambiguous, - CountingStrategies.all_minor, - CountingStrategies.all}: + if feature_count == 0: + return 0.0 + if feature_count == 1: + return 1.0 + if self.strategy_flags.use_ambiguous: return 1.0 / float(feature_count) else: return 0.0 @@ -141,10 +191,10 @@ def add_unaligned(self, n_reads=1): # count meta-features assigned to reads (genes or isoforms) # get_feature_id --- function that returns feature id form IsoformMatch object class AssignedFeatureCounter(AbstractCounter): - def __init__(self, output_prefix, get_feature_id, read_groups, read_counter, + def __init__(self, output_prefix, assignment_extractor, read_groups, read_counter, all_features=None, ignore_read_groups=False, output_zeroes=True): AbstractCounter.__init__(self, output_prefix, ignore_read_groups, output_zeroes) - self.get_feature_id = get_feature_id + self.assignment_extractor = assignment_extractor self.all_features = set(all_features) if all_features is not None else set() if ignore_read_groups: self.all_groups = [AbstractReadGrouper.default_group_id] @@ -161,48 +211,45 @@ def __init__(self, output_prefix, get_feature_id, read_groups, read_counter, self.output_stats_file_name = self.output_counts_file_name + ".stats" def add_read_info(self, read_assignment=None): - # TODO: add __alignment_not_unique / __too_low_aQual ? 
if not read_assignment: self.not_aligned_reads += 1 - if read_assignment.assignment_type.is_unassigned() or not read_assignment.isoform_matches: + return + elif read_assignment.assignment_type.is_unassigned() or not read_assignment.isoform_matches: self.not_assigned_reads += 1 + return elif read_assignment.isoform_matches and read_assignment.isoform_matches[0].assigned_transcript is None: self.not_assigned_reads += 1 logger.warning("Assigned feature for read %s is None, will be skipped. " "This message may be reported to the developers." % read_assignment.read_id) - elif read_assignment.assignment_type == ReadAssignmentType.ambiguous: - feature_ids = set([self.get_feature_id(m) for m in read_assignment.isoform_matches]) - group_id = AbstractReadGrouper.default_group_id if self.ignore_read_groups else read_assignment.read_group + return - if len(feature_ids) == 1: # different isoforms of same gene - feature_id = list(feature_ids)[0] - self.feature_counter[group_id][feature_id] += 1.0 - self.all_features.add(feature_id) - else: - for feature_id in feature_ids: - count_value = self.read_counter.process_ambiguous(len(feature_ids)) - self.feature_counter[group_id][feature_id] += count_value - if count_value > 0: - self.all_features.add(feature_id) + feature_ids = self.assignment_extractor.get_features(read_assignment) + assignment_type = self.assignment_extractor.get_assignment_type(read_assignment) + group_id = AbstractReadGrouper.default_group_id if self.ignore_read_groups else read_assignment.read_group + + if assignment_type == ReadAssignmentType.ambiguous: + count_value = self.read_counter.process_ambiguous(len(feature_ids)) + for feature_id in feature_ids: + self.feature_counter[group_id][feature_id] += count_value + if count_value > 0: + self.all_features.add(feature_id) + if len(feature_ids): self.ambiguous_reads += 1 + elif read_assignment.assignment_type.is_inconsistent(): - feature_ids = set([self.get_feature_id(m) for m in read_assignment.isoform_matches]) - 
group_id = AbstractReadGrouper.default_group_id if self.ignore_read_groups else read_assignment.read_group - count_value = self.read_counter.process_inconsistent(read_assignment.isoform_matches[0], len(feature_ids)) + count_value = self.read_counter.process_inconsistent(assignment_type, len(feature_ids)) if count_value > 0: for feature_id in feature_ids: - if feature_id: - self.feature_counter[group_id][feature_id] += count_value - self.all_features.add(feature_id) - self.confirmed_features.add((group_id, feature_id)) + # if feature_id: + self.feature_counter[group_id][feature_id] += count_value + self.all_features.add(feature_id) + #self.confirmed_features.add((group_id, feature_id)) + elif read_assignment.assignment_type.is_unique(): - feature_id = self.get_feature_id(read_assignment.isoform_matches[0]) - group_id = AbstractReadGrouper.default_group_id if self.ignore_read_groups else read_assignment.read_group + feature_id = list(feature_ids)[0] self.feature_counter[group_id][feature_id] += 1.0 self.all_features.add(feature_id) - transcript_id = read_assignment.isoform_matches[0].assigned_transcript - if len(read_assignment.gene_info.all_isoforms_introns[transcript_id]) == 0 or len(read_assignment.corrected_exons) > 1: - # only add features to confirmed ones if it has unique spliced match + if self.assignment_extractor.confirms_feature(read_assignment): self.confirmed_features.add((group_id, feature_id)) def add_read_info_raw(self, read_id, feature_ids, group_id=AbstractReadGrouper.default_group_id): @@ -316,16 +363,16 @@ def convert_counts_to_tpm(self): def create_gene_counter(output_file_name, strategy, complete_feature_list=None, read_groups=None, ignore_read_groups=False, output_zeroes=True): - read_weight_counter = ReadWeightCounter(strategy, gene_counting=True) - return AssignedFeatureCounter(output_file_name, get_assigned_gene_id, + read_weight_counter = ReadWeightCounter(strategy, relax_strategy=False) + return AssignedFeatureCounter(output_file_name, 
GeneAssignmentExtractor, read_groups, read_weight_counter, complete_feature_list, ignore_read_groups, output_zeroes) def create_transcript_counter(output_file_name, strategy, complete_feature_list=None, read_groups=None, ignore_read_groups=False, output_zeroes=True): - read_weight_counter = ReadWeightCounter(strategy, gene_counting=False) - return AssignedFeatureCounter(output_file_name, get_assigned_transcript_id, + read_weight_counter = ReadWeightCounter(strategy, relax_strategy=True) + return AssignedFeatureCounter(output_file_name, TranscriptAssignmentExtractor, read_groups, read_weight_counter, complete_feature_list, ignore_read_groups, output_zeroes) From adced67978f0afa2d0403932af03bade0b9ecc70 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Thu, 25 Apr 2024 13:03:17 +0300 Subject: [PATCH 14/21] fix assignment type bug --- src/long_read_counter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/long_read_counter.py b/src/long_read_counter.py index cdaaf2ba..e3efe01a 100644 --- a/src/long_read_counter.py +++ b/src/long_read_counter.py @@ -236,7 +236,7 @@ def add_read_info(self, read_assignment=None): if len(feature_ids): self.ambiguous_reads += 1 - elif read_assignment.assignment_type.is_inconsistent(): + elif assignment_type.is_inconsistent(): count_value = self.read_counter.process_inconsistent(assignment_type, len(feature_ids)) if count_value > 0: for feature_id in feature_ids: @@ -245,7 +245,7 @@ def add_read_info(self, read_assignment=None): self.all_features.add(feature_id) #self.confirmed_features.add((group_id, feature_id)) - elif read_assignment.assignment_type.is_unique(): + elif assignment_type.is_unique(): feature_id = list(feature_ids)[0] self.feature_counter[group_id][feature_id] += 1.0 self.all_features.add(feature_id) From 2662595cc406a3cb601c0c1bd3a62ee3d368c31c Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Wed, 24 Apr 2024 02:12:55 +0300 Subject: [PATCH 15/21] update quantification strategies, set 
reasonable defaults --- README.md | 24 ++++++------- isoquant.py | 73 ++++++++++++++++++++++------------------ src/long_read_counter.py | 33 +++++++++--------- 3 files changed, 69 insertions(+), 61 deletions(-) diff --git a/README.md b/README.md index c0afb9cc..3f7df462 100644 --- a/README.md +++ b/README.md @@ -462,18 +462,18 @@ where `FILE` is the file name, `READ_COL` is column with read ids (0 if not set) #### Quantification -`--transcript_quantification` Transcript quantification strategy, should be one of: - -* `unique_only` - only reads that are uniquely assigned and consistent with a transcript are used for quantification; -* `with_ambiguous` - ambiguously assigned reads are split between transcripts with equal weights (e.g. 1/2 when a read is assigned to 2 transcripts simultaneously, default mode); -* `with_inconsistent` - uniquely assigned reads with non-intronic inconsistencies (i.e. alternative poly-A site, TSS etc) are also included; -* `all` - all of the above. - -`--gene_quantification` Gene quantification strategy, should be one of: - -* `unique_only` - only reads that are uniquely assigned to a gene and consistent with any of gene's isoforms are used for quantification; -* `with_ambiguous` - ambiguously assigned reads are split between genes with equal weights (e.g. 1/2 when a read is assigned to 2 genes simultaneously); -* `with_inconsistent` - only reads that are uniquely assigned to a gene but are not necessarily consistent with genes isoforms (default); +`--transcript_quantification` Transcript quantification strategy; +`--gene_quantification` Gene quantification strategy; + +Available options for quantification: + +* `unique_only` - use only reads that are uniquely assigned and consistent with a transcript/gene +(i.e. flagged as unique/unique_minor_difference), default for transcript quantification; +* `with_ambiguous` - in addition to unique reads, ambiguously assigned consistent reads are split between features with equal weights +(e.g. 
1/2 when a read is assigned to 2 features simultaneously); +* `unique_splicing_consistent` - uses uniquely assigned reads that do not contradict annotated splice sites +(i.e. flagged as unique/unique_minor_difference or inconsistent_non_intronic), default for gene quantification; +* `unique_inconsistent` - uses uniquely assigned reads allowing any kind of inconsistency; * `all` - all of the above. diff --git a/isoquant.py b/isoquant.py index aaefde88..69ec59bf 100755 --- a/isoquant.py +++ b/isoquant.py @@ -35,7 +35,7 @@ from src.dataset_processor import DatasetProcessor, PolyAUsageStrategies from src.graph_based_model_construction import StrandnessReportingLevel from src.long_read_assigner import AmbiguityResolvingMethod -from src.long_read_counter import COUNTING_STRATEGIES +from src.long_read_counter import COUNTING_STRATEGIES, CountingStrategy from src.input_data_storage import InputDataStorage from src.multimap_resolver import MultimapResolvingStrategy from src.stats import combine_counts @@ -56,6 +56,7 @@ def parse_args(cmd_args=None, namespace=None): input_args_group = parser.add_argument_group('Input data') output_args_group = parser.add_argument_group('Output naming') pipeline_args_group = parser.add_argument_group('Pipeline options') + algo_args_group = parser.add_argument_group('Algorithm settings') other_options = parser.add_argument_group("Additional options:") show_full_help = '--full_help' in cmd_args @@ -132,37 +133,40 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help help="reads represent FL transcripts; both ends of the read are considered to be reliable") # ALGORITHM - add_additional_option("--delta", type=int, default=None, - help="delta for inexact splice junction comparison, chosen automatically based on data type") - add_hidden_option("--graph_clustering_distance", type=int, default=None, - help="intron graph clustering distance, " - "splice junctions less that this number of bp apart will not be differentiated") 
- add_additional_option("--matching_strategy", choices=["exact", "precise", "default", "loose"], - help="read-to-isoform matching strategy from the most strict to least", type=str, default=None) - add_additional_option("--splice_correction_strategy", choices=["none", "default_pacbio", "default_ont", "conservative_ont", "all", "assembly"], - help="read alignment correction strategy to use", - type=str, default=None) - add_additional_option("--model_construction_strategy", choices=["reliable", "default_pacbio", - "sensitive_pacbio", "fl_pacbio", "default_ont", - "sensitive_ont", "all", "assembly"], - help="transcript model construction strategy to use", - type=str, default=None) - - add_additional_option("--transcript_quantification", choices=COUNTING_STRATEGIES, - help="transcript quantification strategy", type=str, default="with_ambiguous") - add_additional_option("--gene_quantification", choices=COUNTING_STRATEGIES, - help="gene quantification strategy", type=str, default="with_inconsistent") - add_additional_option("--report_novel_unspliced", "-u", type=bool_str, - help="report novel monoexonic transcripts (true/false), " - "default: false for ONT, true for other data types") - add_additional_option("--report_canonical", type=str, choices=[e.name for e in StrandnessReportingLevel], - help="reporting level for novel transcripts based on canonical splice sites;" - " default: " + StrandnessReportingLevel.auto.name, - default=StrandnessReportingLevel.only_stranded.name) - add_additional_option("--polya_requirement", type=str, choices=[e.name for e in PolyAUsageStrategies], - help="require polyA tails to be present when reporting transcripts; " - "default: auto (requires polyA only when polyA percentage is >= 70%%)", - default=PolyAUsageStrategies.auto.name) + add_additional_option_to_group(algo_args_group, "--report_novel_unspliced", "-u", type=bool_str, + help="report novel monoexonic transcripts (true/false), " + "default: false for ONT, true for other data types") 
+ add_additional_option_to_group(algo_args_group, "--report_canonical", type=str, + choices=[e.name for e in StrandnessReportingLevel], + help="reporting level for novel transcripts based on canonical splice sites;" + " default: " + StrandnessReportingLevel.auto.name, + default=StrandnessReportingLevel.only_stranded.name) + add_additional_option_to_group(algo_args_group, "--polya_requirement", type=str, + choices=[e.name for e in PolyAUsageStrategies], + help="require polyA tails to be present when reporting transcripts; " + "default: auto (requires polyA only when polyA percentage is >= 70%%)", + default=PolyAUsageStrategies.auto.name) + + add_additional_option_to_group(algo_args_group, "--transcript_quantification", choices=COUNTING_STRATEGIES, + help="transcript quantification strategy", type=str, + default=CountingStrategy.unique_only.name) + add_additional_option_to_group(algo_args_group, "--gene_quantification", choices=COUNTING_STRATEGIES, + help="gene quantification strategy", type=str, + default=CountingStrategy.unique_splicing_consistent.name) + + add_additional_option_to_group(algo_args_group, "--matching_strategy", + choices=["exact", "precise", "default", "loose"], + help="read-to-isoform matching strategy from the most strict to least", + type=str, default=None) + add_additional_option_to_group(algo_args_group, "--splice_correction_strategy", + choices=["none", "default_pacbio", "default_ont", + "conservative_ont", "all", "assembly"], + help="read alignment correction strategy to use", type=str, default=None) + add_additional_option_to_group(algo_args_group, "--model_construction_strategy", + choices=["reliable", "default_pacbio", "sensitive_pacbio", "fl_pacbio", + "default_ont", "sensitive_ont", "all", "assembly"], + help="transcript model construction strategy to use", type=str, default=None) + # OUTPUT PROPERTIES pipeline_args_group.add_argument("--threads", "-t", help="number of threads to use", type=int, default="16") @@ -191,6 +195,11 @@ def 
add_hidden_option(*args, **kwargs): # show command only with --full-help help="align reads to reference without running further analysis") # ADDITIONAL + add_additional_option("--delta", type=int, default=None, + help="delta for inexact splice junction comparison, chosen automatically based on data type") + add_hidden_option("--graph_clustering_distance", type=int, default=None, + help="intron graph clustering distance, " + "splice junctions less that this number of bp apart will not be differentiated") add_additional_option("--no_gzip", help="do not gzip large output files", dest="gzipped", action='store_false', default=True) add_additional_option("--no_gtf_check", help="do not perform GTF checks", dest="gtf_check", diff --git a/src/long_read_counter.py b/src/long_read_counter.py index e3efe01a..de2cb31d 100644 --- a/src/long_read_counter.py +++ b/src/long_read_counter.py @@ -22,28 +22,29 @@ class CountingStrategy(Enum): unique_only = 1 with_ambiguous = 2 - with_inconsistent = 10 - with_inconsistent_minor = 11 + unique_splicing_consistent = 10 + unique_inconsistent = 11 + # all_splicing_consistent = 19 all = 20 - all_minor = 21 def no_inconsistent(self): return self in [CountingStrategy.unique_only, CountingStrategy.with_ambiguous] def ambiguous(self): - return self in [CountingStrategy.all, CountingStrategy.all_minor, CountingStrategy.with_ambiguous] + return self in [CountingStrategy.all, CountingStrategy.with_ambiguous] def inconsistent_minor(self): - return self in [CountingStrategy.with_inconsistent_minor, CountingStrategy.all_minor] + return self == CountingStrategy.unique_splicing_consistent def inconsistent(self): - return self in [CountingStrategy.with_inconsistent_minor, CountingStrategy.all_minor, - CountingStrategy.with_inconsistent, CountingStrategy.all] + return self in [CountingStrategy.unique_splicing_consistent, + CountingStrategy.unique_inconsistent, CountingStrategy.all] COUNTING_STRATEGIES = [CountingStrategy.unique_only.name, 
CountingStrategy.with_ambiguous.name, - CountingStrategy.with_inconsistent.name, + CountingStrategy.unique_splicing_consistent.name, + CountingStrategy.unique_inconsistent.name, CountingStrategy.all.name] @@ -94,19 +95,17 @@ def confirms_feature(read_assignment): class ReadWeightCounter: - def __init__(self, strategy_str, relax_strategy=True): + def __init__(self, strategy_str): self.strategy = CountingStrategy[strategy_str] - if relax_strategy: - if self.strategy == CountingStrategy.with_inconsistent: - self.strategy = CountingStrategy.with_inconsistent_minor - elif self.strategy == CountingStrategy.all: - self.strategy = CountingStrategy.all_minor self.strategy_flags = CountingStrategyFlags(self.strategy) def process_inconsistent(self, assignment_type, feature_count): # use only for inconsistent assignments if assignment_type == ReadAssignmentType.inconsistent_ambiguous or feature_count > 1: - return 0.0 + if self.strategy_flags.use_ambiguous: + return 1.0 / feature_count + else: + return 0.0 if self.strategy_flags.use_inconsistent: return 1.0 if self.strategy_flags.use_inconsistent_minor and assignment_type == ReadAssignmentType.inconsistent_non_intronic: @@ -363,7 +362,7 @@ def convert_counts_to_tpm(self): def create_gene_counter(output_file_name, strategy, complete_feature_list=None, read_groups=None, ignore_read_groups=False, output_zeroes=True): - read_weight_counter = ReadWeightCounter(strategy, relax_strategy=False) + read_weight_counter = ReadWeightCounter(strategy) return AssignedFeatureCounter(output_file_name, GeneAssignmentExtractor, read_groups, read_weight_counter, complete_feature_list, ignore_read_groups, output_zeroes) @@ -371,7 +370,7 @@ def create_gene_counter(output_file_name, strategy, complete_feature_list=None, def create_transcript_counter(output_file_name, strategy, complete_feature_list=None, read_groups=None, ignore_read_groups=False, output_zeroes=True): - read_weight_counter = ReadWeightCounter(strategy, relax_strategy=True) + 
read_weight_counter = ReadWeightCounter(strategy) return AssignedFeatureCounter(output_file_name, TranscriptAssignmentExtractor, read_groups, read_weight_counter, complete_feature_list, ignore_read_groups, output_zeroes) From e1b931e1d5f31e5607e39ee691e9244ace29624d Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Fri, 26 Apr 2024 01:21:56 +0300 Subject: [PATCH 16/21] refactor file merging, introduce new TPM normalization --- src/dataset_processor.py | 66 ++++++++++++---------------------------- src/file_utils.py | 42 ++++++++++++++++--------- src/long_read_counter.py | 15 +++++++-- 3 files changed, 59 insertions(+), 64 deletions(-) diff --git a/src/dataset_processor.py b/src/dataset_processor.py index 8ddb4a6c..ec3e6007 100644 --- a/src/dataset_processor.py +++ b/src/dataset_processor.py @@ -18,11 +18,11 @@ import pysam from pyfaidx import Fasta, Faidx, UnsupportedCompressionFormat -from .common import proper_plural_form, rreplace +from .common import proper_plural_form from .serialization import * from .isoform_assignment import BasicReadAssignment, ReadAssignmentType from .stats import EnumStats -from .file_utils import merge_files +from .file_utils import merge_files, merge_counts from .input_data_storage import SampleData from .alignment_processor import AlignmentCollector, AlignmentType from .long_read_counter import ( @@ -635,26 +635,14 @@ def process_assigned_reads(self, sample, dump_filename): self.merge_transcript_models(sample.prefix, aggregator, chr_ids, gff_printer) logger.info("Transcript model file " + gff_printer.model_fname) if self.args.genedb: - merge_files( - [ - rreplace(extended_gff_printer.model_fname, sample.prefix, f"{sample.prefix}_{chr_id}") - for chr_id in chr_ids - ], - extended_gff_printer.out_gff, - copy_header=False - ) + merge_files(extended_gff_printer.model_fname, sample.prefix, chr_ids, + extended_gff_printer.out_gff, copy_header=False) logger.info("Extended annotation is saved to " + extended_gff_printer.model_fname) 
transcript_stat_counter.print_start("Transcript model statistics") self.merge_assignments(sample, aggregator, chr_ids) if self.args.sqanti_output: - merge_files( - [ - rreplace(sample.out_t2t_tsv, sample.prefix, f"{sample.prefix}_{chr_id}") - for chr_id in chr_ids - ], - open(sample.out_t2t_tsv, "w"), copy_header=False - ) + merge_files(sample.out_t2t_tsv, sample.prefix, chr_ids, open(sample.out_t2t_tsv, "w"), copy_header=False) aggregator.finalize_aggregators(sample) @@ -668,34 +656,20 @@ def load_read_info(self, dump_filename): def merge_assignments(self, sample, aggregator, chr_ids): if self.args.genedb: - merge_files( - [rreplace(sample.out_assigned_tsv, sample.prefix, sample.prefix + "_" + chr_id) for chr_id in chr_ids], - aggregator.basic_printer.output_file, copy_header=False) - merge_files( - [rreplace(sample.out_corrected_bed, sample.prefix, sample.prefix + "_" + chr_id) for chr_id in chr_ids], - aggregator.corrected_bed_printer.output_file, copy_header=False) - for p in aggregator.global_counter.counters: - merge_files( - [rreplace(p.output_counts_file_name, sample.prefix, sample.prefix + "_" + chr_id) for chr_id in chr_ids], - p.get_output_file_handler(), - stats_file_names=[rreplace(p.output_stats_file_name, sample.prefix, sample.prefix + "_" + chr_id) for - chr_id in chr_ids] - if p.output_stats_file_name else None, - ignore_read_groups=p.ignore_read_groups, - unaligned_reads=self.alignment_stat_counter.stats_dict[AlignmentType.unaligned]) - p.convert_counts_to_tpm() + merge_files(sample.out_assigned_tsv, sample.prefix, chr_ids, + aggregator.basic_printer.output_file, copy_header=False) + merge_files(sample.out_corrected_bed, sample.prefix, chr_ids, + aggregator.corrected_bed_printer.output_file, copy_header=False) + + for counter in aggregator.global_counter.counters: + unaligned = self.alignment_stat_counter.stats_dict[AlignmentType.unaligned] + merge_counts(counter, sample.prefix, chr_ids, unaligned) + counter.convert_counts_to_tpm() def 
merge_transcript_models(self, label, aggregator, chr_ids, gff_printer): - merge_files([rreplace(gff_printer.model_fname, label, label + "_" + chr_id) for chr_id in chr_ids], - gff_printer.out_gff, copy_header=False) - merge_files([rreplace(gff_printer.r2t_fname, label, label + "_" + chr_id) for chr_id in chr_ids], - gff_printer.out_r2t, copy_header=False) - for p in aggregator.transcript_model_global_counter.counters: - merge_files([rreplace(p.output_counts_file_name, label, label + "_" + chr_id) for chr_id in chr_ids], - p.get_output_file_handler(), - stats_file_names=[rreplace(p.output_stats_file_name, label, label + "_" + chr_id) for chr_id in - chr_ids] - if p.output_stats_file_name else None, - ignore_read_groups=p.ignore_read_groups, - unaligned_reads=self.alignment_stat_counter.stats_dict[AlignmentType.unaligned]) - p.convert_counts_to_tpm() + merge_files(gff_printer.model_fname, label, chr_ids, gff_printer.out_gff, copy_header=False) + merge_files(gff_printer.r2t_fname, label, chr_ids, gff_printer.out_r2t, copy_header=False) + for counter in aggregator.transcript_model_global_counter.counters: + unaligned = self.alignment_stat_counter.stats_dict[AlignmentType.unaligned] + merge_counts(counter, label, chr_ids, unaligned) + counter.convert_counts_to_tpm() diff --git a/src/file_utils.py b/src/file_utils.py index 4800f682..c0bf318c 100644 --- a/src/file_utils.py +++ b/src/file_utils.py @@ -4,11 +4,17 @@ import shutil import gzip +from .common import rreplace + logger = logging.getLogger('IsoQuant') -def merge_files(file_names, merged_file_handler, stats_file_names=None, ignore_read_groups=True, copy_header=True, - unaligned_reads=0): +def merge_file_list(fname, label, chr_ids): + return [rreplace(fname, label, f"{label}_{chr_id}") for chr_id in chr_ids] + + +def merge_files(file_name, label, chr_ids, merged_file_handler, copy_header=True): + file_names = merge_file_list(file_name, label, chr_ids) file_names.sort(key=lambda s: [int(t) if t.isdigit() else 
t.lower() for t in re.split('(\d+)', s)]) for i, file_name in enumerate(file_names): if not os.path.exists(file_name): continue @@ -24,22 +30,28 @@ def merge_files(file_names, merged_file_handler, stats_file_names=None, ignore_r for file_name in file_names: os.remove(file_name) - ambiguous_reads, not_assigned_reads, not_aligned_reads = 0, 0, unaligned_reads - if stats_file_names and ignore_read_groups: + +def merge_counts(counter, label, chr_ids, unaligned_reads=0): + file_name = counter.output_counts_file_name + merged_file_handler = counter.get_output_file_handler() + merge_files(file_name, label, chr_ids, merged_file_handler) + + counter.reads_for_tpm = 0 + stat_dict = {"__ambiguous": 0, "__no_feature": 0, "__not_aligned": 0, "__usable": 0} + + if counter.output_stats_file_name and counter.ignore_read_groups: + stats_file_names = merge_file_list(counter.output_stats_file_name, label, chr_ids) for file_name in stats_file_names: - with open(file_name) as f: - line = f.readline() - ambiguous_reads += int(line.split()[1]) - line = f.readline() - not_assigned_reads += int(line.split()[1]) - line = f.readline() - if unaligned_reads == 0: - not_aligned_reads += int(line.split()[1]) + for l in open(file_name): + v = l.strip().split() + stat_dict[v[0]] += int(v[1]) os.remove(file_name) - merged_file_handler.write("__ambiguous\t%d\n" % ambiguous_reads) - merged_file_handler.write("__no_feature\t%d\n" % not_assigned_reads) - merged_file_handler.write("__not_aligned\t%d\n" % not_aligned_reads) + if unaligned_reads > 0: + stat_dict["__not_aligned"] = unaligned_reads + for v in ["__ambiguous", "__no_feature", "__not_aligned"]: + merged_file_handler.write("%s\t%d\n" % (v, stat_dict[v])) + counter.reads_for_tpm = stat_dict[ "__usable"] def normalize_path(config_path, file_path): if os.path.isabs(file_path): diff --git a/src/long_read_counter.py b/src/long_read_counter.py index de2cb31d..823dccd0 100644 --- a/src/long_read_counter.py +++ b/src/long_read_counter.py @@ -202,6 
+202,7 @@ def __init__(self, output_prefix, assignment_extractor, read_groups, read_counte self.read_counter = read_counter self.ambiguous_reads = 0 + self.reads_for_tpm = 0 self.not_assigned_reads = 0 self.not_aligned_reads = 0 # group_id -> (feature_id -> count) @@ -226,14 +227,14 @@ def add_read_info(self, read_assignment=None): assignment_type = self.assignment_extractor.get_assignment_type(read_assignment) group_id = AbstractReadGrouper.default_group_id if self.ignore_read_groups else read_assignment.read_group + self.reads_for_tpm += 1 if assignment_type == ReadAssignmentType.ambiguous: count_value = self.read_counter.process_ambiguous(len(feature_ids)) for feature_id in feature_ids: self.feature_counter[group_id][feature_id] += count_value if count_value > 0: self.all_features.add(feature_id) - if len(feature_ids): - self.ambiguous_reads += 1 + self.ambiguous_reads += 1 elif assignment_type.is_inconsistent(): count_value = self.read_counter.process_inconsistent(assignment_type, len(feature_ids)) @@ -260,6 +261,7 @@ def add_read_info_raw(self, read_id, feature_ids, group_id=AbstractReadGrouper.d self.not_assigned_reads += 1 elif len(feature_ids) > 1: self.ambiguous_reads += 1 + self.reads_for_tpm += 1 for feature_id in feature_ids: count_value = self.read_counter.process_ambiguous(len(feature_ids)) self.feature_counter[group_id][feature_id] += count_value @@ -267,9 +269,11 @@ def add_read_info_raw(self, read_id, feature_ids, group_id=AbstractReadGrouper.d else: self.feature_counter[group_id][feature_ids[0]] += 1 self.all_features.add(feature_ids[0]) + self.reads_for_tpm += 1 def add_unassigned(self, n_reads=1): self.not_assigned_reads += n_reads + self.reads_for_tpm += n_reads def add_unaligned(self, n_reads=1): self.not_aligned_reads += n_reads @@ -321,6 +325,7 @@ def dump(self): f.write("__ambiguous\t%d\n" % self.ambiguous_reads) f.write("__no_feature\t%d\n" % self.not_assigned_reads) f.write("__not_aligned\t%d\n" % self.not_aligned_reads) + 
f.write("__usable\t%d\n" % self.reads_for_tpm) def convert_counts_to_tpm(self): total_counts = defaultdict(float) @@ -337,7 +342,11 @@ def convert_counts_to_tpm(self): scale_factors = {} for group_id in total_counts.keys(): - scale_factors[group_id] = 1000000.0 / total_counts[group_id] if total_counts[group_id] > 0 else 1.0 + if self.ignore_read_groups and self.reads_for_tpm: + total_reads = self.reads_for_tpm + else: + total_reads = total_counts[group_id] if total_counts[group_id] > 0 else 1.0 + scale_factors[group_id] = 1000000.0 / total_reads logger.debug("Scale factor for group %s = %.2f" % (group_id, scale_factors[group_id])) with open(self.output_tpm_file_name, "w") as outf: From 4d8d122730cb1786f9fb4f7b5d3a33b919e746a0 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Fri, 26 Apr 2024 13:13:45 +0300 Subject: [PATCH 17/21] introduce normalization option --- README.md | 16 +++++++++++++--- isoquant.py | 6 +++++- src/dataset_processor.py | 4 ++-- src/long_read_counter.py | 17 ++++++++++++++--- 4 files changed, 34 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 3f7df462..368f39fb 100644 --- a/README.md +++ b/README.md @@ -552,7 +552,7 @@ We recommend _not_ to modify these options unless you are clearly aware of their `--no_gzip` Do not compress large output files. -`--no_gzip` +`--no_gtf_check` Do not perform input GTF checks. `--no_secondary` @@ -562,13 +562,13 @@ We recommend _not_ to modify these options unless you are clearly aware of their Force to use this alignment method, can be `starlong` or `minimap2`; `minimap2` is currently used as default. Make sure the specified aligner is in the `$PATH` variable. `--no_junc_bed` - Do not use annotation for read mapping. + Do not use gene annotation for read mapping. `--junc_bed_file` Annotation in BED12 format produced by `paftools.js gff2bed` (can be found in `minimap2`), will be created automatically if not given. 
`--delta` - Delta for inexact splice junction comparison, chosen automatically based on data type. + Delta for inexact splice junction comparison, chosen automatically based on data type (e.g. 4bp for PacBio, 6bp for ONT). `--genedb_output` If your output folder is located on a shared storage (e.g. NFS share), use this option to set another path @@ -588,6 +588,16 @@ but may improve running time when disk I/O is relatively slow. `--simple_alignments_mapq_cutoff` Filers out alignments with 1 or 2 exons and MAPQ less than this value (works only in annotation-free mode, default is 1). +`--normalization_method` Method for normalizing non-grouped counts into TPMs: +* `simple` - standard method, scale factor equals to 1 million divided by the counts sum (default); +* `usable_reads` - includes all reads assigned to a feature including the ones that were filtered out +during quantification (i.e. inconsistent or ambiguous); +scale factor equals to 1 million divided by the number of all assigned reads. +In this case the sum of all gene/transcript TPMs may not add up to 1 million. +Experiments with simulated data show that this method could give more accurate estimations. +However, the normalization method does not affect correlation/relative proportions.
+ ### Examples diff --git a/isoquant.py b/isoquant.py index 69ec59bf..3c575563 100755 --- a/isoquant.py +++ b/isoquant.py @@ -35,7 +35,7 @@ from src.dataset_processor import DatasetProcessor, PolyAUsageStrategies from src.graph_based_model_construction import StrandnessReportingLevel from src.long_read_assigner import AmbiguityResolvingMethod -from src.long_read_counter import COUNTING_STRATEGIES, CountingStrategy +from src.long_read_counter import COUNTING_STRATEGIES, CountingStrategy, NormalizationMethod from src.input_data_storage import InputDataStorage from src.multimap_resolver import MultimapResolvingStrategy from src.stats import combine_counts @@ -221,6 +221,10 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help add_additional_option("--simple_alignments_mapq_cutoff", help="ignore alignments with 1 or 2 exons and " "MAPQ < this (works only in annotation-free mode, " "default=1)", type=int, default=1) + add_additional_option("--normalization_method", type=str, choices=[e.name for e in NormalizationMethod], + help="TPM normalization method: simple - conventional normalization using all counted reads; " + "usable_reads - includes all assigned reads.", + default=NormalizationMethod.simple.name) add_additional_option_to_group(pipeline_args_group, "--keep_tmp", help="do not remove temporary files " "in the end", action='store_true', diff --git a/src/dataset_processor.py b/src/dataset_processor.py index ec3e6007..5ed99a7d 100644 --- a/src/dataset_processor.py +++ b/src/dataset_processor.py @@ -664,7 +664,7 @@ def merge_assignments(self, sample, aggregator, chr_ids): for counter in aggregator.global_counter.counters: unaligned = self.alignment_stat_counter.stats_dict[AlignmentType.unaligned] merge_counts(counter, sample.prefix, chr_ids, unaligned) - counter.convert_counts_to_tpm() + counter.convert_counts_to_tpm(self.args.normalization_method) def merge_transcript_models(self, label, aggregator, chr_ids, gff_printer):
merge_files(gff_printer.model_fname, label, chr_ids, gff_printer.out_gff, copy_header=False) @@ -672,4 +672,4 @@ def merge_transcript_models(self, label, aggregator, chr_ids, gff_printer): for counter in aggregator.transcript_model_global_counter.counters: unaligned = self.alignment_stat_counter.stats_dict[AlignmentType.unaligned] merge_counts(counter, label, chr_ids, unaligned) - counter.convert_counts_to_tpm() + counter.convert_counts_to_tpm(self.args.normalization_method) diff --git a/src/long_read_counter.py b/src/long_read_counter.py index 823dccd0..0a813c8b 100644 --- a/src/long_read_counter.py +++ b/src/long_read_counter.py @@ -48,6 +48,12 @@ def inconsistent(self): CountingStrategy.all.name] +@unique +class NormalizationMethod(Enum): + simple = 1 + usable_reads = 2 + + class CountingStrategyFlags: def __init__(self, counting_strategy): self.use_ambiguous = counting_strategy.ambiguous() @@ -327,7 +333,8 @@ def dump(self): f.write("__not_aligned\t%d\n" % self.not_aligned_reads) f.write("__usable\t%d\n" % self.reads_for_tpm) - def convert_counts_to_tpm(self): + def convert_counts_to_tpm(self, normalization_str=NormalizationMethod.simple.name): + normalization = NormalizationMethod[normalization_str] total_counts = defaultdict(float) with open(self.output_counts_file_name) as f: for line in f: @@ -341,9 +348,11 @@ def convert_counts_to_tpm(self): total_counts[j] += float(fs[j + 1]) scale_factors = {} + unassigned_tpm = 0.0 for group_id in total_counts.keys(): - if self.ignore_read_groups and self.reads_for_tpm: + if normalization == NormalizationMethod.usable_reads and self.ignore_read_groups and self.reads_for_tpm: total_reads = self.reads_for_tpm + unassigned_tpm = 1000000.0 * (1 - total_counts[group_id] / total_reads) else: total_reads = total_counts[group_id] if total_counts[group_id] > 0 else 1.0 scale_factors[group_id] = 1000000.0 / total_reads @@ -367,6 +376,8 @@ def convert_counts_to_tpm(self): feature_id, counts = fs[0], list(map(float, fs[1:])) 
tpm_values = [scale_factors[i] * counts[i] for i in range(len(scale_factors))] outf.write("%s\t%s\n" % (feature_id, "\t".join(["%.6f" % c for c in tpm_values]))) + if self.ignore_read_groups: + outf.write("%s\t%.6f\n" % ("__unassigned", unassigned_tpm)) def create_gene_counter(output_file_name, strategy, complete_feature_list=None, @@ -424,7 +435,7 @@ def dump(self): if self.print_zeroes or incl_count > 0 or excl_count > 0: f.write("%s\t%s\t%d\t%d\n" % (feature_name, group_id, incl_count, excl_count)) - def convert_counts_to_tpm(self): + def convert_counts_to_tpm(self, normalization=NormalizationMethod.simple): return @staticmethod From 0646c31484be7e3dcd0b2243ee15afe9ef56e755 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Fri, 26 Apr 2024 14:41:27 +0300 Subject: [PATCH 18/21] fix console tests --- tests/console_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/console_test.py b/tests/console_test.py index c4f34420..d1396014 100644 --- a/tests/console_test.py +++ b/tests/console_test.py @@ -18,7 +18,7 @@ def test__help(option): print(result.returncode) assert result.returncode == 0 assert b"usage" in result.stdout - assert b"optional arguments:" in result.stdout + assert b"options:" in result.stdout def test_clean_start(): From 7b80226c6fdf3b38679fa5f4fc250ab11597efc8 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Fri, 26 Apr 2024 15:01:59 +0300 Subject: [PATCH 19/21] fix inconsistency tests --- tests/test_long_read_assigner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_long_read_assigner.py b/tests/test_long_read_assigner.py index 6f3d5ec8..12201fcc 100644 --- a/tests/test_long_read_assigner.py +++ b/tests/test_long_read_assigner.py @@ -542,7 +542,7 @@ def test_assign_inconsistent(self, sorted_blocks, polya_pos, polyt_pos, isoform_ PolyAInfo(polya_pos, polyt_pos, -1, -1)) read_assignment = self.assigner.assign_to_isoform("read_id", combined_profile) - assert 
read_assignment.assignment_type == ReadAssignmentType.inconsistent + assert read_assignment.assignment_type.is_inconsistent() if isoform_id: assert isoform_id in {im.assigned_transcript for im in read_assignment.isoform_matches} if expected_event: From 07006909aadf78f7437c862de32f9fe00758853d Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Fri, 26 Apr 2024 15:33:01 +0300 Subject: [PATCH 20/21] various options tests --- .github/workflows/SIRVs.Set4.options.yml | 63 ++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 .github/workflows/SIRVs.Set4.options.yml diff --git a/.github/workflows/SIRVs.Set4.options.yml b/.github/workflows/SIRVs.Set4.options.yml new file mode 100644 index 00000000..fed58328 --- /dev/null +++ b/.github/workflows/SIRVs.Set4.options.yml @@ -0,0 +1,63 @@ +name: SIRVs real ONT various options + +on: + workflow_dispatch: + schedule: + - cron: '0 21 * * 6' + +env: + RUN_NAME: SIRVs.Set4.R10.OPTS + LAUNCHER: ${{github.workspace}}/tests/github/run_pipeline.py + CFG_DIR: /abga/work/andreyp/ci_isoquant/data + BIN_PATH: /abga/work/andreyp/ci_isoquant/bin/ + OUTPUT_BASE: /abga/work/andreyp/ci_isoquant/output/${{github.ref_name}}/ + +concurrency: + group: ${{github.workflow}} + cancel-in-progress: false + +jobs: + launch-runner: + runs-on: + labels: [self-hosted] + name: 'Running IsoQuant and QC' + + steps: + - name: 'Cleanup' + run: > + set -e && + shopt -s dotglob && + rm -rf * + + - name: 'Checkout' + uses: actions/checkout@v3 + with: + fetch-depth: 1 + + - name: 'IsoQuant SIRVs R10.4 pipeline options' + if: always() + shell: bash + env: + STEP_NAME: SIRVs.Set4.R10.opts1 + run: | + export PATH=$PATH:${{env.BIN_PATH}} + python3 ${{env.LAUNCHER}} ${{env.CFG_DIR}}/${{env.STEP_NAME}}.cfg -o ${{env.OUTPUT_BASE}} + + - name: 'IsoQuant SIRVs R10.4 read filtering' + if: always() + shell: bash + env: + STEP_NAME: SIRVs.Set4.R10.opts2 + run: | + export PATH=$PATH:${{env.BIN_PATH}} + python3 ${{env.LAUNCHER}} 
${{env.CFG_DIR}}/${{env.STEP_NAME}}.cfg -o ${{env.OUTPUT_BASE}} + + - name: 'IsoQuant SIRVs R10.4 strategies' + if: always() + shell: bash + env: + STEP_NAME: SIRVs.Set4.R10.opts3 + run: | + export PATH=$PATH:${{env.BIN_PATH}} + python3 ${{env.LAUNCHER}} ${{env.CFG_DIR}}/${{env.STEP_NAME}}.cfg -o ${{env.OUTPUT_BASE}} + From 5154915b551dae0f252cfdc56d07b4fd4bd91190 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Mon, 6 May 2024 14:15:34 +0300 Subject: [PATCH 21/21] fix counting strategy flags --- src/long_read_counter.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/long_read_counter.py b/src/long_read_counter.py index 0a813c8b..ba4f4a18 100644 --- a/src/long_read_counter.py +++ b/src/long_read_counter.py @@ -34,12 +34,12 @@ def ambiguous(self): return self in [CountingStrategy.all, CountingStrategy.with_ambiguous] def inconsistent_minor(self): - return self == CountingStrategy.unique_splicing_consistent - - def inconsistent(self): return self in [CountingStrategy.unique_splicing_consistent, CountingStrategy.unique_inconsistent, CountingStrategy.all] + def inconsistent(self): + return self in [CountingStrategy.unique_inconsistent, CountingStrategy.all] + COUNTING_STRATEGIES = [CountingStrategy.unique_only.name, CountingStrategy.with_ambiguous.name, @@ -108,7 +108,7 @@ def __init__(self, strategy_str): def process_inconsistent(self, assignment_type, feature_count): # use only for inconsistent assignments if assignment_type == ReadAssignmentType.inconsistent_ambiguous or feature_count > 1: - if self.strategy_flags.use_ambiguous: + if self.strategy_flags.use_ambiguous and self.strategy_flags.use_inconsistent: return 1.0 / feature_count else: return 0.0