From e4736fbd4761a65dae612b262914f4d3a0e447ac Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Fri, 12 Apr 2024 17:34:14 +0300 Subject: [PATCH 01/21] add gene assignment type --- src/isoform_assignment.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/isoform_assignment.py b/src/isoform_assignment.py index 5e4c4883..b2356e77 100644 --- a/src/isoform_assignment.py +++ b/src/isoform_assignment.py @@ -496,6 +496,11 @@ def __init__(self, read_id, assignment_type, match=None): self.isoform_matches = match else: self.isoform_matches = [match] + if self.assignment_type != ReadAssignmentType.ambiguous: + self.gene_assignment_type = self.assignment_type + else: + assigned_genes = set([m.assigned_gene for m in self.isoform_matches]) + self.gene_assignment_type = ReadAssignmentType.ambiguous if len(assigned_genes) > 1 else ReadAssignmentType.unique self.additional_info = {} self.additional_attributes = {} self.introns_match = False @@ -522,6 +527,7 @@ def deserialize(cls, infile, gene_info): read_assignment.chr_id = read_string(infile) read_assignment.mapping_quality = read_short_int(infile) read_assignment.assignment_type = ReadAssignmentType(read_short_int(infile)) + read_assignment.gene_assignment_type = ReadAssignmentType(read_short_int(infile)) read_assignment.isoform_matches = read_list(infile, IsoformMatch.deserialize) read_assignment.additional_info = read_dict(infile) read_assignment.additional_attributes = read_dict(infile) @@ -546,6 +552,7 @@ def serialize(self, outfile): write_string(self.chr_id, outfile) write_short_int(self.mapping_quality, outfile) write_short_int(self.assignment_type.value, outfile) + write_short_int(self.gene_assignment_type.value, outfile) write_list(self.isoform_matches, outfile, IsoformMatch.serialize) write_dict(self.additional_info, outfile) write_dict(self.additional_attributes, outfile) From 92994253f03d8b803d0fa0f6e9dc686197beb7b4 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Tue, 16 Apr 2024 02:10:09 
Subject: [PATCH 02/21] introduce various ambiguity assignment types
suspended = 255 + @staticmethod + def is_inconsistent(assignment_type): + return assignment_type in [ReadAssignmentType.inconsistent, + ReadAssignmentType.inconsistent_amb, + ReadAssignmentType.inconsistent_non_intronic] + + +ReadAssignmentTypeNaming = {ReadAssignmentType.unique: ReadAssignmentType.unique.name, + ReadAssignmentType.noninformative: ReadAssignmentType.noninformative.name, + ReadAssignmentType.intergenic: ReadAssignmentType.intergenic.name, + ReadAssignmentType.ambiguous: ReadAssignmentType.ambiguous.name, + ReadAssignmentType.unique_minor_difference: ReadAssignmentType.unique_minor_difference.name, + ReadAssignmentType.inconsistent: ReadAssignmentType.inconsistent.name, + ReadAssignmentType.inconsistent_non_intronic: ReadAssignmentType.inconsistent.name, + ReadAssignmentType.inconsistent_amb: ReadAssignmentType.inconsistent.name, + ReadAssignmentType.suspended: ReadAssignmentType.suspended.name} # SQANTI-like @unique @@ -206,7 +224,11 @@ def is_minor_elongation(match_event_subtype): @staticmethod def is_major_inconsistency(match_event_subtype): - return match_event_subtype in nnic_event_types or match_event_subtype in nic_event_types + return match_event_subtype in all_major_events + + @staticmethod + def is_intronic_inconsistency(match_event_subtype): + return match_event_subtype in intronic_major_events event_subtype_cost = { @@ -332,9 +354,12 @@ def elongation_cost(params, elongation_len): MatchEventSubtype.internal_polya_right, MatchEventSubtype.internal_polya_left, MatchEventSubtype.major_exon_elongation_left, MatchEventSubtype.major_exon_elongation_right, MatchEventSubtype.exon_elongation_left, MatchEventSubtype.exon_elongation_right, - MatchEventSubtype.antisense } +all_major_events = nic_event_types.union(nnic_event_types) + +intronic_major_events = all_major_events.difference(nonintronic_events) + # (side, is_known) -> alternation type alternative_sites = {("left", True): MatchEventSubtype.alt_left_site_known, @@ -496,11 +521,17 @@ def 
__init__(self, read_id, assignment_type, match=None): self.isoform_matches = match else: self.isoform_matches = [match] - if self.assignment_type != ReadAssignmentType.ambiguous: - self.gene_assignment_type = self.assignment_type - else: + if self.assignment_type == ReadAssignmentType.ambiguous: + assigned_genes = set([m.assigned_gene for m in self.isoform_matches]) + self.gene_assignment_type = ReadAssignmentType.ambiguous if len( + assigned_genes) > 1 else ReadAssignmentType.unique + elif self.assignment_type == ReadAssignmentType.inconsistent_amb: assigned_genes = set([m.assigned_gene for m in self.isoform_matches]) - self.gene_assignment_type = ReadAssignmentType.ambiguous if len(assigned_genes) > 1 else ReadAssignmentType.unique + self.gene_assignment_type = ReadAssignmentType.inconsistent_amb if len( + assigned_genes) > 1 else ReadAssignmentType.inconsistent + else: + self.gene_assignment_type = self.assignment_type + self.additional_info = {} self.additional_attributes = {} self.introns_match = False diff --git a/src/long_read_assigner.py b/src/long_read_assigner.py index ab294336..174d3e6a 100644 --- a/src/long_read_assigner.py +++ b/src/long_read_assigner.py @@ -322,7 +322,8 @@ def check_read_ends(self, read_split_exon_profile, assignment): if any(MatchEventSubtype.is_major_elongation(e.event_type) for e in exon_elongation_types): # serious exon elongation - assignment.set_assignment_type(ReadAssignmentType.inconsistent) + if not ReadAssignmentType.is_inconsistent(assignment.assignment_type): + assignment.set_assignment_type(ReadAssignmentType.inconsistent_non_intronic) elif any(MatchEventSubtype.is_minor_elongation(e.event_type) for e in exon_elongation_types): # minor exon elongation if assignment.assignment_type == ReadAssignmentType.unique: @@ -513,7 +514,7 @@ def match_consistent(self, read_id, combined_read_profile): self.verify_read_ends_for_assignment(combined_read_profile, assignment) # if apa detected isoform should be treated as inconsistent and 
rechecked - if assignment.assignment_type == ReadAssignmentType.inconsistent: + if ReadAssignmentType.is_inconsistent(assignment.assignment_type): return None return assignment @@ -602,7 +603,7 @@ def match_inconsistent(self, read_id, combined_read_profile): assignment_type = self.classify_assignment(best_isoforms, read_matches) if not combined_read_profile.read_intron_profile.read_profile: isoform_matches = self.create_monoexon_matches(read_matches, best_isoforms) - elif assignment_type == ReadAssignmentType.inconsistent: + elif ReadAssignmentType.is_inconsistent(assignment_type): isoform_matches = self.create_inconsistent_matches(read_matches, best_isoforms, best_score) else: isoform_matches = self.create_consistent_matches(read_matches, best_isoforms, combined_read_profile) @@ -621,7 +622,11 @@ def classify_assignment(self, best_isoforms, read_matches): assignment_type = ReadAssignmentType.ambiguous if is_abmiguous else ReadAssignmentType.unique elif any(MatchEventSubtype.is_major_inconsistency(e) for e in all_event_types): #logger.debug("* * Assignment is inconsistent") - assignment_type = ReadAssignmentType.inconsistent + if any(MatchEventSubtype.is_intronic_inconsistency(e) for e in all_event_types): + inconsistency_type = ReadAssignmentType.inconsistent + else: + inconsistency_type = ReadAssignmentType.inconsistent_non_intronic + assignment_type = ReadAssignmentType.inconsistent_amb if is_abmiguous else inconsistency_type elif any(MatchEventSubtype.is_minor_error(e) for e in all_event_types): #logger.debug("* * Assignment has minor errors") assignment_type = ReadAssignmentType.ambiguous if is_abmiguous else ReadAssignmentType.unique_minor_difference From 9271b0abc45d8557132b4698ee918e55a84fbd6b Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Thu, 18 Apr 2024 01:56:15 +0300 Subject: [PATCH 03/21] conventional naming --- src/assignment_io.py | 2 +- src/isoform_assignment.py | 10 ---------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git 
a/src/assignment_io.py b/src/assignment_io.py index 2d5cbc91..4c430db1 100644 --- a/src/assignment_io.py +++ b/src/assignment_io.py @@ -243,7 +243,7 @@ def add_read_info(self, read_assignment): strand = read_assignment.strand line = (read_assignment.read_id + "\t" + read_assignment.chr_id + "\t" + strand + "\t" + m.assigned_transcript + "\t" + m.assigned_gene + "\t" + - ReadAssignmentTypeNaming[read_assignment.assignment_type] + "\t" + event_string + "\t" + + read_assignment.assignment_type.name + "\t" + event_string + "\t" + range_list_to_str(read_exons)) additional_info = [] diff --git a/src/isoform_assignment.py b/src/isoform_assignment.py index 04150433..441e8b8d 100644 --- a/src/isoform_assignment.py +++ b/src/isoform_assignment.py @@ -32,16 +32,6 @@ def is_inconsistent(assignment_type): ReadAssignmentType.inconsistent_non_intronic] -ReadAssignmentTypeNaming = {ReadAssignmentType.unique: ReadAssignmentType.unique.name, - ReadAssignmentType.noninformative: ReadAssignmentType.noninformative.name, - ReadAssignmentType.intergenic: ReadAssignmentType.intergenic.name, - ReadAssignmentType.ambiguous: ReadAssignmentType.ambiguous.name, - ReadAssignmentType.unique_minor_difference: ReadAssignmentType.unique_minor_difference.name, - ReadAssignmentType.inconsistent: ReadAssignmentType.inconsistent.name, - ReadAssignmentType.inconsistent_non_intronic: ReadAssignmentType.inconsistent.name, - ReadAssignmentType.inconsistent_amb: ReadAssignmentType.inconsistent.name, - ReadAssignmentType.suspended: ReadAssignmentType.suspended.name} - # SQANTI-like @unique class MatchClassification(Enum): From 30f3388a397d9371c5df84b2ae888a798ebd3d7b Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Thu, 18 Apr 2024 15:22:45 +0300 Subject: [PATCH 04/21] refactor multimap resolver, print gene type assignment --- src/assignment_io.py | 7 ++- src/dataset_processor.py | 9 ++-- src/isoform_assignment.py | 41 ++++++++++++---- src/long_read_assigner.py | 30 ++++++------ 
src/multimap_resolver.py | 99 ++++++++++++++++++++++++--------------- 5 files changed, 118 insertions(+), 68 deletions(-) diff --git a/src/assignment_io.py b/src/assignment_io.py index 4c430db1..682a808c 100644 --- a/src/assignment_io.py +++ b/src/assignment_io.py @@ -21,8 +21,10 @@ read_short_int, SHORT_TERMINATION_INT ) -from .isoform_assignment import (match_subtype_to_str_with_additional_info, ReadAssignment, MatchClassification, - ReadAssignmentType, ReadAssignmentTypeNaming) +from .isoform_assignment import (match_subtype_to_str_with_additional_info, + ReadAssignment, + MatchClassification, + ReadAssignmentType) from .gene_info import GeneInfo @@ -247,6 +249,7 @@ def add_read_info(self, read_assignment): range_list_to_str(read_exons)) additional_info = [] + additional_info.append("gene_assignment=" + str(read_assignment.gene_assignment_type.name) + ";") additional_info.append("PolyA=" + str(read_assignment.polyA_found) + ";") if self.params.cage is not None: additional_info.append("CAGE=" + str(read_assignment.cage_found) + ";") diff --git a/src/dataset_processor.py b/src/dataset_processor.py index ea48a87e..884930cf 100644 --- a/src/dataset_processor.py +++ b/src/dataset_processor.py @@ -180,6 +180,7 @@ def get_next(self): continue else: read_assignment.assignment_type = resolved_assignment.assignment_type + read_assignment.gene_assignment_type = resolved_assignment.gene_assignment_type read_assignment.multimapper = resolved_assignment.multimapper assignment_storage.append(read_assignment) @@ -506,14 +507,14 @@ def collect_reads(self, sample): results = proc.map(*read_gen, chunksize=1) for storage, read_groups, alignment_stats in results: - for read_assignment in storage: - multimapped_reads[read_assignment.read_id].append(read_assignment) + for basic_read_assignment in storage: + multimapped_reads[basic_read_assignment.read_id].append(basic_read_assignment) all_read_groups.update(read_groups) self.alignment_stat_counter.merge(alignment_stats) else: for 
storage, read_groups, alignment_stats in map(*read_gen): - for read_assignment in storage: - multimapped_reads[read_assignment.read_id].append(read_assignment) + for basic_read_assignment in storage: + multimapped_reads[basic_read_assignment.read_id].append(basic_read_assignment) all_read_groups.update(read_groups) self.alignment_stat_counter.merge(alignment_stats) diff --git a/src/isoform_assignment.py b/src/isoform_assignment.py index 441e8b8d..0de604c3 100644 --- a/src/isoform_assignment.py +++ b/src/isoform_assignment.py @@ -31,6 +31,11 @@ def is_inconsistent(assignment_type): ReadAssignmentType.inconsistent_amb, ReadAssignmentType.inconsistent_non_intronic] + @staticmethod + def is_consistent(assignment_type): + return assignment_type in [ReadAssignmentType.unique, + ReadAssignmentType.unique_minor_difference, + ReadAssignmentType.ambiguous] # SQANTI-like @unique @@ -403,12 +408,12 @@ def __repr__(self): class IsoformMatch: def __init__(self, match_classification, assigned_gene=None, assigned_transcript=None, - match_subclassification = None, transcript_strand='.', score=0): + match_subclassification = None, transcript_strand='.', penalty_score=0): self.assigned_gene = assigned_gene self.assigned_transcript = assigned_transcript self.transcript_strand = transcript_strand self.match_classification = match_classification - self.score = score + self.penalty_score = penalty_score if match_subclassification is None: self.match_subclassifications = [] elif isinstance(match_subclassification, list): @@ -424,7 +429,7 @@ def deserialize(cls, infile): match.assigned_transcript = read_string_or_none(infile) match.transcript_strand = read_string(infile) match.match_classification = MatchClassification(read_short_int(infile)) - match.score = float(read_int(infile)) / float(SHORT_FLOAT_MULTIPLIER) + match.penalty_score = float(read_int(infile)) / float(SHORT_FLOAT_MULTIPLIER) match.match_subclassifications = read_list(infile, MatchEvent.deserialize) return match @@ -433,7 
+438,7 @@ def serialize(self, outfile): write_string_or_none(self.assigned_transcript, outfile) write_string(self.transcript_strand, outfile) write_short_int(self.match_classification.value, outfile) - write_int(int(self.score * SHORT_FLOAT_MULTIPLIER), outfile) + write_int(int(self.penalty_score * SHORT_FLOAT_MULTIPLIER), outfile) write_list(self.match_subclassifications, outfile, MatchEvent.serialize) def add_subclassification(self, match_subclassification): @@ -459,10 +464,22 @@ def __init__(self, read_assignment): self.multimapper = read_assignment.multimapper self.polyA_found = read_assignment.polyA_found self.assignment_type = read_assignment.assignment_type + self.gene_assignment_type = read_assignment.gene_assignment_type + self.penalty_score = 0.0 + self.isoforms = [] + self.genes = [] + if read_assignment.isoform_matches: - self.score = read_assignment.isoform_matches[0].score - else: - self.score = 0.0 + gene_set = set() + isoform_set = set() + for m in read_assignment.isoform_matches: + self.penalty_score = min(self.penalty_score, read_assignment.isoform_matches[0].penalty_score) + if m.assigned_gene: + gene_set.add(m.assigned_gene) + if m.assigned_transcript: + isoform_set.add(m.assigned_transcript) + self.genes = list(gene_set) + self.isoforms = list(isoform_set) @classmethod def deserialize(cls, infile): @@ -474,7 +491,10 @@ def deserialize(cls, infile): read_assignment.multimapper = bool_arr[0] read_assignment.polyA_found = bool_arr[1] read_assignment.assignment_type = ReadAssignmentType(read_short_int(infile)) - read_assignment.score = float(read_int(infile)) / float(SHORT_FLOAT_MULTIPLIER) + read_assignment.gene_assignment_type = ReadAssignmentType(read_short_int(infile)) + read_assignment.penalty_score = float(read_int(infile)) / float(SHORT_FLOAT_MULTIPLIER) + read_assignment.genes = read_list(infile, read_string) + read_assignment.isoforms = read_list(infile, read_string) return read_assignment def serialize(self, outfile): @@ -483,7 +503,10 @@ 
def serialize(self, outfile): write_string(self.chr_id, outfile) write_bool_array([self.multimapper, self.polyA_found], outfile) write_short_int(self.assignment_type.value, outfile) - write_int(int(self.score * SHORT_FLOAT_MULTIPLIER), outfile) + write_short_int(self.gene_assignment_type.value, outfile) + write_int(int(self.penalty_score * SHORT_FLOAT_MULTIPLIER), outfile) + write_list(self.genes, outfile, write_string) + write_list(self.isoforms, outfile, write_string) class ReadAssignment: diff --git a/src/long_read_assigner.py b/src/long_read_assigner.py index 174d3e6a..d50aeea7 100644 --- a/src/long_read_assigner.py +++ b/src/long_read_assigner.py @@ -594,7 +594,7 @@ def match_inconsistent(self, read_id, combined_read_profile): # logger.debug("* Inconsistencies detected: " + str(read_matches)) # select ones with least number of inconsistent events - best_isoforms, best_score = self.select_best_among_inconsistent(combined_read_profile, read_matches) + best_isoforms, penalty_score = self.select_best_among_inconsistent(combined_read_profile, read_matches) if not best_isoforms: return ReadAssignment(read_id, ReadAssignmentType.noninformative) # logger.debug("* Selected isoforms: " + str(best_isoforms)) @@ -604,7 +604,7 @@ def match_inconsistent(self, read_id, combined_read_profile): if not combined_read_profile.read_intron_profile.read_profile: isoform_matches = self.create_monoexon_matches(read_matches, best_isoforms) elif ReadAssignmentType.is_inconsistent(assignment_type): - isoform_matches = self.create_inconsistent_matches(read_matches, best_isoforms, best_score) + isoform_matches = self.create_inconsistent_matches(read_matches, best_isoforms, penalty_score) else: isoform_matches = self.create_consistent_matches(read_matches, best_isoforms, combined_read_profile) return ReadAssignment(read_id, assignment_type, isoform_matches) @@ -656,14 +656,14 @@ def create_consistent_matches(self, read_matches, selected_isoforms, combined_re matches.append(isoform_match) 
return matches - def create_inconsistent_matches(self, read_matches, selected_isoforms, score=0.0): + def create_inconsistent_matches(self, read_matches, selected_isoforms, penalty_score=0.0): matches = [] for isoform_id in selected_isoforms: read_match = read_matches[isoform_id] match_classification = MatchClassification.get_inconsistency_classification(read_match) isoform_match = IsoformMatch(match_classification, self.get_gene_id(isoform_id), isoform_id, read_match, self.gene_info.isoform_strands[isoform_id], - score=score) + penalty_score=penalty_score) matches.append(isoform_match) return matches @@ -720,7 +720,7 @@ def select_best_among_inconsistent(self, combined_read_profile, read_matches): isoform_scores = [] for isoform_id, match_events in read_matches.items(): # logger.debug("* * Scoring inconsistencies for " + isoform_id + ": " + str(match_events)) - score = 0.0 + penalty_score = 0.0 for e in match_events: event_count = 1 if e.isoform_region != SupplementaryMatchConstants.undefined_region and \ @@ -751,25 +751,25 @@ def select_best_among_inconsistent(self, combined_read_profile, read_matches): MatchEventSubtype.exon_elongation_left}: event_cost = elongation_cost(self.params, e.event_info) - score += event_cost * event_count + penalty_score += event_cost * event_count # logger.debug("* * * Event " + str(e.event_type) + ", introns affected " + str(event_count) + # ", event cost " + str(event_cost) + - # ". Updated score: " + str(score)) - # logger.debug("* * Final score for isoform " + isoform_id + ": " + str(score)) - isoform_scores.append((isoform_id, score)) + # ". 
Updated penalty_score: " + str(penalty_score)) + # logger.debug("* * Final penalty_score for isoform " + isoform_id + ": " + str(penalty_score)) + isoform_scores.append((isoform_id, penalty_score)) - best_score = min(isoform_scores, key=lambda x:x[1])[1] - # logger.debug("* * Best score " + str(best_score)) - best_isoforms = [x[0] for x in filter(lambda x:x[1] == best_score, isoform_scores)] + min_penalty_score = min(isoform_scores, key=lambda x:x[1])[1] + # logger.debug("* * Best penalty_score " + str(min_penalty_score)) + best_isoforms = [x[0] for x in filter(lambda x:x[1] == min_penalty_score, isoform_scores)] # logger.debug("* * Best isoforms " + str(best_isoforms)) - # if several isoforms are tied select the best according to nucl score + # if several isoforms are tied select the best according to nucl penalty_score if len(best_isoforms) > 1: best_isoforms = self.resolve_by_nucleotide_score(combined_read_profile, best_isoforms, similarity_function=self.coverage_based_nucleotide_score) - # logger.debug("* * Resolved by nucl score " + str(best_isoforms)) + # logger.debug("* * Resolved by nucl penalty_score " + str(best_isoforms)) - return best_isoforms, best_score + return best_isoforms, min_penalty_score # ==== POLYA STUFF ==== def verify_read_ends_for_assignment(self, combined_read_profile, assignment): diff --git a/src/multimap_resolver.py b/src/multimap_resolver.py index c36d03d5..26fc1abc 100644 --- a/src/multimap_resolver.py +++ b/src/multimap_resolver.py @@ -34,16 +34,11 @@ def resolve(self, assignment_list): return assignment_list elif self.strategy == MultimapResolvingStrategy.merge: - informative_assignments = set() - for i in range(len(assignment_list)): - if assignment_list[i].assignment_type != ReadAssignmentType.noninformative: - informative_assignments.add(i) - if len(informative_assignments) == 1: - return self.suspend_assignments(assignment_list, informative_assignments) - return self.suspend_assignments(assignment_list, informative_assignments, 
ReadAssignmentType.ambiguous) + return self.merge_assignments(assignment_list) elif self.strategy == MultimapResolvingStrategy.take_best: return self.select_best_assignment(assignment_list) + else: raise ValueError("Unsupported multimap strategy") @@ -53,53 +48,81 @@ def select_best_assignment(self, assignment_list): consistent_assignments = set() inconsistent_assignments = set() primary_inconsistent = set() + for i, a in enumerate(assignment_list): - if a.assignment_type in [ReadAssignmentType.unique, ReadAssignmentType.unique_minor_difference] and \ - not a.multimapper: - primary_unique.add(i) - elif a.assignment_type == ReadAssignmentType.inconsistent and not a.multimapper: - primary_inconsistent.add(i) - if a.assignment_type in [ReadAssignmentType.unique, - ReadAssignmentType.unique_minor_difference, - ReadAssignmentType.ambiguous]: - consistent_assignments.add(i) - elif a.assignment_type in [ReadAssignmentType.inconsistent]: + if ReadAssignmentType.is_inconsistent(a.assignment_type): + if not a.multimapper: + primary_inconsistent.add(i) inconsistent_assignments.add(i) + elif ReadAssignmentType.is_consistent(a.assignment_type): + consistent_assignments.add(i) + if not a.multimapper and not a.assignment_type == ReadAssignmentType.ambiguous: + primary_unique.add(i) if primary_unique: - if len(primary_unique) > 1: - return self.suspend_assignments(assignment_list, primary_unique, ReadAssignmentType.ambiguous) - # primary unique is found, rest is noninformative - return self.suspend_assignments(assignment_list, primary_unique) + return self.filter_assignments(assignment_list, primary_unique) if consistent_assignments: - return self.suspend_assignments(assignment_list, consistent_assignments, ReadAssignmentType.ambiguous) + return self.filter_assignments(assignment_list, consistent_assignments) if primary_inconsistent: return self.select_best_inconsistent(assignment_list, primary_inconsistent) if inconsistent_assignments: return 
self.select_best_inconsistent(assignment_list, inconsistent_assignments) + return assignment_list - def select_best_inconsistent(self, assignment_list, inconsistent_assignments): - if len(inconsistent_assignments) > 1: - assignment_scores = [] - for i in inconsistent_assignments: - assignment_scores.append((assignment_list[i].score, i)) - best_score = min(assignment_scores, key=lambda x: x[0])[0] - best_isoforms = [x[1] for x in filter(lambda x: x[0] == best_score, assignment_scores)] - return self.suspend_assignments(assignment_list, best_isoforms, - ReadAssignmentType.inconsistent if len(best_isoforms) > 1 else None) + @staticmethod + def select_best_inconsistent(assignment_list, inconsistent_assignments): + if len(inconsistent_assignments) <= 1: + return MultimapResolver.filter_assignments(assignment_list, inconsistent_assignments) + + assignment_scores = [] + for i in inconsistent_assignments: + assignment_scores.append((assignment_list[i].penalty_score, i)) + best_score = min(assignment_scores, key=lambda x: x[0])[0] + best_assignments = [x[1] for x in filter(lambda x: x[0] == best_score, assignment_scores)] + return MultimapResolver.filter_assignments(assignment_list, best_assignments) + + @staticmethod + def merge_assignments(assignment_list): + informative_assignments = set() + for i in range(len(assignment_list)): + if assignment_list[i].assignment_type != ReadAssignmentType.noninformative: + informative_assignments.add(i) + + return MultimapResolver.filter_assignments(assignment_list, informative_assignments) - return self.suspend_assignments(assignment_list, inconsistent_assignments) + # select all given assignments from the list, makr rest as suspended + # if selected assignments feature more than 1 gene/isoform - mark as ambiguous accordingly + @staticmethod + def filter_assignments(assignment_list, assignments_to_keep, mark_as_ambiguous=True): + all_genes = set() + all_isoforms = set() + for i in assignments_to_keep: + assignment = assignment_list[i] 
+ all_genes.update(assignment.genes) + all_isoforms.update(assignment.isoforms) + + change_transcript_assignment_type = mark_as_ambiguous and len(all_isoforms) > 1 + change_gene_assignment_type = mark_as_ambiguous and len(all_genes) > 1 - def suspend_assignments(self, assignment_list, assignments_to_keep, new_type=None): for i in range(len(assignment_list)): + assignment = assignment_list[i] if i in assignments_to_keep: - if new_type is not None: - assignment_list[i].assignment_type = new_type - assignment_list[i].multimapper = True - continue - assignment_list[i].assignment_type = ReadAssignmentType.suspended + if ReadAssignmentType.is_inconsistent(assignment.assignment_type): + ambiguity_assignment_type = ReadAssignmentType.inconsistent_amb + else: + ambiguity_assignment_type = ReadAssignmentType.ambiguous + + if change_transcript_assignment_type: + assignment.assignment_type = ambiguity_assignment_type + if change_gene_assignment_type: + assignment.gene_assignment_type = ambiguity_assignment_type + assignment.multimapper = True + else: + assignment.assignment_type = ReadAssignmentType.suspended + assignment.gene_assignment_type = ReadAssignmentType.suspended + return assignment_list From 025a54766a850c0d07f47c0b8a5d93d92fa8906b Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Thu, 18 Apr 2024 15:57:25 +0300 Subject: [PATCH 05/21] fix GTF checker and loader --- isoquant.py | 2 +- src/gtf2db.py | 27 +++++++++++++++++---------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/isoquant.py b/isoquant.py index 07d03145..363ef061 100755 --- a/isoquant.py +++ b/isoquant.py @@ -22,7 +22,7 @@ import gffutils import pyfaidx -from src.gtf2db import convert_gtf_to_db, check_gtf_duplicates +from src.gtf2db import convert_gtf_to_db from src.read_mapper import ( DATA_TYPE_ALIASES, SUPPORTED_STRANDEDNESS, diff --git a/src/gtf2db.py b/src/gtf2db.py index 5dd43976..473e8773 100755 --- a/src/gtf2db.py +++ b/src/gtf2db.py @@ -117,7 +117,7 @@ def gtf2db(gtf, db, 
complete_db=False): elif has_meta_features == 0: logger.warning("You set --complete_genedb option but it looks like the provided annotation " "does not contain all necessary gene or transcript records.") - logger.warning("The results of this run might not be meaningful.") + logger.warning("The results of this run might not be correct.") logger.warning("Remove the output folder and restart IsoQuant without --complete_genedb.") logger.info("Converting gene annotation file to .db format (takes a while)...") @@ -188,7 +188,10 @@ def check_gtf_duplicates(gtf): gtf_correct = False continue - gene_id = attrs[gene_id_pos + 1][1:-2] + gene_str = attrs[gene_id_pos + 1] + start_pos = gene_str.find('"') + end_pos = gene_str.rfind('"') + gene_id = gene_str[start_pos+1:end_pos] if feature_type == "gene": if gene_id in gene_ids: logger.warning("Duplicated gene id %s on line %d" % (gene_id, line_count)) @@ -210,7 +213,10 @@ def check_gtf_duplicates(gtf): gtf_correct = False continue - transcript_id = attrs[transcript_id_pos + 1][1:-2] + transcript_str = attrs[transcript_id_pos + 1] + start_pos = transcript_str.find('"') + end_pos = transcript_str.rfind('"') + transcript_id = transcript_str[start_pos+1:end_pos] if feature_type in ["transcript", "mRNA"]: if transcript_id in transcript_ids: logger.warning("Duplicated transcript id %s on line %d" % (transcript_id, line_count)) @@ -251,7 +257,7 @@ def check_gtf_duplicates(gtf): return gtf_correct, corrected_gtf, gtf_name + ".corrected" + inner_ext.lower(), complete_genedb -def find_coverted_db(converted_gtfs, gtf_filename): +def find_converted_db(converted_gtfs, gtf_filename): gtf_mtime = converted_gtfs.get(gtf_filename, {}).get('gtf_mtime') db_mtime = converted_gtfs.get(gtf_filename, {}).get('db_mtime') db_file = converted_gtfs.get(gtf_filename, {}).get('genedb') @@ -261,12 +267,13 @@ def find_coverted_db(converted_gtfs, gtf_filename): return None -def compare_stored_gtf(converted_gtfs, gtf_filename, genedb_filename): +def 
compare_stored_gtf(converted_gtfs, gtf_filename, genedb_filename, complete_genedb): gtf_mtime = converted_gtfs.get(gtf_filename, {}).get('gtf_mtime') db_mtime = converted_gtfs.get(gtf_filename, {}).get('db_mtime') - if os.path.exists(gtf_filename) and os.path.getmtime(gtf_filename) == gtf_mtime: - if os.path.exists(genedb_filename) and os.path.getmtime(genedb_filename) == db_mtime: - return True + is_complete = converted_gtfs.get(gtf_filename, {}).get('complete_db') + return (os.path.exists(gtf_filename) and os.path.getmtime(gtf_filename) == gtf_mtime and + os.path.exists(genedb_filename) and os.path.getmtime(genedb_filename) == db_mtime + and complete_genedb == is_complete) def convert_db(gtf_filename, genedb_filename, convert_fn, args): @@ -277,13 +284,13 @@ def convert_db(gtf_filename, genedb_filename, convert_fn, args): if not args.clean_start: if convert_fn == gtf2db: - converted_db = find_coverted_db(converted_gtfs, gtf_filename) + converted_db = find_converted_db(converted_gtfs, gtf_filename) if converted_db is not None: logger.info("Gene annotation file found. Using " + converted_db) return gtf_filename, converted_db else: for converted_gtf in converted_gtfs: - if compare_stored_gtf(converted_gtfs, converted_gtf, genedb_filename): + if compare_stored_gtf(converted_gtfs, converted_gtf, genedb_filename, args.complete_genedb): logger.info("Gene annotation file found. 
Using " + converted_gtf) return converted_gtf, genedb_filename From 90b4f9c5fd70609e6363fd097f4f9d278b12ff02 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Thu, 18 Apr 2024 16:04:33 +0300 Subject: [PATCH 06/21] add --no_gtf_check option --- README.md | 3 +++ isoquant.py | 2 ++ src/gtf2db.py | 11 ++++++++--- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index f74c72bc..c0afb9cc 100644 --- a/README.md +++ b/README.md @@ -552,6 +552,9 @@ We recommend _not_ to modify these options unless you are clearly aware of their `--no_gzip` Do not compress large output files. +`--no_gtf_check` + Do not perform input GTF checks. + `--no_secondary` Ignore secondary alignments. diff --git a/isoquant.py b/isoquant.py index 363ef061..aaefde88 100755 --- a/isoquant.py +++ b/isoquant.py @@ -193,6 +193,8 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help # ADDITIONAL add_additional_option("--no_gzip", help="do not gzip large output files", dest="gzipped", action='store_false', default=True) + add_additional_option("--no_gtf_check", help="do not perform GTF checks", dest="gtf_check", + action='store_false', default=True) add_additional_option("--high_memory", help="increase RAM consumption (store alignment and the genome in RAM)", action='store_true', default=False) add_additional_option("--no_junc_bed", action="store_true", default=False, diff --git a/src/gtf2db.py b/src/gtf2db.py index 473e8773..ffcd7d31 100755 --- a/src/gtf2db.py +++ b/src/gtf2db.py @@ -91,7 +91,7 @@ def get_color(transcript_kind): logger.info("Gene database BED written to " + bed) -def gtf2db(gtf, db, complete_db=False): +def check_input_gtf(gtf, db, complete_db): logger.info("Checking input gene annotation") gtf_is_correct, corrected_gtf, out_fname, has_meta_features = check_gtf_duplicates(gtf) if not gtf_is_correct: @@ -120,6 +120,11 @@ def gtf2db(gtf, db, complete_db=False): logger.warning("The results of this run might not be correct.")
logger.warning("Remove the output folder and restart IsoQuant without --complete_genedb.") + +def gtf2db(gtf, db, complete_db=False, check_gtf=True): + if check_gtf: + check_input_gtf(gtf, db, complete_db) + logger.info("Converting gene annotation file to .db format (takes a while)...") gffutils.create_db(gtf, db, force=True, keep_order=True, merge_strategy='error', sort_attribute_values=True, disable_infer_transcripts=complete_db, @@ -128,7 +133,7 @@ def gtf2db(gtf, db, complete_db=False): logger.info("Provide this database next time to avoid excessive conversion") -def convert_gtf_to_db(args, output_is_dir=True): +def convert_gtf_to_db(args, output_is_dir=True, check_input_gtf=True): gtf_filename = args.genedb gtf_filename = os.path.abspath(gtf_filename) output_path = args.output if args.genedb_output is None else args.genedb_output @@ -295,7 +300,7 @@ def convert_db(gtf_filename, genedb_filename, convert_fn, args): return converted_gtf, genedb_filename if convert_fn == gtf2db: - convert_fn(gtf_filename, genedb_filename, args.complete_genedb) + convert_fn(gtf_filename, genedb_filename, args.complete_genedb, args.gtf_check) else: convert_fn(genedb_filename, gtf_filename) converted_gtfs[gtf_filename] = { From 11ed7299377ffa5ec455c8f00c9766f6599834a0 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Thu, 18 Apr 2024 16:12:35 +0300 Subject: [PATCH 07/21] check for converted DB correctness --- src/gtf2db.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/gtf2db.py b/src/gtf2db.py index ffcd7d31..c8220014 100755 --- a/src/gtf2db.py +++ b/src/gtf2db.py @@ -262,23 +262,22 @@ def check_gtf_duplicates(gtf): return gtf_correct, corrected_gtf, gtf_name + ".corrected" + inner_ext.lower(), complete_genedb -def find_converted_db(converted_gtfs, gtf_filename): +def find_converted_db(converted_gtfs, gtf_filename, complete_genedb): gtf_mtime = converted_gtfs.get(gtf_filename, {}).get('gtf_mtime') db_mtime = 
converted_gtfs.get(gtf_filename, {}).get('db_mtime') db_file = converted_gtfs.get(gtf_filename, {}).get('genedb') - if os.path.exists(gtf_filename) and os.path.getmtime(gtf_filename) == gtf_mtime: - if os.path.exists(db_file) and os.path.getmtime(db_file) == db_mtime: - return db_file + is_complete = converted_gtfs.get(gtf_filename, {}).get('complete_db') + if (os.path.exists(gtf_filename) and os.path.getmtime(gtf_filename) == gtf_mtime and + os.path.exists(db_file) and os.path.getmtime(db_file) == db_mtime and complete_genedb == is_complete): + return db_file return None -def compare_stored_gtf(converted_gtfs, gtf_filename, genedb_filename, complete_genedb): +def compare_stored_gtf(converted_gtfs, gtf_filename, genedb_filename): gtf_mtime = converted_gtfs.get(gtf_filename, {}).get('gtf_mtime') db_mtime = converted_gtfs.get(gtf_filename, {}).get('db_mtime') - is_complete = converted_gtfs.get(gtf_filename, {}).get('complete_db') return (os.path.exists(gtf_filename) and os.path.getmtime(gtf_filename) == gtf_mtime and - os.path.exists(genedb_filename) and os.path.getmtime(genedb_filename) == db_mtime - and complete_genedb == is_complete) + os.path.exists(genedb_filename) and os.path.getmtime(genedb_filename) == db_mtime) def convert_db(gtf_filename, genedb_filename, convert_fn, args): @@ -289,13 +288,13 @@ def convert_db(gtf_filename, genedb_filename, convert_fn, args): if not args.clean_start: if convert_fn == gtf2db: - converted_db = find_converted_db(converted_gtfs, gtf_filename) + converted_db = find_converted_db(converted_gtfs, gtf_filename, args.complete_genedb) if converted_db is not None: logger.info("Gene annotation file found. Using " + converted_db) return gtf_filename, converted_db else: for converted_gtf in converted_gtfs: - if compare_stored_gtf(converted_gtfs, converted_gtf, genedb_filename, args.complete_genedb): + if compare_stored_gtf(converted_gtfs, converted_gtf, genedb_filename): logger.info("Gene annotation file found. 
Using " + converted_gtf) return converted_gtf, genedb_filename From daaef96f1ff1ea314c09c5606b8d84b573aa4ef4 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Fri, 19 Apr 2024 01:48:59 +0300 Subject: [PATCH 08/21] simplify assignment type logic, fix isoform discovery --- src/graph_based_model_construction.py | 16 ++++++------- src/isoform_assignment.py | 34 ++++++++++++++++----------- src/long_read_assigner.py | 8 +++---- src/long_read_counter.py | 8 +++---- src/multimap_resolver.py | 8 +++---- 5 files changed, 39 insertions(+), 35 deletions(-) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index 0c377623..7395e55a 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -197,7 +197,7 @@ def compare_models_with_known(self): assignment.introns_match = all(e == 1 for e in combined_profile.read_intron_profile.read_profile) assignment.gene_info = self.gene_info - if assignment.assignment_type in [ReadAssignmentType.intergenic, ReadAssignmentType.noninformative] or \ + if assignment.assignment_type.is_unassigned() or \ not assignment.isoform_matches: # create intergenic assignment.assignment_type = ReadAssignmentType.intergenic @@ -481,13 +481,15 @@ def construct_assignment_based_isoforms(self, read_assignment_storage): if not read_assignment: continue - if len(read_assignment.corrected_exons) == 1 and read_assignment.polyA_found and not read_assignment.multimapper and \ - read_assignment.assignment_type in {ReadAssignmentType.noninformative, ReadAssignmentType.inconsistent, ReadAssignmentType.intergenic}: + if (len(read_assignment.corrected_exons) == 1 and + read_assignment.polyA_found and not read_assignment.multimapper and + (read_assignment.assignment_type.is_unassigned() or + read_assignment.assignment_type.is_inconsistent())): novel_mono_exon_reads.append(read_assignment) - if read_assignment.assignment_type not in {ReadAssignmentType.unique, -
ReadAssignmentType.unique_minor_difference}: + if not read_assignment.assignment_type.is_unique(): continue + refrenence_isoform_id = read_assignment.isoform_matches[0].assigned_transcript if refrenence_isoform_id in GraphBasedModelConstructor.detected_known_isoforms: continue @@ -715,9 +717,7 @@ def assign_reads_to_models(self, read_assignments): model_assignment = assigner.assign_to_isoform(assignment.read_id, model_combined_profile) model_assignment.read_group = assignment.read_group # check that no serious contradiction occurs - if model_assignment.assignment_type in [ReadAssignmentType.unique, - ReadAssignmentType.unique_minor_difference, - ReadAssignmentType.ambiguous]: + if model_assignment.assignment_type.is_consistent(): matched_isoforms = [m.assigned_transcript for m in model_assignment.isoform_matches] if len(matched_isoforms) == 1: diff --git a/src/isoform_assignment.py b/src/isoform_assignment.py index 0de604c3..916b59f4 100644 --- a/src/isoform_assignment.py +++ b/src/isoform_assignment.py @@ -22,20 +22,26 @@ class ReadAssignmentType(Enum): unique_minor_difference = 2 inconsistent = 30 inconsistent_non_intronic = 31 - inconsistent_amb = 32 + inconsistent_ambiguous = 32 suspended = 255 - @staticmethod - def is_inconsistent(assignment_type): - return assignment_type in [ReadAssignmentType.inconsistent, - ReadAssignmentType.inconsistent_amb, - ReadAssignmentType.inconsistent_non_intronic] + def is_inconsistent(self): + return self in [ReadAssignmentType.inconsistent, + ReadAssignmentType.inconsistent_ambiguous, + ReadAssignmentType.inconsistent_non_intronic] - @staticmethod - def is_consistent(assignment_type): - return assignment_type in [ReadAssignmentType.unique, - ReadAssignmentType.unique_minor_difference, - ReadAssignmentType.ambiguous] + def is_consistent(self): + return self in [ReadAssignmentType.unique, + ReadAssignmentType.unique_minor_difference, + ReadAssignmentType.ambiguous] + + def is_unassigned(self): + return self in 
[ReadAssignmentType.noninformative, + ReadAssignmentType.intergenic] + + def is_unique(self): + return self in [ReadAssignmentType.unique_minor_difference, + ReadAssignmentType.unique] # SQANTI-like @unique @@ -538,9 +544,9 @@ def __init__(self, read_id, assignment_type, match=None): assigned_genes = set([m.assigned_gene for m in self.isoform_matches]) self.gene_assignment_type = ReadAssignmentType.ambiguous if len( assigned_genes) > 1 else ReadAssignmentType.unique - elif self.assignment_type == ReadAssignmentType.inconsistent_amb: + elif self.assignment_type == ReadAssignmentType.inconsistent_ambiguous: assigned_genes = set([m.assigned_gene for m in self.isoform_matches]) - self.gene_assignment_type = ReadAssignmentType.inconsistent_amb if len( + self.gene_assignment_type = ReadAssignmentType.inconsistent_ambiguous if len( assigned_genes) > 1 else ReadAssignmentType.inconsistent else: self.gene_assignment_type = self.assignment_type @@ -728,7 +734,7 @@ def match_subtype_to_str_with_additional_info(event, strand, read_introns, isofo def is_matching_assignment(isoform_assignment): if isoform_assignment.assignment_type == ReadAssignmentType.unique: return True - elif isoform_assignment.assignment_type in [ReadAssignmentType.unique, ReadAssignmentType.unique_minor_difference]: + elif isoform_assignment.assignment_type.is_unique(): allowed_set = {MatchEventSubtype.none, MatchEventSubtype.fsm, MatchEventSubtype.exon_misalignment, diff --git a/src/long_read_assigner.py b/src/long_read_assigner.py index d50aeea7..8041e920 100644 --- a/src/long_read_assigner.py +++ b/src/long_read_assigner.py @@ -322,7 +322,7 @@ def check_read_ends(self, read_split_exon_profile, assignment): if any(MatchEventSubtype.is_major_elongation(e.event_type) for e in exon_elongation_types): # serious exon elongation - if not ReadAssignmentType.is_inconsistent(assignment.assignment_type): + if not assignment.assignment_type.is_inconsistent(): 
assignment.set_assignment_type(ReadAssignmentType.inconsistent_non_intronic) elif any(MatchEventSubtype.is_minor_elongation(e.event_type) for e in exon_elongation_types): # minor exon elongation @@ -514,7 +514,7 @@ def match_consistent(self, read_id, combined_read_profile): self.verify_read_ends_for_assignment(combined_read_profile, assignment) # if apa detected isoform should be treated as inconsistent and rechecked - if ReadAssignmentType.is_inconsistent(assignment.assignment_type): + if assignment.assignment_type.is_inconsistent(): return None return assignment @@ -603,7 +603,7 @@ def match_inconsistent(self, read_id, combined_read_profile): assignment_type = self.classify_assignment(best_isoforms, read_matches) if not combined_read_profile.read_intron_profile.read_profile: isoform_matches = self.create_monoexon_matches(read_matches, best_isoforms) - elif ReadAssignmentType.is_inconsistent(assignment_type): + elif assignment_type.is_inconsistent(): isoform_matches = self.create_inconsistent_matches(read_matches, best_isoforms, penalty_score) else: isoform_matches = self.create_consistent_matches(read_matches, best_isoforms, combined_read_profile) @@ -626,7 +626,7 @@ def classify_assignment(self, best_isoforms, read_matches): inconsistency_type = ReadAssignmentType.inconsistent else: inconsistency_type = ReadAssignmentType.inconsistent_non_intronic - assignment_type = ReadAssignmentType.inconsistent_amb if is_abmiguous else inconsistency_type + assignment_type = ReadAssignmentType.inconsistent_ambiguous if is_abmiguous else inconsistency_type elif any(MatchEventSubtype.is_minor_error(e) for e in all_event_types): #logger.debug("* * Assignment has minor errors") assignment_type = ReadAssignmentType.ambiguous if is_abmiguous else ReadAssignmentType.unique_minor_difference diff --git a/src/long_read_counter.py b/src/long_read_counter.py index f7ffa22b..43657def 100644 --- a/src/long_read_counter.py +++ b/src/long_read_counter.py @@ -164,8 +164,7 @@ def 
add_read_info(self, read_assignment=None): # TODO: add __alignment_not_unique / __too_low_aQual ? if not read_assignment: self.not_aligned_reads += 1 - if read_assignment.assignment_type in [ReadAssignmentType.noninformative, ReadAssignmentType.intergenic] or \ - not read_assignment.isoform_matches: + if read_assignment.assignment_type.is_unassigned() or not read_assignment.isoform_matches: self.not_assigned_reads += 1 elif read_assignment.isoform_matches and read_assignment.isoform_matches[0].assigned_transcript is None: self.not_assigned_reads += 1 @@ -186,7 +185,7 @@ def add_read_info(self, read_assignment=None): if count_value > 0: self.all_features.add(feature_id) self.ambiguous_reads += 1 - elif read_assignment.assignment_type == ReadAssignmentType.inconsistent: + elif read_assignment.assignment_type.is_inconsistent(): feature_ids = set([self.get_feature_id(m) for m in read_assignment.isoform_matches]) group_id = AbstractReadGrouper.default_group_id if self.ignore_read_groups else read_assignment.read_group count_value = self.read_counter.process_inconsistent(read_assignment.isoform_matches[0], len(feature_ids)) @@ -196,8 +195,7 @@ def add_read_info(self, read_assignment=None): self.feature_counter[group_id][feature_id] += count_value self.all_features.add(feature_id) self.confirmed_features.add((group_id, feature_id)) - elif read_assignment.assignment_type == ReadAssignmentType.unique or\ - read_assignment.assignment_type == ReadAssignmentType.unique_minor_difference: + elif read_assignment.assignment_type.is_unique(): feature_id = self.get_feature_id(read_assignment.isoform_matches[0]) group_id = AbstractReadGrouper.default_group_id if self.ignore_read_groups else read_assignment.read_group self.feature_counter[group_id][feature_id] += 1.0 diff --git a/src/multimap_resolver.py b/src/multimap_resolver.py index 26fc1abc..1567e455 100644 --- a/src/multimap_resolver.py +++ b/src/multimap_resolver.py @@ -50,11 +50,11 @@ def select_best_assignment(self, 
assignment_list): primary_inconsistent = set() for i, a in enumerate(assignment_list): - if ReadAssignmentType.is_inconsistent(a.assignment_type): + if a.assignment_type.is_inconsistent(): if not a.multimapper: primary_inconsistent.add(i) inconsistent_assignments.add(i) - elif ReadAssignmentType.is_consistent(a.assignment_type): + elif a.assignment_type.is_consistent(): consistent_assignments.add(i) if not a.multimapper and not a.assignment_type == ReadAssignmentType.ambiguous: primary_unique.add(i) @@ -111,8 +111,8 @@ def filter_assignments(assignment_list, assignments_to_keep, mark_as_ambiguous=T for i in range(len(assignment_list)): assignment = assignment_list[i] if i in assignments_to_keep: - if ReadAssignmentType.is_inconsistent(assignment.assignment_type): - ambiguity_assignment_type = ReadAssignmentType.inconsistent_amb + if assignment.assignment_type.is_inconsistent(): + ambiguity_assignment_type = ReadAssignmentType.inconsistent_ambiguous else: ambiguity_assignment_type = ReadAssignmentType.ambiguous From 9bfcb8a409d6929fb5b2506a9bafebee68f16e2c Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Fri, 19 Apr 2024 18:15:20 +0300 Subject: [PATCH 09/21] fix multimapping --- src/multimap_resolver.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/multimap_resolver.py b/src/multimap_resolver.py index 1567e455..0583b06f 100644 --- a/src/multimap_resolver.py +++ b/src/multimap_resolver.py @@ -118,6 +118,7 @@ def filter_assignments(assignment_list, assignments_to_keep, mark_as_ambiguous=T if change_transcript_assignment_type: assignment.assignment_type = ambiguity_assignment_type + assignment.multimapper = True if change_gene_assignment_type: assignment.gene_assignment_type = ambiguity_assignment_type assignment.multimapper = True From b5a246b99a75b508080b813929cf85dcb8345571 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Mon, 22 Apr 2024 22:58:37 +0300 Subject: [PATCH 10/21] proper duplicates detection and removal --- src/alignment_processor.py 
| 6 ++-- src/dataset_processor.py | 2 +- src/isoform_assignment.py | 26 ++++++++++++++ src/multimap_resolver.py | 73 +++++++++++++++++++++++++++++++++----- 4 files changed, 95 insertions(+), 12 deletions(-) diff --git a/src/alignment_processor.py b/src/alignment_processor.py index 2022ee3f..24f038e5 100644 --- a/src/alignment_processor.py +++ b/src/alignment_processor.py @@ -284,7 +284,7 @@ def process_alignments_in_region(self, current_region, alignment_storage): if gene_info.empty(): assignment_storage = self.process_intergenic(alignment_storage, current_region) else: - assignment_storage = self.process_genic(alignment_storage, gene_info) + assignment_storage = self.process_genic(alignment_storage, gene_info, current_region) return gene_info, assignment_storage @@ -332,6 +332,7 @@ def process_intergenic(self, alignment_storage, region): alignment_info.polya_info.internal_polyt_pos != -1) read_assignment.polya_info = alignment_info.polya_info read_assignment.cage_found = len(alignment_info.cage_hits) > 0 + read_assignment.genomic_region = region read_assignment.exons = alignment_info.read_exons read_assignment.corrected_exons = corrector.correct_read(alignment_info) read_assignment.corrected_introns = junctions_from_blocks(read_assignment.corrected_exons) @@ -346,7 +347,7 @@ def process_intergenic(self, alignment_storage, region): assignment_storage.append(read_assignment) return assignment_storage - def process_genic(self, alignment_storage, gene_info): + def process_genic(self, alignment_storage, gene_info, region): assigner = LongReadAssigner(gene_info, self.params) profile_constructor = CombinedProfileConstructor(gene_info, self.params) exon_corrector = ExonCorrector(gene_info, self.params, self.chr_record) @@ -388,6 +389,7 @@ def process_genic(self, alignment_storage, gene_info): alignment_info.polya_info.internal_polyt_pos != -1) read_assignment.polya_info = alignment_info.polya_info read_assignment.cage_found = len(alignment_info.cage_hits) > 0 + 
read_assignment.genomic_region = region read_assignment.exons = alignment_info.read_exons read_assignment.corrected_exons = exon_corrector.correct_assigned_read(alignment_info, read_assignment) diff --git a/src/dataset_processor.py b/src/dataset_processor.py index 884930cf..8ddb4a6c 100644 --- a/src/dataset_processor.py +++ b/src/dataset_processor.py @@ -353,7 +353,7 @@ def finalize_aggregators(self, sample): logger.info("Gene counts are stored in " + self.gene_counter.output_counts_file_name) logger.info("Transcript counts are stored in " + self.transcript_counter.output_counts_file_name) logger.info("Read assignments are stored in " + self.basic_printer.output_file_name + - ".gz" if self.args.gzipped else "") + (".gz" if self.args.gzipped else "")) self.read_stat_counter.print_start("Read assignment statistics") diff --git a/src/isoform_assignment.py b/src/isoform_assignment.py index 916b59f4..41c1b525 100644 --- a/src/isoform_assignment.py +++ b/src/isoform_assignment.py @@ -467,6 +467,12 @@ def __init__(self, read_assignment): self.assignment_id = read_assignment.assignment_id self.read_id = read_assignment.read_id self.chr_id = read_assignment.chr_id + self.start = 0 + self.end = 0 + if read_assignment.exons: + self.start = read_assignment.exons[0][0] + self.end = read_assignment.exons[-1][1] + self.genomic_region = read_assignment.genomic_region self.multimapper = read_assignment.multimapper self.polyA_found = read_assignment.polyA_found self.assignment_type = read_assignment.assignment_type @@ -487,12 +493,24 @@ def __init__(self, read_assignment): self.genes = list(gene_set) self.isoforms = list(isoform_set) + def __eq__(self, other): + if isinstance(other, BasicReadAssignment): + return (self.read_id == other.read_id and + self.chr_id == other.chr_id and + self.start == other.start and + self.end == other.end and + self.isoforms == other.isoforms) + return False + @classmethod def deserialize(cls, infile): read_assignment = cls.__new__(cls) 
read_assignment.assignment_id = read_int(infile) read_assignment.read_id = read_string(infile) read_assignment.chr_id = read_string(infile) + read_assignment.start = read_int(infile) + read_assignment.end = read_int(infile) + read_assignment.genomic_region = (read_int(infile), read_int(infile)) bool_arr = read_bool_array(infile, 2) read_assignment.multimapper = bool_arr[0] read_assignment.polyA_found = bool_arr[1] @@ -507,6 +525,10 @@ def serialize(self, outfile): write_int(self.assignment_id, outfile) write_string(self.read_id, outfile) write_string(self.chr_id, outfile) + write_int(self.start, outfile) + write_int(self.end, outfile) + write_int(self.genomic_region[0], outfile) + write_int(self.genomic_region[1], outfile) write_bool_array([self.multimapper, self.polyA_found], outfile) write_short_int(self.assignment_type.value, outfile) write_short_int(self.gene_assignment_type.value, outfile) @@ -520,6 +542,7 @@ class ReadAssignment: def __init__(self, read_id, assignment_type, match=None): self.assignment_id = ReadAssignment.assignment_id_generator.increment() self.read_id = read_id + self.genomic_region = (0, 0) self.exons = None self.corrected_exons = None self.corrected_introns = None @@ -562,6 +585,7 @@ def deserialize(cls, infile, gene_info): read_assignment = cls.__new__(cls) read_assignment.assignment_id = read_int(infile) read_assignment.read_id = read_string(infile) + read_assignment.genomic_region = (read_int(infile), read_int(infile)) read_assignment.exons = read_list_of_pairs(infile, read_int) read_assignment.corrected_exons = read_list_of_pairs(infile, read_int) read_assignment.corrected_introns = junctions_from_blocks(read_assignment.corrected_exons) @@ -589,6 +613,8 @@ def deserialize(cls, infile, gene_info): def serialize(self, outfile): write_int(self.assignment_id, outfile) write_string(self.read_id, outfile) + write_int(self.genomic_region[0], outfile) + write_int(self.genomic_region[1], outfile) write_list_of_pairs(self.exons, outfile, 
write_int) write_list_of_pairs(self.corrected_exons, outfile, write_int) write_bool_array([self.multimapper, self.polyA_found, self.cage_found], outfile) diff --git a/src/multimap_resolver.py b/src/multimap_resolver.py index 0583b06f..07a70ef2 100644 --- a/src/multimap_resolver.py +++ b/src/multimap_resolver.py @@ -6,6 +6,7 @@ import logging from enum import Enum +from collections import defaultdict from .isoform_assignment import ReadAssignmentType @@ -19,6 +20,7 @@ class MultimapResolvingStrategy(Enum): class MultimapResolver: + duplicate_counter = 0 def __init__(self, strategy): self.strategy = strategy @@ -44,20 +46,20 @@ def resolve(self, assignment_list): def select_best_assignment(self, assignment_list): logger.debug("Resolving read %s" % assignment_list[0].read_id) - primary_unique = set() - consistent_assignments = set() - inconsistent_assignments = set() - primary_inconsistent = set() + primary_unique = [] + consistent_assignments = [] + inconsistent_assignments = [] + primary_inconsistent = [] for i, a in enumerate(assignment_list): if a.assignment_type.is_inconsistent(): if not a.multimapper: - primary_inconsistent.add(i) - inconsistent_assignments.add(i) + primary_inconsistent.append(i) + inconsistent_assignments.append(i) elif a.assignment_type.is_consistent(): - consistent_assignments.add(i) + consistent_assignments.append(i) if not a.multimapper and not a.assignment_type == ReadAssignmentType.ambiguous: - primary_unique.add(i) + primary_unique.append(i) if primary_unique: return self.filter_assignments(assignment_list, primary_unique) @@ -98,6 +100,8 @@ def merge_assignments(assignment_list): # if selected assignments feature more than 1 gene/isoform - mark as ambiguous accordingly @staticmethod def filter_assignments(assignment_list, assignments_to_keep, mark_as_ambiguous=True): + assignments_to_keep = MultimapResolver.find_duplicates(assignment_list, assignments_to_keep) + all_genes = set() all_isoforms = set() for i in assignments_to_keep: @@ 
-108,9 +112,10 @@ def filter_assignments(assignment_list, assignments_to_keep, mark_as_ambiguous=T change_transcript_assignment_type = mark_as_ambiguous and len(all_isoforms) > 1 change_gene_assignment_type = mark_as_ambiguous and len(all_genes) > 1 + assignments_to_keep_set = set(assignments_to_keep) for i in range(len(assignment_list)): assignment = assignment_list[i] - if i in assignments_to_keep: + if i in assignments_to_keep_set: if assignment.assignment_type.is_inconsistent(): ambiguity_assignment_type = ReadAssignmentType.inconsistent_ambiguous else: @@ -127,3 +132,53 @@ def filter_assignments(assignment_list, assignments_to_keep, mark_as_ambiguous=T assignment.gene_assignment_type = ReadAssignmentType.suspended return assignment_list + + # finds groups of duplicated assignments + @staticmethod + def find_duplicates(assignment_list, assignment_indices): + if len(assignment_indices) <= 1: + return assignment_indices + + selected_assignments = [] + discarded_duplicates = set() + for i in range(len(assignment_indices)): + index1 = assignment_indices[i] + if index1 in discarded_duplicates: + # this assignment was already discarded - no need to compare it + continue + # if it was not discarded earlier - keep anyway; next assignments will either duplicate it or won't + selected_assignments.append(index1) + + for j in range(i + 1, len(assignment_indices)): + # find duplicating assignments among the next onex + index2 = assignment_indices[j] + if index2 in discarded_duplicates: + # this assignment was already discarded - no need to compare it + continue + + if assignment_list[index1] == assignment_list[index2]: + # discard duplicating assignment with larger index + discarded_duplicates.add(index2) + + # check if duplicates come from the same region, + # if they are - it means we have identical records in the BAM files, print a warning + if assignment_list[index1].genomic_region == assignment_list[index2].genomic_region: + MultimapResolver.duplicate_counter += 1 + 
example_assignment = assignment_list[index1] + if MultimapResolver.duplicate_counter == 5: + logger.warning( + "More possible duplicates were detected but will not be printed to limit the log size. " + "All duplicated entries will be ignored. " + "Use --debug to see all of them in the isoquant.log.") + if MultimapResolver.duplicate_counter >= 5: + logger.debug( + "Read %s seems to have duplicated alignment entries at %s: %d and will be ignored. " + "Please, check you input." % + (example_assignment.read_id, example_assignment.chr_id, example_assignment.start)) + else: + logger.warning( + "Read %s seems to have duplicated alignment entries at %s: %d and will be ignored. " + "Please, check you input." + % (example_assignment.read_id, example_assignment.chr_id, example_assignment.start)) + + return selected_assignments From 875d8357d0f60c0940fa2e311c3c53fd85809713 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Mon, 22 Apr 2024 23:45:23 +0300 Subject: [PATCH 11/21] fix printing order --- src/isoform_assignment.py | 1 + src/long_read_assigner.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/isoform_assignment.py b/src/isoform_assignment.py index 41c1b525..8697444b 100644 --- a/src/isoform_assignment.py +++ b/src/isoform_assignment.py @@ -539,6 +539,7 @@ def serialize(self, outfile): class ReadAssignment: assignment_id_generator = AtomicCounter() + def __init__(self, read_id, assignment_type, match=None): self.assignment_id = ReadAssignment.assignment_id_generator.increment() self.read_id = read_id diff --git a/src/long_read_assigner.py b/src/long_read_assigner.py index 8041e920..7e605cfb 100644 --- a/src/long_read_assigner.py +++ b/src/long_read_assigner.py @@ -381,7 +381,7 @@ def categorize_correct_splice_match(self, combined_read_profile, isoform_id): # make proper match subtype def categorize_multiple_splice_matches(self, combined_read_profile, isoform_ids): isoform_matches = [] - for isoform_id in isoform_ids: + for isoform_id 
in sorted(isoform_ids): isoform_matches.append(self.categorize_correct_splice_match(combined_read_profile, isoform_id)) return isoform_matches @@ -400,7 +400,7 @@ def categorize_correct_unspliced_match(self, combined_read_profile, isoform_id): def categorize_multiple_unspliced_matches(self, combined_read_profile, isoform_ids): isoform_matches = [] - for isoform_id in isoform_ids: + for isoform_id in sorted(isoform_ids): isoform_matches.append(self.categorize_correct_unspliced_match(combined_read_profile, isoform_id)) return isoform_matches @@ -593,7 +593,7 @@ def match_inconsistent(self, read_id, combined_read_profile): return ReadAssignment(read_id, ReadAssignmentType.noninformative) # logger.debug("* Inconsistencies detected: " + str(read_matches)) - # select ones with least number of inconsistent events + # select ones with the least number of inconsistent events best_isoforms, penalty_score = self.select_best_among_inconsistent(combined_read_profile, read_matches) if not best_isoforms: return ReadAssignment(read_id, ReadAssignmentType.noninformative) From e1457461d50a22980551b4a466b34f0c39b20963 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Mon, 22 Apr 2024 23:59:13 +0300 Subject: [PATCH 12/21] remove unused flag --- src/multimap_resolver.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/multimap_resolver.py b/src/multimap_resolver.py index 07a70ef2..f02c97b0 100644 --- a/src/multimap_resolver.py +++ b/src/multimap_resolver.py @@ -99,7 +99,7 @@ def merge_assignments(assignment_list): # select all given assignments from the list, makr rest as suspended # if selected assignments feature more than 1 gene/isoform - mark as ambiguous accordingly @staticmethod - def filter_assignments(assignment_list, assignments_to_keep, mark_as_ambiguous=True): + def filter_assignments(assignment_list, assignments_to_keep): assignments_to_keep = MultimapResolver.find_duplicates(assignment_list, assignments_to_keep) all_genes = set() @@ -109,8 
+109,8 @@ def filter_assignments(assignment_list, assignments_to_keep, mark_as_ambiguous=T all_genes.update(assignment.genes) all_isoforms.update(assignment.isoforms) - change_transcript_assignment_type = mark_as_ambiguous and len(all_isoforms) > 1 - change_gene_assignment_type = mark_as_ambiguous and len(all_genes) > 1 + change_transcript_assignment_type = len(all_isoforms) > 1 + change_gene_assignment_type = len(all_genes) > 1 assignments_to_keep_set = set(assignments_to_keep) for i in range(len(assignment_list)): From 757f3a3e01ba54286d4c43c016af7e7ce1bf9421 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Tue, 23 Apr 2024 22:44:57 +0300 Subject: [PATCH 13/21] refactor and improve long read counter --- src/isoform_assignment.py | 12 +-- src/long_read_counter.py | 183 ++++++++++++++++++++++++-------------- 2 files changed, 119 insertions(+), 76 deletions(-) diff --git a/src/isoform_assignment.py b/src/isoform_assignment.py index 8697444b..212ae02b 100644 --- a/src/isoform_assignment.py +++ b/src/isoform_assignment.py @@ -43,6 +43,10 @@ def is_unique(self): return self in [ReadAssignmentType.unique_minor_difference, ReadAssignmentType.unique] + def is_ambiguous(self): + return self in [ReadAssignmentType.ambiguous, + ReadAssignmentType.inconsistent_ambiguous] + # SQANTI-like @unique class MatchClassification(Enum): @@ -667,14 +671,6 @@ def add_match_attribute(self, match_event): m.add_subclassification(match_event) -def get_assigned_transcript_id(match): - return match.assigned_transcript - - -def get_assigned_gene_id(match): - return match.assigned_gene - - match_subtype_printable_names = \ {MatchEventSubtype.ism_left : ('ism_5', 'ism_3', 'ism'), MatchEventSubtype.ism_right : ('ism_3', 'ism_5', 'ism'), diff --git a/src/long_read_counter.py b/src/long_read_counter.py index 43657def..cdaaf2ba 100644 --- a/src/long_read_counter.py +++ b/src/long_read_counter.py @@ -11,8 +11,6 @@ from .isoform_assignment import ( ReadAssignmentType, nonintronic_events, - 
get_assigned_gene_id, - get_assigned_transcript_id ) from .gene_info import FeatureInfo from .read_groups import AbstractReadGrouper @@ -21,54 +19,106 @@ @unique -class CountingStrategies(Enum): +class CountingStrategy(Enum): unique_only = 1 with_ambiguous = 2 - inconsistent_all = 10 - inconsistent_minor = 11 + with_inconsistent = 10 + with_inconsistent_minor = 11 all = 20 all_minor = 21 + def no_inconsistent(self): + return self in [CountingStrategy.unique_only, CountingStrategy.with_ambiguous] -COUNTING_STRATEGIES = ["unique_only", "with_ambiguous", "with_inconsistent", "all"] + def ambiguous(self): + return self in [CountingStrategy.all, CountingStrategy.all_minor, CountingStrategy.with_ambiguous] + def inconsistent_minor(self): + return self in [CountingStrategy.with_inconsistent_minor, CountingStrategy.all_minor] + + def inconsistent(self): + return self in [CountingStrategy.with_inconsistent_minor, CountingStrategy.all_minor, + CountingStrategy.with_inconsistent, CountingStrategy.all] + + +COUNTING_STRATEGIES = [CountingStrategy.unique_only.name, + CountingStrategy.with_ambiguous.name, + CountingStrategy.with_inconsistent.name, + CountingStrategy.all.name] -class ReadWeightCounter: - def __init__(self, strategy, gene_counting=True): - if strategy == "unique_only": - self.strategy = CountingStrategies.unique_only - elif strategy == "with_ambiguous": - self.strategy = CountingStrategies.with_ambiguous - elif strategy == "all": - if gene_counting: - self.strategy = CountingStrategies.all - else: - self.strategy = CountingStrategies.all_minor - elif strategy == "with_inconsistent": - if gene_counting: - self.strategy = CountingStrategies.inconsistent_all - else: - self.strategy = CountingStrategies.inconsistent_minor - else: - raise KeyError("Unknown quantification strategy: " + strategy) - def process_inconsistent(self, isoform_match, feature_count): - if self.strategy == CountingStrategies.unique_only or self.strategy == CountingStrategies.with_ambiguous \ - or 
feature_count > 1: +class CountingStrategyFlags: + def __init__(self, counting_strategy): + self.use_ambiguous = counting_strategy.ambiguous() + self.use_inconsistent_minor = counting_strategy.inconsistent_minor() + self.use_inconsistent = counting_strategy.inconsistent() + + +class GeneAssignmentExtractor: + @staticmethod + def get_features(read_assignment): + gene_set = set() + for m in read_assignment.isoform_matches: + if m.assigned_gene: + gene_set.add(m.assigned_gene) + return gene_set + + @staticmethod + def get_assignment_type(read_assignment): + return read_assignment.gene_assignment_type + + @staticmethod + def confirms_feature(read_assignment): + return read_assignment.gene_assignment_type.is_unique() + + +class TranscriptAssignmentExtractor: + @staticmethod + def get_features(read_assignment): + transcript_set = set() + for m in read_assignment.isoform_matches: + if m.assigned_transcript: + transcript_set.add(m.assigned_transcript) + return transcript_set + + @staticmethod + def get_assignment_type(read_assignment): + return read_assignment.assignment_type + + @staticmethod + def confirms_feature(read_assignment): + transcript_id = read_assignment.isoform_matches[0].assigned_transcript + return (read_assignment.assignment_type.is_unique() and + (len(read_assignment.gene_info.all_isoforms_introns[transcript_id]) == 0 or + len(read_assignment.corrected_exons) > 1)) + + +class ReadWeightCounter: + def __init__(self, strategy_str, relax_strategy=True): + self.strategy = CountingStrategy[strategy_str] + if relax_strategy: + if self.strategy == CountingStrategy.with_inconsistent: + self.strategy = CountingStrategy.with_inconsistent_minor + elif self.strategy == CountingStrategy.all: + self.strategy = CountingStrategy.all_minor + self.strategy_flags = CountingStrategyFlags(self.strategy) + + def process_inconsistent(self, assignment_type, feature_count): + # use only for inconsistent assignments + if assignment_type == ReadAssignmentType.inconsistent_ambiguous 
or feature_count > 1: return 0.0 - elif self.strategy == CountingStrategies.all or self.strategy == CountingStrategies.inconsistent_all: + if self.strategy_flags.use_inconsistent: return 1.0 - elif self.strategy == CountingStrategies.all_minor or self.strategy == CountingStrategies.inconsistent_minor: - for m in isoform_match.match_subclassifications: - if m.event_type not in nonintronic_events: - return 0.0 + if self.strategy_flags.use_inconsistent_minor and assignment_type == ReadAssignmentType.inconsistent_non_intronic: return 1.0 return 0.0 def process_ambiguous(self, feature_count): - if self.strategy in {CountingStrategies.with_ambiguous, - CountingStrategies.all_minor, - CountingStrategies.all}: + if feature_count == 0: + return 0.0 + if feature_count == 1: + return 1.0 + if self.strategy_flags.use_ambiguous: return 1.0 / float(feature_count) else: return 0.0 @@ -141,10 +191,10 @@ def add_unaligned(self, n_reads=1): # count meta-features assigned to reads (genes or isoforms) # get_feature_id --- function that returns feature id form IsoformMatch object class AssignedFeatureCounter(AbstractCounter): - def __init__(self, output_prefix, get_feature_id, read_groups, read_counter, + def __init__(self, output_prefix, assignment_extractor, read_groups, read_counter, all_features=None, ignore_read_groups=False, output_zeroes=True): AbstractCounter.__init__(self, output_prefix, ignore_read_groups, output_zeroes) - self.get_feature_id = get_feature_id + self.assignment_extractor = assignment_extractor self.all_features = set(all_features) if all_features is not None else set() if ignore_read_groups: self.all_groups = [AbstractReadGrouper.default_group_id] @@ -161,48 +211,45 @@ def __init__(self, output_prefix, get_feature_id, read_groups, read_counter, self.output_stats_file_name = self.output_counts_file_name + ".stats" def add_read_info(self, read_assignment=None): - # TODO: add __alignment_not_unique / __too_low_aQual ? 
if not read_assignment: self.not_aligned_reads += 1 - if read_assignment.assignment_type.is_unassigned() or not read_assignment.isoform_matches: + return + elif read_assignment.assignment_type.is_unassigned() or not read_assignment.isoform_matches: self.not_assigned_reads += 1 + return elif read_assignment.isoform_matches and read_assignment.isoform_matches[0].assigned_transcript is None: self.not_assigned_reads += 1 logger.warning("Assigned feature for read %s is None, will be skipped. " "This message may be reported to the developers." % read_assignment.read_id) - elif read_assignment.assignment_type == ReadAssignmentType.ambiguous: - feature_ids = set([self.get_feature_id(m) for m in read_assignment.isoform_matches]) - group_id = AbstractReadGrouper.default_group_id if self.ignore_read_groups else read_assignment.read_group + return - if len(feature_ids) == 1: # different isoforms of same gene - feature_id = list(feature_ids)[0] - self.feature_counter[group_id][feature_id] += 1.0 - self.all_features.add(feature_id) - else: - for feature_id in feature_ids: - count_value = self.read_counter.process_ambiguous(len(feature_ids)) - self.feature_counter[group_id][feature_id] += count_value - if count_value > 0: - self.all_features.add(feature_id) + feature_ids = self.assignment_extractor.get_features(read_assignment) + assignment_type = self.assignment_extractor.get_assignment_type(read_assignment) + group_id = AbstractReadGrouper.default_group_id if self.ignore_read_groups else read_assignment.read_group + + if assignment_type == ReadAssignmentType.ambiguous: + count_value = self.read_counter.process_ambiguous(len(feature_ids)) + for feature_id in feature_ids: + self.feature_counter[group_id][feature_id] += count_value + if count_value > 0: + self.all_features.add(feature_id) + if len(feature_ids): self.ambiguous_reads += 1 + elif read_assignment.assignment_type.is_inconsistent(): - feature_ids = set([self.get_feature_id(m) for m in read_assignment.isoform_matches]) - 
group_id = AbstractReadGrouper.default_group_id if self.ignore_read_groups else read_assignment.read_group - count_value = self.read_counter.process_inconsistent(read_assignment.isoform_matches[0], len(feature_ids)) + count_value = self.read_counter.process_inconsistent(assignment_type, len(feature_ids)) if count_value > 0: for feature_id in feature_ids: - if feature_id: - self.feature_counter[group_id][feature_id] += count_value - self.all_features.add(feature_id) - self.confirmed_features.add((group_id, feature_id)) + # if feature_id: + self.feature_counter[group_id][feature_id] += count_value + self.all_features.add(feature_id) + #self.confirmed_features.add((group_id, feature_id)) + elif read_assignment.assignment_type.is_unique(): - feature_id = self.get_feature_id(read_assignment.isoform_matches[0]) - group_id = AbstractReadGrouper.default_group_id if self.ignore_read_groups else read_assignment.read_group + feature_id = list(feature_ids)[0] self.feature_counter[group_id][feature_id] += 1.0 self.all_features.add(feature_id) - transcript_id = read_assignment.isoform_matches[0].assigned_transcript - if len(read_assignment.gene_info.all_isoforms_introns[transcript_id]) == 0 or len(read_assignment.corrected_exons) > 1: - # only add features to confirmed ones if it has unique spliced match + if self.assignment_extractor.confirms_feature(read_assignment): self.confirmed_features.add((group_id, feature_id)) def add_read_info_raw(self, read_id, feature_ids, group_id=AbstractReadGrouper.default_group_id): @@ -316,16 +363,16 @@ def convert_counts_to_tpm(self): def create_gene_counter(output_file_name, strategy, complete_feature_list=None, read_groups=None, ignore_read_groups=False, output_zeroes=True): - read_weight_counter = ReadWeightCounter(strategy, gene_counting=True) - return AssignedFeatureCounter(output_file_name, get_assigned_gene_id, + read_weight_counter = ReadWeightCounter(strategy, relax_strategy=False) + return AssignedFeatureCounter(output_file_name, 
GeneAssignmentExtractor, read_groups, read_weight_counter, complete_feature_list, ignore_read_groups, output_zeroes) def create_transcript_counter(output_file_name, strategy, complete_feature_list=None, read_groups=None, ignore_read_groups=False, output_zeroes=True): - read_weight_counter = ReadWeightCounter(strategy, gene_counting=False) - return AssignedFeatureCounter(output_file_name, get_assigned_transcript_id, + read_weight_counter = ReadWeightCounter(strategy, relax_strategy=True) + return AssignedFeatureCounter(output_file_name, TranscriptAssignmentExtractor, read_groups, read_weight_counter, complete_feature_list, ignore_read_groups, output_zeroes) From adced67978f0afa2d0403932af03bade0b9ecc70 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Thu, 25 Apr 2024 13:03:17 +0300 Subject: [PATCH 14/21] fix assignment type bug --- src/long_read_counter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/long_read_counter.py b/src/long_read_counter.py index cdaaf2ba..e3efe01a 100644 --- a/src/long_read_counter.py +++ b/src/long_read_counter.py @@ -236,7 +236,7 @@ def add_read_info(self, read_assignment=None): if len(feature_ids): self.ambiguous_reads += 1 - elif read_assignment.assignment_type.is_inconsistent(): + elif assignment_type.is_inconsistent(): count_value = self.read_counter.process_inconsistent(assignment_type, len(feature_ids)) if count_value > 0: for feature_id in feature_ids: @@ -245,7 +245,7 @@ def add_read_info(self, read_assignment=None): self.all_features.add(feature_id) #self.confirmed_features.add((group_id, feature_id)) - elif read_assignment.assignment_type.is_unique(): + elif assignment_type.is_unique(): feature_id = list(feature_ids)[0] self.feature_counter[group_id][feature_id] += 1.0 self.all_features.add(feature_id) From 2662595cc406a3cb601c0c1bd3a62ee3d368c31c Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Wed, 24 Apr 2024 02:12:55 +0300 Subject: [PATCH 15/21] update quantification strategies, set 
reasonable defaults --- README.md | 24 ++++++------- isoquant.py | 73 ++++++++++++++++++++++------------------ src/long_read_counter.py | 33 +++++++++--------- 3 files changed, 69 insertions(+), 61 deletions(-) diff --git a/README.md b/README.md index c0afb9cc..3f7df462 100644 --- a/README.md +++ b/README.md @@ -462,18 +462,18 @@ where `FILE` is the file name, `READ_COL` is column with read ids (0 if not set) #### Quantification -`--transcript_quantification` Transcript quantification strategy, should be one of: - -* `unique_only` - only reads that are uniquely assigned and consistent with a transcript are used for quantification; -* `with_ambiguous` - ambiguously assigned reads are split between transcripts with equal weights (e.g. 1/2 when a read is assigned to 2 transcripts simultaneously, default mode); -* `with_inconsistent` - uniquely assigned reads with non-intronic inconsistencies (i.e. alternative poly-A site, TSS etc) are also included; -* `all` - all of the above. - -`--gene_quantification` Gene quantification strategy, should be one of: - -* `unique_only` - only reads that are uniquely assigned to a gene and consistent with any of gene's isoforms are used for quantification; -* `with_ambiguous` - ambiguously assigned reads are split between genes with equal weights (e.g. 1/2 when a read is assigned to 2 genes simultaneously); -* `with_inconsistent` - only reads that are uniquely assigned to a gene but are not necessarily consistent with genes isoforms (default); +`--transcript_quantification` Transcript quantification strategy; +`--gene_quantification` Gene quantification strategy; + +Available options for quantification: + +* `unique_only` - use only reads that are uniquely assigned and consistent with a transcript/gene +(i.e. flagged as unique/unique_minor_difference), default for transcript quantification; +* `with_ambiguous` - in addition to unique reads, ambiguously assigned consistent reads are split between features with equal weights +(e.g. 
1/2 when a read is assigned to 2 features simultaneously); +* `unique_splicing_consistent` - uses uniquely assigned reads that do not contradict annotated splice sites +(i.e. flagged as unique/unique_minor_difference or inconsistent_non_intronic), default for gene quantification; +* `unique_inconsistent` - uses uniquely assigned reads allowing any kind of inconsistency; * `all` - all of the above. diff --git a/isoquant.py b/isoquant.py index aaefde88..69ec59bf 100755 --- a/isoquant.py +++ b/isoquant.py @@ -35,7 +35,7 @@ from src.dataset_processor import DatasetProcessor, PolyAUsageStrategies from src.graph_based_model_construction import StrandnessReportingLevel from src.long_read_assigner import AmbiguityResolvingMethod -from src.long_read_counter import COUNTING_STRATEGIES +from src.long_read_counter import COUNTING_STRATEGIES, CountingStrategy from src.input_data_storage import InputDataStorage from src.multimap_resolver import MultimapResolvingStrategy from src.stats import combine_counts @@ -56,6 +56,7 @@ def parse_args(cmd_args=None, namespace=None): input_args_group = parser.add_argument_group('Input data') output_args_group = parser.add_argument_group('Output naming') pipeline_args_group = parser.add_argument_group('Pipeline options') + algo_args_group = parser.add_argument_group('Algorithm settings') other_options = parser.add_argument_group("Additional options:") show_full_help = '--full_help' in cmd_args @@ -132,37 +133,40 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help help="reads represent FL transcripts; both ends of the read are considered to be reliable") # ALGORITHM - add_additional_option("--delta", type=int, default=None, - help="delta for inexact splice junction comparison, chosen automatically based on data type") - add_hidden_option("--graph_clustering_distance", type=int, default=None, - help="intron graph clustering distance, " - "splice junctions less that this number of bp apart will not be differentiated") 
- add_additional_option("--matching_strategy", choices=["exact", "precise", "default", "loose"], - help="read-to-isoform matching strategy from the most strict to least", type=str, default=None) - add_additional_option("--splice_correction_strategy", choices=["none", "default_pacbio", "default_ont", "conservative_ont", "all", "assembly"], - help="read alignment correction strategy to use", - type=str, default=None) - add_additional_option("--model_construction_strategy", choices=["reliable", "default_pacbio", - "sensitive_pacbio", "fl_pacbio", "default_ont", - "sensitive_ont", "all", "assembly"], - help="transcript model construction strategy to use", - type=str, default=None) - - add_additional_option("--transcript_quantification", choices=COUNTING_STRATEGIES, - help="transcript quantification strategy", type=str, default="with_ambiguous") - add_additional_option("--gene_quantification", choices=COUNTING_STRATEGIES, - help="gene quantification strategy", type=str, default="with_inconsistent") - add_additional_option("--report_novel_unspliced", "-u", type=bool_str, - help="report novel monoexonic transcripts (true/false), " - "default: false for ONT, true for other data types") - add_additional_option("--report_canonical", type=str, choices=[e.name for e in StrandnessReportingLevel], - help="reporting level for novel transcripts based on canonical splice sites;" - " default: " + StrandnessReportingLevel.auto.name, - default=StrandnessReportingLevel.only_stranded.name) - add_additional_option("--polya_requirement", type=str, choices=[e.name for e in PolyAUsageStrategies], - help="require polyA tails to be present when reporting transcripts; " - "default: auto (requires polyA only when polyA percentage is >= 70%%)", - default=PolyAUsageStrategies.auto.name) + add_additional_option_to_group(algo_args_group, "--report_novel_unspliced", "-u", type=bool_str, + help="report novel monoexonic transcripts (true/false), " + "default: false for ONT, true for other data types") 
+ add_additional_option_to_group(algo_args_group, "--report_canonical", type=str, + choices=[e.name for e in StrandnessReportingLevel], + help="reporting level for novel transcripts based on canonical splice sites;" + " default: " + StrandnessReportingLevel.auto.name, + default=StrandnessReportingLevel.only_stranded.name) + add_additional_option_to_group(algo_args_group, "--polya_requirement", type=str, + choices=[e.name for e in PolyAUsageStrategies], + help="require polyA tails to be present when reporting transcripts; " + "default: auto (requires polyA only when polyA percentage is >= 70%%)", + default=PolyAUsageStrategies.auto.name) + + add_additional_option_to_group(algo_args_group, "--transcript_quantification", choices=COUNTING_STRATEGIES, + help="transcript quantification strategy", type=str, + default=CountingStrategy.unique_only.name) + add_additional_option_to_group(algo_args_group, "--gene_quantification", choices=COUNTING_STRATEGIES, + help="gene quantification strategy", type=str, + default=CountingStrategy.unique_splicing_consistent.name) + + add_additional_option_to_group(algo_args_group, "--matching_strategy", + choices=["exact", "precise", "default", "loose"], + help="read-to-isoform matching strategy from the most strict to least", + type=str, default=None) + add_additional_option_to_group(algo_args_group, "--splice_correction_strategy", + choices=["none", "default_pacbio", "default_ont", + "conservative_ont", "all", "assembly"], + help="read alignment correction strategy to use", type=str, default=None) + add_additional_option_to_group(algo_args_group, "--model_construction_strategy", + choices=["reliable", "default_pacbio", "sensitive_pacbio", "fl_pacbio", + "default_ont", "sensitive_ont", "all", "assembly"], + help="transcript model construction strategy to use", type=str, default=None) + # OUTPUT PROPERTIES pipeline_args_group.add_argument("--threads", "-t", help="number of threads to use", type=int, default="16") @@ -191,6 +195,11 @@ def 
add_hidden_option(*args, **kwargs): # show command only with --full-help help="align reads to reference without running further analysis") # ADDITIONAL + add_additional_option("--delta", type=int, default=None, + help="delta for inexact splice junction comparison, chosen automatically based on data type") + add_hidden_option("--graph_clustering_distance", type=int, default=None, + help="intron graph clustering distance, " + "splice junctions less that this number of bp apart will not be differentiated") add_additional_option("--no_gzip", help="do not gzip large output files", dest="gzipped", action='store_false', default=True) add_additional_option("--no_gtf_check", help="do not perform GTF checks", dest="gtf_check", diff --git a/src/long_read_counter.py b/src/long_read_counter.py index e3efe01a..de2cb31d 100644 --- a/src/long_read_counter.py +++ b/src/long_read_counter.py @@ -22,28 +22,29 @@ class CountingStrategy(Enum): unique_only = 1 with_ambiguous = 2 - with_inconsistent = 10 - with_inconsistent_minor = 11 + unique_splicing_consistent = 10 + unique_inconsistent = 11 + # all_splicing_consistent = 19 all = 20 - all_minor = 21 def no_inconsistent(self): return self in [CountingStrategy.unique_only, CountingStrategy.with_ambiguous] def ambiguous(self): - return self in [CountingStrategy.all, CountingStrategy.all_minor, CountingStrategy.with_ambiguous] + return self in [CountingStrategy.all, CountingStrategy.with_ambiguous] def inconsistent_minor(self): - return self in [CountingStrategy.with_inconsistent_minor, CountingStrategy.all_minor] + return self == CountingStrategy.unique_splicing_consistent def inconsistent(self): - return self in [CountingStrategy.with_inconsistent_minor, CountingStrategy.all_minor, - CountingStrategy.with_inconsistent, CountingStrategy.all] + return self in [CountingStrategy.unique_splicing_consistent, + CountingStrategy.unique_inconsistent, CountingStrategy.all] COUNTING_STRATEGIES = [CountingStrategy.unique_only.name, 
CountingStrategy.with_ambiguous.name, - CountingStrategy.with_inconsistent.name, + CountingStrategy.unique_splicing_consistent.name, + CountingStrategy.unique_inconsistent.name, CountingStrategy.all.name] @@ -94,19 +95,17 @@ def confirms_feature(read_assignment): class ReadWeightCounter: - def __init__(self, strategy_str, relax_strategy=True): + def __init__(self, strategy_str): self.strategy = CountingStrategy[strategy_str] - if relax_strategy: - if self.strategy == CountingStrategy.with_inconsistent: - self.strategy = CountingStrategy.with_inconsistent_minor - elif self.strategy == CountingStrategy.all: - self.strategy = CountingStrategy.all_minor self.strategy_flags = CountingStrategyFlags(self.strategy) def process_inconsistent(self, assignment_type, feature_count): # use only for inconsistent assignments if assignment_type == ReadAssignmentType.inconsistent_ambiguous or feature_count > 1: - return 0.0 + if self.strategy_flags.use_ambiguous: + return 1.0 / feature_count + else: + return 0.0 if self.strategy_flags.use_inconsistent: return 1.0 if self.strategy_flags.use_inconsistent_minor and assignment_type == ReadAssignmentType.inconsistent_non_intronic: @@ -363,7 +362,7 @@ def convert_counts_to_tpm(self): def create_gene_counter(output_file_name, strategy, complete_feature_list=None, read_groups=None, ignore_read_groups=False, output_zeroes=True): - read_weight_counter = ReadWeightCounter(strategy, relax_strategy=False) + read_weight_counter = ReadWeightCounter(strategy) return AssignedFeatureCounter(output_file_name, GeneAssignmentExtractor, read_groups, read_weight_counter, complete_feature_list, ignore_read_groups, output_zeroes) @@ -371,7 +370,7 @@ def create_gene_counter(output_file_name, strategy, complete_feature_list=None, def create_transcript_counter(output_file_name, strategy, complete_feature_list=None, read_groups=None, ignore_read_groups=False, output_zeroes=True): - read_weight_counter = ReadWeightCounter(strategy, relax_strategy=True) + 
read_weight_counter = ReadWeightCounter(strategy) return AssignedFeatureCounter(output_file_name, TranscriptAssignmentExtractor, read_groups, read_weight_counter, complete_feature_list, ignore_read_groups, output_zeroes) From e1b931e1d5f31e5607e39ee691e9244ace29624d Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Fri, 26 Apr 2024 01:21:56 +0300 Subject: [PATCH 16/21] refactor file merging, introduce new TPM normalization --- src/dataset_processor.py | 66 ++++++++++++---------------------------- src/file_utils.py | 42 ++++++++++++++++--------- src/long_read_counter.py | 15 +++++++-- 3 files changed, 59 insertions(+), 64 deletions(-) diff --git a/src/dataset_processor.py b/src/dataset_processor.py index 8ddb4a6c..ec3e6007 100644 --- a/src/dataset_processor.py +++ b/src/dataset_processor.py @@ -18,11 +18,11 @@ import pysam from pyfaidx import Fasta, Faidx, UnsupportedCompressionFormat -from .common import proper_plural_form, rreplace +from .common import proper_plural_form from .serialization import * from .isoform_assignment import BasicReadAssignment, ReadAssignmentType from .stats import EnumStats -from .file_utils import merge_files +from .file_utils import merge_files, merge_counts from .input_data_storage import SampleData from .alignment_processor import AlignmentCollector, AlignmentType from .long_read_counter import ( @@ -635,26 +635,14 @@ def process_assigned_reads(self, sample, dump_filename): self.merge_transcript_models(sample.prefix, aggregator, chr_ids, gff_printer) logger.info("Transcript model file " + gff_printer.model_fname) if self.args.genedb: - merge_files( - [ - rreplace(extended_gff_printer.model_fname, sample.prefix, f"{sample.prefix}_{chr_id}") - for chr_id in chr_ids - ], - extended_gff_printer.out_gff, - copy_header=False - ) + merge_files(extended_gff_printer.model_fname, sample.prefix, chr_ids, + extended_gff_printer.out_gff, copy_header=False) logger.info("Extended annotation is saved to " + extended_gff_printer.model_fname) 
transcript_stat_counter.print_start("Transcript model statistics") self.merge_assignments(sample, aggregator, chr_ids) if self.args.sqanti_output: - merge_files( - [ - rreplace(sample.out_t2t_tsv, sample.prefix, f"{sample.prefix}_{chr_id}") - for chr_id in chr_ids - ], - open(sample.out_t2t_tsv, "w"), copy_header=False - ) + merge_files(sample.out_t2t_tsv, sample.prefix, chr_ids, open(sample.out_t2t_tsv, "w"), copy_header=False) aggregator.finalize_aggregators(sample) @@ -668,34 +656,20 @@ def load_read_info(self, dump_filename): def merge_assignments(self, sample, aggregator, chr_ids): if self.args.genedb: - merge_files( - [rreplace(sample.out_assigned_tsv, sample.prefix, sample.prefix + "_" + chr_id) for chr_id in chr_ids], - aggregator.basic_printer.output_file, copy_header=False) - merge_files( - [rreplace(sample.out_corrected_bed, sample.prefix, sample.prefix + "_" + chr_id) for chr_id in chr_ids], - aggregator.corrected_bed_printer.output_file, copy_header=False) - for p in aggregator.global_counter.counters: - merge_files( - [rreplace(p.output_counts_file_name, sample.prefix, sample.prefix + "_" + chr_id) for chr_id in chr_ids], - p.get_output_file_handler(), - stats_file_names=[rreplace(p.output_stats_file_name, sample.prefix, sample.prefix + "_" + chr_id) for - chr_id in chr_ids] - if p.output_stats_file_name else None, - ignore_read_groups=p.ignore_read_groups, - unaligned_reads=self.alignment_stat_counter.stats_dict[AlignmentType.unaligned]) - p.convert_counts_to_tpm() + merge_files(sample.out_assigned_tsv, sample.prefix, chr_ids, + aggregator.basic_printer.output_file, copy_header=False) + merge_files(sample.out_corrected_bed, sample.prefix, chr_ids, + aggregator.corrected_bed_printer.output_file, copy_header=False) + + for counter in aggregator.global_counter.counters: + unaligned = self.alignment_stat_counter.stats_dict[AlignmentType.unaligned] + merge_counts(counter, sample.prefix, chr_ids, unaligned) + counter.convert_counts_to_tpm() def 
merge_transcript_models(self, label, aggregator, chr_ids, gff_printer): - merge_files([rreplace(gff_printer.model_fname, label, label + "_" + chr_id) for chr_id in chr_ids], - gff_printer.out_gff, copy_header=False) - merge_files([rreplace(gff_printer.r2t_fname, label, label + "_" + chr_id) for chr_id in chr_ids], - gff_printer.out_r2t, copy_header=False) - for p in aggregator.transcript_model_global_counter.counters: - merge_files([rreplace(p.output_counts_file_name, label, label + "_" + chr_id) for chr_id in chr_ids], - p.get_output_file_handler(), - stats_file_names=[rreplace(p.output_stats_file_name, label, label + "_" + chr_id) for chr_id in - chr_ids] - if p.output_stats_file_name else None, - ignore_read_groups=p.ignore_read_groups, - unaligned_reads=self.alignment_stat_counter.stats_dict[AlignmentType.unaligned]) - p.convert_counts_to_tpm() + merge_files(gff_printer.model_fname, label, chr_ids, gff_printer.out_gff, copy_header=False) + merge_files(gff_printer.r2t_fname, label, chr_ids, gff_printer.out_r2t, copy_header=False) + for counter in aggregator.transcript_model_global_counter.counters: + unaligned = self.alignment_stat_counter.stats_dict[AlignmentType.unaligned] + merge_counts(counter, label, chr_ids, unaligned) + counter.convert_counts_to_tpm() diff --git a/src/file_utils.py b/src/file_utils.py index 4800f682..c0bf318c 100644 --- a/src/file_utils.py +++ b/src/file_utils.py @@ -4,11 +4,17 @@ import shutil import gzip +from .common import rreplace + logger = logging.getLogger('IsoQuant') -def merge_files(file_names, merged_file_handler, stats_file_names=None, ignore_read_groups=True, copy_header=True, - unaligned_reads=0): +def merge_file_list(fname, label, chr_ids): + return [rreplace(fname, label, f"{label}_{chr_id}") for chr_id in chr_ids] + + +def merge_files(file_name, label, chr_ids, merged_file_handler, copy_header=True): + file_names = merge_file_list(file_name, label, chr_ids) file_names.sort(key=lambda s: [int(t) if t.isdigit() else 
t.lower() for t in re.split('(\d+)', s)]) for i, file_name in enumerate(file_names): if not os.path.exists(file_name): continue @@ -24,22 +30,28 @@ def merge_files(file_names, merged_file_handler, stats_file_names=None, ignore_r for file_name in file_names: os.remove(file_name) - ambiguous_reads, not_assigned_reads, not_aligned_reads = 0, 0, unaligned_reads - if stats_file_names and ignore_read_groups: + +def merge_counts(counter, label, chr_ids, unaligned_reads=0): + file_name = counter.output_counts_file_name + merged_file_handler = counter.get_output_file_handler() + merge_files(file_name, label, chr_ids, merged_file_handler) + + counter.reads_for_tpm = 0 + stat_dict = {"__ambiguous": 0, "__no_feature": 0, "__not_aligned": 0, "__usable": 0} + + if counter.output_stats_file_name and counter.ignore_read_groups: + stats_file_names = merge_file_list(counter.output_stats_file_name, label, chr_ids) for file_name in stats_file_names: - with open(file_name) as f: - line = f.readline() - ambiguous_reads += int(line.split()[1]) - line = f.readline() - not_assigned_reads += int(line.split()[1]) - line = f.readline() - if unaligned_reads == 0: - not_aligned_reads += int(line.split()[1]) + for l in open(file_name): + v = l.strip().split() + stat_dict[v[0]] += int(v[1]) os.remove(file_name) - merged_file_handler.write("__ambiguous\t%d\n" % ambiguous_reads) - merged_file_handler.write("__no_feature\t%d\n" % not_assigned_reads) - merged_file_handler.write("__not_aligned\t%d\n" % not_aligned_reads) + if unaligned_reads > 0: + stat_dict["__not_aligned"] = unaligned_reads + for v in ["__ambiguous", "__no_feature", "__not_aligned"]: + merged_file_handler.write("%s\t%d\n" % (v, stat_dict[v])) + counter.reads_for_tpm = stat_dict[ "__usable"] def normalize_path(config_path, file_path): if os.path.isabs(file_path): diff --git a/src/long_read_counter.py b/src/long_read_counter.py index de2cb31d..823dccd0 100644 --- a/src/long_read_counter.py +++ b/src/long_read_counter.py @@ -202,6 
+202,7 @@ def __init__(self, output_prefix, assignment_extractor, read_groups, read_counte self.read_counter = read_counter self.ambiguous_reads = 0 + self.reads_for_tpm = 0 self.not_assigned_reads = 0 self.not_aligned_reads = 0 # group_id -> (feature_id -> count) @@ -226,14 +227,14 @@ def add_read_info(self, read_assignment=None): assignment_type = self.assignment_extractor.get_assignment_type(read_assignment) group_id = AbstractReadGrouper.default_group_id if self.ignore_read_groups else read_assignment.read_group + self.reads_for_tpm += 1 if assignment_type == ReadAssignmentType.ambiguous: count_value = self.read_counter.process_ambiguous(len(feature_ids)) for feature_id in feature_ids: self.feature_counter[group_id][feature_id] += count_value if count_value > 0: self.all_features.add(feature_id) - if len(feature_ids): - self.ambiguous_reads += 1 + self.ambiguous_reads += 1 elif assignment_type.is_inconsistent(): count_value = self.read_counter.process_inconsistent(assignment_type, len(feature_ids)) @@ -260,6 +261,7 @@ def add_read_info_raw(self, read_id, feature_ids, group_id=AbstractReadGrouper.d self.not_assigned_reads += 1 elif len(feature_ids) > 1: self.ambiguous_reads += 1 + self.reads_for_tpm += 1 for feature_id in feature_ids: count_value = self.read_counter.process_ambiguous(len(feature_ids)) self.feature_counter[group_id][feature_id] += count_value @@ -267,9 +269,11 @@ def add_read_info_raw(self, read_id, feature_ids, group_id=AbstractReadGrouper.d else: self.feature_counter[group_id][feature_ids[0]] += 1 self.all_features.add(feature_ids[0]) + self.reads_for_tpm += 1 def add_unassigned(self, n_reads=1): self.not_assigned_reads += n_reads + self.reads_for_tpm += n_reads def add_unaligned(self, n_reads=1): self.not_aligned_reads += n_reads @@ -321,6 +325,7 @@ def dump(self): f.write("__ambiguous\t%d\n" % self.ambiguous_reads) f.write("__no_feature\t%d\n" % self.not_assigned_reads) f.write("__not_aligned\t%d\n" % self.not_aligned_reads) + 
f.write("__usable\t%d\n" % self.reads_for_tpm) def convert_counts_to_tpm(self): total_counts = defaultdict(float) @@ -337,7 +342,11 @@ def convert_counts_to_tpm(self): scale_factors = {} for group_id in total_counts.keys(): - scale_factors[group_id] = 1000000.0 / total_counts[group_id] if total_counts[group_id] > 0 else 1.0 + if self.ignore_read_groups and self.reads_for_tpm: + total_reads = self.reads_for_tpm + else: + total_reads = total_counts[group_id] if total_counts[group_id] > 0 else 1.0 + scale_factors[group_id] = 1000000.0 / total_reads logger.debug("Scale factor for group %s = %.2f" % (group_id, scale_factors[group_id])) with open(self.output_tpm_file_name, "w") as outf: From 4d8d122730cb1786f9fb4f7b5d3a33b919e746a0 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Fri, 26 Apr 2024 13:13:45 +0300 Subject: [PATCH 17/21] introduce normalization option --- README.md | 16 +++++++++++++--- isoquant.py | 6 +++++- src/dataset_processor.py | 4 ++-- src/long_read_counter.py | 17 ++++++++++++++--- 4 files changed, 34 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 3f7df462..368f39fb 100644 --- a/README.md +++ b/README.md @@ -552,7 +552,7 @@ We recommend _not_ to modify these options unless you are clearly aware of their `--no_gzip` Do not compress large output files. -`--no_gzip` +`--no_gtf_check` Do not perform input GTF checks. `--no_secondary` @@ -562,13 +562,13 @@ We recommend _not_ to modify these options unless you are clearly aware of their Force to use this alignment method, can be `starlong` or `minimap2`; `minimap2` is currently used as default. Make sure the specified aligner is in the `$PATH` variable. `--no_junc_bed` - Do not use annotation for read mapping. + Do not use gene annotation for read mapping. `--junc_bed_file` Annotation in BED12 format produced by `paftools.js gff2bed` (can be found in `minimap2`), will be created automatically if not given. 
`--delta` - Delta for inexact splice junction comparison, chosen automatically based on data type. + Delta for inexact splice junction comparison, chosen automatically based on data type (e.g. 4bp for PacBio, 6bp for ONT). `--genedb_output` If your output folder is located on a shared storage (e.g. NFS share), use this option to set another path @@ -588,6 +588,16 @@ but may improve running time when disk I/O is relatively slow. `--simple_alignments_mapq_cutoff` Filers out alignments with 1 or 2 exons and MAPQ less than this value (works only in annotation-free mode, default is 1). +`--normalization_method` Method for normalizing non-grouped counts into TPMs: +* `simple` - standard method, scale factor equals to 1 million divided by the counts sum (default); +* `usable_reads` - includes all reads assigned to a feature including the ones that were filtered out +during quantification (i.e. inconsistent or ambiguous); +scale factor equals to 1 million divided by the number of all assigned reads. +In this case the sum of all gene/transcript TPMs may not add up to 1 million. +Experiments with simulated data show that this method could give more accurate estimations. +However, the normalization method does not affect correlation/relative proportions.
+ ### Examples diff --git a/isoquant.py b/isoquant.py index 69ec59bf..3c575563 100755 --- a/isoquant.py +++ b/isoquant.py @@ -35,7 +35,7 @@ from src.dataset_processor import DatasetProcessor, PolyAUsageStrategies from src.graph_based_model_construction import StrandnessReportingLevel from src.long_read_assigner import AmbiguityResolvingMethod -from src.long_read_counter import COUNTING_STRATEGIES, CountingStrategy +from src.long_read_counter import COUNTING_STRATEGIES, CountingStrategy, NormalizationMethod from src.input_data_storage import InputDataStorage from src.multimap_resolver import MultimapResolvingStrategy from src.stats import combine_counts @@ -221,6 +221,10 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help add_additional_option("--simple_alignments_mapq_cutoff", help="ignore alignments with 1 or 2 exons and " "MAPQ < this (works only in annotation-free mode, " "default=1)", type=int, default=1) + add_additional_option("--normalization_method", type=str, choices=[e.name for e in NormalizationMethod], + help="TPM normalization method: simple - conventional normalization using all counted reads; " + "usable_reads - includes all assigned reads.", + default=NormalizationMethod.simple.name) add_additional_option_to_group(pipeline_args_group, "--keep_tmp", help="do not remove temporary files " "in the end", action='store_true', diff --git a/src/dataset_processor.py b/src/dataset_processor.py index ec3e6007..5ed99a7d 100644 --- a/src/dataset_processor.py +++ b/src/dataset_processor.py @@ -664,7 +664,7 @@ def merge_assignments(self, sample, aggregator, chr_ids): for counter in aggregator.global_counter.counters: unaligned = self.alignment_stat_counter.stats_dict[AlignmentType.unaligned] merge_counts(counter, sample.prefix, chr_ids, unaligned) - counter.convert_counts_to_tpm() + counter.convert_counts_to_tpm(self.args.normalization_method) def merge_transcript_models(self, label, aggregator, chr_ids, gff_printer):
merge_files(gff_printer.model_fname, label, chr_ids, gff_printer.out_gff, copy_header=False) @@ -672,4 +672,4 @@ def merge_transcript_models(self, label, aggregator, chr_ids, gff_printer): for counter in aggregator.transcript_model_global_counter.counters: unaligned = self.alignment_stat_counter.stats_dict[AlignmentType.unaligned] merge_counts(counter, label, chr_ids, unaligned) - counter.convert_counts_to_tpm() + counter.convert_counts_to_tpm(self.args.normalization_method) diff --git a/src/long_read_counter.py b/src/long_read_counter.py index 823dccd0..0a813c8b 100644 --- a/src/long_read_counter.py +++ b/src/long_read_counter.py @@ -48,6 +48,12 @@ def inconsistent(self): CountingStrategy.all.name] +@unique +class NormalizationMethod(Enum): + simple = 1 + usable_reads = 2 + + class CountingStrategyFlags: def __init__(self, counting_strategy): self.use_ambiguous = counting_strategy.ambiguous() @@ -327,7 +333,8 @@ def dump(self): f.write("__not_aligned\t%d\n" % self.not_aligned_reads) f.write("__usable\t%d\n" % self.reads_for_tpm) - def convert_counts_to_tpm(self): + def convert_counts_to_tpm(self, normalization_str=NormalizationMethod.simple.name): + normalization = NormalizationMethod[normalization_str] total_counts = defaultdict(float) with open(self.output_counts_file_name) as f: for line in f: @@ -341,9 +348,11 @@ def convert_counts_to_tpm(self): total_counts[j] += float(fs[j + 1]) scale_factors = {} + unassigned_tpm = 0.0 for group_id in total_counts.keys(): - if self.ignore_read_groups and self.reads_for_tpm: + if normalization == NormalizationMethod.usable_reads and self.ignore_read_groups and self.reads_for_tpm: total_reads = self.reads_for_tpm + unassigned_tpm = 1000000.0 * (1 - total_counts[group_id] / total_reads) else: total_reads = total_counts[group_id] if total_counts[group_id] > 0 else 1.0 scale_factors[group_id] = 1000000.0 / total_reads @@ -367,6 +376,8 @@ def convert_counts_to_tpm(self): feature_id, counts = fs[0], list(map(float, fs[1:])) 
tpm_values = [scale_factors[i] * counts[i] for i in range(len(scale_factors))] outf.write("%s\t%s\n" % (feature_id, "\t".join(["%.6f" % c for c in tpm_values]))) + if self.ignore_read_groups: + outf.write("%s\t%.6f\n" % ("__unassigned", unassigned_tpm)) def create_gene_counter(output_file_name, strategy, complete_feature_list=None, @@ -424,7 +435,7 @@ def dump(self): if self.print_zeroes or incl_count > 0 or excl_count > 0: f.write("%s\t%s\t%d\t%d\n" % (feature_name, group_id, incl_count, excl_count)) - def convert_counts_to_tpm(self): + def convert_counts_to_tpm(self, normalization=NormalizationMethod.simple): return @staticmethod From 0646c31484be7e3dcd0b2243ee15afe9ef56e755 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Fri, 26 Apr 2024 14:41:27 +0300 Subject: [PATCH 18/21] fix console tests --- tests/console_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/console_test.py b/tests/console_test.py index c4f34420..d1396014 100644 --- a/tests/console_test.py +++ b/tests/console_test.py @@ -18,7 +18,7 @@ def test__help(option): print(result.returncode) assert result.returncode == 0 assert b"usage" in result.stdout - assert b"optional arguments:" in result.stdout + assert b"options:" in result.stdout def test_clean_start(): From 7b80226c6fdf3b38679fa5f4fc250ab11597efc8 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Fri, 26 Apr 2024 15:01:59 +0300 Subject: [PATCH 19/21] fix inconsistency tests --- tests/test_long_read_assigner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_long_read_assigner.py b/tests/test_long_read_assigner.py index 6f3d5ec8..12201fcc 100644 --- a/tests/test_long_read_assigner.py +++ b/tests/test_long_read_assigner.py @@ -542,7 +542,7 @@ def test_assign_inconsistent(self, sorted_blocks, polya_pos, polyt_pos, isoform_ PolyAInfo(polya_pos, polyt_pos, -1, -1)) read_assignment = self.assigner.assign_to_isoform("read_id", combined_profile) - assert 
read_assignment.assignment_type == ReadAssignmentType.inconsistent + assert read_assignment.assignment_type.is_inconsistent() if isoform_id: assert isoform_id in {im.assigned_transcript for im in read_assignment.isoform_matches} if expected_event: From 07006909aadf78f7437c862de32f9fe00758853d Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Fri, 26 Apr 2024 15:33:01 +0300 Subject: [PATCH 20/21] various options tests --- .github/workflows/SIRVs.Set4.options.yml | 63 ++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 .github/workflows/SIRVs.Set4.options.yml diff --git a/.github/workflows/SIRVs.Set4.options.yml b/.github/workflows/SIRVs.Set4.options.yml new file mode 100644 index 00000000..fed58328 --- /dev/null +++ b/.github/workflows/SIRVs.Set4.options.yml @@ -0,0 +1,63 @@ +name: SIRVs real ONT various options + +on: + workflow_dispatch: + schedule: + - cron: '0 21 * * 6' + +env: + RUN_NAME: SIRVs.Set4.R10.OPTS + LAUNCHER: ${{github.workspace}}/tests/github/run_pipeline.py + CFG_DIR: /abga/work/andreyp/ci_isoquant/data + BIN_PATH: /abga/work/andreyp/ci_isoquant/bin/ + OUTPUT_BASE: /abga/work/andreyp/ci_isoquant/output/${{github.ref_name}}/ + +concurrency: + group: ${{github.workflow}} + cancel-in-progress: false + +jobs: + launch-runner: + runs-on: + labels: [self-hosted] + name: 'Running IsoQuant and QC' + + steps: + - name: 'Cleanup' + run: > + set -e && + shopt -s dotglob && + rm -rf * + + - name: 'Checkout' + uses: actions/checkout@v3 + with: + fetch-depth: 1 + + - name: 'IsoQuant SIRVs R10.4 pipeline options' + if: always() + shell: bash + env: + STEP_NAME: SIRVs.Set4.R10.opts1 + run: | + export PATH=$PATH:${{env.BIN_PATH}} + python3 ${{env.LAUNCHER}} ${{env.CFG_DIR}}/${{env.STEP_NAME}}.cfg -o ${{env.OUTPUT_BASE}} + + - name: 'IsoQuant SIRVs R10.4 read filtering' + if: always() + shell: bash + env: + STEP_NAME: SIRVs.Set4.R10.opts2 + run: | + export PATH=$PATH:${{env.BIN_PATH}} + python3 ${{env.LAUNCHER}} 
${{env.CFG_DIR}}/${{env.STEP_NAME}}.cfg -o ${{env.OUTPUT_BASE}} + + - name: 'IsoQuant SIRVs R10.4 strategies' + if: always() + shell: bash + env: + STEP_NAME: SIRVs.Set4.R10.opts3 + run: | + export PATH=$PATH:${{env.BIN_PATH}} + python3 ${{env.LAUNCHER}} ${{env.CFG_DIR}}/${{env.STEP_NAME}}.cfg -o ${{env.OUTPUT_BASE}} + From 5154915b551dae0f252cfdc56d07b4fd4bd91190 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Mon, 6 May 2024 14:15:34 +0300 Subject: [PATCH 21/21] fix counting strategy flags --- src/long_read_counter.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/long_read_counter.py b/src/long_read_counter.py index 0a813c8b..ba4f4a18 100644 --- a/src/long_read_counter.py +++ b/src/long_read_counter.py @@ -34,12 +34,12 @@ def ambiguous(self): return self in [CountingStrategy.all, CountingStrategy.with_ambiguous] def inconsistent_minor(self): - return self == CountingStrategy.unique_splicing_consistent - - def inconsistent(self): return self in [CountingStrategy.unique_splicing_consistent, CountingStrategy.unique_inconsistent, CountingStrategy.all] + def inconsistent(self): + return self in [CountingStrategy.unique_inconsistent, CountingStrategy.all] + COUNTING_STRATEGIES = [CountingStrategy.unique_only.name, CountingStrategy.with_ambiguous.name, @@ -108,7 +108,7 @@ def __init__(self, strategy_str): def process_inconsistent(self, assignment_type, feature_count): # use only for inconsistent assignments if assignment_type == ReadAssignmentType.inconsistent_ambiguous or feature_count > 1: - if self.strategy_flags.use_ambiguous: + if self.strategy_flags.use_ambiguous and self.strategy_flags.use_inconsistent: return 1.0 / feature_count else: return 0.0