From d6b4bfbb1b5673f9e8316698f2db0c706fdac73a Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Fri, 12 Apr 2024 15:17:29 +0300 Subject: [PATCH] improve transcript merger by using exonic regions --- src/common.py | 58 +++++++++++++++++++++++++++ src/graph_based_model_construction.py | 20 ++++----- 2 files changed, 69 insertions(+), 9 deletions(-) diff --git a/src/common.py b/src/common.py index e7957580..70ac72c0 100644 --- a/src/common.py +++ b/src/common.py @@ -306,6 +306,64 @@ def jaccard_similarity(sorted_range_list1, sorted_range_list2): return float(intersection) / float(union) +def merge_ranges(sorted_range_list1, sorted_range_list2): + union = [] + intersection = 0 + pos1 = 0 + pos2 = 0 + included1 = [0 for i in range(len(sorted_range_list1))] + included2 = [0 for i in range(len(sorted_range_list2))] + + while pos1 < len(sorted_range_list1) and pos2 < len(sorted_range_list2): + block1 = sorted_range_list1[pos1] + block2 = sorted_range_list2[pos2] + if overlaps(block1, block2): + assert (included2[pos2] == 0 or included1[pos1] == 0) + + intersection += min(block1[1], block2[1]) - max(block1[0], block2[0]) + 1 + if included2[pos2] == 0 and included1[pos1] == 0: + # both blocks were not counted + union.append((min(block1[0], block2[0]), max(block1[1], block2[1]))) + elif included2[pos2] == 1: + # second block was included, take extra bite from block1 + last_block = union[-1] + union[-1] = (last_block[0], max(last_block[1], block1[1])) + else: + # first block was included, take extra bite from block2 + last_block = union[-1] + union[-1] = (last_block[0], max(last_block[1], block2[1])) + + included1[pos1] = 1 + included2[pos2] = 1 + if block2[1] < block1[1]: + pos2 += 1 + else: + pos1 += 1 + elif left_of(block2, block1): + if included2[pos2] == 0: + union.append(block2) + included2[pos2] = 1 + pos2 += 1 + else: + if included1[pos1] == 0: + union.append(block1) + included1[pos1] = 1 + pos1 += 1 + + assert pos1 == len(sorted_range_list1) or pos2 == len(sorted_range_list2) + while pos1 < len(sorted_range_list1): + if included1[pos1] == 0: + union.append(sorted_range_list1[pos1]) + pos1 += 1 + while pos2 < len(sorted_range_list2): + if included2[pos2] == 0: + union.append(sorted_range_list2[pos2]) + pos2 += 1 + + assert len(union) != 0 + return union + + def read_coverage_fraction(read_range_list, isoform_range_list): intersection = 0 pos1 = 0 diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index b09b3369..375d6c58 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -18,6 +18,8 @@ interval_len, junctions_from_blocks, read_coverage_fraction, + jaccard_similarity, + merge_ranges ) from .assignment_io import ReadAssignmentType from .gene_info import GeneInfo, StrandDetector, TranscriptModel, TranscriptModelType @@ -924,10 +926,10 @@ def __init__(self, transcipt_model_storage): self.transcipt_model_storage = transcipt_model_storage self.gene_introns = {} self.gene_strands = {} - self.gene_regions = {} + self.gene_exon_regions = {} self.gene_to_transcripts = {} for t in self.transcipt_model_storage: - self.gene_regions[t.gene_id] = (t.get_start(), t.get_end()) + self.gene_exon_regions[t.gene_id] = t.exon_blocks self.gene_introns[t.gene_id] = set(junctions_from_blocks(t.exon_blocks)) self.gene_strands[t.gene_id] = t.strand self.gene_to_transcripts[t.gene_id] = {t.transcript_id} @@ -939,9 +941,9 @@ def count_score(self, gene1, gene2): return 0.0 intronic_overlap = len(self.gene_introns[gene1].intersection(self.gene_introns[gene2])) / \ max(1, len(self.gene_introns[gene1].union(self.gene_introns[gene2]))) - range1 = self.gene_regions[gene1] - range2 = self.gene_regions[gene2] - position_overlap = intersection_len(range1, range2) / (max(range1[1], range2[1]) - min(range1[0], range2[0]) + 1) + exonic_ranges1 = self.gene_exon_regions[gene1] + exonic_ranges2 = self.gene_exon_regions[gene2] + position_overlap = jaccard_similarity(exonic_ranges1, exonic_ranges2) return position_overlap + intronic_overlap def count_scores(self): @@ -955,14 +957,14 @@ def count_scores(self): def merge_genes(self, gene1, gene2): logger.debug("Merging %s into %s" % (gene2, gene1)) - range1 = self.gene_regions[gene1] - range2 = self.gene_regions[gene2] - self.gene_regions[gene1] = (min(range1[0], range2[0]), max(range1[1], range2[1])) + exonic_ranges1 = self.gene_exon_regions[gene1] + exonic_ranges2 = self.gene_exon_regions[gene2] + self.gene_exon_regions[gene1] = merge_ranges(exonic_ranges1, exonic_ranges2) self.gene_introns[gene1].update(self.gene_introns[gene2]) self.gene_to_transcripts[gene1].update(self.gene_to_transcripts[gene2]) if self.gene_strands[gene2] != self.gene_strands[gene1]: logger.error("Merging genes with different strands: %s, %s" % (gene1, gene2)) - del self.gene_regions[gene2] + del self.gene_exon_regions[gene2] del self.gene_introns[gene2] del self.gene_to_transcripts[gene2] del self.gene_strands[gene2]