Skip to content

Commit

Permalink
improve transcript merger by using exonic regions
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewprzh committed Apr 16, 2024
1 parent c23eef8 commit d6b4bfb
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 9 deletions.
58 changes: 58 additions & 0 deletions src/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,64 @@ def jaccard_similarity(sorted_range_list1, sorted_range_list2):
return float(intersection) / float(union)


def merge_ranges(sorted_range_list1, sorted_range_list2):
union = []
intersection = 0
pos1 = 0
pos2 = 0
included1 = [0 for i in range(len(sorted_range_list1))]
included2 = [0 for i in range(len(sorted_range_list2))]

while pos1 < len(sorted_range_list1) and pos2 < len(sorted_range_list2):
block1 = sorted_range_list1[pos1]
block2 = sorted_range_list2[pos2]
if overlaps(block1, block2):
assert (included2[pos2] == 0 or included1[pos1] == 0)

intersection += min(block1[1], block2[1]) - max(block1[0], block2[0]) + 1
if included2[pos2] == 0 and included1[pos1] == 0:
# both blocks were not counted
union.append((min(block1[0], block2[0]), max(block1[1], block2[1])))
elif included2[pos2] == 1:
# second block was included, take extra bite from block1
last_block = union[-1]
union[-1] = (last_block[0], max(last_block[1], block1[1]))
else:
# first block was included, take extra bite from block2
last_block = union[-1]
union[-1] = (last_block[0], max(last_block[1], block2[1]))

included1[pos1] = 1
included2[pos2] = 1
if block2[1] < block1[1]:
pos2 += 1
else:
pos1 += 1
elif left_of(block2, block1):
if included2[pos2] == 0:
union.append(block2)
included2[pos2] = 1
pos2 += 1
else:
if included1[pos1] == 0:
union.append(block1)
included1[pos1] = 1
pos1 += 1

assert pos1 == len(sorted_range_list1) or pos2 == len(sorted_range_list2)
while pos1 < len(sorted_range_list1):
if included1[pos1] == 0:
union.append(sorted_range_list1[pos1])
pos1 += 1
while pos2 < len(sorted_range_list2):
if included2[pos2] == 0:
union.append(sorted_range_list2[pos2])
pos2 += 1

assert len(union) != 0
return union


def read_coverage_fraction(read_range_list, isoform_range_list):
intersection = 0
pos1 = 0
Expand Down
20 changes: 11 additions & 9 deletions src/graph_based_model_construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
interval_len,
junctions_from_blocks,
read_coverage_fraction,
jaccard_similarity,
merge_ranges
)
from .assignment_io import ReadAssignmentType
from .gene_info import GeneInfo, StrandDetector, TranscriptModel, TranscriptModelType
Expand Down Expand Up @@ -924,10 +926,10 @@ def __init__(self, transcipt_model_storage):
self.transcipt_model_storage = transcipt_model_storage
self.gene_introns = {}
self.gene_strands = {}
self.gene_regions = {}
self.gene_exon_regions = {}
self.gene_to_transcripts = {}
for t in self.transcipt_model_storage:
self.gene_regions[t.gene_id] = (t.get_start(), t.get_end())
self.gene_exon_regions[t.gene_id] = t.exon_blocks
self.gene_introns[t.gene_id] = set(junctions_from_blocks(t.exon_blocks))
self.gene_strands[t.gene_id] = t.strand
self.gene_to_transcripts[t.gene_id] = {t.transcript_id}
Expand All @@ -939,9 +941,9 @@ def count_score(self, gene1, gene2):
return 0.0
intronic_overlap = len(self.gene_introns[gene1].intersection(self.gene_introns[gene2])) / \
max(1, len(self.gene_introns[gene1].union(self.gene_introns[gene2])))
range1 = self.gene_regions[gene1]
range2 = self.gene_regions[gene2]
position_overlap = intersection_len(range1, range2) / (max(range1[1], range2[1]) - min(range1[0], range2[0]) + 1)
exonic_ranges1 = self.gene_exon_regions[gene1]
exonic_ranges2 = self.gene_exon_regions[gene2]
position_overlap = jaccard_similarity(exonic_ranges1, exonic_ranges2)
return position_overlap + intronic_overlap

def count_scores(self):
Expand All @@ -955,14 +957,14 @@ def count_scores(self):

def merge_genes(self, gene1, gene2):
logger.debug("Merging %s into %s" % (gene2, gene1))
range1 = self.gene_regions[gene1]
range2 = self.gene_regions[gene2]
self.gene_regions[gene1] = (min(range1[0], range2[0]), max(range1[1], range2[1]))
exonic_ranges1 = self.gene_exon_regions[gene1]
exonic_ranges2 = self.gene_exon_regions[gene2]
self.gene_exon_regions[gene1] = merge_ranges(exonic_ranges1, exonic_ranges2)
self.gene_introns[gene1].update(self.gene_introns[gene2])
self.gene_to_transcripts[gene1].update(self.gene_to_transcripts[gene2])
if self.gene_strands[gene2] != self.gene_strands[gene1]:
logger.error("Merging genes with different strands: %s, %s" % (gene1, gene2))
del self.gene_regions[gene2]
del self.gene_exon_regions[gene2]
del self.gene_introns[gene2]
del self.gene_to_transcripts[gene2]
del self.gene_strands[gene2]
Expand Down

0 comments on commit d6b4bfb

Please sign in to comment.