Skip to content

Commit

Permalink
proper duplicate detection and removal
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewprzh committed May 7, 2024
1 parent 7efea56 commit 557e583
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 12 deletions.
6 changes: 4 additions & 2 deletions src/alignment_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ def process_alignments_in_region(self, current_region, alignment_storage):
if gene_info.empty():
assignment_storage = self.process_intergenic(alignment_storage, current_region)
else:
assignment_storage = self.process_genic(alignment_storage, gene_info)
assignment_storage = self.process_genic(alignment_storage, gene_info, current_region)

return gene_info, assignment_storage

Expand Down Expand Up @@ -332,6 +332,7 @@ def process_intergenic(self, alignment_storage, region):
alignment_info.polya_info.internal_polyt_pos != -1)
read_assignment.polya_info = alignment_info.polya_info
read_assignment.cage_found = len(alignment_info.cage_hits) > 0
read_assignment.genomic_region = region
read_assignment.exons = alignment_info.read_exons
read_assignment.corrected_exons = corrector.correct_read(alignment_info)
read_assignment.corrected_introns = junctions_from_blocks(read_assignment.corrected_exons)
Expand All @@ -346,7 +347,7 @@ def process_intergenic(self, alignment_storage, region):
assignment_storage.append(read_assignment)
return assignment_storage

def process_genic(self, alignment_storage, gene_info):
def process_genic(self, alignment_storage, gene_info, region):
assigner = LongReadAssigner(gene_info, self.params)
profile_constructor = CombinedProfileConstructor(gene_info, self.params)
exon_corrector = ExonCorrector(gene_info, self.params, self.chr_record)
Expand Down Expand Up @@ -388,6 +389,7 @@ def process_genic(self, alignment_storage, gene_info):
alignment_info.polya_info.internal_polyt_pos != -1)
read_assignment.polya_info = alignment_info.polya_info
read_assignment.cage_found = len(alignment_info.cage_hits) > 0
read_assignment.genomic_region = region
read_assignment.exons = alignment_info.read_exons
read_assignment.corrected_exons = exon_corrector.correct_assigned_read(alignment_info,
read_assignment)
Expand Down
2 changes: 1 addition & 1 deletion src/dataset_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,7 @@ def finalize_aggregators(self, sample):
logger.info("Gene counts are stored in " + self.gene_counter.output_counts_file_name)
logger.info("Transcript counts are stored in " + self.transcript_counter.output_counts_file_name)
logger.info("Read assignments are stored in " + self.basic_printer.output_file_name +
".gz" if self.args.gzipped else "")
(".gz" if self.args.gzipped else ""))
self.read_stat_counter.print_start("Read assignment statistics")


Expand Down
26 changes: 26 additions & 0 deletions src/isoform_assignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,12 @@ def __init__(self, read_assignment):
self.assignment_id = read_assignment.assignment_id
self.read_id = read_assignment.read_id
self.chr_id = read_assignment.chr_id
self.start = 0
self.end = 0
if read_assignment.exons:
self.start = read_assignment.exons[0][0]
self.end = read_assignment.exons[-1][1]
self.genomic_region = read_assignment.genomic_region
self.multimapper = read_assignment.multimapper
self.polyA_found = read_assignment.polyA_found
self.assignment_type = read_assignment.assignment_type
Expand All @@ -487,12 +493,24 @@ def __init__(self, read_assignment):
self.genes = list(gene_set)
self.isoforms = list(isoform_set)

def __eq__(self, other):
if isinstance(other, BasicReadAssignment):
return (self.read_id == other.read_id and
self.chr_id == other.chr_id and
self.start == other.start and
self.end == other.end and
self.isoforms == other.isoforms)
return False

@classmethod
def deserialize(cls, infile):
read_assignment = cls.__new__(cls)
read_assignment.assignment_id = read_int(infile)
read_assignment.read_id = read_string(infile)
read_assignment.chr_id = read_string(infile)
read_assignment.start = read_int(infile)
read_assignment.end = read_int(infile)
read_assignment.genomic_region = (read_int(infile), read_int(infile))
bool_arr = read_bool_array(infile, 2)
read_assignment.multimapper = bool_arr[0]
read_assignment.polyA_found = bool_arr[1]
Expand All @@ -507,6 +525,10 @@ def serialize(self, outfile):
write_int(self.assignment_id, outfile)
write_string(self.read_id, outfile)
write_string(self.chr_id, outfile)
write_int(self.start, outfile)
write_int(self.end, outfile)
write_int(self.genomic_region[0], outfile)
write_int(self.genomic_region[1], outfile)
write_bool_array([self.multimapper, self.polyA_found], outfile)
write_short_int(self.assignment_type.value, outfile)
write_short_int(self.gene_assignment_type.value, outfile)
Expand All @@ -520,6 +542,7 @@ class ReadAssignment:
def __init__(self, read_id, assignment_type, match=None):
self.assignment_id = ReadAssignment.assignment_id_generator.increment()
self.read_id = read_id
self.genomic_region = (0, 0)
self.exons = None
self.corrected_exons = None
self.corrected_introns = None
Expand Down Expand Up @@ -562,6 +585,7 @@ def deserialize(cls, infile, gene_info):
read_assignment = cls.__new__(cls)
read_assignment.assignment_id = read_int(infile)
read_assignment.read_id = read_string(infile)
read_assignment.genomic_region = (read_int(infile), read_int(infile))
read_assignment.exons = read_list_of_pairs(infile, read_int)
read_assignment.corrected_exons = read_list_of_pairs(infile, read_int)
read_assignment.corrected_introns = junctions_from_blocks(read_assignment.corrected_exons)
Expand Down Expand Up @@ -589,6 +613,8 @@ def deserialize(cls, infile, gene_info):
def serialize(self, outfile):
write_int(self.assignment_id, outfile)
write_string(self.read_id, outfile)
write_int(self.genomic_region[0], outfile)
write_int(self.genomic_region[1], outfile)
write_list_of_pairs(self.exons, outfile, write_int)
write_list_of_pairs(self.corrected_exons, outfile, write_int)
write_bool_array([self.multimapper, self.polyA_found, self.cage_found], outfile)
Expand Down
73 changes: 64 additions & 9 deletions src/multimap_resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import logging
from enum import Enum
from collections import defaultdict

from .isoform_assignment import ReadAssignmentType

Expand All @@ -19,6 +20,7 @@ class MultimapResolvingStrategy(Enum):


class MultimapResolver:
duplicate_counter = 0

def __init__(self, strategy):
    """Create a resolver; strategy is a MultimapResolvingStrategy value."""
    self.strategy = strategy
Expand All @@ -44,20 +46,20 @@ def resolve(self, assignment_list):

def select_best_assignment(self, assignment_list):
logger.debug("Resolving read %s" % assignment_list[0].read_id)
primary_unique = set()
consistent_assignments = set()
inconsistent_assignments = set()
primary_inconsistent = set()
primary_unique = []
consistent_assignments = []
inconsistent_assignments = []
primary_inconsistent = []

for i, a in enumerate(assignment_list):
if a.assignment_type.is_inconsistent():
if not a.multimapper:
primary_inconsistent.add(i)
inconsistent_assignments.add(i)
primary_inconsistent.append(i)
inconsistent_assignments.append(i)
elif a.assignment_type.is_consistent():
consistent_assignments.add(i)
consistent_assignments.append(i)
if not a.multimapper and not a.assignment_type == ReadAssignmentType.ambiguous:
primary_unique.add(i)
primary_unique.append(i)

if primary_unique:
return self.filter_assignments(assignment_list, primary_unique)
Expand Down Expand Up @@ -98,6 +100,8 @@ def merge_assignments(assignment_list):
# if selected assignments feature more than 1 gene/isoform - mark as ambiguous accordingly
@staticmethod
def filter_assignments(assignment_list, assignments_to_keep, mark_as_ambiguous=True):
assignments_to_keep = MultimapResolver.find_duplicates(assignment_list, assignments_to_keep)

all_genes = set()
all_isoforms = set()
for i in assignments_to_keep:
Expand All @@ -108,9 +112,10 @@ def filter_assignments(assignment_list, assignments_to_keep, mark_as_ambiguous=T
change_transcript_assignment_type = mark_as_ambiguous and len(all_isoforms) > 1
change_gene_assignment_type = mark_as_ambiguous and len(all_genes) > 1

assignments_to_keep_set = set(assignments_to_keep)
for i in range(len(assignment_list)):
assignment = assignment_list[i]
if i in assignments_to_keep:
if i in assignments_to_keep_set:
if assignment.assignment_type.is_inconsistent():
ambiguity_assignment_type = ReadAssignmentType.inconsistent_ambiguous
else:
Expand All @@ -127,3 +132,53 @@ def filter_assignments(assignment_list, assignments_to_keep, mark_as_ambiguous=T
assignment.gene_assignment_type = ReadAssignmentType.suspended

return assignment_list

# finds groups of duplicated assignments
@staticmethod
def find_duplicates(assignment_list, assignment_indices):
if len(assignment_indices) <= 1:
return assignment_indices

selected_assignments = []
discarded_duplicates = set()
for i in range(len(assignment_indices)):
index1 = assignment_indices[i]
if index1 in discarded_duplicates:
# this assignment was already discarded - no need to compare it
continue
# if it was not discarded earlier - keep anyway; next assignments will either duplicate it or won't
selected_assignments.append(index1)

for j in range(i + 1, len(assignment_indices)):
# find duplicating assignments among the next onex
index2 = assignment_indices[j]
if index2 in discarded_duplicates:
# this assignment was already discarded - no need to compare it
continue

if assignment_list[index1] == assignment_list[index2]:
# discard duplicating assignment with larger index
discarded_duplicates.add(index2)

# check if duplicates come from the same region,
# if they are - it means we have identical records in the BAM files, print a warning
if assignment_list[index1].genomic_region == assignment_list[index2].genomic_region:
MultimapResolver.duplicate_counter += 1
example_assignment = assignment_list[index1]
if MultimapResolver.duplicate_counter == 5:
logger.warning(
"More possible duplicates were detected but will not be printed to limit the log size. "
"All duplicated entries will be ignored. "
"Use --debug to see all of them in the isoquant.log.")
if MultimapResolver.duplicate_counter >= 5:
logger.debug(
"Read %s seems to have duplicated alignment entries at %s: %d and will be ignored. "
"Please, check you input." %
(example_assignment.read_id, example_assignment.chr_id, example_assignment.start))
else:
logger.warning(
"Read %s seems to have duplicated alignment entries at %s: %d and will be ignored. "
"Please, check you input."
% (example_assignment.read_id, example_assignment.chr_id, example_assignment.start))

return selected_assignments

0 comments on commit 557e583

Please sign in to comment.