diff --git a/src/alignment_processor.py b/src/alignment_processor.py index 2022ee3f..24f038e5 100644 --- a/src/alignment_processor.py +++ b/src/alignment_processor.py @@ -284,7 +284,7 @@ def process_alignments_in_region(self, current_region, alignment_storage): if gene_info.empty(): assignment_storage = self.process_intergenic(alignment_storage, current_region) else: - assignment_storage = self.process_genic(alignment_storage, gene_info) + assignment_storage = self.process_genic(alignment_storage, gene_info, current_region) return gene_info, assignment_storage @@ -332,6 +332,7 @@ def process_intergenic(self, alignment_storage, region): alignment_info.polya_info.internal_polyt_pos != -1) read_assignment.polya_info = alignment_info.polya_info read_assignment.cage_found = len(alignment_info.cage_hits) > 0 + read_assignment.genomic_region = region read_assignment.exons = alignment_info.read_exons read_assignment.corrected_exons = corrector.correct_read(alignment_info) read_assignment.corrected_introns = junctions_from_blocks(read_assignment.corrected_exons) @@ -346,7 +347,7 @@ def process_intergenic(self, alignment_storage, region): assignment_storage.append(read_assignment) return assignment_storage - def process_genic(self, alignment_storage, gene_info): + def process_genic(self, alignment_storage, gene_info, region): assigner = LongReadAssigner(gene_info, self.params) profile_constructor = CombinedProfileConstructor(gene_info, self.params) exon_corrector = ExonCorrector(gene_info, self.params, self.chr_record) @@ -388,6 +389,7 @@ def process_genic(self, alignment_storage, gene_info): alignment_info.polya_info.internal_polyt_pos != -1) read_assignment.polya_info = alignment_info.polya_info read_assignment.cage_found = len(alignment_info.cage_hits) > 0 + read_assignment.genomic_region = region read_assignment.exons = alignment_info.read_exons read_assignment.corrected_exons = exon_corrector.correct_assigned_read(alignment_info, read_assignment) diff --git a/src/dataset_processor.py b/src/dataset_processor.py index 884930cf..8ddb4a6c 100644 --- a/src/dataset_processor.py +++ b/src/dataset_processor.py @@ -353,7 +353,7 @@ def finalize_aggregators(self, sample): logger.info("Gene counts are stored in " + self.gene_counter.output_counts_file_name) logger.info("Transcript counts are stored in " + self.transcript_counter.output_counts_file_name) logger.info("Read assignments are stored in " + self.basic_printer.output_file_name + - ".gz" if self.args.gzipped else "") + (".gz" if self.args.gzipped else "")) self.read_stat_counter.print_start("Read assignment statistics") diff --git a/src/isoform_assignment.py b/src/isoform_assignment.py index 916b59f4..41c1b525 100644 --- a/src/isoform_assignment.py +++ b/src/isoform_assignment.py @@ -467,6 +467,12 @@ def __init__(self, read_assignment): self.assignment_id = read_assignment.assignment_id self.read_id = read_assignment.read_id self.chr_id = read_assignment.chr_id + self.start = 0 + self.end = 0 + if read_assignment.exons: + self.start = read_assignment.exons[0][0] + self.end = read_assignment.exons[-1][1] + self.genomic_region = read_assignment.genomic_region self.multimapper = read_assignment.multimapper self.polyA_found = read_assignment.polyA_found self.assignment_type = read_assignment.assignment_type @@ -487,12 +493,24 @@ def __init__(self, read_assignment): self.genes = list(gene_set) self.isoforms = list(isoform_set) + def __eq__(self, other): + if isinstance(other, BasicReadAssignment): + return (self.read_id == other.read_id and + self.chr_id == other.chr_id and + self.start == other.start and + self.end == other.end and + self.isoforms == other.isoforms) + return False + @classmethod def deserialize(cls, infile): read_assignment = cls.__new__(cls) read_assignment.assignment_id = read_int(infile) read_assignment.read_id = read_string(infile) read_assignment.chr_id = read_string(infile) + read_assignment.start = read_int(infile) + read_assignment.end = read_int(infile) + read_assignment.genomic_region = (read_int(infile), read_int(infile)) bool_arr = read_bool_array(infile, 2) read_assignment.multimapper = bool_arr[0] read_assignment.polyA_found = bool_arr[1] @@ -507,6 +525,10 @@ def serialize(self, outfile): write_int(self.assignment_id, outfile) write_string(self.read_id, outfile) write_string(self.chr_id, outfile) + write_int(self.start, outfile) + write_int(self.end, outfile) + write_int(self.genomic_region[0], outfile) + write_int(self.genomic_region[1], outfile) write_bool_array([self.multimapper, self.polyA_found], outfile) write_short_int(self.assignment_type.value, outfile) write_short_int(self.gene_assignment_type.value, outfile) @@ -520,6 +542,7 @@ class ReadAssignment: def __init__(self, read_id, assignment_type, match=None): self.assignment_id = ReadAssignment.assignment_id_generator.increment() self.read_id = read_id + self.genomic_region = (0, 0) self.exons = None self.corrected_exons = None self.corrected_introns = None @@ -562,6 +585,7 @@ def deserialize(cls, infile, gene_info): read_assignment = cls.__new__(cls) read_assignment.assignment_id = read_int(infile) read_assignment.read_id = read_string(infile) + read_assignment.genomic_region = (read_int(infile), read_int(infile)) read_assignment.exons = read_list_of_pairs(infile, read_int) read_assignment.corrected_exons = read_list_of_pairs(infile, read_int) read_assignment.corrected_introns = junctions_from_blocks(read_assignment.corrected_exons) @@ -589,6 +613,8 @@ def deserialize(cls, infile, gene_info): def serialize(self, outfile): write_int(self.assignment_id, outfile) write_string(self.read_id, outfile) + write_int(self.genomic_region[0], outfile) + write_int(self.genomic_region[1], outfile) write_list_of_pairs(self.exons, outfile, write_int) write_list_of_pairs(self.corrected_exons, outfile, write_int) write_bool_array([self.multimapper, self.polyA_found, self.cage_found], outfile) diff --git a/src/multimap_resolver.py b/src/multimap_resolver.py index 0583b06f..07a70ef2 100644 --- a/src/multimap_resolver.py +++ b/src/multimap_resolver.py @@ -6,6 +6,7 @@ import logging from enum import Enum +from collections import defaultdict from .isoform_assignment import ReadAssignmentType @@ -19,6 +20,7 @@ class MultimapResolvingStrategy(Enum): class MultimapResolver: + duplicate_counter = 0 def __init__(self, strategy): self.strategy = strategy @@ -44,20 +46,20 @@ def resolve(self, assignment_list): def select_best_assignment(self, assignment_list): logger.debug("Resolving read %s" % assignment_list[0].read_id) - primary_unique = set() - consistent_assignments = set() - inconsistent_assignments = set() - primary_inconsistent = set() + primary_unique = [] + consistent_assignments = [] + inconsistent_assignments = [] + primary_inconsistent = [] for i, a in enumerate(assignment_list): if a.assignment_type.is_inconsistent(): if not a.multimapper: - primary_inconsistent.add(i) - inconsistent_assignments.add(i) + primary_inconsistent.append(i) + inconsistent_assignments.append(i) elif a.assignment_type.is_consistent(): - consistent_assignments.add(i) + consistent_assignments.append(i) if not a.multimapper and not a.assignment_type == ReadAssignmentType.ambiguous: - primary_unique.add(i) + primary_unique.append(i) if primary_unique: return self.filter_assignments(assignment_list, primary_unique) @@ -98,6 +100,8 @@ def merge_assignments(assignment_list): # if selected assignments feature more than 1 gene/isoform - mark as ambiguous accordingly @staticmethod def filter_assignments(assignment_list, assignments_to_keep, mark_as_ambiguous=True): + assignments_to_keep = MultimapResolver.find_duplicates(assignment_list, assignments_to_keep) + all_genes = set() all_isoforms = set() for i in assignments_to_keep: @@ -108,9 +112,10 @@ def filter_assignments(assignment_list, assignments_to_keep, mark_as_ambiguous=T change_transcript_assignment_type = mark_as_ambiguous and len(all_isoforms) > 1 change_gene_assignment_type = mark_as_ambiguous and len(all_genes) > 1 + assignments_to_keep_set = set(assignments_to_keep) for i in range(len(assignment_list)): assignment = assignment_list[i] - if i in assignments_to_keep: + if i in assignments_to_keep_set: if assignment.assignment_type.is_inconsistent(): ambiguity_assignment_type = ReadAssignmentType.inconsistent_ambiguous else: @@ -127,3 +132,53 @@ def filter_assignments(assignment_list, assignments_to_keep, mark_as_ambiguous=T assignment.gene_assignment_type = ReadAssignmentType.suspended return assignment_list + + # finds groups of duplicated assignments + @staticmethod + def find_duplicates(assignment_list, assignment_indices): + if len(assignment_indices) <= 1: + return assignment_indices + + selected_assignments = [] + discarded_duplicates = set() + for i in range(len(assignment_indices)): + index1 = assignment_indices[i] + if index1 in discarded_duplicates: + # this assignment was already discarded - no need to compare it + continue + # if it was not discarded earlier - keep anyway; next assignments will either duplicate it or won't + selected_assignments.append(index1) + + for j in range(i + 1, len(assignment_indices)): + # find duplicating assignments among the next onex + index2 = assignment_indices[j] + if index2 in discarded_duplicates: + # this assignment was already discarded - no need to compare it + continue + + if assignment_list[index1] == assignment_list[index2]: + # discard duplicating assignment with larger index + discarded_duplicates.add(index2) + + # check if duplicates come from the same region, + # if they are - it means we have identical records in the BAM files, print a warning + if assignment_list[index1].genomic_region == assignment_list[index2].genomic_region: + MultimapResolver.duplicate_counter += 1 + example_assignment = assignment_list[index1] + if MultimapResolver.duplicate_counter == 5: + logger.warning( + "More possible duplicates were detected but will not be printed to limit the log size. " + "All duplicated entries will be ignored. " + "Use --debug to see all of them in the isoquant.log.") + if MultimapResolver.duplicate_counter >= 5: + logger.debug( + "Read %s seems to have duplicated alignment entries at %s: %d and will be ignored. " + "Please, check you input." % + (example_assignment.read_id, example_assignment.chr_id, example_assignment.start)) + else: + logger.warning( + "Read %s seems to have duplicated alignment entries at %s: %d and will be ignored. " + "Please, check you input." + % (example_assignment.read_id, example_assignment.chr_id, example_assignment.start)) + + return selected_assignments