From 6acfb0471e38925d6fc8962a599fe535b88aa5f2 Mon Sep 17 00:00:00 2001
From: Andrey Prjibelski
Date: Tue, 7 May 2024 01:38:23 +0300
Subject: [PATCH 1/5] fix model prefiltering

---
 src/graph_based_model_construction.py | 28 +++++++++++-----------------
 1 file changed, 11 insertions(+), 17 deletions(-)

diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py
index 375d6c58..b7e65e41 100644
--- a/src/graph_based_model_construction.py
+++ b/src/graph_based_model_construction.py
@@ -256,19 +256,23 @@ def pre_filter_transcripts(self):
             if (model.transcript_type != TranscriptModelType.known and
                     self.internal_counter[model.transcript_id] < coverage_cutoff):
-                del self.transcript_read_ids[model.transcript_id]
-                del self.internal_counter[model.transcript_id]
+                self.delete_from_storage(model.transcript_id)
                 continue
 
             if (model.transcript_type != TranscriptModelType.known and
                     self.mapping_quality(model.transcript_id) < self.params.simple_models_mapq_cutoff):
-                del self.transcript_read_ids[model.transcript_id]
-                del self.internal_counter[model.transcript_id]
+                self.delete_from_storage(model.transcript_id)
                 continue
 
             filtered_storage.append(model)
 
         self.transcript_model_storage = filtered_storage
 
+    def delete_from_storage(self, transcript_id):
+        for a in self.transcript_read_ids[transcript_id]:
+            self.read_assignment_counts[a.read_id] -= 1
+        del self.transcript_read_ids[transcript_id]
+        del self.internal_counter[transcript_id]
+
     def filter_transcripts(self):
         filtered_storage = []
         to_substitute = self.detect_similar_isoforms(self.transcript_model_storage)
@@ -290,21 +294,14 @@ def filter_transcripts(self):
 
             if model.transcript_id in to_substitute:
                 #logger.debug("Novel model %s has a similar isoform %s" % (model.transcript_id, to_substitute[model.transcript_id]))
-                #self.transcript_read_ids[to_substitute[model.transcript_id]] += self.transcript_read_ids[model.transcript_id]
-                for a in self.transcript_read_ids[model.transcript_id]:
-                    self.read_assignment_counts[a.read_id] -= 1
-                del self.transcript_read_ids[model.transcript_id]
-                del self.internal_counter[model.transcript_id]
+                self.delete_from_storage(model.transcript_id)
                 continue
 
             if self.internal_counter[model.transcript_id] < novel_isoform_cutoff:
                 #logger.debug("Novel model %s has coverage %d < %.2f, component cov = %d" % (model.transcript_id,
                 #                                                                            self.internal_counter[model.transcript_id],
                 #                                                                            novel_isoform_cutoff, component_coverage))
-                for a in self.transcript_read_ids[model.transcript_id]:
-                    self.read_assignment_counts[a.read_id] -= 1
-                del self.transcript_read_ids[model.transcript_id]
-                del self.internal_counter[model.transcript_id]
+                self.delete_from_storage(model.transcript_id)
                 continue
 
             if len(model.exon_blocks) <= 2:
@@ -312,10 +309,7 @@ def filter_transcripts(self):
                 #logger.debug("Novel model %s has quality %.2f" % (model.transcript_id, mapq))
                 if mapq < self.params.simple_models_mapq_cutoff:
                     #logger.debug("Novel model %s has poor quality" % model.transcript_id)
-                    for a in self.transcript_read_ids[model.transcript_id]:
-                        self.read_assignment_counts[a.read_id] -= 1
-                    del self.transcript_read_ids[model.transcript_id]
-                    del self.internal_counter[model.transcript_id]
+                    self.delete_from_storage(model.transcript_id)
                     continue
 
         # TODO: correct ends for known

From c7b5eaacd5c1471d81a2875ee3eb86bc0 Mon Sep 17 00:00:00 2001
From: Andrey Prjibelski
Date: Tue, 7 May 2024 01:39:46 +0300
Subject: [PATCH 2/5] fix ambiguous read forwarding

---
 src/graph_based_model_construction.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py
index b7e65e41..e600eada 100644
--- a/src/graph_based_model_construction.py
+++ b/src/graph_based_model_construction.py
@@ -172,9 +172,9 @@ def forward_counts(self):
                         ambiguous_assignments[read_id] = [read_assignment.read_group]
                     ambiguous_assignments[read_id].append(transcript_id)
 
-            for read_id in ambiguous_assignments.keys():
-                self.transcript_counter.add_read_info_raw(read_id, ambiguous_assignments[read_id][1:],
-                                                          ambiguous_assignments[read_id][0])
+        for read_id in ambiguous_assignments.keys():
+            self.transcript_counter.add_read_info_raw(read_id, ambiguous_assignments[read_id][1:],
+                                                      ambiguous_assignments[read_id][0])
 
         self.transcript_counter.add_unassigned(sum(value == 0 for value in self.read_assignment_counts.values()))
         self.transcript_counter.add_confirmed_features([model.transcript_id for model in self.transcript_model_storage])
@@ -444,7 +444,7 @@ def construct_fl_isoforms(self):
                  and transcript_clean_strand == '.') or
                     (self.params.report_canonical_strategy == StrandnessReportingLevel.only_stranded
                      and transcript_strand == '.')):
-                logger.info("Avoiding unreliable transcript with %d exons (strand cannot be detected)" % len(novel_exons))
+                logger.debug("Avoiding unreliable transcript with %d exons (strand cannot be detected)" % len(novel_exons))
                 pass
             else:
                 if self.params.use_technical_replicas and \

From b366f77e5b2d6da753b1fab9713c51c23939e080 Mon Sep 17 00:00:00 2001
From: Andrey Prjibelski
Date: Tue, 7 May 2024 01:50:28 +0300
Subject: [PATCH 3/5] remove old code

---
 src/long_read_counter.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/long_read_counter.py b/src/long_read_counter.py
index 7004eaa1..f7ffa22b 100644
--- a/src/long_read_counter.py
+++ b/src/long_read_counter.py
@@ -219,11 +219,9 @@ def add_read_info_raw(self, read_id, feature_ids, group_id=AbstractReadGrouper.d
             for feature_id in feature_ids:
                 count_value = self.read_counter.process_ambiguous(len(feature_ids))
                 self.feature_counter[group_id][feature_id] += count_value
-                # self.confirmed_features.add((group_id, feature_id))
                 self.all_features.add(feature_id)
         else:
             self.feature_counter[group_id][feature_ids[0]] += 1
-            # self.confirmed_features.add((group_id, feature_ids[0]))
             self.all_features.add(feature_ids[0])
 
     def add_unassigned(self, n_reads=1):

From efc132aa99e25ec68658382819058a23bf145f36 Mon Sep 17 00:00:00 2001
From: Andrey Prjibelski
Date: Tue, 23 Apr 2024 12:43:14 +0300
Subject: [PATCH 4/5] fix error code for multi-step assignment tests

---
 src/alignment_processor.py   |  2 +-
 tests/github/run_pipeline.py | 46 ++++++++++++++++++++----------------
 2 files changed, 26 insertions(+), 22 deletions(-)

diff --git a/src/alignment_processor.py b/src/alignment_processor.py
index 0812a64b..d094889b 100644
--- a/src/alignment_processor.py
+++ b/src/alignment_processor.py
@@ -1,5 +1,5 @@
 ############################################################################
-# Copyright (c) 2020 Saint Petersburg State University
+# Copyright (c) 2024 University of Helsinki
 # # All Rights Reserved
 # See file LICENSE for details.
 ############################################################################
diff --git a/tests/github/run_pipeline.py b/tests/github/run_pipeline.py
index c9c17b4f..103a2a94 100755
--- a/tests/github/run_pipeline.py
+++ b/tests/github/run_pipeline.py
@@ -138,18 +138,18 @@ def check_value(etalon_value, output_value, name):
     if output_value < 0:
         if output_value != etalon_value:
             log.error("Value of %s = %2.2f is not equal to value %2.2f" % (name, output_value, lower_bound))
-            exit_code = -40
+            exit_code = -2
         else:
             log.info("Value of %s = %2.2f == %2.2f as expected" % (name, output_value, upper_bound))
     else:
         if output_value < lower_bound:
             log.error("Value of %s = %2.2f is lower than the expected value %2.2f" % (name, output_value, lower_bound))
-            exit_code = -41
+            exit_code = -2
         else:
             log.info("Value of %s = %2.2f >= %2.2f as expected" % (name, output_value, lower_bound))
         if output_value > upper_bound:
             log.error("Value of %s = %2.2f is higher than the expected value %2.2f" % (name, output_value, upper_bound))
-            exit_code = -42
+            exit_code = -2
         else:
             log.info("Value of %s = %2.2f <= %2.2f as expected" % (name, output_value, upper_bound))
     return exit_code
@@ -186,7 +186,7 @@ def run_assignment_quality(args, config_dict):
         bam = find_bam(output_folder, label)
         if not bam or not os.path.exists(bam):
             log.error("BAM file was not found")
-            return -21
+            return -10
     else:
         bam = fix_path(config_file, config_dict["bam"])
 
@@ -205,7 +205,7 @@ def run_assignment_quality(args, config_dict):
     result = subprocess.run(qa_command_list)
    if result.returncode != 0:
         log.error("QA exited with non-zero status: %d" % result.returncode)
-        return -13
+        return -11
 
     if "etalon" not in config_dict:
         return 0
@@ -218,7 +218,7 @@ def run_assignment_quality(args, config_dict):
     for k, v in etalon_qaulity_dict.items():
         if k not in quality_report_dict:
             log.error("Metric %s was not found in the report" % k)
-            exit_code = -22
+            exit_code = -12
             continue
         new_etalon_outf.write("%s\t%.2f\n" % (k, float(quality_report_dict[k])))
         err_code = check_value(float(v), float(quality_report_dict[k]), k)
@@ -251,7 +251,7 @@ def run_transcript_quality(args, config_dict):
     out_gtf = os.path.join(output_folder, "%s/%s.transcript_models.gtf" % (label, label))
     if not os.path.exists(out_gtf):
         log.error("Output GTF file was not found" % out_gtf)
-        return -31
+        return -20
 
     quality_output = os.path.join(output_folder, "gffcompare")
     genedb_prefix = fix_path(config_file, config_dict["reduced_db"])
@@ -262,7 +262,7 @@ def run_transcript_quality(args, config_dict):
     result = subprocess.run(qa_command_list)
     if result.returncode != 0:
         log.error("Transcript evaluation exited with non-zero status: %d" % result.returncode)
-        return -13
+        return -21
 
     if "etalon" not in config_dict:
         return 0
@@ -311,14 +311,14 @@ def run_quantification(args, config_dict, mode):
 
     if not os.path.exists(out_tpm):
         log.error("Output TPM file %s was not found" % out_tpm)
-        return -32
+        return -30
 
     if "reference_tpm" not in config_dict:
         return 0
     ref_tpm = config_dict["reference_gene_tpm"] if mode == "gene" else config_dict["reference_tpm"]
     if not os.path.exists(ref_tpm):
         log.error("File %s with reference TPM was not detected" % ref_tpm)
-        return -18
+        return -31
 
     quantification_stats_output = os.path.join(output_folder, mode + ".quantification.tsv")
     qa_command_list = ["python3", os.path.join(isoquant_dir, "misc/quantification_stats.py"),
@@ -336,7 +336,7 @@ def run_quantification(args, config_dict, mode):
     result = subprocess.run(qa_command_list)
     if result.returncode != 0:
         log.error("Quantification evaluation exited with non-zero status: %d" % result.returncode)
-        return -14
+        return -34
 
     etalon_to_use = "etalon_quantification_" + mode
     if etalon_to_use not in config_dict:
@@ -346,14 +346,14 @@ def run_quantification(args, config_dict, mode):
     ref_value_files = fix_path(config_file, config_dict[etalon_to_use])
     if not os.path.exists(ref_value_files):
         log.error("File %s with etalon metric values was not detected" % ref_value_files)
-        return -19
+        return -35
 
     etalon_quality_dict = load_tsv_config(ref_value_files)
     real_dict = load_tsv_config(quantification_stats_output)
     exit_code = 0
     for metric_name in etalon_quality_dict.keys():
         if metric_name not in real_dict:
-            exit_code = -15
+            exit_code = -36
             log.warning("Metric not found %s" % metric_name)
             continue
 
@@ -399,25 +399,26 @@ def main():
     for k in required:
         if k not in config_dict:
             log.error(k + " is not set in the config")
-            return -10
+            exit(-4)
 
     err_code = run_isoquant(args, config_dict)
     if err_code != 0:
         return err_code
 
     run_types = set(map(lambda x: x.strip().replace('"', ''), config_dict["run_type"].split(",")))
+    err_codes = []
     if RT_VOID in run_types:
-        err_code = 0
+        err_codes.append(0)
     if RT_ASSIGNMENT in run_types:
-        err_code = run_assignment_quality(args, config_dict)
+        err_codes.append(run_assignment_quality(args, config_dict))
     if RT_TRANSCRIPTS in run_types:
-        err_code = run_transcript_quality(args, config_dict)
+        err_codes.append(run_transcript_quality(args, config_dict))
    if RT_QUANTIFICATION_KNOWN in run_types:
-        err_code = run_quantification(args, config_dict, "ref")
+        err_codes.append(run_quantification(args, config_dict, "ref"))
     if RT_QUANTIFICATION_NOVEL in run_types:
-        err_code = run_quantification(args, config_dict, "novel")
+        err_codes.append(run_quantification(args, config_dict, "novel"))
     if RT_QUANTIFICATION_GENES in run_types:
-        err_code = run_quantification(args, config_dict, "gene")
+        err_codes.append(run_quantification(args, config_dict, "gene"))
 
     if "check_input_files" in config_dict:
         files_list = config_dict["check_input_files"].split()
@@ -427,7 +428,10 @@ def main():
         missing_files = check_output_files(output_folder, label, files_list)
         if missing_files:
             log.error("The following files were not detected in the output folder: %s" % " ".join(missing_files))
-            err_code = -11
+            err_codes.append(-5)
+
+    if any(ec != 0 for ec in err_codes):
+        err_code = -6
 
     return err_code
 

From 6a1893c53a648e1f6a1e44ee46eaa120f8bc424e Mon Sep 17 00:00:00 2001
From: Andrey Prjibelski
Date: Tue, 7 May 2024 11:22:07 +0300
Subject: [PATCH 5/5] do not forget some of the unassigned reads

---
 src/graph_based_model_construction.py | 7 +++++--
 src/transcript_printer.py             | 3 ++-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py
index e600eada..83ea48ed 100644
--- a/src/graph_based_model_construction.py
+++ b/src/graph_based_model_construction.py
@@ -146,7 +146,7 @@ def process(self, read_assignment_storage):
         self.forward_counts()
 
         # FIXME: remove asserts below
-        if self.transcript_model_storage and len(set([x.read_id for x in read_assignment_storage])) != len(self.read_assignment_counts):
+        if len(set([x.read_id for x in read_assignment_storage])) != len(self.read_assignment_counts):
             logger.warning("Some reads were not assigned %d %d" % (len(set([x.read_id for x in read_assignment_storage])), len(self.read_assignment_counts)))
         # FIXME: remove asserts below
         if any(value < 0 for value in self.read_assignment_counts.values()):
@@ -706,6 +706,9 @@ def transcript_from_reference(self, isoform_id):
     # assign reads back to constructed isoforms
     def assign_reads_to_models(self, read_assignments):
         if not self.transcript_model_storage:
+            for assignment in read_assignments:
+                read_id = assignment.read_id
+                self.read_assignment_counts[read_id] = 0
             logger.debug("No transcripts were assigned")
             return
 
@@ -721,7 +724,7 @@ def assign_reads_to_models(self, read_assignments):
                 continue
 
             read_exons = assignment.corrected_exons
-            #logger.debug("# Checking read %s: %s" % (assignment.read_id, str(read_exons)))
+            # logger.debug("# Checking read %s: %s" % (assignment.read_id, str(read_exons)))
             model_combined_profile = profile_constructor.construct_profiles(read_exons, assignment.polya_info, [])
             model_assignment = assigner.assign_to_isoform(assignment.read_id, model_combined_profile)
             model_assignment.read_group = assignment.read_group
diff --git a/src/transcript_printer.py b/src/transcript_printer.py
index c566f49e..fa8b7beb 100644
--- a/src/transcript_printer.py
+++ b/src/transcript_printer.py
@@ -138,7 +138,8 @@ def dump_read_assignments(self, transcript_model_constructor):
         # write read_id -> transcript_id map
         if not self.output_r2t:
             return
-        for model_id, read_assignments in transcript_model_constructor.transcript_read_ids.items():
+        for model_id in transcript_model_constructor.transcript_read_ids.keys():
+            read_assignments = transcript_model_constructor.transcript_read_ids[model_id]
             for a in read_assignments:
                 self.out_r2t.write("%s\t%s\n" % (a.read_id, model_id))
         for read_id in transcript_model_constructor.read_assignment_counts.keys():