Revamp model read assignment #183

Merged: 5 commits, May 7, 2024
2 changes: 1 addition & 1 deletion src/alignment_processor.py
@@ -1,5 +1,5 @@
############################################################################
# Copyright (c) 2020 Saint Petersburg State University
# Copyright (c) 2024 University of Helsinki
# # All Rights Reserved
# See file LICENSE for details.
############################################################################
43 changes: 20 additions & 23 deletions src/graph_based_model_construction.py
@@ -146,7 +146,7 @@ def process(self, read_assignment_storage):
self.forward_counts()

# FIXME: remove asserts below
if self.transcript_model_storage and len(set([x.read_id for x in read_assignment_storage])) != len(self.read_assignment_counts):
if len(set([x.read_id for x in read_assignment_storage])) != len(self.read_assignment_counts):
logger.warning("Some reads were not assigned %d %d" % (len(set([x.read_id for x in read_assignment_storage])), len(self.read_assignment_counts)))
# FIXME: remove asserts below
if any(value < 0 for value in self.read_assignment_counts.values()):
@@ -172,9 +172,9 @@ def forward_counts(self):
ambiguous_assignments[read_id] = [read_assignment.read_group]
ambiguous_assignments[read_id].append(transcript_id)

for read_id in ambiguous_assignments.keys():
self.transcript_counter.add_read_info_raw(read_id, ambiguous_assignments[read_id][1:],
ambiguous_assignments[read_id][0])
for read_id in ambiguous_assignments.keys():
self.transcript_counter.add_read_info_raw(read_id, ambiguous_assignments[read_id][1:],
ambiguous_assignments[read_id][0])

self.transcript_counter.add_unassigned(sum(value == 0 for value in self.read_assignment_counts.values()))
self.transcript_counter.add_confirmed_features([model.transcript_id for model in self.transcript_model_storage])
@@ -256,19 +256,23 @@ def pre_filter_transcripts(self):

if (model.transcript_type != TranscriptModelType.known and
self.internal_counter[model.transcript_id] < coverage_cutoff):
del self.transcript_read_ids[model.transcript_id]
del self.internal_counter[model.transcript_id]
self.delete_from_storage(model.transcript_id)
continue

if (model.transcript_type != TranscriptModelType.known and
self.mapping_quality(model.transcript_id) < self.params.simple_models_mapq_cutoff):
del self.transcript_read_ids[model.transcript_id]
del self.internal_counter[model.transcript_id]
self.delete_from_storage(model.transcript_id)
continue
filtered_storage.append(model)

self.transcript_model_storage = filtered_storage

def delete_from_storage(self, transcript_id):
for a in self.transcript_read_ids[transcript_id]:
self.read_assignment_counts[a.read_id] -= 1
del self.transcript_read_ids[transcript_id]
del self.internal_counter[transcript_id]

def filter_transcripts(self):
filtered_storage = []
to_substitute = self.detect_similar_isoforms(self.transcript_model_storage)
@@ -290,32 +294,22 @@ def filter_transcripts(self):

if model.transcript_id in to_substitute:
#logger.debug("Novel model %s has a similar isoform %s" % (model.transcript_id, to_substitute[model.transcript_id]))
#self.transcript_read_ids[to_substitute[model.transcript_id]] += self.transcript_read_ids[model.transcript_id]
for a in self.transcript_read_ids[model.transcript_id]:
self.read_assignment_counts[a.read_id] -= 1
del self.transcript_read_ids[model.transcript_id]
del self.internal_counter[model.transcript_id]
self.delete_from_storage(model.transcript_id)
continue

if self.internal_counter[model.transcript_id] < novel_isoform_cutoff:
#logger.debug("Novel model %s has coverage %d < %.2f, component cov = %d" % (model.transcript_id,
# self.internal_counter[model.transcript_id],
# novel_isoform_cutoff, component_coverage))
for a in self.transcript_read_ids[model.transcript_id]:
self.read_assignment_counts[a.read_id] -= 1
del self.transcript_read_ids[model.transcript_id]
del self.internal_counter[model.transcript_id]
self.delete_from_storage(model.transcript_id)
continue

if len(model.exon_blocks) <= 2:
mapq = self.mapping_quality(model.transcript_id)
#logger.debug("Novel model %s has quality %.2f" % (model.transcript_id, mapq))
if mapq < self.params.simple_models_mapq_cutoff:
#logger.debug("Novel model %s has poor quality" % model.transcript_id)
for a in self.transcript_read_ids[model.transcript_id]:
self.read_assignment_counts[a.read_id] -= 1
del self.transcript_read_ids[model.transcript_id]
del self.internal_counter[model.transcript_id]
self.delete_from_storage(model.transcript_id)
continue

# TODO: correct ends for known
@@ -450,7 +444,7 @@ def construct_fl_isoforms(self):
and transcript_clean_strand == '.')
or (self.params.report_canonical_strategy == StrandnessReportingLevel.only_stranded
and transcript_strand == '.')):
logger.info("Avoiding unreliable transcript with %d exons (strand cannot be detected)" % len(novel_exons))
logger.debug("Avoiding unreliable transcript with %d exons (strand cannot be detected)" % len(novel_exons))
pass
else:
if self.params.use_technical_replicas and \
@@ -712,6 +706,9 @@ def transcript_from_reference(self, isoform_id):
# assign reads back to constructed isoforms
def assign_reads_to_models(self, read_assignments):
if not self.transcript_model_storage:
for assignment in read_assignments:
read_id = assignment.read_id
self.read_assignment_counts[read_id] = 0
logger.debug("No transcripts were assigned")
return

@@ -727,7 +724,7 @@ def assign_reads_to_models(self, read_assignments):
continue

read_exons = assignment.corrected_exons
#logger.debug("# Checking read %s: %s" % (assignment.read_id, str(read_exons)))
# logger.debug("# Checking read %s: %s" % (assignment.read_id, str(read_exons)))
model_combined_profile = profile_constructor.construct_profiles(read_exons, assignment.polya_info, [])
model_assignment = assigner.assign_to_isoform(assignment.read_id, model_combined_profile)
model_assignment.read_group = assignment.read_group
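Note on this file: the repeated cleanup blocks in pre_filter_transcripts() and filter_transcripts() are now funneled through the new delete_from_storage() helper, which keeps read_assignment_counts consistent with the surviving models; assign_reads_to_models() likewise zeroes the counts when no models were built at all. A minimal, self-contained sketch of that bookkeeping follows (the class, ReadAssignment tuple and add_assignment() are illustrative stand-ins; only delete_from_storage() mirrors the diff):

from collections import defaultdict, namedtuple

ReadAssignment = namedtuple("ReadAssignment", ["read_id", "read_group"])  # illustrative stand-in

class ModelStorageSketch:
    """Illustrative stand-in for the constructor's read/model bookkeeping."""

    def __init__(self):
        self.transcript_read_ids = defaultdict(list)    # transcript_id -> list of read assignments
        self.internal_counter = defaultdict(int)        # transcript_id -> number of supporting reads
        self.read_assignment_counts = defaultdict(int)  # read_id -> number of models the read supports

    def add_assignment(self, transcript_id, assignment):
        # hypothetical helper: register one read assignment for a model
        self.transcript_read_ids[transcript_id].append(assignment)
        self.internal_counter[transcript_id] += 1
        self.read_assignment_counts[assignment.read_id] += 1

    def delete_from_storage(self, transcript_id):
        # mirrors the diff: dropping a model must also release its reads,
        # otherwise reads of filtered models are never reported as unassigned
        for a in self.transcript_read_ids[transcript_id]:
            self.read_assignment_counts[a.read_id] -= 1
        del self.transcript_read_ids[transcript_id]
        del self.internal_counter[transcript_id]

Reads whose count drops back to zero are then picked up by forward_counts(), which sums the zero-valued entries and reports them through add_unassigned(); this is why the helper decrements the per-read counters rather than deleting them.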
2 changes: 0 additions & 2 deletions src/long_read_counter.py
@@ -219,11 +219,9 @@ def add_read_info_raw(self, read_id, feature_ids, group_id=AbstractReadGrouper.d
for feature_id in feature_ids:
count_value = self.read_counter.process_ambiguous(len(feature_ids))
self.feature_counter[group_id][feature_id] += count_value
# self.confirmed_features.add((group_id, feature_id))
self.all_features.add(feature_id)
else:
self.feature_counter[group_id][feature_ids[0]] += 1
# self.confirmed_features.add((group_id, feature_ids[0]))
self.all_features.add(feature_ids[0])

def add_unassigned(self, n_reads=1):
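For reference, add_read_info_raw() splits an ambiguous read across the features it matches via read_counter.process_ambiguous(); the exact policy lives outside this diff. A sketch of one plausible strategy, equal splitting, with the function name chosen here only for illustration:

def process_ambiguous_equal_split(n_features: int) -> float:
    """Hypothetical ambiguous-read policy: spread one read evenly over all
    matched features (the real counter may implement a different strategy)."""
    return 1.0 / n_features if n_features > 0 else 0.0

# e.g. a read hitting 4 transcripts contributes 0.25 to each of them
assert process_ambiguous_equal_split(4) == 0.25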
3 changes: 2 additions & 1 deletion src/transcript_printer.py
@@ -138,7 +138,8 @@ def dump_read_assignments(self, transcript_model_constructor):
# write read_id -> transcript_id map
if not self.output_r2t:
return
for model_id, read_assignments in transcript_model_constructor.transcript_read_ids.items():
for model_id in transcript_model_constructor.transcript_read_ids.keys():
read_assignments = transcript_model_constructor.transcript_read_ids[model_id]
for a in read_assignments:
self.out_r2t.write("%s\t%s\n" % (a.read_id, model_id))
for read_id in transcript_model_constructor.read_assignment_counts.keys():
46 changes: 25 additions & 21 deletions tests/github/run_pipeline.py
@@ -138,18 +138,18 @@ def check_value(etalon_value, output_value, name):
if output_value < 0:
if output_value != etalon_value:
log.error("Value of %s = %2.2f is not equal to value %2.2f" % (name, output_value, lower_bound))
exit_code = -40
exit_code = -2
else:
log.info("Value of %s = %2.2f == %2.2f as expected" % (name, output_value, upper_bound))
else:
if output_value < lower_bound:
log.error("Value of %s = %2.2f is lower than the expected value %2.2f" % (name, output_value, lower_bound))
exit_code = -41
exit_code = -2
else:
log.info("Value of %s = %2.2f >= %2.2f as expected" % (name, output_value, lower_bound))
if output_value > upper_bound:
log.error("Value of %s = %2.2f is higher than the expected value %2.2f" % (name, output_value, upper_bound))
exit_code = -42
exit_code = -2
else:
log.info("Value of %s = %2.2f <= %2.2f as expected" % (name, output_value, upper_bound))
return exit_code
@@ -186,7 +186,7 @@ def run_assignment_quality(args, config_dict):
bam = find_bam(output_folder, label)
if not bam or not os.path.exists(bam):
log.error("BAM file was not found")
return -21
return -10
else:
bam = fix_path(config_file, config_dict["bam"])

@@ -205,7 +205,7 @@
result = subprocess.run(qa_command_list)
if result.returncode != 0:
log.error("QA exited with non-zero status: %d" % result.returncode)
return -13
return -11

if "etalon" not in config_dict:
return 0
@@ -218,7 +218,7 @@
for k, v in etalon_qaulity_dict.items():
if k not in quality_report_dict:
log.error("Metric %s was not found in the report" % k)
exit_code = -22
exit_code = -12
continue
new_etalon_outf.write("%s\t%.2f\n" % (k, float(quality_report_dict[k])))
err_code = check_value(float(v), float(quality_report_dict[k]), k)
@@ -251,7 +251,7 @@ def run_transcript_quality(args, config_dict):
out_gtf = os.path.join(output_folder, "%s/%s.transcript_models.gtf" % (label, label))
if not os.path.exists(out_gtf):
log.error("Output GTF file was not found" % out_gtf)
return -31
return -20

quality_output = os.path.join(output_folder, "gffcompare")
genedb_prefix = fix_path(config_file, config_dict["reduced_db"])
@@ -262,7 +262,7 @@
result = subprocess.run(qa_command_list)
if result.returncode != 0:
log.error("Transcript evaluation exited with non-zero status: %d" % result.returncode)
return -13
return -21

if "etalon" not in config_dict:
return 0
@@ -311,14 +311,14 @@ def run_quantification(args, config_dict, mode):

if not os.path.exists(out_tpm):
log.error("Output TPM file %s was not found" % out_tpm)
return -32
return -30

if "reference_tpm" not in config_dict:
return 0
ref_tpm = config_dict["reference_gene_tpm"] if mode == "gene" else config_dict["reference_tpm"]
if not os.path.exists(ref_tpm):
log.error("File %s with reference TPM was not detected" % ref_tpm)
return -18
return -31

quantification_stats_output = os.path.join(output_folder, mode + ".quantification.tsv")
qa_command_list = ["python3", os.path.join(isoquant_dir, "misc/quantification_stats.py"),
@@ -336,7 +336,7 @@
result = subprocess.run(qa_command_list)
if result.returncode != 0:
log.error("Quantification evaluation exited with non-zero status: %d" % result.returncode)
return -14
return -34

etalon_to_use = "etalon_quantification_" + mode
if etalon_to_use not in config_dict:
@@ -346,14 +346,14 @@
ref_value_files = fix_path(config_file, config_dict[etalon_to_use])
if not os.path.exists(ref_value_files):
log.error("File %s with etalon metric values was not detected" % ref_value_files)
return -19
return -35
etalon_quality_dict = load_tsv_config(ref_value_files)
real_dict = load_tsv_config(quantification_stats_output)
exit_code = 0

for metric_name in etalon_quality_dict.keys():
if metric_name not in real_dict:
exit_code = -15
exit_code = -36
log.warning("Metric not found %s" % metric_name)
continue

@@ -399,25 +399,26 @@ def main():
for k in required:
if k not in config_dict:
log.error(k + " is not set in the config")
return -10
exit(-4)

err_code = run_isoquant(args, config_dict)
if err_code != 0:
return err_code

run_types = set(map(lambda x: x.strip().replace('"', ''), config_dict["run_type"].split(",")))
err_codes = []
if RT_VOID in run_types:
err_code = 0
err_codes.append(0)
if RT_ASSIGNMENT in run_types:
err_code = run_assignment_quality(args, config_dict)
err_codes.append(run_assignment_quality(args, config_dict))
if RT_TRANSCRIPTS in run_types:
err_code = run_transcript_quality(args, config_dict)
err_codes.append(run_transcript_quality(args, config_dict))
if RT_QUANTIFICATION_KNOWN in run_types:
err_code = run_quantification(args, config_dict, "ref")
err_codes.append(run_quantification(args, config_dict, "ref"))
if RT_QUANTIFICATION_NOVEL in run_types:
err_code = run_quantification(args, config_dict, "novel")
err_codes.append(run_quantification(args, config_dict, "novel"))
if RT_QUANTIFICATION_GENES in run_types:
err_code = run_quantification(args, config_dict, "gene")
err_codes.append(run_quantification(args, config_dict, "gene"))

if "check_input_files" in config_dict:
files_list = config_dict["check_input_files"].split()
@@ -427,7 +428,10 @@ def main():
missing_files = check_output_files(output_folder, label, files_list)
if missing_files:
log.error("The following files were not detected in the output folder: %s" % " ".join(missing_files))
err_code = -11
err_codes.append(-5)

if any(ec != 0 for ec in err_codes):
err_code = -6

return err_code

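The main() change collects one return code per requested check instead of letting each later check overwrite err_code, so a failing assignment or quantification run can no longer be masked by a passing one. A condensed sketch of the pattern (the check callables and run-type names are stand-ins; only the aggregation logic mirrors the diff):

def aggregate_checks(run_types, checks):
    # checks: mapping of run type -> zero-argument callable returning an exit code
    err_codes = []
    for run_type, check in checks.items():
        if run_type in run_types:
            err_codes.append(check())
    # mirrors the diff: any non-zero per-check code collapses into a single -6
    return -6 if any(ec != 0 for ec in err_codes) else 0

# usage sketch with dummy checks
print(aggregate_checks({"assignment"}, {"assignment": lambda: 0, "transcripts": lambda: -20}))   # 0
print(aggregate_checks({"transcripts"}, {"assignment": lambda: 0, "transcripts": lambda: -20}))  # -6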