Revamp model read assignment #183

Merged: 5 commits, May 7, 2024
2 changes: 1 addition & 1 deletion src/alignment_processor.py
@@ -1,5 +1,5 @@
############################################################################
# Copyright (c) 2020 Saint Petersburg State University
# Copyright (c) 2024 University of Helsinki
# # All Rights Reserved
# See file LICENSE for details.
############################################################################
43 changes: 20 additions & 23 deletions src/graph_based_model_construction.py
@@ -146,7 +146,7 @@ def process(self, read_assignment_storage):
self.forward_counts()

# FIXME: remove asserts below
if self.transcript_model_storage and len(set([x.read_id for x in read_assignment_storage])) != len(self.read_assignment_counts):
if len(set([x.read_id for x in read_assignment_storage])) != len(self.read_assignment_counts):
logger.warning("Some reads were not assigned %d %d" % (len(set([x.read_id for x in read_assignment_storage])), len(self.read_assignment_counts)))
# FIXME: remove asserts below
if any(value < 0 for value in self.read_assignment_counts.values()):
@@ -172,9 +172,9 @@ def forward_counts(self):
ambiguous_assignments[read_id] = [read_assignment.read_group]
ambiguous_assignments[read_id].append(transcript_id)

for read_id in ambiguous_assignments.keys():
self.transcript_counter.add_read_info_raw(read_id, ambiguous_assignments[read_id][1:],
ambiguous_assignments[read_id][0])
for read_id in ambiguous_assignments.keys():
self.transcript_counter.add_read_info_raw(read_id, ambiguous_assignments[read_id][1:],
ambiguous_assignments[read_id][0])

self.transcript_counter.add_unassigned(sum(value == 0 for value in self.read_assignment_counts.values()))
self.transcript_counter.add_confirmed_features([model.transcript_id for model in self.transcript_model_storage])
@@ -256,19 +256,23 @@ def pre_filter_transcripts(self):

if (model.transcript_type != TranscriptModelType.known and
self.internal_counter[model.transcript_id] < coverage_cutoff):
del self.transcript_read_ids[model.transcript_id]
del self.internal_counter[model.transcript_id]
self.delete_from_storage(model.transcript_id)
continue

if (model.transcript_type != TranscriptModelType.known and
self.mapping_quality(model.transcript_id) < self.params.simple_models_mapq_cutoff):
del self.transcript_read_ids[model.transcript_id]
del self.internal_counter[model.transcript_id]
self.delete_from_storage(model.transcript_id)
continue
filtered_storage.append(model)

self.transcript_model_storage = filtered_storage

def delete_from_storage(self, transcript_id):
for a in self.transcript_read_ids[transcript_id]:
self.read_assignment_counts[a.read_id] -= 1
del self.transcript_read_ids[transcript_id]
del self.internal_counter[transcript_id]

def filter_transcripts(self):
filtered_storage = []
to_substitute = self.detect_similar_isoforms(self.transcript_model_storage)
@@ -290,32 +294,22 @@ def filter_transcripts(self):

if model.transcript_id in to_substitute:
#logger.debug("Novel model %s has a similar isoform %s" % (model.transcript_id, to_substitute[model.transcript_id]))
#self.transcript_read_ids[to_substitute[model.transcript_id]] += self.transcript_read_ids[model.transcript_id]
for a in self.transcript_read_ids[model.transcript_id]:
self.read_assignment_counts[a.read_id] -= 1
del self.transcript_read_ids[model.transcript_id]
del self.internal_counter[model.transcript_id]
self.delete_from_storage(model.transcript_id)
continue

if self.internal_counter[model.transcript_id] < novel_isoform_cutoff:
#logger.debug("Novel model %s has coverage %d < %.2f, component cov = %d" % (model.transcript_id,
# self.internal_counter[model.transcript_id],
# novel_isoform_cutoff, component_coverage))
for a in self.transcript_read_ids[model.transcript_id]:
self.read_assignment_counts[a.read_id] -= 1
del self.transcript_read_ids[model.transcript_id]
del self.internal_counter[model.transcript_id]
self.delete_from_storage(model.transcript_id)
continue

if len(model.exon_blocks) <= 2:
mapq = self.mapping_quality(model.transcript_id)
#logger.debug("Novel model %s has quality %.2f" % (model.transcript_id, mapq))
if mapq < self.params.simple_models_mapq_cutoff:
#logger.debug("Novel model %s has poor quality" % model.transcript_id)
for a in self.transcript_read_ids[model.transcript_id]:
self.read_assignment_counts[a.read_id] -= 1
del self.transcript_read_ids[model.transcript_id]
del self.internal_counter[model.transcript_id]
self.delete_from_storage(model.transcript_id)
continue

# TODO: correct ends for known
@@ -450,7 +444,7 @@ def construct_fl_isoforms(self):
and transcript_clean_strand == '.')
or (self.params.report_canonical_strategy == StrandnessReportingLevel.only_stranded
and transcript_strand == '.')):
logger.info("Avoiding unreliable transcript with %d exons (strand cannot be detected)" % len(novel_exons))
logger.debug("Avoiding unreliable transcript with %d exons (strand cannot be detected)" % len(novel_exons))
pass
else:
if self.params.use_technical_replicas and \
@@ -712,6 +706,9 @@ def transcript_from_reference(self, isoform_id):
# assign reads back to constructed isoforms
def assign_reads_to_models(self, read_assignments):
if not self.transcript_model_storage:
for assignment in read_assignments:
read_id = assignment.read_id
self.read_assignment_counts[read_id] = 0
logger.debug("No transcripts were assigned")
return

@@ -727,7 +724,7 @@ def assign_reads_to_models(self, read_assignments):
continue

read_exons = assignment.corrected_exons
#logger.debug("# Checking read %s: %s" % (assignment.read_id, str(read_exons)))
# logger.debug("# Checking read %s: %s" % (assignment.read_id, str(read_exons)))
model_combined_profile = profile_constructor.construct_profiles(read_exons, assignment.polya_info, [])
model_assignment = assigner.assign_to_isoform(assignment.read_id, model_combined_profile)
model_assignment.read_group = assignment.read_group
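Note on this file: the repeated cleanup blocks in pre_filter_transcripts() and filter_transcripts() are now funneled through the new delete_from_storage() helper, which keeps read_assignment_counts consistent with the surviving models; assign_reads_to_models() likewise zeroes the counts when no models were built at all. A minimal, self-contained sketch of that bookkeeping follows (the class, ReadAssignment tuple and add_assignment() are illustrative stand-ins; only delete_from_storage() mirrors the diff):

from collections import defaultdict, namedtuple

ReadAssignment = namedtuple("ReadAssignment", ["read_id", "read_group"])  # illustrative stand-in

class ModelStorageSketch:
    """Illustrative stand-in for the constructor's read/model bookkeeping."""

    def __init__(self):
        self.transcript_read_ids = defaultdict(list)    # transcript_id -> list of read assignments
        self.internal_counter = defaultdict(int)        # transcript_id -> number of supporting reads
        self.read_assignment_counts = defaultdict(int)  # read_id -> number of models the read supports

    def add_assignment(self, transcript_id, assignment):
        # hypothetical helper: register one read assignment for a model
        self.transcript_read_ids[transcript_id].append(assignment)
        self.internal_counter[transcript_id] += 1
        self.read_assignment_counts[assignment.read_id] += 1

    def delete_from_storage(self, transcript_id):
        # mirrors the diff: dropping a model must also release its reads,
        # otherwise reads of filtered models are never reported as unassigned
        for a in self.transcript_read_ids[transcript_id]:
            self.read_assignment_counts[a.read_id] -= 1
        del self.transcript_read_ids[transcript_id]
        del self.internal_counter[transcript_id]

Reads whose count drops back to zero are then picked up by forward_counts(), which sums the zero-valued entries and reports them through add_unassigned(); this is why the helper decrements the per-read counters rather than deleting them.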
2 changes: 0 additions & 2 deletions src/long_read_counter.py
@@ -219,11 +219,9 @@ def add_read_info_raw(self, read_id, feature_ids, group_id=AbstractReadGrouper.d
for feature_id in feature_ids:
count_value = self.read_counter.process_ambiguous(len(feature_ids))
self.feature_counter[group_id][feature_id] += count_value
# self.confirmed_features.add((group_id, feature_id))
self.all_features.add(feature_id)
else:
self.feature_counter[group_id][feature_ids[0]] += 1
# self.confirmed_features.add((group_id, feature_ids[0]))
self.all_features.add(feature_ids[0])

def add_unassigned(self, n_reads=1):
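For reference, add_read_info_raw() splits an ambiguous read across the features it matches via read_counter.process_ambiguous(); the exact policy lives outside this diff. A sketch of one plausible strategy, equal splitting, with the function name chosen here only for illustration:

def process_ambiguous_equal_split(n_features: int) -> float:
    """Hypothetical ambiguous-read policy: spread one read evenly over all
    matched features (the real counter may implement a different strategy)."""
    return 1.0 / n_features if n_features > 0 else 0.0

# e.g. a read hitting 4 transcripts contributes 0.25 to each of them
assert process_ambiguous_equal_split(4) == 0.25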
3 changes: 2 additions & 1 deletion src/transcript_printer.py
@@ -138,7 +138,8 @@ def dump_read_assignments(self, transcript_model_constructor):
# write read_id -> transcript_id map
if not self.output_r2t:
return
for model_id, read_assignments in transcript_model_constructor.transcript_read_ids.items():
for model_id in transcript_model_constructor.transcript_read_ids.keys():
read_assignments = transcript_model_constructor.transcript_read_ids[model_id]
for a in read_assignments:
self.out_r2t.write("%s\t%s\n" % (a.read_id, model_id))
for read_id in transcript_model_constructor.read_assignment_counts.keys():
46 changes: 25 additions & 21 deletions tests/github/run_pipeline.py
@@ -138,18 +138,18 @@ def check_value(etalon_value, output_value, name):
if output_value < 0:
if output_value != etalon_value:
log.error("Value of %s = %2.2f is not equal to value %2.2f" % (name, output_value, lower_bound))
exit_code = -40
exit_code = -2
else:
log.info("Value of %s = %2.2f == %2.2f as expected" % (name, output_value, upper_bound))
else:
if output_value < lower_bound:
log.error("Value of %s = %2.2f is lower than the expected value %2.2f" % (name, output_value, lower_bound))
exit_code = -41
exit_code = -2
else:
log.info("Value of %s = %2.2f >= %2.2f as expected" % (name, output_value, lower_bound))
if output_value > upper_bound:
log.error("Value of %s = %2.2f is higher than the expected value %2.2f" % (name, output_value, upper_bound))
exit_code = -42
exit_code = -2
else:
log.info("Value of %s = %2.2f <= %2.2f as expected" % (name, output_value, upper_bound))
return exit_code
@@ -186,7 +186,7 @@ def run_assignment_quality(args, config_dict):
bam = find_bam(output_folder, label)
if not bam or not os.path.exists(bam):
log.error("BAM file was not found")
return -21
return -10
else:
bam = fix_path(config_file, config_dict["bam"])

@@ -205,7 +205,7 @@
result = subprocess.run(qa_command_list)
if result.returncode != 0:
log.error("QA exited with non-zero status: %d" % result.returncode)
return -13
return -11

if "etalon" not in config_dict:
return 0
@@ -218,7 +218,7 @@
for k, v in etalon_qaulity_dict.items():
if k not in quality_report_dict:
log.error("Metric %s was not found in the report" % k)
exit_code = -22
exit_code = -12
continue
new_etalon_outf.write("%s\t%.2f\n" % (k, float(quality_report_dict[k])))
err_code = check_value(float(v), float(quality_report_dict[k]), k)
@@ -251,7 +251,7 @@ def run_transcript_quality(args, config_dict):
out_gtf = os.path.join(output_folder, "%s/%s.transcript_models.gtf" % (label, label))
if not os.path.exists(out_gtf):
log.error("Output GTF file was not found" % out_gtf)
return -31
return -20

quality_output = os.path.join(output_folder, "gffcompare")
genedb_prefix = fix_path(config_file, config_dict["reduced_db"])
@@ -262,7 +262,7 @@
result = subprocess.run(qa_command_list)
if result.returncode != 0:
log.error("Transcript evaluation exited with non-zero status: %d" % result.returncode)
return -13
return -21

if "etalon" not in config_dict:
return 0
@@ -311,14 +311,14 @@ def run_quantification(args, config_dict, mode):

if not os.path.exists(out_tpm):
log.error("Output TPM file %s was not found" % out_tpm)
return -32
return -30

if "reference_tpm" not in config_dict:
return 0
ref_tpm = config_dict["reference_gene_tpm"] if mode == "gene" else config_dict["reference_tpm"]
if not os.path.exists(ref_tpm):
log.error("File %s with reference TPM was not detected" % ref_tpm)
return -18
return -31

quantification_stats_output = os.path.join(output_folder, mode + ".quantification.tsv")
qa_command_list = ["python3", os.path.join(isoquant_dir, "misc/quantification_stats.py"),
@@ -336,7 +336,7 @@
result = subprocess.run(qa_command_list)
if result.returncode != 0:
log.error("Quantification evaluation exited with non-zero status: %d" % result.returncode)
return -14
return -34

etalon_to_use = "etalon_quantification_" + mode
if etalon_to_use not in config_dict:
@@ -346,14 +346,14 @@
ref_value_files = fix_path(config_file, config_dict[etalon_to_use])
if not os.path.exists(ref_value_files):
log.error("File %s with etalon metric values was not detected" % ref_value_files)
return -19
return -35
etalon_quality_dict = load_tsv_config(ref_value_files)
real_dict = load_tsv_config(quantification_stats_output)
exit_code = 0

for metric_name in etalon_quality_dict.keys():
if metric_name not in real_dict:
exit_code = -15
exit_code = -36
log.warning("Metric not found %s" % metric_name)
continue

@@ -399,25 +399,26 @@ def main():
for k in required:
if k not in config_dict:
log.error(k + " is not set in the config")
return -10
exit(-4)

err_code = run_isoquant(args, config_dict)
if err_code != 0:
return err_code

run_types = set(map(lambda x: x.strip().replace('"', ''), config_dict["run_type"].split(",")))
err_codes = []
if RT_VOID in run_types:
err_code = 0
err_codes.append(0)
if RT_ASSIGNMENT in run_types:
err_code = run_assignment_quality(args, config_dict)
err_codes.append(run_assignment_quality(args, config_dict))
if RT_TRANSCRIPTS in run_types:
err_code = run_transcript_quality(args, config_dict)
err_codes.append(run_transcript_quality(args, config_dict))
if RT_QUANTIFICATION_KNOWN in run_types:
err_code = run_quantification(args, config_dict, "ref")
err_codes.append(run_quantification(args, config_dict, "ref"))
if RT_QUANTIFICATION_NOVEL in run_types:
err_code = run_quantification(args, config_dict, "novel")
err_codes.append(run_quantification(args, config_dict, "novel"))
if RT_QUANTIFICATION_GENES in run_types:
err_code = run_quantification(args, config_dict, "gene")
err_codes.append(run_quantification(args, config_dict, "gene"))

if "check_input_files" in config_dict:
files_list = config_dict["check_input_files"].split()
@@ -427,7 +428,10 @@ def main():
missing_files = check_output_files(output_folder, label, files_list)
if missing_files:
log.error("The following files were not detected in the output folder: %s" % " ".join(missing_files))
err_code = -11
err_codes.append(-5)

if any(ec != 0 for ec in err_codes):
err_code = -6

return err_code

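The main() change collects one return code per requested check instead of letting each later check overwrite err_code, so a failing assignment or quantification run can no longer be masked by a passing one. A condensed sketch of the pattern (the check callables and run-type names are stand-ins; only the aggregation logic mirrors the diff):

def aggregate_checks(run_types, checks):
    # checks: mapping of run type -> zero-argument callable returning an exit code
    err_codes = []
    for run_type, check in checks.items():
        if run_type in run_types:
            err_codes.append(check())
    # mirrors the diff: any non-zero per-check code collapses into a single -6
    return -6 if any(ec != 0 for ec in err_codes) else 0

# usage sketch with dummy checks
print(aggregate_checks({"assignment"}, {"assignment": lambda: 0, "transcripts": lambda: -20}))   # 0
print(aggregate_checks({"transcripts"}, {"assignment": lambda: 0, "transcripts": lambda: -20}))  # -6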