From 3727d7258e44074d13cfd49d065ebdb0a880d986 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Tue, 26 Dec 2023 13:32:25 +0300 Subject: [PATCH 01/10] move threshold --- isoquant.py | 1 + src/dataset_processor.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/isoquant.py b/isoquant.py index 0d89a940..f876cb79 100755 --- a/isoquant.py +++ b/isoquant.py @@ -673,6 +673,7 @@ def set_additional_params(args): args.needs_reference = False args.simple_models_mapq_cutoff = 30 + args.polya_percentage_threshold = 0.7 def run_pipeline(args): diff --git a/src/dataset_processor.py b/src/dataset_processor.py index 2e5099e5..9388f29b 100644 --- a/src/dataset_processor.py +++ b/src/dataset_processor.py @@ -406,7 +406,7 @@ def process_sample(self, sample): polya_fraction = polya_found / total_alignments if total_alignments > 0 else 0.0 logger.info("Total alignments processed: %d, polyA tail detected in %d (%.1f%%)" % (total_alignments, polya_found, polya_fraction * 100.0)) - self.args.needs_polya_for_construction = polya_fraction >= 0.7 + self.args.needs_polya_for_construction = polya_fraction >= self.args.polya_percentage_threshold self.process_assigned_reads(sample, saves_file) if not self.args.read_assignments and not self.args.keep_tmp: From a1bac778f569c7c37044abff9bc2e2c57edf9c4f Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Tue, 26 Dec 2023 13:51:16 +0300 Subject: [PATCH 02/10] add condition for reporting novel monointronic transcripts --- isoquant.py | 22 ++++++++++++---------- src/dataset_processor.py | 13 ++++++++++++- src/graph_based_model_construction.py | 8 ++++---- 3 files changed, 28 insertions(+), 15 deletions(-) diff --git a/isoquant.py b/isoquant.py index f876cb79..77353912 100755 --- a/isoquant.py +++ b/isoquant.py @@ -501,7 +501,7 @@ def set_data_dependent_options(args): args.splice_correction_strategy = splice_correction_strategies[args.data_type] args.resolve_ambiguous = 'monoexon_and_fsm' if args.fl_data else 'default' - args.needs_polya_for_construction = False + args.requires_polya_for_construction = False if args.read_group is None and args.input_data.has_replicas(): args.read_group = "file_name" args.use_technical_replicas = args.read_group == "file_name" @@ -591,16 +591,16 @@ def set_model_construction_options(args): 'min_known_count', 'min_nonfl_count', 'min_novel_count', 'min_novel_count_rel', 'min_mono_count_rel', 'singleton_adjacent_cov', - 'fl_only', 'novel_monoexonic')) + 'fl_only', 'novel_monoexonic', 'require_monointronic_polya')) strategies = { - 'reliable': ModelConstructionStrategy(2, 0.5, 20, 5, 0.05, 1, 0.1, 0.1, 2, 4, 8, 0.05, 0.05, 50, True, False), - 'default_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 0.05, 1, 2, 2, 0.02, 0.005, 100, False, True), - 'sensitive_pacbio':ModelConstructionStrategy(1, 0.5, 5, 2, 0.005, 1, 0.01, 0.02, 1, 2, 2, 0.005, 0.001, 100, False, True), - 'default_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.02, 1, 0.05, 0.05, 1, 3, 3, 0.02, 0.02, 10, False, False), - 'sensitive_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.005, 1, 0.01, 0.02, 1, 2, 3, 0.005, 0.005, 10, False, True), - 'fl_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 0.01, 1, 2, 3, 0.02, 0.005, 100, True, True), - 'all': ModelConstructionStrategy(0, 0.3, 5, 1, 0.002, 1, 0.01, 0.01, 1, 1, 1, 0.002, 0.001, 500, False, True), - 'assembly': ModelConstructionStrategy(0, 0.3, 5, 1, 0.05, 1, 0.01, 0.02, 1, 1, 1, 0.05, 0.01, 50, False, True) + 'reliable': ModelConstructionStrategy(2, 0.5, 20, 5, 0.05, 1, 
0.1, 0.1, 2, 4, 8, 0.05, 0.05, 50, True, False, True), + 'default_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 0.05, 1, 2, 2, 0.02, 0.005, 100, False, True, False), + 'sensitive_pacbio':ModelConstructionStrategy(1, 0.5, 5, 2, 0.005, 1, 0.01, 0.02, 1, 2, 2, 0.005, 0.001, 100, False, True, False), + 'default_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.02, 1, 0.05, 0.05, 1, 3, 3, 0.02, 0.02, 10, False, False, True), + 'sensitive_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.005, 1, 0.01, 0.02, 1, 2, 3, 0.005, 0.005, 10, False, True, False), + 'fl_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 0.01, 1, 2, 3, 0.02, 0.005, 100, True, True, True), + 'all': ModelConstructionStrategy(0, 0.3, 5, 1, 0.002, 1, 0.01, 0.01, 1, 1, 1, 0.002, 0.001, 500, False, True, False), + 'assembly': ModelConstructionStrategy(0, 0.3, 5, 1, 0.05, 1, 0.01, 0.02, 1, 1, 1, 0.05, 0.01, 50, False, True, False) } strategy = strategies[args.model_construction_strategy] @@ -633,6 +633,8 @@ def set_model_construction_options(args): logger.info("Novel unspliced transcripts will not be reported, " "set --report_novel_unspliced true to discover them") + args.require_monointronic_polya = strategy.require_monointronic_polya + def set_configs_directory(args): config_dir = os.path.join(os.environ['HOME'], '.config', 'IsoQuant') diff --git a/src/dataset_processor.py b/src/dataset_processor.py index 9388f29b..b8d17fe7 100644 --- a/src/dataset_processor.py +++ b/src/dataset_processor.py @@ -406,7 +406,9 @@ def process_sample(self, sample): polya_fraction = polya_found / total_alignments if total_alignments > 0 else 0.0 logger.info("Total alignments processed: %d, polyA tail detected in %d (%.1f%%)" % (total_alignments, polya_found, polya_fraction * 100.0)) - self.args.needs_polya_for_construction = polya_fraction >= self.args.polya_percentage_threshold + self.args.requires_polya_for_construction = polya_fraction >= self.args.polya_percentage_threshold + # do not require polyA tails for mono-intronic only if the data is reliable and polyA percentage is low + self.args.require_monointronic_polya = self.args.require_monointronic_polya or self.args.requires_polya_for_construction self.process_assigned_reads(sample, saves_file) if not self.args.read_assignments and not self.args.keep_tmp: @@ -510,6 +512,15 @@ def process_assigned_reads(self, sample, dump_filename): reverse=True ) logger.info("Processing assigned reads " + sample.prefix) + logger.info("Transcript construction options:") + logger.info(" Novel monoexonic transcripts will be reported: %s" % + "yes" if self.args.report_novel_unspliced else "no") + logger.info(" Presence of polyA tail is required for a multi-exon transcript to be reported: %s" + % "yes" if self.args.requires_polya_for_construction else "no") + logger.info(" Presence of polyA tail is required for a 2-exon transcript to be reported: %s" + % "yes" if self.args.require_monointronic_polya else "no") + logger.info(" Report transcript for which strand cannot be detected using canonical splice sites: %s" + % "yes" if self.args.report_unstranded else "no") # set up aggregators and outputs aggregator = ReadAssignmentAggregator(self.args, sample, self.all_read_groups) diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index 8b7b3820..7bbee789 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -376,7 +376,7 @@ def construct_fl_isoforms(self): if count < novel_isoform_cutoff: # logger.debug("uuu 
Novel isoform %s has low coverage: %d\t%d" % (new_transcript_id, count, novel_isoform_cutoff))
             pass
-        elif len(novel_exons) == 2 and (not polya_site or transcript_ss_strand == '.'):
+        elif len(novel_exons) == 2 and self.params.require_monointronic_polya and not polya_site:
             # logger.debug("uuu Avoiding single intron %s isoform: %d\t%s" % (new_transcript_id, count, str(path)))
             pass
         elif transcript_strand == '.' and not self.params.report_unstranded:
@@ -468,13 +468,13 @@ def construct_assignment_based_isoforms(self, read_assignment_storage):
                 # (read_assignment.read_id, refrenence_isoform_id))
                 spliced_isoform_reads[refrenence_isoform_id].append(read_assignment)

-                if self.params.needs_polya_for_construction and self.gene_info.isoform_strands[refrenence_isoform_id] == '-':
+                if self.params.requires_polya_for_construction and self.gene_info.isoform_strands[refrenence_isoform_id] == '-':
                     if any(x.event_type == MatchEventSubtype.correct_polya_site_left for x in events):
                         isoform_left_support[refrenence_isoform_id] += 1
                     elif abs(self.gene_info.all_isoforms_exons[refrenence_isoform_id][0][0] - read_assignment.corrected_exons[0][0]) <= self.params.apa_delta:
                         isoform_left_support[refrenence_isoform_id] += 1
-                if self.params.needs_polya_for_construction and self.gene_info.isoform_strands[refrenence_isoform_id] == '+':
+                if self.params.requires_polya_for_construction and self.gene_info.isoform_strands[refrenence_isoform_id] == '+':
                     if any(x.event_type == MatchEventSubtype.correct_polya_site_right for x in events):
                         isoform_right_support[refrenence_isoform_id] += 1
                     elif abs(self.gene_info.all_isoforms_exons[refrenence_isoform_id][-1][1] - read_assignment.corrected_exons[-1][1]) <= self.params.apa_delta:
@@ -761,7 +761,7 @@ def fill(self, read_assignments):
                 path_tuple = tuple(intron_path)
                 self.paths[path_tuple] += 1
                 if terminal_vertex and starting_vertex:
-                    if not self.params.needs_polya_for_construction or\
+                    if not self.params.requires_polya_for_construction or\
                             (terminal_vertex[0] == VERTEX_polya or starting_vertex[0] == VERTEX_polyt):
                         self.fl_paths.add(path_tuple)
                         self.paths_to_reads[path_tuple].append(a)

From 8088fa2eeb0d82ffd9a29042d0176cf99f62bdb9 Mon Sep 17 00:00:00 2001
From: Andrey Prjibelski
Date: Tue, 26 Dec 2023 15:50:39 +0300
Subject: [PATCH 03/10] add polya strategy option

---
 README.md                | 11 ++++++++
 isoquant.py              | 19 ++++++++------
 src/dataset_processor.py | 54 ++++++++++++++++++++++++++++++----------
 3 files changed, 64 insertions(+), 20 deletions(-)

diff --git a/README.md b/README.md
index 5793368e..075ce0f8 100644
--- a/README.md
+++ b/README.md
@@ -105,6 +105,9 @@ IsoQuant support all kinds of long RNA data:
 Reads must be provided in FASTQ or FASTA format (can be gzipped).
 If you have already aligned your reads to the reference genome, simply provide sorted and indexed BAM files.
+IsoQuant expects reads to contain polyA tails. For more reliable transcript model construction, do not trim polyA tails.
+
+
 ## Supported reference data
@@ -519,6 +522,14 @@ The main explanation that some aligner report a lot of false unspliced alignment
for ONT reads.
+`--polya_requirement` Strategy for using polyA tails during transcript model construction, should be one of: + +* `default` - polyA tails are required if at least 70% of the reads have polyA tail; +polyA tails are always required for 1/2-exon transcripts when using ONT data (this is caused by elevated number of false 1/2-exonic alignments reported by minimap2); +* `never` - polyA tails are never required; use this option **at your own risk** as it may noticeably increase false discovery rate, especially for ONT data; +* `always` - reported transcripts are always required to have polyA support in the reads. + + ### Hidden options diff --git a/isoquant.py b/isoquant.py index 77353912..22ce3ca8 100755 --- a/isoquant.py +++ b/isoquant.py @@ -29,7 +29,7 @@ NANOPORE_DATA, DataSetReadMapper ) -from src.dataset_processor import DatasetProcessor +from src.dataset_processor import DatasetProcessor, PolyAUsageStrategies from src.long_read_assigner import AmbiguityResolvingMethod from src.long_read_counter import COUNTING_STRATEGIES from src.input_data_storage import InputDataStorage @@ -151,6 +151,10 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help add_additional_option("--report_novel_unspliced", "-u", type=bool_str, help="report novel monoexonic transcripts (true/false), " "default: False for ONT, True for other data types") + add_additional_option("--polya_requirement", type=str, choices=[e.name for e in PolyAUsageStrategies], + help="require polyA tails to be present when reporting transcripts (default/never/always), " + "default: require polyA only when polyA percentage is >= 70%", + default=PolyAUsageStrategies.default.name) # OUTPUT PROPERTIES pipeline_args_group.add_argument("--threads", "-t", help="number of threads to use", type=int, default="16") @@ -271,11 +275,12 @@ def check_and_load_args(args, parser): logger.warning("Output folder already contains a previous run, will be overwritten.") else: logger.warning("Output folder already contains a previous run, some files may be overwritten. " - "Use --resume to resume a failed run. Use --force to avoid this message. " - "Press Ctrl+C to interrupt the run now.") + "Use --resume to resume a failed run. 
Use --force to avoid this message.") + logger.warning("Press Ctrl+C to interrupt the run now.") delay = 9 for i in range(delay): - sys.stdout.write("Resuming the run in %d seconds\r" % (delay - i)) + countdown = delay - i + sys.stdout.write("Resuming the run in %d second%s\r" % (countdown, "s" if countdown > 1 else "")) time.sleep(1) logger.info("Overwriting the previous run") time.sleep(1) @@ -598,7 +603,7 @@ def set_model_construction_options(args): 'sensitive_pacbio':ModelConstructionStrategy(1, 0.5, 5, 2, 0.005, 1, 0.01, 0.02, 1, 2, 2, 0.005, 0.001, 100, False, True, False), 'default_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.02, 1, 0.05, 0.05, 1, 3, 3, 0.02, 0.02, 10, False, False, True), 'sensitive_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.005, 1, 0.01, 0.02, 1, 2, 3, 0.005, 0.005, 10, False, True, False), - 'fl_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 0.01, 1, 2, 3, 0.02, 0.005, 100, True, True, True), + 'fl_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 0.01, 1, 2, 3, 0.02, 0.005, 100, True, True, False), 'all': ModelConstructionStrategy(0, 0.3, 5, 1, 0.002, 1, 0.01, 0.01, 1, 1, 1, 0.002, 0.001, 500, False, True, False), 'assembly': ModelConstructionStrategy(0, 0.3, 5, 1, 0.05, 1, 0.01, 0.02, 1, 1, 1, 0.05, 0.01, 50, False, True, False) } @@ -634,6 +639,7 @@ def set_model_construction_options(args): "set --report_novel_unspliced true to discover them") args.require_monointronic_polya = strategy.require_monointronic_polya + args.polya_requirement_strategy = PolyAUsageStrategies[args.polya_requirement] def set_configs_directory(args): @@ -658,8 +664,6 @@ def set_additional_params(args): set_splice_correction_options(args) args.print_additional_info = True - args.no_polya = False - args.indel_near_splice_site_dist = 10 args.upstream_region_len = 20 @@ -676,6 +680,7 @@ def set_additional_params(args): args.simple_models_mapq_cutoff = 30 args.polya_percentage_threshold = 0.7 + args.low_polya_percentage_threshold = 0.1 def run_pipeline(args): diff --git a/src/dataset_processor.py b/src/dataset_processor.py index b8d17fe7..fa9b3472 100644 --- a/src/dataset_processor.py +++ b/src/dataset_processor.py @@ -10,7 +10,7 @@ import logging import os import shutil -import time +from enum import Enum, unique from collections import defaultdict from concurrent.futures import ProcessPoolExecutor @@ -72,6 +72,22 @@ def clean_locks(chr_ids, base_name, fname_function): os.remove(fname) +@unique +class PolyAUsageStrategies(Enum): + default = 1 + never = 2 + always = 3 + + +def set_polya_requirement_strategy(flag, polya_requirement_strategy): + if polya_requirement_strategy == PolyAUsageStrategies.default: + return flag + elif polya_requirement_strategy == PolyAUsageStrategies.never: + return False + else: + return True + + def collect_reads_in_parallel(sample, chr_id, args): current_chr_record = Fasta(args.reference)[chr_id] if not args.low_memory: @@ -406,9 +422,18 @@ def process_sample(self, sample): polya_fraction = polya_found / total_alignments if total_alignments > 0 else 0.0 logger.info("Total alignments processed: %d, polyA tail detected in %d (%.1f%%)" % (total_alignments, polya_found, polya_fraction * 100.0)) - self.args.requires_polya_for_construction = polya_fraction >= self.args.polya_percentage_threshold - # do not require polyA tails for mono-intronic only if the data is reliable and polyA percentage is low - self.args.require_monointronic_polya = self.args.require_monointronic_polya or self.args.requires_polya_for_construction + if 
polya_fraction < self.args.low_polya_percentage_threshold: + logger.warning("PolyA percentage is suspiciously low. IsoQuant expects non-polya-trimmed reads. " + "If you aim to construct transcript models, consider using --polya_requirement option.") + + self.args.requires_polya_for_construction = set_polya_requirement_strategy( + polya_fraction >= self.args.polya_percentage_threshold, + self.args.polya_requirement_strategy) + self.args.require_monointronic_polya = set_polya_requirement_strategy( + # do not require polyA tails for mono-intronic only if the data is reliable and polyA percentage is low + self.args.require_monointronic_polya or self.args.requires_polya_for_construction, + self.args.polya_requirement_strategy) + self.process_assigned_reads(sample, saves_file) if not self.args.read_assignments and not self.args.keep_tmp: @@ -512,21 +537,24 @@ def process_assigned_reads(self, sample, dump_filename): reverse=True ) logger.info("Processing assigned reads " + sample.prefix) - logger.info("Transcript construction options:") - logger.info(" Novel monoexonic transcripts will be reported: %s" % - "yes" if self.args.report_novel_unspliced else "no") - logger.info(" Presence of polyA tail is required for a multi-exon transcript to be reported: %s" - % "yes" if self.args.requires_polya_for_construction else "no") - logger.info(" Presence of polyA tail is required for a 2-exon transcript to be reported: %s" - % "yes" if self.args.require_monointronic_polya else "no") - logger.info(" Report transcript for which strand cannot be detected using canonical splice sites: %s" - % "yes" if self.args.report_unstranded else "no") + logger.info(" Transcript models construction is turned %s" % + "off" if self.args.no_model_construction else "on") # set up aggregators and outputs aggregator = ReadAssignmentAggregator(self.args, sample, self.all_read_groups) transcript_stat_counter = EnumStats() if not self.args.no_model_construction: + logger.info("Transcript construction options:") + logger.info(" Novel monoexonic transcripts will be reported: %s" % + "yes" if self.args.report_novel_unspliced else "no") + logger.info(" Presence of polyA tail is required for a multi-exon transcript to be reported: %s" + % "yes" if self.args.requires_polya_for_construction else "no") + logger.info(" Presence of polyA tail is required for a 2-exon transcript to be reported: %s" + % "yes" if self.args.require_monointronic_polya else "no") + logger.info(" Report transcript for which strand cannot be detected using canonical splice sites: %s" + % "yes" if self.args.report_unstranded else "no") + gff_printer = GFFPrinter( sample.out_dir, sample.prefix, self.io_support, header=self.common_header ) From bee9888267a73499a9551781a269eac58aa9dfa3 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Tue, 26 Dec 2023 16:38:37 +0300 Subject: [PATCH 04/10] monoexonic polya strategy --- README.md | 2 ++ isoquant.py | 28 ++++++++++++++++++--------- src/dataset_processor.py | 26 +++++++++++++++---------- src/graph_based_model_construction.py | 4 +++- 4 files changed, 40 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 075ce0f8..dce014ad 100644 --- a/README.md +++ b/README.md @@ -529,6 +529,8 @@ polyA tails are always required for 1/2-exon transcripts when using ONT data (th * `never` - polyA tails are never required; use this option **at your own risk** as it may noticeably increase false discovery rate, especially for ONT data; * `always` - reported transcripts are always required to have polyA support in the 
reads. +Note, that polyA tails are always required for reporting novel unspliced isoforms. + ### Hidden options diff --git a/isoquant.py b/isoquant.py index 22ce3ca8..328eaa3f 100755 --- a/isoquant.py +++ b/isoquant.py @@ -596,16 +596,25 @@ def set_model_construction_options(args): 'min_known_count', 'min_nonfl_count', 'min_novel_count', 'min_novel_count_rel', 'min_mono_count_rel', 'singleton_adjacent_cov', - 'fl_only', 'novel_monoexonic', 'require_monointronic_polya')) + 'fl_only', 'novel_monoexonic', + 'require_monointronic_polya', 'require_monoexonic_polya')) strategies = { - 'reliable': ModelConstructionStrategy(2, 0.5, 20, 5, 0.05, 1, 0.1, 0.1, 2, 4, 8, 0.05, 0.05, 50, True, False, True), - 'default_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 0.05, 1, 2, 2, 0.02, 0.005, 100, False, True, False), - 'sensitive_pacbio':ModelConstructionStrategy(1, 0.5, 5, 2, 0.005, 1, 0.01, 0.02, 1, 2, 2, 0.005, 0.001, 100, False, True, False), - 'default_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.02, 1, 0.05, 0.05, 1, 3, 3, 0.02, 0.02, 10, False, False, True), - 'sensitive_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.005, 1, 0.01, 0.02, 1, 2, 3, 0.005, 0.005, 10, False, True, False), - 'fl_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 0.01, 1, 2, 3, 0.02, 0.005, 100, True, True, False), - 'all': ModelConstructionStrategy(0, 0.3, 5, 1, 0.002, 1, 0.01, 0.01, 1, 1, 1, 0.002, 0.001, 500, False, True, False), - 'assembly': ModelConstructionStrategy(0, 0.3, 5, 1, 0.05, 1, 0.01, 0.02, 1, 1, 1, 0.05, 0.01, 50, False, True, False) + 'reliable': ModelConstructionStrategy(2, 0.5, 20, 5, 0.05, 1, 0.1, 0.1, 2, 4, 8, 0.05, 0.05, 50, + True, False, True, True), + 'default_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 0.05, 1, 2, 2, 0.02, 0.005, 100, + False, True, False, True), + 'sensitive_pacbio':ModelConstructionStrategy(1, 0.5, 5, 2, 0.005, 1, 0.01, 0.02, 1, 2, 2, 0.005, 0.001, 100, + False, True, False, False), + 'default_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.02, 1, 0.05, 0.05, 1, 3, 3, 0.02, 0.02, 10, + False, False, True, True), + 'sensitive_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.005, 1, 0.01, 0.02, 1, 2, 3, 0.005, 0.005, 10, + False, True, False, False), + 'fl_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 0.01, 1, 2, 3, 0.02, 0.005, 100, + True, True, False, False), + 'all': ModelConstructionStrategy(0, 0.3, 5, 1, 0.002, 1, 0.01, 0.01, 1, 1, 1, 0.002, 0.001, 500, + False, True, False, False), + 'assembly': ModelConstructionStrategy(0, 0.3, 5, 1, 0.05, 1, 0.01, 0.02, 1, 1, 1, 0.05, 0.01, 50, + False, True, False, False) } strategy = strategies[args.model_construction_strategy] @@ -639,6 +648,7 @@ def set_model_construction_options(args): "set --report_novel_unspliced true to discover them") args.require_monointronic_polya = strategy.require_monointronic_polya + args.require_monoexonic_polya = strategy.require_monoexonic_polya args.polya_requirement_strategy = PolyAUsageStrategies[args.polya_requirement] diff --git a/src/dataset_processor.py b/src/dataset_processor.py index fa9b3472..5a28062f 100644 --- a/src/dataset_processor.py +++ b/src/dataset_processor.py @@ -433,7 +433,10 @@ def process_sample(self, sample): # do not require polyA tails for mono-intronic only if the data is reliable and polyA percentage is low self.args.require_monointronic_polya or self.args.requires_polya_for_construction, self.args.polya_requirement_strategy) - + self.args.require_monoexonic_polya = 
set_polya_requirement_strategy( + # do not require polyA tails for mono-intronic only if the data is reliable and polyA percentage is low + self.args.require_monoexonic_polya or self.args.requires_polya_for_construction, + self.args.polya_requirement_strategy) self.process_assigned_reads(sample, saves_file) if not self.args.read_assignments and not self.args.keep_tmp: @@ -538,7 +541,7 @@ def process_assigned_reads(self, sample, dump_filename): ) logger.info("Processing assigned reads " + sample.prefix) logger.info(" Transcript models construction is turned %s" % - "off" if self.args.no_model_construction else "on") + ("off" if self.args.no_model_construction else "on")) # set up aggregators and outputs aggregator = ReadAssignmentAggregator(self.args, sample, self.all_read_groups) @@ -546,14 +549,17 @@ def process_assigned_reads(self, sample, dump_filename): if not self.args.no_model_construction: logger.info("Transcript construction options:") - logger.info(" Novel monoexonic transcripts will be reported: %s" % - "yes" if self.args.report_novel_unspliced else "no") - logger.info(" Presence of polyA tail is required for a multi-exon transcript to be reported: %s" - % "yes" if self.args.requires_polya_for_construction else "no") - logger.info(" Presence of polyA tail is required for a 2-exon transcript to be reported: %s" - % "yes" if self.args.require_monointronic_polya else "no") - logger.info(" Report transcript for which strand cannot be detected using canonical splice sites: %s" - % "yes" if self.args.report_unstranded else "no") + logger.info(" Novel monoexonic transcripts will be reported: %s" + % ("yes" if self.args.report_novel_unspliced else "no")) + logger.info(" PolyA tails are required for multi-exon transcripts to be reported: %s" + % ("yes" if self.args.requires_polya_for_construction else "no")) + logger.info(" PolyA tails are required for 2-exon transcripts to be reported: %s" + % ("yes" if self.args.require_monointronic_polya else "no")) + logger.info(" PolyA tails are required for known monoexon transcripts to be reported: %s" + % ("yes" if self.args.require_monoexonic_polya else "no")) + logger.info(" PolyA tails are required for novel monoexon transcripts to be reported: %s" % "yes") + logger.info(" Report transcript for which the strand cannot be detected using canonical splice sites: %s" + % ("yes" if self.args.report_unstranded else "no")) gff_printer = GFFPrinter( sample.out_dir, sample.prefix, self.io_support, header=self.common_header diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index 7bbee789..3c1cc8c0 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -128,6 +128,7 @@ def process(self, read_assignment_storage): self.construct_fl_isoforms() self.construct_assignment_based_isoforms(read_assignment_storage) + # FIXME: assign reads AFTER filtering self.assign_reads_to_models(read_assignment_storage) self.filter_transcripts() @@ -596,7 +597,8 @@ def construct_monoexon_isoforms(self, mono_exon_isoform_reads, mono_exon_isoform polya_support = polya_sites[isoform_id] # logger.debug(">> Monoexon transcript %s: %d\t%d\t%.4f\t%d" % (isoform_id, self.intron_graph.max_coverage, count, coverage, polya_support)) - if count < self.params.min_known_count or coverage < self.params.min_mono_exon_coverage or polya_support == 0: + if (count < self.params.min_known_count or coverage < self.params.min_mono_exon_coverage or + (self.params.require_monoexonic_polya and polya_support == 0)): pass # 
logger.debug(">> Will NOT be added, abs cutoff=%d" % (self.params.min_known_count)) elif isoform_id not in GraphBasedModelConstructor.detected_known_isoforms: new_model = self.transcript_from_reference(isoform_id) From 47db3b434001223e488eb527fc280f839a82dfb1 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Tue, 26 Dec 2023 17:03:11 +0300 Subject: [PATCH 05/10] fix help message --- isoquant.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/isoquant.py b/isoquant.py index 328eaa3f..0dff1955 100755 --- a/isoquant.py +++ b/isoquant.py @@ -153,7 +153,7 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help "default: False for ONT, True for other data types") add_additional_option("--polya_requirement", type=str, choices=[e.name for e in PolyAUsageStrategies], help="require polyA tails to be present when reporting transcripts (default/never/always), " - "default: require polyA only when polyA percentage is >= 70%", + "default: require polyA only when polyA percentage is >= 70%%", default=PolyAUsageStrategies.default.name) # OUTPUT PROPERTIES pipeline_args_group.add_argument("--threads", "-t", help="number of threads to use", type=int, From d9315b50c17014d1f5f1562fd7303b44085a9824 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Wed, 27 Dec 2023 02:08:19 +0300 Subject: [PATCH 06/10] fix SS strand filtering --- src/dataset_processor.py | 2 +- src/graph_based_model_construction.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/dataset_processor.py b/src/dataset_processor.py index 5a28062f..8afe13dd 100644 --- a/src/dataset_processor.py +++ b/src/dataset_processor.py @@ -540,7 +540,7 @@ def process_assigned_reads(self, sample, dump_filename): reverse=True ) logger.info("Processing assigned reads " + sample.prefix) - logger.info(" Transcript models construction is turned %s" % + logger.info("Transcript models construction is turned %s" % ("off" if self.args.no_model_construction else "on")) # set up aggregators and outputs diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index 3c1cc8c0..6520418e 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -377,7 +377,8 @@ def construct_fl_isoforms(self): if count < novel_isoform_cutoff: # logger.debug("uuu Novel isoform %s has low coverage: %d\t%d" % (new_transcript_id, count, novel_isoform_cutoff)) pass - elif len(novel_exons) == 2 and self.params.require_monointronic_polya and not polya_site: + elif (len(novel_exons) == 2 and + ((self.params.require_monointronic_polya and not polya_site) or transcript_ss_strand == '.')): # logger.debug("uuu Avoiding single intron %s isoform: %d\t%s" % (new_transcript_id, count, str(path))) pass elif transcript_strand == '.' and not self.params.report_unstranded: From 8adb85d8dde33f0b4682cff7032607197153d955 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Wed, 27 Dec 2023 02:44:35 +0300 Subject: [PATCH 07/10] allow user to set splice site filtration strategy --- README.md | 12 +++++++++--- isoquant.py | 16 ++++++++++------ src/gene_info.py | 15 ++++++++++++++- src/graph_based_model_construction.py | 19 +++++++++++++++---- 4 files changed, 48 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index dce014ad..4eb69179 100644 --- a/README.md +++ b/README.md @@ -522,6 +522,15 @@ The main explanation that some aligner report a lot of false unspliced alignment for ONT reads. 
+`--report_canonical` + Strategy for reporting novel transcripts based on canonical splice sites, should be one of: + +* `only_canonical` - report novel transcripts, which contain only canonical splice sites; +* `only_stranded` - report novel transcripts, for which the strand can be unambiguously derived using splice sites and +presence of a polyA tail, allowing some splice sites to be non-canonical (default); +* `all` -- report all transcript model regardless of their splice sites. + + `--polya_requirement` Strategy for using polyA tails during transcript model construction, should be one of: * `default` - polyA tails are required if at least 70% of the reads have polyA tail; @@ -541,9 +550,6 @@ We recommend _not_ to modify these options unless you are clearly aware of their `--no_secondary` Ignore secondary alignments. -`--report_unstranded` - Report transcripts for which the strand cannot be detected using canonical splice sites. - `--aligner` Force to use this alignment method, can be `starlong` or `minimap2`; `minimap2` is currently used as default. Make sure the specified aligner is in the `$PATH` variable. diff --git a/isoquant.py b/isoquant.py index 0dff1955..8e473f65 100755 --- a/isoquant.py +++ b/isoquant.py @@ -30,6 +30,7 @@ DataSetReadMapper ) from src.dataset_processor import DatasetProcessor, PolyAUsageStrategies +from src.graph_based_model_construction import StrandnessReportingLevel from src.long_read_assigner import AmbiguityResolvingMethod from src.long_read_counter import COUNTING_STRATEGIES from src.input_data_storage import InputDataStorage @@ -150,9 +151,15 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help help="gene quantification strategy", type=str, default="with_inconsistent") add_additional_option("--report_novel_unspliced", "-u", type=bool_str, help="report novel monoexonic transcripts (true/false), " - "default: False for ONT, True for other data types") + "default: false for ONT, true for other data types") + add_additional_option("--report_canonical", type=str, choices=[e.name for e in StrandnessReportingLevel], + help="reporting level for novel transcripts based on canonical splice sites " + + "/".join([e.name for e in StrandnessReportingLevel]) + + "default: " + StrandnessReportingLevel.only_canonical.name, + default=StrandnessReportingLevel.only_canonical.name) add_additional_option("--polya_requirement", type=str, choices=[e.name for e in PolyAUsageStrategies], - help="require polyA tails to be present when reporting transcripts (default/never/always), " + help="require polyA tails to be present when reporting transcripts " + + "/".join([e.name for e in StrandnessReportingLevel]) + "default: require polyA only when polyA percentage is >= 70%%", default=PolyAUsageStrategies.default.name) # OUTPUT PROPERTIES @@ -160,10 +167,6 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help default="16") pipeline_args_group.add_argument('--check_canonical', action='store_true', default=False, help="report whether splice junctions are canonical") - add_additional_option_to_group(pipeline_args_group, "--report_unstranded", - help="report transcripts for which the strand cannot be detected " - "using canonical splice sites", - action='store_true', default=False) pipeline_args_group.add_argument("--sqanti_output", help="produce SQANTI-like TSV output", action='store_true', default=False) pipeline_args_group.add_argument("--count_exons", help="perform exon and intron counting", @@ -650,6 +653,7 @@ def 
set_model_construction_options(args): args.require_monointronic_polya = strategy.require_monointronic_polya args.require_monoexonic_polya = strategy.require_monoexonic_polya args.polya_requirement_strategy = PolyAUsageStrategies[args.polya_requirement] + args.report_canonical_strategy = StrandnessReportingLevel[args.report_canonical] def set_configs_directory(args): diff --git a/src/gene_info.py b/src/gene_info.py index 88ce30f2..2a0e318d 100644 --- a/src/gene_info.py +++ b/src/gene_info.py @@ -635,7 +635,7 @@ def set_strand(self, intron, strand=None): elif self.chr_record: self.strand_dict[intron] = get_intron_strand(intron, self.chr_record) - def get_strand(self, introns, has_polya=False, has_polyt=False): + def count_canonical_sites(self, introns): count_fwd = 0 count_rev = 0 for intron in introns: @@ -648,6 +648,19 @@ def get_strand(self, introns, has_polya=False, has_polyt=False): count_fwd += 1 elif strand == '-': count_rev += 1 + return count_fwd, count_rev + + # all splice sites must be canonical from the same strand, not just the majority + def get_clean_strand(self, introns): + count_fwd, count_rev = self.count_canonical_sites(introns) + if count_fwd == 0 and count_rev > 0: + return '-' + elif count_fwd > 0 and count_rev == 0: + return '+' + return '.' + + def get_strand(self, introns, has_polya=False, has_polyt=False): + count_fwd, count_rev = self.count_canonical_sites(introns) if count_fwd == count_rev: if has_polya and not has_polyt: return '+' diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index 6520418e..fb135e45 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -7,6 +7,7 @@ import logging from collections import defaultdict from functools import cmp_to_key +from enum import unique, Enum from .common import ( AtomicCounter, @@ -30,6 +31,13 @@ logger = logging.getLogger('IsoQuant') +@unique +class StrandnessReportingLevel(Enum): + only_canonical = 1 + only_stranded = 2 + all = 3 + + class GraphBasedModelConstructor: transcript_id_counter = AtomicCounter() transcript_prefix = "transcript" @@ -370,7 +378,7 @@ def construct_fl_isoforms(self): has_polya = path[-1][0] == VERTEX_polya polya_site = has_polya or has_polyt transcript_strand = self.strand_detector.get_strand(intron_path, has_polya, has_polyt) - transcript_ss_strand = self.strand_detector.get_strand(intron_path) + transcript_clean_strand = self.strand_detector.get_clean_strand(intron_path) #logger.debug("uuu Novel isoform %s has coverage: %d cutoff = %d, component cov = %d, max_coverage = %d" # % (new_transcript_id, count, novel_isoform_cutoff, component_coverage, self.intron_graph.max_coverage)) @@ -378,11 +386,14 @@ def construct_fl_isoforms(self): # logger.debug("uuu Novel isoform %s has low coverage: %d\t%d" % (new_transcript_id, count, novel_isoform_cutoff)) pass elif (len(novel_exons) == 2 and - ((self.params.require_monointronic_polya and not polya_site) or transcript_ss_strand == '.')): + ((self.params.require_monointronic_polya and not polya_site) or transcript_clean_strand == '.')): # logger.debug("uuu Avoiding single intron %s isoform: %d\t%s" % (new_transcript_id, count, str(path))) pass - elif transcript_strand == '.' 
and not self.params.report_unstranded: - logger.info("Avoiding unreliable transcript with %d exons" % len(novel_exons)) + elif ((self.params.report_canonical_strategy == StrandnessReportingLevel.only_canonical + and transcript_clean_strand == '.') + or (self.params.report_canonical_strategy == StrandnessReportingLevel.only_stranded + and transcript_strand == '.')): + logger.debug("Avoiding unreliable transcript with %d exons" % len(novel_exons)) pass else: if self.params.use_technical_replicas and \ From fc4366d4dc276d0a6d75079e46775617c97bd376 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Wed, 27 Dec 2023 03:19:25 +0300 Subject: [PATCH 08/10] introduce automatic filtering selection --- README.md | 5 ++-- isoquant.py | 43 ++++++++++++++------------- src/dataset_processor.py | 7 ++--- src/graph_based_model_construction.py | 1 + 4 files changed, 29 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index 4eb69179..713f615f 100644 --- a/README.md +++ b/README.md @@ -525,15 +525,16 @@ for ONT reads. `--report_canonical` Strategy for reporting novel transcripts based on canonical splice sites, should be one of: +* `auto` - automatic selection based on the data type and model construction strategy (default); * `only_canonical` - report novel transcripts, which contain only canonical splice sites; * `only_stranded` - report novel transcripts, for which the strand can be unambiguously derived using splice sites and -presence of a polyA tail, allowing some splice sites to be non-canonical (default); +presence of a polyA tail, allowing some splice sites to be non-canonical * `all` -- report all transcript model regardless of their splice sites. `--polya_requirement` Strategy for using polyA tails during transcript model construction, should be one of: -* `default` - polyA tails are required if at least 70% of the reads have polyA tail; +* `auto` - default behaviour: polyA tails are required if at least 70% of the reads have polyA tail; polyA tails are always required for 1/2-exon transcripts when using ONT data (this is caused by elevated number of false 1/2-exonic alignments reported by minimap2); * `never` - polyA tails are never required; use this option **at your own risk** as it may noticeably increase false discovery rate, especially for ONT data; * `always` - reported transcripts are always required to have polyA support in the reads. 
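For illustration, a stricter run combining the two options documented above might be launched as follows. This is only a sketch: the input file names are placeholders and every other option keeps its default value.

```bash
# Hypothetical invocation: report only novel transcripts built from canonical
# splice sites and require polyA support for every reported model.
isoquant.py --reference genome.fasta --genedb annotation.gtf \
    --fastq reads.fastq.gz --data_type nanopore -o isoquant_output \
    --report_canonical only_canonical --polya_requirement always
```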
diff --git a/isoquant.py b/isoquant.py index 8e473f65..be1e39ca 100755 --- a/isoquant.py +++ b/isoquant.py @@ -49,10 +49,10 @@ def bool_str(s): def parse_args(cmd_args=None, namespace=None): parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter) - ref_args_group = parser.add_argument_group('Reference data:') - input_args_group = parser.add_argument_group('Input data:') - output_args_group = parser.add_argument_group('Output naming:') - pipeline_args_group = parser.add_argument_group('Pipeline options:') + ref_args_group = parser.add_argument_group('Reference data') + input_args_group = parser.add_argument_group('Input data') + output_args_group = parser.add_argument_group('Output naming') + pipeline_args_group = parser.add_argument_group('Pipeline options') other_options = parser.add_argument_group("Additional options:") show_full_help = '--full_help' in cmd_args @@ -153,15 +153,13 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help help="report novel monoexonic transcripts (true/false), " "default: false for ONT, true for other data types") add_additional_option("--report_canonical", type=str, choices=[e.name for e in StrandnessReportingLevel], - help="reporting level for novel transcripts based on canonical splice sites " + - "/".join([e.name for e in StrandnessReportingLevel]) + - "default: " + StrandnessReportingLevel.only_canonical.name, - default=StrandnessReportingLevel.only_canonical.name) + help="reporting level for novel transcripts based on canonical splice sites;" + " default: " + StrandnessReportingLevel.auto.name, + default=StrandnessReportingLevel.only_stranded.name) add_additional_option("--polya_requirement", type=str, choices=[e.name for e in PolyAUsageStrategies], - help="require polyA tails to be present when reporting transcripts " + - "/".join([e.name for e in StrandnessReportingLevel]) + - "default: require polyA only when polyA percentage is >= 70%%", - default=PolyAUsageStrategies.default.name) + help="require polyA tails to be present when reporting transcripts; " + "default: auto (requires polyA only when polyA percentage is >= 70%%)", + default=PolyAUsageStrategies.auto.name) # OUTPUT PROPERTIES pipeline_args_group.add_argument("--threads", "-t", help="number of threads to use", type=int, default="16") @@ -600,24 +598,25 @@ def set_model_construction_options(args): 'min_novel_count', 'min_novel_count_rel', 'min_mono_count_rel', 'singleton_adjacent_cov', 'fl_only', 'novel_monoexonic', - 'require_monointronic_polya', 'require_monoexonic_polya')) + 'require_monointronic_polya', 'require_monoexonic_polya', + 'report_canonical')) strategies = { 'reliable': ModelConstructionStrategy(2, 0.5, 20, 5, 0.05, 1, 0.1, 0.1, 2, 4, 8, 0.05, 0.05, 50, - True, False, True, True), + True, False, True, True, StrandnessReportingLevel.only_canonical), 'default_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 0.05, 1, 2, 2, 0.02, 0.005, 100, - False, True, False, True), + False, True, False, True, StrandnessReportingLevel.only_stranded), 'sensitive_pacbio':ModelConstructionStrategy(1, 0.5, 5, 2, 0.005, 1, 0.01, 0.02, 1, 2, 2, 0.005, 0.001, 100, - False, True, False, False), + False, True, False, False, StrandnessReportingLevel.only_stranded), 'default_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.02, 1, 0.05, 0.05, 1, 3, 3, 0.02, 0.02, 10, - False, False, True, True), + False, False, True, True, StrandnessReportingLevel.only_stranded), 'sensitive_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.005, 1, 
0.01, 0.02, 1, 2, 3, 0.005, 0.005, 10, - False, True, False, False), + False, True, False, False, StrandnessReportingLevel.only_stranded), 'fl_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 0.01, 1, 2, 3, 0.02, 0.005, 100, - True, True, False, False), + True, True, False, False, StrandnessReportingLevel.only_stranded), 'all': ModelConstructionStrategy(0, 0.3, 5, 1, 0.002, 1, 0.01, 0.01, 1, 1, 1, 0.002, 0.001, 500, - False, True, False, False), + False, True, False, False, StrandnessReportingLevel.all), 'assembly': ModelConstructionStrategy(0, 0.3, 5, 1, 0.05, 1, 0.01, 0.02, 1, 1, 1, 0.05, 0.01, 50, - False, True, False, False) + False, True, False, False, StrandnessReportingLevel.only_stranded) } strategy = strategies[args.model_construction_strategy] @@ -654,6 +653,8 @@ def set_model_construction_options(args): args.require_monoexonic_polya = strategy.require_monoexonic_polya args.polya_requirement_strategy = PolyAUsageStrategies[args.polya_requirement] args.report_canonical_strategy = StrandnessReportingLevel[args.report_canonical] + if args.report_canonical_strategy == StrandnessReportingLevel.auto: + args.report_canonical_strategy = strategy.report_canonical def set_configs_directory(args): diff --git a/src/dataset_processor.py b/src/dataset_processor.py index 8afe13dd..9c16f82a 100644 --- a/src/dataset_processor.py +++ b/src/dataset_processor.py @@ -74,13 +74,13 @@ def clean_locks(chr_ids, base_name, fname_function): @unique class PolyAUsageStrategies(Enum): - default = 1 + auto = 1 never = 2 always = 3 def set_polya_requirement_strategy(flag, polya_requirement_strategy): - if polya_requirement_strategy == PolyAUsageStrategies.default: + if polya_requirement_strategy == PolyAUsageStrategies.auto: return flag elif polya_requirement_strategy == PolyAUsageStrategies.never: return False @@ -558,8 +558,7 @@ def process_assigned_reads(self, sample, dump_filename): logger.info(" PolyA tails are required for known monoexon transcripts to be reported: %s" % ("yes" if self.args.require_monoexonic_polya else "no")) logger.info(" PolyA tails are required for novel monoexon transcripts to be reported: %s" % "yes") - logger.info(" Report transcript for which the strand cannot be detected using canonical splice sites: %s" - % ("yes" if self.args.report_unstranded else "no")) + logger.info(" Splice site reporting level: %s" % self.args.report_canonical_strategy.name) gff_printer = GFFPrinter( sample.out_dir, sample.prefix, self.io_support, header=self.common_header diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index fb135e45..526deff8 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -36,6 +36,7 @@ class StrandnessReportingLevel(Enum): only_canonical = 1 only_stranded = 2 all = 3 + auto = 10 class GraphBasedModelConstructor: From 864e150be04512786d0760473a258a22d338577c Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Wed, 27 Dec 2023 17:03:30 +0300 Subject: [PATCH 09/10] set default strategy to only_canonical --- isoquant.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/isoquant.py b/isoquant.py index be1e39ca..a4753cb4 100755 --- a/isoquant.py +++ b/isoquant.py @@ -604,15 +604,15 @@ def set_model_construction_options(args): 'reliable': ModelConstructionStrategy(2, 0.5, 20, 5, 0.05, 1, 0.1, 0.1, 2, 4, 8, 0.05, 0.05, 50, True, False, True, True, StrandnessReportingLevel.only_canonical), 'default_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 
0.05, 1, 2, 2, 0.02, 0.005, 100, - False, True, False, True, StrandnessReportingLevel.only_stranded), + False, True, False, True, StrandnessReportingLevel.only_canonical), 'sensitive_pacbio':ModelConstructionStrategy(1, 0.5, 5, 2, 0.005, 1, 0.01, 0.02, 1, 2, 2, 0.005, 0.001, 100, False, True, False, False, StrandnessReportingLevel.only_stranded), 'default_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.02, 1, 0.05, 0.05, 1, 3, 3, 0.02, 0.02, 10, - False, False, True, True, StrandnessReportingLevel.only_stranded), + False, False, True, True, StrandnessReportingLevel.only_canonical), 'sensitive_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.005, 1, 0.01, 0.02, 1, 2, 3, 0.005, 0.005, 10, False, True, False, False, StrandnessReportingLevel.only_stranded), 'fl_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 0.01, 1, 2, 3, 0.02, 0.005, 100, - True, True, False, False, StrandnessReportingLevel.only_stranded), + True, True, False, False, StrandnessReportingLevel.only_canonical), 'all': ModelConstructionStrategy(0, 0.3, 5, 1, 0.002, 1, 0.01, 0.01, 1, 1, 1, 0.002, 0.001, 500, False, True, False, False, StrandnessReportingLevel.all), 'assembly': ModelConstructionStrategy(0, 0.3, 5, 1, 0.05, 1, 0.01, 0.02, 1, 1, 1, 0.05, 0.01, 50, From af910aa788b61aeaeb61ed2402d9fac65ff15792 Mon Sep 17 00:00:00 2001 From: Andrey Prjibelski Date: Wed, 27 Dec 2023 17:26:07 +0300 Subject: [PATCH 10/10] remove low_memory --- README.md | 6 ++---- isoquant.py | 13 ++++++------- src/alignment_processor.py | 4 ++-- src/dataset_processor.py | 2 +- 4 files changed, 11 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 713f615f..b4eb3100 100644 --- a/README.md +++ b/README.md @@ -568,11 +568,9 @@ We recommend _not_ to modify these options unless you are clearly aware of their for storing the annotation database, because SQLite database cannot be created on a shared disks. The folder will be created automatically. -`--low_memory` - Deprecated, default behaviour since 3.2. - `--high_memory` - Cache read alignments instead for making several passes over a BAM file, noticeably increases RAM usage. + Cache read alignments instead for making several passes over a BAM file, noticeably increases RAM usage, +but may improve running time when disk I/O is relatively slow. `--min_mapq` Filers out all alignments with MAPQ less than this value (will also filter all secondary alignments, as they typically have MAPQ = 0). 
diff --git a/isoquant.py b/isoquant.py index a4753cb4..dd8bbd27 100755 --- a/isoquant.py +++ b/isoquant.py @@ -185,8 +185,6 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help help="align reads to reference without running further analysis") # ADDITIONAL - add_additional_option("--low_memory", help="decrease RAM consumption (deprecated, set by default)", - action='store_true', default=True) add_additional_option("--high_memory", help="increase RAM consumption (store alignment and the genome in RAM)", action='store_true', default=False) add_additional_option("--no_junc_bed", action="store_true", default=False, @@ -239,8 +237,12 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help type=str, required=True) resume_parser.add_argument('--debug', action='store_true', default=argparse.SUPPRESS, help='Debug log output.') - resume_parser.add_argument("--threads", "-t", help="number of threads to use", type=int, default=argparse.SUPPRESS) - resume_parser.add_argument("--low_memory", help="decrease RAM consumption (deprecated, set by default)", + resume_parser.add_argument("--threads", "-t", help="number of threads to use", + type=int, default=argparse.SUPPRESS) + resume_parser.add_argument("--high_memory", + help="increase RAM consumption (store alignment and the genome in RAM)", + action='store_true', default=False) + resume_parser.add_argument("--keep_tmp", help="do not remove temporary files in the end", action='store_true', default=argparse.SUPPRESS) resume_parser.add_argument("--keep_tmp", help="do not remove temporary files in the end", action='store_true', default=argparse.SUPPRESS) @@ -294,9 +296,6 @@ def check_and_load_args(args, parser): elif not os.path.exists(args.genedb_output): os.makedirs(args.genedb_output) - if args.high_memory: - args.low_memory = False - if not check_input_params(args): parser.print_usage() exit(-1) diff --git a/src/alignment_processor.py b/src/alignment_processor.py index d916ff94..67df2262 100644 --- a/src/alignment_processor.py +++ b/src/alignment_processor.py @@ -229,7 +229,7 @@ def __init__(self, chr_id, bam_pairs, params, illumina_bam, genedb=None, chr_rec self.bam_merger = BAMOnlineMerger(self.bam_pairs, self.chr_id, 0, self.bam_pairs[0][0].get_reference_length(self.chr_id), - multiple_iterators=self.params.low_memory) + multiple_iterators=not self.params.high_memory) self.strand_detector = StrandDetector(self.chr_record) self.read_groupper = read_groupper self.polya_finder = PolyAFinder(self.params.polya_window, self.params.polya_fraction) @@ -238,7 +238,7 @@ def __init__(self, chr_id, bam_pairs, params, illumina_bam, genedb=None, chr_rec def process(self): current_region = None - alignment_storage = BAMAlignmentStorage(self.bam_merger) if self.params.low_memory else InMemoryAlignmentStorage() + alignment_storage = BAMAlignmentStorage(self.bam_merger) if not self.params.high_memory else InMemoryAlignmentStorage() for bam_index, alignment in self.bam_merger.get(): if alignment_storage.alignment_is_not_adjacent(alignment): diff --git a/src/dataset_processor.py b/src/dataset_processor.py index 9c16f82a..8559097a 100644 --- a/src/dataset_processor.py +++ b/src/dataset_processor.py @@ -90,7 +90,7 @@ def set_polya_requirement_strategy(flag, polya_requirement_strategy): def collect_reads_in_parallel(sample, chr_id, args): current_chr_record = Fasta(args.reference)[chr_id] - if not args.low_memory: + if args.high_memory: current_chr_record = str(current_chr_record) read_grouper = create_read_grouper(args, 
sample, chr_id) lock_file = reads_collected_lock_file_name(sample.out_raw_file, chr_id)