From e0d536687eb26d5071f8618fba492718cd70a73c Mon Sep 17 00:00:00 2001 From: Arjan Draisma Date: Tue, 22 Aug 2023 10:52:46 +0200 Subject: [PATCH] removed sortedlist typing fixes --- bigscape.py | 38 ++++++++++++++++---------------------- src/genbank/cds.py | 13 ++++--------- src/hmm/hsp.py | 5 +---- 3 files changed, 21 insertions(+), 35 deletions(-) diff --git a/bigscape.py b/bigscape.py index 334b0eba..ea576759 100644 --- a/bigscape.py +++ b/bigscape.py @@ -51,7 +51,7 @@ def load_data(run: RunParameters): profiler = Profiler(run.output.profile_path) profiler.start() - if run.output.db_path.exists(): + if False: logging.info("Loading existing run from disk...") DB.load_from_disk(run.output.db_path) @@ -60,9 +60,6 @@ def load_data(run: RunParameters): for gbk in gbks: HSP.load_all(gbk.genes) - for cds in gbk.genes: - # TODO: remove once we get rid of sortedlists - cds.lock() HMMer.init(run.input.pfam_path) @@ -110,7 +107,7 @@ def callback(tasks_done): if platform.system() == "Darwin": logging.warning("Running on mac-OS: hmmsearch_simple single threaded") - all_hsps = list(HMMer.hmmsearch_simple(all_cds, 1)) + HMMer.hmmsearch_simple(all_cds, 1) else: logging.debug( "Running on %s: hmmsearch_multiprocess with %d cores", @@ -118,30 +115,25 @@ def callback(tasks_done): run.cores, ) - # if legacy is true, set cutoff to 1.1 for the domain filtering so we can use - # legacy filtering later - - if run.legacy: - domain_overlap_cutoff = 1.1 - else: - domain_overlap_cutoff = run.hmmer.domain_overlap_cutoff + # TODO: the overlap filtering in this function does not seem to work HMMer.hmmsearch_multiprocess( all_cds, - domain_overlap_cutoff=domain_overlap_cutoff, + domain_overlap_cutoff=run.hmmer.domain_overlap_cutoff, cores=run.cores, ) # TODO: move, or remove after the add_hsp_overlap function is fixed (if it is broken # in the first place) - for cds in all_cds: - if run.legacy: - cds.hsps = legacy_filter_overlap(cds.hsps, 0.1) - - # TODO: remove when sortedlists are removed - 
# this converts the sortedlist used internally to regular lists. - # for some reason, doing this beforehand really messes things up for reasons I don't - # understand. - cds.lock() + # this sorts all CDS and then filters them using the old filtering system, which + # is less efficient than the filtering using the CDS.add_hsp_overlap_filter + # method. however, that method seems to be broken somehow + all_hsps = [] + for gbk in gbks: + for cds in gbk.genes: + cds.hsps = sorted(cds.hsps) + all_hsps.extend( + [hsp.domain for hsp in legacy_filter_overlap(cds.hsps, 0.1)] + ) all_hsps = [] for cds in all_cds: @@ -171,6 +163,8 @@ def callback(tasks_done): all_alignments = list() for cds in all_cds: for hsp in cds.hsps: + if hsp.alignment is None: + continue all_alignments.append(hsp.alignment) logging.info("%d alignments", len(all_alignments)) diff --git a/src/genbank/cds.py b/src/genbank/cds.py index a99b6ad0..c1872869 100644 --- a/src/genbank/cds.py +++ b/src/genbank/cds.py @@ -7,7 +7,6 @@ # from dependencies from Bio.SeqFeature import SeqFeature -from sortedcontainers import SortedList # from other modules from src.errors import InvalidGBKError @@ -32,7 +31,7 @@ class CDS: gene_kind: str strand: Bool aa_seq: SeqRecord.seq - hsps: SortedList[HSP] + hsps: list[HSP] """ def __init__(self, nt_start: int, nt_stop: int): @@ -43,7 +42,7 @@ def __init__(self, nt_start: int, nt_stop: int): self.gene_kind: Optional[str] = None self.strand: Optional[int] = None self.aa_seq: str = "" - self.hsps: SortedList[HSP] | list[HSP] = SortedList() + self.hsps: list[HSP] = [] self.__locked = False # db specific fields @@ -65,13 +64,9 @@ def add_hsp_overlap_filter(self, new_hsp: HSP, domain_overlap_cutoff=0.1) -> Non 0.1 """ - # TODO: remove once sorted lists are gone entirely - if self.__locked or isinstance(self.hsps, list): - raise AttributeError("Cannot add HSPs to a locked CDS") - # if no hsps added yet, just add and continue if len(self.hsps) == 0: - self.hsps.add(new_hsp) +
self.hsps.append(new_hsp) return delete_list = [] @@ -117,7 +112,7 @@ def add_hsp_overlap_filter(self, new_hsp: HSP, domain_overlap_cutoff=0.1) -> Non # if we got through all of that without the function, we never replaced an HSP # so add a new one here - self.hsps.add(new_hsp) + self.hsps.append(new_hsp) def save(self, commit=True): """Saves this CDS to the database and optionally executes a commit diff --git a/src/hmm/hsp.py b/src/hmm/hsp.py index ff862d1f..670904cf 100644 --- a/src/hmm/hsp.py +++ b/src/hmm/hsp.py @@ -163,10 +163,7 @@ def load_all(cds_list: list[CDS]) -> None: ) new_hsp.alignment = HSPAlignment(new_hsp, result.alignment) - if isinstance(cds.hsps, list): - raise ValueError("HSP list of CDS is a list. Did the CDS get locked?") - - cds.hsps.add(new_hsp) + cds.hsps.append(new_hsp) @staticmethod def has_overlap(hsp_a: HSP, hsp_b: HSP) -> bool: