From ff9eab67c90cb8869ad48dc064e4623b471a7b53 Mon Sep 17 00:00:00 2001 From: mschechter Date: Fri, 8 Dec 2023 16:39:40 +0100 Subject: [PATCH] update filter-hmm-hits flag names --- anvio/workflows/ecophylo/Snakefile | 6 +- anvio/workflows/ecophylo/__init__.py | 4 +- sandbox/anvi-script-filter-hmm-hits-table | 70 +++++++++++------------ 3 files changed, 40 insertions(+), 40 deletions(-) diff --git a/anvio/workflows/ecophylo/Snakefile b/anvio/workflows/ecophylo/Snakefile index 287b494a65..06f0e271d1 100644 --- a/anvio/workflows/ecophylo/Snakefile +++ b/anvio/workflows/ecophylo/Snakefile @@ -183,7 +183,7 @@ rule filter_hmm_hits_by_model_coverage: done = os.path.join(dirs_dict['EXTRACTED_RIBO_PROTEINS_DIR'], "{sample_name}", "{hmm}-dom-hmmsearch", "{sample_name}-{hmm}-DB_filtered.done"), hmm_hits_filtered = os.path.join(dirs_dict['EXTRACTED_RIBO_PROTEINS_DIR'], "{sample_name}", "{hmm}-dom-hmmsearch", "hmm_hits_filtered.txt") params: - model_coverage = M.get_param_value_from_config(['filter_hmm_hits_by_model_coverage', '--model-coverage']), + model_coverage = M.get_param_value_from_config(['filter_hmm_hits_by_model_coverage', '--min-model-coverage']), partial_ORFs = M.get_rule_param('filter_hmm_hits_by_model_coverage', '--filter-out-partial-gene-calls'), additional_params = M.get_param_value_from_config(['filter_hmm_hits_by_model_coverage', 'additional_params']) threads: M.T('filter_hmm_hits_by_model_coverage') @@ -212,7 +212,7 @@ rule filter_hmm_hits_by_model_coverage: shell("anvi-script-filter-hmm-hits-table -c {contigs_db} \ --domain-hits-table {domtblout} \ --hmm-source {hmm_source} \ - --model-coverage {params.model_coverage} \ + --min-model-coverage {params.model_coverage} \ {params.partial_ORFs} \ {params.additional_params} >> {log} 2>&1") else: @@ -221,7 +221,7 @@ rule filter_hmm_hits_by_model_coverage: --domain-hits-table {domtblout} \ --hmm-profile-dir {hmm_dir} \ --hmm-source {hmm_source} \ - --model-coverage {params.model_coverage} \ + --min-model-coverage {params.model_coverage} \ {params.partial_ORFs} \ {params.additional_params} >> {log} 2>&1") diff --git a/anvio/workflows/ecophylo/__init__.py b/anvio/workflows/ecophylo/__init__.py index 848be11362..7245bcc5e9 100644 --- a/anvio/workflows/ecophylo/__init__.py +++ b/anvio/workflows/ecophylo/__init__.py @@ -77,7 +77,7 @@ def __init__(self, args=None, run=terminal.Run(), progress=terminal.Progress()): rule_acceptable_params_dict = {} rule_acceptable_params_dict['anvi_run_hmms_hmmsearch'] = ['threads_genomes', 'threads_metagenomes', 'additional_params'] - rule_acceptable_params_dict['filter_hmm_hits_by_model_coverage'] = ['--model-coverage', '--filter-out-partial-gene-calls', 'additional_params'] + rule_acceptable_params_dict['filter_hmm_hits_by_model_coverage'] = ['--min-model-coverage', '--min-gene-coverage', '--filter-out-partial-gene-calls', 'additional_params'] rule_acceptable_params_dict['cluster_X_percent_sim_mmseqs'] = ['--min-seq-id', '--cov-mode', 'clustering_threshold_for_OTUs', 'AA_mode'] rule_acceptable_params_dict['align_sequences'] = ['additional_params'] rule_acceptable_params_dict['trim_alignment'] = ['-gt', "-gappyout", 'additional_params'] @@ -96,7 +96,7 @@ def __init__(self, args=None, run=terminal.Run(), progress=terminal.Progress()): 'samples_txt': 'samples.txt', 'cluster_representative_method': {'method': 'mmseqs'}, 'anvi_run_hmms_hmmsearch': {'threads_genomes': 1, 'threads_metagenomes': 5}, - 'filter_hmm_hits_by_model_coverage': {'threads': 5, '--model-coverage': 0.8, '--filter-out-partial-gene-calls': True}, + 'filter_hmm_hits_by_model_coverage': {'threads': 5, '--min-model-coverage': 0.8, '--filter-out-partial-gene-calls': True}, 'process_hmm_hits': {'threads': 2}, 'combine_sequence_data': {'threads': 2}, 'anvi_get_external_gene_calls_file': {'threads': 2}, diff --git a/sandbox/anvi-script-filter-hmm-hits-table b/sandbox/anvi-script-filter-hmm-hits-table index ea3be8aa2d..c09b6a88b4 100755 --- a/sandbox/anvi-script-filter-hmm-hits-table +++ b/sandbox/anvi-script-filter-hmm-hits-table @@ -48,8 +48,8 @@ class FilterHmmHitsTable(object): self.contigs_db_path=A("contigs_db") self.domtblout=A("domain_hits_table") self.hmm_source=A("hmm_source") - self.gene_coverage=A("gene_coverage") - self.model_coverage=A("model_coverage") + self.min_gene_coverage=A("min_gene_coverage") + self.min_model_coverage=A("min_model_coverage") self.list_hmm_sources=A("list_hmm_sources") self.hmm_profile_dir=A("hmm_profile_dir") self.merge_partial_hits_within_X_nts = A('merge_partial_hits_within_X_nts') @@ -60,8 +60,8 @@ class FilterHmmHitsTable(object): sys.exit() if self.hmm_source not in anvio.data.hmm.sources and not self.hmm_profile_dir: - raise ConfigError("Hold up, if you are using a external HMM source, you need to provide the path to the HMM directory " - "with the parameter --hmm-profile-dir") + raise ConfigError(f"Hold up, if you are using a external HMM source, you need to provide the path to the HMM directory " + f"with the parameter --hmm-profile-dir") # Grab HMM sources from internal or external HMMs if args.hmm_profile_dir: @@ -109,26 +109,26 @@ class FilterHmmHitsTable(object): filesnpaths.is_file_exists(self.domtblout) self.run.info("Domtblout Path", self.domtblout) - if not self.model_coverage or self.gene_coverage: + if not self.min_model_coverage or self.min_gene_coverage: raise ConfigError("You didn't provide anvi-script-filter-hmm-hits-table with either a " - "--model-coverage or --gene-coverage. Please provide at least one " + "--min-model-coverage or --min-gene-coverage. Please provide at least one " "so anvi'o can filter hmm_hits for you :)") - if self.model_coverage: - if self.model_coverage < 0: - raise ConfigError(f"--model-coverage must be a percentage between 0 and 1 " - f"and you put a negative number: {self.model_coverage}") - if self.model_coverage > 1: - raise ConfigError(f"--model-coverage must be a percentage between 0 and 1 " - f"and you put a a value larger than 100%: {self.model_coverage}") - - if self.gene_coverage: - if self.gene_coverage < 0: - raise ConfigError(f"--gene-coverage must be a percentage between 0 and 1 " - f"and you put a negative number: {self.gene_coverage}") - if self.gene_coverage > 1: - raise ConfigError(f"--gene-coverage must be a percentage between 0 and 1 " - f"and you put a a value larger than 100%: {self.gene_coverage}") + if self.min_model_coverage: + if self.min_model_coverage < 0: + raise ConfigError(f"--min-model-coverage must be a percentage between 0 and 1 " + f"and you put a negative number: {self.min_model_coverage}") + if self.min_model_coverage > 1: + raise ConfigError(f"--min-model-coverage must be a percentage between 0 and 1 " + f"and you put a a value larger than 100%: {self.min_model_coverage}") + + if self.min_gene_coverage: + if self.min_gene_coverage < 0: + raise ConfigError(f"--min-gene-coverage must be a percentage between 0 and 1 " + f"and you put a negative number: {self.min_gene_coverage}") + if self.min_gene_coverage > 1: + raise ConfigError(f"--min-gene-coverage must be a percentage between 0 and 1 " + f"and you put a a value larger than 100%: {self.min_gene_coverage}") if not self.hmm_source: raise ConfigError("Please provide a hmm-source :)") @@ -302,8 +302,8 @@ class FilterHmmHitsTable(object): f"Please look at this error message to find out what happened: " f"{e}") - self.df['model_coverage'] = ((self.df['hmm_stop'] - self.df['hmm_start'])/ self.df['hmm_length']) - self.df['gene_coverage'] = ((self.df['gene_stop'] - self.df['gene_start'])/ self.df['gene_length']) + self.df['min_model_coverage'] = ((self.df['hmm_stop'] - self.df['hmm_start'])/ self.df['hmm_length']) + self.df['min_gene_coverage'] = ((self.df['gene_stop'] - self.df['gene_start'])/ self.df['gene_length']) def filter_domtblout(self): @@ -311,7 +311,7 @@ class FilterHmmHitsTable(object): Filter the hmm_hits table based on HMM model coverage, gene coverage, or open reading frame completeness. Returns: - CSV: Filtered domtblout file from hmmsearch. + CSV: Filtered domtblout file from hmmsearch. str: The file path of the temporary file where the filtered DataFrame is saved. """ @@ -339,13 +339,13 @@ class FilterHmmHitsTable(object): # to more complex filtering tasks. # Alignment coverage filtering conditions - if self.model_coverage and self.gene_coverage: - df_filtered = self.df[(self.df['model_coverage'] > float(self.model_coverage)) & - (self.df['gene_coverage'] > float(self.gene_coverage))] - elif self.gene_coverage: - df_filtered = self.df[self.df['gene_coverage'] > float(self.gene_coverage)] - elif self.model_coverage: - df_filtered = self.df[self.df['model_coverage'] > float(self.model_coverage)] + if self.min_model_coverage and self.min_gene_coverage: + df_filtered = self.df[(self.df['min_model_coverage'] > float(self.min_model_coverage)) & + (self.df['min_gene_coverage'] > float(self.min_gene_coverage))] + elif self.min_gene_coverage: + df_filtered = self.df[self.df['min_gene_coverage'] > float(self.min_gene_coverage)] + elif self.min_model_coverage: + df_filtered = self.df[self.df['min_model_coverage'] > float(self.min_model_coverage)] else: df_filtered = self.df.copy() @@ -354,8 +354,8 @@ class FilterHmmHitsTable(object): self.run.info("Num hmm-hits after alignment filtering", df_filtered.shape[0]) self.run.info("Num hmm-hits filtered from alignment coverage", self.df.shape[0] - df_filtered.shape[0]) - # Reformat domtblout back to the original format by dropping 'model_coverage' and 'gene_coverage' columns - df_final = df_filtered.drop(columns=['model_coverage', 'gene_coverage']) + # Reformat domtblout back to the original format by dropping 'min_model_coverage' and 'min_gene_coverage' columns + df_final = df_filtered.drop(columns=['min_model_coverage', 'min_gene_coverage']) # Additional filtering for partial gene calls if required if self.filter_out_partial_gene_calls: @@ -441,9 +441,9 @@ if __name__ == '__main__': parser.add_argument(*anvio.A('hmm-profile-dir'), **anvio.K('hmm-profile-dir')) parser.add_argument('--domain-hits-table', metavar='PATH', help="Please provide the path to the domain-table-output. " "You can get this file from running anvi-run-hmms with the flag --domain-hits-table.") - parser.add_argument('--gene-coverage',type=float, help="Percent of the gene that is covered by the profile HMM after hmmsearch. " + parser.add_argument('--min-gene-coverage',type=float, help="The minimum percent of the gene that is covered by the profile HMM after hmmsearch. " "This is the formula using the domtblout from hmmsearch: (ali_coord_to - ali_coord_from)/target_length") - parser.add_argument('--model-coverage', type=float, help="Percent of the profile HMM model that is covered by the gene after hmmsearch. " + parser.add_argument('--min-model-coverage', type=float, help="The minimum percent of the profile HMM model that is covered by the gene after hmmsearch. " "This is the formula using the domtblout from hmmsearch: (hmm_coord_to - hmm_coord_from)/hmm_length") parser.add_argument('--merge-partial-hits-within-X-nts', metavar='LENGTH', default=0, type=int, help="Filtering HMM hits based on " "target or query coverage can be difficult when a gene is covered by multiple independent hits of a single "