Skip to content

Commit

Permalink
update filter-hmm-hits flag names
Browse files Browse the repository at this point in the history
  • Loading branch information
mschecht committed Dec 8, 2023
1 parent 1d8f32e commit ff9eab6
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 40 deletions.
6 changes: 3 additions & 3 deletions anvio/workflows/ecophylo/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ rule filter_hmm_hits_by_model_coverage:
done = os.path.join(dirs_dict['EXTRACTED_RIBO_PROTEINS_DIR'], "{sample_name}", "{hmm}-dom-hmmsearch", "{sample_name}-{hmm}-DB_filtered.done"),
hmm_hits_filtered = os.path.join(dirs_dict['EXTRACTED_RIBO_PROTEINS_DIR'], "{sample_name}", "{hmm}-dom-hmmsearch", "hmm_hits_filtered.txt")
params:
model_coverage = M.get_param_value_from_config(['filter_hmm_hits_by_model_coverage', '--model-coverage']),
model_coverage = M.get_param_value_from_config(['filter_hmm_hits_by_model_coverage', '--min-model-coverage']),
partial_ORFs = M.get_rule_param('filter_hmm_hits_by_model_coverage', '--filter-out-partial-gene-calls'),
additional_params = M.get_param_value_from_config(['filter_hmm_hits_by_model_coverage', 'additional_params'])
threads: M.T('filter_hmm_hits_by_model_coverage')
Expand Down Expand Up @@ -212,7 +212,7 @@ rule filter_hmm_hits_by_model_coverage:
shell("anvi-script-filter-hmm-hits-table -c {contigs_db} \
--domain-hits-table {domtblout} \
--hmm-source {hmm_source} \
--model-coverage {params.model_coverage} \
--min-model-coverage {params.model_coverage} \
{params.partial_ORFs} \
{params.additional_params} >> {log} 2>&1")
else:
Expand All @@ -221,7 +221,7 @@ rule filter_hmm_hits_by_model_coverage:
--domain-hits-table {domtblout} \
--hmm-profile-dir {hmm_dir} \
--hmm-source {hmm_source} \
--model-coverage {params.model_coverage} \
--min-model-coverage {params.model_coverage} \
{params.partial_ORFs} \
{params.additional_params} >> {log} 2>&1")

Expand Down
4 changes: 2 additions & 2 deletions anvio/workflows/ecophylo/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def __init__(self, args=None, run=terminal.Run(), progress=terminal.Progress()):
rule_acceptable_params_dict = {}

rule_acceptable_params_dict['anvi_run_hmms_hmmsearch'] = ['threads_genomes', 'threads_metagenomes', 'additional_params']
rule_acceptable_params_dict['filter_hmm_hits_by_model_coverage'] = ['--model-coverage', '--filter-out-partial-gene-calls', 'additional_params']
rule_acceptable_params_dict['filter_hmm_hits_by_model_coverage'] = ['--min-model-coverage', '--min-gene-coverage', '--filter-out-partial-gene-calls', 'additional_params']
rule_acceptable_params_dict['cluster_X_percent_sim_mmseqs'] = ['--min-seq-id', '--cov-mode', 'clustering_threshold_for_OTUs', 'AA_mode']
rule_acceptable_params_dict['align_sequences'] = ['additional_params']
rule_acceptable_params_dict['trim_alignment'] = ['-gt', "-gappyout", 'additional_params']
Expand All @@ -96,7 +96,7 @@ def __init__(self, args=None, run=terminal.Run(), progress=terminal.Progress()):
'samples_txt': 'samples.txt',
'cluster_representative_method': {'method': 'mmseqs'},
'anvi_run_hmms_hmmsearch': {'threads_genomes': 1, 'threads_metagenomes': 5},
'filter_hmm_hits_by_model_coverage': {'threads': 5, '--model-coverage': 0.8, '--filter-out-partial-gene-calls': True},
'filter_hmm_hits_by_model_coverage': {'threads': 5, '--min-model-coverage': 0.8, '--filter-out-partial-gene-calls': True},
'process_hmm_hits': {'threads': 2},
'combine_sequence_data': {'threads': 2},
'anvi_get_external_gene_calls_file': {'threads': 2},
Expand Down
70 changes: 35 additions & 35 deletions sandbox/anvi-script-filter-hmm-hits-table
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,8 @@ class FilterHmmHitsTable(object):
self.contigs_db_path=A("contigs_db")
self.domtblout=A("domain_hits_table")
self.hmm_source=A("hmm_source")
self.gene_coverage=A("gene_coverage")
self.model_coverage=A("model_coverage")
self.min_gene_coverage=A("min_gene_coverage")
self.min_model_coverage=A("min_model_coverage")
self.list_hmm_sources=A("list_hmm_sources")
self.hmm_profile_dir=A("hmm_profile_dir")
self.merge_partial_hits_within_X_nts = A('merge_partial_hits_within_X_nts')
Expand All @@ -60,8 +60,8 @@ class FilterHmmHitsTable(object):
sys.exit()

if self.hmm_source not in anvio.data.hmm.sources and not self.hmm_profile_dir:
raise ConfigError("Hold up, if you are using a external HMM source, you need to provide the path to the HMM directory "
"with the parameter --hmm-profile-dir")
raise ConfigError(f"Hold up, if you are using a external HMM source, you need to provide the path to the HMM directory "
f"with the parameter --hmm-profile-dir")

# Grab HMM sources from internal or external HMMs
if args.hmm_profile_dir:
Expand Down Expand Up @@ -109,26 +109,26 @@ class FilterHmmHitsTable(object):
filesnpaths.is_file_exists(self.domtblout)
self.run.info("Domtblout Path", self.domtblout)

if not self.model_coverage or self.gene_coverage:
if not self.min_model_coverage or self.min_gene_coverage:
raise ConfigError("You didn't provide anvi-script-filter-hmm-hits-table with either a "
"--model-coverage or --gene-coverage. Please provide at least one "
"--min-model-coverage or --min-gene-coverage. Please provide at least one "
"so anvi'o can filter hmm_hits for you :)")

if self.model_coverage:
if self.model_coverage < 0:
raise ConfigError(f"--model-coverage must be a percentage between 0 and 1 "
f"and you put a negative number: {self.model_coverage}")
if self.model_coverage > 1:
raise ConfigError(f"--model-coverage must be a percentage between 0 and 1 "
f"and you put a a value larger than 100%: {self.model_coverage}")

if self.gene_coverage:
if self.gene_coverage < 0:
raise ConfigError(f"--gene-coverage must be a percentage between 0 and 1 "
f"and you put a negative number: {self.gene_coverage}")
if self.gene_coverage > 1:
raise ConfigError(f"--gene-coverage must be a percentage between 0 and 1 "
f"and you put a a value larger than 100%: {self.gene_coverage}")
if self.min_model_coverage:
if self.min_model_coverage < 0:
raise ConfigError(f"--min-model-coverage must be a percentage between 0 and 1 "
f"and you put a negative number: {self.min_model_coverage}")
if self.min_model_coverage > 1:
raise ConfigError(f"--min-model-coverage must be a percentage between 0 and 1 "
f"and you put a a value larger than 100%: {self.min_model_coverage}")

if self.min_gene_coverage:
if self.min_gene_coverage < 0:
raise ConfigError(f"--min-gene-coverage must be a percentage between 0 and 1 "
f"and you put a negative number: {self.min_gene_coverage}")
if self.min_gene_coverage > 1:
raise ConfigError(f"--min-gene-coverage must be a percentage between 0 and 1 "
f"and you put a a value larger than 100%: {self.min_gene_coverage}")

if not self.hmm_source:
raise ConfigError("Please provide a hmm-source :)")
Expand Down Expand Up @@ -302,16 +302,16 @@ class FilterHmmHitsTable(object):
f"Please look at this error message to find out what happened: "
f"{e}")

self.df['model_coverage'] = ((self.df['hmm_stop'] - self.df['hmm_start'])/ self.df['hmm_length'])
self.df['gene_coverage'] = ((self.df['gene_stop'] - self.df['gene_start'])/ self.df['gene_length'])
self.df['min_model_coverage'] = ((self.df['hmm_stop'] - self.df['hmm_start'])/ self.df['hmm_length'])
self.df['min_gene_coverage'] = ((self.df['gene_stop'] - self.df['gene_start'])/ self.df['gene_length'])


def filter_domtblout(self):
"""
Filter the hmm_hits table based on HMM model coverage, gene coverage, or open reading frame completeness.
Returns:
CSV: Filtered domtblout file from hmmsearch.
CSV: Filtered domtblout file from hmmsearch.
str: The file path of the temporary file where the filtered DataFrame is saved.
"""

Expand Down Expand Up @@ -339,13 +339,13 @@ class FilterHmmHitsTable(object):
# to more complex filtering tasks.

# Alignment coverage filtering conditions
if self.model_coverage and self.gene_coverage:
df_filtered = self.df[(self.df['model_coverage'] > float(self.model_coverage)) &
(self.df['gene_coverage'] > float(self.gene_coverage))]
elif self.gene_coverage:
df_filtered = self.df[self.df['gene_coverage'] > float(self.gene_coverage)]
elif self.model_coverage:
df_filtered = self.df[self.df['model_coverage'] > float(self.model_coverage)]
if self.min_model_coverage and self.min_gene_coverage:
df_filtered = self.df[(self.df['min_model_coverage'] > float(self.min_model_coverage)) &
(self.df['min_gene_coverage'] > float(self.min_gene_coverage))]
elif self.min_gene_coverage:
df_filtered = self.df[self.df['min_gene_coverage'] > float(self.min_gene_coverage)]
elif self.min_model_coverage:
df_filtered = self.df[self.df['min_model_coverage'] > float(self.min_model_coverage)]
else:
df_filtered = self.df.copy()

Expand All @@ -354,8 +354,8 @@ class FilterHmmHitsTable(object):
self.run.info("Num hmm-hits after alignment filtering", df_filtered.shape[0])
self.run.info("Num hmm-hits filtered from alignment coverage", self.df.shape[0] - df_filtered.shape[0])

# Reformat domtblout back to the original format by dropping 'model_coverage' and 'gene_coverage' columns
df_final = df_filtered.drop(columns=['model_coverage', 'gene_coverage'])
# Reformat domtblout back to the original format by dropping 'min_model_coverage' and 'min_gene_coverage' columns
df_final = df_filtered.drop(columns=['min_model_coverage', 'min_gene_coverage'])

# Additional filtering for partial gene calls if required
if self.filter_out_partial_gene_calls:
Expand Down Expand Up @@ -441,9 +441,9 @@ if __name__ == '__main__':
parser.add_argument(*anvio.A('hmm-profile-dir'), **anvio.K('hmm-profile-dir'))
parser.add_argument('--domain-hits-table', metavar='PATH', help="Please provide the path to the domain-table-output. "
"You can get this file from running anvi-run-hmms with the flag --domain-hits-table.")
parser.add_argument('--gene-coverage',type=float, help="Percent of the gene that is covered by the profile HMM after hmmsearch. "
parser.add_argument('--min-gene-coverage',type=float, help="The minimum percent of the gene that is covered by the profile HMM after hmmsearch. "
"This is the formula using the domtblout from hmmsearch: (ali_coord_to - ali_coord_from)/target_length")
parser.add_argument('--model-coverage', type=float, help="Percent of the profile HMM model that is covered by the gene after hmmsearch. "
parser.add_argument('--min-model-coverage', type=float, help="The minimum percent of the profile HMM model that is covered by the gene after hmmsearch. "
"This is the formula using the domtblout from hmmsearch: (hmm_coord_to - hmm_coord_from)/hmm_length")
parser.add_argument('--merge-partial-hits-within-X-nts', metavar='LENGTH', default=0, type=int, help="Filtering HMM hits based on "
"target or query coverage can be difficult when a gene is covered by multiple independent hits of a single "
Expand Down

0 comments on commit ff9eab6

Please sign in to comment.