From 8a75c20f83352b90f26beba89136c728c434a6d3 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Thu, 16 Nov 2023 17:33:32 +0100 Subject: [PATCH 01/25] new params for init_gene_clusters_functions_summary_dict. They enable smaller gene cluster function summary dict to be initialized --- anvio/dbops.py | 37 +++++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/anvio/dbops.py b/anvio/dbops.py index ce700b784d..fee9b387b3 100644 --- a/anvio/dbops.py +++ b/anvio/dbops.py @@ -1938,10 +1938,14 @@ def get_gene_clusters_functions_summary_dict(self, functional_annotation_source) return gene_clusters_functions_summary_dict - def init_gene_clusters_functions_summary_dict(self): + def init_gene_clusters_functions_summary_dict(self, source_list = None, gene_clusters_of_interest = None): """ A function to initialize the `gene_clusters_functions_summary_dict` by calling the atomic function `get_gene_clusters_functions_summary_dict` for all the - functional annotaiton sources. + functional annotation sources. + + You can restrict which sources to use and which gene clusters to initialize using the `source_list` and + `gene_clusters_of_interest` parameters, respectively. Use this ability with caution as it is possible that + downstream code may expect all sources and/or all gene clusters to be initialized. """ if not self.functions_initialized: @@ -1953,17 +1957,38 @@ def init_gene_clusters_functions_summary_dict(self): "on without any summary dict for functions and/or drama about it to let the downstream analyses " "decide how to punish the unlucky.") return - - self.progress.new('Generating a gene cluster functions summary dict', progress_total_items=len(self.gene_clusters_functions_dict)) + + gene_clusters_to_init = self.gene_clusters_functions_dict.keys() + if gene_clusters_of_interest: + gene_clusters_to_init = gene_clusters_of_interest + + # make sure that all of the requested gene clusters exist + for cluster_id in gene_clusters_of_interest: + if cluster_id not in self.gene_clusters: + raise ConfigError(f"Someone requested the function `init_gene_clusters_functions_summary_dict()` " + f"to work on a gene cluster called {cluster_id} but it does not exist in the " + f"pangenome. :/") + + sources_to_summarize = self.gene_clusters_function_sources + if source_list: + sources_to_summarize = source_list + + # make sure that all of the requested sources are in the pangenome + for s in source_list: + if s not in self.gene_clusters_function_sources: + raise ConfigError(f"Someone requested the function `init_gene_clusters_functions_summary_dict()` " + f"to use the annotation source {s}, but that source is not found in the pangenome.") + + self.progress.new('Generating a gene cluster functions summary dict', progress_total_items=len(gene_clusters_to_init)) counter = 0 - for gene_cluster_id in self.gene_clusters_functions_dict: + for gene_cluster_id in gene_clusters_to_init: if counter % 100 == 0: self.progress.increment(increment_to=counter) self.progress.update(f'{gene_cluster_id} ...') self.gene_clusters_functions_summary_dict[gene_cluster_id] = {} - for functional_annotation_source in self.gene_clusters_function_sources: + for functional_annotation_source in sources_to_summarize: accession, function = self.get_gene_cluster_function_summary(gene_cluster_id, functional_annotation_source, discard_ties=self.discard_ties, consensus_threshold=self.consensus_threshold) self.gene_clusters_functions_summary_dict[gene_cluster_id][functional_annotation_source] = {'function': function, 'accession': accession} From fb46c005561fbce932a7a1ca6714f6a924a09026 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 17 Nov 2023 11:42:02 +0100 Subject: [PATCH 02/25] add arg group for pangenomes --- bin/anvi-estimate-metabolism | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/bin/anvi-estimate-metabolism b/bin/anvi-estimate-metabolism index dd14473c25..ef2ea7f7ea 100755 --- a/bin/anvi-estimate-metabolism +++ b/bin/anvi-estimate-metabolism @@ -84,6 +84,12 @@ if __name__ == '__main__': "metabolic capabilities encoded by this list of enzymes.") groupX.add_argument(*anvio.A('enzymes-txt'), **anvio.K('enzymes-txt', {'required': False})) + groupN = parser.add_argument_group('INPUT #5 - ESTIMATION ON A PANGENOME', + "If you have a collection of binned gene clusters in a pangenome, you can estimate " + "metabolism on each bin. This program will use the most common annotation (for each " + "annotation source) from each gene cluster. Note that this input option is not compatible " + "with the `--add-copy-number` or `--add-coverage` flags.") + groupC = parser.add_argument_group('OUTPUT - GENERAL OPTIONS', "Parameters for controlling estimation output of any type. The output will be " "TAB-delimited files which by default are prefixed with 'kegg-metabolism', but you can of course " "change that name here.") From c126e95653868dd2aeb91104f6cc14430fee4817 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 17 Nov 2023 14:35:08 +0100 Subject: [PATCH 03/25] args, better help for pangenome input --- bin/anvi-estimate-metabolism | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/bin/anvi-estimate-metabolism b/bin/anvi-estimate-metabolism index ef2ea7f7ea..dc6eac00c6 100755 --- a/bin/anvi-estimate-metabolism +++ b/bin/anvi-estimate-metabolism @@ -86,9 +86,12 @@ if __name__ == '__main__': groupN = parser.add_argument_group('INPUT #5 - ESTIMATION ON A PANGENOME', "If you have a collection of binned gene clusters in a pangenome, you can estimate " - "metabolism on each bin. This program will use the most common annotation (for each " - "annotation source) from each gene cluster. Note that this input option is not compatible " - "with the `--add-copy-number` or `--add-coverage` flags.") + "metabolism on each bin. You will need to provide the collection name (and optionally, " + "bin name(s)) -- see INPUT #2 section above. This program will use the most common " + "annotation (for each annotation source) from each gene cluster. Note that this input " + "option is not compatible with the `--add-copy-number` or `--add-coverage` flags.") + groupN.add_argument('--pan-db', **anvio.K('pan-db', {'required': False})) + groupN.add_argument(*anvio.A('genomes-storage'), **anvio.K('genomes-storage')) groupC = parser.add_argument_group('OUTPUT - GENERAL OPTIONS', "Parameters for controlling estimation output of any type. The output will be " "TAB-delimited files which by default are prefixed with 'kegg-metabolism', but you can of course " From 368a60a072001779598f509da3c3b410a52e11aa Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 17 Nov 2023 15:15:29 +0100 Subject: [PATCH 04/25] accept new args and sanity check them --- anvio/kegg.py | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 75fb236cf6..54b0ae8542 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -2828,6 +2828,8 @@ def __init__(self, args, run=run, progress=progress): A = lambda x: args.__dict__[x] if x in args.__dict__ else None self.contigs_db_path = A('contigs_db') self.profile_db_path = A('profile_db') + self.pan_db_path = A('pan_db') + self.genomes_storage_path = A('genomes_storage') self.collection_name = A('collection_name') self.bin_id = A('bin_id') self.bin_ids_file = A('bin_ids_file') @@ -2849,6 +2851,8 @@ def __init__(self, args, run=run, progress=progress): self.name_header = 'contig_name' elif self.profile_db_path and self.collection_name and not self.metagenome_mode: self.name_header = 'bin_name' + elif self.pan_db_path: + self.name_header = 'gene_cluster_bin_name' else: self.name_header = 'genome_name' @@ -2869,8 +2873,13 @@ def __init__(self, args, run=run, progress=progress): self.contigs_db_project_name = os.path.basename(self.enzymes_txt).replace(".", "_") # INPUT OPTIONS SANITY CHECKS - if not self.estimate_from_json and not self.contigs_db_path and not self.enzymes_txt: + if not self.estimate_from_json and not self.contigs_db_path and not self.enzymes_txt and not self.pan_db_path: raise ConfigError("NO INPUT PROVIDED. Please use the `-h` flag to see possible input options.") + # incompatible input options + if (self.contigs_db_path and (self.pan_db_path or self.enzymes_txt)) or \ + (self.enzymes_txt and self.pan_db_path): + raise ConfigError("MULTIPLE INPUT OPTIONS DETECTED. Please check your parameters. You cannot provide more than one " + "of the following: a contigs database, an enzymes-txt file, or a pangenome database.") if self.only_user_modules and not self.user_input_dir: raise ConfigError("You can only use the flag --only-user-modules if you provide a --user-modules directory.") @@ -2885,15 +2894,27 @@ def __init__(self, args, run=run, progress=progress): filesnpaths.is_file_exists(self.bin_ids_file) self.bin_ids_to_process = [line.strip() for line in open(self.bin_ids_file).readlines()] - if (self.bin_id or self.bin_ids_file or self.collection_name) and not self.profile_db_path: + # required with collection/bin input + if (self.bin_id or self.bin_ids_file or self.collection_name) and not self.profile_db_path and not self.pan_db_path: raise ConfigError("You have requested metabolism estimation for a bin or set of bins, but you haven't provided " - "a profiles database. Unfortunately, this just does not work. Please try again.") - + "a profile database or pan database. Unfortunately, this just does not work. Please try again.") + # required with profile db input if self.profile_db_path and not (self.collection_name or self.add_coverage or self.metagenome_mode): raise ConfigError("If you provide a profile DB, you should also provide either a collection name (to estimate metabolism " "on a collection of bins) or use the --add-coverage flag (so that coverage info goes into the output " "files), or both. Otherwise the profile DB is useless.") - + # required/forbidden with pangenome input + if self.pan_db_path and not self.genomes_storage_path: + raise ConfigError("You have provided a pan database but not its associated genomes storage database. Please give the " + "path to the genomes storage db using the `-g` flag.") + if self.pan_db_path and not self.collection_name: + raise ConfigError("You need to provide a collection name when you estimate metabolism on a pangenome. If you don't " + "already have a collection of gene clusters in the pan database, please make one first. Then provide " + "the collection to this program; you can find the collection name parameter in the INPUT #2 section " + "of the `-h` output.") + if self.pan_db_path and (self.add_copy_number or self.add_coverage): + raise ConfigError("The flags --add-copy-number or --add-coverage do not work for pangenome input.") + # required/forbidden with JSON output if self.store_json_without_estimation and not self.json_output_file_path: raise ConfigError("Whoops. You seem to want to store the metabolism dictionary in a JSON file, but you haven't provided the name of that file. " "Please use the --get-raw-data-as-json flag to do so.") From 4598177fbdff0453689bb6e0c45da6d6e5d1a148 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 17 Nov 2023 15:44:21 +0100 Subject: [PATCH 05/25] reporting on input options including pan dbs --- anvio/kegg.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 54b0ae8542..ae1354c9fb 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -2914,7 +2914,7 @@ def __init__(self, args, run=run, progress=progress): "of the `-h` output.") if self.pan_db_path and (self.add_copy_number or self.add_coverage): raise ConfigError("The flags --add-copy-number or --add-coverage do not work for pangenome input.") - # required/forbidden with JSON output + # required/forbidden with JSON estimation if self.store_json_without_estimation and not self.json_output_file_path: raise ConfigError("Whoops. You seem to want to store the metabolism dictionary in a JSON file, but you haven't provided the name of that file. " "Please use the --get-raw-data-as-json flag to do so.") @@ -3014,6 +3014,9 @@ def __init__(self, args, run=run, progress=progress): self.run.info("Contigs DB", self.contigs_db_path, quiet=self.quiet) if self.profile_db_path: self.run.info("Profile DB", self.profile_db_path, quiet=self.quiet) + if self.pan_db_path: + self.run.info("Pan DB", self.pan_db_path, quiet=self.quiet) + self.run.info("Genomes Storage DB", self.genomes_storage_path, quiet=self.quiet) if self.collection_name: self.run.info('Collection', self.collection_name, quiet=self.quiet) if self.bin_id: @@ -3023,7 +3026,17 @@ def __init__(self, args, run=run, progress=progress): if self.enzymes_txt: self.run.info("Enzymes txt file", self.enzymes_txt, quiet=self.quiet) - self.run.info('Metagenome mode', self.metagenome_mode, quiet=self.quiet) + estimation_mode = "Genome (or metagenome assembly)" + if self.profile_db_path and self.collection: + estimation_mode = "Bins in a metagenome" + elif self.metagenome_mode: + estimation_mode = "Individual contigs in a metagenome" + elif self.enzymes_txt: + estimation_mode = "List of enzymes" + elif self.pan_db_path: + estimation_mode = "Gene cluster bins in a pangenome" + + self.run.info('Mode (what we are estimating metabolism for)', estimation_mode, quiet=self.quiet) if not self.estimate_from_json and not self.enzymes_txt: From 26579a4a4524c0df0887495c3c90141dace6bdf1 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 17 Nov 2023 16:32:39 +0100 Subject: [PATCH 06/25] better if statements detect contigs db rather than exclude all other input options --- anvio/kegg.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index ae1354c9fb..f682f64c1c 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -3039,7 +3039,7 @@ def __init__(self, args, run=run, progress=progress): self.run.info('Mode (what we are estimating metabolism for)', estimation_mode, quiet=self.quiet) - if not self.estimate_from_json and not self.enzymes_txt: + if self.contigs_db_path: utils.is_contigs_db(self.contigs_db_path) # here we load the contigs DB just for sanity check purposes. # We will need to load it again later just before accessing data to avoid SQLite error that comes from different processes accessing the DB @@ -3067,7 +3067,7 @@ def __init__(self, args, run=run, progress=progress): f"`anvi-setup-kegg-data`, though we are not sure how you got to this point in that case." f"But fine. Hopefully you now know what you need to do to make this message go away.") - if not self.estimate_from_json and not self.enzymes_txt: + if self.contigs_db_path: # sanity check that contigs db was annotated with same version of MODULES.db that will be used for metabolism estimation if 'modules_db_hash' not in contigs_db.meta: raise ConfigError("Based on the contigs DB metadata, the contigs DB that you are working with has not been annotated with hits to the " @@ -3111,7 +3111,7 @@ def __init__(self, args, run=run, progress=progress): # LOAD USER DATA - if not self.estimate_from_json and not self.enzymes_txt: + if self.contigs_db_path: if self.user_input_dir: # check for user modules db if not os.path.exists(self.user_modules_db_path): @@ -3143,7 +3143,7 @@ def __init__(self, args, run=run, progress=progress): self.ko_dict[k] = user_kos[k] user_modules_db.disconnect() - if not self.estimate_from_json and not self.enzymes_txt: + if self.contigs_db_path: contigs_db.disconnect() self.annotation_sources_to_use = list(annotation_source_set) From 187015840cf061afa534f8922a1c3490e12b788a Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 17 Nov 2023 16:52:21 +0100 Subject: [PATCH 07/25] new function to check if pan db and genomes storage db are compatible --- anvio/kegg.py | 2 ++ anvio/utils.py | 12 ++++++++++++ 2 files changed, 14 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index f682f64c1c..680ad0e872 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -2924,6 +2924,8 @@ def __init__(self, args, run=run, progress=progress): if self.profile_db_path: utils.is_profile_db_and_contigs_db_compatible(self.profile_db_path, self.contigs_db_path) + if self.pan_db_path: + utils.is_pan_db_and_genomes_storage_db_compatible(self.pan_db_path, self.genomes_storage_path) if self.add_coverage: if not self.enzymes_txt: diff --git a/anvio/utils.py b/anvio/utils.py index e6b79b3f19..6411c517df 100644 --- a/anvio/utils.py +++ b/anvio/utils.py @@ -4274,6 +4274,18 @@ def is_structure_db_and_contigs_db_compatible(structure_db_path, contigs_db_path return True +def is_pan_db_and_genomes_storage_db_compatible(pan_db_path, genomes_storage_path): + pdb = dbi(pan_db_path) + gdb = dbi(genomes_storage_path) + + if pdb.hash != gdb.hash: + raise ConfigError(f"The pan database and the genomes storage database do not seem to " + f"be compatible. More specifically, the genomes storage database is " + f"not the one that was used when the pangenome was created. " + f"({pdb.hash} != {gdb.hash})") + + return True + # # FIXME # def is_external_genomes_compatible_with_pan_database(pan_db_path, external_genomes_path): From 34212fcfe231b042b2536f3aefc294183ac00c58 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 17 Nov 2023 17:31:38 +0100 Subject: [PATCH 08/25] function to return func annotation sources (if any) from DBInfo --- anvio/dbinfo.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/anvio/dbinfo.py b/anvio/dbinfo.py index cd9365a503..dc52ffafe0 100644 --- a/anvio/dbinfo.py +++ b/anvio/dbinfo.py @@ -88,6 +88,7 @@ class DBInfo(ABC): db_type = None hash_name = None + functional_annotation_sources_name = None def __new__(cls, path, dont_raise=False, expecting=None): if not cls.is_db(path, dont_raise=dont_raise): @@ -208,11 +209,20 @@ def load_db(self): def get_self_table(self): with DB(self.path, None, ignore_version=True) as database: return dict(database.get_table_as_list_of_tuples('self')) + + def get_functional_annotation_sources(self): + """If sources are defined in the database, we return them as a list. Otherwise, we return None.""" + if self.functional_annotation_sources_name: + with self.load_db() as database: + return database.get_meta_value(self.functional_annotation_sources_name, return_none_if_not_in_table=True).split(',') + else: + return None class ContigsDBInfo(DBInfo): db_type = 'contigs' hash_name = 'contigs_db_hash' + functional_annotation_sources_name = 'gene_function_sources' def __init__(self, path, *args, **kwargs): DBInfo.__init__(self, path) @@ -260,6 +270,7 @@ def __init__(self, path, *args, **kwargs): class GenomeStorageDBInfo(DBInfo): db_type = 'genomestorage' hash_name = 'hash' + functional_annotation_sources_name = 'gene_function_sources' def __init__(self, path, *args, **kwargs): DBInfo.__init__(self, path) @@ -281,6 +292,7 @@ def __init__(self, path, *args, **kwargs): class ModulesDBInfo(DBInfo): db_type = 'modules' hash_name = 'hash' + functional_annotation_sources_name = 'annotation_sources' def __init__(self, path, *args, **kwargs): DBInfo.__init__(self, path) From 43e053e67ffe798918f665ebf6d1add40f5a690b Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 17 Nov 2023 17:49:14 +0100 Subject: [PATCH 09/25] change if statement order for loading user pathway data --- anvio/kegg.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 680ad0e872..fea14ee7df 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -3113,8 +3113,7 @@ def __init__(self, args, run=run, progress=progress): # LOAD USER DATA - if self.contigs_db_path: - if self.user_input_dir: + if self.user_input_dir: # check for user modules db if not os.path.exists(self.user_modules_db_path): raise ConfigError(f"It appears that a USER-DEFINED modules database ({self.user_modules_db_path}) does not exist in the provided data directory. " @@ -3125,15 +3124,16 @@ def __init__(self, args, run=run, progress=progress): user_modules_db = ModulesDatabase(self.user_modules_db_path, args=self.args, quiet=self.quiet) modules_db_sources = set(user_modules_db.db.get_meta_value('annotation_sources').split(',')) - contigs_db_sources = set(contigs_db.meta['gene_function_sources']) - source_in_modules_not_contigs = modules_db_sources.difference(contigs_db_sources) + if self.contigs_db_path: + contigs_db_sources = set(contigs_db.meta['gene_function_sources']) + source_in_modules_not_contigs = modules_db_sources.difference(contigs_db_sources) - if source_in_modules_not_contigs: - missing_sources = ", ".join(source_in_modules_not_contigs) - raise ConfigError(f"Your contigs database is missing one or more functional annotation sources that are " - f"required for the modules in the database at {self.user_modules_db_path}. You will have to " - f"annotate the contigs DB with these sources (or import them using `anvi-import-functions`) " - f"before running this program again. Here are the missing sources: {missing_sources}") + if source_in_modules_not_contigs: + missing_sources = ", ".join(source_in_modules_not_contigs) + raise ConfigError(f"Your contigs database is missing one or more functional annotation sources that are " + f"required for the modules in the database at {self.user_modules_db_path}. You will have to " + f"annotate the contigs DB with these sources (or import them using `anvi-import-functions`) " + f"before running this program again. Here are the missing sources: {missing_sources}") # expand annotation source set to include those in user db annotation_source_set.update(modules_db_sources) From ac184104acf1f5b111f04a4915050b19509ca1cd Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Fri, 17 Nov 2023 17:49:28 +0100 Subject: [PATCH 10/25] sanity check for sources in genome storage db --- anvio/kegg.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index fea14ee7df..0fa4833670 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -31,6 +31,7 @@ from anvio.tables.genefunctions import TableForGeneFunctions from anvio.dbops import ContigsSuperclass, ContigsDatabase, ProfileSuperclass, ProfileDatabase from anvio.genomedescriptions import MetagenomeDescriptions, GenomeDescriptions +from anvio.dbinfo import DBInfo __author__ = "Developers of anvi'o (see AUTHORS.txt)" @@ -3159,6 +3160,20 @@ def __init__(self, args, run=run, progress=progress): else: self.run.info('Metabolism data', "KEGG only") + # sanity check for annotation sources in pangenome + if self.genomes_storage_path: + available_sources = DBInfo(self.genomes_storage_path).get_functional_annotation_sources() + missing_sources = [] + for source in self.annotation_sources_to_use: + if source not in available_sources: + missing_sources.append(source) + if missing_sources: + miss_str = ", ".join(missing_sources) + raise ConfigError(f"The following functional annotation sources are required for metabolism " + f"estimation on the chosen metabolism data, but are missing from your genome " + f"storage database: {miss_str}. You'll need to figure out which genomes in the db " + f"are missing those sources, and annotate them before re-making the genomes storage.") + def list_output_modes(self): """This function prints out the available output modes for the metabolism estimation script.""" From 1449e799abaae8c60f723b3d8c7cc8207177abb9 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 20 Nov 2023 11:52:08 +0100 Subject: [PATCH 11/25] start option for pangenomes --- anvio/kegg.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index 0fa4833670..5cbb12de59 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -29,7 +29,7 @@ from anvio.drivers.hmmer import HMMer from anvio.parsers import parser_modules from anvio.tables.genefunctions import TableForGeneFunctions -from anvio.dbops import ContigsSuperclass, ContigsDatabase, ProfileSuperclass, ProfileDatabase +from anvio.dbops import ContigsSuperclass, ContigsDatabase, ProfileSuperclass, ProfileDatabase, PanSuperclass from anvio.genomedescriptions import MetagenomeDescriptions, GenomeDescriptions from anvio.dbinfo import DBInfo @@ -5110,7 +5110,7 @@ def estimate_metabolism(self, skip_storing_data=False, output_files_dictionary=N return_subset_for_matrix_format=False, all_modules_in_db=None, all_kos_in_db=None, module_paths_dict=None): """This is the driver function for estimating metabolism for a single contigs DB. - It will decide what to do based on whether the input contigs DB is a genome or metagenome. + It will decide what to do based on whether the input DB is a genome, metagenome, or pangenome. It usually avoids returning the metabolism data to save on memory (as this data is typically appended to files immediately), but this behavior can be changed by setting return_superdicts to True (for the entire modules/ko superdictionaries) or return_subset_for_matrix_format to True (for a subset of these dicts that @@ -5175,6 +5175,7 @@ def estimate_metabolism(self, skip_storing_data=False, output_files_dictionary=N if self.enzymes_txt: self.enzymes_txt_data = self.load_data_from_enzymes_txt() kegg_metabolism_superdict, kofam_hits_superdict = self.estimate_metabolism_from_enzymes_txt() + elif self.pan_db_path: else: kofam_hits_info = self.init_hits_and_splits(annotation_sources=self.annotation_sources_to_use) From 6b2021cbc09ed566215ad81fbdbba9df7b32493a Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 20 Nov 2023 11:52:28 +0100 Subject: [PATCH 12/25] load collection with sanity check for its existence --- anvio/kegg.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 5cbb12de59..668e429a90 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -5176,6 +5176,14 @@ def estimate_metabolism(self, skip_storing_data=False, output_files_dictionary=N self.enzymes_txt_data = self.load_data_from_enzymes_txt() kegg_metabolism_superdict, kofam_hits_superdict = self.estimate_metabolism_from_enzymes_txt() elif self.pan_db_path: + gene_cluster_collections = ccollections.Collections() + gene_cluster_collections.populate_collections_dict(self.pan_db_path) + if self.collection_name not in gene_cluster_collections.collections_dict: + c_str = ', '.join(gene_cluster_collections.collections_dict.keys()) + raise ConfigError(f"The collection name you provided ('{self.collection_name}') is not valid for this " + f"Pan DB. Here are the collections in this database: {c_str}") + collection_dict = gene_cluster_collections.get_collection_dict(self.collection_name) + else: kofam_hits_info = self.init_hits_and_splits(annotation_sources=self.annotation_sources_to_use) From 42db72992a7979fb01d99b5dda3b0c3c1086b7df Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 20 Nov 2023 11:52:46 +0100 Subject: [PATCH 13/25] get list of all gene clusters in collection --- anvio/kegg.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 668e429a90..68f747392f 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -5184,6 +5184,9 @@ def estimate_metabolism(self, skip_storing_data=False, output_files_dictionary=N f"Pan DB. Here are the collections in this database: {c_str}") collection_dict = gene_cluster_collections.get_collection_dict(self.collection_name) + all_gene_clusters_in_collection = [] + for bin_name, gene_cluster_list in collection_dict.items(): + all_gene_clusters_in_collection += gene_cluster_list else: kofam_hits_info = self.init_hits_and_splits(annotation_sources=self.annotation_sources_to_use) From 89782725788a0312872c62a95c1f7da7f6c87ba9 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 20 Nov 2023 13:47:21 +0100 Subject: [PATCH 14/25] function to load summary of gene cluster annotations --- anvio/kegg.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 68f747392f..6a7a681b59 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -5106,6 +5106,36 @@ def estimate_metabolism_from_enzymes_txt(self): return enzyme_metabolism_superdict, enzyme_ko_superdict + def init_hits_for_pangenome(self, gene_cluster_list: list): + """This function loads enzyme annotations from the pangenome for use by downstream metabolism estimation. + + For each gene cluster, it takes the most common function from each annotation source relevant to the modules. + + PARAMETERS + ========== + gene_cluster_list : list + which gene cluster IDs to load from the pan DB + + RETURNS + ======= + enzyme_cluster_split_contig : list + (enzyme_accession, gene_cluster_id, split, contig) tuples in which split and contig are both NAs + """ + + pan_super = PanSuperclass(self.args) + pan_super.init_gene_clusters(gene_cluster_ids_to_focus = gene_cluster_list) + pan_super.init_gene_clusters_functions_summary_dict(source_list = self.annotation_sources_to_use, gene_clusters_of_interest = gene_cluster_list) + + enzyme_cluster_split_contig = [] + # no splits or contigs here + for cluster_id in gene_cluster_list: + for source in self.annotation_sources_to_use: + if source in pan_super.gene_clusters_functions_summary_dict[cluster_id]: + acc = pan_super.gene_clusters_functions_summary_dict[cluster_id][source]['accession'] + enzyme_cluster_split_contig.append((acc,cluster_id,"NA","NA")) + + return enzyme_cluster_split_contig + def estimate_metabolism(self, skip_storing_data=False, output_files_dictionary=None, return_superdicts=False, return_subset_for_matrix_format=False, all_modules_in_db=None, all_kos_in_db=None, module_paths_dict=None): """This is the driver function for estimating metabolism for a single contigs DB. From a1cd9ee398897fcf742dc838152645202393dc47 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 20 Nov 2023 15:10:05 +0100 Subject: [PATCH 15/25] estimation function for pan bins --- anvio/kegg.py | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 6a7a681b59..a0d555c279 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -5126,6 +5126,7 @@ def init_hits_for_pangenome(self, gene_cluster_list: list): pan_super.init_gene_clusters(gene_cluster_ids_to_focus = gene_cluster_list) pan_super.init_gene_clusters_functions_summary_dict(source_list = self.annotation_sources_to_use, gene_clusters_of_interest = gene_cluster_list) + enzyme_cluster_split_contig = [] # no splits or contigs here for cluster_id in gene_cluster_list: @@ -5136,6 +5137,52 @@ def init_hits_for_pangenome(self, gene_cluster_list: list): return enzyme_cluster_split_contig + + def estimate_metabolism_for_pangenome_bins(self, enzyme_cluster_split_contig, cluster_collection): + """Estimates metabolism individually on each bin in a pangenome. + + PARAMETERS + ========== + enzyme_cluster_split_contig : list + (enzyme_accession, gene_cluster_id, split, contig) tuples in which split and contig are both NAs + + cluster_collection : dictionary + maps bin names in the collection to the list of gene clusters in each bin + """ + + gc_bins_metabolism_superdict = {} + gc_bins_ko_superdict = {} + num_bins = len(cluster_collection) + + self.progress.new("Estimating metabolism for each bin of gene clusters", progress_total_items=num_bins) + + for bin_name, gc_list in cluster_collection.items(): + self.progress.update("[%d of %d] %s" % (self.progress.progress_current_item + 1, num_bins, bin_name)) + + enzymes_in_bin = [tpl for tpl in enzyme_cluster_split_contig if tpl[1] in gc_list] + metabolism_dict_for_bin, ko_dict_for_bin = self.mark_kos_present_for_list_of_splits(enzymes_in_bin, bin_name=bin_name) + + if not self.store_json_without_estimation: + gc_bins_metabolism_superdict[bin_name] = self.estimate_for_list_of_splits(metabolism_dict_for_bin, bin_name=bin_name) + single_bin_module_superdict = {bin_name: gc_bins_metabolism_superdict[bin_name]} + gc_bins_ko_superdict[bin_name] = ko_dict_for_bin + else: + gc_bins_metabolism_superdict[bin_name] = metabolism_dict_for_bin + single_bin_module_superdict = {bin_name: metabolism_dict_for_bin} + gc_bins_ko_superdict[bin_name] = ko_dict_for_bin + + # append individual bin to file + single_bin_ko_superdict = {bin_name: ko_dict_for_bin} + self.append_kegg_metabolism_superdicts(single_bin_module_superdict, single_bin_ko_superdict) + + self.progress.increment() + self.progress.reset() + + self.progress.end() + + return gc_bins_metabolism_superdict, gc_bins_ko_superdict + + def estimate_metabolism(self, skip_storing_data=False, output_files_dictionary=None, return_superdicts=False, return_subset_for_matrix_format=False, all_modules_in_db=None, all_kos_in_db=None, module_paths_dict=None): """This is the driver function for estimating metabolism for a single contigs DB. From c00534947ab2016cbaa4d0e6d222accbf25f1ac1 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 20 Nov 2023 15:10:53 +0100 Subject: [PATCH 16/25] better header for gene cluster ids --- anvio/kegg.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index a0d555c279..9a370ffb61 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -2869,6 +2869,8 @@ def __init__(self, args, run=run, progress=progress): 'mode_type' : 'all', 'description': "Name of genome/bin/metagenome in which we find gene annotations (hits) and/or modules" } + if self.pan_db_path: + self.update_available_headers_for_pan() if self.enzymes_txt: self.contigs_db_project_name = os.path.basename(self.enzymes_txt).replace(".", "_") @@ -3175,6 +3177,20 @@ def __init__(self, args, run=run, progress=progress): f"are missing those sources, and annotate them before re-making the genomes storage.") + def update_available_headers_for_pan(self): + """This function updates the available headers dictionary for pangenome-specific headers.""" + + # in modules mode, we replace 'gene_caller_ids_in_module' with 'gene_clusters_in_module' + self.available_headers['gene_clusters_in_module'] = { + 'cdict_key': None, + 'mode_type': 'modules', + 'description': "Comma-separated list of gene cluster IDs that contribute to a module" + } + self.available_headers.pop('gene_caller_ids_in_module') + + self.available_modes['modules']["headers"] = ['gene_clusters_in_module' if x == 'gene_caller_ids_in_module' else x for x in self.available_modes['modules']["headers"]] + + def list_output_modes(self): """This function prints out the available output modes for the metabolism estimation script.""" run.warning(None, header="AVAILABLE OUTPUT MODES", lc="green") @@ -5381,6 +5397,8 @@ def add_common_elements_to_output_dict_for_module_in_bin(self, bin, mnum, c_dict d[self.modules_unique_id]["enzyme_hits_in_module"] = ",".join(kos_in_mod_list) if "gene_caller_ids_in_module" in headers_to_include: d[self.modules_unique_id]["gene_caller_ids_in_module"] = ",".join(gcids_in_mod) + if "gene_clusters_in_module" in headers_to_include: + d[self.modules_unique_id]["gene_clusters_in_module"] = ",".join(gcids_in_mod) # comma-separated list of warnings if "warnings" in headers_to_include: From 91ff8dc4d66720892bf4ac9924beca044c85c25c Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 20 Nov 2023 15:11:03 +0100 Subject: [PATCH 17/25] use the new functions to estimate --- anvio/kegg.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/anvio/kegg.py b/anvio/kegg.py index 9a370ffb61..ffc658f0c3 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -5280,6 +5280,9 @@ def estimate_metabolism(self, skip_storing_data=False, output_files_dictionary=N all_gene_clusters_in_collection = [] for bin_name, gene_cluster_list in collection_dict.items(): all_gene_clusters_in_collection += gene_cluster_list + + kofam_hits_info = self.init_hits_for_pangenome(gene_cluster_list = all_gene_clusters_in_collection) + kegg_metabolism_superdict, kofam_hits_superdict = self.estimate_metabolism_for_pangenome_bins(kofam_hits_info, collection_dict) else: kofam_hits_info = self.init_hits_and_splits(annotation_sources=self.annotation_sources_to_use) From fc9707b8877483face78ca21f6df17958bc34a63 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 20 Nov 2023 15:24:30 +0100 Subject: [PATCH 18/25] a better tutorial for metabolism code --- anvio/docs/programs/anvi-estimate-metabolism.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/docs/programs/anvi-estimate-metabolism.md b/anvio/docs/programs/anvi-estimate-metabolism.md index 2c9379976a..7035abef96 100644 --- a/anvio/docs/programs/anvi-estimate-metabolism.md +++ b/anvio/docs/programs/anvi-estimate-metabolism.md @@ -6,7 +6,7 @@ Alternatively or additionally, you can define your own set of metabolic modules Given a properly annotated %(contigs-db)s, this program determines which enzymes are present and uses these functions to compute the completeness of each metabolic module. There are currently two strategies for estimating module completeness - pathwise and stepwise - which are discussed in the technical details section on this page. The output of this program is one or more tabular text files - see %(kegg-metabolism)s for the output description and examples. -For a practical tutorial on how to use this program, visit [this link](https://merenlab.org/tutorials/infant-gut/#chapter-v-metabolism-prediction). A more abstract discussion of available parameters, as well as technical details about how the metabolism estimation is done, can be found below. +For a practical tutorial on how to use this program, visit [this link](https://anvio.org/tutorials/fmt-mag-metabolism/). A more abstract discussion of available parameters, as well as technical details about how the metabolism estimation is done, can be found below. ## What metabolism data can I use? From 4d2f24527e7453dc3bf1a4536ec4047dfc51a918 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 20 Nov 2023 15:42:33 +0100 Subject: [PATCH 19/25] add pan and gs as accepted artifacts --- bin/anvi-estimate-metabolism | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/anvi-estimate-metabolism b/bin/anvi-estimate-metabolism index dc6eac00c6..99dfdf3c0b 100755 --- a/bin/anvi-estimate-metabolism +++ b/bin/anvi-estimate-metabolism @@ -16,7 +16,7 @@ __version__ = anvio.__version__ __authors__ = ['ivagljiva'] __requires__ = ["contigs-db", "kegg-data", "kegg-functions", "profile-db", "collection", "bin", "external-genomes", "internal-genomes", "metagenomes", "user-modules-data", - "enzymes-txt"] + "enzymes-txt", "pan-db", "genomes-storage-db"] __provides__ = ["kegg-metabolism","user-metabolism"] __description__ = "Reconstructs metabolic pathways and estimates pathway completeness for a given set of contigs" From ebb0247e959cc3b174d0b9a73aaa9bfecbbb6cfd Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 20 Nov 2023 15:44:11 +0100 Subject: [PATCH 20/25] clarify that genome mode can be used with metagenomes --- anvio/docs/programs/anvi-estimate-metabolism.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/anvio/docs/programs/anvi-estimate-metabolism.md b/anvio/docs/programs/anvi-estimate-metabolism.md index 7035abef96..509ae2b171 100644 --- a/anvio/docs/programs/anvi-estimate-metabolism.md +++ b/anvio/docs/programs/anvi-estimate-metabolism.md @@ -46,7 +46,7 @@ If you don't have any sequences, there is an additional input option for you: Different input contexts can require different parameters or additional inputs. The following sections describe what is necessary for each input type. -### Estimation for a single genome +### Estimation for a single genome or unbinned metagenome assembly The most basic use-case for this program is when you have one contigs database describing a single genome. Since all of the sequences in this database belong to the same genome, all of the gene annotations will be used for metabolism estimation. @@ -54,6 +54,8 @@ The most basic use-case for this program is when you have one contigs database d anvi-estimate-metabolism -c %(contigs-db)s {{ codestop }} +In some cases -- for instance, to compute community-level pathway copy numbers -- it is also appropriate to do this for unbinned metagenome assemblies. + ### Estimation for bins in a metagenome You can estimate metabolism for different subsets of the sequences in your contigs database if you first %(bin)s them and save them as a %(collection)s. For each bin, only the gene annotations from its subset of sequences will contribute to the module completeness scores. @@ -86,7 +88,7 @@ bin_3 bin_5 ``` -### Estimation for a metagenome +### Estimation for contigs in a metagenome assembly If you have an unbinned metagenome assembly, you can estimate metabolism for it using `--metagenome-mode`. In this case, since there is no way to determine which contigs belong to which microbial populations in the sample, estimation will be done on a per-contig basis; that is, for each contig, only the genes present on that contig will be used to determine pathway completeness within the contig. From 69d58c457acc032757d11d276fc27f34f9200f9c Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 20 Nov 2023 15:44:22 +0100 Subject: [PATCH 21/25] new help page section on pangenomes --- anvio/docs/programs/anvi-estimate-metabolism.md | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/anvio/docs/programs/anvi-estimate-metabolism.md b/anvio/docs/programs/anvi-estimate-metabolism.md index 509ae2b171..ea3e5f3688 100644 --- a/anvio/docs/programs/anvi-estimate-metabolism.md +++ b/anvio/docs/programs/anvi-estimate-metabolism.md @@ -4,7 +4,7 @@ The metabolic pathways that this program considers (by default) are those define Alternatively or additionally, you can define your own set of metabolic modules and estimate their completeness with this program. Detailed instructions for doing this can be found by looking at the %(user-modules-data)s and %(anvi-setup-user-modules)s pages. -Given a properly annotated %(contigs-db)s, this program determines which enzymes are present and uses these functions to compute the completeness of each metabolic module. There are currently two strategies for estimating module completeness - pathwise and stepwise - which are discussed in the technical details section on this page. The output of this program is one or more tabular text files - see %(kegg-metabolism)s for the output description and examples. +Given a properly annotated %(contigs-db)s or %(pan-db)s, this program determines which enzymes are present and uses these functions to compute the completeness of each metabolic module. There are currently two strategies for estimating module completeness - pathwise and stepwise - which are discussed in the technical details section on this page. The output of this program is one or more tabular text files - see %(kegg-metabolism)s for the output description and examples. For a practical tutorial on how to use this program, visit [this link](https://anvio.org/tutorials/fmt-mag-metabolism/). A more abstract discussion of available parameters, as well as technical details about how the metabolism estimation is done, can be found below. @@ -37,8 +37,9 @@ You can run metabolism estimation on any set of annotated sequences, but these s - Single genomes, also referred to as %(external-genomes)s. These can be isolate genomes or metagenome-assembled genomes, for example. Each one is described in its own individual %(contigs-db)s. - Bins, also referred to as %(internal-genomes)s. These often represent metagenome-assembled genomes, but generally can be any subset of sequences within a database. A single %(contigs-db)s can contain multiple bins. - Assembled, unbinned metagenomes. There is no distinction between sequences that belong to different microbial populations in the %(contigs-db)s for an unbinned metagenome. +- Pangenomes in which you have binned gene clusters. Each pangenome is described in a %(pan-db)s, and must include a %(collection)s of your gene cluster bins of interest. -As you can see, %(anvi-estimate-metabolism)s takes one or more contigs database(s) as input, but the information that is taken from those databases depends on the context (ie, genome, metagenome, bin). In the case of internal genomes (or bins), is possible to have multiple inputs but only one input contigs db. So for clarity's sake, we sometimes refer to the inputs as 'samples' in the descriptions below. If you are getting confused, just try to remember that a 'sample' can be a genome, a metagenome, or a bin. +As you can see, %(anvi-estimate-metabolism)s can take one or more contigs database(s) as input, but the information that is taken from those databases depends on the context (ie, genome, metagenome, bin). In the case of internal genomes (or bins), is possible to have multiple inputs but only one input contigs db. So for clarity's sake, we sometimes refer to the inputs as 'samples' in the descriptions below. If you are getting confused, just try to remember that a 'sample' can be a genome, a metagenome, or a bin. If you don't have any sequences, there is an additional input option for you: - A list of enzymes, as described in an %(enzymes-txt)s file. For the purposes of metabolism estimation, the enzymes in this file will be interpreted as all coming from the same 'genome'. @@ -99,6 +100,17 @@ anvi-estimate-metabolism -c %(contigs-db)s --metagenome-mode {:.notice} In metagenome mode, this program will estimate metabolism for each contig in the metagenome separately. This will tend to underestimate module completeness because it is likely that many modules will be broken up across multiple contigs belonging to the same population. If you prefer to instead treat all enzyme annotations in the metagenome as belonging to one collective genome, you can do so by simply leaving out the `--metagenome-mode` flag (to effectively pretend that you are doing estimation for a single genome, although in your heart you will know that your contigs database really contains a metagenome). Please note that this will result in the opposite tendency to overestimate module completeness (as the enzymes will in reality be coming from multiple different populations), and there will be a lot of redundancy. We are working on improving our estimation algorithm for metagenome mode. In the meantime, if you are worried about the misleading results from either of these situations, we suggest binning your metagenomes first and running estimation for the bins as described below. + +### Estimation for gene cluster bins in a pangenome + +You can estimate the metabolisms collectively encoded by a set of gene clusters in a pangenome by providing this program with a pangenome database, its associated genomes storage database, and the name of the collection describing your gene cluster bins: + +{{ codestart }} +anvi-estimate-metabolism --pan-db %(pan-db)s -g %(genomes-storage-db)s -C COLLECTION_NAME +{{ codestop }} + +In this case, the program will estimate metabolism for each bin of gene clusters independently, by considering the set of enzyme annotations encoded within the set of gene clusters in the bin. Each gene cluster typically includes more than one gene from different genomes, and can therefore have multiple functions associated with it. To select which annotation is most relevant for estimation purposes, we pick the dominant function from each annotation source -- for instance, the KOfam with the highest number of annotations within the cluster and the COG with the highest number of annotations. Please note that this means that a gene cluster can still have multple annotations associated with it (a maximum of one per annotation source), so we don't allow calculation of copy numbers for pangenomes (i.e., you can't use the `--add-copy-number` flag for this input type). + ### Estimation for a set of enzymes Suppose you have a list of enzymes. This could be an entirely theoretical list, or they could come from some annotation data that you got outside of anvi'o - regardless of where you came up with this set, you can figure out what metabolic pathways these enzymes contribute to. All you have to do is format that list as an %(enzymes-txt)s file, and give that input file to this program, like so: From 736150f590eddca678d80703ae605123c327cd14 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 20 Nov 2023 15:49:13 +0100 Subject: [PATCH 22/25] note on annotation for pangenome input --- anvio/docs/programs/anvi-estimate-metabolism.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/docs/programs/anvi-estimate-metabolism.md b/anvio/docs/programs/anvi-estimate-metabolism.md index ea3e5f3688..0f1662adba 100644 --- a/anvio/docs/programs/anvi-estimate-metabolism.md +++ b/anvio/docs/programs/anvi-estimate-metabolism.md @@ -28,7 +28,7 @@ Both %(anvi-run-kegg-kofams)s and %(anvi-estimate-metabolism)s rely on the %(keg If you want to estimate for your own metabolism data, then you have a couple of extra steps to go through: 3. Define your own metabolic modules by following the formatting guidelines described [here](https://merenlab.org/software/anvio/help/main/programs/anvi-setup-user-modules/#how-do-i-format-the-module-files) and [here](https://merenlab.org/software/anvio/help/main/artifacts/user-modules-data/#a-step-by-step-guide-to-creating), and then run %(anvi-setup-user-modules)s to parse them into a %(modules-db)s, -4. Annotate your %(contigs-db)s with the functional annotation sources that are required for your module definitions. This may require running a few different programs. For instance, if your modules are defined in terms of NCBI COGS (ie, the `COG20_FUNCTION` annotation source), you will need to run %(anvi-run-ncbi-cogs)s. If you are using a set of custom HMMs, you will need to run %(anvi-run-hmms)s on that set using the `--add-to-functions-table` parameter. If you already have annotations from one or more of these sources, you could also import them into the contigs database using the program %(anvi-import-functions)s. +4. Annotate your %(contigs-db)s with the functional annotation sources that are required for your module definitions. This may require running a few different programs. For instance, if your modules are defined in terms of NCBI COGS (ie, the `COG20_FUNCTION` annotation source), you will need to run %(anvi-run-ncbi-cogs)s. If you are using a set of custom HMMs, you will need to run %(anvi-run-hmms)s on that set using the `--add-to-functions-table` parameter. If you already have annotations from one or more of these sources, you could also import them into the contigs database using the program %(anvi-import-functions)s. Note that if you want to estimate metabolism on a pangenome, this annotation step needs to be run before you create the %(genomes-storage-db)s and %(pan-db)s. ## Running metabolism estimation From 5053bba174c7b075c59ab93569bb2f67ba21e60a Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 20 Nov 2023 15:54:02 +0100 Subject: [PATCH 23/25] note on full pangenome --- anvio/docs/programs/anvi-estimate-metabolism.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/anvio/docs/programs/anvi-estimate-metabolism.md b/anvio/docs/programs/anvi-estimate-metabolism.md index 0f1662adba..788b0e2ea9 100644 --- a/anvio/docs/programs/anvi-estimate-metabolism.md +++ b/anvio/docs/programs/anvi-estimate-metabolism.md @@ -111,6 +111,8 @@ anvi-estimate-metabolism --pan-db %(pan-db)s -g %(genomes-storage-db)s -C COLLEC In this case, the program will estimate metabolism for each bin of gene clusters independently, by considering the set of enzyme annotations encoded within the set of gene clusters in the bin. Each gene cluster typically includes more than one gene from different genomes, and can therefore have multiple functions associated with it. To select which annotation is most relevant for estimation purposes, we pick the dominant function from each annotation source -- for instance, the KOfam with the highest number of annotations within the cluster and the COG with the highest number of annotations. Please note that this means that a gene cluster can still have multple annotations associated with it (a maximum of one per annotation source), so we don't allow calculation of copy numbers for pangenomes (i.e., you can't use the `--add-copy-number` flag for this input type). +Want to run the estimation on all the gene clusters in the pangenome? You should add a default collection first using %(anvi-script-add-default-collection)s. + ### Estimation for a set of enzymes Suppose you have a list of enzymes. This could be an entirely theoretical list, or they could come from some annotation data that you got outside of anvi'o - regardless of where you came up with this set, you can figure out what metabolic pathways these enzymes contribute to. All you have to do is format that list as an %(enzymes-txt)s file, and give that input file to this program, like so: From eefc80c416ee46b026a8cf9b18f666618e0fd7f8 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 20 Nov 2023 17:03:10 +0100 Subject: [PATCH 24/25] fix wrong attribute for collection --- anvio/kegg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index ffc658f0c3..c01c3ad85d 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -3032,7 +3032,7 @@ def __init__(self, args, run=run, progress=progress): self.run.info("Enzymes txt file", self.enzymes_txt, quiet=self.quiet) estimation_mode = "Genome (or metagenome assembly)" - if self.profile_db_path and self.collection: + if self.profile_db_path and self.collection_name: estimation_mode = "Bins in a metagenome" elif self.metagenome_mode: estimation_mode = "Individual contigs in a metagenome" From 3934b39bd6a2e7e1660679709948e551687c08c8 Mon Sep 17 00:00:00 2001 From: Iva Veseli Date: Mon, 20 Nov 2023 17:29:42 +0100 Subject: [PATCH 25/25] don't add annotation if accession is None --- anvio/kegg.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/anvio/kegg.py b/anvio/kegg.py index c01c3ad85d..855cfb5984 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -5149,7 +5149,8 @@ def init_hits_for_pangenome(self, gene_cluster_list: list): for source in self.annotation_sources_to_use: if source in pan_super.gene_clusters_functions_summary_dict[cluster_id]: acc = pan_super.gene_clusters_functions_summary_dict[cluster_id][source]['accession'] - enzyme_cluster_split_contig.append((acc,cluster_id,"NA","NA")) + if acc: # avoid introducing 'None' values here + enzyme_cluster_split_contig.append((acc,cluster_id,"NA","NA")) return enzyme_cluster_split_contig