diff --git a/anvio/dbinfo.py b/anvio/dbinfo.py index cd9365a503..dc52ffafe0 100644 --- a/anvio/dbinfo.py +++ b/anvio/dbinfo.py @@ -88,6 +88,7 @@ class DBInfo(ABC): db_type = None hash_name = None + functional_annotation_sources_name = None def __new__(cls, path, dont_raise=False, expecting=None): if not cls.is_db(path, dont_raise=dont_raise): @@ -208,11 +209,20 @@ def load_db(self): def get_self_table(self): with DB(self.path, None, ignore_version=True) as database: return dict(database.get_table_as_list_of_tuples('self')) + + def get_functional_annotation_sources(self): + """If sources are defined in the database, we return them as a list. Otherwise, we return None.""" + if self.functional_annotation_sources_name: + with self.load_db() as database: + return database.get_meta_value(self.functional_annotation_sources_name, return_none_if_not_in_table=True).split(',') + else: + return None class ContigsDBInfo(DBInfo): db_type = 'contigs' hash_name = 'contigs_db_hash' + functional_annotation_sources_name = 'gene_function_sources' def __init__(self, path, *args, **kwargs): DBInfo.__init__(self, path) @@ -260,6 +270,7 @@ def __init__(self, path, *args, **kwargs): class GenomeStorageDBInfo(DBInfo): db_type = 'genomestorage' hash_name = 'hash' + functional_annotation_sources_name = 'gene_function_sources' def __init__(self, path, *args, **kwargs): DBInfo.__init__(self, path) @@ -281,6 +292,7 @@ def __init__(self, path, *args, **kwargs): class ModulesDBInfo(DBInfo): db_type = 'modules' hash_name = 'hash' + functional_annotation_sources_name = 'annotation_sources' def __init__(self, path, *args, **kwargs): DBInfo.__init__(self, path) diff --git a/anvio/dbops.py b/anvio/dbops.py index 50f9511580..3e66542a36 100644 --- a/anvio/dbops.py +++ b/anvio/dbops.py @@ -1938,10 +1938,14 @@ def get_gene_clusters_functions_summary_dict(self, functional_annotation_source) return gene_clusters_functions_summary_dict - def init_gene_clusters_functions_summary_dict(self): + def init_gene_clusters_functions_summary_dict(self, source_list = None, gene_clusters_of_interest = None): """ A function to initialize the `gene_clusters_functions_summary_dict` by calling the atomic function `get_gene_clusters_functions_summary_dict` for all the - functional annotaiton sources. + functional annotation sources. + + You can restrict which sources to use and which gene clusters to initialize using the `source_list` and + `gene_clusters_of_interest` parameters, respectively. Use this ability with caution as it is possible that + downstream code may expect all sources and/or all gene clusters to be initialized. """ if not self.functions_initialized: @@ -1953,17 +1957,38 @@ def init_gene_clusters_functions_summary_dict(self): "on without any summary dict for functions and/or drama about it to let the downstream analyses " "decide how to punish the unlucky.") return - - self.progress.new('Generating a gene cluster functions summary dict', progress_total_items=len(self.gene_clusters_functions_dict)) + + gene_clusters_to_init = self.gene_clusters_functions_dict.keys() + if gene_clusters_of_interest: + gene_clusters_to_init = gene_clusters_of_interest + + # make sure that all of the requested gene clusters exist + for cluster_id in gene_clusters_of_interest: + if cluster_id not in self.gene_clusters: + raise ConfigError(f"Someone requested the function `init_gene_clusters_functions_summary_dict()` " + f"to work on a gene cluster called {cluster_id} but it does not exist in the " + f"pangenome. :/") + + sources_to_summarize = self.gene_clusters_function_sources + if source_list: + sources_to_summarize = source_list + + # make sure that all of the requested sources are in the pangenome + for s in source_list: + if s not in self.gene_clusters_function_sources: + raise ConfigError(f"Someone requested the function `init_gene_clusters_functions_summary_dict()` " + f"to use the annotation source {s}, but that source is not found in the pangenome.") + + self.progress.new('Generating a gene cluster functions summary dict', progress_total_items=len(gene_clusters_to_init)) counter = 0 - for gene_cluster_id in self.gene_clusters_functions_dict: + for gene_cluster_id in gene_clusters_to_init: if counter % 100 == 0: self.progress.increment(increment_to=counter) self.progress.update(f'{gene_cluster_id} ...') self.gene_clusters_functions_summary_dict[gene_cluster_id] = {} - for functional_annotation_source in self.gene_clusters_function_sources: + for functional_annotation_source in sources_to_summarize: accession, function = self.get_gene_cluster_function_summary(gene_cluster_id, functional_annotation_source, discard_ties=self.discard_ties, consensus_threshold=self.consensus_threshold) self.gene_clusters_functions_summary_dict[gene_cluster_id][functional_annotation_source] = {'function': function, 'accession': accession} diff --git a/anvio/docs/programs/anvi-estimate-metabolism.md b/anvio/docs/programs/anvi-estimate-metabolism.md index 2c9379976a..788b0e2ea9 100644 --- a/anvio/docs/programs/anvi-estimate-metabolism.md +++ b/anvio/docs/programs/anvi-estimate-metabolism.md @@ -4,9 +4,9 @@ The metabolic pathways that this program considers (by default) are those define Alternatively or additionally, you can define your own set of metabolic modules and estimate their completeness with this program. Detailed instructions for doing this can be found by looking at the %(user-modules-data)s and %(anvi-setup-user-modules)s pages. -Given a properly annotated %(contigs-db)s, this program determines which enzymes are present and uses these functions to compute the completeness of each metabolic module. There are currently two strategies for estimating module completeness - pathwise and stepwise - which are discussed in the technical details section on this page. The output of this program is one or more tabular text files - see %(kegg-metabolism)s for the output description and examples. +Given a properly annotated %(contigs-db)s or %(pan-db)s, this program determines which enzymes are present and uses these functions to compute the completeness of each metabolic module. There are currently two strategies for estimating module completeness - pathwise and stepwise - which are discussed in the technical details section on this page. The output of this program is one or more tabular text files - see %(kegg-metabolism)s for the output description and examples. -For a practical tutorial on how to use this program, visit [this link](https://merenlab.org/tutorials/infant-gut/#chapter-v-metabolism-prediction). A more abstract discussion of available parameters, as well as technical details about how the metabolism estimation is done, can be found below. +For a practical tutorial on how to use this program, visit [this link](https://anvio.org/tutorials/fmt-mag-metabolism/). A more abstract discussion of available parameters, as well as technical details about how the metabolism estimation is done, can be found below. ## What metabolism data can I use? @@ -28,7 +28,7 @@ Both %(anvi-run-kegg-kofams)s and %(anvi-estimate-metabolism)s rely on the %(keg If you want to estimate for your own metabolism data, then you have a couple of extra steps to go through: 3. Define your own metabolic modules by following the formatting guidelines described [here](https://merenlab.org/software/anvio/help/main/programs/anvi-setup-user-modules/#how-do-i-format-the-module-files) and [here](https://merenlab.org/software/anvio/help/main/artifacts/user-modules-data/#a-step-by-step-guide-to-creating), and then run %(anvi-setup-user-modules)s to parse them into a %(modules-db)s, -4. Annotate your %(contigs-db)s with the functional annotation sources that are required for your module definitions. This may require running a few different programs. For instance, if your modules are defined in terms of NCBI COGS (ie, the `COG20_FUNCTION` annotation source), you will need to run %(anvi-run-ncbi-cogs)s. If you are using a set of custom HMMs, you will need to run %(anvi-run-hmms)s on that set using the `--add-to-functions-table` parameter. If you already have annotations from one or more of these sources, you could also import them into the contigs database using the program %(anvi-import-functions)s. +4. Annotate your %(contigs-db)s with the functional annotation sources that are required for your module definitions. This may require running a few different programs. For instance, if your modules are defined in terms of NCBI COGS (ie, the `COG20_FUNCTION` annotation source), you will need to run %(anvi-run-ncbi-cogs)s. If you are using a set of custom HMMs, you will need to run %(anvi-run-hmms)s on that set using the `--add-to-functions-table` parameter. If you already have annotations from one or more of these sources, you could also import them into the contigs database using the program %(anvi-import-functions)s. Note that if you want to estimate metabolism on a pangenome, this annotation step needs to be run before you create the %(genomes-storage-db)s and %(pan-db)s. ## Running metabolism estimation @@ -37,8 +37,9 @@ You can run metabolism estimation on any set of annotated sequences, but these s - Single genomes, also referred to as %(external-genomes)s. These can be isolate genomes or metagenome-assembled genomes, for example. Each one is described in its own individual %(contigs-db)s. - Bins, also referred to as %(internal-genomes)s. These often represent metagenome-assembled genomes, but generally can be any subset of sequences within a database. A single %(contigs-db)s can contain multiple bins. - Assembled, unbinned metagenomes. There is no distinction between sequences that belong to different microbial populations in the %(contigs-db)s for an unbinned metagenome. +- Pangenomes in which you have binned gene clusters. Each pangenome is described in a %(pan-db)s, and must include a %(collection)s of your gene cluster bins of interest. -As you can see, %(anvi-estimate-metabolism)s takes one or more contigs database(s) as input, but the information that is taken from those databases depends on the context (ie, genome, metagenome, bin). In the case of internal genomes (or bins), is possible to have multiple inputs but only one input contigs db. So for clarity's sake, we sometimes refer to the inputs as 'samples' in the descriptions below. If you are getting confused, just try to remember that a 'sample' can be a genome, a metagenome, or a bin. +As you can see, %(anvi-estimate-metabolism)s can take one or more contigs database(s) as input, but the information that is taken from those databases depends on the context (ie, genome, metagenome, bin). In the case of internal genomes (or bins), is possible to have multiple inputs but only one input contigs db. So for clarity's sake, we sometimes refer to the inputs as 'samples' in the descriptions below. If you are getting confused, just try to remember that a 'sample' can be a genome, a metagenome, or a bin. If you don't have any sequences, there is an additional input option for you: - A list of enzymes, as described in an %(enzymes-txt)s file. For the purposes of metabolism estimation, the enzymes in this file will be interpreted as all coming from the same 'genome'. @@ -46,7 +47,7 @@ If you don't have any sequences, there is an additional input option for you: Different input contexts can require different parameters or additional inputs. The following sections describe what is necessary for each input type. -### Estimation for a single genome +### Estimation for a single genome or unbinned metagenome assembly The most basic use-case for this program is when you have one contigs database describing a single genome. Since all of the sequences in this database belong to the same genome, all of the gene annotations will be used for metabolism estimation. @@ -54,6 +55,8 @@ The most basic use-case for this program is when you have one contigs database d anvi-estimate-metabolism -c %(contigs-db)s {{ codestop }} +In some cases -- for instance, to compute community-level pathway copy numbers -- it is also appropriate to do this for unbinned metagenome assemblies. + ### Estimation for bins in a metagenome You can estimate metabolism for different subsets of the sequences in your contigs database if you first %(bin)s them and save them as a %(collection)s. For each bin, only the gene annotations from its subset of sequences will contribute to the module completeness scores. @@ -86,7 +89,7 @@ bin_3 bin_5 ``` -### Estimation for a metagenome +### Estimation for contigs in a metagenome assembly If you have an unbinned metagenome assembly, you can estimate metabolism for it using `--metagenome-mode`. In this case, since there is no way to determine which contigs belong to which microbial populations in the sample, estimation will be done on a per-contig basis; that is, for each contig, only the genes present on that contig will be used to determine pathway completeness within the contig. @@ -97,6 +100,19 @@ anvi-estimate-metabolism -c %(contigs-db)s --metagenome-mode {:.notice} In metagenome mode, this program will estimate metabolism for each contig in the metagenome separately. This will tend to underestimate module completeness because it is likely that many modules will be broken up across multiple contigs belonging to the same population. If you prefer to instead treat all enzyme annotations in the metagenome as belonging to one collective genome, you can do so by simply leaving out the `--metagenome-mode` flag (to effectively pretend that you are doing estimation for a single genome, although in your heart you will know that your contigs database really contains a metagenome). Please note that this will result in the opposite tendency to overestimate module completeness (as the enzymes will in reality be coming from multiple different populations), and there will be a lot of redundancy. We are working on improving our estimation algorithm for metagenome mode. In the meantime, if you are worried about the misleading results from either of these situations, we suggest binning your metagenomes first and running estimation for the bins as described below. + +### Estimation for gene cluster bins in a pangenome + +You can estimate the metabolisms collectively encoded by a set of gene clusters in a pangenome by providing this program with a pangenome database, its associated genomes storage database, and the name of the collection describing your gene cluster bins: + +{{ codestart }} +anvi-estimate-metabolism --pan-db %(pan-db)s -g %(genomes-storage-db)s -C COLLECTION_NAME +{{ codestop }} + +In this case, the program will estimate metabolism for each bin of gene clusters independently, by considering the set of enzyme annotations encoded within the set of gene clusters in the bin. Each gene cluster typically includes more than one gene from different genomes, and can therefore have multiple functions associated with it. To select which annotation is most relevant for estimation purposes, we pick the dominant function from each annotation source -- for instance, the KOfam with the highest number of annotations within the cluster and the COG with the highest number of annotations. Please note that this means that a gene cluster can still have multple annotations associated with it (a maximum of one per annotation source), so we don't allow calculation of copy numbers for pangenomes (i.e., you can't use the `--add-copy-number` flag for this input type). + +Want to run the estimation on all the gene clusters in the pangenome? You should add a default collection first using %(anvi-script-add-default-collection)s. + ### Estimation for a set of enzymes Suppose you have a list of enzymes. This could be an entirely theoretical list, or they could come from some annotation data that you got outside of anvi'o - regardless of where you came up with this set, you can figure out what metabolic pathways these enzymes contribute to. All you have to do is format that list as an %(enzymes-txt)s file, and give that input file to this program, like so: diff --git a/anvio/kegg.py b/anvio/kegg.py index 75fb236cf6..855cfb5984 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -29,8 +29,9 @@ from anvio.drivers.hmmer import HMMer from anvio.parsers import parser_modules from anvio.tables.genefunctions import TableForGeneFunctions -from anvio.dbops import ContigsSuperclass, ContigsDatabase, ProfileSuperclass, ProfileDatabase +from anvio.dbops import ContigsSuperclass, ContigsDatabase, ProfileSuperclass, ProfileDatabase, PanSuperclass from anvio.genomedescriptions import MetagenomeDescriptions, GenomeDescriptions +from anvio.dbinfo import DBInfo __author__ = "Developers of anvi'o (see AUTHORS.txt)" @@ -2828,6 +2829,8 @@ def __init__(self, args, run=run, progress=progress): A = lambda x: args.__dict__[x] if x in args.__dict__ else None self.contigs_db_path = A('contigs_db') self.profile_db_path = A('profile_db') + self.pan_db_path = A('pan_db') + self.genomes_storage_path = A('genomes_storage') self.collection_name = A('collection_name') self.bin_id = A('bin_id') self.bin_ids_file = A('bin_ids_file') @@ -2849,6 +2852,8 @@ def __init__(self, args, run=run, progress=progress): self.name_header = 'contig_name' elif self.profile_db_path and self.collection_name and not self.metagenome_mode: self.name_header = 'bin_name' + elif self.pan_db_path: + self.name_header = 'gene_cluster_bin_name' else: self.name_header = 'genome_name' @@ -2864,13 +2869,20 @@ def __init__(self, args, run=run, progress=progress): 'mode_type' : 'all', 'description': "Name of genome/bin/metagenome in which we find gene annotations (hits) and/or modules" } + if self.pan_db_path: + self.update_available_headers_for_pan() if self.enzymes_txt: self.contigs_db_project_name = os.path.basename(self.enzymes_txt).replace(".", "_") # INPUT OPTIONS SANITY CHECKS - if not self.estimate_from_json and not self.contigs_db_path and not self.enzymes_txt: + if not self.estimate_from_json and not self.contigs_db_path and not self.enzymes_txt and not self.pan_db_path: raise ConfigError("NO INPUT PROVIDED. Please use the `-h` flag to see possible input options.") + # incompatible input options + if (self.contigs_db_path and (self.pan_db_path or self.enzymes_txt)) or \ + (self.enzymes_txt and self.pan_db_path): + raise ConfigError("MULTIPLE INPUT OPTIONS DETECTED. Please check your parameters. You cannot provide more than one " + "of the following: a contigs database, an enzymes-txt file, or a pangenome database.") if self.only_user_modules and not self.user_input_dir: raise ConfigError("You can only use the flag --only-user-modules if you provide a --user-modules directory.") @@ -2885,15 +2897,27 @@ def __init__(self, args, run=run, progress=progress): filesnpaths.is_file_exists(self.bin_ids_file) self.bin_ids_to_process = [line.strip() for line in open(self.bin_ids_file).readlines()] - if (self.bin_id or self.bin_ids_file or self.collection_name) and not self.profile_db_path: + # required with collection/bin input + if (self.bin_id or self.bin_ids_file or self.collection_name) and not self.profile_db_path and not self.pan_db_path: raise ConfigError("You have requested metabolism estimation for a bin or set of bins, but you haven't provided " - "a profiles database. Unfortunately, this just does not work. Please try again.") - + "a profile database or pan database. Unfortunately, this just does not work. Please try again.") + # required with profile db input if self.profile_db_path and not (self.collection_name or self.add_coverage or self.metagenome_mode): raise ConfigError("If you provide a profile DB, you should also provide either a collection name (to estimate metabolism " "on a collection of bins) or use the --add-coverage flag (so that coverage info goes into the output " "files), or both. Otherwise the profile DB is useless.") - + # required/forbidden with pangenome input + if self.pan_db_path and not self.genomes_storage_path: + raise ConfigError("You have provided a pan database but not its associated genomes storage database. Please give the " + "path to the genomes storage db using the `-g` flag.") + if self.pan_db_path and not self.collection_name: + raise ConfigError("You need to provide a collection name when you estimate metabolism on a pangenome. If you don't " + "already have a collection of gene clusters in the pan database, please make one first. Then provide " + "the collection to this program; you can find the collection name parameter in the INPUT #2 section " + "of the `-h` output.") + if self.pan_db_path and (self.add_copy_number or self.add_coverage): + raise ConfigError("The flags --add-copy-number or --add-coverage do not work for pangenome input.") + # required/forbidden with JSON estimation if self.store_json_without_estimation and not self.json_output_file_path: raise ConfigError("Whoops. You seem to want to store the metabolism dictionary in a JSON file, but you haven't provided the name of that file. " "Please use the --get-raw-data-as-json flag to do so.") @@ -2903,6 +2927,8 @@ def __init__(self, args, run=run, progress=progress): if self.profile_db_path: utils.is_profile_db_and_contigs_db_compatible(self.profile_db_path, self.contigs_db_path) + if self.pan_db_path: + utils.is_pan_db_and_genomes_storage_db_compatible(self.pan_db_path, self.genomes_storage_path) if self.add_coverage: if not self.enzymes_txt: @@ -2993,6 +3019,9 @@ def __init__(self, args, run=run, progress=progress): self.run.info("Contigs DB", self.contigs_db_path, quiet=self.quiet) if self.profile_db_path: self.run.info("Profile DB", self.profile_db_path, quiet=self.quiet) + if self.pan_db_path: + self.run.info("Pan DB", self.pan_db_path, quiet=self.quiet) + self.run.info("Genomes Storage DB", self.genomes_storage_path, quiet=self.quiet) if self.collection_name: self.run.info('Collection', self.collection_name, quiet=self.quiet) if self.bin_id: @@ -3002,10 +3031,20 @@ def __init__(self, args, run=run, progress=progress): if self.enzymes_txt: self.run.info("Enzymes txt file", self.enzymes_txt, quiet=self.quiet) - self.run.info('Metagenome mode', self.metagenome_mode, quiet=self.quiet) + estimation_mode = "Genome (or metagenome assembly)" + if self.profile_db_path and self.collection_name: + estimation_mode = "Bins in a metagenome" + elif self.metagenome_mode: + estimation_mode = "Individual contigs in a metagenome" + elif self.enzymes_txt: + estimation_mode = "List of enzymes" + elif self.pan_db_path: + estimation_mode = "Gene cluster bins in a pangenome" + + self.run.info('Mode (what we are estimating metabolism for)', estimation_mode, quiet=self.quiet) - if not self.estimate_from_json and not self.enzymes_txt: + if self.contigs_db_path: utils.is_contigs_db(self.contigs_db_path) # here we load the contigs DB just for sanity check purposes. # We will need to load it again later just before accessing data to avoid SQLite error that comes from different processes accessing the DB @@ -3033,7 +3072,7 @@ def __init__(self, args, run=run, progress=progress): f"`anvi-setup-kegg-data`, though we are not sure how you got to this point in that case." f"But fine. Hopefully you now know what you need to do to make this message go away.") - if not self.estimate_from_json and not self.enzymes_txt: + if self.contigs_db_path: # sanity check that contigs db was annotated with same version of MODULES.db that will be used for metabolism estimation if 'modules_db_hash' not in contigs_db.meta: raise ConfigError("Based on the contigs DB metadata, the contigs DB that you are working with has not been annotated with hits to the " @@ -3077,8 +3116,7 @@ def __init__(self, args, run=run, progress=progress): # LOAD USER DATA - if not self.estimate_from_json and not self.enzymes_txt: - if self.user_input_dir: + if self.user_input_dir: # check for user modules db if not os.path.exists(self.user_modules_db_path): raise ConfigError(f"It appears that a USER-DEFINED modules database ({self.user_modules_db_path}) does not exist in the provided data directory. " @@ -3089,15 +3127,16 @@ def __init__(self, args, run=run, progress=progress): user_modules_db = ModulesDatabase(self.user_modules_db_path, args=self.args, quiet=self.quiet) modules_db_sources = set(user_modules_db.db.get_meta_value('annotation_sources').split(',')) - contigs_db_sources = set(contigs_db.meta['gene_function_sources']) - source_in_modules_not_contigs = modules_db_sources.difference(contigs_db_sources) + if self.contigs_db_path: + contigs_db_sources = set(contigs_db.meta['gene_function_sources']) + source_in_modules_not_contigs = modules_db_sources.difference(contigs_db_sources) - if source_in_modules_not_contigs: - missing_sources = ", ".join(source_in_modules_not_contigs) - raise ConfigError(f"Your contigs database is missing one or more functional annotation sources that are " - f"required for the modules in the database at {self.user_modules_db_path}. You will have to " - f"annotate the contigs DB with these sources (or import them using `anvi-import-functions`) " - f"before running this program again. Here are the missing sources: {missing_sources}") + if source_in_modules_not_contigs: + missing_sources = ", ".join(source_in_modules_not_contigs) + raise ConfigError(f"Your contigs database is missing one or more functional annotation sources that are " + f"required for the modules in the database at {self.user_modules_db_path}. You will have to " + f"annotate the contigs DB with these sources (or import them using `anvi-import-functions`) " + f"before running this program again. Here are the missing sources: {missing_sources}") # expand annotation source set to include those in user db annotation_source_set.update(modules_db_sources) @@ -3109,7 +3148,7 @@ def __init__(self, args, run=run, progress=progress): self.ko_dict[k] = user_kos[k] user_modules_db.disconnect() - if not self.estimate_from_json and not self.enzymes_txt: + if self.contigs_db_path: contigs_db.disconnect() self.annotation_sources_to_use = list(annotation_source_set) @@ -3123,6 +3162,34 @@ def __init__(self, args, run=run, progress=progress): else: self.run.info('Metabolism data', "KEGG only") + # sanity check for annotation sources in pangenome + if self.genomes_storage_path: + available_sources = DBInfo(self.genomes_storage_path).get_functional_annotation_sources() + missing_sources = [] + for source in self.annotation_sources_to_use: + if source not in available_sources: + missing_sources.append(source) + if missing_sources: + miss_str = ", ".join(missing_sources) + raise ConfigError(f"The following functional annotation sources are required for metabolism " + f"estimation on the chosen metabolism data, but are missing from your genome " + f"storage database: {miss_str}. You'll need to figure out which genomes in the db " + f"are missing those sources, and annotate them before re-making the genomes storage.") + + + def update_available_headers_for_pan(self): + """This function updates the available headers dictionary for pangenome-specific headers.""" + + # in modules mode, we replace 'gene_caller_ids_in_module' with 'gene_clusters_in_module' + self.available_headers['gene_clusters_in_module'] = { + 'cdict_key': None, + 'mode_type': 'modules', + 'description': "Comma-separated list of gene cluster IDs that contribute to a module" + } + self.available_headers.pop('gene_caller_ids_in_module') + + self.available_modes['modules']["headers"] = ['gene_clusters_in_module' if x == 'gene_caller_ids_in_module' else x for x in self.available_modes['modules']["headers"]] + def list_output_modes(self): """This function prints out the available output modes for the metabolism estimation script.""" @@ -5055,11 +5122,89 @@ def estimate_metabolism_from_enzymes_txt(self): return enzyme_metabolism_superdict, enzyme_ko_superdict + def init_hits_for_pangenome(self, gene_cluster_list: list): + """This function loads enzyme annotations from the pangenome for use by downstream metabolism estimation. + + For each gene cluster, it takes the most common function from each annotation source relevant to the modules. + + PARAMETERS + ========== + gene_cluster_list : list + which gene cluster IDs to load from the pan DB + + RETURNS + ======= + enzyme_cluster_split_contig : list + (enzyme_accession, gene_cluster_id, split, contig) tuples in which split and contig are both NAs + """ + + pan_super = PanSuperclass(self.args) + pan_super.init_gene_clusters(gene_cluster_ids_to_focus = gene_cluster_list) + pan_super.init_gene_clusters_functions_summary_dict(source_list = self.annotation_sources_to_use, gene_clusters_of_interest = gene_cluster_list) + + + enzyme_cluster_split_contig = [] + # no splits or contigs here + for cluster_id in gene_cluster_list: + for source in self.annotation_sources_to_use: + if source in pan_super.gene_clusters_functions_summary_dict[cluster_id]: + acc = pan_super.gene_clusters_functions_summary_dict[cluster_id][source]['accession'] + if acc: # avoid introducing 'None' values here + enzyme_cluster_split_contig.append((acc,cluster_id,"NA","NA")) + + return enzyme_cluster_split_contig + + + def estimate_metabolism_for_pangenome_bins(self, enzyme_cluster_split_contig, cluster_collection): + """Estimates metabolism individually on each bin in a pangenome. + + PARAMETERS + ========== + enzyme_cluster_split_contig : list + (enzyme_accession, gene_cluster_id, split, contig) tuples in which split and contig are both NAs + + cluster_collection : dictionary + maps bin names in the collection to the list of gene clusters in each bin + """ + + gc_bins_metabolism_superdict = {} + gc_bins_ko_superdict = {} + num_bins = len(cluster_collection) + + self.progress.new("Estimating metabolism for each bin of gene clusters", progress_total_items=num_bins) + + for bin_name, gc_list in cluster_collection.items(): + self.progress.update("[%d of %d] %s" % (self.progress.progress_current_item + 1, num_bins, bin_name)) + + enzymes_in_bin = [tpl for tpl in enzyme_cluster_split_contig if tpl[1] in gc_list] + metabolism_dict_for_bin, ko_dict_for_bin = self.mark_kos_present_for_list_of_splits(enzymes_in_bin, bin_name=bin_name) + + if not self.store_json_without_estimation: + gc_bins_metabolism_superdict[bin_name] = self.estimate_for_list_of_splits(metabolism_dict_for_bin, bin_name=bin_name) + single_bin_module_superdict = {bin_name: gc_bins_metabolism_superdict[bin_name]} + gc_bins_ko_superdict[bin_name] = ko_dict_for_bin + else: + gc_bins_metabolism_superdict[bin_name] = metabolism_dict_for_bin + single_bin_module_superdict = {bin_name: metabolism_dict_for_bin} + gc_bins_ko_superdict[bin_name] = ko_dict_for_bin + + # append individual bin to file + single_bin_ko_superdict = {bin_name: ko_dict_for_bin} + self.append_kegg_metabolism_superdicts(single_bin_module_superdict, single_bin_ko_superdict) + + self.progress.increment() + self.progress.reset() + + self.progress.end() + + return gc_bins_metabolism_superdict, gc_bins_ko_superdict + + def estimate_metabolism(self, skip_storing_data=False, output_files_dictionary=None, return_superdicts=False, return_subset_for_matrix_format=False, all_modules_in_db=None, all_kos_in_db=None, module_paths_dict=None): """This is the driver function for estimating metabolism for a single contigs DB. - It will decide what to do based on whether the input contigs DB is a genome or metagenome. + It will decide what to do based on whether the input DB is a genome, metagenome, or pangenome. It usually avoids returning the metabolism data to save on memory (as this data is typically appended to files immediately), but this behavior can be changed by setting return_superdicts to True (for the entire modules/ko superdictionaries) or return_subset_for_matrix_format to True (for a subset of these dicts that @@ -5124,6 +5269,21 @@ def estimate_metabolism(self, skip_storing_data=False, output_files_dictionary=N if self.enzymes_txt: self.enzymes_txt_data = self.load_data_from_enzymes_txt() kegg_metabolism_superdict, kofam_hits_superdict = self.estimate_metabolism_from_enzymes_txt() + elif self.pan_db_path: + gene_cluster_collections = ccollections.Collections() + gene_cluster_collections.populate_collections_dict(self.pan_db_path) + if self.collection_name not in gene_cluster_collections.collections_dict: + c_str = ', '.join(gene_cluster_collections.collections_dict.keys()) + raise ConfigError(f"The collection name you provided ('{self.collection_name}') is not valid for this " + f"Pan DB. Here are the collections in this database: {c_str}") + collection_dict = gene_cluster_collections.get_collection_dict(self.collection_name) + + all_gene_clusters_in_collection = [] + for bin_name, gene_cluster_list in collection_dict.items(): + all_gene_clusters_in_collection += gene_cluster_list + + kofam_hits_info = self.init_hits_for_pangenome(gene_cluster_list = all_gene_clusters_in_collection) + kegg_metabolism_superdict, kofam_hits_superdict = self.estimate_metabolism_for_pangenome_bins(kofam_hits_info, collection_dict) else: kofam_hits_info = self.init_hits_and_splits(annotation_sources=self.annotation_sources_to_use) @@ -5241,6 +5401,8 @@ def add_common_elements_to_output_dict_for_module_in_bin(self, bin, mnum, c_dict d[self.modules_unique_id]["enzyme_hits_in_module"] = ",".join(kos_in_mod_list) if "gene_caller_ids_in_module" in headers_to_include: d[self.modules_unique_id]["gene_caller_ids_in_module"] = ",".join(gcids_in_mod) + if "gene_clusters_in_module" in headers_to_include: + d[self.modules_unique_id]["gene_clusters_in_module"] = ",".join(gcids_in_mod) # comma-separated list of warnings if "warnings" in headers_to_include: diff --git a/anvio/utils.py b/anvio/utils.py index e6b79b3f19..6411c517df 100644 --- a/anvio/utils.py +++ b/anvio/utils.py @@ -4274,6 +4274,18 @@ def is_structure_db_and_contigs_db_compatible(structure_db_path, contigs_db_path return True +def is_pan_db_and_genomes_storage_db_compatible(pan_db_path, genomes_storage_path): + pdb = dbi(pan_db_path) + gdb = dbi(genomes_storage_path) + + if pdb.hash != gdb.hash: + raise ConfigError(f"The pan database and the genomes storage database do not seem to " + f"be compatible. More specifically, the genomes storage database is " + f"not the one that was used when the pangenome was created. " + f"({pdb.hash} != {gdb.hash})") + + return True + # # FIXME # def is_external_genomes_compatible_with_pan_database(pan_db_path, external_genomes_path): diff --git a/bin/anvi-estimate-metabolism b/bin/anvi-estimate-metabolism index dd14473c25..99dfdf3c0b 100755 --- a/bin/anvi-estimate-metabolism +++ b/bin/anvi-estimate-metabolism @@ -16,7 +16,7 @@ __version__ = anvio.__version__ __authors__ = ['ivagljiva'] __requires__ = ["contigs-db", "kegg-data", "kegg-functions", "profile-db", "collection", "bin", "external-genomes", "internal-genomes", "metagenomes", "user-modules-data", - "enzymes-txt"] + "enzymes-txt", "pan-db", "genomes-storage-db"] __provides__ = ["kegg-metabolism","user-metabolism"] __description__ = "Reconstructs metabolic pathways and estimates pathway completeness for a given set of contigs" @@ -84,6 +84,15 @@ if __name__ == '__main__': "metabolic capabilities encoded by this list of enzymes.") groupX.add_argument(*anvio.A('enzymes-txt'), **anvio.K('enzymes-txt', {'required': False})) + groupN = parser.add_argument_group('INPUT #5 - ESTIMATION ON A PANGENOME', + "If you have a collection of binned gene clusters in a pangenome, you can estimate " + "metabolism on each bin. You will need to provide the collection name (and optionally, " + "bin name(s)) -- see INPUT #2 section above. This program will use the most common " + "annotation (for each annotation source) from each gene cluster. Note that this input " + "option is not compatible with the `--add-copy-number` or `--add-coverage` flags.") + groupN.add_argument('--pan-db', **anvio.K('pan-db', {'required': False})) + groupN.add_argument(*anvio.A('genomes-storage'), **anvio.K('genomes-storage')) + groupC = parser.add_argument_group('OUTPUT - GENERAL OPTIONS', "Parameters for controlling estimation output of any type. The output will be " "TAB-delimited files which by default are prefixed with 'kegg-metabolism', but you can of course " "change that name here.")