diff --git a/databases.py b/databases.py index 53cd0f8..b67d188 100644 --- a/databases.py +++ b/databases.py @@ -74,7 +74,8 @@ def get_pathway(self, pathway_id: str, self_loops: bool = False): if G.node[node]['type'] != 'gene': for in_edge in G.in_edges(node): for out_edge in G.out_edges(node): - G.add_edge(in_edge[0], out_edge[1], type=['indirect']) + if in_edge[0] != out_edge[1] or (in_edge[0] == out_edge[1] and self_loops): + G.add_edge(in_edge[0], out_edge[1], type=['indirect']) not_gene_nodes.append(node) G.remove_nodes_from(not_gene_nodes) diff --git a/methods/impact_analysis/constants.py b/methods/impact_analysis/constants.py index 767669a..ee2611b 100644 --- a/methods/impact_analysis/constants.py +++ b/methods/impact_analysis/constants.py @@ -10,6 +10,7 @@ interaction_weights = { 'activation': 1.0, 'complex': 1.0, + 'compound': 1.0, 'dissociation': 1.0, 'glycosylation': 1.0, 'indirect': 1.0, diff --git a/methods/impact_analysis/impact_analysis.py b/methods/impact_analysis/impact_analysis.py index 3a274c0..a6a7ee5 100644 --- a/methods/impact_analysis/impact_analysis.py +++ b/methods/impact_analysis/impact_analysis.py @@ -3,69 +3,99 @@ from metrics import mean from models import Experiment, Gene from networkx import get_edge_attributes -from numpy import log2, isnan, nan +from numpy import log2, isnan from stats import ttest, hypergeom_distribution +from statsmodels.stats.multitest import multipletests from .constants import * +import pandas as pd class IAPathway: - def __init__(self, IF: float, pvalue: float, name: str): - self.IF = IF - self.pvalue = pvalue - self.name = name - - def set_corrections(self, fdr: float, bonf: float): - self.FDR = fdr - self.Bonferroni = bonf + def __init__(self, data): + self.IF = round(data['IF'], 3) + self.pvalue = round(data['pvalue'], 3) + self.name = data['name'] + self.FDR = round(data['FDR'], 3) + self.Bonferroni = round(data['Bonferroni'], 3) class ImpactAnalysisResult(MethodResult): - # columns = ['name', 'IF', 'pvalue', 'FDR', 'Bonferroni'] #TODO - columns = ['name', 'IF', 'pvalue'] - description = """ test """ # TODO + columns = ['name', 'IF', 'pvalue', 'FDR', 'Bonferroni'] class ImpactAnalysis(Method): """ - method and workflow description and sources - """ # TODO + Impact analysis is a pathways analysis method that considers the magnitude of each gene’s expression change, + their type and position in the given pathways, and their interactions. + + Pipeline schema: + 1. Gene expression change between two conditions (logFC) is calculated. + 2. Differentially expressed genes are identified. + 3. KEGG Pathways database is searched for pathways containing at least one differentially expressed gene. + 4. An impact factor (IF) is calculated for each pathway incorporating parameters such as the normalized + fold change of the differentially expressed genes, the statistical significance of the set of pathway genes, + and the topology of the pathway. + 5. Multiple hypothesis testing adjustments of each pathway significance are performed by FDR and Bonferroni + correction calculation. + + Additional method arguments allow specifying organism for database selection and the threshold for identification + of differentially expressed genes. + + For more information, please refer to: + Draghici S, Khatri P, Tarca AL, Amin K, Done A, et al. (2007), + A systems biology approach for pathway level analysis. Genome Res 17: 1537–1545. + + """ help = __doc__ name = 'impact_analysis' - legal_disclaimer = """ test """ # TODO - def __init__(self, organism: str = 'Homo sapiens', threshold: float = 0.05, **kwargs): + """ + + Args: + organism: organism name (ex. 'Homo sapiens', 'human') + threshold: float: threshold for identification of differentially expressed genes + """ if threshold < 0 or threshold > 1: raise ValueError('Indices need to be in (0,1) range') self.threshold = threshold self.org = organism - self.DEGs = None + self.degs = None # differentially expressed genes self.FC = None self.experiment_genes = None def run(self, experiment: Experiment) -> ImpactAnalysisResult: + """ + + Returns: + list of pathways sorted by their impact factor. Each pathway in the list has values of FDR and + Bonferroni corrections assigned. + """ + + # calculate fold change + self.FC = experiment.calculate_fold_change() + + # remove genes for witch fold change cannot be calculated correctly + experiment.exclude_genes(list(self.FC['FC'][isnan(self.FC['FC'])].index)) # select differentialy expressed genes pvalue = ttest(experiment) <= self.threshold - self.DEGs = pvalue[pvalue == True] + self.degs = pvalue[pvalue == True] - if self.DEGs.size == 0: + if self.degs.size == 0: # if there are no DEGs anywhere, the problem of finding the impact on various pathways is meaningless print('No differentialy expressed genes.') return ImpactAnalysisResult([]) - # calculate fold change - self.FC = experiment.calculate_fold_change() - self.experiment_genes = set([gene.name for gene in experiment.get_all().genes]) db = KEGGPathways(self.org) pathways = {} - for gene in [g.name for g in list(self.DEGs.index)]: + for gene in [g.name for g in list(self.degs.index)]: ps = db.search_by_gene(gene) for (k, v) in ps.items(): if k not in pathways.keys(): @@ -75,40 +105,42 @@ def run(self, experiment: Experiment) -> ImpactAnalysisResult: print('No pathways found in database.') return ImpactAnalysisResult([]) - ifp_pathways = [] + res = pd.DataFrame(columns=['name', 'IF', 'pvalue']) for (code, descr) in pathways.items(): pathway = db.get_pathway(code) impact_factor, pval = self.calculate_impact_factor(experiment, pathway) if impact_factor is not None and pval is not None: - ifp = IAPathway(impact_factor, pval, descr) - ifp_pathways.append(ifp) + res.loc[len(res.index)] = [descr, impact_factor, pval] + res['FDR'], res['Bonferroni'] = self.calculate_corrections(res['pvalue']) + ifp_pathways = [IAPathway(res.loc[i]) for i in range(len(res.index))] ifp_pathways.sort(key=lambda x: x.IF if not isnan(x.IF) else 0, reverse=True) + return ImpactAnalysisResult(ifp_pathways) def calculate_impact_factor(self, experiment, pathway): path_genes = set([x.strip() for x in ' ,'.join(pathway.nodes).split(',')]) - DEG_genes = set([gene.name for gene in list(self.DEGs.index)]) + DEGs_set = set([gene.name for gene in list(self.degs.index)]) # no DEGs in pathway - if len(path_genes & DEG_genes) == 0: + if len(path_genes & DEGs_set) == 0: return None, None - pval_path = hypergeom_distribution(len(path_genes & DEG_genes), len(self.experiment_genes), self.DEGs.size, + pval_path = hypergeom_distribution(len(path_genes & DEGs_set), len(self.experiment_genes), self.degs.size, len(path_genes & self.experiment_genes)) if pval_path != 0: impact_factor = log2(pval_path) impact_factor += sum( - [self.calculate_perturbation_factor(experiment, gene, pathway) for gene in pathway.nodes]) / len( - path_genes & DEG_genes) * mean([abs(i) for i in self.FC['FC'].values if not isnan(i)]) + [abs(self.calculate_perturbation_factor(experiment, gene, pathway)) for gene in pathway.nodes]) / len( + path_genes & DEGs_set) * mean([abs(i) for i in self.FC['logFC'].values if not isnan(i)]) else: impact_factor = MAX_IF - return round(impact_factor, 3), round(pval_path, 3) + return impact_factor, pval_path def calculate_perturbation_factor(self, experiment, gene, pathway, visited=None): @@ -119,8 +151,9 @@ def calculate_perturbation_factor(self, experiment, gene, pathway, visited=None) else: for name in gene.split(','): if name.strip() in self.experiment_genes: - # get ΔE if measurable #TODO diffrent measures of expression change - pf = self.FC['FC'][Gene(name)] if not isnan(self.FC['FC'][Gene(name)]) else 0 + # get ΔE (logFC) + pf = self.FC['logFC'][Gene(name)] if not isnan(self.FC['logFC'][Gene(name)]) else MAX_IF + # genes directly upstream for edge in pathway.in_edges(gene): if edge[0] not in visited: @@ -131,3 +164,16 @@ def calculate_perturbation_factor(self, experiment, gene, pathway, visited=None) pf += self.calculate_perturbation_factor(experiment, edge[0], pathway, visited + [edge[0]]) * beta / dstream return pf + + def calculate_corrections(self, pvalues): + """ + + Args: + pvalues: array_like vector of p-values + + Returns: + two arrays of p-values corrected for multiple tests using FDR and Bonferroni correction method + + """ + + return multipletests(pvalues, method='fdr_bh')[1], multipletests(pvalues, method='bonferroni')[1] diff --git a/models.py b/models.py index 4ef91ab..e2f73e0 100644 --- a/models.py +++ b/models.py @@ -107,6 +107,12 @@ def __eq__(self, other): def __repr__(self): return f'' + def exclude_genes(self, gene_list: list): + for gene in gene_list: + assert isinstance(gene, Gene) + if gene in self.data.keys(): + del self.data[gene] + def first_line(file_object, skip_rows=0): line = None @@ -420,6 +426,10 @@ def from_csv_file(cls, name, file_object, **kwargs): kwargs['delimiter'] = ',' return cls.from_file(name, file_object, **kwargs) + def exclude_genes(self, gene_list: list): + for sample in self.samples: + sample.exclude_genes(gene_list) + # TODO class variable with set of genes + method(s) for checking data integrity class Experiment: @@ -450,3 +460,6 @@ def calculate_fold_change(self): ratio = ratio_of_classes(case, control) fc[idx] = [ratio, log2(ratio)] return pd.DataFrame.from_dict(fc, orient="index", columns=['FC', 'logFC']) + + def exclude_genes(self, gene_list: list): + self.get_all().exclude_genes(gene_list)