Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

IA docstrings and PEP8, exclude_genes for Sample, SampleCollection and Experiment, KEGGPathways self_loops correction #9

Merged
merged 3 commits into from
Jun 19, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion databases.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ def get_pathway(self, pathway_id: str, self_loops: bool = False):
if G.node[node]['type'] != 'gene':
for in_edge in G.in_edges(node):
for out_edge in G.out_edges(node):
G.add_edge(in_edge[0], out_edge[1], type=['indirect'])
if in_edge[0] != out_edge[1] or (in_edge[0] == out_edge[1] and self_loops):
G.add_edge(in_edge[0], out_edge[1], type=['indirect'])
not_gene_nodes.append(node)
G.remove_nodes_from(not_gene_nodes)

Expand Down
1 change: 1 addition & 0 deletions methods/impact_analysis/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
interaction_weights = {
'activation': 1.0,
'complex': 1.0,
'compound': 1.0,
'dissociation': 1.0,
'glycosylation': 1.0,
'indirect': 1.0,
Expand Down
114 changes: 80 additions & 34 deletions methods/impact_analysis/impact_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,69 +3,99 @@
from metrics import mean
from models import Experiment, Gene
from networkx import get_edge_attributes
from numpy import log2, isnan, nan
from numpy import log2, isnan
from stats import ttest, hypergeom_distribution
from statsmodels.stats.multitest import multipletests
from .constants import *
import pandas as pd


class IAPathway:

def __init__(self, IF: float, pvalue: float, name: str):
self.IF = IF
self.pvalue = pvalue
self.name = name

def set_corrections(self, fdr: float, bonf: float):
self.FDR = fdr
self.Bonferroni = bonf
def __init__(self, data):
self.IF = round(data['IF'], 3)
self.pvalue = round(data['pvalue'], 3)
self.name = data['name']
self.FDR = round(data['FDR'], 3)
self.Bonferroni = round(data['Bonferroni'], 3)


class ImpactAnalysisResult(MethodResult):
# columns = ['name', 'IF', 'pvalue', 'FDR', 'Bonferroni'] #TODO
columns = ['name', 'IF', 'pvalue']
description = """ test """ # TODO
columns = ['name', 'IF', 'pvalue', 'FDR', 'Bonferroni']


class ImpactAnalysis(Method):
"""
method and workflow description and sources
""" # TODO
Impact analysis is a pathways analysis method that considers the magnitude of each gene’s expression change,
their type and position in the given pathways, and their interactions.

Pipeline schema:
1. Gene expression change between two conditions (logFC) is calculated.
2. Differentially expressed genes are identified.
3. KEGG Pathways database is searched for pathways containing at least one differentially expressed gene.
4. An impact factor (IF) is calculated for each pathway incorporating parameters such as the normalized
fold change of the differentially expressed genes, the statistical significance of the set of pathway genes,
and the topology of the pathway.
5. Multiple hypothesis testing adjustments of each pathway significance are performed by FDR and Bonferroni
correction calculation.

Additional method arguments allow specifying organism for database selection and the threshold for identification
of differentially expressed genes.

For more information, please refer to:
Draghici S, Khatri P, Tarca AL, Amin K, Done A, et al. (2007),
A systems biology approach for pathway level analysis. Genome Res 17: 1537–1545.

"""

help = __doc__

name = 'impact_analysis'

legal_disclaimer = """ test """ # TODO

def __init__(self, organism: str = 'Homo sapiens', threshold: float = 0.05, **kwargs):
"""

Args:
organism: organism name (ex. 'Homo sapiens', 'human')
threshold: float: threshold for identification of differentially expressed genes
"""
if threshold < 0 or threshold > 1:
raise ValueError('Indices need to be in (0,1) range')
self.threshold = threshold
self.org = organism
self.DEGs = None
self.degs = None # differentially expressed genes
self.FC = None
self.experiment_genes = None

def run(self, experiment: Experiment) -> ImpactAnalysisResult:
"""

Returns:
list of pathways sorted by their impact factor. Each pathway in the list has values of FDR and
Bonferroni corrections assigned.
"""

# calculate fold change
self.FC = experiment.calculate_fold_change()

# remove genes for witch fold change cannot be calculated correctly
experiment.exclude_genes(list(self.FC['FC'][isnan(self.FC['FC'])].index))

# select differentialy expressed genes
pvalue = ttest(experiment) <= self.threshold
self.DEGs = pvalue[pvalue == True]
self.degs = pvalue[pvalue == True]

if self.DEGs.size == 0:
if self.degs.size == 0:
# if there are no DEGs anywhere, the problem of finding the impact on various pathways is meaningless
print('No differentialy expressed genes.')
return ImpactAnalysisResult([])

# calculate fold change
self.FC = experiment.calculate_fold_change()

self.experiment_genes = set([gene.name for gene in experiment.get_all().genes])

db = KEGGPathways(self.org)
pathways = {}

for gene in [g.name for g in list(self.DEGs.index)]:
for gene in [g.name for g in list(self.degs.index)]:
ps = db.search_by_gene(gene)
for (k, v) in ps.items():
if k not in pathways.keys():
Expand All @@ -75,40 +105,42 @@ def run(self, experiment: Experiment) -> ImpactAnalysisResult:
print('No pathways found in database.')
return ImpactAnalysisResult([])

ifp_pathways = []
res = pd.DataFrame(columns=['name', 'IF', 'pvalue'])
for (code, descr) in pathways.items():
pathway = db.get_pathway(code)
impact_factor, pval = self.calculate_impact_factor(experiment, pathway)
if impact_factor is not None and pval is not None:
ifp = IAPathway(impact_factor, pval, descr)
ifp_pathways.append(ifp)
res.loc[len(res.index)] = [descr, impact_factor, pval]

res['FDR'], res['Bonferroni'] = self.calculate_corrections(res['pvalue'])
ifp_pathways = [IAPathway(res.loc[i]) for i in range(len(res.index))]
ifp_pathways.sort(key=lambda x: x.IF if not isnan(x.IF) else 0, reverse=True)

return ImpactAnalysisResult(ifp_pathways)

def calculate_impact_factor(self, experiment, pathway):

path_genes = set([x.strip() for x in ' ,'.join(pathway.nodes).split(',')])
DEG_genes = set([gene.name for gene in list(self.DEGs.index)])
DEGs_set = set([gene.name for gene in list(self.degs.index)])

# no DEGs in pathway
if len(path_genes & DEG_genes) == 0:
if len(path_genes & DEGs_set) == 0:
return None, None

pval_path = hypergeom_distribution(len(path_genes & DEG_genes), len(self.experiment_genes), self.DEGs.size,
pval_path = hypergeom_distribution(len(path_genes & DEGs_set), len(self.experiment_genes), self.degs.size,
len(path_genes & self.experiment_genes))

if pval_path != 0:
impact_factor = log2(pval_path)

impact_factor += sum(
[self.calculate_perturbation_factor(experiment, gene, pathway) for gene in pathway.nodes]) / len(
path_genes & DEG_genes) * mean([abs(i) for i in self.FC['FC'].values if not isnan(i)])
[abs(self.calculate_perturbation_factor(experiment, gene, pathway)) for gene in pathway.nodes]) / len(
path_genes & DEGs_set) * mean([abs(i) for i in self.FC['logFC'].values if not isnan(i)])

else:
impact_factor = MAX_IF

return round(impact_factor, 3), round(pval_path, 3)
return impact_factor, pval_path

def calculate_perturbation_factor(self, experiment, gene, pathway, visited=None):

Expand All @@ -119,8 +151,9 @@ def calculate_perturbation_factor(self, experiment, gene, pathway, visited=None)
else:
for name in gene.split(','):
if name.strip() in self.experiment_genes:
# get ΔE if measurable #TODO diffrent measures of expression change
pf = self.FC['FC'][Gene(name)] if not isnan(self.FC['FC'][Gene(name)]) else 0
# get ΔE (logFC)
pf = self.FC['logFC'][Gene(name)] if not isnan(self.FC['logFC'][Gene(name)]) else MAX_IF

# genes directly upstream
for edge in pathway.in_edges(gene):
if edge[0] not in visited:
Expand All @@ -131,3 +164,16 @@ def calculate_perturbation_factor(self, experiment, gene, pathway, visited=None)
pf += self.calculate_perturbation_factor(experiment, edge[0], pathway,
visited + [edge[0]]) * beta / dstream
return pf

def calculate_corrections(self, pvalues):
"""

Args:
pvalues: array_like vector of p-values

Returns:
two arrays of p-values corrected for multiple tests using FDR and Bonferroni correction method

"""

return multipletests(pvalues, method='fdr_bh')[1], multipletests(pvalues, method='bonferroni')[1]
13 changes: 13 additions & 0 deletions models.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,12 @@ def __eq__(self, other):
def __repr__(self):
return f'<Sample "{self.name}" with {len(self.data)} genes>'

def exclude_genes(self, gene_list: list):
for gene in gene_list:
assert isinstance(gene, Gene)
if gene in self.data.keys():
del self.data[gene]


def first_line(file_object, skip_rows=0):
line = None
Expand Down Expand Up @@ -420,6 +426,10 @@ def from_csv_file(cls, name, file_object, **kwargs):
kwargs['delimiter'] = ','
return cls.from_file(name, file_object, **kwargs)

def exclude_genes(self, gene_list: list):
for sample in self.samples:
sample.exclude_genes(gene_list)


# TODO class variable with set of genes + method(s) for checking data integrity
class Experiment:
Expand Down Expand Up @@ -450,3 +460,6 @@ def calculate_fold_change(self):
ratio = ratio_of_classes(case, control)
fc[idx] = [ratio, log2(ratio)]
return pd.DataFrame.from_dict(fc, orient="index", columns=['FC', 'logFC'])

def exclude_genes(self, gene_list: list):
self.get_all().exclude_genes(gene_list)